GeoPubby  Version 0.1.0.0
IRIEncoder Class Reference

Implements the IRI-to-URI and URI-to-IRI conversions defined in RFC 3987. More...

Collaboration diagram for IRIEncoder:

Static Public Member Functions

static String toIRI (String uri)
 Converts a URI to an IRI by removing unnecessary percent-encoding of UTF-8 sequences. More...
 
static String toURI (String iri)
 Converts an IRI to a URI by percent-encoding characters outside of the US-ASCII range. More...
 

Static Private Member Functions

static String decode (String percentEncoded)
 
static boolean isContinuationOctet (int octet)
 
static void appendOctet (StringBuffer sb, byte octet)
 
static int getBytesInSequence (int octet)
 
static char toCharacter (int[] octets, int offset, int length)
 
static boolean isUnreservedASCII (char c)
 
static int[] toBytes (String percentEncoded)
 
static int toByte (char hex1, char hex2)
 
static int toByte (char hex)
 

Static Private Attributes

static final Pattern percentEncoding = Pattern.compile("(%[0-9a-fA-F][0-9a-fA-F])+")
 
static char[] hexDigits = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}
 

Detailed Description

Implements the IRI-to-URI and URI-to-IRI conversions defined in RFC 3987.

TODO: This really needs some unit tests.
TODO: Make this an IRIRewriter?

Member Function Documentation

◆ appendOctet()

static void appendOctet ( StringBuffer  sb,
byte  octet 
)
static private
121  {
122  sb.append('%');
123  sb.append(hexDigits[(octet >> 4) & 0x0F]);
124  sb.append(hexDigits[octet & 0x0F]);
125  }
static char[] hexDigits
Definition: IRIEncoder.java:53

References IRIEncoder.hexDigits.

Referenced by IRIEncoder.decode(), and IRIEncoder.toURI().

◆ decode()

static String decode ( String  percentEncoded)
static private
55  {
56  StringBuffer decoded = new StringBuffer();
57  int[] octets = toBytes(percentEncoded);
58  int i = 0;
59  while (i < octets.length) {
60  if (octets[i] <= 0x7F) {
61  // US-ASCII character. Decode, except if it's one of
62  // %, reserved, or not allowed in IRIs. In that case, re-encode.
63  if (isUnreservedASCII((char) octets[i])) {
64  decoded.append((char) octets[i]);
65  } else {
66  // FIXME: Strictly speaking, the spec says that the original
67  // percent-encoding remains unchanged, meaning lower-case
68  // hex digits would remain lower-case. We upper-case them
69  // here by re-encoding.
70  appendOctet(decoded, (byte) octets[i]);
71  }
72  i++;
73  continue;
74  }
75  if (isContinuationOctet(octets[i])) {
76  appendOctet(decoded, (byte) octets[i]);
77  i++;
78  continue;
79  }
80  int bytesInSequence = getBytesInSequence(octets[i]);
81  if (i + bytesInSequence > octets.length) {
82  // Not enough continuation bytes to complete the character.
83  // Re-encode one byte, then let the main loop eat the rest.
84  appendOctet(decoded, (byte) octets[i]);
85  i++;
86  continue;
87  }
88  // Next, check if the next n bytes are all continuation bytes.
89  boolean enoughContinuationBytes = true;
90  for (int j = 1; j < bytesInSequence; j++) {
91  if (!isContinuationOctet(octets[i + j])) {
92  // Nope
93  enoughContinuationBytes = false;
94  break;
95  }
96  }
97  if (!enoughContinuationBytes) {
98  // Re-encode one byte, and let the main loop eat the rest.
99  appendOctet(decoded, (byte) octets[i]);
100  i++;
101  continue;
102  }
103  // UTF-8 encoding looks fine. Decode to one character.
104 
105  // FIXME: RFC 3987 says here:
106  // 4. Re-percent-encode all octets produced in step 3 that in UTF-8
107  // represent characters that are not appropriate according to
108  // sections 2.2, 4.1, and 6.1.
109  // This is about weird unicode characters that are inappropriate
110  // in IRIs for various reasons. We ignore this currently.
111  decoded.append(toCharacter(octets, i, bytesInSequence));
112  i += bytesInSequence;
113  }
114  return decoded.toString();
115  }
static boolean isContinuationOctet(int octet)
Definition: IRIEncoder.java:117
static int getBytesInSequence(int octet)
Definition: IRIEncoder.java:127
static int[] toBytes(String percentEncoded)
Definition: IRIEncoder.java:158
static boolean isUnreservedASCII(char c)
Definition: IRIEncoder.java:152
static char toCharacter(int[] octets, int offset, int length)
Definition: IRIEncoder.java:139
static void appendOctet(StringBuffer sb, byte octet)
Definition: IRIEncoder.java:121

References IRIEncoder.appendOctet(), IRIEncoder.getBytesInSequence(), IRIEncoder.isContinuationOctet(), IRIEncoder.isUnreservedASCII(), IRIEncoder.toBytes(), and IRIEncoder.toCharacter().

Referenced by IRIEncoder.toIRI().

◆ getBytesInSequence()

static int getBytesInSequence ( int  octet)
static private
127  {
128  // See table in http://en.wikipedia.org/wiki/UTF-8#Description
129  if ((octet & 0x80) == 0) return 1;
130  if ((octet & 0xC0) == 0x80) return 0; // Continuation octet
131  if ((octet & 0xE0) == 0xC0) return 2;
132  if ((octet & 0xF0) == 0xE0) return 3;
133  if ((octet & 0xF8) == 0xF0) return 4;
134  if ((octet & 0xFC) == 0xF8) return 5;
135  if ((octet & 0xFE) == 0xFC) return 6;
136  return 0; // Shouldn't happen
137  }

Referenced by IRIEncoder.decode().

◆ isContinuationOctet()

static boolean isContinuationOctet ( int  octet)
static private
117  {
118  return (octet & 0xC0) == 0x80;
119  }

Referenced by IRIEncoder.decode().

◆ isUnreservedASCII()

static boolean isUnreservedASCII ( char  c)
static private
152  {
153  // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
154  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
155  (c >= '0' && c <= '9') || c == '-' || c == '.' || c == '_' || c == '~';
156  }

Referenced by IRIEncoder.decode().

◆ toByte() [1/2]

static int toByte ( char  hex)
static private
171  {
172  if (hex >= '0' && hex <= '9') {
173  return hex - '0';
174  }
175  if (hex >= 'a' && hex <= 'f') {
176  return hex - 'a' + 10;
177  }
178  if (hex >= 'A' && hex <= 'F') {
179  return hex - 'A' + 10;
180  }
181  throw new IllegalArgumentException("Not a hex digit: " + hex);
182  }

◆ toByte() [2/2]

static int toByte ( char  hex1,
char  hex2 
)
static private
167  {
168  return (toByte(hex1) << 4) | toByte(hex2);
169  }
static int toByte(char hex1, char hex2)
Definition: IRIEncoder.java:167

Referenced by IRIEncoder.toBytes().

◆ toBytes()

static int [] toBytes ( String  percentEncoded)
static private
158  {
159  int length = percentEncoded.length() / 3;
160  int[] result = new int[length];
161  for (int i = 0; i < length; i++) {
162  result[i] = toByte(percentEncoded.charAt(i * 3 + 1), percentEncoded.charAt(i * 3 + 2));
163  }
164  return result;
165  }

References IRIEncoder.toByte().

Referenced by IRIEncoder.decode().

◆ toCharacter()

static char toCharacter ( int[]  octets,
int  offset,
int  length 
)
static private
139  {
140  byte[] bytes = new byte[length];
141  for (int i = 0; i < length; i++) {
142  bytes[i] = (byte) octets[offset + i];
143  }
144  try {
145  return new String(bytes, "utf-8").charAt(0);
146  } catch (UnsupportedEncodingException ex) {
147  // Can't happen
148  throw new RuntimeException(ex);
149  }
150  }

Referenced by IRIEncoder.decode().

◆ toIRI()

static String toIRI ( String  uri)
static

Converts a URI to an IRI by removing unnecessary percent-encoding of UTF-8 sequences.

20  {
21  StringBuffer decoded = new StringBuffer();
22  Matcher matcher = percentEncoding.matcher(uri);
23  while (matcher.find()) {
24  matcher.appendReplacement(decoded, decode(matcher.group()));
25  }
26  matcher.appendTail(decoded);
27  return decoded.toString();
28  }
static final Pattern percentEncoding
Definition: IRIEncoder.java:29
static String decode(String percentEncoded)
Definition: IRIEncoder.java:55

References IRIEncoder.decode(), and IRIEncoder.percentEncoding.

Referenced by ValuesBaseServlet.doGet(), Configuration.getControls(), and PubbyIRIEscaper.rewrite().

◆ toURI()

static String toURI ( String  iri)
static

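Converts an IRI to a URI by percent-encoding characters outside of the US-ASCII range. Note: the implementation shown below copies through every character whose code point is less than or equal to 128, so U+0080 is also left unencoded even though it lies outside US-ASCII (0-127).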

35  {
36  try {
37  StringBuffer encoded = new StringBuffer();
38  for (int i = 0; i < iri.length(); i++) {
39  if ((int) iri.charAt(i) <= 128) {
40  encoded.append(iri.charAt(i));
41  continue;
42  }
43  for (byte b: iri.substring(i, i + 1).getBytes("utf-8")) {
44  appendOctet(encoded, b);
45  }
46  }
47  return encoded.toString();
48  } catch (UnsupportedEncodingException ex) {
49  // Can't happen
50  return iri;
51  }
52  }

References IRIEncoder.appendOctet().

Referenced by RootServlet.doGet(), WebURIServlet.doGet(), and PubbyIRIEscaper.unrewrite().

Member Data Documentation

◆ hexDigits

char [] hexDigits = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}
static private

Referenced by IRIEncoder.appendOctet().

◆ percentEncoding

final Pattern percentEncoding = Pattern.compile("(%[0-9a-fA-F][0-9a-fA-F])+")
static private

Referenced by IRIEncoder.toIRI().