Vault
4.1
|
00001 /* 00002 Copyright c1997-2014 Trygve Isaacson. All rights reserved. 00003 This file is part of the Code Vault version 4.1 00004 http://www.bombaydigital.com/ 00005 License: MIT. See LICENSE.md in the Vault top level directory. 00006 */ 00007 00010 #include "vcodepoint.h" 00011 00012 #include "vchar.h" 00013 #include "vstring.h" 00014 #include "vbinaryiostream.h" 00015 #include "vexception.h" 00016 #include "vhex.h" 00017 00018 // VCodePoint ----------------------------------------------------------------- 00019 00020 VCodePoint::VCodePoint(int i) 00021 : mIntValue(i) 00022 , mUTF8Length(VCodePoint::getUTF8LengthFromCodePointValue(mIntValue)) 00023 , mUTF16Length(VCodePoint::getUTF16LengthFromCodePointValue(mIntValue)) 00024 { 00025 } 00026 00027 VCodePoint::VCodePoint(char c) 00028 : mIntValue(VChar(c).intValue()) 00029 , mUTF8Length(VCodePoint::getUTF8LengthFromCodePointValue(mIntValue)) 00030 , mUTF16Length(VCodePoint::getUTF16LengthFromCodePointValue(mIntValue)) 00031 { 00032 } 00033 00034 VCodePoint::VCodePoint(const VChar& c) 00035 : mIntValue(c.intValue()) 00036 , mUTF8Length(VCodePoint::getUTF8LengthFromCodePointValue(mIntValue)) 00037 , mUTF16Length(VCodePoint::getUTF16LengthFromCodePointValue(mIntValue)) 00038 { 00039 } 00040 00041 VCodePoint::VCodePoint(const VString& hexNotation) 00042 : mIntValue(0) 00043 , mUTF8Length(0) 00044 , mUTF16Length(0) 00045 { 00046 // If the string starts with "U+" we skip it. 00047 // From there we assume the rest is hexadecimal, at most 8 digits. 00048 int length = hexNotation.length(); 00049 int start = 0; 00050 if (hexNotation.startsWith("U+")) { 00051 start += 2; 00052 } 00053 00054 if (length - start > 8) { 00055 throw VRangeException(VSTRING_FORMAT("VCodePoint: attempt to construct with invalid notation '%s'.", hexNotation.chars())); 00056 } 00057 00058 // Walk backwards until we process all characters or see the '+'. 00059 00060 int valueByteIndex = 0; 00061 for (VString::const_reverse_iterator ri = hexNotation.rbegin(); ri != hexNotation.rend(); /*incremented below*/) { 00062 //for (int index = length-1; index >= start; ) { 00063 VCodePoint nextChar = *ri; 00064 ++ri; 00065 00066 if (nextChar == '+') { 00067 break; 00068 } 00069 00070 VCodePoint lowNibbleChar = nextChar; 00071 VCodePoint highNibbleChar('0'); 00072 00073 if (ri != hexNotation.rend()) { 00074 nextChar = *ri; 00075 ++ri; 00076 00077 if (nextChar != '+') { 00078 highNibbleChar = nextChar; 00079 } 00080 } 00081 00082 if (!highNibbleChar.isHexadecimal() || !lowNibbleChar.isHexadecimal()) { 00083 throw VRangeException(VSTRING_FORMAT("VCodePoint: attempt to construct with invalid notation '%s'.", hexNotation.chars())); 00084 } 00085 00086 // At this point we have the two hex chars. Convert to a byte, and or it into the result at the appropriate location. 00087 Vs32 byteValue = (Vs32) VHex::hexCharsToByte((char) highNibbleChar.intValue(), (char) lowNibbleChar.intValue()); // char TODO: VHex API update to VCodePoint 00088 byteValue <<= (valueByteIndex * 8); 00089 Vs32 mask = 0x000000FF << (valueByteIndex * 8); 00090 00091 mIntValue |= (int) (byteValue & mask); 00092 00093 ++valueByteIndex; 00094 00095 if (nextChar == '+') { 00096 break; 00097 } 00098 } 00099 00100 mUTF8Length = VCodePoint::getUTF8LengthFromCodePointValue(mIntValue); 00101 mUTF16Length = VCodePoint::getUTF16LengthFromCodePointValue(mIntValue); 00102 } 00103 00104 VCodePoint::VCodePoint(const Vu8* buffer, int startOffset) 00105 : mIntValue(0) 00106 , mUTF8Length(0) 00107 , mUTF16Length(0) 00108 { 00109 const Vu8* source = buffer + startOffset; 00110 00111 Vu8 source0 = source[0]; 00112 int numBytesToRead = VCodePoint::getUTF8LengthFromUTF8StartByte(source0); 00113 00114 this->_initFromUTF8Bytes(numBytesToRead, source0, 00115 (numBytesToRead > 1) ? source[1] : 0, 00116 (numBytesToRead > 2) ? source[2] : 0, 00117 (numBytesToRead > 3) ? source[3] : 0); 00118 } 00119 00120 VCodePoint::VCodePoint(VBinaryIOStream& stream) { 00121 Vu8 source0 = stream.readU8(); 00122 int numBytesToRead = VCodePoint::getUTF8LengthFromUTF8StartByte(source0); 00123 00124 this->_initFromUTF8Bytes(numBytesToRead, source0, 00125 (numBytesToRead > 1) ? stream.readU8() : 0, 00126 (numBytesToRead > 2) ? stream.readU8() : 0, 00127 (numBytesToRead > 3) ? stream.readU8() : 0); 00128 } 00129 00130 VCodePoint::VCodePoint(VTextIOStream& utf8Stream) { 00131 Vu8 source0 = utf8Stream.readGuaranteedByte(); 00132 int numBytesToRead = VCodePoint::getUTF8LengthFromUTF8StartByte(source0); 00133 00134 this->_initFromUTF8Bytes(numBytesToRead, source0, 00135 (numBytesToRead > 1) ? utf8Stream.readGuaranteedByte() : 0, 00136 (numBytesToRead > 2) ? utf8Stream.readGuaranteedByte() : 0, 00137 (numBytesToRead > 3) ? utf8Stream.readGuaranteedByte() : 0); 00138 } 00139 00140 VCodePoint::VCodePoint(const std::wstring& utf16WideString, int atIndex) { 00141 wchar_t firstUnit = utf16WideString[atIndex]; 00142 if (!VCodePoint::isUTF16SurrogateCodeUnit(firstUnit)) { 00143 mIntValue = firstUnit; 00144 } else { 00145 if (static_cast<int>(utf16WideString.length()) <= (atIndex + 1)) { 00146 throw VEOFException("Reached end of utf16WideString in the middle of a two-unit code point."); // Note: Stream-oriented reading is the way to avoid this case when reading in chunks. 00147 } 00148 00149 this->_initFromUTF16Surrogates(firstUnit, utf16WideString[atIndex + 1]); 00150 } 00151 } 00152 00153 #define UTF8_BYTE_1_OF_2(mIntValue) (0xC0 + mIntValue / 0x40) // first byte binary: 110xxxxx (with highest 5 bits) 00154 #define UTF8_BYTE_2_OF_2(mIntValue) (0x80 + mIntValue % 0x40) // second byte binary: 10xxxxxx (with next 6 bits) 00155 00156 #define UTF8_BYTE_1_OF_3(mIntValue) (0xE0 + mIntValue / 0x1000) // first byte binary: 1110xxxx (with highest 4 bits) 00157 #define UTF8_BYTE_2_OF_3(mIntValue) (0x80 + mIntValue / 0x40 % 0x40) // second byte binary: 10xxxxxx (with next 6 bits) 00158 #define UTF8_BYTE_3_OF_3(mIntValue) (0x80 + mIntValue % 0x40) // third byte binary: 10xxxxxx (with next 6 bits) 00159 00160 #define UTF8_BYTE_1_OF_4(mIntValue) (0xF0 + mIntValue / 0x40000) // first byte binary: 11110xxx (with highest 3 bits) 00161 #define UTF8_BYTE_2_OF_4(mIntValue) (0x80 + mIntValue / 0x1000 % 0x40) // second byte binary: 10xxxxxx (with next 6 bits) 00162 #define UTF8_BYTE_3_OF_4(mIntValue) (0x80 + mIntValue / 0x40 % 0x40) // third byte binary: 10xxxxxx (with next 6 bits) 00163 #define UTF8_BYTE_4_OF_4(mIntValue) (0x80 + mIntValue % 0x40) // fourth byte binary: 10xxxxxx (with next 6 bits) 00164 00165 VString VCodePoint::toString() const { 00166 VString s; 00167 00168 // Use of 0x40 (decimal 64) here is to chop a number into 6-bit parts. 00169 // 0x40 is binary 01000000, so 00170 // n / 0x40 effectively strips off the low 6 bits 00171 // n % 0x40 effectively strips off all but the low 6 bits 00172 // n / 0x40 % 0x40 effectively yields the "next" 6 bits by combining those two operations 00173 00174 switch (VCodePoint::getUTF8LengthFromCodePointValue(mIntValue)) { 00175 00176 case 1: 00177 s += (char) mIntValue; // first byte binary: 0xxxxxxx (with 7 used bits) 00178 break; 00179 00180 case 2: 00181 s += (char) UTF8_BYTE_1_OF_2(mIntValue); 00182 s += (char) UTF8_BYTE_2_OF_2(mIntValue); 00183 break; 00184 00185 case 3: 00186 s += (char) UTF8_BYTE_1_OF_3(mIntValue); 00187 s += (char) UTF8_BYTE_2_OF_3(mIntValue); 00188 s += (char) UTF8_BYTE_3_OF_3(mIntValue); 00189 break; 00190 00191 case 4: 00192 s += (char) UTF8_BYTE_1_OF_4(mIntValue); 00193 s += (char) UTF8_BYTE_2_OF_4(mIntValue); 00194 s += (char) UTF8_BYTE_3_OF_4(mIntValue); 00195 s += (char) UTF8_BYTE_4_OF_4(mIntValue); 00196 break; 00197 00198 default: 00199 throw VRangeException(VSTRING_FORMAT("VCodePoint::toString() for an invalid UTF-8 code point 0x%X", mIntValue)); 00200 break; 00201 } 00202 00203 return s; 00204 } 00205 00206 VChar VCodePoint::toASCIIChar() const { 00207 if (! this->isASCII()) { 00208 throw VRangeException(VSTRING_FORMAT("VCodePoint::toASCIIChar() for an invalid UTF-8 code point 0x%X", mIntValue)); 00209 } 00210 00211 return VChar(mIntValue); 00212 } 00213 00214 bool VCodePoint::isWhitespace() const { 00215 // Need to be careful about signage for values > 0x7F. 00216 int value = this->intValue(); 00217 return (value <= 0x20) || (value == 0x7F); 00218 } 00219 00220 bool VCodePoint::isAlpha() const { 00221 int value = this->intValue(); 00222 return ((value >= 'a') && (value <= 'z')) || 00223 ((value >= 'A') && (value <= 'Z')); 00224 } 00225 00226 bool VCodePoint::isNumeric() const { 00227 int value = this->intValue(); 00228 return (value >= '0') && (value <= '9'); 00229 } 00230 00231 bool VCodePoint::isAlphaNumeric() const { 00232 return this->isAlpha() || this->isNumeric(); 00233 } 00234 00235 bool VCodePoint::isHexadecimal() const { 00236 int value = this->intValue(); 00237 return ((value >= '0') && (value <= '9')) || 00238 ((value >= 'a') && (value <= 'f')) || 00239 ((value >= 'A') && (value <= 'F')); 00240 } 00241 00242 std::wstring VCodePoint::toUTF16WideString() const { 00243 std::wstring s; 00244 00245 switch (VCodePoint::getUTF16LengthFromCodePointValue(mIntValue)) { 00246 00247 case 1: 00248 s += (wchar_t) (mIntValue); // first byte binary: same as code point value 00249 break; 00250 00251 case 2: { 00252 int leadSurrogate = ((mIntValue - 0x10000) >> 10) + 0xD800; 00253 int trailSurrogate = ((mIntValue - 0x10000) & 0x03FF) + 0xDC00; 00254 s += (wchar_t) leadSurrogate; 00255 s += (wchar_t) trailSurrogate; 00256 } 00257 break; 00258 00259 default: 00260 throw VRangeException(VSTRING_FORMAT("VCodePoint::toString() for an invalid UTF-8 code point 0x%X", mIntValue)); 00261 break; 00262 } 00263 00264 return s; 00265 } 00266 00267 void VCodePoint::writeToBinaryStream(VBinaryIOStream& stream) const { 00268 switch (VCodePoint::getUTF8LengthFromCodePointValue(mIntValue)) { 00269 00270 case 1: 00271 stream.writeU8((Vu8) mIntValue); // first byte binary: 0xxxxxxx (with 7 used bits) 00272 break; 00273 00274 case 2: 00275 stream.writeU8((Vu8) UTF8_BYTE_1_OF_2(mIntValue)); 00276 stream.writeU8((Vu8) UTF8_BYTE_2_OF_2(mIntValue)); 00277 break; 00278 00279 case 3: 00280 stream.writeU8((Vu8) UTF8_BYTE_1_OF_3(mIntValue)); 00281 stream.writeU8((Vu8) UTF8_BYTE_2_OF_3(mIntValue)); 00282 stream.writeU8((Vu8) UTF8_BYTE_3_OF_3(mIntValue)); 00283 break; 00284 00285 case 4: 00286 stream.writeU8((Vu8) UTF8_BYTE_1_OF_4(mIntValue)); 00287 stream.writeU8((Vu8) UTF8_BYTE_2_OF_4(mIntValue)); 00288 stream.writeU8((Vu8) UTF8_BYTE_3_OF_4(mIntValue)); 00289 stream.writeU8((Vu8) UTF8_BYTE_4_OF_4(mIntValue)); 00290 break; 00291 00292 default: 00293 throw VRangeException(VSTRING_FORMAT("VCodePoint::writeToBinaryStream() for an invalid UTF-8 code point 0x%X", mIntValue)); 00294 break; 00295 } 00296 } 00297 00298 // static 00299 int VCodePoint::getUTF8LengthFromUTF8StartByte(Vu8 startByte) { 00300 // In UTF-8 the number of leading 1 bits on the first byte tells us how many "extra" bytes make up the code point in the buffer. 00301 int utf8Length = 1; 00302 00303 if (((startByte & 0x80) != 0x00) && ((startByte & 0x40) != 0x00)) { // test for binary 11?????? (2 bits found so far) 00304 ++utf8Length; 00305 00306 if ((startByte & 0x20) != 0x00) { // test for binary ??1????? (3 bits found so far) 00307 ++utf8Length; 00308 00309 if ((startByte & 0x10) != 0x00) { // test for binary ???1???? (4 bits found total) 00310 ++utf8Length; 00311 } 00312 } 00313 } 00314 00315 return utf8Length; 00316 } 00317 00318 // static 00319 int VCodePoint::getUTF8LengthFromCodePointValue(int intValue) { 00320 if (intValue < 0x80) { 00321 return 1; 00322 } else if (intValue < 0x00000800) { 00323 return 2; 00324 } else if (intValue < 0x00010000) { 00325 return 3; 00326 } else if (intValue < 0x00110000) { 00327 return 4; 00328 } else { 00329 throw VRangeException(VSTRING_FORMAT("VCodePoint::getUTF8LengthFromCodePointValue() for an invalid UTF-8 code point 0x%X", intValue)); 00330 } 00331 } 00332 00333 // static 00334 bool VCodePoint::isUTF8ContinuationByte(Vu8 byteValue) { 00335 // 0xC0 mask value of 0x80 (10xxxxxx) detects UTF-8 continuation bytes; anything else is start of a character (single or multi-byte). 00336 return ((byteValue & 0xC0) == 0x80); 00337 } 00338 00339 // static 00340 int VCodePoint::countUTF8CodePoints(const Vu8* buffer, int numBytes) { 00341 int numCodePoints = 0; 00342 int offset = 0; 00343 while (offset < numBytes) { 00344 VCodePoint cp(buffer, offset); 00345 ++numCodePoints; 00346 offset += cp.getUTF8Length(); 00347 } 00348 00349 return numCodePoints; 00350 } 00351 00352 // static 00353 int VCodePoint::getPreviousUTF8CodePointOffset(const Vu8* buffer, int offset) { 00354 int previousOffset = offset - 1; 00355 00356 while ((previousOffset > 0) && VCodePoint::isUTF8ContinuationByte(buffer[previousOffset])) { 00357 --previousOffset; 00358 } 00359 00360 return previousOffset; 00361 } 00362 00363 // static 00364 bool VCodePoint::isUTF16SurrogateCodeUnit(wchar_t codeUnit) { 00365 /* 00366 In UTF-16 two known ranges of values occupy a single code unit. Anything else uses two code units. 00367 The single unit ranges are: 00368 U+0000 to U+D7FF 00369 U+E000 to U+FFFF 00370 Therefore, only values in the remaining range indicate a lead surrogate: 00371 U+D800 to U+DFFF 00372 And anything above U+FFFF is a trail surrogate. 00373 */ 00374 return 00375 ((codeUnit >= 0xD800) && (codeUnit <= 0xDFFF)) || // lead surrogate range 00376 (codeUnit >= 0x10000); // trail surrogate range 00377 } 00378 00379 // static 00380 int VCodePoint::getUTF16LengthFromCodePointValue(int intValue) { 00381 if (((intValue >= 0x0000) && (intValue <= 0xD7FF)) || 00382 ((intValue >= 0xE000) && (intValue <= 0xFFFF))) { 00383 return 1; 00384 } 00385 00386 return 2; 00387 } 00388 00389 void VCodePoint::_initFromUTF8Bytes(int numBytesToUse, Vu8 byte0, Vu8 byte1, Vu8 byte2, Vu8 byte3) { 00390 mIntValue = 0; 00391 00392 if (numBytesToUse == 1) { 00393 mIntValue = byte0; 00394 00395 } else if (numBytesToUse == 2) { 00396 mIntValue |= ((byte0 & 0x1F) << 6); 00397 mIntValue |= (byte1 & 0x3F); 00398 00399 } else if (numBytesToUse == 3) { 00400 mIntValue |= ((byte0 & 0x0F) << 12); 00401 mIntValue |= ((byte1 & 0x3F) << 6); 00402 mIntValue |= (byte2 & 0x3F); 00403 00404 } else /* numBytesToUse is 4 */ { 00405 mIntValue |= ((byte0 & 0x07) << 18); 00406 mIntValue |= ((byte1 & 0x3F) << 12); 00407 mIntValue |= ((byte2 & 0x3F) << 6); 00408 mIntValue |= (byte3 & 0x3F); 00409 } 00410 00411 mUTF8Length = numBytesToUse; 00412 mUTF16Length = VCodePoint::getUTF16LengthFromCodePointValue(mIntValue); 00413 } 00414 00415 void VCodePoint::_initFromUTF16Surrogates(wchar_t leadSurrogate, wchar_t trailSurrogate) { 00416 int x = (leadSurrogate & ((1 << 6) -1)) << 10 | (trailSurrogate & ((1 << 10) -1)); 00417 int w = (leadSurrogate >> 6) & ((1 << 5) - 1); 00418 int u = w + 1; 00419 mIntValue = u << 16 | x; 00420 00421 mUTF8Length = VCodePoint::getUTF8LengthFromCodePointValue(mIntValue); 00422 mUTF16Length = 2; 00423 }