Vault  4.1
vcodepoint.cpp
Go to the documentation of this file.
00001 /*
00002 Copyright ©1997-2014 Trygve Isaacson. All rights reserved.
00003 This file is part of the Code Vault version 4.1
00004 http://www.bombaydigital.com/
00005 License: MIT. See LICENSE.md in the Vault top level directory.
00006 */
00007 
00010 #include "vcodepoint.h"
00011 
00012 #include "vchar.h"
00013 #include "vstring.h"
00014 #include "vbinaryiostream.h"
00015 #include "vexception.h"
00016 #include "vhex.h"
00017 
00018 // VCodePoint -----------------------------------------------------------------
00019 
00020 VCodePoint::VCodePoint(int i)
00021     : mIntValue(i)
00022     , mUTF8Length(VCodePoint::getUTF8LengthFromCodePointValue(mIntValue))
00023     , mUTF16Length(VCodePoint::getUTF16LengthFromCodePointValue(mIntValue))
00024     {
00025 }
00026 
00027 VCodePoint::VCodePoint(char c)
00028     : mIntValue(VChar(c).intValue())
00029     , mUTF8Length(VCodePoint::getUTF8LengthFromCodePointValue(mIntValue))
00030     , mUTF16Length(VCodePoint::getUTF16LengthFromCodePointValue(mIntValue))
00031     {
00032 }
00033 
00034 VCodePoint::VCodePoint(const VChar& c)
00035     : mIntValue(c.intValue())
00036     , mUTF8Length(VCodePoint::getUTF8LengthFromCodePointValue(mIntValue))
00037     , mUTF16Length(VCodePoint::getUTF16LengthFromCodePointValue(mIntValue))
00038     {
00039 }
00040 
// Constructs a code point from hexadecimal notation such as "U+1F600" or "2603".
// An optional leading "U+" is skipped; at most 8 hex digits may follow it.
// Throws VRangeException if there are too many digits or a non-hex character
// (other than the prefix) is encountered.
VCodePoint::VCodePoint(const VString& hexNotation)
    : mIntValue(0)
    , mUTF8Length(0)
    , mUTF16Length(0)
    {
    // If the string starts with "U+" we skip it.
    // From there we assume the rest is hexadecimal, at most 8 digits.
    int length = hexNotation.length();
    int start = 0;
    if (hexNotation.startsWith("U+")) {
        start += 2;
    }
    
    if (length - start > 8) {
        throw VRangeException(VSTRING_FORMAT("VCodePoint: attempt to construct with invalid notation '%s'.", hexNotation.chars()));
    }
    
    // Walk backwards until we process all characters or see the '+'.
    // Digits are consumed in pairs (low nibble then high nibble) so each pass
    // produces one byte of the value; an odd leading digit is handled by
    // defaulting the high nibble to '0'.
    
    int valueByteIndex = 0; // which byte of mIntValue the next pair fills (0 = least significant)
    for (VString::const_reverse_iterator ri = hexNotation.rbegin(); ri != hexNotation.rend(); /*incremented below*/) {
        VCodePoint nextChar = *ri;
        ++ri;

        if (nextChar == '+') {
            break; // Reached the "U+" prefix; all digits have been consumed.
        }

        VCodePoint lowNibbleChar = nextChar;
        VCodePoint highNibbleChar('0'); // default if only one digit remains in this pair

        if (ri != hexNotation.rend()) {
            nextChar = *ri;
            ++ri;

            if (nextChar != '+') {
                highNibbleChar = nextChar;
            }
        }

        if (!highNibbleChar.isHexadecimal() || !lowNibbleChar.isHexadecimal()) {
            throw VRangeException(VSTRING_FORMAT("VCodePoint: attempt to construct with invalid notation '%s'.", hexNotation.chars()));
        }
        
        // At this point we have the two hex chars. Convert to a byte, and or it into the result at the appropriate location.
        Vs32 byteValue = (Vs32) VHex::hexCharsToByte((char) highNibbleChar.intValue(), (char) lowNibbleChar.intValue()); // char TODO: VHex API update to VCodePoint
        byteValue <<= (valueByteIndex * 8);
        Vs32 mask = 0x000000FF << (valueByteIndex * 8);
        
        mIntValue |= (int) (byteValue & mask);
        
        ++valueByteIndex;

        if (nextChar == '+') {
            break; // The high-nibble read hit the prefix; we are done.
        }
    }

    // Now that the value is known, cache the encoded lengths.
    mUTF8Length = VCodePoint::getUTF8LengthFromCodePointValue(mIntValue);
    mUTF16Length = VCodePoint::getUTF16LengthFromCodePointValue(mIntValue);
}
00103 
00104 VCodePoint::VCodePoint(const Vu8* buffer, int startOffset)
00105     : mIntValue(0)
00106     , mUTF8Length(0)
00107     , mUTF16Length(0)
00108     {
00109     const Vu8* source = buffer + startOffset;
00110 
00111     Vu8 source0 = source[0];
00112     int numBytesToRead = VCodePoint::getUTF8LengthFromUTF8StartByte(source0);
00113     
00114     this->_initFromUTF8Bytes(numBytesToRead, source0,
00115         (numBytesToRead > 1) ? source[1] : 0,
00116         (numBytesToRead > 2) ? source[2] : 0,
00117         (numBytesToRead > 3) ? source[3] : 0);
00118 }
00119 
00120 VCodePoint::VCodePoint(VBinaryIOStream& stream) {
00121     Vu8 source0 = stream.readU8();
00122     int numBytesToRead = VCodePoint::getUTF8LengthFromUTF8StartByte(source0);
00123     
00124     this->_initFromUTF8Bytes(numBytesToRead, source0,
00125         (numBytesToRead > 1) ? stream.readU8() : 0,
00126         (numBytesToRead > 2) ? stream.readU8() : 0,
00127         (numBytesToRead > 3) ? stream.readU8() : 0);
00128 }
00129 
00130 VCodePoint::VCodePoint(VTextIOStream& utf8Stream) {
00131     Vu8 source0 = utf8Stream.readGuaranteedByte();
00132     int numBytesToRead = VCodePoint::getUTF8LengthFromUTF8StartByte(source0);
00133     
00134     this->_initFromUTF8Bytes(numBytesToRead, source0,
00135         (numBytesToRead > 1) ? utf8Stream.readGuaranteedByte() : 0,
00136         (numBytesToRead > 2) ? utf8Stream.readGuaranteedByte() : 0,
00137         (numBytesToRead > 3) ? utf8Stream.readGuaranteedByte() : 0);
00138 }
00139 
00140 VCodePoint::VCodePoint(const std::wstring& utf16WideString, int atIndex) {
00141     wchar_t firstUnit = utf16WideString[atIndex];
00142     if (!VCodePoint::isUTF16SurrogateCodeUnit(firstUnit)) {
00143         mIntValue = firstUnit;
00144     } else {
00145         if (static_cast<int>(utf16WideString.length()) <= (atIndex + 1)) {
00146             throw VEOFException("Reached end of utf16WideString in the middle of a two-unit code point."); // Note: Stream-oriented reading is the way to avoid this case when reading in chunks.
00147         }
00148         
00149         this->_initFromUTF16Surrogates(firstUnit, utf16WideString[atIndex + 1]);
00150     }
00151 }
00152 
// Helper macros that split a code point value into its UTF-8 encoded bytes.
// Use of 0x40 (decimal 64) chops the value into 6-bit groups:
//      n / 0x40 strips off the low 6 bits
//      n % 0x40 keeps only the low 6 bits
//      n / 0x40 % 0x40 yields the "next" 6 bits
// Fixed: the macro parameter is now parenthesized in each expansion so that
// passing an expression (e.g. a + b) cannot change the arithmetic via
// operator precedence. The parameter is also renamed so it no longer shadows
// the mIntValue member name.
#define UTF8_BYTE_1_OF_2(cp) (0xC0 + (cp) / 0x40)            // first byte binary:   110xxxxx (with highest 5 bits)
#define UTF8_BYTE_2_OF_2(cp) (0x80 + (cp) % 0x40)            // second byte binary:  10xxxxxx (with next 6 bits)

#define UTF8_BYTE_1_OF_3(cp) (0xE0 + (cp) / 0x1000)          // first byte binary:   1110xxxx (with highest 4 bits)
#define UTF8_BYTE_2_OF_3(cp) (0x80 + (cp) / 0x40 % 0x40)     // second byte binary:  10xxxxxx (with next 6 bits)
#define UTF8_BYTE_3_OF_3(cp) (0x80 + (cp) % 0x40)            // third byte binary:   10xxxxxx (with next 6 bits)

#define UTF8_BYTE_1_OF_4(cp) (0xF0 + (cp) / 0x40000)         // first byte binary:   11110xxx (with highest 3 bits)
#define UTF8_BYTE_2_OF_4(cp) (0x80 + (cp) / 0x1000 % 0x40)   // second byte binary:  10xxxxxx (with next 6 bits)
#define UTF8_BYTE_3_OF_4(cp) (0x80 + (cp) / 0x40 % 0x40)     // third byte binary:   10xxxxxx (with next 6 bits)
#define UTF8_BYTE_4_OF_4(cp) (0x80 + (cp) % 0x40)            // fourth byte binary:  10xxxxxx (with next 6 bits)
00164 
00165 VString VCodePoint::toString() const {
00166     VString s;
00167     
00168     // Use of 0x40 (decimal 64) here is to chop a number into 6-bit parts.
00169     // 0x40 is binary 01000000, so
00170     //      n / 0x40 effectively strips off the low 6 bits
00171     //      n % 0x40 effectively strips off all but the low 6 bits
00172     //      n / 0x40 % 0x40 effectively yields the "next" 6 bits by combining those two operations
00173     
00174     switch (VCodePoint::getUTF8LengthFromCodePointValue(mIntValue)) {
00175 
00176         case 1:
00177             s += (char) mIntValue;  // first byte binary:   0xxxxxxx (with 7 used bits)
00178             break;
00179 
00180         case 2:
00181             s += (char) UTF8_BYTE_1_OF_2(mIntValue);
00182             s += (char) UTF8_BYTE_2_OF_2(mIntValue);
00183             break;
00184 
00185         case 3:
00186             s += (char) UTF8_BYTE_1_OF_3(mIntValue);
00187             s += (char) UTF8_BYTE_2_OF_3(mIntValue);
00188             s += (char) UTF8_BYTE_3_OF_3(mIntValue);
00189             break;
00190 
00191         case 4:
00192             s += (char) UTF8_BYTE_1_OF_4(mIntValue);
00193             s += (char) UTF8_BYTE_2_OF_4(mIntValue);
00194             s += (char) UTF8_BYTE_3_OF_4(mIntValue);
00195             s += (char) UTF8_BYTE_4_OF_4(mIntValue);
00196             break;
00197             
00198         default:
00199             throw VRangeException(VSTRING_FORMAT("VCodePoint::toString() for an invalid UTF-8 code point 0x%X", mIntValue));
00200             break;
00201     }
00202 
00203     return s;
00204 }
00205 
00206 VChar VCodePoint::toASCIIChar() const {
00207     if (! this->isASCII()) {
00208         throw VRangeException(VSTRING_FORMAT("VCodePoint::toASCIIChar() for an invalid UTF-8 code point 0x%X", mIntValue));
00209     }
00210     
00211     return VChar(mIntValue);
00212 }
00213 
00214 bool VCodePoint::isWhitespace() const {
00215     // Need to be careful about signage for values > 0x7F.
00216     int value = this->intValue();
00217     return (value <= 0x20) || (value == 0x7F);
00218 }
00219 
00220 bool VCodePoint::isAlpha() const {
00221     int value = this->intValue();
00222     return ((value >= 'a') && (value <= 'z')) ||
00223            ((value >= 'A') && (value <= 'Z'));
00224 }
00225 
00226 bool VCodePoint::isNumeric() const {
00227     int value = this->intValue();
00228     return (value >= '0') && (value <= '9');
00229 }
00230 
00231 bool VCodePoint::isAlphaNumeric() const {
00232     return this->isAlpha() || this->isNumeric();
00233 }
00234 
00235 bool VCodePoint::isHexadecimal() const {
00236     int value = this->intValue();
00237     return ((value >= '0') && (value <= '9')) ||
00238            ((value >= 'a') && (value <= 'f')) ||
00239            ((value >= 'A') && (value <= 'F'));
00240 }
00241 
00242 std::wstring VCodePoint::toUTF16WideString() const {
00243     std::wstring s;
00244     
00245     switch (VCodePoint::getUTF16LengthFromCodePointValue(mIntValue)) {
00246 
00247         case 1:
00248             s += (wchar_t) (mIntValue); // first byte binary:   same as code point value
00249             break;
00250 
00251         case 2: {
00252             int leadSurrogate =  ((mIntValue - 0x10000) >> 10) + 0xD800;
00253             int trailSurrogate = ((mIntValue - 0x10000) & 0x03FF) + 0xDC00;
00254             s += (wchar_t) leadSurrogate;
00255             s += (wchar_t) trailSurrogate;
00256             }
00257             break;
00258 
00259         default:
00260             throw VRangeException(VSTRING_FORMAT("VCodePoint::toString() for an invalid UTF-8 code point 0x%X", mIntValue));
00261             break;
00262     }
00263     
00264     return s;
00265 }
00266 
00267 void VCodePoint::writeToBinaryStream(VBinaryIOStream& stream) const {
00268     switch (VCodePoint::getUTF8LengthFromCodePointValue(mIntValue)) {
00269 
00270         case 1:
00271             stream.writeU8((Vu8) mIntValue);    // first byte binary:   0xxxxxxx (with 7 used bits)
00272             break;
00273 
00274         case 2:
00275             stream.writeU8((Vu8) UTF8_BYTE_1_OF_2(mIntValue));
00276             stream.writeU8((Vu8) UTF8_BYTE_2_OF_2(mIntValue));
00277             break;
00278 
00279         case 3:
00280             stream.writeU8((Vu8) UTF8_BYTE_1_OF_3(mIntValue));
00281             stream.writeU8((Vu8) UTF8_BYTE_2_OF_3(mIntValue));
00282             stream.writeU8((Vu8) UTF8_BYTE_3_OF_3(mIntValue));
00283             break;
00284 
00285         case 4:
00286             stream.writeU8((Vu8) UTF8_BYTE_1_OF_4(mIntValue));
00287             stream.writeU8((Vu8) UTF8_BYTE_2_OF_4(mIntValue));
00288             stream.writeU8((Vu8) UTF8_BYTE_3_OF_4(mIntValue));
00289             stream.writeU8((Vu8) UTF8_BYTE_4_OF_4(mIntValue));
00290             break;
00291             
00292         default:
00293             throw VRangeException(VSTRING_FORMAT("VCodePoint::writeToBinaryStream() for an invalid UTF-8 code point 0x%X", mIntValue));
00294             break;
00295     }
00296 }
00297 
00298 // static
00299 int VCodePoint::getUTF8LengthFromUTF8StartByte(Vu8 startByte) {
00300     // In UTF-8 the number of leading 1 bits on the first byte tells us how many "extra" bytes make up the code point in the buffer.
00301     int utf8Length = 1;
00302 
00303     if (((startByte & 0x80) != 0x00) && ((startByte & 0x40) != 0x00)) { // test for binary 11?????? (2 bits found so far)
00304         ++utf8Length;
00305         
00306         if ((startByte & 0x20) != 0x00) { // test for binary ??1????? (3 bits found so far)
00307             ++utf8Length;
00308         
00309             if ((startByte & 0x10) != 0x00) { // test for binary ???1???? (4 bits found total)
00310                 ++utf8Length;
00311             }
00312         }
00313     }
00314     
00315     return utf8Length;
00316 }
00317 
00318 // static
00319 int VCodePoint::getUTF8LengthFromCodePointValue(int intValue) {
00320     if (intValue < 0x80) {
00321         return 1;
00322     } else if (intValue < 0x00000800) {
00323         return 2;
00324     } else if (intValue < 0x00010000) {
00325         return 3;
00326     } else if (intValue < 0x00110000) {
00327         return 4;
00328     } else {
00329         throw VRangeException(VSTRING_FORMAT("VCodePoint::getUTF8LengthFromCodePointValue() for an invalid UTF-8 code point 0x%X", intValue));
00330     }
00331 }
00332 
00333 // static
00334 bool VCodePoint::isUTF8ContinuationByte(Vu8 byteValue) {
00335     // 0xC0 mask value of 0x80 (10xxxxxx) detects UTF-8 continuation bytes; anything else is start of a character (single or multi-byte).
00336     return ((byteValue & 0xC0) == 0x80);
00337 }
00338 
00339 // static
00340 int VCodePoint::countUTF8CodePoints(const Vu8* buffer, int numBytes) {
00341     int numCodePoints = 0;
00342     int offset = 0;
00343     while (offset < numBytes) {
00344         VCodePoint cp(buffer, offset);
00345         ++numCodePoints;
00346         offset += cp.getUTF8Length();
00347     }
00348 
00349     return numCodePoints;
00350 }
00351 
00352 // static
00353 int VCodePoint::getPreviousUTF8CodePointOffset(const Vu8* buffer, int offset) {
00354     int previousOffset = offset - 1;
00355     
00356     while ((previousOffset > 0) && VCodePoint::isUTF8ContinuationByte(buffer[previousOffset])) {
00357         --previousOffset;
00358     }
00359     
00360     return previousOffset;
00361 }
00362 
00363 // static
// static
bool VCodePoint::isUTF16SurrogateCodeUnit(wchar_t codeUnit) {
    /*
    In UTF-16 two known ranges of values occupy a single code unit. Anything else uses two code units.
    The single unit ranges are:
        U+0000 to U+D7FF
        U+E000 to U+FFFF
    Therefore, only values in the remaining range indicate a lead surrogate:
        U+D800 to U+DFFF
    And anything above U+FFFF is a trail surrogate.

    NOTE(review): the (codeUnit >= 0x10000) clause can only ever be true on
    platforms where wchar_t is wider than 16 bits (e.g. 32-bit wchar_t); a
    16-bit wchar_t cannot hold such a value. It presumably exists to flag wide
    strings that store full code point values rather than true UTF-16 code
    units -- confirm against the std::wstring constructor's expectations.
    */
    return
        ((codeUnit >= 0xD800) && (codeUnit <= 0xDFFF)) ||   // lead surrogate range
        (codeUnit >= 0x10000);                              // trail surrogate range
}
00378 
00379 // static
00380 int VCodePoint::getUTF16LengthFromCodePointValue(int intValue) {
00381     if (((intValue >= 0x0000) && (intValue <= 0xD7FF)) ||
00382         ((intValue >= 0xE000) && (intValue <= 0xFFFF))) {
00383         return 1;
00384     }
00385 
00386     return 2;
00387 }
00388 
00389 void VCodePoint::_initFromUTF8Bytes(int numBytesToUse, Vu8 byte0, Vu8 byte1, Vu8 byte2, Vu8 byte3) {
00390     mIntValue = 0;
00391 
00392     if (numBytesToUse == 1) {
00393         mIntValue = byte0;
00394 
00395     } else if (numBytesToUse == 2) {
00396         mIntValue |= ((byte0 & 0x1F) << 6);
00397         mIntValue |=  (byte1 & 0x3F);
00398 
00399     } else if (numBytesToUse == 3) {
00400         mIntValue |= ((byte0 & 0x0F) << 12);
00401         mIntValue |= ((byte1 & 0x3F) << 6);
00402         mIntValue |=  (byte2 & 0x3F);
00403 
00404     } else /* numBytesToUse is 4 */ {
00405         mIntValue |= ((byte0 & 0x07) << 18);
00406         mIntValue |= ((byte1 & 0x3F) << 12);
00407         mIntValue |= ((byte2 & 0x3F) << 6);
00408         mIntValue |=  (byte3 & 0x3F);
00409     }
00410 
00411     mUTF8Length = numBytesToUse;
00412     mUTF16Length = VCodePoint::getUTF16LengthFromCodePointValue(mIntValue);
00413 }
00414 
// Decodes a UTF-16 surrogate pair into this code point, using the standard
// Unicode decomposition: the lead surrogate contributes a 5-bit "w" field
// (plane selector minus one) and 6 high bits of the offset; the trail
// surrogate contributes the low 10 bits. The resulting value is
// u * 0x10000 + x, where u = w + 1. Also caches the encoded lengths.
void VCodePoint::_initFromUTF16Surrogates(wchar_t leadSurrogate, wchar_t trailSurrogate) {
    // x = low 6 bits of the lead unit (shifted up 10) combined with the low 10 bits of the trail unit.
    int x = (leadSurrogate & ((1 << 6) -1)) << 10 | (trailSurrogate & ((1 << 10) -1));
    // w = the 5 bits of the lead unit above its low 6 payload bits.
    int w = (leadSurrogate >> 6) & ((1 << 5) - 1);
    // u = w + 1 selects the supplementary plane (adds back the 0x10000 offset).
    int u = w + 1;
    mIntValue = u << 16 | x;

    mUTF8Length = VCodePoint::getUTF8LengthFromCodePointValue(mIntValue);
    mUTF16Length = 2; // by definition: this value came from a surrogate pair
}

Copyright ©1997-2014 Trygve Isaacson. All rights reserved. This documentation was generated with Doxygen.