// Fragment of the ISO-Latin-1 to UTF-8 conversion; only part of the body is visible here.
// Read the current input byte as an unsigned value so the shift and mask below
// cannot be affected by a negative (signed) char.
24 const auto ch =
static_cast<unsigned char>(*in);
// Two-byte UTF-8 form of ch, first byte: the bits of ch above the low six,
// placed under the 0xC0 (110xxxxx) lead-byte marker.
// NOTE(review): the condition selecting this two-byte path is not visible in this excerpt.
29 result.
append(
static_cast<char>((ch >> 6) | 0xc0));
// Second byte: the low six bits of ch under the 0x80 (10xxxxxx) continuation marker.
30 result.
append(
static_cast<char>((ch & 0x3f) | 0x80));
// Fragment of the CP1251 to UTF-8 conversion; only part of the body is visible here.
// Lead-byte marker indexed by the number of bytes in the encoded sequence
// (see the sequence[0] assignment below): 0xC0 for two bytes, 0xE0 for three.
// Slots 0 and 1 contribute no marker bits.
39 static const unsigned char firstByteMark[] = { 0x00, 0x00, 0xC0, 0xE0 };
// Unicode code points for 64 consecutive input byte values, looked up below as
// unicodevalues[ch - 0x80], i.e. slot 0 corresponds to input byte 0x80.
// Slot 0x18 (input byte 0x98) holds 0xFFFD, the Unicode replacement character.
// NOTE(review): the table terminator and the handling of bytes outside this
// table's range are not visible in this excerpt.
40 static const unsigned unicodevalues[] = {
41 0x0402, 0x0403, 0x201A, 0x0453, 0x201E, 0x2026, 0x2020, 0x2021,
42 0x20AC, 0x2030, 0x0409, 0x2039, 0x040A, 0x040C, 0x040B, 0x040F,
43 0x0452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
44 0xFFFD, 0x2122, 0x0459, 0x203A, 0x045A, 0x045C, 0x045B, 0x045F,
45 0x00A0, 0x040E, 0x045E, 0x0408, 0x00A4, 0x0490, 0x00A6, 0x00A7,
46 0x0401, 0x00A9, 0x0404, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0407,
47 0x00B0, 0x00B1, 0x0406, 0x0456, 0x0491, 0x00B5, 0x00B6, 0x00B7,
48 0x0451, 0x2116, 0x0454, 0x00BB, 0x0458, 0x0405, 0x0455, 0x0457
// Current input byte, taken as unsigned so it can be used for table indexing.
56 const auto ch =
static_cast<unsigned char>(*in);
// Length of the UTF-8 sequence to emit for this byte, and the buffer it is built in.
58 size_t bytesToWrite = 0;
59 char sequence[4] = {0, 0, 0, 0};
// NOTE(review): tail of a check whose beginning is not visible here; presumably
// a static_assert on the width of char — confirm against the full file.
62 "we require char to be exactly 8 bits");
// Map the input byte to its Unicode code point via the table above.
// NOTE(review): the guard that keeps (ch - 0x80) inside the table is not visible here.
68 u = unicodevalues[ch - 0x80];
// The sequence is filled from its last byte towards its first.
// NOTE(review): the case labels and any statements between the assignments
// below (presumably shifting u right by six bits per step) are not visible.
77 switch (bytesToWrite) {
// Continuation byte: low six bits of u combined with the 0x80 (10xxxxxx) marker.
// The cast binds tighter than '|', so the OR is evaluated as int and narrowed on assignment.
79 sequence[2] =
static_cast<char>(u & 0x3f) | 0x80;
// Same continuation-byte encoding for the preceding position.
83 sequence[1] =
static_cast<char>(u & 0x3f) | 0x80;
// Lead byte: the remaining bits of u combined with the length-dependent marker.
87 sequence[0] =
static_cast<char>(u) | firstByteMark[bytesToWrite];
// Append the bytesToWrite encoded bytes to the output.
89 result.
append(sequence, bytesToWrite);
// Fragment classifying a byte b0 by its high-order bit pattern, tested in this order:
//   (b0 & 0x80) == 0     -> 0xxxxxxx, a single-byte (ASCII) value
//   (b0 & 0xC0) != 0xC0  -> 10xxxxxx, a continuation byte rather than a lead byte
//   (b0 & 0xE0) == 0xC0  -> 110xxxxx, lead byte of a two-byte sequence
//   (b0 & 0xF0) == 0xE0  -> 1110xxxx, lead byte of a three-byte sequence
//   (b0 & 0xF8) == 0xF0  -> 11110xxx, lead byte of a four-byte sequence
// NOTE(review): the statement executed for each test (presumably returning the
// sequence length) is not visible in this excerpt — confirm against the full file.
103 if ((b0 & 0x80) == 0)
105 if ((b0 & 0xC0) != 0xC0)
107 if ((b0 & 0xE0) == 0xC0)
109 if ((b0 & 0xF0) == 0xE0)
111 if ((b0 & 0xF8) == 0xF0)
// Fragment validating the bytes of one UTF-8 sequence from its end towards its start.
// Start one past the last byte; each check below pre-decrements to the byte it tests.
126 const unsigned char* srcptr = source + length;
// Trailing byte must be a continuation byte in 0x80..0xBF.
132 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
return false;
// Same range requirement for the byte before it.
135 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
return false;
// Only the upper bound is checked here; the lower bound for this byte is
// refined by the range tests that follow.
138 if ((a = (*--srcptr)) > 0xBF)
return false;
// NOTE(review): the labels selecting among the next five tests are not visible
// in this excerpt. The bounds match the well-formed UTF-8 second-byte ranges
// (lead 0xE0: 0xA0..0xBF, lead 0xED: 0x80..0x9F, lead 0xF0: 0x90..0xBF,
// lead 0xF4: 0x80..0x8F, otherwise 0x80..0xBF) — confirm against the full file.
143 if (a < 0xA0)
return false;
146 if (a > 0x9F)
return false;
149 if (a < 0x90)
return false;
152 if (a > 0x8F)
return false;
155 if (a < 0x80)
return false;
// Reject a first byte in 0x80..0xC1: continuation bytes (0x80..0xBF) cannot start
// a sequence, and 0xC0/0xC1 could only start an overlong two-byte encoding.
161 if (*source >= 0x80 && *source < 0xC2)
return false;
// Fragment of the loop that walks the input from source up to (not including) sourceEnd.
173 while (source < sourceEnd) {
// Condition is true when the sequence would run past sourceEnd or when its
// bytes fail isValidUtf8CodePoint(); the length check comes first so the
// validator is not called on a truncated sequence.
// NOTE(review): how length is computed and what happens when this condition
// holds are not visible in this excerpt.
175 if (source + length > sourceEnd || !
isValidUtf8CodePoint(
reinterpret_cast<const unsigned char*
>(source), length))
SBuf & append(const SBuf &S)
A const & max(A const &lhs, A const &rhs)
static size_t utf8CodePointLength(const char b0)
SBuf Cp1251ToUtf8(const char *in)
converts CP1251 to UTF-8
SBuf Latin1ToUtf8(const char *in)
converts ISO-LATIN-1 to UTF-8
static bool isValidUtf8CodePoint(const unsigned char *source, const size_t length)
bool isValidUtf8String(const char *source, const char *sourceEnd)
returns whether the given input is a valid (or empty) sequence of UTF-8 code points