fix utf-8 decoder & add u32string functions
This commit is contained in:
parent
bd87a586d8
commit
4343e81e00
@ -116,20 +116,22 @@ const utf_t utf[] = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
inline uint utf8_len(ubyte cp) {
|
inline uint utf8_len(ubyte cp) {
|
||||||
uint len = 0;
|
if ((cp & 0x80) == 0) {
|
||||||
for (const utf_t* u = utf; u->mask; ++u) {
|
return 1;
|
||||||
if ((cp >= u->beg) && (cp <= u->end)) {
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
++len;
|
if ((cp & 0xE0) == 0xC0) {
|
||||||
|
return 2;
|
||||||
}
|
}
|
||||||
if (len > 4) /* Out of bounds */
|
if ((cp & 0xF0) == 0xE0) {
|
||||||
throw std::runtime_error("utf-8 decode error");
|
return 3;
|
||||||
|
}
|
||||||
return len;
|
if ((cp & 0xF8) == 0xF0) {
|
||||||
|
return 4;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
extern uint32_t util::decode_utf8(uint& size, const char* chr) {
|
uint32_t util::decode_utf8(uint& size, const char* chr) {
|
||||||
size = utf8_len(*chr);
|
size = utf8_len(*chr);
|
||||||
int shift = utf[0].bits_stored * (size - 1);
|
int shift = utf[0].bits_stored * (size - 1);
|
||||||
uint32_t code = (*chr++ & utf[size].mask) << shift;
|
uint32_t code = (*chr++ & utf[size].mask) << shift;
|
||||||
@ -145,7 +147,7 @@ size_t util::crop_utf8(std::string_view s, size_t maxSize) {
|
|||||||
size_t pos = 0;
|
size_t pos = 0;
|
||||||
uint size = 0;
|
uint size = 0;
|
||||||
while (pos < s.length()) {
|
while (pos < s.length()) {
|
||||||
decode_utf8(size, &s.at(pos));
|
decode_utf8(size, s.data() + pos);
|
||||||
if (pos + size > maxSize) {
|
if (pos + size > maxSize) {
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
@ -154,11 +156,13 @@ size_t util::crop_utf8(std::string_view s, size_t maxSize) {
|
|||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string util::wstr2str_utf8(const std::wstring& ws) {
|
template<class C>
|
||||||
|
std::string xstr2str_utf8(const std::basic_string<C>& xs) {
|
||||||
std::vector<char> chars;
|
std::vector<char> chars;
|
||||||
char buffer[4];
|
ubyte buffer[4];
|
||||||
for (wchar_t wc : ws) {
|
for (C xc : xs) {
|
||||||
uint size = encode_utf8((uint)wc, (ubyte*)buffer);
|
uint size = util::encode_utf8(
|
||||||
|
static_cast<uint>(xc), buffer);
|
||||||
for (uint i = 0; i < size; i++) {
|
for (uint i = 0; i < size; i++) {
|
||||||
chars.push_back(buffer[i]);
|
chars.push_back(buffer[i]);
|
||||||
}
|
}
|
||||||
@ -166,15 +170,32 @@ std::string util::wstr2str_utf8(const std::wstring& ws) {
|
|||||||
return std::string(chars.data(), chars.size());
|
return std::string(chars.data(), chars.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
std::wstring util::str2wstr_utf8(const std::string& s) {
|
std::string util::wstr2str_utf8(const std::wstring& ws) {
|
||||||
std::vector<wchar_t> chars;
|
return xstr2str_utf8(ws);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string util::u32str2str_utf8(const std::u32string& ws) {
|
||||||
|
return xstr2str_utf8(ws);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class C>
|
||||||
|
std::basic_string<C> str2xstr_utf8(const std::string& s) {
|
||||||
|
std::vector<C> chars;
|
||||||
size_t pos = 0;
|
size_t pos = 0;
|
||||||
uint size = 0;
|
uint size = 0;
|
||||||
while (pos < s.length()) {
|
while (pos < s.length()) {
|
||||||
chars.push_back(decode_utf8(size, &s.at(pos)));
|
chars.push_back(util::decode_utf8(size, &s.at(pos)));
|
||||||
pos += size;
|
pos += size;
|
||||||
}
|
}
|
||||||
return std::wstring(chars.data(), chars.size());
|
return std::basic_string<C>(chars.data(), chars.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
std::wstring util::str2wstr_utf8(const std::string& s) {
|
||||||
|
return str2xstr_utf8<wchar_t>(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::u32string util::str2u32str_utf8(const std::string& s) {
|
||||||
|
return str2xstr_utf8<char32_t>(s);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool util::is_integer(const std::string& text) {
|
bool util::is_integer(const std::string& text) {
|
||||||
|
|||||||
@ -23,11 +23,21 @@ namespace util {
|
|||||||
/// @return new UTF-8 encoded string
|
/// @return new UTF-8 encoded string
|
||||||
std::string wstr2str_utf8(const std::wstring& ws);
|
std::string wstr2str_utf8(const std::wstring& ws);
|
||||||
|
|
||||||
/// @brief Decode UTF
|
/// @brief Decode UTF-8 string
|
||||||
/// @param s source encoded string
|
/// @param s source encoded string
|
||||||
/// @return new raw decoded string
|
/// @return new raw decoded wstring
|
||||||
std::wstring str2wstr_utf8(const std::string& s);
|
std::wstring str2wstr_utf8(const std::string& s);
|
||||||
|
|
||||||
|
/// @brief Encode raw u32string to UTF-8
|
||||||
|
/// @param ws source raw u32string
|
||||||
|
/// @return new UTF-8 encoded string
|
||||||
|
std::string u32str2str_utf8(const std::u32string& ws);
|
||||||
|
|
||||||
|
/// @brief Decode UTF-8 string
|
||||||
|
/// @param s source encoded string
|
||||||
|
/// @return new raw decoded u32string
|
||||||
|
std::u32string str2u32str_utf8(const std::string& s);
|
||||||
|
|
||||||
/// @brief Calculated length of UTF-8 encoded string that fits into maxSize
|
/// @brief Calculated length of UTF-8 encoded string that fits into maxSize
|
||||||
/// @param s source UTF-8 encoded string view
|
/// @param s source UTF-8 encoded string view
|
||||||
/// @param maxSize max encoded string length after crop
|
/// @param maxSize max encoded string length after crop
|
||||||
|
|||||||
@ -8,3 +8,10 @@ TEST(stringutil, crop_utf8) {
|
|||||||
str = str.substr(0, util::crop_utf8(str, 7));
|
str = str.substr(0, util::crop_utf8(str, 7));
|
||||||
EXPECT_EQ(str, u8"при");
|
EXPECT_EQ(str, u8"при");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(stringutil, utf8) {
|
||||||
|
std::string str = u8"テキストデモ";
|
||||||
|
auto u32str = util::str2u32str_utf8(str);
|
||||||
|
std::string str2 = util::u32str2str_utf8(u32str);
|
||||||
|
EXPECT_EQ(str, str2);
|
||||||
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user