EHS/include/ehs/UTF.h

455 lines
17 KiB
C
Raw Normal View History

2024-02-05 22:25:30 -08:00
#pragma once
#include "EHS.h"
#include "Str.h"
namespace ehs
{
/// Enumerates the three UTF encoding forms: UTF-32, UTF-16 and UTF-8.
/// NOTE(review): nothing in this header reads this enum; presumably it is consumed
/// by other EHS code to tag how a buffer is encoded - confirm against callers.
enum class CharEncoding
{
UTF_32,
UTF_16,
UTF_8
};
/// A helper class for converting between UTF8, 16 and 32.
class UTF
{
public:
/// Converts the given UTF16 C-style string into UTF32.
/// @tparam N The number data type to use.
/// @param [in] from The given C-style UTF16 string.
/// @param [in] size The size of the given C-style UTF16 string.
/// @returns The result.
template<typename N = UInt_64>
static Str<Char_32, N> To_32(const Char_16* const from, const N size = 0)
{
Str<Char_32, N> result((size) ? size : Str<Char_16, N>::Len(from));
N index = 0;
for (N i = 0; i < result.Size(); ++i)
{
if (i != result.Size() - 1)
{
if ((from[i] & 0xDC00) == 0xDC00 && (from[i + 1] & 0xD800) == 0xD800)
{
result[index++] = (((from[i] - 0xD800) * 0x400) | (from[i] - 0xDC00)) + 0x10000;
continue;
}
}
result[index++] = (Char_32)from[i];
}
result.Resize(index);
return result;
}
/// Converts the given UTF16 string object into UTF32.
/// @tparam N The number data type to use.
/// @param [in] from The given UTF16 string.
/// @returns The result.
template<typename N = UInt_64>
static Str<Char_32, N> To_32(const Str<Char_16, N>& from)
{
Str<Char_32, N> result(from.Size());
N index = 0;
for (N i = 0; i < from.Size(); ++i)
{
if (i != from.Size() - 1)
{
if ((from[i] & 0xDC00) == 0xDC00 && (from[i + 1] & 0xD800) == 0xD800)
{
result[index++] = (((from[i] - 0xD800) * 0x400) | (from[i] - 0xDC00)) + 0x10000;
continue;
}
}
result[index++] = (Char_32)from[i];
}
result.Resize(index);
return result;
}
/// Converts the given UTF8 C-style string into UTF32.
/// @tparam N The number data type to use.
/// @param [in] from The given C-style UTF8 string.
/// @param [in] size The size of the given C-style UTF8 string.
/// @returns The result.
template<typename N = UInt_64>
static Str<Char_32, N> To_32(const Char_8* from, const N size = 0)
{
N rSize = size ? size : Str<Char_8, N>::Len(from);
Str<Char_32, N> r(rSize);
N c = 0;
for (N i = 0; i < rSize; ++i)
{
if (from[i] <= 0b11110111 && i + 3 < rSize && from[i + 1] <= 0b10111111 && from[i + 2] <= 0b10111111 && from[i + 3] <= 0b10111111)
r[c++] = (Char_32)(from[i++] & 0b00000111) << 18 | (Char_32)(from[i++] & 0b00111111) << 12 | (Char_32)(from[i++] & 0b00111111) << 6 | (Char_32)(from[i] & 0b00111111);
else if (from[i] <= 0b11101111 && i + 2 < rSize && from[i + 1] <= 0b10111111 && from[i + 2] <= 0b10111111)
r[c++] = (Char_32)(from[i++] & 0b00001111) << 12 | (Char_32)(from[i++] & 0b00111111) << 6 | ((Char_32)from[i] & 0b00111111);
else if (from[i] <= 0b11011111 && i + 1 < rSize && from[i + 1] <= 0b10111111)
r[c++] = (Char_32)(from[i++] & 0b00011111) << 6 | (Char_32)(from[i] & 0b00111111);
else
r[c++] = (Char_32)from[i];
}
r.Resize(c);
return r;
}
/// Converts the given UTF8 string object into UTF32.
/// @tparam N The number data type to use.
/// @param [in] from The given UTF8 string.
/// @returns The result.
template<typename N = UInt_64>
static Str<Char_32, N> To_32(const Str<Char_8, N>& from)
{
Str<Char_32, N> r(from.Size());
N c = 0;
for (N i = 0; i < from.Size(); ++i)
{
if (from[i] <= 0b11110111 && i + 3 < from.Size() && from[i + 1] <= 0b10111111 && from[i + 2] <= 0b10111111 && from[i + 3] <= 0b10111111)
r[c++] = (Char_32)(from[i++] & 0b00000111) << 18 | (Char_32)(from[i++] & 0b00111111) << 12 | (Char_32)(from[i++] & 0b00111111) << 6 | (Char_32)(from[i] & 0b00111111);
else if (from[i] <= 0b11101111 && i + 2 < from.Size() && from[i + 1] <= 0b10111111 && from[i + 2] <= 0b10111111)
r[c++] = (Char_32)(from[i++] & 0b00001111) << 12 | (Char_32)(from[i++] & 0b00111111) << 6 | ((Char_32)from[i] & 0b00111111);
else if (from[i] <= 0b11011111 && i + 1 < from.Size() && from[i + 1] <= 0b10111111)
r[c++] = (Char_32)(from[i++] & 0b00011111) << 6 | (Char_32)(from[i] & 0b00111111);
else
r[c++] = (Char_32)from[i];
}
r.Resize(c);
return r;
}
/// Converts the given UTF32 C-style string object into UTF16.
/// @tparam N The number data type to use.
/// @param [in] from The given UTF32 string.
/// @param [in] size The size of the give C-style UTF32 string.
/// @returns The result.
template<typename N = UInt_64>
static Str<Char_16, N> To_16(const Char_32* const from, const N size = 0)
{
N rSize = size ? size : Str<Char_32, N>::Len(from);
Str<Char_16, N> result(rSize * sizeof(Char_16));
N index = 0;
for (N i = 0; i < rSize; ++i)
{
if (from[i] <= 0xFFFF)
{
result[index++] = (Char_16)from[i];
}
else
{
Char_32 t = from[i] - 0x10000;
result[index++] |= (t >> 10) + 0xD800;
result[index++] |= t + 0xDC00;
}
}
result.Resize(index);
return result;
}
/// Converts the given UTF32 string object into UTF16.
/// @tparam N The number data type to use.
/// @param [in] from The given UTF32 string.
/// @returns The result.
template<typename N = UInt_64>
static Str<Char_16, N> To_16(const Str<Char_32, N>& from)
{
Str<Char_16, N> result(from.Size() * sizeof(Char_16));
N index = 0;
for (N i = 0; i < from.Size(); ++i)
{
if (from[i] <= 0xFFFF)
{
result[index++] = (Char_16)from[i];
}
else
{
Char_32 t = from[i] - 0x10000;
result[index++] |= (t >> 10) + 0xD800;
result[index++] |= t + 0xDC00;
}
}
result.Resize(index);
return result;
}
/// Converts the given UTF8 C-style string into UTF16.
/// @tparam N The number data type to use.
/// @param [in] from The given UTF8 C-style string.
/// @param [in] size The size of the given C-style UTF8 string.
/// @returns The result.
template<typename N = UInt_64>
static Str<Char_16, N> To_16(const Char_8* const from, const N size = 0)
{
N rSize = size ? size : Str<Char_8, N>::Len(from);
const Byte* const data = (const Byte* const)from;
Str<Char_16, N> r(rSize);
N c = 0;
for (N i = 0; i < rSize; ++i)
{
if (data[i] >= 0b11110000 && i + 3 < rSize && data[i + 1] <= 0b10111111 && data[i + 2] <= 0b10111111 && data[i + 3] <= 0b10111111)
r[c++] = (0b00000011111111110000001111111111 &
((Char_16)(data[i++] & 0b00000111) << 23) |
((Char_16)(data[i++] & 0b00111111) << 18) |
((Char_16)(data[i++] & 0b00111111) << 12) |
((Char_16)(data[i++] & 0b00111111) << 6) |
(Char_16)(data[i] & 0b00111111)) |
0b11011000000000001101110000000000;
else if (data[i] >= 0b11100000 && i + 2 < rSize && data[i + 1] <= 0b10111111 && data[i + 2] <= 0b10111111)
r[c++] = ((Char_16)(data[i++] & 0b00001111) << 12) | ((Char_16)(data[i++] & 0b00111111) << 6) | (Char_16)(data[i] & 0b00111111);
else if (data[i] >= 0b11000000 && i + 1 < rSize && data[i + 1] <= 0b10111111)
r[c++] = (Char_16)(data[i++] & 0b00011111) << 6 | (Char_16 )(data[i] & 0b00111111);
else
r[c++] = (Char_16 )data[i];
}
r.Resize(c);
return r;
}
/// Converts the given UTF8 string object into UTF16.
/// @tparam N The number data type to use.
/// @param [in] from The given UTF8 string.
/// @returns The result.
template<typename N = UInt_64>
static Str<Char_16, N> To_16(const Str<Char_8, N>& from)
{
const Byte* const data = from.ToBytes();
Str<Char_16, N> r(from.Size());
N c = 0;
for (N i = 0; i < from.Size(); ++i)
{
if (data[i] >= 0b11110000 && i + 3 < from.Size() && data[i + 1] <= 0b10111111 && data[i + 2] <= 0b10111111 && data[i + 3] <= 0b10111111)
r[c++] = (0b00000011111111110000001111111111 &
((Char_16)(data[i++] & 0b00000111) << 23) |
((Char_16)(data[i++] & 0b00111111) << 18) |
((Char_16)(data[i++] & 0b00111111) << 12) |
((Char_16)(data[i++] & 0b00111111) << 6) |
(Char_16)(data[i] & 0b00111111)) |
0b11011000000000001101110000000000;
else if (data[i] >= 0b11100000 && i + 2 < from.Size() && data[i + 1] <= 0b10111111 && data[i + 2] <= 0b10111111)
r[c++] = ((Char_16)(data[i++] & 0b00001111) << 12) | ((Char_16)(data[i++] & 0b00111111) << 6) | (Char_16)(data[i] & 0b00111111);
else if (data[i] >= 0b11000000 && i + 1 < from.Size() && data[i + 1] <= 0b10111111)
r[c++] = (Char_16)(data[i++] & 0b00011111) << 6 | (Char_16 )(data[i] & 0b00111111);
else
r[c++] = (Char_16 )data[i];
}
r.Resize(c);
return r;
}
/// Converts the given UTF16 C-style string into UTF8.
/// @tparam N The number data type to use.
/// @param [in] from The given UTF16 string.
/// @param [in] size The size of the given C-style UTF8 string.
/// @returns The result.
template<typename N = UInt_64>
static Str<Char_8, N> To_8(const Char_16* const from, const N size = 0)
{
N rSize = size ? size : Str<Char_16, N>::Len(from);
Str<Char_8, N> r(rSize * sizeof(Char_16));
N c = 0;
for (N i = 0; i < rSize; ++i)
{
if (from[i] & 0b1101100000000000 && i + 1 < rSize && from[i] & 0b1101110000000000)
{
r[c++] = ((Byte*)&from[i])[1] & 00000111 | 0b11110000;
r[c++] = ((Byte*)&from[i])[0] >> 2 & 0b00111111 | 0b10000000;
r[c++] = ((Byte*)&from[i])[0] << 4 | (((Byte*)&from[i + 1])[1] & 0b00000011) << 2 | ((Byte*)&from[i + 1])[0] >> 6 & 0b00111111 | 0b10000000;
r[c++] = ((Byte*)&from[++i])[0] & 0b00111111 | 0b10000000;
}
else if (from[i] <= 0b11111111)
{
r[c++] = (Byte)from[i];
}
else if (from[i] > 0b11111111 && from[i] <= 0b0000011111111111)
{
r[c++] = ((Byte*)&from[i])[1] << 2 | ((Byte*)&from[i])[0] >> 6 | 0b11000000;
r[c++] = ((Byte*)&from[i])[0] & 0b00111111 | 0b10000000;
}
else if (from[i] > 0b0000011111111111)
{
r[c++] = ((Byte*)&from[i])[1] >> 4 | 0b11100000;
r[c++] = ((Byte*)&from[i])[1] << 2 | ((Byte*)&from[i])[0] >> 6 & 0b00111111 | 0b10000000;
r[c++] = ((Byte*)&from[i])[0] & 0b00111111 | 0b10000000;
}
}
r.Resize(c);
return r;
}
/// Converts the given UTF16 string object into UTF8.
/// @tparam N The number data type to use.
/// @param [in] from The given UTF16 string.
/// @returns The result.
template<typename N = UInt_64>
static Str<Char_8, N> To_8(const Str<Char_16, N>& from)
{
Str<Char_8, N> r(from.Size(true) * sizeof(Char_16));
N c = 0;
for (N i = 0; i < from.Size(); ++i)
{
if (from[i] & 0b1101100000000000 && i + 1 < from.Size() && from[i] & 0b1101110000000000)
{
r[c++] = ((Byte*)&from[i])[1] & 00000111 | 0b11110000;
r[c++] = ((Byte*)&from[i])[0] >> 2 & 0b00111111 | 0b10000000;
r[c++] = ((Byte*)&from[i])[0] << 4 | (((Byte*)&from[i + 1])[1] & 0b00000011) << 2 | ((Byte*)&from[i + 1])[0] >> 6 & 0b00111111 | 0b10000000;
r[c++] = ((Byte*)&from[++i])[0] & 0b00111111 | 0b10000000;
}
else if (from[i] <= 0b11111111)
{
r[c++] = (Byte)from[i];
}
else if (from[i] > 0b11111111 && from[i] <= 0b0000011111111111)
{
r[c++] = ((Byte*)&from[i])[1] << 2 | ((Byte*)&from[i])[0] >> 6 | 0b11000000;
r[c++] = ((Byte*)&from[i])[0] & 0b00111111 | 0b10000000;
}
else if (from[i] > 0b0000011111111111)
{
r[c++] = ((Byte*)&from[i])[1] >> 4 | 0b11100000;
r[c++] = ((Byte*)&from[i])[1] << 2 | ((Byte*)&from[i])[0] >> 6 & 0b00111111 | 0b10000000;
r[c++] = ((Byte*)&from[i])[0] & 0b00111111 | 0b10000000;
}
}
r.Resize(c);
return r;
}
/// Converts the given UTF32 C-style string into UTF8.
/// @tparam N The number data type to use.
/// @param [in] from The given UTF32 string.
/// @param [in] size The size of the give C-style UTF32 string.
/// @returns The result.
template<typename N = UInt_64>
static Str<Char_8, N> To_8(const Char_32* const from, const N size = 0)
{
N rSize = size ? size : Str<Char_32, N>::Len(from);
Str<Char_8, N> r(rSize * sizeof(Char_32));
N c = 0;
for (N i = 0; i < rSize; ++i)
{
if (from[i] <= 0b11111111)
{
r[c++] = (Char_8)from[i];
}
else if (from[i] > 0b11111111 && from[i] <= 0b0000011111111111)
{
r[c++] = ((Byte*)&from[i])[1] << 2 | ((Byte*)&from[i])[0] >> 6 | 0b11000000;
r[c++] = ((Byte*)&from[i])[0] & 0b00111111 | 0b10000000;
}
else if (from[i] > 0b0000011111111111 && from[i] <= 0b1111111111111111)
{
r[c++] = ((Byte*)&from[i])[2] << 2 | ((Byte*)&from[i])[1] >> 6 | 0b11100000;
r[c++] = ((Byte*)&from[i])[1] << 2 | ((Byte*)&from[i])[0] >> 6 & 0b00111111 | 0b10000000;
r[c++] = ((Byte*)&from[i])[0] & 0b00111111 | 0b10000000;
}
else if (from[i] > 0b1111111111111111)
{
r[c++] = ((Byte*)&from[i])[3] << 2 | ((Byte*)&from[i])[3] >> 2 & 0b00000111 | 0b11110000;
r[c++] = ((Byte*)&from[i])[2] << 2 | ((Byte*)&from[i])[2] >> 6 & 0b00111111 | 0b11100000;
r[c++] = ((Byte*)&from[i])[1] << 2 | ((Byte*)&from[i])[1] >> 6 & 0b00111111 | 0b10000000;
r[c++] = ((Byte*)&from[i])[0] & 0b00111111 | 0b10000000;
}
}
r.Resize(c);
return r;
}
/// Converts the given UTF32 string object into UTF8.
/// @tparam N The number data type to use.
/// @param [in] from The given UTF32 string.
/// @returns The result.
template<typename N = UInt_64>
static Str<Char_8, N> To_8(const Str<Char_32, N>& from)
{
Str<Char_8, N> r(from.Size() * sizeof(Char_32));
N c = 0;
for (N i = 0; i < from.Size(); ++i)
{
if (from[i] <= 0b11111111)
{
r[c++] = (Char_8)from[i];
}
else if (from[i] > 0b11111111 && from[i] <= 0b0000011111111111)
{
r[c++] = ((Byte*)&from[i])[1] << 2 | ((Byte*)&from[i])[0] >> 6 | 0b11000000;
r[c++] = ((Byte*)&from[i])[0] & 0b00111111 | 0b10000000;
}
else if (from[i] > 0b0000011111111111 && from[i] <= 0b1111111111111111)
{
r[c++] = ((Byte*)&from[i])[2] << 2 | ((Byte*)&from[i])[1] >> 6 | 0b11100000;
r[c++] = ((Byte*)&from[i])[1] << 2 | ((Byte*)&from[i])[0] >> 6 & 0b00111111 | 0b10000000;
r[c++] = ((Byte*)&from[i])[0] & 0b00111111 | 0b10000000;
}
else if (from[i] > 0b1111111111111111)
{
r[c++] = ((Byte*)&from[i])[3] << 2 | ((Byte*)&from[i])[3] >> 2 & 0b00000111 | 0b11110000;
r[c++] = ((Byte*)&from[i])[2] << 2 | ((Byte*)&from[i])[2] >> 6 & 0b00111111 | 0b11100000;
r[c++] = ((Byte*)&from[i])[1] << 2 | ((Byte*)&from[i])[1] >> 6 & 0b00111111 | 0b10000000;
r[c++] = ((Byte*)&from[i])[0] & 0b00111111 | 0b10000000;
}
}
r.Resize(c);
return r;
}
};
}