Function read_utf8_codepoint

Synopsis

#include <toml/parser.hpp>

std::string read_utf8_codepoint(const region &reg, const location &loc)

Description

No description yet.

Mentioned in

Source

Lines 255-310 in toml/parser.hpp.

inline std::string read_utf8_codepoint(const region& reg, const location& loc)
{
    const auto str = reg.str().substr(1);
    std::uint_least32_t codepoint;
    std::istringstream iss(str);
    iss >> std::hex >> codepoint;

    const auto to_char = [](const std::uint_least32_t i) noexcept -> char {
        const auto uc = static_cast<unsigned char>(i);
        return *reinterpret_cast<const char*>(std::addressof(uc));
    };

    std::string character;
    if(codepoint < 0x80) // U+0000 ... U+0079 ; just an ASCII.
    {
        character += static_cast<char>(codepoint);
    }
    else if(codepoint < 0x800) //U+0080 ... U+07FF
    {
        // 110yyyyx 10xxxxxx; 0x3f == 0b0011'1111
        character += to_char(0xC0| codepoint >> 6);
        character += to_char(0x80|(codepoint & 0x3F));
    }
    else if(codepoint < 0x10000) // U+0800...U+FFFF
    {
        if(0xD800 <= codepoint && codepoint <= 0xDFFF)
        {
            throw syntax_error(format_underline(
                "toml::read_utf8_codepoint: codepoints in the range "
                "[0xD800, 0xDFFF] are not valid UTF-8.", {{
                    source_location(loc), "not a valid UTF-8 codepoint"
                }}), source_location(loc));
        }
        assert(codepoint < 0xD800 || 0xDFFF < codepoint);
        // 1110yyyy 10yxxxxx 10xxxxxx
        character += to_char(0xE0| codepoint >> 12);
        character += to_char(0x80|(codepoint >> 6 & 0x3F));
        character += to_char(0x80|(codepoint      & 0x3F));
    }
    else if(codepoint < 0x110000) // U+010000 ... U+10FFFF
    {
        // 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
        character += to_char(0xF0| codepoint >> 18);
        character += to_char(0x80|(codepoint >> 12 & 0x3F));
        character += to_char(0x80|(codepoint >> 6  & 0x3F));
        character += to_char(0x80|(codepoint       & 0x3F));
    }
    else // out of UTF-8 region
    {
        throw syntax_error(format_underline("toml::read_utf8_codepoint:"
            " input codepoint is too large.",
            {{source_location(loc), "should be in [0x00..0x10FFFF]"}}),
            source_location(loc));
    }
    return character;
}





Add Discussion as Guest

Log in