LCOV - code coverage report
Current view: top level - spb - utf8.h (source / functions) Coverage Total Hit
Test: coverage.info Lines: 94.4 % 36 34
Test Date: 2026-05-30 19:41:37 Functions: 100.0 % 4 4

            Line data    Source code
       1              : 
       2              : /***************************************************************************\
       3              : * Name        : utf8                                                        *
       4              : * Description : utf8 validation and utf8 to unicode convert                 *
       5              : * Author      : antonin.kriz@gmail.com                                      *
       6              : * reference   : https://bjoern.hoehrmann.de/utf-8/decoder/dfa/              *
       7              : * ------------------------------------------------------------------------- *
       8              : * This is free software; you can redistribute it and/or modify it under the *
       9              : * terms of the MIT license. A copy of the license can be found in the file  *
      10              : * "LICENSE" at the root of this distribution.                               *
      11              : \***************************************************************************/
      12              : #pragma once
      13              : 
      14              : #include <climits>
      15              : #include <cstdint>
      16              : #include <stdexcept>
      17              : #include <string_view>
      18              : 
      19              : namespace spb::detail::utf8
      20              : {
      21              : 
      22              : // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
      23              : // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
      24              : 
// DFA state used both as the starting state and as the only accepting state:
// a byte stream is well-formed when the decoder is back in `ok` at its end.
constexpr uint8_t ok = 0;
      26              : 
      27         2003 : static auto inline decode_point(uint32_t *state, uint32_t *codep, uint8_t byte) -> uint32_t
      28              : {
      29              :     static const uint8_t utf8d[] = {
      30              :         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      31              :         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, // 00..1f
      32              :         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      33              :         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, // 20..3f
      34              :         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      35              :         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, // 40..5f
      36              :         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      37              :         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, // 60..7f
      38              :         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
      39              :         9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9, // 80..9f
      40              :         7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,
      41              :         7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7, // a0..bf
      42              :         8,   8,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
      43              :         2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   // c0..df
      44              :         0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // e0..ef
      45              :         0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // f0..ff
      46              :         0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0
      47              :         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
      48              :         1,   0,   1,   1,   1,   1,   1,   0,   1,   0,   1,   1,   1,   1,   1,   1, // s1..s2
      49              :         1,   2,   1,   1,   1,   1,   1,   2,   1,   2,   1,   1,   1,   1,   1,   1,
      50              :         1,   1,   1,   1,   1,   1,   1,   2,   1,   1,   1,   1,   1,   1,   1,   1, // s3..s4
      51              :         1,   2,   1,   1,   1,   1,   1,   1,   1,   2,   1,   1,   1,   1,   1,   1,
      52              :         1,   1,   1,   1,   1,   1,   1,   3,   1,   3,   1,   1,   1,   1,   1,   1, // s5..s6
      53              :         1,   3,   1,   1,   1,   1,   1,   3,   1,   3,   1,   1,   1,   1,   1,   1,
      54              :         1,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1, // s7..s8
      55              :     };
      56              : 
      57         2003 :     uint32_t type = utf8d[byte];
      58              : 
      59         2003 :     *codep = (*state != ok) ? (byte & 0x3fu) | (*codep << 6) : (0xff >> type) & (byte);
      60              : 
      61         2003 :     *state = utf8d[256 + *state * 16 + type];
      62         2003 :     return *state;
      63              : }
      64              : 
      65              : /**
      66              :  * @brief encode codepoint to utf8
      67              :  *
      68              :  * @param unicode codepoint
      69              :  * @param utf8 output
      70              :  * @return size of output in bytes, 0 on error
      71              :  */
      72            8 : static inline auto encode_point(uint32_t unicode, char utf8[4]) -> uint32_t
      73              : {
      74            8 :     if (unicode <= 0x7F)
      75              :     {
      76            2 :         utf8[0] = (char)unicode;
      77            2 :         return 1;
      78              :     }
      79            6 :     if (unicode <= 0x7FF)
      80              :     {
      81            2 :         utf8[0] = (char)((unicode >> 6) | 0xC0);
      82            2 :         utf8[1] = (char)((unicode & 0x3F) | 0x80);
      83            2 :         return 2;
      84              :     }
      85            4 :     if (unicode >= 0xD800 && unicode < 0xE000)
      86            0 :         return 0;
      87              : 
      88            4 :     if (unicode <= 0xFFFF)
      89              :     {
      90            2 :         utf8[0] = (char)((unicode >> 12) | 0xE0);
      91            2 :         utf8[1] = (char)(((unicode >> 6) & 0x3F) | 0x80);
      92            2 :         utf8[2] = (char)((unicode & 0x3F) | 0x80);
      93            2 :         return 3;
      94              :     }
      95            2 :     if (unicode <= 0x10FFFF)
      96              :     {
      97            2 :         utf8[0] = (char)((unicode >> 18) | 0xF0);
      98            2 :         utf8[1] = (char)(((unicode >> 12) & 0x3F) | 0x80);
      99            2 :         utf8[2] = (char)(((unicode >> 6) & 0x3F) | 0x80);
     100            2 :         utf8[3] = (char)((unicode & 0x3F) | 0x80);
     101            2 :         return 4;
     102              :     }
     103            0 :     return 0;
     104              : }
     105              : 
     106          635 : static inline auto is_valid(std::string_view str) -> bool
     107              : {
     108              :     uint32_t codepoint;
     109          635 :     uint32_t state = ok;
     110              : 
     111         2587 :     for (uint8_t c : str)
     112              :     {
     113         1952 :         decode_point(&state, &codepoint, c);
     114              :     }
     115              : 
     116          635 :     return state == ok;
     117              : }
     118              : 
     119          635 : static inline void validate(std::string_view value)
     120              : {
     121          635 :     if (!spb::detail::utf8::is_valid(std::string_view(value.data(), value.size()))) [[unlikely]]
     122            1 :         throw std::runtime_error("invalid utf8 string");
     123          634 : }
     124              : 
     125              : } // namespace spb::detail::utf8
        

Generated by: LCOV version 2.0-1