LCOV - code coverage report
Current view: top level - spb - utf8.h (source / functions) Coverage Total Hit
Test: coverage.info Lines: 94.4 % 36 34
Test Date: 2025-05-23 14:18:13 Functions: 100.0 % 4 4

            Line data    Source code
       1              : 
       2              : /***************************************************************************\
       3              : * Name        : utf8                                                        *
       4              : * Description : utf8 validation and utf8 to unicode convert                 *
       5              : * Author      : antonin.kriz@gmail.com                                      *
       6              : * reference   : https://bjoern.hoehrmann.de/utf-8/decoder/dfa/              *
       7              : * ------------------------------------------------------------------------- *
       8              : * This is free software; you can redistribute it and/or modify it under the *
       9              : * terms of the MIT license. A copy of the license can be found in the file  *
      10              : * "LICENSE" at the root of this distribution.                               *
      11              : \***************************************************************************/
      12              : #pragma once
      13              : 
      14              : #include <climits>
      15              : #include <cstdint>
      16              : #include <stdexcept>
      17              : #include <string_view>
      18              : 
      19              : namespace spb::detail::utf8
      20              : {
      21              : 
      22              : // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
      23              : // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
      24              : 
      25              : constexpr uint8_t ok = 0;
      26              : 
      27         1395 : static auto inline decode_point( uint32_t * state, uint32_t * codep, uint8_t byte ) -> uint32_t
      28              : {
      29              :     static const uint8_t utf8d[] = {
      30              :         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      31              :         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,// 00..1f
      32              :         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      33              :         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,// 20..3f
      34              :         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      35              :         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,// 40..5f
      36              :         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      37              :         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,// 60..7f
      38              :         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
      39              :         9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,// 80..9f
      40              :         7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,
      41              :         7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,// a0..bf
      42              :         8,   8,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
      43              :         2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,  // c0..df
      44              :         0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3,// e0..ef
      45              :         0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8,// f0..ff
      46              :         0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1,// s0..s0
      47              :         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
      48              :         1,   0,   1,   1,   1,   1,   1,   0,   1,   0,   1,   1,   1,   1,   1,   1,// s1..s2
      49              :         1,   2,   1,   1,   1,   1,   1,   2,   1,   2,   1,   1,   1,   1,   1,   1,
      50              :         1,   1,   1,   1,   1,   1,   1,   2,   1,   1,   1,   1,   1,   1,   1,   1,// s3..s4
      51              :         1,   2,   1,   1,   1,   1,   1,   1,   1,   2,   1,   1,   1,   1,   1,   1,
      52              :         1,   1,   1,   1,   1,   1,   1,   3,   1,   3,   1,   1,   1,   1,   1,   1,// s5..s6
      53              :         1,   3,   1,   1,   1,   1,   1,   3,   1,   3,   1,   1,   1,   1,   1,   1,
      54              :         1,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,// s7..s8
      55              :     };
      56              : 
      57         1395 :     uint32_t type = utf8d[ byte ];
      58              : 
      59         1395 :     *codep = ( *state != ok ) ? ( byte & 0x3fu ) | ( *codep << 6 ) : ( 0xff >> type ) & ( byte );
      60              : 
      61         1395 :     *state = utf8d[ 256 + *state * 16 + type ];
      62         1395 :     return *state;
      63              : }
      64              : 
      65              : /**
      66              :  * @brief encode codepoint to utf8
      67              :  *
      68              :  * @param unicode codepoint
      69              :  * @param utf8 output
      70              :  * @return size of output in bytes, 0 on error
      71              :  */
      72            8 : static inline auto encode_point( uint32_t unicode, char utf8[ 4 ] ) -> uint32_t
      73              : {
      74            8 :     if( unicode <= 0x7F )
      75              :     {
      76            2 :         utf8[ 0 ] = ( char ) unicode;
      77            2 :         return 1;
      78              :     }
      79            6 :     if( unicode <= 0x7FF )
      80              :     {
      81            2 :         utf8[ 0 ] = ( char ) ( ( unicode >> 6 ) | 0xC0 );
      82            2 :         utf8[ 1 ] = ( char ) ( ( unicode & 0x3F ) | 0x80 );
      83            2 :         return 2;
      84              :     }
      85            4 :     if( unicode >= 0xD800 && unicode < 0xE000 )
      86              :     {
      87            0 :         return 0;
      88              :     }
      89            4 :     if( unicode <= 0xFFFF )
      90              :     {
      91            2 :         utf8[ 0 ] = ( char ) ( ( unicode >> 12 ) | 0xE0 );
      92            2 :         utf8[ 1 ] = ( char ) ( ( ( unicode >> 6 ) & 0x3F ) | 0x80 );
      93            2 :         utf8[ 2 ] = ( char ) ( ( unicode & 0x3F ) | 0x80 );
      94            2 :         return 3;
      95              :     }
      96            2 :     if( unicode <= 0x10FFFF )
      97              :     {
      98            2 :         utf8[ 0 ] = ( char ) ( ( unicode >> 18 ) | 0xF0 );
      99            2 :         utf8[ 1 ] = ( char ) ( ( ( unicode >> 12 ) & 0x3F ) | 0x80 );
     100            2 :         utf8[ 2 ] = ( char ) ( ( ( unicode >> 6 ) & 0x3F ) | 0x80 );
     101            2 :         utf8[ 3 ] = ( char ) ( ( unicode & 0x3F ) | 0x80 );
     102            2 :         return 4;
     103              :     }
     104            0 :     return 0;
     105              : }
     106              : 
     107          211 : static inline auto is_valid( std::string_view str ) -> bool
     108              : {
     109              :     uint32_t codepoint;
     110          211 :     uint32_t state = ok;
     111              : 
     112         1555 :     for( uint8_t c : str )
     113              :     {
     114         1344 :         decode_point( &state, &codepoint, c );
     115              :     }
     116              : 
     117          211 :     return state == ok;
     118              : }
     119              : 
     120          211 : static inline void validate( std::string_view value )
     121              : {
     122          211 :     if( !spb::detail::utf8::is_valid( std::string_view( value.data( ), value.size( ) ) ) )
     123              :     {
     124            1 :         throw std::runtime_error( "invalid utf8 string" );
     125              :     }
     126          210 : }
     127              : 
     128              : }// namespace spb::detail::utf8
        

Generated by: LCOV version 2.0-1