Line data Source code
1 :
2 : /***************************************************************************\
3 : * Name : utf8 *
4 : * Description : utf8 validation and utf8 to unicode convert *
5 : * Author : antonin.kriz@gmail.com *
6 : * reference : https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ *
7 : * ------------------------------------------------------------------------- *
8 : * This is free software; you can redistribute it and/or modify it under the *
9 : * terms of the MIT license. A copy of the license can be found in the file *
10 : * "LICENSE" at the root of this distribution. *
11 : \***************************************************************************/
12 : #pragma once
13 :
14 : #include <climits>
15 : #include <cstdint>
16 : #include <stdexcept>
17 : #include <string_view>
18 :
namespace spb::detail::utf8
{

// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.

/// Decoder state in which no multi-byte sequence is pending
/// (the initial state, and the state reached after every complete sequence).
constexpr uint8_t ok = 0;

/**
 * @brief advance the utf8 decoding state machine by one byte
 *
 * The lookup table has two parts:
 *  - entries [0, 256)  map every possible byte to a character class (0..11),
 *  - entries [256, ..) hold 16 entries per state, indexed by
 *    `256 + state * 16 + class`, giving the next state.
 * State 0 ( @ref ok ) means no sequence is pending. State 1 is the reject
 * state: every transition out of it leads back to 1, so once an invalid byte
 * was seen the machine stays rejected.
 *
 * @param state in/out decoder state, must be @ref ok before the first byte
 * @param codep in/out code point accumulator; it is overwritten (not read) when
 *              `*state == ok` on entry, and holds the decoded code point
 *              whenever the function returns @ref ok
 * @param byte next input byte
 * @return the new value of `*state`
 */
static inline auto decode_point( uint32_t * state, uint32_t * codep, uint8_t byte ) -> uint32_t
{
    static constexpr uint8_t utf8d[] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,// 00..1f
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,// 20..3f
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,// 40..5f
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,// 60..7f
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,// 80..9f
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,// a0..bf
        8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,// c0..df
        0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3,// e0..ef
        0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8,// f0..ff
        0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1,// s0..s0
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,// s1..s2
        1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,// s3..s4
        1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1,// s5..s6
        1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1,
        1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,// s7..s8
    };

    // character class of the incoming byte
    const uint32_t type = utf8d[ byte ];

    // Inside a sequence: append the low six payload bits of the byte.
    // At the start of a sequence: the class doubles as the number of leading
    // bits to mask off, leaving only the payload bits of the first byte.
    *codep = ( *state != ok ) ? ( byte & 0x3fu ) | ( *codep << 6 ) : ( 0xffu >> type ) & byte;

    *state = utf8d[ 256 + *state * 16 + type ];
    return *state;
}

/**
 * @brief encode codepoint to utf8
 *
 * Surrogates (0xD800..0xDFFF) and values above 0x10FFFF are refused;
 * in that case nothing is written to the output.
 *
 * @param unicode codepoint
 * @param utf8 output, only the first `return value` bytes are written
 * @return size of output in bytes, 0 on error
 */
static inline auto encode_point( uint32_t unicode, char utf8[ 4 ] ) -> uint32_t
{
    // continuation byte (10xxxxxx) carrying the six bits that start at bit `shift`
    const auto continuation = [ unicode ]( uint32_t shift ) -> char
    { return static_cast< char >( ( ( unicode >> shift ) & 0x3Fu ) | 0x80u ); };

    if( unicode <= 0x7F )
    {
        utf8[ 0 ] = static_cast< char >( unicode );
        return 1;
    }
    if( unicode <= 0x7FF )
    {
        utf8[ 0 ] = static_cast< char >( ( unicode >> 6 ) | 0xC0u );
        utf8[ 1 ] = continuation( 0 );
        return 2;
    }
    if( unicode >= 0xD800 && unicode < 0xE000 )
    {
        // surrogate halves are not encodable
        return 0;
    }
    if( unicode <= 0xFFFF )
    {
        utf8[ 0 ] = static_cast< char >( ( unicode >> 12 ) | 0xE0u );
        utf8[ 1 ] = continuation( 6 );
        utf8[ 2 ] = continuation( 0 );
        return 3;
    }
    if( unicode <= 0x10FFFF )
    {
        utf8[ 0 ] = static_cast< char >( ( unicode >> 18 ) | 0xF0u );
        utf8[ 1 ] = continuation( 12 );
        utf8[ 2 ] = continuation( 6 );
        utf8[ 3 ] = continuation( 0 );
        return 4;
    }
    return 0;
}

/**
 * @brief check that a string is a sequence of complete, valid utf8 sequences
 *
 * @param str bytes to check, an empty string is valid
 * @return true if every byte was accepted and no sequence is left unfinished
 */
static inline auto is_valid( std::string_view str ) -> bool
{
    // decode_point's reject state: all of its transitions lead back to itself,
    // so nothing after the first rejected byte can change the outcome
    constexpr uint32_t rejected = 1;

    uint32_t codepoint = 0;
    uint32_t state     = ok;

    for( const char c : str )
    {
        if( decode_point( &state, &codepoint, static_cast< uint8_t >( c ) ) == rejected )
        {
            return false;
        }
    }

    // any state other than `ok` here means the input ended inside a sequence
    return state == ok;
}

/**
 * @brief throw if a string is not valid utf8
 *
 * @param value bytes to check
 * @throws std::runtime_error ("invalid utf8 string") when @ref is_valid returns false
 */
static inline void validate( std::string_view value )
{
    if( !spb::detail::utf8::is_valid( value ) )
    {
        throw std::runtime_error( "invalid utf8 string" );
    }
}

}// namespace spb::detail::utf8
|