Line data Source code
1 :
2 : /***************************************************************************\
3 : * Name : utf8 *
4 : * Description : utf8 validation and utf8 to unicode convert *
5 : * Author : antonin.kriz@gmail.com *
6 : * reference : https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ *
7 : * ------------------------------------------------------------------------- *
8 : * This is free software; you can redistribute it and/or modify it under the *
9 : * terms of the MIT license. A copy of the license can be found in the file *
10 : * "LICENSE" at the root of this distribution. *
11 : \***************************************************************************/
12 : #pragma once
13 :
14 : #include <climits>
15 : #include <cstdint>
16 : #include <stdexcept>
17 : #include <string_view>
18 :
19 : namespace spb::detail::utf8
20 : {
21 :
22 : // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
23 : // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
24 :
constexpr uint8_t ok = 0;

/**
 * @brief feed one byte into the table-driven UTF-8 decoding automaton
 *
 * The lookup table has two parts: the first 256 entries map an input byte to
 * its character class, the entries from offset 256 on are the transition
 * table, laid out as 16 entries per state.
 *
 * @param state automaton state, updated in place; a new sequence starts with `ok`
 * @param codep code point accumulator, updated in place; per the referenced
 *              decoder design it holds the decoded code point once the state
 *              is back to `ok`
 * @param byte next input byte
 * @return the new state: `ok` when a complete sequence was consumed, 1 when
 *         the input was rejected (the row for state 1 only leads back to 1),
 *         any other value when more bytes are expected
 */
static auto inline decode_point(uint32_t *state, uint32_t *codep, uint8_t byte) -> uint32_t
{
    static const uint8_t utf8d[] = {
        // byte -> character class
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1f
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3f
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5f
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7f
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9f
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..bf
        8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..df
        0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // e0..ef
        0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // f0..ff
        // (state, character class) -> next state, 16 entries per state
        0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2
        1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4
        1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6
        1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1,
        1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // s7..s8
    };

    const uint32_t char_class = utf8d[byte];

    if (*state == ok)
    {
        // start of a sequence: keep only the payload bits left after the
        // class-dependent prefix is masked away
        *codep = (0xff >> char_class) & (byte);
    }
    else
    {
        // inside a sequence: append the low six bits of the byte
        *codep = (byte & 0x3fu) | (*codep << 6);
    }

    const uint32_t next_state = utf8d[256 + *state * 16 + char_class];
    *state = next_state;
    return next_state;
}
64 :
/**
 * @brief serialize a Unicode code point as a UTF-8 byte sequence
 *
 * @param unicode code point to encode
 * @param utf8 output buffer with room for 4 bytes; only the returned number
 *             of leading bytes is written, nothing is written on error
 * @return number of bytes written (1..4), or 0 when the code point is a
 *         surrogate (0xD800..0xDFFF) or greater than 0x10FFFF
 */
static inline auto encode_point(uint32_t unicode, char utf8[4]) -> uint32_t
{
    const bool is_surrogate = unicode >= 0xD800 && unicode <= 0xDFFF;
    if (is_surrogate || unicode > 0x10FFFF)
    {
        return 0;
    }

    // continuation byte: 0b10xxxxxx carrying the low six bits of `bits`
    const auto continuation = [](uint32_t bits) -> char
    {
        return static_cast<char>((bits & 0x3F) | 0x80);
    };

    if (unicode < 0x80)
    {
        utf8[0] = static_cast<char>(unicode);
        return 1;
    }

    if (unicode < 0x800)
    {
        utf8[0] = static_cast<char>((unicode >> 6) | 0xC0);
        utf8[1] = continuation(unicode);
        return 2;
    }

    if (unicode < 0x10000)
    {
        utf8[0] = static_cast<char>((unicode >> 12) | 0xE0);
        utf8[1] = continuation(unicode >> 6);
        utf8[2] = continuation(unicode);
        return 3;
    }

    utf8[0] = static_cast<char>((unicode >> 18) | 0xF0);
    utf8[1] = continuation(unicode >> 12);
    utf8[2] = continuation(unicode >> 6);
    utf8[3] = continuation(unicode);
    return 4;
}
105 :
106 635 : static inline auto is_valid(std::string_view str) -> bool
107 : {
108 : uint32_t codepoint;
109 635 : uint32_t state = ok;
110 :
111 2587 : for (uint8_t c : str)
112 : {
113 1952 : decode_point(&state, &codepoint, c);
114 : }
115 :
116 635 : return state == ok;
117 : }
118 :
119 635 : static inline void validate(std::string_view value)
120 : {
121 635 : if (!spb::detail::utf8::is_valid(std::string_view(value.data(), value.size()))) [[unlikely]]
122 1 : throw std::runtime_error("invalid utf8 string");
123 634 : }
124 :
125 : } // namespace spb::detail::utf8
|