roo_io
API Documentation for roo_io
Loading...
Searching...
No Matches
unicode.h
Go to the documentation of this file.
1#pragma once
2
3#include <vector>
4
5#include "roo_backport.h"
6#include "roo_backport/string_view.h"
7#include "roo_io/base/byte.h"
9#include "roo_io/data/read.h"
11
12namespace roo_io {
13
15 public:
16 // Creates a decoder that will represent the specified byte array as Unicode
17 // code points.
18 Utf8Decoder(const byte *data, size_t size) : ptr_(data), end_(data + size) {}
19
20 // Creates a decoder that will represent the specified char array as Unicode
21 // code points.
22 Utf8Decoder(const char *data, size_t size)
23 : Utf8Decoder((const byte *)data, size) {}
24
25 // Creates a decoder that will represent the specified byte array as Unicode
26 // code points.
27 template <size_t N>
28 Utf8Decoder(const byte data[N]) : Utf8Decoder(data, N) {}
29
30 // Creates a decoder that will represent the specified byte array as Unicode
31 // code points.
32 template <size_t N>
33 Utf8Decoder(const char data[N]) : Utf8Decoder((const byte *)data, N) {}
34
35 // Convenience constructor that reads the input from a specifed string.
36 Utf8Decoder(roo::string_view s) : Utf8Decoder((const byte *)s.data(), s.size()) {}
37
#if __cplusplus >= 202002L
// Convenience constructor that reads the input from the specified char8_t
// string view. Only available when compiling as C++20 or newer. The char8_t
// elements are reinterpreted as raw bytes.
Utf8Decoder(std::basic_string_view<char8_t> s)
    : Utf8Decoder(reinterpret_cast<const byte *>(s.data()), s.size()) {}
#endif
43
44 const byte *data() const { return ptr_; }
45
46 bool next(char32_t &result) {
47 if (ptr_ == end_) return false;
48 ptr_ += u8c::u8next_((const char *)ptr_, (const char *)end_, result);
49 return true;
50 }
51
52 private:
53 const byte *ptr_;
54 const byte *end_;
55};
56
57template <typename OutputItr>
58void DecodeUtfString(roo::string_view s, OutputItr itr) {
60 char32_t ch;
61 while (decoder.next(ch)) *itr++ = ch;
62}
63
64inline std::vector<char32_t> DecodeUtfStringToVector(roo::string_view s) {
65 std::vector<char32_t> result;
66 DecodeUtfString(s, std::back_inserter(result));
67 return result;
68}
69
70// Writes a single Unicode code point, encoded as UTF-8, to the specified
71// iterator.
72template <typename OutputIterator>
74 if (v <= 0x7F) {
75 itr.write((byte)v);
76 } else if (v <= 0x7FF) {
77 itr.write((byte)((v >> 6) | 0xC0));
78 itr.write((byte)((v & 0x3F) | 0x80));
79 } else if (v <= 0xFFFF) {
80 itr.write((byte)((v >> 12) | 0xE0));
81 itr.write((byte)(((v >> 6) & 0x3F) | 0x80));
82 itr.write((byte)((v & 0x3F) | 0x80));
83 } else {
84 itr.write((byte)((v >> 18) | 0xF0));
85 itr.write((byte)(((v >> 12) & 0x3F) | 0x80));
86 itr.write((byte)(((v >> 6) & 0x3F) | 0x80));
87 itr.write((byte)((v & 0x3F) | 0x80));
88 }
89}
90
91// Writes the UTF-8 representation of the rune to buf. The `buf` must have
92// sufficient size (4 is always safe). Returns the number of bytes actually
93// written.
94inline int WriteUtf8Char(byte *buf, char32_t ch) {
95 if (ch <= 0x7F) {
96 buf[0] = (byte)ch;
97 return 1;
98 }
99 if (ch <= 0x7FF) {
100 buf[1] = (byte)((ch & 0x3F) | 0x80);
101 ch >>= 6;
102 buf[0] = (byte)(ch | 0xC0);
103 return 2;
104 }
105 if (ch <= 0xFFFF) {
106 buf[2] = (byte)((ch & 0x3F) | 0x80);
107 ch >>= 6;
108 buf[1] = (byte)((ch & 0x3F) | 0x80);
109 ch >>= 6;
110 buf[0] = (byte)(ch | 0xE0);
111 return 3;
112 }
113 buf[3] = (byte)((ch & 0x3F) | 0x80);
114 ch >>= 6;
115 buf[2] = (byte)((ch & 0x3F) | 0x80);
116 ch >>= 6;
117 buf[1] = (byte)((ch & 0x3F) | 0x80);
118 ch >>= 6;
119 buf[0] = (byte)(ch | 0xF0);
120 return 4;
121}
122
123inline int WriteUtf8Char(char *buf, char32_t ch) {
124 return WriteUtf8Char((byte *)buf, ch);
125}
126
127} // namespace roo_io
Utf8Decoder(const byte *data, size_t size)
Definition unicode.h:18
Utf8Decoder(const byte data[N])
Definition unicode.h:28
bool next(char32_t &result)
Definition unicode.h:46
const byte * data() const
Definition unicode.h:44
Utf8Decoder(const char *data, size_t size)
Definition unicode.h:22
Utf8Decoder(const char data[N])
Definition unicode.h:33
Utf8Decoder(roo::string_view s)
Definition unicode.h:36
Definition byte.h:6
void DecodeUtfString(roo::string_view s, OutputItr itr)
Definition unicode.h:58
roo::basic_string_view< CharT, Traits > basic_string_view
Definition string_view.h:8
void WriteUtf8Char(OutputIterator &itr, char32_t v)
Definition unicode.h:73
std::vector< char32_t > DecodeUtfStringToVector(roo::string_view s)
Definition unicode.h:64
roo::byte byte
Definition byte.h:8
size_t u8next_(const char *start, const char *end, char32_t &val)
Definition u8c.cpp:66