Asterisk - The Open Source Telephony Project GIT-master-7e7a603
utf8.h
Go to the documentation of this file.
1/*
2 * Asterisk -- An open source telephony toolkit.
3 *
4 * Copyright (C) 2020, Sean Bright
5 *
6 * Sean Bright <sean.bright@gmail.com>
7 *
8 * See http://www.asterisk.org for more information about
9 * the Asterisk project. Please do not directly contact
10 * any of the maintainers of this project for assistance;
11 * the project provides a web site, mailing lists and IRC
12 * channels for your use.
13 *
14 * This program is free software, distributed under the terms of
15 * the GNU General Public License Version 2. See the LICENSE file
16 * at the top of the source tree.
17 */
18
19/*! \file
20 *
21 * \brief UTF-8 information and validation functions
22 */
23
24#ifndef ASTERISK_UTF8_H
25#define ASTERISK_UTF8_H
26
27/*!
28 * \brief Check if a zero-terminated string is valid UTF-8
29 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
30 *
31 * \param str The zero-terminated string to check
32 *
33 * \retval 0 if the string is not valid UTF-8
34 * \retval Non-zero if the string is valid UTF-8
35 */
36int ast_utf8_is_valid(const char *str);
37
38/*!
39 * \brief Check if the first \a size bytes of a string are valid UTF-8
40 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
41 *
42 * Similar to \a ast_utf8_is_valid() but checks the first \a size bytes or until
43 * a zero byte is reached, whichever comes first.
44 *
45 * \param str The string to check
46 * \param size The number of bytes to evaluate
47 *
48 * \retval 0 if the string is not valid UTF-8
49 * \retval Non-zero if the string is valid UTF-8
50 */
51int ast_utf8_is_validn(const char *str, size_t size);
52
53/*!
54 * \brief Copy a string safely ensuring valid UTF-8
55 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
56 *
57 * This is similar to \ref ast_copy_string, but it will only copy valid UTF-8
58 * sequences from the source string into the destination buffer. If an invalid
59 * UTF-8 sequence is encountered, or the available space in the destination
60 * buffer is exhausted in the middle of an otherwise valid UTF-8 sequence, the
61 * destination buffer will be truncated to ensure that it only contains valid
62 * UTF-8.
63 *
64 * \param dst The destination buffer.
65 * \param src The source string
66 * \param size The size of the destination buffer
67 */
68void ast_utf8_copy_string(char *dst, const char *src, size_t size);
69
70enum ast_utf8_replace_result {
71 /*! \brief Source contained fully valid UTF-8
72 *
73 * The entire string was valid UTF-8 and no replacement
74 * was required.
75 */
76 AST_UTF8_REPLACE_VALID,
77
78 /*! \brief Source contained at least 1 invalid UTF-8 sequence
79 *
80 * Parts of the string contained invalid UTF-8 sequences
81 * but those were successfully replaced with the U+FFFD
82 * replacement sequence.
83 */
84 AST_UTF8_REPLACE_INVALID,
85
86 /*! \brief Not enough space to copy entire source
87 *
88 * The destination buffer wasn't large enough to copy
89 * all of the source characters. As many source characters
90 * as would fit were copied/replaced, and a final NUL
91 * terminator was added.
92 */
93 AST_UTF8_REPLACE_OVERRUN,
94};
95
96/*!
97 * \brief Copy a string safely replacing any invalid UTF-8 sequences
98 *
99 * This is similar to \ref ast_copy_string, but it will only copy valid UTF-8
100 * sequences from the source string into the destination buffer.
101 * If an invalid sequence is encountered, it's replaced with U+FFFD, the
102 * Unicode replacement character that represents an unknown, unrecognized,
103 * or unrepresentable character. Since U+FFFD is encoded in UTF-8 as a
104 * 3 byte sequence, the destination buffer will need to be larger than
105 * the corresponding source string if it contains invalid sequences.
106 * You can pass NULL as the destination buffer pointer to get the actual
107 * size required, then call the function again with the properly sized
108 * buffer.
109 *
110 * \param dst Pointer to the destination buffer. If NULL,
111 * dst_size will be set to the size of the
112 * buffer required to fully process the
113 * source string.
114 * \param dst_size A pointer to the size of the dst buffer
115 * \param src The source string
116 * \param src_len The number of bytes to copy
117 *
118 * \return \ref ast_utf8_replace_result
119 */
120enum ast_utf8_replace_result ast_utf8_replace_invalid_chars(char *dst,
121 size_t *dst_size, const char *src, size_t src_len);
122
123enum ast_utf8_validation_result {
124 /*! \brief The consumed sequence is valid UTF-8
125 *
126 * The bytes consumed thus far by the validator represent a valid sequence of
127 * UTF-8 bytes. If additional bytes are fed into the validator, it can
128 * transition into either \a AST_UTF8_INVALID or \a AST_UTF8_UNKNOWN
129 */
130 AST_UTF8_VALID,
131
132 /*! \brief The consumed sequence is invalid UTF-8
133 *
134 * The bytes consumed thus far by the validator represent an invalid sequence
135 * of UTF-8 bytes. Feeding additional bytes into the validator will not
136 * change its state.
137 */
138 AST_UTF8_INVALID,
139
140 /*! \brief The validator is in an intermediate state
141 *
142 * The validator is in the process of validating a multibyte UTF-8 sequence
143 * and requires additional data to be fed into it to determine validity. If
144 * additional bytes are fed into the validator, it can transition into either
145 * \a AST_UTF8_VALID or \a AST_UTF8_INVALID. If you have no additional data
146 * to feed into the validator the UTF-8 sequence is invalid.
147 */
148 AST_UTF8_UNKNOWN,
149};
150
151/*!
152 * \brief Opaque type for UTF-8 validator state.
153 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
154 */
155struct ast_utf8_validator;
156
157/*!
158 * \brief Create a new UTF-8 validator
159 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
160 *
161 * \param[out] validator The validator instance
162 *
163 * \retval 0 on success
164 * \retval -1 on failure
165 */
166int ast_utf8_validator_new(struct ast_utf8_validator **validator);
167
168/*!
169 * \brief Feed a zero-terminated string into the UTF-8 validator
170 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
171 *
172 * \param validator The validator instance
173 * \param data The zero-terminated string to feed into the validator
174 *
175 * \return The \ref ast_utf8_validation_result indicating the current state of
176 * the validator.
177 */
178enum ast_utf8_validation_result ast_utf8_validator_feed(
179 struct ast_utf8_validator *validator, const char *data);
180
181/*!
182 * \brief Feed a string into the UTF-8 validator
183 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
184 *
185 * Similar to \a ast_utf8_validator_feed but will stop feeding in data if a zero
186 * byte is encountered or \a size bytes have been read.
187 *
188 * \param validator The validator instance
189 * \param data The string to feed into the validator
190 * \param size The number of bytes to feed into the validator
191 *
192 * \return The \ref ast_utf8_validation_result indicating the current state of
193 * the validator.
194 */
195enum ast_utf8_validation_result ast_utf8_validator_feedn(
196 struct ast_utf8_validator *validator, const char *data, size_t size);
197
198/*!
199 * \brief Get the current UTF-8 validator state
200 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
201 *
202 * \param validator The validator instance
203 *
204 * \return The \ref ast_utf8_validation_result indicating the current state of
205 * the validator.
206 */
207enum ast_utf8_validation_result ast_utf8_validator_state(
208 struct ast_utf8_validator *validator);
209
210/*!
211 * \brief Reset the state of a UTF-8 validator
212 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
213 *
214 * Resets the provided UTF-8 validator to its initial state so that it can be
215 * reused.
216 *
217 * \param validator The validator instance to reset
218 */
219void ast_utf8_validator_reset(
220 struct ast_utf8_validator *validator);
221
222/*!
223 * \brief Destroy a UTF-8 validator
224 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
225 *
226 * \param validator The validator instance to destroy
227 */
228void ast_utf8_validator_destroy(struct ast_utf8_validator *validator);
229
230/*!
231 * \brief Register UTF-8 tests
232 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
233 *
234 * Does nothing unless TEST_FRAMEWORK is defined.
235 *
236 * \retval 0 Always
237 */
238int ast_utf8_init(void);
239
240#endif /* ASTERISK_UTF8_H */
const char * str
Definition: app_jack.c:147
enum ast_utf8_validation_result ast_utf8_validator_state(struct ast_utf8_validator *validator)
Get the current UTF-8 validator state.
Definition: utf8.c:324
int ast_utf8_init(void)
Register UTF-8 tests.
Definition: utf8.c:919
ast_utf8_validation_result
Definition: utf8.h:123
@ AST_UTF8_INVALID
The consumed sequence is invalid UTF-8.
Definition: utf8.h:138
@ AST_UTF8_UNKNOWN
The validator is in an intermediate state.
Definition: utf8.h:148
@ AST_UTF8_VALID
The consumed sequence is valid UTF-8.
Definition: utf8.h:130
int ast_utf8_validator_new(struct ast_utf8_validator **validator)
Create a new UTF-8 validator.
Definition: utf8.c:311
int ast_utf8_is_valid(const char *str)
Check if a zero-terminated string is valid UTF-8.
Definition: utf8.c:110
enum ast_utf8_validation_result ast_utf8_validator_feed(struct ast_utf8_validator *validator, const char *data)
Feed a zero-terminated string into the UTF-8 validator.
Definition: utf8.c:337
ast_utf8_replace_result
Definition: utf8.h:70
@ AST_UTF8_REPLACE_INVALID
Source contained at least 1 invalid UTF-8 sequence.
Definition: utf8.h:84
@ AST_UTF8_REPLACE_VALID
Source contained fully valid UTF-8.
Definition: utf8.h:76
@ AST_UTF8_REPLACE_OVERRUN
Not enough space to copy entire source.
Definition: utf8.h:93
int ast_utf8_is_validn(const char *str, size_t size)
Check if the first size bytes of a string are valid UTF-8.
Definition: utf8.c:121
void ast_utf8_validator_reset(struct ast_utf8_validator *validator)
Reset the state of a UTF-8 validator.
Definition: utf8.c:358
void ast_utf8_validator_destroy(struct ast_utf8_validator *validator)
Destroy a UTF-8 validator.
Definition: utf8.c:363
void ast_utf8_copy_string(char *dst, const char *src, size_t size)
Copy a string safely ensuring valid UTF-8.
Definition: utf8.c:133
enum ast_utf8_replace_result ast_utf8_replace_invalid_chars(char *dst, size_t *dst_size, const char *src, size_t src_len)
Copy a string safely replacing any invalid UTF-8 sequences.
Definition: utf8.c:173
enum ast_utf8_validation_result ast_utf8_validator_feedn(struct ast_utf8_validator *validator, const char *data, size_t size)
Feed a string into the UTF-8 validator.
Definition: utf8.c:347