Asterisk - The Open Source Telephony Project GIT-master-7921072
Data Structures | Macros | Functions | Variables
utf8.c File Reference

UTF-8 information and validation functions. More...

#include "asterisk.h"
#include "asterisk/utils.h"
#include "asterisk/utf8.h"
#include "asterisk/test.h"
Include dependency graph for utf8.c:

Go to the source code of this file.

Data Structures

struct  ast_utf8_validator
 

Macros

#define REPL_SEQ   "\xEF\xBF\xBD"
 
#define REPL_SEQ_LEN   3
 
#define UTF8_ACCEPT   0
 
#define UTF8_REJECT   12
 

Functions

void ast_utf8_copy_string (char *dst, const char *src, size_t size)
 Copy a string safely ensuring valid UTF-8. More...
 
int ast_utf8_init (void)
 Register UTF-8 tests. More...
 
int ast_utf8_is_valid (const char *src)
 Check if a zero-terminated string is valid UTF-8. More...
 
int ast_utf8_is_validn (const char *src, size_t size)
 Check if the first size bytes of a string are valid UTF-8. More...
 
enum ast_utf8_replace_result ast_utf8_replace_invalid_chars (char *dst, size_t *dst_size, const char *src, size_t src_len)
 Copy a string safely replacing any invalid UTF-8 sequences. More...
 
void ast_utf8_validator_destroy (struct ast_utf8_validator *validator)
 Destroy a UTF-8 validator. More...
 
enum ast_utf8_validation_result ast_utf8_validator_feed (struct ast_utf8_validator *validator, const char *data)
 Feed a zero-terminated string into the UTF-8 validator. More...
 
enum ast_utf8_validation_result ast_utf8_validator_feedn (struct ast_utf8_validator *validator, const char *data, size_t size)
 Feed a string into the UTF-8 validator. More...
 
int ast_utf8_validator_new (struct ast_utf8_validator **validator)
 Create a new UTF-8 validator. More...
 
void ast_utf8_validator_reset (struct ast_utf8_validator *validator)
 Reset the state of a UTF-8 validator. More...
 
enum ast_utf8_validation_result ast_utf8_validator_state (struct ast_utf8_validator *validator)
 Get the current UTF-8 validator state. More...
 
static uint32_t decode (uint32_t *state, uint32_t byte)
 

Variables

static const uint8_t utf8d []
 

Detailed Description

UTF-8 information and validation functions.

Definition in file utf8.c.

Macro Definition Documentation

◆ REPL_SEQ

#define REPL_SEQ   "\xEF\xBF\xBD"
Warning
A UTF-8 sequence can be 1, 2, 3 or 4 bytes long depending on the first byte in the sequence. Don't try to modify the function that uses this macro (ast_utf8_replace_invalid_chars()) without understanding how UTF-8 works.

Definition at line 169 of file utf8.c.

◆ REPL_SEQ_LEN

#define REPL_SEQ_LEN   3

Definition at line 170 of file utf8.c.

◆ UTF8_ACCEPT

#define UTF8_ACCEPT   0

Definition at line 60 of file utf8.c.

◆ UTF8_REJECT

#define UTF8_REJECT   12

Definition at line 61 of file utf8.c.

Function Documentation

◆ ast_utf8_copy_string()

void ast_utf8_copy_string ( char *  dst,
const char *  src,
size_t  size 
)

Copy a string safely ensuring valid UTF-8.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0

This is similar to ast_copy_string, but it will only copy valid UTF-8 sequences from the source string into the destination buffer. If an invalid UTF-8 sequence is encountered, or the available space in the destination buffer is exhausted in the middle of an otherwise valid UTF-8 sequence, the destination buffer will be truncated to ensure that it only contains valid UTF-8.

Parameters
dst: The destination buffer.
src: The source string.
size: The size of the destination buffer.

Definition at line 133 of file utf8.c.

134{
135 uint32_t state = UTF8_ACCEPT;
136 char *last_good = dst;
137
138 ast_assert(size > 0);
139
140 while (size && *src) {
141 if (decode(&state, (uint8_t) *src) == UTF8_REJECT) {
142 /* We _could_ replace with U+FFFD and try to recover, but for now
143 * we treat this the same as if we had run out of space */
144 break;
145 }
146
147 *dst++ = *src++;
148 size--;
149
150 if (size && state == UTF8_ACCEPT) {
151 /* last_good is where we will ultimately write the 0 byte */
152 last_good = dst;
153 }
154 }
155
156 *last_good = '\0';
157}
#define UTF8_REJECT
Definition: utf8.c:61
#define UTF8_ACCEPT
Definition: utf8.c:60
static uint32_t decode(uint32_t *state, uint32_t byte)
Definition: utf8.c:98
#define ast_assert(a)
Definition: utils.h:739

References ast_assert, decode(), UTF8_ACCEPT, and UTF8_REJECT.

◆ ast_utf8_init()

int ast_utf8_init ( void  )

Register UTF-8 tests.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0

Does nothing unless TEST_FRAMEWORK is defined.

Return values
0: Always

Definition at line 919 of file utf8.c.

920{
921 return 0;
922}

Referenced by asterisk_daemon().

◆ ast_utf8_is_valid()

int ast_utf8_is_valid ( const char *  str)

Check if a zero-terminated string is valid UTF-8.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0
Parameters
str: The zero-terminated string to check
Return values
0: if the string is not valid UTF-8
Non-zero: if the string is valid UTF-8

Definition at line 110 of file utf8.c.

111{
112 uint32_t state = UTF8_ACCEPT;
113
114 while (*src) {
115 decode(&state, (uint8_t) *src++);
116 }
117
118 return state == UTF8_ACCEPT;
119}

References decode(), and UTF8_ACCEPT.

◆ ast_utf8_is_validn()

int ast_utf8_is_validn ( const char *  str,
size_t  size 
)

Check if the first size bytes of a string are valid UTF-8.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0

Similar to ast_utf8_is_valid() but checks the first size bytes or until a zero byte is reached, whichever comes first.

Parameters
str: The string to check
size: The number of bytes to evaluate
Return values
0: if the string is not valid UTF-8
Non-zero: if the string is valid UTF-8

Definition at line 121 of file utf8.c.

122{
123 uint32_t state = UTF8_ACCEPT;
124
125 while (size && *src) {
126 decode(&state, (uint8_t) *src++);
127 size--;
128 }
129
130 return state == UTF8_ACCEPT;
131}

References decode(), and UTF8_ACCEPT.

◆ ast_utf8_replace_invalid_chars()

enum ast_utf8_replace_result ast_utf8_replace_invalid_chars ( char *  dst,
size_t *  dst_size,
const char *  src,
size_t  src_len 
)

Copy a string safely replacing any invalid UTF-8 sequences.

This is similar to ast_copy_string, but it will only copy valid UTF-8 sequences from the source string into the destination buffer. If an invalid sequence is encountered, it's replaced with the \uFFFD sequence which is the valid UTF-8 sequence that represents an unknown, unrecognized, or unrepresentable character. Since \uFFFD is actually a 3 byte sequence, the destination buffer will need to be larger than the corresponding source string if it contains invalid sequences. You can pass NULL as the destination buffer pointer to get the actual size required, then call the function again with the properly sized buffer.

Parameters
dst: Pointer to the destination buffer. If NULL, dst_size will be set to the size of the buffer required to fully process the source string.
dst_size: A pointer to the size of the dst buffer
src: The source string
src_len: The number of bytes to copy
Returns
ast_utf8_replace_result

Definition at line 173 of file utf8.c.

175{
177 size_t src_pos = 0;
178 size_t dst_pos = 0;
179 uint32_t prev_state = UTF8_ACCEPT;
180 uint32_t curr_state = UTF8_ACCEPT;
181 /*
182 * UTF-8 sequences can be 1 - 4 bytes in length so we
183 * have to keep track of where we are.
184 */
185 int seq_len = 0;
186
187 if (dst) {
188 memset(dst, 0, *dst_size);
189 } else {
190 *dst_size = 0;
191 }
192
193 if (!src || src_len == 0) {
195 }
196
197 for (prev_state = 0, curr_state = 0; src_pos < src_len; prev_state = curr_state, src_pos++) {
198 uint32_t rc;
199
200 rc = decode(&curr_state, (uint8_t) src[src_pos]);
201
202 if (dst && dst_pos >= *dst_size - 1) {
203 if (prev_state > UTF8_REJECT) {
204 /*
205 * We ran out of space in the middle of a possible
206 * multi-byte sequence so we have to back up and
207 * overwrite the start of the sequence with the
208 * NULL terminator.
209 */
210 dst_pos -= (seq_len - (prev_state / 36));
211 }
212 dst[dst_pos] = '\0';
213
215 }
216
217 if (rc == UTF8_ACCEPT) {
218 if (dst) {
219 dst[dst_pos] = src[src_pos];
220 }
221 dst_pos++;
222 seq_len = 0;
223 }
224
225 if (rc > UTF8_REJECT) {
226 /*
227 * We're possibly at the start of, or in the middle of,
228 * a multi-byte sequence. The curr_state will tell us how many
229 * bytes _should_ be remaining in the sequence.
230 */
231 if (prev_state == UTF8_ACCEPT) {
232 /* If the previous state was a good character then
233 * this can only be the start of s sequence
234 * which is all we care about.
235 */
236 seq_len = curr_state / 36 + 1;
237 }
238
239 if (dst) {
240 dst[dst_pos] = src[src_pos];
241 }
242 dst_pos++;
243 }
244
245 if (rc == UTF8_REJECT) {
246 /* We got at least 1 rejection so the string is invalid */
248
249 if (prev_state != UTF8_ACCEPT) {
250 /*
251 * If we were in a multi-byte sequence and this
252 * byte isn't valid at this time, we'll back
253 * the destination pointer back to the start
254 * of the now-invalid sequence and write the
255 * replacement bytes there. Then we'll
256 * process the current byte again in the next
257 * loop iteration. It may be quite valid later.
258 */
259 dst_pos -= (seq_len - (prev_state / 36));
260 src_pos--;
261 }
262 if (dst) {
263 /*
264 * If we're not just calculating the needed destination
265 * buffer space, and we don't have enough room to write
266 * the replacement sequence, terminate the output
267 * and return.
268 */
269 if (dst_pos > *dst_size - 4) {
270 dst[dst_pos] = '\0';
272 }
273 memcpy(&dst[dst_pos], REPL_SEQ, REPL_SEQ_LEN);
274 }
275 dst_pos += REPL_SEQ_LEN;
276 /* Reset the state machine */
277 curr_state = UTF8_ACCEPT;
278 }
279 }
280
281 if (curr_state != UTF8_ACCEPT) {
282 /*
283 * We were probably in the middle of a
284 * sequence and ran out of space.
285 */
286 res = AST_UTF8_INVALID;
287 dst_pos -= (seq_len - (prev_state / 36));
288 if (dst) {
289 if (dst_pos > *dst_size - 4) {
290 dst[dst_pos] = '\0';
292 }
293 memcpy(&dst[dst_pos], REPL_SEQ, REPL_SEQ_LEN);
294 }
295 dst_pos += REPL_SEQ_LEN;
296 }
297
298 if (dst) {
299 dst[dst_pos] = '\0';
300 } else {
301 *dst_size = dst_pos + 1;
302 }
303
304 return res;
305}
#define REPL_SEQ_LEN
Definition: utf8.c:170
#define REPL_SEQ
Definition: utf8.c:169
@ AST_UTF8_INVALID
The consumed sequence is invalid UTF-8.
Definition: utf8.h:138
ast_utf8_replace_result
Definition: utf8.h:70
@ AST_UTF8_REPLACE_INVALID
Source contained at least 1 invalid UTF-8 sequence.
Definition: utf8.h:84
@ AST_UTF8_REPLACE_VALID
Source contained fully valid UTF-8.
Definition: utf8.h:76
@ AST_UTF8_REPLACE_OVERRUN
Not enough space to copy entire source.
Definition: utf8.h:93

References AST_UTF8_INVALID, AST_UTF8_REPLACE_INVALID, AST_UTF8_REPLACE_OVERRUN, AST_UTF8_REPLACE_VALID, decode(), REPL_SEQ, REPL_SEQ_LEN, UTF8_ACCEPT, and UTF8_REJECT.

Referenced by ast_channel_publish_varset(), and set_id_from_hdr().

◆ ast_utf8_validator_destroy()

void ast_utf8_validator_destroy ( struct ast_utf8_validator * validator)

Destroy a UTF-8 validator.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0
Parameters
validator: The validator instance to destroy

Definition at line 363 of file utf8.c.

364{
365 ast_free(validator);
366}
#define ast_free(a)
Definition: astmm.h:180

References ast_free.

◆ ast_utf8_validator_feed()

enum ast_utf8_validation_result ast_utf8_validator_feed ( struct ast_utf8_validator * validator,
const char *  data 
)

Feed a zero-terminated string into the UTF-8 validator.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0
Parameters
validator: The validator instance
data: The zero-terminated string to feed into the validator
Returns
The ast_utf8_validation_result indicating the current state of the validator.

Definition at line 337 of file utf8.c.

339{
340 while (*data) {
341 decode(&validator->state, (uint8_t) *data++);
342 }
343
344 return ast_utf8_validator_state(validator);
345}
uint32_t state
Definition: utf8.c:308
enum ast_utf8_validation_result ast_utf8_validator_state(struct ast_utf8_validator *validator)
Get the current UTF-8 validator state.
Definition: utf8.c:324

References ast_utf8_validator_state(), decode(), and ast_utf8_validator::state.

◆ ast_utf8_validator_feedn()

enum ast_utf8_validation_result ast_utf8_validator_feedn ( struct ast_utf8_validator * validator,
const char *  data,
size_t  size 
)

Feed a string into the UTF-8 validator.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0

Similar to ast_utf8_validator_feed but will stop feeding in data if a zero byte is encountered or size bytes have been read.

Parameters
validator: The validator instance
data: The string to feed into the validator
size: The number of bytes to feed into the validator
Returns
The ast_utf8_validation_result indicating the current state of the validator.

Definition at line 347 of file utf8.c.

349{
350 while (size && *data) {
351 decode(&validator->state, (uint8_t) *data++);
352 size--;
353 }
354
355 return ast_utf8_validator_state(validator);
356}

References ast_utf8_validator_state(), decode(), and ast_utf8_validator::state.

◆ ast_utf8_validator_new()

int ast_utf8_validator_new ( struct ast_utf8_validator **  validator)

Create a new UTF-8 validator.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0
Parameters
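[out] validator: The validator instance
Return values
0: on success
-1: on failure (note: the implementation listed below returns 1, not -1, when allocation fails, so callers should test for any non-zero value)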

Definition at line 311 of file utf8.c.

312{
313 struct ast_utf8_validator *tmp = ast_malloc(sizeof(*tmp));
314
315 if (!tmp) {
316 return 1;
317 }
318
319 tmp->state = UTF8_ACCEPT;
320 *validator = tmp;
321 return 0;
322}
#define ast_malloc(len)
A wrapper for malloc()
Definition: astmm.h:191
static int tmp()
Definition: bt_open.c:389

References ast_malloc, tmp(), and UTF8_ACCEPT.

◆ ast_utf8_validator_reset()

void ast_utf8_validator_reset ( struct ast_utf8_validator * validator)

Reset the state of a UTF-8 validator.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0

Resets the provided UTF-8 validator to its initial state so that it can be reused.

Parameters
validator: The validator instance to reset

Definition at line 358 of file utf8.c.

359{
360 validator->state = UTF8_ACCEPT;
361}

References ast_utf8_validator::state, and UTF8_ACCEPT.

◆ ast_utf8_validator_state()

enum ast_utf8_validation_result ast_utf8_validator_state ( struct ast_utf8_validator * validator)

Get the current UTF-8 validator state.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0
Parameters
validator: The validator instance
Returns
The ast_utf8_validation_result indicating the current state of the validator.

Definition at line 324 of file utf8.c.

326{
327 switch (validator->state) {
328 case UTF8_ACCEPT:
329 return AST_UTF8_VALID;
330 case UTF8_REJECT:
331 return AST_UTF8_INVALID;
332 default:
333 return AST_UTF8_UNKNOWN;
334 }
335}
@ AST_UTF8_UNKNOWN
The validator is in an intermediate state.
Definition: utf8.h:148
@ AST_UTF8_VALID
The consumed sequence is valid UTF-8.
Definition: utf8.h:130

References AST_UTF8_INVALID, AST_UTF8_UNKNOWN, AST_UTF8_VALID, ast_utf8_validator::state, UTF8_ACCEPT, and UTF8_REJECT.

Referenced by ast_utf8_validator_feed(), and ast_utf8_validator_feedn().

◆ decode()

static uint32_t decode ( uint32_t *  state,
uint32_t  byte 
)
inline static

Definition at line 98 of file utf8.c.

98 {
99 uint32_t type = utf8d[byte];
100 *state = utf8d[256 + *state + type];
101 return *state;
102}
enum cc_state state
Definition: ccss.c:393
static const char type[]
Definition: chan_ooh323.c:109
static const uint8_t utf8d[]
Definition: utf8.c:63

References state, type, and utf8d.

Referenced by ast_utf8_copy_string(), ast_utf8_is_valid(), ast_utf8_is_validn(), ast_utf8_replace_invalid_chars(), ast_utf8_validator_feed(), and ast_utf8_validator_feedn().

Variable Documentation

◆ utf8d

const uint8_t utf8d[]
static

Definition at line 63 of file utf8.c.

Referenced by decode().