Asterisk - The Open Source Telephony Project GIT-master-f36a736
utf8.c
Go to the documentation of this file.
1/*
2 * Asterisk -- An open source telephony toolkit.
3 *
4 * Copyright (C) 2020, Sean Bright
5 *
6 * Sean Bright <sean.bright@gmail.com>
7 *
8 * See http://www.asterisk.org for more information about
9 * the Asterisk project. Please do not directly contact
10 * any of the maintainers of this project for assistance;
11 * the project provides a web site, mailing lists and IRC
12 * channels for your use.
13 *
14 * This program is free software, distributed under the terms of
15 * the GNU General Public License Version 2. See the LICENSE file
16 * at the top of the source tree.
17 */
18
19/*! \file
20 *
21 * \brief UTF-8 information and validation functions
22 */
23
24/*** MODULEINFO
25 <support_level>core</support_level>
26***/
27
28#include "asterisk.h"
29
30#include "asterisk/utils.h"
31#include "asterisk/utf8.h"
32#include "asterisk/test.h"
33
34/*
35 * BEGIN THIRD PARTY CODE
36 *
37 * Copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>
38 *
39 * Permission is hereby granted, free of charge, to any person obtaining a copy
40 * of this software and associated documentation files (the "Software"), to deal
41 * in the Software without restriction, including without limitation the rights
42 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
43 * copies of the Software, and to permit persons to whom the Software is
44 * furnished to do so, subject to the following conditions:
45 *
46 * The above copyright notice and this permission notice shall be included in all
47 * copies or substantial portions of the Software.
48 *
49 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
52 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
53 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
54 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
55 * SOFTWARE.
56 *
57 * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
58 */
59
60#define UTF8_ACCEPT 0
61#define UTF8_REJECT 12
62
63static const uint8_t utf8d[] = {
64 /* The first part of the table maps bytes to character classes
65 * to reduce the size of the transition table and create bitmasks. */
66 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
67 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
68 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
69 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
70 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
71 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
72 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
73 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
74
75 /* The second part is a transition table that maps a combination
76 * of a state of the automaton and a character class to a state. */
77 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
78 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
79 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
80 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
81 12,36,12,12,12,12,12,12,12,12,12,12,
82};
83
84#if 0
85/* We can bring this back if we need the codepoint? */
86static uint32_t inline decode(uint32_t *state, uint32_t *codep, uint32_t byte) {
87 uint32_t type = utf8d[byte];
88
89 *codep = (*state != UTF8_ACCEPT) ?
90 (byte & 0x3fu) | (*codep << 6) :
91 (0xff >> type) & (byte);
92
93 *state = utf8d[256 + *state + type];
94 return *state;
95}
96#endif
97
98static uint32_t inline decode(uint32_t *state, uint32_t byte) {
99 uint32_t type = utf8d[byte];
100 *state = utf8d[256 + *state + type];
101 return *state;
102}
103
104/*
105 * END THIRD PARTY CODE
106 *
107 * See copyright notice above.
108 */
109
110int ast_utf8_is_valid(const char *src)
111{
112 uint32_t state = UTF8_ACCEPT;
113
114 while (*src) {
115 decode(&state, (uint8_t) *src++);
116 }
117
118 return state == UTF8_ACCEPT;
119}
120
121int ast_utf8_is_validn(const char *src, size_t size)
122{
123 uint32_t state = UTF8_ACCEPT;
124
125 while (size && *src) {
126 decode(&state, (uint8_t) *src++);
127 size--;
128 }
129
130 return state == UTF8_ACCEPT;
131}
132
133void ast_utf8_copy_string(char *dst, const char *src, size_t size)
134{
135 uint32_t state = UTF8_ACCEPT;
136 char *last_good = dst;
137
138 ast_assert(size > 0);
139
140 while (size && *src) {
141 if (decode(&state, (uint8_t) *src) == UTF8_REJECT) {
142 /* We _could_ replace with U+FFFD and try to recover, but for now
143 * we treat this the same as if we had run out of space */
144 break;
145 }
146
147 *dst++ = *src++;
148 size--;
149
150 if (size && state == UTF8_ACCEPT) {
151 /* last_good is where we will ultimately write the 0 byte */
152 last_good = dst;
153 }
154 }
155
156 *last_good = '\0';
157}
158
159/*!
160 * \warning A UTF-8 sequence could be 1, 2, 3 or 4 bytes long depending
161 * on the first byte in the sequence. Don't try to modify this function
162 * without understanding how UTF-8 works.
163 */
164
165/*
166 * The official unicode replacement character is U+FFFD
167 * which is actually the 3 following bytes:
168 */
169#define REPL_SEQ "\xEF\xBF\xBD"
170#define REPL_SEQ_LEN 3
171
173ast_utf8_replace_invalid_chars(char *dst, size_t *dst_size, const char *src,
174 size_t src_len)
175{
177 size_t src_pos = 0;
178 size_t dst_pos = 0;
179 uint32_t prev_state = UTF8_ACCEPT;
180 uint32_t curr_state = UTF8_ACCEPT;
181 /*
182 * UTF-8 sequences can be 1 - 4 bytes in length so we
183 * have to keep track of where we are.
184 */
185 int seq_len = 0;
186
187 if (dst) {
188 memset(dst, 0, *dst_size);
189 } else {
190 *dst_size = 0;
191 }
192
193 if (!src || src_len == 0) {
195 }
196
197 for (prev_state = 0, curr_state = 0; src_pos < src_len; prev_state = curr_state, src_pos++) {
198 uint32_t rc;
199
200 rc = decode(&curr_state, (uint8_t) src[src_pos]);
201
202 if (dst && dst_pos >= *dst_size - 1) {
203 if (prev_state > UTF8_REJECT) {
204 /*
205 * We ran out of space in the middle of a possible
206 * multi-byte sequence so we have to back up and
207 * overwrite the start of the sequence with the
208 * NULL terminator.
209 */
210 dst_pos -= (seq_len - (prev_state / 36));
211 }
212 dst[dst_pos] = '\0';
213
215 }
216
217 if (rc == UTF8_ACCEPT) {
218 if (dst) {
219 dst[dst_pos] = src[src_pos];
220 }
221 dst_pos++;
222 seq_len = 0;
223 }
224
225 if (rc > UTF8_REJECT) {
226 /*
227 * We're possibly at the start of, or in the middle of,
228 * a multi-byte sequence. The curr_state will tell us how many
229 * bytes _should_ be remaining in the sequence.
230 */
231 if (prev_state == UTF8_ACCEPT) {
232 /* If the previous state was a good character then
233 * this can only be the start of a sequence
234 * which is all we care about.
235 */
236 seq_len = curr_state / 36 + 1;
237 }
238
239 if (dst) {
240 dst[dst_pos] = src[src_pos];
241 }
242 dst_pos++;
243 }
244
245 if (rc == UTF8_REJECT) {
246 /* We got at least 1 rejection so the string is invalid */
248
249 if (prev_state != UTF8_ACCEPT) {
250 /*
251 * If we were in a multi-byte sequence and this
252 * byte isn't valid at this time, we'll back
253 * the destination pointer back to the start
254 * of the now-invalid sequence and write the
255 * replacement bytes there. Then we'll
256 * process the current byte again in the next
257 * loop iteration. It may be quite valid later.
258 */
259 dst_pos -= (seq_len - (prev_state / 36));
260 src_pos--;
261 }
262 if (dst) {
263 /*
264 * If we're not just calculating the needed destination
265 * buffer space, and we don't have enough room to write
266 * the replacement sequence, terminate the output
267 * and return.
268 */
269 if (dst_pos > *dst_size - 4) {
270 dst[dst_pos] = '\0';
272 }
273 memcpy(&dst[dst_pos], REPL_SEQ, REPL_SEQ_LEN);
274 }
275 dst_pos += REPL_SEQ_LEN;
276 /* Reset the state machine */
277 curr_state = UTF8_ACCEPT;
278 }
279 }
280
281 if (curr_state != UTF8_ACCEPT) {
282 /*
283 * The input ended in the middle of a multi-byte
284 * sequence, so the partial sequence is replaced.
285 */
286 res = AST_UTF8_INVALID;
287 dst_pos -= (seq_len - (prev_state / 36));
288 if (dst) {
289 if (dst_pos > *dst_size - 4) {
290 dst[dst_pos] = '\0';
292 }
293 memcpy(&dst[dst_pos], REPL_SEQ, REPL_SEQ_LEN);
294 }
295 dst_pos += REPL_SEQ_LEN;
296 }
297
298 if (dst) {
299 dst[dst_pos] = '\0';
300 } else {
301 *dst_size = dst_pos + 1;
302 }
303
304 return res;
305}
306
308 uint32_t state;
309};
310
312{
313 struct ast_utf8_validator *tmp = ast_malloc(sizeof(*tmp));
314
315 if (!tmp) {
316 return 1;
317 }
318
319 tmp->state = UTF8_ACCEPT;
320 *validator = tmp;
321 return 0;
322}
323
325 struct ast_utf8_validator *validator)
326{
327 switch (validator->state) {
328 case UTF8_ACCEPT:
329 return AST_UTF8_VALID;
330 case UTF8_REJECT:
331 return AST_UTF8_INVALID;
332 default:
333 return AST_UTF8_UNKNOWN;
334 }
335}
336
338 struct ast_utf8_validator *validator, const char *data)
339{
340 while (*data) {
341 decode(&validator->state, (uint8_t) *data++);
342 }
343
344 return ast_utf8_validator_state(validator);
345}
346
348 struct ast_utf8_validator *validator, const char *data, size_t size)
349{
350 while (size && *data) {
351 decode(&validator->state, (uint8_t) *data++);
352 size--;
353 }
354
355 return ast_utf8_validator_state(validator);
356}
357
359{
360 validator->state = UTF8_ACCEPT;
361}
362
364{
365 ast_free(validator);
366}
367
368#ifdef TEST_FRAMEWORK
369
370#include "asterisk/json.h"
371
/*
 * Unit test for ast_utf8_is_valid() and ast_utf8_is_validn().
 *
 * On TEST_INIT the test only fills in its descriptive info and returns
 * AST_TEST_NOT_RUN; on TEST_EXECUTE it falls out of the switch and runs
 * the assertions below, returning AST_TEST_PASS if it reaches the end.
 */
372AST_TEST_DEFINE(test_utf8_is_valid)
373{
374 switch (cmd) {
375 case TEST_INIT:
376 info->name = "is_valid";
377 info->category = "/main/utf8/";
378 info->summary = "Test ast_utf8_is_valid and ast_utf8_is_validn";
379 info->description =
380 "Tests UTF-8 string validation code.";
381 return AST_TEST_NOT_RUN;
382 case TEST_EXECUTE:
383 break;
384 }
385
386 /* Valid UTF-8: plain ASCII, then 2-, 3- and 4-byte sequences */
387 ast_test_validate(test, ast_utf8_is_valid("Asterisk"));
388 ast_test_validate(test, ast_utf8_is_valid("\xce\xbb"));
389 ast_test_validate(test, ast_utf8_is_valid("\xe2\x8a\x9b"));
390 ast_test_validate(test, ast_utf8_is_valid("\xf0\x9f\x93\x9e"));
391
392 /* Valid with leading */
393 ast_test_validate(test, ast_utf8_is_valid("aaa Asterisk"));
394 ast_test_validate(test, ast_utf8_is_valid("aaa \xce\xbb"));
395 ast_test_validate(test, ast_utf8_is_valid("aaa \xe2\x8a\x9b"));
396 ast_test_validate(test, ast_utf8_is_valid("aaa \xf0\x9f\x93\x9e"));
397
398 /* Valid with trailing */
399 ast_test_validate(test, ast_utf8_is_valid("Asterisk aaa"));
400 ast_test_validate(test, ast_utf8_is_valid("\xce\xbb aaa"));
401 ast_test_validate(test, ast_utf8_is_valid("\xe2\x8a\x9b aaa"));
402 ast_test_validate(test, ast_utf8_is_valid("\xf0\x9f\x93\x9e aaa"));
403
404 /* Valid with leading and trailing */
405 ast_test_validate(test, ast_utf8_is_valid("aaa Asterisk aaa"));
406 ast_test_validate(test, ast_utf8_is_valid("aaa \xce\xbb aaa"));
407 ast_test_validate(test, ast_utf8_is_valid("aaa \xe2\x8a\x9b aaa"));
408 ast_test_validate(test, ast_utf8_is_valid("aaa \xf0\x9f\x93\x9e aaa"));
409
410 /* Valid if limited by number of bytes: each input has an invalid
 * \xff byte appended, and the size passed stops just before it. */
411 ast_test_validate(test, ast_utf8_is_validn("Asterisk" "\xff", strlen("Asterisk")));
412 ast_test_validate(test, ast_utf8_is_validn("\xce\xbb" "\xff", strlen("\xce\xbb")));
413 ast_test_validate(test, ast_utf8_is_validn("\xe2\x8a\x9b" "\xff", strlen("\xe2\x8a\x9b")));
414 ast_test_validate(test, ast_utf8_is_validn("\xf0\x9f\x93\x9e" "\xff", strlen("\xf0\x9f\x93\x9e")));
415
416 /* Invalid */
417 ast_test_validate(test, !ast_utf8_is_valid("\xc0\x8a")); /* Overlong */
418 ast_test_validate(test, !ast_utf8_is_valid("98.6\xa7")); /* 'High ASCII' */
419 ast_test_validate(test, !ast_utf8_is_valid("\xc3\x28")); /* 2-byte lead followed by ASCII */
420 ast_test_validate(test, !ast_utf8_is_valid("\xa0\xa1")); /* Continuation bytes with no lead */
421 ast_test_validate(test, !ast_utf8_is_valid("\xe2\x28\xa1")); /* 3-byte: bad second byte */
422 ast_test_validate(test, !ast_utf8_is_valid("\xe2\x82\x28")); /* 3-byte: bad third byte */
423 ast_test_validate(test, !ast_utf8_is_valid("\xf0\x28\x8c\xbc")); /* 4-byte: bad second byte */
424 ast_test_validate(test, !ast_utf8_is_valid("\xf0\x90\x28\xbc")); /* 4-byte: bad third byte */
425 ast_test_validate(test, !ast_utf8_is_valid("\xf0\x28\x8c\x28")); /* 4-byte: bad second and fourth bytes */
426
427 return AST_TEST_PASS;
428}
429
/*!
 * \brief Test helper: copy \a src into a scratch buffer of \a dst_len bytes
 *        with ast_utf8_copy_string() and compare the result to \a cmp.
 *
 * \param src     String to copy.
 * \param dst_len Size of the scratch destination buffer, in bytes.
 * \param cmp     Expected contents of the destination after the copy.
 *
 * \retval 1 the copied string equals \a cmp.
 * \retval 0 the copied string differs from \a cmp.
 */
static int test_copy_and_compare(const char *src, size_t dst_len, const char *cmp)
{
	char scratch[dst_len];
	int matches;

	ast_utf8_copy_string(scratch, src, dst_len);
	matches = !strcmp(scratch, cmp);

	return matches;
}
436
/*
 * Unit test for ast_utf8_copy_string().
 *
 * On TEST_INIT the test only fills in its descriptive info and returns
 * AST_TEST_NOT_RUN; on TEST_EXECUTE it runs the assertions below. Each
 * assertion copies a source string into a buffer of the given size via
 * test_copy_and_compare() and checks the exact resulting string.
 */
437AST_TEST_DEFINE(test_utf8_copy_string)
438{
439 switch (cmd) {
440 case TEST_INIT:
441 info->name = "copy_string";
442 info->category = "/main/utf8/";
443 info->summary = "Test ast_utf8_copy_string";
444 info->description =
445 "Tests UTF-8 string copying code.";
446 return AST_TEST_NOT_RUN;
447 case TEST_EXECUTE:
448 break;
449 }
450
 /* The size includes the terminator: a 6-byte buffer holds 5 characters. */
451 ast_test_validate(test, test_copy_and_compare("Asterisk", 6, "Aster"));
 /* 11 bytes cannot hold the 2-byte sequence plus terminator, so the
 * whole sequence is dropped rather than split; 12 bytes is enough. */
452 ast_test_validate(test, test_copy_and_compare("Asterisk \xc2\xae", 11, "Asterisk "));
453 ast_test_validate(test, test_copy_and_compare("Asterisk \xc2\xae", 12, "Asterisk \xc2\xae"));
 /* An invalid (overlong) sequence ends the copy even though space remains. */
454 ast_test_validate(test, test_copy_and_compare("Asterisk \xc0\x8a", 12, "Asterisk "));
 /* Growing the buffer one byte at a time: the leading 2-byte sequence
 * only appears once both of its bytes and the terminator fit. */
455 ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 1, ""));
456 ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 2, ""));
457 ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 3, "\xce\xbb"));
458 ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 4, "\xce\xbb "));
459 ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 5, "\xce\xbb x"));
460 ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 6, "\xce\xbb xy"));
461 ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 7, "\xce\xbb xyz"));
462
463 return AST_TEST_PASS;
464}
465
466/*
467 * Let the replace function determine how much
468 * buffer space is required for the destination.
469 */
470#define SIZE_REQUIRED 0
471/*
472 * Set the destination buffer size to the size
473 * we expect it to be. 0xDead has no meaning
474 * other than it's larger than any test needs
475 * a buffer to be.
476 */
477#define SIZE_EXPECTED 0xDead
478
479static int tracs(int run, const char *src, const char *cmp,
480 size_t dst_size, enum ast_utf8_replace_result exp_result)
481{
482 char *dst = NULL;
483 struct ast_json *blob;
485
486 if (dst_size == SIZE_REQUIRED) {
487 ast_utf8_replace_invalid_chars(dst, &dst_size, src, src ? strlen(src) : 0);
488 } else if (dst_size == SIZE_EXPECTED) {
489 dst_size = strlen(cmp) + 1;
490 }
491
492 dst = (char *)ast_alloca(dst_size);
493 result = ast_utf8_replace_invalid_chars(dst, &dst_size, src, src ? strlen(src) : 0);
494 if (result != exp_result || strcmp(dst, cmp) != 0) {
495 ast_log(LOG_ERROR, "Run: %2d Invalid result. Src: '%s', Dst: '%s', ExpDst: '%s' Result: %d ExpResult: %d\n",
496 run, src, dst, cmp, result, exp_result);
497 return 0;
498 }
499
500 /*
501 * The ultimate test: Does jansson accept the result as valid UTF-8?
502 */
503 blob = ast_json_pack("{s: s, s: s}",
504 "variable", "doesntmatter",
505 "value", dst);
506 ast_json_unref(blob);
507
508 return blob != NULL;
509}
510
511#define ATV(t, v) ast_test_validate(t, v)
512
513AST_TEST_DEFINE(test_utf8_replace_invalid_chars)
514{
515 const char *src;
516 size_t dst_size;
518 int k = 0;
519
520 switch (cmd) {
521 case TEST_INIT:
522 info->name = "replace_invalid";
523 info->category = "/main/utf8/";
524 info->summary = "Test ast_utf8_replace_invalid_chars";
525 info->description =
526 "Tests UTF-8 string copying/replacing code.";
527 return AST_TEST_NOT_RUN;
528 case TEST_EXECUTE:
529 break;
530 }
531
532/*
533 Table 3-7. Well-Formed UTF-8 Byte Sequences
534 Code Points First Second Third Fourth
535 Byte Byte Byte Byte
536 U+0000..U+007F 00..7F
537 U+0080..U+07FF C2..DF 80..BF
538 U+0800..U+0FFF E0 A0..BF 80..BF
539 U+1000..U+CFFF E1..EC 80..BF 80..BF
540 U+D000..U+D7FF ED 80..9F 80..BF
541 U+E000..U+FFFF EE..EF 80..BF 80..BF
542 U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
543 U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
544 U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
545
546 Older compilers don't support using the \uXXXX or \UXXXXXXXX
547 universal character notation so we have to manually specify
548 the byte sequences even for valid UTF-8 sequences.
549
550 These are the ones used for the tests below:
551
552 \u00B0 = \xC2\xB0
553 \u0800 = \xE0\xA0\x80
554 \uE000 = \xEE\x80\x80
555 \U00040000 = \xF1\x80\x80\x80
556*/
557
558 /*
559 * Check that NULL destination with a valid source string gives us a
560 * valid result code and buffer size = the length of the input string
561 * plus room for the NULL terminator.
562 */
563 src = "ABC\xC2\xB0xyz";
564 result = ast_utf8_replace_invalid_chars(NULL, &dst_size, src, src ? strlen(src) : 0);
565 ATV(test, result == AST_UTF8_REPLACE_VALID && dst_size == strlen(src) + 1);
566
567 /*
568 * Check that NULL destination with an invalid source string gives us an
569 * invalid result code and buffer size = the length of the input string
570 * plus room for the NULL terminator plus the 2 extra bytes needed for
571 * the one replacement character.
572 */
573 src = "ABC\xFFxyz";
574 result = ast_utf8_replace_invalid_chars(NULL, &dst_size, src, src ? strlen(src) : 0);
575 ATV(test, result == AST_UTF8_REPLACE_INVALID && dst_size == strlen(src) + 3);
576
577 /*
578 * NULL or empty input
579 */
580 ATV(test, tracs(__LINE__, NULL, "", 80, AST_UTF8_REPLACE_VALID));
581 ATV(test, tracs(__LINE__, "", "", 80, AST_UTF8_REPLACE_VALID));
582
583
584 /* Let the replace function calculate the space needed for result */
585 k = SIZE_REQUIRED;
586
587 /*
588 * Basic ASCII string
589 */
590 ATV(test, tracs(__LINE__, "ABC xyzA", "ABC xyzA", k, AST_UTF8_REPLACE_VALID));
591
592 /*
593 * Mid string.
594 */
595 /* good single sequences */
596 ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "ABC\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
597 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC\xE0\xA0\x80xyz", k, AST_UTF8_REPLACE_VALID));
598 ATV(test, tracs(__LINE__, "ABC\xF1\x80\x80\x80xyz", "ABC\xF1\x80\x80\x80xyz", k, AST_UTF8_REPLACE_VALID));
599 /* good multiple adjacent sequences */
600 ATV(test, tracs(__LINE__, "ABC\xC2\xB0\xC2\xB0xyz", "ABC\xC2\xB0\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
601 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80\xC2\xB0xyz", "ABC\xE0\xA0\x80\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
602 ATV(test, tracs(__LINE__, "ABC\xF1\x80\x80\x80\xC2\xB0xyz", "ABC\xF1\x80\x80\x80\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
603 /* Bad sequences */
604 ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
605 ATV(test, tracs(__LINE__, "ABC\xC2\xC2xyz", "ABC\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
606 ATV(test, tracs(__LINE__, "ABC\xB0xyz", "ABC\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
607 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\xC2xyz", "ABC\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
608 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\xF5xyz", "ABC\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
609 ATV(test, tracs(__LINE__, "ABC\xE0\xA0xyz", "ABC\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
610
611 /*
612 * Beginning of string.
613 */
614 /* good single sequences */
615 ATV(test, tracs(__LINE__, "\xC2\xB0xyz", "\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
616 ATV(test, tracs(__LINE__, "\xE0\xA0\x80xyz", "\xE0\xA0\x80xyz", k, AST_UTF8_REPLACE_VALID));
617 ATV(test, tracs(__LINE__, "\xF1\x80\x80\x80xyz", "\xF1\x80\x80\x80xyz", k, AST_UTF8_REPLACE_VALID));
618 /* good multiple adjacent sequences */
619 ATV(test, tracs(__LINE__, "\xC2\xB0\xC2\xB0xyz", "\xC2\xB0\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
620 ATV(test, tracs(__LINE__, "\xE0\xA0\x80\xC2\xB0xyz", "\xE0\xA0\x80\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
621 ATV(test, tracs(__LINE__, "\xF1\x80\x80\x80\xC2\xB0xyz", "\xF1\x80\x80\x80\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
622 /* Bad sequences */
623 ATV(test, tracs(__LINE__, "\xC2xyz", "\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
624 ATV(test, tracs(__LINE__, "\xC2\xC2xyz", "\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
625 ATV(test, tracs(__LINE__, "\xB0xyz", "\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
626 ATV(test, tracs(__LINE__, "\xE0\xA0\xC2xyz", "\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
627 ATV(test, tracs(__LINE__, "\xE0\xA0\xF5xyz", "\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
628 ATV(test, tracs(__LINE__, "\xE0\xA0xyz", "\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
629
630 /*
631 * End of string.
632 */
633 /* good single sequences */
634 ATV(test, tracs(__LINE__, "ABC\xC2\xB0", "ABC\xC2\xB0", k, AST_UTF8_REPLACE_VALID));
635 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80", "ABC\xE0\xA0\x80", k, AST_UTF8_REPLACE_VALID));
636 ATV(test, tracs(__LINE__, "ABC\xF1\x80\x80\x80", "ABC\xF1\x80\x80\x80", k, AST_UTF8_REPLACE_VALID));
637 /* good multiple adjacent sequences */
638 ATV(test, tracs(__LINE__, "ABC\xC2\xB0\xC2\xB0", "ABC\xC2\xB0\xC2\xB0", k, AST_UTF8_REPLACE_VALID));
639 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80\xC2\xB0", "ABC\xE0\xA0\x80\xC2\xB0", k, AST_UTF8_REPLACE_VALID));
640 ATV(test, tracs(__LINE__, "ABC\xF1\x80\x80\x80\xC2\xB0", "ABC\xF1\x80\x80\x80\xC2\xB0", k, AST_UTF8_REPLACE_VALID));
641 /* Bad sequences */
642 ATV(test, tracs(__LINE__, "ABC\xC2", "ABC\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
643 ATV(test, tracs(__LINE__, "ABC\xC2\xC2", "ABC\xEF\xBF\xBD\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
644 ATV(test, tracs(__LINE__, "ABC\xB0", "ABC\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
645 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\xC2", "ABC\xEF\xBF\xBD\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
646 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\xF5", "ABC\xEF\xBF\xBD\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
647 ATV(test, tracs(__LINE__, "ABC\xE0\xA0", "ABC\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
648
649
650 /* Force destination buffer to be only large enough to hold the expected result */
651 k = SIZE_EXPECTED;
652
653 /*
654 * Mid string.
655 */
656 /* good single sequences */
657 ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "ABC\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
658 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC\xE0\xA0\x80xyz", k, AST_UTF8_REPLACE_VALID));
659 ATV(test, tracs(__LINE__, "ABC\xF1\x80\x80\x80xyz", "ABC\xF1\x80\x80\x80xyz", k, AST_UTF8_REPLACE_VALID));
660 /* good multiple adjacent sequences */
661 ATV(test, tracs(__LINE__, "ABC\xC2\xB0\xC2\xB0xyz", "ABC\xC2\xB0\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
662 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80\xC2\xB0xyz", "ABC\xE0\xA0\x80\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
663 ATV(test, tracs(__LINE__, "ABC\xF1\x80\x80\x80\xC2\xB0xyz", "ABC\xF1\x80\x80\x80\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
664 /* Bad sequences */
665 ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
666 ATV(test, tracs(__LINE__, "ABC\xC2\xC2xyz", "ABC\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
667 ATV(test, tracs(__LINE__, "ABC\xB0xyz", "ABC\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
668 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\xC2xyz", "ABC\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
669 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\xF5xyz", "ABC\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
670 ATV(test, tracs(__LINE__, "ABC\xE0\xA0xyz", "ABC\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
671
672 /*
673 * Beginning of string.
674 */
675 /* good single sequences */
676 ATV(test, tracs(__LINE__, "\xC2\xB0xyz", "\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
677 ATV(test, tracs(__LINE__, "\xE0\xA0\x80xyz", "\xE0\xA0\x80xyz", k, AST_UTF8_REPLACE_VALID));
678 ATV(test, tracs(__LINE__, "\xF1\x80\x80\x80xyz", "\xF1\x80\x80\x80xyz", k, AST_UTF8_REPLACE_VALID));
679 /* good multiple adjacent sequences */
680 ATV(test, tracs(__LINE__, "\xC2\xB0\xC2\xB0xyz", "\xC2\xB0\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
681 ATV(test, tracs(__LINE__, "\xE0\xA0\x80\xC2\xB0xyz", "\xE0\xA0\x80\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
682 ATV(test, tracs(__LINE__, "\xF1\x80\x80\x80\xC2\xB0xyz", "\xF1\x80\x80\x80\xC2\xB0xyz", k, AST_UTF8_REPLACE_VALID));
683 /* Bad sequences */
684 ATV(test, tracs(__LINE__, "\xC2xyz", "\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
685 ATV(test, tracs(__LINE__, "\xC2\xC2xyz", "\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
686 ATV(test, tracs(__LINE__, "\xB0xyz", "\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
687 ATV(test, tracs(__LINE__, "\xE0\xA0\xC2xyz", "\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
688 ATV(test, tracs(__LINE__, "\xE0\xA0\xF5xyz", "\xEF\xBF\xBD\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
689 ATV(test, tracs(__LINE__, "\xE0\xA0xyz", "\xEF\xBF\xBDxyz", k, AST_UTF8_REPLACE_INVALID));
690
691 /*
692 * End of string.
693 */
694 /* good single sequences */
695 ATV(test, tracs(__LINE__, "ABC\xC2\xB0", "ABC\xC2\xB0", k, AST_UTF8_REPLACE_VALID));
696 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80", "ABC\xE0\xA0\x80", k, AST_UTF8_REPLACE_VALID));
697 ATV(test, tracs(__LINE__, "ABC\xF1\x80\x80\x80", "ABC\xF1\x80\x80\x80", k, AST_UTF8_REPLACE_VALID));
698 /* good multiple adjacent sequences */
699 ATV(test, tracs(__LINE__, "ABC\xC2\xB0\xC2\xB0", "ABC\xC2\xB0\xC2\xB0", k, AST_UTF8_REPLACE_VALID));
700 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80\xC2\xB0", "ABC\xE0\xA0\x80\xC2\xB0", k, AST_UTF8_REPLACE_VALID));
701 ATV(test, tracs(__LINE__, "ABC\xF1\x80\x80\x80\xC2\xB0", "ABC\xF1\x80\x80\x80\xC2\xB0", k, AST_UTF8_REPLACE_VALID));
702 /* Bad sequences */
703 ATV(test, tracs(__LINE__, "ABC\xC2", "ABC\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
704 ATV(test, tracs(__LINE__, "ABC\xC2\xC2", "ABC\xEF\xBF\xBD\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
705 ATV(test, tracs(__LINE__, "ABC\xB0", "ABC\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
706 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\xC2", "ABC\xEF\xBF\xBD\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
707 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\xF5", "ABC\xEF\xBF\xBD\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
708 ATV(test, tracs(__LINE__, "ABC\xE0\xA0", "ABC\xEF\xBF\xBD", k, AST_UTF8_REPLACE_INVALID));
709
710
711 /*
712 * Overrun Prevention
713 */
714
715 /* No frills. */
716 k = 9;
717 ATV(test, tracs(__LINE__, "ABC xyzA", "ABC xyzA", k--, AST_UTF8_REPLACE_VALID));
718 ATV(test, tracs(__LINE__, "ABC xyzA", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
719 ATV(test, tracs(__LINE__, "ABC xyzA", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
720
721 /* good single sequences */
722 k = 9; /* \xC2\xB0 needs 2 bytes */
723 ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "ABC\xC2\xB0xyz", k--, AST_UTF8_REPLACE_VALID));
724 ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "ABC\xC2\xB0xy", k--, AST_UTF8_REPLACE_OVERRUN));
725 ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "ABC\xC2\xB0x", k--, AST_UTF8_REPLACE_OVERRUN));
726 ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "ABC\xC2\xB0", k--, AST_UTF8_REPLACE_OVERRUN));
727 ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "ABC", k--, AST_UTF8_REPLACE_OVERRUN));
728 ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "ABC", k--, AST_UTF8_REPLACE_OVERRUN));
729 ATV(test, tracs(__LINE__, "ABC\xC2\xB0xyz", "AB", k--, AST_UTF8_REPLACE_OVERRUN));
730
731 k = 10; /* \xE0\xA0\x80 needs 3 bytes */
732 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC\xE0\xA0\x80xyz", k--, AST_UTF8_REPLACE_VALID));
733 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC\xE0\xA0\x80xy", k--, AST_UTF8_REPLACE_OVERRUN));
734 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC\xE0\xA0\x80x", k--, AST_UTF8_REPLACE_OVERRUN));
735 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC\xE0\xA0\x80", k--, AST_UTF8_REPLACE_OVERRUN));
736 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC", k--, AST_UTF8_REPLACE_OVERRUN));
737 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC", k--, AST_UTF8_REPLACE_OVERRUN));
738 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "ABC", k--, AST_UTF8_REPLACE_OVERRUN));
739 ATV(test, tracs(__LINE__, "ABC\xE0\xA0\x80xyz", "AB", k--, AST_UTF8_REPLACE_OVERRUN));
740
741 k = 10; /* \xEF\xBF\xBD needs 3 bytes */
742 ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC\xEF\xBF\xBDxyz", k--, AST_UTF8_REPLACE_INVALID));
743 ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC\xEF\xBF\xBDxy", k--, AST_UTF8_REPLACE_OVERRUN));
744 ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC\xEF\xBF\xBDx", k--, AST_UTF8_REPLACE_OVERRUN));
745 ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
746 ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC", k--, AST_UTF8_REPLACE_OVERRUN));
747 ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC", k--, AST_UTF8_REPLACE_OVERRUN));
748 ATV(test, tracs(__LINE__, "ABC\xC2xyz", "ABC", k--, AST_UTF8_REPLACE_OVERRUN));
749 ATV(test, tracs(__LINE__, "ABC\xC2xyz", "AB", k--, AST_UTF8_REPLACE_OVERRUN));
750
751 k = 14; /* Each \xEF\xBF\xBD needs 3 bytes */
752 ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC xyz\xEF\xBF\xBD\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_INVALID));
753 ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
754 ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
755 ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
756 ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
757 ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
758 ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
759 ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
760 ATV(test, tracs(__LINE__, "ABC xyz\xC2\xFF", "ABC x", k--, AST_UTF8_REPLACE_OVERRUN));
761
762 /*
763 * The following tests are classed as "Everything including the kitchen sink".
764 * Some tests may be redundant.
765 */
766 k = 11;
767 ATV(test, tracs(__LINE__, "ABC xyz\xFF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_INVALID));
768 ATV(test, tracs(__LINE__, "ABC xyz\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
769 ATV(test, tracs(__LINE__, "ABC xyz\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
770 ATV(test, tracs(__LINE__, "ABC xyz\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
771 ATV(test, tracs(__LINE__, "ABC xyz\xFF", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
772
773 k = 11;
774 ATV(test, tracs(__LINE__, "ABC xyz\xC2\xB0", "ABC xyz\xC2\xB0", k--, AST_UTF8_REPLACE_VALID));
775 ATV(test, tracs(__LINE__, "ABC xyz\xC2\xB0", "ABC xyz\xC2\xB0", k--, AST_UTF8_REPLACE_VALID));
776 ATV(test, tracs(__LINE__, "ABC xyz\xC2\xB0", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
777 ATV(test, tracs(__LINE__, "ABC xyz\xC2\xB0", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
778 ATV(test, tracs(__LINE__, "ABC xyz\xC2\xB0", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
779
780 k = 11;
781 ATV(test, tracs(__LINE__, "ABC xyz\xC2", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_INVALID));
782 ATV(test, tracs(__LINE__, "ABC xyz\xC2", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
783 ATV(test, tracs(__LINE__, "ABC xyz\xC2", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
784 ATV(test, tracs(__LINE__, "ABC xyz\xC2", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
785 ATV(test, tracs(__LINE__, "ABC xyz\xC2", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
786
787 k = 12;
788 ATV(test, tracs(__LINE__, "ABC xyz\xEE\x80\x80", "ABC xyz\xEE\x80\x80", k--, AST_UTF8_REPLACE_VALID));
789 ATV(test, tracs(__LINE__, "ABC xyz\xEE\x80\x80", "ABC xyz\xEE\x80\x80", k--, AST_UTF8_REPLACE_VALID));
790 ATV(test, tracs(__LINE__, "ABC xyz\xEE\x80\x80", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
791 ATV(test, tracs(__LINE__, "ABC xyz\xEE\x80\x80", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
792 ATV(test, tracs(__LINE__, "ABC xyz\xEE\x80\x80", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
793 ATV(test, tracs(__LINE__, "ABC xyz\xEE\x80\x80", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
794
795 k = 11;
796 ATV(test, tracs(__LINE__, "ABC xyz\xED", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_INVALID));
797 ATV(test, tracs(__LINE__, "ABC xyz\xED", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
798 ATV(test, tracs(__LINE__, "ABC xyz\xED", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
799 ATV(test, tracs(__LINE__, "ABC xyz\xED", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
800 ATV(test, tracs(__LINE__, "ABC xyz\xED", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
801
802 k = 14;
803 ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC xyz\xEF\xBF\xBD\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_INVALID));
804 ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
805 ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
806 ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
807 ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
808 ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
809 ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
810 ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
811 ATV(test, tracs(__LINE__, "ABC xyz\xED\xBF", "ABC x", k--, AST_UTF8_REPLACE_OVERRUN));
812
813 k = 14;
814 ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC xyz\xEF\xBF\xBD\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_INVALID));
815 ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
816 ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
817 ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
818 ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
819 ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
820 ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
821 ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
822 ATV(test, tracs(__LINE__, "ABC xyz\xED\xFF", "ABC x", k--, AST_UTF8_REPLACE_OVERRUN));
823
824 k = 14;
825 ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC xyz\xEF\xBF\xBD\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_INVALID));
826 ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
827 ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
828 ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
829 ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
830 ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
831 ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
832 ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
833 ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2", "ABC x", k--, AST_UTF8_REPLACE_OVERRUN));
834
835 k = 14;
836 ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC xyz\xEF\xBF\xBD\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_INVALID));
837 ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
838 ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
839 ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
840 ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
841 ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
842 ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
843 ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
844 ATV(test, tracs(__LINE__, "ABC xyz\xED\x80\xC0", "ABC x", k--, AST_UTF8_REPLACE_OVERRUN));
845
846 k = 13;
847 ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2\xB0", "ABC xyz\xEF\xBF\xBD\xC2\xB0", k--, AST_UTF8_REPLACE_INVALID));
848 ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2\xB0", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
849 ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2\xB0", "ABC xyz\xEF\xBF\xBD", k--, AST_UTF8_REPLACE_OVERRUN));
850 ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2\xB0", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
851 ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2\xB0", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
852 ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2\xB0", "ABC xyz", k--, AST_UTF8_REPLACE_OVERRUN));
853 ATV(test, tracs(__LINE__, "ABC xyz\xED\xC2\xB0", "ABC xy", k--, AST_UTF8_REPLACE_OVERRUN));
854
855 return AST_TEST_PASS;
856}
857
858AST_TEST_DEFINE(test_utf8_validator)
859{
860 struct ast_utf8_validator *validator;
861
862 switch (cmd) {
863 case TEST_INIT:
864 info->name = "utf8_validator";
865 info->category = "/main/utf8/";
866 info->summary = "Test ast_utf8_validator";
867 info->description =
868 "Tests UTF-8 progressive validator code.";
869 return AST_TEST_NOT_RUN;
870 case TEST_EXECUTE:
871 break;
872 }
873
874 if (ast_utf8_validator_new(&validator)) {
875 return AST_TEST_FAIL;
876 }
877
878 ast_test_validate(test, ast_utf8_validator_feed(validator, "Asterisk") == AST_UTF8_VALID);
879 ast_test_validate(test, ast_utf8_validator_feed(validator, "\xc2") == AST_UTF8_UNKNOWN);
880 ast_test_validate(test, ast_utf8_validator_feed(validator, "\xae") == AST_UTF8_VALID);
881 ast_test_validate(test, ast_utf8_validator_feed(validator, "Private") == AST_UTF8_VALID);
882 ast_test_validate(test, ast_utf8_validator_feed(validator, "Branch") == AST_UTF8_VALID);
883 ast_test_validate(test, ast_utf8_validator_feed(validator, "Exchange") == AST_UTF8_VALID);
884 ast_test_validate(test, ast_utf8_validator_feed(validator, "\xe2") == AST_UTF8_UNKNOWN);
885 ast_test_validate(test, ast_utf8_validator_feed(validator, "\x84") == AST_UTF8_UNKNOWN);
886 ast_test_validate(test, ast_utf8_validator_feed(validator, "\xbb") == AST_UTF8_VALID);
887 ast_test_validate(test, ast_utf8_validator_feed(validator, "\xc0\x8a") == AST_UTF8_INVALID);
888 ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
889 ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
890 ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
891
893
894 return AST_TEST_PASS;
895}
896
897static void test_utf8_shutdown(void)
898{
899 AST_TEST_UNREGISTER(test_utf8_is_valid);
900 AST_TEST_UNREGISTER(test_utf8_copy_string);
901 AST_TEST_UNREGISTER(test_utf8_validator);
902 AST_TEST_UNREGISTER(test_utf8_replace_invalid_chars);
903}
904
905int ast_utf8_init(void)
906{
907 AST_TEST_REGISTER(test_utf8_is_valid);
908 AST_TEST_REGISTER(test_utf8_copy_string);
909 AST_TEST_REGISTER(test_utf8_validator);
910 AST_TEST_REGISTER(test_utf8_replace_invalid_chars);
911
912 ast_register_cleanup(test_utf8_shutdown);
913
914 return 0;
915}
916
917#else /* !TEST_FRAMEWORK */
918
/*
 * Build without TEST_FRAMEWORK: there are no unit tests to register, so
 * initialization has nothing to do and always reports success (0).
 */
int ast_utf8_init(void)
{
	return 0;
}
923
924#endif
Asterisk main include file. File version handling, generic pbx functions.
int ast_register_cleanup(void(*func)(void))
Register a function to be executed before Asterisk gracefully exits.
Definition: clicompat.c:19
#define ast_alloca(size)
call __builtin_alloca to ensure we get gcc builtin semantics
Definition: astmm.h:288
#define ast_free(a)
Definition: astmm.h:180
#define ast_malloc(len)
A wrapper for malloc()
Definition: astmm.h:191
#define ast_log
Definition: astobj2.c:42
static int tmp()
Definition: bt_open.c:389
enum cc_state state
Definition: ccss.c:393
static PGresult * result
Definition: cel_pgsql.c:84
static const char type[]
Definition: chan_ooh323.c:109
#define LOG_ERROR
Asterisk JSON abstraction layer.
void ast_json_unref(struct ast_json *value)
Decrease refcount on value. If refcount reaches zero, value is freed.
Definition: json.c:73
struct ast_json * ast_json_pack(char const *format,...)
Helper for creating complex JSON values.
Definition: json.c:612
def info(msg)
#define NULL
Definition: resample.c:96
Abstract JSON element (object, array, string, int, ...).
uint32_t state
Definition: utf8.c:308
Test Framework API.
@ TEST_INIT
Definition: test.h:200
@ TEST_EXECUTE
Definition: test.h:201
#define AST_TEST_REGISTER(cb)
Definition: test.h:127
#define AST_TEST_UNREGISTER(cb)
Definition: test.h:128
#define AST_TEST_DEFINE(hdr)
Definition: test.h:126
@ AST_TEST_PASS
Definition: test.h:195
@ AST_TEST_FAIL
Definition: test.h:196
@ AST_TEST_NOT_RUN
Definition: test.h:194
enum ast_utf8_validation_result ast_utf8_validator_state(struct ast_utf8_validator *validator)
Get the current UTF-8 validator state.
Definition: utf8.c:324
int ast_utf8_init(void)
Register UTF-8 tests.
Definition: utf8.c:919
#define UTF8_REJECT
Definition: utf8.c:61
int ast_utf8_validator_new(struct ast_utf8_validator **validator)
Create a new UTF-8 validator.
Definition: utf8.c:311
#define REPL_SEQ_LEN
Definition: utf8.c:170
int ast_utf8_is_valid(const char *src)
Check if a zero-terminated string is valid UTF-8.
Definition: utf8.c:110
int ast_utf8_is_validn(const char *src, size_t size)
Check if the first size bytes of a string are valid UTF-8.
Definition: utf8.c:121
enum ast_utf8_validation_result ast_utf8_validator_feed(struct ast_utf8_validator *validator, const char *data)
Feed a zero-terminated string into the UTF-8 validator.
Definition: utf8.c:337
#define UTF8_ACCEPT
Definition: utf8.c:60
#define REPL_SEQ
Definition: utf8.c:169
void ast_utf8_validator_reset(struct ast_utf8_validator *validator)
Reset the state of a UTF-8 validator.
Definition: utf8.c:358
void ast_utf8_validator_destroy(struct ast_utf8_validator *validator)
Destroy a UTF-8 validator.
Definition: utf8.c:363
static const uint8_t utf8d[]
Definition: utf8.c:63
void ast_utf8_copy_string(char *dst, const char *src, size_t size)
Copy a string safely ensuring valid UTF-8.
Definition: utf8.c:133
static uint32_t decode(uint32_t *state, uint32_t byte)
Definition: utf8.c:98
enum ast_utf8_replace_result ast_utf8_replace_invalid_chars(char *dst, size_t *dst_size, const char *src, size_t src_len)
Copy a string safely replacing any invalid UTF-8 sequences.
Definition: utf8.c:173
enum ast_utf8_validation_result ast_utf8_validator_feedn(struct ast_utf8_validator *validator, const char *data, size_t size)
Feed a string into the UTF-8 validator.
Definition: utf8.c:347
UTF-8 information and validation functions.
ast_utf8_validation_result
Definition: utf8.h:123
@ AST_UTF8_INVALID
The consumed sequence is invalid UTF-8.
Definition: utf8.h:138
@ AST_UTF8_UNKNOWN
The validator is in an intermediate state.
Definition: utf8.h:148
@ AST_UTF8_VALID
The consumed sequence is valid UTF-8.
Definition: utf8.h:130
ast_utf8_replace_result
Definition: utf8.h:70
@ AST_UTF8_REPLACE_INVALID
Source contained at least 1 invalid UTF-8 sequence.
Definition: utf8.h:84
@ AST_UTF8_REPLACE_VALID
Source contained fully valid UTF-8.
Definition: utf8.h:76
@ AST_UTF8_REPLACE_OVERRUN
Not enough space to copy entire source.
Definition: utf8.h:93
Utility functions.
#define ast_assert(a)
Definition: utils.h:739