UTF-8 information and validation functions. More...

#include "asterisk.h"
#include "asterisk/utils.h"
#include "asterisk/utf8.h"
#include "asterisk/test.h"

Include dependency graph for utf8.c:

Data Structures
struct	ast_utf8_validator

Macros
#define	REPL_SEQ "\xEF\xBF\xBD"

#define	REPL_SEQ_LEN 3

#define	UTF8_ACCEPT 0

#define	UTF8_REJECT 12

Functions
void	ast_utf8_copy_string (char *dst, const char *src, size_t size)
	Copy a string safely ensuring valid UTF-8. More...

int	ast_utf8_init (void)
	Register UTF-8 tests. More...

int	ast_utf8_is_valid (const char *src)
	Check if a zero-terminated string is valid UTF-8. More...

int	ast_utf8_is_validn (const char *src, size_t size)
	Check if the first size bytes of a string are valid UTF-8. More...

enum ast_utf8_replace_result	ast_utf8_replace_invalid_chars (char *dst, size_t *dst_size, const char *src, size_t src_len)
	Copy a string safely replacing any invalid UTF-8 sequences. More...

void	ast_utf8_validator_destroy (struct ast_utf8_validator *validator)
	Destroy a UTF-8 validator. More...

enum ast_utf8_validation_result	ast_utf8_validator_feed (struct ast_utf8_validator *validator, const char *data)
	Feed a zero-terminated string into the UTF-8 validator. More...

enum ast_utf8_validation_result	ast_utf8_validator_feedn (struct ast_utf8_validator *validator, const char *data, size_t size)
	Feed a string into the UTF-8 validator. More...

int	ast_utf8_validator_new (struct ast_utf8_validator **validator)
	Create a new UTF-8 validator. More...

void	ast_utf8_validator_reset (struct ast_utf8_validator *validator)
	Reset the state of a UTF-8 validator. More...

enum ast_utf8_validation_result	ast_utf8_validator_state (struct ast_utf8_validator *validator)
	Get the current UTF-8 validator state. More...

static uint32_t	decode (uint32_t *state, uint32_t byte)

Variables
static const uint8_t	utf8d []

Detailed Description

UTF-8 information and validation functions.

Definition in file utf8.c.

Macro Definition Documentation

◆ REPL_SEQ

#define REPL_SEQ "\xEF\xBF\xBD"

Warning: A UTF-8 sequence can be 1, 2, 3 or 4 bytes long depending on the first byte in the sequence. Don't try to modify the function that uses this replacement sequence, ast_utf8_replace_invalid_chars(), without understanding how UTF-8 works.

Definition at line 169 of file utf8.c.

◆ REPL_SEQ_LEN

#define REPL_SEQ_LEN 3

Definition at line 170 of file utf8.c.

◆ UTF8_ACCEPT

#define UTF8_ACCEPT 0

Definition at line 60 of file utf8.c.

◆ UTF8_REJECT

#define UTF8_REJECT 12

Definition at line 61 of file utf8.c.

Function Documentation

◆ ast_utf8_copy_string()

void ast_utf8_copy_string	(	char *	dst,
		const char *	src,
		size_t	size
	)

Copy a string safely ensuring valid UTF-8.

Since: 13.36.0, 16.13.0, 17.7.0, 18.0.0

This is similar to ast_copy_string, but it will only copy valid UTF-8 sequences from the source string into the destination buffer. If an invalid UTF-8 sequence is encountered, or the available space in the destination buffer is exhausted in the middle of an otherwise valid UTF-8 sequence, the destination buffer will be truncated to ensure that it only contains valid UTF-8.

Parameters

dst	The destination buffer.
src	The source string
size	The size of the destination buffer

Definition at line 133 of file utf8.c.

{
    /* Decoder state carried from byte to byte. */
    uint32_t utf8_state = UTF8_ACCEPT;
    /*
     * Where the terminating 0 byte will finally be written.  It only
     * advances when the decoder is back in UTF8_ACCEPT, so a sequence
     * that is cut short never survives in the output.
     */
    char *terminator = dst;

    ast_assert(size > 0);

    for (; size != 0 && *src != '\0'; size--) {
        decode(&utf8_state, (uint8_t) *src);
        if (utf8_state == UTF8_REJECT) {
            /*
             * Invalid input is handled exactly like running out of
             * room: stop and truncate.  No U+FFFD substitution is
             * attempted here.
             */
            break;
        }

        *dst = *src;
        dst++;
        src++;

        /*
         * size has not been decremented yet for this byte, so
         * "size > 1" means a byte is still free after this copy.
         */
        if (size > 1 && utf8_state == UTF8_ACCEPT) {
            terminator = dst;
        }
    }

    *terminator = '\0';
}

References ast_assert, decode(), UTF8_ACCEPT, and UTF8_REJECT.

◆ ast_utf8_init()

int ast_utf8_init ( void )

Register UTF-8 tests.

Since: 13.36.0, 16.13.0, 17.7.0, 18.0.0

Does nothing unless TEST_FRAMEWORK is defined.

Return values

0 Always

Definition at line 919 of file utf8.c.

{
    /* Nothing to register in this variant of the function; always report success. */
    return 0;
}

Referenced by asterisk_daemon().

◆ ast_utf8_is_valid()

int ast_utf8_is_valid ( const char * str )

Check if a zero-terminated string is valid UTF-8.

Since: 13.36.0, 16.13.0, 17.7.0, 18.0.0

Parameters

str	The zero-terminated string to check

Return values

0	if the string is not valid UTF-8
Non-zero	if the string is valid UTF-8

Definition at line 110 of file utf8.c.

{
    uint32_t utf8_state = UTF8_ACCEPT;
    const char *cursor;

    /* Run every byte up to (not including) the terminator through the decoder. */
    for (cursor = src; *cursor != '\0'; cursor++) {
        decode(&utf8_state, (uint8_t) *cursor);
    }

    /* Non-zero only when the decoder finished in the accepting state. */
    return utf8_state == UTF8_ACCEPT;
}

References decode(), and UTF8_ACCEPT.

◆ ast_utf8_is_validn()

int ast_utf8_is_validn	(	const char *	str,
		size_t	size
	)

Check if the first size bytes of a string are valid UTF-8.

Since: 13.36.0, 16.13.0, 17.7.0, 18.0.0

Similar to ast_utf8_is_valid() but checks the first size bytes or until a zero byte is reached, whichever comes first.

Parameters

str	The string to check
size	The number of bytes to evaluate

Return values

0	if the string is not valid UTF-8
Non-zero	if the string is valid UTF-8

Definition at line 121 of file utf8.c.

{
    uint32_t utf8_state = UTF8_ACCEPT;
    size_t idx;

    /*
     * Stop at whichever comes first: size bytes consumed or a zero byte.
     * The bound is tested before the byte is read, so nothing is read
     * from src when size is 0.
     */
    for (idx = 0; idx < size && src[idx] != '\0'; idx++) {
        decode(&utf8_state, (uint8_t) src[idx]);
    }

    /* Non-zero only when the decoder finished in the accepting state. */
    return utf8_state == UTF8_ACCEPT;
}

References decode(), and UTF8_ACCEPT.

◆ ast_utf8_replace_invalid_chars()

enum ast_utf8_replace_result ast_utf8_replace_invalid_chars	(	char *	dst,
		size_t *	dst_size,
		const char *	src,
		size_t	src_len
	)

Copy a string safely replacing any invalid UTF-8 sequences.

This is similar to ast_copy_string, but it will only copy valid UTF-8 sequences from the source string into the destination buffer. If an invalid sequence is encountered, it's replaced with the \uFFFD sequence which is the valid UTF-8 sequence that represents an unknown, unrecognized, or unrepresentable character. Since \uFFFD is actually a 3 byte sequence, the destination buffer will need to be larger than the corresponding source string if it contains invalid sequences. You can pass NULL as the destination buffer pointer to get the actual size required, then call the function again with the properly sized buffer.

Parameters

dst	Pointer to the destination buffer. If NULL, dst_size will be set to the size of the buffer required to fully process the source string.
dst_size	A pointer to the size of the dst buffer
src	The source string
src_len	The number of bytes to copy

Returns: ast_utf8_replace_result

Definition at line 173 of file utf8.c.

{
    /*
     * Copy src into dst, substituting REPL_SEQ (U+FFFD) for every invalid
     * UTF-8 sequence.
     *
     * When dst is NULL nothing is written; instead *dst_size is set to the
     * number of bytes (including the terminating 0 byte) needed to hold
     * the fully processed string.
     *
     * Returns AST_UTF8_REPLACE_VALID when no replacement was needed,
     * AST_UTF8_REPLACE_INVALID when at least one sequence was replaced,
     * and AST_UTF8_REPLACE_OVERRUN when dst was too small; in that case
     * the output is terminated at the last position that fit.
     *
     * Warning: a UTF-8 sequence can be 1, 2, 3 or 4 bytes long depending
     * on its first byte.  Don't modify this function without understanding
     * how UTF-8 works.
     */
    enum ast_utf8_replace_result res = AST_UTF8_REPLACE_VALID;
    size_t src_pos = 0;
    size_t dst_pos = 0;
    uint32_t prev_state = UTF8_ACCEPT;
    uint32_t curr_state = UTF8_ACCEPT;
    /*
     * UTF-8 sequences can be 1 - 4 bytes in length so we
     * have to keep track of where we are.
     */
    int seq_len = 0;

    if (dst) {
        memset(dst, 0, *dst_size);
    } else {
        *dst_size = 0;
    }

    if (!src || src_len == 0) {
        return AST_UTF8_REPLACE_VALID;
    }

    if (dst && *dst_size == 0) {
        /*
         * There is input to copy but not even room for the terminator.
         * Nothing can be written, so report the overrun without touching
         * dst.
         */
        return AST_UTF8_REPLACE_OVERRUN;
    }

    for (; src_pos < src_len; prev_state = curr_state, src_pos++) {
        uint32_t rc;

        rc = decode(&curr_state, (uint8_t) src[src_pos]);

        /*
         * Written as an addition rather than "*dst_size - 1" so the
         * unsigned arithmetic can never wrap around.
         */
        if (dst && dst_pos + 1 >= *dst_size) {
            if (prev_state > UTF8_REJECT) {
                /*
                 * We ran out of space in the middle of a possible
                 * multi-byte sequence so we have to back up and
                 * overwrite the start of the sequence with the
                 * NULL terminator.
                 */
                dst_pos -= (seq_len - (prev_state / 36));
            }
            dst[dst_pos] = '\0';

            return AST_UTF8_REPLACE_OVERRUN;
        }

        if (rc == UTF8_ACCEPT) {
            if (dst) {
                dst[dst_pos] = src[src_pos];
            }
            dst_pos++;
            seq_len = 0;
        }

        if (rc > UTF8_REJECT) {
            /*
             * We're possibly at the start of, or in the middle of,
             * a multi-byte sequence. The curr_state will tell us how many
             * bytes _should_ be remaining in the sequence.
             */
            if (prev_state == UTF8_ACCEPT) {
                /* If the previous state was a good character then
                 * this can only be the start of a sequence
                 * which is all we care about.
                 */
                seq_len = curr_state / 36 + 1;
            }

            if (dst) {
                dst[dst_pos] = src[src_pos];
            }
            dst_pos++;
        }

        if (rc == UTF8_REJECT) {
            /* We got at least 1 rejection so the string is invalid */
            res = AST_UTF8_REPLACE_INVALID;

            if (prev_state != UTF8_ACCEPT) {
                /*
                 * If we were in a multi-byte sequence and this
                 * byte isn't valid at this time, we'll back
                 * the destination pointer back to the start
                 * of the now-invalid sequence and write the
                 * replacement bytes there.  Then we'll
                 * process the current byte again in the next
                 * loop iteration.  It may be quite valid later.
                 */
                dst_pos -= (seq_len - (prev_state / 36));
                src_pos--;
            }
            if (dst) {
                /*
                 * If we're not just calculating the needed destination
                 * buffer space, and we don't have enough room to write
                 * the replacement sequence plus the terminator,
                 * terminate the output and return.  The comparison is
                 * an addition so that buffers smaller than 4 bytes
                 * cannot wrap the unsigned subtraction and slip past
                 * the check.
                 */
                if (dst_pos + REPL_SEQ_LEN >= *dst_size) {
                    dst[dst_pos] = '\0';
                    return AST_UTF8_REPLACE_OVERRUN;
                }
                memcpy(&dst[dst_pos], REPL_SEQ, REPL_SEQ_LEN);
            }
            dst_pos += REPL_SEQ_LEN;
            /* Reset the state machine */
            curr_state = UTF8_ACCEPT;
        }
    }

    if (curr_state != UTF8_ACCEPT) {
        /*
         * The input ended in the middle of a sequence.  Replace the
         * partial sequence that was already copied.
         */
        res = AST_UTF8_REPLACE_INVALID;
        dst_pos -= (seq_len - (prev_state / 36));
        if (dst) {
            /* Same wrap-free room check as inside the loop. */
            if (dst_pos + REPL_SEQ_LEN >= *dst_size) {
                dst[dst_pos] = '\0';
                return AST_UTF8_REPLACE_OVERRUN;
            }
            memcpy(&dst[dst_pos], REPL_SEQ, REPL_SEQ_LEN);
        }
        dst_pos += REPL_SEQ_LEN;
    }

    if (dst) {
        dst[dst_pos] = '\0';
    } else {
        /* Sizing call: report the bytes needed, terminator included. */
        *dst_size = dst_pos + 1;
    }

    return res;
}

References AST_UTF8_INVALID, AST_UTF8_REPLACE_INVALID, AST_UTF8_REPLACE_OVERRUN, AST_UTF8_REPLACE_VALID, decode(), REPL_SEQ, REPL_SEQ_LEN, UTF8_ACCEPT, and UTF8_REJECT.

Referenced by ast_channel_publish_varset(), and set_id_from_hdr().

◆ ast_utf8_validator_destroy()

void ast_utf8_validator_destroy ( struct ast_utf8_validator * validator )

Destroy a UTF-8 validator.

Since: 13.36.0, 16.13.0, 17.7.0, 18.0.0

Parameters

validator The validator instance to destroy

Definition at line 363 of file utf8.c.

{
    /* ast_utf8_validator_new() makes a single ast_malloc() allocation, so one ast_free() releases it. */
    ast_free(validator);
}

References ast_free.

◆ ast_utf8_validator_feed()

enum ast_utf8_validation_result ast_utf8_validator_feed	(	struct ast_utf8_validator *	validator,
		const char *	data
	)

Feed a zero-terminated string into the UTF-8 validator.

Since: 13.36.0, 16.13.0, 17.7.0, 18.0.0

Parameters

validator	The validator instance
data	The zero-terminated string to feed into the validator

Returns: The ast_utf8_validation_result indicating the current state of the validator.

Definition at line 337 of file utf8.c.

{
    const char *cursor;

    /*
     * Advance the validator's stored state one byte at a time until the
     * terminating zero byte.  The state is not reset first, so it carries
     * over from any earlier feed.
     */
    for (cursor = data; *cursor != '\0'; cursor++) {
        decode(&validator->state, (uint8_t) *cursor);
    }

    /* Translate the raw decoder state into the public result enum. */
    return ast_utf8_validator_state(validator);
}

References ast_utf8_validator_state(), decode(), and ast_utf8_validator::state.

◆ ast_utf8_validator_feedn()

enum ast_utf8_validation_result ast_utf8_validator_feedn	(	struct ast_utf8_validator *	validator,
		const char *	data,
		size_t	size
	)

Feed a string into the UTF-8 validator.

Since: 13.36.0, 16.13.0, 17.7.0, 18.0.0

Similar to ast_utf8_validator_feed but will stop feeding in data if a zero byte is encountered or size bytes have been read.

Parameters

validator	The validator instance
data	The string to feed into the validator
size	The number of bytes to feed into the validator

Returns: The ast_utf8_validation_result indicating the current state of the validator.

Definition at line 347 of file utf8.c.

{
    size_t idx;

    /*
     * Feed at most size bytes, stopping early at a zero byte.  The bound
     * is tested before the byte is read, so data is not touched when
     * size is 0.
     */
    for (idx = 0; idx < size && data[idx] != '\0'; idx++) {
        decode(&validator->state, (uint8_t) data[idx]);
    }

    /* Translate the raw decoder state into the public result enum. */
    return ast_utf8_validator_state(validator);
}

References ast_utf8_validator_state(), decode(), and ast_utf8_validator::state.

◆ ast_utf8_validator_new()

int ast_utf8_validator_new ( struct ast_utf8_validator ** validator )

Create a new UTF-8 validator.

Since: 13.36.0, 16.13.0, 17.7.0, 18.0.0

Parameters

[out] validator The validator instance

Return values

0	on success
-1	on failure

Definition at line 311 of file utf8.c.

{
    /*
     * Allocate a validator and hand it back through *validator with its
     * state set to UTF8_ACCEPT.
     *
     * Returns 0 on success and -1 on allocation failure, matching the
     * documented return values.  On failure *validator is left untouched.
     */
    struct ast_utf8_validator *tmp = ast_malloc(sizeof(*tmp));

    if (!tmp) {
        return -1;
    }

    tmp->state = UTF8_ACCEPT;
    *validator = tmp;
    return 0;
}

References ast_malloc, ast_utf8_validator::state, and UTF8_ACCEPT.

◆ ast_utf8_validator_reset()

void ast_utf8_validator_reset ( struct ast_utf8_validator * validator )

Reset the state of a UTF-8 validator.

Since: 13.36.0, 16.13.0, 17.7.0, 18.0.0

Resets the provided UTF-8 validator to its initial state so that it can be reused.

Parameters

validator The validator instance to reset

Definition at line 358 of file utf8.c.

{
    /* Same state ast_utf8_validator_new() assigns, so the instance can be reused for a fresh input. */
    validator->state = UTF8_ACCEPT;
}

References ast_utf8_validator::state, and UTF8_ACCEPT.

◆ ast_utf8_validator_state()

enum ast_utf8_validation_result ast_utf8_validator_state ( struct ast_utf8_validator * validator )

Get the current UTF-8 validator state.

Since: 13.36.0, 16.13.0, 17.7.0, 18.0.0

Parameters

validator The validator instance

Returns: The ast_utf8_validation_result indicating the current state of the validator.

Definition at line 324 of file utf8.c.

{
    /* Everything fed so far decoded cleanly. */
    if (validator->state == UTF8_ACCEPT) {
        return AST_UTF8_VALID;
    }

    /* The decoder is in the reject state. */
    if (validator->state == UTF8_REJECT) {
        return AST_UTF8_INVALID;
    }

    /* Any other state is reported as unknown. */
    return AST_UTF8_UNKNOWN;
}

References AST_UTF8_INVALID, AST_UTF8_UNKNOWN, AST_UTF8_VALID, ast_utf8_validator::state, UTF8_ACCEPT, and UTF8_REJECT.

Referenced by ast_utf8_validator_feed(), and ast_utf8_validator_feedn().

◆ decode()

static uint32_t decode	(	uint32_t *	state,
		uint32_t	byte
	)

inline static

Definition at line 98 of file utf8.c.

                                                              {
    /* The first 256 entries of utf8d map a byte value to a class number. */
    const uint32_t byte_class = utf8d[byte];
    /* The entries from offset 256 onward are indexed by state + class and hold the next state. */
    const uint32_t next_state = utf8d[256 + *state + byte_class];

    /* Store the new state for the caller and return it as well. */
    *state = next_state;
    return next_state;
}

References state, type, and utf8d.

Referenced by ast_utf8_copy_string(), ast_utf8_is_valid(), ast_utf8_is_validn(), ast_utf8_replace_invalid_chars(), ast_utf8_validator_feed(), and ast_utf8_validator_feedn().

Variable Documentation

◆ utf8d

const uint8_t utf8d[]

static

Definition at line 63 of file utf8.c.

Referenced by decode().

Data Structures

Macros

Functions

Variables

Detailed Description

Macro Definition Documentation

◆ REPL_SEQ

◆ REPL_SEQ_LEN

◆ UTF8_ACCEPT

◆ UTF8_REJECT

Function Documentation

◆ ast_utf8_copy_string()

◆ ast_utf8_init()

◆ ast_utf8_is_valid()

◆ ast_utf8_is_validn()

◆ ast_utf8_replace_invalid_chars()

◆ ast_utf8_validator_destroy()

◆ ast_utf8_validator_feed()

◆ ast_utf8_validator_feedn()

◆ ast_utf8_validator_new()

◆ ast_utf8_validator_reset()

◆ ast_utf8_validator_state()

◆ decode()

Variable Documentation

◆ utf8d