db/d7c/utf8_8h_source.html

/*

 * Asterisk -- An open source telephony toolkit.

 *

 * Copyright (C) 2020, Sean Bright

 *

 * Sean Bright <sean.bright@gmail.com>

 *

 * See http://www.asterisk.org for more information about

 * the Asterisk project. Please do not directly contact

 * any of the maintainers of this project for assistance;

 * the project provides a web site, mailing lists and IRC

 * channels for your use.

 *

 * This program is free software, distributed under the terms of

 * the GNU General Public License Version 2. See the LICENSE file

 * at the top of the source tree.

 */


/*! \file

 *

 * \brief UTF-8 information and validation functions

 */


#ifndef ASTERISK_UTF8_H

#define ASTERISK_UTF8_H


/*!
 * \brief Check if a zero-terminated string is valid UTF-8
 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
 *
 * The return value is a truth value rather than an error code: zero means
 * the string is \b not valid UTF-8, so the result can be used directly as a
 * condition.
 *
 * \param str The zero-terminated string to check
 *
 * \retval 0 if the string is not valid UTF-8
 * \retval Non-zero if the string is valid UTF-8
 *
 * \note NOTE(review): handling of a NULL \a str is not stated here; assume a
 *       valid pointer is required until confirmed against the implementation.
 */
int ast_utf8_is_valid(const char *str);


/*!
 * \brief Check if the first \a size bytes of a string are valid UTF-8
 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
 *
 * Similar to ast_utf8_is_valid(), but evaluation stops after \a size bytes
 * or at the first zero byte, whichever comes first. At most \a size bytes
 * of \a str are evaluated.
 *
 * As with ast_utf8_is_valid(), the return value is a truth value: zero means
 * the evaluated bytes are \b not valid UTF-8.
 *
 * \param str The string to check
 * \param size The number of bytes to evaluate
 *
 * \retval 0 if the string is not valid UTF-8
 * \retval Non-zero if the string is valid UTF-8
 *
 * \note NOTE(review): whether a multibyte sequence that is cut off by
 *       \a size is reported as invalid is not stated here - confirm against
 *       the implementation.
 */
int ast_utf8_is_validn(const char *str, size_t size);


/*!
 * \brief Copy a string safely ensuring valid UTF-8
 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
 *
 * This is similar to \ref ast_copy_string, but it will only copy valid UTF-8
 * sequences from the source string into the destination buffer.
 *
 * The destination buffer is truncated so that it only ever contains valid
 * UTF-8 in either of these cases:
 *  - an invalid UTF-8 sequence is encountered in the source string, or
 *  - the available space in the destination buffer is exhausted in the
 *    middle of an otherwise valid UTF-8 sequence.
 *
 * No status is returned, so a caller cannot tell from this function alone
 * whether truncation happened.
 *
 * \param[out] dst The destination buffer.
 * \param[in] src The source string
 * \param[in] size The size of the destination buffer
 *
 * \note NOTE(review): \a size is presumably a byte count that includes room
 *       for the terminator, as with ast_copy_string - confirm against the
 *       implementation.
 */
void ast_utf8_copy_string(char *dst, const char *src, size_t size);


/*!
 * \brief Outcome of ast_utf8_replace_invalid_chars()
 *
 * The numeric values are spelled out explicitly; they are the same values
 * the enumerators receive from implicit numbering (0, 1, 2).
 */
enum ast_utf8_replace_result {
    /*! \brief Source contained fully valid UTF-8
     *
     * Every byte of the source was part of a valid UTF-8 sequence, so
     * nothing had to be replaced.
     */
    AST_UTF8_REPLACE_VALID = 0,

    /*! \brief Source contained at least 1 invalid UTF-8 sequence
     *
     * One or more invalid UTF-8 sequences were found in the source and
     * each was successfully replaced with the U+FFFD replacement
     * sequence.
     */
    AST_UTF8_REPLACE_INVALID = 1,

    /*! \brief Not enough space to copy entire source
     *
     * The destination buffer was too small to hold all of the source
     * characters. As many source characters as could be copied or
     * replaced were written, followed by a final NUL terminator.
     */
    AST_UTF8_REPLACE_OVERRUN = 2,
};


/*!
 * \brief Copy a string safely replacing any invalid UTF-8 sequences
 *
 * This is similar to \ref ast_copy_string, but it will only copy valid UTF-8
 * sequences from the source string into the destination buffer.
 * If an invalid sequence is encountered, it's replaced with the U+FFFD
 * sequence, which is the valid UTF-8 sequence that represents an unknown,
 * unrecognized, or unrepresentable character.
 *
 * Since the U+FFFD sequence is 3 bytes long, the destination buffer will
 * need to be larger than the corresponding source string if the source
 * contains invalid sequences. To size the buffer correctly:
 *  -# call this function with \a dst set to NULL; \a *dst_size is set to the
 *     size of the buffer required to fully process the source string,
 *  -# allocate a buffer of that size,
 *  -# call the function again with the properly sized buffer.
 *
 * \param[out]    dst       Pointer to the destination buffer. If NULL,
 *                          dst_size will be set to the size of the
 *                          buffer required to fully process the
 *                          source string.
 * \param[in,out] dst_size  A pointer to the size of the dst buffer. Read as
 *                          the buffer size when \a dst is not NULL; written
 *                          with the required size when \a dst is NULL.
 * \param[in]     src       The source string
 * \param[in]     src_len   The number of bytes to copy
 *
 * \return An \ref ast_utf8_replace_result value:
 *         \ref AST_UTF8_REPLACE_VALID, \ref AST_UTF8_REPLACE_INVALID or
 *         \ref AST_UTF8_REPLACE_OVERRUN.
 *
 * \note NOTE(review): whether \a *dst_size is also updated when \a dst is
 *       not NULL, and whether the reported size includes the terminator,
 *       is not stated here - confirm against the implementation.
 */
enum ast_utf8_replace_result ast_utf8_replace_invalid_chars(char *dst,
    size_t *dst_size, const char *src, size_t src_len);


/*!
 * \brief State reported by a UTF-8 validator
 *
 * The numeric values are spelled out explicitly; they are the same values
 * the enumerators receive from implicit numbering (0, 1, 2).
 */
enum ast_utf8_validation_result {
    /*! \brief The consumed sequence is valid UTF-8
     *
     * All bytes fed into the validator so far form a valid sequence of
     * UTF-8 bytes. Feeding further bytes can move the validator to
     * either \ref AST_UTF8_INVALID or \ref AST_UTF8_UNKNOWN.
     */
    AST_UTF8_VALID = 0,

    /*! \brief The consumed sequence is invalid UTF-8
     *
     * The bytes fed into the validator so far form an invalid sequence
     * of UTF-8 bytes. This state is final: feeding additional bytes into
     * the validator will not change it.
     */
    AST_UTF8_INVALID = 1,

    /*! \brief The validator is in an intermediate state
     *
     * The validator is part way through a multibyte UTF-8 sequence and
     * needs more data before it can determine validity. Feeding further
     * bytes can move it to either \ref AST_UTF8_VALID or
     * \ref AST_UTF8_INVALID. If there is no additional data to feed into
     * the validator, the UTF-8 sequence is invalid.
     */
    AST_UTF8_UNKNOWN = 2,
};


/*!
 * \brief Opaque type for UTF-8 validator state.
 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
 *
 * Only the tag is declared in this header, so callers work exclusively
 * with pointers to it. An instance is obtained from
 * ast_utf8_validator_new(), driven with ast_utf8_validator_feed() or
 * ast_utf8_validator_feedn(), inspected with ast_utf8_validator_state(),
 * reused via ast_utf8_validator_reset() and released with
 * ast_utf8_validator_destroy().
 */
struct ast_utf8_validator;


/*!
 * \brief Create a new UTF-8 validator
 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
 *
 * Unlike ast_utf8_is_valid(), this function uses the "0 means success"
 * convention for its return value.
 *
 * \param[out] validator The validator instance
 *
 * \retval 0 on success
 * \retval -1 on failure
 *
 * \note A validator created here is expected to be released with
 *       ast_utf8_validator_destroy().
 * \note NOTE(review): the value left in \a *validator on failure is not
 *       stated here - confirm against the implementation before relying
 *       on it.
 */
int ast_utf8_validator_new(struct ast_utf8_validator **validator);


/*!
 * \brief Feed a zero-terminated string into the UTF-8 validator
 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
 *
 * Successive calls continue from the state left by earlier calls, which
 * is what allows the validator to report \ref AST_UTF8_UNKNOWN while it is
 * part way through a multibyte sequence. Once the validator has reached
 * \ref AST_UTF8_INVALID, feeding more data will not change its state.
 *
 * \param validator The validator instance
 * \param data The zero-terminated string to feed into the validator
 *
 * \return The \ref ast_utf8_validation_result indicating the current state of
 *         the validator.
 */
enum ast_utf8_validation_result ast_utf8_validator_feed(
    struct ast_utf8_validator *validator, const char *data);


/*!
 * \brief Feed a string into the UTF-8 validator
 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
 *
 * Similar to ast_utf8_validator_feed(), but will stop feeding in data if a
 * zero byte is encountered or \a size bytes have been read, whichever
 * happens first.
 *
 * A result of \ref AST_UTF8_UNKNOWN means the data fed so far ends part way
 * through a multibyte sequence; if no further data is available, that
 * sequence is invalid.
 *
 * \param validator The validator instance
 * \param data The string to feed into the validator
 * \param size The number of bytes to feed into the validator
 *
 * \return The \ref ast_utf8_validation_result indicating the current state of
 *         the validator.
 */
enum ast_utf8_validation_result ast_utf8_validator_feedn(
    struct ast_utf8_validator *validator, const char *data, size_t size);


/*!
 * \brief Get the current UTF-8 validator state
 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
 *
 * Reports the same kind of value that ast_utf8_validator_feed() and
 * ast_utf8_validator_feedn() return, without feeding in any data.
 *
 * \param validator The validator instance
 *
 * \return The \ref ast_utf8_validation_result indicating the current state of
 *         the validator.
 */
enum ast_utf8_validation_result ast_utf8_validator_state(
    struct ast_utf8_validator *validator);


/*!
 * \brief Reset the state of a UTF-8 validator
 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
 *
 * Resets the provided UTF-8 validator to its initial state so that it can be
 * reused, for example after it has reached \ref AST_UTF8_INVALID, a state
 * that feeding more data cannot leave.
 *
 * \param validator The validator instance to reset
 */
void ast_utf8_validator_reset(
    struct ast_utf8_validator *validator);


/*!
 * \brief Destroy a UTF-8 validator
 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
 *
 * Counterpart to ast_utf8_validator_new().
 *
 * \param validator The validator instance to destroy
 *
 * \note NOTE(review): whether passing NULL is permitted is not stated here -
 *       confirm against the implementation.
 */
void ast_utf8_validator_destroy(struct ast_utf8_validator *validator);


/*!
 * \brief Register UTF-8 tests
 * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
 *
 * Does nothing unless TEST_FRAMEWORK is defined.
 *
 * The return value carries no failure information: it is always 0, whether
 * or not any tests were registered.
 *
 * \retval 0 Always
 */
int ast_utf8_init(void);


#endif /* ASTERISK_UTF8_H */

str
const char * str
Definition: app_jack.c:150

ast_utf8_validator
Definition: utf8.c:307

ast_utf8_validator_state
enum ast_utf8_validation_result ast_utf8_validator_state(struct ast_utf8_validator *validator)
Get the current UTF-8 validator state.
Definition: utf8.c:324

ast_utf8_init
int ast_utf8_init(void)
Register UTF-8 tests.
Definition: utf8.c:919

ast_utf8_validation_result
ast_utf8_validation_result
Definition: utf8.h:123

AST_UTF8_INVALID
@ AST_UTF8_INVALID
The consumed sequence is invalid UTF-8.
Definition: utf8.h:138

AST_UTF8_UNKNOWN
@ AST_UTF8_UNKNOWN
The validator is in an intermediate state.
Definition: utf8.h:148

AST_UTF8_VALID
@ AST_UTF8_VALID
The consumed sequence is valid UTF-8.
Definition: utf8.h:130

ast_utf8_validator_new
int ast_utf8_validator_new(struct ast_utf8_validator **validator)
Create a new UTF-8 validator.
Definition: utf8.c:311

ast_utf8_is_valid
int ast_utf8_is_valid(const char *str)
Check if a zero-terminated string is valid UTF-8.
Definition: utf8.c:110

ast_utf8_validator_feed
enum ast_utf8_validation_result ast_utf8_validator_feed(struct ast_utf8_validator *validator, const char *data)
Feed a zero-terminated string into the UTF-8 validator.
Definition: utf8.c:337

ast_utf8_replace_result
ast_utf8_replace_result
Definition: utf8.h:70

AST_UTF8_REPLACE_INVALID
@ AST_UTF8_REPLACE_INVALID
Source contained at least 1 invalid UTF-8 sequence.
Definition: utf8.h:84

AST_UTF8_REPLACE_VALID
@ AST_UTF8_REPLACE_VALID
Source contained fully valid UTF-8.
Definition: utf8.h:76

AST_UTF8_REPLACE_OVERRUN
@ AST_UTF8_REPLACE_OVERRUN
Not enough space to copy entire source.
Definition: utf8.h:93

ast_utf8_is_validn
int ast_utf8_is_validn(const char *str, size_t size)
Check if the first size bytes of a string are valid UTF-8.
Definition: utf8.c:121

ast_utf8_validator_reset
void ast_utf8_validator_reset(struct ast_utf8_validator *validator)
Reset the state of a UTF-8 validator.
Definition: utf8.c:358

ast_utf8_validator_destroy
void ast_utf8_validator_destroy(struct ast_utf8_validator *validator)
Destroy a UTF-8 validator.
Definition: utf8.c:363

ast_utf8_copy_string
void ast_utf8_copy_string(char *dst, const char *src, size_t size)
Copy a string safely ensuring valid UTF-8.
Definition: utf8.c:133

ast_utf8_replace_invalid_chars
enum ast_utf8_replace_result ast_utf8_replace_invalid_chars(char *dst, size_t *dst_size, const char *src, size_t src_len)
Copy a string safely replacing any invalid UTF-8 sequences.
Definition: utf8.c:173

ast_utf8_validator_feedn
enum ast_utf8_validation_result ast_utf8_validator_feedn(struct ast_utf8_validator *validator, const char *data, size_t size)
Feed a string into the UTF-8 validator.
Definition: utf8.c:347