From 452eedc77ca09d7af450f97896238db0dc4b9290 Mon Sep 17 00:00:00 2001
From: Alexey Fedoseev
Date: Sun, 3 Mar 2024 14:56:01 +0300
Subject: [PATCH] utf8 statecharts behavior convertor

---
 test_utf8.c |  63 ++++++++++++++++++
 utf8enc.c   | 180 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 utf8enc.h   |  59 +++++++++++++++++
 3 files changed, 302 insertions(+)
 create mode 100644 test_utf8.c
 create mode 100644 utf8enc.c
 create mode 100644 utf8enc.h

diff --git a/test_utf8.c b/test_utf8.c
new file mode 100644
index 0000000..6f8dcb6
--- /dev/null
+++ b/test_utf8.c
@@ -0,0 +1,63 @@
+/* -----------------------------------------------------------------------------
+ * The Cyberiada GraphML library implementation
+ *
+ * The UTF-8 encoder testing program
+ *
+ * Copyright (C) 2024 Alexey Fedoseev
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see https://www.gnu.org/licenses/
+ * ----------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "utf8enc.h"
+
+/* Round-trip check for the UTF-8 escape encoder: encode a mixed ASCII and
+ * Cyrillic string, decode the result and compare it with the original.
+ * Returns 0 when the round trip succeeds and 1 on any failure. */
+int main(void)
+{
+	const char *a = "Hello! Съешь еще этих мягких французских булок и выпей чаю";
+	char *b, *c;
+	size_t b_l, c_l;
+	int result = 0;
+
+	/* Lengths are size_t, so they are printed with %zu. */
+	printf("String len %zu\n", strlen(a));
+	b = utf8_encode(a, strlen(a), &b_l);
+	if (!b) {
+		printf("String encoding error\n");
+		return 1;
+	}
+	printf("Encoded len %zu %zu\n", strlen(b), b_l);
+	c = utf8_decode(b, strlen(b), &c_l);
+	if (!c) {
+		printf("String decoding error\n");
+		free(b);
+		return 1;
+	}
+	printf("Decoded len %zu %zu\n", strlen(c), c_l);
+	printf("Orig: %s\n", a);
+	printf("Enc: %s\n", b);
+	printf("Dec: %s\n", c);
+	if (strcmp(a, c) != 0) {
+		printf("Strings don't match\n");
+		/* Report the mismatch through the exit status as well. */
+		result = 1;
+	}
+	free(b);
+	free(c);
+	return result;
+}
diff --git a/utf8enc.c b/utf8enc.c
new file mode 100644
index 0000000..f15f8fa
--- /dev/null
+++ b/utf8enc.c
@@ -0,0 +1,180 @@
+/* -----------------------------------------------------------------------------
+ * The Cyberiada GraphML library implementation
+ *
+ * UTF-8 string encoding functions
+ *
+ * Copyright (C) 2024 Alexey Fedoseev
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see https://www.gnu.org/licenses/
+ *
+ * ----------------------------------------------------------------------------- */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "utf8enc.h"
+
+/* Escape format: every byte >= 0x80 is written as the six characters
+ * "__x_HH", where HH is the byte value as two uppercase hexadecimal digits.
+ * Bytes below 0x80 are copied unchanged. */
+#define ESCAPE_PREFIX "__x_"
+enum {
+	ESCAPE_PREFIX_LEN = 4,
+	ESCAPE_LEN = 6
+};
+
+/* Return the uppercase hexadecimal digit for the low four bits of num. */
+static char encode_digit(unsigned int num)
+{
+	return "0123456789ABCDEF"[num & 0xF];
+}
+
+/* Write the ESCAPE_LEN-character escape sequence for byte c into buffer.
+ * No terminating NUL is written. */
+static void encode_char(unsigned char c, char *buffer)
+{
+	memcpy(buffer, ESCAPE_PREFIX, ESCAPE_PREFIX_LEN);
+	buffer[ESCAPE_PREFIX_LEN] = encode_digit((unsigned int)c >> 4);
+	buffer[ESCAPE_PREFIX_LEN + 1] = encode_digit(c);
+}
+
+/* Return the value 0..15 of the hexadecimal digit c (either letter case),
+ * or -1 if c is not a hexadecimal digit. */
+static int decode_digit(unsigned char c)
+{
+	if (c >= '0' && c <= '9') {
+		return c - '0';
+	}
+	if (c >= 'a' && c <= 'f') {
+		return c - 'a' + 10;
+	}
+	if (c >= 'A' && c <= 'F') {
+		return c - 'A' + 10;
+	}
+	return -1;
+}
+
+/* Check whether buffer (buffer_len readable bytes) starts with a complete,
+ * well-formed escape sequence. On success store the decoded byte in *value
+ * (when value is not NULL) and return 1; otherwise return 0. Both passes of
+ * utf8_decode use this test so that the size computed by the first pass
+ * always matches what the second pass writes. */
+static int parse_escape(const char *buffer, size_t buffer_len,
+	unsigned char *value)
+{
+	int hi, lo;
+
+	if (buffer_len < ESCAPE_LEN ||
+		memcmp(buffer, ESCAPE_PREFIX, ESCAPE_PREFIX_LEN) != 0) {
+		return 0;
+	}
+	hi = decode_digit((unsigned char)buffer[ESCAPE_PREFIX_LEN]);
+	lo = decode_digit((unsigned char)buffer[ESCAPE_PREFIX_LEN + 1]);
+	if (hi < 0 || lo < 0) {
+		/* "__x_" not followed by two hex digits is ordinary text. */
+		return 0;
+	}
+	if (value) {
+		*value = (unsigned char)((hi << 4) | lo);
+	}
+	return 1;
+}
+
+/* Encode data as 7-bit ASCII text; see utf8enc.h for the full contract. */
+char* utf8_encode(const char *data, size_t input_len, size_t *output_len)
+{
+	size_t i, o, output;
+	char *encoded_data;
+
+	if (!data || !input_len || !output_len) {
+		return NULL;
+	}
+
+	/* In the worst case every byte grows to ESCAPE_LEN characters and one
+	 * more byte is needed for the terminating NUL; reject sizes for which
+	 * that computation would wrap around. */
+	if (input_len > (SIZE_MAX - 1) / ESCAPE_LEN) {
+		return NULL;
+	}
+
+	/* First pass: compute the exact length of the encoded string. */
+	output = 0;
+	for (i = 0; i < input_len; i++) {
+		output += ((unsigned char)data[i] < 128) ? 1 : ESCAPE_LEN;
+	}
+
+	encoded_data = (char*)malloc(output + 1);
+	if (!encoded_data) {
+		return NULL;
+	}
+
+	/* Second pass: copy ASCII bytes and escape everything else. */
+	o = 0;
+	for (i = 0; i < input_len; i++) {
+		unsigned char c = (unsigned char)data[i];
+		if (c < 128) {
+			encoded_data[o] = (char)c;
+			o++;
+		} else {
+			encode_char(c, encoded_data + o);
+			o += ESCAPE_LEN;
+		}
+	}
+	encoded_data[o] = 0;
+
+	*output_len = output;
+	return encoded_data;
+}
+
+/* Reverse utf8_encode; see utf8enc.h for the full contract. */
+char* utf8_decode(const char *data, size_t input_len, size_t *output_len)
+{
+	size_t i, o, output;
+	char *decoded_data;
+	unsigned char value = 0;
+
+	if (!data || !input_len || !output_len) {
+		return NULL;
+	}
+
+	/* First pass: each escape sequence or plain byte yields one output byte. */
+	output = 0;
+	i = 0;
+	while (i < input_len) {
+		i += parse_escape(data + i, input_len - i, NULL) ? ESCAPE_LEN : 1;
+		output++;
+	}
+
+	decoded_data = (char*)malloc(output + 1);
+	if (!decoded_data) {
+		return NULL;
+	}
+
+	/* Second pass: write the decoded bytes. */
+	i = o = 0;
+	while (i < input_len) {
+		if (parse_escape(data + i, input_len - i, &value)) {
+			decoded_data[o] = (char)value;
+			i += ESCAPE_LEN;
+		} else {
+			decoded_data[o] = data[i];
+			i++;
+		}
+		o++;
+	}
+	decoded_data[o] = 0;
+
+	*output_len = output;
+	return decoded_data;
+}
diff --git a/utf8enc.h b/utf8enc.h
new file mode 100644
index 0000000..f36f873
--- /dev/null
+++ b/utf8enc.h
@@ -0,0 +1,59 @@
+/* -----------------------------------------------------------------------------
+ * The Cyberiada GraphML library implementation
+ *
+ * The string encoder header
+ *
+ * Copyright (C) 2024 Alexey Fedoseev
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see https://www.gnu.org/licenses/
+ *
+ * ----------------------------------------------------------------------------- */
+
+#ifndef __CYBERIADA_UTF8ENC_H
+#define __CYBERIADA_UTF8ENC_H
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/* Encode input_len bytes of data as 7-bit ASCII text: bytes below 0x80
+	 * are copied unchanged and every other byte is replaced by the six
+	 * characters "__x_HH" (HH = byte value, two uppercase hex digits).
+	 *
+	 * Returns a NUL-terminated buffer allocated with malloc() that the
+	 * caller must free(), and stores its length (without the terminator)
+	 * in *output_len. Returns NULL if data or output_len is NULL, if
+	 * input_len is 0, or if the result cannot be allocated.
+	 *
+	 * Note: an ASCII "__x_HH" sequence already present in data is not
+	 * escaped, so such input does not survive an encode/decode round trip. */
+	char* utf8_encode(const char *data, size_t input_len, size_t *output_len);
+
+	/* Decode text produced by utf8_encode(): every "__x_HH" sequence with two
+	 * hexadecimal digits (either letter case) becomes the byte 0xHH, all
+	 * other bytes are copied unchanged.
+	 *
+	 * Returns a NUL-terminated buffer allocated with malloc() that the
+	 * caller must free(), and stores its length (without the terminator)
+	 * in *output_len. Returns NULL if data or output_len is NULL, if
+	 * input_len is 0, or if the result cannot be allocated. */
+	char* utf8_decode(const char *data, size_t input_len, size_t *output_len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif