utf8 statecharts behavior convertor
This commit is contained in:
54
test_utf8.c
Normal file
54
test_utf8.c
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
/* -----------------------------------------------------------------------------
|
||||||
|
* The Cyberiada GraphML library implementation
|
||||||
|
*
|
||||||
|
* The UTF-8 encoder testing program
|
||||||
|
*
|
||||||
|
* Copyright (C) 2024 Alexey Fedoseev <aleksey@fedoseev.net>
|
||||||
|
*
|
||||||
|
* This program is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program. If not, see https://www.gnu.org/licenses/
|
||||||
|
* ----------------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "utf8enc.h"
|
||||||
|
|
||||||
|
/*
 * Round-trip self-test for the UTF-8 escaping functions.
 *
 * Encodes a mixed ASCII/Cyrillic string with utf8_encode(), decodes the
 * result with utf8_decode(), prints the lengths and the three strings, and
 * compares the decoded text with the original.
 *
 * Returns 0 when the round trip reproduces the original string, and 1 on an
 * encoding error, a decoding error, or a mismatch.
 */
int main(void)
{
    const char *a = "Hello! Съешь еще этих мягких французских булок и выпей чаю";
    char *b = NULL;
    char *c = NULL;
    size_t a_l;
    size_t b_l = 0;
    size_t c_l = 0;
    int rc = 0;

    a_l = strlen(a);
    /* size_t values are printed with %zu; %ld is a format/type mismatch. */
    printf("String len %zu\n", a_l);

    b = utf8_encode(a, a_l, &b_l);
    if (!b) {
        printf("String encoding error\n");
        return 1;
    }
    printf("Encoded len %zu %zu\n", strlen(b), b_l);

    c = utf8_decode(b, strlen(b), &c_l);
    if (!c) {
        printf("String decoding error\n");
        free(b);
        return 1;
    }
    printf("Decoded len %zu %zu\n", strlen(c), c_l);

    printf("Orig: %s\n", a);
    printf("Enc: %s\n", b);
    printf("Dec: %s\n", c);

    if (strcmp(a, c) != 0) {
        printf("Strings don't match\n");
        /* Report the failure through the exit status as well as the output. */
        rc = 1;
    }

    free(b);
    free(c);
    return rc;
}
|
||||||
155
utf8enc.c
Normal file
155
utf8enc.c
Normal file
@@ -0,0 +1,155 @@
|
|||||||
|
/* -----------------------------------------------------------------------------
|
||||||
|
* The Cyberiada GraphML library implementation
|
||||||
|
*
|
||||||
|
* UTF-8 string encoding functions
|
||||||
|
*
|
||||||
|
* Copyright (C) 2024 Alexey Fedoseev <aleksey@fedoseev.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 3 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program. If not, see https://www.gnu.org/licenses/
|
||||||
|
*
|
||||||
|
* ----------------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include "utf8enc.h"
|
||||||
|
|
||||||
|
/*
 * Convert a nibble value to its hexadecimal digit character.
 *
 * Values below 10 yield '0' + num; all other values yield 'A' + num - 10,
 * so 10..15 produce the uppercase letters 'A'..'F'. No range check is made.
 */
static char encode_digit(int num)
{
    return (char)((num < 10) ? ('0' + num) : ('A' + num - 10));
}
|
||||||
|
|
||||||
|
/*
 * Write the six-byte escape sequence for byte c into buffer.
 *
 * The sequence is the literal prefix "__x_" followed by the high and the low
 * nibble of c as hexadecimal digits produced by encode_digit(). Exactly six
 * bytes are written; no terminating NUL is appended.
 */
static void encode_char(unsigned char c, char* buffer)
{
    const int high_nibble = (c >> 4) & 0xf;
    const int low_nibble = c & 0xf;

    memcpy(buffer, "__x_", 4);
    buffer[4] = encode_digit(high_nibble);
    buffer[5] = encode_digit(low_nibble);
}
|
||||||
|
|
||||||
|
/*
 * Convert a hexadecimal digit character to its numeric value.
 *
 * '0'..'9' map to 0..9, and 'a'..'f' and 'A'..'F' both map to 10..15.
 * Any other value is returned unchanged.
 */
static int decode_char(int c)
{
    if ('0' <= c && c <= '9') {
        return c - '0';
    }
    if ('a' <= c && c <= 'f') {
        return c - 'a' + 10;
    }
    if ('A' <= c && c <= 'F') {
        return c - 'A' + 10;
    }
    return c;
}
|
||||||
|
|
||||||
|
/*
 * Decode the byte stored in a six-byte escape sequence.
 *
 * buffer points at the start of the sequence and buffer_len is the number of
 * bytes available from that position. The four prefix bytes are not checked
 * here; only buffer[4] (high nibble) and buffer[5] (low nibble) are read and
 * converted with decode_char().
 *
 * Returns the decoded byte, or 0 when fewer than six bytes are available.
 */
static char decode_number(const char* buffer, size_t buffer_len)
{
    int high, low;

    if (buffer_len < 6) {
        return 0;
    }

    /*
     * decode_char() returns non-digit input unchanged. Converting through
     * unsigned char and masking to four bits before the shift keeps the
     * shifted operand non-negative and in range; left-shifting a negative
     * value (a byte >= 0x80 where plain char is signed) is undefined behavior.
     */
    high = decode_char((unsigned char)buffer[4]) & 0xF;
    low = decode_char((unsigned char)buffer[5]) & 0xF;

    return (char)((high << 4) | low);
}
|
||||||
|
|
||||||
|
/*
 * Encode a byte string so that the result contains only 7-bit characters.
 *
 * Bytes below 128 are copied unchanged. Every byte of 128 or above is replaced
 * by a six-byte escape sequence written by encode_char().
 *
 * data        input bytes; must not be NULL
 * input_len   number of bytes to read from data; must not be zero
 * output_len  receives the encoded length, excluding the terminating NUL;
 *             must not be NULL
 *
 * Returns a newly allocated, NUL-terminated buffer that the caller must
 * release with free(). Returns NULL if an argument is NULL, input_len is
 * zero, the encoded size does not fit in size_t, or the allocation fails.
 * *output_len is written only on success.
 */
char* utf8_encode(const char *data, size_t input_len, size_t *output_len)
{
    /* Largest payload size that still leaves room for the terminating NUL. */
    const size_t max_output = (size_t)-1 - 1;
    size_t i, o;
    char *encoded_data = NULL;
    size_t output = 0;

    if (!data || !input_len || !output_len) {
        return NULL;
    }

    /* First pass: compute the encoded size. */
    for (i = 0; i < input_len; i++) {
        const size_t need = ((unsigned char)data[i] < 128) ? 1 : 6;
        if (output > max_output - need) {
            /* The size would wrap around; refuse instead of under-allocating. */
            return NULL;
        }
        output += need;
    }

    /* calloc zero-fills the buffer, which also provides the terminating NUL. */
    encoded_data = (char*)calloc(output + 1, 1);
    if (!encoded_data) {
        return NULL;
    }

    /* Second pass: write the encoded bytes. */
    o = 0;
    for (i = 0; i < input_len; i++) {
        const unsigned char c = (unsigned char)data[i];
        if (c < 128) {
            encoded_data[o] = (char)c;
            o++;
        } else {
            encode_char(c, encoded_data + o);
            o += 6;
        }
    }

    *output_len = output;
    return encoded_data;
}
|
||||||
|
|
||||||
|
/*
 * Report whether an escape sequence starts at data[pos].
 *
 * True when at least six bytes remain from pos and the first four of them are
 * "__x_". The two digit bytes are not validated. Requires pos < len.
 */
static int utf8_escape_at(const char *data, size_t pos, size_t len)
{
    return len - pos >= 6 &&
           data[pos] == '_' &&
           data[pos + 1] == '_' &&
           data[pos + 2] == 'x' &&
           data[pos + 3] == '_';
}

/*
 * Decode a string produced by utf8_encode().
 *
 * Every six-byte escape sequence that starts with "__x_" is replaced by the
 * single byte returned by decode_number(); all other bytes are copied
 * unchanged.
 *
 * data        input bytes; must not be NULL
 * input_len   number of bytes to read from data; must not be zero
 * output_len  receives the decoded length, excluding the terminating NUL;
 *             must not be NULL
 *
 * Returns a newly allocated, NUL-terminated buffer that the caller must
 * release with free(). Returns NULL if an argument is NULL, input_len is
 * zero, or the allocation fails. *output_len is written only on success.
 */
char* utf8_decode(const char *data, size_t input_len, size_t *output_len)
{
    size_t i, o;
    char *decoded_data = NULL;
    size_t output = 0;

    if (!data || !input_len || !output_len) {
        return NULL;
    }

    /* First pass: every escape sequence or plain byte yields one output byte. */
    i = 0;
    while (i < input_len) {
        i += utf8_escape_at(data, i, input_len) ? 6 : 1;
        output++;
    }

    /* calloc zero-fills the buffer, which also provides the terminating NUL. */
    decoded_data = (char*)calloc(output + 1, 1);
    if (!decoded_data) {
        return NULL;
    }

    /* Second pass: write the decoded bytes. */
    i = o = 0;
    while (i < input_len) {
        if (utf8_escape_at(data, i, input_len)) {
            decoded_data[o] = decode_number(data + i, input_len - i);
            i += 6;
        } else {
            decoded_data[o] = data[i];
            i++;
        }
        o++;
    }

    *output_len = output;
    return decoded_data;
}
|
||||||
37
utf8enc.h
Normal file
37
utf8enc.h
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
/* -----------------------------------------------------------------------------
|
||||||
|
* The Cyberiada GraphML library implementation
|
||||||
|
*
|
||||||
|
* The string encoder header
|
||||||
|
*
|
||||||
|
* Copyright (C) 2024 Alexey Fedoseev <aleksey@fedoseev.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 3 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program. If not, see https://www.gnu.org/licenses/
|
||||||
|
*
|
||||||
|
* ----------------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
#ifndef __CYBERIADA_UTF8ENC_H
#define __CYBERIADA_UTF8ENC_H

/* size_t is used in the prototypes below; include it so the header is self-contained. */
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Encode input_len bytes of data into a string of 7-bit characters.
 *
 * Bytes below 128 are copied unchanged; every other byte is replaced by the
 * six-character sequence "__x_HH", where HH is the byte value as two
 * uppercase hexadecimal digits.
 *
 * On success returns a NUL-terminated buffer obtained from malloc() that the
 * caller must free(), and stores its length (without the NUL) in *output_len.
 * Returns NULL if data or output_len is NULL or input_len is zero.
 */
char* utf8_encode(const char *data, size_t input_len, size_t *output_len);

/*
 * Reverse utf8_encode(): every six-character sequence starting with "__x_"
 * becomes the single byte given by its two hexadecimal digits (upper or lower
 * case); all other bytes are copied unchanged.
 *
 * On success returns a NUL-terminated buffer obtained from malloc() that the
 * caller must free(), and stores its length (without the NUL) in *output_len.
 * Returns NULL if data or output_len is NULL or input_len is zero.
 */
char* utf8_decode(const char *data, size_t input_len, size_t *output_len);

#ifdef __cplusplus
}
#endif

#endif
|
||||||
Reference in New Issue
Block a user