utf8 statecharts behavior convertor
This commit is contained in:
54
test_utf8.c
Normal file
54
test_utf8.c
Normal file
@@ -0,0 +1,54 @@
|
||||
/* -----------------------------------------------------------------------------
|
||||
* The Cyberiada GraphML library implementation
|
||||
*
|
||||
* The UTF-8 encoder testing program
|
||||
*
|
||||
* Copyright (C) 2024 Alexey Fedoseev <aleksey@fedoseev.net>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see https://www.gnu.org/licenses/
|
||||
* ----------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include "utf8enc.h"
|
||||
|
||||
/*
 * Round-trip check for the UTF-8 escape encoder.
 *
 * Encodes a mixed ASCII/Cyrillic string with utf8_encode(), decodes the
 * result with utf8_decode() and compares the decoded text with the input.
 * The intermediate lengths and strings are printed to stdout.
 *
 * Returns 0 when the decoded text equals the original, and 1 on an encoding
 * error, a decoding error or a mismatch, so the program can be used as an
 * automated test.
 */
int main(void)
{
    const char *a = "Hello! Съешь еще этих мягких французских булок и выпей чаю";
    const size_t a_l = strlen(a);
    char *b = NULL;
    char *c = NULL;
    size_t b_l = 0;
    size_t c_l = 0;
    int rc = 1;

    /* size_t is printed with %zu; %ld is undefined where long and size_t differ. */
    printf("String len %zu\n", a_l);

    b = utf8_encode(a, a_l, &b_l);
    if (!b) {
        printf("String encoding error\n");
        goto out;
    }
    printf("Encoded len %zu %zu\n", strlen(b), b_l);

    c = utf8_decode(b, strlen(b), &c_l);
    if (!c) {
        printf("String decoding error\n");
        goto out;
    }
    printf("Decoded len %zu %zu\n", strlen(c), c_l);

    printf("Orig: %s\n", a);
    printf("Enc: %s\n", b);
    printf("Dec: %s\n", c);

    if (strcmp(a, c) != 0) {
        printf("Strings don't match\n");
    } else {
        rc = 0;
    }

out:
    /* free(NULL) is a no-op, so one cleanup path serves every exit. */
    free(b);
    free(c);
    return rc;
}
|
||||
155
utf8enc.c
Normal file
155
utf8enc.c
Normal file
@@ -0,0 +1,155 @@
|
||||
/* -----------------------------------------------------------------------------
|
||||
* The Cyberiada GraphML library implementation
|
||||
*
|
||||
* UTF-8 string encoding functions
|
||||
*
|
||||
* Copyright (C) 2024 Alexey Fedoseev <aleksey@fedoseev.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see https://www.gnu.org/licenses/
|
||||
*
|
||||
* ----------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "utf8enc.h"
|
||||
|
||||
/*
 * Map a 4-bit value to its uppercase hexadecimal digit.
 *
 * num: value to convert. Only the low four bits are used, so the result is
 *      always one of '0'..'9' or 'A'..'F', even for out-of-range input.
 *
 * Returns the hexadecimal digit character for (num & 0xF).
 */
static char encode_digit(int num)
{
    /* A table avoids assuming that 'A'..'F' are contiguous in the charset. */
    static const char digits[] = "0123456789ABCDEF";

    /* The unsigned mask keeps negative or large input inside the table. */
    return digits[(unsigned int)num & 0xFu];
}
|
||||
|
||||
/*
 * Write the six-character escape for byte c into buffer.
 *
 * The escape is the four-character prefix "__x_" followed by the high and
 * the low nibble of c, each converted with encode_digit().
 *
 * buffer must have room for at least six characters; no terminating NUL is
 * written.
 */
static void encode_char(unsigned char c, char* buffer)
{
    const int high = (c >> 4) & 0xf;
    const int low = c & 0xf;

    memcpy(buffer, "__x_", 4);
    buffer[4] = encode_digit(high);
    buffer[5] = encode_digit(low);
}
|
||||
|
||||
/*
 * Convert one hexadecimal digit character to its numeric value.
 *
 * c: character code; '0'..'9', 'a'..'f' and 'A'..'F' are recognised.
 *
 * Returns 0..15 for a hexadecimal digit. Any other value is returned
 * unchanged, so callers cannot tell an invalid character from a valid
 * digit by the result alone.
 */
static int decode_char(int c)
{
    if ('0' <= c && c <= '9') {
        return c - '0';
    }
    if ('a' <= c && c <= 'f') {
        return 10 + (c - 'a');
    }
    if ('A' <= c && c <= 'F') {
        return 10 + (c - 'A');
    }
    return c;
}
|
||||
|
||||
/*
 * Decode the byte stored in a six-character escape sequence.
 *
 * buffer:     points at the first character of the escape; only the fifth
 *             and sixth characters (the two hexadecimal digits) are read.
 *             The four-character prefix is not checked here.
 * buffer_len: number of characters available at buffer.
 *
 * Returns the decoded byte, or 0 when fewer than six characters are
 * available.
 */
static char decode_number(const char* buffer, size_t buffer_len)
{
    enum { ESCAPE_LEN = 6, HIGH_DIGIT = 4, LOW_DIGIT = 5 };
    unsigned int high, low;

    if (buffer_len < ESCAPE_LEN) {
        return 0;
    }

    /*
     * Mask both nibbles: decode_char() hands back non-hex characters
     * unchanged, and an unmasked high digit would push the value past
     * eight bits before it is narrowed to char.
     */
    high = (unsigned int)decode_char(buffer[HIGH_DIGIT]) & 0xFu;
    low = (unsigned int)decode_char(buffer[LOW_DIGIT]) & 0xFu;

    return (char)(unsigned char)((high << 4) | low);
}
|
||||
|
||||
/*
 * Encode a byte string so that the result contains only 7-bit characters.
 *
 * Bytes below 128 are copied as they are. Every other byte is replaced with
 * the six-character escape written by encode_char() ("__x_" followed by two
 * hexadecimal digits).
 *
 * Input text that already looks like such an escape is copied verbatim and
 * is not itself escaped, so that text does not survive a round trip through
 * utf8_decode().
 *
 * data:       bytes to encode; must not be NULL.
 * input_len:  number of bytes at data; must not be 0.
 * output_len: receives the length of the result, excluding the terminating
 *             NUL; must not be NULL.
 *
 * Returns a newly allocated NUL-terminated string that the caller releases
 * with free(). Returns NULL when an argument is invalid, when the encoded
 * size does not fit in size_t or when memory cannot be allocated.
 */
char* utf8_encode(const char *data, size_t input_len, size_t *output_len)
{
    /* Largest payload that still leaves room for the terminator. */
    const size_t max_output = (size_t)-1 - 1;
    size_t i, o;
    size_t output = 0;
    char *encoded_data;

    if (!data || !input_len || !output_len) {
        return NULL;
    }

    /* Pass 1: compute the encoded size. */
    for (i = 0; i < input_len; i++) {
        const size_t step = ((unsigned char)data[i] < 128) ? 1 : 6;

        if (step > max_output - output) {
            return NULL;
        }
        output += step;
    }

    encoded_data = malloc(output + 1);
    if (!encoded_data) {
        return NULL;
    }

    /* Pass 2: emit the encoded text. */
    o = 0;
    for (i = 0; i < input_len; i++) {
        const unsigned char c = (unsigned char)data[i];

        if (c < 128) {
            encoded_data[o] = (char)c;
            o++;
        } else {
            encode_char(c, encoded_data + o);
            o += 6;
        }
    }
    encoded_data[o] = '\0';

    *output_len = output;
    return encoded_data;
}
|
||||
|
||||
/* Return non-zero when c is a hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F'). */
static int utf8_is_hex(char c)
{
    return (c >= '0' && c <= '9') ||
           (c >= 'a' && c <= 'f') ||
           (c >= 'A' && c <= 'F');
}

/* Return non-zero when the avail characters at p start with "__x_" plus two hex digits. */
static int utf8_is_escape(const char *p, size_t avail)
{
    return avail >= 6 &&
           p[0] == '_' && p[1] == '_' && p[2] == 'x' && p[3] == '_' &&
           utf8_is_hex(p[4]) && utf8_is_hex(p[5]);
}

/*
 * Decode a string produced by utf8_encode().
 *
 * Every escape of the form "__x_HH", where HH is two hexadecimal digits in
 * either case, is replaced with the byte it denotes. All other characters
 * are copied unchanged, including text that resembles an escape but lacks
 * two hexadecimal digits.
 *
 * data:       encoded text; must not be NULL.
 * input_len:  number of characters at data; must not be 0.
 * output_len: receives the length of the result, excluding the terminating
 *             NUL; must not be NULL.
 *
 * Returns a newly allocated NUL-terminated buffer that the caller releases
 * with free(). Returns NULL when an argument is invalid or when memory
 * cannot be allocated.
 */
char* utf8_decode(const char *data, size_t input_len, size_t *output_len)
{
    size_t i, o;
    size_t output = 0;
    char *decoded_data;

    if (!data || !input_len || !output_len) {
        return NULL;
    }

    /* Pass 1: count the decoded bytes. */
    i = 0;
    while (i < input_len) {
        i += utf8_is_escape(data + i, input_len - i) ? 6 : 1;
        output++;
    }

    decoded_data = malloc(output + 1);
    if (!decoded_data) {
        return NULL;
    }

    /* Pass 2: emit the decoded bytes. */
    i = o = 0;
    while (i < input_len) {
        if (utf8_is_escape(data + i, input_len - i)) {
            decoded_data[o] = decode_number(data + i, input_len - i);
            i += 6;
        } else {
            decoded_data[o] = data[i];
            i++;
        }
        o++;
    }
    decoded_data[o] = '\0';

    *output_len = output;
    return decoded_data;
}
|
||||
37
utf8enc.h
Normal file
37
utf8enc.h
Normal file
@@ -0,0 +1,37 @@
|
||||
/* -----------------------------------------------------------------------------
|
||||
* The Cyberiada GraphML library implementation
|
||||
*
|
||||
* The string encoder header
|
||||
*
|
||||
* Copyright (C) 2024 Alexey Fedoseev <aleksey@fedoseev.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see https://www.gnu.org/licenses/
|
||||
*
|
||||
* ----------------------------------------------------------------------------- */
|
||||
|
||||
/*
 * NOTE(review): identifiers containing a double underscore are reserved for
 * the implementation; the guard name is kept as is for compatibility.
 */
#ifndef __CYBERIADA_UTF8ENC_H
#define __CYBERIADA_UTF8ENC_H

#include <stddef.h> /* size_t, so the header can be included on its own */

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Encode a byte string into 7-bit text.
 *
 * Bytes below 128 are copied unchanged; every other byte becomes the
 * six-character escape "__x_HH", where HH is the byte value as two
 * hexadecimal digits.
 *
 * data:       bytes to encode.
 * input_len:  number of bytes at data.
 * output_len: receives the length of the result without the terminating NUL.
 *
 * Returns a NUL-terminated string allocated with malloc(); the caller
 * releases it with free(). Returns NULL when data is NULL, input_len is 0
 * or output_len is NULL.
 */
char* utf8_encode(const char *data, size_t input_len, size_t *output_len);

/*
 * Decode text produced by utf8_encode().
 *
 * Escapes of the form "__x_HH" (HH being two hexadecimal digits in either
 * case) are replaced with the byte they denote; other characters are copied
 * unchanged.
 *
 * data:       encoded text.
 * input_len:  number of characters at data.
 * output_len: receives the length of the result without the terminating NUL.
 *
 * Returns a NUL-terminated buffer allocated with malloc(); the caller
 * releases it with free(). Returns NULL when data is NULL, input_len is 0
 * or output_len is NULL.
 */
char* utf8_decode(const char *data, size_t input_len, size_t *output_len);

#ifdef __cplusplus
}
#endif

#endif
|
||||
Reference in New Issue
Block a user