| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
| | #include <stdio.h> |
| | #include <stdlib.h> |
| | #include <ctype.h> |
| | #include <string.h> |
| | #include <errno.h> |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
/* Largest value that can be encoded using (index + 1) bytes. ord2utf8()
scans this table to choose the encoding length, and utf82ord() uses it to
detect overlong sequences. */

static const unsigned int utf8_table1[] = {
  0x0000007f,   /* 1 byte  */
  0x000007ff,   /* 2 bytes */
  0x0000ffff,   /* 3 bytes */
  0x001fffff,   /* 4 bytes */
  0x03ffffff,   /* 5 bytes */
  0x7fffffff    /* 6 bytes */
};
| |
|
/* Marker bits that ord2utf8() ORs into the first byte of a sequence,
indexed by the number of bytes that follow the first one. */

static const unsigned char utf8_table2[] = {
  0x00,   /* no following bytes */
  0xc0,   /* 1 following byte   */
  0xe0,   /* 2 following bytes  */
  0xf0,   /* 3 following bytes  */
  0xf8,   /* 4 following bytes  */
  0xfc    /* 5 following bytes  */
};
| |
|
/* Mask that utf82ord() applies to the first byte of a sequence to extract
its data bits, indexed by the number of bytes that follow the first one. */

static const unsigned char utf8_table3[] = {
  0xff,   /* no following bytes */
  0x1f,   /* 1 following byte   */
  0x0f,   /* 2 following bytes  */
  0x07,   /* 3 following bytes  */
  0x03,   /* 4 following bytes  */
  0x01    /* 5 following bytes  */
};
| |
|
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | static size_t |
| | ord2utf8(unsigned long int cvalue, unsigned char *buffer) |
| | { |
| | size_t i, j; |
| | for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++) |
| | if (cvalue <= utf8_table1[i]) break; |
| | if (i >= sizeof(utf8_table1)/sizeof(int)) return 0; |
| | buffer += i; |
| | for (j = i; j > 0; j--) |
| | { |
| | *buffer-- = 0x80 | (cvalue & 0x3f); |
| | cvalue >>= 6; |
| | } |
| | *buffer = (unsigned char)(utf8_table2[i] | cvalue); |
| | return i + 1; |
| | } |
| |
|
| |
|
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | static int |
| | utf82ord(const unsigned char *buffer, const unsigned char *buffend, |
| | long unsigned int *vptr, int *lenptr) |
| | { |
| | unsigned int c = *buffer++; |
| | unsigned int d = c; |
| | int i, j, s; |
| |
|
| | |
| | |
| |
|
| | for (i = -1; i < 6; i++) |
| | { |
| | if ((d & 0x80) == 0) break; |
| | d <<= 1; |
| | } |
| |
|
| | switch (i) |
| | { |
| | case -1: |
| | *vptr = c; |
| | return 1; |
| |
|
| | case 0: |
| | *lenptr = 0; |
| | return 0; |
| |
|
| | case 6: |
| | *lenptr = 0; |
| | return -1; |
| |
|
| | default: |
| | break; |
| | } |
| |
|
| | |
| |
|
| | s = 6 * i; |
| | d = (c & utf8_table3[i]) << s; |
| |
|
| | for (j = 0; j < i; j++) |
| | { |
| | if (buffer >= buffend) |
| | { |
| | *lenptr = j + 1; |
| | return -2; |
| | } |
| | c = *buffer++; |
| | if ((c & 0xc0) != 0x80) |
| | { |
| | *lenptr = j + 1; |
| | return -3; |
| | } |
| | s -= 6; |
| | d |= (c & 0x3f) << s; |
| | } |
| |
|
| | |
| |
|
| | *vptr = d; |
| |
|
| | |
| |
|
| | for (j = 0; j < (int)(sizeof(utf8_table1)/sizeof(int)); j++) |
| | if (d <= utf8_table1[j]) break; |
| | if (j != i) |
| | { |
| | *lenptr = i + 1; |
| | return -4; |
| | } |
| |
|
| | |
| |
|
| | return i + 1; |
| | } |
| |
|
| | |
| | |
| | |
| |
|
/*
 * Print the command-line help text on stdout.
 *
 * Arguments:
 *   argv0   program name shown at the start of the synopsis line
 */
static void
usage(const char *argv0)
{
  /* Each entry is emitted with puts(), which appends one newline. */
  static const char *const help_lines[] = {
    "Encode/decode Unicode codepoints with UTF-8 code units\n",
    "The arguments are either single codepoint values written as U+hh..",
    "or 0xhh.. for conversion to UTF-8, or sequences of hex values,",
    "written without a prefix and optionally including spaces (but such",
    "arguments must be quoted), for encoding from UTF-8 code units to",
    "Unicode codepoints.",
    "For details on usage and examples read the comments in source code.\n",
    "Options:\n",
    " -h|--help\tthis help",
    " -s\t\tprint character",
    " -b[=<file>]\twrite encoded data to file (default: testinput11)\n"
  };
  size_t n;

  printf("%s [option ..] argument ..\n\n", argv0);
  for (n = 0; n < sizeof(help_lines) / sizeof(help_lines[0]); n++)
    puts(help_lines[n]);
}
| |
|
| | |
| | |
| | |
| |
|
/* Maximum number of UTF-8 code units accepted in one hex-string argument. */
enum { HEX_ARG_MAX_BYTES = 64 };

/*
 * Handle one "U+hh.." or "0xhh.." argument: encode the code point as UTF-8
 * and print the bytes on stdout.
 *
 * Arguments:
 *   arg    the argument text; the caller has checked that it starts with
 *          "0x" or "U+" followed by at least one hex digit
 *   f      binary output file, or NULL when -b was not given
 *   show   non-zero to also print the encoded character between > and <
 */
static void
encode_codepoint_arg(const char *arg, FILE *f, int show)
{
  unsigned char buffer[8];   /* ord2utf8() writes at most six bytes */
  unsigned long d;
  char *endptr;
  size_t rc;
  size_t j;
  int top_bit_set = 0;

  errno = 0;
  d = strtoul(arg + 2, &endptr, 16);
  if (errno != 0 || *endptr != 0)
  {
    printf("** Invalid hex number %s\n", arg);
    return;
  }

  if (d > 0xffffffff)
  {
    puts("** Code points must fit an uint32_t");
    return;
  }

  /* A value with bit 31 set cannot be encoded by ord2utf8(). When writing
  to a file it is emitted as a 0xff byte followed by the encoding of the
  low 31 bits. */
  if (f != NULL && d > 0x7fffffff)
  {
    buffer[0] = 0xff;
    fwrite(buffer, 1, 1, f);
    top_bit_set = 1;
    d &= 0x7fffffff;
  }

  rc = ord2utf8(d, buffer);
  printf("U+%08lx => ", d);

  if (rc == 0)
  {
    fputs("** -b needed for codepoints greater than 0x7fffffff", stdout);
  }
  else
  {
    for (j = 0; j < rc; j++) printf("%02x ", buffer[j]);
    if (f != NULL) fwrite(buffer, rc, 1, f);

    if (top_bit_set)
      fputs("** Not valid UTF-8, top bit set", stdout);
    else if (d > 0x10ffff)
      fputs("** Invalid Unicode (greater than U+10ffff)", stdout);
    else if (0xd800 <= d && d <= 0xdfff)
      fputs("** Invalid Unicode (UTF-16 surrogate)", stdout);
    else if (show)
    {
      putchar('>');
      for (j = 0; j < rc; j++) printf("%c", buffer[j]);
      putchar('<');
    }
  }

  putchar('\n');
}

/*
 * Handle one hex-string argument: parse it into bytes (two hex digits per
 * byte, spaces allowed anywhere), then decode the bytes as a series of
 * UTF-8 sequences and print each decoded code point on stdout.
 *
 * Arguments:
 *   arg    the argument text
 *   show   non-zero to also print each valid character between > and <
 */
static void
decode_hex_arg(const char *arg, int show)
{
  unsigned char buffer[HEX_ARG_MAX_BYTES];
  const char *x = arg;
  const unsigned char *bptr;
  const unsigned char *buffend;
  size_t len = 0;
  unsigned int y = 0;      /* byte being assembled */
  int have_high = 0;       /* non-zero after the first digit of a pair */

  for (;;)
  {
    unsigned char ch;

    while (*x == ' ') x++;
    if (*x == 0 && !have_high) break;

    /* <ctype.h> functions require a value representable as unsigned char;
    passing a negative plain char is undefined behaviour. A NUL reached
    after an odd number of digits is rejected here as well. */
    ch = (unsigned char)*x;
    if (!isxdigit(ch))
    {
      printf("** Malformed hex string: %s\n", arg);
      return;
    }

    y = y * 16 + (unsigned int)(isdigit(ch) ? ch - '0' : tolower(ch) - 'a' + 10);
    x++;

    if (have_high)
    {
      /* Refuse input that would overrun the fixed-size buffer. */
      if (len >= sizeof(buffer))
      {
        printf("** Hex string too long (more than %d bytes): %s\n",
          (int)sizeof(buffer), arg);
        return;
      }
      buffer[len++] = (unsigned char)y;
      y = 0;
    }
    have_high ^= 1;
  }

  bptr = buffer;
  buffend = buffer + len;

  while (bptr < buffend)
  {
    unsigned long int d = 0;
    int offset = 0;
    int j;
    int rc = utf82ord(bptr, buffend, &d, &offset);

    if (rc > 0)
    {
      printf("U+%08lx <= ", d);
      for (j = 0; j < rc; j++) printf("%02x ", bptr[j]);
      if (d <= 0x10ffff && (d < 0xd800 || 0xdfff < d) && show)
      {
        putchar('>');
        for (j = 0; j < rc; j++) printf("%c", bptr[j]);
        putchar('<');
      }
      putchar('\n');
      bptr += rc;
    }
    else if (rc == -4)
    {
      /* Overlong sequences are reported, then decoding carries on. */
      printf("U+%08lx <= ", d);
      for (j = 0; j < offset; j++) printf("%02x ", bptr[j]);
      puts("** Overlong UTF-8 sequence");
      bptr += offset;
    }
    else
    {
      switch (rc)
      {
        case 0:
          fputs("** First byte missing 0x40 bit", stdout);
          break;

        case -1:
          fputs("** First byte has too many high-order bits", stdout);
          break;

        case -2:
          fputs("** Incomplete UTF-8 sequence at end of string", stdout);
          break;

        case -3:
          fputs("** Incomplete UTF-8 sequence", stdout);
          break;

        default:
          printf("** Unexpected return %d from utf82ord()", rc);
          break;
      }

      /* Any other error ends decoding of this argument; dump what is left. */
      printf(" at offset %d in string ", offset);
      while (bptr < buffend) printf("%02x ", *bptr++);
      putchar('\n');
      return;
    }
  }
}

/*
 * Program entry point.
 *
 * Leading arguments that start with '-' are options (-s, -b[=<file>],
 * -h/--help; "--" ends option processing). Each remaining argument is
 * either a code point ("U+hh.." or "0xhh..") to be encoded as UTF-8, or a
 * string of hex digits giving UTF-8 code units to be decoded.
 *
 * Returns 0 on success (including -h/--help), or 1 for an unknown option,
 * missing arguments, or a failure to open or write the -b output file.
 */
int
main(int argc, char **argv)
{
  int i;
  int show = 0;
  const char *argv0 = "utf8";
  FILE *f = NULL;

  for (i = 1; i < argc; i++)
  {
    const char *x = argv[i];

    if (*x++ != '-') break;
    if (*x == '-' && *++x == 0)
    {
      i++;
      break;
    }

    switch (*x++)
    {
      case 's':
        show = 1;
        break;

      case 'b':
      {
        const char *output = "testinput11";
        if (x[0] == '=' && x[1] != 0) output = x + 1;

        /* A repeated -b replaces the earlier file; do not leak it. */
        if (f != NULL) fclose(f);
        f = fopen(output, "wb");
        if (f == NULL)
        {
          fprintf(stderr, "** Cannot open %s for writing: %s\n", output,
            strerror(errno));
          return 1;
        }
      }
      break;

      default:
      {
        const char last_option = x[-1];
        argv0 = argv[0];
        usage(argv0);
        if (f != NULL) fclose(f);
        return (last_option != 'h');
      }
    }
  }

  if (i >= argc)
  {
    usage(argv0);
    if (f != NULL) fclose(f);
    return 1;
  }

  for (; i < argc; i++)
  {
    const char *x = argv[i];

    if (strlen(x) >= 3 &&
        (strncmp(x, "0x", 2) == 0 || strncmp(x, "U+", 2) == 0) &&
        isxdigit((unsigned char)x[2]))
      encode_codepoint_arg(x, f, show);
    else
      decode_hex_arg(x, show);
  }

  if (f != NULL)
  {
    /* Pick up any earlier fwrite() failure as well as a failed flush. */
    int failed = ferror(f);
    if (fclose(f) != 0) failed = 1;
    if (failed)
    {
      fputs("** Error writing output file\n", stderr);
      return 1;
    }
  }

  return 0;
}
| |
|
| | |
| |
|