| | from __future__ import annotations
|
| |
|
| | import array
|
| | import unicodedata
|
| | import requests
|
| |
|
| |
|
# Exclusive upper bound for code point values (the highest one is U+10FFFF).
MAX_CODEPOINTS = 0x10FFFF + 1

# Source of the character records read by unicode_data_iter().
UNICODE_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
|
| |
|
| |
|
| |
|
def unicode_data_iter():
    """Download UnicodeData.txt and yield one record per code point.

    Every yielded record is a tuple
    ``(cpt, cpt_lower, cpt_upper, categ, bidir)``:

    * ``cpt`` -- the code point (first field, hexadecimal).
    * ``cpt_lower`` / ``cpt_upper`` -- the second-to-last and third-to-last
      fields parsed as hexadecimal, or ``0`` when the field is empty.
    * ``categ`` -- the third field (two-letter general category).
    * ``bidir`` -- the fifth field (bidirectional class).

    A pair of lines whose names end in ``", First>"`` and ``", Last>"`` is
    expanded so that every code point of the range is yielded with the
    properties of the ``Last`` line.

    Raises:
        requests.RequestException: if the download fails, times out or
            returns an HTTP error status.
        AssertionError: if a record does not have the expected shape.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get(UNICODE_DATA_URL, timeout=60)
    res.raise_for_status()
    data = res.content.decode()

    # Record of the pending "<..., First>" line, or None outside of a range.
    range_first = None

    for line in data.splitlines():
        if not line.strip():
            continue  # tolerate blank lines instead of failing in int("")

        # e.g. 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
        fields = line.split(";")

        cpt = int(fields[0], base=16)
        assert cpt < MAX_CODEPOINTS

        cpt_lower = int(fields[-2] or "0", base=16)
        assert cpt_lower < MAX_CODEPOINTS

        cpt_upper = int(fields[-3] or "0", base=16)
        assert cpt_upper < MAX_CODEPOINTS

        categ = fields[2].strip()
        assert len(categ) == 2

        # Bidirectional class aliases are one to three characters long
        # ("L", "AL", "NSM", ...); the previous check re-tested `categ`.
        bidir = fields[4].strip()
        assert 1 <= len(bidir) <= 3

        name = fields[1]
        if name.endswith(", First>"):
            # Remember the start of the range; the matching "Last" line
            # triggers the expansion.
            range_first = (cpt, cpt_lower, cpt_upper, categ, bidir)
            continue
        if name.endswith(", Last>"):
            # Both ends of a range must agree and carry no case mappings.
            assert range_first is not None
            assert range_first[1:] == (0, 0, categ, bidir)
            for c in range(range_first[0], cpt):
                yield (c, cpt_lower, cpt_upper, categ, bidir)
            # Forget the consumed "First" so an unpaired "Last" is detected.
            range_first = None

        yield (cpt, cpt_lower, cpt_upper, categ, bidir)
|
| |
|
| |
|
| |
|
# One bit per coarse character class; the values are written into the
# generated flag table as 16-bit integers.
CODEPOINT_FLAG_UNDEFINED = 1 << 0    # 0x0001
CODEPOINT_FLAG_NUMBER = 1 << 1       # 0x0002
CODEPOINT_FLAG_LETTER = 1 << 2       # 0x0004
CODEPOINT_FLAG_SEPARATOR = 1 << 3    # 0x0008
CODEPOINT_FLAG_MARK = 1 << 4         # 0x0010
CODEPOINT_FLAG_PUNCTUATION = 1 << 5  # 0x0020
CODEPOINT_FLAG_SYMBOL = 1 << 6       # 0x0040
CODEPOINT_FLAG_CONTROL = 1 << 7      # 0x0080

# Two-letter category code -> flag bit. Codes are grouped by their first
# letter; the grouping keeps the same key order as a flat literal would.
UNICODE_CATEGORY_TO_FLAG = {
    "Cn": CODEPOINT_FLAG_UNDEFINED,
    **dict.fromkeys(("Cc", "Cf", "Co", "Cs"), CODEPOINT_FLAG_CONTROL),
    **dict.fromkeys(("Ll", "Lm", "Lo", "Lt", "Lu", "L&"), CODEPOINT_FLAG_LETTER),
    **dict.fromkeys(("Mc", "Me", "Mn"), CODEPOINT_FLAG_MARK),
    **dict.fromkeys(("Nd", "Nl", "No"), CODEPOINT_FLAG_NUMBER),
    **dict.fromkeys(
        ("Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps"),
        CODEPOINT_FLAG_PUNCTUATION,
    ),
    **dict.fromkeys(("Sc", "Sk", "Sm", "So"), CODEPOINT_FLAG_SYMBOL),
    **dict.fromkeys(("Zl", "Zp", "Zs"), CODEPOINT_FLAG_SEPARATOR),
}
|
| |
|
| |
|
# Flag word for every code point; entries that never appear in the data
# keep the "undefined" flag.
codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS
table_lowercase = []  # (codepoint, lowercase codepoint)
table_uppercase = []  # (codepoint, uppercase codepoint)
table_nfd = []        # (codepoint, first codepoint of its NFD form)

for cpt, cpt_lower, cpt_upper, categ, bidir in unicode_data_iter():
    codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ]

    # A mapping of 0 means "no case conversion" and is not recorded.
    if cpt_lower != 0:
        table_lowercase.append((cpt, cpt_lower))
    if cpt_upper != 0:
        table_uppercase.append((cpt, cpt_upper))

    # Only the leading code point of the NFD decomposition is kept.
    # NOTE(review): unicodedata uses the Unicode version bundled with the
    # Python interpreter, which may differ from the downloaded data.
    decomposed = unicodedata.normalize('NFD', chr(cpt))
    nfd_head = ord(decomposed[0])
    if nfd_head != cpt:
        table_nfd.append((cpt, nfd_head))

# Whitespace code points; the list corresponds to the White_Space property
# in the Unicode PropList.txt file.
table_whitespace = sorted(
    list(range(0x0009, 0x000D + 1))
    + list(range(0x2000, 0x200A + 1))
    + [0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000]
)

# Order every table by code point.
for _table in (table_lowercase, table_uppercase, table_nfd):
    _table.sort()
|
| |
|
| |
|
| |
|
# Run-length encode codepoint_flags: one (start, flags) pair per run of
# consecutive code points that share the same flags. A run ends where the
# next pair starts.
run_flags = codepoint_flags[0]
ranges_flags: list[tuple[int, int]] = [(0, run_flags)]
for codepoint, flags in enumerate(codepoint_flags):
    if flags == run_flags:
        continue
    ranges_flags.append((codepoint, flags))
    run_flags = flags
# Terminating entry so the last real run also has a "next start".
ranges_flags.append((MAX_CODEPOINTS, 0x0000))
|
| |
|
| |
|
| |
|
# Collapse table_nfd into (start, last, nfd) ranges: consecutive code points
# that map to the same NFD code point share one entry. The list is seeded
# with a (0, 0, 0) entry.
ranges_nfd: list[tuple[int, int, int]] = [(0, 0, 0)]
for codepoint, norm in table_nfd:
    range_start, range_last, range_norm = ranges_nfd[-1]
    if range_last == codepoint - 1 and range_norm == norm:
        # Directly follows the previous range with the same target: extend it.
        ranges_nfd[-1] = (range_start, codepoint, norm)
    else:
        # Gap or different target: open a new single-codepoint range.
        ranges_nfd.append((codepoint, codepoint, norm))
|
| |
|
| |
|
| |
|
| |
|
| |
|
def out(line=""):
    """Print ``line`` to standard output, terminated by a newline."""
    print(line)
|
| |
|
| |
|
# Emit the generated C++ source: a fixed preamble followed by one
# brace-initialised container per table.
out("""\
// generated with scripts/gen-unicode-data.py

#include "unicode-data.h"

#include <cstdint>
#include <vector>
#include <unordered_map>
#include <unordered_set>
""")

# (declaration line, per-row format, rows); rows are ints or tuples that are
# fed directly to the %-format.
for declaration, row_format, rows in (
    (
        "const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1",
        "{0x%06X, 0x%04X},",
        ranges_flags,
    ),
    (
        "const std::unordered_set<uint32_t> unicode_set_whitespace = {",
        "0x%06X,",
        table_whitespace,
    ),
    (
        "const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {",
        "{0x%06X, 0x%06X},",
        table_lowercase,
    ),
    (
        "const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {",
        "{0x%06X, 0x%06X},",
        table_uppercase,
    ),
    (
        "const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd",
        "{0x%06X, 0x%06X, 0x%06X},",
        ranges_nfd,
    ),
):
    out(declaration)
    for row in rows:
        out(row_format % row)
    # Close the initializer; the embedded newline leaves a blank line after it.
    out("};\n")
|
| |
|