|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| """
|
| Rules to verbalize numbers into Chinese characters.
|
| https://zh.wikipedia.org/wiki/中文数字#現代中文
|
| """
|
|
|
| import re
|
| from collections import OrderedDict
|
| from typing import List
|
|
|
# Arabic digit character ("0".."9") -> Chinese numeral character.
DIGITS = dict(zip("0123456789", "零一二三四五六七八九"))

# Power of ten -> Chinese unit character, stored in ascending order of power.
# Note there are no entries for 5, 6 or 7: values between 万 (10^4) and
# 亿 (10^8) are composed from the smaller units.
UNITS = OrderedDict(
    [
        (1, "十"),
        (2, "百"),
        (3, "千"),
        (4, "万"),
        (8, "亿"),
    ]
)

# Regex fragment: one parenthesised alternation of measure words / quantifiers
# (some alternatives carry their own nested groups). It is a plain string, not
# a compiled pattern, so that it can be concatenated into larger patterns.
COM_QUANTIFIERS = "(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)"
|
|
|
|
|
# Fraction literal: optional minus sign, numerator digits, "/", denominator digits.
RE_FRAC = re.compile(r"(-?)(\d+)/(\d+)")


def replace_frac(match) -> str:
    """Verbalize a fraction match as "<denominator>分之<numerator>".

    Args:
        match (re.Match): a match whose groups 1-3 are the sign, the
            numerator and the denominator (the layout of ``RE_FRAC``).
    Returns:
        str: the spoken form, prefixed with "负" when a minus sign was captured.
    """
    minus, numerator, denominator = match.group(1, 2, 3)
    prefix = "负" if minus else ""
    numerator_text = num2str(numerator)
    denominator_text = num2str(denominator)
    # Chinese reads the denominator first: 3/4 -> "四分之三".
    return f"{prefix}{denominator_text}分之{numerator_text}"
|
|
|
|
|
|
|
# Percentage literal: optional minus sign, integer or decimal magnitude, "%".
RE_PERCENTAGE = re.compile(r"(-?)(\d+(\.\d+)?)%")


def replace_percentage(match) -> str:
    """Verbalize a percentage match as "百分之<number>".

    Args:
        match (re.Match): a match whose group 1 is the sign and group 2 the
            magnitude (the layout of ``RE_PERCENTAGE``).
    Returns:
        str: the spoken form, prefixed with "负" when a minus sign was captured.
    """
    minus, magnitude = match.group(1, 2)
    prefix = "负" if minus else ""
    return f"{prefix}百分之{num2str(magnitude)}"
|
|
|
|
|
|
|
|
|
# Negative integer: a mandatory minus sign followed by digits.
RE_INTEGER = re.compile(r"(-)(\d+)")


def replace_negative_num(match) -> str:
    """Verbalize a signed integer match.

    Args:
        match (re.Match): a match whose group 1 is the sign and group 2 the
            digits (the layout of ``RE_INTEGER``).
    Returns:
        str: the spoken number, prefixed with "负" when group 1 is non-empty.
    """
    minus, digits = match.group(1, 2)
    prefix = "负" if minus else ""
    return prefix + num2str(digits)
|
|
|
|
|
|
|
|
|
# Any run of three or more consecutive digits.
RE_DEFAULT_NUM = re.compile(r"\d{3}\d*")


def replace_default_num(match):
    """Read a digit run out digit by digit instead of as a quantity.

    Args:
        match (re.Match): the whole match (group 0) is taken as the digit run.
    Returns:
        str: the result of ``verbalize_digit`` called with ``alt_one=True``.
    """
    return verbalize_digit(match.group(0), alt_one=True)
|
|
|
|
|
|
|
|
|
|
|
# One operand of an arithmetic expression: a signed integer/decimal, a bare
# ".digits" decimal, or a single ASCII letter -- each optionally followed by
# superscript characters. It contributes 7 capture groups per occurrence.
_ASMD_OPERAND = (
    r"((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)"
    r"|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)"
    r"|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))"
)

# <operand><operator><operand>. Group 1 is the left operand, group 8 the
# operator and group 9 the right operand.
RE_ASMD = re.compile(_ASMD_OPERAND + r"([\+\-\×÷=])" + _ASMD_OPERAND)

# Operator symbol -> spoken Chinese word.
asmd_map = {
    "+": "加",
    "-": "减",
    "×": "乘",
    "÷": "除",
    "=": "等于",
}


def replace_asmd(match) -> str:
    """Replace the operator of an arithmetic expression with its spoken word.

    Both operands are passed through unchanged; only the operator is mapped
    via ``asmd_map``.

    Args:
        match (re.Match): a match with the group layout of ``RE_ASMD``.
    Returns:
        str: "<left operand><spoken operator><right operand>".
    """
    left, operator, right = match.group(1, 8, 9)
    return "".join((left, asmd_map[operator], right))
|
|
|
|
|
|
|
# A run of one or more superscript characters (digits, x, y or n).
RE_POWER = re.compile(r"[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+")

# Superscript character -> its plain ASCII counterpart.
power_map = dict(zip("⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ", "0123456789xyn"))


def replace_power(match) -> str:
    """Spell out a superscript exponent as "的<exponent>次方".

    Each character of the whole match is translated through ``power_map``;
    the exponent itself stays in ASCII form.

    Args:
        match (re.Match): the whole match (group 0) is the superscript run.
    Returns:
        str: "的" + the ASCII exponent + "次方".
    """
    exponent = "".join(power_map[char] for char in match.group(0))
    return f"的{exponent}次方"
|
|
|
|
|
|
|
|
|
# Optionally signed decimal with a mandatory fractional part, or a bare
# ".digits" decimal.
RE_DECIMAL_NUM = re.compile(r"(-?)((\d+)(\.\d+))|(\.(\d+))")

# Digits, an optional approximation marker (多 / 余 / 几 / +), then one of the
# quantifiers listed in COM_QUANTIFIERS.
RE_POSITIVE_QUANTIFIERS = re.compile(rf"(\d+)([多余几\+])?{COM_QUANTIFIERS}")

# Optionally signed integer or decimal, or a bare ".digits" decimal.
RE_NUMBER = re.compile(r"(-?)((\d+)(\.\d+)?)|(\.(\d+))")
|
|
|
|
|
def replace_positive_quantifier(match) -> str:
    """Verbalize "<digits>[marker]<quantifier>".

    Args:
        match (re.Match): a match whose group 1 is the digits, group 2 the
            optional marker and group 3 the quantifier (the layout of
            ``RE_POSITIVE_QUANTIFIERS``).
    Returns:
        str: the spoken number, followed by the marker (with "+" read as "多",
        or nothing when no marker was captured) and the quantifier unchanged.
    """
    digits, marker, quantifier = match.group(1, 2, 3)
    if marker == "+":
        marker = "多"
    elif not marker:
        # An unmatched optional group is None; render it as nothing.
        marker = ""
    return f"{num2str(digits)}{marker}{quantifier}"
|
|
|
|
|
def replace_number(match) -> str:
    """Verbalize a number match (integer, decimal or bare ".digits").

    Args:
        match (re.Match): a match whose group 1 is the sign, group 2 the
            magnitude and group 5 the bare ".digits" alternative (the layout
            of ``RE_NUMBER``).
    Returns:
        str: the spoken number; "负" is prepended when a minus sign was
        captured. The bare-decimal alternative never gets a sign prefix.
    """
    minus, magnitude, bare_decimal = match.group(1, 2, 5)
    if bare_decimal:
        return num2str(bare_decimal)
    prefix = "负" if minus else ""
    return prefix + num2str(magnitude)
|
|
|
|
|
|
|
|
|
|
|
# Numeric range "<number>-<number>" or "<number>~<number>". The lookbehind and
# lookahead reject candidates that touch another digit or an arithmetic
# operator. Group 1 is the range start and group 6 the range end.
RE_RANGE = re.compile(
    r"""
    (?<![\d\+\-\×÷=])      # 使用反向前瞻以确保数字范围之前没有其他数字和操作符
    ((-?)((\d+)(\.\d+)?))  # 匹配范围起始的负数或正数(整数或小数)
    [-~]                   # 匹配范围分隔符
    ((-?)((\d+)(\.\d+)?))  # 匹配范围结束的负数或正数(整数或小数)
    (?![\d\+\-\×÷=])       # 使用正向前瞻以确保数字范围之后没有其他数字和操作符
    """,
    re.VERBOSE,
)


def replace_range(match) -> str:
    """Verbalize a numeric range as "<start>到<end>".

    Args:
        match (re.Match): a match whose group 1 is the range start and
            group 6 the range end (the layout of ``RE_RANGE``).
    Returns:
        str: both endpoints rewritten with ``RE_NUMBER`` / ``replace_number``
        and joined by "到".
    """
    endpoints = (match.group(1), match.group(6))
    spoken = [RE_NUMBER.sub(replace_number, endpoint) for endpoint in endpoints]
    return "到".join(spoken)
|
|
|
|
|
|
|
# A number as accepted on either side of a unit range: optionally signed
# integer/decimal, or a bare ".digits" decimal.
_TO_RANGE_NUMBER = r"((-?)((\d+)(\.\d+)?)|(\.(\d+)))"
# Units recognised directly after each number.
_TO_RANGE_UNIT = (
    r"(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)"
)

# "<number><unit>~<number><unit>".
RE_TO_RANGE = re.compile(
    _TO_RANGE_NUMBER + _TO_RANGE_UNIT + r"[~]" + _TO_RANGE_NUMBER + _TO_RANGE_UNIT
)


def replace_to_range(match) -> str:
    """Turn the "~" of a unit range into "至".

    Numbers and units are left untouched; only the tilde is replaced.

    Args:
        match (re.Match): the whole match (group 0) is the range text.
    Returns:
        str: the matched text with every "~" replaced by "至".
    """
    return "至".join(match.group(0).split("~"))
|
|
|
|
|
def _get_value(value_string: str, use_zero: bool = True) -> List[str]:
    """Break a digit string into Chinese numeral and unit symbols.

    The string is split recursively at the largest unit in ``UNITS`` whose
    power is smaller than the number of significant digits.

    Args:
        value_string (str): digit string; leading zeros are allowed.
        use_zero (bool): when the string is a single non-zero digit preceded
            by zeros, emit a leading "零" before it. Recursive calls always
            use the default.
    Returns:
        List[str]: symbols in reading order; empty when the string holds no
        non-zero digit.
    """
    significant = value_string.lstrip("0")
    if not significant:
        return []

    if len(significant) == 1:
        symbols = [DIGITS[significant]]
        if use_zero and len(significant) < len(value_string):
            # Zeros were skipped before this digit, e.g. "05" -> 零五.
            symbols.insert(0, DIGITS["0"])
        return symbols

    # Largest power of ten that still leaves at least one digit in front.
    largest_unit = next(
        power for power in reversed(UNITS.keys()) if power < len(significant)
    )
    # Split the ORIGINAL string (zeros included) so skipped zeros are still
    # visible to the recursive calls.
    head = value_string[:-largest_unit]
    tail = value_string[-largest_unit:]
    return [*_get_value(head), UNITS[largest_unit], *_get_value(tail)]
|
|
|
|
|
def verbalize_cardinal(value_string: str) -> str:
    """Read a non-negative integer digit string as a Chinese cardinal number.

    Args:
        value_string (str): digit string; leading zeros are ignored.
    Returns:
        str: "" for empty input, "零" when only zeros are given, otherwise
        the symbols produced by ``_get_value`` joined together.
    """
    if not value_string:
        return ""

    trimmed = value_string.lstrip("0")
    if not trimmed:
        return DIGITS["0"]

    symbols = _get_value(trimmed)
    # A leading "一十" is shortened to "十" (10 -> 十, not 一十).
    if symbols[:2] == [DIGITS["1"], UNITS[1]]:
        symbols = symbols[1:]
    return "".join(symbols)
|
|
|
|
|
def verbalize_digit(value_string: str, alt_one=False) -> str:
    """Read a digit string one digit at a time.

    Args:
        value_string (str): string made only of the digits "0"-"9"; any
            other character raises ``KeyError`` from the ``DIGITS`` lookup.
        alt_one (bool): when true, every "一" in the result is replaced
            by "幺".
    Returns:
        str: one Chinese numeral per input digit.
    """
    spoken = "".join(map(DIGITS.__getitem__, value_string))
    return spoken.replace("一", "幺") if alt_one else spoken
|
|
|
|
|
def num2str(value_string: str) -> str:
    """Verbalize an unsigned integer or decimal string in Chinese.

    The integer part is read as a cardinal number and the fractional part
    digit by digit after "点". Trailing zeros of the fractional part are
    dropped, so "1.50" reads like "1.5" and "1.0" like "1".

    Args:
        value_string (str): digits with at most one "." (no sign).
    Returns:
        str: the spoken form; "" for an empty string.
    Raises:
        ValueError: if ``value_string`` contains more than one ".".
    """
    parts = value_string.split(".")
    if len(parts) > 2:
        # The original message had a stray "$" before the interpolated value.
        raise ValueError(
            f"The value string: '{value_string}' has more than one point in it."
        )
    integer = parts[0]
    decimal = parts[1] if len(parts) == 2 else ""

    result = verbalize_cardinal(integer)

    decimal = decimal.rstrip("0")
    if decimal:
        # An empty integer part (input such as ".5") is read as "零".
        result = result if result else "零"
        result += "点" + verbalize_digit(decimal)
    return result
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: verbalize an empty string and print the result.
    sample = ""
    print(num2str(sample))
|
|
|