Spaces:
Running
Running
| # Copyright 2024 Hung-Shin Lee (hungshinlee@gmail.com) | |
| # Apache 2.0 | |
| import itertools | |
| import re | |
# Chinese numerals indexed by digit value: c_basic[3] is the character for 3.
c_basic = "零一二三四五六七八九"
# Maps each ASCII digit character (and the decimal point) to its Chinese
# character, for digit-by-digit readings.
d2c = dict(zip("0123456789", c_basic))
d2c["."] = "點"
def num4year(matched):
    """Regex callback: read every captured digit run digit by digit.

    Each captured group of the match is replaced, inside the full matched
    text, by the per-digit Chinese characters from ``c_basic``; the rest of
    the matched text is returned unchanged.
    """
    spoken = matched.group(0)
    for digits in matched.groups():
        as_chinese = "".join(c_basic[int(d)] for d in digits)
        spoken = spoken.replace(digits, as_chinese)
    return spoken
def num2chines_simple(matched):
    """Read a digit string character by character using the ``d2c`` table.

    ``matched`` is iterated as a sequence of characters; each one must be a
    key of ``d2c`` (an ASCII digit or "."), otherwise ``KeyError`` is raised.
    """
    return "".join(map(d2c.__getitem__, matched))
def num4percent(matched):
    """Regex callback: turn a captured percentage into "百分之" + number.

    Group 1 is expected to end with the percent sign, which is dropped
    before the numeric part is handed to ``num2chinese``.
    """
    percent_text = matched.group(1)
    number_part = percent_text[:-1]  # strip the trailing "%"
    return "百分之" + num2chinese(number_part)
def num4cellphone(matched):
    """Regex callback: read a captured phone number digit by digit.

    Group 1 holds the number, possibly with a hyphen and whitespace between
    the digit groups. Only the ASCII digits are converted; every separator
    is discarded. Filtering on digits (rather than stripping just " " and
    "-") matters because the calling pattern uses ``\\s``, which also matches
    tabs, newlines and full-width spaces that ``int()`` cannot parse.
    """
    number = matched.group(1)
    return "".join(c_basic[int(ch)] for ch in number if ch in "0123456789")
def num4er(matched):  # 2 to 二
    """Regex callback: replace every "2" in group 1 with "二"."""
    pieces = matched.group(1).split("2")
    return "二".join(pieces)
def num4liang(matched):  # 2 to 兩
    """Regex callback: replace every "2" in group 1 with "兩"."""
    return matched.group(1).translate({ord("2"): "兩"})
def num4general(matched):
    """Regex callback for a generic number with an optional leading character.

    Group 1 is either a bare number or one non-digit character followed by
    a number. The leading character, if any, is kept as-is and the number
    is converted:

    * bare number: read as a quantity via ``num2chinese``;
    * ASCII letter / "-" / "─" prefix with fewer than three characters after
      it (e.g. MP3, F-16): read as a quantity;
    * same prefix with three or more characters after it (e.g. AM104):
      read digit by digit via ``num2chines_simple``;
    * any other prefix: read as a quantity.
    """
    token = matched.group(1)
    head, tail = token[0], token[1:]

    if re.match("[0-9]", head):
        # No prefix character: the whole token is the number.
        return num2chinese(token)

    if re.match("[A-Za-z-─]", head) and len(tail) >= 3:
        # AM104
        return head + num2chines_simple(tail)

    # MP3 or F-16, or a number preceded by any other character.
    return head + num2chinese(tail)
def parse_num(text: str) -> str:
    """Replace the Arabic numerals in ``text`` with Chinese readings.

    The substitutions run in a fixed order, from the most specific pattern
    to the most general one, so that digits consumed by an earlier rule are
    no longer visible to the later ones:

    1. years ("2019年", "2019到2024年"): read digit by digit;
    2. percentages ("12.5%"): "百分之" + quantity;
    3. phone numbers (4 digits, hyphen, 6 digits): read digit by digit;
    4. a lone "2" before 診/樓/月/號 or after 初: "二";
    5. any other lone "2" between two non-digit characters: "兩";
    6. every remaining number, with an optional leading character.

    Args:
        text: the input sentence.

    Returns:
        The sentence with the numbers rewritten.
    """
    # year
    text = re.sub("([0-9]{4})[到至]([0-9]{4})年", num4year, text)
    text = re.sub("([0-9]{4})年", num4year, text)
    # percentage
    # The fractional part is "dot plus one or more digits" as a unit, so
    # "12.34%" is matched whole instead of only its tail "34%".
    text = re.sub(r"([0-9]+(?:\.[0-9]+)?%)", num4percent, text)
    # cellphone
    text = re.sub(r"([0-9]{4}\s?-\s?[0-9]{6})", num4cellphone, text)
    # single 2 to 二
    text = re.sub(r"([^\d]2[診樓月號])", num4er, text)
    text = re.sub(r"([初]2[^\d])", num4er, text)
    # single 2 to 兩
    text = re.sub(r"([^\d]2[^\d])", num4liang, text)
    # general number
    # Same fractional-part rule as above: all decimals stay in one match
    # ("12.345" is not split into "12.3" and "45"), and a "." that merely
    # ends the sentence is not swallowed by the number.
    text = re.sub(r"([^0-9]?[0-9]+(?:\.[0-9]+)?)", num4general, text)
    return text
def num2chinese(num, big=False, simp=False, o=False, twoalt=True) -> str:
    """
    Converts numbers to Chinese representations.

    https://gist.github.com/gumblex/0d65cad2ba607fd14de7

    Args:
        num: the number to convert. It is processed through ``str(num)``,
            so ints, floats and plain decimal strings such as "-12.5" or
            ".5" are accepted.
        big: use financial characters.
        simp: use simplified characters instead of traditional characters.
        o: use 〇 for zero.
        twoalt: use 两/兩 for two when appropriate.

    Note that `o` and `twoalt` are ignored when `big` is used,
    and `twoalt` is ignored when `o` is used for formal representations.

    Returns:
        The Chinese reading: an optional sign character, the integer part
        read as a quantity, and, if present, the decimal mark followed by
        the fractional digits read one by one.

    Raises:
        ValueError: if ``str(num)`` is not a number, its magnitude is at
            least 1e48, or it is written in scientific notation.
    """
    # check num first (float() itself raises ValueError on non-numeric text)
    nd = str(num)
    if abs(float(nd)) >= 1e48:
        raise ValueError("number out of range")
    elif "e" in nd.lower():
        # lower() so that "1E5" is rejected here with the proper message
        raise ValueError("scientific notation is not supported")

    c_symbol = "正负点" if simp else "正負點"
    if o:  # formal
        twoalt = False
    if big:
        c_basic = "零壹贰叁肆伍陆柒捌玖" if simp else "零壹貳參肆伍陸柒捌玖"
        c_unit1 = "拾佰仟"
        c_twoalt = "贰" if simp else "貳"
    else:
        c_basic = "〇一二三四五六七八九" if o else "零一二三四五六七八九"
        c_unit1 = "十百千"
        if twoalt:
            c_twoalt = "两" if simp else "兩"
        else:
            c_twoalt = "二"
    c_unit2 = "万亿兆京垓秭穰沟涧正载" if simp else "萬億兆京垓秭穰溝澗正載"

    def revuniq(l):
        # Join the items in reverse order, collapsing runs of equal
        # neighbours (this is what merges consecutive zero markers).
        return "".join(k for k, g in itertools.groupby(reversed(l)))

    result = []
    if nd[0] == "+":
        result.append(c_symbol[0])
    elif nd[0] == "-":
        result.append(c_symbol[1])

    if "." in nd:
        integer, remainder = nd.lstrip("+-").split(".")
    else:
        integer, remainder = nd.lstrip("+-"), None
    # ".5" has an empty integer part; treat it as zero instead of letting
    # int("") blow up.
    integer = integer or "0"

    if int(integer):
        # Split into groups of four digits, least significant group first.
        splitted = [integer[max(i - 4, 0) : i] for i in range(len(integer), 0, -4)]
        intresult = []
        for nu, unit in enumerate(splitted):
            # special cases
            if int(unit) == 0:  # 0000
                intresult.append(c_basic[0])
                continue
            elif nu > 0 and int(unit) == 2:  # 0002
                intresult.append(c_twoalt + c_unit2[nu - 1])
                continue

            ulist = []
            unit = unit.zfill(4)
            # Walk the group from the ones digit upwards.
            for nc, ch in enumerate(reversed(unit)):
                if ch == "0":
                    if ulist:  # ???0
                        ulist.append(c_basic[0])
                elif nc == 0:
                    ulist.append(c_basic[int(ch)])
                elif nc == 1 and ch == "1" and unit[:2] == "00":
                    # special case for tens
                    # edit the 'elif' if you don't like
                    # 十四, 三千零十四, 三千三百一十四
                    ulist.append(c_unit1[0])
                elif nc > 1 and ch == "2":
                    ulist.append(c_twoalt + c_unit1[nc - 1])
                else:
                    ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])

            ustr = revuniq(ulist)
            if nu == 0:
                intresult.append(ustr)
            else:
                intresult.append(ustr + c_unit2[nu - 1])
        # Drop the zero markers left at either end by the grouping.
        result.append(revuniq(intresult).strip(c_basic[0]))
    else:
        result.append(c_basic[0])

    if remainder:
        result.append(c_symbol[2])
        result.append("".join(c_basic[int(ch)] for ch in remainder))
    return "".join(result)
# Demo: convert one sample sentence containing a phone number and print
# the original next to the converted text.
if __name__ == "__main__":
    text = "若手機仔幾多號?吾手機仔係0964-498042。"
    print("{} -> {}".format(text, parse_num(text)))