File size: 7,784 Bytes
c9a2709
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32c85a9
 
 
 
 
 
 
 
 
 
 
 
 
 
c9a2709
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32c85a9
 
 
 
 
 
 
 
c9a2709
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""Phone-attendant text normalizer (zh-TW + English) — the single source of truth for how entities are
read, used BOTH to prep teacher text (so VoxCPM2 audio matches) and at inference (frontend). Handles:
email, phone/extension, serial number, price, percent, temperature(°C), date, person-count, address —
digit-by-digit vs cardinal vs ordinal chosen by entity + language context. Plain numbers are left for
cn2an (zh) / g2p_en (en) downstream. Idempotent-ish: safe to run once on raw text."""
import re
try:
    import inflect; _P = inflect.engine()
except Exception:
    _P = None
import cn2an

_ZH = {"0":"零","1":"一","2":"二","3":"三","4":"四","5":"五","6":"六","7":"七","8":"八","9":"九"}
_EN = {"0":"zero","1":"one","2":"two","3":"three","4":"four","5":"five","6":"six","7":"seven","8":"eight","9":"nine"}
_ADDR = "號樓段巷弄室坪"
_zh = lambda c: '一' <= c <= '鿿'

def _en_ctx(text, s, e):
    L = next((c for c in reversed(text[:s]) if _zh(c) or re.match(r'[A-Za-z]', c)), None)
    R = next((c for c in text[e:] if _zh(c) or re.match(r'[A-Za-z]', c)), None)
    for c in (L, R):
        if c is None: continue
        if re.match(r'[A-Za-z]', c): return True
        if _zh(c): return False
    return False

def _dd(d, en): return (" ".join(_EN[c] for c in d)) if en else ("".join(_ZH[c] for c in d))
def _card_zh(d):
    try: return cn2an.an2cn(int(d), "low")
    except Exception: return _dd(d, False)
def _card_en(n):
    return _P.number_to_words(int(n), andword="").replace("-", " ").replace(",", "") if _P else _dd(str(n), True)
def _ord_en(n):
    return _P.number_to_words(_P.ordinal(int(n))).replace("-", " ") if _P else str(n)
def _ord_zh(n): return _card_zh(n)

_MONTH = {1:"January",2:"February",3:"March",4:"April",5:"May",6:"June",7:"July",8:"August",
          9:"September",10:"October",11:"November",12:"December"}
def _year_en(y):
    y = int(y)
    if 2000 <= y <= 2009: return "two thousand" + ("" if y == 2000 else " " + _card_en(y % 100))
    if 1000 <= y <= 2099:
        return f"{_card_en(y//100)} {('oh ' + _card_en(y%100)) if 0 < y%100 < 10 else (_card_en(y%100) if y%100 else 'hundred')}"
    return _card_en(y)

def normalize(text: str) -> str:
    if not text: return text
    # 1) EMAIL  -> spell (at / dot, en digits)
    def email(m):
        s = m.group(0).replace("@", " at ").replace(".", " dot ")
        s = re.sub(r"\d", lambda d: " " + _EN[d.group(0)] + " ", s)
        return " " + re.sub(r"\s+", " ", s).strip() + " "
    text = re.sub(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}", email, text)

    # 1.5) TIME of day  9:30 / 3:15 PM / 14:30 (needs :MM so ratios like 3:2 are untouched)
    def time_repl(m):
        h, mm, ap = int(m.group(1)), m.group(2), m.group(3)
        mi = int(mm); en = _en_ctx(text, m.start(), m.end())
        if en:
            hh = _card_en(h)
            t = f"{hh} o'clock" if mi == 0 else (f"{hh} oh {_card_en(mm)}" if mi < 10 else f"{hh} {_card_en(mm)}")
            if ap: t += " " + ("a m" if 'a' in ap.lower() else "p m")
            return " " + t + " "
        pre = ("上午" if 'a' in ap.lower() else "下午") if ap else ""
        t = f"{pre}{_card_zh(str(h))}點" + ("整" if mi == 0 else ("半" if mi == 30 else _card_zh(mm) + "分"))
        return " " + t + " "
    text = re.sub(r"\b(\d{1,2}):(\d{2})(?::\d{2})?\s*([AaPp][.]?[Mm][.]?)?\b", time_repl, text)

    # 2) DATE  zh 2024年3月15日 / en mixes
    def date_zh(m):
        y, mo, d = m.group(1), m.group(2), m.group(3)
        out = _dd(y, False) + "年"
        if mo: out += _card_zh(mo) + "月"
        if d: out += _card_zh(d) + "日"
        return out
    text = re.sub(r"(\d{4})\s*年\s*(?:(\d{1,2})\s*月)?\s*(?:(\d{1,2})\s*[日號])?", date_zh, text)
    def month_day(m):  # en "March 15" / "March 15, 2024"
        mon, d, y = m.group(1), m.group(2), m.group(3)
        out = f"{mon} {_ord_en(d)}"
        if y: out += " " + _year_en(y)
        return out
    text = re.sub(r"\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2})(?:st|nd|rd|th)?(?:,?\s*(\d{4}))?\b", month_day, text)

    # 3) TEMPERATURE  28°C / 攝氏28度 / -5°C
    def temp(m):
        sign, n = m.group(1), m.group(2)
        en = _en_ctx(text, m.start(), m.end())
        neg = sign in ("-", "-")
        if en:
            return f" {'minus ' if neg else ''}{_card_en(n)} degrees Celsius "
        return f" 攝氏{'零下' if neg else ''}{_card_zh(n)}度 "
    text = re.sub(r"(?:攝氏)?\s*([--]?)(\d+)\s*(?:°\s*[Cc]|度[Cc]?|℃)", temp, text)

    # 4) PERCENT  70%
    def pct(m):
        n = m.group(1)
        return f" {_card_en(n)} percent " if _en_ctx(text, m.start(), m.end()) else f" 百分之{_card_zh(n)} "
    text = re.sub(r"(\d+)\s*[%%]", pct, text)

    # 5) PRICE  $1,299 / NT$500 / 1299元 / USD 49.99  (context-aware; USD/US$ force English)
    def price_cur(m):
        cur = m.group(0); whole = m.group(1).replace(",", ""); cents = m.group(2)
        en = _en_ctx(text, m.start(), m.end()) or ("USD" in cur) or ("US$" in cur)
        if en:
            out = _card_en(whole) + " dollar" + ("" if whole == "1" else "s")
            if cents: out += " and " + _card_en(cents) + " cent" + ("" if cents == "01" else "s")
        else:
            out = _card_zh(whole) + "元"
            if cents: out += _card_zh(cents) + "分"
        return " " + out + " "
    text = re.sub(r"(?:NT\$|US\$|USD|\$)\s*([\d,]+)(?:\.(\d{2}))?", price_cur, text)
    def price_zh(m):
        return " " + _card_zh(m.group(1).replace(",", "")) + (m.group(2) or "元") + " "
    text = re.sub(r"([\d,]+)\s*(元|塊錢|塊|台幣|新台幣)", price_zh, text)

    # 6) PHONE / extension groups -> digit-by-digit
    def phone(m):
        en = _en_ctx(text, m.start(), m.end())
        groups = [re.sub(r"\D", "", g) for g in re.split(r"[-\s]+", m.group(0).strip("()"))]
        return " " + " ".join(_dd(g, en) for g in groups if g) + " "
    text = re.sub(r"\(?\d{2,4}\)?(?:[-\s]\d{2,4}){1,4}", phone, text)
    text = re.sub(r"(分機|內線|ext\.?|extension)\s*(\d{2,6})",
                  lambda m: m.group(1) + " " + _dd(m.group(2), _en_ctx(text, m.start(), m.end())), text, flags=re.I)

    # 7) SERIAL / order code  (序號/型號/SN context OR alnum code with both letters+digits)
    def serial(m):
        s = m.group(0)
        return " " + " ".join((_EN[c] if c.isdigit() else c.upper()) for c in s if c.isalnum()) + " "
    text = re.sub(r"\b(?=[A-Za-z0-9-]*[A-Za-z])(?=[A-Za-z0-9-]*\d)[A-Za-z0-9]{2,}(?:-[A-Za-z0-9]+)*\b", serial, text)

    # 7.5) DECIMALS  12.5 -> 十二點五 / twelve point five (avoid IPs/versions a.b.c)
    def dec_repl(m):
        whole, frac = m.group(1), m.group(2)
        if _en_ctx(text, m.start(), m.end()):
            return f" {_card_en(whole)} point {' '.join(_EN[c] for c in frac)} "
        return f" {_card_zh(whole)}{''.join(_ZH[c] for c in frac)} "
    text = re.sub(r"(?<![\d.])(\d+)\.(\d+)(?![\d.])", dec_repl, text)

    # 8) COUNTS / remaining standalone digit runs
    def num(m):
        d = m.group(0); en = _en_ctx(text, m.start(), m.end()); after = text[m.end():m.end()+1]
        if len(d) >= 5:                       # long id/order -> digit-by-digit
            return _dd(d, en)
        if en:
            return _dd(d, True) if len(d) >= 4 else d  # 4-digit en standalone (ext) digits; else let g2p_en handle
        return _dd(d, False) if (len(d) >= 4 and after not in _ADDR) else _card_zh(d)
    text = re.sub(r"\d+", num, text)

    text = text.replace(",", ",").replace("。", ".").replace("?", "?").replace("!", "!")
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\s+([,.?!])", r"\1", text)        # no space before punctuation
    return text.strip()