| """Phone-attendant text normalizer (zh-TW + English) — the single source of truth for how entities are |
| read, used BOTH to prep teacher text (so VoxCPM2 audio matches) and at inference (frontend). Handles: |
| email, phone/extension, serial number, price, percent, temperature(°C), date, person-count, address — |
| digit-by-digit vs cardinal vs ordinal chosen by entity + language context. Plain numbers are left for |
| cn2an (zh) / g2p_en (en) downstream. Idempotent-ish: safe to run once on raw text.""" |
| import re |
| try: |
| import inflect; _P = inflect.engine() |
| except Exception: |
| _P = None |
| import cn2an |
|
|
| _ZH = {"0":"零","1":"一","2":"二","3":"三","4":"四","5":"五","6":"六","7":"七","8":"八","9":"九"} |
| _EN = {"0":"zero","1":"one","2":"two","3":"three","4":"four","5":"five","6":"six","7":"seven","8":"eight","9":"nine"} |
| _ADDR = "號樓段巷弄室坪" |
| _zh = lambda c: '一' <= c <= '鿿' |
|
|
| def _en_ctx(text, s, e): |
| L = next((c for c in reversed(text[:s]) if _zh(c) or re.match(r'[A-Za-z]', c)), None) |
| R = next((c for c in text[e:] if _zh(c) or re.match(r'[A-Za-z]', c)), None) |
| for c in (L, R): |
| if c is None: continue |
| if re.match(r'[A-Za-z]', c): return True |
| if _zh(c): return False |
| return False |
|
|
| def _dd(d, en): return (" ".join(_EN[c] for c in d)) if en else ("".join(_ZH[c] for c in d)) |
| def _card_zh(d): |
| try: return cn2an.an2cn(int(d), "low") |
| except Exception: return _dd(d, False) |
| def _card_en(n): |
| return _P.number_to_words(int(n), andword="").replace("-", " ").replace(",", "") if _P else _dd(str(n), True) |
| def _ord_en(n): |
| return _P.number_to_words(_P.ordinal(int(n))).replace("-", " ") if _P else str(n) |
| def _ord_zh(n): return _card_zh(n) |
|
|
| _MONTH = {1:"January",2:"February",3:"March",4:"April",5:"May",6:"June",7:"July",8:"August", |
| 9:"September",10:"October",11:"November",12:"December"} |
| def _year_en(y): |
| y = int(y) |
| if 2000 <= y <= 2009: return "two thousand" + ("" if y == 2000 else " " + _card_en(y % 100)) |
| if 1000 <= y <= 2099: |
| return f"{_card_en(y//100)} {('oh ' + _card_en(y%100)) if 0 < y%100 < 10 else (_card_en(y%100) if y%100 else 'hundred')}" |
| return _card_en(y) |
|
|
| def normalize(text: str) -> str: |
| if not text: return text |
| |
| def email(m): |
| s = m.group(0).replace("@", " at ").replace(".", " dot ") |
| s = re.sub(r"\d", lambda d: " " + _EN[d.group(0)] + " ", s) |
| return " " + re.sub(r"\s+", " ", s).strip() + " " |
| text = re.sub(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}", email, text) |
|
|
| |
| def time_repl(m): |
| h, mm, ap = int(m.group(1)), m.group(2), m.group(3) |
| mi = int(mm); en = _en_ctx(text, m.start(), m.end()) |
| if en: |
| hh = _card_en(h) |
| t = f"{hh} o'clock" if mi == 0 else (f"{hh} oh {_card_en(mm)}" if mi < 10 else f"{hh} {_card_en(mm)}") |
| if ap: t += " " + ("a m" if 'a' in ap.lower() else "p m") |
| return " " + t + " " |
| pre = ("上午" if 'a' in ap.lower() else "下午") if ap else "" |
| t = f"{pre}{_card_zh(str(h))}點" + ("整" if mi == 0 else ("半" if mi == 30 else _card_zh(mm) + "分")) |
| return " " + t + " " |
| text = re.sub(r"\b(\d{1,2}):(\d{2})(?::\d{2})?\s*([AaPp][.]?[Mm][.]?)?\b", time_repl, text) |
|
|
| |
| def date_zh(m): |
| y, mo, d = m.group(1), m.group(2), m.group(3) |
| out = _dd(y, False) + "年" |
| if mo: out += _card_zh(mo) + "月" |
| if d: out += _card_zh(d) + "日" |
| return out |
| text = re.sub(r"(\d{4})\s*年\s*(?:(\d{1,2})\s*月)?\s*(?:(\d{1,2})\s*[日號])?", date_zh, text) |
| def month_day(m): |
| mon, d, y = m.group(1), m.group(2), m.group(3) |
| out = f"{mon} {_ord_en(d)}" |
| if y: out += " " + _year_en(y) |
| return out |
| text = re.sub(r"\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2})(?:st|nd|rd|th)?(?:,?\s*(\d{4}))?\b", month_day, text) |
|
|
| |
| def temp(m): |
| sign, n = m.group(1), m.group(2) |
| en = _en_ctx(text, m.start(), m.end()) |
| neg = sign in ("-", "-") |
| if en: |
| return f" {'minus ' if neg else ''}{_card_en(n)} degrees Celsius " |
| return f" 攝氏{'零下' if neg else ''}{_card_zh(n)}度 " |
| text = re.sub(r"(?:攝氏)?\s*([--]?)(\d+)\s*(?:°\s*[Cc]|度[Cc]?|℃)", temp, text) |
|
|
| |
| def pct(m): |
| n = m.group(1) |
| return f" {_card_en(n)} percent " if _en_ctx(text, m.start(), m.end()) else f" 百分之{_card_zh(n)} " |
| text = re.sub(r"(\d+)\s*[%%]", pct, text) |
|
|
| |
| def price_cur(m): |
| cur = m.group(0); whole = m.group(1).replace(",", ""); cents = m.group(2) |
| en = _en_ctx(text, m.start(), m.end()) or ("USD" in cur) or ("US$" in cur) |
| if en: |
| out = _card_en(whole) + " dollar" + ("" if whole == "1" else "s") |
| if cents: out += " and " + _card_en(cents) + " cent" + ("" if cents == "01" else "s") |
| else: |
| out = _card_zh(whole) + "元" |
| if cents: out += _card_zh(cents) + "分" |
| return " " + out + " " |
| text = re.sub(r"(?:NT\$|US\$|USD|\$)\s*([\d,]+)(?:\.(\d{2}))?", price_cur, text) |
| def price_zh(m): |
| return " " + _card_zh(m.group(1).replace(",", "")) + (m.group(2) or "元") + " " |
| text = re.sub(r"([\d,]+)\s*(元|塊錢|塊|台幣|新台幣)", price_zh, text) |
|
|
| |
| def phone(m): |
| en = _en_ctx(text, m.start(), m.end()) |
| groups = [re.sub(r"\D", "", g) for g in re.split(r"[-\s]+", m.group(0).strip("()"))] |
| return " " + " ".join(_dd(g, en) for g in groups if g) + " " |
| text = re.sub(r"\(?\d{2,4}\)?(?:[-\s]\d{2,4}){1,4}", phone, text) |
| text = re.sub(r"(分機|內線|ext\.?|extension)\s*(\d{2,6})", |
| lambda m: m.group(1) + " " + _dd(m.group(2), _en_ctx(text, m.start(), m.end())), text, flags=re.I) |
|
|
| |
| def serial(m): |
| s = m.group(0) |
| return " " + " ".join((_EN[c] if c.isdigit() else c.upper()) for c in s if c.isalnum()) + " " |
| text = re.sub(r"\b(?=[A-Za-z0-9-]*[A-Za-z])(?=[A-Za-z0-9-]*\d)[A-Za-z0-9]{2,}(?:-[A-Za-z0-9]+)*\b", serial, text) |
|
|
| |
| def dec_repl(m): |
| whole, frac = m.group(1), m.group(2) |
| if _en_ctx(text, m.start(), m.end()): |
| return f" {_card_en(whole)} point {' '.join(_EN[c] for c in frac)} " |
| return f" {_card_zh(whole)}點{''.join(_ZH[c] for c in frac)} " |
| text = re.sub(r"(?<![\d.])(\d+)\.(\d+)(?![\d.])", dec_repl, text) |
|
|
| |
| def num(m): |
| d = m.group(0); en = _en_ctx(text, m.start(), m.end()); after = text[m.end():m.end()+1] |
| if len(d) >= 5: |
| return _dd(d, en) |
| if en: |
| return _dd(d, True) if len(d) >= 4 else d |
| return _dd(d, False) if (len(d) >= 4 and after not in _ADDR) else _card_zh(d) |
| text = re.sub(r"\d+", num, text) |
|
|
| text = text.replace(",", ",").replace("。", ".").replace("?", "?").replace("!", "!") |
| text = re.sub(r"[ \t]+", " ", text) |
| text = re.sub(r"\s+([,.?!])", r"\1", text) |
| return text.strip() |
|
|