"""Phone-attendant text normalizer (zh-TW + English) — the single source of truth for how entities are
read, used BOTH to prep teacher text (so VoxCPM2 audio matches) and at inference (frontend). Handles:
email, time of day, phone/extension, serial number, price, percent, temperature (°C), date, decimals,
person-count, address — digit-by-digit vs cardinal vs ordinal chosen by entity + language context.
Remaining zh digit runs are converted here via cn2an; English-context digit runs shorter than 4 digits
are left untouched for g2p_en downstream. Idempotent-ish: meant to be run once on raw text."""
import re
try:
import inflect; _P = inflect.engine()
except Exception:
_P = None
import cn2an
_ZH = {"0":"零","1":"一","2":"二","3":"三","4":"四","5":"五","6":"六","7":"七","8":"八","9":"九"}
_EN = {"0":"zero","1":"one","2":"two","3":"three","4":"four","5":"five","6":"six","7":"seven","8":"eight","9":"nine"}
_ADDR = "號樓段巷弄室坪"
_zh = lambda c: '一' <= c <= '鿿'
def _en_ctx(text, s, e):
L = next((c for c in reversed(text[:s]) if _zh(c) or re.match(r'[A-Za-z]', c)), None)
R = next((c for c in text[e:] if _zh(c) or re.match(r'[A-Za-z]', c)), None)
for c in (L, R):
if c is None: continue
if re.match(r'[A-Za-z]', c): return True
if _zh(c): return False
return False
def _dd(d, en):
    """Spell the digit string ``d`` one digit at a time.

    English output joins the ``_EN`` words with single spaces; Chinese output
    concatenates the ``_ZH`` characters with no separator.  Any character that
    is not a key of the chosen table raises ``KeyError``.
    """
    if en:
        return " ".join(_EN[ch] for ch in d)
    return "".join(_ZH[ch] for ch in d)
def _card_zh(d):
    """Read ``d`` as a Chinese cardinal via ``cn2an.an2cn(int(d), "low")``.

    Best-effort: if the conversion raises for any reason (non-numeric input,
    value rejected by cn2an, ...) the digits are spelled one by one instead.
    """
    try:
        spoken = cn2an.an2cn(int(d), "low")
    except Exception:
        # deliberate catch-all: a normalizer should degrade, not abort the utterance
        spoken = _dd(d, False)
    return spoken
def _card_en(n):
    """Read ``n`` (int or digit string) as an English cardinal.

    With inflect available the number is converted with ``andword=""`` and the
    hyphens/commas inflect emits are removed ("ninety-nine" -> "ninety nine").

    Without inflect the digits are spelled one by one.  Leading zeros are
    dropped first so the fallback agrees with the inflect path, which goes
    through ``int()``: "05" reads "five", not "zero five".  An all-zero string
    keeps a single "zero"; an empty string still yields "".
    """
    if _P:
        words = _P.number_to_words(int(n), andword="")
        return words.replace("-", " ").replace(",", "")
    digits = str(n)
    # strip leading zeros but keep one digit for inputs like "0" / "00"
    digits = digits.lstrip("0") or digits[-1:]
    return _dd(digits, True)
def _ord_en(n):
    """Read ``n`` as an English ordinal in words, with hyphens turned into spaces.

    When inflect is unavailable the input is returned as a plain string so the
    digits stay in the text.
    """
    if not _P:
        return str(n)
    ordinal = _P.ordinal(int(n))
    return _P.number_to_words(ordinal).replace("-", " ")
def _ord_zh(n):
    """Chinese "ordinal" reading of ``n``: currently identical to the cardinal reading."""
    spoken = _card_zh(n)
    return spoken
_MONTH = {1:"January",2:"February",3:"March",4:"April",5:"May",6:"June",7:"July",8:"August",
9:"September",10:"October",11:"November",12:"December"}
def _year_en(y):
    """Read a year the way it is usually spoken in English.

    * 2000-2009: "two thousand" followed by the last digit ("two thousand five").
    * other years in 1000-2099: two pairs ("nineteen eighty four"), with
      "oh" before a single-digit second pair and "hundred" when it is zero.
    * anything else: plain cardinal.
    """
    year = int(y)
    century, rest = divmod(year, 100)
    if 2000 <= year <= 2009:
        if year == 2000:
            return "two thousand"
        return "two thousand " + _card_en(rest)
    if 1000 <= year <= 2099:
        head = _card_en(century)
        if rest == 0:
            tail = "hundred"
        elif rest < 10:
            tail = "oh " + _card_en(rest)
        else:
            tail = _card_en(rest)
        return f"{head} {tail}"
    return _card_en(year)
def normalize(text: str) -> str:
    """Rewrite digits and symbols in ``text`` as speakable zh-TW / English words.

    The passes run in a fixed order and each one consumes the entities it
    recognises, so the later, more general passes only see what is left:
    e-mail, time of day, zh date, en date, temperature, percent, price,
    phone / extension, serial code, decimal, remaining digit runs.  Every pass
    chooses Chinese or English wording from the nearest neighbouring letter
    via ``_en_ctx``.

    Args:
        text: Raw text.  Falsy input (``""`` / ``None``) is returned unchanged.

    Returns:
        The normalized text with full-width comma / full stop / question mark /
        exclamation mark folded to ASCII, runs of spaces and tabs collapsed,
        spaces before ``, . ? !`` removed, and outer whitespace stripped.
    """
    if not text:
        return text

    def is_en(m):
        # m.string is exactly the string being substituted, so offsets always line up
        return _en_ctx(m.string, m.start(), m.end())

    def read_number(whole, frac, en):
        """Cardinal integer part plus, when present, a digit-by-digit fraction."""
        if en:
            spoken = _card_en(whole)
            return f"{spoken} point {_dd(frac, True)}" if frac else spoken
        spoken = _card_zh(whole)
        return f"{spoken}點{_dd(frac, False)}" if frac else spoken

    # 0) Fold non-ASCII decimal digits (e.g. full-width) to ASCII: ``\d`` matches
    #    them, but the per-digit tables are keyed by ASCII digits only.
    text = re.sub(r"(?![0-9])\d", lambda m: str(int(m.group(0))), text)

    # 1) EMAIL -> spell (at / dot, en digits)
    def email(m):
        spelled = m.group(0).replace("@", " at ").replace(".", " dot ")
        spelled = re.sub(r"\d", lambda d: " " + _EN[d.group(0)] + " ", spelled)
        return " " + re.sub(r"\s+", " ", spelled).strip() + " "

    text = re.sub(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}", email, text)

    # 1.5) TIME of day 9:30 / 3:15 PM / 14:30 (needs :MM so ratios like 3:2 are untouched)
    def time_repl(m):
        hour, minutes, ampm = int(m.group(1)), m.group(2), m.group(3)
        minute = int(minutes)
        if is_en(m):
            hh = _card_en(hour)
            if minute == 0:
                spoken = f"{hh} o'clock"
            elif minute < 10:
                spoken = f"{hh} oh {_card_en(minutes)}"
            else:
                spoken = f"{hh} {_card_en(minutes)}"
            if ampm:
                spoken += " " + ("a m" if "a" in ampm.lower() else "p m")
            return " " + spoken + " "
        prefix = ""
        if ampm:
            prefix = "上午" if "a" in ampm.lower() else "下午"
        if minute == 0:
            tail = "整"
        elif minute == 30:
            tail = "半"
        else:
            tail = _card_zh(minutes) + "分"
        return " " + f"{prefix}{_card_zh(str(hour))}點{tail}" + " "

    # re.ASCII: with Unicode word semantics CJK characters count as \w, so \b never
    # fires between a CJK character and a digit and times glued to Chinese text
    # were skipped.
    text = re.sub(
        r"\b(\d{1,2}):(\d{2})(?::\d{2})?\s*([AaPp][.]?[Mm][.]?)?\b",
        time_repl, text, flags=re.ASCII,
    )

    # 2) DATE zh 2024年3月15日 / en mixes
    def date_zh(m):
        year, month, day = m.group(1), m.group(2), m.group(3)
        out = _dd(year, False) + "年"  # years are read digit-by-digit
        if month:
            out += _card_zh(month) + "月"
        if day:
            out += _card_zh(day) + "日"
        return out

    text = re.sub(r"(\d{4})\s*年\s*(?:(\d{1,2})\s*月)?\s*(?:(\d{1,2})\s*[日號])?", date_zh, text)

    def month_day(m):  # en "March 15" / "March 15, 2024"
        month, day, year = m.group(1), m.group(2), m.group(3)
        out = f"{month} {_ord_en(day)}"
        if year:
            out += " " + _year_en(year)
        return out

    text = re.sub(
        r"\b(January|February|March|April|May|June|July|August|September|October|November|December)"
        r"\s+(\d{1,2})(?:st|nd|rd|th)?(?:,?\s*(\d{4}))?\b",
        month_day, text,
    )

    # 3) TEMPERATURE 28°C / 攝氏28度 / -5°C / 36.5°C
    def temp(m):
        sign, whole, frac = m.group(1), m.group(2), m.group(3)
        if is_en(m):
            return f" {'minus ' if sign else ''}{read_number(whole, frac, True)} degrees Celsius "
        return f" 攝氏{'零下' if sign else ''}{read_number(whole, frac, False)}度 "

    # Sign class: ASCII hyphen-minus, full-width hyphen-minus, minus sign (written as
    # escapes so they survive width normalization).  The optional fraction keeps
    # "36.5" together instead of matching only the "5".
    text = re.sub(
        r"(?:攝氏)?\s*([-\uFF0D\u2212]?)(\d+)(?:\.(\d+))?\s*(?:°\s*[Cc]|度[Cc]?|℃)",
        temp, text,
    )

    # 4) PERCENT 70% / 12.5% (ASCII or full-width percent sign)
    def pct(m):
        whole, frac = m.group(1), m.group(2)
        if is_en(m):
            return f" {read_number(whole, frac, True)} percent "
        return f" 百分之{read_number(whole, frac, False)} "

    text = re.sub(r"(\d+)(?:\.(\d+))?\s*[%\uFF05]", pct, text)

    # 5) PRICE $1,299 / NT$500 / 1299元 / USD 49.99 (context-aware; USD/US$ force English)
    # The amount must start with a digit and commas are only accepted as thousands
    # separators, so a bare or trailing "," is neither swallowed nor fed to int().
    amount = r"(\d{1,3}(?:,\d{3})+|\d+)"

    def price_cur(m):
        matched = m.group(0)
        whole = m.group(1).replace(",", "")
        cents = m.group(2)
        en = is_en(m) or ("USD" in matched) or ("US$" in matched)
        if en:
            out = _card_en(whole) + " dollar" + ("" if whole == "1" else "s")
            if cents:
                out += " and " + _card_en(cents) + " cent" + ("" if cents == "01" else "s")
        else:
            out = _card_zh(whole) + "元"
            if cents:
                out += _card_zh(cents) + "分"
        return " " + out + " "

    text = re.sub(r"(?:NT\$|US\$|USD|\$)\s*" + amount + r"(?:\.(\d{2}))?", price_cur, text)

    def price_zh(m):
        return " " + _card_zh(m.group(1).replace(",", "")) + (m.group(2) or "元") + " "

    text = re.sub(amount + r"\s*(元|塊錢|塊|台幣|新台幣)", price_zh, text)

    # 6) PHONE / extension groups -> digit-by-digit
    def phone(m):
        en = is_en(m)
        parts = re.split(r"[-\s]+", m.group(0).strip("()"))
        groups = (re.sub(r"\D", "", part) for part in parts)
        return " " + " ".join(_dd(g, en) for g in groups if g) + " "

    text = re.sub(r"\(?\d{2,4}\)?(?:[-\s]\d{2,4}){1,4}", phone, text)
    # The look-behind keeps "ext" from matching inside words ("next 20", "text 555").
    text = re.sub(
        r"(分機|內線|(?<![A-Za-z])(?:extension|ext\.?))\s*(\d{2,6})",
        lambda m: m.group(1) + " " + _dd(m.group(2), is_en(m)),
        text, flags=re.I,
    )

    # 7) SERIAL / order code (alnum code containing both letters and digits)
    def serial(m):
        code = m.group(0)
        return " " + " ".join((_EN[c] if c.isdigit() else c.upper()) for c in code if c.isalnum()) + " "

    # NOTE(review): \b uses Unicode word semantics here, so a code glued directly to
    # CJK text is not matched by this pass — confirm whether that is intended.
    text = re.sub(
        r"\b(?=[A-Za-z0-9-]*[A-Za-z])(?=[A-Za-z0-9-]*\d)[A-Za-z0-9]{2,}(?:-[A-Za-z0-9]+)*\b",
        serial, text,
    )

    # 7.5) DECIMALS 12.5 -> 十二點五 / twelve point five (avoid IPs/versions a.b.c)
    def dec_repl(m):
        return " " + read_number(m.group(1), m.group(2), is_en(m)) + " "

    text = re.sub(r"(?<![\d.])(\d+)\.(\d+)(?![\d.])", dec_repl, text)

    # 8) COUNTS / remaining standalone digit runs
    def num(m):
        digits = m.group(0)
        en = is_en(m)
        following = m.string[m.end():m.end() + 1]
        if len(digits) >= 5:  # long id/order -> digit-by-digit
            return _dd(digits, en)
        if en:
            # 4-digit en standalone (ext) digits; shorter runs are left for g2p_en
            return _dd(digits, True) if len(digits) >= 4 else digits
        # `following` is "" at end of text and "" is "in" every string, so test it
        # explicitly; otherwise a trailing 4-digit number was read as a cardinal.
        before_address_unit = bool(following) and following in _ADDR
        if len(digits) >= 4 and not before_address_unit:
            return _dd(digits, False)
        return _card_zh(digits)

    text = re.sub(r"\d+", num, text)

    # Fold full-width comma, ideographic full stop, full-width question and
    # exclamation marks to ASCII (code points spelled out so the mapping cannot
    # silently degrade into a no-op).
    text = text.translate({0xFF0C: ",", 0x3002: ".", 0xFF1F: "?", 0xFF01: "!"})
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\s+([,.?!])", r"\1", text)  # no space before punctuation
    return text.strip()
|