ASR_AGENT_ / analysis /normalize.py
unknown
Normalize traditional Chinese to simplified for ASR evaluation
4ed43e6
from __future__ import annotations
import re
import unicodedata
from typing import Optional
from opencc import OpenCC
# Any run of whitespace; used to collapse it down to a single space.
RE_MULTI_SPACE = re.compile(r"\s+")

# Runs of punctuation that carry no spoken content.
# The first literal is the original set (ASCII marks plus common Chinese
# marks). The second literal adds CJK bracket/quotation pairs such as
# 《》 「」 『』 【】: NFKC normalization leaves these unchanged, so without
# them here they would survive punctuation removal in normalize_text_zh.
RE_PUNCT = re.compile(
    r"[,。!?、;:,.!?;:()\[\]{}<>\"'“”‘’·…\-—_"
    r"《》〈〉「」『』【】〔〕〖〗]+"
)

# Traditional Chinese -> Simplified Chinese converter, built once at import
# time and shared by every call.
CC_T2S = OpenCC("t2s")
def normalize_text_zh(
    text: Optional[str],
    remove_punct: bool = True,
    to_simplified: bool = True,
) -> Optional[str]:
    """
    Bring Chinese text into a canonical form for ASR scoring.

    The same normalization is meant to be applied to both the reference
    and the hypothesis before WER/CER/alignment, so that purely
    orthographic differences (e.g. '天气' vs '天氣') are not counted as
    recognition errors.

    Pipeline:
        1. NFKC Unicode normalization, then trim surrounding whitespace.
        2. Traditional -> Simplified conversion (if ``to_simplified``).
        3. Replace punctuation runs with a space (if ``remove_punct``).
        4. Collapse whitespace runs to one space and trim again.

    Args:
        text: Input string, or None.
        remove_punct: When True, characters matched by RE_PUNCT are
            replaced by a single space.
        to_simplified: When True, the text is passed through the
            module-level OpenCC "t2s" converter.

    Returns:
        The normalized string, or None when ``text`` is None.
    """
    if text is None:
        return None

    normalized = unicodedata.normalize("NFKC", text).strip()

    if to_simplified:
        normalized = CC_T2S.convert(normalized)

    if remove_punct:
        # Substitute a space rather than the empty string so that tokens
        # separated only by punctuation do not get glued together.
        normalized = RE_PUNCT.sub(" ", normalized)

    collapsed = RE_MULTI_SPACE.sub(" ", normalized)
    return collapsed.strip()