from __future__ import annotations

import re
import unicodedata
from typing import Optional

try:
    from opencc import OpenCC
except ImportError:
    # OpenCC is only needed for Traditional -> Simplified conversion. Keep the
    # module importable without it so that to_simplified=False still works.
    OpenCC = None

RE_MULTI_SPACE = re.compile(r"\s+")

# Punctuation replaced by a space before scoring. The pattern is applied AFTER
# NFKC normalization, so fullwidth forms of ASCII punctuation have already been
# folded to their ASCII counterparts. CJK brackets and corner quotes are not
# folded by NFKC and must therefore be listed explicitly.
RE_PUNCT = re.compile(
    r"[,。!?、;:.()\[\]{}<>\"'“”‘’·…\-—_"
    r"《》〈〉「」『』【】〔〕〖〗]+"
)

# Traditional Chinese -> Simplified Chinese (None when OpenCC is unavailable)
CC_T2S = OpenCC("t2s") if OpenCC is not None else None


def normalize_text_zh(
    text: Optional[str],
    remove_punct: bool = True,
    to_simplified: bool = True,
) -> Optional[str]:
    """
    Normalize Chinese text for ASR evaluation.

    Steps:
    1. Unicode normalization (NFKC) and trimming of surrounding whitespace
    2. Optional Traditional -> Simplified conversion
    3. Optional punctuation removal (each punctuation run becomes one space)
    4. Collapse repeated whitespace

    This is meant to be applied to BOTH ref and hyp before WER/CER/alignment,
    so differences like '天气' vs '天氣' won't be counted as ASR errors.

    Args:
        text: Input string, or None.
        remove_punct: When True, replace punctuation matched by RE_PUNCT
            (including CJK brackets and corner quotes) with a space.
        to_simplified: When True, convert Traditional characters to
            Simplified using OpenCC's "t2s" profile.

    Returns:
        The normalized string, or None if ``text`` is None.

    Raises:
        ImportError: If ``to_simplified`` is True but the ``opencc`` package
            is not installed.
    """
    if text is None:
        return None

    normalized = unicodedata.normalize("NFKC", text).strip()

    if to_simplified:
        if CC_T2S is None:
            raise ImportError(
                "to_simplified=True requires the 'opencc' package; "
                "install it or call with to_simplified=False"
            )
        normalized = CC_T2S.convert(normalized)

    if remove_punct:
        # Substitute a space rather than the empty string so that tokens on
        # either side of a punctuation mark are not glued together.
        normalized = RE_PUNCT.sub(" ", normalized)

    return RE_MULTI_SPACE.sub(" ", normalized).strip()