Spaces:
Running
Running
| from __future__ import annotations | |
| import re | |
| import unicodedata | |
| from typing import Optional | |
| from opencc import OpenCC | |
# One or more consecutive whitespace characters; used to collapse runs of
# whitespace into a single ASCII space.
RE_MULTI_SPACE = re.compile(r"\s+")

# One or more consecutive punctuation characters to blank out before scoring.
# The first literal is the original set (ASCII and common CJK sentence
# punctuation). The second adds the CJK bracket and quotation marks, which
# NFKC normalization does NOT fold to ASCII and which would otherwise survive
# punctuation removal and be counted as character errors.
RE_PUNCT = re.compile(
    r"[,。!?、;:,.!?;:()\[\]{}<>\"'“”‘’·…\-—_"
    r"《》〈〉「」『』【】〔〕]+"
)
# Traditional Chinese -> Simplified Chinese converter ("t2s" configuration).
# Created once at import time and shared by every normalize_text_zh() call.
CC_T2S = OpenCC("t2s")
def normalize_text_zh(
    text: Optional[str],
    remove_punct: bool = True,
    to_simplified: bool = True,
) -> Optional[str]:
    """
    Bring Chinese text into a canonical form for ASR scoring.

    Intended to be applied to both the reference and the hypothesis before
    WER/CER/alignment, so that purely orthographic differences such as
    '天气' vs '天氣' are not counted as recognition errors.

    Pipeline:
      1. NFKC Unicode normalization, then trim leading/trailing whitespace.
      2. Traditional -> Simplified conversion (when ``to_simplified``).
      3. Replace each run of punctuation with a space (when ``remove_punct``).
      4. Collapse whitespace runs to a single space and trim again.

    Args:
        text: The string to normalize, or None.
        remove_punct: Whether to blank out punctuation matched by RE_PUNCT.
        to_simplified: Whether to convert Traditional characters to
            Simplified via CC_T2S.

    Returns:
        The normalized string, or None when ``text`` is None.
    """
    if text is None:
        return None

    normalized = unicodedata.normalize("NFKC", text).strip()

    if to_simplified:
        normalized = CC_T2S.convert(normalized)

    if remove_punct:
        # Substitute a space rather than deleting, so tokens separated only
        # by punctuation are not glued together.
        normalized = RE_PUNCT.sub(" ", normalized)

    # Always collapse whitespace, including the spaces introduced above.
    return RE_MULTI_SPACE.sub(" ", normalized).strip()