Spaces:
Running
Running
File size: 1,097 Bytes
b9196ed 4ed43e6 b9196ed 4ed43e6 b9196ed 4ed43e6 b9196ed 4ed43e6 b9196ed 4ed43e6 b9196ed 4ed43e6 b9196ed 4ed43e6 b9196ed | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 | from __future__ import annotations
import re
import unicodedata
from typing import Optional
from opencc import OpenCC
# Any run of whitespace (spaces, tabs, newlines, ideographic space, ...).
RE_MULTI_SPACE = re.compile(r"\s+")

# One or more consecutive punctuation characters.
# The class is assembled from labelled pieces so each family is easy to audit.
RE_PUNCT = re.compile(
    "["
    # ASCII punctuation.
    r",.!?;:()\[\]{}<>\"'\-_"
    # Fullwidth forms. NFKC folds these to the ASCII row above; they are
    # listed so the pattern also works on text that was not NFKC-normalized.
    r",.!?;:()"
    # CJK / general punctuation that NFKC leaves as-is (NFKC turns the
    # ellipsis into ASCII dots, which the first row already covers).
    r"。、·…—“”‘’"
    # CJK title marks and bracket pairs. NFKC does not map these to anything
    # above, so without this row they would survive punctuation removal and
    # be counted as characters when scoring.
    r"《》〈〉「」『』【】〔〕〖〗"
    "]+"
)
# Shared OpenCC converter using the "t2s" configuration
# (Traditional Chinese -> Simplified Chinese). It is created once when the
# module is imported and reused by normalize_text_zh().
CC_T2S = OpenCC("t2s")
def normalize_text_zh(
    text: Optional[str],
    remove_punct: bool = True,
    to_simplified: bool = True,
) -> Optional[str]:
    """
    Canonicalize a Chinese string so that ASR references and hypotheses
    can be compared fairly.

    The same pipeline is meant to be applied to BOTH the reference and the
    hypothesis before WER/CER/alignment, so that purely orthographic
    differences (e.g. '天气' vs '天氣') are not scored as recognition errors.

    Pipeline, in order:
        1. NFKC Unicode normalization, then trim surrounding whitespace.
        2. Traditional -> Simplified conversion (when ``to_simplified``).
        3. Replace each run of punctuation with one space
           (when ``remove_punct``).
        4. Collapse every whitespace run to a single space and trim again.

    Args:
        text: The string to normalize; ``None`` is passed through.
        remove_punct: Replace punctuation matched by ``RE_PUNCT`` with a
            space. Punctuation is substituted, not deleted, so the text on
            either side of it stays separated.
        to_simplified: Convert the text with the ``CC_T2S`` converter.

    Returns:
        The normalized string, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None

    normalized = unicodedata.normalize("NFKC", text).strip()

    if to_simplified:
        normalized = CC_T2S.convert(normalized)

    if remove_punct:
        normalized = RE_PUNCT.sub(" ", normalized)

    # Punctuation removal can leave leading/trailing or doubled spaces,
    # so whitespace is collapsed and trimmed last.
    collapsed = RE_MULTI_SPACE.sub(" ", normalized)
    return collapsed.strip()
|