Spaces:

ZTXRiley
/

ASR_AGENT_

Running

File size: 1,097 Bytes

b9196ed
4ed43e6
b9196ed
 
 
 
4ed43e6
 
b9196ed
 
 
4ed43e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b9196ed
4ed43e6
 
 
b9196ed
 
4ed43e6
b9196ed
 
4ed43e6
 
 
 
b9196ed
 
4ed43e6
b9196ed

from __future__ import annotations

import re
import unicodedata
from typing import Optional

from opencc import OpenCC

# Matches any run of one or more whitespace characters; used to collapse
# such runs into a single space.
RE_MULTI_SPACE = re.compile(r"\s+")
# Matches any run of one or more of the listed punctuation marks: common
# Chinese marks, ASCII punctuation, round/square/curly/angle brackets,
# straight and curly quotes, middle dot, ellipsis, hyphen, em dash and
# underscore. Characters not listed in the class are left alone.
RE_PUNCT = re.compile(r"[，。！？、；：,.!?;:()\[\]{}<>\"'“”‘’·…\-—_]+")

# Traditional Chinese -> Simplified Chinese converter ("t2s" OpenCC config).
# Built once at import time and shared by every call in this module.
CC_T2S = OpenCC("t2s")


def normalize_text_zh(
    text: Optional[str],
    remove_punct: bool = True,
    to_simplified: bool = True,
) -> Optional[str]:
    """
    Canonicalize a Chinese string so ASR references and hypotheses compare fairly.

    The pipeline, in order:
    1. Apply Unicode NFKC normalization and trim surrounding whitespace.
    2. If ``to_simplified`` is true, convert Traditional characters to
       Simplified through the module-level ``CC_T2S`` converter.
    3. If ``remove_punct`` is true, replace every run matched by
       ``RE_PUNCT`` with a single space.
    4. Collapse every whitespace run to one space and trim again.

    Intended to be applied to BOTH the reference and the hypothesis before
    WER/CER/alignment, so that script variants such as '天气' vs '天氣' are
    not scored as recognition errors.

    Args:
        text: Input string, or ``None``.
        remove_punct: Whether to replace punctuation runs with a space.
        to_simplified: Whether to run Traditional -> Simplified conversion.

    Returns:
        The normalized string, or ``None`` when ``text`` is ``None``.
    """
    # None passes straight through; nothing below is evaluated.
    if text is None:
        return None

    normalized = unicodedata.normalize("NFKC", text).strip()

    if to_simplified:
        normalized = CC_T2S.convert(normalized)

    if remove_punct:
        # Replace with a space (not "") so tokens separated only by
        # punctuation do not get glued together.
        normalized = RE_PUNCT.sub(" ", normalized)

    # Punctuation removal can leave doubled or edge spaces; tidy them last.
    return RE_MULTI_SPACE.sub(" ", normalized).strip()