File size: 1,097 Bytes
b9196ed
4ed43e6
b9196ed
 
 
 
4ed43e6
 
b9196ed
 
 
4ed43e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b9196ed
4ed43e6
 
 
b9196ed
 
4ed43e6
b9196ed
 
4ed43e6
 
 
 
b9196ed
 
4ed43e6
b9196ed
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from __future__ import annotations

import re
import unicodedata
from typing import Optional

from opencc import OpenCC

# Any run of whitespace (spaces, tabs, newlines); used to collapse it to one space.
RE_MULTI_SPACE = re.compile(r"\s+")

# Any run of punctuation that should not count towards WER/CER.
# The second line lists CJK bracket/quotation marks that NFKC normalization
# leaves untouched. Traditional Chinese text uses 「」/『』 as quotation marks
# and 《》 as title marks, so without them those symbols would survive
# punctuation removal and be scored as recognition errors.
RE_PUNCT = re.compile(
    r"[,。!?、;:,.!?;:()\[\]{}<>\"'“”‘’·…\-—_"
    r"《》〈〉「」『』【】〔〕〖〗]+"
)

# Shared OpenCC converter loaded with the "t2s" configuration
# (Traditional Chinese -> Simplified Chinese). It is built once at import
# time and reused by normalize_text_zh on every call.
CC_T2S = OpenCC("t2s")


def normalize_text_zh(
    text: Optional[str],
    remove_punct: bool = True,
    to_simplified: bool = True,
) -> Optional[str]:
    """
    Return a canonical form of Chinese text for ASR scoring.

    Apply the same normalization to the reference and the hypothesis
    before computing WER/CER or alignments, so that purely orthographic
    differences (e.g. '天气' vs '天氣') are not reported as recognition
    errors.

    Pipeline, in order:
    1. NFKC Unicode normalization, then trim surrounding whitespace.
    2. If ``to_simplified``: convert Traditional to Simplified via CC_T2S.
    3. If ``remove_punct``: replace each run matched by RE_PUNCT with a space.
    4. Collapse every whitespace run to one space and trim again.

    Args:
        text: The string to normalize, or None.
        remove_punct: Whether to blank out punctuation (step 3).
        to_simplified: Whether to run the Traditional -> Simplified
            conversion (step 2).

    Returns:
        The normalized string, or None when ``text`` is None.
    """
    if text is None:
        return None

    folded = unicodedata.normalize("NFKC", text).strip()
    simplified = CC_T2S.convert(folded) if to_simplified else folded
    # Punctuation becomes a space (not an empty string) so the tokens on
    # either side stay separated; the extra spaces are collapsed below.
    depunctuated = RE_PUNCT.sub(" ", simplified) if remove_punct else simplified
    return RE_MULTI_SPACE.sub(" ", depunctuated).strip()