Spaces:

ZTXRiley
/

ASR_AGENT_

Running

unknown

Normalize traditional Chinese to simplified for ASR evaluation

4ed43e6 12 days ago

1.1 kB

	from __future__ import annotations

	import re
	import unicodedata
	from typing import Optional

	from opencc import OpenCC

	# Any run of whitespace (spaces, tabs, newlines); collapsed to one space.
	RE_MULTI_SPACE = re.compile(r"\s+")

	# One or more consecutive punctuation characters. Built from labelled pieces
	# so each group of characters is easy to audit and extend.
	RE_PUNCT = re.compile(
	    "["
	    "，。！？、；：,.!?;:"  # CJK and ASCII sentence punctuation
	    r"()\[\]{}<>"  # ASCII brackets (full-width forms fold to these under NFKC)
	    "《》〈〉「」『』【】〔〕"  # CJK brackets / corner quotes: NFKC leaves them as-is
	    "\"'“”‘’"  # straight and curly quotation marks
	    r"·…~\-–—_"  # middle dot, ellipsis, tilde, hyphen, en/em dash, underscore
	    "]+"
	)

	# Traditional Chinese -> Simplified Chinese converter.
	# Created once at module import and reused by every call, so the OpenCC
	# object is not rebuilt per string ("t2s" is the conversion name passed to it).
	CC_T2S = OpenCC("t2s")


	def normalize_text_zh(
	    text: Optional[str],
	    remove_punct: bool = True,
	    to_simplified: bool = True,
	) -> Optional[str]:
	    """Return a canonical form of Chinese text for ASR scoring.

	    The same normalization is meant to be applied to both the reference
	    and the hypothesis before WER/CER/alignment, so that purely
	    orthographic differences (e.g. '天气' vs '天氣') are not counted as
	    recognition errors.

	    Processing order:
	        1. NFKC Unicode normalization, then trim surrounding whitespace.
	        2. Traditional -> Simplified conversion (when ``to_simplified``).
	        3. Replace punctuation runs with a space (when ``remove_punct``).
	        4. Collapse whitespace runs to a single space and trim again.

	    Args:
	        text: Input string, or ``None``.
	        remove_punct: If true, punctuation matched by ``RE_PUNCT`` is
	            replaced by a space.
	        to_simplified: If true, the text is passed through ``CC_T2S``.

	    Returns:
	        The normalized string, or ``None`` when ``text`` is ``None``.
	    """
	    # Guard: a missing transcript stays missing rather than becoming "".
	    if text is None:
	        return None

	    normalized = unicodedata.normalize("NFKC", text).strip()

	    if to_simplified:
	        normalized = CC_T2S.convert(normalized)

	    if remove_punct:
	        # Substitute a space (not "") so words split by punctuation stay apart.
	        normalized = RE_PUNCT.sub(" ", normalized)

	    # Punctuation removal can leave doubled or edge spaces; tidy them last.
	    return RE_MULTI_SPACE.sub(" ", normalized).strip()