Spaces:

ZTXRiley
/

ASR_AGENT_

Sleeping

ASR_AGENT_ / analysis /language_utils.py

unknown

Update wer and cer

d7df0a5 about 2 months ago

1.11 kB

	from __future__ import annotations

	import re
	from typing import Literal

	# Script classes produced by detect_lang_type.
	LangType = Literal["zh", "en", "mixed", "other"]
	# Tokenization granularity selected by choose_primary_level.
	LevelType = Literal["char", "word"]

	# One character in the CJK Unified Ideographs block (U+4E00-U+9FFF).
	RE_CJK = re.compile(r"[\u4e00-\u9fff]")
	# One ASCII letter.
	RE_LATIN = re.compile(r"[A-Za-z]")
	# Word-like token, tried in this order:
	#   1. an ASCII word, optionally with one apostrophe-joined suffix ("it's"),
	#   2. a number, optionally with a "." or ":" part ("3.14", "12:30"),
	#   3. a single CJK ideograph,
	#   4. a single non-word, non-space character (punctuation/symbols).
	# The alternatives must be separated by a bare "|"; an escaped "\|" would
	# match a literal pipe and collapse the pattern into one unmatchable sequence.
	# Characters matching no alternative (e.g. "_" or non-ASCII letters outside
	# the CJK block) are skipped by findall().
	RE_WORD = re.compile(r"[A-Za-z]+(?:['’][A-Za-z]+)?|\d+(?:[\.:]\d+)?|[\u4e00-\u9fff]|[^\w\s]", re.UNICODE)


	def detect_lang_type(text: str | None) -> LangType:
	    """Classify *text* by the scripts it contains.

	    Returns:
	        "mixed" if the text has both a CJK ideograph (RE_CJK) and an ASCII
	        letter (RE_LATIN), "zh" if it has only the former, "en" if it has
	        only the latter, and "other" for None, empty/whitespace-only text,
	        or text containing neither.
	    """
	    if not text:
	        return "other"
	    # Only presence matters, so a single search per script is enough;
	    # whitespace-only input matches neither pattern and falls through.
	    has_cjk = RE_CJK.search(text) is not None
	    has_latin = RE_LATIN.search(text) is not None
	    if has_cjk and has_latin:
	        return "mixed"
	    if has_cjk:
	        return "zh"
	    if has_latin:
	        return "en"
	    return "other"


	def choose_primary_level(lang_type: str) -> LevelType:
	    """Pick the tokenization level for a language type.

	    Returns "word" only when *lang_type* is exactly "en"; every other
	    value yields "char".
	    """
	    if lang_type == "en":
	        return "word"
	    return "char"


	def split_word_like(text: str) -> list[str]:
	    """Split *text* into word-like tokens using RE_WORD.

	    Returns an empty list when *text* is falsy or contains only
	    whitespace; otherwise returns every RE_WORD match in order.
	    """
	    stripped = text.strip() if text else ""
	    return RE_WORD.findall(stripped) if stripped else []


	def split_chars_no_space(text: str) -> list[str]:
	    """Split *text* into single characters, dropping ASCII spaces.

	    Only the space character " " is removed; other whitespace such as
	    tabs or newlines is kept as its own element. Falsy input yields an
	    empty list.
	    """
	    if not text:
	        return []
	    return [ch for ch in text if ch != " "]