# Lightweight language-type detection and tokenization helpers.
#
# Classifies a string as Chinese, English, mixed or other, and splits text
# into word-like tokens or individual characters.
from __future__ import annotations

import re
from typing import Literal

LangType = Literal["zh", "en", "mixed", "other"]
LevelType = Literal["char", "word"]

# One character from the CJK Unified Ideographs block (U+4E00..U+9FFF).
RE_CJK = re.compile(r"[\u4e00-\u9fff]")
# One ASCII Latin letter.
RE_LATIN = re.compile(r"[A-Za-z]")
# Token alternatives, tried left to right:
#   1. ASCII word with an optional apostrophe suffix ("don't", "it’s")
#   2. number with an optional "." or ":" part ("3.14", "12:30")
#   3. a single CJK ideograph
#   4. any single character that is neither a word character nor whitespace
RE_WORD = re.compile(
    r"[A-Za-z]+(?:['’][A-Za-z]+)?|\d+(?:[\.:]\d+)?|[\u4e00-\u9fff]|[^\w\s]",
    re.UNICODE,
)


def detect_lang_type(text: str | None) -> LangType:
    """Classify *text* by the scripts it contains.

    Args:
        text: Input string; ``None`` is treated as empty.

    Returns:
        ``"mixed"`` if the text has both CJK ideographs and ASCII letters,
        ``"zh"`` if it has only CJK ideographs, ``"en"`` if it has only
        ASCII letters, and ``"other"`` when it has neither (including
        empty or whitespace-only input).
    """
    text = (text or "").strip()
    if not text:
        return "other"

    # Only presence matters, so a single search per script is enough.
    has_cjk = RE_CJK.search(text) is not None
    has_latin = RE_LATIN.search(text) is not None

    if has_cjk and has_latin:
        return "mixed"
    if has_cjk:
        return "zh"
    if has_latin:
        return "en"
    return "other"


def choose_primary_level(lang_type: str) -> LevelType:
    """Return ``"word"`` for ``"en"`` and ``"char"`` for any other value."""
    return "word" if lang_type == "en" else "char"


def split_word_like(text: str | None) -> list[str]:
    """Split *text* into the word-like tokens matched by ``RE_WORD``.

    Args:
        text: Input string; ``None`` is treated as empty.

    Returns:
        The tokens in order of appearance. Whitespace never appears in the
        result, and characters matching none of the ``RE_WORD``
        alternatives (for example ``_``) are skipped.
    """
    text = (text or "").strip()
    if not text:
        return []
    return RE_WORD.findall(text)


def split_chars_no_space(text: str | None) -> list[str]:
    """Split *text* into single characters, dropping all whitespace.

    Args:
        text: Input string; ``None`` is treated as empty.

    Returns:
        One list element per non-whitespace character, in order.
    """
    # str.split() with no separator splits on every whitespace character
    # (tabs, newlines, U+3000 ideographic space, ...), not just U+0020, so
    # the result agrees with split_word_like, which also ignores all
    # whitespace.
    return list("".join((text or "").split()))