Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import re | |
| from typing import Literal | |
# Labels returned by detect_lang_type().
LangType = Literal["zh", "en", "mixed", "other"]
# Labels returned by choose_primary_level().
LevelType = Literal["char", "word"]
# Matches one character in the CJK Unified Ideographs range U+4E00-U+9FFF.
RE_CJK = re.compile(r"[\u4e00-\u9fff]")
# Matches one ASCII letter.
RE_LATIN = re.compile(r"[A-Za-z]")
# Matches one token, trying the alternatives in this order:
#   1. a run of ASCII letters, optionally followed by one apostrophe
#      (straight or curly) and another run of letters, e.g. "don't";
#   2. a run of digits, optionally followed by one "." or ":" and more
#      digits, e.g. "3.14" or "12:30";
#   3. a single character in U+4E00-U+9FFF;
#   4. any single character that is neither a word character nor whitespace.
# Whitespace never matches, so it only ever separates tokens.
RE_WORD = re.compile(r"[A-Za-z]+(?:['’][A-Za-z]+)?|\d+(?:[\.:]\d+)?|[\u4e00-\u9fff]|[^\w\s]", re.UNICODE)
def detect_lang_type(text: str | None) -> LangType:
    """Classify *text* by which of the two script patterns it contains.

    Args:
        text: The string to inspect; ``None`` is treated as empty.

    Returns:
        ``"mixed"`` when both ``RE_CJK`` and ``RE_LATIN`` match somewhere in
        the stripped text, ``"zh"`` when only ``RE_CJK`` matches, ``"en"``
        when only ``RE_LATIN`` matches, and ``"other"`` when neither matches
        (which includes empty and whitespace-only input).
    """
    stripped = (text or "").strip()
    # Only presence matters for the classification, so one search per
    # pattern is enough; an empty string simply matches neither.
    has_cjk = RE_CJK.search(stripped) is not None
    has_latin = RE_LATIN.search(stripped) is not None
    if has_cjk and has_latin:
        return "mixed"
    if has_cjk:
        return "zh"
    return "en" if has_latin else "other"
def choose_primary_level(lang_type: str) -> LevelType:
    """Pick the tokenization level for a language label.

    Args:
        lang_type: A language label such as ``"en"`` or ``"zh"``.

    Returns:
        ``"word"`` when *lang_type* is exactly ``"en"``; ``"char"`` for every
        other value.
    """
    if lang_type == "en":
        return "word"
    return "char"
def split_word_like(text: str) -> list[str]:
    """Tokenize *text* into the word-like units matched by ``RE_WORD``.

    Args:
        text: The string to tokenize; a falsy value is treated as empty.

    Returns:
        Every non-overlapping ``RE_WORD`` match in the stripped text, in
        order of appearance. Empty or whitespace-only input yields ``[]``.
    """
    cleaned = text.strip() if text else ""
    return RE_WORD.findall(cleaned) if cleaned else []
def split_chars_no_space(text: str) -> list[str]:
    """Split *text* into single characters, discarding all whitespace.

    Every character for which ``str.isspace()`` is true is dropped: ASCII
    spaces, tabs, newlines, and Unicode spaces such as the ideographic space
    U+3000. Only non-whitespace characters become list elements.

    Args:
        text: The string to split; a falsy value is treated as empty.

    Returns:
        The non-whitespace characters of *text* in their original order, or
        ``[]`` when *text* is empty, ``None``, or whitespace only.
    """
    if not text:
        return []
    # str.split() with no separator splits on runs of any whitespace, so
    # re-joining the pieces removes every whitespace character in one pass.
    return list("".join(text.split()))