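# NOTE(review): the three items below look like repository web-page residue
# (uploader avatar caption, commit message, commit hash) -- confirm and drop.
# DerivedFunction1's picture
# add
# e6c77d8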
from __future__ import annotations
import json
from pathlib import Path
import pycountry
# The alias table is a JSON file stored in the same directory as this module.
LANGUAGE_ALIASES_JSON = Path(__file__).parent / "language_aliases.json"
def _load_json_mapping(path: Path) -> dict[str, object]:
    """Read the UTF-8 JSON file at *path* and return its top-level object.

    Raises:
        ValueError: if the decoded document is not a JSON object (``dict``).
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"Expected JSON object in {path}")
def _load_language_aliases(path: Path = LANGUAGE_ALIASES_JSON) -> dict[str, tuple[str, ...]]:
    """Load the alias table at *path* as ``{canonical: (alias, ...)}``.

    Entries whose key is not a string or whose value is not a list are
    skipped. Within each list, only non-empty strings are kept, in their
    original order.
    """
    raw_table = _load_json_mapping(path)
    return {
        code: tuple(name for name in names if isinstance(name, str) and name)
        for code, names in raw_table.items()
        if isinstance(code, str) and isinstance(names, list)
    }
# Alias table, read from disk once when the module is imported.
LANGUAGE_ALIASES = _load_language_aliases()
# Canonical language codes, in the order they appear in the alias table.
ALL_LANGS = list(LANGUAGE_ALIASES)
# Every code in ALL_LANGS that is NOT on the exclusion list below. The
# excluded codes are grouped by the script each language is usually written in.
LATIN_ONLY_LANGS = set(ALL_LANGS).difference(
    {
        # Arabic script
        "ar",
        "fa",
        "ps",
        "sd",
        "ug",
        "ur",
        "ckb",
        # Hebrew script
        "he",
        "yi",
        # Cyrillic
        "ru",
        "uk",
        "be",
        "bg",
        "mk",
        "kk",
        "mn",
        "tt",
        "ky",
        "tg",
        "ba",
        "ce",
        # Greek, Armenian, Georgian
        "el",
        "hy",
        "ka",
        # Ethiopic
        "am",
        "ti",
        # Thaana
        "dv",
        # Southeast Asian, Sinhala and Tibetan scripts
        "km",
        "lo",
        "my",
        "th",
        "si",
        "bo",
        # Indic scripts
        "hi",
        "mr",
        "ne",
        "bn",
        "as",
        "ta",
        "te",
        "gu",
        "kn",
        "ml",
        "pa",
        "or",
        # CJK
        "ja",
        "zh",
        "ko",
    }
)
def _iso3_or_self(lang: str) -> str:
    """Return the pycountry ``alpha_3`` code for *lang*, or *lang* itself.

    The code is looked up as ``alpha_2`` first and as ``alpha_3`` only when
    that yields nothing. If no record is found, or the record has no truthy
    ``alpha_3`` attribute, the input is returned unchanged.
    """
    record = pycountry.languages.get(alpha_2=lang)
    if not record:
        record = pycountry.languages.get(alpha_3=lang)
    return getattr(record, "alpha_3", None) or lang


# Maps each canonical code in ALL_LANGS to its three-letter code (or itself).
LANG_ISO2_TO_ISO3 = {lang: _iso3_or_self(lang) for lang in ALL_LANGS}
# Reverse lookup: alias -> canonical code. If the same alias is listed under
# several canonical codes, the entry that comes last in LANGUAGE_ALIASES wins.
LANG_ALIASES = {
    alt_name: code
    for code, alt_names in LANGUAGE_ALIASES.items()
    for alt_name in alt_names
}
def canonical_lang(lang: str) -> str:
    """Return the canonical code for *lang*, or *lang* unchanged if it is not a known alias."""
    try:
        return LANG_ALIASES[lang]
    except KeyError:
        return lang
def label_script_suffix(label: str) -> str | None:
    """Return the text after the last underscore of *label*, or ``None``.

    The label is whitespace-stripped first, with a falsy label treated as
    the empty string. ``None`` is returned when there is no underscore or
    when the stripped text after the last underscore is empty.
    """
    cleaned = (label or "").strip()
    _, underscore, tail = cleaned.rpartition("_")
    if not underscore:
        return None
    tail = tail.strip()
    return tail if tail else None
def is_latin_script_label(label: str) -> bool:
    """Return True when the script suffix of *label* is exactly ``Latn``."""
    script = label_script_suffix(label)
    return script == "Latn"
def is_latin_script_compatible(lang: str, label: str) -> bool:
    """Return False when a `_Latn` label is used for a non-Latin language.

    Labels without a ``Latn`` suffix are always compatible. For ``Latn``
    labels, the canonical form of *lang* must be in ``LATIN_ONLY_LANGS``.
    """
    if is_latin_script_label(label):
        return canonical_lang(lang) in LATIN_ONLY_LANGS
    return True