| from __future__ import annotations |
|
|
| import json |
| from pathlib import Path |
|
|
| import pycountry |
|
|
# Default location of the alias table: a JSON file in the same directory as this module.
LANGUAGE_ALIASES_JSON = Path(__file__).parent / "language_aliases.json"
|
|
|
|
def _load_json_mapping(path: Path) -> dict[str, object]:
    """Read the UTF-8 JSON file at ``path`` and return its top-level object.

    Raises:
        ValueError: if the decoded document is not a JSON object (``dict``).
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"Expected JSON object in {path}")
|
|
|
|
def _load_language_aliases(path: Path = LANGUAGE_ALIASES_JSON) -> dict[str, tuple[str, ...]]:
    """Build the canonical-code -> aliases table from the JSON file at ``path``.

    Entries whose value is not a list are skipped rather than reported.
    Within each list only non-empty strings are kept, and the surviving
    aliases are frozen into a tuple.
    """
    raw = _load_json_mapping(path)
    return {
        code: tuple(name for name in names if isinstance(name, str) and name)
        for code, names in raw.items()
        if isinstance(code, str) and isinstance(names, list)
    }
|
|
|
|
# Canonical language code -> tuple of accepted aliases, loaded when the module is imported.
LANGUAGE_ALIASES = _load_language_aliases()
# The canonical codes, in the order they appear in LANGUAGE_ALIASES.
ALL_LANGS = list(LANGUAGE_ALIASES)
# Every canonical code from ALL_LANGS except the ones listed here. The listed
# codes are the languages that is_latin_script_compatible() rejects when they
# are paired with a `_Latn` label.
LATIN_ONLY_LANGS = set(ALL_LANGS) - {
    "ar", "fa", "ps", "sd", "ug", "ur", "ckb",
    "he", "yi",
    "ru", "uk", "be", "bg", "mk", "kk", "mn", "tt", "ky", "tg", "ba", "ce",
    "el", "hy", "ka",
    "am", "ti",
    "dv",
    "km", "lo", "my", "th", "si", "bo",
    "hi", "mr", "ne", "bn", "as", "ta", "te", "gu", "kn", "ml", "pa", "or",
    "ja", "zh", "ko",
}
def _alpha3_or_self(code: str) -> str:
    """Return pycountry's ``alpha_3`` for ``code``, or ``code`` itself when none is found."""
    # Look the code up as a two-letter (alpha_2) code first, then as a three-letter one.
    record = pycountry.languages.get(alpha_2=code) or pycountry.languages.get(alpha_3=code)
    return getattr(record, "alpha_3", None) or code


# Each canonical code mapped to its three-letter code, or to itself when
# pycountry has no matching record.
LANG_ISO2_TO_ISO3 = {code: _alpha3_or_self(code) for code in ALL_LANGS}
# Reverse lookup: alias -> canonical code. If the same alias is listed under
# several canonical codes, the entry that comes last in LANGUAGE_ALIASES wins.
LANG_ALIASES = {
    alias: code
    for code, alias_group in LANGUAGE_ALIASES.items()
    for alias in alias_group
}
|
|
|
|
def canonical_lang(lang: str) -> str:
    """Map an alias to its canonical language code.

    The lookup in ``LANG_ALIASES`` is an exact match; anything that is not a
    registered alias is returned unchanged.
    """
    try:
        return LANG_ALIASES[lang]
    except KeyError:
        return lang
|
|
|
|
def label_script_suffix(label: str) -> str | None:
    """Extract the part of ``label`` that follows its last underscore.

    Surrounding whitespace is ignored on both the label and the suffix.
    Returns ``None`` when the label is empty or ``None``, contains no
    underscore, or has only whitespace after the last underscore.
    """
    text = (label or "").strip()
    _head, separator, tail = text.rpartition("_")
    if not separator:
        # No underscore anywhere in the label.
        return None
    tail = tail.strip()
    return tail if tail else None
|
|
|
|
def is_latin_script_label(label: str) -> bool:
    """Return True when the label's script suffix is exactly ``Latn``."""
    script = label_script_suffix(label)
    return script == "Latn"
|
|
|
|
def is_latin_script_compatible(lang: str, label: str) -> bool:
    """Return False when a `_Latn` label is used for a non-Latin language.

    Labels without a `Latn` suffix are always accepted. For `_Latn` labels,
    ``lang`` is first resolved to its canonical code and must then be a
    member of ``LATIN_ONLY_LANGS``.
    """
    if is_latin_script_label(label):
        return canonical_lang(lang) in LATIN_ONLY_LANGS
    return True
|
|