Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import json | |
| import os | |
| from pathlib import Path | |
| import pycountry | |
| from source_config import LANGUAGE_BUCKETS | |
# Per-group views over LANGUAGE_BUCKETS (imported from source_config).
# Each bucket config is read by key: "langs", "weight", "min_chars", and the
# optional "latin" flag.

# Group name -> the bucket's "langs" entry.
LANGUAGE_GROUPS = {
    name: bucket["langs"]
    for name, bucket in LANGUAGE_BUCKETS.items()
}

# Group name -> the bucket's "weight", coerced to float.
LANGUAGE_GROUP_WEIGHTS = {
    name: float(bucket["weight"])
    for name, bucket in LANGUAGE_BUCKETS.items()
}

# Group name -> the bucket's "min_chars", coerced to int.
LANGUAGE_GROUP_MIN_CHARS = {
    name: int(bucket["min_chars"])
    for name, bucket in LANGUAGE_BUCKETS.items()
}

# Names of the groups whose bucket has a truthy "latin" entry.
LATIN_GROUPS = {
    name
    for name, bucket in LANGUAGE_BUCKETS.items()
    if bucket.get("latin")
}

# The alias table is a JSON file that sits next to this module.
LANGUAGE_ALIASES_JSON = Path(__file__).with_name("language_aliases.json")
def _load_json_mapping(path: Path) -> dict[str, object]:
    """Read *path* as UTF-8 JSON and return its top-level object.

    Args:
        path: Location of the JSON document to load.

    Returns:
        The decoded top-level JSON object as a dict.

    Raises:
        ValueError: If the document's top-level value is not a JSON object.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"Expected JSON object in {path}")
def _load_language_aliases(path: Path = LANGUAGE_ALIASES_JSON) -> dict[str, tuple[str, ...]]:
    """Load the alias table from *path* and normalize it to tuples of strings.

    Entries whose key is not a string or whose value is not a list are
    dropped. Within a kept entry, only non-empty string aliases survive, in
    their original order.

    Args:
        path: JSON file to read; defaults to ``LANGUAGE_ALIASES_JSON``.

    Returns:
        Mapping of canonical code to a tuple of its alias strings.
    """
    raw_table = _load_json_mapping(path)
    return {
        code: tuple(item for item in entries if isinstance(item, str) and item)
        for code, entries in raw_table.items()
        # Keep only well-formed rows; anything else is skipped silently.
        if isinstance(code, str) and isinstance(entries, list)
    }
# Canonical code -> tuple of alias strings, loaded once at import time.
LANGUAGE_ALIASES = _load_language_aliases()

# Canonical codes in the order they appear in the alias table.
ALL_LANGS = list(LANGUAGE_ALIASES)


def _iso3_or_self(code: str) -> str:
    """Return pycountry's ``alpha_3`` for *code*, or *code* if none is found.

    The code is looked up as an alpha-2 value first and as an alpha-3 value
    second; a missing record or a missing/empty ``alpha_3`` falls back to the
    input unchanged.
    """
    record = pycountry.languages.get(alpha_2=code) or pycountry.languages.get(alpha_3=code)
    return getattr(record, "alpha_3", None) or code


# Canonical code -> alpha-3 code (or the canonical code itself as fallback).
LANG_ISO2_TO_ISO3 = {code: _iso3_or_self(code) for code in ALL_LANGS}
# Reverse lookup: alias string -> canonical code. If the same alias is listed
# under several canonical codes, the last one iterated wins.
LANG_ALIASES = {
    source_code: target
    for target, source_codes in LANGUAGE_ALIASES.items()
    for source_code in source_codes
}

# Canonical code -> tuple of the alias strings listed for it.
LANG_SOURCE_CODES = {
    target: tuple(source_codes)
    for target, source_codes in LANGUAGE_ALIASES.items()
}

# Language code -> name of the group whose "langs" entry contains it.
LANG_TO_GROUP = {
    member: bucket_name
    for bucket_name, members in LANGUAGE_GROUPS.items()
    for member in members
}
def canonical_lang(lang: str) -> str:
    """Resolve *lang* through ``LANG_ALIASES``.

    Args:
        lang: A language code, possibly an alias.

    Returns:
        The canonical code registered for the alias, or *lang* unchanged when
        it is not a known alias.
    """
    if lang in LANG_ALIASES:
        return LANG_ALIASES[lang]
    return lang
def canonical_lang_family(lang: str) -> str:
    """Return the family code used for *lang*.

    This currently delegates straight to ``canonical_lang``, so the family is
    simply the canonical (alias-resolved) code.
    """
    family = canonical_lang(lang)
    return family