from __future__ import annotations

import json
import os
from pathlib import Path

import pycountry

from source_config import LANGUAGE_BUCKETS

# ---------------------------------------------------------------------------
# Per-bucket views over LANGUAGE_BUCKETS.
# Each bucket config is read for its "langs", "weight" and "min_chars" keys,
# plus an optional truthy "latin" flag.
# ---------------------------------------------------------------------------
LANGUAGE_GROUPS = {
    name: bucket["langs"] for name, bucket in LANGUAGE_BUCKETS.items()
}
LANGUAGE_GROUP_WEIGHTS = {
    name: float(bucket["weight"]) for name, bucket in LANGUAGE_BUCKETS.items()
}
LANGUAGE_GROUP_MIN_CHARS = {
    name: int(bucket["min_chars"]) for name, bucket in LANGUAGE_BUCKETS.items()
}
LATIN_GROUPS = {
    name for name, bucket in LANGUAGE_BUCKETS.items() if bucket.get("latin")
}

# The alias table is stored in a JSON file that sits next to this module.
LANGUAGE_ALIASES_JSON = Path(__file__).with_name("language_aliases.json")


def _load_json_mapping(path: Path) -> dict[str, object]:
    """Parse the UTF-8 JSON file at *path* and return its top-level object.

    Raises:
        ValueError: if the decoded document is not a JSON object (``dict``).
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"Expected JSON object in {path}")


def _load_language_aliases(path: Path = LANGUAGE_ALIASES_JSON) -> dict[str, tuple[str, ...]]:
    """Return ``{canonical code: (alias, ...)}`` read from the JSON file at *path*.

    Entries whose key is not a string or whose value is not a list are
    skipped without error. Within each list, only non-empty strings are kept,
    in their original order.
    """
    raw = _load_json_mapping(path)
    return {
        code: tuple(name for name in names if isinstance(name, str) and name)
        for code, names in raw.items()
        if isinstance(code, str) and isinstance(names, list)
    }


def _iso3_code(lang: str) -> str:
    """Return pycountry's ``alpha_3`` code for *lang*, or *lang* itself.

    *lang* is looked up as an alpha-2 code first and as an alpha-3 code
    second; when neither lookup yields a record with a non-empty ``alpha_3``
    attribute, the input is returned unchanged.
    """
    record = pycountry.languages.get(alpha_2=lang) or pycountry.languages.get(alpha_3=lang)
    return getattr(record, "alpha_3", None) or lang


LANGUAGE_ALIASES = _load_language_aliases()

# Canonical codes, in the order they appear in the alias file.
ALL_LANGS = list(LANGUAGE_ALIASES)

LANG_ISO2_TO_ISO3 = {code: _iso3_code(code) for code in ALL_LANGS}

# Reverse index: alias -> canonical code. If an alias is listed under more
# than one canonical code, the entry that appears last wins.
LANG_ALIASES = {
    name: code
    for code, names in LANGUAGE_ALIASES.items()
    for name in names
}

LANG_SOURCE_CODES = {code: tuple(names) for code, names in LANGUAGE_ALIASES.items()}

# Reverse index: language -> name of the bucket that lists it.
LANG_TO_GROUP = {
    code: name
    for name, members in LANGUAGE_GROUPS.items()
    for code in members
}


def canonical_lang(lang: str) -> str:
    """Map a known alias to its canonical code; return any other value as-is."""
    try:
        return LANG_ALIASES[lang]
    except KeyError:
        return lang


def canonical_lang_family(lang: str) -> str:
    """Collapse closely related variants to the app's display/training families.

    As written, this simply delegates to ``canonical_lang``.
    """
    family = canonical_lang(lang)
    return family