Spaces:

krislette
/

kataklassifer

Running

File size: 2,468 Bytes

caf26c9

"""
Shared constants used across the gairaigo origin classifier pipeline.

ISO_639_2_NAMES maps the three-letter ISO 639-2 language codes that JMdict
uses in its <lsource> tags to their full English language names. This is used
to replace raw codes like 'fre' or 'ger' with readable labels like 'French'
or 'German' throughout the pipeline — in charts, printed reports, and CSVs.

If a code appears in the data but is not listed here, the code itself is kept
as-is so nothing breaks silently. New codes can simply be added to the dict.

Reference: https://www.loc.gov/standards/iso639-2/php/code_list.php
"""

ISO_639_2_NAMES: dict[str, str] = {
    # Most common donor languages in JMdict
    "eng": "English",
    "fre": "French",
    "ger": "German",
    "por": "Portuguese",
    "dut": "Dutch",
    "ita": "Italian",
    "spa": "Spanish",
    "chi": "Chinese",
    "kor": "Korean",
    "rus": "Russian",
    "ara": "Arabic",
    "lat": "Latin",
    "grc": "Ancient Greek",
    "gre": "Modern Greek",
    "san": "Sanskrit",
    "ain": "Ainu",
    # Less common but present in JMdict
    "afr": "Afrikaans",
    "alb": "Albanian",
    "arm": "Armenian",
    "bnt": "Bantu",
    "bur": "Burmese",
    "cze": "Czech",
    "dan": "Danish",
    "egy": "Ancient Egyptian",
    "epo": "Esperanto",
    "fin": "Finnish",
    "geo": "Georgian",
    "haw": "Hawaiian",
    "heb": "Hebrew",
    "hin": "Hindi",
    "hun": "Hungarian",
    "ice": "Icelandic",
    "ind": "Indonesian",
    "iri": "Irish",
    "khm": "Khmer",
    "may": "Malay",
    "mol": "Moldavian",
    "mon": "Mongolian",
    "nor": "Norwegian",
    "per": "Persian",
    "pol": "Polish",
    "rum": "Romanian",
    "scr": "Croatian",
    "slo": "Slovak",
    "slv": "Slovenian",
    "swa": "Swahili",
    "swe": "Swedish",
    "tha": "Thai",
    "tib": "Tibetan",
    "tur": "Turkish",
    "ukr": "Ukrainian",
    "vie": "Vietnamese",
    "wel": "Welsh",
    "yid": "Yiddish",
    # Catch-all label for consolidated rare classes
    "other": "Other",
}


def decode_language(code: str) -> str:
    """
    Convert an ISO 639-2 code to a full language name.

    Falls back to the code itself if it is not in the mapping table,
    so the pipeline never crashes on an unexpected code.

    Args:
        code : ISO 639-2 three-letter language code (e.g. 'fre').

    Returns:
        Full language name (e.g. 'French'), or the original code if unknown.
    """
    return ISO_639_2_NAMES.get(code, code)