Spaces:
Sleeping
Sleeping
| """ | |
| Shared constants used across the gairaigo origin classifier pipeline. | |
| ISO_639_2_NAMES maps the three-letter ISO 639-2 language codes that JMdict | |
| uses in its <lsource> tags to their full English language names. This is used | |
| to replace raw codes like 'fre' or 'ger' with readable labels like 'French' | |
| or 'German' throughout the pipeline — in charts, printed reports, and CSVs. | |
| If a code appears in the data but is not listed here, the code itself is kept | |
| as-is so nothing breaks silently. New codes can simply be added to the dict. | |
| Reference: https://www.loc.gov/standards/iso639-2/php/code_list.php | |
| """ | |
| ISO_639_2_NAMES: dict[str, str] = { | |
| # Most common donor languages in JMdict | |
| "eng": "English", | |
| "fre": "French", | |
| "ger": "German", | |
| "por": "Portuguese", | |
| "dut": "Dutch", | |
| "ita": "Italian", | |
| "spa": "Spanish", | |
| "chi": "Chinese", | |
| "kor": "Korean", | |
| "rus": "Russian", | |
| "ara": "Arabic", | |
| "lat": "Latin", | |
| "grc": "Ancient Greek", | |
| "gre": "Modern Greek", | |
| "san": "Sanskrit", | |
| "ain": "Ainu", | |
| # Less common but present in JMdict | |
| "afr": "Afrikaans", | |
| "alb": "Albanian", | |
| "arm": "Armenian", | |
| "bnt": "Bantu", | |
| "bur": "Burmese", | |
| "cze": "Czech", | |
| "dan": "Danish", | |
| "egy": "Ancient Egyptian", | |
| "epo": "Esperanto", | |
| "fin": "Finnish", | |
| "geo": "Georgian", | |
| "haw": "Hawaiian", | |
| "heb": "Hebrew", | |
| "hin": "Hindi", | |
| "hun": "Hungarian", | |
| "ice": "Icelandic", | |
| "ind": "Indonesian", | |
| "iri": "Irish", | |
| "khm": "Khmer", | |
| "may": "Malay", | |
| "mol": "Moldavian", | |
| "mon": "Mongolian", | |
| "nor": "Norwegian", | |
| "per": "Persian", | |
| "pol": "Polish", | |
| "rum": "Romanian", | |
| "scr": "Croatian", | |
| "slo": "Slovak", | |
| "slv": "Slovenian", | |
| "swa": "Swahili", | |
| "swe": "Swedish", | |
| "tha": "Thai", | |
| "tib": "Tibetan", | |
| "tur": "Turkish", | |
| "ukr": "Ukrainian", | |
| "vie": "Vietnamese", | |
| "wel": "Welsh", | |
| "yid": "Yiddish", | |
| # Catch-all label for consolidated rare classes | |
| "other": "Other", | |
| } | |
| def decode_language(code: str) -> str: | |
| """ | |
| Convert an ISO 639-2 code to a full language name. | |
| Falls back to the code itself if it is not in the mapping table, | |
| so the pipeline never crashes on an unexpected code. | |
| Args: | |
| code : ISO 639-2 three-letter language code (e.g. 'fre'). | |
| Returns: | |
| Full language name (e.g. 'French'), or the original code if unknown. | |
| """ | |
| return ISO_639_2_NAMES.get(code, code) | |