Spaces:

krislette
/

kataklassifer

Sleeping

File size: 9,213 Bytes

98a8573

"""
Parses the full JMdict XML and exports all gairaigo (外来語) entries
grouped by donor language, with country metadata for the world map.

Output: gairaigo_full.json
  {
    "eng": {
      "language": "English",
      "country": "United Kingdom",
      "iso2": "GB",
      "words": [
        { "katakana": "コーヒー", "meaning": "coffee" },
        ...
      ]
    },
    ...
  }

Usage:
    python export_gairaigo.py --jmdict data/JMdict --out data/gairaigo_full.json
"""

import argparse
import json
import re
from lxml import etree
from pathlib import Path


# ISO 639-2 (bibliographic) code → { language name, country name,
# ISO 3166-1 alpha-2 country code } mapping.
# Keys are the three-letter codes JMdict uses in <lsource xml:lang="...">.
# "iso2" is the country code used by the D3 world map (TopoJSON); each
# language is pinned to a single representative country.
LANGUAGE_META: dict[str, dict] = {
    # Core European donors
    "eng": {"language": "English", "country": "United Kingdom", "iso2": "GB"},
    "fre": {"language": "French", "country": "France", "iso2": "FR"},
    "ger": {"language": "German", "country": "Germany", "iso2": "DE"},
    "por": {"language": "Portuguese", "country": "Portugal", "iso2": "PT"},
    "spa": {"language": "Spanish", "country": "Spain", "iso2": "ES"},
    "ita": {"language": "Italian", "country": "Italy", "iso2": "IT"},
    "dut": {"language": "Dutch", "country": "Netherlands", "iso2": "NL"},
    "rus": {"language": "Russian", "country": "Russia", "iso2": "RU"},
    "swe": {"language": "Swedish", "country": "Sweden", "iso2": "SE"},
    "nor": {"language": "Norwegian", "country": "Norway", "iso2": "NO"},
    "dan": {"language": "Danish", "country": "Denmark", "iso2": "DK"},
    "fin": {"language": "Finnish", "country": "Finland", "iso2": "FI"},
    "gre": {"language": "Greek", "country": "Greece", "iso2": "GR"},
    "pol": {"language": "Polish", "country": "Poland", "iso2": "PL"},
    "cze": {"language": "Czech", "country": "Czech Republic", "iso2": "CZ"},
    "hun": {"language": "Hungarian", "country": "Hungary", "iso2": "HU"},
    "rum": {"language": "Romanian", "country": "Romania", "iso2": "RO"},
    # Asian donors
    "chi": {"language": "Chinese", "country": "China", "iso2": "CN"},
    "kor": {"language": "Korean", "country": "South Korea", "iso2": "KR"},
    "vie": {"language": "Vietnamese", "country": "Vietnam", "iso2": "VN"},
    # ISO 639-2 code for Malay is "may" ("mal" is Malayalam, see below).
    "may": {"language": "Malay", "country": "Malaysia", "iso2": "MY"},
    "tgl": {"language": "Tagalog", "country": "Philippines", "iso2": "PH"},
    "ind": {"language": "Indonesian", "country": "Indonesia", "iso2": "ID"},
    "tha": {"language": "Thai", "country": "Thailand", "iso2": "TH"},
    "bur": {"language": "Burmese", "country": "Myanmar", "iso2": "MM"},
    "khm": {"language": "Khmer", "country": "Cambodia", "iso2": "KH"},
    # South Asian donors
    "san": {"language": "Sanskrit", "country": "India", "iso2": "IN"},
    "hin": {"language": "Hindi", "country": "India", "iso2": "IN"},
    "tam": {"language": "Tamil", "country": "India", "iso2": "IN"},
    "mal": {"language": "Malayalam", "country": "India", "iso2": "IN"},
    "urd": {"language": "Urdu", "country": "Pakistan", "iso2": "PK"},
    "ben": {"language": "Bengali", "country": "Bangladesh", "iso2": "BD"},
    # Middle Eastern / Central Asian donors
    "ara": {"language": "Arabic", "country": "Saudi Arabia", "iso2": "SA"},
    "per": {"language": "Persian", "country": "Iran", "iso2": "IR"},
    "tur": {"language": "Turkish", "country": "Turkey", "iso2": "TR"},
    # African donors
    "swa": {"language": "Swahili", "country": "Tanzania", "iso2": "TZ"},
    "amh": {"language": "Amharic", "country": "Ethiopia", "iso2": "ET"},
    # Indigenous language of Japan
    "ain": {"language": "Ainu", "country": "Japan", "iso2": "JP"},
    # Americas donors
    "grn": {"language": "Guaraní", "country": "Paraguay", "iso2": "PY"},
    "que": {"language": "Quechua", "country": "Peru", "iso2": "PE"},
    "nah": {"language": "Nahuatl", "country": "Mexico", "iso2": "MX"},
    # Classical / ecclesiastical
    "lat": {"language": "Latin", "country": "Italy", "iso2": "IT"},
    "heb": {"language": "Hebrew", "country": "Israel", "iso2": "IL"},
    # Oceanian donors
    "mao": {"language": "Māori", "country": "New Zealand", "iso2": "NZ"},
    # Catch-all entry for callers that look up the literal key "unknown".
    # It is unrelated to the JMdict rule that a missing xml:lang on <lsource>
    # means English; "XX" is a placeholder that matches no country on the map.
    "unknown": {"language": "Unknown", "country": "Unknown", "iso2": "XX"},
}

# Katakana Unicode block U+30A0–U+30FF. The range already contains the middle
# dot (U+30FB), the prolonged sound mark (U+30FC) and the iteration marks
# (U+30FD, U+30FE), so they do not need to be listed separately.
# \A ... \Z anchor to the absolute start/end of the string; unlike "$", \Z
# does not tolerate a trailing newline.
KATAKANA_RE = re.compile(r"\A[\u30A0-\u30FF]+\Z")


def is_katakana(text: str) -> bool:
    """Return True if *text* is non-empty and made up only of katakana-block characters.

    Any other character (kanji, hiragana, Latin letters, whitespace, a
    trailing newline) makes the result False.
    """
    return KATAKANA_RE.fullmatch(text) is not None


def _katakana_forms(entry) -> list[str]:
    """Return the pure-katakana spellings of a JMdict <entry> element.

    Katakana <keb> (kanji-element) spellings are preferred; <reb> (reading)
    spellings are used only when no <keb> qualifies. The list is empty when
    the entry has no pure-katakana form at all.
    """
    forms = [
        keb.text
        for keb in entry.findall("k_ele/keb")
        if keb.text and is_katakana(keb.text)
    ]
    if forms:
        return forms
    return [
        reb.text
        for reb in entry.findall("r_ele/reb")
        if reb.text and is_katakana(reb.text)
    ]


def parse_jmdict(jmdict_path: Path) -> dict[str, dict]:
    """
    Parse JMdict XML and return data grouped by ISO 639-2 donor language code.

    Every sense that carries at least one <lsource> element contributes the
    entry's katakana forms to each donor language named by those elements.
    Senses without <lsource> are skipped individually, so the position of the
    loan-source sense inside the entry does not matter. A (katakana, language)
    pair is emitted only once; the meaning of the first sense encountered for
    that pair is kept.

    Args:
        jmdict_path: Path to the JMdict XML file.

    Returns:
        {
            "eng": { ...meta..., "words": [{"katakana": ..., "meaning": ...}, ...] },
            ...
        }
        Language codes missing from LANGUAGE_META get the raw code as both
        "language" and "country", and "XX" as "iso2".
    """
    xml_lang_attr = "{http://www.w3.org/XML/1998/namespace}lang"

    result: dict[str, dict] = {}
    seen: set[tuple[str, str]] = set()  # (katakana, lang_code) dedup

    context = etree.iterparse(str(jmdict_path), events=("end",), tag="entry")

    for _, entry in context:
        katakana_forms = _katakana_forms(entry)

        # Entries without a katakana form have nothing to export; they still
        # go through the memory clean-up at the bottom of the loop.
        senses = entry.findall("sense") if katakana_forms else []

        for sense in senses:
            lsources = sense.findall("lsource")
            if not lsources:
                # This sense is not marked as a loan; other senses may be.
                continue

            # Join all glosses of this sense into one display string.
            glosses = [g.text.strip() for g in sense.findall("gloss") if g.text]
            meaning = "; ".join(glosses)

            # A sense may name several donor languages (e.g. compounds built
            # from parts of different origin); register the word under each.
            for lsource in lsources:
                # JMdict convention: absent xml:lang defaults to English
                lang_code = lsource.get(xml_lang_attr, "eng")

                if lang_code not in result:
                    meta = LANGUAGE_META.get(
                        lang_code,
                        {
                            "language": lang_code,
                            "country": lang_code,
                            "iso2": "XX",
                        },
                    )
                    result[lang_code] = {
                        "language": meta["language"],
                        "country": meta["country"],
                        "iso2": meta["iso2"],
                        "words": [],
                    }

                words = result[lang_code]["words"]
                for kana in katakana_forms:
                    key = (kana, lang_code)
                    if key not in seen:
                        seen.add(key)
                        words.append({"katakana": kana, "meaning": meaning})

        # Release the processed subtree. clear() empties the element but leaves
        # it attached to the root, so already-handled siblings are also
        # detached to keep memory flat while streaming the whole dictionary.
        entry.clear()
        while entry.getprevious() is not None:
            del entry.getparent()[0]

    return result


def main():
    """Command-line entry point: parse JMdict, print a summary, write the JSON export.

    Options:
        --jmdict     path of the JMdict XML input
        --out        path of the JSON file to write (parent dirs are created)
        --min-words  languages with fewer words than this are left out

    Raises:
        FileNotFoundError: if the JMdict input path does not exist.
    """
    cli = argparse.ArgumentParser(description="Export JMdict gairaigo to JSON")
    cli.add_argument(
        "--jmdict",
        default="data/JMdict",
        help="Path to the JMdict XML file (default: data/JMdict)",
    )
    cli.add_argument(
        "--out",
        default="data/gairaigo_full.json",
        help="Output JSON path (default: data/gairaigo_full.json)",
    )
    cli.add_argument(
        "--min-words",
        type=int,
        default=1,
        help="Minimum word count to include a language (default: 1)",
    )
    args = cli.parse_args()

    # Fail early with download instructions when the input is missing.
    jmdict_path = Path(args.jmdict)
    if not jmdict_path.exists():
        raise FileNotFoundError(
            f"JMdict file not found at '{jmdict_path}'. "
            "Download it from https://www.edrdg.org/wiki/index.php/JMdict-EDICT_Dictionary_Project "
            "and place it under data/JMdict."
        )

    print(f"Parsing {jmdict_path} ...")
    parsed = parse_jmdict(jmdict_path)

    # Keep only languages that reach the threshold, and order each kept
    # language's words by katakana.
    data = {}
    for lang_code, bucket in parsed.items():
        if len(bucket["words"]) < args.min_words:
            continue
        bucket["words"].sort(key=lambda w: w["katakana"])
        data[lang_code] = bucket

    # Terminal summary
    total_words = 0
    for bucket in data.values():
        total_words += len(bucket["words"])
    print(f"\n✓ Languages found : {len(data)}")
    print(f"✓ Total entries   : {total_words}")
    print("\nTop 10 by word count:")
    ranked = sorted(data.items(), key=lambda kv: len(kv[1]["words"]), reverse=True)
    for code, meta in ranked[:10]:
        print(f"  {code:6s}  {meta['language']:20s}  {len(meta['words']):>5} words")

    # Persist the result as human-readable UTF-8 JSON.
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"\n✓ Saved → {out_path}")


if __name__ == "__main__":
    main()