Spaces:
Sleeping
Sleeping
| """ | |
| Parses the full JMdict XML and exports all gairaigo (外来語) entries | |
| grouped by donor language, with country metadata for the world map. | |
| Output: gairaigo_full.json | |
| { | |
| "eng": { | |
| "language": "English", | |
| "country": "United Kingdom", | |
| "iso2": "GB", | |
| "words": [ | |
| { "katakana": "コーヒー", "meaning": "coffee" }, | |
| ... | |
| ] | |
| }, | |
| ... | |
| } | |
| Usage: | |
| python export_gairaigo.py --jmdict data/JMdict --out data/gairaigo_full.json | |
| """ | |
| import argparse | |
| import json | |
| import re | |
| from lxml import etree | |
| from pathlib import Path | |
# ISO 639-2 (bibliographic) code -> {language name, country name, ISO 3166-1 alpha-2}.
# Keys are the codes that appear in the xml:lang attribute of JMdict <lsource>
# elements. "iso2" is the country code used by the D3 world map (TopoJSON).
# Each language is pinned to a single representative country for the map.
LANGUAGE_META: dict[str, dict] = {
    # Core European donors
    "eng": {"language": "English", "country": "United Kingdom", "iso2": "GB"},
    "fre": {"language": "French", "country": "France", "iso2": "FR"},
    "ger": {"language": "German", "country": "Germany", "iso2": "DE"},
    "por": {"language": "Portuguese", "country": "Portugal", "iso2": "PT"},
    "spa": {"language": "Spanish", "country": "Spain", "iso2": "ES"},
    "ita": {"language": "Italian", "country": "Italy", "iso2": "IT"},
    "dut": {"language": "Dutch", "country": "Netherlands", "iso2": "NL"},
    "rus": {"language": "Russian", "country": "Russia", "iso2": "RU"},
    "swe": {"language": "Swedish", "country": "Sweden", "iso2": "SE"},
    "nor": {"language": "Norwegian", "country": "Norway", "iso2": "NO"},
    "dan": {"language": "Danish", "country": "Denmark", "iso2": "DK"},
    "fin": {"language": "Finnish", "country": "Finland", "iso2": "FI"},
    "gre": {"language": "Greek", "country": "Greece", "iso2": "GR"},
    "pol": {"language": "Polish", "country": "Poland", "iso2": "PL"},
    "cze": {"language": "Czech", "country": "Czech Republic", "iso2": "CZ"},
    "hun": {"language": "Hungarian", "country": "Hungary", "iso2": "HU"},
    "rum": {"language": "Romanian", "country": "Romania", "iso2": "RO"},
    # Asian donors
    "chi": {"language": "Chinese", "country": "China", "iso2": "CN"},
    "kor": {"language": "Korean", "country": "South Korea", "iso2": "KR"},
    "vie": {"language": "Vietnamese", "country": "Vietnam", "iso2": "VN"},
    # ISO 639-2 assigns "may" (not "mal") to Malay.
    "may": {"language": "Malay", "country": "Malaysia", "iso2": "MY"},
    "tgl": {"language": "Tagalog", "country": "Philippines", "iso2": "PH"},
    "ind": {"language": "Indonesian", "country": "Indonesia", "iso2": "ID"},
    "tha": {"language": "Thai", "country": "Thailand", "iso2": "TH"},
    "bur": {"language": "Burmese", "country": "Myanmar", "iso2": "MM"},
    "khm": {"language": "Khmer", "country": "Cambodia", "iso2": "KH"},
    # South Asian donors
    "san": {"language": "Sanskrit", "country": "India", "iso2": "IN"},
    "hin": {"language": "Hindi", "country": "India", "iso2": "IN"},
    "tam": {"language": "Tamil", "country": "India", "iso2": "IN"},
    # "mal" is the ISO 639-2 code for Malayalam, a language of southern India.
    "mal": {"language": "Malayalam", "country": "India", "iso2": "IN"},
    "urd": {"language": "Urdu", "country": "Pakistan", "iso2": "PK"},
    "ben": {"language": "Bengali", "country": "Bangladesh", "iso2": "BD"},
    # Middle Eastern / Central Asian donors
    "ara": {"language": "Arabic", "country": "Saudi Arabia", "iso2": "SA"},
    "per": {"language": "Persian", "country": "Iran", "iso2": "IR"},
    "tur": {"language": "Turkish", "country": "Turkey", "iso2": "TR"},
    # African donors
    "swa": {"language": "Swahili", "country": "Tanzania", "iso2": "TZ"},
    "amh": {"language": "Amharic", "country": "Ethiopia", "iso2": "ET"},
    # Indigenous language of Japan
    "ain": {"language": "Ainu", "country": "Japan", "iso2": "JP"},
    # Americas donors
    "grn": {"language": "Guaraní", "country": "Paraguay", "iso2": "PY"},
    "que": {"language": "Quechua", "country": "Peru", "iso2": "PE"},
    "nah": {"language": "Nahuatl", "country": "Mexico", "iso2": "MX"},
    # Classical / ecclesiastical
    "lat": {"language": "Latin", "country": "Italy", "iso2": "IT"},
    "heb": {"language": "Hebrew", "country": "Israel", "iso2": "IL"},
    # Oceanian donors
    "mao": {"language": "Māori", "country": "New Zealand", "iso2": "NZ"},
    # Catch-all placeholder for an unidentified donor language; "XX" is a
    # placeholder, not a real country code.
    "unknown": {"language": "Unknown", "country": "Unknown", "iso2": "XX"},
}
# Katakana Unicode block U+30A0–U+30FF. The range already contains the
# middle dot (U+30FB), the prolonged sound mark (U+30FC) and the iteration
# marks (U+30FD, U+30FE), so they need no separate listing.
KATAKANA_RE = re.compile(r"^[\u30A0-\u30FF]+$")


def is_katakana(text: str) -> bool:
    """
    Return True when *text* is non-empty and consists solely of characters
    from the Katakana Unicode block (U+30A0–U+30FF).

    ``fullmatch`` is used instead of ``match`` because ``$`` also matches just
    before a trailing newline, which would let "コーヒー\\n" through.
    """
    return KATAKANA_RE.fullmatch(text) is not None
def parse_jmdict(jmdict_path: Path) -> dict[str, dict]:
    """
    Parse JMdict XML and return loanwords grouped by ISO 639-2 donor language code.

    An entry contributes words when it has at least one pure-katakana form
    (kanji elements are preferred; readings are the fallback) and at least one
    sense carrying an <lsource> element. Every <lsource> of such a sense is
    honoured, so a word borrowed from several languages is listed under each.
    A (katakana, language) pair is recorded once; the first meaning seen wins.

    Args:
        jmdict_path: Path to the JMdict XML file.

    Returns:
        {
            "eng": { ...meta..., "words": [{"katakana": ..., "meaning": ...}, ...] },
            ...
        }
        Codes missing from LANGUAGE_META get the raw code as language and
        country name and "XX" as iso2.
    """
    xml_lang = "{http://www.w3.org/XML/1998/namespace}lang"
    result: dict[str, dict] = {}
    seen: set[tuple[str, str]] = set()  # (katakana, lang_code) dedup

    context = etree.iterparse(str(jmdict_path), events=("end",), tag="entry")
    for _, entry in context:
        # Prefer kanji elements (keb) that are pure katakana ...
        katakana_forms: list[str] = [
            keb.text
            for keb in entry.findall("k_ele/keb")
            if keb.text and is_katakana(keb.text)
        ]
        # ... and fall back to reading elements (reb).
        if not katakana_forms:
            katakana_forms = [
                reb.text
                for reb in entry.findall("r_ele/reb")
                if reb.text and is_katakana(reb.text)
            ]

        if katakana_forms:
            for sense in entry.findall("sense"):
                lsources = sense.findall("lsource")
                if not lsources:
                    # Only this sense is not a loan sense; later senses of the
                    # same entry may still carry an <lsource>.
                    continue

                # Keep English glosses only; JMdict convention is that an
                # absent xml:lang means English.
                glosses = [
                    gloss.text.strip()
                    for gloss in sense.findall("gloss")
                    if gloss.text and gloss.get(xml_lang, "eng") == "eng"
                ]
                meaning = "; ".join(glosses)

                for lsource in lsources:
                    # JMdict convention: absent xml:lang defaults to English.
                    lang_code = lsource.get(xml_lang, "eng")

                    if lang_code not in result:
                        meta = LANGUAGE_META.get(
                            lang_code,
                            {
                                "language": lang_code,
                                "country": lang_code,
                                "iso2": "XX",
                            },
                        )
                        result[lang_code] = {
                            "language": meta["language"],
                            "country": meta["country"],
                            "iso2": meta["iso2"],
                            "words": [],
                        }

                    for kana in katakana_forms:
                        key = (kana, lang_code)
                        if key not in seen:
                            seen.add(key)
                            result[lang_code]["words"].append(
                                {"katakana": kana, "meaning": meaning}
                            )

        # Release the processed entry. clear() empties the element but leaves
        # it attached to its parent, so already-handled siblings are detached
        # as well to keep memory flat while streaming the file.
        entry.clear()
        parent = entry.getparent()
        if parent is not None:
            while entry.getprevious() is not None:
                del parent[0]

    return result
def main():
    """
    Command-line entry point.

    Reads --jmdict, --out and --min-words, parses the dictionary, drops
    languages with fewer than --min-words words, sorts each word list by
    katakana, prints a short summary and writes the result as UTF-8 JSON.

    Raises:
        FileNotFoundError: if the --jmdict path does not exist.
    """
    cli = argparse.ArgumentParser(description="Export JMdict gairaigo to JSON")
    cli.add_argument(
        "--jmdict",
        default="data/JMdict",
        help="Path to the JMdict XML file (default: data/JMdict)",
    )
    cli.add_argument(
        "--out",
        default="data/gairaigo_full.json",
        help="Output JSON path (default: data/gairaigo_full.json)",
    )
    cli.add_argument(
        "--min-words",
        type=int,
        default=1,
        help="Minimum word count to include a language (default: 1)",
    )
    opts = cli.parse_args()

    source = Path(opts.jmdict)
    if not source.exists():
        raise FileNotFoundError(
            f"JMdict file not found at '{source}'. "
            "Download it from https://www.edrdg.org/wiki/index.php/JMdict-EDICT_Dictionary_Project "
            "and place it under data/JMdict."
        )

    print(f"Parsing {source} ...")

    # Keep only languages that reach the threshold, ordering each surviving
    # word list by its katakana spelling.
    grouped = {}
    for code, group in parse_jmdict(source).items():
        if len(group["words"]) < opts.min_words:
            continue
        group["words"].sort(key=lambda word: word["katakana"])
        grouped[code] = group

    # Terminal summary
    sizes = {code: len(group["words"]) for code, group in grouped.items()}
    print(f"\n✓ Languages found : {len(grouped)}")
    print(f"✓ Total entries : {sum(sizes.values())}")
    print("\nTop 10 by word count:")
    ranked = sorted(grouped, key=sizes.__getitem__, reverse=True)
    for code in ranked[:10]:
        print(f" {code:6s} {grouped[code]['language']:20s} {sizes[code]:>5} words")

    # Write the export, creating the target directory when needed.
    destination = Path(opts.out)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8") as handle:
        json.dump(grouped, handle, ensure_ascii=False, indent=2)
    print(f"\n✓ Saved → {destination}")
# Run the exporter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()