Spaces:
Sleeping
Sleeping
File size: 9,213 Bytes
98a8573 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 | """
Parses the full JMdict XML and exports all gairaigo (外来語) entries
grouped by donor language, with country metadata for the world map.
Output: gairaigo_full.json
{
"eng": {
"language": "English",
"country": "United Kingdom",
"iso2": "GB",
"words": [
{ "katakana": "コーヒー", "meaning": "coffee" },
...
]
},
...
}
Usage:
python export_gairaigo.py --jmdict data/JMdict --out data/gairaigo_full.json
"""
import argparse
import json
import re
from lxml import etree
from pathlib import Path
# ISO 639-2 donor-language code -> display metadata for the world map.
#
# Keys are the three-letter codes JMdict puts in <lsource xml:lang="...">;
# they follow the ISO 639-2 *bibliographic* variants ("fre", "ger", "dut",
# "chi", ...), not the terminological ones ("fra", "deu", ...).
# Each value holds:
#   "language" - human-readable language name
#   "country"  - the single representative country chosen for that language
#   "iso2"     - ISO 3166-1 alpha-2 code of that country, used by the
#                D3 world map (TopoJSON)
LANGUAGE_META: dict[str, dict] = {
    # Core European donors
    "eng": {"language": "English", "country": "United Kingdom", "iso2": "GB"},
    "fre": {"language": "French", "country": "France", "iso2": "FR"},
    "ger": {"language": "German", "country": "Germany", "iso2": "DE"},
    "por": {"language": "Portuguese", "country": "Portugal", "iso2": "PT"},
    "spa": {"language": "Spanish", "country": "Spain", "iso2": "ES"},
    "ita": {"language": "Italian", "country": "Italy", "iso2": "IT"},
    "dut": {"language": "Dutch", "country": "Netherlands", "iso2": "NL"},
    "rus": {"language": "Russian", "country": "Russia", "iso2": "RU"},
    "swe": {"language": "Swedish", "country": "Sweden", "iso2": "SE"},
    "nor": {"language": "Norwegian", "country": "Norway", "iso2": "NO"},
    "dan": {"language": "Danish", "country": "Denmark", "iso2": "DK"},
    "fin": {"language": "Finnish", "country": "Finland", "iso2": "FI"},
    "gre": {"language": "Greek", "country": "Greece", "iso2": "GR"},
    "pol": {"language": "Polish", "country": "Poland", "iso2": "PL"},
    "cze": {"language": "Czech", "country": "Czech Republic", "iso2": "CZ"},
    "hun": {"language": "Hungarian", "country": "Hungary", "iso2": "HU"},
    "rum": {"language": "Romanian", "country": "Romania", "iso2": "RO"},
    # Asian donors
    "chi": {"language": "Chinese", "country": "China", "iso2": "CN"},
    "kor": {"language": "Korean", "country": "South Korea", "iso2": "KR"},
    "ain": {"language": "Ainu", "country": "Japan", "iso2": "JP"},
    "vie": {"language": "Vietnamese", "country": "Vietnam", "iso2": "VN"},
    # ISO 639-2 "may" is Malay; "mal" is Malayalam (see South Asian donors).
    "may": {"language": "Malay", "country": "Malaysia", "iso2": "MY"},
    "tgl": {"language": "Tagalog", "country": "Philippines", "iso2": "PH"},
    "ind": {"language": "Indonesian", "country": "Indonesia", "iso2": "ID"},
    "tha": {"language": "Thai", "country": "Thailand", "iso2": "TH"},
    "bur": {"language": "Burmese", "country": "Myanmar", "iso2": "MM"},
    "khm": {"language": "Khmer", "country": "Cambodia", "iso2": "KH"},
    # South Asian donors
    "san": {"language": "Sanskrit", "country": "India", "iso2": "IN"},
    "hin": {"language": "Hindi", "country": "India", "iso2": "IN"},
    "tam": {"language": "Tamil", "country": "India", "iso2": "IN"},
    "mal": {"language": "Malayalam", "country": "India", "iso2": "IN"},
    "urd": {"language": "Urdu", "country": "Pakistan", "iso2": "PK"},
    "ben": {"language": "Bengali", "country": "Bangladesh", "iso2": "BD"},
    # Middle Eastern / Central Asian donors
    "ara": {"language": "Arabic", "country": "Saudi Arabia", "iso2": "SA"},
    "per": {"language": "Persian", "country": "Iran", "iso2": "IR"},
    "tur": {"language": "Turkish", "country": "Turkey", "iso2": "TR"},
    # African donors
    "swa": {"language": "Swahili", "country": "Tanzania", "iso2": "TZ"},
    "amh": {"language": "Amharic", "country": "Ethiopia", "iso2": "ET"},
    # Americas donors
    "grn": {"language": "Guaraní", "country": "Paraguay", "iso2": "PY"},
    "que": {"language": "Quechua", "country": "Peru", "iso2": "PE"},
    "nah": {"language": "Nahuatl", "country": "Mexico", "iso2": "MX"},
    # Classical / ecclesiastical
    "lat": {"language": "Latin", "country": "Italy", "iso2": "IT"},
    "heb": {"language": "Hebrew", "country": "Israel", "iso2": "IL"},
    # Oceanian donors
    "mao": {"language": "Māori", "country": "New Zealand", "iso2": "NZ"},
    # Placeholder for an unidentified donor language. "XX" is a user-assigned
    # ISO 3166-1 code, so it never collides with a real country on the map.
    "unknown": {"language": "Unknown", "country": "Unknown", "iso2": "XX"},
}
# Katakana Unicode block U+30A0–U+30FF. The block already contains the
# middle dot (U+30FB), the prolonged sound mark (U+30FC) and the iteration
# marks (U+30FD, U+30FE), so no extra code points need to be listed.
# "\Z" is used instead of "$" because "$" also matches just before a
# trailing newline, which would let "カ\n" through.
KATAKANA_RE = re.compile(r"^[\u30A0-\u30FF]+\Z")


def is_katakana(text: str) -> bool:
    """Return True when *text* is non-empty and made only of katakana-block characters.

    Args:
        text: The string to test.

    Returns:
        True if every character lies in U+30A0–U+30FF and the string is not
        empty; False otherwise (including for strings with a trailing newline).
    """
    return KATAKANA_RE.fullmatch(text) is not None
# Clark-notation name under which lxml exposes the xml:lang attribute.
_XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang"


def _katakana_forms(entry) -> list[str]:
    """Return the entry's pure-katakana spellings, preferring <keb> over <reb>."""
    for path in ("k_ele/keb", "r_ele/reb"):
        forms = [
            node.text
            for node in entry.findall(path)
            if node.text and is_katakana(node.text)
        ]
        if forms:
            return forms
    return []


def _english_meaning(sense) -> str:
    """Join the sense's English glosses with "; " (empty string when there are none)."""
    glosses = [
        gloss.text.strip()
        for gloss in sense.findall("gloss")
        # A gloss without xml:lang is English; skip glosses in other languages.
        if gloss.text and gloss.text.strip() and gloss.get(_XML_LANG, "eng") == "eng"
    ]
    return "; ".join(glosses)


def _release(entry) -> None:
    """Free a processed <entry> and the already-processed siblings before it."""
    entry.clear()
    parent = entry.getparent()
    if parent is None:
        return
    # clear() empties the element but leaves it attached to the root; without
    # deleting those stubs the tree keeps growing for the whole parse.
    while entry.getprevious() is not None:
        del parent[0]


def parse_jmdict(jmdict_path: Path) -> dict[str, dict]:
    """Parse JMdict XML and group katakana loanwords by donor language.

    The file is streamed one <entry> at a time. An entry contributes words
    only if it has at least one pure-katakana spelling and at least one
    <sense> carrying an <lsource> element. Every <lsource> of every sense is
    honoured, so a word borrowed from several languages is listed under each.

    Args:
        jmdict_path: Path to the JMdict XML file.

    Returns:
        A dict keyed by the donor-language code found in ``xml:lang`` of
        <lsource> ("eng" when the attribute is absent)::

            {
                "eng": {
                    "language": ..., "country": ..., "iso2": ...,
                    "words": [{"katakana": ..., "meaning": ...}, ...],
                },
                ...
            }

        Codes missing from ``LANGUAGE_META`` use the code itself as language
        and country name, with "XX" as the country code. Each
        (katakana, language) pair appears once; the meaning is taken from the
        first sense in which that pair was seen.
    """
    result: dict[str, dict] = {}
    seen: set[tuple[str, str]] = set()  # (katakana, lang_code) dedup

    context = etree.iterparse(str(jmdict_path), events=("end",), tag="entry")
    for _, entry in context:
        katakana_forms = _katakana_forms(entry)
        # Entries without a katakana spelling have no senses worth scanning.
        senses = entry.findall("sense") if katakana_forms else []

        for sense in senses:
            sources = sense.findall("lsource")
            if not sources:
                # Only this sense is native; a later sense may still be a loan.
                continue

            meaning = _english_meaning(sense)
            for lsource in sources:
                # JMdict convention: absent xml:lang defaults to English
                lang_code = lsource.get(_XML_LANG, "eng")

                if lang_code not in result:
                    meta = LANGUAGE_META.get(
                        lang_code,
                        {"language": lang_code, "country": lang_code, "iso2": "XX"},
                    )
                    result[lang_code] = {
                        "language": meta["language"],
                        "country": meta["country"],
                        "iso2": meta["iso2"],
                        "words": [],
                    }

                words = result[lang_code]["words"]
                for kana in katakana_forms:
                    key = (kana, lang_code)
                    if key in seen:
                        continue
                    seen.add(key)
                    words.append({"katakana": kana, "meaning": meaning})

        _release(entry)

    return result
def main():
    """Command-line entry point: parse JMdict, report a summary, write the JSON export.

    Reads ``--jmdict``, ``--out`` and ``--min-words`` from the command line.

    Raises:
        FileNotFoundError: If the JMdict path given by ``--jmdict`` does not exist.
    """
    cli = argparse.ArgumentParser(description="Export JMdict gairaigo to JSON")
    cli.add_argument(
        "--jmdict",
        default="data/JMdict",
        help="Path to the JMdict XML file (default: data/JMdict)",
    )
    cli.add_argument(
        "--out",
        default="data/gairaigo_full.json",
        help="Output JSON path (default: data/gairaigo_full.json)",
    )
    cli.add_argument(
        "--min-words",
        type=int,
        default=1,
        help="Minimum word count to include a language (default: 1)",
    )
    options = cli.parse_args()

    source = Path(options.jmdict)
    if not source.exists():
        raise FileNotFoundError(
            f"JMdict file not found at '{source}'. "
            "Download it from https://www.edrdg.org/wiki/index.php/JMdict-EDICT_Dictionary_Project "
            "and place it under data/JMdict."
        )

    print(f"Parsing {source} ...")
    parsed = parse_jmdict(source)

    # Keep only languages that reach the word-count threshold, and order each
    # surviving word list by its katakana spelling.
    data = {}
    for code, bucket in parsed.items():
        if len(bucket["words"]) < options.min_words:
            continue
        bucket["words"].sort(key=lambda word: word["katakana"])
        data[code] = bucket

    # Terminal summary
    word_counts = {code: len(bucket["words"]) for code, bucket in data.items()}
    print(f"\n✓ Languages found : {len(data)}")
    print(f"✓ Total entries : {sum(word_counts.values())}")
    print("\nTop 10 by word count:")
    # Negated count gives a descending, stable order (ties keep dict order).
    ranking = sorted(data, key=lambda code: -word_counts[code])
    for code in ranking[:10]:
        print(f" {code:6s} {data[code]['language']:20s} {word_counts[code]:>5} words")

    # Write output, creating the destination directory when needed
    destination = Path(options.out)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8") as handle:
        json.dump(data, handle, ensure_ascii=False, indent=2)
    print(f"\n✓ Saved → {destination}")


if __name__ == "__main__":
    main()
|