# address/scripts/build_mapping.py
# Commit efd7cfc (rain1024): Add Vietnamese address converter for
# post-merger admin units (01/07/2025)
"""
Script to extract mapping data from vietnamadminunits package
and generate data/mapping.json for standalone use.
Usage:
uv run python scripts/build_mapping.py
"""
import json
from pathlib import Path
def _load_json(path):
    """Parse one JSON file, always decoding it as UTF-8."""
    # The output file is written as UTF-8 with ensure_ascii=False, so the
    # inputs are read the same way instead of relying on the platform's
    # default encoding (which is not UTF-8 on every system).
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _source_version(default="1.0.4"):
    """Return the installed vietnamadminunits version, or *default*.

    Falls back to *default* when no installed distribution named
    ``vietnamadminunits`` can be found.
    """
    from importlib import metadata

    try:
        return metadata.version("vietnamadminunits")
    except metadata.PackageNotFoundError:
        return default


def _province_entries(provinces):
    """Reduce parser province records to ``{key: {name, short, code}}``."""
    return {
        key: {
            "name": info["province"],
            "short": info["provinceShort"],
            "code": info["provinceCode"],
        }
        for key, info in provinces.items()
    }


def _ward_entry(info):
    """Reduce one parser ward record to ``{name, short, type, code}``."""
    return {
        "name": info["ward"],
        "short": info["wardShort"],
        "type": info["wardType"],
        "code": info["wardCode"],
    }


def _describe_old_unit(compound_key, old_province_names, old_district_names,
                       old_ward_names):
    """Resolve a legacy compound key into the ``old_*`` record fields.

    The key is split on the first two underscores into province, district
    and (optionally) ward parts.

    Returns:
        ``None`` when the key contains no underscore; otherwise a tuple
        ``(fields, old_ward_name)`` where ``fields`` holds the six ``old_*``
        record entries in output order and ``old_ward_name`` is the legacy
        ward's display name, or ``None`` when the ward is unknown.
    """
    parts = compound_key.split("_", 2)
    if len(parts) < 2:
        return None
    prov_key, dist_key = parts[0], parts[1]
    ward_key = parts[2] if len(parts) == 3 else ""
    # Legacy wards are indexed by the full compound key, not the ward part.
    ward_info = old_ward_names.get(prov_key, {}).get(compound_key, {})
    dist_info = old_district_names.get(prov_key, {}).get(dist_key, {})
    fields = {
        "old_province": old_province_names.get(prov_key, {}).get("name", ""),
        "old_province_key": prov_key,
        "old_district": dist_info.get("name", ""),
        "old_district_key": dist_key,
        "old_ward": ward_info.get("name", ""),
        "old_ward_key": ward_key,
    }
    return fields, ward_info.get("name")


def build_mapping(data_dir=None, output=None):
    """Build the old-to-new administrative unit mapping and write it as JSON.

    Reads ``converter_2025.json``, ``parser_legacy.json`` and
    ``parser_from_2025.json``, derives a province mapping plus one record per
    (old ward, new ward) pair, writes the result to *output* and prints a
    short summary.

    Args:
        data_dir: Directory holding the three source JSON files. Defaults to
            the ``data`` directory of the installed ``vietnamadminunits``
            package (the package is only imported in that case).
        output: Destination path of the generated JSON. Defaults to
            ``data/mapping.json`` one level above this script's directory.

    Returns:
        The mapping dictionary that was written to *output*.

    Raises:
        KeyError: If a required section or field is missing from the source
            data.
    """
    from collections import Counter

    if data_dir is None:
        import vietnamadminunits

        data_dir = Path(vietnamadminunits.__file__).parent / "data"
    data_dir = Path(data_dir)

    # Load source data
    converter = _load_json(data_dir / "converter_2025.json")
    legacy = _load_json(data_dir / "parser_legacy.json")
    new_parser = _load_json(data_dir / "parser_from_2025.json")

    # === Province mapping: old_key -> new_key ===
    # converter DICT_PROVINCE: {new_key: [old_key1, old_key2, ...]}
    province_mapping = {
        old_key: new_key
        for new_key, old_keys in converter["DICT_PROVINCE"].items()
        for old_key in old_keys
    }

    # === Province info: key -> display name ===
    province_names = _province_entries(new_parser["DICT_PROVINCE"])
    old_province_names = _province_entries(legacy["DICT_PROVINCE"])

    # === New ward info: province_key -> ward_key -> display name ===
    new_ward_names = {
        prov_key: {
            ward_key: _ward_entry(info) for ward_key, info in wards.items()
        }
        for prov_key, wards
        in new_parser["DICT_PROVINCE_WARD_NO_ACCENTED"].items()
    }

    # === Old ward info: province_key -> compound key -> display name ===
    old_ward_names = {}
    legacy_wards = legacy["DICT_PROVINCE_DISTRICT_WARD_NO_ACCENTED"]
    for prov_key, districts in legacy_wards.items():
        by_compound_key = old_ward_names[prov_key] = {}
        for dist_key, wards in districts.items():
            for ward_key, info in wards.items():
                compound_key = f"{prov_key}_{dist_key}_{ward_key}"
                by_compound_key[compound_key] = _ward_entry(info)

    # === Old district info ===
    old_district_names = {
        prov_key: {
            dist_key: {
                "name": info.get("district", ""),
                "short": info.get("districtShort", ""),
                "type": info.get("districtType", ""),
            }
            for dist_key, info in districts.items()
        }
        for prov_key, districts
        in legacy.get("DICT_PROVINCE_DISTRICT", {}).items()
    }

    # === Ward mapping records ===
    ward_mapping = []

    # NO_DIVIDED: each new ward maps to one or more old wards
    # (unchanged or renamed/merged)
    no_divided = converter["DICT_PROVINCE_WARD_NO_DIVIDED"]
    for new_prov_key, wards in no_divided.items():
        new_prov_info = province_names.get(new_prov_key, {})
        new_wards = new_ward_names.get(new_prov_key, {})
        for new_ward_key, old_compound_keys in wards.items():
            new_ward_info = new_wards.get(new_ward_key, {})
            # Several old wards feeding one new ward is a merge.
            is_merge = len(old_compound_keys) != 1
            for old_compound_key in old_compound_keys:
                described = _describe_old_unit(
                    old_compound_key,
                    old_province_names,
                    old_district_names,
                    old_ward_names,
                )
                if described is None:
                    continue
                old_fields, old_ward_name = described
                if is_merge:
                    mapping_type = "merged"
                # NOTE(review): when neither ward can be resolved both names
                # are None and the record is classified "unchanged" --
                # confirm this is intended.
                elif old_ward_name == new_ward_info.get("name"):
                    mapping_type = "unchanged"
                else:
                    mapping_type = "renamed"
                ward_mapping.append({
                    **old_fields,
                    "new_province": new_prov_info.get("name", ""),
                    "new_province_key": new_prov_key,
                    "new_ward": new_ward_info.get("name", ""),
                    "new_ward_key": new_ward_key,
                    "mapping_type": mapping_type,
                })

    # DIVIDED: old wards split into multiple new wards
    divided = converter["DICT_PROVINCE_WARD_DIVIDED"]
    for new_prov_key, old_wards in divided.items():
        new_prov_info = province_names.get(new_prov_key, {})
        new_wards = new_ward_names.get(new_prov_key, {})
        for old_compound_key, new_ward_options in old_wards.items():
            described = _describe_old_unit(
                old_compound_key,
                old_province_names,
                old_district_names,
                old_ward_names,
            )
            if described is None:
                continue
            old_fields, _ = described
            for option in new_ward_options:
                new_ward_key = option["newWardKey"]
                new_ward_info = new_wards.get(new_ward_key, {})
                ward_mapping.append({
                    **old_fields,
                    "new_province": new_prov_info.get("name", ""),
                    "new_province_key": new_prov_key,
                    "new_ward": new_ward_info.get("name", ""),
                    "new_ward_key": new_ward_key,
                    "mapping_type": "divided",
                    "is_default": option.get("isDefaultNewWard", False),
                })

    # Build final mapping
    mapping = {
        "metadata": {
            "source": "vietnamadminunits",
            # Taken from the installed package so the label cannot go stale;
            # "1.0.4" is only the fallback.
            "version": _source_version(),
            "effective_date": "2025-07-01",
            "old_provinces": len(old_province_names),
            "new_provinces": len(province_names),
            "total_records": len(ward_mapping),
        },
        "province_mapping": province_mapping,
        "province_names": province_names,
        "old_province_names": old_province_names,
        "ward_mapping": ward_mapping,
    }

    if output is None:
        output = Path(__file__).parent.parent / "data" / "mapping.json"
    output = Path(output)
    output.parent.mkdir(parents=True, exist_ok=True)
    with open(output, "w", encoding="utf-8") as f:
        json.dump(mapping, f, ensure_ascii=False, indent=2)

    print(f"Generated {output}")
    print(f" Province mappings: {len(province_mapping)} old -> {len(province_names)} new")
    print(f" Ward mapping records: {len(ward_mapping)}")

    # Stats
    type_counts = Counter(r["mapping_type"] for r in ward_mapping)
    for t, c in sorted(type_counts.items()):
        print(f" {t}: {c}")

    return mapping
if __name__ == "__main__":
    # Generate the mapping only when run as a script, not when imported.
    build_mapping()