use serde::Deserialize; use std::collections::HashMap; use std::io::{self, BufRead, Write}; use std::time::Instant; use unicode_normalization::UnicodeNormalization; // ── Data model ────────────────────────────────────────────────────────────── #[derive(Deserialize)] struct MappingData { province_mapping: HashMap, province_names: HashMap, old_province_names: HashMap, ward_mapping: Vec, } #[derive(Deserialize, Clone)] struct ProvinceInfo { name: String, short: String, #[allow(dead_code)] code: String, } #[derive(Deserialize, Clone)] struct WardRecord { #[allow(dead_code)] old_province: String, old_province_key: String, #[allow(dead_code)] old_district: String, old_district_key: String, #[allow(dead_code)] old_ward: String, old_ward_key: String, #[allow(dead_code)] new_province: String, #[allow(dead_code)] new_province_key: String, new_ward: String, #[allow(dead_code)] new_ward_key: String, #[allow(dead_code)] mapping_type: String, #[serde(default)] is_default: bool, } // ── Index ─────────────────────────────────────────────────────────────────── struct Index { province_mapping: HashMap, province_names: HashMap, province_keywords: HashMap, exact: HashMap<(String, String, String), Vec>, ward_only: HashMap<(String, String), Vec>, records: Vec, } fn build_index(data: MappingData) -> Index { let mut province_keywords: HashMap = HashMap::new(); for (key, info) in &data.old_province_names { province_keywords.insert(normalize_key(&info.name), key.clone()); province_keywords.insert(normalize_key(&info.short), key.clone()); province_keywords.insert(key.clone(), key.clone()); } let mut exact: HashMap<(String, String, String), Vec> = HashMap::new(); let mut ward_only: HashMap<(String, String), Vec> = HashMap::new(); for (i, rec) in data.ward_mapping.iter().enumerate() { let ek = ( rec.old_province_key.clone(), rec.old_district_key.clone(), rec.old_ward_key.clone(), ); exact.entry(ek).or_default().push(i); let wk = (rec.old_province_key.clone(), rec.old_ward_key.clone()); ward_only.entry(wk).or_default().push(i); } Index { province_mapping: data.province_mapping, province_names: data.province_names, province_keywords, exact, ward_only, records: data.ward_mapping, } } // ── Normalizer ────────────────────────────────────────────────────────────── fn remove_diacritics(text: &str) -> String { let nfkd: String = text.nfkd().collect(); let mut result = String::with_capacity(nfkd.len()); for c in nfkd.chars() { if c == 'đ' { result.push('d'); } else if c == 'Đ' { result.push('D'); } else if !unicode_normalization::char::is_combining_mark(c) { result.push(c); } } result } fn normalize_key(text: &str) -> String { let lower = text.to_lowercase(); let lower = lower.trim(); let no_dia = remove_diacritics(lower); no_dia .chars() .filter(|c| c.is_ascii_alphanumeric()) .collect() } // Abbreviation table (sorted longest-first to match Python behavior) const ABBREVIATIONS: &[(&str, &str)] = &[ ("t.p.", "thành phố "), ("t.p ", "thành phố "), ("t.x.", "thị xã "), ("t.t.", "thị trấn "), ("tp.", "thành phố "), ("tp ", "thành phố "), ("tx.", "thị xã "), ("tt.", "thị trấn "), ("p.", "phường "), ("q.", "quận "), ("h.", "huyện "), ("x.", "xã "), ]; fn expand_abbreviations(text: &str) -> String { let mut result = text.to_lowercase(); result = result.trim().to_string(); for &(abbr, full) in ABBREVIATIONS { result = result.replace(abbr, full); } result.trim().to_string() } fn normalize_for_matching(text: &str) -> String { let expanded = expand_abbreviations(text); normalize_key(&expanded) } // ── Parser ────────────────────────────────────────────────────────────────── const WARD_PREFIXES: &[&str] = &["phường", "xã", "thị trấn"]; struct AdminUnit { province: String, district: String, ward: String, street: String, } impl AdminUnit { fn empty() -> Self { AdminUnit { province: String::new(), district: String::new(), ward: String::new(), street: String::new(), } } } fn parse_address(address: &str) -> AdminUnit { let expanded = expand_abbreviations(address); let parts: Vec<&str> = expanded .split(',') .map(|s| s.trim()) .filter(|s| !s.is_empty()) .collect(); if parts.is_empty() { return AdminUnit::empty(); } let mut unit = AdminUnit::empty(); let n = parts.len(); if n >= 1 { unit.province = parts[n - 1].to_string(); } if n >= 2 { unit.district = parts[n - 2].to_string(); } if n >= 3 { unit.ward = parts[n - 3].to_string(); } if n >= 4 { unit.street = parts[..n - 3].join(", "); } // Handle 2-part addresses: could be "ward, province" if n == 2 { let lower = parts[0].to_lowercase(); let lower = lower.trim(); for prefix in WARD_PREFIXES { if lower.starts_with(prefix) { unit.ward = unit.district.clone(); unit.district = String::new(); break; } } } unit } // ── Converter ─────────────────────────────────────────────────────────────── fn resolve_province(index: &Index, text: &str) -> Option { let normalized = normalize_for_matching(text); index.province_keywords.get(&normalized).cloned() } fn find_mapping(index: &Index, prov: &str, dist: &str, ward: &str) -> Vec { // Tier 1: exact let ek = (prov.to_string(), dist.to_string(), ward.to_string()); if let Some(ids) = index.exact.get(&ek) { if !ids.is_empty() { return ids.clone(); } } // Tier 2: ward-only let wk = (prov.to_string(), ward.to_string()); if let Some(ids) = index.ward_only.get(&wk) { return ids.clone(); } Vec::new() } fn select_best(index: &Index, ids: &[usize]) -> Option { if ids.is_empty() { return None; } if ids.len() == 1 { return Some(ids[0]); } for &id in ids { if index.records[id].is_default { return Some(id); } } Some(ids[0]) } fn convert_address(index: &Index, address: &str) -> String { let parsed = parse_address(address); // Resolve province let mut old_prov_key = resolve_province(index, &parsed.province); if old_prov_key.is_none() && !parsed.district.is_empty() { old_prov_key = resolve_province(index, &parsed.district); } let old_prov_key = match old_prov_key { Some(k) => k, None => return String::new(), }; // Get new province let new_prov_key = match index.province_mapping.get(&old_prov_key) { Some(k) => k.clone(), None => return String::new(), }; let new_prov_name = index .province_names .get(&new_prov_key) .map(|i| i.name.as_str()) .unwrap_or(""); // Province-only if parsed.ward.is_empty() && parsed.district.is_empty() { return new_prov_name.to_string(); } let old_dist_key = if !parsed.district.is_empty() { normalize_key(&parsed.district) } else { String::new() }; let old_ward_key = if !parsed.ward.is_empty() { normalize_key(&parsed.ward) } else { String::new() }; let mut ids = find_mapping(index, &old_prov_key, &old_dist_key, &old_ward_key); // Fallback: try ward in district field (for 2-part "ward, province") if ids.is_empty() && !parsed.ward.is_empty() { let alt_ward = if !parsed.district.is_empty() { normalize_key(&parsed.district) } else { String::new() }; if !alt_ward.is_empty() { ids = find_mapping(index, &old_prov_key, "", &alt_ward); } } if ids.is_empty() { // Ward not found, return province + street let mut parts: Vec<&str> = Vec::new(); if !parsed.street.is_empty() { parts.push(&parsed.street); } parts.push(new_prov_name); return parts.join(", "); } let best_id = select_best(index, &ids).unwrap(); let rec = &index.records[best_id]; let mut parts: Vec<&str> = Vec::new(); if !parsed.street.is_empty() { parts.push(&parsed.street); } if !rec.new_ward.is_empty() { parts.push(&rec.new_ward); } parts.push(new_prov_name); parts.join(", ") } // ── Main ──────────────────────────────────────────────────────────────────── fn load_index(data_path: &str) -> Index { let data_bytes = std::fs::read(data_path).expect("Failed to read mapping.json"); let data: MappingData = serde_json::from_slice(&data_bytes).expect("Failed to parse JSON"); build_index(data) } fn main() { let args: Vec = std::env::args().collect(); if args.len() < 2 { eprintln!("Usage: address-converter [address]"); std::process::exit(1); } let data_path = std::env::var("MAPPING_JSON") .unwrap_or_else(|_| { // Default: ../data/mapping.json relative to executable let exe = std::env::current_exe().unwrap(); let project_root = exe .parent().unwrap() // target/release .parent().unwrap() // target .parent().unwrap(); // rust/ project_root .parent().unwrap() // address/ .join("data") .join("mapping.json") .to_string_lossy() .to_string() }); match args[1].as_str() { "convert" => { if args.len() < 3 { eprintln!("Usage: address-converter convert
"); std::process::exit(1); } let index = load_index(&data_path); let result = convert_address(&index, &args[2]); println!("{}", result); } "bench" => { // Read addresses from stdin, one per line let index = load_index(&data_path); let stdin = io::stdin(); let addresses: Vec = stdin .lock() .lines() .map(|l| l.expect("Failed to read line")) .collect(); let n = addresses.len(); let start = Instant::now(); let mut results: Vec = Vec::with_capacity(n); for addr in &addresses { results.push(convert_address(&index, addr)); } let elapsed = start.elapsed(); // Print results to stdout let stdout = io::stdout(); let mut out = stdout.lock(); for r in &results { writeln!(out, "{}", r).unwrap(); } // Print timing to stderr eprintln!( "BENCH: {} addresses in {:.6} s ({:.3} us/addr)", n, elapsed.as_secs_f64(), elapsed.as_secs_f64() * 1_000_000.0 / n.max(1) as f64 ); } _ => { eprintln!("Unknown command: {}. Use 'convert' or 'bench'.", args[1]); std::process::exit(1); } } }