|
|
use serde::Deserialize; |
|
|
use std::collections::HashMap; |
|
|
use std::io::{self, BufRead, Write}; |
|
|
use std::time::Instant; |
|
|
use unicode_normalization::UnicodeNormalization; |
|
|
|
|
|
|
|
|
|
|
|
#[derive(Deserialize)] |
|
|
struct MappingData { |
|
|
province_mapping: HashMap<String, String>, |
|
|
province_names: HashMap<String, ProvinceInfo>, |
|
|
old_province_names: HashMap<String, ProvinceInfo>, |
|
|
ward_mapping: Vec<WardRecord>, |
|
|
} |
|
|
|
|
|
#[derive(Deserialize, Clone)] |
|
|
struct ProvinceInfo { |
|
|
name: String, |
|
|
short: String, |
|
|
#[allow(dead_code)] |
|
|
code: String, |
|
|
} |
|
|
|
|
|
#[derive(Deserialize, Clone)] |
|
|
struct WardRecord { |
|
|
#[allow(dead_code)] |
|
|
old_province: String, |
|
|
old_province_key: String, |
|
|
#[allow(dead_code)] |
|
|
old_district: String, |
|
|
old_district_key: String, |
|
|
#[allow(dead_code)] |
|
|
old_ward: String, |
|
|
old_ward_key: String, |
|
|
#[allow(dead_code)] |
|
|
new_province: String, |
|
|
#[allow(dead_code)] |
|
|
new_province_key: String, |
|
|
new_ward: String, |
|
|
#[allow(dead_code)] |
|
|
new_ward_key: String, |
|
|
#[allow(dead_code)] |
|
|
mapping_type: String, |
|
|
#[serde(default)] |
|
|
is_default: bool, |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
struct Index { |
|
|
province_mapping: HashMap<String, String>, |
|
|
province_names: HashMap<String, ProvinceInfo>, |
|
|
province_keywords: HashMap<String, String>, |
|
|
exact: HashMap<(String, String, String), Vec<usize>>, |
|
|
ward_only: HashMap<(String, String), Vec<usize>>, |
|
|
records: Vec<WardRecord>, |
|
|
} |
|
|
|
|
|
fn build_index(data: MappingData) -> Index { |
|
|
let mut province_keywords: HashMap<String, String> = HashMap::new(); |
|
|
|
|
|
for (key, info) in &data.old_province_names { |
|
|
province_keywords.insert(normalize_key(&info.name), key.clone()); |
|
|
province_keywords.insert(normalize_key(&info.short), key.clone()); |
|
|
province_keywords.insert(key.clone(), key.clone()); |
|
|
} |
|
|
|
|
|
let mut exact: HashMap<(String, String, String), Vec<usize>> = HashMap::new(); |
|
|
let mut ward_only: HashMap<(String, String), Vec<usize>> = HashMap::new(); |
|
|
|
|
|
for (i, rec) in data.ward_mapping.iter().enumerate() { |
|
|
let ek = ( |
|
|
rec.old_province_key.clone(), |
|
|
rec.old_district_key.clone(), |
|
|
rec.old_ward_key.clone(), |
|
|
); |
|
|
exact.entry(ek).or_default().push(i); |
|
|
|
|
|
let wk = (rec.old_province_key.clone(), rec.old_ward_key.clone()); |
|
|
ward_only.entry(wk).or_default().push(i); |
|
|
} |
|
|
|
|
|
Index { |
|
|
province_mapping: data.province_mapping, |
|
|
province_names: data.province_names, |
|
|
province_keywords, |
|
|
exact, |
|
|
ward_only, |
|
|
records: data.ward_mapping, |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
fn remove_diacritics(text: &str) -> String { |
|
|
let nfkd: String = text.nfkd().collect(); |
|
|
let mut result = String::with_capacity(nfkd.len()); |
|
|
for c in nfkd.chars() { |
|
|
if c == 'Δ' { |
|
|
result.push('d'); |
|
|
} else if c == 'Δ' { |
|
|
result.push('D'); |
|
|
} else if !unicode_normalization::char::is_combining_mark(c) { |
|
|
result.push(c); |
|
|
} |
|
|
} |
|
|
result |
|
|
} |
|
|
|
|
|
fn normalize_key(text: &str) -> String { |
|
|
let lower = text.to_lowercase(); |
|
|
let lower = lower.trim(); |
|
|
let no_dia = remove_diacritics(lower); |
|
|
no_dia |
|
|
.chars() |
|
|
.filter(|c| c.is_ascii_alphanumeric()) |
|
|
.collect() |
|
|
} |
|
|
|
|
|
|
|
|
/// Abbreviation -> expansion table for Vietnamese administrative unit names.
///
/// Entries are applied in order by plain substring replacement, so longer
/// spellings must come before the shorter ones they contain ("t.p." before
/// "p.", "tx." before "x.", ...). Every expansion ends with a space so the
/// following name stays separated from it.
const ABBREVIATIONS: &[(&str, &str)] = &[
    ("t.p.", "thành phố "),
    ("t.p ", "thành phố "),
    ("t.x.", "thị xã "),
    ("t.t.", "thị trấn "),
    ("tp.", "thành phố "),
    ("tp ", "thành phố "),
    ("tx.", "thị xã "),
    ("tt.", "thị trấn "),
    ("p.", "phường "),
    ("q.", "quận "),
    ("h.", "huyện "),
    ("x.", "xã "),
];

/// Lowercases and trims `text`, then expands every abbreviation listed in
/// `ABBREVIATIONS`.
///
/// Matching is by substring, not by word, and an expansion may leave a double
/// space where the abbreviation was already followed by one. The result is
/// trimmed again because an abbreviation at the very end leaves a trailing
/// space behind.
fn expand_abbreviations(text: &str) -> String {
    let lowered = text.to_lowercase();
    let mut result = lowered.trim().to_string();
    for &(abbr, full) in ABBREVIATIONS {
        // `replace` always allocates a new string; skip it when there is
        // nothing to replace.
        if result.contains(abbr) {
            result = result.replace(abbr, full);
        }
    }
    result.trim().to_string()
}
|
|
|
|
|
fn normalize_for_matching(text: &str) -> String { |
|
|
let expanded = expand_abbreviations(text); |
|
|
normalize_key(&expanded) |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/// Lowercase words that introduce a ward-level unit name.
/// `parse_address` uses them to tell a ward from a district in a two-part address.
const WARD_PREFIXES: &[&str] = &["phường", "xã", "thị trấn"];
|
|
|
|
|
/// The components of an address as split out by `parse_address`.
///
/// A component that is absent from the input is an empty string.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct AdminUnit {
    /// Last comma-separated part of the address.
    province: String,
    /// Second-to-last part, unless it was recognised as a ward.
    district: String,
    /// Third-to-last part, or the second-to-last one when it carries a ward prefix.
    ward: String,
    /// Everything before the ward, re-joined with ", ".
    street: String,
}

impl AdminUnit {
    /// Returns a unit whose components are all empty.
    fn empty() -> Self {
        Self::default()
    }
}
|
|
|
|
|
fn parse_address(address: &str) -> AdminUnit { |
|
|
let expanded = expand_abbreviations(address); |
|
|
let parts: Vec<&str> = expanded |
|
|
.split(',') |
|
|
.map(|s| s.trim()) |
|
|
.filter(|s| !s.is_empty()) |
|
|
.collect(); |
|
|
|
|
|
if parts.is_empty() { |
|
|
return AdminUnit::empty(); |
|
|
} |
|
|
|
|
|
let mut unit = AdminUnit::empty(); |
|
|
|
|
|
let n = parts.len(); |
|
|
if n >= 1 { |
|
|
unit.province = parts[n - 1].to_string(); |
|
|
} |
|
|
if n >= 2 { |
|
|
unit.district = parts[n - 2].to_string(); |
|
|
} |
|
|
if n >= 3 { |
|
|
unit.ward = parts[n - 3].to_string(); |
|
|
} |
|
|
if n >= 4 { |
|
|
unit.street = parts[..n - 3].join(", "); |
|
|
} |
|
|
|
|
|
|
|
|
if n == 2 { |
|
|
let lower = parts[0].to_lowercase(); |
|
|
let lower = lower.trim(); |
|
|
for prefix in WARD_PREFIXES { |
|
|
if lower.starts_with(prefix) { |
|
|
unit.ward = unit.district.clone(); |
|
|
unit.district = String::new(); |
|
|
break; |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
unit |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
fn resolve_province(index: &Index, text: &str) -> Option<String> { |
|
|
let normalized = normalize_for_matching(text); |
|
|
index.province_keywords.get(&normalized).cloned() |
|
|
} |
|
|
|
|
|
fn find_mapping(index: &Index, prov: &str, dist: &str, ward: &str) -> Vec<usize> { |
|
|
|
|
|
let ek = (prov.to_string(), dist.to_string(), ward.to_string()); |
|
|
if let Some(ids) = index.exact.get(&ek) { |
|
|
if !ids.is_empty() { |
|
|
return ids.clone(); |
|
|
} |
|
|
} |
|
|
|
|
|
let wk = (prov.to_string(), ward.to_string()); |
|
|
if let Some(ids) = index.ward_only.get(&wk) { |
|
|
return ids.clone(); |
|
|
} |
|
|
Vec::new() |
|
|
} |
|
|
|
|
|
fn select_best(index: &Index, ids: &[usize]) -> Option<usize> { |
|
|
if ids.is_empty() { |
|
|
return None; |
|
|
} |
|
|
if ids.len() == 1 { |
|
|
return Some(ids[0]); |
|
|
} |
|
|
for &id in ids { |
|
|
if index.records[id].is_default { |
|
|
return Some(id); |
|
|
} |
|
|
} |
|
|
Some(ids[0]) |
|
|
} |
|
|
|
|
|
fn convert_address(index: &Index, address: &str) -> String { |
|
|
let parsed = parse_address(address); |
|
|
|
|
|
|
|
|
let mut old_prov_key = resolve_province(index, &parsed.province); |
|
|
if old_prov_key.is_none() && !parsed.district.is_empty() { |
|
|
old_prov_key = resolve_province(index, &parsed.district); |
|
|
} |
|
|
let old_prov_key = match old_prov_key { |
|
|
Some(k) => k, |
|
|
None => return String::new(), |
|
|
}; |
|
|
|
|
|
|
|
|
let new_prov_key = match index.province_mapping.get(&old_prov_key) { |
|
|
Some(k) => k.clone(), |
|
|
None => return String::new(), |
|
|
}; |
|
|
let new_prov_name = index |
|
|
.province_names |
|
|
.get(&new_prov_key) |
|
|
.map(|i| i.name.as_str()) |
|
|
.unwrap_or(""); |
|
|
|
|
|
|
|
|
if parsed.ward.is_empty() && parsed.district.is_empty() { |
|
|
return new_prov_name.to_string(); |
|
|
} |
|
|
|
|
|
let old_dist_key = if !parsed.district.is_empty() { |
|
|
normalize_key(&parsed.district) |
|
|
} else { |
|
|
String::new() |
|
|
}; |
|
|
let old_ward_key = if !parsed.ward.is_empty() { |
|
|
normalize_key(&parsed.ward) |
|
|
} else { |
|
|
String::new() |
|
|
}; |
|
|
|
|
|
let mut ids = find_mapping(index, &old_prov_key, &old_dist_key, &old_ward_key); |
|
|
|
|
|
|
|
|
if ids.is_empty() && !parsed.ward.is_empty() { |
|
|
let alt_ward = if !parsed.district.is_empty() { |
|
|
normalize_key(&parsed.district) |
|
|
} else { |
|
|
String::new() |
|
|
}; |
|
|
if !alt_ward.is_empty() { |
|
|
ids = find_mapping(index, &old_prov_key, "", &alt_ward); |
|
|
} |
|
|
} |
|
|
|
|
|
if ids.is_empty() { |
|
|
|
|
|
let mut parts: Vec<&str> = Vec::new(); |
|
|
if !parsed.street.is_empty() { |
|
|
parts.push(&parsed.street); |
|
|
} |
|
|
parts.push(new_prov_name); |
|
|
return parts.join(", "); |
|
|
} |
|
|
|
|
|
let best_id = select_best(index, &ids).unwrap(); |
|
|
let rec = &index.records[best_id]; |
|
|
|
|
|
let mut parts: Vec<&str> = Vec::new(); |
|
|
if !parsed.street.is_empty() { |
|
|
parts.push(&parsed.street); |
|
|
} |
|
|
if !rec.new_ward.is_empty() { |
|
|
parts.push(&rec.new_ward); |
|
|
} |
|
|
parts.push(new_prov_name); |
|
|
parts.join(", ") |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
fn load_index(data_path: &str) -> Index { |
|
|
let data_bytes = std::fs::read(data_path).expect("Failed to read mapping.json"); |
|
|
let data: MappingData = serde_json::from_slice(&data_bytes).expect("Failed to parse JSON"); |
|
|
build_index(data) |
|
|
} |
|
|
|
|
|
fn main() { |
|
|
let args: Vec<String> = std::env::args().collect(); |
|
|
if args.len() < 2 { |
|
|
eprintln!("Usage: address-converter <convert|bench> [address]"); |
|
|
std::process::exit(1); |
|
|
} |
|
|
|
|
|
let data_path = std::env::var("MAPPING_JSON") |
|
|
.unwrap_or_else(|_| { |
|
|
|
|
|
let exe = std::env::current_exe().unwrap(); |
|
|
let project_root = exe |
|
|
.parent().unwrap() |
|
|
.parent().unwrap() |
|
|
.parent().unwrap(); |
|
|
project_root |
|
|
.parent().unwrap() |
|
|
.join("data") |
|
|
.join("mapping.json") |
|
|
.to_string_lossy() |
|
|
.to_string() |
|
|
}); |
|
|
|
|
|
match args[1].as_str() { |
|
|
"convert" => { |
|
|
if args.len() < 3 { |
|
|
eprintln!("Usage: address-converter convert <address>"); |
|
|
std::process::exit(1); |
|
|
} |
|
|
let index = load_index(&data_path); |
|
|
let result = convert_address(&index, &args[2]); |
|
|
println!("{}", result); |
|
|
} |
|
|
"bench" => { |
|
|
|
|
|
let index = load_index(&data_path); |
|
|
let stdin = io::stdin(); |
|
|
let addresses: Vec<String> = stdin |
|
|
.lock() |
|
|
.lines() |
|
|
.map(|l| l.expect("Failed to read line")) |
|
|
.collect(); |
|
|
|
|
|
let n = addresses.len(); |
|
|
let start = Instant::now(); |
|
|
let mut results: Vec<String> = Vec::with_capacity(n); |
|
|
for addr in &addresses { |
|
|
results.push(convert_address(&index, addr)); |
|
|
} |
|
|
let elapsed = start.elapsed(); |
|
|
|
|
|
|
|
|
let stdout = io::stdout(); |
|
|
let mut out = stdout.lock(); |
|
|
for r in &results { |
|
|
writeln!(out, "{}", r).unwrap(); |
|
|
} |
|
|
|
|
|
|
|
|
eprintln!( |
|
|
"BENCH: {} addresses in {:.6} s ({:.3} us/addr)", |
|
|
n, |
|
|
elapsed.as_secs_f64(), |
|
|
elapsed.as_secs_f64() * 1_000_000.0 / n.max(1) as f64 |
|
|
); |
|
|
} |
|
|
_ => { |
|
|
eprintln!("Unknown command: {}. Use 'convert' or 'bench'.", args[1]); |
|
|
std::process::exit(1); |
|
|
} |
|
|
} |
|
|
} |
|
|
|