// address/rust/src/main.rs
// rain1024's picture
// Add Rust address converter and use underthesea for normalization
// 1efa4be
use serde::Deserialize;
use std::collections::HashMap;
use std::io::{self, BufRead, Write};
use std::time::Instant;
use unicode_normalization::UnicodeNormalization;
// ── Data model ──────────────────────────────────────────────────────────────
/// Deserialized contents of the mapping JSON file; consumed by `build_index`.
#[derive(Deserialize)]
struct MappingData {
    /// Old province key -> new province key.
    province_mapping: HashMap<String, String>,
    /// Display info for new provinces, looked up by new province key.
    province_names: HashMap<String, ProvinceInfo>,
    /// Display info for old provinces, keyed by old province key; the source
    /// of the province keyword table.
    old_province_names: HashMap<String, ProvinceInfo>,
    /// Ward mapping records; their positions in this vector are the ids
    /// stored in the lookup tables.
    ward_mapping: Vec<WardRecord>,
}
/// Display information for one province.
#[derive(Deserialize, Clone)]
struct ProvinceInfo {
    /// Full name: emitted as the province part of converted addresses (new
    /// provinces) and indexed as a lookup keyword (old provinces).
    name: String,
    /// Short name: indexed as an additional lookup keyword for old provinces.
    short: String,
    /// Province code; deserialized but not read anywhere in this file.
    #[allow(dead_code)]
    code: String,
}
/// One ward mapping record from the mapping file.
///
/// Only the `*_key` fields of the old unit, `new_ward` and `is_default` are
/// read by this file; the remaining fields are deserialized but unused.
#[derive(Deserialize, Clone)]
struct WardRecord {
    /// Old province display name (unused here).
    #[allow(dead_code)]
    old_province: String,
    /// Old province key; first component of both lookup keys.
    old_province_key: String,
    /// Old district display name (unused here).
    #[allow(dead_code)]
    old_district: String,
    /// Old district key; middle component of the exact lookup key.
    old_district_key: String,
    /// Old ward display name (unused here).
    #[allow(dead_code)]
    old_ward: String,
    /// Old ward key; last component of both lookup keys.
    old_ward_key: String,
    /// New province display name (unused here).
    #[allow(dead_code)]
    new_province: String,
    /// New province key (unused here).
    #[allow(dead_code)]
    new_province_key: String,
    /// New ward text emitted in the converted address (skipped when empty).
    new_ward: String,
    /// New ward key (unused here).
    #[allow(dead_code)]
    new_ward_key: String,
    /// Mapping type label (unused here).
    #[allow(dead_code)]
    mapping_type: String,
    /// Marks the record preferred when several records match the same key.
    /// Falls back to `false` when the field is absent from the JSON.
    #[serde(default)]
    is_default: bool,
}
// ── Index ───────────────────────────────────────────────────────────────────
/// In-memory lookup tables built once from `MappingData` by `build_index`.
struct Index {
    /// Old province key -> new province key.
    province_mapping: HashMap<String, String>,
    /// Display info for new provinces, looked up by new province key.
    province_names: HashMap<String, ProvinceInfo>,
    /// Normalized keyword (full name, short name or the key itself) -> old
    /// province key.
    province_keywords: HashMap<String, String>,
    /// (old province key, old district key, old ward key) -> indices into
    /// `records`.
    exact: HashMap<(String, String, String), Vec<usize>>,
    /// (old province key, old ward key) -> indices into `records`.
    ward_only: HashMap<(String, String), Vec<usize>>,
    /// All ward mapping records, addressed by the indices stored above.
    records: Vec<WardRecord>,
}
fn build_index(data: MappingData) -> Index {
let mut province_keywords: HashMap<String, String> = HashMap::new();
for (key, info) in &data.old_province_names {
province_keywords.insert(normalize_key(&info.name), key.clone());
province_keywords.insert(normalize_key(&info.short), key.clone());
province_keywords.insert(key.clone(), key.clone());
}
let mut exact: HashMap<(String, String, String), Vec<usize>> = HashMap::new();
let mut ward_only: HashMap<(String, String), Vec<usize>> = HashMap::new();
for (i, rec) in data.ward_mapping.iter().enumerate() {
let ek = (
rec.old_province_key.clone(),
rec.old_district_key.clone(),
rec.old_ward_key.clone(),
);
exact.entry(ek).or_default().push(i);
let wk = (rec.old_province_key.clone(), rec.old_ward_key.clone());
ward_only.entry(wk).or_default().push(i);
}
Index {
province_mapping: data.province_mapping,
province_names: data.province_names,
province_keywords,
exact,
ward_only,
records: data.ward_mapping,
}
}
// ── Normalizer ──────────────────────────────────────────────────────────────
fn remove_diacritics(text: &str) -> String {
let nfkd: String = text.nfkd().collect();
let mut result = String::with_capacity(nfkd.len());
for c in nfkd.chars() {
if c == 'Δ‘' {
result.push('d');
} else if c == 'Đ' {
result.push('D');
} else if !unicode_normalization::char::is_combining_mark(c) {
result.push(c);
}
}
result
}
/// Reduces `text` to a lookup key: lowercased, diacritics removed, and every
/// character that is not an ASCII letter or digit discarded (so spaces and
/// punctuation vanish as well).
fn normalize_key(text: &str) -> String {
    let lowered = text.to_lowercase();
    let mut key = remove_diacritics(lowered.trim());
    key.retain(|c| c.is_ascii_alphanumeric());
    key
}
// Abbreviation table (sorted longest-first to match Python behavior).
// Each entry is (abbreviation, expansion). Entries are applied in order as
// plain substring replacements, so longer abbreviations must come before the
// shorter ones they contain (e.g. "t.p." before "p."). Expansions keep a
// trailing space so the following word stays separated.
const ABBREVIATIONS: &[(&str, &str)] = &[
    ("t.p.", "thành phố "),
    ("t.p ", "thành phố "),
    ("t.x.", "thị xã "),
    ("t.t.", "thị trấn "),
    ("tp.", "thành phố "),
    ("tp ", "thành phố "),
    ("tx.", "thị xã "),
    ("tt.", "thị trấn "),
    ("p.", "phường "),
    ("q.", "quận "),
    ("h.", "huyện "),
    ("x.", "xã "),
];
/// Lowercases and trims `text`, then applies every entry of `ABBREVIATIONS`
/// in table order as a plain substring replacement (no word-boundary check),
/// and trims the result again.
fn expand_abbreviations(text: &str) -> String {
    let lowered = text.to_lowercase();
    let expanded = ABBREVIATIONS
        .iter()
        .fold(lowered.trim().to_string(), |acc, &(abbr, full)| {
            acc.replace(abbr, full)
        });
    expanded.trim().to_string()
}
/// Expands abbreviations in `text` and reduces the result to a lookup key.
fn normalize_for_matching(text: &str) -> String {
    normalize_key(&expand_abbreviations(text))
}
// ── Parser ──────────────────────────────────────────────────────────────────
/// Lowercase prefixes that mark an address component as a ward-level unit.
const WARD_PREFIXES: &[&str] = &["phường", "xã", "thị trấn"];
/// Components of a parsed address. A component that is not present in the
/// input is an empty string.
#[derive(Default)]
struct AdminUnit {
    /// Province-level component (last comma-separated part).
    province: String,
    /// District-level component.
    district: String,
    /// Ward-level component.
    ward: String,
    /// Everything left of the ward, re-joined with ", ".
    street: String,
}

impl AdminUnit {
    /// Returns a unit with every component empty (same as `Default`).
    fn empty() -> Self {
        Self::default()
    }
}
/// Splits a comma-separated address into its administrative components.
///
/// Abbreviations are expanded (which also lowercases the text) first, and
/// empty parts are discarded. Components are then read right to left:
/// province, district, ward; anything further left is re-joined with ", " as
/// the street. A two-part address whose first part starts with one of
/// `WARD_PREFIXES` is read as "ward, province" rather than
/// "district, province".
fn parse_address(address: &str) -> AdminUnit {
    let expanded = expand_abbreviations(address);
    let parts: Vec<&str> = expanded
        .split(',')
        .map(str::trim)
        .filter(|part| !part.is_empty())
        .collect();

    let mut unit = AdminUnit::empty();
    match parts.as_slice() {
        [] => {}
        [province] => unit.province = (*province).to_string(),
        [first, province] => {
            unit.province = (*province).to_string();
            let head = first.to_lowercase();
            let head = head.trim();
            if WARD_PREFIXES.iter().any(|&prefix| head.starts_with(prefix)) {
                unit.ward = (*first).to_string();
            } else {
                unit.district = (*first).to_string();
            }
        }
        [street @ .., ward, district, province] => {
            unit.province = (*province).to_string();
            unit.district = (*district).to_string();
            unit.ward = (*ward).to_string();
            // Joining an empty slice yields "", so three-part input has no street.
            unit.street = street.join(", ");
        }
    }
    unit
}
// ── Converter ───────────────────────────────────────────────────────────────
/// Looks up the old province key whose keyword equals the normalized form of
/// `text`; `None` when no keyword matches.
fn resolve_province(index: &Index, text: &str) -> Option<String> {
    index
        .province_keywords
        .get(normalize_for_matching(text).as_str())
        .cloned()
}
/// Returns the record indices matching the given normalized keys.
///
/// Tier 1 is the exact (province, district, ward) lookup; when it yields
/// nothing, tier 2 ignores the district and looks up (province, ward).
/// An empty vector means neither tier matched.
fn find_mapping(index: &Index, prov: &str, dist: &str, ward: &str) -> Vec<usize> {
    let exact_key = (prov.to_string(), dist.to_string(), ward.to_string());
    index
        .exact
        .get(&exact_key)
        .filter(|ids| !ids.is_empty())
        .or_else(|| {
            let ward_key = (prov.to_string(), ward.to_string());
            index.ward_only.get(&ward_key)
        })
        .cloned()
        .unwrap_or_default()
}
/// Picks one record index out of `ids`.
///
/// Returns `None` for an empty slice and the sole element for a
/// single-element slice. Otherwise the first index whose record has
/// `is_default` set wins, falling back to the first index.
fn select_best(index: &Index, ids: &[usize]) -> Option<usize> {
    match ids {
        [] => None,
        [only] => Some(*only),
        [first, ..] => ids
            .iter()
            .copied()
            .find(|&id| index.records[id].is_default)
            .or(Some(*first)),
    }
}
/// Converts an old-style address into its new form.
///
/// Steps:
/// 1. Parse the address and resolve the old province from the province
///    component, or from the district component if that fails.
/// 2. Map the old province key to the new province; an unknown province at
///    either step yields an empty string.
/// 3. With neither ward nor district present, return just the new province
///    name.
/// 4. Otherwise look up the ward mapping and return
///    "street, new ward, new province", leaving out empty street/ward parts.
///    When no ward record matches, the result is "street, new province".
fn convert_address(index: &Index, address: &str) -> String {
    let parsed = parse_address(address);

    // Resolve the old province, retrying with the district component.
    let resolved = resolve_province(index, &parsed.province).or_else(|| {
        if parsed.district.is_empty() {
            None
        } else {
            resolve_province(index, &parsed.district)
        }
    });
    let old_prov_key = match resolved {
        Some(key) => key,
        None => return String::new(),
    };

    // Map to the new province and fetch its display name ("" if unknown).
    let new_prov_key = match index.province_mapping.get(&old_prov_key) {
        Some(key) => key,
        None => return String::new(),
    };
    let new_prov_name = index
        .province_names
        .get(new_prov_key)
        .map_or("", |info| info.name.as_str());

    // Province-only input.
    if parsed.district.is_empty() && parsed.ward.is_empty() {
        return new_prov_name.to_string();
    }

    // normalize_key("") is "", so empty components need no special casing.
    let old_dist_key = normalize_key(&parsed.district);
    let old_ward_key = normalize_key(&parsed.ward);

    let mut ids = find_mapping(index, &old_prov_key, &old_dist_key, &old_ward_key);

    // Fallback: retry using the district text as the ward key.
    // NOTE(review): the original comment says this targets 2-part
    // "ward, province" input, but the parser never yields a 2-part address
    // with both ward and district set, so the guard below cannot fire for it.
    // Left as-is; confirm against the Python reference implementation.
    if ids.is_empty() && !parsed.ward.is_empty() && !old_dist_key.is_empty() {
        ids = find_mapping(index, &old_prov_key, "", &old_dist_key);
    }

    // Assemble "street, ward, province", skipping missing pieces.
    let mut parts: Vec<&str> = Vec::with_capacity(3);
    if !parsed.street.is_empty() {
        parts.push(&parsed.street);
    }
    if let Some(best_id) = select_best(index, &ids) {
        let new_ward = index.records[best_id].new_ward.as_str();
        if !new_ward.is_empty() {
            parts.push(new_ward);
        }
    }
    parts.push(new_prov_name);
    parts.join(", ")
}
// ── Main ────────────────────────────────────────────────────────────────────
/// Reads the mapping JSON at `data_path` and builds the lookup index.
///
/// # Panics
///
/// Panics when the file cannot be read or is not valid mapping JSON. The
/// message names the path that was tried, since it may come from the
/// `MAPPING_JSON` environment variable rather than the default location.
fn load_index(data_path: &str) -> Index {
    let data_bytes = std::fs::read(data_path)
        .unwrap_or_else(|err| panic!("Failed to read mapping file {}: {}", data_path, err));
    let data: MappingData = serde_json::from_slice(&data_bytes)
        .unwrap_or_else(|err| panic!("Failed to parse JSON in {}: {}", data_path, err));
    build_index(data)
}
/// Computes the default mapping file location for the executable at `exe`.
///
/// The executable is expected at `<address>/rust/target/<profile>/<binary>`,
/// which puts the data file at `<address>/data/mapping.json`. Returns `None`
/// when `exe` is not nested at least four directories deep.
fn default_mapping_path(exe: &std::path::Path) -> Option<String> {
    // ancestors(): 0 = exe, 1 = target/<profile>, 2 = target, 3 = rust/, 4 = address/
    let address_root = exe.ancestors().nth(4)?;
    Some(
        address_root
            .join("data")
            .join("mapping.json")
            .to_string_lossy()
            .into_owned(),
    )
}

/// Command-line entry point.
///
/// * `convert <address>` prints the converted address.
/// * `bench` reads one address per line from stdin, converts them all while
///   timing only the conversion loop, prints the results to stdout and the
///   timing summary to stderr.
///
/// The mapping file is taken from the `MAPPING_JSON` environment variable,
/// or else from `data/mapping.json` located relative to the executable.
fn main() {
    let args: Vec<String> = std::env::args().collect();
    if args.len() < 2 {
        eprintln!("Usage: address-converter <convert|bench> [address]");
        std::process::exit(1);
    }

    let data_path = std::env::var("MAPPING_JSON").unwrap_or_else(|_| {
        let located = std::env::current_exe()
            .ok()
            .and_then(|exe| default_mapping_path(&exe));
        match located {
            Some(path) => path,
            None => {
                // Exit cleanly instead of panicking on a shallow install path.
                eprintln!(
                    "Cannot locate the default mapping.json relative to the executable; \
                     set MAPPING_JSON."
                );
                std::process::exit(1);
            }
        }
    });

    match args[1].as_str() {
        "convert" => {
            if args.len() < 3 {
                eprintln!("Usage: address-converter convert <address>");
                std::process::exit(1);
            }
            let index = load_index(&data_path);
            let result = convert_address(&index, &args[2]);
            println!("{}", result);
        }
        "bench" => {
            // Read addresses from stdin, one per line.
            let index = load_index(&data_path);
            let stdin = io::stdin();
            let addresses: Vec<String> = stdin
                .lock()
                .lines()
                .map(|l| l.expect("Failed to read line"))
                .collect();
            let n = addresses.len();

            // Only the conversion loop is timed; I/O stays outside.
            let start = Instant::now();
            let results: Vec<String> = addresses
                .iter()
                .map(|addr| convert_address(&index, addr))
                .collect();
            let elapsed = start.elapsed();

            // Results go to stdout through a buffer: the bare stdout handle
            // is line-buffered and would flush once per result.
            let stdout = io::stdout();
            let mut out = io::BufWriter::new(stdout.lock());
            for r in &results {
                writeln!(out, "{}", r).expect("Failed to write result to stdout");
            }
            // Flush before reporting so results precede the timing line.
            out.flush().expect("Failed to flush stdout");

            // Timing goes to stderr; max(1) avoids dividing by zero on empty input.
            eprintln!(
                "BENCH: {} addresses in {:.6} s ({:.3} us/addr)",
                n,
                elapsed.as_secs_f64(),
                elapsed.as_secs_f64() * 1_000_000.0 / n.max(1) as f64
            );
        }
        _ => {
            eprintln!("Unknown command: {}. Use 'convert' or 'bench'.", args[1]);
            std::process::exit(1);
        }
    }
}