address / src /parser.py
rain1024's picture
Add Rust address converter and use underthesea for normalization
1efa4be
"""Parse Vietnamese address strings into components."""
from .models import AdminUnit
from .normalizer import expand_abbreviations
# Name prefixes that identify a ward-level component.
# They are matched against lower-cased text, so keep every entry lower-case.
WARD_PREFIXES = (
    "phường",
    "xã",
    "thị trấn",
)
def parse_address(address: str) -> AdminUnit:
    """Parse a Vietnamese address string into AdminUnit components.

    The input is passed through ``expand_abbreviations``, split on commas,
    and empty pieces are discarded. Components are then assigned
    right-to-left; the rightmost piece is always the province.

    Expected format: "street, ward, district, province".

    The district may be omitted ("ward, province" or
    "street, ward, province"). When the piece directly before the province
    starts with one of ``WARD_PREFIXES`` (case-insensitive), it is stored as
    the ward, the district is set to "", and everything before it becomes
    the street.

    Args:
        address: Comma-separated address string.

    Returns:
        An AdminUnit with ``province``, ``district``, ``ward`` and ``street``
        assigned where a matching piece exists. Attributes that are not
        assigned keep whatever ``AdminUnit()`` initialises them to. An input
        with no non-empty pieces yields a fresh ``AdminUnit()``.
    """
    expanded = expand_abbreviations(address)

    # Pieces are stripped here once, so no further stripping is needed below.
    parts = [piece.strip() for piece in expanded.split(",") if piece.strip()]

    unit = AdminUnit()
    if not parts:
        return unit

    unit.province = parts[-1]
    if len(parts) == 1:
        return unit

    before_province = parts[-2]
    if before_province.lower().startswith(WARD_PREFIXES):
        # No district was given: a piece carrying a ward-level prefix sits
        # directly before the province. Treat it as the ward so the street
        # is not shifted into the ward slot and the ward into the district.
        unit.ward = before_province
        unit.district = ""
        if len(parts) >= 3:
            unit.street = ", ".join(parts[:-2])
        return unit

    # Full form: "..., ward, district, province".
    unit.district = before_province
    if len(parts) >= 3:
        unit.ward = parts[-3]
    if len(parts) >= 4:
        # Everything before the ward is the street; it may itself contain commas.
        unit.street = ", ".join(parts[:-3])
    return unit