"""Parse Vietnamese address strings into components."""

from .models import AdminUnit
from .normalizer import expand_abbreviations

# Prefixes that mark a component as ward-level.
WARD_PREFIXES = ("phường", "xã", "thị trấn")


def parse_address(address: str) -> AdminUnit:
    """Split a comma-separated Vietnamese address into an ``AdminUnit``.

    The address is first passed through ``expand_abbreviations``, then split
    on commas; blank components are discarded. Components are assigned from
    the right: the last one is the province, the one before it the district,
    the one before that the ward, and anything further left is re-joined
    with ", " as the street.

    With exactly two components, the first is treated as a ward rather than
    a district when it starts (case-insensitively) with one of
    ``WARD_PREFIXES``; the district is then set to the empty string.

    Args:
        address: Raw address text, e.g. "street, ward, district, province".

    Returns:
        An ``AdminUnit`` with the recognised fields set. Fields that have no
        matching component keep whatever ``AdminUnit()`` initialises them to.
    """
    stripped = (piece.strip() for piece in expand_abbreviations(address).split(","))
    segments = [piece for piece in stripped if piece]

    unit = AdminUnit()
    if not segments:
        return unit

    # Walk the components right-to-left, pairing each with its admin level;
    # zip stops at the shorter side, so short addresses fill fewer fields.
    levels = ("province", "district", "ward")
    for level, value in zip(levels, reversed(segments)):
        setattr(unit, level, value)

    # Whatever lies to the left of the ward makes up the street.
    if len(segments) > len(levels):
        unit.street = ", ".join(segments[:-3])

    # Two components are ambiguous: "ward, province" vs "district, province".
    # A ward-level prefix on the first component settles it as a ward.
    if len(segments) == 2 and segments[0].lower().startswith(WARD_PREFIXES):
        unit.ward = unit.district
        unit.district = ""

    return unit