address / src /normalizer.py
rain1024's picture
Add Rust address converter and use underthesea for normalization
1efa4be
"""Vietnamese text normalization for address matching.
Uses underthesea character normalization (NFC + character map) to fix
encoding issues, then applies address-specific transformations
(abbreviation expansion, diacritics removal, key generation).
"""
import re
import unicodedata
from underthesea.pipeline.text_normalize.character_normalize import (
normalize_characters_in_text,
)
# Lowercase address-unit abbreviations mapped to their spelled-out form.
# Every expansion carries a trailing space so the following name stays
# separated from it. Insertion order is kept stable on purpose: consumers that
# sort by key length fall back to this order for equal-length keys.
ABBREVIATIONS = {
    # City ("thành phố")
    **dict.fromkeys(("tp.", "tp ", "t.p.", "t.p "), "thành phố "),
    # Ward / urban district / rural district
    "p.": "phường ",
    "q.": "quận ",
    "h.": "huyện ",
    # Town ("thị xã") and township ("thị trấn")
    **dict.fromkeys(("tx.", "t.x."), "thị xã "),
    **dict.fromkeys(("tt.", "t.t."), "thị trấn "),
    # Commune
    "x.": "xã ",
}
def remove_diacritics(text: str) -> str:
    """Return *text* with Vietnamese tone and vowel marks stripped.

    The text is first passed through underthesea's
    ``normalize_characters_in_text`` (described by this module as NFC plus a
    character map) to repair encoding variants. It is then decomposed with
    NFKD and every combining character is dropped, leaving the base letters.

    Args:
        text: Text that may contain accented Vietnamese characters.

    Returns:
        The text with combining marks removed and ``đ``/``Đ`` mapped to
        ``d``/``D``. Letter case is otherwise left as it was.
    """
    cleaned = normalize_characters_in_text(text)
    base_chars = [
        ch
        for ch in unicodedata.normalize("NFKD", cleaned)
        if unicodedata.combining(ch) == 0
    ]
    # The stroke in đ/Đ is part of the letter itself, not a combining mark,
    # so NFKD leaves it intact and it has to be mapped by hand.
    return "".join(base_chars).replace("đ", "d").replace("Đ", "D")
def normalize_key(text: str) -> str:
    """Collapse *text* into a compact lookup key.

    The key is lowercase, free of diacritics, and contains only the
    characters ``a-z`` and ``0-9``; spaces and punctuation are removed.

    Args:
        text: Arbitrary address text.

    Returns:
        The normalized key, possibly empty when nothing alphanumeric remains.
    """
    ascii_text = remove_diacritics(text.lower().strip())
    # Whatever is not a lowercase ASCII letter or digit is discarded.
    non_key_chars = re.compile(r"[^a-z0-9]")
    return non_key_chars.sub("", ascii_text)
def expand_abbreviations(text: str) -> str:
    """Expand common Vietnamese address abbreviations.

    The input is lowercased and stripped, then each key of ``ABBREVIATIONS``
    is replaced by its expansion. Longer keys are handled first so that, for
    example, ``"tp."`` is consumed before the shorter ``"p."`` can match
    inside it.

    An abbreviation is expanded only when it starts a token, i.e. when it is
    not immediately preceded by a letter, digit, underscore or combining
    diacritic. Plain substring replacement would otherwise rewrite the tail
    of ordinary words: ``"hồ chí minh."`` would turn into
    ``"hồ chí minhuyện"`` and ``"kp.3"`` into ``"kphường 3"``.

    Args:
        text: Raw address text.

    Returns:
        The lowercased text with abbreviations expanded and leading/trailing
        whitespace removed.
    """
    # Negative lookbehind: no word character (``\w`` is Unicode-aware for str
    # patterns) and no combining mark U+0300-U+036F, so decomposed input such
    # as "e" + combining marks + "p." is still treated as the end of a word.
    token_start = r"(?<![\w\u0300-\u036f])"

    result = text.lower().strip()
    # Longest first; the sort is stable, so equal-length keys keep dict order.
    for abbr in sorted(ABBREVIATIONS, key=len, reverse=True):
        full = ABBREVIATIONS[abbr]
        # A callable replacement inserts the expansion verbatim, without any
        # backslash or group-reference processing.
        result = re.sub(
            token_start + re.escape(abbr),
            lambda _match, full=full: full,
            result,
        )
    return result.strip()
def normalize_for_matching(text: str) -> str:
    """Run the complete normalization pipeline used for fuzzy matching.

    Abbreviations are expanded first, and the expanded text is then reduced
    to a lookup key by ``normalize_key``.

    Args:
        text: Raw address text.

    Returns:
        The lookup key derived from the abbreviation-expanded text.
    """
    return normalize_key(expand_abbreviations(text))