|
|
"""Vietnamese text normalization for address matching. |
|
|
|
|
|
Uses underthesea character normalization (NFC + character map) to fix |
|
|
encoding issues, then applies address-specific transformations |
|
|
(abbreviation expansion, diacritics removal, key generation). |
|
|
""" |
|
|
|
|
|
import re |
|
|
import unicodedata |
|
|
|
|
|
from underthesea.pipeline.text_normalize.character_normalize import ( |
|
|
normalize_characters_in_text, |
|
|
) |
|
|
|
|
|
|
|
|
# Address-unit abbreviations mapped to their full Vietnamese form.
# Keys are lowercase and include their trailing "." or space; every
# expansion ends with a space. Entries are grouped by expansion.
ABBREVIATIONS = {
    abbreviation: expansion
    for expansion, abbreviations in (
        ("thành phố ", ("tp.", "tp ", "t.p.", "t.p ")),
        ("phường ", ("p.",)),
        ("quận ", ("q.",)),
        ("huyện ", ("h.",)),
        ("thị xã ", ("tx.", "t.x.")),
        ("thị trấn ", ("tt.", "t.t.")),
        ("xã ", ("x.",)),
    )
    for abbreviation in abbreviations
}
|
|
|
|
|
|
|
|
def remove_diacritics(text: str) -> str:
    """Strip Vietnamese diacritics, leaving only base letters.

    The input is first run through underthesea's
    ``normalize_characters_in_text``. The result is decomposed with NFKD,
    every combining character is dropped, and ``đ``/``Đ`` are mapped
    explicitly to ``d``/``D``.
    """
    cleaned = normalize_characters_in_text(text)
    base_chars = [
        ch
        for ch in unicodedata.normalize("NFKD", cleaned)
        if unicodedata.combining(ch) == 0
    ]
    # The combining-mark filter above leaves "đ"/"Đ" in place, so map them here.
    return "".join(base_chars).translate(str.maketrans("đĐ", "dD"))
|
|
|
|
|
|
|
|
def normalize_key(text: str) -> str:
    """Build a lookup key from *text*.

    The key is the lowercased, stripped input with diacritics removed and
    every character other than ``a``-``z`` and ``0``-``9`` deleted.
    """
    without_marks = remove_diacritics(text.lower().strip())
    # Drops spaces and punctuation along with anything else non-alphanumeric.
    return re.sub(r"[^a-z0-9]", "", without_marks)
|
|
|
|
|
|
|
|
def expand_abbreviations(text: str) -> str:
    """Expand common Vietnamese address abbreviations.

    The text is lowercased and stripped, then each key of ``ABBREVIATIONS``
    is replaced by its expansion, but only where the key starts a token.
    A key directly preceded by a letter, digit, underscore or combining
    mark is left alone, so the final ``h.`` of ``"bình thạnh."`` is no
    longer rewritten to ``"huyện "``.

    Args:
        text: Raw address text.

    Returns:
        The lowercased text with abbreviations expanded and surrounding
        whitespace stripped.
    """
    result = text.lower().strip()
    if not result or not ABBREVIATIONS:
        return result

    # Longest keys first, so a longer abbreviation is preferred whenever one
    # key is a prefix of another.
    alternatives = "|".join(
        re.escape(abbr) for abbr in sorted(ABBREVIATIONS, key=len, reverse=True)
    )
    # The lookbehind rejects matches inside a word. U+0300-U+036F is included
    # because decomposed (NFD) input ends a letter with a combining mark,
    # which ``\w`` alone does not cover.
    pattern = r"(?<![\w\u0300-\u036f])(?:" + alternatives + ")"
    result = re.sub(pattern, lambda match: ABBREVIATIONS[match.group(0)], result)
    return result.strip()
|
|
|
|
|
|
|
|
def normalize_for_matching(text: str) -> str:
    """Run the full normalization pipeline used for fuzzy matching.

    Abbreviations are expanded first; the expanded text is then reduced to
    a lookup key by ``normalize_key``.
    """
    expanded = expand_abbreviations(text)
    key = normalize_key(expanded)
    return key
|
|
|