File size: 2,081 Bytes
1efa4be efd7cfc 1efa4be efd7cfc 1efa4be efd7cfc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
"""Vietnamese text normalization for address matching.
Uses underthesea character normalization (NFC + character map) to fix
encoding issues, then applies address-specific transformations
(abbreviation expansion, diacritics removal, key generation).
"""
import re
import unicodedata
from underthesea.pipeline.text_normalize.character_normalize import (
normalize_characters_in_text,
)
# Abbreviation expansions
# Maps lowercase Vietnamese address abbreviations (as they appear after
# text.lower()) to their full written-out forms.  Every expansion carries a
# trailing space so that glued forms like "tp.hcm" expand cleanly to
# "thành phố hcm".
# NOTE: expand_abbreviations() sorts these keys longest-first, so compound
# forms ("tp.", "t.p.") are tried before their shorter suffixes ("p.").
ABBREVIATIONS = {
    "tp.": "thành phố ",   # thành phố = city
    "tp ": "thành phố ",
    "t.p.": "thành phố ",
    "t.p ": "thành phố ",
    "p.": "phường ",       # phường = ward
    "q.": "quận ",         # quận = urban district
    "h.": "huyện ",        # huyện = rural district
    "tx.": "thị xã ",      # thị xã = town
    "t.x.": "thị xã ",
    "tt.": "thị trấn ",    # thị trấn = township
    "t.t.": "thị trấn ",
    "x.": "xã ",           # xã = commune
}
def remove_diacritics(text: str) -> str:
    """Strip Vietnamese diacritical marks, leaving plain base letters.

    Runs underthesea's character normalization (NFC plus a character map)
    first to repair common encoding variants, then decomposes with NFKD
    and drops every Unicode combining mark.
    """
    cleaned = normalize_characters_in_text(text)
    decomposed = unicodedata.normalize("NFKD", cleaned)
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    # đ/Đ have no canonical decomposition, so NFKD leaves them untouched;
    # map them to their ASCII counterparts explicitly.
    return "".join(base_chars).translate(str.maketrans("đĐ", "dD"))
def normalize_key(text: str) -> str:
    """Collapse text into a compact lookup key.

    Lowercases and trims the input, strips diacritics, then deletes every
    character that is not an ASCII letter or digit (spaces and punctuation
    included), yielding a key suitable for exact dictionary lookup.
    """
    folded = remove_diacritics(text.lower().strip())
    return re.sub(r"[^a-z0-9]", "", folded)
def expand_abbreviations(text: str) -> str:
    """Expand common Vietnamese address abbreviations to their full form.

    The input is lowercased and stripped first, so "TP.HCM" and "tp.hcm"
    behave identically.  An abbreviation is only expanded at a word start
    (start of string, or right after a non-word character): the original
    bare str.replace() also matched mid-word, e.g. the trailing "p." of
    "hiệp." was corrupted into "phường ".

    Returns the expanded text with surrounding whitespace stripped.
    """
    result = text.lower().strip()
    # Longest keys first so compound forms win over their own suffixes
    # ("tp." before "p.", "tx." before "x.", ...).
    for abbr, full in sorted(ABBREVIATIONS.items(), key=lambda kv: -len(kv[0])):
        # (?<!\w) — the match must not be glued to the tail of a word
        # (\w covers Vietnamese letters under Python's Unicode regex).
        result = re.sub(r"(?<!\w)" + re.escape(abbr), full, result)
    return result.strip()
def normalize_for_matching(text: str) -> str:
    """Full normalization pipeline for fuzzy matching.

    Expands address abbreviations first, then collapses the result into
    a diacritic-free alphanumeric key via normalize_key().
    """
    return normalize_key(expand_abbreviations(text))
|