File size: 2,081 Bytes
1efa4be
 
 
 
 
 
efd7cfc
 
 
 
1efa4be
 
 
 
efd7cfc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1efa4be
 
 
 
 
 
efd7cfc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
"""Vietnamese text normalization for address matching.

Uses underthesea character normalization (NFC + character map) to fix
encoding issues, then applies address-specific transformations
(abbreviation expansion, diacritics removal, key generation).
"""

import re
import unicodedata

from underthesea.pipeline.text_normalize.character_normalize import (
    normalize_characters_in_text,
)

# Abbreviation expansions for Vietnamese administrative-unit prefixes.
#
# Keys are lowercase; expand_abbreviations() lowercases its input before
# looking for them. Every expansion ends with a space, so text written
# directly after the abbreviation (e.g. "q.1") stays separated from the
# expanded word ("quận 1").
ABBREVIATIONS = {
    # "thành phố" (city). The "tp " / "t.p " keys end with a space instead of
    # a dot to cover the spelling without a final dot.
    "tp.": "thành phố ",
    "tp ": "thành phố ",
    "t.p.": "thành phố ",
    "t.p ": "thành phố ",
    # "phường" (ward), "quận" (urban district), "huyện" (rural district)
    "p.": "phường ",
    "q.": "quận ",
    "h.": "huyện ",
    # "thị xã" (town)
    "tx.": "thị xã ",
    "t.x.": "thị xã ",
    # "thị trấn" (township)
    "tt.": "thị trấn ",
    "t.t.": "thị trấn ",
    # "xã" (commune)
    "x.": "xã ",
}


def remove_diacritics(text: str) -> str:
    """Return *text* with Vietnamese diacritics stripped.

    The input is first passed through underthesea's character normalization
    (NFC + character map) to repair encoding issues. It is then decomposed
    with NFKD and every combining mark is dropped, leaving the base letters.
    """
    cleaned = normalize_characters_in_text(text)
    decomposed = unicodedata.normalize("NFKD", cleaned)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    # "đ"/"Đ" are standalone letters with no NFKD decomposition, so they are
    # mapped to their ASCII counterparts explicitly.
    return "".join(base_chars).translate(str.maketrans({"đ": "d", "Đ": "D"}))


def normalize_key(text: str) -> str:
    """Build a lookup key from *text*.

    The key is lowercase, free of diacritics, and contains only the
    characters ``a-z`` and ``0-9`` (spaces and punctuation are dropped).
    """
    lowered = text.lower().strip()
    ascii_text = remove_diacritics(lowered)
    # Whatever is left that is not an ASCII letter or digit is discarded.
    return re.sub(r"[^a-z0-9]", "", ascii_text)


def expand_abbreviations(text: str) -> str:
    """Expand common Vietnamese address abbreviations.

    The text is lowercased and stripped, then every key of ``ABBREVIATIONS``
    that starts a token is replaced by its expansion. An abbreviation is only
    recognised when the character before it is not a letter (or a combining
    diacritic of a decomposed letter), so ordinary words that merely end in
    the same characters are left alone: ``"bình thạnh."`` keeps its final
    word instead of turning into ``"bình thạnhuyện"``, and ``"kp.3"`` is not
    rewritten to ``"kphường 3"``. Abbreviations that follow a digit,
    punctuation, or whitespace (``"q.1,p.5"``, ``"tp.hcm"``) still expand.

    Args:
        text: Raw address text.

    Returns:
        The lowercased text with abbreviations expanded and surrounding
        whitespace removed.
    """
    result = text.lower().strip()
    if not ABBREVIATIONS:
        # An empty alternation would match the empty string everywhere.
        return result

    # Longest keys first so that e.g. "t.p." wins over the shorter "p.".
    alternatives = "|".join(
        re.escape(abbr) for abbr in sorted(ABBREVIATIONS, key=len, reverse=True)
    )
    # (?<![^\W\d_])       -> previous character is not a letter
    # (?<![\u0300-\u036f]) -> nor a combining mark of a decomposed letter
    pattern = r"(?<![^\W\d_])(?<![\u0300-\u036f])(?:" + alternatives + ")"

    # A callable replacement keeps the expansion text literal (no backslash
    # or group-reference processing) and does a single left-to-right pass.
    result = re.sub(pattern, lambda match: ABBREVIATIONS[match.group(0)], result)
    return result.strip()


def normalize_for_matching(text: str) -> str:
    """Run the full normalization pipeline used for fuzzy matching.

    Abbreviations are expanded first; the expanded text is then reduced to a
    lookup key by ``normalize_key``.
    """
    return normalize_key(expand_abbreviations(text))