File size: 5,797 Bytes
47bc13b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""Delhi locality gazetteer for fuzzy matching and validation."""

from rapidfuzz import fuzz, process


class DelhiGazetteer:
    """

    Gazetteer of Delhi localities, areas, and common address terms.



    Used for:

    - Fuzzy matching to correct misspellings

    - Entity validation

    - Confidence boosting for known locations

    """

    # Major Delhi localities/areas
    LOCALITIES = {
        # South Delhi
        "SAKET", "MALVIYA NAGAR", "HAUZ KHAS", "GREEN PARK", "GREATER KAILASH",
        "DEFENCE COLONY", "LAJPAT NAGAR", "SOUTH EXTENSION", "CHITTARANJAN PARK",
        "KALKAJI", "NEHRU PLACE", "OKHLA", "JASOLA", "SARITA VIHAR",
        "ALAKNANDA", "SAFDARJUNG", "VASANT KUNJ", "MEHRAULI", "CHATTARPUR",

        # North Delhi
        "CIVIL LINES", "MODEL TOWN", "MUKHERJEE NAGAR", "KAMLA NAGAR",
        "SHAKTI NAGAR", "GULABI BAGH", "ASHOK VIHAR", "SHALIMAR BAGH",
        "PITAMPURA", "ROHINI", "NARELA", "BAWANA", "ALIPUR",

        # East Delhi
        "PREET VIHAR", "MAYUR VIHAR", "PATPARGANJ", "PANDAV NAGAR",
        "LAKSHMI NAGAR", "SHAKARPUR", "GEETA COLONY", "GANDHI NAGAR",
        "DILSHAD GARDEN", "SEELAMPUR", "SHAHDARA", "ANAND VIHAR",

        # West Delhi
        "JANAKPURI", "DWARKA", "PALAM", "UTTAM NAGAR", "VIKASPURI",
        "TILAK NAGAR", "RAJOURI GARDEN", "PUNJABI BAGH", "PASCHIM VIHAR",
        "MEERA BAGH", "PEERAGARHI", "MUNDKA", "NANGLOI", "NAJAFGARH",
        "BINDAPUR", "KAKROLA", "MOHAN GARDEN", "NAWADA",

        # Central Delhi
        "CONNAUGHT PLACE", "KAROL BAGH", "PAHARGANJ", "DARYAGANJ",
        "CHANDNI CHOWK", "SADAR BAZAAR", "RAJENDER NAGAR", "PATEL NAGAR",
        "KIRTI NAGAR", "MOTIA KHAN", "ANAND PARBAT", "JHANDEWALAN",

        # New Delhi
        "CHANAKYAPURI", "LODHI ROAD", "GOLF LINKS", "JORBAGH",
        "SUNDAR NAGAR", "NIZAMUDDIN", "LODI COLONY", "PANDARA ROAD",

        # Other areas
        "BADARPUR", "TUGHLAKABAD", "SANGAM VIHAR", "MADANPUR KHADAR",
        "GOVINDPURI", "AMBEDKAR NAGAR", "LADO SARAI", "TIGRI",
        "BURARI", "KARAWAL NAGAR", "BHAJANPURA", "MUSTAFABAD",
        "JAFFRABAD", "MAUJPUR", "GOKALPUR", "SEEMAPURI",
    }

    # Common colony/nagar suffixes
    NAGAR_SUFFIXES = {
        "NAGAR", "VIHAR", "COLONY", "ENCLAVE", "EXTENSION", "PURI",
        "PARK", "GARDEN", "BAGH", "KUNJ", "APARTMENT", "RESIDENCY",
        "COMPLEX", "PHASE", "SECTOR", "BLOCK", "POCKET",
    }

    # Common area names from the training data
    COMMON_AREAS = {
        "KAUNWAR SINGH NAGAR", "BABA HARI DAS COLONY", "TIKARI KALA",
        "CHANCHAL PARK", "SWARN PARK", "MUNDKA", "NANGLOI", "BAKKARWALA",
        "MAJRA DABAS", "CHAND NAGAR", "RANHOLA", "BAPROLA", "POOTH KHURD",
        "KIRARI", "SULTANPURI", "MANGOLPURI", "BEGUMPUR", "KADIPUR",
        "RAMA VIHAR", "PREM NAGAR", "VIJAY PARK", "AMBICA VIHAR",
        "SHIV PURI", "BUDH VIHAR", "POOTH KALAN", "QUTUBGARH",
        "RANI KHERA", "SHAHABAD DAIRY", "SAMAIPUR", "JAHANGIRPURI",
        "SANNOTH", "KANJHAWALA", "BAWANA", "ALIPUR",
    }

    # Common Hindi transliterated terms
    HINDI_TERMS = {
        "MOHALLA", "GALI", "KATRA", "BASTI", "BAZAR", "CHOWK",
        "GANJ", "PUR", "ABAD", "GARH", "GAON", "KHERA", "KHURD", "KALAN",
    }

    def __init__(self, min_similarity: float = 80.0):
        """

        Initialize gazetteer.



        Args:

            min_similarity: Minimum fuzzy match score (0-100)

        """
        self.min_similarity = min_similarity

        # Build combined set for matching
        self.all_places = (
            self.LOCALITIES |
            self.COMMON_AREAS |
            {f"{term}" for term in self.HINDI_TERMS}
        )

    def fuzzy_match(

        self,

        text: str,

        limit: int = 3

    ) -> list[tuple[str, float]]:
        """

        Find fuzzy matches for a text in the gazetteer.



        Args:

            text: Text to match

            limit: Maximum number of matches



        Returns:

            List of (matched_text, score) tuples

        """
        if not text or len(text) < 3:
            return []

        matches = process.extract(
            text.upper(),
            self.all_places,
            scorer=fuzz.ratio,
            limit=limit
        )

        return [(m[0], m[1]) for m in matches if m[1] >= self.min_similarity]

    def is_known_locality(self, text: str, threshold: float = 85.0) -> bool:
        """Check if text matches a known locality."""
        matches = self.fuzzy_match(text, limit=1)
        return bool(matches and matches[0][1] >= threshold)

    def correct_spelling(self, text: str) -> str | None:
        """

        Attempt to correct spelling using gazetteer.



        Returns corrected text or None if no good match.

        """
        matches = self.fuzzy_match(text, limit=1)
        if matches and matches[0][1] >= 90.0:
            return matches[0][0]
        return None

    def get_locality_type(self, text: str) -> str | None:
        """

        Determine if text contains a known locality type suffix.



        Returns the suffix type or None.

        """
        text_upper = text.upper()
        for suffix in self.NAGAR_SUFFIXES:
            if text_upper.endswith(suffix):
                return suffix
        return None

    def validate_pincode(self, pincode: str, locality: str | None = None) -> bool:
        """

        Validate if a pincode is valid for Delhi.



        Delhi pincodes are in range 110001-110097.

        """
        if not pincode or not pincode.isdigit() or len(pincode) != 6:
            return False

        code = int(pincode)
        # Delhi pincode range
        return 110001 <= code <= 110097