File size: 20,484 Bytes
47bc13b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5363153
47bc13b
 
 
 
 
5363153
47bc13b
 
 
 
 
5363153
47bc13b
 
 
 
 
5363153
47bc13b
 
5363153
 
 
47bc13b
5363153
47bc13b
 
 
 
 
 
 
 
 
 
 
 
5363153
47bc13b
 
 
 
 
5363153
47bc13b
 
 
 
5363153
47bc13b
 
 
5363153
47bc13b
5363153
 
 
 
47bc13b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5363153
47bc13b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
"""Rule-based post-processing for entity refinement."""

import re

from address_parser.postprocessing.gazetteer import DelhiGazetteer
from address_parser.schemas import AddressEntity


class RuleBasedRefiner:
    """Rule-based post-processor that refines NER address predictions.

    The ``refine`` pipeline applies, in order:

    - Known-locality and directional-area detection
    - Pattern-based entity detection (pincode, city, state)
    - Boundary correction for KHASRA, BLOCK and FLOOR entities
    - Merging of adjacent fragmented locality entities
    - Confidence adjustment
    - Overlap removal, then validation and filtering
    """

    # Regex patterns for deterministic entities, keyed by entity label.
    # All of them use only non-capturing groups, so ``findall`` yields the
    # full matched text.
    PATTERNS = {
        "PINCODE": re.compile(r'\b[1-9]\d{5}\b'),
        "KHASRA": re.compile(
            r'\b(?:KH\.?\s*(?:NO\.?)?\s*|KHASRA\s*(?:NO\.?)?\s*)[\d/]+(?:[/-]\d+)*\b',
            re.IGNORECASE
        ),
        "HOUSE_NUMBER": re.compile(
            r'\b(?:H\.?\s*(?:NO\.?)?\s*|HOUSE\s*(?:NO\.?)?\s*|PLOT\s*(?:NO\.?)?\s*)?[A-Z]?\d+[A-Z]?(?:[-/]\d+)*\b',
            re.IGNORECASE
        ),
        # The ordinal prefix is optional and is followed by ``\s*``, so a match
        # may begin with whitespace when no ordinal is present.
        "FLOOR": re.compile(
            r'\b(?:GROUND|FIRST|SECOND|THIRD|FOURTH|FIFTH|1ST|2ND|3RD|4TH|5TH|GF|FF|SF|TF)?\s*(?:FLOOR|FLR)\b',
            re.IGNORECASE
        ),
        "BLOCK": re.compile(
            r'\b(?:BLOCK|BLK|BL)\s*[A-Z]?[-]?[A-Z0-9]+\b',
            re.IGNORECASE
        ),
        "SECTOR": re.compile(
            r'\b(?:SECTOR|SEC)\s*\d+[A-Z]?\b',
            re.IGNORECASE
        ),
        "GALI": re.compile(
            r'\b(?:GALI|GALLI|LANE)\s*(?:NO\.?)?\s*\d+[A-Z]?\b',
            re.IGNORECASE
        ),
    }

    # Directional area patterns as (compiled regex, canonical area name) pairs.
    # Compound directions (e.g. SOUTH WEST DELHI) contain a simpler entry
    # (WEST DELHI) as a substring, so consumers that stop at the first hit
    # should try the longer names first.
    AREA_PATTERNS = [
        (re.compile(r'\bSOUTH\s+DELHI\b', re.IGNORECASE), "SOUTH DELHI"),
        (re.compile(r'\bNORTH\s+DELHI\b', re.IGNORECASE), "NORTH DELHI"),
        (re.compile(r'\bEAST\s+DELHI\b', re.IGNORECASE), "EAST DELHI"),
        (re.compile(r'\bWEST\s+DELHI\b', re.IGNORECASE), "WEST DELHI"),
        (re.compile(r'\bCENTRAL\s+DELHI\b', re.IGNORECASE), "CENTRAL DELHI"),
        (re.compile(r'\bSOUTH\s+WEST\s+DELHI\b', re.IGNORECASE), "SOUTH WEST DELHI"),
        (re.compile(r'\bNORTH\s+WEST\s+DELHI\b', re.IGNORECASE), "NORTH WEST DELHI"),
        (re.compile(r'\bNORTH\s+EAST\s+DELHI\b', re.IGNORECASE), "NORTH EAST DELHI"),
        (re.compile(r'\bSOUTH\s+EAST\s+DELHI\b', re.IGNORECASE), "SOUTH EAST DELHI"),
        (re.compile(r'\bOUTER\s+DELHI\b', re.IGNORECASE), "OUTER DELHI"),
    ]

    # City patterns as (compiled regex, canonical city name) pairs.
    CITY_PATTERNS = [
        (re.compile(r'\bNEW\s+DELHI\b', re.IGNORECASE), "NEW DELHI"),
        (re.compile(r'\bDELHI\b', re.IGNORECASE), "DELHI"),
        (re.compile(r'\bNOIDA\b', re.IGNORECASE), "NOIDA"),
        (re.compile(r'\bGURUGRAM\b', re.IGNORECASE), "GURUGRAM"),
        (re.compile(r'\bGURGAON\b', re.IGNORECASE), "GURGAON"),
        (re.compile(r'\bFARIDABAD\b', re.IGNORECASE), "FARIDABAD"),
        (re.compile(r'\bGHAZIABAD\b', re.IGNORECASE), "GHAZIABAD"),
    ]

    # State patterns as (compiled regex, canonical state name) pairs.
    # The "U.P." abbreviation is compiled without IGNORECASE, so it only
    # matches upper-case text.
    STATE_PATTERNS = [
        (re.compile(r'\bDELHI\b', re.IGNORECASE), "DELHI"),
        (re.compile(r'\bHARYANA\b', re.IGNORECASE), "HARYANA"),
        (re.compile(r'\bUTTAR\s+PRADESH\b', re.IGNORECASE), "UTTAR PRADESH"),
        (re.compile(r'\bU\.?\s*P\.?\b'), "UTTAR PRADESH"),
    ]

    # Suffixes that mark a colony/nagar style locality name; a merged span
    # ending in one of these is accepted by ``_is_valid_merge``.
    COLONY_SUFFIXES = [
        "NAGAR", "VIHAR", "COLONY", "ENCLAVE", "PARK", "GARDEN",
        "PURI", "BAGH", "KUNJ", "EXTENSION", "EXTN", "PHASE",
    ]

    # Known localities (upper-case) that the NER model tends to fragment;
    # most are multi-word names.
    KNOWN_LOCALITIES = [
        "LAJPAT NAGAR", "MALVIYA NAGAR", "KAROL BAGH", "HAUZ KHAS",
        "GREEN PARK", "GREATER KAILASH", "DEFENCE COLONY", "SOUTH EXTENSION",
        "CHITTARANJAN PARK", "NEHRU PLACE", "SARITA VIHAR", "VASANT KUNJ",
        "CIVIL LINES", "MODEL TOWN", "MUKHERJEE NAGAR", "KAMLA NAGAR",
        "ASHOK VIHAR", "SHALIMAR BAGH", "PREET VIHAR", "MAYUR VIHAR",
        "LAKSHMI NAGAR", "GANDHI NAGAR", "DILSHAD GARDEN", "ANAND VIHAR",
        "UTTAM NAGAR", "TILAK NAGAR", "RAJOURI GARDEN", "PUNJABI BAGH",
        "PASCHIM VIHAR", "CONNAUGHT PLACE", "RAJENDER NAGAR", "PATEL NAGAR",
        "KIRTI NAGAR", "LODHI ROAD", "GOLF LINKS", "SANGAM VIHAR",
        "GOVINDPURI", "AMBEDKAR NAGAR", "LADO SARAI", "KAUNWAR SINGH NAGAR",
        "BABA HARI DAS COLONY", "SWARN PARK", "CHANCHAL PARK", "DURGA PARK",
        "RAJ NAGAR", "SADH NAGAR", "VIJAY ENCLAVE", "PALAM COLONY",
    ]

    def __init__(self, use_gazetteer: bool = True):
        """

        Initialize refiner.



        Args:

            use_gazetteer: Use gazetteer for validation/correction

        """
        self.gazetteer = DelhiGazetteer() if use_gazetteer else None

    def refine(

        self,

        text: str,

        entities: list[AddressEntity]

    ) -> list[AddressEntity]:
        """

        Refine entity predictions.



        Args:

            text: Original address text

            entities: Predicted entities from NER model



        Returns:

            Refined list of entities

        """
        refined = list(entities)

        # First: detect and fix known localities from gazetteer
        refined = self._fix_known_localities(text, refined)

        # Add rule-based entities that may have been missed
        refined = self._add_pattern_entities(text, refined)

        # Detect area patterns (SOUTH DELHI, etc.)
        refined = self._add_area_patterns(text, refined)

        # Correct entity boundaries
        refined = self._correct_boundaries(text, refined)

        # Merge fragmented entities
        refined = self._merge_fragmented_entities(text, refined)

        # Adjust confidence scores
        refined = self._adjust_confidence(text, refined)

        # Remove duplicates and overlapping entities
        refined = self._remove_overlaps(refined)

        # Validate entities
        refined = self._validate_entities(refined)

        return refined

    def _fix_known_localities(

        self,

        text: str,

        entities: list[AddressEntity]

    ) -> list[AddressEntity]:
        """Fix fragmented known localities using gazetteer lookup."""
        text_upper = text.upper()
        result = []
        used_ranges: list[tuple[int, int]] = []

        # First pass: find all known localities in text
        locality_entities = []
        for locality in self.KNOWN_LOCALITIES:
            idx = 0
            while True:
                pos = text_upper.find(locality, idx)
                if pos == -1:
                    break
                end = pos + len(locality)
                locality_entities.append(AddressEntity(
                    label="SUBAREA",
                    value=text[pos:end],
                    start=pos,
                    end=end,
                    confidence=0.95
                ))
                used_ranges.append((pos, end))
                idx = end

        # Also check area patterns
        for pattern, area_name in self.AREA_PATTERNS:
            match = pattern.search(text)
            if match:
                start, end = match.start(), match.end()
                # Check for overlap with existing ranges
                overlaps = any(
                    not (end <= s or start >= e)
                    for s, e in used_ranges
                )
                if not overlaps:
                    locality_entities.append(AddressEntity(
                        label="AREA",
                        value=area_name,
                        start=start,
                        end=end,
                        confidence=0.95
                    ))
                    used_ranges.append((start, end))

        # Filter out original entities that overlap with found localities
        for entity in entities:
            # Check if entity overlaps with any locality range
            overlaps_locality = any(
                not (entity.end <= start or entity.start >= end)
                for start, end in used_ranges
            )

            if overlaps_locality and entity.label in ("AREA", "SUBAREA", "COLONY", "CITY"):
                # Skip this fragmented entity
                continue

            result.append(entity)

        # Add the locality entities
        result.extend(locality_entities)

        return result

    def _add_area_patterns(

        self,

        text: str,

        entities: list[AddressEntity]

    ) -> list[AddressEntity]:
        """Add area patterns like SOUTH DELHI, NORTH DELHI (already handled in _fix_known_localities)."""
        # This is now handled in _fix_known_localities to avoid duplicates
        return entities

    def _merge_fragmented_entities(

        self,

        text: str,

        entities: list[AddressEntity]

    ) -> list[AddressEntity]:
        """Merge adjacent entities of same type that should be together."""
        if len(entities) < 2:
            return entities

        # Sort by position
        sorted_entities = sorted(entities, key=lambda e: e.start)
        result = []
        i = 0

        while i < len(sorted_entities):
            current = sorted_entities[i]

            # Look for adjacent entities to merge
            if current.label in ("AREA", "SUBAREA", "COLONY", "CITY"):
                merged_end = current.end
                merged_confidence = current.confidence
                j = i + 1

                # Check subsequent entities
                while j < len(sorted_entities):
                    next_ent = sorted_entities[j]

                    # Check if adjacent (within 2 chars - allows for space)
                    gap = next_ent.start - merged_end
                    if gap <= 2 and next_ent.label in ("AREA", "SUBAREA", "COLONY", "CITY"):
                        # Check if the merged text forms a known locality
                        merged_text = text[current.start:next_ent.end].strip()
                        if self._is_valid_merge(merged_text):
                            merged_end = next_ent.end
                            merged_confidence = max(merged_confidence, next_ent.confidence)
                            j += 1
                        else:
                            break
                    else:
                        break

                # Create merged entity if we merged anything
                if j > i + 1:
                    merged_value = text[current.start:merged_end].strip()
                    result.append(AddressEntity(
                        label=current.label,
                        value=merged_value,
                        start=current.start,
                        end=merged_end,
                        confidence=merged_confidence
                    ))
                    i = j
                    continue

            result.append(current)
            i += 1

        return result

    def _is_valid_merge(self, text: str) -> bool:
        """Check if merged text forms a valid locality name."""
        text_upper = text.upper().strip()

        # Check against known localities
        if text_upper in self.KNOWN_LOCALITIES:
            return True

        # Check gazetteer
        if self.gazetteer and self.gazetteer.is_known_locality(text_upper, threshold=80):
            return True

        # Check if ends with common suffix
        for suffix in self.COLONY_SUFFIXES:
            if text_upper.endswith(suffix):
                return True

        return False

    def _add_pattern_entities(

        self,

        text: str,

        entities: list[AddressEntity]

    ) -> list[AddressEntity]:
        """Add entities detected by regex patterns."""
        result = list(entities)
        existing_spans = {(e.start, e.end) for e in entities}

        # Check for pincode
        if not any(e.label == "PINCODE" for e in entities):
            match = self.PATTERNS["PINCODE"].search(text)
            if match and (match.start(), match.end()) not in existing_spans:
                result.append(AddressEntity(
                    label="PINCODE",
                    value=match.group(0),
                    start=match.start(),
                    end=match.end(),
                    confidence=1.0  # Rule-based, high confidence
                ))

        # Check for city - DELHI addresses always have DELHI as city
        has_city = any(e.label == "CITY" for e in result)
        if not has_city:
            # If text contains DELHI anywhere, set city to DELHI
            if "DELHI" in text.upper():
                # Find the last occurrence of DELHI (usually the city mention)
                delhi_positions = [m.start() for m in re.finditer(r'\bDELHI\b', text.upper())]
                if delhi_positions:
                    pos = delhi_positions[-1]  # Use last occurrence
                    result.append(AddressEntity(
                        label="CITY",
                        value="DELHI",
                        start=pos,
                        end=pos + 5,
                        confidence=0.90
                    ))
            else:
                # Check other city patterns
                for pattern, city_name in self.CITY_PATTERNS:
                    if city_name == "DELHI":
                        continue  # Already handled above
                    match = pattern.search(text)
                    if match and (match.start(), match.end()) not in existing_spans:
                        result.append(AddressEntity(
                            label="CITY",
                            value=city_name,
                            start=match.start(),
                            end=match.end(),
                            confidence=0.95
                        ))
                        break

        # Check for state
        if not any(e.label == "STATE" for e in entities):
            for pattern, state_name in self.STATE_PATTERNS:
                match = pattern.search(text)
                if match and (match.start(), match.end()) not in existing_spans:
                    # Avoid tagging "DELHI" as state if it's already a city
                    if state_name == "DELHI" and any(e.label == "CITY" and "DELHI" in e.value.upper() for e in result):
                        continue
                    result.append(AddressEntity(
                        label="STATE",
                        value=state_name,
                        start=match.start(),
                        end=match.end(),
                        confidence=0.90
                    ))
                    break

        return result

    def _correct_boundaries(

        self,

        text: str,

        entities: list[AddressEntity]

    ) -> list[AddressEntity]:
        """Correct entity boundaries based on patterns."""
        result = []

        for entity in entities:
            updates: dict[str, object] = {}

            # Expand KHASRA to include full pattern
            if entity.label == "KHASRA":
                match = self.PATTERNS["KHASRA"].search(text)
                if match:
                    updates = {"value": match.group(0), "start": match.start(), "end": match.end()}

            # Expand BLOCK to include identifier
            elif entity.label == "BLOCK":
                match = self.PATTERNS["BLOCK"].search(text)
                if match:
                    updates = {"value": match.group(0), "start": match.start(), "end": match.end()}

            # Expand FLOOR to include floor number
            elif entity.label == "FLOOR":
                match = self.PATTERNS["FLOOR"].search(text)
                if match:
                    updates = {"value": match.group(0), "start": match.start(), "end": match.end()}

            # Clean up leading/trailing whitespace from value
            final_value = (updates.get("value") or entity.value).strip()
            if final_value != entity.value or updates:
                updates["value"] = final_value

            result.append(entity.model_copy(update=updates) if updates else entity)

        return result

    def _adjust_confidence(

        self,

        text: str,

        entities: list[AddressEntity]

    ) -> list[AddressEntity]:
        """Adjust confidence scores based on patterns and gazetteer."""
        result = []

        for entity in entities:
            new_confidence = entity.confidence

            # Boost confidence for pattern matches
            if entity.label in self.PATTERNS:
                pattern = self.PATTERNS[entity.label]
                if pattern.fullmatch(entity.value):
                    new_confidence = min(1.0, new_confidence + 0.1)

            # Boost confidence for gazetteer matches
            if self.gazetteer and entity.label in ("AREA", "SUBAREA", "COLONY"):
                if self.gazetteer.is_known_locality(entity.value):
                    new_confidence = min(1.0, new_confidence + 0.15)

            # Reduce confidence for very short entities
            if len(entity.value) < 3:
                new_confidence = max(0.0, new_confidence - 0.2)

            if new_confidence != entity.confidence:
                result.append(entity.model_copy(update={"confidence": new_confidence}))
            else:
                result.append(entity)

        return result

    def _remove_overlaps(

        self,

        entities: list[AddressEntity]

    ) -> list[AddressEntity]:
        """Remove overlapping entities, keeping higher confidence ones."""
        if not entities:
            return entities

        # Separate CITY and PINCODE entities - these should always be kept
        # as they represent different semantic levels than AREA/SUBAREA
        preserved_labels = {"CITY", "PINCODE", "STATE"}
        preserved_entities = [e for e in entities if e.label in preserved_labels]
        other_entities = [e for e in entities if e.label not in preserved_labels]

        # Sort non-preserved by confidence (descending) then by start position
        sorted_entities = sorted(other_entities, key=lambda e: (-e.confidence, e.start))

        result: list[AddressEntity] = []
        used_ranges: list[tuple[int, int]] = []

        for entity in sorted_entities:
            # Check for overlap with existing entities
            overlaps = False
            for start, end in used_ranges:
                if not (entity.end <= start or entity.start >= end):
                    overlaps = True
                    break

            if not overlaps:
                result.append(entity)
                used_ranges.append((entity.start, entity.end))

        # Add back preserved entities (CITY, PINCODE, STATE)
        result.extend(preserved_entities)

        # Sort by position for output
        return sorted(result, key=lambda e: e.start)

    def _validate_entities(

        self,

        entities: list[AddressEntity]

    ) -> list[AddressEntity]:
        """Validate and filter entities."""
        result = []

        for entity in entities:
            # Skip empty values
            if not entity.value.strip():
                continue

            # Skip very low confidence
            if entity.confidence < 0.3:
                continue

            # Validate pincode format
            if entity.label == "PINCODE":
                if not re.fullmatch(r'[1-9]\d{5}', entity.value):
                    continue
                if self.gazetteer and not self.gazetteer.validate_pincode(entity.value):
                    # Pincode outside Delhi range - reduce confidence but keep
                    entity = entity.model_copy(update={"confidence": entity.confidence * 0.7})

            result.append(entity)

        return result

    def extract_all_patterns(self, text: str) -> dict[str, list[str]]:
        """

        Extract all pattern-based entities from text.



        Returns dict of label -> list of matched values.

        """
        results = {}

        for label, pattern in self.PATTERNS.items():
            matches = pattern.findall(text)
            if matches:
                results[label] = matches

        return results