x committed on
Commit
5363153
·
verified ·
1 Parent(s): 734de03

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,13 +1,25 @@
1
  ---
2
  title: Indian Address Parser
3
- emoji: 🏠
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: gradio
7
- sdk_version: "6.3.0"
 
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
 
 
 
 
 
 
 
 
 
 
11
  ---
12
 
13
  # Indian Address Parser
@@ -18,7 +30,7 @@ Parse unstructured Indian addresses into structured components using **IndicBERT
18
 
19
  - **Multilingual**: Supports Hindi (Devanagari) + English
20
  - **15 Entity Types**: House Number, Floor, Block, Gali, Colony, Area, Khasra, Pincode, etc.
21
- - **~80% F1 score** on held-out test data (mBERT-CRF baseline)
22
  - **Fast**: < 30ms inference time
23
 
24
  ## Example
@@ -43,4 +55,4 @@ PLOT NO752 FIRST FLOOR, BLOCK H-3 KH NO 24/1/3/2/2/202, KAUNWAR SINGH NAGAR NEW
43
 
44
  - **Model**: ai4bharat/IndicBERTv2-SS + CRF layer
45
  - **Training Data**: 600+ annotated Delhi addresses
46
- - **Framework**: PyTorch + HuggingFace Transformers
 
1
  ---
2
  title: Indian Address Parser
3
+ emoji: "\U0001F3E0"
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: gradio
7
+ sdk_version: "6.5.1"
8
+ python_version: "3.14"
9
  app_file: app.py
10
  pinned: false
11
  license: mit
12
+ short_description: Parse Indian addresses with IndicBERTv2-CRF NER
13
+ models:
14
+ - x2aqq/indian-address-parser-model
15
+ tags:
16
+ - ner
17
+ - address-parsing
18
+ - indian-addresses
19
+ - bert
20
+ - crf
21
+ preload_from_hub:
22
+ - x2aqq/indian-address-parser-model
23
  ---
24
 
25
  # Indian Address Parser
 
30
 
31
  - **Multilingual**: Supports Hindi (Devanagari) + English
32
  - **15 Entity Types**: House Number, Floor, Block, Gali, Colony, Area, Khasra, Pincode, etc.
33
+ - **~80% F1 score** on held-out test data (IndicBERTv2-CRF)
34
  - **Fast**: < 30ms inference time
35
 
36
  ## Example
 
55
 
56
  - **Model**: ai4bharat/IndicBERTv2-SS + CRF layer
57
  - **Training Data**: 600+ annotated Delhi addresses
58
+ - **Framework**: PyTorch + HuggingFace Transformers + Pydantic v2
requirements.txt CHANGED
@@ -3,7 +3,6 @@ torch>=2.9.1
3
  transformers>=4.57.6
4
  tokenizers>=0.22.2
5
  huggingface_hub>=0.25.0
6
- gradio>=6.3.0
7
  pydantic>=2.12.5
8
  indic-transliteration>=2.3.75
9
  rapidfuzz>=3.14.3
 
3
  transformers>=4.57.6
4
  tokenizers>=0.22.2
5
  huggingface_hub>=0.25.0
 
6
  pydantic>=2.12.5
7
  indic-transliteration>=2.3.75
8
  rapidfuzz>=3.14.3
src/address_parser/__pycache__/pipeline.cpython-314.pyc CHANGED
Binary files a/src/address_parser/__pycache__/pipeline.cpython-314.pyc and b/src/address_parser/__pycache__/pipeline.cpython-314.pyc differ
 
src/address_parser/__pycache__/schemas.cpython-314.pyc CHANGED
Binary files a/src/address_parser/__pycache__/schemas.cpython-314.pyc and b/src/address_parser/__pycache__/schemas.cpython-314.pyc differ
 
src/address_parser/models/__pycache__/bert_crf.cpython-314.pyc CHANGED
Binary files a/src/address_parser/models/__pycache__/bert_crf.cpython-314.pyc and b/src/address_parser/models/__pycache__/bert_crf.cpython-314.pyc differ
 
src/address_parser/models/__pycache__/config.cpython-314.pyc CHANGED
Binary files a/src/address_parser/models/__pycache__/config.cpython-314.pyc and b/src/address_parser/models/__pycache__/config.cpython-314.pyc differ
 
src/address_parser/postprocessing/__pycache__/gazetteer.cpython-314.pyc CHANGED
Binary files a/src/address_parser/postprocessing/__pycache__/gazetteer.cpython-314.pyc and b/src/address_parser/postprocessing/__pycache__/gazetteer.cpython-314.pyc differ
 
src/address_parser/postprocessing/__pycache__/rules.cpython-314.pyc CHANGED
Binary files a/src/address_parser/postprocessing/__pycache__/rules.cpython-314.pyc and b/src/address_parser/postprocessing/__pycache__/rules.cpython-314.pyc differ
 
src/address_parser/postprocessing/gazetteer.py CHANGED
@@ -1,6 +1,5 @@
1
  """Delhi locality gazetteer for fuzzy matching and validation."""
2
 
3
-
4
  from rapidfuzz import fuzz, process
5
 
6
 
 
1
  """Delhi locality gazetteer for fuzzy matching and validation."""
2
 
 
3
  from rapidfuzz import fuzz, process
4
 
5
 
src/address_parser/postprocessing/rules.py CHANGED
@@ -390,36 +390,32 @@ class RuleBasedRefiner:
390
  result = []
391
 
392
  for entity in entities:
393
- corrected = entity.model_copy()
394
 
395
  # Expand KHASRA to include full pattern
396
  if entity.label == "KHASRA":
397
  match = self.PATTERNS["KHASRA"].search(text)
398
  if match:
399
- corrected.value = match.group(0)
400
- corrected.start = match.start()
401
- corrected.end = match.end()
402
 
403
  # Expand BLOCK to include identifier
404
  elif entity.label == "BLOCK":
405
  match = self.PATTERNS["BLOCK"].search(text)
406
  if match:
407
- corrected.value = match.group(0)
408
- corrected.start = match.start()
409
- corrected.end = match.end()
410
 
411
  # Expand FLOOR to include floor number
412
  elif entity.label == "FLOOR":
413
  match = self.PATTERNS["FLOOR"].search(text)
414
  if match:
415
- corrected.value = match.group(0)
416
- corrected.start = match.start()
417
- corrected.end = match.end()
418
 
419
  # Clean up leading/trailing whitespace from value
420
- corrected.value = corrected.value.strip()
 
 
421
 
422
- result.append(corrected)
423
 
424
  return result
425
 
@@ -432,24 +428,27 @@ class RuleBasedRefiner:
432
  result = []
433
 
434
  for entity in entities:
435
- adjusted = entity.model_copy()
436
 
437
  # Boost confidence for pattern matches
438
  if entity.label in self.PATTERNS:
439
  pattern = self.PATTERNS[entity.label]
440
  if pattern.fullmatch(entity.value):
441
- adjusted.confidence = min(1.0, entity.confidence + 0.1)
442
 
443
  # Boost confidence for gazetteer matches
444
  if self.gazetteer and entity.label in ("AREA", "SUBAREA", "COLONY"):
445
  if self.gazetteer.is_known_locality(entity.value):
446
- adjusted.confidence = min(1.0, entity.confidence + 0.15)
447
 
448
  # Reduce confidence for very short entities
449
  if len(entity.value) < 3:
450
- adjusted.confidence = max(0.0, entity.confidence - 0.2)
451
 
452
- result.append(adjusted)
 
 
 
453
 
454
  return result
455
 
@@ -513,8 +512,7 @@ class RuleBasedRefiner:
513
  continue
514
  if self.gazetteer and not self.gazetteer.validate_pincode(entity.value):
515
  # Pincode outside Delhi range - reduce confidence but keep
516
- entity = entity.model_copy()
517
- entity.confidence *= 0.7
518
 
519
  result.append(entity)
520
 
 
390
  result = []
391
 
392
  for entity in entities:
393
+ updates: dict[str, object] = {}
394
 
395
  # Expand KHASRA to include full pattern
396
  if entity.label == "KHASRA":
397
  match = self.PATTERNS["KHASRA"].search(text)
398
  if match:
399
+ updates = {"value": match.group(0), "start": match.start(), "end": match.end()}
 
 
400
 
401
  # Expand BLOCK to include identifier
402
  elif entity.label == "BLOCK":
403
  match = self.PATTERNS["BLOCK"].search(text)
404
  if match:
405
+ updates = {"value": match.group(0), "start": match.start(), "end": match.end()}
 
 
406
 
407
  # Expand FLOOR to include floor number
408
  elif entity.label == "FLOOR":
409
  match = self.PATTERNS["FLOOR"].search(text)
410
  if match:
411
+ updates = {"value": match.group(0), "start": match.start(), "end": match.end()}
 
 
412
 
413
  # Clean up leading/trailing whitespace from value
414
+ final_value = (updates.get("value") or entity.value).strip()
415
+ if final_value != entity.value or updates:
416
+ updates["value"] = final_value
417
 
418
+ result.append(entity.model_copy(update=updates) if updates else entity)
419
 
420
  return result
421
 
 
428
  result = []
429
 
430
  for entity in entities:
431
+ new_confidence = entity.confidence
432
 
433
  # Boost confidence for pattern matches
434
  if entity.label in self.PATTERNS:
435
  pattern = self.PATTERNS[entity.label]
436
  if pattern.fullmatch(entity.value):
437
+ new_confidence = min(1.0, new_confidence + 0.1)
438
 
439
  # Boost confidence for gazetteer matches
440
  if self.gazetteer and entity.label in ("AREA", "SUBAREA", "COLONY"):
441
  if self.gazetteer.is_known_locality(entity.value):
442
+ new_confidence = min(1.0, new_confidence + 0.15)
443
 
444
  # Reduce confidence for very short entities
445
  if len(entity.value) < 3:
446
+ new_confidence = max(0.0, new_confidence - 0.2)
447
 
448
+ if new_confidence != entity.confidence:
449
+ result.append(entity.model_copy(update={"confidence": new_confidence}))
450
+ else:
451
+ result.append(entity)
452
 
453
  return result
454
 
 
512
  continue
513
  if self.gazetteer and not self.gazetteer.validate_pincode(entity.value):
514
  # Pincode outside Delhi range - reduce confidence but keep
515
+ entity = entity.model_copy(update={"confidence": entity.confidence * 0.7})
 
516
 
517
  result.append(entity)
518
 
src/address_parser/preprocessing/__pycache__/hindi.cpython-314.pyc CHANGED
Binary files a/src/address_parser/preprocessing/__pycache__/hindi.cpython-314.pyc and b/src/address_parser/preprocessing/__pycache__/hindi.cpython-314.pyc differ
 
src/address_parser/preprocessing/__pycache__/normalizer.cpython-314.pyc CHANGED
Binary files a/src/address_parser/preprocessing/__pycache__/normalizer.cpython-314.pyc and b/src/address_parser/preprocessing/__pycache__/normalizer.cpython-314.pyc differ
 
src/address_parser/schemas.py CHANGED
@@ -1,6 +1,8 @@
1
- """Pydantic schemas for address parsing I/O."""
2
 
3
- from pydantic import BaseModel, ConfigDict, Field
 
 
4
 
5
  # Entity label definitions
6
  ENTITY_LABELS = [
@@ -21,6 +23,13 @@ ENTITY_LABELS = [
21
  "STATE",
22
  ]
23
 
 
 
 
 
 
 
 
24
  # BIO tag generation
25
  BIO_LABELS = ["O"] + [f"B-{label}" for label in ENTITY_LABELS] + [f"I-{label}" for label in ENTITY_LABELS]
26
  LABEL2ID = {label: i for i, label in enumerate(BIO_LABELS)}
@@ -28,15 +37,11 @@ ID2LABEL = {i: label for i, label in enumerate(BIO_LABELS)}
28
 
29
 
30
  class AddressEntity(BaseModel):
31
- """A single extracted entity from an address."""
32
-
33
- label: str = Field(..., description="Entity type (e.g., HOUSE_NUMBER, AREA)")
34
- value: str = Field(..., description="Extracted text value")
35
- start: int = Field(..., description="Start character offset in original text")
36
- end: int = Field(..., description="End character offset in original text")
37
- confidence: float = Field(default=1.0, ge=0.0, le=1.0, description="Confidence score")
38
 
39
  model_config = ConfigDict(
 
 
40
  json_schema_extra={
41
  "example": {
42
  "label": "HOUSE_NUMBER",
@@ -45,49 +50,21 @@ class AddressEntity(BaseModel):
45
  "end": 10,
46
  "confidence": 0.95,
47
  }
48
- }
49
  )
50
 
 
 
 
 
 
51
 
52
- class ParsedAddress(BaseModel):
53
- """Complete parsed address with all entities."""
54
-
55
- raw_address: str = Field(..., description="Original input address")
56
- normalized_address: str = Field(..., description="Normalized/cleaned address")
57
- entities: list[AddressEntity] = Field(default_factory=list, description="Extracted entities")
58
 
59
- # Convenience accessors for common fields
60
- house_number: str | None = Field(None, description="Extracted house/plot number")
61
- floor: str | None = Field(None, description="Extracted floor")
62
- block: str | None = Field(None, description="Extracted block")
63
- gali: str | None = Field(None, description="Extracted gali/lane")
64
- colony: str | None = Field(None, description="Extracted colony name")
65
- area: str | None = Field(None, description="Extracted area/locality")
66
- subarea: str | None = Field(None, description="Extracted sub-area")
67
- sector: str | None = Field(None, description="Extracted sector")
68
- khasra: str | None = Field(None, description="Extracted khasra number")
69
- pincode: str | None = Field(None, description="Extracted PIN code")
70
- city: str | None = Field(None, description="Extracted city")
71
- state: str | None = Field(None, description="Extracted state")
72
-
73
- def model_post_init(self, __context) -> None:
74
- """Populate convenience fields from entities."""
75
- entity_map = {e.label.upper(): e.value for e in self.entities}
76
-
77
- self.house_number = entity_map.get("HOUSE_NUMBER") or entity_map.get("PLOT")
78
- self.floor = entity_map.get("FLOOR")
79
- self.block = entity_map.get("BLOCK")
80
- self.gali = entity_map.get("GALI")
81
- self.colony = entity_map.get("COLONY")
82
- self.area = entity_map.get("AREA")
83
- self.subarea = entity_map.get("SUBAREA")
84
- self.sector = entity_map.get("SECTOR")
85
- self.khasra = entity_map.get("KHASRA")
86
- self.pincode = entity_map.get("PINCODE")
87
- self.city = entity_map.get("CITY")
88
- self.state = entity_map.get("STATE")
89
 
90
  model_config = ConfigDict(
 
91
  json_schema_extra={
92
  "example": {
93
  "raw_address": "PLOT NO752 FIRST FLOOR, BLOCK H-3, NEW DELHI, 110041",
@@ -99,25 +76,97 @@ class ParsedAddress(BaseModel):
99
  "house_number": "PLOT NO752",
100
  "floor": "FIRST FLOOR",
101
  }
102
- }
103
  )
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  class ParseRequest(BaseModel):
107
  """Request schema for parsing addresses."""
108
 
109
- address: str = Field(..., min_length=5, max_length=500, description="Address to parse")
110
- return_confidence: bool = Field(default=True, description="Include confidence scores")
111
-
112
  model_config = ConfigDict(
 
113
  json_schema_extra={
114
  "example": {
115
  "address": "PLOT NO752 FIRST FLOOR, BLOCK H-3, NEW DELHI, 110041",
116
  "return_confidence": True,
117
  }
118
- }
119
  )
120
 
 
 
 
121
 
122
  class BatchParseRequest(BaseModel):
123
  """Request schema for batch parsing."""
 
1
+ """Pydantic v2 schemas for address parsing I/O."""
2
 
3
+ from typing import Literal
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field, computed_field
6
 
7
  # Entity label definitions
8
  ENTITY_LABELS = [
 
23
  "STATE",
24
  ]
25
 
26
+ # Type-safe entity label literal
27
+ EntityLabel = Literal[
28
+ "AREA", "SUBAREA", "HOUSE_NUMBER", "SECTOR", "GALI",
29
+ "COLONY", "BLOCK", "CAMP", "POLE", "KHASRA",
30
+ "FLOOR", "PLOT", "PINCODE", "CITY", "STATE",
31
+ ]
32
+
33
  # BIO tag generation
34
  BIO_LABELS = ["O"] + [f"B-{label}" for label in ENTITY_LABELS] + [f"I-{label}" for label in ENTITY_LABELS]
35
  LABEL2ID = {label: i for i, label in enumerate(BIO_LABELS)}
 
37
 
38
 
39
  class AddressEntity(BaseModel):
40
+ """A single extracted entity from an address. Immutable after creation."""
 
 
 
 
 
 
41
 
42
  model_config = ConfigDict(
43
+ frozen=True,
44
+ str_strip_whitespace=True,
45
  json_schema_extra={
46
  "example": {
47
  "label": "HOUSE_NUMBER",
 
50
  "end": 10,
51
  "confidence": 0.95,
52
  }
53
+ },
54
  )
55
 
56
+ label: EntityLabel = Field(..., description="Entity type (e.g., HOUSE_NUMBER, AREA)")
57
+ value: str = Field(..., min_length=1, description="Extracted text value")
58
+ start: int = Field(..., ge=0, description="Start character offset in original text")
59
+ end: int = Field(..., ge=0, description="End character offset in original text")
60
+ confidence: float = Field(default=1.0, ge=0.0, le=1.0, description="Confidence score")
61
 
 
 
 
 
 
 
62
 
63
+ class ParsedAddress(BaseModel):
64
+ """Complete parsed address with all entities and computed convenience accessors."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  model_config = ConfigDict(
67
+ str_strip_whitespace=True,
68
  json_schema_extra={
69
  "example": {
70
  "raw_address": "PLOT NO752 FIRST FLOOR, BLOCK H-3, NEW DELHI, 110041",
 
76
  "house_number": "PLOT NO752",
77
  "floor": "FIRST FLOOR",
78
  }
79
+ },
80
  )
81
 
82
+ raw_address: str = Field(..., description="Original input address")
83
+ normalized_address: str = Field(..., description="Normalized/cleaned address")
84
+ entities: list[AddressEntity] = Field(default_factory=list, description="Extracted entities")
85
+
86
+ def _get_entity(self, *labels: str) -> str | None:
87
+ """Look up first matching entity value by label(s)."""
88
+ for entity in self.entities:
89
+ if entity.label in labels:
90
+ return entity.value
91
+ return None
92
+
93
+ @computed_field(description="Extracted house/plot number")
94
+ @property
95
+ def house_number(self) -> str | None:
96
+ return self._get_entity("HOUSE_NUMBER", "PLOT")
97
+
98
+ @computed_field(description="Extracted floor")
99
+ @property
100
+ def floor(self) -> str | None:
101
+ return self._get_entity("FLOOR")
102
+
103
+ @computed_field(description="Extracted block")
104
+ @property
105
+ def block(self) -> str | None:
106
+ return self._get_entity("BLOCK")
107
+
108
+ @computed_field(description="Extracted gali/lane")
109
+ @property
110
+ def gali(self) -> str | None:
111
+ return self._get_entity("GALI")
112
+
113
+ @computed_field(description="Extracted colony name")
114
+ @property
115
+ def colony(self) -> str | None:
116
+ return self._get_entity("COLONY")
117
+
118
+ @computed_field(description="Extracted area/locality")
119
+ @property
120
+ def area(self) -> str | None:
121
+ return self._get_entity("AREA")
122
+
123
+ @computed_field(description="Extracted sub-area")
124
+ @property
125
+ def subarea(self) -> str | None:
126
+ return self._get_entity("SUBAREA")
127
+
128
+ @computed_field(description="Extracted sector")
129
+ @property
130
+ def sector(self) -> str | None:
131
+ return self._get_entity("SECTOR")
132
+
133
+ @computed_field(description="Extracted khasra number")
134
+ @property
135
+ def khasra(self) -> str | None:
136
+ return self._get_entity("KHASRA")
137
+
138
+ @computed_field(description="Extracted PIN code")
139
+ @property
140
+ def pincode(self) -> str | None:
141
+ return self._get_entity("PINCODE")
142
+
143
+ @computed_field(description="Extracted city")
144
+ @property
145
+ def city(self) -> str | None:
146
+ return self._get_entity("CITY")
147
+
148
+ @computed_field(description="Extracted state")
149
+ @property
150
+ def state(self) -> str | None:
151
+ return self._get_entity("STATE")
152
+
153
 
154
  class ParseRequest(BaseModel):
155
  """Request schema for parsing addresses."""
156
 
 
 
 
157
  model_config = ConfigDict(
158
+ str_strip_whitespace=True,
159
  json_schema_extra={
160
  "example": {
161
  "address": "PLOT NO752 FIRST FLOOR, BLOCK H-3, NEW DELHI, 110041",
162
  "return_confidence": True,
163
  }
164
+ },
165
  )
166
 
167
+ address: str = Field(..., min_length=5, max_length=500, description="Address to parse")
168
+ return_confidence: bool = Field(default=True, description="Include confidence scores")
169
+
170
 
171
  class BatchParseRequest(BaseModel):
172
  """Request schema for batch parsing."""