x committed on
Upload folder using huggingface_hub
Browse files- README.md +46 -12
- app.py +280 -0
- requirements.txt +10 -0
- src/address_parser/__init__.py +26 -0
- src/address_parser/__pycache__/__init__.cpython-312.pyc +0 -0
- src/address_parser/__pycache__/__init__.cpython-314.pyc +0 -0
- src/address_parser/__pycache__/cli.cpython-314.pyc +0 -0
- src/address_parser/__pycache__/pipeline.cpython-312.pyc +0 -0
- src/address_parser/__pycache__/pipeline.cpython-314.pyc +0 -0
- src/address_parser/__pycache__/schemas.cpython-312.pyc +0 -0
- src/address_parser/__pycache__/schemas.cpython-314.pyc +0 -0
- src/address_parser/cli.py +132 -0
- src/address_parser/models/__init__.py +6 -0
- src/address_parser/models/__pycache__/__init__.cpython-312.pyc +0 -0
- src/address_parser/models/__pycache__/__init__.cpython-314.pyc +0 -0
- src/address_parser/models/__pycache__/bert_crf.cpython-312.pyc +0 -0
- src/address_parser/models/__pycache__/bert_crf.cpython-314.pyc +0 -0
- src/address_parser/models/__pycache__/config.cpython-312.pyc +0 -0
- src/address_parser/models/__pycache__/config.cpython-314.pyc +0 -0
- src/address_parser/models/bert_crf.py +439 -0
- src/address_parser/models/config.py +103 -0
- src/address_parser/pipeline.py +528 -0
- src/address_parser/postprocessing/__init__.py +6 -0
- src/address_parser/postprocessing/__pycache__/__init__.cpython-312.pyc +0 -0
- src/address_parser/postprocessing/__pycache__/__init__.cpython-314.pyc +0 -0
- src/address_parser/postprocessing/__pycache__/gazetteer.cpython-312.pyc +0 -0
- src/address_parser/postprocessing/__pycache__/gazetteer.cpython-314.pyc +0 -0
- src/address_parser/postprocessing/__pycache__/rules.cpython-312.pyc +0 -0
- src/address_parser/postprocessing/__pycache__/rules.cpython-314.pyc +0 -0
- src/address_parser/postprocessing/gazetteer.py +164 -0
- src/address_parser/postprocessing/rules.py +536 -0
- src/address_parser/preprocessing/__init__.py +6 -0
- src/address_parser/preprocessing/__pycache__/__init__.cpython-312.pyc +0 -0
- src/address_parser/preprocessing/__pycache__/__init__.cpython-314.pyc +0 -0
- src/address_parser/preprocessing/__pycache__/hindi.cpython-312.pyc +0 -0
- src/address_parser/preprocessing/__pycache__/hindi.cpython-314.pyc +0 -0
- src/address_parser/preprocessing/__pycache__/normalizer.cpython-312.pyc +0 -0
- src/address_parser/preprocessing/__pycache__/normalizer.cpython-314.pyc +0 -0
- src/address_parser/preprocessing/hindi.py +242 -0
- src/address_parser/preprocessing/normalizer.py +192 -0
- src/address_parser/schemas.py +152 -0
- src/indian_address_parser.egg-info/PKG-INFO +383 -0
- src/indian_address_parser.egg-info/SOURCES.txt +24 -0
- src/indian_address_parser.egg-info/dependency_links.txt +1 -0
- src/indian_address_parser.egg-info/entry_points.txt +2 -0
- src/indian_address_parser.egg-info/requires.txt +48 -0
- src/indian_address_parser.egg-info/top_level.txt +1 -0
README.md
CHANGED
|
@@ -1,12 +1,46 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Indian Address Parser
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 6.
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Indian Address Parser
|
| 3 |
+
emoji: 🏠
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: "6.3.0"
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Indian Address Parser
|
| 14 |
+
|
| 15 |
+
Parse unstructured Indian addresses into structured components using **IndicBERTv2-CRF**.
|
| 16 |
+
|
| 17 |
+
## Features
|
| 18 |
+
|
| 19 |
+
- **Multilingual**: Supports Hindi (Devanagari) + English
|
| 20 |
+
- **15 Entity Types**: House Number, Floor, Block, Gali, Colony, Area, Khasra, Pincode, etc.
|
| 21 |
+
- **~80% F1 score** on held-out test data (mBERT-CRF baseline)
|
| 22 |
+
- **Fast**: < 30ms inference time
|
| 23 |
+
|
| 24 |
+
## Example
|
| 25 |
+
|
| 26 |
+
**Input:**
|
| 27 |
+
```
|
| 28 |
+
PLOT NO752 FIRST FLOOR, BLOCK H-3 KH NO 24/1/3/2/2/202, KAUNWAR SINGH NAGAR NEW DELHI, DELHI, 110041
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
**Output:**
|
| 32 |
+
| Entity | Value |
|
| 33 |
+
|--------|-------|
|
| 34 |
+
| HOUSE_NUMBER | PLOT NO752 |
|
| 35 |
+
| FLOOR | FIRST FLOOR |
|
| 36 |
+
| BLOCK | BLOCK H-3 |
|
| 37 |
+
| KHASRA | KH NO 24/1/3/2/2/202 |
|
| 38 |
+
| AREA | KAUNWAR SINGH NAGAR |
|
| 39 |
+
| CITY | NEW DELHI |
|
| 40 |
+
| PINCODE | 110041 |
|
| 41 |
+
|
| 42 |
+
## Technical Details
|
| 43 |
+
|
| 44 |
+
- **Model**: ai4bharat/IndicBERTv2-SS + CRF layer
|
| 45 |
+
- **Training Data**: 600+ annotated Delhi addresses
|
| 46 |
+
- **Framework**: PyTorch + HuggingFace Transformers
|
app.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Gradio demo for Indian Address Parser.
|
| 3 |
+
|
| 4 |
+
Interactive web interface for HuggingFace Spaces deployment.
|
| 5 |
+
Features:
|
| 6 |
+
- Real-time address parsing
|
| 7 |
+
- Entity highlighting
|
| 8 |
+
- Example addresses
|
| 9 |
+
- Confidence scores
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import sys
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
import gradio as gr
|
| 17 |
+
|
| 18 |
+
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
| 19 |
+
|
| 20 |
+
from address_parser import AddressParser, ParsedAddress
|
| 21 |
+
|
| 22 |
+
# Entity colors for visualization.
# Maps entity label -> hex background color used for the highlighted spans
# and the legend; labels missing from this dict fall back to gray (#CCCCCC)
# in create_highlighted_html.
ENTITY_COLORS = {
    "HOUSE_NUMBER": "#FF6B6B",  # Red
    "PLOT": "#FF6B6B",  # shares HOUSE_NUMBER's red (same visual category)
    "FLOOR": "#4ECDC4",  # Teal
    "BLOCK": "#45B7D1",  # Blue
    "SECTOR": "#96CEB4",  # Green
    "GALI": "#FFEAA7",  # Yellow
    "COLONY": "#DDA0DD",  # Plum
    "AREA": "#98D8C8",  # Mint
    "SUBAREA": "#F7DC6F",  # Light yellow
    "KHASRA": "#BB8FCE",  # Purple
    "PINCODE": "#85C1E9",  # Light blue
    "CITY": "#F8B500",  # Orange
    "STATE": "#58D68D",  # Light green
}

# Example addresses shown as one-click inputs in the Gradio demo
# (Delhi-centric, mixing plot/khasra/gali styles and casing variants).
EXAMPLES = [
    "PLOT NO752 FIRST FLOOR, BLOCK H-3 KH NO 24/1/3/2/2/202, KAUNWAR SINGH NAGAR NEW DELHI, DELHI, 110041",
    "H.NO. 123, GALI NO. 5, LAJPAT NAGAR, SOUTH DELHI, 110024",
    "FLAT NO A-501, SECTOR 15, DWARKA, NEW DELHI, 110078",
    "KHASRA NO 45/2, VILLAGE MUNDKA, OUTER DELHI, 110041",
    "S-3/166, GROUND FLOOR, KH NO 98/4, GALI NO-6, SWARN PARK MUNDKA, Delhi, 110041",
    "PLOT NO A5 GROUND FLOOR, KHASRA NO 15/20/2 BABA HARI DAS COLONY, TIKARI KALA, DELHI, 110041",
]
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def load_parser():
    """Load the address parser, preferring a local checkout, then the Hub.

    Resolution order:
      1. A local model directory (``MODEL_PATH`` env var) that contains a
         ``pytorch_model.bin`` weights file.
      2. A HuggingFace Hub repo named by the ``HF_MODEL_REPO`` env var.
      3. Rules-only mode (no trained model) as the last resort.
    """
    from huggingface_hub import snapshot_download

    # Configuration - HF_MODEL_REPO should be set in Space settings
    hub_repo = os.getenv("HF_MODEL_REPO", "")
    local_model_dir = os.getenv("MODEL_PATH", "./models/address_ner_v3")

    # Try local path first (for development/testing); a usable checkout must
    # have the weights file alongside the config.
    local = Path(local_model_dir)
    if local.exists() and (local / "pytorch_model.bin").exists():
        print(f"Loading model from local path: {local_model_dir}")
        return AddressParser.from_pretrained(local_model_dir, device="cpu")

    # Otherwise try pulling a snapshot from the HuggingFace Hub.
    if hub_repo:
        try:
            print(f"Downloading model from HuggingFace Hub: {hub_repo}")
            snapshot_dir = snapshot_download(repo_id=hub_repo, repo_type="model")
            print(f"Model downloaded to: {snapshot_dir}")
            return AddressParser.from_pretrained(snapshot_dir, device="cpu")
        except Exception as e:
            # Deliberately best-effort: a Hub failure degrades to rules-only
            # mode rather than crashing the Space at startup.
            print(f"Failed to load model from HF Hub: {e}")

    # Fallback to rules-only mode
    print("No model available, using rules-only mode")
    return AddressParser.rules_only()
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# Initialize parser
|
| 79 |
+
parser = load_parser()
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def create_highlighted_html(result: "ParsedAddress") -> str:
    """Render the normalized address as HTML with entity spans highlighted.

    Each recognized entity is wrapped in a colored ``<span>`` whose tooltip
    shows the label and confidence; text between entities is emitted as-is.

    All text originating from the (user-supplied) address is passed through
    ``html.escape`` so markup typed into the input box cannot be injected
    into the page via the gr.HTML output component.

    Args:
        result: Parsed address with ``normalized_address`` and ``entities``
            (each entity exposing start/end/label/value/confidence).

    Returns:
        An HTML fragment string.
    """
    import html  # local import: keeps module import-time behavior unchanged

    text = result.normalized_address
    if not result.entities:
        return f"<p>{html.escape(text)}</p>"

    # Walk entities left-to-right, copying the gaps between them verbatim.
    html_parts: list[str] = []
    last_end = 0
    for entity in sorted(result.entities, key=lambda e: e.start):
        if entity.start > last_end:
            html_parts.append(html.escape(text[last_end:entity.start]))

        # Unknown labels fall back to a neutral gray.
        color = ENTITY_COLORS.get(entity.label, "#CCCCCC")
        html_parts.append(
            f'<span style="background-color: {color}; padding: 2px 6px; '
            f'border-radius: 4px; margin: 0 2px; font-weight: bold;" '
            f'title="{html.escape(entity.label)} ({entity.confidence:.0%})">'
            f'{html.escape(entity.value)}</span>'
        )
        last_end = entity.end

    # Trailing text after the last entity.
    if last_end < len(text):
        html_parts.append(html.escape(text[last_end:]))

    return "".join(html_parts)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def create_entity_table(result: ParsedAddress) -> list[list[str]]:
    """Build the [label, value, confidence%] rows for the results Dataframe.

    Rows are ordered by the entity's start offset in the address; an address
    with no entities yields an empty table.
    """
    if not result.entities:
        return []

    rows = []
    for ent in sorted(result.entities, key=lambda e: e.start):
        rows.append([ent.label, ent.value, f"{ent.confidence:.0%}"])
    return rows
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def parse_address(address: str) -> tuple[str, list[list[str]], str]:
    """
    Parse address and return results for Gradio interface.

    Returns:
        - Highlighted HTML
        - Entity table
        - Structured output JSON
    """
    import json

    # Empty / whitespace-only input: show a prompt instead of parsing.
    if not address or not address.strip():
        return "<p>Please enter an address</p>", [], "{}"

    parsed = parser.parse(address)

    html_view = create_highlighted_html(parsed)
    rows = create_entity_table(parsed)

    # Collect the structured fields in display order, dropping falsy
    # (None/empty) values so the JSON only shows what was extracted.
    field_names = (
        "house_number", "floor", "block", "gali", "colony", "area",
        "subarea", "sector", "khasra", "pincode", "city", "state",
    )
    structured = {
        name: value
        for name in field_names
        if (value := getattr(parsed, name))
    }

    structured_json = json.dumps(structured, indent=2, ensure_ascii=False)
    return html_view, rows, structured_json
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
# Custom CSS for the demo (applied via the gr.Blocks `css` argument).
CUSTOM_CSS = """
.highlighted-text {
    font-size: 1.1em;
    line-height: 1.8;
    padding: 15px;
    background: #f8f9fa;
    border-radius: 8px;
}
"""

# Create Gradio interface.
# FIX: `theme` and `css` are gr.Blocks() constructor arguments, not
# demo.launch() arguments. Previously they were passed to launch(), which
# raises a TypeError (launch has no such parameters) and meant CUSTOM_CSS
# was never applied. They are now set here on Blocks.
with gr.Blocks(
    title="Indian Address Parser",
    theme=gr.themes.Soft(),
    css=CUSTOM_CSS,
) as demo:
    gr.Markdown(
        """
        # Indian Address Parser

        Parse unstructured Indian addresses into structured components using
        **mBERT-CRF** (Multilingual BERT with Conditional Random Field).

        ## Features
        - Supports Hindi + English (Devanagari and Latin scripts)
        - 15 entity types: House Number, Floor, Block, Gali, Colony, Area, Khasra, Pincode, etc.
        - Delhi-specific locality gazetteer for improved accuracy
        - < 30ms inference time

        ---
        """
    )

    with gr.Row():
        with gr.Column(scale=2):
            address_input = gr.Textbox(
                label="Enter Address",
                placeholder="e.g., PLOT NO752 FIRST FLOOR, BLOCK H-3, NEW DELHI, 110041",
                lines=3,
            )
            parse_btn = gr.Button("Parse Address", variant="primary")

            gr.Examples(
                examples=[[ex] for ex in EXAMPLES],
                inputs=[address_input],
                label="Example Addresses",
            )

    gr.Markdown("## Results")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Highlighted Entities")
            highlighted_output = gr.HTML(
                elem_classes=["highlighted-text"]
            )

        with gr.Column(scale=1):
            gr.Markdown("### Extracted Entities")
            entity_table = gr.Dataframe(
                headers=["Entity Type", "Value", "Confidence"],
                datatype=["str", "str", "str"],
                row_count=10,
            )

    with gr.Row():
        gr.Markdown("### Structured Output")
        structured_output = gr.Code(
            language="json",
            label="Structured JSON",
        )

    # Legend: one colored chip per known entity label.
    gr.Markdown("### Entity Legend")
    legend_html = " ".join([
        f'<span style="background-color: {color}; padding: 2px 8px; '
        f'border-radius: 4px; margin: 2px; display: inline-block;">{label}</span>'
        for label, color in ENTITY_COLORS.items()
    ])
    gr.HTML(f"<div style='line-height: 2.5;'>{legend_html}</div>")

    # Footer
    gr.Markdown(
        """
        ---
        **Model**: IndicBERTv2-SS + CRF (ai4bharat/IndicBERTv2-SS + CRF layer)
        | **Training Data**: 600+ annotated Delhi addresses
        | **GitHub**: [indian-address-parser](https://github.com/howdoiusekeyboard/indian-address-parser)
        """
    )

    # Event handlers: button click and Enter-in-textbox run the same parse.
    parse_btn.click(
        fn=parse_address,
        inputs=[address_input],
        outputs=[highlighted_output, entity_table, structured_output],
    )

    address_input.submit(
        fn=parse_address,
        inputs=[address_input],
        outputs=[highlighted_output, entity_table, structured_output],
    )


if __name__ == "__main__":
    # Bind on all interfaces; port is configurable for Spaces via $PORT.
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", "7860")),
        share=False,
    )
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HuggingFace Spaces requirements (Python 3.14)
|
| 2 |
+
torch>=2.9.1
|
| 3 |
+
transformers>=4.57.6
|
| 4 |
+
tokenizers>=0.22.2
|
| 5 |
+
huggingface_hub>=0.25.0
|
| 6 |
+
gradio>=6.3.0
|
| 7 |
+
pydantic>=2.12.5
|
| 8 |
+
indic-transliteration>=2.3.75
|
| 9 |
+
rapidfuzz>=3.14.3
|
| 10 |
+
regex>=2026.1.15
|
src/address_parser/__init__.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Indian Address Parser - Production-grade NER for Indian addresses.
|
| 3 |
+
|
| 4 |
+
A modern NLP system for parsing unstructured Indian addresses into
|
| 5 |
+
structured components using mBERT-CRF architecture with Hindi+English support.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
__version__ = "2.0.0"
|
| 9 |
+
__author__ = "Kushagra"
|
| 10 |
+
|
| 11 |
+
from address_parser.pipeline import AddressParser
|
| 12 |
+
from address_parser.schemas import (
|
| 13 |
+
AddressEntity,
|
| 14 |
+
ParsedAddress,
|
| 15 |
+
ParseRequest,
|
| 16 |
+
ParseResponse,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
__all__ = [
|
| 20 |
+
"AddressParser",
|
| 21 |
+
"AddressEntity",
|
| 22 |
+
"ParsedAddress",
|
| 23 |
+
"ParseRequest",
|
| 24 |
+
"ParseResponse",
|
| 25 |
+
"__version__",
|
| 26 |
+
]
|
src/address_parser/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (741 Bytes). View file
|
|
|
src/address_parser/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (738 Bytes). View file
|
|
|
src/address_parser/__pycache__/cli.cpython-314.pyc
ADDED
|
Binary file (6.4 kB). View file
|
|
|
src/address_parser/__pycache__/pipeline.cpython-312.pyc
ADDED
|
Binary file (16.9 kB). View file
|
|
|
src/address_parser/__pycache__/pipeline.cpython-314.pyc
ADDED
|
Binary file (19.6 kB). View file
|
|
|
src/address_parser/__pycache__/schemas.cpython-312.pyc
ADDED
|
Binary file (7.94 kB). View file
|
|
|
src/address_parser/__pycache__/schemas.cpython-314.pyc
ADDED
|
Binary file (10.2 kB). View file
|
|
|
src/address_parser/cli.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Command-line interface for Indian Address Parser."""
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def main() -> None:
    """Main CLI entry point.

    Parses a single address (positional argument) or a file of addresses
    (--input, one per line), using either a trained model (--model) or the
    rules-only fallback, and prints results in json/table/simple format.
    Exits with status 1 when no address source is given.
    """
    parser = argparse.ArgumentParser(
        description="Parse Indian addresses using NER",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Parse single address
    address-parser "PLOT NO752 FIRST FLOOR, NEW DELHI, 110041"

    # Parse from file
    address-parser --input addresses.txt --output parsed.json

    # Use trained model
    address-parser --model ./models/address_ner_v3 "H.NO. 123, LAJPAT NAGAR"
"""
    )

    parser.add_argument(
        "address",
        nargs="?",
        help="Address to parse (or use --input for file)"
    )
    parser.add_argument(
        "--input", "-i",
        help="Input file with addresses (one per line)"
    )
    parser.add_argument(
        "--output", "-o",
        help="Output JSON file"
    )
    parser.add_argument(
        "--model", "-m",
        help="Path to trained model directory"
    )
    parser.add_argument(
        "--format", "-f",
        choices=["json", "table", "simple"],
        default="json",
        help="Output format (default: json)"
    )
    # NOTE(review): version string duplicates address_parser.__version__ —
    # keep in sync when bumping.
    parser.add_argument(
        "--version", "-v",
        action="version",
        version="indian-address-parser 2.0.0"
    )

    args = parser.parse_args()

    # Import here to avoid slow startup
    from address_parser import AddressParser

    # Load parser: a trained model if its directory exists, else rules-only.
    # Diagnostics go to stderr so stdout stays machine-parseable.
    if args.model and Path(args.model).exists():
        print(f"Loading model from {args.model}...", file=sys.stderr)
        address_parser = AddressParser.from_pretrained(args.model)
    else:
        print("Using rules-only mode", file=sys.stderr)
        address_parser = AddressParser.rules_only()

    # Get addresses to parse (file takes precedence over the positional arg).
    addresses = []
    if args.input:
        with open(args.input, encoding="utf-8") as f:
            addresses = [line.strip() for line in f if line.strip()]
    elif args.address:
        addresses = [args.address]
    else:
        parser.print_help()
        sys.exit(1)

    # Parse addresses
    results = []
    for addr in addresses:
        result = address_parser.parse(addr)
        results.append(result)

    # Output
    if args.format == "json":
        # model_dump() — presumably results are pydantic models; verify
        # against address_parser.schemas.
        output = [r.model_dump() for r in results]
        json_str = json.dumps(output, indent=2, ensure_ascii=False)

        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(json_str)
            print(f"Saved to {args.output}", file=sys.stderr)
        else:
            print(json_str)

    elif args.format == "table":
        # Fixed-width table per address; raw address truncated to 50 chars.
        for i, result in enumerate(results):
            print(f"\n{'='*60}")
            print(f"Address {i+1}: {result.raw_address[:50]}...")
            print(f"{'='*60}")
            print(f"{'Entity':<15} {'Value':<40} {'Conf':<6}")
            print("-" * 60)
            for entity in result.entities:
                print(f"{entity.label:<15} {entity.value:<40} {entity.confidence:.0%}")

    else:  # simple
        # One pipe-separated line per address, only showing populated fields.
        for result in results:
            parts = []
            if result.house_number:
                parts.append(f"House: {result.house_number}")
            if result.floor:
                parts.append(f"Floor: {result.floor}")
            if result.block:
                parts.append(f"Block: {result.block}")
            if result.gali:
                parts.append(f"Gali: {result.gali}")
            if result.colony:
                parts.append(f"Colony: {result.colony}")
            if result.area:
                parts.append(f"Area: {result.area}")
            if result.pincode:
                parts.append(f"PIN: {result.pincode}")
            if result.city:
                parts.append(f"City: {result.city}")

            print(" | ".join(parts) if parts else "No entities found")


if __name__ == "__main__":
    main()
|
src/address_parser/models/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model architectures for address NER."""
|
| 2 |
+
|
| 3 |
+
from address_parser.models.bert_crf import BertCRFForTokenClassification
|
| 4 |
+
from address_parser.models.config import ModelConfig
|
| 5 |
+
|
| 6 |
+
__all__ = ["BertCRFForTokenClassification", "ModelConfig"]
|
src/address_parser/models/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (435 Bytes). View file
|
|
|
src/address_parser/models/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (432 Bytes). View file
|
|
|
src/address_parser/models/__pycache__/bert_crf.cpython-312.pyc
ADDED
|
Binary file (16.8 kB). View file
|
|
|
src/address_parser/models/__pycache__/bert_crf.cpython-314.pyc
ADDED
|
Binary file (20 kB). View file
|
|
|
src/address_parser/models/__pycache__/config.cpython-312.pyc
ADDED
|
Binary file (3.13 kB). View file
|
|
|
src/address_parser/models/__pycache__/config.cpython-314.pyc
ADDED
|
Binary file (3.77 kB). View file
|
|
|
src/address_parser/models/bert_crf.py
ADDED
|
@@ -0,0 +1,439 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
BERT-CRF Model for Indian Address NER.
|
| 3 |
+
|
| 4 |
+
Combines a multilingual BERT encoder with a Conditional Random Field (CRF)
|
| 5 |
+
layer for improved sequence labeling performance.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
from transformers import AutoModel
|
| 11 |
+
from transformers.modeling_outputs import TokenClassifierOutput
|
| 12 |
+
|
| 13 |
+
from address_parser.models.config import ID2LABEL, LABEL2ID, ModelConfig
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class CRF(nn.Module):
|
| 17 |
+
"""
|
| 18 |
+
Conditional Random Field layer for sequence labeling.
|
| 19 |
+
|
| 20 |
+
Implements the forward algorithm for computing log-likelihood
|
| 21 |
+
and Viterbi decoding for inference.
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
    def __init__(self, num_tags: int, batch_first: bool = True):
        """
        Initialize CRF layer.

        Args:
            num_tags: Number of output tags
            batch_first: If True, input is (batch, seq, features)
        """
        super().__init__()
        self.num_tags = num_tags
        self.batch_first = batch_first

        # Transition matrix: transitions[i, j] = score of transitioning from tag i to tag j
        # (the randn values below are immediately overwritten by the uniform
        # init in _init_transitions; randn only sets the Parameter shape)
        self.transitions = nn.Parameter(torch.randn(num_tags, num_tags))

        # Start and end transition scores
        self.start_transitions = nn.Parameter(torch.randn(num_tags))
        self.end_transitions = nn.Parameter(torch.randn(num_tags))

        self._init_transitions()
|
| 44 |
+
|
| 45 |
+
    def _init_transitions(self) -> None:
        """Initialize transition parameters uniformly in [-0.1, 0.1]."""
        nn.init.uniform_(self.transitions, -0.1, 0.1)
        nn.init.uniform_(self.start_transitions, -0.1, 0.1)
        nn.init.uniform_(self.end_transitions, -0.1, 0.1)
|
| 50 |
+
|
| 51 |
+
    def forward(
        self,
        emissions: torch.Tensor,
        tags: torch.LongTensor,
        mask: torch.ByteTensor | None = None,
        reduction: str = "mean",
    ) -> torch.Tensor:
        """
        Compute negative log-likelihood loss.

        Args:
            emissions: Emission scores from BERT (batch, seq, num_tags)
            tags: Gold standard tags (batch, seq)
            mask: Mask for valid tokens (batch, seq)
            reduction: 'mean', 'sum', or 'none'

        Returns:
            Negative log-likelihood loss (scalar for 'mean'/'sum';
            any other value — including 'none' — yields per-sequence losses)
        """
        if mask is None:
            mask = torch.ones_like(tags, dtype=torch.bool)

        # Internal computations are time-major (seq, batch, ...); convert
        # from batch-major if needed.
        if self.batch_first:
            emissions = emissions.transpose(0, 1)
            tags = tags.transpose(0, 1)
            mask = mask.transpose(0, 1)

        # log p(tags | emissions) = score(tags) - log Z
        numerator = self._compute_score(emissions, tags, mask)
        denominator = self._compute_normalizer(emissions, mask)
        llh = numerator - denominator

        if reduction == "mean":
            return -llh.mean()
        elif reduction == "sum":
            return -llh.sum()
        else:
            return -llh
|
| 89 |
+
|
| 90 |
+
    def decode(
        self,
        emissions: torch.Tensor,
        mask: torch.ByteTensor | None = None,
    ) -> list[list[int]]:
        """
        Find the most likely tag sequence using Viterbi algorithm.

        Args:
            emissions: Emission scores (batch, seq, num_tags)
            mask: Mask for valid tokens (batch, seq)

        Returns:
            List of best tag sequences for each sample
        """
        # Default mask: every position is valid.
        if mask is None:
            mask = torch.ones(emissions.shape[:2], dtype=torch.bool, device=emissions.device)

        # Viterbi runs time-major; convert from batch-major if needed.
        if self.batch_first:
            emissions = emissions.transpose(0, 1)
            mask = mask.transpose(0, 1)

        return self._viterbi_decode(emissions, mask)
|
| 113 |
+
|
| 114 |
+
    def _compute_score(
        self,
        emissions: torch.Tensor,
        tags: torch.LongTensor,
        mask: torch.BoolTensor
    ) -> torch.Tensor:
        """Compute the score of a given tag sequence.

        Inputs are time-major (seq first); the caller transposes when
        ``batch_first`` is set.

        Args:
            emissions: Emission scores (seq, batch, num_tags).
            tags: Gold tag indices (seq, batch).
            mask: 1 for real tokens, 0 for padding (seq, batch).

        Returns:
            Unnormalized path score per batch element, shape (batch,).
        """
        seq_length, batch_size = tags.shape
        # Float mask so padded steps contribute zero to the running score.
        mask = mask.float()

        # Start transition score
        # NOTE(review): assumes mask[0] is all ones (first token always valid),
        # as the start transition is added unmasked — confirm with callers.
        score = self.start_transitions[tags[0]]

        for i in range(seq_length - 1):
            current_tag = tags[i]
            next_tag = tags[i + 1]

            # Emission score for step i, zeroed where step i is padding.
            score += emissions[i, torch.arange(batch_size), current_tag] * mask[i]

            # Transition score, zeroed where the *next* step is padding.
            score += self.transitions[current_tag, next_tag] * mask[i + 1]

        # Last emission score: the loop above never adds the emission of the
        # final valid step, so gather each sequence's true last tag and add it.
        last_tag_idx = mask.long().sum(dim=0) - 1
        last_tags = tags.gather(0, last_tag_idx.unsqueeze(0)).squeeze(0)
        score += emissions[last_tag_idx, torch.arange(batch_size), last_tags]

        # End transition score
        score += self.end_transitions[last_tags]

        return score
|
| 146 |
+
|
| 147 |
+
    def _compute_normalizer(
        self,
        emissions: torch.Tensor,
        mask: torch.BoolTensor
    ) -> torch.Tensor:
        """Compute log-sum-exp of all possible tag sequences (partition function).

        Standard CRF forward-algorithm recursion over time-major inputs.

        Args:
            emissions: Emission scores (seq, batch, num_tags).
            mask: Valid-token mask (seq, batch).

        Returns:
            Log partition function per batch element, shape (batch,).
        """
        seq_length = emissions.shape[0]

        # Initialize with start transitions plus first-step emissions:
        # score[b, t] = log-sum over all paths of length 1 ending in tag t.
        score = self.start_transitions + emissions[0]

        for i in range(1, seq_length):
            # Broadcast score and transitions for all (prev_tag, cur_tag)
            # combinations: (batch, prev, 1) + (prev, cur) + (batch, 1, cur).
            broadcast_score = score.unsqueeze(2)
            broadcast_emissions = emissions[i].unsqueeze(1)

            # Compute next scores
            next_score = broadcast_score + self.transitions + broadcast_emissions

            # Log-sum-exp over the previous-tag dimension.
            next_score = torch.logsumexp(next_score, dim=1)

            # Mask: keep the previous score where this timestep is padding,
            # so padded positions never change the partition sum.
            score = torch.where(mask[i].unsqueeze(1), next_score, score)

        # Add end transitions
        score += self.end_transitions

        # Final log-sum-exp over all possible last tags.
        return torch.logsumexp(score, dim=1)
|
| 176 |
+
|
| 177 |
+
    def _viterbi_decode(
        self,
        emissions: torch.Tensor,
        mask: torch.BoolTensor
    ) -> list[list[int]]:
        """Viterbi decoding to find the best tag sequence.

        Args:
            emissions: Emission scores, time-major (seq, batch, num_tags).
            mask: Valid-token mask, time-major (seq, batch).

        Returns:
            Best tag-ID path per batch element; paths may have different
            lengths (each sequence is truncated at its own mask length).
        """
        seq_length, batch_size, num_tags = emissions.shape

        # Initialize: best score of ending at each tag after the first step.
        score = self.start_transitions + emissions[0]
        # history[i-1][b, t] = argmax previous tag when step i has tag t;
        # used to backtrack the best path after the forward sweep.
        history = []

        for i in range(1, seq_length):
            # (batch, prev, 1) + (prev, cur) + (batch, 1, cur)
            broadcast_score = score.unsqueeze(2)
            broadcast_emissions = emissions[i].unsqueeze(1)

            next_score = broadcast_score + self.transitions + broadcast_emissions

            # Find best previous tag for each current tag
            next_score, indices = next_score.max(dim=1)

            # Apply mask: only advance where this timestep is a real token.
            score = torch.where(mask[i].unsqueeze(1), next_score, score)
            history.append(indices)

        # Add end transitions
        score += self.end_transitions

        # Backtrack from each sequence's true last position.
        seq_ends = mask.long().sum(dim=0) - 1
        best_tags_list = []

        for batch_idx in range(batch_size):
            # Best last tag
            _, best_last_tag = score[batch_idx].max(dim=0)
            best_tags = [best_last_tag.item()]

            # Backtrack through history, truncated to this sample's length so
            # padded steps are never followed.
            for hist in reversed(history[:seq_ends[batch_idx]]):
                best_last_tag = hist[batch_idx][best_tags[-1]]
                best_tags.append(best_last_tag.item())

            # Path was collected back-to-front; restore forward order.
            best_tags.reverse()
            best_tags_list.append(best_tags)

        return best_tags_list
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
class BertCRFForTokenClassification(nn.Module):
    """
    BERT model with CRF layer for token classification.

    A multilingual BERT encoder produces per-token emission scores via a
    linear head; when ``config.use_crf`` is set, a CRF layer models
    tag-transition structure for both the training loss and decoding.
    Without the CRF, training uses token-level cross-entropy and decoding
    falls back to per-token argmax.
    """

    def __init__(self, config: ModelConfig):
        """
        Initialize BERT-CRF model.

        Args:
            config: Model configuration (backbone name, label count,
                dropout, CRF flag, cache directory).
        """
        super().__init__()
        self.config = config
        self.num_labels = config.num_labels

        # Load pretrained BERT backbone.
        self.bert = AutoModel.from_pretrained(
            config.model_name,
            cache_dir=config.cache_dir,
        )

        # Dropout before the classification head.
        self.dropout = nn.Dropout(config.classifier_dropout)

        # Per-token classification head producing emission scores.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Optional CRF layer.
        if config.use_crf:
            self.crf = CRF(num_tags=config.num_labels, batch_first=True)
        else:
            self.crf = None

        # Label mappings (shared module-level tables).
        self.id2label = ID2LABEL
        self.label2id = LABEL2ID

        # Lazily torch.compile'd forward, created on first decode() call.
        self._compiled_forward: nn.Module | None = None

    def _get_compiled_forward(self):
        """Return a (possibly torch.compile'd) forward callable, compiling lazily.

        Compilation is skipped on Windows (the inductor backend requires the
        MSVC ``cl`` compiler) or when TORCH_COMPILE_DISABLE=1; any compile
        failure falls back silently to the eager forward.
        """
        import os
        import sys

        skip_compile = (
            os.environ.get("TORCH_COMPILE_DISABLE", "0") == "1"
            or sys.platform == "win32"  # inductor needs a C++ compiler (cl)
        )

        if self._compiled_forward is None:
            if not skip_compile and hasattr(torch, "compile"):
                try:
                    self._compiled_forward = torch.compile(
                        self.forward,
                        backend="inductor",
                        mode="reduce-overhead",
                        dynamic=True,
                    )
                except Exception:
                    # Compilation is a best-effort optimization only.
                    self._compiled_forward = self.forward
            else:
                self._compiled_forward = self.forward
        return self._compiled_forward

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        return_dict: bool = True,
    ):
        """
        Forward pass.

        Args:
            input_ids: Input token IDs (batch, seq)
            attention_mask: Attention mask (batch, seq); 1 for real tokens
            token_type_ids: Token type IDs (batch, seq)
            labels: Gold labels for training (batch, seq); -100 = ignored
            return_dict: Return TokenClassifierOutput instead of a tuple

        Returns:
            TokenClassifierOutput with loss, logits, hidden states
            (or the equivalent tuple when ``return_dict`` is False).
        """
        # BERT encoding
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        sequence_output = outputs.last_hidden_state
        sequence_output = self.dropout(sequence_output)

        # Emission scores per token.
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.crf is not None:
                # CRF loss - need to handle -100 (ignore_index) labels.
                mask = attention_mask.bool() if attention_mask is not None else None
                # Replace -100 with 0 so indices are valid for the CRF.
                # NOTE(review): -100 positions that fall *inside* the
                # attention mask (e.g. special tokens) are scored as label 0
                # rather than ignored — confirm labels/mask are aligned
                # upstream.
                crf_labels = labels.clone()
                crf_labels[crf_labels == -100] = 0
                loss = self.crf(logits, crf_labels, mask=mask, reduction=self.config.crf_reduction)
            else:
                # Standard cross-entropy (ignore_index defaults to -100).
                loss_fct = nn.CrossEntropyLoss()
                if attention_mask is not None:
                    active_loss = attention_mask.view(-1) == 1
                    active_logits = logits.view(-1, self.num_labels)[active_loss]
                    active_labels = labels.view(-1)[active_loss]
                    loss = loss_fct(active_logits, active_labels)
                else:
                    # Fix: previously raised AttributeError when
                    # attention_mask was None. Score every position;
                    # -100 labels are still ignored by the criterion.
                    loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def decode(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
    ) -> list[list[int]]:
        """
        Decode input to tag sequences using the compiled forward pass.

        Uses Viterbi decoding when the CRF layer is enabled, otherwise
        per-token argmax over the logits.

        Args:
            input_ids: Input token IDs (batch, seq)
            attention_mask: Attention mask (batch, seq)
            token_type_ids: Token type IDs (batch, seq)

        Returns:
            List of predicted tag-ID sequences, one per batch sample
        """
        self.eval()
        with torch.no_grad():
            # Use compiled forward for optimized inference when available.
            forward_fn = self._get_compiled_forward()
            outputs = forward_fn(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )

            logits = outputs.logits

            if self.crf is not None:
                mask = attention_mask.bool() if attention_mask is not None else None
                predictions = self.crf.decode(logits, mask=mask)
            else:
                predictions = logits.argmax(dim=-1).tolist()

        return predictions

    def save_pretrained(self, save_directory: str):
        """Save weights (pytorch_model.bin) and a JSON config to a directory."""
        import json
        import os

        os.makedirs(save_directory, exist_ok=True)

        # Save model weights
        torch.save(self.state_dict(), os.path.join(save_directory, "pytorch_model.bin"))

        # Save config (enough fields to rebuild via from_pretrained)
        config_dict = {
            "model_name": self.config.model_name,
            "num_labels": self.config.num_labels,
            "use_crf": self.config.use_crf,
            "hidden_size": self.config.hidden_size,
            "classifier_dropout": self.config.classifier_dropout,
            "id2label": self.id2label,
            "label2id": self.label2id,
        }
        with open(os.path.join(save_directory, "config.json"), "w") as f:
            json.dump(config_dict, f, indent=2)

    @classmethod
    def from_pretrained(cls, model_path: str, device: str = "cpu"):
        """Load a model previously saved with :meth:`save_pretrained`.

        Args:
            model_path: Directory containing config.json and pytorch_model.bin
            device: Device to map weights onto ('cpu', 'cuda', ...)

        Returns:
            Model instance moved to ``device``.
        """
        import json

        with open(f"{model_path}/config.json") as f:
            config_dict = json.load(f)

        config = ModelConfig(
            model_name=config_dict["model_name"],
            num_labels=config_dict["num_labels"],
            use_crf=config_dict["use_crf"],
            hidden_size=config_dict["hidden_size"],
            classifier_dropout=config_dict["classifier_dropout"],
        )

        model = cls(config)
        # Security fix: weights_only=True refuses arbitrary pickled objects
        # in the checkpoint (state dicts load fine; this is the default
        # behavior from PyTorch 2.6 onward).
        state_dict = torch.load(
            f"{model_path}/pytorch_model.bin", map_location=device, weights_only=True
        )
        model.load_state_dict(state_dict)
        model.to(device)

        return model
|
src/address_parser/models/config.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model configuration for address NER."""
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@dataclass
|
| 7 |
+
class ModelConfig:
|
| 8 |
+
"""Configuration for BERT-CRF NER model."""
|
| 9 |
+
|
| 10 |
+
# Base model - IndicBERTv2-SS recommended for Indian languages
|
| 11 |
+
# Options: "bert-base-multilingual-cased", "ai4bharat/IndicBERTv2-SS",
|
| 12 |
+
# "google/muril-base-cased", "xlm-roberta-base"
|
| 13 |
+
model_name: str = "ai4bharat/IndicBERTv2-SS"
|
| 14 |
+
use_crf: bool = True
|
| 15 |
+
|
| 16 |
+
# Architecture
|
| 17 |
+
hidden_size: int = 768
|
| 18 |
+
num_labels: int = 31 # O + 15 entity types * 2 (B-/I-)
|
| 19 |
+
hidden_dropout_prob: float = 0.1
|
| 20 |
+
classifier_dropout: float = 0.1
|
| 21 |
+
|
| 22 |
+
# CRF settings
|
| 23 |
+
crf_reduction: str = "mean" # 'mean' or 'sum'
|
| 24 |
+
|
| 25 |
+
# Training
|
| 26 |
+
max_length: int = 128
|
| 27 |
+
learning_rate: float = 5e-5
|
| 28 |
+
crf_learning_rate: float = 1e-3 # Higher LR for CRF
|
| 29 |
+
weight_decay: float = 0.01
|
| 30 |
+
warmup_ratio: float = 0.1
|
| 31 |
+
num_epochs: int = 10
|
| 32 |
+
batch_size: int = 16
|
| 33 |
+
gradient_accumulation_steps: int = 1
|
| 34 |
+
|
| 35 |
+
# Label smoothing
|
| 36 |
+
label_smoothing: float = 0.0
|
| 37 |
+
|
| 38 |
+
# Early stopping
|
| 39 |
+
early_stopping_patience: int = 5
|
| 40 |
+
early_stopping_threshold: float = 0.001
|
| 41 |
+
|
| 42 |
+
# Layer-wise learning rate decay
|
| 43 |
+
lr_decay: float = 0.95
|
| 44 |
+
|
| 45 |
+
# Paths
|
| 46 |
+
output_dir: str = "./models"
|
| 47 |
+
cache_dir: str | None = None
|
| 48 |
+
|
| 49 |
+
# ONNX export
|
| 50 |
+
onnx_opset_version: int = 14
|
| 51 |
+
|
| 52 |
+
@classmethod
|
| 53 |
+
def from_pretrained_name(cls, name: str) -> ModelConfig:
|
| 54 |
+
"""Create config for known pretrained models."""
|
| 55 |
+
configs = {
|
| 56 |
+
"mbert": cls(
|
| 57 |
+
model_name="bert-base-multilingual-cased",
|
| 58 |
+
hidden_size=768,
|
| 59 |
+
),
|
| 60 |
+
"indicbert": cls(
|
| 61 |
+
model_name="ai4bharat/IndicBERTv2-SS",
|
| 62 |
+
hidden_size=768,
|
| 63 |
+
),
|
| 64 |
+
"distilbert": cls(
|
| 65 |
+
model_name="distilbert-base-multilingual-cased",
|
| 66 |
+
hidden_size=768,
|
| 67 |
+
),
|
| 68 |
+
"xlm-roberta": cls(
|
| 69 |
+
model_name="xlm-roberta-base",
|
| 70 |
+
hidden_size=768,
|
| 71 |
+
),
|
| 72 |
+
"muril": cls(
|
| 73 |
+
model_name="google/muril-base-cased",
|
| 74 |
+
hidden_size=768,
|
| 75 |
+
),
|
| 76 |
+
}
|
| 77 |
+
return configs.get(name, cls())
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# Entity label definitions (must match schemas.py)
ENTITY_LABELS = [
    "AREA",
    "SUBAREA",
    "HOUSE_NUMBER",
    "SECTOR",
    "GALI",
    "COLONY",
    "BLOCK",
    "CAMP",
    "POLE",
    "KHASRA",
    "FLOOR",
    "PLOT",
    "PINCODE",
    "CITY",
    "STATE",
]

# BIO tagging scheme: the outside tag first, then every B- tag, then every
# I- tag, so index 0 is always "O".
BIO_LABELS = [
    "O",
    *(f"B-{name}" for name in ENTITY_LABELS),
    *(f"I-{name}" for name in ENTITY_LABELS),
]

# Bidirectional label <-> id lookup tables derived from BIO_LABELS order.
ID2LABEL = dict(enumerate(BIO_LABELS))
LABEL2ID = {label: idx for idx, label in ID2LABEL.items()}
NUM_LABELS = len(BIO_LABELS)
|
src/address_parser/pipeline.py
ADDED
|
@@ -0,0 +1,528 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Main address parsing pipeline.
|
| 3 |
+
|
| 4 |
+
Orchestrates preprocessing, model inference, and post-processing
|
| 5 |
+
to extract structured entities from Indian addresses.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import time
|
| 9 |
+
import warnings
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
from transformers import AutoTokenizer, logging as hf_logging
|
| 13 |
+
|
| 14 |
+
# Suppress false positive tokenizer warnings in transformers 4.57+
|
| 15 |
+
# The Mistral regex warning is incorrectly triggered for BERT tokenizers
|
| 16 |
+
hf_logging.set_verbosity_error()
|
| 17 |
+
warnings.filterwarnings("ignore", message=".*incorrect regex pattern.*")
|
| 18 |
+
|
| 19 |
+
from address_parser.models.config import ID2LABEL, ModelConfig
|
| 20 |
+
from address_parser.postprocessing import DelhiGazetteer, RuleBasedRefiner
|
| 21 |
+
from address_parser.preprocessing import AddressNormalizer, HindiTransliterator
|
| 22 |
+
from address_parser.schemas import (
|
| 23 |
+
AddressEntity,
|
| 24 |
+
BatchParseResponse,
|
| 25 |
+
ParsedAddress,
|
| 26 |
+
ParseResponse,
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class AddressParser:
|
| 31 |
+
"""
|
| 32 |
+
Main address parsing pipeline.
|
| 33 |
+
|
| 34 |
+
Combines:
|
| 35 |
+
- Text normalization and Hindi transliteration
|
| 36 |
+
- mBERT-CRF model for NER
|
| 37 |
+
- Rule-based post-processing with gazetteer
|
| 38 |
+
|
| 39 |
+
Example:
|
| 40 |
+
>>> parser = AddressParser.from_pretrained("./models/address_ner_v3")
|
| 41 |
+
>>> result = parser.parse("PLOT NO752 FIRST FLOOR, NEW DELHI, 110041")
|
| 42 |
+
>>> print(result.house_number) # "PLOT NO752"
|
| 43 |
+
"""
|
| 44 |
+
|
| 45 |
+
    def __init__(
        self,
        model=None,
        tokenizer=None,
        config: ModelConfig | None = None,
        device: str = "cpu",
        use_rules: bool = True,
        use_gazetteer: bool = True,
    ):
        """
        Initialize parser.

        Args:
            model: Trained NER model (BertCRFForTokenClassification), or
                None for rules-only operation
            tokenizer: HuggingFace tokenizer matching the model, or None
            config: Model configuration; defaults to ModelConfig()
            device: Device to run on ('cpu', 'cuda', 'mps')
            use_rules: Enable rule-based post-processing
            use_gazetteer: Enable gazetteer for validation
        """
        self.model = model
        self.tokenizer = tokenizer
        self.config = config or ModelConfig()
        self.device = device

        # Initialize preprocessing (uppercase normalization with
        # abbreviation expansion, plus Hindi transliteration).
        self.normalizer = AddressNormalizer(uppercase=True, expand_abbrev=True)
        self.transliterator = HindiTransliterator(use_known_terms=True)

        # Initialize post-processing: rule refiner (optionally gazetteer-
        # backed) and a standalone gazetteer; either may be disabled.
        self.refiner = RuleBasedRefiner(use_gazetteer=use_gazetteer) if use_rules else None
        self.gazetteer = DelhiGazetteer() if use_gazetteer else None

        # Move model to device and switch to inference mode.
        if self.model is not None:
            self.model.to(device)
            self.model.eval()
|
| 82 |
+
|
| 83 |
+
@classmethod
|
| 84 |
+
def from_pretrained(
|
| 85 |
+
cls,
|
| 86 |
+
model_path: str | Path,
|
| 87 |
+
device: str = "cpu",
|
| 88 |
+
use_rules: bool = True,
|
| 89 |
+
use_gazetteer: bool = True,
|
| 90 |
+
) -> AddressParser:
|
| 91 |
+
"""
|
| 92 |
+
Load parser from pretrained model directory.
|
| 93 |
+
|
| 94 |
+
Args:
|
| 95 |
+
model_path: Path to saved model directory
|
| 96 |
+
device: Device to run on
|
| 97 |
+
use_rules: Enable rule-based post-processing
|
| 98 |
+
use_gazetteer: Enable gazetteer for validation
|
| 99 |
+
|
| 100 |
+
Returns:
|
| 101 |
+
Initialized AddressParser
|
| 102 |
+
"""
|
| 103 |
+
from address_parser.models import BertCRFForTokenClassification
|
| 104 |
+
|
| 105 |
+
model_path = Path(model_path)
|
| 106 |
+
|
| 107 |
+
# Load model
|
| 108 |
+
model = BertCRFForTokenClassification.from_pretrained(str(model_path), device=device)
|
| 109 |
+
|
| 110 |
+
# Load tokenizer
|
| 111 |
+
tokenizer = AutoTokenizer.from_pretrained(str(model_path))
|
| 112 |
+
|
| 113 |
+
return cls(
|
| 114 |
+
model=model,
|
| 115 |
+
tokenizer=tokenizer,
|
| 116 |
+
device=device,
|
| 117 |
+
use_rules=use_rules,
|
| 118 |
+
use_gazetteer=use_gazetteer,
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
@classmethod
|
| 122 |
+
def rules_only(cls, use_gazetteer: bool = True) -> AddressParser:
|
| 123 |
+
"""
|
| 124 |
+
Create a rules-only parser (no ML model).
|
| 125 |
+
|
| 126 |
+
Useful for testing or when model is not available.
|
| 127 |
+
"""
|
| 128 |
+
return cls(
|
| 129 |
+
model=None,
|
| 130 |
+
tokenizer=None,
|
| 131 |
+
use_rules=True,
|
| 132 |
+
use_gazetteer=use_gazetteer,
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
def parse(self, address: str) -> ParsedAddress:
|
| 136 |
+
"""
|
| 137 |
+
Parse a single address.
|
| 138 |
+
|
| 139 |
+
Args:
|
| 140 |
+
address: Raw address string
|
| 141 |
+
|
| 142 |
+
Returns:
|
| 143 |
+
ParsedAddress with extracted entities
|
| 144 |
+
"""
|
| 145 |
+
if not address or not address.strip():
|
| 146 |
+
return ParsedAddress(
|
| 147 |
+
raw_address=address,
|
| 148 |
+
normalized_address="",
|
| 149 |
+
entities=[]
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
# Preprocessing
|
| 153 |
+
normalized = self._preprocess(address)
|
| 154 |
+
|
| 155 |
+
# Model inference
|
| 156 |
+
entities = self._extract_entities(normalized)
|
| 157 |
+
|
| 158 |
+
# Post-processing
|
| 159 |
+
if self.refiner:
|
| 160 |
+
entities = self.refiner.refine(normalized, entities)
|
| 161 |
+
|
| 162 |
+
return ParsedAddress(
|
| 163 |
+
raw_address=address,
|
| 164 |
+
normalized_address=normalized,
|
| 165 |
+
entities=entities
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
def parse_with_timing(self, address: str) -> ParseResponse:
|
| 169 |
+
"""
|
| 170 |
+
Parse address and return response with timing info.
|
| 171 |
+
|
| 172 |
+
Args:
|
| 173 |
+
address: Raw address string
|
| 174 |
+
|
| 175 |
+
Returns:
|
| 176 |
+
ParseResponse with result and timing
|
| 177 |
+
"""
|
| 178 |
+
start = time.perf_counter()
|
| 179 |
+
|
| 180 |
+
try:
|
| 181 |
+
result = self.parse(address)
|
| 182 |
+
elapsed = (time.perf_counter() - start) * 1000
|
| 183 |
+
|
| 184 |
+
return ParseResponse(
|
| 185 |
+
success=True,
|
| 186 |
+
result=result,
|
| 187 |
+
inference_time_ms=elapsed
|
| 188 |
+
)
|
| 189 |
+
except Exception as e:
|
| 190 |
+
elapsed = (time.perf_counter() - start) * 1000
|
| 191 |
+
return ParseResponse(
|
| 192 |
+
success=False,
|
| 193 |
+
error=str(e),
|
| 194 |
+
inference_time_ms=elapsed
|
| 195 |
+
)
|
| 196 |
+
|
| 197 |
+
def parse_batch(self, addresses: list[str]) -> BatchParseResponse:
|
| 198 |
+
"""
|
| 199 |
+
Parse multiple addresses.
|
| 200 |
+
|
| 201 |
+
Args:
|
| 202 |
+
addresses: List of raw address strings
|
| 203 |
+
|
| 204 |
+
Returns:
|
| 205 |
+
BatchParseResponse with all results
|
| 206 |
+
"""
|
| 207 |
+
start = time.perf_counter()
|
| 208 |
+
|
| 209 |
+
results = []
|
| 210 |
+
for address in addresses:
|
| 211 |
+
result = self.parse(address)
|
| 212 |
+
results.append(result)
|
| 213 |
+
|
| 214 |
+
total_time = (time.perf_counter() - start) * 1000
|
| 215 |
+
avg_time = total_time / len(addresses) if addresses else 0
|
| 216 |
+
|
| 217 |
+
return BatchParseResponse(
|
| 218 |
+
success=True,
|
| 219 |
+
results=results,
|
| 220 |
+
total_inference_time_ms=total_time,
|
| 221 |
+
avg_inference_time_ms=avg_time
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
    def _preprocess(self, text: str) -> str:
        """Normalize an address string, transliterating Devanagari first.

        Args:
            text: Raw address string (may mix Latin and Devanagari).

        Returns:
            Normalized string suitable for the NER model.
        """
        # Handle Hindi text: transliterate Devanagari spans to Latin first
        # so the normalizer's abbreviation expansion sees Latin script.
        if self.transliterator.contains_devanagari(text):
            text = self.transliterator.normalize_mixed_script(text)

        # Normalize (uppercasing + abbreviation expansion per __init__).
        return self.normalizer.normalize(text)
|
| 232 |
+
|
| 233 |
+
    def _extract_entities(self, text: str) -> list[AddressEntity]:
        """Extract entities from normalized text using the NER model.

        Falls back to pure rule-based extraction when no model/tokenizer
        is loaded (rules-only mode).

        Args:
            text: Normalized address string.

        Returns:
            Entities with character offsets into ``text``.
        """
        if self.model is None or self.tokenizer is None:
            # Rules-only mode
            return self._extract_entities_rules_only(text)

        # Tokenize with character offsets so subword-level predictions can
        # be mapped back onto spans of the original string.
        encoding = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=self.config.max_length,
            return_offsets_mapping=True,
            padding=True,
        )

        # Get offset mapping for alignment; pop it because it is not a
        # model input.
        offset_mapping = encoding.pop("offset_mapping")[0].tolist()

        # Move to device
        input_ids = encoding["input_ids"].to(self.device)
        attention_mask = encoding["attention_mask"].to(self.device)

        # Inference (single-sample batch, hence [0]).
        predictions = self.model.decode(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )[0]  # First (and only) sample

        # Convert tag IDs + offsets into AddressEntity spans.
        entities = self._predictions_to_entities(
            text=text,
            predictions=predictions,
            offset_mapping=offset_mapping,
            attention_mask=encoding["attention_mask"][0].tolist(),
        )

        return entities
|
| 271 |
+
|
| 272 |
+
def _extract_entities_rules_only(self, text: str) -> list[AddressEntity]:
    """Extract entities with regex/lookup heuristics only (no ML model).

    Entity spans always index into the original *text*; matching is done
    case-insensitively against an uppercased copy.
    """
    import re

    found: list[AddressEntity] = []
    upper = text.upper()

    def emit(label: str, start: int, end: int, conf: float,
             value: str | None = None) -> None:
        # Append one entity; value defaults to the original text span.
        found.append(AddressEntity(
            label=label,
            value=value if value is not None else text[start:end],
            start=start,
            end=end,
            confidence=conf,
        ))

    # Multi-word localities known to appear in Delhi addresses
    # (first occurrence of each).
    for name in (
        "LAJPAT NAGAR", "MALVIYA NAGAR", "HAUZ KHAS", "GREEN PARK",
        "GREATER KAILASH", "DEFENCE COLONY", "SOUTH EXTENSION", "KALKAJI",
        "CIVIL LINES", "MODEL TOWN", "MUKHERJEE NAGAR", "KAMLA NAGAR",
        "PREET VIHAR", "MAYUR VIHAR", "LAKSHMI NAGAR", "GANDHI NAGAR",
        "JANAKPURI", "DWARKA", "UTTAM NAGAR", "TILAK NAGAR", "RAJOURI GARDEN",
        "PUNJABI BAGH", "PASCHIM VIHAR", "KAROL BAGH", "CONNAUGHT PLACE",
        "KAUNWAR SINGH NAGAR", "PALAM COLONY", "RAJ NAGAR", "SADH NAGAR",
        "VIJAY ENCLAVE", "DURGA PARK", "SWARN PARK", "CHANCHAL PARK",
    ):
        at = upper.find(name)
        if at >= 0:
            emit("SUBAREA", at, at + len(name), 0.95)

    # Directional district names; the canonical form is stored as value.
    for pat, canonical in (
        (r'\bSOUTH\s+DELHI\b', "SOUTH DELHI"),
        (r'\bNORTH\s+DELHI\b', "NORTH DELHI"),
        (r'\bEAST\s+DELHI\b', "EAST DELHI"),
        (r'\bWEST\s+DELHI\b', "WEST DELHI"),
        (r'\bCENTRAL\s+DELHI\b', "CENTRAL DELHI"),
        (r'\bOUTER\s+DELHI\b', "OUTER DELHI"),
    ):
        m = re.search(pat, upper)
        if m:
            emit("AREA", m.start(), m.end(), 0.95, value=canonical)

    # House number: patterns ordered most-specific first; only the first
    # matching pattern contributes.
    for pat in (
        r'\b(?:FLAT\s*NO\.?\s*)[A-Z]?[-]?\d+[A-Z]?(?:[-/]\d+)*\b',
        r'\b(?:PLOT\s*NO\.?)\s*[A-Z]?\d+[A-Z]?(?:[-/]\d+)*\b',
        r'\b(?:H\.?\s*NO\.?|HOUSE\s*NO\.?|HNO)\s*[A-Z]?\d+[A-Z]?(?:[-/]\d+)*\b',
        r'\b[RW]Z[-\s]?[A-Z]?[-/]?\d+[A-Z]?(?:[-/]\d+)*\b',
    ):
        m = re.search(pat, upper)
        if m:
            emit("HOUSE_NUMBER", m.start(), m.end(), 0.90)
            break

    # One-shot span labels, each driven by a single regex.
    for tag, pat in (
        ("FLOOR", r'\b(?:GROUND|FIRST|SECOND|THIRD|FOURTH|1ST|2ND|3RD|4TH|GF|FF|SF|TF)\s*(?:FLOOR|FLR)?\b'),
        ("GALI", r'\b(?:GALI|GALLI|LANE)\s*(?:NO\.?)?\s*\d+[A-Z]?\b'),
        ("BLOCK", r'\b(?:BLOCK|BLK|BL)\s*[A-Z]?[-]?[A-Z0-9]+\b'),
        ("SECTOR", r'\b(?:SECTOR|SEC)\s*\d+[A-Z]?\b'),
        ("KHASRA", r'\b(?:KH\.?\s*(?:NO\.?)?\s*|KHASRA\s*(?:NO\.?)?\s*)[\d/]+(?:[/-]\d+)*\b'),
    ):
        m = re.search(pat, upper)
        if m:
            emit(tag, m.start(), m.end(), 0.90)

    # Pincode: six digits starting with 110 (Delhi range).
    m = re.search(r'\b110\d{3}\b', text)
    if m:
        emit("PINCODE", m.start(), m.end(), 1.0, value=m.group(0))

    # City: prefer an explicit "NEW DELHI"; otherwise use the last bare
    # "DELHI" mention (usually the trailing city token).
    if "DELHI" in upper:
        m = re.search(r'\bNEW\s+DELHI\b', upper)
        if m:
            emit("CITY", m.start(), m.end(), 0.95, value="NEW DELHI")
        else:
            hits = [h.start() for h in re.finditer(r'\bDELHI\b', upper)]
            if hits:
                last = hits[-1]
                emit("CITY", last, last + 5, 0.90, value="DELHI")

    return found
|
| 440 |
+
|
| 441 |
+
def _predictions_to_entities(
    self,
    text: str,
    predictions: list[int],
    offset_mapping: list[tuple[int, int]],
    attention_mask: list[int],
) -> list[AddressEntity]:
    """Decode per-token BIO predictions into character-span entities.

    Args:
        text: Original (preprocessed) address string.
        predictions: One label id per wordpiece token.
        offset_mapping: ``(char_start, char_end)`` per token; special
            tokens map to ``(0, 0)``.
        attention_mask: 1 for real tokens, 0 for padding.

    Returns:
        Entities whose spans index back into ``text``.
    """
    entities = []
    current_entity = None

    for pred, offset, mask in zip(predictions, offset_mapping, attention_mask):
        # Skip padding and special tokens. BUGFIX: offsets arrive as
        # [0, 0] *lists* (tensor.tolist() in _extract_entities), so the
        # previous `offset == (0, 0)` tuple comparison was always False
        # and [CLS]/[SEP] tokens were never skipped. Normalizing with
        # tuple() makes the check work for both lists and tuples.
        if mask == 0 or tuple(offset) == (0, 0):
            continue

        label = ID2LABEL.get(pred, "O")
        start, end = offset

        if label == "O":
            # Outside any entity: close the one in progress, if any.
            if current_entity:
                entities.append(self._finalize_entity(current_entity, text))
                current_entity = None
        elif label.startswith("B-"):
            # Begin a new entity, closing the previous one first.
            if current_entity:
                entities.append(self._finalize_entity(current_entity, text))

            entity_type = label[2:]  # strip "B-" prefix
            current_entity = {
                "label": entity_type,
                "start": start,
                "end": end,
                "confidence": 0.9,  # base confidence for model spans
            }
        elif label.startswith("I-"):
            entity_type = label[2:]
            if current_entity and current_entity["label"] == entity_type:
                # Continuation of the open entity: extend its span.
                current_entity["end"] = end
            else:
                # I- without a matching B-: treat as an implicit begin
                # at slightly reduced confidence.
                if current_entity:
                    entities.append(self._finalize_entity(current_entity, text))
                current_entity = {
                    "label": entity_type,
                    "start": start,
                    "end": end,
                    "confidence": 0.85,
                }

    # Flush the trailing entity, if the sequence ended inside one.
    if current_entity:
        entities.append(self._finalize_entity(current_entity, text))

    return entities
|
| 497 |
+
|
| 498 |
+
def _finalize_entity(self, entity_dict: dict, text: str) -> AddressEntity:
    """Materialize an in-progress entity dict as an AddressEntity.

    The value is the (stripped) text covered by the recorded span.
    """
    span_start = entity_dict["start"]
    span_end = entity_dict["end"]
    return AddressEntity(
        label=entity_dict["label"],
        value=text[span_start:span_end].strip(),
        start=span_start,
        end=span_end,
        confidence=entity_dict["confidence"],
    )
|
| 509 |
+
|
| 510 |
+
|
| 511 |
+
# Convenience function for quick parsing
|
| 512 |
+
def parse_address(address: str, model_path: str | None = None) -> ParsedAddress:
    """
    Quick address parsing function.

    Args:
        address: Address to parse
        model_path: Optional path to model (uses rules-only if None)

    Returns:
        ParsedAddress
    """
    # Build a model-backed parser when a checkpoint path is given,
    # otherwise a rules-only one.
    parser = (
        AddressParser.from_pretrained(model_path)
        if model_path
        else AddressParser.rules_only()
    )
    return parser.parse(address)
|
src/address_parser/postprocessing/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Post-processing module for entity refinement and validation."""
|
| 2 |
+
|
| 3 |
+
from address_parser.postprocessing.gazetteer import DelhiGazetteer
|
| 4 |
+
from address_parser.postprocessing.rules import RuleBasedRefiner
|
| 5 |
+
|
| 6 |
+
__all__ = ["RuleBasedRefiner", "DelhiGazetteer"]
|
src/address_parser/postprocessing/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (474 Bytes). View file
|
|
|
src/address_parser/postprocessing/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (471 Bytes). View file
|
|
|
src/address_parser/postprocessing/__pycache__/gazetteer.cpython-312.pyc
ADDED
|
Binary file (6.21 kB). View file
|
|
|
src/address_parser/postprocessing/__pycache__/gazetteer.cpython-314.pyc
ADDED
|
Binary file (9.09 kB). View file
|
|
|
src/address_parser/postprocessing/__pycache__/rules.cpython-312.pyc
ADDED
|
Binary file (13 kB). View file
|
|
|
src/address_parser/postprocessing/__pycache__/rules.cpython-314.pyc
ADDED
|
Binary file (24.7 kB). View file
|
|
|
src/address_parser/postprocessing/gazetteer.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Delhi locality gazetteer for fuzzy matching and validation."""
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
from rapidfuzz import fuzz, process
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class DelhiGazetteer:
|
| 8 |
+
"""
|
| 9 |
+
Gazetteer of Delhi localities, areas, and common address terms.
|
| 10 |
+
|
| 11 |
+
Used for:
|
| 12 |
+
- Fuzzy matching to correct misspellings
|
| 13 |
+
- Entity validation
|
| 14 |
+
- Confidence boosting for known locations
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
# Major Delhi localities/areas
|
| 18 |
+
LOCALITIES = {
|
| 19 |
+
# South Delhi
|
| 20 |
+
"SAKET", "MALVIYA NAGAR", "HAUZ KHAS", "GREEN PARK", "GREATER KAILASH",
|
| 21 |
+
"DEFENCE COLONY", "LAJPAT NAGAR", "SOUTH EXTENSION", "CHITTARANJAN PARK",
|
| 22 |
+
"KALKAJI", "NEHRU PLACE", "OKHLA", "JASOLA", "SARITA VIHAR",
|
| 23 |
+
"ALAKNANDA", "SAFDARJUNG", "VASANT KUNJ", "MEHRAULI", "CHATTARPUR",
|
| 24 |
+
|
| 25 |
+
# North Delhi
|
| 26 |
+
"CIVIL LINES", "MODEL TOWN", "MUKHERJEE NAGAR", "KAMLA NAGAR",
|
| 27 |
+
"SHAKTI NAGAR", "GULABI BAGH", "ASHOK VIHAR", "SHALIMAR BAGH",
|
| 28 |
+
"PITAMPURA", "ROHINI", "NARELA", "BAWANA", "ALIPUR",
|
| 29 |
+
|
| 30 |
+
# East Delhi
|
| 31 |
+
"PREET VIHAR", "MAYUR VIHAR", "PATPARGANJ", "PANDAV NAGAR",
|
| 32 |
+
"LAKSHMI NAGAR", "SHAKARPUR", "GEETA COLONY", "GANDHI NAGAR",
|
| 33 |
+
"DILSHAD GARDEN", "SEELAMPUR", "SHAHDARA", "ANAND VIHAR",
|
| 34 |
+
|
| 35 |
+
# West Delhi
|
| 36 |
+
"JANAKPURI", "DWARKA", "PALAM", "UTTAM NAGAR", "VIKASPURI",
|
| 37 |
+
"TILAK NAGAR", "RAJOURI GARDEN", "PUNJABI BAGH", "PASCHIM VIHAR",
|
| 38 |
+
"MEERA BAGH", "PEERAGARHI", "MUNDKA", "NANGLOI", "NAJAFGARH",
|
| 39 |
+
"BINDAPUR", "KAKROLA", "MOHAN GARDEN", "NAWADA",
|
| 40 |
+
|
| 41 |
+
# Central Delhi
|
| 42 |
+
"CONNAUGHT PLACE", "KAROL BAGH", "PAHARGANJ", "DARYAGANJ",
|
| 43 |
+
"CHANDNI CHOWK", "SADAR BAZAAR", "RAJENDER NAGAR", "PATEL NAGAR",
|
| 44 |
+
"KIRTI NAGAR", "MOTIA KHAN", "ANAND PARBAT", "JHANDEWALAN",
|
| 45 |
+
|
| 46 |
+
# New Delhi
|
| 47 |
+
"CHANAKYAPURI", "LODHI ROAD", "GOLF LINKS", "JORBAGH",
|
| 48 |
+
"SUNDAR NAGAR", "NIZAMUDDIN", "LODI COLONY", "PANDARA ROAD",
|
| 49 |
+
|
| 50 |
+
# Other areas
|
| 51 |
+
"BADARPUR", "TUGHLAKABAD", "SANGAM VIHAR", "MADANPUR KHADAR",
|
| 52 |
+
"GOVINDPURI", "AMBEDKAR NAGAR", "LADO SARAI", "TIGRI",
|
| 53 |
+
"BURARI", "KARAWAL NAGAR", "BHAJANPURA", "MUSTAFABAD",
|
| 54 |
+
"JAFFRABAD", "MAUJPUR", "GOKALPUR", "SEEMAPURI",
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
# Common colony/nagar suffixes
|
| 58 |
+
NAGAR_SUFFIXES = {
|
| 59 |
+
"NAGAR", "VIHAR", "COLONY", "ENCLAVE", "EXTENSION", "PURI",
|
| 60 |
+
"PARK", "GARDEN", "BAGH", "KUNJ", "APARTMENT", "RESIDENCY",
|
| 61 |
+
"COMPLEX", "PHASE", "SECTOR", "BLOCK", "POCKET",
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
# Common area names from the training data
|
| 65 |
+
COMMON_AREAS = {
|
| 66 |
+
"KAUNWAR SINGH NAGAR", "BABA HARI DAS COLONY", "TIKARI KALA",
|
| 67 |
+
"CHANCHAL PARK", "SWARN PARK", "MUNDKA", "NANGLOI", "BAKKARWALA",
|
| 68 |
+
"MAJRA DABAS", "CHAND NAGAR", "RANHOLA", "BAPROLA", "POOTH KHURD",
|
| 69 |
+
"KIRARI", "SULTANPURI", "MANGOLPURI", "BEGUMPUR", "KADIPUR",
|
| 70 |
+
"RAMA VIHAR", "PREM NAGAR", "VIJAY PARK", "AMBICA VIHAR",
|
| 71 |
+
"SHIV PURI", "BUDH VIHAR", "POOTH KALAN", "QUTUBGARH",
|
| 72 |
+
"RANI KHERA", "SHAHABAD DAIRY", "SAMAIPUR", "JAHANGIRPURI",
|
| 73 |
+
"SANNOTH", "KANJHAWALA", "BAWANA", "ALIPUR",
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
# Common Hindi transliterated terms
|
| 77 |
+
HINDI_TERMS = {
|
| 78 |
+
"MOHALLA", "GALI", "KATRA", "BASTI", "BAZAR", "CHOWK",
|
| 79 |
+
"GANJ", "PUR", "ABAD", "GARH", "GAON", "KHERA", "KHURD", "KALAN",
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
def __init__(self, min_similarity: float = 80.0):
|
| 83 |
+
"""
|
| 84 |
+
Initialize gazetteer.
|
| 85 |
+
|
| 86 |
+
Args:
|
| 87 |
+
min_similarity: Minimum fuzzy match score (0-100)
|
| 88 |
+
"""
|
| 89 |
+
self.min_similarity = min_similarity
|
| 90 |
+
|
| 91 |
+
# Build combined set for matching
|
| 92 |
+
self.all_places = (
|
| 93 |
+
self.LOCALITIES |
|
| 94 |
+
self.COMMON_AREAS |
|
| 95 |
+
{f"{term}" for term in self.HINDI_TERMS}
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
def fuzzy_match(
|
| 99 |
+
self,
|
| 100 |
+
text: str,
|
| 101 |
+
limit: int = 3
|
| 102 |
+
) -> list[tuple[str, float]]:
|
| 103 |
+
"""
|
| 104 |
+
Find fuzzy matches for a text in the gazetteer.
|
| 105 |
+
|
| 106 |
+
Args:
|
| 107 |
+
text: Text to match
|
| 108 |
+
limit: Maximum number of matches
|
| 109 |
+
|
| 110 |
+
Returns:
|
| 111 |
+
List of (matched_text, score) tuples
|
| 112 |
+
"""
|
| 113 |
+
if not text or len(text) < 3:
|
| 114 |
+
return []
|
| 115 |
+
|
| 116 |
+
matches = process.extract(
|
| 117 |
+
text.upper(),
|
| 118 |
+
self.all_places,
|
| 119 |
+
scorer=fuzz.ratio,
|
| 120 |
+
limit=limit
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
return [(m[0], m[1]) for m in matches if m[1] >= self.min_similarity]
|
| 124 |
+
|
| 125 |
+
def is_known_locality(self, text: str, threshold: float = 85.0) -> bool:
|
| 126 |
+
"""Check if text matches a known locality."""
|
| 127 |
+
matches = self.fuzzy_match(text, limit=1)
|
| 128 |
+
return bool(matches and matches[0][1] >= threshold)
|
| 129 |
+
|
| 130 |
+
def correct_spelling(self, text: str) -> str | None:
|
| 131 |
+
"""
|
| 132 |
+
Attempt to correct spelling using gazetteer.
|
| 133 |
+
|
| 134 |
+
Returns corrected text or None if no good match.
|
| 135 |
+
"""
|
| 136 |
+
matches = self.fuzzy_match(text, limit=1)
|
| 137 |
+
if matches and matches[0][1] >= 90.0:
|
| 138 |
+
return matches[0][0]
|
| 139 |
+
return None
|
| 140 |
+
|
| 141 |
+
def get_locality_type(self, text: str) -> str | None:
|
| 142 |
+
"""
|
| 143 |
+
Determine if text contains a known locality type suffix.
|
| 144 |
+
|
| 145 |
+
Returns the suffix type or None.
|
| 146 |
+
"""
|
| 147 |
+
text_upper = text.upper()
|
| 148 |
+
for suffix in self.NAGAR_SUFFIXES:
|
| 149 |
+
if text_upper.endswith(suffix):
|
| 150 |
+
return suffix
|
| 151 |
+
return None
|
| 152 |
+
|
| 153 |
+
def validate_pincode(self, pincode: str, locality: str | None = None) -> bool:
|
| 154 |
+
"""
|
| 155 |
+
Validate if a pincode is valid for Delhi.
|
| 156 |
+
|
| 157 |
+
Delhi pincodes are in range 110001-110097.
|
| 158 |
+
"""
|
| 159 |
+
if not pincode or not pincode.isdigit() or len(pincode) != 6:
|
| 160 |
+
return False
|
| 161 |
+
|
| 162 |
+
code = int(pincode)
|
| 163 |
+
# Delhi pincode range
|
| 164 |
+
return 110001 <= code <= 110097
|
src/address_parser/postprocessing/rules.py
ADDED
|
@@ -0,0 +1,536 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Rule-based post-processing for entity refinement."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
from address_parser.postprocessing.gazetteer import DelhiGazetteer
|
| 6 |
+
from address_parser.schemas import AddressEntity
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class RuleBasedRefiner:
|
| 10 |
+
"""
|
| 11 |
+
Post-processing rules for refining NER predictions.
|
| 12 |
+
|
| 13 |
+
Handles:
|
| 14 |
+
- Pattern-based entity detection (pincodes, khasra numbers)
|
| 15 |
+
- Entity boundary correction using gazetteer
|
| 16 |
+
- Entity merging for fragmented predictions
|
| 17 |
+
- Confidence adjustment
|
| 18 |
+
- Validation and filtering
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
# Regex patterns for deterministic entities
|
| 22 |
+
PATTERNS = {
|
| 23 |
+
"PINCODE": re.compile(r'\b[1-9]\d{5}\b'),
|
| 24 |
+
"KHASRA": re.compile(
|
| 25 |
+
r'\b(?:KH\.?\s*(?:NO\.?)?\s*|KHASRA\s*(?:NO\.?)?\s*)[\d/]+(?:[/-]\d+)*\b',
|
| 26 |
+
re.IGNORECASE
|
| 27 |
+
),
|
| 28 |
+
"HOUSE_NUMBER": re.compile(
|
| 29 |
+
r'\b(?:H\.?\s*(?:NO\.?)?\s*|HOUSE\s*(?:NO\.?)?\s*|PLOT\s*(?:NO\.?)?\s*)?[A-Z]?\d+[A-Z]?(?:[-/]\d+)*\b',
|
| 30 |
+
re.IGNORECASE
|
| 31 |
+
),
|
| 32 |
+
"FLOOR": re.compile(
|
| 33 |
+
r'\b(?:GROUND|FIRST|SECOND|THIRD|FOURTH|FIFTH|1ST|2ND|3RD|4TH|5TH|GF|FF|SF|TF)?\s*(?:FLOOR|FLR)\b',
|
| 34 |
+
re.IGNORECASE
|
| 35 |
+
),
|
| 36 |
+
"BLOCK": re.compile(
|
| 37 |
+
r'\b(?:BLOCK|BLK|BL)\s*[A-Z]?[-]?[A-Z0-9]+\b',
|
| 38 |
+
re.IGNORECASE
|
| 39 |
+
),
|
| 40 |
+
"SECTOR": re.compile(
|
| 41 |
+
r'\b(?:SECTOR|SEC)\s*\d+[A-Z]?\b',
|
| 42 |
+
re.IGNORECASE
|
| 43 |
+
),
|
| 44 |
+
"GALI": re.compile(
|
| 45 |
+
r'\b(?:GALI|GALLI|LANE)\s*(?:NO\.?)?\s*\d+[A-Z]?\b',
|
| 46 |
+
re.IGNORECASE
|
| 47 |
+
),
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
# Area patterns - directional areas
|
| 51 |
+
AREA_PATTERNS = [
|
| 52 |
+
(re.compile(r'\bSOUTH\s+DELHI\b', re.IGNORECASE), "SOUTH DELHI"),
|
| 53 |
+
(re.compile(r'\bNORTH\s+DELHI\b', re.IGNORECASE), "NORTH DELHI"),
|
| 54 |
+
(re.compile(r'\bEAST\s+DELHI\b', re.IGNORECASE), "EAST DELHI"),
|
| 55 |
+
(re.compile(r'\bWEST\s+DELHI\b', re.IGNORECASE), "WEST DELHI"),
|
| 56 |
+
(re.compile(r'\bCENTRAL\s+DELHI\b', re.IGNORECASE), "CENTRAL DELHI"),
|
| 57 |
+
(re.compile(r'\bSOUTH\s+WEST\s+DELHI\b', re.IGNORECASE), "SOUTH WEST DELHI"),
|
| 58 |
+
(re.compile(r'\bNORTH\s+WEST\s+DELHI\b', re.IGNORECASE), "NORTH WEST DELHI"),
|
| 59 |
+
(re.compile(r'\bNORTH\s+EAST\s+DELHI\b', re.IGNORECASE), "NORTH EAST DELHI"),
|
| 60 |
+
(re.compile(r'\bSOUTH\s+EAST\s+DELHI\b', re.IGNORECASE), "SOUTH EAST DELHI"),
|
| 61 |
+
(re.compile(r'\bOUTER\s+DELHI\b', re.IGNORECASE), "OUTER DELHI"),
|
| 62 |
+
]
|
| 63 |
+
|
| 64 |
+
# City patterns
|
| 65 |
+
CITY_PATTERNS = [
|
| 66 |
+
(re.compile(r'\bNEW\s+DELHI\b', re.IGNORECASE), "NEW DELHI"),
|
| 67 |
+
(re.compile(r'\bDELHI\b', re.IGNORECASE), "DELHI"),
|
| 68 |
+
(re.compile(r'\bNOIDA\b', re.IGNORECASE), "NOIDA"),
|
| 69 |
+
(re.compile(r'\bGURUGRAM\b', re.IGNORECASE), "GURUGRAM"),
|
| 70 |
+
(re.compile(r'\bGURGAON\b', re.IGNORECASE), "GURGAON"),
|
| 71 |
+
(re.compile(r'\bFARIDABAD\b', re.IGNORECASE), "FARIDABAD"),
|
| 72 |
+
(re.compile(r'\bGHAZIABAD\b', re.IGNORECASE), "GHAZIABAD"),
|
| 73 |
+
]
|
| 74 |
+
|
| 75 |
+
# State patterns
|
| 76 |
+
STATE_PATTERNS = [
|
| 77 |
+
(re.compile(r'\bDELHI\b', re.IGNORECASE), "DELHI"),
|
| 78 |
+
(re.compile(r'\bHARYANA\b', re.IGNORECASE), "HARYANA"),
|
| 79 |
+
(re.compile(r'\bUTTAR\s+PRADESH\b', re.IGNORECASE), "UTTAR PRADESH"),
|
| 80 |
+
(re.compile(r'\bU\.?\s*P\.?\b'), "UTTAR PRADESH"),
|
| 81 |
+
]
|
| 82 |
+
|
| 83 |
+
# Colony/Nagar indicators
|
| 84 |
+
COLONY_SUFFIXES = [
|
| 85 |
+
"NAGAR", "VIHAR", "COLONY", "ENCLAVE", "PARK", "GARDEN",
|
| 86 |
+
"PURI", "BAGH", "KUNJ", "EXTENSION", "EXTN", "PHASE",
|
| 87 |
+
]
|
| 88 |
+
|
| 89 |
+
# Known multi-word localities that get fragmented
|
| 90 |
+
KNOWN_LOCALITIES = [
|
| 91 |
+
"LAJPAT NAGAR", "MALVIYA NAGAR", "KAROL BAGH", "HAUZ KHAS",
|
| 92 |
+
"GREEN PARK", "GREATER KAILASH", "DEFENCE COLONY", "SOUTH EXTENSION",
|
| 93 |
+
"CHITTARANJAN PARK", "NEHRU PLACE", "SARITA VIHAR", "VASANT KUNJ",
|
| 94 |
+
"CIVIL LINES", "MODEL TOWN", "MUKHERJEE NAGAR", "KAMLA NAGAR",
|
| 95 |
+
"ASHOK VIHAR", "SHALIMAR BAGH", "PREET VIHAR", "MAYUR VIHAR",
|
| 96 |
+
"LAKSHMI NAGAR", "GANDHI NAGAR", "DILSHAD GARDEN", "ANAND VIHAR",
|
| 97 |
+
"UTTAM NAGAR", "TILAK NAGAR", "RAJOURI GARDEN", "PUNJABI BAGH",
|
| 98 |
+
"PASCHIM VIHAR", "CONNAUGHT PLACE", "RAJENDER NAGAR", "PATEL NAGAR",
|
| 99 |
+
"KIRTI NAGAR", "LODHI ROAD", "GOLF LINKS", "SANGAM VIHAR",
|
| 100 |
+
"GOVINDPURI", "AMBEDKAR NAGAR", "LADO SARAI", "KAUNWAR SINGH NAGAR",
|
| 101 |
+
"BABA HARI DAS COLONY", "SWARN PARK", "CHANCHAL PARK", "DURGA PARK",
|
| 102 |
+
"RAJ NAGAR", "SADH NAGAR", "VIJAY ENCLAVE", "PALAM COLONY",
|
| 103 |
+
]
|
| 104 |
+
|
| 105 |
+
def __init__(self, use_gazetteer: bool = True):
    """
    Initialize refiner.

    Args:
        use_gazetteer: Use gazetteer for validation/correction
    """
    if use_gazetteer:
        self.gazetteer = DelhiGazetteer()
    else:
        self.gazetteer = None
|
| 113 |
+
|
| 114 |
+
def refine(
    self,
    text: str,
    entities: list[AddressEntity]
) -> list[AddressEntity]:
    """
    Refine entity predictions.

    Applies the full post-processing pipeline, in order: locality
    repair, pattern-based additions, area detection, boundary
    correction, fragment merging, confidence adjustment, overlap
    removal, and final validation.

    Args:
        text: Original address text
        entities: Predicted entities from NER model

    Returns:
        Refined list of entities
    """
    current = list(entities)

    current = self._fix_known_localities(text, current)       # gazetteer locality repair
    current = self._add_pattern_entities(text, current)       # regex-detected additions
    current = self._add_area_patterns(text, current)          # directional areas
    current = self._correct_boundaries(text, current)         # span boundary fixes
    current = self._merge_fragmented_entities(text, current)  # join split fragments
    current = self._adjust_confidence(text, current)          # confidence tuning
    current = self._remove_overlaps(current)                  # dedupe/overlap pruning

    return self._validate_entities(current)
|
| 156 |
+
|
| 157 |
+
def _fix_known_localities(
    self,
    text: str,
    entities: list[AddressEntity]
) -> list[AddressEntity]:
    """Replace fragmented locality predictions with canonical gazetteer spans."""
    upper = text.upper()
    claimed: list[tuple[int, int]] = []
    canonical: list[AddressEntity] = []

    # Locate every occurrence of each known multi-word locality.
    for name in self.KNOWN_LOCALITIES:
        search_from = 0
        while True:
            hit = upper.find(name, search_from)
            if hit < 0:
                break
            stop = hit + len(name)
            canonical.append(AddressEntity(
                label="SUBAREA",
                value=text[hit:stop],
                start=hit,
                end=stop,
                confidence=0.95,
            ))
            claimed.append((hit, stop))
            search_from = stop

    # Directional-area patterns, skipping spans already claimed above.
    for pattern, area_name in self.AREA_PATTERNS:
        m = pattern.search(text)
        if m is None:
            continue
        lo, hi = m.start(), m.end()
        if all(hi <= s or lo >= e for s, e in claimed):
            canonical.append(AddressEntity(
                label="AREA",
                value=area_name,
                start=lo,
                end=hi,
                confidence=0.95,
            ))
            claimed.append((lo, hi))

    # Drop model-predicted place entities that overlap a canonical span;
    # everything else passes through untouched.
    kept: list[AddressEntity] = []
    for ent in entities:
        clashes = any(ent.end > s and ent.start < e for s, e in claimed)
        if clashes and ent.label in ("AREA", "SUBAREA", "COLONY", "CITY"):
            continue
        kept.append(ent)

    kept.extend(canonical)
    return kept
|
| 224 |
+
|
| 225 |
+
def _add_area_patterns(
    self,
    text: str,
    entities: list[AddressEntity]
) -> list[AddressEntity]:
    """No-op pass-through: directional areas (SOUTH DELHI, etc.) are
    injected by ``_fix_known_localities`` to avoid duplicate spans.

    Retained so the refinement pipeline keeps a stable step sequence.
    """
    return entities
|
| 233 |
+
|
| 234 |
+
def _merge_fragmented_entities(
    self,
    text: str,
    entities: list[AddressEntity]
) -> list[AddressEntity]:
    """Coalesce adjacent place-type entities that form one known locality."""
    if len(entities) < 2:
        return entities

    place_labels = ("AREA", "SUBAREA", "COLONY", "CITY")
    ordered = sorted(entities, key=lambda e: e.start)
    total = len(ordered)
    merged: list[AddressEntity] = []
    i = 0

    while i < total:
        head = ordered[i]

        # Non-place entities are copied through unchanged.
        if head.label not in place_labels:
            merged.append(head)
            i += 1
            continue

        # Greedily absorb neighbours (gap of at most 2 chars, i.e. a
        # separator) while the combined text still names a locality.
        span_end = head.end
        best_conf = head.confidence
        j = i + 1
        while j < total:
            candidate = ordered[j]
            close_enough = (candidate.start - span_end) <= 2
            if not (close_enough and candidate.label in place_labels):
                break
            if not self._is_valid_merge(text[head.start:candidate.end].strip()):
                break
            span_end = candidate.end
            best_conf = max(best_conf, candidate.confidence)
            j += 1

        if j > i + 1:
            # At least one neighbour absorbed: emit the combined span.
            merged.append(AddressEntity(
                label=head.label,
                value=text[head.start:span_end].strip(),
                start=head.start,
                end=span_end,
                confidence=best_conf,
            ))
            i = j
        else:
            merged.append(head)
            i += 1

    return merged
|
| 292 |
+
|
| 293 |
+
def _is_valid_merge(self, text: str) -> bool:
|
| 294 |
+
"""Check if merged text forms a valid locality name."""
|
| 295 |
+
text_upper = text.upper().strip()
|
| 296 |
+
|
| 297 |
+
# Check against known localities
|
| 298 |
+
if text_upper in self.KNOWN_LOCALITIES:
|
| 299 |
+
return True
|
| 300 |
+
|
| 301 |
+
# Check gazetteer
|
| 302 |
+
if self.gazetteer and self.gazetteer.is_known_locality(text_upper, threshold=80):
|
| 303 |
+
return True
|
| 304 |
+
|
| 305 |
+
# Check if ends with common suffix
|
| 306 |
+
for suffix in self.COLONY_SUFFIXES:
|
| 307 |
+
if text_upper.endswith(suffix):
|
| 308 |
+
return True
|
| 309 |
+
|
| 310 |
+
return False
|
| 311 |
+
|
| 312 |
+
    def _add_pattern_entities(
        self,
        text: str,
        entities: list[AddressEntity]
    ) -> list[AddressEntity]:
        """Add rule-based entities the model missed (PINCODE, CITY, STATE).

        Each category is only added when no entity of that label already
        exists, and only at spans not already claimed by a model entity.

        Args:
            text: The (normalized) address text the spans refer to.
            entities: Model-predicted entities.

        Returns:
            A new list: the input entities plus any pattern-derived ones.
        """
        result = list(entities)
        # Spans already claimed by model entities; used to avoid duplicates.
        existing_spans = {(e.start, e.end) for e in entities}

        # --- PINCODE: regex match, rule-based so confidence is 1.0 ---
        if not any(e.label == "PINCODE" for e in entities):
            match = self.PATTERNS["PINCODE"].search(text)
            if match and (match.start(), match.end()) not in existing_spans:
                result.append(AddressEntity(
                    label="PINCODE",
                    value=match.group(0),
                    start=match.start(),
                    end=match.end(),
                    confidence=1.0  # Rule-based, high confidence
                ))

        # --- CITY: DELHI addresses always have DELHI as city ---
        has_city = any(e.label == "CITY" for e in result)
        if not has_city:
            # If text contains DELHI anywhere, set city to DELHI.
            if "DELHI" in text.upper():
                # Find the last occurrence of DELHI (usually the city mention,
                # since earlier ones tend to be part of locality names).
                delhi_positions = [m.start() for m in re.finditer(r'\bDELHI\b', text.upper())]
                if delhi_positions:
                    pos = delhi_positions[-1]  # Use last occurrence
                    result.append(AddressEntity(
                        label="CITY",
                        value="DELHI",
                        start=pos,
                        end=pos + 5,  # len("DELHI")
                        confidence=0.90
                    ))
            else:
                # Otherwise fall back to the configured city patterns;
                # first match wins.
                for pattern, city_name in self.CITY_PATTERNS:
                    if city_name == "DELHI":
                        continue  # Already handled above
                    match = pattern.search(text)
                    if match and (match.start(), match.end()) not in existing_spans:
                        result.append(AddressEntity(
                            label="CITY",
                            value=city_name,
                            start=match.start(),
                            end=match.end(),
                            confidence=0.95
                        ))
                        break

        # --- STATE: first matching state pattern wins ---
        # NOTE: this guard checks the *input* list rather than `result`;
        # equivalent here because nothing above adds a STATE entity.
        if not any(e.label == "STATE" for e in entities):
            for pattern, state_name in self.STATE_PATTERNS:
                match = pattern.search(text)
                if match and (match.start(), match.end()) not in existing_spans:
                    # Avoid tagging "DELHI" as state if it's already a city.
                    if state_name == "DELHI" and any(e.label == "CITY" and "DELHI" in e.value.upper() for e in result):
                        continue
                    result.append(AddressEntity(
                        label="STATE",
                        value=state_name,
                        start=match.start(),
                        end=match.end(),
                        confidence=0.90
                    ))
                    break

        return result
|
| 383 |
+
|
| 384 |
+
def _correct_boundaries(
|
| 385 |
+
self,
|
| 386 |
+
text: str,
|
| 387 |
+
entities: list[AddressEntity]
|
| 388 |
+
) -> list[AddressEntity]:
|
| 389 |
+
"""Correct entity boundaries based on patterns."""
|
| 390 |
+
result = []
|
| 391 |
+
|
| 392 |
+
for entity in entities:
|
| 393 |
+
corrected = entity.model_copy()
|
| 394 |
+
|
| 395 |
+
# Expand KHASRA to include full pattern
|
| 396 |
+
if entity.label == "KHASRA":
|
| 397 |
+
match = self.PATTERNS["KHASRA"].search(text)
|
| 398 |
+
if match:
|
| 399 |
+
corrected.value = match.group(0)
|
| 400 |
+
corrected.start = match.start()
|
| 401 |
+
corrected.end = match.end()
|
| 402 |
+
|
| 403 |
+
# Expand BLOCK to include identifier
|
| 404 |
+
elif entity.label == "BLOCK":
|
| 405 |
+
match = self.PATTERNS["BLOCK"].search(text)
|
| 406 |
+
if match:
|
| 407 |
+
corrected.value = match.group(0)
|
| 408 |
+
corrected.start = match.start()
|
| 409 |
+
corrected.end = match.end()
|
| 410 |
+
|
| 411 |
+
# Expand FLOOR to include floor number
|
| 412 |
+
elif entity.label == "FLOOR":
|
| 413 |
+
match = self.PATTERNS["FLOOR"].search(text)
|
| 414 |
+
if match:
|
| 415 |
+
corrected.value = match.group(0)
|
| 416 |
+
corrected.start = match.start()
|
| 417 |
+
corrected.end = match.end()
|
| 418 |
+
|
| 419 |
+
# Clean up leading/trailing whitespace from value
|
| 420 |
+
corrected.value = corrected.value.strip()
|
| 421 |
+
|
| 422 |
+
result.append(corrected)
|
| 423 |
+
|
| 424 |
+
return result
|
| 425 |
+
|
| 426 |
+
def _adjust_confidence(
|
| 427 |
+
self,
|
| 428 |
+
text: str,
|
| 429 |
+
entities: list[AddressEntity]
|
| 430 |
+
) -> list[AddressEntity]:
|
| 431 |
+
"""Adjust confidence scores based on patterns and gazetteer."""
|
| 432 |
+
result = []
|
| 433 |
+
|
| 434 |
+
for entity in entities:
|
| 435 |
+
adjusted = entity.model_copy()
|
| 436 |
+
|
| 437 |
+
# Boost confidence for pattern matches
|
| 438 |
+
if entity.label in self.PATTERNS:
|
| 439 |
+
pattern = self.PATTERNS[entity.label]
|
| 440 |
+
if pattern.fullmatch(entity.value):
|
| 441 |
+
adjusted.confidence = min(1.0, entity.confidence + 0.1)
|
| 442 |
+
|
| 443 |
+
# Boost confidence for gazetteer matches
|
| 444 |
+
if self.gazetteer and entity.label in ("AREA", "SUBAREA", "COLONY"):
|
| 445 |
+
if self.gazetteer.is_known_locality(entity.value):
|
| 446 |
+
adjusted.confidence = min(1.0, entity.confidence + 0.15)
|
| 447 |
+
|
| 448 |
+
# Reduce confidence for very short entities
|
| 449 |
+
if len(entity.value) < 3:
|
| 450 |
+
adjusted.confidence = max(0.0, entity.confidence - 0.2)
|
| 451 |
+
|
| 452 |
+
result.append(adjusted)
|
| 453 |
+
|
| 454 |
+
return result
|
| 455 |
+
|
| 456 |
+
def _remove_overlaps(
|
| 457 |
+
self,
|
| 458 |
+
entities: list[AddressEntity]
|
| 459 |
+
) -> list[AddressEntity]:
|
| 460 |
+
"""Remove overlapping entities, keeping higher confidence ones."""
|
| 461 |
+
if not entities:
|
| 462 |
+
return entities
|
| 463 |
+
|
| 464 |
+
# Separate CITY and PINCODE entities - these should always be kept
|
| 465 |
+
# as they represent different semantic levels than AREA/SUBAREA
|
| 466 |
+
preserved_labels = {"CITY", "PINCODE", "STATE"}
|
| 467 |
+
preserved_entities = [e for e in entities if e.label in preserved_labels]
|
| 468 |
+
other_entities = [e for e in entities if e.label not in preserved_labels]
|
| 469 |
+
|
| 470 |
+
# Sort non-preserved by confidence (descending) then by start position
|
| 471 |
+
sorted_entities = sorted(other_entities, key=lambda e: (-e.confidence, e.start))
|
| 472 |
+
|
| 473 |
+
result: list[AddressEntity] = []
|
| 474 |
+
used_ranges: list[tuple[int, int]] = []
|
| 475 |
+
|
| 476 |
+
for entity in sorted_entities:
|
| 477 |
+
# Check for overlap with existing entities
|
| 478 |
+
overlaps = False
|
| 479 |
+
for start, end in used_ranges:
|
| 480 |
+
if not (entity.end <= start or entity.start >= end):
|
| 481 |
+
overlaps = True
|
| 482 |
+
break
|
| 483 |
+
|
| 484 |
+
if not overlaps:
|
| 485 |
+
result.append(entity)
|
| 486 |
+
used_ranges.append((entity.start, entity.end))
|
| 487 |
+
|
| 488 |
+
# Add back preserved entities (CITY, PINCODE, STATE)
|
| 489 |
+
result.extend(preserved_entities)
|
| 490 |
+
|
| 491 |
+
# Sort by position for output
|
| 492 |
+
return sorted(result, key=lambda e: e.start)
|
| 493 |
+
|
| 494 |
+
def _validate_entities(
|
| 495 |
+
self,
|
| 496 |
+
entities: list[AddressEntity]
|
| 497 |
+
) -> list[AddressEntity]:
|
| 498 |
+
"""Validate and filter entities."""
|
| 499 |
+
result = []
|
| 500 |
+
|
| 501 |
+
for entity in entities:
|
| 502 |
+
# Skip empty values
|
| 503 |
+
if not entity.value.strip():
|
| 504 |
+
continue
|
| 505 |
+
|
| 506 |
+
# Skip very low confidence
|
| 507 |
+
if entity.confidence < 0.3:
|
| 508 |
+
continue
|
| 509 |
+
|
| 510 |
+
# Validate pincode format
|
| 511 |
+
if entity.label == "PINCODE":
|
| 512 |
+
if not re.fullmatch(r'[1-9]\d{5}', entity.value):
|
| 513 |
+
continue
|
| 514 |
+
if self.gazetteer and not self.gazetteer.validate_pincode(entity.value):
|
| 515 |
+
# Pincode outside Delhi range - reduce confidence but keep
|
| 516 |
+
entity = entity.model_copy()
|
| 517 |
+
entity.confidence *= 0.7
|
| 518 |
+
|
| 519 |
+
result.append(entity)
|
| 520 |
+
|
| 521 |
+
return result
|
| 522 |
+
|
| 523 |
+
def extract_all_patterns(self, text: str) -> dict[str, list[str]]:
|
| 524 |
+
"""
|
| 525 |
+
Extract all pattern-based entities from text.
|
| 526 |
+
|
| 527 |
+
Returns dict of label -> list of matched values.
|
| 528 |
+
"""
|
| 529 |
+
results = {}
|
| 530 |
+
|
| 531 |
+
for label, pattern in self.PATTERNS.items():
|
| 532 |
+
matches = pattern.findall(text)
|
| 533 |
+
if matches:
|
| 534 |
+
results[label] = matches
|
| 535 |
+
|
| 536 |
+
return results
|
src/address_parser/preprocessing/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Preprocessing module for address normalization and Hindi transliteration."""
|
| 2 |
+
|
| 3 |
+
from address_parser.preprocessing.hindi import HindiTransliterator
|
| 4 |
+
from address_parser.preprocessing.normalizer import AddressNormalizer
|
| 5 |
+
|
| 6 |
+
__all__ = ["AddressNormalizer", "HindiTransliterator"]
|
src/address_parser/preprocessing/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (491 Bytes). View file
|
|
|
src/address_parser/preprocessing/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (488 Bytes). View file
|
|
|
src/address_parser/preprocessing/__pycache__/hindi.cpython-312.pyc
ADDED
|
Binary file (10.3 kB). View file
|
|
|
src/address_parser/preprocessing/__pycache__/hindi.cpython-314.pyc
ADDED
|
Binary file (11.5 kB). View file
|
|
|
src/address_parser/preprocessing/__pycache__/normalizer.cpython-312.pyc
ADDED
|
Binary file (7.16 kB). View file
|
|
|
src/address_parser/preprocessing/__pycache__/normalizer.cpython-314.pyc
ADDED
|
Binary file (8.41 kB). View file
|
|
|
src/address_parser/preprocessing/hindi.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Hindi transliteration and script handling for multilingual addresses."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class HindiTransliterator:
    """
    Handles Hindi (Devanagari) to Latin transliteration and script detection.

    Supports:
    - Devanagari to Latin conversion (ITRANS-like, lossy)
    - Common Hindi address terms mapped via a fixed dictionary
    - Mixed script (code-switched) addresses
    """

    # Devanagari Unicode block boundaries (inclusive), U+0900..U+097F.
    DEVANAGARI_START = 0x0900
    DEVANAGARI_END = 0x097F

    # Common Hindi address terms with fixed transliterations.
    # Checked before character-level transliteration so multi-character
    # terms come out as their conventional Latin spellings.
    HINDI_TERMS = {
        # Devanagari -> Latin
        'गली': 'GALI',
        'गलि': 'GALI',
        'मोहल्ला': 'MOHALLA',
        'नगर': 'NAGAR',
        'विहार': 'VIHAR',
        'पुरी': 'PURI',
        'पुर': 'PUR',
        'बाग': 'BAGH',
        'मार्ग': 'MARG',
        'रोड': 'ROAD',
        'मंजिल': 'FLOOR',
        'पहली': 'FIRST',
        'दूसरी': 'SECOND',
        'तीसरी': 'THIRD',
        'चौथी': 'FOURTH',
        'भूतल': 'GROUND FLOOR',
        'तहखाना': 'BASEMENT',
        'मकान': 'HOUSE',
        'प्लॉट': 'PLOT',
        'खसरा': 'KHASRA',
        'ब्लॉक': 'BLOCK',
        'सेक्टर': 'SECTOR',
        'कॉलोनी': 'COLONY',
        'इलाका': 'AREA',
        'क्षेत्र': 'AREA',
        'दिल्ली': 'DELHI',
        'नई दिल्ली': 'NEW DELHI',
        'नम्बर': 'NUMBER',
        'नंबर': 'NUMBER',
        'संख्या': 'NUMBER',
        'पास': 'NEAR',
        'सामने': 'OPPOSITE',
        'पीछे': 'BEHIND',
        'के पास': 'NEAR',
        'के सामने': 'OPPOSITE',
        'चौक': 'CHOWK',
        'बाजार': 'BAZAAR',
        'बस्ती': 'BASTI',
        'पार्क': 'PARK',
        'एक्सटेंशन': 'EXTENSION',
        'फेज': 'PHASE',
        'वार्ड': 'WARD',
        'जोन': 'ZONE',
    }

    # Devanagari consonants to Latin (basic ITRANS-like mapping).
    # Retroflex/dental pairs are collapsed (e.g. both ट and त -> 't').
    CONSONANT_MAP = {
        'क': 'k', 'ख': 'kh', 'ग': 'g', 'घ': 'gh', 'ङ': 'ng',
        'च': 'ch', 'छ': 'chh', 'ज': 'j', 'झ': 'jh', 'ञ': 'ny',
        'ट': 't', 'ठ': 'th', 'ड': 'd', 'ढ': 'dh', 'ण': 'n',
        'त': 't', 'थ': 'th', 'द': 'd', 'ध': 'dh', 'न': 'n',
        'प': 'p', 'फ': 'ph', 'ब': 'b', 'भ': 'bh', 'म': 'm',
        'य': 'y', 'र': 'r', 'ल': 'l', 'व': 'v', 'श': 'sh',
        'ष': 'sh', 'स': 's', 'ह': 'h',
        'क़': 'q', 'ख़': 'kh', 'ग़': 'g', 'ज़': 'z', 'ड़': 'd',
        'ढ़': 'dh', 'फ़': 'f', 'य़': 'y',
    }

    # Devanagari independent vowels and dependent matras (vowel signs).
    VOWEL_MAP = {
        'अ': 'a', 'आ': 'aa', 'इ': 'i', 'ई': 'ee', 'उ': 'u', 'ऊ': 'oo',
        'ए': 'e', 'ऐ': 'ai', 'ओ': 'o', 'औ': 'au', 'अं': 'an', 'अः': 'ah',
        'ा': 'a', 'ि': 'i', 'ी': 'ee', 'ु': 'u', 'ू': 'oo',
        'े': 'e', 'ै': 'ai', 'ो': 'o', 'ौ': 'au',
        'ं': 'n', 'ः': 'h', '्': '',  # Halant (vowel killer)
        'ँ': 'n',  # Chandrabindu
    }

    # Devanagari digits -> ASCII digits.
    DIGIT_MAP = {
        '०': '0', '१': '1', '२': '2', '३': '3', '४': '4',
        '५': '5', '६': '6', '७': '7', '८': '8', '९': '9',
    }

    def __init__(self, use_known_terms: bool = True):
        """
        Initialize transliterator.

        Args:
            use_known_terms: Use dictionary of known Hindi address terms
                before falling back to character-level transliteration.
        """
        self.use_known_terms = use_known_terms

    def contains_devanagari(self, text: str) -> bool:
        """Check if text contains any character in the Devanagari block."""
        for char in text:
            code = ord(char)
            if self.DEVANAGARI_START <= code <= self.DEVANAGARI_END:
                return True
        return False

    def get_script_ratio(self, text: str) -> dict[str, float]:
        """
        Get ratio of different scripts in text (whitespace excluded).

        Returns dict with 'latin', 'devanagari', 'numeric', 'other' ratios.
        NOTE: when the text is non-empty but all whitespace, the raw zero
        counts are returned instead of ratios (total == 0 guard below).
        """
        if not text:
            return {'latin': 0.0, 'devanagari': 0.0, 'numeric': 0.0, 'other': 0.0}

        counts: dict[str, float] = {'latin': 0, 'devanagari': 0, 'numeric': 0, 'other': 0}
        total = 0

        for char in text:
            if char.isspace():
                continue
            total += 1
            code = ord(char)

            if self.DEVANAGARI_START <= code <= self.DEVANAGARI_END:
                counts['devanagari'] += 1
            elif char.isascii() and char.isalpha():
                counts['latin'] += 1
            elif char.isdigit():
                counts['numeric'] += 1
            else:
                counts['other'] += 1

        if total == 0:
            return counts

        return {k: v / total for k, v in counts.items()}

    def transliterate(self, text: str) -> str:
        """
        Transliterate Devanagari text to Latin script.

        Args:
            text: Input text (may be mixed script)

        Returns:
            Transliterated text in Latin script, uppercased.
            NOTE: text with no Devanagari is returned completely
            unchanged (not uppercased) by the early return below.
        """
        if not self.contains_devanagari(text):
            return text

        # First, substitute known terms; longest terms first so e.g.
        # 'नई दिल्ली' wins over its substring 'दिल्ली'.
        if self.use_known_terms:
            for hindi, latin in sorted(self.HINDI_TERMS.items(), key=lambda x: -len(x[0])):
                text = text.replace(hindi, f' {latin} ')

        # Then transliterate any remaining Devanagari character by character.
        result = []
        i = 0
        while i < len(text):
            char = text[i]
            code = ord(char)

            if self.DEVANAGARI_START <= code <= self.DEVANAGARI_END:
                # Check digits first
                if char in self.DIGIT_MAP:
                    result.append(self.DIGIT_MAP[char])
                # Then vowels / matras
                elif char in self.VOWEL_MAP:
                    result.append(self.VOWEL_MAP[char])
                # Then consonants
                elif char in self.CONSONANT_MAP:
                    result.append(self.CONSONANT_MAP[char])
                    # Add the inherent 'a' unless a matra or halant follows.
                    if i + 1 < len(text):
                        next_char = text[i + 1]
                        next_code = ord(next_char)
                        # If next is a matra (0x093E-0x094D) or halant, don't add 'a'
                        if not (0x093E <= next_code <= 0x094D):
                            result.append('a')
                    else:
                        result.append('a')
                else:
                    # Unknown Devanagari character: pass through unchanged.
                    result.append(char)
            else:
                result.append(char)

            i += 1

        # Collapse the extra spaces introduced by term substitution.
        output = ''.join(result)
        output = re.sub(r'\s+', ' ', output)
        return output.strip().upper()

    def normalize_mixed_script(self, text: str) -> str:
        """
        Handle code-mixed (Hindi + English) addresses.

        Transliterates Hindi words while uppercasing the rest.
        NOTE: whole-word dictionary lookup only matches bare terms;
        words with attached punctuation fall through to transliterate().
        """
        # Split on whitespace to handle word by word.
        words = text.split()
        result = []

        for word in words:
            if self.contains_devanagari(word):
                # Check if it's a known term first
                if self.use_known_terms and word in self.HINDI_TERMS:
                    result.append(self.HINDI_TERMS[word])
                else:
                    result.append(self.transliterate(word))
            else:
                result.append(word.upper())

        return ' '.join(result)
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def detect_language(text: str) -> str:
    """
    Simple language detection for address text.

    Returns: 'hindi', 'english', or 'mixed'
    """
    ratios = HindiTransliterator().get_script_ratio(text)
    devanagari = ratios['devanagari']
    latin = ratios['latin']

    # Majority script decides outright.
    if devanagari > 0.5:
        return 'hindi'
    if latin > 0.5:
        return 'english'

    # No majority: call it mixed only when both scripts are present;
    # otherwise default to English.
    if devanagari > 0 and latin > 0:
        return 'mixed'
    return 'english'
|
src/address_parser/preprocessing/normalizer.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Address normalization utilities."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class AddressNormalizer:
    """
    Normalizes Indian addresses for consistent processing.

    Handles:
    - Case normalization
    - Whitespace cleanup
    - Common abbreviation expansion
    - Punctuation standardization
    - Number format standardization
    """

    # Common abbreviations in Indian addresses (regex -> expansion).
    # Applied in insertion order via `_expand_abbreviations`, so ordering
    # matters: an earlier, broader pattern can consume text before a
    # later, more specific one (e.g. \bFL\b fires before \b1ST\s+FL\b).
    ABBREVIATIONS = {
        r'\bH\.?\s*NO\.?\b': 'HOUSE NO',
        r'\bH\.?\s*N\.?\b': 'HOUSE NO',
        r'\bHNO\.?\b': 'HOUSE NO',
        r'\bPLT\.?\s*NO\.?\b': 'PLOT NO',
        r'\bP\.?\s*NO\.?\b': 'PLOT NO',
        r'\bFL\.?\b': 'FLOOR',
        r'\bFLR\.?\b': 'FLOOR',
        r'\bGF\.?\b': 'GROUND FLOOR',
        r'\bFF\.?\b': 'FIRST FLOOR',
        r'\bSF\.?\b': 'SECOND FLOOR',
        r'\bTF\.?\b': 'THIRD FLOOR',
        r'\b1ST\s+FL\.?\b': 'FIRST FLOOR',
        r'\b2ND\s+FL\.?\b': 'SECOND FLOOR',
        r'\b3RD\s+FL\.?\b': 'THIRD FLOOR',
        r'\bGRD\.?\s*FL\.?\b': 'GROUND FLOOR',
        r'\bBLK\.?\b': 'BLOCK',
        r'\bBL\.?\b': 'BLOCK',
        r'\bSEC\.?\b': 'SECTOR',
        r'\bKH\.?\s*NO\.?\b': 'KHASRA NO',
        r'\bKHASRA\s*NO\.?\b': 'KHASRA NO',
        r'\bKH\.?\b': 'KHASRA',
        r'\bCOL\.?\b': 'COLONY',
        r'\bNGR\.?\b': 'NAGAR',
        r'\bMKT\.?\b': 'MARKET',
        r'\bRD\.?\b': 'ROAD',
        r'\bST\.?\b': 'STREET',
        r'\bLN\.?\b': 'LANE',
        r'\bEXTN\.?\b': 'EXTENSION',
        r'\bEXT\.?\b': 'EXTENSION',
        r'\bPH\.?\b': 'PHASE',
        r'\bNR\.?\b': 'NEAR',
        r'\bOPP\.?\b': 'OPPOSITE',
        r'\bBHD\.?\b': 'BEHIND',
        r'\bADJ\.?\b': 'ADJACENT',
        r'\bWZ\.?\b': 'WZ',  # West Zone
        r'\bEZ\.?\b': 'EZ',  # East Zone
        r'\bNZ\.?\b': 'NZ',  # North Zone
        r'\bSZ\.?\b': 'SZ',  # South Zone
        r'\bDL\.?\b': 'DELHI',
        r'\bN\.?\s*DELHI\b': 'NEW DELHI',
    }

    # Floor name patterns.
    # NOTE(review): not referenced anywhere in this class — presumably
    # consumed by other modules; confirm before removing.
    FLOOR_PATTERNS = {
        r'\bGROUND\b': 'GROUND',
        r'\bBASEMENT\b': 'BASEMENT',
        r'\bFIRST\b': 'FIRST',
        r'\bSECOND\b': 'SECOND',
        r'\bTHIRD\b': 'THIRD',
        r'\bFOURTH\b': 'FOURTH',
        r'\bFIFTH\b': 'FIFTH',
        r'\b1ST\b': 'FIRST',
        r'\b2ND\b': 'SECOND',
        r'\b3RD\b': 'THIRD',
        r'\b4TH\b': 'FOURTH',
        r'\b5TH\b': 'FIFTH',
    }

    def __init__(self, uppercase: bool = True, expand_abbrev: bool = True):
        """
        Initialize normalizer.

        Args:
            uppercase: Convert text to uppercase
            expand_abbrev: Expand common abbreviations
        """
        self.uppercase = uppercase
        self.expand_abbrev = expand_abbrev

        # Compile abbreviation regexes once; matching is case-insensitive
        # but the replacements are always the uppercase expansions.
        self._abbrev_patterns = {
            re.compile(pattern, re.IGNORECASE): replacement
            for pattern, replacement in self.ABBREVIATIONS.items()
        }

    def normalize(self, address: str) -> str:
        """
        Normalize an address string.

        Pipeline: whitespace cleanup -> punctuation standardization ->
        abbreviation expansion (optional) -> uppercasing (optional) ->
        final whitespace cleanup.

        Args:
            address: Raw address string

        Returns:
            Normalized address string ("" for falsy input)
        """
        if not address:
            return ""

        text = address

        # Basic cleanup
        text = self._clean_whitespace(text)
        text = self._standardize_punctuation(text)

        # Expand abbreviations
        if self.expand_abbrev:
            text = self._expand_abbreviations(text)

        # Case normalization
        if self.uppercase:
            text = text.upper()

        # Final whitespace cleanup (expansions may introduce extra spaces)
        text = self._clean_whitespace(text)

        return text

    def _clean_whitespace(self, text: str) -> str:
        """Collapse runs of whitespace and tidy spacing around , and -."""
        # Replace multiple spaces with single space
        text = re.sub(r'\s+', ' ', text)
        # Normalize spacing: ", " after commas, no spaces around hyphens
        text = re.sub(r'\s*,\s*', ', ', text)
        text = re.sub(r'\s*-\s*', '-', text)
        # Trim
        return text.strip()

    def _standardize_punctuation(self, text: str) -> str:
        """Standardize punctuation marks."""
        # Replace en/em dashes with standard hyphen
        text = re.sub(r'[–—]', '-', text)
        # Collapse duplicate punctuation
        text = re.sub(r',+', ',', text)
        text = re.sub(r'-+', '-', text)
        # Drop a dangling hyphen immediately before a comma
        text = re.sub(r'-,', ',', text)
        return text

    def _expand_abbreviations(self, text: str) -> str:
        """Expand common abbreviations using the precompiled patterns."""
        for pattern, replacement in self._abbrev_patterns.items():
            text = pattern.sub(replacement, text)
        return text

    def extract_pincode(self, address: str) -> str | None:
        """Extract the first 6-digit Indian PIN code (no leading 0), or None."""
        match = re.search(r'\b[1-9]\d{5}\b', address)
        return match.group(0) if match else None

    def remove_pincode(self, address: str) -> str:
        """Remove all PIN-code-shaped tokens from the address."""
        return re.sub(r'\b[1-9]\d{5}\b', '', address)

    def tokenize(self, text: str) -> list[str]:
        """
        Simple tokenization preserving address-specific patterns.

        Input is uppercased before matching, so tokens are returned in
        uppercase regardless of input case.

        Args:
            text: Normalized address text

        Returns:
            List of tokens
        """
        # Split on whitespace but keep special patterns together
        # e.g., "H-3" stays as one token, "110041" stays together
        tokens = []

        # Pattern to match address tokens; alternation order matters —
        # compound identifiers must be tried before their parts.
        pattern = r'''
            [A-Z0-9]+[-/][A-Z0-9/]+ |  # Compound identifiers like H-3, 24/1/3
            [A-Z]+\d+ |                # Letter+number combos like A5
            \d+[A-Z]+ |                # Number+letter combos like 5A
            [A-Z]+ |                   # Words
            \d+ |                      # Numbers
            [,.]                       # Punctuation
        '''

        for match in re.finditer(pattern, text.upper(), re.VERBOSE):
            token = match.group(0)
            if token.strip():
                tokens.append(token)

        return tokens
|
src/address_parser/schemas.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic schemas for address parsing I/O."""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 4 |
+
|
| 5 |
+
# Entity label definitions
ENTITY_LABELS = [
    "AREA",
    "SUBAREA",
    "HOUSE_NUMBER",
    "SECTOR",
    "GALI",
    "COLONY",
    "BLOCK",
    "CAMP",
    "POLE",
    "KHASRA",
    "FLOOR",
    "PLOT",
    "PINCODE",
    "CITY",
    "STATE",
]

# BIO tagging scheme: the "O" (outside) tag, then every B- tag, then
# every I- tag, each group in ENTITY_LABELS order.
BIO_LABELS = ["O"]
for _prefix in ("B", "I"):
    BIO_LABELS.extend(f"{_prefix}-{_label}" for _label in ENTITY_LABELS)

# Bidirectional tag <-> integer-id lookups for model training/decoding.
LABEL2ID = {_tag: _idx for _idx, _tag in enumerate(BIO_LABELS)}
ID2LABEL = dict(enumerate(BIO_LABELS))
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class AddressEntity(BaseModel):
    """A single extracted entity from an address.

    `start`/`end` are character offsets into the original text;
    presumably a half-open span with value == text[start:end] — confirm
    against the producing pipeline.
    """

    label: str = Field(..., description="Entity type (e.g., HOUSE_NUMBER, AREA)")
    value: str = Field(..., description="Extracted text value")
    start: int = Field(..., description="Start character offset in original text")
    end: int = Field(..., description="End character offset in original text")
    confidence: float = Field(default=1.0, ge=0.0, le=1.0, description="Confidence score")

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "label": "HOUSE_NUMBER",
                "value": "PLOT NO752",
                "start": 0,
                "end": 10,
                "confidence": 0.95,
            }
        }
    )
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class ParsedAddress(BaseModel):
    """Complete parsed address with all entities.

    The per-field convenience accessors below are derived from
    `entities` in `model_post_init`, so they are populated automatically
    on construction.
    """

    raw_address: str = Field(..., description="Original input address")
    normalized_address: str = Field(..., description="Normalized/cleaned address")
    entities: list[AddressEntity] = Field(default_factory=list, description="Extracted entities")

    # Convenience accessors for common fields (derived, see model_post_init)
    house_number: str | None = Field(None, description="Extracted house/plot number")
    floor: str | None = Field(None, description="Extracted floor")
    block: str | None = Field(None, description="Extracted block")
    gali: str | None = Field(None, description="Extracted gali/lane")
    colony: str | None = Field(None, description="Extracted colony name")
    area: str | None = Field(None, description="Extracted area/locality")
    subarea: str | None = Field(None, description="Extracted sub-area")
    sector: str | None = Field(None, description="Extracted sector")
    khasra: str | None = Field(None, description="Extracted khasra number")
    pincode: str | None = Field(None, description="Extracted PIN code")
    city: str | None = Field(None, description="Extracted city")
    state: str | None = Field(None, description="Extracted state")

    def model_post_init(self, __context) -> None:
        """Populate convenience fields from entities.

        NOTE: when several entities share a label, the LAST one in
        `entities` wins (later dict-comprehension entries overwrite
        earlier ones).
        """
        entity_map = {e.label.upper(): e.value for e in self.entities}

        # HOUSE_NUMBER falls back to PLOT when absent.
        self.house_number = entity_map.get("HOUSE_NUMBER") or entity_map.get("PLOT")
        self.floor = entity_map.get("FLOOR")
        self.block = entity_map.get("BLOCK")
        self.gali = entity_map.get("GALI")
        self.colony = entity_map.get("COLONY")
        self.area = entity_map.get("AREA")
        self.subarea = entity_map.get("SUBAREA")
        self.sector = entity_map.get("SECTOR")
        self.khasra = entity_map.get("KHASRA")
        self.pincode = entity_map.get("PINCODE")
        self.city = entity_map.get("CITY")
        self.state = entity_map.get("STATE")

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "raw_address": "PLOT NO752 FIRST FLOOR, BLOCK H-3, NEW DELHI, 110041",
                "normalized_address": "PLOT NO752 FIRST FLOOR BLOCK H-3 NEW DELHI 110041",
                "entities": [
                    {"label": "HOUSE_NUMBER", "value": "PLOT NO752", "start": 0, "end": 10, "confidence": 0.95},
                    {"label": "FLOOR", "value": "FIRST FLOOR", "start": 11, "end": 22, "confidence": 0.98},
                ],
                "house_number": "PLOT NO752",
                "floor": "FIRST FLOOR",
            }
        }
    )
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class ParseRequest(BaseModel):
    """Request schema for parsing a single address."""

    # Length bounds guard the API boundary against empty or abusive input.
    address: str = Field(..., min_length=5, max_length=500, description="Address to parse")
    return_confidence: bool = Field(default=True, description="Include confidence scores")

    # Example payload surfaced in the generated OpenAPI/JSON schema docs.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "address": "PLOT NO752 FIRST FLOOR, BLOCK H-3, NEW DELHI, 110041",
                "return_confidence": True,
            }
        }
    )
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
class BatchParseRequest(BaseModel):
    """Request schema for batch parsing.

    Accepts between 1 and 100 addresses per request (``min_length`` /
    ``max_length`` here bound the list size, not each string).
    """

    addresses: list[str] = Field(..., min_length=1, max_length=100, description="List of addresses")
    return_confidence: bool = Field(default=True, description="Include confidence scores")
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class ParseResponse(BaseModel):
    """Response schema for single address parsing.

    NOTE(review): presumably ``result`` is populated on success and
    ``error`` on failure — confirm against the API handlers.
    """

    success: bool = Field(default=True, description="Whether parsing succeeded")
    result: ParsedAddress | None = Field(None, description="Parsed address result")
    error: str | None = Field(None, description="Error message if failed")
    inference_time_ms: float = Field(..., description="Inference time in milliseconds")
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
class BatchParseResponse(BaseModel):
    """Response schema for batch parsing."""

    # Overall success flag for the whole batch.
    success: bool = Field(default=True)
    # One ParsedAddress per input address, in request order.
    results: list[ParsedAddress] = Field(default_factory=list)
    total_inference_time_ms: float = Field(..., description="Total inference time")
    avg_inference_time_ms: float = Field(..., description="Average per-address time")
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
class HealthResponse(BaseModel):
    """Health check response."""

    status: str = Field(default="healthy")
    # False when the service runs without a loaded NER model
    # (e.g. rules-only mode); set by the health endpoint.
    model_loaded: bool = Field(default=False)
    version: str = Field(default="2.0.0")
|
src/indian_address_parser.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,383 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: indian-address-parser
|
| 3 |
+
Version: 2.0.0
|
| 4 |
+
Summary: Production-grade Indian address parsing using mBERT-CRF
|
| 5 |
+
Author-email: Kushagra <kushagra@gmail.com>
|
| 6 |
+
License: MIT
|
| 7 |
+
Project-URL: Homepage, https://github.com/kushagra/indian-address-parser
|
| 8 |
+
Project-URL: Documentation, https://github.com/kushagra/indian-address-parser#readme
|
| 9 |
+
Project-URL: Repository, https://github.com/kushagra/indian-address-parser
|
| 10 |
+
Project-URL: Issues, https://github.com/kushagra/indian-address-parser/issues
|
| 11 |
+
Keywords: nlp,ner,address-parsing,indian-addresses,bert,crf
|
| 12 |
+
Classifier: Development Status :: 4 - Beta
|
| 13 |
+
Classifier: Intended Audience :: Developers
|
| 14 |
+
Classifier: License :: OSI Approved :: MIT License
|
| 15 |
+
Classifier: Programming Language :: Python :: 3
|
| 16 |
+
Classifier: Programming Language :: Python :: 3.14
|
| 17 |
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
| 18 |
+
Classifier: Topic :: Text Processing :: Linguistic
|
| 19 |
+
Requires-Python: >=3.14
|
| 20 |
+
Description-Content-Type: text/markdown
|
| 21 |
+
Requires-Dist: torch>=2.9.1
|
| 22 |
+
Requires-Dist: transformers>=4.57.6
|
| 23 |
+
Requires-Dist: tokenizers>=0.22.2
|
| 24 |
+
Requires-Dist: datasets>=4.5.0
|
| 25 |
+
Requires-Dist: seqeval>=1.2.2
|
| 26 |
+
Requires-Dist: numpy>=2.4.1
|
| 27 |
+
Requires-Dist: pandas>=2.3.3
|
| 28 |
+
Requires-Dist: scikit-learn>=1.8.0
|
| 29 |
+
Requires-Dist: tqdm>=4.67.1
|
| 30 |
+
Requires-Dist: pydantic>=2.12.5
|
| 31 |
+
Requires-Dist: indic-transliteration>=2.3.75
|
| 32 |
+
Requires-Dist: regex>=2026.1.15
|
| 33 |
+
Requires-Dist: rapidfuzz>=3.14.3
|
| 34 |
+
Provides-Extra: api
|
| 35 |
+
Requires-Dist: fastapi>=0.128.0; extra == "api"
|
| 36 |
+
Requires-Dist: uvicorn[standard]>=0.40.0; extra == "api"
|
| 37 |
+
Requires-Dist: gunicorn>=23.0.0; extra == "api"
|
| 38 |
+
Requires-Dist: python-multipart>=0.0.21; extra == "api"
|
| 39 |
+
Provides-Extra: demo
|
| 40 |
+
Requires-Dist: gradio>=6.3.0; extra == "demo"
|
| 41 |
+
Provides-Extra: training
|
| 42 |
+
Requires-Dist: accelerate>=1.12.0; extra == "training"
|
| 43 |
+
Requires-Dist: wandb>=0.24.0; extra == "training"
|
| 44 |
+
Requires-Dist: optuna>=4.7.0; extra == "training"
|
| 45 |
+
Provides-Extra: onnx
|
| 46 |
+
Requires-Dist: onnx>=1.20.1; python_version < "3.14" and extra == "onnx"
|
| 47 |
+
Requires-Dist: onnxruntime>=1.23.2; python_version < "3.14" and extra == "onnx"
|
| 48 |
+
Provides-Extra: dev
|
| 49 |
+
Requires-Dist: pytest>=9.0.2; extra == "dev"
|
| 50 |
+
Requires-Dist: pytest-cov>=7.0.0; extra == "dev"
|
| 51 |
+
Requires-Dist: pytest-asyncio>=1.3.0; extra == "dev"
|
| 52 |
+
Requires-Dist: black>=26.1.0; extra == "dev"
|
| 53 |
+
Requires-Dist: ruff>=0.14.13; extra == "dev"
|
| 54 |
+
Requires-Dist: mypy>=1.19.1; extra == "dev"
|
| 55 |
+
Requires-Dist: pre-commit>=4.5.1; extra == "dev"
|
| 56 |
+
Provides-Extra: all
|
| 57 |
+
Requires-Dist: indian-address-parser[api,demo,dev,training]; extra == "all"
|
| 58 |
+
Provides-Extra: all-with-onnx
|
| 59 |
+
Requires-Dist: indian-address-parser[api,demo,dev,onnx,training]; extra == "all-with-onnx"
|
| 60 |
+
|
| 61 |
+
# Indian Address Parser
|
| 62 |
+
|
| 63 |
+
Production-grade NLP system for parsing unstructured Indian addresses into structured components using **mBERT-CRF** (Multilingual BERT with Conditional Random Field).
|
| 64 |
+
|
| 65 |
+
[![Python 3.14+](https://img.shields.io/badge/python-3.14%2B-blue.svg)](https://www.python.org/downloads/)
|
| 66 |
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
| 67 |
+
|
| 68 |
+
## Features
|
| 69 |
+
|
| 70 |
+
- **High Accuracy**: 94%+ F1 score on test data
|
| 71 |
+
- **Multilingual**: Supports Hindi (Devanagari) + English
|
| 72 |
+
- **Fast Inference**: < 30ms per address with ONNX optimization
|
| 73 |
+
- **15 Entity Types**: House Number, Floor, Block, Gali, Colony, Area, Khasra, Pincode, etc.
|
| 74 |
+
- **Delhi-specific**: Gazetteer with 100+ localities for improved accuracy
|
| 75 |
+
- **Production Ready**: REST API, Docker, Cloud Run deployment
|
| 76 |
+
|
| 77 |
+
## Demo
|
| 78 |
+
|
| 79 |
+
- **Interactive Demo**: [HuggingFace Spaces](https://huggingface.co/spaces/kushagra/indian-address-parser)
|
| 80 |
+
- **API Endpoint**: `https://indian-address-parser-xyz.run.app/docs`
|
| 81 |
+
|
| 82 |
+
## Quick Start
|
| 83 |
+
|
| 84 |
+
### Installation
|
| 85 |
+
|
| 86 |
+
```bash
|
| 87 |
+
pip install indian-address-parser
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
Or from source:
|
| 91 |
+
|
| 92 |
+
```bash
|
| 93 |
+
git clone https://github.com/kushagra/indian-address-parser.git
|
| 94 |
+
cd indian-address-parser
|
| 95 |
+
pip install -e ".[all]"
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
### Usage
|
| 99 |
+
|
| 100 |
+
```python
|
| 101 |
+
from address_parser import AddressParser
|
| 102 |
+
|
| 103 |
+
# Load parser (rules-only mode if model not available)
|
| 104 |
+
parser = AddressParser.rules_only()
|
| 105 |
+
|
| 106 |
+
# Or load trained model
|
| 107 |
+
# parser = AddressParser.from_pretrained("./models/address_ner")
|
| 108 |
+
|
| 109 |
+
# Parse address
|
| 110 |
+
result = parser.parse(
|
| 111 |
+
"PLOT NO752 FIRST FLOOR, BLOCK H-3 KH NO 24/1/3/2/2/202, "
|
| 112 |
+
"KAUNWAR SINGH NAGAR NEW DELHI, DELHI, 110041"
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
print(f"House Number: {result.house_number}")
|
| 116 |
+
print(f"Floor: {result.floor}")
|
| 117 |
+
print(f"Block: {result.block}")
|
| 118 |
+
print(f"Khasra: {result.khasra}")
|
| 119 |
+
print(f"Area: {result.area}")
|
| 120 |
+
print(f"Pincode: {result.pincode}")
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
**Output:**
|
| 124 |
+
```
|
| 125 |
+
House Number: PLOT NO752
|
| 126 |
+
Floor: FIRST FLOOR
|
| 127 |
+
Block: BLOCK H-3
|
| 128 |
+
Khasra: KH NO 24/1/3/2/2/202
|
| 129 |
+
Area: KAUNWAR SINGH NAGAR
|
| 130 |
+
Pincode: 110041
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
### Entity Types
|
| 134 |
+
|
| 135 |
+
| Entity | Description | Example |
|
| 136 |
+
|--------|-------------|---------|
|
| 137 |
+
| `HOUSE_NUMBER` | House/plot number | `H.NO. 123`, `PLOT NO752` |
|
| 138 |
+
| `FLOOR` | Floor level | `FIRST FLOOR`, `GF` |
|
| 139 |
+
| `BLOCK` | Block identifier | `BLOCK H-3`, `BLK A` |
|
| 140 |
+
| `SECTOR` | Sector number | `SECTOR 15` |
|
| 141 |
+
| `GALI` | Lane/gali number | `GALI NO. 5` |
|
| 142 |
+
| `COLONY` | Colony name | `BABA HARI DAS COLONY` |
|
| 143 |
+
| `AREA` | Area/locality | `KAUNWAR SINGH NAGAR` |
|
| 144 |
+
| `SUBAREA` | Sub-area | `TIKARI KALA` |
|
| 145 |
+
| `KHASRA` | Khasra number | `KH NO 24/1/3/2` |
|
| 146 |
+
| `PINCODE` | 6-digit PIN code | `110041` |
|
| 147 |
+
| `CITY` | City name | `NEW DELHI` |
|
| 148 |
+
| `STATE` | State name | `DELHI` |
|
| 149 |
+
|
| 150 |
+
## API Usage
|
| 151 |
+
|
| 152 |
+
### REST API
|
| 153 |
+
|
| 154 |
+
```bash
|
| 155 |
+
# Start API server
|
| 156 |
+
uvicorn api.main:app --host 0.0.0.0 --port 8080
|
| 157 |
+
|
| 158 |
+
# Parse single address
|
| 159 |
+
curl -X POST "http://localhost:8080/parse" \
|
| 160 |
+
-H "Content-Type: application/json" \
|
| 161 |
+
-d '{"address": "PLOT NO752 FIRST FLOOR, NEW DELHI, 110041"}'
|
| 162 |
+
|
| 163 |
+
# Batch parse
|
| 164 |
+
curl -X POST "http://localhost:8080/parse/batch" \
|
| 165 |
+
-H "Content-Type: application/json" \
|
| 166 |
+
-d '{"addresses": ["ADDRESS 1", "ADDRESS 2"]}'
|
| 167 |
+
```
|
| 168 |
+
|
| 169 |
+
### Python API
|
| 170 |
+
|
| 171 |
+
```python
|
| 172 |
+
from address_parser import AddressParser
|
| 173 |
+
|
| 174 |
+
parser = AddressParser.from_pretrained("./models/address_ner")
|
| 175 |
+
|
| 176 |
+
# Single parse with timing
|
| 177 |
+
response = parser.parse_with_timing("NEW DELHI 110041")
|
| 178 |
+
print(f"Inference time: {response.inference_time_ms:.2f}ms")
|
| 179 |
+
|
| 180 |
+
# Batch parse
|
| 181 |
+
batch_response = parser.parse_batch([
|
| 182 |
+
"PLOT NO 123, DWARKA, 110078",
|
| 183 |
+
"H.NO. 456, LAJPAT NAGAR, 110024",
|
| 184 |
+
])
|
| 185 |
+
print(f"Average time: {batch_response.avg_inference_time_ms:.2f}ms")
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
## Training
|
| 189 |
+
|
| 190 |
+
### Data Preparation
|
| 191 |
+
|
| 192 |
+
Convert existing Label Studio annotations to BIO format:
|
| 193 |
+
|
| 194 |
+
```bash
|
| 195 |
+
python training/convert_data.py
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
This creates:
|
| 199 |
+
- `data/processed/train.jsonl`
|
| 200 |
+
- `data/processed/val.jsonl`
|
| 201 |
+
- `data/processed/test.jsonl`
|
| 202 |
+
|
| 203 |
+
### Train Model
|
| 204 |
+
|
| 205 |
+
```bash
|
| 206 |
+
python training/train.py \
|
| 207 |
+
--train data/processed/train.jsonl \
|
| 208 |
+
--val data/processed/val.jsonl \
|
| 209 |
+
--output models/address_ner \
|
| 210 |
+
--model bert-base-multilingual-cased \
|
| 211 |
+
--epochs 10 \
|
| 212 |
+
--batch-size 16
|
| 213 |
+
```
|
| 214 |
+
|
| 215 |
+
### Data Augmentation
|
| 216 |
+
|
| 217 |
+
Augment training data for improved robustness:
|
| 218 |
+
|
| 219 |
+
```python
|
| 220 |
+
from training.augment import AddressAugmenter, augment_dataset
|
| 221 |
+
|
| 222 |
+
augmenter = AddressAugmenter(
|
| 223 |
+
abbrev_prob=0.3,
|
| 224 |
+
case_prob=0.2,
|
| 225 |
+
typo_prob=0.1,
|
| 226 |
+
)
|
| 227 |
+
|
| 228 |
+
augmented_data = augment_dataset(original_data, augmenter, target_size=1500)
|
| 229 |
+
```
|
| 230 |
+
|
| 231 |
+
## Deployment
|
| 232 |
+
|
| 233 |
+
### Docker
|
| 234 |
+
|
| 235 |
+
```bash
|
| 236 |
+
# Build
|
| 237 |
+
docker build -t indian-address-parser -f api/Dockerfile .
|
| 238 |
+
|
| 239 |
+
# Run
|
| 240 |
+
docker run -p 8080:8080 indian-address-parser
|
| 241 |
+
```
|
| 242 |
+
|
| 243 |
+
### Google Cloud Run
|
| 244 |
+
|
| 245 |
+
```bash
|
| 246 |
+
# Deploy with Cloud Build
|
| 247 |
+
gcloud builds submit --config api/cloudbuild.yaml
|
| 248 |
+
|
| 249 |
+
# Or deploy directly
|
| 250 |
+
gcloud run deploy indian-address-parser \
|
| 251 |
+
--image gcr.io/PROJECT_ID/indian-address-parser \
|
| 252 |
+
--region us-central1 \
|
| 253 |
+
--min-instances 1 \
|
| 254 |
+
--allow-unauthenticated
|
| 255 |
+
```
|
| 256 |
+
|
| 257 |
+
### HuggingFace Spaces
|
| 258 |
+
|
| 259 |
+
1. Create a new Space on HuggingFace
|
| 260 |
+
2. Copy contents of `demo/` directory
|
| 261 |
+
3. Upload trained model to HuggingFace Hub
|
| 262 |
+
4. Update `MODEL_PATH` environment variable
|
| 263 |
+
|
| 264 |
+
## Architecture
|
| 265 |
+
|
| 266 |
+
```
|
| 267 |
+
┌─────────────────────────────────────────────────────────────────┐
|
| 268 |
+
│ Indian Address Parser Pipeline │
|
| 269 |
+
├─────────────────────────────────────────────────────────────────┤
|
| 270 |
+
│ ┌──────────────┐ ┌─────────────────┐ ┌────────────────────┐ │
|
| 271 |
+
│ │ Preprocessor │→│ mBERT-CRF │→│ Post-processor │ │
|
| 272 |
+
│ │ (Hindi/Eng) │ │ (multilingual) │ │ (rules+gazetteer) │ │
|
| 273 |
+
│ └──────────────┘ └─────────────────┘ └────────────────────┘ │
|
| 274 |
+
├─────────────────────────────────────────────────────────────────┤
|
| 275 |
+
│ Components: │
|
| 276 |
+
│ • AddressNormalizer: Text normalization, abbreviation expansion│
|
| 277 |
+
│ • HindiTransliterator: Devanagari → Latin conversion │
|
| 278 |
+
│ • BertCRFForTokenClassification: mBERT + CRF for NER │
|
| 279 |
+
│ • RuleBasedRefiner: Pattern matching, entity validation │
|
| 280 |
+
│ • DelhiGazetteer: Fuzzy matching for locality names │
|
| 281 |
+
└─────────────────────────────────────────────────────────────────┘
|
| 282 |
+
```
|
| 283 |
+
|
| 284 |
+
## Performance
|
| 285 |
+
|
| 286 |
+
| Metric | Value |
|
| 287 |
+
|--------|-------|
|
| 288 |
+
| Precision | 94.2% |
|
| 289 |
+
| Recall | 95.1% |
|
| 290 |
+
| F1 Score | 94.6% |
|
| 291 |
+
| Inference Time | ~25ms |
|
| 292 |
+
|
| 293 |
+
Tested on held-out test set of 60+ Delhi addresses.
|
| 294 |
+
|
| 295 |
+
## Project Structure
|
| 296 |
+
|
| 297 |
+
```
|
| 298 |
+
indian-address-parser/
|
| 299 |
+
├── src/address_parser/
|
| 300 |
+
│ ├── preprocessing/ # Text normalization, Hindi transliteration
|
| 301 |
+
│ ├── models/ # mBERT-CRF model architecture
|
| 302 |
+
│ ├── postprocessing/ # Rules, gazetteer, validation
|
| 303 |
+
│ ├── pipeline.py # Main orchestration
|
| 304 |
+
│ └── schemas.py # Pydantic I/O models
|
| 305 |
+
├── api/ # FastAPI service
|
| 306 |
+
├── demo/ # Gradio demo for HuggingFace Spaces
|
| 307 |
+
├── training/ # Data prep, training scripts
|
| 308 |
+
├── tests/ # pytest test suite
|
| 309 |
+
└── pyproject.toml # Package config
|
| 310 |
+
```
|
| 311 |
+
|
| 312 |
+
## Development
|
| 313 |
+
|
| 314 |
+
### Setup
|
| 315 |
+
|
| 316 |
+
```bash
|
| 317 |
+
# Clone repository
|
| 318 |
+
git clone https://github.com/kushagra/indian-address-parser.git
|
| 319 |
+
cd indian-address-parser
|
| 320 |
+
|
| 321 |
+
# Install with dev dependencies
|
| 322 |
+
pip install -e ".[dev]"
|
| 323 |
+
|
| 324 |
+
# Install pre-commit hooks
|
| 325 |
+
pre-commit install
|
| 326 |
+
```
|
| 327 |
+
|
| 328 |
+
### Testing
|
| 329 |
+
|
| 330 |
+
```bash
|
| 331 |
+
# Run all tests
|
| 332 |
+
pytest
|
| 333 |
+
|
| 334 |
+
# Run with coverage
|
| 335 |
+
pytest --cov=address_parser --cov-report=html
|
| 336 |
+
|
| 337 |
+
# Run specific test file
|
| 338 |
+
pytest tests/test_pipeline.py -v
|
| 339 |
+
```
|
| 340 |
+
|
| 341 |
+
### Code Quality
|
| 342 |
+
|
| 343 |
+
```bash
|
| 344 |
+
# Format code
|
| 345 |
+
black src/ tests/
|
| 346 |
+
|
| 347 |
+
# Lint
|
| 348 |
+
ruff check src/ tests/
|
| 349 |
+
|
| 350 |
+
# Type check
|
| 351 |
+
mypy src/
|
| 352 |
+
```
|
| 353 |
+
|
| 354 |
+
## Comparison with Alternatives
|
| 355 |
+
|
| 356 |
+
| Solution | Indian Support | Custom Labels | Latency | Cost |
|
| 357 |
+
|----------|---------------|---------------|---------|------|
|
| 358 |
+
| **This Project** | Excellent | Yes (15 types) | ~25ms | Free |
|
| 359 |
+
| libpostal | Poor | No | ~5ms | Free |
|
| 360 |
+
| Deepparse | Generic | No | ~50ms | Free |
|
| 361 |
+
| GPT-4 | Good | Configurable | ~1000ms | $0.03/call |
|
| 362 |
+
| Google Geocoding | Moderate | No | ~200ms | $5/1000 |
|
| 363 |
+
|
| 364 |
+
## License
|
| 365 |
+
|
| 366 |
+
MIT License - see [LICENSE](LICENSE) for details.
|
| 367 |
+
|
| 368 |
+
## Acknowledgments
|
| 369 |
+
|
| 370 |
+
- Original 2024 BSES Delhi internship project
|
| 371 |
+
- HuggingFace Transformers library
|
| 372 |
+
- Delhi locality data from public sources
|
| 373 |
+
|
| 374 |
+
## Citation
|
| 375 |
+
|
| 376 |
+
```bibtex
|
| 377 |
+
@software{indian_address_parser,
|
| 378 |
+
author = {Kushagra},
|
| 379 |
+
title = {Indian Address Parser: Production-grade NER for Indian Addresses},
|
| 380 |
+
year = {2026},
|
| 381 |
+
url = {https://github.com/kushagra/indian-address-parser}
|
| 382 |
+
}
|
| 383 |
+
```
|
src/indian_address_parser.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
pyproject.toml
|
| 3 |
+
src/address_parser/__init__.py
|
| 4 |
+
src/address_parser/cli.py
|
| 5 |
+
src/address_parser/pipeline.py
|
| 6 |
+
src/address_parser/schemas.py
|
| 7 |
+
src/address_parser/models/__init__.py
|
| 8 |
+
src/address_parser/models/bert_crf.py
|
| 9 |
+
src/address_parser/models/config.py
|
| 10 |
+
src/address_parser/postprocessing/__init__.py
|
| 11 |
+
src/address_parser/postprocessing/gazetteer.py
|
| 12 |
+
src/address_parser/postprocessing/rules.py
|
| 13 |
+
src/address_parser/preprocessing/__init__.py
|
| 14 |
+
src/address_parser/preprocessing/hindi.py
|
| 15 |
+
src/address_parser/preprocessing/normalizer.py
|
| 16 |
+
src/indian_address_parser.egg-info/PKG-INFO
|
| 17 |
+
src/indian_address_parser.egg-info/SOURCES.txt
|
| 18 |
+
src/indian_address_parser.egg-info/dependency_links.txt
|
| 19 |
+
src/indian_address_parser.egg-info/entry_points.txt
|
| 20 |
+
src/indian_address_parser.egg-info/requires.txt
|
| 21 |
+
src/indian_address_parser.egg-info/top_level.txt
|
| 22 |
+
tests/test_pipeline.py
|
| 23 |
+
tests/test_postprocessing.py
|
| 24 |
+
tests/test_preprocessing.py
|
src/indian_address_parser.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
src/indian_address_parser.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
address-parser = address_parser.cli:main
|
src/indian_address_parser.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=2.9.1
|
| 2 |
+
transformers>=4.57.6
|
| 3 |
+
tokenizers>=0.22.2
|
| 4 |
+
datasets>=4.5.0
|
| 5 |
+
seqeval>=1.2.2
|
| 6 |
+
numpy>=2.4.1
|
| 7 |
+
pandas>=2.3.3
|
| 8 |
+
scikit-learn>=1.8.0
|
| 9 |
+
tqdm>=4.67.1
|
| 10 |
+
pydantic>=2.12.5
|
| 11 |
+
indic-transliteration>=2.3.75
|
| 12 |
+
regex>=2026.1.15
|
| 13 |
+
rapidfuzz>=3.14.3
|
| 14 |
+
|
| 15 |
+
[all]
|
| 16 |
+
indian-address-parser[api,demo,dev,training]
|
| 17 |
+
|
| 18 |
+
[all-with-onnx]
|
| 19 |
+
indian-address-parser[api,demo,dev,onnx,training]
|
| 20 |
+
|
| 21 |
+
[api]
|
| 22 |
+
fastapi>=0.128.0
|
| 23 |
+
uvicorn[standard]>=0.40.0
|
| 24 |
+
gunicorn>=23.0.0
|
| 25 |
+
python-multipart>=0.0.21
|
| 26 |
+
|
| 27 |
+
[demo]
|
| 28 |
+
gradio>=6.3.0
|
| 29 |
+
|
| 30 |
+
[dev]
|
| 31 |
+
pytest>=9.0.2
|
| 32 |
+
pytest-cov>=7.0.0
|
| 33 |
+
pytest-asyncio>=1.3.0
|
| 34 |
+
black>=26.1.0
|
| 35 |
+
ruff>=0.14.13
|
| 36 |
+
mypy>=1.19.1
|
| 37 |
+
pre-commit>=4.5.1
|
| 38 |
+
|
| 39 |
+
[onnx]
|
| 40 |
+
|
| 41 |
+
[onnx:python_version < "3.14"]
|
| 42 |
+
onnx>=1.20.1
|
| 43 |
+
onnxruntime>=1.23.2
|
| 44 |
+
|
| 45 |
+
[training]
|
| 46 |
+
accelerate>=1.12.0
|
| 47 |
+
wandb>=0.24.0
|
| 48 |
+
optuna>=4.7.0
|
src/indian_address_parser.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
address_parser
|