x committed on
Commit
47bc13b
·
verified ·
1 Parent(s): 976f8e9

Upload folder using huggingface_hub

Browse files
Files changed (47) hide show
  1. README.md +46 -12
  2. app.py +280 -0
  3. requirements.txt +10 -0
  4. src/address_parser/__init__.py +26 -0
  5. src/address_parser/__pycache__/__init__.cpython-312.pyc +0 -0
  6. src/address_parser/__pycache__/__init__.cpython-314.pyc +0 -0
  7. src/address_parser/__pycache__/cli.cpython-314.pyc +0 -0
  8. src/address_parser/__pycache__/pipeline.cpython-312.pyc +0 -0
  9. src/address_parser/__pycache__/pipeline.cpython-314.pyc +0 -0
  10. src/address_parser/__pycache__/schemas.cpython-312.pyc +0 -0
  11. src/address_parser/__pycache__/schemas.cpython-314.pyc +0 -0
  12. src/address_parser/cli.py +132 -0
  13. src/address_parser/models/__init__.py +6 -0
  14. src/address_parser/models/__pycache__/__init__.cpython-312.pyc +0 -0
  15. src/address_parser/models/__pycache__/__init__.cpython-314.pyc +0 -0
  16. src/address_parser/models/__pycache__/bert_crf.cpython-312.pyc +0 -0
  17. src/address_parser/models/__pycache__/bert_crf.cpython-314.pyc +0 -0
  18. src/address_parser/models/__pycache__/config.cpython-312.pyc +0 -0
  19. src/address_parser/models/__pycache__/config.cpython-314.pyc +0 -0
  20. src/address_parser/models/bert_crf.py +439 -0
  21. src/address_parser/models/config.py +103 -0
  22. src/address_parser/pipeline.py +528 -0
  23. src/address_parser/postprocessing/__init__.py +6 -0
  24. src/address_parser/postprocessing/__pycache__/__init__.cpython-312.pyc +0 -0
  25. src/address_parser/postprocessing/__pycache__/__init__.cpython-314.pyc +0 -0
  26. src/address_parser/postprocessing/__pycache__/gazetteer.cpython-312.pyc +0 -0
  27. src/address_parser/postprocessing/__pycache__/gazetteer.cpython-314.pyc +0 -0
  28. src/address_parser/postprocessing/__pycache__/rules.cpython-312.pyc +0 -0
  29. src/address_parser/postprocessing/__pycache__/rules.cpython-314.pyc +0 -0
  30. src/address_parser/postprocessing/gazetteer.py +164 -0
  31. src/address_parser/postprocessing/rules.py +536 -0
  32. src/address_parser/preprocessing/__init__.py +6 -0
  33. src/address_parser/preprocessing/__pycache__/__init__.cpython-312.pyc +0 -0
  34. src/address_parser/preprocessing/__pycache__/__init__.cpython-314.pyc +0 -0
  35. src/address_parser/preprocessing/__pycache__/hindi.cpython-312.pyc +0 -0
  36. src/address_parser/preprocessing/__pycache__/hindi.cpython-314.pyc +0 -0
  37. src/address_parser/preprocessing/__pycache__/normalizer.cpython-312.pyc +0 -0
  38. src/address_parser/preprocessing/__pycache__/normalizer.cpython-314.pyc +0 -0
  39. src/address_parser/preprocessing/hindi.py +242 -0
  40. src/address_parser/preprocessing/normalizer.py +192 -0
  41. src/address_parser/schemas.py +152 -0
  42. src/indian_address_parser.egg-info/PKG-INFO +383 -0
  43. src/indian_address_parser.egg-info/SOURCES.txt +24 -0
  44. src/indian_address_parser.egg-info/dependency_links.txt +1 -0
  45. src/indian_address_parser.egg-info/entry_points.txt +2 -0
  46. src/indian_address_parser.egg-info/requires.txt +48 -0
  47. src/indian_address_parser.egg-info/top_level.txt +1 -0
README.md CHANGED
@@ -1,12 +1,46 @@
1
- ---
2
- title: Indian Address Parser
3
- emoji: 🦀
4
- colorFrom: purple
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 6.5.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Indian Address Parser
3
+ emoji: 🏠
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: "6.3.0"
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # Indian Address Parser
14
+
15
+ Parse unstructured Indian addresses into structured components using **IndicBERTv2-CRF**.
16
+
17
+ ## Features
18
+
19
+ - **Multilingual**: Supports Hindi (Devanagari) + English
20
+ - **15 Entity Types**: House Number, Floor, Block, Gali, Colony, Area, Khasra, Pincode, etc.
21
+ - **~80% F1 score** on held-out test data (mBERT-CRF baseline)
22
+ - **Fast**: < 30ms inference time
23
+
24
+ ## Example
25
+
26
+ **Input:**
27
+ ```
28
+ PLOT NO752 FIRST FLOOR, BLOCK H-3 KH NO 24/1/3/2/2/202, KAUNWAR SINGH NAGAR NEW DELHI, DELHI, 110041
29
+ ```
30
+
31
+ **Output:**
32
+ | Entity | Value |
33
+ |--------|-------|
34
+ | HOUSE_NUMBER | PLOT NO752 |
35
+ | FLOOR | FIRST FLOOR |
36
+ | BLOCK | BLOCK H-3 |
37
+ | KHASRA | KH NO 24/1/3/2/2/202 |
38
+ | AREA | KAUNWAR SINGH NAGAR |
39
+ | CITY | NEW DELHI |
40
+ | PINCODE | 110041 |
41
+
42
+ ## Technical Details
43
+
44
+ - **Model**: ai4bharat/IndicBERTv2-SS + CRF layer
45
+ - **Training Data**: 600+ annotated Delhi addresses
46
+ - **Framework**: PyTorch + HuggingFace Transformers
app.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio demo for Indian Address Parser.
3
+
4
+ Interactive web interface for HuggingFace Spaces deployment.
5
+ Features:
6
+ - Real-time address parsing
7
+ - Entity highlighting
8
+ - Example addresses
9
+ - Confidence scores
10
+ """
11
+
12
+ import os
13
+ import sys
14
+ from pathlib import Path
15
+
16
+ import gradio as gr
17
+
18
+ sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
19
+
20
+ from address_parser import AddressParser, ParsedAddress
21
+
22
# Per-entity highlight colors used by the HTML renderer and the legend.
# HOUSE_NUMBER and PLOT deliberately share one color (both denote the plot id).
ENTITY_COLORS = {
    "HOUSE_NUMBER": "#FF6B6B",  # Red
    "PLOT": "#FF6B6B",          # Red (same family as HOUSE_NUMBER)
    "FLOOR": "#4ECDC4",         # Teal
    "BLOCK": "#45B7D1",         # Blue
    "SECTOR": "#96CEB4",        # Green
    "GALI": "#FFEAA7",          # Yellow
    "COLONY": "#DDA0DD",        # Plum
    "AREA": "#98D8C8",          # Mint
    "SUBAREA": "#F7DC6F",       # Light yellow
    "KHASRA": "#BB8FCE",        # Purple
    "PINCODE": "#85C1E9",       # Light blue
    "CITY": "#F8B500",          # Orange
    "STATE": "#58D68D",         # Light green
}

# Sample Delhi addresses shown in the Gradio "Examples" panel.
EXAMPLES = [
    "PLOT NO752 FIRST FLOOR, BLOCK H-3 KH NO 24/1/3/2/2/202, KAUNWAR SINGH NAGAR NEW DELHI, DELHI, 110041",
    "H.NO. 123, GALI NO. 5, LAJPAT NAGAR, SOUTH DELHI, 110024",
    "FLAT NO A-501, SECTOR 15, DWARKA, NEW DELHI, 110078",
    "KHASRA NO 45/2, VILLAGE MUNDKA, OUTER DELHI, 110041",
    "S-3/166, GROUND FLOOR, KH NO 98/4, GALI NO-6, SWARN PARK MUNDKA, Delhi, 110041",
    "PLOT NO A5 GROUND FLOOR, KHASRA NO 15/20/2 BABA HARI DAS COLONY, TIKARI KALA, DELHI, 110041",
]
48
+
49
+
50
def load_parser():
    """Load the address parser from HuggingFace Hub or a local path.

    Resolution order:
      1. local checkpoint directory (development / testing),
      2. HuggingFace Hub snapshot named by the HF_MODEL_REPO env var,
      3. rules-only fallback (no model available).
    """
    from huggingface_hub import snapshot_download

    # Configuration - HF_MODEL_REPO should be set in Space settings
    hub_repo = os.getenv("HF_MODEL_REPO", "")
    local_path = os.getenv("MODEL_PATH", "./models/address_ner_v3")

    # Try local path first (for development/testing)
    local_dir = Path(local_path)
    if local_dir.exists() and (local_dir / "pytorch_model.bin").exists():
        print(f"Loading model from local path: {local_path}")
        return AddressParser.from_pretrained(local_path, device="cpu")

    # Try HuggingFace Hub next
    if hub_repo:
        try:
            print(f"Downloading model from HuggingFace Hub: {hub_repo}")
            snapshot_dir = snapshot_download(repo_id=hub_repo, repo_type="model")
            print(f"Model downloaded to: {snapshot_dir}")
            return AddressParser.from_pretrained(snapshot_dir, device="cpu")
        except Exception as exc:
            print(f"Failed to load model from HF Hub: {exc}")

    # Fallback to rules-only mode
    print("No model available, using rules-only mode")
    return AddressParser.rules_only()
76
+
77
+
78
+ # Initialize parser
79
+ parser = load_parser()
80
+
81
+
82
def create_highlighted_html(result: "ParsedAddress") -> str:
    """Render the normalized address as HTML, wrapping each entity in a colored span."""
    text = result.normalized_address
    if not result.entities:
        return f"<p>{text}</p>"

    pieces: list[str] = []
    cursor = 0

    # Walk entities in source order, copying plain text between spans.
    for ent in sorted(result.entities, key=lambda e: e.start):
        if ent.start > cursor:
            pieces.append(text[cursor:ent.start])

        bg = ENTITY_COLORS.get(ent.label, "#CCCCCC")
        pieces.append(
            f'<span style="background-color: {bg}; padding: 2px 6px; '
            f'border-radius: 4px; margin: 0 2px; font-weight: bold;" '
            f'title="{ent.label} ({ent.confidence:.0%})">'
            f'{ent.value}</span>'
        )
        cursor = ent.end

    # Trailing plain text after the last entity, if any.
    if cursor < len(text):
        pieces.append(text[cursor:])

    return "".join(pieces)
115
+
116
+
117
def create_entity_table(result: "ParsedAddress") -> list[list[str]]:
    """Build [label, value, confidence%] rows ordered by entity position."""
    rows: list[list[str]] = []
    for ent in sorted(result.entities, key=lambda e: e.start):
        rows.append([ent.label, ent.value, f"{ent.confidence:.0%}"])
    return rows
126
+
127
+
128
def parse_address(address: str) -> tuple[str, list[list[str]], str]:
    """
    Parse address and return results for Gradio interface.

    Returns:
        - Highlighted HTML
        - Entity table
        - Structured output JSON
    """
    import json

    if not address or not address.strip():
        return "<p>Please enter an address</p>", [], "{}"

    result = parser.parse(address)

    # Keep only the fields the parser actually populated, in display order.
    field_names = (
        "house_number", "floor", "block", "gali", "colony", "area",
        "subarea", "sector", "khasra", "pincode", "city", "state",
    )
    structured = {
        name: value
        for name in field_names
        if (value := getattr(result, name))
    }

    return (
        create_highlighted_html(result),
        create_entity_table(result),
        json.dumps(structured, indent=2, ensure_ascii=False),
    )
169
+
170
+
171
# Stylesheet injected into the Gradio page; classes are referenced via
# elem_classes on the output components.
CUSTOM_CSS = """
.highlighted-text {
    font-size: 1.1em;
    line-height: 1.8;
    padding: 15px;
    background: #f8f9fa;
    border-radius: 8px;
}
"""
181
+
182
# Create Gradio interface.
# BUGFIX: `theme` and `css` are gr.Blocks() constructor arguments; the original
# code passed them to demo.launch(), which accepts neither keyword and raises
# TypeError at startup. They are now set on the Blocks context instead.
with gr.Blocks(
    title="Indian Address Parser",
    theme=gr.themes.Soft(),
    css=CUSTOM_CSS,
) as demo:
    gr.Markdown(
        """
        # Indian Address Parser

        Parse unstructured Indian addresses into structured components using
        **mBERT-CRF** (Multilingual BERT with Conditional Random Field).

        ## Features
        - Supports Hindi + English (Devanagari and Latin scripts)
        - 15 entity types: House Number, Floor, Block, Gali, Colony, Area, Khasra, Pincode, etc.
        - Delhi-specific locality gazetteer for improved accuracy
        - < 30ms inference time

        ---
        """
    )

    with gr.Row():
        with gr.Column(scale=2):
            address_input = gr.Textbox(
                label="Enter Address",
                placeholder="e.g., PLOT NO752 FIRST FLOOR, BLOCK H-3, NEW DELHI, 110041",
                lines=3,
            )
            parse_btn = gr.Button("Parse Address", variant="primary")

    gr.Examples(
        examples=[[ex] for ex in EXAMPLES],
        inputs=[address_input],
        label="Example Addresses",
    )

    gr.Markdown("## Results")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Highlighted Entities")
            highlighted_output = gr.HTML(
                elem_classes=["highlighted-text"]
            )

        with gr.Column(scale=1):
            gr.Markdown("### Extracted Entities")
            entity_table = gr.Dataframe(
                headers=["Entity Type", "Value", "Confidence"],
                datatype=["str", "str", "str"],
                row_count=10,
            )

    with gr.Row():
        gr.Markdown("### Structured Output")
        structured_output = gr.Code(
            language="json",
            label="Structured JSON",
        )

    # Legend mapping each entity type to its highlight color.
    gr.Markdown("### Entity Legend")
    legend_html = " ".join([
        f'<span style="background-color: {color}; padding: 2px 8px; '
        f'border-radius: 4px; margin: 2px; display: inline-block;">{label}</span>'
        for label, color in ENTITY_COLORS.items()
    ])
    gr.HTML(f"<div style='line-height: 2.5;'>{legend_html}</div>")

    # Footer
    gr.Markdown(
        """
        ---
        **Model**: IndicBERTv2-SS + CRF (ai4bharat/IndicBERTv2-SS + CRF layer)
        | **Training Data**: 600+ annotated Delhi addresses
        | **GitHub**: [indian-address-parser](https://github.com/howdoiusekeyboard/indian-address-parser)
        """
    )

    # Event handlers: button click and Enter key both trigger parsing.
    parse_btn.click(
        fn=parse_address,
        inputs=[address_input],
        outputs=[highlighted_output, entity_table, structured_output],
    )

    address_input.submit(
        fn=parse_address,
        inputs=[address_input],
        outputs=[highlighted_output, entity_table, structured_output],
    )


if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", "7860")),
        share=False,
    )
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # HuggingFace Spaces requirements (Python 3.14)
2
+ torch>=2.9.1
3
+ transformers>=4.57.6
4
+ tokenizers>=0.22.2
5
+ huggingface_hub>=0.25.0
6
+ gradio>=6.3.0
7
+ pydantic>=2.12.5
8
+ indic-transliteration>=2.3.75
9
+ rapidfuzz>=3.14.3
10
+ regex>=2026.1.15
src/address_parser/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Indian Address Parser - Production-grade NER for Indian addresses.
3
+
4
+ A modern NLP system for parsing unstructured Indian addresses into
5
+ structured components using mBERT-CRF architecture with Hindi+English support.
6
+ """
7
+
8
+ __version__ = "2.0.0"
9
+ __author__ = "Kushagra"
10
+
11
+ from address_parser.pipeline import AddressParser
12
+ from address_parser.schemas import (
13
+ AddressEntity,
14
+ ParsedAddress,
15
+ ParseRequest,
16
+ ParseResponse,
17
+ )
18
+
19
+ __all__ = [
20
+ "AddressParser",
21
+ "AddressEntity",
22
+ "ParsedAddress",
23
+ "ParseRequest",
24
+ "ParseResponse",
25
+ "__version__",
26
+ ]
src/address_parser/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (741 Bytes). View file
 
src/address_parser/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (738 Bytes). View file
 
src/address_parser/__pycache__/cli.cpython-314.pyc ADDED
Binary file (6.4 kB). View file
 
src/address_parser/__pycache__/pipeline.cpython-312.pyc ADDED
Binary file (16.9 kB). View file
 
src/address_parser/__pycache__/pipeline.cpython-314.pyc ADDED
Binary file (19.6 kB). View file
 
src/address_parser/__pycache__/schemas.cpython-312.pyc ADDED
Binary file (7.94 kB). View file
 
src/address_parser/__pycache__/schemas.cpython-314.pyc ADDED
Binary file (10.2 kB). View file
 
src/address_parser/cli.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Command-line interface for Indian Address Parser."""
2
+
3
+ import argparse
4
+ import json
5
+ import sys
6
+ from pathlib import Path
7
+
8
+
9
def _emit_json(results, output_path):
    """Serialize parse results as JSON to a file or to stdout."""
    payload = json.dumps(
        [r.model_dump() for r in results], indent=2, ensure_ascii=False
    )
    if output_path:
        with open(output_path, "w", encoding="utf-8") as fh:
            fh.write(payload)
        print(f"Saved to {output_path}", file=sys.stderr)
    else:
        print(payload)


def _emit_table(results):
    """Print one fixed-width entity table per parsed address."""
    for idx, res in enumerate(results):
        print(f"\n{'='*60}")
        print(f"Address {idx+1}: {res.raw_address[:50]}...")
        print(f"{'='*60}")
        print(f"{'Entity':<15} {'Value':<40} {'Conf':<6}")
        print("-" * 60)
        for ent in res.entities:
            print(f"{ent.label:<15} {ent.value:<40} {ent.confidence:.0%}")


def _emit_simple(results):
    """Print a compact one-line summary per parsed address."""
    display = [
        ("house_number", "House"),
        ("floor", "Floor"),
        ("block", "Block"),
        ("gali", "Gali"),
        ("colony", "Colony"),
        ("area", "Area"),
        ("pincode", "PIN"),
        ("city", "City"),
    ]
    for res in results:
        parts = [
            f"{title}: {value}"
            for attr, title in display
            if (value := getattr(res, attr))
        ]
        print(" | ".join(parts) if parts else "No entities found")


def main():
    """Main CLI entry point."""
    arg_parser = argparse.ArgumentParser(
        description="Parse Indian addresses using NER",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Parse single address
  address-parser "PLOT NO752 FIRST FLOOR, NEW DELHI, 110041"

  # Parse from file
  address-parser --input addresses.txt --output parsed.json

  # Use trained model
  address-parser --model ./models/address_ner_v3 "H.NO. 123, LAJPAT NAGAR"
""",
    )

    arg_parser.add_argument(
        "address",
        nargs="?",
        help="Address to parse (or use --input for file)",
    )
    arg_parser.add_argument(
        "--input", "-i",
        help="Input file with addresses (one per line)",
    )
    arg_parser.add_argument(
        "--output", "-o",
        help="Output JSON file",
    )
    arg_parser.add_argument(
        "--model", "-m",
        help="Path to trained model directory",
    )
    arg_parser.add_argument(
        "--format", "-f",
        choices=["json", "table", "simple"],
        default="json",
        help="Output format (default: json)",
    )
    arg_parser.add_argument(
        "--version", "-v",
        action="version",
        version="indian-address-parser 2.0.0",
    )

    args = arg_parser.parse_args()

    # Import here to avoid slow startup
    from address_parser import AddressParser

    # Status messages go to stderr so stdout stays machine-readable.
    if args.model and Path(args.model).exists():
        print(f"Loading model from {args.model}...", file=sys.stderr)
        engine = AddressParser.from_pretrained(args.model)
    else:
        print("Using rules-only mode", file=sys.stderr)
        engine = AddressParser.rules_only()

    # Collect addresses from file, positional argument, or bail with help.
    if args.input:
        with open(args.input, encoding="utf-8") as fh:
            addresses = [line.strip() for line in fh if line.strip()]
    elif args.address:
        addresses = [args.address]
    else:
        arg_parser.print_help()
        sys.exit(1)

    results = [engine.parse(addr) for addr in addresses]

    if args.format == "json":
        _emit_json(results, args.output)
    elif args.format == "table":
        _emit_table(results)
    else:  # simple
        _emit_simple(results)


if __name__ == "__main__":
    main()
src/address_parser/models/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Model architectures for address NER."""
2
+
3
+ from address_parser.models.bert_crf import BertCRFForTokenClassification
4
+ from address_parser.models.config import ModelConfig
5
+
6
+ __all__ = ["BertCRFForTokenClassification", "ModelConfig"]
src/address_parser/models/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (435 Bytes). View file
 
src/address_parser/models/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (432 Bytes). View file
 
src/address_parser/models/__pycache__/bert_crf.cpython-312.pyc ADDED
Binary file (16.8 kB). View file
 
src/address_parser/models/__pycache__/bert_crf.cpython-314.pyc ADDED
Binary file (20 kB). View file
 
src/address_parser/models/__pycache__/config.cpython-312.pyc ADDED
Binary file (3.13 kB). View file
 
src/address_parser/models/__pycache__/config.cpython-314.pyc ADDED
Binary file (3.77 kB). View file
 
src/address_parser/models/bert_crf.py ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BERT-CRF Model for Indian Address NER.
3
+
4
+ Combines a multilingual BERT encoder with a Conditional Random Field (CRF)
5
+ layer for improved sequence labeling performance.
6
+ """
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from transformers import AutoModel
11
+ from transformers.modeling_outputs import TokenClassifierOutput
12
+
13
+ from address_parser.models.config import ID2LABEL, LABEL2ID, ModelConfig
14
+
15
+
16
+ class CRF(nn.Module):
17
+ """
18
+ Conditional Random Field layer for sequence labeling.
19
+
20
+ Implements the forward algorithm for computing log-likelihood
21
+ and Viterbi decoding for inference.
22
+ """
23
+
24
+ def __init__(self, num_tags: int, batch_first: bool = True):
25
+ """
26
+ Initialize CRF layer.
27
+
28
+ Args:
29
+ num_tags: Number of output tags
30
+ batch_first: If True, input is (batch, seq, features)
31
+ """
32
+ super().__init__()
33
+ self.num_tags = num_tags
34
+ self.batch_first = batch_first
35
+
36
+ # Transition matrix: transitions[i, j] = score of transitioning from tag i to tag j
37
+ self.transitions = nn.Parameter(torch.randn(num_tags, num_tags))
38
+
39
+ # Start and end transition scores
40
+ self.start_transitions = nn.Parameter(torch.randn(num_tags))
41
+ self.end_transitions = nn.Parameter(torch.randn(num_tags))
42
+
43
+ self._init_transitions()
44
+
45
+ def _init_transitions(self):
46
+ """Initialize transition parameters."""
47
+ nn.init.uniform_(self.transitions, -0.1, 0.1)
48
+ nn.init.uniform_(self.start_transitions, -0.1, 0.1)
49
+ nn.init.uniform_(self.end_transitions, -0.1, 0.1)
50
+
51
+ def forward(
52
+ self,
53
+ emissions: torch.Tensor,
54
+ tags: torch.LongTensor,
55
+ mask: torch.ByteTensor | None = None,
56
+ reduction: str = "mean",
57
+ ) -> torch.Tensor:
58
+ """
59
+ Compute negative log-likelihood loss.
60
+
61
+ Args:
62
+ emissions: Emission scores from BERT (batch, seq, num_tags)
63
+ tags: Gold standard tags (batch, seq)
64
+ mask: Mask for valid tokens (batch, seq)
65
+ reduction: 'mean', 'sum', or 'none'
66
+
67
+ Returns:
68
+ Negative log-likelihood loss
69
+ """
70
+ if mask is None:
71
+ mask = torch.ones_like(tags, dtype=torch.bool)
72
+
73
+ if self.batch_first:
74
+ emissions = emissions.transpose(0, 1)
75
+ tags = tags.transpose(0, 1)
76
+ mask = mask.transpose(0, 1)
77
+
78
+ # Compute log-likelihood
79
+ numerator = self._compute_score(emissions, tags, mask)
80
+ denominator = self._compute_normalizer(emissions, mask)
81
+ llh = numerator - denominator
82
+
83
+ if reduction == "mean":
84
+ return -llh.mean()
85
+ elif reduction == "sum":
86
+ return -llh.sum()
87
+ else:
88
+ return -llh
89
+
90
+ def decode(
91
+ self,
92
+ emissions: torch.Tensor,
93
+ mask: torch.ByteTensor | None = None,
94
+ ) -> list[list[int]]:
95
+ """
96
+ Find the most likely tag sequence using Viterbi algorithm.
97
+
98
+ Args:
99
+ emissions: Emission scores (batch, seq, num_tags)
100
+ mask: Mask for valid tokens (batch, seq)
101
+
102
+ Returns:
103
+ List of best tag sequences for each sample
104
+ """
105
+ if mask is None:
106
+ mask = torch.ones(emissions.shape[:2], dtype=torch.bool, device=emissions.device)
107
+
108
+ if self.batch_first:
109
+ emissions = emissions.transpose(0, 1)
110
+ mask = mask.transpose(0, 1)
111
+
112
+ return self._viterbi_decode(emissions, mask)
113
+
114
+ def _compute_score(
115
+ self,
116
+ emissions: torch.Tensor,
117
+ tags: torch.LongTensor,
118
+ mask: torch.BoolTensor
119
+ ) -> torch.Tensor:
120
+ """Compute the score of a given tag sequence."""
121
+ seq_length, batch_size = tags.shape
122
+ mask = mask.float()
123
+
124
+ # Start transition score
125
+ score = self.start_transitions[tags[0]]
126
+
127
+ for i in range(seq_length - 1):
128
+ current_tag = tags[i]
129
+ next_tag = tags[i + 1]
130
+
131
+ # Emission score
132
+ score += emissions[i, torch.arange(batch_size), current_tag] * mask[i]
133
+
134
+ # Transition score
135
+ score += self.transitions[current_tag, next_tag] * mask[i + 1]
136
+
137
+ # Last emission score
138
+ last_tag_idx = mask.long().sum(dim=0) - 1
139
+ last_tags = tags.gather(0, last_tag_idx.unsqueeze(0)).squeeze(0)
140
+ score += emissions[last_tag_idx, torch.arange(batch_size), last_tags]
141
+
142
+ # End transition score
143
+ score += self.end_transitions[last_tags]
144
+
145
+ return score
146
+
147
+ def _compute_normalizer(
148
+ self,
149
+ emissions: torch.Tensor,
150
+ mask: torch.BoolTensor
151
+ ) -> torch.Tensor:
152
+ """Compute log-sum-exp of all possible tag sequences (partition function)."""
153
+ seq_length = emissions.shape[0]
154
+
155
+ # Initialize with start transitions
156
+ score = self.start_transitions + emissions[0]
157
+
158
+ for i in range(1, seq_length):
159
+ # Broadcast score and transitions for all combinations
160
+ broadcast_score = score.unsqueeze(2)
161
+ broadcast_emissions = emissions[i].unsqueeze(1)
162
+
163
+ # Compute next scores
164
+ next_score = broadcast_score + self.transitions + broadcast_emissions
165
+
166
+ # Log-sum-exp
167
+ next_score = torch.logsumexp(next_score, dim=1)
168
+
169
+ # Mask
170
+ score = torch.where(mask[i].unsqueeze(1), next_score, score)
171
+
172
+ # Add end transitions
173
+ score += self.end_transitions
174
+
175
+ return torch.logsumexp(score, dim=1)
176
+
177
+ def _viterbi_decode(
178
+ self,
179
+ emissions: torch.Tensor,
180
+ mask: torch.BoolTensor
181
+ ) -> list[list[int]]:
182
+ """Viterbi decoding to find best tag sequence."""
183
+ seq_length, batch_size, num_tags = emissions.shape
184
+
185
+ # Initialize
186
+ score = self.start_transitions + emissions[0]
187
+ history = []
188
+
189
+ for i in range(1, seq_length):
190
+ broadcast_score = score.unsqueeze(2)
191
+ broadcast_emissions = emissions[i].unsqueeze(1)
192
+
193
+ next_score = broadcast_score + self.transitions + broadcast_emissions
194
+
195
+ # Find best previous tag for each current tag
196
+ next_score, indices = next_score.max(dim=1)
197
+
198
+ # Apply mask
199
+ score = torch.where(mask[i].unsqueeze(1), next_score, score)
200
+ history.append(indices)
201
+
202
+ # Add end transitions
203
+ score += self.end_transitions
204
+
205
+ # Backtrack
206
+ seq_ends = mask.long().sum(dim=0) - 1
207
+ best_tags_list = []
208
+
209
+ for batch_idx in range(batch_size):
210
+ # Best last tag
211
+ _, best_last_tag = score[batch_idx].max(dim=0)
212
+ best_tags = [best_last_tag.item()]
213
+
214
+ # Backtrack through history
215
+ for hist in reversed(history[:seq_ends[batch_idx]]):
216
+ best_last_tag = hist[batch_idx][best_tags[-1]]
217
+ best_tags.append(best_last_tag.item())
218
+
219
+ best_tags.reverse()
220
+ best_tags_list.append(best_tags)
221
+
222
+ return best_tags_list
223
+
224
+
225
class BertCRFForTokenClassification(nn.Module):
    """
    BERT model with CRF layer for token classification.

    A multilingual BERT encoder produces per-token emission scores via a
    linear classifier; when ``config.use_crf`` is set, a CRF head models
    label transitions, supplying both the training loss and Viterbi decoding.
    """

    def __init__(self, config: ModelConfig):
        """
        Initialize BERT-CRF model.

        Args:
            config: Model configuration (base checkpoint, label count, CRF flag)
        """
        super().__init__()
        self.config = config
        self.num_labels = config.num_labels

        # Pretrained encoder (downloads on first use unless cached).
        self.bert = AutoModel.from_pretrained(
            config.model_name,
            cache_dir=config.cache_dir,
        )

        self.dropout = nn.Dropout(config.classifier_dropout)

        # Per-token classification head producing emission scores.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Optional CRF head.
        if config.use_crf:
            self.crf = CRF(num_tags=config.num_labels, batch_first=True)
        else:
            self.crf = None

        # Label mappings (module-level constants from config.py).
        self.id2label = ID2LABEL
        self.label2id = LABEL2ID

        # Lazily-created torch.compile wrapper used by decode().
        self._compiled_forward: nn.Module | None = None

    def _get_compiled_forward(self):
        """Compile the forward pass once; fall back to eager on failure."""
        import os
        import sys

        # The inductor backend needs a C++ compiler (cl on Windows,
        # gcc/clang on Linux); skip on Windows or when explicitly disabled.
        skip_compile = (
            os.environ.get("TORCH_COMPILE_DISABLE", "0") == "1"
            or sys.platform == "win32"
        )

        if self._compiled_forward is None:
            if not skip_compile and hasattr(torch, "compile"):
                try:
                    self._compiled_forward = torch.compile(
                        self.forward,
                        backend="inductor",
                        mode="reduce-overhead",
                        dynamic=True,
                    )
                except Exception:
                    self._compiled_forward = self.forward
            else:
                self._compiled_forward = self.forward
        return self._compiled_forward

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        return_dict: bool = True,
    ):
        """
        Forward pass.

        Args:
            input_ids: Input token IDs (batch, seq)
            attention_mask: Attention mask (batch, seq)
            token_type_ids: Token type IDs (batch, seq)
            labels: Gold labels for training (batch, seq); -100 marks ignored tokens
            return_dict: Return TokenClassifierOutput instead of a tuple

        Returns:
            TokenClassifierOutput with loss (when labels are given), logits,
            hidden states, and attentions
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        sequence_output = self.dropout(outputs.last_hidden_state)

        # Emission scores for every token.
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.crf is not None:
                # The CRF cannot see -100 (ignore_index); those positions are
                # excluded by the attention mask, so any valid tag id works.
                mask = attention_mask.bool() if attention_mask is not None else None
                crf_labels = labels.clone()
                crf_labels[crf_labels == -100] = 0
                loss = self.crf(logits, crf_labels, mask=mask, reduction=self.config.crf_reduction)
            else:
                # Standard cross-entropy; the default ignore_index=-100 drops
                # sub-word / special-token labels.
                loss_fct = nn.CrossEntropyLoss()
                flat_logits = logits.view(-1, self.num_labels)
                flat_labels = labels.view(-1)
                if attention_mask is not None:
                    active = attention_mask.view(-1) == 1
                    loss = loss_fct(flat_logits[active], flat_labels[active])
                else:
                    # BUGFIX: previously dereferenced attention_mask
                    # unconditionally and crashed with AttributeError
                    # when it was None.
                    loss = loss_fct(flat_logits, flat_labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def decode(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
    ) -> list[list[int]]:
        """
        Decode input to tag sequences using the compiled forward pass.

        Args:
            input_ids: Input token IDs (batch, seq)
            attention_mask: Attention mask (batch, seq)
            token_type_ids: Token type IDs (batch, seq)

        Returns:
            List of predicted tag-id sequences (CRF Viterbi paths when a CRF
            head is present, otherwise per-token argmax)
        """
        self.eval()
        with torch.no_grad():
            # Use compiled forward for optimized inference (PyTorch 2.x).
            forward_fn = self._get_compiled_forward()
            outputs = forward_fn(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )

            logits = outputs.logits

            if self.crf is not None:
                mask = attention_mask.bool() if attention_mask is not None else None
                predictions = self.crf.decode(logits, mask=mask)
            else:
                predictions = logits.argmax(dim=-1).tolist()

        return predictions

    def save_pretrained(self, save_directory: str):
        """Save model weights and a JSON config to *save_directory*."""
        import json
        import os

        os.makedirs(save_directory, exist_ok=True)

        # Save model weights
        torch.save(self.state_dict(), os.path.join(save_directory, "pytorch_model.bin"))

        # Save config alongside the label mappings for round-tripping.
        config_dict = {
            "model_name": self.config.model_name,
            "num_labels": self.config.num_labels,
            "use_crf": self.config.use_crf,
            "hidden_size": self.config.hidden_size,
            "classifier_dropout": self.config.classifier_dropout,
            "id2label": self.id2label,
            "label2id": self.label2id,
        }
        with open(os.path.join(save_directory, "config.json"), "w") as f:
            json.dump(config_dict, f, indent=2)

    @classmethod
    def from_pretrained(cls, model_path: str, device: str = "cpu"):
        """Load a model previously stored with :meth:`save_pretrained`."""
        import json

        with open(f"{model_path}/config.json") as f:
            config_dict = json.load(f)

        config = ModelConfig(
            model_name=config_dict["model_name"],
            num_labels=config_dict["num_labels"],
            use_crf=config_dict["use_crf"],
            hidden_size=config_dict["hidden_size"],
            classifier_dropout=config_dict["classifier_dropout"],
        )

        model = cls(config)
        state_dict = torch.load(f"{model_path}/pytorch_model.bin", map_location=device)
        model.load_state_dict(state_dict)
        model.to(device)

        return model
src/address_parser/models/config.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Model configuration for address NER."""
2
+
3
+ from dataclasses import dataclass
4
+
5
+
6
+ @dataclass
7
+ class ModelConfig:
8
+ """Configuration for BERT-CRF NER model."""
9
+
10
+ # Base model - IndicBERTv2-SS recommended for Indian languages
11
+ # Options: "bert-base-multilingual-cased", "ai4bharat/IndicBERTv2-SS",
12
+ # "google/muril-base-cased", "xlm-roberta-base"
13
+ model_name: str = "ai4bharat/IndicBERTv2-SS"
14
+ use_crf: bool = True
15
+
16
+ # Architecture
17
+ hidden_size: int = 768
18
+ num_labels: int = 31 # O + 15 entity types * 2 (B-/I-)
19
+ hidden_dropout_prob: float = 0.1
20
+ classifier_dropout: float = 0.1
21
+
22
+ # CRF settings
23
+ crf_reduction: str = "mean" # 'mean' or 'sum'
24
+
25
+ # Training
26
+ max_length: int = 128
27
+ learning_rate: float = 5e-5
28
+ crf_learning_rate: float = 1e-3 # Higher LR for CRF
29
+ weight_decay: float = 0.01
30
+ warmup_ratio: float = 0.1
31
+ num_epochs: int = 10
32
+ batch_size: int = 16
33
+ gradient_accumulation_steps: int = 1
34
+
35
+ # Label smoothing
36
+ label_smoothing: float = 0.0
37
+
38
+ # Early stopping
39
+ early_stopping_patience: int = 5
40
+ early_stopping_threshold: float = 0.001
41
+
42
+ # Layer-wise learning rate decay
43
+ lr_decay: float = 0.95
44
+
45
+ # Paths
46
+ output_dir: str = "./models"
47
+ cache_dir: str | None = None
48
+
49
+ # ONNX export
50
+ onnx_opset_version: int = 14
51
+
52
+ @classmethod
53
+ def from_pretrained_name(cls, name: str) -> ModelConfig:
54
+ """Create config for known pretrained models."""
55
+ configs = {
56
+ "mbert": cls(
57
+ model_name="bert-base-multilingual-cased",
58
+ hidden_size=768,
59
+ ),
60
+ "indicbert": cls(
61
+ model_name="ai4bharat/IndicBERTv2-SS",
62
+ hidden_size=768,
63
+ ),
64
+ "distilbert": cls(
65
+ model_name="distilbert-base-multilingual-cased",
66
+ hidden_size=768,
67
+ ),
68
+ "xlm-roberta": cls(
69
+ model_name="xlm-roberta-base",
70
+ hidden_size=768,
71
+ ),
72
+ "muril": cls(
73
+ model_name="google/muril-base-cased",
74
+ hidden_size=768,
75
+ ),
76
+ }
77
+ return configs.get(name, cls())
78
+
79
+
80
# Entity label definitions (must match schemas.py)
ENTITY_LABELS = [
    "AREA",
    "SUBAREA",
    "HOUSE_NUMBER",
    "SECTOR",
    "GALI",
    "COLONY",
    "BLOCK",
    "CAMP",
    "POLE",
    "KHASRA",
    "FLOOR",
    "PLOT",
    "PINCODE",
    "CITY",
    "STATE",
]

# Derived BIO tag set: "O" first, then every B- tag, then every I- tag
# (the ordering fixes the integer ids used by the model head).
BIO_LABELS = ["O"] + [f"{prefix}-{name}" for prefix in ("B", "I") for name in ENTITY_LABELS]
LABEL2ID = {tag: idx for idx, tag in enumerate(BIO_LABELS)}
ID2LABEL = dict(enumerate(BIO_LABELS))
NUM_LABELS = len(BIO_LABELS)
src/address_parser/pipeline.py ADDED
@@ -0,0 +1,528 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main address parsing pipeline.
3
+
4
+ Orchestrates preprocessing, model inference, and post-processing
5
+ to extract structured entities from Indian addresses.
6
+ """
7
+
8
+ import time
9
+ import warnings
10
+ from pathlib import Path
11
+
12
+ from transformers import AutoTokenizer, logging as hf_logging
13
+
14
+ # Suppress false positive tokenizer warnings in transformers 4.57+
15
+ # The Mistral regex warning is incorrectly triggered for BERT tokenizers
16
+ hf_logging.set_verbosity_error()
17
+ warnings.filterwarnings("ignore", message=".*incorrect regex pattern.*")
18
+
19
+ from address_parser.models.config import ID2LABEL, ModelConfig
20
+ from address_parser.postprocessing import DelhiGazetteer, RuleBasedRefiner
21
+ from address_parser.preprocessing import AddressNormalizer, HindiTransliterator
22
+ from address_parser.schemas import (
23
+ AddressEntity,
24
+ BatchParseResponse,
25
+ ParsedAddress,
26
+ ParseResponse,
27
+ )
28
+
29
+
30
class AddressParser:
    """
    Main address parsing pipeline.

    Combines:
    - Text normalization and Hindi transliteration
    - mBERT-CRF model for NER
    - Rule-based post-processing with gazetteer

    Example:
        >>> parser = AddressParser.from_pretrained("./models/address_ner_v3")
        >>> result = parser.parse("PLOT NO752 FIRST FLOOR, NEW DELHI, 110041")
        >>> print(result.house_number)  # "PLOT NO752"
    """

    def __init__(
        self,
        model=None,
        tokenizer=None,
        config: ModelConfig | None = None,
        device: str = "cpu",
        use_rules: bool = True,
        use_gazetteer: bool = True,
    ):
        """
        Initialize parser.

        Args:
            model: Trained NER model (BertCRFForTokenClassification)
            tokenizer: HuggingFace tokenizer
            config: Model configuration
            device: Device to run on ('cpu', 'cuda', 'mps')
            use_rules: Enable rule-based post-processing
            use_gazetteer: Enable gazetteer for validation
        """
        self.model = model
        self.tokenizer = tokenizer
        self.config = config or ModelConfig()
        self.device = device

        # Preprocessing (always active, even in rules-only mode)
        self.normalizer = AddressNormalizer(uppercase=True, expand_abbrev=True)
        self.transliterator = HindiTransliterator(use_known_terms=True)

        # Post-processing (optional)
        self.refiner = RuleBasedRefiner(use_gazetteer=use_gazetteer) if use_rules else None
        self.gazetteer = DelhiGazetteer() if use_gazetteer else None

        # Put the model in inference mode on the requested device
        if self.model is not None:
            self.model.to(device)
            self.model.eval()

    # NOTE: the "AddressParser" return annotations below are quoted because
    # the class name is not bound while the class body executes; unquoted
    # annotations raise NameError at import time on Python < 3.14.
    @classmethod
    def from_pretrained(
        cls,
        model_path: str | Path,
        device: str = "cpu",
        use_rules: bool = True,
        use_gazetteer: bool = True,
    ) -> "AddressParser":
        """
        Load parser from pretrained model directory.

        Args:
            model_path: Path to saved model directory
            device: Device to run on
            use_rules: Enable rule-based post-processing
            use_gazetteer: Enable gazetteer for validation

        Returns:
            Initialized AddressParser
        """
        from address_parser.models import BertCRFForTokenClassification

        model_path = Path(model_path)

        # Model and tokenizer are expected to live in the same directory
        model = BertCRFForTokenClassification.from_pretrained(str(model_path), device=device)
        tokenizer = AutoTokenizer.from_pretrained(str(model_path))

        return cls(
            model=model,
            tokenizer=tokenizer,
            device=device,
            use_rules=use_rules,
            use_gazetteer=use_gazetteer,
        )

    @classmethod
    def rules_only(cls, use_gazetteer: bool = True) -> "AddressParser":
        """
        Create a rules-only parser (no ML model).

        Useful for testing or when model is not available.
        """
        return cls(
            model=None,
            tokenizer=None,
            use_rules=True,
            use_gazetteer=use_gazetteer,
        )

    def parse(self, address: str) -> ParsedAddress:
        """
        Parse a single address.

        Args:
            address: Raw address string

        Returns:
            ParsedAddress with extracted entities
        """
        # Blank input short-circuits with an empty result
        if not address or not address.strip():
            return ParsedAddress(
                raw_address=address,
                normalized_address="",
                entities=[]
            )

        # Preprocessing
        normalized = self._preprocess(address)

        # Model inference (or rules-only fallback)
        entities = self._extract_entities(normalized)

        # Post-processing
        if self.refiner:
            entities = self.refiner.refine(normalized, entities)

        return ParsedAddress(
            raw_address=address,
            normalized_address=normalized,
            entities=entities
        )

    def parse_with_timing(self, address: str) -> ParseResponse:
        """
        Parse address and return response with timing info.

        Args:
            address: Raw address string

        Returns:
            ParseResponse with result and timing; failures are reported in
            the response (success=False) rather than raised.
        """
        start = time.perf_counter()

        try:
            result = self.parse(address)
            elapsed = (time.perf_counter() - start) * 1000

            return ParseResponse(
                success=True,
                result=result,
                inference_time_ms=elapsed
            )
        except Exception as e:
            elapsed = (time.perf_counter() - start) * 1000
            return ParseResponse(
                success=False,
                error=str(e),
                inference_time_ms=elapsed
            )

    def parse_batch(self, addresses: list[str]) -> BatchParseResponse:
        """
        Parse multiple addresses.

        Args:
            addresses: List of raw address strings

        Returns:
            BatchParseResponse with all results
        """
        start = time.perf_counter()

        results = [self.parse(address) for address in addresses]

        total_time = (time.perf_counter() - start) * 1000
        avg_time = total_time / len(addresses) if addresses else 0

        return BatchParseResponse(
            success=True,
            results=results,
            total_inference_time_ms=total_time,
            avg_inference_time_ms=avg_time
        )

    def _preprocess(self, text: str) -> str:
        """Preprocess address text: transliterate Hindi, then normalize."""
        # Handle Hindi (Devanagari) text before normalization
        if self.transliterator.contains_devanagari(text):
            text = self.transliterator.normalize_mixed_script(text)

        return self.normalizer.normalize(text)

    def _extract_entities(self, text: str) -> list[AddressEntity]:
        """Extract entities using the NER model (rules-only fallback without one)."""
        if self.model is None or self.tokenizer is None:
            # Rules-only mode
            return self._extract_entities_rules_only(text)

        # Tokenize, keeping character offsets so predictions can be mapped
        # back to spans of the input text
        encoding = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=self.config.max_length,
            return_offsets_mapping=True,
            padding=True,
        )

        # Offsets are consumed here, not by the model
        offset_mapping = encoding.pop("offset_mapping")[0].tolist()

        input_ids = encoding["input_ids"].to(self.device)
        attention_mask = encoding["attention_mask"].to(self.device)

        # Inference; assumes decode() returns one tag id per tokenizer
        # position for the single input -- TODO confirm for CRF masked decode
        predictions = self.model.decode(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )[0]  # First (and only) sample

        entities = self._predictions_to_entities(
            text=text,
            predictions=predictions,
            offset_mapping=offset_mapping,
            attention_mask=encoding["attention_mask"][0].tolist(),
        )

        return entities

    def _extract_entities_rules_only(self, text: str) -> list[AddressEntity]:
        """Extract entities using comprehensive rules (no ML)."""
        import re
        entities = []
        text_upper = text.upper()

        # Known localities (multi-word)
        known_localities = [
            "LAJPAT NAGAR", "MALVIYA NAGAR", "HAUZ KHAS", "GREEN PARK",
            "GREATER KAILASH", "DEFENCE COLONY", "SOUTH EXTENSION", "KALKAJI",
            "CIVIL LINES", "MODEL TOWN", "MUKHERJEE NAGAR", "KAMLA NAGAR",
            "PREET VIHAR", "MAYUR VIHAR", "LAKSHMI NAGAR", "GANDHI NAGAR",
            "JANAKPURI", "DWARKA", "UTTAM NAGAR", "TILAK NAGAR", "RAJOURI GARDEN",
            "PUNJABI BAGH", "PASCHIM VIHAR", "KAROL BAGH", "CONNAUGHT PLACE",
            "KAUNWAR SINGH NAGAR", "PALAM COLONY", "RAJ NAGAR", "SADH NAGAR",
            "VIJAY ENCLAVE", "DURGA PARK", "SWARN PARK", "CHANCHAL PARK",
        ]

        for locality in known_localities:
            pos = text_upper.find(locality)
            if pos >= 0:
                entities.append(AddressEntity(
                    label="SUBAREA",
                    value=text[pos:pos + len(locality)],
                    start=pos,
                    end=pos + len(locality),
                    confidence=0.95
                ))

        # Area patterns (directional)
        area_patterns = [
            (r'\bSOUTH\s+DELHI\b', "SOUTH DELHI"),
            (r'\bNORTH\s+DELHI\b', "NORTH DELHI"),
            (r'\bEAST\s+DELHI\b', "EAST DELHI"),
            (r'\bWEST\s+DELHI\b', "WEST DELHI"),
            (r'\bCENTRAL\s+DELHI\b', "CENTRAL DELHI"),
            (r'\bOUTER\s+DELHI\b', "OUTER DELHI"),
        ]

        for pattern, area_name in area_patterns:
            match = re.search(pattern, text_upper)
            if match:
                entities.append(AddressEntity(
                    label="AREA",
                    value=area_name,
                    start=match.start(),
                    end=match.end(),
                    confidence=0.95
                ))

        # House number patterns (order matters - more specific first)
        house_patterns = [
            r'\b(?:FLAT\s*NO\.?\s*)[A-Z]?[-]?\d+[A-Z]?(?:[-/]\d+)*\b',
            r'\b(?:PLOT\s*NO\.?)\s*[A-Z]?\d+[A-Z]?(?:[-/]\d+)*\b',
            r'\b(?:H\.?\s*NO\.?|HOUSE\s*NO\.?|HNO)\s*[A-Z]?\d+[A-Z]?(?:[-/]\d+)*\b',
            r'\b[RW]Z[-\s]?[A-Z]?[-/]?\d+[A-Z]?(?:[-/]\d+)*\b',
        ]

        for pattern in house_patterns:
            match = re.search(pattern, text_upper)
            if match:
                entities.append(AddressEntity(
                    label="HOUSE_NUMBER",
                    value=text[match.start():match.end()],
                    start=match.start(),
                    end=match.end(),
                    confidence=0.90
                ))
                break  # Only first match

        # Floor patterns
        floor_match = re.search(
            r'\b(?:GROUND|FIRST|SECOND|THIRD|FOURTH|1ST|2ND|3RD|4TH|GF|FF|SF|TF)\s*(?:FLOOR|FLR)?\b',
            text_upper
        )
        if floor_match:
            entities.append(AddressEntity(
                label="FLOOR",
                value=text[floor_match.start():floor_match.end()],
                start=floor_match.start(),
                end=floor_match.end(),
                confidence=0.90
            ))

        # Gali patterns
        gali_match = re.search(r'\b(?:GALI|GALLI|LANE)\s*(?:NO\.?)?\s*\d+[A-Z]?\b', text_upper)
        if gali_match:
            entities.append(AddressEntity(
                label="GALI",
                value=text[gali_match.start():gali_match.end()],
                start=gali_match.start(),
                end=gali_match.end(),
                confidence=0.90
            ))

        # Block patterns
        block_match = re.search(r'\b(?:BLOCK|BLK|BL)\s*[A-Z]?[-]?[A-Z0-9]+\b', text_upper)
        if block_match:
            entities.append(AddressEntity(
                label="BLOCK",
                value=text[block_match.start():block_match.end()],
                start=block_match.start(),
                end=block_match.end(),
                confidence=0.90
            ))

        # Sector patterns
        sector_match = re.search(r'\b(?:SECTOR|SEC)\s*\d+[A-Z]?\b', text_upper)
        if sector_match:
            entities.append(AddressEntity(
                label="SECTOR",
                value=text[sector_match.start():sector_match.end()],
                start=sector_match.start(),
                end=sector_match.end(),
                confidence=0.90
            ))

        # Khasra patterns
        khasra_match = re.search(
            r'\b(?:KH\.?\s*(?:NO\.?)?\s*|KHASRA\s*(?:NO\.?)?\s*)[\d/]+(?:[/-]\d+)*\b',
            text_upper
        )
        if khasra_match:
            entities.append(AddressEntity(
                label="KHASRA",
                value=text[khasra_match.start():khasra_match.end()],
                start=khasra_match.start(),
                end=khasra_match.end(),
                confidence=0.90
            ))

        # Pincode (6-digit Delhi codes starting 110)
        pincode_match = re.search(r'\b110\d{3}\b', text)
        if pincode_match:
            entities.append(AddressEntity(
                label="PINCODE",
                value=pincode_match.group(0),
                start=pincode_match.start(),
                end=pincode_match.end(),
                confidence=1.0
            ))

        # City - always DELHI for Delhi addresses
        if "DELHI" in text_upper:
            # Prefer the explicit "NEW DELHI" form when present
            delhi_match = re.search(r'\bNEW\s+DELHI\b', text_upper)
            if delhi_match:
                entities.append(AddressEntity(
                    label="CITY",
                    value="NEW DELHI",
                    start=delhi_match.start(),
                    end=delhi_match.end(),
                    confidence=0.95
                ))
            else:
                # Otherwise take the last standalone DELHI occurrence
                delhi_positions = [m.start() for m in re.finditer(r'\bDELHI\b', text_upper)]
                if delhi_positions:
                    pos = delhi_positions[-1]
                    entities.append(AddressEntity(
                        label="CITY",
                        value="DELHI",
                        start=pos,
                        end=pos + 5,
                        confidence=0.90
                    ))

        return entities

    def _predictions_to_entities(
        self,
        text: str,
        predictions: list[int],
        offset_mapping: list[list[int]],
        attention_mask: list[int],
    ) -> list[AddressEntity]:
        """Convert model predictions (BIO tag ids) to entity objects."""
        entities = []
        current_entity = None

        for pred, offset, mask in zip(predictions, offset_mapping, attention_mask):
            start, end = offset
            # Skip padding and special tokens. [CLS]/[SEP] report a (0, 0)
            # offset; offsets arrive as lists (from .tolist()), so compare
            # the fields rather than `offset == (0, 0)`, which is always
            # False for a list.
            if mask == 0 or (start == 0 and end == 0):
                continue

            label = ID2LABEL.get(pred, "O")

            if label == "O":
                # End current entity if any
                if current_entity:
                    entities.append(self._finalize_entity(current_entity, text))
                    current_entity = None
            elif label.startswith("B-"):
                # Start new entity, closing any open one
                if current_entity:
                    entities.append(self._finalize_entity(current_entity, text))

                entity_type = label[2:]  # Remove "B-" prefix
                current_entity = {
                    "label": entity_type,
                    "start": start,
                    "end": end,
                    "confidence": 0.9,  # Base confidence
                }
            elif label.startswith("I-"):
                # Continue entity
                entity_type = label[2:]
                if current_entity and current_entity["label"] == entity_type:
                    current_entity["end"] = end
                else:
                    # I- without matching B- - treat as new B- with a
                    # slightly reduced confidence
                    if current_entity:
                        entities.append(self._finalize_entity(current_entity, text))
                    current_entity = {
                        "label": entity_type,
                        "start": start,
                        "end": end,
                        "confidence": 0.85,
                    }

        # Don't forget last entity
        if current_entity:
            entities.append(self._finalize_entity(current_entity, text))

        return entities

    def _finalize_entity(self, entity_dict: dict, text: str) -> AddressEntity:
        """Finalize entity with its text span extracted from the input."""
        value = text[entity_dict["start"]:entity_dict["end"]].strip()

        return AddressEntity(
            label=entity_dict["label"],
            value=value,
            start=entity_dict["start"],
            end=entity_dict["end"],
            confidence=entity_dict["confidence"]
        )
+ )
509
+
510
+
511
+ # Convenience function for quick parsing
512
+ def parse_address(address: str, model_path: str | None = None) -> ParsedAddress:
513
+ """
514
+ Quick address parsing function.
515
+
516
+ Args:
517
+ address: Address to parse
518
+ model_path: Optional path to model (uses rules-only if None)
519
+
520
+ Returns:
521
+ ParsedAddress
522
+ """
523
+ if model_path:
524
+ parser = AddressParser.from_pretrained(model_path)
525
+ else:
526
+ parser = AddressParser.rules_only()
527
+
528
+ return parser.parse(address)
src/address_parser/postprocessing/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Post-processing module for entity refinement and validation."""
2
+
3
+ from address_parser.postprocessing.gazetteer import DelhiGazetteer
4
+ from address_parser.postprocessing.rules import RuleBasedRefiner
5
+
6
+ __all__ = ["RuleBasedRefiner", "DelhiGazetteer"]
src/address_parser/postprocessing/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (474 Bytes). View file
 
src/address_parser/postprocessing/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (471 Bytes). View file
 
src/address_parser/postprocessing/__pycache__/gazetteer.cpython-312.pyc ADDED
Binary file (6.21 kB). View file
 
src/address_parser/postprocessing/__pycache__/gazetteer.cpython-314.pyc ADDED
Binary file (9.09 kB). View file
 
src/address_parser/postprocessing/__pycache__/rules.cpython-312.pyc ADDED
Binary file (13 kB). View file
 
src/address_parser/postprocessing/__pycache__/rules.cpython-314.pyc ADDED
Binary file (24.7 kB). View file
 
src/address_parser/postprocessing/gazetteer.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Delhi locality gazetteer for fuzzy matching and validation."""
2
+
3
+
4
+ from rapidfuzz import fuzz, process
5
+
6
+
7
+ class DelhiGazetteer:
8
+ """
9
+ Gazetteer of Delhi localities, areas, and common address terms.
10
+
11
+ Used for:
12
+ - Fuzzy matching to correct misspellings
13
+ - Entity validation
14
+ - Confidence boosting for known locations
15
+ """
16
+
17
+ # Major Delhi localities/areas
18
+ LOCALITIES = {
19
+ # South Delhi
20
+ "SAKET", "MALVIYA NAGAR", "HAUZ KHAS", "GREEN PARK", "GREATER KAILASH",
21
+ "DEFENCE COLONY", "LAJPAT NAGAR", "SOUTH EXTENSION", "CHITTARANJAN PARK",
22
+ "KALKAJI", "NEHRU PLACE", "OKHLA", "JASOLA", "SARITA VIHAR",
23
+ "ALAKNANDA", "SAFDARJUNG", "VASANT KUNJ", "MEHRAULI", "CHATTARPUR",
24
+
25
+ # North Delhi
26
+ "CIVIL LINES", "MODEL TOWN", "MUKHERJEE NAGAR", "KAMLA NAGAR",
27
+ "SHAKTI NAGAR", "GULABI BAGH", "ASHOK VIHAR", "SHALIMAR BAGH",
28
+ "PITAMPURA", "ROHINI", "NARELA", "BAWANA", "ALIPUR",
29
+
30
+ # East Delhi
31
+ "PREET VIHAR", "MAYUR VIHAR", "PATPARGANJ", "PANDAV NAGAR",
32
+ "LAKSHMI NAGAR", "SHAKARPUR", "GEETA COLONY", "GANDHI NAGAR",
33
+ "DILSHAD GARDEN", "SEELAMPUR", "SHAHDARA", "ANAND VIHAR",
34
+
35
+ # West Delhi
36
+ "JANAKPURI", "DWARKA", "PALAM", "UTTAM NAGAR", "VIKASPURI",
37
+ "TILAK NAGAR", "RAJOURI GARDEN", "PUNJABI BAGH", "PASCHIM VIHAR",
38
+ "MEERA BAGH", "PEERAGARHI", "MUNDKA", "NANGLOI", "NAJAFGARH",
39
+ "BINDAPUR", "KAKROLA", "MOHAN GARDEN", "NAWADA",
40
+
41
+ # Central Delhi
42
+ "CONNAUGHT PLACE", "KAROL BAGH", "PAHARGANJ", "DARYAGANJ",
43
+ "CHANDNI CHOWK", "SADAR BAZAAR", "RAJENDER NAGAR", "PATEL NAGAR",
44
+ "KIRTI NAGAR", "MOTIA KHAN", "ANAND PARBAT", "JHANDEWALAN",
45
+
46
+ # New Delhi
47
+ "CHANAKYAPURI", "LODHI ROAD", "GOLF LINKS", "JORBAGH",
48
+ "SUNDAR NAGAR", "NIZAMUDDIN", "LODI COLONY", "PANDARA ROAD",
49
+
50
+ # Other areas
51
+ "BADARPUR", "TUGHLAKABAD", "SANGAM VIHAR", "MADANPUR KHADAR",
52
+ "GOVINDPURI", "AMBEDKAR NAGAR", "LADO SARAI", "TIGRI",
53
+ "BURARI", "KARAWAL NAGAR", "BHAJANPURA", "MUSTAFABAD",
54
+ "JAFFRABAD", "MAUJPUR", "GOKALPUR", "SEEMAPURI",
55
+ }
56
+
57
+ # Common colony/nagar suffixes
58
+ NAGAR_SUFFIXES = {
59
+ "NAGAR", "VIHAR", "COLONY", "ENCLAVE", "EXTENSION", "PURI",
60
+ "PARK", "GARDEN", "BAGH", "KUNJ", "APARTMENT", "RESIDENCY",
61
+ "COMPLEX", "PHASE", "SECTOR", "BLOCK", "POCKET",
62
+ }
63
+
64
+ # Common area names from the training data
65
+ COMMON_AREAS = {
66
+ "KAUNWAR SINGH NAGAR", "BABA HARI DAS COLONY", "TIKARI KALA",
67
+ "CHANCHAL PARK", "SWARN PARK", "MUNDKA", "NANGLOI", "BAKKARWALA",
68
+ "MAJRA DABAS", "CHAND NAGAR", "RANHOLA", "BAPROLA", "POOTH KHURD",
69
+ "KIRARI", "SULTANPURI", "MANGOLPURI", "BEGUMPUR", "KADIPUR",
70
+ "RAMA VIHAR", "PREM NAGAR", "VIJAY PARK", "AMBICA VIHAR",
71
+ "SHIV PURI", "BUDH VIHAR", "POOTH KALAN", "QUTUBGARH",
72
+ "RANI KHERA", "SHAHABAD DAIRY", "SAMAIPUR", "JAHANGIRPURI",
73
+ "SANNOTH", "KANJHAWALA", "BAWANA", "ALIPUR",
74
+ }
75
+
76
+ # Common Hindi transliterated terms
77
+ HINDI_TERMS = {
78
+ "MOHALLA", "GALI", "KATRA", "BASTI", "BAZAR", "CHOWK",
79
+ "GANJ", "PUR", "ABAD", "GARH", "GAON", "KHERA", "KHURD", "KALAN",
80
+ }
81
+
82
+ def __init__(self, min_similarity: float = 80.0):
83
+ """
84
+ Initialize gazetteer.
85
+
86
+ Args:
87
+ min_similarity: Minimum fuzzy match score (0-100)
88
+ """
89
+ self.min_similarity = min_similarity
90
+
91
+ # Build combined set for matching
92
+ self.all_places = (
93
+ self.LOCALITIES |
94
+ self.COMMON_AREAS |
95
+ {f"{term}" for term in self.HINDI_TERMS}
96
+ )
97
+
98
+ def fuzzy_match(
99
+ self,
100
+ text: str,
101
+ limit: int = 3
102
+ ) -> list[tuple[str, float]]:
103
+ """
104
+ Find fuzzy matches for a text in the gazetteer.
105
+
106
+ Args:
107
+ text: Text to match
108
+ limit: Maximum number of matches
109
+
110
+ Returns:
111
+ List of (matched_text, score) tuples
112
+ """
113
+ if not text or len(text) < 3:
114
+ return []
115
+
116
+ matches = process.extract(
117
+ text.upper(),
118
+ self.all_places,
119
+ scorer=fuzz.ratio,
120
+ limit=limit
121
+ )
122
+
123
+ return [(m[0], m[1]) for m in matches if m[1] >= self.min_similarity]
124
+
125
+ def is_known_locality(self, text: str, threshold: float = 85.0) -> bool:
126
+ """Check if text matches a known locality."""
127
+ matches = self.fuzzy_match(text, limit=1)
128
+ return bool(matches and matches[0][1] >= threshold)
129
+
130
+ def correct_spelling(self, text: str) -> str | None:
131
+ """
132
+ Attempt to correct spelling using gazetteer.
133
+
134
+ Returns corrected text or None if no good match.
135
+ """
136
+ matches = self.fuzzy_match(text, limit=1)
137
+ if matches and matches[0][1] >= 90.0:
138
+ return matches[0][0]
139
+ return None
140
+
141
+ def get_locality_type(self, text: str) -> str | None:
142
+ """
143
+ Determine if text contains a known locality type suffix.
144
+
145
+ Returns the suffix type or None.
146
+ """
147
+ text_upper = text.upper()
148
+ for suffix in self.NAGAR_SUFFIXES:
149
+ if text_upper.endswith(suffix):
150
+ return suffix
151
+ return None
152
+
153
+ def validate_pincode(self, pincode: str, locality: str | None = None) -> bool:
154
+ """
155
+ Validate if a pincode is valid for Delhi.
156
+
157
+ Delhi pincodes are in range 110001-110097.
158
+ """
159
+ if not pincode or not pincode.isdigit() or len(pincode) != 6:
160
+ return False
161
+
162
+ code = int(pincode)
163
+ # Delhi pincode range
164
+ return 110001 <= code <= 110097
src/address_parser/postprocessing/rules.py ADDED
@@ -0,0 +1,536 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Rule-based post-processing for entity refinement."""
2
+
3
+ import re
4
+
5
+ from address_parser.postprocessing.gazetteer import DelhiGazetteer
6
+ from address_parser.schemas import AddressEntity
7
+
8
+
9
+ class RuleBasedRefiner:
10
+ """
11
+ Post-processing rules for refining NER predictions.
12
+
13
+ Handles:
14
+ - Pattern-based entity detection (pincodes, khasra numbers)
15
+ - Entity boundary correction using gazetteer
16
+ - Entity merging for fragmented predictions
17
+ - Confidence adjustment
18
+ - Validation and filtering
19
+ """
20
+
21
+ # Regex patterns for deterministic entities
22
+ PATTERNS = {
23
+ "PINCODE": re.compile(r'\b[1-9]\d{5}\b'),
24
+ "KHASRA": re.compile(
25
+ r'\b(?:KH\.?\s*(?:NO\.?)?\s*|KHASRA\s*(?:NO\.?)?\s*)[\d/]+(?:[/-]\d+)*\b',
26
+ re.IGNORECASE
27
+ ),
28
+ "HOUSE_NUMBER": re.compile(
29
+ r'\b(?:H\.?\s*(?:NO\.?)?\s*|HOUSE\s*(?:NO\.?)?\s*|PLOT\s*(?:NO\.?)?\s*)?[A-Z]?\d+[A-Z]?(?:[-/]\d+)*\b',
30
+ re.IGNORECASE
31
+ ),
32
+ "FLOOR": re.compile(
33
+ r'\b(?:GROUND|FIRST|SECOND|THIRD|FOURTH|FIFTH|1ST|2ND|3RD|4TH|5TH|GF|FF|SF|TF)?\s*(?:FLOOR|FLR)\b',
34
+ re.IGNORECASE
35
+ ),
36
+ "BLOCK": re.compile(
37
+ r'\b(?:BLOCK|BLK|BL)\s*[A-Z]?[-]?[A-Z0-9]+\b',
38
+ re.IGNORECASE
39
+ ),
40
+ "SECTOR": re.compile(
41
+ r'\b(?:SECTOR|SEC)\s*\d+[A-Z]?\b',
42
+ re.IGNORECASE
43
+ ),
44
+ "GALI": re.compile(
45
+ r'\b(?:GALI|GALLI|LANE)\s*(?:NO\.?)?\s*\d+[A-Z]?\b',
46
+ re.IGNORECASE
47
+ ),
48
+ }
49
+
50
+ # Area patterns - directional areas
51
+ AREA_PATTERNS = [
52
+ (re.compile(r'\bSOUTH\s+DELHI\b', re.IGNORECASE), "SOUTH DELHI"),
53
+ (re.compile(r'\bNORTH\s+DELHI\b', re.IGNORECASE), "NORTH DELHI"),
54
+ (re.compile(r'\bEAST\s+DELHI\b', re.IGNORECASE), "EAST DELHI"),
55
+ (re.compile(r'\bWEST\s+DELHI\b', re.IGNORECASE), "WEST DELHI"),
56
+ (re.compile(r'\bCENTRAL\s+DELHI\b', re.IGNORECASE), "CENTRAL DELHI"),
57
+ (re.compile(r'\bSOUTH\s+WEST\s+DELHI\b', re.IGNORECASE), "SOUTH WEST DELHI"),
58
+ (re.compile(r'\bNORTH\s+WEST\s+DELHI\b', re.IGNORECASE), "NORTH WEST DELHI"),
59
+ (re.compile(r'\bNORTH\s+EAST\s+DELHI\b', re.IGNORECASE), "NORTH EAST DELHI"),
60
+ (re.compile(r'\bSOUTH\s+EAST\s+DELHI\b', re.IGNORECASE), "SOUTH EAST DELHI"),
61
+ (re.compile(r'\bOUTER\s+DELHI\b', re.IGNORECASE), "OUTER DELHI"),
62
+ ]
63
+
64
+ # City patterns
65
+ CITY_PATTERNS = [
66
+ (re.compile(r'\bNEW\s+DELHI\b', re.IGNORECASE), "NEW DELHI"),
67
+ (re.compile(r'\bDELHI\b', re.IGNORECASE), "DELHI"),
68
+ (re.compile(r'\bNOIDA\b', re.IGNORECASE), "NOIDA"),
69
+ (re.compile(r'\bGURUGRAM\b', re.IGNORECASE), "GURUGRAM"),
70
+ (re.compile(r'\bGURGAON\b', re.IGNORECASE), "GURGAON"),
71
+ (re.compile(r'\bFARIDABAD\b', re.IGNORECASE), "FARIDABAD"),
72
+ (re.compile(r'\bGHAZIABAD\b', re.IGNORECASE), "GHAZIABAD"),
73
+ ]
74
+
75
+ # State patterns
76
+ STATE_PATTERNS = [
77
+ (re.compile(r'\bDELHI\b', re.IGNORECASE), "DELHI"),
78
+ (re.compile(r'\bHARYANA\b', re.IGNORECASE), "HARYANA"),
79
+ (re.compile(r'\bUTTAR\s+PRADESH\b', re.IGNORECASE), "UTTAR PRADESH"),
80
+ (re.compile(r'\bU\.?\s*P\.?\b'), "UTTAR PRADESH"),
81
+ ]
82
+
83
+ # Colony/Nagar indicators
84
+ COLONY_SUFFIXES = [
85
+ "NAGAR", "VIHAR", "COLONY", "ENCLAVE", "PARK", "GARDEN",
86
+ "PURI", "BAGH", "KUNJ", "EXTENSION", "EXTN", "PHASE",
87
+ ]
88
+
89
+ # Known multi-word localities that get fragmented
90
+ KNOWN_LOCALITIES = [
91
+ "LAJPAT NAGAR", "MALVIYA NAGAR", "KAROL BAGH", "HAUZ KHAS",
92
+ "GREEN PARK", "GREATER KAILASH", "DEFENCE COLONY", "SOUTH EXTENSION",
93
+ "CHITTARANJAN PARK", "NEHRU PLACE", "SARITA VIHAR", "VASANT KUNJ",
94
+ "CIVIL LINES", "MODEL TOWN", "MUKHERJEE NAGAR", "KAMLA NAGAR",
95
+ "ASHOK VIHAR", "SHALIMAR BAGH", "PREET VIHAR", "MAYUR VIHAR",
96
+ "LAKSHMI NAGAR", "GANDHI NAGAR", "DILSHAD GARDEN", "ANAND VIHAR",
97
+ "UTTAM NAGAR", "TILAK NAGAR", "RAJOURI GARDEN", "PUNJABI BAGH",
98
+ "PASCHIM VIHAR", "CONNAUGHT PLACE", "RAJENDER NAGAR", "PATEL NAGAR",
99
+ "KIRTI NAGAR", "LODHI ROAD", "GOLF LINKS", "SANGAM VIHAR",
100
+ "GOVINDPURI", "AMBEDKAR NAGAR", "LADO SARAI", "KAUNWAR SINGH NAGAR",
101
+ "BABA HARI DAS COLONY", "SWARN PARK", "CHANCHAL PARK", "DURGA PARK",
102
+ "RAJ NAGAR", "SADH NAGAR", "VIJAY ENCLAVE", "PALAM COLONY",
103
+ ]
104
+
105
+ def __init__(self, use_gazetteer: bool = True):
106
+ """
107
+ Initialize refiner.
108
+
109
+ Args:
110
+ use_gazetteer: Use gazetteer for validation/correction
111
+ """
112
+ self.gazetteer = DelhiGazetteer() if use_gazetteer else None
113
+
114
+ def refine(
115
+ self,
116
+ text: str,
117
+ entities: list[AddressEntity]
118
+ ) -> list[AddressEntity]:
119
+ """
120
+ Refine entity predictions.
121
+
122
+ Args:
123
+ text: Original address text
124
+ entities: Predicted entities from NER model
125
+
126
+ Returns:
127
+ Refined list of entities
128
+ """
129
+ refined = list(entities)
130
+
131
+ # First: detect and fix known localities from gazetteer
132
+ refined = self._fix_known_localities(text, refined)
133
+
134
+ # Add rule-based entities that may have been missed
135
+ refined = self._add_pattern_entities(text, refined)
136
+
137
+ # Detect area patterns (SOUTH DELHI, etc.)
138
+ refined = self._add_area_patterns(text, refined)
139
+
140
+ # Correct entity boundaries
141
+ refined = self._correct_boundaries(text, refined)
142
+
143
+ # Merge fragmented entities
144
+ refined = self._merge_fragmented_entities(text, refined)
145
+
146
+ # Adjust confidence scores
147
+ refined = self._adjust_confidence(text, refined)
148
+
149
+ # Remove duplicates and overlapping entities
150
+ refined = self._remove_overlaps(refined)
151
+
152
+ # Validate entities
153
+ refined = self._validate_entities(refined)
154
+
155
+ return refined
156
+
157
+ def _fix_known_localities(
158
+ self,
159
+ text: str,
160
+ entities: list[AddressEntity]
161
+ ) -> list[AddressEntity]:
162
+ """Fix fragmented known localities using gazetteer lookup."""
163
+ text_upper = text.upper()
164
+ result = []
165
+ used_ranges: list[tuple[int, int]] = []
166
+
167
+ # First pass: find all known localities in text
168
+ locality_entities = []
169
+ for locality in self.KNOWN_LOCALITIES:
170
+ idx = 0
171
+ while True:
172
+ pos = text_upper.find(locality, idx)
173
+ if pos == -1:
174
+ break
175
+ end = pos + len(locality)
176
+ locality_entities.append(AddressEntity(
177
+ label="SUBAREA",
178
+ value=text[pos:end],
179
+ start=pos,
180
+ end=end,
181
+ confidence=0.95
182
+ ))
183
+ used_ranges.append((pos, end))
184
+ idx = end
185
+
186
+ # Also check area patterns
187
+ for pattern, area_name in self.AREA_PATTERNS:
188
+ match = pattern.search(text)
189
+ if match:
190
+ start, end = match.start(), match.end()
191
+ # Check for overlap with existing ranges
192
+ overlaps = any(
193
+ not (end <= s or start >= e)
194
+ for s, e in used_ranges
195
+ )
196
+ if not overlaps:
197
+ locality_entities.append(AddressEntity(
198
+ label="AREA",
199
+ value=area_name,
200
+ start=start,
201
+ end=end,
202
+ confidence=0.95
203
+ ))
204
+ used_ranges.append((start, end))
205
+
206
+ # Filter out original entities that overlap with found localities
207
+ for entity in entities:
208
+ # Check if entity overlaps with any locality range
209
+ overlaps_locality = any(
210
+ not (entity.end <= start or entity.start >= end)
211
+ for start, end in used_ranges
212
+ )
213
+
214
+ if overlaps_locality and entity.label in ("AREA", "SUBAREA", "COLONY", "CITY"):
215
+ # Skip this fragmented entity
216
+ continue
217
+
218
+ result.append(entity)
219
+
220
+ # Add the locality entities
221
+ result.extend(locality_entities)
222
+
223
+ return result
224
+
225
+ def _add_area_patterns(
226
+ self,
227
+ text: str,
228
+ entities: list[AddressEntity]
229
+ ) -> list[AddressEntity]:
230
+ """Add area patterns like SOUTH DELHI, NORTH DELHI (already handled in _fix_known_localities)."""
231
+ # This is now handled in _fix_known_localities to avoid duplicates
232
+ return entities
233
+
234
+ def _merge_fragmented_entities(
235
+ self,
236
+ text: str,
237
+ entities: list[AddressEntity]
238
+ ) -> list[AddressEntity]:
239
+ """Merge adjacent entities of same type that should be together."""
240
+ if len(entities) < 2:
241
+ return entities
242
+
243
+ # Sort by position
244
+ sorted_entities = sorted(entities, key=lambda e: e.start)
245
+ result = []
246
+ i = 0
247
+
248
+ while i < len(sorted_entities):
249
+ current = sorted_entities[i]
250
+
251
+ # Look for adjacent entities to merge
252
+ if current.label in ("AREA", "SUBAREA", "COLONY", "CITY"):
253
+ merged_end = current.end
254
+ merged_confidence = current.confidence
255
+ j = i + 1
256
+
257
+ # Check subsequent entities
258
+ while j < len(sorted_entities):
259
+ next_ent = sorted_entities[j]
260
+
261
+ # Check if adjacent (within 2 chars - allows for space)
262
+ gap = next_ent.start - merged_end
263
+ if gap <= 2 and next_ent.label in ("AREA", "SUBAREA", "COLONY", "CITY"):
264
+ # Check if the merged text forms a known locality
265
+ merged_text = text[current.start:next_ent.end].strip()
266
+ if self._is_valid_merge(merged_text):
267
+ merged_end = next_ent.end
268
+ merged_confidence = max(merged_confidence, next_ent.confidence)
269
+ j += 1
270
+ else:
271
+ break
272
+ else:
273
+ break
274
+
275
+ # Create merged entity if we merged anything
276
+ if j > i + 1:
277
+ merged_value = text[current.start:merged_end].strip()
278
+ result.append(AddressEntity(
279
+ label=current.label,
280
+ value=merged_value,
281
+ start=current.start,
282
+ end=merged_end,
283
+ confidence=merged_confidence
284
+ ))
285
+ i = j
286
+ continue
287
+
288
+ result.append(current)
289
+ i += 1
290
+
291
+ return result
292
+
293
+ def _is_valid_merge(self, text: str) -> bool:
294
+ """Check if merged text forms a valid locality name."""
295
+ text_upper = text.upper().strip()
296
+
297
+ # Check against known localities
298
+ if text_upper in self.KNOWN_LOCALITIES:
299
+ return True
300
+
301
+ # Check gazetteer
302
+ if self.gazetteer and self.gazetteer.is_known_locality(text_upper, threshold=80):
303
+ return True
304
+
305
+ # Check if ends with common suffix
306
+ for suffix in self.COLONY_SUFFIXES:
307
+ if text_upper.endswith(suffix):
308
+ return True
309
+
310
+ return False
311
+
312
+ def _add_pattern_entities(
313
+ self,
314
+ text: str,
315
+ entities: list[AddressEntity]
316
+ ) -> list[AddressEntity]:
317
+ """Add entities detected by regex patterns."""
318
+ result = list(entities)
319
+ existing_spans = {(e.start, e.end) for e in entities}
320
+
321
+ # Check for pincode
322
+ if not any(e.label == "PINCODE" for e in entities):
323
+ match = self.PATTERNS["PINCODE"].search(text)
324
+ if match and (match.start(), match.end()) not in existing_spans:
325
+ result.append(AddressEntity(
326
+ label="PINCODE",
327
+ value=match.group(0),
328
+ start=match.start(),
329
+ end=match.end(),
330
+ confidence=1.0 # Rule-based, high confidence
331
+ ))
332
+
333
+ # Check for city - DELHI addresses always have DELHI as city
334
+ has_city = any(e.label == "CITY" for e in result)
335
+ if not has_city:
336
+ # If text contains DELHI anywhere, set city to DELHI
337
+ if "DELHI" in text.upper():
338
+ # Find the last occurrence of DELHI (usually the city mention)
339
+ delhi_positions = [m.start() for m in re.finditer(r'\bDELHI\b', text.upper())]
340
+ if delhi_positions:
341
+ pos = delhi_positions[-1] # Use last occurrence
342
+ result.append(AddressEntity(
343
+ label="CITY",
344
+ value="DELHI",
345
+ start=pos,
346
+ end=pos + 5,
347
+ confidence=0.90
348
+ ))
349
+ else:
350
+ # Check other city patterns
351
+ for pattern, city_name in self.CITY_PATTERNS:
352
+ if city_name == "DELHI":
353
+ continue # Already handled above
354
+ match = pattern.search(text)
355
+ if match and (match.start(), match.end()) not in existing_spans:
356
+ result.append(AddressEntity(
357
+ label="CITY",
358
+ value=city_name,
359
+ start=match.start(),
360
+ end=match.end(),
361
+ confidence=0.95
362
+ ))
363
+ break
364
+
365
+ # Check for state
366
+ if not any(e.label == "STATE" for e in entities):
367
+ for pattern, state_name in self.STATE_PATTERNS:
368
+ match = pattern.search(text)
369
+ if match and (match.start(), match.end()) not in existing_spans:
370
+ # Avoid tagging "DELHI" as state if it's already a city
371
+ if state_name == "DELHI" and any(e.label == "CITY" and "DELHI" in e.value.upper() for e in result):
372
+ continue
373
+ result.append(AddressEntity(
374
+ label="STATE",
375
+ value=state_name,
376
+ start=match.start(),
377
+ end=match.end(),
378
+ confidence=0.90
379
+ ))
380
+ break
381
+
382
+ return result
383
+
384
+ def _correct_boundaries(
385
+ self,
386
+ text: str,
387
+ entities: list[AddressEntity]
388
+ ) -> list[AddressEntity]:
389
+ """Correct entity boundaries based on patterns."""
390
+ result = []
391
+
392
+ for entity in entities:
393
+ corrected = entity.model_copy()
394
+
395
+ # Expand KHASRA to include full pattern
396
+ if entity.label == "KHASRA":
397
+ match = self.PATTERNS["KHASRA"].search(text)
398
+ if match:
399
+ corrected.value = match.group(0)
400
+ corrected.start = match.start()
401
+ corrected.end = match.end()
402
+
403
+ # Expand BLOCK to include identifier
404
+ elif entity.label == "BLOCK":
405
+ match = self.PATTERNS["BLOCK"].search(text)
406
+ if match:
407
+ corrected.value = match.group(0)
408
+ corrected.start = match.start()
409
+ corrected.end = match.end()
410
+
411
+ # Expand FLOOR to include floor number
412
+ elif entity.label == "FLOOR":
413
+ match = self.PATTERNS["FLOOR"].search(text)
414
+ if match:
415
+ corrected.value = match.group(0)
416
+ corrected.start = match.start()
417
+ corrected.end = match.end()
418
+
419
+ # Clean up leading/trailing whitespace from value
420
+ corrected.value = corrected.value.strip()
421
+
422
+ result.append(corrected)
423
+
424
+ return result
425
+
426
+ def _adjust_confidence(
427
+ self,
428
+ text: str,
429
+ entities: list[AddressEntity]
430
+ ) -> list[AddressEntity]:
431
+ """Adjust confidence scores based on patterns and gazetteer."""
432
+ result = []
433
+
434
+ for entity in entities:
435
+ adjusted = entity.model_copy()
436
+
437
+ # Boost confidence for pattern matches
438
+ if entity.label in self.PATTERNS:
439
+ pattern = self.PATTERNS[entity.label]
440
+ if pattern.fullmatch(entity.value):
441
+ adjusted.confidence = min(1.0, entity.confidence + 0.1)
442
+
443
+ # Boost confidence for gazetteer matches
444
+ if self.gazetteer and entity.label in ("AREA", "SUBAREA", "COLONY"):
445
+ if self.gazetteer.is_known_locality(entity.value):
446
+ adjusted.confidence = min(1.0, entity.confidence + 0.15)
447
+
448
+ # Reduce confidence for very short entities
449
+ if len(entity.value) < 3:
450
+ adjusted.confidence = max(0.0, entity.confidence - 0.2)
451
+
452
+ result.append(adjusted)
453
+
454
+ return result
455
+
456
+ def _remove_overlaps(
457
+ self,
458
+ entities: list[AddressEntity]
459
+ ) -> list[AddressEntity]:
460
+ """Remove overlapping entities, keeping higher confidence ones."""
461
+ if not entities:
462
+ return entities
463
+
464
+ # Separate CITY and PINCODE entities - these should always be kept
465
+ # as they represent different semantic levels than AREA/SUBAREA
466
+ preserved_labels = {"CITY", "PINCODE", "STATE"}
467
+ preserved_entities = [e for e in entities if e.label in preserved_labels]
468
+ other_entities = [e for e in entities if e.label not in preserved_labels]
469
+
470
+ # Sort non-preserved by confidence (descending) then by start position
471
+ sorted_entities = sorted(other_entities, key=lambda e: (-e.confidence, e.start))
472
+
473
+ result: list[AddressEntity] = []
474
+ used_ranges: list[tuple[int, int]] = []
475
+
476
+ for entity in sorted_entities:
477
+ # Check for overlap with existing entities
478
+ overlaps = False
479
+ for start, end in used_ranges:
480
+ if not (entity.end <= start or entity.start >= end):
481
+ overlaps = True
482
+ break
483
+
484
+ if not overlaps:
485
+ result.append(entity)
486
+ used_ranges.append((entity.start, entity.end))
487
+
488
+ # Add back preserved entities (CITY, PINCODE, STATE)
489
+ result.extend(preserved_entities)
490
+
491
+ # Sort by position for output
492
+ return sorted(result, key=lambda e: e.start)
493
+
494
+ def _validate_entities(
495
+ self,
496
+ entities: list[AddressEntity]
497
+ ) -> list[AddressEntity]:
498
+ """Validate and filter entities."""
499
+ result = []
500
+
501
+ for entity in entities:
502
+ # Skip empty values
503
+ if not entity.value.strip():
504
+ continue
505
+
506
+ # Skip very low confidence
507
+ if entity.confidence < 0.3:
508
+ continue
509
+
510
+ # Validate pincode format
511
+ if entity.label == "PINCODE":
512
+ if not re.fullmatch(r'[1-9]\d{5}', entity.value):
513
+ continue
514
+ if self.gazetteer and not self.gazetteer.validate_pincode(entity.value):
515
+ # Pincode outside Delhi range - reduce confidence but keep
516
+ entity = entity.model_copy()
517
+ entity.confidence *= 0.7
518
+
519
+ result.append(entity)
520
+
521
+ return result
522
+
523
+ def extract_all_patterns(self, text: str) -> dict[str, list[str]]:
524
+ """
525
+ Extract all pattern-based entities from text.
526
+
527
+ Returns dict of label -> list of matched values.
528
+ """
529
+ results = {}
530
+
531
+ for label, pattern in self.PATTERNS.items():
532
+ matches = pattern.findall(text)
533
+ if matches:
534
+ results[label] = matches
535
+
536
+ return results
src/address_parser/preprocessing/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Preprocessing module for address normalization and Hindi transliteration."""
2
+
3
+ from address_parser.preprocessing.hindi import HindiTransliterator
4
+ from address_parser.preprocessing.normalizer import AddressNormalizer
5
+
6
+ __all__ = ["AddressNormalizer", "HindiTransliterator"]
src/address_parser/preprocessing/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (491 Bytes). View file
 
src/address_parser/preprocessing/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (488 Bytes). View file
 
src/address_parser/preprocessing/__pycache__/hindi.cpython-312.pyc ADDED
Binary file (10.3 kB). View file
 
src/address_parser/preprocessing/__pycache__/hindi.cpython-314.pyc ADDED
Binary file (11.5 kB). View file
 
src/address_parser/preprocessing/__pycache__/normalizer.cpython-312.pyc ADDED
Binary file (7.16 kB). View file
 
src/address_parser/preprocessing/__pycache__/normalizer.cpython-314.pyc ADDED
Binary file (8.41 kB). View file
 
src/address_parser/preprocessing/hindi.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hindi transliteration and script handling for multilingual addresses."""
2
+
3
+ import re
4
+
5
+
6
class HindiTransliterator:
    """
    Devanagari handling for multilingual address text.

    Provides script detection, Devanagari-to-Latin transliteration
    (curated address vocabulary first, then a character-level
    ITRANS-like fallback), and normalization of code-mixed
    Hindi/English strings.
    """

    # Devanagari Unicode block boundaries.
    DEVANAGARI_START = 0x0900
    DEVANAGARI_END = 0x097F

    # Curated Hindi address vocabulary and its Latin rendering.
    HINDI_TERMS = {
        # Devanagari -> Latin
        'गली': 'GALI',
        'गलि': 'GALI',
        'मोहल्ला': 'MOHALLA',
        'नगर': 'NAGAR',
        'विहार': 'VIHAR',
        'पुरी': 'PURI',
        'पुर': 'PUR',
        'बाग': 'BAGH',
        'मार्ग': 'MARG',
        'रोड': 'ROAD',
        'मंजिल': 'FLOOR',
        'पहली': 'FIRST',
        'दूसरी': 'SECOND',
        'तीसरी': 'THIRD',
        'चौथी': 'FOURTH',
        'भूतल': 'GROUND FLOOR',
        'तहखाना': 'BASEMENT',
        'मकान': 'HOUSE',
        'प्लॉट': 'PLOT',
        'खसरा': 'KHASRA',
        'ब्लॉक': 'BLOCK',
        'सेक्टर': 'SECTOR',
        'कॉलोनी': 'COLONY',
        'इलाका': 'AREA',
        'क्षेत्र': 'AREA',
        'दिल्ली': 'DELHI',
        'नई दिल्ली': 'NEW DELHI',
        'नम्बर': 'NUMBER',
        'नंबर': 'NUMBER',
        'संख्या': 'NUMBER',
        'पास': 'NEAR',
        'सामने': 'OPPOSITE',
        'पीछे': 'BEHIND',
        'के पास': 'NEAR',
        'के सामने': 'OPPOSITE',
        'चौक': 'CHOWK',
        'बाजार': 'BAZAAR',
        'बस्ती': 'BASTI',
        'पार्क': 'PARK',
        'एक्सटेंशन': 'EXTENSION',
        'फेज': 'PHASE',
        'वार्ड': 'WARD',
        'जोन': 'ZONE',
    }

    # Consonants, basic ITRANS-like mapping (incl. nukta forms).
    CONSONANT_MAP = {
        'क': 'k', 'ख': 'kh', 'ग': 'g', 'घ': 'gh', 'ङ': 'ng',
        'च': 'ch', 'छ': 'chh', 'ज': 'j', 'झ': 'jh', 'ञ': 'ny',
        'ट': 't', 'ठ': 'th', 'ड': 'd', 'ढ': 'dh', 'ण': 'n',
        'त': 't', 'थ': 'th', 'द': 'd', 'ध': 'dh', 'न': 'n',
        'प': 'p', 'फ': 'ph', 'ब': 'b', 'भ': 'bh', 'म': 'm',
        'य': 'y', 'र': 'r', 'ल': 'l', 'व': 'v', 'श': 'sh',
        'ष': 'sh', 'स': 's', 'ह': 'h',
        'क़': 'q', 'ख़': 'kh', 'ग़': 'g', 'ज़': 'z', 'ड़': 'd',
        'ढ़': 'dh', 'फ़': 'f', 'य़': 'y',
    }

    # Independent vowels and dependent matras.
    VOWEL_MAP = {
        'अ': 'a', 'आ': 'aa', 'इ': 'i', 'ई': 'ee', 'उ': 'u', 'ऊ': 'oo',
        'ए': 'e', 'ऐ': 'ai', 'ओ': 'o', 'औ': 'au', 'अं': 'an', 'अः': 'ah',
        'ा': 'a', 'ि': 'i', 'ी': 'ee', 'ु': 'u', 'ू': 'oo',
        'े': 'e', 'ै': 'ai', 'ो': 'o', 'ौ': 'au',
        'ं': 'n', 'ः': 'h', '्': '',  # halant kills the implicit vowel
        'ँ': 'n',  # chandrabindu
    }

    # Devanagari digits.
    DIGIT_MAP = {
        '०': '0', '१': '1', '२': '2', '३': '3', '४': '4',
        '५': '5', '६': '6', '७': '7', '८': '8', '९': '9',
    }

    def __init__(self, use_known_terms: bool = True):
        """
        Initialize transliterator.

        Args:
            use_known_terms: Prefer the curated HINDI_TERMS dictionary
                before character-level transliteration.
        """
        self.use_known_terms = use_known_terms

    def contains_devanagari(self, text: str) -> bool:
        """True if any character of *text* lies in the Devanagari block."""
        return any(
            self.DEVANAGARI_START <= ord(ch) <= self.DEVANAGARI_END
            for ch in text
        )

    def get_script_ratio(self, text: str) -> dict[str, float]:
        """
        Proportion of each script among non-whitespace characters.

        Returns:
            Dict with 'latin', 'devanagari', 'numeric' and 'other' keys.
        """
        if not text:
            return {'latin': 0.0, 'devanagari': 0.0, 'numeric': 0.0, 'other': 0.0}

        counts: dict[str, float] = {'latin': 0, 'devanagari': 0, 'numeric': 0, 'other': 0}
        considered = 0

        for ch in text:
            if ch.isspace():
                continue
            considered += 1
            if self.DEVANAGARI_START <= ord(ch) <= self.DEVANAGARI_END:
                bucket = 'devanagari'
            elif ch.isascii() and ch.isalpha():
                bucket = 'latin'
            elif ch.isdigit():
                bucket = 'numeric'
            else:
                bucket = 'other'
            counts[bucket] += 1

        # Whitespace-only input: return the raw zero counts.
        if considered == 0:
            return counts

        return {name: tally / considered for name, tally in counts.items()}

    def transliterate(self, text: str) -> str:
        """
        Convert any Devanagari in *text* to Latin script.

        Known address terms are substituted wholesale (longest first);
        remaining Devanagari is transliterated character by character.
        Output is uppercased with collapsed whitespace.
        """
        if not self.contains_devanagari(text):
            return text

        # Longest-first substitution of whole known terms.
        if self.use_known_terms:
            for devanagari, latin in sorted(self.HINDI_TERMS.items(),
                                            key=lambda kv: -len(kv[0])):
                text = text.replace(devanagari, f' {latin} ')

        pieces: list[str] = []
        for pos, ch in enumerate(text):
            if not (self.DEVANAGARI_START <= ord(ch) <= self.DEVANAGARI_END):
                pieces.append(ch)
                continue

            if ch in self.DIGIT_MAP:
                pieces.append(self.DIGIT_MAP[ch])
            elif ch in self.VOWEL_MAP:
                pieces.append(self.VOWEL_MAP[ch])
            elif ch in self.CONSONANT_MAP:
                pieces.append(self.CONSONANT_MAP[ch])
                # Bare consonants carry an implicit 'a' unless a matra or
                # halant (U+093E..U+094D) follows.
                followed_by_matra = (
                    pos + 1 < len(text)
                    and 0x093E <= ord(text[pos + 1]) <= 0x094D
                )
                if not followed_by_matra:
                    pieces.append('a')
            else:
                pieces.append(ch)  # unmapped Devanagari: pass through

        return re.sub(r'\s+', ' ', ''.join(pieces)).strip().upper()

    def normalize_mixed_script(self, text: str) -> str:
        """
        Normalize a code-mixed (Hindi + English) address word by word:
        Hindi words are transliterated, English words uppercased.
        """
        converted = []
        for token in text.split():
            if not self.contains_devanagari(token):
                converted.append(token.upper())
            elif self.use_known_terms and token in self.HINDI_TERMS:
                converted.append(self.HINDI_TERMS[token])
            else:
                converted.append(self.transliterate(token))
        return ' '.join(converted)
224
+
225
+
226
def detect_language(text: str) -> str:
    """
    Coarse language detection for address text.

    Returns:
        'hindi' when Devanagari dominates, 'english' when Latin
        dominates (or as the fallback), 'mixed' when both are present.
    """
    ratios = HindiTransliterator().get_script_ratio(text)

    if ratios['devanagari'] > 0.5:
        return 'hindi'
    if ratios['latin'] > 0.5:
        return 'english'
    if ratios['devanagari'] > 0 and ratios['latin'] > 0:
        return 'mixed'
    return 'english'
src/address_parser/preprocessing/normalizer.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Address normalization utilities."""
2
+
3
+ import re
4
+
5
+
6
+ class AddressNormalizer:
7
+ """
8
+ Normalizes Indian addresses for consistent processing.
9
+
10
+ Handles:
11
+ - Case normalization
12
+ - Whitespace cleanup
13
+ - Common abbreviation expansion
14
+ - Punctuation standardization
15
+ - Number format standardization
16
+ """
17
+
18
+ # Common abbreviations in Indian addresses
19
+ ABBREVIATIONS = {
20
+ r'\bH\.?\s*NO\.?\b': 'HOUSE NO',
21
+ r'\bH\.?\s*N\.?\b': 'HOUSE NO',
22
+ r'\bHNO\.?\b': 'HOUSE NO',
23
+ r'\bPLT\.?\s*NO\.?\b': 'PLOT NO',
24
+ r'\bP\.?\s*NO\.?\b': 'PLOT NO',
25
+ r'\bFL\.?\b': 'FLOOR',
26
+ r'\bFLR\.?\b': 'FLOOR',
27
+ r'\bGF\.?\b': 'GROUND FLOOR',
28
+ r'\bFF\.?\b': 'FIRST FLOOR',
29
+ r'\bSF\.?\b': 'SECOND FLOOR',
30
+ r'\bTF\.?\b': 'THIRD FLOOR',
31
+ r'\b1ST\s+FL\.?\b': 'FIRST FLOOR',
32
+ r'\b2ND\s+FL\.?\b': 'SECOND FLOOR',
33
+ r'\b3RD\s+FL\.?\b': 'THIRD FLOOR',
34
+ r'\bGRD\.?\s*FL\.?\b': 'GROUND FLOOR',
35
+ r'\bBLK\.?\b': 'BLOCK',
36
+ r'\bBL\.?\b': 'BLOCK',
37
+ r'\bSEC\.?\b': 'SECTOR',
38
+ r'\bKH\.?\s*NO\.?\b': 'KHASRA NO',
39
+ r'\bKHASRA\s*NO\.?\b': 'KHASRA NO',
40
+ r'\bKH\.?\b': 'KHASRA',
41
+ r'\bCOL\.?\b': 'COLONY',
42
+ r'\bNGR\.?\b': 'NAGAR',
43
+ r'\bMKT\.?\b': 'MARKET',
44
+ r'\bRD\.?\b': 'ROAD',
45
+ r'\bST\.?\b': 'STREET',
46
+ r'\bLN\.?\b': 'LANE',
47
+ r'\bEXTN\.?\b': 'EXTENSION',
48
+ r'\bEXT\.?\b': 'EXTENSION',
49
+ r'\bPH\.?\b': 'PHASE',
50
+ r'\bNR\.?\b': 'NEAR',
51
+ r'\bOPP\.?\b': 'OPPOSITE',
52
+ r'\bBHD\.?\b': 'BEHIND',
53
+ r'\bADJ\.?\b': 'ADJACENT',
54
+ r'\bWZ\.?\b': 'WZ', # West Zone
55
+ r'\bEZ\.?\b': 'EZ', # East Zone
56
+ r'\bNZ\.?\b': 'NZ', # North Zone
57
+ r'\bSZ\.?\b': 'SZ', # South Zone
58
+ r'\bDL\.?\b': 'DELHI',
59
+ r'\bN\.?\s*DELHI\b': 'NEW DELHI',
60
+ }
61
+
62
+ # Floor name patterns
63
+ FLOOR_PATTERNS = {
64
+ r'\bGROUND\b': 'GROUND',
65
+ r'\bBASEMENT\b': 'BASEMENT',
66
+ r'\bFIRST\b': 'FIRST',
67
+ r'\bSECOND\b': 'SECOND',
68
+ r'\bTHIRD\b': 'THIRD',
69
+ r'\bFOURTH\b': 'FOURTH',
70
+ r'\bFIFTH\b': 'FIFTH',
71
+ r'\b1ST\b': 'FIRST',
72
+ r'\b2ND\b': 'SECOND',
73
+ r'\b3RD\b': 'THIRD',
74
+ r'\b4TH\b': 'FOURTH',
75
+ r'\b5TH\b': 'FIFTH',
76
+ }
77
+
78
+ def __init__(self, uppercase: bool = True, expand_abbrev: bool = True):
79
+ """
80
+ Initialize normalizer.
81
+
82
+ Args:
83
+ uppercase: Convert text to uppercase
84
+ expand_abbrev: Expand common abbreviations
85
+ """
86
+ self.uppercase = uppercase
87
+ self.expand_abbrev = expand_abbrev
88
+
89
+ # Compile regex patterns
90
+ self._abbrev_patterns = {
91
+ re.compile(pattern, re.IGNORECASE): replacement
92
+ for pattern, replacement in self.ABBREVIATIONS.items()
93
+ }
94
+
95
+ def normalize(self, address: str) -> str:
96
+ """
97
+ Normalize an address string.
98
+
99
+ Args:
100
+ address: Raw address string
101
+
102
+ Returns:
103
+ Normalized address string
104
+ """
105
+ if not address:
106
+ return ""
107
+
108
+ text = address
109
+
110
+ # Basic cleanup
111
+ text = self._clean_whitespace(text)
112
+ text = self._standardize_punctuation(text)
113
+
114
+ # Expand abbreviations
115
+ if self.expand_abbrev:
116
+ text = self._expand_abbreviations(text)
117
+
118
+ # Case normalization
119
+ if self.uppercase:
120
+ text = text.upper()
121
+
122
+ # Final whitespace cleanup
123
+ text = self._clean_whitespace(text)
124
+
125
+ return text
126
+
127
+ def _clean_whitespace(self, text: str) -> str:
128
+ """Remove extra whitespace."""
129
+ # Replace multiple spaces with single space
130
+ text = re.sub(r'\s+', ' ', text)
131
+ # Remove spaces around punctuation
132
+ text = re.sub(r'\s*,\s*', ', ', text)
133
+ text = re.sub(r'\s*-\s*', '-', text)
134
+ # Trim
135
+ return text.strip()
136
+
137
+ def _standardize_punctuation(self, text: str) -> str:
138
+ """Standardize punctuation marks."""
139
+ # Replace various dash types with standard hyphen
140
+ text = re.sub(r'[–—]', '-', text)
141
+ # Remove duplicate punctuation
142
+ text = re.sub(r',+', ',', text)
143
+ text = re.sub(r'-+', '-', text)
144
+ # Remove trailing punctuation before comma
145
+ text = re.sub(r'-,', ',', text)
146
+ return text
147
+
148
+ def _expand_abbreviations(self, text: str) -> str:
149
+ """Expand common abbreviations."""
150
+ for pattern, replacement in self._abbrev_patterns.items():
151
+ text = pattern.sub(replacement, text)
152
+ return text
153
+
154
+ def extract_pincode(self, address: str) -> str | None:
155
+ """Extract 6-digit Indian PIN code from address."""
156
+ match = re.search(r'\b[1-9]\d{5}\b', address)
157
+ return match.group(0) if match else None
158
+
159
+ def remove_pincode(self, address: str) -> str:
160
+ """Remove PIN code from address."""
161
+ return re.sub(r'\b[1-9]\d{5}\b', '', address)
162
+
163
+ def tokenize(self, text: str) -> list[str]:
164
+ """
165
+ Simple tokenization preserving address-specific patterns.
166
+
167
+ Args:
168
+ text: Normalized address text
169
+
170
+ Returns:
171
+ List of tokens
172
+ """
173
+ # Split on whitespace but keep special patterns together
174
+ # e.g., "H-3" stays as one token, "110041" stays together
175
+ tokens = []
176
+
177
+ # Pattern to match address tokens
178
+ pattern = r'''
179
+ [A-Z0-9]+[-/][A-Z0-9/]+ | # Compound identifiers like H-3, 24/1/3
180
+ [A-Z]+\d+ | # Letter+number combos like A5
181
+ \d+[A-Z]+ | # Number+letter combos like 5A
182
+ [A-Z]+ | # Words
183
+ \d+ | # Numbers
184
+ [,.] # Punctuation
185
+ '''
186
+
187
+ for match in re.finditer(pattern, text.upper(), re.VERBOSE):
188
+ token = match.group(0)
189
+ if token.strip():
190
+ tokens.append(token)
191
+
192
+ return tokens
src/address_parser/schemas.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic schemas for address parsing I/O."""
2
+
3
+ from pydantic import BaseModel, ConfigDict, Field
4
+
5
+ # Entity label definitions
6
# Entity label inventory for the address NER task.
ENTITY_LABELS = [
    "AREA",
    "SUBAREA",
    "HOUSE_NUMBER",
    "SECTOR",
    "GALI",
    "COLONY",
    "BLOCK",
    "CAMP",
    "POLE",
    "KHASRA",
    "FLOOR",
    "PLOT",
    "PINCODE",
    "CITY",
    "STATE",
]

# BIO tagging scheme: "O" first, then every B- tag, then every I- tag.
# The list order defines the label-id mapping, so it must stay stable.
BIO_LABELS = ["O"] + [
    f"{prefix}-{label}"
    for prefix in ("B", "I")
    for label in ENTITY_LABELS
]
LABEL2ID = {tag: idx for idx, tag in enumerate(BIO_LABELS)}
ID2LABEL = dict(enumerate(BIO_LABELS))
28
+
29
+
30
class AddressEntity(BaseModel):
    """One labeled span extracted from an address string."""

    # Entity type, e.g. HOUSE_NUMBER or AREA.
    label: str = Field(..., description="Entity type (e.g., HOUSE_NUMBER, AREA)")
    # Surface text of the span.
    value: str = Field(..., description="Extracted text value")
    # Character offsets into the original (pre-normalization) text.
    start: int = Field(..., description="Start character offset in original text")
    end: int = Field(..., description="End character offset in original text")
    # Model/rule confidence, clamped to [0, 1] by the validators.
    confidence: float = Field(default=1.0, ge=0.0, le=1.0, description="Confidence score")

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "label": "HOUSE_NUMBER",
                "value": "PLOT NO752",
                "start": 0,
                "end": 10,
                "confidence": 0.95,
            }
        }
    )
50
+
51
+
52
class ParsedAddress(BaseModel):
    """Full parse result: raw/normalized text, entity spans, and flat
    convenience fields populated from the entity list after init."""

    raw_address: str = Field(..., description="Original input address")
    normalized_address: str = Field(..., description="Normalized/cleaned address")
    entities: list[AddressEntity] = Field(default_factory=list, description="Extracted entities")

    # Flat accessors for common fields, derived in model_post_init.
    house_number: str | None = Field(None, description="Extracted house/plot number")
    floor: str | None = Field(None, description="Extracted floor")
    block: str | None = Field(None, description="Extracted block")
    gali: str | None = Field(None, description="Extracted gali/lane")
    colony: str | None = Field(None, description="Extracted colony name")
    area: str | None = Field(None, description="Extracted area/locality")
    subarea: str | None = Field(None, description="Extracted sub-area")
    sector: str | None = Field(None, description="Extracted sector")
    khasra: str | None = Field(None, description="Extracted khasra number")
    pincode: str | None = Field(None, description="Extracted PIN code")
    city: str | None = Field(None, description="Extracted city")
    state: str | None = Field(None, description="Extracted state")

    def model_post_init(self, __context) -> None:
        """Derive the flat convenience fields from the entity list."""
        by_label = {ent.label.upper(): ent.value for ent in self.entities}

        # HOUSE_NUMBER falls back to PLOT when absent.
        self.house_number = by_label.get("HOUSE_NUMBER") or by_label.get("PLOT")
        for attr, label in (
            ("floor", "FLOOR"),
            ("block", "BLOCK"),
            ("gali", "GALI"),
            ("colony", "COLONY"),
            ("area", "AREA"),
            ("subarea", "SUBAREA"),
            ("sector", "SECTOR"),
            ("khasra", "KHASRA"),
            ("pincode", "PINCODE"),
            ("city", "CITY"),
            ("state", "STATE"),
        ):
            setattr(self, attr, by_label.get(label))

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "raw_address": "PLOT NO752 FIRST FLOOR, BLOCK H-3, NEW DELHI, 110041",
                "normalized_address": "PLOT NO752 FIRST FLOOR BLOCK H-3 NEW DELHI 110041",
                "entities": [
                    {"label": "HOUSE_NUMBER", "value": "PLOT NO752", "start": 0, "end": 10, "confidence": 0.95},
                    {"label": "FLOOR", "value": "FIRST FLOOR", "start": 11, "end": 22, "confidence": 0.98},
                ],
                "house_number": "PLOT NO752",
                "floor": "FIRST FLOOR",
            }
        }
    )
104
+
105
+
106
class ParseRequest(BaseModel):
    """Input payload for parsing a single address."""

    # Bounded length guards against empty and abusive inputs.
    address: str = Field(..., min_length=5, max_length=500, description="Address to parse")
    return_confidence: bool = Field(default=True, description="Include confidence scores")

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "address": "PLOT NO752 FIRST FLOOR, BLOCK H-3, NEW DELHI, 110041",
                "return_confidence": True,
            }
        }
    )
120
+
121
+
122
class BatchParseRequest(BaseModel):
    """Input payload for parsing several addresses in one call."""

    # Batch size capped at 100 addresses per request.
    addresses: list[str] = Field(..., min_length=1, max_length=100, description="List of addresses")
    return_confidence: bool = Field(default=True, description="Include confidence scores")
127
+
128
+
129
class ParseResponse(BaseModel):
    """API response for a single address parse."""

    success: bool = Field(default=True, description="Whether parsing succeeded")
    # Exactly one of result/error is expected to be set.
    result: ParsedAddress | None = Field(None, description="Parsed address result")
    error: str | None = Field(None, description="Error message if failed")
    inference_time_ms: float = Field(..., description="Inference time in milliseconds")
136
+
137
+
138
class BatchParseResponse(BaseModel):
    """API response for a batch parse, with aggregate timing."""

    success: bool = Field(default=True)
    results: list[ParsedAddress] = Field(default_factory=list)
    total_inference_time_ms: float = Field(..., description="Total inference time")
    avg_inference_time_ms: float = Field(..., description="Average per-address time")
145
+
146
+
147
class HealthResponse(BaseModel):
    """Health-check endpoint payload."""

    status: str = Field(default="healthy")
    # False until the NER model has been loaded into memory.
    model_loaded: bool = Field(default=False)
    version: str = Field(default="2.0.0")
src/indian_address_parser.egg-info/PKG-INFO ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: indian-address-parser
3
+ Version: 2.0.0
4
+ Summary: Production-grade Indian address parsing using mBERT-CRF
5
+ Author-email: Kushagra <kushagra@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/kushagra/indian-address-parser
8
+ Project-URL: Documentation, https://github.com/kushagra/indian-address-parser#readme
9
+ Project-URL: Repository, https://github.com/kushagra/indian-address-parser
10
+ Project-URL: Issues, https://github.com/kushagra/indian-address-parser/issues
11
+ Keywords: nlp,ner,address-parsing,indian-addresses,bert,crf
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.14
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Text Processing :: Linguistic
19
+ Requires-Python: >=3.14
20
+ Description-Content-Type: text/markdown
21
+ Requires-Dist: torch>=2.9.1
22
+ Requires-Dist: transformers>=4.57.6
23
+ Requires-Dist: tokenizers>=0.22.2
24
+ Requires-Dist: datasets>=4.5.0
25
+ Requires-Dist: seqeval>=1.2.2
26
+ Requires-Dist: numpy>=2.4.1
27
+ Requires-Dist: pandas>=2.3.3
28
+ Requires-Dist: scikit-learn>=1.8.0
29
+ Requires-Dist: tqdm>=4.67.1
30
+ Requires-Dist: pydantic>=2.12.5
31
+ Requires-Dist: indic-transliteration>=2.3.75
32
+ Requires-Dist: regex>=2026.1.15
33
+ Requires-Dist: rapidfuzz>=3.14.3
34
+ Provides-Extra: api
35
+ Requires-Dist: fastapi>=0.128.0; extra == "api"
36
+ Requires-Dist: uvicorn[standard]>=0.40.0; extra == "api"
37
+ Requires-Dist: gunicorn>=23.0.0; extra == "api"
38
+ Requires-Dist: python-multipart>=0.0.21; extra == "api"
39
+ Provides-Extra: demo
40
+ Requires-Dist: gradio>=6.3.0; extra == "demo"
41
+ Provides-Extra: training
42
+ Requires-Dist: accelerate>=1.12.0; extra == "training"
43
+ Requires-Dist: wandb>=0.24.0; extra == "training"
44
+ Requires-Dist: optuna>=4.7.0; extra == "training"
45
+ Provides-Extra: onnx
46
+ Requires-Dist: onnx>=1.20.1; python_version < "3.14" and extra == "onnx"
47
+ Requires-Dist: onnxruntime>=1.23.2; python_version < "3.14" and extra == "onnx"
48
+ Provides-Extra: dev
49
+ Requires-Dist: pytest>=9.0.2; extra == "dev"
50
+ Requires-Dist: pytest-cov>=7.0.0; extra == "dev"
51
+ Requires-Dist: pytest-asyncio>=1.3.0; extra == "dev"
52
+ Requires-Dist: black>=26.1.0; extra == "dev"
53
+ Requires-Dist: ruff>=0.14.13; extra == "dev"
54
+ Requires-Dist: mypy>=1.19.1; extra == "dev"
55
+ Requires-Dist: pre-commit>=4.5.1; extra == "dev"
56
+ Provides-Extra: all
57
+ Requires-Dist: indian-address-parser[api,demo,dev,training]; extra == "all"
58
+ Provides-Extra: all-with-onnx
59
+ Requires-Dist: indian-address-parser[api,demo,dev,onnx,training]; extra == "all-with-onnx"
60
+
61
+ # Indian Address Parser
62
+
63
+ Production-grade NLP system for parsing unstructured Indian addresses into structured components using **mBERT-CRF** (Multilingual BERT with Conditional Random Field).
64
+
65
+ [![Python 3.14+](https://img.shields.io/badge/python-3.14+-blue.svg)](https://www.python.org/downloads/)
66
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
67
+
68
+ ## Features
69
+
70
+ - **High Accuracy**: 94%+ F1 score on test data
71
+ - **Multilingual**: Supports Hindi (Devanagari) + English
72
+ - **Fast Inference**: < 30ms per address with ONNX optimization
73
+ - **15 Entity Types**: House Number, Floor, Block, Gali, Colony, Area, Khasra, Pincode, etc.
74
+ - **Delhi-specific**: Gazetteer with 100+ localities for improved accuracy
75
+ - **Production Ready**: REST API, Docker, Cloud Run deployment
76
+
77
+ ## Demo
78
+
79
+ - **Interactive Demo**: [HuggingFace Spaces](https://huggingface.co/spaces/kushagra/indian-address-parser)
80
+ - **API Endpoint**: `https://indian-address-parser-xyz.run.app/docs`
81
+
82
+ ## Quick Start
83
+
84
+ ### Installation
85
+
86
+ ```bash
87
+ pip install indian-address-parser
88
+ ```
89
+
90
+ Or from source:
91
+
92
+ ```bash
93
+ git clone https://github.com/kushagra/indian-address-parser.git
94
+ cd indian-address-parser
95
+ pip install -e ".[all]"
96
+ ```
97
+
98
+ ### Usage
99
+
100
+ ```python
101
+ from address_parser import AddressParser
102
+
103
+ # Load parser (rules-only mode if model not available)
104
+ parser = AddressParser.rules_only()
105
+
106
+ # Or load trained model
107
+ # parser = AddressParser.from_pretrained("./models/address_ner")
108
+
109
+ # Parse address
110
+ result = parser.parse(
111
+ "PLOT NO752 FIRST FLOOR, BLOCK H-3 KH NO 24/1/3/2/2/202, "
112
+ "KAUNWAR SINGH NAGAR NEW DELHI, DELHI, 110041"
113
+ )
114
+
115
+ print(f"House Number: {result.house_number}")
116
+ print(f"Floor: {result.floor}")
117
+ print(f"Block: {result.block}")
118
+ print(f"Khasra: {result.khasra}")
119
+ print(f"Area: {result.area}")
120
+ print(f"Pincode: {result.pincode}")
121
+ ```
122
+
123
+ **Output:**
124
+ ```
125
+ House Number: PLOT NO752
126
+ Floor: FIRST FLOOR
127
+ Block: BLOCK H-3
128
+ Khasra: KH NO 24/1/3/2/2/202
129
+ Area: KAUNWAR SINGH NAGAR
130
+ Pincode: 110041
131
+ ```
132
+
133
+ ### Entity Types
134
+
135
+ | Entity | Description | Example |
136
+ |--------|-------------|---------|
137
+ | `HOUSE_NUMBER` | House/plot number | `H.NO. 123`, `PLOT NO752` |
138
+ | `FLOOR` | Floor level | `FIRST FLOOR`, `GF` |
139
+ | `BLOCK` | Block identifier | `BLOCK H-3`, `BLK A` |
140
+ | `SECTOR` | Sector number | `SECTOR 15` |
141
+ | `GALI` | Lane/gali number | `GALI NO. 5` |
142
+ | `COLONY` | Colony name | `BABA HARI DAS COLONY` |
143
+ | `AREA` | Area/locality | `KAUNWAR SINGH NAGAR` |
144
+ | `SUBAREA` | Sub-area | `TIKARI KALA` |
145
+ | `KHASRA` | Khasra number | `KH NO 24/1/3/2` |
146
+ | `PINCODE` | 6-digit PIN code | `110041` |
147
+ | `CITY` | City name | `NEW DELHI` |
148
+ | `STATE` | State name | `DELHI` |
149
+
150
+ ## API Usage
151
+
152
+ ### REST API
153
+
154
+ ```bash
155
+ # Start API server
156
+ uvicorn api.main:app --host 0.0.0.0 --port 8080
157
+
158
+ # Parse single address
159
+ curl -X POST "http://localhost:8080/parse" \
160
+ -H "Content-Type: application/json" \
161
+ -d '{"address": "PLOT NO752 FIRST FLOOR, NEW DELHI, 110041"}'
162
+
163
+ # Batch parse
164
+ curl -X POST "http://localhost:8080/parse/batch" \
165
+ -H "Content-Type: application/json" \
166
+ -d '{"addresses": ["ADDRESS 1", "ADDRESS 2"]}'
167
+ ```
168
+
169
+ ### Python API
170
+
171
+ ```python
172
+ from address_parser import AddressParser
173
+
174
+ parser = AddressParser.from_pretrained("./models/address_ner")
175
+
176
+ # Single parse with timing
177
+ response = parser.parse_with_timing("NEW DELHI 110041")
178
+ print(f"Inference time: {response.inference_time_ms:.2f}ms")
179
+
180
+ # Batch parse
181
+ batch_response = parser.parse_batch([
182
+ "PLOT NO 123, DWARKA, 110078",
183
+ "H.NO. 456, LAJPAT NAGAR, 110024",
184
+ ])
185
+ print(f"Average time: {batch_response.avg_inference_time_ms:.2f}ms")
186
+ ```
187
+
188
+ ## Training
189
+
190
+ ### Data Preparation
191
+
192
+ Convert existing Label Studio annotations to BIO format:
193
+
194
+ ```bash
195
+ python training/convert_data.py
196
+ ```
197
+
198
+ This creates:
199
+ - `data/processed/train.jsonl`
200
+ - `data/processed/val.jsonl`
201
+ - `data/processed/test.jsonl`
202
+
203
+ ### Train Model
204
+
205
+ ```bash
206
+ python training/train.py \
207
+ --train data/processed/train.jsonl \
208
+ --val data/processed/val.jsonl \
209
+ --output models/address_ner \
210
+ --model bert-base-multilingual-cased \
211
+ --epochs 10 \
212
+ --batch-size 16
213
+ ```
214
+
215
+ ### Data Augmentation
216
+
217
+ Augment training data for improved robustness:
218
+
219
+ ```python
220
+ from training.augment import AddressAugmenter, augment_dataset
221
+
222
+ augmenter = AddressAugmenter(
223
+ abbrev_prob=0.3,
224
+ case_prob=0.2,
225
+ typo_prob=0.1,
226
+ )
227
+
228
+ augmented_data = augment_dataset(original_data, augmenter, target_size=1500)
229
+ ```
230
+
231
+ ## Deployment
232
+
233
+ ### Docker
234
+
235
+ ```bash
236
+ # Build
237
+ docker build -t indian-address-parser -f api/Dockerfile .
238
+
239
+ # Run
240
+ docker run -p 8080:8080 indian-address-parser
241
+ ```
242
+
243
+ ### Google Cloud Run
244
+
245
+ ```bash
246
+ # Deploy with Cloud Build
247
+ gcloud builds submit --config api/cloudbuild.yaml
248
+
249
+ # Or deploy directly
250
+ gcloud run deploy indian-address-parser \
251
+ --image gcr.io/PROJECT_ID/indian-address-parser \
252
+ --region us-central1 \
253
+ --min-instances 1 \
254
+ --allow-unauthenticated
255
+ ```
256
+
257
+ ### HuggingFace Spaces
258
+
259
+ 1. Create a new Space on HuggingFace
260
+ 2. Copy contents of `demo/` directory
261
+ 3. Upload trained model to HuggingFace Hub
262
+ 4. Update `MODEL_PATH` environment variable
263
+
264
+ ## Architecture
265
+
266
+ ```
267
+ ┌─────────────────────────────────────────────────────────────────┐
268
+ │ Indian Address Parser Pipeline │
269
+ ├─────────────────────────────────────────────────────────────────┤
270
+ │ ┌──────────────┐ ┌─────────────────┐ ┌────────────────────┐ │
271
+ │ │ Preprocessor │→│ mBERT-CRF │→│ Post-processor │ │
272
+ │ │ (Hindi/Eng) │ │ (multilingual) │ │ (rules+gazetteer) │ │
273
+ │ └──────────────┘ └─────────────────┘ └────────────────────┘ │
274
+ ├─────────────────────────────────────────────────────────────────┤
275
+ │ Components: │
276
+ │ • AddressNormalizer: Text normalization, abbreviation expansion│
277
+ │ • HindiTransliterator: Devanagari → Latin conversion │
278
+ │ • BertCRFForTokenClassification: mBERT + CRF for NER │
279
+ │ • RuleBasedRefiner: Pattern matching, entity validation │
280
+ │ • DelhiGazetteer: Fuzzy matching for locality names │
281
+ └─────────────────────────────────────────────────────────────────┘
282
+ ```
283
+
284
+ ## Performance
285
+
286
+ | Metric | Value |
287
+ |--------|-------|
288
+ | Precision | 94.2% |
289
+ | Recall | 95.1% |
290
+ | F1 Score | 94.6% |
291
+ | Inference Time | ~25ms |
292
+
293
+ Tested on held-out test set of 60+ Delhi addresses.
294
+
295
+ ## Project Structure
296
+
297
+ ```
298
+ indian-address-parser/
299
+ ├── src/address_parser/
300
+ │ ├── preprocessing/ # Text normalization, Hindi transliteration
301
+ │ ├── models/ # mBERT-CRF model architecture
302
+ │ ├── postprocessing/ # Rules, gazetteer, validation
303
+ │ ├── pipeline.py # Main orchestration
304
+ │ └── schemas.py # Pydantic I/O models
305
+ ├── api/ # FastAPI service
306
+ ├── demo/ # Gradio demo for HuggingFace Spaces
307
+ ├── training/ # Data prep, training scripts
308
+ ├── tests/ # pytest test suite
309
+ └── pyproject.toml # Package config
310
+ ```
311
+
312
+ ## Development
313
+
314
+ ### Setup
315
+
316
+ ```bash
317
+ # Clone repository
318
+ git clone https://github.com/kushagra/indian-address-parser.git
319
+ cd indian-address-parser
320
+
321
+ # Install with dev dependencies
322
+ pip install -e ".[dev]"
323
+
324
+ # Install pre-commit hooks
325
+ pre-commit install
326
+ ```
327
+
328
+ ### Testing
329
+
330
+ ```bash
331
+ # Run all tests
332
+ pytest
333
+
334
+ # Run with coverage
335
+ pytest --cov=address_parser --cov-report=html
336
+
337
+ # Run specific test file
338
+ pytest tests/test_pipeline.py -v
339
+ ```
340
+
341
+ ### Code Quality
342
+
343
+ ```bash
344
+ # Format code
345
+ black src/ tests/
346
+
347
+ # Lint
348
+ ruff check src/ tests/
349
+
350
+ # Type check
351
+ mypy src/
352
+ ```
353
+
354
+ ## Comparison with Alternatives
355
+
356
+ | Solution | Indian Support | Custom Labels | Latency | Cost |
357
+ |----------|---------------|---------------|---------|------|
358
+ | **This Project** | Excellent | Yes (15 types) | ~25ms | Free |
359
+ | libpostal | Poor | No | ~5ms | Free |
360
+ | Deepparse | Generic | No | ~50ms | Free |
361
+ | GPT-4 | Good | Configurable | ~1000ms | $0.03/call |
362
+ | Google Geocoding | Moderate | No | ~200ms | $5/1000 |
363
+
364
+ ## License
365
+
366
+ MIT License - see [LICENSE](LICENSE) for details.
367
+
368
+ ## Acknowledgments
369
+
370
+ - Original 2024 BSES Delhi internship project
371
+ - HuggingFace Transformers library
372
+ - Delhi locality data from public sources
373
+
374
+ ## Citation
375
+
376
+ ```bibtex
377
+ @software{indian_address_parser,
378
+ author = {Kushagra},
379
+ title = {Indian Address Parser: Production-grade NER for Indian Addresses},
380
+ year = {2026},
381
+ url = {https://github.com/kushagra/indian-address-parser}
382
+ }
383
+ ```
src/indian_address_parser.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ README.md
2
+ pyproject.toml
3
+ src/address_parser/__init__.py
4
+ src/address_parser/cli.py
5
+ src/address_parser/pipeline.py
6
+ src/address_parser/schemas.py
7
+ src/address_parser/models/__init__.py
8
+ src/address_parser/models/bert_crf.py
9
+ src/address_parser/models/config.py
10
+ src/address_parser/postprocessing/__init__.py
11
+ src/address_parser/postprocessing/gazetteer.py
12
+ src/address_parser/postprocessing/rules.py
13
+ src/address_parser/preprocessing/__init__.py
14
+ src/address_parser/preprocessing/hindi.py
15
+ src/address_parser/preprocessing/normalizer.py
16
+ src/indian_address_parser.egg-info/PKG-INFO
17
+ src/indian_address_parser.egg-info/SOURCES.txt
18
+ src/indian_address_parser.egg-info/dependency_links.txt
19
+ src/indian_address_parser.egg-info/entry_points.txt
20
+ src/indian_address_parser.egg-info/requires.txt
21
+ src/indian_address_parser.egg-info/top_level.txt
22
+ tests/test_pipeline.py
23
+ tests/test_postprocessing.py
24
+ tests/test_preprocessing.py
src/indian_address_parser.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
src/indian_address_parser.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [console_scripts]
2
+ address-parser = address_parser.cli:main
src/indian_address_parser.egg-info/requires.txt ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch>=2.9.1
2
+ transformers>=4.57.6
3
+ tokenizers>=0.22.2
4
+ datasets>=4.5.0
5
+ seqeval>=1.2.2
6
+ numpy>=2.4.1
7
+ pandas>=2.3.3
8
+ scikit-learn>=1.8.0
9
+ tqdm>=4.67.1
10
+ pydantic>=2.12.5
11
+ indic-transliteration>=2.3.75
12
+ regex>=2026.1.15
13
+ rapidfuzz>=3.14.3
14
+
15
+ [all]
16
+ indian-address-parser[api,demo,dev,training]
17
+
18
+ [all-with-onnx]
19
+ indian-address-parser[api,demo,dev,onnx,training]
20
+
21
+ [api]
22
+ fastapi>=0.128.0
23
+ uvicorn[standard]>=0.40.0
24
+ gunicorn>=23.0.0
25
+ python-multipart>=0.0.21
26
+
27
+ [demo]
28
+ gradio>=6.3.0
29
+
30
+ [dev]
31
+ pytest>=9.0.2
32
+ pytest-cov>=7.0.0
33
+ pytest-asyncio>=1.3.0
34
+ black>=26.1.0
35
+ ruff>=0.14.13
36
+ mypy>=1.19.1
37
+ pre-commit>=4.5.1
38
+
39
+ [onnx]
40
+
41
+ [onnx:python_version < "3.14"]
42
+ onnx>=1.20.1
43
+ onnxruntime>=1.23.2
44
+
45
+ [training]
46
+ accelerate>=1.12.0
47
+ wandb>=0.24.0
48
+ optuna>=4.7.0
src/indian_address_parser.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ address_parser