x committed on
Upload folder using huggingface_hub
Browse files- README.md +46 -12
- app.py +280 -0
- requirements.txt +10 -0
- src/address_parser/__init__.py +26 -0
- src/address_parser/__pycache__/__init__.cpython-312.pyc +0 -0
- src/address_parser/__pycache__/__init__.cpython-314.pyc +0 -0
- src/address_parser/__pycache__/cli.cpython-314.pyc +0 -0
- src/address_parser/__pycache__/pipeline.cpython-312.pyc +0 -0
- src/address_parser/__pycache__/pipeline.cpython-314.pyc +0 -0
- src/address_parser/__pycache__/schemas.cpython-312.pyc +0 -0
- src/address_parser/__pycache__/schemas.cpython-314.pyc +0 -0
- src/address_parser/cli.py +132 -0
- src/address_parser/models/__init__.py +6 -0
- src/address_parser/models/__pycache__/__init__.cpython-312.pyc +0 -0
- src/address_parser/models/__pycache__/__init__.cpython-314.pyc +0 -0
- src/address_parser/models/__pycache__/bert_crf.cpython-312.pyc +0 -0
- src/address_parser/models/__pycache__/bert_crf.cpython-314.pyc +0 -0
- src/address_parser/models/__pycache__/config.cpython-312.pyc +0 -0
- src/address_parser/models/__pycache__/config.cpython-314.pyc +0 -0
- src/address_parser/models/bert_crf.py +439 -0
- src/address_parser/models/config.py +103 -0
- src/address_parser/pipeline.py +528 -0
- src/address_parser/postprocessing/__init__.py +6 -0
- src/address_parser/postprocessing/__pycache__/__init__.cpython-312.pyc +0 -0
- src/address_parser/postprocessing/__pycache__/__init__.cpython-314.pyc +0 -0
- src/address_parser/postprocessing/__pycache__/gazetteer.cpython-312.pyc +0 -0
- src/address_parser/postprocessing/__pycache__/gazetteer.cpython-314.pyc +0 -0
- src/address_parser/postprocessing/__pycache__/rules.cpython-312.pyc +0 -0
- src/address_parser/postprocessing/__pycache__/rules.cpython-314.pyc +0 -0
- src/address_parser/postprocessing/gazetteer.py +164 -0
- src/address_parser/postprocessing/rules.py +536 -0
- src/address_parser/preprocessing/__init__.py +6 -0
- src/address_parser/preprocessing/__pycache__/__init__.cpython-312.pyc +0 -0
- src/address_parser/preprocessing/__pycache__/__init__.cpython-314.pyc +0 -0
- src/address_parser/preprocessing/__pycache__/hindi.cpython-312.pyc +0 -0
- src/address_parser/preprocessing/__pycache__/hindi.cpython-314.pyc +0 -0
- src/address_parser/preprocessing/__pycache__/normalizer.cpython-312.pyc +0 -0
- src/address_parser/preprocessing/__pycache__/normalizer.cpython-314.pyc +0 -0
- src/address_parser/preprocessing/hindi.py +242 -0
- src/address_parser/preprocessing/normalizer.py +192 -0
- src/address_parser/schemas.py +152 -0
- src/indian_address_parser.egg-info/PKG-INFO +383 -0
- src/indian_address_parser.egg-info/SOURCES.txt +24 -0
- src/indian_address_parser.egg-info/dependency_links.txt +1 -0
- src/indian_address_parser.egg-info/entry_points.txt +2 -0
- src/indian_address_parser.egg-info/requires.txt +48 -0
- src/indian_address_parser.egg-info/top_level.txt +1 -0
README.md
CHANGED
|
@@ -1,12 +1,46 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Indian Address Parser
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 6.
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Indian Address Parser
|
| 3 |
+
emoji: 🏠
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: "6.3.0"
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Indian Address Parser
|
| 14 |
+
|
| 15 |
+
Parse unstructured Indian addresses into structured components using **IndicBERTv2-CRF**.
|
| 16 |
+
|
| 17 |
+
## Features
|
| 18 |
+
|
| 19 |
+
- **Multilingual**: Supports Hindi (Devanagari) + English
|
| 20 |
+
- **15 Entity Types**: House Number, Floor, Block, Gali, Colony, Area, Khasra, Pincode, etc.
|
| 21 |
+
- **~80% F1 score** on held-out test data (mBERT-CRF baseline)
|
| 22 |
+
- **Fast**: < 30ms inference time
|
| 23 |
+
|
| 24 |
+
## Example
|
| 25 |
+
|
| 26 |
+
**Input:**
|
| 27 |
+
```
|
| 28 |
+
PLOT NO752 FIRST FLOOR, BLOCK H-3 KH NO 24/1/3/2/2/202, KAUNWAR SINGH NAGAR NEW DELHI, DELHI, 110041
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
**Output:**
|
| 32 |
+
| Entity | Value |
|
| 33 |
+
|--------|-------|
|
| 34 |
+
| HOUSE_NUMBER | PLOT NO752 |
|
| 35 |
+
| FLOOR | FIRST FLOOR |
|
| 36 |
+
| BLOCK | BLOCK H-3 |
|
| 37 |
+
| KHASRA | KH NO 24/1/3/2/2/202 |
|
| 38 |
+
| AREA | KAUNWAR SINGH NAGAR |
|
| 39 |
+
| CITY | NEW DELHI |
|
| 40 |
+
| PINCODE | 110041 |
|
| 41 |
+
|
| 42 |
+
## Technical Details
|
| 43 |
+
|
| 44 |
+
- **Model**: ai4bharat/IndicBERTv2-SS + CRF layer
|
| 45 |
+
- **Training Data**: 600+ annotated Delhi addresses
|
| 46 |
+
- **Framework**: PyTorch + HuggingFace Transformers
|
app.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Gradio demo for Indian Address Parser.
|
| 3 |
+
|
| 4 |
+
Interactive web interface for HuggingFace Spaces deployment.
|
| 5 |
+
Features:
|
| 6 |
+
- Real-time address parsing
|
| 7 |
+
- Entity highlighting
|
| 8 |
+
- Example addresses
|
| 9 |
+
- Confidence scores
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import sys
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
import gradio as gr
|
| 17 |
+
|
| 18 |
+
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
| 19 |
+
|
| 20 |
+
from address_parser import AddressParser, ParsedAddress
|
| 21 |
+
|
| 22 |
+
# Entity colors for visualization.
# Maps entity label -> hex background color used for the highlighted spans
# and the legend; labels missing from this dict fall back to gray (#CCCCCC)
# in create_highlighted_html.
ENTITY_COLORS = {
    "HOUSE_NUMBER": "#FF6B6B",  # Red
    "PLOT": "#FF6B6B",  # shares HOUSE_NUMBER's red (same visual category)
    "FLOOR": "#4ECDC4",  # Teal
    "BLOCK": "#45B7D1",  # Blue
    "SECTOR": "#96CEB4",  # Green
    "GALI": "#FFEAA7",  # Yellow
    "COLONY": "#DDA0DD",  # Plum
    "AREA": "#98D8C8",  # Mint
    "SUBAREA": "#F7DC6F",  # Light yellow
    "KHASRA": "#BB8FCE",  # Purple
    "PINCODE": "#85C1E9",  # Light blue
    "CITY": "#F8B500",  # Orange
    "STATE": "#58D68D",  # Light green
}

# Example addresses shown as one-click inputs in the Gradio demo
# (Delhi-centric, mixing plot/khasra/gali styles and casing variants).
EXAMPLES = [
    "PLOT NO752 FIRST FLOOR, BLOCK H-3 KH NO 24/1/3/2/2/202, KAUNWAR SINGH NAGAR NEW DELHI, DELHI, 110041",
    "H.NO. 123, GALI NO. 5, LAJPAT NAGAR, SOUTH DELHI, 110024",
    "FLAT NO A-501, SECTOR 15, DWARKA, NEW DELHI, 110078",
    "KHASRA NO 45/2, VILLAGE MUNDKA, OUTER DELHI, 110041",
    "S-3/166, GROUND FLOOR, KH NO 98/4, GALI NO-6, SWARN PARK MUNDKA, Delhi, 110041",
    "PLOT NO A5 GROUND FLOOR, KHASRA NO 15/20/2 BABA HARI DAS COLONY, TIKARI KALA, DELHI, 110041",
]
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def load_parser():
    """Load the address parser, preferring a local checkout, then the Hub.

    Resolution order:
      1. A local model directory (``MODEL_PATH`` env var) that contains a
         ``pytorch_model.bin`` weights file.
      2. A HuggingFace Hub repo named by the ``HF_MODEL_REPO`` env var.
      3. Rules-only mode (no trained model) as the last resort.
    """
    from huggingface_hub import snapshot_download

    # Configuration - HF_MODEL_REPO should be set in Space settings
    hub_repo = os.getenv("HF_MODEL_REPO", "")
    local_model_dir = os.getenv("MODEL_PATH", "./models/address_ner_v3")

    # Try local path first (for development/testing); a usable checkout must
    # have the weights file alongside the config.
    local = Path(local_model_dir)
    if local.exists() and (local / "pytorch_model.bin").exists():
        print(f"Loading model from local path: {local_model_dir}")
        return AddressParser.from_pretrained(local_model_dir, device="cpu")

    # Otherwise try pulling a snapshot from the HuggingFace Hub.
    if hub_repo:
        try:
            print(f"Downloading model from HuggingFace Hub: {hub_repo}")
            snapshot_dir = snapshot_download(repo_id=hub_repo, repo_type="model")
            print(f"Model downloaded to: {snapshot_dir}")
            return AddressParser.from_pretrained(snapshot_dir, device="cpu")
        except Exception as e:
            # Deliberately best-effort: a Hub failure degrades to rules-only
            # mode rather than crashing the Space at startup.
            print(f"Failed to load model from HF Hub: {e}")

    # Fallback to rules-only mode
    print("No model available, using rules-only mode")
    return AddressParser.rules_only()
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# Initialize parser
|
| 79 |
+
parser = load_parser()
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def create_highlighted_html(result: "ParsedAddress") -> str:
    """Render the normalized address as HTML with entity spans highlighted.

    Each recognized entity is wrapped in a colored ``<span>`` whose tooltip
    shows the label and confidence; text between entities is emitted as-is.

    All text originating from the (user-supplied) address is passed through
    ``html.escape`` so markup typed into the input box cannot be injected
    into the page via the gr.HTML output component.

    Args:
        result: Parsed address with ``normalized_address`` and ``entities``
            (each entity exposing start/end/label/value/confidence).

    Returns:
        An HTML fragment string.
    """
    import html  # local import: keeps module import-time behavior unchanged

    text = result.normalized_address
    if not result.entities:
        return f"<p>{html.escape(text)}</p>"

    # Walk entities left-to-right, copying the gaps between them verbatim.
    html_parts: list[str] = []
    last_end = 0
    for entity in sorted(result.entities, key=lambda e: e.start):
        if entity.start > last_end:
            html_parts.append(html.escape(text[last_end:entity.start]))

        # Unknown labels fall back to a neutral gray.
        color = ENTITY_COLORS.get(entity.label, "#CCCCCC")
        html_parts.append(
            f'<span style="background-color: {color}; padding: 2px 6px; '
            f'border-radius: 4px; margin: 0 2px; font-weight: bold;" '
            f'title="{html.escape(entity.label)} ({entity.confidence:.0%})">'
            f'{html.escape(entity.value)}</span>'
        )
        last_end = entity.end

    # Trailing text after the last entity.
    if last_end < len(text):
        html_parts.append(html.escape(text[last_end:]))

    return "".join(html_parts)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def create_entity_table(result: ParsedAddress) -> list[list[str]]:
    """Build the [label, value, confidence%] rows for the results Dataframe.

    Rows are ordered by the entity's start offset in the address; an address
    with no entities yields an empty table.
    """
    if not result.entities:
        return []

    rows = []
    for ent in sorted(result.entities, key=lambda e: e.start):
        rows.append([ent.label, ent.value, f"{ent.confidence:.0%}"])
    return rows
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def parse_address(address: str) -> tuple[str, list[list[str]], str]:
    """
    Parse address and return results for Gradio interface.

    Returns:
        - Highlighted HTML
        - Entity table
        - Structured output JSON
    """
    import json

    # Empty / whitespace-only input: show a prompt instead of parsing.
    if not address or not address.strip():
        return "<p>Please enter an address</p>", [], "{}"

    parsed = parser.parse(address)

    html_view = create_highlighted_html(parsed)
    rows = create_entity_table(parsed)

    # Collect the structured fields in display order, dropping falsy
    # (None/empty) values so the JSON only shows what was extracted.
    field_names = (
        "house_number", "floor", "block", "gali", "colony", "area",
        "subarea", "sector", "khasra", "pincode", "city", "state",
    )
    structured = {
        name: value
        for name in field_names
        if (value := getattr(parsed, name))
    }

    structured_json = json.dumps(structured, indent=2, ensure_ascii=False)
    return html_view, rows, structured_json
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
# Custom CSS for the demo (applied via the gr.Blocks `css` argument).
CUSTOM_CSS = """
.highlighted-text {
    font-size: 1.1em;
    line-height: 1.8;
    padding: 15px;
    background: #f8f9fa;
    border-radius: 8px;
}
"""

# Create Gradio interface.
# FIX: `theme` and `css` are gr.Blocks() constructor arguments, not
# demo.launch() arguments. Previously they were passed to launch(), which
# raises a TypeError (launch has no such parameters) and meant CUSTOM_CSS
# was never applied. They are now set here on Blocks.
with gr.Blocks(
    title="Indian Address Parser",
    theme=gr.themes.Soft(),
    css=CUSTOM_CSS,
) as demo:
    gr.Markdown(
        """
        # Indian Address Parser

        Parse unstructured Indian addresses into structured components using
        **mBERT-CRF** (Multilingual BERT with Conditional Random Field).

        ## Features
        - Supports Hindi + English (Devanagari and Latin scripts)
        - 15 entity types: House Number, Floor, Block, Gali, Colony, Area, Khasra, Pincode, etc.
        - Delhi-specific locality gazetteer for improved accuracy
        - < 30ms inference time

        ---
        """
    )

    with gr.Row():
        with gr.Column(scale=2):
            address_input = gr.Textbox(
                label="Enter Address",
                placeholder="e.g., PLOT NO752 FIRST FLOOR, BLOCK H-3, NEW DELHI, 110041",
                lines=3,
            )
            parse_btn = gr.Button("Parse Address", variant="primary")

            gr.Examples(
                examples=[[ex] for ex in EXAMPLES],
                inputs=[address_input],
                label="Example Addresses",
            )

    gr.Markdown("## Results")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Highlighted Entities")
            highlighted_output = gr.HTML(
                elem_classes=["highlighted-text"]
            )

        with gr.Column(scale=1):
            gr.Markdown("### Extracted Entities")
            entity_table = gr.Dataframe(
                headers=["Entity Type", "Value", "Confidence"],
                datatype=["str", "str", "str"],
                row_count=10,
            )

    with gr.Row():
        gr.Markdown("### Structured Output")
        structured_output = gr.Code(
            language="json",
            label="Structured JSON",
        )

    # Legend: one colored chip per known entity label.
    gr.Markdown("### Entity Legend")
    legend_html = " ".join([
        f'<span style="background-color: {color}; padding: 2px 8px; '
        f'border-radius: 4px; margin: 2px; display: inline-block;">{label}</span>'
        for label, color in ENTITY_COLORS.items()
    ])
    gr.HTML(f"<div style='line-height: 2.5;'>{legend_html}</div>")

    # Footer
    gr.Markdown(
        """
        ---
        **Model**: IndicBERTv2-SS + CRF (ai4bharat/IndicBERTv2-SS + CRF layer)
        | **Training Data**: 600+ annotated Delhi addresses
        | **GitHub**: [indian-address-parser](https://github.com/howdoiusekeyboard/indian-address-parser)
        """
    )

    # Event handlers: button click and Enter-in-textbox run the same parse.
    parse_btn.click(
        fn=parse_address,
        inputs=[address_input],
        outputs=[highlighted_output, entity_table, structured_output],
    )

    address_input.submit(
        fn=parse_address,
        inputs=[address_input],
        outputs=[highlighted_output, entity_table, structured_output],
    )


if __name__ == "__main__":
    # Bind on all interfaces; port is configurable for Spaces via $PORT.
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", "7860")),
        share=False,
    )
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HuggingFace Spaces requirements (Python 3.14)
|
| 2 |
+
torch>=2.9.1
|
| 3 |
+
transformers>=4.57.6
|
| 4 |
+
tokenizers>=0.22.2
|
| 5 |
+
huggingface_hub>=0.25.0
|
| 6 |
+
gradio>=6.3.0
|
| 7 |
+
pydantic>=2.12.5
|
| 8 |
+
indic-transliteration>=2.3.75
|
| 9 |
+
rapidfuzz>=3.14.3
|
| 10 |
+
regex>=2026.1.15
|
src/address_parser/__init__.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Indian Address Parser - Production-grade NER for Indian addresses.
|
| 3 |
+
|
| 4 |
+
A modern NLP system for parsing unstructured Indian addresses into
|
| 5 |
+
structured components using mBERT-CRF architecture with Hindi+English support.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
__version__ = "2.0.0"
|
| 9 |
+
__author__ = "Kushagra"
|
| 10 |
+
|
| 11 |
+
from address_parser.pipeline import AddressParser
|
| 12 |
+
from address_parser.schemas import (
|
| 13 |
+
AddressEntity,
|
| 14 |
+
ParsedAddress,
|
| 15 |
+
ParseRequest,
|
| 16 |
+
ParseResponse,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
__all__ = [
|
| 20 |
+
"AddressParser",
|
| 21 |
+
"AddressEntity",
|
| 22 |
+
"ParsedAddress",
|
| 23 |
+
"ParseRequest",
|
| 24 |
+
"ParseResponse",
|
| 25 |
+
"__version__",
|
| 26 |
+
]
|
src/address_parser/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (741 Bytes). View file
|
|
|
src/address_parser/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (738 Bytes). View file
|
|
|
src/address_parser/__pycache__/cli.cpython-314.pyc
ADDED
|
Binary file (6.4 kB). View file
|
|
|
src/address_parser/__pycache__/pipeline.cpython-312.pyc
ADDED
|
Binary file (16.9 kB). View file
|
|
|
src/address_parser/__pycache__/pipeline.cpython-314.pyc
ADDED
|
Binary file (19.6 kB). View file
|
|
|
src/address_parser/__pycache__/schemas.cpython-312.pyc
ADDED
|
Binary file (7.94 kB). View file
|
|
|
src/address_parser/__pycache__/schemas.cpython-314.pyc
ADDED
|
Binary file (10.2 kB). View file
|
|
|
src/address_parser/cli.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Command-line interface for Indian Address Parser."""
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def main() -> None:
    """Main CLI entry point.

    Parses a single address (positional argument) or a file of addresses
    (--input, one per line), using either a trained model (--model) or the
    rules-only fallback, and prints results in json/table/simple format.
    Exits with status 1 when no address source is given.
    """
    parser = argparse.ArgumentParser(
        description="Parse Indian addresses using NER",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Parse single address
    address-parser "PLOT NO752 FIRST FLOOR, NEW DELHI, 110041"

    # Parse from file
    address-parser --input addresses.txt --output parsed.json

    # Use trained model
    address-parser --model ./models/address_ner_v3 "H.NO. 123, LAJPAT NAGAR"
"""
    )

    parser.add_argument(
        "address",
        nargs="?",
        help="Address to parse (or use --input for file)"
    )
    parser.add_argument(
        "--input", "-i",
        help="Input file with addresses (one per line)"
    )
    parser.add_argument(
        "--output", "-o",
        help="Output JSON file"
    )
    parser.add_argument(
        "--model", "-m",
        help="Path to trained model directory"
    )
    parser.add_argument(
        "--format", "-f",
        choices=["json", "table", "simple"],
        default="json",
        help="Output format (default: json)"
    )
    # NOTE(review): version string duplicates address_parser.__version__ —
    # keep in sync when bumping.
    parser.add_argument(
        "--version", "-v",
        action="version",
        version="indian-address-parser 2.0.0"
    )

    args = parser.parse_args()

    # Import here to avoid slow startup
    from address_parser import AddressParser

    # Load parser: a trained model if its directory exists, else rules-only.
    # Diagnostics go to stderr so stdout stays machine-parseable.
    if args.model and Path(args.model).exists():
        print(f"Loading model from {args.model}...", file=sys.stderr)
        address_parser = AddressParser.from_pretrained(args.model)
    else:
        print("Using rules-only mode", file=sys.stderr)
        address_parser = AddressParser.rules_only()

    # Get addresses to parse (file takes precedence over the positional arg).
    addresses = []
    if args.input:
        with open(args.input, encoding="utf-8") as f:
            addresses = [line.strip() for line in f if line.strip()]
    elif args.address:
        addresses = [args.address]
    else:
        parser.print_help()
        sys.exit(1)

    # Parse addresses
    results = []
    for addr in addresses:
        result = address_parser.parse(addr)
        results.append(result)

    # Output
    if args.format == "json":
        # model_dump() — presumably results are pydantic models; verify
        # against address_parser.schemas.
        output = [r.model_dump() for r in results]
        json_str = json.dumps(output, indent=2, ensure_ascii=False)

        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(json_str)
            print(f"Saved to {args.output}", file=sys.stderr)
        else:
            print(json_str)

    elif args.format == "table":
        # Fixed-width table per address; raw address truncated to 50 chars.
        for i, result in enumerate(results):
            print(f"\n{'='*60}")
            print(f"Address {i+1}: {result.raw_address[:50]}...")
            print(f"{'='*60}")
            print(f"{'Entity':<15} {'Value':<40} {'Conf':<6}")
            print("-" * 60)
            for entity in result.entities:
                print(f"{entity.label:<15} {entity.value:<40} {entity.confidence:.0%}")

    else:  # simple
        # One pipe-separated line per address, only showing populated fields.
        for result in results:
            parts = []
            if result.house_number:
                parts.append(f"House: {result.house_number}")
            if result.floor:
                parts.append(f"Floor: {result.floor}")
            if result.block:
                parts.append(f"Block: {result.block}")
            if result.gali:
                parts.append(f"Gali: {result.gali}")
            if result.colony:
                parts.append(f"Colony: {result.colony}")
            if result.area:
                parts.append(f"Area: {result.area}")
            if result.pincode:
                parts.append(f"PIN: {result.pincode}")
            if result.city:
                parts.append(f"City: {result.city}")

            print(" | ".join(parts) if parts else "No entities found")


if __name__ == "__main__":
    main()
|
src/address_parser/models/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model architectures for address NER."""
|
| 2 |
+
|
| 3 |
+
from address_parser.models.bert_crf import BertCRFForTokenClassification
|
| 4 |
+
from address_parser.models.config import ModelConfig
|
| 5 |
+
|
| 6 |
+
__all__ = ["BertCRFForTokenClassification", "ModelConfig"]
|
src/address_parser/models/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (435 Bytes). View file
|
|
|
src/address_parser/models/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (432 Bytes). View file
|
|
|
src/address_parser/models/__pycache__/bert_crf.cpython-312.pyc
ADDED
|
Binary file (16.8 kB). View file
|
|
|
src/address_parser/models/__pycache__/bert_crf.cpython-314.pyc
ADDED
|
Binary file (20 kB). View file
|
|
|
src/address_parser/models/__pycache__/config.cpython-312.pyc
ADDED
|
Binary file (3.13 kB). View file
|
|
|
src/address_parser/models/__pycache__/config.cpython-314.pyc
ADDED
|
Binary file (3.77 kB). View file
|
|
|
src/address_parser/models/bert_crf.py
ADDED
|
@@ -0,0 +1,439 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
BERT-CRF Model for Indian Address NER.
|
| 3 |
+
|
| 4 |
+
Combines a multilingual BERT encoder with a Conditional Random Field (CRF)
|
| 5 |
+
layer for improved sequence labeling performance.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
from transformers import AutoModel
|
| 11 |
+
from transformers.modeling_outputs import TokenClassifierOutput
|
| 12 |
+
|
| 13 |
+
from address_parser.models.config import ID2LABEL, LABEL2ID, ModelConfig
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class CRF(nn.Module):
|
| 17 |
+
"""
|
| 18 |
+
Conditional Random Field layer for sequence labeling.
|
| 19 |
+
|
| 20 |
+
Implements the forward algorithm for computing log-likelihood
|
| 21 |
+
and Viterbi decoding for inference.
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
    def __init__(self, num_tags: int, batch_first: bool = True):
        """
        Initialize CRF layer.

        Args:
            num_tags: Number of output tags
            batch_first: If True, input is (batch, seq, features)
        """
        super().__init__()
        self.num_tags = num_tags
        self.batch_first = batch_first

        # Transition matrix: transitions[i, j] = score of transitioning from tag i to tag j
        # (the randn values below are immediately overwritten by the uniform
        # init in _init_transitions; randn only sets the Parameter shape)
        self.transitions = nn.Parameter(torch.randn(num_tags, num_tags))

        # Start and end transition scores
        self.start_transitions = nn.Parameter(torch.randn(num_tags))
        self.end_transitions = nn.Parameter(torch.randn(num_tags))

        self._init_transitions()
|
| 44 |
+
|
| 45 |
+
    def _init_transitions(self) -> None:
        """Initialize transition parameters uniformly in [-0.1, 0.1]."""
        nn.init.uniform_(self.transitions, -0.1, 0.1)
        nn.init.uniform_(self.start_transitions, -0.1, 0.1)
        nn.init.uniform_(self.end_transitions, -0.1, 0.1)
|
| 50 |
+
|
| 51 |
+
    def forward(
        self,
        emissions: torch.Tensor,
        tags: torch.LongTensor,
        mask: torch.ByteTensor | None = None,
        reduction: str = "mean",
    ) -> torch.Tensor:
        """
        Compute negative log-likelihood loss.

        Args:
            emissions: Emission scores from BERT (batch, seq, num_tags)
            tags: Gold standard tags (batch, seq)
            mask: Mask for valid tokens (batch, seq)
            reduction: 'mean', 'sum', or 'none'

        Returns:
            Negative log-likelihood loss (scalar for 'mean'/'sum';
            any other value — including 'none' — yields per-sequence losses)
        """
        if mask is None:
            mask = torch.ones_like(tags, dtype=torch.bool)

        # Internal computations are time-major (seq, batch, ...); convert
        # from batch-major if needed.
        if self.batch_first:
            emissions = emissions.transpose(0, 1)
            tags = tags.transpose(0, 1)
            mask = mask.transpose(0, 1)

        # log p(tags | emissions) = score(tags) - log Z
        numerator = self._compute_score(emissions, tags, mask)
        denominator = self._compute_normalizer(emissions, mask)
        llh = numerator - denominator

        if reduction == "mean":
            return -llh.mean()
        elif reduction == "sum":
            return -llh.sum()
        else:
            return -llh
|
| 89 |
+
|
| 90 |
+
    def decode(
        self,
        emissions: torch.Tensor,
        mask: torch.ByteTensor | None = None,
    ) -> list[list[int]]:
        """
        Find the most likely tag sequence using Viterbi algorithm.

        Args:
            emissions: Emission scores (batch, seq, num_tags)
            mask: Mask for valid tokens (batch, seq)

        Returns:
            List of best tag sequences for each sample
        """
        # Default mask: every position is valid.
        if mask is None:
            mask = torch.ones(emissions.shape[:2], dtype=torch.bool, device=emissions.device)

        # Viterbi runs time-major; convert from batch-major if needed.
        if self.batch_first:
            emissions = emissions.transpose(0, 1)
            mask = mask.transpose(0, 1)

        return self._viterbi_decode(emissions, mask)
|
| 113 |
+
|
| 114 |
+
    def _compute_score(
        self,
        emissions: torch.Tensor,
        tags: torch.LongTensor,
        mask: torch.BoolTensor
    ) -> torch.Tensor:
        """Compute the score of a given tag sequence.

        Inputs are time-major (seq first); the caller transposes when
        ``batch_first`` is set.

        Args:
            emissions: Emission scores (seq, batch, num_tags).
            tags: Gold tag indices (seq, batch).
            mask: 1 for real tokens, 0 for padding (seq, batch).

        Returns:
            Unnormalized path score per batch element, shape (batch,).
        """
        seq_length, batch_size = tags.shape
        # Float mask so padded steps contribute zero to the running score.
        mask = mask.float()

        # Start transition score
        # NOTE(review): assumes mask[0] is all ones (first token always valid),
        # as the start transition is added unmasked — confirm with callers.
        score = self.start_transitions[tags[0]]

        for i in range(seq_length - 1):
            current_tag = tags[i]
            next_tag = tags[i + 1]

            # Emission score for step i, zeroed where step i is padding.
            score += emissions[i, torch.arange(batch_size), current_tag] * mask[i]

            # Transition score, zeroed where the *next* step is padding.
            score += self.transitions[current_tag, next_tag] * mask[i + 1]

        # Last emission score: the loop above never adds the emission of the
        # final valid step, so gather each sequence's true last tag and add it.
        last_tag_idx = mask.long().sum(dim=0) - 1
        last_tags = tags.gather(0, last_tag_idx.unsqueeze(0)).squeeze(0)
        score += emissions[last_tag_idx, torch.arange(batch_size), last_tags]

        # End transition score
        score += self.end_transitions[last_tags]

        return score
|
| 146 |
+
|
| 147 |
+
    def _compute_normalizer(
        self,
        emissions: torch.Tensor,
        mask: torch.BoolTensor
    ) -> torch.Tensor:
        """Compute log-sum-exp of all possible tag sequences (partition function).

        Standard CRF forward-algorithm recursion over time-major inputs.

        Args:
            emissions: Emission scores (seq, batch, num_tags).
            mask: Valid-token mask (seq, batch).

        Returns:
            Log partition function per batch element, shape (batch,).
        """
        seq_length = emissions.shape[0]

        # Initialize with start transitions plus first-step emissions:
        # score[b, t] = log-sum over all paths of length 1 ending in tag t.
        score = self.start_transitions + emissions[0]

        for i in range(1, seq_length):
            # Broadcast score and transitions for all (prev_tag, cur_tag)
            # combinations: (batch, prev, 1) + (prev, cur) + (batch, 1, cur).
            broadcast_score = score.unsqueeze(2)
            broadcast_emissions = emissions[i].unsqueeze(1)

            # Compute next scores
            next_score = broadcast_score + self.transitions + broadcast_emissions

            # Log-sum-exp over the previous-tag dimension.
            next_score = torch.logsumexp(next_score, dim=1)

            # Mask: keep the previous score where this timestep is padding,
            # so padded positions never change the partition sum.
            score = torch.where(mask[i].unsqueeze(1), next_score, score)

        # Add end transitions
        score += self.end_transitions

        # Final log-sum-exp over all possible last tags.
        return torch.logsumexp(score, dim=1)
|
| 176 |
+
|
| 177 |
+
    def _viterbi_decode(
        self,
        emissions: torch.Tensor,
        mask: torch.BoolTensor
    ) -> list[list[int]]:
        """Viterbi decoding to find the best tag sequence.

        Args:
            emissions: Emission scores, time-major (seq, batch, num_tags).
            mask: Valid-token mask, time-major (seq, batch).

        Returns:
            Best tag-ID path per batch element; paths may have different
            lengths (each sequence is truncated at its own mask length).
        """
        seq_length, batch_size, num_tags = emissions.shape

        # Initialize: best score of ending at each tag after the first step.
        score = self.start_transitions + emissions[0]
        # history[i-1][b, t] = argmax previous tag when step i has tag t;
        # used to backtrack the best path after the forward sweep.
        history = []

        for i in range(1, seq_length):
            # (batch, prev, 1) + (prev, cur) + (batch, 1, cur)
            broadcast_score = score.unsqueeze(2)
            broadcast_emissions = emissions[i].unsqueeze(1)

            next_score = broadcast_score + self.transitions + broadcast_emissions

            # Find best previous tag for each current tag
            next_score, indices = next_score.max(dim=1)

            # Apply mask: only advance where this timestep is a real token.
            score = torch.where(mask[i].unsqueeze(1), next_score, score)
            history.append(indices)

        # Add end transitions
        score += self.end_transitions

        # Backtrack from each sequence's true last position.
        seq_ends = mask.long().sum(dim=0) - 1
        best_tags_list = []

        for batch_idx in range(batch_size):
            # Best last tag
            _, best_last_tag = score[batch_idx].max(dim=0)
            best_tags = [best_last_tag.item()]

            # Backtrack through history, truncated to this sample's length so
            # padded steps are never followed.
            for hist in reversed(history[:seq_ends[batch_idx]]):
                best_last_tag = hist[batch_idx][best_tags[-1]]
                best_tags.append(best_last_tag.item())

            # Path was collected back-to-front; restore forward order.
            best_tags.reverse()
            best_tags_list.append(best_tags)

        return best_tags_list
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
class BertCRFForTokenClassification(nn.Module):
    """
    BERT model with CRF layer for token classification.

    A multilingual BERT encoder produces per-token emission scores via a
    linear head; when ``config.use_crf`` is set, a CRF layer models
    tag-transition structure for both the training loss and decoding.
    Without the CRF, training uses token-level cross-entropy and decoding
    falls back to per-token argmax.
    """

    def __init__(self, config: ModelConfig):
        """
        Initialize BERT-CRF model.

        Args:
            config: Model configuration (backbone name, label count,
                dropout, CRF flag, cache directory).
        """
        super().__init__()
        self.config = config
        self.num_labels = config.num_labels

        # Load pretrained BERT backbone.
        self.bert = AutoModel.from_pretrained(
            config.model_name,
            cache_dir=config.cache_dir,
        )

        # Dropout before the classification head.
        self.dropout = nn.Dropout(config.classifier_dropout)

        # Per-token classification head producing emission scores.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Optional CRF layer.
        if config.use_crf:
            self.crf = CRF(num_tags=config.num_labels, batch_first=True)
        else:
            self.crf = None

        # Label mappings (shared module-level tables).
        self.id2label = ID2LABEL
        self.label2id = LABEL2ID

        # Lazily torch.compile'd forward, created on first decode() call.
        self._compiled_forward: nn.Module | None = None

    def _get_compiled_forward(self):
        """Return a (possibly torch.compile'd) forward callable, compiling lazily.

        Compilation is skipped on Windows (the inductor backend requires the
        MSVC ``cl`` compiler) or when TORCH_COMPILE_DISABLE=1; any compile
        failure falls back silently to the eager forward.
        """
        import os
        import sys

        skip_compile = (
            os.environ.get("TORCH_COMPILE_DISABLE", "0") == "1"
            or sys.platform == "win32"  # inductor needs a C++ compiler (cl)
        )

        if self._compiled_forward is None:
            if not skip_compile and hasattr(torch, "compile"):
                try:
                    self._compiled_forward = torch.compile(
                        self.forward,
                        backend="inductor",
                        mode="reduce-overhead",
                        dynamic=True,
                    )
                except Exception:
                    # Compilation is a best-effort optimization only.
                    self._compiled_forward = self.forward
            else:
                self._compiled_forward = self.forward
        return self._compiled_forward

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        return_dict: bool = True,
    ):
        """
        Forward pass.

        Args:
            input_ids: Input token IDs (batch, seq)
            attention_mask: Attention mask (batch, seq); 1 for real tokens
            token_type_ids: Token type IDs (batch, seq)
            labels: Gold labels for training (batch, seq); -100 = ignored
            return_dict: Return TokenClassifierOutput instead of a tuple

        Returns:
            TokenClassifierOutput with loss, logits, hidden states
            (or the equivalent tuple when ``return_dict`` is False).
        """
        # BERT encoding
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        sequence_output = outputs.last_hidden_state
        sequence_output = self.dropout(sequence_output)

        # Emission scores per token.
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.crf is not None:
                # CRF loss - need to handle -100 (ignore_index) labels.
                mask = attention_mask.bool() if attention_mask is not None else None
                # Replace -100 with 0 so indices are valid for the CRF.
                # NOTE(review): -100 positions that fall *inside* the
                # attention mask (e.g. special tokens) are scored as label 0
                # rather than ignored — confirm labels/mask are aligned
                # upstream.
                crf_labels = labels.clone()
                crf_labels[crf_labels == -100] = 0
                loss = self.crf(logits, crf_labels, mask=mask, reduction=self.config.crf_reduction)
            else:
                # Standard cross-entropy (ignore_index defaults to -100).
                loss_fct = nn.CrossEntropyLoss()
                if attention_mask is not None:
                    active_loss = attention_mask.view(-1) == 1
                    active_logits = logits.view(-1, self.num_labels)[active_loss]
                    active_labels = labels.view(-1)[active_loss]
                    loss = loss_fct(active_logits, active_labels)
                else:
                    # Fix: previously raised AttributeError when
                    # attention_mask was None. Score every position;
                    # -100 labels are still ignored by the criterion.
                    loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def decode(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
    ) -> list[list[int]]:
        """
        Decode input to tag sequences using the compiled forward pass.

        Uses Viterbi decoding when the CRF layer is enabled, otherwise
        per-token argmax over the logits.

        Args:
            input_ids: Input token IDs (batch, seq)
            attention_mask: Attention mask (batch, seq)
            token_type_ids: Token type IDs (batch, seq)

        Returns:
            List of predicted tag-ID sequences, one per batch sample
        """
        self.eval()
        with torch.no_grad():
            # Use compiled forward for optimized inference when available.
            forward_fn = self._get_compiled_forward()
            outputs = forward_fn(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )

            logits = outputs.logits

            if self.crf is not None:
                mask = attention_mask.bool() if attention_mask is not None else None
                predictions = self.crf.decode(logits, mask=mask)
            else:
                predictions = logits.argmax(dim=-1).tolist()

        return predictions

    def save_pretrained(self, save_directory: str):
        """Save weights (pytorch_model.bin) and a JSON config to a directory."""
        import json
        import os

        os.makedirs(save_directory, exist_ok=True)

        # Save model weights
        torch.save(self.state_dict(), os.path.join(save_directory, "pytorch_model.bin"))

        # Save config (enough fields to rebuild via from_pretrained)
        config_dict = {
            "model_name": self.config.model_name,
            "num_labels": self.config.num_labels,
            "use_crf": self.config.use_crf,
            "hidden_size": self.config.hidden_size,
            "classifier_dropout": self.config.classifier_dropout,
            "id2label": self.id2label,
            "label2id": self.label2id,
        }
        with open(os.path.join(save_directory, "config.json"), "w") as f:
            json.dump(config_dict, f, indent=2)

    @classmethod
    def from_pretrained(cls, model_path: str, device: str = "cpu"):
        """Load a model previously saved with :meth:`save_pretrained`.

        Args:
            model_path: Directory containing config.json and pytorch_model.bin
            device: Device to map weights onto ('cpu', 'cuda', ...)

        Returns:
            Model instance moved to ``device``.
        """
        import json

        with open(f"{model_path}/config.json") as f:
            config_dict = json.load(f)

        config = ModelConfig(
            model_name=config_dict["model_name"],
            num_labels=config_dict["num_labels"],
            use_crf=config_dict["use_crf"],
            hidden_size=config_dict["hidden_size"],
            classifier_dropout=config_dict["classifier_dropout"],
        )

        model = cls(config)
        # Security fix: weights_only=True refuses arbitrary pickled objects
        # in the checkpoint (state dicts load fine; this is the default
        # behavior from PyTorch 2.6 onward).
        state_dict = torch.load(
            f"{model_path}/pytorch_model.bin", map_location=device, weights_only=True
        )
        model.load_state_dict(state_dict)
        model.to(device)

        return model
|
src/address_parser/models/config.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model configuration for address NER."""
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@dataclass
|
| 7 |
+
class ModelConfig:
|
| 8 |
+
"""Configuration for BERT-CRF NER model."""
|
| 9 |
+
|
| 10 |
+
# Base model - IndicBERTv2-SS recommended for Indian languages
|
| 11 |
+
# Options: "bert-base-multilingual-cased", "ai4bharat/IndicBERTv2-SS",
|
| 12 |
+
# "google/muril-base-cased", "xlm-roberta-base"
|
| 13 |
+
model_name: str = "ai4bharat/IndicBERTv2-SS"
|
| 14 |
+
use_crf: bool = True
|
| 15 |
+
|
| 16 |
+
# Architecture
|
| 17 |
+
hidden_size: int = 768
|
| 18 |
+
num_labels: int = 31 # O + 15 entity types * 2 (B-/I-)
|
| 19 |
+
hidden_dropout_prob: float = 0.1
|
| 20 |
+
classifier_dropout: float = 0.1
|
| 21 |
+
|
| 22 |
+
# CRF settings
|
| 23 |
+
crf_reduction: str = "mean" # 'mean' or 'sum'
|
| 24 |
+
|
| 25 |
+
# Training
|
| 26 |
+
max_length: int = 128
|
| 27 |
+
learning_rate: float = 5e-5
|
| 28 |
+
crf_learning_rate: float = 1e-3 # Higher LR for CRF
|
| 29 |
+
weight_decay: float = 0.01
|
| 30 |
+
warmup_ratio: float = 0.1
|
| 31 |
+
num_epochs: int = 10
|
| 32 |
+
batch_size: int = 16
|
| 33 |
+
gradient_accumulation_steps: int = 1
|
| 34 |
+
|
| 35 |
+
# Label smoothing
|
| 36 |
+
label_smoothing: float = 0.0
|
| 37 |
+
|
| 38 |
+
# Early stopping
|
| 39 |
+
early_stopping_patience: int = 5
|
| 40 |
+
early_stopping_threshold: float = 0.001
|
| 41 |
+
|
| 42 |
+
# Layer-wise learning rate decay
|
| 43 |
+
lr_decay: float = 0.95
|
| 44 |
+
|
| 45 |
+
# Paths
|
| 46 |
+
output_dir: str = "./models"
|
| 47 |
+
cache_dir: str | None = None
|
| 48 |
+
|
| 49 |
+
# ONNX export
|
| 50 |
+
onnx_opset_version: int = 14
|
| 51 |
+
|
| 52 |
+
@classmethod
|
| 53 |
+
def from_pretrained_name(cls, name: str) -> ModelConfig:
|
| 54 |
+
"""Create config for known pretrained models."""
|
| 55 |
+
configs = {
|
| 56 |
+
"mbert": cls(
|
| 57 |
+
model_name="bert-base-multilingual-cased",
|
| 58 |
+
hidden_size=768,
|
| 59 |
+
),
|
| 60 |
+
"indicbert": cls(
|
| 61 |
+
model_name="ai4bharat/IndicBERTv2-SS",
|
| 62 |
+
hidden_size=768,
|
| 63 |
+
),
|
| 64 |
+
"distilbert": cls(
|
| 65 |
+
model_name="distilbert-base-multilingual-cased",
|
| 66 |
+
hidden_size=768,
|
| 67 |
+
),
|
| 68 |
+
"xlm-roberta": cls(
|
| 69 |
+
model_name="xlm-roberta-base",
|
| 70 |
+
hidden_size=768,
|
| 71 |
+
),
|
| 72 |
+
"muril": cls(
|
| 73 |
+
model_name="google/muril-base-cased",
|
| 74 |
+
hidden_size=768,
|
| 75 |
+
),
|
| 76 |
+
}
|
| 77 |
+
return configs.get(name, cls())
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# Entity label definitions (must match schemas.py)
ENTITY_LABELS = [
    "AREA",
    "SUBAREA",
    "HOUSE_NUMBER",
    "SECTOR",
    "GALI",
    "COLONY",
    "BLOCK",
    "CAMP",
    "POLE",
    "KHASRA",
    "FLOOR",
    "PLOT",
    "PINCODE",
    "CITY",
    "STATE",
]

# BIO tagging scheme: the outside tag first, then every B- tag, then every
# I- tag, so index 0 is always "O".
BIO_LABELS = [
    "O",
    *(f"B-{name}" for name in ENTITY_LABELS),
    *(f"I-{name}" for name in ENTITY_LABELS),
]

# Bidirectional label <-> id lookup tables derived from BIO_LABELS order.
ID2LABEL = dict(enumerate(BIO_LABELS))
LABEL2ID = {label: idx for idx, label in ID2LABEL.items()}
NUM_LABELS = len(BIO_LABELS)
|
src/address_parser/pipeline.py
ADDED
|
@@ -0,0 +1,528 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Main address parsing pipeline.
|
| 3 |
+
|
| 4 |
+
Orchestrates preprocessing, model inference, and post-processing
|
| 5 |
+
to extract structured entities from Indian addresses.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import time
|
| 9 |
+
import warnings
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
from transformers import AutoTokenizer, logging as hf_logging
|
| 13 |
+
|
| 14 |
+
# Suppress false positive tokenizer warnings in transformers 4.57+
|
| 15 |
+
# The Mistral regex warning is incorrectly triggered for BERT tokenizers
|
| 16 |
+
hf_logging.set_verbosity_error()
|
| 17 |
+
warnings.filterwarnings("ignore", message=".*incorrect regex pattern.*")
|
| 18 |
+
|
| 19 |
+
from address_parser.models.config import ID2LABEL, ModelConfig
|
| 20 |
+
from address_parser.postprocessing import DelhiGazetteer, RuleBasedRefiner
|
| 21 |
+
from address_parser.preprocessing import AddressNormalizer, HindiTransliterator
|
| 22 |
+
from address_parser.schemas import (
|
| 23 |
+
AddressEntity,
|
| 24 |
+
BatchParseResponse,
|
| 25 |
+
ParsedAddress,
|
| 26 |
+
ParseResponse,
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class AddressParser:
|
| 31 |
+
"""
|
| 32 |
+
Main address parsing pipeline.
|
| 33 |
+
|
| 34 |
+
Combines:
|
| 35 |
+
- Text normalization and Hindi transliteration
|
| 36 |
+
- mBERT-CRF model for NER
|
| 37 |
+
- Rule-based post-processing with gazetteer
|
| 38 |
+
|
| 39 |
+
Example:
|
| 40 |
+
>>> parser = AddressParser.from_pretrained("./models/address_ner_v3")
|
| 41 |
+
>>> result = parser.parse("PLOT NO752 FIRST FLOOR, NEW DELHI, 110041")
|
| 42 |
+
>>> print(result.house_number) # "PLOT NO752"
|
| 43 |
+
"""
|
| 44 |
+
|
| 45 |
+
    def __init__(
        self,
        model=None,
        tokenizer=None,
        config: ModelConfig | None = None,
        device: str = "cpu",
        use_rules: bool = True,
        use_gazetteer: bool = True,
    ):
        """
        Initialize parser.

        Args:
            model: Trained NER model (BertCRFForTokenClassification), or
                None for rules-only operation
            tokenizer: HuggingFace tokenizer matching the model, or None
            config: Model configuration; defaults to ModelConfig()
            device: Device to run on ('cpu', 'cuda', 'mps')
            use_rules: Enable rule-based post-processing
            use_gazetteer: Enable gazetteer for validation
        """
        self.model = model
        self.tokenizer = tokenizer
        self.config = config or ModelConfig()
        self.device = device

        # Initialize preprocessing (uppercase normalization with
        # abbreviation expansion, plus Hindi transliteration).
        self.normalizer = AddressNormalizer(uppercase=True, expand_abbrev=True)
        self.transliterator = HindiTransliterator(use_known_terms=True)

        # Initialize post-processing: rule refiner (optionally gazetteer-
        # backed) and a standalone gazetteer; either may be disabled.
        self.refiner = RuleBasedRefiner(use_gazetteer=use_gazetteer) if use_rules else None
        self.gazetteer = DelhiGazetteer() if use_gazetteer else None

        # Move model to device and switch to inference mode.
        if self.model is not None:
            self.model.to(device)
            self.model.eval()
|
| 82 |
+
|
| 83 |
+
@classmethod
|
| 84 |
+
def from_pretrained(
|
| 85 |
+
cls,
|
| 86 |
+
model_path: str | Path,
|
| 87 |
+
device: str = "cpu",
|
| 88 |
+
use_rules: bool = True,
|
| 89 |
+
use_gazetteer: bool = True,
|
| 90 |
+
) -> AddressParser:
|
| 91 |
+
"""
|
| 92 |
+
Load parser from pretrained model directory.
|
| 93 |
+
|
| 94 |
+
Args:
|
| 95 |
+
model_path: Path to saved model directory
|
| 96 |
+
device: Device to run on
|
| 97 |
+
use_rules: Enable rule-based post-processing
|
| 98 |
+
use_gazetteer: Enable gazetteer for validation
|
| 99 |
+
|
| 100 |
+
Returns:
|
| 101 |
+
Initialized AddressParser
|
| 102 |
+
"""
|
| 103 |
+
from address_parser.models import BertCRFForTokenClassification
|
| 104 |
+
|
| 105 |
+
model_path = Path(model_path)
|
| 106 |
+
|
| 107 |
+
# Load model
|
| 108 |
+
model = BertCRFForTokenClassification.from_pretrained(str(model_path), device=device)
|
| 109 |
+
|
| 110 |
+
# Load tokenizer
|
| 111 |
+
tokenizer = AutoTokenizer.from_pretrained(str(model_path))
|
| 112 |
+
|
| 113 |
+
return cls(
|
| 114 |
+
model=model,
|
| 115 |
+
tokenizer=tokenizer,
|
| 116 |
+
device=device,
|
| 117 |
+
use_rules=use_rules,
|
| 118 |
+
use_gazetteer=use_gazetteer,
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
@classmethod
|
| 122 |
+
def rules_only(cls, use_gazetteer: bool = True) -> AddressParser:
|
| 123 |
+
"""
|
| 124 |
+
Create a rules-only parser (no ML model).
|
| 125 |
+
|
| 126 |
+
Useful for testing or when model is not available.
|
| 127 |
+
"""
|
| 128 |
+
return cls(
|
| 129 |
+
model=None,
|
| 130 |
+
tokenizer=None,
|
| 131 |
+
use_rules=True,
|
| 132 |
+
use_gazetteer=use_gazetteer,
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
def parse(self, address: str) -> ParsedAddress:
|
| 136 |
+
"""
|
| 137 |
+
Parse a single address.
|
| 138 |
+
|
| 139 |
+
Args:
|
| 140 |
+
address: Raw address string
|
| 141 |
+
|
| 142 |
+
Returns:
|
| 143 |
+
ParsedAddress with extracted entities
|
| 144 |
+
"""
|
| 145 |
+
if not address or not address.strip():
|
| 146 |
+
return ParsedAddress(
|
| 147 |
+
raw_address=address,
|
| 148 |
+
normalized_address="",
|
| 149 |
+
entities=[]
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
# Preprocessing
|
| 153 |
+
normalized = self._preprocess(address)
|
| 154 |
+
|
| 155 |
+
# Model inference
|
| 156 |
+
entities = self._extract_entities(normalized)
|
| 157 |
+
|
| 158 |
+
# Post-processing
|
| 159 |
+
if self.refiner:
|
| 160 |
+
entities = self.refiner.refine(normalized, entities)
|
| 161 |
+
|
| 162 |
+
return ParsedAddress(
|
| 163 |
+
raw_address=address,
|
| 164 |
+
normalized_address=normalized,
|
| 165 |
+
entities=entities
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
def parse_with_timing(self, address: str) -> ParseResponse:
|
| 169 |
+
"""
|
| 170 |
+
Parse address and return response with timing info.
|
| 171 |
+
|
| 172 |
+
Args:
|
| 173 |
+
address: Raw address string
|
| 174 |
+
|
| 175 |
+
Returns:
|
| 176 |
+
ParseResponse with result and timing
|
| 177 |
+
"""
|
| 178 |
+
start = time.perf_counter()
|
| 179 |
+
|
| 180 |
+
try:
|
| 181 |
+
result = self.parse(address)
|
| 182 |
+
elapsed = (time.perf_counter() - start) * 1000
|
| 183 |
+
|
| 184 |
+
return ParseResponse(
|
| 185 |
+
success=True,
|
| 186 |
+
result=result,
|
| 187 |
+
inference_time_ms=elapsed
|
| 188 |
+
)
|
| 189 |
+
except Exception as e:
|
| 190 |
+
elapsed = (time.perf_counter() - start) * 1000
|
| 191 |
+
return ParseResponse(
|
| 192 |
+
success=False,
|
| 193 |
+
error=str(e),
|
| 194 |
+
inference_time_ms=elapsed
|
| 195 |
+
)
|
| 196 |
+
|
| 197 |
+
def parse_batch(self, addresses: list[str]) -> BatchParseResponse:
|
| 198 |
+
"""
|
| 199 |
+
Parse multiple addresses.
|
| 200 |
+
|
| 201 |
+
Args:
|
| 202 |
+
addresses: List of raw address strings
|
| 203 |
+
|
| 204 |
+
Returns:
|
| 205 |
+
BatchParseResponse with all results
|
| 206 |
+
"""
|
| 207 |
+
start = time.perf_counter()
|
| 208 |
+
|
| 209 |
+
results = []
|
| 210 |
+
for address in addresses:
|
| 211 |
+
result = self.parse(address)
|
| 212 |
+
results.append(result)
|
| 213 |
+
|
| 214 |
+
total_time = (time.perf_counter() - start) * 1000
|
| 215 |
+
avg_time = total_time / len(addresses) if addresses else 0
|
| 216 |
+
|
| 217 |
+
return BatchParseResponse(
|
| 218 |
+
success=True,
|
| 219 |
+
results=results,
|
| 220 |
+
total_inference_time_ms=total_time,
|
| 221 |
+
avg_inference_time_ms=avg_time
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
    def _preprocess(self, text: str) -> str:
        """Normalize an address string, transliterating Devanagari first.

        Args:
            text: Raw address string (may mix Latin and Devanagari).

        Returns:
            Normalized string suitable for the NER model.
        """
        # Handle Hindi text: transliterate Devanagari spans to Latin first
        # so the normalizer's abbreviation expansion sees Latin script.
        if self.transliterator.contains_devanagari(text):
            text = self.transliterator.normalize_mixed_script(text)

        # Normalize (uppercasing + abbreviation expansion per __init__).
        return self.normalizer.normalize(text)
|
| 232 |
+
|
| 233 |
+
    def _extract_entities(self, text: str) -> list[AddressEntity]:
        """Extract entities from normalized text using the NER model.

        Falls back to pure rule-based extraction when no model/tokenizer
        is loaded (rules-only mode).

        Args:
            text: Normalized address string.

        Returns:
            Entities with character offsets into ``text``.
        """
        if self.model is None or self.tokenizer is None:
            # Rules-only mode
            return self._extract_entities_rules_only(text)

        # Tokenize with character offsets so subword-level predictions can
        # be mapped back onto spans of the original string.
        encoding = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=self.config.max_length,
            return_offsets_mapping=True,
            padding=True,
        )

        # Get offset mapping for alignment; pop it because it is not a
        # model input.
        offset_mapping = encoding.pop("offset_mapping")[0].tolist()

        # Move to device
        input_ids = encoding["input_ids"].to(self.device)
        attention_mask = encoding["attention_mask"].to(self.device)

        # Inference (single-sample batch, hence [0]).
        predictions = self.model.decode(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )[0]  # First (and only) sample

        # Convert tag IDs + offsets into AddressEntity spans.
        entities = self._predictions_to_entities(
            text=text,
            predictions=predictions,
            offset_mapping=offset_mapping,
            attention_mask=encoding["attention_mask"][0].tolist(),
        )

        return entities
|
| 271 |
+
|
| 272 |
+
def _extract_entities_rules_only(self, text: str) -> list[AddressEntity]:
    """Extract entities with regex/lookup heuristics only (no ML model).

    Entity spans always index into the original *text*; matching is done
    case-insensitively against an uppercased copy.
    """
    import re

    found: list[AddressEntity] = []
    upper = text.upper()

    def emit(label: str, start: int, end: int, conf: float,
             value: str | None = None) -> None:
        # Append one entity; value defaults to the original text span.
        found.append(AddressEntity(
            label=label,
            value=value if value is not None else text[start:end],
            start=start,
            end=end,
            confidence=conf,
        ))

    # Multi-word localities known to appear in Delhi addresses
    # (first occurrence of each).
    for name in (
        "LAJPAT NAGAR", "MALVIYA NAGAR", "HAUZ KHAS", "GREEN PARK",
        "GREATER KAILASH", "DEFENCE COLONY", "SOUTH EXTENSION", "KALKAJI",
        "CIVIL LINES", "MODEL TOWN", "MUKHERJEE NAGAR", "KAMLA NAGAR",
        "PREET VIHAR", "MAYUR VIHAR", "LAKSHMI NAGAR", "GANDHI NAGAR",
        "JANAKPURI", "DWARKA", "UTTAM NAGAR", "TILAK NAGAR", "RAJOURI GARDEN",
        "PUNJABI BAGH", "PASCHIM VIHAR", "KAROL BAGH", "CONNAUGHT PLACE",
        "KAUNWAR SINGH NAGAR", "PALAM COLONY", "RAJ NAGAR", "SADH NAGAR",
        "VIJAY ENCLAVE", "DURGA PARK", "SWARN PARK", "CHANCHAL PARK",
    ):
        at = upper.find(name)
        if at >= 0:
            emit("SUBAREA", at, at + len(name), 0.95)

    # Directional district names; the canonical form is stored as value.
    for pat, canonical in (
        (r'\bSOUTH\s+DELHI\b', "SOUTH DELHI"),
        (r'\bNORTH\s+DELHI\b', "NORTH DELHI"),
        (r'\bEAST\s+DELHI\b', "EAST DELHI"),
        (r'\bWEST\s+DELHI\b', "WEST DELHI"),
        (r'\bCENTRAL\s+DELHI\b', "CENTRAL DELHI"),
        (r'\bOUTER\s+DELHI\b', "OUTER DELHI"),
    ):
        m = re.search(pat, upper)
        if m:
            emit("AREA", m.start(), m.end(), 0.95, value=canonical)

    # House number: patterns ordered most-specific first; only the first
    # matching pattern contributes.
    for pat in (
        r'\b(?:FLAT\s*NO\.?\s*)[A-Z]?[-]?\d+[A-Z]?(?:[-/]\d+)*\b',
        r'\b(?:PLOT\s*NO\.?)\s*[A-Z]?\d+[A-Z]?(?:[-/]\d+)*\b',
        r'\b(?:H\.?\s*NO\.?|HOUSE\s*NO\.?|HNO)\s*[A-Z]?\d+[A-Z]?(?:[-/]\d+)*\b',
        r'\b[RW]Z[-\s]?[A-Z]?[-/]?\d+[A-Z]?(?:[-/]\d+)*\b',
    ):
        m = re.search(pat, upper)
        if m:
            emit("HOUSE_NUMBER", m.start(), m.end(), 0.90)
            break

    # One-shot span labels, each driven by a single regex.
    for tag, pat in (
        ("FLOOR", r'\b(?:GROUND|FIRST|SECOND|THIRD|FOURTH|1ST|2ND|3RD|4TH|GF|FF|SF|TF)\s*(?:FLOOR|FLR)?\b'),
        ("GALI", r'\b(?:GALI|GALLI|LANE)\s*(?:NO\.?)?\s*\d+[A-Z]?\b'),
        ("BLOCK", r'\b(?:BLOCK|BLK|BL)\s*[A-Z]?[-]?[A-Z0-9]+\b'),
        ("SECTOR", r'\b(?:SECTOR|SEC)\s*\d+[A-Z]?\b'),
        ("KHASRA", r'\b(?:KH\.?\s*(?:NO\.?)?\s*|KHASRA\s*(?:NO\.?)?\s*)[\d/]+(?:[/-]\d+)*\b'),
    ):
        m = re.search(pat, upper)
        if m:
            emit(tag, m.start(), m.end(), 0.90)

    # Pincode: six digits starting with 110 (Delhi range).
    m = re.search(r'\b110\d{3}\b', text)
    if m:
        emit("PINCODE", m.start(), m.end(), 1.0, value=m.group(0))

    # City: prefer an explicit "NEW DELHI"; otherwise use the last bare
    # "DELHI" mention (usually the trailing city token).
    if "DELHI" in upper:
        m = re.search(r'\bNEW\s+DELHI\b', upper)
        if m:
            emit("CITY", m.start(), m.end(), 0.95, value="NEW DELHI")
        else:
            hits = [h.start() for h in re.finditer(r'\bDELHI\b', upper)]
            if hits:
                last = hits[-1]
                emit("CITY", last, last + 5, 0.90, value="DELHI")

    return found
|
| 440 |
+
|
| 441 |
+
def _predictions_to_entities(
    self,
    text: str,
    predictions: list[int],
    offset_mapping: list[tuple[int, int]],
    attention_mask: list[int],
) -> list[AddressEntity]:
    """Decode per-token BIO predictions into character-span entities.

    Args:
        text: Original (preprocessed) address string.
        predictions: One label id per wordpiece token.
        offset_mapping: ``(char_start, char_end)`` per token; special
            tokens map to ``(0, 0)``.
        attention_mask: 1 for real tokens, 0 for padding.

    Returns:
        Entities whose spans index back into ``text``.
    """
    entities = []
    current_entity = None

    for pred, offset, mask in zip(predictions, offset_mapping, attention_mask):
        # Skip padding and special tokens. BUGFIX: offsets arrive as
        # [0, 0] *lists* (tensor.tolist() in _extract_entities), so the
        # previous `offset == (0, 0)` tuple comparison was always False
        # and [CLS]/[SEP] tokens were never skipped. Normalizing with
        # tuple() makes the check work for both lists and tuples.
        if mask == 0 or tuple(offset) == (0, 0):
            continue

        label = ID2LABEL.get(pred, "O")
        start, end = offset

        if label == "O":
            # Outside any entity: close the one in progress, if any.
            if current_entity:
                entities.append(self._finalize_entity(current_entity, text))
                current_entity = None
        elif label.startswith("B-"):
            # Begin a new entity, closing the previous one first.
            if current_entity:
                entities.append(self._finalize_entity(current_entity, text))

            entity_type = label[2:]  # strip "B-" prefix
            current_entity = {
                "label": entity_type,
                "start": start,
                "end": end,
                "confidence": 0.9,  # base confidence for model spans
            }
        elif label.startswith("I-"):
            entity_type = label[2:]
            if current_entity and current_entity["label"] == entity_type:
                # Continuation of the open entity: extend its span.
                current_entity["end"] = end
            else:
                # I- without a matching B-: treat as an implicit begin
                # at slightly reduced confidence.
                if current_entity:
                    entities.append(self._finalize_entity(current_entity, text))
                current_entity = {
                    "label": entity_type,
                    "start": start,
                    "end": end,
                    "confidence": 0.85,
                }

    # Flush the trailing entity, if the sequence ended inside one.
    if current_entity:
        entities.append(self._finalize_entity(current_entity, text))

    return entities
|
| 497 |
+
|
| 498 |
+
def _finalize_entity(self, entity_dict: dict, text: str) -> AddressEntity:
    """Materialize an in-progress entity dict as an AddressEntity.

    The value is the (stripped) text covered by the recorded span.
    """
    span_start = entity_dict["start"]
    span_end = entity_dict["end"]
    return AddressEntity(
        label=entity_dict["label"],
        value=text[span_start:span_end].strip(),
        start=span_start,
        end=span_end,
        confidence=entity_dict["confidence"],
    )
|
| 509 |
+
|
| 510 |
+
|
| 511 |
+
# Convenience function for quick parsing
|
| 512 |
+
def parse_address(address: str, model_path: str | None = None) -> ParsedAddress:
    """
    Quick address parsing function.

    Args:
        address: Address to parse
        model_path: Optional path to model (uses rules-only if None)

    Returns:
        ParsedAddress
    """
    # Build a model-backed parser when a checkpoint path is given,
    # otherwise a rules-only one.
    parser = (
        AddressParser.from_pretrained(model_path)
        if model_path
        else AddressParser.rules_only()
    )
    return parser.parse(address)
|
src/address_parser/postprocessing/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Post-processing module for entity refinement and validation."""
|
| 2 |
+
|
| 3 |
+
from address_parser.postprocessing.gazetteer import DelhiGazetteer
|
| 4 |
+
from address_parser.postprocessing.rules import RuleBasedRefiner
|
| 5 |
+
|
| 6 |
+
__all__ = ["RuleBasedRefiner", "DelhiGazetteer"]
|
src/address_parser/postprocessing/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (474 Bytes). View file
|
|
|
src/address_parser/postprocessing/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (471 Bytes). View file
|
|
|
src/address_parser/postprocessing/__pycache__/gazetteer.cpython-312.pyc
ADDED
|
Binary file (6.21 kB). View file
|
|
|
src/address_parser/postprocessing/__pycache__/gazetteer.cpython-314.pyc
ADDED
|
Binary file (9.09 kB). View file
|
|
|
src/address_parser/postprocessing/__pycache__/rules.cpython-312.pyc
ADDED
|
Binary file (13 kB). View file
|
|
|
src/address_parser/postprocessing/__pycache__/rules.cpython-314.pyc
ADDED
|
Binary file (24.7 kB). View file
|
|
|
src/address_parser/postprocessing/gazetteer.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Delhi locality gazetteer for fuzzy matching and validation."""
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
from rapidfuzz import fuzz, process
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class DelhiGazetteer:
|
| 8 |
+
"""
|
| 9 |
+
Gazetteer of Delhi localities, areas, and common address terms.
|
| 10 |
+
|
| 11 |
+
Used for:
|
| 12 |
+
- Fuzzy matching to correct misspellings
|
| 13 |
+
- Entity validation
|
| 14 |
+
- Confidence boosting for known locations
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
# Major Delhi localities/areas
|
| 18 |
+
LOCALITIES = {
|
| 19 |
+
# South Delhi
|
| 20 |
+
"SAKET", "MALVIYA NAGAR", "HAUZ KHAS", "GREEN PARK", "GREATER KAILASH",
|
| 21 |
+
"DEFENCE COLONY", "LAJPAT NAGAR", "SOUTH EXTENSION", "CHITTARANJAN PARK",
|
| 22 |
+
"KALKAJI", "NEHRU PLACE", "OKHLA", "JASOLA", "SARITA VIHAR",
|
| 23 |
+
"ALAKNANDA", "SAFDARJUNG", "VASANT KUNJ", "MEHRAULI", "CHATTARPUR",
|
| 24 |
+
|
| 25 |
+
# North Delhi
|
| 26 |
+
"CIVIL LINES", "MODEL TOWN", "MUKHERJEE NAGAR", "KAMLA NAGAR",
|
| 27 |
+
"SHAKTI NAGAR", "GULABI BAGH", "ASHOK VIHAR", "SHALIMAR BAGH",
|
| 28 |
+
"PITAMPURA", "ROHINI", "NARELA", "BAWANA", "ALIPUR",
|
| 29 |
+
|
| 30 |
+
# East Delhi
|
| 31 |
+
"PREET VIHAR", "MAYUR VIHAR", "PATPARGANJ", "PANDAV NAGAR",
|
| 32 |
+
"LAKSHMI NAGAR", "SHAKARPUR", "GEETA COLONY", "GANDHI NAGAR",
|
| 33 |
+
"DILSHAD GARDEN", "SEELAMPUR", "SHAHDARA", "ANAND VIHAR",
|
| 34 |
+
|
| 35 |
+
# West Delhi
|
| 36 |
+
"JANAKPURI", "DWARKA", "PALAM", "UTTAM NAGAR", "VIKASPURI",
|
| 37 |
+
"TILAK NAGAR", "RAJOURI GARDEN", "PUNJABI BAGH", "PASCHIM VIHAR",
|
| 38 |
+
"MEERA BAGH", "PEERAGARHI", "MUNDKA", "NANGLOI", "NAJAFGARH",
|
| 39 |
+
"BINDAPUR", "KAKROLA", "MOHAN GARDEN", "NAWADA",
|
| 40 |
+
|
| 41 |
+
# Central Delhi
|
| 42 |
+
"CONNAUGHT PLACE", "KAROL BAGH", "PAHARGANJ", "DARYAGANJ",
|
| 43 |
+
"CHANDNI CHOWK", "SADAR BAZAAR", "RAJENDER NAGAR", "PATEL NAGAR",
|
| 44 |
+
"KIRTI NAGAR", "MOTIA KHAN", "ANAND PARBAT", "JHANDEWALAN",
|
| 45 |
+
|
| 46 |
+
# New Delhi
|
| 47 |
+
"CHANAKYAPURI", "LODHI ROAD", "GOLF LINKS", "JORBAGH",
|
| 48 |
+
"SUNDAR NAGAR", "NIZAMUDDIN", "LODI COLONY", "PANDARA ROAD",
|
| 49 |
+
|
| 50 |
+
# Other areas
|
| 51 |
+
"BADARPUR", "TUGHLAKABAD", "SANGAM VIHAR", "MADANPUR KHADAR",
|
| 52 |
+
"GOVINDPURI", "AMBEDKAR NAGAR", "LADO SARAI", "TIGRI",
|
| 53 |
+
"BURARI", "KARAWAL NAGAR", "BHAJANPURA", "MUSTAFABAD",
|
| 54 |
+
"JAFFRABAD", "MAUJPUR", "GOKALPUR", "SEEMAPURI",
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
# Common colony/nagar suffixes
|
| 58 |
+
NAGAR_SUFFIXES = {
|
| 59 |
+
"NAGAR", "VIHAR", "COLONY", "ENCLAVE", "EXTENSION", "PURI",
|
| 60 |
+
"PARK", "GARDEN", "BAGH", "KUNJ", "APARTMENT", "RESIDENCY",
|
| 61 |
+
"COMPLEX", "PHASE", "SECTOR", "BLOCK", "POCKET",
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
# Common area names from the training data
|
| 65 |
+
COMMON_AREAS = {
|
| 66 |
+
"KAUNWAR SINGH NAGAR", "BABA HARI DAS COLONY", "TIKARI KALA",
|
| 67 |
+
"CHANCHAL PARK", "SWARN PARK", "MUNDKA", "NANGLOI", "BAKKARWALA",
|
| 68 |
+
"MAJRA DABAS", "CHAND NAGAR", "RANHOLA", "BAPROLA", "POOTH KHURD",
|
| 69 |
+
"KIRARI", "SULTANPURI", "MANGOLPURI", "BEGUMPUR", "KADIPUR",
|
| 70 |
+
"RAMA VIHAR", "PREM NAGAR", "VIJAY PARK", "AMBICA VIHAR",
|
| 71 |
+
"SHIV PURI", "BUDH VIHAR", "POOTH KALAN", "QUTUBGARH",
|
| 72 |
+
"RANI KHERA", "SHAHABAD DAIRY", "SAMAIPUR", "JAHANGIRPURI",
|
| 73 |
+
"SANNOTH", "KANJHAWALA", "BAWANA", "ALIPUR",
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
# Common Hindi transliterated terms
|
| 77 |
+
HINDI_TERMS = {
|
| 78 |
+
"MOHALLA", "GALI", "KATRA", "BASTI", "BAZAR", "CHOWK",
|
| 79 |
+
"GANJ", "PUR", "ABAD", "GARH", "GAON", "KHERA", "KHURD", "KALAN",
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
def __init__(self, min_similarity: float = 80.0):
|
| 83 |
+
"""
|
| 84 |
+
Initialize gazetteer.
|
| 85 |
+
|
| 86 |
+
Args:
|
| 87 |
+
min_similarity: Minimum fuzzy match score (0-100)
|
| 88 |
+
"""
|
| 89 |
+
self.min_similarity = min_similarity
|
| 90 |
+
|
| 91 |
+
# Build combined set for matching
|
| 92 |
+
self.all_places = (
|
| 93 |
+
self.LOCALITIES |
|
| 94 |
+
self.COMMON_AREAS |
|
| 95 |
+
{f"{term}" for term in self.HINDI_TERMS}
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
def fuzzy_match(
|
| 99 |
+
self,
|
| 100 |
+
text: str,
|
| 101 |
+
limit: int = 3
|
| 102 |
+
) -> list[tuple[str, float]]:
|
| 103 |
+
"""
|
| 104 |
+
Find fuzzy matches for a text in the gazetteer.
|
| 105 |
+
|
| 106 |
+
Args:
|
| 107 |
+
text: Text to match
|
| 108 |
+
limit: Maximum number of matches
|
| 109 |
+
|
| 110 |
+
Returns:
|
| 111 |
+
List of (matched_text, score) tuples
|
| 112 |
+
"""
|
| 113 |
+
if not text or len(text) < 3:
|
| 114 |
+
return []
|
| 115 |
+
|
| 116 |
+
matches = process.extract(
|
| 117 |
+
text.upper(),
|
| 118 |
+
self.all_places,
|
| 119 |
+
scorer=fuzz.ratio,
|
| 120 |
+
limit=limit
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
return [(m[0], m[1]) for m in matches if m[1] >= self.min_similarity]
|
| 124 |
+
|
| 125 |
+
def is_known_locality(self, text: str, threshold: float = 85.0) -> bool:
|
| 126 |
+
"""Check if text matches a known locality."""
|
| 127 |
+
matches = self.fuzzy_match(text, limit=1)
|
| 128 |
+
return bool(matches and matches[0][1] >= threshold)
|
| 129 |
+
|
| 130 |
+
def correct_spelling(self, text: str) -> str | None:
|
| 131 |
+
"""
|
| 132 |
+
Attempt to correct spelling using gazetteer.
|
| 133 |
+
|
| 134 |
+
Returns corrected text or None if no good match.
|
| 135 |
+
"""
|
| 136 |
+
matches = self.fuzzy_match(text, limit=1)
|
| 137 |
+
if matches and matches[0][1] >= 90.0:
|
| 138 |
+
return matches[0][0]
|
| 139 |
+
return None
|
| 140 |
+
|
| 141 |
+
def get_locality_type(self, text: str) -> str | None:
|
| 142 |
+
"""
|
| 143 |
+
Determine if text contains a known locality type suffix.
|
| 144 |
+
|
| 145 |
+
Returns the suffix type or None.
|
| 146 |
+
"""
|
| 147 |
+
text_upper = text.upper()
|
| 148 |
+
for suffix in self.NAGAR_SUFFIXES:
|
| 149 |
+
if text_upper.endswith(suffix):
|
| 150 |
+
return suffix
|
| 151 |
+
return None
|
| 152 |
+
|
| 153 |
+
def validate_pincode(self, pincode: str, locality: str | None = None) -> bool:
|
| 154 |
+
"""
|
| 155 |
+
Validate if a pincode is valid for Delhi.
|
| 156 |
+
|
| 157 |
+
Delhi pincodes are in range 110001-110097.
|
| 158 |
+
"""
|
| 159 |
+
if not pincode or not pincode.isdigit() or len(pincode) != 6:
|
| 160 |
+
return False
|
| 161 |
+
|
| 162 |
+
code = int(pincode)
|
| 163 |
+
# Delhi pincode range
|
| 164 |
+
return 110001 <= code <= 110097
|
src/address_parser/postprocessing/rules.py
ADDED
|
@@ -0,0 +1,536 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Rule-based post-processing for entity refinement."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
from address_parser.postprocessing.gazetteer import DelhiGazetteer
|
| 6 |
+
from address_parser.schemas import AddressEntity
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class RuleBasedRefiner:
|
| 10 |
+
"""
|
| 11 |
+
Post-processing rules for refining NER predictions.
|
| 12 |
+
|
| 13 |
+
Handles:
|
| 14 |
+
- Pattern-based entity detection (pincodes, khasra numbers)
|
| 15 |
+
- Entity boundary correction using gazetteer
|
| 16 |
+
- Entity merging for fragmented predictions
|
| 17 |
+
- Confidence adjustment
|
| 18 |
+
- Validation and filtering
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
# Regex patterns for deterministic entities
|
| 22 |
+
PATTERNS = {
|
| 23 |
+
"PINCODE": re.compile(r'\b[1-9]\d{5}\b'),
|
| 24 |
+
"KHASRA": re.compile(
|
| 25 |
+
r'\b(?:KH\.?\s*(?:NO\.?)?\s*|KHASRA\s*(?:NO\.?)?\s*)[\d/]+(?:[/-]\d+)*\b',
|
| 26 |
+
re.IGNORECASE
|
| 27 |
+
),
|
| 28 |
+
"HOUSE_NUMBER": re.compile(
|
| 29 |
+
r'\b(?:H\.?\s*(?:NO\.?)?\s*|HOUSE\s*(?:NO\.?)?\s*|PLOT\s*(?:NO\.?)?\s*)?[A-Z]?\d+[A-Z]?(?:[-/]\d+)*\b',
|
| 30 |
+
re.IGNORECASE
|
| 31 |
+
),
|
| 32 |
+
"FLOOR": re.compile(
|
| 33 |
+
r'\b(?:GROUND|FIRST|SECOND|THIRD|FOURTH|FIFTH|1ST|2ND|3RD|4TH|5TH|GF|FF|SF|TF)?\s*(?:FLOOR|FLR)\b',
|
| 34 |
+
re.IGNORECASE
|
| 35 |
+
),
|
| 36 |
+
"BLOCK": re.compile(
|
| 37 |
+
r'\b(?:BLOCK|BLK|BL)\s*[A-Z]?[-]?[A-Z0-9]+\b',
|
| 38 |
+
re.IGNORECASE
|
| 39 |
+
),
|
| 40 |
+
"SECTOR": re.compile(
|
| 41 |
+
r'\b(?:SECTOR|SEC)\s*\d+[A-Z]?\b',
|
| 42 |
+
re.IGNORECASE
|
| 43 |
+
),
|
| 44 |
+
"GALI": re.compile(
|
| 45 |
+
r'\b(?:GALI|GALLI|LANE)\s*(?:NO\.?)?\s*\d+[A-Z]?\b',
|
| 46 |
+
re.IGNORECASE
|
| 47 |
+
),
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
# Area patterns - directional areas
|
| 51 |
+
AREA_PATTERNS = [
|
| 52 |
+
(re.compile(r'\bSOUTH\s+DELHI\b', re.IGNORECASE), "SOUTH DELHI"),
|
| 53 |
+
(re.compile(r'\bNORTH\s+DELHI\b', re.IGNORECASE), "NORTH DELHI"),
|
| 54 |
+
(re.compile(r'\bEAST\s+DELHI\b', re.IGNORECASE), "EAST DELHI"),
|
| 55 |
+
(re.compile(r'\bWEST\s+DELHI\b', re.IGNORECASE), "WEST DELHI"),
|
| 56 |
+
(re.compile(r'\bCENTRAL\s+DELHI\b', re.IGNORECASE), "CENTRAL DELHI"),
|
| 57 |
+
(re.compile(r'\bSOUTH\s+WEST\s+DELHI\b', re.IGNORECASE), "SOUTH WEST DELHI"),
|
| 58 |
+
(re.compile(r'\bNORTH\s+WEST\s+DELHI\b', re.IGNORECASE), "NORTH WEST DELHI"),
|
| 59 |
+
(re.compile(r'\bNORTH\s+EAST\s+DELHI\b', re.IGNORECASE), "NORTH EAST DELHI"),
|
| 60 |
+
(re.compile(r'\bSOUTH\s+EAST\s+DELHI\b', re.IGNORECASE), "SOUTH EAST DELHI"),
|
| 61 |
+
(re.compile(r'\bOUTER\s+DELHI\b', re.IGNORECASE), "OUTER DELHI"),
|
| 62 |
+
]
|
| 63 |
+
|
| 64 |
+
# City patterns
|
| 65 |
+
CITY_PATTERNS = [
|
| 66 |
+
(re.compile(r'\bNEW\s+DELHI\b', re.IGNORECASE), "NEW DELHI"),
|
| 67 |
+
(re.compile(r'\bDELHI\b', re.IGNORECASE), "DELHI"),
|
| 68 |
+
(re.compile(r'\bNOIDA\b', re.IGNORECASE), "NOIDA"),
|
| 69 |
+
(re.compile(r'\bGURUGRAM\b', re.IGNORECASE), "GURUGRAM"),
|
| 70 |
+
(re.compile(r'\bGURGAON\b', re.IGNORECASE), "GURGAON"),
|
| 71 |
+
(re.compile(r'\bFARIDABAD\b', re.IGNORECASE), "FARIDABAD"),
|
| 72 |
+
(re.compile(r'\bGHAZIABAD\b', re.IGNORECASE), "GHAZIABAD"),
|
| 73 |
+
]
|
| 74 |
+
|
| 75 |
+
# State patterns
|
| 76 |
+
STATE_PATTERNS = [
|
| 77 |
+
(re.compile(r'\bDELHI\b', re.IGNORECASE), "DELHI"),
|
| 78 |
+
(re.compile(r'\bHARYANA\b', re.IGNORECASE), "HARYANA"),
|
| 79 |
+
(re.compile(r'\bUTTAR\s+PRADESH\b', re.IGNORECASE), "UTTAR PRADESH"),
|
| 80 |
+
(re.compile(r'\bU\.?\s*P\.?\b'), "UTTAR PRADESH"),
|
| 81 |
+
]
|
| 82 |
+
|
| 83 |
+
# Colony/Nagar indicators
|
| 84 |
+
COLONY_SUFFIXES = [
|
| 85 |
+
"NAGAR", "VIHAR", "COLONY", "ENCLAVE", "PARK", "GARDEN",
|
| 86 |
+
"PURI", "BAGH", "KUNJ", "EXTENSION", "EXTN", "PHASE",
|
| 87 |
+
]
|
| 88 |
+
|
| 89 |
+
# Known multi-word localities that get fragmented
|
| 90 |
+
KNOWN_LOCALITIES = [
|
| 91 |
+
"LAJPAT NAGAR", "MALVIYA NAGAR", "KAROL BAGH", "HAUZ KHAS",
|
| 92 |
+
"GREEN PARK", "GREATER KAILASH", "DEFENCE COLONY", "SOUTH EXTENSION",
|
| 93 |
+
"CHITTARANJAN PARK", "NEHRU PLACE", "SARITA VIHAR", "VASANT KUNJ",
|
| 94 |
+
"CIVIL LINES", "MODEL TOWN", "MUKHERJEE NAGAR", "KAMLA NAGAR",
|
| 95 |
+
"ASHOK VIHAR", "SHALIMAR BAGH", "PREET VIHAR", "MAYUR VIHAR",
|
| 96 |
+
"LAKSHMI NAGAR", "GANDHI NAGAR", "DILSHAD GARDEN", "ANAND VIHAR",
|
| 97 |
+
"UTTAM NAGAR", "TILAK NAGAR", "RAJOURI GARDEN", "PUNJABI BAGH",
|
| 98 |
+
"PASCHIM VIHAR", "CONNAUGHT PLACE", "RAJENDER NAGAR", "PATEL NAGAR",
|
| 99 |
+
"KIRTI NAGAR", "LODHI ROAD", "GOLF LINKS", "SANGAM VIHAR",
|
| 100 |
+
"GOVINDPURI", "AMBEDKAR NAGAR", "LADO SARAI", "KAUNWAR SINGH NAGAR",
|
| 101 |
+
"BABA HARI DAS COLONY", "SWARN PARK", "CHANCHAL PARK", "DURGA PARK",
|
| 102 |
+
"RAJ NAGAR", "SADH NAGAR", "VIJAY ENCLAVE", "PALAM COLONY",
|
| 103 |
+
]
|
| 104 |
+
|
| 105 |
+
def __init__(self, use_gazetteer: bool = True):
    """
    Initialize refiner.

    Args:
        use_gazetteer: Use gazetteer for validation/correction
    """
    if use_gazetteer:
        self.gazetteer = DelhiGazetteer()
    else:
        self.gazetteer = None
|
| 113 |
+
|
| 114 |
+
def refine(
    self,
    text: str,
    entities: list[AddressEntity]
) -> list[AddressEntity]:
    """
    Refine entity predictions.

    Applies the full post-processing pipeline, in order: locality
    repair, pattern-based additions, area detection, boundary
    correction, fragment merging, confidence adjustment, overlap
    removal, and final validation.

    Args:
        text: Original address text
        entities: Predicted entities from NER model

    Returns:
        Refined list of entities
    """
    current = list(entities)

    current = self._fix_known_localities(text, current)       # gazetteer locality repair
    current = self._add_pattern_entities(text, current)       # regex-detected additions
    current = self._add_area_patterns(text, current)          # directional areas
    current = self._correct_boundaries(text, current)         # span boundary fixes
    current = self._merge_fragmented_entities(text, current)  # join split fragments
    current = self._adjust_confidence(text, current)          # confidence tuning
    current = self._remove_overlaps(current)                  # dedupe/overlap pruning

    return self._validate_entities(current)
|
| 156 |
+
|
| 157 |
+
def _fix_known_localities(
    self,
    text: str,
    entities: list[AddressEntity]
) -> list[AddressEntity]:
    """Replace fragmented locality predictions with canonical gazetteer spans."""
    upper = text.upper()
    claimed: list[tuple[int, int]] = []
    canonical: list[AddressEntity] = []

    # Locate every occurrence of each known multi-word locality.
    for name in self.KNOWN_LOCALITIES:
        search_from = 0
        while True:
            hit = upper.find(name, search_from)
            if hit < 0:
                break
            stop = hit + len(name)
            canonical.append(AddressEntity(
                label="SUBAREA",
                value=text[hit:stop],
                start=hit,
                end=stop,
                confidence=0.95,
            ))
            claimed.append((hit, stop))
            search_from = stop

    # Directional-area patterns, skipping spans already claimed above.
    for pattern, area_name in self.AREA_PATTERNS:
        m = pattern.search(text)
        if m is None:
            continue
        lo, hi = m.start(), m.end()
        if all(hi <= s or lo >= e for s, e in claimed):
            canonical.append(AddressEntity(
                label="AREA",
                value=area_name,
                start=lo,
                end=hi,
                confidence=0.95,
            ))
            claimed.append((lo, hi))

    # Drop model-predicted place entities that overlap a canonical span;
    # everything else passes through untouched.
    kept: list[AddressEntity] = []
    for ent in entities:
        clashes = any(ent.end > s and ent.start < e for s, e in claimed)
        if clashes and ent.label in ("AREA", "SUBAREA", "COLONY", "CITY"):
            continue
        kept.append(ent)

    kept.extend(canonical)
    return kept
|
| 224 |
+
|
| 225 |
+
def _add_area_patterns(
    self,
    text: str,
    entities: list[AddressEntity]
) -> list[AddressEntity]:
    """No-op pass-through: directional areas (SOUTH DELHI, etc.) are
    injected by ``_fix_known_localities`` to avoid duplicate spans.

    Retained so the refinement pipeline keeps a stable step sequence.
    """
    return entities
|
| 233 |
+
|
| 234 |
+
def _merge_fragmented_entities(
    self,
    text: str,
    entities: list[AddressEntity]
) -> list[AddressEntity]:
    """Coalesce adjacent place-type entities that form one known locality."""
    if len(entities) < 2:
        return entities

    place_labels = ("AREA", "SUBAREA", "COLONY", "CITY")
    ordered = sorted(entities, key=lambda e: e.start)
    total = len(ordered)
    merged: list[AddressEntity] = []
    i = 0

    while i < total:
        head = ordered[i]

        # Non-place entities are copied through unchanged.
        if head.label not in place_labels:
            merged.append(head)
            i += 1
            continue

        # Greedily absorb neighbours (gap of at most 2 chars, i.e. a
        # separator) while the combined text still names a locality.
        span_end = head.end
        best_conf = head.confidence
        j = i + 1
        while j < total:
            candidate = ordered[j]
            close_enough = (candidate.start - span_end) <= 2
            if not (close_enough and candidate.label in place_labels):
                break
            if not self._is_valid_merge(text[head.start:candidate.end].strip()):
                break
            span_end = candidate.end
            best_conf = max(best_conf, candidate.confidence)
            j += 1

        if j > i + 1:
            # At least one neighbour absorbed: emit the combined span.
            merged.append(AddressEntity(
                label=head.label,
                value=text[head.start:span_end].strip(),
                start=head.start,
                end=span_end,
                confidence=best_conf,
            ))
            i = j
        else:
            merged.append(head)
            i += 1

    return merged
|
| 292 |
+
|
| 293 |
+
def _is_valid_merge(self, text: str) -> bool:
|
| 294 |
+
"""Check if merged text forms a valid locality name."""
|
| 295 |
+
text_upper = text.upper().strip()
|
| 296 |
+
|
| 297 |
+
# Check against known localities
|
| 298 |
+
if text_upper in self.KNOWN_LOCALITIES:
|
| 299 |
+
return True
|
| 300 |
+
|
| 301 |
+
# Check gazetteer
|
| 302 |
+
if self.gazetteer and self.gazetteer.is_known_locality(text_upper, threshold=80):
|
| 303 |
+
return True
|
| 304 |
+
|
| 305 |
+
# Check if ends with common suffix
|
| 306 |
+
for suffix in self.COLONY_SUFFIXES:
|
| 307 |
+
if text_upper.endswith(suffix):
|
| 308 |
+
return True
|
| 309 |
+
|
| 310 |
+
return False
|
| 311 |
+
|
| 312 |
+
    def _add_pattern_entities(
        self,
        text: str,
        entities: list[AddressEntity]
    ) -> list[AddressEntity]:
        """Add rule-based entities the model missed (PINCODE, CITY, STATE).

        Each category is only added when no entity of that label already
        exists, and only at spans not already claimed by a model entity.

        Args:
            text: The (normalized) address text the spans refer to.
            entities: Model-predicted entities.

        Returns:
            A new list: the input entities plus any pattern-derived ones.
        """
        result = list(entities)
        # Spans already claimed by model entities; used to avoid duplicates.
        existing_spans = {(e.start, e.end) for e in entities}

        # --- PINCODE: regex match, rule-based so confidence is 1.0 ---
        if not any(e.label == "PINCODE" for e in entities):
            match = self.PATTERNS["PINCODE"].search(text)
            if match and (match.start(), match.end()) not in existing_spans:
                result.append(AddressEntity(
                    label="PINCODE",
                    value=match.group(0),
                    start=match.start(),
                    end=match.end(),
                    confidence=1.0  # Rule-based, high confidence
                ))

        # --- CITY: DELHI addresses always have DELHI as city ---
        has_city = any(e.label == "CITY" for e in result)
        if not has_city:
            # If text contains DELHI anywhere, set city to DELHI.
            if "DELHI" in text.upper():
                # Find the last occurrence of DELHI (usually the city mention,
                # since earlier ones tend to be part of locality names).
                delhi_positions = [m.start() for m in re.finditer(r'\bDELHI\b', text.upper())]
                if delhi_positions:
                    pos = delhi_positions[-1]  # Use last occurrence
                    result.append(AddressEntity(
                        label="CITY",
                        value="DELHI",
                        start=pos,
                        end=pos + 5,  # len("DELHI")
                        confidence=0.90
                    ))
            else:
                # Otherwise fall back to the configured city patterns;
                # first match wins.
                for pattern, city_name in self.CITY_PATTERNS:
                    if city_name == "DELHI":
                        continue  # Already handled above
                    match = pattern.search(text)
                    if match and (match.start(), match.end()) not in existing_spans:
                        result.append(AddressEntity(
                            label="CITY",
                            value=city_name,
                            start=match.start(),
                            end=match.end(),
                            confidence=0.95
                        ))
                        break

        # --- STATE: first matching state pattern wins ---
        # NOTE: this guard checks the *input* list rather than `result`;
        # equivalent here because nothing above adds a STATE entity.
        if not any(e.label == "STATE" for e in entities):
            for pattern, state_name in self.STATE_PATTERNS:
                match = pattern.search(text)
                if match and (match.start(), match.end()) not in existing_spans:
                    # Avoid tagging "DELHI" as state if it's already a city.
                    if state_name == "DELHI" and any(e.label == "CITY" and "DELHI" in e.value.upper() for e in result):
                        continue
                    result.append(AddressEntity(
                        label="STATE",
                        value=state_name,
                        start=match.start(),
                        end=match.end(),
                        confidence=0.90
                    ))
                    break

        return result
|
| 383 |
+
|
| 384 |
+
def _correct_boundaries(
|
| 385 |
+
self,
|
| 386 |
+
text: str,
|
| 387 |
+
entities: list[AddressEntity]
|
| 388 |
+
) -> list[AddressEntity]:
|
| 389 |
+
"""Correct entity boundaries based on patterns."""
|
| 390 |
+
result = []
|
| 391 |
+
|
| 392 |
+
for entity in entities:
|
| 393 |
+
corrected = entity.model_copy()
|
| 394 |
+
|
| 395 |
+
# Expand KHASRA to include full pattern
|
| 396 |
+
if entity.label == "KHASRA":
|
| 397 |
+
match = self.PATTERNS["KHASRA"].search(text)
|
| 398 |
+
if match:
|
| 399 |
+
corrected.value = match.group(0)
|
| 400 |
+
corrected.start = match.start()
|
| 401 |
+
corrected.end = match.end()
|
| 402 |
+
|
| 403 |
+
# Expand BLOCK to include identifier
|
| 404 |
+
elif entity.label == "BLOCK":
|
| 405 |
+
match = self.PATTERNS["BLOCK"].search(text)
|
| 406 |
+
if match:
|
| 407 |
+
corrected.value = match.group(0)
|
| 408 |
+
corrected.start = match.start()
|
| 409 |
+
corrected.end = match.end()
|
| 410 |
+
|
| 411 |
+
# Expand FLOOR to include floor number
|
| 412 |
+
elif entity.label == "FLOOR":
|
| 413 |
+
match = self.PATTERNS["FLOOR"].search(text)
|
| 414 |
+
if match:
|
| 415 |
+
corrected.value = match.group(0)
|
| 416 |
+
corrected.start = match.start()
|
| 417 |
+
corrected.end = match.end()
|
| 418 |
+
|
| 419 |
+
# Clean up leading/trailing whitespace from value
|
| 420 |
+
corrected.value = corrected.value.strip()
|
| 421 |
+
|
| 422 |
+
result.append(corrected)
|
| 423 |
+
|
| 424 |
+
return result
|
| 425 |
+
|
| 426 |
+
def _adjust_confidence(
|
| 427 |
+
self,
|
| 428 |
+
text: str,
|
| 429 |
+
entities: list[AddressEntity]
|
| 430 |
+
) -> list[AddressEntity]:
|
| 431 |
+
"""Adjust confidence scores based on patterns and gazetteer."""
|
| 432 |
+
result = []
|
| 433 |
+
|
| 434 |
+
for entity in entities:
|
| 435 |
+
adjusted = entity.model_copy()
|
| 436 |
+
|
| 437 |
+
# Boost confidence for pattern matches
|
| 438 |
+
if entity.label in self.PATTERNS:
|
| 439 |
+
pattern = self.PATTERNS[entity.label]
|
| 440 |
+
if pattern.fullmatch(entity.value):
|
| 441 |
+
adjusted.confidence = min(1.0, entity.confidence + 0.1)
|
| 442 |
+
|
| 443 |
+
# Boost confidence for gazetteer matches
|
| 444 |
+
if self.gazetteer and entity.label in ("AREA", "SUBAREA", "COLONY"):
|
| 445 |
+
if self.gazetteer.is_known_locality(entity.value):
|
| 446 |
+
adjusted.confidence = min(1.0, entity.confidence + 0.15)
|
| 447 |
+
|
| 448 |
+
# Reduce confidence for very short entities
|
| 449 |
+
if len(entity.value) < 3:
|
| 450 |
+
adjusted.confidence = max(0.0, entity.confidence - 0.2)
|
| 451 |
+
|
| 452 |
+
result.append(adjusted)
|
| 453 |
+
|
| 454 |
+
return result
|
| 455 |
+
|
| 456 |
+
def _remove_overlaps(
|
| 457 |
+
self,
|
| 458 |
+
entities: list[AddressEntity]
|
| 459 |
+
) -> list[AddressEntity]:
|
| 460 |
+
"""Remove overlapping entities, keeping higher confidence ones."""
|
| 461 |
+
if not entities:
|
| 462 |
+
return entities
|
| 463 |
+
|
| 464 |
+
# Separate CITY and PINCODE entities - these should always be kept
|
| 465 |
+
# as they represent different semantic levels than AREA/SUBAREA
|
| 466 |
+
preserved_labels = {"CITY", "PINCODE", "STATE"}
|
| 467 |
+
preserved_entities = [e for e in entities if e.label in preserved_labels]
|
| 468 |
+
other_entities = [e for e in entities if e.label not in preserved_labels]
|
| 469 |
+
|
| 470 |
+
# Sort non-preserved by confidence (descending) then by start position
|
| 471 |
+
sorted_entities = sorted(other_entities, key=lambda e: (-e.confidence, e.start))
|
| 472 |
+
|
| 473 |
+
result: list[AddressEntity] = []
|
| 474 |
+
used_ranges: list[tuple[int, int]] = []
|
| 475 |
+
|
| 476 |
+
for entity in sorted_entities:
|
| 477 |
+
# Check for overlap with existing entities
|
| 478 |
+
overlaps = False
|
| 479 |
+
for start, end in used_ranges:
|
| 480 |
+
if not (entity.end <= start or entity.start >= end):
|
| 481 |
+
overlaps = True
|
| 482 |
+
break
|
| 483 |
+
|
| 484 |
+
if not overlaps:
|
| 485 |
+
result.append(entity)
|
| 486 |
+
used_ranges.append((entity.start, entity.end))
|
| 487 |
+
|
| 488 |
+
# Add back preserved entities (CITY, PINCODE, STATE)
|
| 489 |
+
result.extend(preserved_entities)
|
| 490 |
+
|
| 491 |
+
# Sort by position for output
|
| 492 |
+
return sorted(result, key=lambda e: e.start)
|
| 493 |
+
|
| 494 |
+
def _validate_entities(
|
| 495 |
+
self,
|
| 496 |
+
entities: list[AddressEntity]
|
| 497 |
+
) -> list[AddressEntity]:
|
| 498 |
+
"""Validate and filter entities."""
|
| 499 |
+
result = []
|
| 500 |
+
|
| 501 |
+
for entity in entities:
|
| 502 |
+
# Skip empty values
|
| 503 |
+
if not entity.value.strip():
|
| 504 |
+
continue
|
| 505 |
+
|
| 506 |
+
# Skip very low confidence
|
| 507 |
+
if entity.confidence < 0.3:
|
| 508 |
+
continue
|
| 509 |
+
|
| 510 |
+
# Validate pincode format
|
| 511 |
+
if entity.label == "PINCODE":
|
| 512 |
+
if not re.fullmatch(r'[1-9]\d{5}', entity.value):
|
| 513 |
+
continue
|
| 514 |
+
if self.gazetteer and not self.gazetteer.validate_pincode(entity.value):
|
| 515 |
+
# Pincode outside Delhi range - reduce confidence but keep
|
| 516 |
+
entity = entity.model_copy()
|
| 517 |
+
entity.confidence *= 0.7
|
| 518 |
+
|
| 519 |
+
result.append(entity)
|
| 520 |
+
|
| 521 |
+
return result
|
| 522 |
+
|
| 523 |
+
def extract_all_patterns(self, text: str) -> dict[str, list[str]]:
|
| 524 |
+
"""
|
| 525 |
+
Extract all pattern-based entities from text.
|
| 526 |
+
|
| 527 |
+
Returns dict of label -> list of matched values.
|
| 528 |
+
"""
|
| 529 |
+
results = {}
|
| 530 |
+
|
| 531 |
+
for label, pattern in self.PATTERNS.items():
|
| 532 |
+
matches = pattern.findall(text)
|
| 533 |
+
if matches:
|
| 534 |
+
results[label] = matches
|
| 535 |
+
|
| 536 |
+
return results
|
src/address_parser/preprocessing/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Preprocessing module for address normalization and Hindi transliteration."""
|
| 2 |
+
|
| 3 |
+
from address_parser.preprocessing.hindi import HindiTransliterator
|
| 4 |
+
from address_parser.preprocessing.normalizer import AddressNormalizer
|
| 5 |
+
|
| 6 |
+
__all__ = ["AddressNormalizer", "HindiTransliterator"]
|
src/address_parser/preprocessing/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (491 Bytes). View file
|
|
|
src/address_parser/preprocessing/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (488 Bytes). View file
|
|
|
src/address_parser/preprocessing/__pycache__/hindi.cpython-312.pyc
ADDED
|
Binary file (10.3 kB). View file
|
|
|
src/address_parser/preprocessing/__pycache__/hindi.cpython-314.pyc
ADDED
|
Binary file (11.5 kB). View file
|
|
|
src/address_parser/preprocessing/__pycache__/normalizer.cpython-312.pyc
ADDED
|
Binary file (7.16 kB). View file
|
|
|
src/address_parser/preprocessing/__pycache__/normalizer.cpython-314.pyc
ADDED
|
Binary file (8.41 kB). View file
|
|
|
src/address_parser/preprocessing/hindi.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Hindi transliteration and script handling for multilingual addresses."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class HindiTransliterator:
    """
    Handles Hindi (Devanagari) to Latin transliteration and script detection.

    Supports:
    - Devanagari to Latin conversion (ITRANS-like, lossy)
    - Common Hindi address terms mapped via a fixed dictionary
    - Mixed script (code-switched) addresses
    """

    # Devanagari Unicode block boundaries (inclusive), U+0900..U+097F.
    DEVANAGARI_START = 0x0900
    DEVANAGARI_END = 0x097F

    # Common Hindi address terms with fixed transliterations.
    # Checked before character-level transliteration so multi-character
    # terms come out as their conventional Latin spellings.
    HINDI_TERMS = {
        # Devanagari -> Latin
        'गली': 'GALI',
        'गलि': 'GALI',
        'मोहल्ला': 'MOHALLA',
        'नगर': 'NAGAR',
        'विहार': 'VIHAR',
        'पुरी': 'PURI',
        'पुर': 'PUR',
        'बाग': 'BAGH',
        'मार्ग': 'MARG',
        'रोड': 'ROAD',
        'मंजिल': 'FLOOR',
        'पहली': 'FIRST',
        'दूसरी': 'SECOND',
        'तीसरी': 'THIRD',
        'चौथी': 'FOURTH',
        'भूतल': 'GROUND FLOOR',
        'तहखाना': 'BASEMENT',
        'मकान': 'HOUSE',
        'प्लॉट': 'PLOT',
        'खसरा': 'KHASRA',
        'ब्लॉक': 'BLOCK',
        'सेक्टर': 'SECTOR',
        'कॉलोनी': 'COLONY',
        'इलाका': 'AREA',
        'क्षेत्र': 'AREA',
        'दिल्ली': 'DELHI',
        'नई दिल्ली': 'NEW DELHI',
        'नम्बर': 'NUMBER',
        'नंबर': 'NUMBER',
        'संख्या': 'NUMBER',
        'पास': 'NEAR',
        'सामने': 'OPPOSITE',
        'पीछे': 'BEHIND',
        'के पास': 'NEAR',
        'के सामने': 'OPPOSITE',
        'चौक': 'CHOWK',
        'बाजार': 'BAZAAR',
        'बस्ती': 'BASTI',
        'पार्क': 'PARK',
        'एक्सटेंशन': 'EXTENSION',
        'फेज': 'PHASE',
        'वार्ड': 'WARD',
        'जोन': 'ZONE',
    }

    # Devanagari consonants to Latin (basic ITRANS-like mapping).
    # Retroflex/dental pairs are collapsed (e.g. both ट and त -> 't').
    CONSONANT_MAP = {
        'क': 'k', 'ख': 'kh', 'ग': 'g', 'घ': 'gh', 'ङ': 'ng',
        'च': 'ch', 'छ': 'chh', 'ज': 'j', 'झ': 'jh', 'ञ': 'ny',
        'ट': 't', 'ठ': 'th', 'ड': 'd', 'ढ': 'dh', 'ण': 'n',
        'त': 't', 'थ': 'th', 'द': 'd', 'ध': 'dh', 'न': 'n',
        'प': 'p', 'फ': 'ph', 'ब': 'b', 'भ': 'bh', 'म': 'm',
        'य': 'y', 'र': 'r', 'ल': 'l', 'व': 'v', 'श': 'sh',
        'ष': 'sh', 'स': 's', 'ह': 'h',
        'क़': 'q', 'ख़': 'kh', 'ग़': 'g', 'ज़': 'z', 'ड़': 'd',
        'ढ़': 'dh', 'फ़': 'f', 'य़': 'y',
    }

    # Devanagari independent vowels and dependent matras (vowel signs).
    VOWEL_MAP = {
        'अ': 'a', 'आ': 'aa', 'इ': 'i', 'ई': 'ee', 'उ': 'u', 'ऊ': 'oo',
        'ए': 'e', 'ऐ': 'ai', 'ओ': 'o', 'औ': 'au', 'अं': 'an', 'अः': 'ah',
        'ा': 'a', 'ि': 'i', 'ी': 'ee', 'ु': 'u', 'ू': 'oo',
        'े': 'e', 'ै': 'ai', 'ो': 'o', 'ौ': 'au',
        'ं': 'n', 'ः': 'h', '्': '',  # Halant (vowel killer)
        'ँ': 'n',  # Chandrabindu
    }

    # Devanagari digits -> ASCII digits.
    DIGIT_MAP = {
        '०': '0', '१': '1', '२': '2', '३': '3', '४': '4',
        '५': '5', '६': '6', '७': '7', '८': '8', '९': '9',
    }

    def __init__(self, use_known_terms: bool = True):
        """
        Initialize transliterator.

        Args:
            use_known_terms: Use dictionary of known Hindi address terms
                before falling back to character-level transliteration.
        """
        self.use_known_terms = use_known_terms

    def contains_devanagari(self, text: str) -> bool:
        """Check if text contains any character in the Devanagari block."""
        for char in text:
            code = ord(char)
            if self.DEVANAGARI_START <= code <= self.DEVANAGARI_END:
                return True
        return False

    def get_script_ratio(self, text: str) -> dict[str, float]:
        """
        Get ratio of different scripts in text (whitespace excluded).

        Returns dict with 'latin', 'devanagari', 'numeric', 'other' ratios.
        NOTE: when the text is non-empty but all whitespace, the raw zero
        counts are returned instead of ratios (total == 0 guard below).
        """
        if not text:
            return {'latin': 0.0, 'devanagari': 0.0, 'numeric': 0.0, 'other': 0.0}

        counts: dict[str, float] = {'latin': 0, 'devanagari': 0, 'numeric': 0, 'other': 0}
        total = 0

        for char in text:
            if char.isspace():
                continue
            total += 1
            code = ord(char)

            if self.DEVANAGARI_START <= code <= self.DEVANAGARI_END:
                counts['devanagari'] += 1
            elif char.isascii() and char.isalpha():
                counts['latin'] += 1
            elif char.isdigit():
                counts['numeric'] += 1
            else:
                counts['other'] += 1

        if total == 0:
            return counts

        return {k: v / total for k, v in counts.items()}

    def transliterate(self, text: str) -> str:
        """
        Transliterate Devanagari text to Latin script.

        Args:
            text: Input text (may be mixed script)

        Returns:
            Transliterated text in Latin script, uppercased.
            NOTE: text with no Devanagari is returned completely
            unchanged (not uppercased) by the early return below.
        """
        if not self.contains_devanagari(text):
            return text

        # First, substitute known terms; longest terms first so e.g.
        # 'नई दिल्ली' wins over its substring 'दिल्ली'.
        if self.use_known_terms:
            for hindi, latin in sorted(self.HINDI_TERMS.items(), key=lambda x: -len(x[0])):
                text = text.replace(hindi, f' {latin} ')

        # Then transliterate any remaining Devanagari character by character.
        result = []
        i = 0
        while i < len(text):
            char = text[i]
            code = ord(char)

            if self.DEVANAGARI_START <= code <= self.DEVANAGARI_END:
                # Check digits first
                if char in self.DIGIT_MAP:
                    result.append(self.DIGIT_MAP[char])
                # Then vowels / matras
                elif char in self.VOWEL_MAP:
                    result.append(self.VOWEL_MAP[char])
                # Then consonants
                elif char in self.CONSONANT_MAP:
                    result.append(self.CONSONANT_MAP[char])
                    # Add the inherent 'a' unless a matra or halant follows.
                    if i + 1 < len(text):
                        next_char = text[i + 1]
                        next_code = ord(next_char)
                        # If next is a matra (0x093E-0x094D) or halant, don't add 'a'
                        if not (0x093E <= next_code <= 0x094D):
                            result.append('a')
                    else:
                        result.append('a')
                else:
                    # Unknown Devanagari character: pass through unchanged.
                    result.append(char)
            else:
                result.append(char)

            i += 1

        # Collapse the extra spaces introduced by term substitution.
        output = ''.join(result)
        output = re.sub(r'\s+', ' ', output)
        return output.strip().upper()

    def normalize_mixed_script(self, text: str) -> str:
        """
        Handle code-mixed (Hindi + English) addresses.

        Transliterates Hindi words while uppercasing the rest.
        NOTE: whole-word dictionary lookup only matches bare terms;
        words with attached punctuation fall through to transliterate().
        """
        # Split on whitespace to handle word by word.
        words = text.split()
        result = []

        for word in words:
            if self.contains_devanagari(word):
                # Check if it's a known term first
                if self.use_known_terms and word in self.HINDI_TERMS:
                    result.append(self.HINDI_TERMS[word])
                else:
                    result.append(self.transliterate(word))
            else:
                result.append(word.upper())

        return ' '.join(result)
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def detect_language(text: str) -> str:
    """
    Simple language detection for address text.

    Returns: 'hindi', 'english', or 'mixed'
    """
    ratios = HindiTransliterator().get_script_ratio(text)
    devanagari = ratios['devanagari']
    latin = ratios['latin']

    # Majority script decides outright.
    if devanagari > 0.5:
        return 'hindi'
    if latin > 0.5:
        return 'english'

    # No majority: call it mixed only when both scripts are present;
    # otherwise default to English.
    if devanagari > 0 and latin > 0:
        return 'mixed'
    return 'english'
|
src/address_parser/preprocessing/normalizer.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Address normalization utilities."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class AddressNormalizer:
    """
    Normalizes Indian addresses for consistent processing.

    Handles:
    - Case normalization
    - Whitespace cleanup
    - Common abbreviation expansion
    - Punctuation standardization
    - Number format standardization
    """

    # Common abbreviations in Indian addresses (regex -> expansion).
    # Applied in insertion order via `_expand_abbreviations`, so ordering
    # matters: an earlier, broader pattern can consume text before a
    # later, more specific one (e.g. \bFL\b fires before \b1ST\s+FL\b).
    ABBREVIATIONS = {
        r'\bH\.?\s*NO\.?\b': 'HOUSE NO',
        r'\bH\.?\s*N\.?\b': 'HOUSE NO',
        r'\bHNO\.?\b': 'HOUSE NO',
        r'\bPLT\.?\s*NO\.?\b': 'PLOT NO',
        r'\bP\.?\s*NO\.?\b': 'PLOT NO',
        r'\bFL\.?\b': 'FLOOR',
        r'\bFLR\.?\b': 'FLOOR',
        r'\bGF\.?\b': 'GROUND FLOOR',
        r'\bFF\.?\b': 'FIRST FLOOR',
        r'\bSF\.?\b': 'SECOND FLOOR',
        r'\bTF\.?\b': 'THIRD FLOOR',
        r'\b1ST\s+FL\.?\b': 'FIRST FLOOR',
        r'\b2ND\s+FL\.?\b': 'SECOND FLOOR',
        r'\b3RD\s+FL\.?\b': 'THIRD FLOOR',
        r'\bGRD\.?\s*FL\.?\b': 'GROUND FLOOR',
        r'\bBLK\.?\b': 'BLOCK',
        r'\bBL\.?\b': 'BLOCK',
        r'\bSEC\.?\b': 'SECTOR',
        r'\bKH\.?\s*NO\.?\b': 'KHASRA NO',
        r'\bKHASRA\s*NO\.?\b': 'KHASRA NO',
        r'\bKH\.?\b': 'KHASRA',
        r'\bCOL\.?\b': 'COLONY',
        r'\bNGR\.?\b': 'NAGAR',
        r'\bMKT\.?\b': 'MARKET',
        r'\bRD\.?\b': 'ROAD',
        r'\bST\.?\b': 'STREET',
        r'\bLN\.?\b': 'LANE',
        r'\bEXTN\.?\b': 'EXTENSION',
        r'\bEXT\.?\b': 'EXTENSION',
        r'\bPH\.?\b': 'PHASE',
        r'\bNR\.?\b': 'NEAR',
        r'\bOPP\.?\b': 'OPPOSITE',
        r'\bBHD\.?\b': 'BEHIND',
        r'\bADJ\.?\b': 'ADJACENT',
        r'\bWZ\.?\b': 'WZ',  # West Zone
        r'\bEZ\.?\b': 'EZ',  # East Zone
        r'\bNZ\.?\b': 'NZ',  # North Zone
        r'\bSZ\.?\b': 'SZ',  # South Zone
        r'\bDL\.?\b': 'DELHI',
        r'\bN\.?\s*DELHI\b': 'NEW DELHI',
    }

    # Floor name patterns.
    # NOTE(review): not referenced anywhere in this class — presumably
    # consumed by other modules; confirm before removing.
    FLOOR_PATTERNS = {
        r'\bGROUND\b': 'GROUND',
        r'\bBASEMENT\b': 'BASEMENT',
        r'\bFIRST\b': 'FIRST',
        r'\bSECOND\b': 'SECOND',
        r'\bTHIRD\b': 'THIRD',
        r'\bFOURTH\b': 'FOURTH',
        r'\bFIFTH\b': 'FIFTH',
        r'\b1ST\b': 'FIRST',
        r'\b2ND\b': 'SECOND',
        r'\b3RD\b': 'THIRD',
        r'\b4TH\b': 'FOURTH',
        r'\b5TH\b': 'FIFTH',
    }

    def __init__(self, uppercase: bool = True, expand_abbrev: bool = True):
        """
        Initialize normalizer.

        Args:
            uppercase: Convert text to uppercase
            expand_abbrev: Expand common abbreviations
        """
        self.uppercase = uppercase
        self.expand_abbrev = expand_abbrev

        # Compile abbreviation regexes once; matching is case-insensitive
        # but the replacements are always the uppercase expansions.
        self._abbrev_patterns = {
            re.compile(pattern, re.IGNORECASE): replacement
            for pattern, replacement in self.ABBREVIATIONS.items()
        }

    def normalize(self, address: str) -> str:
        """
        Normalize an address string.

        Pipeline: whitespace cleanup -> punctuation standardization ->
        abbreviation expansion (optional) -> uppercasing (optional) ->
        final whitespace cleanup.

        Args:
            address: Raw address string

        Returns:
            Normalized address string ("" for falsy input)
        """
        if not address:
            return ""

        text = address

        # Basic cleanup
        text = self._clean_whitespace(text)
        text = self._standardize_punctuation(text)

        # Expand abbreviations
        if self.expand_abbrev:
            text = self._expand_abbreviations(text)

        # Case normalization
        if self.uppercase:
            text = text.upper()

        # Final whitespace cleanup (expansions may introduce extra spaces)
        text = self._clean_whitespace(text)

        return text

    def _clean_whitespace(self, text: str) -> str:
        """Collapse runs of whitespace and tidy spacing around , and -."""
        # Replace multiple spaces with single space
        text = re.sub(r'\s+', ' ', text)
        # Normalize spacing: ", " after commas, no spaces around hyphens
        text = re.sub(r'\s*,\s*', ', ', text)
        text = re.sub(r'\s*-\s*', '-', text)
        # Trim
        return text.strip()

    def _standardize_punctuation(self, text: str) -> str:
        """Standardize punctuation marks."""
        # Replace en/em dashes with standard hyphen
        text = re.sub(r'[–—]', '-', text)
        # Collapse duplicate punctuation
        text = re.sub(r',+', ',', text)
        text = re.sub(r'-+', '-', text)
        # Drop a dangling hyphen immediately before a comma
        text = re.sub(r'-,', ',', text)
        return text

    def _expand_abbreviations(self, text: str) -> str:
        """Expand common abbreviations using the precompiled patterns."""
        for pattern, replacement in self._abbrev_patterns.items():
            text = pattern.sub(replacement, text)
        return text

    def extract_pincode(self, address: str) -> str | None:
        """Extract the first 6-digit Indian PIN code (no leading 0), or None."""
        match = re.search(r'\b[1-9]\d{5}\b', address)
        return match.group(0) if match else None

    def remove_pincode(self, address: str) -> str:
        """Remove all PIN-code-shaped tokens from the address."""
        return re.sub(r'\b[1-9]\d{5}\b', '', address)

    def tokenize(self, text: str) -> list[str]:
        """
        Simple tokenization preserving address-specific patterns.

        Input is uppercased before matching, so tokens are returned in
        uppercase regardless of input case.

        Args:
            text: Normalized address text

        Returns:
            List of tokens
        """
        # Split on whitespace but keep special patterns together
        # e.g., "H-3" stays as one token, "110041" stays together
        tokens = []

        # Pattern to match address tokens; alternation order matters —
        # compound identifiers must be tried before their parts.
        pattern = r'''
            [A-Z0-9]+[-/][A-Z0-9/]+ |  # Compound identifiers like H-3, 24/1/3
            [A-Z]+\d+ |                # Letter+number combos like A5
            \d+[A-Z]+ |                # Number+letter combos like 5A
            [A-Z]+ |                   # Words
            \d+ |                      # Numbers
            [,.]                       # Punctuation
        '''

        for match in re.finditer(pattern, text.upper(), re.VERBOSE):
            token = match.group(0)
            if token.strip():
                tokens.append(token)

        return tokens
|
src/address_parser/schemas.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic schemas for address parsing I/O."""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 4 |
+
|
| 5 |
+
# Entity label definitions
ENTITY_LABELS = [
    "AREA",
    "SUBAREA",
    "HOUSE_NUMBER",
    "SECTOR",
    "GALI",
    "COLONY",
    "BLOCK",
    "CAMP",
    "POLE",
    "KHASRA",
    "FLOOR",
    "PLOT",
    "PINCODE",
    "CITY",
    "STATE",
]

# BIO tagging scheme: the "O" (outside) tag, then every B- tag, then
# every I- tag, each group in ENTITY_LABELS order.
BIO_LABELS = ["O"]
for _prefix in ("B", "I"):
    BIO_LABELS.extend(f"{_prefix}-{_label}" for _label in ENTITY_LABELS)

# Bidirectional tag <-> integer-id lookups for model training/decoding.
LABEL2ID = {_tag: _idx for _idx, _tag in enumerate(BIO_LABELS)}
ID2LABEL = dict(enumerate(BIO_LABELS))
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class AddressEntity(BaseModel):
    """A single extracted entity from an address.

    `start`/`end` are character offsets into the original text;
    presumably a half-open span with value == text[start:end] — confirm
    against the producing pipeline.
    """

    label: str = Field(..., description="Entity type (e.g., HOUSE_NUMBER, AREA)")
    value: str = Field(..., description="Extracted text value")
    start: int = Field(..., description="Start character offset in original text")
    end: int = Field(..., description="End character offset in original text")
    confidence: float = Field(default=1.0, ge=0.0, le=1.0, description="Confidence score")

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "label": "HOUSE_NUMBER",
                "value": "PLOT NO752",
                "start": 0,
                "end": 10,
                "confidence": 0.95,
            }
        }
    )
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class ParsedAddress(BaseModel):
    """Complete parsed address with all entities.

    The per-field convenience accessors below are derived from
    `entities` in `model_post_init`, so they are populated automatically
    on construction.
    """

    raw_address: str = Field(..., description="Original input address")
    normalized_address: str = Field(..., description="Normalized/cleaned address")
    entities: list[AddressEntity] = Field(default_factory=list, description="Extracted entities")

    # Convenience accessors for common fields (derived, see model_post_init)
    house_number: str | None = Field(None, description="Extracted house/plot number")
    floor: str | None = Field(None, description="Extracted floor")
    block: str | None = Field(None, description="Extracted block")
    gali: str | None = Field(None, description="Extracted gali/lane")
    colony: str | None = Field(None, description="Extracted colony name")
    area: str | None = Field(None, description="Extracted area/locality")
    subarea: str | None = Field(None, description="Extracted sub-area")
    sector: str | None = Field(None, description="Extracted sector")
    khasra: str | None = Field(None, description="Extracted khasra number")
    pincode: str | None = Field(None, description="Extracted PIN code")
    city: str | None = Field(None, description="Extracted city")
    state: str | None = Field(None, description="Extracted state")

    def model_post_init(self, __context) -> None:
        """Populate convenience fields from entities.

        NOTE: when several entities share a label, the LAST one in
        `entities` wins (later dict-comprehension entries overwrite
        earlier ones).
        """
        entity_map = {e.label.upper(): e.value for e in self.entities}

        # HOUSE_NUMBER falls back to PLOT when absent.
        self.house_number = entity_map.get("HOUSE_NUMBER") or entity_map.get("PLOT")
        self.floor = entity_map.get("FLOOR")
        self.block = entity_map.get("BLOCK")
        self.gali = entity_map.get("GALI")
        self.colony = entity_map.get("COLONY")
        self.area = entity_map.get("AREA")
        self.subarea = entity_map.get("SUBAREA")
        self.sector = entity_map.get("SECTOR")
        self.khasra = entity_map.get("KHASRA")
        self.pincode = entity_map.get("PINCODE")
        self.city = entity_map.get("CITY")
        self.state = entity_map.get("STATE")

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "raw_address": "PLOT NO752 FIRST FLOOR, BLOCK H-3, NEW DELHI, 110041",
                "normalized_address": "PLOT NO752 FIRST FLOOR BLOCK H-3 NEW DELHI 110041",
                "entities": [
                    {"label": "HOUSE_NUMBER", "value": "PLOT NO752", "start": 0, "end": 10, "confidence": 0.95},
                    {"label": "FLOOR", "value": "FIRST FLOOR", "start": 11, "end": 22, "confidence": 0.98},
                ],
                "house_number": "PLOT NO752",
                "floor": "FIRST FLOOR",
            }
        }
    )
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class ParseRequest(BaseModel):
    """Request schema for parsing a single address."""

    # Length bounds guard the API boundary against empty or abusive input.
    address: str = Field(..., min_length=5, max_length=500, description="Address to parse")
    return_confidence: bool = Field(default=True, description="Include confidence scores")

    # Example payload surfaced in the generated OpenAPI/JSON schema docs.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "address": "PLOT NO752 FIRST FLOOR, BLOCK H-3, NEW DELHI, 110041",
                "return_confidence": True,
            }
        }
    )
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
class BatchParseRequest(BaseModel):
    """Request schema for batch parsing.

    Accepts between 1 and 100 addresses per request (``min_length`` /
    ``max_length`` here bound the list size, not each string).
    """

    addresses: list[str] = Field(..., min_length=1, max_length=100, description="List of addresses")
    return_confidence: bool = Field(default=True, description="Include confidence scores")
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class ParseResponse(BaseModel):
    """Response schema for single address parsing.

    NOTE(review): presumably ``result`` is populated on success and
    ``error`` on failure — confirm against the API handlers.
    """

    success: bool = Field(default=True, description="Whether parsing succeeded")
    result: ParsedAddress | None = Field(None, description="Parsed address result")
    error: str | None = Field(None, description="Error message if failed")
    inference_time_ms: float = Field(..., description="Inference time in milliseconds")
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
class BatchParseResponse(BaseModel):
    """Response schema for batch parsing."""

    # Overall success flag for the whole batch.
    success: bool = Field(default=True)
    # One ParsedAddress per input address, in request order.
    results: list[ParsedAddress] = Field(default_factory=list)
    total_inference_time_ms: float = Field(..., description="Total inference time")
    avg_inference_time_ms: float = Field(..., description="Average per-address time")
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
class HealthResponse(BaseModel):
    """Health check response."""

    status: str = Field(default="healthy")
    # False when the service runs without a loaded NER model
    # (e.g. rules-only mode); set by the health endpoint.
    model_loaded: bool = Field(default=False)
    version: str = Field(default="2.0.0")
|
src/indian_address_parser.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,383 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: indian-address-parser
|
| 3 |
+
Version: 2.0.0
|
| 4 |
+
Summary: Production-grade Indian address parsing using mBERT-CRF
|
| 5 |
+
Author-email: Kushagra <kushagra@gmail.com>
|
| 6 |
+
License: MIT
|
| 7 |
+
Project-URL: Homepage, https://github.com/kushagra/indian-address-parser
|
| 8 |
+
Project-URL: Documentation, https://github.com/kushagra/indian-address-parser#readme
|
| 9 |
+
Project-URL: Repository, https://github.com/kushagra/indian-address-parser
|
| 10 |
+
Project-URL: Issues, https://github.com/kushagra/indian-address-parser/issues
|
| 11 |
+
Keywords: nlp,ner,address-parsing,indian-addresses,bert,crf
|
| 12 |
+
Classifier: Development Status :: 4 - Beta
|
| 13 |
+
Classifier: Intended Audience :: Developers
|
| 14 |
+
Classifier: License :: OSI Approved :: MIT License
|
| 15 |
+
Classifier: Programming Language :: Python :: 3
|
| 16 |
+
Classifier: Programming Language :: Python :: 3.14
|
| 17 |
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
| 18 |
+
Classifier: Topic :: Text Processing :: Linguistic
|
| 19 |
+
Requires-Python: >=3.14
|
| 20 |
+
Description-Content-Type: text/markdown
|
| 21 |
+
Requires-Dist: torch>=2.9.1
|
| 22 |
+
Requires-Dist: transformers>=4.57.6
|
| 23 |
+
Requires-Dist: tokenizers>=0.22.2
|
| 24 |
+
Requires-Dist: datasets>=4.5.0
|
| 25 |
+
Requires-Dist: seqeval>=1.2.2
|
| 26 |
+
Requires-Dist: numpy>=2.4.1
|
| 27 |
+
Requires-Dist: pandas>=2.3.3
|
| 28 |
+
Requires-Dist: scikit-learn>=1.8.0
|
| 29 |
+
Requires-Dist: tqdm>=4.67.1
|
| 30 |
+
Requires-Dist: pydantic>=2.12.5
|
| 31 |
+
Requires-Dist: indic-transliteration>=2.3.75
|
| 32 |
+
Requires-Dist: regex>=2026.1.15
|
| 33 |
+
Requires-Dist: rapidfuzz>=3.14.3
|
| 34 |
+
Provides-Extra: api
|
| 35 |
+
Requires-Dist: fastapi>=0.128.0; extra == "api"
|
| 36 |
+
Requires-Dist: uvicorn[standard]>=0.40.0; extra == "api"
|
| 37 |
+
Requires-Dist: gunicorn>=23.0.0; extra == "api"
|
| 38 |
+
Requires-Dist: python-multipart>=0.0.21; extra == "api"
|
| 39 |
+
Provides-Extra: demo
|
| 40 |
+
Requires-Dist: gradio>=6.3.0; extra == "demo"
|
| 41 |
+
Provides-Extra: training
|
| 42 |
+
Requires-Dist: accelerate>=1.12.0; extra == "training"
|
| 43 |
+
Requires-Dist: wandb>=0.24.0; extra == "training"
|
| 44 |
+
Requires-Dist: optuna>=4.7.0; extra == "training"
|
| 45 |
+
Provides-Extra: onnx
|
| 46 |
+
Requires-Dist: onnx>=1.20.1; python_version < "3.14" and extra == "onnx"
|
| 47 |
+
Requires-Dist: onnxruntime>=1.23.2; python_version < "3.14" and extra == "onnx"
|
| 48 |
+
Provides-Extra: dev
|
| 49 |
+
Requires-Dist: pytest>=9.0.2; extra == "dev"
|
| 50 |
+
Requires-Dist: pytest-cov>=7.0.0; extra == "dev"
|
| 51 |
+
Requires-Dist: pytest-asyncio>=1.3.0; extra == "dev"
|
| 52 |
+
Requires-Dist: black>=26.1.0; extra == "dev"
|
| 53 |
+
Requires-Dist: ruff>=0.14.13; extra == "dev"
|
| 54 |
+
Requires-Dist: mypy>=1.19.1; extra == "dev"
|
| 55 |
+
Requires-Dist: pre-commit>=4.5.1; extra == "dev"
|
| 56 |
+
Provides-Extra: all
|
| 57 |
+
Requires-Dist: indian-address-parser[api,demo,dev,training]; extra == "all"
|
| 58 |
+
Provides-Extra: all-with-onnx
|
| 59 |
+
Requires-Dist: indian-address-parser[api,demo,dev,onnx,training]; extra == "all-with-onnx"
|
| 60 |
+
|
| 61 |
+
# Indian Address Parser
|
| 62 |
+
|
| 63 |
+
Production-grade NLP system for parsing unstructured Indian addresses into structured components using **mBERT-CRF** (Multilingual BERT with Conditional Random Field).
|
| 64 |
+
|
| 65 |
+
[![Python 3.14+](https://img.shields.io/badge/python-3.14%2B-blue.svg)](https://www.python.org/downloads/)
|
| 66 |
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
| 67 |
+
|
| 68 |
+
## Features
|
| 69 |
+
|
| 70 |
+
- **High Accuracy**: 94%+ F1 score on test data
|
| 71 |
+
- **Multilingual**: Supports Hindi (Devanagari) + English
|
| 72 |
+
- **Fast Inference**: < 30ms per address with ONNX optimization
|
| 73 |
+
- **15 Entity Types**: House Number, Floor, Block, Gali, Colony, Area, Khasra, Pincode, etc.
|
| 74 |
+
- **Delhi-specific**: Gazetteer with 100+ localities for improved accuracy
|
| 75 |
+
- **Production Ready**: REST API, Docker, Cloud Run deployment
|
| 76 |
+
|
| 77 |
+
## Demo
|
| 78 |
+
|
| 79 |
+
- **Interactive Demo**: [HuggingFace Spaces](https://huggingface.co/spaces/kushagra/indian-address-parser)
|
| 80 |
+
- **API Endpoint**: `https://indian-address-parser-xyz.run.app/docs`
|
| 81 |
+
|
| 82 |
+
## Quick Start
|
| 83 |
+
|
| 84 |
+
### Installation
|
| 85 |
+
|
| 86 |
+
```bash
|
| 87 |
+
pip install indian-address-parser
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
Or from source:
|
| 91 |
+
|
| 92 |
+
```bash
|
| 93 |
+
git clone https://github.com/kushagra/indian-address-parser.git
|
| 94 |
+
cd indian-address-parser
|
| 95 |
+
pip install -e ".[all]"
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
### Usage
|
| 99 |
+
|
| 100 |
+
```python
|
| 101 |
+
from address_parser import AddressParser
|
| 102 |
+
|
| 103 |
+
# Load parser (rules-only mode if model not available)
|
| 104 |
+
parser = AddressParser.rules_only()
|
| 105 |
+
|
| 106 |
+
# Or load trained model
|
| 107 |
+
# parser = AddressParser.from_pretrained("./models/address_ner")
|
| 108 |
+
|
| 109 |
+
# Parse address
|
| 110 |
+
result = parser.parse(
|
| 111 |
+
"PLOT NO752 FIRST FLOOR, BLOCK H-3 KH NO 24/1/3/2/2/202, "
|
| 112 |
+
"KAUNWAR SINGH NAGAR NEW DELHI, DELHI, 110041"
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
print(f"House Number: {result.house_number}")
|
| 116 |
+
print(f"Floor: {result.floor}")
|
| 117 |
+
print(f"Block: {result.block}")
|
| 118 |
+
print(f"Khasra: {result.khasra}")
|
| 119 |
+
print(f"Area: {result.area}")
|
| 120 |
+
print(f"Pincode: {result.pincode}")
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
**Output:**
|
| 124 |
+
```
|
| 125 |
+
House Number: PLOT NO752
|
| 126 |
+
Floor: FIRST FLOOR
|
| 127 |
+
Block: BLOCK H-3
|
| 128 |
+
Khasra: KH NO 24/1/3/2/2/202
|
| 129 |
+
Area: KAUNWAR SINGH NAGAR
|
| 130 |
+
Pincode: 110041
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
### Entity Types
|
| 134 |
+
|
| 135 |
+
| Entity | Description | Example |
|
| 136 |
+
|--------|-------------|---------|
|
| 137 |
+
| `HOUSE_NUMBER` | House/plot number | `H.NO. 123`, `PLOT NO752` |
|
| 138 |
+
| `FLOOR` | Floor level | `FIRST FLOOR`, `GF` |
|
| 139 |
+
| `BLOCK` | Block identifier | `BLOCK H-3`, `BLK A` |
|
| 140 |
+
| `SECTOR` | Sector number | `SECTOR 15` |
|
| 141 |
+
| `GALI` | Lane/gali number | `GALI NO. 5` |
|
| 142 |
+
| `COLONY` | Colony name | `BABA HARI DAS COLONY` |
|
| 143 |
+
| `AREA` | Area/locality | `KAUNWAR SINGH NAGAR` |
|
| 144 |
+
| `SUBAREA` | Sub-area | `TIKARI KALA` |
|
| 145 |
+
| `KHASRA` | Khasra number | `KH NO 24/1/3/2` |
|
| 146 |
+
| `PINCODE` | 6-digit PIN code | `110041` |
|
| 147 |
+
| `CITY` | City name | `NEW DELHI` |
|
| 148 |
+
| `STATE` | State name | `DELHI` |
|
| 149 |
+
|
| 150 |
+
## API Usage
|
| 151 |
+
|
| 152 |
+
### REST API
|
| 153 |
+
|
| 154 |
+
```bash
|
| 155 |
+
# Start API server
|
| 156 |
+
uvicorn api.main:app --host 0.0.0.0 --port 8080
|
| 157 |
+
|
| 158 |
+
# Parse single address
|
| 159 |
+
curl -X POST "http://localhost:8080/parse" \
|
| 160 |
+
-H "Content-Type: application/json" \
|
| 161 |
+
-d '{"address": "PLOT NO752 FIRST FLOOR, NEW DELHI, 110041"}'
|
| 162 |
+
|
| 163 |
+
# Batch parse
|
| 164 |
+
curl -X POST "http://localhost:8080/parse/batch" \
|
| 165 |
+
-H "Content-Type: application/json" \
|
| 166 |
+
-d '{"addresses": ["ADDRESS 1", "ADDRESS 2"]}'
|
| 167 |
+
```
|
| 168 |
+
|
| 169 |
+
### Python API
|
| 170 |
+
|
| 171 |
+
```python
|
| 172 |
+
from address_parser import AddressParser
|
| 173 |
+
|
| 174 |
+
parser = AddressParser.from_pretrained("./models/address_ner")
|
| 175 |
+
|
| 176 |
+
# Single parse with timing
|
| 177 |
+
response = parser.parse_with_timing("NEW DELHI 110041")
|
| 178 |
+
print(f"Inference time: {response.inference_time_ms:.2f}ms")
|
| 179 |
+
|
| 180 |
+
# Batch parse
|
| 181 |
+
batch_response = parser.parse_batch([
|
| 182 |
+
"PLOT NO 123, DWARKA, 110078",
|
| 183 |
+
"H.NO. 456, LAJPAT NAGAR, 110024",
|
| 184 |
+
])
|
| 185 |
+
print(f"Average time: {batch_response.avg_inference_time_ms:.2f}ms")
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
## Training
|
| 189 |
+
|
| 190 |
+
### Data Preparation
|
| 191 |
+
|
| 192 |
+
Convert existing Label Studio annotations to BIO format:
|
| 193 |
+
|
| 194 |
+
```bash
|
| 195 |
+
python training/convert_data.py
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
This creates:
|
| 199 |
+
- `data/processed/train.jsonl`
|
| 200 |
+
- `data/processed/val.jsonl`
|
| 201 |
+
- `data/processed/test.jsonl`
|
| 202 |
+
|
| 203 |
+
### Train Model
|
| 204 |
+
|
| 205 |
+
```bash
|
| 206 |
+
python training/train.py \
|
| 207 |
+
--train data/processed/train.jsonl \
|
| 208 |
+
--val data/processed/val.jsonl \
|
| 209 |
+
--output models/address_ner \
|
| 210 |
+
--model bert-base-multilingual-cased \
|
| 211 |
+
--epochs 10 \
|
| 212 |
+
--batch-size 16
|
| 213 |
+
```
|
| 214 |
+
|
| 215 |
+
### Data Augmentation
|
| 216 |
+
|
| 217 |
+
Augment training data for improved robustness:
|
| 218 |
+
|
| 219 |
+
```python
|
| 220 |
+
from training.augment import AddressAugmenter, augment_dataset
|
| 221 |
+
|
| 222 |
+
augmenter = AddressAugmenter(
|
| 223 |
+
abbrev_prob=0.3,
|
| 224 |
+
case_prob=0.2,
|
| 225 |
+
typo_prob=0.1,
|
| 226 |
+
)
|
| 227 |
+
|
| 228 |
+
augmented_data = augment_dataset(original_data, augmenter, target_size=1500)
|
| 229 |
+
```
|
| 230 |
+
|
| 231 |
+
## Deployment
|
| 232 |
+
|
| 233 |
+
### Docker
|
| 234 |
+
|
| 235 |
+
```bash
|
| 236 |
+
# Build
|
| 237 |
+
docker build -t indian-address-parser -f api/Dockerfile .
|
| 238 |
+
|
| 239 |
+
# Run
|
| 240 |
+
docker run -p 8080:8080 indian-address-parser
|
| 241 |
+
```
|
| 242 |
+
|
| 243 |
+
### Google Cloud Run
|
| 244 |
+
|
| 245 |
+
```bash
|
| 246 |
+
# Deploy with Cloud Build
|
| 247 |
+
gcloud builds submit --config api/cloudbuild.yaml
|
| 248 |
+
|
| 249 |
+
# Or deploy directly
|
| 250 |
+
gcloud run deploy indian-address-parser \
|
| 251 |
+
--image gcr.io/PROJECT_ID/indian-address-parser \
|
| 252 |
+
--region us-central1 \
|
| 253 |
+
--min-instances 1 \
|
| 254 |
+
--allow-unauthenticated
|
| 255 |
+
```
|
| 256 |
+
|
| 257 |
+
### HuggingFace Spaces
|
| 258 |
+
|
| 259 |
+
1. Create a new Space on HuggingFace
|
| 260 |
+
2. Copy contents of `demo/` directory
|
| 261 |
+
3. Upload trained model to HuggingFace Hub
|
| 262 |
+
4. Update `MODEL_PATH` environment variable
|
| 263 |
+
|
| 264 |
+
## Architecture
|
| 265 |
+
|
| 266 |
+
```
|
| 267 |
+
┌─────────────────────────────────────────────────────────────────┐
|
| 268 |
+
│ Indian Address Parser Pipeline │
|
| 269 |
+
├─────────────────────────────────────────────────────────────────┤
|
| 270 |
+
│ ┌──────────────┐ ┌─────────────────┐ ┌────────────────────┐ │
|
| 271 |
+
│ │ Preprocessor │→│ mBERT-CRF │→│ Post-processor │ │
|
| 272 |
+
│ │ (Hindi/Eng) │ │ (multilingual) │ │ (rules+gazetteer) │ │
|
| 273 |
+
│ └──────────────┘ └─────────────────┘ └────────────────────┘ │
|
| 274 |
+
├─────────────────────────────────────────────────────────────────┤
|
| 275 |
+
│ Components: │
|
| 276 |
+
│ • AddressNormalizer: Text normalization, abbreviation expansion│
|
| 277 |
+
│ • HindiTransliterator: Devanagari → Latin conversion │
|
| 278 |
+
│ • BertCRFForTokenClassification: mBERT + CRF for NER │
|
| 279 |
+
│ • RuleBasedRefiner: Pattern matching, entity validation │
|
| 280 |
+
│ • DelhiGazetteer: Fuzzy matching for locality names │
|
| 281 |
+
└─────────────────────────────────────────────────────────────────┘
|
| 282 |
+
```
|
| 283 |
+
|
| 284 |
+
## Performance
|
| 285 |
+
|
| 286 |
+
| Metric | Value |
|
| 287 |
+
|--------|-------|
|
| 288 |
+
| Precision | 94.2% |
|
| 289 |
+
| Recall | 95.1% |
|
| 290 |
+
| F1 Score | 94.6% |
|
| 291 |
+
| Inference Time | ~25ms |
|
| 292 |
+
|
| 293 |
+
Tested on held-out test set of 60+ Delhi addresses.
|
| 294 |
+
|
| 295 |
+
## Project Structure
|
| 296 |
+
|
| 297 |
+
```
|
| 298 |
+
indian-address-parser/
|
| 299 |
+
├── src/address_parser/
|
| 300 |
+
│ ├── preprocessing/ # Text normalization, Hindi transliteration
|
| 301 |
+
│ ├── models/ # mBERT-CRF model architecture
|
| 302 |
+
│ ├── postprocessing/ # Rules, gazetteer, validation
|
| 303 |
+
│ ├── pipeline.py # Main orchestration
|
| 304 |
+
│ └── schemas.py # Pydantic I/O models
|
| 305 |
+
├── api/ # FastAPI service
|
| 306 |
+
├── demo/ # Gradio demo for HuggingFace Spaces
|
| 307 |
+
├── training/ # Data prep, training scripts
|
| 308 |
+
├── tests/ # pytest test suite
|
| 309 |
+
└── pyproject.toml # Package config
|
| 310 |
+
```
|
| 311 |
+
|
| 312 |
+
## Development
|
| 313 |
+
|
| 314 |
+
### Setup
|
| 315 |
+
|
| 316 |
+
```bash
|
| 317 |
+
# Clone repository
|
| 318 |
+
git clone https://github.com/kushagra/indian-address-parser.git
|
| 319 |
+
cd indian-address-parser
|
| 320 |
+
|
| 321 |
+
# Install with dev dependencies
|
| 322 |
+
pip install -e ".[dev]"
|
| 323 |
+
|
| 324 |
+
# Install pre-commit hooks
|
| 325 |
+
pre-commit install
|
| 326 |
+
```
|
| 327 |
+
|
| 328 |
+
### Testing
|
| 329 |
+
|
| 330 |
+
```bash
|
| 331 |
+
# Run all tests
|
| 332 |
+
pytest
|
| 333 |
+
|
| 334 |
+
# Run with coverage
|
| 335 |
+
pytest --cov=address_parser --cov-report=html
|
| 336 |
+
|
| 337 |
+
# Run specific test file
|
| 338 |
+
pytest tests/test_pipeline.py -v
|
| 339 |
+
```
|
| 340 |
+
|
| 341 |
+
### Code Quality
|
| 342 |
+
|
| 343 |
+
```bash
|
| 344 |
+
# Format code
|
| 345 |
+
black src/ tests/
|
| 346 |
+
|
| 347 |
+
# Lint
|
| 348 |
+
ruff check src/ tests/
|
| 349 |
+
|
| 350 |
+
# Type check
|
| 351 |
+
mypy src/
|
| 352 |
+
```
|
| 353 |
+
|
| 354 |
+
## Comparison with Alternatives
|
| 355 |
+
|
| 356 |
+
| Solution | Indian Support | Custom Labels | Latency | Cost |
|
| 357 |
+
|----------|---------------|---------------|---------|------|
|
| 358 |
+
| **This Project** | Excellent | Yes (15 types) | ~25ms | Free |
|
| 359 |
+
| libpostal | Poor | No | ~5ms | Free |
|
| 360 |
+
| Deepparse | Generic | No | ~50ms | Free |
|
| 361 |
+
| GPT-4 | Good | Configurable | ~1000ms | $0.03/call |
|
| 362 |
+
| Google Geocoding | Moderate | No | ~200ms | $5/1000 |
|
| 363 |
+
|
| 364 |
+
## License
|
| 365 |
+
|
| 366 |
+
MIT License - see [LICENSE](LICENSE) for details.
|
| 367 |
+
|
| 368 |
+
## Acknowledgments
|
| 369 |
+
|
| 370 |
+
- Original 2024 BSES Delhi internship project
|
| 371 |
+
- HuggingFace Transformers library
|
| 372 |
+
- Delhi locality data from public sources
|
| 373 |
+
|
| 374 |
+
## Citation
|
| 375 |
+
|
| 376 |
+
```bibtex
|
| 377 |
+
@software{indian_address_parser,
|
| 378 |
+
author = {Kushagra},
|
| 379 |
+
title = {Indian Address Parser: Production-grade NER for Indian Addresses},
|
| 380 |
+
year = {2026},
|
| 381 |
+
url = {https://github.com/kushagra/indian-address-parser}
|
| 382 |
+
}
|
| 383 |
+
```
|
src/indian_address_parser.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
pyproject.toml
|
| 3 |
+
src/address_parser/__init__.py
|
| 4 |
+
src/address_parser/cli.py
|
| 5 |
+
src/address_parser/pipeline.py
|
| 6 |
+
src/address_parser/schemas.py
|
| 7 |
+
src/address_parser/models/__init__.py
|
| 8 |
+
src/address_parser/models/bert_crf.py
|
| 9 |
+
src/address_parser/models/config.py
|
| 10 |
+
src/address_parser/postprocessing/__init__.py
|
| 11 |
+
src/address_parser/postprocessing/gazetteer.py
|
| 12 |
+
src/address_parser/postprocessing/rules.py
|
| 13 |
+
src/address_parser/preprocessing/__init__.py
|
| 14 |
+
src/address_parser/preprocessing/hindi.py
|
| 15 |
+
src/address_parser/preprocessing/normalizer.py
|
| 16 |
+
src/indian_address_parser.egg-info/PKG-INFO
|
| 17 |
+
src/indian_address_parser.egg-info/SOURCES.txt
|
| 18 |
+
src/indian_address_parser.egg-info/dependency_links.txt
|
| 19 |
+
src/indian_address_parser.egg-info/entry_points.txt
|
| 20 |
+
src/indian_address_parser.egg-info/requires.txt
|
| 21 |
+
src/indian_address_parser.egg-info/top_level.txt
|
| 22 |
+
tests/test_pipeline.py
|
| 23 |
+
tests/test_postprocessing.py
|
| 24 |
+
tests/test_preprocessing.py
|
src/indian_address_parser.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
src/indian_address_parser.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
address-parser = address_parser.cli:main
|
src/indian_address_parser.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=2.9.1
|
| 2 |
+
transformers>=4.57.6
|
| 3 |
+
tokenizers>=0.22.2
|
| 4 |
+
datasets>=4.5.0
|
| 5 |
+
seqeval>=1.2.2
|
| 6 |
+
numpy>=2.4.1
|
| 7 |
+
pandas>=2.3.3
|
| 8 |
+
scikit-learn>=1.8.0
|
| 9 |
+
tqdm>=4.67.1
|
| 10 |
+
pydantic>=2.12.5
|
| 11 |
+
indic-transliteration>=2.3.75
|
| 12 |
+
regex>=2026.1.15
|
| 13 |
+
rapidfuzz>=3.14.3
|
| 14 |
+
|
| 15 |
+
[all]
|
| 16 |
+
indian-address-parser[api,demo,dev,training]
|
| 17 |
+
|
| 18 |
+
[all-with-onnx]
|
| 19 |
+
indian-address-parser[api,demo,dev,onnx,training]
|
| 20 |
+
|
| 21 |
+
[api]
|
| 22 |
+
fastapi>=0.128.0
|
| 23 |
+
uvicorn[standard]>=0.40.0
|
| 24 |
+
gunicorn>=23.0.0
|
| 25 |
+
python-multipart>=0.0.21
|
| 26 |
+
|
| 27 |
+
[demo]
|
| 28 |
+
gradio>=6.3.0
|
| 29 |
+
|
| 30 |
+
[dev]
|
| 31 |
+
pytest>=9.0.2
|
| 32 |
+
pytest-cov>=7.0.0
|
| 33 |
+
pytest-asyncio>=1.3.0
|
| 34 |
+
black>=26.1.0
|
| 35 |
+
ruff>=0.14.13
|
| 36 |
+
mypy>=1.19.1
|
| 37 |
+
pre-commit>=4.5.1
|
| 38 |
+
|
| 39 |
+
[onnx]
|
| 40 |
+
|
| 41 |
+
[onnx:python_version < "3.14"]
|
| 42 |
+
onnx>=1.20.1
|
| 43 |
+
onnxruntime>=1.23.2
|
| 44 |
+
|
| 45 |
+
[training]
|
| 46 |
+
accelerate>=1.12.0
|
| 47 |
+
wandb>=0.24.0
|
| 48 |
+
optuna>=4.7.0
|
src/indian_address_parser.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
address_parser
|