| """Address normalization utilities."""
|
|
|
| import re
|
|
|
|
|
| class AddressNormalizer:
|
| """
|
| Normalizes Indian addresses for consistent processing.
|
|
|
| Handles:
|
| - Case normalization
|
| - Whitespace cleanup
|
| - Common abbreviation expansion
|
| - Punctuation standardization
|
| - Number format standardization
|
| """
|
|
|
|
|
| ABBREVIATIONS = {
|
| r'\bH\.?\s*NO\.?\b': 'HOUSE NO',
|
| r'\bH\.?\s*N\.?\b': 'HOUSE NO',
|
| r'\bHNO\.?\b': 'HOUSE NO',
|
| r'\bPLT\.?\s*NO\.?\b': 'PLOT NO',
|
| r'\bP\.?\s*NO\.?\b': 'PLOT NO',
|
| r'\bFL\.?\b': 'FLOOR',
|
| r'\bFLR\.?\b': 'FLOOR',
|
| r'\bGF\.?\b': 'GROUND FLOOR',
|
| r'\bFF\.?\b': 'FIRST FLOOR',
|
| r'\bSF\.?\b': 'SECOND FLOOR',
|
| r'\bTF\.?\b': 'THIRD FLOOR',
|
| r'\b1ST\s+FL\.?\b': 'FIRST FLOOR',
|
| r'\b2ND\s+FL\.?\b': 'SECOND FLOOR',
|
| r'\b3RD\s+FL\.?\b': 'THIRD FLOOR',
|
| r'\bGRD\.?\s*FL\.?\b': 'GROUND FLOOR',
|
| r'\bBLK\.?\b': 'BLOCK',
|
| r'\bBL\.?\b': 'BLOCK',
|
| r'\bSEC\.?\b': 'SECTOR',
|
| r'\bKH\.?\s*NO\.?\b': 'KHASRA NO',
|
| r'\bKHASRA\s*NO\.?\b': 'KHASRA NO',
|
| r'\bKH\.?\b': 'KHASRA',
|
| r'\bCOL\.?\b': 'COLONY',
|
| r'\bNGR\.?\b': 'NAGAR',
|
| r'\bMKT\.?\b': 'MARKET',
|
| r'\bRD\.?\b': 'ROAD',
|
| r'\bST\.?\b': 'STREET',
|
| r'\bLN\.?\b': 'LANE',
|
| r'\bEXTN\.?\b': 'EXTENSION',
|
| r'\bEXT\.?\b': 'EXTENSION',
|
| r'\bPH\.?\b': 'PHASE',
|
| r'\bNR\.?\b': 'NEAR',
|
| r'\bOPP\.?\b': 'OPPOSITE',
|
| r'\bBHD\.?\b': 'BEHIND',
|
| r'\bADJ\.?\b': 'ADJACENT',
|
| r'\bWZ\.?\b': 'WZ',
|
| r'\bEZ\.?\b': 'EZ',
|
| r'\bNZ\.?\b': 'NZ',
|
| r'\bSZ\.?\b': 'SZ',
|
| r'\bDL\.?\b': 'DELHI',
|
| r'\bN\.?\s*DELHI\b': 'NEW DELHI',
|
| }
|
|
|
|
|
| FLOOR_PATTERNS = {
|
| r'\bGROUND\b': 'GROUND',
|
| r'\bBASEMENT\b': 'BASEMENT',
|
| r'\bFIRST\b': 'FIRST',
|
| r'\bSECOND\b': 'SECOND',
|
| r'\bTHIRD\b': 'THIRD',
|
| r'\bFOURTH\b': 'FOURTH',
|
| r'\bFIFTH\b': 'FIFTH',
|
| r'\b1ST\b': 'FIRST',
|
| r'\b2ND\b': 'SECOND',
|
| r'\b3RD\b': 'THIRD',
|
| r'\b4TH\b': 'FOURTH',
|
| r'\b5TH\b': 'FIFTH',
|
| }
|
|
|
| def __init__(self, uppercase: bool = True, expand_abbrev: bool = True):
|
| """
|
| Initialize normalizer.
|
|
|
| Args:
|
| uppercase: Convert text to uppercase
|
| expand_abbrev: Expand common abbreviations
|
| """
|
| self.uppercase = uppercase
|
| self.expand_abbrev = expand_abbrev
|
|
|
|
|
| self._abbrev_patterns = {
|
| re.compile(pattern, re.IGNORECASE): replacement
|
| for pattern, replacement in self.ABBREVIATIONS.items()
|
| }
|
|
|
| def normalize(self, address: str) -> str:
|
| """
|
| Normalize an address string.
|
|
|
| Args:
|
| address: Raw address string
|
|
|
| Returns:
|
| Normalized address string
|
| """
|
| if not address:
|
| return ""
|
|
|
| text = address
|
|
|
|
|
| text = self._clean_whitespace(text)
|
| text = self._standardize_punctuation(text)
|
|
|
|
|
| if self.expand_abbrev:
|
| text = self._expand_abbreviations(text)
|
|
|
|
|
| if self.uppercase:
|
| text = text.upper()
|
|
|
|
|
| text = self._clean_whitespace(text)
|
|
|
| return text
|
|
|
| def _clean_whitespace(self, text: str) -> str:
|
| """Remove extra whitespace."""
|
|
|
| text = re.sub(r'\s+', ' ', text)
|
|
|
| text = re.sub(r'\s*,\s*', ', ', text)
|
| text = re.sub(r'\s*-\s*', '-', text)
|
|
|
| return text.strip()
|
|
|
| def _standardize_punctuation(self, text: str) -> str:
|
| """Standardize punctuation marks."""
|
|
|
| text = re.sub(r'[–—]', '-', text)
|
|
|
| text = re.sub(r',+', ',', text)
|
| text = re.sub(r'-+', '-', text)
|
|
|
| text = re.sub(r'-,', ',', text)
|
| return text
|
|
|
| def _expand_abbreviations(self, text: str) -> str:
|
| """Expand common abbreviations."""
|
| for pattern, replacement in self._abbrev_patterns.items():
|
| text = pattern.sub(replacement, text)
|
| return text
|
|
|
| def extract_pincode(self, address: str) -> str | None:
|
| """Extract 6-digit Indian PIN code from address."""
|
| match = re.search(r'\b[1-9]\d{5}\b', address)
|
| return match.group(0) if match else None
|
|
|
| def remove_pincode(self, address: str) -> str:
|
| """Remove PIN code from address."""
|
| return re.sub(r'\b[1-9]\d{5}\b', '', address)
|
|
|
| def tokenize(self, text: str) -> list[str]:
|
| """
|
| Simple tokenization preserving address-specific patterns.
|
|
|
| Args:
|
| text: Normalized address text
|
|
|
| Returns:
|
| List of tokens
|
| """
|
|
|
|
|
| tokens = []
|
|
|
|
|
| pattern = r'''
|
| [A-Z0-9]+[-/][A-Z0-9/]+ | # Compound identifiers like H-3, 24/1/3
|
| [A-Z]+\d+ | # Letter+number combos like A5
|
| \d+[A-Z]+ | # Number+letter combos like 5A
|
| [A-Z]+ | # Words
|
| \d+ | # Numbers
|
| [,.] # Punctuation
|
| '''
|
|
|
| for match in re.finditer(pattern, text.upper(), re.VERBOSE):
|
| token = match.group(0)
|
| if token.strip():
|
| tokens.append(token)
|
|
|
| return tokens
|
|
|