Spaces:
Sleeping
Sleeping
| """Address normalization utilities.""" | |
| import re | |
| class AddressNormalizer: | |
| """ | |
| Normalizes Indian addresses for consistent processing. | |
| Handles: | |
| - Case normalization | |
| - Whitespace cleanup | |
| - Common abbreviation expansion | |
| - Punctuation standardization | |
| - Number format standardization | |
| """ | |
| # Common abbreviations in Indian addresses | |
| ABBREVIATIONS = { | |
| r'\bH\.?\s*NO\.?\b': 'HOUSE NO', | |
| r'\bH\.?\s*N\.?\b': 'HOUSE NO', | |
| r'\bHNO\.?\b': 'HOUSE NO', | |
| r'\bPLT\.?\s*NO\.?\b': 'PLOT NO', | |
| r'\bP\.?\s*NO\.?\b': 'PLOT NO', | |
| r'\bFL\.?\b': 'FLOOR', | |
| r'\bFLR\.?\b': 'FLOOR', | |
| r'\bGF\.?\b': 'GROUND FLOOR', | |
| r'\bFF\.?\b': 'FIRST FLOOR', | |
| r'\bSF\.?\b': 'SECOND FLOOR', | |
| r'\bTF\.?\b': 'THIRD FLOOR', | |
| r'\b1ST\s+FL\.?\b': 'FIRST FLOOR', | |
| r'\b2ND\s+FL\.?\b': 'SECOND FLOOR', | |
| r'\b3RD\s+FL\.?\b': 'THIRD FLOOR', | |
| r'\bGRD\.?\s*FL\.?\b': 'GROUND FLOOR', | |
| r'\bBLK\.?\b': 'BLOCK', | |
| r'\bBL\.?\b': 'BLOCK', | |
| r'\bSEC\.?\b': 'SECTOR', | |
| r'\bKH\.?\s*NO\.?\b': 'KHASRA NO', | |
| r'\bKHASRA\s*NO\.?\b': 'KHASRA NO', | |
| r'\bKH\.?\b': 'KHASRA', | |
| r'\bCOL\.?\b': 'COLONY', | |
| r'\bNGR\.?\b': 'NAGAR', | |
| r'\bMKT\.?\b': 'MARKET', | |
| r'\bRD\.?\b': 'ROAD', | |
| r'\bST\.?\b': 'STREET', | |
| r'\bLN\.?\b': 'LANE', | |
| r'\bEXTN\.?\b': 'EXTENSION', | |
| r'\bEXT\.?\b': 'EXTENSION', | |
| r'\bPH\.?\b': 'PHASE', | |
| r'\bNR\.?\b': 'NEAR', | |
| r'\bOPP\.?\b': 'OPPOSITE', | |
| r'\bBHD\.?\b': 'BEHIND', | |
| r'\bADJ\.?\b': 'ADJACENT', | |
| r'\bWZ\.?\b': 'WZ', # West Zone | |
| r'\bEZ\.?\b': 'EZ', # East Zone | |
| r'\bNZ\.?\b': 'NZ', # North Zone | |
| r'\bSZ\.?\b': 'SZ', # South Zone | |
| r'\bDL\.?\b': 'DELHI', | |
| r'\bN\.?\s*DELHI\b': 'NEW DELHI', | |
| } | |
| # Floor name patterns | |
| FLOOR_PATTERNS = { | |
| r'\bGROUND\b': 'GROUND', | |
| r'\bBASEMENT\b': 'BASEMENT', | |
| r'\bFIRST\b': 'FIRST', | |
| r'\bSECOND\b': 'SECOND', | |
| r'\bTHIRD\b': 'THIRD', | |
| r'\bFOURTH\b': 'FOURTH', | |
| r'\bFIFTH\b': 'FIFTH', | |
| r'\b1ST\b': 'FIRST', | |
| r'\b2ND\b': 'SECOND', | |
| r'\b3RD\b': 'THIRD', | |
| r'\b4TH\b': 'FOURTH', | |
| r'\b5TH\b': 'FIFTH', | |
| } | |
| def __init__(self, uppercase: bool = True, expand_abbrev: bool = True): | |
| """ | |
| Initialize normalizer. | |
| Args: | |
| uppercase: Convert text to uppercase | |
| expand_abbrev: Expand common abbreviations | |
| """ | |
| self.uppercase = uppercase | |
| self.expand_abbrev = expand_abbrev | |
| # Compile regex patterns | |
| self._abbrev_patterns = { | |
| re.compile(pattern, re.IGNORECASE): replacement | |
| for pattern, replacement in self.ABBREVIATIONS.items() | |
| } | |
| def normalize(self, address: str) -> str: | |
| """ | |
| Normalize an address string. | |
| Args: | |
| address: Raw address string | |
| Returns: | |
| Normalized address string | |
| """ | |
| if not address: | |
| return "" | |
| text = address | |
| # Basic cleanup | |
| text = self._clean_whitespace(text) | |
| text = self._standardize_punctuation(text) | |
| # Expand abbreviations | |
| if self.expand_abbrev: | |
| text = self._expand_abbreviations(text) | |
| # Case normalization | |
| if self.uppercase: | |
| text = text.upper() | |
| # Final whitespace cleanup | |
| text = self._clean_whitespace(text) | |
| return text | |
| def _clean_whitespace(self, text: str) -> str: | |
| """Remove extra whitespace.""" | |
| # Replace multiple spaces with single space | |
| text = re.sub(r'\s+', ' ', text) | |
| # Remove spaces around punctuation | |
| text = re.sub(r'\s*,\s*', ', ', text) | |
| text = re.sub(r'\s*-\s*', '-', text) | |
| # Trim | |
| return text.strip() | |
| def _standardize_punctuation(self, text: str) -> str: | |
| """Standardize punctuation marks.""" | |
| # Replace various dash types with standard hyphen | |
| text = re.sub(r'[–—]', '-', text) | |
| # Remove duplicate punctuation | |
| text = re.sub(r',+', ',', text) | |
| text = re.sub(r'-+', '-', text) | |
| # Remove trailing punctuation before comma | |
| text = re.sub(r'-,', ',', text) | |
| return text | |
| def _expand_abbreviations(self, text: str) -> str: | |
| """Expand common abbreviations.""" | |
| for pattern, replacement in self._abbrev_patterns.items(): | |
| text = pattern.sub(replacement, text) | |
| return text | |
| def extract_pincode(self, address: str) -> str | None: | |
| """Extract 6-digit Indian PIN code from address.""" | |
| match = re.search(r'\b[1-9]\d{5}\b', address) | |
| return match.group(0) if match else None | |
| def remove_pincode(self, address: str) -> str: | |
| """Remove PIN code from address.""" | |
| return re.sub(r'\b[1-9]\d{5}\b', '', address) | |
| def tokenize(self, text: str) -> list[str]: | |
| """ | |
| Simple tokenization preserving address-specific patterns. | |
| Args: | |
| text: Normalized address text | |
| Returns: | |
| List of tokens | |
| """ | |
| # Split on whitespace but keep special patterns together | |
| # e.g., "H-3" stays as one token, "110041" stays together | |
| tokens = [] | |
| # Pattern to match address tokens | |
| pattern = r''' | |
| [A-Z0-9]+[-/][A-Z0-9/]+ | # Compound identifiers like H-3, 24/1/3 | |
| [A-Z]+\d+ | # Letter+number combos like A5 | |
| \d+[A-Z]+ | # Number+letter combos like 5A | |
| [A-Z]+ | # Words | |
| \d+ | # Numbers | |
| [,.] # Punctuation | |
| ''' | |
| for match in re.finditer(pattern, text.upper(), re.VERBOSE): | |
| token = match.group(0) | |
| if token.strip(): | |
| tokens.append(token) | |
| return tokens | |