x2aqq's picture
Upload folder using huggingface_hub
47bc13b verified
"""Address normalization utilities."""
import re
class AddressNormalizer:
"""
Normalizes Indian addresses for consistent processing.
Handles:
- Case normalization
- Whitespace cleanup
- Common abbreviation expansion
- Punctuation standardization
- Number format standardization
"""
# Common abbreviations in Indian addresses
ABBREVIATIONS = {
r'\bH\.?\s*NO\.?\b': 'HOUSE NO',
r'\bH\.?\s*N\.?\b': 'HOUSE NO',
r'\bHNO\.?\b': 'HOUSE NO',
r'\bPLT\.?\s*NO\.?\b': 'PLOT NO',
r'\bP\.?\s*NO\.?\b': 'PLOT NO',
r'\bFL\.?\b': 'FLOOR',
r'\bFLR\.?\b': 'FLOOR',
r'\bGF\.?\b': 'GROUND FLOOR',
r'\bFF\.?\b': 'FIRST FLOOR',
r'\bSF\.?\b': 'SECOND FLOOR',
r'\bTF\.?\b': 'THIRD FLOOR',
r'\b1ST\s+FL\.?\b': 'FIRST FLOOR',
r'\b2ND\s+FL\.?\b': 'SECOND FLOOR',
r'\b3RD\s+FL\.?\b': 'THIRD FLOOR',
r'\bGRD\.?\s*FL\.?\b': 'GROUND FLOOR',
r'\bBLK\.?\b': 'BLOCK',
r'\bBL\.?\b': 'BLOCK',
r'\bSEC\.?\b': 'SECTOR',
r'\bKH\.?\s*NO\.?\b': 'KHASRA NO',
r'\bKHASRA\s*NO\.?\b': 'KHASRA NO',
r'\bKH\.?\b': 'KHASRA',
r'\bCOL\.?\b': 'COLONY',
r'\bNGR\.?\b': 'NAGAR',
r'\bMKT\.?\b': 'MARKET',
r'\bRD\.?\b': 'ROAD',
r'\bST\.?\b': 'STREET',
r'\bLN\.?\b': 'LANE',
r'\bEXTN\.?\b': 'EXTENSION',
r'\bEXT\.?\b': 'EXTENSION',
r'\bPH\.?\b': 'PHASE',
r'\bNR\.?\b': 'NEAR',
r'\bOPP\.?\b': 'OPPOSITE',
r'\bBHD\.?\b': 'BEHIND',
r'\bADJ\.?\b': 'ADJACENT',
r'\bWZ\.?\b': 'WZ', # West Zone
r'\bEZ\.?\b': 'EZ', # East Zone
r'\bNZ\.?\b': 'NZ', # North Zone
r'\bSZ\.?\b': 'SZ', # South Zone
r'\bDL\.?\b': 'DELHI',
r'\bN\.?\s*DELHI\b': 'NEW DELHI',
}
# Floor name patterns
FLOOR_PATTERNS = {
r'\bGROUND\b': 'GROUND',
r'\bBASEMENT\b': 'BASEMENT',
r'\bFIRST\b': 'FIRST',
r'\bSECOND\b': 'SECOND',
r'\bTHIRD\b': 'THIRD',
r'\bFOURTH\b': 'FOURTH',
r'\bFIFTH\b': 'FIFTH',
r'\b1ST\b': 'FIRST',
r'\b2ND\b': 'SECOND',
r'\b3RD\b': 'THIRD',
r'\b4TH\b': 'FOURTH',
r'\b5TH\b': 'FIFTH',
}
def __init__(self, uppercase: bool = True, expand_abbrev: bool = True):
"""
Initialize normalizer.
Args:
uppercase: Convert text to uppercase
expand_abbrev: Expand common abbreviations
"""
self.uppercase = uppercase
self.expand_abbrev = expand_abbrev
# Compile regex patterns
self._abbrev_patterns = {
re.compile(pattern, re.IGNORECASE): replacement
for pattern, replacement in self.ABBREVIATIONS.items()
}
def normalize(self, address: str) -> str:
"""
Normalize an address string.
Args:
address: Raw address string
Returns:
Normalized address string
"""
if not address:
return ""
text = address
# Basic cleanup
text = self._clean_whitespace(text)
text = self._standardize_punctuation(text)
# Expand abbreviations
if self.expand_abbrev:
text = self._expand_abbreviations(text)
# Case normalization
if self.uppercase:
text = text.upper()
# Final whitespace cleanup
text = self._clean_whitespace(text)
return text
def _clean_whitespace(self, text: str) -> str:
"""Remove extra whitespace."""
# Replace multiple spaces with single space
text = re.sub(r'\s+', ' ', text)
# Remove spaces around punctuation
text = re.sub(r'\s*,\s*', ', ', text)
text = re.sub(r'\s*-\s*', '-', text)
# Trim
return text.strip()
def _standardize_punctuation(self, text: str) -> str:
"""Standardize punctuation marks."""
# Replace various dash types with standard hyphen
text = re.sub(r'[–—]', '-', text)
# Remove duplicate punctuation
text = re.sub(r',+', ',', text)
text = re.sub(r'-+', '-', text)
# Remove trailing punctuation before comma
text = re.sub(r'-,', ',', text)
return text
def _expand_abbreviations(self, text: str) -> str:
"""Expand common abbreviations."""
for pattern, replacement in self._abbrev_patterns.items():
text = pattern.sub(replacement, text)
return text
def extract_pincode(self, address: str) -> str | None:
"""Extract 6-digit Indian PIN code from address."""
match = re.search(r'\b[1-9]\d{5}\b', address)
return match.group(0) if match else None
def remove_pincode(self, address: str) -> str:
"""Remove PIN code from address."""
return re.sub(r'\b[1-9]\d{5}\b', '', address)
def tokenize(self, text: str) -> list[str]:
"""
Simple tokenization preserving address-specific patterns.
Args:
text: Normalized address text
Returns:
List of tokens
"""
# Split on whitespace but keep special patterns together
# e.g., "H-3" stays as one token, "110041" stays together
tokens = []
# Pattern to match address tokens
pattern = r'''
[A-Z0-9]+[-/][A-Z0-9/]+ | # Compound identifiers like H-3, 24/1/3
[A-Z]+\d+ | # Letter+number combos like A5
\d+[A-Z]+ | # Number+letter combos like 5A
[A-Z]+ | # Words
\d+ | # Numbers
[,.] # Punctuation
'''
for match in re.finditer(pattern, text.upper(), re.VERBOSE):
token = match.group(0)
if token.strip():
tokens.append(token)
return tokens