Spaces:
Sleeping
Sleeping
File size: 6,181 Bytes
47bc13b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
"""Address normalization utilities."""
import re
class AddressNormalizer:
"""
Normalizes Indian addresses for consistent processing.
Handles:
- Case normalization
- Whitespace cleanup
- Common abbreviation expansion
- Punctuation standardization
- Number format standardization
"""
# Common abbreviations in Indian addresses
ABBREVIATIONS = {
r'\bH\.?\s*NO\.?\b': 'HOUSE NO',
r'\bH\.?\s*N\.?\b': 'HOUSE NO',
r'\bHNO\.?\b': 'HOUSE NO',
r'\bPLT\.?\s*NO\.?\b': 'PLOT NO',
r'\bP\.?\s*NO\.?\b': 'PLOT NO',
r'\bFL\.?\b': 'FLOOR',
r'\bFLR\.?\b': 'FLOOR',
r'\bGF\.?\b': 'GROUND FLOOR',
r'\bFF\.?\b': 'FIRST FLOOR',
r'\bSF\.?\b': 'SECOND FLOOR',
r'\bTF\.?\b': 'THIRD FLOOR',
r'\b1ST\s+FL\.?\b': 'FIRST FLOOR',
r'\b2ND\s+FL\.?\b': 'SECOND FLOOR',
r'\b3RD\s+FL\.?\b': 'THIRD FLOOR',
r'\bGRD\.?\s*FL\.?\b': 'GROUND FLOOR',
r'\bBLK\.?\b': 'BLOCK',
r'\bBL\.?\b': 'BLOCK',
r'\bSEC\.?\b': 'SECTOR',
r'\bKH\.?\s*NO\.?\b': 'KHASRA NO',
r'\bKHASRA\s*NO\.?\b': 'KHASRA NO',
r'\bKH\.?\b': 'KHASRA',
r'\bCOL\.?\b': 'COLONY',
r'\bNGR\.?\b': 'NAGAR',
r'\bMKT\.?\b': 'MARKET',
r'\bRD\.?\b': 'ROAD',
r'\bST\.?\b': 'STREET',
r'\bLN\.?\b': 'LANE',
r'\bEXTN\.?\b': 'EXTENSION',
r'\bEXT\.?\b': 'EXTENSION',
r'\bPH\.?\b': 'PHASE',
r'\bNR\.?\b': 'NEAR',
r'\bOPP\.?\b': 'OPPOSITE',
r'\bBHD\.?\b': 'BEHIND',
r'\bADJ\.?\b': 'ADJACENT',
r'\bWZ\.?\b': 'WZ', # West Zone
r'\bEZ\.?\b': 'EZ', # East Zone
r'\bNZ\.?\b': 'NZ', # North Zone
r'\bSZ\.?\b': 'SZ', # South Zone
r'\bDL\.?\b': 'DELHI',
r'\bN\.?\s*DELHI\b': 'NEW DELHI',
}
# Floor name patterns
FLOOR_PATTERNS = {
r'\bGROUND\b': 'GROUND',
r'\bBASEMENT\b': 'BASEMENT',
r'\bFIRST\b': 'FIRST',
r'\bSECOND\b': 'SECOND',
r'\bTHIRD\b': 'THIRD',
r'\bFOURTH\b': 'FOURTH',
r'\bFIFTH\b': 'FIFTH',
r'\b1ST\b': 'FIRST',
r'\b2ND\b': 'SECOND',
r'\b3RD\b': 'THIRD',
r'\b4TH\b': 'FOURTH',
r'\b5TH\b': 'FIFTH',
}
def __init__(self, uppercase: bool = True, expand_abbrev: bool = True):
"""
Initialize normalizer.
Args:
uppercase: Convert text to uppercase
expand_abbrev: Expand common abbreviations
"""
self.uppercase = uppercase
self.expand_abbrev = expand_abbrev
# Compile regex patterns
self._abbrev_patterns = {
re.compile(pattern, re.IGNORECASE): replacement
for pattern, replacement in self.ABBREVIATIONS.items()
}
def normalize(self, address: str) -> str:
"""
Normalize an address string.
Args:
address: Raw address string
Returns:
Normalized address string
"""
if not address:
return ""
text = address
# Basic cleanup
text = self._clean_whitespace(text)
text = self._standardize_punctuation(text)
# Expand abbreviations
if self.expand_abbrev:
text = self._expand_abbreviations(text)
# Case normalization
if self.uppercase:
text = text.upper()
# Final whitespace cleanup
text = self._clean_whitespace(text)
return text
def _clean_whitespace(self, text: str) -> str:
"""Remove extra whitespace."""
# Replace multiple spaces with single space
text = re.sub(r'\s+', ' ', text)
# Remove spaces around punctuation
text = re.sub(r'\s*,\s*', ', ', text)
text = re.sub(r'\s*-\s*', '-', text)
# Trim
return text.strip()
def _standardize_punctuation(self, text: str) -> str:
"""Standardize punctuation marks."""
# Replace various dash types with standard hyphen
text = re.sub(r'[–—]', '-', text)
# Remove duplicate punctuation
text = re.sub(r',+', ',', text)
text = re.sub(r'-+', '-', text)
# Remove trailing punctuation before comma
text = re.sub(r'-,', ',', text)
return text
def _expand_abbreviations(self, text: str) -> str:
"""Expand common abbreviations."""
for pattern, replacement in self._abbrev_patterns.items():
text = pattern.sub(replacement, text)
return text
def extract_pincode(self, address: str) -> str | None:
"""Extract 6-digit Indian PIN code from address."""
match = re.search(r'\b[1-9]\d{5}\b', address)
return match.group(0) if match else None
def remove_pincode(self, address: str) -> str:
"""Remove PIN code from address."""
return re.sub(r'\b[1-9]\d{5}\b', '', address)
def tokenize(self, text: str) -> list[str]:
"""
Simple tokenization preserving address-specific patterns.
Args:
text: Normalized address text
Returns:
List of tokens
"""
# Split on whitespace but keep special patterns together
# e.g., "H-3" stays as one token, "110041" stays together
tokens = []
# Pattern to match address tokens
pattern = r'''
[A-Z0-9]+[-/][A-Z0-9/]+ | # Compound identifiers like H-3, 24/1/3
[A-Z]+\d+ | # Letter+number combos like A5
\d+[A-Z]+ | # Number+letter combos like 5A
[A-Z]+ | # Words
\d+ | # Numbers
[,.] # Punctuation
'''
for match in re.finditer(pattern, text.upper(), re.VERBOSE):
token = match.group(0)
if token.strip():
tokens.append(token)
return tokens
|