"""Delhi locality gazetteer for fuzzy matching and validation.""" from rapidfuzz import fuzz, process class DelhiGazetteer: """ Gazetteer of Delhi localities, areas, and common address terms. Used for: - Fuzzy matching to correct misspellings - Entity validation - Confidence boosting for known locations """ # Major Delhi localities/areas LOCALITIES = { # South Delhi "SAKET", "MALVIYA NAGAR", "HAUZ KHAS", "GREEN PARK", "GREATER KAILASH", "DEFENCE COLONY", "LAJPAT NAGAR", "SOUTH EXTENSION", "CHITTARANJAN PARK", "KALKAJI", "NEHRU PLACE", "OKHLA", "JASOLA", "SARITA VIHAR", "ALAKNANDA", "SAFDARJUNG", "VASANT KUNJ", "MEHRAULI", "CHATTARPUR", # North Delhi "CIVIL LINES", "MODEL TOWN", "MUKHERJEE NAGAR", "KAMLA NAGAR", "SHAKTI NAGAR", "GULABI BAGH", "ASHOK VIHAR", "SHALIMAR BAGH", "PITAMPURA", "ROHINI", "NARELA", "BAWANA", "ALIPUR", # East Delhi "PREET VIHAR", "MAYUR VIHAR", "PATPARGANJ", "PANDAV NAGAR", "LAKSHMI NAGAR", "SHAKARPUR", "GEETA COLONY", "GANDHI NAGAR", "DILSHAD GARDEN", "SEELAMPUR", "SHAHDARA", "ANAND VIHAR", # West Delhi "JANAKPURI", "DWARKA", "PALAM", "UTTAM NAGAR", "VIKASPURI", "TILAK NAGAR", "RAJOURI GARDEN", "PUNJABI BAGH", "PASCHIM VIHAR", "MEERA BAGH", "PEERAGARHI", "MUNDKA", "NANGLOI", "NAJAFGARH", "BINDAPUR", "KAKROLA", "MOHAN GARDEN", "NAWADA", # Central Delhi "CONNAUGHT PLACE", "KAROL BAGH", "PAHARGANJ", "DARYAGANJ", "CHANDNI CHOWK", "SADAR BAZAAR", "RAJENDER NAGAR", "PATEL NAGAR", "KIRTI NAGAR", "MOTIA KHAN", "ANAND PARBAT", "JHANDEWALAN", # New Delhi "CHANAKYAPURI", "LODHI ROAD", "GOLF LINKS", "JORBAGH", "SUNDAR NAGAR", "NIZAMUDDIN", "LODI COLONY", "PANDARA ROAD", # Other areas "BADARPUR", "TUGHLAKABAD", "SANGAM VIHAR", "MADANPUR KHADAR", "GOVINDPURI", "AMBEDKAR NAGAR", "LADO SARAI", "TIGRI", "BURARI", "KARAWAL NAGAR", "BHAJANPURA", "MUSTAFABAD", "JAFFRABAD", "MAUJPUR", "GOKALPUR", "SEEMAPURI", } # Common colony/nagar suffixes NAGAR_SUFFIXES = { "NAGAR", "VIHAR", "COLONY", "ENCLAVE", "EXTENSION", "PURI", "PARK", "GARDEN", "BAGH", "KUNJ", "APARTMENT", "RESIDENCY", "COMPLEX", "PHASE", "SECTOR", "BLOCK", "POCKET", } # Common area names from the training data COMMON_AREAS = { "KAUNWAR SINGH NAGAR", "BABA HARI DAS COLONY", "TIKARI KALA", "CHANCHAL PARK", "SWARN PARK", "MUNDKA", "NANGLOI", "BAKKARWALA", "MAJRA DABAS", "CHAND NAGAR", "RANHOLA", "BAPROLA", "POOTH KHURD", "KIRARI", "SULTANPURI", "MANGOLPURI", "BEGUMPUR", "KADIPUR", "RAMA VIHAR", "PREM NAGAR", "VIJAY PARK", "AMBICA VIHAR", "SHIV PURI", "BUDH VIHAR", "POOTH KALAN", "QUTUBGARH", "RANI KHERA", "SHAHABAD DAIRY", "SAMAIPUR", "JAHANGIRPURI", "SANNOTH", "KANJHAWALA", "BAWANA", "ALIPUR", } # Common Hindi transliterated terms HINDI_TERMS = { "MOHALLA", "GALI", "KATRA", "BASTI", "BAZAR", "CHOWK", "GANJ", "PUR", "ABAD", "GARH", "GAON", "KHERA", "KHURD", "KALAN", } def __init__(self, min_similarity: float = 80.0): """ Initialize gazetteer. Args: min_similarity: Minimum fuzzy match score (0-100) """ self.min_similarity = min_similarity # Build combined set for matching self.all_places = ( self.LOCALITIES | self.COMMON_AREAS | {f"{term}" for term in self.HINDI_TERMS} ) def fuzzy_match( self, text: str, limit: int = 3 ) -> list[tuple[str, float]]: """ Find fuzzy matches for a text in the gazetteer. Args: text: Text to match limit: Maximum number of matches Returns: List of (matched_text, score) tuples """ if not text or len(text) < 3: return [] matches = process.extract( text.upper(), self.all_places, scorer=fuzz.ratio, limit=limit ) return [(m[0], m[1]) for m in matches if m[1] >= self.min_similarity] def is_known_locality(self, text: str, threshold: float = 85.0) -> bool: """Check if text matches a known locality.""" matches = self.fuzzy_match(text, limit=1) return bool(matches and matches[0][1] >= threshold) def correct_spelling(self, text: str) -> str | None: """ Attempt to correct spelling using gazetteer. Returns corrected text or None if no good match. """ matches = self.fuzzy_match(text, limit=1) if matches and matches[0][1] >= 90.0: return matches[0][0] return None def get_locality_type(self, text: str) -> str | None: """ Determine if text contains a known locality type suffix. Returns the suffix type or None. """ text_upper = text.upper() for suffix in self.NAGAR_SUFFIXES: if text_upper.endswith(suffix): return suffix return None def validate_pincode(self, pincode: str, locality: str | None = None) -> bool: """ Validate if a pincode is valid for Delhi. Delhi pincodes are in range 110001-110097. """ if not pincode or not pincode.isdigit() or len(pincode) != 6: return False code = int(pincode) # Delhi pincode range return 110001 <= code <= 110097