Spaces:
Running
Running
| """ | |
| BharatGraph - Data Cleaner | |
| Normalizes messy Indian names, titles, and text fields | |
| scraped from government datasets. | |
| """ | |
| import re | |
| import unicodedata | |
| from loguru import logger | |
# Lowercase honorifics, professional prefixes and rank/office titles that may
# precede a scraped person name. Several abbreviations are listed both with
# and without a trailing dot. Element order is kept as-is.
INDIAN_TITLES = [
    # Civil honorifics and professional prefixes
    "shri", "smt", "smt.", "shri.",
    "dr", "dr.", "prof", "prof.",
    "mr", "mr.", "mrs", "mrs.",
    "ms", "ms.", "adv", "adv.",
    # "Honourable" variants and engineer prefix
    "hon", "hon.", "honble", "hon'ble",
    "er", "er.",
    # Military ranks
    "col", "col.", "gen", "brig", "maj", "capt",
    # Judicial offices
    "justice", "judge", "chief justice",
]
class NameCleaner:
    """
    Cleans and normalizes Indian personal names and company names.
    Used before entity resolution to ensure consistent matching.

    Also parses currency strings into crore, expands state abbreviations,
    and (via ``clean_record``) applies the right cleaners per record type.
    """

    # Parenthetical remarks such as "(INC)" or "(Retd.)".
    _PARENTHETICAL_RE = re.compile(r"\(.*?\)")
    # Anything that is not a word char, whitespace, dot, hyphen or apostrophe.
    _NAME_JUNK_RE = re.compile(r"[^\w\s.\-']")
    _WHITESPACE_RE = re.compile(r"\s+")
    # "M/s", "M/S.", "m / s" prefix; \b keeps words like "M/Steel" intact.
    _MS_PREFIX_RE = re.compile(r"^m\s*/\s*s\b\.?\s*", re.IGNORECASE)

    # Trailing legal-form suffixes and their canonical spelling; the first
    # matching entry wins. Every pattern is anchored on a token boundary so
    # that names merely *ending* in the same letters ("Gallop Ltd",
    # "Unlimited") are not rewritten.
    _COMPANY_SUFFIXES = (
        (
            re.compile(
                r"(?:\(\s*(?:private|pvt|p)\.?\s*\)"   # "(P)", "(Pvt.)"
                r"|\b(?:private|pvt)\.?"                # "Private", "Pvt."
                r"|(?<![\w.])p\.?)"                     # stand-alone "P" / "P."
                r"\s*(?:limited|ltd)\.?$",
                re.IGNORECASE,
            ),
            "Pvt Ltd",
        ),
        (re.compile(r"\b(?:l\.\s*l\.\s*p|llp)\.?$", re.IGNORECASE), "LLP"),
        (re.compile(r"\b(?:limited|ltd)\.?$", re.IGNORECASE), "Ltd"),
    )

    # Matched against the amount with commas/whitespace already removed:
    # optional currency marker, a number, optional crore/lakh unit and an
    # optional trailing "." or "/-".
    _AMOUNT_RE = re.compile(
        r"(?:rs\.?|inr|\u20b9)?"
        r"(?P<number>[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:e[-+]?\d+)?)"
        r"(?P<unit>crores?|crs?|lakhs?|lacs?)?"
        r"\.?(?:/-)?",
        re.IGNORECASE,
    )

    # Lookup keys are lowercase with whitespace, dots and hyphens removed.
    _STATE_KEY_STRIP_RE = re.compile(r"[\s.\-]+")
    _STATE_MAP = {
        "tn": "Tamil Nadu", "tamilnadu": "Tamil Nadu",
        "mh": "Maharashtra", "maharashtra": "Maharashtra",
        "dl": "Delhi", "delhi": "Delhi",
        "ka": "Karnataka", "karnataka": "Karnataka",
        "up": "Uttar Pradesh", "uttarpradesh": "Uttar Pradesh",
        "wb": "West Bengal", "westbengal": "West Bengal",
        "rj": "Rajasthan", "rajasthan": "Rajasthan",
        "mp": "Madhya Pradesh", "madhyapradesh": "Madhya Pradesh",
        "gj": "Gujarat", "gujarat": "Gujarat",
        "ap": "Andhra Pradesh", "andhrapradesh": "Andhra Pradesh",
        "ts": "Telangana", "tg": "Telangana", "telangana": "Telangana",
        "kl": "Kerala", "kerala": "Kerala",
        "pb": "Punjab", "punjab": "Punjab",
        "hr": "Haryana", "haryana": "Haryana",
        "br": "Bihar", "bihar": "Bihar",
        "or": "Odisha", "od": "Odisha", "odisha": "Odisha",
        "orissa": "Odisha",
        "as": "Assam", "assam": "Assam",
        "jh": "Jharkhand", "jharkhand": "Jharkhand",
        "hp": "Himachal Pradesh", "himachalpradesh": "Himachal Pradesh",
        "uk": "Uttarakhand", "uttarakhand": "Uttarakhand",
        "ct": "Chhattisgarh", "cg": "Chhattisgarh",
        "chhattisgarh": "Chhattisgarh",
        "ga": "Goa", "goa": "Goa",
        "mn": "Manipur", "manipur": "Manipur",
        "ml": "Meghalaya", "meghalaya": "Meghalaya",
        "mz": "Mizoram", "mizoram": "Mizoram",
        "nl": "Nagaland", "nagaland": "Nagaland",
        "tr": "Tripura", "tripura": "Tripura",
        "sk": "Sikkim", "sikkim": "Sikkim",
        "ar": "Arunachal Pradesh", "arunachalpradesh": "Arunachal Pradesh",
    }

    @staticmethod
    def _title_prefix_pattern():
        """Compile one regex matching any INDIAN_TITLES entry at the start.

        Dotted variants ("dr.") collapse onto their stem ("dr"); the stem
        must be followed by a dot or whitespace so that names which merely
        begin with the same letters ("Erica") are left alone.
        Returns None when the title list is empty.
        """
        stems = {t.strip().rstrip(".").lower() for t in INDIAN_TITLES}
        stems.discard("")
        if not stems:
            return None
        # Longest first (then alphabetical, for a deterministic pattern).
        ordered = sorted(stems, key=lambda s: (-len(s), s))
        alternation = "|".join(
            r"\s+".join(re.escape(word) for word in stem.split())
            for stem in ordered
        )
        return re.compile(rf"^(?:{alternation})(?:\.\s*|\s+)", re.IGNORECASE)

    @staticmethod
    def _to_finite_float(value):
        """Return ``float(value)`` if it is a finite number, else None."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        # Reject NaN (never equal to itself) and +/- infinity.
        if number != number or number in (float("inf"), float("-inf")):
            return None
        return number

    @staticmethod
    def _title_text(value):
        """Strip and title-case a string; pass non-strings (e.g. None) through."""
        if isinstance(value, str):
            return value.strip().title()
        return value

    def clean_person_name(self, name: str) -> str:
        """
        Normalize a person name.
        'SHRI RAHUL KUMAR' -> 'Rahul Kumar'
        'Dr. A.P.J. Abdul Kalam' -> 'A.P.J. Abdul Kalam'
        'Justice Dr. Anil Kumar' -> 'Anil Kumar'

        Returns "" for empty or non-string input. All leading titles found
        in INDIAN_TITLES are removed, however many are stacked and in
        whatever order they appear.
        """
        if not name or not isinstance(name, str):
            return ""
        name = unicodedata.normalize("NFKC", name)
        # NFKC keeps typographic quotes; map them to "'" so "Hon’ble" is
        # recognised as a title instead of being split into "Hon Ble".
        name = name.replace("\u2018", "'").replace("\u2019", "'")
        # Replace with a space (not "") so neighbouring words do not fuse.
        name = self._PARENTHETICAL_RE.sub(" ", name)
        name = self._NAME_JUNK_RE.sub(" ", name)
        name = self._WHITESPACE_RE.sub(" ", name).strip().title()

        title_re = self._title_prefix_pattern()
        if title_re is None:
            return name
        # Keep stripping until no leading title is left ("Hon. Justice ...").
        while True:
            remainder = title_re.sub("", name, count=1).strip()
            if remainder == name:
                return name
            name = remainder

    def clean_company_name(self, name: str) -> str:
        """
        Normalize a company name.
        'M/S. DELHI ROADS LTD' -> 'Delhi Roads Ltd'
        'M/S SAMPLE INFRASTRUCTURE PRIVATE LIMITED' -> 'Sample Infrastructure Pvt Ltd'
        'ABC CONSTRUCTIONS PVT. LTD.' -> 'Abc Constructions Pvt Ltd'
        'XYZ (P) LTD' -> 'Xyz Pvt Ltd'

        Returns "" for empty or non-string input.
        """
        if not name or not isinstance(name, str):
            return ""
        name = unicodedata.normalize("NFKC", name).strip()
        name = self._WHITESPACE_RE.sub(" ", name)
        # Strip the M/s prefix before title-casing.
        name = self._MS_PREFIX_RE.sub("", name).strip()
        name = name.title()
        # Canonicalise only the trailing legal-form suffix; first hit wins.
        for pattern, replacement in self._COMPANY_SUFFIXES:
            name, hits = pattern.subn(replacement, name)
            if hits:
                break
        return self._WHITESPACE_RE.sub(" ", name).strip()

    def clean_amount(self, amount_str: str) -> float:
        """
        Parse Indian currency strings to float in crore (rounded to 4 places).
        '150 Cr' -> 150.0, '1.5 Cr' -> 1.5, '500 lakh' -> 5.0,
        '15000000' -> 1.5 (raw rupees), 'Rs. 2,50,00,000' -> 2.5,
        'Rs.75cr' -> 75.0

        A number without a unit is treated as raw rupees. Empty, non-finite
        or unparseable input yields 0.0. Non-string input is converted with
        ``str()`` first.
        """
        if not amount_str:
            return 0.0
        # Drop thousands separators and spacing only; the decimal point
        # must survive ("1.5 Cr" is 1.5 crore, not 15).
        compact = re.sub(r"[,\s]", "", str(amount_str))
        match = self._AMOUNT_RE.fullmatch(compact)
        if match is None:
            return 0.0
        value = self._to_finite_float(match.group("number"))
        if value is None:
            return 0.0
        unit = (match.group("unit") or "").lower()
        if unit.startswith("c"):        # crore / cr
            crore = value
        elif unit.startswith("l"):      # lakh / lac: 100 lakh = 1 crore
            crore = value / 100
        else:                           # raw rupees: 1 crore = 1e7 rupees
            crore = value / 1e7
        return round(crore, 4)

    def clean_state_name(self, state: str) -> str:
        """Normalize Indian state names. 'TN' -> 'Tamil Nadu'

        Codes and spellings are matched case-insensitively, ignoring
        whitespace, dots and hyphens ('U.P.' -> 'Uttar Pradesh'). Unknown
        values are returned stripped and title-cased; empty or non-string
        input yields "".
        """
        if not state or not isinstance(state, str):
            return ""
        key = self._STATE_KEY_STRIP_RE.sub("", state.lower())
        fallback = self._WHITESPACE_RE.sub(" ", state).strip().title()
        return self._STATE_MAP.get(key, fallback)

    def _amount_in_crore(self, record: dict) -> float:
        """Resolve a contract's amount in crore.

        An existing ``amount_crore`` value is already in crore, so a bare
        number there is kept as-is (not divided by 1e7); a string with a
        unit ('150 Cr') is parsed by ``clean_amount``. Otherwise the value
        is derived from ``amount_inr``.
        """
        existing = record.get("amount_crore")
        if existing is not None and not isinstance(existing, bool):
            if isinstance(existing, str):
                number = self._to_finite_float(existing.replace(",", ""))
                if number is None:
                    # Not a bare number; a zero/unparseable result falls
                    # through to amount_inr below.
                    number = self.clean_amount(existing) or None
            else:
                number = self._to_finite_float(existing)
            if number is not None:
                return round(number, 4)
        return self.clean_amount(str(record.get("amount_inr", 0)))

    def clean_record(self, record: dict, record_type: str = "person") -> dict:
        """Clean all fields in a scraped record dict.

        Works on a shallow copy; the input is not modified. ``record_type``
        selects which fields are cleaned ("person", "company" or
        "contract"); any other value leaves the fields untouched. The
        original name is preserved under ``name_raw`` / ``seller_name_raw``
        and ``_cleaned`` is always set to True.
        """
        cleaned = dict(record)
        if record_type == "person":
            if "name" in cleaned:
                cleaned["name_raw"] = cleaned["name"]
                cleaned["name"] = self.clean_person_name(cleaned["name"])
            if "party" in cleaned:
                # NOTE(review): title-casing turns acronyms such as 'BJP'
                # into 'Bjp' - confirm this is what matching expects.
                cleaned["party"] = self._title_text(cleaned["party"])
            if "state" in cleaned:
                cleaned["state"] = self.clean_state_name(cleaned["state"])
        elif record_type == "company":
            if "name" in cleaned:
                cleaned["name_raw"] = cleaned["name"]
                cleaned["name"] = self.clean_company_name(cleaned["name"])
            if "state" in cleaned:
                cleaned["state"] = self.clean_state_name(cleaned["state"])
        elif record_type == "contract":
            if "seller_name" in cleaned:
                cleaned["seller_name_raw"] = cleaned["seller_name"]
                cleaned["seller_name"] = self.clean_company_name(
                    cleaned["seller_name"]
                )
            if "buyer_org" in cleaned:
                cleaned["buyer_org"] = self._title_text(cleaned["buyer_org"])
            if "state" in cleaned:
                cleaned["state"] = self.clean_state_name(cleaned["state"])
            if "amount_inr" in cleaned:
                cleaned["amount_crore"] = self._amount_in_crore(cleaned)
        cleaned["_cleaned"] = True
        return cleaned
if __name__ == "__main__":
    # Manual smoke test: prints before/after pairs for each cleaner.
    banner = "=" * 55
    print(banner)
    print("BharatGraph - Name Cleaner Test")
    print(banner)

    cleaner = NameCleaner()

    person_samples = (
        "SHRI RAHUL KUMAR",
        "smt. priya devi",
        "Dr. A.P.J. Abdul Kalam",
        "Hon. Justice Ranjan Gogoi",
        "MR. NARENDRA MODI",
        " adv. sunita sharma ",
    )
    company_samples = (
        "M/S SAMPLE INFRASTRUCTURE PRIVATE LIMITED",
        "ABC CONSTRUCTIONS PVT. LTD.",
        "xyz trading co. llp",
        "M/S. DELHI ROADS LTD",
    )
    amount_samples = ("150 Cr", "500 lakh", "1500000", "Rs. 2,50,00,000", "Rs.75cr")
    state_samples = ("TN", "tamilnadu", "MH", "dl", "UP", "wb")

    print("\n[1] Person names:")
    for raw in person_samples:
        result = cleaner.clean_person_name(raw)
        print(f" '{raw}' -> '{result}'")

    print("\n[2] Company names:")
    for raw in company_samples:
        result = cleaner.clean_company_name(raw)
        print(f" '{raw}' -> '{result}'")

    print("\n[3] Amounts (to crore):")
    for raw in amount_samples:
        result = cleaner.clean_amount(raw)
        print(f" '{raw}' -> {result} Cr")

    print("\n[4] States:")
    for raw in state_samples:
        result = cleaner.clean_state_name(raw)
        print(f" '{raw}' -> '{result}'")

    print("\n[5] Full record:")
    sample_record = {
        "name": "SHRI RAMESH KUMAR",
        "party": "sample party",
        "state": "TN",
        "criminal_cases": "3",
    }
    print(f" In: {sample_record}")
    print(f" Out: {cleaner.clean_record(sample_record, 'person')}")

    print("\nDone!")