# bharatgraph/processing/cleaner.py
# Last commit fa87992 (abinazebinoy):
#   fix(ascii): replace all non-ASCII chars in Python source files
# File size: 8.71 kB
"""
BharatGraph - Data Cleaner
Normalizes messy Indian names, titles, and text fields
scraped from government datasets.
"""
import math
import re
import unicodedata

from loguru import logger
# Honorifics and rank prefixes stripped from the front of person names.
# Entries are lowercase; matching against names is case-insensitive.
INDIAN_TITLES = [
    # Common civil honorifics (with and without a trailing dot)
    "shri",
    "smt",
    "smt.",
    "shri.",
    "dr",
    "dr.",
    "prof",
    "prof.",
    "mr",
    "mr.",
    "mrs",
    "mrs.",
    "ms",
    "ms.",
    "adv",
    "adv.",
    # Parliamentary / professional styles
    "hon",
    "hon.",
    "honble",
    "hon'ble",
    "er",
    "er.",
    # Military ranks
    "col",
    "col.",
    "gen",
    "brig",
    "maj",
    "capt",
    # Judiciary
    "justice",
    "judge",
    "chief justice",
]
class NameCleaner:
    """
    Cleans and normalizes Indian personal names, company names, currency
    amounts and state names found in scraped records.
    Used before entity resolution to ensure consistent matching.
    """

    # Leading "M/s", "M/S.", "M / s" trade prefix on company names.
    _COMPANY_PREFIX_RE = re.compile(r"^m\s*/\s*s\.?\s*", re.IGNORECASE)

    # Trailing legal-form suffixes, tried in order; first hit wins.
    # Every rule starts with (?<!\w) so a suffix is only recognised as a
    # separate word: "Group Ltd" must not become "GrouPvt Ltd" and
    # "Unlimited" must not become "UnLtd".
    _COMPANY_SUFFIX_RULES = (
        (
            re.compile(
                r"(?<!\w)(?:private|pvt\.?|\(p\)|p\.?)\s*(?:limited|ltd\.?)$",
                re.IGNORECASE,
            ),
            "Pvt Ltd",
        ),
        (re.compile(r"(?<!\w)(?:l\.l\.p\.?|llp)$", re.IGNORECASE), "LLP"),
        (re.compile(r"(?<!\w)(?:limited|ltd\.)$", re.IGNORECASE), "Ltd"),
    )

    # Currency markers ("Rs", "Rs.", "INR", rupee sign). The letter guards
    # keep the "rs" inside unit words such as "Crs" from being removed.
    _CURRENCY_RE = re.compile(
        r"(?<![a-z])(?:rs\.?|inr)(?![a-z])|\u20b9", re.IGNORECASE
    )

    # <number><optional unit><optional trailing dot><optional "/-">,
    # applied after commas and whitespace have been removed.
    _AMOUNT_RE = re.compile(
        r"([-+]?(?:\d+\.?\d*|\.\d+))(crores?|crs?|lakhs?|lacs?)?\.?(?:/-)?",
        re.IGNORECASE,
    )

    # Lookup keys are lowercase with whitespace, dots, hyphens and
    # underscores removed.
    _STATE_MAP = {
        "tn": "Tamil Nadu", "tamilnadu": "Tamil Nadu",
        "mh": "Maharashtra", "maharashtra": "Maharashtra",
        "dl": "Delhi", "delhi": "Delhi",
        "ka": "Karnataka", "karnataka": "Karnataka",
        "up": "Uttar Pradesh", "uttarpradesh": "Uttar Pradesh",
        "wb": "West Bengal", "westbengal": "West Bengal",
        "rj": "Rajasthan", "rajasthan": "Rajasthan",
        "mp": "Madhya Pradesh", "madhyapradesh": "Madhya Pradesh",
        "gj": "Gujarat", "gujarat": "Gujarat",
        "ap": "Andhra Pradesh", "andhrapradesh": "Andhra Pradesh",
        "ts": "Telangana", "telangana": "Telangana",
        "kl": "Kerala", "kerala": "Kerala",
        "pb": "Punjab", "punjab": "Punjab",
        "hr": "Haryana", "haryana": "Haryana",
        "br": "Bihar", "bihar": "Bihar",
        "or": "Odisha", "od": "Odisha", "odisha": "Odisha",
        "orissa": "Odisha",
        "as": "Assam", "assam": "Assam",
        "jh": "Jharkhand", "jharkhand": "Jharkhand",
        "hp": "Himachal Pradesh", "himachalpradesh": "Himachal Pradesh",
        "uk": "Uttarakhand", "uttarakhand": "Uttarakhand",
        "ct": "Chhattisgarh", "cg": "Chhattisgarh",
        "chhattisgarh": "Chhattisgarh",
        "ga": "Goa", "goa": "Goa",
        "mn": "Manipur", "manipur": "Manipur",
        "ml": "Meghalaya", "meghalaya": "Meghalaya",
        "mz": "Mizoram", "mizoram": "Mizoram",
        "nl": "Nagaland", "nagaland": "Nagaland",
        "tr": "Tripura", "tripura": "Tripura",
        "sk": "Sikkim", "sikkim": "Sikkim",
        "ar": "Arunachal Pradesh", "arunachalpradesh": "Arunachal Pradesh",
    }

    @staticmethod
    def _title_prefix_pattern():
        """
        Build one anchored regex matching any entry of INDIAN_TITLES at the
        start of a name, followed by either a dot or whitespace.
        Returns None when the title list is empty.
        """
        # Dotted and undotted spellings collapse to the same alternative
        # because the pattern itself accepts an optional dot.
        bare = {t.rstrip(".") for t in INDIAN_TITLES if t.rstrip(".")}
        if not bare:
            return None
        alternatives = "|".join(
            re.escape(t) for t in sorted(bare, key=len, reverse=True)
        )
        return re.compile(
            rf"^(?:{alternatives})(?:\.\s*|\s+)", re.IGNORECASE
        )

    @staticmethod
    def _title_text(value):
        """Strip and title-case a string; return non-strings unchanged."""
        if isinstance(value, str):
            return value.strip().title()
        return value

    def clean_person_name(self, name: str) -> str:
        """
        Normalize a person name.
        'SHRI RAHUL KUMAR' -> 'Rahul Kumar'
        'Dr. A.P.J. Abdul Kalam' -> 'A.P.J. Abdul Kalam'
        'Dr. Shri Ram Nath' -> 'Ram Nath'

        Returns "" for empty or non-string input.
        """
        if not name or not isinstance(name, str):
            return ""
        name = unicodedata.normalize("NFKC", name)
        # Drop parenthesised remarks, then any character that is not a
        # word character, whitespace, dot, hyphen or apostrophe.
        name = re.sub(r"\(.*?\)", "", name)
        name = re.sub(r"[^\w\s.\-']", " ", name)
        # Collapse whitespace before matching so the "^" anchor sees the
        # first real token even when punctuation was blanked out in front.
        name = re.sub(r"\s+", " ", name).strip().title()

        title_re = self._title_prefix_pattern()
        if title_re is not None:
            # Strip repeatedly: stacked titles ("Hon. Justice", "Dr. Shri")
            # must all go, whatever order they appear in.
            while True:
                remainder = title_re.sub("", name, count=1).lstrip()
                if remainder == name:
                    break
                name = remainder
        return name.strip()

    def clean_company_name(self, name: str) -> str:
        """
        Normalize a company name.
        'M/S. DELHI ROADS LTD' -> 'Delhi Roads Ltd'
        'M/S SAMPLE INFRASTRUCTURE PRIVATE LIMITED' -> 'Sample Infrastructure Pvt Ltd'
        'ABC CONSTRUCTIONS PVT. LTD.' -> 'Abc Constructions Pvt Ltd'
        'ABC GROUP LTD' -> 'Abc Group Ltd'

        Returns "" for empty or non-string input.
        """
        if not name or not isinstance(name, str):
            return ""
        name = unicodedata.normalize("NFKC", name)
        name = re.sub(r"\s+", " ", name).strip()
        # Strip M/s prefix before title case
        name = self._COMPANY_PREFIX_RE.sub("", name).strip()
        name = name.title()
        # Normalize the trailing legal-form suffix; only the first
        # matching rule is applied.
        for pattern, replacement in self._COMPANY_SUFFIX_RULES:
            name, hits = pattern.subn(replacement, name)
            if hits:
                break
        return re.sub(r"\s+", " ", name).strip()

    def clean_amount(self, amount_str: str) -> float:
        """
        Parse Indian currency strings to float in crore (rounded to 4 places).
        '150 Cr' -> 150.0, '500 lakh' -> 5.0, '1.5 Cr' -> 1.5,
        '1500000' -> 0.15 (raw rupees), 'Rs.75cr' -> 75.0

        A value with no unit is treated as raw rupees. Returns 0.0 for
        empty, non-finite or unparseable input.
        """
        if not amount_str:
            return 0.0
        text = str(amount_str).strip()

        # Plain number -> treat as raw rupees
        try:
            rupees = float(text.replace(",", ""))
        except ValueError:
            rupees = None
        if rupees is not None:
            # float() also accepts "nan" / "inf"; those are not amounts.
            return round(rupees / 1e7, 4) if math.isfinite(rupees) else 0.0

        # Remove currency markers, thousands separators and spaces, but
        # keep the decimal point.
        compact = self._CURRENCY_RE.sub("", text)
        compact = re.sub(r"[,\s]", "", compact)
        match = self._AMOUNT_RE.fullmatch(compact)
        if match is None:
            return 0.0

        unit = (match.group(2) or "").lower()
        if unit.startswith("c"):
            multiplier = 1.0  # crore
        elif unit.startswith("l"):
            multiplier = 0.01  # lakh -> crore
        else:
            multiplier = 1 / 1e7  # rupees -> crore
        return round(float(match.group(1)) * multiplier, 4)

    def clean_state_name(self, state: str) -> str:
        """
        Normalize Indian state names. 'TN' -> 'Tamil Nadu'

        Unknown values are returned stripped and title-cased. Returns ""
        for empty or non-string input.
        """
        if not state or not isinstance(state, str):
            return ""
        text = re.sub(r"\s+", " ", state).strip()
        key = re.sub(r"[\s.\-_]+", "", text.lower())
        return self._STATE_MAP.get(key, text.title())

    def _resolve_amount_crore(self, crore_value, inr_value) -> float:
        """
        Pick the crore amount for a contract record.

        An existing amount_crore value is already in crore, so a plain
        number there is kept as-is; unit-bearing text goes through
        clean_amount. Otherwise the amount is derived from amount_inr.
        """
        if crore_value is not None and not isinstance(crore_value, bool):
            text = str(crore_value).strip()
            if text:
                try:
                    crore = float(text.replace(",", ""))
                except ValueError:
                    # Unit-bearing text such as "150 Cr".
                    return self.clean_amount(text)
                if math.isfinite(crore):
                    return round(crore, 4)
        if inr_value is None:
            return 0.0
        return self.clean_amount(str(inr_value))

    def clean_record(self, record: dict, record_type: str = "person") -> dict:
        """
        Clean all fields in a scraped record dict.

        Works on a shallow copy; the input dict is not modified. The
        original name is preserved under 'name_raw' / 'seller_name_raw'
        and '_cleaned' is set to True on the result. record_type is one
        of "person", "company" or "contract"; any other value only gets
        the '_cleaned' flag.
        """
        cleaned = dict(record)
        if record_type == "person":
            if "name" in cleaned:
                cleaned["name_raw"] = cleaned["name"]
                cleaned["name"] = self.clean_person_name(cleaned["name"])
            if "party" in cleaned:
                cleaned["party"] = self._title_text(cleaned["party"])
            if "state" in cleaned:
                cleaned["state"] = self.clean_state_name(cleaned["state"])
        elif record_type == "company":
            if "name" in cleaned:
                cleaned["name_raw"] = cleaned["name"]
                cleaned["name"] = self.clean_company_name(cleaned["name"])
            if "state" in cleaned:
                cleaned["state"] = self.clean_state_name(cleaned["state"])
        elif record_type == "contract":
            if "seller_name" in cleaned:
                cleaned["seller_name_raw"] = cleaned["seller_name"]
                cleaned["seller_name"] = self.clean_company_name(
                    cleaned["seller_name"]
                )
            if "buyer_org" in cleaned:
                cleaned["buyer_org"] = self._title_text(cleaned["buyer_org"])
            if "state" in cleaned:
                cleaned["state"] = self.clean_state_name(cleaned["state"])
            if "amount_inr" in cleaned:
                cleaned["amount_crore"] = self._resolve_amount_crore(
                    cleaned.get("amount_crore"), cleaned["amount_inr"]
                )
        cleaned["_cleaned"] = True
        return cleaned
if __name__ == "__main__":
    # Manual smoke test: print sample inputs next to their cleaned form.
    banner = "=" * 55
    print(banner)
    print("BharatGraph - Name Cleaner Test")
    print(banner)
    cleaner = NameCleaner()

    person_samples = [
        "SHRI RAHUL KUMAR",
        "smt. priya devi",
        "Dr. A.P.J. Abdul Kalam",
        "Hon. Justice Ranjan Gogoi",
        "MR. NARENDRA MODI",
        " adv. sunita sharma ",
    ]
    print("\n[1] Person names:")
    for raw_person in person_samples:
        print(f" '{raw_person}' -> '{cleaner.clean_person_name(raw_person)}'")

    company_samples = [
        "M/S SAMPLE INFRASTRUCTURE PRIVATE LIMITED",
        "ABC CONSTRUCTIONS PVT. LTD.",
        "xyz trading co. llp",
        "M/S. DELHI ROADS LTD",
    ]
    print("\n[2] Company names:")
    for raw_company in company_samples:
        print(f" '{raw_company}' -> '{cleaner.clean_company_name(raw_company)}'")

    amount_samples = ["150 Cr", "500 lakh", "1500000", "Rs. 2,50,00,000", "Rs.75cr"]
    print("\n[3] Amounts (to crore):")
    for raw_amount in amount_samples:
        print(f" '{raw_amount}' -> {cleaner.clean_amount(raw_amount)} Cr")

    state_samples = ["TN", "tamilnadu", "MH", "dl", "UP", "wb"]
    print("\n[4] States:")
    for raw_state in state_samples:
        print(f" '{raw_state}' -> '{cleaner.clean_state_name(raw_state)}'")

    print("\n[5] Full record:")
    sample_record = {
        "name": "SHRI RAMESH KUMAR",
        "party": "sample party",
        "state": "TN",
        "criminal_cases": "3",
    }
    print(f" In: {sample_record}")
    print(f" Out: {cleaner.clean_record(sample_record, 'person')}")
    print("\nDone!")