File size: 8,710 Bytes
45703e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d60d321
 
 
45703e8
 
 
d60d321
45703e8
 
 
 
d60d321
45703e8
d60d321
45703e8
 
 
 
d60d321
 
 
45703e8
 
 
 
 
d60d321
 
45703e8
d60d321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45703e8
d60d321
45703e8
 
 
d60d321
 
fa87992
45703e8
 
 
d60d321
 
45703e8
d60d321
45703e8
 
 
fa87992
d60d321
 
 
 
 
 
 
 
 
 
45703e8
d60d321
45703e8
 
 
 
d60d321
45703e8
 
 
d60d321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45703e8
d60d321
 
 
 
 
 
 
 
 
45703e8
 
 
 
 
 
d60d321
 
45703e8
 
d60d321
 
45703e8
d60d321
45703e8
d60d321
45703e8
 
d60d321
 
45703e8
d60d321
45703e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d60d321
 
 
 
 
 
 
 
 
 
 
 
 
fa87992
d60d321
 
 
 
 
 
 
 
 
 
45703e8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
"""
BharatGraph - Data Cleaner
Normalizes messy Indian names, titles, and text fields
scraped from government datasets.
"""

import math
import re
import unicodedata

from loguru import logger


# Lower-case honorifics, professional prefixes and ranks that
# NameCleaner.clean_person_name removes from the start of a name.
# Matching there is case-insensitive. The order of entries is kept as-is
# because the stripping loop walks this list front to back.
INDIAN_TITLES = [
    # Everyday honorifics
    "shri", "smt", "smt.", "shri.",
    # Academic / professional
    "dr", "dr.", "prof", "prof.",
    "mr", "mr.", "mrs", "mrs.", "ms", "ms.",
    "adv", "adv.",
    # "Honourable" variants and engineers
    "hon", "hon.", "honble", "hon'ble",
    "er", "er.",
    # Military ranks
    "col", "col.", "gen", "brig", "maj", "capt",
    # Judiciary
    "justice", "judge", "chief justice",
]


class NameCleaner:
    """
    Cleans and normalizes Indian personal names, company names, currency
    amounts and state names.
    Used before entity resolution to ensure consistent matching.

    Every cleaner is best-effort: unusable input yields '' (names, states)
    or 0.0 (amounts) instead of raising.
    """

    # Trailing legal-form suffix -> canonical spelling. Tried in order and the
    # first hit wins, so the "private limited" family must come before plain
    # "limited". The \b anchors stop a pattern from firing inside a longer
    # word ('Group Ltd' must not turn into 'GrouPvt Ltd').
    _COMPANY_SUFFIXES = (
        (r"(?:\bprivate|\bpvt\.?|\(p\)|\bp\.?)\s*(?:limited|ltd)\.?$", "Pvt Ltd"),
        (r"\b(?:l\.l\.p|llp)\.?$", "LLP"),
        (r"\b(?:limited|ltd)\.?$", "Ltd"),
    )

    # Unit word (with optional abbreviation dot) -> how many of that unit
    # make one crore. Text without a unit is treated as raw rupees.
    _AMOUNT_UNITS = (
        (r"(?:crores?|crs?)\.?", 1.0),
        (r"(?:lakhs?|lacs?)\.?", 100.0),
    )

    # Lookup key is the input lower-cased with all whitespace removed.
    _STATE_MAP = {
        "tn": "Tamil Nadu",       "tamilnadu": "Tamil Nadu",
        "mh": "Maharashtra",      "maharashtra": "Maharashtra",
        "dl": "Delhi",            "delhi": "Delhi",
        "ka": "Karnataka",        "karnataka": "Karnataka",
        "up": "Uttar Pradesh",    "uttarpradesh": "Uttar Pradesh",
        "wb": "West Bengal",      "westbengal": "West Bengal",
        "rj": "Rajasthan",        "rajasthan": "Rajasthan",
        "mp": "Madhya Pradesh",   "madhyapradesh": "Madhya Pradesh",
        "gj": "Gujarat",          "gujarat": "Gujarat",
        "ap": "Andhra Pradesh",   "andhrapradesh": "Andhra Pradesh",
        "ts": "Telangana",        "telangana": "Telangana",
        "tg": "Telangana",
        "kl": "Kerala",           "kerala": "Kerala",
        "pb": "Punjab",           "punjab": "Punjab",
        "hr": "Haryana",          "haryana": "Haryana",
        "br": "Bihar",            "bihar": "Bihar",
        "or": "Odisha",           "odisha": "Odisha",
        "od": "Odisha",           "orissa": "Odisha",
        "as": "Assam",            "assam": "Assam",
        "jh": "Jharkhand",        "jharkhand": "Jharkhand",
        "hp": "Himachal Pradesh", "himachalpradesh": "Himachal Pradesh",
        "uk": "Uttarakhand",      "uttarakhand": "Uttarakhand",
        "ct": "Chhattisgarh",     "chhattisgarh": "Chhattisgarh",
        "cg": "Chhattisgarh",
        "ga": "Goa",              "goa": "Goa",
        "mn": "Manipur",          "manipur": "Manipur",
        "ml": "Meghalaya",        "meghalaya": "Meghalaya",
        "mz": "Mizoram",          "mizoram": "Mizoram",
        "nl": "Nagaland",         "nagaland": "Nagaland",
        "tr": "Tripura",          "tripura": "Tripura",
        "sk": "Sikkim",           "sikkim": "Sikkim",
        "ar": "Arunachal Pradesh", "arunachalpradesh": "Arunachal Pradesh",
    }

    def clean_person_name(self, name: str) -> str:
        """
        Normalize a person name.
        'SHRI RAHUL KUMAR' -> 'Rahul Kumar'
        'Dr. A.P.J. Abdul Kalam' -> 'A.P.J. Abdul Kalam'
        "Hon'ble Shri Narendra Modi" -> 'Narendra Modi'

        Steps: NFKC-normalize, drop parenthesised remarks, replace symbols
        other than word characters, dots, hyphens and apostrophes with
        spaces, title-case, then strip every leading entry of INDIAN_TITLES.

        Returns '' when ``name`` is empty or not a string.
        """
        if not name or not isinstance(name, str):
            return ""
        name = unicodedata.normalize("NFKC", name).strip()
        name = re.sub(r"\(.*?\)", "", name)
        name = re.sub(r"[^\w\s.\-']", " ", name)
        # Collapse before matching so a symbol that became a leading space
        # (or a double space inside "chief  justice") cannot hide a title.
        name = re.sub(r"\s+", " ", name).strip()
        name = name.title()

        # Longest title first so "chief justice" is preferred over "justice".
        # A title only counts when followed by whitespace or a dot, which
        # keeps names such as "Eric" or "Drona" intact.
        alternatives = "|".join(
            re.escape(title)
            for title in sorted(INDIAN_TITLES, key=len, reverse=True)
        )
        title_prefix = re.compile(
            rf"^(?:{alternatives})(?:\s+|\.\s*)", re.IGNORECASE
        )
        # Titles are frequently stacked ("Hon'ble Shri", "Dr. Smt."), so keep
        # peeling until the front of the name stops changing.
        while True:
            remainder = title_prefix.sub("", name, count=1).strip()
            if remainder == name:
                break
            name = remainder
        return re.sub(r"\s+", " ", name).strip()

    def clean_company_name(self, name: str) -> str:
        """
        Normalize a company name.
        'M/S. DELHI ROADS LTD' -> 'Delhi Roads Ltd'
        'M/S SAMPLE INFRASTRUCTURE PRIVATE LIMITED' -> 'Sample Infrastructure Pvt Ltd'
        'ABC CONSTRUCTIONS PVT. LTD.' -> 'Abc Constructions Pvt Ltd'
        'ABC (P) LTD' -> 'Abc Pvt Ltd'
        'XYZ GROUP LTD' -> 'Xyz Group Ltd'

        Returns '' when ``name`` is empty or not a string.
        """
        if not name or not isinstance(name, str):
            return ""
        name = unicodedata.normalize("NFKC", name).strip()
        name = re.sub(r"\s+", " ", name)
        # Strip the M/s prefix before title-casing. The \b makes sure the
        # "s" is a standalone letter and not the start of the actual name.
        name = re.sub(
            r"^m\s*/\s*s\b\.?\s*", "", name, flags=re.IGNORECASE
        ).strip()
        name = name.title()
        # Canonicalize the trailing legal-form suffix; the first matching
        # family wins.
        for pattern, replacement in self._COMPANY_SUFFIXES:
            name, hits = re.subn(pattern, replacement, name, flags=re.IGNORECASE)
            if hits:
                break
        return re.sub(r"\s+", " ", name).strip()

    def clean_amount(self, amount_str: str) -> float:
        """
        Parse an Indian currency string into a float amount in crore.
        '150 Cr' -> 150.0,  '1.5 crore' -> 1.5,  '500 lakh' -> 5.0,
        '1500000' -> 0.15 (raw rupees),  'Rs. 2,50,00,000' -> 2.5,
        'Rs.75cr' -> 75.0

        Currency markers (Rs, INR, the rupee sign), thousands separators,
        whitespace and a trailing '/-' are ignored. A value without a
        crore/lakh unit is interpreted as raw rupees. The result is rounded
        to 4 decimal places.

        Returns 0.0 for empty, unparseable or non-finite input.
        """
        if not amount_str:
            return 0.0
        text = str(amount_str).strip()
        # Remove the currency marker as a token. The letter lookarounds keep
        # it from biting into unit words, and the decimal point of the number
        # is never touched.
        text = re.sub(
            r"\u20b9|(?<![a-z])(?:rs|inr)(?![a-z])\.?",
            "",
            text,
            flags=re.IGNORECASE,
        )
        text = re.sub(r"[,\s]", "", text)
        text = re.sub(r"/-$", "", text)

        per_crore = 1e7  # no unit found -> raw rupees
        for unit_pattern, units_per_crore in self._AMOUNT_UNITS:
            text, hits = re.subn(unit_pattern, "", text, flags=re.IGNORECASE)
            if hits:
                per_crore = units_per_crore
                break
        try:
            value = float(text)
        except ValueError:
            return 0.0
        # float() accepts 'nan' / 'inf'; those are not usable amounts.
        if not math.isfinite(value):
            return 0.0
        return round(value / per_crore, 4)

    def clean_state_name(self, state: str) -> str:
        """
        Normalize Indian state names. 'TN' -> 'Tamil Nadu'

        Known codes and run-together spellings are mapped to the canonical
        name; anything else is returned trimmed, whitespace-collapsed and
        title-cased. Returns '' when ``state`` is empty or not a string.
        """
        if not state or not isinstance(state, str):
            return ""
        collapsed = " ".join(unicodedata.normalize("NFKC", state).split())
        key = collapsed.replace(" ", "").lower()
        return self._STATE_MAP.get(key, collapsed.title())

    @staticmethod
    def _title_text(value):
        """Trim, collapse whitespace and title-case a string; pass other values through."""
        if not isinstance(value, str):
            return value
        return " ".join(value.split()).title()

    def _contract_amount_crore(self, record: dict) -> float:
        """Return a contract's value in crore, preferring an existing 'amount_crore'."""
        existing = record.get("amount_crore")
        if existing is not None and not isinstance(existing, bool):
            text = str(existing).strip()
            if text:
                try:
                    # A bare number in this field is already in crore and
                    # must not be divided again as if it were rupees.
                    crore = float(text.replace(",", ""))
                except ValueError:
                    # Carries a unit or currency text such as '5 Cr'.
                    return self.clean_amount(text)
                if math.isfinite(crore):
                    return round(crore, 4)
        return self.clean_amount(record.get("amount_inr"))

    def clean_record(self, record: dict, record_type: str = "person") -> dict:
        """
        Clean all fields in a scraped record dict.

        Returns a shallow copy of ``record``; the input is not modified.
        Depending on ``record_type``:
          - 'person':   cleans 'name' (original kept in 'name_raw'),
                        'party' and 'state'.
          - 'company':  cleans 'name' (original kept in 'name_raw') and
                        'state'.
          - 'contract': cleans 'seller_name' (original kept in
                        'seller_name_raw'), 'buyer_org' and 'state', and,
                        when 'amount_inr' is present, sets 'amount_crore'.
        Only keys that are present are touched. Any other ``record_type``
        leaves the fields as they are. '_cleaned' is always set to True.
        """
        cleaned = dict(record)
        if record_type == "person":
            if "name" in cleaned:
                cleaned["name_raw"] = cleaned["name"]
                cleaned["name"] = self.clean_person_name(cleaned["name"])
            if "party" in cleaned:
                # Non-string values (e.g. None) are passed through unchanged.
                cleaned["party"] = self._title_text(cleaned["party"])
            if "state" in cleaned:
                cleaned["state"] = self.clean_state_name(cleaned["state"])
        elif record_type == "company":
            if "name" in cleaned:
                cleaned["name_raw"] = cleaned["name"]
                cleaned["name"] = self.clean_company_name(cleaned["name"])
            if "state" in cleaned:
                cleaned["state"] = self.clean_state_name(cleaned["state"])
        elif record_type == "contract":
            if "seller_name" in cleaned:
                cleaned["seller_name_raw"] = cleaned["seller_name"]
                cleaned["seller_name"] = self.clean_company_name(
                    cleaned["seller_name"]
                )
            if "buyer_org" in cleaned:
                cleaned["buyer_org"] = self._title_text(cleaned["buyer_org"])
            if "state" in cleaned:
                cleaned["state"] = self.clean_state_name(cleaned["state"])
            if "amount_inr" in cleaned:
                cleaned["amount_crore"] = self._contract_amount_crore(cleaned)
        cleaned["_cleaned"] = True
        return cleaned


if __name__ == "__main__":
    # Manual smoke test: run the module directly to print sample inputs next
    # to the cleaned output of each NameCleaner method.
    rule = "=" * 55
    print(rule)
    print("BharatGraph - Name Cleaner Test")
    print(rule)
    cleaner = NameCleaner()

    person_samples = (
        "SHRI RAHUL KUMAR",
        "smt. priya devi",
        "Dr. A.P.J. Abdul Kalam",
        "Hon. Justice Ranjan Gogoi",
        "MR. NARENDRA  MODI",
        "  adv. sunita sharma  ",
    )
    print("\n[1] Person names:")
    for raw_name in person_samples:
        tidy_name = cleaner.clean_person_name(raw_name)
        print(f"  '{raw_name}' -> '{tidy_name}'")

    company_samples = (
        "M/S SAMPLE INFRASTRUCTURE PRIVATE LIMITED",
        "ABC CONSTRUCTIONS PVT. LTD.",
        "xyz trading co. llp",
        "M/S. DELHI ROADS LTD",
    )
    print("\n[2] Company names:")
    for raw_company in company_samples:
        tidy_company = cleaner.clean_company_name(raw_company)
        print(f"  '{raw_company}' -> '{tidy_company}'")

    amount_samples = ("150 Cr", "500 lakh", "1500000", "Rs. 2,50,00,000", "Rs.75cr")
    print("\n[3] Amounts (to crore):")
    for raw_amount in amount_samples:
        crore_value = cleaner.clean_amount(raw_amount)
        print(f"  '{raw_amount}' -> {crore_value} Cr")

    state_samples = ("TN", "tamilnadu", "MH", "dl", "UP", "wb")
    print("\n[4] States:")
    for raw_state in state_samples:
        tidy_state = cleaner.clean_state_name(raw_state)
        print(f"  '{raw_state}' -> '{tidy_state}'")

    print("\n[5] Full record:")
    sample_record = {
        "name": "SHRI RAMESH KUMAR",
        "party": "sample party",
        "state": "TN",
        "criminal_cases": "3",
    }
    print(f"  In:  {sample_record}")
    cleaned_record = cleaner.clean_record(sample_record, "person")
    print(f"  Out: {cleaned_record}")
    print("\nDone!")