pujithapsx committed on
Commit
e9084d7
·
1 Parent(s): 236f74b

initial push

Browse files
README.md CHANGED
@@ -1,12 +1,10 @@
1
  ---
2
- title: HDFC EMBEDDING RA MATCHING
3
- emoji: 📉
4
  colorFrom: purple
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 6.9.0
8
- app_file: app.py
9
  pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: GEN AI Record Level Matching
3
+ emoji: 🔍
4
  colorFrom: purple
5
+ colorTo: orange
6
  sdk: gradio
7
+ sdk_version: "4.44.0"
8
+ app_file: frontend/app.py
9
  pinned: false
10
+ ---
 
 
backend/__init__.py ADDED
File without changes
backend/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (169 Bytes). View file
 
backend/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (173 Bytes). View file
 
backend/__pycache__/matching_service.cpython-310.pyc ADDED
Binary file (19.5 kB). View file
 
backend/__pycache__/matching_service.cpython-312.pyc ADDED
Binary file (32.9 kB). View file
 
backend/__pycache__/models.cpython-310.pyc ADDED
Binary file (14.2 kB). View file
 
backend/__pycache__/models.cpython-312.pyc ADDED
Binary file (18.9 kB). View file
 
backend/__pycache__/server.cpython-310.pyc ADDED
Binary file (8.2 kB). View file
 
backend/__pycache__/server.cpython-312.pyc ADDED
Binary file (11.8 kB). View file
 
backend/config/common.properties ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [MAPPING_DICT]
2
+
3
+ CITY_MAPPING = {"MUMBAI":["MUMBAI","BOMBAY","MUMBAI SUBURBAN"],"DELHI":["DELHI","NEW DELHI","DELHI NCR","NCT OF DELHI","SEELAMPUR","SHAHDARA","DWARKA","ROHINI","PITAMPURA","KAROL BAGH","LAJPAT NAGAR","SAKET","JANAKPURI","MAYUR VIHAR","VASANT KUNJ","OKHLA"],"BENGALURU":["BENGALURU","BANGALORE","BENGALURU URBAN"],"HYDERABAD":["HYDERABAD","SECUNDERABAD","HYDERABAD CITY"],"CHENNAI":["CHENNAI","MADRAS","CHENNAI CITY"],"KOLKATA":["KOLKATA","CALCUTTA","KOLKATA CITY"],"PUNE":["PUNE","POONA"],"AHMEDABAD":["AHMEDABAD","AMDAVAD"],"JAIPUR":["JAIPUR","PINK CITY"],"LUCKNOW":["LUCKNOW","LAKHNAU"],"KANPUR":["KANPUR","CAWNPORE"],"NAGPUR":["NAGPUR"],"INDORE":["INDORE"],"THANE":["THANE","THANA"],"BHOPAL":["BHOPAL"],"VISAKHAPATNAM":["VISAKHAPATNAM","VIZAG","VISHAKHAPATNAM"],"PIMPRI-CHINCHWAD":["PIMPRI-CHINCHWAD","PIMPRI CHINCHWAD","PCMC"],"PATNA":["PATNA","PATALIPUTRA"],"VADODARA":["VADODARA","BARODA"],"GHAZIABAD":["GHAZIABAD","GHZ"],"LUDHIANA":["LUDHIANA"],"AGRA":["AGRA"],"NASHIK":["NASHIK","NASIK"],"FARIDABAD":["FARIDABAD"],"MEERUT":["MEERUT"],"RAJKOT":["RAJKOT"],"KALYAN-DOMBIVLI":["KALYAN-DOMBIVLI","KALYAN","DOMBIVLI"],"VASAI-VIRAR":["VASAI-VIRAR","VASAI","VIRAR"],"VARANASI":["VARANASI","BANARAS","BENARES","KASHI"],"SRINAGAR":["SRINAGAR"],"AURANGABAD":["AURANGABAD"],"DHANBAD":["DHANBAD"],"AMRITSAR":["AMRITSAR"],"NAVI MUMBAI":["NAVI MUMBAI","NEW BOMBAY"],"ALLAHABAD":["ALLAHABAD","PRAYAGRAJ","ILAHABAD"],"RANCHI":["RANCHI"],"HOWRAH":["HOWRAH","HAORA"],"COIMBATORE":["COIMBATORE"],"JABALPUR":["JABALPUR","JUBBULPORE"],"GWALIOR":["GWALIOR"],"VIJAYAWADA":["VIJAYAWADA"],"JODHPUR":["JODHPUR"],"MADURAI":["MADURAI"],"RAIPUR":["RAIPUR"],"KOTA":["KOTA"],"GUWAHATI":["GUWAHATI","GAUHATI"],"CHANDIGARH":["CHANDIGARH","MOHALI","SAS 
NAGAR","KHARAR","PANCHKULA","ZIRAKPUR"],"SOLAPUR":["SOLAPUR","SHOLAPUR"],"HUBLI-DHARWAD":["HUBLI-DHARWAD","HUBLI","DHARWAD"],"BAREILLY":["BAREILLY"],"MORADABAD":["MORADABAD"],"MYSORE":["MYSORE","MYSURU"],"GURGAON":["GURGAON","GURUGRAM"],"ALIGARH":["ALIGARH"],"JALANDHAR":["JALANDHAR"],"TIRUCHIRAPPALLI":["TIRUCHIRAPPALLI","TRICHY","TRICHINOPOLY"],"BHUBANESWAR":["BHUBANESWAR","BHUBANESHWAR"],"SALEM":["SALEM"],"WARANGAL":["WARANGAL"],"THIRUVANANTHAPURAM":["THIRUVANANTHAPURAM","TRIVANDRUM"],"GUNTUR":["GUNTUR"],"BHIWANDI":["BHIWANDI"],"SAHARANPUR":["SAHARANPUR"],"GORAKHPUR":["GORAKHPUR"],"BIKANER":["BIKANER"],"AMRAVATI":["AMRAVATI"],"NOIDA":["NOIDA"],"JAMSHEDPUR":["JAMSHEDPUR","TATANAGAR"],"BHILAI":["BHILAI","BHILAI NAGAR"],"CUTTACK":["CUTTACK"],"FIROZABAD":["FIROZABAD"],"KOCHI":["KOCHI","COCHIN"],"BHAVNAGAR":["BHAVNAGAR"],"DEHRADUN":["DEHRADUN","DEHRA DUN"],"DURGAPUR":["DURGAPUR"],"ASANSOL":["ASANSOL"],"NANDED":["NANDED"],"KOLHAPUR":["KOLHAPUR"],"AJMER":["AJMER"],"GULBARGA":["GULBARGA","KALABURAGI"],"JAMNAGAR":["JAMNAGAR"],"UJJAIN":["UJJAIN"],"LONI":["LONI"],"SILIGURI":["SILIGURI"],"JHANSI":["JHANSI"],"ULHASNAGAR":["ULHASNAGAR"],"NELLORE":["NELLORE"],"JAMMU":["JAMMU"],"SANGALI-MIRAJ-KUPWAD":["SANGALI-MIRAJ-KUPWAD","SANGALI","MIRAJ","KUPWAD"],"BELGAUM":["BELGAUM","BELAGAVI"],"MANGALORE":["MANGALORE","MANGALURU"],"AMBATTUR":["AMBATTUR"],"TIRUNELVELI":["TIRUNELVELI"],"MALEGAON":["MALEGAON"],"GREATER NOIDA":["GREATER NOIDA"]}
4
+
5
+ STATE_MAPPING = {"ANDHRA PRADESH":["ANDHRA PRADESH","ANDHRAPRADESH","ANDHRA","AP","A.P","A.P.","AP STATE","IN-AP"],"ARUNACHAL PRADESH":["ARUNACHAL PRADESH","ARUNACHAL","AR","A.R","ARUNACHAL PRADESH STATE","IN-AR"],"ASSAM":["ASSAM","AS","A.S","ASSAM STATE","IN-AS"],"BIHAR":["BIHAR","BR","B.R","BIHAR STATE","IN-BR"],"CHHATTISGARH":["CHHATTISGARH","CHATTISGARH","CHHATISGARH","CG","C.G","CT","CHATTISGARH STATE","IN-CG"],"GOA":["GOA","GA","G.A","IN-GA"],"GUJARAT":["GUJARAT","GUJRAT","GUJARATH","GJ","G.J","IN-GJ"],"HARYANA":["HARYANA","HARIYANA","HR","H.R","IN-HR"],"HIMACHAL PRADESH":["HIMACHAL PRADESH","HIMACHAL","HP","H.P","H.P.","IN-HP"],"JHARKHAND":["JHARKHAND","JH","J.H","IN-JH"],"KARNATAKA":["KARNATAKA","KARNATAK","KARN","KA","K.A","MYSORE STATE","IN-KA"],"KERALA":["KERALA","KERALAM","KL","K.L","IN-KL"],"MADHYA PRADESH":["MADHYA PRADESH","MADHYAPRADESH","MADHYA","MP","M.P","M.P.","MP STATE","IN-MP"],"MAHARASHTRA":["MAHARASHTRA","MAHARASTRA","MAHA","MH","M.H","MAHARASHTRA STATE","IN-MH"],"MANIPUR":["MANIPUR","MN","M.N","IN-MN"],"MEGHALAYA":["MEGHALAYA","ML","M.L","IN-ML"],"MIZORAM":["MIZORAM","MZ","M.Z","IN-MZ"],"NAGALAND":["NAGALAND","NL","N.L","IN-NL"],"ODISHA":["ODISHA","ORISSA","OD","O.D","OR","O.R","ODISHA STATE","IN-OD"],"PUNJAB":["PUNJAB","PANJAB","PB","P.B","IN-PB"],"RAJASTHAN":["RAJASTHAN","RAJ","RJ","R.J","RAJASTHAN STATE","IN-RJ"],"SIKKIM":["SIKKIM","SK","S.K","IN-SK"],"TAMIL NADU":["TAMIL NADU","TAMILNADU","TAMIL","TN","T.N","T.N.","TAMILNADU STATE","IN-TN"],"TELANGANA":["TELANGANA","TELENGANA","TG","T.G","TS","T.S","TELANGANA STATE","IN-TS"],"TRIPURA":["TRIPURA","TR","T.R","IN-TR"],"UTTAR PRADESH":["UTTAR PRADESH","UTTARPRADESH","UTTAR","UP","U.P","U.P.","UP STATE","IN-UP"],"UTTARAKHAND":["UTTARAKHAND","UTTARANCHAL","UK","U.K","UA","UTTARAKHAND STATE","IN-UK"],"WEST BENGAL":["WEST BENGAL","WESTBENGAL","WB","W.B","W.B.","WEST BENGAL STATE","IN-WB"],"ANDAMAN AND NICOBAR ISLANDS":["ANDAMAN AND NICOBAR ISLANDS","ANDAMAN 
NICOBAR","ANDAMAN","NICOBAR","AN","A.N","A & N ISLANDS","IN-AN"],"CHANDIGARH":["CHANDIGARH","CH","C.H","IN-CH","MOHALI","SAS NAGAR","KHARAR","PANCHKULA","ZIRAKPUR"],"DADRA AND NAGAR HAVELI AND DAMAN AND DIU":["DADRA AND NAGAR HAVELI AND DAMAN AND DIU","DADRA NAGAR HAVELI","DAMAN DIU","DN","D.N","DNH","DD","IN-DH"],"DELHI":["DELHI","NEW DELHI","DL","D.L","NCT OF DELHI","NATIONAL CAPITAL TERRITORY OF DELHI","NORTH EAST DELHI","NORTH WEST DELHI","SOUTH EAST DELHI","SOUTH WEST DELHI","SEELAMPUR","SHAHDARA","DWARKA","ROHINI","PITAMPURA","KAROL BAGH","LAJPAT NAGAR","SAKET","JANAKPURI","MAYUR VIHAR","VASANT KUNJ","OKHLA","NOIDA","GREATER NOIDA","FARIDABAD","GHAZIABAD","GHZ","INDIRAPURAM","GURUGRAM","GURGAON","IN-DL"],"JAMMU AND KASHMIR":["JAMMU AND KASHMIR","JAMMU","KASHMIR","JK","J.K","J&K","JAMMU & KASHMIR","IN-JK"],"LADAKH":["LADAKH","LA","L.A","IN-LA"],"LAKSHADWEEP":["LAKSHADWEEP","LAKSHADWEEP ISLANDS","LD","L.D","IN-LD"],"PUDUCHERRY":["PUDUCHERRY","PONDICHERRY","PY","P.Y","IN-PY"]}
6
+
7
+ ADDRESS_MAPPING = {"DIST":["DISTRICT","DIST","DST","DSTR","DT","ZILLA","JILLA","ZILA"],"TALUK":["TALUK","TAL","TALUKA","TQ","TEH","TEHS","TEHSIL","MANDAL","MD"],"VILLAGE":["VILLAGE","VILL","VIL","VLG","GRAMA","GRAM","GAON"],"CITY":["CITY","CTY","TOWN","TWN","NAGAR","NAG","PURAM","PURA"],"STATE":["STATE","ST","RAJYA","PRADESH"],"HOUSE":["HOUSE NO","H NO","H.NO","H-NO","H/NO","HNO","HOUSE NUMBER","HOUSE#","HOUSE NUM","PROPERTY NO","PROPERTY NUMBER","RESIDENCE NO","RES NO","H:NO","H.NO:","D:NO","D.NO:"],"HOUSE":["D NO","D.NO","D-NO","D/NO","DNO","DOOR NO","DOOR NUMBER"],"APT":["APARTMENT","APT","APT NO","APT NUMBER","APARTMENT NO","TOWER","TOWER NO","WING","PHASE","PHASE NO","RESIDENCY","RESIDENTIAL COMPLEX","HEIGHTS","ENCLAVE","APARTMENTS","SOCIETY","SOCIETY NO","CHS"],"BLDG":["BLDG","BLDG NO","BUILDING","BUILDING NO"],"BLOCK":["BLK","BLOCK","BLOCK NO"],"FLAT":["FLAT","FLAT NO","FLAT NUMBER","FLT","FLT NO","UNIT","UNIT NO","UNIT NUMBER","PORTION","PORTION NO","OFFICE NO","OFFICE NUMBER"],"SHOP":["SHOP NO","SHOP NUMBER"],"ROAD":["ROAD","RD","R D","MARG","MRG","PATH"],"STR":["STREET","ST","STR","GALI","GALLLI","LANE","LN","MARG","PATH","CIRCLE","CIR","SECTOR","SEC"],"LANE":["LANE","LN","BYLANE","CROSS","CR"],"EXTN":["EXTENSION","EXT","EXTN"],"LOCALITY":["LOCALITY","LAYOUT","LYT","PHASE","PH","SECTOR","SEC"],"CLNY":["COLONY","COL","CLNY"],"BUILDING":["BUILDING","APT","APARTMENT","BLDG","TOWER"]}
8
+
9
+
10
+ [IDENTIFIERS]
11
+
12
+ APARTMENT_IDENTIFIER=APT
13
+ FLAT_NUMBER_IDENTIFIER=FLAT
14
+ HOUSE_NUMBER_IDENTIFIER= HOUSE
15
+ STREET_KEYWORD=STR
16
+ FLOOR_KEYWORD=FLR
17
+ DOOR_NO_KEYWORD=DOOR
18
+ INDIAN_SURNAMES = ["SINGH","THAKUR","RAJPUT","SHARMA","PANDEY","PANDE","PANDAY","MISHRA","TIWARI","TRIPATHI","DWIVEDI","AWASTHI","GUPTA","AGARWAL","AGGARWAL","BANSAL","GOYAL","MITTAL","VERMA","SAXENA","SRIVASTAVA","NIGAM","RASTOGI","MALHOTRA","KHANNA","ARORA","BHATIA","KOHLI","OBEROI","CHAUDHARY","CHOUDHARY","CHOWDHARY","CHOWDARY","PATEL","SHAH","MEHTA","PANDYA","TRIVEDI","DAVE","DESAI","PAREKH","JOSHI","BHATT","VORA","MODI","KULKARNI","DESHPANDE","APTE","CHITNIS","GOKHALE","PHADKE","NAIK","PAWAR","JADHAV","SHINDE","SAWANT","REDDY","REDDAPPA","REDDIGARI","RAO","NAIDU","RAJU","VARMA","SASTRY","SHASTRI","GOUD","CHOWDARY","CHOUDARY","SETTY","SHETTY","GOWDA","HEGDE","BHAT","PAI","KAMATH","SHENOY","PRABHU","NAYAK","SHETTY","IYER","IYENGAR","AYYAR","PILLAI","MUDALIAR","THEVAR","GOUNDER","CHETTIAR","NADAR","KANNAN","KRISHNAN","RAMAN","SUBRAMANIAN","NAIR","MENON","PILLAI","KURUP","PANICKER","VARMA","NAMBOOTHIRI","NAMBIAR","CHACKO","MATHEW","THOMAS","VARGHESE","DAS","DUTTA","BANERJEE","MUKHERJEE","CHATTERJEE","GHOSH","BOSE","SEN","ROY","SARKAR","MONDAL","MALLICK","MOHANTY","DASH","SAHOO","PATNAIK","NAYAK","PANDA","MUNDA","ORAON","TOPPO","TIRKEY","EKKA","MINZ","YADAV","SAHU","RATHORE","BAGHEL","LODHI","KUSHWAHA","THAKUR","BARUAH","BARUA","GOGOI","SAIKIA","DEKA","HAZARIKA","SANGMA","MARAK","LYNGDOH","LALLAWMA","LALTHANGLIANA","AO","SEMA","KONYAK","SHIMRAY","NINGOMBAM","ALI","HASSAN","KOYA","JAIN","MALIK","SINGHAL","AMBEDKAR","CHAMAR","PANDIT","KAPOOR","CHOPRA","MALIK","SINGHAL","JAIN","PADUKONE"]
19
+
20
+
21
+
22
+ [csv]
23
+ name_variation_standard = data/name_variation_standard.csv
24
+ hno_variation_standard = data/hno_variation_standard.csv
25
+ city_prev_pres = data/city_prev_pres.csv
26
+ state_name_standard = data/state_name_standard.csv
27
+ sur_comm_names = data/sur_comm_names.csv
28
+ pin_city_state = data/pin_city_state.csv
29
+
30
+ [MATCHING_LOGIC]
31
+ MODEL_WEIGHTS = {
32
+ "simple_ratio": 0.0,
33
+ "token_set_ratio": 0.0,
34
+ "partial_ratio": 0.1,
35
+ "w_ratio": 0.1,
36
+ "semantic_score": 0.8
37
+ }
38
+
39
+ MATCHING_RULES = [
40
+
41
+ ([("NAME", 100), ("ZIPCODE", 100), ("ADDRESSLINE", 65)], "NAME >= 100 AND ZIPCODE >= 100 AND ADDRESS >= 65"),
42
+ ([("NAME", 100), ("CITY", 100), ("ADDRESSLINE", 65)], "NAME >= 100 AND CITY >= 100 AND ADDRESS >= 65"),
43
+ ([("NAME", 85), ("LASTNAME", 85), ("BIRTHDATE", 100), ("ADDRESSLINE", 60)], "NAME >= 85 AND LASTNAME >= 85 AND DOB >= 100 AND ADDRESS >= 60"),
44
+ ([("NAME", 85), ("BIRTHDATE", 100), ("ZIPCODE", 100)], "NAME >= 85 AND DOB >= 100 AND ZIPCODE >= 100"),
45
+ ([("NAME", 85), ("BIRTHDATE", 100), ("CITY", 100)], "NAME >= 85 AND DOB >= 100 AND CITY >= 100"),
46
+ ([("NAME", 85), ("ZIPCODE", 100), ("ADDRESSLINE", 60)], "NAME >= 85 AND ZIPCODE >= 100 AND ADDRESS >= 60"),
47
+ ([("NAME", 85), ("CITY", 100), ("ADDRESSLINE", 60)], "NAME >= 85 AND CITY >= 100 AND ADDRESS >= 60"),
48
+ ([("BIRTHDATE", 100), ("ZIPCODE", 100), ("ADDRESSLINE", 65)], "BIRTHDATE >= 100 AND ZIPCODE >= 100 AND ADDRESS >= 65"),
49
+ ([("BIRTHDATE", 100), ("CITY", 100), ("ADDRESSLINE", 65)], "BIRTHDATE >= 100 AND CITY >= 100 AND ADDRESS >= 65"),
50
+ ([("LASTNAME", 85), ("ZIPCODE", 100), ("ADDRESSLINE", 60)], "LASTNAME >= 85 AND ZIPCODE >= 100 AND ADDRESS >= 60"),
51
+ ([("NAME", 85), ("PHONE", 100)], "NAME >= 85 AND PHONE >= 100"),
52
+ ([("BIRTHDATE", 100), ("PHONE", 100)], "BIRTHDATE >= 100 AND PHONE >= 100"),
53
+ ([("BIRTHDATE", 100), ("NAME", 85)], "BIRTHDATE >=100 AND NAME>=85"),
54
+ ([("ADDRESSLINE", 60), ("TAXID", 100)], "ADDRESS >= 60 and PAN >= 100"),
55
+ ([("ADDRESSLINE", 60), ("LICENSEID", 100)], "ADDRESS >= 60 and DRIVING_LICN_NO >= 100"),
56
+ ([("BIRTHDATE", 75), ("PHONE", 100)], "BIRTHDATE >= 75 and PHONE >= 100"),
57
+ ([("BIRTHDATE", 75), ("TAXID", 100)], "BIRTHDATE >= 75 and PAN >= 100"),
58
+ ([("BIRTHDATE", 75), ("LICENSEID", 100)], "BIRTHDATE >= 75 and DRIVING_LICN_NO >= 100"),
59
+ ([("BIRTHDATE", 75), ("PASSPORTID", 100)], "BIRTHDATE >= 75 and PASSPORT_NO >= 100"),
60
+ ([("NAME", 60), ("PASSPORTID", 100)], "NAME >= 60 and PASSPORT_NO >= 100"),
61
+ ([("NAME", 60), ("LICENSEID", 100)], "NAME >= 60 and DRIVING_LICN_NO >= 100"),
62
+ ([("NAME", 60), ("TAXID", 100)], "NAME >= 60 and AADHAR >= 100"),
63
+ ([("NAME", 60), ("AADHAR", 100)], "NAME >= 60 and AADHAR >= 100"),
64
+ ([("NAME", 60), ("PAN", 100)], "NAME >= 60 and PAN >= 100"),
65
+ ([("PHONE", 100)], "PHONE >= 100"),
66
+ ([("LICENSEID", 100)], "DRIVING_LICN_NO >= 100"),
67
+ ([("PASSPORTID", 100)], "PASSPORT_NO >= 100"),
68
+ ([("TAXID", 100)], "AADHAR >= 100"),
69
+ ([("AADHAR", 100)], "AADHAR >= 100"),
70
+ ([("PAN", 100)], "PAN >= 100"),
71
+ ([("EMAIL", 100)], "EMAIL >= 100")
72
+ ]
73
+
74
+
75
+
76
+ [NAME_MATCHING]
77
+ NAME_MODEL_WEIGHTS = {
78
+ "simple_ratio": 0.04,
79
+ "token_set_ratio": 0.04,
80
+ "partial_ratio": 0.04,
81
+ "w_ratio": 0.04,
82
+ "token_sort_ratio": 0.04,
83
+ "semantic_score": 0.7,
84
+ "jaro_winkler": 0.05,
85
+ "indic_soundex": 0.05
86
+ }
87
+
88
+ NAME_MATCH_ADJUSTMENTS = {
89
+ "surname_penalty": -20,
90
+ "initial_boost": 20,
91
+ "subset_boost": 20
92
+ }
93
+
94
+ [ADDRESS_MATCHING]
95
+ ADDRESS_MODEL_WEIGHTS = {
96
+ "simple_ratio": 0.04,
97
+ "token_set_ratio": 0.04,
98
+ "partial_ratio": 0.04,
99
+ "w_ratio": 0.04,
100
+ "token_sort_ratio": 0.04,
101
+ "semantic_score": 0.8
102
+ }
103
+
104
+ ADDRESS_MATCH_ADJUSTMENTS = {
105
+ "house_match_boost": 30,
106
+ "house_mismatch_penalty": 70
107
+ }
108
+
109
+ [EMBEDDING_MODELS]
110
+ MODEL_1_NAME = BAAI/bge-small-en-v1.5
111
+ MODEL_2_NAME = sentence-transformers/gtr-t5-base
backend/matching_service.py ADDED
@@ -0,0 +1,683 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import time
4
+ import logging
5
+ from typing import Dict, Any, Optional, Tuple, Union
6
+
7
+ # Ensure project root is on sys.path so sibling modules resolve
8
+ _PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
9
+ if _PROJECT_ROOT not in sys.path:
10
+ sys.path.insert(0, _PROJECT_ROOT)
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+ from concurrent.futures import ThreadPoolExecutor, as_completed
15
+
16
+ from services.config import (
17
+ config,
18
+ pin_city_state_df,
19
+ ADDRESS_MATCH_ADJUSTMENTS,
20
+ )
21
+
22
+ from services.rules import (
23
+ preprocess_text,
24
+ standardize_column,
25
+ standardize_city,
26
+ standardize_state,
27
+ standardize_dob,
28
+ compare_exact,
29
+ compare_any_match,
30
+ compare_phone_any_match,
31
+ compare_email_any_match,
32
+ evaluate_matching_rules,
33
+
34
+ apply_pattern_matching_logic,
35
+ pincode_similarity_india,
36
+ extract_address_components,
37
+ validate_and_normalize_phone,
38
+ validate_and_normalize_email,
39
+ validate_and_normalize_pan,
40
+ validate_and_normalize_aadhar,
41
+ preprocess_name,
42
+ detect_surnames,
43
+ compute_initial_letter_boost,
44
+ clean_text,
45
+ deduplicate_tokens,
46
+ deduplicate_consecutive_tokens,
47
+ strip_non_alphanumeric,
48
+ )
49
+
50
+ from services.model import (
51
+ match_names_cross_records as embedding_match_names,
52
+ match_single_field as embedding_match_single,
53
+ )
54
+ from services.address_matcher import match_addresses_enhanced
55
+
56
+
57
+
58
+ logger = logging.getLogger("matching_service")
59
+
60
+ # =========================================================
61
+ # SENTINEL
62
+ # =========================================================
63
+ _MISSING = -1 # internal sentinel for "field not provided"
64
+
65
+
66
+ # =========================================================
67
+ # HELPERS
68
+ # =========================================================
69
def _is_valid_value(val: Any) -> bool:
    """Return True when *val* carries real data rather than a placeholder.

    The following are treated as "not provided":
      * ``None``
      * a float NaN (what pandas / NumPy produce for an empty cell)
      * any value whose stripped, lower-cased string form is empty or one of
        the known placeholder tokens ("-", "na", "n/a", "null", "none",
        "missing value", "missing")

    Everything else, including falsy values such as ``0``, counts as valid.
    """
    if val is None:
        return False
    # NaN is the only float that is not equal to itself. Without this guard
    # str(nan) == "nan" would slip through as a genuine value.
    if isinstance(val, float) and val != val:
        return False
    text = str(val).strip().lower()
    # A whitespace-only value collapses to "" after strip(), so no separate
    # " " entry is needed.
    return text not in ("", "-", "na", "n/a", "null", "none", "missing value", "missing")
74
+
75
+
76
def _clean_input(val: str) -> str:
    """Convert placeholder strings to the empty string.

    Returns "" when *val* is falsy (``None``, ""), is whitespace-only, or is
    one of the placeholder tokens "missing value", "missing", "na", "n/a",
    "null", "none" (compared case-insensitively, ignoring surrounding
    whitespace). Any other string is returned unchanged; it is NOT stripped.

    A truthy non-string value (for example a numeric PIN code) has no
    placeholder form and is passed through untouched instead of raising
    ``AttributeError`` on ``.strip()``.
    """
    if not val:
        return ""
    if not isinstance(val, str):
        # Nothing to normalise; leave the decision to downstream standardizers.
        return val
    if val.strip().lower() in ("", "missing value", "missing", "na", "n/a", "null", "none"):
        return ""
    return val
83
+
84
+
85
def _is_field_empty(val: Any) -> bool:
    """Return True when *val* is absent or only a placeholder.

    This is the exact logical inverse of ``_is_valid_value``.
    """
    has_data = _is_valid_value(val)
    return not has_data
87
+
88
+
89
def _normalize_gender(val: Any) -> Optional[str]:
    """Normalize a gender value to a canonical upper-case token.

    Returns:
        ``None`` when *val* is missing or a placeholder (per
        ``_is_valid_value``); "MALE" for m / male / men / man; "FEMALE" for
        f / female / women / woman (case-insensitive, surrounding whitespace
        ignored); otherwise the stripped input upper-cased as-is. Unknown
        values are NOT collapsed into a single "OTHER" bucket.
    """
    if not _is_valid_value(val):
        return None

    token = str(val).strip().lower()
    aliases = {
        "m": "MALE",
        "male": "MALE",
        "men": "MALE",
        "man": "MALE",
        "f": "FEMALE",
        "female": "FEMALE",
        "women": "FEMALE",
        "woman": "FEMALE",
    }
    # Anything outside the alias table is passed through, upper-cased.
    return aliases.get(token, token.upper())
99
+
100
+
101
def _safe_round(val: Any) -> float:
    """Round *val* to two decimals and clamp it to be non-negative.

    Returns ``0.0`` whenever *val* cannot be turned into a usable number:
    ``None``, non-numeric text, an integer too large for a float, or NaN.
    Negative results (including ``-0.0``) are clamped to ``0.0``.
    """
    try:
        rounded = round(float(val), 2)
    except (ValueError, TypeError, OverflowError):
        # OverflowError: float() of an int beyond the double range.
        return 0.0
    # "rounded > 0.0" is False for NaN and for -0.0, so both fall back to a
    # clean 0.0. max(nan, 0.0) would have returned NaN.
    return rounded if rounded > 0.0 else 0.0
107
+
108
+
109
+ # =========================================================
110
+ # FLATTEN: EntityRecord → flat dict
111
+ # =========================================================
112
def flatten_entity_record(record) -> Dict[str, Any]:
    """
    Convert a backend.models.EntityRecord (structured, nested) into the flat
    dictionary consumed by standardize_record() / match_structured_records().

    Mapping:
        scalar attributes  -> same-named keys, each passed through
                              _clean_input() (record.aadhar is stored under
                              the upper-case key "AADHAR")
        addresses[i]       -> addressline_i, city_i, state_i, zipcode_i
        phones[i]          -> phone_i   (stored as-is)
        emails[i]          -> email_i   (stored as-is)
        custom_fields      -> top-level keys, whitespace-stripped with their
                              case preserved; blank keys are dropped

    Indexes start at 0 and are contiguous, which is what the
    get_dynamic_fields() closure in match_structured_records() relies on when
    it walks prefix0, prefix1, ... until the first missing key.

    A collection attribute that is None is treated as empty instead of
    raising.

    NOTE(review): a custom field whose stripped key equals one of the keys
    written above (e.g. "name" or "phone_0") overwrites that entry; confirm
    this is intended.
    """
    flat: Dict[str, Any] = {
        "name": _clean_input(record.name),
        "firstname": _clean_input(record.firstname),
        "middlename": _clean_input(record.middlename),
        "lastname": _clean_input(record.lastname),
        "mothername": _clean_input(record.mothername),
        "fathername": _clean_input(record.fathername),
        "spousename": _clean_input(record.spousename),
        "othername": _clean_input(record.othername),
        "dob": _clean_input(record.dob),
        "gender": _clean_input(record.gender),
        "AADHAR": _clean_input(record.aadhar),
        "pan": _clean_input(record.pan),
        "licenseid": _clean_input(record.licenseid),
        "passportid": _clean_input(record.passportid),
        "voterid": _clean_input(record.voterid),
        "companyname": _clean_input(record.companyname),
        "parentcompanyname": _clean_input(record.parentcompanyname),
    }

    # Tolerate collections that were left as None on the record.
    addresses = record.addresses or []
    phones = record.phones or []
    emails = record.emails or []
    custom_fields = record.custom_fields or {}

    # Addresses -> addressline_0, city_0, ..., addressline_N, city_N, ...
    # Per the original author's note, backend/models.py de-duplicates and
    # drops blank entries before this point (not verifiable from this file).
    for idx, addr in enumerate(addresses):
        flat[f"addressline_{idx}"] = _clean_input(addr.addressline)
        flat[f"city_{idx}"] = _clean_input(addr.city)
        flat[f"state_{idx}"] = _clean_input(addr.state)
        flat[f"zipcode_{idx}"] = _clean_input(addr.zipcode)

    # Phones -> phone_0, phone_1, ... (values are not re-cleaned here).
    for idx, phone in enumerate(phones):
        flat[f"phone_{idx}"] = phone

    # Emails -> email_0, email_1, ... (values are not re-cleaned here).
    for idx, email in enumerate(emails):
        flat[f"email_{idx}"] = email

    # Custom fields -> top-level keys; upper-casing happens later, in
    # standardize_record().
    for key, value in custom_fields.items():
        safe_key = str(key).strip()
        if safe_key:
            flat[safe_key] = value

    logger.debug(
        "flatten_entity_record → %d addresses, %d phones, %d emails",
        len(addresses), len(phones), len(emails),
    )
    return flat
180
+
181
+
182
+ # =========================================================
183
+ # STANDARDIZE: apply preprocessing pipeline to a flat dict
184
+ # =========================================================
185
def standardize_record(raw: Dict[str, Any]) -> Dict[str, Any]:
    """
    Apply the full standardization pipeline to a flat record dict.

    Fixed keys (gender, the name fields, company names, identity documents,
    dob) are always present in the result, defaulting to the standardized
    form of "" when *raw* lacks them. Indexed keys (addressline_N, city_N,
    state_N, zipcode_N, phone_N, email_N) are standardized one by one, so any
    number of entries is supported. Every remaining key is treated as a
    custom field and copied through under its stripped, upper-cased name.

    Per the original author this mirrors app_streamlit.py main()
    (lines 1434-1512); that file is not visible from here.

    Args:
        raw: flat record, e.g. the output of flatten_entity_record().

    Returns:
        A new dict; *raw* is not modified.

    NOTE(review): "othername" is not one of the fixed keys, so when present
    it falls through to the custom-field branch and is stored, unprocessed,
    as "OTHERNAME". Confirm that this is intended.
    """
    processed: Dict[str, Any] = {
        "gender": preprocess_text(raw.get("gender", "")),
        "name": preprocess_name(raw.get("name", "")),
        "firstname": preprocess_name(raw.get("firstname", "")),
        "middlename": preprocess_name(raw.get("middlename", "")),
        "lastname": preprocess_name(raw.get("lastname", "")),
        "spousename": preprocess_name(raw.get("spousename", "")),
        "mothername": preprocess_name(raw.get("mothername", "")),
        "fathername": preprocess_name(raw.get("fathername", "")),
        "companyname": standardize_column(raw.get("companyname", ""), "companyname"),
        "parentcompanyname": standardize_column(raw.get("parentcompanyname", ""), "parentcompanyname"),
        "AADHAR": standardize_column(raw.get("AADHAR", ""), "AADHAR"),
        "pan": standardize_column(raw.get("pan", ""), "pan"),
        "licenseid": standardize_column(raw.get("licenseid", ""), "licenseid"),
        "passportid": standardize_column(raw.get("passportid", ""), "passportid"),
        "voterid": standardize_column(raw.get("voterid", ""), "voterid"),
        "dob": standardize_dob(raw.get("dob", "")),
    }

    # Dynamic fields: handle all N entries automatically.
    for key, value in raw.items():
        # Coerce once so a non-string key is routed to the custom-field
        # branch instead of raising AttributeError on .startswith().
        key_text = str(key)

        if key_text.startswith("addressline_"):
            processed[key] = standardize_column(value, "addressline")
        elif key_text.startswith("city_"):
            processed[key] = standardize_city(value) if _is_valid_value(value) else None
        elif key_text.startswith("state_"):
            processed[key] = standardize_state(value) if _is_valid_value(value) else None
        elif key_text.startswith(("zipcode_", "phone_", "email_")):
            # NOTE(review): the indexed key (e.g. "phone_0") is passed as the
            # column name here, whereas address lines pass the bare
            # "addressline". Presumably standardize_column matches on the
            # prefix; verify against services.rules.
            processed[key] = standardize_column(value, key)
        elif key not in processed:
            # Custom fields: stored under upper-case keys.
            safe_key = key_text.strip()
            if safe_key:
                processed[safe_key.upper()] = value

    return processed
234
+
235
+
236
+
237
+
238
+
239
+ # =========================================================
240
+ # MATCH STRUCTURED RECORDS
241
+ # Extracted from app_streamlit.py lines 912-1250
242
+ # =========================================================
243
+ def match_structured_records(
244
+ r1: Dict[str, Any],
245
+ r2: Dict[str, Any],
246
+ mode: str = "embedding",
247
+ ) -> Dict[str, Any]:
248
+ """
249
+ Core matching engine. Accepts two *standardized* flat record dicts.
250
+ Returns a dict of field β†’ score.
251
+
252
+ Multi-value handling (all already in place β€” no changes needed):
253
+ ─ Addresses : get_dynamic_fields("addressline_") collects all N
254
+ addresslines from both records; embedding_match_addresses
255
+ run best-of-N across all combos.
256
+ ─ Phones : get_dynamic_fields("phone_") + compare_phone_any_match
257
+ β€” any-match across all phone combinations.
258
+ ─ Emails : get_dynamic_fields("email_") + compare_email_any_match
259
+ β€” any-match across all email combinations.
260
+ ─ City/State/Zipcode: cross-compared across all pincode combinations
261
+ via the nested loop (i, j) β€” already handles N pincodes.
262
+ """
263
+
264
+ # ── helpers (closures) ──────────────────────────────────────────────
265
+ def get_dynamic_fields(record: Dict, prefix: str):
266
+ """
267
+ Extract all dynamic fields with given prefix from record.
268
+ Works for any N: phone_0, phone_1, …, phone_N.
269
+ """
270
+ fields = []
271
+ i = 0
272
+ while True:
273
+ key = f"{prefix}{i}"
274
+ if key in record:
275
+ fields.append(record.get(key))
276
+ i += 1
277
+ else:
278
+ break
279
+ return fields
280
+
281
+ def lookup_pincode_df(pincode, df):
282
+ if not _is_valid_value(pincode):
283
+ return None, None
284
+ row = df.loc[df["pincode"].astype(str) == str(pincode)]
285
+ if not row.empty:
286
+ return row.iloc[0]["districtname"], row.iloc[0]["statename"]
287
+ return None, None
288
+
289
+ # ── geo / pincode enrichment ─────────────────────────────────────────
290
+ city_match = 0
291
+ state_match = 0
292
+ zipcode_match = 0
293
+
294
+ pincode_match_found = False
295
+ best_pincode_score = 0
296
+
297
+ # Use pre-loaded CSV DataFrame instead of MySQL
298
+ pincode_df = pin_city_state_df
299
+
300
+ r1_pincodes = get_dynamic_fields(r1, "zipcode_")
301
+ r2_pincodes = get_dynamic_fields(r2, "zipcode_")
302
+ r1_cities_user = get_dynamic_fields(r1, "city_")
303
+ r2_cities_user = get_dynamic_fields(r2, "city_")
304
+ r1_states_user = get_dynamic_fields(r1, "state_")
305
+ r2_states_user = get_dynamic_fields(r2, "state_")
306
+
307
+ r1_cities, r2_cities = [], []
308
+ r1_states, r2_states = [], []
309
+
310
+ # Cross-compare ALL pincode combinations (i Γ— j) β€” handles N pincodes
311
+ for i, r1_pincode in enumerate(r1_pincodes):
312
+ for j, r2_pincode in enumerate(r2_pincodes):
313
+ r1_city_user = r1_cities_user[i] if i < len(r1_cities_user) else None
314
+ r2_city_user = r2_cities_user[j] if j < len(r2_cities_user) else None
315
+ r1_state_user = r1_states_user[i] if i < len(r1_states_user) else None
316
+ r2_state_user = r2_states_user[j] if j < len(r2_states_user) else None
317
+
318
+ if _is_valid_value(r1_pincode) and _is_valid_value(r2_pincode):
319
+ # --- CSV lookup first ---
320
+ r1_city_df, r1_state_df = lookup_pincode_df(r1_pincode, pincode_df)
321
+ r2_city_df, r2_state_df = lookup_pincode_df(r2_pincode, pincode_df)
322
+
323
+ # --- Pincode similarity (for scoring) ---
324
+ if r1_pincode == r2_pincode:
325
+ pincode_match_found = True
326
+ best_pincode_score = max(best_pincode_score, 100)
327
+ else:
328
+ # Different pincodes β€” pincode score = 0 (exact match or nothing)
329
+ pass
330
+
331
+ # --- City/State enrichment: prefer CSV, then pgeocode fallback ---
332
+ got_r1_geo = bool(r1_city_df and r1_state_df)
333
+ got_r2_geo = bool(r2_city_df and r2_state_df)
334
+
335
+ if got_r1_geo:
336
+ r1_cities.append(standardize_city(r1_city_df))
337
+ r1_states.append(standardize_state(r1_state_df))
338
+ if got_r2_geo:
339
+ r2_cities.append(standardize_city(r2_city_df))
340
+ r2_states.append(standardize_state(r2_state_df))
341
+
342
+ # pgeocode fallback for any pin not found in CSV
343
+ got_r1_pgeo = False
344
+ got_r2_pgeo = False
345
+ if not got_r1_geo or not got_r2_geo:
346
+ pin_result = pincode_similarity_india(r1_pincode, r2_pincode)
347
+ if not got_r1_geo:
348
+ if pin_result.get("pin1_county_name"):
349
+ r1_cities.append(standardize_city(pin_result["pin1_county_name"]))
350
+ got_r1_pgeo = True
351
+ if pin_result.get("pin1_state_name"):
352
+ r1_states.append(standardize_state(pin_result["pin1_state_name"]))
353
+ got_r1_pgeo = True
354
+ if not got_r2_geo:
355
+ if pin_result.get("pin2_county_name"):
356
+ r2_cities.append(standardize_city(pin_result["pin2_county_name"]))
357
+ got_r2_pgeo = True
358
+ if pin_result.get("pin2_state_name"):
359
+ r2_states.append(standardize_state(pin_result["pin2_state_name"]))
360
+ got_r2_pgeo = True
361
+
362
+ # If STILL no geo for a pin, fall back to user-entered city/state
363
+ if not got_r1_geo and not got_r1_pgeo:
364
+ if _is_valid_value(r1_city_user):
365
+ r1_cities.append(standardize_city(r1_city_user))
366
+ if _is_valid_value(r1_state_user):
367
+ r1_states.append(standardize_state(r1_state_user))
368
+ if not got_r2_geo and not got_r2_pgeo:
369
+ if _is_valid_value(r2_city_user):
370
+ r2_cities.append(standardize_city(r2_city_user))
371
+ if _is_valid_value(r2_state_user):
372
+ r2_states.append(standardize_state(r2_state_user))
373
+ else:
374
+ if _is_valid_value(r1_city_user):
375
+ r1_cities.append(standardize_city(r1_city_user))
376
+ if _is_valid_value(r2_city_user):
377
+ r2_cities.append(standardize_city(r2_city_user))
378
+ if _is_valid_value(r1_state_user):
379
+ r1_states.append(standardize_state(r1_state_user))
380
+ if _is_valid_value(r2_state_user):
381
+ r2_states.append(standardize_state(r2_state_user))
382
+
383
+ # Fallback: user-entered cities/states if no pincodes provided
384
+ if not r1_pincodes or not r2_pincodes:
385
+ for city_val in r1_cities_user:
386
+ if _is_valid_value(city_val) and standardize_city(city_val) not in r1_cities:
387
+ r1_cities.append(standardize_city(city_val))
388
+ for city_val in r2_cities_user:
389
+ if _is_valid_value(city_val) and standardize_city(city_val) not in r2_cities:
390
+ r2_cities.append(standardize_city(city_val))
391
+ for state_val in r1_states_user:
392
+ if _is_valid_value(state_val) and standardize_state(state_val) not in r1_states:
393
+ r1_states.append(standardize_state(state_val))
394
+ for state_val in r2_states_user:
395
+ if _is_valid_value(state_val) and standardize_state(state_val) not in r2_states:
396
+ r2_states.append(standardize_state(state_val))
397
+
398
+ # City / State / Pincode scoring
399
+ if r1_cities and r2_cities:
400
+ if any(c1 == c2 for c1 in r1_cities for c2 in r2_cities):
401
+ city_match = 100
402
+ elif pincode_match_found:
403
+ city_match = 100
404
+
405
+ if r1_states and r2_states:
406
+ if any(s1 == s2 for s1 in r1_states for s2 in r2_states):
407
+ state_match = 100
408
+
409
+ zipcode_match = compare_any_match(r1_pincodes, r2_pincodes, field_type="pincode")
410
+
411
+ # ── Phone / Email matching ───────────────────────────────────────────
412
+ # compare_phone_any_match / compare_email_any_match already handle
413
+ # lists of any length β€” any-match strategy.
414
+ r1_phones = get_dynamic_fields(r1, "phone_")
415
+ r2_phones = get_dynamic_fields(r2, "phone_")
416
+ phone_match = compare_phone_any_match(r1_phones, r2_phones)
417
+
418
+ r1_emails = get_dynamic_fields(r1, "email_")
419
+ r2_emails = get_dynamic_fields(r2, "email_")
420
+ email_match = compare_email_any_match(r1_emails, r2_emails)
421
+
422
+ logger.debug(
423
+ "match_structured_records β€” phones R1=%s R2=%s | emails R1=%s R2=%s | "
424
+ "addresses R1=%d R2=%d",
425
+ r1_phones, r2_phones, r1_emails, r2_emails,
426
+ len(get_dynamic_fields(r1, "addressline_")),
427
+ len(get_dynamic_fields(r2, "addressline_")),
428
+ )
429
+
430
+ # ── Name / Address / Single-field matching (mode-dependent) ─────────
431
+
432
+ name_match = embedding_match_names(
433
+ r1.get("name"), r1.get("firstname"), r1.get("lastname"), r1.get("middlename"),
434
+ r2.get("name"), r2.get("firstname"), r2.get("lastname"), r2.get("middlename"),
435
+ )
436
+
437
+ r1_addrs = get_dynamic_fields(r1, "addressline_")
438
+ r2_addrs = get_dynamic_fields(r2, "addressline_")
439
+
440
+ # ── Enhanced address matching with full pipeline ──
441
+ address_match = match_addresses_enhanced(r1_addrs, r2_addrs)
442
+
443
+ spousename_match = embedding_match_single(r1.get("spousename"), r2.get("spousename"))
444
+ mothername_match = embedding_match_single(r1.get("mothername"), r2.get("mothername"))
445
+ fathername_match = embedding_match_single(r1.get("fathername"), r2.get("fathername"))
446
+ companyname_match = embedding_match_single(r1.get("companyname"), r2.get("companyname"))
447
+ parentcompanyname_match = embedding_match_single(r1.get("parentcompanyname"), r2.get("parentcompanyname"))
448
+
449
+ # ── Exact matching ───────────────────────────────────────────────────
450
+ g1 = _normalize_gender(r1.get("gender"))
451
+ g2 = _normalize_gender(r2.get("gender"))
452
+ if not g1 and not g2:
453
+ gender_match = ""
454
+ elif g1 and g2 and g1 == g2:
455
+ gender_match = 100
456
+ else:
457
+ gender_match = 0
458
+
459
+ aadhar_match = compare_exact(r1.get("AADHAR"), r2.get("AADHAR"))
460
+ pan_match = compare_exact(r1.get("pan"), r2.get("pan"))
461
+ licenseid_match = compare_exact(r1.get("licenseid"), r2.get("licenseid"))
462
+ passportid_match = compare_exact(r1.get("passportid"), r2.get("passportid"))
463
+ voterid_match = compare_exact(r1.get("voterid"), r2.get("voterid"))
464
+ dob_match = compare_exact(r1.get("dob"), r2.get("dob"))
465
+
466
+ # ── Assemble results ─────────────────────────────────────────────────
467
+ results = {
468
+ "GENDER": apply_pattern_matching_logic("GENDER", gender_match) if gender_match != "" else _MISSING,
469
+ "NAME": name_match["full_name_percent"] if name_match is not None else _MISSING,
470
+ "FIRSTNAME": name_match["firstname_percent"] if name_match is not None else _MISSING,
471
+ "MIDDLENAME": name_match["middlename_percent"] if name_match is not None else _MISSING,
472
+ "LASTNAME": name_match["lastname_percent"] if name_match is not None else _MISSING,
473
+ "SPOUSENAME": apply_pattern_matching_logic("SPOUSENAME", spousename_match) if spousename_match != "" else _MISSING,
474
+ "MOTHERNAME": apply_pattern_matching_logic("MOTHERNAME", mothername_match) if mothername_match != "" else _MISSING,
475
+ "FATHERNAME": apply_pattern_matching_logic("FATHERNAME", fathername_match) if fathername_match != "" else _MISSING,
476
+ "COMPANYNAME": apply_pattern_matching_logic("COMPANYNAME", companyname_match) if companyname_match != "" else _MISSING,
477
+ "PARENTCOMPANYNAME":apply_pattern_matching_logic("PARENTCOMPANYNAME",parentcompanyname_match)if parentcompanyname_match!= "" else _MISSING,
478
+ "AADHAR": apply_pattern_matching_logic("AADHAR", aadhar_match) if aadhar_match != "" else _MISSING,
479
+ "PAN": apply_pattern_matching_logic("PAN", pan_match) if pan_match != "" else _MISSING,
480
+ "LICENSEID": apply_pattern_matching_logic("LICENSEID", licenseid_match) if licenseid_match != "" else _MISSING,
481
+ "PASSPORTID": apply_pattern_matching_logic("PASSPORTID", passportid_match) if passportid_match != "" else _MISSING,
482
+ "VOTERID": apply_pattern_matching_logic("TAXID", voterid_match) if voterid_match != "" else _MISSING,
483
+ "ADDRESSLINE": apply_pattern_matching_logic("ADDRESSLINE", address_match) if address_match != "" else _MISSING,
484
+ "BIRTHDATE": apply_pattern_matching_logic("BIRTHDATE", dob_match) if dob_match != "" else _MISSING,
485
+ "PHONE": apply_pattern_matching_logic("PHONE", phone_match) if phone_match != "" else _MISSING,
486
+ "EMAIL": apply_pattern_matching_logic("EMAIL", email_match) if email_match != "" else _MISSING,
487
+ "CITY": apply_pattern_matching_logic("CITY", city_match) if city_match != "" else _MISSING,
488
+ "STATE": apply_pattern_matching_logic("STATE", state_match) if state_match != "" else _MISSING,
489
+ "ZIPCODE": apply_pattern_matching_logic("ZIPCODE", zipcode_match) if zipcode_match != "" else _MISSING,
490
+ }
491
+
492
+ # ── Custom field matching ────────────────────────────────────────────
493
+ known_keys = {
494
+ "name", "firstname", "middlename", "lastname", "spousename",
495
+ "mothername", "fathername", "dob", "gender", "AADHAR", "pan",
496
+ "licenseid", "passportid", "voterid", "companyname", "parentcompanyname",
497
+ }
498
+ all_keys = set(r1.keys()) | set(r2.keys())
499
+ for key in all_keys:
500
+ key_str = str(key)
501
+ if key_str in known_keys:
502
+ continue
503
+ if any(key_str.startswith(p) for p in (
504
+ "zipcode_", "city_", "state_", "phone_", "email_", "addressline_"
505
+ )):
506
+ continue
507
+ val1 = r1.get(key) or r1.get(key_str.upper())
508
+ val2 = r2.get(key) or r2.get(key_str.upper())
509
+ if val1 or val2:
510
+ score = compare_exact(val1, val2)
511
+ results[key_str.upper()] = score
512
+
513
+ return results
514
+
515
+
516
+ # =========================================================
517
+ # FORMAT RESULTS
518
+ # =========================================================
519
def format_results(
    field_results: Dict[str, Any],
    r1_processed: Dict[str, Any],
    r2_processed: Dict[str, Any],
    mode: str,
) -> Dict[str, Any]:
    """
    Convert raw field scores into the final response format.

    A field is reported as ``"missing value"`` when neither processed record
    carries a non-empty input for it; otherwise its score is rounded with
    ``_safe_round``.

    Dynamic prefixes (addressline_, phone_, email_, city_, state_, zipcode_)
    are scanned from both processed records so missing-value detection works
    regardless of how many entries are present in each record.

    NAME is checked against the full-name input AND the first/middle/last
    name inputs, so a valid score computed from the name parts is not
    overwritten with "missing value".

    CITY and STATE count as present whenever any zipcode input is present,
    even if no city_*/state_* key exists at all, because the matcher can
    derive city/state from the pincode.

    Args:
        field_results: Raw per-field scores keyed by result name (e.g. "NAME").
        r1_processed: Standardized flat dict for the first record.
        r2_processed: Standardized flat dict for the second record.
        mode: Matching mode. In "embedding" mode an absent or unroundable
            score becomes "missing value"; in any other mode an absent score
            becomes 0.0.

    Returns:
        Dict mapping each known result key to a rounded score or the string
        "missing value".
    """
    # NOTE(review): only the keys listed below (plus the dynamic ones) are
    # emitted; any additional keys present in field_results are not included
    # in the output - confirm that this is intended.

    # Static result key -> input key mappings.
    field_to_inputs: Dict[str, list] = {
        "GENDER":           [("gender",)],
        "NAME":             [("name",), ("firstname",), ("middlename",), ("lastname",)],
        "FIRSTNAME":        [("firstname",)],
        "MIDDLENAME":       [("middlename",)],
        "LASTNAME":         [("lastname",)],
        "SPOUSENAME":       [("spousename",)],
        "MOTHERNAME":       [("mothername",)],
        "FATHERNAME":       [("fathername",)],
        "COMPANYNAME":      [("companyname",)],
        "PARENTCOMPANYNAME": [("parentcompanyname",)],
        "AADHAR":           [("AADHAR",)],
        "PAN":              [("pan",)],
        "LICENSEID":        [("licenseid",)],
        "PASSPORTID":       [("passportid",)],
        "VOTERID":          [("voterid",)],
        "BIRTHDATE":        [("dob",)],
    }

    # Dynamic result keys: collect every matching input key from BOTH records.
    # dict.fromkeys de-duplicates keys shared by the two records while
    # keeping first-seen order.
    dynamic_prefixes = (
        ("addressline_", "ADDRESSLINE"),
        ("phone_", "PHONE"),
        ("email_", "EMAIL"),
        ("city_", "CITY"),
        ("state_", "STATE"),
        ("zipcode_", "ZIPCODE"),
    )
    for prefix, result_key in dynamic_prefixes:
        input_keys = dict.fromkeys(
            k for k in (*r1_processed, *r2_processed) if k.startswith(prefix)
        )
        field_to_inputs[result_key] = [(k,) for k in input_keys]

    def has_value(field_key: str) -> bool:
        """Return True if either record holds a non-empty value for the key."""
        return not (
            _is_field_empty(r1_processed.get(field_key))
            and _is_field_empty(r2_processed.get(field_key))
        )

    def check_missing(result_key: str) -> bool:
        """
        Return True only if BOTH records have no valid data for this field.
        For multi-value fields, any single valid value in either record means
        the field is NOT missing.
        """
        input_defs = field_to_inputs.get(result_key)
        if input_defs is None:
            # Unknown result key: fall back to the raw score itself.
            return field_results.get(result_key) == _MISSING

        # CITY/STATE are considered present if a ZIPCODE was provided. This
        # must run BEFORE the "no input keys" shortcut, otherwise records
        # that carry only zipcodes would always report CITY/STATE as missing.
        if result_key in ("CITY", "STATE"):
            zipcode_defs = field_to_inputs.get("ZIPCODE", [])
            if any(has_value(field_key) for (field_key,) in zipcode_defs):
                return False

        # An empty input_defs list falls through to True (nothing provided).
        return not any(has_value(field_key) for (field_key,) in input_defs)

    formatted_scores: Dict[str, Any] = {}

    for k in field_to_inputs:
        v = field_results.get(k, _MISSING)

        if check_missing(k):
            formatted_scores[k] = "missing value"
        elif mode == "embedding":
            if v == _MISSING:
                formatted_scores[k] = "missing value"
            else:
                try:
                    formatted_scores[k] = _safe_round(v)
                except (ValueError, TypeError):
                    # A score that cannot be rounded is reported as missing.
                    formatted_scores[k] = "missing value"
        else:
            formatted_scores[k] = 0.0 if v == _MISSING else _safe_round(v)

    return formatted_scores
627
+
628
+
629
+ # =========================================================
630
+ # PUBLIC backend — single match
631
+ # =========================================================
632
def perform_match(record1, record2, mode: str = "embedding") -> Dict[str, Any]:
    """
    Run the complete matching pipeline for one pair of records.

    The pipeline is: flatten -> standardize -> match -> evaluate rules ->
    format, with the wall-clock duration measured around all of it.

    Args:
        record1: backend.models.EntityRecord (Pydantic model)
        record2: backend.models.EntityRecord (Pydantic model)
        mode:    "embedding"

    Returns:
        Dict with keys: overall_decision, reason, field_scores, mode,
        processing_time_ms
    """
    started = time.perf_counter()

    # Flatten the nested Pydantic models into flat dicts.
    flat_first = flatten_entity_record(record1)
    flat_second = flatten_entity_record(record2)
    logger.info(
        "Flattened records β€” R1 keys: %s | R2 keys: %s",
        list(flat_first.keys()), list(flat_second.keys()),
    )

    # Standardize both flat records.
    r1_processed = standardize_record(flat_first)
    r2_processed = standardize_record(flat_second)
    logger.info("Standardized records β€” mode=%s", mode)

    # Score every field, then derive the overall decision from the raw scores.
    raw_scores = match_structured_records(r1_processed, r2_processed, mode=mode)
    decision, explanation = evaluate_matching_rules(raw_scores)

    # Shape the raw scores for the response.
    scores_out = format_results(raw_scores, r1_processed, r2_processed, mode)

    duration_ms = (time.perf_counter() - started) * 1000

    response: Dict[str, Any] = {}
    response["overall_decision"] = decision
    response["reason"] = explanation
    response["field_scores"] = scores_out
    response["mode"] = mode
    response["processing_time_ms"] = round(duration_ms, 2)
    return response
682
+
683
+
backend/models.py ADDED
@@ -0,0 +1,536 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field, field_validator, model_validator
2
+ from typing import Dict, List, Optional, Any, Union
3
+ from enum import Enum
4
+
5
+
6
+ # =========================================================
7
+ # ENUM
8
+ # =========================================================
9
class MatchingMode(str, Enum):
    """Matching modes accepted by the API (string-valued enum)."""

    # The only mode defined in this enum.
    EMBEDDING = "embedding"
12
+
13
+
14
+ # =========================================================
15
+ # CONSTANTS
16
+ # =========================================================
17
# Lower-cased strings that callers send to mean "no value".
MISSING_PLACEHOLDERS = {
    "missing value",
    "missing",
    "na",
    "n/a",
    "null",
    "none",
    "-",
}

# ---------------------------------------------------------------------------
# Flat-format key -> EntityRecord field name mapping.
# To support a new flat key in future, just add it here.
#
# Scalar keys map to the lower-cased field name. Contact and address keys map
# to internal "_<name>_flat" markers, which the flat->nested converter
# collects into the phones / emails lists and into addresses[0].
# ---------------------------------------------------------------------------
_FLAT_KEY_MAP: Dict[str, str] = {
    # Personal identifiers and ID documents: target is the lower-cased key.
    **{
        key: key.lower()
        for key in (
            "GENDER",
            "NAME",
            "FIRSTNAME",
            "MIDDLENAME",
            "LASTNAME",
            "SPOUSENAME",
            "MOTHERNAME",
            "FATHERNAME",
            "COMPANYNAME",
            "PARENTCOMPANYNAME",
            "AADHAR",
            "PAN",
            "LICENSEID",
            "PASSPORTID",
            "VOTERID",
        )
    },
    # Date of birth: two accepted spellings, one target field.
    "BIRTHDATE": "dob",
    "DOB": "dob",
    # Contact and address components: internal "_<name>_flat" markers.
    **{
        key: f"_{key.lower()}_flat"
        for key in ("PHONE", "EMAIL", "ADDRESSLINE", "CITY", "STATE", "ZIPCODE")
    },
}

# Markers whose values are merged into the single flat-format address.
_FLAT_ADDRESS_KEYS = {
    f"_{part}_flat" for part in ("addressline", "city", "state", "zipcode")
}
55
+
56
+
57
def _is_placeholder(val: Any) -> bool:
    """
    Tell whether *val* stands for "no value".

    ``None`` always counts. Any other value is converted with ``str()``,
    stripped and lower-cased, then looked up in MISSING_PLACEHOLDERS.
    """
    return val is None or str(val).strip().lower() in MISSING_PLACEHOLDERS
62
+
63
+
64
def _normalize_flat_to_nested(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert a flat-format record into the nested EntityRecord layout.

    * If *data* already has an 'addresses', 'phones' or 'emails' key it is
      treated as nested and returned unchanged (same object).
    * If none of its keys (compared case-insensitively) is a known flat key,
      it is also returned unchanged so Pydantic can validate it as-is.
    * Otherwise a new dict is built: scalar keys are renamed via
      _FLAT_KEY_MAP, PHONE / EMAIL become one-element lists, and the address
      components are merged into a single entry under 'addresses'.
      Placeholder values of known keys are dropped; unknown keys are copied
      through untouched.
    """
    # Fast path: the payload is already nested.
    if any(marker in data for marker in ("addresses", "phones", "emails")):
        return data

    # Not recognisably flat: hand the payload to Pydantic unchanged.
    if not any(key.upper() in _FLAT_KEY_MAP for key in data):
        return data

    converted: Dict[str, Any] = {}
    address: Dict[str, str] = {}
    contacts: Dict[str, List[str]] = {"_phone_flat": [], "_email_flat": []}

    for key, value in data.items():
        field = _FLAT_KEY_MAP.get(key.upper())

        if field is None:
            # Unknown key: keep it under its original spelling.
            converted[key] = value
        elif _is_placeholder(value):
            # Known key carrying a "missing" marker: drop it.
            continue
        elif field in contacts:
            contacts[field].append(str(value).strip())
        elif field in _FLAT_ADDRESS_KEYS:
            # "_city_flat" -> "city", and likewise for the other components.
            part = field.replace("_flat", "").lstrip("_")
            address[part] = str(value).strip()
        else:
            converted[field] = value

    # Attach the collected multi-value fields only when something was found.
    if address:
        converted["addresses"] = [address]
    if contacts["_phone_flat"]:
        converted["phones"] = contacts["_phone_flat"]
    if contacts["_email_flat"]:
        converted["emails"] = contacts["_email_flat"]

    return converted
118
+
119
+
120
+ # =========================================================
121
+ # REQUEST MODELS
122
+ # =========================================================
123
class AddressRecord(BaseModel):
    """One postal address; every component defaults to an empty string."""

    addressline: str = Field(default="", description="Street address")
    city: str = Field(default="", description="City name")
    state: str = Field(default="", description="State name")
    zipcode: str = Field(default="", description="6-digit postal code (pincode)")

    @model_validator(mode="before")
    @classmethod
    def strip_address_placeholders(cls, values: Any) -> Any:
        """Blank out placeholder values before field validation runs."""
        if not isinstance(values, dict):
            # Non-dict input is left for Pydantic to handle.
            return values
        cleaned = {}
        for key, raw in values.items():
            cleaned[key] = "" if _is_placeholder(raw) else raw
        return cleaned

    def is_empty(self) -> bool:
        """Report whether all four components are blank (a ghost entry)."""
        return not (self.addressline or self.city or self.state or self.zipcode)
144
+
145
+
146
class EntityRecord(BaseModel):
    """
    A single entity record with all possible fields.
    All fields are optional - only provided fields are matched.

    Multi-value fields
    ------------------
    addresses : List[AddressRecord]
        Send as many addresses as needed. Exact duplicates and all-blank
        entries are removed automatically. The matching service reads them
        through get_dynamic_fields and scores them with
        match_addresses_enhanced.

    phones : List[str]
        Send as many phone numbers as needed. Duplicates and placeholder
        strings are removed automatically. Matching uses
        compare_phone_any_match (any-match across all phones).

    emails : List[str]
        Same as phones (values are additionally lower-cased); matching uses
        compare_email_any_match.

    Input formats
    -------------
    Accepts BOTH the nested format and the flat uppercase-key format.
    Flat keys are transparently converted to nested via handle_flat_format.

    Missing values
    --------------
    ``None`` and the strings in MISSING_PLACEHOLDERS are treated as "not
    provided" for scalar fields (-> "") and for phones / emails (-> []).
    """

    # ---- Name fields -------------------------------------------------------
    name: str = Field(default="", description="Full name")
    firstname: str = Field(default="", description="First name")
    middlename: str = Field(default="", description="Middle name")
    lastname: str = Field(default="", description="Last name")

    # ---- Related person names ----------------------------------------------
    mothername: str = Field(default="", description="Mother's name")
    fathername: str = Field(default="", description="Father's name")
    spousename: str = Field(default="", description="Spouse's name")
    othername: str = Field(default="", description="Other/alias name")

    # ---- Personal info -----------------------------------------------------
    dob: str = Field(default="", description="Date of birth (various formats accepted)")
    gender: str = Field(default="", description="Gender (M/F/Male/Female/Other)")

    # ---- Identity documents ------------------------------------------------
    aadhar: str = Field(default="", alias="AADHAR", description="Aadhar number (12 digits)")
    pan: str = Field(default="", description="PAN number (AAAAA9999A)")
    licenseid: str = Field(default="", description="Driving license number")
    passportid: str = Field(default="", description="Passport number")
    voterid: str = Field(default="", description="Voter ID")

    # ---- Addresses - N entries supported -----------------------------------
    addresses: List[AddressRecord] = Field(
        default_factory=list,
        description=(
            "List of addresses. Send any number β€” duplicates and blank entries "
            "are removed. Matching uses best-of-N across all combinations."
        )
    )

    # ---- Contact - N entries supported -------------------------------------
    phones: List[str] = Field(
        default_factory=list,
        description=(
            "List of phone numbers. Send any number β€” duplicates and placeholders "
            "are removed. Matching uses any-match (match if any pair matches)."
        )
    )
    emails: List[str] = Field(
        default_factory=list,
        description=(
            "List of email addresses. Send any number β€” duplicates and placeholders "
            "are removed. Matching uses any-match."
        )
    )

    # ---- Employment --------------------------------------------------------
    companyname: str = Field(default="", description="Company/employer name")
    parentcompanyname: str = Field(default="", description="Parent company name")

    # ---- Custom fields -----------------------------------------------------
    custom_fields: Dict[str, str] = Field(
        default_factory=dict,
        description="Arbitrary key-value pairs for exact matching (e.g. MemberID, AccountNumber)"
    )

    # -- model_validator: runs BEFORE individual field validators -----------
    @model_validator(mode="before")
    @classmethod
    def handle_flat_format(cls, values: Any) -> Any:
        """
        Transparently convert flat-format records (uppercase keys like
        ADDRESSLINE, BIRTHDATE, PHONE ...) into the nested format.
        Already-nested data is returned unchanged.
        """
        if isinstance(values, dict):
            return _normalize_flat_to_nested(values)
        return values

    # -- Scalar field placeholder cleanup -----------------------------------
    @field_validator(
        "name", "firstname", "middlename", "lastname",
        "mothername", "fathername", "spousename", "othername",
        "dob", "gender", "aadhar", "pan", "licenseid",
        "passportid", "voterid", "companyname", "parentcompanyname",
        mode="before"
    )
    @classmethod
    def strip_missing_placeholders(cls, v):
        """
        Convert placeholder values to an empty string.

        Uses _is_placeholder so that None is handled the same way as in the
        flat-format converter and in AddressRecord, instead of failing the
        ``str`` validation.
        """
        return "" if _is_placeholder(v) else v

    # -- phones: deduplicate + strip placeholders ---------------------------
    @field_validator("phones", mode="before")
    @classmethod
    def clean_phones(cls, v):
        """Drop blanks, placeholders and exact duplicates, keeping input order."""
        if v is None:
            # An explicit null means "no phones provided".
            return []
        if not isinstance(v, list):
            return v
        seen, result = set(), []
        for item in v:
            s = str(item).strip()
            if s and s.lower() not in MISSING_PLACEHOLDERS and s not in seen:
                seen.add(s)
                result.append(s)
        return result

    # -- emails: deduplicate + strip placeholders ---------------------------
    @field_validator("emails", mode="before")
    @classmethod
    def clean_emails(cls, v):
        """Lower-case, then drop blanks, placeholders and duplicates, keeping order."""
        if v is None:
            # An explicit null means "no emails provided".
            return []
        if not isinstance(v, list):
            return v
        seen, result = set(), []
        for item in v:
            s = str(item).strip().lower()
            if s and s not in MISSING_PLACEHOLDERS and s not in seen:
                seen.add(s)
                result.append(s)
        return result

    # -- addresses: remove empty entries + deduplicate ----------------------
    @field_validator("addresses", mode="after")
    @classmethod
    def clean_addresses(cls, v: List[AddressRecord]) -> List[AddressRecord]:
        """
        Remove all-blank address entries and deduplicate by the
        (addressline, city, state, zipcode) tuple. Text components are
        compared stripped and case-insensitively; the first occurrence of
        each distinct address is kept.
        """
        seen, result = set(), []
        for addr in v:
            if addr.is_empty():
                continue
            key = (
                addr.addressline.strip().lower(),
                addr.city.strip().lower(),
                addr.state.strip().lower(),
                addr.zipcode.strip(),
            )
            if key not in seen:
                seen.add(key)
                result.append(addr)
        return result

    model_config = {
        # Accept both the field name and its alias on input.
        "populate_by_name": True,
        # Every field also answers to its upper-cased name (e.g. "NAME").
        "alias_generator": str.upper,
        "json_schema_extra": {
            "examples": [
                # Nested format: multiple addresses + phones
                {
                    "name": "RAJESH KUMAR SHARMA",
                    "firstname": "RAJESH",
                    "dob": "15-01-1990",
                    "aadhar": "234567890123",
                    "addresses": [
                        {
                            "addressline": "123 MG Road, Koramangala",
                            "city": "Bangalore",
                            "state": "Karnataka",
                            "zipcode": "560034"
                        },
                        {
                            "addressline": "45 Brigade Road",
                            "city": "Bangalore",
                            "state": "Karnataka",
                            "zipcode": "560025"
                        }
                    ],
                    "phones": ["9876543210", "9123456789"],
                    "emails": ["rajesh@example.com"]
                },
                # Flat format (single address/phone/email)
                {
                    "NAME": "RAJESH KUMAR SHARMA",
                    "BIRTHDATE": "15-01-1990",
                    "AADHAR": "234567890123",
                    "ADDRESSLINE": "123 MG Road, Koramangala",
                    "CITY": "Bangalore",
                    "STATE": "Karnataka",
                    "ZIPCODE": "560034",
                    "PHONE": "9876543210",
                    "EMAIL": "rajesh@example.com"
                }
            ]
        }
    }
353
+
354
+
355
# Example payloads published in the MatchRequest JSON schema.
_MATCH_REQUEST_EXAMPLES = [
    # Example 1: nested format with multiple addresses and phones.
    {
        "mode": "embedding",
        "record1": {
            "NAME": "RAJESH KUMAR SHARMA",
            "dob": "15-01-1990",
            "phones": ["9876543210", "9123456789"],
            "emails": ["rajesh@example.com"],
            "addresses": [
                {
                    "addressline": "123 MG Road",
                    "city": "Bangalore",
                    "state": "Karnataka",
                    "zipcode": "560034",
                },
                {
                    "addressline": "45 Brigade Road",
                    "city": "Bangalore",
                    "state": "Karnataka",
                    "zipcode": "560025",
                },
            ],
        },
        "record2": {
            "NAME": "RAJESH K SHARMA",
            "dob": "15/01/1990",
            "phones": ["9876543210"],
            "emails": ["rajesh@example.com"],
            "addresses": [
                {
                    "addressline": "123 Mahatma Gandhi Rd",
                    "city": "Bengaluru",
                    "state": "KA",
                    "zipcode": "560034",
                },
                {
                    "addressline": "45 Brigade Road",
                    "city": "Bangalore",
                    "state": "Karnataka",
                    "zipcode": "560025",
                },
            ],
        },
    },
    # Example 2: flat format, with "missing value" placeholders.
    {
        "mode": "embedding",
        "record1": {
            "GENDER": "missing value",
            "NAME": "RAJESH KUMAR SHARMA",
            "FIRSTNAME": "missing value",
            "MIDDLENAME": "missing value",
            "LASTNAME": "missing value",
            "SPOUSENAME": "missing value",
            "MOTHERNAME": "missing value",
            "FATHERNAME": "missing value",
            "COMPANYNAME": "missing value",
            "PARENTCOMPANYNAME": "missing value",
            "AADHAR": "missing value",
            "PAN": "missing value",
            "LICENSEID": "missing value",
            "PASSPORTID": "missing value",
            "VOTERID": "missing value",
            "ADDRESSLINE": "123 MG Road",
            "BIRTHDATE": "15-01-1990",
            "PHONE": "9876543210",
            "EMAIL": "missing value",
            "CITY": "Bangalore",
            "STATE": "Karnataka",
            "ZIPCODE": "560034",
        },
        "record2": {
            "GENDER": "missing value",
            "NAME": "RAJESH K SHARMA",
            "FIRSTNAME": "missing value",
            "MIDDLENAME": "missing value",
            "LASTNAME": "missing value",
            "SPOUSENAME": "missing value",
            "MOTHERNAME": "missing value",
            "FATHERNAME": "missing value",
            "COMPANYNAME": "missing value",
            "PARENTCOMPANYNAME": "missing value",
            "AADHAR": "missing value",
            "PAN": "missing value",
            "LICENSEID": "missing value",
            "PASSPORTID": "missing value",
            "VOTERID": "missing value",
            "ADDRESSLINE": "123 Mahatma Gandhi Rd",
            "BIRTHDATE": "15/01/1990",
            "PHONE": "9876543210",
            "EMAIL": "missing value",
            "CITY": "Bengaluru",
            "STATE": "KA",
            "ZIPCODE": "560034",
        },
    },
]


class MatchRequest(BaseModel):
    """Payload for comparing one pair of entity records."""

    # Both records are required; each may use the nested or the flat format.
    record1: EntityRecord = Field(..., description="First entity record")
    record2: EntityRecord = Field(..., description="Second entity record")
    # Defaults to the embedding mode when omitted.
    mode: MatchingMode = Field(
        default=MatchingMode.EMBEDDING,
        description="Matching mode: 'embedding'",
    )

    model_config = {
        "json_schema_extra": {
            "examples": _MATCH_REQUEST_EXAMPLES,
        }
    }
466
+
467
+
468
class BatchMatchRequest(BaseModel):
    """Payload for matching several record pairs in one call (load testing)."""

    # Required; must hold between 1 and 100 pairs.
    pairs: List[MatchRequest] = Field(
        ..., min_length=1, max_length=100, description="List of record pairs to match"
    )
476
+
477
+
478
+ # =========================================================
479
+ # RESPONSE MODELS
480
+ # =========================================================
481
class FieldScore(BaseModel):
    """Score obtained for one compared field."""

    # Name of the compared field.
    field: str
    # Required; the type allows either a number or a string.
    score: Union[float, str] = Field(description="Numeric score (0-100) in embedding mode")
487
+
488
+
489
class MatchResult(BaseModel):
    """Outcome of comparing two entity records; all four fields are required."""

    overall_decision: str = Field(
        description="'Match' or 'No Match'",
    )
    reason: str = Field(
        description="Human-readable explanation of the matching decision",
    )
    # Per-field values may be numbers or strings.
    field_scores: Dict[str, Union[float, str]] = Field(
        description="Per-field matching scores. Embedding: numeric 0-100.",
    )
    mode: str = Field(
        description="Matching mode used: 'embedding'",
    )
497
+
498
+
499
class MatchResponse(BaseModel):
    """Envelope returned for a single match request."""

    model_config = {"populate_by_name": True}

    success: bool = True
    # Both default to None.
    result: Optional[MatchResult] = None
    error: Optional[str] = None
    # Required: no default is declared.
    processing_time_ms: float = Field(
        description="Time taken to process this match in milliseconds"
    )
507
+
508
+
509
class BatchMatchResponse(BaseModel):
    """Envelope returned for a batch match request."""

    model_config = {"populate_by_name": True}

    success: bool = True
    # Counters; all three are required.
    total: int = Field(
        description="Total number of pairs submitted",
    )
    completed: int = Field(
        description="Number of pairs successfully matched",
    )
    failed: int = Field(
        description="Number of pairs that failed",
    )
    # One MatchResponse per entry.
    results: List[MatchResponse] = Field(
        description="Individual match results",
    )
    total_processing_time_ms: float = Field(
        description="Total processing time in milliseconds",
    )
519
+
520
+
521
class HealthResponse(BaseModel):
    """Body returned by the health check."""

    model_config = {"populate_by_name": True}

    status: str = Field(
        description="'healthy' or 'unhealthy'",
    )
    # Defaults to "8.0" when not supplied.
    version: str = Field(
        default="8.0",
        description="API version",
    )
    # Component name -> status string; required.
    components: Dict[str, str] = Field(
        description="Health status of individual components (csv_data, embedding_models)",
    )
530
+
531
+
532
class ErrorResponse(BaseModel):
    """Uniform error body; ``success`` defaults to False."""

    success: bool = False
    # Required error text.
    error: str
    # Optional detail; defaults to None.
    detail: Optional[str] = None
backend/server.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import time
4
+ import logging
5
+ import traceback
6
+ from typing import List
7
+ from contextlib import asynccontextmanager
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ import asyncio
10
+
11
+ # Ensure project root is importable
12
+ _PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
13
+ if _PROJECT_ROOT not in sys.path:
14
+ sys.path.insert(0, _PROJECT_ROOT)
15
+
16
+ from fastapi import FastAPI, HTTPException, Request
17
+ from fastapi.middleware.cors import CORSMiddleware
18
+ from fastapi.responses import JSONResponse
19
+
20
+ import requests as http_requests
21
+
22
+ # -- Project imports ----------------------------------------------------------
23
+
24
+
25
+ from backend.models import (
26
+ MatchRequest,
27
+ MatchResponse,
28
+ MatchResult,
29
+ BatchMatchRequest,
30
+ BatchMatchResponse,
31
+ HealthResponse,
32
+ ErrorResponse,
33
+ )
34
+
35
+ from backend.matching_service import perform_match
36
+
37
+ # =========================================================
38
+ # LOGGING
39
+ # =========================================================
40
# Send every log record to stdout in one uniform format.
# force=True makes this configuration apply even if the root logger
# already had handlers attached before this module was imported.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format="%(asctime)s | %(levelname)-7s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    force=True,
)

# Module-wide logger used by all handlers below.
logger = logging.getLogger("backend_server")
48
+
49
+
50
+ # =========================================================
51
+ # LIFESPAN – startup / shutdown hooks
52
+ # =========================================================
53
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup diagnostics, hand control to the app, then log shutdown.

    Startup:
        - Import the embedding model store (which pre-warms the models if
          they are not loaded yet) and report whether it is populated.
        - Import the CSV-backed reference data and report the pincode
          row count.
      Neither check aborts startup; problems are logged as warnings so
      the health endpoint can still be reached.
    Shutdown:
        - Only logs a message (CSV-based, no database connections).

    Args:
        app: The FastAPI application this lifespan is attached to.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("Entity Matching backend – Starting up")
    logger.info(banner)

    # Verify the model store instead of announcing success unconditionally;
    # this is the same check the health endpoint performs.
    try:
        from services.model import MODEL_STORE
        if MODEL_STORE:
            logger.info("Embedding models loaded (sentence-transformers).")
        else:
            logger.warning("Embedding model store is empty.")
    except Exception as e:
        logger.warning("Embedding model check failed: %s", e)

    try:
        # name_variation_df is imported only to confirm it is available;
        # a missing name lands in the except branch below.
        from services.config import pin_city_state_df, name_variation_df
        csv_loaded = not pin_city_state_df.empty
        logger.info(
            "CSV data source: %s (%d pincode rows)",
            "OK" if csv_loaded else "EMPTY",
            len(pin_city_state_df),
        )
    except Exception as e:
        logger.warning("CSV data source check failed: %s", e)

    logger.info("backend ready to serve requests")
    logger.info(banner)

    yield  # -- app is running --

    logger.info("Entity Matching backend – Shutting down")
84
+
85
+
86
+ # =========================================================
87
+ # APP INSTANCE
88
+ # =========================================================
89
# Application object. `description` is Markdown rendered at the top of the
# interactive docs; each section is separated by a blank line ("\n\n") so a
# bold header is not absorbed into the bullet list that precedes it.
app = FastAPI(
    title="Entity Matching backend",
    description=(
        "Gen AI Record-Level Entity Matching backend.\n\n"
        "Compares two entity records and determines if they represent the same person/entity.\n\n"
        "**Multi-value fields:** `addresses`, `phones`, and `emails` each accept a list "
        "of any length. Matching uses best-of-N for addresses and any-match for phones/emails.\n\n"
        "**Supported matching modes:**\n"
        "- `embedding` (default): Sentence Transformers + Fuzzy matching\n\n"
        "**Input formats:**\n"
        "- Nested (recommended for multiple values): pass `addresses`, `phones`, `emails` as lists\n"
        "- Flat (single address/phone/email): pass uppercase keys like `ADDRESSLINE`, `PHONE`, `EMAIL`"
    ),
    version="8.0.0",
    lifespan=lifespan,
    docs_url="/docs",
    redoc_url="/redoc",
)
108
+
109
+ # -- CORS middleware ----------------------------------------------------------
110
# Allowed origins come from the comma-separated CORS_ALLOW_ORIGINS
# environment variable. When it is unset (or empty) every origin is
# allowed, as before.
_cors_origins = [
    origin.strip()
    for origin in os.environ.get("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # Credentialed cross-origin requests are only enabled for an explicit
    # origin allow-list; combining them with a wildcard would let any site
    # make authenticated calls on a user's behalf.
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
117
+
118
+
119
+ # =========================================================
120
+ # REQUEST LOGGING MIDDLEWARE
121
+ # =========================================================
122
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Log method, path, status code and latency of every request.

    Requests whose downstream handling raises are logged as well, with
    status 500 (the code the global exception handler responds with),
    and the exception is then re-raised untouched.

    Args:
        request: The incoming HTTP request.
        call_next: Callable that forwards the request down the stack and
            returns the response.

    Returns:
        The response produced by ``call_next``, unmodified.
    """
    start = time.perf_counter()

    def _log(status_code):
        # Elapsed wall time in milliseconds since the request arrived.
        elapsed = (time.perf_counter() - start) * 1000
        logger.info(
            "%s %s – %d (%.1f ms)",
            request.method,
            request.url.path,
            status_code,
            elapsed,
        )

    try:
        response = await call_next(request)
    except Exception:
        _log(500)
        raise

    _log(response.status_code)
    return response
136
+
137
+
138
+ # =========================================================
139
+ # GLOBAL EXCEPTION HANDLER
140
+ # =========================================================
141
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Turn an otherwise-unhandled exception into a JSON 500 response.

    The exception and the current traceback are written to the error log;
    the client receives a generic error plus the exception text.
    """
    stack_text = traceback.format_exc()
    logger.error("Unhandled exception: %s\n%s", exc, stack_text)

    # NOTE(review): "detail" exposes the raw exception message to callers.
    payload = {
        "success": False,
        "error": "Internal server error",
        "detail": str(exc),
    }
    return JSONResponse(status_code=500, content=payload)
152
+
153
+
154
+ # =========================================================
155
+ # ENDPOINTS
156
+ # =========================================================
157
+
158
+ # ── Health Checks ─────────────────────────────────────────────────────────────
159
@app.get(
    "/backend/v1/health",
    tags=["Health"],
    summary="Full system health check",
    response_model=HealthResponse,
)
async def health_check():
    """Check the health of all system components."""
    # Each probe records "healthy", "unhealthy" or "error: <message>".
    components = {}

    # CSV reference data: healthy when the pincode frame has rows.
    try:
        from services.config import pin_city_state_df
        if pin_city_state_df.empty:
            csv_status = "unhealthy"
        else:
            csv_status = "healthy"
    except Exception as err:
        csv_status = f"error: {err}"
    components["csv_data"] = csv_status

    # Embedding models: healthy when the model store is truthy.
    try:
        from services.model import MODEL_STORE
        if MODEL_STORE:
            model_status = "healthy"
        else:
            model_status = "unhealthy"
    except Exception as err:
        model_status = f"error: {err}"
    components["embedding_models"] = model_status

    # Components marked "not_configured" are left out of the aggregate.
    considered = [
        state for state in components.values() if state != "not_configured"
    ]
    if all(state == "healthy" for state in considered):
        overall = "healthy"
    else:
        overall = "degraded"

    return HealthResponse(status=overall, version="8.0", components=components)
191
+
192
+
193
+
194
+
195
+ # ── Single Match ──────────────────────────────────────────────────────────────
196
@app.post(
    "/backend/v1/match",
    response_model=MatchResponse,
    tags=["Matching"],
    summary="Match two entity records",
    responses={
        200: {"description": "Successful matching result"},
        400: {"model": ErrorResponse, "description": "Invalid input"},
        500: {"model": ErrorResponse, "description": "Internal error"},
    },
)
async def match_records(request: MatchRequest):
    """
    Compare two entity records and determine if they represent the same entity.

    **Multi-value fields:**
    Pass `addresses`, `phones`, and `emails` as lists of any length:
    ```json
    {
      "mode": "embedding",
      "record1": {
        "NAME": "RAJESH KUMAR SHARMA",
        "dob": "15-01-1990",
        "phones": ["9876543210", "9123456789"],
        "addresses": [
          {"addressline": "123 MG Road", "city": "Bangalore", "state": "Karnataka", "zipcode": "560034"},
          {"addressline": "45 Brigade Road", "city": "Bangalore", "state": "Karnataka", "zipcode": "560025"}
        ]
      },
      "record2": {
        "NAME": "RAJESH K SHARMA",
        "dob": "15/01/1990",
        "phones": ["9876543210"],
        "addresses": [
          {"addressline": "123 Mahatma Gandhi Rd", "city": "Bengaluru", "state": "KA", "zipcode": "560034"}
        ]
      }
    }
    ```

    **Matching strategy for lists:**
    - `addresses`: best-of-N (highest score across all pair combinations)
    - `phones`: any-match (match if any phone pair matches)
    - `emails`: any-match (match if any email pair matches)

    **Modes:**
    - `embedding` (default): Sentence Transformers + RapidFuzz

    """
    import json

    def _pretty(payload):
        # default=str keeps the console dump from raising on values json
        # cannot encode natively, so debug output can never turn a
        # successful match into an error response.
        return json.dumps(payload, indent=2, default=str)

    mode = request.mode.value
    rule = "=" * 80

    t0 = time.perf_counter()
    try:
        # Pre-print to terminal specifically for user visibility.
        # NOTE(review): this writes the full input records to stdout.
        print("\n\n" + rule)
        print(f" NEW MATCH REQUEST RECEIVED (Mode: {mode})")
        print(rule)
        print(f" RECORD 1 INPUT:\n{_pretty(request.record1.model_dump(by_alias=True))}")
        print(f" RECORD 2 INPUT:\n{_pretty(request.record2.model_dump(by_alias=True))}")
        print("-" * 80)

        # perform_match is synchronous (CPU + IO bound); run in thread pool
        # so it doesn't block the asyncio event loop.
        result = await asyncio.to_thread(
            perform_match, request.record1, request.record2, mode=mode
        )
        elapsed_ms = (time.perf_counter() - t0) * 1000
        logger.info(
            "Match complete — decision=%s mode=%s time=%.1fms",
            result["overall_decision"], mode, elapsed_ms,
        )

        # Post-print to terminal specifically for user visibility.
        print("📤 MATCH RESULT OUT:\n" + _pretty({
            "overall_decision": result["overall_decision"],
            "reason": result["reason"],
            "field_scores": result["field_scores"],
        }))
        print(rule + "\n\n")

        return MatchResponse(
            success=True,
            result=MatchResult(
                overall_decision=result["overall_decision"],
                reason=result["reason"],
                field_scores=result["field_scores"],
                mode=mode,
            ),
            processing_time_ms=round(elapsed_ms, 2),
        )
    except Exception as e:
        # Failures are reported in the response body (success=False) rather
        # than re-raised, so the client still gets a MatchResponse.
        elapsed_ms = (time.perf_counter() - t0) * 1000
        logger.error("Match failed: %s\n%s", e, traceback.format_exc())
        return MatchResponse(
            success=False,
            error=str(e),
            processing_time_ms=round(elapsed_ms, 2),
        )
296
+
297
+
298
+
299
+ # =========================================================
300
+ # ROOT / INFO
301
+ # =========================================================
302
@app.get("/", tags=["Info"], include_in_schema=False)
async def root():
    """Return basic service information and pointers to docs and health."""
    service_info = {
        "service": "Entity Matching backend",
        "version": "8.0.0",
        "docs": "/docs",
        "health": "/backend/v1/health",
    }
    return service_info
310
+
311
+
312
+ # =========================================================
313
+ # MAIN (for direct execution: python backend/server.py)
314
+ # =========================================================
315
if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces,
    # port 8000, with auto-reload enabled.
    import uvicorn

    server_options = {
        "host": "0.0.0.0",
        "port": 8000,
        "reload": True,
        "log_level": "info",
    }
    uvicorn.run("backend.server:app", **server_options)
data/city_prev_pres.csv ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "variation","standard"
2
+ BENGALURU,BANGALORE
3
+ JULLUNDER,JALANDHAR
4
+ CHERPULASSERY,CHERPULACHERRY
5
+ CHIKKAMAGALUR,CHIKMAGALUR
6
+ CHUCHURA,CHINSURAH MAGRA
7
+ HUBBALLI,HUBLI
8
+ INDUR,INDORE
9
+ KANCHIPURAM,KANCHEEPURAM
10
+ MANDAVGARH,MANDU
11
+ MANGALURU,MANGALORE
12
+ MANSANAGAR,VIJAYAWADA
13
+ NORTH PARAVUR,PARAVUR
14
+ SAS NAGAR,SASNAGAR
15
+ SHAHEED BHAGAT SINGH NAGAR,NAWANSHAHR
16
+ PANAJI,NORTH GOA
17
+ ANDAMAN NICOBAR ISLANDS,ANDAMANNICOBAR ISLANDS
18
+ ANDAMAN & NICOBAR ISLANDS,ANDAMANNICOBAR ISLANDS
19
+ ARIANKUPPAM COMMUNE PANCHAYAT,ARIANKUPPAMCOMMUNE PANCHAYAT
20
+ BAHOUR COMMUNE PANCHAYAT,BAHOURCOMMUNE PANCHAYAT
21
+ BAKSHI KA TALAB**,BAKSHIKA TALAB
22
+ BAKSHI KA TALAB,BAKSHIKA TALAB
23
+ CHANDRA SEKHARA PURAM,CHANDRASEKHARA PURAM
24
+ CHAUTH KA BARWARA,CHAUTHKA BARWARA
25
+ CHERUKUPALLE H/O ARUMBAKA,CHERUKUPALLEHO ARUMBAKA
26
+ CHERUKUPALLE HO ARUMBAKA,CHERUKUPALLEHO ARUMBAKA
27
+ CHILAKALURIPET H/O.PURUSHOTHA PATNAM,CHILAKALURIPETHOPURUSHOTHA PATNAM
28
+ CHILAKALURIPET HOPURUSHOTHA PATNAM,CHILAKALURIPETHOPURUSHOTHA PATNAM
29
+ CHINSURAH MAGRA,CHINSURAH MAGRA
30
+ CHINSURAH - MAGRA,CHINSURAH MAGRA
31
+ CHURACHANDPUR NORTH SUB-DIV.,CHURACHANDPURNORTH SUBDIV
32
+ CHURACHANDPUR NORTH SUBDIV,CHURACHANDPURNORTH SUBDIV
33
+ DADRA NAGAR HAVELI,DADRANAGAR HAVELI
34
+ DADRA & NAGAR HAVELI,DADRANAGAR HAVELI
35
+ DAMAN DIU,DAMAN DIU
36
+ DAMAN & DIU,DAMAN DIU
37
+ DERA BABA NANAK,DERABABA NANAK
38
+ EAST GARO HILLS,EASTGARO HILLS
39
+ EAST KHASI HILLS,EASTKHASI HILLS
40
+ EGMORE NUNGAMBAKKAM,EGMORE NUNGAMBAKKAM
41
+ EGMORE - NUNGAMBAKKAM,EGMORE NUNGAMBAKKAM
42
+ FORT TONDIARPET,FORT TONDIARPET
43
+ FORT - TONDIARPET,FORT TONDIARPET
44
+ GAUTAM BUDDHA NAGAR **,GAUTAMBUDDHA NAGAR
45
+ GAUTAM BUDDHA NAGAR ,GAUTAMBUDDHA NAGAR
46
+ GAUTAM BUDDHA NAGAR,GAUTAMBUDDHA NAGAR
47
+ GOLA GOKARAN NATH,GOLAGOKARAN NATH
48
+ GOOL GULAB GARH,GOOLGULAB GARH
49
+ GUDEM KOTHA VEEDHI,GUDEMKOTHA VEEDHI
50
+ JAJI REDDI GUDEM,JAJIREDDI GUDEM
51
+ JAMMU KASHMIR,JAMMU KASHMIR
52
+ JAMMU & KASHMIR,JAMMU KASHMIR
53
+ JYOTIBA PHULE NAGAR,JYOTIBAPHULE NAGAR
54
+ KAMJONG CHASSAD SUB-DIV.,KAMJONGCHASSAD SUBDIV
55
+ KAMJONG CHASSAD SUBDIV,KAMJONGCHASSAD SUBDIV
56
+ KEIRAO BITRA SUB-DIVISION,KEIRAOBITRA SUBDIVISION
57
+ KEIRAO BITRA SUBDIVISION,KEIRAOBITRA SUBDIVISION
58
+ KOTTUCHERRY COMMUNE PANCHAYAT,KOTTUCHERRYCOMMUNE PANCHAYAT
59
+ KUSHESHWAR ASTHAN PURBI,KUSHESHWARASTHAN PURBI
60
+ LAHUL SPITI,LAHUL SPITI
61
+ LAHUL & SPITI,LAHUL SPITI
62
+ MAMBALAM GUINDY,MAMBALAM GUINDY
63
+ MAMBALAM - GUINDY,MAMBALAM GUINDY
64
+ MANNADIPET COMMUNE PANCHAYAT,MANNADIPETCOMMUNE PANCHAYAT
65
+ MYLAPORE TRIPLICANE,MYLAPORE TRIPLICANE
66
+ MYLAPORE - TRIPLICANE,MYLAPORE TRIPLICANE
67
+ NEDUNGADU COMMUNE PANCHAYAT,NEDUNGADUCOMMUNE PANCHAYAT
68
+ NEEM CHAK BATHANI,NEEMCHAK BATHANI
69
+ NERAVY COMMUNE PANCHAYAT,NERAVYCOMMUNE PANCHAYAT
70
+ NETTAPAKKAM COMMUNE PANCHAYAT,NETTAPAKKAMCOMMUNE PANCHAYAT
71
+ NORTH 24 PARGANAS,NORTH24 PARGANAS
72
+ NORTH AND MIDDLE ANDAMAN,NORTHANDMIDDLE ANDAMAN
73
+ NORTH CACHAR HILLS,NORTHCACHAR HILLS
74
+ NORTH EAST DELHI,NORTHEAST DELHI
75
+ NORTH WEST DELHI,NORTHWEST DELHI
76
+ PEDDA ADISERLA PALLE,PEDDAADISERLA PALLE
77
+ PERAMBUR PURASAWAKKAM,PERAMBUR PURASAWAKKAM
78
+ PERAMBUR - PURASAWAKKAM,PERAMBUR PURASAWAKKAM
79
+ POLBA DADPUR,POLBA DADPUR
80
+ POLBA - DADPUR,POLBA DADPUR
81
+ RAIPUR KARCHULIYAN,RAIPUR KARCHULIYAN
82
+ RAIPUR - KARCHULIYAN,RAIPUR KARCHULIYAN
83
+ SADAR HILLS EAST SUB-DIVISION,SADARHILLSEAST SUBDIVISION
84
+ SADAR HILLS EAST SUBDIVISION,SADARHILLSEAST SUBDIVISION
85
+ SADAR HILLS WEST SUB-DIVISION,SADARHILLSWEST SUBDIVISION
86
+ SADAR HILLS WEST SUBDIVISION,SADARHILLSWEST SUBDIVISION
87
+ SAITU GAMPHAZOL SUB-DIVISION,SAITUGAMPHAZOL SUBDIVISION
88
+ SAITU GAMPHAZOL SUBDIVISION,SAITUGAMPHAZOL SUBDIVISION
89
+ SANT KABIR NAGAR,SANTKABIR NAGAR
90
+ SANT RAVIDAS NAGAR,SANTRAVIDAS NAGAR
91
+ SAVALYAPURAM H/O KANAMARLAPUDI,SAVALYAPURAMHO KANAMARLAPUDI
92
+ SAVALYAPURAM HO KANAMARLAPUDI,SAVALYAPURAMHO KANAMARLAPUDI
93
+ SOUTH 24 PARGANAS,SOUTH24 PARGANAS
94
+ SOUTH EAST DELHI,SOUTHEAST DELHI
95
+ SOUTH GARO HILLS,SOUTHGARO HILLS
96
+ SOUTH WEST DELHI,SOUTHWEST DELHI
97
+ TAMENGLONG NORTH SUB-DIVISION,TAMENGLONGNORTH SUBDIVISION
98
+ TAMENGLONG NORTH SUBDIVISION,TAMENGLONGNORTH SUBDIVISION
99
+ TELANGANA,ANDHRA PRADESH
100
+ THIRUMALAIRAYAN PATTINAM COMMUNE PANCHAYAT,THIRUMALAIRAYANPATTINAMCOMMUNE PANCHAYAT
101
+ THIRUNALLAR COMMUNE PANCHAYAT,THIRUNALLARCOMMUNE PANCHAYAT
102
+ UDHAM SINGH NAGAR,UDHAMSINGH NAGAR
103
+ UKHRUL CENTRAL SUB-DIVISION,UKHRULCENTRAL SUBDIVISION
104
+ UKHRUL CENTRAL SUBDIVISION,UKHRULCENTRAL SUBDIVISION
105
+ VILLIANUR COMMUNE PANCHAYAT,VILLIANURCOMMUNE PANCHAYAT
106
+ WEST GARO HILLS,WESTGARO HILLS
107
+ WEST KHASI HILLS,WESTKHASI HILLS
108
+ EDLABAD,ADILABAD
109
+ KARNAVATI,AHMEDABAD
110
+ AJAYMERU,AJMER
111
+ ALLEPPEY,ALAPPUZHA
112
+ ALLYGURH,ALIGARH
113
+ PRAYAG,ALLAHABAD
114
+ ALWAYE,ALUVA
115
+ BALLARI,BELLARY
116
+ BURDWAN,BARDHAMAN
117
+ BHIR,BEED
118
+ BELAGAVI,BELGAUM
119
+ BROACH,BHARUCH
120
+ BHAVENA ,BHAVNAGAR
121
+ GOHILWAD,BHAVNAGAR
122
+ BELLASGATE,BHEDAGHAT MARBLES
123
+ VIRAVATI,VADODARA
124
+ CHANDRAVATI,VADODARA
125
+ BARODA,VADODARA
126
+ VADPATRA,VADODARA
127
+ MADRAS,CHENNAI
128
+ CHERPALCHERY,CHERPULACHERRY
129
+ SHERTHALAI,CHERTHALA
130
+ CHIKMAGALUR,CHIKMAGALUR
131
+ CHINSURAH,CHINSURAH MAGRA
132
+ ELLORE,ELURU
133
+ GARTHAPURI,GUNTUR
134
+ GURUGRAM,GURGAON
135
+ GAUHATI,GUWAHATI
136
+ HOSAPETE,HOSPET
137
+ HUBLI,HUBLI
138
+ BHAGYANAGARAM,HYDERABAD
139
+ AHILYANAGARI,INDORE
140
+ JUBBULPORE,JABALPUR
141
+ KADAPA,CUDDAPAH
142
+ COCANADA,KAKINADA
143
+ KALABURAGI,GULBARGA
144
+ CONJEEVARAM,KANCHEEPURAM
145
+ CANNANORE,KANNUR
146
+ CAWNPORE,KANPUR
147
+ CAPE COMORIN,KANYAKUMARI
148
+ ELAGANDLA,KARIMNAGAR
149
+ KARUVUR,KARUR
150
+ CAMBAY,KHAMBHAT
151
+ COCHIN,KOCHI
152
+ CRANGANORE,KODUNGALLUR
153
+ CALCUTTA,KOLKATA
154
+ QUILON,KOLLAM
155
+ KONEY,KONNI
156
+ CALICUT,KOZHIKODE
157
+ KANDENAVOLU,KURNOOL
158
+ MUZAFFARNAGAR,MUZAFFARNAGAR
159
+ LAKSHMINAGAR,MUZAFFARNAGAR
160
+ MASULIPATAM,MACHILIPATNAM
161
+ MARCERA,MADIKERI
162
+ PALAMURU,MAHBUBNAGAR
163
+ MANDU,MANDU
164
+ MANDAV NAGAR,MANDI
165
+ MANGALORE,MANGALORE
166
+ KRISHNA,VIJAYAWADA
167
+ MAYAVARAM,MAYILADUTHURAI
168
+ SIDDAPURAM,MEDAK
169
+ METUKU SEEMA,MEDAK
170
+ BOMBAY,MUMBAI
171
+ MYSORE,MYSURU
172
+ NOWGONG,NAGAON
173
+ NASIK,NASHIK
174
+ VIKRAMA SIMHAPURI,NELLORE
175
+ PARUR,PARAVUR
176
+ EKASILANAGARAM,WARANGAL
177
+ ORUGALLU,WARANGAL
178
+ PALAI,PALA
179
+ PALGHAT,PALAKKAD
180
+ PANJIM,NORTH GOA
181
+ PORTO NOVO,PARANGIPETTAI
182
+ PRAKASAM,ONGOLE
183
+ PUDUCHERRY ,PONDICHERRY
184
+ POONA,PUNE
185
+ RASSEN,RAISEN
186
+ RAJAMAHENDRAVARAMU,RAJAHMUNDRY
187
+ KANPUR DEHAT,KANPUR DEHAT
188
+ RAMABAI NAGAR,KANPUR DEHAT
189
+ ROPAR,RUPNAGAR
190
+ SAUGOR,SAGAR
191
+ MOHALI,SASNAGAR
192
+ VIRATNAGARI,SHAHDOL
193
+ NAWAN SHAHAR,NAWANSHAHR
194
+ SIMLA,SHIMLA
195
+ SHIVAMOGGA,SHIMOGA
196
+ SAIHA,SAIHA
197
+ SIVASAGAR,SIBSAGAR
198
+ SRI POTTI SRI RAMULU NELLORE,NELLORE
199
+ SIKKOLU,SRIKAKULAM
200
+ SURYAPUR,SURAT
201
+ TELLICHERRY,THALASSERY
202
+ THANA,THANE
203
+ TANJORE,THANJAVUR
204
+ TRANQUEBAR,THARANGAMBADI
205
+ TRIVANDRUM,THIRUVANANTHAPURAM
206
+ TUTICORIN,THOOTHUKKUDI
207
+ TRICHUR,THRISSUR
208
+ TRICHINOPOLY,TIRUCHIRAPPALLI
209
+ TINNEVELLY,TIRUNELVELI
210
+ TRINOMALEE,TIRUVANNAMALAI
211
+ TUMAKURU,TUMKUR
212
+ OOTACAMUND,UDHAGAMANDALAM
213
+ AVANTIKA,UJJAIN
214
+ OJJAIN,UJJAIN
215
+ BULSAR,VALSAD
216
+ BANARAS,VARANASI
217
+ BADAGARA,VADAKARA
218
+ BHELSA,VIDISHA
219
+ VIJAYAPURA,BIJAPUR
220
+ BEZAWADA,VIJAYAWADA
221
+ VIRUDUPATTI,VIRUDHUNAGAR
222
+ WALTAIR,VISAKHAPATNAM
223
+ VIZAGAPATAM,VISAKHAPATNAM
224
+ YSR DISTRICT,CUDDAPAH
225
+ MUMBAI,MUMBAI
226
+ BOMBAY,MUMBAI
227
+ MUMBAI SUBURBAN,MUMBAI
228
+ DELHI,DELHI
229
+ NEW DELHI,DELHI
230
+ DELHI NCR,DELHI
231
+ NCT OF DELHI,DELHI
232
+ SEELAMPUR,DELHI
233
+ SHAHDARA,DELHI
234
+ DWARKA,DELHI
235
+ ROHINI,DELHI
236
+ PITAMPURA,DELHI
237
+ KAROL BAGH,DELHI
238
+ LAJPAT NAGAR,DELHI
239
+ SAKET,DELHI
240
+ JANAKPURI,DELHI
241
+ MAYUR VIHAR,DELHI
242
+ VASANT KUNJ,DELHI
243
+ OKHLA,DELHI
244
+ BENGALURU,BENGALURU
245
+ BANGALORE,BENGALURU
246
+ BENGALURU URBAN,BENGALURU
247
+ HYDERABAD,HYDERABAD
248
+ SECUNDERABAD,HYDERABAD
249
+ HYDERABAD CITY,HYDERABAD
250
+ CHENNAI,CHENNAI
251
+ MADRAS,CHENNAI
252
+ CHENNAI CITY,CHENNAI
253
+ KOLKATA,KOLKATA
254
+ CALCUTTA,KOLKATA
255
+ KOLKATA CITY,KOLKATA
256
+ PUNE,PUNE
257
+ POONA,PUNE
258
+ AHMEDABAD,AHMEDABAD
259
+ AMDAVAD,AHMEDABAD
260
+ JAIPUR,JAIPUR
261
+ PINK CITY,JAIPUR
262
+ LUCKNOW,LUCKNOW
263
+ LAKHNAU,LUCKNOW
264
+ KANPUR,KANPUR
265
+ CAWNPORE,KANPUR
266
+ NAGPUR,NAGPUR
267
+ INDORE,INDORE
268
+ THANE,THANE
269
+ THANA,THANE
270
+ BHOPAL,BHOPAL
271
+ VISAKHAPATNAM,VISAKHAPATNAM
272
+ VIZAG,VISAKHAPATNAM
273
+ VISHAKHAPATNAM,VISAKHAPATNAM
274
+ PIMPRI-CHINCHWAD,PIMPRI-CHINCHWAD
275
+ PIMPRI CHINCHWAD,PIMPRI-CHINCHWAD
276
+ PCMC,PIMPRI-CHINCHWAD
277
+ PATNA,PATNA
278
+ PATALIPUTRA,PATNA
279
+ VADODARA,VADODARA
280
+ BARODA,VADODARA
281
+ GHAZIABAD,GHAZIABAD
282
+ GHZ,GHAZIABAD
283
+ LUDHIANA,LUDHIANA
284
+ AGRA,AGRA
285
+ NASHIK,NASHIK
286
+ NASIK,NASHIK
287
+ FARIDABAD,FARIDABAD
288
+ MEERUT,MEERUT
289
+ RAJKOT,RAJKOT
290
+ KALYAN-DOMBIVLI,KALYAN-DOMBIVLI
291
+ KALYAN,KALYAN-DOMBIVLI
292
+ DOMBIVLI,KALYAN-DOMBIVLI
293
+ VASAI-VIRAR,VASAI-VIRAR
294
+ VASAI,VASAI-VIRAR
295
+ VIRAR,VASAI-VIRAR
296
+ VARANASI,VARANASI
297
+ BANARAS,VARANASI
298
+ BENARES,VARANASI
299
+ KASHI,VARANASI
300
+ SRINAGAR,SRINAGAR
301
+ AURANGABAD,AURANGABAD
302
+ DHANBAD,DHANBAD
303
+ AMRITSAR,AMRITSAR
304
+ NAVI MUMBAI,NAVI MUMBAI
305
+ NEW BOMBAY,NAVI MUMBAI
306
+ ALLAHABAD,ALLAHABAD
307
+ PRAYAGRAJ,ALLAHABAD
308
+ ILAHABAD,ALLAHABAD
309
+ RANCHI,RANCHI
310
+ HOWRAH,HOWRAH
311
+ HAORA,HOWRAH
312
+ COIMBATORE,COIMBATORE
313
+ JABALPUR,JABALPUR
314
+ JUBBULPORE,JABALPUR
315
+ GWALIOR,GWALIOR
316
+ VIJAYAWADA,VIJAYAWADA
317
+ JODHPUR,JODHPUR
318
+ MADURAI,MADURAI
319
+ RAIPUR,RAIPUR
320
+ KOTA,KOTA
321
+ GUWAHATI,GUWAHATI
322
+ GAUHATI,GUWAHATI
323
+ CHANDIGARH,CHANDIGARH
324
+ MOHALI,CHANDIGARH
325
+ SAS NAGAR,CHANDIGARH
326
+ KHARAR,CHANDIGARH
327
+ PANCHKULA,CHANDIGARH
328
+ ZIRAKPUR,CHANDIGARH
329
+ SOLAPUR,SOLAPUR
330
+ SHOLAPUR,SOLAPUR
331
+ HUBLI-DHARWAD,HUBLI-DHARWAD
332
+ HUBLI,HUBLI-DHARWAD
333
+ DHARWAD,HUBLI-DHARWAD
334
+ BAREILLY,BAREILLY
335
+ MORADABAD,MORADABAD
336
+ MYSORE,MYSORE
337
+ MYSURU,MYSORE
338
+ GURGAON,GURGAON
339
+ GURUGRAM,GURGAON
340
+ ALIGARH,ALIGARH
341
+ JALANDHAR,JALANDHAR
342
+ TIRUCHIRAPPALLI,TIRUCHIRAPPALLI
343
+ TRICHY,TIRUCHIRAPPALLI
344
+ TRICHINOPOLY,TIRUCHIRAPPALLI
345
+ BHUBANESWAR,BHUBANESWAR
346
+ BHUBANESHWAR,BHUBANESWAR
347
+ SALEM,SALEM
348
+ WARANGAL,WARANGAL
349
+ THIRUVANANTHAPURAM,THIRUVANANTHAPURAM
350
+ TRIVANDRUM,THIRUVANANTHAPURAM
351
+ GUNTUR,GUNTUR
352
+ BHIWANDI,BHIWANDI
353
+ SAHARANPUR,SAHARANPUR
354
+ GORAKHPUR,GORAKHPUR
355
+ BIKANER,BIKANER
356
+ AMRAVATI,AMRAVATI
357
+ NOIDA,NOIDA
358
+ JAMSHEDPUR,JAMSHEDPUR
359
+ TATANAGAR,JAMSHEDPUR
360
+ BHILAI,BHILAI
361
+ BHILAI NAGAR,BHILAI
362
+ CUTTACK,CUTTACK
363
+ FIROZABAD,FIROZABAD
364
+ KOCHI,KOCHI
365
+ COCHIN,KOCHI
366
+ BHAVNAGAR,BHAVNAGAR
367
+ DEHRADUN,DEHRADUN
368
+ DEHRA DUN,DEHRADUN
369
+ DURGAPUR,DURGAPUR
370
+ ASANSOL,ASANSOL
371
+ NANDED,NANDED
372
+ KOLHAPUR,KOLHAPUR
373
+ AJMER,AJMER
374
+ GULBARGA,GULBARGA
375
+ KALABURAGI,GULBARGA
376
+ JAMNAGAR,JAMNAGAR
377
+ UJJAIN,UJJAIN
378
+ LONI,LONI
379
+ SILIGURI,SILIGURI
380
+ JHANSI,JHANSI
381
+ ULHASNAGAR,ULHASNAGAR
382
+ NELLORE,NELLORE
383
+ JAMMU,JAMMU
384
+ SANGALI-MIRAJ-KUPWAD,SANGALI-MIRAJ-KUPWAD
385
+ SANGALI,SANGALI-MIRAJ-KUPWAD
386
+ MIRAJ,SANGALI-MIRAJ-KUPWAD
387
+ KUPWAD,SANGALI-MIRAJ-KUPWAD
388
+ BELGAUM,BELGAUM
389
+ BELAGAVI,BELGAUM
390
+ MANGALORE,MANGALORE
391
+ MANGALURU,MANGALORE
392
+ AMBATTUR,AMBATTUR
393
+ TIRUNELVELI,TIRUNELVELI
394
+ MALEGAON,MALEGAON
395
+ GREATER NOIDA,GREATER NOIDA
data/hno_variation_standard.csv ADDED
@@ -0,0 +1,619 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "variation","standard"
2
+ ADJUCENT , ADJ
3
+ ADJACNT , ADJ
4
+ ADJNT , ADJ
5
+ ADJT , ADJ
6
+ ADJN , ADJ
7
+ APPART MENTS , APT
8
+ APARTMENTSNO,APT
9
+ APPART MENT , APT
10
+ APART MENTS, APT
11
+ APPARTMENTS , APT
12
+ APART MENT , APT
13
+ APARTUMENT , APT
14
+ APARTMENTS , APT
15
+ APPARTMENT , APT
16
+ APARTMENT , APT
17
+ APPART , APT
18
+ APPATS , APT
19
+ APTMNT , APT
20
+ APARTM , APT
21
+ APATT , APT
22
+ APATS , APT
23
+ APPTS , APT
24
+ APPTT , APT
25
+ APRTS , APT
26
+ APTMS , APT
27
+ APAT , APT
28
+ APTS , APT
29
+ APPT , APT
30
+ APRT , APT
31
+ APTT , APT
32
+ ARKHADHE , ARCADE
33
+ ARKHADE , ARCADE
34
+ ARKADE , ARCADE
35
+ ARKEDE , ARCADE
36
+ ARCAD , ARCADE
37
+ ARCDE , ARCADE
38
+ AREAA , AREA
39
+ ARIAA , AREA
40
+ AVENUIE , AVENUE
41
+ AVANUE , AVENUE
42
+ AVNUE , AVENUE
43
+ AVEN , AVENUE
44
+ BANC , BANK
45
+ BNK , BANK
46
+ BASTHY , BASTI
47
+ BASTY , BASTI
48
+ BEHIND , BEHIND
49
+ B/D , BEHIND
50
+ BEH , BEHIND
51
+ BHI , BEHIND
52
+ BH, BEHIND
53
+ BESIDCE , BESIDE
54
+ BE SIDE , BESIDE
55
+ BESIDES , BESIDE
56
+ BESID , BESIDE
57
+ BSD , BESIDE
58
+ BUILDINGS , BLDG
59
+ BUILDING , BLDG
60
+ BUILDIN , BLDG
61
+ BUILDG , BLDG
62
+ BUILDI , BLDG
63
+ BUILDL , BLDG
64
+ BUILD,BLDG
65
+ BLDGS , BLDG
66
+ BULDG , BLDG
67
+ BLIG , BLDG
68
+ BULD , BLDG
69
+ BDG , BLDG
70
+ BLD , BLDG
71
+ BLG , BLDG
72
+ BLC0K NO , BLOCK
73
+ BLOK NO , BLOCK
74
+ BLCK NO , BLOCK
75
+ BLK NO , BLOCK
76
+ BL NO,BLOCK
77
+ B NO , BLOCK
78
+ BNO , BLOCK
79
+ BAZAAR , BZR
80
+ BAZZAR , BZR
81
+ COLN , CLNY
82
+ COLY , CLNY
83
+ CLN , CLNY
84
+ CLY , CLNY
85
+ COL , CLNY
86
+ COMPHLEX , CMPLX
87
+ COMPLEX , CMPLX
88
+ CMPLEX , CMPLX
89
+ COMPLX , CMPLX
90
+ CMPL , CMPLX
91
+ CPLX , CMPLX
92
+ CENTRAAL , CNTR
93
+ CENTRAL , CNTR
94
+ CENTER , CNTR
95
+ CENTRE , CNTR
96
+ CENTR , CNTR
97
+ CENTL , CNTR
98
+ CNTRL , CNTR
99
+ CENT , CNTR
100
+ CEN , CNTR
101
+ CNT , CNTR
102
+ COMPOUND , COMPND
103
+ CMPOND , COMPND
104
+ COMPD , COMPND
105
+ COMPO , COMPND
106
+ CMPD , COMPND
107
+ CO OPERATIVE , COOP
108
+ COOPERATIVE , COOP
109
+ CO OPERATE , COOP
110
+ CO OPERAT , COOP
111
+ CO OPP , COOP
112
+ CO-OP , COOP
113
+ CO OP HOUSING SOCIETY , COOPHOUSOC
114
+ CO OP HOUSING SOC , COOPHOUSOC
115
+ CO-OP HOUS SOCTY , COOPHOUSOC
116
+ COOP HSG SOCBLDG , COOPHOUSOC
117
+ CO- OP HOU SOC , COOPHOUSOC
118
+ COOP HSG SOCY , COOPHOUSOC
119
+ CO OP HSU SOC , COOPHOUSOC
120
+ CO O HSG SOC , COOPHOUSOC
121
+ COOP HO SOC , COOPHOUSOC
122
+ CO OP HOUS , COOPHOUSOC
123
+ CO OP SOC , COOPHOUSOC
124
+ CO-OP HS , COOPHOUSOC
125
+ CHS , COOPHOUSOC
126
+ CO OPERATE SOCIETY , COOPSOCIETY
127
+ CORNER , CORN
128
+ CROSS,CROSS
129
+ DISTRICT , DIST
130
+ DISTICT , DIST
131
+ DISTR , DIST
132
+ DOORNUMBER ,DOOR
133
+ DOOR NO ,DOOR
134
+ DOORNO,DOOR
135
+ DOR NO ,DOOR
136
+ DRNO ,DOOR
137
+ D NO ,DOOR
138
+ D-NO ,DOOR
139
+ D.NO,DOOR
140
+ DNO ,DOOR
141
+ ENCLAVE , ENCL
142
+ ENKLAVE , ENCL
143
+ ENCLE , ENCL
144
+ ENC , ENCL
145
+ ESTATES , ESTATE
146
+ ESTAT , ESTATE
147
+ ESTA , ESTATE
148
+ ESTS , ESTATE
149
+ ESTT , ESTATE
150
+ EST , ESTATE
151
+ EXTENCTION , EXTN
152
+ EXTENSION , EXTN
153
+ EXT N , EXTN
154
+ EXTEN , EXTN
155
+ EXETN , EXTN
156
+ EXNT , EXTN
157
+ EXT , EXTN
158
+ EX , EXTN
159
+ V TH ,FIFTH
160
+ FIRST ,FIRST
161
+ I ST ,FIRST
162
+ FIRS ,FIRST
163
+ IST ,FIRST
164
+ FIRSTFLOOR , FIRSTFLR
165
+ FLATNUMBER , FLAT
166
+ F NUMBER , FLAT
167
+ FLAT NOS,FLAT
168
+ FLAT NO,FLAT
169
+ FLATN O , FLAT
170
+ FLAT-NO,FLAT
171
+ FT NO , FLAT
172
+ F NO , FLAT
173
+ FLT , FLAT
174
+ FNO , FLAT
175
+ FT,FLAT
176
+ FLOO , FLR
177
+ FLOR , FLR
178
+ FLUR , FLR
179
+ FR , FLR
180
+ FL , FLR
181
+ IVTH ,FOURTH
182
+ GALLI , GALLY
183
+ GILLY , GALLY
184
+ GULLY , GALLY
185
+ GALA , GALLY
186
+ GALI , GALLY
187
+ GADENS , GARDENS
188
+ GRDENS , GARDENS
189
+ GARDNS , GARDENS
190
+ GRDNS , GARDENS
191
+ GDNS , GARDENS
192
+ GROIUND , GND
193
+ GROUND , GND
194
+ GRD , GND
195
+ GR , GND
196
+ GROUNDFLOOR ,GNDFLR
197
+ GOVERNMENT , GOVT
198
+ GAVERNMENT , GOVT
199
+ GOVRNMNT , GOVT
200
+ GOV , GOVT
201
+ GENERAL POST OFFICE , GPO
202
+ GENRAL POST OFICE , GPO
203
+ GNRL POST OFF , GPO
204
+ GENERAL P O , GPO
205
+ GPO , GPO
206
+ GUNJ , GUNZ
207
+ GANJ , GUNZ
208
+ HILS , HILLS
209
+ HASPATAL , HOSPITAL
210
+ HASPITAL , HOSPITAL
211
+ HOSPTL , HOSPITAL
212
+ HSPTL , HOSPITAL
213
+ HOSPI , HOSPITAL
214
+ HOSP , HOSPITAL
215
+ HOUSE NUMBER ,HOUSE
216
+ HOUSENUMBER ,HOUSE
217
+ HOUSE NO ,HOUSE
218
+ HAUSE NO ,HOUSE
219
+ HOUS NO ,HOUSE
220
+ HOU NO ,HOUSE
221
+ HUS NO ,HOUSE
222
+ H-NO ,HOUSE
223
+ H NO ,HOUSE
224
+ BAZAR , BZR
225
+ BAJAR , BZR
226
+ BAZER , BZR
227
+ BAZR , BZR
228
+ CAMPUS , CAMP
229
+ CAMPAS , CAMP
230
+ CAMPS , CAMP
231
+ CMP , CAMP
232
+ COLONY , CLNY
233
+ COLNI , CLNY
234
+ COLOY , CLNY
235
+ CONLY , CLNY
236
+ H.N.,HOUSE
237
+ H.NO,HOUSE
238
+ HONO,HOUSE
239
+ HNO,HOUSE
240
+ INDUSTRIALESTATE , INDESTATE
241
+ INDUSTREALS , INDL
242
+ INDUSTRIAL , INDL
243
+ INDUSTRIES , INDL
244
+ INDUSTREAL , INDL
245
+ INDUSTRY , INDL
246
+ INDUST , INDL
247
+ INDUL , INDL
248
+ INDLL , INDL
249
+ INDUS , INDL
250
+ INDLS , INDL
251
+ INDU , INDL
252
+ INDS , INDL
253
+ IND , INDL
254
+ SECOND ,SECOND
255
+ II ND ,SECOND
256
+ IIND ,SECOND
257
+ 2NDFLR ,SECONDFLOOR
258
+ SECTOR , SECT
259
+ SECTAR , SECT
260
+ SCTR , SECT
261
+ SEC , SECT
262
+ SHOP NO,SHOP
263
+ SHOPNO,SHOP
264
+ SITENO,SITE
265
+ LMTED , LTD
266
+ LT , LTD
267
+ MAIN,MAIN
268
+ MRG , MARG
269
+ MARKETS , MKT
270
+ MARKET , MKT
271
+ MRKET , MKT
272
+ MEKT , MKT
273
+ MRKT , MKT
274
+ MKTE , MKT
275
+ NAGAR , NGR
276
+ NAGER , NGR
277
+ NAGR , NGR
278
+ NGAR , NGR
279
+ NAR , NGR
280
+ NG , NGR
281
+ NATIONAL HIGH WAY , NH
282
+ NATIONAL HI WAY , NH
283
+ NATIONAL HYWAY , NH
284
+ NHW , NH
285
+ NH , NH
286
+ NIWAS , NIVAS
287
+ NUMBER , NO
288
+ NEMBER , NO
289
+ NUMBUR , NO
290
+ NEMBUR , NO
291
+ NUMBR , NO
292
+ NMBR , NO
293
+ OFFICE , OFF
294
+ OFFIC , OFF
295
+ OFICE , OFF
296
+ O/O , OFF
297
+ OPPOSITE , OPP
298
+ OPPOSTE , OPP
299
+ OPPSITE,OPP
300
+ OPPE , OPP
301
+ OPPS , OPP
302
+ OPPT , OPP
303
+ PHESE , PHASE
304
+ FASE , PHASE
305
+ PHAS,PHASE
306
+ PH,PHASE
307
+ PCKET,PKT
308
+ PKT,PKT
309
+ PLOT SR NO,PLOT
310
+ PL NUMBER , PLOT
311
+ P NUMBER , PLOT
312
+ PLOT.NO.,PLOT
313
+ PLOT NO,PLOT
314
+ PLOT.NO,PLOT
315
+ PLOT-NO , PLOT
316
+ PLO NO,PLOT
317
+ PLOTNO , PLOT
318
+ PLT NO , PLOT
319
+ PL.NO.,PLOT
320
+ PL NO , PLOT
321
+ PLOT,PLOT
322
+ PTNO,PLOT
323
+ P NO , PLOT
324
+ PLT,PLOT
325
+ PNO , PLOT
326
+ POST OFFICE , PO
327
+ POSTOFFICE , PO
328
+ POST OFF , PO
329
+ P OFFI , PO
330
+ POS OF , PO
331
+ POST , PO
332
+ P OF , PO
333
+ PT , PO
334
+ POST BOX , POBOX
335
+ PST BOX , POBOX
336
+ P O BOX , POBOX
337
+ POST BX , POBOX
338
+ POS BOX , POBOX
339
+ POCKET ,POCKET
340
+ QUARTER NUMBER ,QUTR
341
+ QUARTERNUMBER ,QUTR
342
+ QUARTER NO ,QUTR
343
+ QUARTERS ,QUTR
344
+ QUARTER,QUTR
345
+ QRTR NO ,QUTR
346
+ QURT NO ,QUTR
347
+ QRTERS ,QUTR
348
+ QTR NO,QUTR
349
+ QRT NO,QUTR
350
+ QR NO ,QUTR
351
+ QTARS ,QUTR
352
+ QURTS ,QUTR
353
+ Q. NO,QUTR
354
+ QTRNO,QUTR
355
+ ADJACENT , ADJ
356
+ SITE,SITE
357
+ VI TH ,SIXTH
358
+ SOCIETY , SOC
359
+ SOCTY , SOC
360
+ SOSTY , SOC
361
+ SOCT , SOC
362
+ SCTY , SOC
363
+ SOCI , SOC
364
+ SOCY , SOC
365
+ STAGE,STAGE
366
+ STETION , STN
367
+ STATION , STN
368
+ STANT , STN
369
+ STION , STN
370
+ STETION ROAD , STNRD
371
+ STATION ROAD , STNRD
372
+ SN ROAD , STNRD
373
+ STREETNUMBER , STR
374
+ ST NUMBER , STR
375
+ STREET NO , STR
376
+ STREEET , STR
377
+ STREET , STR
378
+ STREAT , STR
379
+ STRAET , STR
380
+ STRIT , STR
381
+ STRET , STR
382
+ STEET , STR
383
+ ST NO , STR
384
+ STRT , STR
385
+ STR, STR
386
+ SRT , STR
387
+ SU DIVISION , SUBDIVISION
388
+ SU DIVIZAN , SUBDIVISION
389
+ SU DIVIZON , SUBDIVISION
390
+ SUB DIVI , SUBDIVISION
391
+ SU DIVIS , SUBDIVISION
392
+ SU DVSN , SUBDIVISION
393
+ SURVEY NO,SURVEY
394
+ SURVEYNO,SURVEY
395
+ SY NO,SURVEY
396
+ TALUKHAA , TALUKA
397
+ TALOOKHA , TALUKA
398
+ TALOOKA , TALUKA
399
+ TALUQA , TALUKA
400
+ TALUCA , TALUKA
401
+ TAL , TALUKA
402
+ III RD ,THIRD
403
+ TOWER,TOWER
404
+ VIHAAR , VIHAR
405
+ VIHR , VIHAR
406
+ VILL , VILLAGE
407
+ VILL. , VILLAGE
408
+ VILLGE , VILLAGE
409
+ VILLA , VILLAGE
410
+ VILLG , VILLAGE
411
+ VIL , VILLAGE
412
+ WATER TAN , WATERTANK
413
+ WATER TNK , WATERTANK
414
+ WATR TAN , WATERTANK
415
+ WATER TK , WATERTANK
416
+ WATER T , WATERTANK
417
+ WTR TAN , WATERTANK
418
+ WTR TNK , WATERTANK
419
+ W TNK , WATERTANK
420
+ VINGS , WING
421
+ WINGS , WING
422
+ VING , WING
423
+ CROSS ROAD , XRD
424
+ CROSSROAD , XRD
425
+ CROSS RD , XRD
426
+ XRD , XRD
427
+ JONE , ZONE
428
+ JUNCTION , JN
429
+ JUNCTN , JN
430
+ JNCTN , JN
431
+ JNCN , JN
432
+ JNC , JN
433
+ JNT , JN
434
+ LINE , LANE
435
+ LNE , LANE
436
+ LN , LANE
437
+ LYT,LAYOUT
438
+ LIMITED , LTD
439
+ LIMITID , LTD
440
+ LIMETED , LTD
441
+ LIMTED , LTD
442
+ LIMTD , LTD
443
+ FLOOR , FLR
444
+ I FLOOR,"1 ST FLR"
445
+ FIRST FLOOR,"1 ST FLR"
446
+ GROUND FLOOR,"0 TH FLR"
447
+ Q NO,QUTR
448
+ Q-NO ,QUTR
449
+ QRTS ,QUTR
450
+ QRS ,QUTR
451
+ QRT ,QUTR
452
+ QTS ,QUTR
453
+ QNO ,QUTR
454
+ QR ,QUTR
455
+ RAIL , RAILWAY
456
+ RAWY , RAILWAY
457
+ RLY , RAILWAY
458
+ RAILWAYQUARTERS , RAILWAYQTR
459
+ RLY QRTS , RAILWAYQTR
460
+ RAILWAY STATION , RAILWAYSTN
461
+ RAILWAY STION , RAILWAYSTN
462
+ RLY STATION , RAILWAYSTN
463
+ RL STN , RAILWAYSTN
464
+ RESIDENCE , RES
465
+ RECIDANCE , RES
466
+ RSDENCE , RES
467
+ RSDNCE , RES
468
+ RESI , RES
469
+ RSDN , RES
470
+ RS , RES
471
+ ROAD NO ,ROAD
472
+ RAOD NO ,ROAD
473
+ ROADNO,ROAD
474
+ RD NO,ROAD
475
+ ROADS ,ROAD
476
+ RHODE ,ROAD
477
+ ROAD ,ROAD
478
+ RODE ,ROAD
479
+ R NO ,ROAD
480
+ RDNO ,ROAD
481
+ R-NO ,ROAD
482
+ RAD ,ROAD
483
+ ROA ,ROAD
484
+ ROD ,ROAD
485
+ ROOM NO,ROOM
486
+ ROOMNO,ROOM
487
+ R.NO,ROOM
488
+ R NO,ROOM
489
+ ROUTE , RT
490
+ ROOTE , RT
491
+ RUTE , RT
492
+ RTE , RT
493
+ RUT , RT
494
+ S CRUZ , SANTCRUZ
495
+ SECCOND ,SECOND
496
+ DISTRICT,DIST
497
+ DIST,DIST
498
+ DST,DIST
499
+ DSTR,DIST
500
+ DT,DIST
501
+ ZILLA,DIST
502
+ JILLA,DIST
503
+ ZILA,DIST
504
+ TALUK,TALUK
505
+ TAL,TALUK
506
+ TALUKA,TALUK
507
+ TQ,TALUK
508
+ TEH,TALUK
509
+ TEHS,TALUK
510
+ TEHSIL,TALUK
511
+ MANDAL,TALUK
512
+ MD,TALUK
513
+ VILLAGE,VILLAGE
514
+ VILL,VILLAGE
515
+ VIL,VILLAGE
516
+ VLG,VILLAGE
517
+ GRAMA,VILLAGE
518
+ GRAM,VILLAGE
519
+ GAON,VILLAGE
520
+ CITY,CITY
521
+ CTY,CITY
522
+ TOWN,CITY
523
+ TWN,CITY
524
+ NAGAR,CITY
525
+ NAG,CITY
526
+ PURAM,CITY
527
+ PURA,CITY
528
+ STATE,STATE
529
+ ST,STATE
530
+ RAJYA,STATE
531
+ PRADESH,STATE
532
+ D NO,HOUSE
533
+ D.NO,HOUSE
534
+ D-NO,HOUSE
535
+ D/NO,HOUSE
536
+ DNO,HOUSE
537
+ DOOR NO,HOUSE
538
+ DOOR NUMBER,HOUSE
539
+ APARTMENT,APT
540
+ APT,APT
541
+ APT NO,APT
542
+ APT NUMBER,APT
543
+ APARTMENT NO,APT
544
+ TOWER,APT
545
+ TOWER NO,APT
546
+ WING,APT
547
+ PHASE,APT
548
+ PHASE NO,APT
549
+ RESIDENCY,APT
550
+ RESIDENTIAL COMPLEX,APT
551
+ HEIGHTS,APT
552
+ ENCLAVE,APT
553
+ APARTMENTS,APT
554
+ SOCIETY,APT
555
+ SOCIETY NO,APT
556
+ CHS,APT
557
+ BLDG,BLDG
558
+ BLDG NO,BLDG
559
+ BUILDING,BLDG
560
+ BUILDING NO,BLDG
561
+ BLK,BLOCK
562
+ BLOCK,BLOCK
563
+ BLOCK NO,BLOCK
564
+ FLAT,FLAT
565
+ FLAT NO,FLAT
566
+ FLAT NUMBER,FLAT
567
+ FLT,FLAT
568
+ FLT NO,FLAT
569
+ UNIT,FLAT
570
+ UNIT NO,FLAT
571
+ UNIT NUMBER,FLAT
572
+ PORTION,FLAT
573
+ PORTION NO,FLAT
574
+ OFFICE NO,FLAT
575
+ OFFICE NUMBER,FLAT
576
+ SHOP NO,SHOP
577
+ SHOP NUMBER,SHOP
578
+ ROAD,ROAD
579
+ RD,ROAD
580
+ R D,ROAD
581
+ MARG,ROAD
582
+ MRG,ROAD
583
+ PATH,ROAD
584
+ STREET,STR
585
+ ST,STR
586
+ STR,STR
587
+ GALI,STR
588
+ GALLI,STR
589
+ LANE,STR
590
+ LN,STR
591
+ MARG,STR
592
+ PATH,STR
593
+ CIRCLE,STR
594
+ CIR,STR
595
+ SECTOR,STR
596
+ SEC,STR
597
+ LANE,LANE
598
+ LN,LANE
599
+ BYLANE,LANE
600
+ CROSS,LANE
601
+ CR,LANE
602
+ EXTENSION,EXTN
603
+ EXT,EXTN
604
+ EXTN,EXTN
605
+ LOCALITY,LOCALITY
606
+ LAYOUT,LOCALITY
607
+ LYT,LOCALITY
608
+ PHASE,LOCALITY
609
+ PH,LOCALITY
610
+ SECTOR,LOCALITY
611
+ SEC,LOCALITY
612
+ COLONY,CLNY
613
+ COL,CLNY
614
+ CLNY,CLNY
615
+ BUILDING,BUILDING
616
+ APT,BUILDING
617
+ APARTMENT,BUILDING
618
+ BLDG,BUILDING
619
+ TOWER,BUILDING
data/name_variation_standard.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/pin_city_state.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/state_name_standard.csv ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "variation","standard"
2
+ ANDHRA PRADESH,ANDHRA PRADESH
3
+ ANDHRAPRADESH,ANDHRA PRADESH
4
+ ANDHRA,ANDHRA PRADESH
5
+ AP,ANDHRA PRADESH
6
+ A.P,ANDHRA PRADESH
7
+ A.P.,ANDHRA PRADESH
8
+ AP STATE,ANDHRA PRADESH
9
+ IN-AP,ANDHRA PRADESH
10
+ ARUNACHAL PRADESH,ARUNACHAL PRADESH
11
+ ARUNACHAL,ARUNACHAL PRADESH
12
+ AR,ARUNACHAL PRADESH
13
+ A.R,ARUNACHAL PRADESH
14
+ ARUNACHAL PRADESH STATE,ARUNACHAL PRADESH
15
+ IN-AR,ARUNACHAL PRADESH
16
+ ASSAM,ASSAM
17
+ AS,ASSAM
18
+ A.S,ASSAM
19
+ ASSAM STATE,ASSAM
20
+ IN-AS,ASSAM
21
+ BIHAR,BIHAR
22
+ BR,BIHAR
23
+ B.R,BIHAR
24
+ BIHAR STATE,BIHAR
25
+ IN-BR,BIHAR
26
+ CHHATTISGARH,CHHATTISGARH
27
+ CHATTISGARH,CHHATTISGARH
28
+ CHHATISGARH,CHHATTISGARH
29
+ CG,CHHATTISGARH
30
+ C.G,CHHATTISGARH
31
+ CT,CHHATTISGARH
32
+ CHATTISGARH STATE,CHHATTISGARH
33
+ IN-CG,CHHATTISGARH
34
+ GOA,GOA
35
+ GA,GOA
36
+ G.A,GOA
37
+ IN-GA,GOA
38
+ GUJARAT,GUJARAT
39
+ GUJRAT,GUJARAT
40
+ GUJARATH,GUJARAT
41
+ GJ,GUJARAT
42
+ G.J,GUJARAT
43
+ IN-GJ,GUJARAT
44
+ HARYANA,HARYANA
45
+ HARIYANA,HARYANA
46
+ HR,HARYANA
47
+ H.R,HARYANA
48
+ IN-HR,HARYANA
49
+ HIMACHAL PRADESH,HIMACHAL PRADESH
50
+ HIMACHAL,HIMACHAL PRADESH
51
+ HP,HIMACHAL PRADESH
52
+ H.P,HIMACHAL PRADESH
53
+ H.P.,HIMACHAL PRADESH
54
+ IN-HP,HIMACHAL PRADESH
55
+ JHARKHAND,JHARKHAND
56
+ JH,JHARKHAND
57
+ J.H,JHARKHAND
58
+ IN-JH,JHARKHAND
59
+ KARNATAKA,KARNATAKA
60
+ KARNATAK,KARNATAKA
61
+ KARN,KARNATAKA
62
+ KA,KARNATAKA
63
+ K.A,KARNATAKA
64
+ MYSORE STATE,KARNATAKA
65
+ IN-KA,KARNATAKA
66
+ KERALA,KERALA
67
+ KERALAM,KERALA
68
+ KL,KERALA
69
+ K.L,KERALA
70
+ IN-KL,KERALA
71
+ MADHYA PRADESH,MADHYA PRADESH
72
+ MADHYAPRADESH,MADHYA PRADESH
73
+ MADHYA,MADHYA PRADESH
74
+ MP,MADHYA PRADESH
75
+ M.P,MADHYA PRADESH
76
+ M.P.,MADHYA PRADESH
77
+ MP STATE,MADHYA PRADESH
78
+ IN-MP,MADHYA PRADESH
79
+ MAHARASHTRA,MAHARASHTRA
80
+ MAHARASTRA,MAHARASHTRA
81
+ MAHA,MAHARASHTRA
82
+ MH,MAHARASHTRA
83
+ M.H,MAHARASHTRA
84
+ MAHARASHTRA STATE,MAHARASHTRA
85
+ IN-MH,MAHARASHTRA
86
+ MANIPUR,MANIPUR
87
+ MN,MANIPUR
88
+ M.N,MANIPUR
89
+ IN-MN,MANIPUR
90
+ MEGHALAYA,MEGHALAYA
91
+ ML,MEGHALAYA
92
+ M.L,MEGHALAYA
93
+ IN-ML,MEGHALAYA
94
+ MIZORAM,MIZORAM
95
+ MZ,MIZORAM
96
+ M.Z,MIZORAM
97
+ IN-MZ,MIZORAM
98
+ NAGALAND,NAGALAND
99
+ NL,NAGALAND
100
+ N.L,NAGALAND
101
+ IN-NL,NAGALAND
102
+ ODISHA,ODISHA
103
+ ORISSA,ODISHA
104
+ OD,ODISHA
105
+ O.D,ODISHA
106
+ OR,ODISHA
107
+ O.R,ODISHA
108
+ ODISHA STATE,ODISHA
109
+ IN-OD,ODISHA
110
+ PUNJAB,PUNJAB
111
+ PANJAB,PUNJAB
112
+ PB,PUNJAB
113
+ P.B,PUNJAB
114
+ IN-PB,PUNJAB
115
+ RAJASTHAN,RAJASTHAN
116
+ RAJ,RAJASTHAN
117
+ RJ,RAJASTHAN
118
+ R.J,RAJASTHAN
119
+ RAJASTHAN STATE,RAJASTHAN
120
+ IN-RJ,RAJASTHAN
121
+ SIKKIM,SIKKIM
122
+ SK,SIKKIM
123
+ S.K,SIKKIM
124
+ IN-SK,SIKKIM
125
+ TAMIL NADU,TAMIL NADU
126
+ TAMILNADU,TAMIL NADU
127
+ TAMIL,TAMIL NADU
128
+ TN,TAMIL NADU
129
+ T.N,TAMIL NADU
130
+ T.N.,TAMIL NADU
131
+ TAMILNADU STATE,TAMIL NADU
132
+ IN-TN,TAMIL NADU
133
+ TELANGANA,TELANGANA
134
+ TELENGANA,TELANGANA
135
+ TG,TELANGANA
136
+ T.G,TELANGANA
137
+ TS,TELANGANA
138
+ T.S,TELANGANA
139
+ TELANGANA STATE,TELANGANA
140
+ IN-TS,TELANGANA
141
+ TRIPURA,TRIPURA
142
+ TR,TRIPURA
143
+ T.R,TRIPURA
144
+ IN-TR,TRIPURA
145
+ UTTAR PRADESH,UTTAR PRADESH
146
+ UTTARPRADESH,UTTAR PRADESH
147
+ UTTAR,UTTAR PRADESH
148
+ UP,UTTAR PRADESH
149
+ U.P,UTTAR PRADESH
150
+ U.P.,UTTAR PRADESH
151
+ UP STATE,UTTAR PRADESH
152
+ IN-UP,UTTAR PRADESH
153
+ UTTARAKHAND,UTTARAKHAND
154
+ UTTARANCHAL,UTTARAKHAND
155
+ UK,UTTARAKHAND
156
+ U.K,UTTARAKHAND
157
+ UA,UTTARAKHAND
158
+ UTTARAKHAND STATE,UTTARAKHAND
159
+ IN-UK,UTTARAKHAND
160
+ WEST BENGAL,WEST BENGAL
161
+ WESTBENGAL,WEST BENGAL
162
+ WB,WEST BENGAL
163
+ W.B,WEST BENGAL
164
+ W.B.,WEST BENGAL
165
+ WEST BENGAL STATE,WEST BENGAL
166
+ IN-WB,WEST BENGAL
167
+ ANDAMAN AND NICOBAR ISLANDS,ANDAMAN AND NICOBAR ISLANDS
168
+ ANDAMAN NICOBAR,ANDAMAN AND NICOBAR ISLANDS
169
+ ANDAMAN,ANDAMAN AND NICOBAR ISLANDS
170
+ NICOBAR,ANDAMAN AND NICOBAR ISLANDS
171
+ AN,ANDAMAN AND NICOBAR ISLANDS
172
+ A.N,ANDAMAN AND NICOBAR ISLANDS
173
+ A & N ISLANDS,ANDAMAN AND NICOBAR ISLANDS
174
+ IN-AN,ANDAMAN AND NICOBAR ISLANDS
175
+ CHANDIGARH,CHANDIGARH
176
+ CH,CHANDIGARH
177
+ C.H,CHANDIGARH
178
+ IN-CH,CHANDIGARH
179
+ MOHALI,CHANDIGARH
180
+ SAS NAGAR,CHANDIGARH
181
+ KHARAR,CHANDIGARH
182
+ PANCHKULA,CHANDIGARH
183
+ ZIRAKPUR,CHANDIGARH
184
+ DADRA AND NAGAR HAVELI AND DAMAN AND DIU,DADRA AND NAGAR HAVELI AND DAMAN AND DIU
185
+ DADRA NAGAR HAVELI,DADRA AND NAGAR HAVELI AND DAMAN AND DIU
186
+ DAMAN DIU,DADRA AND NAGAR HAVELI AND DAMAN AND DIU
187
+ DN,DADRA AND NAGAR HAVELI AND DAMAN AND DIU
188
+ D.N,DADRA AND NAGAR HAVELI AND DAMAN AND DIU
189
+ DNH,DADRA AND NAGAR HAVELI AND DAMAN AND DIU
190
+ DD,DADRA AND NAGAR HAVELI AND DAMAN AND DIU
191
+ IN-DH,DADRA AND NAGAR HAVELI AND DAMAN AND DIU
192
+ DELHI,DELHI
193
+ NEW DELHI,DELHI
194
+ DL,DELHI
195
+ D.L,DELHI
196
+ NCT OF DELHI,DELHI
197
+ NATIONAL CAPITAL TERRITORY OF DELHI,DELHI
198
+ NORTH EAST DELHI,DELHI
199
+ NORTH WEST DELHI,DELHI
200
+ SOUTH EAST DELHI,DELHI
201
+ SOUTH WEST DELHI,DELHI
202
+ SEELAMPUR,DELHI
203
+ SHAHDARA,DELHI
204
+ DWARKA,DELHI
205
+ ROHINI,DELHI
206
+ PITAMPURA,DELHI
207
+ KAROL BAGH,DELHI
208
+ LAJPAT NAGAR,DELHI
209
+ SAKET,DELHI
210
+ JANAKPURI,DELHI
211
+ MAYUR VIHAR,DELHI
212
+ VASANT KUNJ,DELHI
213
+ OKHLA,DELHI
214
+ NOIDA,DELHI
215
+ GREATER NOIDA,DELHI
216
+ FARIDABAD,DELHI
217
+ GHAZIABAD,DELHI
218
+ GHZ,DELHI
219
+ INDIRAPURAM,DELHI
220
+ GURUGRAM,DELHI
221
+ GURGAON,DELHI
222
+ IN-DL,DELHI
223
+ JAMMU AND KASHMIR,JAMMU AND KASHMIR
224
+ JAMMU,JAMMU AND KASHMIR
225
+ KASHMIR,JAMMU AND KASHMIR
226
+ JK,JAMMU AND KASHMIR
227
+ J.K,JAMMU AND KASHMIR
228
+ J&K,JAMMU AND KASHMIR
229
+ JAMMU & KASHMIR,JAMMU AND KASHMIR
230
+ IN-JK,JAMMU AND KASHMIR
231
+ LADAKH,LADAKH
232
+ LA,LADAKH
233
+ L.A,LADAKH
234
+ IN-LA,LADAKH
235
+ LAKSHADWEEP,LAKSHADWEEP
236
+ LAKSHADWEEP ISLANDS,LAKSHADWEEP
237
+ LD,LAKSHADWEEP
238
+ L.D,LAKSHADWEEP
239
+ IN-LD,LAKSHADWEEP
240
+ PUDUCHERRY,PUDUCHERRY
241
+ PONDICHERRY,PUDUCHERRY
242
+ PY,PUDUCHERRY
243
+ P.Y,PUDUCHERRY
244
+ IN-PY,PUDUCHERRY
data/sur_comm_names.csv ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "s_no","surname_community_extension"
2
+ 1,SINGH
3
+ 2,SHARMA
4
+ 3,GUPTA
5
+ 4,VERMA
6
+ 5,AGARWAL
7
+ 6,AGGARWAL
8
+ 7,BANSAL
9
+ 8,GOYAL
10
+ 9,MITTAL
11
+ 10,SRIVASTAVA
12
+ 11,CHAUDHARY
13
+ 12,CHOUDHARY
14
+ 13,PANDEY
15
+ 14,MISHRA
16
+ 15,TIWARI
17
+ 16,YADAV
18
+ 17,PATEL
19
+ 18,SHAH
20
+ 19,MEHTA
21
+ 20,DESAI
22
+ 21,JOSHI
23
+ 22,KULKARNI
24
+ 23,PAWAR
25
+ 24,JADHAV
26
+ 25,SHINDE
27
+ 26,REDDY
28
+ 27,RAO
29
+ 28,NAIDU
30
+ 29,VARMA
31
+ 30,GOWDA
32
+ 31,SHETTY
33
+ 32,SETTY
34
+ 33,NAIR
35
+ 34,PILLAI
36
+ 35,MENON
37
+ 36,DAS
38
+ 37,DUTTA
39
+ 38,ROY
40
+ 39,SARKAR
41
+ 40,MONDAL
42
+ 41,GHOSH
43
+ 42,BOSE
44
+ 43,SEN
45
+ 44,THOMAS
46
+ 45,MATHEW
47
+ 46,JOSEPH
48
+ 47,JOHN
49
+ 48,GEORGE
50
+ 49,KAUR
51
+ 50,KHAN
52
+ 51,AHMED
53
+ 52,AHMAD
54
+ 53,ANSARI
55
+ 54,SHAIKH
56
+ 55,SHEIKH
57
+ 56,SYED
58
+ 57,HUSSAIN
59
+ 58,QURESHI
60
+ 59,SIDDIQUI
61
+ 60,FAROOQI
62
+ 61,PATHAN
63
+ 62,BEG
64
+ 63,BAIG
65
+ 64,MIRZA
66
+ 65,USMANI
67
+ 66,RAZA
68
+ 67,PATAN
69
+ 68,NAQVI
70
+ 69,RIZVI
71
+ 70,KAZMI
72
+ 71,ZAIDI
73
+ 72,BUKHARI
74
+ 73,CHISHTI
75
+ 74,MADANI
76
+ 75,NOMANI
77
+ 76,FARUQI
78
+ 77,HASHMI
79
+ 78,AZMI
80
+ 79,KAZI
81
+ 80,QAZI
82
+ 81,MEMON
83
+ 82,BOHRA
84
+ 83,ATTAR
85
+ 84,TAMBOLI
86
+ 85,NADAF
87
+ 86,PINJARI
88
+ 87,BAGWAN
89
+ 88,KUMAR
90
+ 89,KUMARI
91
+ 90,DEVI
92
+ 91,LAL
93
+ 92,PRASAD
94
+ 93,CHANDRA
95
+ 94,NATH
96
+ 95,RAJ
97
+ 96,DEV
98
+ 97,BABU
99
+ 98,MOHAMMED
100
+ 99,MUHAMMAD
101
+ 100,MOHAMMAD
102
+ 101,MOHAMED
103
+ 102,MOHD
104
+ 103,MD
105
+ 104,MHD
106
+ 105,ABDUL
107
+ 106,ABD
108
+ 107,ALI
109
+ 108,HASSAN
110
+ 109,PAUL
111
+ 110,PAL
112
+ 111,TRIPATHI
113
+ 112,DWIVEDI
114
+ 113,CHATURVEDI
115
+ 114,UPADHYAY
116
+ 115,BHARDWAJ
117
+ 116,BHARGAVA
118
+ 117,VASHISHTHA
119
+ 118,SHUKLA
120
+ 119,DUBEY
121
+ 120,DUBE
122
+ 121,TYAGI
123
+ 122,SAXENA
124
+ 123,MATHUR
125
+ 124,TANDON
126
+ 125,KHANNA
127
+ 126,ARORA
128
+ 127,MALHOTRA
129
+ 128,BATRA
130
+ 129,GROVER
131
+ 130,BEDI
132
+ 131,SODHI
133
+ 132,AHUJA
134
+ 133,CHAWLA
135
+ 134,SANDHU
136
+ 135,SIDHU
137
+ 136,DHILLON
138
+ 137,BRAR
139
+ 138,RANDHAWA
140
+ 139,GILL
141
+ 140,MANN
142
+ 141,CHEEMA
143
+ 142,CHAHAL
144
+ 143,PUNIA
145
+ 144,JAIN
146
+ 145,DOSHI
147
+ 146,DALAL
148
+ 147,MODI
149
+ 148,PAREKH
150
+ 149,ZAVERI
151
+ 150,SANGHVI
152
+ 151,SOMANI
153
+ 152,LODHA
154
+ 153,LODH
155
+ 154,PATIL
156
+ 155,DESHMUKH
157
+ 156,GAIKWAD
158
+ 157,KADAM
159
+ 158,KAMBLE
160
+ 159,SALUNKHE
161
+ 160,BHOSALE
162
+ 161,MORE
163
+ 162,PENDSE
164
+ 163,KARANDE
165
+ 164,ACHARYA
166
+ 165,HEGDE
167
+ 166,BHAT
168
+ 167,BHATT
169
+ 168,IYER
170
+ 169,IYENGAR
171
+ 170,MALIK
172
+ 171,REHMAN
173
+ 172,RAHMAN
174
+ 173,AKHTAR
175
+ 174,IQBAL
176
+ 175,SALMAN
177
+ 176,SULTAN
178
+ 177,TARIQ
179
+ 178,JAVED
180
+ 179,FAIZ
181
+ 180,RAI
182
+ 181,
frontend/app.py ADDED
@@ -0,0 +1,673 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import re
4
+ import os
5
+ import requests
6
+ import sys, os
7
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
8
+ from backend.matching_service import perform_match
9
+ from backend.models import EntityRecord, MatchMode
10
+
11
+ # =========================================================
12
+ # CONSTANTS
13
+ # =========================================================
14
+ MAX_FIELDS = 20
15
+
16
+ # =========================================================
17
+ # CUSTOM CSS - Matching the original Streamlit design
18
+ # =========================================================
19
+ CUSTOM_CSS = """
20
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
21
+
22
+ * {
23
+ font-family: 'Inter', sans-serif;
24
+ box-sizing: border-box;
25
+ }
26
+
27
+ body, .gradio-container {
28
+ background-color: #f0f2f5 !important;
29
+ color: #333 !important;
30
+ }
31
+
32
+ /* ── App wrapper ── */
33
+ .gradio-container {
34
+ max-width: 1400px !important;
35
+ margin: 0 auto !important;
36
+ padding: 20px !important;
37
+ }
38
+
39
+ /* ── Header ── */
40
+ .app-header {
41
+ text-align: center;
42
+ margin-bottom: 24px;
43
+ padding: 24px 0 8px;
44
+ }
45
+ .app-header h1 {
46
+ color: #5B4E8B;
47
+ font-size: 26px;
48
+ font-weight: 700;
49
+ margin: 0 0 6px;
50
+ }
51
+ .app-header p {
52
+ color: #666;
53
+ font-size: 14px;
54
+ margin: 0;
55
+ }
56
+
57
+ /* ── Record header ── */
58
+ .record-header {
59
+ color: #612383;
60
+ font-size: 22px;
61
+ font-weight: 700;
62
+ padding-bottom: 10px;
63
+ margin-bottom: 18px;
64
+ border-bottom: 3px solid transparent;
65
+ border-image: linear-gradient(90deg, #612383, #E9592E, #F5A700) 1;
66
+ }
67
+
68
+ /* ── Section card ── */
69
+ .section-card {
70
+ background: white;
71
+ border-radius: 12px;
72
+ box-shadow: 0 2px 8px rgba(0,0,0,0.08);
73
+ margin-bottom: 18px;
74
+ overflow: hidden;
75
+ }
76
+ .section-header-gradient {
77
+ background: linear-gradient(90deg, #612383 0%, #E9592E 100%);
78
+ color: white;
79
+ padding: 12px 18px;
80
+ font-size: 13px;
81
+ font-weight: 600;
82
+ text-transform: uppercase;
83
+ letter-spacing: 0.5px;
84
+ }
85
+ .section-body {
86
+ padding: 18px;
87
+ }
88
+
89
+ /* ── Gradio overrides ── */
90
+ .gr-form, .gr-box {
91
+ background: transparent !important;
92
+ border: none !important;
93
+ box-shadow: none !important;
94
+ padding: 0 !important;
95
+ }
96
+
97
+ label span {
98
+ color: #555 !important;
99
+ font-size: 13px !important;
100
+ font-weight: 500 !important;
101
+ }
102
+
103
+ input[type="text"], textarea {
104
+ background-color: #fafbfc !important;
105
+ color: #333 !important;
106
+ border: 1px solid #e1e4e8 !important;
107
+ border-radius: 8px !important;
108
+ font-size: 14px !important;
109
+ transition: border-color 0.2s;
110
+ }
111
+ input[type="text"]:focus, textarea:focus {
112
+ border-color: #E9592E !important;
113
+ box-shadow: 0 0 0 3px rgba(233,89,46,0.10) !important;
114
+ outline: none !important;
115
+ }
116
+
117
+ /* ── Run Match button ── */
118
+ #run-match-btn {
119
+ background: linear-gradient(90deg, #612383 0%, #E9592E 100%) !important;
120
+ color: white !important;
121
+ border: none !important;
122
+ border-radius: 10px !important;
123
+ padding: 16px 32px !important;
124
+ font-size: 16px !important;
125
+ font-weight: 600 !important;
126
+ text-transform: uppercase;
127
+ letter-spacing: 0.5px;
128
+ box-shadow: 0 4px 12px rgba(97,35,131,0.25) !important;
129
+ transition: all 0.3s ease;
130
+ cursor: pointer;
131
+ width: 100%;
132
+ }
133
+ #run-match-btn:hover {
134
+ background: linear-gradient(90deg, #E9592E 0%, #612383 100%) !important;
135
+ transform: translateY(-2px);
136
+ box-shadow: 0 6px 16px rgba(233,89,46,0.35) !important;
137
+ }
138
+
139
+ /* ── Add/Remove inline buttons ── */
140
+ .btn-inline {
141
+ width: 36px !important;
142
+ height: 36px !important;
143
+ min-width: 36px !important;
144
+ padding: 0 !important;
145
+ border-radius: 6px !important;
146
+ font-size: 20px !important;
147
+ font-weight: 500 !important;
148
+ background-color: white !important;
149
+ color: #612383 !important;
150
+ border: 1px solid #d0d7de !important;
151
+ cursor: pointer;
152
+ }
153
+ .btn-inline:hover {
154
+ border-color: #28a745 !important;
155
+ color: #28a745 !important;
156
+ background: #f6fef9 !important;
157
+ }
158
+
159
+ /* ── Add Field gradient button ── */
160
+ .btn-add-field {
161
+ background: linear-gradient(90deg, #612383 0%, #E9592E 100%) !important;
162
+ color: white !important;
163
+ border: none !important;
164
+ border-radius: 8px !important;
165
+ padding: 10px 22px !important;
166
+ font-size: 13px !important;
167
+ font-weight: 600 !important;
168
+ text-transform: uppercase;
169
+ letter-spacing: 0.5px;
170
+ cursor: pointer;
171
+ margin-top: 10px;
172
+ box-shadow: 0 3px 8px rgba(97,35,131,0.2);
173
+ }
174
+ .btn-add-field:hover {
175
+ background: linear-gradient(90deg, #E9592E 0%, #612383 100%) !important;
176
+ }
177
+
178
+ /* ── Backend status ── */
179
+ .status-online { color: #28a745; font-weight: 600; font-size: 14px; }
180
+ .status-offline { color: #dc3545; font-weight: 600; font-size: 14px; }
181
+
182
+ /* ── Result box ── */
183
+ .result-box {
184
+ background: white;
185
+ border-radius: 12px;
186
+ padding: 24px;
187
+ margin-top: 24px;
188
+ box-shadow: 0 4px 16px rgba(0,0,0,0.10);
189
+ border-top: 4px solid transparent;
190
+ border-image: linear-gradient(90deg, #612383, #E9592E, #F5A700) 1;
191
+ }
192
+ .result-header {
193
+ color: #612383;
194
+ font-size: 17px;
195
+ font-weight: 600;
196
+ margin-bottom: 12px;
197
+ }
198
+
199
+ /* ── Subsection label ── */
200
+ .subsection-label {
201
+ color: #666;
202
+ font-size: 13px;
203
+ font-weight: 600;
204
+ margin: 14px 0 8px;
205
+ text-transform: uppercase;
206
+ letter-spacing: 0.3px;
207
+ }
208
+
209
+ /* ── Address divider ── */
210
+ .addr-divider {
211
+ border: none;
212
+ border-top: 1px solid #e1e4e8;
213
+ margin: 16px 0;
214
+ }
215
+
216
+ /* ── Accordion / Group override ── */
217
+ .gr-group {
218
+ border: none !important;
219
+ background: transparent !important;
220
+ padding: 0 !important;
221
+ }
222
+ """
223
+
224
+ # =========================================================
225
+ # HELPERS
226
+ # =========================================================
227
def preprocess_text(text):
    """Trim *text* and collapse every run of whitespace to a single space.

    Falsy input (``None`` or an empty string) yields ``""``.
    """
    if text:
        trimmed = text.strip()
        return re.sub(r"\s+", " ", trimmed)
    return ""
231
+
232
+
233
def check_backend_health(url="http://127.0.0.1:8000/backend/v1/health", timeout=3):
    """Report whether the backend health endpoint answers with HTTP 200.

    Args:
        url: Health-check endpoint to probe; defaults to the local backend.
        timeout: Seconds to wait for a response before giving up.

    Returns:
        A display string: the "reachable" message when the endpoint responds
        with status 200, otherwise the "terminated" message.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connection refused, timeout, bad URL, ...: report the backend as
        # down instead of letting the UI fail.
        return "🔴 Backend Terminated"
    if response.status_code == 200:
        return "🟢 Backend Server Reachable"
    return "🔴 Backend Terminated"
241
+
242
+
243
def convert_to_api_payload(record: dict, max_fields=None) -> dict:
    """Convert a flat form record into the nested payload used for matching.

    Scalar fields are copied as-is (missing keys become ``""``). Numbered
    ``phone_<i>`` / ``email_<i>`` entries are gathered into lists, numbered
    ``addressline_<i>`` / ``city_<i>`` / ``state_<i>`` / ``zipcode_<i>``
    entries are grouped into address dicts, and every remaining non-blank
    key is placed under ``custom_fields`` as a string.

    Args:
        record: Flat mapping of field name to value.
        max_fields: Number of numbered slots (0 .. max_fields - 1) to scan.
            Defaults to the module-level ``MAX_FIELDS``.

    Returns:
        A dict with the scalar fields plus ``phones``, ``emails``,
        ``addresses`` and ``custom_fields``.
    """
    slot_count = MAX_FIELDS if max_fields is None else max_fields

    scalar_fields = (
        "name", "firstname", "middlename", "lastname",
        "mothername", "fathername", "spousename", "othername",
        "dob", "gender",
        "AADHAR", "pan", "licenseid", "passportid", "voterid",
        "companyname", "parentcompanyname",
    )
    address_parts = ("addressline", "city", "state", "zipcode")

    payload = {field: record.get(field, "") for field in scalar_fields}
    payload["phones"] = []
    payload["emails"] = []
    payload["addresses"] = []
    payload["custom_fields"] = {}

    def has_text(value):
        """Return True when *value* is neither None nor blank."""
        return value is not None and str(value).strip() != ""

    for i in range(slot_count):
        phone = record.get(f"phone_{i}", "")
        if phone:
            payload["phones"].append(str(phone))

        email = record.get(f"email_{i}", "")
        if email:
            payload["emails"].append(str(email))

        address = {part: record.get(f"{part}_{i}", "") for part in address_parts}
        # Emit the slot only when at least one component carries text. The
        # form always supplies every slot key, so testing for key presence
        # would append a blank address for each unused slot.
        if any(has_text(component) for component in address.values()):
            payload["addresses"].append(address)

    slot_prefixes = tuple(f"{part}_" for part in address_parts) + ("phone_", "email_")

    for key, value in record.items():
        key_str = str(key)
        if key_str in scalar_fields or key_str.startswith(slot_prefixes):
            continue
        if value and str(value).strip():
            payload["custom_fields"][key_str] = str(value)

    return payload
300
+
301
+
302
+ # =========================================================
303
+ # MATCH FUNCTION (called by the Run button)
304
+ # =========================================================
305
def run_match(
    # ── Record 1 personal ──
    r1_name, r1_firstname, r1_middlename, r1_lastname,
    r1_mothername, r1_fathername, r1_spousename, r1_othername,
    r1_dob, r1_gender,
    # ── Record 1 identifiers ──
    r1_aadhar, r1_pan, r1_licenseid, r1_passportid, r1_voterid,
    # ── Record 1 addresses (5 slots) ──
    r1_addr0_line, r1_addr0_city, r1_addr0_state, r1_addr0_zip,
    r1_addr1_line, r1_addr1_city, r1_addr1_state, r1_addr1_zip,
    r1_addr2_line, r1_addr2_city, r1_addr2_state, r1_addr2_zip,
    r1_addr3_line, r1_addr3_city, r1_addr3_state, r1_addr3_zip,
    r1_addr4_line, r1_addr4_city, r1_addr4_state, r1_addr4_zip,
    # ── Record 1 phones (5 slots) ──
    r1_phone0, r1_phone1, r1_phone2, r1_phone3, r1_phone4,
    # ── Record 1 emails (5 slots) ──
    r1_email0, r1_email1, r1_email2, r1_email3, r1_email4,
    # ── Record 1 employment ──
    r1_company, r1_parent_company,
    # ── Record 1 custom fields (5 slots) ──
    r1_cf0_name, r1_cf0_val,
    r1_cf1_name, r1_cf1_val,
    r1_cf2_name, r1_cf2_val,
    r1_cf3_name, r1_cf3_val,
    r1_cf4_name, r1_cf4_val,

    # ── Record 2 personal ──
    r2_name, r2_firstname, r2_middlename, r2_lastname,
    r2_mothername, r2_fathername, r2_spousename, r2_othername,
    r2_dob, r2_gender,
    # ── Record 2 identifiers ──
    r2_aadhar, r2_pan, r2_licenseid, r2_passportid, r2_voterid,
    # ── Record 2 addresses (5 slots) ──
    r2_addr0_line, r2_addr0_city, r2_addr0_state, r2_addr0_zip,
    r2_addr1_line, r2_addr1_city, r2_addr1_state, r2_addr1_zip,
    r2_addr2_line, r2_addr2_city, r2_addr2_state, r2_addr2_zip,
    r2_addr3_line, r2_addr3_city, r2_addr3_state, r2_addr3_zip,
    r2_addr4_line, r2_addr4_city, r2_addr4_state, r2_addr4_zip,
    # ── Record 2 phones (5 slots) ──
    r2_phone0, r2_phone1, r2_phone2, r2_phone3, r2_phone4,
    # ── Record 2 emails (5 slots) ──
    r2_email0, r2_email1, r2_email2, r2_email3, r2_email4,
    # ── Record 2 employment ──
    r2_company, r2_parent_company,
    # ── Record 2 custom fields (5 slots) ──
    r2_cf0_name, r2_cf0_val,
    r2_cf1_name, r2_cf1_val,
    r2_cf2_name, r2_cf2_val,
    r2_cf3_name, r2_cf3_val,
    r2_cf4_name, r2_cf4_val,
):
    """Match two records built from the form inputs and return JSON text.

    The positional parameters are the form values for record 1 followed by
    the same set for record 2: personal fields, identifiers, five address
    slots, five phones, five emails, employment, and five custom
    (name, value) pairs.

    Returns:
        A JSON string (indent=2) with ``overall_decision``, ``reason`` and
        ``field_results``; if anything raises while converting or matching,
        a JSON string of the form ``{"error": "<message>"}`` instead.
    """
    def build_record(
        name, firstname, middlename, lastname,
        mothername, fathername, spousename, othername, dob, gender,
        aadhar, pan, licenseid, passportid, voterid,
        addr_lines, phones, emails,
        company, parent_company,
        custom_fields_pairs,
    ):
        """Flatten one record's form values into a single flat dict."""
        rec = {
            "name": name, "firstname": firstname, "middlename": middlename,
            "lastname": lastname, "mothername": mothername, "fathername": fathername,
            "spousename": spousename, "othername": othername, "dob": dob, "gender": gender,
            "AADHAR": aadhar, "pan": pan, "licenseid": licenseid,
            "passportid": passportid, "voterid": voterid,
            "companyname": company, "parentcompanyname": parent_company,
        }
        for i, (line, city, state, zipcode) in enumerate(addr_lines):
            rec[f"addressline_{i}"] = line
            rec[f"city_{i}"] = city
            rec[f"state_{i}"] = state
            rec[f"zipcode_{i}"] = zipcode
        for i, ph in enumerate(phones):
            rec[f"phone_{i}"] = ph
        for i, em in enumerate(emails):
            rec[f"email_{i}"] = em
        # NOTE(review): a custom field whose name equals a key already set
        # above (e.g. "name" or "phone_0") overwrites that entry.
        for cf_name, cf_val in custom_fields_pairs:
            if cf_name and cf_name.strip():
                rec[cf_name.strip()] = cf_val
        return rec

    r1 = build_record(
        r1_name, r1_firstname, r1_middlename, r1_lastname,
        r1_mothername, r1_fathername, r1_spousename, r1_othername, r1_dob, r1_gender,
        r1_aadhar, r1_pan, r1_licenseid, r1_passportid, r1_voterid,
        [
            (r1_addr0_line, r1_addr0_city, r1_addr0_state, r1_addr0_zip),
            (r1_addr1_line, r1_addr1_city, r1_addr1_state, r1_addr1_zip),
            (r1_addr2_line, r1_addr2_city, r1_addr2_state, r1_addr2_zip),
            (r1_addr3_line, r1_addr3_city, r1_addr3_state, r1_addr3_zip),
            (r1_addr4_line, r1_addr4_city, r1_addr4_state, r1_addr4_zip),
        ],
        [r1_phone0, r1_phone1, r1_phone2, r1_phone3, r1_phone4],
        [r1_email0, r1_email1, r1_email2, r1_email3, r1_email4],
        r1_company, r1_parent_company,
        [
            (r1_cf0_name, r1_cf0_val), (r1_cf1_name, r1_cf1_val),
            (r1_cf2_name, r1_cf2_val), (r1_cf3_name, r1_cf3_val),
            (r1_cf4_name, r1_cf4_val),
        ],
    )

    r2 = build_record(
        r2_name, r2_firstname, r2_middlename, r2_lastname,
        r2_mothername, r2_fathername, r2_spousename, r2_othername, r2_dob, r2_gender,
        r2_aadhar, r2_pan, r2_licenseid, r2_passportid, r2_voterid,
        [
            (r2_addr0_line, r2_addr0_city, r2_addr0_state, r2_addr0_zip),
            (r2_addr1_line, r2_addr1_city, r2_addr1_state, r2_addr1_zip),
            (r2_addr2_line, r2_addr2_city, r2_addr2_state, r2_addr2_zip),
            (r2_addr3_line, r2_addr3_city, r2_addr3_state, r2_addr3_zip),
            (r2_addr4_line, r2_addr4_city, r2_addr4_state, r2_addr4_zip),
        ],
        [r2_phone0, r2_phone1, r2_phone2, r2_phone3, r2_phone4],
        [r2_email0, r2_email1, r2_email2, r2_email3, r2_email4],
        r2_company, r2_parent_company,
        [
            (r2_cf0_name, r2_cf0_val), (r2_cf1_name, r2_cf1_val),
            (r2_cf2_name, r2_cf2_val), (r2_cf3_name, r2_cf3_val),
            (r2_cf4_name, r2_cf4_val),
        ],
    )

    try:
        r1_payload = convert_to_api_payload(r1)
        r2_payload = convert_to_api_payload(r2)

        rec1 = EntityRecord(**r1_payload)
        rec2 = EntityRecord(**r2_payload)

        # Matching is done in-process via perform_match rather than over HTTP.
        result_data = perform_match(rec1, rec2, mode="embedding")

        result = {
            "overall_decision": result_data["overall_decision"],
            "reason": result_data["reason"],
            "field_results": result_data["field_scores"],
        }
        return json.dumps(result, indent=2)

    except Exception as e:
        # UI boundary: surface any failure as JSON text in the result box
        # instead of raising into the Gradio event handler.
        return json.dumps({"error": str(e)}, indent=2)
447
+
448
+ # =========================================================
449
+ # UI BUILDER HELPERS
450
+ # =========================================================
451
def section_card(title: str, icon: str = ""):
    """Return the opening HTML of a section card.

    The markup opens three ``<div>`` elements (card, header, body) and closes
    only the header, so the caller is expected to emit the two closing tags.

    Args:
        title: Text shown in the card header.
        icon: Optional text/emoji placed before the title.
    """
    header = f'<div class="section-header-gradient">{icon}&nbsp;&nbsp;{title}</div>'
    pieces = [
        "",
        '<div class="section-card">',
        header,
        '<div class="section-body">',
        "",
    ]
    return "\n".join(pieces)
458
+
459
def personal_fields(prefix):
    """Create the personal-detail textboxes for one record.

    Five rows of two textboxes each are rendered; every textbox gets the
    element id ``f"{prefix}_<suffix>"``.

    Returns:
        List of the ten textboxes in the order
        [name, firstname, middlename, lastname, mothername, fathername,
        spousename, othername, dob, gender].
    """
    # One inner tuple per on-screen row: (label, placeholder, elem_id suffix).
    rows = (
        (("Full Name", "Enter full name", "name"),
         ("First Name", "Enter first name", "firstname")),
        (("Middle Name", "Enter middle name", "middlename"),
         ("Last Name", "Enter last name", "lastname")),
        (("Mother's Name", "Enter mother's name", "mothername"),
         ("Father's Name", "Enter father's name", "fathername")),
        (("Spouse's Name", "Enter spouse's name", "spousename"),
         ("Other Name", "Enter other name", "othername")),
        (("Date of Birth", "YYYY-MM-DD", "dob"),
         ("Gender", "Male/Female/Other", "gender")),
    )

    widgets = []
    for row in rows:
        with gr.Row():
            for label, hint, suffix in row:
                widgets.append(
                    gr.Textbox(label=label, placeholder=hint, elem_id=f"{prefix}_{suffix}")
                )
    return widgets
482
+
483
def identifier_fields(prefix):
    """Create the government-identifier textboxes for one record.

    Returns:
        List of five textboxes in the order
        [aadhar, pan, licenseid, passportid, voterid].
    """
    def _box(label, hint, suffix):
        # Every identifier box follows the same id scheme: "<prefix>_<suffix>".
        return gr.Textbox(label=label, placeholder=hint, elem_id=f"{prefix}_{suffix}")

    with gr.Row():
        aadhar = _box("Aadhar Number", "Enter Aadhar number", "aadhar")
        pan = _box("PAN Number", "Enter PAN number", "pan")
    with gr.Row():
        license_ = _box("License Number", "Enter license number", "license")
        passport = _box("Passport Number", "Enter passport number", "passport")
    with gr.Row():
        voter_id = _box("Voter ID", "Enter voter ID", "voterid")
        gr.HTML("")  # empty spacer so the last row still has two cells

    return [aadhar, pan, license_, passport, voter_id]
498
+
499
def address_fields(prefix, slot):
    """Create the widgets for a single address slot.

    Args:
        prefix: Record prefix used in element ids (e.g. ``"r1"``).
        slot: Zero-based slot index; slot 0 is labelled "Primary Address",
            later slots "Address <slot+1>".

    Returns:
        Tuple ``(line, city, state, zipcode)`` of textboxes.
    """
    heading = f"Address {slot+1}" if slot != 0 else "Primary Address"
    id_base = f"{prefix}_addr{slot}"

    with gr.Group():
        gr.HTML(f'<div class="subsection-label">{heading}</div>')
        line = gr.Textbox(
            label="Street Address",
            placeholder="Street, Building, Area",
            elem_id=f"{id_base}_line",
        )
        with gr.Row():
            city = gr.Textbox(label="City", placeholder="Enter city", elem_id=f"{id_base}_city")
            state = gr.Textbox(label="State", placeholder="Enter state", elem_id=f"{id_base}_state")
            zipcode = gr.Textbox(
                label="Pincode",
                placeholder="6-digit postal code",
                elem_id=f"{id_base}_zip",
            )
    return line, city, state, zipcode
510
+
511
def contact_fields(prefix):
    """Create five phone and five email textboxes for one record.

    Returns:
        Tuple ``(phone_inputs, email_inputs)``; each is a list of five
        textboxes with element ids ``<prefix>_phone0..4`` / ``<prefix>_email0..4``.
    """
    gr.HTML('<div class="subsection-label">πŸ“ž Phone Numbers</div>')
    phone_inputs = [
        gr.Textbox(
            label=f"Phone {number}",
            placeholder="Enter phone number",
            elem_id=f"{prefix}_phone{number - 1}",
        )
        for number in range(1, 6)
    ]

    gr.HTML('<hr class="addr-divider"><div class="subsection-label">βœ‰οΈ Email Addresses</div>')
    email_inputs = [
        gr.Textbox(
            label=f"Email {number}",
            placeholder="Enter email address",
            elem_id=f"{prefix}_email{number - 1}",
        )
        for number in range(1, 6)
    ]
    return phone_inputs, email_inputs
523
+
524
def employment_fields(prefix):
    """Create the company and parent-company textboxes in a single row.

    Returns:
        Tuple ``(company, parent_company)`` of textboxes.
    """
    specs = (
        ("Company Name", "Enter company name", "company"),
        ("Parent Company Name", "Enter parent company name", "pcompany"),
    )
    with gr.Row():
        company, parent_company = (
            gr.Textbox(label=label, placeholder=hint, elem_id=f"{prefix}_{suffix}")
            for label, hint, suffix in specs
        )
    return company, parent_company
529
+
530
def custom_field_slots(prefix):
    """Create five (field name, field value) textbox pairs, one pair per row.

    Returns:
        List of five ``(name_widget, value_widget)`` tuples.
    """
    gr.HTML('<div class="subsection-label">Custom Fields (up to 5)</div>')

    pairs = []
    for number in range(1, 6):
        slot = number - 1  # element ids are zero-based, labels one-based
        with gr.Row():
            name_box = gr.Textbox(
                label=f"Field Name {number}",
                placeholder="e.g. MemberID",
                elem_id=f"{prefix}_cf{slot}_name",
            )
            value_box = gr.Textbox(
                label=f"Field Value {number}",
                placeholder="Value",
                elem_id=f"{prefix}_cf{slot}_val",
            )
        pairs.append((name_box, value_box))
    return pairs
540
+
541
+
542
+ # =========================================================
543
+ # BUILD THE GRADIO APP
544
+ # =========================================================
545
def _section_open(icon, title):
    """Return the opening markup of a section card with the given header."""
    return (
        '<div class="section-card"><div class="section-header-gradient">'
        f'{icon}&nbsp;&nbsp;{title}</div><div class="section-body">'
    )


def _record_column(prefix, title):
    """Render one record's input column and return its widgets.

    The widgets are rendered in UI order (personal, identifiers + custom
    fields, addresses, contact, employment) but returned in the positional
    order run_match() expects for a single record:
    personal (10) + ids (5) + addresses (20) + phones (5) + emails (5)
    + [company, parent company] + custom name/value pairs flattened (10).
    """
    section_close = '</div></div>'

    with gr.Column():
        gr.HTML(f'<div class="record-header">{title}</div>')

        gr.HTML(_section_open("πŸ‘€", "PERSONAL DETAILS"))
        personal = personal_fields(prefix)
        gr.HTML(section_close)

        # Custom fields are rendered inside the Equalities card.
        gr.HTML(_section_open("πŸͺͺ", "EQUALITIES"))
        ids = identifier_fields(prefix)
        custom_pairs = custom_field_slots(prefix)
        gr.HTML(section_close)

        gr.HTML(_section_open("πŸ“", "ADDRESS DETAILS"))
        addresses = []
        for slot in range(5):
            addresses.extend(address_fields(prefix, slot))
            if slot < 4:  # divider between slots, not after the last one
                gr.HTML('<hr class="addr-divider">')
        gr.HTML(section_close)

        gr.HTML(_section_open("πŸ“±", "CONTACT INFORMATION"))
        phones, emails = contact_fields(prefix)
        gr.HTML(section_close)

        gr.HTML(_section_open("πŸ’Ό", "EMPLOYMENT DETAILS"))
        company, parent_company = employment_fields(prefix)
        gr.HTML(section_close)

    return (
        personal
        + ids
        + addresses
        + phones
        + emails
        + [company, parent_company]
        + [widget for pair in custom_pairs for widget in pair]
    )


def build_app():
    """Assemble the Gradio Blocks UI for two-record matching.

    Layout: header, backend status banner, two side-by-side record columns,
    a run button and a JSON result pane. Clicking the button calls
    run_match() with 114 inputs (57 per record, record 1 first).

    Returns:
        The constructed ``gr.Blocks`` instance (not launched).
    """
    with gr.Blocks(css=CUSTOM_CSS, title="GEN AI Record Level Matching") as demo:

        # Page header
        gr.HTML("""
        <div class="app-header">
        <h1>Record Level Matching Using Embedding Models</h1>
        <p>Enter details for two records below and click "Run Record Match" to see the matching result</p>
        </div>
        """)

        # Backend status banner: the value is a callable, re-evaluated every 30 s.
        gr.HTML(value=check_backend_health, every=30)

        # Two-column record layout
        with gr.Row(equal_height=False):
            r1_inputs = _record_column("r1", "Record 1")
            r2_inputs = _record_column("r2", "Record 2")

        run_btn = gr.Button("πŸ” RUN RECORD MATCH", variant="primary", elem_id="run-match-btn")

        gr.HTML('<div class="result-box"><div class="result-header">Matching Result (Backend API)</div></div>')
        result_output = gr.Code(label="Result JSON", language="json", lines=20)

        # Input order must match run_match()'s positional signature.
        run_btn.click(
            fn=run_match,
            inputs=r1_inputs + r2_inputs,
            outputs=result_output,
        )

    return demo
666
+
667
+
668
+ # =========================================================
669
+ # ENTRY POINT
670
+ # =========================================================
671
if __name__ == "__main__":
    # Build the UI and start the Gradio server only when run as a script.
    app = build_app()
    app.launch()
frontend/assests/Logo icon_color.png ADDED
none.webp ADDED
note.txt ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Current Version of this application features:
2
+
3
+ 1. Dual mode: embedding mode and LLM mode
4
+ 2. Data preprocessing, with reference data retrieved from CSV files
5
+ 3. Pincode Logic has been updated
6
+
7
+
8
+
9
+ Objective:
10
+ This repository contains the implementation of a **GenAI-based Entity Matching** system. It supports a dual-mode architecture with a FastAPI backend, a Streamlit frontend, and a collection of services for data processing and model interaction.
11
+
12
+
13
+ Features:
14
+
15
+ - **Flexible matching service** implemented in `backend/matching_service.py`.
16
+ - **Modular data models** defined in `backend/models.py`.
17
+ - **Streamlit frontend** for quick experimentation (`frontend/app_streamlit.py`).
18
+ - **Configurable rules and LLM model integration** under `services/`.
19
+ - **Extensive test suite** located in `tests/`.
20
+ - **Configuration files** and property management in `backend/config` and `services/config.py`.
21
+
22
+
23
+ Active endpoints :
24
+
25
+ POST /backend/v1/match – Match a single pair of records
26
+ POST /backend/v1/match/batch – Match multiple pairs (multithreaded implementation)
27
+ GET /backend/v1/health – Full health check (CSV data, models, LLM)
28
+ GET /backend/v1/health/llm – LLM server health check only
29
+
30
+
31
+
32
+
33
+ To Run the application :
34
+
35
+ for embedding mode:
36
+ models will be loaded when we initiate the server
37
+
38
+ for llm mode:
39
+ paste the URL of the running LLM server into the `base-url` entry of common.properties
40
+
41
+ for frontend :
42
+
43
+ python -m streamlit run frontend/app_streamlit.py
44
+
45
+
46
+ for backend:
47
+
48
+ python -m uvicorn backend.server:app
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy
2
+ pandas
3
+ python-dateutil
4
+ pytz
5
+ regex
6
+ scipy
7
+ Pillow
8
+ gradio>=4.44.0
9
+
10
+ rapidfuzz==3.9.3
11
+ scikit-learn==1.5.2
12
+ sentence-transformers==2.7.0
13
+ pgeocode==0.5.0
14
+
15
+
16
+ openai
17
+ torch --index-url https://download.pytorch.org/whl/cpu
18
+ fastapi
19
+ uvicorn
20
+ requests
services/__pycache__/config.cpython-310.pyc ADDED
Binary file (3.2 kB). View file
 
services/__pycache__/config.cpython-312.pyc ADDED
Binary file (6.51 kB). View file
 
services/__pycache__/llm_model.cpython-310.pyc ADDED
Binary file (19.1 kB). View file
 
services/__pycache__/llm_model.cpython-312.pyc ADDED
Binary file (24.2 kB). View file
 
services/__pycache__/model.cpython-310.pyc ADDED
Binary file (24.8 kB). View file
 
services/__pycache__/model.cpython-312.pyc ADDED
Binary file (29.6 kB). View file
 
services/__pycache__/rules.cpython-310.pyc ADDED
Binary file (70.4 kB). View file
 
services/__pycache__/rules.cpython-312.pyc ADDED
Binary file (55.3 kB). View file
 
services/address_matcher.py ADDED
@@ -0,0 +1,722 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ address_matcher.py
3
+ ==================
4
+ Enhanced Indian address parser + matcher.
5
+
6
+ Reuses existing infrastructure from rules.py and model.py:
7
+ - clean_text, strip_non_alphanumeric, deduplicate_consecutive_tokens (rules.py)
8
+ - standardize_city, standardize_state (rules.py)
9
+ - validate_and_normalize_pincode, pincode_similarity_india (rules.py)
10
+ - extract_address_components (rules.py) ← house/flat/apt/street
11
+ - roman_to_number (rules.py)
12
+ - normalize_and_deduplicate_address (rules.py)
13
+ - hno_variation_df, city_prev_pres_df, state_name_standard_df,
14
+ pin_city_state_df, CITY_MAPPING, STATE_MAPPING (config via rules.py)
15
+ - calculate_semantic_similarity, match_entities (model.py)
16
+ - ADDRESS_MODEL_WEIGHTS (config)
17
+
18
+ New additions in this file
19
+ --------------------------
20
+ 1. standardize_address_line() – token-level hno / city / state variation replacement
21
+ 2. extract_geo_anchors() – PIN / city / state from a *single* address string
22
+ 3. extract_extended_components() – block, sector, ward, apt-name, locality on top of
23
+ the existing extract_address_components()
24
+ 4. match_address_lines() – full 3-stage pipeline returning score + breakdown
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import re
30
+ import math
31
+ import logging
32
+ from typing import Dict, Optional, Tuple
33
+
34
+ logger = logging.getLogger("address_matcher")
35
+
36
+ # ── existing imports (your project layout) ───────────────────────────────────
37
+ from services.rules import (
38
+ clean_text,
39
+ strip_non_alphanumeric,
40
+ deduplicate_consecutive_tokens,
41
+ standardize_city,
42
+ standardize_state,
43
+ validate_and_normalize_pincode,
44
+ pincode_similarity_india,
45
+ extract_address_components,
46
+ roman_to_number,
47
+ normalize_and_deduplicate_address,
48
+ replace_with_standard,
49
+ lookup_from_mapping,
50
+ )
51
+ from services.config import (
52
+ hno_variation_df,
53
+ city_prev_pres_df,
54
+ state_name_standard_df,
55
+ pin_city_state_df,
56
+ CITY_MAPPING,
57
+ STATE_MAPPING,
58
+ ADDRESS_MODEL_WEIGHTS,
59
+ )
60
+ from services.model import (
61
+ calculate_semantic_similarity,
62
+ match_entities,
63
+ preprocess_for_matching,
64
+ )
65
+
66
+ # ─────────────────────────────────────────────────────────────────────────────
67
+ # 1. CONSTANTS
68
+ # ─────────────────────────────────────────────────────────────────────────────
69
+
70
+ # Scoring constants (requirement spec)
71
+ GEO_MISMATCH_SCORE = 20 # hard cap when city/state/PIN mismatch detected
72
+ GEO_MATCH_BOOST = 10 # per matching geo component
73
+ ID_MATCH_BOOST = 30 # numeric identifier components match (gated by embed > 40)
74
+ ID_MISMATCH_PENALTY = 30 # numeric identifier mismatch penalty
75
+ EMBED_GATE_THRESHOLD = 40 # embedding score must exceed this to award ID_MATCH_BOOST
76
+
77
+ # Semantic model to use for address residual comparison
78
+ ADDRESS_EMBED_MODEL = "model2" # change to "model1" if preferred
79
+
80
+ # ── Extended component regex patterns ────────────────────────────────────────
81
+ _BLOCK_RE = re.compile(
82
+ r'\b(?:block|blk|bl)\.?\s*(?:no\.?\s*)?([a-z0-9]{1,4})\b', re.I)
83
+ _SECTOR_RE = re.compile(
84
+ r'\b(?:sector|sec)\.?\s*(?:no\.?\s*)?(\d{1,3}[a-z]?)\b', re.I)
85
+ _WARD_RE = re.compile(
86
+ r'\b(?:ward)\.?\s*(?:no\.?\s*)?(\d{1,3}[a-z]?)\b', re.I)
87
+ _PHASE_RE = re.compile(
88
+ r'\b(?:phase|ph)\.?\s*(?:no\.?\s*)?(\d{1,2})\b', re.I)
89
+ _PLOT_RE = re.compile(
90
+ r'\b(?:plot|plt)\.?\s*(?:no\.?\s*)?([a-z0-9]{1,6}(?:[/-][a-z0-9]{1,4})?)\b', re.I)
91
+
92
+ # PIN: 6 digits, first digit 1-9 (valid Indian PIN range)
93
+ _PIN_RE = re.compile(r'(?<!\d)([1-9]\d{5})(?!\d)')
94
+
95
+ # City / State boundary markers (help isolate tail of address)
96
+ _TAIL_SEP = re.compile(r'[-–,]\s*')
97
+
98
+ # Hard identifier component keys – mismatch on ANY of these β†’ -30 penalty
99
+ HARD_ID_KEYS = ('house_number', 'flat_number', 'block', 'sector', 'ward', 'plot', 'phase')
100
+
101
+ # Geo-anchor keys – mismatch on ANY of these β†’ score = 20 early exit
102
+ GEO_KEYS = ('pin', 'city', 'state')
103
+
104
+
105
+ # ─────────────────────────────────────────────────────────────────────────────
106
+ # 2. STEP 1 – ADDRESS STANDARDIZATION
107
+ # ────────────────────────────────────────────────────────��────────────────────
108
+
109
+ def _build_hno_lookup() -> Dict[str, str]:
110
+ """
111
+ Build a token-level lookup from hno_variation_df loaded in config.
112
+ Cached on first call via function attribute.
113
+ Expected columns: 'variation', 'standard'
114
+ """
115
+ if hasattr(_build_hno_lookup, '_cache'):
116
+ return _build_hno_lookup._cache
117
+
118
+ lookup: Dict[str, str] = {}
119
+ if hno_variation_df is not None and not hno_variation_df.empty:
120
+ df = hno_variation_df.copy()
121
+ df.columns = df.columns.str.lower()
122
+ for _, row in df.iterrows():
123
+ var = str(row.get('variation', '')).strip().lower()
124
+ std = str(row.get('standard', '')).strip().lower()
125
+ if var and std:
126
+ lookup[var] = std
127
+
128
+ _build_hno_lookup._cache = lookup
129
+ return lookup
130
+
131
+
132
+ def standardize_address_line(address: str) -> str:
133
+ """
134
+ Stage 0 – token-level standardization of a raw address string.
135
+
136
+ Steps (in order):
137
+ 1. clean_text() – strips HTML, control chars, lowercases
138
+ 2. roman_to_number() – "Sector IV" β†’ "Sector 4"
139
+ 3. Replace h_no variation tokens (h.no / hno / h no / door no / d.no …)
140
+ 4. Standardize city tokens via CITY_MAPPING / city_prev_pres_df
141
+ 5. Standardize state tokens via STATE_MAPPING / state_name_standard_df
142
+ 6. Collapse duplicate consecutive tokens
143
+ 7. Strip trailing punctuation noise
144
+
145
+ Returns: cleaned, lower-case address string ready for component extraction.
146
+ """
147
+ if not address:
148
+ return ""
149
+
150
+ text = clean_text(str(address)) # step 1
151
+ text = roman_to_number(text) # step 2
152
+
153
+ # step 3 – hno variation token replacement (word-boundary safe)
154
+ hno_lookup = _build_hno_lookup()
155
+ if hno_lookup:
156
+ tokens = text.split()
157
+ replaced = []
158
+ i = 0
159
+ while i < len(tokens):
160
+ # try 2-token phrases first (e.g. "h no", "door no")
161
+ two = (tokens[i] + ' ' + tokens[i + 1]).lower() if i + 1 < len(tokens) else ''
162
+ if two in hno_lookup:
163
+ replaced.append(hno_lookup[two])
164
+ i += 2
165
+ continue
166
+ one = tokens[i].lower().rstrip('.')
167
+ if one in hno_lookup:
168
+ replaced.append(hno_lookup[one])
169
+ else:
170
+ replaced.append(tokens[i])
171
+ i += 1
172
+ text = ' '.join(replaced)
173
+
174
+ # steps 4+5 – city / state token replacement (applied to whole tokens)
175
+ words = text.split()
176
+ for idx, word in enumerate(words):
177
+ # try 2-word combos for multi-word city/state names
178
+ if idx + 1 < len(words):
179
+ two_word = word + ' ' + words[idx + 1]
180
+ city_std = standardize_city(two_word)
181
+ if city_std and city_std.lower() != two_word.lower():
182
+ words[idx] = city_std
183
+ words[idx + 1] = ''
184
+ continue
185
+ state_std = standardize_state(two_word)
186
+ if state_std and state_std.lower() != two_word.lower():
187
+ words[idx] = state_std
188
+ words[idx + 1] = ''
189
+ continue
190
+ single_city = standardize_city(word)
191
+ if single_city and single_city.lower() != word.lower():
192
+ words[idx] = single_city
193
+ continue
194
+ single_state = standardize_state(word)
195
+ if single_state and single_state.lower() != word.lower():
196
+ words[idx] = single_state
197
+
198
+ text = ' '.join(w for w in words if w)
199
+
200
+ # step 6+7 – dedup consecutive, strip stray punctuation
201
+ text = deduplicate_consecutive_tokens(text)
202
+ text = re.sub(r'\s+', ' ', text).strip(' ,.-')
203
+ return text
204
+
205
+
206
+ # ─────────────────────────────────────────────────────────────────────────────
207
+ # 3. STEP 2 – GEO-ANCHOR EXTRACTION (PIN / city / state)
208
+ # ─────────────────────────────────────────────────────────────────────────────
209
+
210
+ def _pin_from_text(text: str) -> Optional[str]:
211
+ """Extract and validate first Indian PIN from text."""
212
+ for m in _PIN_RE.finditer(text):
213
+ candidate = m.group(1)
214
+ normalized = validate_and_normalize_pincode(candidate)
215
+ if normalized:
216
+ return normalized
217
+ return None
218
+
219
+
220
+ def _city_from_text(text: str) -> Optional[str]:
221
+ """
222
+ Token scan for city names.
223
+ Tries 2-word and 1-word combinations against CITY_MAPPING / city_prev_pres_df.
224
+ Returns the canonical (standardized) city name or None.
225
+ """
226
+ tokens = text.split()
227
+ for i in range(len(tokens)):
228
+ # 2-word
229
+ if i + 1 < len(tokens):
230
+ candidate = tokens[i] + ' ' + tokens[i + 1]
231
+ std = standardize_city(candidate)
232
+ if std and std.lower() != candidate.lower():
233
+ return std.lower()
234
+ # also accept direct match if it IS a known city already
235
+ known = lookup_from_mapping(candidate.upper(), CITY_MAPPING)
236
+ if known:
237
+ return known.lower()
238
+ # 1-word
239
+ std = standardize_city(tokens[i])
240
+ if std and std.lower() != tokens[i].lower():
241
+ return std.lower()
242
+ known = lookup_from_mapping(tokens[i].upper(), CITY_MAPPING)
243
+ if known:
244
+ return known.lower()
245
+ return None
246
+
247
+
248
+ def _state_from_text(text: str) -> Optional[str]:
249
+ """
250
+ Token scan for state names.
251
+ Tries 2-word and 1-word combinations.
252
+ """
253
+ tokens = text.split()
254
+ for i in range(len(tokens)):
255
+ if i + 1 < len(tokens):
256
+ candidate = tokens[i] + ' ' + tokens[i + 1]
257
+ std = standardize_state(candidate)
258
+ if std and std.lower() != candidate.lower():
259
+ return std.lower()
260
+ known = lookup_from_mapping(candidate.upper(), STATE_MAPPING)
261
+ if known:
262
+ return known.lower()
263
+ std = standardize_state(tokens[i])
264
+ if std and std.lower() != tokens[i].lower():
265
+ return std.lower()
266
+ known = lookup_from_mapping(tokens[i].upper(), STATE_MAPPING)
267
+ if known:
268
+ return known.lower()
269
+ return None
270
+
271
+
272
+ def _enrich_from_pincode(pin: str) -> Tuple[Optional[str], Optional[str]]:
273
+ """
274
+ Use pin_city_state_df first, then pgeocode as fallback,
275
+ to fill in city and state from a PIN code.
276
+ Returns (city, state) both lowercase or None.
277
+ """
278
+ city, state = None, None
279
+
280
+ # ── try local CSV first ────────────────────────────────────────────────
281
+ if pin_city_state_df is not None and not pin_city_state_df.empty:
282
+ df = pin_city_state_df.copy()
283
+ df.columns = df.columns.str.lower()
284
+ # expected columns: pincode / pin, city, state
285
+ pin_col = next((c for c in df.columns if 'pin' in c), None)
286
+ if pin_col:
287
+ row = df[df[pin_col].astype(str).str.zfill(6) == pin]
288
+ if not row.empty:
289
+ city_col = next((c for c in df.columns if 'city' in c), None)
290
+ state_col = next((c for c in df.columns if 'state' in c), None)
291
+ if city_col:
292
+ city = str(row.iloc[0][city_col]).strip().lower()
293
+ if state_col:
294
+ state = str(row.iloc[0][state_col]).strip().lower()
295
+ if city and state:
296
+ return standardize_city(city), standardize_state(state)
297
+
298
+ # ── pgeocode fallback ──────────────────────────────────────────────────
299
+ try:
300
+ import pgeocode
301
+ nomi = pgeocode.Nominatim('in')
302
+ result = nomi.query_postal_code(pin)
303
+ if result is not None and not result.empty:
304
+ raw_city = getattr(result, 'county_name', None)
305
+ raw_state = getattr(result, 'state_name', None)
306
+ if raw_city is not None:
307
+ cv = raw_city.values[0] if hasattr(raw_city, 'values') else raw_city
308
+ if cv and not (isinstance(cv, float) and math.isnan(cv)):
309
+ city = standardize_city(str(cv).strip().lower())
310
+ if raw_state is not None:
311
+ sv = raw_state.values[0] if hasattr(raw_state, 'values') else raw_state
312
+ if sv and not (isinstance(sv, float) and math.isnan(sv)):
313
+ state = standardize_state(str(sv).strip().lower())
314
+ except Exception:
315
+ pass
316
+
317
+ return city, state
318
+
319
+
320
+ def extract_geo_anchors(standardized_address: str) -> Dict[str, Optional[str]]:
321
+ """
322
+ Extract { pin, city, state } from a *standardized* address string.
323
+
324
+ Priority order:
325
+ 1. PIN extracted from text β†’ pgeocode/CSV fills city+state if absent
326
+ 2. City / state scanned directly from tokens
327
+ 3. Any remaining None fields stay None (component absent)
328
+ """
329
+ text = standardized_address
330
+
331
+ pin = _pin_from_text(text)
332
+ city = _city_from_text(text)
333
+ state = _state_from_text(text)
334
+
335
+ # Enrich city/state from PIN if either is still missing
336
+ if pin and (not city or not state):
337
+ pin_city, pin_state = _enrich_from_pincode(pin)
338
+ if not city and pin_city:
339
+ city = pin_city
340
+ if not state and pin_state:
341
+ state = pin_state
342
+
343
+ return {
344
+ 'pin': pin,
345
+ 'city': city,
346
+ 'state': state,
347
+ }
348
+
349
+
350
+ def _remove_geo_tokens(text: str, pin: Optional[str],
351
+ city: Optional[str], state: Optional[str]) -> str:
352
+ """
353
+ Strip extracted geo tokens from the address string so they don't
354
+ contaminate the residual that goes to the embedding model.
355
+ """
356
+ if pin:
357
+ text = re.sub(re.escape(pin), ' ', text)
358
+ if city:
359
+ text = re.sub(re.escape(city), ' ', text, flags=re.I)
360
+ if state:
361
+ text = re.sub(re.escape(state), ' ', text, flags=re.I)
362
+ return re.sub(r'\s+', ' ', text).strip(' ,.-')
363
+
364
+
365
+ # ─────────────────────────────────────────────────────────────────────────────
366
+ # 4. STEP 3 – EXTENDED COMPONENT EXTRACTION
367
+ # Wraps existing extract_address_components() and adds block/sector/ward/etc.
368
+ # ─────────────────────────────────────────────────────────────────────────────
369
+
370
+ def _extract_pattern(pattern: re.Pattern, text: str) -> Optional[str]:
371
+ """Return first group of first match, or None."""
372
+ m = pattern.search(text)
373
+ return m.group(1).strip().upper() if m else None
374
+
375
+
376
+ def _remove_pattern_match(pattern: re.Pattern, text: str) -> str:
377
+ """Remove the entire match (not just group 1) from text."""
378
+ return re.sub(pattern, ' ', text, count=1, flags=re.I)
379
+
380
+
381
+ def extract_extended_components(standardized_address: str,
382
+ geo: Dict[str, Optional[str]]) -> Dict:
383
+ """
384
+ Full component extraction pipeline.
385
+
386
+ Returns a dict with keys:
387
+ house_number, flat_number, apartment, street, ← from existing rules.py
388
+ block, sector, ward, phase, plot, ← new patterns
389
+ apartment_name, ← existing (apartment field)
390
+ residual ← leftover for embedding
391
+ """
392
+ # ── Step A: strip geo tokens before passing to rules extractor ────────
393
+ text = _remove_geo_tokens(
394
+ standardized_address,
395
+ geo.get('pin'), geo.get('city'), geo.get('state')
396
+ )
397
+
398
+ # ── Step B: existing extractor (house / flat / apartment / street) ────
399
+ base = extract_address_components(text)
400
+
401
+ house_no = base.get('house_number')
402
+ flat_no = base.get('flat_number')
403
+ apt_name = base.get('apartment') # apartment / building name
404
+ street = base.get('street')
405
+ remaining = base.get('remaining_address', text)
406
+
407
+ # ── Step C: extended patterns on the *remaining* text ─────────────────
408
+ block = _extract_pattern(_BLOCK_RE, remaining)
409
+ if block:
410
+ remaining = _remove_pattern_match(_BLOCK_RE, remaining)
411
+
412
+ sector = _extract_pattern(_SECTOR_RE, remaining)
413
+ if sector:
414
+ remaining = _remove_pattern_match(_SECTOR_RE, remaining)
415
+
416
+ ward = _extract_pattern(_WARD_RE, remaining)
417
+ if ward:
418
+ remaining = _remove_pattern_match(_WARD_RE, remaining)
419
+
420
+ phase = _extract_pattern(_PHASE_RE, remaining)
421
+ if phase:
422
+ remaining = _remove_pattern_match(_PHASE_RE, remaining)
423
+
424
+ plot = _extract_pattern(_PLOT_RE, remaining)
425
+ if plot:
426
+ remaining = _remove_pattern_match(_PLOT_RE, remaining)
427
+
428
+ # ── Step D: final cleanup of residual ─────────────────────────────────
429
+ residual = strip_non_alphanumeric(remaining)
430
+ residual = normalize_and_deduplicate_address(residual)
431
+ residual = re.sub(r'\s+', ' ', residual).strip()
432
+
433
+ return {
434
+ 'house_number': house_no,
435
+ 'flat_number': flat_no,
436
+ 'apartment_name': apt_name,
437
+ 'street': street,
438
+ 'block': block,
439
+ 'sector': sector,
440
+ 'ward': ward,
441
+ 'phase': phase,
442
+ 'plot': plot,
443
+ 'residual': residual,
444
+ }
445
+
446
+
447
+ # ─────────────────────────────────────────────────────────────────────────────
448
+ # 5. STAGE 1 – GEO-ANCHOR COMPARISON
449
+ # ─────────────────────────────────────────────────────────────────────────────
450
+
451
def _compare_geo(geo1: Dict, geo2: Dict) -> Tuple[Optional[int], int, Dict]:
    """
    Compare the geo-anchor components (PIN, city, state) of two parsed addresses.

    Components are checked in the order PIN, city, state.  For each one:
      * present in BOTH and different -> hard mismatch: return immediately
        with GEO_MISMATCH_SCORE and a boost of 0
      * present in BOTH and equal     -> geo_boost += GEO_MATCH_BOOST
      * absent in either              -> skipped (no boost, no penalty)

    PIN special case: unequal PINs are passed to pincode_similarity_india().
    A similarity_score of 60 or more is accepted as "metro-close" and earns
    half of GEO_MATCH_BOOST instead of triggering the early exit.

    City and state are compared after trimming whitespace and case-folding,
    so "Delhi " and "delhi" count as equal.

    Returns:
        (early_exit_score_or_None, geo_boost, details_dict)
    """

    def _norm(value) -> str:
        # Whitespace / case differences must never cause a hard mismatch.
        return str(value).strip().casefold()

    geo_boost = 0
    details: Dict = {}

    # ── PIN comparison ────────────────────────────────────────────────────
    p1, p2 = geo1.get('pin'), geo2.get('pin')
    if p1 and p2:
        if str(p1).strip() == str(p2).strip():
            geo_boost += GEO_MATCH_BOOST
            details['pin'] = f'match ({p1}) +{GEO_MATCH_BOOST}'
        else:
            # Guard against a helper that returns None for unknown PINs.
            pin_result = pincode_similarity_india(p1, p2) or {}
            sim = pin_result.get('similarity_score', 0) or 0
            if sim < 60:
                details['pin'] = (
                    f'MISMATCH ({p1} vs {p2}, sim={sim}) → exit={GEO_MISMATCH_SCORE}'
                )
                return GEO_MISMATCH_SCORE, 0, details
            # Same metro cluster: partial boost only.
            half_boost = GEO_MATCH_BOOST // 2
            geo_boost += half_boost
            details['pin'] = f'metro-close ({p1} vs {p2}, sim={sim}) +{half_boost}'

    # ── CITY and STATE comparison (identical rules) ───────────────────────
    for key in ('city', 'state'):
        v1, v2 = geo1.get(key), geo2.get(key)
        if not (v1 and v2):
            continue
        if _norm(v1) != _norm(v2):
            details[key] = f'MISMATCH ({v1} vs {v2}) → exit={GEO_MISMATCH_SCORE}'
            return GEO_MISMATCH_SCORE, 0, details
        geo_boost += GEO_MATCH_BOOST
        details[key] = f'match ({v1}) +{GEO_MATCH_BOOST}'

    return None, geo_boost, details
508
+
509
+
510
+ # ─────────────────────────────────────────────────────────────────────────────
511
+ # 6. STAGE 2 – HARD IDENTIFIER COMPONENT COMPARISON
512
+ # ─────────────────────────────────────────────────────────────────────────────
513
+
514
+ def _normalize_id(value: Optional[str]) -> Optional[str]:
515
+ """
516
+ Normalize a hard identifier value for comparison.
517
+ Strips whitespace, uppercase, removes separators (- / .).
518
+ '4-B', '4B', '4/B' β†’ '4B'
519
+ """
520
+ if not value:
521
+ return None
522
+ return re.sub(r'[\s\-/.]', '', str(value).strip().upper())
523
+
524
+
525
def _compare_identifiers(comp1: Dict, comp2: Dict,
                         embed_score: float) -> Tuple[int, Dict]:
    """
    Compare hard identifier components between two parsed addresses.

    Every key in HARD_ID_KEYS is normalized with _normalize_id() on both
    sides and then scored:
      * both present and equal     -> +ID_MATCH_BOOST, but only when
        embed_score > EMBED_GATE_THRESHOLD (otherwise recorded, no boost)
      * both present and different -> -ID_MISMATCH_PENALTY (never gated)
      * present on one side only   -> recorded as skipped, no score change
      * absent on both sides       -> ignored entirely

    Contributions accumulate across keys.

    Returns:
        (identifier_delta, details_dict) where details_dict maps each
        inspected key to a human-readable explanation.
    """
    id_delta = 0
    details: Dict = {}
    gate_open = embed_score > EMBED_GATE_THRESHOLD

    for key in HARD_ID_KEYS:
        v1 = _normalize_id(comp1.get(key))
        v2 = _normalize_id(comp2.get(key))

        if not (v1 or v2):
            continue
        if not (v1 and v2):
            details[key] = f'absent in one ({v1 or "–"} vs {v2 or "–"}) → skip'
            continue

        if v1 != v2:
            id_delta -= ID_MISMATCH_PENALTY
            details[key] = f'MISMATCH ({v1} vs {v2}) -{ID_MISMATCH_PENALTY}'
        elif gate_open:
            id_delta += ID_MATCH_BOOST
            details[key] = f'match ({v1}) +{ID_MATCH_BOOST}'
        else:
            # The gate is strict (>), so equality also ends up here.
            details[key] = (
                f'match ({v1}) but embed={embed_score:.1f} <= gate → no boost'
            )

    return id_delta, details
558
+
559
+
560
+ # ─────────────────────────────────────────────────────────────────────────────
561
+ # 7. STAGE 3 – EMBEDDING / RESIDUAL COMPARISON
562
+ # ─────────────────────────────────────────────────────────────────────────────
563
+
564
def _compute_embed_score(residual1: str, residual2: str) -> float:
    """
    Score two residual address strings with the semantic embedding model.

    Both residuals go through preprocess_for_matching() first.  If either is
    empty afterwards the score is 0.0.  Residuals shorter than 4 characters
    are scored with match_entities() (using ADDRESS_MODEL_WEIGHTS) because a
    pure embedding is unreliable on such short text.  The same fallback is
    used when the embedding call raises or yields NaN.

    Returns:
        A float in the range 0-100.
    """
    import math

    r1 = preprocess_for_matching(residual1)
    r2 = preprocess_for_matching(residual2)

    if not r1 or not r2:
        return 0.0

    if len(r1) < 4 or len(r2) < 4:
        return float(match_entities(r1, r2, weights=ADDRESS_MODEL_WEIGHTS))

    try:
        raw = float(calculate_semantic_similarity(ADDRESS_EMBED_MODEL, r1, r2))
    except Exception as e:  # best-effort: any model failure degrades to fuzzy
        logger.warning(
            "Embedding model error: %s; falling back to match_entities", e
        )
        return float(match_entities(r1, r2, weights=ADDRESS_MODEL_WEIGHTS))

    if math.isnan(raw):
        # min(100.0, nan) evaluates to 100.0, so an unchecked NaN would be
        # reported as a perfect match.
        logger.warning(
            "Embedding model returned NaN; falling back to match_entities"
        )
        return float(match_entities(r1, r2, weights=ADDRESS_MODEL_WEIGHTS))

    return max(0.0, min(100.0, raw * 100))
587
+
588
+
589
+ # ─────────────────────────────────────────────────────────────────────────────
590
+ # 8. MASTER PIPELINE – match_address_lines()
591
+ # ─────────────────────────────────────────────────────────────────────────────
592
+
593
def match_address_lines(addr1: str, addr2: str) -> Dict:
    """
    Run the staged address-matching pipeline on two raw address strings.

    Steps, in execution order:
      0. standardize_address_line() on both inputs
      1. extract_geo_anchors() on both, then _compare_geo(); a geo mismatch
         ends the pipeline at once with the score _compare_geo() returned
      2. extract_extended_components() on both (needs the geo anchors)
      3. _compute_embed_score() on the two 'residual' strings
      4. _compare_identifiers(), gated by the embed score from step 3

    Final score = clamp(embed_score + geo_boost + id_delta, 0, 100),
    rounded to two decimals.

    Returns a dict with the keys:
        score, early_exit, embed_score, geo_boost, id_delta,
        breakdown (per-stage detail), components_1, components_2
    On early exit embed_score/geo_boost/id_delta are zero and both
    components dicts are empty.

    NOTE(review): _compute_embed_score() returns 0.0 when either residual is
    empty, so two addresses that parse completely into components get no
    embedding contribution and no gated identifier boost — confirm intended.
    """
    # Stage 0 + Stage 1: standardize, pull geo anchors, compare them.
    std_a = standardize_address_line(addr1)
    std_b = standardize_address_line(addr2)
    geo_a = extract_geo_anchors(std_a)
    geo_b = extract_geo_anchors(std_b)
    exit_score, geo_boost, geo_detail = _compare_geo(geo_a, geo_b)

    breakdown: Dict = {
        'standardized': {'addr1': std_a, 'addr2': std_b},
        'geo_components': {'addr1': geo_a, 'addr2': geo_b},
        'geo_comparison': geo_detail,
    }

    if exit_score is not None:
        # Hard geo mismatch: nothing further is computed.
        return {
            'score': float(exit_score),
            'early_exit': True,
            'embed_score': 0.0,
            'geo_boost': 0,
            'id_delta': 0,
            'breakdown': breakdown,
            'components_1': {},
            'components_2': {},
        }

    # Stage 2: component extraction.
    parts_a = extract_extended_components(std_a, geo_a)
    parts_b = extract_extended_components(std_b, geo_b)
    breakdown['components'] = {'addr1': parts_a, 'addr2': parts_b}

    # Stage 3: residual embedding (must precede the gated ID comparison).
    rest_a = parts_a.get('residual', '')
    rest_b = parts_b.get('residual', '')
    embed_score = _compute_embed_score(rest_a, rest_b)
    embed_rounded = round(embed_score, 2)
    breakdown['embed'] = {
        'residual_1': rest_a,
        'residual_2': rest_b,
        'embed_score': embed_rounded,
    }

    # Stage 2 (continued): identifier comparison.
    id_delta, id_detail = _compare_identifiers(parts_a, parts_b, embed_score)
    breakdown['id_comparison'] = id_detail

    # Compose and clamp.
    raw_final = embed_score + geo_boost + id_delta
    final = max(0.0, min(100.0, raw_final))
    final_rounded = round(final, 2)

    breakdown['score_composition'] = {
        'embed_score': embed_rounded,
        'geo_boost': geo_boost,
        'id_delta': id_delta,
        'raw': round(raw_final, 2),
        'final': final_rounded,
    }

    return {
        'score': final_rounded,
        'early_exit': False,
        'embed_score': embed_rounded,
        'geo_boost': geo_boost,
        'id_delta': id_delta,
        'breakdown': breakdown,
        'components_1': parts_a,
        'components_2': parts_b,
    }
695
+
696
+
697
+ # ─────────────────────────────────────────────────────────────────────────────
698
+ # 9. DROP-IN REPLACEMENT FOR match_addresses_1_to_n() (model.py)
699
+ # ─────────────────────────────────────────────────────────────────────────────
700
+
701
def match_addresses_enhanced(addresses_r1: list, addresses_r2: list) -> float:
    """
    1:N address matching using the full match_address_lines() pipeline.

    Every usable address of record 1 is scored against every usable address
    of record 2 and the highest score is returned.  An entry is usable when
    it is truthy and is not blank or the placeholder '-' after trimming.

    Returns 0.0 when either side is None, empty, or has no usable entry.
    The search stops as soon as a score of 100 is found, since
    match_address_lines() clamps its score to 100.
    """

    def _usable(addresses) -> list:
        return [
            str(a) for a in (addresses or [])
            if a and str(a).strip() not in ('', '-')
        ]

    valid1 = _usable(addresses_r1)
    valid2 = _usable(addresses_r2)

    if not valid1 or not valid2:
        return 0.0

    best = 0.0
    for a1 in valid1:
        for a2 in valid2:
            score = match_address_lines(a1, a2).get('score', 0.0)
            if score > best:
                best = score
                if best >= 100.0:
                    # Cannot be beaten; skip the remaining model calls.
                    return best

    return best
services/config.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import configparser
2
+ import os
3
+ import ast
4
+ import pandas as pd
5
+ import logging
6
+
7
logger = logging.getLogger("config")

config = configparser.ConfigParser()

project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Try backend/config first, then config/ as fallback
conf_file_path = os.path.join(project_root, "backend", "config", "common.properties")
if not os.path.exists(conf_file_path):
    conf_file_path = os.path.join(project_root, "config", "common.properties")

if not os.path.exists(conf_file_path):
    raise FileNotFoundError(f"common.properties not found. Searched in backend/config/ and config/ under {project_root}")

config.read(conf_file_path)

logger.info("Config loaded from: %s", conf_file_path)

# ---- Required settings: a missing section/option raises at import time ----
APARTMENT_IDENTIFIER = config.get("IDENTIFIERS", "APARTMENT_IDENTIFIER")
FLAT_NUMBER_IDENTIFIER = config.get("IDENTIFIERS", "FLAT_NUMBER_IDENTIFIER")
HOUSE_NUMBER_IDENTIFIER = config.get("IDENTIFIERS", "HOUSE_NUMBER_IDENTIFIER")
STREET_KEYWORD = config.get("IDENTIFIERS", "STREET_KEYWORD")
FLOOR_NO_KEYWORD = config.get("IDENTIFIERS", "FLOOR_KEYWORD")
SURNAME_IDENTIFIER = ast.literal_eval(config.get("IDENTIFIERS", "INDIAN_SURNAMES"))

STATE_MAPPING = ast.literal_eval(config.get("MAPPING_DICT", "STATE_MAPPING"))
CITY_MAPPING = ast.literal_eval(config.get("MAPPING_DICT", "CITY_MAPPING"))
ADDRESS_MAPPING = ast.literal_eval(config.get("MAPPING_DICT", "ADDRESS_MAPPING"))

MODEL_WEIGHTS = ast.literal_eval(config.get("MATCHING_LOGIC", "MODEL_WEIGHTS"))
MATCHING_RULES = ast.literal_eval(config.get("MATCHING_LOGIC", "MATCHING_RULES"))

# ---- Optional settings: fall back to defaults, but say so in the log ----
# configparser.Error covers missing sections/options; the remaining types are
# what ast.literal_eval raises for a malformed literal.
_OPTIONAL_CONFIG_ERRORS = (configparser.Error, ValueError, SyntaxError, TypeError)

# Name-specific weights; defaults to the generic MODEL_WEIGHTS.
try:
    NAME_MODEL_WEIGHTS = ast.literal_eval(config.get("NAME_MATCHING", "NAME_MODEL_WEIGHTS"))
    NAME_MATCH_ADJUSTMENTS = ast.literal_eval(config.get("NAME_MATCHING", "NAME_MATCH_ADJUSTMENTS"))
except _OPTIONAL_CONFIG_ERRORS as exc:
    logger.warning("NAME_MATCHING config unavailable (%s); using defaults", exc)
    NAME_MODEL_WEIGHTS = MODEL_WEIGHTS
    NAME_MATCH_ADJUSTMENTS = {"surname_penalty": -30, "initial_boost": 30, "subset_boost": 40}

# Address-specific weights; defaults to the generic MODEL_WEIGHTS.
try:
    ADDRESS_MODEL_WEIGHTS = ast.literal_eval(config.get("ADDRESS_MATCHING", "ADDRESS_MODEL_WEIGHTS"))
    ADDRESS_MATCH_ADJUSTMENTS = ast.literal_eval(config.get("ADDRESS_MATCHING", "ADDRESS_MATCH_ADJUSTMENTS"))
except _OPTIONAL_CONFIG_ERRORS as exc:
    logger.warning("ADDRESS_MATCHING config unavailable (%s); using defaults", exc)
    ADDRESS_MODEL_WEIGHTS = MODEL_WEIGHTS
    ADDRESS_MATCH_ADJUSTMENTS = {"house_match_boost": 30, "house_mismatch_penalty": 70}

# Embedding model identifiers passed to SentenceTransformer.
try:
    MODEL_1_NAME = config.get("EMBEDDING_MODELS", "MODEL_1_NAME").strip()
    MODEL_2_NAME = config.get("EMBEDDING_MODELS", "MODEL_2_NAME").strip()
except configparser.Error as exc:
    logger.warning("EMBEDDING_MODELS config unavailable (%s); using defaults", exc)
    MODEL_1_NAME = "sentence-transformers/all-mpnet-base-v2"
    MODEL_2_NAME = "sentence-transformers/all-MiniLM-L6-v2"
68
+
69
+ # =========================================================
70
+ # CSV DATA LOADING (replacing MySQL)
71
+ # =========================================================
72
+
73
def load_csv_file(csv_path: str, file_name: str) -> pd.DataFrame:
    """
    Read a CSV reference table into a DataFrame without ever raising.

    Args:
        csv_path: Path to the CSV; a relative path is resolved against
            ``project_root``.
        file_name: Label used only in log messages.

    Returns:
        The loaded DataFrame, or an empty DataFrame when the file does not
        exist (logged as a warning) or any error occurs (logged as an error).
    """
    try:
        # Anchor relative paths at the project root, then collapse ../ and ./
        resolved = csv_path if os.path.isabs(csv_path) else os.path.join(project_root, csv_path)
        resolved = os.path.abspath(resolved)

        if not os.path.exists(resolved):
            logger.warning(f"CSV file not found: {resolved}")
            return pd.DataFrame()

        frame = pd.read_csv(resolved)
        logger.info(f"Loaded {file_name}: {len(frame)} rows from {resolved}")
        return frame
    except Exception as exc:
        logger.error(f"Failed to load {file_name}: {exc}")
        return pd.DataFrame()
93
+
94
# Load CSV reference tables
def _load_reference_table(option: str) -> pd.DataFrame:
    """Load the CSV whose path is stored under ``option`` in the [csv] section.

    Returns an empty DataFrame (and logs a warning) when the section or
    option is missing, so one absent entry cannot discard the other tables.
    """
    try:
        csv_path = config.get("csv", option)
    except configparser.Error as e:
        logger.warning(f"Some CSV files may not have loaded: {e}")
        return pd.DataFrame()
    return load_csv_file(csv_path, option)


name_variation_df = _load_reference_table("name_variation_standard")
hno_variation_df = _load_reference_table("hno_variation_standard")
city_prev_pres_df = _load_reference_table("city_prev_pres")
state_name_standard_df = _load_reference_table("state_name_standard")
sur_comm_names_df = _load_reference_table("sur_comm_names")
pin_city_state_df = _load_reference_table("pin_city_state")

# Report what actually loaded rather than claiming blanket success.
_reference_tables = {
    "name_variation_standard": name_variation_df,
    "hno_variation_standard": hno_variation_df,
    "city_prev_pres": city_prev_pres_df,
    "state_name_standard": state_name_standard_df,
    "sur_comm_names": sur_comm_names_df,
    "pin_city_state": pin_city_state_df,
}
_empty_tables = [name for name, table in _reference_tables.items() if table.empty]
if _empty_tables:
    logger.warning("CSV reference tables empty or not loaded: %s", ", ".join(_empty_tables))
else:
    logger.info("All CSV files loaded successfully")

# Legacy string exports for backward compatibility
pin_city_state = "pin_city_state"
sur_comm_names = "sur_comm_names"
city_prev_pres = "city_prev_pres"
state_name_standard = "state_name_standard"
hno_variation_standard = "hno_variation_standard"
name_variation_standard = "name_variation_standard"
120
+
121
+
services/model.py ADDED
@@ -0,0 +1,1509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from concurrent.futures import ThreadPoolExecutor
2
+ from typing import Dict, List, Optional, Tuple
3
+ import pandas as pd
4
+ from rapidfuzz import fuzz
5
+ from rapidfuzz.distance import JaroWinkler
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+ from sentence_transformers import SentenceTransformer
8
+ import re
9
+ import itertools
10
+
11
+ from services.config import (
12
+ SURNAME_IDENTIFIER, MODEL_WEIGHTS, MODEL_1_NAME, MODEL_2_NAME,
13
+ NAME_MODEL_WEIGHTS, NAME_MATCH_ADJUSTMENTS,
14
+ ADDRESS_MODEL_WEIGHTS,
15
+ )
16
+ from services.rules import detect_surnames, compute_initial_letter_boost, is_subset_match
17
+
18
+ # ---------- Model Store ----------
19
MODEL_STORE = {}


def get_model(model_name: str) -> "SentenceTransformer":
    """
    Return the sentence-transformer registered under ``model_name``.

    ``model_name`` is an alias, not a model id: "model1" maps to
    MODEL_1_NAME and "model2" to MODEL_2_NAME.  A model is loaded on CPU the
    first time it is requested and then served from MODEL_STORE.

    Raises:
        KeyError: if ``model_name`` is not a known alias and is not already
            present in MODEL_STORE.
    """
    cached = MODEL_STORE.get(model_name)
    if cached is not None:
        return cached

    if model_name == "model1":
        hub_name = MODEL_1_NAME
    elif model_name == "model2":
        hub_name = MODEL_2_NAME
    else:
        # Fail before announcing a load that will never happen.
        raise KeyError(
            f"Unknown model alias {model_name!r}; expected 'model1' or 'model2'"
        )

    print(f"Loading {model_name} into memory on CPU...")
    MODEL_STORE[model_name] = SentenceTransformer(hub_name, device="cpu")
    return MODEL_STORE[model_name]
29
+
30
+
31
+ # ---------- Text Preprocessing ----------
32
def preprocess_for_matching(text: str) -> str:
    """
    Standardize text for matching: trim surrounding whitespace and upper-case.

    Returns "" for None, empty or whitespace-only input and for the
    placeholder "-" (with or without surrounding whitespace).
    """
    if not text:
        return ""
    cleaned = text.strip()
    # Check the placeholder after trimming so " - " is recognised too.
    if cleaned in ("", "-"):
        return ""
    return cleaned.upper()
37
+
38
+ # ---------- Core Matching Functions ----------
39
+ # ---------- Indic Soundex (phonetic for Indian names) ----------
40
+ # def indic_soundex_code(name: str) -> str:
41
+ # """
42
+ # Generate Indic Soundex code for a name token.
43
+ # Handles Indian transliteration phonetics (aspirated consonants, etc.)
44
+ # """
45
+ # if not name:
46
+ # return ""
47
+ # name = name.upper().strip()
48
+ # if not name:
49
+ # return ""
50
+
51
+ # # Pre-process: map aspirated/compound consonants to base
52
+ # for digraph, base in [("SH", "S"), ("PH", "F"), ("TH", "T"), ("DH", "D"),
53
+ # ("KH", "K"), ("GH", "G"), ("BH", "B"), ("CH", "C"), ("JH", "J")]:
54
+ # name = name.replace(digraph, base)
55
+
56
+ # SOUNDEX_MAP = {
57
+ # 'B': '1', 'F': '1', 'P': '1', 'V': '1', 'W': '1',
58
+ # 'C': '2', 'G': '2', 'J': '2', 'K': '2', 'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
59
+ # 'D': '3', 'T': '3',
60
+ # 'L': '4',
61
+ # 'M': '5', 'N': '5',
62
+ # 'R': '6',
63
+ # }
64
+
65
+ # code = name[0]
66
+ # prev_code = SOUNDEX_MAP.get(name[0], '0')
67
+
68
+ # for char in name[1:]:
69
+ # if char in 'AEIOUHY ':
70
+ # prev_code = '0' # Reset on vowel/separator
71
+ # continue
72
+ # digit = SOUNDEX_MAP.get(char, '0')
73
+ # if digit != '0' and digit != prev_code:
74
+ # code += digit
75
+ # prev_code = digit
76
+
77
+ # return (code + '000')[:4]
78
+
79
def indic_soundex_code(name: str) -> str:
    """
    Build a 4-character Soundex-style code for a single name token.

    The token is upper-cased and trimmed, aspirated digraphs are collapsed
    to their base consonant (SH->S, PH->F, TH->T, DH->D, KH->K, GH->G,
    BH->B, CH->C, JH->J), and the first character is kept verbatim.  Each
    later consonant contributes its group digit unless it repeats the
    immediately preceding digit; vowels, H, Y and spaces contribute nothing
    but break such a run.  The code is zero-padded / truncated to 4 chars.

    J, S and Z form their own group (7), separate from the C/G/K/Q/X
    group (2), so that e.g. RAJESH and RAKESH get different codes.

    Returns "" for empty or whitespace-only input.
    """
    if not name:
        return ""
    token = name.upper().strip()
    if not token:
        return ""

    # Collapse aspirated/compound consonants, in this fixed order.
    aspirated_pairs = (
        ("SH", "S"), ("PH", "F"), ("TH", "T"), ("DH", "D"),
        ("KH", "K"), ("GH", "G"), ("BH", "B"), ("CH", "C"), ("JH", "J"),
    )
    for pair, plain in aspirated_pairs:
        token = token.replace(pair, plain)

    letters_by_digit = {
        '1': 'BFPVW',
        '2': 'CGKQX',
        '3': 'DT',
        '4': 'L',
        '5': 'MN',
        '6': 'R',
        '7': 'JSZ',
    }
    digit_of = {
        letter: digit
        for digit, letters in letters_by_digit.items()
        for letter in letters
    }
    run_breakers = frozenset('AEIOUHY ')

    pieces = [token[0]]
    last_digit = digit_of.get(token[0], '0')

    for ch in token[1:]:
        if ch in run_breakers:
            last_digit = '0'  # a vowel/separator ends the current run
        else:
            digit = digit_of.get(ch, '0')
            if digit not in ('0', last_digit):
                pieces.append(digit)
                last_digit = digit

    return ''.join(pieces)[:4].ljust(4, '0')
122
+
123
+
124
def indic_soundex_similarity(text1: str, text2: str) -> float:
    """
    Score the phonetic similarity of two texts on a 0-100 scale.

    Each whitespace-separated token is turned into its indic_soundex_code().
    Every code of the text with fewer tokens is then greedily paired, in
    order, with the not-yet-used code of the other text that agrees in the
    most character positions (out of 4).  The result is the mean agreement
    of those pairings, times 100.

    Returns 0.0 when either text is empty or has no tokens.
    """
    words_a = text1.upper().split() if text1 else []
    words_b = text2.upper().split() if text2 else []
    if not (words_a and words_b):
        return 0.0

    codes_a = [indic_soundex_code(word) for word in words_a]
    codes_b = [indic_soundex_code(word) for word in words_b]

    # Iterate over the side with fewer tokens; ties keep text1 as that side.
    if len(codes_a) <= len(codes_b):
        few, many = codes_a, codes_b
    else:
        few, many = codes_b, codes_a

    taken = set()
    accumulated = 0.0
    for code in few:
        top_score, top_pos = 0.0, -1
        for pos, candidate in enumerate(many):
            if pos in taken:
                continue
            agreement = sum(a == b for a, b in zip(code, candidate)) / 4.0
            if agreement > top_score:
                top_score, top_pos = agreement, pos
        if top_pos >= 0:
            # Each code on the longer side can be claimed only once.
            taken.add(top_pos)
            accumulated += top_score

    return (accumulated / len(few)) * 100
158
+
159
+ # ---------- Core Matching Functions ----------
160
def calculate_fuzzy_scores(input1: str, input2: str) -> Dict[str, float]:
    """
    Run five RapidFuzz scorers on the two inputs.

    Returns a dict keyed by scorer label: simple_ratio, token_set_ratio,
    w_ratio, partial_ratio and token_sort_ratio.
    """
    scorers = (
        ("simple_ratio", fuzz.ratio),
        ("token_set_ratio", fuzz.token_set_ratio),
        ("w_ratio", fuzz.WRatio),
        ("partial_ratio", fuzz.partial_ratio),
        ("token_sort_ratio", fuzz.token_sort_ratio),
    )
    return {label: scorer(input1, input2) for label, scorer in scorers}
169
+
170
def calculate_semantic_similarity(model_name: str, input1: str, input2: str) -> float:
    """
    Return the cosine similarity between the embeddings of two strings.

    ``model_name`` is handed to get_model(); each input is encoded on its
    own with the progress bar disabled.  The value is the single entry of
    the 1x1 matrix produced by cosine_similarity (not scaled to 0-100).
    """
    encoder = get_model(model_name)
    vector_a, vector_b = (
        encoder.encode([text], show_progress_bar=False)
        for text in (input1, input2)
    )
    similarity_matrix = cosine_similarity(vector_a, vector_b)
    return similarity_matrix[0][0]
179
+
180
def calculate_final_score(fuzzy_scores: Dict[str, float], semantic_score: float) -> float:
    """
    Combine fuzzy and semantic scores into one weighted score in 0-100.

    Args:
        fuzzy_scores: Scores keyed by scorer label, as produced by
            calculate_fuzzy_scores() (0-100 each).
        semantic_score: Cosine similarity; it is multiplied by 100 and made
            available under the key "semantic_score".

    Each entry of MODEL_WEIGHTS is multiplied by the component with the same
    key.  A weight whose key has no matching component contributes 0 rather
    than raising KeyError, and every label in ``fuzzy_scores`` (including
    token_sort_ratio) can be weighted.
    """
    components = dict(fuzzy_scores)
    components["semantic_score"] = semantic_score * 100

    weighted_sum = sum(
        components.get(key, 0) * weight
        for key, weight in MODEL_WEIGHTS.items()
    )
    return max(0, min(100, weighted_sum))
192
+
193
def calculate_overall_similarity(score1: float, score2: float) -> float:
    """Blend two model scores: 60% of ``score1`` plus 40% of ``score2``."""
    first_share, second_share = 0.6, 0.4
    return score1 * first_share + score2 * second_share
196
+
197
def check_substring_match(str1: str, str2: str) -> bool:
    """
    Tell whether either string occurs inside the other.

    Returns False when either argument is empty or None.
    """
    if str1 and str2:
        return str1 in str2 or str2 in str1
    return False
202
+
203
def check_individual_name_matches(name_full: str, fname: str, mname: str, lname: str) -> Tuple[bool, bool, bool]:
    """
    Substring-match a full name against first, middle and last name.

    Each part is tested with check_substring_match(name_full, part); an
    empty or None part yields False without being tested.

    Returns: (first_match, middle_match, last_match)
    """
    first_hit, middle_hit, last_hit = (
        check_substring_match(name_full, part) if part else False
        for part in (fname, mname, lname)
    )
    return first_hit, middle_hit, last_hit
212
+
213
+
214
def concatenate_name_parts(firstname: str, middlename: str, lastname: str) -> str:
    """
    Join the usable name parts into one upper-cased, space-separated string.

    Each part is trimmed and upper-cased.  Parts that are None, blank or the
    placeholder "-" (with or without surrounding whitespace) are dropped.
    The remaining parts are sorted alphabetically before joining, so the
    result does not depend on which field a token was supplied in.

    Returns "" when no usable part remains.
    """
    cleaned = []
    for part in (firstname, middlename, lastname):
        if not part:
            continue
        token = part.strip().upper()
        # Test after trimming so "  " and " - " are not kept as tokens.
        if token and token != "-":
            cleaned.append(token)

    return " ".join(sorted(cleaned))
229
+
230
+ # ---------- helpers used only inside the new logic ----------
231
+ def _normalize_and_sort(name: str) -> str:
232
+ """
233
+ 1. Split on any non-alphanumeric character (space, underscore, comma, etc.)
234
+ 2. Remove empty tokens
235
+ 3. Upper-case
236
+ 4. Sort alphabetically
237
+ 5. Re-join with single space
238
+ """
239
+ tokens = re.split(r'[^A-Za-z0-9]+', name.strip())
240
+ tokens = [t.upper() for t in tokens if t]
241
+ return ' '.join(sorted(tokens))
242
+
243
+ def _all_name_combinations(fname: str, mname: str, lname: str) -> list[str]:
244
+ """
245
+ Return every possible ordering of the supplied parts,
246
+ dropping any empty/blank components.
247
+ """
248
+ parts = []
249
+ for p in (fname, mname, lname):
250
+ if p and p.strip() not in ('-', '', ' '):
251
+ parts.append(p.strip().upper())
252
+ if not parts:
253
+ return []
254
+ # itertools.permutations gives every ordering
255
+ return [' '.join(order) for order in itertools.permutations(parts)]
256
+
257
+
258
+ # def match_entities(value1: str, value2: str, weights: Dict[str, float] = None) -> float:
259
+ # """
260
+ # Match two entities using fuzzy + semantic + optional phonetic similarity.
261
+ # Weights dict determines score component contributions.
262
+ # Returns: similarity score as float (0-100)
263
+ # """
264
+ # if weights is None:
265
+ # weights = MODEL_WEIGHTS
266
+
267
+ # standardized_input1 = preprocess_for_matching(value1)
268
+ # standardized_input2 = preprocess_for_matching(value2)
269
+
270
+ # if not standardized_input1 or not standardized_input2:
271
+ # return 0
272
+
273
+ # # Space-agnostic exact match
274
+ # if standardized_input1.replace(" ", "") == standardized_input2.replace(" ", ""):
275
+ # return 100.0
276
+
277
+ # return calculate_similarity_with_models(standardized_input1, standardized_input2, weights)
278
+
279
+ def match_entities(value1: str, value2: str, weights: Dict[str, float] = None) -> float:
280
+ """
281
+ Match two entities using fuzzy + semantic + optional phonetic similarity.
282
+ Weights dict determines score component contributions.
283
+
284
+ Handles:
285
+ 1. Normal match : "Pujitha Sharma" vs "pujitha sharma"
286
+ 2. Space-agnostic match : "Pujitha Sharma" vs "pujithasharma"
287
+ 3. South Indian names : "Sharma Gari Pujitha" vs "Pujitha Sharma Gari"
288
+ (token order doesn't matter, combinations checked)
289
+
290
+ Returns: similarity score as float (0-100)
291
+
292
+
293
+ - Integrated 'Check 3: Acronym / Initial expansion'. Matches acronyms to
294
+ full names (e.g. K V Reddy vs Katta Venkata Reddy) and boosts to 90+.
295
+ Penalizes mismatching initials (e.g. C Anitha vs H Anitha) by -40.
296
+ - Added 'Check 5: Final Phonetic Audit'. Uses Indic Soundex to securely
297
+ escalate minor spelling variants (likitha vs likheetha) to 95+ and heavily
298
+ punish mathematically close false-positives (rajesh vs rakesh).
299
+ """
300
+ if weights is None:
301
+ weights = MODEL_WEIGHTS
302
+
303
+ standardized_input1 = preprocess_for_matching(value1)
304
+ standardized_input2 = preprocess_for_matching(value2)
305
+
306
+ if not standardized_input1 or not standardized_input2:
307
+ return 0
308
+
309
+ # =========================================================
310
+ # CHECK 1: Space-agnostic exact match
311
+ # "Pujitha Sharma" vs "pujithasharma" β†’ 100.0
312
+ # =========================================================
313
+ if standardized_input1.replace(" ", "") == standardized_input2.replace(" ", ""):
314
+ return 100.0
315
+
316
+ # =========================================================
317
+ # CHECK 2: Token-order permutation match (South Indian names)
318
+ # "sharmagari pujitha" vs "pujitha sharmagari" β†’ 100.0
319
+ # Splits both names into tokens, checks if any permutation
320
+ # of tokens (joined with/without space) matches the other
321
+ # =========================================================
322
+ tokens1 = standardized_input1.split()
323
+ tokens2 = standardized_input2.split()
324
+
325
+ # Only attempt if token count is manageable (avoid factorial explosion)
326
+ if len(tokens1) <= 4 and len(tokens2) <= 4:
327
+
328
+ # Generate all permutations of tokens1 and check against tokens2 (space-agnostic)
329
+ target_nospace = standardized_input2.replace(" ", "")
330
+
331
+ for perm in itertools.permutations(tokens1):
332
+ # joined with space: "pujitha sharmagari"
333
+ # joined without space: "pujithasharmagari"
334
+ perm_with_space = " ".join(perm)
335
+ perm_without_space = "".join(perm)
336
+
337
+ if perm_with_space == standardized_input2:
338
+ return 100.0
339
+
340
+ if perm_without_space == target_nospace:
341
+ return 100.0
342
+
343
+ # Also check permutations of tokens2 against tokens1 (space-agnostic)
344
+ target_nospace1 = standardized_input1.replace(" ", "")
345
+
346
+ for perm in itertools.permutations(tokens2):
347
+ perm_with_space = " ".join(perm)
348
+ perm_without_space = "".join(perm)
349
+
350
+ if perm_with_space == standardized_input1:
351
+ return 100.0
352
+
353
+ if perm_without_space == target_nospace1:
354
+ return 100.0
355
+
356
+ # =========================================================
357
+ # CHECK 3: Acronym / Initial expansion match or mismatch
358
+ # "K V Reddy" vs "Katta Venkata Reddy" β†’ initial match β†’ escalate to 90.0+
359
+ # "C Anitha" vs "H Anitha" β†’ mismatched initials β†’ severe penalty (-40.0)
360
+ # =========================================================
361
+ if len(tokens1) > 0 and len(tokens2) > 0:
362
+ common = set(tokens1) & set(tokens2)
363
+ rem1 = [t for t in tokens1 if t not in common]
364
+ rem2 = [t for t in tokens2 if t not in common]
365
+
366
+ # Only apply if they share some tokens (like a last name) but differ in the rest
367
+ if common and rem1 and rem2:
368
+ rem1_is_initials = all(len(t) == 1 for t in rem1)
369
+ rem2_is_initials = all(len(t) == 1 for t in rem2)
370
+
371
+ initials_list = None
372
+ fullcaps_list = None
373
+
374
+ # Identify which is the initials array and which is the longer names array
375
+ if rem1_is_initials and not rem2_is_initials:
376
+ initials_list = rem1
377
+ fullcaps_list = rem2
378
+ elif rem2_is_initials and not rem1_is_initials:
379
+ initials_list = rem2
380
+ fullcaps_list = rem1
381
+ elif rem1_is_initials and rem2_is_initials:
382
+ # Both are just single letters! (e.g. C Anitha vs H Anitha)
383
+ initials_list = rem1
384
+ fullcaps_list = rem2
385
+
386
+ if initials_list is not None and fullcaps_list is not None:
387
+ initials_set = {t[0] for t in initials_list}
388
+ first_letters_set = {t[0] for t in fullcaps_list if t}
389
+
390
+ # Check for intersection. If they map cleanly, escalate to 90
391
+ if initials_set == first_letters_set or initials_set.issubset(first_letters_set) or first_letters_set.issubset(initials_set):
392
+ base_score = calculate_similarity_with_models(standardized_input1, standardized_input2, weights)
393
+ return max(90.0, base_score)
394
+ else:
395
+ # Explicit conflicting initials! (e.g., C vs H or K vs M)
396
+ base_score = calculate_similarity_with_models(standardized_input1, standardized_input2, weights)
397
+ return max(0.0, base_score - 40.0)
398
+ else:
399
+ # =========================================================
400
+ # EXPLICIT CONFLICTING CORE NAMES - 15-03-2026
401
+ # Example: "M. Manisha Reddy" vs "M. Manoj Reddy" -> Shared: M, Reddy. Unmatched: Manisha vs Manoj
402
+ # Example: "Mukherjee Lakshmi" vs "Prasad Lakshmi" -> Shared: Lakshmi. Unmatched: Mukherjee vs Prasad
403
+ # Since neither unmatched set are initials, evaluate them as explicit words
404
+ # =========================================================
405
+ rem1_str = " ".join(rem1)
406
+ rem2_str = " ".join(rem2)
407
+
408
+ rem_fuzzy = fuzz.ratio(rem1_str, rem2_str)
409
+ if rem_fuzzy < 65.0:
410
+ base_score = calculate_similarity_with_models(standardized_input1, standardized_input2, weights)
411
+ # Severely penalize because key identifying words actively contradict each other
412
+ return max(0.0, base_score - 40.0)
413
+
414
+ # =========================================================
415
+ # CHECK 4: Fallback β†’ weighted model scoring
416
+ # "Pujitha Sharma" vs "Jon Smyth" β†’ ~78.5 (fuzzy+semantic)
417
+ # =========================================================
418
+ base_score = calculate_similarity_with_models(standardized_input1, standardized_input2, weights)
419
+
420
+ # =========================================================
421
+ # CHECK 5: Final Phonetic Audit (for single words/names primarily)
422
+ # If they are single continuous names, check if they are identical
423
+ # phonetically. If they are divergent, brutally penalize to prevent false positives.
424
+ # =========================================================
425
+ if len(tokens1) == 1 and len(tokens2) == 1:
426
+ ph_score = indic_soundex_similarity(standardized_input1, standardized_input2)
427
+
428
+ # Phonetically identical but minor spelling difference (likitha vs likheetha) -> escalate to 95.0+
429
+ if ph_score == 100.0:
430
+ if fuzz.ratio(standardized_input1, standardized_input2) > 65 and abs(len(standardized_input1) - len(standardized_input2)) <= 2:
431
+ return max(95.0, base_score)
432
+
433
+ # Highly distinct phonetics but mathematically close text (Rajesh vs Rakesh) -> ~50.0
434
+ elif ph_score <= 80.0:
435
+ if base_score > 55.0:
436
+ # heavily penalize false-positive anagrams/typos
437
+ return min(base_score - 25.0, 55.0)
438
+
439
+ return base_score
440
+
441
+ # def calculate_similarity_with_models(text1: str, text2: str, weights: Dict[str, float] = None) -> float:
442
+ # """
443
+ # Calculate similarity using fuzzy scores, embedding models, and optional phonetic.
444
+ # The weights dict controls which components are active and their contribution.
445
+ # Phonetic components (jaro_winkler, indic_soundex) are used only if present in weights.
446
+ # Returns similarity percentage as float (0-100)
447
+ # """
448
+ # if weights is None:
449
+ # weights = MODEL_WEIGHTS
450
+
451
+ # if not text1 or not text2:
452
+ # print(f"[SIMILARITY] either value is empty β€” text1={text1!r} text2={text2!r}")
453
+ # return 0.0
454
+
455
+ # text1 = str(text1).strip()
456
+ # text2 = str(text2).strip()
457
+
458
+ # if not text1 or not text2:
459
+ # return 0.0
460
+
461
+ # print(f"[SIMILARITY] text1={text1!r}")
462
+ # print(f"[SIMILARITY] text2={text2!r}")
463
+
464
+ # # Space-agnostic exact match
465
+ # if text1.replace(" ", "") == text2.replace(" ", ""):
466
+ # return 100.0
467
+
468
+ # # --- Fuzzy scores (5 functions) ---
469
+ # fuzzy_scores = {
470
+ # "simple_ratio": fuzz.ratio(text1, text2),
471
+ # "token_set_ratio": fuzz.token_set_ratio(text1, text2),
472
+ # "w_ratio": fuzz.WRatio(text1, text2),
473
+ # "partial_ratio": fuzz.partial_ratio(text1, text2),
474
+ # "token_sort_ratio": fuzz.token_sort_ratio(text1, text2),
475
+ # }
476
+
477
+ # # --- Phonetic scores (only if weights include them) ---
478
+ # phonetic_scores = {}
479
+ # if weights.get("jaro_winkler", 0) > 0:
480
+ # phonetic_scores["jaro_winkler"] = JaroWinkler.similarity(text1, text2) * 100
481
+ # if weights.get("indic_soundex", 0) > 0:
482
+ # phonetic_scores["indic_soundex"] = indic_soundex_similarity(text1, text2)
483
+
484
+ # # --- Semantic scores (dual model, computed in parallel) ---
485
+ # with ThreadPoolExecutor() as executor:
486
+ # model1 = get_model("model1")
487
+ # model2 = get_model("model2")
488
+
489
+ # f1 = executor.submit(
490
+ # lambda: cosine_similarity(
491
+ # model1.encode([text1], show_progress_bar=False),
492
+ # model1.encode([text2], show_progress_bar=False)
493
+ # )[0][0]
494
+ # )
495
+ # f2 = executor.submit(
496
+ # lambda: cosine_similarity(
497
+ # model2.encode([text1], show_progress_bar=False),
498
+ # model2.encode([text2], show_progress_bar=False)
499
+ # )[0][0]
500
+ # )
501
+ # cosine1 = f1.result()
502
+ # cosine2 = f2.result()
503
+
504
+ # def calc_final(semantic_cosine):
505
+ # all_scores = {}
506
+ # all_scores.update(fuzzy_scores)
507
+ # all_scores.update(phonetic_scores)
508
+ # all_scores["semantic_score"] = semantic_cosine * 100
509
+ # return sum(all_scores.get(k, 0) * v for k, v in weights.items())
510
+
511
+ # final1 = calc_final(cosine1)
512
+ # final2 = calc_final(cosine2)
513
+
514
+ # overall_similarity = final1 * 0.6 + final2 * 0.4
515
+ # print("similarity given by model",overall_similarity)
516
+ # return round(max(0, min(100, overall_similarity)), 2)
517
+
518
def calculate_similarity_with_models(text1: str, text2: str, weights: Dict[str, float] = None) -> float:
    """
    Blend fuzzy, optional phonetic, and dual-model semantic scores into one similarity.

    Args:
        text1: First string to compare.
        text2: Second string to compare.
        weights: Mapping of component name -> weight. Recognised component names are
            "simple_ratio", "token_set_ratio", "w_ratio", "partial_ratio",
            "token_sort_ratio", "jaro_winkler", "indic_soundex" and "semantic_score".
            Names in `weights` without a computed score contribute 0.
            Defaults to MODEL_WEIGHTS when None.

    Returns:
        Similarity percentage as a built-in float, clamped to 0-100 and rounded
        to 2 decimals. 0.0 when either input is empty/blank; 100.0 when the
        inputs are equal after removing spaces.
    """
    if weights is None:
        weights = MODEL_WEIGHTS

    # Falsy inputs (None, "", 0) short-circuit before any str() coercion.
    if not text1 or not text2:
        return 0.0

    text1 = str(text1).strip()
    text2 = str(text2).strip()

    # Whitespace-only inputs become empty after stripping.
    if not text1 or not text2:
        return 0.0

    # Space-agnostic exact match
    if text1.replace(" ", "") == text2.replace(" ", ""):
        return 100.0

    # --- Fuzzy scores (5 functions) ---
    component_scores = {
        "simple_ratio": fuzz.ratio(text1, text2),
        "token_set_ratio": fuzz.token_set_ratio(text1, text2),
        "w_ratio": fuzz.WRatio(text1, text2),
        "partial_ratio": fuzz.partial_ratio(text1, text2),
        "token_sort_ratio": fuzz.token_sort_ratio(text1, text2),
    }

    # --- Phonetic scores (computed only when given a positive weight) ---
    if weights.get("jaro_winkler", 0) > 0:
        component_scores["jaro_winkler"] = JaroWinkler.similarity(text1, text2) * 100
    if weights.get("indic_soundex", 0) > 0:
        component_scores["indic_soundex"] = indic_soundex_similarity(text1, text2)

    def _pair_cosine(model):
        """Cosine similarity between the two texts as embedded by `model`."""
        return cosine_similarity(
            model.encode([text1]),
            model.encode([text2])
        )[0][0]

    def _weighted_total(semantic_cosine):
        """Weighted sum of all component scores for one model's cosine value."""
        scores = dict(component_scores)
        scores["semantic_score"] = semantic_cosine * 100
        return sum(scores.get(name, 0) * weight for name, weight in weights.items())

    # --- Semantic scores (dual model, computed in parallel) ---
    with ThreadPoolExecutor() as executor:
        model1 = get_model("model1")
        model2 = get_model("model2")
        future1 = executor.submit(_pair_cosine, model1)
        future2 = executor.submit(_pair_cosine, model2)
        cosine1 = future1.result()
        cosine2 = future2.result()

    # model1's weighted total counts for 60 %, model2's for 40 %.
    overall_similarity = _weighted_total(cosine1) * 0.6 + _weighted_total(cosine2) * 0.4

    # float() guarantees a built-in float: the clamp bounds used to be ints and
    # the cosine values are presumably numpy scalars (TODO confirm against the
    # cosine_similarity import), either of which could leak a non-float result.
    return float(round(max(0.0, min(100.0, overall_similarity)), 2))
589
+
590
+ # def handle_case1(full_name1: str, full_name2: str,
591
+ # r1_fname: str, r1_mname: str, r1_lname: str,
592
+ # r2_fname: str, r2_mname: str, r2_lname: str) -> dict:
593
+ # """
594
+ # Case-1 (both records supply a full name)
595
+ # Returns a dictionary with separate similarity scores for each component
596
+
597
+ # Returns:
598
+ # dict: {
599
+ # 'full_name_percent': float, # full_name1 vs full_name2
600
+ # 'firstname_percent': float, # r1_fname vs r2_fname
601
+ # 'middlename_percent': float, # r1_mname vs r2_mname
602
+ # 'lastname_percent': float # r1_lname vs r2_lname
603
+ # }
604
+ # """
605
+ # result={}
606
+
607
+ # # Check space-agnostic exact match on original strings before sorting
608
+ # if full_name1.replace(" ", "").upper() == full_name2.replace(" ", "").upper():
609
+ # full_name_percent = 100.0
610
+ # else:
611
+ # # 1. Normalize + alphabetically sort each full name and calculate similarity
612
+ # sorted1 = _normalize_and_sort(full_name1)
613
+ # sorted2 = _normalize_and_sort(full_name2)
614
+ # full_name_percent = calculate_similarity_with_models(sorted1, sorted2, NAME_MODEL_WEIGHTS)
615
+ # # print("full_name_percent is:",full_name_percent)
616
+
617
+ # # 2. Calculate firstname_percent: compare firstnames
618
+ # # firstname
619
+ # if r1_fname and r2_fname:
620
+ # firstname_percent = calculate_similarity_with_models(
621
+ # r1_fname, r2_fname, NAME_MODEL_WEIGHTS
622
+ # )
623
+ # # print("firstname_percent is:",firstname_percent)
624
+ # else:
625
+ # firstname_percent = 0.0
626
+
627
+ # # middlename
628
+ # if r1_mname and r2_mname:
629
+ # middlename_percent = calculate_similarity_with_models(
630
+ # r1_mname, r2_mname, NAME_MODEL_WEIGHTS
631
+ # )
632
+ # # print("middlename_percent is:",middlename_percent)
633
+ # else:
634
+ # middlename_percent = 0.0
635
+
636
+ # # lastname
637
+ # if r1_lname and r2_lname:
638
+ # lastname_percent = calculate_similarity_with_models(
639
+ # r1_lname, r2_lname, NAME_MODEL_WEIGHTS
640
+ # )
641
+ # # print("lastname_percent is:",lastname_percent)
642
+ # else:
643
+ # lastname_percent = 0.0
644
+
645
+
646
+
647
+ # result={
648
+ # 'full_name_percent': full_name_percent,
649
+ # 'firstname_percent': firstname_percent,
650
+ # 'middlename_percent': middlename_percent,
651
+ # 'lastname_percent': lastname_percent
652
+ # }
653
+ # return result
654
+
655
+ # def handle_case2(full_name: str,
656
+ # fname: str, mname: str, lname: str,
657
+ # concat_name: str) -> dict:
658
+ # """
659
+ # Case-2 (one side has full name, the other has F/M/L)
660
+ # Returns a dictionary with separate similarity scores for each component
661
+
662
+ # Returns:
663
+ # dict: {
664
+ # 'full_name_percent': float, # full_name vs concat_name
665
+ # 'firstname_percent': float, # full_name vs fname
666
+ # 'middlename_percent': float, # full_name vs mname
667
+ # 'lastname_percent': float # full_name vs lname
668
+ # }
669
+ # """
670
+ # # 0. Check if any permutation of F/M/L exactly reconstructs full_name.
671
+ # # If yes, full_name_percent = 100. Component scores are still computed
672
+ # # individually — a part inside full_name does NOT score 100% on its own.
673
+ # # e.g. full_name="KALLI LIKHITHA", fname="KALLI", mname="LIKHITHA":
674
+ # # full_name_percent = 100 (together they reconstruct it exactly)
675
+ # # firstname_percent != 100 ("KALLI" is only half of "KALLI LIKHITHA")
676
+ # permutation_full_match = any(
677
+ # permuted.replace(" ", "") == full_name.upper().strip().replace(" ", "")
678
+ # for permuted in _all_name_combinations(fname, mname, lname)
679
+ # )
680
+
681
+ # # 1. Calculate full_name_percent
682
+ # if permutation_full_match:
683
+ # full_name_percent = 100.0
684
+ # else:
685
+ # sorted_full = _normalize_and_sort(full_name)
686
+ # sorted_concat = _normalize_and_sort(concat_name)
687
+ # full_name_percent = calculate_similarity_with_models(
688
+ # sorted_full,
689
+ # sorted_concat,
690
+ # NAME_MODEL_WEIGHTS
691
+ # )
692
+
693
+ # # Component-level scores: compare full_name vs each individual part (fname/mname/lname).
694
+ # #
695
+ # # Requirement:
696
+ # # - full_name="KALLI LIKHITHA", fname="KALLI" → firstname_percent reflects
697
+ # # how well "KALLI" matches within the context of the full name, but must
698
+ # # NOT be 100% just because "KALLI" is a complete subset of "KALLI LIKHITHA".
699
+ # # - The comparison is full_name vs part (not token-to-token), so the full
700
+ # # context of the name is preserved.
701
+ # #
702
+ # # Why standard weights fail:
703
+ # # - partial_ratio("KALLI LIKHITHA", "KALLI") = 100 ← subset inflation
704
+ # # - token_set_ratio produces same inflation
705
+ # # - w_ratio picks the best of these → also inflated
706
+ # # - semantic embeddings: short name vs full name share high cosine similarity
707
+ # # because they encode overlapping meaning → also inflated
708
+ # #
709
+ # # Fix: use only LENGTH-SENSITIVE metrics that naturally penalise length
710
+ # # disparity between the strings.
711
+ # # - simple_ratio: 2 * matches / total_chars — drops when lengths differ
712
+ # # - jaro_winkler: character-overlap with length normalisation — same
713
+ # # - indic_soundex: phonetic token overlap / shorter length — same
714
+ # # Intentionally excluded: partial_ratio, token_set_ratio, w_ratio, semantic.
715
+
716
+ # _COMPONENT_WEIGHTS = {
717
+ # "simple_ratio": 0.35,
718
+ # "jaro_winkler": 0.40,
719
+ # "indic_soundex": 0.25,
720
+ # }
721
+
722
+ # def _fullname_vs_part(full: str, part: str) -> float:
723
+ # """
724
+ # Compare full_name against a single name part using only length-sensitive
725
+ # metrics. Returns 0-100. A part that is a strict subset of full_name will
726
+ # score proportionally to how much of the full_name it covers, not 100%.
727
+ # """
728
+ # if not full or not part:
729
+ # return 0.0
730
+ # full_u = full.upper().strip()
731
+ # part_u = part.upper().strip()
732
+ # if full_u == part_u:
733
+ # return 100.0
734
+ # scores = {
735
+ # "simple_ratio": fuzz.ratio(full_u, part_u),
736
+ # "jaro_winkler": JaroWinkler.similarity(full_u, part_u) * 100,
737
+ # "indic_soundex": indic_soundex_similarity(full_u, part_u),
738
+ # }
739
+ # return round(max(0.0, min(100.0,
740
+ # sum(scores[k] * v for k, v in _COMPONENT_WEIGHTS.items())
741
+ # )), 2)
742
+
743
+ # # 2. firstname_percent: full_name vs fname
744
+ # firstname_percent = _fullname_vs_part(full_name, fname) if fname else 0.0
745
+ # # 3. middlename_percent: full_name vs mname
746
+ # middlename_percent = _fullname_vs_part(full_name, mname) if mname else 0.0
747
+ # # 4. lastname_percent: full_name vs lname
748
+ # lastname_percent = _fullname_vs_part(full_name, lname) if (lname and lname.upper() not in SURNAME_IDENTIFIER) else 0.0
749
+
750
+ # result={
751
+ # 'full_name_percent': full_name_percent,
752
+ # 'firstname_percent': firstname_percent,
753
+ # 'middlename_percent': middlename_percent,
754
+ # 'lastname_percent': lastname_percent
755
+ # }
756
+ # return result
757
+
758
+
759
+ # def handle_case3(r1_fname: str, r1_mname: str, r1_lname: str, r1_concat: str,
760
+ # r2_fname: str, r2_mname: str, r2_lname: str, r2_concat: str) -> dict:
761
+ # """
762
+ # Handle Case 3: Both records have F/M/L
763
+ # Returns a dictionary with separate similarity scores for each component
764
+
765
+ # Returns:
766
+ # dict: {
767
+ # 'full_name_percent': float, # r1_concat vs r2_concat
768
+ # 'firstname_percent': float, # r1_fname vs r2_fname
769
+ # 'middlename_percent': float, # r1_mname vs r2_mname
770
+ # 'lastname_percent': float # r1_lname vs r2_lname
771
+ # }
772
+ # """
773
+ # # Check substring matches for each component
774
+ # f_match = check_substring_match(r1_fname, r2_fname) if r1_fname and r2_fname else False
775
+ # m_match = check_substring_match(r1_mname, r2_mname) if r1_mname and r2_mname else False
776
+ # l_match = check_substring_match(r1_lname, r2_lname) if r1_lname and r2_lname else False
777
+
778
+ # # Calculate full_name_percent: compare concatenated names
779
+ # full_name_percent = calculate_similarity_with_models(r1_concat, r2_concat, NAME_MODEL_WEIGHTS)
780
+
781
+ # # Apply boosting logic based on substring matches
782
+ # # Rule 1: Only lastname matches (family match)
783
+ # if l_match and not f_match and not m_match:
784
+ # full_name_percent = max(full_name_percent, 85.0) # Ensure minimum 85% for family match
785
+
786
+ # # Rule 2: Lastname + (firstname or middle) matches (partial match)
787
+ # # Strong indicator of same person
788
+ # elif l_match and (f_match or m_match):
789
+ # full_name_percent = max(full_name_percent, 90.0) # Higher confidence when lastname + another field matches
790
+
791
+ # # Rule 3: No matches at all or only firstname/middlename matches
792
+ # # Use the calculated similarity as-is
793
+
794
+ # # Calculate individual component percentages
795
+ # # 2. Calculate firstname_percent: compare firstnames
796
+ # if r1_fname and r2_fname:
797
+ # firstname_percent = calculate_similarity_with_models(
798
+ # r1_fname,
799
+ # r2_fname,
800
+ # NAME_MODEL_WEIGHTS
801
+ # )
802
+ # else:
803
+ # firstname_percent=0.0
804
+
805
+ # # 3. Calculate middlename_percent: compare middlenames
806
+ # if r1_mname and r2_mname:
807
+ # middlename_percent = calculate_similarity_with_models(
808
+ # r1_mname,
809
+ # r2_mname,
810
+ # NAME_MODEL_WEIGHTS
811
+ # )
812
+ # else:
813
+ # middlename_percent=0.0
814
+
815
+ # # 4. Calculate lastname_percent: compare lastnames
816
+ # if r1_lname and r2_lname and r1_lname.upper() not in SURNAME_IDENTIFIER and r2_lname.upper() not in SURNAME_IDENTIFIER:
817
+ # lastname_percent = calculate_similarity_with_models(
818
+ # r1_lname,
819
+ # r2_lname,
820
+ # NAME_MODEL_WEIGHTS
821
+ # )
822
+ # else:
823
+ # lastname_percent=0.0
824
+
825
+ # result= {
826
+ # 'full_name_percent': full_name_percent,
827
+ # 'firstname_percent': firstname_percent,
828
+ # 'middlename_percent': middlename_percent,
829
+ # 'lastname_percent': lastname_percent
830
+ # }
831
+ # return result
832
+
833
+ # def match_name(name: str, firstname: str, lastname: str, middlename: str) -> float:
834
+ # """
835
+ # Match name with logic
836
+ # Returns similarity score as float or "missing value"
837
+ # """
838
+ # name_processed = preprocess_for_matching(name)
839
+ # concat_name = concatenate_name_parts(firstname, middlename, lastname)
840
+
841
+ # # Case 1: NAME matches concatenated name
842
+ # if name_processed and concat_name and name_processed == concat_name:
843
+ # return 100
844
+
845
+ # # Case 2: NAME is empty, use concatenated
846
+ # if not name_processed and concat_name:
847
+ # return 100
848
+
849
+ # # Case 3: Concat is empty, use NAME
850
+ # if name_processed and not concat_name:
851
+ # return 100
852
+
853
+ # # Case 4: Both exist but different - use model
854
+ # if name_processed and concat_name and name_processed != concat_name:
855
+ # # Pass both to model for fuzzy matching
856
+ # return match_entities(name_processed, concat_name)
857
+
858
+ # # Both empty
859
+ # return 0
860
+
861
+ # def match_names_cross_records(r1_name: str, r1_firstname: str, r1_lastname: str, r1_middlename: str,
862
+ # r2_name: str, r2_firstname: str, r2_lastname: str, r2_middlename: str) -> float:
863
+ # """
864
+ # Match names between two records with enhanced preprocessing:
865
+ # 1. Input is already lowercase + preprocessed (titles removed, variations standardized)
866
+ # 2. Surname detection β€” if only common surnames match, return 20%
867
+ # 3. Token sorting for consistent comparison
868
+ # 4. Common token detection
869
+ # 5. Initial letter boost for abbreviated names
870
+ # 6. Three-case matching (both fullname / one fullname+FML / both FML)
871
+ # """
872
+ # # ── Normalize inputs (already lowercase from preprocess_name) ──
873
+ # r1_name_proc = r1_name.strip() if r1_name and r1_name.strip() not in ["-", ""] else ""
874
+ # r2_name_proc = r2_name.strip() if r2_name and r2_name.strip() not in ["-", ""] else ""
875
+
876
+ # r1_fname = r1_firstname.strip() if r1_firstname and r1_firstname.strip() not in ["-", ""] else ""
877
+ # r1_mname = r1_middlename.strip() if r1_middlename and r1_middlename.strip() not in ["-", ""] else ""
878
+ # r1_lname = r1_lastname.strip() if r1_lastname and r1_lastname.strip() not in ["-", ""] else ""
879
+
880
+ # r2_fname = r2_firstname.strip() if r2_firstname and r2_firstname.strip() not in ["-", ""] else ""
881
+ # r2_mname = r2_middlename.strip() if r2_middlename and r2_middlename.strip() not in ["-", ""] else ""
882
+ # r2_lname = r2_lastname.strip() if r2_lastname and r2_lastname.strip() not in ["-", ""] else ""
883
+
884
+ # # ── Determine case ──
885
+ # r1_has_fullname = bool(r1_name_proc)
886
+ # r2_has_fullname = bool(r2_name_proc)
887
+
888
+ # r1_concat = concatenate_name_parts(r1_fname, r1_mname, r1_lname).lower()
889
+ # r2_concat = concatenate_name_parts(r2_fname, r2_mname, r2_lname).lower()
890
+
891
+ # # Build the effective full name string for each record
892
+ # name1_effective = r1_name_proc if r1_has_fullname else r1_concat
893
+ # name2_effective = r2_name_proc if r2_has_fullname else r2_concat
894
+
895
+ # # Both missing β†’ zero
896
+ # if not name1_effective and not name2_effective:
897
+ # return {
898
+ # 'full_name_percent': 0.0,
899
+ # 'firstname_percent': 0.0,
900
+ # 'middlename_percent': 0.0,
901
+ # 'lastname_percent': 0.0
902
+ # }
903
+
904
+ # # ── Accumulate adjustments (applied AFTER handle_case computation) ──
905
+ # adjustment = 0
906
+ # surname_penalty_val = NAME_MATCH_ADJUSTMENTS.get("surname_penalty", -30)
907
+ # initial_boost_val = NAME_MATCH_ADJUSTMENTS.get("initial_boost", 30)
908
+ # subset_boost_val = NAME_MATCH_ADJUSTMENTS.get("subset_boost", 40)
909
+
910
+ # # ── Surname detection (case 2): penalty if surname-only match ──
911
+ # surname_only_match = False
912
+ # if name1_effective and name2_effective:
913
+ # surnames1 = detect_surnames(name1_effective)
914
+ # surnames2 = detect_surnames(name2_effective)
915
+
916
+ # if surnames1 and surnames2:
917
+ # common_surnames = surnames1 & surnames2
918
+ # if common_surnames:
919
+ # tokens1_non_surname = [t for t in name1_effective.split() if t not in surnames1]
920
+ # tokens2_non_surname = [t for t in name2_effective.split() if t not in surnames2]
921
+
922
+ # if tokens1_non_surname and tokens2_non_surname:
923
+ # non_surname_overlap = set(tokens1_non_surname) & set(tokens2_non_surname)
924
+ # if not non_surname_overlap:
925
+ # non_surname1_str = " ".join(tokens1_non_surname)
926
+ # non_surname2_str = " ".join(tokens2_non_surname)
927
+ # if fuzz.ratio(non_surname1_str, non_surname2_str) < 60:
928
+ # surname_only_match = True
929
+ # adjustment += surname_penalty_val # e.g., -30
930
+
931
+ # # ── Sort tokens for boost/subset detection ──
932
+ # name1_tokens = sorted(name1_effective.split()) if name1_effective else []
933
+ # name2_tokens = sorted(name2_effective.split()) if name2_effective else []
934
+
935
+ # # ── Initial letter boost / mismatch penalty (Case 3A) ──
936
+ # # compute_initial_letter_boost returns:
937
+ # # +0.2 β†’ all initials matched β†’ add initial_boost_val (+10.5)
938
+ # # -0.2 β†’ at least one initial did NOT match β†’ subtract initial_boost_val (-10.5)
939
+ # # 0.0 β†’ no initials present β†’ no change
940
+ # if name1_tokens and name2_tokens:
941
+ # boost = compute_initial_letter_boost(name1_tokens, name2_tokens)
942
+ # if boost > 0:
943
+ # adjustment += initial_boost_val # initials matched β†’ boost
944
+ # elif boost < 0:
945
+ # adjustment -= initial_boost_val # initials mismatched β†’ penalty
946
+
947
+ # # ── Subset match boost (case 5): +40 if one is complete subset ──
948
+ # if name1_tokens and name2_tokens and len(name1_tokens) != len(name2_tokens):
949
+ # if is_subset_match(name1_tokens, name2_tokens):
950
+ # adjustment += subset_boost_val # e.g., +40
951
+
952
+ # # ── Run the appropriate case handler for base similarity ──
953
+ # result = None
954
+
955
+ # # CASE 1: Both records have full names
956
+ # if r1_has_fullname and r2_has_fullname:
957
+ # result = handle_case1(r1_name_proc, r2_name_proc,
958
+ # r1_firstname, r1_middlename, r1_lastname,
959
+ # r2_firstname, r2_middlename, r2_lastname)
960
+
961
+ # # CASE 2: One has full name, other has F/M/L
962
+ # elif r1_has_fullname and not r2_has_fullname and r2_concat:
963
+ # result = handle_case2(r1_name_proc, r2_fname, r2_mname, r2_lname, r2_concat)
964
+
965
+ # elif r2_has_fullname and not r1_has_fullname and r1_concat:
966
+ # result = handle_case2(r2_name_proc, r1_fname, r1_mname, r1_lname, r1_concat)
967
+
968
+ # # CASE 3: Both have F/M/L
969
+ # elif not r1_has_fullname and not r2_has_fullname and r1_concat and r2_concat:
970
+ # result = handle_case3(r1_fname, r1_mname, r1_lname, r1_concat,
971
+ # r2_fname, r2_mname, r2_lname, r2_concat)
972
+
973
+ # # Fallback if no case matched
974
+ # if result is None:
975
+ # result = {
976
+ # 'full_name_percent': 0.0,
977
+ # 'firstname_percent': 0.0,
978
+ # 'middlename_percent': 0.0,
979
+ # 'lastname_percent': 0.0
980
+ # }
981
+
982
+ # # ── Apply accumulated adjustments to full_name_percent ──
983
+ # if adjustment != 0:
984
+ # result['full_name_percent'] = max(0.0, min(100.0, result['full_name_percent'] + adjustment))
985
+
986
+ # return result
987
def handle_case1(full_name1: str, full_name2: str,
                 r1_fname: str, r1_mname: str, r1_lname: str,
                 r2_fname: str, r2_mname: str, r2_lname: str) -> dict:
    """
    Case-1: both records supply a full name.

    The full names are compared first; each first/middle/last pair is then
    scored independently with match_entities using NAME_MODEL_WEIGHTS.

    Returns:
        dict: {
            'full_name_percent': float,   # full_name1 vs full_name2
            'firstname_percent': float,   # r1_fname vs r2_fname
            'middlename_percent': float,  # r1_mname vs r2_mname
            'lastname_percent': float     # r1_lname vs r2_lname
        }
        A component scores 0.0 when either side of its pair is empty.
    """
    def _component_score(left: str, right: str) -> float:
        # A component is only comparable when both records provide it.
        if not (left and right):
            return 0.0
        return match_entities(left, right, NAME_MODEL_WEIGHTS)

    # Space- and case-agnostic equality on the raw strings wins outright.
    compact1 = full_name1.replace(" ", "").upper()
    compact2 = full_name2.replace(" ", "").upper()
    if compact1 == compact2:
        overall = 100.0
    else:
        # Otherwise compare the normalised, token-sorted forms of both names.
        ordered1 = _normalize_and_sort(full_name1)
        ordered2 = _normalize_and_sort(full_name2)
        overall = match_entities(ordered1, ordered2, NAME_MODEL_WEIGHTS)

    first_score = _component_score(r1_fname, r2_fname)
    middle_score = _component_score(r1_mname, r2_mname)
    last_score = _component_score(r1_lname, r2_lname)

    return {
        'full_name_percent': overall,
        'firstname_percent': first_score,
        'middlename_percent': middle_score,
        'lastname_percent': last_score
    }
1051
+
1052
def handle_case2(full_name: str,
                 fname: str, mname: str, lname: str,
                 concat_name: str) -> dict:
    """
    Case-2: one side has a full name, the other has first/middle/last parts.

    Args:
        full_name: Full name string from the record that supplies one.
        fname: First name from the other record (may be empty).
        mname: Middle name from the other record (may be empty).
        lname: Last name from the other record (may be empty).
        concat_name: The other record's parts already joined into one string.

    Returns:
        dict: {
            'full_name_percent': float,   # full_name vs concat_name
            'firstname_percent': float,   # full_name vs fname
            'middlename_percent': float,  # full_name vs mname
            'lastname_percent': float     # full_name vs lname
        }
        A component scores 0.0 when its part is empty; the last name also
        scores 0.0 when its upper-cased form is listed in SURNAME_IDENTIFIER.
    """
    # 0. Try every combination of F/M/L against the full name, ignoring spaces
    #    and case. Both sides are upper-cased (as handle_case1 does) so that
    #    lower-case inputs can still hit the exact-match shortcut; previously
    #    only full_name was upper-cased. The target is built once, not per
    #    candidate.
    target = full_name.upper().strip().replace(" ", "")
    exact_reconstruction = any(
        permuted.replace(" ", "").upper() == target
        for permuted in _all_name_combinations(fname, mname, lname)
    )

    # 1. full_name_percent: perfect when the parts rebuild the full name,
    #    otherwise score the normalised, token-sorted forms.
    if exact_reconstruction:
        full_name_percent = 100.0
    else:
        sorted_full = _normalize_and_sort(full_name)
        sorted_concat = _normalize_and_sort(concat_name)
        full_name_percent = match_entities(
            sorted_full,
            sorted_concat,
            NAME_MODEL_WEIGHTS
        )

    # 2. firstname_percent: full_name vs first name only
    if fname:
        firstname_percent = match_entities(full_name, fname, NAME_MODEL_WEIGHTS)
    else:
        firstname_percent = 0.0

    # 3. middlename_percent: full_name vs middle name only
    if mname:
        middlename_percent = match_entities(full_name, mname, NAME_MODEL_WEIGHTS)
    else:
        middlename_percent = 0.0

    # 4. lastname_percent: full_name vs last name only, skipped for entries
    #    listed in SURNAME_IDENTIFIER
    if lname and lname.upper() not in SURNAME_IDENTIFIER:
        lastname_percent = match_entities(full_name, lname, NAME_MODEL_WEIGHTS)
    else:
        lastname_percent = 0.0

    return {
        'full_name_percent': full_name_percent,
        'firstname_percent': firstname_percent,
        'middlename_percent': middlename_percent,
        'lastname_percent': lastname_percent
    }
1122
+
1123
+
1124
def handle_case3(r1_fname: str, r1_mname: str, r1_lname: str, r1_concat: str,
                 r2_fname: str, r2_mname: str, r2_lname: str, r2_concat: str) -> dict:
    """
    Score Case 3: both records supply separate first / middle / last parts.

    The full-name score is the model similarity of the two concatenated
    names, raised to a floor when the last names substring-match:
      * last name only                  -> at least 85.0
      * last name + first and/or middle -> at least 90.0
    Without a last-name substring match the model score is used unchanged.

    Returns:
        dict: {
            'full_name_percent': float,   # r1_concat vs r2_concat (+ floor)
            'firstname_percent': float,   # r1_fname vs r2_fname
            'middlename_percent': float,  # r1_mname vs r2_mname
            'lastname_percent': float     # r1_lname vs r2_lname
        }
        A component score is 0.0 when either side of that component is
        empty; 'lastname_percent' is also 0.0 when either last name
        (upper-cased) is listed in SURNAME_IDENTIFIER.
    """
    both_first = r1_fname and r2_fname
    both_middle = r1_mname and r2_mname
    both_last = r1_lname and r2_lname

    # Substring checks run only for components present on both sides.
    first_hit = check_substring_match(r1_fname, r2_fname) if both_first else False
    middle_hit = check_substring_match(r1_mname, r2_mname) if both_middle else False
    last_hit = check_substring_match(r1_lname, r2_lname) if both_last else False

    # Base similarity of the concatenated names.
    full_name_percent = match_entities(r1_concat, r2_concat, NAME_MODEL_WEIGHTS)

    # A last-name hit guarantees a minimum score; a second matching component
    # raises that minimum.
    if last_hit:
        floor = 90.0 if (first_hit or middle_hit) else 85.0
        full_name_percent = max(full_name_percent, floor)

    # Per-component similarities (0.0 when a side is missing).
    firstname_percent = (
        match_entities(r1_fname, r2_fname, NAME_MODEL_WEIGHTS) if both_first else 0.0
    )
    middlename_percent = (
        match_entities(r1_mname, r2_mname, NAME_MODEL_WEIGHTS) if both_middle else 0.0
    )

    # Last names listed in SURNAME_IDENTIFIER are not scored.
    score_last = both_last and not (
        r1_lname.upper() in SURNAME_IDENTIFIER
        or r2_lname.upper() in SURNAME_IDENTIFIER
    )
    lastname_percent = (
        match_entities(r1_lname, r2_lname, NAME_MODEL_WEIGHTS) if score_last else 0.0
    )

    return {
        'full_name_percent': full_name_percent,
        'firstname_percent': firstname_percent,
        'middlename_percent': middlename_percent,
        'lastname_percent': lastname_percent,
    }
1196
+
1197
def match_name(name: str, firstname: str, lastname: str, middlename: str) -> float:
    """
    Score a record's NAME field against its own first/middle/last parts.

    Outcomes:
      * both forms empty             -> 0
      * exactly one form present     -> 100
      * both present and identical   -> 100
      * both present but different   -> result of match_entities()
    """
    name_processed = preprocess_for_matching(name)
    concat_name = concatenate_name_parts(firstname, middlename, lastname)

    has_name = bool(name_processed)
    has_concat = bool(concat_name)

    # Nothing to compare at all.
    if not has_name and not has_concat:
        return 0

    # Only one representation exists, so there is nothing to contradict it.
    if has_name != has_concat:
        return 100

    # Both exist: identical strings are a perfect match ...
    if name_processed == concat_name:
        return 100

    # ... otherwise let the model score the pair.
    return match_entities(name_processed, concat_name)
1224
+
1225
def match_names_cross_records(r1_name: str, r1_firstname: str, r1_lastname: str, r1_middlename: str,
                              r2_name: str, r2_firstname: str, r2_lastname: str, r2_middlename: str) -> dict:
    """
    Match the names of two records and return per-component similarity scores.

    Pipeline:
      1. Normalise every input: None, blank and "-" become "", the rest is
         stripped. Inputs are expected to be lower-cased by the caller's
         preprocessing.
      2. Build an "effective" full name per record: the full-name field if
         present, otherwise the concatenated F/M/L parts.
      3. Accumulate adjustments for 'full_name_percent':
           - surname-only match: both names share a detected surname, but
             their non-surname tokens do not overlap and are less than 60%
             similar -> NAME_MATCH_ADJUSTMENTS["surname_penalty"]
             (default -30)
           - conflicting surnames: both names have detected surnames with
             none in common (e.g. "krishna rajput" vs "krishna singh") ->
             NAME_MATCH_ADJUSTMENTS["surname_conflict_penalty"]
             (optional key, default -40)
           - initial-letter boost -> NAME_MATCH_ADJUSTMENTS["initial_boost"]
             (default +30)
           - subset boost, only when token counts differ ->
             NAME_MATCH_ADJUSTMENTS["subset_boost"] (default +40)
      4. Compute base scores with the matching handler:
           CASE 1 both full names  -> handle_case1
           CASE 2 one full name    -> handle_case2
           CASE 3 both F/M/L       -> handle_case3
      5. Add the accumulated adjustment to 'full_name_percent' and clamp the
         result to [0, 100].

    Returns:
        dict with keys 'full_name_percent', 'firstname_percent',
        'middlename_percent', 'lastname_percent'. All are 0.0 when both
        names are missing or no case applies.
    """
    def _clean(value):
        """Return the stripped value, or "" for None / blank / "-" placeholders."""
        if not value:
            return ""
        stripped = value.strip()
        return "" if stripped == "-" else stripped

    def _zero_scores():
        """Return a fresh all-zero result dict."""
        return {
            'full_name_percent': 0.0,
            'firstname_percent': 0.0,
            'middlename_percent': 0.0,
            'lastname_percent': 0.0,
        }

    # -- Normalise inputs --------------------------------------------------
    r1_name_proc = _clean(r1_name)
    r2_name_proc = _clean(r2_name)

    r1_fname, r1_mname, r1_lname = _clean(r1_firstname), _clean(r1_middlename), _clean(r1_lastname)
    r2_fname, r2_mname, r2_lname = _clean(r2_firstname), _clean(r2_middlename), _clean(r2_lastname)

    # -- Determine which representation each record offers ------------------
    r1_has_fullname = bool(r1_name_proc)
    r2_has_fullname = bool(r2_name_proc)

    r1_concat = concatenate_name_parts(r1_fname, r1_mname, r1_lname).lower()
    r2_concat = concatenate_name_parts(r2_fname, r2_mname, r2_lname).lower()

    # Effective full name: prefer the explicit field, else the joined parts.
    name1_effective = r1_name_proc or r1_concat
    name2_effective = r2_name_proc or r2_concat

    # Both missing -> zero
    if not name1_effective and not name2_effective:
        return _zero_scores()

    # -- Accumulate adjustments (applied AFTER the case handler runs) --------
    adjustment = 0
    surname_penalty_val = NAME_MATCH_ADJUSTMENTS.get("surname_penalty", -30)
    initial_boost_val = NAME_MATCH_ADJUSTMENTS.get("initial_boost", 30)
    subset_boost_val = NAME_MATCH_ADJUSTMENTS.get("subset_boost", 40)
    # Previously a hard-coded -40; now configurable like its siblings, with
    # the same default.
    surname_conflict_val = NAME_MATCH_ADJUSTMENTS.get("surname_conflict_penalty", -40)

    # -- Surname analysis ---------------------------------------------------
    if name1_effective and name2_effective:
        surnames1 = detect_surnames(name1_effective)
        surnames2 = detect_surnames(name2_effective)

        if surnames1 and surnames2:
            if surnames1 & surnames2:
                # Shared surname: penalise only when the rest of the names
                # neither overlap nor look alike (a "family-only" match).
                rest1 = [t for t in name1_effective.split() if t not in surnames1]
                rest2 = [t for t in name2_effective.split() if t not in surnames2]
                if rest1 and rest2 and not (set(rest1) & set(rest2)):
                    if fuzz.ratio(" ".join(rest1), " ".join(rest2)) < 60:
                        adjustment += surname_penalty_val
            else:
                # Both names carry a known surname, but none is shared.
                adjustment += surname_conflict_val

    # -- Sorted tokens for boost / subset detection -------------------------
    name1_tokens = sorted(name1_effective.split()) if name1_effective else []
    name2_tokens = sorted(name2_effective.split()) if name2_effective else []

    if name1_tokens and name2_tokens:
        # Initial-letter boost for abbreviated names.
        if compute_initial_letter_boost(name1_tokens, name2_tokens) > 0:
            adjustment += initial_boost_val

        # Subset boost: only meaningful when the token counts differ.
        if len(name1_tokens) != len(name2_tokens) and is_subset_match(name1_tokens, name2_tokens):
            adjustment += subset_boost_val

    # -- Run the appropriate case handler for the base similarity -----------
    result = None

    # CASE 1: both records have full names. Pass the NORMALISED parts, as
    # CASE 2 and CASE 3 do, so placeholders such as "-" or None never reach
    # the handler.
    if r1_has_fullname and r2_has_fullname:
        result = handle_case1(r1_name_proc, r2_name_proc,
                              r1_fname, r1_mname, r1_lname,
                              r2_fname, r2_mname, r2_lname)

    # CASE 2: one has a full name, the other has F/M/L
    elif r1_has_fullname and not r2_has_fullname and r2_concat:
        result = handle_case2(r1_name_proc, r2_fname, r2_mname, r2_lname, r2_concat)

    elif r2_has_fullname and not r1_has_fullname and r1_concat:
        result = handle_case2(r2_name_proc, r1_fname, r1_mname, r1_lname, r1_concat)

    # CASE 3: both have F/M/L
    elif not r1_has_fullname and not r2_has_fullname and r1_concat and r2_concat:
        result = handle_case3(r1_fname, r1_mname, r1_lname, r1_concat,
                              r2_fname, r2_mname, r2_lname, r2_concat)

    # Fallback if no case matched (e.g. only one record has any name data).
    if result is None:
        result = _zero_scores()

    # -- Apply accumulated adjustments to full_name_percent -----------------
    if adjustment != 0:
        result['full_name_percent'] = max(0.0, min(100.0, result['full_name_percent'] + adjustment))

    return result
1357
+
1358
def match_addresses_1_to_n(addresses_r1: List[str], addresses_r2: List[str]) -> float:
    """
    Match two lists of plain address-line strings (no separate
    city/zipcode/state) and return the best pairwise score.

    Pipeline, per address pair:
      1. Extract components (house_number, flat_number, apartment, street)
         from each address.
      2. Score the remaining address text (components and post-box removed)
         with the embedding model -> base_score.
      3. If base_score > 60, apply a boost or penalty for each component
         present on both sides:
             house_number : match +30 / mismatch -30
             flat_number  : match +10 / mismatch -10
             apartment    : match +10 / mismatch -10
             street       : match +10 / mismatch -10
         If base_score <= 60, all component adjustments are skipped.
      4. Add the named-component and post-box adjustments (always applied).
      5. Clamp the final score to [0, 100].

    Args:
        addresses_r1: Address lines of record 1. None, blank and "-" entries
            are ignored.
        addresses_r2: Address lines of record 2, filtered the same way.

    Returns:
        Best score over all pairs, rounded to 2 decimals; 0.0 when either
        side has no usable address.
    """
    placeholders = ("-", "")
    raw1 = [a for a in (addresses_r1 or []) if a and str(a).strip() not in placeholders]
    raw2 = [a for a in (addresses_r2 or []) if a and str(a).strip() not in placeholders]

    # Cheap exit before importing the rules module or touching the model.
    if not raw1 or not raw2:
        return 0.0

    from services.rules import (
        preprocess_address as _preprocess_addr,
        compare_named_components as _compare_named,
        compare_postbox as _compare_postbox,
        remove_postbox_from_address as _strip_postbox,
        extract_address_components as _extract_components,
    )

    def _norm(val):
        """Strip all non-alphanumerics, e.g. 144/143 -> 144143."""
        if not val:
            return ""
        return re.sub(r'[^A-Z0-9]', '', str(val).upper())

    def _component_adj(v1, v2, boost, penalty):
        """Return (verdict, adjustment) for a single component pair."""
        if v1 and v2:
            return ("match", boost) if v1 == v2 else ("mismatch", -penalty)
        return ("missing", 0.0)

    def _prepare(raw):
        """Compute everything that depends on ONE address only, once."""
        comp = _extract_components(raw)
        full = _preprocess_addr(raw).upper()
        # `or ""` guards against the key being present with a None value.
        remaining = (comp.get("remaining_address") or "").strip()
        if not remaining:
            # Fall back to the full preprocessed address when nothing is left.
            remaining = full
        return {
            "house_number": _norm(comp.get("house_number")),
            "flat_number": _norm(comp.get("flat_number")),
            "apartment": _norm(comp.get("apartment")),
            "street": _norm(comp.get("street")),
            "clean": _strip_postbox(remaining) or remaining,
            "full": full,
        }

    # Per-address work is done once here instead of once per N x M pair.
    prepared1 = [_prepare(a) for a in raw1]
    prepared2 = [_prepare(a) for a in raw2]

    # (component key, boost on match, penalty on mismatch)
    component_rules = (
        ("house_number", 30.0, 30.0),
        ("flat_number", 10.0, 10.0),
        ("apartment", 10.0, 10.0),
        ("street", 10.0, 10.0),
    )

    best_score = 0.0

    for p1 in prepared1:
        for p2 in prepared2:
            addr1_clean = p1["clean"]
            addr2_clean = p2["clean"]

            # Named-component and post-box comparison use the full
            # preprocessed address, not the stripped remainder.
            named_result = _compare_named(p1["full"], p2["full"])
            pb_result = _compare_postbox(p1["full"], p2["full"])

            try:
                base_score = float(match_entities(addr1_clean, addr2_clean,
                                                  weights=ADDRESS_MODEL_WEIGHTS))
            except (TypeError, ValueError):
                # Non-numeric model result: treat as no similarity.
                base_score = 0.0

            # Component adjustments (only when base_score > 60)
            comp_adj = 0.0
            print(f"[ADDR_COMPONENTS] base_score={base_score:.2f} | threshold=60 | adjustments_applied={base_score > 60}")
            print(f" remaining_addr1 : {addr1_clean!r}")
            print(f" remaining_addr2 : {addr2_clean!r}")
            for label, boost, penalty in component_rules:
                v1, v2 = p1[label], p2[label]
                verdict, adj = _component_adj(v1, v2, boost, penalty)
                if verdict == "missing":
                    print(f" {label:<15} | verdict=missing | v1={v1!r:>10} v2={v2!r:<10} | adjustment=0.0 [skipped - component absent]")
                elif base_score <= 60:
                    print(f" {label:<15} | verdict={verdict:<9} | v1={v1!r:>10} v2={v2!r:<10} | adjustment=0.0 [SKIPPED - base_score<=60]")
                else:
                    comp_adj += adj
                    sign = "+" if adj >= 0 else ""
                    tag = "BOOSTED" if adj > 0 else "PENALISED"
                    print(f" {label:<15} | verdict={verdict:<9} | v1={v1!r:>10} v2={v2!r:<10} | adjustment={sign}{adj:.1f} [{tag}]")
            print(f" total comp_adj : {comp_adj:+.1f}")

            adjustment = comp_adj + named_result['score_adjustment'] + pb_result['adjustment']
            final_score = max(0.0, min(100.0, base_score + adjustment))
            if final_score > best_score:
                best_score = final_score

    return round(best_score, 2)
1475
+
1476
+
1477
def match_addresses_structured(
    addrs_r1: List[dict],
    addrs_r2: List[dict],
) -> float:
    """
    Match addresses whose city / zipcode / state come as separate columns.

    This is a thin wrapper: all the work is delegated to
    services.rules.match_structured_address_lists, and its result is
    returned unchanged.

    Each address dict is expected to provide the keys
    addressline, city, zipcode and state.

    NOTE(review): according to the original notes, the delegate returns the
    best score (0-100) across all N x M combinations and handles inferring
    a missing state/city from the zipcode, bank state codes (e.g. NDH),
    city-name variants via CITY_MAPPING, house-number comparison and
    embedding-based address-line similarity. That code is not visible
    here, so confirm against services/rules.py.

    Example:
        addrs1 = [{"addressline": "A13 GUPTA ENCLAVE...",
                   "city": "NEW DELHI", "zipcode": "110059", "state": "NDH"}]
        addrs2 = [{"addressline": "A13 GUPTA ENCLAVE...",
                   "city": "NEW DELHI", "zipcode": "110059", "state": "DELHI"}]
        score = match_addresses_structured(addrs1, addrs2)
    """
    # Imported lazily, inside the function, as in the rest of this module.
    from services.rules import match_structured_address_lists as _match_lists

    best = _match_lists(addrs_r1, addrs_r2)
    return best
1503
+
1504
def match_single_field(value1: str, value2: str) -> float:
    """
    Match a single free-text field pair (e.g. SPOUSENAME, MOTHERNAME).

    Delegates to match_entities() with its default weights and returns the
    result unchanged.

    NOTE(review): whether match_entities can return a non-numeric marker for
    missing input is not visible here - confirm before relying on a float.
    """
    similarity = match_entities(value1, value2)
    return similarity
services/rules.py ADDED
The diff for this file is too large to render. See raw diff