Commit ·
e9084d7
1
Parent(s): 236f74b
initial push
Browse files- README.md +6 -8
- backend/__init__.py +0 -0
- backend/__pycache__/__init__.cpython-310.pyc +0 -0
- backend/__pycache__/__init__.cpython-312.pyc +0 -0
- backend/__pycache__/matching_service.cpython-310.pyc +0 -0
- backend/__pycache__/matching_service.cpython-312.pyc +0 -0
- backend/__pycache__/models.cpython-310.pyc +0 -0
- backend/__pycache__/models.cpython-312.pyc +0 -0
- backend/__pycache__/server.cpython-310.pyc +0 -0
- backend/__pycache__/server.cpython-312.pyc +0 -0
- backend/config/common.properties +111 -0
- backend/matching_service.py +683 -0
- backend/models.py +536 -0
- backend/server.py +323 -0
- data/city_prev_pres.csv +395 -0
- data/hno_variation_standard.csv +619 -0
- data/name_variation_standard.csv +0 -0
- data/pin_city_state.csv +0 -0
- data/state_name_standard.csv +244 -0
- data/sur_comm_names.csv +182 -0
- frontend/app.py +673 -0
- frontend/assests/Logo icon_color.png +0 -0
- none.webp +0 -0
- note.txt +48 -0
- requirements.txt +20 -0
- services/__pycache__/config.cpython-310.pyc +0 -0
- services/__pycache__/config.cpython-312.pyc +0 -0
- services/__pycache__/llm_model.cpython-310.pyc +0 -0
- services/__pycache__/llm_model.cpython-312.pyc +0 -0
- services/__pycache__/model.cpython-310.pyc +0 -0
- services/__pycache__/model.cpython-312.pyc +0 -0
- services/__pycache__/rules.cpython-310.pyc +0 -0
- services/__pycache__/rules.cpython-312.pyc +0 -0
- services/address_matcher.py +722 -0
- services/config.py +121 -0
- services/model.py +1509 -0
- services/rules.py +0 -0
README.md
CHANGED
|
@@ -1,12 +1,10 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: purple
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
-
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: GEN AI Record Level Matching
|
| 3 |
+
emoji: π
|
| 4 |
colorFrom: purple
|
| 5 |
+
colorTo: orange
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: "4.44.0"
|
| 8 |
+
app_file: frontend/app.py
|
| 9 |
pinned: false
|
| 10 |
+
---
|
|
|
|
|
|
backend/__init__.py
ADDED
|
File without changes
|
backend/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (169 Bytes). View file
|
|
|
backend/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (173 Bytes). View file
|
|
|
backend/__pycache__/matching_service.cpython-310.pyc
ADDED
|
Binary file (19.5 kB). View file
|
|
|
backend/__pycache__/matching_service.cpython-312.pyc
ADDED
|
Binary file (32.9 kB). View file
|
|
|
backend/__pycache__/models.cpython-310.pyc
ADDED
|
Binary file (14.2 kB). View file
|
|
|
backend/__pycache__/models.cpython-312.pyc
ADDED
|
Binary file (18.9 kB). View file
|
|
|
backend/__pycache__/server.cpython-310.pyc
ADDED
|
Binary file (8.2 kB). View file
|
|
|
backend/__pycache__/server.cpython-312.pyc
ADDED
|
Binary file (11.8 kB). View file
|
|
|
backend/config/common.properties
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[MAPPING_DICT]
|
| 2 |
+
|
| 3 |
+
CITY_MAPPING = {"MUMBAI":["MUMBAI","BOMBAY","MUMBAI SUBURBAN"],"DELHI":["DELHI","NEW DELHI","DELHI NCR","NCT OF DELHI","SEELAMPUR","SHAHDARA","DWARKA","ROHINI","PITAMPURA","KAROL BAGH","LAJPAT NAGAR","SAKET","JANAKPURI","MAYUR VIHAR","VASANT KUNJ","OKHLA"],"BENGALURU":["BENGALURU","BANGALORE","BENGALURU URBAN"],"HYDERABAD":["HYDERABAD","SECUNDERABAD","HYDERABAD CITY"],"CHENNAI":["CHENNAI","MADRAS","CHENNAI CITY"],"KOLKATA":["KOLKATA","CALCUTTA","KOLKATA CITY"],"PUNE":["PUNE","POONA"],"AHMEDABAD":["AHMEDABAD","AMDAVAD"],"JAIPUR":["JAIPUR","PINK CITY"],"LUCKNOW":["LUCKNOW","LAKHNAU"],"KANPUR":["KANPUR","CAWNPORE"],"NAGPUR":["NAGPUR"],"INDORE":["INDORE"],"THANE":["THANE","THANA"],"BHOPAL":["BHOPAL"],"VISAKHAPATNAM":["VISAKHAPATNAM","VIZAG","VISHAKHAPATNAM"],"PIMPRI-CHINCHWAD":["PIMPRI-CHINCHWAD","PIMPRI CHINCHWAD","PCMC"],"PATNA":["PATNA","PATALIPUTRA"],"VADODARA":["VADODARA","BARODA"],"GHAZIABAD":["GHAZIABAD","GHZ"],"LUDHIANA":["LUDHIANA"],"AGRA":["AGRA"],"NASHIK":["NASHIK","NASIK"],"FARIDABAD":["FARIDABAD"],"MEERUT":["MEERUT"],"RAJKOT":["RAJKOT"],"KALYAN-DOMBIVLI":["KALYAN-DOMBIVLI","KALYAN","DOMBIVLI"],"VASAI-VIRAR":["VASAI-VIRAR","VASAI","VIRAR"],"VARANASI":["VARANASI","BANARAS","BENARES","KASHI"],"SRINAGAR":["SRINAGAR"],"AURANGABAD":["AURANGABAD"],"DHANBAD":["DHANBAD"],"AMRITSAR":["AMRITSAR"],"NAVI MUMBAI":["NAVI MUMBAI","NEW BOMBAY"],"ALLAHABAD":["ALLAHABAD","PRAYAGRAJ","ILAHABAD"],"RANCHI":["RANCHI"],"HOWRAH":["HOWRAH","HAORA"],"COIMBATORE":["COIMBATORE"],"JABALPUR":["JABALPUR","JUBBULPORE"],"GWALIOR":["GWALIOR"],"VIJAYAWADA":["VIJAYAWADA"],"JODHPUR":["JODHPUR"],"MADURAI":["MADURAI"],"RAIPUR":["RAIPUR"],"KOTA":["KOTA"],"GUWAHATI":["GUWAHATI","GAUHATI"],"CHANDIGARH":["CHANDIGARH","MOHALI","SAS 
NAGAR","KHARAR","PANCHKULA","ZIRAKPUR"],"SOLAPUR":["SOLAPUR","SHOLAPUR"],"HUBLI-DHARWAD":["HUBLI-DHARWAD","HUBLI","DHARWAD"],"BAREILLY":["BAREILLY"],"MORADABAD":["MORADABAD"],"MYSORE":["MYSORE","MYSURU"],"GURGAON":["GURGAON","GURUGRAM"],"ALIGARH":["ALIGARH"],"JALANDHAR":["JALANDHAR"],"TIRUCHIRAPPALLI":["TIRUCHIRAPPALLI","TRICHY","TRICHINOPOLY"],"BHUBANESWAR":["BHUBANESWAR","BHUBANESHWAR"],"SALEM":["SALEM"],"WARANGAL":["WARANGAL"],"THIRUVANANTHAPURAM":["THIRUVANANTHAPURAM","TRIVANDRUM"],"GUNTUR":["GUNTUR"],"BHIWANDI":["BHIWANDI"],"SAHARANPUR":["SAHARANPUR"],"GORAKHPUR":["GORAKHPUR"],"BIKANER":["BIKANER"],"AMRAVATI":["AMRAVATI"],"NOIDA":["NOIDA"],"JAMSHEDPUR":["JAMSHEDPUR","TATANAGAR"],"BHILAI":["BHILAI","BHILAI NAGAR"],"CUTTACK":["CUTTACK"],"FIROZABAD":["FIROZABAD"],"KOCHI":["KOCHI","COCHIN"],"BHAVNAGAR":["BHAVNAGAR"],"DEHRADUN":["DEHRADUN","DEHRA DUN"],"DURGAPUR":["DURGAPUR"],"ASANSOL":["ASANSOL"],"NANDED":["NANDED"],"KOLHAPUR":["KOLHAPUR"],"AJMER":["AJMER"],"GULBARGA":["GULBARGA","KALABURAGI"],"JAMNAGAR":["JAMNAGAR"],"UJJAIN":["UJJAIN"],"LONI":["LONI"],"SILIGURI":["SILIGURI"],"JHANSI":["JHANSI"],"ULHASNAGAR":["ULHASNAGAR"],"NELLORE":["NELLORE"],"JAMMU":["JAMMU"],"SANGALI-MIRAJ-KUPWAD":["SANGALI-MIRAJ-KUPWAD","SANGALI","MIRAJ","KUPWAD"],"BELGAUM":["BELGAUM","BELAGAVI"],"MANGALORE":["MANGALORE","MANGALURU"],"AMBATTUR":["AMBATTUR"],"TIRUNELVELI":["TIRUNELVELI"],"MALEGAON":["MALEGAON"],"GREATER NOIDA":["GREATER NOIDA"]}
|
| 4 |
+
|
| 5 |
+
STATE_MAPPING = {"ANDHRA PRADESH":["ANDHRA PRADESH","ANDHRAPRADESH","ANDHRA","AP","A.P","A.P.","AP STATE","IN-AP"],"ARUNACHAL PRADESH":["ARUNACHAL PRADESH","ARUNACHAL","AR","A.R","ARUNACHAL PRADESH STATE","IN-AR"],"ASSAM":["ASSAM","AS","A.S","ASSAM STATE","IN-AS"],"BIHAR":["BIHAR","BR","B.R","BIHAR STATE","IN-BR"],"CHHATTISGARH":["CHHATTISGARH","CHATTISGARH","CHHATISGARH","CG","C.G","CT","CHATTISGARH STATE","IN-CG"],"GOA":["GOA","GA","G.A","IN-GA"],"GUJARAT":["GUJARAT","GUJRAT","GUJARATH","GJ","G.J","IN-GJ"],"HARYANA":["HARYANA","HARIYANA","HR","H.R","IN-HR"],"HIMACHAL PRADESH":["HIMACHAL PRADESH","HIMACHAL","HP","H.P","H.P.","IN-HP"],"JHARKHAND":["JHARKHAND","JH","J.H","IN-JH"],"KARNATAKA":["KARNATAKA","KARNATAK","KARN","KA","K.A","MYSORE STATE","IN-KA"],"KERALA":["KERALA","KERALAM","KL","K.L","IN-KL"],"MADHYA PRADESH":["MADHYA PRADESH","MADHYAPRADESH","MADHYA","MP","M.P","M.P.","MP STATE","IN-MP"],"MAHARASHTRA":["MAHARASHTRA","MAHARASTRA","MAHA","MH","M.H","MAHARASHTRA STATE","IN-MH"],"MANIPUR":["MANIPUR","MN","M.N","IN-MN"],"MEGHALAYA":["MEGHALAYA","ML","M.L","IN-ML"],"MIZORAM":["MIZORAM","MZ","M.Z","IN-MZ"],"NAGALAND":["NAGALAND","NL","N.L","IN-NL"],"ODISHA":["ODISHA","ORISSA","OD","O.D","OR","O.R","ODISHA STATE","IN-OD"],"PUNJAB":["PUNJAB","PANJAB","PB","P.B","IN-PB"],"RAJASTHAN":["RAJASTHAN","RAJ","RJ","R.J","RAJASTHAN STATE","IN-RJ"],"SIKKIM":["SIKKIM","SK","S.K","IN-SK"],"TAMIL NADU":["TAMIL NADU","TAMILNADU","TAMIL","TN","T.N","T.N.","TAMILNADU STATE","IN-TN"],"TELANGANA":["TELANGANA","TELENGANA","TG","T.G","TS","T.S","TELANGANA STATE","IN-TS"],"TRIPURA":["TRIPURA","TR","T.R","IN-TR"],"UTTAR PRADESH":["UTTAR PRADESH","UTTARPRADESH","UTTAR","UP","U.P","U.P.","UP STATE","IN-UP"],"UTTARAKHAND":["UTTARAKHAND","UTTARANCHAL","UK","U.K","UA","UTTARAKHAND STATE","IN-UK"],"WEST BENGAL":["WEST BENGAL","WESTBENGAL","WB","W.B","W.B.","WEST BENGAL STATE","IN-WB"],"ANDAMAN AND NICOBAR ISLANDS":["ANDAMAN AND NICOBAR ISLANDS","ANDAMAN 
NICOBAR","ANDAMAN","NICOBAR","AN","A.N","A & N ISLANDS","IN-AN"],"CHANDIGARH":["CHANDIGARH","CH","C.H","IN-CH","MOHALI","SAS NAGAR","KHARAR","PANCHKULA","ZIRAKPUR"],"DADRA AND NAGAR HAVELI AND DAMAN AND DIU":["DADRA AND NAGAR HAVELI AND DAMAN AND DIU","DADRA NAGAR HAVELI","DAMAN DIU","DN","D.N","DNH","DD","IN-DH"],"DELHI":["DELHI","NEW DELHI","DL","D.L","NCT OF DELHI","NATIONAL CAPITAL TERRITORY OF DELHI","NORTH EAST DELHI","NORTH WEST DELHI","SOUTH EAST DELHI","SOUTH WEST DELHI","SEELAMPUR","SHAHDARA","DWARKA","ROHINI","PITAMPURA","KAROL BAGH","LAJPAT NAGAR","SAKET","JANAKPURI","MAYUR VIHAR","VASANT KUNJ","OKHLA","NOIDA","GREATER NOIDA","FARIDABAD","GHAZIABAD","GHZ","INDIRAPURAM","GURUGRAM","GURGAON","IN-DL"],"JAMMU AND KASHMIR":["JAMMU AND KASHMIR","JAMMU","KASHMIR","JK","J.K","J&K","JAMMU & KASHMIR","IN-JK"],"LADAKH":["LADAKH","LA","L.A","IN-LA"],"LAKSHADWEEP":["LAKSHADWEEP","LAKSHADWEEP ISLANDS","LD","L.D","IN-LD"],"PUDUCHERRY":["PUDUCHERRY","PONDICHERRY","PY","P.Y","IN-PY"]}
|
| 6 |
+
|
| 7 |
+
ADDRESS_MAPPING = {"DIST":["DISTRICT","DIST","DST","DSTR","DT","ZILLA","JILLA","ZILA"],"TALUK":["TALUK","TAL","TALUKA","TQ","TEH","TEHS","TEHSIL","MANDAL","MD"],"VILLAGE":["VILLAGE","VILL","VIL","VLG","GRAMA","GRAM","GAON"],"CITY":["CITY","CTY","TOWN","TWN","NAGAR","NAG","PURAM","PURA"],"STATE":["STATE","ST","RAJYA","PRADESH"],"HOUSE":["HOUSE NO","H NO","H.NO","H-NO","H/NO","HNO","HOUSE NUMBER","HOUSE#","HOUSE NUM","PROPERTY NO","PROPERTY NUMBER","RESIDENCE NO","RES NO","H:NO","H.NO:","D:NO","D.NO:"],"HOUSE":["D NO","D.NO","D-NO","D/NO","DNO","DOOR NO","DOOR NUMBER"],"APT":["APARTMENT","APT","APT NO","APT NUMBER","APARTMENT NO","TOWER","TOWER NO","WING","PHASE","PHASE NO","RESIDENCY","RESIDENTIAL COMPLEX","HEIGHTS","ENCLAVE","APARTMENTS","SOCIETY","SOCIETY NO","CHS"],"BLDG":["BLDG","BLDG NO","BUILDING","BUILDING NO"],"BLOCK":["BLK","BLOCK","BLOCK NO"],"FLAT":["FLAT","FLAT NO","FLAT NUMBER","FLT","FLT NO","UNIT","UNIT NO","UNIT NUMBER","PORTION","PORTION NO","OFFICE NO","OFFICE NUMBER"],"SHOP":["SHOP NO","SHOP NUMBER"],"ROAD":["ROAD","RD","R D","MARG","MRG","PATH"],"STR":["STREET","ST","STR","GALI","GALLLI","LANE","LN","MARG","PATH","CIRCLE","CIR","SECTOR","SEC"],"LANE":["LANE","LN","BYLANE","CROSS","CR"],"EXTN":["EXTENSION","EXT","EXTN"],"LOCALITY":["LOCALITY","LAYOUT","LYT","PHASE","PH","SECTOR","SEC"],"CLNY":["COLONY","COL","CLNY"],"BUILDING":["BUILDING","APT","APARTMENT","BLDG","TOWER"]}
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
[IDENTIFIERS]
|
| 11 |
+
|
| 12 |
+
APARTMENT_IDENTIFIER=APT
|
| 13 |
+
FLAT_NUMBER_IDENTIFIER=FLAT
|
| 14 |
+
HOUSE_NUMBER_IDENTIFIER= HOUSE
|
| 15 |
+
STREET_KEYWORD=STR
|
| 16 |
+
FLOOR_KEYWORD=FLR
|
| 17 |
+
DOOR_NO_KEYWORD=DOOR
|
| 18 |
+
INDIAN_SURNAMES = ["SINGH","THAKUR","RAJPUT","SHARMA","PANDEY","PANDE","PANDAY","MISHRA","TIWARI","TRIPATHI","DWIVEDI","AWASTHI","GUPTA","AGARWAL","AGGARWAL","BANSAL","GOYAL","MITTAL","VERMA","SAXENA","SRIVASTAVA","NIGAM","RASTOGI","MALHOTRA","KHANNA","ARORA","BHATIA","KOHLI","OBEROI","CHAUDHARY","CHOUDHARY","CHOWDHARY","CHOWDARY","PATEL","SHAH","MEHTA","PANDYA","TRIVEDI","DAVE","DESAI","PAREKH","JOSHI","BHATT","VORA","MODI","KULKARNI","DESHPANDE","APTE","CHITNIS","GOKHALE","PHADKE","NAIK","PAWAR","JADHAV","SHINDE","SAWANT","REDDY","REDDAPPA","REDDIGARI","RAO","NAIDU","RAJU","VARMA","SASTRY","SHASTRI","GOUD","CHOWDARY","CHOUDARY","SETTY","SHETTY","GOWDA","HEGDE","BHAT","PAI","KAMATH","SHENOY","PRABHU","NAYAK","SHETTY","IYER","IYENGAR","AYYAR","PILLAI","MUDALIAR","THEVAR","GOUNDER","CHETTIAR","NADAR","KANNAN","KRISHNAN","RAMAN","SUBRAMANIAN","NAIR","MENON","PILLAI","KURUP","PANICKER","VARMA","NAMBOOTHIRI","NAMBIAR","CHACKO","MATHEW","THOMAS","VARGHESE","DAS","DUTTA","BANERJEE","MUKHERJEE","CHATTERJEE","GHOSH","BOSE","SEN","ROY","SARKAR","MONDAL","MALLICK","MOHANTY","DASH","SAHOO","PATNAIK","NAYAK","PANDA","MUNDA","ORAON","TOPPO","TIRKEY","EKKA","MINZ","YADAV","SAHU","RATHORE","BAGHEL","LODHI","KUSHWAHA","THAKUR","BARUAH","BARUA","GOGOI","SAIKIA","DEKA","HAZARIKA","SANGMA","MARAK","LYNGDOH","LALLAWMA","LALTHANGLIANA","AO","SEMA","KONYAK","SHIMRAY","NINGOMBAM","ALI","HASSAN","KOYA","JAIN","MALIK","SINGHAL","AMBEDKAR","CHAMAR","PANDIT","KAPOOR","CHOPRA","MALIK","SINGHAL","JAIN","PADUKONE"]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
[csv]
|
| 23 |
+
name_variation_standard = data/name_variation_standard.csv
|
| 24 |
+
hno_variation_standard = data/hno_variation_standard.csv
|
| 25 |
+
city_prev_pres = data/city_prev_pres.csv
|
| 26 |
+
state_name_standard = data/state_name_standard.csv
|
| 27 |
+
sur_comm_names = data/sur_comm_names.csv
|
| 28 |
+
pin_city_state = data/pin_city_state.csv
|
| 29 |
+
|
| 30 |
+
[MATCHING_LOGIC]
|
| 31 |
+
MODEL_WEIGHTS = {
|
| 32 |
+
"simple_ratio": 0.0,
|
| 33 |
+
"token_set_ratio": 0.0,
|
| 34 |
+
"partial_ratio": 0.1,
|
| 35 |
+
"w_ratio": 0.1,
|
| 36 |
+
"semantic_score": 0.8
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
MATCHING_RULES = [
|
| 40 |
+
|
| 41 |
+
([("NAME", 100), ("ZIPCODE", 100), ("ADDRESSLINE", 65)], "NAME >= 100 AND ZIPCODE >= 100 AND ADDRESS >= 65"),
|
| 42 |
+
([("NAME", 100), ("CITY", 100), ("ADDRESSLINE", 65)], "NAME >= 100 AND CITY >= 100 AND ADDRESS >= 65"),
|
| 43 |
+
([("NAME", 85), ("LASTNAME", 85), ("BIRTHDATE", 100), ("ADDRESSLINE", 60)], "NAME >= 85 AND LASTNAME >= 85 AND DOB >= 100 AND ADDRESS >= 60"),
|
| 44 |
+
([("NAME", 85), ("BIRTHDATE", 100), ("ZIPCODE", 100)], "NAME >= 85 AND DOB >= 100 AND ZIPCODE >= 100"),
|
| 45 |
+
([("NAME", 85), ("BIRTHDATE", 100), ("CITY", 100)], "NAME >= 85 AND DOB >= 100 AND CITY >= 100"),
|
| 46 |
+
([("NAME", 85), ("ZIPCODE", 100), ("ADDRESSLINE", 60)], "NAME >= 85 AND ZIPCODE >= 100 AND ADDRESS >= 60"),
|
| 47 |
+
([("NAME", 85), ("CITY", 100), ("ADDRESSLINE", 60)], "NAME >= 85 AND CITY >= 100 AND ADDRESS >= 60"),
|
| 48 |
+
([("BIRTHDATE", 100), ("ZIPCODE", 100), ("ADDRESSLINE", 65)], "BIRTHDATE >= 100 AND ZIPCODE >= 100 AND ADDRESS >= 65"),
|
| 49 |
+
([("BIRTHDATE", 100), ("CITY", 100), ("ADDRESSLINE", 65)], "BIRTHDATE >= 100 AND CITY >= 100 AND ADDRESS >= 65"),
|
| 50 |
+
([("LASTNAME", 85), ("ZIPCODE", 100), ("ADDRESSLINE", 60)], "LASTNAME >= 85 AND ZIPCODE >= 100 AND ADDRESS >= 60"),
|
| 51 |
+
([("NAME", 85), ("PHONE", 100)], "NAME >= 85 AND PHONE >= 100"),
|
| 52 |
+
([("BIRTHDATE", 100), ("PHONE", 100)], "BIRTHDATE >= 100 AND PHONE >= 100"),
|
| 53 |
+
([("BIRTHDATE", 100), ("NAME", 85)], "BIRTHDATE >=100 AND NAME>=85"),
|
| 54 |
+
([("ADDRESSLINE", 60), ("TAXID", 100)], "ADDRESS >= 60 and PAN >= 100"),
|
| 55 |
+
([("ADDRESSLINE", 60), ("LICENSEID", 100)], "ADDRESS >= 60 and DRIVING_LICN_NO >= 100"),
|
| 56 |
+
([("BIRTHDATE", 75), ("PHONE", 100)], "BIRTHDATE >= 75 and PHONE >= 100"),
|
| 57 |
+
([("BIRTHDATE", 75), ("TAXID", 100)], "BIRTHDATE >= 75 and PAN >= 100"),
|
| 58 |
+
([("BIRTHDATE", 75), ("LICENSEID", 100)], "BIRTHDATE >= 75 and DRIVING_LICN_NO >= 100"),
|
| 59 |
+
([("BIRTHDATE", 75), ("PASSPORTID", 100)], "BIRTHDATE >= 75 and PASSPORT_NO >= 100"),
|
| 60 |
+
([("NAME", 60), ("PASSPORTID", 100)], "NAME >= 60 and PASSPORT_NO >= 100"),
|
| 61 |
+
([("NAME", 60), ("LICENSEID", 100)], "NAME >= 60 and DRIVING_LICN_NO >= 100"),
|
| 62 |
+
([("NAME", 60), ("TAXID", 100)], "NAME >= 60 and AADHAR >= 100"),
|
| 63 |
+
([("NAME", 60), ("AADHAR", 100)], "NAME >= 60 and AADHAR >= 100"),
|
| 64 |
+
([("NAME", 60), ("PAN", 100)], "NAME >= 60 and PAN >= 100"),
|
| 65 |
+
([("PHONE", 100)], "PHONE >= 100"),
|
| 66 |
+
([("LICENSEID", 100)], "DRIVING_LICN_NO >= 100"),
|
| 67 |
+
([("PASSPORTID", 100)], "PASSPORT_NO >= 100"),
|
| 68 |
+
([("TAXID", 100)], "AADHAR >= 100"),
|
| 69 |
+
([("AADHAR", 100)], "AADHAR >= 100"),
|
| 70 |
+
([("PAN", 100)], "PAN >= 100"),
|
| 71 |
+
([("EMAIL", 100)], "EMAIL >= 100")
|
| 72 |
+
]
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
[NAME_MATCHING]
|
| 77 |
+
NAME_MODEL_WEIGHTS = {
|
| 78 |
+
"simple_ratio": 0.04,
|
| 79 |
+
"token_set_ratio": 0.04,
|
| 80 |
+
"partial_ratio": 0.04,
|
| 81 |
+
"w_ratio": 0.04,
|
| 82 |
+
"token_sort_ratio": 0.04,
|
| 83 |
+
"semantic_score": 0.7,
|
| 84 |
+
"jaro_winkler": 0.05,
|
| 85 |
+
"indic_soundex": 0.05
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
NAME_MATCH_ADJUSTMENTS = {
|
| 89 |
+
"surname_penalty": -20,
|
| 90 |
+
"initial_boost": 20,
|
| 91 |
+
"subset_boost": 20
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
[ADDRESS_MATCHING]
|
| 95 |
+
ADDRESS_MODEL_WEIGHTS = {
|
| 96 |
+
"simple_ratio": 0.04,
|
| 97 |
+
"token_set_ratio": 0.04,
|
| 98 |
+
"partial_ratio": 0.04,
|
| 99 |
+
"w_ratio": 0.04,
|
| 100 |
+
"token_sort_ratio": 0.04,
|
| 101 |
+
"semantic_score": 0.8
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
ADDRESS_MATCH_ADJUSTMENTS = {
|
| 105 |
+
"house_match_boost": 30,
|
| 106 |
+
"house_mismatch_penalty": 70
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
[EMBEDDING_MODELS]
|
| 110 |
+
MODEL_1_NAME = BAAI/bge-small-en-v1.5
|
| 111 |
+
MODEL_2_NAME = sentence-transformers/gtr-t5-base
|
backend/matching_service.py
ADDED
|
@@ -0,0 +1,683 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
import time
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Dict, Any, Optional, Tuple, Union
|
| 6 |
+
|
| 7 |
+
# Ensure project root is on sys.path so sibling modules resolve
|
| 8 |
+
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 9 |
+
if _PROJECT_ROOT not in sys.path:
|
| 10 |
+
sys.path.insert(0, _PROJECT_ROOT)
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
import pandas as pd
|
| 14 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 15 |
+
|
| 16 |
+
from services.config import (
|
| 17 |
+
config,
|
| 18 |
+
pin_city_state_df,
|
| 19 |
+
ADDRESS_MATCH_ADJUSTMENTS,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
from services.rules import (
|
| 23 |
+
preprocess_text,
|
| 24 |
+
standardize_column,
|
| 25 |
+
standardize_city,
|
| 26 |
+
standardize_state,
|
| 27 |
+
standardize_dob,
|
| 28 |
+
compare_exact,
|
| 29 |
+
compare_any_match,
|
| 30 |
+
compare_phone_any_match,
|
| 31 |
+
compare_email_any_match,
|
| 32 |
+
evaluate_matching_rules,
|
| 33 |
+
|
| 34 |
+
apply_pattern_matching_logic,
|
| 35 |
+
pincode_similarity_india,
|
| 36 |
+
extract_address_components,
|
| 37 |
+
validate_and_normalize_phone,
|
| 38 |
+
validate_and_normalize_email,
|
| 39 |
+
validate_and_normalize_pan,
|
| 40 |
+
validate_and_normalize_aadhar,
|
| 41 |
+
preprocess_name,
|
| 42 |
+
detect_surnames,
|
| 43 |
+
compute_initial_letter_boost,
|
| 44 |
+
clean_text,
|
| 45 |
+
deduplicate_tokens,
|
| 46 |
+
deduplicate_consecutive_tokens,
|
| 47 |
+
strip_non_alphanumeric,
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
from services.model import (
|
| 51 |
+
match_names_cross_records as embedding_match_names,
|
| 52 |
+
match_single_field as embedding_match_single,
|
| 53 |
+
)
|
| 54 |
+
from services.address_matcher import match_addresses_enhanced
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
logger = logging.getLogger("matching_service")
|
| 59 |
+
|
| 60 |
+
# =========================================================
|
| 61 |
+
# SENTINEL
|
| 62 |
+
# =========================================================
|
| 63 |
+
_MISSING = -1 # internal sentinel for "field not provided"
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# =========================================================
|
| 67 |
+
# HELPERS
|
| 68 |
+
# =========================================================
|
| 69 |
+
def _is_valid_value(val: Any) -> bool:
|
| 70 |
+
if val is None:
|
| 71 |
+
return False
|
| 72 |
+
s = str(val).strip().lower()
|
| 73 |
+
return s not in ("", "-", " ", "na", "n/a", "null", "none", "missing value", "missing")
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _clean_input(val: str) -> str:
|
| 77 |
+
"""Convert placeholder strings to empty string."""
|
| 78 |
+
if not val:
|
| 79 |
+
return ""
|
| 80 |
+
if val.strip().lower() in ("missing value", "missing", "na", "n/a", "null", "none"):
|
| 81 |
+
return ""
|
| 82 |
+
return val
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _is_field_empty(val: Any) -> bool:
    """Return True when *val* is missing, i.e. the inverse of _is_valid_value()."""
    if _is_valid_value(val):
        return False
    return True
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _normalize_gender(val: Any) -> Optional[str]:
    """Map a gender value onto a canonical upper-case token.

    Returns:
        ``None`` when *val* is missing according to _is_valid_value();
        ``"MALE"`` for m / male / men / man; ``"FEMALE"`` for
        f / female / women / woman (case-insensitive, stripped); any other
        value is returned stripped and upper-cased.
    """
    if not _is_valid_value(val):
        return None

    token = str(val).strip().lower()
    canonical = {
        "m": "MALE",
        "male": "MALE",
        "men": "MALE",
        "man": "MALE",
        "f": "FEMALE",
        "female": "FEMALE",
        "women": "FEMALE",
        "woman": "FEMALE",
    }
    # Unrecognized values are passed through in upper case.
    return canonical.get(token, token.upper())
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def _safe_round(val: Any) -> float:
|
| 102 |
+
try:
|
| 103 |
+
v = round(float(val), 2)
|
| 104 |
+
return max(v, 0.0)
|
| 105 |
+
except (ValueError, TypeError):
|
| 106 |
+
return 0.0
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
# =========================================================
|
| 110 |
+
# FLATTEN: EntityRecord → flat dict
|
| 111 |
+
# =========================================================
|
| 112 |
+
def flatten_entity_record(record) -> Dict[str, Any]:
    """
    Convert a backend.models.EntityRecord (structured, nested) into the flat
    dictionary format that match_structured_records() expects.

    Mapping:
        scalar attributes → same-named keys, passed through _clean_input()
                            (``record.aadhar`` is stored under ``"AADHAR"``)
        addresses[i]      → addressline_i, city_i, state_i, zipcode_i
                            (each passed through _clean_input())
        phones[i]         → phone_i (stored as-is)
        emails[i]         → email_i (stored as-is)
        custom_fields     → top-level keys, whitespace-stripped, case
                            unchanged; blank keys are dropped

    The indexed keys start at 0 and are contiguous, which is what the
    get_dynamic_fields() helper inside match_structured_records() relies on
    when it walks ``prefix0, prefix1, …`` until the first missing key.

    Args:
        record: Object exposing the attributes read below. A ``None`` value
            for ``addresses``, ``phones``, ``emails`` or ``custom_fields`` is
            treated as empty.

    Returns:
        A new flat dict; *record* is not modified.
    """
    # (flat key, record attribute) pairs, in output order.
    scalar_fields = (
        ("name", "name"),
        ("firstname", "firstname"),
        ("middlename", "middlename"),
        ("lastname", "lastname"),
        ("mothername", "mothername"),
        ("fathername", "fathername"),
        ("spousename", "spousename"),
        ("othername", "othername"),
        ("dob", "dob"),
        ("gender", "gender"),
        ("AADHAR", "aadhar"),
        ("pan", "pan"),
        ("licenseid", "licenseid"),
        ("passportid", "passportid"),
        ("voterid", "voterid"),
        ("companyname", "companyname"),
        ("parentcompanyname", "parentcompanyname"),
    )
    flat: Dict[str, Any] = {
        key: _clean_input(getattr(record, attr)) for key, attr in scalar_fields
    }

    # Guard against None collections so enumerate()/.items() cannot crash.
    addresses = record.addresses or []
    phones = record.phones or []
    emails = record.emails or []
    custom_fields = record.custom_fields or {}

    # Addresses → addressline_N, city_N, state_N, zipcode_N.
    # NOTE(review): the original author states models.py has already
    # deduplicated and removed blank addresses/phones — not verifiable here.
    for idx, addr in enumerate(addresses):
        flat[f"addressline_{idx}"] = _clean_input(addr.addressline)
        flat[f"city_{idx}"] = _clean_input(addr.city)
        flat[f"state_{idx}"] = _clean_input(addr.state)
        flat[f"zipcode_{idx}"] = _clean_input(addr.zipcode)

    # Phones → phone_0, phone_1, …
    for idx, phone in enumerate(phones):
        flat[f"phone_{idx}"] = phone

    # Emails → email_0, email_1, …
    for idx, email in enumerate(emails):
        flat[f"email_{idx}"] = email

    # Custom fields → top-level keys.
    # NOTE(review): a custom key equal to a built-in key (e.g. "name" or
    # "phone_0") overwrites that entry — confirm this is intended.
    for key, value in custom_fields.items():
        safe_key = str(key).strip()
        if safe_key:
            flat[safe_key] = value

    logger.debug(
        "flatten_entity_record → %d addresses, %d phones, %d emails",
        len(addresses), len(phones), len(emails),
    )
    return flat
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
# =========================================================
|
| 183 |
+
# STANDARDIZE: apply preprocessing pipeline to a flat dict
|
| 184 |
+
# =========================================================
|
| 185 |
+
def standardize_record(raw: Dict[str, Any]) -> Dict[str, Any]:
    """
    Run the standardization pipeline over a flat record dict.

    Fixed fields are always present in the result (missing inputs default to
    ""): gender goes through preprocess_text(), the person-name fields through
    preprocess_name(), company names and identifiers through
    standardize_column(), and dob through standardize_dob().

    Indexed fields (addressline_N, city_N, state_N, zipcode_N, phone_N,
    email_N) are standardized for every N found in *raw*. City and state
    values that fail _is_valid_value() become ``None``.

    Any other key not already produced above is copied through unchanged
    under its stripped, upper-cased name.

    NOTE(review): per the original author's note this mirrors main() in
    app_streamlit.py — not verifiable from this file.
    """
    name_fields = (
        "name", "firstname", "middlename", "lastname",
        "spousename", "mothername", "fathername",
    )
    column_fields = (
        "companyname", "parentcompanyname", "AADHAR", "pan",
        "licenseid", "passportid", "voterid",
    )

    processed: Dict[str, Any] = {"gender": preprocess_text(raw.get("gender", ""))}
    for field in name_fields:
        processed[field] = preprocess_name(raw.get(field, ""))
    for field in column_fields:
        processed[field] = standardize_column(raw.get(field, ""), field)
    processed["dob"] = standardize_dob(raw.get("dob", ""))

    # Indexed / custom fields: the prefixes are mutually exclusive, so the
    # branch order does not matter.
    for key, value in raw.items():
        if key.startswith("addressline_"):
            # Every address line uses the generic "addressline" column name.
            processed[key] = standardize_column(value, "addressline")
        elif key.startswith(("zipcode_", "phone_", "email_")):
            # These pass their own indexed key as the column name.
            processed[key] = standardize_column(value, key)
        elif key.startswith("city_"):
            processed[key] = standardize_city(value) if _is_valid_value(value) else None
        elif key.startswith("state_"):
            processed[key] = standardize_state(value) if _is_valid_value(value) else None
        elif key not in processed:
            # Custom field: keep the value, upper-case the key.
            # NOTE(review): the guard is case-sensitive, so a custom key such
            # as "aadhar" would overwrite the standardized "AADHAR" entry.
            custom_key = str(key).strip()
            if custom_key:
                processed[custom_key.upper()] = value

    return processed
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
# =========================================================
|
| 240 |
+
# MATCH STRUCTURED RECORDS
|
| 241 |
+
# Extracted from app_streamlit.py lines 912-1250
|
| 242 |
+
# =========================================================
|
| 243 |
+
def match_structured_records(
    r1: Dict[str, Any],
    r2: Dict[str, Any],
    mode: str = "embedding",
) -> Dict[str, Any]:
    """
    Score two *standardized* flat record dicts field by field.

    Args:
        r1: First standardized record (flat dict; multi-value fields are
            stored under indexed keys such as ``phone_0``, ``phone_1``).
        r2: Second standardized record, same layout as ``r1``.
        mode: Accepted for interface compatibility; this function does not
            read it.

    Returns:
        Dict mapping an upper-case field name (``NAME``, ``CITY``, ...) to
        its score, or to the module-level ``_MISSING`` marker when the
        underlying comparison returned ``""``. Keys of either record that
        are neither known static fields nor indexed dynamic fields are
        compared with ``compare_exact`` and added under their upper-cased
        name.

    Multi-value handling:
        - Addresses: every ``addressline_<i>`` of both records is handed to
          ``match_addresses_enhanced``.
        - Phones / emails: every ``phone_<i>`` / ``email_<i>`` is handed to
          ``compare_phone_any_match`` / ``compare_email_any_match``.
        - City / state / zipcode: every (i, j) zipcode pair is examined;
          city and state are taken from the pincode CSV first, then from
          ``pincode_similarity_india``, then from the user-entered values.
    """

    # -- helpers (closures) ----------------------------------------------
    def get_dynamic_fields(record: Dict, prefix: str):
        """Collect record[prefix + "0"], record[prefix + "1"], ... up to the first absent index."""
        values = []
        index = 0
        while f"{prefix}{index}" in record:
            values.append(record[f"{prefix}{index}"])
            index += 1
        return values

    # The same pincode is looked up once per (i, j) pair; memoising avoids
    # re-running the string conversion of the whole pincode column each time.
    pincode_cache: Dict[str, Any] = {}

    def lookup_pincode_df(pincode, df):
        """Return (districtname, statename) of the first row matching *pincode*, else (None, None)."""
        if not _is_valid_value(pincode):
            return None, None
        pin_key = str(pincode)
        if pin_key not in pincode_cache:
            row = df.loc[df["pincode"].astype(str) == pin_key]
            pincode_cache[pin_key] = (
                (row.iloc[0]["districtname"], row.iloc[0]["statename"])
                if not row.empty
                else (None, None)
            )
        return pincode_cache[pin_key]

    def append_user_geo(city_user, state_user, cities, states):
        """Append the standardized user-entered city/state when they hold valid values."""
        if _is_valid_value(city_user):
            cities.append(standardize_city(city_user))
        if _is_valid_value(state_user):
            states.append(standardize_state(state_user))

    def append_pgeo(pin_result, side, cities, states):
        """Append county/state for ``side`` ("pin1"/"pin2") from *pin_result*; True if anything was added."""
        found = False
        county = pin_result.get(f"{side}_county_name")
        if county:
            cities.append(standardize_city(county))
            found = True
        state_name = pin_result.get(f"{side}_state_name")
        if state_name:
            states.append(standardize_state(state_name))
            found = True
        return found

    def extend_unique(raw_values, standardize, target):
        """Append the standardized form of each valid raw value not already in *target*."""
        for raw in raw_values:
            if _is_valid_value(raw) and standardize(raw) not in target:
                target.append(standardize(raw))

    def rule_score(rule_key, raw_score):
        """Run a raw score through the pattern rules; an empty-string score means 'missing'."""
        if raw_score != "":
            return apply_pattern_matching_logic(rule_key, raw_score)
        return _MISSING

    # -- geo / pincode enrichment ------------------------------------------
    city_match = 0
    state_match = 0
    pincode_match_found = False

    # Pre-loaded CSV DataFrame (module-level), used instead of a database.
    pincode_df = pin_city_state_df

    r1_pincodes = get_dynamic_fields(r1, "zipcode_")
    r2_pincodes = get_dynamic_fields(r2, "zipcode_")
    r1_cities_user = get_dynamic_fields(r1, "city_")
    r2_cities_user = get_dynamic_fields(r2, "city_")
    r1_states_user = get_dynamic_fields(r1, "state_")
    r2_states_user = get_dynamic_fields(r2, "state_")

    r1_cities, r2_cities = [], []
    r1_states, r2_states = [], []

    # Cross-compare ALL pincode combinations (i x j).
    for i, r1_pincode in enumerate(r1_pincodes):
        r1_city_user = r1_cities_user[i] if i < len(r1_cities_user) else None
        r1_state_user = r1_states_user[i] if i < len(r1_states_user) else None

        for j, r2_pincode in enumerate(r2_pincodes):
            r2_city_user = r2_cities_user[j] if j < len(r2_cities_user) else None
            r2_state_user = r2_states_user[j] if j < len(r2_states_user) else None

            if not (_is_valid_value(r1_pincode) and _is_valid_value(r2_pincode)):
                # At least one pincode is unusable: rely on user-entered values.
                append_user_geo(r1_city_user, r1_state_user, r1_cities, r1_states)
                append_user_geo(r2_city_user, r2_state_user, r2_cities, r2_states)
                continue

            # CSV lookup first.
            r1_city_df, r1_state_df = lookup_pincode_df(r1_pincode, pincode_df)
            r2_city_df, r2_state_df = lookup_pincode_df(r2_pincode, pincode_df)

            # Pincodes either match exactly or contribute nothing.
            if r1_pincode == r2_pincode:
                pincode_match_found = True

            got_r1_geo = bool(r1_city_df and r1_state_df)
            got_r2_geo = bool(r2_city_df and r2_state_df)

            if got_r1_geo:
                r1_cities.append(standardize_city(r1_city_df))
                r1_states.append(standardize_state(r1_state_df))
            if got_r2_geo:
                r2_cities.append(standardize_city(r2_city_df))
                r2_states.append(standardize_state(r2_state_df))

            # pgeocode-style fallback for any pin not found in the CSV.
            got_r1_pgeo = False
            got_r2_pgeo = False
            if not (got_r1_geo and got_r2_geo):
                pin_result = pincode_similarity_india(r1_pincode, r2_pincode)
                if not got_r1_geo:
                    got_r1_pgeo = append_pgeo(pin_result, "pin1", r1_cities, r1_states)
                if not got_r2_geo:
                    got_r2_pgeo = append_pgeo(pin_result, "pin2", r2_cities, r2_states)

            # Still nothing for a pin: fall back to the user-entered city/state.
            if not got_r1_geo and not got_r1_pgeo:
                append_user_geo(r1_city_user, r1_state_user, r1_cities, r1_states)
            if not got_r2_geo and not got_r2_pgeo:
                append_user_geo(r2_city_user, r2_state_user, r2_cities, r2_states)

    # The pair loop never runs when either side has no zipcode entries, so
    # pick up the user-entered cities/states directly in that case.
    if not r1_pincodes or not r2_pincodes:
        extend_unique(r1_cities_user, standardize_city, r1_cities)
        extend_unique(r2_cities_user, standardize_city, r2_cities)
        extend_unique(r1_states_user, standardize_state, r1_states)
        extend_unique(r2_states_user, standardize_state, r2_states)

    # -- City / State / Pincode scoring ------------------------------------
    if r1_cities and r2_cities:
        if any(c1 == c2 for c1 in r1_cities for c2 in r2_cities):
            city_match = 100
        elif pincode_match_found:
            # NOTE(review): an identical pincode overrides differing city
            # spellings. Confirm this branch is meant to pair with the
            # equality test above rather than with the outer emptiness check.
            city_match = 100

    if r1_states and r2_states:
        if any(s1 == s2 for s1 in r1_states for s2 in r2_states):
            state_match = 100

    zipcode_match = compare_any_match(r1_pincodes, r2_pincodes, field_type="pincode")

    # -- Phone / Email / Address inputs ------------------------------------
    r1_phones = get_dynamic_fields(r1, "phone_")
    r2_phones = get_dynamic_fields(r2, "phone_")
    phone_match = compare_phone_any_match(r1_phones, r2_phones)

    r1_emails = get_dynamic_fields(r1, "email_")
    r2_emails = get_dynamic_fields(r2, "email_")
    email_match = compare_email_any_match(r1_emails, r2_emails)

    r1_addrs = get_dynamic_fields(r1, "addressline_")
    r2_addrs = get_dynamic_fields(r2, "addressline_")

    logger.debug(
        "match_structured_records - phones R1=%s R2=%s | emails R1=%s R2=%s | "
        "addresses R1=%d R2=%d",
        r1_phones, r2_phones, r1_emails, r2_emails,
        len(r1_addrs),
        len(r2_addrs),
    )

    # -- Name / Address / single-field matching ----------------------------
    name_match = embedding_match_names(
        r1.get("name"), r1.get("firstname"), r1.get("lastname"), r1.get("middlename"),
        r2.get("name"), r2.get("firstname"), r2.get("lastname"), r2.get("middlename"),
    )

    address_match = match_addresses_enhanced(r1_addrs, r2_addrs)

    spousename_match = embedding_match_single(r1.get("spousename"), r2.get("spousename"))
    mothername_match = embedding_match_single(r1.get("mothername"), r2.get("mothername"))
    fathername_match = embedding_match_single(r1.get("fathername"), r2.get("fathername"))
    companyname_match = embedding_match_single(r1.get("companyname"), r2.get("companyname"))
    parentcompanyname_match = embedding_match_single(
        r1.get("parentcompanyname"), r2.get("parentcompanyname")
    )

    # -- Exact matching ----------------------------------------------------
    g1 = _normalize_gender(r1.get("gender"))
    g2 = _normalize_gender(r2.get("gender"))
    if not g1 and not g2:
        gender_match = ""          # neither side has a gender -> missing
    elif g1 and g2 and g1 == g2:
        gender_match = 100
    else:
        gender_match = 0           # one side missing, or the values differ

    aadhar_match = compare_exact(r1.get("AADHAR"), r2.get("AADHAR"))
    pan_match = compare_exact(r1.get("pan"), r2.get("pan"))
    licenseid_match = compare_exact(r1.get("licenseid"), r2.get("licenseid"))
    passportid_match = compare_exact(r1.get("passportid"), r2.get("passportid"))
    voterid_match = compare_exact(r1.get("voterid"), r2.get("voterid"))
    dob_match = compare_exact(r1.get("dob"), r2.get("dob"))

    # -- Assemble results --------------------------------------------------
    has_name = name_match is not None
    results = {
        "GENDER": rule_score("GENDER", gender_match),
        "NAME": name_match["full_name_percent"] if has_name else _MISSING,
        "FIRSTNAME": name_match["firstname_percent"] if has_name else _MISSING,
        "MIDDLENAME": name_match["middlename_percent"] if has_name else _MISSING,
        "LASTNAME": name_match["lastname_percent"] if has_name else _MISSING,
        "SPOUSENAME": rule_score("SPOUSENAME", spousename_match),
        "MOTHERNAME": rule_score("MOTHERNAME", mothername_match),
        "FATHERNAME": rule_score("FATHERNAME", fathername_match),
        "COMPANYNAME": rule_score("COMPANYNAME", companyname_match),
        "PARENTCOMPANYNAME": rule_score("PARENTCOMPANYNAME", parentcompanyname_match),
        "AADHAR": rule_score("AADHAR", aadhar_match),
        "PAN": rule_score("PAN", pan_match),
        "LICENSEID": rule_score("LICENSEID", licenseid_match),
        "PASSPORTID": rule_score("PASSPORTID", passportid_match),
        # NOTE(review): VOTERID is scored with the "TAXID" rule key - confirm
        # this is intentional and not a leftover from a renamed field.
        "VOTERID": rule_score("TAXID", voterid_match),
        "ADDRESSLINE": rule_score("ADDRESSLINE", address_match),
        "BIRTHDATE": rule_score("BIRTHDATE", dob_match),
        "PHONE": rule_score("PHONE", phone_match),
        "EMAIL": rule_score("EMAIL", email_match),
        "CITY": rule_score("CITY", city_match),
        "STATE": rule_score("STATE", state_match),
        "ZIPCODE": rule_score("ZIPCODE", zipcode_match),
    }

    # -- Custom field matching ---------------------------------------------
    known_keys = {
        "name", "firstname", "middlename", "lastname", "spousename",
        "mothername", "fathername", "dob", "gender", "AADHAR", "pan",
        "licenseid", "passportid", "voterid", "companyname", "parentcompanyname",
    }
    dynamic_prefixes = (
        "zipcode_", "city_", "state_", "phone_", "email_", "addressline_",
    )
    for key in set(r1) | set(r2):
        key_str = str(key)
        if key_str in known_keys or key_str.startswith(dynamic_prefixes):
            continue
        val1 = r1.get(key) or r1.get(key_str.upper())
        val2 = r2.get(key) or r2.get(key_str.upper())
        if val1 or val2:
            results[key_str.upper()] = compare_exact(val1, val2)

    return results
|
| 514 |
+
|
| 515 |
+
|
| 516 |
+
# =========================================================
|
| 517 |
+
# FORMAT RESULTS
|
| 518 |
+
# =========================================================
|
| 519 |
+
def format_results(
    field_results: Dict[str, Any],
    r1_processed: Dict[str, Any],
    r2_processed: Dict[str, Any],
    mode: str,
) -> Dict[str, Any]:
    """
    Convert raw field scores into the final response format.

    Args:
        field_results: Field name -> raw score (or the ``_MISSING`` marker).
        r1_processed: First standardized record; used only to decide whether
            a field had any input at all.
        r2_processed: Second standardized record, used the same way.
        mode: ``"embedding"`` reports unscored fields as ``"missing value"``;
            any other mode reports them as ``0.0``.

    Returns:
        Dict with one entry per known result field. A field for which
        neither record holds a non-empty input is always ``"missing value"``;
        otherwise the score is passed through ``_safe_round``.

    Dynamic prefixes (addressline_, phone_, email_, city_, state_, zipcode_)
    are scanned from both processed records, so missing-value detection works
    regardless of how many entries each record carries. NAME is considered
    present when any of name / firstname / middlename / lastname is present,
    so a score derived from the name parts is not overwritten with
    "missing value".
    """

    # Static result field -> input keys that feed it.
    field_to_inputs: Dict[str, list] = {
        "GENDER":           [("gender",)],
        "NAME":             [("name",), ("firstname",), ("middlename",), ("lastname",)],
        "FIRSTNAME":        [("firstname",)],
        "MIDDLENAME":       [("middlename",)],
        "LASTNAME":         [("lastname",)],
        "SPOUSENAME":       [("spousename",)],
        "MOTHERNAME":       [("mothername",)],
        "FATHERNAME":       [("fathername",)],
        "COMPANYNAME":      [("companyname",)],
        "PARENTCOMPANYNAME": [("parentcompanyname",)],
        "AADHAR":           [("AADHAR",)],
        "PAN":              [("pan",)],
        "LICENSEID":        [("licenseid",)],
        "PASSPORTID":       [("passportid",)],
        "VOTERID":          [("voterid",)],
        "BIRTHDATE":        [("dob",)],
    }

    # Dynamic result fields: gather the indexed keys of BOTH records.
    # dict.fromkeys de-duplicates keys present in both while keeping order.
    for prefix, result_key in [
        ("addressline_", "ADDRESSLINE"),
        ("phone_", "PHONE"),
        ("email_", "EMAIL"),
        ("city_", "CITY"),
        ("state_", "STATE"),
        ("zipcode_", "ZIPCODE"),
    ]:
        input_keys = dict.fromkeys(
            k for k in (*r1_processed, *r2_processed) if k.startswith(prefix)
        )
        field_to_inputs[result_key] = [(k,) for k in input_keys]

    def any_present(input_defs: list) -> bool:
        """Return True if either record holds a non-empty value for any listed key."""
        return any(
            not _is_field_empty(r1_processed.get(field_key))
            or not _is_field_empty(r2_processed.get(field_key))
            for (field_key,) in input_defs
        )

    def check_missing(result_key: str) -> bool:
        """
        Return True only if BOTH records have no valid data for this field.

        For multi-value fields, a single valid value in either record means
        the field is NOT missing.
        """
        input_defs = field_to_inputs.get(result_key)
        if input_defs is None:
            return field_results.get(result_key) == _MISSING

        # CITY and STATE count as present whenever a ZIPCODE was provided,
        # because city/state are inferred from the zipcode during matching.
        # This must be tested before the "no input keys" shortcut, otherwise
        # a record with zipcode_* but no city_*/state_* keys would be
        # reported as missing.
        if result_key in ("CITY", "STATE") and any_present(
            field_to_inputs.get("ZIPCODE", [])
        ):
            return False

        if not input_defs:
            return True
        return not any_present(input_defs)

    formatted_scores: Dict[str, Any] = {}

    # NOTE(review): only the keys of field_to_inputs are emitted, so scores
    # that field_results holds for custom fields never reach the output -
    # confirm whether that is intended.
    for k in field_to_inputs:
        v = field_results.get(k, _MISSING)

        if check_missing(k):
            formatted_scores[k] = "missing value"
        elif mode == "embedding":
            if v == _MISSING:
                formatted_scores[k] = "missing value"
            else:
                try:
                    formatted_scores[k] = _safe_round(v)
                except (ValueError, TypeError):
                    # An unroundable score is reported like an absent one.
                    formatted_scores[k] = "missing value"
        else:
            formatted_scores[k] = 0.0 if v == _MISSING else _safe_round(v)

    return formatted_scores
|
| 627 |
+
|
| 628 |
+
|
| 629 |
+
# =========================================================
|
| 630 |
+
# PUBLIC backend β single match
|
| 631 |
+
# =========================================================
|
| 632 |
+
def perform_match(record1, record2, mode: str = "embedding") -> Dict[str, Any]:
    """
    End-to-end matching pipeline for a single pair of records.

    Args:
        record1: backend.models.EntityRecord (Pydantic model)
        record2: backend.models.EntityRecord (Pydantic model)
        mode: "embedding"

    Returns:
        Dict with keys: overall_decision, reason, field_scores, mode,
        processing_time_ms
    """
    started = time.perf_counter()

    # 1. Flatten nested Pydantic models into flat dicts.
    r1_flat = flatten_entity_record(record1)
    r2_flat = flatten_entity_record(record2)

    logger.info(
        "Flattened records - R1 keys: %s | R2 keys: %s",
        list(r1_flat.keys()), list(r2_flat.keys()),
    )

    # 2. Standardize.
    r1_processed = standardize_record(r1_flat)
    r2_processed = standardize_record(r2_flat)

    logger.info("Standardized records - mode=%s", mode)

    # 3. Match field by field.
    field_results = match_structured_records(r1_processed, r2_processed, mode=mode)

    # 4. Evaluate the decision rules on the raw (unformatted) field scores.
    overall_decision, reason = evaluate_matching_rules(field_results)

    # 5. Format the scores for the response.
    formatted_scores = format_results(field_results, r1_processed, r2_processed, mode)

    elapsed_ms = (time.perf_counter() - started) * 1000

    return {
        "overall_decision": overall_decision,
        "reason": reason,
        "field_scores": formatted_scores,
        "mode": mode,
        "processing_time_ms": round(elapsed_ms, 2),
    }
|
| 682 |
+
|
| 683 |
+
|
backend/models.py
ADDED
|
@@ -0,0 +1,536 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field, field_validator, model_validator
|
| 2 |
+
from typing import Dict, List, Optional, Any, Union
|
| 3 |
+
from enum import Enum
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# =========================================================
|
| 7 |
+
# ENUM
|
| 8 |
+
# =========================================================
|
| 9 |
+
class MatchingMode(str, Enum):
    """Supported matching modes.

    Subclasses ``str`` so each member compares equal to its plain string value.
    """
    EMBEDDING = "embedding"
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# =========================================================
|
| 15 |
+
# CONSTANTS
|
| 16 |
+
# =========================================================
|
| 17 |
+
MISSING_PLACEHOLDERS = {"missing value", "missing", "na", "n/a", "null", "none", "-"}
|
| 18 |
+
|
| 19 |
+
# ---------------------------------------------------------------------------
|
| 20 |
+
# Flat-format key β EntityRecord field name mapping.
|
| 21 |
+
# To support a new flat key in future, just add it here.
|
| 22 |
+
# ---------------------------------------------------------------------------
|
| 23 |
+
_FLAT_KEY_MAP: Dict[str, str] = {
|
| 24 |
+
# Personal identifiers
|
| 25 |
+
"GENDER": "gender",
|
| 26 |
+
"NAME": "name",
|
| 27 |
+
"FIRSTNAME": "firstname",
|
| 28 |
+
"MIDDLENAME": "middlename",
|
| 29 |
+
"LASTNAME": "lastname",
|
| 30 |
+
"SPOUSENAME": "spousename",
|
| 31 |
+
"MOTHERNAME": "mothername",
|
| 32 |
+
"FATHERNAME": "fathername",
|
| 33 |
+
"COMPANYNAME": "companyname",
|
| 34 |
+
"PARENTCOMPANYNAME": "parentcompanyname",
|
| 35 |
+
# ID documents
|
| 36 |
+
"AADHAR": "aadhar",
|
| 37 |
+
"PAN": "pan",
|
| 38 |
+
"LICENSEID": "licenseid",
|
| 39 |
+
"PASSPORTID": "passportid",
|
| 40 |
+
"VOTERID": "voterid",
|
| 41 |
+
# DOB
|
| 42 |
+
"BIRTHDATE": "dob",
|
| 43 |
+
"DOB": "dob",
|
| 44 |
+
# Contact β collected into lists
|
| 45 |
+
"PHONE": "_phone_flat",
|
| 46 |
+
"EMAIL": "_email_flat",
|
| 47 |
+
# Address components β collected into addresses[0]
|
| 48 |
+
"ADDRESSLINE": "_addressline_flat",
|
| 49 |
+
"CITY": "_city_flat",
|
| 50 |
+
"STATE": "_state_flat",
|
| 51 |
+
"ZIPCODE": "_zipcode_flat",
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
_FLAT_ADDRESS_KEYS = {"_addressline_flat", "_city_flat", "_state_flat", "_zipcode_flat"}
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _is_placeholder(val: Any) -> bool:
|
| 58 |
+
"""Return True if value is a known missing/placeholder sentinel."""
|
| 59 |
+
if val is None:
|
| 60 |
+
return True
|
| 61 |
+
return str(val).strip().lower() in MISSING_PLACEHOLDERS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _normalize_flat_to_nested(data: Dict[str, Any]) -> Dict[str, Any]:
|
| 65 |
+
"""
|
| 66 |
+
Detect whether *data* is in flat format (uppercase keys like ADDRESSLINE,
|
| 67 |
+
BIRTHDATE β¦) and, if so, convert it to the nested EntityRecord format.
|
| 68 |
+
|
| 69 |
+
If data already looks nested (has 'addresses' / 'phones' / 'emails' keys)
|
| 70 |
+
it is returned unchanged β this is the fast-path for the nested format
|
| 71 |
+
that supports multiple addresses/phones/emails.
|
| 72 |
+
"""
|
| 73 |
+
# Fast-path: already nested
|
| 74 |
+
if "addresses" in data or "phones" in data or "emails" in data:
|
| 75 |
+
return data
|
| 76 |
+
|
| 77 |
+
# Check if this looks like flat format
|
| 78 |
+
upper_keys = {k.upper() for k in data}
|
| 79 |
+
is_flat = bool(upper_keys & set(_FLAT_KEY_MAP.keys()))
|
| 80 |
+
if not is_flat:
|
| 81 |
+
return data # Unrecognized β pass through and let Pydantic handle
|
| 82 |
+
|
| 83 |
+
# ---- Convert flat β nested -------------------------------------------
|
| 84 |
+
nested: Dict[str, Any] = {}
|
| 85 |
+
address_parts: Dict[str, str] = {}
|
| 86 |
+
phones: List[str] = []
|
| 87 |
+
emails: List[str] = []
|
| 88 |
+
|
| 89 |
+
for raw_key, raw_val in data.items():
|
| 90 |
+
target = _FLAT_KEY_MAP.get(raw_key.upper())
|
| 91 |
+
|
| 92 |
+
if target is None:
|
| 93 |
+
# Unknown flat key β pass through (may end up in custom_fields)
|
| 94 |
+
nested[raw_key] = raw_val
|
| 95 |
+
continue
|
| 96 |
+
|
| 97 |
+
if _is_placeholder(raw_val):
|
| 98 |
+
continue
|
| 99 |
+
|
| 100 |
+
if target == "_phone_flat":
|
| 101 |
+
phones.append(str(raw_val).strip())
|
| 102 |
+
elif target == "_email_flat":
|
| 103 |
+
emails.append(str(raw_val).strip())
|
| 104 |
+
elif target in _FLAT_ADDRESS_KEYS:
|
| 105 |
+
addr_key = target.replace("_flat", "").lstrip("_")
|
| 106 |
+
address_parts[addr_key] = str(raw_val).strip()
|
| 107 |
+
else:
|
| 108 |
+
nested[target] = raw_val
|
| 109 |
+
|
| 110 |
+
if address_parts:
|
| 111 |
+
nested["addresses"] = [address_parts]
|
| 112 |
+
if phones:
|
| 113 |
+
nested["phones"] = phones
|
| 114 |
+
if emails:
|
| 115 |
+
nested["emails"] = emails
|
| 116 |
+
|
| 117 |
+
return nested
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# =========================================================
|
| 121 |
+
# REQUEST MODELS
|
| 122 |
+
# =========================================================
|
| 123 |
+
class AddressRecord(BaseModel):
    """One postal address; every component is optional and defaults to an empty string."""

    addressline: str = Field(default="", description="Street address")
    city: str = Field(default="", description="City name")
    state: str = Field(default="", description="State name")
    zipcode: str = Field(default="", description="6-digit postal code (pincode)")

    @model_validator(mode="before")
    @classmethod
    def strip_address_placeholders(cls, values: Any) -> Any:
        """Blank out placeholder values before field validation.

        Non-dict input is returned untouched; for a dict, every value that
        ``_is_placeholder`` recognises is replaced by an empty string.
        """
        if not isinstance(values, dict):
            return values

        cleaned = {}
        for field_name, field_value in values.items():
            cleaned[field_name] = "" if _is_placeholder(field_value) else field_value
        return cleaned

    def is_empty(self) -> bool:
        """Tell whether all four components are blank (used to filter ghost entries)."""
        return not (self.addressline or self.city or self.state or self.zipcode)
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class EntityRecord(BaseModel):
    """
    A single entity record with all possible fields.
    All fields are optional — only provided fields are matched.

    Multi-value fields
    ------------------
    addresses : List[AddressRecord]
        Send as many addresses as needed.
        Duplicates and all-blank entries are removed automatically.
        Matching uses best-of-N across all address combinations
        (handled by get_dynamic_fields + embedding_match_addresses
        in matching_service.py — no service changes needed).

    phones : List[str]
        Send as many phone numbers as needed.
        Duplicates and placeholder strings are removed automatically.
        Matching uses compare_phone_any_match (any-match across all phones).
        A single value or null is accepted and treated as a one-item / empty list.

    emails : List[str]
        Same as phones, uses compare_email_any_match.

    Input formats
    -------------
    Accepts BOTH nested format and flat uppercase-key format.
    Flat keys are transparently converted to nested via handle_flat_format.
    """

    # ---- Name fields -------------------------------------------------------
    name: str = Field(default="", description="Full name")
    firstname: str = Field(default="", description="First name")
    middlename: str = Field(default="", description="Middle name")
    lastname: str = Field(default="", description="Last name")

    # ---- Related person names ----------------------------------------------
    mothername: str = Field(default="", description="Mother's name")
    fathername: str = Field(default="", description="Father's name")
    spousename: str = Field(default="", description="Spouse's name")
    othername: str = Field(default="", description="Other/alias name")

    # ---- Personal info -----------------------------------------------------
    dob: str = Field(default="", description="Date of birth (various formats accepted)")
    gender: str = Field(default="", description="Gender (M/F/Male/Female/Other)")

    # ---- Identity documents ------------------------------------------------
    aadhar: str = Field(default="", alias="AADHAR", description="Aadhar number (12 digits)")
    pan: str = Field(default="", description="PAN number (AAAAA9999A)")
    licenseid: str = Field(default="", description="Driving license number")
    passportid: str = Field(default="", description="Passport number")
    voterid: str = Field(default="", description="Voter ID")

    # ---- Addresses — N entries supported -----------------------------------
    addresses: List[AddressRecord] = Field(
        default_factory=list,
        description=(
            "List of addresses. Send any number — duplicates and blank entries "
            "are removed. Matching uses best-of-N across all combinations."
        )
    )

    # ---- Contact — N entries supported -------------------------------------
    phones: List[str] = Field(
        default_factory=list,
        description=(
            "List of phone numbers. Send any number — duplicates and placeholders "
            "are removed. Matching uses any-match (match if any pair matches)."
        )
    )
    emails: List[str] = Field(
        default_factory=list,
        description=(
            "List of email addresses. Send any number — duplicates and placeholders "
            "are removed. Matching uses any-match."
        )
    )

    # ---- Employment --------------------------------------------------------
    companyname: str = Field(default="", description="Company/employer name")
    parentcompanyname: str = Field(default="", description="Parent company name")

    # ---- Custom fields -----------------------------------------------------
    custom_fields: Dict[str, str] = Field(
        default_factory=dict,
        description="Arbitrary key-value pairs for exact matching (e.g. MemberID, AccountNumber)"
    )

    # -- model_validator: runs BEFORE individual field validators -------------
    @model_validator(mode="before")
    @classmethod
    def handle_flat_format(cls, values: Any) -> Any:
        """
        Pass dict input through _normalize_flat_to_nested so flat-format records
        (uppercase keys like ADDRESSLINE, BIRTHDATE, PHONE …) reach the field
        validators in nested form. Non-dict input is returned unchanged.
        """
        if isinstance(values, dict):
            return _normalize_flat_to_nested(values)
        return values

    # -- Scalar field placeholder cleanup --------------------------------------
    @field_validator(
        "name", "firstname", "middlename", "lastname",
        "mothername", "fathername", "spousename", "othername",
        "dob", "gender", "aadhar", "pan", "licenseid",
        "passportid", "voterid", "companyname", "parentcompanyname",
        mode="before"
    )
    @classmethod
    def strip_missing_placeholders(cls, v):
        """Convert placeholder strings to an empty string; leave other values as-is."""
        if isinstance(v, str) and v.strip().lower() in MISSING_PLACEHOLDERS:
            return ""
        return v

    # -- phones: deduplicate + strip placeholders ------------------------------
    @field_validator("phones", mode="before")
    @classmethod
    def clean_phones(cls, v):
        """Drop blank/placeholder phones and exact duplicates, keeping first-seen order.

        ``None`` becomes an empty list and a lone string/number is wrapped in a
        list, so a caller sending a single value is not rejected. Any other
        non-list input is returned unchanged for pydantic to reject.
        """
        if v is None:
            return []
        if isinstance(v, (str, int)) and not isinstance(v, bool):
            v = [v]
        if not isinstance(v, list):
            return v
        seen, result = set(), []
        for item in v:
            s = str(item).strip()
            # Placeholder comparison is case-insensitive; the phone itself is kept verbatim.
            if s and s.lower() not in MISSING_PLACEHOLDERS and s not in seen:
                seen.add(s)
                result.append(s)
        return result

    # -- emails: deduplicate + strip placeholders ------------------------------
    @field_validator("emails", mode="before")
    @classmethod
    def clean_emails(cls, v):
        """Lower-case emails, then drop blanks/placeholders and duplicates in order.

        ``None`` becomes an empty list and a lone string is wrapped in a list.
        Any other non-list input is returned unchanged for pydantic to reject.
        """
        if v is None:
            return []
        if isinstance(v, str):
            v = [v]
        if not isinstance(v, list):
            return v
        seen, result = set(), []
        for item in v:
            s = str(item).strip().lower()
            if s and s not in MISSING_PLACEHOLDERS and s not in seen:
                seen.add(s)
                result.append(s)
        return result

    # -- addresses: remove empty entries + deduplicate -------------------------
    @field_validator("addresses", mode="after")
    @classmethod
    def clean_addresses(cls, v: List[AddressRecord]) -> List[AddressRecord]:
        """
        Remove all-blank address entries and deduplicate by
        (addressline, city, state, zipcode) tuple.
        This prevents ghost entries from inflating match scores.
        """
        seen, result = set(), []
        for addr in v:
            if addr.is_empty():
                continue
            # Case- and whitespace-insensitive key; the first occurrence wins.
            key = (
                addr.addressline.strip().lower(),
                addr.city.strip().lower(),
                addr.state.strip().lower(),
                addr.zipcode.strip(),
            )
            if key not in seen:
                seen.add(key)
                result.append(addr)
        return result

    model_config = {
        "populate_by_name": True,
        "alias_generator": str.upper,
        "json_schema_extra": {
            "examples": [
                # -- Nested format: multiple addresses + phones --
                {
                    "name": "RAJESH KUMAR SHARMA",
                    "firstname": "RAJESH",
                    "dob": "15-01-1990",
                    "aadhar": "234567890123",
                    "addresses": [
                        {
                            "addressline": "123 MG Road, Koramangala",
                            "city": "Bangalore",
                            "state": "Karnataka",
                            "zipcode": "560034"
                        },
                        {
                            "addressline": "45 Brigade Road",
                            "city": "Bangalore",
                            "state": "Karnataka",
                            "zipcode": "560025"
                        }
                    ],
                    "phones": ["9876543210", "9123456789"],
                    "emails": ["rajesh@example.com"]
                },
                # -- Flat format (single address/phone/email) --
                {
                    "NAME": "RAJESH KUMAR SHARMA",
                    "BIRTHDATE": "15-01-1990",
                    "AADHAR": "234567890123",
                    "ADDRESSLINE": "123 MG Road, Koramangala",
                    "CITY": "Bangalore",
                    "STATE": "Karnataka",
                    "ZIPCODE": "560034",
                    "PHONE": "9876543210",
                    "EMAIL": "rajesh@example.com"
                }
            ]
        }
    }
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def _flat_example_record(name, addressline, birthdate, city, state):
    """Build one flat-format example record; unused fields carry the placeholder text."""
    missing = "missing value"
    return {
        "GENDER": missing,
        "NAME": name,
        "FIRSTNAME": missing,
        "MIDDLENAME": missing,
        "LASTNAME": missing,
        "SPOUSENAME": missing,
        "MOTHERNAME": missing,
        "FATHERNAME": missing,
        "COMPANYNAME": missing,
        "PARENTCOMPANYNAME": missing,
        "AADHAR": missing,
        "PAN": missing,
        "LICENSEID": missing,
        "PASSPORTID": missing,
        "VOTERID": missing,
        "ADDRESSLINE": addressline,
        "BIRTHDATE": birthdate,
        "PHONE": "9876543210",
        "EMAIL": missing,
        "CITY": city,
        "STATE": state,
        "ZIPCODE": "560034",
    }


class MatchRequest(BaseModel):
    """Request body for matching two entity records."""

    # The two records to compare; both are required.
    record1: EntityRecord = Field(..., description="First entity record")
    record2: EntityRecord = Field(..., description="Second entity record")
    # Defaults to embedding mode when the caller omits it.
    mode: MatchingMode = Field(
        default=MatchingMode.EMBEDDING,
        description="Matching mode: 'embedding'",
    )

    model_config = {
        "json_schema_extra": {
            "examples": [
                # -- Example 1: multiple addresses + phones (nested) --
                {
                    "mode": "embedding",
                    "record1": {
                        "NAME": "RAJESH KUMAR SHARMA",
                        "dob": "15-01-1990",
                        "phones": ["9876543210", "9123456789"],
                        "emails": ["rajesh@example.com"],
                        "addresses": [
                            {
                                "addressline": "123 MG Road",
                                "city": "Bangalore",
                                "state": "Karnataka",
                                "zipcode": "560034",
                            },
                            {
                                "addressline": "45 Brigade Road",
                                "city": "Bangalore",
                                "state": "Karnataka",
                                "zipcode": "560025",
                            },
                        ],
                    },
                    "record2": {
                        "NAME": "RAJESH K SHARMA",
                        "dob": "15/01/1990",
                        "phones": ["9876543210"],
                        "emails": ["rajesh@example.com"],
                        "addresses": [
                            {
                                "addressline": "123 Mahatma Gandhi Rd",
                                "city": "Bengaluru",
                                "state": "KA",
                                "zipcode": "560034",
                            },
                            {
                                "addressline": "45 Brigade Road",
                                "city": "Bangalore",
                                "state": "Karnataka",
                                "zipcode": "560025",
                            },
                        ],
                    },
                },
                # -- Example 2: flat format --
                {
                    "mode": "embedding",
                    "record1": _flat_example_record(
                        "RAJESH KUMAR SHARMA",
                        "123 MG Road",
                        "15-01-1990",
                        "Bangalore",
                        "Karnataka",
                    ),
                    "record2": _flat_example_record(
                        "RAJESH K SHARMA",
                        "123 Mahatma Gandhi Rd",
                        "15/01/1990",
                        "Bengaluru",
                        "KA",
                    ),
                },
            ]
        }
    }
|
| 466 |
+
|
| 467 |
+
|
| 468 |
+
class BatchMatchRequest(BaseModel):
    """Request body for batch matching (load testing)."""

    # Required; the list must hold between 1 and 100 pairs.
    pairs: List[MatchRequest] = Field(
        ...,
        min_length=1,
        max_length=100,
        description="List of record pairs to match",
    )
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
# =========================================================
|
| 479 |
+
# RESPONSE MODELS
|
| 480 |
+
# =========================================================
|
| 481 |
+
class FieldScore(BaseModel):
    """Individual field matching result."""

    # Name of the compared field.
    field: str
    # Required; typed to allow either a number or a string.
    score: Union[float, str] = Field(description="Numeric score (0-100) in embedding mode")
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
class MatchResult(BaseModel):
    """Result of matching two entity records."""

    # All four fields are required (no defaults).
    overall_decision: str = Field(
        description="'Match' or 'No Match'",
    )
    reason: str = Field(
        description="Human-readable explanation of the matching decision",
    )
    # Keyed by field name; each value may be a number or a string.
    field_scores: Dict[str, Union[float, str]] = Field(
        description="Per-field matching scores. Embedding: numeric 0-100.",
    )
    mode: str = Field(
        description="Matching mode used: 'embedding'",
    )
|
| 497 |
+
|
| 498 |
+
|
| 499 |
+
class MatchResponse(BaseModel):
    """API response for a single match request."""

    success: bool = True
    # Both optional and None by default.
    result: Optional[MatchResult] = None
    error: Optional[str] = None
    # Required: callers must always supply the elapsed time.
    processing_time_ms: float = Field(
        description="Time taken to process this match in milliseconds",
    )

    model_config = {"populate_by_name": True}
|
| 507 |
+
|
| 508 |
+
|
| 509 |
+
class BatchMatchResponse(BaseModel):
    """API response for batch matching."""

    success: bool = True
    # Counters for the batch as a whole; all required.
    total: int = Field(
        description="Total number of pairs submitted",
    )
    completed: int = Field(
        description="Number of pairs successfully matched",
    )
    failed: int = Field(
        description="Number of pairs that failed",
    )
    # One MatchResponse per submitted pair.
    results: List[MatchResponse] = Field(
        description="Individual match results",
    )
    total_processing_time_ms: float = Field(
        description="Total processing time in milliseconds",
    )

    model_config = {"populate_by_name": True}
|
| 519 |
+
|
| 520 |
+
|
| 521 |
+
class HealthResponse(BaseModel):
    """Health check response."""

    # The health endpoint reports "healthy" when every component is healthy and
    # "degraded" otherwise; it never emits "unhealthy" at this level.
    status: str = Field(description="'healthy' or 'degraded'")
    version: str = Field(default="8.0", description="API version")
    # Per-component values are "healthy", "unhealthy", or "error: <message>".
    components: Dict[str, str] = Field(
        description="Health status of individual components (csv_data, embedding_models)"
    )

    model_config = {"populate_by_name": True}
|
| 530 |
+
|
| 531 |
+
|
| 532 |
+
class ErrorResponse(BaseModel):
    """Standard error response."""

    # Always False by default: this model only describes failures.
    success: bool = False
    # Required short error message.
    error: str
    # Optional extra information; None when absent.
    detail: Optional[str] = None
|
backend/server.py
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
import time
|
| 4 |
+
import logging
|
| 5 |
+
import traceback
|
| 6 |
+
from typing import List
|
| 7 |
+
from contextlib import asynccontextmanager
|
| 8 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 9 |
+
import asyncio
|
| 10 |
+
|
| 11 |
+
# Ensure project root is importable
|
| 12 |
+
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 13 |
+
if _PROJECT_ROOT not in sys.path:
|
| 14 |
+
sys.path.insert(0, _PROJECT_ROOT)
|
| 15 |
+
|
| 16 |
+
from fastapi import FastAPI, HTTPException, Request
|
| 17 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 18 |
+
from fastapi.responses import JSONResponse
|
| 19 |
+
|
| 20 |
+
import requests as http_requests
|
| 21 |
+
|
| 22 |
+
# -- Project imports ----------------------------------------------------------
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
from backend.models import (
|
| 26 |
+
MatchRequest,
|
| 27 |
+
MatchResponse,
|
| 28 |
+
MatchResult,
|
| 29 |
+
BatchMatchRequest,
|
| 30 |
+
BatchMatchResponse,
|
| 31 |
+
HealthResponse,
|
| 32 |
+
ErrorResponse,
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
from backend.matching_service import perform_match
|
| 36 |
+
|
| 37 |
+
# =========================================================
|
| 38 |
+
# LOGGING
|
| 39 |
+
# =========================================================
|
| 40 |
+
# Send all log records to stdout. force=True removes any handlers that were
# attached to the root logger before this module was imported.
_stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    handlers=[_stdout_handler],
    force=True,
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s | %(levelname)-7s | %(name)s | %(message)s",
)
logger = logging.getLogger("backend_server")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# =========================================================
|
| 51 |
+
# LIFESPAN β startup / shutdown hooks
|
| 52 |
+
# =========================================================
|
| 53 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan hook.

    Startup:
      - Log a banner and a note that embedding models are loaded (this function
        does not load them itself).
      - Check that the pincode CSV DataFrame from services.config is non-empty.
        A failure here is logged as a warning and does not stop startup.
    Shutdown:
      - Log a shutdown message; nothing is closed here.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("Entity Matching backend — Starting up")
    logger.info(banner)

    logger.info("Embedding models loaded (sentence-transformers).")

    try:
        # Import only what is inspected, mirroring the health endpoint, so an
        # unrelated missing name cannot make this check fail.
        from services.config import pin_city_state_df
        csv_loaded = not pin_city_state_df.empty
        logger.info("CSV data source: %s (%d pincode rows)",
                    "OK" if csv_loaded else "EMPTY",
                    len(pin_city_state_df))
    except Exception as e:
        # Best-effort diagnostic only: the service still starts.
        logger.warning("CSV data source check failed: %s", e)

    logger.info("backend ready to serve requests")
    logger.info(banner)

    yield  # -- app is running --

    logger.info("Entity Matching backend — Shutting down")
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# =========================================================
|
| 87 |
+
# APP INSTANCE
|
| 88 |
+
# =========================================================
|
| 89 |
+
app = FastAPI(
    title="Entity Matching backend",
    description=(
        "Gen AI Record-Level Entity Matching backend.\n\n"
        "Compares two entity records and determines if they represent the same person/entity.\n\n"
        "**Multi-value fields:** `addresses`, `phones`, and `emails` each accept a list "
        "of any length. Matching uses best-of-N for addresses and any-match for phones/emails.\n\n"
        "**Supported matching modes:**\n"
        # Blank line after the list so the next heading is not folded into the list item.
        "- `embedding` (default): Sentence Transformers + Fuzzy matching\n\n"
        "**Input formats:**\n"
        "- Nested (recommended for multiple values): pass `addresses`, `phones`, `emails` as lists\n"
        "- Flat (single address/phone/email): pass uppercase keys like `ADDRESSLINE`, `PHONE`, `EMAIL`"
    ),
    version="8.0.0",
    lifespan=lifespan,
    docs_url="/docs",
    redoc_url="/redoc",
)

# -- CORS middleware ----------------------------------------------------------
# Allowed origins come from CORS_ALLOW_ORIGINS (comma-separated) and default to
# "*" as before. Credentials are only enabled for an explicit origin list: a
# wildcard origin combined with credentials would let any site make
# credentialed requests.
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
# =========================================================
|
| 120 |
+
# REQUEST LOGGING MIDDLEWARE
|
| 121 |
+
# =========================================================
|
| 122 |
+
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Log every request with its method, path, status code and duration.

    Requests whose handler raises are logged too (at ERROR level, without a
    status code) before the exception is re-raised unchanged.
    """
    start = time.perf_counter()
    try:
        response = await call_next(request)
    except Exception:
        elapsed = (time.perf_counter() - start) * 1000
        # No traceback here; this line only records that the request happened.
        logger.error(
            "%s %s → unhandled exception (%.1f ms)",
            request.method,
            request.url.path,
            elapsed,
        )
        raise
    elapsed = (time.perf_counter() - start) * 1000
    logger.info(
        "%s %s → %d (%.1f ms)",
        request.method,
        request.url.path,
        response.status_code,
        elapsed,
    )
    return response
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
# =========================================================
|
| 139 |
+
# GLOBAL EXCEPTION HANDLER
|
| 140 |
+
# =========================================================
|
| 141 |
+
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Turn any unhandled exception into a generic HTTP 500 JSON response.

    The full traceback goes to the server log only. The client receives the
    exception class name, not its message, so internal details such as paths
    or data values are not exposed.
    """
    # exc_info=exc logs the traceback of the exception passed in, regardless
    # of whether an exception is currently being handled.
    logger.error(
        "Unhandled exception on %s %s: %s",
        request.method,
        request.url.path,
        exc,
        exc_info=exc,
    )
    return JSONResponse(
        status_code=500,
        content={
            "success": False,
            "error": "Internal server error",
            "detail": type(exc).__name__,
        },
    )
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
# =========================================================
|
| 155 |
+
# ENDPOINTS
|
| 156 |
+
# =========================================================
|
| 157 |
+
|
| 158 |
+
# ββ Health Checks βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 159 |
+
@app.get(
    "/backend/v1/health",
    response_model=HealthResponse,
    tags=["Health"],
    summary="Full system health check",
)
async def health_check():
    """Check the health of all system components."""
    component_status = {}

    # CSV reference data: healthy when the pincode DataFrame has rows.
    try:
        from services.config import pin_city_state_df
        csv_ok = not pin_city_state_df.empty
        component_status["csv_data"] = "healthy" if csv_ok else "unhealthy"
    except Exception as err:
        component_status["csv_data"] = f"error: {err}"

    # Embedding models: healthy when MODEL_STORE is truthy.
    try:
        from services.model import MODEL_STORE
        models_ok = bool(MODEL_STORE)
        component_status["embedding_models"] = "healthy" if models_ok else "unhealthy"
    except Exception as err:
        component_status["embedding_models"] = f"error: {err}"

    # Components marked "not_configured" are ignored; anything else that is
    # not "healthy" degrades the overall status.
    considered = [
        state for state in component_status.values() if state != "not_configured"
    ]
    if all(state == "healthy" for state in considered):
        overall = "healthy"
    else:
        overall = "degraded"

    return HealthResponse(status=overall, version="8.0", components=component_status)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
# ββ Single Match ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 196 |
+
@app.post(
    "/backend/v1/match",
    response_model=MatchResponse,
    tags=["Matching"],
    summary="Match two entity records",
    responses={
        200: {"description": "Successful matching result"},
        400: {"model": ErrorResponse, "description": "Invalid input"},
        500: {"model": ErrorResponse, "description": "Internal error"},
    },
)
async def match_records(request: MatchRequest):
    """
    Compare two entity records and determine if they represent the same entity.

    **Multi-value fields:**
    Pass `addresses`, `phones`, and `emails` as lists of any length:
    ```json
    {
      "mode": "embedding",
      "record1": {
        "NAME": "RAJESH KUMAR SHARMA",
        "dob": "15-01-1990",
        "phones": ["9876543210", "9123456789"],
        "addresses": [
          {"addressline": "123 MG Road", "city": "Bangalore", "state": "Karnataka", "zipcode": "560034"},
          {"addressline": "45 Brigade Road", "city": "Bangalore", "state": "Karnataka", "zipcode": "560025"}
        ]
      },
      "record2": {
        "NAME": "RAJESH K SHARMA",
        "dob": "15/01/1990",
        "phones": ["9876543210"],
        "addresses": [
          {"addressline": "123 Mahatma Gandhi Rd", "city": "Bengaluru", "state": "KA", "zipcode": "560034"}
        ]
      }
    }
    ```

    **Matching strategy for lists:**
    - `addresses`: best-of-N (highest score across all pair combinations)
    - `phones`: any-match (match if any phone pair matches)
    - `emails`: any-match (match if any email pair matches)

    **Modes:**
    - `embedding` (default): Sentence Transformers + RapidFuzz

    """
    # NOTE: the docstring above is published as the endpoint description, so
    # implementation notes live in comments instead.
    import json  # only needed for the optional debug dumps below

    mode = request.mode.value
    debug_enabled = logger.isEnabledFor(logging.DEBUG)

    t0 = time.perf_counter()
    try:
        # The records contain personal identifiers (Aadhaar, PAN, phone numbers,
        # addresses), so full payloads are emitted only at DEBUG level instead
        # of being printed to stdout on every request.
        if debug_enabled:
            logger.debug(
                "New match request (mode=%s)\nRECORD 1 INPUT:\n%s\nRECORD 2 INPUT:\n%s",
                mode,
                json.dumps(request.record1.model_dump(by_alias=True), indent=2),
                json.dumps(request.record2.model_dump(by_alias=True), indent=2),
            )

        # perform_match is synchronous; run it in a worker thread so it does
        # not block the asyncio event loop.
        result = await asyncio.to_thread(
            perform_match, request.record1, request.record2, mode=mode
        )
        elapsed_ms = (time.perf_counter() - t0) * 1000
        logger.info(
            "Match complete — decision=%s mode=%s time=%.1fms",
            result["overall_decision"], mode, elapsed_ms,
        )

        if debug_enabled:
            logger.debug(
                "Match result:\n%s",
                json.dumps({
                    "overall_decision": result["overall_decision"],
                    "reason": result["reason"],
                    "field_scores": result["field_scores"],
                }, indent=2),
            )

        return MatchResponse(
            success=True,
            result=MatchResult(
                overall_decision=result["overall_decision"],
                reason=result["reason"],
                field_scores=result["field_scores"],
                mode=mode,
            ),
            processing_time_ms=round(elapsed_ms, 2),
        )
    except Exception as e:
        # Failures are reported in the response body (success=False) rather
        # than as an HTTP error, as before.
        elapsed_ms = (time.perf_counter() - t0) * 1000
        logger.exception("Match failed: %s", e)
        return MatchResponse(
            success=False,
            error=str(e),
            processing_time_ms=round(elapsed_ms, 2),
        )
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
# =========================================================
|
| 300 |
+
# ROOT / INFO
|
| 301 |
+
# =========================================================
|
| 302 |
+
@app.get("/", tags=["Info"], include_in_schema=False)
async def root():
    # Small service descriptor pointing at the docs and health endpoints;
    # the route itself is excluded from the generated schema.
    service_info = dict(
        service="Entity Matching backend",
        version="8.0.0",
        docs="/docs",
        health="/backend/v1/health",
    )
    return service_info
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
# =========================================================
|
| 313 |
+
# MAIN (for direct execution: python backend/server.py)
|
| 314 |
+
# =========================================================
|
| 315 |
+
if __name__ == "__main__":
    import uvicorn

    # Direct execution (python backend/server.py): serve on all interfaces,
    # port 8000, with auto-reload enabled.
    _server_options = {
        "host": "0.0.0.0",
        "port": 8000,
        "reload": True,
        "log_level": "info",
    }
    uvicorn.run("backend.server:app", **_server_options)
|
data/city_prev_pres.csv
ADDED
|
@@ -0,0 +1,395 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"variation","standard"
|
| 2 |
+
BENGALURU,BANGALORE
|
| 3 |
+
JULLUNDER,JALANDHAR
|
| 4 |
+
CHERPULASSERY,CHERPULACHERRY
|
| 5 |
+
CHIKKAMAGALUR,CHIKMAGALUR
|
| 6 |
+
CHUCHURA,CHINSURAH MAGRA
|
| 7 |
+
HUBBALLI,HUBLI
|
| 8 |
+
INDUR,INDORE
|
| 9 |
+
KANCHIPURAM,KANCHEEPURAM
|
| 10 |
+
MANDAVGARH,MANDU
|
| 11 |
+
MANGALURU,MANGALORE
|
| 12 |
+
MANSANAGAR,VIJAYAWADA
|
| 13 |
+
NORTH PARAVUR,PARAVUR
|
| 14 |
+
SAS NAGAR,SASNAGAR
|
| 15 |
+
SHAHEED BHAGAT SINGH NAGAR,NAWANSHAHR
|
| 16 |
+
PANAJI,NORTH GOA
|
| 17 |
+
ANDAMAN NICOBAR ISLANDS,ANDAMANNICOBAR ISLANDS
|
| 18 |
+
ANDAMAN & NICOBAR ISLANDS,ANDAMANNICOBAR ISLANDS
|
| 19 |
+
ARIANKUPPAM COMMUNE PANCHAYAT,ARIANKUPPAMCOMMUNE PANCHAYAT
|
| 20 |
+
BAHOUR COMMUNE PANCHAYAT,BAHOURCOMMUNE PANCHAYAT
|
| 21 |
+
BAKSHI KA TALAB**,BAKSHIKA TALAB
|
| 22 |
+
BAKSHI KA TALAB,BAKSHIKA TALAB
|
| 23 |
+
CHANDRA SEKHARA PURAM,CHANDRASEKHARA PURAM
|
| 24 |
+
CHAUTH KA BARWARA,CHAUTHKA BARWARA
|
| 25 |
+
CHERUKUPALLE H/O ARUMBAKA,CHERUKUPALLEHO ARUMBAKA
|
| 26 |
+
CHERUKUPALLE HO ARUMBAKA,CHERUKUPALLEHO ARUMBAKA
|
| 27 |
+
CHILAKALURIPET H/O.PURUSHOTHA PATNAM,CHILAKALURIPETHOPURUSHOTHA PATNAM
|
| 28 |
+
CHILAKALURIPET HOPURUSHOTHA PATNAM,CHILAKALURIPETHOPURUSHOTHA PATNAM
|
| 29 |
+
CHINSURAH MAGRA,CHINSURAH MAGRA
|
| 30 |
+
CHINSURAH - MAGRA,CHINSURAH MAGRA
|
| 31 |
+
CHURACHANDPUR NORTH SUB-DIV.,CHURACHANDPURNORTH SUBDIV
|
| 32 |
+
CHURACHANDPUR NORTH SUBDIV,CHURACHANDPURNORTH SUBDIV
|
| 33 |
+
DADRA NAGAR HAVELI,DADRANAGAR HAVELI
|
| 34 |
+
DADRA & NAGAR HAVELI,DADRANAGAR HAVELI
|
| 35 |
+
DAMAN DIU,DAMAN DIU
|
| 36 |
+
DAMAN & DIU,DAMAN DIU
|
| 37 |
+
DERA BABA NANAK,DERABABA NANAK
|
| 38 |
+
EAST GARO HILLS,EASTGARO HILLS
|
| 39 |
+
EAST KHASI HILLS,EASTKHASI HILLS
|
| 40 |
+
EGMORE NUNGAMBAKKAM,EGMORE NUNGAMBAKKAM
|
| 41 |
+
EGMORE - NUNGAMBAKKAM,EGMORE NUNGAMBAKKAM
|
| 42 |
+
FORT TONDIARPET,FORT TONDIARPET
|
| 43 |
+
FORT - TONDIARPET,FORT TONDIARPET
|
| 44 |
+
GAUTAM BUDDHA NAGAR **,GAUTAMBUDDHA NAGAR
|
| 45 |
+
GAUTAM BUDDHA NAGAR ,GAUTAMBUDDHA NAGAR
|
| 46 |
+
GAUTAM BUDDHA NAGAR,GAUTAMBUDDHA NAGAR
|
| 47 |
+
GOLA GOKARAN NATH,GOLAGOKARAN NATH
|
| 48 |
+
GOOL GULAB GARH,GOOLGULAB GARH
|
| 49 |
+
GUDEM KOTHA VEEDHI,GUDEMKOTHA VEEDHI
|
| 50 |
+
JAJI REDDI GUDEM,JAJIREDDI GUDEM
|
| 51 |
+
JAMMU KASHMIR,JAMMU KASHMIR
|
| 52 |
+
JAMMU & KASHMIR,JAMMU KASHMIR
|
| 53 |
+
JYOTIBA PHULE NAGAR,JYOTIBAPHULE NAGAR
|
| 54 |
+
KAMJONG CHASSAD SUB-DIV.,KAMJONGCHASSAD SUBDIV
|
| 55 |
+
KAMJONG CHASSAD SUBDIV,KAMJONGCHASSAD SUBDIV
|
| 56 |
+
KEIRAO BITRA SUB-DIVISION,KEIRAOBITRA SUBDIVISION
|
| 57 |
+
KEIRAO BITRA SUBDIVISION,KEIRAOBITRA SUBDIVISION
|
| 58 |
+
KOTTUCHERRY COMMUNE PANCHAYAT,KOTTUCHERRYCOMMUNE PANCHAYAT
|
| 59 |
+
KUSHESHWAR ASTHAN PURBI,KUSHESHWARASTHAN PURBI
|
| 60 |
+
LAHUL SPITI,LAHUL SPITI
|
| 61 |
+
LAHUL & SPITI,LAHUL SPITI
|
| 62 |
+
MAMBALAM GUINDY,MAMBALAM GUINDY
|
| 63 |
+
MAMBALAM - GUINDY,MAMBALAM GUINDY
|
| 64 |
+
MANNADIPET COMMUNE PANCHAYAT,MANNADIPETCOMMUNE PANCHAYAT
|
| 65 |
+
MYLAPORE TRIPLICANE,MYLAPORE TRIPLICANE
|
| 66 |
+
MYLAPORE - TRIPLICANE,MYLAPORE TRIPLICANE
|
| 67 |
+
NEDUNGADU COMMUNE PANCHAYAT,NEDUNGADUCOMMUNE PANCHAYAT
|
| 68 |
+
NEEM CHAK BATHANI,NEEMCHAK BATHANI
|
| 69 |
+
NERAVY COMMUNE PANCHAYAT,NERAVYCOMMUNE PANCHAYAT
|
| 70 |
+
NETTAPAKKAM COMMUNE PANCHAYAT,NETTAPAKKAMCOMMUNE PANCHAYAT
|
| 71 |
+
NORTH 24 PARGANAS,NORTH24 PARGANAS
|
| 72 |
+
NORTH AND MIDDLE ANDAMAN,NORTHANDMIDDLE ANDAMAN
|
| 73 |
+
NORTH CACHAR HILLS,NORTHCACHAR HILLS
|
| 74 |
+
NORTH EAST DELHI,NORTHEAST DELHI
|
| 75 |
+
NORTH WEST DELHI,NORTHWEST DELHI
|
| 76 |
+
PEDDA ADISERLA PALLE,PEDDAADISERLA PALLE
|
| 77 |
+
PERAMBUR PURASAWAKKAM,PERAMBUR PURASAWAKKAM
|
| 78 |
+
PERAMBUR - PURASAWAKKAM,PERAMBUR PURASAWAKKAM
|
| 79 |
+
POLBA DADPUR,POLBA DADPUR
|
| 80 |
+
POLBA - DADPUR,POLBA DADPUR
|
| 81 |
+
RAIPUR KARCHULIYAN,RAIPUR KARCHULIYAN
|
| 82 |
+
RAIPUR - KARCHULIYAN,RAIPUR KARCHULIYAN
|
| 83 |
+
SADAR HILLS EAST SUB-DIVISION,SADARHILLSEAST SUBDIVISION
|
| 84 |
+
SADAR HILLS EAST SUBDIVISION,SADARHILLSEAST SUBDIVISION
|
| 85 |
+
SADAR HILLS WEST SUB-DIVISION,SADARHILLSWEST SUBDIVISION
|
| 86 |
+
SADAR HILLS WEST SUBDIVISION,SADARHILLSWEST SUBDIVISION
|
| 87 |
+
SAITU GAMPHAZOL SUB-DIVISION,SAITUGAMPHAZOL SUBDIVISION
|
| 88 |
+
SAITU GAMPHAZOL SUBDIVISION,SAITUGAMPHAZOL SUBDIVISION
|
| 89 |
+
SANT KABIR NAGAR,SANTKABIR NAGAR
|
| 90 |
+
SANT RAVIDAS NAGAR,SANTRAVIDAS NAGAR
|
| 91 |
+
SAVALYAPURAM H/O KANAMARLAPUDI,SAVALYAPURAMHO KANAMARLAPUDI
|
| 92 |
+
SAVALYAPURAM HO KANAMARLAPUDI,SAVALYAPURAMHO KANAMARLAPUDI
|
| 93 |
+
SOUTH 24 PARGANAS,SOUTH24 PARGANAS
|
| 94 |
+
SOUTH EAST DELHI,SOUTHEAST DELHI
|
| 95 |
+
SOUTH GARO HILLS,SOUTHGARO HILLS
|
| 96 |
+
SOUTH WEST DELHI,SOUTHWEST DELHI
|
| 97 |
+
TAMENGLONG NORTH SUB-DIVISION,TAMENGLONGNORTH SUBDIVISION
|
| 98 |
+
TAMENGLONG NORTH SUBDIVISION,TAMENGLONGNORTH SUBDIVISION
|
| 99 |
+
TELANGANA,ANDHRA PRADESH
|
| 100 |
+
THIRUMALAIRAYAN PATTINAM COMMUNE PANCHAYAT,THIRUMALAIRAYANPATTINAMCOMMUNE PANCHAYAT
|
| 101 |
+
THIRUNALLAR COMMUNE PANCHAYAT,THIRUNALLARCOMMUNE PANCHAYAT
|
| 102 |
+
UDHAM SINGH NAGAR,UDHAMSINGH NAGAR
|
| 103 |
+
UKHRUL CENTRAL SUB-DIVISION,UKHRULCENTRAL SUBDIVISION
|
| 104 |
+
UKHRUL CENTRAL SUBDIVISION,UKHRULCENTRAL SUBDIVISION
|
| 105 |
+
VILLIANUR COMMUNE PANCHAYAT,VILLIANURCOMMUNE PANCHAYAT
|
| 106 |
+
WEST GARO HILLS,WESTGARO HILLS
|
| 107 |
+
WEST KHASI HILLS,WESTKHASI HILLS
|
| 108 |
+
EDLABAD,ADILABAD
|
| 109 |
+
KARNAVATI,AHMEDABAD
|
| 110 |
+
AJAYMERU,AJMER
|
| 111 |
+
ALLEPPEY,ALAPPUZHA
|
| 112 |
+
ALLYGURH,ALIGARH
|
| 113 |
+
PRAYAG,ALLAHABAD
|
| 114 |
+
ALWAYE,ALUVA
|
| 115 |
+
BALLARI,BELLARY
|
| 116 |
+
BURDWAN,BARDHAMAN
|
| 117 |
+
BHIR,BEED
|
| 118 |
+
BELAGAVI,BELGAUM
|
| 119 |
+
BROACH,BHARUCH
|
| 120 |
+
BHAVENA ,BHAVNAGAR
|
| 121 |
+
GOHILWAD,BHAVNAGAR
|
| 122 |
+
BELLASGATE,BHEDAGHAT MARBLES
|
| 123 |
+
VIRAVATI,VADODARA
|
| 124 |
+
CHANDRAVATI,VADODARA
|
| 125 |
+
BARODA,VADODARA
|
| 126 |
+
VADPATRA,VADODARA
|
| 127 |
+
MADRAS,CHENNAI
|
| 128 |
+
CHERPALCHERY,CHERPULACHERRY
|
| 129 |
+
SHERTHALAI,CHERTHALA
|
| 130 |
+
CHIKMAGALUR,CHIKMAGALUR
|
| 131 |
+
CHINSURAH,CHINSURAH MAGRA
|
| 132 |
+
ELLORE,ELURU
|
| 133 |
+
GARTHAPURI,GUNTUR
|
| 134 |
+
GURUGRAM,GURGAON
|
| 135 |
+
GAUHATI,GUWAHATI
|
| 136 |
+
HOSAPETE,HOSPET
|
| 137 |
+
HUBLI,HUBLI
|
| 138 |
+
BHAGYANAGARAM,HYDERABAD
|
| 139 |
+
AHILYANAGARI,INDORE
|
| 140 |
+
JUBBULPORE,JABALPUR
|
| 141 |
+
KADAPA,CUDDAPAH
|
| 142 |
+
COCANADA,KAKINADA
|
| 143 |
+
KALABURAGI,GULBARGA
|
| 144 |
+
CONJEEVARAM,KANCHEEPURAM
|
| 145 |
+
CANNANORE,KANNUR
|
| 146 |
+
CAWNPORE,KANPUR
|
| 147 |
+
CAPE COMORIN,KANYAKUMARI
|
| 148 |
+
ELAGANDLA,KARIMNAGAR
|
| 149 |
+
KARUVUR,KARUR
|
| 150 |
+
CAMBAY,KHAMBHAT
|
| 151 |
+
COCHIN,KOCHI
|
| 152 |
+
CRANGANORE,KODUNGALLUR
|
| 153 |
+
CALCUTTA,KOLKATA
|
| 154 |
+
QUILON,KOLLAM
|
| 155 |
+
KONEY,KONNI
|
| 156 |
+
CALICUT,KOZHIKODE
|
| 157 |
+
KANDENAVOLU,KURNOOL
|
| 158 |
+
MUZAFFARNAGAR,MUZAFFARNAGAR
|
| 159 |
+
LAKSHMINAGAR,MUZAFFARNAGAR
|
| 160 |
+
MASULIPATAM,MACHILIPATNAM
|
| 161 |
+
MARCERA,MADIKERI
|
| 162 |
+
PALAMURU,MAHBUBNAGAR
|
| 163 |
+
MANDU,MANDU
|
| 164 |
+
MANDAV NAGAR,MANDI
|
| 165 |
+
MANGALORE,MANGALORE
|
| 166 |
+
KRISHNA,VIJAYAWADA
|
| 167 |
+
MAYAVARAM,MAYILADUTHURAI
|
| 168 |
+
SIDDAPURAM,MEDAK
|
| 169 |
+
METUKU SEEMA,MEDAK
|
| 170 |
+
BOMBAY,MUMBAI
|
| 171 |
+
MYSORE,MYSURU
|
| 172 |
+
NOWGONG,NAGAON
|
| 173 |
+
NASIK,NASHIK
|
| 174 |
+
VIKRAMA SIMHAPURI,NELLORE
|
| 175 |
+
PARUR,PARAVUR
|
| 176 |
+
EKASILANAGARAM,WARANGAL
|
| 177 |
+
ORUGALLU,WARANGAL
|
| 178 |
+
PALAI,PALA
|
| 179 |
+
PALGHAT,PALAKKAD
|
| 180 |
+
PANJIM,NORTH GOA
|
| 181 |
+
PORTO NOVO,PARANGIPETTAI
|
| 182 |
+
PRAKASAM,ONGOLE
|
| 183 |
+
PUDUCHERRY ,PONDICHERRY
|
| 184 |
+
POONA,PUNE
|
| 185 |
+
RASSEN,RAISEN
|
| 186 |
+
RAJAMAHENDRAVARAMU,RAJAHMUNDRY
|
| 187 |
+
KANPUR DEHAT,KANPUR DEHAT
|
| 188 |
+
RAMABAI NAGAR,KANPUR DEHAT
|
| 189 |
+
ROPAR,RUPNAGAR
|
| 190 |
+
SAUGOR,SAGAR
|
| 191 |
+
MOHALI,SASNAGAR
|
| 192 |
+
VIRATNAGARI,SHAHDOL
|
| 193 |
+
NAWAN SHAHAR,NAWANSHAHR
|
| 194 |
+
SIMLA,SHIMLA
|
| 195 |
+
SHIVAMOGGA,SHIMOGA
|
| 196 |
+
SAIHA,SAIHA
|
| 197 |
+
SIVASAGAR,SIBSAGAR
|
| 198 |
+
SRI POTTI SRI RAMULU NELLORE,NELLORE
|
| 199 |
+
SIKKOLU,SRIKAKULAM
|
| 200 |
+
SURYAPUR,SURAT
|
| 201 |
+
TELLICHERRY,THALASSERY
|
| 202 |
+
THANA,THANE
|
| 203 |
+
TANJORE,THANJAVUR
|
| 204 |
+
TRANQUEBAR,THARANGAMBADI
|
| 205 |
+
TRIVANDRUM,THIRUVANANTHAPURAM
|
| 206 |
+
TUTICORIN,THOOTHUKKUDI
|
| 207 |
+
TRICHUR,THRISSUR
|
| 208 |
+
TRICHINOPOLY,TIRUCHIRAPPALLI
|
| 209 |
+
TINNEVELLY,TIRUNELVELI
|
| 210 |
+
TRINOMALEE,TIRUVANNAMALAI
|
| 211 |
+
TUMAKURU,TUMKUR
|
| 212 |
+
OOTACAMUND,UDHAGAMANDALAM
|
| 213 |
+
AVANTIKA,UJJAIN
|
| 214 |
+
OJJAIN,UJJAIN
|
| 215 |
+
BULSAR,VALSAD
|
| 216 |
+
BANARAS,VARANASI
|
| 217 |
+
BADAGARA,VADAKARA
|
| 218 |
+
BHELSA,VIDISHA
|
| 219 |
+
VIJAYAPURA,BIJAPUR
|
| 220 |
+
BEZAWADA,VIJAYAWADA
|
| 221 |
+
VIRUDUPATTI,VIRUDHUNAGAR
|
| 222 |
+
WALTAIR,VISAKHAPATNAM
|
| 223 |
+
VIZAGAPATAM,VISAKHAPATNAM
|
| 224 |
+
YSR DISTRICT,CUDDAPAH
|
| 225 |
+
MUMBAI,MUMBAI
|
| 226 |
+
BOMBAY,MUMBAI
|
| 227 |
+
MUMBAI SUBURBAN,MUMBAI
|
| 228 |
+
DELHI,DELHI
|
| 229 |
+
NEW DELHI,DELHI
|
| 230 |
+
DELHI NCR,DELHI
|
| 231 |
+
NCT OF DELHI,DELHI
|
| 232 |
+
SEELAMPUR,DELHI
|
| 233 |
+
SHAHDARA,DELHI
|
| 234 |
+
DWARKA,DELHI
|
| 235 |
+
ROHINI,DELHI
|
| 236 |
+
PITAMPURA,DELHI
|
| 237 |
+
KAROL BAGH,DELHI
|
| 238 |
+
LAJPAT NAGAR,DELHI
|
| 239 |
+
SAKET,DELHI
|
| 240 |
+
JANAKPURI,DELHI
|
| 241 |
+
MAYUR VIHAR,DELHI
|
| 242 |
+
VASANT KUNJ,DELHI
|
| 243 |
+
OKHLA,DELHI
|
| 244 |
+
BENGALURU,BENGALURU
|
| 245 |
+
BANGALORE,BENGALURU
|
| 246 |
+
BENGALURU URBAN,BENGALURU
|
| 247 |
+
HYDERABAD,HYDERABAD
|
| 248 |
+
SECUNDERABAD,HYDERABAD
|
| 249 |
+
HYDERABAD CITY,HYDERABAD
|
| 250 |
+
CHENNAI,CHENNAI
|
| 251 |
+
MADRAS,CHENNAI
|
| 252 |
+
CHENNAI CITY,CHENNAI
|
| 253 |
+
KOLKATA,KOLKATA
|
| 254 |
+
CALCUTTA,KOLKATA
|
| 255 |
+
KOLKATA CITY,KOLKATA
|
| 256 |
+
PUNE,PUNE
|
| 257 |
+
POONA,PUNE
|
| 258 |
+
AHMEDABAD,AHMEDABAD
|
| 259 |
+
AMDAVAD,AHMEDABAD
|
| 260 |
+
JAIPUR,JAIPUR
|
| 261 |
+
PINK CITY,JAIPUR
|
| 262 |
+
LUCKNOW,LUCKNOW
|
| 263 |
+
LAKHNAU,LUCKNOW
|
| 264 |
+
KANPUR,KANPUR
|
| 265 |
+
CAWNPORE,KANPUR
|
| 266 |
+
NAGPUR,NAGPUR
|
| 267 |
+
INDORE,INDORE
|
| 268 |
+
THANE,THANE
|
| 269 |
+
THANA,THANE
|
| 270 |
+
BHOPAL,BHOPAL
|
| 271 |
+
VISAKHAPATNAM,VISAKHAPATNAM
|
| 272 |
+
VIZAG,VISAKHAPATNAM
|
| 273 |
+
VISHAKHAPATNAM,VISAKHAPATNAM
|
| 274 |
+
PIMPRI-CHINCHWAD,PIMPRI-CHINCHWAD
|
| 275 |
+
PIMPRI CHINCHWAD,PIMPRI-CHINCHWAD
|
| 276 |
+
PCMC,PIMPRI-CHINCHWAD
|
| 277 |
+
PATNA,PATNA
|
| 278 |
+
PATALIPUTRA,PATNA
|
| 279 |
+
VADODARA,VADODARA
|
| 280 |
+
BARODA,VADODARA
|
| 281 |
+
GHAZIABAD,GHAZIABAD
|
| 282 |
+
GHZ,GHAZIABAD
|
| 283 |
+
LUDHIANA,LUDHIANA
|
| 284 |
+
AGRA,AGRA
|
| 285 |
+
NASHIK,NASHIK
|
| 286 |
+
NASIK,NASHIK
|
| 287 |
+
FARIDABAD,FARIDABAD
|
| 288 |
+
MEERUT,MEERUT
|
| 289 |
+
RAJKOT,RAJKOT
|
| 290 |
+
KALYAN-DOMBIVLI,KALYAN-DOMBIVLI
|
| 291 |
+
KALYAN,KALYAN-DOMBIVLI
|
| 292 |
+
DOMBIVLI,KALYAN-DOMBIVLI
|
| 293 |
+
VASAI-VIRAR,VASAI-VIRAR
|
| 294 |
+
VASAI,VASAI-VIRAR
|
| 295 |
+
VIRAR,VASAI-VIRAR
|
| 296 |
+
VARANASI,VARANASI
|
| 297 |
+
BANARAS,VARANASI
|
| 298 |
+
BENARES,VARANASI
|
| 299 |
+
KASHI,VARANASI
|
| 300 |
+
SRINAGAR,SRINAGAR
|
| 301 |
+
AURANGABAD,AURANGABAD
|
| 302 |
+
DHANBAD,DHANBAD
|
| 303 |
+
AMRITSAR,AMRITSAR
|
| 304 |
+
NAVI MUMBAI,NAVI MUMBAI
|
| 305 |
+
NEW BOMBAY,NAVI MUMBAI
|
| 306 |
+
ALLAHABAD,ALLAHABAD
|
| 307 |
+
PRAYAGRAJ,ALLAHABAD
|
| 308 |
+
ILAHABAD,ALLAHABAD
|
| 309 |
+
RANCHI,RANCHI
|
| 310 |
+
HOWRAH,HOWRAH
|
| 311 |
+
HAORA,HOWRAH
|
| 312 |
+
COIMBATORE,COIMBATORE
|
| 313 |
+
JABALPUR,JABALPUR
|
| 314 |
+
JUBBULPORE,JABALPUR
|
| 315 |
+
GWALIOR,GWALIOR
|
| 316 |
+
VIJAYAWADA,VIJAYAWADA
|
| 317 |
+
JODHPUR,JODHPUR
|
| 318 |
+
MADURAI,MADURAI
|
| 319 |
+
RAIPUR,RAIPUR
|
| 320 |
+
KOTA,KOTA
|
| 321 |
+
GUWAHATI,GUWAHATI
|
| 322 |
+
GAUHATI,GUWAHATI
|
| 323 |
+
CHANDIGARH,CHANDIGARH
|
| 324 |
+
MOHALI,CHANDIGARH
|
| 325 |
+
SAS NAGAR,CHANDIGARH
|
| 326 |
+
KHARAR,CHANDIGARH
|
| 327 |
+
PANCHKULA,CHANDIGARH
|
| 328 |
+
ZIRAKPUR,CHANDIGARH
|
| 329 |
+
SOLAPUR,SOLAPUR
|
| 330 |
+
SHOLAPUR,SOLAPUR
|
| 331 |
+
HUBLI-DHARWAD,HUBLI-DHARWAD
|
| 332 |
+
HUBLI,HUBLI-DHARWAD
|
| 333 |
+
DHARWAD,HUBLI-DHARWAD
|
| 334 |
+
BAREILLY,BAREILLY
|
| 335 |
+
MORADABAD,MORADABAD
|
| 336 |
+
MYSORE,MYSORE
|
| 337 |
+
MYSURU,MYSORE
|
| 338 |
+
GURGAON,GURGAON
|
| 339 |
+
GURUGRAM,GURGAON
|
| 340 |
+
ALIGARH,ALIGARH
|
| 341 |
+
JALANDHAR,JALANDHAR
|
| 342 |
+
TIRUCHIRAPPALLI,TIRUCHIRAPPALLI
|
| 343 |
+
TRICHY,TIRUCHIRAPPALLI
|
| 344 |
+
TRICHINOPOLY,TIRUCHIRAPPALLI
|
| 345 |
+
BHUBANESWAR,BHUBANESWAR
|
| 346 |
+
BHUBANESHWAR,BHUBANESWAR
|
| 347 |
+
SALEM,SALEM
|
| 348 |
+
WARANGAL,WARANGAL
|
| 349 |
+
THIRUVANANTHAPURAM,THIRUVANANTHAPURAM
|
| 350 |
+
TRIVANDRUM,THIRUVANANTHAPURAM
|
| 351 |
+
GUNTUR,GUNTUR
|
| 352 |
+
BHIWANDI,BHIWANDI
|
| 353 |
+
SAHARANPUR,SAHARANPUR
|
| 354 |
+
GORAKHPUR,GORAKHPUR
|
| 355 |
+
BIKANER,BIKANER
|
| 356 |
+
AMRAVATI,AMRAVATI
|
| 357 |
+
NOIDA,NOIDA
|
| 358 |
+
JAMSHEDPUR,JAMSHEDPUR
|
| 359 |
+
TATANAGAR,JAMSHEDPUR
|
| 360 |
+
BHILAI,BHILAI
|
| 361 |
+
BHILAI NAGAR,BHILAI
|
| 362 |
+
CUTTACK,CUTTACK
|
| 363 |
+
FIROZABAD,FIROZABAD
|
| 364 |
+
KOCHI,KOCHI
|
| 365 |
+
COCHIN,KOCHI
|
| 366 |
+
BHAVNAGAR,BHAVNAGAR
|
| 367 |
+
DEHRADUN,DEHRADUN
|
| 368 |
+
DEHRA DUN,DEHRADUN
|
| 369 |
+
DURGAPUR,DURGAPUR
|
| 370 |
+
ASANSOL,ASANSOL
|
| 371 |
+
NANDED,NANDED
|
| 372 |
+
KOLHAPUR,KOLHAPUR
|
| 373 |
+
AJMER,AJMER
|
| 374 |
+
GULBARGA,GULBARGA
|
| 375 |
+
KALABURAGI,GULBARGA
|
| 376 |
+
JAMNAGAR,JAMNAGAR
|
| 377 |
+
UJJAIN,UJJAIN
|
| 378 |
+
LONI,LONI
|
| 379 |
+
SILIGURI,SILIGURI
|
| 380 |
+
JHANSI,JHANSI
|
| 381 |
+
ULHASNAGAR,ULHASNAGAR
|
| 382 |
+
NELLORE,NELLORE
|
| 383 |
+
JAMMU,JAMMU
|
| 384 |
+
SANGALI-MIRAJ-KUPWAD,SANGALI-MIRAJ-KUPWAD
|
| 385 |
+
SANGALI,SANGALI-MIRAJ-KUPWAD
|
| 386 |
+
MIRAJ,SANGALI-MIRAJ-KUPWAD
|
| 387 |
+
KUPWAD,SANGALI-MIRAJ-KUPWAD
|
| 388 |
+
BELGAUM,BELGAUM
|
| 389 |
+
BELAGAVI,BELGAUM
|
| 390 |
+
MANGALORE,MANGALORE
|
| 391 |
+
MANGALURU,MANGALORE
|
| 392 |
+
AMBATTUR,AMBATTUR
|
| 393 |
+
TIRUNELVELI,TIRUNELVELI
|
| 394 |
+
MALEGAON,MALEGAON
|
| 395 |
+
GREATER NOIDA,GREATER NOIDA
|
data/hno_variation_standard.csv
ADDED
|
@@ -0,0 +1,619 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"variation","standard"
|
| 2 |
+
ADJUCENT , ADJ
|
| 3 |
+
ADJACNT , ADJ
|
| 4 |
+
ADJNT , ADJ
|
| 5 |
+
ADJT , ADJ
|
| 6 |
+
ADJN , ADJ
|
| 7 |
+
APPART MENTS , APT
|
| 8 |
+
APARTMENTSNO,APT
|
| 9 |
+
APPART MENT , APT
|
| 10 |
+
APART MENTS, APT
|
| 11 |
+
APPARTMENTS , APT
|
| 12 |
+
APART MENT , APT
|
| 13 |
+
APARTUMENT , APT
|
| 14 |
+
APARTMENTS , APT
|
| 15 |
+
APPARTMENT , APT
|
| 16 |
+
APARTMENT , APT
|
| 17 |
+
APPART , APT
|
| 18 |
+
APPATS , APT
|
| 19 |
+
APTMNT , APT
|
| 20 |
+
APARTM , APT
|
| 21 |
+
APATT , APT
|
| 22 |
+
APATS , APT
|
| 23 |
+
APPTS , APT
|
| 24 |
+
APPTT , APT
|
| 25 |
+
APRTS , APT
|
| 26 |
+
APTMS , APT
|
| 27 |
+
APAT , APT
|
| 28 |
+
APTS , APT
|
| 29 |
+
APPT , APT
|
| 30 |
+
APRT , APT
|
| 31 |
+
APTT , APT
|
| 32 |
+
ARKHADHE , ARCADE
|
| 33 |
+
ARKHADE , ARCADE
|
| 34 |
+
ARKADE , ARCADE
|
| 35 |
+
ARKEDE , ARCADE
|
| 36 |
+
ARCAD , ARCADE
|
| 37 |
+
ARCDE , ARCADE
|
| 38 |
+
AREAA , AREA
|
| 39 |
+
ARIAA , AREA
|
| 40 |
+
AVENUIE , AVENUE
|
| 41 |
+
AVANUE , AVENUE
|
| 42 |
+
AVNUE , AVENUE
|
| 43 |
+
AVEN , AVENUE
|
| 44 |
+
BANC , BANK
|
| 45 |
+
BNK , BANK
|
| 46 |
+
BASTHY , BASTI
|
| 47 |
+
BASTY , BASTI
|
| 48 |
+
BEHIND , BEHIND
|
| 49 |
+
B/D , BEHIND
|
| 50 |
+
BEH , BEHIND
|
| 51 |
+
BHI , BEHIND
|
| 52 |
+
BH, BEHIND
|
| 53 |
+
BESIDCE , BESIDE
|
| 54 |
+
BE SIDE , BESIDE
|
| 55 |
+
BESIDES , BESIDE
|
| 56 |
+
BESID , BESIDE
|
| 57 |
+
BSD , BESIDE
|
| 58 |
+
BUILDINGS , BLDG
|
| 59 |
+
BUILDING , BLDG
|
| 60 |
+
BUILDIN , BLDG
|
| 61 |
+
BUILDG , BLDG
|
| 62 |
+
BUILDI , BLDG
|
| 63 |
+
BUILDL , BLDG
|
| 64 |
+
BUILD,BLDG
|
| 65 |
+
BLDGS , BLDG
|
| 66 |
+
BULDG , BLDG
|
| 67 |
+
BLIG , BLDG
|
| 68 |
+
BULD , BLDG
|
| 69 |
+
BDG , BLDG
|
| 70 |
+
BLD , BLDG
|
| 71 |
+
BLG , BLDG
|
| 72 |
+
BLC0K NO , BLOCK
|
| 73 |
+
BLOK NO , BLOCK
|
| 74 |
+
BLCK NO , BLOCK
|
| 75 |
+
BLK NO , BLOCK
|
| 76 |
+
BL NO,BLOCK
|
| 77 |
+
B NO , BLOCK
|
| 78 |
+
BNO , BLOCK
|
| 79 |
+
BAZAAR , BZR
|
| 80 |
+
BAZZAR , BZR
|
| 81 |
+
COLN , CLNY
|
| 82 |
+
COLY , CLNY
|
| 83 |
+
CLN , CLNY
|
| 84 |
+
CLY , CLNY
|
| 85 |
+
COL , CLNY
|
| 86 |
+
COMPHLEX , CMPLX
|
| 87 |
+
COMPLEX , CMPLX
|
| 88 |
+
CMPLEX , CMPLX
|
| 89 |
+
COMPLX , CMPLX
|
| 90 |
+
CMPL , CMPLX
|
| 91 |
+
CPLX , CMPLX
|
| 92 |
+
CENTRAAL , CNTR
|
| 93 |
+
CENTRAL , CNTR
|
| 94 |
+
CENTER , CNTR
|
| 95 |
+
CENTRE , CNTR
|
| 96 |
+
CENTR , CNTR
|
| 97 |
+
CENTL , CNTR
|
| 98 |
+
CNTRL , CNTR
|
| 99 |
+
CENT , CNTR
|
| 100 |
+
CEN , CNTR
|
| 101 |
+
CNT , CNTR
|
| 102 |
+
COMPOUND , COMPND
|
| 103 |
+
CMPOND , COMPND
|
| 104 |
+
COMPD , COMPND
|
| 105 |
+
COMPO , COMPND
|
| 106 |
+
CMPD , COMPND
|
| 107 |
+
CO OPERATIVE , COOP
|
| 108 |
+
COOPERATIVE , COOP
|
| 109 |
+
CO OPERATE , COOP
|
| 110 |
+
CO OPERAT , COOP
|
| 111 |
+
CO OPP , COOP
|
| 112 |
+
CO-OP , COOP
|
| 113 |
+
CO OP HOUSING SOCIETY , COOPHOUSOC
|
| 114 |
+
CO OP HOUSING SOC , COOPHOUSOC
|
| 115 |
+
CO-OP HOUS SOCTY , COOPHOUSOC
|
| 116 |
+
COOP HSG SOCBLDG , COOPHOUSOC
|
| 117 |
+
CO- OP HOU SOC , COOPHOUSOC
|
| 118 |
+
COOP HSG SOCY , COOPHOUSOC
|
| 119 |
+
CO OP HSU SOC , COOPHOUSOC
|
| 120 |
+
CO O HSG SOC , COOPHOUSOC
|
| 121 |
+
COOP HO SOC , COOPHOUSOC
|
| 122 |
+
CO OP HOUS , COOPHOUSOC
|
| 123 |
+
CO OP SOC , COOPHOUSOC
|
| 124 |
+
CO-OP HS , COOPHOUSOC
|
| 125 |
+
CHS , COOPHOUSOC
|
| 126 |
+
CO OPERATE SOCIETY , COOPSOCIETY
|
| 127 |
+
CORNER , CORN
|
| 128 |
+
CROSS,CROSS
|
| 129 |
+
DISTRICT , DIST
|
| 130 |
+
DISTICT , DIST
|
| 131 |
+
DISTR , DIST
|
| 132 |
+
DOORNUMBER ,DOOR
|
| 133 |
+
DOOR NO ,DOOR
|
| 134 |
+
DOORNO,DOOR
|
| 135 |
+
DOR NO ,DOOR
|
| 136 |
+
DRNO ,DOOR
|
| 137 |
+
D NO ,DOOR
|
| 138 |
+
D-NO ,DOOR
|
| 139 |
+
D.NO,DOOR
|
| 140 |
+
DNO ,DOOR
|
| 141 |
+
ENCLAVE , ENCL
|
| 142 |
+
ENKLAVE , ENCL
|
| 143 |
+
ENCLE , ENCL
|
| 144 |
+
ENC , ENCL
|
| 145 |
+
ESTATES , ESTATE
|
| 146 |
+
ESTAT , ESTATE
|
| 147 |
+
ESTA , ESTATE
|
| 148 |
+
ESTS , ESTATE
|
| 149 |
+
ESTT , ESTATE
|
| 150 |
+
EST , ESTATE
|
| 151 |
+
EXTENCTION , EXTN
|
| 152 |
+
EXTENSION , EXTN
|
| 153 |
+
EXT N , EXTN
|
| 154 |
+
EXTEN , EXTN
|
| 155 |
+
EXETN , EXTN
|
| 156 |
+
EXNT , EXTN
|
| 157 |
+
EXT , EXTN
|
| 158 |
+
EX , EXTN
|
| 159 |
+
V TH ,FIFTH
|
| 160 |
+
FIRST ,FIRST
|
| 161 |
+
I ST ,FIRST
|
| 162 |
+
FIRS ,FIRST
|
| 163 |
+
IST ,FIRST
|
| 164 |
+
FIRSTFLOOR , FIRSTFLR
|
| 165 |
+
FLATNUMBER , FLAT
|
| 166 |
+
F NUMBER , FLAT
|
| 167 |
+
FLAT NOS,FLAT
|
| 168 |
+
FLAT NO,FLAT
|
| 169 |
+
FLATN O , FLAT
|
| 170 |
+
FLAT-NO,FLAT
|
| 171 |
+
FT NO , FLAT
|
| 172 |
+
F NO , FLAT
|
| 173 |
+
FLT , FLAT
|
| 174 |
+
FNO , FLAT
|
| 175 |
+
FT,FLAT
|
| 176 |
+
FLOO , FLR
|
| 177 |
+
FLOR , FLR
|
| 178 |
+
FLUR , FLR
|
| 179 |
+
FR , FLR
|
| 180 |
+
FL , FLR
|
| 181 |
+
IVTH ,FOURTH
|
| 182 |
+
GALLI , GALLY
|
| 183 |
+
GILLY , GALLY
|
| 184 |
+
GULLY , GALLY
|
| 185 |
+
GALA , GALLY
|
| 186 |
+
GALI , GALLY
|
| 187 |
+
GADENS , GARDENS
|
| 188 |
+
GRDENS , GARDENS
|
| 189 |
+
GARDNS , GARDENS
|
| 190 |
+
GRDNS , GARDENS
|
| 191 |
+
GDNS , GARDENS
|
| 192 |
+
GROIUND , GND
|
| 193 |
+
GROUND , GND
|
| 194 |
+
GRD , GND
|
| 195 |
+
GR , GND
|
| 196 |
+
GROUNDFLOOR ,GNDFLR
|
| 197 |
+
GOVERNMENT , GOVT
|
| 198 |
+
GAVERNMENT , GOVT
|
| 199 |
+
GOVRNMNT , GOVT
|
| 200 |
+
GOV , GOVT
|
| 201 |
+
GENERAL POST OFFICE , GPO
|
| 202 |
+
GENRAL POST OFICE , GPO
|
| 203 |
+
GNRL POST OFF , GPO
|
| 204 |
+
GENERAL P O , GPO
|
| 205 |
+
GPO , GPO
|
| 206 |
+
GUNJ , GUNZ
|
| 207 |
+
GANJ , GUNZ
|
| 208 |
+
HILS , HILLS
|
| 209 |
+
HASPATAL , HOSPITAL
|
| 210 |
+
HASPITAL , HOSPITAL
|
| 211 |
+
HOSPTL , HOSPITAL
|
| 212 |
+
HSPTL , HOSPITAL
|
| 213 |
+
HOSPI , HOSPITAL
|
| 214 |
+
HOSP , HOSPITAL
|
| 215 |
+
HOUSE NUMBER ,HOUSE
|
| 216 |
+
HOUSENUMBER ,HOUSE
|
| 217 |
+
HOUSE NO ,HOUSE
|
| 218 |
+
HAUSE NO ,HOUSE
|
| 219 |
+
HOUS NO ,HOUSE
|
| 220 |
+
HOU NO ,HOUSE
|
| 221 |
+
HUS NO ,HOUSE
|
| 222 |
+
H-NO ,HOUSE
|
| 223 |
+
H NO ,HOUSE
|
| 224 |
+
BAZAR , BZR
|
| 225 |
+
BAJAR , BZR
|
| 226 |
+
BAZER , BZR
|
| 227 |
+
BAZR , BZR
|
| 228 |
+
CAMPUS , CAMP
|
| 229 |
+
CAMPAS , CAMP
|
| 230 |
+
CAMPS , CAMP
|
| 231 |
+
CMP , CAMP
|
| 232 |
+
COLONY , CLNY
|
| 233 |
+
COLNI , CLNY
|
| 234 |
+
COLOY , CLNY
|
| 235 |
+
CONLY , CLNY
|
| 236 |
+
H.N.,HOUSE
|
| 237 |
+
H.NO,HOUSE
|
| 238 |
+
HONO,HOUSE
|
| 239 |
+
HNO,HOUSE
|
| 240 |
+
INDUSTRIALESTATE , INDESTATE
|
| 241 |
+
INDUSTREALS , INDL
|
| 242 |
+
INDUSTRIAL , INDL
|
| 243 |
+
INDUSTRIES , INDL
|
| 244 |
+
INDUSTREAL , INDL
|
| 245 |
+
INDUSTRY , INDL
|
| 246 |
+
INDUST , INDL
|
| 247 |
+
INDUL , INDL
|
| 248 |
+
INDLL , INDL
|
| 249 |
+
INDUS , INDL
|
| 250 |
+
INDLS , INDL
|
| 251 |
+
INDU , INDL
|
| 252 |
+
INDS , INDL
|
| 253 |
+
IND , INDL
|
| 254 |
+
SECOND ,SECOND
|
| 255 |
+
II ND ,SECOND
|
| 256 |
+
IIND ,SECOND
|
| 257 |
+
2NDFLR ,SECONDFLOOR
|
| 258 |
+
SECTOR , SECT
|
| 259 |
+
SECTAR , SECT
|
| 260 |
+
SCTR , SECT
|
| 261 |
+
SEC , SECT
|
| 262 |
+
SHOP NO,SHOP
|
| 263 |
+
SHOPNO,SHOP
|
| 264 |
+
SITENO,SITE
|
| 265 |
+
LMTED , LTD
|
| 266 |
+
LT , LTD
|
| 267 |
+
MAIN,MAIN
|
| 268 |
+
MRG , MARG
|
| 269 |
+
MARKETS , MKT
|
| 270 |
+
MARKET , MKT
|
| 271 |
+
MRKET , MKT
|
| 272 |
+
MEKT , MKT
|
| 273 |
+
MRKT , MKT
|
| 274 |
+
MKTE , MKT
|
| 275 |
+
NAGAR , NGR
|
| 276 |
+
NAGER , NGR
|
| 277 |
+
NAGR , NGR
|
| 278 |
+
NGAR , NGR
|
| 279 |
+
NAR , NGR
|
| 280 |
+
NG , NGR
|
| 281 |
+
NATIONAL HIGH WAY , NH
|
| 282 |
+
NATIONAL HI WAY , NH
|
| 283 |
+
NATIONAL HYWAY , NH
|
| 284 |
+
NHW , NH
|
| 285 |
+
NH , NH
|
| 286 |
+
NIWAS , NIVAS
|
| 287 |
+
NUMBER , NO
|
| 288 |
+
NEMBER , NO
|
| 289 |
+
NUMBUR , NO
|
| 290 |
+
NEMBUR , NO
|
| 291 |
+
NUMBR , NO
|
| 292 |
+
NMBR , NO
|
| 293 |
+
OFFICE , OFF
|
| 294 |
+
OFFIC , OFF
|
| 295 |
+
OFICE , OFF
|
| 296 |
+
O/O , OFF
|
| 297 |
+
OPPOSITE , OPP
|
| 298 |
+
OPPOSTE , OPP
|
| 299 |
+
OPPSITE,OPP
|
| 300 |
+
OPPE , OPP
|
| 301 |
+
OPPS , OPP
|
| 302 |
+
OPPT , OPP
|
| 303 |
+
PHESE , PHASE
|
| 304 |
+
FASE , PHASE
|
| 305 |
+
PHAS,PHASE
|
| 306 |
+
PH,PHASE
|
| 307 |
+
PCKET,PKT
|
| 308 |
+
PKT,PKT
|
| 309 |
+
PLOT SR NO,PLOT
|
| 310 |
+
PL NUMBER , PLOT
|
| 311 |
+
P NUMBER , PLOT
|
| 312 |
+
PLOT.NO.,PLOT
|
| 313 |
+
PLOT NO,PLOT
|
| 314 |
+
PLOT.NO,PLOT
|
| 315 |
+
PLOT-NO , PLOT
|
| 316 |
+
PLO NO,PLOT
|
| 317 |
+
PLOTNO , PLOT
|
| 318 |
+
PLT NO , PLOT
|
| 319 |
+
PL.NO.,PLOT
|
| 320 |
+
PL NO , PLOT
|
| 321 |
+
PLOT,PLOT
|
| 322 |
+
PTNO,PLOT
|
| 323 |
+
P NO , PLOT
|
| 324 |
+
PLT,PLOT
|
| 325 |
+
PNO , PLOT
|
| 326 |
+
POST OFFICE , PO
|
| 327 |
+
POSTOFFICE , PO
|
| 328 |
+
POST OFF , PO
|
| 329 |
+
P OFFI , PO
|
| 330 |
+
POS OF , PO
|
| 331 |
+
POST , PO
|
| 332 |
+
P OF , PO
|
| 333 |
+
PT , PO
|
| 334 |
+
POST BOX , POBOX
|
| 335 |
+
PST BOX , POBOX
|
| 336 |
+
P O BOX , POBOX
|
| 337 |
+
POST BX , POBOX
|
| 338 |
+
POS BOX , POBOX
|
| 339 |
+
POCKET ,POCKET
|
| 340 |
+
QUARTER NUMBER ,QUTR
|
| 341 |
+
QUARTERNUMBER ,QUTR
|
| 342 |
+
QUARTER NO ,QUTR
|
| 343 |
+
QUARTERS ,QUTR
|
| 344 |
+
QUARTER,QUTR
|
| 345 |
+
QRTR NO ,QUTR
|
| 346 |
+
QURT NO ,QUTR
|
| 347 |
+
QRTERS ,QUTR
|
| 348 |
+
QTR NO,QUTR
|
| 349 |
+
QRT NO,QUTR
|
| 350 |
+
QR NO ,QUTR
|
| 351 |
+
QTARS ,QUTR
|
| 352 |
+
QURTS ,QUTR
|
| 353 |
+
Q. NO,QUTR
|
| 354 |
+
QTRNO,QUTR
|
| 355 |
+
ADJACENT , ADJ
|
| 356 |
+
SITE,SITE
|
| 357 |
+
VI TH ,SIXTH
|
| 358 |
+
SOCIETY , SOC
|
| 359 |
+
SOCTY , SOC
|
| 360 |
+
SOSTY , SOC
|
| 361 |
+
SOCT , SOC
|
| 362 |
+
SCTY , SOC
|
| 363 |
+
SOCI , SOC
|
| 364 |
+
SOCY , SOC
|
| 365 |
+
STAGE,STAGE
|
| 366 |
+
STETION , STN
|
| 367 |
+
STATION , STN
|
| 368 |
+
STANT , STN
|
| 369 |
+
STION , STN
|
| 370 |
+
STETION ROAD , STNRD
|
| 371 |
+
STATION ROAD , STNRD
|
| 372 |
+
SN ROAD , STNRD
|
| 373 |
+
STREETNUMBER , STR
|
| 374 |
+
ST NUMBER , STR
|
| 375 |
+
STREET NO , STR
|
| 376 |
+
STREEET , STR
|
| 377 |
+
STREET , STR
|
| 378 |
+
STREAT , STR
|
| 379 |
+
STRAET , STR
|
| 380 |
+
STRIT , STR
|
| 381 |
+
STRET , STR
|
| 382 |
+
STEET , STR
|
| 383 |
+
ST NO , STR
|
| 384 |
+
STRT , STR
|
| 385 |
+
STR, STR
|
| 386 |
+
SRT , STR
|
| 387 |
+
SU DIVISION , SUBDIVISION
|
| 388 |
+
SU DIVIZAN , SUBDIVISION
|
| 389 |
+
SU DIVIZON , SUBDIVISION
|
| 390 |
+
SUB DIVI , SUBDIVISION
|
| 391 |
+
SU DIVIS , SUBDIVISION
|
| 392 |
+
SU DVSN , SUBDIVISION
|
| 393 |
+
SURVEY NO,SURVEY
|
| 394 |
+
SURVEYNO,SURVEY
|
| 395 |
+
SY NO,SURVEY
|
| 396 |
+
TALUKHAA , TALUKA
|
| 397 |
+
TALOOKHA , TALUKA
|
| 398 |
+
TALOOKA , TALUKA
|
| 399 |
+
TALUQA , TALUKA
|
| 400 |
+
TALUCA , TALUKA
|
| 401 |
+
TAL , TALUKA
|
| 402 |
+
III RD ,THIRD
|
| 403 |
+
TOWER,TOWER
|
| 404 |
+
VIHAAR , VIHAR
|
| 405 |
+
VIHR , VIHAR
|
| 406 |
+
VILL , VILLAGE
|
| 407 |
+
VILL. , VILLAGE
|
| 408 |
+
VILLGE , VILLAGE
|
| 409 |
+
VILLA , VILLAGE
|
| 410 |
+
VILLG , VILLAGE
|
| 411 |
+
VIL , VILLAGE
|
| 412 |
+
WATER TAN , WATERTANK
|
| 413 |
+
WATER TNK , WATERTANK
|
| 414 |
+
WATR TAN , WATERTANK
|
| 415 |
+
WATER TK , WATERTANK
|
| 416 |
+
WATER T , WATERTANK
|
| 417 |
+
WTR TAN , WATERTANK
|
| 418 |
+
WTR TNK , WATERTANK
|
| 419 |
+
W TNK , WATERTANK
|
| 420 |
+
VINGS , WING
|
| 421 |
+
WINGS , WING
|
| 422 |
+
VING , WING
|
| 423 |
+
CROSS ROAD , XRD
|
| 424 |
+
CROSSROAD , XRD
|
| 425 |
+
CROSS RD , XRD
|
| 426 |
+
XRD , XRD
|
| 427 |
+
JONE , ZONE
|
| 428 |
+
JUNCTION , JN
|
| 429 |
+
JUNCTN , JN
|
| 430 |
+
JNCTN , JN
|
| 431 |
+
JNCN , JN
|
| 432 |
+
JNC , JN
|
| 433 |
+
JNT , JN
|
| 434 |
+
LINE , LANE
|
| 435 |
+
LNE , LANE
|
| 436 |
+
LN , LANE
|
| 437 |
+
LYT,LAYOUT
|
| 438 |
+
LIMITED , LTD
|
| 439 |
+
LIMITID , LTD
|
| 440 |
+
LIMETED , LTD
|
| 441 |
+
LIMTED , LTD
|
| 442 |
+
LIMTD , LTD
|
| 443 |
+
FLOOR , FLR
|
| 444 |
+
I FLOOR,"1 ST FLR"
|
| 445 |
+
FIRST FLOOR,"1 ST FLR"
|
| 446 |
+
GROUND FLOOR,"0 TH FLR"
|
| 447 |
+
Q NO,QUTR
|
| 448 |
+
Q-NO ,QUTR
|
| 449 |
+
QRTS ,QUTR
|
| 450 |
+
QRS ,QUTR
|
| 451 |
+
QRT ,QUTR
|
| 452 |
+
QTS ,QUTR
|
| 453 |
+
QNO ,QUTR
|
| 454 |
+
QR ,QUTR
|
| 455 |
+
RAIL , RAILWAY
|
| 456 |
+
RAWY , RAILWAY
|
| 457 |
+
RLY , RAILWAY
|
| 458 |
+
RAILWAYQUARTERS , RAILWAYQTR
|
| 459 |
+
RLY QRTS , RAILWAYQTR
|
| 460 |
+
RAILWAY STATION , RAILWAYSTN
|
| 461 |
+
RAILWAY STION , RAILWAYSTN
|
| 462 |
+
RLY STATION , RAILWAYSTN
|
| 463 |
+
RL STN , RAILWAYSTN
|
| 464 |
+
RESIDENCE , RES
|
| 465 |
+
RECIDANCE , RES
|
| 466 |
+
RSDENCE , RES
|
| 467 |
+
RSDNCE , RES
|
| 468 |
+
RESI , RES
|
| 469 |
+
RSDN , RES
|
| 470 |
+
RS , RES
|
| 471 |
+
ROAD NO ,ROAD
|
| 472 |
+
RAOD NO ,ROAD
|
| 473 |
+
ROADNO,ROAD
|
| 474 |
+
RD NO,ROAD
|
| 475 |
+
ROADS ,ROAD
|
| 476 |
+
RHODE ,ROAD
|
| 477 |
+
ROAD ,ROAD
|
| 478 |
+
RODE ,ROAD
|
| 479 |
+
R NO ,ROAD
|
| 480 |
+
RDNO ,ROAD
|
| 481 |
+
R-NO ,ROAD
|
| 482 |
+
RAD ,ROAD
|
| 483 |
+
ROA ,ROAD
|
| 484 |
+
ROD ,ROAD
|
| 485 |
+
ROOM NO,ROOM
|
| 486 |
+
ROOMNO,ROOM
|
| 487 |
+
R.NO,ROOM
|
| 488 |
+
R NO,ROOM
|
| 489 |
+
ROUTE , RT
|
| 490 |
+
ROOTE , RT
|
| 491 |
+
RUTE , RT
|
| 492 |
+
RTE , RT
|
| 493 |
+
RUT , RT
|
| 494 |
+
S CRUZ , SANTCRUZ
|
| 495 |
+
SECCOND ,SECOND
|
| 496 |
+
DISTRICT,DIST
|
| 497 |
+
DIST,DIST
|
| 498 |
+
DST,DIST
|
| 499 |
+
DSTR,DIST
|
| 500 |
+
DT,DIST
|
| 501 |
+
ZILLA,DIST
|
| 502 |
+
JILLA,DIST
|
| 503 |
+
ZILA,DIST
|
| 504 |
+
TALUK,TALUK
|
| 505 |
+
TAL,TALUK
|
| 506 |
+
TALUKA,TALUK
|
| 507 |
+
TQ,TALUK
|
| 508 |
+
TEH,TALUK
|
| 509 |
+
TEHS,TALUK
|
| 510 |
+
TEHSIL,TALUK
|
| 511 |
+
MANDAL,TALUK
|
| 512 |
+
MD,TALUK
|
| 513 |
+
VILLAGE,VILLAGE
|
| 514 |
+
VILL,VILLAGE
|
| 515 |
+
VIL,VILLAGE
|
| 516 |
+
VLG,VILLAGE
|
| 517 |
+
GRAMA,VILLAGE
|
| 518 |
+
GRAM,VILLAGE
|
| 519 |
+
GAON,VILLAGE
|
| 520 |
+
CITY,CITY
|
| 521 |
+
CTY,CITY
|
| 522 |
+
TOWN,CITY
|
| 523 |
+
TWN,CITY
|
| 524 |
+
NAGAR,CITY
|
| 525 |
+
NAG,CITY
|
| 526 |
+
PURAM,CITY
|
| 527 |
+
PURA,CITY
|
| 528 |
+
STATE,STATE
|
| 529 |
+
ST,STATE
|
| 530 |
+
RAJYA,STATE
|
| 531 |
+
PRADESH,STATE
|
| 532 |
+
D NO,HOUSE
|
| 533 |
+
D.NO,HOUSE
|
| 534 |
+
D-NO,HOUSE
|
| 535 |
+
D/NO,HOUSE
|
| 536 |
+
DNO,HOUSE
|
| 537 |
+
DOOR NO,HOUSE
|
| 538 |
+
DOOR NUMBER,HOUSE
|
| 539 |
+
APARTMENT,APT
|
| 540 |
+
APT,APT
|
| 541 |
+
APT NO,APT
|
| 542 |
+
APT NUMBER,APT
|
| 543 |
+
APARTMENT NO,APT
|
| 544 |
+
TOWER,APT
|
| 545 |
+
TOWER NO,APT
|
| 546 |
+
WING,APT
|
| 547 |
+
PHASE,APT
|
| 548 |
+
PHASE NO,APT
|
| 549 |
+
RESIDENCY,APT
|
| 550 |
+
RESIDENTIAL COMPLEX,APT
|
| 551 |
+
HEIGHTS,APT
|
| 552 |
+
ENCLAVE,APT
|
| 553 |
+
APARTMENTS,APT
|
| 554 |
+
SOCIETY,APT
|
| 555 |
+
SOCIETY NO,APT
|
| 556 |
+
CHS,APT
|
| 557 |
+
BLDG,BLDG
|
| 558 |
+
BLDG NO,BLDG
|
| 559 |
+
BUILDING,BLDG
|
| 560 |
+
BUILDING NO,BLDG
|
| 561 |
+
BLK,BLOCK
|
| 562 |
+
BLOCK,BLOCK
|
| 563 |
+
BLOCK NO,BLOCK
|
| 564 |
+
FLAT,FLAT
|
| 565 |
+
FLAT NO,FLAT
|
| 566 |
+
FLAT NUMBER,FLAT
|
| 567 |
+
FLT,FLAT
|
| 568 |
+
FLT NO,FLAT
|
| 569 |
+
UNIT,FLAT
|
| 570 |
+
UNIT NO,FLAT
|
| 571 |
+
UNIT NUMBER,FLAT
|
| 572 |
+
PORTION,FLAT
|
| 573 |
+
PORTION NO,FLAT
|
| 574 |
+
OFFICE NO,FLAT
|
| 575 |
+
OFFICE NUMBER,FLAT
|
| 576 |
+
SHOP NO,SHOP
|
| 577 |
+
SHOP NUMBER,SHOP
|
| 578 |
+
ROAD,ROAD
|
| 579 |
+
RD,ROAD
|
| 580 |
+
R D,ROAD
|
| 581 |
+
MARG,ROAD
|
| 582 |
+
MRG,ROAD
|
| 583 |
+
PATH,ROAD
|
| 584 |
+
STREET,STR
|
| 585 |
+
ST,STR
|
| 586 |
+
STR,STR
|
| 587 |
+
GALI,STR
|
| 588 |
+
GALLLI,STR
|
| 589 |
+
LANE,STR
|
| 590 |
+
LN,STR
|
| 591 |
+
MARG,STR
|
| 592 |
+
PATH,STR
|
| 593 |
+
CIRCLE,STR
|
| 594 |
+
CIR,STR
|
| 595 |
+
SECTOR,STR
|
| 596 |
+
SEC,STR
|
| 597 |
+
LANE,LANE
|
| 598 |
+
LN,LANE
|
| 599 |
+
BYLANE,LANE
|
| 600 |
+
CROSS,LANE
|
| 601 |
+
CR,LANE
|
| 602 |
+
EXTENSION,EXTN
|
| 603 |
+
EXT,EXTN
|
| 604 |
+
EXTN,EXTN
|
| 605 |
+
LOCALITY,LOCALITY
|
| 606 |
+
LAYOUT,LOCALITY
|
| 607 |
+
LYT,LOCALITY
|
| 608 |
+
PHASE,LOCALITY
|
| 609 |
+
PH,LOCALITY
|
| 610 |
+
SECTOR,LOCALITY
|
| 611 |
+
SEC,LOCALITY
|
| 612 |
+
COLONY,CLNY
|
| 613 |
+
COL,CLNY
|
| 614 |
+
CLNY,CLNY
|
| 615 |
+
BUILDING,BUILDING
|
| 616 |
+
APT,BUILDING
|
| 617 |
+
APARTMENT,BUILDING
|
| 618 |
+
BLDG,BUILDING
|
| 619 |
+
TOWER,BUILDING
|
data/name_variation_standard.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/pin_city_state.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/state_name_standard.csv
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"variation","standard"
|
| 2 |
+
ANDHRA PRADESH,ANDHRA PRADESH
|
| 3 |
+
ANDHRAPRADESH,ANDHRA PRADESH
|
| 4 |
+
ANDHRA,ANDHRA PRADESH
|
| 5 |
+
AP,ANDHRA PRADESH
|
| 6 |
+
A.P,ANDHRA PRADESH
|
| 7 |
+
A.P.,ANDHRA PRADESH
|
| 8 |
+
AP STATE,ANDHRA PRADESH
|
| 9 |
+
IN-AP,ANDHRA PRADESH
|
| 10 |
+
ARUNACHAL PRADESH,ARUNACHAL PRADESH
|
| 11 |
+
ARUNACHAL,ARUNACHAL PRADESH
|
| 12 |
+
AR,ARUNACHAL PRADESH
|
| 13 |
+
A.R,ARUNACHAL PRADESH
|
| 14 |
+
ARUNACHAL PRADESH STATE,ARUNACHAL PRADESH
|
| 15 |
+
IN-AR,ARUNACHAL PRADESH
|
| 16 |
+
ASSAM,ASSAM
|
| 17 |
+
AS,ASSAM
|
| 18 |
+
A.S,ASSAM
|
| 19 |
+
ASSAM STATE,ASSAM
|
| 20 |
+
IN-AS,ASSAM
|
| 21 |
+
BIHAR,BIHAR
|
| 22 |
+
BR,BIHAR
|
| 23 |
+
B.R,BIHAR
|
| 24 |
+
BIHAR STATE,BIHAR
|
| 25 |
+
IN-BR,BIHAR
|
| 26 |
+
CHHATTISGARH,CHHATTISGARH
|
| 27 |
+
CHATTISGARH,CHHATTISGARH
|
| 28 |
+
CHHATISGARH,CHHATTISGARH
|
| 29 |
+
CG,CHHATTISGARH
|
| 30 |
+
C.G,CHHATTISGARH
|
| 31 |
+
CT,CHHATTISGARH
|
| 32 |
+
CHATTISGARH STATE,CHHATTISGARH
|
| 33 |
+
IN-CG,CHHATTISGARH
|
| 34 |
+
GOA,GOA
|
| 35 |
+
GA,GOA
|
| 36 |
+
G.A,GOA
|
| 37 |
+
IN-GA,GOA
|
| 38 |
+
GUJARAT,GUJARAT
|
| 39 |
+
GUJRAT,GUJARAT
|
| 40 |
+
GUJARATH,GUJARAT
|
| 41 |
+
GJ,GUJARAT
|
| 42 |
+
G.J,GUJARAT
|
| 43 |
+
IN-GJ,GUJARAT
|
| 44 |
+
HARYANA,HARYANA
|
| 45 |
+
HARIYANA,HARYANA
|
| 46 |
+
HR,HARYANA
|
| 47 |
+
H.R,HARYANA
|
| 48 |
+
IN-HR,HARYANA
|
| 49 |
+
HIMACHAL PRADESH,HIMACHAL PRADESH
|
| 50 |
+
HIMACHAL,HIMACHAL PRADESH
|
| 51 |
+
HP,HIMACHAL PRADESH
|
| 52 |
+
H.P,HIMACHAL PRADESH
|
| 53 |
+
H.P.,HIMACHAL PRADESH
|
| 54 |
+
IN-HP,HIMACHAL PRADESH
|
| 55 |
+
JHARKHAND,JHARKHAND
|
| 56 |
+
JH,JHARKHAND
|
| 57 |
+
J.H,JHARKHAND
|
| 58 |
+
IN-JH,JHARKHAND
|
| 59 |
+
KARNATAKA,KARNATAKA
|
| 60 |
+
KARNATAK,KARNATAKA
|
| 61 |
+
KARN,KARNATAKA
|
| 62 |
+
KA,KARNATAKA
|
| 63 |
+
K.A,KARNATAKA
|
| 64 |
+
MYSORE STATE,KARNATAKA
|
| 65 |
+
IN-KA,KARNATAKA
|
| 66 |
+
KERALA,KERALA
|
| 67 |
+
KERALAM,KERALA
|
| 68 |
+
KL,KERALA
|
| 69 |
+
K.L,KERALA
|
| 70 |
+
IN-KL,KERALA
|
| 71 |
+
MADHYA PRADESH,MADHYA PRADESH
|
| 72 |
+
MADHYAPRADESH,MADHYA PRADESH
|
| 73 |
+
MADHYA,MADHYA PRADESH
|
| 74 |
+
MP,MADHYA PRADESH
|
| 75 |
+
M.P,MADHYA PRADESH
|
| 76 |
+
M.P.,MADHYA PRADESH
|
| 77 |
+
MP STATE,MADHYA PRADESH
|
| 78 |
+
IN-MP,MADHYA PRADESH
|
| 79 |
+
MAHARASHTRA,MAHARASHTRA
|
| 80 |
+
MAHARASTRA,MAHARASHTRA
|
| 81 |
+
MAHA,MAHARASHTRA
|
| 82 |
+
MH,MAHARASHTRA
|
| 83 |
+
M.H,MAHARASHTRA
|
| 84 |
+
MAHARASHTRA STATE,MAHARASHTRA
|
| 85 |
+
IN-MH,MAHARASHTRA
|
| 86 |
+
MANIPUR,MANIPUR
|
| 87 |
+
MN,MANIPUR
|
| 88 |
+
M.N,MANIPUR
|
| 89 |
+
IN-MN,MANIPUR
|
| 90 |
+
MEGHALAYA,MEGHALAYA
|
| 91 |
+
ML,MEGHALAYA
|
| 92 |
+
M.L,MEGHALAYA
|
| 93 |
+
IN-ML,MEGHALAYA
|
| 94 |
+
MIZORAM,MIZORAM
|
| 95 |
+
MZ,MIZORAM
|
| 96 |
+
M.Z,MIZORAM
|
| 97 |
+
IN-MZ,MIZORAM
|
| 98 |
+
NAGALAND,NAGALAND
|
| 99 |
+
NL,NAGALAND
|
| 100 |
+
N.L,NAGALAND
|
| 101 |
+
IN-NL,NAGALAND
|
| 102 |
+
ODISHA,ODISHA
|
| 103 |
+
ORISSA,ODISHA
|
| 104 |
+
OD,ODISHA
|
| 105 |
+
O.D,ODISHA
|
| 106 |
+
OR,ODISHA
|
| 107 |
+
O.R,ODISHA
|
| 108 |
+
ODISHA STATE,ODISHA
|
| 109 |
+
IN-OD,ODISHA
|
| 110 |
+
PUNJAB,PUNJAB
|
| 111 |
+
PANJAB,PUNJAB
|
| 112 |
+
PB,PUNJAB
|
| 113 |
+
P.B,PUNJAB
|
| 114 |
+
IN-PB,PUNJAB
|
| 115 |
+
RAJASTHAN,RAJASTHAN
|
| 116 |
+
RAJ,RAJASTHAN
|
| 117 |
+
RJ,RAJASTHAN
|
| 118 |
+
R.J,RAJASTHAN
|
| 119 |
+
RAJASTHAN STATE,RAJASTHAN
|
| 120 |
+
IN-RJ,RAJASTHAN
|
| 121 |
+
SIKKIM,SIKKIM
|
| 122 |
+
SK,SIKKIM
|
| 123 |
+
S.K,SIKKIM
|
| 124 |
+
IN-SK,SIKKIM
|
| 125 |
+
TAMIL NADU,TAMIL NADU
|
| 126 |
+
TAMILNADU,TAMIL NADU
|
| 127 |
+
TAMIL,TAMIL NADU
|
| 128 |
+
TN,TAMIL NADU
|
| 129 |
+
T.N,TAMIL NADU
|
| 130 |
+
T.N.,TAMIL NADU
|
| 131 |
+
TAMILNADU STATE,TAMIL NADU
|
| 132 |
+
IN-TN,TAMIL NADU
|
| 133 |
+
TELANGANA,TELANGANA
|
| 134 |
+
TELENGANA,TELANGANA
|
| 135 |
+
TG,TELANGANA
|
| 136 |
+
T.G,TELANGANA
|
| 137 |
+
TS,TELANGANA
|
| 138 |
+
T.S,TELANGANA
|
| 139 |
+
TELANGANA STATE,TELANGANA
|
| 140 |
+
IN-TS,TELANGANA
|
| 141 |
+
TRIPURA,TRIPURA
|
| 142 |
+
TR,TRIPURA
|
| 143 |
+
T.R,TRIPURA
|
| 144 |
+
IN-TR,TRIPURA
|
| 145 |
+
UTTAR PRADESH,UTTAR PRADESH
|
| 146 |
+
UTTARPRADESH,UTTAR PRADESH
|
| 147 |
+
UTTAR,UTTAR PRADESH
|
| 148 |
+
UP,UTTAR PRADESH
|
| 149 |
+
U.P,UTTAR PRADESH
|
| 150 |
+
U.P.,UTTAR PRADESH
|
| 151 |
+
UP STATE,UTTAR PRADESH
|
| 152 |
+
IN-UP,UTTAR PRADESH
|
| 153 |
+
UTTARAKHAND,UTTARAKHAND
|
| 154 |
+
UTTARANCHAL,UTTARAKHAND
|
| 155 |
+
UK,UTTARAKHAND
|
| 156 |
+
U.K,UTTARAKHAND
|
| 157 |
+
UA,UTTARAKHAND
|
| 158 |
+
UTTARAKHAND STATE,UTTARAKHAND
|
| 159 |
+
IN-UK,UTTARAKHAND
|
| 160 |
+
WEST BENGAL,WEST BENGAL
|
| 161 |
+
WESTBENGAL,WEST BENGAL
|
| 162 |
+
WB,WEST BENGAL
|
| 163 |
+
W.B,WEST BENGAL
|
| 164 |
+
W.B.,WEST BENGAL
|
| 165 |
+
WEST BENGAL STATE,WEST BENGAL
|
| 166 |
+
IN-WB,WEST BENGAL
|
| 167 |
+
ANDAMAN AND NICOBAR ISLANDS,ANDAMAN AND NICOBAR ISLANDS
|
| 168 |
+
ANDAMAN NICOBAR,ANDAMAN AND NICOBAR ISLANDS
|
| 169 |
+
ANDAMAN,ANDAMAN AND NICOBAR ISLANDS
|
| 170 |
+
NICOBAR,ANDAMAN AND NICOBAR ISLANDS
|
| 171 |
+
AN,ANDAMAN AND NICOBAR ISLANDS
|
| 172 |
+
A.N,ANDAMAN AND NICOBAR ISLANDS
|
| 173 |
+
A & N ISLANDS,ANDAMAN AND NICOBAR ISLANDS
|
| 174 |
+
IN-AN,ANDAMAN AND NICOBAR ISLANDS
|
| 175 |
+
CHANDIGARH,CHANDIGARH
|
| 176 |
+
CH,CHANDIGARH
|
| 177 |
+
C.H,CHANDIGARH
|
| 178 |
+
IN-CH,CHANDIGARH
|
| 179 |
+
MOHALI,CHANDIGARH
|
| 180 |
+
SAS NAGAR,CHANDIGARH
|
| 181 |
+
KHARAR,CHANDIGARH
|
| 182 |
+
PANCHKULA,CHANDIGARH
|
| 183 |
+
ZIRAKPUR,CHANDIGARH
|
| 184 |
+
DADRA AND NAGAR HAVELI AND DAMAN AND DIU,DADRA AND NAGAR HAVELI AND DAMAN AND DIU
|
| 185 |
+
DADRA NAGAR HAVELI,DADRA AND NAGAR HAVELI AND DAMAN AND DIU
|
| 186 |
+
DAMAN DIU,DADRA AND NAGAR HAVELI AND DAMAN AND DIU
|
| 187 |
+
DN,DADRA AND NAGAR HAVELI AND DAMAN AND DIU
|
| 188 |
+
D.N,DADRA AND NAGAR HAVELI AND DAMAN AND DIU
|
| 189 |
+
DNH,DADRA AND NAGAR HAVELI AND DAMAN AND DIU
|
| 190 |
+
DD,DADRA AND NAGAR HAVELI AND DAMAN AND DIU
|
| 191 |
+
IN-DH,DADRA AND NAGAR HAVELI AND DAMAN AND DIU
|
| 192 |
+
DELHI,DELHI
|
| 193 |
+
NEW DELHI,DELHI
|
| 194 |
+
DL,DELHI
|
| 195 |
+
D.L,DELHI
|
| 196 |
+
NCT OF DELHI,DELHI
|
| 197 |
+
NATIONAL CAPITAL TERRITORY OF DELHI,DELHI
|
| 198 |
+
NORTH EAST DELHI,DELHI
|
| 199 |
+
NORTH WEST DELHI,DELHI
|
| 200 |
+
SOUTH EAST DELHI,DELHI
|
| 201 |
+
SOUTH WEST DELHI,DELHI
|
| 202 |
+
SEELAMPUR,DELHI
|
| 203 |
+
SHAHDARA,DELHI
|
| 204 |
+
DWARKA,DELHI
|
| 205 |
+
ROHINI,DELHI
|
| 206 |
+
PITAMPURA,DELHI
|
| 207 |
+
KAROL BAGH,DELHI
|
| 208 |
+
LAJPAT NAGAR,DELHI
|
| 209 |
+
SAKET,DELHI
|
| 210 |
+
JANAKPURI,DELHI
|
| 211 |
+
MAYUR VIHAR,DELHI
|
| 212 |
+
VASANT KUNJ,DELHI
|
| 213 |
+
OKHLA,DELHI
|
| 214 |
+
NOIDA,DELHI
|
| 215 |
+
GREATER NOIDA,DELHI
|
| 216 |
+
FARIDABAD,DELHI
|
| 217 |
+
GHAZIABAD,DELHI
|
| 218 |
+
GHZ,DELHI
|
| 219 |
+
INDIRAPURAM,DELHI
|
| 220 |
+
GURUGRAM,DELHI
|
| 221 |
+
GURGAON,DELHI
|
| 222 |
+
IN-DL,DELHI
|
| 223 |
+
JAMMU AND KASHMIR,JAMMU AND KASHMIR
|
| 224 |
+
JAMMU,JAMMU AND KASHMIR
|
| 225 |
+
KASHMIR,JAMMU AND KASHMIR
|
| 226 |
+
JK,JAMMU AND KASHMIR
|
| 227 |
+
J.K,JAMMU AND KASHMIR
|
| 228 |
+
J&K,JAMMU AND KASHMIR
|
| 229 |
+
JAMMU & KASHMIR,JAMMU AND KASHMIR
|
| 230 |
+
IN-JK,JAMMU AND KASHMIR
|
| 231 |
+
LADAKH,LADAKH
|
| 232 |
+
LA,LADAKH
|
| 233 |
+
L.A,LADAKH
|
| 234 |
+
IN-LA,LADAKH
|
| 235 |
+
LAKSHADWEEP,LAKSHADWEEP
|
| 236 |
+
LAKSHADWEEP ISLANDS,LAKSHADWEEP
|
| 237 |
+
LD,LAKSHADWEEP
|
| 238 |
+
L.D,LAKSHADWEEP
|
| 239 |
+
IN-LD,LAKSHADWEEP
|
| 240 |
+
PUDUCHERRY,PUDUCHERRY
|
| 241 |
+
PONDICHERRY,PUDUCHERRY
|
| 242 |
+
PY,PUDUCHERRY
|
| 243 |
+
P.Y,PUDUCHERRY
|
| 244 |
+
IN-PY,PUDUCHERRY
|
data/sur_comm_names.csv
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"s_no","surname_community_extension"
|
| 2 |
+
1,SINGH
|
| 3 |
+
2,SHARMA
|
| 4 |
+
3,GUPTA
|
| 5 |
+
4,VERMA
|
| 6 |
+
5,AGARWAL
|
| 7 |
+
6,AGGARWAL
|
| 8 |
+
7,BANSAL
|
| 9 |
+
8,GOYAL
|
| 10 |
+
9,MITTAL
|
| 11 |
+
10,SRIVASTAVA
|
| 12 |
+
11,CHAUDHARY
|
| 13 |
+
12,CHOUDHARY
|
| 14 |
+
13,PANDEY
|
| 15 |
+
14,MISHRA
|
| 16 |
+
15,TIWARI
|
| 17 |
+
16,YADAV
|
| 18 |
+
17,PATEL
|
| 19 |
+
18,SHAH
|
| 20 |
+
19,MEHTA
|
| 21 |
+
20,DESAI
|
| 22 |
+
21,JOSHI
|
| 23 |
+
22,KULKARNI
|
| 24 |
+
23,PAWAR
|
| 25 |
+
24,JADHAV
|
| 26 |
+
25,SHINDE
|
| 27 |
+
26,REDDY
|
| 28 |
+
27,RAO
|
| 29 |
+
28,NAIDU
|
| 30 |
+
29,VARMA
|
| 31 |
+
30,GOWDA
|
| 32 |
+
31,SHETTY
|
| 33 |
+
32,SETTY
|
| 34 |
+
33,NAIR
|
| 35 |
+
34,PILLAI
|
| 36 |
+
35,MENON
|
| 37 |
+
36,DAS
|
| 38 |
+
37,DUTTA
|
| 39 |
+
38,ROY
|
| 40 |
+
39,SARKAR
|
| 41 |
+
40,MONDAL
|
| 42 |
+
41,GHOSH
|
| 43 |
+
42,BOSE
|
| 44 |
+
43,SEN
|
| 45 |
+
44,THOMAS
|
| 46 |
+
45,MATHEW
|
| 47 |
+
46,JOSEPH
|
| 48 |
+
47,JOHN
|
| 49 |
+
48,GEORGE
|
| 50 |
+
49,KAUR
|
| 51 |
+
50,KHAN
|
| 52 |
+
51,AHMED
|
| 53 |
+
52,AHMAD
|
| 54 |
+
53,ANSARI
|
| 55 |
+
54,SHAIKH
|
| 56 |
+
55,SHEIKH
|
| 57 |
+
56,SYED
|
| 58 |
+
57,HUSSAIN
|
| 59 |
+
58,QURESHI
|
| 60 |
+
59,SIDDIQUI
|
| 61 |
+
60,FAROOQI
|
| 62 |
+
61,PATHAN
|
| 63 |
+
62,BEG
|
| 64 |
+
63,BAIG
|
| 65 |
+
64,MIRZA
|
| 66 |
+
65,USMANI
|
| 67 |
+
66,RAZA
|
| 68 |
+
67,PATAN
|
| 69 |
+
68,NAQVI
|
| 70 |
+
69,RIZVI
|
| 71 |
+
70,KAZMI
|
| 72 |
+
71,ZAIDI
|
| 73 |
+
72,BUKHARI
|
| 74 |
+
73,CHISHTI
|
| 75 |
+
74,MADANI
|
| 76 |
+
75,NOMANI
|
| 77 |
+
76,FARUQI
|
| 78 |
+
77,HASHMI
|
| 79 |
+
78,AZMI
|
| 80 |
+
79,KAZI
|
| 81 |
+
80,QAZI
|
| 82 |
+
81,MEMON
|
| 83 |
+
82,BOHRA
|
| 84 |
+
83,ATTAR
|
| 85 |
+
84,TAMBOLI
|
| 86 |
+
85,NADAF
|
| 87 |
+
86,PINJARI
|
| 88 |
+
87,BAGWAN
|
| 89 |
+
88,KUMAR
|
| 90 |
+
89,KUMARI
|
| 91 |
+
90,DEVI
|
| 92 |
+
91,LAL
|
| 93 |
+
92,PRASAD
|
| 94 |
+
93,CHANDRA
|
| 95 |
+
94,NATH
|
| 96 |
+
95,RAJ
|
| 97 |
+
96,DEV
|
| 98 |
+
97,BABU
|
| 99 |
+
98,MOHAMMED
|
| 100 |
+
99,MUHAMMAD
|
| 101 |
+
100,MOHAMMAD
|
| 102 |
+
101,MOHAMED
|
| 103 |
+
102,MOHD
|
| 104 |
+
103,MD
|
| 105 |
+
104,MHD
|
| 106 |
+
105,ABDUL
|
| 107 |
+
106,ABD
|
| 108 |
+
107,ALI
|
| 109 |
+
108,HASSAN
|
| 110 |
+
109,PAUL
|
| 111 |
+
110,PAL
|
| 112 |
+
111,TRIPATHI
|
| 113 |
+
112,DWIVEDI
|
| 114 |
+
113,CHATURVEDI
|
| 115 |
+
114,UPADHYAY
|
| 116 |
+
115,BHARDWAJ
|
| 117 |
+
116,BHARGAVA
|
| 118 |
+
117,VASHISHTHA
|
| 119 |
+
118,SHUKLA
|
| 120 |
+
119,DUBEY
|
| 121 |
+
120,DUBE
|
| 122 |
+
121,TYAGI
|
| 123 |
+
122,SAXENA
|
| 124 |
+
123,MATHUR
|
| 125 |
+
124,TANDON
|
| 126 |
+
125,KHANNA
|
| 127 |
+
126,ARORA
|
| 128 |
+
127,MALHOTRA
|
| 129 |
+
128,BATRA
|
| 130 |
+
129,GROVER
|
| 131 |
+
130,BEDI
|
| 132 |
+
131,SODHI
|
| 133 |
+
132,AHUJA
|
| 134 |
+
133,CHAWLA
|
| 135 |
+
134,SANDHU
|
| 136 |
+
135,SIDHU
|
| 137 |
+
136,DHILLON
|
| 138 |
+
137,BRAR
|
| 139 |
+
138,RANDHAWA
|
| 140 |
+
139,GILL
|
| 141 |
+
140,MANN
|
| 142 |
+
141,CHEEMA
|
| 143 |
+
142,CHAHAL
|
| 144 |
+
143,PUNIA
|
| 145 |
+
144,JAIN
|
| 146 |
+
145,DOSHI
|
| 147 |
+
146,DALAL
|
| 148 |
+
147,MODI
|
| 149 |
+
148,PAREKH
|
| 150 |
+
149,ZAVERI
|
| 151 |
+
150,SANGHVI
|
| 152 |
+
151,SOMANI
|
| 153 |
+
152,LODHA
|
| 154 |
+
153,LODH
|
| 155 |
+
154,PATIL
|
| 156 |
+
155,DESHMUKH
|
| 157 |
+
156,GAIKWAD
|
| 158 |
+
157,KADAM
|
| 159 |
+
158,KAMBLE
|
| 160 |
+
159,SALUNKHE
|
| 161 |
+
160,BHOSALE
|
| 162 |
+
161,MORE
|
| 163 |
+
162,PENDSE
|
| 164 |
+
163,KARANDE
|
| 165 |
+
164,ACHARYA
|
| 166 |
+
165,HEGDE
|
| 167 |
+
166,BHAT
|
| 168 |
+
167,BHATT
|
| 169 |
+
168,IYER
|
| 170 |
+
169,IYENGAR
|
| 171 |
+
170,MALIK
|
| 172 |
+
171,REHMAN
|
| 173 |
+
172,RAHMAN
|
| 174 |
+
173,AKHTAR
|
| 175 |
+
174,IQBAL
|
| 176 |
+
175,SALMAN
|
| 177 |
+
176,SULTAN
|
| 178 |
+
177,TARIQ
|
| 179 |
+
178,JAVED
|
| 180 |
+
179,FAIZ
|
| 181 |
+
180,RAI
|
| 182 |
+
181,
|
frontend/app.py
ADDED
|
@@ -0,0 +1,673 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
import os
|
| 5 |
+
import requests
|
| 6 |
+
import sys, os
|
| 7 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 8 |
+
from backend.matching_service import perform_match
|
| 9 |
+
from backend.models import EntityRecord, MatchMode
|
| 10 |
+
|
| 11 |
+
# =========================================================
|
| 12 |
+
# CONSTANTS
|
| 13 |
+
# =========================================================
|
| 14 |
+
MAX_FIELDS = 20
|
| 15 |
+
|
| 16 |
+
# =========================================================
|
| 17 |
+
# CUSTOM CSS - Matching the original Streamlit design
|
| 18 |
+
# =========================================================
|
| 19 |
+
CUSTOM_CSS = """
|
| 20 |
+
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
|
| 21 |
+
|
| 22 |
+
* {
|
| 23 |
+
font-family: 'Inter', sans-serif;
|
| 24 |
+
box-sizing: border-box;
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
body, .gradio-container {
|
| 28 |
+
background-color: #f0f2f5 !important;
|
| 29 |
+
color: #333 !important;
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
/* ── App wrapper ── */
|
| 33 |
+
.gradio-container {
|
| 34 |
+
max-width: 1400px !important;
|
| 35 |
+
margin: 0 auto !important;
|
| 36 |
+
padding: 20px !important;
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
/* ── Header ── */
|
| 40 |
+
.app-header {
|
| 41 |
+
text-align: center;
|
| 42 |
+
margin-bottom: 24px;
|
| 43 |
+
padding: 24px 0 8px;
|
| 44 |
+
}
|
| 45 |
+
.app-header h1 {
|
| 46 |
+
color: #5B4E8B;
|
| 47 |
+
font-size: 26px;
|
| 48 |
+
font-weight: 700;
|
| 49 |
+
margin: 0 0 6px;
|
| 50 |
+
}
|
| 51 |
+
.app-header p {
|
| 52 |
+
color: #666;
|
| 53 |
+
font-size: 14px;
|
| 54 |
+
margin: 0;
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
/* ── Record header ── */
|
| 58 |
+
.record-header {
|
| 59 |
+
color: #612383;
|
| 60 |
+
font-size: 22px;
|
| 61 |
+
font-weight: 700;
|
| 62 |
+
padding-bottom: 10px;
|
| 63 |
+
margin-bottom: 18px;
|
| 64 |
+
border-bottom: 3px solid transparent;
|
| 65 |
+
border-image: linear-gradient(90deg, #612383, #E9592E, #F5A700) 1;
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
/* ── Section card ── */
|
| 69 |
+
.section-card {
|
| 70 |
+
background: white;
|
| 71 |
+
border-radius: 12px;
|
| 72 |
+
box-shadow: 0 2px 8px rgba(0,0,0,0.08);
|
| 73 |
+
margin-bottom: 18px;
|
| 74 |
+
overflow: hidden;
|
| 75 |
+
}
|
| 76 |
+
.section-header-gradient {
|
| 77 |
+
background: linear-gradient(90deg, #612383 0%, #E9592E 100%);
|
| 78 |
+
color: white;
|
| 79 |
+
padding: 12px 18px;
|
| 80 |
+
font-size: 13px;
|
| 81 |
+
font-weight: 600;
|
| 82 |
+
text-transform: uppercase;
|
| 83 |
+
letter-spacing: 0.5px;
|
| 84 |
+
}
|
| 85 |
+
.section-body {
|
| 86 |
+
padding: 18px;
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
/* ── Gradio overrides ── */
|
| 90 |
+
.gr-form, .gr-box {
|
| 91 |
+
background: transparent !important;
|
| 92 |
+
border: none !important;
|
| 93 |
+
box-shadow: none !important;
|
| 94 |
+
padding: 0 !important;
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
label span {
|
| 98 |
+
color: #555 !important;
|
| 99 |
+
font-size: 13px !important;
|
| 100 |
+
font-weight: 500 !important;
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
input[type="text"], textarea {
|
| 104 |
+
background-color: #fafbfc !important;
|
| 105 |
+
color: #333 !important;
|
| 106 |
+
border: 1px solid #e1e4e8 !important;
|
| 107 |
+
border-radius: 8px !important;
|
| 108 |
+
font-size: 14px !important;
|
| 109 |
+
transition: border-color 0.2s;
|
| 110 |
+
}
|
| 111 |
+
input[type="text"]:focus, textarea:focus {
|
| 112 |
+
border-color: #E9592E !important;
|
| 113 |
+
box-shadow: 0 0 0 3px rgba(233,89,46,0.10) !important;
|
| 114 |
+
outline: none !important;
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
/* ── Run Match button ── */
|
| 118 |
+
#run-match-btn {
|
| 119 |
+
background: linear-gradient(90deg, #612383 0%, #E9592E 100%) !important;
|
| 120 |
+
color: white !important;
|
| 121 |
+
border: none !important;
|
| 122 |
+
border-radius: 10px !important;
|
| 123 |
+
padding: 16px 32px !important;
|
| 124 |
+
font-size: 16px !important;
|
| 125 |
+
font-weight: 600 !important;
|
| 126 |
+
text-transform: uppercase;
|
| 127 |
+
letter-spacing: 0.5px;
|
| 128 |
+
box-shadow: 0 4px 12px rgba(97,35,131,0.25) !important;
|
| 129 |
+
transition: all 0.3s ease;
|
| 130 |
+
cursor: pointer;
|
| 131 |
+
width: 100%;
|
| 132 |
+
}
|
| 133 |
+
#run-match-btn:hover {
|
| 134 |
+
background: linear-gradient(90deg, #E9592E 0%, #612383 100%) !important;
|
| 135 |
+
transform: translateY(-2px);
|
| 136 |
+
box-shadow: 0 6px 16px rgba(233,89,46,0.35) !important;
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
/* ── Add/Remove inline buttons ── */
|
| 140 |
+
.btn-inline {
|
| 141 |
+
width: 36px !important;
|
| 142 |
+
height: 36px !important;
|
| 143 |
+
min-width: 36px !important;
|
| 144 |
+
padding: 0 !important;
|
| 145 |
+
border-radius: 6px !important;
|
| 146 |
+
font-size: 20px !important;
|
| 147 |
+
font-weight: 500 !important;
|
| 148 |
+
background-color: white !important;
|
| 149 |
+
color: #612383 !important;
|
| 150 |
+
border: 1px solid #d0d7de !important;
|
| 151 |
+
cursor: pointer;
|
| 152 |
+
}
|
| 153 |
+
.btn-inline:hover {
|
| 154 |
+
border-color: #28a745 !important;
|
| 155 |
+
color: #28a745 !important;
|
| 156 |
+
background: #f6fef9 !important;
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
/* ── Add Field gradient button ── */
|
| 160 |
+
.btn-add-field {
|
| 161 |
+
background: linear-gradient(90deg, #612383 0%, #E9592E 100%) !important;
|
| 162 |
+
color: white !important;
|
| 163 |
+
border: none !important;
|
| 164 |
+
border-radius: 8px !important;
|
| 165 |
+
padding: 10px 22px !important;
|
| 166 |
+
font-size: 13px !important;
|
| 167 |
+
font-weight: 600 !important;
|
| 168 |
+
text-transform: uppercase;
|
| 169 |
+
letter-spacing: 0.5px;
|
| 170 |
+
cursor: pointer;
|
| 171 |
+
margin-top: 10px;
|
| 172 |
+
box-shadow: 0 3px 8px rgba(97,35,131,0.2);
|
| 173 |
+
}
|
| 174 |
+
.btn-add-field:hover {
|
| 175 |
+
background: linear-gradient(90deg, #E9592E 0%, #612383 100%) !important;
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
/* ── Backend status ── */
|
| 179 |
+
.status-online { color: #28a745; font-weight: 600; font-size: 14px; }
|
| 180 |
+
.status-offline { color: #dc3545; font-weight: 600; font-size: 14px; }
|
| 181 |
+
|
| 182 |
+
/* ── Result box ── */
|
| 183 |
+
.result-box {
|
| 184 |
+
background: white;
|
| 185 |
+
border-radius: 12px;
|
| 186 |
+
padding: 24px;
|
| 187 |
+
margin-top: 24px;
|
| 188 |
+
box-shadow: 0 4px 16px rgba(0,0,0,0.10);
|
| 189 |
+
border-top: 4px solid transparent;
|
| 190 |
+
border-image: linear-gradient(90deg, #612383, #E9592E, #F5A700) 1;
|
| 191 |
+
}
|
| 192 |
+
.result-header {
|
| 193 |
+
color: #612383;
|
| 194 |
+
font-size: 17px;
|
| 195 |
+
font-weight: 600;
|
| 196 |
+
margin-bottom: 12px;
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
/* ── Subsection label ── */
|
| 200 |
+
.subsection-label {
|
| 201 |
+
color: #666;
|
| 202 |
+
font-size: 13px;
|
| 203 |
+
font-weight: 600;
|
| 204 |
+
margin: 14px 0 8px;
|
| 205 |
+
text-transform: uppercase;
|
| 206 |
+
letter-spacing: 0.3px;
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
/* ── Address divider ── */
|
| 210 |
+
.addr-divider {
|
| 211 |
+
border: none;
|
| 212 |
+
border-top: 1px solid #e1e4e8;
|
| 213 |
+
margin: 16px 0;
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
/* ── Accordion / Group override ── */
|
| 217 |
+
.gr-group {
|
| 218 |
+
border: none !important;
|
| 219 |
+
background: transparent !important;
|
| 220 |
+
padding: 0 !important;
|
| 221 |
+
}
|
| 222 |
+
"""
|
| 223 |
+
|
| 224 |
+
# =========================================================
|
| 225 |
+
# HELPERS
|
| 226 |
+
# =========================================================
|
| 227 |
+
def preprocess_text(text):
    """Trim *text* and collapse each run of whitespace into a single space.

    Falsy input (for example ``None`` or an empty string) yields ``""``.
    """
    if text:
        trimmed = text.strip()
        # Interior runs of whitespace (spaces, tabs, newlines) become one space.
        return re.sub(r"\s+", " ", trimmed)
    return ""
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def check_backend_health(url="http://127.0.0.1:8000/backend/v1/health", timeout=3):
    """Probe the backend health endpoint and return a status label.

    Args:
        url: Health-check endpoint to query; defaults to the local backend.
        timeout: Seconds to wait for the response before giving up.

    Returns:
        ``"🟢 Backend Server Reachable"`` when the endpoint answers with
        HTTP 200, otherwise ``"🔴 Backend Terminated"`` (any other status
        code, or any failure while issuing the request).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except Exception:
        # Best-effort probe: a failed request is reported as the offline
        # status instead of being raised to the caller.
        return "🔴 Backend Terminated"

    if response.status_code == 200:
        return "🟢 Backend Server Reachable"
    return "🔴 Backend Terminated"
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def convert_to_api_payload(record: dict) -> dict:
    """Reshape a flat record dict into the nested payload layout.

    Scalar fields are copied by name (missing keys become ``""``). Indexed
    keys ``phone_<i>`` / ``email_<i>`` for ``i`` in ``range(MAX_FIELDS)`` are
    collected into the ``phones`` / ``emails`` lists when truthy. An address
    dict is appended for slot ``i`` whenever at least one of its four keys is
    present in ``record``, even if the values are blank. Every remaining key
    with a non-blank value is stringified into ``custom_fields``.
    """
    scalar_fields = (
        "name", "firstname", "middlename", "lastname",
        "mothername", "fathername", "spousename", "othername",
        "dob", "gender",
        "AADHAR", "pan", "licenseid", "passportid", "voterid",
        "companyname", "parentcompanyname",
    )
    payload = {field: record.get(field, "") for field in scalar_fields}
    payload.update(phones=[], emails=[], addresses=[], custom_fields={})

    address_parts = ("addressline", "city", "state", "zipcode")
    for slot in range(MAX_FIELDS):
        phone = record.get(f"phone_{slot}", "")
        if phone:
            payload["phones"].append(str(phone))

        email = record.get(f"email_{slot}", "")
        if email:
            payload["emails"].append(str(email))

        # Presence of any key (not its value) decides whether the slot exists.
        if any(f"{part}_{slot}" in record for part in address_parts):
            payload["addresses"].append(
                {part: record.get(f"{part}_{slot}", "") for part in address_parts}
            )

    indexed_prefixes = ("addressline_", "city_", "state_", "zipcode_", "phone_", "email_")
    custom_fields = payload["custom_fields"]
    for key, value in record.items():
        label = str(key)
        if label in scalar_fields or label.startswith(indexed_prefixes):
            continue
        if value and str(value).strip():
            custom_fields[label] = str(value)

    return payload
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
# =========================================================
|
| 303 |
+
# MATCH FUNCTION (called by the Run button)
|
| 304 |
+
# =========================================================
|
| 305 |
+
def run_match(
    # -- Record 1: personal --
    r1_name, r1_firstname, r1_middlename, r1_lastname,
    r1_mothername, r1_fathername, r1_spousename, r1_othername,
    r1_dob, r1_gender,
    # -- Record 1: identifiers --
    r1_aadhar, r1_pan, r1_licenseid, r1_passportid, r1_voterid,
    # -- Record 1: addresses (5 slots) --
    r1_addr0_line, r1_addr0_city, r1_addr0_state, r1_addr0_zip,
    r1_addr1_line, r1_addr1_city, r1_addr1_state, r1_addr1_zip,
    r1_addr2_line, r1_addr2_city, r1_addr2_state, r1_addr2_zip,
    r1_addr3_line, r1_addr3_city, r1_addr3_state, r1_addr3_zip,
    r1_addr4_line, r1_addr4_city, r1_addr4_state, r1_addr4_zip,
    # -- Record 1: phones (5 slots) --
    r1_phone0, r1_phone1, r1_phone2, r1_phone3, r1_phone4,
    # -- Record 1: emails (5 slots) --
    r1_email0, r1_email1, r1_email2, r1_email3, r1_email4,
    # -- Record 1: employment --
    r1_company, r1_parent_company,
    # -- Record 1: custom fields (5 slots) --
    r1_cf0_name, r1_cf0_val,
    r1_cf1_name, r1_cf1_val,
    r1_cf2_name, r1_cf2_val,
    r1_cf3_name, r1_cf3_val,
    r1_cf4_name, r1_cf4_val,

    # -- Record 2: personal --
    r2_name, r2_firstname, r2_middlename, r2_lastname,
    r2_mothername, r2_fathername, r2_spousename, r2_othername,
    r2_dob, r2_gender,
    # -- Record 2: identifiers --
    r2_aadhar, r2_pan, r2_licenseid, r2_passportid, r2_voterid,
    # -- Record 2: addresses (5 slots) --
    r2_addr0_line, r2_addr0_city, r2_addr0_state, r2_addr0_zip,
    r2_addr1_line, r2_addr1_city, r2_addr1_state, r2_addr1_zip,
    r2_addr2_line, r2_addr2_city, r2_addr2_state, r2_addr2_zip,
    r2_addr3_line, r2_addr3_city, r2_addr3_state, r2_addr3_zip,
    r2_addr4_line, r2_addr4_city, r2_addr4_state, r2_addr4_zip,
    # -- Record 2: phones (5 slots) --
    r2_phone0, r2_phone1, r2_phone2, r2_phone3, r2_phone4,
    # -- Record 2: emails (5 slots) --
    r2_email0, r2_email1, r2_email2, r2_email3, r2_email4,
    # -- Record 2: employment --
    r2_company, r2_parent_company,
    # -- Record 2: custom fields (5 slots) --
    r2_cf0_name, r2_cf0_val,
    r2_cf1_name, r2_cf1_val,
    r2_cf2_name, r2_cf2_val,
    r2_cf3_name, r2_cf3_val,
    r2_cf4_name, r2_cf4_val,
):
    """Match the two records entered in the UI and return the result as JSON text.

    The 114 positional parameters are the 57 widget values of Record 1
    followed by the 57 widget values of Record 2, in the same order in which
    ``build_app`` assembles its ``all_inputs`` list.

    Each half is folded into a flat record dict, converted with
    ``convert_to_api_payload``, wrapped in ``EntityRecord`` and compared
    in-process with ``perform_match(..., mode="embedding")``.

    Returns:
        A pretty-printed JSON string with the keys ``overall_decision``,
        ``reason`` and ``field_results``. If anything raises while converting
        or matching, the JSON string is ``{"error": "<message>"}`` instead, so
        the UI always receives displayable text.
    """
    def build_record(
        name, firstname, middlename, lastname,
        mothername, fathername, spousename, othername, dob, gender,
        aadhar, pan, licenseid, passportid, voterid,
        addr_lines, phones, emails,
        company, parent_company,
        custom_fields_pairs,
    ):
        """Assemble one flat record dict keyed the way convert_to_api_payload reads it."""
        rec = {
            "name": name, "firstname": firstname, "middlename": middlename,
            "lastname": lastname, "mothername": mothername, "fathername": fathername,
            "spousename": spousename, "othername": othername, "dob": dob, "gender": gender,
            "AADHAR": aadhar, "pan": pan, "licenseid": licenseid,
            "passportid": passportid, "voterid": voterid,
            "companyname": company, "parentcompanyname": parent_company,
        }
        for i, (line, city, state, zipcode) in enumerate(addr_lines):
            rec[f"addressline_{i}"] = line
            rec[f"city_{i}"] = city
            rec[f"state_{i}"] = state
            rec[f"zipcode_{i}"] = zipcode
        for i, ph in enumerate(phones):
            rec[f"phone_{i}"] = ph
        for i, em in enumerate(emails):
            rec[f"email_{i}"] = em
        # NOTE(review): a custom field whose name equals a core key (e.g.
        # "name" or "phone_0") overwrites that entry - confirm this is intended.
        for cf_name, cf_val in custom_fields_pairs:
            if cf_name and cf_name.strip():
                rec[cf_name.strip()] = cf_val
        return rec

    r1 = build_record(
        r1_name, r1_firstname, r1_middlename, r1_lastname,
        r1_mothername, r1_fathername, r1_spousename, r1_othername, r1_dob, r1_gender,
        r1_aadhar, r1_pan, r1_licenseid, r1_passportid, r1_voterid,
        [
            (r1_addr0_line, r1_addr0_city, r1_addr0_state, r1_addr0_zip),
            (r1_addr1_line, r1_addr1_city, r1_addr1_state, r1_addr1_zip),
            (r1_addr2_line, r1_addr2_city, r1_addr2_state, r1_addr2_zip),
            (r1_addr3_line, r1_addr3_city, r1_addr3_state, r1_addr3_zip),
            (r1_addr4_line, r1_addr4_city, r1_addr4_state, r1_addr4_zip),
        ],
        [r1_phone0, r1_phone1, r1_phone2, r1_phone3, r1_phone4],
        [r1_email0, r1_email1, r1_email2, r1_email3, r1_email4],
        r1_company, r1_parent_company,
        [
            (r1_cf0_name, r1_cf0_val), (r1_cf1_name, r1_cf1_val),
            (r1_cf2_name, r1_cf2_val), (r1_cf3_name, r1_cf3_val),
            (r1_cf4_name, r1_cf4_val),
        ],
    )

    r2 = build_record(
        r2_name, r2_firstname, r2_middlename, r2_lastname,
        r2_mothername, r2_fathername, r2_spousename, r2_othername, r2_dob, r2_gender,
        r2_aadhar, r2_pan, r2_licenseid, r2_passportid, r2_voterid,
        [
            (r2_addr0_line, r2_addr0_city, r2_addr0_state, r2_addr0_zip),
            (r2_addr1_line, r2_addr1_city, r2_addr1_state, r2_addr1_zip),
            (r2_addr2_line, r2_addr2_city, r2_addr2_state, r2_addr2_zip),
            (r2_addr3_line, r2_addr3_city, r2_addr3_state, r2_addr3_zip),
            (r2_addr4_line, r2_addr4_city, r2_addr4_state, r2_addr4_zip),
        ],
        [r2_phone0, r2_phone1, r2_phone2, r2_phone3, r2_phone4],
        [r2_email0, r2_email1, r2_email2, r2_email3, r2_email4],
        r2_company, r2_parent_company,
        [
            (r2_cf0_name, r2_cf0_val), (r2_cf1_name, r2_cf1_val),
            (r2_cf2_name, r2_cf2_val), (r2_cf3_name, r2_cf3_val),
            (r2_cf4_name, r2_cf4_val),
        ],
    )

    try:
        r1_payload = convert_to_api_payload(r1)
        r2_payload = convert_to_api_payload(r2)

        rec1 = EntityRecord(**r1_payload)
        rec2 = EntityRecord(**r2_payload)

        # Matching runs in-process; no HTTP request is made from here.
        result_data = perform_match(rec1, rec2, mode="embedding")

        result = {
            "overall_decision": result_data["overall_decision"],
            "reason": result_data["reason"],
            "field_results": result_data["field_scores"],
        }
        return json.dumps(result, indent=2)

    except Exception as e:
        # Deliberate catch-all: surface any failure as JSON in the result box
        # rather than crashing the UI callback.
        return json.dumps({"error": str(e)}, indent=2)
|
| 447 |
+
|
| 448 |
+
# =========================================================
|
| 449 |
+
# UI BUILDER HELPERS
|
| 450 |
+
# =========================================================
|
| 451 |
+
def section_card(title: str, icon: str = "") -> str:
    """Return the opening HTML markup for a section card.

    The returned string opens the ``section-card`` wrapper, emits the
    ``section-header-gradient`` header containing ``icon`` and ``title``
    separated by a space, and opens the ``section-body`` div. Only the header
    div is closed; the ``section-card`` and ``section-body`` divs are left
    open in the returned string.

    ``title`` and ``icon`` are interpolated as-is (no HTML escaping).
    """
    return f"""
<div class="section-card">
<div class="section-header-gradient">{icon} {title}</div>
<div class="section-body">
"""
|
| 458 |
+
|
| 459 |
+
def personal_fields(prefix):
    """Render the personal-detail textboxes, two per row, for one record.

    ``prefix`` (e.g. ``"r1"``) is prepended to every widget's ``elem_id``.

    Returns the ten textboxes in the order: name, firstname, middlename,
    lastname, mothername, fathername, spousename, othername, dob, gender.
    """
    # Each inner tuple is (label, placeholder, elem_id suffix); each outer
    # tuple is one visual row.
    layout = (
        (("Full Name", "Enter full name", "name"),
         ("First Name", "Enter first name", "firstname")),
        (("Middle Name", "Enter middle name", "middlename"),
         ("Last Name", "Enter last name", "lastname")),
        (("Mother's Name", "Enter mother's name", "mothername"),
         ("Father's Name", "Enter father's name", "fathername")),
        (("Spouse's Name", "Enter spouse's name", "spousename"),
         ("Other Name", "Enter other name", "othername")),
        (("Date of Birth", "YYYY-MM-DD", "dob"),
         ("Gender", "Male/Female/Other", "gender")),
    )
    widgets = []
    for row in layout:
        with gr.Row():
            for label, hint, suffix in row:
                widgets.append(
                    gr.Textbox(label=label, placeholder=hint, elem_id=f"{prefix}_{suffix}")
                )
    return widgets
|
| 482 |
+
|
| 483 |
+
def identifier_fields(prefix):
    """Render the identifier textboxes for one record.

    ``prefix`` is prepended to every widget's ``elem_id``.

    Returns the five textboxes in the order: aadhar, pan, licenseid,
    passportid, voterid.
    """
    # (label, placeholder, elem_id suffix) for the two fully populated rows.
    paired_rows = (
        (("Aadhar Number", "Enter Aadhar number", "aadhar"),
         ("PAN Number", "Enter PAN number", "pan")),
        (("License Number", "Enter license number", "license"),
         ("Passport Number", "Enter passport number", "passport")),
    )
    widgets = []
    for row in paired_rows:
        with gr.Row():
            for label, hint, suffix in row:
                widgets.append(
                    gr.Textbox(label=label, placeholder=hint, elem_id=f"{prefix}_{suffix}")
                )

    # Last row holds a single field plus an empty HTML spacer.
    with gr.Row():
        widgets.append(
            gr.Textbox(label="Voter ID", placeholder="Enter voter ID", elem_id=f"{prefix}_voterid")
        )
        gr.HTML("")  # spacer
    return widgets
|
| 498 |
+
|
| 499 |
+
def address_fields(prefix, slot):
    """Render one address slot (0-indexed) and return its four textboxes.

    Slot 0 is labelled "Primary Address"; later slots are labelled
    "Address <slot+1>". Returns ``(line, city, state, zipcode)``.
    """
    heading = f"Address {slot+1}" if slot != 0 else "Primary Address"
    with gr.Group():
        gr.HTML(f'<div class="subsection-label">{heading}</div>')
        street = gr.Textbox(
            label="Street Address",
            placeholder="Street, Building, Area",
            elem_id=f"{prefix}_addr{slot}_line",
        )
        with gr.Row():
            town = gr.Textbox(
                label="City", placeholder="Enter city", elem_id=f"{prefix}_addr{slot}_city"
            )
            region = gr.Textbox(
                label="State", placeholder="Enter state", elem_id=f"{prefix}_addr{slot}_state"
            )
            pincode = gr.Textbox(
                label="Pincode", placeholder="6-digit postal code", elem_id=f"{prefix}_addr{slot}_zip"
            )
    return street, town, region, pincode
|
| 510 |
+
|
| 511 |
+
def contact_fields(prefix):
    """Render five phone and five email textboxes for one record.

    Returns ``(phone_inputs, email_inputs)``, each a list of five textboxes
    in slot order.
    """
    # NOTE(review): the icon glyphs in the two labels are reproduced exactly
    # as they appear in the file; they look mis-encoded - confirm.
    gr.HTML('<div class="subsection-label">π Phone Numbers</div>')
    phone_inputs = [
        gr.Textbox(
            label=f"Phone {slot+1}",
            placeholder="Enter phone number",
            elem_id=f"{prefix}_phone{slot}",
        )
        for slot in range(5)
    ]

    gr.HTML('<hr class="addr-divider"><div class="subsection-label">βοΈ Email Addresses</div>')
    email_inputs = [
        gr.Textbox(
            label=f"Email {slot+1}",
            placeholder="Enter email address",
            elem_id=f"{prefix}_email{slot}",
        )
        for slot in range(5)
    ]
    return phone_inputs, email_inputs
|
| 523 |
+
|
| 524 |
+
def employment_fields(prefix):
    """Render the company and parent-company textboxes side by side.

    Returns ``(company, parent_company)``.
    """
    with gr.Row():
        employer = gr.Textbox(
            label="Company Name",
            placeholder="Enter company name",
            elem_id=f"{prefix}_company",
        )
        parent = gr.Textbox(
            label="Parent Company Name",
            placeholder="Enter parent company name",
            elem_id=f"{prefix}_pcompany",
        )
    return employer, parent
|
| 529 |
+
|
| 530 |
+
def custom_field_slots(prefix):
    """Render five custom-field rows, each a name textbox and a value textbox.

    Returns a list of five ``(name_widget, value_widget)`` tuples in slot order.
    """
    gr.HTML('<div class="subsection-label">Custom Fields (up to 5)</div>')
    slots = []
    for idx in range(5):
        with gr.Row():
            name_box = gr.Textbox(
                label=f"Field Name {idx+1}",
                placeholder="e.g. MemberID",
                elem_id=f"{prefix}_cf{idx}_name",
            )
            value_box = gr.Textbox(
                label=f"Field Value {idx+1}",
                placeholder="Value",
                elem_id=f"{prefix}_cf{idx}_val",
            )
        slots.append((name_box, value_box))
    return slots
|
| 540 |
+
|
| 541 |
+
|
| 542 |
+
# =========================================================
|
| 543 |
+
# BUILD THE GRADIO APP
|
| 544 |
+
# =========================================================
|
| 545 |
+
def _build_record_column(prefix, heading):
    """Render one record's input column and return its widgets in run_match order.

    Widgets are created in the on-screen order (personal, identifiers, custom
    fields, addresses, contacts, employment) but returned in the positional
    order run_match expects for one record: 10 personal, 5 identifiers,
    20 address fields, 5 phones, 5 emails, 2 employment, 10 custom-field
    widgets (57 in total).
    """
    card_open = (
        '<div class="section-card"><div class="section-header-gradient">'
        '{}</div><div class="section-body">'
    )
    card_close = '</div></div>'

    # NOTE(review): the icon glyphs in the section titles are reproduced
    # exactly as they appear in the file; they look mis-encoded - confirm.
    with gr.Column():
        gr.HTML(f'<div class="record-header">{heading}</div>')

        # Personal details
        gr.HTML(card_open.format("π€ PERSONAL DETAILS"))
        personal = personal_fields(prefix)
        gr.HTML(card_close)

        # Identifiers; the custom fields live in the same card
        gr.HTML(card_open.format("πͺͺ EQUALITIES"))
        ids = identifier_fields(prefix)
        custom_pairs = custom_field_slots(prefix)
        gr.HTML(card_close)

        # Addresses: five slots with a divider between consecutive slots
        gr.HTML(card_open.format("π ADDRESS DETAILS"))
        addr_widgets = []
        for slot in range(5):
            addr_widgets += list(address_fields(prefix, slot))
            if slot < 4:
                gr.HTML('<hr class="addr-divider">')
        gr.HTML(card_close)

        # Contact information
        gr.HTML(card_open.format("π± CONTACT INFORMATION"))
        phones, emails = contact_fields(prefix)
        gr.HTML(card_close)

        # Employment details
        gr.HTML(card_open.format("πΌ EMPLOYMENT DETAILS"))
        company, parent_company = employment_fields(prefix)
        gr.HTML(card_close)

    return (
        personal                                        # 10: name..gender
        + ids                                           # 5: aadhar..voterid
        + addr_widgets                                  # 20: 5 addresses x 4 fields
        + phones                                        # 5
        + emails                                        # 5
        + [company, parent_company]                     # 2
        + [w for pair in custom_pairs for w in pair]    # 10: 5 pairs x 2
    )


def build_app():
    """Build the Gradio Blocks UI and wire the run button to run_match.

    The layout is a header, a backend status line, two side-by-side record
    columns, the run button and a JSON result viewer.

    Returns:
        The ``gr.Blocks`` instance; the caller is responsible for launching it.
    """
    with gr.Blocks(css=CUSTOM_CSS, title="GEN AI Record Level Matching") as demo:

        # -- Header --
        gr.HTML("""
        <div class="app-header">
            <h1>Record Level Matching Using Embedding Models</h1>
            <p>Enter details for two records below and click "Run Record Match" to see the matching result</p>
        </div>
        """)

        # -- Backend status: value comes from check_backend_health, with every=30 --
        gr.HTML(value=check_backend_health, every=30)

        # -- Two-column record layout --
        with gr.Row(equal_height=False):
            r1_inputs = _build_record_column("r1", "Record 1")
            r2_inputs = _build_record_column("r2", "Record 2")

        # -- Run match button --
        run_btn = gr.Button("π RUN RECORD MATCH", variant="primary", elem_id="run-match-btn")

        # -- Result output --
        gr.HTML('<div class="result-box"><div class="result-header">Matching Result (Backend API)</div></div>')
        result_output = gr.Code(label="Result JSON", language="json", lines=20)

        # -- Wire up the button --
        # 57 widgets per record, Record 1 first: 114 inputs in exactly the
        # positional order of run_match()'s signature.
        all_inputs = r1_inputs + r2_inputs

        run_btn.click(
            fn=run_match,
            inputs=all_inputs,
            outputs=result_output,
        )

    return demo
|
| 666 |
+
|
| 667 |
+
|
| 668 |
+
# =========================================================
|
| 669 |
+
# ENTRY POINT
|
| 670 |
+
# =========================================================
|
| 671 |
+
# Build and launch the Gradio UI only when this file is executed as a script
# (importing the module does not start a server).
if __name__ == "__main__":
    app = build_app()
    app.launch()
|
frontend/assests/Logo icon_color.png
ADDED
|
|
none.webp
ADDED
|
note.txt
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Current Version of this application features:
|
| 2 |
+
|
| 3 |
+
1. Dual-mode matching: embedding mode and LLM mode
|
| 4 |
+
2. Data preprocessing using reference data retrieved from CSV files
|
| 5 |
+
3. Pincode Logic has been updated
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
Objective:
|
| 10 |
+
This repository contains the implementation of a **GenAI-based Entity Matching** system. It supports a dual-mode architecture with a FastAPI backend, a Streamlit frontend, and a collection of services for data processing and model interaction.
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
Features:
|
| 14 |
+
|
| 15 |
+
- **Flexible matching service** implemented in `backend/matching_service.py`.
|
| 16 |
+
- **Modular data models** defined in `backend/models.py`.
|
| 17 |
+
- **Streamlit frontend** for quick experimentation (`frontend/app_streamlit.py`).
|
| 18 |
+
- **Configurable rules and LLM model integration** under `services/`.
|
| 19 |
+
- **Extensive test suite** located in `tests/`.
|
| 20 |
+
- **Configuration files** and property management in `backend/config` and `services/config.py`.
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
Active endpoints :
|
| 24 |
+
|
| 25 |
+
POST /backend/v1/match — Match a single pair of records
|
| 26 |
+
POST /backend/v1/match/batch — Match multiple pairs (multithreaded implementation)
|
| 27 |
+
GET /backend/v1/health — Full health check (CSV data, models, LLM)
|
| 28 |
+
GET /backend/v1/health/llm — LLM server health check only
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
To Run the application :
|
| 34 |
+
|
| 35 |
+
for embedding mode:
|
| 36 |
+
models will be loaded when we initiate the server
|
| 37 |
+
|
| 38 |
+
for llm mode:
|
| 39 |
+
set the URL of the running LLM server as the `base-url` value in `backend/config/common.properties`
|
| 40 |
+
|
| 41 |
+
for frontend :
|
| 42 |
+
|
| 43 |
+
python -m streamlit run frontend/app_streamlit.py
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
for backend:
|
| 47 |
+
|
| 48 |
+
python -m uvicorn backend.server:app
|
requirements.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy
|
| 2 |
+
pandas
|
| 3 |
+
python-dateutil
|
| 4 |
+
pytz
|
| 5 |
+
regex
|
| 6 |
+
scipy
|
| 7 |
+
Pillow
|
| 8 |
+
gradio>=4.44.0
|
| 9 |
+
|
| 10 |
+
rapidfuzz==3.9.3
|
| 11 |
+
scikit-learn==1.5.2
|
| 12 |
+
sentence-transformers==2.7.0
|
| 13 |
+
pgeocode==0.5.0
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
openai
|
| 17 |
+
torch --index-url https://download.pytorch.org/whl/cpu
|
| 18 |
+
fastapi
|
| 19 |
+
uvicorn
|
| 20 |
+
requests
|
services/__pycache__/config.cpython-310.pyc
ADDED
|
Binary file (3.2 kB). View file
|
|
|
services/__pycache__/config.cpython-312.pyc
ADDED
|
Binary file (6.51 kB). View file
|
|
|
services/__pycache__/llm_model.cpython-310.pyc
ADDED
|
Binary file (19.1 kB). View file
|
|
|
services/__pycache__/llm_model.cpython-312.pyc
ADDED
|
Binary file (24.2 kB). View file
|
|
|
services/__pycache__/model.cpython-310.pyc
ADDED
|
Binary file (24.8 kB). View file
|
|
|
services/__pycache__/model.cpython-312.pyc
ADDED
|
Binary file (29.6 kB). View file
|
|
|
services/__pycache__/rules.cpython-310.pyc
ADDED
|
Binary file (70.4 kB). View file
|
|
|
services/__pycache__/rules.cpython-312.pyc
ADDED
|
Binary file (55.3 kB). View file
|
|
|
services/address_matcher.py
ADDED
|
@@ -0,0 +1,722 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
address_matcher.py
|
| 3 |
+
==================
|
| 4 |
+
Enhanced Indian address parser + matcher.
|
| 5 |
+
|
| 6 |
+
Reuses existing infrastructure from rules.py and model.py:
|
| 7 |
+
- clean_text, strip_non_alphanumeric, deduplicate_consecutive_tokens (rules.py)
|
| 8 |
+
- standardize_city, standardize_state (rules.py)
|
| 9 |
+
- validate_and_normalize_pincode, pincode_similarity_india (rules.py)
|
| 10 |
+
- extract_address_components (rules.py) — house/flat/apt/street
|
| 11 |
+
- roman_to_number (rules.py)
|
| 12 |
+
- normalize_and_deduplicate_address (rules.py)
|
| 13 |
+
- hno_variation_df, city_prev_pres_df, state_name_standard_df,
|
| 14 |
+
pin_city_state_df, CITY_MAPPING, STATE_MAPPING (config via rules.py)
|
| 15 |
+
- calculate_semantic_similarity, match_entities (model.py)
|
| 16 |
+
- ADDRESS_MODEL_WEIGHTS (config)
|
| 17 |
+
|
| 18 |
+
New additions in this file
|
| 19 |
+
--------------------------
|
| 20 |
+
1. standardize_address_line() — token-level hno / city / state variation replacement
|
| 21 |
+
2. extract_geo_anchors() — PIN / city / state from a *single* address string
|
| 22 |
+
3. extract_extended_components() — block, sector, ward, apt-name, locality on top of
|
| 23 |
+
the existing extract_address_components()
|
| 24 |
+
4. match_address_lines() — full 3-stage pipeline returning score + breakdown
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from __future__ import annotations
|
| 28 |
+
|
| 29 |
+
import re
|
| 30 |
+
import math
|
| 31 |
+
import logging
|
| 32 |
+
from typing import Dict, Optional, Tuple
|
| 33 |
+
|
| 34 |
+
logger = logging.getLogger("address_matcher")
|
| 35 |
+
|
| 36 |
+
# ββ existing imports (your project layout) βββββββββββββββββββββββββββββββββββ
|
| 37 |
+
from services.rules import (
|
| 38 |
+
clean_text,
|
| 39 |
+
strip_non_alphanumeric,
|
| 40 |
+
deduplicate_consecutive_tokens,
|
| 41 |
+
standardize_city,
|
| 42 |
+
standardize_state,
|
| 43 |
+
validate_and_normalize_pincode,
|
| 44 |
+
pincode_similarity_india,
|
| 45 |
+
extract_address_components,
|
| 46 |
+
roman_to_number,
|
| 47 |
+
normalize_and_deduplicate_address,
|
| 48 |
+
replace_with_standard,
|
| 49 |
+
lookup_from_mapping,
|
| 50 |
+
)
|
| 51 |
+
from services.config import (
|
| 52 |
+
hno_variation_df,
|
| 53 |
+
city_prev_pres_df,
|
| 54 |
+
state_name_standard_df,
|
| 55 |
+
pin_city_state_df,
|
| 56 |
+
CITY_MAPPING,
|
| 57 |
+
STATE_MAPPING,
|
| 58 |
+
ADDRESS_MODEL_WEIGHTS,
|
| 59 |
+
)
|
| 60 |
+
from services.model import (
|
| 61 |
+
calculate_semantic_similarity,
|
| 62 |
+
match_entities,
|
| 63 |
+
preprocess_for_matching,
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 67 |
+
# 1. CONSTANTS
|
| 68 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 69 |
+
|
| 70 |
+
# Scoring constants (requirement spec)
|
| 71 |
+
GEO_MISMATCH_SCORE = 20 # hard cap when city/state/PIN mismatch detected
|
| 72 |
+
GEO_MATCH_BOOST = 10 # per matching geo component
|
| 73 |
+
ID_MATCH_BOOST = 30 # numeric identifier components match (gated by embed > 40)
|
| 74 |
+
ID_MISMATCH_PENALTY = 30 # numeric identifier mismatch penalty
|
| 75 |
+
EMBED_GATE_THRESHOLD = 40 # embedding score must exceed this to award ID_MATCH_BOOST
|
| 76 |
+
|
| 77 |
+
# Semantic model to use for address residual comparison
|
| 78 |
+
ADDRESS_EMBED_MODEL = "model2" # change to "model1" if preferred
|
| 79 |
+
|
| 80 |
+
# ββ Extended component regex patterns ββββββββββββββββββββββββββββββββββββββββ
|
| 81 |
+
_BLOCK_RE = re.compile(
|
| 82 |
+
r'\b(?:block|blk|bl)\.?\s*(?:no\.?\s*)?([a-z0-9]{1,4})\b', re.I)
|
| 83 |
+
_SECTOR_RE = re.compile(
|
| 84 |
+
r'\b(?:sector|sec)\.?\s*(?:no\.?\s*)?(\d{1,3}[a-z]?)\b', re.I)
|
| 85 |
+
_WARD_RE = re.compile(
|
| 86 |
+
r'\b(?:ward)\.?\s*(?:no\.?\s*)?(\d{1,3}[a-z]?)\b', re.I)
|
| 87 |
+
_PHASE_RE = re.compile(
|
| 88 |
+
r'\b(?:phase|ph)\.?\s*(?:no\.?\s*)?(\d{1,2})\b', re.I)
|
| 89 |
+
_PLOT_RE = re.compile(
|
| 90 |
+
r'\b(?:plot|plt)\.?\s*(?:no\.?\s*)?([a-z0-9]{1,6}(?:[/-][a-z0-9]{1,4})?)\b', re.I)
|
| 91 |
+
|
| 92 |
+
# PIN: 6 digits, first digit 1-9 (valid Indian PIN range)
|
| 93 |
+
_PIN_RE = re.compile(r'(?<!\d)([1-9]\d{5})(?!\d)')
|
| 94 |
+
|
| 95 |
+
# City / State boundary markers (help isolate tail of address)
|
| 96 |
+
_TAIL_SEP = re.compile(r'[-β,]\s*')
|
| 97 |
+
|
| 98 |
+
# Identifier component keys compared one by one; each mismatch costs
# ID_MISMATCH_PENALTY.
HARD_ID_KEYS = (
    'house_number', 'flat_number', 'block', 'sector', 'ward', 'plot', 'phase',
)

# Geo-anchor keys; a mismatch on any of them short-circuits the pipeline with
# GEO_MISMATCH_SCORE.
GEO_KEYS = ('pin', 'city', 'state')
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 106 |
+
# 2. STEP 1 β ADDRESS STANDARDIZATION
|
| 107 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββοΏ½οΏ½ββββββββββββββββββββ
|
| 108 |
+
|
| 109 |
+
def _build_hno_lookup() -> Dict[str, str]:
    """
    Build the variation -> standard token lookup from ``hno_variation_df``.

    Column names are lower-cased and the 'variation' and 'standard' columns
    are read; keys and values are stripped and lower-cased.  Rows where either
    cell is missing (NaN / None) or blank are skipped, so an empty CSV cell can
    no longer leak the literal token "nan" into the lookup.

    The result is memoised on the function object (``_cache``), so the frame
    is scanned at most once per process.

    Returns:
        Mapping of variation token/phrase to its standard form.  Empty when
        the frame is None, empty, or lacks either expected column.
    """
    cached = getattr(_build_hno_lookup, '_cache', None)
    if cached is not None:
        return cached

    lookup: Dict[str, str] = {}
    if hno_variation_df is not None and not hno_variation_df.empty:
        frame = hno_variation_df.copy()
        frame.columns = frame.columns.str.lower()
        if 'variation' in frame.columns and 'standard' in frame.columns:
            # dropna() removes rows with a missing cell before str() could
            # turn NaN into the truthy string "nan".
            pairs = frame[['variation', 'standard']].dropna()
            for raw_var, raw_std in zip(pairs['variation'], pairs['standard']):
                var = str(raw_var).strip().lower()
                std = str(raw_std).strip().lower()
                if var and std:
                    lookup[var] = std

    _build_hno_lookup._cache = lookup
    return lookup
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def standardize_address_line(address: str) -> str:
    """
    Stage 0 - token-level standardization of a raw address string.

    Steps (in order):
      1. clean_text()
      2. roman_to_number()  (e.g. "Sector IV" -> "Sector 4")
      3. Replace house-number variation tokens/phrases via _build_hno_lookup()
      4. Replace city tokens via standardize_city()
      5. Replace state tokens via standardize_state()
      6. Collapse duplicate consecutive tokens
      7. Strip leading/trailing spaces, commas, dots and hyphens

    For steps 4-5 a two-word candidate is tried before the single word, and a
    replacement is taken only when the standardizer returns something that
    differs (case-insensitively) from its input.  Tokens consumed by a
    two-word replacement are skipped; previously the blanked slot was fed back
    into the standardizers together with the following token.

    Returns:
        The standardized address string, or "" for empty input.
        NOTE(review): replacement values are inserted exactly as returned by
        standardize_city()/standardize_state(); confirm their casing if a
        strictly lower-case result is required downstream.
    """
    if not address:
        return ""

    text = clean_text(str(address))   # step 1
    text = roman_to_number(text)      # step 2

    # step 3 - house-number variation replacement on whole tokens
    hno_lookup = _build_hno_lookup()
    if hno_lookup:
        tokens = text.split()
        replaced = []
        pos = 0
        while pos < len(tokens):
            # two-token phrases first (e.g. "h no", "door no")
            pair = ''
            if pos + 1 < len(tokens):
                pair = (tokens[pos] + ' ' + tokens[pos + 1]).lower()
            if pair in hno_lookup:
                replaced.append(hno_lookup[pair])
                pos += 2
                continue
            single = tokens[pos].lower().rstrip('.')
            replaced.append(hno_lookup.get(single, tokens[pos]))
            pos += 1
        text = ' '.join(replaced)

    def _canonical(candidate: str) -> Optional[str]:
        """Return the city (then state) standard form if it differs, else None."""
        for standardizer in (standardize_city, standardize_state):
            std = standardizer(candidate)
            if std and std.lower() != candidate.lower():
                return std
        return None

    # steps 4+5 - city / state replacement on whole tokens
    words = text.split()
    rebuilt = []
    pos = 0
    while pos < len(words):
        word = words[pos]
        if pos + 1 < len(words):
            merged = _canonical(word + ' ' + words[pos + 1])
            if merged is not None:
                rebuilt.append(merged)
                pos += 2          # both tokens are consumed by the phrase
                continue
        single = _canonical(word)
        rebuilt.append(single if single is not None else word)
        pos += 1
    text = ' '.join(rebuilt)

    # steps 6+7 - dedup consecutive tokens, strip stray punctuation
    text = deduplicate_consecutive_tokens(text)
    text = re.sub(r'\s+', ' ', text).strip(' ,.-')
    return text
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 207 |
+
# 3. STEP 2 β GEO-ANCHOR EXTRACTION (PIN / city / state)
|
| 208 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 209 |
+
|
| 210 |
+
def _pin_from_text(text: str) -> Optional[str]:
    """
    Return the first PIN candidate in *text* accepted by
    validate_and_normalize_pincode(), in its normalized form, or None.

    Candidates are the group-1 matches of _PIN_RE, tried in order of
    appearance.
    """
    candidates = (match.group(1) for match in _PIN_RE.finditer(text))
    normalized_pins = (validate_and_normalize_pincode(c) for c in candidates)
    # Both generators are lazy, so validation stops at the first accepted PIN.
    return next((pin for pin in normalized_pins if pin), None)
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def _city_from_text(text: str) -> Optional[str]:
    """
    Scan whitespace-separated tokens of *text* for a city name.

    At every position the two-word candidate (when one exists) is tried before
    the single word.  A candidate resolves when standardize_city() returns a
    value that differs case-insensitively from it, or otherwise when
    lookup_from_mapping() finds its upper-cased form in CITY_MAPPING.

    Returns:
        The first resolved city, lower-cased, or None.
    """
    def _resolve(candidate: str) -> Optional[str]:
        standardized = standardize_city(candidate)
        if standardized and standardized.lower() != candidate.lower():
            return standardized.lower()
        known = lookup_from_mapping(candidate.upper(), CITY_MAPPING)
        return known.lower() if known else None

    tokens = text.split()
    last = len(tokens) - 1
    for pos, token in enumerate(tokens):
        candidates = [token] if pos == last else [token + ' ' + tokens[pos + 1], token]
        for candidate in candidates:
            city = _resolve(candidate)
            if city is not None:
                return city
    return None
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def _state_from_text(text: str) -> Optional[str]:
    """
    Scan whitespace-separated tokens of *text* for a state name.

    At every position the two-word candidate (when one exists) is tried before
    the single word.  A candidate resolves when standardize_state() returns a
    value that differs case-insensitively from it, or otherwise when
    lookup_from_mapping() finds its upper-cased form in STATE_MAPPING.

    Returns:
        The first resolved state, lower-cased, or None.
    """
    def _resolve(candidate: str) -> Optional[str]:
        standardized = standardize_state(candidate)
        if standardized and standardized.lower() != candidate.lower():
            return standardized.lower()
        known = lookup_from_mapping(candidate.upper(), STATE_MAPPING)
        return known.lower() if known else None

    tokens = text.split()
    last = len(tokens) - 1
    for pos, token in enumerate(tokens):
        candidates = [token] if pos == last else [token + ' ' + tokens[pos + 1], token]
        for candidate in candidates:
            state = _resolve(candidate)
            if state is not None:
                return state
    return None
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def _enrich_from_pincode(pin: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Look up city and state for a PIN code.

    The local ``pin_city_state_df`` table is consulted first: the first column
    whose lower-cased name contains 'pin' is compared (as a zero-padded
    6-character string) against *pin*, and the first columns containing 'city'
    and 'state' supply the values.  If that does not yield both values,
    pgeocode is tried as a best-effort fallback; any failure there (including
    pgeocode not being installed) is logged at debug level and ignored.

    Returns:
        (city, state) as returned by standardize_city()/standardize_state();
        either element may be None when it could not be determined.
    """
    city, state = None, None

    # -- local table first ------------------------------------------------
    if pin_city_state_df is not None and not pin_city_state_df.empty:
        table = pin_city_state_df
        # Resolve columns case-insensitively without copying the whole table
        # on every call.
        lowered = [(str(col).lower(), col) for col in table.columns]

        def _first_col(fragment: str):
            return next((col for low, col in lowered if fragment in low), None)

        pin_col = _first_col('pin')
        if pin_col is not None:
            rows = table[table[pin_col].astype(str).str.zfill(6) == pin]
            if not rows.empty:
                first = rows.iloc[0]
                city_col = _first_col('city')
                state_col = _first_col('state')
                if city_col is not None:
                    city = str(first[city_col]).strip().lower()
                if state_col is not None:
                    state = str(first[state_col]).strip().lower()
                if city and state:
                    return standardize_city(city), standardize_state(state)

    def _scalar_text(raw) -> Optional[str]:
        """Unwrap a pgeocode field to stripped lower-case text; None if empty/NaN."""
        if raw is None:
            return None
        value = raw.values[0] if hasattr(raw, 'values') else raw
        if value and not (isinstance(value, float) and math.isnan(value)):
            return str(value).strip().lower()
        return None

    # -- pgeocode fallback (best effort) ----------------------------------
    try:
        import pgeocode
        nomi = pgeocode.Nominatim('in')
        result = nomi.query_postal_code(pin)
        if result is not None and not result.empty:
            raw_city = _scalar_text(getattr(result, 'county_name', None))
            raw_state = _scalar_text(getattr(result, 'state_name', None))
            if raw_city is not None:
                city = standardize_city(raw_city)
            if raw_state is not None:
                state = standardize_state(raw_state)
    except Exception as exc:
        # Deliberately non-fatal: enrichment is optional, but leave a trace
        # instead of swallowing the error silently.
        logger.debug("pgeocode lookup failed for PIN %s: %s", pin, exc)

    return city, state
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
def extract_geo_anchors(standardized_address: str) -> Dict[str, Optional[str]]:
    """
    Extract the geo anchors {pin, city, state} from a standardized address.

    PIN, city and state are each scanned from the text.  When a PIN was found
    and city or state is still missing, _enrich_from_pincode() is used to fill
    only the missing field(s).  Fields that cannot be determined stay None.
    """
    anchors: Dict[str, Optional[str]] = {
        'pin': _pin_from_text(standardized_address),
        'city': _city_from_text(standardized_address),
        'state': _state_from_text(standardized_address),
    }

    fully_resolved = bool(anchors['city'] and anchors['state'])
    if anchors['pin'] and not fully_resolved:
        pin_city, pin_state = _enrich_from_pincode(anchors['pin'])
        # Values scanned from the text take precedence over PIN-derived ones.
        if pin_city and not anchors['city']:
            anchors['city'] = pin_city
        if pin_state and not anchors['state']:
            anchors['state'] = pin_state

    return anchors
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
def _remove_geo_tokens(text: str, pin: Optional[str],
|
| 351 |
+
city: Optional[str], state: Optional[str]) -> str:
|
| 352 |
+
"""
|
| 353 |
+
Strip extracted geo tokens from the address string so they don't
|
| 354 |
+
contaminate the residual that goes to the embedding model.
|
| 355 |
+
"""
|
| 356 |
+
if pin:
|
| 357 |
+
text = re.sub(re.escape(pin), ' ', text)
|
| 358 |
+
if city:
|
| 359 |
+
text = re.sub(re.escape(city), ' ', text, flags=re.I)
|
| 360 |
+
if state:
|
| 361 |
+
text = re.sub(re.escape(state), ' ', text, flags=re.I)
|
| 362 |
+
return re.sub(r'\s+', ' ', text).strip(' ,.-')
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 366 |
+
# 4. STEP 3 β EXTENDED COMPONENT EXTRACTION
|
| 367 |
+
# Wraps existing extract_address_components() and adds block/sector/ward/etc.
|
| 368 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 369 |
+
|
| 370 |
+
def _extract_pattern(pattern: re.Pattern, text: str) -> Optional[str]:
|
| 371 |
+
"""Return first group of first match, or None."""
|
| 372 |
+
m = pattern.search(text)
|
| 373 |
+
return m.group(1).strip().upper() if m else None
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
def _remove_pattern_match(pattern: re.Pattern, text: str) -> str:
|
| 377 |
+
"""Remove the entire match (not just group 1) from text."""
|
| 378 |
+
return re.sub(pattern, ' ', text, count=1, flags=re.I)
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def extract_extended_components(standardized_address: str,
                                geo: Dict[str, Optional[str]]) -> Dict:
    """
    Full component extraction pipeline.

    Returns a dict with keys:
        house_number, flat_number, apartment_name, street  - taken from
            extract_address_components() ('apartment_name' is its 'apartment')
        block, sector, ward, phase, plot                   - regex patterns
            applied, in that order, to the text the base extractor left over
        residual                                           - cleaned leftover
            text intended for the embedding comparison
    """
    # Step A: drop geo tokens before handing the text to the rules extractor.
    stripped = _remove_geo_tokens(
        standardized_address,
        geo.get('pin'), geo.get('city'), geo.get('state'),
    )

    # Step B: existing extractor (house / flat / apartment / street).
    base = extract_address_components(stripped)
    leftover = base.get('remaining_address', stripped)

    # Step C: extended patterns, each consuming its match from the leftover.
    extended = {}
    for key, pattern in (
        ('block', _BLOCK_RE),
        ('sector', _SECTOR_RE),
        ('ward', _WARD_RE),
        ('phase', _PHASE_RE),
        ('plot', _PLOT_RE),
    ):
        value = _extract_pattern(pattern, leftover)
        if value:
            leftover = _remove_pattern_match(pattern, leftover)
        extended[key] = value

    # Step D: final cleanup of the residual.
    residual = normalize_and_deduplicate_address(strip_non_alphanumeric(leftover))
    residual = re.sub(r'\s+', ' ', residual).strip()

    return {
        'house_number': base.get('house_number'),
        'flat_number': base.get('flat_number'),
        'apartment_name': base.get('apartment'),
        'street': base.get('street'),
        **extended,
        'residual': residual,
    }
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 448 |
+
# 5. STAGE 1 β GEO-ANCHOR COMPARISON
|
| 449 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 450 |
+
|
| 451 |
+
def _compare_geo(geo1: Dict, geo2: Dict) -> Tuple[Optional[int], int, Dict]:
    """
    Compare the geo-anchor components of two parsed addresses.

    Returns:
        (early_exit_score_or_None, geo_boost, details_dict)

    Rules, applied to PIN, then city, then state:
      - present in BOTH and different -> return (GEO_MISMATCH_SCORE, 0, details)
        immediately
      - present in BOTH and equal     -> geo_boost += GEO_MATCH_BOOST
      - absent in either              -> skipped (no boost, no penalty)

    PIN special case: unequal PINs are passed to pincode_similarity_india();
    a 'similarity_score' of 60 or more is not a hard mismatch and earns half
    of GEO_MATCH_BOOST.  A missing/None result or score is treated as 0.
    City and state are compared case-insensitively.

    NOTE(review): the arrow in the MISMATCH messages was mis-encoded in the
    reviewed copy and is written here as "→"; confirm against the repository.
    """
    geo_boost = 0
    details: Dict = {}

    # -- PIN comparison ---------------------------------------------------
    p1, p2 = geo1.get('pin'), geo2.get('pin')
    if p1 and p2:
        if p1 == p2:
            geo_boost += GEO_MATCH_BOOST
            details['pin'] = f'match ({p1}) +{GEO_MATCH_BOOST}'
        else:
            # Tolerate a helper that returns None instead of a dict.
            pin_result = pincode_similarity_india(p1, p2) or {}
            sim = pin_result.get('similarity_score', 0) or 0
            if sim < 60:
                details['pin'] = f'MISMATCH ({p1} vs {p2}, sim={sim}) → exit={GEO_MISMATCH_SCORE}'
                return GEO_MISMATCH_SCORE, 0, details
            # Close PINs: partial boost only.
            geo_boost += GEO_MATCH_BOOST // 2
            details['pin'] = f'metro-close ({p1} vs {p2}, sim={sim}) +{GEO_MATCH_BOOST // 2}'

    # -- CITY then STATE comparison (identical rule for both) --------------
    for key in ('city', 'state'):
        v1, v2 = geo1.get(key), geo2.get(key)
        if not (v1 and v2):
            continue
        if v1.lower() == v2.lower():
            geo_boost += GEO_MATCH_BOOST
            details[key] = f'match ({v1}) +{GEO_MATCH_BOOST}'
        else:
            details[key] = f'MISMATCH ({v1} vs {v2}) → exit={GEO_MISMATCH_SCORE}'
            return GEO_MISMATCH_SCORE, 0, details

    return None, geo_boost, details
|
| 508 |
+
|
| 509 |
+
|
| 510 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 511 |
+
# 6. STAGE 2 β HARD IDENTIFIER COMPONENT COMPARISON
|
| 512 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 513 |
+
|
| 514 |
+
def _normalize_id(value: Optional[str]) -> Optional[str]:
|
| 515 |
+
"""
|
| 516 |
+
Normalize a hard identifier value for comparison.
|
| 517 |
+
Strips whitespace, uppercase, removes separators (- / .).
|
| 518 |
+
'4-B', '4B', '4/B' β '4B'
|
| 519 |
+
"""
|
| 520 |
+
if not value:
|
| 521 |
+
return None
|
| 522 |
+
return re.sub(r'[\s\-/.]', '', str(value).strip().upper())
|
| 523 |
+
|
| 524 |
+
|
| 525 |
+
def _compare_identifiers(comp1: Dict, comp2: Dict,
                         embed_score: float) -> Tuple[int, Dict]:
    """
    Compare the hard identifier components (HARD_ID_KEYS) of two addresses.

    Values are normalized with _normalize_id() before comparison.

    Rules, per key:
      - both present and equal     -> +ID_MATCH_BOOST, but only when
                                      embed_score > EMBED_GATE_THRESHOLD
      - both present and different -> -ID_MISMATCH_PENALTY (never gated)
      - present in only one        -> 0, recorded as skipped
      - absent in both             -> not recorded

    Returns:
        (identifier_delta, details_dict)

    NOTE(review): the placeholder and arrow glyphs in the 'absent in one'
    message were mis-encoded in the reviewed copy and are written here as
    "—" and "→"; confirm against the repository.
    """
    id_delta = 0
    details: Dict = {}

    for key in HARD_ID_KEYS:
        v1 = _normalize_id(comp1.get(key))
        v2 = _normalize_id(comp2.get(key))

        if v1 and v2:
            if v1 != v2:
                id_delta -= ID_MISMATCH_PENALTY
                details[key] = f'MISMATCH ({v1} vs {v2}) -{ID_MISMATCH_PENALTY}'
            elif embed_score > EMBED_GATE_THRESHOLD:
                id_delta += ID_MATCH_BOOST
                details[key] = f'match ({v1}) +{ID_MATCH_BOOST}'
            else:
                # The boost is also withheld when the score equals the gate,
                # hence "<=" rather than "<".
                details[key] = f'match ({v1}) but embed={embed_score:.1f} <= gate → no boost'
        elif v1 or v2:
            details[key] = f'absent in one ({v1 or "—"} vs {v2 or "—"}) → skip'

    return id_delta, details
|
| 558 |
+
|
| 559 |
+
|
| 560 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 561 |
+
# 7. STAGE 3 β EMBEDDING / RESIDUAL COMPARISON
|
| 562 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 563 |
+
|
| 564 |
+
def _compute_embed_score(residual1: str, residual2: str) -> float:
    """
    Score the similarity of two residual address strings on a 0-100 scale.

    Both inputs go through preprocess_for_matching(); if either comes back
    empty the score is 0.0.  When either preprocessed string is shorter than
    four characters, or when the embedding call raises, match_entities() with
    ADDRESS_MODEL_WEIGHTS is used instead.  Otherwise the result of
    calculate_semantic_similarity() is multiplied by 100 and clamped to
    [0, 100].
    """
    left = preprocess_for_matching(residual1)
    right = preprocess_for_matching(residual2)

    if not (left and right):
        return 0.0

    def _blended() -> float:
        return float(match_entities(left, right, weights=ADDRESS_MODEL_WEIGHTS))

    # Very short residuals: pure embedding is considered unreliable here.
    if min(len(left), len(right)) < 4:
        return _blended()

    try:
        similarity = calculate_semantic_similarity(ADDRESS_EMBED_MODEL, left, right)
        return max(0.0, min(100.0, float(similarity) * 100))
    except Exception as e:
        logger.warning(f"Embedding model error: {e}; falling back to match_entities")
        return _blended()
|
| 587 |
+
|
| 588 |
+
|
| 589 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 590 |
+
# 8. MASTER PIPELINE β match_address_lines()
|
| 591 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 592 |
+
|
| 593 |
+
def match_address_lines(addr1: str, addr2: str) -> Dict:
    """
    Run the full address matching pipeline on two raw address strings.

    Stage 0: standardize both strings.
    Stage 1: extract and compare geo anchors (PIN / city / state).
    Stage 2: extract components (house / flat / block / sector / ward /
             phase / plot) and compare the hard identifiers.
    Stage 3: embedding comparison of the residual text; it runs before the
             identifier comparison because the identifier boost is gated on it.

    Final score: clamp(embed_score + geo_boost + id_delta, 0, 100).

    Early exit: when _compare_geo() reports a mismatch, its score is returned
    immediately with early_exit=True, zeroed embed_score / geo_boost /
    id_delta and empty component dicts.

    Returns a dict with keys:
        score         float (0-100)
        early_exit    bool
        embed_score   float
        geo_boost     int
        id_delta      int
        breakdown     dict of per-stage detail
        components_1  parsed components for addr1
        components_2  parsed components for addr2
    """
    breakdown: Dict = {}

    def _package(score, early_exit, embed, boost, delta, parts_1, parts_2) -> Dict:
        return {
            'score': score,
            'early_exit': early_exit,
            'embed_score': embed,
            'geo_boost': boost,
            'id_delta': delta,
            'breakdown': breakdown,
            'components_1': parts_1,
            'components_2': parts_2,
        }

    # Stage 0: standardize
    std1, std2 = standardize_address_line(addr1), standardize_address_line(addr2)
    breakdown['standardized'] = {'addr1': std1, 'addr2': std2}

    # Stage 1: geo anchors
    geo1, geo2 = extract_geo_anchors(std1), extract_geo_anchors(std2)
    breakdown['geo_components'] = {'addr1': geo1, 'addr2': geo2}

    early_exit_score, geo_boost, geo_detail = _compare_geo(geo1, geo2)
    breakdown['geo_comparison'] = geo_detail

    if early_exit_score is not None:
        return _package(float(early_exit_score), True, 0.0, 0, 0, {}, {})

    # Stage 2: component extraction
    comp1 = extract_extended_components(std1, geo1)
    comp2 = extract_extended_components(std2, geo2)
    breakdown['components'] = {'addr1': comp1, 'addr2': comp2}

    # Stage 3: residual embedding (needed before the identifier comparison)
    residual1 = comp1.get('residual', '')
    residual2 = comp2.get('residual', '')
    embed_score = _compute_embed_score(residual1, residual2)
    embed_rounded = round(embed_score, 2)
    breakdown['embed'] = {
        'residual_1': residual1,
        'residual_2': residual2,
        'embed_score': embed_rounded,
    }

    # Stage 2 (cont.): identifier comparison, gated by the embed score
    id_delta, id_detail = _compare_identifiers(comp1, comp2, embed_score)
    breakdown['id_comparison'] = id_detail

    # Final score composition
    raw_final = embed_score + geo_boost + id_delta
    final = max(0.0, min(100.0, raw_final))
    final_rounded = round(final, 2)

    breakdown['score_composition'] = {
        'embed_score': embed_rounded,
        'geo_boost': geo_boost,
        'id_delta': id_delta,
        'raw': round(raw_final, 2),
        'final': final_rounded,
    }

    return _package(final_rounded, False, embed_rounded, geo_boost, id_delta, comp1, comp2)
|
| 695 |
+
|
| 696 |
+
|
| 697 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 698 |
+
# 9. DROP-IN REPLACEMENT FOR match_addresses_1_to_n() (model.py)
|
| 699 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 700 |
+
|
| 701 |
+
def match_addresses_enhanced(addresses_r1: list, addresses_r2: list) -> float:
    """
    1:N address matching using match_address_lines().

    Entries that are falsy, blank or just '-' are ignored.  Every remaining
    address of the first list is scored against every remaining address of
    the second.

    Returns:
        The highest pair score found, or 0.0 when either list has no usable
        address or no pair scores above zero.
    """
    def _usable(addresses: list) -> list:
        # A stripped string can never equal ' ', so '' and '-' are the only
        # placeholders that need checking.
        return [a for a in addresses if a and str(a).strip() not in ('', '-')]

    left = _usable(addresses_r1)
    right = _usable(addresses_r2)
    if not (left and right):
        return 0.0

    pair_scores = [
        match_address_lines(str(a1), str(a2)).get('score', 0.0)
        for a1 in left
        for a2 in right
    ]
    # Seeding with 0.0 keeps the floor of the original running maximum.
    return max([0.0, *pair_scores])
|
services/config.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import configparser
|
| 2 |
+
import os
|
| 3 |
+
import ast
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import logging
|
| 6 |
+
|
| 7 |
+
logger = logging.getLogger("config")

# Parser holding the values read from common.properties.
config = configparser.ConfigParser()

# Two directory levels above this file.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Prefer backend/config/common.properties, fall back to config/common.properties.
conf_file_path = os.path.join(project_root, "backend", "config", "common.properties")
if not os.path.exists(conf_file_path):
    conf_file_path = os.path.join(project_root, "config", "common.properties")
    if not os.path.exists(conf_file_path):
        raise FileNotFoundError(f"common.properties not found. Searched in backend/config/ and config/ under {project_root}")

config.read(conf_file_path)
logger.info(f"Config loaded from: {conf_file_path}")

# [IDENTIFIERS] - raw strings, except INDIAN_SURNAMES which is a Python literal.
APARTMENT_IDENTIFIER = config.get("IDENTIFIERS", "APARTMENT_IDENTIFIER")
FLAT_NUMBER_IDENTIFIER = config.get("IDENTIFIERS", "FLAT_NUMBER_IDENTIFIER")
HOUSE_NUMBER_IDENTIFIER = config.get("IDENTIFIERS", "HOUSE_NUMBER_IDENTIFIER")
STREET_KEYWORD = config.get("IDENTIFIERS", "STREET_KEYWORD")
FLOOR_NO_KEYWORD = config.get("IDENTIFIERS", "FLOOR_KEYWORD")
SURNAME_IDENTIFIER = ast.literal_eval(config.get("IDENTIFIERS", "INDIAN_SURNAMES"))

# [MAPPING_DICT] - Python literals parsed with ast.literal_eval.
STATE_MAPPING = ast.literal_eval(config.get("MAPPING_DICT", "STATE_MAPPING"))
CITY_MAPPING = ast.literal_eval(config.get("MAPPING_DICT", "CITY_MAPPING"))
ADDRESS_MAPPING = ast.literal_eval(config.get("MAPPING_DICT", "ADDRESS_MAPPING"))

# [MATCHING_LOGIC]
MODEL_WEIGHTS = ast.literal_eval(config.get("MATCHING_LOGIC", "MODEL_WEIGHTS"))
MATCHING_RULES = ast.literal_eval(config.get("MATCHING_LOGIC", "MATCHING_RULES"))

# [NAME_MATCHING] is optional: if either entry cannot be read or parsed, both
# names fall back to the generic weights and built-in adjustments.
try:
    NAME_MODEL_WEIGHTS = ast.literal_eval(config.get("NAME_MATCHING", "NAME_MODEL_WEIGHTS"))
    NAME_MATCH_ADJUSTMENTS = ast.literal_eval(config.get("NAME_MATCHING", "NAME_MATCH_ADJUSTMENTS"))
except Exception:
    NAME_MODEL_WEIGHTS = MODEL_WEIGHTS
    NAME_MATCH_ADJUSTMENTS = {"surname_penalty": -30, "initial_boost": 30, "subset_boost": 40}
|
| 53 |
+
|
| 54 |
+
# Address-specific weights (embedding + fuzz, no phonetic)
|
| 55 |
+
try:
|
| 56 |
+
ADDRESS_MODEL_WEIGHTS = ast.literal_eval(config.get("ADDRESS_MATCHING", "ADDRESS_MODEL_WEIGHTS"))
|
| 57 |
+
ADDRESS_MATCH_ADJUSTMENTS = ast.literal_eval(config.get("ADDRESS_MATCHING", "ADDRESS_MATCH_ADJUSTMENTS"))
|
| 58 |
+
except Exception:
|
| 59 |
+
ADDRESS_MODEL_WEIGHTS = MODEL_WEIGHTS
|
| 60 |
+
ADDRESS_MATCH_ADJUSTMENTS = {"house_match_boost": 30, "house_mismatch_penalty": 70}
|
| 61 |
+
|
| 62 |
+
try:
|
| 63 |
+
MODEL_1_NAME = config.get("EMBEDDING_MODELS", "MODEL_1_NAME").strip()
|
| 64 |
+
MODEL_2_NAME = config.get("EMBEDDING_MODELS", "MODEL_2_NAME").strip()
|
| 65 |
+
except Exception:
|
| 66 |
+
MODEL_1_NAME = "sentence-transformers/all-mpnet-base-v2"
|
| 67 |
+
MODEL_2_NAME = "sentence-transformers/all-MiniLM-L6-v2"
|
| 68 |
+
|
| 69 |
+
# =========================================================
|
| 70 |
+
# CSV DATA LOADING (replacing MySQL)
|
| 71 |
+
# =========================================================
|
| 72 |
+
|
| 73 |
+
def load_csv_file(csv_path: str, file_name: str) -> pd.DataFrame:
    """Read a reference CSV into a DataFrame without ever raising.

    Args:
        csv_path: Absolute path, or a path relative to ``project_root``.
        file_name: Label used only in log messages.

    Returns:
        The parsed DataFrame, or an empty DataFrame when the file is missing
        or any error occurs while resolving or reading it.
    """
    try:
        # Relative paths are anchored at the project root.
        resolved = csv_path if os.path.isabs(csv_path) else os.path.join(project_root, csv_path)
        # abspath also collapses any ./ or ../ segments.
        resolved = os.path.abspath(resolved)

        if not os.path.exists(resolved):
            logger.warning(f"CSV file not found: {resolved}")
            return pd.DataFrame()

        frame = pd.read_csv(resolved)
        logger.info(f"Loaded {file_name}: {len(frame)} rows from {resolved}")
        return frame
    except Exception as e:
        # Best-effort loader: report the failure and hand back an empty table.
        logger.error(f"Failed to load {file_name}: {e}")
        return pd.DataFrame()
|
| 93 |
+
|
| 94 |
+
# Load CSV reference tables
|
| 95 |
+
def _load_reference_table(option: str) -> pd.DataFrame:
    """Load the CSV whose path is stored under ``[csv] <option>``.

    Returns an empty DataFrame when the option is not configured or when
    ``load_csv_file`` cannot read the file. Each table is resolved on its own
    so one missing entry does not discard the tables that did load.
    """
    try:
        csv_path = config.get("csv", option)
    except configparser.Error as exc:
        logger.warning("No usable [csv] entry for %s: %s", option, exc)
        return pd.DataFrame()
    return load_csv_file(csv_path, option)


name_variation_df = _load_reference_table("name_variation_standard")
hno_variation_df = _load_reference_table("hno_variation_standard")
city_prev_pres_df = _load_reference_table("city_prev_pres")
state_name_standard_df = _load_reference_table("state_name_standard")
sur_comm_names_df = _load_reference_table("sur_comm_names")
pin_city_state_df = _load_reference_table("pin_city_state")

# Report the real outcome: load_csv_file never raises, so an empty frame is the
# only signal that a table is missing (or genuinely has no rows).
_empty_reference_tables = [
    table_name
    for table_name, table in (
        ("name_variation_standard", name_variation_df),
        ("hno_variation_standard", hno_variation_df),
        ("city_prev_pres", city_prev_pres_df),
        ("state_name_standard", state_name_standard_df),
        ("sur_comm_names", sur_comm_names_df),
        ("pin_city_state", pin_city_state_df),
    )
    if table.empty
]
if _empty_reference_tables:
    logger.warning(
        "Reference tables empty or unavailable: %s",
        ", ".join(_empty_reference_tables),
    )
else:
    logger.info("All CSV files loaded successfully")

# Legacy string exports for backward compatibility
pin_city_state = "pin_city_state"
sur_comm_names = "sur_comm_names"
city_prev_pres = "city_prev_pres"
state_name_standard = "state_name_standard"
hno_variation_standard = "hno_variation_standard"
name_variation_standard = "name_variation_standard"
|
| 120 |
+
|
| 121 |
+
|
services/model.py
ADDED
|
@@ -0,0 +1,1509 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 2 |
+
from typing import Dict, List, Optional, Tuple
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from rapidfuzz import fuzz
|
| 5 |
+
from rapidfuzz.distance import JaroWinkler
|
| 6 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 7 |
+
from sentence_transformers import SentenceTransformer
|
| 8 |
+
import re
|
| 9 |
+
import itertools
|
| 10 |
+
|
| 11 |
+
from services.config import (
|
| 12 |
+
SURNAME_IDENTIFIER, MODEL_WEIGHTS, MODEL_1_NAME, MODEL_2_NAME,
|
| 13 |
+
NAME_MODEL_WEIGHTS, NAME_MATCH_ADJUSTMENTS,
|
| 14 |
+
ADDRESS_MODEL_WEIGHTS,
|
| 15 |
+
)
|
| 16 |
+
from services.rules import detect_surnames, compute_initial_letter_boost, is_subset_match
|
| 17 |
+
|
| 18 |
+
# ---------- Model Store ----------
|
| 19 |
+
MODEL_STORE = {}


def get_model(model_name: str) -> "SentenceTransformer":
    """Return the cached sentence-transformer for an alias, loading it on first use.

    Args:
        model_name: ``"model1"`` (backed by ``MODEL_1_NAME``) or ``"model2"``
            (backed by ``MODEL_2_NAME``).

    Returns:
        The ``SentenceTransformer`` instance stored in ``MODEL_STORE``.

    Raises:
        KeyError: If ``model_name`` is not one of the supported aliases.
    """
    if model_name in MODEL_STORE:
        return MODEL_STORE[model_name]

    # Resolve the alias before announcing a load, so an unknown alias fails
    # with a clear message instead of a misleading "Loading ..." line.
    if model_name == "model1":
        checkpoint = MODEL_1_NAME
    elif model_name == "model2":
        checkpoint = MODEL_2_NAME
    else:
        raise KeyError(
            f"Unknown model alias {model_name!r}; expected 'model1' or 'model2'"
        )

    print(f"Loading {model_name} into memory on CPU...")
    MODEL_STORE[model_name] = SentenceTransformer(checkpoint, device="cpu")
    return MODEL_STORE[model_name]
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# ---------- Text Preprocessing ----------
|
| 32 |
+
def preprocess_for_matching(text: str) -> str:
    """Standardize text for matching.

    Surrounding whitespace is removed and the text is upper-cased. Missing
    values and the ``"-"`` placeholder normalise to the empty string; the
    placeholder is recognised after stripping, so ``" - "`` is treated the
    same as ``"-"``.

    Args:
        text: Raw field value; may be ``None`` or empty.

    Returns:
        The upper-cased, stripped text, or ``""`` when there is nothing to match.
    """
    if not text:
        return ""
    cleaned = text.strip()
    if cleaned in ("", "-"):
        return ""
    return cleaned.upper()
|
| 37 |
+
|
| 38 |
+
# ---------- Core Matching Functions ----------
|
| 39 |
+
# ---------- Indic Soundex (phonetic for Indian names) ----------
|
| 40 |
+
# def indic_soundex_code(name: str) -> str:
|
| 41 |
+
# """
|
| 42 |
+
# Generate Indic Soundex code for a name token.
|
| 43 |
+
# Handles Indian transliteration phonetics (aspirated consonants, etc.)
|
| 44 |
+
# """
|
| 45 |
+
# if not name:
|
| 46 |
+
# return ""
|
| 47 |
+
# name = name.upper().strip()
|
| 48 |
+
# if not name:
|
| 49 |
+
# return ""
|
| 50 |
+
|
| 51 |
+
# # Pre-process: map aspirated/compound consonants to base
|
| 52 |
+
# for digraph, base in [("SH", "S"), ("PH", "F"), ("TH", "T"), ("DH", "D"),
|
| 53 |
+
# ("KH", "K"), ("GH", "G"), ("BH", "B"), ("CH", "C"), ("JH", "J")]:
|
| 54 |
+
# name = name.replace(digraph, base)
|
| 55 |
+
|
| 56 |
+
# SOUNDEX_MAP = {
|
| 57 |
+
# 'B': '1', 'F': '1', 'P': '1', 'V': '1', 'W': '1',
|
| 58 |
+
# 'C': '2', 'G': '2', 'J': '2', 'K': '2', 'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
|
| 59 |
+
# 'D': '3', 'T': '3',
|
| 60 |
+
# 'L': '4',
|
| 61 |
+
# 'M': '5', 'N': '5',
|
| 62 |
+
# 'R': '6',
|
| 63 |
+
# }
|
| 64 |
+
|
| 65 |
+
# code = name[0]
|
| 66 |
+
# prev_code = SOUNDEX_MAP.get(name[0], '0')
|
| 67 |
+
|
| 68 |
+
# for char in name[1:]:
|
| 69 |
+
# if char in 'AEIOUHY ':
|
| 70 |
+
# prev_code = '0' # Reset on vowel/separator
|
| 71 |
+
# continue
|
| 72 |
+
# digit = SOUNDEX_MAP.get(char, '0')
|
| 73 |
+
# if digit != '0' and digit != prev_code:
|
| 74 |
+
# code += digit
|
| 75 |
+
# prev_code = digit
|
| 76 |
+
|
| 77 |
+
# return (code + '000')[:4]
|
| 78 |
+
|
| 79 |
+
def indic_soundex_code(name: str) -> str:
    """Build a 4-character Soundex-style code for a single name token.

    The token is upper-cased and stripped, aspirated/compound digraphs are
    collapsed to a base consonant, the first character is kept verbatim and
    the remaining consonants are mapped to digit classes. J, S and Z have
    their own class ('7'), separate from the C/G/K/Q/X class ('2'), so that
    e.g. RAJESH and RAKESH receive different codes.

    Returns:
        The code padded with '0' and truncated to four characters, or ""
        for empty / whitespace-only input.
    """
    if not name:
        return ""
    token = name.upper().strip()
    if not token:
        return ""

    # Collapse digraphs to their base consonant, in this fixed order.
    digraph_to_base = [("SH", "S"), ("PH", "F"), ("TH", "T"), ("DH", "D"),
                       ("KH", "K"), ("GH", "G"), ("BH", "B"), ("CH", "C"), ("JH", "J")]
    for pair, single in digraph_to_base:
        token = token.replace(pair, single)

    digit_for = {
        'B': '1', 'F': '1', 'P': '1', 'V': '1', 'W': '1',
        'C': '2', 'G': '2', 'K': '2', 'Q': '2', 'X': '2',
        'D': '3', 'T': '3',
        'L': '4',
        'M': '5', 'N': '5',
        'R': '6',
        'J': '7', 'S': '7', 'Z': '7'
    }

    pieces = [token[0]]
    previous = digit_for.get(token[0], '0')

    for letter in token[1:]:
        if letter in 'AEIOUHY ':
            # Vowels, H, Y and spaces break a run, so a repeated class after
            # them is emitted again.
            previous = '0'
            continue
        current = digit_for.get(letter, '0')
        # Emit only mapped letters whose class differs from the one just seen.
        if current != '0' and current != previous:
            pieces.append(current)
        previous = current

    return ("".join(pieces) + '000')[:4]
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def indic_soundex_similarity(text1: str, text2: str) -> float:
    """Score two texts by comparing the Indic Soundex codes of their tokens.

    Each token of the text with fewer tokens is greedily paired with the
    best still-unpaired token of the other text; a pair scores the fraction
    of the four code positions that agree. The result is the mean pair score
    over the shorter text, scaled to 0-100. Returns 0.0 when either text is
    empty or has no tokens.
    """
    words_a = text1.upper().split() if text1 else []
    words_b = text2.upper().split() if text2 else []
    if not (words_a and words_b):
        return 0.0

    codes_a = list(map(indic_soundex_code, words_a))
    codes_b = list(map(indic_soundex_code, words_b))

    # Iterate over the side with fewer tokens (ties keep text1 as that side).
    if len(codes_a) <= len(codes_b):
        fewer, more = codes_a, codes_b
    else:
        fewer, more = codes_b, codes_a

    claimed = set()
    score_sum = 0.0
    for code in fewer:
        top_score, top_pos = 0.0, -1
        for pos, candidate in enumerate(more):
            if pos in claimed:
                continue
            agreement = sum(a == b for a, b in zip(code, candidate)) / 4.0
            # Strict '>' keeps the earliest candidate on ties and never
            # claims a candidate that agrees on no position.
            if agreement > top_score:
                top_score, top_pos = agreement, pos
        if top_pos >= 0:
            claimed.add(top_pos)
            score_sum += top_score

    return (score_sum / len(fewer)) * 100
|
| 158 |
+
|
| 159 |
+
# ---------- Core Matching Functions ----------
|
| 160 |
+
def calculate_fuzzy_scores(input1: str, input2: str) -> Dict[str, float]:
    """Run the five RapidFuzz scorers on a pair of strings.

    Returns:
        A dict keyed by ``simple_ratio``, ``token_set_ratio``, ``w_ratio``,
        ``partial_ratio`` and ``token_sort_ratio`` holding each scorer's result.
    """
    scorers = (
        ("simple_ratio", fuzz.ratio),
        ("token_set_ratio", fuzz.token_set_ratio),
        ("w_ratio", fuzz.WRatio),
        ("partial_ratio", fuzz.partial_ratio),
        ("token_sort_ratio", fuzz.token_sort_ratio),
    )
    return {label: scorer(input1, input2) for label, scorer in scorers}
|
| 169 |
+
|
| 170 |
+
def calculate_semantic_similarity(model_name: str, input1: str, input2: str) -> float:
    """Calculate semantic similarity using sentence transformers.

    Args:
        model_name: Alias passed to ``get_model`` to select the encoder.
        input1: First text.
        input2: Second text.

    Returns:
        Cosine similarity of the two embeddings as a builtin ``float``.
    """
    model = get_model(model_name)
    embedding1 = model.encode([input1], show_progress_bar=False)
    embedding2 = model.encode([input2], show_progress_bar=False)

    # cosine_similarity yields a 1x1 array; unwrap it and convert the NumPy
    # scalar to a plain float so the value matches the annotation and can be
    # serialised (e.g. to JSON) by callers.
    return float(cosine_similarity(embedding1, embedding2)[0][0])
|
| 179 |
+
|
| 180 |
+
def calculate_final_score(
    fuzzy_scores: Dict[str, float],
    semantic_score: float,
    weights: Optional[Dict[str, float]] = None,
) -> float:
    """Calculate the weighted final score, clamped to 0-100.

    Args:
        fuzzy_scores: Scores keyed as produced by ``calculate_fuzzy_scores``;
            missing keys count as 0.
        semantic_score: Similarity value; it is multiplied by 100 before
            weighting so it shares the scale of the fuzzy scores.
        weights: Mapping of component name to weight. Defaults to
            ``MODEL_WEIGHTS`` when omitted.

    Returns:
        Sum of ``component * weight`` over ``weights``, limited to [0, 100].

    Raises:
        KeyError: If ``weights`` names a component that is not one of the
            fuzzy scorers or ``semantic_score``.
    """
    if weights is None:
        weights = MODEL_WEIGHTS

    # Every scorer emitted by calculate_fuzzy_scores is available for
    # weighting, including token_sort_ratio.
    component_scores = {
        "simple_ratio": fuzzy_scores.get("simple_ratio", 0),
        "token_set_ratio": fuzzy_scores.get("token_set_ratio", 0),
        "partial_ratio": fuzzy_scores.get("partial_ratio", 0),
        "w_ratio": fuzzy_scores.get("w_ratio", 0),
        "token_sort_ratio": fuzzy_scores.get("token_sort_ratio", 0),
        "semantic_score": semantic_score * 100,
    }
    weighted_sum = sum(component_scores[key] * weight for key, weight in weights.items())
    return max(0, min(100, weighted_sum))
|
| 192 |
+
|
| 193 |
+
def calculate_overall_similarity(score1: float, score2: float) -> float:
    """Blend two model scores into one, weighting the first 0.6 and the second 0.4."""
    first_weight, second_weight = 0.6, 0.4
    return score1 * first_weight + score2 * second_weight
|
| 196 |
+
|
| 197 |
+
def check_substring_match(str1: str, str2: str) -> bool:
    """Return True when either string contains the other.

    Empty or missing inputs never match.
    """
    if str1 and str2:
        return str1 in str2 or str2 in str1
    return False
|
| 202 |
+
|
| 203 |
+
def check_individual_name_matches(name_full: str, fname: str, mname: str, lname: str) -> Tuple[bool, bool, bool]:
    """Test each name part against the full name with ``check_substring_match``.

    A part that is empty or missing counts as no match.

    Returns:
        ``(first_match, middle_match, last_match)``.
    """
    first_hit, middle_hit, last_hit = (
        check_substring_match(name_full, part) if part else False
        for part in (fname, mname, lname)
    )
    return first_hit, middle_hit, last_hit
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def concatenate_name_parts(firstname: str, middlename: str, lastname: str) -> str:
    """Join the usable name parts into one upper-cased, alphabetically sorted string.

    A part is skipped when it is missing, blank, or the ``"-"`` placeholder.
    The check is made after stripping, so whitespace-only values and
    ``" - "`` are skipped too instead of leaking empty or ``"-"`` tokens
    into the result.

    Returns:
        The remaining parts, stripped, upper-cased, sorted and separated by a
        single space; ``""`` when no part is usable.
    """
    parts = []
    for raw in (firstname, middlename, lastname):
        if not raw:
            continue
        token = raw.strip().upper()
        if token and token != "-":
            parts.append(token)

    return " ".join(sorted(parts))
|
| 229 |
+
|
| 230 |
+
# ---------- helpers used only inside the new logic ----------
|
| 231 |
+
def _normalize_and_sort(name: str) -> str:
|
| 232 |
+
"""
|
| 233 |
+
1. Split on any non-alphanumeric character (space, underscore, comma, etc.)
|
| 234 |
+
2. Remove empty tokens
|
| 235 |
+
3. Upper-case
|
| 236 |
+
4. Sort alphabetically
|
| 237 |
+
5. Re-join with single space
|
| 238 |
+
"""
|
| 239 |
+
tokens = re.split(r'[^A-Za-z0-9]+', name.strip())
|
| 240 |
+
tokens = [t.upper() for t in tokens if t]
|
| 241 |
+
return ' '.join(sorted(tokens))
|
| 242 |
+
|
| 243 |
+
def _all_name_combinations(fname: str, mname: str, lname: str) -> list[str]:
|
| 244 |
+
"""
|
| 245 |
+
Return every possible ordering of the supplied parts,
|
| 246 |
+
dropping any empty/blank components.
|
| 247 |
+
"""
|
| 248 |
+
parts = []
|
| 249 |
+
for p in (fname, mname, lname):
|
| 250 |
+
if p and p.strip() not in ('-', '', ' '):
|
| 251 |
+
parts.append(p.strip().upper())
|
| 252 |
+
if not parts:
|
| 253 |
+
return []
|
| 254 |
+
# itertools.permutations gives every ordering
|
| 255 |
+
return [' '.join(order) for order in itertools.permutations(parts)]
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
# def match_entities(value1: str, value2: str, weights: Dict[str, float] = None) -> float:
|
| 259 |
+
# """
|
| 260 |
+
# Match two entities using fuzzy + semantic + optional phonetic similarity.
|
| 261 |
+
# Weights dict determines score component contributions.
|
| 262 |
+
# Returns: similarity score as float (0-100)
|
| 263 |
+
# """
|
| 264 |
+
# if weights is None:
|
| 265 |
+
# weights = MODEL_WEIGHTS
|
| 266 |
+
|
| 267 |
+
# standardized_input1 = preprocess_for_matching(value1)
|
| 268 |
+
# standardized_input2 = preprocess_for_matching(value2)
|
| 269 |
+
|
| 270 |
+
# if not standardized_input1 or not standardized_input2:
|
| 271 |
+
# return 0
|
| 272 |
+
|
| 273 |
+
# # Space-agnostic exact match
|
| 274 |
+
# if standardized_input1.replace(" ", "") == standardized_input2.replace(" ", ""):
|
| 275 |
+
# return 100.0
|
| 276 |
+
|
| 277 |
+
# return calculate_similarity_with_models(standardized_input1, standardized_input2, weights)
|
| 278 |
+
|
| 279 |
+
def match_entities(value1: str, value2: str, weights: Dict[str, float] = None) -> float:
    """
    Score how likely two entity strings (typically person names) match.

    Both values are standardized with ``preprocess_for_matching``; the checks
    below then run in order and the first one that decides returns.

    1. Space-agnostic exact match -> 100.0
       (e.g. "Pujitha Sharma" vs "pujithasharma").
    2. Token-order match -> 100.0 when some permutation of one side's tokens,
       concatenated, equals the other side with spaces removed
       (e.g. "Sharma Gari Pujitha" vs "Pujitha Sharma Gari"). Only attempted
       when both sides have at most 4 tokens.
    3. Shared-token analysis, applied when the two sides share at least one
       token and each side also has unshared tokens:
         - if the unshared tokens of either side are all single letters, the
           first letters of both remainders are compared; compatible initials
           (one set contains the other) give ``max(90, model score)``
           (e.g. "K V Reddy" vs "Katta Venkata Reddy"), conflicting initials
           give ``model score - 40`` floored at 0 (e.g. "C Anitha" vs
           "H Anitha");
         - otherwise, if the unshared remainders are dissimilar
           (``fuzz.ratio`` < 65) the result is ``model score - 40`` floored
           at 0 (e.g. "M Manisha Reddy" vs "M Manoj Reddy").
    4. Fallback: the weighted score from ``calculate_similarity_with_models``.
    5. Phonetic audit of the fallback score, only when both sides are a single
       token: phonetically identical near-spellings are raised to at least
       95; phonetically distant pairs with a fallback score above 55 are
       pushed down to at most 55.

    Args:
        value1: First raw value.
        value2: Second raw value.
        weights: Component weights forwarded to
            ``calculate_similarity_with_models``. Defaults to ``MODEL_WEIGHTS``.

    Returns:
        Similarity score as a float (0-100); 0.0 when either standardized
        value is empty.
    """
    if weights is None:
        weights = MODEL_WEIGHTS

    name1 = preprocess_for_matching(value1)
    name2 = preprocess_for_matching(value2)

    if not name1 or not name2:
        return 0.0

    # ---------------------------------------------------------
    # CHECK 1: space-agnostic exact match
    # ---------------------------------------------------------
    compact1 = name1.replace(" ", "")
    compact2 = name2.replace(" ", "")
    if compact1 == compact2:
        return 100.0

    tokens1 = name1.split()
    tokens2 = name2.split()

    # ---------------------------------------------------------
    # CHECK 2: token-order permutation match
    # Comparing the space-less join is sufficient: whenever a permutation
    # joined with single spaces equals the other string, the space-less
    # forms are equal too. Both directions are needed because one side may
    # have its tokens already fused (e.g. "ab cd" vs "cdab").
    # ---------------------------------------------------------
    if len(tokens1) <= 4 and len(tokens2) <= 4:  # avoid factorial blow-up
        if any("".join(perm) == compact2 for perm in itertools.permutations(tokens1)):
            return 100.0
        if any("".join(perm) == compact1 for perm in itertools.permutations(tokens2)):
            return 100.0

    # ---------------------------------------------------------
    # CHECK 3: shared tokens plus differing remainder
    # ---------------------------------------------------------
    shared = set(tokens1) & set(tokens2)
    rest1 = [t for t in tokens1 if t not in shared]
    rest2 = [t for t in tokens2 if t not in shared]

    # Only applies when something is shared (e.g. a surname) and both sides
    # still have unmatched tokens.
    if shared and rest1 and rest2:
        rest1_is_initials = all(len(t) == 1 for t in rest1)
        rest2_is_initials = all(len(t) == 1 for t in rest2)

        if rest1_is_initials or rest2_is_initials:
            # At least one remainder consists purely of initials. The
            # compatibility test is symmetric, so it does not matter which
            # side holds the initials and which the expanded names.
            letters1 = {t[0] for t in rest1}
            letters2 = {t[0] for t in rest2}
            base_score = calculate_similarity_with_models(name1, name2, weights)
            if letters1 <= letters2 or letters2 <= letters1:
                # Initials map cleanly onto the other side's first letters.
                return max(90.0, base_score)
            # Explicitly conflicting initials (e.g. C vs H).
            return max(0.0, base_score - 40.0)

        # Neither remainder is pure initials: compare them as whole words.
        if fuzz.ratio(" ".join(rest1), " ".join(rest2)) < 65.0:
            base_score = calculate_similarity_with_models(name1, name2, weights)
            # Key identifying words contradict each other -> heavy penalty.
            return max(0.0, base_score - 40.0)

    # ---------------------------------------------------------
    # CHECK 4: fallback -- weighted model scoring
    # ---------------------------------------------------------
    base_score = calculate_similarity_with_models(name1, name2, weights)

    # ---------------------------------------------------------
    # CHECK 5: phonetic audit for single-token names
    # ---------------------------------------------------------
    if len(tokens1) == 1 and len(tokens2) == 1:
        ph_score = indic_soundex_similarity(name1, name2)

        if ph_score == 100.0:
            # Same sound, minor spelling difference (e.g. likitha vs
            # likheetha): escalate to at least 95.
            if fuzz.ratio(name1, name2) > 65 and abs(len(name1) - len(name2)) <= 2:
                return max(95.0, base_score)
        elif ph_score <= 80.0:
            # Different sound but textually close (e.g. rajesh vs rakesh).
            # NOTE(review): a base score just above 55 drops by 25 while a
            # score of exactly 55 is left untouched -- confirm this step is
            # intended.
            if base_score > 55.0:
                return min(base_score - 25.0, 55.0)

    return base_score
|
| 440 |
+
|
| 441 |
+
# def calculate_similarity_with_models(text1: str, text2: str, weights: Dict[str, float] = None) -> float:
|
| 442 |
+
# """
|
| 443 |
+
# Calculate similarity using fuzzy scores, embedding models, and optional phonetic.
|
| 444 |
+
# The weights dict controls which components are active and their contribution.
|
| 445 |
+
# Phonetic components (jaro_winkler, indic_soundex) are used only if present in weights.
|
| 446 |
+
# Returns similarity percentage as float (0-100)
|
| 447 |
+
# """
|
| 448 |
+
# if weights is None:
|
| 449 |
+
# weights = MODEL_WEIGHTS
|
| 450 |
+
|
| 451 |
+
# if not text1 or not text2:
|
| 452 |
+
# print(f"[SIMILARITY] either value is empty β text1={text1!r} text2={text2!r}")
|
| 453 |
+
# return 0.0
|
| 454 |
+
|
| 455 |
+
# text1 = str(text1).strip()
|
| 456 |
+
# text2 = str(text2).strip()
|
| 457 |
+
|
| 458 |
+
# if not text1 or not text2:
|
| 459 |
+
# return 0.0
|
| 460 |
+
|
| 461 |
+
# print(f"[SIMILARITY] text1={text1!r}")
|
| 462 |
+
# print(f"[SIMILARITY] text2={text2!r}")
|
| 463 |
+
|
| 464 |
+
# # Space-agnostic exact match
|
| 465 |
+
# if text1.replace(" ", "") == text2.replace(" ", ""):
|
| 466 |
+
# return 100.0
|
| 467 |
+
|
| 468 |
+
# # --- Fuzzy scores (5 functions) ---
|
| 469 |
+
# fuzzy_scores = {
|
| 470 |
+
# "simple_ratio": fuzz.ratio(text1, text2),
|
| 471 |
+
# "token_set_ratio": fuzz.token_set_ratio(text1, text2),
|
| 472 |
+
# "w_ratio": fuzz.WRatio(text1, text2),
|
| 473 |
+
# "partial_ratio": fuzz.partial_ratio(text1, text2),
|
| 474 |
+
# "token_sort_ratio": fuzz.token_sort_ratio(text1, text2),
|
| 475 |
+
# }
|
| 476 |
+
|
| 477 |
+
# # --- Phonetic scores (only if weights include them) ---
|
| 478 |
+
# phonetic_scores = {}
|
| 479 |
+
# if weights.get("jaro_winkler", 0) > 0:
|
| 480 |
+
# phonetic_scores["jaro_winkler"] = JaroWinkler.similarity(text1, text2) * 100
|
| 481 |
+
# if weights.get("indic_soundex", 0) > 0:
|
| 482 |
+
# phonetic_scores["indic_soundex"] = indic_soundex_similarity(text1, text2)
|
| 483 |
+
|
| 484 |
+
# # --- Semantic scores (dual model, computed in parallel) ---
|
| 485 |
+
# with ThreadPoolExecutor() as executor:
|
| 486 |
+
# model1 = get_model("model1")
|
| 487 |
+
# model2 = get_model("model2")
|
| 488 |
+
|
| 489 |
+
# f1 = executor.submit(
|
| 490 |
+
# lambda: cosine_similarity(
|
| 491 |
+
# model1.encode([text1], show_progress_bar=False),
|
| 492 |
+
# model1.encode([text2], show_progress_bar=False)
|
| 493 |
+
# )[0][0]
|
| 494 |
+
# )
|
| 495 |
+
# f2 = executor.submit(
|
| 496 |
+
# lambda: cosine_similarity(
|
| 497 |
+
# model2.encode([text1], show_progress_bar=False),
|
| 498 |
+
# model2.encode([text2], show_progress_bar=False)
|
| 499 |
+
# )[0][0]
|
| 500 |
+
# )
|
| 501 |
+
# cosine1 = f1.result()
|
| 502 |
+
# cosine2 = f2.result()
|
| 503 |
+
|
| 504 |
+
# def calc_final(semantic_cosine):
|
| 505 |
+
# all_scores = {}
|
| 506 |
+
# all_scores.update(fuzzy_scores)
|
| 507 |
+
# all_scores.update(phonetic_scores)
|
| 508 |
+
# all_scores["semantic_score"] = semantic_cosine * 100
|
| 509 |
+
# return sum(all_scores.get(k, 0) * v for k, v in weights.items())
|
| 510 |
+
|
| 511 |
+
# final1 = calc_final(cosine1)
|
| 512 |
+
# final2 = calc_final(cosine2)
|
| 513 |
+
|
| 514 |
+
# overall_similarity = final1 * 0.6 + final2 * 0.4
|
| 515 |
+
# print("similarity given by model",overall_similarity)
|
| 516 |
+
# return round(max(0, min(100, overall_similarity)), 2)
|
| 517 |
+
|
| 518 |
+
def calculate_similarity_with_models(text1: str, text2: str, weights: Dict[str, float] = None) -> float:
    """
    Blend fuzzy, optional phonetic, and embedding-based similarity into one score.

    The score for each embedding model is the weighted sum, over the keys of
    ``weights``, of the matching component score (keys with no computed
    component contribute 0). The two per-model totals are blended 60/40
    (model1/model2), clamped to 0-100 and rounded to 2 decimals.

    Components computed here:
        - fuzzy: ``simple_ratio``, ``token_set_ratio``, ``w_ratio``,
          ``partial_ratio``, ``token_sort_ratio``
        - phonetic: ``jaro_winkler`` and ``indic_soundex``, each only when its
          weight is greater than 0
        - ``semantic_score``: cosine similarity of the two texts' embeddings
          times 100, one value per model

    Args:
        text1: First text.
        text2: Second text.
        weights: Mapping of component name to weight. Defaults to
            ``MODEL_WEIGHTS``.

    Returns:
        Similarity percentage as a built-in float (0-100). 0.0 when either
        text is empty or blank; 100.0 when the texts are equal ignoring spaces.
    """
    if weights is None:
        weights = MODEL_WEIGHTS

    if not text1 or not text2:
        return 0.0

    text1 = str(text1).strip()
    text2 = str(text2).strip()

    if not text1 or not text2:
        return 0.0

    # Space-agnostic exact match short-circuits all model work.
    if text1.replace(" ", "") == text2.replace(" ", ""):
        return 100.0

    # --- Fuzzy scores (5 functions) ---
    component_scores = {
        "simple_ratio": fuzz.ratio(text1, text2),
        "token_set_ratio": fuzz.token_set_ratio(text1, text2),
        "w_ratio": fuzz.WRatio(text1, text2),
        "partial_ratio": fuzz.partial_ratio(text1, text2),
        "token_sort_ratio": fuzz.token_sort_ratio(text1, text2),
    }

    # --- Phonetic scores (only if the weights ask for them) ---
    if weights.get("jaro_winkler", 0) > 0:
        component_scores["jaro_winkler"] = JaroWinkler.similarity(text1, text2) * 100
    if weights.get("indic_soundex", 0) > 0:
        component_scores["indic_soundex"] = indic_soundex_similarity(text1, text2)

    # --- Semantic scores (two models, evaluated concurrently) ---
    def semantic_cosine(model):
        return cosine_similarity(model.encode([text1]), model.encode([text2]))[0][0]

    model1 = get_model("model1")
    model2 = get_model("model2")
    with ThreadPoolExecutor() as executor:
        future1 = executor.submit(semantic_cosine, model1)
        future2 = executor.submit(semantic_cosine, model2)
        cosine1 = future1.result()
        cosine2 = future2.result()

    def weighted_total(semantic_cosine_value):
        scores = dict(component_scores, semantic_score=semantic_cosine_value * 100)
        return sum(scores.get(name, 0) * weight for name, weight in weights.items())

    overall_similarity = weighted_total(cosine1) * 0.6 + weighted_total(cosine2) * 0.4

    # Coerce to a built-in float before clamping: the cosine value is whatever
    # scalar type cosine_similarity yields (presumably a NumPy scalar -- TODO
    # confirm), and clamping against int bounds would return int 0 / 100 at
    # the edges instead of the annotated float.
    return round(max(0.0, min(100.0, float(overall_similarity))), 2)
|
| 589 |
+
|
| 590 |
+
# def handle_case1(full_name1: str, full_name2: str,
|
| 591 |
+
# r1_fname: str, r1_mname: str, r1_lname: str,
|
| 592 |
+
# r2_fname: str, r2_mname: str, r2_lname: str) -> dict:
|
| 593 |
+
# """
|
| 594 |
+
# Case-1 (both records supply a full name)
|
| 595 |
+
# Returns a dictionary with separate similarity scores for each component
|
| 596 |
+
|
| 597 |
+
# Returns:
|
| 598 |
+
# dict: {
|
| 599 |
+
# 'full_name_percent': float, # full_name1 vs full_name2
|
| 600 |
+
# 'firstname_percent': float, # r1_fname vs r2_fname
|
| 601 |
+
# 'middlename_percent': float, # r1_mname vs r2_mname
|
| 602 |
+
# 'lastname_percent': float # r1_lname vs r2_lname
|
| 603 |
+
# }
|
| 604 |
+
# """
|
| 605 |
+
# result={}
|
| 606 |
+
|
| 607 |
+
# # Check space-agnostic exact match on original strings before sorting
|
| 608 |
+
# if full_name1.replace(" ", "").upper() == full_name2.replace(" ", "").upper():
|
| 609 |
+
# full_name_percent = 100.0
|
| 610 |
+
# else:
|
| 611 |
+
# # 1. Normalize + alphabetically sort each full name and calculate similarity
|
| 612 |
+
# sorted1 = _normalize_and_sort(full_name1)
|
| 613 |
+
# sorted2 = _normalize_and_sort(full_name2)
|
| 614 |
+
# full_name_percent = calculate_similarity_with_models(sorted1, sorted2, NAME_MODEL_WEIGHTS)
|
| 615 |
+
# # print("full_name_percent is:",full_name_percent)
|
| 616 |
+
|
| 617 |
+
# # 2. Calculate firstname_percent: compare firstnames
|
| 618 |
+
# # firstname
|
| 619 |
+
# if r1_fname and r2_fname:
|
| 620 |
+
# firstname_percent = calculate_similarity_with_models(
|
| 621 |
+
# r1_fname, r2_fname, NAME_MODEL_WEIGHTS
|
| 622 |
+
# )
|
| 623 |
+
# # print("firstname_percent is:",firstname_percent)
|
| 624 |
+
# else:
|
| 625 |
+
# firstname_percent = 0.0
|
| 626 |
+
|
| 627 |
+
# # middlename
|
| 628 |
+
# if r1_mname and r2_mname:
|
| 629 |
+
# middlename_percent = calculate_similarity_with_models(
|
| 630 |
+
# r1_mname, r2_mname, NAME_MODEL_WEIGHTS
|
| 631 |
+
# )
|
| 632 |
+
# # print("middlename_percent is:",middlename_percent)
|
| 633 |
+
# else:
|
| 634 |
+
# middlename_percent = 0.0
|
| 635 |
+
|
| 636 |
+
# # lastname
|
| 637 |
+
# if r1_lname and r2_lname:
|
| 638 |
+
# lastname_percent = calculate_similarity_with_models(
|
| 639 |
+
# r1_lname, r2_lname, NAME_MODEL_WEIGHTS
|
| 640 |
+
# )
|
| 641 |
+
# # print("lastname_percent is:",lastname_percent)
|
| 642 |
+
# else:
|
| 643 |
+
# lastname_percent = 0.0
|
| 644 |
+
|
| 645 |
+
|
| 646 |
+
|
| 647 |
+
# result={
|
| 648 |
+
# 'full_name_percent': full_name_percent,
|
| 649 |
+
# 'firstname_percent': firstname_percent,
|
| 650 |
+
# 'middlename_percent': middlename_percent,
|
| 651 |
+
# 'lastname_percent': lastname_percent
|
| 652 |
+
# }
|
| 653 |
+
# return result
|
| 654 |
+
|
| 655 |
+
# def handle_case2(full_name: str,
|
| 656 |
+
# fname: str, mname: str, lname: str,
|
| 657 |
+
# concat_name: str) -> dict:
|
| 658 |
+
# """
|
| 659 |
+
# Case-2 (one side has full name, the other has F/M/L)
|
| 660 |
+
# Returns a dictionary with separate similarity scores for each component
|
| 661 |
+
|
| 662 |
+
# Returns:
|
| 663 |
+
# dict: {
|
| 664 |
+
# 'full_name_percent': float, # full_name vs concat_name
|
| 665 |
+
# 'firstname_percent': float, # full_name vs fname
|
| 666 |
+
# 'middlename_percent': float, # full_name vs mname
|
| 667 |
+
# 'lastname_percent': float # full_name vs lname
|
| 668 |
+
# }
|
| 669 |
+
# """
|
| 670 |
+
# # 0. Check if any permutation of F/M/L exactly reconstructs full_name.
|
| 671 |
+
# # If yes, full_name_percent = 100. Component scores are still computed
|
| 672 |
+
# # individually β a part inside full_name does NOT score 100% on its own.
|
| 673 |
+
# # e.g. full_name="KALLI LIKHITHA", fname="KALLI", mname="LIKHITHA":
|
| 674 |
+
# # full_name_percent = 100 (together they reconstruct it exactly)
|
| 675 |
+
# # firstname_percent != 100 ("KALLI" is only half of "KALLI LIKHITHA")
|
| 676 |
+
# permutation_full_match = any(
|
| 677 |
+
# permuted.replace(" ", "") == full_name.upper().strip().replace(" ", "")
|
| 678 |
+
# for permuted in _all_name_combinations(fname, mname, lname)
|
| 679 |
+
# )
|
| 680 |
+
|
| 681 |
+
# # 1. Calculate full_name_percent
|
| 682 |
+
# if permutation_full_match:
|
| 683 |
+
# full_name_percent = 100.0
|
| 684 |
+
# else:
|
| 685 |
+
# sorted_full = _normalize_and_sort(full_name)
|
| 686 |
+
# sorted_concat = _normalize_and_sort(concat_name)
|
| 687 |
+
# full_name_percent = calculate_similarity_with_models(
|
| 688 |
+
# sorted_full,
|
| 689 |
+
# sorted_concat,
|
| 690 |
+
# NAME_MODEL_WEIGHTS
|
| 691 |
+
# )
|
| 692 |
+
|
| 693 |
+
# # Component-level scores: compare full_name vs each individual part (fname/mname/lname).
|
| 694 |
+
# #
|
| 695 |
+
# # Requirement:
|
| 696 |
+
# # - full_name="KALLI LIKHITHA", fname="KALLI" β firstname_percent reflects
|
| 697 |
+
# # how well "KALLI" matches within the context of the full name, but must
|
| 698 |
+
# # NOT be 100% just because "KALLI" is a complete subset of "KALLI LIKHITHA".
|
| 699 |
+
# # - The comparison is full_name vs part (not token-to-token), so the full
|
| 700 |
+
# # context of the name is preserved.
|
| 701 |
+
# #
|
| 702 |
+
# # Why standard weights fail:
|
| 703 |
+
# # - partial_ratio("KALLI LIKHITHA", "KALLI") = 100 β subset inflation
|
| 704 |
+
# # - token_set_ratio produces same inflation
|
| 705 |
+
# # - w_ratio picks the best of these β also inflated
|
| 706 |
+
# # - semantic embeddings: short name vs full name share high cosine similarity
|
| 707 |
+
# # because they encode overlapping meaning β also inflated
|
| 708 |
+
# #
|
| 709 |
+
# # Fix: use only LENGTH-SENSITIVE metrics that naturally penalise length
|
| 710 |
+
# # disparity between the strings.
|
| 711 |
+
# # - simple_ratio: 2 * matches / total_chars β drops when lengths differ
|
| 712 |
+
# # - jaro_winkler: character-overlap with length normalisation β same
|
| 713 |
+
# # - indic_soundex: phonetic token overlap / shorter length β same
|
| 714 |
+
# # Intentionally excluded: partial_ratio, token_set_ratio, w_ratio, semantic.
|
| 715 |
+
|
| 716 |
+
# _COMPONENT_WEIGHTS = {
|
| 717 |
+
# "simple_ratio": 0.35,
|
| 718 |
+
# "jaro_winkler": 0.40,
|
| 719 |
+
# "indic_soundex": 0.25,
|
| 720 |
+
# }
|
| 721 |
+
|
| 722 |
+
# def _fullname_vs_part(full: str, part: str) -> float:
|
| 723 |
+
# """
|
| 724 |
+
# Compare full_name against a single name part using only length-sensitive
|
| 725 |
+
# metrics. Returns 0-100. A part that is a strict subset of full_name will
|
| 726 |
+
# score proportionally to how much of the full_name it covers, not 100%.
|
| 727 |
+
# """
|
| 728 |
+
# if not full or not part:
|
| 729 |
+
# return 0.0
|
| 730 |
+
# full_u = full.upper().strip()
|
| 731 |
+
# part_u = part.upper().strip()
|
| 732 |
+
# if full_u == part_u:
|
| 733 |
+
# return 100.0
|
| 734 |
+
# scores = {
|
| 735 |
+
# "simple_ratio": fuzz.ratio(full_u, part_u),
|
| 736 |
+
# "jaro_winkler": JaroWinkler.similarity(full_u, part_u) * 100,
|
| 737 |
+
# "indic_soundex": indic_soundex_similarity(full_u, part_u),
|
| 738 |
+
# }
|
| 739 |
+
# return round(max(0.0, min(100.0,
|
| 740 |
+
# sum(scores[k] * v for k, v in _COMPONENT_WEIGHTS.items())
|
| 741 |
+
# )), 2)
|
| 742 |
+
|
| 743 |
+
# # 2. firstname_percent: full_name vs fname
|
| 744 |
+
# firstname_percent = _fullname_vs_part(full_name, fname) if fname else 0.0
|
| 745 |
+
# # 3. middlename_percent: full_name vs mname
|
| 746 |
+
# middlename_percent = _fullname_vs_part(full_name, mname) if mname else 0.0
|
| 747 |
+
# # 4. lastname_percent: full_name vs lname
|
| 748 |
+
# lastname_percent = _fullname_vs_part(full_name, lname) if (lname and lname.upper() not in SURNAME_IDENTIFIER) else 0.0
|
| 749 |
+
|
| 750 |
+
# result={
|
| 751 |
+
# 'full_name_percent': full_name_percent,
|
| 752 |
+
# 'firstname_percent': firstname_percent,
|
| 753 |
+
# 'middlename_percent': middlename_percent,
|
| 754 |
+
# 'lastname_percent': lastname_percent
|
| 755 |
+
# }
|
| 756 |
+
# return result
|
| 757 |
+
|
| 758 |
+
|
| 759 |
+
# def handle_case3(r1_fname: str, r1_mname: str, r1_lname: str, r1_concat: str,
|
| 760 |
+
# r2_fname: str, r2_mname: str, r2_lname: str, r2_concat: str) -> dict:
|
| 761 |
+
# """
|
| 762 |
+
# Handle Case 3: Both records have F/M/L
|
| 763 |
+
# Returns a dictionary with separate similarity scores for each component
|
| 764 |
+
|
| 765 |
+
# Returns:
|
| 766 |
+
# dict: {
|
| 767 |
+
# 'full_name_percent': float, # r1_concat vs r2_concat
|
| 768 |
+
# 'firstname_percent': float, # r1_fname vs r2_fname
|
| 769 |
+
# 'middlename_percent': float, # r1_mname vs r2_mname
|
| 770 |
+
# 'lastname_percent': float # r1_lname vs r2_lname
|
| 771 |
+
# }
|
| 772 |
+
# """
|
| 773 |
+
# # Check substring matches for each component
|
| 774 |
+
# f_match = check_substring_match(r1_fname, r2_fname) if r1_fname and r2_fname else False
|
| 775 |
+
# m_match = check_substring_match(r1_mname, r2_mname) if r1_mname and r2_mname else False
|
| 776 |
+
# l_match = check_substring_match(r1_lname, r2_lname) if r1_lname and r2_lname else False
|
| 777 |
+
|
| 778 |
+
# # Calculate full_name_percent: compare concatenated names
|
| 779 |
+
# full_name_percent = calculate_similarity_with_models(r1_concat, r2_concat, NAME_MODEL_WEIGHTS)
|
| 780 |
+
|
| 781 |
+
# # Apply boosting logic based on substring matches
|
| 782 |
+
# # Rule 1: Only lastname matches (family match)
|
| 783 |
+
# if l_match and not f_match and not m_match:
|
| 784 |
+
# full_name_percent = max(full_name_percent, 85.0) # Ensure minimum 85% for family match
|
| 785 |
+
|
| 786 |
+
# # Rule 2: Lastname + (firstname or middle) matches (partial match)
|
| 787 |
+
# # Strong indicator of same person
|
| 788 |
+
# elif l_match and (f_match or m_match):
|
| 789 |
+
# full_name_percent = max(full_name_percent, 90.0) # Higher confidence when lastname + another field matches
|
| 790 |
+
|
| 791 |
+
# # Rule 3: No matches at all or only firstname/middlename matches
|
| 792 |
+
# # Use the calculated similarity as-is
|
| 793 |
+
|
| 794 |
+
# # Calculate individual component percentages
|
| 795 |
+
# # 2. Calculate firstname_percent: compare firstnames
|
| 796 |
+
# if r1_fname and r2_fname:
|
| 797 |
+
# firstname_percent = calculate_similarity_with_models(
|
| 798 |
+
# r1_fname,
|
| 799 |
+
# r2_fname,
|
| 800 |
+
# NAME_MODEL_WEIGHTS
|
| 801 |
+
# )
|
| 802 |
+
# else:
|
| 803 |
+
# firstname_percent=0.0
|
| 804 |
+
|
| 805 |
+
# # 3. Calculate middlename_percent: compare middlenames
|
| 806 |
+
# if r1_mname and r2_mname:
|
| 807 |
+
# middlename_percent = calculate_similarity_with_models(
|
| 808 |
+
# r1_mname,
|
| 809 |
+
# r2_mname,
|
| 810 |
+
# NAME_MODEL_WEIGHTS
|
| 811 |
+
# )
|
| 812 |
+
# else:
|
| 813 |
+
# middlename_percent=0.0
|
| 814 |
+
|
| 815 |
+
# # 4. Calculate lastname_percent: compare lastnames
|
| 816 |
+
# if r1_lname and r2_lname and r1_lname.upper() not in SURNAME_IDENTIFIER and r2_lname.upper() not in SURNAME_IDENTIFIER:
|
| 817 |
+
# lastname_percent = calculate_similarity_with_models(
|
| 818 |
+
# r1_lname,
|
| 819 |
+
# r2_lname,
|
| 820 |
+
# NAME_MODEL_WEIGHTS
|
| 821 |
+
# )
|
| 822 |
+
# else:
|
| 823 |
+
# lastname_percent=0.0
|
| 824 |
+
|
| 825 |
+
# result= {
|
| 826 |
+
# 'full_name_percent': full_name_percent,
|
| 827 |
+
# 'firstname_percent': firstname_percent,
|
| 828 |
+
# 'middlename_percent': middlename_percent,
|
| 829 |
+
# 'lastname_percent': lastname_percent
|
| 830 |
+
# }
|
| 831 |
+
# return result
|
| 832 |
+
|
| 833 |
+
# def match_name(name: str, firstname: str, lastname: str, middlename: str) -> float:
|
| 834 |
+
# """
|
| 835 |
+
# Match name with logic
|
| 836 |
+
# Returns similarity score as float or "missing value"
|
| 837 |
+
# """
|
| 838 |
+
# name_processed = preprocess_for_matching(name)
|
| 839 |
+
# concat_name = concatenate_name_parts(firstname, middlename, lastname)
|
| 840 |
+
|
| 841 |
+
# # Case 1: NAME matches concatenated name
|
| 842 |
+
# if name_processed and concat_name and name_processed == concat_name:
|
| 843 |
+
# return 100
|
| 844 |
+
|
| 845 |
+
# # Case 2: NAME is empty, use concatenated
|
| 846 |
+
# if not name_processed and concat_name:
|
| 847 |
+
# return 100
|
| 848 |
+
|
| 849 |
+
# # Case 3: Concat is empty, use NAME
|
| 850 |
+
# if name_processed and not concat_name:
|
| 851 |
+
# return 100
|
| 852 |
+
|
| 853 |
+
# # Case 4: Both exist but different - use model
|
| 854 |
+
# if name_processed and concat_name and name_processed != concat_name:
|
| 855 |
+
# # Pass both to model for fuzzy matching
|
| 856 |
+
# return match_entities(name_processed, concat_name)
|
| 857 |
+
|
| 858 |
+
# # Both empty
|
| 859 |
+
# return 0
|
| 860 |
+
|
| 861 |
+
# def match_names_cross_records(r1_name: str, r1_firstname: str, r1_lastname: str, r1_middlename: str,
|
| 862 |
+
# r2_name: str, r2_firstname: str, r2_lastname: str, r2_middlename: str) -> float:
|
| 863 |
+
# """
|
| 864 |
+
# Match names between two records with enhanced preprocessing:
|
| 865 |
+
# 1. Input is already lowercase + preprocessed (titles removed, variations standardized)
|
| 866 |
+
# 2. Surname detection β if only common surnames match, return 20%
|
| 867 |
+
# 3. Token sorting for consistent comparison
|
| 868 |
+
# 4. Common token detection
|
| 869 |
+
# 5. Initial letter boost for abbreviated names
|
| 870 |
+
# 6. Three-case matching (both fullname / one fullname+FML / both FML)
|
| 871 |
+
# """
|
| 872 |
+
# # ββ Normalize inputs (already lowercase from preprocess_name) ββ
|
| 873 |
+
# r1_name_proc = r1_name.strip() if r1_name and r1_name.strip() not in ["-", ""] else ""
|
| 874 |
+
# r2_name_proc = r2_name.strip() if r2_name and r2_name.strip() not in ["-", ""] else ""
|
| 875 |
+
|
| 876 |
+
# r1_fname = r1_firstname.strip() if r1_firstname and r1_firstname.strip() not in ["-", ""] else ""
|
| 877 |
+
# r1_mname = r1_middlename.strip() if r1_middlename and r1_middlename.strip() not in ["-", ""] else ""
|
| 878 |
+
# r1_lname = r1_lastname.strip() if r1_lastname and r1_lastname.strip() not in ["-", ""] else ""
|
| 879 |
+
|
| 880 |
+
# r2_fname = r2_firstname.strip() if r2_firstname and r2_firstname.strip() not in ["-", ""] else ""
|
| 881 |
+
# r2_mname = r2_middlename.strip() if r2_middlename and r2_middlename.strip() not in ["-", ""] else ""
|
| 882 |
+
# r2_lname = r2_lastname.strip() if r2_lastname and r2_lastname.strip() not in ["-", ""] else ""
|
| 883 |
+
|
| 884 |
+
# # ββ Determine case ββ
|
| 885 |
+
# r1_has_fullname = bool(r1_name_proc)
|
| 886 |
+
# r2_has_fullname = bool(r2_name_proc)
|
| 887 |
+
|
| 888 |
+
# r1_concat = concatenate_name_parts(r1_fname, r1_mname, r1_lname).lower()
|
| 889 |
+
# r2_concat = concatenate_name_parts(r2_fname, r2_mname, r2_lname).lower()
|
| 890 |
+
|
| 891 |
+
# # Build the effective full name string for each record
|
| 892 |
+
# name1_effective = r1_name_proc if r1_has_fullname else r1_concat
|
| 893 |
+
# name2_effective = r2_name_proc if r2_has_fullname else r2_concat
|
| 894 |
+
|
| 895 |
+
# # Both missing β zero
|
| 896 |
+
# if not name1_effective and not name2_effective:
|
| 897 |
+
# return {
|
| 898 |
+
# 'full_name_percent': 0.0,
|
| 899 |
+
# 'firstname_percent': 0.0,
|
| 900 |
+
# 'middlename_percent': 0.0,
|
| 901 |
+
# 'lastname_percent': 0.0
|
| 902 |
+
# }
|
| 903 |
+
|
| 904 |
+
# # ββ Accumulate adjustments (applied AFTER handle_case computation) ββ
|
| 905 |
+
# adjustment = 0
|
| 906 |
+
# surname_penalty_val = NAME_MATCH_ADJUSTMENTS.get("surname_penalty", -30)
|
| 907 |
+
# initial_boost_val = NAME_MATCH_ADJUSTMENTS.get("initial_boost", 30)
|
| 908 |
+
# subset_boost_val = NAME_MATCH_ADJUSTMENTS.get("subset_boost", 40)
|
| 909 |
+
|
| 910 |
+
# # ββ Surname detection (case 2): penalty if surname-only match ββ
|
| 911 |
+
# surname_only_match = False
|
| 912 |
+
# if name1_effective and name2_effective:
|
| 913 |
+
# surnames1 = detect_surnames(name1_effective)
|
| 914 |
+
# surnames2 = detect_surnames(name2_effective)
|
| 915 |
+
|
| 916 |
+
# if surnames1 and surnames2:
|
| 917 |
+
# common_surnames = surnames1 & surnames2
|
| 918 |
+
# if common_surnames:
|
| 919 |
+
# tokens1_non_surname = [t for t in name1_effective.split() if t not in surnames1]
|
| 920 |
+
# tokens2_non_surname = [t for t in name2_effective.split() if t not in surnames2]
|
| 921 |
+
|
| 922 |
+
# if tokens1_non_surname and tokens2_non_surname:
|
| 923 |
+
# non_surname_overlap = set(tokens1_non_surname) & set(tokens2_non_surname)
|
| 924 |
+
# if not non_surname_overlap:
|
| 925 |
+
# non_surname1_str = " ".join(tokens1_non_surname)
|
| 926 |
+
# non_surname2_str = " ".join(tokens2_non_surname)
|
| 927 |
+
# if fuzz.ratio(non_surname1_str, non_surname2_str) < 60:
|
| 928 |
+
# surname_only_match = True
|
| 929 |
+
# adjustment += surname_penalty_val # e.g., -30
|
| 930 |
+
|
| 931 |
+
# # ββ Sort tokens for boost/subset detection ββ
|
| 932 |
+
# name1_tokens = sorted(name1_effective.split()) if name1_effective else []
|
| 933 |
+
# name2_tokens = sorted(name2_effective.split()) if name2_effective else []
|
| 934 |
+
|
| 935 |
+
# # ββ Initial letter boost / mismatch penalty (Case 3A) ββ
|
| 936 |
+
# # compute_initial_letter_boost returns:
|
| 937 |
+
# # +0.2 β all initials matched β add initial_boost_val (+10.5)
|
| 938 |
+
# # -0.2 β at least one initial did NOT match β subtract initial_boost_val (-10.5)
|
| 939 |
+
# # 0.0 β no initials present β no change
|
| 940 |
+
# if name1_tokens and name2_tokens:
|
| 941 |
+
# boost = compute_initial_letter_boost(name1_tokens, name2_tokens)
|
| 942 |
+
# if boost > 0:
|
| 943 |
+
# adjustment += initial_boost_val # initials matched β boost
|
| 944 |
+
# elif boost < 0:
|
| 945 |
+
# adjustment -= initial_boost_val # initials mismatched β penalty
|
| 946 |
+
|
| 947 |
+
# # ββ Subset match boost (case 5): +40 if one is complete subset ββ
|
| 948 |
+
# if name1_tokens and name2_tokens and len(name1_tokens) != len(name2_tokens):
|
| 949 |
+
# if is_subset_match(name1_tokens, name2_tokens):
|
| 950 |
+
# adjustment += subset_boost_val # e.g., +40
|
| 951 |
+
|
| 952 |
+
# # ββ Run the appropriate case handler for base similarity ββ
|
| 953 |
+
# result = None
|
| 954 |
+
|
| 955 |
+
# # CASE 1: Both records have full names
|
| 956 |
+
# if r1_has_fullname and r2_has_fullname:
|
| 957 |
+
# result = handle_case1(r1_name_proc, r2_name_proc,
|
| 958 |
+
# r1_firstname, r1_middlename, r1_lastname,
|
| 959 |
+
# r2_firstname, r2_middlename, r2_lastname)
|
| 960 |
+
|
| 961 |
+
# # CASE 2: One has full name, other has F/M/L
|
| 962 |
+
# elif r1_has_fullname and not r2_has_fullname and r2_concat:
|
| 963 |
+
# result = handle_case2(r1_name_proc, r2_fname, r2_mname, r2_lname, r2_concat)
|
| 964 |
+
|
| 965 |
+
# elif r2_has_fullname and not r1_has_fullname and r1_concat:
|
| 966 |
+
# result = handle_case2(r2_name_proc, r1_fname, r1_mname, r1_lname, r1_concat)
|
| 967 |
+
|
| 968 |
+
# # CASE 3: Both have F/M/L
|
| 969 |
+
# elif not r1_has_fullname and not r2_has_fullname and r1_concat and r2_concat:
|
| 970 |
+
# result = handle_case3(r1_fname, r1_mname, r1_lname, r1_concat,
|
| 971 |
+
# r2_fname, r2_mname, r2_lname, r2_concat)
|
| 972 |
+
|
| 973 |
+
# # Fallback if no case matched
|
| 974 |
+
# if result is None:
|
| 975 |
+
# result = {
|
| 976 |
+
# 'full_name_percent': 0.0,
|
| 977 |
+
# 'firstname_percent': 0.0,
|
| 978 |
+
# 'middlename_percent': 0.0,
|
| 979 |
+
# 'lastname_percent': 0.0
|
| 980 |
+
# }
|
| 981 |
+
|
| 982 |
+
# # ββ Apply accumulated adjustments to full_name_percent ββ
|
| 983 |
+
# if adjustment != 0:
|
| 984 |
+
# result['full_name_percent'] = max(0.0, min(100.0, result['full_name_percent'] + adjustment))
|
| 985 |
+
|
| 986 |
+
# return result
|
| 987 |
+
def handle_case1(full_name1: str, full_name2: str,
                 r1_fname: str, r1_mname: str, r1_lname: str,
                 r2_fname: str, r2_mname: str, r2_lname: str) -> dict:
    """
    Case 1: both records supply a full name.

    The full names are first compared with spaces removed and case folded;
    an exact hit scores 100. Otherwise each name is passed through
    ``_normalize_and_sort`` and the two results are scored with
    ``match_entities`` using ``NAME_MODEL_WEIGHTS``.

    Each first/middle/last pair is scored with ``match_entities`` only when
    both sides are non-empty; a pair with a missing side scores 0.0.

    Returns:
        dict with the keys 'full_name_percent', 'firstname_percent',
        'middlename_percent' and 'lastname_percent'.
    """
    def _pair_score(left, right):
        # A component contributes only when both records provide it.
        if left and right:
            return match_entities(left, right, NAME_MODEL_WEIGHTS)
        return 0.0

    # Space-agnostic, case-insensitive comparison on the unsorted strings.
    squashed1 = full_name1.replace(" ", "").upper()
    squashed2 = full_name2.replace(" ", "").upper()

    if squashed1 != squashed2:
        # Not identical: compare the normalised, token-sorted forms.
        whole_name_score = match_entities(
            _normalize_and_sort(full_name1),
            _normalize_and_sort(full_name2),
            NAME_MODEL_WEIGHTS,
        )
    else:
        whole_name_score = 100.0

    return {
        'full_name_percent': whole_name_score,
        'firstname_percent': _pair_score(r1_fname, r2_fname),
        'middlename_percent': _pair_score(r1_mname, r2_mname),
        'lastname_percent': _pair_score(r1_lname, r2_lname),
    }
|
| 1051 |
+
|
| 1052 |
+
def handle_case2(full_name: str,
                 fname: str, mname: str, lname: str,
                 concat_name: str) -> dict:
    """
    Case 2: one record has a full name, the other has first/middle/last parts.

    Args:
        full_name: Full name string from the record that has one.
        fname, mname, lname: Name parts from the other record (may be empty).
        concat_name: Concatenation of those parts, supplied by the caller.

    Returns:
        dict: {
            'full_name_percent': float,   # full_name vs concat_name
            'firstname_percent': float,   # full_name vs fname
            'middlename_percent': float,  # full_name vs mname
            'lastname_percent': float     # full_name vs lname
        }

    A component that is empty scores 0.0. The last name also scores 0.0 when
    its upper-cased form is listed in ``SURNAME_IDENTIFIER``.
    """
    # Comparison key for the full name: case-folded, spaces removed.
    # It does not change inside the loop, so it is built once.
    full_name_key = full_name.upper().strip().replace(" ", "")

    # 0. Try every ordering produced by _all_name_combinations. Both sides are
    #    upper-cased so the shortcut fires regardless of the casing the helper
    #    yields; handle_case1 compares its operands the same way.
    full_name_percent = None
    for permuted in _all_name_combinations(fname, mname, lname):
        if permuted.replace(" ", "").upper() == full_name_key:
            # Perfect match for the full-name component.
            full_name_percent = 100.0
            break

    # 1. No exact ordering matched: score the normalised, sorted forms.
    if full_name_percent is None:
        full_name_percent = match_entities(
            _normalize_and_sort(full_name),
            _normalize_and_sort(concat_name),
            NAME_MODEL_WEIGHTS,
        )

    # 2. First name vs the whole full name.
    if fname:
        firstname_percent = match_entities(full_name, fname, NAME_MODEL_WEIGHTS)
    else:
        firstname_percent = 0.0

    # 3. Middle name vs the whole full name.
    if mname:
        middlename_percent = match_entities(full_name, mname, NAME_MODEL_WEIGHTS)
    else:
        middlename_percent = 0.0

    # 4. Last name vs the whole full name, skipped for SURNAME_IDENTIFIER entries.
    if lname and lname.upper() not in SURNAME_IDENTIFIER:
        lastname_percent = match_entities(full_name, lname, NAME_MODEL_WEIGHTS)
    else:
        lastname_percent = 0.0

    return {
        'full_name_percent': full_name_percent,
        'firstname_percent': firstname_percent,
        'middlename_percent': middlename_percent,
        'lastname_percent': lastname_percent,
    }
|
| 1122 |
+
|
| 1123 |
+
|
| 1124 |
+
def handle_case3(r1_fname: str, r1_mname: str, r1_lname: str, r1_concat: str,
                 r2_fname: str, r2_mname: str, r2_lname: str, r2_concat: str) -> dict:
    """
    Case 3: both records carry first/middle/last parts and no full name.

    The concatenated names are scored with ``match_entities``. That score is
    then given a floor based on ``check_substring_match`` over the parts:

    * last name matches and neither first nor middle does -> at least 85.0
    * last name matches and first or middle also matches  -> at least 90.0
    * otherwise the score is left untouched

    Returns:
        dict: {
            'full_name_percent': float,   # r1_concat vs r2_concat (with floor)
            'firstname_percent': float,   # r1_fname vs r2_fname
            'middlename_percent': float,  # r1_mname vs r2_mname
            'lastname_percent': float     # r1_lname vs r2_lname
        }

    A component pair with a missing side scores 0.0. The last-name pair also
    scores 0.0 when either upper-cased value is in ``SURNAME_IDENTIFIER``.
    """
    both_first = r1_fname and r2_fname
    both_middle = r1_mname and r2_mname
    both_last = r1_lname and r2_lname

    # Substring agreement per component; a missing side counts as no match.
    first_hit = check_substring_match(r1_fname, r2_fname) if both_first else False
    middle_hit = check_substring_match(r1_mname, r2_mname) if both_middle else False
    last_hit = check_substring_match(r1_lname, r2_lname) if both_last else False

    overall = match_entities(r1_concat, r2_concat, NAME_MODEL_WEIGHTS)

    # The floor applies only when the last names agree.
    if last_hit:
        if first_hit or middle_hit:
            overall = max(overall, 90.0)  # last name plus another component
        else:
            overall = max(overall, 85.0)  # last name only (family match)

    first_score = 0.0
    if both_first:
        first_score = match_entities(r1_fname, r2_fname, NAME_MODEL_WEIGHTS)

    middle_score = 0.0
    if both_middle:
        middle_score = match_entities(r1_mname, r2_mname, NAME_MODEL_WEIGHTS)

    last_score = 0.0
    if (both_last
            and r1_lname.upper() not in SURNAME_IDENTIFIER
            and r2_lname.upper() not in SURNAME_IDENTIFIER):
        last_score = match_entities(r1_lname, r2_lname, NAME_MODEL_WEIGHTS)

    return {
        'full_name_percent': overall,
        'firstname_percent': first_score,
        'middlename_percent': middle_score,
        'lastname_percent': last_score,
    }
|
| 1196 |
+
|
| 1197 |
+
def match_name(name: str, firstname: str, lastname: str, middlename: str) -> float:
    """
    Compare a record's NAME field with the name built from its F/M/L parts.

    ``name`` is passed through ``preprocess_for_matching`` and the parts are
    joined with ``concatenate_name_parts``. The result is:

    * 0   when both values are empty
    * 100 when only one of them is present
    * 100 when both are present and equal
    * ``match_entities(processed_name, concatenated)`` when both are present
      but differ
    """
    processed = preprocess_for_matching(name)
    joined = concatenate_name_parts(firstname, middlename, lastname)

    # Nothing on either side.
    if not processed and not joined:
        return 0

    # Exactly one side available: nothing to contradict it.
    if not (processed and joined):
        return 100

    # Both available: identical strings short-circuit the model.
    if processed == joined:
        return 100

    return match_entities(processed, joined)
|
| 1224 |
+
|
| 1225 |
+
def match_names_cross_records(r1_name: str, r1_firstname: str, r1_lastname: str, r1_middlename: str,
                              r2_name: str, r2_firstname: str, r2_lastname: str, r2_middlename: str) -> dict:
    """
    Score the name similarity between two records.

    Callers are expected to pass values that are already lower-cased and
    preprocessed. Each field is stripped here, and ``None``, blank and ``"-"``
    values are treated as missing.

    Steps:
        1. Build an effective name per record: the full name when present,
           otherwise the lower-cased concatenation of first/middle/last.
        2. Accumulate adjustments from ``NAME_MATCH_ADJUSTMENTS``:
           - ``surname_penalty`` (default -30) when the records share a
             detected surname but their remaining tokens neither overlap
             nor reach a ``fuzz.ratio`` of 60;
           - ``surname_conflict_penalty`` (default -40) when both records
             have detected surnames and none is shared
             (e.g. "krishna rajput" vs "krishna singh");
           - ``initial_boost`` (default +30) when
             ``compute_initial_letter_boost`` returns a positive value;
           - ``subset_boost`` (default +40) when the token counts differ and
             ``is_subset_match`` is true.
        3. Compute base scores with ``handle_case1`` (both have a full name),
           ``handle_case2`` (one full name, one F/M/L) or ``handle_case3``
           (both F/M/L). If none applies, all scores are 0.0.
        4. Add the accumulated adjustment to ``full_name_percent`` only and
           clamp it to [0, 100].

    Returns:
        dict with the keys 'full_name_percent', 'firstname_percent',
        'middlename_percent' and 'lastname_percent'.
    """
    def _clean(value):
        """Strip a raw field; None, blanks and the "-" placeholder become ""."""
        if not value:
            return ""
        stripped = value.strip()
        return "" if stripped in ("-", "") else stripped

    # -- Normalise inputs --
    r1_name_proc = _clean(r1_name)
    r2_name_proc = _clean(r2_name)

    r1_fname = _clean(r1_firstname)
    r1_mname = _clean(r1_middlename)
    r1_lname = _clean(r1_lastname)

    r2_fname = _clean(r2_firstname)
    r2_mname = _clean(r2_middlename)
    r2_lname = _clean(r2_lastname)

    # -- Determine which representation each record offers --
    r1_has_fullname = bool(r1_name_proc)
    r2_has_fullname = bool(r2_name_proc)

    r1_concat = concatenate_name_parts(r1_fname, r1_mname, r1_lname).lower()
    r2_concat = concatenate_name_parts(r2_fname, r2_mname, r2_lname).lower()

    # Effective full-name string used for surname/initial/subset detection.
    name1_effective = r1_name_proc if r1_has_fullname else r1_concat
    name2_effective = r2_name_proc if r2_has_fullname else r2_concat

    # Both missing -> zero across the board.
    if not name1_effective and not name2_effective:
        return {
            'full_name_percent': 0.0,
            'firstname_percent': 0.0,
            'middlename_percent': 0.0,
            'lastname_percent': 0.0
        }

    # -- Adjustments are accumulated here and applied after the case handler --
    adjustment = 0
    surname_penalty_val = NAME_MATCH_ADJUSTMENTS.get("surname_penalty", -30)
    initial_boost_val = NAME_MATCH_ADJUSTMENTS.get("initial_boost", 30)
    subset_boost_val = NAME_MATCH_ADJUSTMENTS.get("subset_boost", 40)
    # Previously a literal -40; the default keeps the old behaviour when the
    # key is absent from the configuration.
    surname_conflict_val = NAME_MATCH_ADJUSTMENTS.get("surname_conflict_penalty", -40)

    # -- Surname detection --
    if name1_effective and name2_effective:
        surnames1 = detect_surnames(name1_effective)
        surnames2 = detect_surnames(name2_effective)

        if surnames1 and surnames2:
            if surnames1 & surnames2:
                # Shared surname: penalise only if nothing else agrees.
                tokens1_non_surname = [t for t in name1_effective.split() if t not in surnames1]
                tokens2_non_surname = [t for t in name2_effective.split() if t not in surnames2]

                if (tokens1_non_surname and tokens2_non_surname
                        and not (set(tokens1_non_surname) & set(tokens2_non_surname))):
                    non_surname1_str = " ".join(tokens1_non_surname)
                    non_surname2_str = " ".join(tokens2_non_surname)
                    if fuzz.ratio(non_surname1_str, non_surname2_str) < 60:
                        adjustment += surname_penalty_val  # e.g. -30
            else:
                # Both sides carry a known surname and none is shared.
                adjustment += surname_conflict_val  # e.g. -40

    # -- Sorted tokens for boost / subset detection --
    name1_tokens = sorted(name1_effective.split()) if name1_effective else []
    name2_tokens = sorted(name2_effective.split()) if name2_effective else []

    if name1_tokens and name2_tokens:
        # Initial-letter boost.
        if compute_initial_letter_boost(name1_tokens, name2_tokens) > 0:
            adjustment += initial_boost_val  # e.g. +30

        # Subset boost: only meaningful when the token counts differ.
        if (len(name1_tokens) != len(name2_tokens)
                and is_subset_match(name1_tokens, name2_tokens)):
            adjustment += subset_boost_val  # e.g. +40

    # -- Base similarity from the matching case handler --
    result = None

    if r1_has_fullname and r2_has_fullname:
        # CASE 1: both records have a full name. The sanitised components are
        # passed (not the raw arguments) so "-" placeholders and stray
        # whitespace are not scored as real names; cases 2 and 3 already
        # use the sanitised values.
        result = handle_case1(r1_name_proc, r2_name_proc,
                              r1_fname, r1_mname, r1_lname,
                              r2_fname, r2_mname, r2_lname)

    elif r1_has_fullname and not r2_has_fullname and r2_concat:
        # CASE 2: record 1 has a full name, record 2 has F/M/L.
        result = handle_case2(r1_name_proc, r2_fname, r2_mname, r2_lname, r2_concat)

    elif r2_has_fullname and not r1_has_fullname and r1_concat:
        # CASE 2 (mirrored): record 2 has a full name, record 1 has F/M/L.
        result = handle_case2(r2_name_proc, r1_fname, r1_mname, r1_lname, r1_concat)

    elif not r1_has_fullname and not r2_has_fullname and r1_concat and r2_concat:
        # CASE 3: both records have F/M/L only.
        result = handle_case3(r1_fname, r1_mname, r1_lname, r1_concat,
                              r2_fname, r2_mname, r2_lname, r2_concat)

    # Fallback when no case applied (e.g. one record has no name at all).
    if result is None:
        result = {
            'full_name_percent': 0.0,
            'firstname_percent': 0.0,
            'middlename_percent': 0.0,
            'lastname_percent': 0.0
        }

    # -- Apply the accumulated adjustment to full_name_percent only --
    if adjustment != 0:
        result['full_name_percent'] = max(0.0, min(100.0, result['full_name_percent'] + adjustment))

    return result
|
| 1357 |
+
|
| 1358 |
+
def match_addresses_1_to_n(addresses_r1: List[str], addresses_r2: List[str]) -> float:
    """
    Match two lists of plain address-line strings and return the best pair score.

    Pipeline, per address pair:
        1. Extract components (house_number, flat_number, apartment, street)
           and the remaining address text from each address.
        2. Score the remaining text (post-box stripped) with ``match_entities``
           using ``ADDRESS_MODEL_WEIGHTS`` -> base_score. A result that cannot
           be converted to float counts as 0.0.
        3. If base_score > 60, apply a per-component boost/penalty when both
           sides have the component:
               house_number : match +30 / mismatch -30
               flat_number  : match +10 / mismatch -10
               apartment    : match +10 / mismatch -10
               street       : match +10 / mismatch -10
           If base_score <= 60, component adjustments are skipped.
        4. Add the named-component and post-box adjustments, which are
           computed on the full preprocessed addresses.
        5. Clamp the pair score to [0, 100].

    Args:
        addresses_r1: Address lines of record 1. ``None``, blank and ``"-"``
            entries are ignored; a ``None`` list is treated as empty.
        addresses_r2: Address lines of record 2, same rules.

    Returns:
        The highest pair score rounded to 2 decimals, or 0 when either side
        has no usable address.

    Diagnostic lines are printed to stdout for every pair.
    """
    def _usable(addr):
        """True for a non-empty address that is not the "-" placeholder."""
        return bool(addr) and str(addr).strip() not in ("-", "")

    raw1 = [a for a in (addresses_r1 or []) if _usable(a)]
    raw2 = [a for a in (addresses_r2 or []) if _usable(a)]

    # Guard before the lazy import: nothing to compare, nothing to load.
    if not raw1 or not raw2:
        return 0

    from services.rules import (
        preprocess_address as _preprocess_addr,
        compare_named_components as _compare_named,
        compare_postbox as _compare_postbox,
        remove_postbox_from_address as _strip_postbox,
        extract_address_components as _extract_components,
    )

    def _norm(val):
        """Strip all non-alphanumerics, e.g. 144/143 -> 144143."""
        if not val:
            return ""
        return re.sub(r'[^A-Z0-9]', '', str(val).upper())

    def _component_adj(v1, v2, boost, penalty):
        """Return (verdict, adjustment) for a single component pair."""
        if v1 and v2:
            return ("match", boost) if v1 == v2 else ("mismatch", -penalty)
        return ("missing", 0.0)

    def _prepare(raw):
        """Compute everything that depends on one address only, once."""
        comp = _extract_components(raw)
        full = _preprocess_addr(raw).upper()
        # `or ""` guards a key that is present with a None value.
        remaining = (comp.get("remaining_address") or "").strip()
        if not remaining:
            # Fall back to the full preprocessed address.
            remaining = full
        return {
            "house_number": _norm(comp.get("house_number")),
            "flat_number": _norm(comp.get("flat_number")),
            "apartment": _norm(comp.get("apartment")),
            "street": _norm(comp.get("street")),
            "clean": _strip_postbox(remaining) or remaining,
            "full": full,
        }

    # (label, boost, penalty) for each structured component.
    component_rules = (
        ("house_number", 30.0, 30.0),
        ("flat_number", 10.0, 10.0),
        ("apartment", 10.0, 10.0),
        ("street", 10.0, 10.0),
    )

    prepared1 = [_prepare(a) for a in raw1]
    prepared2 = [_prepare(a) for a in raw2]

    best_score = 0.0

    for p1 in prepared1:
        for p2 in prepared2:
            addr1_clean = p1["clean"]
            addr2_clean = p2["clean"]

            # Named components and post-box are compared on the full address.
            named_result = _compare_named(p1["full"], p2["full"])
            pb_result = _compare_postbox(p1["full"], p2["full"])

            try:
                base_score = float(match_entities(addr1_clean, addr2_clean,
                                                  weights=ADDRESS_MODEL_WEIGHTS))
            except (TypeError, ValueError):
                # Non-numeric result from the matcher: treat as no similarity.
                base_score = 0.0

            # -- Component adjustments (only when base_score > 60) --
            comp_adj = 0.0
            print(f"[ADDR_COMPONENTS] base_score={base_score:.2f} | threshold=60 | adjustments_applied={base_score > 60}")
            print(f" remaining_addr1 : {addr1_clean!r}")
            print(f" remaining_addr2 : {addr2_clean!r}")
            for label, boost, penalty in component_rules:
                v1 = p1[label]
                v2 = p2[label]
                verdict, adj = _component_adj(v1, v2, boost, penalty)
                if verdict == "missing":
                    print(f" {label:<15} | verdict=missing | v1={v1!r:>10} v2={v2!r:<10} | adjustment=0.0 [skipped - component absent]")
                elif base_score <= 60:
                    print(f" {label:<15} | verdict={verdict:<9} | v1={v1!r:>10} v2={v2!r:<10} | adjustment=0.0 [SKIPPED - base_score<=60]")
                else:
                    comp_adj += adj
                    sign = "+" if adj >= 0 else ""
                    tag = "BOOSTED" if adj > 0 else "PENALISED"
                    print(f" {label:<15} | verdict={verdict:<9} | v1={v1!r:>10} v2={v2!r:<10} | adjustment={sign}{adj:.1f} [{tag}]")
            print(f" total comp_adj : {comp_adj:+.1f}")

            adjustment = comp_adj + named_result['score_adjustment'] + pb_result['adjustment']
            final_score = max(0.0, min(100.0, base_score + adjustment))
            if final_score > best_score:
                best_score = final_score

    return round(best_score, 2)
|
| 1475 |
+
|
| 1476 |
+
|
| 1477 |
+
def match_addresses_structured(
    addrs_r1: List[dict],
    addrs_r2: List[dict],
) -> float:
    """
    Match addresses whose city / zipcode / state come as separate columns.

    This is a thin wrapper: both lists are handed unchanged to
    ``services.rules.match_structured_address_lists`` and its result is
    returned as-is. The notes below restate the original description of that
    routine and are not verified here.

    Each address dict is expected to have the keys: addressline, city,
    zipcode, state. The result is described as the best score (0-100)
    across all N x M combinations.

    Described handling:
        - Missing state/city -> inferred from zipcode via pgeocode (offline)
        - Bank state codes (NDH, BLR, ...) -> canonical form
        - City name variants -> canonical via CITY_MAPPING
        - House number extraction + comparison
        - Full addressline text via embedding model

    Example:
        addrs1 = [{"addressline": "A13 GUPTA ENCLAVE...",
                   "city": "NEW DELHI", "zipcode": "110059", "state": "NDH"}]
        addrs2 = [{"addressline": "A13 GUPTA ENCLAVE...",
                   "city": "NEW DELHI", "zipcode": "110059", "state": "DELHI"}]
        score = match_addresses_structured(addrs1, addrs2)  # -> ~100
    """
    # Imported at call time, as in the rest of this module's address helpers.
    from services.rules import match_structured_address_lists as _sa_match

    best = _sa_match(addrs_r1, addrs_r2)
    return best
|
| 1503 |
+
|
| 1504 |
+
def match_single_field(value1: str, value2: str) -> float:
    """
    Match a single free-text field (e.g. SPOUSENAME, MOTHERNAME).

    Delegates to ``match_entities`` with its default weights and returns
    whatever that call produces.
    NOTE(review): the previous docstring said the result may also be the
    string "missing value" - confirm against match_entities.
    """
    similarity = match_entities(value1, value2)
    return similarity
|
services/rules.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|