| from concurrent.futures import ThreadPoolExecutor |
| from typing import Dict, List, Optional, Tuple |
| import pandas as pd |
| from rapidfuzz import fuzz |
| from rapidfuzz.distance import JaroWinkler |
| from sklearn.metrics.pairwise import cosine_similarity |
| from sentence_transformers import SentenceTransformer |
| import re |
| import itertools |
|
|
| from services.config import ( |
| SURNAME_IDENTIFIER, MODEL_WEIGHTS, MODEL_1_NAME, MODEL_2_NAME, |
| NAME_MODEL_WEIGHTS, NAME_MATCH_ADJUSTMENTS, |
| ADDRESS_MODEL_WEIGHTS, |
| ) |
| from services.rules import detect_surnames, compute_initial_letter_boost, is_subset_match |
|
|
| |
# Module-level cache of loaded embedding models, keyed by logical model name
# ("model1" / "model2"), so each checkpoint is instantiated at most once.
MODEL_STORE = {}


def get_model(model_name: str) -> "SentenceTransformer":
    """
    Return the SentenceTransformer registered under ``model_name``.

    The model is created on first request (on CPU) and stored in MODEL_STORE;
    later calls return the stored instance.

    Args:
        model_name: Logical key. "model1" loads MODEL_1_NAME and "model2"
            loads MODEL_2_NAME.

    Returns:
        The SentenceTransformer instance held in MODEL_STORE.

    Raises:
        KeyError: If ``model_name`` is not cached and is not a supported key.
    """
    if model_name in MODEL_STORE:
        return MODEL_STORE[model_name]

    if model_name == "model1":
        checkpoint = MODEL_1_NAME
    elif model_name == "model2":
        checkpoint = MODEL_2_NAME
    else:
        # Fail before logging "Loading ..." so an unsupported key produces an
        # explicit error instead of a misleading log line and a bare KeyError.
        raise KeyError(
            f"Unknown model name {model_name!r}; expected 'model1' or 'model2'"
        )

    print(f"Loading {model_name} into memory on CPU...")
    MODEL_STORE[model_name] = SentenceTransformer(checkpoint, device="cpu")
    return MODEL_STORE[model_name]
|
|
|
|
| |
def preprocess_for_matching(text: str) -> str:
    """
    Standardize text for matching.

    Surrounding whitespace is removed and the text is upper-cased. Missing
    values (None, empty or whitespace-only strings) and the "-" placeholder
    are normalized to the empty string.

    Args:
        text: Raw field value, possibly None.

    Returns:
        The stripped, upper-cased text, or "" when the value is missing.
    """
    if not text:
        return ""
    cleaned = text.strip()
    # Test for the placeholder AFTER stripping so padded values such as " - "
    # are treated as missing instead of surviving as a literal "-".
    if cleaned in ("", "-"):
        return ""
    return cleaned.upper()
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
|
|
def indic_soundex_code(name: str) -> str:
    """
    Build the 4-character Indic Soundex code of a single name token.

    Steps:
    1. Upper-case and strip the token; empty input yields "".
    2. Collapse aspirated digraphs to their base consonant (SH->S, PH->F,
       TH->T, DH->D, KH->K, GH->G, BH->B, CH->C, JH->J), in that order.
    3. Keep the first character verbatim, then append the class digit of each
       following consonant unless it repeats the class of the immediately
       preceding consonant. Vowels, H, Y and spaces emit nothing and reset
       that "previous class" memory.
    4. Right-pad with '0' and truncate to four characters.

    [MODIFIED 2026-03-15]
    - J, S and Z form their own class (7), separate from the C/G/K/Q/X
      class (2), so that names like Rajesh and Rakesh get different codes.
    """
    token = name.upper().strip() if name else ""
    if not token:
        return ""

    digraph_bases = (
        ("SH", "S"), ("PH", "F"), ("TH", "T"), ("DH", "D"),
        ("KH", "K"), ("GH", "G"), ("BH", "B"), ("CH", "C"), ("JH", "J"),
    )
    for pair, replacement in digraph_bases:
        token = token.replace(pair, replacement)

    # Consonant classes: digit -> letters sharing that digit.
    classes = (
        ("1", "BFPVW"),
        ("2", "CGKQX"),
        ("3", "DT"),
        ("4", "L"),
        ("5", "MN"),
        ("6", "R"),
        ("7", "JSZ"),
    )
    digit_of = {letter: digit for digit, letters in classes for letter in letters}

    resetting_chars = frozenset("AEIOUHY ")
    pieces = [token[0]]
    last_digit = digit_of.get(token[0], "0")
    for ch in token[1:]:
        if ch in resetting_chars:
            last_digit = "0"
        else:
            current = digit_of.get(ch, "0")
            # Unmapped characters ('0') and repeats of the previous class
            # contribute nothing to the code.
            if current not in ("0", last_digit):
                pieces.append(current)
            last_digit = current

    return ("".join(pieces) + "000")[:4]
|
|
|
|
def indic_soundex_similarity(text1: str, text2: str) -> float:
    """
    Score two texts (0-100) by comparing the Indic Soundex codes of their tokens.

    Each text is upper-cased and split on whitespace, and every token is
    encoded with indic_soundex_code. Tokens of the text with fewer tokens are
    then matched greedily, in order, against the not-yet-used tokens of the
    other text; a pair's score is the fraction of the four code positions that
    agree. The result is the mean best score over the shorter side, times 100.

    Returns 0.0 when either text has no tokens.
    """
    words_a = text1.upper().split() if text1 else []
    words_b = text2.upper().split() if text2 else []
    if not (words_a and words_b):
        return 0.0

    codes_a = [indic_soundex_code(word) for word in words_a]
    codes_b = [indic_soundex_code(word) for word in words_b]

    # Iterate over the side with fewer tokens; ties keep text1 as that side.
    if len(codes_a) <= len(codes_b):
        few, many = codes_a, codes_b
    else:
        few, many = codes_b, codes_a

    claimed = set()
    score_sum = 0.0
    for code in few:
        top_score, top_pos = 0.0, -1
        for pos, candidate in enumerate(many):
            if pos in claimed:
                continue
            agreement = sum(a == b for a, b in zip(code, candidate)) / 4.0
            # Strict '>' keeps the earliest candidate on ties and leaves
            # top_pos at -1 when nothing agrees at all.
            if agreement > top_score:
                top_score, top_pos = agreement, pos
        if top_pos >= 0:
            claimed.add(top_pos)
            score_sum += top_score

    return (score_sum / len(few)) * 100
|
|
| |
def calculate_fuzzy_scores(input1: str, input2: str) -> Dict[str, float]:
    """
    Compute the five RapidFuzz scores for a pair of strings.

    Returns a dict with the keys "simple_ratio", "token_set_ratio", "w_ratio",
    "partial_ratio" and "token_sort_ratio", each holding the result of the
    corresponding rapidfuzz.fuzz scorer applied to (input1, input2).
    """
    scorers = (
        ("simple_ratio", fuzz.ratio),
        ("token_set_ratio", fuzz.token_set_ratio),
        ("w_ratio", fuzz.WRatio),
        ("partial_ratio", fuzz.partial_ratio),
        ("token_sort_ratio", fuzz.token_sort_ratio),
    )
    return {label: scorer(input1, input2) for label, scorer in scorers}
|
|
def calculate_semantic_similarity(model_name: str, input1: str, input2: str) -> float:
    """
    Return the cosine similarity between the embeddings of two texts.

    The model is obtained through get_model(model_name); each text is encoded
    separately (progress bar disabled) and the single entry of the resulting
    1x1 cosine-similarity matrix is returned.
    """
    encoder = get_model(model_name)
    vector_a, vector_b = (
        encoder.encode([text], show_progress_bar=False)
        for text in (input1, input2)
    )
    similarity_matrix = cosine_similarity(vector_a, vector_b)
    return similarity_matrix[0][0]
|
|
def calculate_final_score(fuzzy_scores: Dict[str, float], semantic_score: float) -> float:
    """
    Combine fuzzy scores and a semantic score into one weighted score.

    Every key of MODEL_WEIGHTS is looked up among the supplied fuzzy scores
    plus "semantic_score" (the semantic value scaled by 100). A weighted
    component with no available score contributes 0, mirroring the lookup
    used by calculate_similarity_with_models, so weight keys such as
    "token_sort_ratio" no longer raise KeyError.

    Args:
        fuzzy_scores: Mapping of scorer name to score (0-100 scale).
        semantic_score: Cosine similarity on a 0-1 scale.

    Returns:
        The weighted sum, clamped to the range [0, 100].
    """
    component_scores = dict(fuzzy_scores)
    component_scores["semantic_score"] = semantic_score * 100
    weighted_sum = sum(
        component_scores.get(key, 0) * weight
        for key, weight in MODEL_WEIGHTS.items()
    )
    return max(0, min(100, weighted_sum))
|
|
def calculate_overall_similarity(score1: float, score2: float) -> float:
    """Blend two model scores: 60% of ``score1`` plus 40% of ``score2``."""
    first_share = score1 * 0.6
    second_share = score2 * 0.4
    return first_share + second_share
|
|
def check_substring_match(str1: str, str2: str) -> bool:
    """
    Tell whether either string occurs inside the other.

    Returns False when either argument is empty or None.
    """
    if str1 and str2:
        return str1 in str2 or str2 in str1
    return False
|
|
def check_individual_name_matches(name_full: str, fname: str, mname: str, lname: str) -> Tuple[bool, bool, bool]:
    """
    Test each name part against the full name with check_substring_match.

    A part that is empty or None is reported as False without being compared.

    Returns: (first_match, middle_match, last_match)
    """
    flags = [
        check_substring_match(name_full, part) if part else False
        for part in (fname, mname, lname)
    ]
    return flags[0], flags[1], flags[2]
|
|
|
|
def concatenate_name_parts(firstname: str, middlename: str, lastname: str) -> str:
    """
    Join the usable name parts into one upper-cased, sorted string.

    Parts that are None, empty, whitespace-only, or the "-" placeholder (with
    or without surrounding spaces) are dropped. The remaining parts are
    stripped, upper-cased, sorted alphabetically and joined with single
    spaces, so the result does not depend on which field a token came from.

    Returns:
        The joined string, or "" when no part is usable.
    """
    parts = []
    for raw in (firstname, middlename, lastname):
        if not raw:
            continue
        cleaned = raw.strip().upper()
        # Check AFTER stripping: a whitespace-only part would otherwise add an
        # empty token (and a stray leading space), and " - " would be kept as
        # a literal "-" token.
        if cleaned and cleaned != "-":
            parts.append(cleaned)
    return " ".join(sorted(parts))
|
|
| |
| def _normalize_and_sort(name: str) -> str: |
| """ |
| 1. Split on any non-alphanumeric character (space, underscore, comma, etc.) |
| 2. Remove empty tokens |
| 3. Upper-case |
| 4. Sort alphabetically |
| 5. Re-join with single space |
| """ |
| tokens = re.split(r'[^A-Za-z0-9]+', name.strip()) |
| tokens = [t.upper() for t in tokens if t] |
| return ' '.join(sorted(tokens)) |
|
|
| def _all_name_combinations(fname: str, mname: str, lname: str) -> list[str]: |
| """ |
| Return every possible ordering of the supplied parts, |
| dropping any empty/blank components. |
| """ |
| parts = [] |
| for p in (fname, mname, lname): |
| if p and p.strip() not in ('-', '', ' '): |
| parts.append(p.strip().upper()) |
| if not parts: |
| return [] |
| |
| return [' '.join(order) for order in itertools.permutations(parts)] |
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
|
|
| |
|
|
def _any_ordering_equals(tokens, compact_target):
    """Return True if some ordering of ``tokens``, glued together without spaces, equals ``compact_target``."""
    return any(
        "".join(ordering) == compact_target
        for ordering in itertools.permutations(tokens)
    )


def match_entities(value1: str, value2: str, weights: Dict[str, float] = None) -> float:
    """
    Score the similarity of two values (0-100) with exact, rule-based and
    model-based checks, applied in this order:

    1. Both values are normalized with preprocess_for_matching; if either is
       empty the score is 0.
    2. Space-agnostic equality ("Pujitha Sharma" vs "pujithasharma") -> 100.
    3. When both sides have at most four tokens, any token ordering of one
       side that equals the other side with spaces removed
       ("Sharma Gari Pujitha" vs "Pujitha Sharma Gari") -> 100.
    4. Initials / acronym check, used when the two sides share at least one
       token and both have leftover tokens:
       - if the leftovers of at least one side are all single letters, the
         initials are compared with the first letters of the other side's
         leftovers: compatible sets give max(90, model score)
         (K V Reddy vs Katta Venkata Reddy), incompatible ones give the model
         score minus 40 (C Anitha vs H Anitha);
       - otherwise, if the leftovers are dissimilar (fuzz.ratio < 65), the
         model score minus 40 is returned.
    5. Model score from calculate_similarity_with_models.
    6. Phonetic audit, only for single-token pairs: identical Indic Soundex
       codes with close spelling and length raise the score to at least 95
       (likitha vs likheetha); a phonetic score of 80 or less caps a model
       score above 55 at min(score - 25, 55) (rajesh vs rakesh).

    ``weights`` selects the score components used by the model step and
    defaults to MODEL_WEIGHTS.

    Returns: similarity score (0-100)
    """
    if weights is None:
        weights = MODEL_WEIGHTS

    left = preprocess_for_matching(value1)
    right = preprocess_for_matching(value2)
    if not (left and right):
        return 0

    left_compact = left.replace(" ", "")
    right_compact = right.replace(" ", "")
    if left_compact == right_compact:
        return 100.0

    left_tokens = left.split()
    right_tokens = right.split()

    # Order-insensitive exact match. Comparing the space-free forms also covers
    # the spaced comparison, because a spaced match implies a space-free one.
    if max(len(left_tokens), len(right_tokens)) <= 4:
        if (_any_ordering_equals(left_tokens, right_compact)
                or _any_ordering_equals(right_tokens, left_compact)):
            return 100.0

    if left_tokens and right_tokens:
        shared = set(left_tokens) & set(right_tokens)
        left_rest = [tok for tok in left_tokens if tok not in shared]
        right_rest = [tok for tok in right_tokens if tok not in shared]

        if shared and left_rest and right_rest:
            left_all_initials = all(len(tok) == 1 for tok in left_rest)
            right_all_initials = all(len(tok) == 1 for tok in right_rest)

            if left_all_initials or right_all_initials:
                # The left leftovers act as the initials unless only the right
                # side consists purely of single letters.
                if right_all_initials and not left_all_initials:
                    initials, expanded = right_rest, left_rest
                else:
                    initials, expanded = left_rest, right_rest

                initial_letters = {tok[0] for tok in initials}
                leading_letters = {tok[0] for tok in expanded if tok}
                model_score = calculate_similarity_with_models(left, right, weights)
                if (initial_letters.issubset(leading_letters)
                        or leading_letters.issubset(initial_letters)):
                    return max(90.0, model_score)
                return max(0.0, model_score - 40.0)

            # NOTE(review): the source lost its indentation; this branch is
            # assumed to be the "neither side is pure initials" alternative of
            # the initials check above - confirm against the original file.
            leftover_similarity = fuzz.ratio(" ".join(left_rest), " ".join(right_rest))
            if leftover_similarity < 65.0:
                model_score = calculate_similarity_with_models(left, right, weights)
                return max(0.0, model_score - 40.0)

    base_score = calculate_similarity_with_models(left, right, weights)

    # Final phonetic audit, restricted to single-token comparisons.
    if len(left_tokens) == 1 and len(right_tokens) == 1:
        phonetic = indic_soundex_similarity(left, right)
        if phonetic == 100.0:
            if fuzz.ratio(left, right) > 65 and abs(len(left) - len(right)) <= 2:
                return max(95.0, base_score)
        elif phonetic <= 80.0 and base_score > 55.0:
            return min(base_score - 25.0, 55.0)

    return base_score
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
|
|
def calculate_similarity_with_models(text1: str, text2: str, weights: Dict[str, float] = None) -> float:
    """
    Calculate similarity from fuzzy scores, two embedding models and optional
    phonetic scores.

    Component scores (0-100 scale):
    - the five RapidFuzz scores from calculate_fuzzy_scores;
    - "jaro_winkler" and "indic_soundex", computed only when ``weights``
      gives them a positive weight;
    - "semantic_score", the cosine similarity of the two texts' embeddings
      times 100, computed once with "model1" and once with "model2" (the two
      models are evaluated concurrently in a thread pool).

    For each model the components are combined as sum(score * weight) over the
    entries of ``weights`` (components without a score count as 0); the final
    value is 0.6 * model1 total + 0.4 * model2 total, clamped to [0, 100] and
    rounded to two decimals.

    Args:
        text1, text2: Texts to compare; converted with str() and stripped.
        weights: Component weights; defaults to MODEL_WEIGHTS.

    Returns:
        Similarity percentage as a built-in float (0-100). 0.0 when either
        text is empty, 100.0 when the texts are equal ignoring spaces.
    """
    if weights is None:
        weights = MODEL_WEIGHTS

    if not text1 or not text2:
        return 0.0

    text1 = str(text1).strip()
    text2 = str(text2).strip()
    if not text1 or not text2:
        return 0.0

    # Space-agnostic exact match needs no model work.
    if text1.replace(" ", "") == text2.replace(" ", ""):
        return 100.0

    # Reuse the shared helper instead of duplicating the five scorer calls.
    component_scores = dict(calculate_fuzzy_scores(text1, text2))

    if weights.get("jaro_winkler", 0) > 0:
        component_scores["jaro_winkler"] = JaroWinkler.similarity(text1, text2) * 100
    if weights.get("indic_soundex", 0) > 0:
        component_scores["indic_soundex"] = indic_soundex_similarity(text1, text2)

    # Resolve both models on the calling thread so that loading (and the
    # MODEL_STORE update) never happens inside the worker threads.
    model1 = get_model("model1")
    model2 = get_model("model2")

    def _cosine(model):
        # Progress bars are disabled, as in calculate_semantic_similarity.
        return cosine_similarity(
            model.encode([text1], show_progress_bar=False),
            model.encode([text2], show_progress_bar=False),
        )[0][0]

    with ThreadPoolExecutor(max_workers=2) as executor:
        future1 = executor.submit(_cosine, model1)
        future2 = executor.submit(_cosine, model2)
        cosine1 = future1.result()
        cosine2 = future2.result()

    def _weighted_total(semantic_cosine):
        scores = dict(component_scores, semantic_score=semantic_cosine * 100)
        return sum(scores.get(key, 0) * weight for key, weight in weights.items())

    overall_similarity = _weighted_total(cosine1) * 0.6 + _weighted_total(cosine2) * 0.4
    # cosine_similarity yields numpy scalars; convert so callers always get a
    # plain Python float (also when the value is clamped to the int 0 or 100).
    return round(float(max(0, min(100, overall_similarity))), 2)
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
def handle_case1(full_name1: str, full_name2: str,
                 r1_fname: str, r1_mname: str, r1_lname: str,
                 r2_fname: str, r2_mname: str, r2_lname: str) -> dict:
    """
    Case 1: both records supply a full name.

    The full names score 100 when they are equal ignoring spaces and case;
    otherwise both are passed through _normalize_and_sort and compared with
    match_entities. Each name part is compared with match_entities only when
    both records provide it, and scores 0.0 otherwise. All comparisons use
    NAME_MODEL_WEIGHTS.

    Returns:
        dict: {
            'full_name_percent': float,   # full_name1 vs full_name2
            'firstname_percent': float,   # r1_fname vs r2_fname
            'middlename_percent': float,  # r1_mname vs r2_mname
            'lastname_percent': float     # r1_lname vs r2_lname
        }
    """
    def _part_score(left, right):
        if left and right:
            return match_entities(left, right, NAME_MODEL_WEIGHTS)
        return 0.0

    compact1 = full_name1.replace(" ", "").upper()
    compact2 = full_name2.replace(" ", "").upper()
    if compact1 == compact2:
        full_score = 100.0
    else:
        full_score = match_entities(
            _normalize_and_sort(full_name1),
            _normalize_and_sort(full_name2),
            NAME_MODEL_WEIGHTS,
        )

    return {
        'full_name_percent': full_score,
        'firstname_percent': _part_score(r1_fname, r2_fname),
        'middlename_percent': _part_score(r1_mname, r2_mname),
        'lastname_percent': _part_score(r1_lname, r2_lname),
    }
|
|
def handle_case2(full_name: str,
                 fname: str, mname: str, lname: str,
                 concat_name: str) -> dict:
    """
    Case 2: one record has a full name, the other has first/middle/last parts.

    The full-name score is 100 when any ordering of the parts (from
    _all_name_combinations) equals the upper-cased full name with spaces
    ignored; otherwise the normalized-and-sorted full name is compared with
    the normalized-and-sorted ``concat_name`` via match_entities. Each
    supplied part is additionally compared against the whole full name. The
    last name is scored 0.0 when it is missing or when its upper-cased form
    is in SURNAME_IDENTIFIER. All comparisons use NAME_MODEL_WEIGHTS.

    Returns:
        dict: {
            'full_name_percent': float,   # full_name vs concat_name
            'firstname_percent': float,   # full_name vs fname
            'middlename_percent': float,  # full_name vs mname
            'lastname_percent': float     # full_name vs lname
        }
    """
    compact_full = full_name.upper().strip().replace(" ", "")
    exact_reordering = any(
        candidate.replace(" ", "") == compact_full
        for candidate in _all_name_combinations(fname, mname, lname)
    )
    if exact_reordering:
        full_score = 100.0
    else:
        full_score = match_entities(
            _normalize_and_sort(full_name),
            _normalize_and_sort(concat_name),
            NAME_MODEL_WEIGHTS,
        )

    first_score = match_entities(full_name, fname, NAME_MODEL_WEIGHTS) if fname else 0.0
    middle_score = match_entities(full_name, mname, NAME_MODEL_WEIGHTS) if mname else 0.0

    if lname and lname.upper() not in SURNAME_IDENTIFIER:
        last_score = match_entities(full_name, lname, NAME_MODEL_WEIGHTS)
    else:
        last_score = 0.0

    return {
        'full_name_percent': full_score,
        'firstname_percent': first_score,
        'middlename_percent': middle_score,
        'lastname_percent': last_score,
    }
|
|
|
|
def handle_case3(r1_fname: str, r1_mname: str, r1_lname: str, r1_concat: str,
                 r2_fname: str, r2_mname: str, r2_lname: str, r2_concat: str) -> dict:
    """
    Case 3: both records supply first/middle/last parts (no full name).

    The full-name score is match_entities(r1_concat, r2_concat), raised to a
    floor when the last names contain one another (check_substring_match):
    at least 90 if the first or middle names also do, otherwise at least 85.
    Each part is compared with match_entities only when both records provide
    it; the last name additionally scores 0.0 when either upper-cased value
    is in SURNAME_IDENTIFIER. All comparisons use NAME_MODEL_WEIGHTS.

    Returns:
        dict: {
            'full_name_percent': float,   # r1_concat vs r2_concat
            'firstname_percent': float,   # r1_fname vs r2_fname
            'middlename_percent': float,  # r1_mname vs r2_mname
            'lastname_percent': float     # r1_lname vs r2_lname
        }
    """
    def _contains(left, right):
        return check_substring_match(left, right) if left and right else False

    def _part_score(left, right):
        if left and right:
            return match_entities(left, right, NAME_MODEL_WEIGHTS)
        return 0.0

    first_hit = _contains(r1_fname, r2_fname)
    middle_hit = _contains(r1_mname, r2_mname)
    last_hit = _contains(r1_lname, r2_lname)

    full_score = match_entities(r1_concat, r2_concat, NAME_MODEL_WEIGHTS)
    if last_hit:
        floor = 90.0 if (first_hit or middle_hit) else 85.0
        full_score = max(full_score, floor)

    first_score = _part_score(r1_fname, r2_fname)
    middle_score = _part_score(r1_mname, r2_mname)

    surname_is_generic = bool(
        r1_lname and r2_lname
        and (r1_lname.upper() in SURNAME_IDENTIFIER
             or r2_lname.upper() in SURNAME_IDENTIFIER)
    )
    last_score = 0.0 if surname_is_generic else _part_score(r1_lname, r2_lname)

    return {
        'full_name_percent': full_score,
        'firstname_percent': first_score,
        'middlename_percent': middle_score,
        'lastname_percent': last_score,
    }
|
|
def match_name(name: str, firstname: str, lastname: str, middlename: str) -> float:
    """
    Compare a full name with the name built from first/middle/last parts.

    ``name`` is normalized with preprocess_for_matching and the parts are
    combined with concatenate_name_parts. Outcomes:
    - both sides empty          -> 0
    - exactly one side empty    -> 100
    - both present and equal    -> 100
    - both present and differ   -> match_entities(name, combined parts)
    """
    full = preprocess_for_matching(name)
    combined = concatenate_name_parts(firstname, middlename, lastname)

    if not full and not combined:
        return 0
    if not full or not combined:
        return 100
    if full == combined:
        return 100
    return match_entities(full, combined)
|
|
def match_names_cross_records(r1_name: str, r1_firstname: str, r1_lastname: str, r1_middlename: str,
                              r2_name: str, r2_firstname: str, r2_lastname: str, r2_middlename: str) -> Dict[str, float]:
    """
    Match names between two records.

    Pipeline:
    1. Every field is stripped; None, blank and "-" values become "".
       (Inputs are expected to be already lowercase and preprocessed.)
    2. The "effective" name of a record is its full name when present,
       otherwise the lower-cased concatenation of its first/middle/last parts.
       If neither record has an effective name, all scores are 0.0.
    3. An adjustment to the full-name score is accumulated:
       - surnames (detect_surnames) found on both sides and sharing at least
         one surname, while the non-surname tokens neither overlap nor look
         alike (fuzz.ratio < 60) -> surname_penalty (default -30);
       - surnames found on both sides with nothing in common -> -40
         (e.g. Krishna Rajput vs Krishna Singh);
       - compute_initial_letter_boost > 0 -> initial_boost (default +30);
       - different token counts and is_subset_match -> subset_boost
         (default +40).
       The configurable values come from NAME_MATCH_ADJUSTMENTS.
    4. Three-case scoring: both full names (handle_case1), one full name plus
       parts (handle_case2), or parts on both sides (handle_case3). If no case
       applies, all scores are 0.0.
    5. The adjustment is added to 'full_name_percent', clamped to [0, 100].

    [MODIFIED 2026-03-15]
    - The handle_case functions pass exact permutation checking down to
      match_entities() instead of bypassing it to the ML models.
    - handle_case2 yields separate first, middle and last name scores instead
      of assuming 100% across the board.
    - A -40 penalty applies when two recognized surnames contradict each
      other completely.

    Returns:
        dict with the keys 'full_name_percent', 'firstname_percent',
        'middlename_percent' and 'lastname_percent' (floats, 0-100).
    """
    def _clean(value):
        """Strip a raw field; None, blank and "-" become ""."""
        if not value:
            return ""
        stripped = value.strip()
        return "" if stripped in ("-", "") else stripped

    def _zero_scores():
        return {
            'full_name_percent': 0.0,
            'firstname_percent': 0.0,
            'middlename_percent': 0.0,
            'lastname_percent': 0.0
        }

    r1_name_proc = _clean(r1_name)
    r2_name_proc = _clean(r2_name)

    r1_fname, r1_mname, r1_lname = _clean(r1_firstname), _clean(r1_middlename), _clean(r1_lastname)
    r2_fname, r2_mname, r2_lname = _clean(r2_firstname), _clean(r2_middlename), _clean(r2_lastname)

    r1_has_fullname = bool(r1_name_proc)
    r2_has_fullname = bool(r2_name_proc)

    r1_concat = concatenate_name_parts(r1_fname, r1_mname, r1_lname).lower()
    r2_concat = concatenate_name_parts(r2_fname, r2_mname, r2_lname).lower()

    # Effective name: the full name when present, otherwise the joined parts.
    name1_effective = r1_name_proc if r1_has_fullname else r1_concat
    name2_effective = r2_name_proc if r2_has_fullname else r2_concat

    if not name1_effective and not name2_effective:
        return _zero_scores()

    adjustment = 0
    surname_penalty_val = NAME_MATCH_ADJUSTMENTS.get("surname_penalty", -30)
    initial_boost_val = NAME_MATCH_ADJUSTMENTS.get("initial_boost", 30)
    subset_boost_val = NAME_MATCH_ADJUSTMENTS.get("subset_boost", 40)

    # --- Surname-based adjustment ---
    if name1_effective and name2_effective:
        surnames1 = detect_surnames(name1_effective)
        surnames2 = detect_surnames(name2_effective)

        if surnames1 and surnames2:
            if surnames1 & surnames2:
                rest1 = [t for t in name1_effective.split() if t not in surnames1]
                rest2 = [t for t in name2_effective.split() if t not in surnames2]
                # Only the surname agrees: penalize when the remaining tokens
                # neither overlap nor resemble each other.
                if rest1 and rest2 and not (set(rest1) & set(rest2)):
                    if fuzz.ratio(" ".join(rest1), " ".join(rest2)) < 60:
                        adjustment += surname_penalty_val
            else:
                # NOTE(review): the source lost its indentation; this -40 is
                # assumed to apply when both sides have recognized surnames
                # with none in common, as the change note describes - confirm.
                adjustment -= 40

    # --- Token-based boosts (tokens sorted for a consistent comparison) ---
    name1_tokens = sorted(name1_effective.split()) if name1_effective else []
    name2_tokens = sorted(name2_effective.split()) if name2_effective else []

    if name1_tokens and name2_tokens:
        if compute_initial_letter_boost(name1_tokens, name2_tokens) > 0:
            adjustment += initial_boost_val
        if len(name1_tokens) != len(name2_tokens) and is_subset_match(name1_tokens, name2_tokens):
            adjustment += subset_boost_val

    # --- Three-case scoring ---
    result = None
    if r1_has_fullname and r2_has_fullname:
        # Pass the cleaned parts (as cases 2 and 3 do) so placeholder values
        # such as " - " cannot be scored as real names.
        result = handle_case1(r1_name_proc, r2_name_proc,
                              r1_fname, r1_mname, r1_lname,
                              r2_fname, r2_mname, r2_lname)
    elif r1_has_fullname and not r2_has_fullname and r2_concat:
        result = handle_case2(r1_name_proc, r2_fname, r2_mname, r2_lname, r2_concat)
    elif r2_has_fullname and not r1_has_fullname and r1_concat:
        result = handle_case2(r2_name_proc, r1_fname, r1_mname, r1_lname, r1_concat)
    elif not r1_has_fullname and not r2_has_fullname and r1_concat and r2_concat:
        result = handle_case3(r1_fname, r1_mname, r1_lname, r1_concat,
                              r2_fname, r2_mname, r2_lname, r2_concat)

    if result is None:
        result = _zero_scores()

    if adjustment != 0:
        result['full_name_percent'] = max(0.0, min(100.0, result['full_name_percent'] + adjustment))

    return result
|
|
def match_addresses_1_to_n(addresses_r1: List[str], addresses_r2: List[str]) -> float:
    """
    Match addresses 1:N (plain addressline strings only - no city/zipcode/state).

    Every usable address of record 1 is compared with every usable address of
    record 2 and the best pair score is returned (rounded to 2 decimals);
    0 is returned when either list has no usable address.

    Per pair:
    1. Extract the address components (house_number, flat_number, apartment,
       street, remaining_address) from each address.
    2. Score the remaining address text (post-box removed; falling back to the
       preprocessed full address when nothing remains) with match_entities
       and ADDRESS_MODEL_WEIGHTS -> base_score.
    3. If base_score > 60, apply a per-component boost/penalty when both sides
       have the component:
           house_number : match +30 / mismatch -30
           flat_number  : match +10 / mismatch -10
           apartment    : match +10 / mismatch -10
           street       : match +10 / mismatch -10
       If base_score <= 60, all component adjustments are skipped.
    4. Add the named-component and post-box adjustments, computed on the
       preprocessed full addresses.
    5. Clamp the pair score to [0, 100].

    A diagnostic trace of every pair is printed to stdout.
    """
    from services.rules import (
        preprocess_address as _preprocess_addr,
        compare_named_components as _compare_named,
        compare_postbox as _compare_postbox,
        remove_postbox_from_address as _strip_postbox,
        extract_address_components as _extract_components,
    )

    def _norm(val):
        """Upper-case and drop all non-alphanumerics, e.g. 144/143 -> 144143."""
        return re.sub(r'[^A-Z0-9]', '', str(val).upper()) if val else ""

    def _component_adj(v1, v2, boost, penalty):
        """Return (verdict, adjustment) for a single component pair."""
        if not (v1 and v2):
            return ("missing", 0.0)
        if v1 == v2:
            return ("match", boost)
        return ("mismatch", -penalty)

    def _usable(addresses):
        """Keep the entries that are neither empty nor a blank/"-" placeholder."""
        return [a for a in addresses if a and str(a).strip() not in ["-", " ", ""]]

    raw1 = _usable(addresses_r1)
    raw2 = _usable(addresses_r2)
    if not (raw1 and raw2):
        return 0

    # (component key, boost on match, penalty on mismatch)
    component_rules = (
        ("house_number", 30.0, 30.0),
        ("flat_number", 10.0, 10.0),
        ("apartment", 10.0, 10.0),
        ("street", 10.0, 10.0),
    )

    best_score = 0.0
    for raw_a1, raw_a2 in itertools.product(raw1, raw2):
        comp1 = _extract_components(raw_a1)
        comp2 = _extract_components(raw_a2)

        # Text left after removing the structured components; fall back to
        # the preprocessed full address when nothing remains.
        rem1 = comp1.get("remaining_address", "").strip()
        rem2 = comp2.get("remaining_address", "").strip()
        if not rem1:
            rem1 = _preprocess_addr(raw_a1).upper()
        if not rem2:
            rem2 = _preprocess_addr(raw_a2).upper()

        addr1_clean = _strip_postbox(rem1) or rem1
        addr2_clean = _strip_postbox(rem2) or rem2

        # Named-component and post-box rules look at the full address text.
        addr1_full = _preprocess_addr(raw_a1).upper()
        addr2_full = _preprocess_addr(raw_a2).upper()
        named_result = _compare_named(addr1_full, addr2_full)
        pb_result = _compare_postbox(addr1_full, addr2_full)

        try:
            base_score = float(match_entities(addr1_clean, addr2_clean,
                                              weights=ADDRESS_MODEL_WEIGHTS))
        except (TypeError, ValueError):
            base_score = 0.0

        comp_adj = 0.0
        print(f"[ADDR_COMPONENTS] base_score={base_score:.2f} | threshold=60 | adjustments_applied={base_score > 60}")
        print(f" remaining_addr1 : {addr1_clean!r}")
        print(f" remaining_addr2 : {addr2_clean!r}")
        for label, boost, penalty in component_rules:
            v1 = _norm(comp1.get(label))
            v2 = _norm(comp2.get(label))
            verdict, adj = _component_adj(v1, v2, boost, penalty)
            if verdict == "missing":
                print(f" {label:<15} | verdict=missing | v1={v1!r:>10} v2={v2!r:<10} | adjustment=0.0 [skipped - component absent]")
            elif base_score <= 60:
                print(f" {label:<15} | verdict={verdict:<9} | v1={v1!r:>10} v2={v2!r:<10} | adjustment=0.0 [SKIPPED - base_score<=60]")
            else:
                comp_adj += adj
                sign = "+" if adj >= 0 else ""
                tag = "BOOSTED" if adj > 0 else "PENALISED"
                print(f" {label:<15} | verdict={verdict:<9} | v1={v1!r:>10} v2={v2!r:<10} | adjustment={sign}{adj:.1f} [{tag}]")
        print(f" total comp_adj : {comp_adj:+.1f}")

        adjustment = comp_adj + named_result['score_adjustment'] + pb_result['adjustment']
        final_score = max(0.0, min(100.0, base_score + adjustment))
        best_score = max(best_score, final_score)

    return round(best_score, 2)
|
|
|
|
def match_addresses_structured(
    addrs_r1: List[dict],
    addrs_r2: List[dict],
) -> float:
    """
    Match addresses when city / zipcode / state are available as separate columns.

    Thin wrapper: the work is delegated to
    services.rules.match_structured_address_lists, which is imported lazily
    inside the function.

    Each address dict must have keys: addressline, city, zipcode, state.
    Returns the best score across all N x M combinations (0-100).

    As described for the delegate, it handles:
    - Missing state/city -> inferred from zipcode via pgeocode (offline)
    - Bank state codes (NDH, BLR ...) -> canonical form
    - City name variants -> canonical via CITY_MAPPING
    - House number extraction + comparison
    - Full addressline text via embedding model

    Example:
        addrs1 = [{"addressline": "A13 GUPTA ENCLAVE...",
                   "city": "NEW DELHI", "zipcode": "110059", "state": "NDH"}]
        addrs2 = [{"addressline": "A13 GUPTA ENCLAVE...",
                   "city": "NEW DELHI", "zipcode": "110059", "state": "DELHI"}]
        score = match_addresses_structured(addrs1, addrs2)  # -> ~100
    """
    from services.rules import match_structured_address_lists as _sa_match

    best_score = _sa_match(addrs_r1, addrs_r2)
    return best_score
|
|
def match_single_field(value1: str, value2: str) -> float:
    """
    Match single fields like SPOUSENAME, MOTHERNAME, etc.

    Delegates to match_entities with its default weights and returns that
    similarity score unchanged.
    """
    score = match_entities(value1, value2)
    return score