Spaces:
Sleeping
Sleeping
Commit ·
0ceaa0b
1
Parent(s): 681a0f7
new added
Browse files- src/streamlit_app.py +742 -38
src/streamlit_app.py
CHANGED
|
@@ -1,40 +1,744 @@
|
|
| 1 |
-
import altair as alt
|
| 2 |
-
import numpy as np
|
| 3 |
-
import pandas as pd
|
| 4 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
#
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
"
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
"
|
| 31 |
-
}
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
import difflib
|
| 5 |
+
from PIL import Image
|
| 6 |
+
import base64
|
| 7 |
+
import os
|
| 8 |
|
| 9 |
+
# =========================================================
|
| 10 |
+
# PAGE CONFIG
|
| 11 |
+
# =========================================================
|
| 12 |
+
st.set_page_config(
|
| 13 |
+
page_title="GEN AI Record Level Matching",
|
| 14 |
+
page_icon="π",
|
| 15 |
+
layout="wide",
|
| 16 |
+
initial_sidebar_state="collapsed"
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
# =========================================================
|
| 20 |
+
# CUSTOM CSS
|
| 21 |
+
# =========================================================
|
| 22 |
+
st.markdown("""
|
| 23 |
+
<style>
|
| 24 |
+
[data-testid="stAppViewContainer"], [data-testid="stApp"], .stApp {
|
| 25 |
+
background-color: #f0f2f5 !important;
|
| 26 |
+
color: #333 !important;
|
| 27 |
+
}
|
| 28 |
+
.main { background-color: #f0f2f5; }
|
| 29 |
+
.stAppDeployButton, .stMainMenu, #MainMenu,
|
| 30 |
+
[data-testid="stToolbarActions"], [data-testid="stStatusWidget"] {
|
| 31 |
+
display: none !important;
|
| 32 |
+
}
|
| 33 |
+
header[data-testid="stHeader"] { background: transparent !important; }
|
| 34 |
+
.block-container { padding-top: 2rem !important; padding-bottom: 2rem !important; }
|
| 35 |
+
.header-title {
|
| 36 |
+
text-align: center; color: #5B4E8B; font-size: 28px;
|
| 37 |
+
font-weight: 600; margin-bottom: 10px;
|
| 38 |
+
}
|
| 39 |
+
.header-subtitle {
|
| 40 |
+
text-align: center; color: #666; font-size: 14px; margin-bottom: 30px;
|
| 41 |
+
}
|
| 42 |
+
.logo-title-container {
|
| 43 |
+
display: flex; align-items: center; justify-content: center;
|
| 44 |
+
gap: 15px; margin-bottom: 10px;
|
| 45 |
+
}
|
| 46 |
+
.record-header {
|
| 47 |
+
color: #612383; font-size: 26px; font-weight: 700;
|
| 48 |
+
margin-bottom: 25px; padding-bottom: 12px;
|
| 49 |
+
border-bottom: 3px solid;
|
| 50 |
+
border-image: linear-gradient(90deg, #612383, #E9592E, #F5A700) 1;
|
| 51 |
+
}
|
| 52 |
+
.section-card {
|
| 53 |
+
background: white; border-radius: 12px;
|
| 54 |
+
box-shadow: 0 2px 8px rgba(0,0,0,0.08);
|
| 55 |
+
margin-bottom: 20px; overflow: hidden;
|
| 56 |
+
}
|
| 57 |
+
.section-header-gradient {
|
| 58 |
+
background: linear-gradient(90deg, #612383 0%, #E9592E 100%);
|
| 59 |
+
color: white; padding: 14px 20px; font-size: 14px;
|
| 60 |
+
font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;
|
| 61 |
+
display: flex; align-items: center; gap: 10px;
|
| 62 |
+
}
|
| 63 |
+
.section-content { padding: 20px; }
|
| 64 |
+
.stTextInput > div > div > input {
|
| 65 |
+
background-color: #fafbfc !important; color: #333 !important;
|
| 66 |
+
border: 1px solid #e1e4e8 !important; border-radius: 8px !important;
|
| 67 |
+
padding: 10px 14px !important; font-size: 14px !important;
|
| 68 |
+
}
|
| 69 |
+
.stTextInput > div > div > input:focus {
|
| 70 |
+
border-color: #E9592E !important;
|
| 71 |
+
box-shadow: 0 0 0 3px rgba(233,89,46,0.1) !important;
|
| 72 |
+
}
|
| 73 |
+
.stTextInput label { color: #555 !important; font-size: 13px !important; font-weight: 500 !important; }
|
| 74 |
+
.subsection-label { color: #666; font-size: 13px; font-weight: 500; margin-bottom: 12px; }
|
| 75 |
+
div[data-testid="stButton"] button:not([kind="primary"]):not([kind="secondary"]) {
|
| 76 |
+
width: 36px !important; height: 36px !important; min-width: 36px !important;
|
| 77 |
+
padding: 0 !important; border-radius: 6px !important; font-size: 18px !important;
|
| 78 |
+
background-color: white !important; color: #612383 !important;
|
| 79 |
+
border: 1px solid #d0d7de !important;
|
| 80 |
+
}
|
| 81 |
+
button[kind="primary"] {
|
| 82 |
+
background: linear-gradient(90deg, #612383 0%, #E9592E 100%) !important;
|
| 83 |
+
color: white !important; border: none !important; border-radius: 10px !important;
|
| 84 |
+
padding: 16px 32px !important; font-size: 16px !important; font-weight: 600 !important;
|
| 85 |
+
box-shadow: 0 4px 12px rgba(97,35,131,0.25) !important;
|
| 86 |
+
text-transform: uppercase; letter-spacing: 0.5px; height: auto !important;
|
| 87 |
+
}
|
| 88 |
+
button[kind="secondary"] {
|
| 89 |
+
background: linear-gradient(90deg, #612383 0%, #E9592E 100%) !important;
|
| 90 |
+
color: white !important; border: none !important; border-radius: 10px !important;
|
| 91 |
+
padding: 12px 24px !important; font-size: 13px !important; font-weight: 600 !important;
|
| 92 |
+
min-width: 140px !important; height: auto !important;
|
| 93 |
+
box-shadow: 0 4px 12px rgba(97,35,131,0.25) !important;
|
| 94 |
+
text-transform: uppercase; letter-spacing: 0.5px;
|
| 95 |
+
}
|
| 96 |
+
.result-box {
|
| 97 |
+
background: white !important; border-radius: 12px !important;
|
| 98 |
+
padding: 25px !important; margin-top: 30px !important;
|
| 99 |
+
box-shadow: 0 4px 16px rgba(0,0,0,0.1) !important;
|
| 100 |
+
border-top: 4px solid;
|
| 101 |
+
border-image: linear-gradient(90deg, #612383, #E9592E, #F5A700) 1;
|
| 102 |
+
}
|
| 103 |
+
.result-header { color: #612383; font-size: 18px; font-weight: 600; margin-bottom: 15px; }
|
| 104 |
+
.section-divider { border: none; border-top: 1px solid #e1e4e8; margin: 20px 0; }
|
| 105 |
+
div[data-testid="stExpander"] summary { color: #333 !important; font-weight: 600 !important; }
|
| 106 |
+
div[data-testid="stExpander"] summary svg { stroke: #333 !important; }
|
| 107 |
+
.address-title { font-weight: 600; color: #612383; font-size: 14px; }
|
| 108 |
+
::placeholder { color: #666 !important; opacity: 1 !important; }
|
| 109 |
+
[data-testid="stJson"], [data-testid="stCodeBlock"] {
|
| 110 |
+
background-color: #ffffff !important; color: #333333 !important;
|
| 111 |
+
border: 1px solid #e1e4e8 !important; border-radius: 8px !important;
|
| 112 |
+
}
|
| 113 |
+
div[data-testid="stRadio"] label { color: #333 !important; font-size: 14px !important; font-weight: 500 !important; }
|
| 114 |
+
div[data-testid="stRadio"] > label:first-child { color: #222 !important; font-size: 15px !important; font-weight: 600 !important; }
|
| 115 |
+
div[data-testid="stRadio"] div[role="radiogroup"] label[data-baseweb="radio"] div:first-child {
|
| 116 |
+
border-color: #612383 !important;
|
| 117 |
+
}
|
| 118 |
+
div[data-testid="stRadio"] div[role="radiogroup"] label[data-baseweb="radio"] div:first-child div {
|
| 119 |
+
background-color: #612383 !important;
|
| 120 |
+
}
|
| 121 |
+
</style>
|
| 122 |
+
""", unsafe_allow_html=True)
|
| 123 |
+
|
| 124 |
+
# =========================================================
|
| 125 |
+
# SESSION STATE
|
| 126 |
+
# =========================================================
|
| 127 |
+
MAX_FIELDS = 20

# Initial session-state entries for both records: one starting id (0) for
# each repeatable address/phone/email group and an empty custom-field list.
defaults = {
    f"{group}_{side}": list(initial)
    for group, initial in (
        ("address_ids", [0]),
        ("phone_ids", [0]),
        ("email_ids", [0]),
        ("custom_fields", []),
    )
    for side in ("r1", "r2")
}
for k, v in defaults.items():
    # Only seed keys that are not in session state yet.
    if k in st.session_state:
        continue
    st.session_state[k] = v
|
| 138 |
+
|
| 139 |
+
# =========================================================
|
| 140 |
+
# PURE PYTHON MATCHING LOGIC (no external ML libs)
|
| 141 |
+
# =========================================================
|
| 142 |
+
|
| 143 |
+
def normalize_text(text):
    """Return *text* trimmed, lower-cased and with whitespace runs collapsed.

    Falsy input (``None``, ``""``, ``0``) yields an empty string; any other
    value is converted with ``str()`` first.
    """
    if text:
        lowered = str(text).strip().lower()
        return re.sub(r"\s+", " ", lowered)
    return ""
|
| 147 |
+
|
| 148 |
+
def fuzzy_ratio(a, b):
    """Score the similarity of two strings as an integer from 0 to 100.

    The score is difflib's ``SequenceMatcher`` ratio multiplied by 100 and
    truncated to an int. An empty or missing operand scores 0.
    """
    if a and b:
        matcher = difflib.SequenceMatcher(None, a, b)
        return int(matcher.ratio() * 100)
    return 0
|
| 153 |
+
|
| 154 |
+
def token_sort_ratio(a, b):
    """Compare two strings after sorting their whitespace-separated words.

    Sorting makes the score insensitive to word order. Returns 0 when either
    operand is empty or missing, otherwise the ``fuzzy_ratio`` of the two
    re-joined, sorted word lists.
    """
    if not (a and b):
        return 0
    ordered = [" ".join(sorted(value.split())) for value in (a, b)]
    return fuzzy_ratio(*ordered)
|
| 161 |
+
|
| 162 |
+
def name_similarity(a, b):
    """Score how alike two name strings are.

    Returns -1 when both values are missing, 0 when exactly one is missing,
    otherwise the better of the plain and the word-order-insensitive fuzzy
    ratios computed on the normalized strings.
    """
    if not (a or b):
        return -1  # nothing to compare on either side
    if not (a and b):
        return 0
    left = normalize_text(a)
    right = normalize_text(b)
    return max(fuzzy_ratio(left, right), token_sort_ratio(left, right))
|
| 172 |
+
|
| 173 |
+
def match_names(name1, fn1, ln1, mn1, name2, fn2, ln2, mn2):
    """Score the name fields of two records.

    Note the parameter order: full name, first, LAST, middle for each record.
    The full-name score prefers a name assembled from the first/middle/last
    parts and falls back to the free-text full name when no part is filled.

    Returns a dict with ``full_name_percent``, ``firstname_percent``,
    ``middlename_percent`` and ``lastname_percent``. Each value is a 0-100
    score, or -1 when the field is missing on both sides.
    """

    def assemble(whole, first, middle, last):
        # Use the structured parts when any is non-blank, else the full name.
        pieces = [piece for piece in (first, middle, last) if piece and piece.strip()]
        return " ".join(pieces) if pieces else (whole or "")

    def score(left, right):
        # name_similarity already yields -1 when both operands are empty.
        return name_similarity(normalize_text(left), normalize_text(right))

    return {
        "full_name_percent": score(
            assemble(name1, fn1, mn1, ln1),
            assemble(name2, fn2, mn2, ln2),
        ),
        "firstname_percent": score(fn1, fn2),
        "middlename_percent": score(mn1, mn2),
        "lastname_percent": score(ln1, ln2),
    }
|
| 197 |
+
|
| 198 |
+
def match_single(a, b):
    """Fuzzy-match one free-text field; -1 when both values are missing."""
    if a or b:
        return name_similarity(normalize_text(a), normalize_text(b))
    return -1
|
| 203 |
+
|
| 204 |
+
def match_addresses(addrs1, addrs2):
    """Return the best score over every pair of address lines.

    Blank entries are ignored. Returns -1 when neither side has an address,
    0 when only one side has one, otherwise the highest pairwise score
    (the better of the plain and token-sorted fuzzy ratios).
    """

    def cleaned(lines):
        return [normalize_text(line) for line in lines if line and line.strip()]

    left, right = cleaned(addrs1), cleaned(addrs2)
    if not (left or right):
        return -1
    if not (left and right):
        return 0
    return max(
        max(fuzzy_ratio(first, second), token_sort_ratio(first, second))
        for first in left
        for second in right
    )
|
| 219 |
+
|
| 220 |
+
def normalize_phone(p):
    """Strip every non-digit character from *p*; falsy input gives ``""``."""
    return re.sub(r"[^\d]", "", str(p)) if p else ""
|
| 224 |
+
|
| 225 |
+
def compare_phones(phones1, phones2):
    """Return 100 when any phone number is shared between the two lists.

    Numbers are reduced to digits first and two numbers match when they are
    equal or share their last ten digits. Returns -1 when neither side has a
    usable number and 0 when only one side does or nothing matches.
    """

    def digits_only(numbers):
        candidates = (normalize_phone(number) for number in numbers if number)
        return [digits for digits in candidates if digits]

    left, right = digits_only(phones1), digits_only(phones2)
    if not (left or right):
        return -1
    if not (left and right):
        return 0
    shared = any(
        mine == theirs or mine[-10:] == theirs[-10:]
        for mine in left
        for theirs in right
    )
    return 100 if shared else 0
|
| 237 |
+
|
| 238 |
+
def compare_emails(emails1, emails2):
    """Return 100 when the two lists share an email address.

    Addresses are compared after trimming and lower-casing; blank entries are
    ignored. Returns -1 when neither side has an address and 0 when only one
    side does or no address is shared.
    """

    def cleaned(addresses):
        return {entry.strip().lower() for entry in addresses if entry and entry.strip()}

    left, right = cleaned(emails1), cleaned(emails2)
    if not (left or right):
        return -1
    if not (left and right):
        return 0
    return 100 if left & right else 0
|
| 250 |
+
|
| 251 |
+
def compare_exact(a, b):
    """Exact comparison of two values after text normalization.

    Returns -1 when both are missing, 0 when only one is missing or they
    differ, and 100 when the normalized values are equal.
    """
    if not (a or b):
        return -1
    if not (a and b):
        return 0
    if normalize_text(a) == normalize_text(b):
        return 100
    return 0
|
| 257 |
+
|
| 258 |
+
def standardize_city(city):
    """Return *city* trimmed, upper-cased and with whitespace runs collapsed."""
    return re.sub(r"\s+", " ", str(city).strip().upper()) if city else ""
|
| 262 |
+
|
| 263 |
+
def standardize_state(state):
    """Return *state* trimmed, upper-cased and with whitespace runs collapsed."""
    return re.sub(r"\s+", " ", str(state).strip().upper()) if state else ""
|
| 267 |
+
|
| 268 |
+
def standardize_dob(dob):
    """Normalize a date-of-birth string to ``YYYY-MM-DD`` where possible.

    Accepted layouts, with ``/`` or ``-`` as separator:

    * ``YYYY-MM-DD`` (year first)
    * ``DD-MM-YYYY`` (year last; the first number is taken as the day)

    One- or two-digit day/month values are zero-padded. The whole string must
    match: input with trailing or leading garbage is returned unchanged apart
    from trimming, instead of being silently truncated. No calendar
    validation is performed.

    Args:
        dob: Raw date value; falsy values yield ``""``.

    Returns:
        The normalized date string, or the trimmed input when the layout is
        not recognised.
    """
    if not dob:
        return ""
    text = str(dob).strip()

    year_first = re.fullmatch(r"(\d{4})[/-](\d{1,2})[/-](\d{1,2})", text)
    if year_first:
        year, month, day = year_first.groups()
    else:
        year_last = re.fullmatch(r"(\d{1,2})[/-](\d{1,2})[/-](\d{4})", text)
        if not year_last:
            return text
        day, month, year = year_last.groups()

    return f"{year}-{int(month):02d}-{int(day):02d}"
|
| 282 |
+
|
| 283 |
+
def normalize_gender(val):
    """Map common gender spellings to ``'MALE'`` / ``'FEMALE'``.

    Returns ``None`` for falsy input. Values that are not a known alias are
    returned trimmed and upper-cased.
    """
    if not val:
        return None
    token = str(val).strip().lower()
    aliases = {
        'm': 'MALE', 'male': 'MALE', 'men': 'MALE', 'man': 'MALE',
        'f': 'FEMALE', 'female': 'FEMALE', 'women': 'FEMALE', 'woman': 'FEMALE',
    }
    return aliases.get(token, token.upper())
|
| 292 |
+
|
| 293 |
+
def score_to_label(score, field):
    """Turn a numeric score into its display value.

    The -1 sentinel becomes the text ``"missing value"``; any other score is
    returned as a float rounded to two decimals. *field* is accepted but not
    used.
    """
    return "missing value" if score == -1 else round(float(score), 2)
|
| 298 |
+
|
| 299 |
+
def get_dynamic_fields(record, prefix):
    """Collect the values of all ``<prefix><index>`` keys in *record*.

    The values are returned ordered by their numeric index. Gaps in the
    numbering are tolerated: the form lets users delete an entry from the
    middle of a list, which leaves keys such as ``phone_0`` and ``phone_2``
    with no ``phone_1``. Stopping at the first missing index would drop every
    later entry from matching.

    Args:
        record: Mapping of field name to value.
        prefix: Key prefix including the trailing separator, e.g. ``"phone_"``.

    Returns:
        List of values for keys whose suffix after *prefix* is a non-negative
        integer, sorted by that integer. Other keys are ignored.
    """
    indexed = []
    for key in record:
        if not isinstance(key, str) or not key.startswith(prefix):
            continue
        suffix = key[len(prefix):]
        if suffix.isdecimal():
            # Keep the key as a tie-breaker so ordering is deterministic.
            indexed.append((int(suffix), key))
    return [record[key] for _, key in sorted(indexed)]
|
| 310 |
+
|
| 311 |
+
def is_valid(val):
    """Return True when *val* holds real data rather than a placeholder.

    Falsy values and the placeholder tokens ``""``, ``"-"``, ``"NA"``,
    ``"N/A"`` and ``"NULL"`` are rejected. Tokens are compared after trimming
    and without regard to case. The result is always a ``bool``.
    """
    if not val:
        return False
    return str(val).strip().upper() not in {"", "-", "NA", "N/A", "NULL"}
|
| 313 |
+
|
| 314 |
+
def evaluate_rules(scores):
    """Derive an overall match decision from per-field scores.

    Rules, applied in order:

    1. If no field has a numeric score other than the -1 "missing" sentinel,
       the result is ``UNABLE TO DETERMINE``.
    2. A score of exactly 100 on any strong identifier (AADHAR, PAN,
       PASSPORTID, LICENSEID, VOTERID) is an immediate ``MATCH``.
    3. Otherwise the share of evaluated fields scoring at least 80 decides:
       >= 0.7 is ``MATCH``, >= 0.4 is ``POSSIBLE MATCH``, else ``NO MATCH``.

    Args:
        scores: Mapping of field label to a numeric score, -1, or a
            non-numeric marker such as ``"missing value"``.

    Returns:
        Tuple ``(decision, explanation)`` of two strings.
    """
    # Only real numeric scores take part; -1 and text markers mean "missing".
    numeric_scores = {
        k: v for k, v in scores.items()
        if isinstance(v, (int, float)) and v != -1
    }
    if not numeric_scores:
        return "UNABLE TO DETERMINE", "Insufficient data to make a determination."

    for sid in ("AADHAR", "PAN", "PASSPORTID", "LICENSEID", "VOTERID"):
        if scores.get(sid) == 100:
            return "MATCH", f"Strong identifier match on {sid}."

    total_evaluated = len(numeric_scores)  # non-zero: guarded above
    high_matches = sum(1 for v in numeric_scores.values() if v >= 80)
    match_ratio = high_matches / total_evaluated

    summary = f"{high_matches}/{total_evaluated} fields matched at ≥80%."
    if match_ratio >= 0.7:
        return "MATCH", summary
    if match_ratio >= 0.4:
        return "POSSIBLE MATCH", summary
    return "NO MATCH", f"Only {summary}"
|
| 347 |
+
|
| 348 |
+
def match_records(r1, r2):
    """Run the full field-by-field comparison of two record dicts.

    Every score is 0-100, or -1 when the field is missing on both sides.

    * Names (full/first/middle/last, spouse, parents, companies) are fuzzy
      matched.
    * Address lines take the best fuzzy score over all pairs.
    * Phones, emails, cities and states score 100 when any value is shared.
    * Zipcode compares the first valid zipcode of each record.
    * Identifiers and gender are compared exactly. Birth dates are compared
      exactly after being normalized with ``standardize_dob``, so the same
      date written in two layouts still matches.
    * Any other key present in either record is treated as a custom field and
      compared exactly under its upper-cased name. A custom field never
      replaces a built-in score that uses the same label.

    Args:
        r1: First record, mapping field name to value.
        r2: Second record, same layout.

    Returns:
        Dict mapping upper-case field label to score.
    """
    name_result = match_names(
        r1.get("name"), r1.get("firstname"), r1.get("lastname"), r1.get("middlename"),
        r2.get("name"), r2.get("firstname"), r2.get("lastname"), r2.get("middlename"),
    )

    def both(prefix):
        # Values of one repeatable field group for each record.
        return get_dynamic_fields(r1, prefix), get_dynamic_fields(r2, prefix)

    def any_shared(prefix, standardize):
        # 100 if a standardized valid value occurs on both sides, -1 if
        # neither side has a valid value, otherwise 0.
        left, right = (
            [standardize(value) for value in values if is_valid(value)]
            for values in both(prefix)
        )
        if not (left or right):
            return -1
        return 100 if any(a == b for a in left for b in right) else 0

    def first_valid(values):
        return next((value for value in values if is_valid(value)), None)

    def exact(field):
        return compare_exact(r1.get(field), r2.get(field))

    def fuzzy(field):
        return match_single(r1.get(field), r2.get(field))

    def canonical_dob(record):
        # Only strings are normalized; other values go through unchanged.
        raw = record.get("dob")
        return standardize_dob(raw) if isinstance(raw, str) else raw

    address_score = match_addresses(*both("addressline_"))
    phone_score = compare_phones(*both("phone_"))
    email_score = compare_emails(*both("email_"))
    city_score = any_shared("city_", standardize_city)
    state_score = any_shared("state_", standardize_state)

    r1_zips, r2_zips = both("zipcode_")
    if r1_zips or r2_zips:
        zipcode_score = compare_exact(first_valid(r1_zips), first_valid(r2_zips))
    else:
        zipcode_score = -1

    g1 = normalize_gender(r1.get("gender"))
    g2 = normalize_gender(r2.get("gender"))
    if not g1 and not g2:
        gender_score = -1
    elif g1 and g2:
        gender_score = 100 if g1 == g2 else 0
    else:
        gender_score = 0

    results = {
        "GENDER": gender_score,
        "NAME": name_result["full_name_percent"],
        "FIRSTNAME": name_result["firstname_percent"],
        "MIDDLENAME": name_result["middlename_percent"],
        "LASTNAME": name_result["lastname_percent"],
        "SPOUSENAME": fuzzy("spousename"),
        "MOTHERNAME": fuzzy("mothername"),
        "FATHERNAME": fuzzy("fathername"),
        "COMPANYNAME": fuzzy("companyname"),
        "PARENTCOMPANYNAME": fuzzy("parentcompanyname"),
        "AADHAR": exact("AADHAR"),
        "PAN": exact("pan"),
        "LICENSEID": exact("licenseid"),
        "PASSPORTID": exact("passportid"),
        "VOTERID": exact("voterid"),
        "BIRTHDATE": compare_exact(canonical_dob(r1), canonical_dob(r2)),
        "PHONE": phone_score,
        "EMAIL": email_score,
        "ADDRESSLINE": address_score,
        "CITY": city_score,
        "STATE": state_score,
        "ZIPCODE": zipcode_score,
    }

    # Custom fields: every key that is neither built-in nor a repeatable group.
    known = {"name", "firstname", "middlename", "lastname", "spousename", "mothername",
             "fathername", "dob", "gender", "AADHAR", "pan", "licenseid", "passportid",
             "voterid", "companyname", "parentcompanyname"}
    dyn_prefixes = ("zipcode_", "city_", "state_", "phone_", "email_", "addressline_")

    # Sorted for a deterministic outcome when two custom keys differ only in case.
    for key in sorted(set(r1) | set(r2), key=str):
        ks = str(key)
        if ks in known or ks.startswith(dyn_prefixes):
            continue
        label = ks.upper()
        if label in results:
            # e.g. a custom field called "Name" must not replace NAME.
            continue
        v1, v2 = r1.get(key), r2.get(key)
        if v1 or v2:
            results[label] = compare_exact(v1, v2)

    return results
|
| 450 |
+
|
| 451 |
+
# =========================================================
|
| 452 |
+
# UI HELPERS
|
| 453 |
+
# =========================================================
|
| 454 |
+
def preprocess_text(text):
    """Trim *text* and collapse whitespace runs to single spaces.

    Case is preserved. Falsy input yields an empty string.
    """
    return re.sub(r"\s+", " ", text.strip()) if text else ""
|
| 458 |
+
|
| 459 |
+
def create_section_card(title, icon_svg, content_func, *args, **kwargs):
    """Render a titled card and the content produced by *content_func*.

    Emits the opening card markup with the icon and title, calls
    ``content_func(*args, **kwargs)`` to draw the body, emits the closing
    tags, and returns whatever *content_func* returned.
    """
    opening = f'''
<div class="section-card">
<div class="section-header-gradient">{icon_svg} {title}</div>
<div class="section-content">
'''
    st.markdown(opening, unsafe_allow_html=True)
    outcome = content_func(*args, **kwargs)
    st.markdown('</div></div>', unsafe_allow_html=True)
    return outcome
|
| 468 |
+
|
| 469 |
+
ICONS = {
|
| 470 |
+
"user": '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M20 21v-2a4 4 0 0 0-4-4H8a4 4 0 0 0-4 4v2"></path><circle cx="12" cy="7" r="4"></circle></svg>',
|
| 471 |
+
"id": '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><rect x="3" y="4" width="18" height="16" rx="2"></rect><line x1="16" y1="2" x2="16" y2="6"></line><line x1="8" y1="2" x2="8" y2="6"></line><line x1="3" y1="10" x2="21" y2="10"></line></svg>',
|
| 472 |
+
"map": '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><polygon points="3 6 9 3 15 6 21 3 21 18 15 21 9 18 3 21"></polygon></svg>',
|
| 473 |
+
"phone": '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M22 16.92v3a2 2 0 0 1-2.18 2 19.79 19.79 0 0 1-8.63-3.07 19.5 19.5 0 0 1-6-6 19.79 19.79 0 0 1-3.07-8.67A2 2 0 0 1 4.11 2h3a2 2 0 0 1 2 1.72c.127.96.361 1.903.7 2.81a2 2 0 0 1-.45 2.11L8.09 9.91a16 16 0 0 0 6 6l1.27-1.27a2 2 0 0 1 2.11-.45c.907.339 1.85.573 2.81.7A2 2 0 0 1 22 16.92z"></path></svg>',
|
| 474 |
+
"briefcase": '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><rect x="2" y="7" width="20" height="14" rx="2" ry="2"></rect><path d="M16 21V5a2 2 0 0 0-2-2h-4a2 2 0 0 0-2 2v16"></path></svg>'
|
| 475 |
+
}
|
| 476 |
+
|
| 477 |
+
# =========================================================
|
| 478 |
+
# SECTION CONTENT FUNCTIONS
|
| 479 |
+
# =========================================================
|
| 480 |
+
|
| 481 |
+
def name_fields_content(record_num, prefix=""):
    """Draw the personal-detail inputs for one record, two per row.

    Widget keys have the form ``{prefix}{field}_{record_num}``. Returns a
    dict of the entered text keyed by field name (name parts, parents,
    spouse, other name, gender and dob).
    """
    rows = (
        (("name", "Full Name", "Enter full name"),
         ("firstname", "First Name", "Enter first name")),
        (("middlename", "Middle Name", "Enter middle name"),
         ("lastname", "Last Name", "Enter last name")),
        (("mothername", "Mother's Name", "Enter mother's name"),
         ("fathername", "Father's Name", "Enter father's name")),
        (("spousename", "Spouse's Name", "Enter spouse's name"),
         ("othername", "Other Name", "Enter other name")),
        (("dob", "Date of Birth", "YYYY-MM-DD"),
         ("gender", "Gender", "Male/Female/Other")),
    )

    entered = {}
    for pair in rows:
        for column, (field, label, hint) in zip(st.columns(2), pair):
            with column:
                entered[field] = st.text_input(
                    label, key=f"{prefix}{field}_{record_num}", placeholder=hint
                )

    return {
        "name": entered["name"], "firstname": entered["firstname"],
        "middlename": entered["middlename"], "lastname": entered["lastname"],
        "mothername": entered["mothername"], "fathername": entered["fathername"],
        "spousename": entered["spousename"], "othername": entered["othername"],
        "gender": entered["gender"], "dob": entered["dob"],
    }
|
| 512 |
+
|
| 513 |
+
def identifier_fields_content(record_num, prefix=""):
    """Draw the identifier inputs and the editable custom-field list.

    The five fixed identifiers are laid out two per row. Custom fields live
    in ``st.session_state["custom_fields_<prefix without underscores>"]`` as
    a list of ``{'name', 'value'}`` dicts that is updated in place. Adding or
    removing a custom field triggers a rerun.

    Returns a dict with the identifier values under ``AADHAR``, ``pan``,
    ``licenseid``, ``passportid`` and ``voterid``, then updated with every
    custom field that has a non-empty name.
    """
    # (result key, widget key token, label, placeholder); the Aadhar widget
    # key uses the token "taxid".
    rows = (
        (("AADHAR", "taxid", "Aadhar Number", "Enter Aadhar number"),
         ("pan", "pan", "PAN Number", "Enter PAN number")),
        (("licenseid", "licenseid", "License Number", "Enter license number"),
         ("passportid", "passportid", "Passport Number", "Enter passport number")),
        (("voterid", "voterid", "Voter ID", "Enter voter ID"),),
    )
    identifiers = {}
    for row in rows:
        for column, (out_key, token, label, hint) in zip(st.columns(2), row):
            with column:
                identifiers[out_key] = st.text_input(
                    label, key=f"{prefix}{token}_{record_num}", placeholder=hint
                )

    st.markdown('<div class="subsection-label" style="margin-top:15px;">Custom Fields</div>', unsafe_allow_html=True)

    custom_fields = st.session_state[f"custom_fields_{prefix.strip('_')}"]
    custom_data = {}

    for idx, field in enumerate(custom_fields):
        name_col, value_col, remove_col = st.columns([5, 5, 1])
        with name_col:
            field_name = st.text_input(f"Field Name {idx+1}", value=field.get('name', ''),
                                       key=f"{prefix}custom_name_{idx}_{record_num}", placeholder="Field Name")
            field['name'] = field_name
        with value_col:
            field_val = st.text_input(f"Field Value {idx+1}", value=field.get('value', ''),
                                      key=f"{prefix}custom_val_{idx}_{record_num}", placeholder="Value")
            field['value'] = field_val
            if field_name:
                custom_data[field_name] = field_val
        with remove_col:
            # Two empty writes push the button down level with the inputs.
            st.write("")
            st.write("")
            if st.button("β", key=f"{prefix}remove_custom_{idx}_{record_num}"):
                custom_fields.pop(idx)
                st.rerun()

    if st.button("+ ADD FIELD", key=f"{prefix}add_custom_{record_num}", type="secondary"):
        custom_fields.append({'name': '', 'value': ''})
        st.rerun()

    result = {
        "AADHAR": identifiers["AADHAR"],
        "pan": identifiers["pan"],
        "licenseid": identifiers["licenseid"],
        "passportid": identifiers["passportid"],
        "voterid": identifiers["voterid"],
    }
    result.update(custom_data)
    return result
|
| 559 |
+
|
| 560 |
+
def address_item_content(record_num, addr_id, prefix=""):
    """Render the text inputs for a single address and return what was typed.

    Args:
        record_num: Record number, used as the trailing part of each widget key.
        addr_id: Identifier of this address; part of both the widget keys and
            the keys of the returned dict.
        prefix: String prepended to every widget key (e.g. ``"r1_"``).

    Returns:
        A dict with the keys ``addressline_<addr_id>``, ``city_<addr_id>``,
        ``state_<addr_id>`` and ``zipcode_<addr_id>`` mapped to the current
        values of the corresponding text inputs.
    """
    # (key stem, visible label, placeholder) in the order the widgets are drawn.
    widget_specs = (
        ("addressline", "Street Address", "Street, Building, Area"),
        ("city", "City", "Enter city"),
        ("state", "State", "Enter state"),
        ("zipcode", "Pincode", "6-digit postal code"),
    )
    return {
        f"{stem}_{addr_id}": st.text_input(
            label,
            key=f"{prefix}{stem}_{addr_id}_{record_num}",
            placeholder=hint,
        )
        for stem, label, hint in widget_specs
    }
|
| 571 |
+
|
| 572 |
+
def addresses_section_content(record_num, prefix=""):
    """Render the address manager for one record and collect all address fields.

    The list of address ids is read from
    ``st.session_state["address_ids_<prefix without underscores at the ends>"]``
    and is mutated in place when the add / remove buttons are pressed; each
    mutation is followed by ``st.rerun()``.

    Args:
        record_num: Record number, used as the trailing part of widget keys.
        prefix: String prepended to every widget key (e.g. ``"r1_"``).

    Returns:
        A single dict merging the dicts returned by ``address_item_content``
        for every address id, in list order.
    """
    address_ids = st.session_state[f"address_ids_{prefix.strip('_')}"]
    collected = {}

    title_col, add_col = st.columns([6, 1])
    with title_col:
        st.markdown('<div class="subsection-label">Manage Addresses</div>', unsafe_allow_html=True)
    with add_col:
        # The add button is only drawn while there is room for another address.
        if len(address_ids) < MAX_FIELDS and st.button("οΌ", key=f"{prefix}add_address_{record_num}"):
            next_id = max(address_ids) + 1 if address_ids else 0
            address_ids.append(next_id)
            st.rerun()

    for position, addr_id in enumerate(address_ids):
        heading_col, remove_col = st.columns([8, 1])
        with heading_col:
            # The heading is derived from the address id, not from its position.
            if addr_id > 0:
                heading = f"Address {addr_id + 1}"
            else:
                heading = "Primary Address"
            st.markdown(f"<div class='address-title'>{heading}</div>", unsafe_allow_html=True)
        with remove_col:
            # Removal is only offered while more than one address exists.
            if len(address_ids) > 1 and st.button("β", key=f"{prefix}remove_address_{addr_id}_{record_num}"):
                address_ids.remove(addr_id)
                st.rerun()

        collected.update(address_item_content(record_num, addr_id, prefix))

        # Draw a separator between consecutive addresses, but not after the last.
        if position < len(address_ids) - 1:
            st.markdown("<hr style='margin:20px 0;border:none;border-top:1px solid #e1e4e8;'>", unsafe_allow_html=True)

    return collected
|
| 599 |
+
|
| 600 |
+
def _render_contact_rows(kind, ids, hint, record_num, prefix):
    """Draw one input row (text box, add button, remove button) per id in *ids*.

    Args:
        kind: ``"phone"`` or ``"email"``; used in widget keys, result keys and
            (capitalised) in the collapsed widget label.
        ids: The list of ids held in session state for this kind. It is
            mutated in place when a button is pressed, the same way
            ``addresses_section_content`` handles its id list.
        hint: Placeholder text for the text input.
        record_num: Record number, used as the trailing part of widget keys.
        prefix: String prepended to every widget key.

    Returns:
        Dict mapping ``<kind>_<id>`` to the current text-input value.
    """
    label = kind.capitalize()
    values = {}
    for item_id in ids:
        input_col, add_col, remove_col = st.columns([8, 1, 1])
        with input_col:
            values[f"{kind}_{item_id}"] = st.text_input(
                f"{label} {item_id+1}",
                key=f"{prefix}{kind}_{item_id}_{record_num}",
                placeholder=hint,
                label_visibility="collapsed",
            )
        with add_col:
            # Every row carries its own add button until the cap is reached.
            if len(ids) < MAX_FIELDS and st.button("οΌ", key=f"{prefix}add_{kind}_{item_id}_{record_num}"):
                ids.append(max(ids) + 1 if ids else 0)
                st.rerun()
        with remove_col:
            # Removal is only offered while more than one entry exists.
            if len(ids) > 1 and st.button("β", key=f"{prefix}remove_{kind}_{item_id}_{record_num}"):
                ids.remove(item_id)
                st.rerun()
    return values


def contact_section_content(record_num, prefix=""):
    """Render the phone and email inputs of one record and return their values.

    The id lists are read from ``st.session_state["phone_ids_<r>"]`` and
    ``st.session_state["email_ids_<r>"]``, where ``<r>`` is *prefix* with the
    underscores stripped from both ends. Both lists are looked up before
    anything is drawn.

    Args:
        record_num: Record number, used as the trailing part of widget keys.
        prefix: String prepended to every widget key (e.g. ``"r1_"``).

    Returns:
        Dict with ``phone_<id>`` entries followed by ``email_<id>`` entries,
        each mapped to the current value of its text input.
    """
    r = prefix.strip("_")
    phone_ids = st.session_state[f"phone_ids_{r}"]
    email_ids = st.session_state[f"email_ids_{r}"]

    st.markdown('<div class="subsection-label">π Phone Numbers</div>', unsafe_allow_html=True)
    contacts = _render_contact_rows("phone", phone_ids, "Enter phone number", record_num, prefix)

    st.markdown('<hr class="section-divider">', unsafe_allow_html=True)
    st.markdown('<div class="subsection-label">βοΈ Email Addresses</div>', unsafe_allow_html=True)
    contacts.update(
        _render_contact_rows("email", email_ids, "Enter email address", record_num, prefix)
    )
    return contacts
|
| 644 |
+
|
| 645 |
+
def other_details_content(record_num, prefix=""):
    """Render the two company-name inputs side by side and return their values.

    Args:
        record_num: Record number, used as the trailing part of widget keys.
        prefix: String prepended to every widget key (e.g. ``"r1_"``).

    Returns:
        Dict with the keys ``companyname`` and ``parentcompanyname``.
    """
    # (result key / key stem, visible label, placeholder), one per column.
    field_specs = (
        ("companyname", "Company Name", "Enter company name"),
        ("parentcompanyname", "Parent Company Name", "Enter parent company name"),
    )
    details = {}
    for column, (stem, label, hint) in zip(st.columns(2), field_specs):
        with column:
            details[stem] = st.text_input(label, key=f"{prefix}{stem}_{record_num}", placeholder=hint)
    return details
|
| 652 |
+
|
| 653 |
+
# =========================================================
|
| 654 |
+
# MAIN
|
| 655 |
+
# =========================================================
|
| 656 |
+
def main():
    """Build the page: header, mode selector, two record forms and the result.

    Both record forms are drawn on every run. When the "Run Record Match"
    button is pressed, the section dicts of each record are merged, the values
    are normalised, the records are scored with ``match_records`` and the
    decision from ``evaluate_rules`` is shown as JSON together with the
    per-field scores.
    """
    st.markdown('''
<div class="logo-title-container">
<div style="font-size:36px;">π</div>
<div class="header-title">Record Level Matching Using Transformer based Models</div>
</div>
''', unsafe_allow_html=True)
    st.markdown('<div class="header-subtitle">Enter details for two records below and click "Run Record Match" to see the matching result</div>', unsafe_allow_html=True)

    # Mode selector. It is UI only: choosing LLM Mode just shows a warning and
    # the same matching path below is used either way.
    mode_col = st.columns([4, 6])[0]
    with mode_col:
        matching_mode = st.radio(
            "Matching Mode",
            ["Embedding Mode", "LLM Mode"],
            key="matching_mode",
            horizontal=True,
            help="Embedding: Fuzzy/Token-based matching | LLM Mode: Requires external LLM server (unavailable in standalone)"
        )
    if matching_mode == "LLM Mode":
        st.warning("β οΈ LLM Mode requires an external vLLM server. Falling back to Embedding (fuzzy) matching for standalone use.")

    # (card title, ICONS key, content renderer) in display order; the same
    # order is used later when the section dicts are merged.
    section_plan = (
        ("Personal Details", "user", name_fields_content),
        ("Equalities", "id", identifier_fields_content),
        ("Address Details", "map", addresses_section_content),
        ("Contact Information", "phone", contact_section_content),
        ("Employment Details", "briefcase", other_details_content),
    )

    left_col, right_col = st.columns(2)
    sections_by_record = []
    for column, record_num in ((left_col, 1), (right_col, 2)):
        with column:
            st.markdown(f'<div class="record-header">Record {record_num}</div>', unsafe_allow_html=True)
            sections_by_record.append([
                create_section_card(title, ICONS[icon_key], render_fn, record_num, f"r{record_num}_")
                for title, icon_key, render_fn in section_plan
            ])

    # Nothing more to do until the user asks for a match.
    if not st.button("π Run Record Match", type="primary", use_container_width=True):
        return

    def merge_sections(parts):
        # Later sections overwrite earlier ones on key collisions.
        merged = {}
        for part in parts:
            merged.update(part)
        return merged

    def normalise_value(field, value):
        # Field-specific clean-up; anything else is text-preprocessed if it is a string.
        if field == "dob":
            return standardize_dob(value or "")
        if field.startswith("city_"):
            return standardize_city(value) if value else None
        if field.startswith("state_"):
            return standardize_state(value) if value else None
        if isinstance(value, str):
            return preprocess_text(value)
        return value

    def process(record):
        return {str(key): normalise_value(str(key), value) for key, value in record.items()}

    def display_score(score):
        # A raw score of -1 is reported as a missing value.
        return "missing value" if score == -1 else round(float(score), 2)

    r1 = merge_sections(sections_by_record[0])
    r2 = merge_sections(sections_by_record[1])
    r1p = process(r1)
    r2p = process(r2)

    with st.spinner("Matching records..."):
        raw_scores = match_records(r1p, r2p)

    field_scores = {field: display_score(score) for field, score in raw_scores.items()}
    overall_decision, reason = evaluate_rules(raw_scores)

    st.markdown('''
<div class="result-box">
<div class="result-header">π Matching Result (JSON)</div>
</div>
''', unsafe_allow_html=True)
    st.json(
        {
            "overall_decision": overall_decision,
            "reason": reason,
            "field_scores": field_scores,
        },
        expanded=True,
    )
|
| 742 |
+
|
| 743 |
+
if __name__ == "__main__":
|
| 744 |
+
main()
|