pujithapsx committed on
Commit
0ceaa0b
·
1 Parent(s): 681a0f7

new added

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +742 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,744 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import json
3
+ import re
4
+ import difflib
5
+ from PIL import Image
6
+ import base64
7
+ import os
8
 
9
+ # =========================================================
10
+ # PAGE CONFIG
11
+ # =========================================================
12
# Page-level settings: tab title, tab icon, wide layout, sidebar collapsed.
# NOTE(review): the icon was mis-encoded in this copy of the file (one byte
# lost); the magnifying glass is the most plausible original - confirm.
st.set_page_config(
    page_title="GEN AI Record Level Matching",
    page_icon="🔍",
    layout="wide",
    initial_sidebar_state="collapsed",
)
18
+
19
+ # =========================================================
20
+ # CUSTOM CSS
21
+ # =========================================================
22
+ st.markdown("""
23
+ <style>
24
+ [data-testid="stAppViewContainer"], [data-testid="stApp"], .stApp {
25
+ background-color: #f0f2f5 !important;
26
+ color: #333 !important;
27
+ }
28
+ .main { background-color: #f0f2f5; }
29
+ .stAppDeployButton, .stMainMenu, #MainMenu,
30
+ [data-testid="stToolbarActions"], [data-testid="stStatusWidget"] {
31
+ display: none !important;
32
+ }
33
+ header[data-testid="stHeader"] { background: transparent !important; }
34
+ .block-container { padding-top: 2rem !important; padding-bottom: 2rem !important; }
35
+ .header-title {
36
+ text-align: center; color: #5B4E8B; font-size: 28px;
37
+ font-weight: 600; margin-bottom: 10px;
38
+ }
39
+ .header-subtitle {
40
+ text-align: center; color: #666; font-size: 14px; margin-bottom: 30px;
41
+ }
42
+ .logo-title-container {
43
+ display: flex; align-items: center; justify-content: center;
44
+ gap: 15px; margin-bottom: 10px;
45
+ }
46
+ .record-header {
47
+ color: #612383; font-size: 26px; font-weight: 700;
48
+ margin-bottom: 25px; padding-bottom: 12px;
49
+ border-bottom: 3px solid;
50
+ border-image: linear-gradient(90deg, #612383, #E9592E, #F5A700) 1;
51
+ }
52
+ .section-card {
53
+ background: white; border-radius: 12px;
54
+ box-shadow: 0 2px 8px rgba(0,0,0,0.08);
55
+ margin-bottom: 20px; overflow: hidden;
56
+ }
57
+ .section-header-gradient {
58
+ background: linear-gradient(90deg, #612383 0%, #E9592E 100%);
59
+ color: white; padding: 14px 20px; font-size: 14px;
60
+ font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;
61
+ display: flex; align-items: center; gap: 10px;
62
+ }
63
+ .section-content { padding: 20px; }
64
+ .stTextInput > div > div > input {
65
+ background-color: #fafbfc !important; color: #333 !important;
66
+ border: 1px solid #e1e4e8 !important; border-radius: 8px !important;
67
+ padding: 10px 14px !important; font-size: 14px !important;
68
+ }
69
+ .stTextInput > div > div > input:focus {
70
+ border-color: #E9592E !important;
71
+ box-shadow: 0 0 0 3px rgba(233,89,46,0.1) !important;
72
+ }
73
+ .stTextInput label { color: #555 !important; font-size: 13px !important; font-weight: 500 !important; }
74
+ .subsection-label { color: #666; font-size: 13px; font-weight: 500; margin-bottom: 12px; }
75
+ div[data-testid="stButton"] button:not([kind="primary"]):not([kind="secondary"]) {
76
+ width: 36px !important; height: 36px !important; min-width: 36px !important;
77
+ padding: 0 !important; border-radius: 6px !important; font-size: 18px !important;
78
+ background-color: white !important; color: #612383 !important;
79
+ border: 1px solid #d0d7de !important;
80
+ }
81
+ button[kind="primary"] {
82
+ background: linear-gradient(90deg, #612383 0%, #E9592E 100%) !important;
83
+ color: white !important; border: none !important; border-radius: 10px !important;
84
+ padding: 16px 32px !important; font-size: 16px !important; font-weight: 600 !important;
85
+ box-shadow: 0 4px 12px rgba(97,35,131,0.25) !important;
86
+ text-transform: uppercase; letter-spacing: 0.5px; height: auto !important;
87
+ }
88
+ button[kind="secondary"] {
89
+ background: linear-gradient(90deg, #612383 0%, #E9592E 100%) !important;
90
+ color: white !important; border: none !important; border-radius: 10px !important;
91
+ padding: 12px 24px !important; font-size: 13px !important; font-weight: 600 !important;
92
+ min-width: 140px !important; height: auto !important;
93
+ box-shadow: 0 4px 12px rgba(97,35,131,0.25) !important;
94
+ text-transform: uppercase; letter-spacing: 0.5px;
95
+ }
96
+ .result-box {
97
+ background: white !important; border-radius: 12px !important;
98
+ padding: 25px !important; margin-top: 30px !important;
99
+ box-shadow: 0 4px 16px rgba(0,0,0,0.1) !important;
100
+ border-top: 4px solid;
101
+ border-image: linear-gradient(90deg, #612383, #E9592E, #F5A700) 1;
102
+ }
103
+ .result-header { color: #612383; font-size: 18px; font-weight: 600; margin-bottom: 15px; }
104
+ .section-divider { border: none; border-top: 1px solid #e1e4e8; margin: 20px 0; }
105
+ div[data-testid="stExpander"] summary { color: #333 !important; font-weight: 600 !important; }
106
+ div[data-testid="stExpander"] summary svg { stroke: #333 !important; }
107
+ .address-title { font-weight: 600; color: #612383; font-size: 14px; }
108
+ ::placeholder { color: #666 !important; opacity: 1 !important; }
109
+ [data-testid="stJson"], [data-testid="stCodeBlock"] {
110
+ background-color: #ffffff !important; color: #333333 !important;
111
+ border: 1px solid #e1e4e8 !important; border-radius: 8px !important;
112
+ }
113
+ div[data-testid="stRadio"] label { color: #333 !important; font-size: 14px !important; font-weight: 500 !important; }
114
+ div[data-testid="stRadio"] > label:first-child { color: #222 !important; font-size: 15px !important; font-weight: 600 !important; }
115
+ div[data-testid="stRadio"] div[role="radiogroup"] label[data-baseweb="radio"] div:first-child {
116
+ border-color: #612383 !important;
117
+ }
118
+ div[data-testid="stRadio"] div[role="radiogroup"] label[data-baseweb="radio"] div:first-child div {
119
+ background-color: #612383 !important;
120
+ }
121
+ </style>
122
+ """, unsafe_allow_html=True)
123
+
124
+ # =========================================================
125
+ # SESSION STATE
126
+ # =========================================================
127
# Upper bound on how many address / phone / email rows one record may hold.
MAX_FIELDS = 20

# Initial per-record widget bookkeeping: the ids of the rows currently shown
# for each repeatable section, plus the list of user-defined custom fields.
defaults = dict(
    address_ids_r1=[0], address_ids_r2=[0],
    phone_ids_r1=[0], phone_ids_r2=[0],
    email_ids_r1=[0], email_ids_r2=[0],
    custom_fields_r1=[], custom_fields_r2=[],
)
# Seed only the keys that are absent so values from earlier reruns survive.
for state_key in defaults:
    if state_key in st.session_state:
        continue
    st.session_state[state_key] = defaults[state_key]
138
+
139
+ # =========================================================
140
+ # PURE PYTHON MATCHING LOGIC (no external ML libs)
141
+ # =========================================================
142
+
143
def normalize_text(text):
    """Return *text* trimmed, lower-cased and with whitespace runs collapsed.

    Falsy input (``None``, ``""``, ``0``) yields ``""``; anything else is
    converted with ``str`` before normalisation.
    """
    if text:
        lowered = str(text).strip().lower()
        return re.sub(r"\s+", " ", lowered)
    return ""
147
+
148
def fuzzy_ratio(a, b):
    """Return the difflib similarity of *a* and *b* as an integer in 0-100.

    The ratio is truncated (not rounded). If either argument is falsy the
    result is 0.
    """
    if a and b:
        matcher = difflib.SequenceMatcher(None, a, b)
        return int(matcher.ratio() * 100)
    return 0
153
+
154
def token_sort_ratio(a, b):
    """Return the fuzzy ratio of *a* and *b* after sorting their words.

    Each string is split on whitespace, its tokens sorted and re-joined with
    single spaces, so word order does not affect the score. Returns 0 when
    either argument is falsy.
    """
    if not (a and b):
        return 0
    ordered = [" ".join(sorted(text.split())) for text in (a, b)]
    return fuzzy_ratio(*ordered)
161
+
162
def name_similarity(a, b):
    """Score how alike two name strings are.

    Returns:
        -1 when both values are missing, 0 when exactly one is missing,
        otherwise the larger of the plain and token-sorted fuzzy ratios of
        the normalised strings (0-100).
    """
    if not (a or b):
        return -1  # both missing
    if not (a and b):
        return 0
    left = normalize_text(a)
    right = normalize_text(b)
    return max(fuzzy_ratio(left, right), token_sort_ratio(left, right))
172
+
173
def match_names(name1, fn1, ln1, mn1, name2, fn2, ln2, mn2):
    """Compare the name fields of two records.

    Note the parameter order per record: full name, first, LAST, middle.

    The full-name score compares "first middle last" built from whichever
    parts are non-blank; the free-text full name is used only when no part
    is present. Every score is -1 when the field is absent on both sides.

    Returns:
        dict with keys ``full_name_percent``, ``firstname_percent``,
        ``middlename_percent`` and ``lastname_percent``.
    """

    def compose(name, first, middle, last):
        # Prefer the structured parts; fall back to the single name string.
        pieces = [piece for piece in (first, middle, last) if piece and piece.strip()]
        return " ".join(pieces) if pieces else (name or "")

    def part_score(left, right):
        if not (left or right):
            return -1
        return name_similarity(normalize_text(left), normalize_text(right))

    full1 = normalize_text(compose(name1, fn1, mn1, ln1))
    full2 = normalize_text(compose(name2, fn2, mn2, ln2))

    return {
        "full_name_percent": name_similarity(full1, full2) if (full1 or full2) else -1,
        "firstname_percent": part_score(fn1, fn2),
        "middlename_percent": part_score(mn1, mn2),
        "lastname_percent": part_score(ln1, ln2),
    }
197
+
198
def match_single(a, b):
    """Fuzzy-match one free-text field; -1 when both values are missing."""
    if a or b:
        return name_similarity(normalize_text(a), normalize_text(b))
    return -1
203
+
204
def match_addresses(addrs1, addrs2):
    """Return the best pairwise similarity between two address lists.

    Blank entries are ignored. Returns -1 when neither list has a usable
    address, 0 when only one does, otherwise the highest score (0-100) over
    every cross pair, taking the better of the plain and token-sorted ratio.
    """
    left = [normalize_text(addr) for addr in addrs1 if addr and addr.strip()]
    right = [normalize_text(addr) for addr in addrs2 if addr and addr.strip()]
    if not (left or right):
        return -1
    if not (left and right):
        return 0
    return max(
        max(fuzzy_ratio(x, y), token_sort_ratio(x, y))
        for x in left
        for y in right
    )
219
+
220
def normalize_phone(p):
    """Strip every non-digit character from *p*; falsy input gives ``""``."""
    if p:
        return re.sub(r"\D", "", str(p))
    return ""
224
+
225
def compare_phones(phones1, phones2):
    """Return 100 if any phone in one list matches any phone in the other.

    Numbers are reduced to digits first. Two numbers match when they are
    identical or share the same trailing ten digits (so a leading country
    code is ignored). Returns -1 when both lists are empty after cleaning
    and 0 when only one is, or when nothing matches.
    """
    digits1 = [d for d in (normalize_phone(p) for p in phones1) if d]
    digits2 = [d for d in (normalize_phone(p) for p in phones2) if d]
    if not (digits1 or digits2):
        return -1
    if not (digits1 and digits2):
        return 0
    matched = any(
        x == y or x[-10:] == y[-10:]
        for x in digits1
        for y in digits2
    )
    return 100 if matched else 0
237
+
238
def compare_emails(emails1, emails2):
    """Return 100 if the two email lists share an address, ignoring case.

    Entries are trimmed and lower-cased; blanks are dropped. Returns -1 when
    both lists end up empty, 0 when only one does or no address is shared.
    """

    def cleaned(values):
        return {value.strip().lower() for value in values if value and value.strip()}

    left, right = cleaned(emails1), cleaned(emails2)
    if not left and not right:
        return -1
    if not left or not right:
        return 0
    return 0 if left.isdisjoint(right) else 100
250
+
251
def compare_exact(a, b):
    """Exact comparison after text normalisation.

    Returns -1 when both values are missing, 100 when both are present and
    equal once normalised, and 0 otherwise.
    """
    if not (a or b):
        return -1
    if a and b and normalize_text(a) == normalize_text(b):
        return 100
    return 0
257
+
258
def standardize_city(city):
    """Return *city* trimmed, upper-cased, with whitespace runs collapsed."""
    if city:
        shouted = str(city).strip().upper()
        return re.sub(r"\s+", " ", shouted)
    return ""
262
+
263
def standardize_state(state):
    """Return *state* trimmed, upper-cased, with whitespace runs collapsed."""
    if state:
        shouted = str(state).strip().upper()
        return re.sub(r"\s+", " ", shouted)
    return ""
267
+
268
def standardize_dob(dob):
    """Normalise a date of birth to ``YYYY-MM-DD`` where the shape is known.

    Two layouts are recognised, with ``/`` or ``-`` as separator and one- or
    two-digit day/month (zero-padded in the output):

    * year first:  ``YYYY-MM-DD``
    * day first:   ``DD-MM-YYYY`` (the first number is taken as the day)

    Anything after the date (for example a time component) is ignored, but a
    date immediately followed by another digit is treated as malformed rather
    than silently truncated.

    Args:
        dob: The raw value; non-string values are converted with ``str``.

    Returns:
        The normalised date, the trimmed input when no layout matches, or
        ``""`` for falsy input.
    """
    if not dob:
        return ""
    text = str(dob).strip()

    year_first = re.match(r"(\d{4})[/-](\d{1,2})[/-](\d{1,2})(?!\d)", text)
    if year_first:
        year, month, day = year_first.groups()
        return f"{year}-{int(month):02d}-{int(day):02d}"

    day_first = re.match(r"(\d{1,2})[/-](\d{1,2})[/-](\d{4})(?!\d)", text)
    if day_first:
        day, month, year = day_first.groups()
        return f"{year}-{int(month):02d}-{int(day):02d}"

    return text
282
+
283
def normalize_gender(val):
    """Map common gender spellings to ``'MALE'`` / ``'FEMALE'``.

    Returns ``None`` for falsy input. Unrecognised values are returned
    trimmed and upper-cased.
    """
    if not val:
        return None
    token = str(val).strip().lower()
    aliases = {
        'm': 'MALE', 'male': 'MALE', 'men': 'MALE', 'man': 'MALE',
        'f': 'FEMALE', 'female': 'FEMALE', 'women': 'FEMALE', 'woman': 'FEMALE',
    }
    return aliases.get(token, token.upper())
292
+
293
def score_to_label(score, field):
    """Turn a raw score into its display value.

    ``-1`` becomes the string ``"missing value"``; any other score is
    returned as a float rounded to two decimals. *field* is accepted but not
    used.
    """
    return "missing value" if score == -1 else round(float(score), 2)
298
+
299
def get_dynamic_fields(record, prefix):
    """Collect the values of every ``<prefix><index>`` key in *record*.

    The UI lets users delete rows, so the indices present in a record are not
    necessarily contiguous or zero-based (for example ``phone_0`` and
    ``phone_2``, or only ``phone_1``). All keys whose suffix is a decimal
    integer are therefore gathered, instead of counting up from 0 and
    stopping at the first gap.

    Args:
        record: Mapping of field name to value.
        prefix: Key prefix including the trailing underscore, e.g. ``"phone_"``.

    Returns:
        The matching values ordered by their numeric index.
    """
    indexed = []
    for key in record:
        key_text = str(key)
        if not key_text.startswith(prefix):
            continue
        suffix = key_text[len(prefix):]
        # isdecimal() is True only for characters int() can parse.
        if suffix.isdecimal():
            indexed.append((int(suffix), record[key]))
    indexed.sort(key=lambda pair: pair[0])
    return [value for _, value in indexed]
310
+
311
def is_valid(val):
    """Return True when *val* holds real data rather than a placeholder.

    Falsy values and, after trimming, the placeholders ``-``, ``NA``, ``N/A``
    and ``NULL`` (in any letter case) count as missing. Always returns a
    ``bool`` so the result is safe to store or compare, not just to test.
    """
    if not val:
        return False
    placeholders = {"", "-", "na", "n/a", "null"}
    return str(val).strip().casefold() not in placeholders
313
+
314
def evaluate_rules(scores):
    """Derive an overall match decision from per-field scores.

    Rules, applied in order:

    1. No field with a numeric score other than -1: cannot decide.
    2. Any strong identifier (AADHAR, PAN, PASSPORTID, LICENSEID, VOTERID)
       scoring exactly 100: MATCH.
    3. Otherwise the share of evaluated fields scoring at least 80 decides:
       >= 0.7 MATCH, >= 0.4 POSSIBLE MATCH, else NO MATCH.

    Args:
        scores: Mapping of field name to score; -1 (or a non-numeric value)
            marks a field that could not be evaluated.

    Returns:
        ``(decision, reason)`` as two strings.
    """
    numeric_scores = {
        field: value
        for field, value in scores.items()
        if isinstance(value, (int, float)) and value != -1
    }
    if not numeric_scores:
        return "UNABLE TO DETERMINE", "Insufficient data to make a determination."

    # A single government-issued identifier that matches exactly is decisive.
    for sid in ("AADHAR", "PAN", "PASSPORTID", "LICENSEID", "VOTERID"):
        if scores.get(sid) == 100:
            return "MATCH", f"Strong identifier match on {sid}."

    high_matches = sum(1 for value in numeric_scores.values() if value >= 80)
    total_evaluated = len(numeric_scores)  # non-zero: guarded above
    match_ratio = high_matches / total_evaluated
    summary = f"{high_matches}/{total_evaluated} fields matched at ≥80%."

    if match_ratio >= 0.7:
        return "MATCH", summary
    if match_ratio >= 0.4:
        return "POSSIBLE MATCH", summary
    return "NO MATCH", f"Only {summary}"
347
+
348
def match_records(r1, r2):
    """Run every field comparison between two records.

    Args:
        r1, r2: Flat dicts of field name to value. Repeatable fields use
            indexed keys (``addressline_0``, ``phone_1``, ...); any key that
            is neither a known field nor an indexed one is treated as a
            user-defined custom field and compared exactly.

    Returns:
        Dict of upper-case field label to score: 0-100, or -1 when the field
        is missing on both sides.
    """

    def any_shared(prefix, standardize):
        # 100 if any valid value on one side equals any on the other;
        # -1 when neither side has a valid value, otherwise 0.
        left = {standardize(v) for v in get_dynamic_fields(r1, prefix) if is_valid(v)}
        right = {standardize(v) for v in get_dynamic_fields(r2, prefix) if is_valid(v)}
        if not left and not right:
            return -1
        return 100 if left & right else 0

    def exact(key):
        return compare_exact(r1.get(key), r2.get(key))

    def fuzzy(key):
        return match_single(r1.get(key), r2.get(key))

    # Name matching (match_names takes first, LAST, middle per record).
    name_result = match_names(
        r1.get("name"), r1.get("firstname"), r1.get("lastname"), r1.get("middlename"),
        r2.get("name"), r2.get("firstname"), r2.get("lastname"), r2.get("middlename"),
    )

    address_score = match_addresses(
        get_dynamic_fields(r1, "addressline_"), get_dynamic_fields(r2, "addressline_")
    )
    phone_score = compare_phones(
        get_dynamic_fields(r1, "phone_"), get_dynamic_fields(r2, "phone_")
    )
    email_score = compare_emails(
        get_dynamic_fields(r1, "email_"), get_dynamic_fields(r2, "email_")
    )

    city_score = any_shared("city_", standardize_city)
    state_score = any_shared("state_", standardize_state)
    # Zip codes are compared any-to-any like city and state, so a shared
    # secondary address still counts; previously only the first valid zip of
    # each record was considered.
    zipcode_score = any_shared("zipcode_", normalize_text)

    g1 = normalize_gender(r1.get("gender"))
    g2 = normalize_gender(r2.get("gender"))
    if not g1 and not g2:
        gender_score = -1
    elif g1 and g2:
        gender_score = 100 if g1 == g2 else 0
    else:
        gender_score = 0

    results = {
        "GENDER": gender_score,
        "NAME": name_result["full_name_percent"],
        "FIRSTNAME": name_result["firstname_percent"],
        "MIDDLENAME": name_result["middlename_percent"],
        "LASTNAME": name_result["lastname_percent"],
        "SPOUSENAME": fuzzy("spousename"),
        "MOTHERNAME": fuzzy("mothername"),
        "FATHERNAME": fuzzy("fathername"),
        "COMPANYNAME": fuzzy("companyname"),
        "PARENTCOMPANYNAME": fuzzy("parentcompanyname"),
        "AADHAR": exact("AADHAR"),
        "PAN": exact("pan"),
        "LICENSEID": exact("licenseid"),
        "PASSPORTID": exact("passportid"),
        "VOTERID": exact("voterid"),
        "BIRTHDATE": exact("dob"),
        "PHONE": phone_score,
        "EMAIL": email_score,
        "ADDRESSLINE": address_score,
        "CITY": city_score,
        "STATE": state_score,
        "ZIPCODE": zipcode_score,
    }

    # Custom fields: everything not handled above.
    known = {"name", "firstname", "middlename", "lastname", "spousename", "mothername",
             "fathername", "dob", "gender", "AADHAR", "pan", "licenseid", "passportid",
             "voterid", "companyname", "parentcompanyname"}
    dyn_prefixes = ("zipcode_", "city_", "state_", "phone_", "email_", "addressline_")

    # Sorted so the output order is deterministic (set order is not).
    for key in sorted(set(r1) | set(r2), key=str):
        ks = str(key)
        if ks in known or ks.startswith(dyn_prefixes):
            continue
        v1, v2 = r1.get(key), r2.get(key)
        if v1 or v2:
            label = ks.upper()
            # A custom field named like a built-in (e.g. "Phone") must not
            # overwrite the built-in score.
            if label in results:
                label = f"CUSTOM_{label}"
            results[label] = compare_exact(v1, v2)

    return results
450
+
451
+ # =========================================================
452
+ # UI HELPERS
453
+ # =========================================================
454
def preprocess_text(text):
    """Trim *text* and collapse whitespace runs; falsy input gives ``""``.

    Unlike ``normalize_text`` the letter case is left untouched and the
    argument must already be a string.
    """
    if text:
        trimmed = text.strip()
        return re.sub(r"\s+", " ", trimmed)
    return ""
458
+
459
def create_section_card(title, icon_svg, content_func, *args, **kwargs):
    """Emit a titled card header, render its content, then emit the closer.

    Args:
        title: Text shown in the gradient header.
        icon_svg: Inline SVG markup placed before the title.
        content_func: Callable that renders the card's widgets; called with
            ``*args`` and ``**kwargs``.

    Returns:
        Whatever ``content_func`` returns.
    """
    opening = f'''
<div class="section-card">
<div class="section-header-gradient">{icon_svg} {title}</div>
<div class="section-content">
'''
    st.markdown(opening, unsafe_allow_html=True)
    rendered = content_func(*args, **kwargs)
    st.markdown('</div></div>', unsafe_allow_html=True)
    return rendered
468
+
469
# Inline 16x16 SVG icons, keyed by section, that main() passes to
# create_section_card for the card headers. They use stroke="currentColor",
# so they take the text colour of the element they are placed in.
ICONS = {
    "user": '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M20 21v-2a4 4 0 0 0-4-4H8a4 4 0 0 0-4 4v2"></path><circle cx="12" cy="7" r="4"></circle></svg>',
    "id": '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><rect x="3" y="4" width="18" height="16" rx="2"></rect><line x1="16" y1="2" x2="16" y2="6"></line><line x1="8" y1="2" x2="8" y2="6"></line><line x1="3" y1="10" x2="21" y2="10"></line></svg>',
    "map": '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><polygon points="3 6 9 3 15 6 21 3 21 18 15 21 9 18 3 21"></polygon></svg>',
    "phone": '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M22 16.92v3a2 2 0 0 1-2.18 2 19.79 19.79 0 0 1-8.63-3.07 19.5 19.5 0 0 1-6-6 19.79 19.79 0 0 1-3.07-8.67A2 2 0 0 1 4.11 2h3a2 2 0 0 1 2 1.72c.127.96.361 1.903.7 2.81a2 2 0 0 1-.45 2.11L8.09 9.91a16 16 0 0 0 6 6l1.27-1.27a2 2 0 0 1 2.11-.45c.907.339 1.85.573 2.81.7A2 2 0 0 1 22 16.92z"></path></svg>',
    "briefcase": '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><rect x="2" y="7" width="20" height="14" rx="2" ry="2"></rect><path d="M16 21V5a2 2 0 0 0-2-2h-4a2 2 0 0 0-2 2v16"></path></svg>'
}
476
+
477
+ # =========================================================
478
+ # SECTION CONTENT FUNCTIONS
479
+ # =========================================================
480
+
481
def name_fields_content(record_num, prefix=""):
    """Render the personal-details inputs, two per row, and return their values.

    Args:
        record_num: Record number, appended to every widget key.
        prefix: String prepended to every widget key (e.g. ``"r1_"``).

    Returns:
        Dict with keys name, firstname, middlename, lastname, mothername,
        fathername, spousename, othername, gender and dob.
    """
    # One tuple per row: (field key, widget label, placeholder) for each column.
    layout = (
        (("name", "Full Name", "Enter full name"),
         ("firstname", "First Name", "Enter first name")),
        (("middlename", "Middle Name", "Enter middle name"),
         ("lastname", "Last Name", "Enter last name")),
        (("mothername", "Mother's Name", "Enter mother's name"),
         ("fathername", "Father's Name", "Enter father's name")),
        (("spousename", "Spouse's Name", "Enter spouse's name"),
         ("othername", "Other Name", "Enter other name")),
        (("dob", "Date of Birth", "YYYY-MM-DD"),
         ("gender", "Gender", "Male/Female/Other")),
    )
    entered = {}
    for row in layout:
        for column, (field, label, hint) in zip(st.columns(2), row):
            with column:
                entered[field] = st.text_input(
                    label, key=f"{prefix}{field}_{record_num}", placeholder=hint
                )

    output_order = ("name", "firstname", "middlename", "lastname", "mothername",
                    "fathername", "spousename", "othername", "gender", "dob")
    return {field: entered[field] for field in output_order}
512
+
513
def identifier_fields_content(record_num, prefix=""):
    """Render the identifier inputs plus user-defined custom fields.

    Custom fields live in ``st.session_state["custom_fields_<record>"]`` as a
    list of dicts. Each entry gets a stable ``id`` that is used in its widget
    keys. Keys based on list position would, after a row is removed, attach
    the remaining rows to the previous occupant's widget state, so the wrong
    row would appear to be deleted.

    Args:
        record_num: Record number, appended to every widget key.
        prefix: String prepended to every widget key (e.g. ``"r1_"``); with
            underscores stripped it selects the session-state list.

    Returns:
        Dict with keys AADHAR, pan, licenseid, passportid, voterid, plus one
        entry per custom field that has a name.
    """
    col1, col2 = st.columns(2)
    with col1:
        aadhar = st.text_input("Aadhar Number", key=f"{prefix}taxid_{record_num}", placeholder="Enter Aadhar number")
    with col2:
        pan = st.text_input("PAN Number", key=f"{prefix}pan_{record_num}", placeholder="Enter PAN number")
    col1, col2 = st.columns(2)
    with col1:
        license_id = st.text_input("License Number", key=f"{prefix}licenseid_{record_num}", placeholder="Enter license number")
    with col2:
        passport = st.text_input("Passport Number", key=f"{prefix}passportid_{record_num}", placeholder="Enter passport number")
    col1, _ = st.columns(2)
    with col1:
        voter_id = st.text_input("Voter ID", key=f"{prefix}voterid_{record_num}", placeholder="Enter voter ID")

    st.markdown('<div class="subsection-label" style="margin-top:15px;">Custom Fields</div>', unsafe_allow_html=True)

    custom_fields = st.session_state[f"custom_fields_{prefix.strip('_')}"]
    custom_data = {}

    # Assign ids to entries that lack one (e.g. created before ids existed).
    next_id = max((field.get('id', -1) for field in custom_fields), default=-1) + 1
    for field in custom_fields:
        if 'id' not in field:
            field['id'] = next_id
            next_id += 1

    for idx, field in enumerate(custom_fields):
        field_id = field['id']
        col_c1, col_c2, col_rem = st.columns([5, 5, 1])
        with col_c1:
            field_name = st.text_input(f"Field Name {idx+1}", value=field.get('name', ''),
                                       key=f"{prefix}custom_name_{field_id}_{record_num}", placeholder="Field Name")
            field['name'] = field_name
        with col_c2:
            field_val = st.text_input(f"Field Value {idx+1}", value=field.get('value', ''),
                                      key=f"{prefix}custom_val_{field_id}_{record_num}", placeholder="Value")
            field['value'] = field_val
        if field_name:
            custom_data[field_name] = field_val
        with col_rem:
            # Two empty writes push the button down to line up with the inputs.
            st.write("")
            st.write("")
            if st.button("−", key=f"{prefix}remove_custom_{field_id}_{record_num}"):
                custom_fields.remove(field)
                st.rerun()

    if st.button("+ ADD FIELD", key=f"{prefix}add_custom_{record_num}", type="secondary"):
        custom_fields.append({'id': next_id, 'name': '', 'value': ''})
        st.rerun()

    result = {"AADHAR": aadhar, "pan": pan, "licenseid": license_id, "passportid": passport, "voterid": voter_id}
    result.update(custom_data)
    return result
559
+
560
def address_item_content(record_num, addr_id, prefix=""):
    """Render the four inputs of one address and return their values.

    Returns:
        Dict keyed ``addressline_<id>``, ``city_<id>``, ``state_<id>`` and
        ``zipcode_<id>``.
    """
    specs = (
        ("addressline", "Street Address", "Street, Building, Area"),
        ("city", "City", "Enter city"),
        ("state", "State", "Enter state"),
        ("zipcode", "Pincode", "6-digit postal code"),
    )
    return {
        f"{field}_{addr_id}": st.text_input(
            label, key=f"{prefix}{field}_{addr_id}_{record_num}", placeholder=hint
        )
        for field, label, hint in specs
    }
571
+
572
def addresses_section_content(record_num, prefix=""):
    """Render the add/remove-able list of addresses for one record.

    The ids of the addresses currently shown are kept in
    ``st.session_state["address_ids_<record>"]``; ids are never reused within
    the list, so each address keeps its widget keys when others are removed.

    Args:
        record_num: Record number, appended to every widget key.
        prefix: String prepended to every widget key (e.g. ``"r1_"``).

    Returns:
        Dict merging the values of every address (see address_item_content).
    """
    ids_key = f"address_ids_{prefix.strip('_')}"
    ids = st.session_state[ids_key]
    addresses = {}
    col_title, col_add = st.columns([6, 1])
    with col_title:
        st.markdown('<div class="subsection-label">Manage Addresses</div>', unsafe_allow_html=True)
    with col_add:
        if len(ids) < MAX_FIELDS and st.button("＋", key=f"{prefix}add_address_{record_num}"):
            ids.append(max(ids) + 1 if ids else 0)
            st.rerun()
    for idx, addr_id in enumerate(ids):
        header_cols = st.columns([8, 1])
        with header_cols[0]:
            # Label by position, not id: after address 0 is removed the first
            # remaining address must still read "Primary Address".
            header_text = "Primary Address" if idx == 0 else f"Address {idx + 1}"
            st.markdown(f"<div class='address-title'>{header_text}</div>", unsafe_allow_html=True)
        with header_cols[1]:
            if len(ids) > 1 and st.button("−", key=f"{prefix}remove_address_{addr_id}_{record_num}"):
                ids.remove(addr_id)
                st.rerun()
        addresses.update(address_item_content(record_num, addr_id, prefix))
        if idx < len(ids) - 1:
            st.markdown("<hr style='margin:20px 0;border:none;border-top:1px solid #e1e4e8;'>", unsafe_allow_html=True)
    return addresses
599
+
600
def _contact_rows(kind, label, hint, ids, record_num, prefix):
    """Render one input row per id with add/remove buttons; return the values.

    ``kind`` ("phone" or "email") is used in both the widget keys and the
    returned dict keys (``<kind>_<id>``). ``ids`` is the session-state list
    and is mutated in place when a button is pressed.
    """
    rows = {}
    for item_id in ids:
        input_col, add_col, remove_col = st.columns([8, 1, 1])
        with input_col:
            rows[f"{kind}_{item_id}"] = st.text_input(
                f"{label} {item_id+1}", key=f"{prefix}{kind}_{item_id}_{record_num}",
                placeholder=hint, label_visibility="collapsed")
        with add_col:
            if len(ids) < MAX_FIELDS and st.button("＋", key=f"{prefix}add_{kind}_{item_id}_{record_num}"):
                ids.append(max(ids) + 1)
                st.rerun()
        with remove_col:
            if len(ids) > 1 and st.button("−", key=f"{prefix}remove_{kind}_{item_id}_{record_num}"):
                ids.remove(item_id)
                st.rerun()
    return rows


def contact_section_content(record_num, prefix=""):
    """Render the phone and email lists of one record.

    Args:
        record_num: Record number, appended to every widget key.
        prefix: String prepended to every widget key (e.g. ``"r1_"``); with
            underscores stripped it selects the session-state id lists.

    Returns:
        Dict keyed ``phone_<id>`` / ``email_<id>`` with the entered values.
    """
    contacts = {}
    r = prefix.strip("_")
    phone_ids = st.session_state[f"phone_ids_{r}"]
    email_ids = st.session_state[f"email_ids_{r}"]

    st.markdown('<div class="subsection-label">📞 Phone Numbers</div>', unsafe_allow_html=True)
    contacts.update(_contact_rows("phone", "Phone", "Enter phone number", phone_ids, record_num, prefix))

    st.markdown('<hr class="section-divider">', unsafe_allow_html=True)
    st.markdown('<div class="subsection-label">✉️ Email Addresses</div>', unsafe_allow_html=True)
    contacts.update(_contact_rows("email", "Email", "Enter email address", email_ids, record_num, prefix))
    return contacts
644
+
645
def other_details_content(record_num, prefix=""):
    """Render the two employment inputs side by side and return their values.

    Returns:
        Dict with keys ``companyname`` and ``parentcompanyname``.
    """
    left, right = st.columns(2)
    details = {}
    with left:
        details["companyname"] = st.text_input(
            "Company Name", key=f"{prefix}companyname_{record_num}",
            placeholder="Enter company name")
    with right:
        details["parentcompanyname"] = st.text_input(
            "Parent Company Name", key=f"{prefix}parentcompanyname_{record_num}",
            placeholder="Enter parent company name")
    return details
652
+
653
+ # =========================================================
654
+ # MAIN
655
+ # =========================================================
656
def main():
    """Render the two-record entry form and, on request, the match result.

    Both records are collected from five section cards each. When the run
    button is pressed the values are cleaned, scored field by field with
    match_records, summarised by evaluate_rules and shown as JSON.
    """
    st.markdown('''
<div class="logo-title-container">
<div style="font-size:36px;">🔍</div>
<div class="header-title">Record Level Matching Using Transformer based Models</div>
</div>
''', unsafe_allow_html=True)
    st.markdown('<div class="header-subtitle">Enter details for two records below and click "Run Record Match" to see the matching result</div>', unsafe_allow_html=True)

    # Mode selector (UI only — Embedding is the only functional mode here)
    mode_col1, _ = st.columns([4, 6])
    with mode_col1:
        matching_mode = st.radio(
            "Matching Mode",
            ["Embedding Mode", "LLM Mode"],
            key="matching_mode",
            horizontal=True,
            help="Embedding: Fuzzy/Token-based matching | LLM Mode: Requires external LLM server (unavailable in standalone)"
        )

    if matching_mode == "LLM Mode":
        st.warning("⚠️ LLM Mode requires an external vLLM server. Falling back to Embedding (fuzzy) matching for standalone use.")

    col1, col2 = st.columns(2)

    with col1:
        st.markdown('<div class="record-header">Record 1</div>', unsafe_allow_html=True)
        r1_names = create_section_card("Personal Details", ICONS["user"], name_fields_content, 1, "r1_")
        r1_identifiers = create_section_card("Equalities", ICONS["id"], identifier_fields_content, 1, "r1_")
        r1_addresses = create_section_card("Address Details", ICONS["map"], addresses_section_content, 1, "r1_")
        r1_contacts = create_section_card("Contact Information", ICONS["phone"], contact_section_content, 1, "r1_")
        r1_other = create_section_card("Employment Details", ICONS["briefcase"], other_details_content, 1, "r1_")

    with col2:
        st.markdown('<div class="record-header">Record 2</div>', unsafe_allow_html=True)
        r2_names = create_section_card("Personal Details", ICONS["user"], name_fields_content, 2, "r2_")
        r2_identifiers = create_section_card("Equalities", ICONS["id"], identifier_fields_content, 2, "r2_")
        r2_addresses = create_section_card("Address Details", ICONS["map"], addresses_section_content, 2, "r2_")
        r2_contacts = create_section_card("Contact Information", ICONS["phone"], contact_section_content, 2, "r2_")
        r2_other = create_section_card("Employment Details", ICONS["briefcase"], other_details_content, 2, "r2_")

    if st.button("🚀 Run Record Match", type="primary", use_container_width=True):
        # Later sections win on key clashes (e.g. a custom field named "name").
        r1 = {**r1_names, **r1_identifiers, **r1_addresses, **r1_contacts, **r1_other}
        r2 = {**r2_names, **r2_identifiers, **r2_addresses, **r2_contacts, **r2_other}

        def process(r):
            # Clean each value according to its field type before matching.
            out = {}
            for k, v in r.items():
                k_str = str(k)
                if k_str == "dob":
                    out[k_str] = standardize_dob(v or "")
                elif k_str.startswith("city_"):
                    out[k_str] = standardize_city(v) if v else None
                elif k_str.startswith("state_"):
                    out[k_str] = standardize_state(v) if v else None
                else:
                    out[k_str] = preprocess_text(v) if isinstance(v, str) else v
            return out

        r1p = process(r1)
        r2p = process(r2)

        with st.spinner("Matching records..."):
            raw_scores = match_records(r1p, r2p)

            # Display form: -1 becomes "missing value", the rest 2-dp floats.
            field_scores = {k: score_to_label(v, k) for k, v in raw_scores.items()}
            overall_decision, reason = evaluate_rules(raw_scores)

            result = {
                "overall_decision": overall_decision,
                "reason": reason,
                "field_scores": field_scores,
            }

        st.markdown('''
<div class="result-box">
<div class="result-header">📊 Matching Result (JSON)</div>
</div>
''', unsafe_allow_html=True)
        st.json(result, expanded=True)


if __name__ == "__main__":
    main()