Roger Surf committed on
Commit
6e8d673
·
1 Parent(s): 552e62e

HF: remove evaluation artifacts and ignore permanently

Browse files
pages/4_👤_Candidate_View.py CHANGED
@@ -45,7 +45,6 @@ def compute_bilateral_fairness(
45
  comp_mean = float(np.mean(comp_scores))
46
 
47
  fairness = min(cand_mean, comp_mean) / max(cand_mean, comp_mean)
48
-
49
  return cand_mean, comp_mean, fairness
50
 
51
 
@@ -59,7 +58,7 @@ def cached_fairness(candidate_embeddings, company_embeddings, top_k):
59
  )
60
 
61
  # =========================================================
62
- # COMPUTES SCORE DISTRIBUTION
63
  # =========================================================
64
  @st.cache_data(show_spinner=False)
65
  def compute_score_distribution(
@@ -67,12 +66,6 @@ def compute_score_distribution(
67
  company_embeddings,
68
  sample_size=200
69
  ):
70
- """
71
- Compute a global score distribution using random candidate samples
72
- """
73
- import numpy as np
74
- from sklearn.metrics.pairwise import cosine_similarity
75
-
76
  n = min(sample_size, len(candidate_embeddings))
77
  scores = []
78
 
@@ -86,9 +79,9 @@ def compute_score_distribution(
86
  return np.array(scores)
87
 
88
  # =========================================================
89
- # BUILD NETWORK GRAPH
90
  # =========================================================
91
- @st.cache_data(show_spinner=False)
92
  def build_network_graph(
93
  candidate_embeddings,
94
  company_embeddings,
@@ -98,8 +91,6 @@ def build_network_graph(
98
  sample_size=15
99
  ):
100
  from pyvis.network import Network
101
- import numpy as np
102
- from sklearn.metrics.pairwise import cosine_similarity
103
 
104
  net = Network(
105
  height="600px",
@@ -110,18 +101,17 @@ def build_network_graph(
110
 
111
  n_cand = min(sample_size, len(candidate_embeddings))
112
 
113
- # Add candidate nodes
114
  for i in range(n_cand):
115
- label = f"Candidate {i}"
116
  net.add_node(
117
  f"cand_{i}",
118
- label=label,
119
  color="#667eea",
120
  shape="dot",
121
  size=18
122
  )
123
 
124
- # Add company nodes + edges
125
  for i in range(n_cand):
126
  sims = cosine_similarity(
127
  candidate_embeddings[i].reshape(1, -1),
@@ -131,11 +121,11 @@ def build_network_graph(
131
  top_idx = np.argsort(sims)[-top_k:][::-1]
132
 
133
  for j in top_idx:
134
- company_name = companies_meta.iloc[j].get("name", f"Company {j}")
135
 
136
  net.add_node(
137
  f"comp_{j}",
138
- label=company_name,
139
  color="#2ecc71",
140
  shape="box",
141
  size=14
@@ -151,15 +141,9 @@ def build_network_graph(
151
  return net
152
 
153
  # =========================================================
154
- # LLM-BASED MATCH EXPLANATION
155
  # =========================================================
156
  def explain_match_llm(candidate_row, company_row, score):
157
- """
158
- Post-hoc LLM explanation for a single match.
159
- Safe: does NOT affect ranking.
160
- """
161
- import os
162
-
163
  HF_TOKEN = os.getenv("HF_TOKEN")
164
 
165
  if not HF_TOKEN:
@@ -172,6 +156,7 @@ def explain_match_llm(candidate_row, company_row, score):
172
 
173
  try:
174
  from huggingface_hub import InferenceClient
 
175
 
176
  client = InferenceClient(token=HF_TOKEN)
177
 
@@ -193,10 +178,10 @@ Required Skills: {company_row.get('required_skills','')}
193
  MATCH SCORE: {score:.3f}
194
 
195
  Return a concise explanation in JSON with keys:
196
- - strengths (list)
197
- - gaps (list)
198
- - recommendation (string)
199
- - summary (string)
200
  """
201
 
202
  response = client.chat_completion(
@@ -206,8 +191,6 @@ Return a concise explanation in JSON with keys:
206
  )
207
 
208
  content = response.choices[0].message.content
209
-
210
- import json
211
  start, end = content.find("{"), content.rfind("}") + 1
212
  return json.loads(content[start:end])
213
 
@@ -219,7 +202,6 @@ Return a concise explanation in JSON with keys:
219
  "recommendation": "Review manually."
220
  }
221
 
222
-
223
  # =========================================================
224
  # PAGE CONFIG
225
  # =========================================================
@@ -230,7 +212,7 @@ st.set_page_config(
230
  )
231
 
232
  # =========================================================
233
- # PATHS (V3 = REPORT CONSISTENT)
234
  # =========================================================
235
  BASE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
236
  DATA_PATH = os.path.join(BASE_PATH, "data", "v3", "processed")
@@ -241,7 +223,7 @@ CAND_META_PATH = os.path.join(DATA_PATH, "candidates_metadata.pkl")
241
  COMP_META_PATH = os.path.join(DATA_PATH, "companies_metadata.pkl")
242
 
243
  # =========================================================
244
- # LOAD CORE DATA
245
  # =========================================================
246
  @st.cache_resource
247
  def load_core():
@@ -287,16 +269,16 @@ left, right = st.columns([1, 2])
287
  with left:
288
  st.subheader("๐Ÿ‘ค Candidate Profile")
289
 
290
- st.markdown(f"**Category:** {candidate.get('Category', 'N/A')}")
291
 
292
  with st.expander("๐Ÿง  Skills", expanded=True):
293
- st.write(candidate.get("skills", "N/A"))
294
 
295
  with st.expander("๐ŸŽฏ Career Objective", expanded=True):
296
- st.write(candidate.get("career_objective", "N/A"))
297
 
298
  # =========================================================
299
- # MATCHING (REAL)
300
  # =========================================================
301
  cand_vec = candidate_embeddings[candidate_id].reshape(1, -1)
302
  scores = cosine_similarity(cand_vec, company_embeddings)[0]
@@ -309,15 +291,15 @@ for rank, (idx, score) in enumerate(zip(top_idx, top_scores), start=1):
309
  company = companies_meta.iloc[idx]
310
  rows.append({
311
  "Rank": rank,
312
- "Company": company.get("name", "Unknown"),
313
- "Industry": company.get("industries_list", "N/A"),
314
  "Score": score
315
  })
316
 
317
  df = pd.DataFrame(rows)
318
 
319
  # =========================================================
320
- # MATCH METRICS + TABLE
321
  # =========================================================
322
  with right:
323
  st.subheader("๐Ÿ“Š Match Overview")
@@ -330,9 +312,7 @@ with right:
330
  st.subheader("๐Ÿข Top Company Matches")
331
 
332
  def style_score(val):
333
- if val > threshold:
334
- return "color: green; font-weight: bold;"
335
- return ""
336
 
337
  st.dataframe(
338
  df.style.applymap(style_score, subset=["Score"]),
@@ -340,78 +320,34 @@ with right:
340
  )
341
 
342
  # =========================================================
343
- # FAIRNESS PANEL
344
  # =========================================================
345
  st.markdown("---")
346
  st.subheader("โš–๏ธ Bilateral Fairness (Top-K)")
347
 
348
- with st.expander("What does this mean?"):
349
- st.markdown("""
350
- **Bilateral Fairness** evaluates whether the system treats
351
- candidates and companies symmetrically.
352
-
353
- - Candidate โ†’ Company: mean Top-K similarity
354
- - Company โ†’ Candidate: mean Top-K similarity
355
-
356
- Values near **1.0** indicate a balanced system.
357
- Lower values are expected in retrieval-based systems.
358
- """)
359
-
360
- with st.spinner("Computing fairness metrics..."):
361
- cand_mean, comp_mean, fairness = cached_fairness(
362
- candidate_embeddings,
363
- company_embeddings,
364
- top_k
365
- )
366
 
367
  c1, c2, c3 = st.columns(3)
368
  c1.metric("Candidate โ†’ Company", f"{cand_mean:.3f}")
369
  c2.metric("Company โ†’ Candidate", f"{comp_mean:.3f}")
370
  c3.metric("Fairness Ratio", f"{fairness:.3f}")
371
 
372
- if fairness >= 0.9:
373
- st.success("โœ… System is highly balanced")
374
- elif fairness >= 0.6:
375
- st.info("โ„น๏ธ System is reasonably balanced (expected for Top-K)")
376
- else:
377
- st.warning("โš ๏ธ Potential asymmetry detected")
378
-
379
  # =========================================================
380
  # SCORE DISTRIBUTION
381
  # =========================================================
382
  st.markdown("---")
383
  st.subheader("๐Ÿ“ˆ Score Distribution")
384
 
385
- with st.expander("How to interpret this?", expanded=False):
386
- st.markdown("""
387
- This histogram shows the **distribution of cosine similarity scores**
388
- between candidates and companies.
389
-
390
- **Important interpretation:**
391
- - Scores above **0.6** are already considered **strong semantic matches**
392
- - Scores above **0.7** are **rare and exceptional**
393
- - The system is evaluated by **ranking**, not absolute thresholds
394
- """)
395
-
396
- with st.spinner("Computing score distribution..."):
397
- score_dist = compute_score_distribution(
398
- candidate_embeddings,
399
- company_embeddings,
400
- sample_size=200
401
- )
402
-
403
- # Histogram
404
- hist_df = pd.DataFrame({"Similarity Score": score_dist})
405
-
406
- st.bar_chart(
407
- hist_df["Similarity Score"].value_counts(bins=30).sort_index()
408
  )
409
 
410
- # Reference lines (textual)
411
- c1, c2, c3 = st.columns(3)
412
- c1.metric("Mean Score", f"{score_dist.mean():.3f}")
413
- c2.metric("95th Percentile", f"{np.percentile(score_dist, 95):.3f}")
414
- c3.metric("Max Observed", f"{score_dist.max():.3f}")
415
 
416
  # =========================================================
417
  # NETWORK GRAPH
@@ -419,69 +355,44 @@ c3.metric("Max Observed", f"{score_dist.max():.3f}")
419
  st.markdown("---")
420
  st.subheader("๐ŸŒ Matching Network Graph")
421
 
422
- with st.expander("What does this show?", expanded=False):
423
- st.markdown("""
424
- This network visualizes the **Top-K semantic relationships**
425
- between candidates and companies.
426
-
427
- - ๐Ÿ”ต Blue nodes: Candidates
428
- - ๐ŸŸข Green nodes: Companies
429
- - Edges represent strong semantic matches
430
-
431
- The graph helps detect:
432
- - Structural bias
433
- - Over-dominant companies
434
- - Diversity of matches
435
- """)
436
-
437
- with st.spinner("Building network graph..."):
438
- net = build_network_graph(
439
- candidate_embeddings,
440
- company_embeddings,
441
- candidates_meta,
442
- companies_meta,
443
- top_k=3,
444
- sample_size=12
445
- )
446
 
447
- html_path = os.path.join(BASE_PATH, "data", "v3", "results", "network_temp.html")
 
448
  net.write_html(html_path)
449
 
450
  import streamlit.components.v1 as components
451
- components.html(
452
- open(html_path, "r").read(),
453
- height=620,
454
- scrolling=True
455
- )
456
 
457
  # =========================================================
458
- # LLM EXPLAINABILITY (TOP-1)
459
  # =========================================================
460
  st.markdown("---")
461
  st.subheader("๐Ÿค– Match Explanation (LLM)")
462
 
463
  with st.expander("Why is this company a good match?", expanded=True):
464
- top_company_idx = top_idx[0]
465
- top_company = companies_meta.iloc[top_company_idx]
466
  top_score = top_scores[0]
467
 
468
  if st.button("Generate AI Explanation"):
469
- with st.spinner("LLM analyzing match..."):
470
- explanation = explain_match_llm(
471
- candidate,
472
- top_company,
473
- top_score
474
- )
475
 
476
  st.markdown(f"**Summary:** {explanation.get('summary','')}")
477
 
478
  c1, c2 = st.columns(2)
479
-
480
  with c1:
481
  st.markdown("### โœ… Strengths")
482
  for s in explanation.get("strengths", []):
483
  st.write(f"- {s}")
484
-
485
  with c2:
486
  st.markdown("### โš ๏ธ Gaps")
487
  for g in explanation.get("gaps", []):
 
45
  comp_mean = float(np.mean(comp_scores))
46
 
47
  fairness = min(cand_mean, comp_mean) / max(cand_mean, comp_mean)
 
48
  return cand_mean, comp_mean, fairness
49
 
50
 
 
58
  )
59
 
60
  # =========================================================
61
+ # SCORE DISTRIBUTION
62
  # =========================================================
63
  @st.cache_data(show_spinner=False)
64
  def compute_score_distribution(
 
66
  company_embeddings,
67
  sample_size=200
68
  ):
 
 
 
 
 
 
69
  n = min(sample_size, len(candidate_embeddings))
70
  scores = []
71
 
 
79
  return np.array(scores)
80
 
81
  # =========================================================
82
+ # NETWORK GRAPH
83
  # =========================================================
84
+ @st.cache_resource(show_spinner=False)
85
  def build_network_graph(
86
  candidate_embeddings,
87
  company_embeddings,
 
91
  sample_size=15
92
  ):
93
  from pyvis.network import Network
 
 
94
 
95
  net = Network(
96
  height="600px",
 
101
 
102
  n_cand = min(sample_size, len(candidate_embeddings))
103
 
104
+ # Candidate nodes
105
  for i in range(n_cand):
 
106
  net.add_node(
107
  f"cand_{i}",
108
+ label=f"Candidate {i}",
109
  color="#667eea",
110
  shape="dot",
111
  size=18
112
  )
113
 
114
+ # Company nodes + edges
115
  for i in range(n_cand):
116
  sims = cosine_similarity(
117
  candidate_embeddings[i].reshape(1, -1),
 
121
  top_idx = np.argsort(sims)[-top_k:][::-1]
122
 
123
  for j in top_idx:
124
+ label = companies_meta.iloc[j].get("name", f"Company {j}")
125
 
126
  net.add_node(
127
  f"comp_{j}",
128
+ label=label,
129
  color="#2ecc71",
130
  shape="box",
131
  size=14
 
141
  return net
142
 
143
  # =========================================================
144
+ # LLM EXPLANATION
145
  # =========================================================
146
  def explain_match_llm(candidate_row, company_row, score):
 
 
 
 
 
 
147
  HF_TOKEN = os.getenv("HF_TOKEN")
148
 
149
  if not HF_TOKEN:
 
156
 
157
  try:
158
  from huggingface_hub import InferenceClient
159
+ import json
160
 
161
  client = InferenceClient(token=HF_TOKEN)
162
 
 
178
  MATCH SCORE: {score:.3f}
179
 
180
  Return a concise explanation in JSON with keys:
181
+ - strengths
182
+ - gaps
183
+ - recommendation
184
+ - summary
185
  """
186
 
187
  response = client.chat_completion(
 
191
  )
192
 
193
  content = response.choices[0].message.content
 
 
194
  start, end = content.find("{"), content.rfind("}") + 1
195
  return json.loads(content[start:end])
196
 
 
202
  "recommendation": "Review manually."
203
  }
204
 
 
205
  # =========================================================
206
  # PAGE CONFIG
207
  # =========================================================
 
212
  )
213
 
214
  # =========================================================
215
+ # PATHS
216
  # =========================================================
217
  BASE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
218
  DATA_PATH = os.path.join(BASE_PATH, "data", "v3", "processed")
 
223
  COMP_META_PATH = os.path.join(DATA_PATH, "companies_metadata.pkl")
224
 
225
  # =========================================================
226
+ # LOAD DATA
227
  # =========================================================
228
  @st.cache_resource
229
  def load_core():
 
269
  with left:
270
  st.subheader("๐Ÿ‘ค Candidate Profile")
271
 
272
+ st.markdown(f"**Category:** {candidate.get('Category','N/A')}")
273
 
274
  with st.expander("๐Ÿง  Skills", expanded=True):
275
+ st.write(candidate.get("skills","N/A"))
276
 
277
  with st.expander("๐ŸŽฏ Career Objective", expanded=True):
278
+ st.write(candidate.get("career_objective","N/A"))
279
 
280
  # =========================================================
281
+ # MATCHING
282
  # =========================================================
283
  cand_vec = candidate_embeddings[candidate_id].reshape(1, -1)
284
  scores = cosine_similarity(cand_vec, company_embeddings)[0]
 
291
  company = companies_meta.iloc[idx]
292
  rows.append({
293
  "Rank": rank,
294
+ "Company": company.get("name","Unknown"),
295
+ "Industry": company.get("industries_list","N/A"),
296
  "Score": score
297
  })
298
 
299
  df = pd.DataFrame(rows)
300
 
301
  # =========================================================
302
+ # MATCH METRICS
303
  # =========================================================
304
  with right:
305
  st.subheader("๐Ÿ“Š Match Overview")
 
312
  st.subheader("๐Ÿข Top Company Matches")
313
 
314
  def style_score(val):
315
+ return "color: green; font-weight: bold;" if val > threshold else ""
 
 
316
 
317
  st.dataframe(
318
  df.style.applymap(style_score, subset=["Score"]),
 
320
  )
321
 
322
  # =========================================================
323
+ # FAIRNESS
324
  # =========================================================
325
  st.markdown("---")
326
  st.subheader("โš–๏ธ Bilateral Fairness (Top-K)")
327
 
328
+ cand_mean, comp_mean, fairness = cached_fairness(
329
+ candidate_embeddings,
330
+ company_embeddings,
331
+ top_k
332
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
333
 
334
  c1, c2, c3 = st.columns(3)
335
  c1.metric("Candidate โ†’ Company", f"{cand_mean:.3f}")
336
  c2.metric("Company โ†’ Candidate", f"{comp_mean:.3f}")
337
  c3.metric("Fairness Ratio", f"{fairness:.3f}")
338
 
 
 
 
 
 
 
 
339
  # =========================================================
340
  # SCORE DISTRIBUTION
341
  # =========================================================
342
  st.markdown("---")
343
  st.subheader("๐Ÿ“ˆ Score Distribution")
344
 
345
+ score_dist = compute_score_distribution(
346
+ candidate_embeddings,
347
+ company_embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  )
349
 
350
+ st.bar_chart(pd.Series(score_dist).value_counts(bins=30).sort_index())
 
 
 
 
351
 
352
  # =========================================================
353
  # NETWORK GRAPH
 
355
  st.markdown("---")
356
  st.subheader("๐ŸŒ Matching Network Graph")
357
 
358
+ net = build_network_graph(
359
+ candidate_embeddings,
360
+ company_embeddings,
361
+ candidates_meta,
362
+ companies_meta
363
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
 
365
+ html_path = os.path.join(BASE_PATH, "data", "v3", "results", "network_candidate.html")
366
+ os.makedirs(os.path.dirname(html_path), exist_ok=True)
367
  net.write_html(html_path)
368
 
369
  import streamlit.components.v1 as components
370
+ components.html(open(html_path).read(), height=620, scrolling=True)
 
 
 
 
371
 
372
  # =========================================================
373
+ # LLM EXPLANATION
374
  # =========================================================
375
  st.markdown("---")
376
  st.subheader("๐Ÿค– Match Explanation (LLM)")
377
 
378
  with st.expander("Why is this company a good match?", expanded=True):
379
+ top_company = companies_meta.iloc[top_idx[0]]
 
380
  top_score = top_scores[0]
381
 
382
  if st.button("Generate AI Explanation"):
383
+ explanation = explain_match_llm(
384
+ candidate,
385
+ top_company,
386
+ top_score
387
+ )
 
388
 
389
  st.markdown(f"**Summary:** {explanation.get('summary','')}")
390
 
391
  c1, c2 = st.columns(2)
 
392
  with c1:
393
  st.markdown("### โœ… Strengths")
394
  for s in explanation.get("strengths", []):
395
  st.write(f"- {s}")
 
396
  with c2:
397
  st.markdown("### โš ๏ธ Gaps")
398
  for g in explanation.get("gaps", []):
pages/5_🏢_Company_View.py CHANGED
@@ -45,7 +45,6 @@ def compute_bilateral_fairness(
45
  comp_mean = float(np.mean(comp_scores))
46
 
47
  fairness = min(cand_mean, comp_mean) / max(cand_mean, comp_mean)
48
-
49
  return cand_mean, comp_mean, fairness
50
 
51
 
@@ -59,12 +58,12 @@ def cached_fairness(candidate_embeddings, company_embeddings, top_k):
59
  )
60
 
61
  # =========================================================
62
- # COMPUTES SCORE DISTRIBUTION
63
  # =========================================================
64
  @st.cache_data(show_spinner=False)
65
  def compute_score_distribution(
66
- company_embeddings,
67
  candidate_embeddings,
 
68
  sample_size=200
69
  ):
70
  n = min(sample_size, len(company_embeddings))
@@ -80,9 +79,9 @@ def compute_score_distribution(
80
  return np.array(scores)
81
 
82
  # =========================================================
83
- # BUILD NETWORK GRAPH
84
  # =========================================================
85
- @st.cache_data(show_spinner=False)
86
  def build_network_graph(
87
  company_embeddings,
88
  candidate_embeddings,
@@ -102,7 +101,7 @@ def build_network_graph(
102
 
103
  n_comp = min(sample_size, len(company_embeddings))
104
 
105
- # Add company nodes
106
  for i in range(n_comp):
107
  label = companies_meta.iloc[i].get("name", f"Company {i}")
108
  net.add_node(
@@ -113,7 +112,7 @@ def build_network_graph(
113
  size=18
114
  )
115
 
116
- # Add candidate nodes + edges
117
  for i in range(n_comp):
118
  sims = cosine_similarity(
119
  company_embeddings[i].reshape(1, -1),
@@ -141,7 +140,7 @@ def build_network_graph(
141
  return net
142
 
143
  # =========================================================
144
- # LLM-BASED MATCH EXPLANATION
145
  # =========================================================
146
  def explain_match_llm(company_row, candidate_row, score):
147
  HF_TOKEN = os.getenv("HF_TOKEN")
@@ -223,7 +222,7 @@ CAND_META_PATH = os.path.join(DATA_PATH, "candidates_metadata.pkl")
223
  COMP_META_PATH = os.path.join(DATA_PATH, "companies_metadata.pkl")
224
 
225
  # =========================================================
226
- # LOAD CORE DATA
227
  # =========================================================
228
  @st.cache_resource
229
  def load_core():
@@ -298,7 +297,7 @@ for rank, (idx, score) in enumerate(zip(top_idx, top_scores), start=1):
298
  df = pd.DataFrame(rows)
299
 
300
  # =========================================================
301
- # MATCH METRICS + TABLE
302
  # =========================================================
303
  with right:
304
  st.subheader("๐Ÿ“Š Match Overview")
@@ -311,9 +310,7 @@ with right:
311
  st.subheader("๐Ÿ‘ค Top Candidate Matches")
312
 
313
  def style_score(val):
314
- if val > threshold:
315
- return "color: green; font-weight: bold;"
316
- return ""
317
 
318
  st.dataframe(
319
  df.style.applymap(style_score, subset=["Score"]),
@@ -321,7 +318,7 @@ with right:
321
  )
322
 
323
  # =========================================================
324
- # FAIRNESS PANEL
325
  # =========================================================
326
  st.markdown("---")
327
  st.subheader("โš–๏ธ Bilateral Fairness (Top-K)")
@@ -344,8 +341,8 @@ st.markdown("---")
344
  st.subheader("๐Ÿ“ˆ Score Distribution")
345
 
346
  score_dist = compute_score_distribution(
347
- company_embeddings,
348
- candidate_embeddings
349
  )
350
 
351
  st.bar_chart(pd.Series(score_dist).value_counts(bins=30).sort_index())
@@ -364,13 +361,14 @@ net = build_network_graph(
364
  )
365
 
366
  html_path = os.path.join(BASE_PATH, "data", "v3", "results", "network_company.html")
 
367
  net.write_html(html_path)
368
 
369
  import streamlit.components.v1 as components
370
  components.html(open(html_path).read(), height=620, scrolling=True)
371
 
372
  # =========================================================
373
- # LLM EXPLAINABILITY
374
  # =========================================================
375
  st.markdown("---")
376
  st.subheader("๐Ÿค– Match Explanation (LLM)")
 
45
  comp_mean = float(np.mean(comp_scores))
46
 
47
  fairness = min(cand_mean, comp_mean) / max(cand_mean, comp_mean)
 
48
  return cand_mean, comp_mean, fairness
49
 
50
 
 
58
  )
59
 
60
  # =========================================================
61
+ # SCORE DISTRIBUTION
62
  # =========================================================
63
  @st.cache_data(show_spinner=False)
64
  def compute_score_distribution(
 
65
  candidate_embeddings,
66
+ company_embeddings,
67
  sample_size=200
68
  ):
69
  n = min(sample_size, len(company_embeddings))
 
79
  return np.array(scores)
80
 
81
  # =========================================================
82
+ # NETWORK GRAPH
83
  # =========================================================
84
+ @st.cache_resource(show_spinner=False)
85
  def build_network_graph(
86
  company_embeddings,
87
  candidate_embeddings,
 
101
 
102
  n_comp = min(sample_size, len(company_embeddings))
103
 
104
+ # Company nodes
105
  for i in range(n_comp):
106
  label = companies_meta.iloc[i].get("name", f"Company {i}")
107
  net.add_node(
 
112
  size=18
113
  )
114
 
115
+ # Candidate nodes + edges
116
  for i in range(n_comp):
117
  sims = cosine_similarity(
118
  company_embeddings[i].reshape(1, -1),
 
140
  return net
141
 
142
  # =========================================================
143
+ # LLM EXPLANATION
144
  # =========================================================
145
  def explain_match_llm(company_row, candidate_row, score):
146
  HF_TOKEN = os.getenv("HF_TOKEN")
 
222
  COMP_META_PATH = os.path.join(DATA_PATH, "companies_metadata.pkl")
223
 
224
  # =========================================================
225
+ # LOAD DATA
226
  # =========================================================
227
  @st.cache_resource
228
  def load_core():
 
297
  df = pd.DataFrame(rows)
298
 
299
  # =========================================================
300
+ # MATCH METRICS
301
  # =========================================================
302
  with right:
303
  st.subheader("๐Ÿ“Š Match Overview")
 
310
  st.subheader("๐Ÿ‘ค Top Candidate Matches")
311
 
312
  def style_score(val):
313
+ return "color: green; font-weight: bold;" if val > threshold else ""
 
 
314
 
315
  st.dataframe(
316
  df.style.applymap(style_score, subset=["Score"]),
 
318
  )
319
 
320
  # =========================================================
321
+ # FAIRNESS
322
  # =========================================================
323
  st.markdown("---")
324
  st.subheader("โš–๏ธ Bilateral Fairness (Top-K)")
 
341
  st.subheader("๐Ÿ“ˆ Score Distribution")
342
 
343
  score_dist = compute_score_distribution(
344
+ candidate_embeddings,
345
+ company_embeddings
346
  )
347
 
348
  st.bar_chart(pd.Series(score_dist).value_counts(bins=30).sort_index())
 
361
  )
362
 
363
  html_path = os.path.join(BASE_PATH, "data", "v3", "results", "network_company.html")
364
+ os.makedirs(os.path.dirname(html_path), exist_ok=True)
365
  net.write_html(html_path)
366
 
367
  import streamlit.components.v1 as components
368
  components.html(open(html_path).read(), height=620, scrolling=True)
369
 
370
  # =========================================================
371
+ # LLM EXPLANATION
372
  # =========================================================
373
  st.markdown("---")
374
  st.subheader("๐Ÿค– Match Explanation (LLM)")
utils/embeddings.py CHANGED
@@ -1,11 +1,49 @@
1
- from sentence_transformers import SentenceTransformer
 
 
2
  import streamlit as st
3
 
4
- @st.cache_resource
5
- def load_model():
6
- return SentenceTransformer("all-MiniLM-L6-v2")
7
 
8
- @st.cache_data
9
- def embed_texts(texts):
10
- model = load_model()
11
- return model.encode(texts, show_progress_bar=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import hf_hub_download
2
+ import numpy as np
3
+ import pickle
4
  import streamlit as st
5
 
 
 
 
6
 
7
+ @st.cache_resource(show_spinner=False)
8
+ def load_production_artifacts():
9
+ base = "processed"
10
+
11
+ cand_emb_path = hf_hub_download(
12
+ repo_id="Rogersurf/hrhub-artifacts",
13
+ filename=f"{base}/candidate_embeddings.npy",
14
+ repo_type="dataset"
15
+ )
16
+
17
+ comp_emb_path = hf_hub_download(
18
+ repo_id="Rogersurf/hrhub-artifacts",
19
+ filename=f"{base}/company_embeddings.npy",
20
+ repo_type="dataset"
21
+ )
22
+
23
+ cand_meta_path = hf_hub_download(
24
+ repo_id="Rogersurf/hrhub-artifacts",
25
+ filename=f"{base}/candidates_metadata.pkl",
26
+ repo_type="dataset"
27
+ )
28
+
29
+ comp_meta_path = hf_hub_download(
30
+ repo_id="Rogersurf/hrhub-artifacts",
31
+ filename=f"{base}/companies_metadata.pkl",
32
+ repo_type="dataset"
33
+ )
34
+
35
+ candidate_embeddings = np.load(cand_emb_path)
36
+ company_embeddings = np.load(comp_emb_path)
37
+
38
+ with open(cand_meta_path, "rb") as f:
39
+ candidates_meta = pickle.load(f)
40
+
41
+ with open(comp_meta_path, "rb") as f:
42
+ companies_meta = pickle.load(f)
43
+
44
+ return (
45
+ candidate_embeddings,
46
+ company_embeddings,
47
+ candidates_meta,
48
+ companies_meta,
49
+ )