Spaces:

Rogersurf
/

hrhub

Sleeping

App Files Files Community

Roger Surf committed on Jan 9

Commit

6e8d673

1 Parent(s): 552e62e

HF: remove evaluation artifacts and ignore permanently

Browse files

Files changed (3) hide show

pages/4_👤_Candidate_View.py +50 -139
pages/5_🏢_Company_View.py +15 -17
utils/embeddings.py +46 -8

pages/4_👤_Candidate_View.py CHANGED Viewed

@@ -45,7 +45,6 @@ def compute_bilateral_fairness(
     comp_mean = float(np.mean(comp_scores))
     fairness = min(cand_mean, comp_mean) / max(cand_mean, comp_mean)
     return cand_mean, comp_mean, fairness
@@ -59,7 +58,7 @@ def cached_fairness(candidate_embeddings, company_embeddings, top_k):
     )
 # =========================================================
-# COMPUTES SCORE DISTRIBUTION
 # =========================================================
 @st.cache_data(show_spinner=False)
 def compute_score_distribution(
@@ -67,12 +66,6 @@ def compute_score_distribution(
     company_embeddings,
     sample_size=200
 ):
-    """
-    Compute a global score distribution using random candidate samples
-    """
-    import numpy as np
-    from sklearn.metrics.pairwise import cosine_similarity
     n = min(sample_size, len(candidate_embeddings))
     scores = []
@@ -86,9 +79,9 @@ def compute_score_distribution(
     return np.array(scores)
 # =========================================================
-# BUILD NETWORK GRAPH
 # =========================================================
-@st.cache_data(show_spinner=False)
 def build_network_graph(
     candidate_embeddings,
     company_embeddings,
@@ -98,8 +91,6 @@ def build_network_graph(
     sample_size=15
 ):
     from pyvis.network import Network
-    import numpy as np
-    from sklearn.metrics.pairwise import cosine_similarity
     net = Network(
         height="600px",
@@ -110,18 +101,17 @@ def build_network_graph(
     n_cand = min(sample_size, len(candidate_embeddings))
-    # Add candidate nodes
     for i in range(n_cand):
-        label = f"Candidate {i}"
         net.add_node(
             f"cand_{i}",
-            label=label,
             color="#667eea",
             shape="dot",
             size=18
         )
-    # Add company nodes + edges
     for i in range(n_cand):
         sims = cosine_similarity(
             candidate_embeddings[i].reshape(1, -1),
@@ -131,11 +121,11 @@ def build_network_graph(
         top_idx = np.argsort(sims)[-top_k:][::-1]
         for j in top_idx:
-            company_name = companies_meta.iloc[j].get("name", f"Company {j}")
             net.add_node(
                 f"comp_{j}",
-                label=company_name,
                 color="#2ecc71",
                 shape="box",
                 size=14
@@ -151,15 +141,9 @@ def build_network_graph(
     return net
 # =========================================================
-# LLM-BASED MATCH EXPLANATION
 # =========================================================
 def explain_match_llm(candidate_row, company_row, score):
-    """
-    Post-hoc LLM explanation for a single match.
-    Safe: does NOT affect ranking.
-    """
-    import os
     HF_TOKEN = os.getenv("HF_TOKEN")
     if not HF_TOKEN:
@@ -172,6 +156,7 @@ def explain_match_llm(candidate_row, company_row, score):
     try:
         from huggingface_hub import InferenceClient
         client = InferenceClient(token=HF_TOKEN)
@@ -193,10 +178,10 @@ Required Skills: {company_row.get('required_skills','')}
 MATCH SCORE: {score:.3f}
 Return a concise explanation in JSON with keys:
-- strengths (list)
-- gaps (list)
-- recommendation (string)
-- summary (string)
 """
         response = client.chat_completion(
@@ -206,8 +191,6 @@ Return a concise explanation in JSON with keys:
         )
         content = response.choices[0].message.content
-        import json
         start, end = content.find("{"), content.rfind("}") + 1
         return json.loads(content[start:end])
@@ -219,7 +202,6 @@ Return a concise explanation in JSON with keys:
             "recommendation": "Review manually."
         }
 # =========================================================
 # PAGE CONFIG
 # =========================================================
@@ -230,7 +212,7 @@ st.set_page_config(
 )
 # =========================================================
-# PATHS (V3 = REPORT CONSISTENT)
 # =========================================================
 BASE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 DATA_PATH = os.path.join(BASE_PATH, "data", "v3", "processed")
@@ -241,7 +223,7 @@ CAND_META_PATH = os.path.join(DATA_PATH, "candidates_metadata.pkl")
 COMP_META_PATH = os.path.join(DATA_PATH, "companies_metadata.pkl")
 # =========================================================
-# LOAD CORE DATA
 # =========================================================
 @st.cache_resource
 def load_core():
@@ -287,16 +269,16 @@ left, right = st.columns([1, 2])
 with left:
     st.subheader("👤 Candidate Profile")
-    st.markdown(f"**Category:** {candidate.get('Category', 'N/A')}")
     with st.expander("🧠 Skills", expanded=True):
-        st.write(candidate.get("skills", "N/A"))
     with st.expander("🎯 Career Objective", expanded=True):
-        st.write(candidate.get("career_objective", "N/A"))
 # =========================================================
-# MATCHING (REAL)
 # =========================================================
 cand_vec = candidate_embeddings[candidate_id].reshape(1, -1)
 scores = cosine_similarity(cand_vec, company_embeddings)[0]
@@ -309,15 +291,15 @@ for rank, (idx, score) in enumerate(zip(top_idx, top_scores), start=1):
     company = companies_meta.iloc[idx]
     rows.append({
         "Rank": rank,
-        "Company": company.get("name", "Unknown"),
-        "Industry": company.get("industries_list", "N/A"),
         "Score": score
     })
 df = pd.DataFrame(rows)
 # =========================================================
-# MATCH METRICS + TABLE
 # =========================================================
 with right:
     st.subheader("📊 Match Overview")
@@ -330,9 +312,7 @@ with right:
     st.subheader("🏢 Top Company Matches")
     def style_score(val):
-        if val > threshold:
-            return "color: green; font-weight: bold;"
-        return ""
     st.dataframe(
         df.style.applymap(style_score, subset=["Score"]),
@@ -340,78 +320,34 @@ with right:
     )
 # =========================================================
-# FAIRNESS PANEL
 # =========================================================
 st.markdown("---")
 st.subheader("⚖️ Bilateral Fairness (Top-K)")
-with st.expander("What does this mean?"):
-    st.markdown("""
-    **Bilateral Fairness** evaluates whether the system treats
-    candidates and companies symmetrically.
-    - Candidate → Company: mean Top-K similarity
-    - Company → Candidate: mean Top-K similarity
-    Values near **1.0** indicate a balanced system.
-    Lower values are expected in retrieval-based systems.
-    """)
-with st.spinner("Computing fairness metrics..."):
-    cand_mean, comp_mean, fairness = cached_fairness(
-        candidate_embeddings,
-        company_embeddings,
-        top_k
-    )
 c1, c2, c3 = st.columns(3)
 c1.metric("Candidate → Company", f"{cand_mean:.3f}")
 c2.metric("Company → Candidate", f"{comp_mean:.3f}")
 c3.metric("Fairness Ratio", f"{fairness:.3f}")
-if fairness >= 0.9:
-    st.success("✅ System is highly balanced")
-elif fairness >= 0.6:
-    st.info("ℹ️ System is reasonably balanced (expected for Top-K)")
-else:
-    st.warning("⚠️ Potential asymmetry detected")
 # =========================================================
 # SCORE DISTRIBUTION
 # =========================================================
 st.markdown("---")
 st.subheader("📈 Score Distribution")
-with st.expander("How to interpret this?", expanded=False):
-    st.markdown("""
-    This histogram shows the **distribution of cosine similarity scores**
-    between candidates and companies.
-    **Important interpretation:**
-    - Scores above **0.6** are already considered **strong semantic matches**
-    - Scores above **0.7** are **rare and exceptional**
-    - The system is evaluated by **ranking**, not absolute thresholds
-    """)
-with st.spinner("Computing score distribution..."):
-    score_dist = compute_score_distribution(
-        candidate_embeddings,
-        company_embeddings,
-        sample_size=200
-    )
-# Histogram
-hist_df = pd.DataFrame({"Similarity Score": score_dist})
-st.bar_chart(
-    hist_df["Similarity Score"].value_counts(bins=30).sort_index()
 )
-# Reference lines (textual)
-c1, c2, c3 = st.columns(3)
-c1.metric("Mean Score", f"{score_dist.mean():.3f}")
-c2.metric("95th Percentile", f"{np.percentile(score_dist, 95):.3f}")
-c3.metric("Max Observed", f"{score_dist.max():.3f}")
 # =========================================================
 # NETWORK GRAPH
@@ -419,69 +355,44 @@ c3.metric("Max Observed", f"{score_dist.max():.3f}")
 st.markdown("---")
 st.subheader("🌐 Matching Network Graph")
-with st.expander("What does this show?", expanded=False):
-    st.markdown("""
-    This network visualizes the **Top-K semantic relationships**
-    between candidates and companies.
-    - 🔵 Blue nodes: Candidates
-    - 🟢 Green nodes: Companies
-    - Edges represent strong semantic matches
-    The graph helps detect:
-    - Structural bias
-    - Over-dominant companies
-    - Diversity of matches
-    """)
-with st.spinner("Building network graph..."):
-    net = build_network_graph(
-        candidate_embeddings,
-        company_embeddings,
-        candidates_meta,
-        companies_meta,
-        top_k=3,
-        sample_size=12
-    )
-html_path = os.path.join(BASE_PATH, "data", "v3", "results", "network_temp.html")
 net.write_html(html_path)
 import streamlit.components.v1 as components
-components.html(
-    open(html_path, "r").read(),
-    height=620,
-    scrolling=True
-)
 # =========================================================
-# LLM EXPLAINABILITY (TOP-1)
 # =========================================================
 st.markdown("---")
 st.subheader("🤖 Match Explanation (LLM)")
 with st.expander("Why is this company a good match?", expanded=True):
-    top_company_idx = top_idx[0]
-    top_company = companies_meta.iloc[top_company_idx]
     top_score = top_scores[0]
     if st.button("Generate AI Explanation"):
-        with st.spinner("LLM analyzing match..."):
-            explanation = explain_match_llm(
-                candidate,
-                top_company,
-                top_score
-            )
         st.markdown(f"**Summary:** {explanation.get('summary','')}")
         c1, c2 = st.columns(2)
         with c1:
             st.markdown("### ✅ Strengths")
             for s in explanation.get("strengths", []):
                 st.write(f"- {s}")
         with c2:
             st.markdown("### ⚠️ Gaps")
             for g in explanation.get("gaps", []):

     comp_mean = float(np.mean(comp_scores))
     fairness = min(cand_mean, comp_mean) / max(cand_mean, comp_mean)
     return cand_mean, comp_mean, fairness
     )
 # =========================================================
+# SCORE DISTRIBUTION
 # =========================================================
 @st.cache_data(show_spinner=False)
 def compute_score_distribution(
     company_embeddings,
     sample_size=200
 ):
     n = min(sample_size, len(candidate_embeddings))
     scores = []
     return np.array(scores)
 # =========================================================
+# NETWORK GRAPH
 # =========================================================
+@st.cache_resource(show_spinner=False)
 def build_network_graph(
     candidate_embeddings,
     company_embeddings,
     sample_size=15
 ):
     from pyvis.network import Network
     net = Network(
         height="600px",
     n_cand = min(sample_size, len(candidate_embeddings))
+    # Candidate nodes
     for i in range(n_cand):
         net.add_node(
             f"cand_{i}",
+            label=f"Candidate {i}",
             color="#667eea",
             shape="dot",
             size=18
         )
+    # Company nodes + edges
     for i in range(n_cand):
         sims = cosine_similarity(
             candidate_embeddings[i].reshape(1, -1),
         top_idx = np.argsort(sims)[-top_k:][::-1]
         for j in top_idx:
+            label = companies_meta.iloc[j].get("name", f"Company {j}")
             net.add_node(
                 f"comp_{j}",
+                label=label,
                 color="#2ecc71",
                 shape="box",
                 size=14
     return net
 # =========================================================
+# LLM EXPLANATION
 # =========================================================
 def explain_match_llm(candidate_row, company_row, score):
     HF_TOKEN = os.getenv("HF_TOKEN")
     if not HF_TOKEN:
     try:
         from huggingface_hub import InferenceClient
+        import json
         client = InferenceClient(token=HF_TOKEN)
 MATCH SCORE: {score:.3f}
 Return a concise explanation in JSON with keys:
+- strengths
+- gaps
+- recommendation
+- summary
 """
         response = client.chat_completion(
         )
         content = response.choices[0].message.content
         start, end = content.find("{"), content.rfind("}") + 1
         return json.loads(content[start:end])
             "recommendation": "Review manually."
         }
 # =========================================================
 # PAGE CONFIG
 # =========================================================
 )
 # =========================================================
+# PATHS
 # =========================================================
 BASE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 DATA_PATH = os.path.join(BASE_PATH, "data", "v3", "processed")
 COMP_META_PATH = os.path.join(DATA_PATH, "companies_metadata.pkl")
 # =========================================================
+# LOAD DATA
 # =========================================================
 @st.cache_resource
 def load_core():
 with left:
     st.subheader("👤 Candidate Profile")
+    st.markdown(f"**Category:** {candidate.get('Category','N/A')}")
     with st.expander("🧠 Skills", expanded=True):
+        st.write(candidate.get("skills","N/A"))
     with st.expander("🎯 Career Objective", expanded=True):
+        st.write(candidate.get("career_objective","N/A"))
 # =========================================================
+# MATCHING
 # =========================================================
 cand_vec = candidate_embeddings[candidate_id].reshape(1, -1)
 scores = cosine_similarity(cand_vec, company_embeddings)[0]
     company = companies_meta.iloc[idx]
     rows.append({
         "Rank": rank,
+        "Company": company.get("name","Unknown"),
+        "Industry": company.get("industries_list","N/A"),
         "Score": score
     })
 df = pd.DataFrame(rows)
 # =========================================================
+# MATCH METRICS
 # =========================================================
 with right:
     st.subheader("📊 Match Overview")
     st.subheader("🏢 Top Company Matches")
     def style_score(val):
+        return "color: green; font-weight: bold;" if val > threshold else ""
     st.dataframe(
         df.style.applymap(style_score, subset=["Score"]),
     )
 # =========================================================
+# FAIRNESS
 # =========================================================
 st.markdown("---")
 st.subheader("⚖️ Bilateral Fairness (Top-K)")
+cand_mean, comp_mean, fairness = cached_fairness(
+    candidate_embeddings,
+    company_embeddings,
+    top_k
+)
 c1, c2, c3 = st.columns(3)
 c1.metric("Candidate → Company", f"{cand_mean:.3f}")
 c2.metric("Company → Candidate", f"{comp_mean:.3f}")
 c3.metric("Fairness Ratio", f"{fairness:.3f}")
 # =========================================================
 # SCORE DISTRIBUTION
 # =========================================================
 st.markdown("---")
 st.subheader("📈 Score Distribution")
+score_dist = compute_score_distribution(
+    candidate_embeddings,
+    company_embeddings
 )
+st.bar_chart(pd.Series(score_dist).value_counts(bins=30).sort_index())
 # =========================================================
 # NETWORK GRAPH
 st.markdown("---")
 st.subheader("🌐 Matching Network Graph")
+net = build_network_graph(
+    candidate_embeddings,
+    company_embeddings,
+    candidates_meta,
+    companies_meta
+)
+html_path = os.path.join(BASE_PATH, "data", "v3", "results", "network_candidate.html")
+os.makedirs(os.path.dirname(html_path), exist_ok=True)
 net.write_html(html_path)
 import streamlit.components.v1 as components
+components.html(open(html_path).read(), height=620, scrolling=True)
 # =========================================================
+# LLM EXPLANATION
 # =========================================================
 st.markdown("---")
 st.subheader("🤖 Match Explanation (LLM)")
 with st.expander("Why is this company a good match?", expanded=True):
+    top_company = companies_meta.iloc[top_idx[0]]
     top_score = top_scores[0]
     if st.button("Generate AI Explanation"):
+        explanation = explain_match_llm(
+            candidate,
+            top_company,
+            top_score
+        )
         st.markdown(f"**Summary:** {explanation.get('summary','')}")
         c1, c2 = st.columns(2)
         with c1:
             st.markdown("### ✅ Strengths")
             for s in explanation.get("strengths", []):
                 st.write(f"- {s}")
         with c2:
             st.markdown("### ⚠️ Gaps")
             for g in explanation.get("gaps", []):

pages/5_🏢_Company_View.py CHANGED Viewed

@@ -45,7 +45,6 @@ def compute_bilateral_fairness(
     comp_mean = float(np.mean(comp_scores))
     fairness = min(cand_mean, comp_mean) / max(cand_mean, comp_mean)
     return cand_mean, comp_mean, fairness
@@ -59,12 +58,12 @@ def cached_fairness(candidate_embeddings, company_embeddings, top_k):
     )
 # =========================================================
-# COMPUTES SCORE DISTRIBUTION
 # =========================================================
 @st.cache_data(show_spinner=False)
 def compute_score_distribution(
-    company_embeddings,
     candidate_embeddings,
     sample_size=200
 ):
     n = min(sample_size, len(company_embeddings))
@@ -80,9 +79,9 @@ def compute_score_distribution(
     return np.array(scores)
 # =========================================================
-# BUILD NETWORK GRAPH
 # =========================================================
-@st.cache_data(show_spinner=False)
 def build_network_graph(
     company_embeddings,
     candidate_embeddings,
@@ -102,7 +101,7 @@ def build_network_graph(
     n_comp = min(sample_size, len(company_embeddings))
-    # Add company nodes
     for i in range(n_comp):
         label = companies_meta.iloc[i].get("name", f"Company {i}")
         net.add_node(
@@ -113,7 +112,7 @@ def build_network_graph(
             size=18
         )
-    # Add candidate nodes + edges
     for i in range(n_comp):
         sims = cosine_similarity(
             company_embeddings[i].reshape(1, -1),
@@ -141,7 +140,7 @@ def build_network_graph(
     return net
 # =========================================================
-# LLM-BASED MATCH EXPLANATION
 # =========================================================
 def explain_match_llm(company_row, candidate_row, score):
     HF_TOKEN = os.getenv("HF_TOKEN")
@@ -223,7 +222,7 @@ CAND_META_PATH = os.path.join(DATA_PATH, "candidates_metadata.pkl")
 COMP_META_PATH = os.path.join(DATA_PATH, "companies_metadata.pkl")
 # =========================================================
-# LOAD CORE DATA
 # =========================================================
 @st.cache_resource
 def load_core():
@@ -298,7 +297,7 @@ for rank, (idx, score) in enumerate(zip(top_idx, top_scores), start=1):
 df = pd.DataFrame(rows)
 # =========================================================
-# MATCH METRICS + TABLE
 # =========================================================
 with right:
     st.subheader("📊 Match Overview")
@@ -311,9 +310,7 @@ with right:
     st.subheader("👤 Top Candidate Matches")
     def style_score(val):
-        if val > threshold:
-            return "color: green; font-weight: bold;"
-        return ""
     st.dataframe(
         df.style.applymap(style_score, subset=["Score"]),
@@ -321,7 +318,7 @@ with right:
     )
 # =========================================================
-# FAIRNESS PANEL
 # =========================================================
 st.markdown("---")
 st.subheader("⚖️ Bilateral Fairness (Top-K)")
@@ -344,8 +341,8 @@ st.markdown("---")
 st.subheader("📈 Score Distribution")
 score_dist = compute_score_distribution(
-    company_embeddings,
-    candidate_embeddings
 )
 st.bar_chart(pd.Series(score_dist).value_counts(bins=30).sort_index())
@@ -364,13 +361,14 @@ net = build_network_graph(
 )
 html_path = os.path.join(BASE_PATH, "data", "v3", "results", "network_company.html")
 net.write_html(html_path)
 import streamlit.components.v1 as components
 components.html(open(html_path).read(), height=620, scrolling=True)
 # =========================================================
-# LLM EXPLAINABILITY
 # =========================================================
 st.markdown("---")
 st.subheader("🤖 Match Explanation (LLM)")

     comp_mean = float(np.mean(comp_scores))
     fairness = min(cand_mean, comp_mean) / max(cand_mean, comp_mean)
     return cand_mean, comp_mean, fairness
     )
 # =========================================================
+# SCORE DISTRIBUTION
 # =========================================================
 @st.cache_data(show_spinner=False)
 def compute_score_distribution(
     candidate_embeddings,
+    company_embeddings,
     sample_size=200
 ):
     n = min(sample_size, len(company_embeddings))
     return np.array(scores)
 # =========================================================
+# NETWORK GRAPH
 # =========================================================
+@st.cache_resource(show_spinner=False)
 def build_network_graph(
     company_embeddings,
     candidate_embeddings,
     n_comp = min(sample_size, len(company_embeddings))
+    # Company nodes
     for i in range(n_comp):
         label = companies_meta.iloc[i].get("name", f"Company {i}")
         net.add_node(
             size=18
         )
+    # Candidate nodes + edges
     for i in range(n_comp):
         sims = cosine_similarity(
             company_embeddings[i].reshape(1, -1),
     return net
 # =========================================================
+# LLM EXPLANATION
 # =========================================================
 def explain_match_llm(company_row, candidate_row, score):
     HF_TOKEN = os.getenv("HF_TOKEN")
 COMP_META_PATH = os.path.join(DATA_PATH, "companies_metadata.pkl")
 # =========================================================
+# LOAD DATA
 # =========================================================
 @st.cache_resource
 def load_core():
 df = pd.DataFrame(rows)
 # =========================================================
+# MATCH METRICS
 # =========================================================
 with right:
     st.subheader("📊 Match Overview")
     st.subheader("👤 Top Candidate Matches")
     def style_score(val):
+        return "color: green; font-weight: bold;" if val > threshold else ""
     st.dataframe(
         df.style.applymap(style_score, subset=["Score"]),
     )
 # =========================================================
+# FAIRNESS
 # =========================================================
 st.markdown("---")
 st.subheader("⚖️ Bilateral Fairness (Top-K)")
 st.subheader("📈 Score Distribution")
 score_dist = compute_score_distribution(
+    candidate_embeddings,
+    company_embeddings
 )
 st.bar_chart(pd.Series(score_dist).value_counts(bins=30).sort_index())
 )
 html_path = os.path.join(BASE_PATH, "data", "v3", "results", "network_company.html")
+os.makedirs(os.path.dirname(html_path), exist_ok=True)
 net.write_html(html_path)
 import streamlit.components.v1 as components
 components.html(open(html_path).read(), height=620, scrolling=True)
 # =========================================================
+# LLM EXPLANATION
 # =========================================================
 st.markdown("---")
 st.subheader("🤖 Match Explanation (LLM)")

utils/embeddings.py CHANGED Viewed

@@ -1,11 +1,49 @@
-from sentence_transformers import SentenceTransformer
 import streamlit as st
-@st.cache_resource
-def load_model():
-    return SentenceTransformer("all-MiniLM-L6-v2")
-@st.cache_data
-def embed_texts(texts):
-    model = load_model()
-    return model.encode(texts, show_progress_bar=False)

+from huggingface_hub import hf_hub_download
+import numpy as np
+import pickle
 import streamlit as st
+@st.cache_resource(show_spinner=False)  # Streamlit resource cache, default spinner disabled
+def load_production_artifacts():  # Download the four artifact files from the Hub and return them loaded
+    base = "processed"  # folder prefix shared by all four filenames in the dataset repo
+    cand_emb_path = hf_hub_download(  # path to the downloaded candidate embeddings file
+        repo_id="Rogersurf/hrhub-artifacts",
+        filename=f"{base}/candidate_embeddings.npy",
+        repo_type="dataset"
+    )
+    comp_emb_path = hf_hub_download(  # path to the downloaded company embeddings file
+        repo_id="Rogersurf/hrhub-artifacts",
+        filename=f"{base}/company_embeddings.npy",
+        repo_type="dataset"
+    )
+    cand_meta_path = hf_hub_download(  # path to the downloaded candidate metadata pickle
+        repo_id="Rogersurf/hrhub-artifacts",
+        filename=f"{base}/candidates_metadata.pkl",
+        repo_type="dataset"
+    )
+    comp_meta_path = hf_hub_download(  # path to the downloaded company metadata pickle
+        repo_id="Rogersurf/hrhub-artifacts",
+        filename=f"{base}/companies_metadata.pkl",
+        repo_type="dataset"
+    )
+    candidate_embeddings = np.load(cand_emb_path)  # presumably a 2-D (n_candidates, dim) array; confirm against the artifact
+    company_embeddings = np.load(comp_emb_path)  # presumably a 2-D (n_companies, dim) array; confirm against the artifact
+    with open(cand_meta_path, "rb") as f:
+        candidates_meta = pickle.load(f)  # NOTE(review): unpickling can run arbitrary code; safe only while the repo is trusted
+    with open(comp_meta_path, "rb") as f:
+        companies_meta = pickle.load(f)  # NOTE(review): same pickle trust caveat as the candidate metadata
+    return (  # 4-tuple: candidate embeddings, company embeddings, candidate metadata, company metadata
+        candidate_embeddings,
+        company_embeddings,
+        candidates_meta,
+        companies_meta,
+    )