Spaces:

curiouscurrent
/

appliedai

Sleeping

App Files Files Community

curiouscurrent committed on Sep 26, 2025

Commit

482309a

verified ·

1 Parent(s): c814146

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -102

app.py CHANGED Viewed

@@ -2,25 +2,18 @@ import gradio as gr
 import pandas as pd
 import json
 import os
-import requests
 import re
 from functools import lru_cache
 # ----------------------------
 # CONFIG
 # ----------------------------
 JSON_FILE = "form-submissions-1.json"
-# 🚩 CHANGE: Switched to a more capable, instruction-tuned model for semantic matching
-MODEL_ID = "google/flan-t5-large"
-# NOTE: HF_API_TOKEN MUST be set in your environment variables/Space secrets.
-HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
 FILTERED_CSV = "/tmp/filtered_candidates.csv"
-OUTPUT_FILE = "/tmp/outputs.csv"
-BATCH_SIZE = 50
-if not HF_API_TOKEN:
-    # Allow launch for demonstration, but function will warn if token is missing
-    pass
 CATEGORIES = {
     "AI": [
@@ -37,60 +30,39 @@ CATEGORIES = {
 }
 # ----------------------------
-# LLM Call for Semantic Role Scoring
 # ----------------------------
-@lru_cache(maxsize=512)
-def score_candidate(candidate_str, category_name, job_titles_tuple):
-    if not HF_API_TOKEN:
-         print("API Token is missing. Returning score 0.")
-         return 0
-    # 🚩 PROMPT CHANGE: Focus on 'semantic relevance' and 'conceptual fit'
-    prompt = f"""
-You are an HR expert performing semantic matching. Your task is to rate a candidate's conceptual fit based ONLY on their previous job roles and the target roles.
-Rate the semantic relevance of the candidate's 'Roles' to the 'Target Roles' on a scale of 1 (Lowest Match) to 10 (Highest Semantic Match).
-The score must reflect the conceptual alignment and industry similarity, not just keyword presence.
-The target roles for the '{category_name}' category are: {list(job_titles_tuple)}
-Candidate JSON: {candidate_str}
-**Task**: Respond ONLY with the rating number (an integer from 1 to 10).
-"""
-    headers = {"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"}
-    payload = {
-        "inputs": prompt,
-        "parameters": {
-            "max_new_tokens": 5,
-            "return_full_text": False,
-            "temperature": 0.1
-        }
-    }
-    try:
-        # Note: Flan-T5-Large is slower than small, but more powerful for this task
-        response = requests.post(
-            f"https://api-inference.huggingface.co/models/{MODEL_ID}",
-            headers=headers,
-            data=json.dumps(payload),
-            timeout=120 # Increased timeout for the larger model
-        )
-        response.raise_for_status()
-        result = response.json()
-        generated_text = result[0].get("generated_text", "0").strip()
-        match = re.search(r'\d+', generated_text)
-        if match:
-            score = int(match.group(0))
-            return max(1, min(10, score))
-        return 0
-    except Exception as e:
-        print(f"LLM scoring call failed for candidate (API/Network Error): {e}")
-        return 0
 # ----------------------------
 # Step 1: Filter by roles (Unchanged)
@@ -109,6 +81,7 @@ def filter_by_roles(category_name):
         work_exps = person.get("work_experiences", [])
         if not work_exps:
             continue
         non_fullstack_roles = [
             exp.get("roleName") for exp in work_exps
             if exp.get("roleName") and "full stack developer" not in exp.get("roleName").lower()
@@ -116,6 +89,7 @@ def filter_by_roles(category_name):
         if not non_fullstack_roles:
             continue
         if any(role in job_titles for role in non_fullstack_roles):
             filtered.append({
                 "Name": person.get("name"),
@@ -133,15 +107,13 @@ def filter_by_roles(category_name):
     df = pd.DataFrame(filtered)
     df.to_csv(FILTERED_CSV, index=False)
-    return df, f"{len(df)} candidates filtered by role for category '{category_name}'. Ready for LLM Semantic Scoring."
 # ----------------------------
-# Step 2: LLM recommendations (Semantic Scoring, Sorting, and Output)
 # ----------------------------
-def llm_recommendations(category_name):
-    job_titles = CATEGORIES[category_name]
     if not os.path.exists(FILTERED_CSV):
         df_filtered, msg = filter_by_roles(category_name)
         if df_filtered.empty:
@@ -153,45 +125,28 @@ def llm_recommendations(category_name):
     if df_filtered.empty:
         return f"No filtered candidates found for category '{category_name}'. Run Step 1 first."
-    df_filtered_clean = df_filtered.fillna('N/A')
-    filtered_candidates = df_filtered_clean.to_dict(orient="records")
-    scores = []
-    for person in filtered_candidates:
-        candidate_info = {
-            "Name": person.get("Name"),
-            "Roles": person.get("Roles"),
-            "Skills": person.get("Skills")
-        }
-        candidate_str = json.dumps(candidate_info)
-        score = score_candidate(candidate_str, category_name, tuple(job_titles))
-        scores.append(score)
-    df_filtered["LLM_Score"] = scores
-    # Only filter out scores of 0 if the token is present (0 means total irrelevance if token works)
-    if HF_API_TOKEN:
-        df_recommended = df_filtered[df_filtered["LLM_Score"] > 0].copy()
-    else:
-        df_recommended = df_filtered.copy() # Can't filter if all are 0 due to no token
     if df_recommended.empty:
-        if not HF_API_TOKEN:
-            return "❌ LLM failed: The HF_API_TOKEN is not set or is invalid. Set the token and try again."
-        return f"LLM scored all candidates 0. This indicates zero semantic relevance between the candidates' roles and the target roles for '{category_name}'."
     def parse_salary(s):
         try:
             return float(str(s).replace("$","").replace(",","").replace("N/A", str(float('inf'))))
         except:
             return float('inf')
     df_recommended["Salary_sort"] = df_recommended["Salary"].apply(parse_salary)
     df_top5 = df_recommended.sort_values(
-        by=['LLM_Score', 'Salary_sort'],
         ascending=[False, True]
     ).head(5)
@@ -200,10 +155,12 @@ def llm_recommendations(category_name):
     output_text = f"Top {len(final_names)} Recommended Candidates for the '{category_name}' Category:\n\n"
     for i, name in enumerate(final_names):
-        score = df_top5.iloc[i]['LLM_Score']
-        output_text += f"{i+1}. {name} (Semantic Role Match Score: {score}/10)\n"
-    output_text += "\nThese candidates were ranked by the LLM based on the **conceptual fit (semantic similarity)** of their previous job roles to the target roles, using expected salary as a tie-breaker."
     return output_text
@@ -221,11 +178,11 @@ def show_first_candidates():
         return pd.DataFrame({"Error": [f"Failed to load JSON: {e}"]})
 # ----------------------------
-# Gradio interface (Updated Heading and Launch)
 # ----------------------------
 with gr.Blocks() as app:
-    gr.Markdown("# 🏆 Candidate Selection (Semantic Role Matching)")
-    gr.Markdown("### **Uses a large instruction model to score conceptual fit and similarity between roles.**")
     gr.Markdown("#### 🔍 Raw JSON Preview: First 5 Candidates")
     gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")
@@ -241,10 +198,10 @@ with gr.Blocks() as app:
     gr.Markdown("---")
-    # Step 2: LLM Recommendations
-    recommend_button = gr.Button("3. Rank Candidates by Semantic Role Match")
-    recommend_output_text = gr.Textbox(label="Top Candidate Recommendations Summary", lines=10, placeholder="Click 'Rank Candidates by Semantic Role Match' after Step 2 completes.")
-    recommend_button.click(llm_recommendations, inputs=[category_dropdown], outputs=[recommend_output_text])
 if __name__ == "__main__":
     app.launch(share=True)

 import pandas as pd
 import json
 import os
 import re
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
 from functools import lru_cache
 # ----------------------------
 # CONFIG
 # ----------------------------
 JSON_FILE = "form-submissions-1.json"
 FILTERED_CSV = "/tmp/filtered_candidates.csv"
+# The HF_API_TOKEN and LLM-related variables are now completely removed.
 CATEGORIES = {
     "AI": [
 }
 # ----------------------------
+# Similarity Matching Function (Reliable Objective Scoring)
 # ----------------------------
+def calculate_similarity_scores(df_candidates, category_name):
+    """
+    Score each candidate's roles against a category's target job titles.
+
+    The target titles in CATEGORIES[category_name] are joined into a single
+    reference document, every value of the 'Roles' column becomes one
+    document, and all of them are vectorized together with TF-IDF
+    (unigrams and bigrams, English stop words removed). A candidate's score
+    is the cosine similarity between its document and the reference one.
+
+    This function is intentionally not wrapped in functools.lru_cache: the
+    cache hashes its arguments and a pandas DataFrame is unhashable, so a
+    cached version raises TypeError on every call.
+
+    Args:
+        df_candidates: DataFrame with a 'Roles' column.
+        category_name: Key into CATEGORIES.
+
+    Returns:
+        A float64 pandas Series of similarity scores that shares the index
+        of df_candidates (an empty Series when df_candidates is empty).
+
+    Raises:
+        KeyError: If category_name is not in CATEGORIES or the 'Roles'
+            column is missing.
+    """
+    if df_candidates.empty:
+        return pd.Series([], dtype='float64')
+    # 1. Combine all target roles into one reference document
+    target_roles = " ".join(CATEGORIES[category_name])
+    # Missing roles (e.g. NaN after a CSV round trip) become empty strings,
+    # because TfidfVectorizer rejects documents that are not strings.
+    candidate_roles = df_candidates['Roles'].fillna("").astype(str).tolist()
+    # 2. The reference document goes first so it lands in row 0 of the matrix
+    corpus = [target_roles] + candidate_roles
+    # 3. Vectorize using TF-IDF (converts text to numerical features)
+    # Bigrams let multi-word titles such as 'Data Scientist' match as a phrase
+    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
+    tfidf_matrix = vectorizer.fit_transform(corpus)
+    # 4. Split the matrix into the target row and the candidate rows
+    target_vector = tfidf_matrix[0]
+    candidate_vectors = tfidf_matrix[1:]
+    # 5. Cosine similarity yields a (1, n_candidates) array; flatten it to 1-D
+    similarity_scores = cosine_similarity(target_vector, candidate_vectors).flatten()
+    # Keep the caller's index so the scores can be assigned back as a column
+    return pd.Series(similarity_scores, index=df_candidates.index)
 # ----------------------------
 # Step 1: Filter by roles (Unchanged)
         work_exps = person.get("work_experiences", [])
         if not work_exps:
             continue
+        # Filter to get relevant job titles from the work experience
         non_fullstack_roles = [
             exp.get("roleName") for exp in work_exps
             if exp.get("roleName") and "full stack developer" not in exp.get("roleName").lower()
         if not non_fullstack_roles:
             continue
+        # Initial check: filter only candidates who have *at least one* target role
         if any(role in job_titles for role in non_fullstack_roles):
             filtered.append({
                 "Name": person.get("name"),
     df = pd.DataFrame(filtered)
     df.to_csv(FILTERED_CSV, index=False)
+    return df, f"{len(df)} candidates filtered by role for category '{category_name}'. Ready for Similarity Ranking."
 # ----------------------------
+# Step 2: Recommendations (Using Similarity Matching)
 # ----------------------------
+def similarity_recommendations(category_name):
     if not os.path.exists(FILTERED_CSV):
         df_filtered, msg = filter_by_roles(category_name)
         if df_filtered.empty:
     if df_filtered.empty:
         return f"No filtered candidates found for category '{category_name}'. Run Step 1 first."
+    # --- CORE SCORING ---
+    df_filtered["Similarity_Score"] = calculate_similarity_scores(df_filtered, category_name)
+    # Filter out candidates with near-zero relevance (score <= 0.01)
+    df_recommended = df_filtered[df_filtered["Similarity_Score"] > 0.01].copy()
     if df_recommended.empty:
+        return f"All candidates had insufficient text similarity (less than 1%) to the target roles for '{category_name}'. The roles do not match the target category keywords."
+    # Define salary parsing for tie-breaker
     def parse_salary(s):
         try:
+            # Replaces '$', ',', and sets 'N/A' to infinity for sorting purposes
             return float(str(s).replace("$","").replace(",","").replace("N/A", str(float('inf'))))
         except:
             return float('inf')
     df_recommended["Salary_sort"] = df_recommended["Salary"].apply(parse_salary)
+    # Sort: 1. Highest Similarity Score (descending), 2. Lowest Salary (ascending)
     df_top5 = df_recommended.sort_values(
+        by=['Similarity_Score', 'Salary_sort'],
         ascending=[False, True]
     ).head(5)
     output_text = f"Top {len(final_names)} Recommended Candidates for the '{category_name}' Category:\n\n"
     for i, name in enumerate(final_names):
+        score = df_top5.iloc[i]['Similarity_Score']
+        # Display the score as a percentage for readability
+        score_percent = f"{score * 100:.2f}%"
+        output_text += f"{i+1}. {name} (Role Match: {score_percent})\n"
+    output_text += "\nThese candidates were ranked objectively based on the **keyword similarity (TF-IDF)** of their previous job roles to the target roles, using expected salary as a tie-breaker."
     return output_text
         return pd.DataFrame({"Error": [f"Failed to load JSON: {e}"]})
 # ----------------------------
+# Gradio interface (Final Version)
 # ----------------------------
 with gr.Blocks() as app:
+    gr.Markdown("# 🏆 Candidate Selection (Keyword Similarity Matching)")
+    gr.Markdown("### **Reliable ranking using objective TF-IDF & Cosine Similarity for keyword overlap.**")
     gr.Markdown("#### 🔍 Raw JSON Preview: First 5 Candidates")
     gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")
     gr.Markdown("---")
+    # Step 2: Recommendations
+    recommend_button = gr.Button("3. Rank Candidates by Role Keyword Match")
+    recommend_output_text = gr.Textbox(label="Top Candidate Recommendations Summary", lines=10, placeholder="Click 'Rank Candidates by Role Keyword Match' after Step 2 completes.")
+    recommend_button.click(similarity_recommendations, inputs=[category_dropdown], outputs=[recommend_output_text])
 if __name__ == "__main__":
     app.launch(share=True)