curiouscurrent committed on
Commit
db61f50
·
verified ·
1 Parent(s): daf3997

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -167
app.py CHANGED
@@ -2,30 +2,23 @@ import gradio as gr
2
  import pandas as pd
3
  import json
4
  import os
 
5
  import re
6
- from sklearn.feature_extraction.text import TfidfVectorizer
7
- from sklearn.metrics.pairwise import cosine_similarity
8
  from functools import lru_cache
9
 
10
- # *** IMPORTANT: YOU MUST REPLACE THIS WITH YOUR ACTUAL LLM CLIENT/API ***
11
- # For demonstration, we will use a mock function, but in reality,
12
- # you'd use a library like 'openai', 'google-genai', or 'llama-cpp-python'.
13
-
14
- # MOCK LLM CLIENT (Replace with actual LLM API call)
15
def llm_api_call(prompt):
    """Mock stand-in for a real LLM API call (demonstration only).

    Scans *prompt* (case-insensitively) for known topic keywords and returns
    the matching canned analysis; first match wins.
    """
    # Ordered keyword -> canned-response table; checked in declaration order.
    canned_responses = (
        ("average salary",
         "Based on the filtered candidates, the average salary expectation is approximately **$140,000 USD** among the top 5 candidates. Candidate Alice Smith has the highest score."),
        ("best skills",
         "The top candidates predominantly possess skills in **Python, PyTorch, TensorFlow, and AWS/Azure**. This aligns well with the roles in the AI category."),
    )
    lowered = prompt.lower()
    for keyword, reply in canned_responses:
        if keyword in lowered:
            return reply
    # Fallback when no keyword matched.
    return "I need more context from the question to generate a meaningful analysis. Try asking about salaries, key skills, or location distribution among the top candidates."
23
-
24
  # ----------------------------
25
  # CONFIG
26
  # ----------------------------
27
  JSON_FILE = "form-submissions-1.json"
 
 
 
28
  FILTERED_CSV = "/tmp/filtered_candidates.csv"
 
 
 
 
 
29
 
30
  CATEGORIES = {
31
  "AI": [
@@ -42,112 +35,69 @@ CATEGORIES = {
42
  }
43
 
44
  # ----------------------------
45
- # Similarity Matching Function (Reliable Objective Scoring)
46
- # ----------------------------
47
def calculate_similarity_scores(df_candidates, category_name):
    """Score each candidate's 'Roles' text against the target roles of a category.

    Uses TF-IDF (uni+bigrams, English stop words) plus cosine similarity
    between one synthetic "target document" (all target role titles joined)
    and each candidate's roles text.

    Args:
        df_candidates: DataFrame with a 'Roles' column of free-text roles.
        category_name: key into the module-level CATEGORIES dict.

    Returns:
        pd.Series of floats in [0, 1], aligned with df_candidates.index
        (empty float64 Series for an empty input frame).
    """
    # BUG FIX: the original decorated this with @lru_cache(maxsize=1), but
    # DataFrames are unhashable, so every call raised TypeError before doing
    # any work.  The decorator is removed; callers get a fresh computation.
    if df_candidates.empty:
        return pd.Series([], dtype='float64')

    target_roles = " ".join(CATEGORIES[category_name])
    # Row 0 of the corpus is the target document; rows 1.. are candidates.
    corpus = [target_roles] + df_candidates['Roles'].tolist()

    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(corpus)

    target_vector = tfidf_matrix[0]
    candidate_vectors = tfidf_matrix[1:]
    similarity_scores = cosine_similarity(target_vector, candidate_vectors).flatten()
    # Re-attach the candidates' original index so scores can be assigned back.
    return pd.Series(similarity_scores, index=df_candidates.index)
61
-
62
- # ----------------------------
63
- # Helper: Rank and retrieve Top 5 candidates
64
- # ----------------------------
65
def get_top5_candidates(category_name):
    """Load the Step-1 CSV, score it, and return the 5 best candidates.

    Args:
        category_name: key into the module-level CATEGORIES dict.

    Returns:
        (df_top5, error_msg) — on success error_msg is None and df_top5 holds
        up to 5 rows with Name/Roles/Skills/Salary/Location/Similarity_Score;
        on failure df_top5 is empty and error_msg explains why.
    """
    if not os.path.exists(FILTERED_CSV):
        return pd.DataFrame(), "Error: Filtered CSV not found. Run Step 1 and Step 2 first."

    df_filtered = pd.read_csv(FILTERED_CSV)
    df_filtered = df_filtered[df_filtered["Category"] == category_name]

    if df_filtered.empty:
        return pd.DataFrame(), f"No filtered candidates found for category '{category_name}'."

    # Recalculate scores so the ranking is always consistent with the CSV contents.
    df_filtered["Similarity_Score"] = calculate_similarity_scores(df_filtered, category_name)
    # Drop candidates with essentially no textual overlap (< 1% similarity).
    df_recommended = df_filtered[df_filtered["Similarity_Score"] > 0.01].copy()

    def parse_salary(s):
        # FIX: the original used a bare `except:` and round-tripped "N/A"
        # through str(float('inf')).  Strip currency formatting, then treat
        # anything unparseable (including "N/A") as +inf so it sorts last.
        try:
            return float(str(s).replace("$", "").replace(",", ""))
        except (TypeError, ValueError):
            return float('inf')

    df_recommended["Salary_sort"] = df_recommended["Salary"].apply(parse_salary)

    # Highest similarity first; lower expected salary breaks ties.
    df_top5 = df_recommended.sort_values(
        by=['Similarity_Score', 'Salary_sort'],
        ascending=[False, True],
    ).head(5)

    # Keep only the columns the LLM context needs.
    df_top5 = df_top5[['Name', 'Roles', 'Skills', 'Salary', 'Location', 'Similarity_Score']]
    return df_top5, None
95
-
96
- # ----------------------------
97
- # Step 3: LLM Question Answering (New Feature)
98
  # ----------------------------
99
def ask_llm_about_candidates(question, category_name):
    """
    RAG-style Q&A: serializes the Step-2 top-5 table to markdown and hands it
    to the LLM as the *only* context for answering the founder's question.
    """
    top5, failure = get_top5_candidates(category_name)

    # Guard clauses: surface pipeline problems instead of calling the LLM.
    if failure:
        return f"Cannot run Q&A: {failure}"
    if top5.empty:
        return "No top candidates were identified in Step 2 to provide context for this question."

    # Serialize the ranked table so the model can read it.
    context_table = top5.to_markdown(index=False)

    system_prompt = f"""
    You are an expert Talent Acquisition Analyst. Your task is to analyze the provided table of top-ranked candidates for the '{category_name}' category and answer the founder's question concisely.
    The candidates were ranked based on the keyword match of their roles to the target category.

    **CONTEXT (Top 5 Candidates):**
    ---
    {context_table}
    ---

    **INSTRUCTIONS:**
    1. Base your answer ONLY on the provided CONTEXT table. Do not use external knowledge.
    2. Answer the question in a clear, professional, and business-focused manner.
    3. If the data is insufficient to answer, state that clearly.
    """

    try:
        # MOCK call — swap in the real LLM client here.
        return llm_api_call(f"{system_prompt}\n\nFOUNDER'S QUESTION: {question}")
    except Exception as e:
        return f"LLM API Error: Could not connect or receive a response. Check API key and configuration. Error details: {e}"
138
-
139
-
140
- # --- Other Functions (filter_by_roles, similarity_recommendations, show_first_candidates) ---
141
- # (Keep the rest of the original functions here, unchanged)
142
 
 
 
 
143
  def filter_by_roles(category_name):
144
- # (The body of the original filter_by_roles function)
145
  job_titles = CATEGORIES[category_name]
146
  try:
147
  with open(JSON_FILE, encoding="utf-8") as f:
148
  data = json.load(f)
149
  except FileNotFoundError:
150
- return pd.DataFrame(), f"Error: JSON file '{JSON_FILE}' not found. The application can't proceed."
151
 
152
  filtered = []
153
 
@@ -175,34 +125,84 @@ def filter_by_roles(category_name):
175
  })
176
 
177
  if not filtered:
178
- return pd.DataFrame(), f"No candidates found matching roles for category '{category_name}'. The application can't proceed."
179
 
180
  df = pd.DataFrame(filtered)
181
  df.to_csv(FILTERED_CSV, index=False)
182
- return df, f"{len(df)} candidates filtered by role for category '{category_name}'. Ready for Similarity Ranking."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
def similarity_recommendations(category_name):
    """Format the Step-2 top-5 similarity ranking as a display string."""
    top5, failure = get_top5_candidates(category_name)

    if failure:
        return failure
    if top5.empty:
        return f"All candidates had insufficient text similarity (less than 1%) to the target roles for '{category_name}'. The roles do not match the target category keywords."

    names = top5["Name"].tolist()

    output_text = f"Top {len(names)} Recommended Candidates for the '{category_name}' Category:\n\n"
    # One numbered line per candidate with the match score as a percentage.
    for rank, name in enumerate(names, start=1):
        pct = f"{top5['Similarity_Score'].iloc[rank - 1] * 100:.2f}%"
        output_text += f"{rank}. {name} (Role Match: {pct})\n"

    output_text += "\nThese candidates were ranked objectively based on the **keyword similarity (TF-IDF)** of their previous job roles to the target roles, using expected salary as a tie-breaker."
    return output_text
205
 
 
 
 
206
  def show_first_candidates():
207
  try:
208
  with open(JSON_FILE, encoding="utf-8") as f:
@@ -214,58 +214,31 @@ def show_first_candidates():
214
  return pd.DataFrame({"Error": [f"Failed to load JSON: {e}"]})
215
 
216
  # ----------------------------
217
- # Gradio interface (Final Version with Q&A)
218
  # ----------------------------
219
with gr.Blocks() as app:
    gr.Markdown("# 🏆 Candidate Selection & Founder Analysis")
    gr.Markdown("### **Reliable ranking using objective TF-IDF & Cosine Similarity, plus an LLM-powered Q&A tool.**")

    # Tab 1: the two-step filter + rank pipeline.
    with gr.Tab("1. Candidate Ranking"):
        gr.Markdown("#### 🔍 Raw JSON Preview: First 5 Candidates")
        gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")

        gr.Markdown("---")
        rank_category = gr.Dropdown(list(CATEGORIES.keys()), label="Select Category", value="AI")

        # Step 1: role-based filtering writes the intermediate CSV.
        btn_filter = gr.Button("1. Filter Candidates by Roles (Create CSV)")
        out_filtered = gr.Dataframe(label="Filtered Candidates (Preview)")
        out_filter_status = gr.Textbox(label="Filter Status", placeholder="Click 'Filter Candidates by Roles' to start.")
        btn_filter.click(filter_by_roles, inputs=[rank_category], outputs=[out_filtered, out_filter_status])

        gr.Markdown("---")

        # Step 2: similarity ranking over the filtered CSV.
        btn_rank = gr.Button("2. Rank and Find Top 5 Candidates")
        out_rank = gr.Textbox(label="Top Candidate Recommendations Summary", lines=10, placeholder="Click 'Rank and Find Top 5 Candidates' after Step 1 completes.")
        btn_rank.click(similarity_recommendations, inputs=[rank_category], outputs=[out_rank])

    # Tab 2: free-form Q&A grounded in the top-5 table.
    with gr.Tab("2. Founder Q&A"):
        gr.Markdown("### 🧠 Ask the LLM about the Top Candidates")
        gr.Markdown("The LLM uses the **Top 5 candidates** identified in the 'Candidate Ranking' tab as its sole source of information.")

        qa_category = gr.Dropdown(list(CATEGORIES.keys()), label="Select Category for Q&A", value="AI")

        qa_question = gr.Textbox(
            label="Founder's Question",
            lines=2,
            placeholder="e.g., What is the average expected salary of the top candidates? Or, What are their most common skills?",
            value="What is the average expected salary of the top candidates?",
        )

        btn_qa = gr.Button("3. Get LLM Analysis")

        qa_answer = gr.Textbox(
            label="LLM Response (Context-Based Analysis)",
            lines=8,
            placeholder="The analysis will appear here after you click the button.",
        )

        btn_qa.click(
            ask_llm_about_candidates,
            inputs=[qa_question, qa_category],
            outputs=[qa_answer],
        )

if __name__ == "__main__":
    app.launch(share=True)
 
2
import pandas as pd
import json
import os
import requests
import re
from functools import lru_cache

# ----------------------------
# CONFIG
# ----------------------------
JSON_FILE = "form-submissions-1.json"
MODEL_ID = "google/flan-t5-small"
# NOTE: HF_API_TOKEN MUST be set in your environment variables/Space secrets.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
FILTERED_CSV = "/tmp/filtered_candidates.csv"
OUTPUT_FILE = "/tmp/outputs.csv"
BATCH_SIZE = 50

# FIX: the original `if not HF_API_TOKEN: pass` was dead code.  Warn loudly at
# startup so a missing token shows in the Space logs instead of silently
# degrading every candidate's score to 0.
if not HF_API_TOKEN:
    print("WARNING: HF_API_TOKEN is not set; LLM scoring will return 0 for every candidate.")
22
 
23
  CATEGORIES = {
24
  "AI": [
 
35
  }
36
 
37
  # ----------------------------
38
+ # LLM Call for Scoring (Focus: Role Experience ONLY)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  # ----------------------------
40
@lru_cache(maxsize=512)
def score_candidate(candidate_str, category_name, job_titles_tuple):
    """Ask the HF Inference API to rate one candidate's role fit from 1 to 10.

    All arguments are hashable on purpose so lru_cache can memoize repeat
    candidates across runs.

    Args:
        candidate_str: JSON string of {"Name", "Roles", "Skills"}.
        category_name: target category label shown in the prompt.
        job_titles_tuple: tuple of target job titles for the category.

    Returns:
        int in 1..10 on success; 0 when the token is missing, the request
        fails, or the model output contains no number.
    """
    if not HF_API_TOKEN:
        print("API Token is missing. Returning score 0.")
        return 0

    prompt = f"""
    You are an HR assistant. Your task is to rate a candidate's suitability based ONLY on their previous job roles.
    Rate the suitability of the following candidate on a scale of 1 (Lowest) to 10 (Highest).
    The score must reflect how closely the candidate's 'Roles' align with the target job titles.

    The target roles for the '{category_name}' category are: {list(job_titles_tuple)}

    Candidate JSON: {candidate_str}

    **Task**: Respond ONLY with the rating number (an integer from 1 to 10).
    """
    headers = {"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"}

    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 5,
            "return_full_text": False,
            "temperature": 0.1
        }
    }

    try:
        response = requests.post(
            f"https://api-inference.huggingface.co/models/{MODEL_ID}",
            headers=headers,
            data=json.dumps(payload),
            timeout=60
        )
        response.raise_for_status()
        result = response.json()

        # FIX: the API returns a list of dicts on success but a plain dict
        # (e.g. {"error": ...}) on some failures; the original indexed
        # result[0] unconditionally and relied on the broad except below to
        # hide the resulting crash.  Handle both shapes explicitly.
        if isinstance(result, list) and result:
            generated_text = str(result[0].get("generated_text", "0")).strip()
        elif isinstance(result, dict):
            generated_text = str(result.get("generated_text", "0")).strip()
        else:
            return 0

        # Take the first integer in the reply and clamp it to the 1..10 scale.
        match = re.search(r'\d+', generated_text)
        if match:
            score = int(match.group(0))
            return max(1, min(10, score))

        return 0

    except Exception as e:  # network/API boundary: degrade to score 0, keep the app alive
        print(f"LLM scoring call failed for candidate (API/Network Error): {e}")
        return 0
 
 
 
90
 
91
+ # ----------------------------
92
+ # Step 1: Filter by roles (Unchanged)
93
+ # ----------------------------
94
  def filter_by_roles(category_name):
 
95
  job_titles = CATEGORIES[category_name]
96
  try:
97
  with open(JSON_FILE, encoding="utf-8") as f:
98
  data = json.load(f)
99
  except FileNotFoundError:
100
+ return pd.DataFrame(), f"Error: JSON file '{JSON_FILE}' not found. The LLM can't proceed."
101
 
102
  filtered = []
103
 
 
125
  })
126
 
127
  if not filtered:
128
+ return pd.DataFrame(), f"No candidates found matching roles for category '{category_name}'. The LLM can't proceed."
129
 
130
  df = pd.DataFrame(filtered)
131
  df.to_csv(FILTERED_CSV, index=False)
132
+ return df, f"{len(df)} candidates filtered by role for category '{category_name}'. Ready for LLM scoring."
133
+
134
+
135
+ # ----------------------------
136
+ # Step 2: LLM recommendations (Scoring, Sorting, and Output)
137
+ # ----------------------------
138
def llm_recommendations(category_name):
    """Step 2: score the filtered candidates with the LLM and summarize the top 5.

    Reads the Step-1 CSV (regenerating it via filter_by_roles if missing),
    asks score_candidate() for a 1-10 role-fit rating per person, then ranks
    by score (desc) with expected salary (asc) as the tie-breaker.

    Returns:
        A display string for the Gradio textbox (result summary or error).
    """
    job_titles = CATEGORIES[category_name]

    # Reuse Step-1 output when available; otherwise rebuild it on the fly.
    if not os.path.exists(FILTERED_CSV):
        df_filtered, msg = filter_by_roles(category_name)
        if df_filtered.empty:
            return msg
    else:
        df_filtered = pd.read_csv(FILTERED_CSV)
        df_filtered = df_filtered[df_filtered["Category"] == category_name]

    if df_filtered.empty:
        return f"No filtered candidates found for category '{category_name}'. Run Step 1 first."

    # Score on Name/Roles/Skills only; the JSON string is hashable, which lets
    # score_candidate's lru_cache memoize repeat candidates.
    df_filtered_clean = df_filtered.fillna('N/A')
    scores = []
    for person in df_filtered_clean.to_dict(orient="records"):
        candidate_str = json.dumps({
            "Name": person.get("Name"),
            "Roles": person.get("Roles"),
            "Skills": person.get("Skills"),
        })
        scores.append(score_candidate(candidate_str, category_name, tuple(job_titles)))

    df_filtered["LLM_Score"] = scores

    # Score 0 means "call failed or role irrelevant" — drop those rows.
    df_recommended = df_filtered[df_filtered["LLM_Score"] > 0].copy()

    if df_recommended.empty:
        if not HF_API_TOKEN:
            return "❌ LLM failed: The HF_API_TOKEN is not set or is invalid. Set the token and try again."
        return f"LLM scored all candidates 0. The candidates' roles are deemed irrelevant by the LLM for '{category_name}'."

    def parse_salary(s):
        # FIX: the original used a bare `except:` and round-tripped "N/A"
        # through str(float('inf')).  Strip currency formatting, then treat
        # anything unparseable (including "N/A") as +inf so it sorts last.
        try:
            return float(str(s).replace("$", "").replace(",", ""))
        except (TypeError, ValueError):
            return float('inf')

    df_recommended["Salary_sort"] = df_recommended["Salary"].apply(parse_salary)

    # Highest LLM score first; lower expected salary breaks ties.
    df_top5 = df_recommended.sort_values(
        by=['LLM_Score', 'Salary_sort'],
        ascending=[False, True],
    ).head(5)

    final_names = df_top5["Name"].tolist()

    output_text = f"Top {len(final_names)} Recommended Candidates for the '{category_name}' Category:\n\n"
    for i, name in enumerate(final_names):
        score = df_top5.iloc[i]['LLM_Score']
        output_text += f"{i+1}. {name} (Suitability Score: {score}/10)\n"

    output_text += "\nThese candidates were ranked by the LLM based **only on the alignment of their previous job roles** with the target roles, using expected salary as a tie-breaker."
    return output_text
202
 
203
+ # ----------------------------
204
+ # Show first 5 raw JSON candidates (Unchanged)
205
+ # ----------------------------
206
  def show_first_candidates():
207
  try:
208
  with open(JSON_FILE, encoding="utf-8") as f:
 
214
  return pd.DataFrame({"Error": [f"Failed to load JSON: {e}"]})
215
 
216
  # ----------------------------
217
+ # Gradio interface (Updated Heading and Launch)
218
  # ----------------------------
219
with gr.Blocks() as app:
    gr.Markdown("# 🤖 Candidate Selection (Role-Based Scoring)")

    gr.Markdown("#### 🔍 Raw JSON Preview: First 5 Candidates")
    gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")

    gr.Markdown("---")
    selected_category = gr.Dropdown(list(CATEGORIES.keys()), label="1. Select Category")

    # Step 1: role filter -> CSV + preview table.
    btn_filter = gr.Button("2. Filter Candidates by Roles")
    preview_df = gr.Dataframe(label="Filtered Candidates (Preview)")
    status_box = gr.Textbox(label="Filter Status", placeholder="Click 'Filter Candidates by Roles' to start.")
    btn_filter.click(filter_by_roles, inputs=[selected_category], outputs=[preview_df, status_box])

    gr.Markdown("---")

    # Step 2: LLM scoring and top-5 summary.
    btn_llm = gr.Button("3. Get LLM Recommendations (Role Experience Ranking)")
    summary_box = gr.Textbox(label="Top Candidate Recommendations Summary", lines=10, placeholder="Click 'Get LLM Recommendations' after Step 2 completes.")
    btn_llm.click(llm_recommendations, inputs=[selected_category], outputs=[summary_box])

if __name__ == "__main__":
    # share=True publishes a public Gradio link.
    app.launch(share=True)