curiouscurrent committed on
Commit
35bd947
·
verified ·
1 Parent(s): 8608c15

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -19
app.py CHANGED
@@ -3,15 +3,15 @@ import pandas as pd
3
  import json
4
  import os
5
  import requests
6
- import re # Added for robust name parsing
7
  from functools import lru_cache
8
 
9
  # ----------------------------
10
  # CONFIG
11
  # ----------------------------
12
  JSON_FILE = "form-submissions-1.json"
13
- # 🚩 FIX 1: Switching to a smaller, more reliable model to avoid API failures
14
- MODEL_ID = "google/flan-t5-base" # Flan-T5 Base (or try 'small' if this fails)
15
  HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
16
  FILTERED_CSV = "/tmp/filtered_candidates.csv"
17
  OUTPUT_FILE = "/tmp/outputs.csv"
@@ -35,11 +35,10 @@ CATEGORIES = {
35
  }
36
 
37
  # ----------------------------
38
- # LLM Call for Ranking (Modified for Robustness)
39
  # ----------------------------
40
  @lru_cache(maxsize=1)
41
  def rank_candidates(candidates_list_str, category_name, job_titles_tuple):
42
- # 🚩 FIX 2: Requesting a numbered list instead of a comma-separated string
43
  prompt = f"""
44
  You are an HR expert specializing in the '{category_name}' category.
45
  Your goal is to rank the provided candidates based on two criteria:
@@ -59,13 +58,14 @@ Review the following list of candidates (JSON format):
59
  payload = {
60
  "inputs": prompt,
61
  "parameters": {
62
- "max_new_tokens": 150, # Increased for a numbered list
63
  "return_full_text": False,
64
  "temperature": 0.3
65
  }
66
  }
67
 
68
  try:
 
69
  response = requests.post(
70
  f"https://api-inference.huggingface.co/models/{MODEL_ID}",
71
  headers=headers,
@@ -81,15 +81,17 @@ Review the following list of candidates (JSON format):
81
 
82
  generated_text = result[0].get("generated_text", "").strip()
83
 
84
- # 🚩 FIX 3: Robust parsing for numbered list
85
  ranked_names = []
86
- # Use regex to find lines starting with a number and a dot (e.g., "1. Name")
87
- for match in re.findall(r'\d+\.\s*(.+)', generated_text):
88
  name = match.strip()
89
- # Clean up potential trailing text, like parentheses or commas
90
  name = re.sub(r'[,)].*$', '', name).strip()
91
  if name:
92
- ranked_names.append(name)
 
 
93
 
94
  return ranked_names
95
 
@@ -142,7 +144,7 @@ def filter_by_roles(category_name):
142
 
143
 
144
  # ----------------------------
145
- # Step 2: LLM recommendations (Modified for Ranking)
146
  # ----------------------------
147
  def llm_recommendations(category_name):
148
  job_titles = CATEGORIES[category_name]
@@ -158,7 +160,7 @@ def llm_recommendations(category_name):
158
  if df_filtered.empty:
159
  return f"No filtered candidates found for category '{category_name}'. Run Step 1 first."
160
 
161
- # 🚩 FIX 4: Select top 30 candidates for the LLM to review
162
  df_top_for_llm = df_filtered.head(30).fillna('N/A')
163
 
164
  # Only send necessary info for ranking
@@ -168,18 +170,18 @@ def llm_recommendations(category_name):
168
  ranked_names = rank_candidates(candidates_list_str, category_name, tuple(job_titles))
169
 
170
  if not ranked_names:
171
- return f"LLM failed to extract or rank suitable candidates for '{category_name}'. This usually means the LLM did not return a properly formatted numbered list. Try a smaller model (e.g., 'flan-t5-small') or check the API key."
172
 
173
  # Reorder the original DataFrame based on the names returned by the LLM
174
  name_to_rank = {name: i for i, name in enumerate(ranked_names)}
175
 
176
- # Filter to only include the names returned by the LLM AND that are in the original list (robustness)
177
  df_ranked = df_filtered[df_filtered["Name"].isin(ranked_names)].copy()
178
 
179
- # Use the rank dictionary to sort the DataFrame, handling names the LLM returned but aren't in the top 30
180
  df_ranked["LLM_Rank"] = df_ranked["Name"].map(name_to_rank)
181
 
182
- # Names that weren't ranked by the LLM will have NaN rank, we drop them.
183
  df_ranked.dropna(subset=['LLM_Rank'], inplace=True)
184
 
185
  df_top5 = df_ranked.sort_values(by="LLM_Rank").head(5)
@@ -187,7 +189,7 @@ def llm_recommendations(category_name):
187
  final_names = df_top5["Name"].tolist()
188
 
189
  if not final_names:
190
- return f"The LLM returned names, but none matched the candidates available for ranking in '{category_name}'. Try widening the initial role filters."
191
 
192
  output_text = f"Top {len(final_names)} Recommended Candidates for the '{category_name}' Category:\n\n"
193
 
@@ -215,7 +217,7 @@ def show_first_candidates():
215
  # Gradio interface (Unchanged)
216
  # ----------------------------
217
  with gr.Blocks() as app:
218
- gr.Markdown("# Candidate Recommendation Engine (Experience & Education Focus)")
219
 
220
  gr.Markdown("#### Raw JSON Preview: First 5 Candidates")
221
  gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")
 
3
  import json
4
  import os
5
  import requests
6
+ import re
7
  from functools import lru_cache
8
 
9
  # ----------------------------
10
  # CONFIG
11
  # ----------------------------
12
  JSON_FILE = "form-submissions-1.json"
13
+ # 🚩 FINAL FIX 1: Switching to the smallest, most reliable model
14
+ MODEL_ID = "google/flan-t5-small"
15
  HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
16
  FILTERED_CSV = "/tmp/filtered_candidates.csv"
17
  OUTPUT_FILE = "/tmp/outputs.csv"
 
35
  }
36
 
37
  # ----------------------------
38
+ # LLM Call for Ranking (Model Switched)
39
  # ----------------------------
40
  @lru_cache(maxsize=1)
41
  def rank_candidates(candidates_list_str, category_name, job_titles_tuple):
 
42
  prompt = f"""
43
  You are an HR expert specializing in the '{category_name}' category.
44
  Your goal is to rank the provided candidates based on two criteria:
 
58
  payload = {
59
  "inputs": prompt,
60
  "parameters": {
61
+ "max_new_tokens": 150,
62
  "return_full_text": False,
63
  "temperature": 0.3
64
  }
65
  }
66
 
67
  try:
68
+ # NOTE: Flan-T5 Small should be much faster, but we keep the long timeout as a safety net.
69
  response = requests.post(
70
  f"https://api-inference.huggingface.co/models/{MODEL_ID}",
71
  headers=headers,
 
81
 
82
  generated_text = result[0].get("generated_text", "").strip()
83
 
84
+ # 🚩 FINAL FIX 2: Slightly more permissive regex to capture common list formats (1., 1) or 1 -)
85
  ranked_names = []
86
+ # Looks for: (1) start of line, (2) 1 or more digits, (3) a separator (dot, paren, or hyphen), (4) capture the rest
87
+ for match in re.findall(r'^\s*\d+[\.\)\-]\s*(.+)', generated_text, re.MULTILINE):
88
  name = match.strip()
89
+ # Clean up potential trailing text (e.g., a candidate's description the model added)
90
  name = re.sub(r'[,)].*$', '', name).strip()
91
  if name:
92
+ # Only include names that are plausible (not too short)
93
+ if len(name.split()) >= 2 or len(name) > 4:
94
+ ranked_names.append(name)
95
 
96
  return ranked_names
97
 
 
144
 
145
 
146
  # ----------------------------
147
+ # Step 2: LLM recommendations (Robust Ranking Logic)
148
  # ----------------------------
149
  def llm_recommendations(category_name):
150
  job_titles = CATEGORIES[category_name]
 
160
  if df_filtered.empty:
161
  return f"No filtered candidates found for category '{category_name}'. Run Step 1 first."
162
 
163
+ # Select top 30 candidates for the LLM to review
164
  df_top_for_llm = df_filtered.head(30).fillna('N/A')
165
 
166
  # Only send necessary info for ranking
 
170
  ranked_names = rank_candidates(candidates_list_str, category_name, tuple(job_titles))
171
 
172
  if not ranked_names:
173
+ return f"LLM failed to extract or rank suitable candidates for '{category_name}'. Final troubleshooting steps: 1. Manually verify your HF_API_TOKEN is correct. 2. If the token is correct, the issue is with the data provided, which is causing the model to generate unusable output."
174
 
175
  # Reorder the original DataFrame based on the names returned by the LLM
176
  name_to_rank = {name: i for i, name in enumerate(ranked_names)}
177
 
178
+ # Filter to only include the names returned by the LLM
179
  df_ranked = df_filtered[df_filtered["Name"].isin(ranked_names)].copy()
180
 
181
+ # Use the rank dictionary to sort the DataFrame
182
  df_ranked["LLM_Rank"] = df_ranked["Name"].map(name_to_rank)
183
 
184
+ # Drop candidates the LLM mentioned but weren't in the original filter list
185
  df_ranked.dropna(subset=['LLM_Rank'], inplace=True)
186
 
187
  df_top5 = df_ranked.sort_values(by="LLM_Rank").head(5)
 
189
  final_names = df_top5["Name"].tolist()
190
 
191
  if not final_names:
192
+ return f"The LLM returned names, but none matched the candidates available for ranking in '{category_name}'. This suggests the names in your JSON data do not exactly match the names generated by the LLM (e.g., 'John Smith' vs 'Mr. John Smith')."
193
 
194
  output_text = f"Top {len(final_names)} Recommended Candidates for the '{category_name}' Category:\n\n"
195
 
 
217
  # Gradio interface (Unchanged)
218
  # ----------------------------
219
  with gr.Blocks() as app:
220
+ gr.Markdown("# Candidate Recommendation Engine (Final Robust Version)")
221
 
222
  gr.Markdown("#### Raw JSON Preview: First 5 Candidates")
223
  gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")