curiouscurrent committed on
Commit
35bd947
·
verified ·
1 Parent(s): 8608c15

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -19
app.py CHANGED
@@ -3,15 +3,15 @@ import pandas as pd
3
  import json
4
  import os
5
  import requests
6
- import re # Added for robust name parsing
7
  from functools import lru_cache
8
 
9
  # ----------------------------
10
  # CONFIG
11
  # ----------------------------
12
  JSON_FILE = "form-submissions-1.json"
13
- # 🚩 FIX 1: Switching to a smaller, more reliable model to avoid API failures
14
- MODEL_ID = "google/flan-t5-base" # Flan-T5 Base (or try 'small' if this fails)
15
  HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
16
  FILTERED_CSV = "/tmp/filtered_candidates.csv"
17
  OUTPUT_FILE = "/tmp/outputs.csv"
@@ -35,11 +35,10 @@ CATEGORIES = {
35
  }
36
 
37
  # ----------------------------
38
- # LLM Call for Ranking (Modified for Robustness)
39
  # ----------------------------
40
  @lru_cache(maxsize=1)
41
  def rank_candidates(candidates_list_str, category_name, job_titles_tuple):
42
- # 🚩 FIX 2: Requesting a numbered list instead of a comma-separated string
43
  prompt = f"""
44
  You are an HR expert specializing in the '{category_name}' category.
45
  Your goal is to rank the provided candidates based on two criteria:
@@ -59,13 +58,14 @@ Review the following list of candidates (JSON format):
59
  payload = {
60
  "inputs": prompt,
61
  "parameters": {
62
- "max_new_tokens": 150, # Increased for a numbered list
63
  "return_full_text": False,
64
  "temperature": 0.3
65
  }
66
  }
67
 
68
  try:
 
69
  response = requests.post(
70
  f"https://api-inference.huggingface.co/models/{MODEL_ID}",
71
  headers=headers,
@@ -81,15 +81,17 @@ Review the following list of candidates (JSON format):
81
 
82
  generated_text = result[0].get("generated_text", "").strip()
83
 
84
- # 🚩 FIX 3: Robust parsing for numbered list
85
  ranked_names = []
86
- # Use regex to find lines starting with a number and a dot (e.g., "1. Name")
87
- for match in re.findall(r'\d+\.\s*(.+)', generated_text):
88
  name = match.strip()
89
- # Clean up potential trailing text, like parentheses or commas
90
  name = re.sub(r'[,)].*$', '', name).strip()
91
  if name:
92
- ranked_names.append(name)
 
 
93
 
94
  return ranked_names
95
 
@@ -142,7 +144,7 @@ def filter_by_roles(category_name):
142
 
143
 
144
  # ----------------------------
145
- # Step 2: LLM recommendations (Modified for Ranking)
146
  # ----------------------------
147
  def llm_recommendations(category_name):
148
  job_titles = CATEGORIES[category_name]
@@ -158,7 +160,7 @@ def llm_recommendations(category_name):
158
  if df_filtered.empty:
159
  return f"No filtered candidates found for category '{category_name}'. Run Step 1 first."
160
 
161
- # 🚩 FIX 4: Select top 30 candidates for the LLM to review
162
  df_top_for_llm = df_filtered.head(30).fillna('N/A')
163
 
164
  # Only send necessary info for ranking
@@ -168,18 +170,18 @@ def llm_recommendations(category_name):
168
  ranked_names = rank_candidates(candidates_list_str, category_name, tuple(job_titles))
169
 
170
  if not ranked_names:
171
- return f"LLM failed to extract or rank suitable candidates for '{category_name}'. This usually means the LLM did not return a properly formatted numbered list. Try a smaller model (e.g., 'flan-t5-small') or check the API key."
172
 
173
  # Reorder the original DataFrame based on the names returned by the LLM
174
  name_to_rank = {name: i for i, name in enumerate(ranked_names)}
175
 
176
- # Filter to only include the names returned by the LLM AND that are in the original list (robustness)
177
  df_ranked = df_filtered[df_filtered["Name"].isin(ranked_names)].copy()
178
 
179
- # Use the rank dictionary to sort the DataFrame, handling names the LLM returned but aren't in the top 30
180
  df_ranked["LLM_Rank"] = df_ranked["Name"].map(name_to_rank)
181
 
182
- # Names that weren't ranked by the LLM will have NaN rank, we drop them.
183
  df_ranked.dropna(subset=['LLM_Rank'], inplace=True)
184
 
185
  df_top5 = df_ranked.sort_values(by="LLM_Rank").head(5)
@@ -187,7 +189,7 @@ def llm_recommendations(category_name):
187
  final_names = df_top5["Name"].tolist()
188
 
189
  if not final_names:
190
- return f"The LLM returned names, but none matched the candidates available for ranking in '{category_name}'. Try widening the initial role filters."
191
 
192
  output_text = f"Top {len(final_names)} Recommended Candidates for the '{category_name}' Category:\n\n"
193
 
@@ -215,7 +217,7 @@ def show_first_candidates():
215
  # Gradio interface (Unchanged)
216
  # ----------------------------
217
  with gr.Blocks() as app:
218
- gr.Markdown("# Candidate Recommendation Engine (Experience & Education Focus)")
219
 
220
  gr.Markdown("#### Raw JSON Preview: First 5 Candidates")
221
  gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")
 
3
  import json
4
  import os
5
  import requests
6
+ import re
7
  from functools import lru_cache
8
 
9
  # ----------------------------
10
  # CONFIG
11
  # ----------------------------
12
  JSON_FILE = "form-submissions-1.json"
13
+ # 🚩 FINAL FIX 1: Switching to the smallest, most reliable model
14
+ MODEL_ID = "google/flan-t5-small"
15
  HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
16
  FILTERED_CSV = "/tmp/filtered_candidates.csv"
17
  OUTPUT_FILE = "/tmp/outputs.csv"
 
35
  }
36
 
37
  # ----------------------------
38
+ # LLM Call for Ranking (Model Switched)
39
  # ----------------------------
40
  @lru_cache(maxsize=1)
41
  def rank_candidates(candidates_list_str, category_name, job_titles_tuple):
 
42
  prompt = f"""
43
  You are an HR expert specializing in the '{category_name}' category.
44
  Your goal is to rank the provided candidates based on two criteria:
 
58
  payload = {
59
  "inputs": prompt,
60
  "parameters": {
61
+ "max_new_tokens": 150,
62
  "return_full_text": False,
63
  "temperature": 0.3
64
  }
65
  }
66
 
67
  try:
68
+ # NOTE: Flan-T5 Small should be much faster, but we keep the long timeout as a safety net.
69
  response = requests.post(
70
  f"https://api-inference.huggingface.co/models/{MODEL_ID}",
71
  headers=headers,
 
81
 
82
  generated_text = result[0].get("generated_text", "").strip()
83
 
84
+ # 🚩 FINAL FIX 2: Slightly more permissive regex to capture common list formats (1., 1) or 1 -)
85
  ranked_names = []
86
+ # Looks for: (1) start of line, (2) 1 or more digits, (3) a separator (dot, paren, or hyphen), (4) capture the rest
87
+ for match in re.findall(r'^\s*\d+[\.\)\-]\s*(.+)', generated_text, re.MULTILINE):
88
  name = match.strip()
89
+ # Clean up potential trailing text (e.g., a candidate's description the model added)
90
  name = re.sub(r'[,)].*$', '', name).strip()
91
  if name:
92
+ # Only include names that are plausible (not too short)
93
+ if len(name.split()) >= 2 or len(name) > 4:
94
+ ranked_names.append(name)
95
 
96
  return ranked_names
97
 
 
144
 
145
 
146
  # ----------------------------
147
+ # Step 2: LLM recommendations (Robust Ranking Logic)
148
  # ----------------------------
149
  def llm_recommendations(category_name):
150
  job_titles = CATEGORIES[category_name]
 
160
  if df_filtered.empty:
161
  return f"No filtered candidates found for category '{category_name}'. Run Step 1 first."
162
 
163
+ # Select top 30 candidates for the LLM to review
164
  df_top_for_llm = df_filtered.head(30).fillna('N/A')
165
 
166
  # Only send necessary info for ranking
 
170
  ranked_names = rank_candidates(candidates_list_str, category_name, tuple(job_titles))
171
 
172
  if not ranked_names:
173
+ return f"LLM failed to extract or rank suitable candidates for '{category_name}'. Final troubleshooting steps: 1. Manually verify your HF_API_TOKEN is correct. 2. If the token is correct, the issue is with the data provided, which is causing the model to generate unusable output."
174
 
175
  # Reorder the original DataFrame based on the names returned by the LLM
176
  name_to_rank = {name: i for i, name in enumerate(ranked_names)}
177
 
178
+ # Filter to only include the names returned by the LLM
179
  df_ranked = df_filtered[df_filtered["Name"].isin(ranked_names)].copy()
180
 
181
+ # Use the rank dictionary to sort the DataFrame
182
  df_ranked["LLM_Rank"] = df_ranked["Name"].map(name_to_rank)
183
 
184
+ # Drop candidates the LLM mentioned but weren't in the original filter list
185
  df_ranked.dropna(subset=['LLM_Rank'], inplace=True)
186
 
187
  df_top5 = df_ranked.sort_values(by="LLM_Rank").head(5)
 
189
  final_names = df_top5["Name"].tolist()
190
 
191
  if not final_names:
192
+ return f"The LLM returned names, but none matched the candidates available for ranking in '{category_name}'. This suggests the names in your JSON data do not exactly match the names generated by the LLM (e.g., 'John Smith' vs 'Mr. John Smith')."
193
 
194
  output_text = f"Top {len(final_names)} Recommended Candidates for the '{category_name}' Category:\n\n"
195
 
 
217
  # Gradio interface (Unchanged)
218
  # ----------------------------
219
  with gr.Blocks() as app:
220
+ gr.Markdown("# Candidate Recommendation Engine (Final Robust Version)")
221
 
222
  gr.Markdown("#### Raw JSON Preview: First 5 Candidates")
223
  gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")