curiouscurrent committed on
Commit
98ef19f
·
verified ·
1 Parent(s): 35bd947

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -72
app.py CHANGED
@@ -10,15 +10,15 @@ from functools import lru_cache
10
  # CONFIG
11
  # ----------------------------
12
  JSON_FILE = "form-submissions-1.json"
13
- # 🚩 FINAL FIX 1: Switching to the smallest, most reliable model
14
  MODEL_ID = "google/flan-t5-small"
15
- HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
 
16
  FILTERED_CSV = "/tmp/filtered_candidates.csv"
17
  OUTPUT_FILE = "/tmp/outputs.csv"
18
  BATCH_SIZE = 50
19
 
20
  if not HF_API_TOKEN:
21
- raise ValueError("HF_API_TOKEN not found in environment. Add it in Space Secrets.")
22
 
23
  CATEGORIES = {
24
  "AI": [
@@ -35,69 +35,58 @@ CATEGORIES = {
35
  }
36
 
37
  # ----------------------------
38
- # LLM Call for Ranking (Model Switched)
39
  # ----------------------------
40
- @lru_cache(maxsize=1)
41
- def rank_candidates(candidates_list_str, category_name, job_titles_tuple):
 
 
 
 
42
  prompt = f"""
43
- You are an HR expert specializing in the '{category_name}' category.
44
- Your goal is to rank the provided candidates based on two criteria:
45
- 1. **Experience**: Inferred from relevant roles and extensive skills.
46
- 2. **Educational Background**: Assume candidates with technical roles/skills have a strong technical education (e.g., MSc/PhD).
47
 
48
- The target roles are: {list(job_titles_tuple)}
49
 
50
- Review the following list of candidates (JSON format):
51
- {candidates_list_str}
52
 
53
- **Task**: Select the **top 5 most promising candidates** from this list.
54
- **Output Format**: Respond ONLY with a numbered list (1. Name, 2. Name, etc.) of the candidates' **Names**. Do not include any commentary.
55
  """
56
  headers = {"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"}
57
 
58
  payload = {
59
  "inputs": prompt,
60
  "parameters": {
61
- "max_new_tokens": 150,
62
  "return_full_text": False,
63
- "temperature": 0.3
64
  }
65
  }
66
 
67
  try:
68
- # NOTE: Flan-T5 Small should be much faster, but we keep the long timeout as a safety net.
69
  response = requests.post(
70
  f"https://api-inference.huggingface.co/models/{MODEL_ID}",
71
  headers=headers,
72
  data=json.dumps(payload),
73
- timeout=120
74
  )
75
  response.raise_for_status()
76
  result = response.json()
77
-
78
- if isinstance(result, dict) and "error" in result:
79
- print(f"LLM API Error: {result.get('error')}")
80
- return []
81
 
82
- generated_text = result[0].get("generated_text", "").strip()
 
 
 
 
 
83
 
84
- # 🚩 FINAL FIX 2: Slightly more permissive regex to capture common list formats (1., 1) or 1 -)
85
- ranked_names = []
86
- # Looks for: (1) start of line, (2) 1 or more digits, (3) a separator (dot, paren, or hyphen), (4) capture the rest
87
- for match in re.findall(r'^\s*\d+[\.\)\-]\s*(.+)', generated_text, re.MULTILINE):
88
- name = match.strip()
89
- # Clean up potential trailing text (e.g., a candidate's description the model added)
90
- name = re.sub(r'[,)].*$', '', name).strip()
91
- if name:
92
- # Only include names that are plausible (not too short)
93
- if len(name.split()) >= 2 or len(name) > 4:
94
- ranked_names.append(name)
95
-
96
- return ranked_names
97
 
98
  except Exception as e:
99
- print("LLM ranking call failed:", e)
100
- return []
101
 
102
  # ----------------------------
103
  # Step 1: Filter by roles (Unchanged)
@@ -140,11 +129,11 @@ def filter_by_roles(category_name):
140
 
141
  df = pd.DataFrame(filtered)
142
  df.to_csv(FILTERED_CSV, index=False)
143
- return df, f"{len(df)} candidates filtered by role for category '{category_name}'. Ready for LLM ranking."
144
 
145
 
146
  # ----------------------------
147
- # Step 2: LLM recommendations (Robust Ranking Logic)
148
  # ----------------------------
149
  def llm_recommendations(category_name):
150
  job_titles = CATEGORIES[category_name]
@@ -160,43 +149,54 @@ def llm_recommendations(category_name):
160
  if df_filtered.empty:
161
  return f"No filtered candidates found for category '{category_name}'. Run Step 1 first."
162
 
163
- # Select top 30 candidates for the LLM to review
164
- df_top_for_llm = df_filtered.head(30).fillna('N/A')
 
165
 
166
- # Only send necessary info for ranking
167
- candidates_to_rank = df_top_for_llm[["Name", "Roles", "Skills"]].to_dict(orient="records")
168
- candidates_list_str = json.dumps(candidates_to_rank, indent=2)
169
-
170
- ranked_names = rank_candidates(candidates_list_str, category_name, tuple(job_titles))
171
-
172
- if not ranked_names:
173
- return f"LLM failed to extract or rank suitable candidates for '{category_name}'. Final troubleshooting steps: 1. Manually verify your HF_API_TOKEN is correct. 2. If the token is correct, the issue is with the data provided, which is causing the model to generate unusable output."
174
-
175
- # Reorder the original DataFrame based on the names returned by the LLM
176
- name_to_rank = {name: i for i, name in enumerate(ranked_names)}
177
-
178
- # Filter to only include the names returned by the LLM
179
- df_ranked = df_filtered[df_filtered["Name"].isin(ranked_names)].copy()
180
 
181
- # Use the rank dictionary to sort the DataFrame
182
- df_ranked["LLM_Rank"] = df_ranked["Name"].map(name_to_rank)
 
 
 
 
 
 
 
 
 
 
183
 
184
- # Drop candidates the LLM mentioned but weren't in the original filter list
185
- df_ranked.dropna(subset=['LLM_Rank'], inplace=True)
 
 
 
 
186
 
187
- df_top5 = df_ranked.sort_values(by="LLM_Rank").head(5)
 
 
 
 
 
 
 
 
 
 
 
188
 
189
  final_names = df_top5["Name"].tolist()
190
 
191
- if not final_names:
192
- return f"The LLM returned names, but none matched the candidates available for ranking in '{category_name}'. This suggests the names in your JSON data do not exactly match the names generated by the LLM (e.g., 'John Smith' vs 'Mr. John Smith')."
193
-
194
  output_text = f"Top {len(final_names)} Recommended Candidates for the '{category_name}' Category:\n\n"
195
 
196
  for i, name in enumerate(final_names):
197
- output_text += f"{i+1}. {name}\n"
 
198
 
199
- output_text += "\nThese candidates were ranked by the LLM based on inferred experience and assumed education."
200
 
201
  return output_text
202
 
@@ -214,12 +214,13 @@ def show_first_candidates():
214
  return pd.DataFrame({"Error": [f"Failed to load JSON: {e}"]})
215
 
216
  # ----------------------------
217
- # Gradio interface (Unchanged)
218
  # ----------------------------
219
  with gr.Blocks() as app:
220
- gr.Markdown("# Candidate Recommendation Engine (Final Robust Version)")
 
221
 
222
- gr.Markdown("#### Raw JSON Preview: First 5 Candidates")
223
  gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")
224
 
225
  gr.Markdown("---")
@@ -234,9 +235,10 @@ with gr.Blocks() as app:
234
  gr.Markdown("---")
235
 
236
  # Step 2: LLM Recommendations
237
- llm_button = gr.Button("3. Get LLM Recommendations (Experience & Education Ranking)")
238
  llm_output_text = gr.Textbox(label="Top Candidate Recommendations Summary", lines=10, placeholder="Click 'Get LLM Recommendations' after Step 2 completes.")
239
  llm_button.click(llm_recommendations, inputs=[category_dropdown], outputs=[llm_output_text])
240
 
241
  if __name__ == "__main__":
242
- app.launch()
 
 
10
  # CONFIG
11
  # ----------------------------
12
# Input data, model selection, and scratch-file locations used by the app.
JSON_FILE = "form-submissions-1.json"
MODEL_ID = "google/flan-t5-small"
# NOTE: HF_API_TOKEN MUST be set in your environment variables/Space secrets.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
FILTERED_CSV = "/tmp/filtered_candidates.csv"
OUTPUT_FILE = "/tmp/outputs.csv"
BATCH_SIZE = 50

if not HF_API_TOKEN:
    # Deliberately not fatal: the UI should still load without a token.
    # Emit a visible startup warning instead of silently doing nothing, so a
    # missing secret is easy to diagnose from the logs.
    print("WARNING: HF_API_TOKEN is not set. LLM calls will be skipped until it is configured.")
22
 
23
  CATEGORIES = {
24
  "AI": [
 
35
  }
36
 
37
  # ----------------------------
38
+ # LLM Call for Scoring (Focus: Role Experience ONLY)
39
  # ----------------------------
40
def _extract_score(generated_text):
    """Return the first integer found in *generated_text*, clamped to 1-10.

    Returns 0 when the text contains no digits at all, which callers treat as
    "no usable score".
    """
    match = re.search(r'\d+', generated_text)
    if match is None:
        return 0
    return max(1, min(10, int(match.group(0))))


@lru_cache(maxsize=512)
def _request_score(candidate_str, category_name, job_titles_tuple):
    """Ask the hosted model for a 1-10 suitability score (memoized on success).

    Any HTTP, network, or response-shape problem propagates as an exception.
    lru_cache does not store a result when the wrapped call raises, so a
    transient failure is retried on the next call instead of being cached as 0.
    """
    prompt = f"""
    You are an HR assistant. Your task is to rate a candidate's suitability based ONLY on their previous job roles.
    Rate the suitability of the following candidate on a scale of 1 (Lowest) to 10 (Highest).
    The score must reflect how closely the candidate's 'Roles' align with the target job titles.

    The target roles for the '{category_name}' category are: {list(job_titles_tuple)}

    Candidate JSON: {candidate_str}

    **Task**: Respond ONLY with the rating number (an integer from 1 to 10).
    """
    headers = {"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"}

    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 5,
            "return_full_text": False,
            "temperature": 0.1
        }
    }

    response = requests.post(
        f"https://api-inference.huggingface.co/models/{MODEL_ID}",
        headers=headers,
        data=json.dumps(payload),
        timeout=60
    )
    response.raise_for_status()
    result = response.json()

    # Default to an empty string (not "0"): a missing field must yield 0,
    # not be parsed as the digit 0 and then clamped up to 1.
    generated_text = result[0].get("generated_text", "").strip()
    return _extract_score(generated_text)


def score_candidate(candidate_str, category_name, job_titles_tuple):
    """Score one candidate's role fit for a category using the hosted LLM.

    Args:
        candidate_str: JSON string describing the candidate.
        category_name: Name of the target category, inserted into the prompt.
        job_titles_tuple: Target job titles as a tuple (hashable, so the
            underlying request can be memoized).

    Returns:
        An int from 1 to 10 parsed from the model's reply, or 0 when the token
        is missing, the request fails, or the reply contains no number.
    """
    if not HF_API_TOKEN:
        print("API Token is missing. Returning score 0.")
        return 0

    try:
        return _request_score(candidate_str, category_name, job_titles_tuple)
    except (
        requests.RequestException,  # connection errors, timeouts, HTTP status errors
        ValueError,                 # response body is not valid JSON
        KeyError, IndexError, TypeError, AttributeError,  # unexpected JSON shape
    ) as e:
        print(f"LLM scoring call failed for candidate (API/Network Error): {e}")
        return 0
90
 
91
  # ----------------------------
92
  # Step 1: Filter by roles (Unchanged)
 
129
 
130
  df = pd.DataFrame(filtered)
131
  df.to_csv(FILTERED_CSV, index=False)
132
+ return df, f"{len(df)} candidates filtered by role for category '{category_name}'. Ready for LLM scoring."
133
 
134
 
135
  # ----------------------------
136
+ # Step 2: LLM recommendations (Scoring, Sorting, and Output)
137
  # ----------------------------
138
  def llm_recommendations(category_name):
139
  job_titles = CATEGORIES[category_name]
 
149
  if df_filtered.empty:
150
  return f"No filtered candidates found for category '{category_name}'. Run Step 1 first."
151
 
152
+ # Prepare for scoring
153
+ df_filtered_clean = df_filtered.fillna('N/A')
154
+ filtered_candidates = df_filtered_clean.to_dict(orient="records")
155
 
156
+ scores = []
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
+ for person in filtered_candidates:
159
+ candidate_info = {
160
+ "Name": person.get("Name"),
161
+ "Roles": person.get("Roles"),
162
+ "Skills": person.get("Skills")
163
+ }
164
+ candidate_str = json.dumps(candidate_info)
165
+
166
+ score = score_candidate(candidate_str, category_name, tuple(job_titles))
167
+ scores.append(score)
168
+
169
+ df_filtered["LLM_Score"] = scores
170
 
171
+ df_recommended = df_filtered[df_filtered["LLM_Score"] > 0].copy()
172
+
173
+ if df_recommended.empty:
174
+ if not HF_API_TOKEN:
175
+ return "❌ LLM failed: The HF_API_TOKEN is not set or is invalid. Set the token and try again."
176
+ return f"LLM scored all candidates 0. The candidates' roles are deemed irrelevant by the LLM for '{category_name}'."
177
 
178
def parse_salary(s):
    """Convert a salary cell (e.g. "$120,000") to a float usable as a sort key.

    Values that cannot be read as a number ("N/A", free text, None, NaN) map
    to +infinity so they sort after every real salary in an ascending sort.
    """
    cleaned = str(s).replace("$", "").replace(",", "").strip()
    try:
        value = float(cleaned)
    except ValueError:
        # Only parse failures are expected here; anything else should surface.
        return float('inf')
    # NaN compares unequal to itself; treat it as "unknown" like other bad input.
    if value != value:
        return float('inf')
    return value
183
+
184
+ df_recommended["Salary_sort"] = df_recommended["Salary"].apply(parse_salary)
185
+
186
+ df_top5 = df_recommended.sort_values(
187
+ by=['LLM_Score', 'Salary_sort'],
188
+ ascending=[False, True]
189
+ ).head(5)
190
 
191
  final_names = df_top5["Name"].tolist()
192
 
 
 
 
193
  output_text = f"Top {len(final_names)} Recommended Candidates for the '{category_name}' Category:\n\n"
194
 
195
  for i, name in enumerate(final_names):
196
+ score = df_top5.iloc[i]['LLM_Score']
197
+ output_text += f"{i+1}. {name} (Suitability Score: {score}/10)\n"
198
 
199
+ output_text += "\nThese candidates were ranked by the LLM based **only on the alignment of their previous job roles** with the target roles, using expected salary as a tie-breaker."
200
 
201
  return output_text
202
 
 
214
  return pd.DataFrame({"Error": [f"Failed to load JSON: {e}"]})
215
 
216
  # ----------------------------
217
+ # Gradio interface (Updated Heading and Launch)
218
  # ----------------------------
219
  with gr.Blocks() as app:
220
+ # 🚩 CHANGE: Updated Heading
221
+ gr.Markdown("# 🤖 Candidate Selection (Role-Based Scoring)")
222
 
223
+ gr.Markdown("#### 🔍 Raw JSON Preview: First 5 Candidates")
224
  gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")
225
 
226
  gr.Markdown("---")
 
235
  gr.Markdown("---")
236
 
237
  # Step 2: LLM Recommendations
238
+ llm_button = gr.Button("3. Get LLM Recommendations (Role Experience Ranking)")
239
  llm_output_text = gr.Textbox(label="Top Candidate Recommendations Summary", lines=10, placeholder="Click 'Get LLM Recommendations' after Step 2 completes.")
240
  llm_button.click(llm_recommendations, inputs=[category_dropdown], outputs=[llm_output_text])
241
 
242
  if __name__ == "__main__":
243
+ # 🚩 CHANGE: Set share=True to generate a public link
244
+ app.launch(share=True)