curiouscurrent committed on
Commit
db61f50
·
verified ·
1 Parent(s): daf3997

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -167
app.py CHANGED
@@ -2,30 +2,23 @@ import gradio as gr
2
  import pandas as pd
3
  import json
4
  import os
 
5
  import re
6
- from sklearn.feature_extraction.text import TfidfVectorizer
7
- from sklearn.metrics.pairwise import cosine_similarity
8
  from functools import lru_cache
9
 
10
- # *** IMPORTANT: YOU MUST REPLACE THIS WITH YOUR ACTUAL LLM CLIENT/API ***
11
- # For demonstration, we will use a mock function, but in reality,
12
- # you'd use a library like 'openai', 'google-genai', or 'llama-cpp-python'.
13
-
14
- # MOCK LLM CLIENT (Replace with actual LLM API call)
15
def llm_api_call(prompt):
    """Mock stand-in for a real LLM API call (demonstration only).

    Scans *prompt* (case-insensitively) for known topic keywords and returns
    the matching canned analysis; first match wins.
    """
    # Ordered keyword -> canned-response table; checked in declaration order.
    canned_responses = (
        ("average salary",
         "Based on the filtered candidates, the average salary expectation is approximately **$140,000 USD** among the top 5 candidates. Candidate Alice Smith has the highest score."),
        ("best skills",
         "The top candidates predominantly possess skills in **Python, PyTorch, TensorFlow, and AWS/Azure**. This aligns well with the roles in the AI category."),
    )
    lowered = prompt.lower()
    for keyword, reply in canned_responses:
        if keyword in lowered:
            return reply
    # Fallback when no keyword matched.
    return "I need more context from the question to generate a meaningful analysis. Try asking about salaries, key skills, or location distribution among the top candidates."
23
-
24
  # ----------------------------
25
  # CONFIG
26
  # ----------------------------
27
  JSON_FILE = "form-submissions-1.json"
 
 
 
28
  FILTERED_CSV = "/tmp/filtered_candidates.csv"
 
 
 
 
 
29
 
30
  CATEGORIES = {
31
  "AI": [
@@ -42,112 +35,69 @@ CATEGORIES = {
42
  }
43
 
44
  # ----------------------------
45
- # Similarity Matching Function (Reliable Objective Scoring)
46
- # ----------------------------
47
def calculate_similarity_scores(df_candidates, category_name):
    """Score each candidate's 'Roles' text against the target roles of a category.

    Uses TF-IDF (uni+bigrams, English stop words) plus cosine similarity
    between one synthetic "target document" (all target role titles joined)
    and each candidate's roles text.

    Args:
        df_candidates: DataFrame with a 'Roles' column of free-text roles.
        category_name: key into the module-level CATEGORIES dict.

    Returns:
        pd.Series of floats in [0, 1], aligned with df_candidates.index
        (empty float64 Series for an empty input frame).
    """
    # BUG FIX: the original decorated this with @lru_cache(maxsize=1), but
    # DataFrames are unhashable, so every call raised TypeError before doing
    # any work.  The decorator is removed; callers get a fresh computation.
    if df_candidates.empty:
        return pd.Series([], dtype='float64')

    target_roles = " ".join(CATEGORIES[category_name])
    # Row 0 of the corpus is the target document; rows 1.. are candidates.
    corpus = [target_roles] + df_candidates['Roles'].tolist()

    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(corpus)

    target_vector = tfidf_matrix[0]
    candidate_vectors = tfidf_matrix[1:]
    similarity_scores = cosine_similarity(target_vector, candidate_vectors).flatten()
    # Re-attach the candidates' original index so scores can be assigned back.
    return pd.Series(similarity_scores, index=df_candidates.index)
61
-
62
- # ----------------------------
63
- # Helper: Rank and retrieve Top 5 candidates
64
- # ----------------------------
65
def get_top5_candidates(category_name):
    """Load the Step-1 CSV, score it, and return the 5 best candidates.

    Args:
        category_name: key into the module-level CATEGORIES dict.

    Returns:
        (df_top5, error_msg) — on success error_msg is None and df_top5 holds
        up to 5 rows with Name/Roles/Skills/Salary/Location/Similarity_Score;
        on failure df_top5 is empty and error_msg explains why.
    """
    if not os.path.exists(FILTERED_CSV):
        return pd.DataFrame(), "Error: Filtered CSV not found. Run Step 1 and Step 2 first."

    df_filtered = pd.read_csv(FILTERED_CSV)
    df_filtered = df_filtered[df_filtered["Category"] == category_name]

    if df_filtered.empty:
        return pd.DataFrame(), f"No filtered candidates found for category '{category_name}'."

    # Recalculate scores so the ranking is always consistent with the CSV contents.
    df_filtered["Similarity_Score"] = calculate_similarity_scores(df_filtered, category_name)
    # Drop candidates with essentially no textual overlap (< 1% similarity).
    df_recommended = df_filtered[df_filtered["Similarity_Score"] > 0.01].copy()

    def parse_salary(s):
        # FIX: the original used a bare `except:` and round-tripped "N/A"
        # through str(float('inf')).  Strip currency formatting, then treat
        # anything unparseable (including "N/A") as +inf so it sorts last.
        try:
            return float(str(s).replace("$", "").replace(",", ""))
        except (TypeError, ValueError):
            return float('inf')

    df_recommended["Salary_sort"] = df_recommended["Salary"].apply(parse_salary)

    # Highest similarity first; lower expected salary breaks ties.
    df_top5 = df_recommended.sort_values(
        by=['Similarity_Score', 'Salary_sort'],
        ascending=[False, True],
    ).head(5)

    # Keep only the columns the LLM context needs.
    df_top5 = df_top5[['Name', 'Roles', 'Skills', 'Salary', 'Location', 'Similarity_Score']]
    return df_top5, None
95
-
96
- # ----------------------------
97
- # Step 3: LLM Question Answering (New Feature)
98
  # ----------------------------
99
def ask_llm_about_candidates(question, category_name):
    """
    RAG-style Q&A: serializes the Step-2 top-5 table to markdown and hands it
    to the LLM as the *only* context for answering the founder's question.
    """
    top5, failure = get_top5_candidates(category_name)

    # Guard clauses: surface pipeline problems instead of calling the LLM.
    if failure:
        return f"Cannot run Q&A: {failure}"
    if top5.empty:
        return "No top candidates were identified in Step 2 to provide context for this question."

    # Serialize the ranked table so the model can read it.
    context_table = top5.to_markdown(index=False)

    system_prompt = f"""
    You are an expert Talent Acquisition Analyst. Your task is to analyze the provided table of top-ranked candidates for the '{category_name}' category and answer the founder's question concisely.
    The candidates were ranked based on the keyword match of their roles to the target category.

    **CONTEXT (Top 5 Candidates):**
    ---
    {context_table}
    ---

    **INSTRUCTIONS:**
    1. Base your answer ONLY on the provided CONTEXT table. Do not use external knowledge.
    2. Answer the question in a clear, professional, and business-focused manner.
    3. If the data is insufficient to answer, state that clearly.
    """

    try:
        # MOCK call — swap in the real LLM client here.
        return llm_api_call(f"{system_prompt}\n\nFOUNDER'S QUESTION: {question}")
    except Exception as e:
        return f"LLM API Error: Could not connect or receive a response. Check API key and configuration. Error details: {e}"
138
-
139
-
140
- # --- Other Functions (filter_by_roles, similarity_recommendations, show_first_candidates) ---
141
- # (Keep the rest of the original functions here, unchanged)
142
 
 
 
 
143
  def filter_by_roles(category_name):
144
- # (The body of the original filter_by_roles function)
145
  job_titles = CATEGORIES[category_name]
146
  try:
147
  with open(JSON_FILE, encoding="utf-8") as f:
148
  data = json.load(f)
149
  except FileNotFoundError:
150
- return pd.DataFrame(), f"Error: JSON file '{JSON_FILE}' not found. The application can't proceed."
151
 
152
  filtered = []
153
 
@@ -175,34 +125,84 @@ def filter_by_roles(category_name):
175
  })
176
 
177
  if not filtered:
178
- return pd.DataFrame(), f"No candidates found matching roles for category '{category_name}'. The application can't proceed."
179
 
180
  df = pd.DataFrame(filtered)
181
  df.to_csv(FILTERED_CSV, index=False)
182
- return df, f"{len(df)} candidates filtered by role for category '{category_name}'. Ready for Similarity Ranking."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
def similarity_recommendations(category_name):
    """Format the Step-2 top-5 similarity ranking as a display string."""
    top5, failure = get_top5_candidates(category_name)

    if failure:
        return failure
    if top5.empty:
        return f"All candidates had insufficient text similarity (less than 1%) to the target roles for '{category_name}'. The roles do not match the target category keywords."

    names = top5["Name"].tolist()

    output_text = f"Top {len(names)} Recommended Candidates for the '{category_name}' Category:\n\n"
    # One numbered line per candidate with the match score as a percentage.
    for rank, name in enumerate(names, start=1):
        pct = f"{top5['Similarity_Score'].iloc[rank - 1] * 100:.2f}%"
        output_text += f"{rank}. {name} (Role Match: {pct})\n"

    output_text += "\nThese candidates were ranked objectively based on the **keyword similarity (TF-IDF)** of their previous job roles to the target roles, using expected salary as a tie-breaker."
    return output_text
205
 
 
 
 
206
  def show_first_candidates():
207
  try:
208
  with open(JSON_FILE, encoding="utf-8") as f:
@@ -214,58 +214,31 @@ def show_first_candidates():
214
  return pd.DataFrame({"Error": [f"Failed to load JSON: {e}"]})
215
 
216
  # ----------------------------
217
- # Gradio interface (Final Version with Q&A)
218
  # ----------------------------
219
with gr.Blocks() as app:
    gr.Markdown("# 🏆 Candidate Selection & Founder Analysis")
    gr.Markdown("### **Reliable ranking using objective TF-IDF & Cosine Similarity, plus an LLM-powered Q&A tool.**")

    # Tab 1: the two-step filter + rank pipeline.
    with gr.Tab("1. Candidate Ranking"):
        gr.Markdown("#### 🔍 Raw JSON Preview: First 5 Candidates")
        gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")

        gr.Markdown("---")
        rank_category = gr.Dropdown(list(CATEGORIES.keys()), label="Select Category", value="AI")

        # Step 1: role-based filtering writes the intermediate CSV.
        btn_filter = gr.Button("1. Filter Candidates by Roles (Create CSV)")
        out_filtered = gr.Dataframe(label="Filtered Candidates (Preview)")
        out_filter_status = gr.Textbox(label="Filter Status", placeholder="Click 'Filter Candidates by Roles' to start.")
        btn_filter.click(filter_by_roles, inputs=[rank_category], outputs=[out_filtered, out_filter_status])

        gr.Markdown("---")

        # Step 2: similarity ranking over the filtered CSV.
        btn_rank = gr.Button("2. Rank and Find Top 5 Candidates")
        out_rank = gr.Textbox(label="Top Candidate Recommendations Summary", lines=10, placeholder="Click 'Rank and Find Top 5 Candidates' after Step 1 completes.")
        btn_rank.click(similarity_recommendations, inputs=[rank_category], outputs=[out_rank])

    # Tab 2: free-form Q&A grounded in the top-5 table.
    with gr.Tab("2. Founder Q&A"):
        gr.Markdown("### 🧠 Ask the LLM about the Top Candidates")
        gr.Markdown("The LLM uses the **Top 5 candidates** identified in the 'Candidate Ranking' tab as its sole source of information.")

        qa_category = gr.Dropdown(list(CATEGORIES.keys()), label="Select Category for Q&A", value="AI")

        qa_question = gr.Textbox(
            label="Founder's Question",
            lines=2,
            placeholder="e.g., What is the average expected salary of the top candidates? Or, What are their most common skills?",
            value="What is the average expected salary of the top candidates?",
        )

        btn_qa = gr.Button("3. Get LLM Analysis")

        qa_answer = gr.Textbox(
            label="LLM Response (Context-Based Analysis)",
            lines=8,
            placeholder="The analysis will appear here after you click the button.",
        )

        btn_qa.click(
            ask_llm_about_candidates,
            inputs=[qa_question, qa_category],
            outputs=[qa_answer],
        )

if __name__ == "__main__":
    app.launch(share=True)
 
2
import pandas as pd
import json
import os
import requests
import re
from functools import lru_cache

# ----------------------------
# CONFIG
# ----------------------------
JSON_FILE = "form-submissions-1.json"
MODEL_ID = "google/flan-t5-small"
# NOTE: HF_API_TOKEN MUST be set in your environment variables/Space secrets.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
FILTERED_CSV = "/tmp/filtered_candidates.csv"
OUTPUT_FILE = "/tmp/outputs.csv"
BATCH_SIZE = 50

# FIX: the original `if not HF_API_TOKEN: pass` was dead code.  Warn loudly at
# startup so a missing token shows in the Space logs instead of silently
# degrading every candidate's score to 0.
if not HF_API_TOKEN:
    print("WARNING: HF_API_TOKEN is not set; LLM scoring will return 0 for every candidate.")
22
 
23
  CATEGORIES = {
24
  "AI": [
 
35
  }
36
 
37
  # ----------------------------
38
+ # LLM Call for Scoring (Focus: Role Experience ONLY)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  # ----------------------------
40
@lru_cache(maxsize=512)
def score_candidate(candidate_str, category_name, job_titles_tuple):
    """Ask the HF Inference API to rate one candidate's role fit from 1 to 10.

    All arguments are hashable on purpose so lru_cache can memoize repeat
    candidates across runs.

    Args:
        candidate_str: JSON string of {"Name", "Roles", "Skills"}.
        category_name: target category label shown in the prompt.
        job_titles_tuple: tuple of target job titles for the category.

    Returns:
        int in 1..10 on success; 0 when the token is missing, the request
        fails, or the model output contains no number.
    """
    if not HF_API_TOKEN:
        print("API Token is missing. Returning score 0.")
        return 0

    prompt = f"""
    You are an HR assistant. Your task is to rate a candidate's suitability based ONLY on their previous job roles.
    Rate the suitability of the following candidate on a scale of 1 (Lowest) to 10 (Highest).
    The score must reflect how closely the candidate's 'Roles' align with the target job titles.

    The target roles for the '{category_name}' category are: {list(job_titles_tuple)}

    Candidate JSON: {candidate_str}

    **Task**: Respond ONLY with the rating number (an integer from 1 to 10).
    """
    headers = {"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"}

    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 5,
            "return_full_text": False,
            "temperature": 0.1
        }
    }

    try:
        response = requests.post(
            f"https://api-inference.huggingface.co/models/{MODEL_ID}",
            headers=headers,
            data=json.dumps(payload),
            timeout=60
        )
        response.raise_for_status()
        result = response.json()

        # FIX: the API returns a list of dicts on success but a plain dict
        # (e.g. {"error": ...}) on some failures; the original indexed
        # result[0] unconditionally and relied on the broad except below to
        # hide the resulting crash.  Handle both shapes explicitly.
        if isinstance(result, list) and result:
            generated_text = str(result[0].get("generated_text", "0")).strip()
        elif isinstance(result, dict):
            generated_text = str(result.get("generated_text", "0")).strip()
        else:
            return 0

        # Take the first integer in the reply and clamp it to the 1..10 scale.
        match = re.search(r'\d+', generated_text)
        if match:
            score = int(match.group(0))
            return max(1, min(10, score))

        return 0

    except Exception as e:  # network/API boundary: degrade to score 0, keep the app alive
        print(f"LLM scoring call failed for candidate (API/Network Error): {e}")
        return 0
 
 
 
90
 
91
+ # ----------------------------
92
+ # Step 1: Filter by roles (Unchanged)
93
+ # ----------------------------
94
  def filter_by_roles(category_name):
 
95
  job_titles = CATEGORIES[category_name]
96
  try:
97
  with open(JSON_FILE, encoding="utf-8") as f:
98
  data = json.load(f)
99
  except FileNotFoundError:
100
+ return pd.DataFrame(), f"Error: JSON file '{JSON_FILE}' not found. The LLM can't proceed."
101
 
102
  filtered = []
103
 
 
125
  })
126
 
127
  if not filtered:
128
+ return pd.DataFrame(), f"No candidates found matching roles for category '{category_name}'. The LLM can't proceed."
129
 
130
  df = pd.DataFrame(filtered)
131
  df.to_csv(FILTERED_CSV, index=False)
132
+ return df, f"{len(df)} candidates filtered by role for category '{category_name}'. Ready for LLM scoring."
133
+
134
+
135
+ # ----------------------------
136
+ # Step 2: LLM recommendations (Scoring, Sorting, and Output)
137
+ # ----------------------------
138
def llm_recommendations(category_name):
    """Step 2: score the filtered candidates with the LLM and summarize the top 5.

    Reads the Step-1 CSV (regenerating it via filter_by_roles if missing),
    asks score_candidate() for a 1-10 role-fit rating per person, then ranks
    by score (desc) with expected salary (asc) as the tie-breaker.

    Returns:
        A display string for the Gradio textbox (result summary or error).
    """
    job_titles = CATEGORIES[category_name]

    # Reuse Step-1 output when available; otherwise rebuild it on the fly.
    if not os.path.exists(FILTERED_CSV):
        df_filtered, msg = filter_by_roles(category_name)
        if df_filtered.empty:
            return msg
    else:
        df_filtered = pd.read_csv(FILTERED_CSV)
        df_filtered = df_filtered[df_filtered["Category"] == category_name]

    if df_filtered.empty:
        return f"No filtered candidates found for category '{category_name}'. Run Step 1 first."

    # Score on Name/Roles/Skills only; the JSON string is hashable, which lets
    # score_candidate's lru_cache memoize repeat candidates.
    df_filtered_clean = df_filtered.fillna('N/A')
    scores = []
    for person in df_filtered_clean.to_dict(orient="records"):
        candidate_str = json.dumps({
            "Name": person.get("Name"),
            "Roles": person.get("Roles"),
            "Skills": person.get("Skills"),
        })
        scores.append(score_candidate(candidate_str, category_name, tuple(job_titles)))

    df_filtered["LLM_Score"] = scores

    # Score 0 means "call failed or role irrelevant" — drop those rows.
    df_recommended = df_filtered[df_filtered["LLM_Score"] > 0].copy()

    if df_recommended.empty:
        if not HF_API_TOKEN:
            return "❌ LLM failed: The HF_API_TOKEN is not set or is invalid. Set the token and try again."
        return f"LLM scored all candidates 0. The candidates' roles are deemed irrelevant by the LLM for '{category_name}'."

    def parse_salary(s):
        # FIX: the original used a bare `except:` and round-tripped "N/A"
        # through str(float('inf')).  Strip currency formatting, then treat
        # anything unparseable (including "N/A") as +inf so it sorts last.
        try:
            return float(str(s).replace("$", "").replace(",", ""))
        except (TypeError, ValueError):
            return float('inf')

    df_recommended["Salary_sort"] = df_recommended["Salary"].apply(parse_salary)

    # Highest LLM score first; lower expected salary breaks ties.
    df_top5 = df_recommended.sort_values(
        by=['LLM_Score', 'Salary_sort'],
        ascending=[False, True],
    ).head(5)

    final_names = df_top5["Name"].tolist()

    output_text = f"Top {len(final_names)} Recommended Candidates for the '{category_name}' Category:\n\n"
    for i, name in enumerate(final_names):
        score = df_top5.iloc[i]['LLM_Score']
        output_text += f"{i+1}. {name} (Suitability Score: {score}/10)\n"

    output_text += "\nThese candidates were ranked by the LLM based **only on the alignment of their previous job roles** with the target roles, using expected salary as a tie-breaker."
    return output_text
202
 
203
+ # ----------------------------
204
+ # Show first 5 raw JSON candidates (Unchanged)
205
+ # ----------------------------
206
  def show_first_candidates():
207
  try:
208
  with open(JSON_FILE, encoding="utf-8") as f:
 
214
  return pd.DataFrame({"Error": [f"Failed to load JSON: {e}"]})
215
 
216
  # ----------------------------
217
+ # Gradio interface (Updated Heading and Launch)
218
  # ----------------------------
219
with gr.Blocks() as app:
    gr.Markdown("# 🤖 Candidate Selection (Role-Based Scoring)")

    gr.Markdown("#### 🔍 Raw JSON Preview: First 5 Candidates")
    gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")

    gr.Markdown("---")
    selected_category = gr.Dropdown(list(CATEGORIES.keys()), label="1. Select Category")

    # Step 1: role filter -> CSV + preview table.
    btn_filter = gr.Button("2. Filter Candidates by Roles")
    preview_df = gr.Dataframe(label="Filtered Candidates (Preview)")
    status_box = gr.Textbox(label="Filter Status", placeholder="Click 'Filter Candidates by Roles' to start.")
    btn_filter.click(filter_by_roles, inputs=[selected_category], outputs=[preview_df, status_box])

    gr.Markdown("---")

    # Step 2: LLM scoring and top-5 summary.
    btn_llm = gr.Button("3. Get LLM Recommendations (Role Experience Ranking)")
    summary_box = gr.Textbox(label="Top Candidate Recommendations Summary", lines=10, placeholder="Click 'Get LLM Recommendations' after Step 2 completes.")
    btn_llm.click(llm_recommendations, inputs=[selected_category], outputs=[summary_box])

if __name__ == "__main__":
    # share=True publishes a public Gradio link.
    app.launch(share=True)