curiouscurrent committed on
Commit
daf3997
·
verified ·
1 Parent(s): a61aac1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -80
app.py CHANGED
@@ -7,14 +7,26 @@ from sklearn.feature_extraction.text import TfidfVectorizer
7
  from sklearn.metrics.pairwise import cosine_similarity
8
  from functools import lru_cache
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  # ----------------------------
11
  # CONFIG
12
  # ----------------------------
13
  JSON_FILE = "form-submissions-1.json"
14
  FILTERED_CSV = "/tmp/filtered_candidates.csv"
15
 
16
- # The HF_API_TOKEN and LLM-related variables are now completely removed.
17
-
18
  CATEGORIES = {
19
  "AI": [
20
  "AI/ML Ops Engineer","Senior Machine Learning Engineer","Principal Data Scientist",
@@ -34,40 +46,102 @@ CATEGORIES = {
34
  # ----------------------------
35
  @lru_cache(maxsize=1)
36
  def calculate_similarity_scores(df_candidates, category_name):
37
- """
38
- Calculates the cosine similarity between candidate roles and target job titles
39
- using TF-IDF for keyword matching based on importance.
40
- """
41
  if df_candidates.empty:
42
  return pd.Series([], dtype='float64')
43
-
44
- # 1. Define the document corpus
45
- # Combines all target roles into one reference text
46
  target_roles = " ".join(CATEGORIES[category_name])
47
  candidate_roles = df_candidates['Roles'].tolist()
48
-
49
- # 2. Create the corpus for vectorization
50
  corpus = [target_roles] + candidate_roles
51
-
52
- # 3. Vectorize using TF-IDF (converts text to numerical features)
53
- # ngrams help match multi-word phrases like 'Data Scientist'
54
  vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
55
  tfidf_matrix = vectorizer.fit_transform(corpus)
56
-
57
- # 4. Extract the vector for the target roles (the first row)
58
  target_vector = tfidf_matrix[0]
59
  candidate_vectors = tfidf_matrix[1:]
60
-
61
- # 5. Calculate Cosine Similarity (score ranges from 0.0 to 1.0)
62
  similarity_scores = cosine_similarity(target_vector, candidate_vectors).flatten()
63
-
64
- # Return scores as a Pandas Series aligned with the DataFrame index
65
  return pd.Series(similarity_scores, index=df_candidates.index)
66
 
67
  # ----------------------------
68
- # Step 1: Filter by roles (Unchanged)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  # ----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  def filter_by_roles(category_name):
 
71
  job_titles = CATEGORIES[category_name]
72
  try:
73
  with open(JSON_FILE, encoding="utf-8") as f:
@@ -81,7 +155,6 @@ def filter_by_roles(category_name):
81
  work_exps = person.get("work_experiences", [])
82
  if not work_exps:
83
  continue
84
- # Filter to get relevant job titles from the work experience
85
  non_fullstack_roles = [
86
  exp.get("roleName") for exp in work_exps
87
  if exp.get("roleName") and "full stack developer" not in exp.get("roleName").lower()
@@ -89,7 +162,6 @@ def filter_by_roles(category_name):
89
  if not non_fullstack_roles:
90
  continue
91
 
92
- # Initial check: filter only candidates who have *at least one* target role
93
  if any(role in job_titles for role in non_fullstack_roles):
94
  filtered.append({
95
  "Name": person.get("name"),
@@ -109,54 +181,21 @@ def filter_by_roles(category_name):
109
  df.to_csv(FILTERED_CSV, index=False)
110
  return df, f"{len(df)} candidates filtered by role for category '{category_name}'. Ready for Similarity Ranking."
111
 
112
-
113
- # ----------------------------
114
- # Step 2: Recommendations (Using Similarity Matching)
115
- # ----------------------------
116
  def similarity_recommendations(category_name):
117
- if not os.path.exists(FILTERED_CSV):
118
- df_filtered, msg = filter_by_roles(category_name)
119
- if df_filtered.empty:
120
- return msg
121
- else:
122
- df_filtered = pd.read_csv(FILTERED_CSV)
123
- df_filtered = df_filtered[df_filtered["Category"] == category_name]
124
-
125
- if df_filtered.empty:
126
- return f"No filtered candidates found for category '{category_name}'. Run Step 1 first."
127
-
128
- # --- CORE SCORING ---
129
- df_filtered["Similarity_Score"] = calculate_similarity_scores(df_filtered, category_name)
130
 
131
- # Filter out candidates with near-zero relevance (score < 0.01)
132
- df_recommended = df_filtered[df_filtered["Similarity_Score"] > 0.01].copy()
133
 
134
- if df_recommended.empty:
135
  return f"All candidates had insufficient text similarity (less than 1%) to the target roles for '{category_name}'. The roles do not match the target category keywords."
136
 
137
- # Define salary parsing for tie-breaker
138
- def parse_salary(s):
139
- try:
140
- # Replaces '$', ',', and sets 'N/A' to infinity for sorting purposes
141
- return float(str(s).replace("$","").replace(",","").replace("N/A", str(float('inf'))))
142
- except:
143
- return float('inf')
144
-
145
- df_recommended["Salary_sort"] = df_recommended["Salary"].apply(parse_salary)
146
-
147
- # Sort: 1. Highest Similarity Score (descending), 2. Lowest Salary (ascending)
148
- df_top5 = df_recommended.sort_values(
149
- by=['Similarity_Score', 'Salary_sort'],
150
- ascending=[False, True]
151
- ).head(5)
152
-
153
  final_names = df_top5["Name"].tolist()
154
 
155
  output_text = f"Top {len(final_names)} Recommended Candidates for the '{category_name}' Category:\n\n"
156
 
157
  for i, name in enumerate(final_names):
158
  score = df_top5.iloc[i]['Similarity_Score']
159
- # Display the score as a percentage for readability
160
  score_percent = f"{score * 100:.2f}%"
161
  output_text += f"{i+1}. {name} (Role Match: {score_percent})\n"
162
 
@@ -164,9 +203,6 @@ def similarity_recommendations(category_name):
164
 
165
  return output_text
166
 
167
- # ----------------------------
168
- # Show first 5 raw JSON candidates (Unchanged)
169
- # ----------------------------
170
  def show_first_candidates():
171
  try:
172
  with open(JSON_FILE, encoding="utf-8") as f:
@@ -178,30 +214,58 @@ def show_first_candidates():
178
  return pd.DataFrame({"Error": [f"Failed to load JSON: {e}"]})
179
 
180
  # ----------------------------
181
- # Gradio interface (Final Version)
182
  # ----------------------------
183
  with gr.Blocks() as app:
184
- gr.Markdown("# 🏆 Candidate Selection (Keyword Similarity Matching)")
185
- gr.Markdown("### **Reliable ranking using objective TF-IDF & Cosine Similarity for keyword overlap.**")
186
 
187
- gr.Markdown("#### 🔍 Raw JSON Preview: First 5 Candidates")
188
- gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")
 
 
 
 
189
 
190
- gr.Markdown("---")
191
- category_dropdown = gr.Dropdown(list(CATEGORIES.keys()), label="1. Select Category")
 
 
 
192
 
193
- # Step 1: Filter by roles
194
- filter_button = gr.Button("2. Filter Candidates by Roles")
195
- filtered_df = gr.Dataframe(label="Filtered Candidates (Preview)")
196
- filter_status = gr.Textbox(label="Filter Status", placeholder="Click 'Filter Candidates by Roles' to start.")
197
- filter_button.click(filter_by_roles, inputs=[category_dropdown], outputs=[filtered_df, filter_status])
198
 
199
- gr.Markdown("---")
 
 
 
200
 
201
- # Step 2: Recommendations
202
- recommend_button = gr.Button("3. Rank Candidates by Role Keyword Match")
203
- recommend_output_text = gr.Textbox(label="Top Candidate Recommendations Summary", lines=10, placeholder="Click 'Rank Candidates by Role Keyword Match' after Step 2 completes.")
204
- recommend_button.click(similarity_recommendations, inputs=[category_dropdown], outputs=[recommend_output_text])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  if __name__ == "__main__":
207
  app.launch(share=True)
 
7
  from sklearn.metrics.pairwise import cosine_similarity
8
  from functools import lru_cache
9
 
10
+ # *** IMPORTANT: YOU MUST REPLACE THIS WITH YOUR ACTUAL LLM CLIENT/API ***
11
+ # For demonstration, we will use a mock function, but in reality,
12
+ # you'd use a library like 'openai', 'google-genai', or 'llama-cpp-python'.
13
+
14
+ # MOCK LLM CLIENT (Replace with actual LLM API call)
def llm_api_call(prompt):
    """Return a canned reply standing in for a real LLM API call.

    The prompt is matched case-insensitively against a short list of
    keywords; the first keyword found selects the reply. When nothing
    matches, a generic "need more context" reply is returned.
    """
    canned_replies = (
        (
            "average salary",
            "Based on the filtered candidates, the average salary expectation is approximately **$140,000 USD** among the top 5 candidates. Candidate Alice Smith has the highest score.",
        ),
        (
            "best skills",
            "The top candidates predominantly possess skills in **Python, PyTorch, TensorFlow, and AWS/Azure**. This aligns well with the roles in the AI category.",
        ),
    )
    lowered = prompt.lower()
    for keyword, reply in canned_replies:
        if keyword in lowered:
            return reply
    return "I need more context from the question to generate a meaningful analysis. Try asking about salaries, key skills, or location distribution among the top candidates."
23
+
24
  # ----------------------------
25
  # CONFIG
26
  # ----------------------------
27
  JSON_FILE = "form-submissions-1.json"
28
  FILTERED_CSV = "/tmp/filtered_candidates.csv"
29
 
 
 
30
  CATEGORIES = {
31
  "AI": [
32
  "AI/ML Ops Engineer","Senior Machine Learning Engineer","Principal Data Scientist",
 
46
  # ----------------------------
def calculate_similarity_scores(df_candidates, category_name):
    """Score each candidate's roles against the target titles of a category.

    The target job titles in ``CATEGORIES[category_name]`` are joined into one
    reference document. That document and every candidate's ``Roles`` text are
    vectorized together with TF-IDF (unigrams and bigrams, English stop words
    removed), and the cosine similarity between the reference document and
    each candidate document is returned.

    This function is deliberately not wrapped in ``functools.lru_cache``:
    the cache hashes its arguments and a DataFrame is unhashable, so a cached
    version raises ``TypeError`` on every call.

    Args:
        df_candidates: DataFrame with a ``Roles`` column.
        category_name: Key into ``CATEGORIES``.

    Returns:
        A float Series of similarity scores indexed like ``df_candidates``;
        an empty float64 Series when ``df_candidates`` is empty.
    """
    if df_candidates.empty:
        return pd.Series([], dtype='float64')

    target_roles = " ".join(CATEGORIES[category_name])
    # Missing or non-string role cells (e.g. NaN after a CSV round-trip) would
    # break the vectorizer's text preprocessing, so coerce them to plain text.
    candidate_roles = df_candidates['Roles'].fillna("").astype(str).tolist()

    # Row 0 of the matrix is the reference document; the rest are candidates.
    corpus = [target_roles] + candidate_roles
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(corpus)

    similarity_scores = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1:]).flatten()
    return pd.Series(similarity_scores, index=df_candidates.index)
61
 
62
  # ----------------------------
63
+ # Helper: Rank and retrieve Top 5 candidates
64
+ # ----------------------------
def get_top5_candidates(category_name):
    """Load the filtered candidates of a category and return the five best.

    Reads ``FILTERED_CSV``, keeps the rows whose ``Category`` equals
    ``category_name``, scores them with ``calculate_similarity_scores``,
    discards rows scoring 0.01 or less, and sorts the remainder by similarity
    (descending) with the parsed salary (ascending) as tie-breaker.

    Args:
        category_name: Value matched against the ``Category`` column and
            passed on to the scorer.

    Returns:
        A ``(dataframe, error_message)`` tuple. On failure the DataFrame is
        empty and ``error_message`` is a string. On success ``error_message``
        is ``None`` and the DataFrame holds at most five rows with the columns
        Name, Roles, Skills, Salary, Location and Similarity_Score; it can
        still be empty when no candidate exceeds the score threshold.
    """
    if not os.path.exists(FILTERED_CSV):
        return pd.DataFrame(), "Error: Filtered CSV not found. Run Step 1 and Step 2 first."

    df_filtered = pd.read_csv(FILTERED_CSV)
    # Copy the selection so the score column is added to an independent frame
    # instead of being assigned onto a slice of the frame read from disk.
    df_filtered = df_filtered[df_filtered["Category"] == category_name].copy()

    if df_filtered.empty:
        return pd.DataFrame(), f"No filtered candidates found for category '{category_name}'."

    df_filtered["Similarity_Score"] = calculate_similarity_scores(df_filtered, category_name)
    df_recommended = df_filtered[df_filtered["Similarity_Score"] > 0.01].copy()

    def parse_salary(s):
        """Turn a salary cell into a float; unparseable values sort last."""
        try:
            # "N/A" becomes "inf" so that such rows lose salary tie-breaks.
            return float(str(s).replace("$", "").replace(",", "").replace("N/A", "inf"))
        except ValueError:
            return float('inf')

    df_recommended["Salary_sort"] = df_recommended["Salary"].apply(parse_salary)

    df_top5 = df_recommended.sort_values(
        by=['Similarity_Score', 'Salary_sort'],
        ascending=[False, True],
    ).head(5)

    # Keep only the columns that are shown to the user or handed to the LLM.
    df_top5 = df_top5[['Name', 'Roles', 'Skills', 'Salary', 'Location', 'Similarity_Score']]
    return df_top5, None
95
+
96
+ # ----------------------------
97
+ # Step 3: LLM Question Answering (New Feature)
98
  # ----------------------------
def ask_llm_about_candidates(question, category_name):
    """Answer a question about the top-ranked candidates of a category.

    The top candidates returned by ``get_top5_candidates`` are serialized
    into a table, embedded in a prompt together with the question, and the
    prompt is passed to ``llm_api_call``.

    Args:
        question: The founder's free-text question.
        category_name: Category whose top candidates provide the context.

    Returns:
        The LLM's reply, or a human-readable message when the question is
        blank, no candidates are available, or the LLM call fails.
    """
    # Nothing useful can be asked with an empty question; skip the lookup
    # and the LLM call entirely.
    if not (question or "").strip():
        return "Please enter a question about the top candidates."

    df_top5, error_msg = get_top5_candidates(category_name)

    if error_msg:
        return f"Cannot run Q&A: {error_msg}"

    if df_top5.empty:
        return "No top candidates were identified in Step 2 to provide context for this question."

    # Serialize the candidates for the prompt. to_markdown() depends on the
    # optional 'tabulate' package, so fall back to a plain-text table.
    try:
        candidate_context = df_top5.to_markdown(index=False)
    except ImportError:
        candidate_context = df_top5.to_string(index=False)

    system_prompt = f"""
You are an expert Talent Acquisition Analyst. Your task is to analyze the provided table of top-ranked candidates for the '{category_name}' category and answer the founder's question concisely.
The candidates were ranked based on the keyword match of their roles to the target category.

**CONTEXT (Top 5 Candidates):**
---
{candidate_context}
---

**INSTRUCTIONS:**
1. Base your answer ONLY on the provided CONTEXT table. Do not use external knowledge.
2. Answer the question in a clear, professional, and business-focused manner.
3. If the data is insufficient to answer, state that clearly.
"""

    full_prompt = f"{system_prompt}\n\nFOUNDER'S QUESTION: {question}"

    # UI boundary: report any failure of the LLM call as text instead of
    # letting the exception propagate to the interface.
    try:
        return llm_api_call(full_prompt)
    except Exception as e:
        return f"LLM API Error: Could not connect or receive a response. Check API key and configuration. Error details: {e}"
138
+
139
+
140
+ # --- Other Functions (filter_by_roles, similarity_recommendations, show_first_candidates) ---
141
+ # (Keep the rest of the original functions here, unchanged)
142
+
143
  def filter_by_roles(category_name):
144
+ # (The body of the original filter_by_roles function)
145
  job_titles = CATEGORIES[category_name]
146
  try:
147
  with open(JSON_FILE, encoding="utf-8") as f:
 
155
  work_exps = person.get("work_experiences", [])
156
  if not work_exps:
157
  continue
 
158
  non_fullstack_roles = [
159
  exp.get("roleName") for exp in work_exps
160
  if exp.get("roleName") and "full stack developer" not in exp.get("roleName").lower()
 
162
  if not non_fullstack_roles:
163
  continue
164
 
 
165
  if any(role in job_titles for role in non_fullstack_roles):
166
  filtered.append({
167
  "Name": person.get("name"),
 
181
  df.to_csv(FILTERED_CSV, index=False)
182
  return df, f"{len(df)} candidates filtered by role for category '{category_name}'. Ready for Similarity Ranking."
183
 
 
 
 
 
184
  def similarity_recommendations(category_name):
185
+ df_top5, error_msg = get_top5_candidates(category_name)
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
+ if error_msg:
188
+ return error_msg
189
 
190
+ if df_top5.empty:
191
  return f"All candidates had insufficient text similarity (less than 1%) to the target roles for '{category_name}'. The roles do not match the target category keywords."
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  final_names = df_top5["Name"].tolist()
194
 
195
  output_text = f"Top {len(final_names)} Recommended Candidates for the '{category_name}' Category:\n\n"
196
 
197
  for i, name in enumerate(final_names):
198
  score = df_top5.iloc[i]['Similarity_Score']
 
199
  score_percent = f"{score * 100:.2f}%"
200
  output_text += f"{i+1}. {name} (Role Match: {score_percent})\n"
201
 
 
203
 
204
  return output_text
205
 
 
 
 
206
  def show_first_candidates():
207
  try:
208
  with open(JSON_FILE, encoding="utf-8") as f:
 
214
  return pd.DataFrame({"Error": [f"Failed to load JSON: {e}"]})
215
 
216
  # ----------------------------
217
+ # Gradio interface (Final Version with Q&A)
218
  # ----------------------------
with gr.Blocks() as app:
    # Page header shown above both tabs.
    gr.Markdown(value="# 🏆 Candidate Selection & Founder Analysis")
    gr.Markdown(
        value="### **Reliable ranking using objective TF-IDF & Cosine Similarity, plus an LLM-powered Q&A tool.**"
    )

    with gr.Tab("1. Candidate Ranking"):
        gr.Markdown(value="#### 🔍 Raw JSON Preview: First 5 Candidates")
        gr.Dataframe(value=show_first_candidates(), label="First 5 JSON Entries")

        gr.Markdown(value="---")
        category_dropdown = gr.Dropdown(
            choices=list(CATEGORIES.keys()),
            label="Select Category",
            value="AI",
        )

        # Step 1: filter the raw submissions by role and show the result.
        filter_button = gr.Button(value="1. Filter Candidates by Roles (Create CSV)")
        filtered_df = gr.Dataframe(label="Filtered Candidates (Preview)")
        filter_status = gr.Textbox(
            label="Filter Status",
            placeholder="Click 'Filter Candidates by Roles' to start.",
        )
        filter_button.click(
            fn=filter_by_roles,
            inputs=[category_dropdown],
            outputs=[filtered_df, filter_status],
        )

        gr.Markdown(value="---")

        # Step 2: rank the filtered candidates and summarize the top five.
        recommend_button = gr.Button(value="2. Rank and Find Top 5 Candidates")
        recommend_output_text = gr.Textbox(
            label="Top Candidate Recommendations Summary",
            lines=10,
            placeholder="Click 'Rank and Find Top 5 Candidates' after Step 1 completes.",
        )
        recommend_button.click(
            fn=similarity_recommendations,
            inputs=[category_dropdown],
            outputs=[recommend_output_text],
        )

    with gr.Tab("2. Founder Q&A"):
        gr.Markdown(value="### 🧠 Ask the LLM about the Top Candidates")
        gr.Markdown(
            value="The LLM uses the **Top 5 candidates** identified in the 'Candidate Ranking' tab as its sole source of information."
        )

        # This tab has its own category selector, separate from the ranking tab.
        qa_category_dropdown = gr.Dropdown(
            choices=list(CATEGORIES.keys()),
            label="Select Category for Q&A",
            value="AI",
        )

        founder_question = gr.Textbox(
            label="Founder's Question",
            lines=2,
            placeholder="e.g., What is the average expected salary of the top candidates? Or, What are their most common skills?",
            value="What is the average expected salary of the top candidates?",
        )

        qa_button = gr.Button(value="3. Get LLM Analysis")

        llm_response_text = gr.Textbox(
            label="LLM Response (Context-Based Analysis)",
            lines=8,
            placeholder="The analysis will appear here after you click the button.",
        )

        qa_button.click(
            fn=ask_llm_about_candidates,
            inputs=[founder_question, qa_category_dropdown],
            outputs=[llm_response_text],
        )

if __name__ == "__main__":
    app.launch(share=True)