curiouscurrent committed on
Commit
482309a
·
verified ·
1 Parent(s): c814146

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -102
app.py CHANGED
@@ -2,25 +2,18 @@ import gradio as gr
2
  import pandas as pd
3
  import json
4
  import os
5
- import requests
6
  import re
 
 
7
  from functools import lru_cache
8
 
9
  # ----------------------------
10
  # CONFIG
11
  # ----------------------------
12
  JSON_FILE = "form-submissions-1.json"
13
- # 🚩 CHANGE: Switched to a more capable, instruction-tuned model for semantic matching
14
- MODEL_ID = "google/flan-t5-large"
15
- # NOTE: HF_API_TOKEN MUST be set in your environment variables/Space secrets.
16
- HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
17
  FILTERED_CSV = "/tmp/filtered_candidates.csv"
18
- OUTPUT_FILE = "/tmp/outputs.csv"
19
- BATCH_SIZE = 50
20
 
21
- if not HF_API_TOKEN:
22
- # Allow launch for demonstration, but function will warn if token is missing
23
- pass
24
 
25
  CATEGORIES = {
26
  "AI": [
@@ -37,60 +30,39 @@ CATEGORIES = {
37
  }
38
 
39
  # ----------------------------
40
- # LLM Call for Semantic Role Scoring
41
  # ----------------------------
42
- @lru_cache(maxsize=512)
43
- def score_candidate(candidate_str, category_name, job_titles_tuple):
44
- if not HF_API_TOKEN:
45
- print("API Token is missing. Returning score 0.")
46
- return 0
47
-
48
- # 🚩 PROMPT CHANGE: Focus on 'semantic relevance' and 'conceptual fit'
49
- prompt = f"""
50
- You are an HR expert performing semantic matching. Your task is to rate a candidate's conceptual fit based ONLY on their previous job roles and the target roles.
51
- Rate the semantic relevance of the candidate's 'Roles' to the 'Target Roles' on a scale of 1 (Lowest Match) to 10 (Highest Semantic Match).
52
- The score must reflect the conceptual alignment and industry similarity, not just keyword presence.
53
-
54
- The target roles for the '{category_name}' category are: {list(job_titles_tuple)}
55
-
56
- Candidate JSON: {candidate_str}
57
-
58
- **Task**: Respond ONLY with the rating number (an integer from 1 to 10).
59
- """
60
- headers = {"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"}
61
 
62
- payload = {
63
- "inputs": prompt,
64
- "parameters": {
65
- "max_new_tokens": 5,
66
- "return_full_text": False,
67
- "temperature": 0.1
68
- }
69
- }
70
 
71
- try:
72
- # Note: Flan-T5-Large is slower than small, but more powerful for this task
73
- response = requests.post(
74
- f"https://api-inference.huggingface.co/models/{MODEL_ID}",
75
- headers=headers,
76
- data=json.dumps(payload),
77
- timeout=120 # Increased timeout for the larger model
78
- )
79
- response.raise_for_status()
80
- result = response.json()
81
-
82
- generated_text = result[0].get("generated_text", "0").strip()
83
-
84
- match = re.search(r'\d+', generated_text)
85
- if match:
86
- score = int(match.group(0))
87
- return max(1, min(10, score))
88
-
89
- return 0
90
-
91
- except Exception as e:
92
- print(f"LLM scoring call failed for candidate (API/Network Error): {e}")
93
- return 0
94
 
95
  # ----------------------------
96
  # Step 1: Filter by roles (Unchanged)
@@ -109,6 +81,7 @@ def filter_by_roles(category_name):
109
  work_exps = person.get("work_experiences", [])
110
  if not work_exps:
111
  continue
 
112
  non_fullstack_roles = [
113
  exp.get("roleName") for exp in work_exps
114
  if exp.get("roleName") and "full stack developer" not in exp.get("roleName").lower()
@@ -116,6 +89,7 @@ def filter_by_roles(category_name):
116
  if not non_fullstack_roles:
117
  continue
118
 
 
119
  if any(role in job_titles for role in non_fullstack_roles):
120
  filtered.append({
121
  "Name": person.get("name"),
@@ -133,15 +107,13 @@ def filter_by_roles(category_name):
133
 
134
  df = pd.DataFrame(filtered)
135
  df.to_csv(FILTERED_CSV, index=False)
136
- return df, f"{len(df)} candidates filtered by role for category '{category_name}'. Ready for LLM Semantic Scoring."
137
 
138
 
139
  # ----------------------------
140
- # Step 2: LLM recommendations (Semantic Scoring, Sorting, and Output)
141
  # ----------------------------
142
- def llm_recommendations(category_name):
143
- job_titles = CATEGORIES[category_name]
144
-
145
  if not os.path.exists(FILTERED_CSV):
146
  df_filtered, msg = filter_by_roles(category_name)
147
  if df_filtered.empty:
@@ -153,45 +125,28 @@ def llm_recommendations(category_name):
153
  if df_filtered.empty:
154
  return f"No filtered candidates found for category '{category_name}'. Run Step 1 first."
155
 
156
- df_filtered_clean = df_filtered.fillna('N/A')
157
- filtered_candidates = df_filtered_clean.to_dict(orient="records")
158
 
159
- scores = []
160
-
161
- for person in filtered_candidates:
162
- candidate_info = {
163
- "Name": person.get("Name"),
164
- "Roles": person.get("Roles"),
165
- "Skills": person.get("Skills")
166
- }
167
- candidate_str = json.dumps(candidate_info)
168
-
169
- score = score_candidate(candidate_str, category_name, tuple(job_titles))
170
- scores.append(score)
171
-
172
- df_filtered["LLM_Score"] = scores
173
-
174
- # Only filter out scores of 0 if the token is present (0 means total irrelevance if token works)
175
- if HF_API_TOKEN:
176
- df_recommended = df_filtered[df_filtered["LLM_Score"] > 0].copy()
177
- else:
178
- df_recommended = df_filtered.copy() # Can't filter if all are 0 due to no token
179
 
180
  if df_recommended.empty:
181
- if not HF_API_TOKEN:
182
- return "❌ LLM failed: The HF_API_TOKEN is not set or is invalid. Set the token and try again."
183
- return f"LLM scored all candidates 0. This indicates zero semantic relevance between the candidates' roles and the target roles for '{category_name}'."
184
 
 
185
  def parse_salary(s):
186
  try:
 
187
  return float(str(s).replace("$","").replace(",","").replace("N/A", str(float('inf'))))
188
  except:
189
  return float('inf')
190
 
191
  df_recommended["Salary_sort"] = df_recommended["Salary"].apply(parse_salary)
192
 
 
193
  df_top5 = df_recommended.sort_values(
194
- by=['LLM_Score', 'Salary_sort'],
195
  ascending=[False, True]
196
  ).head(5)
197
 
@@ -200,10 +155,12 @@ def llm_recommendations(category_name):
200
  output_text = f"Top {len(final_names)} Recommended Candidates for the '{category_name}' Category:\n\n"
201
 
202
  for i, name in enumerate(final_names):
203
- score = df_top5.iloc[i]['LLM_Score']
204
- output_text += f"{i+1}. {name} (Semantic Role Match Score: {score}/10)\n"
 
 
205
 
206
- output_text += "\nThese candidates were ranked by the LLM based on the **conceptual fit (semantic similarity)** of their previous job roles to the target roles, using expected salary as a tie-breaker."
207
 
208
  return output_text
209
 
@@ -221,11 +178,11 @@ def show_first_candidates():
221
  return pd.DataFrame({"Error": [f"Failed to load JSON: {e}"]})
222
 
223
  # ----------------------------
224
- # Gradio interface (Updated Heading and Launch)
225
  # ----------------------------
226
  with gr.Blocks() as app:
227
- gr.Markdown("# 🏆 Candidate Selection (Semantic Role Matching)")
228
- gr.Markdown("### **Uses a large instruction model to score conceptual fit and similarity between roles.**")
229
 
230
  gr.Markdown("#### 🔍 Raw JSON Preview: First 5 Candidates")
231
  gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")
@@ -241,10 +198,10 @@ with gr.Blocks() as app:
241
 
242
  gr.Markdown("---")
243
 
244
- # Step 2: LLM Recommendations
245
- recommend_button = gr.Button("3. Rank Candidates by Semantic Role Match")
246
- recommend_output_text = gr.Textbox(label="Top Candidate Recommendations Summary", lines=10, placeholder="Click 'Rank Candidates by Semantic Role Match' after Step 2 completes.")
247
- recommend_button.click(llm_recommendations, inputs=[category_dropdown], outputs=[recommend_output_text])
248
 
249
  if __name__ == "__main__":
250
  app.launch(share=True)
 
2
  import pandas as pd
3
  import json
4
  import os
 
5
  import re
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
  from functools import lru_cache
9
 
10
  # ----------------------------
11
  # CONFIG
12
  # ----------------------------
13
  JSON_FILE = "form-submissions-1.json"
 
 
 
 
14
  FILTERED_CSV = "/tmp/filtered_candidates.csv"
 
 
15
 
16
+ # The HF_API_TOKEN and LLM-related variables are now completely removed.
 
 
17
 
18
  CATEGORIES = {
19
  "AI": [
 
30
  }
31
 
32
  # ----------------------------
33
+ # Similarity Matching Function (Reliable Objective Scoring)
34
  # ----------------------------
35
def calculate_similarity_scores(df_candidates, category_name):
    """
    Score each candidate's roles against the target job titles of a category.

    All target titles in ``CATEGORIES[category_name]`` are joined into one
    reference document. That document and every candidate's ``Roles`` text
    are vectorized together with TF-IDF (unigrams and bigrams, English stop
    words removed), and the cosine similarity between the reference document
    and each candidate document is returned.

    Args:
        df_candidates: DataFrame with a ``Roles`` column holding each
            candidate's role text. Missing or non-string entries are treated
            as text via ``fillna("")`` / ``astype(str)`` (presumably they can
            appear after the CSV round trip of the filtered candidates --
            confirm against the caller).
        category_name: Key into the module-level ``CATEGORIES`` mapping.

    Returns:
        A float ``pd.Series`` of similarity scores (0.0 to 1.0) indexed like
        ``df_candidates``. An empty float64 Series is returned when
        ``df_candidates`` is empty.

    Raises:
        KeyError: If ``category_name`` is not a key of ``CATEGORIES``.
    """
    # NOTE: this function is intentionally NOT wrapped in functools.lru_cache.
    # lru_cache hashes its arguments, and a pandas DataFrame is unhashable, so
    # the cached version raised TypeError on every call.
    if df_candidates.empty:
        return pd.Series([], dtype='float64')

    # 1. Build the reference document from all target roles of the category.
    target_roles = " ".join(CATEGORIES[category_name])

    # TfidfVectorizer only accepts strings; coerce NaN / non-string cells so a
    # single bad row cannot abort the whole ranking.
    candidate_roles = df_candidates['Roles'].fillna("").astype(str).tolist()

    # 2. The reference document goes first so it ends up in row 0 of the matrix.
    corpus = [target_roles] + candidate_roles

    # 3. Vectorize with TF-IDF; bigrams help match multi-word titles such as
    #    'Data Scientist'.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # 4. Split the matrix back into the reference row and the candidate rows.
    target_vector = tfidf_matrix[0]
    candidate_vectors = tfidf_matrix[1:]

    # 5. One similarity value per candidate, in the original row order.
    similarity_scores = cosine_similarity(target_vector, candidate_vectors).flatten()

    # Re-attach the DataFrame index so the scores can be assigned as a column.
    return pd.Series(similarity_scores, index=df_candidates.index)
 
 
 
 
 
 
 
 
 
66
 
67
  # ----------------------------
68
  # Step 1: Filter by roles (Unchanged)
 
81
  work_exps = person.get("work_experiences", [])
82
  if not work_exps:
83
  continue
84
+ # Filter to get relevant job titles from the work experience
85
  non_fullstack_roles = [
86
  exp.get("roleName") for exp in work_exps
87
  if exp.get("roleName") and "full stack developer" not in exp.get("roleName").lower()
 
89
  if not non_fullstack_roles:
90
  continue
91
 
92
+ # Initial check: filter only candidates who have *at least one* target role
93
  if any(role in job_titles for role in non_fullstack_roles):
94
  filtered.append({
95
  "Name": person.get("name"),
 
107
 
108
  df = pd.DataFrame(filtered)
109
  df.to_csv(FILTERED_CSV, index=False)
110
+ return df, f"{len(df)} candidates filtered by role for category '{category_name}'. Ready for Similarity Ranking."
111
 
112
 
113
  # ----------------------------
114
+ # Step 2: Recommendations (Using Similarity Matching)
115
  # ----------------------------
116
+ def similarity_recommendations(category_name):
 
 
117
  if not os.path.exists(FILTERED_CSV):
118
  df_filtered, msg = filter_by_roles(category_name)
119
  if df_filtered.empty:
 
125
  if df_filtered.empty:
126
  return f"No filtered candidates found for category '{category_name}'. Run Step 1 first."
127
 
128
+ # --- CORE SCORING ---
129
+ df_filtered["Similarity_Score"] = calculate_similarity_scores(df_filtered, category_name)
130
 
131
+ # Filter out candidates with near-zero relevance (score < 0.01)
132
+ df_recommended = df_filtered[df_filtered["Similarity_Score"] > 0.01].copy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  if df_recommended.empty:
135
+ return f"All candidates had insufficient text similarity (less than 1%) to the target roles for '{category_name}'. The roles do not match the target category keywords."
 
 
136
 
137
+ # Define salary parsing for tie-breaker
138
  def parse_salary(s):
139
  try:
140
+ # Replaces '$', ',', and sets 'N/A' to infinity for sorting purposes
141
  return float(str(s).replace("$","").replace(",","").replace("N/A", str(float('inf'))))
142
  except:
143
  return float('inf')
144
 
145
  df_recommended["Salary_sort"] = df_recommended["Salary"].apply(parse_salary)
146
 
147
+ # Sort: 1. Highest Similarity Score (descending), 2. Lowest Salary (ascending)
148
  df_top5 = df_recommended.sort_values(
149
+ by=['Similarity_Score', 'Salary_sort'],
150
  ascending=[False, True]
151
  ).head(5)
152
 
 
155
  output_text = f"Top {len(final_names)} Recommended Candidates for the '{category_name}' Category:\n\n"
156
 
157
  for i, name in enumerate(final_names):
158
+ score = df_top5.iloc[i]['Similarity_Score']
159
+ # Display the score as a percentage for readability
160
+ score_percent = f"{score * 100:.2f}%"
161
+ output_text += f"{i+1}. {name} (Role Match: {score_percent})\n"
162
 
163
+ output_text += "\nThese candidates were ranked objectively based on the **keyword similarity (TF-IDF)** of their previous job roles to the target roles, using expected salary as a tie-breaker."
164
 
165
  return output_text
166
 
 
178
  return pd.DataFrame({"Error": [f"Failed to load JSON: {e}"]})
179
 
180
  # ----------------------------
181
+ # Gradio interface (Final Version)
182
  # ----------------------------
183
  with gr.Blocks() as app:
184
+ gr.Markdown("# 🏆 Candidate Selection (Keyword Similarity Matching)")
185
+ gr.Markdown("### **Reliable ranking using objective TF-IDF & Cosine Similarity for keyword overlap.**")
186
 
187
  gr.Markdown("#### 🔍 Raw JSON Preview: First 5 Candidates")
188
  gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")
 
198
 
199
  gr.Markdown("---")
200
 
201
+ # Step 2: Recommendations
202
+ recommend_button = gr.Button("3. Rank Candidates by Role Keyword Match")
203
+ recommend_output_text = gr.Textbox(label="Top Candidate Recommendations Summary", lines=10, placeholder="Click 'Rank Candidates by Role Keyword Match' after Step 2 completes.")
204
+ recommend_button.click(similarity_recommendations, inputs=[category_dropdown], outputs=[recommend_output_text])
205
 
206
  if __name__ == "__main__":
207
  app.launch(share=True)