curiouscurrent committed on
Commit
7de0953
·
verified ·
1 Parent(s): 98ef19f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -97
app.py CHANGED
@@ -2,23 +2,20 @@ import gradio as gr
2
  import pandas as pd
3
  import json
4
  import os
5
- import requests
6
  import re
 
 
7
  from functools import lru_cache
8
 
9
  # ----------------------------
10
  # CONFIG
11
  # ----------------------------
12
  JSON_FILE = "form-submissions-1.json"
13
- MODEL_ID = "google/flan-t5-small"
14
- # NOTE: HF_API_TOKEN MUST be set in your environment variables/Space secrets.
15
- HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
16
  FILTERED_CSV = "/tmp/filtered_candidates.csv"
17
- OUTPUT_FILE = "/tmp/outputs.csv"
18
- BATCH_SIZE = 50
19
 
20
- if not HF_API_TOKEN:
21
- pass
 
22
 
23
  CATEGORIES = {
24
  "AI": [
@@ -35,58 +32,38 @@ CATEGORIES = {
35
  }
36
 
37
  # ----------------------------
38
- # LLM Call for Scoring (Focus: Role Experience ONLY)
39
  # ----------------------------
40
- @lru_cache(maxsize=512)
41
- def score_candidate(candidate_str, category_name, job_titles_tuple):
42
- if not HF_API_TOKEN:
43
- print("API Token is missing. Returning score 0.")
44
- return 0
45
-
46
- prompt = f"""
47
- You are an HR assistant. Your task is to rate a candidate's suitability based ONLY on their previous job roles.
48
- Rate the suitability of the following candidate on a scale of 1 (Lowest) to 10 (Highest).
49
- The score must reflect how closely the candidate's 'Roles' align with the target job titles.
50
-
51
- The target roles for the '{category_name}' category are: {list(job_titles_tuple)}
52
-
53
- Candidate JSON: {candidate_str}
54
-
55
- **Task**: Respond ONLY with the rating number (an integer from 1 to 10).
56
- """
57
- headers = {"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"}
58
 
59
- payload = {
60
- "inputs": prompt,
61
- "parameters": {
62
- "max_new_tokens": 5,
63
- "return_full_text": False,
64
- "temperature": 0.1
65
- }
66
- }
67
 
68
- try:
69
- response = requests.post(
70
- f"https://api-inference.huggingface.co/models/{MODEL_ID}",
71
- headers=headers,
72
- data=json.dumps(payload),
73
- timeout=60
74
- )
75
- response.raise_for_status()
76
- result = response.json()
77
-
78
- generated_text = result[0].get("generated_text", "0").strip()
79
-
80
- match = re.search(r'\d+', generated_text)
81
- if match:
82
- score = int(match.group(0))
83
- return max(1, min(10, score))
84
-
85
- return 0
86
-
87
- except Exception as e:
88
- print(f"LLM scoring call failed for candidate (API/Network Error): {e}")
89
- return 0
90
 
91
  # ----------------------------
92
  # Step 1: Filter by roles (Unchanged)
@@ -97,7 +74,7 @@ def filter_by_roles(category_name):
97
  with open(JSON_FILE, encoding="utf-8") as f:
98
  data = json.load(f)
99
  except FileNotFoundError:
100
- return pd.DataFrame(), f"Error: JSON file '{JSON_FILE}' not found. The LLM can't proceed."
101
 
102
  filtered = []
103
 
@@ -125,19 +102,17 @@ def filter_by_roles(category_name):
125
  })
126
 
127
  if not filtered:
128
- return pd.DataFrame(), f"No candidates found matching roles for category '{category_name}'. The LLM can't proceed."
129
 
130
  df = pd.DataFrame(filtered)
131
  df.to_csv(FILTERED_CSV, index=False)
132
- return df, f"{len(df)} candidates filtered by role for category '{category_name}'. Ready for LLM scoring."
133
 
134
 
135
  # ----------------------------
136
- # Step 2: LLM recommendations (Scoring, Sorting, and Output)
137
  # ----------------------------
138
- def llm_recommendations(category_name):
139
- job_titles = CATEGORIES[category_name]
140
-
141
  if not os.path.exists(FILTERED_CSV):
142
  df_filtered, msg = filter_by_roles(category_name)
143
  if df_filtered.empty:
@@ -149,32 +124,16 @@ def llm_recommendations(category_name):
149
  if df_filtered.empty:
150
  return f"No filtered candidates found for category '{category_name}'. Run Step 1 first."
151
 
152
- # Prepare for scoring
153
- df_filtered_clean = df_filtered.fillna('N/A')
154
- filtered_candidates = df_filtered_clean.to_dict(orient="records")
155
-
156
- scores = []
157
-
158
- for person in filtered_candidates:
159
- candidate_info = {
160
- "Name": person.get("Name"),
161
- "Roles": person.get("Roles"),
162
- "Skills": person.get("Skills")
163
- }
164
- candidate_str = json.dumps(candidate_info)
165
-
166
- score = score_candidate(candidate_str, category_name, tuple(job_titles))
167
- scores.append(score)
168
-
169
- df_filtered["LLM_Score"] = scores
170
 
171
- df_recommended = df_filtered[df_filtered["LLM_Score"] > 0].copy()
 
172
 
173
  if df_recommended.empty:
174
- if not HF_API_TOKEN:
175
- return "❌ LLM failed: The HF_API_TOKEN is not set or is invalid. Set the token and try again."
176
- return f"LLM scored all candidates 0. The candidates' roles are deemed irrelevant by the LLM for '{category_name}'."
177
 
 
178
  def parse_salary(s):
179
  try:
180
  return float(str(s).replace("$","").replace(",","").replace("N/A", str(float('inf'))))
@@ -183,8 +142,9 @@ def llm_recommendations(category_name):
183
 
184
  df_recommended["Salary_sort"] = df_recommended["Salary"].apply(parse_salary)
185
 
 
186
  df_top5 = df_recommended.sort_values(
187
- by=['LLM_Score', 'Salary_sort'],
188
  ascending=[False, True]
189
  ).head(5)
190
 
@@ -193,10 +153,12 @@ def llm_recommendations(category_name):
193
  output_text = f"Top {len(final_names)} Recommended Candidates for the '{category_name}' Category:\n\n"
194
 
195
  for i, name in enumerate(final_names):
196
- score = df_top5.iloc[i]['LLM_Score']
197
- output_text += f"{i+1}. {name} (Suitability Score: {score}/10)\n"
 
 
198
 
199
- output_text += "\nThese candidates were ranked by the LLM based **only on the alignment of their previous job roles** with the target roles, using expected salary as a tie-breaker."
200
 
201
  return output_text
202
 
@@ -217,8 +179,8 @@ def show_first_candidates():
217
  # Gradio interface (Updated Heading and Launch)
218
  # ----------------------------
219
  with gr.Blocks() as app:
220
- # 🚩 CHANGE: Updated Heading
221
- gr.Markdown("# 🤖 Candidate Selection (Role-Based Scoring)")
222
 
223
  gr.Markdown("#### 🔍 Raw JSON Preview: First 5 Candidates")
224
  gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")
@@ -234,11 +196,11 @@ with gr.Blocks() as app:
234
 
235
  gr.Markdown("---")
236
 
237
- # Step 2: LLM Recommendations
238
- llm_button = gr.Button("3. Get LLM Recommendations (Role Experience Ranking)")
239
- llm_output_text = gr.Textbox(label="Top Candidate Recommendations Summary", lines=10, placeholder="Click 'Get LLM Recommendations' after Step 2 completes.")
240
- llm_button.click(llm_recommendations, inputs=[category_dropdown], outputs=[llm_output_text])
 
241
 
242
  if __name__ == "__main__":
243
- # 🚩 CHANGE: Set share=True to generate a public link
244
  app.launch(share=True)
 
2
  import pandas as pd
3
  import json
4
  import os
 
5
  import re
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
  from functools import lru_cache
9
 
10
  # ----------------------------
11
  # CONFIG
12
  # ----------------------------
13
  JSON_FILE = "form-submissions-1.json"
 
 
 
14
  FILTERED_CSV = "/tmp/filtered_candidates.csv"
 
 
15
 
16
+ # Note: The following variables are no longer needed, as we are not using the Hugging Face API
17
+ # MODEL_ID = "google/flan-t5-small"
18
+ # HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
19
 
20
  CATEGORIES = {
21
  "AI": [
 
32
  }
33
 
34
  # ----------------------------
35
+ # Similarity Matching Function (REPLACING LLM)
36
  # ----------------------------
37
def calculate_similarity_scores(df_candidates, category_name):
    """Score each candidate's roles against a category's target job titles.

    The target titles for ``category_name`` (looked up in ``CATEGORIES``) are
    joined into one document, every candidate's ``Roles`` value is treated as
    another document, and all of them are vectorized together with TF-IDF
    (unigrams and bigrams, English stop words removed). Each candidate's score
    is the cosine similarity between their document and the target document.

    Args:
        df_candidates: DataFrame with a ``Roles`` column, one row per candidate.
        category_name: Key into ``CATEGORIES`` selecting the target job titles.

    Returns:
        A float ``pd.Series`` of similarity scores sharing ``df_candidates``'
        index, so it can be assigned straight back as a new column. An empty
        float64 Series is returned when ``df_candidates`` has no rows.

    Raises:
        KeyError: If ``category_name`` is not a key of ``CATEGORIES``.
    """
    # NOTE: this function is deliberately NOT wrapped in functools.lru_cache.
    # lru_cache hashes every argument, and a DataFrame is unhashable, so the
    # decorated version raised TypeError on every call before reaching the body.
    if df_candidates.empty:
        return pd.Series([], dtype='float64')

    # 1. One document holding all target job titles for the category.
    target_roles = " ".join(CATEGORIES[category_name])

    # 2. One document per candidate. Missing roles (NaN after a CSV round-trip)
    #    become empty strings, because the vectorizer rejects non-string
    #    documents; an empty document simply scores 0.
    candidate_roles = df_candidates['Roles'].fillna("").astype(str).tolist()

    # 3. Vectorize target + candidates together so they share one vocabulary.
    #    Bigrams let multi-word titles such as "Data Scientist" match as a unit.
    corpus = [target_roles] + candidate_roles
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # 4. Row 0 is the target document; the remaining rows are the candidates.
    target_vector = tfidf_matrix[0]
    candidate_vectors = tfidf_matrix[1:]

    # 5. cosine_similarity returns a 1 x n matrix; flatten it to n scores.
    similarity_scores = cosine_similarity(target_vector, candidate_vectors).flatten()

    # Keep the caller's index so column assignment aligns row-for-row.
    return pd.Series(similarity_scores, index=df_candidates.index)
 
 
 
 
 
 
 
67
 
68
  # ----------------------------
69
  # Step 1: Filter by roles (Unchanged)
 
74
  with open(JSON_FILE, encoding="utf-8") as f:
75
  data = json.load(f)
76
  except FileNotFoundError:
77
+ return pd.DataFrame(), f"Error: JSON file '{JSON_FILE}' not found. The application can't proceed."
78
 
79
  filtered = []
80
 
 
102
  })
103
 
104
  if not filtered:
105
+ return pd.DataFrame(), f"No candidates found matching roles for category '{category_name}'. The application can't proceed."
106
 
107
  df = pd.DataFrame(filtered)
108
  df.to_csv(FILTERED_CSV, index=False)
109
+ return df, f"{len(df)} candidates filtered by role for category '{category_name}'. Ready for Similarity Ranking."
110
 
111
 
112
  # ----------------------------
113
+ # Step 2: Recommendations (Using Similarity Matching)
114
  # ----------------------------
115
+ def similarity_recommendations(category_name):
 
 
116
  if not os.path.exists(FILTERED_CSV):
117
  df_filtered, msg = filter_by_roles(category_name)
118
  if df_filtered.empty:
 
124
  if df_filtered.empty:
125
  return f"No filtered candidates found for category '{category_name}'. Run Step 1 first."
126
 
127
+ # --- CORE CHANGE: Calculate Similarity Scores ---
128
+ df_filtered["Similarity_Score"] = calculate_similarity_scores(df_filtered, category_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
+ # Filter out candidates with zero relevance (can happen if roles are too generic)
131
+ df_recommended = df_filtered[df_filtered["Similarity_Score"] > 0].copy()
132
 
133
  if df_recommended.empty:
134
+ return f"All candidates had zero text similarity to the target roles for '{category_name}'. The current roles are not specific enough to match."
 
 
135
 
136
+ # Define salary parsing for tie-breaker
137
  def parse_salary(s):
138
  try:
139
  return float(str(s).replace("$","").replace(",","").replace("N/A", str(float('inf'))))
 
142
 
143
  df_recommended["Salary_sort"] = df_recommended["Salary"].apply(parse_salary)
144
 
145
+ # Sort: 1. Highest Similarity Score (descending), 2. Lowest Salary (ascending)
146
  df_top5 = df_recommended.sort_values(
147
+ by=['Similarity_Score', 'Salary_sort'],
148
  ascending=[False, True]
149
  ).head(5)
150
 
 
153
  output_text = f"Top {len(final_names)} Recommended Candidates for the '{category_name}' Category:\n\n"
154
 
155
  for i, name in enumerate(final_names):
156
+ score = df_top5.iloc[i]['Similarity_Score']
157
+ # Display the score as a percentage for readability
158
+ score_percent = f"{score * 100:.2f}%"
159
+ output_text += f"{i+1}. {name} (Role Match: {score_percent})\n"
160
 
161
+ output_text += "\nThese candidates were ranked objectively based on the **text similarity** of their previous job roles to the target roles, using expected salary as a tie-breaker."
162
 
163
  return output_text
164
 
 
179
  # Gradio interface (Updated Heading and Launch)
180
  # ----------------------------
181
  with gr.Blocks() as app:
182
+ gr.Markdown("# 🏆 Candidate Selection (Objective Similarity Matching)")
183
+ gr.Markdown("### **Reliable ranking using text similarity (TF-IDF & Cosine Similarity) - No LLM API required.**")
184
 
185
  gr.Markdown("#### 🔍 Raw JSON Preview: First 5 Candidates")
186
  gr.Dataframe(show_first_candidates(), label="First 5 JSON Entries")
 
196
 
197
  gr.Markdown("---")
198
 
199
+ # Step 2: Recommendations
200
+ # Changed function name to reflect the new logic
201
+ recommend_button = gr.Button("3. Rank Candidates by Role Similarity")
202
+ recommend_output_text = gr.Textbox(label="Top Candidate Recommendations Summary", lines=10, placeholder="Click 'Rank Candidates by Role Similarity' after Step 2 completes.")
203
+ recommend_button.click(similarity_recommendations, inputs=[category_dropdown], outputs=[recommend_output_text])
204
 
205
  if __name__ == "__main__":
 
206
  app.launch(share=True)