zlf18 committed on
Commit
fea1f56
·
verified ·
1 Parent(s): a4e6efa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -38
app.py CHANGED
@@ -6,8 +6,6 @@ import re
6
  import nltk
7
  from nltk.corpus import words, stopwords
8
  import urllib.parse as _url
9
- from sklearn.feature_extraction.text import TfidfVectorizer
10
- from sklearn.metrics.pairwise import cosine_similarity
11
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
12
  from nltk.stem import PorterStemmer
13
  import gradio as gr
@@ -26,7 +24,7 @@ for package in ['words', 'stopwords', 'averaged_perceptron_tagger', 'punkt']:
26
  STOPWORDS = set(stopwords.words('english'))
27
  stemmer = PorterStemmer()
28
 
29
- # --- EXPANDED: Skill Whitelist with more business, finance, and consulting terms ---
30
  SKILL_WHITELIST = {
31
  # Technical & Data
32
  'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue',
@@ -43,6 +41,7 @@ SKILL_WHITELIST = {
43
  'strategy', 'stakeholder management', 'risk management', 'compliance', 'aml', 'kyc', 'reinsurance', 'finance',
44
  'financial modeling', 'financial analysis', 'due diligence', 'sourcing', 'procurement', 'negotiation', 'supply chain',
45
  'business analysis', 'business intelligence', 'presentations', 'public speaking', 'time management', 'critical thinking',
 
46
  # Soft & Other
47
  'leadership', 'stakeholder communication', 'client communication', 'teamwork', 'collaboration', 'problem solving',
48
  'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design', 'autocad', 'solidworks', 'sales', 'marketing',
@@ -137,7 +136,6 @@ def find_job_matches(original_user_query: str, expanded_user_query: str, top_k:
137
  final_results_df = final_results_df.set_index('job_id', drop=False).rename(columns={'job_id': 'Job ID'})
138
  return final_results_df
139
 
140
- # --- REWRITTEN: Skill scoring function using semantic similarity ---
141
  def score_jobs_by_skills(user_skills: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
142
  if df_to_rank is None or df_to_rank.empty or not user_skills:
143
  return df_to_rank.sort_values(by='Similarity Score', ascending=False) if df_to_rank is not None else pd.DataFrame()
@@ -146,21 +144,17 @@ def score_jobs_by_skills(user_skills: list[str], df_to_rank: pd.DataFrame) -> pd
146
  if 'Skills' not in ranked_df.columns:
147
  return ranked_df.sort_values(by='Similarity Score', ascending=False)
148
 
149
- # 1. Encode all user skills and all unique job skills across the dataframe ONCE for efficiency
150
  user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
151
  all_job_skills = sorted(list(set(skill for skills_list in ranked_df['Skills'] if skills_list for skill in skills_list)))
152
 
153
- if not all_job_skills: # No skills to compare against
154
  ranked_df['Skill Match Score'] = 0.0
155
  return ranked_df
156
 
157
  job_skill_embeddings = model.encode(all_job_skills, convert_to_tensor=True)
158
-
159
- # 2. Calculate the similarity matrix between every user skill and every job skill
160
  similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)
161
 
162
- # 3. Define the new scoring function
163
- def calculate_semantic_match(row, threshold=0.55):
164
  job_skills_list = row.get('Skills', [])
165
  if not job_skills_list:
166
  return [], 0, 0.0
@@ -168,9 +162,7 @@ def score_jobs_by_skills(user_skills: list[str], df_to_rank: pd.DataFrame) -> pd
168
  matched_skills_in_job = set()
169
  for job_skill in job_skills_list:
170
  try:
171
- # Find which column in the matrix corresponds to the current job skill
172
  job_skill_idx = all_job_skills.index(job_skill)
173
- # Check if ANY of the user's skills meet the similarity threshold for this job skill
174
  if torch.any(similarity_matrix[:, job_skill_idx] > threshold):
175
  matched_skills_in_job.add(job_skill)
176
  except (ValueError, IndexError):
@@ -180,14 +172,10 @@ def score_jobs_by_skills(user_skills: list[str], df_to_rank: pd.DataFrame) -> pd
180
  match_score = len(matched_skills_in_job) / total_required if total_required > 0 else 0.0
181
  return list(matched_skills_in_job), len(matched_skills_in_job), match_score
182
 
183
- # 4. Apply the new scoring function to each row
184
  results = ranked_df.apply(lambda row: calculate_semantic_match(row), axis=1, result_type='expand')
185
  ranked_df[['Skill Matches', 'Skill Match Count', 'Skill Match Score']] = results
186
-
187
- # 5. Sort by the new graded score
188
  ranked_df = ranked_df.sort_values(by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]).reset_index(drop=True)
189
  return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
190
- # ----------------------------------------------------------------------
191
 
192
  def initialize_data_and_model():
193
  global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
@@ -219,7 +207,7 @@ Text: "{text}"
219
  Extracted Skills:
220
  """
221
  try:
222
- response = LLM_PIPELINE(prompt, max_new_tokens=100, do_sample=False, temperature=0.1)
223
  generated_text = response[0]['generated_text']
224
  skills_part = generated_text.split("Extracted Skills:")[-1].strip()
225
  skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
@@ -238,15 +226,26 @@ Extracted Skills:
238
  for subtree in chunked_text.subtrees():
239
  if subtree.label() == 'NP':
240
  phrase = " ".join(word for word, tag in subtree.leaves())
241
- normalized_phrase = _norm_skill_token(phrase)
242
- if normalized_phrase in SKILL_WHITELIST:
243
- potential_skills.add(normalized_phrase)
244
  return sorted(list(potential_skills))
 
 
 
 
 
 
 
 
 
 
245
 
246
  def extract_skills_hybrid(text: str) -> list[str]:
247
  llm_skills = extract_skills_llm(text)
248
  nltk_skills = extract_skills_nltk(text)
249
- combined_skills = set(llm_skills) | set(nltk_skills)
 
 
250
  return sorted(list(combined_skills))
251
 
252
  def create_text_for_skills(row):
@@ -261,8 +260,7 @@ Extracted Skills:
261
  original_df.to_parquet(PROCESSED_DATA_PATH)
262
 
263
  original_df['job_id'] = original_df.index
264
- def create_full_text(row):
265
- return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
266
  original_df["full_text"] = original_df.apply(create_full_text, axis=1)
267
 
268
  ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
@@ -355,7 +353,16 @@ def on_select_job(job_id, skills_text):
355
  if not job_skills:
356
  learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
357
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
358
- all_missing_skills = sorted([s for s in job_skills if not any(util.cos_sim(model.encode(ut), model.encode(s))[0][0] > 0.55 for ut in user_skills)], key=lambda x: x.lower())
 
 
 
 
 
 
 
 
 
359
  if not all_missing_skills:
360
  learning_plan_html = "<h4 style='color:green;'>🎉 You have all the required skills!</h4>"
361
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
@@ -365,16 +372,16 @@ def on_select_job(job_id, skills_text):
365
  job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
366
  headline = "<b>Great fit!</b>" if score_val >= 0.8 else "<b>Good progress!</b>" if score_val >= 0.5 else "<b>Solid starting point.</b>"
367
  learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
368
- skills_to_display = all_missing_skills[:5]
369
  items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
370
  learning_plan_html += f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
371
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
372
  else:
373
  headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
374
- skills_to_display = job_skills[:5]
375
  items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
376
  learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
377
- full_skill_list_for_state = job_skills
378
  new_offset = len(skills_to_display)
379
  should_button_be_visible = len(full_skill_list_for_state) > 5
380
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)
@@ -424,14 +431,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as ui:
424
  with gr.TabItem("Duties"): duties_markdown = gr.Markdown()
425
  with gr.TabItem("Qualifications"): qualifications_markdown = gr.Markdown()
426
  with gr.TabItem("Full Description"): description_markdown = gr.Markdown()
427
- learning_plan_output = gr.HTML(label="Learning Plan")
428
- load_more_btn = gr.Button("Load More Skills", visible=False)
429
- search_btn.click(fn=find_matches_and_rank_with_check, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
430
- search_anyway_btn.click(fn=find_matches_and_rank_anyway, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
431
- retype_btn.click(lambda: ("Status: Ready for you to retype.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(visible=False), gr.Row(visible=False)), outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
432
- reset_btn.click(fn=on_reset, outputs=[dream_text, topk_slider, skills_text, df_output, initial_matches_state, job_selector, details_accordion, status_text, job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, spelling_alert, spelling_row, missing_skills_state, skills_offset_state, load_more_btn], queue=False)
433
- rerank_btn.click(fn=rerank_current_results, inputs=[initial_matches_state, skills_text, topk_slider], outputs=[status_text, df_output, job_selector])
434
- job_selector.change(fn=on_select_job, inputs=[job_selector, skills_text], outputs=[job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, learning_plan_output, details_accordion, missing_skills_state, skills_offset_state, load_more_btn])
435
- load_more_btn.click(fn=load_more_skills, inputs=[missing_skills_state, skills_offset_state], outputs=[learning_plan_output, skills_offset_state, load_more_btn])
436
-
437
- ui.launch()
 
6
  import nltk
7
  from nltk.corpus import words, stopwords
8
  import urllib.parse as _url
 
 
9
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
10
  from nltk.stem import PorterStemmer
11
  import gradio as gr
 
24
  STOPWORDS = set(stopwords.words('english'))
25
  stemmer = PorterStemmer()
26
 
27
+ # --- Expanded Skill Whitelist ---
28
  SKILL_WHITELIST = {
29
  # Technical & Data
30
  'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue',
 
41
  'strategy', 'stakeholder management', 'risk management', 'compliance', 'aml', 'kyc', 'reinsurance', 'finance',
42
  'financial modeling', 'financial analysis', 'due diligence', 'sourcing', 'procurement', 'negotiation', 'supply chain',
43
  'business analysis', 'business intelligence', 'presentations', 'public speaking', 'time management', 'critical thinking',
44
+ 'design thinking', 'innovation', 'adaptability', 'supervisory', 'pmp', 'cpsm', 'cips',
45
  # Soft & Other
46
  'leadership', 'stakeholder communication', 'client communication', 'teamwork', 'collaboration', 'problem solving',
47
  'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design', 'autocad', 'solidworks', 'sales', 'marketing',
 
136
  final_results_df = final_results_df.set_index('job_id', drop=False).rename(columns={'job_id': 'Job ID'})
137
  return final_results_df
138
 
 
139
  def score_jobs_by_skills(user_skills: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
140
  if df_to_rank is None or df_to_rank.empty or not user_skills:
141
  return df_to_rank.sort_values(by='Similarity Score', ascending=False) if df_to_rank is not None else pd.DataFrame()
 
144
  if 'Skills' not in ranked_df.columns:
145
  return ranked_df.sort_values(by='Similarity Score', ascending=False)
146
 
 
147
  user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
148
  all_job_skills = sorted(list(set(skill for skills_list in ranked_df['Skills'] if skills_list for skill in skills_list)))
149
 
150
+ if not all_job_skills:
151
  ranked_df['Skill Match Score'] = 0.0
152
  return ranked_df
153
 
154
  job_skill_embeddings = model.encode(all_job_skills, convert_to_tensor=True)
 
 
155
  similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)
156
 
157
+ def calculate_semantic_match(row, threshold=0.48): # Lowered threshold for more sensitivity
 
158
  job_skills_list = row.get('Skills', [])
159
  if not job_skills_list:
160
  return [], 0, 0.0
 
162
  matched_skills_in_job = set()
163
  for job_skill in job_skills_list:
164
  try:
 
165
  job_skill_idx = all_job_skills.index(job_skill)
 
166
  if torch.any(similarity_matrix[:, job_skill_idx] > threshold):
167
  matched_skills_in_job.add(job_skill)
168
  except (ValueError, IndexError):
 
172
  match_score = len(matched_skills_in_job) / total_required if total_required > 0 else 0.0
173
  return list(matched_skills_in_job), len(matched_skills_in_job), match_score
174
 
 
175
  results = ranked_df.apply(lambda row: calculate_semantic_match(row), axis=1, result_type='expand')
176
  ranked_df[['Skill Matches', 'Skill Match Count', 'Skill Match Score']] = results
 
 
177
  ranked_df = ranked_df.sort_values(by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]).reset_index(drop=True)
178
  return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
 
179
 
180
  def initialize_data_and_model():
181
  global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
 
207
  Extracted Skills:
208
  """
209
  try:
210
+ response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False, temperature=0.1)
211
  generated_text = response[0]['generated_text']
212
  skills_part = generated_text.split("Extracted Skills:")[-1].strip()
213
  skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
 
226
  for subtree in chunked_text.subtrees():
227
  if subtree.label() == 'NP':
228
  phrase = " ".join(word for word, tag in subtree.leaves())
229
+ if _norm_skill_token(phrase) in SKILL_WHITELIST:
230
+ potential_skills.add(_norm_skill_token(phrase))
 
231
  return sorted(list(potential_skills))
232
+
233
+ # NEW: Third extraction method for maximum coverage
234
+ def extract_skills_direct_scan(text: str) -> list[str]:
235
+ if not isinstance(text, str): return []
236
+ found_skills = set()
237
+ for skill in SKILL_WHITELIST:
238
+ # Use word boundaries to avoid matching substrings like 'art' in 'startup'
239
+ if re.search(r'\b' + re.escape(skill) + r'\b', text, re.IGNORECASE):
240
+ found_skills.add(skill)
241
+ return list(found_skills)
242
 
243
  def extract_skills_hybrid(text: str) -> list[str]:
244
  llm_skills = extract_skills_llm(text)
245
  nltk_skills = extract_skills_nltk(text)
246
+ direct_skills = extract_skills_direct_scan(text)
247
+ # Combine all sources and return a unique, sorted list
248
+ combined_skills = set(llm_skills) | set(nltk_skills) | set(direct_skills)
249
  return sorted(list(combined_skills))
250
 
251
  def create_text_for_skills(row):
 
260
  original_df.to_parquet(PROCESSED_DATA_PATH)
261
 
262
  original_df['job_id'] = original_df.index
263
+ def create_full_text(row): return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
 
264
  original_df["full_text"] = original_df.apply(create_full_text, axis=1)
265
 
266
  ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
 
353
  if not job_skills:
354
  learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
355
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
356
+
357
+ all_missing_skills = job_skills
358
+ if user_skills:
359
+ user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
360
+ job_skill_embeddings = model.encode(job_skills, convert_to_tensor=True)
361
+ similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)
362
+
363
+ matched_job_skills_mask = torch.any(similarity_matrix > 0.48, dim=0)
364
+ all_missing_skills = [skill for i, skill in enumerate(job_skills) if not matched_job_skills_mask[i]]
365
+
366
  if not all_missing_skills:
367
  learning_plan_html = "<h4 style='color:green;'>🎉 You have all the required skills!</h4>"
368
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
 
372
  job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
373
  headline = "<b>Great fit!</b>" if score_val >= 0.8 else "<b>Good progress!</b>" if score_val >= 0.5 else "<b>Solid starting point.</b>"
374
  learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
375
+ skills_to_display = sorted(all_missing_skills)[:5]
376
  items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
377
  learning_plan_html += f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
378
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
379
  else:
380
  headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
381
+ skills_to_display = sorted(job_skills)[:5]
382
  items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
383
  learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
384
+ full_skill_list_for_state = sorted(job_skills)
385
  new_offset = len(skills_to_display)
386
  should_button_be_visible = len(full_skill_list_for_state) > 5
387
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)
 
431
  with gr.TabItem("Duties"): duties_markdown = gr.Markdown()
432
  with gr.TabItem("Qualifications"): qualifications_markdown = gr.Markdown()
433
  with gr.TabItem("Full Description"): description_markdown = gr.Markdown()
434
+ learning_plan_output = gr.HTML(label="Learning