zlf18 commited on
Commit
a4e6efa
·
verified ·
1 Parent(s): 1d05fa5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -138
app.py CHANGED
@@ -26,23 +26,29 @@ for package in ['words', 'stopwords', 'averaged_perceptron_tagger', 'punkt']:
26
  STOPWORDS = set(stopwords.words('english'))
27
  stemmer = PorterStemmer()
28
 
29
- # --- NEW: Curated Skill Whitelist for NLTK Fallback Accuracy ---
30
  SKILL_WHITELIST = {
 
31
  'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue',
32
  'nodejs', 'django', 'flask', 'fastapi', 'spring boot', 'ruby on rails', 'php', 'swift', 'kotlin', 'dart', 'flutter',
33
  'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'pandas', 'numpy', 'matplotlib',
34
  'natural language processing', 'nlp', 'computer vision', 'data analysis', 'data science', 'data engineering',
35
- 'big data', 'spark', 'hadoop', 'kafka', 'data visualization', 'tableau', 'power bi', 'd3.js', 'statistics',
36
  'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes', 'terraform', 'ansible', 'ci/cd', 'jenkins',
37
  'git', 'github', 'devops', 'linux', 'unix', 'shell scripting', 'powershell', 'cybersecurity', 'penetration testing',
38
- 'network security', 'cryptography', 'blockchain', 'agile', 'scrum', 'project management', 'product management',
 
 
 
 
 
 
 
39
  'leadership', 'stakeholder communication', 'client communication', 'teamwork', 'collaboration', 'problem solving',
40
- 'critical thinking', 'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design', 'autocad', 'solidworks',
41
- 'c#', '.net', 'sql server', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch', 'api design', 'rest apis',
42
- 'graphql', 'microservices', 'serverless', 'system design', 'saas', 'sales', 'marketing', 'seo', 'sem', 'content writing',
43
- 'customer support', 'technical writing', 'sap', 'oracle', 'financial analysis', 'budgeting', 'mentoring', 'supervising'
44
  }
45
- # -----------------------------------------------------------------
46
 
47
  # --- GLOBAL STATE & DATA ---
48
  original_df = None
@@ -63,22 +69,6 @@ def _norm_skill_token(s: str) -> str:
63
  s = re.sub(r'\s+', ' ', s)
64
  return s
65
 
66
- def _skill_match(token1: str, token2: str, threshold: float = 0.9) -> bool:
67
- t1 = _norm_skill_token(token1)
68
- t2 = _norm_skill_token(token2)
69
- if t1 == t2 or t1 in t2 or t2 in t1:
70
- return True
71
- try:
72
- if len(t1) > 2 and len(t2) > 2:
73
- vectorizer = TfidfVectorizer().fit([t1, t2])
74
- vectors = vectorizer.transform([t1, t2])
75
- similarity = cosine_similarity(vectors)[0, 1]
76
- if similarity >= threshold:
77
- return True
78
- except:
79
- pass
80
- return False
81
-
82
  def build_known_vocabulary(df: pd.DataFrame):
83
  global KNOWN_WORDS
84
  english_words = set(w.lower() for w in words.words())
@@ -147,56 +137,83 @@ def find_job_matches(original_user_query: str, expanded_user_query: str, top_k:
147
  final_results_df = final_results_df.set_index('job_id', drop=False).rename(columns={'job_id': 'Job ID'})
148
  return final_results_df
149
 
150
- def score_jobs_by_skills(user_tokens: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
151
- if df_to_rank is None or df_to_rank.empty: return pd.DataFrame()
 
 
 
152
  ranked_df = df_to_rank.copy()
153
- if 'Skills' not in ranked_df.columns: return ranked_df.sort_values(by='Similarity Score', ascending=False)
154
- def calculate_match(row, user_tokens):
155
- job_skills = row.get('Skills', [])
156
- if not isinstance(job_skills, list): return [], 0, 0.0
157
- matched_skills = [s for s in job_skills if any(_skill_match(ut, s) for ut in user_tokens)]
158
- total_required_count = len(job_skills)
159
- match_score = len(matched_skills) / total_required_count if total_required_count > 0 else 0.0
160
- return matched_skills, len(matched_skills), match_score
161
- results = ranked_df.apply(lambda row: calculate_match(row, user_tokens), axis=1, result_type='expand')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  ranked_df[['Skill Matches', 'Skill Match Count', 'Skill Match Score']] = results
 
 
163
  ranked_df = ranked_df.sort_values(by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]).reset_index(drop=True)
164
  return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
 
165
 
166
  def initialize_data_and_model():
167
  global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
168
-
169
  PROCESSED_DATA_PATH = "processed_jobs_with_skills.parquet"
170
 
171
  print("--- Initializing LLM Client ---")
172
- if not initialize_llm_client():
173
- print("Warning: LLM Client failed to initialize. Will use NLTK only for skills.")
174
 
175
  if os.path.exists(PROCESSED_DATA_PATH):
176
  print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
177
  original_df = pd.read_parquet(PROCESSED_DATA_PATH)
178
  else:
179
  print("--- No pre-processed data found. Starting one-time processing... ---")
180
-
181
  ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
182
  original_df = ds["original"].to_pandas()
183
 
184
- # --- Method 1: LLM-based extraction with FEW-SHOT PROMPT ---
185
  def extract_skills_llm(text: str) -> list[str]:
186
- if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE:
187
- return []
188
-
189
  prompt = f"""
190
  Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
191
-
192
  [Example 1]
193
  Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."
194
  Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
195
-
196
  [Example 2]
197
  Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."
198
  Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
199
-
200
  [Actual Task]
201
  Text: "{text}"
202
  Extracted Skills:
@@ -207,10 +224,8 @@ Extracted Skills:
207
  skills_part = generated_text.split("Extracted Skills:")[-1].strip()
208
  skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
209
  return list(dict.fromkeys(s.lower() for s in skills))
210
- except Exception:
211
- return []
212
 
213
- # --- Method 2: NLTK fallback with SKILL WHITELIST validation ---
214
  def extract_skills_nltk(text: str) -> list[str]:
215
  if not isinstance(text, str): return []
216
  text_lower = text.lower()
@@ -219,23 +234,18 @@ Extracted Skills:
219
  tokens = nltk.word_tokenize(text_lower)
220
  tagged_tokens = nltk.pos_tag(tokens)
221
  chunked_text = chunk_parser.parse(tagged_tokens)
222
-
223
  potential_skills = set()
224
  for subtree in chunked_text.subtrees():
225
  if subtree.label() == 'NP':
226
  phrase = " ".join(word for word, tag in subtree.leaves())
227
  normalized_phrase = _norm_skill_token(phrase)
228
- # The key change: only add the phrase if it's in our known skill list
229
  if normalized_phrase in SKILL_WHITELIST:
230
  potential_skills.add(normalized_phrase)
231
  return sorted(list(potential_skills))
232
 
233
- # --- Hybrid Orchestrator: MERGE LLM and NLTK results for best coverage ---
234
  def extract_skills_hybrid(text: str) -> list[str]:
235
  llm_skills = extract_skills_llm(text)
236
  nltk_skills = extract_skills_nltk(text)
237
-
238
- # Combine the results and remove duplicates
239
  combined_skills = set(llm_skills) | set(nltk_skills)
240
  return sorted(list(combined_skills))
241
 
@@ -243,7 +253,6 @@ Extracted Skills:
243
  return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])
244
 
245
  original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
246
-
247
  print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
248
  original_df['Skills'] = original_df['text_for_skills'].progress_apply(extract_skills_hybrid)
249
  original_df = original_df.drop(columns=['text_for_skills'])
@@ -251,7 +260,6 @@ Extracted Skills:
251
  print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
252
  original_df.to_parquet(PROCESSED_DATA_PATH)
253
 
254
- # --- Continue with the rest of the data processing ---
255
  original_df['job_id'] = original_df.index
256
  def create_full_text(row):
257
  return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
@@ -280,7 +288,6 @@ def _course_links_for(skill: str) -> str:
280
  links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")]
281
  return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
282
 
283
- # --- GRADIO INTERFACE FUNCTIONS (No changes needed below this line) ---
284
  def get_job_matches(dream_job: str, top_n: int, skills_text: str):
285
  status = "Searching using hybrid model..."
286
  expanded_desc = llm_expand_query(dream_job)
@@ -291,29 +298,22 @@ def get_job_matches(dream_job: str, top_n: int, skills_text: str):
291
  display_df = score_jobs_by_skills(user_skills, emb_matches)
292
  else:
293
  display_df = emb_matches
294
-
295
  display_df = display_df.head(top_n)
296
-
297
  if user_skills:
298
  status = f"Found and **re-ranked** results by your {len(user_skills)} skills. Displaying top {len(display_df)}."
299
  else:
300
  status = f"Found {len(display_df)} top matches using semantic search."
301
-
302
  table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
303
  if 'Skill Match Score' in display_df.columns:
304
- table_to_show['Skill Match Score'] = display_df['Skill Match Score']
305
-
306
  dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
307
  dropdown_value = dropdown_options[0][1] if dropdown_options else None
308
-
309
  return status, emb_matches, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True), gr.Accordion(visible=True)
310
 
311
  def rerank_current_results(initial_matches_df, skills_text, top_n):
312
  if initial_matches_df is None or pd.DataFrame(initial_matches_df).empty:
313
  return "Please find matches first before re-ranking.", pd.DataFrame(), gr.Dropdown(visible=False)
314
-
315
  initial_matches_df = pd.DataFrame(initial_matches_df)
316
-
317
  user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
318
  if not user_skills:
319
  status = "Skills cleared. Showing original semantic search results."
@@ -324,7 +324,7 @@ def rerank_current_results(initial_matches_df, skills_text, top_n):
324
  status = f"Results **re-ranked** based on your {len(user_skills)} skills."
325
  display_df = ranked_df.head(top_n)
326
  table_to_show = display_df[['job_title', 'company', 'Similarity Score', 'Skill Match Score']]
327
-
328
  dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
329
  dropdown_value = dropdown_options[0][1] if dropdown_options else None
330
  return status, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True)
@@ -337,7 +337,6 @@ def find_matches_and_rank_with_check(dream_job: str, top_n: int, skills_text: st
337
  word_list_html = ", ".join([f"<b><span style='color: #F87171;'>{w}</span></b>" for w in unrecognized_words])
338
  alert_message = f"<b><span style='color: #F87171;'>⚠️ Possible Spelling Error:</span></b> Unrecognized: {word_list_html}."
339
  return "Status: Awaiting confirmation.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(alert_message, visible=True), gr.Row(visible=True)
340
-
341
  status, emb_matches, table_to_show, dropdown, details_accordion = get_job_matches(dream_job, top_n, skills_text)
342
  return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)
343
 
@@ -346,78 +345,61 @@ def find_matches_and_rank_anyway(dream_job: str, top_n: int, skills_text: str):
346
  return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)
347
 
348
  def on_select_job(job_id, skills_text):
349
- if job_id is None:
350
- return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)
351
-
352
  row = original_df.loc[job_id]
353
  title, company = str(row.get("job_title", "")), str(row.get("company", ""))
354
  job_details_markdown = f"### {title} — {company}"
355
  duties, qualifications, description = str(row.get('Duties', '')), str(row.get('qualifications', '')), str(row.get('Description', ''))
356
-
357
  user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
358
  job_skills = row.get("Skills", [])
359
-
360
  if not job_skills:
361
  learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
362
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
363
-
364
- all_missing_skills = sorted([s for s in job_skills if not any(_skill_match(ut, s) for ut in user_skills)], key=lambda x: x.lower())
365
-
366
  if not all_missing_skills:
367
  learning_plan_html = "<h4 style='color:green;'>🎉 You have all the required skills!</h4>"
368
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
369
-
370
  if user_skills:
371
- score_val = (len(job_skills) - len(all_missing_skills)) / len(job_skills)
 
372
  job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
373
  headline = "<b>Great fit!</b>" if score_val >= 0.8 else "<b>Good progress!</b>" if score_val >= 0.5 else "<b>Solid starting point.</b>"
374
  learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
375
  skills_to_display = all_missing_skills[:5]
376
  items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
377
  learning_plan_html += f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
378
-
379
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
380
-
381
  else:
382
  headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
383
- skills_to_display = all_missing_skills[:5]
384
  items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
385
  learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
386
-
387
- full_skill_list_for_state = all_missing_skills
388
  new_offset = len(skills_to_display)
389
- should_button_be_visible = len(all_missing_skills) > 5
390
-
391
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)
392
 
393
  def load_more_skills(full_skills_list, current_offset):
394
  SKILLS_INCREMENT = 5
395
  new_offset = current_offset + SKILLS_INCREMENT
396
  skills_to_display = full_skills_list[:new_offset]
397
-
398
  items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
399
  learning_plan_html = f"<h4>To be a good fit for this role, you'll need to learn these skills:</h4><ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
400
-
401
  should_button_be_visible = new_offset < len(full_skills_list)
402
-
403
  return learning_plan_html, new_offset, gr.Button(visible=should_button_be_visible)
404
 
405
  def on_reset():
406
  return ("", 3, "", pd.DataFrame(), None, gr.Dropdown(visible=False), gr.Accordion(visible=False), "Status: Ready.", "", "", "", "", gr.Markdown(visible=False), gr.Row(visible=False), [], 0, gr.Button(visible=False))
407
 
408
- # --- Run Initialization ---
409
  print("Starting application initialization...")
410
  initialization_status = initialize_data_and_model()
411
  print(initialization_status)
412
 
413
- # --- Gradio Interface Definition ---
414
  with gr.Blocks(theme=gr.themes.Soft()) as ui:
415
  gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")
416
-
417
  initial_matches_state = gr.State()
418
  missing_skills_state = gr.State([])
419
  skills_offset_state = gr.State(0)
420
-
421
  with gr.Row():
422
  with gr.Column(scale=3):
423
  dream_text = gr.Textbox(label='Your Dream Job Description', lines=3, placeholder="e.g., 'A role in a tech startup focused on machine learning...'")
@@ -429,64 +411,27 @@ with gr.Blocks(theme=gr.themes.Soft()) as ui:
429
  topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of Matches")
430
  search_btn = gr.Button("Find Matches", variant="primary")
431
  reset_btn = gr.Button("Reset All")
432
-
433
  status_text = gr.Markdown("Status: Ready.")
434
  spelling_alert = gr.Markdown(visible=False)
435
  with gr.Row(visible=False) as spelling_row:
436
  search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
437
  retype_btn = gr.Button("Let Me Fix It", variant="stop")
438
-
439
  df_output = gr.DataFrame(label="Job Matches", interactive=False)
440
  job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:", visible=False)
441
-
442
  with gr.Accordion("Job Details & Learning Plan", open=False, visible=False) as details_accordion:
443
  job_details_markdown = gr.Markdown()
444
-
445
  with gr.Tabs():
446
- with gr.TabItem("Duties"):
447
- duties_markdown = gr.Markdown()
448
- with gr.TabItem("Qualifications"):
449
- qualifications_markdown = gr.Markdown()
450
- with gr.TabItem("Full Description"):
451
- description_markdown = gr.Markdown()
452
-
453
  learning_plan_output = gr.HTML(label="Learning Plan")
454
  load_more_btn = gr.Button("Load More Skills", visible=False)
455
-
456
- # --- Event Handlers ---
457
- search_btn.click(
458
- fn=find_matches_and_rank_with_check,
459
- inputs=[dream_text, topk_slider, skills_text],
460
- outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row]
461
- )
462
- search_anyway_btn.click(
463
- fn=find_matches_and_rank_anyway,
464
- inputs=[dream_text, topk_slider, skills_text],
465
- outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row]
466
- )
467
- retype_btn.click(
468
- lambda: ("Status: Ready for you to retype.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(visible=False), gr.Row(visible=False)),
469
- outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row]
470
- )
471
- reset_btn.click(
472
- fn=on_reset,
473
- outputs=[dream_text, topk_slider, skills_text, df_output, initial_matches_state, job_selector, details_accordion, status_text, job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, spelling_alert, spelling_row, missing_skills_state, skills_offset_state, load_more_btn],
474
- queue=False
475
- )
476
- rerank_btn.click(
477
- fn=rerank_current_results,
478
- inputs=[initial_matches_state, skills_text, topk_slider],
479
- outputs=[status_text, df_output, job_selector]
480
- )
481
- job_selector.change(
482
- fn=on_select_job,
483
- inputs=[job_selector, skills_text],
484
- outputs=[job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, learning_plan_output, details_accordion, missing_skills_state, skills_offset_state, load_more_btn]
485
- )
486
- load_more_btn.click(
487
- fn=load_more_skills,
488
- inputs=[missing_skills_state, skills_offset_state],
489
- outputs=[learning_plan_output, skills_offset_state, load_more_btn]
490
- )
491
 
492
  ui.launch()
 
26
  STOPWORDS = set(stopwords.words('english'))
27
  stemmer = PorterStemmer()
28
 
29
+ # --- EXPANDED: Skill Whitelist with more business, finance, and consulting terms ---
30
  SKILL_WHITELIST = {
31
+ # Technical & Data
32
  'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue',
33
  'nodejs', 'django', 'flask', 'fastapi', 'spring boot', 'ruby on rails', 'php', 'swift', 'kotlin', 'dart', 'flutter',
34
  'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'pandas', 'numpy', 'matplotlib',
35
  'natural language processing', 'nlp', 'computer vision', 'data analysis', 'data science', 'data engineering',
36
+ 'big data', 'spark', 'hadoop', 'kafka', 'data visualization', 'tableau', 'power bi', 'd3.js', 'statistics', 'analytics',
37
  'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes', 'terraform', 'ansible', 'ci/cd', 'jenkins',
38
  'git', 'github', 'devops', 'linux', 'unix', 'shell scripting', 'powershell', 'cybersecurity', 'penetration testing',
39
+ 'network security', 'cryptography', 'blockchain', 'c#', '.net', 'sql server', 'mysql', 'postgresql', 'mongodb', 'redis',
40
+ 'elasticsearch', 'api design', 'rest apis', 'graphql', 'microservices', 'serverless', 'system design', 'saas',
41
+ # Business & Consulting
42
+ 'agile', 'scrum', 'project management', 'product management', 'consulting', 'client management', 'business development',
43
+ 'strategy', 'stakeholder management', 'risk management', 'compliance', 'aml', 'kyc', 'reinsurance', 'finance',
44
+ 'financial modeling', 'financial analysis', 'due diligence', 'sourcing', 'procurement', 'negotiation', 'supply chain',
45
+ 'business analysis', 'business intelligence', 'presentations', 'public speaking', 'time management', 'critical thinking',
46
+ # Soft & Other
47
  'leadership', 'stakeholder communication', 'client communication', 'teamwork', 'collaboration', 'problem solving',
48
+ 'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design', 'autocad', 'solidworks', 'sales', 'marketing',
49
+ 'seo', 'sem', 'content writing', 'customer support', 'technical writing', 'sap', 'oracle', 'budgeting', 'mentoring', 'supervising'
 
 
50
  }
51
+ # --------------------------------------------------------------------------------
52
 
53
  # --- GLOBAL STATE & DATA ---
54
  original_df = None
 
69
  s = re.sub(r'\s+', ' ', s)
70
  return s
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  def build_known_vocabulary(df: pd.DataFrame):
73
  global KNOWN_WORDS
74
  english_words = set(w.lower() for w in words.words())
 
137
  final_results_df = final_results_df.set_index('job_id', drop=False).rename(columns={'job_id': 'Job ID'})
138
  return final_results_df
139
 
140
+ # --- REWRITTEN: Skill scoring function using semantic similarity ---
141
+ def score_jobs_by_skills(user_skills: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
142
+ if df_to_rank is None or df_to_rank.empty or not user_skills:
143
+ return df_to_rank.sort_values(by='Similarity Score', ascending=False) if df_to_rank is not None else pd.DataFrame()
144
+
145
  ranked_df = df_to_rank.copy()
146
+ if 'Skills' not in ranked_df.columns:
147
+ return ranked_df.sort_values(by='Similarity Score', ascending=False)
148
+
149
+ # 1. Encode all user skills and all unique job skills across the dataframe ONCE for efficiency
150
+ user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
151
+ all_job_skills = sorted(list(set(skill for skills_list in ranked_df['Skills'] if skills_list for skill in skills_list)))
152
+
153
+ if not all_job_skills: # No skills to compare against
154
+ ranked_df['Skill Match Score'] = 0.0
155
+ return ranked_df
156
+
157
+ job_skill_embeddings = model.encode(all_job_skills, convert_to_tensor=True)
158
+
159
+ # 2. Calculate the similarity matrix between every user skill and every job skill
160
+ similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)
161
+
162
+ # 3. Define the new scoring function
163
+ def calculate_semantic_match(row, threshold=0.55):
164
+ job_skills_list = row.get('Skills', [])
165
+ if not job_skills_list:
166
+ return [], 0, 0.0
167
+
168
+ matched_skills_in_job = set()
169
+ for job_skill in job_skills_list:
170
+ try:
171
+ # Find which column in the matrix corresponds to the current job skill
172
+ job_skill_idx = all_job_skills.index(job_skill)
173
+ # Check if ANY of the user's skills meet the similarity threshold for this job skill
174
+ if torch.any(similarity_matrix[:, job_skill_idx] > threshold):
175
+ matched_skills_in_job.add(job_skill)
176
+ except (ValueError, IndexError):
177
+ continue
178
+
179
+ total_required = len(job_skills_list)
180
+ match_score = len(matched_skills_in_job) / total_required if total_required > 0 else 0.0
181
+ return list(matched_skills_in_job), len(matched_skills_in_job), match_score
182
+
183
+ # 4. Apply the new scoring function to each row
184
+ results = ranked_df.apply(lambda row: calculate_semantic_match(row), axis=1, result_type='expand')
185
  ranked_df[['Skill Matches', 'Skill Match Count', 'Skill Match Score']] = results
186
+
187
+ # 5. Sort by the new graded score
188
  ranked_df = ranked_df.sort_values(by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]).reset_index(drop=True)
189
  return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
190
+ # ----------------------------------------------------------------------
191
 
192
  def initialize_data_and_model():
193
  global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
 
194
  PROCESSED_DATA_PATH = "processed_jobs_with_skills.parquet"
195
 
196
  print("--- Initializing LLM Client ---")
197
+ if not initialize_llm_client(): print("Warning: LLM Client failed to initialize. Will use NLTK only for skills.")
 
198
 
199
  if os.path.exists(PROCESSED_DATA_PATH):
200
  print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
201
  original_df = pd.read_parquet(PROCESSED_DATA_PATH)
202
  else:
203
  print("--- No pre-processed data found. Starting one-time processing... ---")
 
204
  ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
205
  original_df = ds["original"].to_pandas()
206
 
 
207
  def extract_skills_llm(text: str) -> list[str]:
208
+ if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE: return []
 
 
209
  prompt = f"""
210
  Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
 
211
  [Example 1]
212
  Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."
213
  Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
 
214
  [Example 2]
215
  Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."
216
  Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
 
217
  [Actual Task]
218
  Text: "{text}"
219
  Extracted Skills:
 
224
  skills_part = generated_text.split("Extracted Skills:")[-1].strip()
225
  skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
226
  return list(dict.fromkeys(s.lower() for s in skills))
227
+ except Exception: return []
 
228
 
 
229
  def extract_skills_nltk(text: str) -> list[str]:
230
  if not isinstance(text, str): return []
231
  text_lower = text.lower()
 
234
  tokens = nltk.word_tokenize(text_lower)
235
  tagged_tokens = nltk.pos_tag(tokens)
236
  chunked_text = chunk_parser.parse(tagged_tokens)
 
237
  potential_skills = set()
238
  for subtree in chunked_text.subtrees():
239
  if subtree.label() == 'NP':
240
  phrase = " ".join(word for word, tag in subtree.leaves())
241
  normalized_phrase = _norm_skill_token(phrase)
 
242
  if normalized_phrase in SKILL_WHITELIST:
243
  potential_skills.add(normalized_phrase)
244
  return sorted(list(potential_skills))
245
 
 
246
  def extract_skills_hybrid(text: str) -> list[str]:
247
  llm_skills = extract_skills_llm(text)
248
  nltk_skills = extract_skills_nltk(text)
 
 
249
  combined_skills = set(llm_skills) | set(nltk_skills)
250
  return sorted(list(combined_skills))
251
 
 
253
  return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])
254
 
255
  original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
 
256
  print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
257
  original_df['Skills'] = original_df['text_for_skills'].progress_apply(extract_skills_hybrid)
258
  original_df = original_df.drop(columns=['text_for_skills'])
 
260
  print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
261
  original_df.to_parquet(PROCESSED_DATA_PATH)
262
 
 
263
  original_df['job_id'] = original_df.index
264
  def create_full_text(row):
265
  return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
 
288
  links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")]
289
  return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
290
 
 
291
  def get_job_matches(dream_job: str, top_n: int, skills_text: str):
292
  status = "Searching using hybrid model..."
293
  expanded_desc = llm_expand_query(dream_job)
 
298
  display_df = score_jobs_by_skills(user_skills, emb_matches)
299
  else:
300
  display_df = emb_matches
 
301
  display_df = display_df.head(top_n)
 
302
  if user_skills:
303
  status = f"Found and **re-ranked** results by your {len(user_skills)} skills. Displaying top {len(display_df)}."
304
  else:
305
  status = f"Found {len(display_df)} top matches using semantic search."
 
306
  table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
307
  if 'Skill Match Score' in display_df.columns:
308
+ table_to_show['Skill Match Score'] = display_df['Skill Match Score'].map('{:.2%}'.format)
 
309
  dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
310
  dropdown_value = dropdown_options[0][1] if dropdown_options else None
 
311
  return status, emb_matches, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True), gr.Accordion(visible=True)
312
 
313
  def rerank_current_results(initial_matches_df, skills_text, top_n):
314
  if initial_matches_df is None or pd.DataFrame(initial_matches_df).empty:
315
  return "Please find matches first before re-ranking.", pd.DataFrame(), gr.Dropdown(visible=False)
 
316
  initial_matches_df = pd.DataFrame(initial_matches_df)
 
317
  user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
318
  if not user_skills:
319
  status = "Skills cleared. Showing original semantic search results."
 
324
  status = f"Results **re-ranked** based on your {len(user_skills)} skills."
325
  display_df = ranked_df.head(top_n)
326
  table_to_show = display_df[['job_title', 'company', 'Similarity Score', 'Skill Match Score']]
327
+ table_to_show['Skill Match Score'] = table_to_show['Skill Match Score'].map('{:.2%}'.format)
328
  dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
329
  dropdown_value = dropdown_options[0][1] if dropdown_options else None
330
  return status, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True)
 
337
  word_list_html = ", ".join([f"<b><span style='color: #F87171;'>{w}</span></b>" for w in unrecognized_words])
338
  alert_message = f"<b><span style='color: #F87171;'>⚠️ Possible Spelling Error:</span></b> Unrecognized: {word_list_html}."
339
  return "Status: Awaiting confirmation.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(alert_message, visible=True), gr.Row(visible=True)
 
340
  status, emb_matches, table_to_show, dropdown, details_accordion = get_job_matches(dream_job, top_n, skills_text)
341
  return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)
342
 
 
345
  return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)
346
 
347
def on_select_job(job_id, skills_text):
    """Build the detail panes and the learning plan for the selected job.

    Parameters:
        job_id: index label into the global ``original_df`` (the dropdown's
            value), or ``None`` when nothing is selected.
        skills_text: raw comma-separated skills string typed by the user.

    Returns a 9-tuple wired to the Gradio outputs:
        (job details markdown, duties text, qualifications text, description
        text, learning-plan HTML, details accordion, full-skill-list state,
        skills-offset state, "Load More" button).

    Fix vs. previous revision: user-skill embeddings were re-encoded inside
    the inner ``any(...)`` for every (user skill, job skill) pair, i.e.
    O(U*S) ``model.encode`` calls. They are now encoded once each, and each
    job skill is encoded once; results are unchanged.
    """
    if job_id is None:
        return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)

    row = original_df.loc[job_id]
    title, company = str(row.get("job_title", "")), str(row.get("company", ""))
    job_details_markdown = f"### {title} — {company}"
    duties, qualifications, description = str(row.get('Duties', '')), str(row.get('qualifications', '')), str(row.get('Description', ''))

    # Normalize the user's comma-separated skills; drop tokens that normalize to empty.
    user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
    job_skills = row.get("Skills", [])

    if not job_skills:
        learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)

    if user_skills:
        # Encode each user skill exactly once (previously re-encoded once per
        # job skill). Each job skill is then encoded a single time inside
        # _is_covered.
        user_skill_embeddings = [model.encode(ut) for ut in user_skills]

        def _is_covered(skill):
            # A job skill counts as covered when its cosine similarity to at
            # least one user skill exceeds 0.55 (same threshold as before).
            skill_embedding = model.encode(skill)
            return any(util.cos_sim(ue, skill_embedding)[0][0] > 0.55 for ue in user_skill_embeddings)

        all_missing_skills = sorted([s for s in job_skills if not _is_covered(s)], key=lambda x: x.lower())
    else:
        # No user skills entered: nothing is covered, so every job skill is
        # "missing". No embedding work is done in this branch (as before —
        # the original's inner generator never evaluated model.encode here).
        all_missing_skills = sorted(job_skills, key=lambda x: x.lower())

    if not all_missing_skills:
        learning_plan_html = "<h4 style='color:green;'>🎉 You have all the required skills!</h4>"
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)

    if user_skills:
        # Skill-match score = fraction of the job's skills the user already covers.
        match_count = len(job_skills) - len(all_missing_skills)
        score_val = match_count / len(job_skills) if len(job_skills) > 0 else 0
        job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
        headline = "<b>Great fit!</b>" if score_val >= 0.8 else "<b>Good progress!</b>" if score_val >= 0.5 else "<b>Solid starting point.</b>"
        learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
        skills_to_display = all_missing_skills[:5]
        items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
        learning_plan_html += f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
        # NOTE(review): this branch deliberately hides "Load More" and passes
        # an empty skill-list state — kept identical to prior behavior.
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
    else:
        # No user skills: page through the job's full skill list (original
        # order), five at a time, via load_more_skills and the state outputs.
        headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
        skills_to_display = job_skills[:5]
        items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
        learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
        full_skill_list_for_state = job_skills
        new_offset = len(skills_to_display)
        should_button_be_visible = len(full_skill_list_for_state) > 5
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)
381
 
382
def load_more_skills(full_skills_list, current_offset):
    """Reveal the next batch of five skills in the learning plan.

    Parameters:
        full_skills_list: the complete list of skill names held in state.
        current_offset: how many skills are currently shown.

    Returns (learning-plan HTML, new offset, "Load More" button update);
    the button stays visible only while more skills remain.
    """
    step = 5
    next_offset = current_offset + step
    visible_skills = full_skills_list[:next_offset]

    entries = []
    for skill in visible_skills:
        entries.append(f"<li><b>{skill}</b><br>• Learn: {_course_links_for(skill)}</li>")

    plan_html = (
        "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
        f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(entries)}</ul>"
    )

    more_remaining = next_offset < len(full_skills_list)
    return plan_html, next_offset, gr.Button(visible=more_remaining)
390
 
391
def on_reset():
    """Restore every UI control to its initial state.

    Returns one value per output wired to the Reset button, in order:
    dream-job textbox, match-count slider (default 3), skills textbox,
    results table, stored-matches state, job dropdown (hidden), details
    accordion (hidden), status text, the four detail panes, spelling alert
    (hidden), spelling button row (hidden), skill-list state, skill offset
    state, and the "Load More" button (hidden).
    """
    blank = ""
    return (
        blank,                           # dream_text
        3,                               # topk_slider default
        blank,                           # skills_text
        pd.DataFrame(),                  # df_output
        None,                            # initial_matches_state
        gr.Dropdown(visible=False),      # job_selector
        gr.Accordion(visible=False),     # details_accordion
        "Status: Ready.",                # status_text
        blank, blank, blank, blank,      # job details / duties / quals / description
        gr.Markdown(visible=False),      # spelling_alert
        gr.Row(visible=False),           # spelling_row
        [],                              # missing_skills_state
        0,                               # skills_offset_state
        gr.Button(visible=False),        # load_more_btn
    )
393
 
 
394
# --- Application startup -----------------------------------------------------
# Run data/model initialization once at import time so the Gradio UI defined
# below can reference the loaded globals.
print("Starting application initialization...")
# initialize_data_and_model() presumably returns a status message — it is
# only printed here, never inspected. TODO(review): confirm failure behavior.
initialization_status = initialize_data_and_model()
print(initialization_status)
397
 
 
398
  with gr.Blocks(theme=gr.themes.Soft()) as ui:
399
  gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")
 
400
  initial_matches_state = gr.State()
401
  missing_skills_state = gr.State([])
402
  skills_offset_state = gr.State(0)
 
403
  with gr.Row():
404
  with gr.Column(scale=3):
405
  dream_text = gr.Textbox(label='Your Dream Job Description', lines=3, placeholder="e.g., 'A role in a tech startup focused on machine learning...'")
 
411
  topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of Matches")
412
  search_btn = gr.Button("Find Matches", variant="primary")
413
  reset_btn = gr.Button("Reset All")
 
414
  status_text = gr.Markdown("Status: Ready.")
415
  spelling_alert = gr.Markdown(visible=False)
416
  with gr.Row(visible=False) as spelling_row:
417
  search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
418
  retype_btn = gr.Button("Let Me Fix It", variant="stop")
 
419
  df_output = gr.DataFrame(label="Job Matches", interactive=False)
420
  job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:", visible=False)
 
421
  with gr.Accordion("Job Details & Learning Plan", open=False, visible=False) as details_accordion:
422
  job_details_markdown = gr.Markdown()
 
423
  with gr.Tabs():
424
+ with gr.TabItem("Duties"): duties_markdown = gr.Markdown()
425
+ with gr.TabItem("Qualifications"): qualifications_markdown = gr.Markdown()
426
+ with gr.TabItem("Full Description"): description_markdown = gr.Markdown()
 
 
 
 
427
  learning_plan_output = gr.HTML(label="Learning Plan")
428
  load_more_btn = gr.Button("Load More Skills", visible=False)
429
+ search_btn.click(fn=find_matches_and_rank_with_check, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
430
+ search_anyway_btn.click(fn=find_matches_and_rank_anyway, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
431
+ retype_btn.click(lambda: ("Status: Ready for you to retype.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(visible=False), gr.Row(visible=False)), outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
432
+ reset_btn.click(fn=on_reset, outputs=[dream_text, topk_slider, skills_text, df_output, initial_matches_state, job_selector, details_accordion, status_text, job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, spelling_alert, spelling_row, missing_skills_state, skills_offset_state, load_more_btn], queue=False)
433
+ rerank_btn.click(fn=rerank_current_results, inputs=[initial_matches_state, skills_text, topk_slider], outputs=[status_text, df_output, job_selector])
434
+ job_selector.change(fn=on_select_job, inputs=[job_selector, skills_text], outputs=[job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, learning_plan_output, details_accordion, missing_skills_state, skills_offset_state, load_more_btn])
435
+ load_more_btn.click(fn=load_more_skills, inputs=[missing_skills_state, skills_offset_state], outputs=[learning_plan_output, skills_offset_state, load_more_btn])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Start the Gradio app (serves the Blocks UI built above).
ui.launch()