zlf18 committed on
Commit a515df0 · verified · 1 Parent(s): 79d3345

Update app.py

Files changed (1): app.py (+418 -915)

app.py CHANGED
@@ -1,993 +1,496 @@
import pandas as pd
import datasets
from sentence_transformers import SentenceTransformer, util
import torch
import re
import nltk
from nltk.corpus import words, stopwords
import urllib.parse as _url
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from nltk.stem import PorterStemmer
import gradio as gr
import os
from tqdm import tqdm

tqdm.pandas()

# --- NLTK Data Download ---
for package in ['words', 'stopwords', 'averaged_perceptron_tagger', 'punkt']:
    try:
        nltk.data.find(f'corpora/{package}' if package in ['words', 'stopwords'] else f'taggers/{package}' if package == 'averaged_perceptron_tagger' else f'tokenizers/{package}')
    except LookupError:
        nltk.download(package)
 
STOPWORDS = set(stopwords.words('english'))
stemmer = PorterStemmer()

# --- Expanded Skill Whitelist ---
SKILL_WHITELIST = {
    # Technical & Data
    'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue',
    'nodejs', 'django', 'flask', 'fastapi', 'spring boot', 'ruby on rails', 'php', 'swift', 'kotlin', 'dart', 'flutter',
    'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'pandas', 'numpy', 'matplotlib',
    'natural language processing', 'nlp', 'computer vision', 'data analysis', 'data science', 'data engineering',
    'big data', 'spark', 'hadoop', 'kafka', 'data visualization', 'tableau', 'power bi', 'd3.js', 'statistics', 'analytics',
    'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes', 'terraform', 'ansible', 'ci/cd', 'jenkins',
    'git', 'github', 'devops', 'linux', 'unix', 'shell scripting', 'powershell', 'cybersecurity', 'penetration testing',
    'network security', 'cryptography', 'blockchain', 'c#', '.net', 'sql server', 'mysql', 'postgresql', 'mongodb', 'redis',
    'elasticsearch', 'api design', 'rest apis', 'graphql', 'microservices', 'serverless', 'system design', 'saas',
    # Business & Consulting
    'agile', 'scrum', 'project management', 'product management', 'consulting', 'client management', 'business development',
    'strategy', 'stakeholder management', 'risk management', 'compliance', 'aml', 'kyc', 'reinsurance', 'finance',
    'financial modeling', 'financial analysis', 'due diligence', 'sourcing', 'procurement', 'negotiation', 'supply chain',
    'business analysis', 'business intelligence', 'presentations', 'public speaking', 'time management', 'critical thinking',
    'design thinking', 'innovation', 'adaptability', 'supervisory', 'pmp', 'cpsm', 'cips', 'microsoft office', 'communication',
    'organizational skills',
    # Soft & Other
    'leadership', 'stakeholder communication', 'client communication', 'teamwork', 'collaboration', 'problem solving',
    'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design', 'autocad', 'solidworks', 'sales', 'marketing',
    'seo', 'sem', 'content writing', 'customer support', 'technical writing', 'sap', 'oracle', 'budgeting', 'mentoring', 'supervising'
}
 
# --- GLOBAL STATE & DATA ---
original_df = None
combined_df = None
model = None
combined_job_embeddings = None
original_job_title_embeddings = None
LLM_PIPELINE = None
LLM_MODEL_NAME = "microsoft/phi-2"
FINETUNED_MODEL_ID = "its-zion-18/projfinetuned"
KNOWN_WORDS = set()

# --- CORE NLP & HELPER FUNCTIONS ---
def _norm_skill_token(s: str) -> str:
    s = s.lower().strip()
    s = re.sub(r'[\(\)\[\]\{\}\*]', '', s)
    s = re.sub(r'^\W+|\W+$', '', s)
    s = re.sub(r'\s+', ' ', s)
    return s
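# Illustrative behaviour (sketch, not called anywhere):
#   _norm_skill_token("  (Machine Learning)  ")  -> "machine learning"
#   _norm_skill_token("Problem   Solving!")      -> "problem solving"
# Caveat: the edge trim also strips trailing symbols, so "c++" -> "c" and
# ".net" -> "net"; such whitelist entries are only caught by the direct
# regex scan in extract_skills_direct_scan further down.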
 
def build_known_vocabulary(df: pd.DataFrame):
    global KNOWN_WORDS
    english_words = set(w.lower() for w in words.words())
    job_words = set(re.findall(r'\b\w+\b', " ".join(df['full_text'].astype(str).tolist()).lower()))
    job_words = {w for w in job_words if w.isalpha() and len(w) > 2}
    KNOWN_WORDS = english_words | job_words
    return "Known vocabulary built."
 
def check_spelling_in_query(query: str) -> list[str]:
    words_in_query = query.lower().split()
    unrecognized_words = []
    if not KNOWN_WORDS: return []
    for word in words_in_query:
        if word.isalpha() and len(word) > 1 and word not in KNOWN_WORDS:
            unrecognized_words.append(word)
    return list(set(unrecognized_words))
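# Example: assuming none of these tokens occur in the English corpus or the
# job texts, check_spelling_in_query("macine lerning enginer") would return
# ["macine", "lerning", "enginer"] (order not guaranteed, via list(set(...))).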
 
def initialize_llm_client():
    global LLM_PIPELINE
    try:
        tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, trust_remote_code=True)
        model_llm = AutoModelForCausalLM.from_pretrained(
            LLM_MODEL_NAME, torch_dtype="auto", device_map="auto", trust_remote_code=True
        )
        LLM_PIPELINE = pipeline("text-generation", model=model_llm, tokenizer=tokenizer)
        return True
    except Exception as e:
        print(f"🚨 ERROR initializing local LLM: {e}")
        return False
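# Note: device_map="auto" relies on the `accelerate` package being installed;
# if loading raises, the False return here lets the caller fall back to the
# NLTK/regex extraction paths below.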
 
def llm_expand_query(user_input: str) -> str:
    global LLM_PIPELINE
    if not LLM_PIPELINE: return user_input
    prompt_template = (
        f"User's career interest: '{user_input}'\n"
        f"Instruction: Based on the user's interest, write a concise, single-sentence summary (40-60 words) that elaborates on the core intent, typical skills, and responsibilities. "
        f"Do not include a preamble, the user input, or any list formatting in the output. Just the expanded sentence.\n"
        f"Expanded Intent:"
    )
    try:
        response = LLM_PIPELINE(prompt_template, max_new_tokens=100, do_sample=True, temperature=0.6)
        expanded_query = response[0]['generated_text'].strip().split("Expanded Intent:")[-1].strip()
        final_query = user_input + ". " + expanded_query.replace('\n', ' ').replace(':', '').strip()
        final_query = final_query.replace('..', '.').strip()
        return final_query
    except Exception:
        return user_input
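# Example flow: for "ml engineer at a startup", phi-2 appends a one-sentence
# elaboration, so the string actually embedded downstream becomes
# "ml engineer at a startup. <generated summary>".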
 
def find_job_matches(original_user_query: str, expanded_user_query: str, top_k: int = 50) -> pd.DataFrame:
    expanded_user_embedding = model.encode(expanded_user_query, convert_to_tensor=True)
    general_similarity_scores = util.cos_sim(expanded_user_embedding, combined_job_embeddings)[0]
    top_indices = torch.topk(general_similarity_scores, k=len(combined_df))
    sorted_combined_df = combined_df.iloc[top_indices.indices.cpu()].copy()
    sorted_combined_df['general_score'] = top_indices.values.cpu().numpy()
    unique_matches = sorted_combined_df.drop_duplicates(subset=['job_id'], keep='first').set_index('job_id')
    original_user_embedding = model.encode(original_user_query, convert_to_tensor=True)
    title_boost_scores = util.cos_sim(original_user_embedding, original_job_title_embeddings)[0].cpu().numpy()
    title_boost_map = pd.Series(title_boost_scores, index=original_df['job_id'])
    unique_matches['title_boost_score'] = unique_matches.index.map(title_boost_map).fillna(0)
    unique_matches['Similarity Score'] = (0.70 * unique_matches['general_score'] + 0.30 * unique_matches['title_boost_score'])
    final_job_ids = unique_matches.sort_values(by='Similarity Score', ascending=False).head(top_k).index.tolist()
    final_results_df = original_df[original_df['job_id'].isin(final_job_ids)].copy()
    scores_df = unique_matches.reset_index()[['job_id', 'Similarity Score']].copy()
    final_results_df = pd.merge(final_results_df, scores_df, on='job_id', how='left')
    final_results_df = final_results_df.sort_values(by='Similarity Score', ascending=False).reset_index(drop=True)
    final_results_df = final_results_df.set_index('job_id', drop=False).rename(columns={'job_id': 'Job ID'})
    return final_results_df
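# Blend example: a posting with full-text similarity 0.62 and title
# similarity 0.80 scores 0.70*0.62 + 0.30*0.80 = 0.434 + 0.240 = 0.674, so a
# strong title match can outrank a slightly better body-text match.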
 
def score_jobs_by_skills(user_skills: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
    if df_to_rank is None or df_to_rank.empty or not user_skills:
        return df_to_rank.sort_values(by='Similarity Score', ascending=False) if df_to_rank is not None else pd.DataFrame()

    ranked_df = df_to_rank.copy()
    if 'Skills' not in ranked_df.columns:
        return ranked_df.sort_values(by='Similarity Score', ascending=False)

    user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
    all_job_skills = sorted(list(set(skill for skills_list in ranked_df['Skills'] if skills_list for skill in skills_list)))

    if not all_job_skills:
        ranked_df['Skill Match Score'] = 0.0
        ranked_df['Final Score'] = ranked_df['Similarity Score']
        return ranked_df

    job_skill_embeddings = model.encode(all_job_skills, convert_to_tensor=True)
    similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)

    def calculate_confidence_adjusted_score(row):
        job_skills_list = row.get('Skills', [])
        if not job_skills_list:
            return 0.0

        total_required = len(job_skills_list)
        sum_of_max_similarities = 0.0
        for job_skill in job_skills_list:
            try:
                job_skill_idx = all_job_skills.index(job_skill)
                max_sim = torch.max(similarity_matrix[:, job_skill_idx])
                sum_of_max_similarities += max_sim.item()
            except (ValueError, IndexError):
                continue

        avg_score = sum_of_max_similarities / total_required if total_required > 0 else 0.0
        skill_count_factor = min(1.0, total_required / 5.0)
        return avg_score * skill_count_factor

    ranked_df['Skill Match Score'] = ranked_df.apply(calculate_confidence_adjusted_score, axis=1)

    ranked_df['Final Score'] = (0.8 * ranked_df['Similarity Score']) + (0.2 * ranked_df['Skill Match Score'])

    ranked_df = ranked_df.sort_values(by='Final Score', ascending=False).reset_index(drop=True)
    return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
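# Worked example: a job listing 3 skills whose best matches against the
# user's skills are 0.9, 0.6 and 0.3 gets avg = 1.8/3 = 0.6 and confidence
# factor min(1.0, 3/5) = 0.6, i.e. Skill Match Score = 0.36; with a
# Similarity Score of 0.70, Final Score = 0.8*0.70 + 0.2*0.36 = 0.632.
# Jobs with very short skill lists are deliberately down-weighted.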
 
def initialize_data_and_model():
    global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
    PROCESSED_DATA_PATH = "processed_jobs_with_skills.parquet"

    print("--- Initializing LLM Client ---")
    if not initialize_llm_client(): print("Warning: LLM Client failed to initialize. Will use NLTK only for skills.")

    if os.path.exists(PROCESSED_DATA_PATH):
        print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
        original_df = pd.read_parquet(PROCESSED_DATA_PATH)
    else:
        print("--- No pre-processed data found. Starting one-time processing... ---")
        ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
        original_df = ds["original"].to_pandas()

        def extract_skills_llm(text: str) -> list[str]:
            if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE: return []
            prompt = f"""
Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
[Example 1]
Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."
Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
[Example 2]
Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."
Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
[Actual Task]
Text: "{text}"
Extracted Skills:
"""
            try:
                response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False, temperature=0.1)
                generated_text = response[0]['generated_text']
                skills_part = generated_text.split("Extracted Skills:")[-1].strip()
                skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
                return list(dict.fromkeys(s.lower() for s in skills))
            except Exception: return []

        def extract_skills_nltk(text: str) -> list[str]:
            if not isinstance(text, str): return []
            text_lower = text.lower()
            grammar = "NP: {<JJ.*>*<NN.*>+}"
            chunk_parser = nltk.RegexpParser(grammar)
            tokens = nltk.word_tokenize(text_lower)
            tagged_tokens = nltk.pos_tag(tokens)
            chunked_text = chunk_parser.parse(tagged_tokens)
            potential_skills = set()
            for subtree in chunked_text.subtrees():
                if subtree.label() == 'NP':
                    phrase = " ".join(word for word, tag in subtree.leaves())
                    if _norm_skill_token(phrase) in SKILL_WHITELIST:
                        potential_skills.add(_norm_skill_token(phrase))
            return sorted(list(potential_skills))

        def extract_skills_direct_scan(text: str) -> list[str]:
            if not isinstance(text, str): return []
            found_skills = set()
            for skill in SKILL_WHITELIST:
                if re.search(r'\b' + re.escape(skill) + r'\b', text, re.IGNORECASE):
                    found_skills.add(skill)
            return list(found_skills)

        # --- NEW: Function to expand a short skill list using the LLM ---
        def expand_skills_with_llm(job_title: str, existing_skills: list) -> list:
            if not LLM_PIPELINE or not job_title: return []

            skills_to_add = 6 - len(existing_skills)
            prompt = f"""
Instruct: A job has the title "{job_title}" and requires the skills: {', '.join(existing_skills)}.
Based on this, what are {skills_to_add} additional, closely related skills typically required for such a role?
List only the new skills, separated by commas. Do not repeat skills from the original list.

Additional Skills:
"""
            try:
                response = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5)
                generated_text = response[0]['generated_text']
                skills_part = generated_text.split("Additional Skills:")[-1].strip()
                new_skills = [skill.strip().lower() for skill in skills_part.split(',') if skill.strip()]
                return new_skills
            except Exception:
                return []

        def extract_skills_hybrid(row) -> list[str]:
            text = row['text_for_skills']
            job_title = row.get('Job title', '')  # Use original Job title for context

            llm_skills = extract_skills_llm(text)
            nltk_skills = extract_skills_nltk(text)
            direct_skills = extract_skills_direct_scan(text)
            combined_skills = set(llm_skills) | set(nltk_skills) | set(direct_skills)

            # If the combined list is still too short, expand it
            if len(combined_skills) < 6:
                expanded_skills = expand_skills_with_llm(job_title, list(combined_skills))
                combined_skills.update(expanded_skills)

            return sorted(list(combined_skills))
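        # The three extractors are complementary: the LLM can generalise
        # beyond the whitelist, the noun-phrase chunker catches multi-word
        # whitelist entries, and the direct regex scan guarantees exact hits
        # for symbol-heavy entries such as 'c++' or '.net'.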

        def create_text_for_skills(row):
            return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])

        original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
        print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
        # Apply the hybrid function row-wise to include job title context
        original_df['Skills'] = original_df.progress_apply(extract_skills_hybrid, axis=1)
        original_df = original_df.drop(columns=['text_for_skills'])

        print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
        original_df.to_parquet(PROCESSED_DATA_PATH)

    original_df['job_id'] = original_df.index
    def create_full_text(row): return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
    original_df["full_text"] = original_df.apply(create_full_text, axis=1)

    ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
    augmented_df = ds["augmented"].to_pandas()
    max_id = len(original_df) - 1
    augmented_df['job_id'] = augmented_df.index.map(lambda i: min(i // 20, max_id))
    augmented_df["full_text"] = augmented_df.apply(create_full_text, axis=1)

    combined_df = pd.concat([original_df.copy(), augmented_df.copy()], ignore_index=True)
    original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})

    print("--- Loading Fine-Tuned Sentence Transformer Model ---")
    model = SentenceTransformer(FINETUNED_MODEL_ID)
    print("--- Encoding Embeddings ---")
    combined_job_embeddings = model.encode(combined_df["full_text"].tolist(), convert_to_tensor=True, show_progress_bar=True)
    original_job_title_embeddings = model.encode(original_df["job_title"].tolist(), convert_to_tensor=True, show_progress_bar=True)
    print("--- Building Vocabulary ---")
    build_known_vocabulary(combined_df)
    return "--- Initialization Complete ---"
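# Mapping note: the augmented split is assumed to carry 20 generated variants
# per source posting, so i // 20 sends augmented rows 0-19 to job_id 0,
# rows 20-39 to job_id 1, and so on (capped at max_id).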

def _course_links_for(skill: str) -> str:
    q = _url.quote(skill)
    links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")]
    return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
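# Example: _course_links_for("power bi") quotes the query as "power%20bi" and
# emits four bullet-separated anchors, e.g.
# <a href="https://www.coursera.org/search?query=power%20bi" ...>Coursera</a> • ...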

def get_job_matches(dream_job: str, top_n: int, skills_text: str):
    status = "Searching using hybrid model..."
    expanded_desc = llm_expand_query(dream_job)
    emb_matches = find_job_matches(dream_job, expanded_desc, top_k=50)
    user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]

    if user_skills:
        display_df = score_jobs_by_skills(user_skills, emb_matches)
    else:
        display_df = emb_matches
    display_df = display_df.head(top_n)
    if user_skills:
        status = f"Found and **re-ranked** results by your {len(user_skills)} skills. Displaying top {len(display_df)}."
    else:
        status = f"Found {len(display_df)} top matches using semantic search."

    if 'Final Score' in display_df.columns:
        table_to_show = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']]
        table_to_show = table_to_show.rename(columns={'Final Score': 'Overall Score'})
        table_to_show['Skill Match Score'] = table_to_show['Skill Match Score'].map('{:.2%}'.format)
        table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
    else:
        table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
        table_to_show = table_to_show.rename(columns={'Similarity Score': 'Overall Score'})
        table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)

    # enumerate gives the display rank; the DataFrame index holds the job_id
    dropdown_options = [(f"{rank}. {row['job_title']} - {row['company']}", row.name) for rank, (_, row) in enumerate(display_df.iterrows(), start=1)]
    dropdown_value = dropdown_options[0][1] if dropdown_options else None
    return status, emb_matches, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True), gr.Accordion(visible=True)

def rerank_current_results(initial_matches_df, skills_text, top_n):
    if initial_matches_df is None or pd.DataFrame(initial_matches_df).empty:
        return "Please find matches first before re-ranking.", pd.DataFrame(), gr.Dropdown(visible=False)
    initial_matches_df = pd.DataFrame(initial_matches_df)
    user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]

    if not user_skills:
        status = "Skills cleared. Showing original semantic search results."
        display_df = initial_matches_df.head(top_n)
        table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
        table_to_show = table_to_show.rename(columns={'Similarity Score': 'Overall Score'})
        table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
    else:
        ranked_df = score_jobs_by_skills(user_skills, initial_matches_df)
        status = f"Results **re-ranked** based on your {len(user_skills)} skills."
        display_df = ranked_df.head(top_n)
        table_to_show = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']]
        table_to_show = table_to_show.rename(columns={'Final Score': 'Overall Score'})
        table_to_show['Skill Match Score'] = table_to_show['Skill Match Score'].map('{:.2%}'.format)
        table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)

    # enumerate gives the display rank; the DataFrame index holds the job_id
    dropdown_options = [(f"{rank}. {row['job_title']} - {row['company']}", row.name) for rank, (_, row) in enumerate(display_df.iterrows(), start=1)]
    dropdown_value = dropdown_options[0][1] if dropdown_options else None
    return status, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True)

def find_matches_and_rank_with_check(dream_job: str, top_n: int, skills_text: str):
    if not dream_job:
        return "Please describe your dream job first.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(""), gr.Row(visible=False)
    unrecognized_words = check_spelling_in_query(dream_job)
    if unrecognized_words:
        word_list_html = ", ".join([f"<b><span style='color: #F87171;'>{w}</span></b>" for w in unrecognized_words])
        alert_message = f"<b><span style='color: #F87171;'>⚠️ Possible Spelling Error:</span></b> Unrecognized: {word_list_html}."
        return "Status: Awaiting confirmation.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(alert_message, visible=True), gr.Row(visible=True)
    status, emb_matches, table_to_show, dropdown, details_accordion = get_job_matches(dream_job, top_n, skills_text)
    return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)

def find_matches_and_rank_anyway(dream_job: str, top_n: int, skills_text: str):
    status, emb_matches, table_to_show, dropdown, details_accordion = get_job_matches(dream_job, top_n, skills_text)
    return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)

def on_select_job(job_id, skills_text):
    if job_id is None: return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)
    row = original_df.loc[job_id]
    title, company = str(row.get("job_title", "")), str(row.get("company", ""))
    job_details_markdown = f"### {title} — {company}"
    duties, qualifications, description = str(row.get('Duties', '')), str(row.get('qualifications', '')), str(row.get('Description', ''))
    user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
    job_skills = row.get("Skills", [])
    if not job_skills:
        learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)

    score_val = 0
    all_missing_skills = job_skills
    if user_skills:
        user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
        job_skill_embeddings = model.encode(job_skills, convert_to_tensor=True)
        similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)

        sum_of_max_similarities = torch.sum(torch.max(similarity_matrix, dim=0).values)
        avg_score = (sum_of_max_similarities / len(job_skills)).item() if len(job_skills) > 0 else 0

        skill_count_factor = min(1.0, len(job_skills) / 5.0)
        score_val = avg_score * skill_count_factor

        matched_job_skills_mask = torch.any(similarity_matrix > 0.58, dim=0)
        all_missing_skills = [skill for i, skill in enumerate(job_skills) if not matched_job_skills_mask[i]]

    if user_skills and score_val >= 0.98:
        learning_plan_html = "<h4 style='color:green;'>🎉 You have all the required skills!</h4>"
        job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)

    if user_skills:
        job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
        headline = "<b>Great fit!</b>" if score_val >= 0.8 else "<b>Good progress!</b>" if score_val >= 0.5 else "<b>Solid starting point.</b>"
        learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
        skills_to_display = sorted(all_missing_skills)[:5]
        items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
        learning_plan_html += f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
    else:
        headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
        skills_to_display = sorted(job_skills)[:5]
        items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
        learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
        full_skill_list_for_state = sorted(job_skills)
        new_offset = len(skills_to_display)
        should_button_be_visible = len(full_skill_list_for_state) > 5
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)
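# A job skill counts as covered once any user skill exceeds cosine similarity
# 0.58 against it; near-synonyms (e.g. "data analysis" vs "data analytics")
# will typically clear the bar, while unrelated skills stay in the gap list.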

def load_more_skills(full_skills_list, current_offset):
    SKILLS_INCREMENT = 5
    new_offset = current_offset + SKILLS_INCREMENT
    skills_to_display = full_skills_list[:new_offset]
    items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
    learning_plan_html = f"<h4>To be a good fit for this role, you'll need to learn these skills:</h4><ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
    should_button_be_visible = new_offset < len(full_skills_list)
    return learning_plan_html, new_offset, gr.Button(visible=should_button_be_visible)
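# Pagination example: with 12 extracted skills the plan shows 5 first; each
# "Load More Skills" click advances the offset by 5 (5 -> 10 -> 15), and once
# the offset reaches past the list length the button hides itself.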

def on_reset():
    return ("", 3, "", pd.DataFrame(), None, gr.Dropdown(visible=False), gr.Accordion(visible=False), "Status: Ready.", "", "", "", "", gr.Markdown(visible=False), gr.Row(visible=False), [], 0, gr.Button(visible=False))

print("Starting application initialization...")
initialization_status = initialize_data_and_model()
print(initialization_status)

with gr.Blocks(theme=gr.themes.Soft()) as ui:
    gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")
    initial_matches_state = gr.State()
    missing_skills_state = gr.State([])
    skills_offset_state = gr.State(0)
    with gr.Row():
        with gr.Column(scale=3):
            dream_text = gr.Textbox(label='Your Dream Job Description', lines=3, placeholder="e.g., 'A role in a tech startup focused on machine learning...'")
            with gr.Accordion("Optional: Add Your Skills to Re-rank Results", open=False):
                with gr.Row():
                    skills_text = gr.Textbox(label='Your Skills (comma-separated)', placeholder="e.g., Python, data analysis", scale=3)
                    rerank_btn = gr.Button("Re-rank", variant="secondary", scale=1)
        with gr.Column(scale=1):
            topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of Matches")
            search_btn = gr.Button("Find Matches", variant="primary")
            reset_btn = gr.Button("Reset All")
    status_text = gr.Markdown("Status: Ready.")
    spelling_alert = gr.Markdown(visible=False)
    with gr.Row(visible=False) as spelling_row:
        search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
        retype_btn = gr.Button("Let Me Fix It", variant="stop")
    df_output = gr.DataFrame(label="Job Matches", interactive=False)
    job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:", visible=False)
    with gr.Accordion("Job Details & Learning Plan", open=False, visible=False) as details_accordion:
        job_details_markdown = gr.Markdown()
        with gr.Tabs():
            with gr.TabItem("Duties"): duties_markdown = gr.Markdown()
            with gr.TabItem("Qualifications"): qualifications_markdown = gr.Markdown()
            with gr.TabItem("Full Description"): description_markdown = gr.Markdown()
        learning_plan_output = gr.HTML(label="Learning Plan")
        load_more_btn = gr.Button("Load More Skills", visible=False)
    search_btn.click(fn=find_matches_and_rank_with_check, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
    search_anyway_btn.click(fn=find_matches_and_rank_anyway, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
    retype_btn.click(lambda: ("Status: Ready for you to retype.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(visible=False), gr.Row(visible=False)), outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
    reset_btn.click(fn=on_reset, outputs=[dream_text, topk_slider, skills_text, df_output, initial_matches_state, job_selector, details_accordion, status_text, job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, spelling_alert, spelling_row, missing_skills_state, skills_offset_state, load_more_btn], queue=False)
    rerank_btn.click(fn=rerank_current_results, inputs=[initial_matches_state, skills_text, topk_slider], outputs=[status_text, df_output, job_selector])
    job_selector.change(fn=on_select_job, inputs=[job_selector, skills_text], outputs=[job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, learning_plan_output, details_accordion, missing_skills_state, skills_offset_state, load_more_btn])
    load_more_btn.click(fn=load_more_skills, inputs=[missing_skills_state, skills_offset_state], outputs=[learning_plan_output, skills_offset_state, load_more_btn])

ui.launch()
1
  import pandas as pd
 
2
  import datasets
 
3
  from sentence_transformers import SentenceTransformer, util
 
4
  import torch
 
5
  import re
 
6
  import nltk
 
7
  from nltk.corpus import words, stopwords
 
8
  import urllib.parse as _url
 
9
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
10
  from nltk.stem import PorterStemmer
 
11
  import gradio as gr
 
12
  import os
 
13
  from tqdm import tqdm
14
 
 
 
15
  tqdm.pandas()
16
 
 
 
17
  # --- NLTK Data Download ---
 
18
  for package in ['words', 'stopwords', 'averaged_perceptron_tagger', 'punkt']:
19
+ try:
20
+ nltk.data.find(f'corpora/{package}' if package in ['words', 'stopwords'] else f'taggers/{package}' if package == 'averaged_perceptron_tagger' else f'tokenizers/{package}')
21
+ except LookupError:
22
+ nltk.download(package)
 
 
 
 
 
 
23
 
24
  STOPWORDS = set(stopwords.words('english'))
 
25
  stemmer = PorterStemmer()
26
 
 
 
27
  # --- Expanded Skill Whitelist ---
 
28
  SKILL_WHITELIST = {
29
+ # Technical & Data
30
+ 'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue',
31
+ 'nodejs', 'django', 'flask', 'fastapi', 'spring boot', 'ruby on rails', 'php', 'swift', 'kotlin', 'dart', 'flutter',
32
+ 'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'pandas', 'numpy', 'matplotlib',
33
+ 'natural language processing', 'nlp', 'computer vision', 'data analysis', 'data science', 'data engineering',
34
+ 'big data', 'spark', 'hadoop', 'kafka', 'data visualization', 'tableau', 'power bi', 'd3.js', 'statistics', 'analytics',
35
+ 'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes', 'terraform', 'ansible', 'ci/cd', 'jenkins',
36
+ 'git', 'github', 'devops', 'linux', 'unix', 'shell scripting', 'powershell', 'cybersecurity', 'penetration testing',
37
+ 'network security', 'cryptography', 'blockchain', 'c#', '.net', 'sql server', 'mysql', 'postgresql', 'mongodb', 'redis',
38
+ 'elasticsearch', 'api design', 'rest apis', 'graphql', 'microservices', 'serverless', 'system design', 'saas',
39
+ # Business & Consulting
40
+ 'agile', 'scrum', 'project management', 'product management', 'consulting', 'client management', 'business development',
41
+ 'strategy', 'stakeholder management', 'risk management', 'compliance', 'aml', 'kyc', 'reinsurance', 'finance',
42
+ 'financial modeling', 'financial analysis', 'due diligence', 'sourcing', 'procurement', 'negotiation', 'supply chain',
43
+ 'business analysis', 'business intelligence', 'presentations', 'public speaking', 'time management', 'critical thinking',
44
+ 'design thinking', 'innovation', 'adaptability', 'supervisory', 'pmp', 'cpsm', 'cips', 'microsoft office', 'communication',
45
+ 'organizational skills',
46
+ # Soft & Other
47
+ 'leadership', 'stakeholder communication', 'client communication', 'teamwork', 'collaboration', 'problem solving',
48
+ 'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design', 'autocad', 'solidworks', 'sales', 'marketing',
49
+ 'seo', 'sem', 'content writing', 'customer support', 'technical writing', 'sap', 'oracle', 'budgeting', 'mentoring', 'supervising'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  }
51
 
 
 
52
  # --- GLOBAL STATE & DATA ---
 
53
  original_df = None
 
54
  combined_df = None
 
55
  model = None
 
56
  combined_job_embeddings = None
 
57
  original_job_title_embeddings = None
 
58
  LLM_PIPELINE = None
 
59
  LLM_MODEL_NAME = "microsoft/phi-2"
 
60
  FINETUNED_MODEL_ID = "its-zion-18/projfinetuned"
 
61
  KNOWN_WORDS = set()
62
 
 
 
63
  # --- CORE NLP & HELPER FUNCTIONS ---
 
64
  def _norm_skill_token(s: str) -> str:
65
+ s = s.lower().strip()
66
+ s = re.sub(r'[\(\)\[\]\{\}\*]', '', s)
67
+ s = re.sub(r'^\W+|\W+$', '', s)
68
+ s = re.sub(r'\s+', ' ', s)
69
+ return s
 
 
 
 
 
 
 
70
 
71
  def build_known_vocabulary(df: pd.DataFrame):
72
+ global KNOWN_WORDS
73
+ english_words = set(w.lower() for w in words.words())
74
+ job_words = set(re.findall(r'\b\w+\b', " ".join(df['full_text'].astype(str).tolist()).lower()))
75
+ job_words = {w for w in job_words if w.isalpha() and len(w) > 2}
76
+ KNOWN_WORDS = english_words | job_words
77
+ return "Known vocabulary built."
 
 
 
 
 
 
 
 
78
 
79
  def check_spelling_in_query(query: str) -> list[str]:
80
+ words_in_query = query.lower().split()
81
+ unrecognized_words = []
82
+ if not KNOWN_WORDS: return []
83
+ for word in words_in_query:
84
+ if word.isalpha() and len(word) > 1 and word not in KNOWN_WORDS:
85
+ unrecognized_words.append(word)
86
+ return list(set(unrecognized_words))
 
 
 
 
 
 
 
 
 
87
 
88
  def initialize_llm_client():
89
+ global LLM_PIPELINE
90
+ try:
91
+ tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, trust_remote_code=True)
92
+ model_llm = AutoModelForCausalLM.from_pretrained(
93
+ LLM_MODEL_NAME, torch_dtype="auto", device_map="auto", trust_remote_code=True
94
+ )
95
+ LLM_PIPELINE = pipeline("text-generation", model=model_llm, tokenizer=tokenizer)
96
+ return True
97
+ except Exception as e:
98
+ print(f"🚨 ERROR initializing local LLM: {e}")
99
+ return False
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  def llm_expand_query(user_input: str) -> str:
102
+ global LLM_PIPELINE
103
+ if not LLM_PIPELINE: return user_input
104
+ prompt_template = (
105
+ f"User's career interest: '{user_input}'\n"
106
+ f"Instruction: Based on the user's interest, write a concise, single-sentence summary (40-60 words) that elaborates on the core intent, typical skills, and responsibilities. "
107
+ f"Do not include a preamble, the user input, or any list formatting in the output. Just the expanded sentence.\n"
108
+ f"Expanded Intent:"
109
+ )
110
+ try:
111
+ response = LLM_PIPELINE(prompt_template, max_new_tokens=100, do_sample=True, temperature=0.6)
112
+ expanded_query = response[0]['generated_text'].strip().split("Expanded Intent:")[-1].strip()
113
+ final_query = user_input + ". " + expanded_query.replace('\n', ' ').replace(':', '').strip()
114
+ final_query = final_query.replace('..', '.').strip()
115
+ return final_query
116
+ except Exception:
117
+ return user_input
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  def find_job_matches(original_user_query: str, expanded_user_query: str, top_k: int = 50) -> pd.DataFrame:
120
+ expanded_user_embedding = model.encode(expanded_user_query, convert_to_tensor=True)
121
+ general_similarity_scores = util.cos_sim(expanded_user_embedding, combined_job_embeddings)[0]
122
+ top_indices = torch.topk(general_similarity_scores, k=len(combined_df))
123
+ sorted_combined_df = combined_df.iloc[top_indices.indices.cpu()].copy()
124
+ sorted_combined_df['general_score'] = top_indices.values.cpu().numpy()
125
+ unique_matches = sorted_combined_df.drop_duplicates(subset=['job_id'], keep='first').set_index('job_id')
126
+ original_user_embedding = model.encode(original_user_query, convert_to_tensor=True)
127
+ title_boost_scores = util.cos_sim(original_user_embedding, original_job_title_embeddings)[0].cpu().numpy()
128
+ title_boost_map = pd.Series(title_boost_scores, index=original_df['job_id'])
129
+ unique_matches['title_boost_score'] = unique_matches.index.map(title_boost_map).fillna(0)
130
+ unique_matches['Similarity Score'] = (0.70 * unique_matches['general_score'] + 0.30 * unique_matches['title_boost_score'])
131
+ final_job_ids = unique_matches.sort_values(by='Similarity Score', ascending=False).head(top_k).index.tolist()
132
+ final_results_df = original_df[original_df['job_id'].isin(final_job_ids)].copy()
133
+ scores_df = unique_matches.reset_index()[['job_id', 'Similarity Score']].copy()
134
+ final_results_df = pd.merge(final_results_df, scores_df, on='job_id', how='left')
135
+ final_results_df = final_results_df.sort_values(by='Similarity Score', ascending=False).reset_index(drop=True)
136
+ final_results_df = final_results_df.set_index('job_id', drop=False).rename(columns={'job_id': 'Job ID'})
137
+ return final_results_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  def score_jobs_by_skills(user_skills: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
140
+ if df_to_rank is None or df_to_rank.empty or not user_skills:
141
+ return df_to_rank.sort_values(by='Similarity Score', ascending=False) if df_to_rank is not None else pd.DataFrame()
142
+
143
+ ranked_df = df_to_rank.copy()
144
+ if 'Skills' not in ranked_df.columns:
145
+ return ranked_df.sort_values(by='Similarity Score', ascending=False)
146
+
147
+ user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
148
+ all_job_skills = sorted(list(set(skill for skills_list in ranked_df['Skills'] if skills_list for skill in skills_list)))
149
+
150
+ if not all_job_skills:
151
+ ranked_df['Skill Match Score'] = 0.0
152
+ ranked_df['Final Score'] = ranked_df['Similarity Score']
153
+ return ranked_df
154
+
155
+ job_skill_embeddings = model.encode(all_job_skills, convert_to_tensor=True)
156
+ similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)
157
+
158
+ def calculate_confidence_adjusted_score(row):
159
+ job_skills_list = row.get('Skills', [])
160
+ if not job_skills_list:
161
+ return 0.0
162
+
163
+ total_required = len(job_skills_list)
164
+ sum_of_max_similarities = 0.0
165
+ for job_skill in job_skills_list:
166
+ try:
167
+ job_skill_idx = all_job_skills.index(job_skill)
168
+ max_sim = torch.max(similarity_matrix[:, job_skill_idx])
169
+ sum_of_max_similarities += max_sim.item()
170
+ except (ValueError, IndexError):
171
+ continue
172
+
173
+ avg_score = sum_of_max_similarities / total_required if total_required > 0 else 0.0
174
+ skill_count_factor = min(1.0, total_required / 5.0)
175
+ return avg_score * skill_count_factor
176
+
177
+ ranked_df['Skill Match Score'] = ranked_df.apply(calculate_confidence_adjusted_score, axis=1)
178
+
179
+ ranked_df['Final Score'] = (0.8 * ranked_df['Similarity Score']) + (0.2 * ranked_df['Skill Match Score'])
180
+
181
+ ranked_df = ranked_df.sort_values(by='Final Score', ascending=False).reset_index(drop=True)
182
+ return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
  def initialize_data_and_model():
185
+ global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
186
+ PROCESSED_DATA_PATH = "processed_jobs_with_skills.parquet"
187
+
188
+ print("--- Initializing LLM Client ---")
189
+ if not initialize_llm_client(): print("Warning: LLM Client failed to initialize. Will use NLTK only for skills.")
190
+
191
+ if os.path.exists(PROCESSED_DATA_PATH):
192
+ print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
193
+ original_df = pd.read_parquet(PROCESSED_DATA_PATH)
194
+ else:
195
+ print("--- No pre-processed data found. Starting one-time processing... ---")
196
+ ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
197
+ original_df = ds["original"].to_pandas()
198
+
199
+ def extract_skills_llm(text: str) -> list[str]:
200
+ if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE: return []
201
+ prompt = f"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
 
203
  [Example 1]
 
204
  Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."
 
205
  Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
 
206
  [Example 2]
 
207
  Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."
 
208
  Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
 
209
  [Actual Task]
 
210
  Text: "{text}"
 
211
  Extracted Skills:
 
212
  """
213
+ try:
214
+ response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False, temperature=0.1)
215
+ generated_text = response[0]['generated_text']
216
+ skills_part = generated_text.split("Extracted Skills:")[-1].strip()
217
+ skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
218
+ return list(dict.fromkeys(s.lower() for s in skills))
219
+ except Exception: return []
220
+
221
+ def extract_skills_nltk(text: str) -> list[str]:
222
+ if not isinstance(text, str): return []
223
+ text_lower = text.lower()
224
+ grammar = "NP: {<JJ.*>*<NN.*>+}"
225
+ chunk_parser = nltk.RegexpParser(grammar)
226
+ tokens = nltk.word_tokenize(text_lower)
227
+ tagged_tokens = nltk.pos_tag(tokens)
228
+ chunked_text = chunk_parser.parse(tagged_tokens)
229
+ potential_skills = set()
230
+ for subtree in chunked_text.subtrees():
231
+ if subtree.label() == 'NP':
232
+ phrase = " ".join(word for word, tag in subtree.leaves())
233
+ if _norm_skill_token(phrase) in SKILL_WHITELIST:
234
+ potential_skills.add(_norm_skill_token(phrase))
235
+ return sorted(list(potential_skills))
236
+
237
+ def extract_skills_direct_scan(text: str) -> list[str]:
238
+ if not isinstance(text, str): return []
239
+ found_skills = set()
240
+ for skill in SKILL_WHITELIST:
241
+ if re.search(r'\b' + re.escape(skill) + r'\b', text, re.IGNORECASE):
242
+ found_skills.add(skill)
243
+ return list(found_skills)
244
+
245
+ def expand_skills_with_llm(job_title: str, existing_skills: list) -> list:
246
+ if not LLM_PIPELINE or not job_title: return []
247
+
248
+ skills_to_add = 6 - len(existing_skills)
249
+ prompt = f"""
250
+ Instruct: A job has the title "{job_title}" and requires the skills: {', '.join(existing_skills)}.
251
+ Based on this, what are {skills_to_add} additional, closely related skills typically required for such a role?
252
+ List only the new skills, separated by commas. Do not repeat skills from the original list.
253
 
254
+ Additional Skills:
255
+ """
256
+ try:
257
+ response = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5)
258
+ generated_text = response[0]['generated_text']
259
+ skills_part = generated_text.split("Additional Skills:")[-1].strip()
260
+ new_skills = [skill.strip().lower() for skill in skills_part.split(',') if skill.strip()]
261
+ return new_skills
262
+ except Exception:
263
+ return []
264
+
265
+ def extract_skills_hybrid(row) -> list[str]:
266
+ text = row['text_for_skills']
267
+ job_title = row.get('Job title', '') # Use original Job title for context
268
+
269
+ llm_skills = extract_skills_llm(text)
270
+ nltk_skills = extract_skills_nltk(text)
271
+ direct_skills = extract_skills_direct_scan(text)
272
+ combined_skills = set(llm_skills) | set(nltk_skills) | set(direct_skills)
273
+
274
+ # If the combined list is still too short, expand it
275
+ if len(combined_skills) < 6:
276
+ expanded_skills = expand_skills_with_llm(job_title, list(combined_skills))
277
+ combined_skills.update(expanded_skills)
278
+
279
+ return sorted(list(combined_skills))
280
+
281
+ def create_text_for_skills(row):
282
+ return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])
283
+
284
+ original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
285
+ print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
286
+ # Apply the hybrid function row-wise to include job title context
287
+ original_df['Skills'] = original_df.progress_apply(extract_skills_hybrid, axis=1)
288
+ original_df = original_df.drop(columns=['text_for_skills'])
289
+
290
+ print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
291
+ original_df.to_parquet(PROCESSED_DATA_PATH)
292
+
293
+ original_df['job_id'] = original_df.index
294
+ def create_full_text(row): return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
295
+ original_df["full_text"] = original_df.apply(create_full_text, axis=1)
296
+
297
+ ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
298
+ augmented_df = ds["augmented"].to_pandas()
299
+ max_id = len(original_df) - 1
300
+ augmented_df['job_id'] = augmented_df.index.map(lambda i: min(i // 20, max_id))
301
+ augmented_df["full_text"] = augmented_df.apply(create_full_text, axis=1)
302
+
303
+ combined_df = pd.concat([original_df.copy(), augmented_df.copy()], ignore_index=True)
304
+ original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})
305
+
306
+ print("--- Loading Fine-Tuned Sentence Transformer Model ---")
307
+ model = SentenceTransformer(FINETUNED_MODEL_ID)
308
+ print("--- Encoding Embeddings ---")
309
+ combined_job_embeddings = model.encode(combined_df["full_text"].tolist(), convert_to_tensor=True, show_progress_bar=True)
310
+ original_job_title_embeddings = model.encode(original_df["job_title"].tolist(), convert_to_tensor=True, show_progress_bar=True)
311
+ print("--- Building Vocabulary ---")
312
+ build_known_vocabulary(combined_df)
313
+ return "--- Initialization Complete ---"
314
 
315
+ def _course_links_for(skill: str) -> str:
316
+ q = _url.quote(skill)
317
+ links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")]
318
+ return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
319
 
320
+ def get_job_matches(dream_job: str, top_n: int, skills_text: str):
321
+ status = "Searching using hybrid model..."
322
+ expanded_desc = llm_expand_query(dream_job)
323
+ emb_matches = find_job_matches(dream_job, expanded_desc, top_k=50)
324
+ user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
325
+
326
+ if user_skills:
327
+ display_df = score_jobs_by_skills(user_skills, emb_matches)
328
+ else:
329
+ display_df = emb_matches
330
+ display_df = display_df.head(top_n)
331
+ if user_skills:
332
+ status = f"Found and **re-ranked** results by your {len(user_skills)} skills. Displaying top {len(display_df)}."
333
+ else:
334
+ status = f"Found {len(display_df)} top matches using semantic search."
335
+
336
+ if 'Final Score' in display_df.columns:
337
+ table_to_show = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']]
338
+ table_to_show = table_to_show.rename(columns={'Final Score': 'Overall Score'})
339
+ table_to_show['Skill Match Score'] = table_to_show['Skill Match Score'].map('{:.2%}'.format)
340
+ table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
341
+ else:
342
+ table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
343
+ table_to_show = table_to_show.rename(columns={'Similarity Score': 'Overall Score'})
344
+ table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
345
+
346
+ dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
347
+ dropdown_value = dropdown_options[0][1] if dropdown_options else None
348
+ return status, emb_matches, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True), gr.Accordion(visible=True)
349
 
350
+ def rerank_current_results(initial_matches_df, skills_text, top_n):
+     if initial_matches_df is None or pd.DataFrame(initial_matches_df).empty:
+         return "Please find matches first before re-ranking.", pd.DataFrame(), gr.Dropdown(visible=False)
+     initial_matches_df = pd.DataFrame(initial_matches_df)
+     user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
+
+     if not user_skills:
+         status = "Skills cleared. Showing original semantic search results."
+         display_df = initial_matches_df.head(top_n)
+         table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
+         table_to_show = table_to_show.rename(columns={'Similarity Score': 'Overall Score'})
+         table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
+     else:
+         ranked_df = score_jobs_by_skills(user_skills, initial_matches_df)
+         status = f"Results **re-ranked** based on your {len(user_skills)} skills."
+         display_df = ranked_df.head(top_n)
+         table_to_show = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']]
+         table_to_show = table_to_show.rename(columns={'Final Score': 'Overall Score'})
+         table_to_show['Skill Match Score'] = table_to_show['Skill Match Score'].map('{:.2%}'.format)
+         table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
+
+     # Same fix as get_job_matches: number options by display rank, not DataFrame index.
+     dropdown_options = [(f"{rank}. {row['job_title']} - {row['company']}", row.name) for rank, (_, row) in enumerate(display_df.iterrows(), start=1)]
+     dropdown_value = dropdown_options[0][1] if dropdown_options else None
+     return status, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True)

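+ # Entry point for the "Find Matches" button: runs a spell check on the query first
+ # and asks for confirmation before searching if any words look misspelled.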
+ def find_matches_and_rank_with_check(dream_job: str, top_n: int, skills_text: str):
+     if not dream_job:
+         return "Please describe your dream job first.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(""), gr.Row(visible=False)
+     unrecognized_words = check_spelling_in_query(dream_job)
+     if unrecognized_words:
+         word_list_html = ", ".join([f"<b><span style='color: #F87171;'>{w}</span></b>" for w in unrecognized_words])
+         alert_message = f"<b><span style='color: #F87171;'>⚠️ Possible Spelling Error:</span></b> Unrecognized: {word_list_html}."
+         return "Status: Awaiting confirmation.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(alert_message, visible=True), gr.Row(visible=True)
+     status, emb_matches, table_to_show, dropdown, details_accordion = get_job_matches(dream_job, top_n, skills_text)
+     return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)

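+ # Entry point for the "Search Anyway" button: same search, skipping the spell check.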
+ def find_matches_and_rank_anyway(dream_job: str, top_n: int, skills_text: str):
+     status, emb_matches, table_to_show, dropdown, details_accordion = get_job_matches(dream_job, top_n, skills_text)
+     return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)

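+ # Build the detail view and learning plan for the selected job. The skill-match score
+ # is asymmetric: for each skill the job requires, take its best cosine similarity
+ # against the user's skills, average those maxima, then damp the result for jobs with
+ # fewer than five extracted skills so sparse listings don't score artificially high.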
+ def on_select_job(job_id, skills_text):
+     if job_id is None:
+         return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)
+     row = original_df.loc[job_id]
+     title, company = str(row.get("job_title", "")), str(row.get("company", ""))
+     job_details_markdown = f"### {title} — {company}"
+     duties, qualifications, description = str(row.get('Duties', '')), str(row.get('qualifications', '')), str(row.get('Description', ''))
+     user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
+     job_skills = row.get("Skills", [])
+     if not job_skills:
+         learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
+         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
+
+     score_val = 0
+     all_missing_skills = job_skills
+     if user_skills:
+         user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
+         job_skill_embeddings = model.encode(job_skills, convert_to_tensor=True)
+         similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)
+
+         # For each required job skill, keep its best similarity to any user skill, then average.
+         sum_of_max_similarities = torch.sum(torch.max(similarity_matrix, dim=0).values)
+         avg_score = (sum_of_max_similarities / len(job_skills)).item() if len(job_skills) > 0 else 0
+
+         # Penalize jobs with very short skill lists (fewer than 5 extracted skills).
+         skill_count_factor = min(1.0, len(job_skills) / 5.0)
+         score_val = avg_score * skill_count_factor
+
+         # A job skill counts as covered if any user skill clears the 0.58 cosine-similarity threshold.
+         matched_job_skills_mask = torch.any(similarity_matrix > 0.58, dim=0)
+         all_missing_skills = [skill for i, skill in enumerate(job_skills) if not matched_job_skills_mask[i]]
+
+     if user_skills and score_val >= 0.98:
+         learning_plan_html = "<h4 style='color:green;'>🎉 You have all the required skills!</h4>"
+         job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
+         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
+
+     if user_skills:
+         job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
+         headline = "<b>Great fit!</b>" if score_val >= 0.8 else "<b>Good progress!</b>" if score_val >= 0.5 else "<b>Solid starting point.</b>"
+         learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
+         skills_to_display = sorted(all_missing_skills)[:5]
+         items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
+         learning_plan_html += f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
+         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
+     else:
+         headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
+         skills_to_display = sorted(job_skills)[:5]
+         items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
+         learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
+         full_skill_list_for_state = sorted(job_skills)
+         new_offset = len(skills_to_display)
+         should_button_be_visible = len(full_skill_list_for_state) > 5
+         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)

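+ # Reveal the next batch of five skills in the learning plan.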
+ def load_more_skills(full_skills_list, current_offset):
+     SKILLS_INCREMENT = 5
+     new_offset = current_offset + SKILLS_INCREMENT
+     skills_to_display = full_skills_list[:new_offset]
+     items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
+     learning_plan_html = f"<h4>To be a good fit for this role, you'll need to learn these skills:</h4><ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
+     should_button_be_visible = new_offset < len(full_skills_list)
+     return learning_plan_html, new_offset, gr.Button(visible=should_button_be_visible)

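+ # Restore every component to its initial value; the 17-item return tuple must
+ # match the order of reset_btn.click()'s outputs list below.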
+ def on_reset():
+     return ("", 3, "", pd.DataFrame(), None, gr.Dropdown(visible=False), gr.Accordion(visible=False), "Status: Ready.", "", "", "", "", gr.Markdown(visible=False), gr.Row(visible=False), [], 0, gr.Button(visible=False))
+
+ print("Starting application initialization...")
+ initialization_status = initialize_data_and_model()
+ print(initialization_status)

  with gr.Blocks(theme=gr.themes.Soft()) as ui:
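+     # Layout: query and skills inputs on the left, match count and action buttons on the right.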
+     gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")
+     initial_matches_state = gr.State()
+     missing_skills_state = gr.State([])
+     skills_offset_state = gr.State(0)
+     with gr.Row():
+         with gr.Column(scale=3):
+             dream_text = gr.Textbox(label='Your Dream Job Description', lines=3, placeholder="e.g., 'A role in a tech startup focused on machine learning...'")
+             with gr.Accordion("Optional: Add Your Skills to Re-rank Results", open=False):
+                 with gr.Row():
+                     skills_text = gr.Textbox(label='Your Skills (comma-separated)', placeholder="e.g., Python, data analysis", scale=3)
+                     rerank_btn = gr.Button("Re-rank", variant="secondary", scale=1)
+         with gr.Column(scale=1):
+             topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of Matches")
+             search_btn = gr.Button("Find Matches", variant="primary")
+             reset_btn = gr.Button("Reset All")
+     status_text = gr.Markdown("Status: Ready.")
+     spelling_alert = gr.Markdown(visible=False)
+     with gr.Row(visible=False) as spelling_row:
+         search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
+         retype_btn = gr.Button("Let Me Fix It", variant="stop")
+     df_output = gr.DataFrame(label="Job Matches", interactive=False)
+     job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:", visible=False)
+     with gr.Accordion("Job Details & Learning Plan", open=False, visible=False) as details_accordion:
+         job_details_markdown = gr.Markdown()
+         with gr.Tabs():
+             with gr.TabItem("Duties"): duties_markdown = gr.Markdown()
+             with gr.TabItem("Qualifications"): qualifications_markdown = gr.Markdown()
+             with gr.TabItem("Full Description"): description_markdown = gr.Markdown()
+         learning_plan_output = gr.HTML(label="Learning Plan")
+         load_more_btn = gr.Button("Load More Skills", visible=False)
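+     # Event wiring: each handler's return tuple must match its outputs list, in order.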
+     search_btn.click(fn=find_matches_and_rank_with_check, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
+     search_anyway_btn.click(fn=find_matches_and_rank_anyway, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
+     retype_btn.click(lambda: ("Status: Ready for you to retype.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(visible=False), gr.Row(visible=False)), outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
+     reset_btn.click(fn=on_reset, outputs=[dream_text, topk_slider, skills_text, df_output, initial_matches_state, job_selector, details_accordion, status_text, job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, spelling_alert, spelling_row, missing_skills_state, skills_offset_state, load_more_btn], queue=False)
+     rerank_btn.click(fn=rerank_current_results, inputs=[initial_matches_state, skills_text, topk_slider], outputs=[status_text, df_output, job_selector])
+     job_selector.change(fn=on_select_job, inputs=[job_selector, skills_text], outputs=[job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, learning_plan_output, details_accordion, missing_skills_state, skills_offset_state, load_more_btn])
+     load_more_btn.click(fn=load_more_skills, inputs=[missing_skills_state, skills_offset_state], outputs=[learning_plan_output, skills_offset_state, load_more_btn])

  ui.launch()