zlf18 committed on
Commit 79d3345 · verified · 1 Parent(s): ddd310b

Update app.py

Files changed (1):
  1. app.py +935 -394
app.py CHANGED
@@ -1,452 +1,993 @@
  import pandas as pd
  import datasets
  from sentence_transformers import SentenceTransformer, util
  import torch
  import re
  import nltk
  from nltk.corpus import words, stopwords
  import urllib.parse as _url
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
  from nltk.stem import PorterStemmer
  import gradio as gr
  import os
  from tqdm import tqdm

  tqdm.pandas()

  # --- NLTK Data Download ---
  for package in ['words', 'stopwords', 'averaged_perceptron_tagger', 'punkt']:
-     try:
-         nltk.data.find(f'corpora/{package}' if package in ['words', 'stopwords'] else f'taggers/{package}' if package == 'averaged_perceptron_tagger' else f'tokenizers/{package}')
-     except LookupError:
-         nltk.download(package)

  STOPWORDS = set(stopwords.words('english'))
  stemmer = PorterStemmer()

  # --- Expanded Skill Whitelist ---
  SKILL_WHITELIST = {
-     'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue',
-     'nodejs', 'django', 'flask', 'fastapi', 'spring boot', 'ruby on rails', 'php', 'swift', 'kotlin', 'dart', 'flutter',
-     'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'pandas', 'numpy', 'matplotlib',
-     'natural language processing', 'nlp', 'computer vision', 'data analysis', 'data science', 'data engineering',
-     'big data', 'spark', 'hadoop', 'kafka', 'data visualization', 'tableau', 'power bi', 'd3.js', 'statistics', 'analytics',
-     'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes', 'terraform', 'ansible', 'ci/cd', 'jenkins',
-     'git', 'github', 'devops', 'linux', 'unix', 'shell scripting', 'powershell', 'cybersecurity', 'penetration testing',
-     'network security', 'cryptography', 'blockchain', 'c#', '.net', 'sql server', 'mysql', 'postgresql', 'mongodb', 'redis',
-     'elasticsearch', 'api design', 'rest apis', 'graphql', 'microservices', 'serverless', 'system design', 'saas',
-     'agile', 'scrum', 'project management', 'product management', 'consulting', 'client management', 'business development',
-     'strategy', 'stakeholder management', 'risk management', 'compliance', 'aml', 'kyc', 'reinsurance', 'finance',
-     'financial modeling', 'financial analysis', 'due diligence', 'sourcing', 'procurement', 'negotiation', 'supply chain',
-     'business analysis', 'business intelligence', 'presentations', 'public speaking', 'time management', 'critical thinking',
-     'design thinking', 'innovation', 'adaptability', 'supervisory', 'pmp', 'cpsm', 'cips', 'microsoft office', 'communication',
-     'organizational skills',
-     'leadership', 'stakeholder communication', 'client communication', 'teamwork', 'collaboration', 'problem solving',
-     'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design', 'autocad', 'solidworks', 'sales', 'marketing',
-     'seo', 'sem', 'content writing', 'customer support', 'technical writing', 'sap', 'oracle', 'budgeting', 'mentoring', 'supervising'
  }

  # --- GLOBAL STATE & DATA ---
  original_df = None
  combined_df = None
  model = None
  combined_job_embeddings = None
  original_job_title_embeddings = None
  LLM_PIPELINE = None
  LLM_MODEL_NAME = "microsoft/phi-2"
  FINETUNED_MODEL_ID = "its-zion-18/projfinetuned"
  KNOWN_WORDS = set()

  # --- CORE NLP & HELPER FUNCTIONS ---
  def _norm_skill_token(s: str) -> str:
-     s = s.lower().strip()
-     s = re.sub(r'[\(\)\[\]\{\}\*]', '', s)
-     s = re.sub(r'^\W+|\W+$', '', s)
-     s = re.sub(r'\s+', ' ', s)
-     return s

  def build_known_vocabulary(df: pd.DataFrame):
-     global KNOWN_WORDS
-     english_words = set(w.lower() for w in words.words())
-     job_words = set(re.findall(r'\b\w+\b', " ".join(df['full_text'].astype(str).tolist()).lower()))
-     job_words = {w for w in job_words if w.isalpha() and len(w) > 2}
-     KNOWN_WORDS = english_words | job_words
-     return "Known vocabulary built."

  def check_spelling_in_query(query: str) -> list[str]:
-     words_in_query = query.lower().split()
-     unrecognized_words = []
-     if not KNOWN_WORDS: return []
-     for word in words_in_query:
-         if word.isalpha() and len(word) > 1 and word not in KNOWN_WORDS:
-             unrecognized_words.append(word)
-     return list(set(unrecognized_words))

  def initialize_llm_client():
-     global LLM_PIPELINE
-     try:
-         tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, trust_remote_code=True)
-         model_llm = AutoModelForCausalLM.from_pretrained(
-             LLM_MODEL_NAME, torch_dtype="auto", device_map="auto", trust_remote_code=True
-         )
-         LLM_PIPELINE = pipeline("text-generation", model=model_llm, tokenizer=tokenizer)
-         return True
-     except Exception as e:
-         print(f"🚨 ERROR initializing local LLM: {e}")
-         return False

  def llm_expand_query(user_input: str) -> str:
-     global LLM_PIPELINE
-     if not LLM_PIPELINE: return user_input
-     prompt_template = (
-         f"User's career interest: '{user_input}'\n"
-         f"Instruction: Based on the user's interest, write a concise, single-sentence summary (40-60 words) that elaborates on the core intent, typical skills, and responsibilities. Do not include a preamble. Expanded Intent:"
-     )
-     try:
-         response = LLM_PIPELINE(prompt_template, max_new_tokens=100, do_sample=True, temperature=0.6, return_full_text=False)
-         expanded_query = response[0]['generated_text'].strip()
-         final_query = user_input + ". " + expanded_query.replace('\n', ' ').strip()
-         return final_query
-     except Exception:
-         return user_input
-
- def find_job_matches(query: str, top_k: int = 50) -> pd.DataFrame:
-     query_embedding = model.encode(query, convert_to_tensor=True)
-     general_similarity_scores = util.cos_sim(query_embedding, combined_job_embeddings)[0]
-     top_indices = torch.topk(general_similarity_scores, k=len(combined_df))
-     sorted_combined_df = combined_df.iloc[top_indices.indices.cpu()].copy()
-     sorted_combined_df['general_score'] = top_indices.values.cpu().numpy()
-     unique_matches = sorted_combined_df.drop_duplicates(subset=['job_id'], keep='first').set_index('job_id')
-
-     title_boost_scores = util.cos_sim(query_embedding, original_job_title_embeddings)[0].cpu().numpy()
-     title_boost_map = pd.Series(title_boost_scores, index=original_df['job_id'])
-     unique_matches['title_boost_score'] = unique_matches.index.map(title_boost_map).fillna(0)
-
-     unique_matches['Similarity Score'] = (0.70 * unique_matches['general_score'] + 0.30 * unique_matches['title_boost_score'])
-
-     final_job_ids = unique_matches.sort_values(by='Similarity Score', ascending=False).head(top_k).index.tolist()
-     final_results_df = original_df[original_df['job_id'].isin(final_job_ids)].copy()
-     scores_df = unique_matches.reset_index()[['job_id', 'Similarity Score']].copy()
-     final_results_df = pd.merge(final_results_df, scores_df, on='job_id', how='left')
-     final_results_df = final_results_df.sort_values(by='Similarity Score', ascending=False).reset_index(drop=True)
-     final_results_df = final_results_df.set_index('job_id', drop=False).rename(columns={'job_id': 'Job ID'})
-     return final_results_df
-
- def calculate_skill_match_score(user_skills: list[str], job_skills: list[str]) -> float:
-     if not user_skills or not job_skills:
-         return 0.0
-
-     user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
-     job_skill_embeddings = model.encode(job_skills, convert_to_tensor=True)
-     similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)
-
-     sum_of_max_similarities = torch.sum(torch.max(similarity_matrix, dim=0).values)
-     avg_score = (sum_of_max_similarities / len(job_skills)).item() if len(job_skills) > 0 else 0.0
-
-     skill_count_factor = min(1.0, len(job_skills) / 5.0)
-     return avg_score * skill_count_factor
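
The removed scorer takes, for each job skill, its best cosine similarity against any user skill, averages those maxima, then damps jobs that list fewer than five skills. A minimal standalone sketch of that rule (the model name and skill lists are illustrative assumptions, not taken from this repo):

# Hedged sketch of the removed scoring rule; 'all-MiniLM-L6-v2' is an assumed
# stand-in for the app's fine-tuned model.
from sentence_transformers import SentenceTransformer, util

m = SentenceTransformer('all-MiniLM-L6-v2')
user_skills = ['python', 'sql']               # illustrative user skills
job_skills = ['python', 'pandas', 'etl']      # illustrative job skills
sim = util.cos_sim(m.encode(user_skills, convert_to_tensor=True),
                   m.encode(job_skills, convert_to_tensor=True))
avg = sim.max(dim=0).values.mean().item()     # best user match per job skill
score = avg * min(1.0, len(job_skills) / 5.0) # damp short skill lists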

  def initialize_data_and_model():
-     global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
-     PROCESSED_DATA_PATH = "processed_jobs_with_skills.parquet"
-
-     print("--- Initializing LLM Client ---")
-     if not initialize_llm_client(): print("Warning: LLM Client failed to initialize. Will use NLTK only for skills.")
-
-     if os.path.exists(PROCESSED_DATA_PATH):
-         print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
-         original_df = pd.read_parquet(PROCESSED_DATA_PATH)
-     else:
-         print("--- No pre-processed data found. Starting one-time processing... ---")
-         ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
-         original_df = ds["original"].to_pandas()
-
-         def extract_skills_llm(text: str) -> list[str]:
-             if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE: return []
-             prompt = f"""
  Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
- [Example 1] Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus." Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
- [Example 2] Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum." Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
- [Actual Task] Text: "{text}" Extracted Skills:"""
-             try:
-                 response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False, temperature=0.1, return_full_text=False)
-                 skills_text = response[0]['generated_text'].strip()
-                 skills = [skill.strip() for skill in skills_text.split(',') if skill.strip()]
-                 return list(dict.fromkeys(s.lower() for s in skills))
-             except Exception: return []
-
-         def extract_skills_nltk(text: str) -> list[str]:
-             if not isinstance(text, str): return []
-             text_lower = text.lower()
-             grammar = "NP: {<JJ.*>*<NN.*>+}"
-             chunk_parser = nltk.RegexpParser(grammar)
-             tokens = nltk.word_tokenize(text_lower)
-             tagged_tokens = nltk.pos_tag(tokens)
-             chunked_text = chunk_parser.parse(tagged_tokens)
-             potential_skills = set()
-             for subtree in chunked_text.subtrees():
-                 if subtree.label() == 'NP':
-                     phrase = " ".join(word for word, tag in subtree.leaves())
-                     if _norm_skill_token(phrase) in SKILL_WHITELIST:
-                         potential_skills.add(_norm_skill_token(phrase))
-             return sorted(list(potential_skills))
-
-         def extract_skills_direct_scan(text: str) -> list[str]:
-             if not isinstance(text, str): return []
-             found_skills = set()
-             for skill in SKILL_WHITELIST:
-                 if re.search(r'\b' + re.escape(skill) + r'\b', text, re.IGNORECASE):
-                     found_skills.add(skill)
-             return list(found_skills)
-
-         def expand_skills_with_llm(job_title: str, existing_skills: list) -> list:
-             if not LLM_PIPELINE or not job_title or not existing_skills: return []
-             skills_to_add = 6 - len(existing_skills)
-             prompt = f"""Instruct: A job has the title "{job_title}" and requires the skills: {', '.join(existing_skills)}. Based on this, what are {skills_to_add} additional, closely related skills typically required for such a role? List only the new skills, separated by commas. Do not repeat skills from the original list. Additional Skills:"""
-             try:
-                 response = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5, return_full_text=False)
-                 new_skills_text = response[0]['generated_text'].strip()
-                 new_skills = [skill.strip().lower() for skill in new_skills_text.split(',') if skill.strip()]
-                 return new_skills
-             except Exception: return []
-
-         def extract_skills_hybrid(row) -> list[str]:
-             text = row['text_for_skills']
-             job_title = row.get('Job title', '')
-             llm_skills = extract_skills_llm(text)
-             nltk_skills = extract_skills_nltk(text)
-             direct_skills = extract_skills_direct_scan(text)
-             combined_skills = set(llm_skills) | set(nltk_skills) | set(direct_skills)
-
-             if 0 < len(combined_skills) < 6:
-                 expanded_skills = expand_skills_with_llm(job_title, list(combined_skills))
-                 combined_skills.update(expanded_skills)
-             return sorted(list(combined_skills))
-
-         def create_text_for_skills(row):
-             return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])
-
-         original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
-         print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
-         original_df['Skills'] = original_df.progress_apply(extract_skills_hybrid, axis=1)
-         original_df = original_df.drop(columns=['text_for_skills'])
-         print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
-         original_df.to_parquet(PROCESSED_DATA_PATH)
-
-     original_df['job_id'] = original_df.index
-     def create_full_text(row): return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
-     original_df["full_text"] = original_df.apply(create_full_text, axis=1)
-
-     ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
-     augmented_df = ds["augmented"].to_pandas()
-     max_id = len(original_df) - 1
-     augmented_df['job_id'] = augmented_df.index.map(lambda i: min(i // 20, max_id))
-     augmented_df["full_text"] = augmented_df.apply(create_full_text, axis=1)
-     combined_df = pd.concat([original_df.copy(), augmented_df.copy()], ignore_index=True)
-     original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})
-
-     print("--- Loading Fine-Tuned Sentence Transformer Model ---")
-     model = SentenceTransformer(FINETUNED_MODEL_ID)
-     print("--- Encoding Embeddings ---")
-     combined_job_embeddings = model.encode(combined_df["full_text"].tolist(), convert_to_tensor=True, show_progress_bar=True)
-     original_job_title_embeddings = model.encode(original_df["job_title"].tolist(), convert_to_tensor=True, show_progress_bar=True)
-     print("--- Building Vocabulary ---")
-     build_known_vocabulary(combined_df)
-     return "--- Initialization Complete ---"

- def _course_links_for(skill: str) -> str:
-     q = _url.quote(skill)
-     links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")]
-     return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
-
- # --- GRADIO EVENT HANDLER FUNCTIONS ---
-
- def get_job_matches(dream_job: str, top_n: int):
-     status = "Searching for jobs based on your career goal..."
-     expanded_desc = llm_expand_query(dream_job)
-     emb_matches = find_job_matches(expanded_desc, top_k=50)  # Get a larger pool for later
-     display_df = emb_matches.head(top_n)
-     status = f"Found {len(emb_matches)} top matches for your career goal."
-
-     table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
-     table_to_show['Similarity Score'] = table_to_show['Similarity Score'].map('{:.2%}'.format)
-
-     dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
-     dropdown_value = dropdown_options[0][1] if dropdown_options else None
-
-     # When initially finding matches, only df_output is used. skill_jobs_output is cleared/hidden.
-     return status, emb_matches, table_to_show, gr.update(choices=dropdown_options, value=dropdown_value, visible=True), gr.update(visible=True), pd.DataFrame(), gr.update(visible=False)
-
- def analyze_skills(dream_job, initial_matches_df, skills_text, top_n):
-     user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
-     if not user_skills:
-         # If skills are cleared, just show the original table without skill scores and hide the second table
-         table1_df = pd.DataFrame(initial_matches_df).head(top_n)
-         table1_to_show = table1_df[['job_title', 'company', 'Similarity Score']]
-         table1_to_show['Similarity Score'] = table1_to_show['Similarity Score'].map('{:.2%}'.format)
-         return "Skills cleared. Showing original relevance.", table1_to_show, pd.DataFrame(), gr.update(visible=False)
-
-     status = "Analyzing skills and finding new job matches..."
-
-     # --- LOGIC FOR TOP TABLE (Reranked Initial Jobs) ---
-     reranked_initial_jobs = pd.DataFrame(initial_matches_df)  # Ensure it's a DataFrame
-     reranked_initial_jobs['Skill Match Score'] = reranked_initial_jobs['Skills'].apply(lambda js: calculate_skill_match_score(user_skills, js))
-     reranked_initial_jobs = reranked_initial_jobs.sort_values(by='Skill Match Score', ascending=False)
-
-     top_table_df = reranked_initial_jobs.head(top_n)[['job_title', 'company', 'Similarity Score', 'Skill Match Score']]
-     top_table_df['Similarity Score'] = top_table_df['Similarity Score'].map('{:.2%}'.format)
-     top_table_df['Skill Match Score'] = top_table_df['Skill Match Score'].map('{:.2%}'.format)
-
-     # --- LOGIC FOR BOTTOM TABLE (New Skill-Based Jobs) ---
-     combined_query = dream_job + ". My current skills are: " + skills_text
-     newly_found_jobs = find_job_matches(combined_query, top_k=top_n)
-     newly_found_jobs['Skill Match Score'] = newly_found_jobs['Skills'].apply(lambda js: calculate_skill_match_score(user_skills, js))
-
-     bottom_table_df = newly_found_jobs[['job_title', 'company', 'Similarity Score', 'Skill Match Score']]
-     bottom_table_df['Similarity Score'] = bottom_table_df['Similarity Score'].map('{:.2%}'.format)
-     bottom_table_df['Skill Match Score'] = bottom_table_df['Skill Match Score'].map('{:.2%}'.format)
-
-     status = f"Re-ranked initial jobs and found new jobs for your skills."
-
-     # Corrected return order: top_table_df for df_output, bottom_table_df for skill_jobs_output
-     return status, top_table_df, bottom_table_df, gr.update(visible=True)
-
- def find_matches_and_rank_with_check(dream_job: str, top_n: int):
-     if not dream_job:
-         return "Please describe your dream job first.", None, pd.DataFrame(), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), pd.DataFrame(), gr.update(visible=False)
-
-     unrecognized_words = check_spelling_in_query(dream_job)
-     if unrecognized_words:
-         word_list_html = ", ".join([f"<b><span style='color: #F87171;'>{w}</span></b>" for w in unrecognized_words])
-         alert_message = f"<b><span style='color: #F87171;'>⚠️ Possible Spelling Error:</span></b> Unrecognized: {word_list_html}."
-         return "Status: Awaiting confirmation.", None, pd.DataFrame(), gr.update(visible=False), gr.update(visible=False), gr.update(value=alert_message, visible=True), gr.update(visible=True), pd.DataFrame(), gr.update(visible=False)
-
-     status, emb_matches, table_to_show, dropdown_update, accordion_update, _, _ = get_job_matches(dream_job, top_n)  # Adjusted to unpack the 2 extra outputs
-     # Hide the second table on a new search
-     return status, emb_matches, table_to_show, dropdown_update, accordion_update, gr.update(visible=False), gr.update(visible=False), pd.DataFrame(), gr.update(visible=False)
-
- def find_matches_and_rank_anyway(dream_job: str, top_n: int):
-     status, emb_matches, table_to_show, dropdown_update, accordion_update, _, _ = get_job_matches(dream_job, top_n)  # Adjusted to unpack the 2 extra outputs
-     return status, emb_matches, table_to_show, dropdown_update, accordion_update, gr.update(visible=False), gr.update(visible=False), pd.DataFrame(), gr.update(visible=False)
-
- def on_select_job(job_id, skills_text):
-     if job_id is None: return "", "", "", "", "", gr.update(visible=False), [], 0, gr.Button(visible=False)
-     row = original_df.loc[job_id]
-     title, company = str(row.get("job_title", "")), str(row.get("company", ""))
-     job_details_markdown = f"### {title} — {company}"
-     duties, qualifications, description = str(row.get('Duties', '')), str(row.get('qualifications', '')), str(row.get('Description', ''))
-     user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
-     job_skills = row.get("Skills", [])
-     if not job_skills:
-         learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
-         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.update(visible=True), [], 0, gr.Button(visible=False)
-
-     score_val = 0
-     all_missing_skills = job_skills
-     if user_skills:
-         score_val = calculate_skill_match_score(user_skills, job_skills)
-         user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
-         job_skill_embeddings = model.encode(job_skills, convert_to_tensor=True)
-         similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)
-         matched_job_skills_mask = torch.any(similarity_matrix > 0.58, dim=0)
-         all_missing_skills = [skill for i, skill in enumerate(job_skills) if not matched_job_skills_mask[i]]
-
-     if user_skills:
-         job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
-
-     if user_skills and score_val >= 0.98:
-         learning_plan_html = "<h4 style='color:green;'>🎉 You have all the required skills!</h4>"
-         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.update(visible=True), [], 0, gr.Button(visible=False)
-
-     headline = "<h4>Focus on these skills to improve your match:</h4>" if user_skills else "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
-     skills_to_display = sorted(all_missing_skills)[:5] if user_skills else sorted(job_skills)[:5]
-     items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
-     learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
-     full_skill_list_for_state = sorted(all_missing_skills) if user_skills else sorted(job_skills)
-     new_offset = len(skills_to_display)
-     should_button_be_visible = len(full_skill_list_for_state) > 5
-     return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.update(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)
-
- def load_more_skills(full_skills_list, current_offset):
-     SKILLS_INCREMENT = 5
-     new_offset = current_offset + SKILLS_INCREMENT
-     skills_to_display = full_skills_list[:new_offset]
-     items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
-     learning_plan_html = f"<h4>To be a good fit for this role, you'll need to learn these skills:</h4><ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
-     should_button_be_visible = new_offset < len(full_skills_list)
-     return learning_plan_html, new_offset, gr.Button(visible=should_button_be_visible)
-
- def on_reset():
-     return ("", 3, "", pd.DataFrame(), None, gr.update(visible=False), gr.update(visible=False), "Status: Ready.", "", "", "", "", gr.update(visible=False), gr.update(visible=False), [], 0, gr.Button(visible=False), pd.DataFrame(), gr.update(visible=False))

- try:
-     initialization_status = initialize_data_and_model()
- except Exception as e:
-     initialization_status = f"ERROR during startup: {e}. Please check logs."
- print(initialization_status)
-
- with gr.Blocks(theme=gr.themes.Soft()) as ui:
-     gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")
-     initial_matches_state = gr.State()
-     missing_skills_state = gr.State([])
-     skills_offset_state = gr.State(0)
-     with gr.Row():
-         with gr.Column(scale=3):
-             dream_text = gr.Textbox(label='Your Dream Job Description', lines=3, placeholder="e.g., 'A role in a tech startup focused on machine learning...'")
-             with gr.Accordion("Optional: Analyze Your Skills & Find More Jobs", open=True):
-                 with gr.Row():
-                     skills_text = gr.Textbox(label='Your Skills (comma-separated)', placeholder="e.g., Python, data analysis", scale=3)
-                     analyze_btn = gr.Button("Analyze Skills", variant="secondary", scale=1)
-         with gr.Column(scale=1):
-             topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of Matches")
-             search_btn = gr.Button("Find Matches", variant="primary")
-             reset_btn = gr.Button("Reset All")
-     status_text = gr.Markdown("Status: Ready.")
-     spelling_alert = gr.Markdown(visible=False)
-     with gr.Row(visible=False) as spelling_row:
-         search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
-         retype_btn = gr.Button("Let Me Fix It", variant="stop")
-
-     gr.Markdown("### Top Matches for Your Career Goal")
-     df_output = gr.DataFrame(label="Job Matches", interactive=False)
-
-     with gr.Column(visible=False) as skill_jobs_col:
-         gr.Markdown("### Potential Jobs to Consider (Given Your Skills)")
-         skill_jobs_output = gr.DataFrame(label="Skill-Based Job Matches", interactive=False)
-
-     job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:", visible=False)
-     with gr.Accordion("Job Details & Learning Plan", open=False, visible=False) as details_accordion:
-         job_details_markdown = gr.Markdown()
-         with gr.Tabs():
-             with gr.TabItem("Duties"): duties_markdown = gr.Markdown()
-             with gr.TabItem("Qualifications"): qualifications_markdown = gr.Markdown()
-             with gr.TabItem("Full Description"): description_markdown = gr.Markdown()
-         learning_plan_output = gr.HTML(label="Learning Plan")
-         load_more_btn = gr.Button("Load More Skills", visible=False)
-
-     # Updated 'search_btn' and 'search_anyway_btn' to correctly unpack outputs from get_job_matches
-     search_btn.click(fn=find_matches_and_rank_with_check, inputs=[dream_text, topk_slider],
-                      outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion,
-                               spelling_alert, spelling_row, skill_jobs_output, skill_jobs_col])
-     search_anyway_btn.click(fn=find_matches_and_rank_anyway, inputs=[dream_text, topk_slider],
-                             outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion,
-                                      spelling_alert, spelling_row, skill_jobs_output, skill_jobs_col])
-
-     # The analyze_btn output order is now correct: top_table_df to df_output, bottom_table_df to skill_jobs_output
-     analyze_btn.click(fn=analyze_skills, inputs=[dream_text, initial_matches_state, skills_text, topk_slider],
-                       outputs=[status_text, df_output, skill_jobs_output, skill_jobs_col])
-
-     retype_btn.click(lambda: ("Status: Ready for you to retype.", None, pd.DataFrame(), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), pd.DataFrame(), gr.update(visible=False)),
-                      outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row, skill_jobs_output, skill_jobs_col])
-     reset_btn.click(fn=on_reset,
-                     outputs=[dream_text, topk_slider, skills_text, df_output, initial_matches_state, job_selector, details_accordion, status_text,
-                              job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, spelling_alert, spelling_row,
-                              missing_skills_state, skills_offset_state, load_more_btn, skill_jobs_output, skill_jobs_col], queue=False)
-     job_selector.change(fn=on_select_job, inputs=[job_selector, skills_text],
-                         outputs=[job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, learning_plan_output,
-                                  details_accordion, missing_skills_state, skills_offset_state, load_more_btn])
-     load_more_btn.click(fn=load_more_skills, inputs=[missing_skills_state, skills_offset_state],
-                         outputs=[learning_plan_output, skills_offset_state, load_more_btn])

  ui.launch()
 
  import pandas as pd
  import datasets
  from sentence_transformers import SentenceTransformer, util
  import torch
  import re
  import nltk
  from nltk.corpus import words, stopwords
  import urllib.parse as _url
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
  from nltk.stem import PorterStemmer
  import gradio as gr
  import os
  from tqdm import tqdm

  tqdm.pandas()

  # --- NLTK Data Download ---
  for package in ['words', 'stopwords', 'averaged_perceptron_tagger', 'punkt']:
+     try:
+         nltk.data.find(f'corpora/{package}' if package in ['words', 'stopwords'] else f'taggers/{package}' if package == 'averaged_perceptron_tagger' else f'tokenizers/{package}')
+     except LookupError:
+         nltk.download(package)

  STOPWORDS = set(stopwords.words('english'))
  stemmer = PorterStemmer()

  # --- Expanded Skill Whitelist ---
  SKILL_WHITELIST = {
+     # Technical & Data
+     'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue',
+     'nodejs', 'django', 'flask', 'fastapi', 'spring boot', 'ruby on rails', 'php', 'swift', 'kotlin', 'dart', 'flutter',
+     'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'pandas', 'numpy', 'matplotlib',
+     'natural language processing', 'nlp', 'computer vision', 'data analysis', 'data science', 'data engineering',
+     'big data', 'spark', 'hadoop', 'kafka', 'data visualization', 'tableau', 'power bi', 'd3.js', 'statistics', 'analytics',
+     'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes', 'terraform', 'ansible', 'ci/cd', 'jenkins',
+     'git', 'github', 'devops', 'linux', 'unix', 'shell scripting', 'powershell', 'cybersecurity', 'penetration testing',
+     'network security', 'cryptography', 'blockchain', 'c#', '.net', 'sql server', 'mysql', 'postgresql', 'mongodb', 'redis',
+     'elasticsearch', 'api design', 'rest apis', 'graphql', 'microservices', 'serverless', 'system design', 'saas',
+     # Business & Consulting
+     'agile', 'scrum', 'project management', 'product management', 'consulting', 'client management', 'business development',
+     'strategy', 'stakeholder management', 'risk management', 'compliance', 'aml', 'kyc', 'reinsurance', 'finance',
+     'financial modeling', 'financial analysis', 'due diligence', 'sourcing', 'procurement', 'negotiation', 'supply chain',
+     'business analysis', 'business intelligence', 'presentations', 'public speaking', 'time management', 'critical thinking',
+     'design thinking', 'innovation', 'adaptability', 'supervisory', 'pmp', 'cpsm', 'cips', 'microsoft office', 'communication',
+     'organizational skills',
+     # Soft & Other
+     'leadership', 'stakeholder communication', 'client communication', 'teamwork', 'collaboration', 'problem solving',
+     'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design', 'autocad', 'solidworks', 'sales', 'marketing',
+     'seo', 'sem', 'content writing', 'customer support', 'technical writing', 'sap', 'oracle', 'budgeting', 'mentoring', 'supervising'
  }

  # --- GLOBAL STATE & DATA ---
  original_df = None
  combined_df = None
  model = None
  combined_job_embeddings = None
  original_job_title_embeddings = None
  LLM_PIPELINE = None
  LLM_MODEL_NAME = "microsoft/phi-2"
  FINETUNED_MODEL_ID = "its-zion-18/projfinetuned"
  KNOWN_WORDS = set()

  # --- CORE NLP & HELPER FUNCTIONS ---
  def _norm_skill_token(s: str) -> str:
+     s = s.lower().strip()
+     s = re.sub(r'[\(\)\[\]\{\}\*]', '', s)
+     s = re.sub(r'^\W+|\W+$', '', s)
+     s = re.sub(r'\s+', ' ', s)
+     return s
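
As a quick check on the normalizer (illustrative inputs): it lowercases and trims, strips brackets and asterisks, removes leading and trailing non-word characters, and collapses internal whitespace.

# _norm_skill_token('  (Python)  ')  -> 'python'
# _norm_skill_token('CI/CD *')       -> 'ci/cd'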

  def build_known_vocabulary(df: pd.DataFrame):
+     global KNOWN_WORDS
+     english_words = set(w.lower() for w in words.words())
+     job_words = set(re.findall(r'\b\w+\b', " ".join(df['full_text'].astype(str).tolist()).lower()))
+     job_words = {w for w in job_words if w.isalpha() and len(w) > 2}
+     KNOWN_WORDS = english_words | job_words
+     return "Known vocabulary built."

  def check_spelling_in_query(query: str) -> list[str]:
+     words_in_query = query.lower().split()
+     unrecognized_words = []
+     if not KNOWN_WORDS: return []
+     for word in words_in_query:
+         if word.isalpha() and len(word) > 1 and word not in KNOWN_WORDS:
+             unrecognized_words.append(word)
+     return list(set(unrecognized_words))

  def initialize_llm_client():
+     global LLM_PIPELINE
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, trust_remote_code=True)
+         model_llm = AutoModelForCausalLM.from_pretrained(
+             LLM_MODEL_NAME, torch_dtype="auto", device_map="auto", trust_remote_code=True
+         )
+         LLM_PIPELINE = pipeline("text-generation", model=model_llm, tokenizer=tokenizer)
+         return True
+     except Exception as e:
+         print(f"🚨 ERROR initializing local LLM: {e}")
+         return False

  def llm_expand_query(user_input: str) -> str:
+     global LLM_PIPELINE
+     if not LLM_PIPELINE: return user_input
+     prompt_template = (
+         f"User's career interest: '{user_input}'\n"
+         f"Instruction: Based on the user's interest, write a concise, single-sentence summary (40-60 words) that elaborates on the core intent, typical skills, and responsibilities. "
+         f"Do not include a preamble, the user input, or any list formatting in the output. Just the expanded sentence.\n"
+         f"Expanded Intent:"
+     )
+     try:
+         response = LLM_PIPELINE(prompt_template, max_new_tokens=100, do_sample=True, temperature=0.6)
+         expanded_query = response[0]['generated_text'].strip().split("Expanded Intent:")[-1].strip()
+         final_query = user_input + ". " + expanded_query.replace('\n', ' ').replace(':', '').strip()
+         final_query = final_query.replace('..', '.').strip()
+         return final_query
+     except Exception:
+         return user_input

+ def find_job_matches(original_user_query: str, expanded_user_query: str, top_k: int = 50) -> pd.DataFrame:
+     expanded_user_embedding = model.encode(expanded_user_query, convert_to_tensor=True)
+     general_similarity_scores = util.cos_sim(expanded_user_embedding, combined_job_embeddings)[0]
+     top_indices = torch.topk(general_similarity_scores, k=len(combined_df))
+     sorted_combined_df = combined_df.iloc[top_indices.indices.cpu()].copy()
+     sorted_combined_df['general_score'] = top_indices.values.cpu().numpy()
+     unique_matches = sorted_combined_df.drop_duplicates(subset=['job_id'], keep='first').set_index('job_id')
+     original_user_embedding = model.encode(original_user_query, convert_to_tensor=True)
+     title_boost_scores = util.cos_sim(original_user_embedding, original_job_title_embeddings)[0].cpu().numpy()
+     title_boost_map = pd.Series(title_boost_scores, index=original_df['job_id'])
+     unique_matches['title_boost_score'] = unique_matches.index.map(title_boost_map).fillna(0)
+     unique_matches['Similarity Score'] = (0.70 * unique_matches['general_score'] + 0.30 * unique_matches['title_boost_score'])
+     final_job_ids = unique_matches.sort_values(by='Similarity Score', ascending=False).head(top_k).index.tolist()
+     final_results_df = original_df[original_df['job_id'].isin(final_job_ids)].copy()
+     scores_df = unique_matches.reset_index()[['job_id', 'Similarity Score']].copy()
+     final_results_df = pd.merge(final_results_df, scores_df, on='job_id', how='left')
+     final_results_df = final_results_df.sort_values(by='Similarity Score', ascending=False).reset_index(drop=True)
+     final_results_df = final_results_df.set_index('job_id', drop=False).rename(columns={'job_id': 'Job ID'})
+     return final_results_df
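
Note the new signature: the raw user query now drives the title boost while the LLM-expanded query drives the general match, still blended 70/30. A hedged usage sketch (assumes the globals populated by initialize_data_and_model()):

# Illustrative call only; model, combined_df and the embedding tensors must
# already be initialized by initialize_data_and_model().
query = "machine learning engineer at an early-stage startup"
matches = find_job_matches(query, llm_expand_query(query), top_k=10)
print(matches[['Job ID', 'Similarity Score']].head())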

+ def score_jobs_by_skills(user_skills: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
+     if df_to_rank is None or df_to_rank.empty or not user_skills:
+         return df_to_rank.sort_values(by='Similarity Score', ascending=False) if df_to_rank is not None else pd.DataFrame()
+
+     ranked_df = df_to_rank.copy()
+     if 'Skills' not in ranked_df.columns:
+         return ranked_df.sort_values(by='Similarity Score', ascending=False)
+
+     user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
+     all_job_skills = sorted(list(set(skill for skills_list in ranked_df['Skills'] if skills_list for skill in skills_list)))
+
+     if not all_job_skills:
+         ranked_df['Skill Match Score'] = 0.0
+         ranked_df['Final Score'] = ranked_df['Similarity Score']
+         return ranked_df
+
+     job_skill_embeddings = model.encode(all_job_skills, convert_to_tensor=True)
+     similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)
+
+     def calculate_confidence_adjusted_score(row):
+         job_skills_list = row.get('Skills', [])
+         if not job_skills_list:
+             return 0.0
+
+         total_required = len(job_skills_list)
+         sum_of_max_similarities = 0.0
+         for job_skill in job_skills_list:
+             try:
+                 job_skill_idx = all_job_skills.index(job_skill)
+                 max_sim = torch.max(similarity_matrix[:, job_skill_idx])
+                 sum_of_max_similarities += max_sim.item()
+             except (ValueError, IndexError):
+                 continue
+
+         avg_score = sum_of_max_similarities / total_required if total_required > 0 else 0.0
+         skill_count_factor = min(1.0, total_required / 5.0)
+         return avg_score * skill_count_factor
+
+     ranked_df['Skill Match Score'] = ranked_df.apply(calculate_confidence_adjusted_score, axis=1)
+     ranked_df['Final Score'] = (0.8 * ranked_df['Similarity Score']) + (0.2 * ranked_df['Skill Match Score'])
+     ranked_df = ranked_df.sort_values(by='Final Score', ascending=False).reset_index(drop=True)
+     return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
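
The blend deliberately weights semantic relevance over declared skills: Final Score = 0.8 × Similarity Score + 0.2 × Skill Match Score. For instance (illustrative numbers), a job at 0.62 similarity with a 0.45 skill match lands at 0.8 · 0.62 + 0.2 · 0.45 = 0.586, so skills nudge the ordering without overturning it.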

  def initialize_data_and_model():
+     global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
+     PROCESSED_DATA_PATH = "processed_jobs_with_skills.parquet"
+
+     print("--- Initializing LLM Client ---")
+     if not initialize_llm_client(): print("Warning: LLM Client failed to initialize. Will use NLTK only for skills.")
+
+     if os.path.exists(PROCESSED_DATA_PATH):
+         print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
+         original_df = pd.read_parquet(PROCESSED_DATA_PATH)
+     else:
+         print("--- No pre-processed data found. Starting one-time processing... ---")
+         ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
+         original_df = ds["original"].to_pandas()
+
+         def extract_skills_llm(text: str) -> list[str]:
+             if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE: return []
+             prompt = f"""
  Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
+ [Example 1]
+ Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."
+ Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
+ [Example 2]
+ Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."
+ Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
+ [Actual Task]
+ Text: "{text}"
+ Extracted Skills:
+ """
+             try:
+                 response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False, temperature=0.1)
+                 generated_text = response[0]['generated_text']
+                 skills_part = generated_text.split("Extracted Skills:")[-1].strip()
+                 skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
+                 return list(dict.fromkeys(s.lower() for s in skills))
+             except Exception: return []
+
+         def extract_skills_nltk(text: str) -> list[str]:
+             if not isinstance(text, str): return []
+             text_lower = text.lower()
+             grammar = "NP: {<JJ.*>*<NN.*>+}"
+             chunk_parser = nltk.RegexpParser(grammar)
+             tokens = nltk.word_tokenize(text_lower)
+             tagged_tokens = nltk.pos_tag(tokens)
+             chunked_text = chunk_parser.parse(tagged_tokens)
+             potential_skills = set()
+             for subtree in chunked_text.subtrees():
+                 if subtree.label() == 'NP':
+                     phrase = " ".join(word for word, tag in subtree.leaves())
+                     if _norm_skill_token(phrase) in SKILL_WHITELIST:
+                         potential_skills.add(_norm_skill_token(phrase))
+             return sorted(list(potential_skills))
+
+         def extract_skills_direct_scan(text: str) -> list[str]:
+             if not isinstance(text, str): return []
+             found_skills = set()
+             for skill in SKILL_WHITELIST:
+                 if re.search(r'\b' + re.escape(skill) + r'\b', text, re.IGNORECASE):
+                     found_skills.add(skill)
+             return list(found_skills)
+
+         # --- NEW: Function to expand a short skill list using the LLM ---
+         def expand_skills_with_llm(job_title: str, existing_skills: list) -> list:
+             if not LLM_PIPELINE or not job_title: return []
+
+             skills_to_add = 6 - len(existing_skills)
+             prompt = f"""
+ Instruct: A job has the title "{job_title}" and requires the skills: {', '.join(existing_skills)}.
+ Based on this, what are {skills_to_add} additional, closely related skills typically required for such a role?
+ List only the new skills, separated by commas. Do not repeat skills from the original list.
+
+ Additional Skills:
+ """
+             try:
+                 response = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5)
+                 generated_text = response[0]['generated_text']
+                 skills_part = generated_text.split("Additional Skills:")[-1].strip()
+                 new_skills = [skill.strip().lower() for skill in skills_part.split(',') if skill.strip()]
+                 return new_skills
+             except Exception:
+                 return []
+
+         def extract_skills_hybrid(row) -> list[str]:
+             text = row['text_for_skills']
+             job_title = row.get('Job title', '')  # Use original Job title for context
+
+             llm_skills = extract_skills_llm(text)
+             nltk_skills = extract_skills_nltk(text)
+             direct_skills = extract_skills_direct_scan(text)
+             combined_skills = set(llm_skills) | set(nltk_skills) | set(direct_skills)
+
+             # If the combined list is still too short, expand it
+             if len(combined_skills) < 6:
+                 expanded_skills = expand_skills_with_llm(job_title, list(combined_skills))
+                 combined_skills.update(expanded_skills)
+
+             return sorted(list(combined_skills))
+
+         def create_text_for_skills(row):
+             return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])
+
+         original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
+         print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
+         # Apply the hybrid function row-wise to include job title context
+         original_df['Skills'] = original_df.progress_apply(extract_skills_hybrid, axis=1)
+         original_df = original_df.drop(columns=['text_for_skills'])
+
+         print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
+         original_df.to_parquet(PROCESSED_DATA_PATH)
+
+     original_df['job_id'] = original_df.index
+     def create_full_text(row): return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
+     original_df["full_text"] = original_df.apply(create_full_text, axis=1)
+
+     ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
+     augmented_df = ds["augmented"].to_pandas()
+     max_id = len(original_df) - 1
+     augmented_df['job_id'] = augmented_df.index.map(lambda i: min(i // 20, max_id))
+     augmented_df["full_text"] = augmented_df.apply(create_full_text, axis=1)
+
+     combined_df = pd.concat([original_df.copy(), augmented_df.copy()], ignore_index=True)
+     original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})
+
+     print("--- Loading Fine-Tuned Sentence Transformer Model ---")
+     model = SentenceTransformer(FINETUNED_MODEL_ID)
+     print("--- Encoding Embeddings ---")
+     combined_job_embeddings = model.encode(combined_df["full_text"].tolist(), convert_to_tensor=True, show_progress_bar=True)
+     original_job_title_embeddings = model.encode(original_df["job_title"].tolist(), convert_to_tensor=True, show_progress_bar=True)
+     print("--- Building Vocabulary ---")
+     build_known_vocabulary(combined_df)
+     return "--- Initialization Complete ---"
+
+ def _course_links_for(skill: str) -> str:
+     q = _url.quote(skill)
+     links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")]
+     return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
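
Each skill is URL-encoded before being interpolated into the search links, so multi-word skills stay valid: for example, _url.quote('machine learning') yields 'machine%20learning', giving https://www.coursera.org/search?query=machine%20learning.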

+ def get_job_matches(dream_job: str, top_n: int, skills_text: str):
+     status = "Searching using hybrid model..."
+     expanded_desc = llm_expand_query(dream_job)
+     emb_matches = find_job_matches(dream_job, expanded_desc, top_k=50)
+     user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
+
+     if user_skills:
+         display_df = score_jobs_by_skills(user_skills, emb_matches)
+     else:
+         display_df = emb_matches
+     display_df = display_df.head(top_n)
+     if user_skills:
+         status = f"Found and **re-ranked** results by your {len(user_skills)} skills. Displaying top {len(display_df)}."
+     else:
+         status = f"Found {len(display_df)} top matches using semantic search."
+
+     if 'Final Score' in display_df.columns:
+         table_to_show = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']]
+         table_to_show = table_to_show.rename(columns={'Final Score': 'Overall Score'})
+         table_to_show['Skill Match Score'] = table_to_show['Skill Match Score'].map('{:.2%}'.format)
+         table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
+     else:
+         table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
+         table_to_show = table_to_show.rename(columns={'Similarity Score': 'Overall Score'})
+         table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
+
+     dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
+     dropdown_value = dropdown_options[0][1] if dropdown_options else None
+     return status, emb_matches, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True), gr.Accordion(visible=True)
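
Returning fresh gr.Dropdown(...) and gr.Accordion(...) instances here (rather than the gr.update(...) calls used in the old version) follows the Gradio 4 convention, where a component returned from an event handler acts as an update to the existing component.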

+ def rerank_current_results(initial_matches_df, skills_text, top_n):
+     if initial_matches_df is None or pd.DataFrame(initial_matches_df).empty:
+         return "Please find matches first before re-ranking.", pd.DataFrame(), gr.Dropdown(visible=False)
+     initial_matches_df = pd.DataFrame(initial_matches_df)
+     user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
+
+     if not user_skills:
+         status = "Skills cleared. Showing original semantic search results."
+         display_df = initial_matches_df.head(top_n)
+         table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
+         table_to_show = table_to_show.rename(columns={'Similarity Score': 'Overall Score'})
+         table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
+     else:
+         ranked_df = score_jobs_by_skills(user_skills, initial_matches_df)
+         status = f"Results **re-ranked** based on your {len(user_skills)} skills."
+         display_df = ranked_df.head(top_n)
+         table_to_show = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']]
+         table_to_show = table_to_show.rename(columns={'Final Score': 'Overall Score'})
+         table_to_show['Skill Match Score'] = table_to_show['Skill Match Score'].map('{:.2%}'.format)
+         table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
+
+     dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
+     dropdown_value = dropdown_options[0][1] if dropdown_options else None
+     return status, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True)
748
+
749
+
750
+
751
+ def find_matches_and_rank_with_check(dream_job: str, top_n: int, skills_text: str):
752
+
753
+     if not dream_job:
754
+
755
+         return "Please describe your dream job first.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(""), gr.Row(visible=False)
756
+
757
+     unrecognized_words = check_spelling_in_query(dream_job)
758
+
759
+     if unrecognized_words:
760
+
761
+         word_list_html = ", ".join([f"<b><span style='color: #F87171;'>{w}</span></b>" for w in unrecognized_words])
762
+
763
+         alert_message = f"<b><span style='color: #F87171;'>⚠️ Possible Spelling Error:</span></b> Unrecognized: {word_list_html}."
764
+
765
+         return "Status: Awaiting confirmation.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(alert_message, visible=True), gr.Row(visible=True)
766
+
767
+     status, emb_matches, table_to_show, dropdown, details_accordion = get_job_matches(dream_job, top_n, skills_text)
768
+
769
+     return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)
770
+
771
+
772
+
773
+ def find_matches_and_rank_anyway(dream_job: str, top_n: int, skills_text: str):
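+     """Handle Search Anyway: skip the spelling check and run the normal matching pipeline."""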
774
+
775
+     status, emb_matches, table_to_show, dropdown, details_accordion = get_job_matches(dream_job, top_n, skills_text)
776
+
777
+     return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)
778
+
779
+
780
+
781
+ def on_select_job(job_id, skills_text):
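+     """Build the detail tabs and learning plan for the selected job.
+
+     With user skills given, a skill counts as missing when no user skill matches it;
+     without any user skills, every job skill is listed and the Load More button is offered.
+     """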
782
+
783
+     if job_id is None: return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)
784
+
785
+     row = original_df.loc[job_id]
786
+
787
+     title, company = str(row.get("job_title", "")), str(row.get("company", ""))
788
+
789
+     job_details_markdown = f"### {title} — {company}"
790
+
791
+     duties, qualifications, description = str(row.get('Duties', '')), str(row.get('qualifications', '')), str(row.get('Description', ''))
792
+
793
+     user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
794
+
795
+     job_skills = row.get("Skills", [])
796
+
797
+     if not job_skills:
798
+
799
+         learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
800
+
801
+         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
802
+
803
+
804
+
805
+     score_val = 0
806
+
807
+     all_missing_skills = job_skills
808
+
809
+     if user_skills:
810
+
811
+         user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
812
+
813
+         job_skill_embeddings = model.encode(job_skills, convert_to_tensor=True)
814
+
815
+         similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)
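+         # similarity_matrix is (num user skills) x (num job skills); the max of each column is how well the closest user skill covers that requirement.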
816
+
817
+         
818
+
819
+         sum_of_max_similarities = torch.sum(torch.max(similarity_matrix, dim=0).values)
820
+
821
+         avg_score = (sum_of_max_similarities / len(job_skills)).item() if len(job_skills) > 0 else 0
822
+
823
+         
824
+
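+         # avg_score is the mean best-match similarity per required skill; skill_count_factor then down-weights jobs that list fewer than five skills.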
825
+         skill_count_factor = min(1.0, len(job_skills) / 5.0)
826
+
827
+         score_val = avg_score * skill_count_factor
828
+
829
+
830
+
831
+         matched_job_skills_mask = torch.any(similarity_matrix > 0.58, dim=0)
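+         # A job skill counts as covered when any user skill clears 0.58 cosine similarity; the remainder feed the learning plan as missing skills.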
832
+
833
+         all_missing_skills = [skill for i, skill in enumerate(job_skills) if not matched_job_skills_mask[i]]
834
+
835
+
836
+
837
+     if user_skills and score_val >= 0.98:
838
+
839
+         learning_plan_html = "<h4 style='color:green;'>🎉 You have all the required skills!</h4>"
840
+
841
+         job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
842
+
843
+         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
844
+
845
+     
846
+
847
+     if user_skills:
848
+
849
+         job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
850
+
851
+         headline = "<b>Great fit!</b>" if score_val >= 0.8 else "<b>Good progress!</b>" if score_val >= 0.5 else "<b>Solid starting point.</b>"
852
+
853
+         learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
854
+
855
+         skills_to_display = sorted(all_missing_skills)[:5]
856
+
857
+         items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
858
+
859
+         learning_plan_html += f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
860
+
861
+         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
862
+
863
+     else:
864
+
865
+         headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
866
+
867
+         skills_to_display = sorted(job_skills)[:5]
868
+
869
+         items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
870
+
871
+         learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
872
+
873
+         full_skill_list_for_state = sorted(job_skills)
874
+
875
+         new_offset = len(skills_to_display)
876
+
877
+         should_button_be_visible = len(full_skill_list_for_state) > 5
878
+
879
+         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)
880
+
881
+
882
+
883
+ def load_more_skills(full_skills_list, current_offset):
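+     """Reveal the next batch of five skills in the learning plan and hide the button once all are shown."""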
884
+
885
+     SKILLS_INCREMENT = 5
886
+
887
+     new_offset = current_offset + SKILLS_INCREMENT
888
+
889
+     skills_to_display = full_skills_list[:new_offset]
890
+
891
+     items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
892
+
893
+     learning_plan_html = f"<h4>To be a good fit for this role, you'll need to learn these skills:</h4><ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
894
+
895
+     should_button_be_visible = new_offset < len(full_skills_list)
896
+
897
+     return learning_plan_html, new_offset, gr.Button(visible=should_button_be_visible)
898
+
899
+
900
+
901
+ def on_reset():
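+     """Return default values for every component wired to the Reset button, in output order."""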
902
+
903
+     return ("", 3, "", pd.DataFrame(), None, gr.Dropdown(visible=False), gr.Accordion(visible=False), "Status: Ready.", "", "", "", "", gr.Markdown(visible=False), gr.Row(visible=False), [], 0, gr.Button(visible=False))
904
+
905
+
906
+
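+ # One-time startup: load the data and embedding model before building the UI.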
907
+ print("Starting application initialization...")
908
+
909
+ initialization_status = initialize_data_and_model()
910
+
911
+ print(initialization_status)
912
+
913
+
914
+
915
+ with gr.Blocks(theme=gr.themes.Soft()) as ui:
916
+
917
+     gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")
918
+
919
+     initial_matches_state = gr.State()
920
+
921
+     missing_skills_state = gr.State([])
922
+
923
+     skills_offset_state = gr.State(0)
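+     # Cross-event state: the full match table, the selected job's remaining skill list, and how many of them are currently displayed.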
924
+
925
+     with gr.Row():
926
+
927
+         with gr.Column(scale=3):
928
+
929
+             dream_text = gr.Textbox(label='Your Dream Job Description', lines=3, placeholder="e.g., 'A role in a tech startup focused on machine learning...'")
930
+
931
+             with gr.Accordion("Optional: Add Your Skills to Re-rank Results", open=False):
932
+
933
+                 with gr.Row():
934
+
935
+                     skills_text = gr.Textbox(label='Your Skills (comma-separated)', placeholder="e.g., Python, data analysis", scale=3)
936
+
937
+                     rerank_btn = gr.Button("Re-rank", variant="secondary", scale=1)
938
+
939
+         with gr.Column(scale=1):
940
+
941
+             topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of Matches")
942
+
943
+             search_btn = gr.Button("Find Matches", variant="primary")
944
+
945
+             reset_btn = gr.Button("Reset All")
946
+
947
+     status_text = gr.Markdown("Status: Ready.")
948
+
949
+     spelling_alert = gr.Markdown(visible=False)
950
+
951
+     with gr.Row(visible=False) as spelling_row:
952
+
953
+         search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
954
+
955
+         retype_btn = gr.Button("Let Me Fix It", variant="stop")
956
+
957
+     df_output = gr.DataFrame(label="Job Matches", interactive=False)
958
+
959
+     job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:", visible=False)
960
+
961
+     with gr.Accordion("Job Details & Learning Plan", open=False, visible=False) as details_accordion:
962
+
963
+         job_details_markdown = gr.Markdown()
964
+
965
+         with gr.Tabs():
966
+
967
+             with gr.TabItem("Duties"): duties_markdown = gr.Markdown()
968
+
969
+             with gr.TabItem("Qualifications"): qualifications_markdown = gr.Markdown()
970
+
971
+             with gr.TabItem("Full Description"): description_markdown = gr.Markdown()
972
+
973
+         learning_plan_output = gr.HTML(label="Learning Plan")
974
+
975
+         load_more_btn = gr.Button("Load More Skills", visible=False)
976
+
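+     # --- Event wiring: hook each button and the job dropdown to its handler ---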
977
+     search_btn.click(fn=find_matches_and_rank_with_check, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
978
+
979
+     search_anyway_btn.click(fn=find_matches_and_rank_anyway, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
980
+
981
+     retype_btn.click(lambda: ("Status: Ready for you to retype.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(visible=False), gr.Row(visible=False)), outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
982
+
983
+     reset_btn.click(fn=on_reset, outputs=[dream_text, topk_slider, skills_text, df_output, initial_matches_state, job_selector, details_accordion, status_text, job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, spelling_alert, spelling_row, missing_skills_state, skills_offset_state, load_more_btn], queue=False)
984
+
985
+     rerank_btn.click(fn=rerank_current_results, inputs=[initial_matches_state, skills_text, topk_slider], outputs=[status_text, df_output, job_selector])
986
+
987
+     job_selector.change(fn=on_select_job, inputs=[job_selector, skills_text], outputs=[job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, learning_plan_output, details_accordion, missing_skills_state, skills_offset_state, load_more_btn])
988
+
989
+     load_more_btn.click(fn=load_more_skills, inputs=[missing_skills_state, skills_offset_state], outputs=[learning_plan_output, skills_offset_state, load_more_btn])
990
 
991
 
992
 
993
  ui.launch()