Update app.py
Browse files
app.py
CHANGED
|
@@ -61,6 +61,31 @@ FINETUNED_MODEL_ID = "its-zion-18/projfinetuned"
|
|
| 61 |
KNOWN_WORDS = set()
|
| 62 |
|
| 63 |
# --- CORE NLP & HELPER FUNCTIONS ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
def _norm_skill_token(s: str) -> str:
|
| 65 |
s = s.lower().strip()
|
| 66 |
s = re.sub(r'[\(\)\[\]\{\}\*]', '', s)
|
|
@@ -196,41 +221,44 @@ def initialize_data_and_model():
|
|
| 196 |
ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
|
| 197 |
original_df = ds["original"].to_pandas()
|
| 198 |
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
def extract_skills_llm_advanced(job_title: str, duties: str, qualifications: str) -> list[str]:
|
| 202 |
-
if not LLM_PIPELINE: return []
|
| 203 |
-
|
| 204 |
-
# We combine the most important fields to give the LLM full context.
|
| 205 |
-
full_context = f"Job Title: {job_title}\n\nDuties: {duties}\n\nQualifications: {qualifications}"
|
| 206 |
-
|
| 207 |
-
# This prompt is highly specific to encourage better, more niche results.
|
| 208 |
prompt = f"""
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
[Extracted Skills]
|
| 221 |
-
"""
|
| 222 |
try:
|
| 223 |
response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False, temperature=0.1)
|
| 224 |
generated_text = response[0]['generated_text']
|
| 225 |
-
|
| 226 |
-
skills_part = generated_text.split("[Extracted Skills]")[-1].strip()
|
| 227 |
skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
|
| 228 |
-
# Return a de-duplicated list, preserving order as much as possible
|
| 229 |
return list(dict.fromkeys(s.lower() for s in skills))
|
| 230 |
-
except Exception
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
def extract_skills_direct_scan(text: str) -> list[str]:
|
| 235 |
if not isinstance(text, str): return []
|
| 236 |
found_skills = set()
|
|
@@ -239,58 +267,32 @@ def initialize_data_and_model():
|
|
| 239 |
found_skills.add(skill)
|
| 240 |
return list(found_skills)
|
| 241 |
|
| 242 |
-
def expand_skills_with_llm(job_title: str, existing_skills: list) -> list:
|
| 243 |
-
if not LLM_PIPELINE or not job_title: return []
|
| 244 |
-
|
| 245 |
-
skills_to_add = 6 - len(existing_skills)
|
| 246 |
-
prompt = f"""
|
| 247 |
-
Instruct: A job has the title "{job_title}" and requires the skills: {', '.join(existing_skills)}.
|
| 248 |
-
Based on this, what are {skills_to_add} additional, closely related skills typically required for such a role?
|
| 249 |
-
List only the new skills, separated by commas. Do not repeat skills from the original list.
|
| 250 |
-
|
| 251 |
-
Additional Skills:
|
| 252 |
-
"""
|
| 253 |
-
try:
|
| 254 |
-
response = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5)
|
| 255 |
-
generated_text = response[0]['generated_text']
|
| 256 |
-
skills_part = generated_text.split("Additional Skills:")[-1].strip()
|
| 257 |
-
new_skills = [skill.strip().lower() for skill in skills_part.split(',') if skill.strip()]
|
| 258 |
-
return new_skills
|
| 259 |
-
except Exception:
|
| 260 |
-
return []
|
| 261 |
-
|
| 262 |
-
# --- MODIFIED: Hybrid Skill Extraction Logic ---
|
| 263 |
-
# This function is now simpler and more powerful. It prioritizes the advanced LLM extractor.
|
| 264 |
def extract_skills_hybrid(row) -> list[str]:
|
| 265 |
-
|
| 266 |
-
job_title =
|
| 267 |
-
duties = str(row.get('Duties', ''))
|
| 268 |
-
qualifications = str(row.get('qualifications', ''))
|
| 269 |
-
description = str(row.get('Description', ''))
|
| 270 |
-
|
| 271 |
-
# The full text is used for the direct scan as a fallback
|
| 272 |
-
full_text_for_scan = " ".join([job_title, duties, qualifications, description])
|
| 273 |
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
direct_skills = extract_skills_direct_scan(full_text_for_scan)
|
| 279 |
-
|
| 280 |
-
# Combine the results, giving priority to the LLM's findings
|
| 281 |
-
combined_skills = set(advanced_llm_skills) | set(direct_skills)
|
| 282 |
|
| 283 |
-
# If the combined list is still too short,
|
| 284 |
if len(combined_skills) < 6:
|
| 285 |
-
|
|
|
|
| 286 |
combined_skills.update(expanded_skills)
|
| 287 |
|
| 288 |
return sorted(list(combined_skills))
|
| 289 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
|
| 291 |
# Apply the hybrid function row-wise to include job title context
|
| 292 |
original_df['Skills'] = original_df.progress_apply(extract_skills_hybrid, axis=1)
|
| 293 |
-
|
|
|
|
| 294 |
print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
|
| 295 |
original_df.to_parquet(PROCESSED_DATA_PATH)
|
| 296 |
|
|
@@ -327,14 +329,12 @@ def get_job_matches(dream_job: str, top_n: int, skills_text: str):
|
|
| 327 |
emb_matches = find_job_matches(dream_job, expanded_desc, top_k=50)
|
| 328 |
user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
|
| 329 |
|
| 330 |
-
# --- NEW: Initialize variables for the recommendations section ---
|
| 331 |
recommendations_table = pd.DataFrame()
|
| 332 |
recommendations_visible = False
|
| 333 |
|
| 334 |
if user_skills:
|
| 335 |
scored_df = score_jobs_by_skills(user_skills, emb_matches)
|
| 336 |
|
| 337 |
-
# --- NEW: Logic to get top 5 jobs based purely on skill match score ---
|
| 338 |
skill_sorted_df = scored_df.sort_values(by='Skill Match Score', ascending=False).head(5)
|
| 339 |
if not skill_sorted_df.empty:
|
| 340 |
recs = skill_sorted_df[['job_title', 'company', 'Skill Match Score', 'Final Score']].copy()
|
|
@@ -343,7 +343,6 @@ def get_job_matches(dream_job: str, top_n: int, skills_text: str):
|
|
| 343 |
recs['Overall Score'] = recs['Overall Score'].map('{:.2%}'.format)
|
| 344 |
recommendations_table = recs
|
| 345 |
recommendations_visible = True
|
| 346 |
-
# --- END NEW ---
|
| 347 |
|
| 348 |
display_df = scored_df.head(top_n)
|
| 349 |
status = f"Found and **re-ranked** results by your {len(user_skills)} skills. Displaying top {len(display_df)}."
|
|
@@ -364,7 +363,6 @@ def get_job_matches(dream_job: str, top_n: int, skills_text: str):
|
|
| 364 |
dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
|
| 365 |
dropdown_value = dropdown_options[0][1] if dropdown_options else None
|
| 366 |
|
| 367 |
-
# --- MODIFIED: Added new outputs for recommendations ---
|
| 368 |
return status, emb_matches, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True), gr.Accordion(visible=True), recommendations_table, gr.Accordion(visible=recommendations_visible)
|
| 369 |
|
| 370 |
def rerank_current_results(initial_matches_df, skills_text, top_n):
|
|
@@ -373,7 +371,6 @@ def rerank_current_results(initial_matches_df, skills_text, top_n):
|
|
| 373 |
initial_matches_df = pd.DataFrame(initial_matches_df)
|
| 374 |
user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
|
| 375 |
|
| 376 |
-
# --- NEW: Initialize variables for the recommendations section ---
|
| 377 |
recommendations_table = pd.DataFrame()
|
| 378 |
recommendations_visible = False
|
| 379 |
|
|
@@ -388,7 +385,6 @@ def rerank_current_results(initial_matches_df, skills_text, top_n):
|
|
| 388 |
status = f"Results **re-ranked** based on your {len(user_skills)} skills."
|
| 389 |
display_df = ranked_df.head(top_n)
|
| 390 |
|
| 391 |
-
# --- NEW: Logic to get top 5 jobs based purely on skill match score ---
|
| 392 |
skill_sorted_df = ranked_df.sort_values(by='Skill Match Score', ascending=False).head(5)
|
| 393 |
if not skill_sorted_df.empty:
|
| 394 |
recs = skill_sorted_df[['job_title', 'company', 'Skill Match Score', 'Final Score']].copy()
|
|
@@ -397,7 +393,6 @@ def rerank_current_results(initial_matches_df, skills_text, top_n):
|
|
| 397 |
recs['Overall Score'] = recs['Overall Score'].map('{:.2%}'.format)
|
| 398 |
recommendations_table = recs
|
| 399 |
recommendations_visible = True
|
| 400 |
-
# --- END NEW ---
|
| 401 |
|
| 402 |
table_to_show = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']]
|
| 403 |
table_to_show = table_to_show.rename(columns={'Final Score': 'Overall Score'})
|
|
@@ -407,18 +402,15 @@ def rerank_current_results(initial_matches_df, skills_text, top_n):
|
|
| 407 |
dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
|
| 408 |
dropdown_value = dropdown_options[0][1] if dropdown_options else None
|
| 409 |
|
| 410 |
-
# --- MODIFIED: Added new outputs for recommendations ---
|
| 411 |
return status, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True), recommendations_table, gr.Accordion(visible=recommendations_visible)
|
| 412 |
|
| 413 |
def find_matches_and_rank_with_check(dream_job: str, top_n: int, skills_text: str):
|
| 414 |
if not dream_job:
|
| 415 |
-
# --- MODIFIED: Added new default outputs ---
|
| 416 |
return "Please describe your dream job first.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(""), gr.Row(visible=False), pd.DataFrame(), gr.Accordion(visible=False)
|
| 417 |
unrecognized_words = check_spelling_in_query(dream_job)
|
| 418 |
if unrecognized_words:
|
| 419 |
word_list_html = ", ".join([f"<b><span style='color: #F87171;'>{w}</span></b>" for w in unrecognized_words])
|
| 420 |
alert_message = f"<b><span style='color: #F87171;'>⚠️ Possible Spelling Error:</span></b> Unrecognized: {word_list_html}."
|
| 421 |
-
# --- MODIFIED: Added new default outputs ---
|
| 422 |
return "Status: Awaiting confirmation.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(alert_message, visible=True), gr.Row(visible=True), pd.DataFrame(), gr.Accordion(visible=False)
|
| 423 |
|
| 424 |
status, emb_matches, table_to_show, dropdown, details_accordion, recommendations_table, recommendations_accordion = get_job_matches(dream_job, top_n, skills_text)
|
|
@@ -456,12 +448,11 @@ def on_select_job(job_id, skills_text):
|
|
| 456 |
matched_job_skills_mask = torch.any(similarity_matrix > 0.58, dim=0)
|
| 457 |
all_missing_skills = [skill for i, skill in enumerate(job_skills) if not matched_job_skills_mask[i]]
|
| 458 |
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
if user_skills:
|
| 465 |
job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
|
| 466 |
headline = "<b>Great fit!</b>" if score_val >= 0.8 else "<b>Good progress!</b>" if score_val >= 0.5 else "<b>Solid starting point.</b>"
|
| 467 |
learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
|
|
@@ -470,11 +461,25 @@ def on_select_job(job_id, skills_text):
|
|
| 470 |
learning_plan_html += f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
|
| 471 |
return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
|
| 472 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
|
| 474 |
-
skills_to_display = sorted(
|
| 475 |
items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
|
| 476 |
learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
|
| 477 |
-
|
|
|
|
| 478 |
new_offset = len(skills_to_display)
|
| 479 |
should_button_be_visible = len(full_skill_list_for_state) > 5
|
| 480 |
return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)
|
|
@@ -489,7 +494,6 @@ def load_more_skills(full_skills_list, current_offset):
|
|
| 489 |
return learning_plan_html, new_offset, gr.Button(visible=should_button_be_visible)
|
| 490 |
|
| 491 |
def on_reset():
|
| 492 |
-
# --- MODIFIED: Added new default outputs for reset ---
|
| 493 |
return ("", 3, "", pd.DataFrame(), None, gr.Dropdown(visible=False), gr.Accordion(visible=False), "Status: Ready.", "", "", "", "", gr.Markdown(visible=False), gr.Row(visible=False), [], 0, gr.Button(visible=False), pd.DataFrame(), gr.Accordion(visible=False))
|
| 494 |
|
| 495 |
print("Starting application initialization...")
|
|
@@ -520,7 +524,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as ui:
|
|
| 520 |
|
| 521 |
df_output = gr.DataFrame(label="Job Matches (Sorted by Overall Relevance)", interactive=False)
|
| 522 |
|
| 523 |
-
# --- NEW: Added the recommendations section ---
|
| 524 |
with gr.Accordion("✨ Based on your current skills and career interest consider these jobs...", open=True, visible=False) as recommendations_accordion:
|
| 525 |
recommendations_df_output = gr.DataFrame(label="Top Skill Matches", interactive=False)
|
| 526 |
|
|
@@ -534,7 +537,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as ui:
|
|
| 534 |
learning_plan_output = gr.HTML(label="Learning Plan")
|
| 535 |
load_more_btn = gr.Button("Load More Skills", visible=False)
|
| 536 |
|
| 537 |
-
# --- MODIFIED: Added new outputs to the click events ---
|
| 538 |
search_btn.click(fn=find_matches_and_rank_with_check, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row, recommendations_df_output, recommendations_accordion])
|
| 539 |
search_anyway_btn.click(fn=find_matches_and_rank_anyway, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row, recommendations_df_output, recommendations_accordion])
|
| 540 |
retype_btn.click(lambda: ("Status: Ready for you to retype.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(visible=False), gr.Row(visible=False), pd.DataFrame(), gr.Accordion(visible=False)), outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row, recommendations_df_output, recommendations_accordion])
|
|
|
|
| 61 |
KNOWN_WORDS = set()
|
| 62 |
|
| 63 |
# --- CORE NLP & HELPER FUNCTIONS ---
|
| 64 |
+
def expand_skills_with_llm(job_title: str, existing_skills: list, num_skills_to_add: int) -> list:
    """Ask the LLM to suggest additional skills for a role.

    Given a job title and the skills already identified, prompts the LLM for
    `num_skills_to_add` closely related skills and returns them lower-cased
    and de-duplicated (first-seen order preserved). Returns an empty list
    when the pipeline is unavailable, the title is empty, no skills are
    requested, or generation fails.
    """
    if not LLM_PIPELINE or not job_title or num_skills_to_add <= 0:
        return []

    existing_skills_str = ', '.join(existing_skills)
    prompt = f"""
Instruct: A job has the title "{job_title}" and already lists these skills: {existing_skills_str}.
Based on this, what are {num_skills_to_add} additional, closely related skills typically required for such a role?
List only the new skills, separated by commas. Do not repeat skills from the original list. Do not include any preamble.

Additional Skills:
"""
    try:
        output = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5)
        raw_text = output[0]['generated_text']
        # Everything after the final "Additional Skills:" marker is the answer.
        answer = raw_text.split("Additional Skills:")[-1].strip()
        cleaned = (token.strip().lower() for token in answer.split(','))
        unique = dict.fromkeys(token for token in cleaned if token)
        return list(unique)  # Ensure unique skills are returned
    except Exception as exc:
        print(f"🚨 ERROR expanding skills with LLM: {exc}")
        return []
|
| 88 |
+
|
| 89 |
def _norm_skill_token(s: str) -> str:
|
| 90 |
s = s.lower().strip()
|
| 91 |
s = re.sub(r'[\(\)\[\]\{\}\*]', '', s)
|
|
|
|
| 221 |
ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
|
| 222 |
original_df = ds["original"].to_pandas()
|
| 223 |
|
| 224 |
+
def extract_skills_llm(text: str) -> list[str]:
    """Extract key skills from a job-description text via a few-shot LLM prompt.

    Args:
        text: Raw job-description text. Non-strings and strings shorter than
            20 characters (after stripping) are rejected up front.

    Returns:
        De-duplicated, lower-cased skill names in first-seen order, or an
        empty list when the input is unusable, the LLM pipeline is not
        loaded, or generation fails.
    """
    # Guard cheap conditions first; short texts rarely yield useful skills.
    if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE: return []
    prompt = f"""
Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
[Example 1]
Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."
Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
[Example 2]
Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."
Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
[Actual Task]
Text: "{text}"
Extracted Skills:
"""
    try:
        # Deterministic decoding (no sampling, near-zero temperature) for
        # reproducible extraction results.
        response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False, temperature=0.1)
        generated_text = response[0]['generated_text']
        # The model echoes the prompt; keep only what follows the last marker.
        skills_part = generated_text.split("Extracted Skills:")[-1].strip()
        skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(s.lower() for s in skills))
    except Exception as e:
        # FIX: was a silent `except Exception: return []`; log like the
        # sibling expand_skills_with_llm so failures are visible.
        print(f"🚨 ERROR extracting skills with LLM: {e}")
        return []
|
| 245 |
+
|
| 246 |
+
def extract_skills_nltk(text: str) -> list[str]:
    """Extract whitelisted skills from text using NLTK noun-phrase chunking.

    Tokenizes and POS-tags the lower-cased text, chunks noun phrases with the
    grammar ``NP: {<JJ.*>*<NN.*>+}`` (optional adjectives followed by nouns),
    and keeps only phrases whose normalized form appears in SKILL_WHITELIST.

    Args:
        text: Free text to scan; non-strings yield an empty list.

    Returns:
        Sorted list of unique, normalized skill phrases.
    """
    if not isinstance(text, str): return []
    text_lower = text.lower()
    grammar = "NP: {<JJ.*>*<NN.*>+}"
    chunk_parser = nltk.RegexpParser(grammar)
    tokens = nltk.word_tokenize(text_lower)
    tagged_tokens = nltk.pos_tag(tokens)
    chunked_text = chunk_parser.parse(tagged_tokens)
    potential_skills = set()
    for subtree in chunked_text.subtrees():
        if subtree.label() == 'NP':
            phrase = " ".join(word for word, tag in subtree.leaves())
            # FIX: normalize once instead of calling _norm_skill_token twice
            # (once for the membership test, once for the add).
            normalized = _norm_skill_token(phrase)
            if normalized in SKILL_WHITELIST:
                potential_skills.add(normalized)
    return sorted(potential_skills)
|
| 261 |
+
|
| 262 |
def extract_skills_direct_scan(text: str) -> list[str]:
|
| 263 |
if not isinstance(text, str): return []
|
| 264 |
found_skills = set()
|
|
|
|
| 267 |
found_skills.add(skill)
|
| 268 |
return list(found_skills)
|
| 269 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
def extract_skills_hybrid(row) -> list[str]:
    """Combine LLM, NLTK and direct-scan extraction for one dataframe row.

    Runs all three extractors over the row's precomputed ``text_for_skills``
    column, unions their results, and — when fewer than six skills were
    found — tops the set up via LLM expansion using the job title for
    context. Returns the skills sorted alphabetically.
    """
    text = row['text_for_skills']
    job_title = row.get('Job title', '')  # Use original Job title for context

    combined = set()
    for extractor in (extract_skills_llm, extract_skills_nltk, extract_skills_direct_scan):
        combined.update(extractor(text))

    # If the combined list is still too short, expand it via the LLM.
    shortfall = 6 - len(combined)
    if shortfall > 0:
        extra = expand_skills_with_llm(job_title, list(combined), num_skills_to_add=shortfall)
        combined.update(extra)

    return sorted(combined)
|
| 286 |
|
| 287 |
+
def create_text_for_skills(row):
    """Join a row's title, duties, qualifications and description into one string.

    Missing values (None/NaN per ``pd.notna``) are skipped; the remaining
    fields are stringified and joined with single spaces.
    """
    candidates = (
        row.get("Job title"),
        row.get("Duties"),
        row.get("qualifications"),
        row.get("Description"),
    )
    present = [str(value) for value in candidates if pd.notna(value)]
    return " ".join(present)
|
| 289 |
+
|
| 290 |
+
original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
|
| 291 |
print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
|
| 292 |
# Apply the hybrid function row-wise to include job title context
|
| 293 |
original_df['Skills'] = original_df.progress_apply(extract_skills_hybrid, axis=1)
|
| 294 |
+
original_df = original_df.drop(columns=['text_for_skills'])
|
| 295 |
+
|
| 296 |
print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
|
| 297 |
original_df.to_parquet(PROCESSED_DATA_PATH)
|
| 298 |
|
|
|
|
| 329 |
emb_matches = find_job_matches(dream_job, expanded_desc, top_k=50)
|
| 330 |
user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
|
| 331 |
|
|
|
|
| 332 |
recommendations_table = pd.DataFrame()
|
| 333 |
recommendations_visible = False
|
| 334 |
|
| 335 |
if user_skills:
|
| 336 |
scored_df = score_jobs_by_skills(user_skills, emb_matches)
|
| 337 |
|
|
|
|
| 338 |
skill_sorted_df = scored_df.sort_values(by='Skill Match Score', ascending=False).head(5)
|
| 339 |
if not skill_sorted_df.empty:
|
| 340 |
recs = skill_sorted_df[['job_title', 'company', 'Skill Match Score', 'Final Score']].copy()
|
|
|
|
| 343 |
recs['Overall Score'] = recs['Overall Score'].map('{:.2%}'.format)
|
| 344 |
recommendations_table = recs
|
| 345 |
recommendations_visible = True
|
|
|
|
| 346 |
|
| 347 |
display_df = scored_df.head(top_n)
|
| 348 |
status = f"Found and **re-ranked** results by your {len(user_skills)} skills. Displaying top {len(display_df)}."
|
|
|
|
| 363 |
dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
|
| 364 |
dropdown_value = dropdown_options[0][1] if dropdown_options else None
|
| 365 |
|
|
|
|
| 366 |
return status, emb_matches, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True), gr.Accordion(visible=True), recommendations_table, gr.Accordion(visible=recommendations_visible)
|
| 367 |
|
| 368 |
def rerank_current_results(initial_matches_df, skills_text, top_n):
|
|
|
|
| 371 |
initial_matches_df = pd.DataFrame(initial_matches_df)
|
| 372 |
user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
|
| 373 |
|
|
|
|
| 374 |
recommendations_table = pd.DataFrame()
|
| 375 |
recommendations_visible = False
|
| 376 |
|
|
|
|
| 385 |
status = f"Results **re-ranked** based on your {len(user_skills)} skills."
|
| 386 |
display_df = ranked_df.head(top_n)
|
| 387 |
|
|
|
|
| 388 |
skill_sorted_df = ranked_df.sort_values(by='Skill Match Score', ascending=False).head(5)
|
| 389 |
if not skill_sorted_df.empty:
|
| 390 |
recs = skill_sorted_df[['job_title', 'company', 'Skill Match Score', 'Final Score']].copy()
|
|
|
|
| 393 |
recs['Overall Score'] = recs['Overall Score'].map('{:.2%}'.format)
|
| 394 |
recommendations_table = recs
|
| 395 |
recommendations_visible = True
|
|
|
|
| 396 |
|
| 397 |
table_to_show = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']]
|
| 398 |
table_to_show = table_to_show.rename(columns={'Final Score': 'Overall Score'})
|
|
|
|
| 402 |
dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
|
| 403 |
dropdown_value = dropdown_options[0][1] if dropdown_options else None
|
| 404 |
|
|
|
|
| 405 |
return status, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True), recommendations_table, gr.Accordion(visible=recommendations_visible)
|
| 406 |
|
| 407 |
def find_matches_and_rank_with_check(dream_job: str, top_n: int, skills_text: str):
|
| 408 |
if not dream_job:
|
|
|
|
| 409 |
return "Please describe your dream job first.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(""), gr.Row(visible=False), pd.DataFrame(), gr.Accordion(visible=False)
|
| 410 |
unrecognized_words = check_spelling_in_query(dream_job)
|
| 411 |
if unrecognized_words:
|
| 412 |
word_list_html = ", ".join([f"<b><span style='color: #F87171;'>{w}</span></b>" for w in unrecognized_words])
|
| 413 |
alert_message = f"<b><span style='color: #F87171;'>⚠️ Possible Spelling Error:</span></b> Unrecognized: {word_list_html}."
|
|
|
|
| 414 |
return "Status: Awaiting confirmation.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(alert_message, visible=True), gr.Row(visible=True), pd.DataFrame(), gr.Accordion(visible=False)
|
| 415 |
|
| 416 |
status, emb_matches, table_to_show, dropdown, details_accordion, recommendations_table, recommendations_accordion = get_job_matches(dream_job, top_n, skills_text)
|
|
|
|
| 448 |
matched_job_skills_mask = torch.any(similarity_matrix > 0.58, dim=0)
|
| 449 |
all_missing_skills = [skill for i, skill in enumerate(job_skills) if not matched_job_skills_mask[i]]
|
| 450 |
|
| 451 |
+
if score_val >= 0.98:
|
| 452 |
+
learning_plan_html = "<h4 style='color:green;'>🎉 You have all the required skills!</h4>"
|
| 453 |
+
job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
|
| 454 |
+
return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
|
| 455 |
+
|
|
|
|
| 456 |
job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
|
| 457 |
headline = "<b>Great fit!</b>" if score_val >= 0.8 else "<b>Good progress!</b>" if score_val >= 0.5 else "<b>Solid starting point.</b>"
|
| 458 |
learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
|
|
|
|
| 461 |
learning_plan_html += f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
|
| 462 |
return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
|
| 463 |
else:
|
| 464 |
+
# --- MODIFIED LOGIC TO ENSURE AT LEAST 5 SKILLS ---
|
| 465 |
+
current_job_skills = list(job_skills)
|
| 466 |
+
job_title = str(row.get("job_title", ""))
|
| 467 |
+
|
| 468 |
+
if len(current_job_skills) < 5 and job_title and LLM_PIPELINE:
|
| 469 |
+
additional_skills_needed = 5 - len(current_job_skills)
|
| 470 |
+
newly_expanded_skills = expand_skills_with_llm(job_title, current_job_skills, num_skills_to_add=additional_skills_needed)
|
| 471 |
+
|
| 472 |
+
for skill in newly_expanded_skills:
|
| 473 |
+
if skill not in current_job_skills:
|
| 474 |
+
current_job_skills.append(skill)
|
| 475 |
+
# --- END MODIFICATION ---
|
| 476 |
+
|
| 477 |
headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
|
| 478 |
+
skills_to_display = sorted(current_job_skills)[:5]
|
| 479 |
items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
|
| 480 |
learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
|
| 481 |
+
|
| 482 |
+
full_skill_list_for_state = sorted(current_job_skills)
|
| 483 |
new_offset = len(skills_to_display)
|
| 484 |
should_button_be_visible = len(full_skill_list_for_state) > 5
|
| 485 |
return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)
|
|
|
|
| 494 |
return learning_plan_html, new_offset, gr.Button(visible=should_button_be_visible)
|
| 495 |
|
| 496 |
def on_reset():
|
|
|
|
| 497 |
return ("", 3, "", pd.DataFrame(), None, gr.Dropdown(visible=False), gr.Accordion(visible=False), "Status: Ready.", "", "", "", "", gr.Markdown(visible=False), gr.Row(visible=False), [], 0, gr.Button(visible=False), pd.DataFrame(), gr.Accordion(visible=False))
|
| 498 |
|
| 499 |
print("Starting application initialization...")
|
|
|
|
| 524 |
|
| 525 |
df_output = gr.DataFrame(label="Job Matches (Sorted by Overall Relevance)", interactive=False)
|
| 526 |
|
|
|
|
| 527 |
with gr.Accordion("✨ Based on your current skills and career interest consider these jobs...", open=True, visible=False) as recommendations_accordion:
|
| 528 |
recommendations_df_output = gr.DataFrame(label="Top Skill Matches", interactive=False)
|
| 529 |
|
|
|
|
| 537 |
learning_plan_output = gr.HTML(label="Learning Plan")
|
| 538 |
load_more_btn = gr.Button("Load More Skills", visible=False)
|
| 539 |
|
|
|
|
| 540 |
search_btn.click(fn=find_matches_and_rank_with_check, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row, recommendations_df_output, recommendations_accordion])
|
| 541 |
search_anyway_btn.click(fn=find_matches_and_rank_anyway, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row, recommendations_df_output, recommendations_accordion])
|
| 542 |
retype_btn.click(lambda: ("Status: Ready for you to retype.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(visible=False), gr.Row(visible=False), pd.DataFrame(), gr.Accordion(visible=False)), outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row, recommendations_df_output, recommendations_accordion])
|