|
|
import os
import re
import urllib.parse as _url

import datasets
import gradio as gr
import nltk
import pandas as pd
import torch
from nltk.corpus import words, stopwords
from nltk.stem import PorterStemmer
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
|
|
|
|
# Register tqdm's `progress_apply` on pandas objects (used during skill extraction).
tqdm.pandas()

# NLTK resource name -> lookup path handed to `nltk.data.find`.
_NLTK_RESOURCE_PATHS = {
    'words': 'corpora/words',
    'stopwords': 'corpora/stopwords',
    'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger',
    'punkt': 'tokenizers/punkt',
}

# Download each resource only when it is not already available locally.
for package, resource_path in _NLTK_RESOURCE_PATHS.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package)

STOPWORDS = set(stopwords.words('english'))
stemmer = PorterStemmer()
|
|
|
|
|
|
|
|
# Lower-case skill phrases recognised by the NLTK and direct-scan extractors.
SKILL_WHITELIST = {
    # Languages, frameworks, data tooling, cloud and infrastructure
    'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql',
    'html', 'css', 'react', 'angular', 'vue',
    'nodejs', 'django', 'flask', 'fastapi', 'spring boot', 'ruby on rails',
    'php', 'swift', 'kotlin', 'dart', 'flutter',
    'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras',
    'scikit-learn', 'pandas', 'numpy', 'matplotlib',
    'natural language processing', 'nlp', 'computer vision', 'data analysis',
    'data science', 'data engineering',
    'big data', 'spark', 'hadoop', 'kafka', 'data visualization', 'tableau',
    'power bi', 'd3.js', 'statistics', 'analytics',
    'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes',
    'terraform', 'ansible', 'ci/cd', 'jenkins',
    'git', 'github', 'devops', 'linux', 'unix', 'shell scripting',
    'powershell', 'cybersecurity', 'penetration testing',
    'network security', 'cryptography', 'blockchain', 'c#', '.net',
    'sql server', 'mysql', 'postgresql', 'mongodb', 'redis',
    'elasticsearch', 'api design', 'rest apis', 'graphql', 'microservices',
    'serverless', 'system design', 'saas',

    # Business, management and finance
    'agile', 'scrum', 'project management', 'product management',
    'consulting', 'client management', 'business development',
    'strategy', 'stakeholder management', 'risk management', 'compliance',
    'aml', 'kyc', 'reinsurance', 'finance',
    'financial modeling', 'financial analysis', 'due diligence', 'sourcing',
    'procurement', 'negotiation', 'supply chain',
    'business analysis', 'business intelligence', 'presentations',
    'public speaking', 'time management', 'critical thinking',
    'design thinking', 'innovation', 'adaptability', 'supervisory', 'pmp',
    'cpsm', 'cips', 'microsoft office', 'communication',
    'organizational skills',

    # Interpersonal skills, design, sales/marketing and enterprise tools
    'leadership', 'stakeholder communication', 'client communication',
    'teamwork', 'collaboration', 'problem solving',
    'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design',
    'autocad', 'solidworks', 'sales', 'marketing',
    'seo', 'sem', 'content writing', 'customer support', 'technical writing',
    'sap', 'oracle', 'budgeting', 'mentoring', 'supervising',
}
|
|
|
|
|
|
|
|
# --- Shared application state, populated by `initialize_data_and_model` ---
original_df = None                     # one row per original job posting
combined_df = None                     # original + augmented postings
model = None                           # SentenceTransformer used for all embeddings
combined_job_embeddings = None         # embeddings of combined_df["full_text"]
original_job_title_embeddings = None   # embeddings of original_df["job_title"]

# --- Local LLM (set by `initialize_llm_client`) and model identifiers ---
LLM_PIPELINE = None
LLM_MODEL_NAME = "microsoft/phi-2"
FINETUNED_MODEL_ID = "its-zion-18/projfinetuned"

# Vocabulary for the spelling check; filled by `build_known_vocabulary`.
KNOWN_WORDS = set()
|
|
|
|
|
|
|
|
def _norm_skill_token(s: str) -> str: |
|
|
s = s.lower().strip() |
|
|
s = re.sub(r'[\(\)\[\]\{\}\*]', '', s) |
|
|
s = re.sub(r'^\W+|\W+$', '', s) |
|
|
s = re.sub(r'\s+', ' ', s) |
|
|
return s |
|
|
|
|
|
def build_known_vocabulary(df: pd.DataFrame):
    """Populate the global ``KNOWN_WORDS`` set used by the spelling check.

    The vocabulary is the union of the lower-cased NLTK ``words`` corpus and
    every purely alphabetic token longer than two characters found in the
    ``full_text`` column of *df*.

    Args:
        df: Frame with a ``full_text`` column.

    Returns:
        A short status message.
    """
    global KNOWN_WORDS

    dictionary_vocab = {entry.lower() for entry in words.words()}

    corpus_text = " ".join(df['full_text'].astype(str).tolist()).lower()
    corpus_vocab = {
        token
        for token in set(re.findall(r'\b\w+\b', corpus_text))
        if token.isalpha() and len(token) > 2
    }

    KNOWN_WORDS = dictionary_vocab | corpus_vocab
    return "Known vocabulary built."
|
|
|
|
|
def check_spelling_in_query(query: str) -> list[str]:
    """Return the words of *query* that are not in ``KNOWN_WORDS``.

    Each whitespace-separated token is lower-cased and stripped of
    surrounding punctuation, so "enginer," is checked as "enginer". Only
    purely alphabetic words longer than one character are considered.

    Args:
        query: Free-text user query.

    Returns:
        Unrecognised words, de-duplicated, in order of first appearance.
        Empty when the vocabulary has not been built or the query is empty.
    """
    if not KNOWN_WORDS or not query:
        return []

    # dict keys give order-preserving de-duplication (a set would not).
    unrecognized: dict[str, None] = {}
    for raw_token in query.lower().split():
        word = re.sub(r'^\W+|\W+$', '', raw_token)
        if word.isalpha() and len(word) > 1 and word not in KNOWN_WORDS:
            unrecognized[word] = None
    return list(unrecognized)
|
|
|
|
|
def initialize_llm_client():
    """Load the local text-generation LLM into the global ``LLM_PIPELINE``.

    Returns:
        True when the pipeline was created, False if loading failed (the
        error is printed and ``LLM_PIPELINE`` is left unchanged).
    """
    global LLM_PIPELINE

    model_kwargs = {
        "torch_dtype": "auto",
        "device_map": "auto",
        "trust_remote_code": True,
    }
    try:
        llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, trust_remote_code=True)
        llm_model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_NAME, **model_kwargs)
        LLM_PIPELINE = pipeline("text-generation", model=llm_model, tokenizer=llm_tokenizer)
    except Exception as err:
        print(f"🚨 ERROR initializing local LLM: {err}")
        return False
    return True
|
|
|
|
|
def llm_expand_query(user_input: str) -> str:
    """Expand a short career interest into a richer search query via the LLM.

    The LLM is asked for a one-sentence elaboration, which is appended to the
    original input. Newlines and colons are removed from the elaboration.

    Args:
        user_input: The user's raw description of their desired job.

    Returns:
        ``"<user_input>. <expansion>"`` on success. The unmodified
        *user_input* when no LLM is loaded, the LLM produced no text, or
        generation failed (the error is printed, never raised).
    """
    if not LLM_PIPELINE:
        return user_input

    prompt_template = (
        f"User's career interest: '{user_input}'\n"
        f"Instruction: Based on the user's interest, write a concise, single-sentence summary (40-60 words) that elaborates on the core intent, typical skills, and responsibilities. "
        f"Do not include a preamble, the user input, or any list formatting in the output. Just the expanded sentence.\n"
        f"Expanded Intent:"
    )

    try:
        response = LLM_PIPELINE(prompt_template, max_new_tokens=100, do_sample=True, temperature=0.6)
        # The pipeline echoes the prompt; keep only what follows the last marker.
        expanded_query = response[0]['generated_text'].strip().split("Expanded Intent:")[-1].strip()
    except Exception as err:
        # Best effort: query expansion is optional, so fall back to the raw input.
        print(f"Warning: LLM query expansion failed, using original query: {err}")
        return user_input

    expanded_query = expanded_query.replace('\n', ' ').replace(':', '').strip()
    if not expanded_query:
        return user_input

    final_query = f"{user_input}. {expanded_query}"
    # Avoid a doubled period when the input already ends with one.
    return final_query.replace('..', '.').strip()
|
|
|
|
|
def find_job_matches(original_user_query: str, expanded_user_query: str, top_k: int = 50) -> pd.DataFrame:
    """Rank jobs by semantic similarity to the user's query.

    The expanded query is compared against every row of ``combined_df``; the
    best-scoring row per ``job_id`` is kept as that job's general score. The
    original query is compared against the original job titles to obtain a
    title boost. The final ``Similarity Score`` is
    ``0.70 * general + 0.30 * title boost``.

    Args:
        original_user_query: The query exactly as typed by the user.
        expanded_user_query: The LLM-expanded version of the query.
        top_k: Number of jobs to return.

    Returns:
        Rows of ``original_df`` for the top jobs, sorted by descending
        ``Similarity Score``, indexed by job id, with the ``job_id`` column
        renamed to ``Job ID``.
    """
    # General score: expanded query vs. every (original + augmented) posting.
    query_vector = model.encode(expanded_user_query, convert_to_tensor=True)
    semantic_scores = util.cos_sim(query_vector, combined_job_embeddings)[0]
    ranking = torch.topk(semantic_scores, k=len(combined_df))

    ranked_rows = combined_df.iloc[ranking.indices.cpu()].copy()
    ranked_rows['general_score'] = ranking.values.cpu().numpy()
    # Rows are already in descending score order, so 'first' is the best per job.
    best_per_job = ranked_rows.drop_duplicates(subset=['job_id'], keep='first').set_index('job_id')

    # Title boost: original query vs. the original job titles.
    title_query_vector = model.encode(original_user_query, convert_to_tensor=True)
    title_scores = util.cos_sim(title_query_vector, original_job_title_embeddings)[0].cpu().numpy()
    boost_by_job = pd.Series(title_scores, index=original_df['job_id'])
    best_per_job['title_boost_score'] = best_per_job.index.map(boost_by_job).fillna(0)

    best_per_job['Similarity Score'] = (
        best_per_job['general_score'] * 0.70 + best_per_job['title_boost_score'] * 0.30
    )

    shortlisted_ids = (
        best_per_job.sort_values(by='Similarity Score', ascending=False)
        .head(top_k)
        .index.tolist()
    )
    shortlisted = original_df[original_df['job_id'].isin(shortlisted_ids)].copy()
    score_lookup = best_per_job.reset_index()[['job_id', 'Similarity Score']].copy()

    return (
        pd.merge(shortlisted, score_lookup, on='job_id', how='left')
        .sort_values(by='Similarity Score', ascending=False)
        .reset_index(drop=True)
        .set_index('job_id', drop=False)
        .rename(columns={'job_id': 'Job ID'})
    )
|
|
|
|
|
def score_jobs_by_skills(user_skills: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
    """Re-rank job matches by how well the user's skills cover each job.

    For every job skill the best cosine similarity against any user skill is
    taken; a job's ``Skill Match Score`` is the mean of those values scaled by
    ``min(1, n_skills / 5)`` so jobs with very few listed skills are not
    over-rewarded. ``Final Score`` is
    ``0.8 * Similarity Score + 0.2 * Skill Match Score``.

    Args:
        user_skills: Normalised user skill strings.
        df_to_rank: Output of ``find_job_matches`` (needs ``Similarity Score``
            and, for skill scoring, ``Skills`` and ``Job ID`` columns).

    Returns:
        A re-ranked copy of *df_to_rank*. When there is nothing to score
        (no frame, no user skills, no ``Skills`` column) the frame is returned
        sorted by ``Similarity Score`` only; ``None`` yields an empty frame.
    """
    if df_to_rank is None:
        return pd.DataFrame()

    def _by_similarity(frame: pd.DataFrame) -> pd.DataFrame:
        # An empty frame may not carry the score column at all.
        if 'Similarity Score' not in frame.columns:
            return frame
        return frame.sort_values(by='Similarity Score', ascending=False)

    if df_to_rank.empty or not user_skills:
        return _by_similarity(df_to_rank)

    ranked_df = df_to_rank.copy()
    if 'Skills' not in ranked_df.columns:
        return _by_similarity(ranked_df)

    def _as_skill_list(value) -> list:
        # Cells may be lists, numpy arrays (after a parquet round-trip), None
        # or NaN; array truthiness is ambiguous, so normalise to plain lists.
        if value is None or isinstance(value, (str, bytes, float)):
            return []
        try:
            return list(value)
        except TypeError:
            return []

    job_skill_lists = ranked_df['Skills'].map(_as_skill_list)
    all_job_skills = sorted({skill for skills in job_skill_lists for skill in skills})

    if not all_job_skills:
        ranked_df['Skill Match Score'] = 0.0
        ranked_df['Final Score'] = ranked_df['Similarity Score']
        return ranked_df

    user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
    job_skill_embeddings = model.encode(all_job_skills, convert_to_tensor=True)
    similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)

    # Best user-skill similarity per job skill, computed once for all rows.
    best_similarity = dict(
        zip(all_job_skills, similarity_matrix.max(dim=0).values.cpu().tolist())
    )

    def _confidence_adjusted_score(job_skills: list) -> float:
        if not job_skills:
            return 0.0
        total_required = len(job_skills)
        avg_score = sum(best_similarity.get(skill, 0.0) for skill in job_skills) / total_required
        skill_count_factor = min(1.0, total_required / 5.0)
        return avg_score * skill_count_factor

    ranked_df['Skill Match Score'] = job_skill_lists.map(_confidence_adjusted_score)
    ranked_df['Final Score'] = (0.8 * ranked_df['Similarity Score']) + (0.2 * ranked_df['Skill Match Score'])

    ranked_df = ranked_df.sort_values(by='Final Score', ascending=False).reset_index(drop=True)
    if 'Job ID' not in ranked_df.columns:
        return ranked_df
    return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
|
|
|
|
|
# Noun-phrase chunker for the NLTK extractor, built once instead of per row.
_NP_CHUNKER = nltk.RegexpParser("NP: {<JJ.*>*<NN.*>+}")


def _compile_skill_pattern(skill: str):
    """Compile a case-insensitive whole-phrase pattern for *skill*.

    A "not a word character" guard is added only on edges that are word
    characters. Plain ``\\b`` fails next to symbols, so ``c++`` / ``c#`` could
    never match and ``.net`` matched only when glued to a preceding word.
    """
    prefix = r'(?<!\w)' if re.match(r'\w', skill[0]) else ''
    suffix = r'(?!\w)' if re.match(r'\w', skill[-1]) else ''
    return re.compile(prefix + re.escape(skill) + suffix, re.IGNORECASE)


# Whitelisted skill -> compiled search pattern.
_SKILL_PATTERNS = {skill: _compile_skill_pattern(skill) for skill in SKILL_WHITELIST}


def _extract_skills_llm(text: str) -> list[str]:
    """Ask the local LLM for the skills in *text*; empty list if unavailable."""
    if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE:
        return []
    prompt = (
        "\n"
        "Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.\n"
        "[Example 1]\n"
        'Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."\n'
        "Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code\n"
        "[Example 2]\n"
        'Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."\n'
        "Extracted Skills: project management, leadership, stakeholder communication, agile, scrum\n"
        "[Actual Task]\n"
        f'Text: "{text}"\n'
        "Extracted Skills:\n"
    )
    try:
        # Greedy decoding; `temperature` is not passed because it has no
        # effect when do_sample=False.
        response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False)
        generated_text = response[0]['generated_text']
        skills_part = generated_text.split("Extracted Skills:")[-1].strip()
        skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
        # Lower-case and de-duplicate while keeping first-seen order.
        return list(dict.fromkeys(s.lower() for s in skills))
    except Exception:
        # Best effort per row: one failed generation must not abort the batch.
        return []


def _extract_skills_nltk(text: str) -> list[str]:
    """Return whitelisted skills found among the noun phrases of *text*."""
    if not isinstance(text, str):
        return []
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text.lower()))
    potential_skills = set()
    for subtree in _NP_CHUNKER.parse(tagged_tokens).subtrees():
        if subtree.label() != 'NP':
            continue
        phrase = _norm_skill_token(" ".join(word for word, _tag in subtree.leaves()))
        if phrase in SKILL_WHITELIST:
            potential_skills.add(phrase)
    return sorted(potential_skills)


def _extract_skills_direct_scan(text: str) -> list[str]:
    """Return every whitelisted skill that literally occurs in *text*."""
    if not isinstance(text, str):
        return []
    return [skill for skill, pattern in _SKILL_PATTERNS.items() if pattern.search(text)]


def _expand_skills_with_llm(job_title: str, existing_skills: list) -> list:
    """Ask the LLM for extra related skills so a job reaches about six skills."""
    if not LLM_PIPELINE or not job_title:
        return []
    skills_to_add = 6 - len(existing_skills)
    prompt = (
        "\n"
        f'Instruct: A job has the title "{job_title}" and requires the skills: {", ".join(existing_skills)}.\n'
        f"Based on this, what are {skills_to_add} additional, closely related skills typically required for such a role?\n"
        "List only the new skills, separated by commas. Do not repeat skills from the original list.\n"
        "\n"
        "Additional Skills:\n"
    )
    try:
        response = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5)
        generated_text = response[0]['generated_text']
        skills_part = generated_text.split("Additional Skills:")[-1].strip()
        return [skill.strip().lower() for skill in skills_part.split(',') if skill.strip()]
    except Exception:
        # Best effort: expansion is optional.
        return []


def _extract_skills_hybrid(row) -> list[str]:
    """Combine LLM, NLTK and direct-scan skills for one job row."""
    text = row['text_for_skills']
    job_title = row.get('Job title', '')

    combined_skills = (
        set(_extract_skills_llm(text))
        | set(_extract_skills_nltk(text))
        | set(_extract_skills_direct_scan(text))
    )
    # Sparse result: let the LLM suggest related skills from the job title.
    if len(combined_skills) < 6:
        combined_skills.update(_expand_skills_with_llm(job_title, list(combined_skills)))
    return sorted(combined_skills)


def _join_present_fields(row, fields) -> str:
    """Join the non-missing values of *fields* in *row* with single spaces."""
    return " ".join(str(value) for value in (row.get(name) for name in fields) if pd.notna(value))


def _create_text_for_skills(row) -> str:
    """Text used for skill extraction: title, duties, qualifications, description."""
    return _join_present_fields(row, ("Job title", "Duties", "qualifications", "Description"))


def _create_full_text(row) -> str:
    """Text used for embeddings; missing fields are skipped, not rendered as 'nan'."""
    return _join_present_fields(row, ("Job title", "Company", "Duties", "qualifications", "Description"))


def _as_plain_list(value) -> list:
    """Convert a stored skills cell (list, ndarray, None or NaN) to a list."""
    if value is None or isinstance(value, (str, bytes, float)):
        return []
    return list(value)


def initialize_data_and_model():
    """Load data, models and embeddings into the module-level globals.

    Steps:
      1. Try to load the local LLM (failure is tolerated).
      2. Load the job table with extracted skills from the parquet cache, or
         build it from the Hugging Face dataset and write the cache.
      3. Build ``original_df`` / ``combined_df`` with ``job_id`` and
         ``full_text`` columns.
      4. Load the sentence-transformer and encode job texts and titles.
      5. Build the spelling vocabulary.

    Returns:
        A status message string.
    """
    global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings

    PROCESSED_DATA_PATH = "processed_jobs_with_skills.parquet"
    DATASET_ID = "its-zion-18/Jobs-tabular-dataset"

    print("--- Initializing LLM Client ---")
    if not initialize_llm_client():
        print("Warning: LLM Client failed to initialize. Will use NLTK only for skills.")

    ds = None  # downloaded at most once, whichever branch needs it first
    if os.path.exists(PROCESSED_DATA_PATH):
        print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
        original_df = pd.read_parquet(PROCESSED_DATA_PATH)
    else:
        print("--- No pre-processed data found. Starting one-time processing... ---")
        ds = datasets.load_dataset(DATASET_ID)
        original_df = ds["original"].to_pandas()

        original_df["text_for_skills"] = original_df.apply(_create_text_for_skills, axis=1)
        print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
        original_df['Skills'] = original_df.progress_apply(_extract_skills_hybrid, axis=1)
        original_df = original_df.drop(columns=['text_for_skills'])

        print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
        original_df.to_parquet(PROCESSED_DATA_PATH)

    # List columns come back from parquet as numpy arrays, whose truthiness
    # is ambiguous; normalise so later `if not skills` checks are safe.
    if 'Skills' in original_df.columns:
        original_df['Skills'] = original_df['Skills'].map(_as_plain_list)

    original_df['job_id'] = original_df.index
    original_df["full_text"] = original_df.apply(_create_full_text, axis=1)

    if ds is None:
        ds = datasets.load_dataset(DATASET_ID)
    augmented_df = ds["augmented"].to_pandas()
    # Each block of 20 augmented rows is mapped to one original job, clamped
    # to the last valid job id.
    max_id = len(original_df) - 1
    augmented_df['job_id'] = augmented_df.index.map(lambda i: min(i // 20, max_id))
    augmented_df["full_text"] = augmented_df.apply(_create_full_text, axis=1)

    combined_df = pd.concat([original_df.copy(), augmented_df.copy()], ignore_index=True)
    original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})

    print("--- Loading Fine-Tuned Sentence Transformer Model ---")
    model = SentenceTransformer(FINETUNED_MODEL_ID)

    print("--- Encoding Embeddings ---")
    combined_job_embeddings = model.encode(combined_df["full_text"].tolist(), convert_to_tensor=True, show_progress_bar=True)
    original_job_title_embeddings = model.encode(original_df["job_title"].tolist(), convert_to_tensor=True, show_progress_bar=True)

    print("--- Building Vocabulary ---")
    build_known_vocabulary(combined_df)
    return "--- Initialization Complete ---"
|
|
|
|
|
def _course_links_for(skill: str) -> str: |
|
|
q = _url.quote(skill) |
|
|
links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")] |
|
|
return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links]) |
|
|
|
|
|
def get_job_matches(dream_job: str, top_n: int, skills_text: str):
    """Run the full search and build every UI output for the result view.

    The query is expanded by the LLM, matched semantically, and — when the
    user supplied skills — re-ranked by skill overlap.

    Args:
        dream_job: Free-text description of the desired job.
        top_n: Number of matches to display.
        skills_text: Comma-separated user skills (may be empty).

    Returns:
        A 7-tuple: status text, the raw semantic matches (kept in state for
        later re-ranking), the table to display, the job dropdown, the
        details accordion, the skill-based recommendations table and its
        accordion.
    """
    top_n = int(top_n)
    as_percent = '{:.2%}'.format

    expanded_desc = llm_expand_query(dream_job)
    emb_matches = find_job_matches(dream_job, expanded_desc, top_k=50)
    user_skills = [
        token
        for token in (_norm_skill_token(s) for s in (skills_text or "").split(','))
        if token
    ]

    recommendations_table = pd.DataFrame()
    recommendations_visible = False

    if user_skills:
        scored_df = score_jobs_by_skills(user_skills, emb_matches)

        # Top five jobs by pure skill match, shown as separate recommendations.
        # The scorer may return without skill columns (no 'Skills' data).
        if 'Skill Match Score' in scored_df.columns:
            skill_sorted_df = scored_df.sort_values(by='Skill Match Score', ascending=False).head(5)
            if not skill_sorted_df.empty:
                recs = skill_sorted_df[['job_title', 'company', 'Skill Match Score', 'Final Score']].copy()
                recs = recs.rename(columns={'Final Score': 'Overall Score'})
                recs['Skill Match Score'] = recs['Skill Match Score'].map(as_percent)
                recs['Overall Score'] = recs['Overall Score'].map(as_percent)
                recommendations_table = recs
                recommendations_visible = True

        display_df = scored_df.head(top_n)
        status = f"Found and **re-ranked** results by your {len(user_skills)} skills. Displaying top {len(display_df)}."
    else:
        display_df = emb_matches.head(top_n)
        status = f"Found {len(display_df)} top matches using semantic search."

    if 'Final Score' in display_df.columns:
        table_to_show = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']].copy()
        table_to_show = table_to_show.rename(columns={'Final Score': 'Overall Score'})
        table_to_show['Skill Match Score'] = table_to_show['Skill Match Score'].map(as_percent)
        table_to_show['Overall Score'] = table_to_show['Overall Score'].map(as_percent)
    else:
        table_to_show = display_df[['job_title', 'company', 'Similarity Score']].copy()
        table_to_show = table_to_show.rename(columns={'Similarity Score': 'Overall Score'})
        table_to_show['Overall Score'] = table_to_show['Overall Score'].map(as_percent)

    # The frame is indexed by job id, so the visible rank must come from
    # enumerate(); the index label is the dropdown *value*.
    dropdown_options = [
        (f"{rank}. {row['job_title']} - {row['company']}", job_id)
        for rank, (job_id, row) in enumerate(display_df.iterrows(), start=1)
    ]
    dropdown_value = dropdown_options[0][1] if dropdown_options else None

    return (
        status,
        emb_matches,
        table_to_show,
        gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True),
        gr.Accordion(visible=True),
        recommendations_table,
        gr.Accordion(visible=recommendations_visible),
    )
|
|
|
|
|
def rerank_current_results(initial_matches_df, skills_text, top_n):
    """Re-rank the stored search results using the current skills box.

    Args:
        initial_matches_df: Semantic matches saved by the last search.
        skills_text: Comma-separated user skills; empty restores the
            original semantic ordering.
        top_n: Number of matches to display.

    Returns:
        A 5-tuple: status text, the table to display, the job dropdown, the
        skill-based recommendations table and its accordion.
    """
    if initial_matches_df is None or pd.DataFrame(initial_matches_df).empty:
        return (
            "Please find matches first before re-ranking.",
            pd.DataFrame(),
            gr.Dropdown(visible=False),
            pd.DataFrame(),
            gr.Accordion(visible=False),
        )

    initial_matches_df = pd.DataFrame(initial_matches_df)
    top_n = int(top_n)
    as_percent = '{:.2%}'.format
    user_skills = [
        token
        for token in (_norm_skill_token(s) for s in (skills_text or "").split(','))
        if token
    ]

    recommendations_table = pd.DataFrame()
    recommendations_visible = False

    if not user_skills:
        status = "Skills cleared. Showing original semantic search results."
        display_df = initial_matches_df.head(top_n)
    else:
        ranked_df = score_jobs_by_skills(user_skills, initial_matches_df)
        status = f"Results **re-ranked** based on your {len(user_skills)} skills."
        display_df = ranked_df.head(top_n)

        # Top five jobs by pure skill match, shown as separate recommendations.
        # The scorer may return without skill columns (no 'Skills' data).
        if 'Skill Match Score' in ranked_df.columns:
            skill_sorted_df = ranked_df.sort_values(by='Skill Match Score', ascending=False).head(5)
            if not skill_sorted_df.empty:
                recs = skill_sorted_df[['job_title', 'company', 'Skill Match Score', 'Final Score']].copy()
                recs = recs.rename(columns={'Final Score': 'Overall Score'})
                recs['Skill Match Score'] = recs['Skill Match Score'].map(as_percent)
                recs['Overall Score'] = recs['Overall Score'].map(as_percent)
                recommendations_table = recs
                recommendations_visible = True

    if user_skills and 'Final Score' in display_df.columns:
        table_to_show = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']].copy()
        table_to_show = table_to_show.rename(columns={'Final Score': 'Overall Score'})
        table_to_show['Skill Match Score'] = table_to_show['Skill Match Score'].map(as_percent)
        table_to_show['Overall Score'] = table_to_show['Overall Score'].map(as_percent)
    else:
        table_to_show = display_df[['job_title', 'company', 'Similarity Score']].copy()
        table_to_show = table_to_show.rename(columns={'Similarity Score': 'Overall Score'})
        table_to_show['Overall Score'] = table_to_show['Overall Score'].map(as_percent)

    # The frame is indexed by job id, so the visible rank must come from
    # enumerate(); the index label is the dropdown *value*.
    dropdown_options = [
        (f"{rank}. {row['job_title']} - {row['company']}", job_id)
        for rank, (job_id, row) in enumerate(display_df.iterrows(), start=1)
    ]
    dropdown_value = dropdown_options[0][1] if dropdown_options else None

    return (
        status,
        table_to_show,
        gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True),
        recommendations_table,
        gr.Accordion(visible=recommendations_visible),
    )
|
|
|
|
|
def find_matches_and_rank_with_check(dream_job: str, top_n: int, skills_text: str):
    """Search handler that first checks the query for unknown words.

    Args:
        dream_job: Free-text description of the desired job.
        top_n: Number of matches to display.
        skills_text: Comma-separated user skills.

    Returns:
        A 9-tuple for the search outputs: status, stored matches, results
        table, job dropdown, details accordion, spelling alert, spelling
        button row, recommendations table and recommendations accordion.
        A blank query or a query with unrecognised words returns early with
        empty results; in the latter case the spelling alert and
        confirmation buttons are shown.
    """
    # Treat whitespace-only input like an empty query.
    if not dream_job or not dream_job.strip():
        return (
            "Please describe your dream job first.",
            None,
            pd.DataFrame(),
            gr.Dropdown(visible=False),
            gr.Accordion(visible=False),
            gr.Markdown(""),
            gr.Row(visible=False),
            pd.DataFrame(),
            gr.Accordion(visible=False),
        )

    unrecognized_words = check_spelling_in_query(dream_job)
    if unrecognized_words:
        word_list_html = ", ".join([f"<b><span style='color: #F87171;'>{w}</span></b>" for w in unrecognized_words])
        alert_message = f"<b><span style='color: #F87171;'>⚠️ Possible Spelling Error:</span></b> Unrecognized: {word_list_html}."
        return (
            "Status: Awaiting confirmation.",
            None,
            pd.DataFrame(),
            gr.Dropdown(visible=False),
            gr.Accordion(visible=False),
            gr.Markdown(alert_message, visible=True),
            gr.Row(visible=True),
            pd.DataFrame(),
            gr.Accordion(visible=False),
        )

    (status, emb_matches, table_to_show, dropdown, details_accordion,
     recommendations_table, recommendations_accordion) = get_job_matches(dream_job, top_n, skills_text)
    return (
        status,
        emb_matches,
        table_to_show,
        dropdown,
        details_accordion,
        gr.Markdown(visible=False),
        gr.Row(visible=False),
        recommendations_table,
        recommendations_accordion,
    )
|
|
|
|
|
def find_matches_and_rank_anyway(dream_job: str, top_n: int, skills_text: str):
    """Search handler that skips the spelling check ("Search Anyway").

    Returns the same 9-tuple layout as the checked search handler, with the
    spelling alert and its button row hidden.
    """
    (
        status,
        emb_matches,
        table_to_show,
        dropdown,
        details_accordion,
        recommendations_table,
        recommendations_accordion,
    ) = get_job_matches(dream_job, top_n, skills_text)

    hidden_alert = gr.Markdown(visible=False)
    hidden_alert_row = gr.Row(visible=False)
    return (
        status,
        emb_matches,
        table_to_show,
        dropdown,
        details_accordion,
        hidden_alert,
        hidden_alert_row,
        recommendations_table,
        recommendations_accordion,
    )
|
|
|
|
|
def on_select_job(job_id, skills_text):
    """Build the details panel and learning plan for the selected job.

    With user skills, each job skill counts as covered when any user skill
    has cosine similarity above 0.58 to it; uncovered skills (at most five)
    form the learning plan and an overall skill-match percentage is shown.
    Without user skills, the first five job skills are listed and the rest
    are put in state for the "Load More Skills" button.

    Args:
        job_id: Index label of the job in ``original_df`` (None when cleared).
        skills_text: Comma-separated user skills.

    Returns:
        A 9-tuple: details markdown, duties, qualifications, description,
        learning-plan HTML, details accordion, full skill list for paging,
        current paging offset and the load-more button.
    """
    if job_id is None:
        return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)

    row = original_df.loc[job_id]
    title, company = str(row.get("job_title", "")), str(row.get("company", ""))
    job_details_markdown = f"### {title} — {company}"
    duties = str(row.get('Duties', ''))
    qualifications = str(row.get('qualifications', ''))
    description = str(row.get('Description', ''))

    user_skills = [
        token
        for token in (_norm_skill_token(s) for s in (skills_text or "").split(','))
        if token
    ]

    # Skills may be a numpy array (after a parquet round-trip), None or NaN;
    # array truthiness is ambiguous, so normalise to a plain list first.
    raw_skills = row.get("Skills", [])
    if raw_skills is None or isinstance(raw_skills, (str, bytes, float)):
        job_skills = []
    else:
        job_skills = list(raw_skills)

    if not job_skills:
        learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)

    if not user_skills:
        headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
        full_skill_list_for_state = sorted(job_skills)
        skills_to_display = full_skill_list_for_state[:5]
        items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
        learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
        new_offset = len(skills_to_display)
        should_button_be_visible = len(full_skill_list_for_state) > 5
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)

    user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
    job_skill_embeddings = model.encode(job_skills, convert_to_tensor=True)
    similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)

    # Same scoring as the ranking step: mean best similarity per job skill,
    # damped for jobs listing fewer than five skills.
    sum_of_max_similarities = torch.sum(torch.max(similarity_matrix, dim=0).values)
    avg_score = (sum_of_max_similarities / len(job_skills)).item()
    skill_count_factor = min(1.0, len(job_skills) / 5.0)
    score_val = avg_score * skill_count_factor

    covered_mask = torch.any(similarity_matrix > 0.58, dim=0).tolist()
    all_missing_skills = [skill for skill, covered in zip(job_skills, covered_mask) if not covered]

    job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"

    # No uncovered skills means there is nothing to put in a learning plan,
    # so show the success message instead of an empty list.
    if score_val >= 0.98 or not all_missing_skills:
        learning_plan_html = "<h4 style='color:green;'>🎉 You have all the required skills!</h4>"
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)

    if score_val >= 0.8:
        headline = "<b>Great fit!</b>"
    elif score_val >= 0.5:
        headline = "<b>Good progress!</b>"
    else:
        headline = "<b>Solid starting point.</b>"

    learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
    skills_to_display = sorted(all_missing_skills)[:5]
    items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
    learning_plan_html += f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
    return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
|
|
|
|
|
def load_more_skills(full_skills_list, current_offset):
    """Reveal five more skills in the learning plan.

    Args:
        full_skills_list: Every skill available for the selected job.
        current_offset: Number of skills currently shown.

    Returns:
        The re-rendered learning-plan HTML, the new offset, and the
        load-more button (hidden once every skill is shown).
    """
    page_size = 5
    shown_count = current_offset + page_size

    list_items = "".join(
        f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>"
        for ms in full_skills_list[:shown_count]
    )
    learning_plan_html = f"<h4>To be a good fit for this role, you'll need to learn these skills:</h4><ul style='list-style-type: none; padding-left: 0;'>{list_items}</ul>"

    has_more = shown_count < len(full_skills_list)
    return learning_plan_html, shown_count, gr.Button(visible=has_more)
|
|
|
|
|
def on_reset():
    """Return the default value for every output wired to the reset button.

    The tuple order matches the ``outputs`` list of ``reset_btn.click``.
    """
    return (
        "",                             # dream job text
        3,                              # number of matches
        "",                             # skills text
        pd.DataFrame(),                 # results table
        None,                           # stored matches
        gr.Dropdown(visible=False),     # job selector
        gr.Accordion(visible=False),    # details accordion
        "Status: Ready.",               # status line
        "",                             # job details
        "",                             # duties
        "",                             # qualifications
        "",                             # description
        gr.Markdown(visible=False),     # spelling alert
        gr.Row(visible=False),          # spelling buttons
        [],                             # skills kept for paging
        0,                              # paging offset
        gr.Button(visible=False),       # load-more button
        pd.DataFrame(),                 # recommendations table
        gr.Accordion(visible=False),    # recommendations accordion
    )
|
|
|
|
|
# Load data and models before the UI is built.
print("Starting application initialization...")
initialization_status = initialize_data_and_model()
print(initialization_status)

with gr.Blocks(theme=gr.themes.Soft()) as ui:
    gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")

    # Per-session state: last semantic matches and learning-plan paging.
    initial_matches_state = gr.State()
    missing_skills_state = gr.State([])
    skills_offset_state = gr.State(0)

    # --- Inputs ---
    with gr.Row():
        with gr.Column(scale=3):
            dream_text = gr.Textbox(label='Your Dream Job Description', lines=3, placeholder="e.g., 'A role in a tech startup focused on machine learning...'")
            with gr.Accordion("Optional: Add Your Skills to Re-rank Results", open=False):
                with gr.Row():
                    skills_text = gr.Textbox(label='Your Skills (comma-separated)', placeholder="e.g., Python, data analysis", scale=3)
                    rerank_btn = gr.Button("Re-rank", variant="secondary", scale=1)
        with gr.Column(scale=1):
            topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of Matches")
            search_btn = gr.Button("Find Matches", variant="primary")
            reset_btn = gr.Button("Reset All")

    # --- Status and spelling confirmation ---
    status_text = gr.Markdown("Status: Ready.")
    spelling_alert = gr.Markdown(visible=False)
    with gr.Row(visible=False) as spelling_row:
        search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
        retype_btn = gr.Button("Let Me Fix It", variant="stop")

    # --- Results ---
    df_output = gr.DataFrame(label="Job Matches (Sorted by Overall Relevance)", interactive=False)

    with gr.Accordion("✨ Based on your current skills and career interest consider these jobs...", open=True, visible=False) as recommendations_accordion:
        recommendations_df_output = gr.DataFrame(label="Top Skill Matches", interactive=False)

    # --- Job details and learning plan ---
    job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:", visible=False)
    with gr.Accordion("Job Details & Learning Plan", open=False, visible=False) as details_accordion:
        job_details_markdown = gr.Markdown()
        with gr.Tabs():
            with gr.TabItem("Duties"):
                duties_markdown = gr.Markdown()
            with gr.TabItem("Qualifications"):
                qualifications_markdown = gr.Markdown()
            with gr.TabItem("Full Description"):
                description_markdown = gr.Markdown()
        learning_plan_output = gr.HTML(label="Learning Plan")
        load_more_btn = gr.Button("Load More Skills", visible=False)

    # --- Event wiring ---
    search_inputs = [dream_text, topk_slider, skills_text]
    # Shared by the search, "search anyway" and "retype" handlers.
    search_outputs = [
        status_text,
        initial_matches_state,
        df_output,
        job_selector,
        details_accordion,
        spelling_alert,
        spelling_row,
        recommendations_df_output,
        recommendations_accordion,
    ]

    def _clear_for_retype():
        """Hide all results so the user can correct the query."""
        return (
            "Status: Ready for you to retype.",
            None,
            pd.DataFrame(),
            gr.Dropdown(visible=False),
            gr.Accordion(visible=False),
            gr.Markdown(visible=False),
            gr.Row(visible=False),
            pd.DataFrame(),
            gr.Accordion(visible=False),
        )

    search_btn.click(fn=find_matches_and_rank_with_check, inputs=search_inputs, outputs=search_outputs)
    search_anyway_btn.click(fn=find_matches_and_rank_anyway, inputs=search_inputs, outputs=search_outputs)
    retype_btn.click(_clear_for_retype, outputs=search_outputs)

    reset_btn.click(
        fn=on_reset,
        outputs=[
            dream_text,
            topk_slider,
            skills_text,
            df_output,
            initial_matches_state,
            job_selector,
            details_accordion,
            status_text,
            job_details_markdown,
            duties_markdown,
            qualifications_markdown,
            description_markdown,
            spelling_alert,
            spelling_row,
            missing_skills_state,
            skills_offset_state,
            load_more_btn,
            recommendations_df_output,
            recommendations_accordion,
        ],
        queue=False,
    )

    rerank_btn.click(
        fn=rerank_current_results,
        inputs=[initial_matches_state, skills_text, topk_slider],
        outputs=[status_text, df_output, job_selector, recommendations_df_output, recommendations_accordion],
    )

    job_selector.change(
        fn=on_select_job,
        inputs=[job_selector, skills_text],
        outputs=[
            job_details_markdown,
            duties_markdown,
            qualifications_markdown,
            description_markdown,
            learning_plan_output,
            details_accordion,
            missing_skills_state,
            skills_offset_state,
            load_more_btn,
        ],
    )

    load_more_btn.click(
        fn=load_more_skills,
        inputs=[missing_skills_state, skills_offset_state],
        outputs=[learning_plan_output, skills_offset_state, load_more_btn],
    )

ui.launch()