# Spaces:
# Runtime error
# Runtime error
# (NOTE: the three lines above are artifacts of a Hugging Face Spaces page
# scrape, not program code; kept as comments so the file remains valid Python.)
| import pandas as pd | |
| import datasets | |
| from sentence_transformers import SentenceTransformer, util, losses, InputExample | |
| from datasets import Dataset | |
| import torch | |
| import re | |
| import nltk | |
| from nltk.corpus import words | |
| from nltk.corpus import stopwords | |
| from IPython.display import display, clear_output | |
| import ipywidgets as widgets | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline | |
| import os | |
| from nltk.stem import PorterStemmer | |
| import gradio as gr | |
| import urllib.parse as _url | |
# --- Download necessary NLTK data ---
# Each entry pairs the resource path used by nltk.data.find with the
# package name passed to nltk.download when the resource is missing.
_NLTK_RESOURCES = (
    ('corpora/words', 'words'),
    ('corpora/stopwords', 'stopwords'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
    ('tokenizers/punkt', 'punkt'),
)
for _resource_path, _package_name in _NLTK_RESOURCES:
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name, quiet=True)

STOPWORDS = set(stopwords.words('english'))
stemmer = PorterStemmer()
# --- GLOBAL STATE & DATA ---
# These will be initialized once and stored in Gradio's State
original_df = None                    # pd.DataFrame: original job postings (one row per job)
augmented_df = None                   # pd.DataFrame: augmented variants of the postings
combined_df = None                    # pd.DataFrame: original + augmented rows used for retrieval
model = None                          # SentenceTransformer used for all embeddings
combined_job_embeddings = None        # tensor: embeddings of combined_df['full_text']
original_job_title_embeddings = None  # tensor: embeddings of the original job titles
LLM_PIPELINE = None                   # transformers text-generation pipeline (query expansion)
LLM_MODEL_NAME = "microsoft/phi-2"    # Hugging Face model id for the expansion LLM
FINETUNED_MODEL_PATH = "./finetuned_model"  # where the fine-tuned encoder is saved/loaded
KNOWN_WORDS = set()                   # vocabulary backing the lightweight spell check
| # --- CORE NLP & HELPER FUNCTIONS --- | |
| def _norm_skill_token(s: str) -> str: | |
| s = s.lower().strip() | |
| s = re.sub(r'[\(\)\[\]\{\}\*]', '', s) | |
| s = re.sub(r'^\W+|\W+$', '', s) | |
| s = re.sub(r'\s+', ' ', s) | |
| return s | |
def _skill_match(token1: str, token2: str, threshold: float = 0.9) -> bool:
    """Return True when two skill tokens appear to name the same skill.

    Two-stage test on the normalized tokens:
      1. exact equality or substring containment;
      2. TF-IDF cosine similarity >= ``threshold`` (only for tokens longer
         than 2 characters).
    """
    t1 = _norm_skill_token(token1)
    t2 = _norm_skill_token(token2)
    if t1 == t2 or t1 in t2 or t2 in t1:
        return True
    try:
        if len(t1) > 2 and len(t2) > 2:
            vectorizer = TfidfVectorizer().fit([t1, t2])
            vectors = vectorizer.transform([t1, t2])
            similarity = cosine_similarity(vectors)[0, 1]
            if similarity >= threshold:
                return True
    except Exception:
        # TfidfVectorizer can raise (e.g. "empty vocabulary"); the fuzzy
        # stage is best-effort, so any failure just means "no match".
        # (Was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt.)
        pass
    return False
def build_known_vocabulary(df: pd.DataFrame):
    """Populate the global KNOWN_WORDS set used by the spell checker.

    The vocabulary is the union of the NLTK English word list and every
    alphabetic word (longer than 2 chars) found in df['full_text'].
    """
    global KNOWN_WORDS
    dictionary_words = {w.lower() for w in words.words()}
    corpus_text = " ".join(df['full_text'].astype(str).tolist()).lower()
    corpus_words = {
        w for w in re.findall(r'\w+', corpus_text)
        if w.isalpha() and len(w) > 2
    }
    KNOWN_WORDS = dictionary_words | corpus_words
    return "Known vocabulary built (English dictionary + combined dataset words)."
def check_spelling_in_query(query: str) -> list[str]:
    """Return the unique alphabetic words in *query* (length > 1) that are
    absent from the global KNOWN_WORDS vocabulary; empty list if the
    vocabulary has not been built yet."""
    if not KNOWN_WORDS:
        return []
    flagged = {
        token
        for token in query.lower().split()
        if token.isalpha() and len(token) > 1 and token not in KNOWN_WORDS
    }
    return list(flagged)
def initialize_llm_client():
    """Load the local text-generation pipeline into the global LLM_PIPELINE.

    Returns:
        True on success, False if the model/tokenizer could not be loaded
        (the app continues without query expansion in that case).
    """
    global LLM_PIPELINE
    try:
        # Removed: an unused `device = 0 if torch.cuda.is_available() else -1`
        # local — device placement is fully handled by device_map="auto".
        tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            LLM_MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True
        )
        LLM_PIPELINE = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.7
        )
        return True
    except Exception as e:
        print(f"🚨 ERROR initializing local LLM: {e}")
        return False
def llm_expand_query(user_input: str) -> str:
    """Expand a short career-interest query into a richer sentence via the LLM.

    Falls back to returning ``user_input`` unchanged when the pipeline is not
    loaded or generation fails for any reason.
    """
    global LLM_PIPELINE
    if not LLM_PIPELINE:
        return user_input
    prompt_template = (
        f"User's career interest: '{user_input}'\n"
        f"Instruction: Based on the user's interest, write a concise, single-sentence summary (40-60 words) that elaborates on the core intent, typical skills, and responsibilities. "
        f"Do not include a preamble, the user input, or any list formatting in the output. Just the expanded sentence.\n"
        f"Expanded Intent:"
    )
    try:
        response = LLM_PIPELINE(
            prompt_template,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.6
        )
        expanded_query = response[0]['generated_text'].strip()
        # The pipeline echoes the prompt; keep only the text after the marker.
        if "Expanded Intent:" in expanded_query:
            expanded_query = expanded_query.split("Expanded Intent:")[-1].strip()
        # Prepend the user's own words so the embedding keeps the literal intent.
        final_query = user_input + ". " + expanded_query.replace('\n', ' ').replace(':', '').strip()
        final_query = final_query.replace('..', '.').strip()
        return final_query
    except Exception:
        # Generation is best-effort; any failure falls back to the raw input.
        # (Previously bound the exception to an unused name `e`.)
        return user_input
def find_job_matches(
    original_user_query: str,
    expanded_user_query: str,
    top_k: int = 20,
) -> pd.DataFrame:
    """Rank jobs by a weighted blend of full-text and title similarity.

    Relies on module globals: model, combined_df, combined_job_embeddings,
    original_df, original_job_title_embeddings (set by initialize_data_and_model).

    Args:
        original_user_query: the user's raw query (used for the title boost).
        expanded_user_query: the LLM-expanded query (used for full-text match).
        top_k: number of jobs to return.

    Returns:
        The top_k rows of original_df, indexed by job_id, with a
        'Similarity Score' column and 'job_id' renamed to 'Job ID',
        sorted by score descending.
    """
    # Score the expanded query against every row of the combined
    # (original + augmented) corpus.
    expanded_user_embedding = model.encode(expanded_user_query, convert_to_tensor=True)
    general_similarity_scores = util.cos_sim(expanded_user_embedding, combined_job_embeddings)[0]
    # k = len(combined_df): a full descending sort of all rows, not just top_k.
    top_indices = torch.topk(general_similarity_scores, k=len(combined_df))
    sorted_combined_df = combined_df.iloc[top_indices.indices.cpu()].copy()
    sorted_combined_df['general_score'] = top_indices.values.cpu().numpy()
    # Augmented rows share a job_id with their source job; since rows are
    # sorted by score, keep='first' retains each job's best-scoring variant.
    unique_matches = sorted_combined_df.drop_duplicates(subset=['job_id'], keep='first').set_index('job_id')
    # Boost: similarity between the RAW user query and the original job titles.
    original_user_embedding = model.encode(original_user_query, convert_to_tensor=True)
    title_boost_scores = util.cos_sim(original_user_embedding, original_job_title_embeddings)[0].cpu().numpy()
    title_boost_map = pd.Series(title_boost_scores, index=original_df['job_id'])
    unique_matches['title_boost_score'] = unique_matches.index.map(title_boost_map).fillna(0)
    # Final score: 70% full-text similarity + 30% title similarity.
    unique_matches['Similarity Score'] = (
        0.70 * unique_matches['general_score'] +
        0.30 * unique_matches['title_boost_score']
    )
    final_job_ids = unique_matches.sort_values(by='Similarity Score', ascending=False).head(top_k).index.tolist()
    # Materialize the winners from the ORIGINAL dataset and attach their scores.
    final_results_df = original_df[original_df['job_id'].isin(final_job_ids)].copy()
    scores_df = unique_matches.reset_index()[['job_id', 'Similarity Score']].copy()
    final_results_df = pd.merge(final_results_df, scores_df, on='job_id', how='left')
    final_results_df = final_results_df.sort_values(by='Similarity Score', ascending=False).reset_index(drop=True)
    # Index by job_id but keep the column too (drop=False) for downstream UI use.
    final_results_df = final_results_df.set_index('job_id', drop=False)
    final_results_df = final_results_df.rename(columns={'job_id': 'Job ID'})
    return final_results_df
def score_jobs_by_skills(user_tokens: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
    """Re-rank job matches by overlap between the user's skills and each job's
    'Skills' list; ties broken by the existing 'Similarity Score'."""
    if df_to_rank is None or df_to_rank.empty:
        return pd.DataFrame()
    ranked = df_to_rank.copy()
    if 'Skills' not in ranked.columns:
        # Nothing to match against; fall back to the similarity ordering.
        return ranked.sort_values(by='Similarity Score', ascending=False)

    def _match_row(row, tokens):
        # Returns (matched skills, match count, fraction of required skills matched).
        required = row.get('Skills', [])
        if not isinstance(required, list):
            return [], 0, 0.0
        hits = [
            skill for skill in required
            if any(_skill_match(tok, skill) for tok in tokens)
        ]
        score = len(hits) / len(required) if required else 0.0
        return hits, len(hits), score

    expanded = ranked.apply(lambda row: _match_row(row, user_tokens), axis=1, result_type='expand')
    ranked[['Skill Matches', 'Skill Match Count', 'Skill Match Score']] = expanded
    ranked = ranked.sort_values(
        by=['Skill Match Score', 'Similarity Score'],
        ascending=[False, False]
    ).reset_index(drop=True)
    # Keep 'Job ID' both as index and column, with an unnamed index axis.
    return ranked.set_index('Job ID', drop=False).rename_axis(None)
def fine_tune_model(model: SentenceTransformer, df: pd.DataFrame):
    """Fine-tune the encoder on (job_title, full_text) pairs and save it to
    FINETUNED_MODEL_PATH. Uses MultipleNegativesRankingLoss, one epoch."""
    os.environ["WANDB_DISABLED"] = "true"  # keep Weights & Biases out of the run
    pairs = [
        InputExample(texts=[row['job_title'], row['full_text']])
        for _, row in df.iterrows()
    ]
    loader = torch.utils.data.DataLoader(pairs, shuffle=True, batch_size=16)
    objective = losses.MultipleNegativesRankingLoss(model)
    model.fit(
        train_objectives=[(loader, objective)],
        epochs=1,
        warmup_steps=100,
        show_progress_bar=True
    )
    model.save(FINETUNED_MODEL_PATH)
def initialize_data_and_model():
    """One-time setup: load the dataset, build features, load or fine-tune the
    encoder, precompute embeddings, and build the spell-check vocabulary.

    Populates the module-level globals used by the search functions and
    returns a short status string.
    """
    global original_df, augmented_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
    # The LLM is optional: search still works (without query expansion) if it fails.
    if not initialize_llm_client():
        pass
    ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
    original_df = ds["original"].to_pandas()
    augmented_df = ds["augmented"].to_pandas()
    # Stable per-job id; augmented rows map back to their source job below.
    original_df['job_id'] = original_df.index
    original_jobs_count = len(original_df)
    max_id = original_jobs_count - 1
    # assumes each original job has 20 consecutive augmented rows, in order —
    # TODO confirm against the dataset; min() clamps any overflow to the last job.
    augmented_df['job_id'] = augmented_df.index.map(lambda i: min(i // 20, max_id))
    def create_full_text(row):
        # Concatenate the descriptive columns into a single retrieval document.
        return " ".join([
            str(row["Job title"]),
            str(row["Company"]),
            str(row["Duties"]),
            str(row["qualifications"]),
            str(row["Description"]),
        ])
    original_df["full_text"] = original_df.apply(create_full_text, axis=1)
    augmented_df["full_text"] = augmented_df.apply(create_full_text, axis=1)
    combined_df = pd.concat([original_df, augmented_df], ignore_index=True)
    # NOTE: the rename happens AFTER combined_df is built, so combined_df keeps
    # the original 'Job title'/'Company' column names while original_df does not.
    original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})
    def extract_skills_from_text(text):
        # Extract candidate skills as noun phrases (optional adjectives + nouns).
        if not isinstance(text, str): return []
        grammar = "NP: {<JJ.?>*<NN.?>+}"
        chunk_parser = nltk.RegexpParser(grammar)
        tokens = nltk.word_tokenize(text.lower())
        tagged_tokens = nltk.pos_tag(tokens)
        chunked_text = chunk_parser.parse(tagged_tokens)
        skills = []
        for subtree in chunked_text.subtrees():
            if subtree.label() == 'NP':
                phrase = " ".join(word for word, tag in subtree.leaves())
                # Hand-curated noise phrases the chunker keeps extracting.
                junk_phrases = {'demonstrated experience', 'experience', 'related field', 'college/university level', 'equivalent foreign degree', 'cacrep standards', 'students', 'learning experience', 'ability', 'process', 'accreditation', 'human development', 'social welfare', 'sociology', 'pre-service teachers', 'abilities', 'books', 'certifications', 'college', 'level', 'licenses', 'years', 'form', 'knowledge', 'skills'}
                if phrase not in junk_phrases and _norm_skill_token(phrase) and phrase not in STOPWORDS:
                    skills.append(_norm_skill_token(phrase))
        # Always probe a few domain keywords even if the chunker missed them.
        keywords = {'teaching', 'training', 'leadership', 'management', 'data management', 'budget development', 'report'}
        for keyword in keywords:
            if re.search(r'' + re.escape(keyword) + r'', text.lower()) and _norm_skill_token(keyword) not in skills:
                skills.append(_norm_skill_token(keyword))
        # De-duplicate by stemmed form, keeping the first surface form seen.
        stemmed_skills = {}
        for skill in skills:
            stemmed_phrase = ' '.join([stemmer.stem(word) for word in skill.split()])
            if stemmed_phrase not in stemmed_skills:
                stemmed_skills[stemmed_phrase] = skill
        return list(stemmed_skills.values())
    original_df['Skills'] = original_df['qualifications'].apply(extract_skills_from_text)
    # Reuse a previously fine-tuned encoder if present; otherwise fine-tune once
    # and reload the saved copy so both paths yield the same model state.
    if os.path.exists(FINETUNED_MODEL_PATH):
        model = SentenceTransformer(FINETUNED_MODEL_PATH)
    else:
        model = SentenceTransformer("all-MiniLM-L6-v2")
        fine_tune_model(model, original_df)
        model = SentenceTransformer(FINETUNED_MODEL_PATH)
    # Precompute embeddings once; the search functions only do cosine lookups.
    combined_job_embeddings = model.encode(combined_df["full_text"].tolist(), convert_to_tensor=True)
    original_job_title_embeddings = model.encode(original_df["job_title"].tolist(), convert_to_tensor=True)
    build_known_vocabulary(combined_df)
    return "--- Initialization Complete ---"
# --- GRADIO INTERFACE DEFINITION ---
def build_interface():
    """Build and return the Gradio Blocks UI for the career planner.

    NOTE(review): the callbacks wired below — find_matches_and_rank_with_check,
    find_matches_and_rank_anyway, and on_select_job — are not defined anywhere
    in this file. Unless another module provides them before this runs, calling
    build_interface() raises NameError (a likely cause of the Space's
    "Runtime error" banner). TODO: confirm where they are defined.
    """
    with gr.Blocks() as ui:
        gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")
        gr.Markdown("<i>Uses Augmented Data & LLM for Robust Search + Your Skills for Reranking.</i>")
        # --- query inputs ---
        with gr.Row():
            dream_text = gr.Textbox(label='Dream job:', lines=3, placeholder="Describe your ideal role (what you do, impact, tools, industry, etc.)", scale=3)
            topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Top N:", scale=1)
        status_text = gr.Markdown("Status: Ready.")
        # Spell-check warning UI, hidden until unrecognized words are found.
        spelling_alert = gr.Markdown(visible=False)
        with gr.Row(visible=False) as spelling_row:
            search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
            retype_btn = gr.Button("Retype/Fix Input", variant="stop")
        with gr.Row():
            search_btn = gr.Button("Find matches", variant="primary")
            reset_btn = gr.Button("Reset", variant="secondary")
        df_output = gr.DataFrame(label="Job Matches")
        with gr.Accordion("Optional: Rerank by your skills", open=False):
            skills_text = gr.Textbox(label='Your skills:', placeholder="Comma-separated (e.g., Python, SolidWorks, FEA, leadership)")
            rerank_btn = gr.Button("Add skills & Re-rank")
        job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:")
        with gr.Accordion("Job Details", open=True):
            job_details_markdown = gr.Markdown()
        with gr.Accordion("Duties"):
            duties_markdown = gr.Markdown()
        with gr.Accordion("Qualifications"):
            qualifications_markdown = gr.Markdown()
        with gr.Accordion("Description"):
            description_markdown = gr.Markdown()
        with gr.Accordion("Learning Plan"):
            learning_plan_output = gr.HTML()
        # --- event wiring ---
        # Search runs the spell-check path; the two "anyway" buttons bypass it.
        search_btn.click(
            fn=find_matches_and_rank_with_check,
            inputs=[dream_text, topk_slider, skills_text],
            outputs=[status_text, df_output, job_selector, spelling_alert, spelling_row, job_details_markdown]
        )
        rerank_btn.click(
            fn=find_matches_and_rank_anyway,
            inputs=[dream_text, topk_slider, skills_text],
            outputs=[status_text, df_output, job_selector, spelling_alert, spelling_row, job_details_markdown]
        )
        search_anyway_btn.click(
            fn=find_matches_and_rank_anyway,
            inputs=[dream_text, topk_slider, skills_text],
            outputs=[status_text, df_output, job_selector, spelling_alert, spelling_row, job_details_markdown]
        )
        # Clear results and hide the spelling warning so the user can re-enter input.
        retype_btn.click(
            lambda: (
                "Status: Ready to retype.", pd.DataFrame(), gr.Dropdown(choices=[], value=None),
                gr.Markdown(visible=False), gr.Row(visible=False),
                ""
            ),
            inputs=[],
            outputs=[status_text, df_output, job_selector, spelling_alert, spelling_row, job_details_markdown]
        )
        def on_reset():
            # Blank every input/output and hide the spelling UI.
            return (
                "",
                pd.DataFrame(),
                gr.Dropdown(choices=[], value=None),
                "",
                gr.Markdown("", visible=False),
                gr.Row(visible=False),
                ""
            )
        reset_btn.click(
            fn=on_reset,
            inputs=[],
            outputs=[dream_text, df_output, job_selector, skills_text, spelling_alert, spelling_row, job_details_markdown]
        )
        job_selector.change(
            fn=on_select_job,
            inputs=[job_selector, skills_text],
            outputs=[job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, learning_plan_output]
        )
    return ui
# --- INITIALIZATION AND LAUNCH ---
if __name__ == "__main__":
    # Heavy one-time setup: dataset download, optional fine-tune, embeddings.
    initialize_data_and_model()
    ui = build_interface()
    ui.launch()