zlf18 committed on
Commit a515df0 · verified · 1 Parent(s): 79d3345

Update app.py

Files changed (1): app.py (+418 -915)

app.py CHANGED
@@ -1,993 +1,496 @@
import pandas as pd
import datasets
from sentence_transformers import SentenceTransformer, util
import torch
import re
import nltk
from nltk.corpus import words, stopwords
import urllib.parse as _url
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from nltk.stem import PorterStemmer
import gradio as gr
import os
from tqdm import tqdm

tqdm.pandas()

# --- NLTK Data Download ---
for package in ['words', 'stopwords', 'averaged_perceptron_tagger', 'punkt']:
    try:
        nltk.data.find(f'corpora/{package}' if package in ['words', 'stopwords'] else f'taggers/{package}' if package == 'averaged_perceptron_tagger' else f'tokenizers/{package}')
    except LookupError:
        nltk.download(package)
 
STOPWORDS = set(stopwords.words('english'))
stemmer = PorterStemmer()

# --- Expanded Skill Whitelist ---
SKILL_WHITELIST = {
    # Technical & Data
    'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue',
    'nodejs', 'django', 'flask', 'fastapi', 'spring boot', 'ruby on rails', 'php', 'swift', 'kotlin', 'dart', 'flutter',
    'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'pandas', 'numpy', 'matplotlib',
    'natural language processing', 'nlp', 'computer vision', 'data analysis', 'data science', 'data engineering',
    'big data', 'spark', 'hadoop', 'kafka', 'data visualization', 'tableau', 'power bi', 'd3.js', 'statistics', 'analytics',
    'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes', 'terraform', 'ansible', 'ci/cd', 'jenkins',
    'git', 'github', 'devops', 'linux', 'unix', 'shell scripting', 'powershell', 'cybersecurity', 'penetration testing',
    'network security', 'cryptography', 'blockchain', 'c#', '.net', 'sql server', 'mysql', 'postgresql', 'mongodb', 'redis',
    'elasticsearch', 'api design', 'rest apis', 'graphql', 'microservices', 'serverless', 'system design', 'saas',
    # Business & Consulting
    'agile', 'scrum', 'project management', 'product management', 'consulting', 'client management', 'business development',
    'strategy', 'stakeholder management', 'risk management', 'compliance', 'aml', 'kyc', 'reinsurance', 'finance',
    'financial modeling', 'financial analysis', 'due diligence', 'sourcing', 'procurement', 'negotiation', 'supply chain',
    'business analysis', 'business intelligence', 'presentations', 'public speaking', 'time management', 'critical thinking',
    'design thinking', 'innovation', 'adaptability', 'supervisory', 'pmp', 'cpsm', 'cips', 'microsoft office', 'communication',
    'organizational skills',
    # Soft & Other
    'leadership', 'stakeholder communication', 'client communication', 'teamwork', 'collaboration', 'problem solving',
    'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design', 'autocad', 'solidworks', 'sales', 'marketing',
    'seo', 'sem', 'content writing', 'customer support', 'technical writing', 'sap', 'oracle', 'budgeting', 'mentoring', 'supervising'
}
 
# --- GLOBAL STATE & DATA ---
original_df = None
combined_df = None
model = None
combined_job_embeddings = None
original_job_title_embeddings = None
LLM_PIPELINE = None
LLM_MODEL_NAME = "microsoft/phi-2"
FINETUNED_MODEL_ID = "its-zion-18/projfinetuned"
KNOWN_WORDS = set()

# --- CORE NLP & HELPER FUNCTIONS ---
def _norm_skill_token(s: str) -> str:
    s = s.lower().strip()
    s = re.sub(r'[\(\)\[\]\{\}\*]', '', s)
    s = re.sub(r'^\W+|\W+$', '', s)
    s = re.sub(r'\s+', ' ', s)
    return s
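# Illustrative behaviour (sketch, not called anywhere):
#   _norm_skill_token("  (Machine Learning)  ")  -> "machine learning"
#   _norm_skill_token("Problem   Solving!")      -> "problem solving"
# Caveat: the edge trim also strips trailing symbols, so "c++" -> "c" and
# ".net" -> "net"; such whitelist entries are only caught by the direct
# regex scan in extract_skills_direct_scan further down.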
 
def build_known_vocabulary(df: pd.DataFrame):
    global KNOWN_WORDS
    english_words = set(w.lower() for w in words.words())
    job_words = set(re.findall(r'\b\w+\b', " ".join(df['full_text'].astype(str).tolist()).lower()))
    job_words = {w for w in job_words if w.isalpha() and len(w) > 2}
    KNOWN_WORDS = english_words | job_words
    return "Known vocabulary built."
 
def check_spelling_in_query(query: str) -> list[str]:
    words_in_query = query.lower().split()
    unrecognized_words = []
    if not KNOWN_WORDS: return []
    for word in words_in_query:
        if word.isalpha() and len(word) > 1 and word not in KNOWN_WORDS:
            unrecognized_words.append(word)
    return list(set(unrecognized_words))
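# Example: assuming none of these tokens occur in the English corpus or the
# job texts, check_spelling_in_query("macine lerning enginer") would return
# ["macine", "lerning", "enginer"] (order not guaranteed, via list(set(...))).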
 
def initialize_llm_client():
    global LLM_PIPELINE
    try:
        tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, trust_remote_code=True)
        model_llm = AutoModelForCausalLM.from_pretrained(
            LLM_MODEL_NAME, torch_dtype="auto", device_map="auto", trust_remote_code=True
        )
        LLM_PIPELINE = pipeline("text-generation", model=model_llm, tokenizer=tokenizer)
        return True
    except Exception as e:
        print(f"🚨 ERROR initializing local LLM: {e}")
        return False
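# Note: device_map="auto" relies on the `accelerate` package being installed;
# if loading raises, the False return here lets the caller fall back to the
# NLTK/regex extraction paths below.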
 
def llm_expand_query(user_input: str) -> str:
    global LLM_PIPELINE
    if not LLM_PIPELINE: return user_input
    prompt_template = (
        f"User's career interest: '{user_input}'\n"
        f"Instruction: Based on the user's interest, write a concise, single-sentence summary (40-60 words) that elaborates on the core intent, typical skills, and responsibilities. "
        f"Do not include a preamble, the user input, or any list formatting in the output. Just the expanded sentence.\n"
        f"Expanded Intent:"
    )
    try:
        response = LLM_PIPELINE(prompt_template, max_new_tokens=100, do_sample=True, temperature=0.6)
        expanded_query = response[0]['generated_text'].strip().split("Expanded Intent:")[-1].strip()
        final_query = user_input + ". " + expanded_query.replace('\n', ' ').replace(':', '').strip()
        final_query = final_query.replace('..', '.').strip()
        return final_query
    except Exception:
        return user_input
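# Example flow: for "ml engineer at a startup", phi-2 appends a one-sentence
# elaboration, so the string actually embedded downstream becomes
# "ml engineer at a startup. <generated summary>".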
 
def find_job_matches(original_user_query: str, expanded_user_query: str, top_k: int = 50) -> pd.DataFrame:
    expanded_user_embedding = model.encode(expanded_user_query, convert_to_tensor=True)
    general_similarity_scores = util.cos_sim(expanded_user_embedding, combined_job_embeddings)[0]
    top_indices = torch.topk(general_similarity_scores, k=len(combined_df))
    sorted_combined_df = combined_df.iloc[top_indices.indices.cpu()].copy()
    sorted_combined_df['general_score'] = top_indices.values.cpu().numpy()
    unique_matches = sorted_combined_df.drop_duplicates(subset=['job_id'], keep='first').set_index('job_id')
    original_user_embedding = model.encode(original_user_query, convert_to_tensor=True)
    title_boost_scores = util.cos_sim(original_user_embedding, original_job_title_embeddings)[0].cpu().numpy()
    title_boost_map = pd.Series(title_boost_scores, index=original_df['job_id'])
    unique_matches['title_boost_score'] = unique_matches.index.map(title_boost_map).fillna(0)
    unique_matches['Similarity Score'] = (0.70 * unique_matches['general_score'] + 0.30 * unique_matches['title_boost_score'])
    final_job_ids = unique_matches.sort_values(by='Similarity Score', ascending=False).head(top_k).index.tolist()
    final_results_df = original_df[original_df['job_id'].isin(final_job_ids)].copy()
    scores_df = unique_matches.reset_index()[['job_id', 'Similarity Score']].copy()
    final_results_df = pd.merge(final_results_df, scores_df, on='job_id', how='left')
    final_results_df = final_results_df.sort_values(by='Similarity Score', ascending=False).reset_index(drop=True)
    final_results_df = final_results_df.set_index('job_id', drop=False).rename(columns={'job_id': 'Job ID'})
    return final_results_df
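# Blend example: a posting with full-text similarity 0.62 and title
# similarity 0.80 scores 0.70*0.62 + 0.30*0.80 = 0.434 + 0.240 = 0.674, so a
# strong title match can outrank a slightly better body-text match.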
 
def score_jobs_by_skills(user_skills: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
    if df_to_rank is None or df_to_rank.empty or not user_skills:
        return df_to_rank.sort_values(by='Similarity Score', ascending=False) if df_to_rank is not None else pd.DataFrame()

    ranked_df = df_to_rank.copy()
    if 'Skills' not in ranked_df.columns:
        return ranked_df.sort_values(by='Similarity Score', ascending=False)

    user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
    all_job_skills = sorted(list(set(skill for skills_list in ranked_df['Skills'] if skills_list for skill in skills_list)))

    if not all_job_skills:
        ranked_df['Skill Match Score'] = 0.0
        ranked_df['Final Score'] = ranked_df['Similarity Score']
        return ranked_df

    job_skill_embeddings = model.encode(all_job_skills, convert_to_tensor=True)
    similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)

    def calculate_confidence_adjusted_score(row):
        job_skills_list = row.get('Skills', [])
        if not job_skills_list:
            return 0.0

        total_required = len(job_skills_list)
        sum_of_max_similarities = 0.0
        for job_skill in job_skills_list:
            try:
                job_skill_idx = all_job_skills.index(job_skill)
                max_sim = torch.max(similarity_matrix[:, job_skill_idx])
                sum_of_max_similarities += max_sim.item()
            except (ValueError, IndexError):
                continue

        avg_score = sum_of_max_similarities / total_required if total_required > 0 else 0.0
        skill_count_factor = min(1.0, total_required / 5.0)
        return avg_score * skill_count_factor

    ranked_df['Skill Match Score'] = ranked_df.apply(calculate_confidence_adjusted_score, axis=1)

    ranked_df['Final Score'] = (0.8 * ranked_df['Similarity Score']) + (0.2 * ranked_df['Skill Match Score'])

    ranked_df = ranked_df.sort_values(by='Final Score', ascending=False).reset_index(drop=True)
    return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
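# Worked example: a job listing 3 skills whose best matches against the
# user's skills are 0.9, 0.6 and 0.3 gets avg = 1.8/3 = 0.6 and confidence
# factor min(1.0, 3/5) = 0.6, i.e. Skill Match Score = 0.36; with a
# Similarity Score of 0.70, Final Score = 0.8*0.70 + 0.2*0.36 = 0.632.
# Jobs with very short skill lists are deliberately down-weighted.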
 
def initialize_data_and_model():
    global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
    PROCESSED_DATA_PATH = "processed_jobs_with_skills.parquet"

    print("--- Initializing LLM Client ---")
    if not initialize_llm_client(): print("Warning: LLM Client failed to initialize. Will use NLTK only for skills.")

    if os.path.exists(PROCESSED_DATA_PATH):
        print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
        original_df = pd.read_parquet(PROCESSED_DATA_PATH)
    else:
        print("--- No pre-processed data found. Starting one-time processing... ---")
        ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
        original_df = ds["original"].to_pandas()

        def extract_skills_llm(text: str) -> list[str]:
            if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE: return []
            prompt = f"""
Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
[Example 1]
Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."
Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
[Example 2]
Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."
Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
[Actual Task]
Text: "{text}"
Extracted Skills:
"""
            try:
                response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False, temperature=0.1)
                generated_text = response[0]['generated_text']
                skills_part = generated_text.split("Extracted Skills:")[-1].strip()
                skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
                return list(dict.fromkeys(s.lower() for s in skills))
            except Exception: return []

        def extract_skills_nltk(text: str) -> list[str]:
            if not isinstance(text, str): return []
            text_lower = text.lower()
            grammar = "NP: {<JJ.*>*<NN.*>+}"
            chunk_parser = nltk.RegexpParser(grammar)
            tokens = nltk.word_tokenize(text_lower)
            tagged_tokens = nltk.pos_tag(tokens)
            chunked_text = chunk_parser.parse(tagged_tokens)
            potential_skills = set()
            for subtree in chunked_text.subtrees():
                if subtree.label() == 'NP':
                    phrase = " ".join(word for word, tag in subtree.leaves())
                    if _norm_skill_token(phrase) in SKILL_WHITELIST:
                        potential_skills.add(_norm_skill_token(phrase))
            return sorted(list(potential_skills))

        def extract_skills_direct_scan(text: str) -> list[str]:
            if not isinstance(text, str): return []
            found_skills = set()
            for skill in SKILL_WHITELIST:
                if re.search(r'\b' + re.escape(skill) + r'\b', text, re.IGNORECASE):
                    found_skills.add(skill)
            return list(found_skills)

        # --- NEW: Function to expand a short skill list using the LLM ---
        def expand_skills_with_llm(job_title: str, existing_skills: list) -> list:
            if not LLM_PIPELINE or not job_title: return []

            skills_to_add = 6 - len(existing_skills)
            prompt = f"""
Instruct: A job has the title "{job_title}" and requires the skills: {', '.join(existing_skills)}.
Based on this, what are {skills_to_add} additional, closely related skills typically required for such a role?
List only the new skills, separated by commas. Do not repeat skills from the original list.

Additional Skills:
"""
            try:
                response = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5)
                generated_text = response[0]['generated_text']
                skills_part = generated_text.split("Additional Skills:")[-1].strip()
                new_skills = [skill.strip().lower() for skill in skills_part.split(',') if skill.strip()]
                return new_skills
            except Exception:
                return []

        def extract_skills_hybrid(row) -> list[str]:
            text = row['text_for_skills']
            job_title = row.get('Job title', '')  # Use original Job title for context

            llm_skills = extract_skills_llm(text)
            nltk_skills = extract_skills_nltk(text)
            direct_skills = extract_skills_direct_scan(text)
            combined_skills = set(llm_skills) | set(nltk_skills) | set(direct_skills)

            # If the combined list is still too short, expand it
            if len(combined_skills) < 6:
                expanded_skills = expand_skills_with_llm(job_title, list(combined_skills))
                combined_skills.update(expanded_skills)

            return sorted(list(combined_skills))
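        # The three extractors are complementary: the LLM can generalise
        # beyond the whitelist, the noun-phrase chunker catches multi-word
        # whitelist entries, and the direct regex scan guarantees exact hits
        # for symbol-heavy entries such as 'c++' or '.net'.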

        def create_text_for_skills(row):
            return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])

        original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
        print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
        # Apply the hybrid function row-wise to include job title context
        original_df['Skills'] = original_df.progress_apply(extract_skills_hybrid, axis=1)
        original_df = original_df.drop(columns=['text_for_skills'])

        print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
        original_df.to_parquet(PROCESSED_DATA_PATH)

    original_df['job_id'] = original_df.index
    def create_full_text(row): return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
    original_df["full_text"] = original_df.apply(create_full_text, axis=1)

    ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
    augmented_df = ds["augmented"].to_pandas()
    max_id = len(original_df) - 1
    augmented_df['job_id'] = augmented_df.index.map(lambda i: min(i // 20, max_id))
    augmented_df["full_text"] = augmented_df.apply(create_full_text, axis=1)

    combined_df = pd.concat([original_df.copy(), augmented_df.copy()], ignore_index=True)
    original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})

    print("--- Loading Fine-Tuned Sentence Transformer Model ---")
    model = SentenceTransformer(FINETUNED_MODEL_ID)
    print("--- Encoding Embeddings ---")
    combined_job_embeddings = model.encode(combined_df["full_text"].tolist(), convert_to_tensor=True, show_progress_bar=True)
    original_job_title_embeddings = model.encode(original_df["job_title"].tolist(), convert_to_tensor=True, show_progress_bar=True)
    print("--- Building Vocabulary ---")
    build_known_vocabulary(combined_df)
    return "--- Initialization Complete ---"
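# Mapping note: the augmented split is assumed to carry 20 generated variants
# per source posting, so i // 20 sends augmented rows 0-19 to job_id 0,
# rows 20-39 to job_id 1, and so on (capped at max_id).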

def _course_links_for(skill: str) -> str:
    q = _url.quote(skill)
    links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")]
    return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
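# Example: _course_links_for("power bi") quotes the query as "power%20bi" and
# emits four bullet-separated anchors, e.g.
# <a href="https://www.coursera.org/search?query=power%20bi" ...>Coursera</a> • ...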

def get_job_matches(dream_job: str, top_n: int, skills_text: str):
    status = "Searching using hybrid model..."
    expanded_desc = llm_expand_query(dream_job)
    emb_matches = find_job_matches(dream_job, expanded_desc, top_k=50)
    user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]

    if user_skills:
        display_df = score_jobs_by_skills(user_skills, emb_matches)
    else:
        display_df = emb_matches
    display_df = display_df.head(top_n)
    if user_skills:
        status = f"Found and **re-ranked** results by your {len(user_skills)} skills. Displaying top {len(display_df)}."
    else:
        status = f"Found {len(display_df)} top matches using semantic search."

    if 'Final Score' in display_df.columns:
        table_to_show = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']]
        table_to_show = table_to_show.rename(columns={'Final Score': 'Overall Score'})
        table_to_show['Skill Match Score'] = table_to_show['Skill Match Score'].map('{:.2%}'.format)
        table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
    else:
        table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
        table_to_show = table_to_show.rename(columns={'Similarity Score': 'Overall Score'})
        table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)

    # enumerate gives the display rank; the DataFrame index holds the job_id
    dropdown_options = [(f"{rank}. {row['job_title']} - {row['company']}", row.name) for rank, (_, row) in enumerate(display_df.iterrows(), start=1)]
    dropdown_value = dropdown_options[0][1] if dropdown_options else None
    return status, emb_matches, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True), gr.Accordion(visible=True)

def rerank_current_results(initial_matches_df, skills_text, top_n):
    if initial_matches_df is None or pd.DataFrame(initial_matches_df).empty:
        return "Please find matches first before re-ranking.", pd.DataFrame(), gr.Dropdown(visible=False)
    initial_matches_df = pd.DataFrame(initial_matches_df)
    user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]

    if not user_skills:
        status = "Skills cleared. Showing original semantic search results."
        display_df = initial_matches_df.head(top_n)
        table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
        table_to_show = table_to_show.rename(columns={'Similarity Score': 'Overall Score'})
        table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
    else:
        ranked_df = score_jobs_by_skills(user_skills, initial_matches_df)
        status = f"Results **re-ranked** based on your {len(user_skills)} skills."
        display_df = ranked_df.head(top_n)
        table_to_show = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']]
        table_to_show = table_to_show.rename(columns={'Final Score': 'Overall Score'})
        table_to_show['Skill Match Score'] = table_to_show['Skill Match Score'].map('{:.2%}'.format)
        table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)

    # enumerate gives the display rank; the DataFrame index holds the job_id
    dropdown_options = [(f"{rank}. {row['job_title']} - {row['company']}", row.name) for rank, (_, row) in enumerate(display_df.iterrows(), start=1)]
    dropdown_value = dropdown_options[0][1] if dropdown_options else None
    return status, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True)

def find_matches_and_rank_with_check(dream_job: str, top_n: int, skills_text: str):
    if not dream_job:
        return "Please describe your dream job first.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(""), gr.Row(visible=False)
    unrecognized_words = check_spelling_in_query(dream_job)
    if unrecognized_words:
        word_list_html = ", ".join([f"<b><span style='color: #F87171;'>{w}</span></b>" for w in unrecognized_words])
        alert_message = f"<b><span style='color: #F87171;'>⚠️ Possible Spelling Error:</span></b> Unrecognized: {word_list_html}."
        return "Status: Awaiting confirmation.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(alert_message, visible=True), gr.Row(visible=True)
    status, emb_matches, table_to_show, dropdown, details_accordion = get_job_matches(dream_job, top_n, skills_text)
    return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)

def find_matches_and_rank_anyway(dream_job: str, top_n: int, skills_text: str):
    status, emb_matches, table_to_show, dropdown, details_accordion = get_job_matches(dream_job, top_n, skills_text)
    return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)

def on_select_job(job_id, skills_text):
    if job_id is None: return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)
    row = original_df.loc[job_id]
    title, company = str(row.get("job_title", "")), str(row.get("company", ""))
    job_details_markdown = f"### {title} — {company}"
    duties, qualifications, description = str(row.get('Duties', '')), str(row.get('qualifications', '')), str(row.get('Description', ''))
    user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
    job_skills = row.get("Skills", [])
    if not job_skills:
        learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)

    score_val = 0
    all_missing_skills = job_skills
    if user_skills:
        user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
        job_skill_embeddings = model.encode(job_skills, convert_to_tensor=True)
        similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)

        sum_of_max_similarities = torch.sum(torch.max(similarity_matrix, dim=0).values)
        avg_score = (sum_of_max_similarities / len(job_skills)).item() if len(job_skills) > 0 else 0

        skill_count_factor = min(1.0, len(job_skills) / 5.0)
        score_val = avg_score * skill_count_factor

        matched_job_skills_mask = torch.any(similarity_matrix > 0.58, dim=0)
        all_missing_skills = [skill for i, skill in enumerate(job_skills) if not matched_job_skills_mask[i]]

    if user_skills and score_val >= 0.98:
        learning_plan_html = "<h4 style='color:green;'>🎉 You have all the required skills!</h4>"
        job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)

    if user_skills:
        job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
        headline = "<b>Great fit!</b>" if score_val >= 0.8 else "<b>Good progress!</b>" if score_val >= 0.5 else "<b>Solid starting point.</b>"
        learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
        skills_to_display = sorted(all_missing_skills)[:5]
        items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
        learning_plan_html += f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
    else:
        headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
        skills_to_display = sorted(job_skills)[:5]
        items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
        learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
        full_skill_list_for_state = sorted(job_skills)
        new_offset = len(skills_to_display)
        should_button_be_visible = len(full_skill_list_for_state) > 5
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)
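# A job skill counts as covered once any user skill exceeds cosine similarity
# 0.58 against it; near-synonyms (e.g. "data analysis" vs "data analytics")
# will typically clear the bar, while unrelated skills stay in the gap list.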

def load_more_skills(full_skills_list, current_offset):
    SKILLS_INCREMENT = 5
    new_offset = current_offset + SKILLS_INCREMENT
    skills_to_display = full_skills_list[:new_offset]
    items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
    learning_plan_html = f"<h4>To be a good fit for this role, you'll need to learn these skills:</h4><ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
    should_button_be_visible = new_offset < len(full_skills_list)
    return learning_plan_html, new_offset, gr.Button(visible=should_button_be_visible)
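# Pagination example: with 12 extracted skills the plan shows 5 first; each
# "Load More Skills" click advances the offset by 5 (5 -> 10 -> 15), and once
# the offset reaches past the list length the button hides itself.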

def on_reset():
    return ("", 3, "", pd.DataFrame(), None, gr.Dropdown(visible=False), gr.Accordion(visible=False), "Status: Ready.", "", "", "", "", gr.Markdown(visible=False), gr.Row(visible=False), [], 0, gr.Button(visible=False))

print("Starting application initialization...")
initialization_status = initialize_data_and_model()
print(initialization_status)

with gr.Blocks(theme=gr.themes.Soft()) as ui:
    gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")
    initial_matches_state = gr.State()
    missing_skills_state = gr.State([])
    skills_offset_state = gr.State(0)
    with gr.Row():
        with gr.Column(scale=3):
            dream_text = gr.Textbox(label='Your Dream Job Description', lines=3, placeholder="e.g., 'A role in a tech startup focused on machine learning...'")
            with gr.Accordion("Optional: Add Your Skills to Re-rank Results", open=False):
                with gr.Row():
                    skills_text = gr.Textbox(label='Your Skills (comma-separated)', placeholder="e.g., Python, data analysis", scale=3)
                    rerank_btn = gr.Button("Re-rank", variant="secondary", scale=1)
        with gr.Column(scale=1):
            topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of Matches")
            search_btn = gr.Button("Find Matches", variant="primary")
            reset_btn = gr.Button("Reset All")
    status_text = gr.Markdown("Status: Ready.")
    spelling_alert = gr.Markdown(visible=False)
    with gr.Row(visible=False) as spelling_row:
        search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
        retype_btn = gr.Button("Let Me Fix It", variant="stop")
    df_output = gr.DataFrame(label="Job Matches", interactive=False)
    job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:", visible=False)
    with gr.Accordion("Job Details & Learning Plan", open=False, visible=False) as details_accordion:
        job_details_markdown = gr.Markdown()
        with gr.Tabs():
            with gr.TabItem("Duties"): duties_markdown = gr.Markdown()
            with gr.TabItem("Qualifications"): qualifications_markdown = gr.Markdown()
            with gr.TabItem("Full Description"): description_markdown = gr.Markdown()
        learning_plan_output = gr.HTML(label="Learning Plan")
        load_more_btn = gr.Button("Load More Skills", visible=False)
    search_btn.click(fn=find_matches_and_rank_with_check, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
    search_anyway_btn.click(fn=find_matches_and_rank_anyway, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
    retype_btn.click(lambda: ("Status: Ready for you to retype.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(visible=False), gr.Row(visible=False)), outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
    reset_btn.click(fn=on_reset, outputs=[dream_text, topk_slider, skills_text, df_output, initial_matches_state, job_selector, details_accordion, status_text, job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, spelling_alert, spelling_row, missing_skills_state, skills_offset_state, load_more_btn], queue=False)
    rerank_btn.click(fn=rerank_current_results, inputs=[initial_matches_state, skills_text, topk_slider], outputs=[status_text, df_output, job_selector])
    job_selector.change(fn=on_select_job, inputs=[job_selector, skills_text], outputs=[job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, learning_plan_output, details_accordion, missing_skills_state, skills_offset_state, load_more_btn])
    load_more_btn.click(fn=load_more_skills, inputs=[missing_skills_state, skills_offset_state], outputs=[learning_plan_output, skills_offset_state, load_more_btn])

ui.launch()
1
  import pandas as pd
 
2
  import datasets
 
3
  from sentence_transformers import SentenceTransformer, util
 
4
  import torch
 
5
  import re
 
6
  import nltk
 
7
  from nltk.corpus import words, stopwords
 
8
  import urllib.parse as _url
 
9
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
10
  from nltk.stem import PorterStemmer
 
11
  import gradio as gr
 
12
  import os
 
13
  from tqdm import tqdm
14
 
 
 
15
  tqdm.pandas()
16
 
 
 
17
  # --- NLTK Data Download ---
 
18
  for package in ['words', 'stopwords', 'averaged_perceptron_tagger', 'punkt']:
19
+ try:
20
+ nltk.data.find(f'corpora/{package}' if package in ['words', 'stopwords'] else f'taggers/{package}' if package == 'averaged_perceptron_tagger' else f'tokenizers/{package}')
21
+ except LookupError:
22
+ nltk.download(package)
 
 
 
 
 
 
23
 
24
  STOPWORDS = set(stopwords.words('english'))
 
25
  stemmer = PorterStemmer()
26
 
 
 
27
  # --- Expanded Skill Whitelist ---
 
28
  SKILL_WHITELIST = {
29
+ # Technical & Data
30
+ 'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue',
31
+ 'nodejs', 'django', 'flask', 'fastapi', 'spring boot', 'ruby on rails', 'php', 'swift', 'kotlin', 'dart', 'flutter',
32
+ 'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'pandas', 'numpy', 'matplotlib',
33
+ 'natural language processing', 'nlp', 'computer vision', 'data analysis', 'data science', 'data engineering',
34
+ 'big data', 'spark', 'hadoop', 'kafka', 'data visualization', 'tableau', 'power bi', 'd3.js', 'statistics', 'analytics',
35
+ 'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes', 'terraform', 'ansible', 'ci/cd', 'jenkins',
36
+ 'git', 'github', 'devops', 'linux', 'unix', 'shell scripting', 'powershell', 'cybersecurity', 'penetration testing',
37
+ 'network security', 'cryptography', 'blockchain', 'c#', '.net', 'sql server', 'mysql', 'postgresql', 'mongodb', 'redis',
38
+ 'elasticsearch', 'api design', 'rest apis', 'graphql', 'microservices', 'serverless', 'system design', 'saas',
39
+ # Business & Consulting
40
+ 'agile', 'scrum', 'project management', 'product management', 'consulting', 'client management', 'business development',
41
+ 'strategy', 'stakeholder management', 'risk management', 'compliance', 'aml', 'kyc', 'reinsurance', 'finance',
42
+ 'financial modeling', 'financial analysis', 'due diligence', 'sourcing', 'procurement', 'negotiation', 'supply chain',
43
+ 'business analysis', 'business intelligence', 'presentations', 'public speaking', 'time management', 'critical thinking',
44
+ 'design thinking', 'innovation', 'adaptability', 'supervisory', 'pmp', 'cpsm', 'cips', 'microsoft office', 'communication',
45
+ 'organizational skills',
46
+ # Soft & Other
47
+ 'leadership', 'stakeholder communication', 'client communication', 'teamwork', 'collaboration', 'problem solving',
48
+ 'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design', 'autocad', 'solidworks', 'sales', 'marketing',
49
+ 'seo', 'sem', 'content writing', 'customer support', 'technical writing', 'sap', 'oracle', 'budgeting', 'mentoring', 'supervising'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  }
51
 
 
 
52
  # --- GLOBAL STATE & DATA ---
 
53
  original_df = None
 
54
  combined_df = None
 
55
  model = None
 
56
  combined_job_embeddings = None
 
57
  original_job_title_embeddings = None
 
58
  LLM_PIPELINE = None
 
59
  LLM_MODEL_NAME = "microsoft/phi-2"
 
60
  FINETUNED_MODEL_ID = "its-zion-18/projfinetuned"
 
61
  KNOWN_WORDS = set()
62
 
 
 
63
  # --- CORE NLP & HELPER FUNCTIONS ---
 
64
  def _norm_skill_token(s: str) -> str:
65
+ s = s.lower().strip()
66
+ s = re.sub(r'[\(\)\[\]\{\}\*]', '', s)
67
+ s = re.sub(r'^\W+|\W+$', '', s)
68
+ s = re.sub(r'\s+', ' ', s)
69
+ return s
 
 
 
 
 
 
 
70
 
71
  def build_known_vocabulary(df: pd.DataFrame):
72
+ global KNOWN_WORDS
73
+ english_words = set(w.lower() for w in words.words())
74
+ job_words = set(re.findall(r'\b\w+\b', " ".join(df['full_text'].astype(str).tolist()).lower()))
75
+ job_words = {w for w in job_words if w.isalpha() and len(w) > 2}
76
+ KNOWN_WORDS = english_words | job_words
77
+ return "Known vocabulary built."
 
 
 
 
 
 
 
 
78
 
79
  def check_spelling_in_query(query: str) -> list[str]:
80
+ words_in_query = query.lower().split()
81
+ unrecognized_words = []
82
+ if not KNOWN_WORDS: return []
83
+ for word in words_in_query:
84
+ if word.isalpha() and len(word) > 1 and word not in KNOWN_WORDS:
85
+ unrecognized_words.append(word)
86
+ return list(set(unrecognized_words))
 
 
 
 
 
 
 
 
 
87
 
88
  def initialize_llm_client():
89
+ global LLM_PIPELINE
90
+ try:
91
+ tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, trust_remote_code=True)
92
+ model_llm = AutoModelForCausalLM.from_pretrained(
93
+ LLM_MODEL_NAME, torch_dtype="auto", device_map="auto", trust_remote_code=True
94
+ )
95
+ LLM_PIPELINE = pipeline("text-generation", model=model_llm, tokenizer=tokenizer)
96
+ return True
97
+ except Exception as e:
98
+ print(f"🚨 ERROR initializing local LLM: {e}")
99
+ return False
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  def llm_expand_query(user_input: str) -> str:
102
+ global LLM_PIPELINE
103
+ if not LLM_PIPELINE: return user_input
104
+ prompt_template = (
105
+ f"User's career interest: '{user_input}'\n"
106
+ f"Instruction: Based on the user's interest, write a concise, single-sentence summary (40-60 words) that elaborates on the core intent, typical skills, and responsibilities. "
107
+ f"Do not include a preamble, the user input, or any list formatting in the output. Just the expanded sentence.\n"
108
+ f"Expanded Intent:"
109
+ )
110
+ try:
111
+ response = LLM_PIPELINE(prompt_template, max_new_tokens=100, do_sample=True, temperature=0.6)
112
+ expanded_query = response[0]['generated_text'].strip().split("Expanded Intent:")[-1].strip()
113
+ final_query = user_input + ". " + expanded_query.replace('\n', ' ').replace(':', '').strip()
114
+ final_query = final_query.replace('..', '.').strip()
115
+ return final_query
116
+ except Exception:
117
+ return user_input
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  def find_job_matches(original_user_query: str, expanded_user_query: str, top_k: int = 50) -> pd.DataFrame:
120
+ expanded_user_embedding = model.encode(expanded_user_query, convert_to_tensor=True)
121
+ general_similarity_scores = util.cos_sim(expanded_user_embedding, combined_job_embeddings)[0]
122
+ top_indices = torch.topk(general_similarity_scores, k=len(combined_df))
123
+ sorted_combined_df = combined_df.iloc[top_indices.indices.cpu()].copy()
124
+ sorted_combined_df['general_score'] = top_indices.values.cpu().numpy()
125
+ unique_matches = sorted_combined_df.drop_duplicates(subset=['job_id'], keep='first').set_index('job_id')
126
+ original_user_embedding = model.encode(original_user_query, convert_to_tensor=True)
127
+ title_boost_scores = util.cos_sim(original_user_embedding, original_job_title_embeddings)[0].cpu().numpy()
128
+ title_boost_map = pd.Series(title_boost_scores, index=original_df['job_id'])
129
+ unique_matches['title_boost_score'] = unique_matches.index.map(title_boost_map).fillna(0)
130
+ unique_matches['Similarity Score'] = (0.70 * unique_matches['general_score'] + 0.30 * unique_matches['title_boost_score'])
131
+ final_job_ids = unique_matches.sort_values(by='Similarity Score', ascending=False).head(top_k).index.tolist()
132
+ final_results_df = original_df[original_df['job_id'].isin(final_job_ids)].copy()
133
+ scores_df = unique_matches.reset_index()[['job_id', 'Similarity Score']].copy()
134
+ final_results_df = pd.merge(final_results_df, scores_df, on='job_id', how='left')
135
+ final_results_df = final_results_df.sort_values(by='Similarity Score', ascending=False).reset_index(drop=True)
136
+ final_results_df = final_results_df.set_index('job_id', drop=False).rename(columns={'job_id': 'Job ID'})
137
+ return final_results_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  def score_jobs_by_skills(user_skills: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
140
+ if df_to_rank is None or df_to_rank.empty or not user_skills:
141
+ return df_to_rank.sort_values(by='Similarity Score', ascending=False) if df_to_rank is not None else pd.DataFrame()
142
+
143
+ ranked_df = df_to_rank.copy()
144
+ if 'Skills' not in ranked_df.columns:
145
+ return ranked_df.sort_values(by='Similarity Score', ascending=False)
146
+
147
+ user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
148
+ all_job_skills = sorted(list(set(skill for skills_list in ranked_df['Skills'] if skills_list for skill in skills_list)))
149
+
150
+ if not all_job_skills:
151
+ ranked_df['Skill Match Score'] = 0.0
152
+ ranked_df['Final Score'] = ranked_df['Similarity Score']
153
+ return ranked_df
154
+
155
+ job_skill_embeddings = model.encode(all_job_skills, convert_to_tensor=True)
156
+ similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)
157
+
158
+ def calculate_confidence_adjusted_score(row):
159
+ job_skills_list = row.get('Skills', [])
160
+ if not job_skills_list:
161
+ return 0.0
162
+
163
+ total_required = len(job_skills_list)
164
+ sum_of_max_similarities = 0.0
165
+ for job_skill in job_skills_list:
166
+ try:
167
+ job_skill_idx = all_job_skills.index(job_skill)
168
+ max_sim = torch.max(similarity_matrix[:, job_skill_idx])
169
+ sum_of_max_similarities += max_sim.item()
170
+ except (ValueError, IndexError):
171
+ continue
172
+
173
+ avg_score = sum_of_max_similarities / total_required if total_required > 0 else 0.0
174
+ skill_count_factor = min(1.0, total_required / 5.0)
175
+ return avg_score * skill_count_factor
176
+
177
+ ranked_df['Skill Match Score'] = ranked_df.apply(calculate_confidence_adjusted_score, axis=1)
178
+
179
+ ranked_df['Final Score'] = (0.8 * ranked_df['Similarity Score']) + (0.2 * ranked_df['Skill Match Score'])
180
+
181
+ ranked_df = ranked_df.sort_values(by='Final Score', ascending=False).reset_index(drop=True)
182
+ return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
  def initialize_data_and_model():
185
+ global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
186
+ PROCESSED_DATA_PATH = "processed_jobs_with_skills.parquet"
187
+
188
+ print("--- Initializing LLM Client ---")
189
+ if not initialize_llm_client(): print("Warning: LLM Client failed to initialize. Will use NLTK only for skills.")
190
+
191
+ if os.path.exists(PROCESSED_DATA_PATH):
192
+ print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
193
+ original_df = pd.read_parquet(PROCESSED_DATA_PATH)
194
+ else:
195
+ print("--- No pre-processed data found. Starting one-time processing... ---")
196
+ ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
197
+ original_df = ds["original"].to_pandas()
198
+
199
+ def extract_skills_llm(text: str) -> list[str]:
200
+ if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE: return []
201
+ prompt = f"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
 
203
  [Example 1]
 
204
  Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."
 
205
  Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
 
206
  [Example 2]
 
207
  Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."
 
208
  Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
 
209
  [Actual Task]
 
210
  Text: "{text}"
 
211
  Extracted Skills:
 
212
  """
213
+ try:
214
+ response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False, temperature=0.1)
215
+ generated_text = response[0]['generated_text']
216
+ skills_part = generated_text.split("Extracted Skills:")[-1].strip()
217
+ skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
218
+ return list(dict.fromkeys(s.lower() for s in skills))
219
+ except Exception: return []
220
+
221
+ def extract_skills_nltk(text: str) -> list[str]:
222
+ if not isinstance(text, str): return []
223
+ text_lower = text.lower()
224
+ grammar = "NP: {<JJ.*>*<NN.*>+}"
225
+ chunk_parser = nltk.RegexpParser(grammar)
226
+ tokens = nltk.word_tokenize(text_lower)
227
+ tagged_tokens = nltk.pos_tag(tokens)
228
+ chunked_text = chunk_parser.parse(tagged_tokens)
229
+ potential_skills = set()
230
+ for subtree in chunked_text.subtrees():
231
+ if subtree.label() == 'NP':
232
+ phrase = " ".join(word for word, tag in subtree.leaves())
233
+ if _norm_skill_token(phrase) in SKILL_WHITELIST:
234
+ potential_skills.add(_norm_skill_token(phrase))
235
+ return sorted(list(potential_skills))
236
+
237
+ def extract_skills_direct_scan(text: str) -> list[str]:
238
+ if not isinstance(text, str): return []
239
+ found_skills = set()
240
+ for skill in SKILL_WHITELIST:
241
+ if re.search(r'\b' + re.escape(skill) + r'\b', text, re.IGNORECASE):
242
+ found_skills.add(skill)
243
+ return list(found_skills)
244
+
245
+ def expand_skills_with_llm(job_title: str, existing_skills: list) -> list:
246
+ if not LLM_PIPELINE or not job_title: return []
247
+
248
+ skills_to_add = 6 - len(existing_skills)
249
+ prompt = f"""
250
+ Instruct: A job has the title "{job_title}" and requires the skills: {', '.join(existing_skills)}.
251
+ Based on this, what are {skills_to_add} additional, closely related skills typically required for such a role?
252
+ List only the new skills, separated by commas. Do not repeat skills from the original list.
253
 
254
+ Additional Skills:
255
+ """
256
+ try:
257
+ response = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5)
258
+ generated_text = response[0]['generated_text']
259
+ skills_part = generated_text.split("Additional Skills:")[-1].strip()
260
+ new_skills = [skill.strip().lower() for skill in skills_part.split(',') if skill.strip()]
261
+ return new_skills
262
+ except Exception:
263
+ return []
264
+
265
+ def extract_skills_hybrid(row) -> list[str]:
266
+ text = row['text_for_skills']
267
+ job_title = row.get('Job title', '') # Use original Job title for context
268
+
269
+ llm_skills = extract_skills_llm(text)
270
+ nltk_skills = extract_skills_nltk(text)
271
+ direct_skills = extract_skills_direct_scan(text)
272
+ combined_skills = set(llm_skills) | set(nltk_skills) | set(direct_skills)
273
+
274
+ # If the combined list is still too short, expand it
275
+ if len(combined_skills) < 6:
276
+ expanded_skills = expand_skills_with_llm(job_title, list(combined_skills))
277
+ combined_skills.update(expanded_skills)
278
+
279
+ return sorted(list(combined_skills))
280
+
281
+ def create_text_for_skills(row):
282
+ return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])
283
+
284
+ original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
285
+ print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
286
+ # Apply the hybrid function row-wise to include job title context
287
+ original_df['Skills'] = original_df.progress_apply(extract_skills_hybrid, axis=1)
288
+ original_df = original_df.drop(columns=['text_for_skills'])
289
+
290
+ print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
291
+ original_df.to_parquet(PROCESSED_DATA_PATH)
292
+
293
+ original_df['job_id'] = original_df.index
294
+ def create_full_text(row): return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
295
+ original_df["full_text"] = original_df.apply(create_full_text, axis=1)
296
+
297
+ ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
298
+ augmented_df = ds["augmented"].to_pandas()
299
+ max_id = len(original_df) - 1
300
+ augmented_df['job_id'] = augmented_df.index.map(lambda i: min(i // 20, max_id))
301
+ augmented_df["full_text"] = augmented_df.apply(create_full_text, axis=1)
302
+
303
+ combined_df = pd.concat([original_df.copy(), augmented_df.copy()], ignore_index=True)
304
+ original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})
305
+
306
+ print("--- Loading Fine-Tuned Sentence Transformer Model ---")
307
+ model = SentenceTransformer(FINETUNED_MODEL_ID)
308
+ print("--- Encoding Embeddings ---")
309
+ combined_job_embeddings = model.encode(combined_df["full_text"].tolist(), convert_to_tensor=True, show_progress_bar=True)
310
+ original_job_title_embeddings = model.encode(original_df["job_title"].tolist(), convert_to_tensor=True, show_progress_bar=True)
311
+ print("--- Building Vocabulary ---")
312
+ build_known_vocabulary(combined_df)
313
+ return "--- Initialization Complete ---"
314
 
315
+ def _course_links_for(skill: str) -> str:
316
+ q = _url.quote(skill)
317
+ links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")]
318
+ return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
319
 
320
+ def get_job_matches(dream_job: str, top_n: int, skills_text: str):
321
+ status = "Searching using hybrid model..."
322
+ expanded_desc = llm_expand_query(dream_job)
323
+ emb_matches = find_job_matches(dream_job, expanded_desc, top_k=50)
324
+ user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
325
+
326
+ if user_skills:
327
+ display_df = score_jobs_by_skills(user_skills, emb_matches)
328
+ else:
329
+ display_df = emb_matches
330
+ display_df = display_df.head(top_n)
331
+ if user_skills:
332
+ status = f"Found and **re-ranked** results by your {len(user_skills)} skills. Displaying top {len(display_df)}."
333
+ else:
334
+ status = f"Found {len(display_df)} top matches using semantic search."
335
+
336
+ if 'Final Score' in display_df.columns:
337
+ table_to_show = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']]
338
+ table_to_show = table_to_show.rename(columns={'Final Score': 'Overall Score'})
339
+ table_to_show['Skill Match Score'] = table_to_show['Skill Match Score'].map('{:.2%}'.format)
340
+ table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
341
+ else:
342
+ table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
343
+ table_to_show = table_to_show.rename(columns={'Similarity Score': 'Overall Score'})
344
+ table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
345
+
346
+ dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
347
+ dropdown_value = dropdown_options[0][1] if dropdown_options else None
348
+ return status, emb_matches, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True), gr.Accordion(visible=True)
349
 
350
+ def rerank_current_results(initial_matches_df, skills_text, top_n):
+     if initial_matches_df is None or pd.DataFrame(initial_matches_df).empty:
+         return "Please find matches first before re-ranking.", pd.DataFrame(), gr.Dropdown(visible=False)
+     initial_matches_df = pd.DataFrame(initial_matches_df)
+     user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
+
+     if not user_skills:
+         status = "Skills cleared. Showing original semantic search results."
+         display_df = initial_matches_df.head(top_n)
+         table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
+         table_to_show = table_to_show.rename(columns={'Similarity Score': 'Overall Score'})
+         table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
+     else:
+         ranked_df = score_jobs_by_skills(user_skills, initial_matches_df)
+         status = f"Results **re-ranked** based on your {len(user_skills)} skills."
+         display_df = ranked_df.head(top_n)
+         table_to_show = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']]
+         table_to_show = table_to_show.rename(columns={'Final Score': 'Overall Score'})
+         table_to_show['Skill Match Score'] = table_to_show['Skill Match Score'].map('{:.2%}'.format)
+         table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
+
+     # Same fix as get_job_matches: number options by display rank, not DataFrame index.
+     dropdown_options = [(f"{rank}. {row['job_title']} - {row['company']}", row.name) for rank, (_, row) in enumerate(display_df.iterrows(), start=1)]
+     dropdown_value = dropdown_options[0][1] if dropdown_options else None
+     return status, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True)

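+ # Entry point for the "Find Matches" button: runs a spell check on the query first
+ # and asks for confirmation before searching if any words look misspelled.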
+ def find_matches_and_rank_with_check(dream_job: str, top_n: int, skills_text: str):
+     if not dream_job:
+         return "Please describe your dream job first.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(""), gr.Row(visible=False)
+     unrecognized_words = check_spelling_in_query(dream_job)
+     if unrecognized_words:
+         word_list_html = ", ".join([f"<b><span style='color: #F87171;'>{w}</span></b>" for w in unrecognized_words])
+         alert_message = f"<b><span style='color: #F87171;'>⚠️ Possible Spelling Error:</span></b> Unrecognized: {word_list_html}."
+         return "Status: Awaiting confirmation.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(alert_message, visible=True), gr.Row(visible=True)
+     status, emb_matches, table_to_show, dropdown, details_accordion = get_job_matches(dream_job, top_n, skills_text)
+     return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)

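+ # Entry point for the "Search Anyway" button: same search, skipping the spell check.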
+ def find_matches_and_rank_anyway(dream_job: str, top_n: int, skills_text: str):
+     status, emb_matches, table_to_show, dropdown, details_accordion = get_job_matches(dream_job, top_n, skills_text)
+     return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)

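+ # Build the detail view and learning plan for the selected job. The skill-match score
+ # is asymmetric: for each skill the job requires, take its best cosine similarity
+ # against the user's skills, average those maxima, then damp the result for jobs with
+ # fewer than five extracted skills so sparse listings don't score artificially high.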
+ def on_select_job(job_id, skills_text):
+     if job_id is None:
+         return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)
+     row = original_df.loc[job_id]
+     title, company = str(row.get("job_title", "")), str(row.get("company", ""))
+     job_details_markdown = f"### {title} — {company}"
+     duties, qualifications, description = str(row.get('Duties', '')), str(row.get('qualifications', '')), str(row.get('Description', ''))
+     user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
+     job_skills = row.get("Skills", [])
+     if not job_skills:
+         learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
+         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
+
+     score_val = 0
+     all_missing_skills = job_skills
+     if user_skills:
+         user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
+         job_skill_embeddings = model.encode(job_skills, convert_to_tensor=True)
+         similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)
+
+         # For each required job skill, keep its best similarity to any user skill, then average.
+         sum_of_max_similarities = torch.sum(torch.max(similarity_matrix, dim=0).values)
+         avg_score = (sum_of_max_similarities / len(job_skills)).item() if len(job_skills) > 0 else 0
+
+         # Penalize jobs with very short skill lists (fewer than 5 extracted skills).
+         skill_count_factor = min(1.0, len(job_skills) / 5.0)
+         score_val = avg_score * skill_count_factor
+
+         # A job skill counts as covered if any user skill clears the 0.58 cosine-similarity threshold.
+         matched_job_skills_mask = torch.any(similarity_matrix > 0.58, dim=0)
+         all_missing_skills = [skill for i, skill in enumerate(job_skills) if not matched_job_skills_mask[i]]
+
+     if user_skills and score_val >= 0.98:
+         learning_plan_html = "<h4 style='color:green;'>🎉 You have all the required skills!</h4>"
+         job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
+         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
+
+     if user_skills:
+         job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
+         headline = "<b>Great fit!</b>" if score_val >= 0.8 else "<b>Good progress!</b>" if score_val >= 0.5 else "<b>Solid starting point.</b>"
+         learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
+         skills_to_display = sorted(all_missing_skills)[:5]
+         items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
+         learning_plan_html += f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
+         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
+     else:
+         headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
+         skills_to_display = sorted(job_skills)[:5]
+         items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
+         learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
+         full_skill_list_for_state = sorted(job_skills)
+         new_offset = len(skills_to_display)
+         should_button_be_visible = len(full_skill_list_for_state) > 5
+         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)

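+ # Reveal the next batch of five skills in the learning plan.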
+ def load_more_skills(full_skills_list, current_offset):
+     SKILLS_INCREMENT = 5
+     new_offset = current_offset + SKILLS_INCREMENT
+     skills_to_display = full_skills_list[:new_offset]
+     items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
+     learning_plan_html = f"<h4>To be a good fit for this role, you'll need to learn these skills:</h4><ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
+     should_button_be_visible = new_offset < len(full_skills_list)
+     return learning_plan_html, new_offset, gr.Button(visible=should_button_be_visible)

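+ # Restore every component to its initial value; the 17-item return tuple must
+ # match the order of reset_btn.click()'s outputs list below.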
+ def on_reset():
+     return ("", 3, "", pd.DataFrame(), None, gr.Dropdown(visible=False), gr.Accordion(visible=False), "Status: Ready.", "", "", "", "", gr.Markdown(visible=False), gr.Row(visible=False), [], 0, gr.Button(visible=False))
+
+ print("Starting application initialization...")
+ initialization_status = initialize_data_and_model()
+ print(initialization_status)

  with gr.Blocks(theme=gr.themes.Soft()) as ui:
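+     # Layout: query and skills inputs on the left, match count and action buttons on the right.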
+     gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")
+     initial_matches_state = gr.State()
+     missing_skills_state = gr.State([])
+     skills_offset_state = gr.State(0)
+     with gr.Row():
+         with gr.Column(scale=3):
+             dream_text = gr.Textbox(label='Your Dream Job Description', lines=3, placeholder="e.g., 'A role in a tech startup focused on machine learning...'")
+             with gr.Accordion("Optional: Add Your Skills to Re-rank Results", open=False):
+                 with gr.Row():
+                     skills_text = gr.Textbox(label='Your Skills (comma-separated)', placeholder="e.g., Python, data analysis", scale=3)
+                     rerank_btn = gr.Button("Re-rank", variant="secondary", scale=1)
+         with gr.Column(scale=1):
+             topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of Matches")
+             search_btn = gr.Button("Find Matches", variant="primary")
+             reset_btn = gr.Button("Reset All")
+     status_text = gr.Markdown("Status: Ready.")
+     spelling_alert = gr.Markdown(visible=False)
+     with gr.Row(visible=False) as spelling_row:
+         search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
+         retype_btn = gr.Button("Let Me Fix It", variant="stop")
+     df_output = gr.DataFrame(label="Job Matches", interactive=False)
+     job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:", visible=False)
+     with gr.Accordion("Job Details & Learning Plan", open=False, visible=False) as details_accordion:
+         job_details_markdown = gr.Markdown()
+         with gr.Tabs():
+             with gr.TabItem("Duties"): duties_markdown = gr.Markdown()
+             with gr.TabItem("Qualifications"): qualifications_markdown = gr.Markdown()
+             with gr.TabItem("Full Description"): description_markdown = gr.Markdown()
+         learning_plan_output = gr.HTML(label="Learning Plan")
+         load_more_btn = gr.Button("Load More Skills", visible=False)
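+     # Event wiring: each handler's return tuple must match its outputs list, in order.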
+     search_btn.click(fn=find_matches_and_rank_with_check, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
+     search_anyway_btn.click(fn=find_matches_and_rank_anyway, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
+     retype_btn.click(lambda: ("Status: Ready for you to retype.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(visible=False), gr.Row(visible=False)), outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row])
+     reset_btn.click(fn=on_reset, outputs=[dream_text, topk_slider, skills_text, df_output, initial_matches_state, job_selector, details_accordion, status_text, job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, spelling_alert, spelling_row, missing_skills_state, skills_offset_state, load_more_btn], queue=False)
+     rerank_btn.click(fn=rerank_current_results, inputs=[initial_matches_state, skills_text, topk_slider], outputs=[status_text, df_output, job_selector])
+     job_selector.change(fn=on_select_job, inputs=[job_selector, skills_text], outputs=[job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, learning_plan_output, details_accordion, missing_skills_state, skills_offset_state, load_more_btn])
+     load_more_btn.click(fn=load_more_skills, inputs=[missing_skills_state, skills_offset_state], outputs=[learning_plan_output, skills_offset_state, load_more_btn])

  ui.launch()