Spaces:

bcueva
/

project_1_space

Runtime error

App Files Files Community

bcueva committed on Oct 6, 2025

Commit

03c9c3a

verified ·

1 Parent(s): 90cd8fe

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +390 -0

app.py ADDED Viewed

	@@ -0,0 +1,390 @@

+import pandas as pd
+import datasets
+from sentence_transformers import SentenceTransformer, util, losses, InputExample
+from datasets import Dataset
+import torch
+import re
+import nltk
+from nltk.corpus import words
+from nltk.corpus import stopwords
+from IPython.display import display, clear_output
+import ipywidgets as widgets
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+import os
+from nltk.stem import PorterStemmer
+import gradio as gr
+import urllib.parse as _url
+# --- Download necessary NLTK data ---
+# Each entry pairs the path probed with nltk.data.find and the package name
+# handed to nltk.download when that probe raises LookupError.
+_NLTK_RESOURCES = (
+    ('corpora/words', 'words'),
+    ('corpora/stopwords', 'stopwords'),
+    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
+    ('tokenizers/punkt', 'punkt'),
+)
+for _resource_path, _package_name in _NLTK_RESOURCES:
+    try:
+        nltk.data.find(_resource_path)
+    except LookupError:
+        nltk.download(_package_name, quiet=True)
+# English stopword set and the stemmer shared by the skill-extraction code.
+STOPWORDS = set(stopwords.words('english'))
+stemmer = PorterStemmer()
+# --- GLOBAL STATE & DATA ---
+# Module-level placeholders. The None-valued names are rebound through
+# `global` statements in initialize_data_and_model() and
+# initialize_llm_client(); KNOWN_WORDS is rebound by build_known_vocabulary().
+original_df = augmented_df = combined_df = None
+model = None
+combined_job_embeddings = original_job_title_embeddings = None
+LLM_PIPELINE = None
+LLM_MODEL_NAME = "microsoft/phi-2"
+FINETUNED_MODEL_PATH = "./finetuned_model"
+KNOWN_WORDS = set()
+# --- CORE NLP & HELPER FUNCTIONS ---
+def _norm_skill_token(s: str) -> str:
+    """Return a canonical form of a skill string for comparison.
+
+    The text is lower-cased and trimmed, bracket characters and ``*`` are
+    removed, non-word characters are stripped from both ends, and runs of
+    whitespace are collapsed to a single space.
+    """
+    cleaned = re.sub(r'[\(\)\[\]\{\}\*]', '', s.lower().strip())
+    # Edge trimming also drops trailing symbols, e.g. "c++" becomes "c".
+    cleaned = re.sub(r'^\W+|\W+$', '', cleaned)
+    return re.sub(r'\s+', ' ', cleaned)
+def _skill_match(token1: str, token2: str, threshold: float = 0.9) -> bool:
+    """Return True when two skill strings are considered the same skill.
+
+    Both tokens are normalized with _norm_skill_token. They match when they
+    are equal, when one contains the other, or when both are longer than two
+    characters and the cosine similarity of their TF-IDF vectors is at least
+    ``threshold``. A token that normalizes to the empty string never matches.
+    """
+    t1 = _norm_skill_token(token1)
+    t2 = _norm_skill_token(token2)
+    # "" is a substring of every string, so without this guard a blank token
+    # (for example from a trailing comma in the user's list) matches anything.
+    if not t1 or not t2:
+        return False
+    if t1 == t2 or t1 in t2 or t2 in t1:
+        return True
+    if len(t1) > 2 and len(t2) > 2:
+        try:
+            vectors = TfidfVectorizer().fit_transform([t1, t2])
+        except ValueError:
+            # The vectorizer found no usable terms in either token
+            # (empty vocabulary); treat that as "not similar".
+            return False
+        return bool(cosine_similarity(vectors)[0, 1] >= threshold)
+    return False
+def build_known_vocabulary(df: pd.DataFrame):
+    """Rebuild the global KNOWN_WORDS set consulted by the spell check.
+
+    KNOWN_WORDS becomes the union of the lower-cased NLTK ``words`` corpus and
+    every purely alphabetic word longer than two characters found in
+    ``df['full_text']``.
+
+    Returns:
+        A fixed status message string.
+    """
+    global KNOWN_WORDS
+    dictionary_vocab = {entry.lower() for entry in words.words()}
+    corpus_text = " ".join(df['full_text'].astype(str).tolist()).lower()
+    dataset_vocab = {
+        token
+        for token in re.findall(r'\w+', corpus_text)
+        if token.isalpha() and len(token) > 2
+    }
+    KNOWN_WORDS = dictionary_vocab | dataset_vocab
+    return "Known vocabulary built (English dictionary + combined dataset words)."
+def check_spelling_in_query(query: str) -> list[str]:
+    """Return the distinct words of ``query`` that are not in KNOWN_WORDS.
+
+    The query is lower-cased and split on whitespace. Only purely alphabetic
+    tokens longer than one character are checked, so tokens carrying digits
+    or punctuation are never reported. Returns an empty list when the
+    vocabulary has not been built yet. The order of the result is unspecified.
+    """
+    if not KNOWN_WORDS:
+        return []
+    unknown = {
+        token
+        for token in query.lower().split()
+        if token.isalpha() and len(token) > 1 and token not in KNOWN_WORDS
+    }
+    return list(unknown)
+def initialize_llm_client():
+    """Create the local text-generation pipeline and store it in LLM_PIPELINE.
+
+    Returns:
+        True when the tokenizer, model and pipeline were all created; False
+        when any step raised, in which case the error is printed and
+        LLM_PIPELINE keeps its previous value.
+    """
+    global LLM_PIPELINE
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, trust_remote_code=True)
+        # Named llm_model so it is not confused with the module-level
+        # sentence-embedding `model`. Device placement is delegated to
+        # device_map="auto", so no explicit device index is computed here.
+        # NOTE(review): float16 is requested even when no GPU is present;
+        # confirm this is intended for CPU-only hosts.
+        llm_model = AutoModelForCausalLM.from_pretrained(
+            LLM_MODEL_NAME,
+            torch_dtype=torch.float16,
+            device_map="auto",
+            trust_remote_code=True
+        )
+        LLM_PIPELINE = pipeline(
+            "text-generation",
+            model=llm_model,
+            tokenizer=tokenizer,
+            max_new_tokens=100,
+            do_sample=True,
+            temperature=0.7
+        )
+        return True
+    except Exception as e:
+        # Best effort: callers fall back to the unexpanded query without an LLM.
+        print(f"🚨 ERROR initializing local LLM: {e}")
+        return False
+def llm_expand_query(user_input: str) -> str:
+    """Expand a short career interest into a richer search query via the LLM.
+
+    Args:
+        user_input: The text the user typed.
+
+    Returns:
+        ``user_input`` followed by the generated elaboration, with newlines
+        replaced by spaces, colons removed and ".." collapsed to ".". When
+        LLM_PIPELINE is unset or generation raises, ``user_input`` is
+        returned unchanged.
+    """
+    if not LLM_PIPELINE:
+        return user_input
+    # The line breaks must be "\n" escapes: a physical newline inside a
+    # single-quoted string literal is a SyntaxError.
+    prompt_template = (
+        f"User's career interest: '{user_input}'\n"
+        "Instruction: Based on the user's interest, write a concise, single-sentence summary (40-60 words) that elaborates on the core intent, typical skills, and responsibilities. "
+        "Do not include a preamble, the user input, or any list formatting in the output. Just the expanded sentence.\n"
+        "Expanded Intent:"
+    )
+    try:
+        response = LLM_PIPELINE(
+            prompt_template,
+            max_new_tokens=100,
+            do_sample=True,
+            temperature=0.6
+        )
+        expanded_query = response[0]['generated_text'].strip()
+        # Keep only what follows the final prompt marker, in case the
+        # generated text echoes the prompt.
+        if "Expanded Intent:" in expanded_query:
+            expanded_query = expanded_query.split("Expanded Intent:")[-1].strip()
+        final_query = user_input + ". " + expanded_query.replace('\n', ' ').replace(':', '').strip()
+        final_query = final_query.replace('..', '.').strip()
+        return final_query
+    except Exception as e:
+        # Best effort: report the failure and search with the raw input.
+        print(f"ERROR expanding query with local LLM: {e}")
+        return user_input
+def find_job_matches(
+    original_user_query: str,
+    expanded_user_query: str,
+    top_k: int = 20,
+) -> pd.DataFrame:
+    """Return the ``top_k`` original jobs ranked by a blended similarity score.
+
+    The expanded query is compared with every row embedding of the combined
+    dataset and the best-scoring row per ``job_id`` is kept. The original
+    query is compared with the original job-title embeddings. The final
+    'Similarity Score' is 0.70 * general score + 0.30 * title score.
+
+    Returns:
+        Rows of ``original_df`` for the selected jobs, sorted by
+        'Similarity Score' descending, indexed by job id, with the ``job_id``
+        column renamed to 'Job ID'.
+    """
+    # Stage 1: score every combined row against the expanded query.
+    expanded_vec = model.encode(expanded_user_query, convert_to_tensor=True)
+    general_scores = util.cos_sim(expanded_vec, combined_job_embeddings)[0]
+    ranking = torch.topk(general_scores, k=len(combined_df))
+    ranked_rows = combined_df.iloc[ranking.indices.cpu()].copy()
+    ranked_rows['general_score'] = ranking.values.cpu().numpy()
+    # keep='first' retains the earliest row per job in the ranked order.
+    per_job = ranked_rows.drop_duplicates(subset=['job_id'], keep='first').set_index('job_id')
+    # Stage 2: title similarity against the user's original wording.
+    original_vec = model.encode(original_user_query, convert_to_tensor=True)
+    title_scores = util.cos_sim(original_vec, original_job_title_embeddings)[0].cpu().numpy()
+    boost_by_job = pd.Series(title_scores, index=original_df['job_id'])
+    per_job['title_boost_score'] = per_job.index.map(boost_by_job).fillna(0)
+    per_job['Similarity Score'] = (
+        0.70 * per_job['general_score'] +
+        0.30 * per_job['title_boost_score']
+    )
+    top_ids = per_job.sort_values(by='Similarity Score', ascending=False).head(top_k).index.tolist()
+    # Stage 3: attach the blended score to the original job rows.
+    selected = original_df[original_df['job_id'].isin(top_ids)].copy()
+    score_table = per_job.reset_index()[['job_id', 'Similarity Score']].copy()
+    result = (
+        pd.merge(selected, score_table, on='job_id', how='left')
+        .sort_values(by='Similarity Score', ascending=False)
+        .reset_index(drop=True)
+        .set_index('job_id', drop=False)
+        .rename(columns={'job_id': 'Job ID'})
+    )
+    return result
+def score_jobs_by_skills(user_tokens: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
+    """Re-rank job rows by how many of their listed skills the user has.
+
+    Adds 'Skill Matches' (matched job skills), 'Skill Match Count' and
+    'Skill Match Score' (matched / total job skills) and sorts by skill
+    score, then 'Similarity Score', both descending.
+
+    Returns:
+        An empty DataFrame when ``df_to_rank`` is None or empty; a copy
+        sorted by 'Similarity Score' only when there is no 'Skills' column;
+        otherwise the re-ranked copy indexed by 'Job ID'.
+    """
+    if df_to_rank is None or df_to_rank.empty:
+        return pd.DataFrame()
+    ranked_df = df_to_rank.copy()
+    if 'Skills' not in ranked_df.columns:
+        return ranked_df.sort_values(by='Similarity Score', ascending=False)
+
+    def _match_row(row):
+        # Rows whose 'Skills' value is not a list count as having no skills.
+        required = row.get('Skills', [])
+        if not isinstance(required, list):
+            return [], 0, 0.0
+        hits = [
+            job_skill
+            for job_skill in required
+            if any(_skill_match(token, job_skill) for token in user_tokens)
+        ]
+        ratio = len(hits) / len(required) if len(required) > 0 else 0.0
+        return hits, len(hits), ratio
+
+    results = ranked_df.apply(_match_row, axis=1, result_type='expand')
+    ranked_df[['Skill Matches', 'Skill Match Count', 'Skill Match Score']] = results
+    ranked_df = ranked_df.sort_values(
+        by=['Skill Match Score', 'Similarity Score'],
+        ascending=[False, False]
+    ).reset_index(drop=True)
+    return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
+def fine_tune_model(model: SentenceTransformer, df: pd.DataFrame):
+    """Fine-tune ``model`` on (job title, full text) pairs and save it.
+
+    Each row of ``df`` contributes one training pair built from its
+    'job_title' and 'full_text' columns. Training runs for one epoch with
+    MultipleNegativesRankingLoss, and the result is written to
+    FINETUNED_MODEL_PATH.
+    """
+    # Set before training starts so Weights & Biases logging stays off.
+    os.environ["WANDB_DISABLED"] = "true"
+    pairs = [
+        InputExample(texts=[title, body])
+        for title, body in zip(df['job_title'], df['full_text'])
+    ]
+    loader = torch.utils.data.DataLoader(pairs, shuffle=True, batch_size=16)
+    ranking_loss = losses.MultipleNegativesRankingLoss(model)
+    model.fit(
+        train_objectives=[(loader, ranking_loss)],
+        epochs=1,
+        warmup_steps=100,
+        show_progress_bar=True
+    )
+    model.save(FINETUNED_MODEL_PATH)
+def initialize_data_and_model():
+    """Load the jobs dataset, derive skills, and prepare model and embeddings.
+
+    Populates the module globals ``original_df``, ``augmented_df``,
+    ``combined_df``, ``model``, ``combined_job_embeddings`` and
+    ``original_job_title_embeddings``, then builds the spell-check vocabulary.
+    The embedding model is loaded from FINETUNED_MODEL_PATH when that path
+    exists; otherwise "all-MiniLM-L6-v2" is fine-tuned first and reloaded.
+
+    Returns:
+        A fixed completion message string.
+    """
+    global original_df, augmented_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
+    # Best effort: a failure is printed by initialize_llm_client() itself and
+    # llm_expand_query() returns the raw query while LLM_PIPELINE is unset.
+    initialize_llm_client()
+    ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
+    original_df = ds["original"].to_pandas()
+    augmented_df = ds["augmented"].to_pandas()
+    original_df['job_id'] = original_df.index
+    original_jobs_count = len(original_df)
+    max_id = original_jobs_count - 1
+    # NOTE(review): assumes the augmented split holds 20 consecutive rows per
+    # original job, in the original order - confirm against the dataset.
+    augmented_df['job_id'] = augmented_df.index.map(lambda i: min(i // 20, max_id))
+    def create_full_text(row):
+        return " ".join([
+            str(row["Job title"]),
+            str(row["Company"]),
+            str(row["Duties"]),
+            str(row["qualifications"]),
+            str(row["Description"]),
+        ])
+    original_df["full_text"] = original_df.apply(create_full_text, axis=1)
+    augmented_df["full_text"] = augmented_df.apply(create_full_text, axis=1)
+    # combined_df is built before the rename below, so it keeps the
+    # 'Job title' / 'Company' column names.
+    combined_df = pd.concat([original_df, augmented_df], ignore_index=True)
+    original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})
+    # Built once here rather than on every call / every subtree.
+    chunk_parser = nltk.RegexpParser("NP: {<JJ.?>*<NN.?>+}")
+    junk_phrases = {'demonstrated experience', 'experience', 'related field', 'college/university level', 'equivalent foreign degree', 'cacrep standards', 'students', 'learning experience', 'ability', 'process', 'accreditation', 'human development', 'social welfare', 'sociology', 'pre-service teachers', 'abilities', 'books', 'certifications', 'college', 'level', 'licenses', 'years', 'form', 'knowledge', 'skills'}
+    keywords = {'teaching', 'training', 'leadership', 'management', 'data management', 'budget development', 'report'}
+    def extract_skills_from_text(text):
+        """Return de-duplicated noun-phrase and keyword skills found in text."""
+        if not isinstance(text, str): return []
+        lowered = text.lower()
+        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(lowered))
+        chunked_text = chunk_parser.parse(tagged_tokens)
+        skills = []
+        for subtree in chunked_text.subtrees():
+            if subtree.label() == 'NP':
+                phrase = " ".join(word for word, tag in subtree.leaves())
+                if phrase not in junk_phrases and _norm_skill_token(phrase) and phrase not in STOPWORDS:
+                    skills.append(_norm_skill_token(phrase))
+        for keyword in keywords:
+            # \b anchors the keyword to whole words, so 'report' does not
+            # fire on 'reporting' and 'training' does not fire on 'straining'.
+            if re.search(r'\b' + re.escape(keyword) + r'\b', lowered) and _norm_skill_token(keyword) not in skills:
+                skills.append(_norm_skill_token(keyword))
+        # Collapse variants that share a stem, keeping the first spelling seen.
+        stemmed_skills = {}
+        for skill in skills:
+            stemmed_phrase = ' '.join([stemmer.stem(word) for word in skill.split()])
+            if stemmed_phrase not in stemmed_skills:
+                stemmed_skills[stemmed_phrase] = skill
+        return list(stemmed_skills.values())
+    original_df['Skills'] = original_df['qualifications'].apply(extract_skills_from_text)
+    if os.path.exists(FINETUNED_MODEL_PATH):
+        model = SentenceTransformer(FINETUNED_MODEL_PATH)
+    else:
+        model = SentenceTransformer("all-MiniLM-L6-v2")
+        fine_tune_model(model, original_df)
+        # Reload from disk so the served model is the saved fine-tuned one.
+        model = SentenceTransformer(FINETUNED_MODEL_PATH)
+    combined_job_embeddings = model.encode(combined_df["full_text"].tolist(), convert_to_tensor=True)
+    original_job_title_embeddings = model.encode(original_df["job_title"].tolist(), convert_to_tensor=True)
+    build_known_vocabulary(combined_df)
+    return "--- Initialization Complete ---"
+# --- GRADIO INTERFACE DEFINITION ---
+def build_interface():
+    """Assemble the Gradio Blocks UI and wire its event handlers.
+
+    Components are created in display order: query box and Top-N slider,
+    status line, hidden spelling prompt, search/reset buttons, results table,
+    optional skills box, job selector, and the detail / learning-plan panels.
+
+    Returns:
+        The constructed ``gr.Blocks`` object (not yet launched).
+    """
+    with gr.Blocks() as ui:
+        gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")
+        gr.Markdown("<i>Uses Augmented Data & LLM for Robust Search + Your Skills for Reranking.</i>")
+        with gr.Row():
+            dream_text = gr.Textbox(label='Dream job:', lines=3, placeholder="Describe your ideal role (what you do, impact, tools, industry, etc.)", scale=3)
+            topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Top N:", scale=1)
+        status_text = gr.Markdown("Status: Ready.")
+        spelling_alert = gr.Markdown(visible=False)
+        with gr.Row(visible=False) as spelling_row:
+            search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
+            retype_btn = gr.Button("Retype/Fix Input", variant="stop")
+        with gr.Row():
+            search_btn = gr.Button("Find matches", variant="primary")
+            reset_btn = gr.Button("Reset", variant="secondary")
+        df_output = gr.DataFrame(label="Job Matches")
+        with gr.Accordion("Optional: Rerank by your skills", open=False):
+            skills_text = gr.Textbox(label='Your skills:', placeholder="Comma-separated (e.g., Python, SolidWorks, FEA, leadership)")
+            rerank_btn = gr.Button("Add skills & Re-rank")
+        job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:")
+        with gr.Accordion("Job Details", open=True):
+            job_details_markdown = gr.Markdown()
+            with gr.Accordion("Duties"):
+                duties_markdown = gr.Markdown()
+            with gr.Accordion("Qualifications"):
+                qualifications_markdown = gr.Markdown()
+            with gr.Accordion("Description"):
+                description_markdown = gr.Markdown()
+        with gr.Accordion("Learning Plan"):
+            learning_plan_output = gr.HTML()
+
+        # The three search-style buttons read the same inputs, and together
+        # with the retype button they update the same set of components.
+        query_inputs = [dream_text, topk_slider, skills_text]
+        search_outputs = [status_text, df_output, job_selector, spelling_alert, spelling_row, job_details_markdown]
+
+        def on_retype():
+            # Clear results and hide the spelling prompt so the user can edit.
+            return (
+                "Status: Ready to retype.", pd.DataFrame(), gr.Dropdown(choices=[], value=None),
+                gr.Markdown(visible=False), gr.Row(visible=False),
+                ""
+            )
+
+        def on_reset():
+            # One value per component listed in reset_btn.click's outputs.
+            return (
+                "",
+                pd.DataFrame(),
+                gr.Dropdown(choices=[], value=None),
+                "",
+                gr.Markdown("", visible=False),
+                gr.Row(visible=False),
+                ""
+            )
+
+        search_btn.click(
+            fn=find_matches_and_rank_with_check,
+            inputs=query_inputs,
+            outputs=search_outputs
+        )
+        rerank_btn.click(
+            fn=find_matches_and_rank_anyway,
+            inputs=query_inputs,
+            outputs=search_outputs
+        )
+        search_anyway_btn.click(
+            fn=find_matches_and_rank_anyway,
+            inputs=query_inputs,
+            outputs=search_outputs
+        )
+        retype_btn.click(
+            on_retype,
+            inputs=[],
+            outputs=search_outputs
+        )
+        reset_btn.click(
+            fn=on_reset,
+            inputs=[],
+            outputs=[dream_text, df_output, job_selector, skills_text, spelling_alert, spelling_row, job_details_markdown]
+        )
+        job_selector.change(
+            fn=on_select_job,
+            inputs=[job_selector, skills_text],
+            outputs=[job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, learning_plan_output]
+        )
+    return ui
+# --- INITIALIZATION AND LAUNCH ---
+if __name__ == "__main__":
+    # Populate the module-level data, model and embedding globals before the
+    # UI is built and launched.
+    initialize_data_and_model()
+    ui = build_interface()
+    # Launch the assembled Blocks app.
+    ui.launch()