File size: 16,284 Bytes
03c9c3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fb86312
03c9c3a
fb86312
03c9c3a
 
 
 
 
 
 
 
 
 
 
 
fb86312
03c9c3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388

import pandas as pd
import datasets
from sentence_transformers import SentenceTransformer, util, losses, InputExample
from datasets import Dataset
import torch
import re
import nltk
from nltk.corpus import words
from nltk.corpus import stopwords
from IPython.display import display, clear_output
import ipywidgets as widgets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import os
from nltk.stem import PorterStemmer
import gradio as gr
import urllib.parse as _url

# --- Download necessary NLTK data ---
# Each entry is (path probed with nltk.data.find, package id given to nltk.download).
# Newer NLTK releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead of the legacy pickled 'punkt' / 'averaged_perceptron_tagger', so both
# generations are fetched to keep word_tokenize() and pos_tag() working on either.
_NLTK_RESOURCES = (
    ('corpora/words', 'words'),
    ('corpora/stopwords', 'stopwords'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
    ('taggers/averaged_perceptron_tagger_eng', 'averaged_perceptron_tagger_eng'),
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
)
for _resource_path, _package_id in _NLTK_RESOURCES:
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        # Only hit the network when the resource is not already installed.
        nltk.download(_package_id, quiet=True)

# English stopword set, used to discard noun phrases that are just a stopword.
STOPWORDS = set(stopwords.words('english'))
# Shared stemmer, used to de-duplicate skills that differ only by inflection.
stemmer = PorterStemmer()

# --- GLOBAL STATE & DATA ---
# Module-level state shared by the functions below. The `None` placeholders are
# filled in by initialize_data_and_model() (LLM_PIPELINE by initialize_llm_client(),
# KNOWN_WORDS by build_known_vocabulary()).
original_df = None  # source job postings; gains `job_id`, `full_text` and `Skills` columns
augmented_df = None  # augmented postings, each assigned the `job_id` of a source posting
combined_df = None  # original + augmented rows concatenated; ranked by find_job_matches()
model = None  # SentenceTransformer used to encode queries and postings
combined_job_embeddings = None  # embeddings of combined_df["full_text"]
original_job_title_embeddings = None  # embeddings of original_df["job_title"]
LLM_PIPELINE = None  # text-generation pipeline; stays None if the LLM fails to load
LLM_MODEL_NAME = "microsoft/phi-2"  # model id passed to from_pretrained()
FINETUNED_MODEL_PATH = "./finetuned_model"  # where fine_tune_model() saves the encoder and startup loads it from
KNOWN_WORDS = set()  # lowercase vocabulary consulted by check_spelling_in_query()

# --- CORE NLP & HELPER FUNCTIONS ---
def _norm_skill_token(s: str) -> str:
    s = s.lower().strip()
    s = re.sub(r'[\(\)\[\]\{\}\*]', '', s)
    s = re.sub(r'^\W+|\W+$', '', s)
    s = re.sub(r'\s+', ' ', s)
    return s

def _skill_match(token1: str, token2: str, threshold: float = 0.9) -> bool:
    """Decide whether two skill strings refer to the same skill.

    Both tokens are normalized with ``_norm_skill_token``. They match when one
    normalized token contains the other, or, for tokens longer than two
    characters, when the cosine similarity of their TF-IDF vectors reaches
    ``threshold``.

    Args:
        token1: First skill string (e.g. a user-supplied skill).
        token2: Second skill string (e.g. a skill extracted from a posting).
        threshold: Minimum TF-IDF cosine similarity counted as a match.

    Returns:
        True when the tokens match, False otherwise. A token that is empty
        after normalization never matches anything.
    """
    t1 = _norm_skill_token(token1)
    t2 = _norm_skill_token(token2)

    # The empty string is a substring of every string, so without this guard a
    # blank entry (e.g. produced by a trailing comma) would match every skill.
    if not t1 or not t2:
        return False

    # Equality is covered by the containment test.
    if t1 in t2 or t2 in t1:
        return True

    # Very short tokens are only compared by containment.
    if len(t1) <= 2 or len(t2) <= 2:
        return False

    try:
        vectors = TfidfVectorizer().fit_transform([t1, t2])
    except ValueError:
        # Raised when the tokens yield no vocabulary term to compare.
        return False
    return bool(cosine_similarity(vectors)[0, 1] >= threshold)

def build_known_vocabulary(df: pd.DataFrame):
    """Populate the global ``KNOWN_WORDS`` set used for spell checking.

    The vocabulary is the union of the lower-cased NLTK English word list and
    every purely alphabetic word longer than two characters found in the
    ``full_text`` column of ``df``.

    Args:
        df: DataFrame with a ``full_text`` column.

    Returns:
        A status message string.
    """
    global KNOWN_WORDS
    dictionary_terms = {entry.lower() for entry in words.words()}
    corpus_text = " ".join(df['full_text'].astype(str).tolist()).lower()
    dataset_terms = {
        token
        for token in re.findall(r'\w+', corpus_text)
        if token.isalpha() and len(token) > 2
    }
    KNOWN_WORDS = dictionary_terms | dataset_terms
    return "Known vocabulary built (English dictionary + combined dataset words)."

def check_spelling_in_query(query: str) -> list[str]:
    """Return the words of ``query`` that are not in ``KNOWN_WORDS``.

    The query is lower-cased and split on whitespace. Only purely alphabetic
    words longer than one character are checked, so tokens carrying digits or
    punctuation are skipped.

    Args:
        query: Free-text user query.

    Returns:
        Unrecognized words, de-duplicated, in order of first appearance.
        Returns an empty list when the vocabulary has not been built yet.
    """
    if not KNOWN_WORDS:
        return []

    unrecognized = (
        word
        for word in query.lower().split()
        if word.isalpha() and len(word) > 1 and word not in KNOWN_WORDS
    )
    # dict.fromkeys de-duplicates while keeping first-seen order, so the list
    # shown to the user is stable between runs (a set would not be).
    return list(dict.fromkeys(unrecognized))


def initialize_llm_client():
    """Load the local text-generation model into the global ``LLM_PIPELINE``.

    Loads the tokenizer and causal LM named by ``LLM_MODEL_NAME`` and wraps
    them in a ``text-generation`` pipeline. Any failure is reported on stdout
    and leaves ``LLM_PIPELINE`` unchanged.

    Returns:
        True when the pipeline was created, False when loading failed.
    """
    global LLM_PIPELINE
    try:
        # NOTE: trust_remote_code=True lets the model repository run its own
        # Python code while loading; only use it with a trusted model id.
        tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, trust_remote_code=True)
        # Named llm_model so it is not confused with the module-level
        # sentence-embedding `model`. Device placement is left to
        # device_map="auto".
        llm_model = AutoModelForCausalLM.from_pretrained(
            LLM_MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        LLM_PIPELINE = pipeline(
            "text-generation",
            model=llm_model,
            tokenizer=tokenizer,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.7,
        )
    except Exception as e:
        # Startup boundary: the app can run without the LLM, so report and continue.
        print(f"🚨 ERROR initializing local LLM: {e}")
        return False
    return True

def llm_expand_query(user_input: str) -> str:
    """Expand a short career interest into a richer search query using the LLM.

    Args:
        user_input: The user's own description of the job they want.

    Returns:
        ``user_input`` followed by ". " and the generated expansion, with
        newlines turned into spaces and colons removed. Returns ``user_input``
        unchanged when no LLM pipeline is loaded or generation fails.
    """
    if not LLM_PIPELINE:
        return user_input

    prompt_template = (
        f"User's career interest: '{user_input}'\n"
        f"Instruction: Based on the user's interest, write a concise, single-sentence summary (40-60 words) that elaborates on the core intent, typical skills, and responsibilities. "
        f"Do not include a preamble, the user input, or any list formatting in the output. Just the expanded sentence.\n"
        f"Expanded Intent:"
    )
    try:
        response = LLM_PIPELINE(
            prompt_template,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.6,
        )
        expanded_query = response[0]['generated_text'].strip()
        # If the generated text echoes the prompt, keep only what follows the marker.
        if "Expanded Intent:" in expanded_query:
            expanded_query = expanded_query.split("Expanded Intent:")[-1].strip()
        expanded_query = expanded_query.replace('\n', ' ').replace(':', '').strip()
        final_query = user_input + ". " + expanded_query
        # Avoid a doubled period when the user input already ends with one.
        return final_query.replace('..', '.').strip()
    except Exception as e:
        # Best effort: fall back to the raw query, but leave a trace of the failure.
        print(f"🚨 ERROR expanding query with local LLM: {e}")
        return user_input

def find_job_matches(
    original_user_query: str,
    expanded_user_query: str,
    top_k: int = 20,
) -> pd.DataFrame:
    """Rank job postings against a user query and return the best ``top_k``.

    Two cosine similarities are blended for every job:

    * a general score: the expanded query against the ``combined_df``
      embeddings, keeping the best-scoring row per ``job_id``;
    * a title score: the original query against the original job-title
      embeddings.

    The final ``Similarity Score`` is ``0.70 * general + 0.30 * title``.

    Args:
        original_user_query: The query as typed by the user.
        expanded_user_query: The (possibly LLM-expanded) query text.
        top_k: Number of jobs to return.

    Returns:
        Rows of ``original_df`` for the selected jobs, sorted by
        ``Similarity Score`` descending, indexed by job id, with the id also
        available in a ``Job ID`` column.
    """
    # NOTE(review): presumably top_k comes from a UI slider and may arrive as a
    # float; DataFrame.head() only accepts an integer, so coerce defensively.
    top_k = int(top_k)

    # General score: rank every combined row, best first.
    expanded_user_embedding = model.encode(expanded_user_query, convert_to_tensor=True)
    general_similarity_scores = util.cos_sim(expanded_user_embedding, combined_job_embeddings)[0]
    ranking = torch.topk(general_similarity_scores, k=len(combined_df))
    # Hand iloc a plain NumPy index array instead of a torch tensor.
    row_positions = ranking.indices.cpu().numpy()
    sorted_combined_df = combined_df.iloc[row_positions].copy()
    sorted_combined_df['general_score'] = ranking.values.cpu().numpy()

    # Rows are sorted best-first, so keep='first' retains each job's best row.
    unique_matches = (
        sorted_combined_df
        .drop_duplicates(subset=['job_id'], keep='first')
        .set_index('job_id')
    )

    # Title score: the raw query against the original job titles.
    original_user_embedding = model.encode(original_user_query, convert_to_tensor=True)
    title_boost_scores = util.cos_sim(original_user_embedding, original_job_title_embeddings)[0].cpu().numpy()
    title_boost_map = pd.Series(title_boost_scores, index=original_df['job_id'])
    unique_matches['title_boost_score'] = unique_matches.index.map(title_boost_map).fillna(0)

    unique_matches['Similarity Score'] = (
        0.70 * unique_matches['general_score'] +
        0.30 * unique_matches['title_boost_score']
    )

    final_job_ids = (
        unique_matches
        .sort_values(by='Similarity Score', ascending=False)
        .head(top_k)
        .index.tolist()
    )

    # Return the original postings (not augmented rows) with the blended score attached.
    final_results_df = original_df[original_df['job_id'].isin(final_job_ids)].copy()
    scores_df = unique_matches.reset_index()[['job_id', 'Similarity Score']].copy()
    final_results_df = pd.merge(final_results_df, scores_df, on='job_id', how='left')
    final_results_df = final_results_df.sort_values(by='Similarity Score', ascending=False).reset_index(drop=True)
    final_results_df = final_results_df.set_index('job_id', drop=False)
    final_results_df = final_results_df.rename(columns={'job_id': 'Job ID'})
    return final_results_df

def score_jobs_by_skills(user_tokens: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
    """Re-rank job matches by how many of their skills the user already has.

    Args:
        user_tokens: The user's skills as individual strings.
        df_to_rank: Job matches with a ``Similarity Score`` column and,
            optionally, a ``Skills`` column holding a list per row.

    Returns:
        An empty DataFrame when ``df_to_rank`` is None or empty. Without a
        ``Skills`` column, a copy sorted by ``Similarity Score`` descending.
        Otherwise a copy with ``Skill Matches``, ``Skill Match Count`` and
        ``Skill Match Score`` columns added, sorted by skill score and then
        similarity score (both descending) and indexed by ``Job ID``.
    """
    if df_to_rank is None or df_to_rank.empty:
        return pd.DataFrame()

    ranked = df_to_rank.copy()
    if 'Skills' not in ranked.columns:
        return ranked.sort_values(by='Similarity Score', ascending=False)

    def _match_row(row):
        # Returns (matched skills, match count, fraction of the job's skills matched).
        required = row.get('Skills', [])
        if not isinstance(required, list):
            return [], 0, 0.0
        hits = [
            skill
            for skill in required
            if any(_skill_match(token, skill) for token in user_tokens)
        ]
        ratio = len(hits) / len(required) if required else 0.0
        return hits, len(hits), ratio

    match_columns = ['Skill Matches', 'Skill Match Count', 'Skill Match Score']
    ranked[match_columns] = ranked.apply(_match_row, axis=1, result_type='expand')

    ranked = ranked.sort_values(
        by=['Skill Match Score', 'Similarity Score'],
        ascending=[False, False],
    )
    ranked = ranked.reset_index(drop=True)
    return ranked.set_index('Job ID', drop=False).rename_axis(None)

def fine_tune_model(model: SentenceTransformer, df: pd.DataFrame):
    """Fine-tune ``model`` on (job title, full text) pairs and save it.

    Each row of ``df`` contributes one training pair built from its
    ``job_title`` and ``full_text`` columns. Training runs for one epoch with
    ``MultipleNegativesRankingLoss``; the result is written to
    ``FINETUNED_MODEL_PATH``.

    Args:
        model: The SentenceTransformer to train in place.
        df: DataFrame with ``job_title`` and ``full_text`` columns.
    """
    # Keep the trainer from trying to report to Weights & Biases.
    os.environ["WANDB_DISABLED"] = "true"

    pairs = [
        InputExample(texts=[title, text])
        for title, text in zip(df['job_title'], df['full_text'])
    ]
    loader = torch.utils.data.DataLoader(pairs, shuffle=True, batch_size=16)
    objective = (loader, losses.MultipleNegativesRankingLoss(model))

    model.fit(
        train_objectives=[objective],
        epochs=1,
        warmup_steps=100,
        show_progress_bar=True,
    )
    model.save(FINETUNED_MODEL_PATH)

def initialize_data_and_model():
    """Load the dataset and models and populate the module-level state.

    Steps, in order:

    1. Try to start the local LLM (optional; failure is tolerated).
    2. Load the ``original`` and ``augmented`` splits of the jobs dataset and
       give every row a ``job_id`` and a ``full_text`` column.
    3. Build ``combined_df`` from both splits, then rename the title/company
       columns of ``original_df`` and extract a ``Skills`` list per posting
       from its qualifications.
    4. Load the fine-tuned sentence encoder, fine-tuning and saving it first
       when no saved copy exists at ``FINETUNED_MODEL_PATH``.
    5. Pre-compute the embeddings and the spell-check vocabulary.

    Returns:
        A status message string.
    """
    global original_df, augmented_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings

    # The LLM is optional: llm_expand_query() returns the raw query while
    # LLM_PIPELINE is unset, so the result of this call is deliberately ignored.
    initialize_llm_client()

    ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
    original_df = ds["original"].to_pandas()
    augmented_df = ds["augmented"].to_pandas()

    original_df['job_id'] = original_df.index
    max_id = len(original_df) - 1
    # Each run of 20 consecutive augmented rows is assigned to one source job;
    # the id is clamped so surplus rows fall on the last job.
    augmented_df['job_id'] = augmented_df.index.map(lambda i: min(i // 20, max_id))

    def create_full_text(row):
        # Concatenate the descriptive fields into one searchable string.
        return " ".join([
            str(row["Job title"]),
            str(row["Company"]),
            str(row["Duties"]),
            str(row["qualifications"]),
            str(row["Description"]),
        ])

    original_df["full_text"] = original_df.apply(create_full_text, axis=1)
    augmented_df["full_text"] = augmented_df.apply(create_full_text, axis=1)
    # Built before the rename and the Skills extraction below, so combined_df
    # keeps the raw column names and has no 'Skills' column.
    combined_df = pd.concat([original_df, augmented_df], ignore_index=True)

    original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})

    # Loop-invariant helpers for skill extraction, built once instead of per row.
    junk_phrases = {'demonstrated experience', 'experience', 'related field', 'college/university level', 'equivalent foreign degree', 'cacrep standards', 'students', 'learning experience', 'ability', 'process', 'accreditation', 'human development', 'social welfare', 'sociology', 'pre-service teachers', 'abilities', 'books', 'certifications', 'college', 'level', 'licenses', 'years', 'form', 'knowledge', 'skills'}
    # A tuple keeps the order in which keyword skills are appended deterministic.
    keywords = ('teaching', 'training', 'leadership', 'management', 'data management', 'budget development', 'report')
    # Noun phrases: optional adjectives followed by one or more nouns.
    chunk_parser = nltk.RegexpParser("NP: {<JJ.?>*<NN.?>+}")

    def extract_skills_from_text(text):
        # Returns the normalized, de-duplicated skill phrases found in `text`.
        if not isinstance(text, str):
            return []
        lowered = text.lower()
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(lowered))

        skills = []
        for subtree in chunk_parser.parse(tagged_tokens).subtrees():
            if subtree.label() != 'NP':
                continue
            phrase = " ".join(word for word, tag in subtree.leaves())
            normalized = _norm_skill_token(phrase)
            if phrase not in junk_phrases and normalized and phrase not in STOPWORDS:
                skills.append(normalized)

        # NOTE(review): this is a substring test, so 'report' also matches
        # 'reporting' - confirm whether whole-word matching was intended.
        for keyword in keywords:
            normalized_keyword = _norm_skill_token(keyword)
            if keyword in lowered and normalized_keyword not in skills:
                skills.append(normalized_keyword)

        # Keep the first phrase seen for each stemmed form (drops inflection duplicates).
        stemmed_skills = {}
        for skill in skills:
            stemmed_phrase = ' '.join(stemmer.stem(word) for word in skill.split())
            stemmed_skills.setdefault(stemmed_phrase, skill)
        return list(stemmed_skills.values())

    original_df['Skills'] = original_df['qualifications'].apply(extract_skills_from_text)

    if os.path.exists(FINETUNED_MODEL_PATH):
        model = SentenceTransformer(FINETUNED_MODEL_PATH)
    else:
        model = SentenceTransformer("all-MiniLM-L6-v2")
        fine_tune_model(model, original_df)
        # Reload from disk so the encoder in use is exactly the saved one.
        model = SentenceTransformer(FINETUNED_MODEL_PATH)

    combined_job_embeddings = model.encode(combined_df["full_text"].tolist(), convert_to_tensor=True)
    original_job_title_embeddings = model.encode(original_df["job_title"].tolist(), convert_to_tensor=True)

    build_known_vocabulary(combined_df)

    return "--- Initialization Complete ---"

# --- GRADIO INTERFACE DEFINITION ---
def build_interface():
    """Assemble the Gradio Blocks UI and wire up its event handlers.

    Layout, top to bottom: query box and Top-N slider, status line, hidden
    spelling alert with its two buttons, search/reset buttons, results table,
    optional skills re-ranking, job selector, job details and learning plan.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks() as ui:
        gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")
        gr.Markdown("<i>Uses Augmented Data & LLM for Robust Search + Your Skills for Reranking.</i>")

        # --- Query input ---
        with gr.Row():
            dream_text = gr.Textbox(
                label='Dream job:',
                lines=3,
                placeholder="Describe your ideal role (what you do, impact, tools, industry, etc.)",
                scale=3,
            )
            topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Top N:", scale=1)

        status_text = gr.Markdown("Status: Ready.")
        spelling_alert = gr.Markdown(visible=False)

        # --- Spelling prompt, hidden until a handler reveals it ---
        with gr.Row(visible=False) as spelling_row:
            search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
            retype_btn = gr.Button("Retype/Fix Input", variant="stop")

        with gr.Row():
            search_btn = gr.Button("Find matches", variant="primary")
            reset_btn = gr.Button("Reset", variant="secondary")

        df_output = gr.DataFrame(label="Job Matches")

        with gr.Accordion("Optional: Rerank by your skills", open=False):
            skills_text = gr.Textbox(
                label='Your skills:',
                placeholder="Comma-separated (e.g., Python, SolidWorks, FEA, leadership)",
            )
            rerank_btn = gr.Button("Add skills & Re-rank")

        job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:")

        with gr.Accordion("Job Details", open=True):
            job_details_markdown = gr.Markdown()
            with gr.Accordion("Duties"):
                duties_markdown = gr.Markdown()
            with gr.Accordion("Qualifications"):
                qualifications_markdown = gr.Markdown()
            with gr.Accordion("Description"):
                description_markdown = gr.Markdown()

        with gr.Accordion("Learning Plan"):
            learning_plan_output = gr.HTML()

        # --- Event wiring ---
        # The three search-style buttons share one input/output signature.
        search_inputs = [dream_text, topk_slider, skills_text]
        result_outputs = [
            status_text, df_output, job_selector,
            spelling_alert, spelling_row, job_details_markdown,
        ]

        # NOTE(review): find_matches_and_rank_with_check,
        # find_matches_and_rank_anyway and on_select_job are not defined in the
        # visible part of this file - confirm they exist before launch.
        search_handlers = (
            (search_btn, find_matches_and_rank_with_check),
            (rerank_btn, find_matches_and_rank_anyway),
            (search_anyway_btn, find_matches_and_rank_anyway),
        )
        for button, handler in search_handlers:
            # Fresh list copies so no two listeners share a list object.
            button.click(fn=handler, inputs=list(search_inputs), outputs=list(result_outputs))

        # Clear the results and hide the spelling prompt; the query text is kept.
        retype_btn.click(
            lambda: (
                "Status: Ready to retype.", pd.DataFrame(), gr.Dropdown(choices=[], value=None),
                gr.Markdown(visible=False), gr.Row(visible=False),
                ""
            ),
            inputs=[],
            outputs=list(result_outputs)
        )

        def on_reset():
            # One value per component in the reset outputs below, same order.
            cleared_query = ""
            cleared_table = pd.DataFrame()
            cleared_selector = gr.Dropdown(choices=[], value=None)
            cleared_skills = ""
            hidden_alert = gr.Markdown("", visible=False)
            hidden_row = gr.Row(visible=False)
            cleared_details = ""
            return (
                cleared_query,
                cleared_table,
                cleared_selector,
                cleared_skills,
                hidden_alert,
                hidden_row,
                cleared_details,
            )

        reset_btn.click(
            fn=on_reset,
            inputs=[],
            outputs=[
                dream_text, df_output, job_selector, skills_text,
                spelling_alert, spelling_row, job_details_markdown,
            ]
        )

        job_selector.change(
            fn=on_select_job,
            inputs=[job_selector, skills_text],
            outputs=[
                job_details_markdown, duties_markdown, qualifications_markdown,
                description_markdown, learning_plan_output,
            ]
        )

    return ui

# --- INITIALIZATION AND LAUNCH ---
# Runs only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    # Fill the module-level data/model globals before the UI is built;
    # find_job_matches() and the spell checker read them at request time.
    initialize_data_and_model()
    ui = build_interface()
    ui.launch()