bcueva committed on
Commit
03c9c3a
·
verified ·
1 Parent(s): 90cd8fe

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +390 -0
app.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pandas as pd
3
+ import datasets
4
+ from sentence_transformers import SentenceTransformer, util, losses, InputExample
5
+ from datasets import Dataset
6
+ import torch
7
+ import re
8
+ import nltk
9
+ from nltk.corpus import words
10
+ from nltk.corpus import stopwords
11
+ from IPython.display import display, clear_output
12
+ import ipywidgets as widgets
13
+ from sklearn.feature_extraction.text import TfidfVectorizer
14
+ from sklearn.metrics.pairwise import cosine_similarity
15
+ import numpy as np
16
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
17
+ import os
18
+ from nltk.stem import PorterStemmer
19
+ import gradio as gr
20
+ import urllib.parse as _url
21
+
22
# --- Download necessary NLTK data ---
# Each entry pairs the lookup path inside the NLTK data directory with the
# package name to fetch when that lookup fails.
#
# The last two entries cover NLTK >= 3.9, where word_tokenize() loads
# `punkt_tab` and pos_tag() loads `averaged_perceptron_tagger_eng` instead of
# the older pickled resources. On older NLTK releases these packages may not
# exist in the download index; nltk.download() then returns False and the
# older resources listed first are used.
for _resource_path, _package in (
    ('corpora/words', 'words'),
    ('corpora/stopwords', 'stopwords'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
    ('tokenizers/punkt', 'punkt'),
    ('taggers/averaged_perceptron_tagger_eng', 'averaged_perceptron_tagger_eng'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package, quiet=True)

STOPWORDS = set(stopwords.words('english'))
stemmer = PorterStemmer()
42
+
43
# --- GLOBAL STATE & DATA ---
# Module-level state shared by the functions in this file. The None
# placeholders are assigned through `global` statements in
# initialize_data_and_model() and initialize_llm_client().
original_df = augmented_df = combined_df = None
model = None
combined_job_embeddings = original_job_title_embeddings = None
LLM_PIPELINE = None
LLM_MODEL_NAME = "microsoft/phi-2"
FINETUNED_MODEL_PATH = "./finetuned_model"
KNOWN_WORDS = set()  # replaced by build_known_vocabulary()
55
+
56
+ # --- CORE NLP & HELPER FUNCTIONS ---
57
+ def _norm_skill_token(s: str) -> str:
58
+ s = s.lower().strip()
59
+ s = re.sub(r'[\(\)\[\]\{\}\*]', '', s)
60
+ s = re.sub(r'^\W+|\W+$', '', s)
61
+ s = re.sub(r'\s+', ' ', s)
62
+ return s
63
+
64
def _skill_match(token1: str, token2: str, threshold: float = 0.9) -> bool:
    """Decide whether two skill strings refer to the same skill.

    Both inputs are normalised with ``_norm_skill_token``. They match when the
    normalised forms are equal, when one contains the other, or (only for
    forms longer than two characters) when the cosine similarity of their
    TF-IDF vectors is at least ``threshold``.

    Args:
        token1: First skill string.
        token2: Second skill string.
        threshold: Minimum TF-IDF cosine similarity counted as a match.

    Returns:
        True when the two strings are considered the same skill.
    """
    t1 = _norm_skill_token(token1)
    t2 = _norm_skill_token(token2)

    # An empty token (e.g. from a trailing comma in user input) must not
    # match: `"" in anything` is True and would mark every skill as matched.
    if not t1 or not t2:
        return False

    if t1 == t2 or t1 in t2 or t2 in t1:
        return True

    if len(t1) <= 2 or len(t2) <= 2:
        return False

    try:
        vectors = TfidfVectorizer().fit_transform([t1, t2])
    except ValueError:
        # Raised when neither string yields a token for the vectorizer
        # (empty vocabulary); treat that as "no fuzzy match".
        return False
    return bool(cosine_similarity(vectors)[0, 1] >= threshold)
79
+
80
def build_known_vocabulary(df: pd.DataFrame):
    """Rebuild the ``KNOWN_WORDS`` global from the NLTK word list and ``df``.

    The vocabulary is the union of the lowercased NLTK ``words`` corpus and
    every purely alphabetic token longer than two characters found in the
    ``full_text`` column of ``df``.

    Returns:
        A fixed status message.
    """
    global KNOWN_WORDS
    dictionary_vocab = {entry.lower() for entry in words.words()}
    corpus_text = " ".join(df['full_text'].astype(str).tolist()).lower()
    dataset_vocab = {
        token
        for token in re.findall(r'\w+', corpus_text)
        if token.isalpha() and len(token) > 2
    }
    KNOWN_WORDS = dictionary_vocab.union(dataset_vocab)
    return "Known vocabulary built (English dictionary + combined dataset words)."
87
+
88
def check_spelling_in_query(query: str) -> list[str]:
    """List the words of ``query`` that are absent from ``KNOWN_WORDS``.

    The query is lowercased and split on whitespace. Only purely alphabetic
    tokens longer than one character are checked, so tokens carrying digits
    or punctuation are never reported. Returns an empty list while
    ``KNOWN_WORDS`` is empty. Duplicates are removed; order is unspecified.
    """
    tokens = query.lower().split()
    if not KNOWN_WORDS:
        return []
    unknown = {
        token
        for token in tokens
        if token.isalpha() and len(token) > 1 and token not in KNOWN_WORDS
    }
    return list(unknown)
98
+
99
+
100
def initialize_llm_client():
    """Load the local text-generation model into the ``LLM_PIPELINE`` global.

    Returns:
        True when the pipeline was built; False when loading failed, in which
        case the error is printed and ``LLM_PIPELINE`` is left unchanged.
    """
    global LLM_PIPELINE
    try:
        tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, trust_remote_code=True)
        # Named `llm_model` so it is not confused with the module-level
        # sentence-embedding `model`. Device placement is delegated to
        # device_map="auto", so no explicit device index is computed here.
        llm_model = AutoModelForCausalLM.from_pretrained(
            LLM_MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        LLM_PIPELINE = pipeline(
            "text-generation",
            model=llm_model,
            tokenizer=tokenizer,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.7,
        )
        return True
    except Exception as e:
        # Best-effort: the caller decides what to do when no LLM is available.
        print(f"🚨 ERROR initializing local LLM: {e}")
        return False
123
+
124
def llm_expand_query(user_input: str) -> str:
    """Expand a short career interest into a richer query using the local LLM.

    Args:
        user_input: The user's raw description of their career interest.

    Returns:
        ``user_input`` followed by the generated elaboration, with newlines
        and colons removed. Returns ``user_input`` unchanged when
        ``LLM_PIPELINE`` is not set or when generation fails.
    """
    if not LLM_PIPELINE:
        return user_input

    # Line breaks are written as "\n" escapes; a physical newline inside a
    # single-quoted literal is a syntax error.
    prompt_template = (
        f"User's career interest: '{user_input}'\n"
        "Instruction: Based on the user's interest, write a concise, single-sentence summary (40-60 words) that elaborates on the core intent, typical skills, and responsibilities. "
        "Do not include a preamble, the user input, or any list formatting in the output. Just the expanded sentence.\n"
        "Expanded Intent:"
    )
    try:
        response = LLM_PIPELINE(
            prompt_template,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.6,
        )
        expanded_query = response[0]['generated_text'].strip()
        # The generated text may echo the prompt; keep only what follows the
        # final "Expanded Intent:" marker.
        if "Expanded Intent:" in expanded_query:
            expanded_query = expanded_query.split("Expanded Intent:")[-1].strip()
        final_query = user_input + ". " + expanded_query.replace('\n', ' ').replace(':', '').strip()
        return final_query.replace('..', '.').strip()
    except Exception as e:
        # Expansion is optional: report the failure and search with the raw input.
        print(f"ERROR expanding query with local LLM: {e}")
        return user_input
152
+
153
def find_job_matches(
    original_user_query: str,
    expanded_user_query: str,
    top_k: int = 20,
) -> pd.DataFrame:
    """Rank jobs against a query by blending two cosine-similarity scores.

    The expanded query is compared with ``combined_job_embeddings`` and the
    original query with ``original_job_title_embeddings``. Per ``job_id`` the
    final ``Similarity Score`` is ``0.70 * general + 0.30 * title``.

    Args:
        original_user_query: The query as typed by the user.
        expanded_user_query: The (possibly LLM-expanded) query text.
        top_k: Number of distinct jobs to return.

    Returns:
        Rows of ``original_df`` for the best ``top_k`` jobs, sorted by
        ``Similarity Score`` descending, indexed by job id, with the
        ``job_id`` column renamed to ``Job ID``.
    """
    # Score every row of combined_df against the expanded query.
    expanded_vec = model.encode(expanded_user_query, convert_to_tensor=True)
    general_scores = util.cos_sim(expanded_vec, combined_job_embeddings)[0]
    ranking = torch.topk(general_scores, k=len(combined_df))
    ranked_rows = combined_df.iloc[ranking.indices.cpu()].copy()
    ranked_rows['general_score'] = ranking.values.cpu().numpy()

    # Rows are in descending score order here, so keep='first' retains the
    # best-scoring row for each job_id.
    best_per_job = ranked_rows.drop_duplicates(subset=['job_id'], keep='first')
    best_per_job = best_per_job.set_index('job_id')

    # Title similarity is computed from the un-expanded query.
    original_vec = model.encode(original_user_query, convert_to_tensor=True)
    title_scores = util.cos_sim(original_vec, original_job_title_embeddings)[0].cpu().numpy()
    title_score_by_job = pd.Series(title_scores, index=original_df['job_id'])
    best_per_job['title_boost_score'] = best_per_job.index.map(title_score_by_job).fillna(0)

    best_per_job['Similarity Score'] = (
        0.70 * best_per_job['general_score']
        + 0.30 * best_per_job['title_boost_score']
    )

    by_final_score = best_per_job.sort_values(by='Similarity Score', ascending=False)
    top_job_ids = by_final_score.head(top_k).index.tolist()

    # Return the original (non-augmented) rows, with the blended score attached.
    selected = original_df[original_df['job_id'].isin(top_job_ids)].copy()
    score_table = best_per_job.reset_index()[['job_id', 'Similarity Score']].copy()
    result = selected.merge(score_table, on='job_id', how='left')
    result = result.sort_values(by='Similarity Score', ascending=False).reset_index(drop=True)
    result = result.set_index('job_id', drop=False)
    return result.rename(columns={'job_id': 'Job ID'})
180
+
181
def score_jobs_by_skills(user_tokens: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
    """Re-rank job rows by how many of their listed skills the user has.

    Args:
        user_tokens: The user's skills as strings.
        df_to_rank: Job rows carrying ``Similarity Score`` and ``Job ID``
            columns and, optionally, a ``Skills`` column of lists.

    Returns:
        An empty frame when ``df_to_rank`` is None or empty. Without a
        ``Skills`` column, a copy sorted by ``Similarity Score`` only.
        Otherwise a copy with ``Skill Matches``, ``Skill Match Count`` and
        ``Skill Match Score`` added, sorted by skill score then similarity
        score and indexed by ``Job ID``.
    """
    if df_to_rank is None or df_to_rank.empty:
        return pd.DataFrame()

    ranked = df_to_rank.copy()
    if 'Skills' not in ranked.columns:
        return ranked.sort_values(by='Similarity Score', ascending=False)

    def skill_overlap(row):
        # Returns (matched skills, match count, fraction of job skills matched).
        required = row.get('Skills', [])
        if not isinstance(required, list):
            return [], 0, 0.0
        hits = [
            skill
            for skill in required
            if any(_skill_match(token, skill) for token in user_tokens)
        ]
        coverage = len(hits) / len(required) if required else 0.0
        return hits, len(hits), coverage

    overlap = ranked.apply(skill_overlap, axis=1, result_type='expand')
    ranked[['Skill Matches', 'Skill Match Count', 'Skill Match Score']] = overlap

    ranked = ranked.sort_values(
        by=['Skill Match Score', 'Similarity Score'],
        ascending=[False, False],
    )
    ranked = ranked.reset_index(drop=True)
    return ranked.set_index('Job ID', drop=False).rename_axis(None)
205
+
206
def fine_tune_model(model: SentenceTransformer, df: pd.DataFrame):
    """Fine-tune ``model`` on (job title, full text) pairs and save it.

    Every row of ``df`` contributes one pair built from its ``job_title`` and
    ``full_text`` columns. Training runs for one epoch with
    ``MultipleNegativesRankingLoss``; the result is written to
    ``FINETUNED_MODEL_PATH``.
    """
    # Keep the trainer from reporting to Weights & Biases.
    os.environ["WANDB_DISABLED"] = "true"

    pairs = []
    for _, record in df.iterrows():
        pairs.append(InputExample(texts=[record['job_title'], record['full_text']]))

    loader = torch.utils.data.DataLoader(pairs, shuffle=True, batch_size=16)
    objective = (loader, losses.MultipleNegativesRankingLoss(model))
    model.fit(
        train_objectives=[objective],
        epochs=1,
        warmup_steps=100,
        show_progress_bar=True,
    )
    model.save(FINETUNED_MODEL_PATH)
221
+
222
def initialize_data_and_model():
    """Load the jobs dataset, derive skills, prepare the encoder and embeddings.

    Populates the module globals ``original_df``, ``augmented_df``,
    ``combined_df``, ``model``, ``combined_job_embeddings`` and
    ``original_job_title_embeddings``, and rebuilds the spelling vocabulary.

    Returns:
        A fixed status message.
    """
    global original_df, augmented_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings

    # The LLM is optional: llm_expand_query() returns its input unchanged
    # while LLM_PIPELINE is unset, so a failed load is not fatal here.
    initialize_llm_client()

    ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
    original_df = ds["original"].to_pandas()
    augmented_df = ds["augmented"].to_pandas()

    original_df['job_id'] = original_df.index
    max_id = len(original_df) - 1
    # NOTE(review): assumes the augmented split holds 20 consecutive rows per
    # original job, in the same order as the original split. Confirm against
    # the dataset.
    augmented_df['job_id'] = augmented_df.index.map(lambda i: min(i // 20, max_id))

    def create_full_text(row):
        # Concatenate the descriptive columns into one searchable string.
        return " ".join([
            str(row["Job title"]),
            str(row["Company"]),
            str(row["Duties"]),
            str(row["qualifications"]),
            str(row["Description"]),
        ])

    original_df["full_text"] = original_df.apply(create_full_text, axis=1)
    augmented_df["full_text"] = augmented_df.apply(create_full_text, axis=1)
    # combined_df is built before the rename below, so it keeps the dataset's
    # original column names.
    combined_df = pd.concat([original_df, augmented_df], ignore_index=True)

    original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})

    # Built once here instead of on every call to extract_skills_from_text.
    chunk_parser = nltk.RegexpParser("NP: {<JJ.?>*<NN.?>+}")
    junk_phrases = {'demonstrated experience', 'experience', 'related field', 'college/university level', 'equivalent foreign degree', 'cacrep standards', 'students', 'learning experience', 'ability', 'process', 'accreditation', 'human development', 'social welfare', 'sociology', 'pre-service teachers', 'abilities', 'books', 'certifications', 'college', 'level', 'licenses', 'years', 'form', 'knowledge', 'skills'}
    # Keywords are matched as whole words: without the \b anchors, "report"
    # would also fire on "reporting" or "reported". A tuple keeps the order in
    # which keyword skills are appended deterministic.
    keyword_patterns = [
        (_norm_skill_token(keyword), re.compile(r'\b' + re.escape(keyword) + r'\b'))
        for keyword in ('teaching', 'training', 'leadership', 'management', 'data management', 'budget development', 'report')
    ]

    def extract_skills_from_text(text):
        # Returns normalised noun phrases plus any whitelisted keywords,
        # de-duplicated by stemmed form.
        if not isinstance(text, str):
            return []
        lowered = text.lower()
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(lowered))
        skills = []
        for subtree in chunk_parser.parse(tagged_tokens).subtrees():
            if subtree.label() != 'NP':
                continue
            phrase = " ".join(word for word, _tag in subtree.leaves())
            normalized = _norm_skill_token(phrase)
            if phrase not in junk_phrases and normalized and phrase not in STOPWORDS:
                skills.append(normalized)
        for normalized_keyword, pattern in keyword_patterns:
            if pattern.search(lowered) and normalized_keyword not in skills:
                skills.append(normalized_keyword)
        # Keep the first surface form seen for each stemmed phrase.
        stemmed_skills = {}
        for skill in skills:
            stemmed_phrase = ' '.join(stemmer.stem(word) for word in skill.split())
            stemmed_skills.setdefault(stemmed_phrase, skill)
        return list(stemmed_skills.values())

    original_df['Skills'] = original_df['qualifications'].apply(extract_skills_from_text)

    if os.path.exists(FINETUNED_MODEL_PATH):
        model = SentenceTransformer(FINETUNED_MODEL_PATH)
    else:
        model = SentenceTransformer("all-MiniLM-L6-v2")
        fine_tune_model(model, original_df)
        # Reload from disk so the in-memory model is the saved one.
        model = SentenceTransformer(FINETUNED_MODEL_PATH)

    combined_job_embeddings = model.encode(combined_df["full_text"].tolist(), convert_to_tensor=True)
    original_job_title_embeddings = model.encode(original_df["job_title"].tolist(), convert_to_tensor=True)

    build_known_vocabulary(combined_df)

    return "--- Initialization Complete ---"
291
+
292
+ # --- GRADIO INTERFACE DEFINITION ---
293
def build_interface():
    """Assemble the Gradio Blocks UI and wire its event handlers.

    Returns:
        The ``gr.Blocks`` instance, not yet launched.
    """
    # NOTE(review): the handlers find_matches_and_rank_with_check,
    # find_matches_and_rank_anyway and on_select_job are referenced below but
    # are not defined in this file; they must be provided before this runs.
    # NOTE(review): the nesting of the accordions was reconstructed from a
    # source without indentation; confirm against the intended layout.
    with gr.Blocks() as ui:
        gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")
        gr.Markdown("<i>Uses Augmented Data & LLM for Robust Search + Your Skills for Reranking.</i>")

        # Query inputs.
        with gr.Row():
            dream_text = gr.Textbox(
                label='Dream job:',
                lines=3,
                placeholder="Describe your ideal role (what you do, impact, tools, industry, etc.)",
                scale=3,
            )
            topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Top N:", scale=1)

        status_text = gr.Markdown("Status: Ready.")
        spelling_alert = gr.Markdown(visible=False)

        # Hidden until a handler makes the row visible.
        with gr.Row(visible=False) as spelling_row:
            search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
            retype_btn = gr.Button("Retype/Fix Input", variant="stop")

        with gr.Row():
            search_btn = gr.Button("Find matches", variant="primary")
            reset_btn = gr.Button("Reset", variant="secondary")

        df_output = gr.DataFrame(label="Job Matches")

        with gr.Accordion("Optional: Rerank by your skills", open=False):
            skills_text = gr.Textbox(
                label='Your skills:',
                placeholder="Comma-separated (e.g., Python, SolidWorks, FEA, leadership)",
            )
            rerank_btn = gr.Button("Add skills & Re-rank")

        job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:")

        with gr.Accordion("Job Details", open=True):
            job_details_markdown = gr.Markdown()
        with gr.Accordion("Duties"):
            duties_markdown = gr.Markdown()
        with gr.Accordion("Qualifications"):
            qualifications_markdown = gr.Markdown()
        with gr.Accordion("Description"):
            description_markdown = gr.Markdown()

        with gr.Accordion("Learning Plan"):
            learning_plan_output = gr.HTML()

        # The three search-style buttons and the retype button all read and
        # update the same components.
        query_inputs = [dream_text, topk_slider, skills_text]
        search_outputs = [
            status_text, df_output, job_selector,
            spelling_alert, spelling_row, job_details_markdown,
        ]

        search_btn.click(
            fn=find_matches_and_rank_with_check,
            inputs=query_inputs,
            outputs=search_outputs,
        )
        rerank_btn.click(
            fn=find_matches_and_rank_anyway,
            inputs=query_inputs,
            outputs=search_outputs,
        )
        search_anyway_btn.click(
            fn=find_matches_and_rank_anyway,
            inputs=query_inputs,
            outputs=search_outputs,
        )

        def on_retype():
            # Clear results and hide the spelling prompt; the typed query stays.
            return (
                "Status: Ready to retype.",
                pd.DataFrame(),
                gr.Dropdown(choices=[], value=None),
                gr.Markdown(visible=False),
                gr.Row(visible=False),
                "",
            )

        retype_btn.click(on_retype, inputs=[], outputs=search_outputs)

        def on_reset():
            # One value per component in the reset outputs list below.
            return (
                "",
                pd.DataFrame(),
                gr.Dropdown(choices=[], value=None),
                "",
                gr.Markdown("", visible=False),
                gr.Row(visible=False),
                "",
            )

        reset_btn.click(
            fn=on_reset,
            inputs=[],
            outputs=[
                dream_text, df_output, job_selector, skills_text,
                spelling_alert, spelling_row, job_details_markdown,
            ],
        )

        job_selector.change(
            fn=on_select_job,
            inputs=[job_selector, skills_text],
            outputs=[
                job_details_markdown, duties_markdown, qualifications_markdown,
                description_markdown, learning_plan_output,
            ],
        )

    return ui
385
+
386
+ # --- INITIALIZATION AND LAUNCH ---
387
if __name__ == "__main__":
    # Populate the module-level data, model and embeddings before the UI is
    # built and served.
    initialize_data_and_model()
    ui = build_interface()
    ui.launch()