zlf18 committed on
Commit
f17cdab
·
verified ·
1 Parent(s): df2be10

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +440 -0
app.py ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import datasets
3
+ from sentence_transformers import SentenceTransformer, util
4
+ import torch
5
+ import re
6
+ import nltk
7
+ from nltk.corpus import words, stopwords
8
+ import urllib.parse as _url
9
+ from sklearn.feature_extraction.text import TfidfVectorizer
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
12
+ from nltk.stem import PorterStemmer
13
+ import gradio as gr
14
+ import spacy # --- NEW: Import spaCy ---
15
+
16
+ # --- CORRECTED: Download necessary NLTK data ---
17
+ # This revised block is more direct and ensures all packages are downloaded.
18
# Resource path that nltk.data.find() is asked for, keyed by the package name
# that nltk.download() is given when that lookup fails.
_NLTK_RESOURCE_PATHS = {
    'words': 'corpora/words',
    'stopwords': 'corpora/stopwords',
    'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger',
    'punkt': 'tokenizers/punkt',
}

for package, resource_path in _NLTK_RESOURCE_PATHS.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Not available locally yet: fetch it once.
        nltk.download(package)
23
+ # ------------------------------------------------
24
+
25
# --- TEXT HELPERS ---
# Used by extract_skills_from_text() to filter and de-duplicate phrases.
stemmer = PorterStemmer()
STOPWORDS = set(stopwords.words('english'))

# --- MODEL IDENTIFIERS ---
LLM_MODEL_NAME = "microsoft/phi-2"
FINETUNED_MODEL_ID = "its-zion-18/projfinetuned"

# --- GLOBAL STATE & DATA ---
# Everything below starts empty and is filled in at start-up:
# initialize_data_and_model() sets the frames, the sentence-transformer model,
# the embeddings and NLP_MODEL; initialize_llm_client() sets LLM_PIPELINE;
# build_known_vocabulary() sets KNOWN_WORDS.
original_df = combined_df = None
model = None
combined_job_embeddings = original_job_title_embeddings = None
LLM_PIPELINE = None
NLP_MODEL = None
KNOWN_WORDS = set()
39
+
40
+ # --- CORE NLP & HELPER FUNCTIONS ---
41
+ def _norm_skill_token(s: str) -> str:
42
+ s = s.lower().strip()
43
+ s = re.sub(r'[\(\)\[\]\{\}\*]', '', s)
44
+ s = re.sub(r'^\W+|\W+$', '', s)
45
+ s = re.sub(r'\s+', ' ', s)
46
+ return s
47
+
48
def _skill_match(token1: str, token2: str, threshold: float = 0.9) -> bool:
    """Decide whether two skill phrases refer to the same skill.

    Both phrases are normalized with ``_norm_skill_token``. They match when
    the normalized forms are equal, when one contains the other, or (for
    phrases longer than two characters) when the cosine similarity of their
    TF-IDF vectors reaches ``threshold``.

    Args:
        token1: First skill phrase.
        token2: Second skill phrase.
        threshold: Minimum TF-IDF cosine similarity counted as a match.

    Returns:
        True when the phrases match, False otherwise. A phrase that
        normalizes to the empty string never matches anything.
    """
    t1 = _norm_skill_token(token1)
    t2 = _norm_skill_token(token2)

    # The empty string is a substring of every string, so without this guard
    # a blank or punctuation-only token would "match" every skill.
    if not t1 or not t2:
        return False

    if t1 == t2 or t1 in t2 or t2 in t1:
        return True

    # Very short tokens are only compared by equality/containment above.
    if len(t1) <= 2 or len(t2) <= 2:
        return False

    try:
        vectors = TfidfVectorizer().fit_transform([t1, t2])
    except ValueError:
        # The vectorizer raises ValueError when neither phrase yields a token
        # (empty vocabulary); such a pair cannot be compared.
        return False
    return bool(cosine_similarity(vectors)[0, 1] >= threshold)
63
+
64
def build_known_vocabulary(df: pd.DataFrame):
    """Rebuild the global ``KNOWN_WORDS`` set.

    The vocabulary is the union of the lower-cased NLTK ``words`` corpus and
    every purely alphabetic word longer than two characters found in the
    ``full_text`` column of ``df``.

    Returns:
        The fixed status string "Known vocabulary built.".
    """
    global KNOWN_WORDS
    corpus_text = " ".join(df['full_text'].astype(str).tolist()).lower()
    job_words = {
        token
        for token in set(re.findall(r'\b\w+\b', corpus_text))
        if token.isalpha() and len(token) > 2
    }
    english_words = {entry.lower() for entry in words.words()}
    KNOWN_WORDS = english_words | job_words
    return "Known vocabulary built."
71
+
72
def check_spelling_in_query(query: str) -> list[str]:
    """Return the words of ``query`` that are not in ``KNOWN_WORDS``.

    The query is lower-cased and split on whitespace. Punctuation attached to
    either end of a word is removed before the check, so "enginer," is
    checked as "enginer" instead of being skipped for not being alphabetic.
    Only purely alphabetic words longer than one character are checked.

    Args:
        query: Free-text query typed by the user.

    Returns:
        Unrecognized words, de-duplicated, in order of first appearance.
        Empty when the vocabulary has not been built yet.
    """
    if not KNOWN_WORDS:
        return []

    # A dict keeps insertion order, giving a stable, duplicate-free result.
    unrecognized: dict[str, None] = {}
    for raw_word in query.lower().split():
        word = re.sub(r'^\W+|\W+$', '', raw_word)
        if word.isalpha() and len(word) > 1 and word not in KNOWN_WORDS:
            unrecognized[word] = None
    return list(unrecognized)
80
+
81
def initialize_llm_client():
    """Load the local text-generation model into the global ``LLM_PIPELINE``.

    Returns:
        True when the tokenizer, model and pipeline were created, False when
        any step raised (the error is printed).
    """
    global LLM_PIPELINE
    generation_defaults = {"max_new_tokens": 100, "do_sample": True, "temperature": 0.7}
    try:
        llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, trust_remote_code=True)
        llm_model = AutoModelForCausalLM.from_pretrained(
            LLM_MODEL_NAME, torch_dtype="auto", device_map="auto", trust_remote_code=True
        )
        LLM_PIPELINE = pipeline(
            "text-generation", model=llm_model, tokenizer=llm_tokenizer, **generation_defaults
        )
    except Exception as e:
        print(f"🚨 ERROR initializing local LLM: {e}")
        return False
    return True
95
+
96
def llm_expand_query(user_input: str) -> str:
    """Expand a short career interest into a richer search query with the LLM.

    Args:
        user_input: The user's own description of the job they want.

    Returns:
        ``user_input`` followed by the generated one-sentence elaboration.
        ``user_input`` is returned unchanged when no LLM pipeline is loaded,
        when generation fails, or when the model produces no text.
    """
    if not LLM_PIPELINE:
        return user_input

    prompt_template = (
        f"User's career interest: '{user_input}'\n"
        "Instruction: Based on the user's interest, write a concise, single-sentence summary (40-60 words) that elaborates on the core intent, typical skills, and responsibilities. "
        "Do not include a preamble, the user input, or any list formatting in the output. Just the expanded sentence.\n"
        "Expanded Intent:"
    )
    try:
        response = LLM_PIPELINE(prompt_template, max_new_tokens=100, do_sample=True, temperature=0.6)
        # The generated text may echo the prompt; keep only what follows the
        # final "Expanded Intent:" marker.
        expanded_query = response[0]['generated_text'].strip().split("Expanded Intent:")[-1].strip()
    except Exception as e:
        # Query expansion is best-effort: report the failure (same style as
        # initialize_llm_client) and fall back to the raw query.
        print(f"🚨 ERROR expanding query with LLM: {e}")
        return user_input

    expanded_query = expanded_query.replace('\n', ' ').replace(':', '').strip()
    if not expanded_query:
        # Nothing was generated; do not append a stray "." to the query.
        return user_input

    final_query = user_input + ". " + expanded_query
    return final_query.replace('..', '.').strip()
113
+
114
def find_job_matches(original_user_query: str, expanded_user_query: str, top_k: int = 50) -> pd.DataFrame:
    """Rank jobs against the user's query with a blended similarity score.

    The expanded query is compared with every row of ``combined_df`` and the
    best-scoring row per ``job_id`` is kept ("general score"). The original
    query is compared with the job titles of ``original_df`` ("title boost").
    The final "Similarity Score" is 0.70 * general + 0.30 * title boost.

    Args:
        original_user_query: The query exactly as the user typed it.
        expanded_user_query: The LLM-expanded version of that query.
        top_k: Number of best jobs to return.

    Returns:
        Rows of ``original_df`` for the top jobs, sorted by descending
        "Similarity Score", indexed by job id and with the ``job_id`` column
        renamed to "Job ID".
    """
    # Stage 1: general similarity over original + augmented rows.
    query_vec = model.encode(expanded_user_query, convert_to_tensor=True)
    general_scores = util.cos_sim(query_vec, combined_job_embeddings)[0]
    ranking = torch.topk(general_scores, k=len(combined_df))
    ranked_rows = combined_df.iloc[ranking.indices.cpu()].copy()
    ranked_rows['general_score'] = ranking.values.cpu().numpy()
    # Rows are already sorted best-first, so keep='first' keeps each job's best row.
    best_per_job = ranked_rows.drop_duplicates(subset=['job_id'], keep='first').set_index('job_id')

    # Stage 2: title similarity against the user's own wording.
    title_vec = model.encode(original_user_query, convert_to_tensor=True)
    title_scores = util.cos_sim(title_vec, original_job_title_embeddings)[0].cpu().numpy()
    boost_by_job = pd.Series(title_scores, index=original_df['job_id'])
    best_per_job['title_boost_score'] = best_per_job.index.map(boost_by_job).fillna(0)

    # Stage 3: blend both signals and pick the winners.
    best_per_job['Similarity Score'] = (
        0.70 * best_per_job['general_score'] + 0.30 * best_per_job['title_boost_score']
    )
    top_ids = best_per_job.sort_values(by='Similarity Score', ascending=False).head(top_k).index.tolist()
    score_table = best_per_job.reset_index()[['job_id', 'Similarity Score']].copy()

    results = original_df[original_df['job_id'].isin(top_ids)].copy()
    results = pd.merge(results, score_table, on='job_id', how='left')
    results = results.sort_values(by='Similarity Score', ascending=False).reset_index(drop=True)
    return results.set_index('job_id', drop=False).rename(columns={'job_id': 'Job ID'})
133
+
134
def score_jobs_by_skills(user_tokens: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
    """Re-rank job matches by how many of each job's skills the user has.

    Args:
        user_tokens: The user's skill phrases.
        df_to_rank: Job matches carrying "Similarity Score" and, normally,
            a "Skills" column holding a list of skill phrases per job.

    Returns:
        An empty frame when ``df_to_rank`` is None or empty. When there is no
        "Skills" column, a copy sorted by "Similarity Score". Otherwise a copy
        with "Skill Matches", "Skill Match Count" and "Skill Match Score"
        added, sorted by skill score then similarity and indexed by "Job ID".
    """
    if df_to_rank is None or df_to_rank.empty:
        return pd.DataFrame()

    ranked_df = df_to_rank.copy()
    if 'Skills' not in ranked_df.columns:
        return ranked_df.sort_values(by='Similarity Score', ascending=False)

    def _match_stats(row):
        # Returns (matched skills, number matched, fraction of job skills matched).
        job_skills = row.get('Skills', [])
        if not isinstance(job_skills, list):
            return [], 0, 0.0
        hits = []
        for skill in job_skills:
            if any(_skill_match(ut, skill) for ut in user_tokens):
                hits.append(skill)
        required = len(job_skills)
        fraction = len(hits) / required if required > 0 else 0.0
        return hits, len(hits), fraction

    stats = ranked_df.apply(_match_stats, axis=1, result_type='expand')
    ranked_df[['Skill Matches', 'Skill Match Count', 'Skill Match Score']] = stats

    ranked_df = ranked_df.sort_values(
        by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]
    )
    ranked_df = ranked_df.reset_index(drop=True)
    return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
149
+
150
+ # --- REPLACED: Skill extraction now uses spaCy for much better accuracy ---
151
def extract_skills_from_text(text: str):
    """Extract candidate skill phrases from free text with the spaCy model.

    Candidates are named entities labelled ORG, PRODUCT or WORK_OF_ART plus
    noun chunks of one to four words. They are normalized with
    ``_norm_skill_token``, filtered against stop words and a list of generic
    phrases, and de-duplicated by their Porter-stemmed form.

    Args:
        text: Text to analyse (e.g. a job's qualifications).

    Returns:
        A sorted list of skill phrases. Empty when ``text`` is not a string
        or when no spaCy model is loaded.
    """
    if not isinstance(text, str) or not NLP_MODEL:
        return []

    doc = NLP_MODEL(text)
    candidates = set()

    # Named entities that are often skills (e.g. 'Python', 'Amazon Web Services').
    for ent in doc.ents:
        if ent.label_ in ("ORG", "PRODUCT", "WORK_OF_ART"):
            candidates.add(ent.text)

    # Noun chunks that look like skills (e.g. 'data analysis').
    for chunk in doc.noun_chunks:
        if 1 <= len(chunk.text.split()) <= 4:
            candidates.add(chunk.text)

    junk_phrases = {'demonstrated experience', 'experience', 'related field', 'college/university level', 'equivalent foreign degree', 'cacrep standards', 'students', 'learning experience', 'ability', 'process', 'accreditation', 'human development', 'social welfare', 'sociology', 'pre-service teachers', 'abilities', 'books', 'certifications', 'college', 'level', 'licenses', 'years', 'form', 'knowledge', 'skills'}
    normalized = {_norm_skill_token(candidate) for candidate in candidates}

    # Sort before de-duplicating: set iteration order changes between runs
    # (string hash randomization), so the phrase kept for a shared stem would
    # otherwise differ from one start-up to the next.
    filtered = sorted(
        s for s in normalized
        if s and s not in STOPWORDS and s not in junk_phrases
    )

    # De-duplicate by stemmed form, keeping the first phrase seen per stem.
    by_stem = {}
    for skill in filtered:
        stem_key = ' '.join(stemmer.stem(word) for word in skill.split())
        by_stem.setdefault(stem_key, skill)

    return sorted(by_stem.values())
182
+ # --- END REPLACEMENT ---
183
+
184
def initialize_data_and_model():
    """Load every model and dataset the app needs and fill the module globals.

    Steps, in order: start the LLM client, load the spaCy model, load the
    "original" and "augmented" dataset splits, build the ``full_text`` column
    and ``combined_df``, extract skills per original job, load the sentence
    transformer, encode the embeddings, and build the known vocabulary.

    Returns:
        The fixed status string "--- Initialization Complete ---".
    """
    global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings, NLP_MODEL

    print("--- Initializing LLM Client ---")
    llm_ready = initialize_llm_client()
    if not llm_ready:
        print("Warning: LLM Client failed to initialize.")

    print("--- Loading spaCy Model for Skill Extraction ---")
    try:
        NLP_MODEL = spacy.load("en_core_web_sm")
    except Exception as e:
        # NLP_MODEL keeps its previous value; extract_skills_from_text()
        # returns [] whenever it is falsy.
        print(f"🚨 ERROR loading spaCy model: {e}. Skill extraction will be disabled.")

    print("--- Loading Datasets ---")
    ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
    original_df = ds["original"].to_pandas()
    augmented_df = ds["augmented"].to_pandas()

    original_df['job_id'] = original_df.index
    max_id = len(original_df) - 1
    # Augmented row i is assigned job id i // 20, capped at the last original id.
    augmented_df['job_id'] = augmented_df.index.map(lambda i: min(i // 20, max_id))

    text_fields = ("Job title", "Company", "Duties", "qualifications", "Description")

    def create_full_text(row):
        return " ".join([str(row.get(field)) for field in text_fields])

    for frame in (original_df, augmented_df):
        frame["full_text"] = frame.apply(create_full_text, axis=1)

    # combined_df is built before the rename below, so it keeps the raw
    # 'Job title' / 'Company' column names.
    combined_df = pd.concat([original_df.copy(), augmented_df.copy()], ignore_index=True)
    original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})

    print("--- Extracting Skills using spaCy (this may take a moment)... ---")
    original_df['Skills'] = original_df['qualifications'].apply(extract_skills_from_text)

    print("--- Loading Fine-Tuned Sentence Transformer Model ---")
    model = SentenceTransformer(FINETUNED_MODEL_ID)

    print("--- Encoding Embeddings ---")
    combined_job_embeddings = model.encode(
        combined_df["full_text"].tolist(), convert_to_tensor=True, show_progress_bar=True
    )
    original_job_title_embeddings = model.encode(
        original_df["job_title"].tolist(), convert_to_tensor=True, show_progress_bar=True
    )

    print("--- Building Vocabulary ---")
    build_known_vocabulary(combined_df)
    return "--- Initialization Complete ---"
224
+
225
+ def _course_links_for(skill: str) -> str:
226
+ q = _url.quote(skill)
227
+ links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")]
228
+ return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
229
+
230
+ # --- GRADIO INTERFACE FUNCTIONS (No changes needed below this line) ---
231
+
232
def get_job_matches(dream_job: str, top_n: int, skills_text: str):
    """Run the search and prepare everything the results area displays.

    The query is expanded with the LLM, matched against the jobs, and — when
    the user supplied skills — re-ranked by skill overlap.

    Args:
        dream_job: The user's description of the job they want.
        top_n: Number of matches to display.
        skills_text: Comma-separated skills; may be empty.

    Returns:
        A 5-tuple: status message, the full (un-truncated) match frame for
        later re-ranking, the table to display, the job dropdown, and the
        details accordion.
    """
    expanded_desc = llm_expand_query(dream_job)
    emb_matches = find_job_matches(dream_job, expanded_desc, top_k=50)
    user_skills = [
        token
        for token in (_norm_skill_token(s) for s in (skills_text or "").split(','))
        if token
    ]

    # DataFrame.head() needs an integer; coerce in case the UI sends a float.
    limit = int(top_n)
    if user_skills:
        display_df = score_jobs_by_skills(user_skills, emb_matches).head(limit)
        status = f"Found and **re-ranked** results by your {len(user_skills)} skills. Displaying top {len(display_df)}."
    else:
        display_df = emb_matches.head(limit)
        status = f"Found {len(display_df)} top matches using semantic search."

    # Select all columns in one step instead of assigning a new column onto a
    # slice, which triggers pandas' SettingWithCopyWarning.
    table_columns = ['job_title', 'company', 'Similarity Score']
    if 'Skill Match Score' in display_df.columns:
        table_columns.append('Skill Match Score')
    table_to_show = display_df[table_columns].copy()

    # iterrows() yields the index label, which here is the job id, so the
    # displayed rank has to come from enumerate(); the job id stays the value.
    dropdown_options = [
        (f"{rank}. {row['job_title']} - {row['company']}", job_id)
        for rank, (job_id, row) in enumerate(display_df.iterrows(), start=1)
    ]
    dropdown_value = dropdown_options[0][1] if dropdown_options else None

    return (
        status,
        emb_matches,
        table_to_show,
        gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True),
        gr.Accordion(visible=True),
    )
258
+
259
def rerank_current_results(initial_matches_df, skills_text, top_n):
    """Re-rank the stored search results by the user's current skills.

    Args:
        initial_matches_df: Match frame saved by the last search, or None.
        skills_text: Comma-separated skills; empty restores the semantic order.
        top_n: Number of matches to display.

    Returns:
        A 3-tuple: status message, the table to display, and the job dropdown.
    """
    if initial_matches_df is None or pd.DataFrame(initial_matches_df).empty:
        return "Please find matches first before re-ranking.", pd.DataFrame(), gr.Dropdown(visible=False)

    initial_matches_df = pd.DataFrame(initial_matches_df)
    # DataFrame.head() needs an integer; coerce in case the UI sends a float.
    limit = int(top_n)

    user_skills = [
        token
        for token in (_norm_skill_token(s) for s in (skills_text or "").split(','))
        if token
    ]
    if not user_skills:
        status = "Skills cleared. Showing original semantic search results."
        display_df = initial_matches_df.head(limit)
        table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
    else:
        ranked_df = score_jobs_by_skills(user_skills, initial_matches_df)
        status = f"Results **re-ranked** based on your {len(user_skills)} skills."
        display_df = ranked_df.head(limit)
        table_to_show = display_df[['job_title', 'company', 'Similarity Score', 'Skill Match Score']]

    # iterrows() yields the index label (the job id), not the position, so
    # the displayed rank comes from enumerate(); the job id stays the value.
    dropdown_options = [
        (f"{rank}. {row['job_title']} - {row['company']}", job_id)
        for rank, (job_id, row) in enumerate(display_df.iterrows(), start=1)
    ]
    dropdown_value = dropdown_options[0][1] if dropdown_options else None
    return status, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True)
279
+
280
def find_matches_and_rank_with_check(dream_job: str, top_n: int, skills_text: str):
    """Handle the "Find Matches" button: validate, spell-check, then search.

    Args:
        dream_job: The user's description of the job they want.
        top_n: Number of matches to display.
        skills_text: Comma-separated skills; may be empty.

    Returns:
        A 7-tuple: status, stored matches, results table, job dropdown,
        details accordion, spelling alert, and the spelling-confirmation row.
        When unrecognized words are found, no search is run and the alert and
        confirmation row are shown instead.
    """
    # A query made only of whitespace carries no intent either; treat it
    # like an empty one instead of running a search on it.
    if not dream_job or not dream_job.strip():
        return "Please describe your dream job first.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(""), gr.Row(visible=False)

    unrecognized_words = check_spelling_in_query(dream_job)
    if unrecognized_words:
        word_list_html = ", ".join([f"<b><span style='color: #F87171;'>{w}</span></b>" for w in unrecognized_words])
        alert_message = f"<b><span style='color: #F87171;'>⚠️ Possible Spelling Error:</span></b> Unrecognized: {word_list_html}."
        return "Status: Awaiting confirmation.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(alert_message, visible=True), gr.Row(visible=True)

    status, emb_matches, table_to_show, dropdown, details_accordion = get_job_matches(dream_job, top_n, skills_text)
    return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)
291
+
292
def find_matches_and_rank_anyway(dream_job: str, top_n: int, skills_text: str):
    """Handle "Search Anyway": run the search without the spelling check.

    Returns:
        The five values from ``get_job_matches`` followed by a hidden
        spelling alert and a hidden spelling-confirmation row.
    """
    search_result = get_job_matches(dream_job, top_n, skills_text)
    status, emb_matches, table_to_show, dropdown, details_accordion = search_result
    hidden_alert = gr.Markdown(visible=False)
    hidden_row = gr.Row(visible=False)
    return status, emb_matches, table_to_show, dropdown, details_accordion, hidden_alert, hidden_row
295
+
296
def on_select_job(job_id, skills_text):
    """Fill the details panel and learning plan for the selected job.

    Args:
        job_id: Index label of the job in ``original_df``, or None.
        skills_text: Comma-separated skills the user typed; may be empty.

    Returns:
        A 9-tuple: job heading markdown, duties, qualifications, description,
        learning-plan HTML, details accordion, list of missing skills kept
        for "load more", number of skills already shown, and the
        "load more" button. The last three are only populated when the user
        entered no skills; otherwise they are ``[]``, ``0`` and a hidden button.
    """
    if job_id is None:
        return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)

    row = original_df.loc[job_id]
    title = str(row.get("job_title", ""))
    company = str(row.get("company", ""))
    job_details_markdown = f"### {title} — {company}"
    duties = str(row.get('Duties', ''))
    qualifications = str(row.get('qualifications', ''))
    description = str(row.get('Description', ''))

    def _outputs(details_md, plan_html, pending=None, offset=0, show_more=False):
        # Assemble the 9 output values in the order the UI expects them.
        return (
            details_md, duties, qualifications, description, plan_html,
            gr.Accordion(visible=True),
            [] if pending is None else pending,
            offset,
            gr.Button(visible=show_more),
        )

    def _bulleted(skills):
        # One list entry, with course links, per skill.
        entries = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills]
        return f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(entries)}</ul>"

    user_skills = [
        token
        for token in (_norm_skill_token(s) for s in skills_text.split(','))
        if token
    ]
    job_skills = row.get("Skills", [])

    if not job_skills:
        return _outputs(job_details_markdown, "<p><i>No specific skills were extracted for this job.</i></p>")

    all_missing_skills = sorted(
        (s for s in job_skills if not any(_skill_match(ut, s) for ut in user_skills)),
        key=lambda x: x.lower(),
    )
    if not all_missing_skills:
        return _outputs(job_details_markdown, "<h4 style='color:green;'>🎉 You have all the required skills!</h4>")

    skills_to_display = all_missing_skills[:5]

    if not user_skills:
        # No skills supplied: show the first page and keep the rest for paging.
        headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
        return _outputs(
            job_details_markdown,
            f"{headline}{_bulleted(skills_to_display)}",
            pending=all_missing_skills,
            offset=len(skills_to_display),
            show_more=len(all_missing_skills) > 5,
        )

    score_val = (len(job_skills) - len(all_missing_skills)) / len(job_skills)
    job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
    if score_val >= 0.8:
        headline = "<b>Great fit!</b>"
    elif score_val >= 0.5:
        headline = "<b>Good progress!</b>"
    else:
        headline = "<b>Solid starting point.</b>"
    learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
    learning_plan_html += _bulleted(skills_to_display)
    return _outputs(job_details_markdown, learning_plan_html)
340
+
341
def load_more_skills(full_skills_list, current_offset):
    """Show five more missing skills in the learning plan.

    Args:
        full_skills_list: Every missing skill for the selected job.
        current_offset: Number of skills currently displayed.

    Returns:
        A 3-tuple: the re-rendered learning-plan HTML (all skills up to the
        new offset), the new offset, and the "load more" button, which stays
        visible only while skills remain beyond the new offset.
    """
    page_size = 5
    new_offset = current_offset + page_size

    entries = []
    for ms in full_skills_list[:new_offset]:
        entries.append(f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>")
    learning_plan_html = (
        "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
        "<ul style='list-style-type: none; padding-left: 0;'>"
        + ''.join(entries)
        + "</ul>"
    )

    has_more = new_offset < len(full_skills_list)
    return learning_plan_html, new_offset, gr.Button(visible=has_more)
352
+
353
def on_reset():
    """Return the start-up value for every component wired to "Reset All".

    The 17 values follow the order of the ``outputs`` list given to
    ``reset_btn.click``.
    """
    cleared_inputs = ("", 3, "")                      # dream_text, topk_slider, skills_text
    cleared_results = (
        pd.DataFrame(),                               # df_output
        None,                                         # initial_matches_state
        gr.Dropdown(visible=False),                   # job_selector
        gr.Accordion(visible=False),                  # details_accordion
        "Status: Ready.",                             # status_text
    )
    cleared_details = ("", "", "", "")                # heading, duties, qualifications, description
    cleared_spelling = (gr.Markdown(visible=False), gr.Row(visible=False))
    cleared_paging = ([], 0, gr.Button(visible=False))  # missing skills, offset, load-more button
    return (
        *cleared_inputs,
        *cleared_results,
        *cleared_details,
        *cleared_spelling,
        *cleared_paging,
    )
355
+
356
+ # --- Run Initialization ---
357
print("Starting application initialization...")
initialization_status = initialize_data_and_model()
print(initialization_status)


def _clear_for_retype():
    """Return the values shown after "Let Me Fix It": results and alert cleared."""
    return (
        "Status: Ready for you to retype.",
        None,
        pd.DataFrame(),
        gr.Dropdown(visible=False),
        gr.Accordion(visible=False),
        gr.Markdown(visible=False),
        gr.Row(visible=False),
    )


# --- Gradio Interface Definition ---
with gr.Blocks(theme=gr.themes.Soft()) as ui:
    gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")

    # Per-session state: last search results and learning-plan paging.
    initial_matches_state = gr.State()
    missing_skills_state = gr.State([])
    skills_offset_state = gr.State(0)

    with gr.Row():
        with gr.Column(scale=3):
            dream_text = gr.Textbox(
                label='Your Dream Job Description',
                lines=3,
                placeholder="e.g., 'A role in a tech startup focused on machine learning...'",
            )
            with gr.Accordion("Optional: Add Your Skills to Re-rank Results", open=False):
                with gr.Row():
                    skills_text = gr.Textbox(
                        label='Your Skills (comma-separated)',
                        placeholder="e.g., Python, data analysis",
                        scale=3,
                    )
                    rerank_btn = gr.Button("Re-rank", variant="secondary", scale=1)
        with gr.Column(scale=1):
            topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of Matches")
            search_btn = gr.Button("Find Matches", variant="primary")
            reset_btn = gr.Button("Reset All")

    status_text = gr.Markdown("Status: Ready.")
    spelling_alert = gr.Markdown(visible=False)
    with gr.Row(visible=False) as spelling_row:
        search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
        retype_btn = gr.Button("Let Me Fix It", variant="stop")

    df_output = gr.DataFrame(label="Job Matches", interactive=False)
    job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:", visible=False)

    with gr.Accordion("Job Details & Learning Plan", open=False, visible=False) as details_accordion:
        job_details_markdown = gr.Markdown()

        with gr.Tabs():
            with gr.TabItem("Duties"):
                duties_markdown = gr.Markdown()
            with gr.TabItem("Qualifications"):
                qualifications_markdown = gr.Markdown()
            with gr.TabItem("Full Description"):
                description_markdown = gr.Markdown()

        learning_plan_output = gr.HTML(label="Learning Plan")
        load_more_btn = gr.Button("Load More Skills", visible=False)

    # --- Event Handlers ---
    # The search, search-anyway and retype handlers all write to the same
    # seven components, and the two search handlers read the same three inputs.
    search_inputs = [dream_text, topk_slider, skills_text]
    search_outputs = [
        status_text,
        initial_matches_state,
        df_output,
        job_selector,
        details_accordion,
        spelling_alert,
        spelling_row,
    ]

    search_btn.click(
        fn=find_matches_and_rank_with_check,
        inputs=search_inputs,
        outputs=search_outputs,
    )
    search_anyway_btn.click(
        fn=find_matches_and_rank_anyway,
        inputs=search_inputs,
        outputs=search_outputs,
    )
    retype_btn.click(
        _clear_for_retype,
        outputs=search_outputs,
    )
    reset_btn.click(
        fn=on_reset,
        outputs=[
            dream_text,
            topk_slider,
            skills_text,
            df_output,
            initial_matches_state,
            job_selector,
            details_accordion,
            status_text,
            job_details_markdown,
            duties_markdown,
            qualifications_markdown,
            description_markdown,
            spelling_alert,
            spelling_row,
            missing_skills_state,
            skills_offset_state,
            load_more_btn,
        ],
        queue=False,
    )
    rerank_btn.click(
        fn=rerank_current_results,
        inputs=[initial_matches_state, skills_text, topk_slider],
        outputs=[status_text, df_output, job_selector],
    )
    job_selector.change(
        fn=on_select_job,
        inputs=[job_selector, skills_text],
        outputs=[
            job_details_markdown,
            duties_markdown,
            qualifications_markdown,
            description_markdown,
            learning_plan_output,
            details_accordion,
            missing_skills_state,
            skills_offset_state,
            load_more_btn,
        ],
    )
    load_more_btn.click(
        fn=load_more_skills,
        inputs=[missing_skills_state, skills_offset_state],
        outputs=[learning_plan_output, skills_offset_state, load_more_btn],
    )

ui.launch()