Update app.py
Browse files
app.py
CHANGED
|
@@ -196,44 +196,41 @@ def initialize_data_and_model():
|
|
| 196 |
ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
|
| 197 |
original_df = ds["original"].to_pandas()
|
| 198 |
|
| 199 |
-
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
prompt = f"""
|
| 202 |
-
Instruct: You are
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
|
|
|
|
|
|
| 213 |
try:
|
| 214 |
response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False, temperature=0.1)
|
| 215 |
generated_text = response[0]['generated_text']
|
| 216 |
-
|
|
|
|
| 217 |
skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
|
|
|
|
| 218 |
return list(dict.fromkeys(s.lower() for s in skills))
|
| 219 |
-
except Exception
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
text_lower = text.lower()
|
| 224 |
-
grammar = "NP: {<JJ.*>*<NN.*>+}"
|
| 225 |
-
chunk_parser = nltk.RegexpParser(grammar)
|
| 226 |
-
tokens = nltk.word_tokenize(text_lower)
|
| 227 |
-
tagged_tokens = nltk.pos_tag(tokens)
|
| 228 |
-
chunked_text = chunk_parser.parse(tagged_tokens)
|
| 229 |
-
potential_skills = set()
|
| 230 |
-
for subtree in chunked_text.subtrees():
|
| 231 |
-
if subtree.label() == 'NP':
|
| 232 |
-
phrase = " ".join(word for word, tag in subtree.leaves())
|
| 233 |
-
if _norm_skill_token(phrase) in SKILL_WHITELIST:
|
| 234 |
-
potential_skills.add(_norm_skill_token(phrase))
|
| 235 |
-
return sorted(list(potential_skills))
|
| 236 |
-
|
| 237 |
def extract_skills_direct_scan(text: str) -> list[str]:
|
| 238 |
if not isinstance(text, str): return []
|
| 239 |
found_skills = set()
|
|
@@ -247,12 +244,12 @@ Extracted Skills:
|
|
| 247 |
|
| 248 |
skills_to_add = 6 - len(existing_skills)
|
| 249 |
prompt = f"""
|
| 250 |
-
Instruct: A job has the title "{job_title}" and requires the skills: {', '.join(existing_skills)}.
|
| 251 |
-
Based on this, what are {skills_to_add} additional, closely related skills typically required for such a role?
|
| 252 |
-
List only the new skills, separated by commas. Do not repeat skills from the original list.
|
| 253 |
|
| 254 |
-
Additional Skills:
|
| 255 |
-
"""
|
| 256 |
try:
|
| 257 |
response = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5)
|
| 258 |
generated_text = response[0]['generated_text']
|
|
@@ -262,31 +259,38 @@ Additional Skills:
|
|
| 262 |
except Exception:
|
| 263 |
return []
|
| 264 |
|
|
|
|
|
|
|
| 265 |
def extract_skills_hybrid(row) -> list[str]:
|
| 266 |
-
text
|
| 267 |
-
job_title = row.get('Job title', '')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
|
|
|
| 273 |
|
| 274 |
-
#
|
|
|
|
|
|
|
|
|
|
| 275 |
if len(combined_skills) < 6:
|
| 276 |
expanded_skills = expand_skills_with_llm(job_title, list(combined_skills))
|
| 277 |
combined_skills.update(expanded_skills)
|
| 278 |
|
| 279 |
return sorted(list(combined_skills))
|
| 280 |
|
| 281 |
-
def create_text_for_skills(row):
    """Join a row's non-missing job text fields into one space-separated string.

    Reads "Job title", "Duties", "qualifications" and "Description" through
    ``row.get`` (in that order), drops every value for which ``pd.notna`` is
    false, converts the rest with ``str`` and joins them with single spaces.
    """
    field_names = ("Job title", "Duties", "qualifications", "Description")
    kept_parts = []
    for field_name in field_names:
        value = row.get(field_name)
        if pd.notna(value):
            kept_parts.append(str(value))
    return " ".join(kept_parts)
|
| 283 |
-
|
| 284 |
-
original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
|
| 285 |
print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
|
| 286 |
# Apply the hybrid function row-wise to include job title context
|
| 287 |
original_df['Skills'] = original_df.progress_apply(extract_skills_hybrid, axis=1)
|
| 288 |
-
|
| 289 |
-
|
| 290 |
print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
|
| 291 |
original_df.to_parquet(PROCESSED_DATA_PATH)
|
| 292 |
|
|
|
|
| 196 |
ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
|
| 197 |
original_df = ds["original"].to_pandas()
|
| 198 |
|
| 199 |
+
# --- NEW: Advanced LLM Skill Extractor ---
|
| 200 |
+
# This new function uses a much more detailed prompt to get niche, specific skills.
|
| 201 |
+
def extract_skills_llm_advanced(job_title: str, duties: str, qualifications: str) -> list[str]:
    """Ask the LLM for a comma-separated skill list and parse its answer.

    A prompt is built from the job title, duties and qualifications and sent
    to ``LLM_PIPELINE``. The answer is taken from the text that follows the
    last "[Extracted Skills]" marker in the generated text.

    Args:
        job_title: Job title inserted into the prompt context.
        duties: Duties text inserted into the prompt context.
        qualifications: Qualifications text inserted into the prompt context.

    Returns:
        Lower-cased skills, de-duplicated in first-seen order. An empty list
        when ``LLM_PIPELINE`` is falsy or when the call/parsing raises (the
        error is printed, not re-raised, so one bad row cannot abort a batch).
    """
    if not LLM_PIPELINE:
        return []

    # Combine the most important fields to give the LLM full context.
    full_context = f"Job Title: {job_title}\n\nDuties: {duties}\n\nQualifications: {qualifications}"

    # This prompt is deliberately specific to encourage niche, concrete results.
    prompt = f"""
Instruct: You are a highly specialized technical recruiter and hiring manager. Your task is to meticulously extract a comprehensive list of the most critical and specific skills from the provided job description, paying special attention to the 'qualifications' and 'duties' sections.

Identify specific programming languages, software tools (e.g., AutoCAD, Figma, SAP), cloud technologies (e.g., AWS S3, Azure DevOps), data analysis tools (e.g., Tableau, Power BI), engineering concepts, and industry standards (e.g., ISO 13485, GMP).

Avoid overly generic soft skills like 'teamwork' or 'communication' unless they are explicitly emphasized as a core requirement. Prioritize tangible, niche competencies that truly define the role.

Return a single, comma-separated string of the extracted skills. Do not add any preamble or explanation.

[Job Description Context]
{full_context}

[Extracted Skills]
"""
    try:
        # Greedy decoding only: `temperature` has no effect when
        # do_sample=False, so it is not passed (avoids a conflicting-flags
        # warning from the generation config).
        response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False)
        generated_text = response[0]['generated_text']

        # The generated text may echo the prompt, so keep only what follows
        # the final marker.
        answer = generated_text.split("[Extracted Skills]")[-1]

        # The prompt asks for a single comma-separated line. Parse just the
        # first non-empty line so any text the model keeps generating after
        # its answer is not glued onto the last skill.
        first_line = next(
            (line.strip() for line in answer.splitlines() if line.strip()),
            "",
        )
        skills = [skill.strip() for skill in first_line.split(',') if skill.strip()]

        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(s.lower() for s in skills))
    except Exception as e:
        print(f"LLM skill extraction failed: {e}")
        return []
|
| 233 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
def extract_skills_direct_scan(text: str) -> list[str]:
|
| 235 |
if not isinstance(text, str): return []
|
| 236 |
found_skills = set()
|
|
|
|
| 244 |
|
| 245 |
skills_to_add = 6 - len(existing_skills)
|
| 246 |
prompt = f"""
|
| 247 |
+
Instruct: A job has the title "{job_title}" and requires the skills: {', '.join(existing_skills)}.
|
| 248 |
+
Based on this, what are {skills_to_add} additional, closely related skills typically required for such a role?
|
| 249 |
+
List only the new skills, separated by commas. Do not repeat skills from the original list.
|
| 250 |
|
| 251 |
+
Additional Skills:
|
| 252 |
+
"""
|
| 253 |
try:
|
| 254 |
response = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5)
|
| 255 |
generated_text = response[0]['generated_text']
|
|
|
|
| 259 |
except Exception:
|
| 260 |
return []
|
| 261 |
|
| 262 |
+
# --- MODIFIED: Hybrid Skill Extraction Logic ---
|
| 263 |
+
# This function is now simpler and more powerful. It prioritizes the advanced LLM extractor.
|
| 264 |
def extract_skills_hybrid(row) -> list[str]:
    """Extract a sorted skill list for one job row using LLM + direct scan.

    The row's 'Job title', 'Duties', 'qualifications' and 'Description'
    fields are read via ``row.get``. Skills from
    ``extract_skills_llm_advanced`` and ``extract_skills_direct_scan`` are
    merged; if fewer than 6 distinct skills result,
    ``expand_skills_with_llm`` is asked for more.

    Args:
        row: Mapping-like object exposing ``.get`` (presumably a pandas
            Series, since this is used with ``progress_apply(axis=1)`` —
            confirm against the caller).

    Returns:
        The distinct skills, sorted alphabetically.
    """

    def _field_text(key: str) -> str:
        # Missing values must become "" — a plain str() would turn None into
        # "None" and NaN into "nan", and those tokens would leak into the
        # LLM prompt and the scanned text.
        value = row.get(key, '')
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    # Extract the relevant text fields from the row.
    job_title = _field_text('Job title')
    duties = _field_text('Duties')
    qualifications = _field_text('qualifications')
    description = _field_text('Description')

    # The full text (including the description) is used for the direct scan.
    full_text_for_scan = " ".join([job_title, duties, qualifications, description])

    # Primary method: the advanced LLM extractor for niche, specific skills.
    advanced_llm_skills = extract_skills_llm_advanced(job_title, duties, qualifications)

    # Secondary method: a direct scan as a backup for common skills.
    direct_skills = extract_skills_direct_scan(full_text_for_scan)

    # Plain set union: both sources contribute equally and duplicates collapse.
    combined_skills = set(advanced_llm_skills) | set(direct_skills)

    # If the combined set is still too short, use the LLM to expand it.
    if len(combined_skills) < 6:
        expanded_skills = expand_skills_with_llm(job_title, list(combined_skills))
        combined_skills.update(expanded_skills)

    return sorted(combined_skills)
|
| 289 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
|
| 291 |
# Apply the hybrid function row-wise to include job title context
|
| 292 |
original_df['Skills'] = original_df.progress_apply(extract_skills_hybrid, axis=1)
|
| 293 |
+
|
|
|
|
| 294 |
print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
|
| 295 |
original_df.to_parquet(PROCESSED_DATA_PATH)
|
| 296 |
|