zlf18 committed on
Commit
ac17735
·
verified ·
1 Parent(s): df56c68

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -50
app.py CHANGED
@@ -196,44 +196,41 @@ def initialize_data_and_model():
196
  ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
197
  original_df = ds["original"].to_pandas()
198
 
199
- def extract_skills_llm(text: str) -> list[str]:
200
- if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE: return []
 
 
 
 
 
 
 
201
  prompt = f"""
202
- Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
203
- [Example 1]
204
- Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."
205
- Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
206
- [Example 2]
207
- Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."
208
- Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
209
- [Actual Task]
210
- Text: "{text}"
211
- Extracted Skills:
212
- """
 
 
213
  try:
214
  response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False, temperature=0.1)
215
  generated_text = response[0]['generated_text']
216
- skills_part = generated_text.split("Extracted Skills:")[-1].strip()
 
217
  skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
 
218
  return list(dict.fromkeys(s.lower() for s in skills))
219
- except Exception: return []
220
-
221
- def extract_skills_nltk(text: str) -> list[str]:
222
- if not isinstance(text, str): return []
223
- text_lower = text.lower()
224
- grammar = "NP: {<JJ.*>*<NN.*>+}"
225
- chunk_parser = nltk.RegexpParser(grammar)
226
- tokens = nltk.word_tokenize(text_lower)
227
- tagged_tokens = nltk.pos_tag(tokens)
228
- chunked_text = chunk_parser.parse(tagged_tokens)
229
- potential_skills = set()
230
- for subtree in chunked_text.subtrees():
231
- if subtree.label() == 'NP':
232
- phrase = " ".join(word for word, tag in subtree.leaves())
233
- if _norm_skill_token(phrase) in SKILL_WHITELIST:
234
- potential_skills.add(_norm_skill_token(phrase))
235
- return sorted(list(potential_skills))
236
-
237
  def extract_skills_direct_scan(text: str) -> list[str]:
238
  if not isinstance(text, str): return []
239
  found_skills = set()
@@ -247,12 +244,12 @@ Extracted Skills:
247
 
248
  skills_to_add = 6 - len(existing_skills)
249
  prompt = f"""
250
- Instruct: A job has the title "{job_title}" and requires the skills: {', '.join(existing_skills)}.
251
- Based on this, what are {skills_to_add} additional, closely related skills typically required for such a role?
252
- List only the new skills, separated by commas. Do not repeat skills from the original list.
253
 
254
- Additional Skills:
255
- """
256
  try:
257
  response = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5)
258
  generated_text = response[0]['generated_text']
@@ -262,31 +259,38 @@ Additional Skills:
262
  except Exception:
263
  return []
264
 
 
 
265
  def extract_skills_hybrid(row) -> list[str]:
266
- text = row['text_for_skills']
267
- job_title = row.get('Job title', '') # Use original Job title for context
 
 
 
 
 
 
268
 
269
- llm_skills = extract_skills_llm(text)
270
- nltk_skills = extract_skills_nltk(text)
271
- direct_skills = extract_skills_direct_scan(text)
272
- combined_skills = set(llm_skills) | set(nltk_skills) | set(direct_skills)
 
273
 
274
- # If the combined list is still too short, expand it
 
 
 
275
  if len(combined_skills) < 6:
276
  expanded_skills = expand_skills_with_llm(job_title, list(combined_skills))
277
  combined_skills.update(expanded_skills)
278
 
279
  return sorted(list(combined_skills))
280
 
281
- def create_text_for_skills(row):
282
- return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])
283
-
284
- original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
285
  print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
286
  # Apply the hybrid function row-wise to include job title context
287
  original_df['Skills'] = original_df.progress_apply(extract_skills_hybrid, axis=1)
288
- original_df = original_df.drop(columns=['text_for_skills'])
289
-
290
  print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
291
  original_df.to_parquet(PROCESSED_DATA_PATH)
292
 
 
196
  ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
197
  original_df = ds["original"].to_pandas()
198
 
199
# --- NEW: Advanced LLM Skill Extractor ---
# This new function uses a much more detailed prompt to get niche, specific skills.
def extract_skills_llm_advanced(job_title: str, duties: str, qualifications: str) -> list[str]:
    """Extract specific, niche skills from a job posting via the LLM pipeline.

    Parameters
    ----------
    job_title, duties, qualifications : str
        Raw text fields from the job posting; combined into a single prompt
        context so the model sees the full picture.

    Returns
    -------
    list[str]
        Lowercased, de-duplicated skills with first-seen order preserved.
        Empty list when the pipeline is unavailable or generation fails.
    """
    if not LLM_PIPELINE:
        return []

    # We combine the most important fields to give the LLM full context.
    full_context = f"Job Title: {job_title}\n\nDuties: {duties}\n\nQualifications: {qualifications}"

    # This prompt is highly specific to encourage better, more niche results.
    prompt = f"""
Instruct: You are a highly specialized technical recruiter and hiring manager. Your task is to meticulously extract a comprehensive list of the most critical and specific skills from the provided job description, paying special attention to the 'qualifications' and 'duties' sections.

Identify specific programming languages, software tools (e.g., AutoCAD, Figma, SAP), cloud technologies (e.g., AWS S3, Azure DevOps), data analysis tools (e.g., Tableau, Power BI), engineering concepts, and industry standards (e.g., ISO 13485, GMP).

Avoid overly generic soft skills like 'teamwork' or 'communication' unless they are explicitly emphasized as a core requirement. Prioritize tangible, niche competencies that truly define the role.

Return a single, comma-separated string of the extracted skills. Do not add any preamble or explanation.

[Job Description Context]
{full_context}

[Extracted Skills]
"""
    try:
        # FIX: do_sample=False selects greedy decoding, under which a
        # temperature value is ignored (and transformers warns about the
        # contradictory combination), so the temperature kwarg is dropped.
        # Generated text is unchanged.
        response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False)
        generated_text = response[0]['generated_text']
        # Robustly find the skills part after the final indicator
        skills_part = generated_text.split("[Extracted Skills]")[-1].strip()
        skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
        # Return a de-duplicated list, preserving order as much as possible
        return list(dict.fromkeys(s.lower() for s in skills))
    except Exception as e:
        # Best-effort: log and degrade to "no skills" rather than crash the
        # row-wise dataset pass.
        print(f"LLM skill extraction failed: {e}")
        return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  def extract_skills_direct_scan(text: str) -> list[str]:
235
  if not isinstance(text, str): return []
236
  found_skills = set()
 
244
 
245
  skills_to_add = 6 - len(existing_skills)
246
  prompt = f"""
247
+ Instruct: A job has the title "{job_title}" and requires the skills: {', '.join(existing_skills)}.
248
+ Based on this, what are {skills_to_add} additional, closely related skills typically required for such a role?
249
+ List only the new skills, separated by commas. Do not repeat skills from the original list.
250
 
251
+ Additional Skills:
252
+ """
253
  try:
254
  response = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5)
255
  generated_text = response[0]['generated_text']
 
259
  except Exception:
260
  return []
261
 
262
# --- MODIFIED: Hybrid Skill Extraction Logic ---
# This function is now simpler and more powerful. It prioritizes the advanced LLM extractor.
def extract_skills_hybrid(row) -> list[str]:
    """Build the final skill list for one dataset row.

    The advanced LLM extractor provides the primary, high-quality signal;
    a direct keyword scan over all text fields serves as a fast backup.
    If fewer than six skills are found overall, the LLM is asked to
    suggest closely related additions. Returns an alphabetically sorted,
    de-duplicated list of skills.
    """
    # Pull the relevant text fields from the row (missing keys become '').
    # Insertion order matters: it fixes the concatenation order below.
    fields = {name: str(row.get(name, '')) for name in
              ('Job title', 'Duties', 'qualifications', 'Description')}
    title = fields['Job title']

    # 🎯 Primary method: high-quality, niche skills straight from the LLM.
    llm_found = extract_skills_llm_advanced(title, fields['Duties'], fields['qualifications'])

    # 🛡️ Secondary method: direct scan of the concatenated text as a
    # fast and reliable backup for common skills.
    scan_found = extract_skills_direct_scan(" ".join(fields.values()))

    # Merge both sources, giving priority to the LLM's findings.
    merged = set(llm_found).union(scan_found)

    # Still too sparse? Ask the LLM to expand with closely related skills.
    if len(merged) < 6:
        merged.update(expand_skills_with_llm(title, list(merged)))

    return sorted(merged)
289
 
 
 
 
 
290
  print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
291
  # Apply the hybrid function row-wise to include job title context
292
  original_df['Skills'] = original_df.progress_apply(extract_skills_hybrid, axis=1)
293
+
 
294
  print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
295
  original_df.to_parquet(PROCESSED_DATA_PATH)
296