zlf18 committed on
Commit
1d05fa5
·
verified ·
1 Parent(s): 2975425

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -37
app.py CHANGED
@@ -11,23 +11,38 @@ from sklearn.metrics.pairwise import cosine_similarity
11
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
12
  from nltk.stem import PorterStemmer
13
  import gradio as gr
14
- import os # New import for file path checking
15
- from tqdm import tqdm # New import for progress bars
16
 
17
- # Initialize tqdm for pandas
18
  tqdm.pandas()
19
 
20
- # --- CORRECTED: Download necessary NLTK data ---
21
- for package in ['words', 'stopwords', 'punkt']: # Removed unused NLTK packages
22
  try:
23
- nltk.data.find(f'corpora/{package}' if package in ['words', 'stopwords'] else f'tokenizers/{package}')
24
  except LookupError:
25
  nltk.download(package)
26
- # ------------------------------------------------
27
 
28
  STOPWORDS = set(stopwords.words('english'))
29
  stemmer = PorterStemmer()
30
- # NOTE: The hardcoded EXTENDED_JUNK_PHRASES set has been removed.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  # --- GLOBAL STATE & DATA ---
33
  original_df = None
@@ -88,9 +103,7 @@ def initialize_llm_client():
88
  model_llm = AutoModelForCausalLM.from_pretrained(
89
  LLM_MODEL_NAME, torch_dtype="auto", device_map="auto", trust_remote_code=True
90
  )
91
- LLM_PIPELINE = pipeline(
92
- "text-generation", model=model_llm, tokenizer=tokenizer
93
- )
94
  return True
95
  except Exception as e:
96
  print(f"🚨 ERROR initializing local LLM: {e}")
@@ -150,7 +163,6 @@ def score_jobs_by_skills(user_tokens: list[str], df_to_rank: pd.DataFrame) -> pd
150
  ranked_df = ranked_df.sort_values(by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]).reset_index(drop=True)
151
  return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
152
 
153
- # --- COMPLETELY REWRITTEN INITIALIZATION FUNCTION ---
154
  def initialize_data_and_model():
155
  global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
156
 
@@ -158,35 +170,35 @@ def initialize_data_and_model():
158
 
159
  print("--- Initializing LLM Client ---")
160
  if not initialize_llm_client():
161
- print("Warning: LLM Client failed to initialize. Skill extraction will be skipped.")
162
 
163
- # --- Caching Logic: Check for pre-processed file ---
164
  if os.path.exists(PROCESSED_DATA_PATH):
165
  print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
166
  original_df = pd.read_parquet(PROCESSED_DATA_PATH)
167
  else:
168
  print("--- No pre-processed data found. Starting one-time processing... ---")
169
- print("--- This will be slow on the first run but fast on subsequent runs. ---")
170
 
171
  ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
172
  original_df = ds["original"].to_pandas()
173
 
174
- # --- NEW LLM-based skill extraction function ---
175
- def extract_skills_with_llm(text: str) -> list[str]:
176
- if not isinstance(text, str) or len(text.strip()) < 20:
177
- return []
178
- if not LLM_PIPELINE:
179
  return []
180
-
181
  prompt = f"""
182
- Instruct: You are an expert technical recruiter analyzing a job description. Extract the key skills required for the role.
183
- - Identify both technical skills (e.g., 'Python', 'React', 'AWS', 'machine learning') and important soft skills (e.g., 'leadership', 'project management').
184
- - Do not include generic phrases like 'bachelor's degree' or 'years of experience' as skills.
185
- - List the extracted skills as a single, comma-separated string. Do not use bullet points or any other formatting.
 
186
 
187
- Job Description Text:
188
- "{text}"
 
189
 
 
 
190
  Extracted Skills:
191
  """
192
  try:
@@ -194,25 +206,52 @@ Extracted Skills:
194
  generated_text = response[0]['generated_text']
195
  skills_part = generated_text.split("Extracted Skills:")[-1].strip()
196
  skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
197
- return list(dict.fromkeys(s.lower() for s in skills)) # Return unique skills
198
- except Exception as e:
199
- print(f"Error during LLM skill extraction: {e}")
200
  return []
201
-
202
- # Combine relevant text fields for better context
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  def create_text_for_skills(row):
204
  return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])
205
 
206
  original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
207
 
208
- print("--- Extracting skills using the LLM. Please wait... ---")
209
- original_df['Skills'] = original_df['text_for_skills'].progress_apply(extract_skills_with_llm)
210
  original_df = original_df.drop(columns=['text_for_skills'])
211
 
212
  print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
213
  original_df.to_parquet(PROCESSED_DATA_PATH)
214
 
215
- # --- Continue with the rest of the data processing using the loaded/created `original_df` ---
216
  original_df['job_id'] = original_df.index
217
  def create_full_text(row):
218
  return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
@@ -242,7 +281,6 @@ def _course_links_for(skill: str) -> str:
242
  return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
243
 
244
  # --- GRADIO INTERFACE FUNCTIONS (No changes needed below this line) ---
245
-
246
  def get_job_matches(dream_job: str, top_n: int, skills_text: str):
247
  status = "Searching using hybrid model..."
248
  expanded_desc = llm_expand_query(dream_job)
@@ -320,7 +358,7 @@ def on_select_job(job_id, skills_text):
320
  job_skills = row.get("Skills", [])
321
 
322
  if not job_skills:
323
- learning_plan_html = "<p><i>No specific skills were extracted for this job.</i></p>"
324
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
325
 
326
  all_missing_skills = sorted([s for s in job_skills if not any(_skill_match(ut, s) for ut in user_skills)], key=lambda x: x.lower())
 
11
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
12
  from nltk.stem import PorterStemmer
13
  import gradio as gr
14
+ import os
15
+ from tqdm import tqdm
16
 
 
17
  tqdm.pandas()
18
 
19
+ # --- NLTK Data Download ---
20
+ for package in ['words', 'stopwords', 'averaged_perceptron_tagger', 'punkt']:
21
  try:
22
+ nltk.data.find(f'corpora/{package}' if package in ['words', 'stopwords'] else f'taggers/{package}' if package == 'averaged_perceptron_tagger' else f'tokenizers/{package}')
23
  except LookupError:
24
  nltk.download(package)
 
25
 
26
  STOPWORDS = set(stopwords.words('english'))
27
  stemmer = PorterStemmer()
28
+
29
+ # --- NEW: Curated Skill Whitelist for NLTK Fallback Accuracy ---
30
+ SKILL_WHITELIST = {
31
+ 'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue',
32
+ 'nodejs', 'django', 'flask', 'fastapi', 'spring boot', 'ruby on rails', 'php', 'swift', 'kotlin', 'dart', 'flutter',
33
+ 'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'pandas', 'numpy', 'matplotlib',
34
+ 'natural language processing', 'nlp', 'computer vision', 'data analysis', 'data science', 'data engineering',
35
+ 'big data', 'spark', 'hadoop', 'kafka', 'data visualization', 'tableau', 'power bi', 'd3.js', 'statistics',
36
+ 'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes', 'terraform', 'ansible', 'ci/cd', 'jenkins',
37
+ 'git', 'github', 'devops', 'linux', 'unix', 'shell scripting', 'powershell', 'cybersecurity', 'penetration testing',
38
+ 'network security', 'cryptography', 'blockchain', 'agile', 'scrum', 'project management', 'product management',
39
+ 'leadership', 'stakeholder communication', 'client communication', 'teamwork', 'collaboration', 'problem solving',
40
+ 'critical thinking', 'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design', 'autocad', 'solidworks',
41
+ 'c#', '.net', 'sql server', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch', 'api design', 'rest apis',
42
+ 'graphql', 'microservices', 'serverless', 'system design', 'saas', 'sales', 'marketing', 'seo', 'sem', 'content writing',
43
+ 'customer support', 'technical writing', 'sap', 'oracle', 'financial analysis', 'budgeting', 'mentoring', 'supervising'
44
+ }
45
+ # -----------------------------------------------------------------
46
 
47
  # --- GLOBAL STATE & DATA ---
48
  original_df = None
 
103
  model_llm = AutoModelForCausalLM.from_pretrained(
104
  LLM_MODEL_NAME, torch_dtype="auto", device_map="auto", trust_remote_code=True
105
  )
106
+ LLM_PIPELINE = pipeline("text-generation", model=model_llm, tokenizer=tokenizer)
 
 
107
  return True
108
  except Exception as e:
109
  print(f"🚨 ERROR initializing local LLM: {e}")
 
163
  ranked_df = ranked_df.sort_values(by=['Skill Match Score', 'Similarity Score'], ascending=[False, False]).reset_index(drop=True)
164
  return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
165
 
 
166
  def initialize_data_and_model():
167
  global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
168
 
 
170
 
171
  print("--- Initializing LLM Client ---")
172
  if not initialize_llm_client():
173
+ print("Warning: LLM Client failed to initialize. Will use NLTK only for skills.")
174
 
 
175
  if os.path.exists(PROCESSED_DATA_PATH):
176
  print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
177
  original_df = pd.read_parquet(PROCESSED_DATA_PATH)
178
  else:
179
  print("--- No pre-processed data found. Starting one-time processing... ---")
 
180
 
181
  ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
182
  original_df = ds["original"].to_pandas()
183
 
184
+ # --- Method 1: LLM-based extraction with FEW-SHOT PROMPT ---
185
+ def extract_skills_llm(text: str) -> list[str]:
186
+ if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE:
 
 
187
  return []
188
+
189
  prompt = f"""
190
+ Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
191
+
192
+ [Example 1]
193
+ Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."
194
+ Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
195
 
196
+ [Example 2]
197
+ Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."
198
+ Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
199
 
200
+ [Actual Task]
201
+ Text: "{text}"
202
  Extracted Skills:
203
  """
204
  try:
 
206
  generated_text = response[0]['generated_text']
207
  skills_part = generated_text.split("Extracted Skills:")[-1].strip()
208
  skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
209
+ return list(dict.fromkeys(s.lower() for s in skills))
210
+ except Exception:
 
211
  return []
212
+
213
+ # --- Method 2: NLTK fallback with SKILL WHITELIST validation ---
214
+ def extract_skills_nltk(text: str) -> list[str]:
215
+ if not isinstance(text, str): return []
216
+ text_lower = text.lower()
217
+ grammar = "NP: {<JJ.*>*<NN.*>+}"
218
+ chunk_parser = nltk.RegexpParser(grammar)
219
+ tokens = nltk.word_tokenize(text_lower)
220
+ tagged_tokens = nltk.pos_tag(tokens)
221
+ chunked_text = chunk_parser.parse(tagged_tokens)
222
+
223
+ potential_skills = set()
224
+ for subtree in chunked_text.subtrees():
225
+ if subtree.label() == 'NP':
226
+ phrase = " ".join(word for word, tag in subtree.leaves())
227
+ normalized_phrase = _norm_skill_token(phrase)
228
+ # The key change: only add the phrase if it's in our known skill list
229
+ if normalized_phrase in SKILL_WHITELIST:
230
+ potential_skills.add(normalized_phrase)
231
+ return sorted(list(potential_skills))
232
+
233
+ # --- Hybrid Orchestrator: MERGE LLM and NLTK results for best coverage ---
234
+ def extract_skills_hybrid(text: str) -> list[str]:
235
+ llm_skills = extract_skills_llm(text)
236
+ nltk_skills = extract_skills_nltk(text)
237
+
238
+ # Combine the results and remove duplicates
239
+ combined_skills = set(llm_skills) | set(nltk_skills)
240
+ return sorted(list(combined_skills))
241
+
242
  def create_text_for_skills(row):
243
  return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])
244
 
245
  original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
246
 
247
+ print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
248
+ original_df['Skills'] = original_df['text_for_skills'].progress_apply(extract_skills_hybrid)
249
  original_df = original_df.drop(columns=['text_for_skills'])
250
 
251
  print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
252
  original_df.to_parquet(PROCESSED_DATA_PATH)
253
 
254
+ # --- Continue with the rest of the data processing ---
255
  original_df['job_id'] = original_df.index
256
  def create_full_text(row):
257
  return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
 
281
  return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
282
 
283
  # --- GRADIO INTERFACE FUNCTIONS (No changes needed below this line) ---
 
284
  def get_job_matches(dream_job: str, top_n: int, skills_text: str):
285
  status = "Searching using hybrid model..."
286
  expanded_desc = llm_expand_query(dream_job)
 
358
  job_skills = row.get("Skills", [])
359
 
360
  if not job_skills:
361
+ learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
362
  return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
363
 
364
  all_missing_skills = sorted([s for s in job_skills if not any(_skill_match(ut, s) for ut in user_skills)], key=lambda x: x.lower())