scmlewis committed on
Commit
4299b38
·
verified ·
1 Parent(s): a6a34a9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -136
app.py CHANGED
@@ -93,36 +93,6 @@ def extract_text_from_file(uploaded_file):
93
  st.error("Unsupported file format. Please upload a PDF or Word (.docx) document.")
94
  return ""
95
 
96
- # Helper functions for preprocessing
97
- def preprocess_resume(text):
98
- """Extract relevant sections (Skills, Experience) from resume text to reduce input length."""
99
- text = text.lower()
100
- # Define patterns for sections
101
- skills_section = ""
102
- experience_section = ""
103
-
104
- # Extract Skills section
105
- skills_match = re.search(r'(skills|technical skills|key skills)(.*?)(experience|education|projects|$)', text, re.DOTALL | re.IGNORECASE)
106
- if skills_match:
107
- skills_section = skills_match.group(2).strip()
108
-
109
- # Extract Experience section
110
- experience_match = re.search(r'(experience|work experience|professional experience)(.*?)(education|projects|$)', text, re.DOTALL | re.IGNORECASE)
111
- if experience_match:
112
- experience_section = experience_match.group(2).strip()
113
-
114
- # Combine relevant sections
115
- processed_text = f"{skills_section}\n{experience_section}".strip()
116
- if not processed_text:
117
- # Fallback to original text if sections not found
118
- processed_text = text
119
-
120
- # Limit length to 500 characters to reduce token count
121
- if len(processed_text) > 500:
122
- processed_text = processed_text[:500] + "..."
123
-
124
- return processed_text
125
-
126
  # Helper functions for analysis
127
  def normalize_text(text):
128
  text = text.lower()
@@ -180,14 +150,14 @@ def tokenize_inputs(resumes, job_description, _bert_tokenizer, _t5_tokenizer):
180
  """Precompute tokenized inputs for BERT and T5."""
181
  job_description_norm = normalize_text(job_description)
182
  bert_inputs = [f"resume: {normalize_text(resume)} [sep] job: {job_description_norm}" for resume in resumes]
183
- bert_tokenized = _bert_tokenizer(bert_inputs, return_tensors='pt', padding=True, truncation=True, max_length=128)
184
 
185
  t5_inputs = []
186
  for resume in resumes:
187
  prompt = re.sub(r'\b[Cc]\+\+\b', 'c++', resume)
188
  prompt_normalized = normalize_text(prompt)
189
  t5_inputs.append(f"summarize: {prompt_normalized}")
190
- t5_tokenized = _t5_tokenizer(t5_inputs, return_tensors='pt', padding=True, truncation=True, max_length=128)
191
 
192
  return bert_tokenized, t5_inputs, t5_tokenized
193
 
@@ -200,105 +170,94 @@ def extract_skills(text):
200
  return set(found_skills)
201
 
202
  @st.cache_data
203
- def classify_and_summarize_batch(resumes, job_description, _bert_tokenized, _t5_inputs, _t5_tokenized, _job_skills_set):
204
- """Process resumes in batches to balance speed and stability."""
205
- bert_tokenizer, bert_model, t5_tokenizer, t5_model, device = st.session_state.models
206
- timeout = 60 # Timeout per batch
207
- batch_size = 2 # Process 2 resumes at a time
208
 
209
- results = []
210
- num_resumes = len(resumes)
211
- for batch_start in range(0, num_resumes, batch_size):
212
- batch_end = min(batch_start + batch_size, num_resumes)
213
- batch_resumes = resumes[batch_start:batch_end]
214
- batch_bert_tokenized = {k: v[batch_start:batch_end].to(device) for k, v in _bert_tokenized.items()}
215
- batch_t5_inputs = _t5_inputs[batch_start:batch_end]
216
- batch_t5_tokenized = {k: v[batch_start:batch_end].to(device) for k, v in _t5_tokenized.items()}
217
 
218
- start_time = time.time()
219
- try:
220
- # BERT inference for the batch
221
- with torch.no_grad():
222
- bert_start = time.time()
223
- outputs = bert_model(**batch_bert_tokenized)
224
- if time.time() - bert_start > timeout:
225
- raise TimeoutError("BERT inference timed out")
226
-
227
- logits = outputs.logits
228
- probabilities = torch.softmax(logits, dim=1).cpu().numpy()
229
- predictions = np.argmax(probabilities, axis=1)
230
-
231
- confidence_threshold = 0.85
232
-
233
- # T5 inference for the batch
234
- with torch.no_grad():
235
- t5_start = time.time()
236
- t5_outputs = t5_model.generate(
237
- batch_t5_tokenized['input_ids'],
238
- attention_mask=batch_t5_tokenized['attention_mask'],
239
- max_length=30,
240
- min_length=8,
241
- num_beams=1, # Reduced for speed
242
- no_repeat_ngram_size=3,
243
- length_penalty=1.0, # Reduced for faster generation
244
- early_stopping=True
245
- )
246
- if time.time() - t5_start > timeout:
247
- raise TimeoutError("T5 inference timed out")
248
- summaries = [t5_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True) for output in t5_outputs]
249
- summaries = [re.sub(r'\s+', ' ', summary).strip() for summary in summaries]
250
-
251
- # Process each resume in the batch
252
- for i, (resume, prob, pred, summary, t5_input) in enumerate(zip(batch_resumes, probabilities, predictions, summaries, batch_t5_inputs)):
253
- resume_skills_set = extract_skills(resume)
254
- skill_overlap = len(_job_skills_set.intersection(resume_skills_set)) / len(_job_skills_set) if _job_skills_set else 0
255
-
256
- if skill_overlap < 0.4:
257
- suitability = "Irrelevant"
258
- warning = "Skills are irrelevant"
259
- else:
260
- exp_warning = check_experience_mismatch(resume, job_description)
261
- if exp_warning:
262
- suitability = "Uncertain"
263
- warning = exp_warning
264
- else:
265
- if prob[pred] < confidence_threshold:
266
- suitability = "Uncertain"
267
- warning = f"Low confidence: {prob[pred]:.4f}"
268
- else:
269
- suitability = "Relevant" if skill_overlap >= 0.5 else "Irrelevant"
270
- warning = "Skills are not a strong match" if suitability == "Irrelevant" else None
271
-
272
- skills = list(set(skills_pattern.findall(t5_input))) # Deduplicate skills
273
- exp_match = re.search(r'\d+\s*years?|senior', resume.lower())
274
- if skills and exp_match:
275
- summary = f"{', '.join(skills)} proficiency, {exp_match.group(0)} experience"
276
  else:
277
- summary = f"{exp_match.group(0) if exp_match else 'unknown'} experience"
278
-
279
- results.append({
280
- "Suitability": suitability,
281
- "Data/Tech Related Skills Summary": summary,
282
- "Warning": warning or "None"
283
- })
284
- except TimeoutError as e:
285
- st.warning(f"Skipped processing for batch due to timeout: {str(e)}")
286
- for resume in batch_resumes:
287
- results.append({
288
- "Suitability": "Error",
289
- "Data/Tech Related Skills Summary": "Processing timed out",
290
- "Warning": str(e)
291
- })
292
- except Exception as e:
293
- st.error(f"Error during inference for batch: {str(e)}")
294
- for resume in batch_resumes:
295
- results.append({
296
- "Suitability": "Error",
297
- "Data/Tech Related Skills Summary": "Failed to process",
298
- "Warning": str(e)
299
- })
300
-
301
- return results
 
 
 
 
 
 
302
 
303
  @st.cache_data
304
  def generate_skill_pie_chart(resumes):
@@ -335,7 +294,7 @@ def render_sidebar():
335
  st.markdown("""
336
  <h1 style='text-align: center; color: #007BFF; font-size: 32px; text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.1); margin-bottom: 10px;'>💻 Resume Screening Assistant for Data/Tech</h1>
337
  <p style='text-align: center; font-size: 16px; margin-top: 0;'>
338
- Welcome to our AI-powered resume screening tool, specialized for data science and tech roles! This app evaluates multiple resumes against a single job description to determine suitability, providing concise summaries of key data and tech skills and experience. Built with advanced natural language processing, it ensures accurate and efficient screening for technical positions. <br><br><strong>Note:</strong> Performance may vary due to server load on free CPU instances. For best results, consider upgrading to a paid plan on Streamlit Community Cloud.
339
  </p>
340
  """, unsafe_allow_html=True)
341
 
@@ -401,9 +360,7 @@ def main():
401
  if uploaded_file is not None:
402
  extracted_text = extract_text_from_file(uploaded_file)
403
  if extracted_text:
404
- # Preprocess the extracted text to reduce length
405
- processed_text = preprocess_resume(extracted_text)
406
- st.session_state.resumes[i] = processed_text
407
  else:
408
  st.session_state.resumes[i] = ""
409
 
@@ -495,10 +452,19 @@ def main():
495
  job_skills_set = extract_skills(job_description)
496
 
497
  status_text.text("Classifying and summarizing resumes...")
498
- results = classify_and_summarize_batch(valid_resumes, job_description, bert_tokenized, t5_inputs, t5_tokenized, job_skills_set)
499
-
500
- for i in range(len(results)):
501
- results[i]["Resume"] = f"Resume {i+1}"
 
 
 
 
 
 
 
 
 
502
  progress_bar.progress((i + 1) / total_steps)
503
 
504
  st.session_state.results = results
 
93
  st.error("Unsupported file format. Please upload a PDF or Word (.docx) document.")
94
  return ""
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  # Helper functions for analysis
97
  def normalize_text(text):
98
  text = text.lower()
 
150
  """Precompute tokenized inputs for BERT and T5."""
151
  job_description_norm = normalize_text(job_description)
152
  bert_inputs = [f"resume: {normalize_text(resume)} [sep] job: {job_description_norm}" for resume in resumes]
153
+ bert_tokenized = _bert_tokenizer(bert_inputs, return_tensors='pt', padding=True, truncation=True, max_length=64)
154
 
155
  t5_inputs = []
156
  for resume in resumes:
157
  prompt = re.sub(r'\b[Cc]\+\+\b', 'c++', resume)
158
  prompt_normalized = normalize_text(prompt)
159
  t5_inputs.append(f"summarize: {prompt_normalized}")
160
+ t5_tokenized = _t5_tokenizer(t5_inputs, return_tensors='pt', padding=True, truncation=True, max_length=64)
161
 
162
  return bert_tokenized, t5_inputs, t5_tokenized
163
 
 
170
  return set(found_skills)
171
 
172
  @st.cache_data
173
+ def classify_and_summarize_batch(resume, job_description, _bert_tokenized, _t5_input, _t5_tokenized, _job_skills_set):
174
+ """Process one resume at a time to reduce CPU load with a timeout."""
175
+ _, bert_model, t5_tokenizer, t5_model, device = st.session_state.models
176
+ start_time = time.time()
177
+ timeout = 60 # Timeout after 60 seconds
178
 
179
+ try:
180
+ bert_tokenized = {k: v.to(device) for k, v in _bert_tokenized.items()}
181
+ with torch.no_grad():
182
+ # BERT inference
183
+ bert_start = time.time()
184
+ outputs = bert_model(**bert_tokenized)
185
+ if time.time() - bert_start > timeout:
186
+ raise TimeoutError("BERT inference timed out")
187
 
188
+ logits = outputs.logits
189
+ probabilities = torch.softmax(logits, dim=1).cpu().numpy()
190
+ predictions = np.argmax(probabilities, axis=1)
191
+
192
+ confidence_threshold = 0.85
193
+
194
+ t5_tokenized = {k: v.to(device) for k, v in _t5_tokenized.items()}
195
+ with torch.no_grad():
196
+ # T5 inference
197
+ t5_start = time.time()
198
+ t5_outputs = t5_model.generate(
199
+ t5_tokenized['input_ids'],
200
+ attention_mask=t5_tokenized['attention_mask'],
201
+ max_length=30,
202
+ min_length=8,
203
+ num_beams=2,
204
+ no_repeat_ngram_size=3,
205
+ length_penalty=2.0,
206
+ early_stopping=True
207
+ )
208
+ if time.time() - t5_start > timeout:
209
+ raise TimeoutError("T5 inference timed out")
210
+ summaries = [t5_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True) for output in t5_outputs]
211
+ summaries = [re.sub(r'\s+', ' ', summary).strip() for summary in summaries]
212
+
213
+ prob, pred, summary, t5_input = probabilities[0], predictions[0], summaries[0], _t5_input
214
+ resume_skills_set = extract_skills(resume)
215
+ skill_overlap = len(_job_skills_set.intersection(resume_skills_set)) / len(_job_skills_set) if _job_skills_set else 0
216
+
217
+ if skill_overlap < 0.4:
218
+ suitability = "Irrelevant"
219
+ warning = "Skills are irrelevant"
220
+ else:
221
+ exp_warning = check_experience_mismatch(resume, job_description)
222
+ if exp_warning:
223
+ suitability = "Uncertain"
224
+ warning = exp_warning
225
+ else:
226
+ if prob[pred] < confidence_threshold:
227
+ suitability = "Uncertain"
228
+ warning = f"Low confidence: {prob[pred]:.4f}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  else:
230
+ suitability = "Relevant" if skill_overlap >= 0.5 else "Irrelevant"
231
+ warning = "Skills are not a strong match" if suitability == "Irrelevant" else None
232
+
233
+ skills = list(set(skills_pattern.findall(t5_input))) # Deduplicate skills
234
+ exp_match = re.search(r'\d+\s*years?|senior', resume.lower())
235
+ if skills and exp_match:
236
+ summary = f"{', '.join(skills)} proficiency, {exp_match.group(0)} experience"
237
+ else:
238
+ summary = f"{exp_match.group(0) if exp_match else 'unknown'} experience"
239
+
240
+ result = {
241
+ "Suitability": suitability,
242
+ "Data/Tech Related Skills Summary": summary,
243
+ "Warning": warning or "None"
244
+ }
245
+
246
+ return result
247
+ except TimeoutError as e:
248
+ st.warning(f"Skipped processing for resume due to timeout: {str(e)}")
249
+ return {
250
+ "Suitability": "Error",
251
+ "Data/Tech Related Skills Summary": "Processing timed out",
252
+ "Warning": str(e)
253
+ }
254
+ except Exception as e:
255
+ st.error(f"Error during inference for resume: {str(e)}")
256
+ return {
257
+ "Suitability": "Error",
258
+ "Data/Tech Related Skills Summary": "Failed to process",
259
+ "Warning": str(e)
260
+ }
261
 
262
  @st.cache_data
263
  def generate_skill_pie_chart(resumes):
 
294
  st.markdown("""
295
  <h1 style='text-align: center; color: #007BFF; font-size: 32px; text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.1); margin-bottom: 10px;'>💻 Resume Screening Assistant for Data/Tech</h1>
296
  <p style='text-align: center; font-size: 16px; margin-top: 0;'>
297
+ Welcome to our AI-powered resume screening tool, specialized for data science and tech roles! This app evaluates multiple resumes against a single job description to determine suitability, providing concise summaries of key data and tech skills and experience. Built with advanced natural language processing, it ensures accurate and efficient screening for technical positions. <br><br><strong>Note:</strong> Performance may vary due to server load on free CPU instances.
298
  </p>
299
  """, unsafe_allow_html=True)
300
 
 
360
  if uploaded_file is not None:
361
  extracted_text = extract_text_from_file(uploaded_file)
362
  if extracted_text:
363
+ st.session_state.resumes[i] = extracted_text
 
 
364
  else:
365
  st.session_state.resumes[i] = ""
366
 
 
452
  job_skills_set = extract_skills(job_description)
453
 
454
  status_text.text("Classifying and summarizing resumes...")
455
+ results = []
456
+ for i, (resume, bert_tok, t5_in, t5_tok) in enumerate(zip(valid_resumes, bert_tokenized['input_ids'], t5_inputs, t5_tokenized['input_ids'])):
457
+ status_text.text(f"Processing Resume {i+1}/{total_steps}: {resume[:50]}...")
458
+ result = classify_and_summarize_batch(
459
+ resume,
460
+ job_description,
461
+ {'input_ids': bert_tok.unsqueeze(0), 'attention_mask': bert_tokenized['attention_mask'][i].unsqueeze(0)},
462
+ t5_in,
463
+ {'input_ids': t5_tok.unsqueeze(0), 'attention_mask': t5_tokenized['attention_mask'][i].unsqueeze(0)},
464
+ job_skills_set
465
+ )
466
+ result["Resume"] = f"Resume {i+1}"
467
+ results.append(result)
468
  progress_bar.progress((i + 1) / total_steps)
469
 
470
  st.session_state.results = results