scmlewis committed on
Commit
49d861a
·
verified ·
1 Parent(s): 7ee3a57

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -117
app.py CHANGED
@@ -299,10 +299,10 @@ def classify_and_summarize_batch(resume, job_description, _bert_tokenized, _t5_i
299
  bert_tokenized = {k: v.to(device) for k, v in _bert_tokenized.items()}
300
  with torch.no_grad():
301
  # BERT inference
302
- elapsed_time = time.time() - start_time
303
- if elapsed_time > timeout:
304
- raise TimeoutError("BERT inference timed out")
305
  outputs = bert_model(**bert_tokenized)
 
 
306
 
307
  logits = outputs.logits
308
  probabilities = torch.softmax(logits, dim=1).cpu().numpy()
@@ -313,9 +313,7 @@ def classify_and_summarize_batch(resume, job_description, _bert_tokenized, _t5_i
313
  t5_tokenized = {k: v.to(device) for k, v in _t5_tokenized.items()}
314
  with torch.no_grad():
315
  # T5 inference
316
- elapsed_time = time.time() - start_time
317
- if elapsed_time > timeout:
318
- raise TimeoutError("T5 inference timed out")
319
  t5_outputs = t5_model.generate(
320
  t5_tokenized['input_ids'],
321
  attention_mask=t5_tokenized['attention_mask'],
@@ -326,6 +324,8 @@ def classify_and_summarize_batch(resume, job_description, _bert_tokenized, _t5_i
326
  length_penalty=3.0,
327
  early_stopping=True
328
  )
 
 
329
  summaries = [t5_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True) for output in t5_outputs]
330
  summaries = [re.sub(r'\s+', ' ', summary).strip() for summary in summaries]
331
 
@@ -382,77 +382,38 @@ def classify_and_summarize_batch(resume, job_description, _bert_tokenized, _t5_i
382
  "Inference Time": time.time() - start_time
383
  }
384
 
385
- def render_sidebar():
386
- """Render sidebar content."""
387
- with st.sidebar:
388
- st.markdown("""
389
- <h1 style='text-align: center; font-size: 32px; margin-bottom: 10px;'>📄 Resume Screening Assistant for Databricks</h1>
390
- <p style='text-align: center; font-size: 16px; margin-top: 0;'>
391
- Welcome to our AI-powered resume screening tool, specialized for data science and tech roles! This app evaluates multiple resumes against a single job description, providing suitability classifications, skill summaries, and a skill frequency visualization.
392
- </p>
393
- """, unsafe_allow_html=True)
394
-
395
- # Persist expander states
396
- if 'expander1' not in st.session_state:
397
- st.session_state.expander1 = True
398
- if 'expander2' not in st.session_state:
399
- st.session_state.expander2 = False
400
- if 'expander3' not in st.session_state:
401
- st.session_state.expander3 = False
402
- if 'expander4' not in st.session_state:
403
- st.session_state.expander4 = False
404
-
405
- with st.expander("How to Use the App", expanded=st.session_state.expander1):
406
- st.session_state.expander1 = True
407
- st.markdown("""
408
- - Enter up to 5 candidate resumes in the text boxes below, listing data/tech skills and experience (e.g., "Expert in python, databricks, 6 years experience").
409
- - Enter the job description, specifying required skills and experience (e.g., "Data engineer requires python, spark, 5 years+").
410
- - Click the "Analyze" button to evaluate all non-empty resumes (at least one resume required).
411
- - Use the "Add Resume" or "Remove Resume" buttons to adjust the number of resume fields (1-5).
412
- - Use the "Reset" button to clear all inputs and results.
413
- - Results can be downloaded as a CSV file for record-keeping.
414
- - View the skill frequency pie chart to see the distribution of skills across resumes.
415
- """)
416
-
417
- with st.expander("Example Test Cases", expanded=st.session_state.expander2):
418
- st.session_state.expander2 = True
419
- st.markdown("""
420
- - **Test Case 1**:
421
- - Resume 1: "Expert in python, machine learning, tableau, 4 years experience"
422
- - Resume 2: "Skilled in sql, pandas, 2 years experience"
423
- - Resume 3: "Proficient in java, python, 5 years experience"
424
- - Job Description: "Data scientist requires python, machine learning, 3 years+"
425
- - **Test Case 2**:
426
- - Resume 1: "Skilled in databricks, spark, python, 6 years experience"
427
- - Resume 2: "Expert in sql, tableau, business intelligence, 3 years experience"
428
- - Resume 3: "Proficient in rust, langchain, 2 years experience"
429
- - Job Description: "Data engineer requires python, spark, 5 years+"
430
- """)
431
-
432
- with st.expander("Guidelines", expanded=st.session_state.expander3):
433
- st.session_state.expander3 = True
434
- st.markdown("""
435
- - Use comma-separated skills from a comprehensive list including python, sql, databricks, etc. (79 skills supported, see Project Report for full list).
436
- - Include experience in years (e.g., "3 years experience" or "1 year experience") or as "senior".
437
- - Focus on data/tech skills for accurate summarization.
438
- - Resumes with only irrelevant skills (e.g., sales, marketing) will be classified as "Irrelevant".
439
- """)
440
-
441
- with st.expander("Classification Criteria", expanded=st.session_state.expander4):
442
- st.session_state.expander4 = True
443
- st.markdown("""
444
- Resumes are classified based on:
445
- - **Skill Overlap**: The resume's data/tech skills are compared to the job's requirements. A skill overlap below 40% results in an "Irrelevant" classification.
446
- - **Model Confidence**: A finetuned BERT model evaluates skill relevance. If confidence is below 85%, the classification is "Uncertain".
447
- - **Experience Match**: The resume's experience (in years or seniority) must meet or exceed the job's requirement.
448
-
449
- **Outcomes**:
450
- - **Relevant**: Skill overlap ≥ 50%, sufficient experience, and high model confidence (≥ 85%).
451
- - **Irrelevant**: Skill overlap < 40% or high confidence in low skill relevance.
452
- - **Uncertain**: Skill overlap ≥ 50% but experience mismatch (e.g., resume has 2 years, job requires 5 years+), or low model confidence (< 85%).
453
-
454
- **Note**: An experience mismatch warning is shown if the resume's experience is below the job's requirement, overriding the skill overlap and confidence to classify as Uncertain.
455
- """)
456
 
457
  def main():
458
  """Main function to run the Streamlit app for resume screening."""
@@ -518,10 +479,11 @@ def main():
518
  placeholder="e.g., 'Data engineer requires python, spark, 5 years+'"
519
  )
520
 
521
- # Analyze button with loading spinner
522
  if st.button("Analyze"):
523
  with st.spinner("Analyzing resumes... This may take a moment depending on server load."):
524
  start_time = time.time()
 
525
  resumes = tuple(resume.strip() for resume in st.session_state.resumes[:num_resumes]) # Use tuple for cache stability
526
  job_description = st.session_state.job_description.strip()
527
 
@@ -542,6 +504,9 @@ def main():
542
  job_skills_set = extract_skills(job_description)
543
  results = []
544
  for i, resume in enumerate(valid_resumes):
 
 
 
545
  st.write(f"Processing {resume[:50]}...") # Log progress
546
  bert_tokenized, t5_inputs, t5_tokenized = tokenize_inputs([resume], job_description)
547
  result = classify_and_summarize_batch(resume, job_description, bert_tokenized, t5_inputs[0], t5_tokenized, job_skills_set)
@@ -557,42 +522,4 @@ def main():
557
 
558
  st.session_state.total_analyze_time = time.time() - start_time
559
  # Detailed timing logs
560
- st.write(f"Total Analyze Time: {st.session_state.total_analyze_time:.2f} seconds")
561
- st.write(f"Model Load Time: {getattr(st.session_state, 'load_models_time', 0):.2f} seconds")
562
- st.write(f"Tokenize Time: {getattr(st.session_state, 'tokenize_time', 0):.2f} seconds")
563
- st.write(f"Extract Skills Time: {getattr(st.session_state, 'extract_skills_time', 0):.2f} seconds")
564
- if st.session_state.results:
565
- for idx, result in enumerate(st.session_state.results):
566
- st.write(f"Inference Time for {result['Resume']}: {result['Inference Time']:.2f} seconds")
567
- st.write(f"Pie Chart Time: {getattr(st.session_state, 'pie_chart_time', 0):.2f} seconds")
568
-
569
- # Performance note
570
- if st.session_state.total_analyze_time > 60:
571
- st.warning("The runtime is longer than expected due to server load on Hugging Face Spaces. For a smoother experience, consider testing locally or deploying on a different platform (e.g., Streamlit Community Cloud or a personal server).")
572
-
573
- # Display results
574
- if st.session_state.results:
575
- with st.container():
576
- st.subheader("Results")
577
- df = pd.DataFrame(st.session_state.results)
578
- df = df[["Resume", "Suitability", "Data/Tech Related Skills Summary", "Warning"]] # Exclude Inference Time from display
579
- st.dataframe(df, use_container_width=True)
580
-
581
- csv = df.to_csv(index=False)
582
- st.download_button(
583
- label="Download Results as CSV",
584
- data=csv,
585
- file_name="resume_screening_results.csv",
586
- mime="text/csv",
587
- )
588
-
589
- # Display pie chart
590
- if st.session_state.pie_chart:
591
- with st.container():
592
- st.subheader("Skill Frequency Across Resumes")
593
- st.pyplot(st.session_state.pie_chart)
594
- elif st.session_state.results and not st.session_state.pie_chart:
595
- st.warning("No recognized data/tech skills found in the resumes to generate a pie chart.")
596
-
597
- if __name__ == "__main__":
598
- main()
 
299
  bert_tokenized = {k: v.to(device) for k, v in _bert_tokenized.items()}
300
  with torch.no_grad():
301
  # BERT inference
302
+ bert_start = time.time()
 
 
303
  outputs = bert_model(**bert_tokenized)
304
+ if time.time() - bert_start > timeout:
305
+ raise TimeoutError("BERT inference timed out")
306
 
307
  logits = outputs.logits
308
  probabilities = torch.softmax(logits, dim=1).cpu().numpy()
 
313
  t5_tokenized = {k: v.to(device) for k, v in _t5_tokenized.items()}
314
  with torch.no_grad():
315
  # T5 inference
316
+ t5_start = time.time()
 
 
317
  t5_outputs = t5_model.generate(
318
  t5_tokenized['input_ids'],
319
  attention_mask=t5_tokenized['attention_mask'],
 
324
  length_penalty=3.0,
325
  early_stopping=True
326
  )
327
+ if time.time() - t5_start > timeout:
328
+ raise TimeoutError("T5 inference timed out")
329
  summaries = [t5_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True) for output in t5_outputs]
330
  summaries = [re.sub(r'\s+', ' ', summary).strip() for summary in summaries]
331
 
 
382
  "Inference Time": time.time() - start_time
383
  }
384
 
385
+ @st.cache_data
386
+ def generate_skill_pie_chart(resumes):
387
+ """Generate a pie chart of skill frequency across resumes."""
388
+ start_time = time.time()
389
+ skill_counts = {}
390
+ total_resumes = len([r for r in resumes if r.strip()])
391
+
392
+ if total_resumes == 0:
393
+ return None
394
+
395
+ for resume in resumes:
396
+ if resume.strip():
397
+ resume_lower = normalize_text(resume)
398
+ resume_lower = re.sub(r'[,_-]', ' ', resume_lower)
399
+ found_skills = skills_pattern.findall(resume_lower)
400
+ for skill in found_skills:
401
+ skill_counts[skill] = skill_counts.get(skill, 0) + 1
402
+
403
+ if not skill_counts:
404
+ return None
405
+
406
+ labels = list(skill_counts.keys())
407
+ sizes = [(count / sum(skill_counts.values())) * 100 for count in skill_counts.values()]
408
+
409
+ fig, ax = plt.subplots(figsize=(6, 4))
410
+ colors = plt.cm.Blues(np.linspace(0.4, 0.8, len(labels)))
411
+ ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors, textprops={'fontsize': 10})
412
+ ax.axis('equal')
413
+ plt.title("Skill Frequency Across Resumes", fontsize=12, color='#FF3621', pad=10)
414
+
415
+ st.session_state.pie_chart_time = time.time() - start_time
416
+ return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
 
418
  def main():
419
  """Main function to run the Streamlit app for resume screening."""
 
479
  placeholder="e.g., 'Data engineer requires python, spark, 5 years+'"
480
  )
481
 
482
+ # Analyze button with loading spinner and global timeout
483
  if st.button("Analyze"):
484
  with st.spinner("Analyzing resumes... This may take a moment depending on server load."):
485
  start_time = time.time()
486
+ global_timeout = 180 # Global timeout of 3 minutes for all resumes
487
  resumes = tuple(resume.strip() for resume in st.session_state.resumes[:num_resumes]) # Use tuple for cache stability
488
  job_description = st.session_state.job_description.strip()
489
 
 
504
  job_skills_set = extract_skills(job_description)
505
  results = []
506
  for i, resume in enumerate(valid_resumes):
507
+ if time.time() - start_time > global_timeout:
508
+ st.error("Analysis timed out after 3 minutes. Please try again or deploy on a different platform.")
509
+ break
510
  st.write(f"Processing {resume[:50]}...") # Log progress
511
  bert_tokenized, t5_inputs, t5_tokenized = tokenize_inputs([resume], job_description)
512
  result = classify_and_summarize_batch(resume, job_description, bert_tokenized, t5_inputs[0], t5_tokenized, job_skills_set)
 
522
 
523
  st.session_state.total_analyze_time = time.time() - start_time
524
  # Detailed timing logs
525
+ st.write(f"Total Analyze Time: {st.session_state.total_analyze_time:.2f}