scmlewis committed on
Commit
5fd9f8e
·
verified ·
1 Parent(s): f6d04de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +286 -162
app.py CHANGED
@@ -15,9 +15,10 @@ import time
15
  # Set page config as the first Streamlit command
16
  st.set_page_config(page_title="Resume Screening Assistant for Data/Tech", page_icon="πŸ“„", layout="wide")
17
 
18
- # Set sidebar width and make uncollapsible
19
  st.markdown("""
20
  <style>
 
21
  .css-1d391kg { /* Sidebar */
22
  width: 350px !important;
23
  }
@@ -28,14 +29,34 @@ st.markdown("""
28
  min-width: 350px !important;
29
  visibility: visible !important;
30
  }
 
31
  [data-testid="stExpander"] summary { /* Expander headers */
32
  font-size: 26px !important;
33
  font-weight: bold !important;
34
  text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.1) !important;
35
  white-space: nowrap !important;
36
  }
37
- .st-expander-content p { /* Expander body text */
38
- font-size: 12px !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  }
40
  </style>
41
  """, unsafe_allow_html=True)
@@ -59,7 +80,7 @@ skills_list = [
59
  # Precompile regex for skills matching (optimized for single pass)
60
  skills_pattern = re.compile(r'\b(' + '|'.join(re.escape(skill) for skill in skills_list) + r')\b', re.IGNORECASE)
61
 
62
- # Helper functions for CV parsing
63
  def extract_text_from_pdf(file):
64
  try:
65
  pdf_reader = PyPDF2.PdfReader(file)
@@ -90,10 +111,10 @@ def extract_text_from_file(uploaded_file):
90
  elif uploaded_file.name.endswith('.docx'):
91
  return extract_text_from_docx(uploaded_file)
92
  else:
93
- st.error("Unsupported file format. Please upload a PDF or Word (.docx) document.")
94
  return ""
95
 
96
- # Helper functions for analysis
97
  def normalize_text(text):
98
  text = text.lower()
99
  # Remove underscores, hyphens, and specific phrases, replacing with empty string
@@ -101,24 +122,28 @@ def normalize_text(text):
101
  return text
102
 
103
  def check_experience_mismatch(resume, job_description):
 
104
  resume_match = re.search(r'(\d+)\s*years?|senior', resume.lower())
105
- # Allow optional words like "experience" between "years" and "+"
106
  job_match = re.search(r'(\d+)\s*years?(?:\s+\w+)*\+|senior\+', job_description.lower())
107
  if resume_match and job_match:
108
- resume_years = resume_match.group(0)
109
- job_years = job_match.group(0)
 
110
  # Handle resume years
111
- if 'senior' in resume_years:
112
  resume_num = 10
113
  else:
114
  resume_num = int(resume_match.group(1))
 
115
  # Handle job years
116
- if 'senior+' in job_years:
117
  job_num = 10
118
  else:
119
  job_num = int(job_match.group(1))
 
120
  if resume_num < job_num:
121
- return f"Experience mismatch: Resume has {resume_years}, job requires {job_years}"
122
  return None
123
 
124
  def validate_input(text, is_resume=True):
@@ -150,13 +175,16 @@ def tokenize_inputs(resumes, job_description, _bert_tokenizer, _t5_tokenizer):
150
  """Precompute tokenized inputs for BERT and T5."""
151
  job_description_norm = normalize_text(job_description)
152
  bert_inputs = [f"resume: {normalize_text(resume)} [sep] job: {job_description_norm}" for resume in resumes]
 
153
  bert_tokenized = _bert_tokenizer(bert_inputs, return_tensors='pt', padding=True, truncation=True, max_length=64)
154
 
155
  t5_inputs = []
156
  for resume in resumes:
 
157
  prompt = re.sub(r'\b[Cc]\+\+\b', 'c++', resume)
158
  prompt_normalized = normalize_text(prompt)
159
  t5_inputs.append(f"summarize: {prompt_normalized}")
 
160
  t5_tokenized = _t5_tokenizer(t5_inputs, return_tensors='pt', padding=True, truncation=True, max_length=64)
161
 
162
  return bert_tokenized, t5_inputs, t5_tokenized
@@ -167,19 +195,19 @@ def extract_skills(text):
167
  text_normalized = normalize_text(text)
168
  text_normalized = re.sub(r'[,_-]', ' ', text_normalized)
169
  found_skills = skills_pattern.findall(text_normalized)
170
- return set(found_skills)
171
 
172
  @st.cache_data
173
- def classify_and_summarize_batch(resume, job_description, _bert_tokenized, _t5_input, _t5_tokenized, _job_skills_set):
174
  """Process one resume at a time to reduce CPU load with a timeout."""
 
175
  _, bert_model, t5_tokenizer, t5_model, device = st.session_state.models
176
- start_time = time.time()
177
  timeout = 60 # Timeout after 60 seconds
178
 
179
  try:
180
- bert_tokenized = {k: v.to(device) for k, v in _bert_tokenized.items()}
 
181
  with torch.no_grad():
182
- # BERT inference
183
  bert_start = time.time()
184
  outputs = bert_model(**bert_tokenized)
185
  if time.time() - bert_start > timeout:
@@ -190,10 +218,11 @@ def classify_and_summarize_batch(resume, job_description, _bert_tokenized, _t5_i
190
  predictions = np.argmax(probabilities, axis=1)
191
 
192
  confidence_threshold = 0.85
193
-
194
- t5_tokenized = {k: v.to(device) for k, v in _t5_tokenized.items()}
 
 
195
  with torch.no_grad():
196
- # T5 inference
197
  t5_start = time.time()
198
  t5_outputs = t5_model.generate(
199
  t5_tokenized['input_ids'],
@@ -208,39 +237,44 @@ def classify_and_summarize_batch(resume, job_description, _bert_tokenized, _t5_i
208
  if time.time() - t5_start > timeout:
209
  raise TimeoutError("T5 inference timed out")
210
  summaries = [t5_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True) for output in t5_outputs]
211
- summaries = [re.sub(r'\s+', ' ', summary).strip() for summary in summaries]
212
 
213
- prob, pred, summary, t5_input = probabilities[0], predictions[0], summaries[0], _t5_input
214
  resume_skills_set = extract_skills(resume)
215
  skill_overlap = len(_job_skills_set.intersection(resume_skills_set)) / len(_job_skills_set) if _job_skills_set else 0
216
 
 
 
 
 
217
  if skill_overlap < 0.4:
218
  suitability = "Irrelevant"
219
- warning = "Skills are irrelevant"
220
- else:
221
- exp_warning = check_experience_mismatch(resume, job_description)
222
- if exp_warning:
223
- suitability = "Uncertain"
224
- warning = exp_warning
225
- else:
226
- if prob[pred] < confidence_threshold:
227
- suitability = "Uncertain"
228
- warning = f"Low confidence: {prob[pred]:.4f}"
229
- else:
230
- suitability = "Relevant" if skill_overlap >= 0.5 else "Irrelevant"
231
- warning = "Skills are not a strong match" if suitability == "Irrelevant" else None
232
 
233
- skills = list(set(skills_pattern.findall(t5_input))) # Deduplicate skills
 
234
  exp_match = re.search(r'\d+\s*years?|senior', resume.lower())
235
- if skills and exp_match:
236
- summary = f"{', '.join(skills)} proficiency, {exp_match.group(0)} experience"
 
 
 
237
  else:
238
- summary = f"{exp_match.group(0) if exp_match else 'unknown'} experience"
239
 
240
  result = {
241
  "Suitability": suitability,
242
- "Data/Tech Related Skills Summary": summary,
243
- "Warning": warning or "None"
244
  }
245
 
246
  return result
@@ -273,19 +307,42 @@ def generate_skill_pie_chart(resumes):
273
  resume_lower = normalize_text(resume)
274
  found_skills = skills_pattern.findall(resume_lower)
275
  for skill in found_skills:
276
- skill_counts[skill] = skill_counts.get(skill, 0) + 1
277
 
 
278
  if not skill_counts:
279
  return None
280
 
281
- labels = list(skill_counts.keys())
282
- sizes = [(count / sum(skill_counts.values())) * 100 for count in skill_counts.values()]
 
 
 
 
 
 
 
 
 
 
 
283
 
284
  fig, ax = plt.subplots(figsize=(6, 4))
285
- colors = plt.cm.Blues(np.linspace(0.4, 0.8, len(labels)))
286
- ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors, textprops={'fontsize': 10})
287
- ax.axis('equal')
288
- plt.title("Skill Frequency Across Resumes", fontsize=12, color='#007BFF', pad=10)
 
 
 
 
 
 
 
 
 
 
 
289
  return fig
290
 
291
  def render_sidebar():
@@ -301,37 +358,24 @@ def render_sidebar():
301
  with st.expander("πŸ“‹ How to Use the App", expanded=True):
302
  st.markdown("""
303
  **Instructions**:
304
- - Upload a PDF or Word (.docx) CV or manually enter up to 5 candidate resumes in the text boxes, listing data/tech skills and experience (e.g., "Expert in python, databricks, 6 years experience").
305
- - Enter the job description, specifying required skills and experience (e.g., "Data engineer requires python, spark, 5 years+").
306
- - Click **Analyze** to evaluate all non-empty resumes (at least one required).
307
  - Use **Add Resume** or **Remove Resume** to adjust the number of resume fields (1–5).
308
- - Use the **Reset** button to clear all inputs and results.
309
- - Download results as a CSV file for record-keeping.
310
- - View the skill frequency pie chart to see skill distribution across resumes.
311
- - Example test cases:
312
- - **Test Case 1**: Resumes like "Expert in python, machine learning, tableau, 4 years experience" against "Data scientist requires python, machine learning, 3 years+".
313
- - **Test Case 2**: Resumes like "Skilled in databricks, spark, python, 6 years experience" against "Data engineer requires python, spark, 5 years+".
314
-
315
- **Guidelines**:
316
- - Use comma-separated skills from a comprehensive list including python, sql, databricks, etc. (79 skills supported).
317
- - Include experience in years (e.g., "3 years experience" or "1 year experience") or as "senior".
318
- - Focus on data/tech skills for accurate summarization.
319
- - Resumes with only irrelevant skills (e.g., sales, marketing) will be classified as "Irrelevant".
320
- - If uploading a CV, ensure it’s a text-based PDF or Word document (scanned PDFs may not work).
321
  """)
322
  with st.expander("ℹ️ Classification Criteria", expanded=True):
323
  st.markdown("""
324
  The app classifies resumes based on:
325
- - **Skill Overlap**: The resume’s data/tech skills are compared to the job’s requirements. A skill overlap below 40% results in an "Irrelevant" classification.
326
- - **Model Confidence**: A finetuned BERT model evaluates skill relevance. If confidence is below 85%, the classification is "Uncertain".
327
- - **Experience Match**: The resume’s experience (in years or seniority) must meet or exceed the job’s requirement.
328
 
329
  **Outcomes**:
330
- - **Relevant**: Skill overlap β‰₯ 50%, sufficient experience, and high model confidence (β‰₯85%).
331
- - **Irrelevant**: Skill overlap < 40% or high confidence in low skill relevance.
332
- - **Uncertain**: Skill overlap β‰₯ 50% but experience mismatch (e.g., resume has 2 years, job requires 5 years+), or low model confidence (<85%).
333
-
334
- **Note**: An experience mismatch warning is shown if the resume’s experience is below the job’s requirement, overriding the skill overlap and confidence to classify as Uncertain.
335
  """)
336
 
337
  def main():
@@ -341,6 +385,7 @@ def main():
341
 
342
  # Initialize session state
343
  if 'resumes' not in st.session_state:
 
344
  st.session_state.resumes = ["Expert in python, machine learning, tableau, 4 years experience", "", ""]
345
  if 'input_job_description' not in st.session_state:
346
  st.session_state.input_job_description = "Data scientist requires python, machine learning, 3 years+"
@@ -351,85 +396,118 @@ def main():
351
  if 'models' not in st.session_state:
352
  st.session_state.models = None
353
 
354
- # Resume inputs with file upload and manual text input
355
- st.markdown("### πŸ“ Enter Resumes")
356
- for i in range(len(st.session_state.resumes)):
357
- st.markdown(f"**Resume {i+1}**")
358
- uploaded_file = st.file_uploader(f"Upload CV (PDF or Word) for Resume {i+1}", type=['pdf', 'docx'], key=f"file_upload_{i}")
 
 
 
 
359
 
360
- if uploaded_file is not None:
361
- extracted_text = extract_text_from_file(uploaded_file)
362
- if extracted_text:
363
- st.session_state.resumes[i] = extracted_text
364
- else:
365
- st.session_state.resumes[i] = ""
366
-
367
- st.session_state.resumes[i] = st.text_area(
368
- f"Enter or edit resume text",
369
- value=st.session_state.resumes[i],
370
- height=100,
371
- key=f"resume_{i}",
372
- placeholder="e.g., Expert in python, sql, 3 years experience"
373
  )
374
- validation_error = validate_input(st.session_state.resumes[i], is_resume=True)
375
- if validation_error and st.session_state.resumes[i].strip():
376
- st.warning(f"Resume {i+1}: {validation_error}")
377
-
378
- # Add/Remove resume buttons
379
- col_add, col_remove, _ = st.columns([1, 1, 3])
380
- with col_add:
381
- if st.button("Add Resume") and len(st.session_state.resumes) < 5:
382
- st.session_state.resumes.append("")
383
- st.rerun()
384
- with col_remove:
385
- if st.button("Remove Resume") and len(st.session_state.resumes) > 1:
386
- st.session_state.resumes.pop()
387
- st.rerun()
388
-
389
- # Job description input
390
- st.markdown("### πŸ“‹ Enter Job Description")
391
- job_description = st.text_area(
392
- "Job Description",
393
- value=st.session_state.input_job_description,
394
- height=100,
395
- key="job_description",
396
- placeholder="e.g., Data scientist requires python, sql, 3 years+"
397
- )
398
- validation_error = validate_input(job_description, is_resume=False)
399
- if validation_error and job_description.strip():
400
- st.warning(f"Job Description: {validation_error}")
401
-
402
- # Analyze and Reset buttons
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  col_btn1, col_btn2, _ = st.columns([1, 1, 3])
404
  with col_btn1:
405
- analyze_clicked = st.button("Analyze", type="primary")
406
  with col_btn2:
407
- reset_clicked = st.button("Reset")
408
-
 
409
  # Handle reset
410
  if reset_clicked:
411
  st.session_state.resumes = ["", "", ""]
412
  st.session_state.input_job_description = ""
413
  st.session_state.results = []
414
  st.session_state.valid_resumes = []
 
415
  st.rerun()
416
 
417
  # Handle analysis with early validation and lazy model loading
418
  if analyze_clicked:
419
  # Early validation of inputs
420
  valid_resumes = []
 
 
421
  for i, resume in enumerate(st.session_state.resumes):
422
  validation_error = validate_input(resume, is_resume=True)
423
  if not validation_error and resume.strip():
424
  valid_resumes.append(resume)
425
  elif validation_error and resume.strip():
426
- st.warning(f"Resume {i+1}: {validation_error}")
427
-
428
- validation_error = validate_input(job_description, is_resume=False)
429
- if validation_error and job_description.strip():
430
- st.warning(f"Job Description: {validation_error}")
 
 
431
 
432
- if valid_resumes and job_description.strip():
433
  # Load models only when needed
434
  if st.session_state.models is None:
435
  with st.spinner("Loading models, please wait..."):
@@ -442,25 +520,29 @@ def main():
442
  with st.spinner("Analyzing resumes..."):
443
  progress_bar = st.progress(0)
444
  status_text = st.empty()
445
- status_text.text("Preparing inputs...")
446
 
447
  # Retrieve tokenizers from st.session_state.models
448
- bert_tokenizer, bert_model, t5_tokenizer, t5_model, device = st.session_state.models
449
 
 
450
  # Precompute tokenized inputs and job skills
451
  bert_tokenized, t5_inputs, t5_tokenized = tokenize_inputs(valid_resumes, job_description, bert_tokenizer, t5_tokenizer)
452
  job_skills_set = extract_skills(job_description)
453
 
454
- status_text.text("Classifying and summarizing resumes...")
455
  results = []
456
- for i, (resume, bert_tok, t5_in, t5_tok) in enumerate(zip(valid_resumes, bert_tokenized['input_ids'], t5_inputs, t5_tokenized['input_ids'])):
457
- status_text.text(f"Processing Resume {i+1}/{total_steps}: {resume[:50]}...")
 
 
 
 
 
458
  result = classify_and_summarize_batch(
459
  resume,
460
  job_description,
461
- {'input_ids': bert_tok.unsqueeze(0), 'attention_mask': bert_tokenized['attention_mask'][i].unsqueeze(0)},
462
- t5_in,
463
- {'input_ids': t5_tok.unsqueeze(0), 'attention_mask': t5_tokenized['attention_mask'][i].unsqueeze(0)},
464
  job_skills_set
465
  )
466
  result["Resume"] = f"Resume {i+1}"
@@ -471,34 +553,76 @@ def main():
471
 
472
  status_text.empty()
473
  progress_bar.empty()
474
- st.success("Analysis completed! πŸŽ‰")
 
475
  else:
476
- st.error("Please enter at least one valid resume and a job description.")
477
 
478
- # Display results
479
- if st.session_state.results:
480
- st.markdown("### πŸ“Š Results")
481
- st.table(st.session_state.results)
482
-
483
- csv_buffer = io.StringIO()
484
- csv_buffer.write("Resume Number,Resume Text,Job Description,Suitability,Summary,Warning\n")
485
- for i, result in enumerate(st.session_state.results):
486
- resume_text = st.session_state.valid_resumes[i].replace('"', '""').replace('\n', ' ')
487
- job_text = job_description.replace('"', '""').replace('\n', ' ')
488
- csv_buffer.write(f'"{result["Resume"]}","{resume_text}","{job_text}","{result["Suitability"]}","{result["Data/Tech Related Skills Summary"]}","{result["Warning"]}"\n')
489
- st.download_button("Download Results", csv_buffer.getvalue(), file_name="resume_analysis.csv", mime="text/csv")
490
-
491
- with st.expander("πŸ“ˆ Skill Frequency Across Resumes", expanded=False):
492
- if st.session_state.valid_resumes:
493
- fig = generate_skill_pie_chart(st.session_state.valid_resumes)
494
- if fig:
495
- st.pyplot(fig)
496
- plt.close(fig)
497
- else:
498
- st.write("No recognized data/tech skills found in the resumes.")
499
- else:
500
- st.write("No valid resumes to analyze.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
 
502
  if __name__ == "__main__":
503
- # When this module is run directly, call the main function.
504
  main()
 
15
  # Set page config as the first Streamlit command
16
  st.set_page_config(page_title="Resume Screening Assistant for Data/Tech", page_icon="πŸ“„", layout="wide")
17
 
18
+ # --- CUSTOM CSS FOR UI/UX IMPROVEMENTS ---
19
  st.markdown("""
20
  <style>
21
+ /* 1. Sidebar Styling */
22
  .css-1d391kg { /* Sidebar */
23
  width: 350px !important;
24
  }
 
29
  min-width: 350px !important;
30
  visibility: visible !important;
31
  }
32
+ /* 2. Expander/Instructions Styling */
33
  [data-testid="stExpander"] summary { /* Expander headers */
34
  font-size: 26px !important;
35
  font-weight: bold !important;
36
  text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.1) !important;
37
  white-space: nowrap !important;
38
  }
39
+ .st-expander-content p { /* Expander body text - made slightly larger for readability */
40
+ font-size: 14px !important;
41
+ line-height: 1.6;
42
+ }
43
+ /* 3. Main Title Styling */
44
+ h1 {
45
+ text-align: center;
46
+ color: #007BFF;
47
+ font-size: 40px;
48
+ }
49
+ /* 4. Tab Styling */
50
+ .stTabs [data-baseweb="tab-list"] {
51
+ gap: 24px;
52
+ }
53
+ .stTabs [data-baseweb="tab"] {
54
+ height: 50px;
55
+ white-space: nowrap;
56
+ border-radius: 4px 4px 0 0;
57
+ gap: 1px;
58
+ padding-top: 10px;
59
+ padding-bottom: 10px;
60
  }
61
  </style>
62
  """, unsafe_allow_html=True)
 
80
  # Precompile regex for skills matching (optimized for single pass)
81
  skills_pattern = re.compile(r'\b(' + '|'.join(re.escape(skill) for skill in skills_list) + r')\b', re.IGNORECASE)
82
 
83
+ # --- Helper functions for CV parsing ---
84
  def extract_text_from_pdf(file):
85
  try:
86
  pdf_reader = PyPDF2.PdfReader(file)
 
111
  elif uploaded_file.name.endswith('.docx'):
112
  return extract_text_from_docx(uploaded_file)
113
  else:
114
+ # st.error is now handled by the calling logic if the text is empty
115
  return ""
116
 
117
+ # --- Helper functions for analysis ---
118
  def normalize_text(text):
119
  text = text.lower()
120
  # Remove underscores, hyphens, and specific phrases, replacing with empty string
 
122
  return text
123
 
124
  def check_experience_mismatch(resume, job_description):
125
+ # Search for year numbers or 'senior' in resume
126
  resume_match = re.search(r'(\d+)\s*years?|senior', resume.lower())
127
+ # Search for year numbers followed by '+' or 'senior+' in JD
128
  job_match = re.search(r'(\d+)\s*years?(?:\s+\w+)*\+|senior\+', job_description.lower())
129
  if resume_match and job_match:
130
+ resume_years_text = resume_match.group(0)
131
+ job_years_text = job_match.group(0)
132
+
133
  # Handle resume years
134
+ if 'senior' in resume_years_text:
135
  resume_num = 10
136
  else:
137
  resume_num = int(resume_match.group(1))
138
+
139
  # Handle job years
140
+ if 'senior+' in job_years_text:
141
  job_num = 10
142
  else:
143
  job_num = int(job_match.group(1))
144
+
145
  if resume_num < job_num:
146
+ return f"Experience mismatch: Resume has {resume_years_text.strip()}, job requires {job_years_text.strip()}"
147
  return None
148
 
149
  def validate_input(text, is_resume=True):
 
175
  """Precompute tokenized inputs for BERT and T5."""
176
  job_description_norm = normalize_text(job_description)
177
  bert_inputs = [f"resume: {normalize_text(resume)} [sep] job: {job_description_norm}" for resume in resumes]
178
+ # BERT tokens must be padded/truncated consistently
179
  bert_tokenized = _bert_tokenizer(bert_inputs, return_tensors='pt', padding=True, truncation=True, max_length=64)
180
 
181
  t5_inputs = []
182
  for resume in resumes:
183
+ # Prompt preparation for T5 summary
184
  prompt = re.sub(r'\b[Cc]\+\+\b', 'c++', resume)
185
  prompt_normalized = normalize_text(prompt)
186
  t5_inputs.append(f"summarize: {prompt_normalized}")
187
+ # T5 tokens must be padded/truncated consistently
188
  t5_tokenized = _t5_tokenizer(t5_inputs, return_tensors='pt', padding=True, truncation=True, max_length=64)
189
 
190
  return bert_tokenized, t5_inputs, t5_tokenized
 
195
  text_normalized = normalize_text(text)
196
  text_normalized = re.sub(r'[,_-]', ' ', text_normalized)
197
  found_skills = skills_pattern.findall(text_normalized)
198
+ return set(s.lower() for s in found_skills) # Ensure lower case for set intersection
199
 
200
  @st.cache_data
201
+ def classify_and_summarize_batch(resume, job_description, _bert_tok, _t5_input, _t5_tok, _job_skills_set):
202
  """Process one resume at a time to reduce CPU load with a timeout."""
203
+ # Note: We pass single-item dicts for inference to avoid re-tokenization outside of cache
204
  _, bert_model, t5_tokenizer, t5_model, device = st.session_state.models
 
205
  timeout = 60 # Timeout after 60 seconds
206
 
207
  try:
208
+ # --- BERT Inference (Classification) ---
209
+ bert_tokenized = {k: v.to(device) for k, v in _bert_tok.items()}
210
  with torch.no_grad():
 
211
  bert_start = time.time()
212
  outputs = bert_model(**bert_tokenized)
213
  if time.time() - bert_start > timeout:
 
218
  predictions = np.argmax(probabilities, axis=1)
219
 
220
  confidence_threshold = 0.85
221
+ prob, pred = probabilities[0], predictions[0]
222
+
223
+ # --- T5 Inference (Summarization) ---
224
+ t5_tokenized = {k: v.to(device) for k, v in _t5_tok.items()}
225
  with torch.no_grad():
 
226
  t5_start = time.time()
227
  t5_outputs = t5_model.generate(
228
  t5_tokenized['input_ids'],
 
237
  if time.time() - t5_start > timeout:
238
  raise TimeoutError("T5 inference timed out")
239
  summaries = [t5_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True) for output in t5_outputs]
240
+ summary = re.sub(r'\s+', ' ', summaries[0]).strip()
241
 
242
+ # --- Suitability Logic ---
243
  resume_skills_set = extract_skills(resume)
244
  skill_overlap = len(_job_skills_set.intersection(resume_skills_set)) / len(_job_skills_set) if _job_skills_set else 0
245
 
246
+ suitability = "Relevant"
247
+ warning = "None"
248
+ exp_warning = check_experience_mismatch(resume, job_description)
249
+
250
  if skill_overlap < 0.4:
251
  suitability = "Irrelevant"
252
+ warning = "Low skill overlap (<40%) with job requirements"
253
+ elif exp_warning:
254
+ suitability = "Uncertain"
255
+ warning = exp_warning
256
+ elif prob[pred] < confidence_threshold:
257
+ suitability = "Uncertain"
258
+ warning = f"Low model confidence: {prob[pred]:.2f}"
259
+ elif skill_overlap < 0.5:
260
+ suitability = "Irrelevant"
261
+ warning = "Skill overlap is acceptable but not a strong match (<50%)"
 
 
 
262
 
263
+ # --- Final Summary Formatting (Override T5 for clarity) ---
264
+ detected_skills = list(set(skills_pattern.findall(normalize_text(resume)))) # Deduplicate skills
265
  exp_match = re.search(r'\d+\s*years?|senior', resume.lower())
266
+
267
+ if detected_skills and exp_match:
268
+ final_summary = f"{', '.join(detected_skills)} proficiency, {exp_match.group(0)} experience"
269
+ elif detected_skills:
270
+ final_summary = f"{', '.join(detected_skills)} proficiency"
271
  else:
272
+ final_summary = f"{exp_match.group(0) if exp_match else 'unknown'} experience"
273
 
274
  result = {
275
  "Suitability": suitability,
276
+ "Data/Tech Related Skills Summary": final_summary,
277
+ "Warning": warning
278
  }
279
 
280
  return result
 
307
  resume_lower = normalize_text(resume)
308
  found_skills = skills_pattern.findall(resume_lower)
309
  for skill in found_skills:
310
+ skill_counts[skill.lower()] = skill_counts.get(skill.lower(), 0) + 1
311
 
312
+ # Filter for top N skills (e.g., top 8)
313
  if not skill_counts:
314
  return None
315
 
316
+ sorted_skills = sorted(skill_counts.items(), key=lambda item: item[1], reverse=True)
317
+ top_n = 8
318
+
319
+ # Aggregate "Other" skills
320
+ if len(sorted_skills) > top_n:
321
+ top_skills = dict(sorted_skills[:top_n-1])
322
+ other_count = sum(count for _, count in sorted_skills[top_n-1:])
323
+ top_skills["Other"] = other_count
324
+ else:
325
+ top_skills = dict(sorted_skills)
326
+
327
+ labels = list(top_skills.keys())
328
+ sizes = [(count / sum(top_skills.values())) * 100 for count in top_skills.values()]
329
 
330
  fig, ax = plt.subplots(figsize=(6, 4))
331
+ # Use a visually appealing color map
332
+ colors = plt.cm.tab20(np.linspace(0, 1, len(labels)))
333
+
334
+ # Draw pie chart with a shadow for depth
335
+ wedges, texts, autotexts = ax.pie(
336
+ sizes,
337
+ labels=labels,
338
+ autopct='%1.1f%%',
339
+ startangle=90,
340
+ colors=colors,
341
+ textprops={'fontsize': 10}
342
+ )
343
+ ax.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
344
+
345
+ plt.title("Top Skill Frequency Across Resumes", fontsize=12, color='#007BFF', pad=10)
346
  return fig
347
 
348
  def render_sidebar():
 
358
  with st.expander("πŸ“‹ How to Use the App", expanded=True):
359
  st.markdown("""
360
  **Instructions**:
361
+ - Use the **Setup** tab to input the Job Description.
362
+ - Use the **Resumes** tab to upload or paste up to 5 resumes, including skills and experience (e.g., "Expert in python, databricks, 6 years experience").
363
+ - Click **Run Analysis** to evaluate all valid resumes.
364
  - Use **Add Resume** or **Remove Resume** to adjust the number of resume fields (1–5).
365
+ - Use **Reset All** to clear all inputs and results.
366
+ - View the detailed table, download results, and check the skill frequency chart in the **Results** tab.
 
 
 
 
 
 
 
 
 
 
 
367
  """)
368
  with st.expander("ℹ️ Classification Criteria", expanded=True):
369
  st.markdown("""
370
  The app classifies resumes based on:
371
+ - **Skill Overlap**: Resume skills vs. Job skills. Overlap must be $\geq 40\%$.
372
+ - **Experience Match**: Resume's experience must meet or exceed the job's stated requirement (e.g., '5 years+').
373
+ - **Model Confidence**: BERT classification confidence $\geq 85\%$.
374
 
375
  **Outcomes**:
376
+ - **Relevant**: High skill overlap ($\geq 50\%$), sufficient experience, and high confidence ($\geq 85\%$).
377
+ - **Irrelevant**: Skill overlap $< 40\%$ or acceptable overlap but high confidence in low relevance.
378
+ - **Uncertain**: Experience mismatch, or model confidence $< 85\%$.
 
 
379
  """)
380
 
381
  def main():
 
385
 
386
  # Initialize session state
387
  if 'resumes' not in st.session_state:
388
+ # Start with a good example and two empty slots
389
  st.session_state.resumes = ["Expert in python, machine learning, tableau, 4 years experience", "", ""]
390
  if 'input_job_description' not in st.session_state:
391
  st.session_state.input_job_description = "Data scientist requires python, machine learning, 3 years+"
 
396
  if 'models' not in st.session_state:
397
  st.session_state.models = None
398
 
399
+ st.markdown("<h1>πŸ“„ AI-Powered Resume Screening</h1>", unsafe_allow_html=True)
400
+
401
+ # 🌟 NEW: Use Streamlit Tabs for better flow
402
+ tab_setup, tab_resumes, tab_results = st.tabs(["βš™οΈ 1. Setup & Job Description", "πŸ“ 2. Manage Resumes", "πŸ“Š 3. Analysis & Results"])
403
+
404
+ # --- TAB 1: Setup & Job Description ---
405
+ with tab_setup:
406
+ st.subheader("1. Enter Job Description")
407
+ st.info("Paste the job description here. It must include required skills and experience (e.g., 'Data engineer requires python, spark, 5 years+').")
408
 
409
+ job_description = st.text_area(
410
+ "Job Description Text",
411
+ value=st.session_state.input_job_description,
412
+ height=150,
413
+ key="job_description_tab",
414
+ placeholder="e.g., Data scientist requires python, sql, 3 years+"
 
 
 
 
 
 
 
415
  )
416
+ st.session_state.input_job_description = job_description
417
+
418
+ validation_error = validate_input(job_description, is_resume=False)
419
+ if validation_error and job_description.strip():
420
+ st.warning(f"Job Description Validation: {validation_error}")
421
+
422
+ # --- TAB 2: Manage Resumes ---
423
+ with tab_resumes:
424
+ st.subheader(f"2. Resume Inputs ({len(st.session_state.resumes)}/5)")
425
+ st.info("Upload PDF/DOCX or paste text for up to 5 resumes. Each must contain data/tech skills and experience.")
426
+
427
+ # Manage resume text areas and file uploads
428
+ for i in range(len(st.session_state.resumes)):
429
+ # Use an expander for each resume to keep the page clean
430
+ is_expanded = (i == 0) or (st.session_state.resumes[i].strip() != "")
431
+ with st.expander(f"**Resume {i+1}**", expanded=is_expanded):
432
+
433
+ # File uploader on the left
434
+ uploaded_file = st.file_uploader(
435
+ f"Upload CV (PDF or Word) for Resume {i+1}",
436
+ type=['pdf', 'docx'],
437
+ key=f"file_upload_{i}"
438
+ )
439
+
440
+ # File upload logic
441
+ if uploaded_file is not None:
442
+ extracted_text = extract_text_from_file(uploaded_file)
443
+ if extracted_text:
444
+ st.session_state.resumes[i] = extracted_text
445
+ else:
446
+ st.session_state.resumes[i] = ""
447
+
448
+ # Text area input
449
+ st.session_state.resumes[i] = st.text_area(
450
+ f"Paste or edit resume text",
451
+ value=st.session_state.resumes[i],
452
+ height=100,
453
+ key=f"resume_{i}_tab",
454
+ placeholder="e.g., Expert in python, sql, 3 years experience"
455
+ )
456
+
457
+ # Validation feedback
458
+ validation_error = validate_input(st.session_state.resumes[i], is_resume=True)
459
+ if validation_error and st.session_state.resumes[i].strip():
460
+ st.warning(f"Validation: {validation_error}")
461
+
462
+ # Add/Remove resume buttons outside the loop
463
+ col_add, col_remove, _ = st.columns([1, 1, 3])
464
+ with col_add:
465
+ if st.button("βž• Add Resume", use_container_width=True) and len(st.session_state.resumes) < 5:
466
+ st.session_state.resumes.append("")
467
+ st.rerun()
468
+ with col_remove:
469
+ if st.button("βž– Remove Resume", use_container_width=True) and len(st.session_state.resumes) > 1:
470
+ st.session_state.resumes.pop()
471
+ st.rerun()
472
+
473
+ # --- ACTION BUTTONS (Outside the tabs for prominence) ---
474
+ st.markdown("---")
475
  col_btn1, col_btn2, _ = st.columns([1, 1, 3])
476
  with col_btn1:
477
+ analyze_clicked = st.button("πŸš€ Run Analysis", type="primary", use_container_width=True)
478
  with col_btn2:
479
+ reset_clicked = st.button("πŸ”„ Reset All", use_container_width=True)
480
+ st.markdown("---")
481
+
482
  # Handle reset
483
  if reset_clicked:
484
  st.session_state.resumes = ["", "", ""]
485
  st.session_state.input_job_description = ""
486
  st.session_state.results = []
487
  st.session_state.valid_resumes = []
488
+ st.session_state.models = None # Also clear models to re-load fresh
489
  st.rerun()
490
 
491
# Handle analysis with early validation and lazy model loading.
# Flow: validate every resume and the job description first; only if all
# non-empty inputs pass does the (expensive) model loading and per-resume
# classification/summarization run.
if analyze_clicked:
    # Early validation of inputs: keep only resumes that are non-empty AND
    # pass validate_input; a non-empty resume that fails blocks the run.
    valid_resumes = []
    all_inputs_valid = True

    for i, resume in enumerate(st.session_state.resumes):
        validation_error = validate_input(resume, is_resume=True)
        if not validation_error and resume.strip():
            valid_resumes.append(resume)
        elif validation_error and resume.strip():
            # Empty slots are silently skipped; only filled-but-invalid
            # slots produce an error and veto the analysis.
            st.error(f"Cannot run analysis. Resume {i+1} failed validation: {validation_error}")
            all_inputs_valid = False

    # The job description is validated with is_resume=False (different rules).
    job_validation_error = validate_input(job_description, is_resume=False)
    if job_validation_error and job_description.strip():
        st.error(f"Cannot run analysis. Job Description failed validation: {job_validation_error}")
        all_inputs_valid = False

    if valid_resumes and job_description.strip() and all_inputs_valid:
        # Load models only when needed (lazy initialization on first analysis).
        if st.session_state.models is None:
            with st.spinner("Loading models, please wait..."):
            # NOTE(review): the model-loading body (original lines 514-519)
            # is omitted from this view.

        with st.spinner("Analyzing resumes..."):
            progress_bar = st.progress(0)
            status_text = st.empty()

            # Retrieve tokenizers from st.session_state.models
            # (5-tuple; the models themselves are unpacked as _ here).
            bert_tokenizer, _, t5_tokenizer, _, _ = st.session_state.models

            status_text.text("Preparing inputs: Tokenizing and extracting job skills...")
            # Precompute tokenized inputs and job skills once for the whole
            # batch, so the per-resume loop only slices tensors.
            bert_tokenized, t5_inputs, t5_tokenized = tokenize_inputs(valid_resumes, job_description, bert_tokenizer, t5_tokenizer)
            job_skills_set = extract_skills(job_description)

            results = []
            for i, resume in enumerate(valid_resumes):
                # NOTE(review): total_steps is defined outside this view —
                # presumably len(valid_resumes); confirm.
                status_text.text(f"Processing Resume {i+1}/{total_steps}: {resume[:30]}...")

                # Package single item inputs for the cached function call:
                # unsqueeze(0) re-adds the batch dimension expected downstream.
                bert_tok_single = {'input_ids': bert_tokenized['input_ids'][i].unsqueeze(0), 'attention_mask': bert_tokenized['attention_mask'][i].unsqueeze(0)}
                t5_tok_single = {'input_ids': t5_tokenized['input_ids'][i].unsqueeze(0), 'attention_mask': t5_tokenized['attention_mask'][i].unsqueeze(0)}

                result = classify_and_summarize_batch(
                    resume,
                    job_description,
                    bert_tok_single,
                    t5_inputs[i],
                    t5_tok_single,
                    job_skills_set
                )
                result["Resume"] = f"Resume {i+1}"
                # NOTE(review): results.append(...) and the progress_bar
                # update (original lines 549-553) are omitted from this view.

            status_text.empty()
            progress_bar.empty()
            st.success("Analysis completed! πŸŽ‰ Results are in the 'Analysis & Results' tab.")
            st.balloons()
    else:
        st.error("Please ensure at least one valid resume and a valid job description are provided.")
560
 
561
# --- TAB 3: Results ---
# Renders the screening results table, a CSV download, and an optional
# skill-frequency chart. Shows a hint instead when no analysis has run yet.
with tab_results:
    st.subheader("3. Screening Results")
    if st.session_state.results:
        st.success("Analysis complete. See the suitability assessment below.")

        # Sortable results table with per-column help text.
        st.dataframe(
            st.session_state.results,
            column_config={
                "Suitability": st.column_config.TextColumn(
                    "Suitability",
                    help="Model's assessment (Relevant, Irrelevant, Uncertain)",
                    width="small"
                ),
                "Warning": st.column_config.TextColumn(
                    "Warning",
                    help="Reason for non-Relevant status (e.g., experience mismatch, low confidence)",
                    width="medium"
                ),
                "Data/Tech Related Skills Summary": st.column_config.TextColumn(
                    "Skills/Exp Summary",
                    help="Concise summary of detected skills and experience",
                    width="large"
                ),
                "Resume": st.column_config.TextColumn(
                    "Resume",
                    width="small"
                )
            },
            use_container_width=True
        )

        # Download and Chart Section
        col_dl, col_chart_expander = st.columns([1, 3])

        with col_dl:
            # Use the stdlib csv module so EVERY field is correctly quoted
            # and escaped; the previous hand-rolled writer only escaped
            # double quotes in the Summary column, producing corrupt CSV
            # when Resume/Suitability/Warning contained quotes or newlines.
            import csv
            csv_buffer = io.StringIO()
            writer = csv.writer(csv_buffer)
            writer.writerow(["Resume Number", "Suitability", "Summary", "Warning"])
            # Exclude the full resume text from CSV to keep it clean and focused.
            for result in st.session_state.results:
                writer.writerow([
                    result["Resume"],
                    result["Suitability"],
                    result["Data/Tech Related Skills Summary"],
                    result["Warning"],
                ])

            st.download_button(
                "πŸ’Ύ Download Results CSV",
                csv_buffer.getvalue(),
                file_name="resume_analysis.csv",
                mime="text/csv",
                use_container_width=True
            )

        with col_chart_expander:
            with st.expander("πŸ“ˆ View Top Skill Frequency Across Resumes", expanded=False):
                if st.session_state.valid_resumes:
                    fig = generate_skill_pie_chart(st.session_state.valid_resumes)
                    if fig:
                        st.pyplot(fig)
                        plt.close(fig)  # free the figure; avoids memory growth across reruns
                    else:
                        st.info("No recognized data/tech skills found in the resumes for charting.")
                else:
                    st.info("No valid resumes to analyze.")
    else:
        st.info("Please complete the Setup and Resume tabs, then click 'Run Analysis' to see results.")
626
 
627
# Script entry point: run the app only when executed directly (not on import).
if __name__ == "__main__":
    main()