Update app.py
Browse files
app.py
CHANGED
|
@@ -15,9 +15,10 @@ import time
|
|
| 15 |
# Set page config as the first Streamlit command
|
| 16 |
st.set_page_config(page_title="Resume Screening Assistant for Data/Tech", page_icon="π", layout="wide")
|
| 17 |
|
| 18 |
-
#
|
| 19 |
st.markdown("""
|
| 20 |
<style>
|
|
|
|
| 21 |
.css-1d391kg { /* Sidebar */
|
| 22 |
width: 350px !important;
|
| 23 |
}
|
|
@@ -28,14 +29,34 @@ st.markdown("""
|
|
| 28 |
min-width: 350px !important;
|
| 29 |
visibility: visible !important;
|
| 30 |
}
|
|
|
|
| 31 |
[data-testid="stExpander"] summary { /* Expander headers */
|
| 32 |
font-size: 26px !important;
|
| 33 |
font-weight: bold !important;
|
| 34 |
text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.1) !important;
|
| 35 |
white-space: nowrap !important;
|
| 36 |
}
|
| 37 |
-
.st-expander-content p { /* Expander body text */
|
| 38 |
-
font-size:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
}
|
| 40 |
</style>
|
| 41 |
""", unsafe_allow_html=True)
|
|
@@ -59,7 +80,7 @@ skills_list = [
|
|
| 59 |
# Precompile regex for skills matching (optimized for single pass)
|
| 60 |
skills_pattern = re.compile(r'\b(' + '|'.join(re.escape(skill) for skill in skills_list) + r')\b', re.IGNORECASE)
|
| 61 |
|
| 62 |
-
# Helper functions for CV parsing
|
| 63 |
def extract_text_from_pdf(file):
|
| 64 |
try:
|
| 65 |
pdf_reader = PyPDF2.PdfReader(file)
|
|
@@ -90,10 +111,10 @@ def extract_text_from_file(uploaded_file):
|
|
| 90 |
elif uploaded_file.name.endswith('.docx'):
|
| 91 |
return extract_text_from_docx(uploaded_file)
|
| 92 |
else:
|
| 93 |
-
st.error
|
| 94 |
return ""
|
| 95 |
|
| 96 |
-
# Helper functions for analysis
|
| 97 |
def normalize_text(text):
|
| 98 |
text = text.lower()
|
| 99 |
# Remove underscores, hyphens, and specific phrases, replacing with empty string
|
|
@@ -101,24 +122,28 @@ def normalize_text(text):
|
|
| 101 |
return text
|
| 102 |
|
| 103 |
def check_experience_mismatch(resume, job_description):
|
|
|
|
| 104 |
resume_match = re.search(r'(\d+)\s*years?|senior', resume.lower())
|
| 105 |
-
#
|
| 106 |
job_match = re.search(r'(\d+)\s*years?(?:\s+\w+)*\+|senior\+', job_description.lower())
|
| 107 |
if resume_match and job_match:
|
| 108 |
-
|
| 109 |
-
|
|
|
|
| 110 |
# Handle resume years
|
| 111 |
-
if 'senior' in
|
| 112 |
resume_num = 10
|
| 113 |
else:
|
| 114 |
resume_num = int(resume_match.group(1))
|
|
|
|
| 115 |
# Handle job years
|
| 116 |
-
if 'senior+' in
|
| 117 |
job_num = 10
|
| 118 |
else:
|
| 119 |
job_num = int(job_match.group(1))
|
|
|
|
| 120 |
if resume_num < job_num:
|
| 121 |
-
return f"Experience mismatch: Resume has {
|
| 122 |
return None
|
| 123 |
|
| 124 |
def validate_input(text, is_resume=True):
|
|
@@ -150,13 +175,16 @@ def tokenize_inputs(resumes, job_description, _bert_tokenizer, _t5_tokenizer):
|
|
| 150 |
"""Precompute tokenized inputs for BERT and T5."""
|
| 151 |
job_description_norm = normalize_text(job_description)
|
| 152 |
bert_inputs = [f"resume: {normalize_text(resume)} [sep] job: {job_description_norm}" for resume in resumes]
|
|
|
|
| 153 |
bert_tokenized = _bert_tokenizer(bert_inputs, return_tensors='pt', padding=True, truncation=True, max_length=64)
|
| 154 |
|
| 155 |
t5_inputs = []
|
| 156 |
for resume in resumes:
|
|
|
|
| 157 |
prompt = re.sub(r'\b[Cc]\+\+\b', 'c++', resume)
|
| 158 |
prompt_normalized = normalize_text(prompt)
|
| 159 |
t5_inputs.append(f"summarize: {prompt_normalized}")
|
|
|
|
| 160 |
t5_tokenized = _t5_tokenizer(t5_inputs, return_tensors='pt', padding=True, truncation=True, max_length=64)
|
| 161 |
|
| 162 |
return bert_tokenized, t5_inputs, t5_tokenized
|
|
@@ -167,19 +195,19 @@ def extract_skills(text):
|
|
| 167 |
text_normalized = normalize_text(text)
|
| 168 |
text_normalized = re.sub(r'[,_-]', ' ', text_normalized)
|
| 169 |
found_skills = skills_pattern.findall(text_normalized)
|
| 170 |
-
return set(found_skills)
|
| 171 |
|
| 172 |
@st.cache_data
|
| 173 |
-
def classify_and_summarize_batch(resume, job_description,
|
| 174 |
"""Process one resume at a time to reduce CPU load with a timeout."""
|
|
|
|
| 175 |
_, bert_model, t5_tokenizer, t5_model, device = st.session_state.models
|
| 176 |
-
start_time = time.time()
|
| 177 |
timeout = 60 # Timeout after 60 seconds
|
| 178 |
|
| 179 |
try:
|
| 180 |
-
|
|
|
|
| 181 |
with torch.no_grad():
|
| 182 |
-
# BERT inference
|
| 183 |
bert_start = time.time()
|
| 184 |
outputs = bert_model(**bert_tokenized)
|
| 185 |
if time.time() - bert_start > timeout:
|
|
@@ -190,10 +218,11 @@ def classify_and_summarize_batch(resume, job_description, _bert_tokenized, _t5_i
|
|
| 190 |
predictions = np.argmax(probabilities, axis=1)
|
| 191 |
|
| 192 |
confidence_threshold = 0.85
|
| 193 |
-
|
| 194 |
-
|
|
|
|
|
|
|
| 195 |
with torch.no_grad():
|
| 196 |
-
# T5 inference
|
| 197 |
t5_start = time.time()
|
| 198 |
t5_outputs = t5_model.generate(
|
| 199 |
t5_tokenized['input_ids'],
|
|
@@ -208,39 +237,44 @@ def classify_and_summarize_batch(resume, job_description, _bert_tokenized, _t5_i
|
|
| 208 |
if time.time() - t5_start > timeout:
|
| 209 |
raise TimeoutError("T5 inference timed out")
|
| 210 |
summaries = [t5_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True) for output in t5_outputs]
|
| 211 |
-
|
| 212 |
|
| 213 |
-
|
| 214 |
resume_skills_set = extract_skills(resume)
|
| 215 |
skill_overlap = len(_job_skills_set.intersection(resume_skills_set)) / len(_job_skills_set) if _job_skills_set else 0
|
| 216 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
if skill_overlap < 0.4:
|
| 218 |
suitability = "Irrelevant"
|
| 219 |
-
warning = "
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
else:
|
| 230 |
-
suitability = "Relevant" if skill_overlap >= 0.5 else "Irrelevant"
|
| 231 |
-
warning = "Skills are not a strong match" if suitability == "Irrelevant" else None
|
| 232 |
|
| 233 |
-
|
|
|
|
| 234 |
exp_match = re.search(r'\d+\s*years?|senior', resume.lower())
|
| 235 |
-
|
| 236 |
-
|
|
|
|
|
|
|
|
|
|
| 237 |
else:
|
| 238 |
-
|
| 239 |
|
| 240 |
result = {
|
| 241 |
"Suitability": suitability,
|
| 242 |
-
"Data/Tech Related Skills Summary":
|
| 243 |
-
"Warning": warning
|
| 244 |
}
|
| 245 |
|
| 246 |
return result
|
|
@@ -273,19 +307,42 @@ def generate_skill_pie_chart(resumes):
|
|
| 273 |
resume_lower = normalize_text(resume)
|
| 274 |
found_skills = skills_pattern.findall(resume_lower)
|
| 275 |
for skill in found_skills:
|
| 276 |
-
skill_counts[skill] = skill_counts.get(skill, 0) + 1
|
| 277 |
|
|
|
|
| 278 |
if not skill_counts:
|
| 279 |
return None
|
| 280 |
|
| 281 |
-
|
| 282 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
|
| 284 |
fig, ax = plt.subplots(figsize=(6, 4))
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
return fig
|
| 290 |
|
| 291 |
def render_sidebar():
|
|
@@ -301,37 +358,24 @@ def render_sidebar():
|
|
| 301 |
with st.expander("π How to Use the App", expanded=True):
|
| 302 |
st.markdown("""
|
| 303 |
**Instructions**:
|
| 304 |
-
-
|
| 305 |
-
-
|
| 306 |
-
- Click **
|
| 307 |
- Use **Add Resume** or **Remove Resume** to adjust the number of resume fields (1–5).
|
| 308 |
-
- Use
|
| 309 |
-
-
|
| 310 |
-
- View the skill frequency pie chart to see skill distribution across resumes.
|
| 311 |
-
- Example test cases:
|
| 312 |
-
- **Test Case 1**: Resumes like "Expert in python, machine learning, tableau, 4 years experience" against "Data scientist requires python, machine learning, 3 years+".
|
| 313 |
-
- **Test Case 2**: Resumes like "Skilled in databricks, spark, python, 6 years experience" against "Data engineer requires python, spark, 5 years+".
|
| 314 |
-
|
| 315 |
-
**Guidelines**:
|
| 316 |
-
- Use comma-separated skills from a comprehensive list including python, sql, databricks, etc. (79 skills supported).
|
| 317 |
-
- Include experience in years (e.g., "3 years experience" or "1 year experience") or as "senior".
|
| 318 |
-
- Focus on data/tech skills for accurate summarization.
|
| 319 |
-
- Resumes with only irrelevant skills (e.g., sales, marketing) will be classified as "Irrelevant".
|
| 320 |
-
- If uploading a CV, ensure it’s a text-based PDF or Word document (scanned PDFs may not work).
|
| 321 |
""")
|
| 322 |
with st.expander("βΉοΈ Classification Criteria", expanded=True):
|
| 323 |
st.markdown("""
|
| 324 |
The app classifies resumes based on:
|
| 325 |
-
- **Skill Overlap**:
|
| 326 |
-
- **
|
| 327 |
-
- **
|
| 328 |
|
| 329 |
**Outcomes**:
|
| 330 |
-
- **Relevant**:
|
| 331 |
-
- **Irrelevant**: Skill overlap < 40% or high confidence in low
|
| 332 |
-
- **Uncertain**:
|
| 333 |
-
|
| 334 |
-
**Note**: An experience mismatch warning is shown if the resume’s experience is below the job’s requirement, overriding the skill overlap and confidence to classify as Uncertain.
|
| 335 |
""")
|
| 336 |
|
| 337 |
def main():
|
|
@@ -341,6 +385,7 @@ def main():
|
|
| 341 |
|
| 342 |
# Initialize session state
|
| 343 |
if 'resumes' not in st.session_state:
|
|
|
|
| 344 |
st.session_state.resumes = ["Expert in python, machine learning, tableau, 4 years experience", "", ""]
|
| 345 |
if 'input_job_description' not in st.session_state:
|
| 346 |
st.session_state.input_job_description = "Data scientist requires python, machine learning, 3 years+"
|
|
@@ -351,85 +396,118 @@ def main():
|
|
| 351 |
if 'models' not in st.session_state:
|
| 352 |
st.session_state.models = None
|
| 353 |
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
st.session_state.resumes[i] = st.text_area(
|
| 368 |
-
f"Enter or edit resume text",
|
| 369 |
-
value=st.session_state.resumes[i],
|
| 370 |
-
height=100,
|
| 371 |
-
key=f"resume_{i}",
|
| 372 |
-
placeholder="e.g., Expert in python, sql, 3 years experience"
|
| 373 |
)
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
col_btn1, col_btn2, _ = st.columns([1, 1, 3])
|
| 404 |
with col_btn1:
|
| 405 |
-
analyze_clicked = st.button("
|
| 406 |
with col_btn2:
|
| 407 |
-
reset_clicked = st.button("Reset")
|
| 408 |
-
|
|
|
|
| 409 |
# Handle reset
|
| 410 |
if reset_clicked:
|
| 411 |
st.session_state.resumes = ["", "", ""]
|
| 412 |
st.session_state.input_job_description = ""
|
| 413 |
st.session_state.results = []
|
| 414 |
st.session_state.valid_resumes = []
|
|
|
|
| 415 |
st.rerun()
|
| 416 |
|
| 417 |
# Handle analysis with early validation and lazy model loading
|
| 418 |
if analyze_clicked:
|
| 419 |
# Early validation of inputs
|
| 420 |
valid_resumes = []
|
|
|
|
|
|
|
| 421 |
for i, resume in enumerate(st.session_state.resumes):
|
| 422 |
validation_error = validate_input(resume, is_resume=True)
|
| 423 |
if not validation_error and resume.strip():
|
| 424 |
valid_resumes.append(resume)
|
| 425 |
elif validation_error and resume.strip():
|
| 426 |
-
st.
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
|
|
|
|
|
|
| 431 |
|
| 432 |
-
if valid_resumes and job_description.strip():
|
| 433 |
# Load models only when needed
|
| 434 |
if st.session_state.models is None:
|
| 435 |
with st.spinner("Loading models, please wait..."):
|
|
@@ -442,25 +520,29 @@ def main():
|
|
| 442 |
with st.spinner("Analyzing resumes..."):
|
| 443 |
progress_bar = st.progress(0)
|
| 444 |
status_text = st.empty()
|
| 445 |
-
status_text.text("Preparing inputs...")
|
| 446 |
|
| 447 |
# Retrieve tokenizers from st.session_state.models
|
| 448 |
-
bert_tokenizer,
|
| 449 |
|
|
|
|
| 450 |
# Precompute tokenized inputs and job skills
|
| 451 |
bert_tokenized, t5_inputs, t5_tokenized = tokenize_inputs(valid_resumes, job_description, bert_tokenizer, t5_tokenizer)
|
| 452 |
job_skills_set = extract_skills(job_description)
|
| 453 |
|
| 454 |
-
status_text.text("Classifying and summarizing resumes...")
|
| 455 |
results = []
|
| 456 |
-
for i,
|
| 457 |
-
status_text.text(f"Processing Resume {i+1}/{total_steps}: {resume[:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
result = classify_and_summarize_batch(
|
| 459 |
resume,
|
| 460 |
job_description,
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
job_skills_set
|
| 465 |
)
|
| 466 |
result["Resume"] = f"Resume {i+1}"
|
|
@@ -471,34 +553,76 @@ def main():
|
|
| 471 |
|
| 472 |
status_text.empty()
|
| 473 |
progress_bar.empty()
|
| 474 |
-
st.success("Analysis completed! π")
|
|
|
|
| 475 |
else:
|
| 476 |
-
st.error("Please
|
| 477 |
|
| 478 |
-
#
|
| 479 |
-
|
| 480 |
-
st.
|
| 481 |
-
st.
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
|
| 502 |
if __name__ == "__main__":
|
| 503 |
-
# When this module is run directly, call the main function.
|
| 504 |
main()
|
|
|
|
| 15 |
# Set page config as the first Streamlit command
|
| 16 |
st.set_page_config(page_title="Resume Screening Assistant for Data/Tech", page_icon="π", layout="wide")
|
| 17 |
|
| 18 |
+
# --- CUSTOM CSS FOR UI/UX IMPROVEMENTS ---
|
| 19 |
st.markdown("""
|
| 20 |
<style>
|
| 21 |
+
/* 1. Sidebar Styling */
|
| 22 |
.css-1d391kg { /* Sidebar */
|
| 23 |
width: 350px !important;
|
| 24 |
}
|
|
|
|
| 29 |
min-width: 350px !important;
|
| 30 |
visibility: visible !important;
|
| 31 |
}
|
| 32 |
+
/* 2. Expander/Instructions Styling */
|
| 33 |
[data-testid="stExpander"] summary { /* Expander headers */
|
| 34 |
font-size: 26px !important;
|
| 35 |
font-weight: bold !important;
|
| 36 |
text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.1) !important;
|
| 37 |
white-space: nowrap !important;
|
| 38 |
}
|
| 39 |
+
.st-expander-content p { /* Expander body text - made slightly larger for readability */
|
| 40 |
+
font-size: 14px !important;
|
| 41 |
+
line-height: 1.6;
|
| 42 |
+
}
|
| 43 |
+
/* 3. Main Title Styling */
|
| 44 |
+
h1 {
|
| 45 |
+
text-align: center;
|
| 46 |
+
color: #007BFF;
|
| 47 |
+
font-size: 40px;
|
| 48 |
+
}
|
| 49 |
+
/* 4. Tab Styling */
|
| 50 |
+
.stTabs [data-baseweb="tab-list"] {
|
| 51 |
+
gap: 24px;
|
| 52 |
+
}
|
| 53 |
+
.stTabs [data-baseweb="tab"] {
|
| 54 |
+
height: 50px;
|
| 55 |
+
white-space: nowrap;
|
| 56 |
+
border-radius: 4px 4px 0 0;
|
| 57 |
+
gap: 1px;
|
| 58 |
+
padding-top: 10px;
|
| 59 |
+
padding-bottom: 10px;
|
| 60 |
}
|
| 61 |
</style>
|
| 62 |
""", unsafe_allow_html=True)
|
|
|
|
| 80 |
# Precompile regex for skills matching (optimized for single pass)
|
| 81 |
skills_pattern = re.compile(r'\b(' + '|'.join(re.escape(skill) for skill in skills_list) + r')\b', re.IGNORECASE)
|
| 82 |
|
| 83 |
+
# --- Helper functions for CV parsing ---
|
| 84 |
def extract_text_from_pdf(file):
|
| 85 |
try:
|
| 86 |
pdf_reader = PyPDF2.PdfReader(file)
|
|
|
|
| 111 |
elif uploaded_file.name.endswith('.docx'):
|
| 112 |
return extract_text_from_docx(uploaded_file)
|
| 113 |
else:
|
| 114 |
+
# st.error is now handled by the calling logic if the text is empty
|
| 115 |
return ""
|
| 116 |
|
| 117 |
+
# --- Helper functions for analysis ---
|
| 118 |
def normalize_text(text):
|
| 119 |
text = text.lower()
|
| 120 |
# Remove underscores, hyphens, and specific phrases, replacing with empty string
|
|
|
|
| 122 |
return text
|
| 123 |
|
| 124 |
def check_experience_mismatch(resume, job_description):
    """Return a warning when the resume shows less experience than the job requires.

    The resume is scanned for the first "<N> year(s)" phrase or the word
    "senior"; the job description is scanned for the first "<N> year(s) ... +"
    phrase or "senior+". "senior" is scored as 10 years on either side.

    Args:
        resume: Free-form resume text. Empty or ``None`` yields ``None``.
        job_description: Free-form job description text. Empty or ``None``
            yields ``None``.

    Returns:
        A human-readable "Experience mismatch: ..." string when both texts
        state an experience level and the resume's is lower; otherwise
        ``None`` (including when either text states no experience level).
    """
    # "senior" carries no explicit number, so it is scored as this many years.
    senior_years = 10

    # Nothing to compare when either side is missing.
    if not resume or not job_description:
        return None

    # Search for year numbers or 'senior' in the resume.
    resume_match = re.search(r'(\d+)\s*years?|senior', resume.lower())
    # Search for year numbers followed by '+' or 'senior+' in the job description.
    job_match = re.search(
        r'(\d+)\s*years?(?:\s+\w+)*\+|senior\+', job_description.lower()
    )
    if not (resume_match and job_match):
        return None

    resume_years_text = resume_match.group(0)
    job_years_text = job_match.group(0)

    # Decide numeric vs. "senior" from the capture group, not from a substring
    # test on the matched text: a numeric match such as "3 years as senior+"
    # must be read as 3 years, not as the senior default.
    resume_digits = resume_match.group(1)
    job_digits = job_match.group(1)
    resume_num = senior_years if resume_digits is None else int(resume_digits)
    job_num = senior_years if job_digits is None else int(job_digits)

    if resume_num < job_num:
        return (
            f"Experience mismatch: Resume has {resume_years_text.strip()}, "
            f"job requires {job_years_text.strip()}"
        )
    return None
|
| 148 |
|
| 149 |
def validate_input(text, is_resume=True):
|
|
|
|
| 175 |
"""Precompute tokenized inputs for BERT and T5."""
|
| 176 |
job_description_norm = normalize_text(job_description)
|
| 177 |
bert_inputs = [f"resume: {normalize_text(resume)} [sep] job: {job_description_norm}" for resume in resumes]
|
| 178 |
+
# BERT tokens must be padded/truncated consistently
|
| 179 |
bert_tokenized = _bert_tokenizer(bert_inputs, return_tensors='pt', padding=True, truncation=True, max_length=64)
|
| 180 |
|
| 181 |
t5_inputs = []
|
| 182 |
for resume in resumes:
|
| 183 |
+
# Prompt preparation for T5 summary
|
| 184 |
prompt = re.sub(r'\b[Cc]\+\+\b', 'c++', resume)
|
| 185 |
prompt_normalized = normalize_text(prompt)
|
| 186 |
t5_inputs.append(f"summarize: {prompt_normalized}")
|
| 187 |
+
# T5 tokens must be padded/truncated consistently
|
| 188 |
t5_tokenized = _t5_tokenizer(t5_inputs, return_tensors='pt', padding=True, truncation=True, max_length=64)
|
| 189 |
|
| 190 |
return bert_tokenized, t5_inputs, t5_tokenized
|
|
|
|
| 195 |
text_normalized = normalize_text(text)
|
| 196 |
text_normalized = re.sub(r'[,_-]', ' ', text_normalized)
|
| 197 |
found_skills = skills_pattern.findall(text_normalized)
|
| 198 |
+
return set(s.lower() for s in found_skills) # Ensure lower case for set intersection
|
| 199 |
|
| 200 |
@st.cache_data
|
| 201 |
+
def classify_and_summarize_batch(resume, job_description, _bert_tok, _t5_input, _t5_tok, _job_skills_set):
|
| 202 |
"""Process one resume at a time to reduce CPU load with a timeout."""
|
| 203 |
+
# Note: We pass single-item dicts for inference to avoid re-tokenization outside of cache
|
| 204 |
_, bert_model, t5_tokenizer, t5_model, device = st.session_state.models
|
|
|
|
| 205 |
timeout = 60 # Timeout after 60 seconds
|
| 206 |
|
| 207 |
try:
|
| 208 |
+
# --- BERT Inference (Classification) ---
|
| 209 |
+
bert_tokenized = {k: v.to(device) for k, v in _bert_tok.items()}
|
| 210 |
with torch.no_grad():
|
|
|
|
| 211 |
bert_start = time.time()
|
| 212 |
outputs = bert_model(**bert_tokenized)
|
| 213 |
if time.time() - bert_start > timeout:
|
|
|
|
| 218 |
predictions = np.argmax(probabilities, axis=1)
|
| 219 |
|
| 220 |
confidence_threshold = 0.85
|
| 221 |
+
prob, pred = probabilities[0], predictions[0]
|
| 222 |
+
|
| 223 |
+
# --- T5 Inference (Summarization) ---
|
| 224 |
+
t5_tokenized = {k: v.to(device) for k, v in _t5_tok.items()}
|
| 225 |
with torch.no_grad():
|
|
|
|
| 226 |
t5_start = time.time()
|
| 227 |
t5_outputs = t5_model.generate(
|
| 228 |
t5_tokenized['input_ids'],
|
|
|
|
| 237 |
if time.time() - t5_start > timeout:
|
| 238 |
raise TimeoutError("T5 inference timed out")
|
| 239 |
summaries = [t5_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True) for output in t5_outputs]
|
| 240 |
+
summary = re.sub(r'\s+', ' ', summaries[0]).strip()
|
| 241 |
|
| 242 |
+
# --- Suitability Logic ---
|
| 243 |
resume_skills_set = extract_skills(resume)
|
| 244 |
skill_overlap = len(_job_skills_set.intersection(resume_skills_set)) / len(_job_skills_set) if _job_skills_set else 0
|
| 245 |
|
| 246 |
+
suitability = "Relevant"
|
| 247 |
+
warning = "None"
|
| 248 |
+
exp_warning = check_experience_mismatch(resume, job_description)
|
| 249 |
+
|
| 250 |
if skill_overlap < 0.4:
|
| 251 |
suitability = "Irrelevant"
|
| 252 |
+
warning = "Low skill overlap (<40%) with job requirements"
|
| 253 |
+
elif exp_warning:
|
| 254 |
+
suitability = "Uncertain"
|
| 255 |
+
warning = exp_warning
|
| 256 |
+
elif prob[pred] < confidence_threshold:
|
| 257 |
+
suitability = "Uncertain"
|
| 258 |
+
warning = f"Low model confidence: {prob[pred]:.2f}"
|
| 259 |
+
elif skill_overlap < 0.5:
|
| 260 |
+
suitability = "Irrelevant"
|
| 261 |
+
warning = "Skill overlap is acceptable but not a strong match (<50%)"
|
|
|
|
|
|
|
|
|
|
| 262 |
|
| 263 |
+
# --- Final Summary Formatting (Override T5 for clarity) ---
|
| 264 |
+
detected_skills = list(set(skills_pattern.findall(normalize_text(resume)))) # Deduplicate skills
|
| 265 |
exp_match = re.search(r'\d+\s*years?|senior', resume.lower())
|
| 266 |
+
|
| 267 |
+
if detected_skills and exp_match:
|
| 268 |
+
final_summary = f"{', '.join(detected_skills)} proficiency, {exp_match.group(0)} experience"
|
| 269 |
+
elif detected_skills:
|
| 270 |
+
final_summary = f"{', '.join(detected_skills)} proficiency"
|
| 271 |
else:
|
| 272 |
+
final_summary = f"{exp_match.group(0) if exp_match else 'unknown'} experience"
|
| 273 |
|
| 274 |
result = {
|
| 275 |
"Suitability": suitability,
|
| 276 |
+
"Data/Tech Related Skills Summary": final_summary,
|
| 277 |
+
"Warning": warning
|
| 278 |
}
|
| 279 |
|
| 280 |
return result
|
|
|
|
| 307 |
resume_lower = normalize_text(resume)
|
| 308 |
found_skills = skills_pattern.findall(resume_lower)
|
| 309 |
for skill in found_skills:
|
| 310 |
+
skill_counts[skill.lower()] = skill_counts.get(skill.lower(), 0) + 1
|
| 311 |
|
| 312 |
+
# Filter for top N skills (e.g., top 8)
|
| 313 |
if not skill_counts:
|
| 314 |
return None
|
| 315 |
|
| 316 |
+
sorted_skills = sorted(skill_counts.items(), key=lambda item: item[1], reverse=True)
|
| 317 |
+
top_n = 8
|
| 318 |
+
|
| 319 |
+
# Aggregate "Other" skills
|
| 320 |
+
if len(sorted_skills) > top_n:
|
| 321 |
+
top_skills = dict(sorted_skills[:top_n-1])
|
| 322 |
+
other_count = sum(count for _, count in sorted_skills[top_n-1:])
|
| 323 |
+
top_skills["Other"] = other_count
|
| 324 |
+
else:
|
| 325 |
+
top_skills = dict(sorted_skills)
|
| 326 |
+
|
| 327 |
+
labels = list(top_skills.keys())
|
| 328 |
+
sizes = [(count / sum(top_skills.values())) * 100 for count in top_skills.values()]
|
| 329 |
|
| 330 |
fig, ax = plt.subplots(figsize=(6, 4))
|
| 331 |
+
# Use a visually appealing color map
|
| 332 |
+
colors = plt.cm.tab20(np.linspace(0, 1, len(labels)))
|
| 333 |
+
|
| 334 |
+
# Draw pie chart with a shadow for depth
|
| 335 |
+
wedges, texts, autotexts = ax.pie(
|
| 336 |
+
sizes,
|
| 337 |
+
labels=labels,
|
| 338 |
+
autopct='%1.1f%%',
|
| 339 |
+
startangle=90,
|
| 340 |
+
colors=colors,
|
| 341 |
+
textprops={'fontsize': 10}
|
| 342 |
+
)
|
| 343 |
+
ax.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
|
| 344 |
+
|
| 345 |
+
plt.title("Top Skill Frequency Across Resumes", fontsize=12, color='#007BFF', pad=10)
|
| 346 |
return fig
|
| 347 |
|
| 348 |
def render_sidebar():
|
|
|
|
| 358 |
with st.expander("π How to Use the App", expanded=True):
|
| 359 |
st.markdown("""
|
| 360 |
**Instructions**:
|
| 361 |
+
- Use the **Setup** tab to input the Job Description.
|
| 362 |
+
- Use the **Resumes** tab to upload or paste up to 5 resumes, including skills and experience (e.g., "Expert in python, databricks, 6 years experience").
|
| 363 |
+
- Click **Run Analysis** to evaluate all valid resumes.
|
| 364 |
- Use **Add Resume** or **Remove Resume** to adjust the number of resume fields (1–5).
|
| 365 |
+
- Use **Reset All** to clear all inputs and results.
|
| 366 |
+
- View the detailed table, download results, and check the skill frequency chart in the **Results** tab.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
""")
|
| 368 |
with st.expander("βΉοΈ Classification Criteria", expanded=True):
|
| 369 |
st.markdown("""
|
| 370 |
The app classifies resumes based on:
|
| 371 |
+
- **Skill Overlap**: Resume skills vs. Job skills. Overlap must be $\geq 40\%$.
|
| 372 |
+
- **Experience Match**: Resume's experience must meet or exceed the job's stated requirement (e.g., '5 years+').
|
| 373 |
+
- **Model Confidence**: BERT classification confidence $\geq 85\%$.
|
| 374 |
|
| 375 |
**Outcomes**:
|
| 376 |
+
- **Relevant**: High skill overlap ($\geq 50\%$), sufficient experience, and high confidence ($\geq 85\%$).
|
| 377 |
+
- **Irrelevant**: Skill overlap $< 40\%$ or acceptable overlap but high confidence in low relevance.
|
| 378 |
+
- **Uncertain**: Experience mismatch, or model confidence $< 85\%$.
|
|
|
|
|
|
|
| 379 |
""")
|
| 380 |
|
| 381 |
def main():
|
|
|
|
| 385 |
|
| 386 |
# Initialize session state
|
| 387 |
if 'resumes' not in st.session_state:
|
| 388 |
+
# Start with a good example and two empty slots
|
| 389 |
st.session_state.resumes = ["Expert in python, machine learning, tableau, 4 years experience", "", ""]
|
| 390 |
if 'input_job_description' not in st.session_state:
|
| 391 |
st.session_state.input_job_description = "Data scientist requires python, machine learning, 3 years+"
|
|
|
|
| 396 |
if 'models' not in st.session_state:
|
| 397 |
st.session_state.models = None
|
| 398 |
|
| 399 |
+
st.markdown("<h1>π AI-Powered Resume Screening</h1>", unsafe_allow_html=True)
|
| 400 |
+
|
| 401 |
+
# π NEW: Use Streamlit Tabs for better flow
|
| 402 |
+
tab_setup, tab_resumes, tab_results = st.tabs(["βοΈ 1. Setup & Job Description", "π 2. Manage Resumes", "π 3. Analysis & Results"])
|
| 403 |
+
|
| 404 |
+
# --- TAB 1: Setup & Job Description ---
|
| 405 |
+
with tab_setup:
|
| 406 |
+
st.subheader("1. Enter Job Description")
|
| 407 |
+
st.info("Paste the job description here. It must include required skills and experience (e.g., 'Data engineer requires python, spark, 5 years+').")
|
| 408 |
|
| 409 |
+
job_description = st.text_area(
|
| 410 |
+
"Job Description Text",
|
| 411 |
+
value=st.session_state.input_job_description,
|
| 412 |
+
height=150,
|
| 413 |
+
key="job_description_tab",
|
| 414 |
+
placeholder="e.g., Data scientist requires python, sql, 3 years+"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
)
|
| 416 |
+
st.session_state.input_job_description = job_description
|
| 417 |
+
|
| 418 |
+
validation_error = validate_input(job_description, is_resume=False)
|
| 419 |
+
if validation_error and job_description.strip():
|
| 420 |
+
st.warning(f"Job Description Validation: {validation_error}")
|
| 421 |
+
|
| 422 |
+
# --- TAB 2: Manage Resumes ---
|
| 423 |
+
with tab_resumes:
|
| 424 |
+
st.subheader(f"2. Resume Inputs ({len(st.session_state.resumes)}/5)")
|
| 425 |
+
st.info("Upload PDF/DOCX or paste text for up to 5 resumes. Each must contain data/tech skills and experience.")
|
| 426 |
+
|
| 427 |
+
# Manage resume text areas and file uploads
|
| 428 |
+
for i in range(len(st.session_state.resumes)):
|
| 429 |
+
# Use an expander for each resume to keep the page clean
|
| 430 |
+
is_expanded = (i == 0) or (st.session_state.resumes[i].strip() != "")
|
| 431 |
+
with st.expander(f"**Resume {i+1}**", expanded=is_expanded):
|
| 432 |
+
|
| 433 |
+
# File uploader on the left
|
| 434 |
+
uploaded_file = st.file_uploader(
|
| 435 |
+
f"Upload CV (PDF or Word) for Resume {i+1}",
|
| 436 |
+
type=['pdf', 'docx'],
|
| 437 |
+
key=f"file_upload_{i}"
|
| 438 |
+
)
|
| 439 |
+
|
| 440 |
+
# File upload logic
|
| 441 |
+
if uploaded_file is not None:
|
| 442 |
+
extracted_text = extract_text_from_file(uploaded_file)
|
| 443 |
+
if extracted_text:
|
| 444 |
+
st.session_state.resumes[i] = extracted_text
|
| 445 |
+
else:
|
| 446 |
+
st.session_state.resumes[i] = ""
|
| 447 |
+
|
| 448 |
+
# Text area input
|
| 449 |
+
st.session_state.resumes[i] = st.text_area(
|
| 450 |
+
f"Paste or edit resume text",
|
| 451 |
+
value=st.session_state.resumes[i],
|
| 452 |
+
height=100,
|
| 453 |
+
key=f"resume_{i}_tab",
|
| 454 |
+
placeholder="e.g., Expert in python, sql, 3 years experience"
|
| 455 |
+
)
|
| 456 |
+
|
| 457 |
+
# Validation feedback
|
| 458 |
+
validation_error = validate_input(st.session_state.resumes[i], is_resume=True)
|
| 459 |
+
if validation_error and st.session_state.resumes[i].strip():
|
| 460 |
+
st.warning(f"Validation: {validation_error}")
|
| 461 |
+
|
| 462 |
+
# Add/Remove resume buttons outside the loop
|
| 463 |
+
col_add, col_remove, _ = st.columns([1, 1, 3])
|
| 464 |
+
with col_add:
|
| 465 |
+
if st.button("β Add Resume", use_container_width=True) and len(st.session_state.resumes) < 5:
|
| 466 |
+
st.session_state.resumes.append("")
|
| 467 |
+
st.rerun()
|
| 468 |
+
with col_remove:
|
| 469 |
+
if st.button("β Remove Resume", use_container_width=True) and len(st.session_state.resumes) > 1:
|
| 470 |
+
st.session_state.resumes.pop()
|
| 471 |
+
st.rerun()
|
| 472 |
+
|
| 473 |
+
# --- ACTION BUTTONS (Outside the tabs for prominence) ---
|
| 474 |
+
st.markdown("---")
|
| 475 |
col_btn1, col_btn2, _ = st.columns([1, 1, 3])
|
| 476 |
with col_btn1:
|
| 477 |
+
analyze_clicked = st.button("π Run Analysis", type="primary", use_container_width=True)
|
| 478 |
with col_btn2:
|
| 479 |
+
reset_clicked = st.button("π Reset All", use_container_width=True)
|
| 480 |
+
st.markdown("---")
|
| 481 |
+
|
| 482 |
# Handle reset
|
| 483 |
if reset_clicked:
|
| 484 |
st.session_state.resumes = ["", "", ""]
|
| 485 |
st.session_state.input_job_description = ""
|
| 486 |
st.session_state.results = []
|
| 487 |
st.session_state.valid_resumes = []
|
| 488 |
+
st.session_state.models = None # Also clear models to re-load fresh
|
| 489 |
st.rerun()
|
| 490 |
|
| 491 |
# Handle analysis with early validation and lazy model loading
|
| 492 |
if analyze_clicked:
|
| 493 |
# Early validation of inputs
|
| 494 |
valid_resumes = []
|
| 495 |
+
all_inputs_valid = True
|
| 496 |
+
|
| 497 |
for i, resume in enumerate(st.session_state.resumes):
|
| 498 |
validation_error = validate_input(resume, is_resume=True)
|
| 499 |
if not validation_error and resume.strip():
|
| 500 |
valid_resumes.append(resume)
|
| 501 |
elif validation_error and resume.strip():
|
| 502 |
+
st.error(f"Cannot run analysis. Resume {i+1} failed validation: {validation_error}")
|
| 503 |
+
all_inputs_valid = False
|
| 504 |
+
|
| 505 |
+
job_validation_error = validate_input(job_description, is_resume=False)
|
| 506 |
+
if job_validation_error and job_description.strip():
|
| 507 |
+
st.error(f"Cannot run analysis. Job Description failed validation: {job_validation_error}")
|
| 508 |
+
all_inputs_valid = False
|
| 509 |
|
| 510 |
+
if valid_resumes and job_description.strip() and all_inputs_valid:
|
| 511 |
# Load models only when needed
|
| 512 |
if st.session_state.models is None:
|
| 513 |
with st.spinner("Loading models, please wait..."):
|
|
|
|
| 520 |
with st.spinner("Analyzing resumes..."):
|
| 521 |
progress_bar = st.progress(0)
|
| 522 |
status_text = st.empty()
|
|
|
|
| 523 |
|
| 524 |
# Retrieve tokenizers from st.session_state.models
|
| 525 |
+
bert_tokenizer, _, t5_tokenizer, _, _ = st.session_state.models
|
| 526 |
|
| 527 |
+
status_text.text("Preparing inputs: Tokenizing and extracting job skills...")
|
| 528 |
# Precompute tokenized inputs and job skills
|
| 529 |
bert_tokenized, t5_inputs, t5_tokenized = tokenize_inputs(valid_resumes, job_description, bert_tokenizer, t5_tokenizer)
|
| 530 |
job_skills_set = extract_skills(job_description)
|
| 531 |
|
|
|
|
| 532 |
results = []
|
| 533 |
+
for i, resume in enumerate(valid_resumes):
|
| 534 |
+
status_text.text(f"Processing Resume {i+1}/{total_steps}: {resume[:30]}...")
|
| 535 |
+
|
| 536 |
+
# Package single item inputs for the cached function call
|
| 537 |
+
bert_tok_single = {'input_ids': bert_tokenized['input_ids'][i].unsqueeze(0), 'attention_mask': bert_tokenized['attention_mask'][i].unsqueeze(0)}
|
| 538 |
+
t5_tok_single = {'input_ids': t5_tokenized['input_ids'][i].unsqueeze(0), 'attention_mask': t5_tokenized['attention_mask'][i].unsqueeze(0)}
|
| 539 |
+
|
| 540 |
result = classify_and_summarize_batch(
|
| 541 |
resume,
|
| 542 |
job_description,
|
| 543 |
+
bert_tok_single,
|
| 544 |
+
t5_inputs[i],
|
| 545 |
+
t5_tok_single,
|
| 546 |
job_skills_set
|
| 547 |
)
|
| 548 |
result["Resume"] = f"Resume {i+1}"
|
|
|
|
| 553 |
|
| 554 |
status_text.empty()
|
| 555 |
progress_bar.empty()
|
| 556 |
+
st.success("Analysis completed! π Results are in the 'Analysis & Results' tab.")
|
| 557 |
+
st.balloons()
|
| 558 |
else:
|
| 559 |
+
st.error("Please ensure at least one valid resume and a valid job description are provided.")
|
| 560 |
|
| 561 |
+
# --- TAB 3: Results ---
|
| 562 |
+
with tab_results:
|
| 563 |
+
st.subheader("3. Screening Results")
|
| 564 |
+
if st.session_state.results:
|
| 565 |
+
st.success("Analysis complete. See the suitability assessment below.")
|
| 566 |
+
|
| 567 |
+
# π NEW: Use st.dataframe for enhanced, sortable results table
|
| 568 |
+
st.dataframe(
|
| 569 |
+
st.session_state.results,
|
| 570 |
+
column_config={
|
| 571 |
+
"Suitability": st.column_config.TextColumn(
|
| 572 |
+
"Suitability",
|
| 573 |
+
help="Model's assessment (Relevant, Irrelevant, Uncertain)",
|
| 574 |
+
width="small"
|
| 575 |
+
),
|
| 576 |
+
"Warning": st.column_config.TextColumn(
|
| 577 |
+
"Warning",
|
| 578 |
+
help="Reason for non-Relevant status (e.g., experience mismatch, low confidence)",
|
| 579 |
+
width="medium"
|
| 580 |
+
),
|
| 581 |
+
"Data/Tech Related Skills Summary": st.column_config.TextColumn(
|
| 582 |
+
"Skills/Exp Summary",
|
| 583 |
+
help="Concise summary of detected skills and experience",
|
| 584 |
+
width="large"
|
| 585 |
+
),
|
| 586 |
+
"Resume": st.column_config.TextColumn(
|
| 587 |
+
"Resume",
|
| 588 |
+
width="small"
|
| 589 |
+
)
|
| 590 |
+
},
|
| 591 |
+
use_container_width=True
|
| 592 |
+
)
|
| 593 |
+
|
| 594 |
+
# Download and Chart Section
|
| 595 |
+
col_dl, col_chart_expander = st.columns([1, 3])
|
| 596 |
+
|
| 597 |
+
with col_dl:
|
| 598 |
+
csv_buffer = io.StringIO()
|
| 599 |
+
csv_buffer.write("Resume Number,Suitability,Summary,Warning\n")
|
| 600 |
+
# Exclude the full resume text from CSV to keep it clean and focused
|
| 601 |
+
for i, result in enumerate(st.session_state.results):
|
| 602 |
+
summary = result["Data/Tech Related Skills Summary"].replace('"', '""')
|
| 603 |
+
csv_buffer.write(f'"{result["Resume"]}","{result["Suitability"]}","{summary}","{result["Warning"]}"\n')
|
| 604 |
+
|
| 605 |
+
st.download_button(
|
| 606 |
+
"πΎ Download Results CSV",
|
| 607 |
+
csv_buffer.getvalue(),
|
| 608 |
+
file_name="resume_analysis.csv",
|
| 609 |
+
mime="text/csv",
|
| 610 |
+
use_container_width=True
|
| 611 |
+
)
|
| 612 |
+
|
| 613 |
+
with col_chart_expander:
|
| 614 |
+
with st.expander("π View Top Skill Frequency Across Resumes", expanded=False):
|
| 615 |
+
if st.session_state.valid_resumes:
|
| 616 |
+
fig = generate_skill_pie_chart(st.session_state.valid_resumes)
|
| 617 |
+
if fig:
|
| 618 |
+
st.pyplot(fig)
|
| 619 |
+
plt.close(fig)
|
| 620 |
+
else:
|
| 621 |
+
st.info("No recognized data/tech skills found in the resumes for charting.")
|
| 622 |
+
else:
|
| 623 |
+
st.info("No valid resumes to analyze.")
|
| 624 |
+
else:
|
| 625 |
+
st.info("Please complete the Setup and Resume tabs, then click 'Run Analysis' to see results.")
|
| 626 |
|
| 627 |
if __name__ == "__main__":
|
|
|
|
| 628 |
main()
|