Update app.py
Browse files
app.py
CHANGED
|
@@ -299,10 +299,10 @@ def classify_and_summarize_batch(resume, job_description, _bert_tokenized, _t5_i
|
|
| 299 |
bert_tokenized = {k: v.to(device) for k, v in _bert_tokenized.items()}
|
| 300 |
with torch.no_grad():
|
| 301 |
# BERT inference
|
| 302 |
-
|
| 303 |
-
if elapsed_time > timeout:
|
| 304 |
-
raise TimeoutError("BERT inference timed out")
|
| 305 |
outputs = bert_model(**bert_tokenized)
|
|
|
|
|
|
|
| 306 |
|
| 307 |
logits = outputs.logits
|
| 308 |
probabilities = torch.softmax(logits, dim=1).cpu().numpy()
|
|
@@ -313,9 +313,7 @@ def classify_and_summarize_batch(resume, job_description, _bert_tokenized, _t5_i
|
|
| 313 |
t5_tokenized = {k: v.to(device) for k, v in _t5_tokenized.items()}
|
| 314 |
with torch.no_grad():
|
| 315 |
# T5 inference
|
| 316 |
-
|
| 317 |
-
if elapsed_time > timeout:
|
| 318 |
-
raise TimeoutError("T5 inference timed out")
|
| 319 |
t5_outputs = t5_model.generate(
|
| 320 |
t5_tokenized['input_ids'],
|
| 321 |
attention_mask=t5_tokenized['attention_mask'],
|
|
@@ -326,6 +324,8 @@ def classify_and_summarize_batch(resume, job_description, _bert_tokenized, _t5_i
|
|
| 326 |
length_penalty=3.0,
|
| 327 |
early_stopping=True
|
| 328 |
)
|
|
|
|
|
|
|
| 329 |
summaries = [t5_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True) for output in t5_outputs]
|
| 330 |
summaries = [re.sub(r'\s+', ' ', summary).strip() for summary in summaries]
|
| 331 |
|
|
@@ -382,77 +382,38 @@ def classify_and_summarize_batch(resume, job_description, _bert_tokenized, _t5_i
|
|
| 382 |
"Inference Time": time.time() - start_time
|
| 383 |
}
|
| 384 |
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
if
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
with st.expander("Example Test Cases", expanded=st.session_state.expander2):
|
| 418 |
-
st.session_state.expander2 = True
|
| 419 |
-
st.markdown("""
|
| 420 |
-
- **Test Case 1**:
|
| 421 |
-
- Resume 1: "Expert in python, machine learning, tableau, 4 years experience"
|
| 422 |
-
- Resume 2: "Skilled in sql, pandas, 2 years experience"
|
| 423 |
-
- Resume 3: "Proficient in java, python, 5 years experience"
|
| 424 |
-
- Job Description: "Data scientist requires python, machine learning, 3 years+"
|
| 425 |
-
- **Test Case 2**:
|
| 426 |
-
- Resume 1: "Skilled in databricks, spark, python, 6 years experience"
|
| 427 |
-
- Resume 2: "Expert in sql, tableau, business intelligence, 3 years experience"
|
| 428 |
-
- Resume 3: "Proficient in rust, langchain, 2 years experience"
|
| 429 |
-
- Job Description: "Data engineer requires python, spark, 5 years+"
|
| 430 |
-
""")
|
| 431 |
-
|
| 432 |
-
with st.expander("Guidelines", expanded=st.session_state.expander3):
|
| 433 |
-
st.session_state.expander3 = True
|
| 434 |
-
st.markdown("""
|
| 435 |
-
- Use comma-separated skills from a comprehensive list including python, sql, databricks, etc. (79 skills supported, see Project Report for full list).
|
| 436 |
-
- Include experience in years (e.g., "3 years experience" or "1 year experience") or as "senior".
|
| 437 |
-
- Focus on data/tech skills for accurate summarization.
|
| 438 |
-
- Resumes with only irrelevant skills (e.g., sales, marketing) will be classified as "Irrelevant".
|
| 439 |
-
""")
|
| 440 |
-
|
| 441 |
-
with st.expander("Classification Criteria", expanded=st.session_state.expander4):
|
| 442 |
-
st.session_state.expander4 = True
|
| 443 |
-
st.markdown("""
|
| 444 |
-
Resumes are classified based on:
|
| 445 |
-
- **Skill Overlap**: The resume's data/tech skills are compared to the job's requirements. A skill overlap below 40% results in an "Irrelevant" classification.
|
| 446 |
-
- **Model Confidence**: A finetuned BERT model evaluates skill relevance. If confidence is below 85%, the classification is "Uncertain".
|
| 447 |
-
- **Experience Match**: The resume's experience (in years or seniority) must meet or exceed the job's requirement.
|
| 448 |
-
|
| 449 |
-
**Outcomes**:
|
| 450 |
-
- **Relevant**: Skill overlap ≥ 50%, sufficient experience, and high model confidence (≥ 85%).
|
| 451 |
-
- **Irrelevant**: Skill overlap < 40% or high confidence in low skill relevance.
|
| 452 |
-
- **Uncertain**: Skill overlap ≥ 50% but experience mismatch (e.g., resume has 2 years, job requires 5 years+), or low model confidence (< 85%).
|
| 453 |
-
|
| 454 |
-
**Note**: An experience mismatch warning is shown if the resume's experience is below the job's requirement, overriding the skill overlap and confidence to classify as Uncertain.
|
| 455 |
-
""")
|
| 456 |
|
| 457 |
def main():
|
| 458 |
"""Main function to run the Streamlit app for resume screening."""
|
|
@@ -518,10 +479,11 @@ def main():
|
|
| 518 |
placeholder="e.g., 'Data engineer requires python, spark, 5 years+'"
|
| 519 |
)
|
| 520 |
|
| 521 |
-
# Analyze button with loading spinner
|
| 522 |
if st.button("Analyze"):
|
| 523 |
with st.spinner("Analyzing resumes... This may take a moment depending on server load."):
|
| 524 |
start_time = time.time()
|
|
|
|
| 525 |
resumes = tuple(resume.strip() for resume in st.session_state.resumes[:num_resumes]) # Use tuple for cache stability
|
| 526 |
job_description = st.session_state.job_description.strip()
|
| 527 |
|
|
@@ -542,6 +504,9 @@ def main():
|
|
| 542 |
job_skills_set = extract_skills(job_description)
|
| 543 |
results = []
|
| 544 |
for i, resume in enumerate(valid_resumes):
|
|
|
|
|
|
|
|
|
|
| 545 |
st.write(f"Processing {resume[:50]}...") # Log progress
|
| 546 |
bert_tokenized, t5_inputs, t5_tokenized = tokenize_inputs([resume], job_description)
|
| 547 |
result = classify_and_summarize_batch(resume, job_description, bert_tokenized, t5_inputs[0], t5_tokenized, job_skills_set)
|
|
@@ -557,42 +522,4 @@ def main():
|
|
| 557 |
|
| 558 |
st.session_state.total_analyze_time = time.time() - start_time
|
| 559 |
# Detailed timing logs
|
| 560 |
-
st.write(f"Total Analyze Time: {st.session_state.total_analyze_time:.2f}
|
| 561 |
-
st.write(f"Model Load Time: {getattr(st.session_state, 'load_models_time', 0):.2f} seconds")
|
| 562 |
-
st.write(f"Tokenize Time: {getattr(st.session_state, 'tokenize_time', 0):.2f} seconds")
|
| 563 |
-
st.write(f"Extract Skills Time: {getattr(st.session_state, 'extract_skills_time', 0):.2f} seconds")
|
| 564 |
-
if st.session_state.results:
|
| 565 |
-
for idx, result in enumerate(st.session_state.results):
|
| 566 |
-
st.write(f"Inference Time for {result['Resume']}: {result['Inference Time']:.2f} seconds")
|
| 567 |
-
st.write(f"Pie Chart Time: {getattr(st.session_state, 'pie_chart_time', 0):.2f} seconds")
|
| 568 |
-
|
| 569 |
-
# Performance note
|
| 570 |
-
if st.session_state.total_analyze_time > 60:
|
| 571 |
-
st.warning("The runtime is longer than expected due to server load on Hugging Face Spaces. For a smoother experience, consider testing locally or deploying on a different platform (e.g., Streamlit Community Cloud or a personal server).")
|
| 572 |
-
|
| 573 |
-
# Display results
|
| 574 |
-
if st.session_state.results:
|
| 575 |
-
with st.container():
|
| 576 |
-
st.subheader("Results")
|
| 577 |
-
df = pd.DataFrame(st.session_state.results)
|
| 578 |
-
df = df[["Resume", "Suitability", "Data/Tech Related Skills Summary", "Warning"]] # Exclude Inference Time from display
|
| 579 |
-
st.dataframe(df, use_container_width=True)
|
| 580 |
-
|
| 581 |
-
csv = df.to_csv(index=False)
|
| 582 |
-
st.download_button(
|
| 583 |
-
label="Download Results as CSV",
|
| 584 |
-
data=csv,
|
| 585 |
-
file_name="resume_screening_results.csv",
|
| 586 |
-
mime="text/csv",
|
| 587 |
-
)
|
| 588 |
-
|
| 589 |
-
# Display pie chart
|
| 590 |
-
if st.session_state.pie_chart:
|
| 591 |
-
with st.container():
|
| 592 |
-
st.subheader("Skill Frequency Across Resumes")
|
| 593 |
-
st.pyplot(st.session_state.pie_chart)
|
| 594 |
-
elif st.session_state.results and not st.session_state.pie_chart:
|
| 595 |
-
st.warning("No recognized data/tech skills found in the resumes to generate a pie chart.")
|
| 596 |
-
|
| 597 |
-
if __name__ == "__main__":
|
| 598 |
-
main()
|
|
|
|
| 299 |
bert_tokenized = {k: v.to(device) for k, v in _bert_tokenized.items()}
|
| 300 |
with torch.no_grad():
|
| 301 |
# BERT inference
|
| 302 |
+
bert_start = time.time()
|
|
|
|
|
|
|
| 303 |
outputs = bert_model(**bert_tokenized)
|
| 304 |
+
if time.time() - bert_start > timeout:
|
| 305 |
+
raise TimeoutError("BERT inference timed out")
|
| 306 |
|
| 307 |
logits = outputs.logits
|
| 308 |
probabilities = torch.softmax(logits, dim=1).cpu().numpy()
|
|
|
|
| 313 |
t5_tokenized = {k: v.to(device) for k, v in _t5_tokenized.items()}
|
| 314 |
with torch.no_grad():
|
| 315 |
# T5 inference
|
| 316 |
+
t5_start = time.time()
|
|
|
|
|
|
|
| 317 |
t5_outputs = t5_model.generate(
|
| 318 |
t5_tokenized['input_ids'],
|
| 319 |
attention_mask=t5_tokenized['attention_mask'],
|
|
|
|
| 324 |
length_penalty=3.0,
|
| 325 |
early_stopping=True
|
| 326 |
)
|
| 327 |
+
if time.time() - t5_start > timeout:
|
| 328 |
+
raise TimeoutError("T5 inference timed out")
|
| 329 |
summaries = [t5_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True) for output in t5_outputs]
|
| 330 |
summaries = [re.sub(r'\s+', ' ', summary).strip() for summary in summaries]
|
| 331 |
|
|
|
|
| 382 |
"Inference Time": time.time() - start_time
|
| 383 |
}
|
| 384 |
|
| 385 |
+
@st.cache_data
def generate_skill_pie_chart(resumes):
    """Generate a pie chart of skill frequency across resumes.

    Every non-blank resume is passed through ``normalize_text``, has commas,
    underscores and hyphens replaced by spaces, and is then scanned with
    ``skills_pattern``. Each match counts once per occurrence, so a skill
    repeated inside one resume is counted more than once.

    Args:
        resumes: Iterable of resume strings. Entries that are empty or
            whitespace-only are ignored.

    Returns:
        The matplotlib figure holding the pie chart, or ``None`` when there
        are no non-blank resumes or no skills were matched in any of them.
    """
    start_time = time.time()

    # Filter blanks once; the same list drives both the emptiness check and
    # the counting loop.
    non_blank_resumes = [resume for resume in resumes if resume.strip()]
    if not non_blank_resumes:
        return None

    skill_counts = {}
    for resume in non_blank_resumes:
        resume_lower = normalize_text(resume)
        # Treat common separators as whitespace before pattern matching.
        resume_lower = re.sub(r'[,_-]', ' ', resume_lower)
        for skill in skills_pattern.findall(resume_lower):
            skill_counts[skill] = skill_counts.get(skill, 0) + 1

    if not skill_counts:
        return None

    labels = list(skill_counts)
    # Compute the grand total once instead of once per label.
    total_matches = sum(skill_counts.values())
    sizes = [(count / total_matches) * 100 for count in skill_counts.values()]

    fig, ax = plt.subplots(figsize=(6, 4))
    colors = plt.cm.Blues(np.linspace(0.4, 0.8, len(labels)))
    ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors, textprops={'fontsize': 10})
    ax.axis('equal')
    # Title the axes object directly rather than relying on pyplot's
    # implicit "current axes" global state.
    ax.set_title("Skill Frequency Across Resumes", fontsize=12, color='#FF3621', pad=10)

    # NOTE(review): this function is wrapped in st.cache_data, so on a cache
    # hit the body does not run and pie_chart_time keeps the value from the
    # last cache miss -- confirm that is acceptable for the timing display.
    st.session_state.pie_chart_time = time.time() - start_time
    return fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
|
| 418 |
def main():
|
| 419 |
"""Main function to run the Streamlit app for resume screening."""
|
|
|
|
| 479 |
placeholder="e.g., 'Data engineer requires python, spark, 5 years+'"
|
| 480 |
)
|
| 481 |
|
| 482 |
+
# Analyze button with loading spinner and global timeout
|
| 483 |
if st.button("Analyze"):
|
| 484 |
with st.spinner("Analyzing resumes... This may take a moment depending on server load."):
|
| 485 |
start_time = time.time()
|
| 486 |
+
global_timeout = 180 # Global timeout of 3 minutes for all resumes
|
| 487 |
resumes = tuple(resume.strip() for resume in st.session_state.resumes[:num_resumes]) # Use tuple for cache stability
|
| 488 |
job_description = st.session_state.job_description.strip()
|
| 489 |
|
|
|
|
| 504 |
job_skills_set = extract_skills(job_description)
|
| 505 |
results = []
|
| 506 |
for i, resume in enumerate(valid_resumes):
|
| 507 |
+
if time.time() - start_time > global_timeout:
|
| 508 |
+
st.error("Analysis timed out after 3 minutes. Please try again or deploy on a different platform.")
|
| 509 |
+
break
|
| 510 |
st.write(f"Processing {resume[:50]}...") # Log progress
|
| 511 |
bert_tokenized, t5_inputs, t5_tokenized = tokenize_inputs([resume], job_description)
|
| 512 |
result = classify_and_summarize_batch(resume, job_description, bert_tokenized, t5_inputs[0], t5_tokenized, job_skills_set)
|
|
|
|
| 522 |
|
| 523 |
st.session_state.total_analyze_time = time.time() - start_time
|
| 524 |
# Detailed timing logs
|
| 525 |
+
st.write(f"Total Analyze Time: {st.session_state.total_analyze_time:.2f}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|