Update app.py
Browse files
app.py
CHANGED
|
@@ -93,36 +93,6 @@ def extract_text_from_file(uploaded_file):
|
|
| 93 |
st.error("Unsupported file format. Please upload a PDF or Word (.docx) document.")
|
| 94 |
return ""
|
| 95 |
|
| 96 |
-
# Helper functions for preprocessing
|
| 97 |
-
def preprocess_resume(text, max_chars=500):
    """Extract the Skills and Experience sections of a resume to shorten model input.

    The text is lowercased, then two sections are located by keyword:

    * Skills: text after "skills" / "technical skills" / "key skills" up to the
      next experience, education or projects heading (or end of text).
    * Experience: text after "experience" / "work experience" /
      "professional experience" up to the next education or projects heading
      (or end of text).

    Keywords are matched as whole words, so "experienced" in a sentence does
    not start or end a section. Matching is still keyword-based rather than
    layout-based: a standalone word such as "experience" inside the skills
    text will end the Skills section early.

    Args:
        text: Raw resume text. ``None`` or an empty string yields ``""``.
        max_chars: Maximum number of characters kept before an ellipsis is
            appended. ``None`` disables truncation.

    Returns:
        The skills section and the experience section joined by a newline, or
        the whole lowercased text when neither section is found. When the
        result is longer than ``max_chars`` it is cut to ``max_chars``
        characters and ``"..."`` is appended (so it may be three characters
        longer than ``max_chars``).
    """
    if not text:
        return ""

    text = text.lower()

    # The longer headings come first so that "work experience" is consumed as
    # one heading instead of leaving a dangling "work" in the previous section.
    experience_heading = r"work experience|professional experience|experience"
    skills_heading = r"technical skills|key skills|skills"

    def _section(heading, terminators):
        """Return the text between ``heading`` and the next terminator, or ""."""
        pattern = rf"\b(?:{heading})\b(.*?)(?=\b(?:{terminators})\b|\Z)"
        match = re.search(pattern, text, re.DOTALL)
        if not match:
            return ""
        # Drop the separator that usually follows a heading ("skills: ...").
        return match.group(1).strip().lstrip(":-–— \t\r\n")

    skills_section = _section(
        skills_heading, rf"{experience_heading}|education|projects"
    )
    experience_section = _section(experience_heading, r"education|projects")

    processed_text = "\n".join(
        part for part in (skills_section, experience_section) if part
    )
    if not processed_text:
        # Fall back to the full (lowercased) text if no section was found.
        processed_text = text

    # Cap the length to keep the downstream token count small.
    if max_chars is not None and len(processed_text) > max_chars:
        processed_text = processed_text[:max_chars] + "..."

    return processed_text
|
| 125 |
-
|
| 126 |
# Helper functions for analysis
|
| 127 |
def normalize_text(text):
|
| 128 |
text = text.lower()
|
|
@@ -180,14 +150,14 @@ def tokenize_inputs(resumes, job_description, _bert_tokenizer, _t5_tokenizer):
|
|
| 180 |
"""Precompute tokenized inputs for BERT and T5."""
|
| 181 |
job_description_norm = normalize_text(job_description)
|
| 182 |
bert_inputs = [f"resume: {normalize_text(resume)} [sep] job: {job_description_norm}" for resume in resumes]
|
| 183 |
-
bert_tokenized = _bert_tokenizer(bert_inputs, return_tensors='pt', padding=True, truncation=True, max_length=
|
| 184 |
|
| 185 |
t5_inputs = []
|
| 186 |
for resume in resumes:
|
| 187 |
prompt = re.sub(r'\b[Cc]\+\+\b', 'c++', resume)
|
| 188 |
prompt_normalized = normalize_text(prompt)
|
| 189 |
t5_inputs.append(f"summarize: {prompt_normalized}")
|
| 190 |
-
t5_tokenized = _t5_tokenizer(t5_inputs, return_tensors='pt', padding=True, truncation=True, max_length=
|
| 191 |
|
| 192 |
return bert_tokenized, t5_inputs, t5_tokenized
|
| 193 |
|
|
@@ -200,105 +170,94 @@ def extract_skills(text):
|
|
| 200 |
return set(found_skills)
|
| 201 |
|
| 202 |
@st.cache_data
|
| 203 |
-
def classify_and_summarize_batch(
|
| 204 |
-
"""Process
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
if
|
| 257 |
-
suitability = "
|
| 258 |
-
warning = "
|
| 259 |
-
else:
|
| 260 |
-
exp_warning = check_experience_mismatch(resume, job_description)
|
| 261 |
-
if exp_warning:
|
| 262 |
-
suitability = "Uncertain"
|
| 263 |
-
warning = exp_warning
|
| 264 |
-
else:
|
| 265 |
-
if prob[pred] < confidence_threshold:
|
| 266 |
-
suitability = "Uncertain"
|
| 267 |
-
warning = f"Low confidence: {prob[pred]:.4f}"
|
| 268 |
-
else:
|
| 269 |
-
suitability = "Relevant" if skill_overlap >= 0.5 else "Irrelevant"
|
| 270 |
-
warning = "Skills are not a strong match" if suitability == "Irrelevant" else None
|
| 271 |
-
|
| 272 |
-
skills = list(set(skills_pattern.findall(t5_input))) # Deduplicate skills
|
| 273 |
-
exp_match = re.search(r'\d+\s*years?|senior', resume.lower())
|
| 274 |
-
if skills and exp_match:
|
| 275 |
-
summary = f"{', '.join(skills)} proficiency, {exp_match.group(0)} experience"
|
| 276 |
else:
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
|
| 303 |
@st.cache_data
|
| 304 |
def generate_skill_pie_chart(resumes):
|
|
@@ -335,7 +294,7 @@ def render_sidebar():
|
|
| 335 |
st.markdown("""
|
| 336 |
<h1 style='text-align: center; color: #007BFF; font-size: 32px; text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.1); margin-bottom: 10px;'>💻 Resume Screening Assistant for Data/Tech</h1>
|
| 337 |
<p style='text-align: center; font-size: 16px; margin-top: 0;'>
|
| 338 |
-
Welcome to our AI-powered resume screening tool, specialized for data science and tech roles! This app evaluates multiple resumes against a single job description to determine suitability, providing concise summaries of key data and tech skills and experience. Built with advanced natural language processing, it ensures accurate and efficient screening for technical positions. <br><br><strong>Note:</strong> Performance may vary due to server load on free CPU instances.
|
| 339 |
</p>
|
| 340 |
""", unsafe_allow_html=True)
|
| 341 |
|
|
@@ -401,9 +360,7 @@ def main():
|
|
| 401 |
if uploaded_file is not None:
|
| 402 |
extracted_text = extract_text_from_file(uploaded_file)
|
| 403 |
if extracted_text:
|
| 404 |
-
|
| 405 |
-
processed_text = preprocess_resume(extracted_text)
|
| 406 |
-
st.session_state.resumes[i] = processed_text
|
| 407 |
else:
|
| 408 |
st.session_state.resumes[i] = ""
|
| 409 |
|
|
@@ -495,10 +452,19 @@ def main():
|
|
| 495 |
job_skills_set = extract_skills(job_description)
|
| 496 |
|
| 497 |
status_text.text("Classifying and summarizing resumes...")
|
| 498 |
-
results =
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 502 |
progress_bar.progress((i + 1) / total_steps)
|
| 503 |
|
| 504 |
st.session_state.results = results
|
|
|
|
| 93 |
st.error("Unsupported file format. Please upload a PDF or Word (.docx) document.")
|
| 94 |
return ""
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
# Helper functions for analysis
|
| 97 |
def normalize_text(text):
|
| 98 |
text = text.lower()
|
|
|
|
| 150 |
"""Precompute tokenized inputs for BERT and T5."""
|
| 151 |
job_description_norm = normalize_text(job_description)
|
| 152 |
bert_inputs = [f"resume: {normalize_text(resume)} [sep] job: {job_description_norm}" for resume in resumes]
|
| 153 |
+
bert_tokenized = _bert_tokenizer(bert_inputs, return_tensors='pt', padding=True, truncation=True, max_length=64)
|
| 154 |
|
| 155 |
t5_inputs = []
|
| 156 |
for resume in resumes:
|
| 157 |
prompt = re.sub(r'\b[Cc]\+\+\b', 'c++', resume)
|
| 158 |
prompt_normalized = normalize_text(prompt)
|
| 159 |
t5_inputs.append(f"summarize: {prompt_normalized}")
|
| 160 |
+
t5_tokenized = _t5_tokenizer(t5_inputs, return_tensors='pt', padding=True, truncation=True, max_length=64)
|
| 161 |
|
| 162 |
return bert_tokenized, t5_inputs, t5_tokenized
|
| 163 |
|
|
|
|
| 170 |
return set(found_skills)
|
| 171 |
|
| 172 |
@st.cache_data
|
| 173 |
+
def classify_and_summarize_batch(resume, job_description, _bert_tokenized, _t5_input, _t5_tokenized, _job_skills_set):
    """Classify a single resume against the job description and summarize it.

    Runs the BERT classifier and the T5 generator held in
    ``st.session_state.models`` on pre-tokenized inputs for one resume, then
    derives a suitability label from the skill overlap, the experience check
    and the classifier confidence.

    Args:
        resume: Resume text; used for skill extraction, the experience check
            and the experience phrase in the summary.
        job_description: Job description text passed to
            ``check_experience_mismatch``.
        _bert_tokenized: Mapping of BERT input tensors; each value is moved to
            the model device and the mapping is unpacked into the model call.
        _t5_input: The T5 prompt string; scanned with ``skills_pattern`` to
            list skills for the summary.
        _t5_tokenized: Mapping with ``input_ids`` and ``attention_mask``
            tensors for T5 generation.
        _job_skills_set: Set of skills required by the job; an empty set gives
            a skill overlap of 0.

    Returns:
        A dict with the keys ``"Suitability"``, ``"Data/Tech Related Skills
        Summary"`` and ``"Warning"``. On a timeout or any other exception the
        suitability is ``"Error"`` and the warning carries the error message.

    Note:
        The timeout is checked only after each model call has returned, so it
        discards slow results but does not interrupt a running inference.
    """
    _, bert_model, t5_tokenizer, t5_model, device = st.session_state.models
    timeout = 60  # seconds allowed per model call (checked after the call)
    confidence_threshold = 0.85  # minimum classifier probability to trust
    min_skill_overlap = 0.4  # below this the resume is rejected outright
    relevant_skill_overlap = 0.5  # at or above this a confident resume is relevant

    try:
        bert_tokenized = {k: v.to(device) for k, v in _bert_tokenized.items()}
        with torch.no_grad():
            # BERT inference
            bert_start = time.time()
            outputs = bert_model(**bert_tokenized)
            if time.time() - bert_start > timeout:
                raise TimeoutError("BERT inference timed out")

        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1).cpu().numpy()
        predictions = np.argmax(probabilities, axis=1)

        t5_tokenized = {k: v.to(device) for k, v in _t5_tokenized.items()}
        with torch.no_grad():
            # T5 inference
            t5_start = time.time()
            t5_outputs = t5_model.generate(
                t5_tokenized['input_ids'],
                attention_mask=t5_tokenized['attention_mask'],
                max_length=30,
                min_length=8,
                num_beams=2,
                no_repeat_ngram_size=3,
                length_penalty=2.0,
                early_stopping=True
            )
            if time.time() - t5_start > timeout:
                raise TimeoutError("T5 inference timed out")
        # NOTE(review): the decoded T5 text is never used; the summary below is
        # always rebuilt from the skill/experience regexes. Confirm whether the
        # T5 pass is still wanted before removing it.
        summaries = [
            t5_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for output in t5_outputs
        ]
        summaries = [re.sub(r'\s+', ' ', summary).strip() for summary in summaries]

        # Only one resume is processed per call, so take the first row.
        prob, pred, summary, t5_input = probabilities[0], predictions[0], summaries[0], _t5_input
        resume_skills_set = extract_skills(resume)
        if _job_skills_set:
            skill_overlap = len(_job_skills_set.intersection(resume_skills_set)) / len(_job_skills_set)
        else:
            skill_overlap = 0

        if skill_overlap < min_skill_overlap:
            suitability = "Irrelevant"
            warning = "Skills are irrelevant"
        else:
            exp_warning = check_experience_mismatch(resume, job_description)
            if exp_warning:
                suitability = "Uncertain"
                warning = exp_warning
            elif prob[pred] < confidence_threshold:
                suitability = "Uncertain"
                warning = f"Low confidence: {prob[pred]:.4f}"
            else:
                suitability = "Relevant" if skill_overlap >= relevant_skill_overlap else "Irrelevant"
                warning = "Skills are not a strong match" if suitability == "Irrelevant" else None

        skills = list(set(skills_pattern.findall(t5_input)))  # Deduplicate skills
        exp_match = re.search(r'\d+\s*years?|senior', resume.lower())
        if skills and exp_match:
            summary = f"{', '.join(skills)} proficiency, {exp_match.group(0)} experience"
        else:
            summary = f"{exp_match.group(0) if exp_match else 'unknown'} experience"

        return {
            "Suitability": suitability,
            "Data/Tech Related Skills Summary": summary,
            "Warning": warning or "None"
        }
    except TimeoutError as e:
        st.warning(f"Skipped processing for resume due to timeout: {e}")
        return {
            "Suitability": "Error",
            "Data/Tech Related Skills Summary": "Processing timed out",
            "Warning": str(e)
        }
    except Exception as e:
        # Top-level boundary for one resume: report the failure and keep going.
        st.error(f"Error during inference for resume: {e}")
        return {
            "Suitability": "Error",
            "Data/Tech Related Skills Summary": "Failed to process",
            "Warning": str(e)
        }
|
| 261 |
|
| 262 |
@st.cache_data
|
| 263 |
def generate_skill_pie_chart(resumes):
|
|
|
|
| 294 |
st.markdown("""
|
| 295 |
<h1 style='text-align: center; color: #007BFF; font-size: 32px; text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.1); margin-bottom: 10px;'>💻 Resume Screening Assistant for Data/Tech</h1>
|
| 296 |
<p style='text-align: center; font-size: 16px; margin-top: 0;'>
|
| 297 |
+
Welcome to our AI-powered resume screening tool, specialized for data science and tech roles! This app evaluates multiple resumes against a single job description to determine suitability, providing concise summaries of key data and tech skills and experience. Built with advanced natural language processing, it ensures accurate and efficient screening for technical positions. <br><br><strong>Note:</strong> Performance may vary due to server load on free CPU instances.
|
| 298 |
</p>
|
| 299 |
""", unsafe_allow_html=True)
|
| 300 |
|
|
|
|
| 360 |
if uploaded_file is not None:
|
| 361 |
extracted_text = extract_text_from_file(uploaded_file)
|
| 362 |
if extracted_text:
|
| 363 |
+
st.session_state.resumes[i] = extracted_text
|
|
|
|
|
|
|
| 364 |
else:
|
| 365 |
st.session_state.resumes[i] = ""
|
| 366 |
|
|
|
|
| 452 |
job_skills_set = extract_skills(job_description)
|
| 453 |
|
| 454 |
status_text.text("Classifying and summarizing resumes...")
|
| 455 |
+
results = []
|
| 456 |
+
for i, (resume, bert_tok, t5_in, t5_tok) in enumerate(zip(valid_resumes, bert_tokenized['input_ids'], t5_inputs, t5_tokenized['input_ids'])):
|
| 457 |
+
status_text.text(f"Processing Resume {i+1}/{total_steps}: {resume[:50]}...")
|
| 458 |
+
result = classify_and_summarize_batch(
|
| 459 |
+
resume,
|
| 460 |
+
job_description,
|
| 461 |
+
{'input_ids': bert_tok.unsqueeze(0), 'attention_mask': bert_tokenized['attention_mask'][i].unsqueeze(0)},
|
| 462 |
+
t5_in,
|
| 463 |
+
{'input_ids': t5_tok.unsqueeze(0), 'attention_mask': t5_tokenized['attention_mask'][i].unsqueeze(0)},
|
| 464 |
+
job_skills_set
|
| 465 |
+
)
|
| 466 |
+
result["Resume"] = f"Resume {i+1}"
|
| 467 |
+
results.append(result)
|
| 468 |
progress_bar.progress((i + 1) / total_steps)
|
| 469 |
|
| 470 |
st.session_state.results = results
|