scmlewis committed on
Commit
4299b38
·
verified ·
1 Parent(s): a6a34a9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -136
app.py CHANGED
@@ -93,36 +93,6 @@ def extract_text_from_file(uploaded_file):
93
  st.error("Unsupported file format. Please upload a PDF or Word (.docx) document.")
94
  return ""
95
 
96
- # Helper functions for preprocessing
97
- def preprocess_resume(text):
98
- """Extract relevant sections (Skills, Experience) from resume text to reduce input length."""
99
- text = text.lower()
100
- # Define patterns for sections
101
- skills_section = ""
102
- experience_section = ""
103
-
104
- # Extract Skills section
105
- skills_match = re.search(r'(skills|technical skills|key skills)(.*?)(experience|education|projects|$)', text, re.DOTALL | re.IGNORECASE)
106
- if skills_match:
107
- skills_section = skills_match.group(2).strip()
108
-
109
- # Extract Experience section
110
- experience_match = re.search(r'(experience|work experience|professional experience)(.*?)(education|projects|$)', text, re.DOTALL | re.IGNORECASE)
111
- if experience_match:
112
- experience_section = experience_match.group(2).strip()
113
-
114
- # Combine relevant sections
115
- processed_text = f"{skills_section}\n{experience_section}".strip()
116
- if not processed_text:
117
- # Fallback to original text if sections not found
118
- processed_text = text
119
-
120
- # Limit length to 500 characters to reduce token count
121
- if len(processed_text) > 500:
122
- processed_text = processed_text[:500] + "..."
123
-
124
- return processed_text
125
-
126
  # Helper functions for analysis
127
  def normalize_text(text):
128
  text = text.lower()
@@ -180,14 +150,14 @@ def tokenize_inputs(resumes, job_description, _bert_tokenizer, _t5_tokenizer):
180
  """Precompute tokenized inputs for BERT and T5."""
181
  job_description_norm = normalize_text(job_description)
182
  bert_inputs = [f"resume: {normalize_text(resume)} [sep] job: {job_description_norm}" for resume in resumes]
183
- bert_tokenized = _bert_tokenizer(bert_inputs, return_tensors='pt', padding=True, truncation=True, max_length=128)
184
 
185
  t5_inputs = []
186
  for resume in resumes:
187
  prompt = re.sub(r'\b[Cc]\+\+\b', 'c++', resume)
188
  prompt_normalized = normalize_text(prompt)
189
  t5_inputs.append(f"summarize: {prompt_normalized}")
190
- t5_tokenized = _t5_tokenizer(t5_inputs, return_tensors='pt', padding=True, truncation=True, max_length=128)
191
 
192
  return bert_tokenized, t5_inputs, t5_tokenized
193
 
@@ -200,105 +170,94 @@ def extract_skills(text):
200
  return set(found_skills)
201
 
202
  @st.cache_data
203
- def classify_and_summarize_batch(resumes, job_description, _bert_tokenized, _t5_inputs, _t5_tokenized, _job_skills_set):
204
- """Process resumes in batches to balance speed and stability."""
205
- bert_tokenizer, bert_model, t5_tokenizer, t5_model, device = st.session_state.models
206
- timeout = 60 # Timeout per batch
207
- batch_size = 2 # Process 2 resumes at a time
208
 
209
- results = []
210
- num_resumes = len(resumes)
211
- for batch_start in range(0, num_resumes, batch_size):
212
- batch_end = min(batch_start + batch_size, num_resumes)
213
- batch_resumes = resumes[batch_start:batch_end]
214
- batch_bert_tokenized = {k: v[batch_start:batch_end].to(device) for k, v in _bert_tokenized.items()}
215
- batch_t5_inputs = _t5_inputs[batch_start:batch_end]
216
- batch_t5_tokenized = {k: v[batch_start:batch_end].to(device) for k, v in _t5_tokenized.items()}
217
 
218
- start_time = time.time()
219
- try:
220
- # BERT inference for the batch
221
- with torch.no_grad():
222
- bert_start = time.time()
223
- outputs = bert_model(**batch_bert_tokenized)
224
- if time.time() - bert_start > timeout:
225
- raise TimeoutError("BERT inference timed out")
226
-
227
- logits = outputs.logits
228
- probabilities = torch.softmax(logits, dim=1).cpu().numpy()
229
- predictions = np.argmax(probabilities, axis=1)
230
-
231
- confidence_threshold = 0.85
232
-
233
- # T5 inference for the batch
234
- with torch.no_grad():
235
- t5_start = time.time()
236
- t5_outputs = t5_model.generate(
237
- batch_t5_tokenized['input_ids'],
238
- attention_mask=batch_t5_tokenized['attention_mask'],
239
- max_length=30,
240
- min_length=8,
241
- num_beams=1, # Reduced for speed
242
- no_repeat_ngram_size=3,
243
- length_penalty=1.0, # Reduced for faster generation
244
- early_stopping=True
245
- )
246
- if time.time() - t5_start > timeout:
247
- raise TimeoutError("T5 inference timed out")
248
- summaries = [t5_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True) for output in t5_outputs]
249
- summaries = [re.sub(r'\s+', ' ', summary).strip() for summary in summaries]
250
-
251
- # Process each resume in the batch
252
- for i, (resume, prob, pred, summary, t5_input) in enumerate(zip(batch_resumes, probabilities, predictions, summaries, batch_t5_inputs)):
253
- resume_skills_set = extract_skills(resume)
254
- skill_overlap = len(_job_skills_set.intersection(resume_skills_set)) / len(_job_skills_set) if _job_skills_set else 0
255
-
256
- if skill_overlap < 0.4:
257
- suitability = "Irrelevant"
258
- warning = "Skills are irrelevant"
259
- else:
260
- exp_warning = check_experience_mismatch(resume, job_description)
261
- if exp_warning:
262
- suitability = "Uncertain"
263
- warning = exp_warning
264
- else:
265
- if prob[pred] < confidence_threshold:
266
- suitability = "Uncertain"
267
- warning = f"Low confidence: {prob[pred]:.4f}"
268
- else:
269
- suitability = "Relevant" if skill_overlap >= 0.5 else "Irrelevant"
270
- warning = "Skills are not a strong match" if suitability == "Irrelevant" else None
271
-
272
- skills = list(set(skills_pattern.findall(t5_input))) # Deduplicate skills
273
- exp_match = re.search(r'\d+\s*years?|senior', resume.lower())
274
- if skills and exp_match:
275
- summary = f"{', '.join(skills)} proficiency, {exp_match.group(0)} experience"
276
  else:
277
- summary = f"{exp_match.group(0) if exp_match else 'unknown'} experience"
278
-
279
- results.append({
280
- "Suitability": suitability,
281
- "Data/Tech Related Skills Summary": summary,
282
- "Warning": warning or "None"
283
- })
284
- except TimeoutError as e:
285
- st.warning(f"Skipped processing for batch due to timeout: {str(e)}")
286
- for resume in batch_resumes:
287
- results.append({
288
- "Suitability": "Error",
289
- "Data/Tech Related Skills Summary": "Processing timed out",
290
- "Warning": str(e)
291
- })
292
- except Exception as e:
293
- st.error(f"Error during inference for batch: {str(e)}")
294
- for resume in batch_resumes:
295
- results.append({
296
- "Suitability": "Error",
297
- "Data/Tech Related Skills Summary": "Failed to process",
298
- "Warning": str(e)
299
- })
300
-
301
- return results
 
 
 
 
 
 
302
 
303
  @st.cache_data
304
  def generate_skill_pie_chart(resumes):
@@ -335,7 +294,7 @@ def render_sidebar():
335
  st.markdown("""
336
  <h1 style='text-align: center; color: #007BFF; font-size: 32px; text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.1); margin-bottom: 10px;'>💻 Resume Screening Assistant for Data/Tech</h1>
337
  <p style='text-align: center; font-size: 16px; margin-top: 0;'>
338
- Welcome to our AI-powered resume screening tool, specialized for data science and tech roles! This app evaluates multiple resumes against a single job description to determine suitability, providing concise summaries of key data and tech skills and experience. Built with advanced natural language processing, it ensures accurate and efficient screening for technical positions. <br><br><strong>Note:</strong> Performance may vary due to server load on free CPU instances. For best results, consider upgrading to a paid plan on Streamlit Community Cloud.
339
  </p>
340
  """, unsafe_allow_html=True)
341
 
@@ -401,9 +360,7 @@ def main():
401
  if uploaded_file is not None:
402
  extracted_text = extract_text_from_file(uploaded_file)
403
  if extracted_text:
404
- # Preprocess the extracted text to reduce length
405
- processed_text = preprocess_resume(extracted_text)
406
- st.session_state.resumes[i] = processed_text
407
  else:
408
  st.session_state.resumes[i] = ""
409
 
@@ -495,10 +452,19 @@ def main():
495
  job_skills_set = extract_skills(job_description)
496
 
497
  status_text.text("Classifying and summarizing resumes...")
498
- results = classify_and_summarize_batch(valid_resumes, job_description, bert_tokenized, t5_inputs, t5_tokenized, job_skills_set)
499
-
500
- for i in range(len(results)):
501
- results[i]["Resume"] = f"Resume {i+1}"
 
 
 
 
 
 
 
 
 
502
  progress_bar.progress((i + 1) / total_steps)
503
 
504
  st.session_state.results = results
 
93
  st.error("Unsupported file format. Please upload a PDF or Word (.docx) document.")
94
  return ""
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  # Helper functions for analysis
97
  def normalize_text(text):
98
  text = text.lower()
 
150
  """Precompute tokenized inputs for BERT and T5."""
151
  job_description_norm = normalize_text(job_description)
152
  bert_inputs = [f"resume: {normalize_text(resume)} [sep] job: {job_description_norm}" for resume in resumes]
153
+ bert_tokenized = _bert_tokenizer(bert_inputs, return_tensors='pt', padding=True, truncation=True, max_length=64)
154
 
155
  t5_inputs = []
156
  for resume in resumes:
157
  prompt = re.sub(r'\b[Cc]\+\+\b', 'c++', resume)
158
  prompt_normalized = normalize_text(prompt)
159
  t5_inputs.append(f"summarize: {prompt_normalized}")
160
+ t5_tokenized = _t5_tokenizer(t5_inputs, return_tensors='pt', padding=True, truncation=True, max_length=64)
161
 
162
  return bert_tokenized, t5_inputs, t5_tokenized
163
 
 
170
  return set(found_skills)
171
 
172
  @st.cache_data
173
+ def classify_and_summarize_batch(resume, job_description, _bert_tokenized, _t5_input, _t5_tokenized, _job_skills_set):
174
+ """Process one resume at a time to reduce CPU load with a timeout."""
175
+ _, bert_model, t5_tokenizer, t5_model, device = st.session_state.models
176
+ start_time = time.time()
177
+ timeout = 60 # Timeout after 60 seconds
178
 
179
+ try:
180
+ bert_tokenized = {k: v.to(device) for k, v in _bert_tokenized.items()}
181
+ with torch.no_grad():
182
+ # BERT inference
183
+ bert_start = time.time()
184
+ outputs = bert_model(**bert_tokenized)
185
+ if time.time() - bert_start > timeout:
186
+ raise TimeoutError("BERT inference timed out")
187
 
188
+ logits = outputs.logits
189
+ probabilities = torch.softmax(logits, dim=1).cpu().numpy()
190
+ predictions = np.argmax(probabilities, axis=1)
191
+
192
+ confidence_threshold = 0.85
193
+
194
+ t5_tokenized = {k: v.to(device) for k, v in _t5_tokenized.items()}
195
+ with torch.no_grad():
196
+ # T5 inference
197
+ t5_start = time.time()
198
+ t5_outputs = t5_model.generate(
199
+ t5_tokenized['input_ids'],
200
+ attention_mask=t5_tokenized['attention_mask'],
201
+ max_length=30,
202
+ min_length=8,
203
+ num_beams=2,
204
+ no_repeat_ngram_size=3,
205
+ length_penalty=2.0,
206
+ early_stopping=True
207
+ )
208
+ if time.time() - t5_start > timeout:
209
+ raise TimeoutError("T5 inference timed out")
210
+ summaries = [t5_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True) for output in t5_outputs]
211
+ summaries = [re.sub(r'\s+', ' ', summary).strip() for summary in summaries]
212
+
213
+ prob, pred, summary, t5_input = probabilities[0], predictions[0], summaries[0], _t5_input
214
+ resume_skills_set = extract_skills(resume)
215
+ skill_overlap = len(_job_skills_set.intersection(resume_skills_set)) / len(_job_skills_set) if _job_skills_set else 0
216
+
217
+ if skill_overlap < 0.4:
218
+ suitability = "Irrelevant"
219
+ warning = "Skills are irrelevant"
220
+ else:
221
+ exp_warning = check_experience_mismatch(resume, job_description)
222
+ if exp_warning:
223
+ suitability = "Uncertain"
224
+ warning = exp_warning
225
+ else:
226
+ if prob[pred] < confidence_threshold:
227
+ suitability = "Uncertain"
228
+ warning = f"Low confidence: {prob[pred]:.4f}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  else:
230
+ suitability = "Relevant" if skill_overlap >= 0.5 else "Irrelevant"
231
+ warning = "Skills are not a strong match" if suitability == "Irrelevant" else None
232
+
233
+ skills = list(set(skills_pattern.findall(t5_input))) # Deduplicate skills
234
+ exp_match = re.search(r'\d+\s*years?|senior', resume.lower())
235
+ if skills and exp_match:
236
+ summary = f"{', '.join(skills)} proficiency, {exp_match.group(0)} experience"
237
+ else:
238
+ summary = f"{exp_match.group(0) if exp_match else 'unknown'} experience"
239
+
240
+ result = {
241
+ "Suitability": suitability,
242
+ "Data/Tech Related Skills Summary": summary,
243
+ "Warning": warning or "None"
244
+ }
245
+
246
+ return result
247
+ except TimeoutError as e:
248
+ st.warning(f"Skipped processing for resume due to timeout: {str(e)}")
249
+ return {
250
+ "Suitability": "Error",
251
+ "Data/Tech Related Skills Summary": "Processing timed out",
252
+ "Warning": str(e)
253
+ }
254
+ except Exception as e:
255
+ st.error(f"Error during inference for resume: {str(e)}")
256
+ return {
257
+ "Suitability": "Error",
258
+ "Data/Tech Related Skills Summary": "Failed to process",
259
+ "Warning": str(e)
260
+ }
261
 
262
  @st.cache_data
263
  def generate_skill_pie_chart(resumes):
 
294
  st.markdown("""
295
  <h1 style='text-align: center; color: #007BFF; font-size: 32px; text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.1); margin-bottom: 10px;'>💻 Resume Screening Assistant for Data/Tech</h1>
296
  <p style='text-align: center; font-size: 16px; margin-top: 0;'>
297
+ Welcome to our AI-powered resume screening tool, specialized for data science and tech roles! This app evaluates multiple resumes against a single job description to determine suitability, providing concise summaries of key data and tech skills and experience. Built with advanced natural language processing, it ensures accurate and efficient screening for technical positions. <br><br><strong>Note:</strong> Performance may vary due to server load on free CPU instances.
298
  </p>
299
  """, unsafe_allow_html=True)
300
 
 
360
  if uploaded_file is not None:
361
  extracted_text = extract_text_from_file(uploaded_file)
362
  if extracted_text:
363
+ st.session_state.resumes[i] = extracted_text
 
 
364
  else:
365
  st.session_state.resumes[i] = ""
366
 
 
452
  job_skills_set = extract_skills(job_description)
453
 
454
  status_text.text("Classifying and summarizing resumes...")
455
+ results = []
456
+ for i, (resume, bert_tok, t5_in, t5_tok) in enumerate(zip(valid_resumes, bert_tokenized['input_ids'], t5_inputs, t5_tokenized['input_ids'])):
457
+ status_text.text(f"Processing Resume {i+1}/{total_steps}: {resume[:50]}...")
458
+ result = classify_and_summarize_batch(
459
+ resume,
460
+ job_description,
461
+ {'input_ids': bert_tok.unsqueeze(0), 'attention_mask': bert_tokenized['attention_mask'][i].unsqueeze(0)},
462
+ t5_in,
463
+ {'input_ids': t5_tok.unsqueeze(0), 'attention_mask': t5_tokenized['attention_mask'][i].unsqueeze(0)},
464
+ job_skills_set
465
+ )
466
+ result["Resume"] = f"Resume {i+1}"
467
+ results.append(result)
468
  progress_bar.progress((i + 1) / total_steps)
469
 
470
  st.session_state.results = results