# Author: sharadrajore
# Commit message: Update app.py
# Commit: 206e7ab (verified)
import streamlit as st
import os
from dotenv import load_dotenv
from langchain_aws import BedrockEmbeddings, BedrockLLM
import boto3
from langchain_core.prompts import PromptTemplate
import docx
import zipfile
import PyPDF2
import io
from typing import List, Dict
import pandas as pd
from io import BytesIO
from pathlib import Path
def extract_text_from_file(file_content: bytes, file_extension: str) -> str:
    """Extract plain text from the raw bytes of a resume file.

    Args:
        file_content: Raw bytes of the file.
        file_extension: Extension including the leading dot (for example
            ``'.pdf'``); matched case-insensitively.

    Returns:
        The extracted text. Unsupported extensions yield an empty string. If
        parsing fails part-way, the error is printed and whatever text was
        collected before the failure is returned (possibly empty).
    """
    extension = file_extension.lower()
    parts = []
    try:
        if extension == '.pdf':
            pdf_reader = PyPDF2.PdfReader(BytesIO(file_content))
            for page in pdf_reader.pages:
                # Guard against a page yielding no text object at all
                parts.append((page.extract_text() or "") + "\n")
        elif extension in ('.docx', '.doc'):
            # NOTE(review): presumably legacy binary .doc files are not readable
            # by docx.Document and end up in the except branch - confirm.
            doc = docx.Document(BytesIO(file_content))
            for paragraph in doc.paragraphs:
                parts.append(paragraph.text + "\n")
        elif extension == '.txt':
            try:
                # utf-8-sig also strips a leading byte-order mark
                parts.append(file_content.decode('utf-8-sig'))
            except UnicodeDecodeError:
                # Non-UTF-8 text files should not be silently discarded
                parts.append(file_content.decode('cp1252', errors='replace'))
    except Exception as e:
        print(f"Error extracting text from {file_extension} file: {str(e)}")
    return "".join(parts)
def process_zip_file(zip_content: bytes, blacklist: set) -> List[dict]:
    """Extract text from every supported document inside a ZIP archive.

    Args:
        zip_content: Raw bytes of the ZIP archive.
        blacklist: Company names; members whose text contains one are skipped.

    Returns:
        One dict per accepted member with the keys ``"id"``, ``"name"`` (the
        member path as stored in the archive) and ``"content"`` (the
        extracted text).

    Raises:
        zipfile.BadZipFile: If ``zip_content`` is not a readable archive.
    """
    supported_extensions = {'.txt', '.docx', '.doc', '.pdf'}
    processed_files = []
    with zipfile.ZipFile(BytesIO(zip_content)) as z:
        for zip_filename in z.namelist():
            # Directory entries end with a slash and carry no document data
            if zip_filename.endswith('/'):
                continue
            # Compare the lowered suffix so "CV.PDF" is not silently ignored
            file_extension = Path(zip_filename).suffix.lower()
            if file_extension not in supported_extensions:
                continue
            try:
                with z.open(zip_filename) as f:
                    file_content = f.read()
                text = extract_text_from_file(file_content, file_extension)
                if not text:
                    print(f"Skipping {zip_filename} from ZIP - no text could be extracted")
                elif check_for_blacklisted_companies(text, blacklist):
                    print(f"Skipping {zip_filename} from ZIP - contains blacklisted company")
                else:
                    processed_files.append({
                        "id": f"{zip_filename}_{hash(text)}",
                        "name": zip_filename,
                        "content": text
                    })
            except Exception as e:
                # One unreadable member must not abort the rest of the archive
                print(f"Error processing {zip_filename} from ZIP: {str(e)}")
    return processed_files
def load_blacklist() -> set:
    """Read ``blacklist.txt`` and return its non-blank lines, stripped and lowercased.

    When the file does not exist, a warning is printed, an empty
    ``blacklist.txt`` is created, and an empty set is returned.
    """
    blacklist_file = 'blacklist.txt'
    try:
        with open(blacklist_file, 'r', encoding='utf-8') as handle:
            companies = set()
            for raw_line in handle:
                cleaned = raw_line.strip()
                if cleaned:
                    companies.add(cleaned.lower())
            return companies
    except FileNotFoundError:
        print("Warning: blacklist.txt not found. Creating empty blacklist.")
        # Leave an empty file behind so the next call finds it
        open(blacklist_file, 'w', encoding='utf-8').close()
        return set()
def check_for_blacklisted_companies(text: str, blacklist: set) -> bool:
    """Report whether any blacklisted company name occurs in the text.

    Matching is a case-insensitive substring search. Blank blacklist entries
    are ignored, because an empty string is a substring of every text and
    would otherwise reject every document.

    Args:
        text: The text to check.
        blacklist: Set of blacklisted company names.

    Returns:
        True if a blacklisted company is found, False otherwise (including
        when ``text`` or ``blacklist`` is empty).
    """
    if not text or not blacklist:
        return False
    text_lower = text.lower()
    for company in blacklist:
        # Normalise here too so callers need not pre-lowercase their entries
        needle = company.strip().lower()
        if needle and needle in text_lower:
            return True
    return False
def _unique_filename(folder: Path, filename: str) -> str:
    """Return ``filename``, or a ``stem_N.ext`` variant, that does not yet exist in ``folder``."""
    stem = Path(filename).stem
    suffix = Path(filename).suffix
    candidate = filename
    counter = 1
    while (folder / candidate).exists():
        candidate = f"{stem}_{counter}{suffix}"
        counter += 1
    return candidate


def save_uploaded_resumes(uploaded_files):
    """Save uploaded resume files to the Docs folder next to this script.

    Individual files are stored unchanged. ZIP uploads are expanded: each
    accepted member is stored as the text extracted from it. Files with no
    extractable text or with a blacklisted company are skipped, and a failure
    on one upload is printed without stopping the others.

    Args:
        uploaded_files: Iterable of uploaded file objects exposing ``name``
            and ``read()``.

    Returns:
        List of the file names written to the Docs folder.
    """
    # Same location main() reads from, so saved files are found regardless of CWD
    docs_folder = Path(__file__).parent / "Docs"
    docs_folder.mkdir(exist_ok=True)
    blacklist = load_blacklist()
    saved_files = []
    for uploaded_file in uploaded_files:
        try:
            content = uploaded_file.read()
            file_extension = Path(uploaded_file.name).suffix.lower()
            if file_extension == '.zip':
                for processed_file in process_zip_file(content, blacklist):
                    # Keep only the final path component: archive members may
                    # sit in sub-directories or contain "../" segments.
                    member_stem = Path(processed_file["name"]).stem
                    # The stored payload is extracted text, so give it a .txt
                    # name; under its original .pdf/.docx name it could not be
                    # parsed again when the Docs folder is processed.
                    new_filename = _unique_filename(docs_folder, f"{member_stem}.txt")
                    (docs_folder / new_filename).write_bytes(
                        processed_file["content"].encode('utf-8')
                    )
                    saved_files.append(new_filename)
                continue

            text = extract_text_from_file(content, file_extension)
            if not text:
                print(f"Skipping {uploaded_file.name} - no text could be extracted")
            elif check_for_blacklisted_companies(text, blacklist):
                print(f"Skipping {uploaded_file.name} - contains blacklisted company")
            else:
                new_filename = _unique_filename(docs_folder, Path(uploaded_file.name).name)
                (docs_folder / new_filename).write_bytes(content)
                saved_files.append(new_filename)
        except Exception as e:
            print(f"Error processing {uploaded_file.name}: {str(e)}")
    return saved_files
def upload_section():
    """Render the resume uploader and save the files when the user requests it."""
    st.subheader("Upload Resumes")
    accepted_types = ['pdf', 'docx', 'doc', 'txt', 'zip']
    uploaded_files = st.file_uploader(
        "Upload one or more resumes",
        type=accepted_types,
        accept_multiple_files=True,
    )

    # Nothing to do until files are chosen and the button is pressed
    if not uploaded_files:
        return
    if not st.button("Process Uploaded Resumes"):
        return

    saved_files = save_uploaded_resumes(uploaded_files)
    if not saved_files:
        st.warning("No files were saved. They may contain blacklisted content")
        return

    st.success(f"Successfully saved {len(saved_files)} files to Docs folder")
    st.write("Saved files:", ", ".join(saved_files))
    upload_names = [upload.name for upload in uploaded_files]
    if any(upload_name.endswith('.zip') for upload_name in upload_names):
        st.info("ZIP files were processed and their contents were extracted")
def create_aws_client():
    """Build a boto3 ``bedrock-runtime`` client.

    Region and credentials are read from the ``REGION``, ``ACCESS_KEY`` and
    ``SECRET_ACCESS_KEY`` environment variables.
    """
    connection_settings = {
        "region_name": os.getenv('REGION'),
        "aws_access_key_id": os.getenv('ACCESS_KEY'),
        "aws_secret_access_key": os.getenv('SECRET_ACCESS_KEY'),
    }
    return boto3.client('bedrock-runtime', **connection_settings)
def process_docs_folder(folder_path: str) -> List[dict]:
    """Extract text from every document in a folder.

    Regular files are read in sorted name order. ZIP archives are expanded
    via ``process_zip_file``; other files go through
    ``extract_text_from_file``. Files with no extractable text or with a
    blacklisted company are skipped, and a failure on one file is printed
    without stopping the others.

    Args:
        folder_path: Directory containing the resumes.

    Returns:
        One dict per accepted document with the keys ``"id"``, ``"name"``
        and ``"content"``.

    Raises:
        Exception: If the folder does not exist or cannot be listed; the
            original error is attached as the cause.
    """
    processed_files = []
    blacklist = load_blacklist()
    try:
        if not os.path.exists(folder_path):
            raise Exception(f"Folder not found: {folder_path}")
        # Sort so results (and score ties) do not depend on directory order
        for filename in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, filename)
            if not os.path.isfile(file_path):
                continue
            file_extension = Path(filename).suffix.lower()
            try:
                with open(file_path, 'rb') as file:
                    content = file.read()
                if file_extension == '.zip':
                    processed_files.extend(process_zip_file(content, blacklist))
                    continue
                text = extract_text_from_file(content, file_extension)
                if not text:
                    print(f"Skipping {filename} - no text could be extracted")
                elif check_for_blacklisted_companies(text, blacklist):
                    print(f"Skipping {filename} - contains blacklisted company")
                else:
                    processed_files.append({
                        "id": f"{filename}_{hash(text)}",
                        "name": filename,
                        "content": text
                    })
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")
    except Exception as e:
        # Keep the original traceback attached to the wrapped error
        raise Exception(f"Error accessing docs folder: {str(e)}") from e
    return processed_files
# Prompt used when at least one required skill appears verbatim in the resume.
_RELEVANCE_PROMPT = """
Analyze this resume for the following skills. Be lenient in matching skills.
Required Skills to Check:
{skills}
Resume Content:
{resume}
For each skill, determine:
1. If it's present (including variations and related technologies)
2. The experience level with the skill
3. How recently it was used
Respond in this format:
{{
"skills_found": [
List of skills found (including variations)
],
"match_percentage": Percentage of required skills found (0-100),
"skill_details": {{
"skill_name": {{
"found": true/false,
"experience": "description of experience",
"evidence": "where found in resume"
}}
}}
}}
Be generous in skill matching. If you find related technologies or variations, count them as matches.
"""

# Prompt used when no required skill appears verbatim in the resume.
_LENIENT_PROMPT = """
Analyze this resume for skills related to or equivalent to:
{skills}
Consider variations and related technologies.
Resume Content:
{resume}
List any matches found, including:
1. Direct matches
2. Related technologies
3. Equivalent skills
Respond with found matches only.
"""


def _match_score(found_count: int, total_count: int):
    """Return the percentage of required skills found, or 0 when none are required."""
    return (found_count / total_count) * 100 if total_count > 0 else 0


def check_resume_relevance(job_desc: str, resume_content: str, required_skills: List[str], client) -> dict:
    """Check how well a resume covers the required skills.

    Skills are first looked up in the resume with a case-insensitive
    substring search. If any are found, the score comes from that search and
    the LLM response is only attached as commentary. If none are found, the
    LLM is asked for related skills and a skill counts as found when its name
    appears in the response.

    Args:
        job_desc: Additional requirements text; currently not used here.
        resume_content: Extracted resume text.
        required_skills: Skill names to look for.
        client: Bedrock runtime client passed to ``BedrockLLM``.

    Returns:
        Dict with the keys ``"is_relevant"``, ``"score"`` (0-100),
        ``"found_skills"``, ``"total_skills"`` and ``"key_matches"``.
    """
    llm = BedrockLLM(
        model_id="amazon.titan-text-lite-v1",
        client=client
    )
    total_skills = len(required_skills)
    skills_bullets = "\n".join(f"- {skill}" for skill in required_skills)

    # First, do a direct text search for skills
    resume_lower = resume_content.lower()
    found_skills_direct = [
        skill for skill in required_skills if skill.lower() in resume_lower
    ]

    if found_skills_direct:
        match_score = _match_score(len(found_skills_direct), total_skills)
        message = PromptTemplate.from_template(_RELEVANCE_PROMPT).format(
            skills=skills_bullets,
            resume=resume_content
        )
        try:
            response = llm.invoke(message)
        except Exception as e:
            print(f"Error in LLM analysis: {e}")
            # Fall back to direct matching; any direct hit counts as relevant
            return {
                "is_relevant": True,
                "score": match_score,
                "found_skills": found_skills_direct,
                "total_skills": total_skills,
                "key_matches": f"Skills found through direct matching: {', '.join(found_skills_direct)}"
            }
        return {
            "is_relevant": match_score >= 50,  # Lower threshold for relevance
            "score": match_score,
            "found_skills": found_skills_direct,
            "total_skills": total_skills,
            "key_matches": response
        }

    # No direct matches: ask the LLM for related or equivalent skills
    message = PromptTemplate.from_template(_LENIENT_PROMPT).format(
        skills=skills_bullets,
        resume=resume_content
    )
    try:
        response = llm.invoke(message)
        # Lowercase once instead of once per skill
        response_lower = response.lower()
        found_skills = [
            skill for skill in required_skills if skill.lower() in response_lower
        ]
        return {
            "is_relevant": len(found_skills) > 0,  # Consider relevant if any skills found
            "score": _match_score(len(found_skills), total_skills),
            "found_skills": found_skills,
            "total_skills": total_skills,
            "key_matches": response
        }
    except Exception as e:
        print(f"Error in lenient LLM analysis: {e}")
        return {
            "is_relevant": False,
            "score": 0,
            "found_skills": [],
            "total_skills": total_skills,
            "key_matches": "Error in analysis"
        }
# Prompt asking the model for a structured resume-versus-requirements report.
_SUMMARY_PROMPT = """
Provide a detailed analysis of this resume against the job requirements.
Required Skills:
{skills}
Additional Requirements:
{job_desc}
Resume Content:
{resume_details}
Provide analysis in this format:
## Skills Analysis
### Required Skills Match
{skills_analysis}
### Technical Proficiency
- For each required skill:
* Experience level
* Years of usage
* Recent projects
### Additional Technical Skills
- Only list relevant additional skills
## Experience Analysis
- Total years of relevant experience
- Key projects using multiple required skills
- Notable achievements with required technologies
## Overall Assessment
- Skills Match Score: X/Y required skills found
- Technical Proficiency Score: (0-100)
- Experience Level Match: (Junior/Mid/Senior)
## Recommendation
- Hiring Decision: (Strong Match/Potential Match/Not Recommended)
- Key Strengths: (list top 3)
- Areas to Verify: (list specific areas)
Focus only on exact matches and verifiable experience.
"""


def get_summary_from_llm(job_desc: str, resume_content: str, required_skills: List[str], client) -> str:
    """Ask the LLM for a detailed report comparing a resume with the requirements.

    Args:
        job_desc: Additional requirements text inserted into the prompt.
        resume_content: Extracted resume text.
        required_skills: Skill names the report should cover.
        client: Bedrock runtime client passed to ``BedrockLLM``.

    Returns:
        The model's response, or an ``"Error generating analysis: ..."``
        message if the invocation raises.
    """
    llm = BedrockLLM(
        model_id="amazon.titan-text-lite-v1",
        client=client
    )

    skill_bullets = []
    analysis_bullets = []
    for skill in required_skills:
        skill_bullets.append(f"- {skill}")
        analysis_bullets.append(f"- {skill}: Found/Not Found, Experience Level, Evidence")

    prompt = PromptTemplate.from_template(_SUMMARY_PROMPT)
    message = prompt.format(
        skills="\n".join(skill_bullets),
        job_desc=job_desc,
        resume_details=resume_content,
        skills_analysis="\n".join(analysis_bullets)
    )

    try:
        return llm.invoke(message)
    except Exception as e:
        return f"Error generating analysis: {str(e)}"
def _excel_column_letter(index: int) -> str:
    """Convert a 0-based column index to its spreadsheet letters (0 -> A, 26 -> AA)."""
    letters = ""
    number = index + 1
    while number > 0:
        number, remainder = divmod(number - 1, 26)
        letters = chr(65 + remainder) + letters
    return letters


def export_to_excel(matches: List[dict], required_skills: List[str]) -> BytesIO:
    """Create an Excel report from the matches.

    Each match becomes one row with summary columns followed by one
    ``Skill - <name>`` column per required skill.

    Args:
        matches: Match dicts with ``'name'`` and, optionally,
            ``'match_score'`` and ``'found_skills'``.
        required_skills: Skill names used for the missing-skill list and the
            per-skill columns.

    Returns:
        In-memory ``.xlsx`` workbook, positioned at the start of the buffer.
    """
    # Prepare data for Excel
    excel_data = []
    for match in matches:
        found_skills = match.get('found_skills', [])
        found_lookup = set(found_skills)
        row_data = {
            'Candidate Name': match['name'],
            'Match Score': f"{match.get('match_score', 0):.1f}%",
            'Skills Found': ', '.join(found_skills),
            'Missing Skills': ', '.join(
                skill for skill in required_skills if skill not in found_lookup
            ),
            'Total Skills Found': len(found_skills),
            'Total Required Skills': len(required_skills)
        }
        # Add individual skill columns
        for skill in required_skills:
            row_data[f'Skill - {skill}'] = '✓' if skill in found_lookup else '✗'
        excel_data.append(row_data)

    df = pd.DataFrame(excel_data)

    output = BytesIO()
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Resume Matches')
        worksheet = writer.sheets['Resume Matches']
        for idx, col in enumerate(df.columns):
            # Fit the column to its longest cell or header, capped at 50
            longest_cell = max((len(str(value)) for value in df[col]), default=0)
            width = min(max(longest_cell, len(str(col))) + 2, 50)
            # chr(65 + idx) is only valid for A-Z; many skills exceed that
            worksheet.column_dimensions[_excel_column_letter(idx)].width = width
    # Rewind so callers that read() the buffer start at the beginning
    output.seek(0)
    return output
# Session-state key under which the latest analysis results are kept so they
# survive the script reruns triggered by sliders and the download button.
_RESULTS_STATE_KEY = "resume_screening_results"


def _collect_matches(processed_files, additional_reqs, required_skills, client):
    """Score every processed resume and return those with a matched skill, best first."""
    matches = []
    progress_bar = st.progress(0)
    total_files = len(processed_files)
    for position, file_data in enumerate(processed_files, start=1):
        progress_bar.progress(position / total_files)
        relevance = check_resume_relevance(
            additional_reqs,
            file_data['content'],
            required_skills,
            client
        )
        if relevance['found_skills']:  # Show if any skills found
            matches.append({
                **file_data,
                "match_score": relevance['score'],
                "found_skills": relevance['found_skills'],
                "total_skills": relevance['total_skills'],
                "key_matches": relevance['key_matches']
            })
    progress_bar.empty()
    matches.sort(key=lambda match: match['match_score'], reverse=True)
    return matches


def _render_match_cards(filtered_matches, results, client):
    """Show one card per match with its skill checklist and detailed LLM analysis."""
    required_skills = results["required_skills"]
    analyses = results["analyses"]
    for idx, match in enumerate(filtered_matches):
        with st.container():
            st.markdown("---")
            col1, col2 = st.columns([1, 3])
            with col1:
                st.subheader(f"Match #{idx + 1}")
                st.write(f"📄 {match['name']}")
                st.write(f"Match Score: {match['match_score']:.1f}%")
                st.write("Skills Found:")
                found_skills = match.get('found_skills', [])
                for skill in required_skills:
                    if skill in found_skills:
                        st.write(f"✅ {skill}")
                    else:
                        st.write(f"❌ {skill}")
            with col2:
                with st.expander("Show Detailed Analysis"):
                    # Cache per resume so reruns do not call the LLM again
                    if match['id'] not in analyses:
                        analyses[match['id']] = get_summary_from_llm(
                            results["additional_reqs"],
                            match['content'],
                            required_skills,
                            client
                        )
                    st.markdown(analyses[match['id']])


def _render_summary(filtered_matches, required_skills):
    """Show aggregate metrics and the per-skill bar chart for a non-empty match list."""
    st.markdown("---")
    st.subheader("Summary Statistics")
    match_count = len(filtered_matches)
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Matches", match_count)
    with col2:
        avg_score = sum(match['match_score'] for match in filtered_matches) / match_count
        st.metric("Average Match Score", f"{avg_score:.1f}%")
    with col3:
        perfect_matches = sum(1 for match in filtered_matches if match['match_score'] == 100)
        st.metric("Perfect Matches", perfect_matches)
    with col4:
        avg_skills = sum(len(match['found_skills']) for match in filtered_matches) / match_count
        st.metric("Avg. Skills Found", f"{avg_skills:.1f}")

    st.subheader("Skill Distribution")
    skill_counts = {skill: 0 for skill in required_skills}
    for match in filtered_matches:
        for skill in match['found_skills']:
            if skill in skill_counts:
                skill_counts[skill] += 1
    chart_data = pd.DataFrame({
        'Skill': list(skill_counts.keys()),
        'Count': list(skill_counts.values())
    })
    st.bar_chart(chart_data.set_index('Skill'))


def _render_results(results, client):
    """Render filters, match cards, the Excel export and statistics for stored results."""
    matches = results["matches"]
    required_skills = results["required_skills"]
    if not matches:
        st.warning(
            "No profiles found matching the required skills. "
            "Try adjusting the requirements or adding more resumes."
        )
        return

    st.success(f"Found {len(matches)} profiles with matching skills")
    col1, col2 = st.columns(2)
    with col1:
        min_score = st.slider(
            "Minimum Match Score",
            min_value=0,
            max_value=100,
            value=50,
            step=5
        )
    with col2:
        min_skills = st.slider(
            "Minimum Required Skills",
            min_value=0,
            max_value=len(required_skills),
            value=1,
            step=1
        )
    filtered_matches = [
        match for match in matches
        if match['match_score'] >= min_score and
        len(match['found_skills']) >= min_skills
    ]
    st.subheader(f"Showing {len(filtered_matches)} matches meeting criteria")
    if not filtered_matches:
        # Avoids dividing by zero in the statistics and exporting an empty report
        st.info("No matches meet the selected criteria. Try lowering the filters.")
        return

    _render_match_cards(filtered_matches, results, client)

    st.markdown("---")
    st.subheader("Export Results")
    excel_output = export_to_excel(filtered_matches, required_skills)
    st.download_button(
        label=f"📥 Download Excel Report ({len(filtered_matches)} matches)",
        data=excel_output.getvalue(),
        file_name="resume_matches.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )

    _render_summary(filtered_matches, required_skills)


def main():
    """Run the Streamlit resume-screening page.

    Builds the upload section and the skill inputs. When "Find Matching
    Profiles" is pressed, the resumes in the ``Docs`` folder next to this
    script are analysed and the results are stored in ``st.session_state``.
    Stored results are rendered on every run, so moving a filter slider or
    pressing the download button (each of which reruns the script) does not
    discard them. Errors are shown in the page and printed.
    """
    try:
        # Load environment variables and setup
        load_dotenv()
        client = create_aws_client()
        st.set_page_config(
            page_title="Resume Screening Assistant",
            layout="wide"
        )
        st.title("Resume Screening AI Assistant")
        st.subheader("Match resumes with required skills and experience")
        upload_section()

        st.write("Enter required skills (one per line):")
        skills_input = st.text_area(
            "Required Skills",
            placeholder="Example:\nPython\nJava\nAWS\nDocker",
            height=150
        )
        additional_reqs = st.text_area(
            "Additional Requirements (optional)",
            placeholder="Enter any additional requirements like:\n- Years of experience\n- Education\n- Specific domain knowledge",
            height=100
        )
        required_skills = [skill.strip() for skill in skills_input.split('\n') if skill.strip()]

        analyze_button = st.button("Find Matching Profiles", use_container_width=True)
        if analyze_button and not required_skills:
            st.error("Please enter at least one required skill!")
            return

        try:
            if analyze_button:
                # Drop stale results before starting a new analysis
                st.session_state.pop(_RESULTS_STATE_KEY, None)
                docs_folder = os.path.join(os.path.dirname(__file__), 'Docs')
                with st.spinner("Analyzing resumes..."):
                    processed_files = process_docs_folder(docs_folder)
                    if not processed_files:
                        st.error("No resumes found in the Docs folder!")
                        return
                    matches = _collect_matches(
                        processed_files, additional_reqs, required_skills, client
                    )
                # Snapshot the inputs so later edits to the text areas do not
                # change how already-computed results are displayed
                st.session_state[_RESULTS_STATE_KEY] = {
                    "matches": matches,
                    "required_skills": required_skills,
                    "additional_reqs": additional_reqs,
                    "analyses": {},
                }

            results = st.session_state.get(_RESULTS_STATE_KEY)
            if results is not None:
                _render_results(results, client)
        except Exception as e:
            st.error(f"Error during analysis: {str(e)}")
            print(f"Error Details: {e}")
    except Exception as error:
        st.error(f"An error occurred: {str(error)}")
        print(f"Error Details: {error}")


if __name__ == "__main__":
    main()