# Resume-Analyzer / app.py
# (Hugging Face Space page header preserved as a comment:
#  uploaded by Deepakkori45, commit 4c63be3 "Update app.py", verified)
import streamlit as st
import pandas as pd
import os
from datetime import datetime
import docx2txt
# Try importing Document from python-docx, but don't fail if not available
try:
from docx import Document
DOCX_AVAILABLE = True
except ImportError:
DOCX_AVAILABLE = False
from PyPDF2 import PdfReader
import openai
from dotenv import load_dotenv
import tempfile
import plotly.express as px
import plotly.graph_objects as go
from database import ResumeDatabase
# Load environment variables
load_dotenv()
# Initialize OpenAI API key and database
# NOTE(review): the key may be None here; the upload page re-checks
# OPENAI_API_KEY before any API call is made.
openai.api_key = os.getenv('OPENAI_API_KEY')
db = ResumeDatabase()  # module-level database handle shared by all pages
def extract_text_from_pdf(file):
    """Extract plain text from a PDF file.

    Args:
        file: A file path or binary file-like object accepted by PdfReader.

    Returns:
        The concatenated text of all pages, or None if extraction fails
        (the error is reported via Streamlit).
    """
    try:
        pdf_reader = PdfReader(file)
        # extract_text() may return None for pages without a text layer;
        # coalesce to "" so the join cannot raise TypeError. join() also
        # avoids the quadratic string `+=` of the original loop.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {str(e)}")
        return None
def extract_text_from_docx(file):
    """Extract plain text from a DOCX file.

    Tries docx2txt first; if that raises and python-docx is importable,
    falls back to concatenating paragraph text via Document.

    Returns:
        The extracted text, or None when no extractor succeeds (errors
        are reported via Streamlit).
    """
    try:
        try:
            return docx2txt.process(file)
        except Exception:
            # docx2txt failed — fall back to python-docx when available.
            if not DOCX_AVAILABLE:
                st.error("Could not process DOCX file. Please ensure python-docx is installed.")
                return None
            paragraphs = Document(file).paragraphs
            return "".join(p.text + "\n" for p in paragraphs)
    except Exception as e:
        st.error(f"Error extracting text from DOCX: {str(e)}")
        return None
def _parse_model_dict(result_str):
    """Safely parse the model's dict-literal reply without using eval()."""
    import ast
    # Strip markdown code fences the model sometimes wraps around the dict.
    cleaned = result_str.replace('```python', '').replace('```', '').strip()
    # SECURITY: the original code ran eval() on raw model output, which
    # permits arbitrary code execution. literal_eval accepts only Python
    # literals and raises ValueError/SyntaxError on anything else, which
    # the caller's error handler reports.
    return ast.literal_eval(cleaned)


def analyze_resume(text):
    """Analyze resume text using the OpenAI API.

    Sends the resume to gpt-3.5-turbo with a structured-extraction prompt,
    parses the returned dict literal, and backfills missing fields with
    defaults.

    Args:
        text: Raw resume text.

    Returns:
        A dict of extracted candidate fields, or None when the request or
        parsing fails (the error is reported via Streamlit).
    """
    try:
        system_prompt = """You are a professional resume analyzer specializing in Data Science and Data Engineering roles.
        Analyze the following resume and extract key information in a structured format.
        Return ONLY a Python dictionary with the following keys (no other text):
        {
        "name": "extracted name",
        "email": "extracted email",
        "phone": "extracted phone",
        "location": "extracted location",
        "linkedin_url": "extracted LinkedIn URL",
        "github_url": "extracted GitHub URL",
        "portfolio_url": "extracted portfolio URL",
        "cgpa": "extracted CGPA",
        "years_experience": "extracted years of experience as a number",
        "education_level": "highest education level",
        "major": "extracted major/field of study",
        "university": "extracted university name",
        "internships": ["list of internships"],
        "programming_languages": ["list of programming languages"],
        "technical_skills": ["list of technical skills"],
        "job_titles": ["list of job titles"],
        "certifications": ["list of certifications"],
        "ml_frameworks": ["list of ML frameworks"],
        "visualization_tools": ["list of visualization tools"],
        "statistical_tools": ["list of statistical tools"],
        "big_data_tools": ["list of big data tools"],
        "cloud_platforms": ["list of cloud platforms"],
        "deep_learning": ["list of deep learning skills"],
        "nlp_skills": ["list of NLP skills"],
        "computer_vision": ["list of computer vision skills"],
        "databases": ["list of databases"],
        "etl_tools": ["list of ETL tools"],
        "data_warehousing": ["list of data warehousing tools"],
        "orchestration_tools": ["list of orchestration tools"],
        "streaming_tech": ["list of streaming technologies"],
        "data_modeling": ["list of data modeling skills"],
        "data_governance": ["list of data governance experience"],
        "data_quality_tools": ["list of data quality tools"],
        "projects": ["list of projects"],
        "publications": ["list of publications"],
        "research_experience": "yes/no or details of research experience",
        "hackathons": ["list of hackathons"],
        "awards": ["list of awards"],
        "soft_skills": ["list of soft skills"],
        "domain_expertise": ["list of domain expertise"],
        "languages": ["list of languages"],
        "leadership_experience": "yes/no or details of leadership experience",
        "team_size": "number of people managed",
        "code_quality": ["list of code quality metrics"],
        "project_impact": ["list of project impact metrics"],
        "performance_improvements": ["list of performance improvements"],
        "version_control": ["list of version control systems"],
        "ci_cd_tools": ["list of CI/CD tools"],
        "testing_frameworks": ["list of testing frameworks"],
        "agile_experience": "yes/no or details of agile experience",
        "system_architecture": ["list of system architecture experience"],
        "business_domain": ["list of business domains"],
        "industry_certifications": ["list of industry certifications"],
        "domain_tools": ["list of domain-specific tools"],
        "compliance_knowledge": ["list of compliance knowledge"],
        "confidence_score": 0.95
        }
        For any field where information is not found:
        - Use "Not found" for string fields
        - Use [] for list fields
        - Use 0 for numeric fields
        - Use "no" for yes/no fields
        Ensure all fields are included in the response, even if empty."""
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text}
            ],
            temperature=0.2
        )
        # Get the response content and parse it safely.
        result_str = response['choices'][0]['message']['content'].strip()
        result = _parse_model_dict(result_str)
        # Ensure all required fields are present with default values.
        required_fields = {
            'name': 'Not found',
            'email': 'Not found',
            'phone': 'Not found',
            'location': 'Not found',
            'linkedin_url': 'Not found',
            'github_url': 'Not found',
            'portfolio_url': 'Not found',
            'cgpa': 'Not found',
            'years_experience': 0,
            'education_level': 'Not found',
            'major': 'Not found',
            'university': 'Not found',
            'programming_languages': [],
            'technical_skills': [],
            'job_titles': [],
            'certifications': [],
            'ml_frameworks': [],
            'visualization_tools': [],
            'projects': [],
            'publications': [],
            'research_experience': 'no',
            'awards': [],
            'leadership_experience': 'no',
            'team_size': 0
        }
        for field, default_value in required_fields.items():
            if field not in result or result[field] is None:
                result[field] = default_value
            elif isinstance(default_value, list) and not isinstance(result[field], list):
                # Wrap a stray scalar in a list; treat "Not found" as empty.
                result[field] = [result[field]] if result[field] != "Not found" else []
        return result
    except Exception as e:
        st.error(f"Error analyzing resume: {str(e)}")
        return None
def _pie_chart(distribution, title):
    """Render a pie chart for a {label: count} mapping; no-op when empty."""
    if distribution:
        fig = px.pie(
            values=list(distribution.values()),
            names=list(distribution.keys()),
            title=title
        )
        st.plotly_chart(fig, use_container_width=True)


def _bar_chart(counts, title):
    """Render a bar chart for a {label: count} mapping; no-op when empty."""
    if counts:
        fig = px.bar(
            x=list(counts.keys()),
            y=list(counts.values()),
            title=title
        )
        st.plotly_chart(fig, use_container_width=True)


def display_statistics():
    """Display statistics and visualizations of the analyzed resumes.

    Reads aggregate figures from the database and renders headline
    metrics plus pie/bar charts per skill category. The repeated
    chart-building code is factored into _pie_chart/_bar_chart.
    """
    stats = db.get_statistics()
    st.subheader("📊 Resume Analysis Statistics")
    # Basic stats in three columns
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Resumes Analyzed", stats['total_resumes'])
    with col2:
        st.metric("Average Years of Experience", f"{stats['avg_work_experience']:.1f}")
    with col3:
        st.metric("Universities Represented", len(stats['university_distribution']))
    # Education Distribution
    st.subheader("🎓 Education Statistics")
    col1, col2 = st.columns(2)
    with col1:
        _pie_chart(stats['education_levels'], "Education Levels")
    with col2:
        _pie_chart(stats['major_distribution'], "Major Distribution")
    # Technical Skills Section
    st.subheader("💻 Technical Expertise")
    col1, col2 = st.columns(2)
    with col1:
        _bar_chart(stats['top_programming_languages'], "Top Programming Languages")
    with col2:
        _bar_chart(stats['top_technical_skills'], "Top Technical Skills")
    # Data Science Specific Skills
    st.subheader("🔬 Data Science Expertise")
    col1, col2 = st.columns(2)
    with col1:
        _bar_chart(stats['top_ml_frameworks'], "Top ML Frameworks")
    with col2:
        _bar_chart(stats['top_visualization_tools'], "Top Visualization Tools")
    # Data Engineering Specific Skills
    st.subheader("⚙️ Data Engineering Expertise")
    col1, col2, col3 = st.columns(3)
    with col1:
        _bar_chart(stats['top_databases'], "Top Databases")
    with col2:
        _bar_chart(stats['top_etl_tools'], "Top ETL Tools")
    with col3:
        _bar_chart(stats['top_streaming_tech'], "Top Streaming Technologies")
    # Cloud & Big Data
    st.subheader("☁️ Cloud & Big Data Expertise")
    col1, col2 = st.columns(2)
    with col1:
        _bar_chart(stats['top_cloud_platforms'], "Top Cloud Platforms")
    with col2:
        _bar_chart(stats['top_certifications'], "Top Certifications")
def display_rankings():
    """Show ranked candidates, filterable by role type and minimum score."""
    st.subheader("🏆 Candidate Rankings")
    # Role selection — human-readable labels for the stored role keys.
    role_labels = {
        "both": "Both Roles",
        "data_science": "Data Science",
        "data_engineering": "Data Engineering",
    }
    role_type = st.selectbox(
        "Select Role Type",
        ["both", "data_science", "data_engineering"],
        format_func=role_labels.__getitem__
    )
    # Minimum score filter
    min_score = st.slider("Minimum Score", 0, 100, 50)
    rankings = db.get_candidate_rankings(role_type, min_score)
    if not rankings:
        st.warning("No candidates found matching the criteria.")
        return
    st.write(f"Found {len(rankings)} candidates matching the criteria")
    # One expander per candidate: score chart on the left, details right.
    for rank, candidate in enumerate(rankings, start=1):
        header = f"#{rank}: {candidate['name']} - Score: {candidate['total_score']:.1f}"
        with st.expander(header):
            left, right = st.columns(2)
            with left:
                st.write("📊 Score Breakdown")
                component_names = ['Education', 'Experience', 'Technical',
                                   'Projects', 'Impact', 'Role Specific']
                component_keys = ['education_score', 'experience_score',
                                  'technical_score', 'project_score',
                                  'impact_score', 'role_specific_score']
                values = [candidate[key] for key in component_keys]
                fig = go.Figure()
                fig.add_trace(go.Bar(
                    x=component_names,
                    y=values,
                    text=[f"{value:.1f}" for value in values],
                    textposition='auto',
                ))
                fig.update_layout(
                    title="Score Components",
                    showlegend=False,
                    height=300
                )
                st.plotly_chart(fig, use_container_width=True)
            with right:
                st.write("👤 Candidate Information")
                st.write(f"Email: {candidate['email']}")
                st.write(f"Experience: {candidate['years_experience']}")
                st.write(f"Education: {candidate['education_level']}")
                st.write("Key Skills:")
                for skill in candidate['key_skills']:
                    st.write(f"- {skill}")
def _bullet_list(items):
    """Write each item as a bullet line, or '- Not found' when empty."""
    if items:
        for item in items:
            st.write(f"- {item}")
    else:
        st.write("- Not found")


def _extract_uploaded_text(uploaded_file):
    """Persist the upload to a temp file and extract its text.

    Returns the extracted text, or None when the extension is unsupported
    or extraction fails.
    """
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        tmp_file_path = tmp_file.name
    try:
        file_extension = uploaded_file.name.split('.')[-1].lower()
        # Initialize text so an unexpected extension yields None instead
        # of the NameError the original branch-only assignment allowed.
        text = None
        if file_extension == 'pdf':
            text = extract_text_from_pdf(tmp_file_path)
        elif file_extension == 'docx':
            text = extract_text_from_docx(tmp_file_path)
        return text
    finally:
        # Clean up the temporary file even if extraction raises; the
        # original only unlinked on the success path.
        os.unlink(tmp_file_path)


def _show_scores(scores):
    """Display the candidate's total and per-component scores."""
    st.subheader("📊 Candidate Score")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Score", f"{scores['total_score']:.1f}/100")
    with col2:
        st.metric("Technical Score", f"{scores['technical_score']:.1f}/20")
    with col3:
        st.metric("Experience Score", f"{scores['experience_score']:.1f}/20")
    # Detailed score breakdown chart; each tuple is (label, score, max).
    fig = go.Figure()
    score_components = [
        ('Education', scores['education_score'], 20),
        ('Experience', scores['experience_score'], 20),
        ('Technical', scores['technical_score'], 20),
        ('Projects', scores['project_score'], 15),
        ('Impact', scores['impact_score'], 15),
        ('Role Specific', scores['role_specific_score'], 10)
    ]
    fig.add_trace(go.Bar(
        name='Score',
        x=[s[0] for s in score_components],
        y=[s[1] for s in score_components],
        text=[f"{s[1]:.1f}/{s[2]}" for s in score_components],
        textposition='auto',
    ))
    fig.update_layout(
        title="Score Breakdown",
        yaxis_title="Points",
        showlegend=False
    )
    st.plotly_chart(fig, use_container_width=True)


def _show_analysis(analysis_result):
    """Display the fields extracted from the resume."""
    st.subheader("Analysis Results")
    col1, col2 = st.columns(2)
    with col1:
        st.write("📚 Education")
        st.write(f"CGPA: {analysis_result.get('cgpa', 'Not found')}")
        st.write(f"Education Level: {analysis_result.get('education_level', 'Not found')}")
        st.write(f"Major: {analysis_result.get('major', 'Not found')}")
        st.write(f"University: {analysis_result.get('university', 'Not found')}")
        st.write("💼 Experience")
        st.write(f"Years of Experience: {analysis_result.get('years_experience', 'Not found')}")
        st.write("Job Titles:")
        _bullet_list(analysis_result.get('job_titles', []))
    with col2:
        st.write("🔧 Technical Skills")
        st.write("Programming Languages:")
        _bullet_list(analysis_result.get('programming_languages', []))
        st.write("Data Science Skills:")
        ds_skills = (
            analysis_result.get('ml_frameworks', []) +
            analysis_result.get('deep_learning', []) +
            analysis_result.get('nlp_skills', []) +
            analysis_result.get('computer_vision', []) +
            analysis_result.get('statistical_tools', []) +
            analysis_result.get('visualization_tools', [])
        )
        _bullet_list(ds_skills)
        st.write("Data Engineering Skills:")
        de_skills = (
            analysis_result.get('databases', []) +
            analysis_result.get('etl_tools', []) +
            analysis_result.get('data_warehousing', []) +
            analysis_result.get('orchestration_tools', []) +
            analysis_result.get('streaming_tech', []) +
            analysis_result.get('data_modeling', [])
        )
        _bullet_list(de_skills)
        st.write("Cloud & Tools:")
        cloud_tools = (
            analysis_result.get('cloud_platforms', []) +
            analysis_result.get('ci_cd_tools', []) +
            analysis_result.get('version_control', [])
        )
        _bullet_list(cloud_tools)
    # Additional Information
    st.write("📜 Additional Information")
    col3, col4 = st.columns(2)
    with col3:
        st.write("Certifications:")
        _bullet_list(analysis_result.get('certifications', []))
        st.write("Projects:")
        _bullet_list(analysis_result.get('projects', []))
    with col4:
        st.write("Publications & Research:")
        _bullet_list(analysis_result.get('publications', []))
        # str() guards: the model can return non-strings here, and
        # calling .lower() on a list/bool raised AttributeError before.
        research_exp = analysis_result.get('research_experience', 'no')
        if str(research_exp).lower() != 'no':
            st.write("Research Experience:", research_exp)
        leadership = analysis_result.get('leadership_experience', 'no')
        if str(leadership).lower() != 'no':
            st.write("Leadership Experience:", leadership)
        team_size = analysis_result.get('team_size', 0)
        if team_size:
            st.write(f"Team Size Managed: {team_size}")


def _render_upload_page():
    """Handle resume upload, analysis, scoring and result display."""
    st.write("Upload resumes in PDF or DOCX format for analysis")
    # Check for API key before accepting any work.
    if not os.getenv('OPENAI_API_KEY'):
        st.error("Please set your OpenAI API key in the .env file")
        return
    uploaded_file = st.file_uploader(
        "Choose a resume file",
        type=['pdf', 'docx'],
        help="Upload a resume in PDF or DOCX format"
    )
    if not uploaded_file:
        return
    with st.spinner("Processing resume..."):
        text = _extract_uploaded_text(uploaded_file)
        if not text:
            return
        analysis_result = analyze_resume(text)
        if not analysis_result:
            return
        # Persist, score, and display the analysis.
        db.save_analysis(analysis_result, text)
        st.success("Resume analyzed successfully!")
        scores = db.calculate_score(analysis_result)
        _show_scores(scores)
        _show_analysis(analysis_result)


def _render_export_page():
    """Offer CSV and JSON downloads of the stored analyses."""
    st.subheader("Export Data")
    col1, col2 = st.columns(2)
    with col1:
        if st.button("Export to CSV"):
            csv_path = db.export_to_csv()
            with open(csv_path, 'rb') as f:
                st.download_button(
                    label="Download CSV",
                    data=f,
                    file_name="resume_analyses.csv",
                    mime="text/csv"
                )
    with col2:
        if st.button("Export to JSON"):
            json_path = db.export_to_json()
            with open(json_path, 'rb') as f:
                st.download_button(
                    label="Download JSON",
                    data=f,
                    file_name="resume_analyses.json",
                    mime="application/json"
                )


def main():
    """Entry point: route sidebar navigation to the page renderers."""
    st.title("Resume Analyzer")
    # Sidebar navigation
    page = st.sidebar.selectbox(
        "Choose a page",
        ["Upload Resume", "View Statistics", "View Rankings", "Export Data"]
    )
    if page == "Upload Resume":
        _render_upload_page()
    elif page == "View Statistics":
        display_statistics()
    elif page == "View Rankings":
        display_rankings()
    else:  # Export Data page
        _render_export_page()
# Script entry point: launch the Streamlit app when run directly.
if __name__ == "__main__":
    main()