# Resume-Analyzer / app.py
# (Hugging Face Space page header preserved as a comment:
#  uploaded by Deepakkori45, commit 4c63be3 "Update app.py", verified)
import streamlit as st
import pandas as pd
import os
from datetime import datetime
import docx2txt
# Try importing Document from python-docx, but don't fail if not available
try:
from docx import Document
DOCX_AVAILABLE = True
except ImportError:
DOCX_AVAILABLE = False
from PyPDF2 import PdfReader
import openai
from dotenv import load_dotenv
import tempfile
import plotly.express as px
import plotly.graph_objects as go
from database import ResumeDatabase
# Load environment variables
load_dotenv()
# Initialize OpenAI API key and database
# NOTE(review): the key may be None here; the upload page re-checks
# OPENAI_API_KEY before any API call is made.
openai.api_key = os.getenv('OPENAI_API_KEY')
db = ResumeDatabase()  # module-level database handle shared by all pages
def extract_text_from_pdf(file):
    """Extract plain text from a PDF file.

    Args:
        file: A file path or binary file-like object accepted by PdfReader.

    Returns:
        The concatenated text of all pages, or None if extraction fails
        (the error is reported via Streamlit).
    """
    try:
        pdf_reader = PdfReader(file)
        # extract_text() may return None for pages without a text layer;
        # coalesce to "" so the join cannot raise TypeError. join() also
        # avoids the quadratic string `+=` of the original loop.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {str(e)}")
        return None
def extract_text_from_docx(file):
    """Extract plain text from a DOCX file.

    Tries docx2txt first; if that raises and python-docx is importable,
    falls back to concatenating paragraph text via Document.

    Returns:
        The extracted text, or None when no extractor succeeds (errors
        are reported via Streamlit).
    """
    try:
        try:
            return docx2txt.process(file)
        except Exception:
            # docx2txt failed — fall back to python-docx when available.
            if not DOCX_AVAILABLE:
                st.error("Could not process DOCX file. Please ensure python-docx is installed.")
                return None
            paragraphs = Document(file).paragraphs
            return "".join(p.text + "\n" for p in paragraphs)
    except Exception as e:
        st.error(f"Error extracting text from DOCX: {str(e)}")
        return None
def _parse_model_dict(result_str):
    """Safely parse the model's dict-literal reply without using eval()."""
    import ast
    # Strip markdown code fences the model sometimes wraps around the dict.
    cleaned = result_str.replace('```python', '').replace('```', '').strip()
    # SECURITY: the original code ran eval() on raw model output, which
    # permits arbitrary code execution. literal_eval accepts only Python
    # literals and raises ValueError/SyntaxError on anything else, which
    # the caller's error handler reports.
    return ast.literal_eval(cleaned)


def analyze_resume(text):
    """Analyze resume text using the OpenAI API.

    Sends the resume to gpt-3.5-turbo with a structured-extraction prompt,
    parses the returned dict literal, and backfills missing fields with
    defaults.

    Args:
        text: Raw resume text.

    Returns:
        A dict of extracted candidate fields, or None when the request or
        parsing fails (the error is reported via Streamlit).
    """
    try:
        system_prompt = """You are a professional resume analyzer specializing in Data Science and Data Engineering roles.
        Analyze the following resume and extract key information in a structured format.
        Return ONLY a Python dictionary with the following keys (no other text):
        {
        "name": "extracted name",
        "email": "extracted email",
        "phone": "extracted phone",
        "location": "extracted location",
        "linkedin_url": "extracted LinkedIn URL",
        "github_url": "extracted GitHub URL",
        "portfolio_url": "extracted portfolio URL",
        "cgpa": "extracted CGPA",
        "years_experience": "extracted years of experience as a number",
        "education_level": "highest education level",
        "major": "extracted major/field of study",
        "university": "extracted university name",
        "internships": ["list of internships"],
        "programming_languages": ["list of programming languages"],
        "technical_skills": ["list of technical skills"],
        "job_titles": ["list of job titles"],
        "certifications": ["list of certifications"],
        "ml_frameworks": ["list of ML frameworks"],
        "visualization_tools": ["list of visualization tools"],
        "statistical_tools": ["list of statistical tools"],
        "big_data_tools": ["list of big data tools"],
        "cloud_platforms": ["list of cloud platforms"],
        "deep_learning": ["list of deep learning skills"],
        "nlp_skills": ["list of NLP skills"],
        "computer_vision": ["list of computer vision skills"],
        "databases": ["list of databases"],
        "etl_tools": ["list of ETL tools"],
        "data_warehousing": ["list of data warehousing tools"],
        "orchestration_tools": ["list of orchestration tools"],
        "streaming_tech": ["list of streaming technologies"],
        "data_modeling": ["list of data modeling skills"],
        "data_governance": ["list of data governance experience"],
        "data_quality_tools": ["list of data quality tools"],
        "projects": ["list of projects"],
        "publications": ["list of publications"],
        "research_experience": "yes/no or details of research experience",
        "hackathons": ["list of hackathons"],
        "awards": ["list of awards"],
        "soft_skills": ["list of soft skills"],
        "domain_expertise": ["list of domain expertise"],
        "languages": ["list of languages"],
        "leadership_experience": "yes/no or details of leadership experience",
        "team_size": "number of people managed",
        "code_quality": ["list of code quality metrics"],
        "project_impact": ["list of project impact metrics"],
        "performance_improvements": ["list of performance improvements"],
        "version_control": ["list of version control systems"],
        "ci_cd_tools": ["list of CI/CD tools"],
        "testing_frameworks": ["list of testing frameworks"],
        "agile_experience": "yes/no or details of agile experience",
        "system_architecture": ["list of system architecture experience"],
        "business_domain": ["list of business domains"],
        "industry_certifications": ["list of industry certifications"],
        "domain_tools": ["list of domain-specific tools"],
        "compliance_knowledge": ["list of compliance knowledge"],
        "confidence_score": 0.95
        }
        For any field where information is not found:
        - Use "Not found" for string fields
        - Use [] for list fields
        - Use 0 for numeric fields
        - Use "no" for yes/no fields
        Ensure all fields are included in the response, even if empty."""
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text}
            ],
            temperature=0.2
        )
        # Get the response content and parse it safely.
        result_str = response['choices'][0]['message']['content'].strip()
        result = _parse_model_dict(result_str)
        # Ensure all required fields are present with default values.
        required_fields = {
            'name': 'Not found',
            'email': 'Not found',
            'phone': 'Not found',
            'location': 'Not found',
            'linkedin_url': 'Not found',
            'github_url': 'Not found',
            'portfolio_url': 'Not found',
            'cgpa': 'Not found',
            'years_experience': 0,
            'education_level': 'Not found',
            'major': 'Not found',
            'university': 'Not found',
            'programming_languages': [],
            'technical_skills': [],
            'job_titles': [],
            'certifications': [],
            'ml_frameworks': [],
            'visualization_tools': [],
            'projects': [],
            'publications': [],
            'research_experience': 'no',
            'awards': [],
            'leadership_experience': 'no',
            'team_size': 0
        }
        for field, default_value in required_fields.items():
            if field not in result or result[field] is None:
                result[field] = default_value
            elif isinstance(default_value, list) and not isinstance(result[field], list):
                # Wrap a stray scalar in a list; treat "Not found" as empty.
                result[field] = [result[field]] if result[field] != "Not found" else []
        return result
    except Exception as e:
        st.error(f"Error analyzing resume: {str(e)}")
        return None
def _pie_chart(distribution, title):
    """Render a pie chart for a {label: count} mapping; no-op when empty."""
    if distribution:
        fig = px.pie(
            values=list(distribution.values()),
            names=list(distribution.keys()),
            title=title
        )
        st.plotly_chart(fig, use_container_width=True)


def _bar_chart(counts, title):
    """Render a bar chart for a {label: count} mapping; no-op when empty."""
    if counts:
        fig = px.bar(
            x=list(counts.keys()),
            y=list(counts.values()),
            title=title
        )
        st.plotly_chart(fig, use_container_width=True)


def display_statistics():
    """Display statistics and visualizations of the analyzed resumes.

    Reads aggregate figures from the database and renders headline
    metrics plus pie/bar charts per skill category. The repeated
    chart-building code is factored into _pie_chart/_bar_chart.
    """
    stats = db.get_statistics()
    st.subheader("📊 Resume Analysis Statistics")
    # Basic stats in three columns
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Resumes Analyzed", stats['total_resumes'])
    with col2:
        st.metric("Average Years of Experience", f"{stats['avg_work_experience']:.1f}")
    with col3:
        st.metric("Universities Represented", len(stats['university_distribution']))
    # Education Distribution
    st.subheader("🎓 Education Statistics")
    col1, col2 = st.columns(2)
    with col1:
        _pie_chart(stats['education_levels'], "Education Levels")
    with col2:
        _pie_chart(stats['major_distribution'], "Major Distribution")
    # Technical Skills Section
    st.subheader("💻 Technical Expertise")
    col1, col2 = st.columns(2)
    with col1:
        _bar_chart(stats['top_programming_languages'], "Top Programming Languages")
    with col2:
        _bar_chart(stats['top_technical_skills'], "Top Technical Skills")
    # Data Science Specific Skills
    st.subheader("🔬 Data Science Expertise")
    col1, col2 = st.columns(2)
    with col1:
        _bar_chart(stats['top_ml_frameworks'], "Top ML Frameworks")
    with col2:
        _bar_chart(stats['top_visualization_tools'], "Top Visualization Tools")
    # Data Engineering Specific Skills
    st.subheader("⚙️ Data Engineering Expertise")
    col1, col2, col3 = st.columns(3)
    with col1:
        _bar_chart(stats['top_databases'], "Top Databases")
    with col2:
        _bar_chart(stats['top_etl_tools'], "Top ETL Tools")
    with col3:
        _bar_chart(stats['top_streaming_tech'], "Top Streaming Technologies")
    # Cloud & Big Data
    st.subheader("☁️ Cloud & Big Data Expertise")
    col1, col2 = st.columns(2)
    with col1:
        _bar_chart(stats['top_cloud_platforms'], "Top Cloud Platforms")
    with col2:
        _bar_chart(stats['top_certifications'], "Top Certifications")
def display_rankings():
    """Show ranked candidates, filterable by role type and minimum score."""
    st.subheader("🏆 Candidate Rankings")
    # Role selection — human-readable labels for the stored role keys.
    role_labels = {
        "both": "Both Roles",
        "data_science": "Data Science",
        "data_engineering": "Data Engineering",
    }
    role_type = st.selectbox(
        "Select Role Type",
        ["both", "data_science", "data_engineering"],
        format_func=role_labels.__getitem__
    )
    # Minimum score filter
    min_score = st.slider("Minimum Score", 0, 100, 50)
    rankings = db.get_candidate_rankings(role_type, min_score)
    if not rankings:
        st.warning("No candidates found matching the criteria.")
        return
    st.write(f"Found {len(rankings)} candidates matching the criteria")
    # One expander per candidate: score chart on the left, details right.
    for rank, candidate in enumerate(rankings, start=1):
        header = f"#{rank}: {candidate['name']} - Score: {candidate['total_score']:.1f}"
        with st.expander(header):
            left, right = st.columns(2)
            with left:
                st.write("📊 Score Breakdown")
                component_names = ['Education', 'Experience', 'Technical',
                                   'Projects', 'Impact', 'Role Specific']
                component_keys = ['education_score', 'experience_score',
                                  'technical_score', 'project_score',
                                  'impact_score', 'role_specific_score']
                values = [candidate[key] for key in component_keys]
                fig = go.Figure()
                fig.add_trace(go.Bar(
                    x=component_names,
                    y=values,
                    text=[f"{value:.1f}" for value in values],
                    textposition='auto',
                ))
                fig.update_layout(
                    title="Score Components",
                    showlegend=False,
                    height=300
                )
                st.plotly_chart(fig, use_container_width=True)
            with right:
                st.write("👤 Candidate Information")
                st.write(f"Email: {candidate['email']}")
                st.write(f"Experience: {candidate['years_experience']}")
                st.write(f"Education: {candidate['education_level']}")
                st.write("Key Skills:")
                for skill in candidate['key_skills']:
                    st.write(f"- {skill}")
def _bullet_list(items):
    """Write each item as a bullet line, or '- Not found' when empty."""
    if items:
        for item in items:
            st.write(f"- {item}")
    else:
        st.write("- Not found")


def _extract_uploaded_text(uploaded_file):
    """Persist the upload to a temp file and extract its text.

    Returns the extracted text, or None when the extension is unsupported
    or extraction fails.
    """
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        tmp_file_path = tmp_file.name
    try:
        file_extension = uploaded_file.name.split('.')[-1].lower()
        # Initialize text so an unexpected extension yields None instead
        # of the NameError the original branch-only assignment allowed.
        text = None
        if file_extension == 'pdf':
            text = extract_text_from_pdf(tmp_file_path)
        elif file_extension == 'docx':
            text = extract_text_from_docx(tmp_file_path)
        return text
    finally:
        # Clean up the temporary file even if extraction raises; the
        # original only unlinked on the success path.
        os.unlink(tmp_file_path)


def _show_scores(scores):
    """Display the candidate's total and per-component scores."""
    st.subheader("📊 Candidate Score")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Score", f"{scores['total_score']:.1f}/100")
    with col2:
        st.metric("Technical Score", f"{scores['technical_score']:.1f}/20")
    with col3:
        st.metric("Experience Score", f"{scores['experience_score']:.1f}/20")
    # Detailed score breakdown chart; each tuple is (label, score, max).
    fig = go.Figure()
    score_components = [
        ('Education', scores['education_score'], 20),
        ('Experience', scores['experience_score'], 20),
        ('Technical', scores['technical_score'], 20),
        ('Projects', scores['project_score'], 15),
        ('Impact', scores['impact_score'], 15),
        ('Role Specific', scores['role_specific_score'], 10)
    ]
    fig.add_trace(go.Bar(
        name='Score',
        x=[s[0] for s in score_components],
        y=[s[1] for s in score_components],
        text=[f"{s[1]:.1f}/{s[2]}" for s in score_components],
        textposition='auto',
    ))
    fig.update_layout(
        title="Score Breakdown",
        yaxis_title="Points",
        showlegend=False
    )
    st.plotly_chart(fig, use_container_width=True)


def _show_analysis(analysis_result):
    """Display the fields extracted from the resume."""
    st.subheader("Analysis Results")
    col1, col2 = st.columns(2)
    with col1:
        st.write("📚 Education")
        st.write(f"CGPA: {analysis_result.get('cgpa', 'Not found')}")
        st.write(f"Education Level: {analysis_result.get('education_level', 'Not found')}")
        st.write(f"Major: {analysis_result.get('major', 'Not found')}")
        st.write(f"University: {analysis_result.get('university', 'Not found')}")
        st.write("💼 Experience")
        st.write(f"Years of Experience: {analysis_result.get('years_experience', 'Not found')}")
        st.write("Job Titles:")
        _bullet_list(analysis_result.get('job_titles', []))
    with col2:
        st.write("🔧 Technical Skills")
        st.write("Programming Languages:")
        _bullet_list(analysis_result.get('programming_languages', []))
        st.write("Data Science Skills:")
        ds_skills = (
            analysis_result.get('ml_frameworks', []) +
            analysis_result.get('deep_learning', []) +
            analysis_result.get('nlp_skills', []) +
            analysis_result.get('computer_vision', []) +
            analysis_result.get('statistical_tools', []) +
            analysis_result.get('visualization_tools', [])
        )
        _bullet_list(ds_skills)
        st.write("Data Engineering Skills:")
        de_skills = (
            analysis_result.get('databases', []) +
            analysis_result.get('etl_tools', []) +
            analysis_result.get('data_warehousing', []) +
            analysis_result.get('orchestration_tools', []) +
            analysis_result.get('streaming_tech', []) +
            analysis_result.get('data_modeling', [])
        )
        _bullet_list(de_skills)
        st.write("Cloud & Tools:")
        cloud_tools = (
            analysis_result.get('cloud_platforms', []) +
            analysis_result.get('ci_cd_tools', []) +
            analysis_result.get('version_control', [])
        )
        _bullet_list(cloud_tools)
    # Additional Information
    st.write("📜 Additional Information")
    col3, col4 = st.columns(2)
    with col3:
        st.write("Certifications:")
        _bullet_list(analysis_result.get('certifications', []))
        st.write("Projects:")
        _bullet_list(analysis_result.get('projects', []))
    with col4:
        st.write("Publications & Research:")
        _bullet_list(analysis_result.get('publications', []))
        # str() guards: the model can return non-strings here, and
        # calling .lower() on a list/bool raised AttributeError before.
        research_exp = analysis_result.get('research_experience', 'no')
        if str(research_exp).lower() != 'no':
            st.write("Research Experience:", research_exp)
        leadership = analysis_result.get('leadership_experience', 'no')
        if str(leadership).lower() != 'no':
            st.write("Leadership Experience:", leadership)
        team_size = analysis_result.get('team_size', 0)
        if team_size:
            st.write(f"Team Size Managed: {team_size}")


def _render_upload_page():
    """Handle resume upload, analysis, scoring and result display."""
    st.write("Upload resumes in PDF or DOCX format for analysis")
    # Check for API key before accepting any work.
    if not os.getenv('OPENAI_API_KEY'):
        st.error("Please set your OpenAI API key in the .env file")
        return
    uploaded_file = st.file_uploader(
        "Choose a resume file",
        type=['pdf', 'docx'],
        help="Upload a resume in PDF or DOCX format"
    )
    if not uploaded_file:
        return
    with st.spinner("Processing resume..."):
        text = _extract_uploaded_text(uploaded_file)
        if not text:
            return
        analysis_result = analyze_resume(text)
        if not analysis_result:
            return
        # Persist, score, and display the analysis.
        db.save_analysis(analysis_result, text)
        st.success("Resume analyzed successfully!")
        scores = db.calculate_score(analysis_result)
        _show_scores(scores)
        _show_analysis(analysis_result)


def _render_export_page():
    """Offer CSV and JSON downloads of the stored analyses."""
    st.subheader("Export Data")
    col1, col2 = st.columns(2)
    with col1:
        if st.button("Export to CSV"):
            csv_path = db.export_to_csv()
            with open(csv_path, 'rb') as f:
                st.download_button(
                    label="Download CSV",
                    data=f,
                    file_name="resume_analyses.csv",
                    mime="text/csv"
                )
    with col2:
        if st.button("Export to JSON"):
            json_path = db.export_to_json()
            with open(json_path, 'rb') as f:
                st.download_button(
                    label="Download JSON",
                    data=f,
                    file_name="resume_analyses.json",
                    mime="application/json"
                )


def main():
    """Entry point: route sidebar navigation to the page renderers."""
    st.title("Resume Analyzer")
    # Sidebar navigation
    page = st.sidebar.selectbox(
        "Choose a page",
        ["Upload Resume", "View Statistics", "View Rankings", "Export Data"]
    )
    if page == "Upload Resume":
        _render_upload_page()
    elif page == "View Statistics":
        display_statistics()
    elif page == "View Rankings":
        display_rankings()
    else:  # Export Data page
        _render_export_page()
# Script entry point: launch the Streamlit app when run directly.
if __name__ == "__main__":
    main()