"""Streamlit resume-analyzer app.

Uploads PDF/DOCX resumes, extracts their text, asks the OpenAI chat API to
pull out structured candidate data, persists results through ResumeDatabase,
and renders statistics / rankings / export pages.
"""

import ast
import os
import tempfile
from datetime import datetime

import streamlit as st
import pandas as pd
import docx2txt

# Try importing Document from python-docx, but don't fail if not available;
# it is only used as a fallback DOCX parser.
try:
    from docx import Document
    DOCX_AVAILABLE = True
except ImportError:
    DOCX_AVAILABLE = False

from PyPDF2 import PdfReader
import openai
from dotenv import load_dotenv
import plotly.express as px
import plotly.graph_objects as go
from database import ResumeDatabase

# Load environment variables (expects OPENAI_API_KEY in a .env file)
load_dotenv()

# Initialize OpenAI API key and database
openai.api_key = os.getenv('OPENAI_API_KEY')
db = ResumeDatabase()


def extract_text_from_pdf(file):
    """Extract text from a PDF file.

    Args:
        file: A path or file-like object accepted by PyPDF2's PdfReader.

    Returns:
        The concatenated text of all pages, or None on failure (an error
        is shown in the Streamlit UI).
    """
    try:
        pdf_reader = PdfReader(file)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
        return text
    except Exception as e:
        st.error(f"Error extracting text from PDF: {str(e)}")
        return None


def extract_text_from_docx(file):
    """Extract text from a DOCX file.

    Tries docx2txt first; if that fails and python-docx is installed,
    falls back to reading paragraphs via python-docx.

    Args:
        file: A path or file-like object for the DOCX document.

    Returns:
        The extracted text, or None on failure (error shown in the UI).
    """
    try:
        # Try using docx2txt first
        try:
            text = docx2txt.process(file)
            return text
        except Exception:
            # If docx2txt fails and python-docx is available, try that
            if DOCX_AVAILABLE:
                doc = Document(file)
                text = ""
                for paragraph in doc.paragraphs:
                    text += paragraph.text + "\n"
                return text
            else:
                st.error("Could not process DOCX file. Please ensure python-docx is installed.")
                return None
    except Exception as e:
        st.error(f"Error extracting text from DOCX: {str(e)}")
        return None


def analyze_resume(text):
    """Analyze resume text using the OpenAI API.

    Sends the resume text to gpt-3.5-turbo with a system prompt that asks
    for a Python-dict-shaped response, parses that response safely, and
    back-fills defaults for any missing fields.

    Args:
        text: Raw resume text.

    Returns:
        A dict of extracted candidate fields, or None on failure
        (error shown in the UI).
    """
    try:
        system_prompt = """You are a professional resume analyzer specializing in Data Science and Data Engineering roles. Analyze the following resume and extract key information in a structured format.
Return ONLY a Python dictionary with the following keys (no other text): {
"name": "extracted name",
"email": "extracted email",
"phone": "extracted phone",
"location": "extracted location",
"linkedin_url": "extracted LinkedIn URL",
"github_url": "extracted GitHub URL",
"portfolio_url": "extracted portfolio URL",
"cgpa": "extracted CGPA",
"years_experience": "extracted years of experience as a number",
"education_level": "highest education level",
"major": "extracted major/field of study",
"university": "extracted university name",
"internships": ["list of internships"],
"programming_languages": ["list of programming languages"],
"technical_skills": ["list of technical skills"],
"job_titles": ["list of job titles"],
"certifications": ["list of certifications"],
"ml_frameworks": ["list of ML frameworks"],
"visualization_tools": ["list of visualization tools"],
"statistical_tools": ["list of statistical tools"],
"big_data_tools": ["list of big data tools"],
"cloud_platforms": ["list of cloud platforms"],
"deep_learning": ["list of deep learning skills"],
"nlp_skills": ["list of NLP skills"],
"computer_vision": ["list of computer vision skills"],
"databases": ["list of databases"],
"etl_tools": ["list of ETL tools"],
"data_warehousing": ["list of data warehousing tools"],
"orchestration_tools": ["list of orchestration tools"],
"streaming_tech": ["list of streaming technologies"],
"data_modeling": ["list of data modeling skills"],
"data_governance": ["list of data governance experience"],
"data_quality_tools": ["list of data quality tools"],
"projects": ["list of projects"],
"publications": ["list of publications"],
"research_experience": "yes/no or details of research experience",
"hackathons": ["list of hackathons"],
"awards": ["list of awards"],
"soft_skills": ["list of soft skills"],
"domain_expertise": ["list of domain expertise"],
"languages": ["list of languages"],
"leadership_experience": "yes/no or details of leadership experience",
"team_size": "number of people managed",
"code_quality": ["list of code quality metrics"],
"project_impact": ["list of project impact metrics"],
"performance_improvements": ["list of performance improvements"],
"version_control": ["list of version control systems"],
"ci_cd_tools": ["list of CI/CD tools"],
"testing_frameworks": ["list of testing frameworks"],
"agile_experience": "yes/no or details of agile experience",
"system_architecture": ["list of system architecture experience"],
"business_domain": ["list of business domains"],
"industry_certifications": ["list of industry certifications"],
"domain_tools": ["list of domain-specific tools"],
"compliance_knowledge": ["list of compliance knowledge"],
"confidence_score": 0.95
}
For any field where information is not found:
- Use "Not found" for string fields
- Use [] for list fields
- Use 0 for numeric fields
- Use "no" for yes/no fields
Ensure all fields are included in the response, even if empty."""

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text}
            ],
            temperature=0.2
        )

        # Get the response content
        result_str = response['choices'][0]['message']['content'].strip()

        # Clean up the response string to ensure it's valid Python dict syntax
        result_str = result_str.replace('```python', '').replace('```', '').strip()

        # SECURITY: the model's output is untrusted input; eval() would let it
        # execute arbitrary code. ast.literal_eval only accepts Python literals
        # (dicts, lists, strings, numbers), which is exactly the expected shape.
        result = ast.literal_eval(result_str)

        # Ensure all required fields are present with default values
        required_fields = {
            'name': 'Not found',
            'email': 'Not found',
            'phone': 'Not found',
            'location': 'Not found',
            'linkedin_url': 'Not found',
            'github_url': 'Not found',
            'portfolio_url': 'Not found',
            'cgpa': 'Not found',
            'years_experience': 0,
            'education_level': 'Not found',
            'major': 'Not found',
            'university': 'Not found',
            'programming_languages': [],
            'technical_skills': [],
            'job_titles': [],
            'certifications': [],
            'ml_frameworks': [],
            'visualization_tools': [],
            'projects': [],
            'publications': [],
            'research_experience': 'no',
            'awards': [],
            'leadership_experience': 'no',
            'team_size': 0
        }

        # Update with default values for missing fields; coerce scalars that
        # should have been lists into single-element lists.
        for field, default_value in required_fields.items():
            if field not in result or result[field] is None:
                result[field] = default_value
            elif isinstance(default_value, list) and not isinstance(result[field], list):
                result[field] = [result[field]] if result[field] != "Not found" else []

        return result
    except Exception as e:
        st.error(f"Error analyzing resume: {str(e)}")
        return None


def display_statistics():
    """Display statistics and visualizations of the analyzed resumes."""
    stats = db.get_statistics()

    st.subheader("📊 Resume Analysis Statistics")

    # Basic stats in three columns
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Resumes Analyzed", stats['total_resumes'])
    with col2:
        st.metric("Average Years of Experience", f"{stats['avg_work_experience']:.1f}")
    with col3:
        st.metric("Universities Represented", len(stats['university_distribution']))

    # Education Distribution
    st.subheader("🎓 Education Statistics")
    col1, col2 = st.columns(2)
    with col1:
        if stats['education_levels']:
            fig = px.pie(
                values=list(stats['education_levels'].values()),
                names=list(stats['education_levels'].keys()),
                title="Education Levels"
            )
            st.plotly_chart(fig, use_container_width=True)
    with col2:
        if stats['major_distribution']:
            fig = px.pie(
                values=list(stats['major_distribution'].values()),
                names=list(stats['major_distribution'].keys()),
                title="Major Distribution"
            )
            st.plotly_chart(fig, use_container_width=True)

    # Technical Skills Section
    st.subheader("💻 Technical Expertise")

    # Programming Languages and Technical Skills
    col1, col2 = st.columns(2)
    with col1:
        if stats['top_programming_languages']:
            fig = px.bar(
                x=list(stats['top_programming_languages'].keys()),
                y=list(stats['top_programming_languages'].values()),
                title="Top Programming Languages"
            )
            st.plotly_chart(fig, use_container_width=True)
    with col2:
        if stats['top_technical_skills']:
            fig = px.bar(
                x=list(stats['top_technical_skills'].keys()),
                y=list(stats['top_technical_skills'].values()),
                title="Top Technical Skills"
            )
            st.plotly_chart(fig, use_container_width=True)

    # Data Science Specific Skills
    st.subheader("🔬 Data Science Expertise")
    col1, col2 = st.columns(2)
    with col1:
        if stats['top_ml_frameworks']:
            fig = px.bar(
                x=list(stats['top_ml_frameworks'].keys()),
                y=list(stats['top_ml_frameworks'].values()),
                title="Top ML Frameworks"
            )
            st.plotly_chart(fig, use_container_width=True)
    with col2:
        if stats['top_visualization_tools']:
            fig = px.bar(
                x=list(stats['top_visualization_tools'].keys()),
                y=list(stats['top_visualization_tools'].values()),
                title="Top Visualization Tools"
            )
            st.plotly_chart(fig, use_container_width=True)

    # Data Engineering Specific Skills
    st.subheader("⚙️ Data Engineering Expertise")
    col1, col2, col3 = st.columns(3)
    with col1:
        if stats['top_databases']:
            fig = px.bar(
                x=list(stats['top_databases'].keys()),
                y=list(stats['top_databases'].values()),
                title="Top Databases"
            )
            st.plotly_chart(fig, use_container_width=True)
    with col2:
        if stats['top_etl_tools']:
            fig = px.bar(
                x=list(stats['top_etl_tools'].keys()),
                y=list(stats['top_etl_tools'].values()),
                title="Top ETL Tools"
            )
            st.plotly_chart(fig, use_container_width=True)
    with col3:
        if stats['top_streaming_tech']:
            fig = px.bar(
                x=list(stats['top_streaming_tech'].keys()),
                y=list(stats['top_streaming_tech'].values()),
                title="Top Streaming Technologies"
            )
            st.plotly_chart(fig, use_container_width=True)

    # Cloud & Big Data
    st.subheader("☁️ Cloud & Big Data Expertise")
    col1, col2 = st.columns(2)
    with col1:
        if stats['top_cloud_platforms']:
            fig = px.bar(
                x=list(stats['top_cloud_platforms'].keys()),
                y=list(stats['top_cloud_platforms'].values()),
                title="Top Cloud Platforms"
            )
            st.plotly_chart(fig, use_container_width=True)
    with col2:
        if stats['top_certifications']:
            fig = px.bar(
                x=list(stats['top_certifications'].keys()),
                y=list(stats['top_certifications'].values()),
                title="Top Certifications"
            )
            st.plotly_chart(fig, use_container_width=True)


def display_rankings():
    """Display candidate rankings with filtering options."""
    st.subheader("🏆 Candidate Rankings")

    # Role selection
    role_type = st.selectbox(
        "Select Role Type",
        ["both", "data_science", "data_engineering"],
        format_func=lambda x: {
            "both": "Both Roles",
            "data_science": "Data Science",
            "data_engineering": "Data Engineering"
        }[x]
    )

    # Minimum score filter
    min_score = st.slider("Minimum Score", 0, 100, 50)

    # Get rankings
    rankings = db.get_candidate_rankings(role_type, min_score)

    if not rankings:
        st.warning("No candidates found matching the criteria.")
        return

    # Display top candidates
    st.write(f"Found {len(rankings)} candidates matching the criteria")

    for i, candidate in enumerate(rankings, 1):
        with st.expander(f"#{i}: {candidate['name']} - Score: {candidate['total_score']:.1f}"):
            col1, col2 = st.columns(2)
            with col1:
                st.write("📊 Score Breakdown")
                fig = go.Figure()
                scores = [
                    ('Education', candidate['education_score']),
                    ('Experience', candidate['experience_score']),
                    ('Technical', candidate['technical_score']),
                    ('Projects', candidate['project_score']),
                    ('Impact', candidate['impact_score']),
                    ('Role Specific', candidate['role_specific_score'])
                ]
                fig.add_trace(go.Bar(
                    x=[s[0] for s in scores],
                    y=[s[1] for s in scores],
                    text=[f"{s[1]:.1f}" for s in scores],
                    textposition='auto',
                ))
                fig.update_layout(
                    title="Score Components",
                    showlegend=False,
                    height=300
                )
                st.plotly_chart(fig, use_container_width=True)
            with col2:
                st.write("👤 Candidate Information")
                st.write(f"Email: {candidate['email']}")
                st.write(f"Experience: {candidate['years_experience']}")
                st.write(f"Education: {candidate['education_level']}")
                st.write("Key Skills:")
                for skill in candidate['key_skills']:
                    st.write(f"- {skill}")


def main():
    """Entry point: route between the upload, statistics, rankings and export pages."""
    st.title("Resume Analyzer")

    # Sidebar navigation
    page = st.sidebar.selectbox(
        "Choose a page",
        ["Upload Resume", "View Statistics", "View Rankings", "Export Data"]
    )

    if page == "Upload Resume":
        st.write("Upload resumes in PDF or DOCX format for analysis")

        # Check for API key
        if not os.getenv('OPENAI_API_KEY'):
            st.error("Please set your OpenAI API key in the .env file")
            return

        uploaded_file = st.file_uploader(
            "Choose a resume file",
            type=['pdf', 'docx'],
            help="Upload a resume in PDF or DOCX format"
        )

        if uploaded_file:
            with st.spinner("Processing resume..."):
                # Create a temporary file so the extractors can read from a path
                with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
                    tmp_file.write(uploaded_file.getvalue())
                    tmp_file_path = tmp_file.name

                # Extract text based on file type; initialize so an unexpected
                # extension cannot raise NameError below.
                text = None
                file_extension = uploaded_file.name.split('.')[-1].lower()
                if file_extension == 'pdf':
                    text = extract_text_from_pdf(tmp_file_path)
                elif file_extension == 'docx':
                    text = extract_text_from_docx(tmp_file_path)

                # Clean up temporary file
                os.unlink(tmp_file_path)

                if text:
                    # Analyze the resume
                    analysis_result = analyze_resume(text)
                    if analysis_result:
                        # Save to database
                        db.save_analysis(analysis_result, text)
                        st.success("Resume analyzed successfully!")

                        # Calculate and display score
                        scores = db.calculate_score(analysis_result)
                        st.subheader("📊 Candidate Score")
                        col1, col2, col3 = st.columns(3)
                        with col1:
                            st.metric("Total Score", f"{scores['total_score']:.1f}/100")
                        with col2:
                            st.metric("Technical Score", f"{scores['technical_score']:.1f}/20")
                        with col3:
                            st.metric("Experience Score", f"{scores['experience_score']:.1f}/20")

                        # Display detailed scores
                        fig = go.Figure()
                        score_components = [
                            ('Education', scores['education_score'], 20),
                            ('Experience', scores['experience_score'], 20),
                            ('Technical', scores['technical_score'], 20),
                            ('Projects', scores['project_score'], 15),
                            ('Impact', scores['impact_score'], 15),
                            ('Role Specific', scores['role_specific_score'], 10)
                        ]
                        fig.add_trace(go.Bar(
                            name='Score',
                            x=[s[0] for s in score_components],
                            y=[s[1] for s in score_components],
                            text=[f"{s[1]:.1f}/{s[2]}" for s in score_components],
                            textposition='auto',
                        ))
                        fig.update_layout(
                            title="Score Breakdown",
                            yaxis_title="Points",
                            showlegend=False
                        )
                        st.plotly_chart(fig, use_container_width=True)

                        # Display analysis results
                        st.subheader("Analysis Results")
                        col1, col2 = st.columns(2)
                        with col1:
                            st.write("📚 Education")
                            st.write(f"CGPA: {analysis_result.get('cgpa', 'Not found')}")
                            st.write(f"Education Level: {analysis_result.get('education_level', 'Not found')}")
                            st.write(f"Major: {analysis_result.get('major', 'Not found')}")
                            st.write(f"University: {analysis_result.get('university', 'Not found')}")

                            st.write("💼 Experience")
                            st.write(f"Years of Experience: {analysis_result.get('years_experience', 'Not found')}")
                            st.write("Job Titles:")
                            job_titles = analysis_result.get('job_titles', [])
                            if job_titles:
                                for title in job_titles:
                                    st.write(f"- {title}")
                            else:
                                st.write("- Not found")
                        with col2:
                            st.write("🔧 Technical Skills")

                            # Programming Languages
                            st.write("Programming Languages:")
                            prog_langs = analysis_result.get('programming_languages', [])
                            if prog_langs:
                                for lang in prog_langs:
                                    st.write(f"- {lang}")
                            else:
                                st.write("- Not found")

                            # Data Science Skills
                            st.write("Data Science Skills:")
                            ds_skills = (
                                analysis_result.get('ml_frameworks', []) +
                                analysis_result.get('deep_learning', []) +
                                analysis_result.get('nlp_skills', []) +
                                analysis_result.get('computer_vision', []) +
                                analysis_result.get('statistical_tools', []) +
                                analysis_result.get('visualization_tools', [])
                            )
                            if ds_skills:
                                for skill in ds_skills:
                                    st.write(f"- {skill}")
                            else:
                                st.write("- Not found")

                            # Data Engineering Skills
                            st.write("Data Engineering Skills:")
                            de_skills = (
                                analysis_result.get('databases', []) +
                                analysis_result.get('etl_tools', []) +
                                analysis_result.get('data_warehousing', []) +
                                analysis_result.get('orchestration_tools', []) +
                                analysis_result.get('streaming_tech', []) +
                                analysis_result.get('data_modeling', [])
                            )
                            if de_skills:
                                for skill in de_skills:
                                    st.write(f"- {skill}")
                            else:
                                st.write("- Not found")

                            # Cloud & Tools
                            st.write("Cloud & Tools:")
                            cloud_tools = (
                                analysis_result.get('cloud_platforms', []) +
                                analysis_result.get('ci_cd_tools', []) +
                                analysis_result.get('version_control', [])
                            )
                            if cloud_tools:
                                for tool in cloud_tools:
                                    st.write(f"- {tool}")
                            else:
                                st.write("- Not found")

                        # Additional Information
                        st.write("📜 Additional Information")
                        col3, col4 = st.columns(2)
                        with col3:
                            # Certifications
                            st.write("Certifications:")
                            certs = analysis_result.get('certifications', [])
                            if certs:
                                for cert in certs:
                                    st.write(f"- {cert}")
                            else:
                                st.write("- Not found")

                            # Projects
                            st.write("Projects:")
                            projects = analysis_result.get('projects', [])
                            if projects:
                                for project in projects:
                                    st.write(f"- {project}")
                            else:
                                st.write("- Not found")
                        with col4:
                            # Publications & Research
                            st.write("Publications & Research:")
                            publications = analysis_result.get('publications', [])
                            if publications:
                                for pub in publications:
                                    st.write(f"- {pub}")
                            else:
                                st.write("- Not found")

                            # Coerce to str: the model may return a non-string here.
                            research_exp = analysis_result.get('research_experience', 'no')
                            if str(research_exp).lower() != 'no':
                                st.write("Research Experience:", research_exp)

                            # Leadership & Team Size
                            leadership = analysis_result.get('leadership_experience', 'no')
                            if str(leadership).lower() != 'no':
                                st.write("Leadership Experience:", leadership)
                                team_size = analysis_result.get('team_size', 0)
                                if team_size:
                                    st.write(f"Team Size Managed: {team_size}")

    elif page == "View Statistics":
        display_statistics()
    elif page == "View Rankings":
        display_rankings()
    else:
        # Export Data page
        st.subheader("Export Data")
        col1, col2 = st.columns(2)
        with col1:
            if st.button("Export to CSV"):
                csv_path = db.export_to_csv()
                with open(csv_path, 'rb') as f:
                    st.download_button(
                        label="Download CSV",
                        data=f,
                        file_name="resume_analyses.csv",
                        mime="text/csv"
                    )
        with col2:
            if st.button("Export to JSON"):
                json_path = db.export_to_json()
                with open(json_path, 'rb') as f:
                    st.download_button(
                        label="Download JSON",
                        data=f,
                        file_name="resume_analyses.json",
                        mime="application/json"
                    )


if __name__ == "__main__":
    main()