# Resume Analyzer — Streamlit app that extracts, scores, and ranks resumes.
# (Removed "Spaces: Sleeping" text — Hugging Face Spaces page residue, not code.)
| import streamlit as st | |
| import pandas as pd | |
| import os | |
| from datetime import datetime | |
| import docx2txt | |
| # Try importing Document from python-docx, but don't fail if not available | |
| try: | |
| from docx import Document | |
| DOCX_AVAILABLE = True | |
| except ImportError: | |
| DOCX_AVAILABLE = False | |
| from PyPDF2 import PdfReader | |
| import openai | |
| from dotenv import load_dotenv | |
| import tempfile | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from database import ResumeDatabase | |
| # Load environment variables | |
| load_dotenv() | |
| # Initialize OpenAI API key and database | |
| openai.api_key = os.getenv('OPENAI_API_KEY') | |
| db = ResumeDatabase() | |
def extract_text_from_pdf(file):
    """Extract text from a PDF file.

    Args:
        file: Path or binary file-like object readable by PyPDF2's PdfReader.

    Returns:
        The concatenated text of all pages, or None if extraction fails
        (the error is surfaced to the UI via st.error).
    """
    try:
        pdf_reader = PdfReader(file)
        # extract_text() may return None (e.g. image-only pages); coerce to ""
        # so concatenation never raises TypeError. join avoids quadratic +=.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {str(e)}")
        return None
def extract_text_from_docx(file):
    """Return the plain text of a DOCX file, or None on failure.

    Tries docx2txt first; if that raises and python-docx is installed,
    falls back to concatenating paragraph text via python-docx.
    Errors are reported to the UI via st.error.
    """
    try:
        try:
            return docx2txt.process(file)
        except Exception:
            # docx2txt failed — fall back to python-docx when available.
            if not DOCX_AVAILABLE:
                st.error("Could not process DOCX file. Please ensure python-docx is installed.")
                return None
            document = Document(file)
            return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
    except Exception as e:
        st.error(f"Error extracting text from DOCX: {str(e)}")
        return None
def analyze_resume(text):
    """Analyze resume text using the OpenAI chat API.

    Args:
        text: Raw resume text extracted from a PDF/DOCX upload.

    Returns:
        A dict of extracted resume fields (schema defined in the system
        prompt), normalized so that a core set of required fields is always
        present with sane defaults, or None on any failure (reported via
        st.error).
    """
    try:
        system_prompt = """You are a professional resume analyzer specializing in Data Science and Data Engineering roles.
Analyze the following resume and extract key information in a structured format.
Return ONLY a Python dictionary with the following keys (no other text):
{
    "name": "extracted name",
    "email": "extracted email",
    "phone": "extracted phone",
    "location": "extracted location",
    "linkedin_url": "extracted LinkedIn URL",
    "github_url": "extracted GitHub URL",
    "portfolio_url": "extracted portfolio URL",
    "cgpa": "extracted CGPA",
    "years_experience": "extracted years of experience as a number",
    "education_level": "highest education level",
    "major": "extracted major/field of study",
    "university": "extracted university name",
    "internships": ["list of internships"],
    "programming_languages": ["list of programming languages"],
    "technical_skills": ["list of technical skills"],
    "job_titles": ["list of job titles"],
    "certifications": ["list of certifications"],
    "ml_frameworks": ["list of ML frameworks"],
    "visualization_tools": ["list of visualization tools"],
    "statistical_tools": ["list of statistical tools"],
    "big_data_tools": ["list of big data tools"],
    "cloud_platforms": ["list of cloud platforms"],
    "deep_learning": ["list of deep learning skills"],
    "nlp_skills": ["list of NLP skills"],
    "computer_vision": ["list of computer vision skills"],
    "databases": ["list of databases"],
    "etl_tools": ["list of ETL tools"],
    "data_warehousing": ["list of data warehousing tools"],
    "orchestration_tools": ["list of orchestration tools"],
    "streaming_tech": ["list of streaming technologies"],
    "data_modeling": ["list of data modeling skills"],
    "data_governance": ["list of data governance experience"],
    "data_quality_tools": ["list of data quality tools"],
    "projects": ["list of projects"],
    "publications": ["list of publications"],
    "research_experience": "yes/no or details of research experience",
    "hackathons": ["list of hackathons"],
    "awards": ["list of awards"],
    "soft_skills": ["list of soft skills"],
    "domain_expertise": ["list of domain expertise"],
    "languages": ["list of languages"],
    "leadership_experience": "yes/no or details of leadership experience",
    "team_size": "number of people managed",
    "code_quality": ["list of code quality metrics"],
    "project_impact": ["list of project impact metrics"],
    "performance_improvements": ["list of performance improvements"],
    "version_control": ["list of version control systems"],
    "ci_cd_tools": ["list of CI/CD tools"],
    "testing_frameworks": ["list of testing frameworks"],
    "agile_experience": "yes/no or details of agile experience",
    "system_architecture": ["list of system architecture experience"],
    "business_domain": ["list of business domains"],
    "industry_certifications": ["list of industry certifications"],
    "domain_tools": ["list of domain-specific tools"],
    "compliance_knowledge": ["list of compliance knowledge"],
    "confidence_score": 0.95
}
For any field where information is not found:
- Use "Not found" for string fields
- Use [] for list fields
- Use 0 for numeric fields
- Use "no" for yes/no fields
Ensure all fields are included in the response, even if empty."""
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text}
            ],
            temperature=0.2
        )
        # Get the response content and strip any markdown code fences the
        # model may have wrapped around the dictionary literal.
        result_str = response['choices'][0]['message']['content'].strip()
        result_str = result_str.replace('```python', '').replace('```', '').strip()
        # SECURITY: the model output is untrusted text and must never be
        # executed. Parse it as a literal (handles Python dict syntax) and
        # fall back to JSON (handles true/false/null) — never eval().
        import ast
        import json
        try:
            result = ast.literal_eval(result_str)
        except (ValueError, SyntaxError):
            result = json.loads(result_str)
        # Ensure a core set of required fields is present with default values.
        required_fields = {
            'name': 'Not found',
            'email': 'Not found',
            'phone': 'Not found',
            'location': 'Not found',
            'linkedin_url': 'Not found',
            'github_url': 'Not found',
            'portfolio_url': 'Not found',
            'cgpa': 'Not found',
            'years_experience': 0,
            'education_level': 'Not found',
            'major': 'Not found',
            'university': 'Not found',
            'programming_languages': [],
            'technical_skills': [],
            'job_titles': [],
            'certifications': [],
            'ml_frameworks': [],
            'visualization_tools': [],
            'projects': [],
            'publications': [],
            'research_experience': 'no',
            'awards': [],
            'leadership_experience': 'no',
            'team_size': 0
        }
        for field, default_value in required_fields.items():
            if field not in result or result[field] is None:
                result[field] = default_value
            elif isinstance(default_value, list) and not isinstance(result[field], list):
                # Coerce a stray scalar into a one-element list; "Not found"
                # for a list field means empty.
                result[field] = [result[field]] if result[field] != "Not found" else []
        return result
    except Exception as e:
        st.error(f"Error analyzing resume: {str(e)}")
        return None
def display_statistics():
    """Display statistics and visualizations of the analyzed resumes."""
    stats = db.get_statistics()

    def _pie(counts, title):
        # Render a pie chart for a {label: count} mapping, if non-empty.
        if counts:
            chart = px.pie(
                values=list(counts.values()),
                names=list(counts.keys()),
                title=title
            )
            st.plotly_chart(chart, use_container_width=True)

    def _bar(counts, title):
        # Render a bar chart for a {label: count} mapping, if non-empty.
        if counts:
            chart = px.bar(
                x=list(counts.keys()),
                y=list(counts.values()),
                title=title
            )
            st.plotly_chart(chart, use_container_width=True)

    # Headline metrics.
    st.subheader("📊 Resume Analysis Statistics")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Resumes Analyzed", stats['total_resumes'])
    with col2:
        st.metric("Average Years of Experience", f"{stats['avg_work_experience']:.1f}")
    with col3:
        st.metric("Universities Represented", len(stats['university_distribution']))

    # Education distribution.
    st.subheader("🎓 Education Statistics")
    col1, col2 = st.columns(2)
    with col1:
        _pie(stats['education_levels'], "Education Levels")
    with col2:
        _pie(stats['major_distribution'], "Major Distribution")

    # Programming languages and general technical skills.
    st.subheader("💻 Technical Expertise")
    col1, col2 = st.columns(2)
    with col1:
        _bar(stats['top_programming_languages'], "Top Programming Languages")
    with col2:
        _bar(stats['top_technical_skills'], "Top Technical Skills")

    # Data-science-specific skills.
    st.subheader("🔬 Data Science Expertise")
    col1, col2 = st.columns(2)
    with col1:
        _bar(stats['top_ml_frameworks'], "Top ML Frameworks")
    with col2:
        _bar(stats['top_visualization_tools'], "Top Visualization Tools")

    # Data-engineering-specific skills.
    st.subheader("⚙️ Data Engineering Expertise")
    col1, col2, col3 = st.columns(3)
    with col1:
        _bar(stats['top_databases'], "Top Databases")
    with col2:
        _bar(stats['top_etl_tools'], "Top ETL Tools")
    with col3:
        _bar(stats['top_streaming_tech'], "Top Streaming Technologies")

    # Cloud platforms and certifications.
    st.subheader("☁️ Cloud & Big Data Expertise")
    col1, col2 = st.columns(2)
    with col1:
        _bar(stats['top_cloud_platforms'], "Top Cloud Platforms")
    with col2:
        _bar(stats['top_certifications'], "Top Certifications")
def display_rankings():
    """Display candidate rankings with filtering options."""
    st.subheader("🏆 Candidate Rankings")

    # Role filter — internal keys mapped to human-readable labels.
    role_labels = {
        "both": "Both Roles",
        "data_science": "Data Science",
        "data_engineering": "Data Engineering",
    }
    role_type = st.selectbox(
        "Select Role Type",
        ["both", "data_science", "data_engineering"],
        format_func=lambda key: role_labels[key],
    )
    min_score = st.slider("Minimum Score", 0, 100, 50)

    rankings = db.get_candidate_rankings(role_type, min_score)
    if not rankings:
        st.warning("No candidates found matching the criteria.")
        return

    st.write(f"Found {len(rankings)} candidates matching the criteria")
    for rank, candidate in enumerate(rankings, start=1):
        header = f"#{rank}: {candidate['name']} - Score: {candidate['total_score']:.1f}"
        with st.expander(header):
            left, right = st.columns(2)
            with left:
                st.write("📊 Score Breakdown")
                labels = ['Education', 'Experience', 'Technical',
                          'Projects', 'Impact', 'Role Specific']
                values = [
                    candidate['education_score'],
                    candidate['experience_score'],
                    candidate['technical_score'],
                    candidate['project_score'],
                    candidate['impact_score'],
                    candidate['role_specific_score'],
                ]
                chart = go.Figure()
                chart.add_trace(go.Bar(
                    x=labels,
                    y=values,
                    text=[f"{value:.1f}" for value in values],
                    textposition='auto',
                ))
                chart.update_layout(
                    title="Score Components",
                    showlegend=False,
                    height=300
                )
                st.plotly_chart(chart, use_container_width=True)
            with right:
                st.write("👤 Candidate Information")
                st.write(f"Email: {candidate['email']}")
                st.write(f"Experience: {candidate['years_experience']}")
                st.write(f"Education: {candidate['education_level']}")
                st.write("Key Skills:")
                for skill in candidate['key_skills']:
                    st.write(f"- {skill}")
def _write_list(items):
    """Render each item as a bullet line, or '- Not found' when empty."""
    if items:
        for item in items:
            st.write(f"- {item}")
    else:
        st.write("- Not found")


def _extract_uploaded_text(uploaded_file):
    """Persist the upload to a temp file, extract its text, and clean up.

    Returns the extracted text, or None when extraction fails or the
    extension is unsupported. The temp file is removed even if extraction
    raises (the original leaked it on that path).
    """
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        tmp_file_path = tmp_file.name
    try:
        file_extension = uploaded_file.name.split('.')[-1].lower()
        if file_extension == 'pdf':
            return extract_text_from_pdf(tmp_file_path)
        if file_extension == 'docx':
            return extract_text_from_docx(tmp_file_path)
        # Unreachable given the uploader's type filter, but returning None
        # here avoids the original's unbound-variable NameError.
        return None
    finally:
        os.unlink(tmp_file_path)


def _display_score(scores):
    """Show the candidate's headline scores and a per-component breakdown."""
    st.subheader("📊 Candidate Score")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Score", f"{scores['total_score']:.1f}/100")
    with col2:
        st.metric("Technical Score", f"{scores['technical_score']:.1f}/20")
    with col3:
        st.metric("Experience Score", f"{scores['experience_score']:.1f}/20")
    # Each tuple: (label, achieved points, maximum points).
    score_components = [
        ('Education', scores['education_score'], 20),
        ('Experience', scores['experience_score'], 20),
        ('Technical', scores['technical_score'], 20),
        ('Projects', scores['project_score'], 15),
        ('Impact', scores['impact_score'], 15),
        ('Role Specific', scores['role_specific_score'], 10)
    ]
    fig = go.Figure()
    fig.add_trace(go.Bar(
        name='Score',
        x=[s[0] for s in score_components],
        y=[s[1] for s in score_components],
        text=[f"{s[1]:.1f}/{s[2]}" for s in score_components],
        textposition='auto',
    ))
    fig.update_layout(
        title="Score Breakdown",
        yaxis_title="Points",
        showlegend=False
    )
    st.plotly_chart(fig, use_container_width=True)


def _display_analysis(analysis_result):
    """Render the extracted resume fields in the two-column results layout."""
    st.subheader("Analysis Results")
    col1, col2 = st.columns(2)
    with col1:
        st.write("📚 Education")
        st.write(f"CGPA: {analysis_result.get('cgpa', 'Not found')}")
        st.write(f"Education Level: {analysis_result.get('education_level', 'Not found')}")
        st.write(f"Major: {analysis_result.get('major', 'Not found')}")
        st.write(f"University: {analysis_result.get('university', 'Not found')}")
        st.write("💼 Experience")
        st.write(f"Years of Experience: {analysis_result.get('years_experience', 'Not found')}")
        st.write("Job Titles:")
        _write_list(analysis_result.get('job_titles', []))
    with col2:
        st.write("🔧 Technical Skills")
        st.write("Programming Languages:")
        _write_list(analysis_result.get('programming_languages', []))
        st.write("Data Science Skills:")
        _write_list(
            analysis_result.get('ml_frameworks', []) +
            analysis_result.get('deep_learning', []) +
            analysis_result.get('nlp_skills', []) +
            analysis_result.get('computer_vision', []) +
            analysis_result.get('statistical_tools', []) +
            analysis_result.get('visualization_tools', [])
        )
        st.write("Data Engineering Skills:")
        _write_list(
            analysis_result.get('databases', []) +
            analysis_result.get('etl_tools', []) +
            analysis_result.get('data_warehousing', []) +
            analysis_result.get('orchestration_tools', []) +
            analysis_result.get('streaming_tech', []) +
            analysis_result.get('data_modeling', [])
        )
        st.write("Cloud & Tools:")
        _write_list(
            analysis_result.get('cloud_platforms', []) +
            analysis_result.get('ci_cd_tools', []) +
            analysis_result.get('version_control', [])
        )
    st.write("📜 Additional Information")
    col3, col4 = st.columns(2)
    with col3:
        st.write("Certifications:")
        _write_list(analysis_result.get('certifications', []))
        st.write("Projects:")
        _write_list(analysis_result.get('projects', []))
    with col4:
        st.write("Publications & Research:")
        _write_list(analysis_result.get('publications', []))
        # str() guards against the model returning a non-string here, which
        # would crash .lower() in the original.
        research_exp = analysis_result.get('research_experience', 'no')
        if str(research_exp).lower() != 'no':
            st.write("Research Experience:", research_exp)
        leadership = analysis_result.get('leadership_experience', 'no')
        if str(leadership).lower() != 'no':
            st.write("Leadership Experience:", leadership)
        team_size = analysis_result.get('team_size', 0)
        if team_size:
            st.write(f"Team Size Managed: {team_size}")


def _upload_page():
    """Handle resume upload: extract, analyze, persist, score, and display."""
    st.write("Upload resumes in PDF or DOCX format for analysis")
    # The analysis step cannot run without an API key — fail fast.
    if not os.getenv('OPENAI_API_KEY'):
        st.error("Please set your OpenAI API key in the .env file")
        return
    uploaded_file = st.file_uploader(
        "Choose a resume file",
        type=['pdf', 'docx'],
        help="Upload a resume in PDF or DOCX format"
    )
    if not uploaded_file:
        return
    with st.spinner("Processing resume..."):
        text = _extract_uploaded_text(uploaded_file)
        if not text:
            return
        analysis_result = analyze_resume(text)
        if not analysis_result:
            return
        db.save_analysis(analysis_result, text)
        st.success("Resume analyzed successfully!")
        scores = db.calculate_score(analysis_result)
        _display_score(scores)
        _display_analysis(analysis_result)


def _export_page():
    """Offer CSV and JSON downloads of the stored analyses."""
    st.subheader("Export Data")
    col1, col2 = st.columns(2)
    with col1:
        if st.button("Export to CSV"):
            csv_path = db.export_to_csv()
            with open(csv_path, 'rb') as f:
                st.download_button(
                    label="Download CSV",
                    data=f,
                    file_name="resume_analyses.csv",
                    mime="text/csv"
                )
    with col2:
        if st.button("Export to JSON"):
            json_path = db.export_to_json()
            with open(json_path, 'rb') as f:
                st.download_button(
                    label="Download JSON",
                    data=f,
                    file_name="resume_analyses.json",
                    mime="application/json"
                )


def main():
    """Streamlit entry point: route the sidebar selection to a page renderer."""
    st.title("Resume Analyzer")
    page = st.sidebar.selectbox(
        "Choose a page",
        ["Upload Resume", "View Statistics", "View Rankings", "Export Data"]
    )
    if page == "Upload Resume":
        _upload_page()
    elif page == "View Statistics":
        display_statistics()
    elif page == "View Rankings":
        display_rankings()
    else:  # Export Data page
        _export_page()


if __name__ == "__main__":
    main()