Spaces:

Deepakkori45
/

Resume-Analyzer

Build error

App Files Files Community

Deepakkori45 committed on Mar 9, 2025

Commit

8dfded8

verified ·

1 Parent(s): 05ff3c1

Create database.py

Browse files

Files changed (1) hide show

database.py +532 -0

database.py ADDED Viewed

	@@ -0,0 +1,532 @@

import json
import re
import sqlite3
from contextlib import closing
from datetime import datetime

import pandas as pd
class ResumeDatabase:
    """SQLite-backed store for parsed resume analyses.

    Provides persistence (``save_analysis`` / ``get_all_analyses``), export to
    CSV/JSON, aggregate statistics, and a rule-based candidate score for data
    science / data engineering roles.

    The table schema, the mapping from analysis-result keys to columns, and the
    list of JSON-encoded columns are all derived from ``_FIELDS`` so they cannot
    drift apart.
    """

    # (column, key in the analysis result, stored as JSON?) in schema order.
    _FIELDS = (
        # Basic information
        ('name', 'Name', False),
        ('email', 'Email', False),
        ('phone', 'Phone', False),
        ('location', 'Location', False),
        ('linkedin_url', 'LinkedIn', False),
        ('github_url', 'GitHub', False),
        ('portfolio_url', 'Portfolio', False),
        # Education & experience
        ('cgpa', 'CGPA', False),
        ('work_experience', 'Total years of work experience', False),
        ('education_level', 'Education level', False),
        ('major', 'Major', False),
        ('university', 'University', False),
        ('internships', 'Internships', True),
        # Skills & expertise
        ('technical_skills', 'Technical skills', True),
        ('programming_languages', 'Programming languages', True),
        ('job_titles', 'Job titles', True),
        ('ds_de_skills', 'Data science/engineering specific skills', True),
        ('certifications', 'Certifications', True),
        # Data science specific fields
        ('ml_frameworks', 'Machine learning frameworks', True),
        ('visualization_tools', 'Visualization tools', True),
        ('statistical_tools', 'Statistical tools', True),
        ('big_data_tools', 'Big data tools', True),
        ('cloud_platforms', 'Cloud platforms', True),
        ('deep_learning_expertise', 'Deep learning expertise', True),
        ('nlp_expertise', 'NLP expertise', True),
        ('computer_vision_expertise', 'Computer vision expertise', True),
        # Data engineering specific fields
        ('databases', 'Databases', True),
        ('etl_tools', 'ETL tools', True),
        ('data_warehousing', 'Data warehousing', True),
        ('orchestration_tools', 'Orchestration tools', True),
        ('streaming_technologies', 'Streaming technologies', True),
        ('data_modeling_skills', 'Data modeling skills', True),
        ('data_governance_experience', 'Data governance experience', True),
        ('data_quality_tools', 'Data quality tools', True),
        # Project information
        ('projects', 'Projects', True),
        ('publications', 'Publications', True),
        ('research_experience', 'Research experience', True),
        ('hackathons', 'Hackathons', True),
        ('awards_achievements', 'Awards and achievements', True),
        # Additional skills & metrics
        ('soft_skills', 'Soft skills', True),
        ('industry_domain', 'Industry domain', True),
        ('languages', 'Languages', True),
        ('leadership_experience', 'Leadership experience', True),
        ('team_size_managed', 'Team size managed', False),
        # Performance metrics
        ('code_quality_metrics', 'Code quality metrics', True),
        ('project_impact_metrics', 'Project impact metrics', True),
        ('performance_improvements', 'Performance improvements', True),
        # Additional technical areas
        ('version_control_systems', 'Version control systems', True),
        ('ci_cd_tools', 'CI/CD tools', True),
        ('testing_frameworks', 'Testing frameworks', True),
        ('agile_methodologies', 'Agile methodologies', True),
        ('system_architecture', 'System architecture experience', True),
        # Business & domain knowledge
        ('business_domain_expertise', 'Business domain expertise', True),
        ('industry_certifications', 'Industry certifications', True),
        ('domain_specific_tools', 'Domain specific tools', True),
        ('compliance_knowledge', 'Compliance knowledge', True),
    )
    # Raw data and metadata columns that follow the mapped fields in the table.
    _TRAILING_COLUMNS = (
        'raw_text', 'last_updated', 'resume_version', 'analysis_confidence_score',
    )
    # Every column written with json.dumps and therefore decoded on read.
    _JSON_COLUMNS = tuple(column for column, _key, is_json in _FIELDS if is_json)

    # (analysis column, statistics key) pairs tallied by get_statistics.
    _COUNTED_COLUMNS = (
        ('programming_languages', 'top_programming_languages'),
        ('technical_skills', 'top_technical_skills'),
        ('certifications', 'top_certifications'),
        ('ml_frameworks', 'top_ml_frameworks'),
        ('visualization_tools', 'top_visualization_tools'),
        ('databases', 'top_databases'),
        ('cloud_platforms', 'top_cloud_platforms'),
        ('etl_tools', 'top_etl_tools'),
        ('streaming_technologies', 'top_streaming_tech'),
        ('industry_domain', 'industry_distribution'),
    )

    # Stems are matched after dropping '.' and apostrophes, so "Ph.D.",
    # "Master's" and "Bachelors" are all recognised. Order = priority.
    _EDUCATION_WEIGHTS = (
        ('phd', 20), ('doctor', 20), ('master', 18),
        ('bachelor', 15), ('associate', 10),
    )

    _DS_SKILL_COLUMNS = (
        'programming_languages', 'technical_skills', 'ml_frameworks',
        'visualization_tools', 'statistical_tools',
    )
    _DS_SKILL_WEIGHTS = {
        'python': 3, 'r': 2, 'sql': 2,
        'tensorflow': 2, 'pytorch': 2, 'scikit-learn': 2,
        'pandas': 1, 'numpy': 1, 'matplotlib': 1,
        'tableau': 2, 'powerbi': 2,
        'statistics': 2, 'machine learning': 3,
        'deep learning': 3, 'nlp': 2, 'computer vision': 2,
    }
    _DE_SKILL_COLUMNS = (
        'programming_languages', 'technical_skills', 'databases', 'etl_tools',
        'data_warehousing', 'orchestration_tools', 'streaming_technologies',
    )
    _DE_SKILL_WEIGHTS = {
        'sql': 3, 'python': 2, 'java': 2, 'scala': 2,
        'hadoop': 2, 'spark': 3, 'kafka': 2,
        'airflow': 2, 'luigi': 2,
        'aws': 3, 'azure': 3, 'gcp': 3,
        'snowflake': 2, 'redshift': 2,
        'mongodb': 1, 'postgresql': 2,
        'etl': 3, 'data warehouse': 2,
        'data modeling': 2, 'data governance': 2,
    }
    _DS_ROLE_COLUMNS = (
        'deep_learning_expertise', 'nlp_expertise', 'computer_vision_expertise',
        'statistical_tools', 'visualization_tools',
    )
    _DE_ROLE_COLUMNS = (
        'data_modeling_skills', 'data_governance_experience',
        'data_quality_tools', 'ci_cd_tools', 'system_architecture',
    )

    _NUMBER_RE = re.compile(r'\d+(?:\.\d+)?|\.\d+')
    _INTEGER_RE = re.compile(r'\d+')
    _TOKEN_RE = re.compile(r'[a-z0-9]+')

    def __init__(self, db_path='resume_data.db'):
        """Remember the database path and make sure the table exists."""
        self.db_path = db_path
        self.init_database()

    # ------------------------------------------------------------------
    # Small parsing helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _scalar_text(value, default='Not found'):
        """Coerce a scalar analysis value to the text stored in the database.

        ``None`` becomes ``default``; sequences are joined with ``', '`` and
        dicts are JSON-encoded, because sqlite3 cannot bind them directly.
        """
        if value is None:
            return default
        if isinstance(value, str):
            return value
        if isinstance(value, (list, tuple, set)):
            return ', '.join(str(item) for item in value)
        if isinstance(value, dict):
            return json.dumps(value)
        return str(value)

    @staticmethod
    def _as_list(value):
        """Return ``value`` as a list with missing entries removed.

        ``None`` yields ``[]``; a bare string or other scalar becomes a
        one-element list; entries that are ``None``, blank or "Not found" are
        dropped so placeholders are never counted as real skills.
        """
        if value is None:
            return []
        if not isinstance(value, (list, tuple, set)):
            value = [value]
        cleaned = []
        for item in value:
            if item is None:
                continue
            if isinstance(item, str) and item.strip().lower() in ('', 'not found'):
                continue
            cleaned.append(item)
        return cleaned

    @classmethod
    def _leading_number(cls, value):
        """Return the number at the start of ``value`` as a float, else ``None``.

        "5 years", "5+ years" and 5 all give 5.0; "Not found" gives ``None``.
        """
        if value is None or isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            return float(value)
        match = cls._NUMBER_RE.match(str(value).strip())
        return float(match.group()) if match else None

    @classmethod
    def _first_integer(cls, value):
        """Return the first run of digits in ``value`` as an int, else ``None``."""
        if value is None:
            return None
        match = cls._INTEGER_RE.search(str(value))
        return int(match.group()) if match else None

    @classmethod
    def _cgpa_on_4_scale(cls, value):
        """Parse a CGPA such as "3.8", "3.8/4.0" or "8.5/10".

        When an explicit non-zero denominator is present the value is rescaled
        to a 4.0 scale; a bare number is returned unchanged. Returns ``None``
        when no number can be read.
        """
        if value is None:
            return None
        numerator, _, denominator = str(value).partition('/')
        score = cls._leading_number(numerator)
        if score is None:
            return None
        scale = cls._leading_number(denominator)
        if scale:
            return score * 4.0 / scale
        return score

    def _skill_points(self, analysis, columns, weights):
        """Sum keyword weights over the skills found in ``columns``.

        Each distinct skill string (case-insensitive) is scored once, even if
        it appears in several columns. Keywords of one or two characters (such
        as 'r') must match a whole token; longer keywords match as substrings.
        """
        seen = set()
        points = 0
        for column in columns:
            for skill in self._as_list(analysis.get(column)):
                text = str(skill).strip().lower()
                if text in seen:
                    continue
                seen.add(text)
                tokens = set(self._TOKEN_RE.findall(text))
                for keyword, value in weights.items():
                    if len(keyword) <= 2:
                        matched = keyword in tokens
                    else:
                        matched = keyword in text
                    if matched:
                        points += value
        return points

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------
    def init_database(self):
        """Create the ``resume_analyses`` table if it does not exist yet."""
        columns = ['id INTEGER PRIMARY KEY AUTOINCREMENT', 'timestamp TEXT']
        columns.extend(f'{column} TEXT' for column, _key, _is_json in self._FIELDS)
        columns.extend(f'{column} TEXT' for column in self._TRAILING_COLUMNS)
        ddl = (
            'CREATE TABLE IF NOT EXISTS resume_analyses (\n    '
            + ',\n    '.join(columns)
            + '\n)'
        )
        # closing() guarantees the connection is released even on error.
        with closing(sqlite3.connect(self.db_path)) as conn:
            conn.execute(ddl)
            conn.commit()

    def save_analysis(self, analysis_result, raw_text):
        """Insert one analysis row.

        Args:
            analysis_result: dict keyed by the labels in ``_FIELDS`` (for
                example 'Name', 'Technical skills'). Missing or ``None``
                scalars are stored as 'Not found'; missing or ``None`` list
                fields are stored as an empty JSON list.
            raw_text: the raw resume text, stored verbatim.

        Returns:
            The ``id`` of the inserted row.
        """
        now = datetime.now().isoformat()
        row = {'timestamp': now}
        for column, key, is_json in self._FIELDS:
            value = analysis_result.get(key)
            if is_json:
                row[column] = json.dumps([] if value is None else value)
            else:
                row[column] = self._scalar_text(value)
        row['raw_text'] = raw_text
        row['last_updated'] = now
        row['resume_version'] = '1.0'
        row['analysis_confidence_score'] = self._scalar_text(
            analysis_result.get('Analysis confidence score'), default='0.0'
        )

        # Column names come from the class constants above, never from input;
        # all values are bound as parameters.
        fields = ', '.join(row)
        placeholders = ', '.join('?' * len(row))
        query = f'INSERT INTO resume_analyses ({fields}) VALUES ({placeholders})'
        with closing(sqlite3.connect(self.db_path)) as conn:
            cursor = conn.execute(query, list(row.values()))
            conn.commit()
            return cursor.lastrowid

    def get_all_analyses(self):
        """Return every stored analysis as a dict keyed by column name.

        All JSON-encoded columns are decoded back to Python objects. Text that
        is not valid JSON is left untouched rather than raising.
        """
        with closing(sqlite3.connect(self.db_path)) as conn:
            cursor = conn.execute('SELECT * FROM resume_analyses')
            columns = [description[0] for description in cursor.description]
            rows = cursor.fetchall()

        analyses = []
        for row in rows:
            analysis = dict(zip(columns, row))
            for field in self._JSON_COLUMNS:
                raw = analysis.get(field)
                if isinstance(raw, str) and raw:
                    try:
                        analysis[field] = json.loads(raw)
                    except ValueError:
                        # Keep hand-edited / legacy text as-is.
                        pass
            analyses.append(analysis)
        return analyses

    # ------------------------------------------------------------------
    # Export
    # ------------------------------------------------------------------
    def export_to_csv(self, filepath='resume_analyses.csv'):
        """Write all analyses to ``filepath`` as CSV and return the path."""
        frame = pd.DataFrame(self.get_all_analyses())
        frame.to_csv(filepath, index=False)
        return filepath

    def export_to_json(self, filepath='resume_analyses.json'):
        """Write all analyses to ``filepath`` as JSON and return the path."""
        analyses = self.get_all_analyses()
        with open(filepath, 'w', encoding='utf-8') as handle:
            json.dump(analyses, handle, indent=2)
        return filepath

    # ------------------------------------------------------------------
    # Statistics
    # ------------------------------------------------------------------
    def get_statistics(self):
        """Return aggregate statistics over all stored analyses.

        Dict-valued entries are sorted by count (descending) and limited to the
        top 10. ``avg_work_experience`` is averaged over the resumes whose
        experience could be parsed; it is 0 when none could.
        """
        analyses = self.get_all_analyses()
        stats = {
            'total_resumes': len(analyses),
            'avg_work_experience': 0,
            'education_levels': {},
            'top_programming_languages': {},
            'top_technical_skills': {},
            'top_certifications': {},
            'top_ml_frameworks': {},
            'top_visualization_tools': {},
            'top_databases': {},
            'top_cloud_platforms': {},
            'top_etl_tools': {},
            'top_streaming_tech': {},
            'industry_distribution': {},
            'university_distribution': {},
            'major_distribution': {}
        }
        experience_total = 0.0
        experience_count = 0

        for analysis in analyses:
            edu_level = analysis.get('education_level')
            levels = stats['education_levels']
            levels[edu_level] = levels.get(edu_level, 0) + 1

            for column, stat_key in self._COUNTED_COLUMNS:
                self._count_items(analysis.get(column), stats[stat_key])

            for column, stat_key in (('university', 'university_distribution'),
                                     ('major', 'major_distribution')):
                value = analysis.get(column)
                if value and value != 'Not found':
                    bucket = stats[stat_key]
                    bucket[value] = bucket.get(value, 0) + 1

            years = self._leading_number(analysis.get('work_experience'))
            if years is not None:
                experience_total += years
                experience_count += 1

        if experience_count:
            stats['avg_work_experience'] = experience_total / experience_count

        for key, value in stats.items():
            if isinstance(value, dict):
                ranked = sorted(value.items(), key=lambda pair: pair[1], reverse=True)
                stats[key] = dict(ranked[:10])
        return stats

    def _count_items(self, items, counter_dict):
        """Increment ``counter_dict`` once for every entry in ``items``.

        ``items`` may be ``None`` or a bare string. Unhashable entries (for
        example dicts) are counted under their JSON representation.
        """
        for item in self._as_list(items):
            try:
                counter_dict[item] = counter_dict.get(item, 0) + 1
            except TypeError:
                key = json.dumps(item, sort_keys=True, default=str)
                counter_dict[key] = counter_dict.get(key, 0) + 1

    # ------------------------------------------------------------------
    # Scoring
    # ------------------------------------------------------------------
    def calculate_score(self, analysis, role_type='both'):
        """Score one analysis for a role.

        Args:
            analysis: a row dict as returned by ``get_all_analyses``.
            role_type: 'data_science', 'data_engineering' or 'both'.

        Returns:
            Dict with ``education_score`` (max 20), ``experience_score``
            (max 20), ``technical_score`` (max 20), ``project_score`` (max 15),
            ``impact_score`` (max 15), ``role_specific_score`` (max 10) and
            their sum ``total_score``. Role-specific points are only awarded
            for the two single-role types, so 'both' tops out at 90.
        """
        as_list = self._as_list
        scores = {
            'technical_score': 0,
            'experience_score': 0,
            'education_score': 0,
            'project_score': 0,
            'impact_score': 0,
            'total_score': 0,
            'role_specific_score': 0
        }

        # Education (max 20): degree level plus a CGPA bonus, capped.
        edu_level = str(analysis.get('education_level') or '').lower()
        for char in ('.', "'", '\u2019'):
            edu_level = edu_level.replace(char, '')
        education = 0
        for stem, weight in self._EDUCATION_WEIGHTS:
            if stem in edu_level:
                education = weight
                break
        cgpa = self._cgpa_on_4_scale(analysis.get('cgpa'))
        if cgpa is not None:
            if cgpa >= 3.5:
                education += 5
            elif cgpa >= 3.0:
                education += 3
        scores['education_score'] = min(20, education)

        # Experience (max 20): 4 points per year.
        years = self._leading_number(analysis.get('work_experience'))
        if years is not None:
            scores['experience_score'] = max(0, min(20, years * 4))

        # Technical skills (max 20).
        tech_score = 0
        if role_type in ('data_science', 'both'):
            tech_score += self._skill_points(
                analysis, self._DS_SKILL_COLUMNS, self._DS_SKILL_WEIGHTS)
        if role_type in ('data_engineering', 'both'):
            tech_score += self._skill_points(
                analysis, self._DE_SKILL_COLUMNS, self._DE_SKILL_WEIGHTS)
        scores['technical_score'] = min(20, tech_score)

        # Projects (max 15): 2 per project up to 10, plus research/publications.
        project_score = min(10, len(as_list(analysis.get('projects'))) * 2)
        if as_list(analysis.get('research_experience')):
            project_score += 3
        if as_list(analysis.get('publications')):
            project_score += 2
        scores['project_score'] = project_score

        # Impact (max 15).
        impact_score = 0
        if as_list(analysis.get('leadership_experience')):
            impact_score += 3
        # Use the first number only, so "5-10 people" means 5, not 510.
        team_size = self._first_integer(analysis.get('team_size_managed'))
        if team_size:
            impact_score += min(3, team_size // 5)
        impact_score += min(3, len(as_list(analysis.get('certifications'))))
        impact_score += min(3, len(as_list(analysis.get('awards_achievements'))))
        if as_list(analysis.get('project_impact_metrics')):
            impact_score += 3
        scores['impact_score'] = impact_score

        # Role-specific (max 10): 2 points per populated area.
        if role_type == 'data_science':
            role_columns = self._DS_ROLE_COLUMNS
        elif role_type == 'data_engineering':
            role_columns = self._DE_ROLE_COLUMNS
        else:
            role_columns = ()
        scores['role_specific_score'] = sum(
            2 for column in role_columns if as_list(analysis.get(column))
        )

        scores['total_score'] = (
            scores['education_score'] +
            scores['experience_score'] +
            scores['technical_score'] +
            scores['project_score'] +
            scores['impact_score'] +
            scores['role_specific_score']
        )
        return scores

    def get_candidate_rankings(self, role_type='both', min_score=0):
        """Return candidates scoring at least ``min_score``, best first.

        Each entry carries the candidate's name, email, every score component,
        up to five technical skills, and the stored experience and education
        text.
        """
        rankings = []
        for analysis in self.get_all_analyses():
            scores = self.calculate_score(analysis, role_type)
            if scores['total_score'] < min_score:
                continue
            rankings.append({
                'name': analysis['name'],
                'email': analysis['email'],
                'total_score': scores['total_score'],
                'education_score': scores['education_score'],
                'experience_score': scores['experience_score'],
                'technical_score': scores['technical_score'],
                'project_score': scores['project_score'],
                'impact_score': scores['impact_score'],
                'role_specific_score': scores['role_specific_score'],
                'key_skills': self._as_list(analysis.get('technical_skills'))[:5],
                'years_experience': analysis['work_experience'],
                'education_level': analysis['education_level']
            })
        rankings.sort(key=lambda entry: entry['total_score'], reverse=True)
        return rankings