Spaces:

Deepakkori45
/

Resume-Analyzer

Build error

App Files Files Community

Deepakkori45 committed on Mar 9, 2025

Commit

4cee71c

verified ·

1 Parent(s): 0b7aa1f

Update database.py

Browse files

Files changed (1) hide show

database.py +267 -479

database.py CHANGED Viewed

@@ -4,530 +4,318 @@ import pandas as pd
 from datetime import datetime
 class ResumeDatabase:
-    def __init__(self, db_path='resume_data.db'):
         self.db_path = db_path
-        self.init_database()
-    def init_database(self):
-        """Initialize the database with required tables"""
         conn = sqlite3.connect(self.db_path)
-        cursor = conn.cursor()
-        # Create resume_analyses table with enhanced fields for DS/DE roles
-        cursor.execute('''
-        CREATE TABLE IF NOT EXISTS resume_analyses (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            timestamp TEXT,
-            -- Basic Information
-            name TEXT,
-            email TEXT,
-            phone TEXT,
-            location TEXT,
-            linkedin_url TEXT,
-            github_url TEXT,
-            portfolio_url TEXT,
-            -- Education & Experience
-            cgpa TEXT,
-            work_experience TEXT,
-            education_level TEXT,
-            major TEXT,
-            university TEXT,
-            internships TEXT,
-            -- Skills & Expertise
-            technical_skills TEXT,
-            programming_languages TEXT,
-            job_titles TEXT,
-            ds_de_skills TEXT,
-            certifications TEXT,
-            -- Data Science Specific Fields
-            ml_frameworks TEXT,
-            visualization_tools TEXT,
-            statistical_tools TEXT,
-            big_data_tools TEXT,
-            cloud_platforms TEXT,
-            deep_learning_expertise TEXT,
-            nlp_expertise TEXT,
-            computer_vision_expertise TEXT,
-            -- Data Engineering Specific Fields
-            databases TEXT,
-            etl_tools TEXT,
-            data_warehousing TEXT,
-            orchestration_tools TEXT,
-            streaming_technologies TEXT,
-            data_modeling_skills TEXT,
-            data_governance_experience TEXT,
-            data_quality_tools TEXT,
-            -- Project Information
-            projects TEXT,
-            publications TEXT,
-            research_experience TEXT,
-            hackathons TEXT,
-            awards_achievements TEXT,
-            -- Additional Skills & Metrics
-            soft_skills TEXT,
-            industry_domain TEXT,
-            languages TEXT,
-            leadership_experience TEXT,
-            team_size_managed TEXT,
-            -- Performance Metrics
-            code_quality_metrics TEXT,
-            project_impact_metrics TEXT,
-            performance_improvements TEXT,
-            -- Additional Technical Areas
-            version_control_systems TEXT,
-            ci_cd_tools TEXT,
-            testing_frameworks TEXT,
-            agile_methodologies TEXT,
-            system_architecture TEXT,
-            -- Business & Domain Knowledge
-            business_domain_expertise TEXT,
-            industry_certifications TEXT,
-            domain_specific_tools TEXT,
-            compliance_knowledge TEXT,
-            -- Raw Data
-            raw_text TEXT,
-            -- Metadata
-            last_updated TEXT,
-            resume_version TEXT,
-            analysis_confidence_score TEXT
-        )
-        ''')
         conn.commit()
         conn.close()
     def save_analysis(self, analysis_result, raw_text):
-        """Save analysis results to database"""
         conn = sqlite3.connect(self.db_path)
-        cursor = conn.cursor()
-        # Convert lists and dictionaries to JSON strings for storage
-        analysis_data = {
-            'timestamp': datetime.now().isoformat(),
-            # Basic Information
-            'name': analysis_result.get('Name', 'Not found'),
-            'email': analysis_result.get('Email', 'Not found'),
-            'phone': analysis_result.get('Phone', 'Not found'),
-            'location': analysis_result.get('Location', 'Not found'),
-            'linkedin_url': analysis_result.get('LinkedIn', 'Not found'),
-            'github_url': analysis_result.get('GitHub', 'Not found'),
-            'portfolio_url': analysis_result.get('Portfolio', 'Not found'),
-            # Education & Experience
-            'cgpa': analysis_result.get('CGPA', 'Not found'),
-            'work_experience': analysis_result.get('Total years of work experience', 'Not found'),
-            'education_level': analysis_result.get('Education level', 'Not found'),
-            'major': analysis_result.get('Major', 'Not found'),
-            'university': analysis_result.get('University', 'Not found'),
-            'internships': json.dumps(analysis_result.get('Internships', [])),
-            # Skills & Expertise
-            'technical_skills': json.dumps(analysis_result.get('Technical skills', [])),
-            'programming_languages': json.dumps(analysis_result.get('Programming languages', [])),
-            'job_titles': json.dumps(analysis_result.get('Job titles', [])),
-            'ds_de_skills': json.dumps(analysis_result.get('Data science/engineering specific skills', [])),
-            'certifications': json.dumps(analysis_result.get('Certifications', [])),
-            # Data Science Specific Fields
-            'ml_frameworks': json.dumps(analysis_result.get('Machine learning frameworks', [])),
-            'visualization_tools': json.dumps(analysis_result.get('Visualization tools', [])),
-            'statistical_tools': json.dumps(analysis_result.get('Statistical tools', [])),
-            'big_data_tools': json.dumps(analysis_result.get('Big data tools', [])),
-            'cloud_platforms': json.dumps(analysis_result.get('Cloud platforms', [])),
-            'deep_learning_expertise': json.dumps(analysis_result.get('Deep learning expertise', [])),
-            'nlp_expertise': json.dumps(analysis_result.get('NLP expertise', [])),
-            'computer_vision_expertise': json.dumps(analysis_result.get('Computer vision expertise', [])),
-            # Data Engineering Specific Fields
-            'databases': json.dumps(analysis_result.get('Databases', [])),
-            'etl_tools': json.dumps(analysis_result.get('ETL tools', [])),
-            'data_warehousing': json.dumps(analysis_result.get('Data warehousing', [])),
-            'orchestration_tools': json.dumps(analysis_result.get('Orchestration tools', [])),
-            'streaming_technologies': json.dumps(analysis_result.get('Streaming technologies', [])),
-            'data_modeling_skills': json.dumps(analysis_result.get('Data modeling skills', [])),
-            'data_governance_experience': json.dumps(analysis_result.get('Data governance experience', [])),
-            'data_quality_tools': json.dumps(analysis_result.get('Data quality tools', [])),
-            # Project Information
-            'projects': json.dumps(analysis_result.get('Projects', [])),
-            'publications': json.dumps(analysis_result.get('Publications', [])),
-            'research_experience': json.dumps(analysis_result.get('Research experience', [])),
-            'hackathons': json.dumps(analysis_result.get('Hackathons', [])),
-            'awards_achievements': json.dumps(analysis_result.get('Awards and achievements', [])),
-            # Additional Skills & Metrics
-            'soft_skills': json.dumps(analysis_result.get('Soft skills', [])),
-            'industry_domain': json.dumps(analysis_result.get('Industry domain', [])),
-            'languages': json.dumps(analysis_result.get('Languages', [])),
-            'leadership_experience': json.dumps(analysis_result.get('Leadership experience', [])),
-            'team_size_managed': analysis_result.get('Team size managed', 'Not found'),
-            # Performance Metrics
-            'code_quality_metrics': json.dumps(analysis_result.get('Code quality metrics', [])),
-            'project_impact_metrics': json.dumps(analysis_result.get('Project impact metrics', [])),
-            'performance_improvements': json.dumps(analysis_result.get('Performance improvements', [])),
-            # Additional Technical Areas
-            'version_control_systems': json.dumps(analysis_result.get('Version control systems', [])),
-            'ci_cd_tools': json.dumps(analysis_result.get('CI/CD tools', [])),
-            'testing_frameworks': json.dumps(analysis_result.get('Testing frameworks', [])),
-            'agile_methodologies': json.dumps(analysis_result.get('Agile methodologies', [])),
-            'system_architecture': json.dumps(analysis_result.get('System architecture experience', [])),
-            # Business & Domain Knowledge
-            'business_domain_expertise': json.dumps(analysis_result.get('Business domain expertise', [])),
-            'industry_certifications': json.dumps(analysis_result.get('Industry certifications', [])),
-            'domain_specific_tools': json.dumps(analysis_result.get('Domain specific tools', [])),
-            'compliance_knowledge': json.dumps(analysis_result.get('Compliance knowledge', [])),
-            # Raw Data and Metadata
-            'raw_text': raw_text,
-            'last_updated': datetime.now().isoformat(),
-            'resume_version': '1.0',
-            'analysis_confidence_score': analysis_result.get('Analysis confidence score', '0.0')
-        }
-        # Create the SQL query dynamically based on the fields
-        fields = ', '.join(analysis_data.keys())
-        placeholders = ', '.join(['?' for _ in analysis_data])
-        query = f'INSERT INTO resume_analyses ({fields}) VALUES ({placeholders})'
-        cursor.execute(query, list(analysis_data.values()))
-        conn.commit()
-        conn.close()
-    def get_all_analyses(self):
-        """Retrieve all analyses from database"""
         conn = sqlite3.connect(self.db_path)
-        cursor = conn.cursor()
-        cursor.execute('SELECT * FROM resume_analyses')
-        columns = [description[0] for description in cursor.description]
-        results = cursor.fetchall()
-        analyses = []
-        for row in results:
-            analysis = dict(zip(columns, row))
-            # Convert JSON strings back to lists/dicts for all relevant fields
-            json_fields = [
-                'technical_skills', 'programming_languages', 'job_titles',
-                'ds_de_skills', 'certifications', 'ml_frameworks',
-                'visualization_tools', 'statistical_tools', 'big_data_tools',
-                'cloud_platforms', 'databases', 'etl_tools', 'data_warehousing',
-                'orchestration_tools', 'streaming_technologies', 'projects',
-                'publications', 'research_experience', 'soft_skills',
-                'industry_domain', 'languages'
-            ]
-            for field in json_fields:
-                if analysis[field]:
-                    analysis[field] = json.loads(analysis[field])
-            analyses.append(analysis)
         conn.close()
-        return analyses
-    def export_to_csv(self, filepath='resume_analyses.csv'):
-        """Export all analyses to CSV"""
-        analyses = self.get_all_analyses()
-        df = pd.DataFrame(analyses)
-        df.to_csv(filepath, index=False)
-        return filepath
-    def export_to_json(self, filepath='resume_analyses.json'):
-        """Export all analyses to JSON"""
-        analyses = self.get_all_analyses()
-        with open(filepath, 'w') as f:
-            json.dump(analyses, f, indent=2)
-        return filepath
-    def get_statistics(self):
-        """Get enhanced statistics about the stored data"""
-        analyses = self.get_all_analyses()
         stats = {
             'total_resumes': len(analyses),
             'avg_work_experience': 0,
             'education_levels': {},
             'top_programming_languages': {},
             'top_technical_skills': {},
-            'top_certifications': {},
-            # New statistics
             'top_ml_frameworks': {},
             'top_visualization_tools': {},
             'top_databases': {},
-            'top_cloud_platforms': {},
             'top_etl_tools': {},
             'top_streaming_tech': {},
-            'industry_distribution': {},
-            'university_distribution': {},
-            'major_distribution': {}
         }
         for analysis in analyses:
-            # Existing statistics
-            edu_level = analysis['education_level']
-            stats['education_levels'][edu_level] = stats['education_levels'].get(edu_level, 0) + 1
-            # Count various skills and tools
-            self._count_items(analysis['programming_languages'], stats['top_programming_languages'])
-            self._count_items(analysis['technical_skills'], stats['top_technical_skills'])
-            self._count_items(analysis['certifications'], stats['top_certifications'])
-            self._count_items(analysis['ml_frameworks'], stats['top_ml_frameworks'])
-            self._count_items(analysis['visualization_tools'], stats['top_visualization_tools'])
-            self._count_items(analysis['databases'], stats['top_databases'])
-            self._count_items(analysis['cloud_platforms'], stats['top_cloud_platforms'])
-            self._count_items(analysis['etl_tools'], stats['top_etl_tools'])
-            self._count_items(analysis['streaming_technologies'], stats['top_streaming_tech'])
-            # Count university and major distribution
-            if analysis['university'] != 'Not found':
-                stats['university_distribution'][analysis['university']] = \
-                    stats['university_distribution'].get(analysis['university'], 0) + 1
-            if analysis['major'] != 'Not found':
-                stats['major_distribution'][analysis['major']] = \
-                    stats['major_distribution'].get(analysis['major'], 0) + 1
-            # Calculate average work experience
-            try:
-                exp = float(analysis['work_experience'].split()[0])
-                stats['avg_work_experience'] += exp
-            except:
-                continue
-        if stats['total_resumes'] > 0:
-            stats['avg_work_experience'] /= stats['total_resumes']
-        # Sort and limit all dictionaries to top 10
         for key in stats:
             if isinstance(stats[key], dict):
                 stats[key] = dict(sorted(stats[key].items(), key=lambda x: x[1], reverse=True)[:10])
         return stats
-    def _count_items(self, items, counter_dict):
-        """Helper method to count items in a list"""
-        if items:
-            for item in items:
-                counter_dict[item] = counter_dict.get(item, 0) + 1
-    def calculate_score(self, analysis, role_type='both'):
-        """Calculate score for a resume based on role type (data_science, data_engineering, or both)"""
-        scores = {
-            'technical_score': 0,
-            'experience_score': 0,
-            'education_score': 0,
-            'project_score': 0,
-            'impact_score': 0,
-            'total_score': 0,
-            'role_specific_score': 0
-        }
-        # Education Score (max 20 points)
-        education_weights = {
-            'PhD': 20,
-            'Masters': 18,
-            'Bachelors': 15,
-            'Associate': 10
-        }
-        edu_level = analysis['education_level'].lower()
-        for level, weight in education_weights.items():
-            if level.lower() in edu_level:
-                scores['education_score'] = weight
-                break
-        # Add points for CGPA if available
-        try:
-            cgpa = float(analysis['cgpa'].split('/')[0])
-            if cgpa >= 3.5:
-                scores['education_score'] += 5
-            elif cgpa >= 3.0:
-                scores['education_score'] += 3
-        except:
-            pass
-        # Experience Score (max 20 points)
-        try:
-            years = float(analysis['work_experience'].split()[0])
-            scores['experience_score'] = min(20, years * 4)  # 4 points per year, max 20
-        except:
-            pass
-        # Technical Skills Score (max 20 points)
-        tech_score = 0
-        if role_type in ['data_science', 'both']:
-            # Data Science specific skills
-            ds_skills = {
-                'python': 3, 'r': 2, 'sql': 2,
-                'tensorflow': 2, 'pytorch': 2, 'scikit-learn': 2,
-                'pandas': 1, 'numpy': 1, 'matplotlib': 1,
-                'tableau': 2, 'powerbi': 2,
-                'statistics': 2, 'machine learning': 3,
-                'deep learning': 3, 'nlp': 2, 'computer vision': 2
-            }
-            all_skills = (
-                analysis['programming_languages'] +
-                analysis['technical_skills'] +
-                analysis['ml_frameworks'] +
-                analysis['visualization_tools'] +
-                analysis['statistical_tools']
-            )
-            for skill in all_skills:
-                skill_lower = skill.lower()
-                for key, value in ds_skills.items():
-                    if key in skill_lower:
-                        tech_score += value
-        if role_type in ['data_engineering', 'both']:
-            # Data Engineering specific skills
-            de_skills = {
-                'sql': 3, 'python': 2, 'java': 2, 'scala': 2,
-                'hadoop': 2, 'spark': 3, 'kafka': 2,
-                'airflow': 2, 'luigi': 2,
-                'aws': 3, 'azure': 3, 'gcp': 3,
-                'snowflake': 2, 'redshift': 2,
-                'mongodb': 1, 'postgresql': 2,
-                'etl': 3, 'data warehouse': 2,
-                'data modeling': 2, 'data governance': 2
-            }
-            all_skills = (
-                analysis['programming_languages'] +
-                analysis['technical_skills'] +
-                analysis['databases'] +
-                analysis['etl_tools'] +
-                analysis['data_warehousing'] +
-                analysis['orchestration_tools'] +
-                analysis['streaming_technologies']
-            )
-            for skill in all_skills:
-                skill_lower = skill.lower()
-                for key, value in de_skills.items():
-                    if key in skill_lower:
-                        tech_score += value
-        scores['technical_score'] = min(20, tech_score)  # Cap at 20 points
-        # Project Score (max 15 points)
-        project_score = 0
-        projects = analysis['projects']
-        project_score += min(10, len(projects) * 2)  # 2 points per project, max 10
-        # Add points for research and publications
-        if analysis['research_experience']:
-            project_score += 3
-        if analysis['publications']:
-            project_score += 2
-        scores['project_score'] = project_score
-        # Impact Score (max 15 points)
-        impact_score = 0
-        # Leadership and team management
-        if analysis['leadership_experience']:
-            impact_score += 3
-        try:
-            team_size = int(''.join(filter(str.isdigit, analysis['team_size_managed'])))
-            impact_score += min(3, team_size // 5)  # 1 point per 5 team members, max 3
-        except:
-            pass
-        # Certifications
-        impact_score += min(3, len(analysis['certifications']))
-        # Awards and achievements
-        impact_score += min(3, len(analysis['awards_achievements']))
-        # Project impact metrics
-        if analysis['project_impact_metrics']:
-            impact_score += 3
-        scores['impact_score'] = impact_score
-        # Role-specific score (max 10 points)
-        role_score = 0
-        if role_type == 'data_science':
-            # Data Science specific achievements
-            if analysis['deep_learning_expertise']:
-                role_score += 2
-            if analysis['nlp_expertise']:
-                role_score += 2
-            if analysis['computer_vision_expertise']:
-                role_score += 2
-            if analysis['statistical_tools']:
-                role_score += 2
-            if analysis['visualization_tools']:
-                role_score += 2
-        elif role_type == 'data_engineering':
-            # Data Engineering specific achievements
-            if analysis['data_modeling_skills']:
-                role_score += 2
-            if analysis['data_governance_experience']:
-                role_score += 2
-            if analysis['data_quality_tools']:
-                role_score += 2
-            if analysis['ci_cd_tools']:
-                role_score += 2
-            if analysis['system_architecture']:
-                role_score += 2
-        scores['role_specific_score'] = role_score
-        # Calculate total score (max 100 points)
-        scores['total_score'] = (
-            scores['education_score'] +
-            scores['experience_score'] +
-            scores['technical_score'] +
-            scores['project_score'] +
-            scores['impact_score'] +
-            scores['role_specific_score']
-        )
-        return scores
-    def get_candidate_rankings(self, role_type='both', min_score=0):
-        """Get ranked list of candidates based on role type and minimum score"""
-        analyses = self.get_all_analyses()
-        rankings = []
-        for analysis in analyses:
-            scores = self.calculate_score(analysis, role_type)
-            if scores['total_score'] >= min_score:
-                rankings.append({
-                    'name': analysis['name'],
-                    'email': analysis['email'],
-                    'total_score': scores['total_score'],
-                    'education_score': scores['education_score'],
-                    'experience_score': scores['experience_score'],
-                    'technical_score': scores['technical_score'],
-                    'project_score': scores['project_score'],
-                    'impact_score': scores['impact_score'],
-                    'role_specific_score': scores['role_specific_score'],
-                    'key_skills': analysis['technical_skills'][:5],  # Top 5 skills
-                    'years_experience': analysis['work_experience'],
-                    'education_level': analysis['education_level']
-                })
-        # Sort by total score in descending order
-        rankings.sort(key=lambda x: x['total_score'], reverse=True)
-        return rankings

 from datetime import datetime
 class ResumeDatabase:
+    def __init__(self, db_path='resumes.db'):
         self.db_path = db_path
+        self.create_tables()
+    def create_tables(self):
         conn = sqlite3.connect(self.db_path)
+        c = conn.cursor()
+        c.execute('''CREATE TABLE IF NOT EXISTS resumes
+                    (id INTEGER PRIMARY KEY AUTOINCREMENT,
+                     name TEXT,
+                     email TEXT,
+                     phone TEXT,
+                     raw_text TEXT,
+                     analysis_json TEXT,
+                     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
         conn.commit()
         conn.close()
     def save_analysis(self, analysis_result, raw_text):
         conn = sqlite3.connect(self.db_path)
+        c = conn.cursor()
+        c.execute('''INSERT INTO resumes (name, email, phone, raw_text, analysis_json)
+                    VALUES (?, ?, ?, ?, ?)''',
+                    (analysis_result.get('name', 'Not found'),
+                     analysis_result.get('email', 'Not found'),
+                     analysis_result.get('phone', 'Not found'),
+                     raw_text,
+                     json.dumps(analysis_result)))
+        conn.commit()
+        conn.close()
+    def calculate_score(self, analysis):
+        """Calculate a comprehensive score based on resume analysis"""
+        try:
+            # Initialize scores
+            education_score = 0
+            experience_score = 0
+            technical_score = 0
+            project_score = 0
+            impact_score = 0
+            role_specific_score = 0
+            # Education Score (max 20 points)
+            edu_level = str(analysis.get('education_level', '')).lower()
+            if edu_level:
+                if 'phd' in edu_level or 'doctorate' in edu_level:
+                    education_score += 20
+                elif 'master' in edu_level or 'ms' in edu_level or 'mtech' in edu_level:
+                    education_score += 18
+                elif 'bachelor' in edu_level or 'bs' in edu_level or 'btech' in edu_level:
+                    education_score += 15
+                else:
+                    education_score += 10
+            # Add points for CGPA if available
+            cgpa = analysis.get('cgpa', 'Not found')
+            if isinstance(cgpa, (int, float)):
+                if cgpa >= 3.5:  # Assuming 4.0 scale
+                    education_score = min(20, education_score + 2)
+            # Experience Score (max 20 points)
+            years_exp = analysis.get('years_experience', 0)
+            if isinstance(years_exp, (int, float)):
+                experience_score = min(20, years_exp * 4)  # 5 years for max score
+            elif isinstance(years_exp, str) and years_exp.replace('.', '').isdigit():
+                experience_score = min(20, float(years_exp) * 4)
+            # Technical Score (max 20 points)
+            tech_skills = {
+                'programming_languages': analysis.get('programming_languages', []),
+                'technical_skills': analysis.get('technical_skills', []),
+                'ml_frameworks': analysis.get('ml_frameworks', []),
+                'databases': analysis.get('databases', []),
+                'cloud_platforms': analysis.get('cloud_platforms', [])
+            }
+            total_skills = sum(len(skills) for skills in tech_skills.values())
+            technical_score = min(20, total_skills * 2)
+            # Project Score (max 15 points)
+            projects = len(analysis.get('projects', []))
+            research_exp = 1 if analysis.get('research_experience') else 0
+            publications = len(analysis.get('publications', []))
+            project_score = min(15, projects * 2 + research_exp * 3 + publications * 2)
+            # Impact Score (max 15 points)
+            leadership = 1 if analysis.get('leadership_experience') else 0
+            team_size = analysis.get('team_size', 0)
+            if isinstance(team_size, str):
+                try:
+                    team_size = int(''.join(filter(str.isdigit, team_size)))
+                except:
+                    team_size = 0
+            certifications = len(analysis.get('certifications', []))
+            awards = len(analysis.get('awards', []))
+            impact_score = min(15, leadership * 5 + min(5, team_size/2) + min(5, certifications * 2 + awards))
+            # Role Specific Score (max 10 points)
+            ds_skills = len(analysis.get('ml_frameworks', [])) + len(analysis.get('deep_learning', [])) + \
+                       len(analysis.get('nlp_skills', [])) + len(analysis.get('computer_vision', []))
+            de_skills = len(analysis.get('etl_tools', [])) + len(analysis.get('data_warehousing', [])) + \
+                       len(analysis.get('orchestration_tools', [])) + len(analysis.get('streaming_tech', []))
+            role_specific_score = min(10, max(ds_skills, de_skills))
+            # Calculate total score
+            total_score = education_score + experience_score + technical_score + \
+                         project_score + impact_score + role_specific_score
+            return {
+                'total_score': total_score,
+                'education_score': education_score,
+                'experience_score': experience_score,
+                'technical_score': technical_score,
+                'project_score': project_score,
+                'impact_score': impact_score,
+                'role_specific_score': role_specific_score
+            }
+        except Exception as e:
+            print(f"Error calculating score: {str(e)}")
+            return {
+                'total_score': 0,
+                'education_score': 0,
+                'experience_score': 0,
+                'technical_score': 0,
+                'project_score': 0,
+                'impact_score': 0,
+                'role_specific_score': 0
+            }
+    def get_statistics(self):
+        """Get statistics of analyzed resumes"""
         conn = sqlite3.connect(self.db_path)
+        df = pd.read_sql_query("SELECT analysis_json FROM resumes", conn)
         conn.close()
+        if df.empty:
+            return {
+                'total_resumes': 0,
+                'avg_work_experience': 0,
+                'education_levels': {},
+                'major_distribution': {},
+                'top_programming_languages': {},
+                'top_technical_skills': {},
+                'top_ml_frameworks': {},
+                'top_visualization_tools': {},
+                'top_databases': {},
+                'top_etl_tools': {},
+                'top_streaming_tech': {},
+                'top_cloud_platforms': {},
+                'top_certifications': {},
+                'university_distribution': {}
+            }
+        analyses = [json.loads(x) for x in df['analysis_json']]
+        # Calculate statistics
         stats = {
             'total_resumes': len(analyses),
             'avg_work_experience': 0,
             'education_levels': {},
+            'major_distribution': {},
             'top_programming_languages': {},
             'top_technical_skills': {},
             'top_ml_frameworks': {},
             'top_visualization_tools': {},
             'top_databases': {},
             'top_etl_tools': {},
             'top_streaming_tech': {},
+            'top_cloud_platforms': {},
+            'top_certifications': {},
+            'university_distribution': {}
         }
+        # Calculate averages and distributions
+        total_exp = 0
+        valid_exp = 0
         for analysis in analyses:
+            # Work experience
+            exp = analysis.get('years_experience', 0)
+            if isinstance(exp, (int, float)) or (isinstance(exp, str) and exp.replace('.', '').isdigit()):
+                try:
+                    exp = float(exp)
+                    total_exp += exp
+                    valid_exp += 1
+                except:
+                    pass
+            # Education level
+            edu = analysis.get('education_level', 'Not specified')
+            stats['education_levels'][edu] = stats['education_levels'].get(edu, 0) + 1
+            # Major
+            major = analysis.get('major', 'Not specified')
+            stats['major_distribution'][major] = stats['major_distribution'].get(major, 0) + 1
+            # University
+            uni = analysis.get('university', 'Not specified')
+            stats['university_distribution'][uni] = stats['university_distribution'].get(uni, 0) + 1
+            # Technical skills distributions
+            for lang in analysis.get('programming_languages', []):
+                stats['top_programming_languages'][lang] = stats['top_programming_languages'].get(lang, 0) + 1
+            for skill in analysis.get('technical_skills', []):
+                stats['top_technical_skills'][skill] = stats['top_technical_skills'].get(skill, 0) + 1
+            for framework in analysis.get('ml_frameworks', []):
+                stats['top_ml_frameworks'][framework] = stats['top_ml_frameworks'].get(framework, 0) + 1
+            for tool in analysis.get('visualization_tools', []):
+                stats['top_visualization_tools'][tool] = stats['top_visualization_tools'].get(tool, 0) + 1
+            for db in analysis.get('databases', []):
+                stats['top_databases'][db] = stats['top_databases'].get(db, 0) + 1
+            for tool in analysis.get('etl_tools', []):
+                stats['top_etl_tools'][tool] = stats['top_etl_tools'].get(tool, 0) + 1
+            for tech in analysis.get('streaming_tech', []):
+                stats['top_streaming_tech'][tech] = stats['top_streaming_tech'].get(tech, 0) + 1
+            for platform in analysis.get('cloud_platforms', []):
+                stats['top_cloud_platforms'][platform] = stats['top_cloud_platforms'].get(platform, 0) + 1
+            for cert in analysis.get('certifications', []):
+                stats['top_certifications'][cert] = stats['top_certifications'].get(cert, 0) + 1
+        # Calculate average work experience
+        stats['avg_work_experience'] = total_exp / valid_exp if valid_exp > 0 else 0
+        # Sort and limit distributions
         for key in stats:
             if isinstance(stats[key], dict):
                 stats[key] = dict(sorted(stats[key].items(), key=lambda x: x[1], reverse=True)[:10])
         return stats
+    def get_candidate_rankings(self, role_type='both', min_score=50):
+        """Get ranked list of candidates based on their scores"""
+        conn = sqlite3.connect(self.db_path)
+        df = pd.read_sql_query("SELECT analysis_json FROM resumes", conn)
+        conn.close()
+        if df.empty:
+            return []
+        rankings = []
+        for analysis_json in df['analysis_json']:
+            analysis = json.loads(analysis_json)
+            scores = self.calculate_score(analysis)
+            if scores['total_score'] >= min_score:
+                candidate = {
+                    'name': analysis.get('name', 'Not found'),
+                    'email': analysis.get('email', 'Not found'),
+                    'years_experience': analysis.get('years_experience', 'Not found'),
+                    'education_level': analysis.get('education_level', 'Not found'),
+                    'key_skills': (
+                        analysis.get('programming_languages', []) +
+                        analysis.get('technical_skills', [])
+                    )[:5],  # Top 5 skills
+                    **scores
+                }
+                # Filter based on role type
+                if role_type == 'data_science':
+                    ds_score = len(analysis.get('ml_frameworks', [])) + \
+                              len(analysis.get('deep_learning', [])) + \
+                              len(analysis.get('nlp_skills', [])) + \
+                              len(analysis.get('computer_vision', []))
+                    if ds_score > 0:
+                        rankings.append(candidate)
+                elif role_type == 'data_engineering':
+                    de_score = len(analysis.get('etl_tools', [])) + \
+                              len(analysis.get('data_warehousing', [])) + \
+                              len(analysis.get('orchestration_tools', [])) + \
+                              len(analysis.get('streaming_tech', []))
+                    if de_score > 0:
+                        rankings.append(candidate)
+                else:  # both
+                    rankings.append(candidate)
+        # Sort by total score
+        rankings.sort(key=lambda x: x['total_score'], reverse=True)
+        return rankings
+    def export_to_csv(self):
+        """Export analyses to CSV"""
+        conn = sqlite3.connect(self.db_path)
+        df = pd.read_sql_query("SELECT * FROM resumes", conn)
+        conn.close()
+        csv_path = f"resume_analyses_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
+        df.to_csv(csv_path, index=False)
+        return csv_path
+    def export_to_json(self):
+        """Export analyses to JSON"""
+        conn = sqlite3.connect(self.db_path)
+        df = pd.read_sql_query("SELECT * FROM resumes", conn)
+        conn.close()
+        json_path = f"resume_analyses_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+        df.to_json(json_path, orient='records')
+        return json_path