Deepakkori45 committed on
Commit
4cee71c
·
verified ·
1 Parent(s): 0b7aa1f

Update database.py

Browse files
Files changed (1) hide show
  1. database.py +267 -479
database.py CHANGED
@@ -4,530 +4,318 @@ import pandas as pd
4
  from datetime import datetime
5
 
6
  class ResumeDatabase:
7
- def __init__(self, db_path='resume_data.db'):
8
  self.db_path = db_path
9
- self.init_database()
10
 
11
- def init_database(self):
12
- """Initialize the database with required tables"""
13
  conn = sqlite3.connect(self.db_path)
14
- cursor = conn.cursor()
15
-
16
- # Create resume_analyses table with enhanced fields for DS/DE roles
17
- cursor.execute('''
18
- CREATE TABLE IF NOT EXISTS resume_analyses (
19
- id INTEGER PRIMARY KEY AUTOINCREMENT,
20
- timestamp TEXT,
21
-
22
- -- Basic Information
23
- name TEXT,
24
- email TEXT,
25
- phone TEXT,
26
- location TEXT,
27
- linkedin_url TEXT,
28
- github_url TEXT,
29
- portfolio_url TEXT,
30
-
31
- -- Education & Experience
32
- cgpa TEXT,
33
- work_experience TEXT,
34
- education_level TEXT,
35
- major TEXT,
36
- university TEXT,
37
- internships TEXT,
38
-
39
- -- Skills & Expertise
40
- technical_skills TEXT,
41
- programming_languages TEXT,
42
- job_titles TEXT,
43
- ds_de_skills TEXT,
44
- certifications TEXT,
45
-
46
- -- Data Science Specific Fields
47
- ml_frameworks TEXT,
48
- visualization_tools TEXT,
49
- statistical_tools TEXT,
50
- big_data_tools TEXT,
51
- cloud_platforms TEXT,
52
- deep_learning_expertise TEXT,
53
- nlp_expertise TEXT,
54
- computer_vision_expertise TEXT,
55
-
56
- -- Data Engineering Specific Fields
57
- databases TEXT,
58
- etl_tools TEXT,
59
- data_warehousing TEXT,
60
- orchestration_tools TEXT,
61
- streaming_technologies TEXT,
62
- data_modeling_skills TEXT,
63
- data_governance_experience TEXT,
64
- data_quality_tools TEXT,
65
-
66
- -- Project Information
67
- projects TEXT,
68
- publications TEXT,
69
- research_experience TEXT,
70
- hackathons TEXT,
71
- awards_achievements TEXT,
72
-
73
- -- Additional Skills & Metrics
74
- soft_skills TEXT,
75
- industry_domain TEXT,
76
- languages TEXT,
77
- leadership_experience TEXT,
78
- team_size_managed TEXT,
79
-
80
- -- Performance Metrics
81
- code_quality_metrics TEXT,
82
- project_impact_metrics TEXT,
83
- performance_improvements TEXT,
84
-
85
- -- Additional Technical Areas
86
- version_control_systems TEXT,
87
- ci_cd_tools TEXT,
88
- testing_frameworks TEXT,
89
- agile_methodologies TEXT,
90
- system_architecture TEXT,
91
-
92
- -- Business & Domain Knowledge
93
- business_domain_expertise TEXT,
94
- industry_certifications TEXT,
95
- domain_specific_tools TEXT,
96
- compliance_knowledge TEXT,
97
-
98
- -- Raw Data
99
- raw_text TEXT,
100
-
101
- -- Metadata
102
- last_updated TEXT,
103
- resume_version TEXT,
104
- analysis_confidence_score TEXT
105
- )
106
- ''')
107
-
108
  conn.commit()
109
  conn.close()
110
 
111
  def save_analysis(self, analysis_result, raw_text):
112
- """Save analysis results to database"""
113
  conn = sqlite3.connect(self.db_path)
114
- cursor = conn.cursor()
 
 
 
 
 
 
 
 
 
 
 
115
 
116
- # Convert lists and dictionaries to JSON strings for storage
117
- analysis_data = {
118
- 'timestamp': datetime.now().isoformat(),
119
-
120
- # Basic Information
121
- 'name': analysis_result.get('Name', 'Not found'),
122
- 'email': analysis_result.get('Email', 'Not found'),
123
- 'phone': analysis_result.get('Phone', 'Not found'),
124
- 'location': analysis_result.get('Location', 'Not found'),
125
- 'linkedin_url': analysis_result.get('LinkedIn', 'Not found'),
126
- 'github_url': analysis_result.get('GitHub', 'Not found'),
127
- 'portfolio_url': analysis_result.get('Portfolio', 'Not found'),
128
-
129
- # Education & Experience
130
- 'cgpa': analysis_result.get('CGPA', 'Not found'),
131
- 'work_experience': analysis_result.get('Total years of work experience', 'Not found'),
132
- 'education_level': analysis_result.get('Education level', 'Not found'),
133
- 'major': analysis_result.get('Major', 'Not found'),
134
- 'university': analysis_result.get('University', 'Not found'),
135
- 'internships': json.dumps(analysis_result.get('Internships', [])),
136
-
137
- # Skills & Expertise
138
- 'technical_skills': json.dumps(analysis_result.get('Technical skills', [])),
139
- 'programming_languages': json.dumps(analysis_result.get('Programming languages', [])),
140
- 'job_titles': json.dumps(analysis_result.get('Job titles', [])),
141
- 'ds_de_skills': json.dumps(analysis_result.get('Data science/engineering specific skills', [])),
142
- 'certifications': json.dumps(analysis_result.get('Certifications', [])),
143
-
144
- # Data Science Specific Fields
145
- 'ml_frameworks': json.dumps(analysis_result.get('Machine learning frameworks', [])),
146
- 'visualization_tools': json.dumps(analysis_result.get('Visualization tools', [])),
147
- 'statistical_tools': json.dumps(analysis_result.get('Statistical tools', [])),
148
- 'big_data_tools': json.dumps(analysis_result.get('Big data tools', [])),
149
- 'cloud_platforms': json.dumps(analysis_result.get('Cloud platforms', [])),
150
- 'deep_learning_expertise': json.dumps(analysis_result.get('Deep learning expertise', [])),
151
- 'nlp_expertise': json.dumps(analysis_result.get('NLP expertise', [])),
152
- 'computer_vision_expertise': json.dumps(analysis_result.get('Computer vision expertise', [])),
153
-
154
- # Data Engineering Specific Fields
155
- 'databases': json.dumps(analysis_result.get('Databases', [])),
156
- 'etl_tools': json.dumps(analysis_result.get('ETL tools', [])),
157
- 'data_warehousing': json.dumps(analysis_result.get('Data warehousing', [])),
158
- 'orchestration_tools': json.dumps(analysis_result.get('Orchestration tools', [])),
159
- 'streaming_technologies': json.dumps(analysis_result.get('Streaming technologies', [])),
160
- 'data_modeling_skills': json.dumps(analysis_result.get('Data modeling skills', [])),
161
- 'data_governance_experience': json.dumps(analysis_result.get('Data governance experience', [])),
162
- 'data_quality_tools': json.dumps(analysis_result.get('Data quality tools', [])),
163
 
164
- # Project Information
165
- 'projects': json.dumps(analysis_result.get('Projects', [])),
166
- 'publications': json.dumps(analysis_result.get('Publications', [])),
167
- 'research_experience': json.dumps(analysis_result.get('Research experience', [])),
168
- 'hackathons': json.dumps(analysis_result.get('Hackathons', [])),
169
- 'awards_achievements': json.dumps(analysis_result.get('Awards and achievements', [])),
 
170
 
171
- # Additional Skills & Metrics
172
- 'soft_skills': json.dumps(analysis_result.get('Soft skills', [])),
173
- 'industry_domain': json.dumps(analysis_result.get('Industry domain', [])),
174
- 'languages': json.dumps(analysis_result.get('Languages', [])),
175
- 'leadership_experience': json.dumps(analysis_result.get('Leadership experience', [])),
176
- 'team_size_managed': analysis_result.get('Team size managed', 'Not found'),
 
 
 
 
177
 
178
- # Performance Metrics
179
- 'code_quality_metrics': json.dumps(analysis_result.get('Code quality metrics', [])),
180
- 'project_impact_metrics': json.dumps(analysis_result.get('Project impact metrics', [])),
181
- 'performance_improvements': json.dumps(analysis_result.get('Performance improvements', [])),
182
 
183
- # Additional Technical Areas
184
- 'version_control_systems': json.dumps(analysis_result.get('Version control systems', [])),
185
- 'ci_cd_tools': json.dumps(analysis_result.get('CI/CD tools', [])),
186
- 'testing_frameworks': json.dumps(analysis_result.get('Testing frameworks', [])),
187
- 'agile_methodologies': json.dumps(analysis_result.get('Agile methodologies', [])),
188
- 'system_architecture': json.dumps(analysis_result.get('System architecture experience', [])),
189
 
190
- # Business & Domain Knowledge
191
- 'business_domain_expertise': json.dumps(analysis_result.get('Business domain expertise', [])),
192
- 'industry_certifications': json.dumps(analysis_result.get('Industry certifications', [])),
193
- 'domain_specific_tools': json.dumps(analysis_result.get('Domain specific tools', [])),
194
- 'compliance_knowledge': json.dumps(analysis_result.get('Compliance knowledge', [])),
195
 
196
- # Raw Data and Metadata
197
- 'raw_text': raw_text,
198
- 'last_updated': datetime.now().isoformat(),
199
- 'resume_version': '1.0',
200
- 'analysis_confidence_score': analysis_result.get('Analysis confidence score', '0.0')
201
- }
202
-
203
- # Create the SQL query dynamically based on the fields
204
- fields = ', '.join(analysis_data.keys())
205
- placeholders = ', '.join(['?' for _ in analysis_data])
206
- query = f'INSERT INTO resume_analyses ({fields}) VALUES ({placeholders})'
207
-
208
- cursor.execute(query, list(analysis_data.values()))
209
- conn.commit()
210
- conn.close()
 
 
 
 
 
 
 
 
 
 
 
211
 
212
- def get_all_analyses(self):
213
- """Retrieve all analyses from database"""
214
  conn = sqlite3.connect(self.db_path)
215
- cursor = conn.cursor()
216
-
217
- cursor.execute('SELECT * FROM resume_analyses')
218
- columns = [description[0] for description in cursor.description]
219
- results = cursor.fetchall()
220
-
221
- analyses = []
222
- for row in results:
223
- analysis = dict(zip(columns, row))
224
- # Convert JSON strings back to lists/dicts for all relevant fields
225
- json_fields = [
226
- 'technical_skills', 'programming_languages', 'job_titles',
227
- 'ds_de_skills', 'certifications', 'ml_frameworks',
228
- 'visualization_tools', 'statistical_tools', 'big_data_tools',
229
- 'cloud_platforms', 'databases', 'etl_tools', 'data_warehousing',
230
- 'orchestration_tools', 'streaming_technologies', 'projects',
231
- 'publications', 'research_experience', 'soft_skills',
232
- 'industry_domain', 'languages'
233
- ]
234
- for field in json_fields:
235
- if analysis[field]:
236
- analysis[field] = json.loads(analysis[field])
237
- analyses.append(analysis)
238
-
239
  conn.close()
240
- return analyses
241
-
242
- def export_to_csv(self, filepath='resume_analyses.csv'):
243
- """Export all analyses to CSV"""
244
- analyses = self.get_all_analyses()
245
- df = pd.DataFrame(analyses)
246
- df.to_csv(filepath, index=False)
247
- return filepath
248
-
249
- def export_to_json(self, filepath='resume_analyses.json'):
250
- """Export all analyses to JSON"""
251
- analyses = self.get_all_analyses()
252
- with open(filepath, 'w') as f:
253
- json.dump(analyses, f, indent=2)
254
- return filepath
255
 
256
- def get_statistics(self):
257
- """Get enhanced statistics about the stored data"""
258
- analyses = self.get_all_analyses()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
 
260
  stats = {
261
  'total_resumes': len(analyses),
262
  'avg_work_experience': 0,
263
  'education_levels': {},
 
264
  'top_programming_languages': {},
265
  'top_technical_skills': {},
266
- 'top_certifications': {},
267
-
268
- # New statistics
269
  'top_ml_frameworks': {},
270
  'top_visualization_tools': {},
271
  'top_databases': {},
272
- 'top_cloud_platforms': {},
273
  'top_etl_tools': {},
274
  'top_streaming_tech': {},
275
- 'industry_distribution': {},
276
- 'university_distribution': {},
277
- 'major_distribution': {}
278
  }
279
 
 
 
 
 
280
  for analysis in analyses:
281
- # Existing statistics
282
- edu_level = analysis['education_level']
283
- stats['education_levels'][edu_level] = stats['education_levels'].get(edu_level, 0) + 1
284
-
285
- # Count various skills and tools
286
- self._count_items(analysis['programming_languages'], stats['top_programming_languages'])
287
- self._count_items(analysis['technical_skills'], stats['top_technical_skills'])
288
- self._count_items(analysis['certifications'], stats['top_certifications'])
289
- self._count_items(analysis['ml_frameworks'], stats['top_ml_frameworks'])
290
- self._count_items(analysis['visualization_tools'], stats['top_visualization_tools'])
291
- self._count_items(analysis['databases'], stats['top_databases'])
292
- self._count_items(analysis['cloud_platforms'], stats['top_cloud_platforms'])
293
- self._count_items(analysis['etl_tools'], stats['top_etl_tools'])
294
- self._count_items(analysis['streaming_technologies'], stats['top_streaming_tech'])
295
-
296
- # Count university and major distribution
297
- if analysis['university'] != 'Not found':
298
- stats['university_distribution'][analysis['university']] = \
299
- stats['university_distribution'].get(analysis['university'], 0) + 1
300
-
301
- if analysis['major'] != 'Not found':
302
- stats['major_distribution'][analysis['major']] = \
303
- stats['major_distribution'].get(analysis['major'], 0) + 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
 
305
- # Calculate average work experience
306
- try:
307
- exp = float(analysis['work_experience'].split()[0])
308
- stats['avg_work_experience'] += exp
309
- except:
310
- continue
311
 
312
- if stats['total_resumes'] > 0:
313
- stats['avg_work_experience'] /= stats['total_resumes']
314
 
315
- # Sort and limit all dictionaries to top 10
316
  for key in stats:
317
  if isinstance(stats[key], dict):
318
  stats[key] = dict(sorted(stats[key].items(), key=lambda x: x[1], reverse=True)[:10])
319
 
320
  return stats
321
 
322
- def _count_items(self, items, counter_dict):
323
- """Helper method to count items in a list"""
324
- if items:
325
- for item in items:
326
- counter_dict[item] = counter_dict.get(item, 0) + 1
327
-
328
- def calculate_score(self, analysis, role_type='both'):
329
- """Calculate score for a resume based on role type (data_science, data_engineering, or both)"""
330
- scores = {
331
- 'technical_score': 0,
332
- 'experience_score': 0,
333
- 'education_score': 0,
334
- 'project_score': 0,
335
- 'impact_score': 0,
336
- 'total_score': 0,
337
- 'role_specific_score': 0
338
- }
339
-
340
- # Education Score (max 20 points)
341
- education_weights = {
342
- 'PhD': 20,
343
- 'Masters': 18,
344
- 'Bachelors': 15,
345
- 'Associate': 10
346
- }
347
- edu_level = analysis['education_level'].lower()
348
- for level, weight in education_weights.items():
349
- if level.lower() in edu_level:
350
- scores['education_score'] = weight
351
- break
352
-
353
- # Add points for CGPA if available
354
- try:
355
- cgpa = float(analysis['cgpa'].split('/')[0])
356
- if cgpa >= 3.5:
357
- scores['education_score'] += 5
358
- elif cgpa >= 3.0:
359
- scores['education_score'] += 3
360
- except:
361
- pass
362
-
363
- # Experience Score (max 20 points)
364
- try:
365
- years = float(analysis['work_experience'].split()[0])
366
- scores['experience_score'] = min(20, years * 4) # 4 points per year, max 20
367
- except:
368
- pass
369
-
370
- # Technical Skills Score (max 20 points)
371
- tech_score = 0
372
- if role_type in ['data_science', 'both']:
373
- # Data Science specific skills
374
- ds_skills = {
375
- 'python': 3, 'r': 2, 'sql': 2,
376
- 'tensorflow': 2, 'pytorch': 2, 'scikit-learn': 2,
377
- 'pandas': 1, 'numpy': 1, 'matplotlib': 1,
378
- 'tableau': 2, 'powerbi': 2,
379
- 'statistics': 2, 'machine learning': 3,
380
- 'deep learning': 3, 'nlp': 2, 'computer vision': 2
381
- }
382
-
383
- all_skills = (
384
- analysis['programming_languages'] +
385
- analysis['technical_skills'] +
386
- analysis['ml_frameworks'] +
387
- analysis['visualization_tools'] +
388
- analysis['statistical_tools']
389
- )
390
-
391
- for skill in all_skills:
392
- skill_lower = skill.lower()
393
- for key, value in ds_skills.items():
394
- if key in skill_lower:
395
- tech_score += value
396
-
397
- if role_type in ['data_engineering', 'both']:
398
- # Data Engineering specific skills
399
- de_skills = {
400
- 'sql': 3, 'python': 2, 'java': 2, 'scala': 2,
401
- 'hadoop': 2, 'spark': 3, 'kafka': 2,
402
- 'airflow': 2, 'luigi': 2,
403
- 'aws': 3, 'azure': 3, 'gcp': 3,
404
- 'snowflake': 2, 'redshift': 2,
405
- 'mongodb': 1, 'postgresql': 2,
406
- 'etl': 3, 'data warehouse': 2,
407
- 'data modeling': 2, 'data governance': 2
408
- }
409
-
410
- all_skills = (
411
- analysis['programming_languages'] +
412
- analysis['technical_skills'] +
413
- analysis['databases'] +
414
- analysis['etl_tools'] +
415
- analysis['data_warehousing'] +
416
- analysis['orchestration_tools'] +
417
- analysis['streaming_technologies']
418
- )
419
-
420
- for skill in all_skills:
421
- skill_lower = skill.lower()
422
- for key, value in de_skills.items():
423
- if key in skill_lower:
424
- tech_score += value
425
-
426
- scores['technical_score'] = min(20, tech_score) # Cap at 20 points
427
-
428
- # Project Score (max 15 points)
429
- project_score = 0
430
- projects = analysis['projects']
431
- project_score += min(10, len(projects) * 2) # 2 points per project, max 10
432
-
433
- # Add points for research and publications
434
- if analysis['research_experience']:
435
- project_score += 3
436
- if analysis['publications']:
437
- project_score += 2
438
-
439
- scores['project_score'] = project_score
440
 
441
- # Impact Score (max 15 points)
442
- impact_score = 0
443
-
444
- # Leadership and team management
445
- if analysis['leadership_experience']:
446
- impact_score += 3
447
- try:
448
- team_size = int(''.join(filter(str.isdigit, analysis['team_size_managed'])))
449
- impact_score += min(3, team_size // 5) # 1 point per 5 team members, max 3
450
- except:
451
- pass
452
-
453
- # Certifications
454
- impact_score += min(3, len(analysis['certifications']))
455
-
456
- # Awards and achievements
457
- impact_score += min(3, len(analysis['awards_achievements']))
458
-
459
- # Project impact metrics
460
- if analysis['project_impact_metrics']:
461
- impact_score += 3
462
 
463
- scores['impact_score'] = impact_score
464
-
465
- # Role-specific score (max 10 points)
466
- role_score = 0
467
- if role_type == 'data_science':
468
- # Data Science specific achievements
469
- if analysis['deep_learning_expertise']:
470
- role_score += 2
471
- if analysis['nlp_expertise']:
472
- role_score += 2
473
- if analysis['computer_vision_expertise']:
474
- role_score += 2
475
- if analysis['statistical_tools']:
476
- role_score += 2
477
- if analysis['visualization_tools']:
478
- role_score += 2
479
-
480
- elif role_type == 'data_engineering':
481
- # Data Engineering specific achievements
482
- if analysis['data_modeling_skills']:
483
- role_score += 2
484
- if analysis['data_governance_experience']:
485
- role_score += 2
486
- if analysis['data_quality_tools']:
487
- role_score += 2
488
- if analysis['ci_cd_tools']:
489
- role_score += 2
490
- if analysis['system_architecture']:
491
- role_score += 2
492
 
493
- scores['role_specific_score'] = role_score
494
-
495
- # Calculate total score (max 100 points)
496
- scores['total_score'] = (
497
- scores['education_score'] +
498
- scores['experience_score'] +
499
- scores['technical_score'] +
500
- scores['project_score'] +
501
- scores['impact_score'] +
502
- scores['role_specific_score']
503
- )
504
-
505
- return scores
506
-
507
- def get_candidate_rankings(self, role_type='both', min_score=0):
508
- """Get ranked list of candidates based on role type and minimum score"""
509
- analyses = self.get_all_analyses()
510
- rankings = []
 
 
 
 
 
 
 
 
 
511
 
512
- for analysis in analyses:
513
- scores = self.calculate_score(analysis, role_type)
514
- if scores['total_score'] >= min_score:
515
- rankings.append({
516
- 'name': analysis['name'],
517
- 'email': analysis['email'],
518
- 'total_score': scores['total_score'],
519
- 'education_score': scores['education_score'],
520
- 'experience_score': scores['experience_score'],
521
- 'technical_score': scores['technical_score'],
522
- 'project_score': scores['project_score'],
523
- 'impact_score': scores['impact_score'],
524
- 'role_specific_score': scores['role_specific_score'],
525
- 'key_skills': analysis['technical_skills'][:5], # Top 5 skills
526
- 'years_experience': analysis['work_experience'],
527
- 'education_level': analysis['education_level']
528
- })
529
 
530
- # Sort by total score in descending order
531
- rankings.sort(key=lambda x: x['total_score'], reverse=True)
532
- return rankings
533
-
 
4
  from datetime import datetime
5
 
6
class ResumeDatabase:
    """SQLite-backed store for resume analyses, with scoring and reporting.

    Every analysis is stored as one row of the ``resumes`` table: a few
    contact columns, the raw resume text, and the full analysis dict
    serialized as JSON in ``analysis_json``.  Scoring and statistics are
    computed from that JSON.

    Analysis dicts are treated as untrusted in shape: a field that should
    be a list may be missing, ``None``, or a placeholder string such as
    ``'Not found'``.  The private helpers below normalize those cases so a
    single odd field cannot distort or zero out a whole score.
    """

    # Strings that mean "no data".  'Not found' and 'Not specified' are the
    # defaults this class itself uses; the others are common equivalents.
    _PLACEHOLDERS = frozenset({'', 'not found', 'not specified', 'n/a', 'none'})

    # (statistics key, analysis key) pairs for the "top N" list-valued fields.
    _LIST_STAT_FIELDS = (
        ('top_programming_languages', 'programming_languages'),
        ('top_technical_skills', 'technical_skills'),
        ('top_ml_frameworks', 'ml_frameworks'),
        ('top_visualization_tools', 'visualization_tools'),
        ('top_databases', 'databases'),
        ('top_etl_tools', 'etl_tools'),
        ('top_streaming_tech', 'streaming_tech'),
        ('top_cloud_platforms', 'cloud_platforms'),
        ('top_certifications', 'certifications'),
    )

    # Analysis keys that signal a data-science / data-engineering profile.
    _DS_FIELDS = ('ml_frameworks', 'deep_learning', 'nlp_skills', 'computer_vision')
    _DE_FIELDS = ('etl_tools', 'data_warehousing', 'orchestration_tools', 'streaming_tech')

    def __init__(self, db_path='resumes.db'):
        """Remember ``db_path`` and make sure the ``resumes`` table exists."""
        self.db_path = db_path
        self.create_tables()

    # ------------------------------------------------------------------
    # Database plumbing
    # ------------------------------------------------------------------
    def _execute_write(self, sql, params=()):
        """Run one write statement and commit; always close the connection."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute(sql, params)
            conn.commit()
        finally:
            # Close even when execute/commit raises so the handle is not leaked.
            conn.close()

    def _read_frame(self, sql):
        """Run a SELECT and return the result as a pandas DataFrame."""
        conn = sqlite3.connect(self.db_path)
        try:
            return pd.read_sql_query(sql, conn)
        finally:
            conn.close()

    def _load_analyses(self):
        """Return every stored analysis that decodes to a dict.

        Rows whose ``analysis_json`` is NULL, not valid JSON, or not a JSON
        object are skipped so one bad row cannot break reporting.
        """
        frame = self._read_frame("SELECT analysis_json FROM resumes")
        analyses = []
        for raw in frame['analysis_json']:
            try:
                parsed = json.loads(raw)
            except (TypeError, ValueError):
                # TypeError: NULL / non-string cell; ValueError: malformed JSON.
                continue
            if isinstance(parsed, dict):
                analyses.append(parsed)
        return analyses

    def create_tables(self):
        """Create the ``resumes`` table if it does not exist yet."""
        self._execute_write(
            '''CREATE TABLE IF NOT EXISTS resumes
               (id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT,
                email TEXT,
                phone TEXT,
                raw_text TEXT,
                analysis_json TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)'''
        )

    def save_analysis(self, analysis_result, raw_text):
        """Insert one analysis.

        Args:
            analysis_result: dict produced by the analyzer; must be JSON
                serializable.  ``name``/``email``/``phone`` are copied into
                their own columns (``'Not found'`` when absent).
            raw_text: the resume text the analysis was derived from.
        """
        self._execute_write(
            '''INSERT INTO resumes (name, email, phone, raw_text, analysis_json)
               VALUES (?, ?, ?, ?, ?)''',
            (
                analysis_result.get('name', 'Not found'),
                analysis_result.get('email', 'Not found'),
                analysis_result.get('phone', 'Not found'),
                raw_text,
                json.dumps(analysis_result),
            ),
        )

    # ------------------------------------------------------------------
    # Value normalization helpers
    # ------------------------------------------------------------------
    @classmethod
    def _is_missing(cls, value):
        """True for ``None`` and for placeholder/blank strings."""
        if value is None:
            return True
        if isinstance(value, str):
            return value.strip().lower() in cls._PLACEHOLDERS
        return False

    @classmethod
    def _has_value(cls, value):
        """True when ``value`` is truthy and not a placeholder string."""
        return bool(value) and not cls._is_missing(value)

    @classmethod
    def _as_list(cls, value):
        """Coerce an analysis field to a list of items.

        Containers are listed as-is, a real (non-placeholder) string counts
        as a single item, and everything else yields an empty list.  This
        keeps ``len('Not found')`` from being counted as nine entries.
        """
        if isinstance(value, (list, tuple, set, dict)):
            return list(value)
        if isinstance(value, str) and not cls._is_missing(value):
            return [value]
        return []

    @classmethod
    def _count(cls, analysis, keys):
        """Total number of items across the given list-valued fields."""
        return sum(len(cls._as_list(analysis.get(key))) for key in keys)

    @staticmethod
    def _parse_number(value):
        """Return ``value`` as a finite number, or ``None`` if it is not one.

        Numbers pass through unchanged (bools are rejected).  For strings the
        first whitespace-separated token is parsed, with a trailing ``+``
        ignored, so ``"3 years"`` and ``"5+ years"`` are accepted while
        ``"1.2.3"`` or ``"Not found"`` give ``None``.
        """
        if isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            number = value
        elif isinstance(value, str):
            tokens = value.split()
            if not tokens:
                return None
            try:
                number = float(tokens[0].rstrip('+'))
            except ValueError:
                return None
        else:
            return None
        # Reject NaN (never equal to itself) and infinities.
        if number != number or number in (float('inf'), float('-inf')):
            return None
        return number

    @staticmethod
    def _first_int(text):
        """Return the first run of ASCII digits in ``text`` as an int (0 if none).

        Only the first run is used so that ``"5-10 people"`` is read as 5
        rather than the concatenation 510.
        """
        run = ''
        for ch in text:
            if ch in '0123456789':
                run += ch
            elif run:
                break
        return int(run) if run else 0

    @classmethod
    def _cgpa_on_four_point_scale(cls, cgpa):
        """Parse a CGPA; ``"x/scale"`` strings are rescaled linearly to 4.0.

        Bare numbers are returned unchanged (assumed to already be on a
        4.0 scale).  Returns ``None`` when nothing numeric can be read.
        """
        if isinstance(cgpa, str):
            value_text, _, scale_text = cgpa.partition('/')
            value = cls._parse_number(value_text)
            scale = cls._parse_number(scale_text)
            if value is not None and scale is not None and scale > 0:
                value = value / scale * 4.0
            return value
        return cls._parse_number(cgpa)

    @classmethod
    def _education_score(cls, analysis):
        """Score the education level (max 20 points, including CGPA bonus)."""
        score = 0
        raw_level = analysis.get('education_level', '')
        if not cls._is_missing(raw_level):
            # Drop periods so "M.S." / "B.Tech" read as "ms" / "btech".
            level = str(raw_level).lower().replace('.', '')
            words = ''.join(ch if ch.isalnum() else ' ' for ch in level).split()

            def mentions(stems, prefixes=()):
                # Long stems are safe as substrings; short abbreviations must
                # start a word, otherwise "systems" would match "ms".
                return (any(stem in level for stem in stems)
                        or any(word.startswith(prefixes) for word in words))

            if words:
                if mentions(('phd', 'doctorate')):
                    score = 20
                elif mentions(('master', 'mtech'), ('ms',)):
                    score = 18
                elif mentions(('bachelor', 'btech'), ('bs',)):
                    score = 15
                else:
                    score = 10

        cgpa = cls._cgpa_on_four_point_scale(analysis.get('cgpa', 'Not found'))
        if cgpa is not None and cgpa >= 3.5:
            score = min(20, score + 2)
        return score

    @classmethod
    def _team_size(cls, analysis):
        """Return the managed team size as a non-negative number."""
        team_size = analysis.get('team_size', 0)
        if isinstance(team_size, str):
            return cls._first_int(team_size)
        number = cls._parse_number(team_size)
        return max(0, number) if number is not None else 0

    @classmethod
    def _role_skill_counts(cls, analysis):
        """Return ``(data_science_count, data_engineering_count)``."""
        return cls._count(analysis, cls._DS_FIELDS), cls._count(analysis, cls._DE_FIELDS)

    # ------------------------------------------------------------------
    # Scoring
    # ------------------------------------------------------------------
    def calculate_score(self, analysis):
        """Calculate a comprehensive score based on a resume analysis.

        Component caps: education 20, experience 20, technical 20,
        project 15, impact 15, role-specific 10 (total 100).

        Args:
            analysis: the analysis dict for one resume.

        Returns:
            dict with ``total_score`` and the six component scores.  On an
            unexpected error the error is printed and all scores are 0.
        """
        try:
            education_score = self._education_score(analysis)

            # Experience: 4 points per year, capped at 20 (5 years).
            years = self._parse_number(analysis.get('years_experience', 0))
            experience_score = min(20, max(0, years) * 4) if years is not None else 0

            # Technical: 2 points per listed skill across five categories.
            total_skills = self._count(
                analysis,
                ('programming_languages', 'technical_skills', 'ml_frameworks',
                 'databases', 'cloud_platforms'),
            )
            technical_score = min(20, total_skills * 2)

            # Projects, research, and publications.
            projects = self._count(analysis, ('projects',))
            research_exp = 1 if self._has_value(analysis.get('research_experience')) else 0
            publications = self._count(analysis, ('publications',))
            project_score = min(15, projects * 2 + research_exp * 3 + publications * 2)

            # Impact: leadership, team size, certifications, and awards.
            leadership = 1 if self._has_value(analysis.get('leadership_experience')) else 0
            team_size = self._team_size(analysis)
            certifications = self._count(analysis, ('certifications',))
            awards = self._count(analysis, ('awards',))
            impact_score = min(
                15,
                leadership * 5 + min(5, team_size / 2) + min(5, certifications * 2 + awards),
            )

            # Role-specific: the stronger of the DS and DE skill counts.
            ds_skills, de_skills = self._role_skill_counts(analysis)
            role_specific_score = min(10, max(ds_skills, de_skills))

            total_score = (education_score + experience_score + technical_score +
                           project_score + impact_score + role_specific_score)

            return {
                'total_score': total_score,
                'education_score': education_score,
                'experience_score': experience_score,
                'technical_score': technical_score,
                'project_score': project_score,
                'impact_score': impact_score,
                'role_specific_score': role_specific_score
            }
        except Exception as e:
            # Deliberate best-effort: scoring must never abort a ranking run.
            print(f"Error calculating score: {str(e)}")
            return {
                'total_score': 0,
                'education_score': 0,
                'experience_score': 0,
                'technical_score': 0,
                'project_score': 0,
                'impact_score': 0,
                'role_specific_score': 0
            }

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------
    @staticmethod
    def _empty_stats():
        """Return the statistics skeleton with every counter empty."""
        return {
            'total_resumes': 0,
            'avg_work_experience': 0,
            'education_levels': {},
            'major_distribution': {},
            'top_programming_languages': {},
            'top_technical_skills': {},
            'top_ml_frameworks': {},
            'top_visualization_tools': {},
            'top_databases': {},
            'top_etl_tools': {},
            'top_streaming_tech': {},
            'top_cloud_platforms': {},
            'top_certifications': {},
            'university_distribution': {}
        }

    @classmethod
    def _label(cls, value):
        """Return a hashable display label for ``value``, or ``None`` if missing."""
        if value is None:
            return None
        text = value.strip() if isinstance(value, str) else str(value)
        return text or None

    def get_statistics(self):
        """Get aggregate statistics over all stored analyses.

        Returns:
            dict with ``total_resumes``, ``avg_work_experience`` and one
            ``{label: count}`` dict per distribution, each limited to its
            ten most frequent entries.
        """
        analyses = self._load_analyses()
        stats = self._empty_stats()
        stats['total_resumes'] = len(analyses)

        total_exp = 0
        valid_exp = 0

        for analysis in analyses:
            # A missing key defaults to 0 years and still counts toward the mean.
            exp = self._parse_number(analysis.get('years_experience', 0))
            if exp is not None:
                total_exp += max(0, exp)
                valid_exp += 1

            for stat_key, field in (('education_levels', 'education_level'),
                                    ('major_distribution', 'major'),
                                    ('university_distribution', 'university')):
                label = self._label(analysis.get(field, 'Not specified')) or 'Not specified'
                stats[stat_key][label] = stats[stat_key].get(label, 0) + 1

            for stat_key, field in self._LIST_STAT_FIELDS:
                counter = stats[stat_key]
                for item in self._as_list(analysis.get(field)):
                    label = self._label(item)
                    if label:
                        counter[label] = counter.get(label, 0) + 1

        stats['avg_work_experience'] = total_exp / valid_exp if valid_exp > 0 else 0

        # Keep only the ten most frequent entries of every distribution.
        for key in stats:
            if isinstance(stats[key], dict):
                stats[key] = dict(sorted(stats[key].items(), key=lambda x: x[1], reverse=True)[:10])

        return stats

    def get_candidate_rankings(self, role_type='both', min_score=50):
        """Get candidates ranked by total score, highest first.

        Args:
            role_type: ``'data_science'`` or ``'data_engineering'`` keeps only
                candidates listing at least one skill for that role; any
                other value (default ``'both'``) keeps everyone.
            min_score: minimum ``total_score`` a candidate needs.

        Returns:
            list of dicts with name, email, years_experience,
            education_level, key_skills (up to five) and all score fields.
        """
        rankings = []
        for analysis in self._load_analyses():
            scores = self.calculate_score(analysis)
            if scores['total_score'] < min_score:
                continue

            ds_count, de_count = self._role_skill_counts(analysis)
            if role_type == 'data_science' and ds_count == 0:
                continue
            if role_type == 'data_engineering' and de_count == 0:
                continue

            key_skills = (self._as_list(analysis.get('programming_languages')) +
                          self._as_list(analysis.get('technical_skills')))[:5]
            rankings.append({
                'name': analysis.get('name', 'Not found'),
                'email': analysis.get('email', 'Not found'),
                'years_experience': analysis.get('years_experience', 'Not found'),
                'education_level': analysis.get('education_level', 'Not found'),
                'key_skills': key_skills,
                **scores
            })

        rankings.sort(key=lambda x: x['total_score'], reverse=True)
        return rankings

    def export_to_csv(self):
        """Export the whole ``resumes`` table to a timestamped CSV file.

        Returns:
            The path of the file written (relative to the working directory).
        """
        df = self._read_frame("SELECT * FROM resumes")
        csv_path = f"resume_analyses_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(csv_path, index=False)
        return csv_path

    def export_to_json(self):
        """Export the whole ``resumes`` table to a timestamped JSON file.

        Returns:
            The path of the file written (relative to the working directory).
        """
        df = self._read_frame("SELECT * FROM resumes")
        json_path = f"resume_analyses_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        df.to_json(json_path, orient='records')
        return json_path