# SMART_AI_RESUME / utils / resume_analyzer.py
# parthib07's picture
# Upload 531 files
# d7d3dff verified
import re
class ResumeAnalyzer:
    """Heuristic, keyword-driven analyzer for plain-text resumes.

    The class classifies a document, extracts contact details and the main
    resume sections, scores keyword coverage and formatting, and combines
    everything into an ATS-style report (see ``analyze_resume``).
    """

    # Weights and threshold used by ``detect_document_type``.
    _DENSITY_WEIGHT = 0.7
    _FREQUENCY_WEIGHT = 0.3
    _MIN_DOCUMENT_SCORE = 0.15

    # Keywords that open each section when found (as substrings) in a line.
    _EDUCATION_KEYWORDS = (
        'education', 'academic', 'qualification', 'degree', 'university', 'college',
        'school', 'institute', 'certification', 'diploma', 'bachelor', 'master',
        'phd', 'b.tech', 'm.tech', 'b.e', 'm.e', 'b.sc', 'm.sc', 'bca', 'mca', 'b.com',
        'm.com', 'b.cs-it', 'imca', 'bba', 'mba', 'honors', 'scholarship',
    )
    _EXPERIENCE_KEYWORDS = (
        'experience', 'employment', 'work history', 'professional experience',
        'work experience', 'career history', 'professional background',
        'employment history', 'job history', 'positions held',
        'job title', 'job responsibilities', 'job description', 'job summary',
    )
    _PROJECT_KEYWORDS = (
        'projects', 'personal projects', 'academic projects', 'key projects',
        'major projects', 'professional projects', 'project experience',
        'relevant projects', 'featured projects', 'latest projects',
        'top projects',
    )
    _SKILLS_KEYWORDS = (
        'skills', 'technical skills', 'competencies', 'expertise',
        'core competencies', 'professional skills', 'key skills',
        'technical expertise', 'proficiencies', 'qualifications',
        'top skills', 'key skill', 'major skill', 'personal skill',
        'soft skills', 'soft skill', 'soft skillset',
    )
    _SUMMARY_KEYWORDS = (
        'summary', 'professional summary', 'career summary', 'objective',
        'career objective', 'professional objective', 'about me', 'profile',
        'professional profile', 'career profile', 'overview', 'skill summary',
    )

    # Skill separators. A plain hyphen only separates when it is not inside a
    # word, so names such as "scikit-learn" stay intact.
    _SKILL_SPLIT_RE = re.compile(r'[,•|/\\·>–―]|(?<!\w)-|-(?!\w)')

    def __init__(self):
        # Document type indicators
        self.document_types = {
            'resume': [
                'experience', 'education', 'skills', 'work', 'project', 'objective',
                'summary', 'employment', 'qualification', 'achievements'
            ],
            'marksheet': [
                'grade', 'marks', 'score', 'semester', 'cgpa', 'sgpa', 'examination',
                'result', 'academic year', 'percentage'
            ],
            'certificate': [
                'certificate', 'certification', 'awarded', 'completed', 'achievement',
                'training', 'course completion', 'qualified'
            ],
            'id_card': [
                'id card', 'identity', 'student id', 'employee id', 'valid until',
                'date of issue', 'identification'
            ]
        }

    def detect_document_type(self, text):
        """Guess which kind of document ``text`` is.

        Each type in ``self.document_types`` is scored from the share of its
        keywords present in the text (weight 0.7) and the number of matched
        keywords relative to the word count (weight 0.3).

        Returns:
            The best scoring type name, or ``'unknown'`` when the best score
            does not exceed 0.15.
        """
        text = text.lower()
        word_count = len(text.split()) + 1  # +1 avoids division by zero
        scores = {}
        for doc_type, keywords in self.document_types.items():
            matches = sum(1 for keyword in keywords if keyword in text)
            density = matches / len(keywords)
            frequency = matches / word_count
            scores[doc_type] = (
                density * self._DENSITY_WEIGHT + frequency * self._FREQUENCY_WEIGHT
            )
        best_type, best_score = max(scores.items(), key=lambda item: item[1])
        # Only report a type when the evidence is significant.
        return best_type if best_score > self._MIN_DOCUMENT_SCORE else 'unknown'

    def calculate_keyword_match(self, resume_text, required_skills):
        """Report which required skills are mentioned in the resume text.

        Matching is case-insensitive and on whole tokens: a skill only counts
        when it is not embedded in a longer word, so "Java" is not found in
        "JavaScript" and "R" is not found in every word containing an "r".
        Runs of whitespace (including line breaks) are treated as one space so
        multi-word skills split across lines still match.

        Returns:
            dict with ``score`` (percentage of skills found, 0 when no skills
            are required), ``found_skills`` and ``missing_skills``.
        """
        haystack = re.sub(r'\s+', ' ', resume_text.lower())
        found_skills = []
        missing_skills = []
        for skill in required_skills:
            needle = re.sub(r'\s+', ' ', skill.strip().lower())
            # Lookarounds instead of \b so skills ending in symbols ("c++",
            # "c#") still match. An empty skill trivially matches, as before.
            pattern = r'(?<!\w)' + re.escape(needle) + r'(?!\w)'
            if not needle or re.search(pattern, haystack):
                found_skills.append(skill)
            else:
                missing_skills.append(skill)
        match_score = (len(found_skills) / len(required_skills)) * 100 if required_skills else 0
        return {
            'score': match_score,
            'found_skills': found_skills,
            'missing_skills': missing_skills
        }

    def check_resume_sections(self, text):
        """Score the presence of the four essential sections (0-100).

        Each section contributes up to 25 points, proportional to how many of
        its indicator keywords occur (as substrings) in the lower-cased text.
        """
        text = text.lower()
        essential_sections = {
            'contact': ['email', 'phone', 'address', 'linkedin'],
            'education': ['education', 'university', 'college', 'degree', 'academic'],
            'experience': ['experience', 'work', 'employment', 'job', 'internship'],
            'skills': ['skills', 'technologies', 'tools', 'proficiencies', 'expertise']
        }
        total = 0
        for keywords in essential_sections.values():
            found = sum(1 for keyword in keywords if keyword in text)
            total += min(25, (found / len(keywords)) * 25)
        return total

    def check_formatting(self, text):
        """Score basic formatting hygiene.

        Starts at 100 and deducts for: fewer than 300 characters (30), no
        all-uppercase line (20), no line starting with a bullet character
        (20), two consecutive blank lines (15) and no recognisable e-mail,
        phone number or LinkedIn URL (15).

        Returns:
            ``(score, deductions)`` where ``score`` is clamped at 0 and
            ``deductions`` lists a message per failed check.
        """
        lines = text.split('\n')
        score = 100
        deductions = []

        # Check for minimum content
        if len(text) < 300:
            score -= 30
            deductions.append("Resume is too short")

        # Check for section headers (an all-caps line is taken as a header)
        if not any(line.isupper() for line in lines):
            score -= 20
            deductions.append("No clear section headers found")

        # Check for bullet points
        if not any(line.strip().startswith(('•', '-', '*', '→')) for line in lines):
            score -= 20
            deductions.append("No bullet points found for listing details")

        # Check for consistent spacing: two blank lines in a row are flagged
        if any(not line.strip() and not next_line.strip()
               for line, next_line in zip(lines, lines[1:])):
            score -= 15
            deductions.append("Inconsistent spacing between sections")

        # Check for contact information format
        contact_patterns = [
            r'\b[\w\.-]+@[\w\.-]+\.\w+\b',  # email
            r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',  # phone
            r'linkedin\.com/\w+',  # LinkedIn
        ]
        if not any(re.search(pattern, text) for pattern in contact_patterns):
            score -= 15
            deductions.append("Missing or improperly formatted contact information")

        return max(0, score), deductions

    def extract_text_from_pdf(self, file):
        """Extract the text of every page of a PDF.

        Args:
            file: Either a file-like object exposing ``read()`` or the raw
                PDF bytes.

        Returns:
            The page texts, each followed by a newline, concatenated.

        Raises:
            Exception: Wrapping (and chained to) any failure while importing
                PyPDF2, reading the input or parsing the PDF.
        """
        try:
            import io
            import PyPDF2

            if hasattr(file, 'read'):
                file_content = file.read()
                # Best-effort rewind so the caller can reuse the stream; a
                # non-seekable stream must not discard the bytes already read.
                try:
                    file.seek(0)
                except (AttributeError, OSError, ValueError):
                    pass
            else:
                # Already bytes
                file_content = file

            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
            # Defensive: treat a page that yields no text (None) as empty
            # instead of failing on None + str.
            return ''.join(
                (page.extract_text() or '') + "\n" for page in pdf_reader.pages
            )
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}") from e

    def extract_text_from_docx(self, docx_file):
        """Extract text from a DOCX file.

        Returns the paragraph texts joined by newlines.

        Raises:
            Exception: Wrapping (and chained to) any failure while importing
                python-docx or reading the document.
        """
        try:
            from docx import Document

            doc = Document(docx_file)
            return '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        except Exception as e:
            raise Exception(f"Error extracting text from DOCX file: {str(e)}") from e

    def extract_personal_info(self, text):
        """Extract personal information from resume text.

        Returns:
            dict with ``name`` (first non-empty line, or ``'Unknown'``),
            ``email``, ``phone``, ``linkedin``, ``github`` (first regex match
            each, or ``''``) and ``portfolio`` (always ``''``).
        """
        # Basic patterns for personal info
        email_pattern = r'[\w\.-]+@[\w\.-]+\.\w+'
        phone_pattern = r'(\+\d{1,3}[-.]?)?\s*\(?\d{3}\)?[-.]?\s*\d{3}[-.]?\s*\d{4}'
        linkedin_pattern = r'linkedin\.com/in/[\w-]+'
        github_pattern = r'github\.com/[\w-]+'

        email = re.search(email_pattern, text)
        phone = re.search(phone_pattern, text)
        linkedin = re.search(linkedin_pattern, text)
        github = re.search(github_pattern, text)

        # Basic assumption: the first non-empty line is the candidate's name.
        name = next(
            (line.strip() for line in text.split('\n') if line.strip()), ''
        )

        return {
            'name': name if name else 'Unknown',
            'email': email.group(0) if email else '',
            # The pattern's leading \s* can capture whitespace in front of the
            # number, so trim the match.
            'phone': phone.group(0).strip() if phone else '',
            'linkedin': linkedin.group(0) if linkedin else '',
            'github': github.group(0) if github else '',
            'portfolio': ''  # Can be enhanced later
        }

    def _collect_section_entries(self, text, section_keywords):
        """Collect the lines belonging to one resume section.

        A line containing any of ``section_keywords`` opens (or continues)
        the section; it is kept as content unless it is exactly a keyword
        (a pure header). While inside the section, a blank line closes the
        current entry, and a non-empty line containing one of the generic
        resume keywords (``self.document_types['resume']``) ends the section.

        NOTE: keyword checks are substring checks, so a content line that
        merely mentions e.g. "work" also ends the section.

        Returns:
            A list of entries, each a list of stripped lines.
        """
        keywords = [keyword.lower() for keyword in section_keywords]
        boundary_keywords = self.document_types['resume']
        entries = []
        current = []
        in_section = False

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            lowered = line.lower()

            if any(keyword in lowered for keyword in keywords):
                if lowered not in keywords:
                    # Header line that also carries content.
                    current.append(line)
                in_section = True
                continue

            if not in_section:
                continue

            if line and any(keyword in lowered for keyword in boundary_keywords):
                # Reached another section.
                in_section = False
                if current:
                    entries.append(current)
                    current = []
                continue

            if line:
                current.append(line)
            elif current:  # Blank line closes the entry
                entries.append(current)
                current = []

        if current:
            entries.append(current)
        return entries

    def _extract_joined_section(self, text, section_keywords):
        """Return each entry of a section as one space-joined string."""
        return [
            ' '.join(entry)
            for entry in self._collect_section_entries(text, section_keywords)
        ]

    def extract_education(self, text):
        """Extract education information from resume text (list of entries)."""
        return self._extract_joined_section(text, self._EDUCATION_KEYWORDS)

    def extract_experience(self, text):
        """Extract work experience information from resume text (list of entries)."""
        return self._extract_joined_section(text, self._EXPERIENCE_KEYWORDS)

    def extract_projects(self, text):
        """Extract project information from resume text (list of entries)."""
        return self._extract_joined_section(text, self._PROJECT_KEYWORDS)

    def extract_skills(self, text):
        """Extract skills from resume text.

        Every line of the skills section is split on the common separators
        in a single pass, so mixed separators do not produce overlapping
        fragments and a one-skill-per-line layout is kept. A leading
        "<skills keyword ...>:" label is removed from the line.

        Returns:
            List of unique skills in order of first appearance.
        """
        skills = {}  # dict used as an ordered set
        for entry in self._collect_section_entries(text, self._SKILLS_KEYWORDS):
            for line in entry:
                label, colon, remainder = line.partition(':')
                if colon and any(
                    keyword in label.lower() for keyword in self._SKILLS_KEYWORDS
                ):
                    line = remainder  # drop e.g. "Technical Skills:"
                for piece in self._SKILL_SPLIT_RE.split(line):
                    piece = piece.strip()
                    if piece:
                        skills.setdefault(piece, None)
        return list(skills)

    def extract_summary(self, text):
        """Extract summary/objective from resume text.

        Two sources are combined: the first five non-empty lines, when they
        hold more than ten words, do not start with a summary header and
        mention no contact words; plus every explicitly marked summary
        section.

        NOTE: the two sources can overlap when an explicit summary sits in
        the first five lines.

        Returns:
            The collected text joined by spaces, or ``''``.
        """
        summary = []

        # Check first few non-empty lines for a potential summary.
        first_lines = []
        for line in text.split('\n'):
            stripped = line.strip()
            if stripped:
                first_lines.append(stripped)
                if len(first_lines) >= 5:
                    break

        if first_lines and not any(
            keyword in first_lines[0].lower() for keyword in self._SUMMARY_KEYWORDS
        ):
            potential_summary = ' '.join(first_lines)
            has_contact_words = re.search(
                r'\b(?:email|phone|address|tel|mobile|linkedin)\b',
                potential_summary.lower(),
            )
            if len(potential_summary.split()) > 10 and not has_contact_words:
                summary.append(potential_summary)

        # Explicitly marked summary section(s).
        summary.extend(self._extract_joined_section(text, self._SUMMARY_KEYWORDS))

        return ' '.join(summary) if summary else ''

    def analyze_resume(self, resume_data, job_requirements):
        """Analyze resume and return scores and recommendations.

        Args:
            resume_data: Mapping whose ``'raw_text'`` entry holds the resume
                text.
            job_requirements: Mapping that may hold ``'required_skills'``
                (list of str) and ``'require_gpa'`` (bool); ``None`` is
                treated as no requirements.

        Returns:
            dict. For a resume: the personal info fields, ``ats_score``,
            ``keyword_match``, ``section_score``, ``format_score``, the
            extracted sections, per-section suggestion lists, a combined
            ``suggestions`` list and ``section_scores``. For another document
            type, or on failure, a reduced dict with zeroed scores (plus
            ``error`` on failure). Never raises: errors are printed and
            reported in the returned dict.
        """
        try:
            text = resume_data.get('raw_text', '') or ''
            job_requirements = job_requirements or {}

            # Extract personal information
            personal_info = self.extract_personal_info(text)

            # First detect document type
            doc_type = self.detect_document_type(text)
            if doc_type != 'resume':
                return {
                    'ats_score': 0,
                    'document_type': doc_type,
                    'keyword_match': {'score': 0, 'found_skills': [], 'missing_skills': []},
                    'section_score': 0,
                    'format_score': 0,
                    'suggestions': [f"This appears to be a {doc_type} document. Please upload a resume for ATS analysis."]
                }

            # Calculate keyword match
            required_skills = job_requirements.get('required_skills', [])
            keyword_match = self.calculate_keyword_match(text, required_skills)

            # Extract all resume sections
            education = self.extract_education(text)
            experience = self.extract_experience(text)
            projects = self.extract_projects(text)
            skills = list(self.extract_skills(text))
            summary = self.extract_summary(text)

            # Check resume sections and formatting
            section_score = self.check_resume_sections(text)
            format_score, format_deductions = self.check_formatting(text)

            # Generate section-specific suggestions
            contact_suggestions = []
            if not personal_info.get('email'):
                contact_suggestions.append("Add your email address")
            if not personal_info.get('phone'):
                contact_suggestions.append("Add your phone number")
            if not personal_info.get('linkedin'):
                contact_suggestions.append("Add your LinkedIn profile URL")

            summary_suggestions = []
            summary_words = len(summary.split())
            if not summary:
                summary_suggestions.append("Add a professional summary to highlight your key qualifications")
            elif summary_words < 30:
                summary_suggestions.append("Expand your professional summary to better highlight your experience and goals")
            elif summary_words > 100:
                summary_suggestions.append("Consider making your summary more concise (aim for 50-75 words)")

            skills_suggestions = []
            if not skills:
                skills_suggestions.append("Add a dedicated skills section")
            if len(skills) < 5:
                skills_suggestions.append("List more relevant technical and soft skills")
            if keyword_match['score'] < 70:
                skills_suggestions.append("Add more skills that match the job requirements")

            year_pattern = r'\b(19|20)\d{2}\b'

            experience_suggestions = []
            if not experience:
                experience_suggestions.append("Add your work experience section")
            else:
                has_dates = any(re.search(year_pattern, exp) for exp in experience)
                has_bullets = any(re.search(r'[•\-\*]', exp) for exp in experience)
                has_action_verbs = any(
                    re.search(r'\b(developed|managed|created|implemented|designed|led|improved)\b',
                              exp.lower())
                    for exp in experience
                )
                if not has_dates:
                    experience_suggestions.append("Include dates for each work experience")
                if not has_bullets:
                    experience_suggestions.append("Use bullet points to list your achievements and responsibilities")
                if not has_action_verbs:
                    experience_suggestions.append("Start bullet points with strong action verbs")

            education_suggestions = []
            if not education:
                education_suggestions.append("Add your educational background")
            else:
                has_dates = any(re.search(year_pattern, edu) for edu in education)
                # Plural forms ("Bachelors", "Masters") count as a degree too.
                has_degree = any(
                    re.search(r'\b(?:bachelors?|masters?|phd|b\.|m\.|diploma)\b', edu.lower())
                    for edu in education
                )
                has_gpa = any(
                    re.search(r'\b(gpa|cgpa|grade|percentage)\b', edu.lower())
                    for edu in education
                )
                if not has_dates:
                    education_suggestions.append("Include graduation dates")
                if not has_degree:
                    education_suggestions.append("Specify your degree type")
                if not has_gpa and job_requirements.get('require_gpa', False):
                    education_suggestions.append("Include your GPA if it's above 3.0")

            format_suggestions = []
            if format_score < 100:
                format_suggestions.extend(format_deductions)

            # Calculate section-specific scores
            contact_score = 100 - (len(contact_suggestions) * 25)  # -25 for each missing item
            summary_score = 100 - (len(summary_suggestions) * 33)  # -33 for each issue
            skills_score = keyword_match['score']
            # A missing section scores 0: it must never outrank a section
            # that exists but has several issues (-25 per issue).
            experience_score = (
                100 - (len(experience_suggestions) * 25) if experience else 0
            )
            education_score = (
                100 - (len(education_suggestions) * 25) if education else 0
            )

            # Calculate overall ATS score with weighted components
            ats_score = (
                int(round(contact_score * 0.1)) +     # 10% weight for contact info
                int(round(summary_score * 0.1)) +     # 10% weight for summary
                int(round(skills_score * 0.3)) +      # 30% weight for skills match
                int(round(experience_score * 0.2)) +  # 20% weight for experience
                int(round(education_score * 0.1)) +   # 10% weight for education
                int(round(format_score * 0.2))        # 20% weight for formatting
            )

            # Combine all suggestions into a single list
            suggestions = (
                contact_suggestions
                + summary_suggestions
                + skills_suggestions
                + experience_suggestions
                + education_suggestions
                + format_suggestions
            )
            if not suggestions:
                suggestions.append("Your resume is well-optimized for ATS systems")

            # Return final structured result
            return {
                **personal_info,  # Include extracted personal info
                'ats_score': ats_score,
                'document_type': 'resume',
                'keyword_match': keyword_match,
                'section_score': section_score,
                'format_score': format_score,
                'education': education,
                'experience': experience,
                'projects': projects,
                'skills': skills,
                'summary': summary,
                'suggestions': suggestions,
                'contact_suggestions': contact_suggestions,
                'summary_suggestions': summary_suggestions,
                'skills_suggestions': skills_suggestions,
                'experience_suggestions': experience_suggestions,
                'education_suggestions': education_suggestions,
                'format_suggestions': format_suggestions,
                'section_scores': {
                    'contact': contact_score,
                    'summary': summary_score,
                    'skills': skills_score,
                    'experience': experience_score,
                    'education': education_score,
                    'format': format_score
                }
            }
        except Exception as e:
            import traceback
            print(f"Error analyzing resume: {str(e)}")
            print(traceback.format_exc())
            # Return a default error response
            return {
                'error': f"Resume analysis failed: {str(e)}",
                'ats_score': 0,
                'document_type': 'unknown',
                'keyword_match': {'score': 0, 'found_skills': [], 'missing_skills': []},
                'section_score': 0,
                'format_score': 0,
                'suggestions': [f"Error analyzing resume: {str(e)}. Please check your file and try again."]
            }