Spaces:

parthib07
/

SMART_AI_RESUME

Sleeping

App Files Files Community

SMART_AI_RESUME / utils /resume_analyzer.py

parthib07

Upload 531 files

d7d3dff verified 2 months ago

raw

history blame contribute delete

28.7 kB

	import re

	class ResumeAnalyzer:
	    """Heuristic, keyword-driven resume parser and ATS-style scorer.

	    All detection is plain substring / regular-expression matching on the
	    raw resume text; no external NLP model is involved. The main entry point
	    is :meth:`analyze_resume`; the ``extract_*`` and ``check_*`` methods can
	    also be called individually.
	    """

	    # Any one of these characters separates two skills inside a skills entry.
	    _SKILL_SEPARATOR_RE = re.compile(r'[,•|/\\·>\-–―]')
	    # Four-digit year starting with 19 or 20.
	    _YEAR_RE = re.compile(r'\b(19|20)\d{2}\b')
	    _BULLET_RE = re.compile(r'[•\-\*]')
	    _ACTION_VERB_RE = re.compile(
	        r'\b(developed|managed|created|implemented|designed|led|improved)\b'
	    )
	    _DEGREE_RE = re.compile(r'\b(bachelor|master|phd|b\.|m\.|diploma)\b')
	    _GPA_RE = re.compile(r'\b(gpa|cgpa|grade|percentage)\b')
	    # Words that mark a block of text as contact details rather than a summary.
	    _CONTACT_WORD_RE = re.compile(
	        r'\b(?:email|phone|address|tel|mobile|linkedin)\b'
	    )

	    _EDUCATION_KEYWORDS = (
	        'education', 'academic', 'qualification', 'degree', 'university',
	        'college', 'school', 'institute', 'certification', 'diploma',
	        'bachelor', 'master', 'phd', 'b.tech', 'm.tech', 'b.e', 'm.e',
	        'b.sc', 'm.sc', 'bca', 'mca', 'b.com', 'm.com', 'b.cs-it', 'imca',
	        'bba', 'mba', 'honors', 'scholarship',
	    )
	    _EXPERIENCE_KEYWORDS = (
	        'experience', 'employment', 'work history', 'professional experience',
	        'work experience', 'career history', 'professional background',
	        'employment history', 'job history', 'positions held',
	        'job title', 'job responsibilities', 'job description', 'job summary',
	    )
	    _PROJECT_KEYWORDS = (
	        'projects', 'personal projects', 'academic projects', 'key projects',
	        'major projects', 'professional projects', 'project experience',
	        'relevant projects', 'featured projects', 'latest projects',
	        'top projects',
	    )
	    _SKILLS_KEYWORDS = (
	        'skills', 'technical skills', 'competencies', 'expertise',
	        'core competencies', 'professional skills', 'key skills',
	        'technical expertise', 'proficiencies', 'qualifications',
	        'top skills', 'key skill', 'major skill', 'personal skill',
	        'soft skills', 'soft skill', 'soft skillset',
	    )
	    _SUMMARY_KEYWORDS = (
	        'summary', 'professional summary', 'career summary', 'objective',
	        'career objective', 'professional objective', 'about me', 'profile',
	        'professional profile', 'career profile', 'overview', 'skill summary',
	    )

	    def __init__(self):
	        """Set up the keyword lists used to classify a document."""
	        # Document type indicators. The 'resume' list is also used by the
	        # section extractors to recognise where the next section starts.
	        self.document_types = {
	            'resume': [
	                'experience', 'education', 'skills', 'work', 'project',
	                'objective', 'summary', 'employment', 'qualification',
	                'achievements',
	            ],
	            'marksheet': [
	                'grade', 'marks', 'score', 'semester', 'cgpa', 'sgpa',
	                'examination', 'result', 'academic year', 'percentage',
	            ],
	            'certificate': [
	                'certificate', 'certification', 'awarded', 'completed',
	                'achievement', 'training', 'course completion', 'qualified',
	            ],
	            'id_card': [
	                'id card', 'identity', 'student id', 'employee id',
	                'valid until', 'date of issue', 'identification',
	            ],
	        }

	    def detect_document_type(self, text):
	        """Guess which kind of document ``text`` is.

	        Each document type is scored as ``0.7 * density + 0.3 * frequency``
	        where density is the share of the type's keywords present in the
	        text and frequency is the number of present keywords per word.

	        Returns:
	            One of the keys of ``self.document_types``, or ``'unknown'``
	            when the best score does not exceed 0.15.
	        """
	        lowered = text.lower()
	        # +1 keeps the frequency term defined for empty text.
	        word_count = len(lowered.split()) + 1

	        scores = {}
	        for doc_type, keywords in self.document_types.items():
	            matches = sum(1 for keyword in keywords if keyword in lowered)
	            density = matches / len(keywords)
	            frequency = matches / word_count
	            scores[doc_type] = (density * 0.7) + (frequency * 0.3)

	        best_type, best_score = max(scores.items(), key=lambda item: item[1])
	        # Only report a type when the evidence is significant.
	        return best_type if best_score > 0.15 else 'unknown'

	    def calculate_keyword_match(self, resume_text, required_skills):
	        """Report which required skills occur in the resume.

	        Matching is a case-insensitive substring test, so a skill also
	        matches inside a longer phrase ("Python" in "Python programming").

	        Returns:
	            dict with ``score`` (percentage of required skills found, 0 when
	            ``required_skills`` is empty), ``found_skills`` and
	            ``missing_skills`` (both in the order given by the caller).
	        """
	        haystack = resume_text.lower()
	        found_skills = []
	        missing_skills = []

	        for skill in required_skills:
	            if skill.lower() in haystack:
	                found_skills.append(skill)
	            else:
	                missing_skills.append(skill)

	        if required_skills:
	            match_score = (len(found_skills) / len(required_skills)) * 100
	        else:
	            match_score = 0

	        return {
	            'score': match_score,
	            'found_skills': found_skills,
	            'missing_skills': missing_skills,
	        }

	    def check_resume_sections(self, text):
	        """Score the presence of the four essential sections (0-100).

	        Each of contact, education, experience and skills contributes up to
	        25 points, proportional to how many of its keywords occur in the
	        text.
	        """
	        lowered = text.lower()
	        essential_sections = {
	            'contact': ['email', 'phone', 'address', 'linkedin'],
	            'education': ['education', 'university', 'college', 'degree', 'academic'],
	            'experience': ['experience', 'work', 'employment', 'job', 'internship'],
	            'skills': ['skills', 'technologies', 'tools', 'proficiencies', 'expertise'],
	        }

	        total = 0
	        for keywords in essential_sections.values():
	            found = sum(1 for keyword in keywords if keyword in lowered)
	            total += min(25, (found / len(keywords)) * 25)
	        return total

	    def check_formatting(self, text):
	        """Score basic formatting hygiene.

	        Starts from 100 and subtracts a fixed penalty for each failed check.

	        Returns:
	            ``(score, deductions)`` where ``score`` is clamped at 0 and
	            ``deductions`` lists one human-readable message per failed check.
	        """
	        lines = text.split('\n')
	        score = 100
	        deductions = []

	        # Check for minimum content
	        if len(text) < 300:
	            score -= 30
	            deductions.append("Resume is too short")

	        # Section headers are assumed to be written in upper case.
	        if not any(line.isupper() for line in lines):
	            score -= 20
	            deductions.append("No clear section headers found")

	        # Check for bullet points
	        if not any(line.strip().startswith(('•', '-', '*', '→')) for line in lines):
	            score -= 20
	            deductions.append("No bullet points found for listing details")

	        # Two consecutive blank lines are treated as inconsistent spacing.
	        if any(not line.strip() and not next_line.strip()
	               for line, next_line in zip(lines[:-1], lines[1:])):
	            score -= 15
	            deductions.append("Inconsistent spacing between sections")

	        # Check for contact information format
	        contact_patterns = [
	            r'\b[\w\.-]+@[\w\.-]+\.\w+\b',      # email
	            r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',   # phone
	            r'linkedin\.com/\w+',               # LinkedIn
	        ]
	        if not any(re.search(pattern, text) for pattern in contact_patterns):
	            score -= 15
	            deductions.append("Missing or improperly formatted contact information")

	        return max(0, score), deductions

	    def extract_text_from_pdf(self, file):
	        """Extract the text of every page of a PDF.

	        Args:
	            file: raw PDF bytes, or an object with ``read()`` and ``seek()``;
	                such an object is rewound to position 0 after reading.

	        Returns:
	            The page texts, each followed by a newline.

	        Raises:
	            Exception: wrapping any failure, with the original error chained.
	        """
	        try:
	            import PyPDF2
	            import io

	            if hasattr(file, 'read'):
	                file_content = file.read()
	                file.seek(0)  # Reset file pointer for later readers
	            else:
	                file_content = file

	            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))

	            # Guard against a page yielding None instead of a string, which
	            # would otherwise abort the whole extraction with a TypeError.
	            return ''.join(
	                (page.extract_text() or '') + "\n" for page in pdf_reader.pages
	            )
	        except Exception as e:
	            raise Exception(f"Error extracting text from PDF: {str(e)}") from e

	    def extract_text_from_docx(self, docx_file):
	        """Extract text from a DOCX file, one line per paragraph.

	        Raises:
	            Exception: wrapping any failure, with the original error chained.
	        """
	        try:
	            from docx import Document
	            doc = Document(docx_file)
	            return '\n'.join(paragraph.text for paragraph in doc.paragraphs)
	        except Exception as e:
	            raise Exception(f"Error extracting text from DOCX file: {str(e)}") from e

	    def extract_personal_info(self, text):
	        """Extract contact details from resume text.

	        Returns:
	            dict with ``name`` (the first line of the text, or ``'Unknown'``
	            when that line is blank), ``email``, ``phone``, ``linkedin``,
	            ``github`` (first match of each, or ``''``) and ``portfolio``
	            (always ``''``).
	        """
	        email_pattern = r'[\w\.-]+@[\w\.-]+\.\w+'
	        # Optional country code, optional parentheses around the area code and
	        # optional '-', '.' or space between groups. The look-arounds keep the
	        # match from starting or ending inside a longer run of digits.
	        phone_pattern = (
	            r'(?<!\d)(?:\+\d{1,3}[-. ]?)?\(?\d{3}\)?[-. ]?\d{3}[-. ]?\d{4}(?!\d)'
	        )
	        linkedin_pattern = r'linkedin\.com/in/[\w-]+'
	        github_pattern = r'github\.com/[\w-]+'

	        def first_match(pattern):
	            found = re.search(pattern, text)
	            return found.group(0) if found else ''

	        # Get the first line as name (basic assumption)
	        name = text.split('\n')[0].strip()

	        return {
	            'name': name if name else 'Unknown',
	            'email': first_match(email_pattern),
	            'phone': first_match(phone_pattern),
	            'linkedin': first_match(linkedin_pattern),
	            'github': first_match(github_pattern),
	            'portfolio': '',  # Can be enhanced later
	        }

	    def _collect_section_entries(self, text, section_keywords):
	        """Collect the blank-line-separated entries of one resume section.

	        A line containing any of ``section_keywords`` opens (or continues) the
	        section; the line itself is kept as content unless it is exactly a
	        keyword, i.e. a bare header. Inside the section, a non-empty line that
	        contains one of ``self.document_types['resume']`` keywords closes it.
	        Lines of one entry are joined with single spaces.
	        """
	        other_section_markers = self.document_types['resume']
	        entries = []
	        current = []
	        in_section = False

	        def flush():
	            if current:
	                entries.append(' '.join(current))
	                current.clear()

	        for raw_line in text.split('\n'):
	            line = raw_line.strip()
	            lowered = line.lower()

	            if any(keyword in lowered for keyword in section_keywords):
	                if lowered not in section_keywords:
	                    # The line carries content, not just a header.
	                    current.append(line)
	                in_section = True
	                continue

	            if not in_section:
	                continue

	            # A different section has started.
	            if line and any(marker in lowered for marker in other_section_markers):
	                in_section = False
	                flush()
	                continue

	            if line:
	                current.append(line)
	            else:
	                flush()  # A blank line ends the current entry.

	        flush()
	        return entries

	    def extract_education(self, text):
	        """Extract education entries from resume text as a list of strings."""
	        return self._collect_section_entries(text, self._EDUCATION_KEYWORDS)

	    def extract_experience(self, text):
	        """Extract work experience entries from resume text as a list of strings."""
	        return self._collect_section_entries(text, self._EXPERIENCE_KEYWORDS)

	    def extract_projects(self, text):
	        """Extract project entries from resume text as a list of strings."""
	        return self._collect_section_entries(text, self._PROJECT_KEYWORDS)

	    def extract_skills(self, text):
	        """Extract individual skills from the skills section(s).

	        Each section entry is split on the separator characters
	        ``, • | / \\ · > - – ―``. An entry that contains none of them is
	        ignored.

	        Returns:
	            Sorted list of unique, stripped skill strings.
	        """
	        skills = set()  # Use set to avoid duplicates
	        for entry in self._collect_section_entries(text, self._SKILLS_KEYWORDS):
	            pieces = self._SKILL_SEPARATOR_RE.split(entry)
	            if len(pieces) > 1:  # at least one separator was present
	                skills.update(piece.strip() for piece in pieces if piece.strip())
	        # Sorted so the result does not depend on set iteration order.
	        return sorted(skills)

	    def extract_summary(self, text):
	        """Extract the summary/objective from resume text.

	        Two sources are combined, in this order:

	        1. The first five non-empty lines, when the first of them holds no
	           summary keyword, together they exceed ten words, and they mention
	           no contact word (email, phone, address, tel, mobile, linkedin).
	        2. Every explicitly marked summary/objective section.

	        Returns:
	            The collected text joined by spaces, or ``''`` if nothing matched.
	        """
	        summary = []

	        # Try to find an unlabelled summary at the beginning of the resume.
	        first_lines = [line.strip() for line in text.split('\n') if line.strip()][:5]
	        if first_lines and not any(
	            keyword in first_lines[0].lower() for keyword in self._SUMMARY_KEYWORDS
	        ):
	            potential_summary = ' '.join(first_lines)
	            if (len(potential_summary.split()) > 10
	                    and not self._CONTACT_WORD_RE.search(potential_summary.lower())):
	                summary.append(potential_summary)

	        # Look for explicitly marked summary sections.
	        summary.extend(self._collect_section_entries(text, self._SUMMARY_KEYWORDS))

	        return ' '.join(summary) if summary else ''

	    def analyze_resume(self, resume_data, job_requirements):
	        """Analyze a resume and return scores and recommendations.

	        Args:
	            resume_data: mapping whose ``'raw_text'`` value is the resume text.
	            job_requirements: mapping with optional ``'required_skills'``
	                (list of str) and ``'require_gpa'`` (bool).

	        Returns:
	            dict. For a resume it holds the extracted personal info and
	            sections, ``ats_score``, ``keyword_match``, ``section_score``,
	            ``format_score``, per-section suggestion lists, the combined
	            ``suggestions`` list and ``section_scores``. When the text is not
	            recognised as a resume, or an error occurs, a reduced dict with
	            zeroed scores and a single explanatory suggestion is returned
	            (plus an ``'error'`` key in the error case). This method does not
	            raise.
	        """
	        try:
	            text = resume_data.get('raw_text') or ''

	            # Extract personal information
	            personal_info = self.extract_personal_info(text)

	            # First detect document type
	            doc_type = self.detect_document_type(text)
	            if doc_type != 'resume':
	                return {
	                    'ats_score': 0,
	                    'document_type': doc_type,
	                    'keyword_match': {'score': 0, 'found_skills': [], 'missing_skills': []},
	                    'section_score': 0,
	                    'format_score': 0,
	                    'suggestions': [f"This appears to be a {doc_type} document. Please upload a resume for ATS analysis."]
	                }

	            # Calculate keyword match
	            required_skills = job_requirements.get('required_skills', [])
	            keyword_match = self.calculate_keyword_match(text, required_skills)

	            # Extract all resume sections
	            education = self.extract_education(text)
	            experience = self.extract_experience(text)
	            projects = self.extract_projects(text)
	            skills = self.extract_skills(text)
	            summary = self.extract_summary(text)

	            # Check resume sections
	            section_score = self.check_resume_sections(text)

	            # Check formatting
	            format_score, format_deductions = self.check_formatting(text)

	            # Generate section-specific suggestions
	            contact_suggestions = []
	            if not personal_info.get('email'):
	                contact_suggestions.append("Add your email address")
	            if not personal_info.get('phone'):
	                contact_suggestions.append("Add your phone number")
	            if not personal_info.get('linkedin'):
	                contact_suggestions.append("Add your LinkedIn profile URL")

	            summary_suggestions = []
	            summary_words = len(summary.split())
	            if not summary:
	                summary_suggestions.append("Add a professional summary to highlight your key qualifications")
	            elif summary_words < 30:
	                summary_suggestions.append("Expand your professional summary to better highlight your experience and goals")
	            elif summary_words > 100:
	                summary_suggestions.append("Consider making your summary more concise (aim for 50-75 words)")

	            skills_suggestions = []
	            if not skills:
	                skills_suggestions.append("Add a dedicated skills section")
	            if len(skills) < 5:
	                skills_suggestions.append("List more relevant technical and soft skills")
	            if keyword_match['score'] < 70:
	                skills_suggestions.append("Add more skills that match the job requirements")

	            experience_suggestions = []
	            if not experience:
	                experience_suggestions.append("Add your work experience section")
	            else:
	                has_dates = any(self._YEAR_RE.search(exp) for exp in experience)
	                has_bullets = any(self._BULLET_RE.search(exp) for exp in experience)
	                has_action_verbs = any(
	                    self._ACTION_VERB_RE.search(exp.lower()) for exp in experience
	                )

	                if not has_dates:
	                    experience_suggestions.append("Include dates for each work experience")
	                if not has_bullets:
	                    experience_suggestions.append("Use bullet points to list your achievements and responsibilities")
	                if not has_action_verbs:
	                    experience_suggestions.append("Start bullet points with strong action verbs")

	            education_suggestions = []
	            if not education:
	                education_suggestions.append("Add your educational background")
	            else:
	                has_dates = any(self._YEAR_RE.search(edu) for edu in education)
	                has_degree = any(self._DEGREE_RE.search(edu.lower()) for edu in education)
	                has_gpa = any(self._GPA_RE.search(edu.lower()) for edu in education)

	                if not has_dates:
	                    education_suggestions.append("Include graduation dates")
	                if not has_degree:
	                    education_suggestions.append("Specify your degree type")
	                if not has_gpa and job_requirements.get('require_gpa', False):
	                    education_suggestions.append("Include your GPA if it's above 3.0")

	            format_suggestions = []
	            if format_score < 100:
	                format_suggestions.extend(format_deductions)

	            # Calculate section-specific scores
	            contact_score = 100 - (len(contact_suggestions) * 25)   # -25 for each missing item
	            summary_score = 100 - (len(summary_suggestions) * 33)   # -33 for each issue
	            skills_score = keyword_match['score']
	            experience_score = 100 - (len(experience_suggestions) * 25)
	            education_score = 100 - (len(education_suggestions) * 25)

	            # Calculate overall ATS score with weighted components
	            ats_score = (
	                int(round(contact_score * 0.1)) +      # 10% weight for contact info
	                int(round(summary_score * 0.1)) +      # 10% weight for summary
	                int(round(skills_score * 0.3)) +       # 30% weight for skills match
	                int(round(experience_score * 0.2)) +   # 20% weight for experience
	                int(round(education_score * 0.1)) +    # 10% weight for education
	                int(round(format_score * 0.2))         # 20% weight for formatting
	            )

	            # Combine all suggestions into a single list
	            suggestions = [
	                *contact_suggestions,
	                *summary_suggestions,
	                *skills_suggestions,
	                *experience_suggestions,
	                *education_suggestions,
	                *format_suggestions,
	            ]
	            if not suggestions:
	                suggestions.append("Your resume is well-optimized for ATS systems")

	            # Return final structured result
	            return {
	                **personal_info,  # Include extracted personal info
	                'ats_score': ats_score,
	                'document_type': 'resume',
	                'keyword_match': keyword_match,
	                'section_score': section_score,
	                'format_score': format_score,
	                'education': education,
	                'experience': experience,
	                'projects': projects,
	                'skills': skills,
	                'summary': summary,
	                'suggestions': suggestions,
	                'contact_suggestions': contact_suggestions,
	                'summary_suggestions': summary_suggestions,
	                'skills_suggestions': skills_suggestions,
	                'experience_suggestions': experience_suggestions,
	                'education_suggestions': education_suggestions,
	                'format_suggestions': format_suggestions,
	                'section_scores': {
	                    'contact': contact_score,
	                    'summary': summary_score,
	                    'skills': skills_score,
	                    'experience': experience_score,
	                    'education': education_score,
	                    'format': format_score
	                }
	            }
	        except Exception as e:
	            # Deliberate best-effort boundary: report the failure in the
	            # result instead of propagating it to the caller.
	            import traceback
	            print(f"Error analyzing resume: {str(e)}")
	            print(traceback.format_exc())
	            # Return a default error response
	            return {
	                'error': f"Resume analysis failed: {str(e)}",
	                'ats_score': 0,
	                'document_type': 'unknown',
	                'keyword_match': {'score': 0, 'found_skills': [], 'missing_skills': []},
	                'section_score': 0,
	                'format_score': 0,
	                'suggestions': [f"Error analyzing resume: {str(e)}. Please check your file and try again."]
	            }