# Source: resume-analyzer / src/resume_parser.py
# (commit 1b6c396, author: ananttripathiak)
"""
Resume Parser Module
Handles PDF/DOCX parsing and text extraction from resumes.
"""
import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Optional

import pdfplumber
import PyPDF2
from docx import Document
logger = logging.getLogger(__name__)
class ResumeParser:
    """
    Resume parser that extracts text and basic metadata from PDF, DOCX
    and plain-text resume files.

    PDF extraction tries pdfplumber first (better with complex layouts)
    and falls back to PyPDF2 when pdfplumber yields little or no text.
    """

    def __init__(self):
        # Lowercase file extensions accepted by parse_file().
        self.supported_formats = ['.pdf', '.docx', '.txt']
        logger.info("ResumeParser initialized")

    def parse_pdf(self, file_path: str) -> str:
        """
        Extract text from a PDF using multiple backends for reliability.

        Args:
            file_path: Path to the PDF file.

        Returns:
            Extracted text content.

        Raises:
            ValueError: If neither backend can parse the file.
        """
        try:
            # pdfplumber first: handles multi-column/table layouts better.
            text = self._parse_with_pdfplumber(file_path)
            # Heuristic: fewer than ~50 meaningful characters suggests the
            # extraction failed (e.g. image-only pages) — try PyPDF2 instead.
            if text and len(text.strip()) > 50:
                return text
            return self._parse_with_pypdf2(file_path)
        except Exception as e:
            logger.error(f"Error parsing PDF: {e}")
            # Chain the original exception so the root cause is preserved.
            raise ValueError(f"Failed to parse PDF: {str(e)}") from e

    def _parse_with_pdfplumber(self, file_path: str) -> str:
        """Parse a PDF with pdfplumber; page texts are joined by newlines."""
        text_content = []
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    text_content.append(text)
        return "\n".join(text_content)

    def _parse_with_pypdf2(self, file_path: str) -> str:
        """Parse a PDF with PyPDF2 (fallback backend)."""
        text_content = []
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text = page.extract_text()
                if text:
                    text_content.append(text)
        return "\n".join(text_content)

    def parse_docx(self, file_path: str) -> str:
        """
        Extract text from a DOCX file (paragraphs plus table cells).

        Args:
            file_path: Path to the DOCX file.

        Returns:
            Extracted text content.

        Raises:
            ValueError: If python-docx cannot read the file.
        """
        try:
            doc = Document(file_path)
            text_content = [p.text for p in doc.paragraphs if p.text.strip()]
            # Tables often hold skills/experience grids — include their cells.
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        if cell.text.strip():
                            text_content.append(cell.text)
            return "\n".join(text_content)
        except Exception as e:
            logger.error(f"Error parsing DOCX: {e}")
            raise ValueError(f"Failed to parse DOCX: {str(e)}") from e

    def parse_file(self, file_path: str) -> Dict[str, Any]:
        """
        Parse a resume file and return its text with metadata.

        Args:
            file_path: Path to the resume file (.pdf, .docx or .txt).

        Returns:
            Dictionary with keys: raw_text, cleaned_text, file_name,
            file_type, metadata, word_count, char_count.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the extension is unsupported or parsing fails.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")
        extension = path.suffix.lower()
        if extension not in self.supported_formats:
            raise ValueError(f"Unsupported file format: {extension}")
        # Extract text based on file type.
        if extension == '.pdf':
            raw_text = self.parse_pdf(str(path))
        elif extension == '.docx':
            raw_text = self.parse_docx(str(path))
        else:  # '.txt' — the only remaining supported format
            with open(path, 'r', encoding='utf-8') as f:
                raw_text = f.read()
        cleaned = self.clean_text(raw_text)
        metadata = self.extract_metadata(cleaned)
        return {
            # Bug fix: raw_text previously duplicated the cleaned text;
            # now it carries the text exactly as extracted.
            "raw_text": raw_text,
            "cleaned_text": cleaned,
            "file_name": path.name,
            "file_type": extension,
            "metadata": metadata,
            "word_count": len(cleaned.split()),
            "char_count": len(cleaned),
        }

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize extracted text.

        Args:
            text: Raw extracted text.

        Returns:
            Cleaned single-spaced text.
        """
        # Normalize line endings first. (Bug fix: the original collapsed all
        # whitespace *before* this step, which made it dead code.)
        text = text.replace('\r\n', '\n').replace('\r', '\n')
        # Drop unusual symbols but keep punctuation meaningful in resumes
        # (word chars, whitespace, and . , ; : - ( ) @ /).
        text = re.sub(r'[^\w\s\.,;:\-\(\)@/]', '', text)
        # Collapse whitespace runs last so gaps left by removed characters
        # cannot produce double spaces (they could in the original order).
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def extract_metadata(self, text: str) -> Dict[str, Any]:
        """
        Extract basic contact metadata from resume text.

        Args:
            text: Resume text.

        Returns:
            Dictionary with 'emails', 'phones', 'linkedin', 'github'.
        """
        metadata: Dict[str, Any] = {}
        # Email. (Bug fix: the TLD class was [A-Z|a-z], which also matched
        # a literal '|'.)
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        emails = re.findall(email_pattern, text)
        metadata['emails'] = emails[:1] if emails else []
        # Phone numbers. (Bug fix: the original used findall with a capture
        # group, which returns only the group — usually just the country
        # code or '' — instead of the full number. Use a non-capturing
        # group and take the whole match.)
        phone_pattern = r'(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
        phones = [m.group(0) for m in re.finditer(phone_pattern, text)]
        metadata['phones'] = phones[:2]
        # LinkedIn profile slug.
        linkedin_pattern = r'linkedin\.com/in/[\w-]+'
        linkedin = re.findall(linkedin_pattern, text.lower())
        metadata['linkedin'] = linkedin[0] if linkedin else None
        # GitHub profile slug.
        github_pattern = r'github\.com/[\w-]+'
        github = re.findall(github_pattern, text.lower())
        metadata['github'] = github[0] if github else None
        return metadata

    def extract_sections(self, text: str) -> Dict[str, str]:
        """
        Identify and extract common resume sections.

        A section runs from its header match to the start of the nearest
        *other* section header found at least 50 characters later (the
        offset avoids re-matching text inside the current header area).

        Args:
            text: Resume text.

        Returns:
            Dictionary mapping section names to their content.
        """
        sections: Dict[str, str] = {}
        # Common section headers, matched case-insensitively via text_lower.
        section_patterns = {
            'summary': r'(professional summary|summary|profile|objective)',
            'experience': r'(work experience|experience|employment history|professional experience)',
            'education': r'(education|academic background|qualifications)',
            'skills': r'(skills|technical skills|core competencies|expertise)',
            'projects': r'(projects|portfolio)',
            'certifications': r'(certifications|certificates|licenses)'
        }
        text_lower = text.lower()
        for section_name, pattern in section_patterns.items():
            match = re.search(pattern, text_lower)
            if not match:
                continue
            start_idx = match.start()
            # Section ends at the closest following header, else end of text.
            next_section_idx = len(text)
            for other_pattern in section_patterns.values():
                if other_pattern == pattern:
                    continue
                next_match = re.search(other_pattern, text_lower[start_idx + 50:])
                if next_match:
                    candidate_idx = start_idx + 50 + next_match.start()
                    if candidate_idx < next_section_idx:
                        next_section_idx = candidate_idx
            sections[section_name] = text[start_idx:next_section_idx].strip()
        return sections