Spaces:

ananttripathiak
/

resume-analyzer

Running

App Files Files Community

resume-analyzer / src /resume_parser.py

ananttripathiak

Create resume_parser.py

1b6c396 verified about 2 months ago

raw

history blame contribute delete

8.15 kB

	"""
	Resume Parser Module
	Handles PDF/DOCX parsing and text extraction from resumes.
	"""

	import re
	import PyPDF2
	import pdfplumber
	from typing import Dict, Optional, List
	from pathlib import Path
	import logging
	from docx import Document

	logger = logging.getLogger(__name__)


	class ResumeParser:
	"""
	Advanced resume parser that extracts text and metadata from PDF and DOCX files.
	"""

	def __init__(self):
	self.supported_formats = ['.pdf', '.docx', '.txt']
	logger.info("ResumeParser initialized")

	def parse_pdf(self, file_path: str) -> str:
	"""
	Extract text from PDF using multiple methods for reliability.

	Args:
	file_path: Path to PDF file

	Returns:
	Extracted text content
	"""
	try:
	# Try pdfplumber first (better for complex layouts)
	text = self._parse_with_pdfplumber(file_path)
	if text and len(text.strip()) > 50:
	return text

	# Fallback to PyPDF2
	text = self._parse_with_pypdf2(file_path)
	return text

	except Exception as e:
	logger.error(f"Error parsing PDF: {e}")
	raise ValueError(f"Failed to parse PDF: {str(e)}")

	def _parse_with_pdfplumber(self, file_path: str) -> str:
	"""Parse PDF using pdfplumber."""
	text_content = []
	with pdfplumber.open(file_path) as pdf:
	for page in pdf.pages:
	text = page.extract_text()
	if text:
	text_content.append(text)
	return "\n".join(text_content)

	def _parse_with_pypdf2(self, file_path: str) -> str:
	"""Parse PDF using PyPDF2 as fallback."""
	text_content = []
	with open(file_path, 'rb') as file:
	pdf_reader = PyPDF2.PdfReader(file)
	for page in pdf_reader.pages:
	text = page.extract_text()
	if text:
	text_content.append(text)
	return "\n".join(text_content)

	def parse_docx(self, file_path: str) -> str:
	"""
	Extract text from DOCX file.

	Args:
	file_path: Path to DOCX file

	Returns:
	Extracted text content
	"""
	try:
	doc = Document(file_path)
	text_content = []

	for paragraph in doc.paragraphs:
	if paragraph.text.strip():
	text_content.append(paragraph.text)

	# Also extract from tables
	for table in doc.tables:
	for row in table.rows:
	for cell in row.cells:
	if cell.text.strip():
	text_content.append(cell.text)

	return "\n".join(text_content)

	except Exception as e:
	logger.error(f"Error parsing DOCX: {e}")
	raise ValueError(f"Failed to parse DOCX: {str(e)}")

	def parse_file(self, file_path: str) -> Dict[str, any]:
	"""
	Parse resume file and extract text with metadata.

	Args:
	file_path: Path to resume file

	Returns:
	Dictionary containing parsed content and metadata
	"""
	file_path = Path(file_path)

	if not file_path.exists():
	raise FileNotFoundError(f"File not found: {file_path}")

	extension = file_path.suffix.lower()

	if extension not in self.supported_formats:
	raise ValueError(f"Unsupported file format: {extension}")

	# Extract text based on file type
	if extension == '.pdf':
	text = self.parse_pdf(str(file_path))
	elif extension == '.docx':
	text = self.parse_docx(str(file_path))
	elif extension == '.txt':
	with open(file_path, 'r', encoding='utf-8') as f:
	text = f.read()
	else:
	raise ValueError(f"Unsupported format: {extension}")

	# Clean and normalize text
	text = self.clean_text(text)

	# Extract basic metadata
	metadata = self.extract_metadata(text)

	return {
	"raw_text": text,
	"cleaned_text": text,
	"file_name": file_path.name,
	"file_type": extension,
	"metadata": metadata,
	"word_count": len(text.split()),
	"char_count": len(text)
	}

	def clean_text(self, text: str) -> str:
	"""
	Clean and normalize extracted text.

	Args:
	text: Raw extracted text

	Returns:
	Cleaned text
	"""
	# Remove excessive whitespace
	text = re.sub(r'\s+', ' ', text)

	# Remove special characters but keep important punctuation
	text = re.sub(r'[^\w\s\.,;:\-\(\)@/]', '', text)

	# Normalize line breaks
	text = text.replace('\r\n', '\n').replace('\r', '\n')

	return text.strip()

	def extract_metadata(self, text: str) -> Dict[str, any]:
	"""
	Extract basic metadata from resume text.

	Args:
	text: Resume text

	Returns:
	Dictionary with metadata
	"""
	metadata = {}

	# Extract email
	email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z\|a-z]{2,}\b'
	emails = re.findall(email_pattern, text)
	metadata['emails'] = emails[:1] if emails else []

	# Extract phone numbers
	phone_pattern = r'(\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
	phones = re.findall(phone_pattern, text)
	metadata['phones'] = [p[0] + p[1] if isinstance(p, tuple) else p for p in phones[:2]]

	# Extract LinkedIn
	linkedin_pattern = r'linkedin\.com/in/[\w-]+'
	linkedin = re.findall(linkedin_pattern, text.lower())
	metadata['linkedin'] = linkedin[0] if linkedin else None

	# Extract GitHub
	github_pattern = r'github\.com/[\w-]+'
	github = re.findall(github_pattern, text.lower())
	metadata['github'] = github[0] if github else None

	return metadata

	def extract_sections(self, text: str) -> Dict[str, str]:
	"""
	Identify and extract common resume sections.

	Args:
	text: Resume text

	Returns:
	Dictionary mapping section names to content
	"""
	sections = {}

	# Common section headers
	section_patterns = {
	'summary': r'(professional summary\|summary\|profile\|objective)',
	'experience': r'(work experience\|experience\|employment history\|professional experience)',
	'education': r'(education\|academic background\|qualifications)',
	'skills': r'(skills\|technical skills\|core competencies\|expertise)',
	'projects': r'(projects\|portfolio)',
	'certifications': r'(certifications\|certificates\|licenses)'
	}

	text_lower = text.lower()

	for section_name, pattern in section_patterns.items():
	match = re.search(pattern, text_lower)
	if match:
	start_idx = match.start()
	# Find next section or end of text
	next_section_idx = len(text)
	for other_pattern in section_patterns.values():
	if other_pattern != pattern:
	next_match = re.search(other_pattern, text_lower[start_idx + 50:])
	if next_match:
	candidate_idx = start_idx + 50 + next_match.start()
	if candidate_idx < next_section_idx:
	next_section_idx = candidate_idx

	sections[section_name] = text[start_idx:next_section_idx].strip()

	return sections