"""pdf_extractor.py -- PDF text extraction for the Resume ATS Score Checker API."""
import pdfplumber
import PyPDF2
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import re
from typing import Dict, List
class PDFExtractor:
    """Extract text from PDF resumes with fallback mechanisms.

    Extraction strategy, tried in order until one yields non-empty text:
      1. pdfplumber           -- primary, best for digitally-generated PDFs
      2. PyPDF2               -- fallback when pdfplumber fails
      3. Tesseract OCR        -- last resort (via pdf2image), for scanned PDFs
    """

    @staticmethod
    def extract_text(pdf_path: str) -> str:
        """Extract text from PDF using pdfplumber, fallback to PyPDF2, then OCR.

        Args:
            pdf_path: Filesystem path to the PDF file.

        Returns:
            Normalized text (see ``normalize_text``).

        Raises:
            Exception: If all three extraction methods fail or return empty text.
        """
        try:
            # Primary: pdfplumber
            with pdfplumber.open(pdf_path) as pdf:
                # join() avoids quadratic string concatenation across pages;
                # extract_text() may return None for image-only pages.
                text = "".join(page.extract_text() or "" for page in pdf.pages)
            if text.strip():
                return PDFExtractor.normalize_text(text)
        except Exception as e:
            print(f"pdfplumber failed: {e}, trying PyPDF2...")

        try:
            # Fallback 1: PyPDF2
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                text = "".join(page.extract_text() or "" for page in reader.pages)
            if text.strip():
                return PDFExtractor.normalize_text(text)
        except Exception as e:
            print(f"PyPDF2 failed: {e}, trying OCR...")

        try:
            # Fallback 2: OCR with Tesseract (slow: rasterizes every page at 300 dpi)
            print("Attempting OCR extraction (this may take a moment)...")
            images = convert_from_path(pdf_path, dpi=300)
            parts = []
            for i, image in enumerate(images):
                page_text = pytesseract.image_to_string(image, lang='eng')
                parts.append(page_text + "\n")
                print(f"OCR page {i+1}: extracted {len(page_text)} chars")
            text = "".join(parts)
            if text.strip():
                return PDFExtractor.normalize_text(text)
            raise Exception("OCR extraction returned empty text")
        except Exception as e:
            # Chain the root cause so callers can inspect e.__cause__.
            raise Exception(f"All extraction methods failed. Last error (OCR): {e}") from e

    @staticmethod
    def normalize_text(text: str) -> str:
        """Clean and normalize extracted text.

        Collapses all whitespace runs (including newlines) to single spaces and
        strips characters outside word chars, whitespace, and ``.,;:()-@``.
        """
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s.,;:()\-@]', '', text)
        return text.strip()

    @staticmethod
    def detect_sections(text: str) -> Dict[str, str]:
        """Detect common resume sections by keyword headers.

        Args:
            text: Resume text (typically already normalized to one line).

        Returns:
            Dict with keys 'experience', 'education', 'skills', 'projects',
            'summary', 'other'. Each detected section holds the text from its
            header keyword up to the next detected header; if no header matches
            at all, the full text goes under 'other'.

        NOTE(review): the keyword patterns match anywhere in the text, not only
        at line starts, so a word like "skills" mid-sentence starts a section --
        confirm this heuristic is acceptable for typical inputs.
        """
        sections = {
            'experience': '',
            'education': '',
            'skills': '',
            'projects': '',
            'summary': '',
            'other': ''
        }
        # Section headers patterns (case handled by matching against lowered text)
        patterns = {
            'experience': r'(experience|work history|employment|professional experience)',
            'education': r'(education|academic|qualifications|degrees)',
            'skills': r'(skills|technical skills|competencies|expertise)',
            'projects': r'(projects|portfolio|work samples)',
            'summary': r'(summary|objective|profile|about)'
        }
        text_lower = text.lower()

        # Collect (position, section) for every keyword hit.
        section_positions = []
        for section_name, pattern in patterns.items():
            for match in re.finditer(pattern, text_lower):
                section_positions.append((match.start(), section_name))
        # Sort by position so each section runs until the next header.
        section_positions.sort()

        # Extract section content; a repeated header overwrites with the
        # later occurrence (preserves original behavior).
        for i, (start_pos, section_name) in enumerate(section_positions):
            end_pos = section_positions[i + 1][0] if i + 1 < len(section_positions) else len(text)
            sections[section_name] = text[start_pos:end_pos].strip()

        # If no sections detected, put everything in 'other'
        if not any(sections.values()):
            sections['other'] = text
        return sections