Spaces:

Akash-Dragon
/

ATS-SCORE-CHECKER

Sleeping

App Files Files Community

ATS-SCORE-CHECKER / processing /pdf_extractor.py

Akash-Dragon

Initial commit: Resume ATS Score Checker API

4689a82 24 days ago

raw

history blame contribute delete

4 kB

	import pdfplumber
	import PyPDF2
	import pytesseract
	from pdf2image import convert_from_path
	from PIL import Image
	import re
	from typing import Dict, List

	class PDFExtractor:
	"""Extract text from PDF resumes with fallback mechanisms"""

	@staticmethod
	def extract_text(pdf_path: str) -> str:
	"""Extract text from PDF using pdfplumber, fallback to PyPDF2, then OCR"""
	try:
	# Primary: pdfplumber
	with pdfplumber.open(pdf_path) as pdf:
	text = ""
	for page in pdf.pages:
	text += page.extract_text() or ""

	if text.strip():
	return PDFExtractor.normalize_text(text)
	except Exception as e:
	print(f"pdfplumber failed: {e}, trying PyPDF2...")

	try:
	# Fallback 1: PyPDF2
	with open(pdf_path, 'rb') as file:
	reader = PyPDF2.PdfReader(file)
	text = ""
	for page in reader.pages:
	text += page.extract_text() or ""

	if text.strip():
	return PDFExtractor.normalize_text(text)
	except Exception as e:
	print(f"PyPDF2 failed: {e}, trying OCR...")

	try:
	# Fallback 2: OCR with Tesseract
	print("Attempting OCR extraction (this may take a moment)...")
	images = convert_from_path(pdf_path, dpi=300)
	text = ""
	for i, image in enumerate(images):
	page_text = pytesseract.image_to_string(image, lang='eng')
	text += page_text + "\n"
	print(f"OCR page {i+1}: extracted {len(page_text)} chars")

	if text.strip():
	return PDFExtractor.normalize_text(text)
	else:
	raise Exception("OCR extraction returned empty text")
	except Exception as e:
	raise Exception(f"All extraction methods failed. Last error (OCR): {e}")

	@staticmethod
	def normalize_text(text: str) -> str:
	"""Clean and normalize extracted text"""
	# Remove extra whitespace
	text = re.sub(r'\s+', ' ', text)
	# Remove special characters but keep punctuation
	text = re.sub(r'[^\w\s.,;:()\-@]', '', text)
	return text.strip()

	@staticmethod
	def detect_sections(text: str) -> Dict[str, str]:
	"""Detect common resume sections"""
	sections = {
	'experience': '',
	'education': '',
	'skills': '',
	'projects': '',
	'summary': '',
	'other': ''
	}

	# Section headers patterns
	patterns = {
	'experience': r'(experience\|work history\|employment\|professional experience)',
	'education': r'(education\|academic\|qualifications\|degrees)',
	'skills': r'(skills\|technical skills\|competencies\|expertise)',
	'projects': r'(projects\|portfolio\|work samples)',
	'summary': r'(summary\|objective\|profile\|about)'
	}

	text_lower = text.lower()

	# Find section boundaries
	section_positions = []
	for section_name, pattern in patterns.items():
	matches = list(re.finditer(pattern, text_lower))
	for match in matches:
	section_positions.append((match.start(), section_name))

	# Sort by position
	section_positions.sort()

	# Extract section content
	for i, (start_pos, section_name) in enumerate(section_positions):
	end_pos = section_positions[i + 1][0] if i + 1 < len(section_positions) else len(text)
	sections[section_name] = text[start_pos:end_pos].strip()

	# If no sections detected, put everything in 'other'
	if not any(sections.values()):
	sections['other'] = text

	return sections