# Upload folder using huggingface_hub

# e4f4981 verified 9 months ago

# 12.5 kB

	"""
	PDF Parser Module for Resume Analysis
	Provides advanced text extraction with OCR support for PDF documents.
	"""

	import os
	import sys
	from pathlib import Path
	from typing import Optional, List, Dict, Any
	import logging

	# PyPDF2 is an optional dependency: record whether it imported so callers
	# can see the outcome, and tell the user how to install it when missing.
	try:
	    from PyPDF2 import PdfReader
	except ImportError:
	    PYPDF2_AVAILABLE = False
	    print("PyPDF2 not found. Install with: pip install PyPDF2")
	else:
	    PYPDF2_AVAILABLE = True

	# OCR needs both pytesseract and pdf2image; if either import fails the
	# OCR fallback is disabled and an install hint is printed.
	try:
	    import pytesseract
	    from pdf2image import convert_from_path
	except ImportError:
	    OCR_AVAILABLE = False
	    print("OCR libraries not found. Install with: pip install pytesseract pdf2image")
	else:
	    OCR_AVAILABLE = True

	# Configure logging at import time: INFO level, each record formatted as
	# "<timestamp> - <level name> - <message>".
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	# Module-level logger used by the class and functions defined in this file.
	logger = logging.getLogger(__name__)

	class PDFExtractor:
	    """Advanced PDF text extractor with OCR support and error handling.

	    Typical use is ``PDFExtractor(path).extract_all_text()``, which validates
	    the file, loads it with PyPDF2 and extracts text page by page. For each
	    page, direct text extraction is tried first, then OCR (when the OCR
	    libraries imported successfully), then annotation / form-field text.
	    """

	    def __init__(self, pdf_path: str):
	        """Remember the PDF location; nothing is read until a method is called.

	        Args:
	            pdf_path: Path to the PDF file to process.
	        """
	        self.pdf_path = Path(pdf_path)
	        self.reader = None
	        self.is_encrypted = False
	        self.text_content = {}

	    def validate_file(self) -> bool:
	        """Validate PDF file exists and is accessible.

	        Returns:
	            True when the path exists, is a regular file and is non-empty;
	            False otherwise (the reason is logged as an error).
	        """
	        path = self.pdf_path
	        try:
	            if not path.exists():
	                logger.error(f"PDF file not found: {path}")
	                return False

	            if not path.is_file():
	                logger.error(f"Path is not a file: {path}")
	                return False

	            if path.stat().st_size == 0:
	                logger.error(f"PDF file is empty: {path}")
	                return False
	        except OSError as e:
	            # e.g. permission problems while probing the path: report the
	            # file as invalid instead of letting the exception escape.
	            logger.error(f"Cannot access PDF file {path}: {e}")
	            return False

	        return True

	    def load_pdf(self) -> bool:
	        """Load PDF with error handling.

	        Sets ``self.reader`` and ``self.is_encrypted``.

	        Returns:
	            True when the document was opened and its page count could be
	            read; False otherwise (the reason is logged as an error).
	        """
	        if not PYPDF2_AVAILABLE:
	            # Without this guard the failure would surface as an opaque
	            # "name 'PdfReader' is not defined" message.
	            logger.error("Failed to load PDF: PyPDF2 is not installed")
	            return False

	        try:
	            self.reader = PdfReader(self.pdf_path)

	            # Check if PDF is encrypted
	            if self.reader.is_encrypted:
	                self.is_encrypted = True
	                logger.warning("PDF is encrypted. Text extraction may be limited.")
	                # Documents protected only by an owner password open with an
	                # empty user password; try that before touching the pages.
	                try:
	                    self.reader.decrypt("")
	                except Exception as e:
	                    logger.warning(f"Could not decrypt PDF with empty password: {e}")

	            logger.info(f"PDF loaded successfully. Pages: {len(self.reader.pages)}")
	            return True

	        except Exception as e:
	            logger.error(f"Failed to load PDF: {e}")
	            return False

	    def _extract_page_text(self, page, page_num: int):
	        """Extract one page's text and report which method produced it.

	        Args:
	            page: Page object exposing ``extract_text()``.
	            page_num: Zero-based page index (log messages show it one-based).

	        Returns:
	            Tuple ``(text, method)`` where ``method`` is ``"direct"``,
	            ``"ocr"``, ``"alternative"`` or ``"none"``. ``text`` is ``""``
	            when ``method`` is ``"none"``.
	        """
	        label = page_num + 1

	        # Method 1: Direct text extraction
	        try:
	            # Normalise a None result so the fallbacks below can run safely.
	            direct_text = page.extract_text() or ""
	        except Exception as e:
	            logger.warning(f"Page {label}: Text extraction failed - {e}")
	            direct_text = ""
	        if direct_text.strip():
	            logger.info(f"Page {label}: Text extracted successfully")
	            return direct_text, "direct"

	        # Method 2: OCR for scanned PDFs (if available). Whitespace-only
	        # direct text counts as "no text", so such pages still reach OCR.
	        if OCR_AVAILABLE:
	            try:
	                ocr_text = self._extract_text_with_ocr(page_num)
	            except Exception as e:
	                logger.warning(f"Page {label}: OCR failed - {e}")
	                ocr_text = ""
	            if ocr_text and ocr_text.strip():
	                logger.info(f"Page {label}: Text extracted via OCR")
	                return ocr_text, "ocr"

	        # Method 3: Try alternative extraction methods
	        fragments = []
	        try:
	            # Try to get text from annotations (attribute may be absent or None)
	            for annotation in getattr(page, 'annotations', None) or []:
	                if hasattr(annotation, 'get_text'):
	                    fragments.append(annotation.get_text() + "\n")

	            # Try to get text from form fields
	            if hasattr(page, 'get_form_text_fields'):
	                form_fields = page.get_form_text_fields() or {}
	                for field_name, field_value in form_fields.items():
	                    if field_value:
	                        fragments.append(f"{field_name}: {field_value}\n")

	        except Exception as e:
	            # Best effort: keep whatever was collected before the failure.
	            logger.debug(f"Alternative extraction methods failed: {e}")

	        alternative_text = "".join(fragments).strip()
	        if alternative_text:
	            return alternative_text, "alternative"
	        return "", "none"

	    def extract_text_from_page(self, page, page_num: int) -> str:
	        """Extract text from a single page using multiple methods.

	        Args:
	            page: Page object exposing ``extract_text()``.
	            page_num: Zero-based page index.

	        Returns:
	            The extracted text, or ``""`` when every method came up empty.
	        """
	        text, _method = self._extract_page_text(page, page_num)
	        return text

	    def _extract_text_with_ocr(self, page_num: int) -> str:
	        """Extract text from scanned PDF using OCR.

	        Args:
	            page_num: Zero-based page index.

	        Returns:
	            The OCR text of the page, or ``""`` when rendering produced no
	            image or OCR failed (the failure is logged as an error).
	        """
	        try:
	            # Convert PDF page to image (page numbers are 1-based here)
	            images = convert_from_path(
	                self.pdf_path,
	                first_page=page_num + 1,
	                last_page=page_num + 1,
	                dpi=300,  # Higher DPI for better OCR accuracy
	            )

	            if images:
	                # Extract text using OCR
	                return pytesseract.image_to_string(images[0], lang='eng')

	        except Exception as e:
	            logger.error(f"OCR processing failed: {e}")

	        return ""

	    def extract_all_text(self) -> Dict[str, Any]:
	        """Extract text from all pages with comprehensive metadata.

	        Returns:
	            ``{"error": <message>}`` when the file is invalid or cannot be
	            loaded. Otherwise a dict with the keys ``file_path``,
	            ``total_pages``, ``is_encrypted``, ``pages`` (1-based page number
	            -> ``{"text", "has_text", "extraction_method"}``), ``full_text``,
	            ``metadata`` and ``extraction_date`` (ISO-8601 local timestamp).
	        """
	        # Local import: only this method needs it.
	        from datetime import datetime

	        if not self.validate_file():
	            return {"error": "Invalid PDF file"}

	        if not self.load_pdf():
	            return {"error": "Failed to load PDF"}

	        result = {
	            "file_path": str(self.pdf_path),
	            "total_pages": len(self.reader.pages),
	            "is_encrypted": self.is_encrypted,
	            "pages": {},
	            "full_text": "",
	            "metadata": {},
	            # save_extracted_text() prints this; it used to be missing, so
	            # the saved report always said "Unknown".
	            "extraction_date": datetime.now().isoformat(timespec="seconds"),
	        }

	        # Extract metadata
	        metadata_keys = {
	            "title": '/Title',
	            "author": '/Author',
	            "subject": '/Subject',
	            "creator": '/Creator',
	            "producer": '/Producer',
	            "creation_date": '/CreationDate',
	            "modification_date": '/ModDate',
	        }
	        try:
	            info = self.reader.metadata
	            if info:
	                result["metadata"] = {
	                    name: info.get(pdf_key, '')
	                    for name, pdf_key in metadata_keys.items()
	                }
	        except Exception as e:
	            logger.warning(f"Failed to extract metadata: {e}")

	        # Extract text from each page
	        page_chunks = []
	        for page_num, page in enumerate(self.reader.pages):
	            page_text, method = self._extract_page_text(page, page_num)

	            result["pages"][page_num + 1] = {
	                "text": page_text,
	                "has_text": bool(page_text.strip()),
	                # The method that actually produced the text, not just
	                # "direct" for anything non-empty.
	                "extraction_method": method,
	            }

	            page_chunks.append(f"\n--- Page {page_num + 1} ---\n{page_text}\n")

	        result["full_text"] = "".join(page_chunks)
	        return result

	    def save_extracted_text(self, output_path: Optional[str] = None) -> str:
	        """Save extracted text to a file.

	        Args:
	            output_path: Destination file. Defaults to the PDF path with its
	                suffix replaced by ``.txt``.

	        Returns:
	            The path written to, as a string, or ``""`` when extraction or
	            writing failed (the reason is logged as an error).
	        """
	        result = self.extract_all_text()

	        if "error" in result:
	            logger.error(f"Cannot save: {result['error']}")
	            return ""

	        if not output_path:
	            output_path = self.pdf_path.with_suffix('.txt')

	        try:
	            with open(output_path, 'w', encoding='utf-8') as f:
	                f.write("PDF Text Extraction Results\n")
	                f.write(f"File: {result['file_path']}\n")
	                f.write(f"Pages: {result['total_pages']}\n")
	                f.write(f"Encrypted: {result['is_encrypted']}\n")
	                f.write(f"Extracted on: {result.get('extraction_date', 'Unknown')}\n")
	                f.write("=" * 50 + "\n\n")
	                f.write(result['full_text'])

	            logger.info(f"Text saved to: {output_path}")
	            return str(output_path)

	        except Exception as e:
	            logger.error(f"Failed to save text: {e}")
	            return ""

	def extract_resume_text(file_path: str) -> Dict[str, Any]:
	    """
	    Run the PDF extractor on one resume and wrap the outcome.

	    Args:
	        file_path: Path to the resume file

	    Returns:
	        On success, a dict with:
	        - success: True
	        - file_path: Original file path
	        - text: Extracted text content of all pages
	        - metadata: PDF metadata if available
	        - pages: Page-by-page extraction details
	        - total_pages: Number of pages
	        - is_encrypted: Whether the PDF is encrypted
	        On failure, a dict with:
	        - success: False
	        - file_path: Original file path
	        - error: Error message
	    """
	    outcome: Dict[str, Any] = {"success": False, "file_path": file_path}

	    try:
	        extraction = PDFExtractor(file_path).extract_all_text()

	        if "error" not in extraction:
	            # Happy path: copy the interesting fields under their public names.
	            outcome["success"] = True
	            outcome["text"] = extraction["full_text"]
	            outcome["metadata"] = extraction["metadata"]
	            outcome["pages"] = extraction["pages"]
	            outcome["total_pages"] = extraction["total_pages"]
	            outcome["is_encrypted"] = extraction["is_encrypted"]
	            return outcome

	        # The extractor reported a failure of its own.
	        outcome["error"] = extraction["error"]
	        return outcome

	    except Exception as e:
	        # Anything unexpected is logged and reported as a failure dict.
	        logger.error(f"Failed to extract text from {file_path}: {e}")
	        return {
	            "success": False,
	            "file_path": file_path,
	            "error": str(e)
	        }

	def process_batch_resumes(file_paths: List[str]) -> List[Dict[str, Any]]:
	    """
	    Extract text from several resume files, one after another.

	    Progress, per-file outcome and a final success/failure tally are logged.

	    Args:
	        file_paths: List of file paths to process

	    Returns:
	        One extraction result dict per input path, in input order
	    """
	    total_files = len(file_paths)
	    logger.info(f"Starting batch processing of {total_files} files")

	    results = []
	    successful = 0

	    for i, file_path in enumerate(file_paths, start=1):
	        logger.info(f"Processing file {i}/{total_files}: {file_path}")
	        result = extract_resume_text(file_path)
	        results.append(result)

	        if not result["success"]:
	            logger.warning(f"✗ Failed to process: {file_path} - {result['error']}")
	            continue

	        # Tally successes as we go rather than re-scanning the list later.
	        successful += 1
	        logger.info(f"✓ Successfully processed: {file_path}")

	    # Summary statistics
	    failed = total_files - successful
	    logger.info(f"Batch processing complete: {successful} successful, {failed} failed")

	    return results

	def extract_resume_sections(text: str) -> Dict[str, str]:
	    """
	    Split raw resume text into coarse sections using keyword matching.

	    Every line is checked (case-insensitively, by substring) against the
	    keyword groups below, in order; the first group that matches makes its
	    section the current one. Each non-blank line, including the matching
	    line itself, is appended to the current section. Lines before any
	    match go to "other".

	    Args:
	        text: Raw resume text

	    Returns:
	        Dict with the keys contact_info, skills, experience, education,
	        summary and other, each holding the collected lines joined by
	        newlines and stripped of surrounding whitespace
	    """
	    # Order matters: the first group with a hit wins.
	    section_rules = (
	        ("skills", ('skill', 'technology', 'programming', 'framework')),
	        ("experience", ('experience', 'work', 'employment', 'job')),
	        ("education", ('education', 'degree', 'university', 'college', 'school')),
	        ("summary", ('summary', 'profile', 'objective', 'about')),
	        ("contact_info", ('email', 'phone', '@', 'linkedin', 'github')),
	    )

	    collected = {
	        "contact_info": [],
	        "skills": [],
	        "experience": [],
	        "education": [],
	        "summary": [],
	        "other": [],
	    }

	    active = "other"
	    for raw_line in text.split('\n'):
	        probe = raw_line.lower().strip()

	        # Switch section when this line mentions one of the keywords.
	        for section_name, keywords in section_rules:
	            if any(word in probe for word in keywords):
	                active = section_name
	                break

	        # Blank lines never contribute content.
	        if raw_line.strip():
	            collected[active].append(raw_line)

	    return {name: "\n".join(rows).strip() for name, rows in collected.items()}

	def main():
	    """Command-line entry point (kept for backward compatibility).

	    Expects the PDF path as the first command-line argument, extracts its
	    text and prints a short report; prints usage help when no argument is
	    given.
	    """
	    # No path supplied: show usage and stop.
	    if len(sys.argv) <= 1:
	        print("Usage: python pdf_parser.py <file_path>")
	        print("For batch processing, use the programmatic functions directly.")
	        return

	    pdf_path = sys.argv[1]
	    result = extract_resume_text(pdf_path)

	    if not result["success"]:
	        print(f"✗ Failed to extract text: {result['error']}")
	        return

	    print(f"✓ Successfully extracted text from: {pdf_path}")
	    print(f"Text length: {len(result['text'])} characters")
	    print(f"Pages: {result['total_pages']}")


	# Run the CLI only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()