# File size: 1,823 Bytes | 92c68e3
import PyPDF2
from typing import Optional
class PDFProcessor:
    """Handles PDF file processing and text extraction."""

    def __init__(self):
        """Initialize PDF processor (no instance state is set up)."""
        pass

    def extract_text(self, pdf_path: str) -> str:
        """
        Extract text content from a PDF file.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            str: Text of all pages, one page per line group (pages are
            joined with a newline), with leading and trailing whitespace
            stripped

        Raises:
            FileNotFoundError: If PDF file doesn't exist
            PyPDF2.errors.PdfReadError: If PyPDF2 reports that the PDF
                cannot be read
            Exception: For any other failure while processing the PDF
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # Defensively treat a None result from extract_text() as
                # empty text so a single page cannot abort the whole
                # extraction with a TypeError.
                page_texts = [
                    page.extract_text() or "" for page in reader.pages
                ]
        except FileNotFoundError as e:
            # Chain the original error so its errno/filename stay reachable
            # through __cause__.
            raise FileNotFoundError(f"PDF file not found: {pdf_path}") from e
        except PyPDF2.errors.PdfReadError as e:
            # PdfReadError is referenced via PyPDF2.errors; it is not
            # looked up as a top-level attribute of the PyPDF2 package.
            raise PyPDF2.errors.PdfReadError(
                f"Error reading PDF file: {str(e)}"
            ) from e
        except Exception as e:
            raise Exception(
                f"Unexpected error processing PDF: {str(e)}"
            ) from e

        # Joining with "\n" and stripping yields the same text as appending
        # "\n" after every page and stripping the result.
        return "\n".join(page_texts).strip()

    def get_metadata(self, pdf_path: str) -> dict:
        """
        Extract metadata from PDF file.

        This method is best-effort: failures are reported in the returned
        dict instead of being raised.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            dict: The metadata object provided by the PyPDF2 reader, an
            empty dict when the reader provides none, or
            {"error": <message>} if anything goes wrong
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # Read the property while the file is still open.
                metadata = reader.metadata
        except Exception as e:
            return {"error": str(e)}

        # Honour the declared dict return type when there is no metadata.
        return metadata if metadata is not None else {}