Spaces:
Runtime error
Runtime error
| import PyPDF2 | |
| from typing import Optional | |
class PDFProcessor:
    """Handles PDF file processing and text extraction.

    The class keeps no state: each method opens the file at the given path,
    reads it through ``PyPDF2.PdfReader`` and closes it before returning.
    """

    def __init__(self):
        """Initialize PDF processor (there is nothing to configure)."""

    def extract_text(self, pdf_path: str) -> str:
        """
        Extract text content from a PDF file.

        The text of every page is joined with a newline and the final result
        is stripped of leading/trailing whitespace.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            str: Extracted text content

        Raises:
            FileNotFoundError: If PDF file doesn't exist
            PyPDF2.errors.PdfReadError: If PDF file is invalid or corrupted
            Exception: For any other failure while processing the file
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # ``or ""`` keeps a page that yields no text (or None) from
                # breaking the join below.
                page_texts = [page.extract_text() or "" for page in reader.pages]
                return "\n".join(page_texts).strip()
        except FileNotFoundError as e:
            raise FileNotFoundError(f"PDF file not found: {pdf_path}") from e
        # PdfReadError lives in ``PyPDF2.errors``; the top-level package does
        # not export it, so ``PyPDF2.PdfReadError`` would itself raise
        # AttributeError and mask the real failure.
        except PyPDF2.errors.PdfReadError as e:
            raise PyPDF2.errors.PdfReadError(
                f"Error reading PDF file: {str(e)}"
            ) from e
        except Exception as e:
            raise Exception(f"Unexpected error processing PDF: {str(e)}") from e

    def get_metadata(self, pdf_path: str) -> dict:
        """
        Extract metadata from PDF file.

        This method is best-effort: it never raises. Any failure is reported
        through the returned mapping instead.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            dict: PDF metadata as provided by ``PdfReader.metadata``; an empty
            dict when the reader reports no metadata; or ``{"error": message}``
            when the file could not be opened or parsed.
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                metadata = reader.metadata
        except Exception as e:
            return {"error": str(e)}
        # NOTE(review): the metadata object is handed out after the file is
        # closed -- confirm PyPDF2 does not need the stream to resolve values.
        # The reader may report None; honour the declared ``dict`` return type.
        return metadata if metadata is not None else {}