"""Page-by-page PDF to markdown extraction built on PyMuPDF and pymupdf4llm."""

import logging
from pathlib import Path
from typing import List, Optional

import fitz
import pymupdf4llm
from pydantic import BaseModel

logger = logging.getLogger(__name__)


class PDFMetadata(BaseModel):
    """Metadata for extracted PDF content."""

    # Path of the PDF the page was read from, as a string.
    source: str
    # 1-based page number within the document.
    page_number: int
    # Count of whitespace-separated tokens in the page's markdown text.
    num_words: int
    # Optional title; PyMuPDFExtractor fills it with the PDF file name.
    document_title: Optional[str] = None


class PDFEntry(BaseModel):
    """Represents a single page of extracted PDF content."""

    # Identifier of the form "<file name>_page_<1-based page number>".
    id: str
    # Markdown rendering of the page.
    markdown_text: str
    metadata: PDFMetadata


class BasePDFExtractorConfig(BaseModel):
    """Base configuration for PDF extractors."""

    extension: str = "pdf"


class PyMuPDFExtractorConfig(BasePDFExtractorConfig):
    """Configuration for PyMuPDF-based extractor."""

    name: str = "pymupdf"


class BasePDFExtractor:
    """Base class for PDF extractors."""

    def __init__(self, config: BasePDFExtractorConfig):
        """Initialize the PDF extractor with configuration."""
        self.config = config

    def extract(self, pdf_path: Path) -> List[PDFEntry]:
        """Extract text from a PDF file.

        Subclasses must override this method.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError("This method should be implemented by subclasses")


class PyMuPDFExtractor(BasePDFExtractor):
    """PDF extractor using PyMuPDF library."""

    def __init__(self, config: PyMuPDFExtractorConfig):
        """Initialize the extractor with its PyMuPDF configuration."""
        super().__init__(config)

    def extract(self, pdf_path: Path) -> List[PDFEntry]:
        """Extract text from PDF using PyMuPDF.

        Each page is converted to markdown separately and wrapped in a
        ``PDFEntry`` carrying its 1-based page number and word count.

        Args:
            pdf_path: Location of the PDF file. A plain string is accepted
                and converted to ``Path``.

        Returns:
            One ``PDFEntry`` per page, in page order. If the file cannot be
            found or any other error occurs, the error is logged and an
            empty list is returned; partial results are discarded.
        """
        pdf_path = Path(pdf_path)
        pdf_file_path = str(pdf_path)
        pdf_name = pdf_path.name

        try:
            # The context manager guarantees the document handle is released
            # even when a page fails to convert.
            with fitz.open(pdf_file_path) as doc:
                logger.info("Extracting content from %s", pdf_file_path)
                total_pages = len(doc)
                entries: List[PDFEntry] = []

                for page_index in range(total_pages):
                    page_number = page_index + 1
                    logger.info("Processing page: %d/%d", page_number, total_pages)

                    # pymupdf4llm takes 0-based page indices.
                    markdown_text = pymupdf4llm.to_markdown(doc, pages=[page_index])

                    entries.append(
                        PDFEntry(
                            id=f"{pdf_name}_page_{page_number}",
                            markdown_text=markdown_text,
                            metadata=PDFMetadata(
                                source=pdf_file_path,
                                page_number=page_number,
                                num_words=len(markdown_text.split()),
                                document_title=pdf_name,
                            ),
                        )
                    )

                return entries
        except fitz.FileNotFoundError:
            logger.error("Error: PDF file not found at '%s'", pdf_file_path)
        except Exception as e:
            # Best-effort boundary: log with traceback rather than crash the caller.
            logger.exception("An error occurred: %s", e)

        # Honour the declared return type on failure instead of returning None.
        return []