Spaces:
Runtime error
Runtime error
| import fitz | |
| import pymupdf4llm | |
| from pydantic import BaseModel | |
| from pathlib import Path | |
| from typing import List, Optional | |
| import logging | |
| logger = logging.getLogger(__name__) | |
class PDFMetadata(BaseModel):
    """Descriptive information attached to one extracted PDF page."""

    # Path of the PDF the page came from (PyMuPDFExtractor passes str(pdf_path)).
    source: str
    # Page position; PyMuPDFExtractor stores it 1-based.
    page_number: int
    # Count of whitespace-separated tokens in the page's Markdown text.
    num_words: int
    # Optional title; PyMuPDFExtractor fills it with the PDF's file name.
    document_title: Optional[str] = None
class PDFEntry(BaseModel):
    """Content of a single PDF page together with its metadata."""

    # Identifier of the entry; PyMuPDFExtractor uses "<file name>_page_<n>".
    id: str
    # Page content rendered as Markdown.
    markdown_text: str
    # Provenance and size details for this page.
    metadata: PDFMetadata
class BasePDFExtractorConfig(BaseModel):
    """Settings shared by every PDF extractor configuration."""

    # File extension handled by the extractor; defaults to "pdf".
    extension: str = "pdf"
class PyMuPDFExtractorConfig(BasePDFExtractorConfig):
    """Settings for the extractor backed by PyMuPDF."""

    # Name identifying this extractor backend; defaults to "pymupdf".
    name: str = "pymupdf"
class BasePDFExtractor:
    """Common parent for PDF extractor implementations."""

    def __init__(self, config: BasePDFExtractorConfig):
        """Keep the supplied configuration on the instance as ``self.config``."""
        self.config = config

    def extract(self, pdf_path: Path) -> List[PDFEntry]:
        """Extract the content of the PDF located at *pdf_path*.

        Raises:
            NotImplementedError: Always; concrete extractors override this.
        """
        raise NotImplementedError("This method should be implemented by subclasses")
class PyMuPDFExtractor(BasePDFExtractor):
    """PDF extractor that renders each page to Markdown via PyMuPDF."""

    def __init__(self, config: PyMuPDFExtractorConfig):
        """Initialize the extractor with its PyMuPDF-specific configuration."""
        super().__init__(config)

    def extract(self, pdf_path: Path) -> List[PDFEntry]:
        """Extract one Markdown entry per page of the given PDF.

        Args:
            pdf_path: Location of the PDF file to read.

        Returns:
            One ``PDFEntry`` per page, in page order. When the file cannot
            be found or extraction fails, the failure is logged and an
            empty list is returned so callers can always iterate the result.
        """
        pdf_file_path = str(pdf_path)
        entries: List[PDFEntry] = []
        try:
            pdf_name = pdf_path.name
            # The context manager closes the document on every exit path,
            # including when converting a page raises.
            with fitz.open(pdf_file_path) as doc:
                logger.info(f"Extracting content from {pdf_file_path}")
                total_pages = len(doc)
                for page_num in range(total_pages):
                    page_label = page_num + 1  # stored and logged 1-based
                    logger.info(f"Processing page: {page_label}/{total_pages}")
                    # Convert only the current page so each entry is one page.
                    markdown_text = pymupdf4llm.to_markdown(doc, pages=[page_num])
                    entries.append(
                        PDFEntry(
                            id=f"{pdf_name}_page_{page_label}",
                            markdown_text=markdown_text,
                            metadata=PDFMetadata(
                                source=pdf_file_path,
                                page_number=page_label,
                                num_words=len(markdown_text.split()),
                                document_title=pdf_name,
                            ),
                        )
                    )
        except fitz.FileNotFoundError:
            logger.error(f"Error: PDF file not found at '{pdf_file_path}'")
            return []
        except Exception as e:
            # Best-effort boundary: record the traceback and discard any
            # partially extracted pages rather than propagate the error.
            logger.exception(f"An error occurred: {e}")
            return []
        return entries