Spaces:
Runtime error
Runtime error
File size: 2,714 Bytes
e68d535 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
import fitz
import pymupdf4llm
from pydantic import BaseModel
from pathlib import Path
from typing import List, Optional
import logging
logger = logging.getLogger(__name__)
class PDFMetadata(BaseModel):
    """Descriptive information stored alongside one extracted PDF page.

    Attributes:
        source: Required string identifying where the page came from.
        page_number: Required integer page number.
        num_words: Required integer word count for the page.
        document_title: Optional title string; ``None`` when not supplied.
    """

    # Required fields (no defaults).
    source: str
    page_number: int
    num_words: int

    # Optional field.
    document_title: Optional[str] = None
class PDFEntry(BaseModel):
    """One page of PDF content after extraction.

    Attributes:
        id: String identifier for the entry.
        markdown_text: The page content as a Markdown string.
        metadata: A ``PDFMetadata`` record describing the page.
    """

    id: str
    markdown_text: str
    metadata: PDFMetadata
class BasePDFExtractorConfig(BaseModel):
    """Settings shared by every PDF extractor configuration.

    Attributes:
        extension: File extension string; defaults to ``"pdf"``.
    """

    extension: str = "pdf"
class PyMuPDFExtractorConfig(BasePDFExtractorConfig):
    """Settings for the PyMuPDF-backed extractor.

    Inherits ``extension`` from ``BasePDFExtractorConfig``.

    Attributes:
        name: Extractor name string; defaults to ``"pymupdf"``.
    """

    name: str = "pymupdf"
class BasePDFExtractor:
    """Parent class for PDF extractors; keeps the configuration object.

    Concrete extractors override :meth:`extract`.
    """

    def __init__(self, config: BasePDFExtractorConfig):
        """Remember ``config`` on the instance as ``self.config``."""
        self.config = config

    def extract(self, pdf_path: Path) -> List[PDFEntry]:
        """Extract the content of the PDF at ``pdf_path``.

        This base implementation performs no extraction.

        Raises:
            NotImplementedError: Always; subclasses supply the real logic.
        """
        message = "This method should be implemented by subclasses"
        raise NotImplementedError(message)
class PyMuPDFExtractor(BasePDFExtractor):
    """PDF extractor that converts each page to Markdown via PyMuPDF."""

    def __init__(self, config: PyMuPDFExtractorConfig):
        """Pass the PyMuPDF-specific configuration on to the base class."""
        super().__init__(config)

    def extract(self, pdf_path: Path) -> List[PDFEntry]:
        """Extract every page of a PDF as a Markdown ``PDFEntry``.

        Each page is converted separately with ``pymupdf4llm.to_markdown`` so
        that one entry corresponds to exactly one page.

        Args:
            pdf_path: Location of the PDF file. A plain string path is
                accepted as well as a ``Path``.

        Returns:
            One ``PDFEntry`` per page, in page order. Entry ids have the form
            ``"<file name>_page_<n>"`` with ``n`` starting at 1; the metadata
            carries the path string, the 1-based page number, the
            whitespace-separated word count of the Markdown text, and the
            file name as ``document_title``. An empty list is returned when
            the file cannot be found or any other error occurs; the failure
            is reported through the module logger and partial results are
            discarded.
        """
        pdf_file_path = str(pdf_path)
        try:
            pdf_name = Path(pdf_path).name
            entries: List[PDFEntry] = []
            # The context manager closes the document (and its file handle)
            # on both the success path and when a page conversion raises.
            with fitz.open(pdf_file_path) as doc:
                logger.info("Extracting content from %s", pdf_file_path)
                total_pages = len(doc)
                for page_index in range(total_pages):
                    page_number = page_index + 1  # human-facing, 1-based
                    logger.info(
                        "Processing page: %d/%d", page_number, total_pages
                    )
                    # pymupdf4llm takes 0-based page indices.
                    markdown_text = pymupdf4llm.to_markdown(
                        doc, pages=[page_index]
                    )
                    entries.append(
                        PDFEntry(
                            id=f"{pdf_name}_page_{page_number}",
                            markdown_text=markdown_text,
                            metadata=PDFMetadata(
                                source=pdf_file_path,
                                page_number=page_number,
                                num_words=len(markdown_text.split()),
                                document_title=pdf_name,
                            ),
                        )
                    )
            return entries
        except fitz.FileNotFoundError:
            logger.error("Error: PDF file not found at '%s'", pdf_file_path)
        except Exception:
            # Best-effort extraction: record the traceback instead of
            # propagating, and fall through to the empty result.
            logger.exception(
                "An error occurred while extracting '%s'", pdf_file_path
            )
        # Honour the declared return type on failure rather than returning
        # None implicitly.
        return []
|