# Demos/backend/classes/pdf_extractor.py
# nikhile-galileo's picture
# Adding finance protect demo
# e68d535
# raw
# history blame
# 2.71 kB
import fitz
import pymupdf4llm
from pydantic import BaseModel
from pathlib import Path
from typing import List, Optional
import logging
# Module-level logger, named after this module; the extractor below reports
# per-file and per-page progress through it.
logger = logging.getLogger(__name__)
class PDFMetadata(BaseModel):
    """Descriptive information attached to a piece of extracted PDF content.

    Fields:
        source: Origin of the content, as a string.
        page_number: Page the content belongs to.
        num_words: Number of words in the extracted text.
        document_title: Optional document title; ``None`` when not supplied.
    """

    source: str
    page_number: int
    num_words: int
    document_title: Optional[str] = None
class PDFEntry(BaseModel):
    """One page of extracted PDF content together with its metadata.

    Fields:
        id: Identifier of the entry.
        markdown_text: Extracted page content as Markdown text.
        metadata: ``PDFMetadata`` describing the page.
    """

    id: str
    markdown_text: str
    metadata: PDFMetadata
class BasePDFExtractorConfig(BaseModel):
    """Settings shared by every PDF extractor.

    Fields:
        extension: File extension handled by the extractor; defaults to
            ``"pdf"``.
    """

    extension: str = "pdf"
class PyMuPDFExtractorConfig(BasePDFExtractorConfig):
    """Settings for the PyMuPDF-backed extractor.

    Fields:
        name: Name of the extractor backend; defaults to ``"pymupdf"``.
    """

    name: str = "pymupdf"
class BasePDFExtractor:
    """Common parent of all PDF extractors; keeps the configuration object."""

    def __init__(self, config: BasePDFExtractorConfig):
        """Store *config* on the instance as ``self.config``."""
        self.config = config

    def extract(self, pdf_path: Path) -> List[PDFEntry]:
        """Extract entries from the PDF located at *pdf_path*.

        Raises:
            NotImplementedError: Always; subclasses provide the real
                implementation.
        """
        message = "This method should be implemented by subclasses"
        raise NotImplementedError(message)
class PyMuPDFExtractor(BasePDFExtractor):
    """PDF extractor that converts each page to Markdown using PyMuPDF.

    Pages are rendered one at a time with ``pymupdf4llm.to_markdown`` so that
    every page becomes its own ``PDFEntry``.
    """

    def __init__(self, config: PyMuPDFExtractorConfig):
        """Initialize the extractor with its PyMuPDF-specific configuration."""
        super().__init__(config)

    def extract(self, pdf_path: Path) -> List[PDFEntry]:
        """Extract one Markdown entry per page from the PDF at *pdf_path*.

        Args:
            pdf_path: Location of the PDF file. A plain string path is
                accepted as well.

        Returns:
            One ``PDFEntry`` per page, in page order, with 1-based page
            numbers in both the entry id and its metadata. When the file
            cannot be found or extraction fails, the problem is logged and an
            empty list is returned, so the call stays best-effort and always
            honours the declared return type.
        """
        pdf_file_path = str(pdf_path)
        entries: List[PDFEntry] = []
        try:
            # The context manager guarantees the document handle is released,
            # even when converting one of the pages raises.
            with fitz.open(pdf_file_path) as doc:
                pdf_name = Path(pdf_path).name
                logger.info("Extracting content from %s", pdf_file_path)
                total_pages = len(doc)
                for page_index in range(total_pages):
                    page_number = page_index + 1  # human-facing, 1-based
                    logger.info(
                        "Processing page: %s/%s", page_number, total_pages
                    )
                    # pymupdf4llm expects 0-based page indices.
                    markdown_text = pymupdf4llm.to_markdown(
                        doc, pages=[page_index]
                    )
                    entries.append(
                        PDFEntry(
                            id=f"{pdf_name}_page_{page_number}",
                            markdown_text=markdown_text,
                            metadata=PDFMetadata(
                                source=pdf_file_path,
                                page_number=page_number,
                                num_words=len(markdown_text.split()),
                                document_title=pdf_name,
                            ),
                        )
                    )
        except fitz.FileNotFoundError:
            logger.error("Error: PDF file not found at '%s'", pdf_file_path)
            return []
        except Exception as exc:
            # Deliberately broad: extraction is best-effort, but the failure
            # is recorded with its traceback instead of being printed.
            logger.exception("An error occurred: %s", exc)
            return []
        return entries