File size: 2,714 Bytes
e68d535
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import fitz
import pymupdf4llm
from pydantic import BaseModel
from pathlib import Path
from typing import List, Optional
import logging

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class PDFMetadata(BaseModel):
    """Metadata for extracted PDF content."""
    # Origin of the content; PyMuPDFExtractor.extract stores the PDF path as a string.
    source: str
    # Page the content belongs to; PyMuPDFExtractor.extract uses 1-based numbers.
    page_number: int
    # Word count; PyMuPDFExtractor.extract sets it to the number of
    # whitespace-separated tokens in the page's Markdown text.
    num_words: int
    # Optional title; PyMuPDFExtractor.extract fills it with the PDF's file name.
    document_title: Optional[str] = None


class PDFEntry(BaseModel):
    """Represents a single page of extracted PDF content."""
    # Identifier of the entry; PyMuPDFExtractor.extract builds it as
    # "<file name>_page_<1-based page number>".
    id: str
    # Markdown rendering of the page.
    markdown_text: str
    # Descriptive information about the page (source, page number, word count, title).
    metadata: PDFMetadata


class BasePDFExtractorConfig(BaseModel):
    """Base configuration for PDF extractors."""
    # File extension associated with the extractor. Not read by the code visible
    # here; presumably callers use it to select input files — TODO confirm.
    extension: str = "pdf"


class PyMuPDFExtractorConfig(BasePDFExtractorConfig):
    """Configuration for PyMuPDF-based extractor."""
    # Name of the extraction backend. Not read by the code visible here;
    # presumably used to pick the extractor — TODO confirm with callers.
    name: str = "pymupdf"


class BasePDFExtractor:
    """Common parent for PDF extractors; subclasses supply ``extract``."""

    def __init__(self, config: BasePDFExtractorConfig):
        """Keep the supplied configuration on the instance as ``config``."""
        self.config = config

    def extract(self, pdf_path: Path) -> List[PDFEntry]:
        """Turn the PDF at ``pdf_path`` into a list of ``PDFEntry`` objects.

        The base implementation is a placeholder and always raises.

        Raises:
            NotImplementedError: Always; concrete extractors override this.
        """
        message = "This method should be implemented by subclasses"
        raise NotImplementedError(message)


class PyMuPDFExtractor(BasePDFExtractor):
    """PDF extractor that converts each page to Markdown via PyMuPDF.

    Pages are converted one at a time with ``pymupdf4llm.to_markdown`` so that
    every page becomes its own ``PDFEntry``.
    """

    def __init__(self, config: PyMuPDFExtractorConfig):
        """Store the PyMuPDF-specific configuration on the instance."""
        super().__init__(config)

    def extract(self, pdf_path: Path) -> List[PDFEntry]:
        """Extract one Markdown entry per page of the given PDF.

        Args:
            pdf_path: Location of the PDF file to read.

        Returns:
            One ``PDFEntry`` per page, in page order, with 1-based page
            numbers. An empty list is returned when the file cannot be
            opened or any page fails to convert; the failure is logged
            rather than raised, and partial results are discarded.
        """
        pdf_file_path = str(pdf_path)
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(pdf_file_path) as doc:
                pdf_name = pdf_path.name
                entries: List[PDFEntry] = []
                logger.info("Extracting content from %s", pdf_file_path)
                total_pages = len(doc)
                for page_num in range(total_pages):
                    page_number = page_num + 1  # entries and logs are 1-based
                    logger.info(
                        "Processing page: %d/%d", page_number, total_pages
                    )
                    # Restrict the conversion to the current (0-based) page.
                    markdown_text = pymupdf4llm.to_markdown(
                        doc, pages=[page_num]
                    )

                    metadata = PDFMetadata(
                        source=pdf_file_path,
                        page_number=page_number,
                        num_words=len(markdown_text.split()),
                        document_title=pdf_name,
                    )
                    entries.append(
                        PDFEntry(
                            id=f"{pdf_name}_page_{page_number}",
                            markdown_text=markdown_text,
                            metadata=metadata,
                        )
                    )
            return entries
        except fitz.FileNotFoundError:
            logger.error("Error: PDF file not found at '%s'", pdf_file_path)
        except Exception as e:
            # Best-effort boundary: keep the original "do not raise" contract,
            # but record the traceback instead of printing only the message.
            logger.exception("An error occurred: %s", e)
        # Honour the declared return type on failure instead of returning None.
        return []