File size: 1,661 Bytes
31a2688
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""PDF parsing module using PyMuPDF (fitz)."""

import logging
import os

import fitz  # PyMuPDF

logger = logging.getLogger(__name__)


class PDFParser:
    """Parses PDF files and extracts raw text with metadata."""

    def parse(self, file_path: str) -> list[dict[str, str | int]]:
        """Extract text and metadata from a PDF file.

        Args:
            file_path: Path to the PDF file.

        Returns:
            List of dicts, each containing 'text', 'page_number',
            and 'source' keys.

        Raises:
            FileNotFoundError: If the PDF file does not exist.
            ValueError: If the file is not a valid PDF.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"PDF file not found: {file_path}")

        if not file_path.lower().endswith(".pdf"):
            raise ValueError(f"File is not a PDF: {file_path}")

        logger.info("Parsing PDF: %s", file_path)
        source = os.path.basename(file_path)
        pages: list[dict[str, str | int]] = []

        try:
            doc = fitz.open(file_path)
        except Exception as exc:
            raise ValueError(f"Failed to open PDF: {file_path}") from exc

        try:
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text()
                if text.strip():
                    pages.append({
                        "text": text,
                        "page_number": page_num,
                        "source": source,
                    })
        finally:
            doc.close()

        logger.info("Extracted %d pages from %s", len(pages), source)
        return pages