Spaces:

XQ
/

Dokumentassistent

Running

File size: 1,661 Bytes

31a2688

"""PDF parsing module using PyMuPDF (fitz)."""

import logging
import os

import fitz  # PyMuPDF

logger = logging.getLogger(__name__)


class PDFParser:
    """Parses PDF files and extracts raw text with metadata."""

    def parse(self, file_path: str) -> list[dict[str, str | int]]:
        """Extract text and metadata from a PDF file.

        Args:
            file_path: Path to the PDF file.

        Returns:
            List of dicts, each containing 'text', 'page_number',
            and 'source' keys.

        Raises:
            FileNotFoundError: If the PDF file does not exist.
            ValueError: If the file is not a valid PDF.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"PDF file not found: {file_path}")

        if not file_path.lower().endswith(".pdf"):
            raise ValueError(f"File is not a PDF: {file_path}")

        logger.info("Parsing PDF: %s", file_path)
        source = os.path.basename(file_path)
        pages: list[dict[str, str | int]] = []

        try:
            doc = fitz.open(file_path)
        except Exception as exc:
            raise ValueError(f"Failed to open PDF: {file_path}") from exc

        try:
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text()
                if text.strip():
                    pages.append({
                        "text": text,
                        "page_number": page_num,
                        "source": source,
                    })
        finally:
            doc.close()

        logger.info("Extracted %d pages from %s", len(pages), source)
        return pages