Dokumentassistent / src /ingestion /pdf_parser.py
XQ
init
31a2688
raw
history blame
1.66 kB
"""PDF parsing module using PyMuPDF (fitz)."""
import logging
import os
import fitz # PyMuPDF
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class PDFParser:
"""Parses PDF files and extracts raw text with metadata."""
def parse(self, file_path: str) -> list[dict[str, str | int]]:
"""Extract text and metadata from a PDF file.
Args:
file_path: Path to the PDF file.
Returns:
List of dicts, each containing 'text', 'page_number',
and 'source' keys.
Raises:
FileNotFoundError: If the PDF file does not exist.
ValueError: If the file is not a valid PDF.
"""
if not os.path.exists(file_path):
raise FileNotFoundError(f"PDF file not found: {file_path}")
if not file_path.lower().endswith(".pdf"):
raise ValueError(f"File is not a PDF: {file_path}")
logger.info("Parsing PDF: %s", file_path)
source = os.path.basename(file_path)
pages: list[dict[str, str | int]] = []
try:
doc = fitz.open(file_path)
except Exception as exc:
raise ValueError(f"Failed to open PDF: {file_path}") from exc
try:
for page_num, page in enumerate(doc, start=1):
text = page.get_text()
if text.strip():
pages.append({
"text": text,
"page_number": page_num,
"source": source,
})
finally:
doc.close()
logger.info("Extracted %d pages from %s", len(pages), source)
return pages