Spaces:

XQ
/

Dokumentassistent

Running

init

31a2688 about 2 months ago

1.66 kB

	"""PDF parsing module using PyMuPDF (fitz)."""

	import logging
	import os

	import fitz # PyMuPDF

	logger = logging.getLogger(__name__)


	class PDFParser:
	"""Parses PDF files and extracts raw text with metadata."""

	def parse(self, file_path: str) -> list[dict[str, str \| int]]:
	"""Extract text and metadata from a PDF file.

	Args:
	file_path: Path to the PDF file.

	Returns:
	List of dicts, each containing 'text', 'page_number',
	and 'source' keys.

	Raises:
	FileNotFoundError: If the PDF file does not exist.
	ValueError: If the file is not a valid PDF.
	"""
	if not os.path.exists(file_path):
	raise FileNotFoundError(f"PDF file not found: {file_path}")

	if not file_path.lower().endswith(".pdf"):
	raise ValueError(f"File is not a PDF: {file_path}")

	logger.info("Parsing PDF: %s", file_path)
	source = os.path.basename(file_path)
	pages: list[dict[str, str \| int]] = []

	try:
	doc = fitz.open(file_path)
	except Exception as exc:
	raise ValueError(f"Failed to open PDF: {file_path}") from exc

	try:
	for page_num, page in enumerate(doc, start=1):
	text = page.get_text()
	if text.strip():
	pages.append({
	"text": text,
	"page_number": page_num,
	"source": source,
	})
	finally:
	doc.close()

	logger.info("Extracted %d pages from %s", len(pages), source)
	return pages