Spaces:
Running
Running
| """PDF parsing module using PyMuPDF (fitz).""" | |
| import logging | |
| import os | |
| import fitz # PyMuPDF | |
| logger = logging.getLogger(__name__) | |
| class PDFParser: | |
| """Parses PDF files and extracts raw text with metadata.""" | |
| def parse(self, file_path: str) -> list[dict[str, str | int]]: | |
| """Extract text and metadata from a PDF file. | |
| Args: | |
| file_path: Path to the PDF file. | |
| Returns: | |
| List of dicts, each containing 'text', 'page_number', | |
| and 'source' keys. | |
| Raises: | |
| FileNotFoundError: If the PDF file does not exist. | |
| ValueError: If the file is not a valid PDF. | |
| """ | |
| if not os.path.exists(file_path): | |
| raise FileNotFoundError(f"PDF file not found: {file_path}") | |
| if not file_path.lower().endswith(".pdf"): | |
| raise ValueError(f"File is not a PDF: {file_path}") | |
| logger.info("Parsing PDF: %s", file_path) | |
| source = os.path.basename(file_path) | |
| pages: list[dict[str, str | int]] = [] | |
| try: | |
| doc = fitz.open(file_path) | |
| except Exception as exc: | |
| raise ValueError(f"Failed to open PDF: {file_path}") from exc | |
| try: | |
| for page_num, page in enumerate(doc, start=1): | |
| text = page.get_text() | |
| if text.strip(): | |
| pages.append({ | |
| "text": text, | |
| "page_number": page_num, | |
| "source": source, | |
| }) | |
| finally: | |
| doc.close() | |
| logger.info("Extracted %d pages from %s", len(pages), source) | |
| return pages | |