Spaces:
Running
Running
| """PDF extraction module for the RAG pipeline. | |
| This module provides tools for extracting text and structure from PDF | |
| documents and converting them to well-structured Markdown format | |
| suitable for chunking and embedding. | |
| Components: | |
| Data Models: | |
| - ExtractedPage: Represents a single page with content, tables, images | |
| - ExtractedDocument: Represents a complete extracted PDF document | |
| Protocols: | |
| - Extractor: Synchronous protocol for PDF extraction implementations | |
| - AsyncExtractor: Asynchronous protocol for PDF extraction | |
| Implementations: | |
| - PDFExtractor: Main class for PDF document processing | |
| - MarkdownConverter: Converts extracted content to Markdown format | |
| Lazy Loading: | |
| Heavy dependencies (pymupdf4llm) are loaded on first access using | |
| __getattr__. This ensures fast import times when the extraction | |
| module is not immediately needed. Lightweight components (models, | |
| protocols) are also lazy-loaded for consistency. | |
| Example: | |
| ------- | |
| >>> from rag_chatbot.extraction import PDFExtractor, ExtractedDocument | |
| >>> from pathlib import Path | |
| >>> | |
| >>> # Extract a PDF document | |
| >>> extractor = PDFExtractor() | |
| >>> doc = extractor.extract(Path("document.pdf")) | |
| >>> print(f"Extracted {doc.page_count} pages") | |
| >>> | |
| >>> # Access individual pages | |
| >>> for page in doc.pages: | |
| ... print(f"Page {page.page_number}: {len(page.content)} chars") | |
| >>> | |
| >>> # Export to Markdown | |
| >>> markdown = doc.to_markdown() | |
| """ | |
| from __future__ import annotations | |
| from typing import TYPE_CHECKING | |
| # ============================================================================= | |
| # Type Checking Imports | |
| # ============================================================================= | |
| # These imports are only processed by type checkers (mypy, pyright) and IDEs. | |
| # They enable proper type hints and autocompletion without runtime overhead. | |
| # ============================================================================= | |
| if TYPE_CHECKING: | |
| from rag_chatbot.extraction.base import AsyncExtractor, Extractor | |
| from rag_chatbot.extraction.markdown_converter import MarkdownConverter | |
| from rag_chatbot.extraction.models import ExtractedDocument, ExtractedPage | |
| from rag_chatbot.extraction.pdf_extractor import PDFExtractor | |
# =============================================================================
# Module Exports
# =============================================================================
# __all__ declares the public API of this package and is the list consulted
# by `from rag_chatbot.extraction import *`.
# Each name is resolved by the module-level __getattr__ when it is accessed,
# so a star import ends up loading every export listed here.
# =============================================================================
__all__: list[str] = [
    # Data Models
    "ExtractedPage",
    "ExtractedDocument",
    # Protocols
    "Extractor",
    "AsyncExtractor",
    # Implementations
    "PDFExtractor",
    "MarkdownConverter",
]
def __getattr__(name: str) -> object:
    """Resolve a public export lazily (module-level ``__getattr__`` hook).

    Python invokes this hook only when ``name`` is not already present in the
    module namespace. Each recognised name imports the submodule that defines
    it at that moment, so no extraction submodule is imported until one of
    its exports is actually requested. The resolved object is not stored on
    this module, so the hook runs again on every subsequent access.

    Exports by defining submodule:
        - ``rag_chatbot.extraction.models``: ExtractedPage, ExtractedDocument
        - ``rag_chatbot.extraction.base``: Extractor, AsyncExtractor
        - ``rag_chatbot.extraction.pdf_extractor``: PDFExtractor
        - ``rag_chatbot.extraction.markdown_converter``: MarkdownConverter

    Args:
    ----
        name: Attribute name being looked up on this module.

    Returns:
    -------
        The object exported under ``name``.

    Raises:
    ------
        AttributeError: If ``name`` is not one of the exports listed above.

    """
    # Every branch binds the requested object to the same local so the
    # function has exactly one successful exit point.
    if name == "ExtractedPage":
        from rag_chatbot.extraction.models import ExtractedPage as export
    elif name == "ExtractedDocument":
        from rag_chatbot.extraction.models import ExtractedDocument as export
    elif name == "Extractor":
        from rag_chatbot.extraction.base import Extractor as export
    elif name == "AsyncExtractor":
        from rag_chatbot.extraction.base import AsyncExtractor as export
    elif name == "PDFExtractor":
        # NOTE(review): presumably the import that pulls in the heavy PDF
        # dependency mentioned in the module docstring -- confirm.
        from rag_chatbot.extraction.pdf_extractor import PDFExtractor as export
    elif name == "MarkdownConverter":
        from rag_chatbot.extraction.markdown_converter import (
            MarkdownConverter as export,
        )
    else:
        # Not a known export: report it with the same wording the interpreter
        # uses for a missing module attribute.
        detail = f"module {__name__!r} has no attribute {name!r}"
        raise AttributeError(detail)
    return export