sadickam's picture
Initial commit for HF Space
3326079
"""PDF extraction module for the RAG pipeline.
This module provides tools for extracting text and structure from PDF
documents and converting them to well-structured Markdown format
suitable for chunking and embedding.
Components:
Data Models:
- ExtractedPage: Represents a single page with content, tables, images
- ExtractedDocument: Represents a complete extracted PDF document
Protocols:
- Extractor: Synchronous protocol for PDF extraction implementations
- AsyncExtractor: Asynchronous protocol for PDF extraction
Implementations:
- PDFExtractor: Main class for PDF document processing
- MarkdownConverter: Converts extracted content to Markdown format
Lazy Loading:
Heavy dependencies (pymupdf4llm) are loaded on first access using
__getattr__. This ensures fast import times when the extraction
module is not immediately needed. Lightweight components (models,
protocols) are also lazy-loaded for consistency.
Example:
-------
>>> from rag_chatbot.extraction import PDFExtractor, ExtractedDocument
>>> from pathlib import Path
>>>
>>> # Extract a PDF document
>>> extractor = PDFExtractor()
>>> doc = extractor.extract(Path("document.pdf"))
>>> print(f"Extracted {doc.page_count} pages")
>>>
>>> # Access individual pages
>>> for page in doc.pages:
...     print(f"Page {page.page_number}: {len(page.content)} chars")
>>>
>>> # Export to Markdown
>>> markdown = doc.to_markdown()
"""
from __future__ import annotations
from typing import TYPE_CHECKING
# =============================================================================
# Type Checking Imports
# =============================================================================
# These imports are only processed by type checkers (mypy, pyright) and IDEs.
# They enable proper type hints and autocompletion without runtime overhead.
# =============================================================================
if TYPE_CHECKING:
from rag_chatbot.extraction.base import AsyncExtractor, Extractor
from rag_chatbot.extraction.markdown_converter import MarkdownConverter
from rag_chatbot.extraction.models import ExtractedDocument, ExtractedPage
from rag_chatbot.extraction.pdf_extractor import PDFExtractor
# =============================================================================
# Module Exports
# =============================================================================
# The __all__ list defines what is exported by `from rag_chatbot.extraction import *`.
# All exports are lazy-loaded on first access for fast import times.
# =============================================================================
# Public names of this package. Each one is resolved on demand by the
# module-level __getattr__ in this file, which raises AttributeError for any
# other name, so every entry listed here needs a matching branch there.
__all__: list[str] = [
    # Data Models (defined in rag_chatbot.extraction.models)
    "ExtractedPage",
    "ExtractedDocument",
    # Protocols (defined in rag_chatbot.extraction.base)
    "Extractor",
    "AsyncExtractor",
    # Implementations (pdf_extractor and markdown_converter submodules)
    "PDFExtractor",
    "MarkdownConverter",
]
def __getattr__(name: str) -> object:
    """Resolve a public export of this package on demand.

    Python calls this module-level hook only after normal attribute lookup
    on the module has failed. The submodule that defines the requested name
    is imported inside the branch that handles it, not at the top of this
    module.

    Recognised names and where they come from:
        - ``ExtractedPage``, ``ExtractedDocument``:
          ``rag_chatbot.extraction.models``
        - ``Extractor``, ``AsyncExtractor``:
          ``rag_chatbot.extraction.base``
        - ``PDFExtractor``:
          ``rag_chatbot.extraction.pdf_extractor``
        - ``MarkdownConverter``:
          ``rag_chatbot.extraction.markdown_converter``

    Args:
    ----
        name: The attribute name that was looked up on the module.

    Returns:
    -------
        The object bound to ``name`` in its defining submodule.

    Raises:
    ------
        AttributeError: If ``name`` is not one of the names listed above.

    Example:
    -------
        >>> from rag_chatbot.extraction import PDFExtractor
        >>> # pdf_extractor is imported by the line above, via this hook

    """
    # Single exit point: every recognised branch binds ``resolved``; the
    # final ``else`` raises, so ``resolved`` is always set when we return.
    resolved: object

    # The comparisons are mutually exclusive, so branch order is irrelevant.
    # ---------------------------------------------------------------------
    # Implementations - may pull in heavy dependencies
    # ---------------------------------------------------------------------
    if name == "MarkdownConverter":
        from rag_chatbot.extraction.markdown_converter import MarkdownConverter

        resolved = MarkdownConverter
    elif name == "PDFExtractor":
        from rag_chatbot.extraction.pdf_extractor import PDFExtractor

        resolved = PDFExtractor
    # ---------------------------------------------------------------------
    # Protocols
    # ---------------------------------------------------------------------
    elif name == "AsyncExtractor":
        from rag_chatbot.extraction.base import AsyncExtractor

        resolved = AsyncExtractor
    elif name == "Extractor":
        from rag_chatbot.extraction.base import Extractor

        resolved = Extractor
    # ---------------------------------------------------------------------
    # Data models
    # ---------------------------------------------------------------------
    elif name == "ExtractedDocument":
        from rag_chatbot.extraction.models import ExtractedDocument

        resolved = ExtractedDocument
    elif name == "ExtractedPage":
        from rag_chatbot.extraction.models import ExtractedPage

        resolved = ExtractedPage
    # ---------------------------------------------------------------------
    # Anything else is not an export of this package
    # ---------------------------------------------------------------------
    else:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    return resolved