"""Structure-aware chunking module for the RAG pipeline.

This module provides tools for splitting Markdown documents into
semantically meaningful chunks suitable for embedding and retrieval.
The chunking process preserves document structure by:
    - Respecting heading boundaries
    - Inheriting parent headings for context
    - Maintaining code block integrity
    - Keeping related content together
    - Attaching lists to preceding paragraphs
    - Converting tables to natural language descriptions

Components:
    - Chunk: Pydantic model representing a document chunk with metadata
    - ChunkingConfig: Configuration parameters for chunking
    - TextNormalizer: Text normalization utilities for PDF extraction artifacts
    - HeadingParser: Parser for extracting headings and content blocks
    - ListDetector / TableDetector: Detectors for lists and tables in Markdown
    - ListAttacher: Attaches lists to preceding paragraph context
    - TableConverter: Converts markdown tables to natural language
    - ContentProcessor: Unified processor for lists and tables
    - Chunker: Main orchestrator class for document chunking pipeline
    - ChunkingStrategy: Abstract base class for chunking strategies
    - MarkdownStrategy: Strategy for general Markdown documents
    - CodeStrategy: Strategy for code-heavy content
    - StructureAwareStrategy: Strategy combining heading, list, and table handling
    - Tokenizer / SlidingWindowChunker: Token-aware chunking utilities

Lazy Loading:
    Heavy dependencies are loaded on first access using __getattr__.
    This ensures fast import times when the chunking module is not
    immediately needed.

Example:
-------
    >>> from rag_chatbot.chunking import ChunkingConfig, Chunker, TextNormalizer
    >>> config = ChunkingConfig(min_tokens=450, max_tokens=700)
    >>> normalizer = TextNormalizer()
    >>> chunker = Chunker(config, normalizer=normalizer)
    >>> chunks = chunker.chunk_document(markdown_content, source="doc.pdf")

"""

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from .chunker import Chunker
    from .heading_parser import ContentBlock, HeadingParser, ParsedHeading
    from .list_table_attacher import (
        AttachedContent,
        ContentProcessor,
        ListAttacher,
        ListDetector,
        ParsedTable,
        TableConverter,
        TableDetector,
    )
    from .models import (
        THERMAL_COMFORT_TERMS,
        Chunk,
        ChunkingConfig,
        TextNormalizer,
    )
    from .strategies import (
        ChunkingStrategy,
        CodeStrategy,
        MarkdownStrategy,
        StructureAwareStrategy,
    )
    from .token_aware import SlidingWindowChunker, Tokenizer

# =============================================================================
# Module Exports
# =============================================================================
# Public API surface of the chunking package. Every name listed here must have
# a matching entry in the _LAZY_IMPORTS registry below so that star-imports
# and attribute access resolve through the same lazy-loading machinery.
__all__: list[str] = [
    # Data models (Step 3.1)
    "Chunk", "ChunkingConfig", "TextNormalizer", "THERMAL_COMFORT_TERMS",
    # Heading parser (Step 3.2)
    "ParsedHeading", "ContentBlock", "HeadingParser",
    # List and table attachment (Step 3.3)
    "AttachedContent", "ListDetector", "ListAttacher",
    "ParsedTable", "TableDetector", "TableConverter", "ContentProcessor",
    # Chunker pipeline (Step 3.5)
    "Chunker", "ChunkingStrategy", "MarkdownStrategy",
    "CodeStrategy", "StructureAwareStrategy",
    # Token-aware chunking (Step 3.4)
    "Tokenizer", "SlidingWindowChunker",
]


# =============================================================================
# Lazy Import Registry
# =============================================================================
# Maps each exported name to the (relative module, attribute) pair that
# provides it. __getattr__ consults this table so heavy dependencies are
# imported only on first use. Built from per-module groups; every attribute
# is re-exported under its own name, and group order preserves the original
# insertion order of the registry.
# =============================================================================
_LAZY_IMPORTS: dict[str, tuple[str, str]] = {
    export: (module, export)
    for module, exports in (
        # Data models (Step 3.1)
        (".models", ("Chunk", "ChunkingConfig", "TextNormalizer", "THERMAL_COMFORT_TERMS")),
        # Heading parser (Step 3.2)
        (".heading_parser", ("ParsedHeading", "ContentBlock", "HeadingParser")),
        # List and table attachment (Step 3.3)
        (
            ".list_table_attacher",
            (
                "AttachedContent",
                "ListDetector",
                "ListAttacher",
                "ParsedTable",
                "TableDetector",
                "TableConverter",
                "ContentProcessor",
            ),
        ),
        # Chunker pipeline (Step 3.5)
        (".chunker", ("Chunker",)),
        (".strategies", ("ChunkingStrategy", "MarkdownStrategy", "CodeStrategy", "StructureAwareStrategy")),
        # Token-aware chunking (Step 3.4)
        (".token_aware", ("Tokenizer", "SlidingWindowChunker")),
    )
    for export in exports
}


def __getattr__(name: str) -> object:
    """Lazily resolve module exports on first access (PEP 562).

    Called by the interpreter only when ``name`` is not already present in
    the module namespace. Looks the name up in the lazy import registry,
    imports the providing submodule, and caches the resolved attribute in
    ``globals()`` so subsequent accesses bypass this hook entirely.

    Args:
    ----
        name: The name of the attribute being accessed.

    Returns:
    -------
        The requested attribute if it exists in the lazy import registry.

    Raises:
    ------
        AttributeError: If the attribute is not a valid export.

    """
    if name in _LAZY_IMPORTS:
        module_path, attr_name = _LAZY_IMPORTS[name]
        # Local import keeps the module's top-level import time minimal;
        # importlib itself is already loaded by the interpreter, so this
        # is effectively free.
        import importlib

        module = importlib.import_module(module_path, package=__name__)
        attr = getattr(module, attr_name)
        # Cache the resolved object in the module namespace so that future
        # lookups are ordinary (fast) attribute access instead of repeated
        # trips through __getattr__ and the registry.
        globals()[name] = attr
        return attr

    msg = f"module {__name__!r} has no attribute {name!r}"  # pragma: no cover
    raise AttributeError(msg)  # pragma: no cover


def __dir__() -> list[str]:
    """Include lazy exports in ``dir()`` output (PEP 562 companion hook).

    Without this, names that have not yet been materialized by __getattr__
    would be missing from ``dir()`` and tab completion.
    """
    return sorted(set(globals()) | set(_LAZY_IMPORTS))