"""Structure-aware chunking module for the RAG pipeline. This module provides tools for splitting Markdown documents into semantically meaningful chunks suitable for embedding and retrieval. The chunking process preserves document structure by: - Respecting heading boundaries - Inheriting parent headings for context - Maintaining code block integrity - Keeping related content together - Attaching lists to preceding paragraphs - Converting tables to natural language descriptions Components: - Chunk: Pydantic model representing a document chunk with metadata - ChunkingConfig: Configuration parameters for chunking - TextNormalizer: Text normalization utilities for PDF extraction artifacts - HeadingParser: Parser for extracting headings and content blocks - ListAttacher: Attaches lists to preceding paragraph context - TableConverter: Converts markdown tables to natural language - ContentProcessor: Unified processor for lists and tables - Chunker: Main orchestrator class for document chunking pipeline - ChunkingStrategy: Abstract base class for chunking strategies - MarkdownStrategy: Strategy for general Markdown documents - CodeStrategy: Strategy for code-heavy content - StructureAwareStrategy: Strategy combining heading, list, and table handling Lazy Loading: Heavy dependencies are loaded on first access using __getattr__. This ensures fast import times when the chunking module is not immediately needed. 
Example: ------- >>> from rag_chatbot.chunking import ChunkingConfig, Chunker, TextNormalizer >>> config = ChunkingConfig(min_tokens=450, max_tokens=700) >>> normalizer = TextNormalizer() >>> chunker = Chunker(config, normalizer=normalizer) >>> chunks = chunker.chunk_document(markdown_content, source="doc.pdf") """ from __future__ import annotations from typing import TYPE_CHECKING if TYPE_CHECKING: from .chunker import Chunker from .heading_parser import ContentBlock, HeadingParser, ParsedHeading from .list_table_attacher import ( AttachedContent, ContentProcessor, ListAttacher, ListDetector, ParsedTable, TableConverter, TableDetector, ) from .models import ( THERMAL_COMFORT_TERMS, Chunk, ChunkingConfig, TextNormalizer, ) from .strategies import ( ChunkingStrategy, CodeStrategy, MarkdownStrategy, StructureAwareStrategy, ) from .token_aware import SlidingWindowChunker, Tokenizer # ============================================================================= # Module Exports # ============================================================================= __all__: list[str] = [ # Data models (Step 3.1) "Chunk", "ChunkingConfig", "TextNormalizer", "THERMAL_COMFORT_TERMS", # Heading parser (Step 3.2) "ParsedHeading", "ContentBlock", "HeadingParser", # List and table attachment (Step 3.3) "AttachedContent", "ListDetector", "ListAttacher", "ParsedTable", "TableDetector", "TableConverter", "ContentProcessor", # Chunker pipeline (Step 3.5) "Chunker", "ChunkingStrategy", "MarkdownStrategy", "CodeStrategy", "StructureAwareStrategy", # Token-aware chunking (Step 3.4) "Tokenizer", "SlidingWindowChunker", ] # ============================================================================= # Lazy Import Registry # ============================================================================= # Maps exported names to their (module, attribute) tuples for lazy loading. # This pattern avoids loading heavy dependencies until they are actually used. 
# =============================================================================
# Registry of lazily resolved exports: name -> (relative module, attribute).
_LAZY_IMPORTS: dict[str, tuple[str, str]] = {
    # Data models (Step 3.1)
    "Chunk": (".models", "Chunk"),
    "ChunkingConfig": (".models", "ChunkingConfig"),
    "TextNormalizer": (".models", "TextNormalizer"),
    "THERMAL_COMFORT_TERMS": (".models", "THERMAL_COMFORT_TERMS"),
    # Heading parser (Step 3.2)
    "ParsedHeading": (".heading_parser", "ParsedHeading"),
    "ContentBlock": (".heading_parser", "ContentBlock"),
    "HeadingParser": (".heading_parser", "HeadingParser"),
    # List and table attachment (Step 3.3)
    "AttachedContent": (".list_table_attacher", "AttachedContent"),
    "ListDetector": (".list_table_attacher", "ListDetector"),
    "ListAttacher": (".list_table_attacher", "ListAttacher"),
    "ParsedTable": (".list_table_attacher", "ParsedTable"),
    "TableDetector": (".list_table_attacher", "TableDetector"),
    "TableConverter": (".list_table_attacher", "TableConverter"),
    "ContentProcessor": (".list_table_attacher", "ContentProcessor"),
    # Chunker pipeline (Step 3.5)
    "Chunker": (".chunker", "Chunker"),
    "ChunkingStrategy": (".strategies", "ChunkingStrategy"),
    "MarkdownStrategy": (".strategies", "MarkdownStrategy"),
    "CodeStrategy": (".strategies", "CodeStrategy"),
    "StructureAwareStrategy": (".strategies", "StructureAwareStrategy"),
    # Token-aware chunking (Step 3.4)
    "Tokenizer": (".token_aware", "Tokenizer"),
    "SlidingWindowChunker": (".token_aware", "SlidingWindowChunker"),
}


def __getattr__(name: str) -> object:
    """Lazy load module exports on first access (PEP 562).

    This function is called when an attribute is not found in the module's
    namespace. It enables lazy loading of heavy dependencies using a
    registry-based lookup pattern. The resolved attribute is cached in the
    module globals, so subsequent accesses are plain attribute lookups and
    never re-enter this hook.

    Args:
    ----
    name: The name of the attribute being accessed.

    Returns:
    -------
    The requested attribute if it exists in the lazy import registry.

    Raises:
    ------
    AttributeError: If the attribute is not a valid export.

    """
    if name in _LAZY_IMPORTS:
        module_path, attr_name = _LAZY_IMPORTS[name]
        # Deferred import keeps module import time minimal; import_module
        # handles the relative path against this package.
        import importlib

        module = importlib.import_module(module_path, package=__name__)
        attr = getattr(module, attr_name)
        # Cache so future accesses bypass __getattr__ entirely.
        globals()[name] = attr
        return attr
    msg = f"module {__name__!r} has no attribute {name!r}"  # pragma: no cover
    raise AttributeError(msg)  # pragma: no cover


def __dir__() -> list[str]:
    """Advertise lazily loaded exports alongside regular module attributes.

    PEP 562 recommends pairing __dir__ with a module-level __getattr__ so
    that dir() and tab completion list names that have not been imported yet.
    """
    return sorted(set(globals()) | set(_LAZY_IMPORTS))