# NOTE: the three lines below are Hugging Face repo-page scrape residue
# (uploader avatar caption, commit message, and short commit hash), kept
# here as comments so the module remains valid Python:
# sadickam's picture
# Initial commit for HF Space
# 3326079
"""Structure-aware chunking module for the RAG pipeline.
This module provides tools for splitting Markdown documents into
semantically meaningful chunks suitable for embedding and retrieval.
The chunking process preserves document structure by:
- Respecting heading boundaries
- Inheriting parent headings for context
- Maintaining code block integrity
- Keeping related content together
- Attaching lists to preceding paragraphs
- Converting tables to natural language descriptions
Components:
- Chunk: Pydantic model representing a document chunk with metadata
- ChunkingConfig: Configuration parameters for chunking
- TextNormalizer: Text normalization utilities for PDF extraction artifacts
- HeadingParser: Parser for extracting headings and content blocks
- ListAttacher: Attaches lists to preceding paragraph context
- TableConverter: Converts markdown tables to natural language
- ContentProcessor: Unified processor for lists and tables
- Chunker: Main orchestrator class for document chunking pipeline
- ChunkingStrategy: Abstract base class for chunking strategies
- MarkdownStrategy: Strategy for general Markdown documents
- CodeStrategy: Strategy for code-heavy content
- StructureAwareStrategy: Strategy combining heading, list, and table handling
Lazy Loading:
Heavy dependencies are loaded on first access using __getattr__.
This ensures fast import times when the chunking module is not
immediately needed.
Example:
-------
>>> from rag_chatbot.chunking import ChunkingConfig, Chunker, TextNormalizer
>>> config = ChunkingConfig(min_tokens=450, max_tokens=700)
>>> normalizer = TextNormalizer()
>>> chunker = Chunker(config, normalizer=normalizer)
>>> chunks = chunker.chunk_document(markdown_content, source="doc.pdf")
"""
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from .chunker import Chunker
from .heading_parser import ContentBlock, HeadingParser, ParsedHeading
from .list_table_attacher import (
AttachedContent,
ContentProcessor,
ListAttacher,
ListDetector,
ParsedTable,
TableConverter,
TableDetector,
)
from .models import (
THERMAL_COMFORT_TERMS,
Chunk,
ChunkingConfig,
TextNormalizer,
)
from .strategies import (
ChunkingStrategy,
CodeStrategy,
MarkdownStrategy,
StructureAwareStrategy,
)
from .token_aware import SlidingWindowChunker, Tokenizer
# =============================================================================
# Module Exports
# =============================================================================
# Public API of the chunking package.  Every name listed here must have a
# matching entry in the _LAZY_IMPORTS registry below so that module-level
# __getattr__ can resolve it on first access.
__all__: list[str] = [
    # Data models (Step 3.1)
    "Chunk", "ChunkingConfig", "TextNormalizer", "THERMAL_COMFORT_TERMS",
    # Heading parser (Step 3.2)
    "ParsedHeading", "ContentBlock", "HeadingParser",
    # List and table attachment (Step 3.3)
    "AttachedContent", "ListDetector", "ListAttacher", "ParsedTable",
    "TableDetector", "TableConverter", "ContentProcessor",
    # Chunker pipeline (Step 3.5)
    "Chunker", "ChunkingStrategy", "MarkdownStrategy", "CodeStrategy",
    "StructureAwareStrategy",
    # Token-aware chunking (Step 3.4)
    "Tokenizer", "SlidingWindowChunker",
]
# =============================================================================
# Lazy Import Registry
# =============================================================================
# Maps exported names to their (module, attribute) tuples for lazy loading.
# This pattern avoids loading heavy dependencies until they are actually used.
# =============================================================================
# =============================================================================
# Lazy Import Registry
# =============================================================================
# Maps each exported name to the (relative module, attribute) pair it is
# loaded from on first access.  Entries are declared grouped by submodule
# and flattened into one flat lookup table; insertion order matches the
# declaration order of __all__ above.
# =============================================================================
_LAZY_IMPORTS: dict[str, tuple[str, str]] = {
    attr: (module, attr)
    for module, attrs in (
        # Data models (Step 3.1)
        (".models", ("Chunk", "ChunkingConfig", "TextNormalizer", "THERMAL_COMFORT_TERMS")),
        # Heading parser (Step 3.2)
        (".heading_parser", ("ParsedHeading", "ContentBlock", "HeadingParser")),
        # List and table attachment (Step 3.3)
        (
            ".list_table_attacher",
            (
                "AttachedContent",
                "ListDetector",
                "ListAttacher",
                "ParsedTable",
                "TableDetector",
                "TableConverter",
                "ContentProcessor",
            ),
        ),
        # Chunker pipeline (Step 3.5)
        (".chunker", ("Chunker",)),
        (".strategies", ("ChunkingStrategy", "MarkdownStrategy", "CodeStrategy", "StructureAwareStrategy")),
        # Token-aware chunking (Step 3.4)
        (".token_aware", ("Tokenizer", "SlidingWindowChunker")),
    )
    for attr in attrs
}
def __getattr__(name: str) -> object:
    """Lazily load module exports on first access (PEP 562).

    This function is called only when an attribute is not found in the
    module's namespace.  It resolves the name through the registry-based
    lookup in ``_LAZY_IMPORTS``, importing the backing submodule on demand.

    Args:
    ----
        name: The name of the attribute being accessed.

    Returns:
    -------
        The requested attribute if it exists in the lazy import registry.

    Raises:
    ------
        AttributeError: If the attribute is not a valid export.
    """
    if name in _LAZY_IMPORTS:
        module_path, attr_name = _LAZY_IMPORTS[name]
        # importlib.import_module with package=__name__ gives the correct
        # package context for the relative module paths in the registry.
        import importlib

        module = importlib.import_module(module_path, package=__name__)
        attr = getattr(module, attr_name)
        # Cache the resolved attribute in the module namespace so that
        # subsequent accesses bypass __getattr__ entirely (standard PEP 562
        # lazy-loading idiom; the original resolved the import on every access).
        globals()[name] = attr
        return attr
    msg = f"module {__name__!r} has no attribute {name!r}"  # pragma: no cover
    raise AttributeError(msg)  # pragma: no cover