Spaces:
Sleeping
Sleeping
| """Structure-aware chunking module for the RAG pipeline. | |
| This module provides tools for splitting Markdown documents into | |
| semantically meaningful chunks suitable for embedding and retrieval. | |
| The chunking process preserves document structure by: | |
| - Respecting heading boundaries | |
| - Inheriting parent headings for context | |
| - Maintaining code block integrity | |
| - Keeping related content together | |
| - Attaching lists to preceding paragraphs | |
| - Converting tables to natural language descriptions | |
| Components: | |
| - Chunk: Pydantic model representing a document chunk with metadata | |
| - ChunkingConfig: Configuration parameters for chunking | |
| - TextNormalizer: Text normalization utilities for PDF extraction artifacts | |
| - HeadingParser: Parser for extracting headings and content blocks | |
| - ListAttacher: Attaches lists to preceding paragraph context | |
| - TableConverter: Converts markdown tables to natural language | |
| - ContentProcessor: Unified processor for lists and tables | |
| - Chunker: Main orchestrator class for document chunking pipeline | |
| - ChunkingStrategy: Abstract base class for chunking strategies | |
| - MarkdownStrategy: Strategy for general Markdown documents | |
| - CodeStrategy: Strategy for code-heavy content | |
| - StructureAwareStrategy: Strategy combining heading, list, and table handling | |
| Lazy Loading: | |
| Heavy dependencies are loaded on first access using __getattr__. | |
| This ensures fast import times when the chunking module is not | |
| immediately needed. | |
| Example: | |
| ------- | |
| >>> from rag_chatbot.chunking import ChunkingConfig, Chunker, TextNormalizer | |
| >>> config = ChunkingConfig(min_tokens=450, max_tokens=700) | |
| >>> normalizer = TextNormalizer() | |
| >>> chunker = Chunker(config, normalizer=normalizer) | |
| >>> chunks = chunker.chunk_document(markdown_content, source="doc.pdf") | |
| """ | |
| from __future__ import annotations | |
| from typing import TYPE_CHECKING | |
| if TYPE_CHECKING: | |
| from .chunker import Chunker | |
| from .heading_parser import ContentBlock, HeadingParser, ParsedHeading | |
| from .list_table_attacher import ( | |
| AttachedContent, | |
| ContentProcessor, | |
| ListAttacher, | |
| ListDetector, | |
| ParsedTable, | |
| TableConverter, | |
| TableDetector, | |
| ) | |
| from .models import ( | |
| THERMAL_COMFORT_TERMS, | |
| Chunk, | |
| ChunkingConfig, | |
| TextNormalizer, | |
| ) | |
| from .strategies import ( | |
| ChunkingStrategy, | |
| CodeStrategy, | |
| MarkdownStrategy, | |
| StructureAwareStrategy, | |
| ) | |
| from .token_aware import SlidingWindowChunker, Tokenizer | |
| # ============================================================================= | |
| # Module Exports | |
| # ============================================================================= | |
# Names re-exported by this package, grouped by the pipeline step that
# introduced them. Each name is resolved lazily through the registry below.
__all__: list[str] = [
    # Step 3.1 - data models
    "Chunk",
    "ChunkingConfig",
    "TextNormalizer",
    "THERMAL_COMFORT_TERMS",
    # Step 3.2 - heading parser
    "ParsedHeading",
    "ContentBlock",
    "HeadingParser",
    # Step 3.3 - list and table attachment
    "AttachedContent",
    "ListDetector",
    "ListAttacher",
    "ParsedTable",
    "TableDetector",
    "TableConverter",
    "ContentProcessor",
    # Step 3.5 - chunker pipeline and strategies
    "Chunker",
    "ChunkingStrategy",
    "MarkdownStrategy",
    "CodeStrategy",
    "StructureAwareStrategy",
    # Step 3.4 - token-aware chunking
    "Tokenizer",
    "SlidingWindowChunker",
]
| # ============================================================================= | |
| # Lazy Import Registry | |
| # ============================================================================= | |
| # Maps exported names to their (module, attribute) tuples for lazy loading. | |
| # This pattern avoids loading heavy dependencies until they are actually used. | |
| # ============================================================================= | |
# Built from (submodule, exported names) groups: every export is registered
# under its own name, so each value is the pair (submodule, same name).
# Group order and name order fix the insertion order of the registry.
_LAZY_IMPORTS: dict[str, tuple[str, str]] = {
    export: (submodule, export)
    for submodule, exports in (
        # Data models (Step 3.1)
        (
            ".models",
            (
                "Chunk",
                "ChunkingConfig",
                "TextNormalizer",
                "THERMAL_COMFORT_TERMS",
            ),
        ),
        # Heading parser (Step 3.2)
        (
            ".heading_parser",
            (
                "ParsedHeading",
                "ContentBlock",
                "HeadingParser",
            ),
        ),
        # List and table attachment (Step 3.3)
        (
            ".list_table_attacher",
            (
                "AttachedContent",
                "ListDetector",
                "ListAttacher",
                "ParsedTable",
                "TableDetector",
                "TableConverter",
                "ContentProcessor",
            ),
        ),
        # Chunker pipeline (Step 3.5)
        (".chunker", ("Chunker",)),
        (
            ".strategies",
            (
                "ChunkingStrategy",
                "MarkdownStrategy",
                "CodeStrategy",
                "StructureAwareStrategy",
            ),
        ),
        # Token-aware chunking (Step 3.4)
        (
            ".token_aware",
            (
                "Tokenizer",
                "SlidingWindowChunker",
            ),
        ),
    )
    for export in exports
}
def __getattr__(name: str) -> object:
    """Resolve a lazily exported name on first access.

    Python calls this module-level hook only when normal lookup in the
    module's namespace fails. A name found in the lazy import registry is
    imported from its submodule and then stored in the module globals, so
    later accesses are served by normal lookup and do not re-enter this
    function.

    Args:
    ----
        name: The name of the attribute being accessed.

    Returns:
    -------
        The attribute that the registry entry for ``name`` points to.

    Raises:
    ------
        AttributeError: If ``name`` is not in the lazy import registry.

    """
    entry = _LAZY_IMPORTS.get(name)
    if entry is None:
        msg = f"module {__name__!r} has no attribute {name!r}"  # pragma: no cover
        raise AttributeError(msg)  # pragma: no cover

    # Imported here rather than at module top so that importing the package
    # itself stays as cheap as possible.
    import importlib

    module_path, attr_name = entry
    # Registry paths are relative (".models"), so import_module needs this
    # package's name as the anchor to resolve them.
    module = importlib.import_module(module_path, package=__name__)
    value = getattr(module, attr_name)

    # Cache the resolved object so the import machinery runs once per name.
    # NOTE(review): code that monkeypatches the attribute on the submodule
    # after first access will not see the change through this package.
    globals()[name] = value
    return value