"""Structure-aware chunking module for the RAG pipeline.

This module provides tools for splitting Markdown documents into
semantically meaningful chunks suitable for embedding and retrieval.
The chunking process preserves document structure by:
    - Respecting heading boundaries
    - Inheriting parent headings for context
    - Maintaining code block integrity
    - Keeping related content together
    - Attaching lists to preceding paragraphs
    - Converting tables to natural language descriptions

Components:
    - Chunk: Pydantic model representing a document chunk with metadata
    - ChunkingConfig: Configuration parameters for chunking
    - TextNormalizer: Text normalization utilities for PDF extraction artifacts
    - HeadingParser: Parser for extracting headings and content blocks
    - ListDetector / TableDetector: Detectors for lists and tables in Markdown
    - ListAttacher: Attaches lists to preceding paragraph context
    - TableConverter: Converts markdown tables to natural language
    - ContentProcessor: Unified processor for lists and tables
    - Chunker: Main orchestrator class for document chunking pipeline
    - ChunkingStrategy: Abstract base class for chunking strategies
    - MarkdownStrategy: Strategy for general Markdown documents
    - CodeStrategy: Strategy for code-heavy content
    - StructureAwareStrategy: Strategy combining heading, list, and table handling
    - Tokenizer / SlidingWindowChunker: Token-aware chunking utilities

Lazy Loading:
    Heavy dependencies are loaded on first access using __getattr__.
    This ensures fast import times when the chunking module is not
    immediately needed.

Example:
-------
    >>> from rag_chatbot.chunking import ChunkingConfig, Chunker, TextNormalizer
    >>> config = ChunkingConfig(min_tokens=450, max_tokens=700)
    >>> normalizer = TextNormalizer()
    >>> chunker = Chunker(config, normalizer=normalizer)
    >>> chunks = chunker.chunk_document(markdown_content, source="doc.pdf")

"""

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from .chunker import Chunker
    from .heading_parser import ContentBlock, HeadingParser, ParsedHeading
    from .list_table_attacher import (
        AttachedContent,
        ContentProcessor,
        ListAttacher,
        ListDetector,
        ParsedTable,
        TableConverter,
        TableDetector,
    )
    from .models import (
        THERMAL_COMFORT_TERMS,
        Chunk,
        ChunkingConfig,
        TextNormalizer,
    )
    from .strategies import (
        ChunkingStrategy,
        CodeStrategy,
        MarkdownStrategy,
        StructureAwareStrategy,
    )
    from .token_aware import SlidingWindowChunker, Tokenizer

# =============================================================================
# Module Exports
# =============================================================================
# Public API surface of the chunking package. Every name listed here must have
# a matching entry in the _LAZY_IMPORTS registry below so that star-imports
# and attribute access resolve through the same lazy-loading machinery.
__all__: list[str] = [
    # Data models (Step 3.1)
    "Chunk", "ChunkingConfig", "TextNormalizer", "THERMAL_COMFORT_TERMS",
    # Heading parser (Step 3.2)
    "ParsedHeading", "ContentBlock", "HeadingParser",
    # List and table attachment (Step 3.3)
    "AttachedContent", "ListDetector", "ListAttacher",
    "ParsedTable", "TableDetector", "TableConverter", "ContentProcessor",
    # Chunker pipeline (Step 3.5)
    "Chunker", "ChunkingStrategy", "MarkdownStrategy",
    "CodeStrategy", "StructureAwareStrategy",
    # Token-aware chunking (Step 3.4)
    "Tokenizer", "SlidingWindowChunker",
]


# =============================================================================
# Lazy Import Registry
# =============================================================================
# Maps each exported name to the (relative module, attribute) pair that
# provides it. __getattr__ consults this table so heavy dependencies are
# imported only on first use. Built from per-module groups; every attribute
# is re-exported under its own name, and group order preserves the original
# insertion order of the registry.
# =============================================================================
_LAZY_IMPORTS: dict[str, tuple[str, str]] = {
    export: (module, export)
    for module, exports in (
        # Data models (Step 3.1)
        (".models", ("Chunk", "ChunkingConfig", "TextNormalizer", "THERMAL_COMFORT_TERMS")),
        # Heading parser (Step 3.2)
        (".heading_parser", ("ParsedHeading", "ContentBlock", "HeadingParser")),
        # List and table attachment (Step 3.3)
        (
            ".list_table_attacher",
            (
                "AttachedContent",
                "ListDetector",
                "ListAttacher",
                "ParsedTable",
                "TableDetector",
                "TableConverter",
                "ContentProcessor",
            ),
        ),
        # Chunker pipeline (Step 3.5)
        (".chunker", ("Chunker",)),
        (".strategies", ("ChunkingStrategy", "MarkdownStrategy", "CodeStrategy", "StructureAwareStrategy")),
        # Token-aware chunking (Step 3.4)
        (".token_aware", ("Tokenizer", "SlidingWindowChunker")),
    )
    for export in exports
}


def __getattr__(name: str) -> object:
    """Lazily resolve module exports on first access (PEP 562).

    Called by the interpreter only when ``name`` is not already present in
    the module namespace. Looks the name up in the lazy import registry,
    imports the providing submodule, and caches the resolved attribute in
    ``globals()`` so subsequent accesses bypass this hook entirely.

    Args:
    ----
        name: The name of the attribute being accessed.

    Returns:
    -------
        The requested attribute if it exists in the lazy import registry.

    Raises:
    ------
        AttributeError: If the attribute is not a valid export.

    """
    if name in _LAZY_IMPORTS:
        module_path, attr_name = _LAZY_IMPORTS[name]
        # Local import keeps the module's top-level import time minimal;
        # importlib itself is already loaded by the interpreter, so this
        # is effectively free.
        import importlib

        module = importlib.import_module(module_path, package=__name__)
        attr = getattr(module, attr_name)
        # Cache the resolved object in the module namespace so that future
        # lookups are ordinary (fast) attribute access instead of repeated
        # trips through __getattr__ and the registry.
        globals()[name] = attr
        return attr

    msg = f"module {__name__!r} has no attribute {name!r}"  # pragma: no cover
    raise AttributeError(msg)  # pragma: no cover


def __dir__() -> list[str]:
    """Include lazy exports in ``dir()`` output (PEP 562 companion hook).

    Without this, names that have not yet been materialized by __getattr__
    would be missing from ``dir()`` and tab completion.
    """
    return sorted(set(globals()) | set(_LAZY_IMPORTS))