File size: 10,059 Bytes
0a4529c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# DEPENDENCIES
from typing import List
from typing import Optional
from config.models import DocumentChunk
from config.settings import get_settings
from config.models import DocumentMetadata
from config.models import ChunkingStrategy
from config.logging_config import get_logger
from chunking.base_chunker import BaseChunker
from chunking.base_chunker import ChunkerConfig
from chunking.token_counter import TokenCounter
from chunking.semantic_chunker import SemanticChunker
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.schema import Document as LlamaDocument
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SemanticSplitterNodeParser


# Module-level logger and application settings, resolved once at import time.
# `settings` supplies the default chunk size/overlap and the embedding model name
# used by LlamaIndexChunker below; the class itself logs through `self.logger`.
logger   = get_logger(__name__)
settings = get_settings()


class LlamaIndexChunker(BaseChunker):
    """
    Chunking strategy that delegates splitting to a LlamaIndex node parser:
    - "semantic" : SemanticSplitterNodeParser driven by a HuggingFace embedding model
    - "sentence" : SentenceSplitter configured with chunk_size / overlap
    - "token"    : TokenTextSplitter configured with chunk_size / overlap

    If the splitter cannot be initialized, or splitting raises at runtime, the chunker
    falls back to the project's SemanticChunker so callers still receive chunks.
    """
    # Splitter names accepted by `splitter_type`; anything else is mapped to "semantic"
    _SUPPORTED_SPLITTERS            = ("semantic", "sentence", "token")

    # Parameters passed to SemanticSplitterNodeParser
    _SEMANTIC_BUFFER_SIZE           = 1
    _SEMANTIC_BREAKPOINT_PERCENTILE = 95

    # Similarity threshold handed to the fallback SemanticChunker
    _FALLBACK_SIMILARITY_THRESHOLD  = 0.95


    def __init__(self, chunk_size: Optional[int] = None, overlap: Optional[int] = None, splitter_type: str = "semantic", min_chunk_size: int = 100):
        """
        Initialize LlamaIndex chunker

        Arguments:
        ----------
            chunk_size     { int }  : Target tokens per chunk (defaults to settings.FIXED_CHUNK_SIZE)

            overlap        { int }  : Overlap tokens between chunks (defaults to settings.FIXED_CHUNK_OVERLAP
                                      only when omitted; an explicit 0 is honoured)

            splitter_type  { str }  : Type of LlamaIndex splitter ("semantic", "sentence", "token")

            min_chunk_size { int }  : Minimum chunk size in tokens; smaller chunks are discarded
        """
        # Registered under SEMANTIC since the default splitter is embedding-based
        super().__init__(ChunkingStrategy.SEMANTIC)

        # A chunk size of 0 is never usable, so any falsy value falls back to the configured default
        self.chunk_size     = chunk_size or settings.FIXED_CHUNK_SIZE

        # Zero overlap is a legitimate request: only substitute the default when nothing was passed
        self.overlap        = settings.FIXED_CHUNK_OVERLAP if overlap is None else overlap

        self.splitter_type  = splitter_type
        self.min_chunk_size = min_chunk_size

        # Initialize token counter
        self.token_counter  = TokenCounter()

        # LlamaIndex state, populated by _initialize_llamaindex()
        self._splitter      = None
        self._initialized   = False

        self._initialize_llamaindex()

        self.logger.info(f"Initialized LlamaIndexChunker: chunk_size={self.chunk_size}, overlap={self.overlap}, splitter_type={self.splitter_type}")


    def _build_semantic_splitter(self):
        """
        Build the embedding-based splitter; the embedding model is loaded only here,
        so the "sentence" and "token" splitters never pay for (or fail on) model loading
        """
        embed_model = HuggingFaceEmbedding(model_name = settings.EMBEDDING_MODEL)

        return SemanticSplitterNodeParser(buffer_size                     = self._SEMANTIC_BUFFER_SIZE,
                                          breakpoint_percentile_threshold = self._SEMANTIC_BREAKPOINT_PERCENTILE,
                                          embed_model                     = embed_model,
                                         )


    def _initialize_llamaindex(self):
        """
        Initialize the LlamaIndex splitter selected by `self.splitter_type`

        Never raises: on any failure the error is logged, `self._initialized` stays False
        and chunk_text() will use the fallback chunker instead
        """
        try:
            # Normalize an unknown type so logs, metadata and get_splitter_info()
            # report the splitter that is actually in use
            if self.splitter_type not in self._SUPPORTED_SPLITTERS:
                self.logger.warning(f"Unknown splitter type: {self.splitter_type}, using semantic")
                self.splitter_type = "semantic"

            if (self.splitter_type == "sentence"):
                self._splitter = SentenceSplitter(chunk_size    = self.chunk_size,
                                                  chunk_overlap = self.overlap,
                                                 )

            elif (self.splitter_type == "token"):
                self._splitter = TokenTextSplitter(chunk_size    = self.chunk_size,
                                                   chunk_overlap = self.overlap,
                                                  )

            else:
                self._splitter = self._build_semantic_splitter()

            self._initialized = True
            self.logger.info(f"Successfully initialized LlamaIndex {self.splitter_type} splitter")

        except ImportError as e:
            self.logger.error(f"LlamaIndex not available: {repr(e)}")
            self._splitter    = None
            self._initialized = False

        except Exception as e:
            self.logger.error(f"Failed to initialize LlamaIndex: {repr(e)}")
            self._splitter    = None
            self._initialized = False


    @staticmethod
    def _locate_span(text: str, node, node_text: str, search_from: int, fallback_start: int) -> tuple:
        """
        Work out where a node's text sits inside the source text

        Arguments:
        ----------
            text           { str } : Full source text

            node           { obj } : LlamaIndex node produced by the splitter

            node_text      { str } : Text content of the node

            search_from    { int } : Offset from which to search for node_text

            fallback_start { int } : Start offset to use when the text cannot be located

        Returns:
        --------
                 { tuple }         : (start_char, end_char) offsets into `text`
        """
        # Prefer offsets recorded on the node itself (may be absent or None)
        node_start = getattr(node, "start_char_idx", None)
        node_end   = getattr(node, "end_char_idx", None)

        if isinstance(node_start, int) and isinstance(node_end, int) and (0 <= node_start <= node_end):
            return node_start, node_end

        # Otherwise search forward first (keeps repeated passages in order), then anywhere
        located = text.find(node_text, search_from)

        if (located < 0):
            located = text.find(node_text)

        # Last resort: assume the node directly follows the previous one
        if (located < 0):
            located = fallback_start

        return located, located + len(node_text)


    def chunk_text(self, text: str, metadata: Optional[DocumentMetadata] = None) -> List[DocumentChunk]:
        """
        Chunk text using the configured LlamaIndex splitter

        Arguments:
        ----------
            text            { str }       : Input text

            metadata { DocumentMetaData } : Document metadata (its document_id is stamped on every chunk;
                                            "unknown" is used when omitted)

        Returns:
        --------
                     { list }             : List of DocumentChunk objects; empty for blank input.
                                            Chunks below min_chunk_size tokens are dropped, so
                                            chunk_index values may contain gaps
        """
        if not text or not text.strip():
            return []

        # Fallback if LlamaIndex not available
        if not self._initialized or self._splitter is None:
            self.logger.warning("LlamaIndex not available, falling back to simple semantic chunking")
            return self._fallback_chunking(text     = text,
                                           metadata = metadata,
                                          )

        document_id = metadata.document_id if metadata else "unknown"

        try:
            # Wrap the raw text in a LlamaIndex document and split it into nodes
            llama_doc    = LlamaDocument(text = text)
            nodes        = self._splitter.get_nodes_from_documents([llama_doc])

            chunks       = list()
            search_from  = 0
            previous_end = 0

            for index, node in enumerate(nodes):
                node_text            = node.text

                # Offsets are resolved against the source text rather than by summing lengths,
                # which would drift as soon as consecutive chunks overlap
                start_char, end_char = self._locate_span(text           = text,
                                                         node           = node,
                                                         node_text      = node_text,
                                                         search_from    = search_from,
                                                         fallback_start = previous_end,
                                                        )

                chunk                = self._create_chunk(text        = self._clean_chunk_text(node_text),
                                                          chunk_index = index,
                                                          document_id = document_id,
                                                          start_char  = start_char,
                                                          end_char    = end_char,
                                                          metadata    = {"llamaindex_splitter" : self.splitter_type,
                                                                         "node_id"             : node.node_id,
                                                                         "chunk_type"          : "llamaindex_semantic",
                                                                        }
                                                         )

                chunks.append(chunk)

                # The next chunk starts strictly after this one's start, even when they overlap
                search_from  = start_char + 1
                previous_end = end_char

            # Filter out chunks that are too small
            chunks = [c for c in chunks if (c.token_count >= self.min_chunk_size)]

            self.logger.debug(f"Created {len(chunks)} chunks using LlamaIndex {self.splitter_type} splitter")

            return chunks

        except Exception as e:
            # Any splitter failure degrades to the fallback chunker instead of propagating
            self.logger.error(f"LlamaIndex chunking failed: {repr(e)}")
            return self._fallback_chunking(text     = text,
                                           metadata = metadata,
                                          )


    def _fallback_chunking(self, text: str, metadata: Optional[DocumentMetadata] = None) -> List[DocumentChunk]:
        """
        Fallback to the project's SemanticChunker when LlamaIndex is unavailable or fails

        Arguments:
        ----------
            text            { str }       : Input text

            metadata { DocumentMetaData } : Document metadata

        Returns:
        --------
                     { list }             : List of chunks
        """
        fallback_chunker = SemanticChunker(chunk_size           = self.chunk_size,
                                           overlap              = self.overlap,
                                           similarity_threshold = self._FALLBACK_SIMILARITY_THRESHOLD,
                                           min_chunk_size       = self.min_chunk_size,
                                          )

        return fallback_chunker.chunk_text(text, metadata)


    def get_splitter_info(self) -> dict:
        """
        Get information about the LlamaIndex splitter configuration

        Returns:
        --------
            { dict }    : Splitter type in use, chunk size, overlap, minimum chunk size
                          and whether the LlamaIndex splitter initialized successfully
        """
        return {"splitter_type"   : self.splitter_type,
                "chunk_size"      : self.chunk_size,
                "overlap"         : self.overlap,
                "initialized"     : self._initialized,
                "min_chunk_size"  : self.min_chunk_size,
               }


    @classmethod
    def from_config(cls, config: ChunkerConfig) -> 'LlamaIndexChunker':
        """
        Create LlamaIndexChunker from configuration

        Arguments:
        ----------
            config { ChunkerConfig } : ChunkerConfig object; the splitter type is read from
                                       config.extra['llamaindex_splitter'] (default "semantic")

        Returns:
        --------
            { LlamaIndexChunker }    : LlamaIndexChunker instance
        """
        # Tolerate a config whose `extra` mapping is missing or None
        extra_options = getattr(config, "extra", None) or {}

        return cls(chunk_size     = config.chunk_size,
                   overlap        = config.overlap,
                   splitter_type  = extra_options.get('llamaindex_splitter', 'semantic'),
                   min_chunk_size = config.min_chunk_size,
                  )