# QuerySphere - chunking/llamaindex_chunker.py
# DEPENDENCIES
from typing import List
from typing import Optional
from config.models import DocumentChunk
from config.settings import get_settings
from config.models import DocumentMetadata
from config.models import ChunkingStrategy
from config.logging_config import get_logger
from chunking.base_chunker import BaseChunker
from chunking.base_chunker import ChunkerConfig
from chunking.token_counter import TokenCounter
from chunking.semantic_chunker import SemanticChunker
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.schema import Document as LlamaDocument
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SemanticSplitterNodeParser
# Setup Settings and Logging
logger = get_logger(__name__)
settings = get_settings()
class LlamaIndexChunker(BaseChunker):
"""
LlamaIndex-based semantic chunking strategy:
- Uses LlamaIndex's advanced semantic splitting algorithms
- Provides superior boundary detection using embeddings
- Supports multiple LlamaIndex splitter types
Best for:
- Documents requiring sophisticated semantic analysis
- When LlamaIndex ecosystem integration is needed
- Advanced chunking with embedding-based boundaries
"""
    def __init__(self, chunk_size: Optional[int] = None, overlap: Optional[int] = None, splitter_type: str = "semantic", min_chunk_size: int = 100):
"""
Initialize LlamaIndex chunker
Arguments:
----------
chunk_size { int } : Target tokens per chunk
overlap { int } : Overlap tokens between chunks
splitter_type { str } : Type of LlamaIndex splitter ("semantic", "sentence", "token")
min_chunk_size { int } : Minimum chunk size in tokens
"""
# Use SEMANTIC since it's semantic-based
super().__init__(ChunkingStrategy.SEMANTIC)
self.chunk_size = chunk_size or settings.FIXED_CHUNK_SIZE
self.overlap = overlap or settings.FIXED_CHUNK_OVERLAP
self.splitter_type = splitter_type
self.min_chunk_size = min_chunk_size
# Initialize token counter
self.token_counter = TokenCounter()
# Initialize LlamaIndex components
self._splitter = None
self._initialized = False
self._initialize_llamaindex()
self.logger.info(f"Initialized LlamaIndexChunker: chunk_size={self.chunk_size}, overlap={self.overlap}, splitter_type={self.splitter_type}")
def _initialize_llamaindex(self):
"""
Initialize LlamaIndex splitter with proper error handling
"""
try:
# Initialize embedding model
embed_model = HuggingFaceEmbedding(model_name = settings.EMBEDDING_MODEL)
# Initialize appropriate splitter based on type
if (self.splitter_type == "semantic"):
self._splitter = SemanticSplitterNodeParser(buffer_size = 1,
breakpoint_percentile_threshold = 95,
embed_model = embed_model,
)
elif (self.splitter_type == "sentence"):
self._splitter = SentenceSplitter(chunk_size = self.chunk_size,
chunk_overlap = self.overlap,
)
elif (self.splitter_type == "token"):
self._splitter = TokenTextSplitter(chunk_size = self.chunk_size,
chunk_overlap = self.overlap,
)
else:
self.logger.warning(f"Unknown splitter type: {self.splitter_type}, using semantic")
self._splitter = SemanticSplitterNodeParser(buffer_size = 1,
breakpoint_percentile_threshold = 95,
embed_model = embed_model,
)
self._initialized = True
self.logger.info(f"Successfully initialized LlamaIndex {self.splitter_type} splitter")
except ImportError as e:
self.logger.error(f"LlamaIndex not available: {repr(e)}")
self._initialized = False
except Exception as e:
self.logger.error(f"Failed to initialize LlamaIndex: {repr(e)}")
self._initialized = False
def chunk_text(self, text: str, metadata: Optional[DocumentMetadata] = None) -> List[DocumentChunk]:
"""
Chunk text using LlamaIndex semantic splitting
Arguments:
----------
text { str } : Input text
        metadata { DocumentMetadata } : Document metadata
Returns:
--------
{ list } : List of DocumentChunk objects
"""
if not text or not text.strip():
return []
# Fallback if LlamaIndex not available
if not self._initialized:
self.logger.warning("LlamaIndex not available, falling back to simple semantic chunking")
return self._fallback_chunking(text = text,
metadata = metadata,
)
document_id = metadata.document_id if metadata else "unknown"
try:
# Create LlamaIndex document
llama_doc = LlamaDocument(text = text)
# Get nodes from splitter
nodes = self._splitter.get_nodes_from_documents([llama_doc])
# Convert nodes to our DocumentChunk format
chunks = list()
start_pos = 0
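            # Character offsets are approximated by accumulating node text lengths;
            # they may not line up exactly with positions in the original document.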
            for i, node in enumerate(nodes):
                # Use node_text to avoid shadowing the chunk_text method name
                node_text = node.text
                # Create chunk
                chunk = self._create_chunk(text        = self._clean_chunk_text(node_text),
                                           chunk_index = i,
                                           document_id = document_id,
                                           start_char  = start_pos,
                                           end_char    = start_pos + len(node_text),
                                           metadata    = {"llamaindex_splitter" : self.splitter_type,
                                                          "node_id"             : node.node_id,
                                                          "chunk_type"          : "llamaindex_semantic",
                                                          }
                                           )
                chunks.append(chunk)
                start_pos += len(node_text)
# Filter out chunks that are too small
chunks = [c for c in chunks if (c.token_count >= self.min_chunk_size)]
self.logger.debug(f"Created {len(chunks)} chunks using LlamaIndex {self.splitter_type} splitter")
return chunks
except Exception as e:
self.logger.error(f"LlamaIndex chunking failed: {repr(e)}")
return self._fallback_chunking(text = text,
metadata = metadata,
)
def _fallback_chunking(self, text: str, metadata: Optional[DocumentMetadata] = None) -> List[DocumentChunk]:
"""
Fallback to basic semantic chunking when LlamaIndex fails
Arguments:
----------
text { str } : Input text
        metadata { DocumentMetadata } : Document metadata
        Returns:
        --------
        { list } : List of DocumentChunk objects
"""
fallback_chunker = SemanticChunker(chunk_size = self.chunk_size,
overlap = self.overlap,
similarity_threshold = 0.95,
min_chunk_size = self.min_chunk_size,
)
return fallback_chunker.chunk_text(text, metadata)
def get_splitter_info(self) -> dict:
"""
Get information about the LlamaIndex splitter configuration
Returns:
--------
{ dict } : Splitter information
"""
return {"splitter_type" : self.splitter_type,
"chunk_size" : self.chunk_size,
"overlap" : self.overlap,
"initialized" : self._initialized,
"min_chunk_size" : self.min_chunk_size,
}
@classmethod
def from_config(cls, config: ChunkerConfig) -> 'LlamaIndexChunker':
"""
Create LlamaIndexChunker from configuration
Arguments:
----------
config { ChunkerConfig } : ChunkerConfig object
Returns:
--------
{ LlamaIndexChunker } : LlamaIndexChunker instance
"""
return cls(chunk_size = config.chunk_size,
overlap = config.overlap,
splitter_type = config.extra.get('llamaindex_splitter', 'semantic'),
min_chunk_size = config.min_chunk_size,
)
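
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the chunker API). The sample
# text and parameter values below are placeholders; it also assumes that
# DocumentChunk exposes chunk_index and token_count fields (token_count is
# already relied on by the size filter above, chunk_index is assumed here).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_text = ("LlamaIndex provides several node parsers for splitting documents. "
                   "Semantic splitting uses embeddings to locate topic boundaries, while "
                   "sentence and token splitters rely on fixed-size windows with overlap.")
    # A sentence splitter is used so the demo does not depend on semantic breakpoints
    demo_chunker = LlamaIndexChunker(chunk_size = 256, overlap = 32, splitter_type = "sentence", min_chunk_size = 10)
    for demo_chunk in demo_chunker.chunk_text(sample_text):
        print(demo_chunk.chunk_index, demo_chunk.token_count)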