Spaces:
Running
Running
| # DEPENDENCIES | |
| from typing import List | |
| from typing import Optional | |
| from config.models import DocumentChunk | |
| from config.settings import get_settings | |
| from config.models import DocumentMetadata | |
| from config.models import ChunkingStrategy | |
| from config.logging_config import get_logger | |
| from chunking.base_chunker import BaseChunker | |
| from chunking.base_chunker import ChunkerConfig | |
| from chunking.token_counter import TokenCounter | |
| from chunking.overlap_manager import OverlapManager | |
| from chunking.fixed_chunker import FixedChunker | |
| # Setup Settings and Logging | |
| logger = get_logger(__name__) | |
| settings = get_settings() | |
| class HierarchicalChunker(BaseChunker): | |
| """ | |
| Hierarchical chunking strategy: | |
| - Creates parent chunks (large) and child chunks (small) | |
| - Child chunks for granular search, parent chunks for context | |
| - Maintains parent-child relationships for context expansion | |
| Best for: | |
| - Large documents (>500K tokens) | |
| - Complex documents with nested structure | |
| - When both granular search and context preservation are needed | |
| """ | |
| def __init__(self, parent_chunk_size: int = None, child_chunk_size: int = None, overlap: int = None, min_chunk_size: int = 100): | |
| """ | |
| Initialize hierarchical chunker | |
| Arguments: | |
| ---------- | |
| parent_chunk_size { int } : Size of parent chunks in tokens | |
| child_chunk_size { int } : Size of child chunks in tokens | |
| overlap { int } : Overlap between child chunks | |
| min_chunk_size { int } : Minimum chunk size in tokens | |
| """ | |
| super().__init__(ChunkingStrategy.HIERARCHICAL) | |
| self.parent_chunk_size = parent_chunk_size or settings.PARENT_CHUNK_SIZE | |
| self.child_chunk_size = child_chunk_size or settings.CHILD_CHUNK_SIZE | |
| self.overlap = overlap or settings.FIXED_CHUNK_OVERLAP | |
| self.min_chunk_size = min_chunk_size | |
| # Validate parameters | |
| if (self.child_chunk_size >= self.parent_chunk_size): | |
| raise ValueError(f"Child chunk size ({self.child_chunk_size}) must be smaller than parent chunk size ({self.parent_chunk_size})") | |
| # Initialize dependencies | |
| self.token_counter = TokenCounter() | |
| self.overlap_manager = OverlapManager(overlap_tokens = self.overlap) | |
| self.child_chunker = FixedChunker(chunk_size = self.child_chunk_size, | |
| overlap = self.overlap, | |
| respect_sentence_boundaries = True, | |
| ) | |
| self.logger.info(f"Initialized HierarchicalChunker: parent_size={self.parent_chunk_size}, child_size={self.child_chunk_size}, overlap={self.overlap}") | |
| def chunk_text(self, text: str, metadata: Optional[DocumentMetadata] = None) -> List[DocumentChunk]: | |
| """ | |
| Create hierarchical chunks with parent-child relationships | |
| Arguments: | |
| ---------- | |
| text { str } : Input text | |
| metadata { DocumentMetaData } : Document metadata | |
| Returns: | |
| -------- | |
| { list } : List of DocumentChunk objects (children with parent references) | |
| """ | |
| if not text or not text.strip(): | |
| return [] | |
| document_id = metadata.document_id if metadata else "unknown" | |
| # Create parent chunks (large context windows) | |
| parent_chunks = self._create_parent_chunks(text, document_id) | |
| # For each parent chunk, create child chunks (granular search) | |
| all_child_chunks = list() | |
| for parent_chunk in parent_chunks: | |
| child_chunks = self._create_child_chunks(parent_chunk = parent_chunk, | |
| parent_text = text, | |
| document_id = document_id, | |
| ) | |
| all_child_chunks.extend(child_chunks) | |
| # Step 3: Filter small chunks | |
| all_child_chunks = [c for c in all_child_chunks if (c.token_count >= self.min_chunk_size)] | |
| self.logger.info(f"Created {len(all_child_chunks)} child chunks from {len(parent_chunks)} parent chunks") | |
| return all_child_chunks | |
| def _create_parent_chunks(self, text: str, document_id: str) -> List[DocumentChunk]: | |
| """ | |
| Create large parent chunks for context preservation | |
| Arguments: | |
| ---------- | |
| text { str } : Input text | |
| document_id { str } : Document ID | |
| Returns: | |
| -------- | |
| { list } : List of parent chunks WITHOUT overlap | |
| """ | |
| # Use fixed chunking for parents (no overlap between parents) | |
| parent_chunker = FixedChunker(chunk_size = self.parent_chunk_size, | |
| overlap = 0, # No overlap between parents | |
| respect_sentence_boundaries = True, | |
| ) | |
| # Create parent chunks | |
| parent_chunks = parent_chunker._chunk_with_sentence_boundaries(text = text, | |
| document_id = document_id, | |
| ) | |
| # Add parent metadata | |
| for i, chunk in enumerate(parent_chunks): | |
| chunk.metadata["chunk_type"] = "parent" | |
| chunk.metadata["parent_chunk_id"] = chunk.chunk_id | |
| return parent_chunks | |
| def _create_child_chunks(self, parent_chunk: DocumentChunk, parent_text: str, document_id: str) -> List[DocumentChunk]: | |
| """ | |
| Create child chunks within a parent chunk | |
| Arguments: | |
| ---------- | |
| parent_chunk { DocumentChunk } : Parent chunk object | |
| parent_text { str } : Full parent text (for position reference) | |
| document_id { str } : Document ID | |
| Returns: | |
| -------- | |
| { list } : List of child chunks with parent references | |
| """ | |
| # Extract the actual text segment from parent_text using parent chunk positions | |
| parent_segment = parent_text[parent_chunk.start_char:parent_chunk.end_char] | |
| # Create child chunks within this parent segment | |
| child_chunker = FixedChunker(chunk_size = self.child_chunk_size, | |
| overlap = self.overlap, | |
| respect_sentence_boundaries = True, | |
| ) | |
| # Create child chunks with proper positioning | |
| child_chunks = child_chunker._chunk_with_sentence_boundaries(text = parent_segment, | |
| document_id = document_id, | |
| ) | |
| # Update child chunks with parent relationship and correct positions | |
| for i, child_chunk in enumerate(child_chunks): | |
| # Adjust positions to be relative to full document | |
| child_chunk.start_char += parent_chunk.start_char | |
| child_chunk.end_char += parent_chunk.start_char | |
| # Add parent relationship metadata | |
| child_chunk.metadata["chunk_type"] = "child" | |
| child_chunk.metadata["parent_chunk_id"] = parent_chunk.chunk_id | |
| child_chunk.metadata["parent_index"] = i | |
| # Update chunk ID to reflect hierarchy | |
| child_chunk.chunk_id = f"{parent_chunk.chunk_id}_child_{i}" | |
| return child_chunks | |
| def expand_to_parent_context(self, child_chunk: DocumentChunk, all_chunks: List[DocumentChunk]) -> DocumentChunk: | |
| """ | |
| Expand a child chunk to include full parent context for generation | |
| Arguments: | |
| ---------- | |
| child_chunk { DocumentChunk } : Child chunk to expand | |
| all_chunks { list } : All chunks from the document | |
| Returns: | |
| -------- | |
| { DocumentChunk } : Expanded chunk with parent context | |
| """ | |
| # Find the parent chunk | |
| parent_chunk_id = child_chunk.metadata.get("parent_chunk_id") | |
| if not parent_chunk_id: | |
| return child_chunk | |
| parent_chunk = next((c for c in all_chunks if c.chunk_id == parent_chunk_id), None) | |
| if not parent_chunk: | |
| return child_chunk | |
| # Create expanded chunk with parent context | |
| expanded_text = f"[PARENT_CONTEXT]\n{parent_chunk.text}\n\n[CHILD_CONTEXT]\n{child_chunk.text}" | |
| expanded_chunk = DocumentChunk(chunk_id = f"{child_chunk.chunk_id}_expanded", | |
| document_id = child_chunk.document_id, | |
| text = expanded_text, | |
| chunk_index = child_chunk.chunk_index, | |
| start_char = child_chunk.start_char, | |
| end_char = child_chunk.end_char, | |
| page_number = child_chunk.page_number, | |
| section_title = child_chunk.section_title, | |
| token_count = self.token_counter.count_tokens(expanded_text), | |
| parent_chunk_id = parent_chunk_id, | |
| child_chunk_ids = [child_chunk.chunk_id], | |
| metadata = {**child_chunk.metadata, "expanded": True}, | |
| ) | |
| return expanded_chunk | |
| def get_parent_child_relationships(self, chunks: List[DocumentChunk]) -> dict: | |
| """ | |
| Extract parent-child relationships from chunks | |
| Arguments: | |
| ---------- | |
| chunks { list } : List of chunks | |
| Returns: | |
| -------- | |
| { dict } : Dictionary mapping parent IDs to child chunks | |
| """ | |
| relationships = dict() | |
| for chunk in chunks: | |
| if (chunk.metadata.get("chunk_type") == "parent"): | |
| relationships[chunk.chunk_id] = {"parent" : chunk, | |
| "children" : [], | |
| } | |
| for chunk in chunks: | |
| parent_id = chunk.metadata.get("parent_chunk_id") | |
| if parent_id and parent_id in relationships: | |
| relationships[parent_id]["children"].append(chunk) | |
| return relationships | |
| def from_config(cls, config: ChunkerConfig) -> 'HierarchicalChunker': | |
| """ | |
| Create HierarchicalChunker from configuration | |
| Arguments: | |
| ---------- | |
| config { ChunkerConfig } : ChunkerConfig object | |
| Returns: | |
| -------- | |
| { HierarchicalChunker } : HierarchicalChunker instance | |
| """ | |
| return cls(parent_chunk_size = config.extra.get('parent_size', settings.PARENT_CHUNK_SIZE), | |
| child_chunk_size = config.extra.get('child_size', settings.CHILD_CHUNK_SIZE), | |
| overlap = config.overlap, | |
| min_chunk_size = config.min_chunk_size, | |
| ) |