""" Text chunking utilities for large transcript processing. This module provides intelligent chunking strategies to handle large transcripts while respecting Anthropic model token limits and maintaining context coherence. """ import math from typing import List, Dict, Any, Tuple, Optional from dataclasses import dataclass from enum import Enum from models.input import TranscriptSentence class ChunkingStrategy(str, Enum): """Available chunking strategies for transcript processing.""" SENTENCE_COUNT = "sentence_count" TOKEN_ESTIMATE = "token_estimate" TIME_BASED = "time_based" SPEAKER_AWARE = "speaker_aware" SEMANTIC_BOUNDARY = "semantic_boundary" @dataclass class ChunkMetadata: """Metadata for a transcript chunk.""" chunk_id: int start_sentence_index: int end_sentence_index: int start_time: float end_time: float sentence_count: int estimated_tokens: int speakers: List[str] strategy_used: ChunkingStrategy overlap_sentences: int = 0 @dataclass class ChunkingResult: """Result of transcript chunking operation.""" chunks: List[List[TranscriptSentence]] metadata: List[ChunkMetadata] total_chunks: int total_sentences: int strategy_used: ChunkingStrategy overlap_enabled: bool class TranscriptChunker: """ Intelligent transcript chunking for large document processing. Handles various chunking strategies while maintaining context coherence and respecting token limits for Anthropic models. """ # Token estimation constants (approximate) TOKENS_PER_WORD = 1.3 # Conservative estimate for English TOKENS_PER_CHAR = 0.25 # For non-English languages # Model token limits (with safety margin) MODEL_TOKEN_LIMITS = { "claude-3-5-sonnet-20241022": 180000, # 200k with margin "claude-3-5-haiku-20241022": 180000, # 200k with margin "claude-3-sonnet-20240229": 180000, # 200k with margin "claude-3-haiku-20240307": 180000, # 200k with margin } # Default chunk sizes for different strategies DEFAULT_CHUNK_SIZES = { ChunkingStrategy.SENTENCE_COUNT: 40, # Smaller for better topic granularity ChunkingStrategy.TOKEN_ESTIMATE: 6000, # Reduced for better topic extraction ChunkingStrategy.TIME_BASED: 300.0, # 5 minutes ChunkingStrategy.SPEAKER_AWARE: 35, # Smaller for better speaker context ChunkingStrategy.SEMANTIC_BOUNDARY: 45 # Smaller for better semantic boundaries } def __init__( self, strategy: ChunkingStrategy = ChunkingStrategy.TOKEN_ESTIMATE, chunk_size: Optional[int] = None, overlap_sentences: int = 3, model_name: str = "claude-3-5-sonnet-20241022" ): """ Initialize the transcript chunker. Args: strategy: Chunking strategy to use chunk_size: Size parameter for chunking (strategy-dependent) overlap_sentences: Number of sentences to overlap between chunks model_name: Target Anthropic model for token limit calculation """ self.strategy = strategy self.chunk_size = chunk_size or self.DEFAULT_CHUNK_SIZES[strategy] self.overlap_sentences = overlap_sentences self.model_name = model_name self.max_tokens = self.MODEL_TOKEN_LIMITS.get(model_name, 180000) def chunk_transcript( self, sentences: List[TranscriptSentence], enable_overlap: bool = True ) -> ChunkingResult: """ Chunk a transcript using the configured strategy. Args: sentences: List of transcript sentences to chunk enable_overlap: Whether to enable sentence overlap between chunks Returns: ChunkingResult with chunks and metadata """ if not sentences: return ChunkingResult( chunks=[], metadata=[], total_chunks=0, total_sentences=0, strategy_used=self.strategy, overlap_enabled=enable_overlap ) # Choose chunking method based on strategy if self.strategy == ChunkingStrategy.SENTENCE_COUNT: chunks, metadata = self._chunk_by_sentence_count(sentences, enable_overlap) elif self.strategy == ChunkingStrategy.TOKEN_ESTIMATE: chunks, metadata = self._chunk_by_token_estimate(sentences, enable_overlap) elif self.strategy == ChunkingStrategy.TIME_BASED: chunks, metadata = self._chunk_by_time(sentences, enable_overlap) elif self.strategy == ChunkingStrategy.SPEAKER_AWARE: chunks, metadata = self._chunk_speaker_aware(sentences, enable_overlap) elif self.strategy == ChunkingStrategy.SEMANTIC_BOUNDARY: chunks, metadata = self._chunk_semantic_boundary(sentences, enable_overlap) else: raise ValueError(f"Unsupported chunking strategy: {self.strategy}") return ChunkingResult( chunks=chunks, metadata=metadata, total_chunks=len(chunks), total_sentences=len(sentences), strategy_used=self.strategy, overlap_enabled=enable_overlap ) def _chunk_by_sentence_count( self, sentences: List[TranscriptSentence], enable_overlap: bool ) -> Tuple[List[List[TranscriptSentence]], List[ChunkMetadata]]: """Chunk by fixed sentence count.""" chunks = [] metadata = [] chunk_size = int(self.chunk_size) overlap = self.overlap_sentences if enable_overlap else 0 i = 0 chunk_id = 0 while i < len(sentences): # Calculate chunk end end_idx = min(i + chunk_size, len(sentences)) chunk_sentences = sentences[i:end_idx] # Create metadata chunk_metadata = ChunkMetadata( chunk_id=chunk_id, start_sentence_index=chunk_sentences[0].sentence_index, end_sentence_index=chunk_sentences[-1].sentence_index, start_time=chunk_sentences[0].start_time, end_time=chunk_sentences[-1].end_time, sentence_count=len(chunk_sentences), estimated_tokens=self._estimate_tokens(chunk_sentences), speakers=list(set(s.speaker for s in chunk_sentences)), strategy_used=self.strategy, overlap_sentences=overlap if chunk_id > 0 else 0 ) chunks.append(chunk_sentences) metadata.append(chunk_metadata) # Move to next chunk with overlap i = end_idx - overlap if end_idx < len(sentences) else end_idx chunk_id += 1 return chunks, metadata def _chunk_by_token_estimate( self, sentences: List[TranscriptSentence], enable_overlap: bool ) -> Tuple[List[List[TranscriptSentence]], List[ChunkMetadata]]: """Chunk by estimated token count.""" chunks = [] metadata = [] target_tokens = int(self.chunk_size) overlap = self.overlap_sentences if enable_overlap else 0 i = 0 chunk_id = 0 while i < len(sentences): chunk_sentences = [] current_tokens = 0 # Add sentences until token limit j = i while j < len(sentences) and current_tokens < target_tokens: sentence_tokens = self._estimate_tokens([sentences[j]]) if current_tokens + sentence_tokens <= target_tokens or not chunk_sentences: chunk_sentences.append(sentences[j]) current_tokens += sentence_tokens j += 1 else: break if not chunk_sentences: # Single sentence exceeds token limit - include it anyway chunk_sentences = [sentences[i]] j = i + 1 # Create metadata chunk_metadata = ChunkMetadata( chunk_id=chunk_id, start_sentence_index=chunk_sentences[0].sentence_index, end_sentence_index=chunk_sentences[-1].sentence_index, start_time=chunk_sentences[0].start_time, end_time=chunk_sentences[-1].end_time, sentence_count=len(chunk_sentences), estimated_tokens=current_tokens, speakers=list(set(s.speaker for s in chunk_sentences)), strategy_used=self.strategy, overlap_sentences=overlap if chunk_id > 0 else 0 ) chunks.append(chunk_sentences) metadata.append(chunk_metadata) # Move to next chunk with overlap if j < len(sentences): i = max(j - overlap, i + 1) # Ensure progress while maintaining overlap else: i = j chunk_id += 1 return chunks, metadata def _chunk_by_time( self, sentences: List[TranscriptSentence], enable_overlap: bool ) -> Tuple[List[List[TranscriptSentence]], List[ChunkMetadata]]: """Chunk by time duration.""" chunks = [] metadata = [] target_duration = float(self.chunk_size) # seconds overlap = self.overlap_sentences if enable_overlap else 0 i = 0 chunk_id = 0 while i < len(sentences): chunk_sentences = [] start_time = sentences[i].start_time # Add sentences until time limit j = i while j < len(sentences): current_duration = sentences[j].end_time - start_time if current_duration <= target_duration or not chunk_sentences: chunk_sentences.append(sentences[j]) j += 1 else: break # Create metadata chunk_metadata = ChunkMetadata( chunk_id=chunk_id, start_sentence_index=chunk_sentences[0].sentence_index, end_sentence_index=chunk_sentences[-1].sentence_index, start_time=chunk_sentences[0].start_time, end_time=chunk_sentences[-1].end_time, sentence_count=len(chunk_sentences), estimated_tokens=self._estimate_tokens(chunk_sentences), speakers=list(set(s.speaker for s in chunk_sentences)), strategy_used=self.strategy, overlap_sentences=overlap if chunk_id > 0 else 0 ) chunks.append(chunk_sentences) metadata.append(chunk_metadata) # Move to next chunk with overlap i = max(j - overlap, j) if j < len(sentences) else j chunk_id += 1 return chunks, metadata def _chunk_speaker_aware( self, sentences: List[TranscriptSentence], enable_overlap: bool ) -> Tuple[List[List[TranscriptSentence]], List[ChunkMetadata]]: """Chunk with speaker change awareness.""" chunks = [] metadata = [] target_sentences = int(self.chunk_size) overlap = self.overlap_sentences if enable_overlap else 0 i = 0 chunk_id = 0 while i < len(sentences): chunk_sentences = [] # Add sentences, preferring speaker boundaries j = i while j < len(sentences) and len(chunk_sentences) < target_sentences: chunk_sentences.append(sentences[j]) j += 1 # Check for natural speaker boundary if (j < len(sentences) and len(chunk_sentences) >= target_sentences // 2 and sentences[j].speaker != sentences[j-1].speaker): break # Create metadata chunk_metadata = ChunkMetadata( chunk_id=chunk_id, start_sentence_index=chunk_sentences[0].sentence_index, end_sentence_index=chunk_sentences[-1].sentence_index, start_time=chunk_sentences[0].start_time, end_time=chunk_sentences[-1].end_time, sentence_count=len(chunk_sentences), estimated_tokens=self._estimate_tokens(chunk_sentences), speakers=list(set(s.speaker for s in chunk_sentences)), strategy_used=self.strategy, overlap_sentences=overlap if chunk_id > 0 else 0 ) chunks.append(chunk_sentences) metadata.append(chunk_metadata) # Move to next chunk with overlap i = max(j - overlap, j) if j < len(sentences) else j chunk_id += 1 return chunks, metadata def _chunk_semantic_boundary( self, sentences: List[TranscriptSentence], enable_overlap: bool ) -> Tuple[List[List[TranscriptSentence]], List[ChunkMetadata]]: """Chunk with semantic boundary detection (simplified).""" # For now, use sentence count with pause-based boundaries # This could be enhanced with NLP techniques in the future chunks = [] metadata = [] target_sentences = int(self.chunk_size) overlap = self.overlap_sentences if enable_overlap else 0 i = 0 chunk_id = 0 while i < len(sentences): chunk_sentences = [] # Add sentences, looking for natural pauses j = i while j < len(sentences) and len(chunk_sentences) < target_sentences: chunk_sentences.append(sentences[j]) j += 1 # Look for natural pause (gap > 2 seconds) if (j < len(sentences) and len(chunk_sentences) >= target_sentences // 2 and sentences[j].start_time - sentences[j-1].end_time > 2.0): break # Create metadata chunk_metadata = ChunkMetadata( chunk_id=chunk_id, start_sentence_index=chunk_sentences[0].sentence_index, end_sentence_index=chunk_sentences[-1].sentence_index, start_time=chunk_sentences[0].start_time, end_time=chunk_sentences[-1].end_time, sentence_count=len(chunk_sentences), estimated_tokens=self._estimate_tokens(chunk_sentences), speakers=list(set(s.speaker for s in chunk_sentences)), strategy_used=self.strategy, overlap_sentences=overlap if chunk_id > 0 else 0 ) chunks.append(chunk_sentences) metadata.append(chunk_metadata) # Move to next chunk with overlap i = max(j - overlap, j) if j < len(sentences) else j chunk_id += 1 return chunks, metadata def _estimate_tokens(self, sentences: List[TranscriptSentence]) -> int: """Estimate token count for a list of sentences.""" total_tokens = 0 for sentence in sentences: # Count words for English-like languages word_count = len(sentence.text.split()) char_count = len(sentence.text) # Use word-based estimation for English, character-based for others if hasattr(sentence, 'language') and sentence.language: if sentence.language.value in ['en']: tokens = word_count * self.TOKENS_PER_WORD else: tokens = char_count * self.TOKENS_PER_CHAR else: # Default to word-based estimation tokens = word_count * self.TOKENS_PER_WORD total_tokens += tokens return int(total_tokens) def get_optimal_strategy( self, sentences: List[TranscriptSentence], target_chunks: Optional[int] = None ) -> ChunkingStrategy: """ Recommend optimal chunking strategy based on transcript characteristics. Args: sentences: Transcript sentences to analyze target_chunks: Desired number of chunks (optional) Returns: Recommended chunking strategy """ if not sentences: return ChunkingStrategy.SENTENCE_COUNT total_sentences = len(sentences) total_duration = sentences[-1].end_time - sentences[0].start_time unique_speakers = len(set(s.speaker for s in sentences)) estimated_tokens = self._estimate_tokens(sentences) # Very large transcripts - use token estimation if estimated_tokens > self.max_tokens * 0.8: return ChunkingStrategy.TOKEN_ESTIMATE # Large transcripts (>200 sentences) - prefer sentence chunking for better granularity if total_sentences > 200: return ChunkingStrategy.SENTENCE_COUNT # Many speakers - use speaker-aware chunking if unique_speakers > 5: return ChunkingStrategy.SPEAKER_AWARE # Long duration - use time-based chunking if total_duration > 3600: # > 1 hour return ChunkingStrategy.TIME_BASED # Medium transcripts (>50 sentences) - use sentence chunking if total_sentences > 50: return ChunkingStrategy.SENTENCE_COUNT # Default to sentence count for moderate transcripts return ChunkingStrategy.SENTENCE_COUNT def validate_chunks(self, result: ChunkingResult) -> List[str]: """ Validate chunking result and return any warnings. Args: result: Chunking result to validate Returns: List of warning messages """ warnings = [] # Check for empty chunks empty_chunks = [i for i, chunk in enumerate(result.chunks) if not chunk] if empty_chunks: warnings.append(f"Empty chunks found at indices: {empty_chunks}") # Check for oversized chunks (token-wise) for i, metadata in enumerate(result.metadata): if metadata.estimated_tokens > self.max_tokens: warnings.append( f"Chunk {i} exceeds token limit: {metadata.estimated_tokens} > {self.max_tokens}" ) # Check for very small chunks (except last) min_sentences = 3 small_chunks = [ i for i, metadata in enumerate(result.metadata[:-1]) if metadata.sentence_count < min_sentences ] if small_chunks: warnings.append(f"Very small chunks found at indices: {small_chunks}") # Check for gaps in coverage for i in range(len(result.metadata) - 1): current_end = result.metadata[i].end_sentence_index next_start = result.metadata[i + 1].start_sentence_index expected_gap = result.metadata[i + 1].overlap_sentences actual_gap = current_end - next_start + 1 if actual_gap != expected_gap: warnings.append( f"Unexpected gap between chunks {i} and {i + 1}: " f"expected {expected_gap}, got {actual_gap}" ) return warnings def create_chunker( strategy: str = "token_estimate", chunk_size: Optional[int] = None, overlap_sentences: int = 3, model_name: str = "claude-3-5-sonnet-20241022" ) -> TranscriptChunker: """ Factory function to create a transcript chunker. Args: strategy: Chunking strategy name chunk_size: Size parameter for chunking overlap_sentences: Number of sentences to overlap model_name: Target Anthropic model Returns: Configured TranscriptChunker instance """ strategy_enum = ChunkingStrategy(strategy) return TranscriptChunker( strategy=strategy_enum, chunk_size=chunk_size, overlap_sentences=overlap_sentences, model_name=model_name )