Spaces:
Sleeping
Sleeping
| """ | |
| Text chunking utilities for large transcript processing. | |
| This module provides intelligent chunking strategies to handle large transcripts | |
| while respecting Anthropic model token limits and maintaining context coherence. | |
| """ | |
| import math | |
| from typing import List, Dict, Any, Tuple, Optional | |
| from dataclasses import dataclass | |
| from enum import Enum | |
| from models.input import TranscriptSentence | |
class ChunkingStrategy(str, Enum):
    """Enumerates the supported strategies for splitting a transcript.

    The str mixin lets strategy values round-trip through plain strings
    (e.g. config files and the ``create_chunker`` factory).
    """

    SENTENCE_COUNT = "sentence_count"        # fixed number of sentences per chunk
    TOKEN_ESTIMATE = "token_estimate"        # target an estimated token budget
    TIME_BASED = "time_based"                # fixed wall-clock duration per chunk
    SPEAKER_AWARE = "speaker_aware"          # prefer breaks at speaker changes
    SEMANTIC_BOUNDARY = "semantic_boundary"  # prefer breaks at long pauses
@dataclass
class ChunkMetadata:
    """Metadata describing a single transcript chunk.

    Fix: the ``@dataclass`` decorator was missing. The class only declares
    annotations (no ``__init__``), yet TranscriptChunker constructs it with
    keyword arguments everywhere — without the decorator every
    ``ChunkMetadata(chunk_id=..., ...)`` call raises TypeError.
    """

    chunk_id: int                        # sequential id, 0-based
    start_sentence_index: int            # sentence_index of first sentence in chunk
    end_sentence_index: int              # sentence_index of last sentence in chunk
    start_time: float                    # start time (seconds) of first sentence
    end_time: float                      # end time (seconds) of last sentence
    sentence_count: int                  # number of sentences in the chunk
    estimated_tokens: int                # heuristic token estimate for the chunk
    speakers: List[str]                  # unique speakers appearing in the chunk
    strategy_used: "ChunkingStrategy"    # strategy that produced this chunk
    overlap_sentences: int = 0           # sentences shared with the previous chunk
@dataclass
class ChunkingResult:
    """Result of a transcript chunking operation.

    Fix: the ``@dataclass`` decorator was missing. ``chunk_transcript``
    builds this with keyword arguments, which fails with TypeError on a
    plain class that declares only annotations.
    """

    chunks: List[List["TranscriptSentence"]]  # one sentence list per chunk
    metadata: List["ChunkMetadata"]           # parallel metadata, one per chunk
    total_chunks: int                         # == len(chunks)
    total_sentences: int                      # sentence count of the input transcript
    strategy_used: "ChunkingStrategy"         # strategy that produced the chunks
    overlap_enabled: bool                     # whether overlap was requested
class TranscriptChunker:
    """
    Intelligent transcript chunking for large document processing.

    Handles various chunking strategies while maintaining context coherence
    and respecting token limits for Anthropic models.
    """

    # Heuristic token-estimation factors (approximate).
    TOKENS_PER_WORD = 1.3   # conservative words->tokens factor for English
    TOKENS_PER_CHAR = 0.25  # chars->tokens fallback for non-English languages

    # Per-model context limits; every entry is 200k reduced to a safety margin.
    MODEL_TOKEN_LIMITS = dict.fromkeys(
        (
            "claude-3-5-sonnet-20241022",
            "claude-3-5-haiku-20241022",
            "claude-3-sonnet-20240229",
            "claude-3-haiku-20240307",
        ),
        180000,
    )

    # Default size parameter per strategy (units depend on the strategy).
    DEFAULT_CHUNK_SIZES = {
        ChunkingStrategy.SENTENCE_COUNT: 40,     # sentences; small for topic granularity
        ChunkingStrategy.TOKEN_ESTIMATE: 6000,   # tokens; reduced for topic extraction
        ChunkingStrategy.TIME_BASED: 300.0,      # seconds (5 minutes)
        ChunkingStrategy.SPEAKER_AWARE: 35,      # sentences; small for speaker context
        ChunkingStrategy.SEMANTIC_BOUNDARY: 45,  # sentences; small for semantic breaks
    }

    def __init__(
        self,
        strategy: ChunkingStrategy = ChunkingStrategy.TOKEN_ESTIMATE,
        chunk_size: Optional[int] = None,
        overlap_sentences: int = 3,
        model_name: str = "claude-3-5-sonnet-20241022"
    ):
        """
        Initialize the transcript chunker.

        Args:
            strategy: Chunking strategy to use.
            chunk_size: Size parameter for chunking (strategy-dependent);
                falls back to DEFAULT_CHUNK_SIZES[strategy] when omitted.
            overlap_sentences: Number of sentences to overlap between chunks.
            model_name: Target Anthropic model for token limit calculation.
        """
        self.strategy = strategy
        self.overlap_sentences = overlap_sentences
        self.model_name = model_name
        # Unknown model names fall back to the conservative 180k default.
        self.max_tokens = self.MODEL_TOKEN_LIMITS.get(model_name, 180000)
        # A falsy chunk_size (None/0) selects the strategy's default size.
        self.chunk_size = chunk_size or self.DEFAULT_CHUNK_SIZES[strategy]
| def chunk_transcript( | |
| self, | |
| sentences: List[TranscriptSentence], | |
| enable_overlap: bool = True | |
| ) -> ChunkingResult: | |
| """ | |
| Chunk a transcript using the configured strategy. | |
| Args: | |
| sentences: List of transcript sentences to chunk | |
| enable_overlap: Whether to enable sentence overlap between chunks | |
| Returns: | |
| ChunkingResult with chunks and metadata | |
| """ | |
| if not sentences: | |
| return ChunkingResult( | |
| chunks=[], | |
| metadata=[], | |
| total_chunks=0, | |
| total_sentences=0, | |
| strategy_used=self.strategy, | |
| overlap_enabled=enable_overlap | |
| ) | |
| # Choose chunking method based on strategy | |
| if self.strategy == ChunkingStrategy.SENTENCE_COUNT: | |
| chunks, metadata = self._chunk_by_sentence_count(sentences, enable_overlap) | |
| elif self.strategy == ChunkingStrategy.TOKEN_ESTIMATE: | |
| chunks, metadata = self._chunk_by_token_estimate(sentences, enable_overlap) | |
| elif self.strategy == ChunkingStrategy.TIME_BASED: | |
| chunks, metadata = self._chunk_by_time(sentences, enable_overlap) | |
| elif self.strategy == ChunkingStrategy.SPEAKER_AWARE: | |
| chunks, metadata = self._chunk_speaker_aware(sentences, enable_overlap) | |
| elif self.strategy == ChunkingStrategy.SEMANTIC_BOUNDARY: | |
| chunks, metadata = self._chunk_semantic_boundary(sentences, enable_overlap) | |
| else: | |
| raise ValueError(f"Unsupported chunking strategy: {self.strategy}") | |
| return ChunkingResult( | |
| chunks=chunks, | |
| metadata=metadata, | |
| total_chunks=len(chunks), | |
| total_sentences=len(sentences), | |
| strategy_used=self.strategy, | |
| overlap_enabled=enable_overlap | |
| ) | |
| def _chunk_by_sentence_count( | |
| self, | |
| sentences: List[TranscriptSentence], | |
| enable_overlap: bool | |
| ) -> Tuple[List[List[TranscriptSentence]], List[ChunkMetadata]]: | |
| """Chunk by fixed sentence count.""" | |
| chunks = [] | |
| metadata = [] | |
| chunk_size = int(self.chunk_size) | |
| overlap = self.overlap_sentences if enable_overlap else 0 | |
| i = 0 | |
| chunk_id = 0 | |
| while i < len(sentences): | |
| # Calculate chunk end | |
| end_idx = min(i + chunk_size, len(sentences)) | |
| chunk_sentences = sentences[i:end_idx] | |
| # Create metadata | |
| chunk_metadata = ChunkMetadata( | |
| chunk_id=chunk_id, | |
| start_sentence_index=chunk_sentences[0].sentence_index, | |
| end_sentence_index=chunk_sentences[-1].sentence_index, | |
| start_time=chunk_sentences[0].start_time, | |
| end_time=chunk_sentences[-1].end_time, | |
| sentence_count=len(chunk_sentences), | |
| estimated_tokens=self._estimate_tokens(chunk_sentences), | |
| speakers=list(set(s.speaker for s in chunk_sentences)), | |
| strategy_used=self.strategy, | |
| overlap_sentences=overlap if chunk_id > 0 else 0 | |
| ) | |
| chunks.append(chunk_sentences) | |
| metadata.append(chunk_metadata) | |
| # Move to next chunk with overlap | |
| i = end_idx - overlap if end_idx < len(sentences) else end_idx | |
| chunk_id += 1 | |
| return chunks, metadata | |
| def _chunk_by_token_estimate( | |
| self, | |
| sentences: List[TranscriptSentence], | |
| enable_overlap: bool | |
| ) -> Tuple[List[List[TranscriptSentence]], List[ChunkMetadata]]: | |
| """Chunk by estimated token count.""" | |
| chunks = [] | |
| metadata = [] | |
| target_tokens = int(self.chunk_size) | |
| overlap = self.overlap_sentences if enable_overlap else 0 | |
| i = 0 | |
| chunk_id = 0 | |
| while i < len(sentences): | |
| chunk_sentences = [] | |
| current_tokens = 0 | |
| # Add sentences until token limit | |
| j = i | |
| while j < len(sentences) and current_tokens < target_tokens: | |
| sentence_tokens = self._estimate_tokens([sentences[j]]) | |
| if current_tokens + sentence_tokens <= target_tokens or not chunk_sentences: | |
| chunk_sentences.append(sentences[j]) | |
| current_tokens += sentence_tokens | |
| j += 1 | |
| else: | |
| break | |
| if not chunk_sentences: | |
| # Single sentence exceeds token limit - include it anyway | |
| chunk_sentences = [sentences[i]] | |
| j = i + 1 | |
| # Create metadata | |
| chunk_metadata = ChunkMetadata( | |
| chunk_id=chunk_id, | |
| start_sentence_index=chunk_sentences[0].sentence_index, | |
| end_sentence_index=chunk_sentences[-1].sentence_index, | |
| start_time=chunk_sentences[0].start_time, | |
| end_time=chunk_sentences[-1].end_time, | |
| sentence_count=len(chunk_sentences), | |
| estimated_tokens=current_tokens, | |
| speakers=list(set(s.speaker for s in chunk_sentences)), | |
| strategy_used=self.strategy, | |
| overlap_sentences=overlap if chunk_id > 0 else 0 | |
| ) | |
| chunks.append(chunk_sentences) | |
| metadata.append(chunk_metadata) | |
| # Move to next chunk with overlap | |
| if j < len(sentences): | |
| i = max(j - overlap, i + 1) # Ensure progress while maintaining overlap | |
| else: | |
| i = j | |
| chunk_id += 1 | |
| return chunks, metadata | |
| def _chunk_by_time( | |
| self, | |
| sentences: List[TranscriptSentence], | |
| enable_overlap: bool | |
| ) -> Tuple[List[List[TranscriptSentence]], List[ChunkMetadata]]: | |
| """Chunk by time duration.""" | |
| chunks = [] | |
| metadata = [] | |
| target_duration = float(self.chunk_size) # seconds | |
| overlap = self.overlap_sentences if enable_overlap else 0 | |
| i = 0 | |
| chunk_id = 0 | |
| while i < len(sentences): | |
| chunk_sentences = [] | |
| start_time = sentences[i].start_time | |
| # Add sentences until time limit | |
| j = i | |
| while j < len(sentences): | |
| current_duration = sentences[j].end_time - start_time | |
| if current_duration <= target_duration or not chunk_sentences: | |
| chunk_sentences.append(sentences[j]) | |
| j += 1 | |
| else: | |
| break | |
| # Create metadata | |
| chunk_metadata = ChunkMetadata( | |
| chunk_id=chunk_id, | |
| start_sentence_index=chunk_sentences[0].sentence_index, | |
| end_sentence_index=chunk_sentences[-1].sentence_index, | |
| start_time=chunk_sentences[0].start_time, | |
| end_time=chunk_sentences[-1].end_time, | |
| sentence_count=len(chunk_sentences), | |
| estimated_tokens=self._estimate_tokens(chunk_sentences), | |
| speakers=list(set(s.speaker for s in chunk_sentences)), | |
| strategy_used=self.strategy, | |
| overlap_sentences=overlap if chunk_id > 0 else 0 | |
| ) | |
| chunks.append(chunk_sentences) | |
| metadata.append(chunk_metadata) | |
| # Move to next chunk with overlap | |
| i = max(j - overlap, j) if j < len(sentences) else j | |
| chunk_id += 1 | |
| return chunks, metadata | |
| def _chunk_speaker_aware( | |
| self, | |
| sentences: List[TranscriptSentence], | |
| enable_overlap: bool | |
| ) -> Tuple[List[List[TranscriptSentence]], List[ChunkMetadata]]: | |
| """Chunk with speaker change awareness.""" | |
| chunks = [] | |
| metadata = [] | |
| target_sentences = int(self.chunk_size) | |
| overlap = self.overlap_sentences if enable_overlap else 0 | |
| i = 0 | |
| chunk_id = 0 | |
| while i < len(sentences): | |
| chunk_sentences = [] | |
| # Add sentences, preferring speaker boundaries | |
| j = i | |
| while j < len(sentences) and len(chunk_sentences) < target_sentences: | |
| chunk_sentences.append(sentences[j]) | |
| j += 1 | |
| # Check for natural speaker boundary | |
| if (j < len(sentences) and | |
| len(chunk_sentences) >= target_sentences // 2 and | |
| sentences[j].speaker != sentences[j-1].speaker): | |
| break | |
| # Create metadata | |
| chunk_metadata = ChunkMetadata( | |
| chunk_id=chunk_id, | |
| start_sentence_index=chunk_sentences[0].sentence_index, | |
| end_sentence_index=chunk_sentences[-1].sentence_index, | |
| start_time=chunk_sentences[0].start_time, | |
| end_time=chunk_sentences[-1].end_time, | |
| sentence_count=len(chunk_sentences), | |
| estimated_tokens=self._estimate_tokens(chunk_sentences), | |
| speakers=list(set(s.speaker for s in chunk_sentences)), | |
| strategy_used=self.strategy, | |
| overlap_sentences=overlap if chunk_id > 0 else 0 | |
| ) | |
| chunks.append(chunk_sentences) | |
| metadata.append(chunk_metadata) | |
| # Move to next chunk with overlap | |
| i = max(j - overlap, j) if j < len(sentences) else j | |
| chunk_id += 1 | |
| return chunks, metadata | |
| def _chunk_semantic_boundary( | |
| self, | |
| sentences: List[TranscriptSentence], | |
| enable_overlap: bool | |
| ) -> Tuple[List[List[TranscriptSentence]], List[ChunkMetadata]]: | |
| """Chunk with semantic boundary detection (simplified).""" | |
| # For now, use sentence count with pause-based boundaries | |
| # This could be enhanced with NLP techniques in the future | |
| chunks = [] | |
| metadata = [] | |
| target_sentences = int(self.chunk_size) | |
| overlap = self.overlap_sentences if enable_overlap else 0 | |
| i = 0 | |
| chunk_id = 0 | |
| while i < len(sentences): | |
| chunk_sentences = [] | |
| # Add sentences, looking for natural pauses | |
| j = i | |
| while j < len(sentences) and len(chunk_sentences) < target_sentences: | |
| chunk_sentences.append(sentences[j]) | |
| j += 1 | |
| # Look for natural pause (gap > 2 seconds) | |
| if (j < len(sentences) and | |
| len(chunk_sentences) >= target_sentences // 2 and | |
| sentences[j].start_time - sentences[j-1].end_time > 2.0): | |
| break | |
| # Create metadata | |
| chunk_metadata = ChunkMetadata( | |
| chunk_id=chunk_id, | |
| start_sentence_index=chunk_sentences[0].sentence_index, | |
| end_sentence_index=chunk_sentences[-1].sentence_index, | |
| start_time=chunk_sentences[0].start_time, | |
| end_time=chunk_sentences[-1].end_time, | |
| sentence_count=len(chunk_sentences), | |
| estimated_tokens=self._estimate_tokens(chunk_sentences), | |
| speakers=list(set(s.speaker for s in chunk_sentences)), | |
| strategy_used=self.strategy, | |
| overlap_sentences=overlap if chunk_id > 0 else 0 | |
| ) | |
| chunks.append(chunk_sentences) | |
| metadata.append(chunk_metadata) | |
| # Move to next chunk with overlap | |
| i = max(j - overlap, j) if j < len(sentences) else j | |
| chunk_id += 1 | |
| return chunks, metadata | |
| def _estimate_tokens(self, sentences: List[TranscriptSentence]) -> int: | |
| """Estimate token count for a list of sentences.""" | |
| total_tokens = 0 | |
| for sentence in sentences: | |
| # Count words for English-like languages | |
| word_count = len(sentence.text.split()) | |
| char_count = len(sentence.text) | |
| # Use word-based estimation for English, character-based for others | |
| if hasattr(sentence, 'language') and sentence.language: | |
| if sentence.language.value in ['en']: | |
| tokens = word_count * self.TOKENS_PER_WORD | |
| else: | |
| tokens = char_count * self.TOKENS_PER_CHAR | |
| else: | |
| # Default to word-based estimation | |
| tokens = word_count * self.TOKENS_PER_WORD | |
| total_tokens += tokens | |
| return int(total_tokens) | |
| def get_optimal_strategy( | |
| self, | |
| sentences: List[TranscriptSentence], | |
| target_chunks: Optional[int] = None | |
| ) -> ChunkingStrategy: | |
| """ | |
| Recommend optimal chunking strategy based on transcript characteristics. | |
| Args: | |
| sentences: Transcript sentences to analyze | |
| target_chunks: Desired number of chunks (optional) | |
| Returns: | |
| Recommended chunking strategy | |
| """ | |
| if not sentences: | |
| return ChunkingStrategy.SENTENCE_COUNT | |
| total_sentences = len(sentences) | |
| total_duration = sentences[-1].end_time - sentences[0].start_time | |
| unique_speakers = len(set(s.speaker for s in sentences)) | |
| estimated_tokens = self._estimate_tokens(sentences) | |
| # Very large transcripts - use token estimation | |
| if estimated_tokens > self.max_tokens * 0.8: | |
| return ChunkingStrategy.TOKEN_ESTIMATE | |
| # Large transcripts (>200 sentences) - prefer sentence chunking for better granularity | |
| if total_sentences > 200: | |
| return ChunkingStrategy.SENTENCE_COUNT | |
| # Many speakers - use speaker-aware chunking | |
| if unique_speakers > 5: | |
| return ChunkingStrategy.SPEAKER_AWARE | |
| # Long duration - use time-based chunking | |
| if total_duration > 3600: # > 1 hour | |
| return ChunkingStrategy.TIME_BASED | |
| # Medium transcripts (>50 sentences) - use sentence chunking | |
| if total_sentences > 50: | |
| return ChunkingStrategy.SENTENCE_COUNT | |
| # Default to sentence count for moderate transcripts | |
| return ChunkingStrategy.SENTENCE_COUNT | |
| def validate_chunks(self, result: ChunkingResult) -> List[str]: | |
| """ | |
| Validate chunking result and return any warnings. | |
| Args: | |
| result: Chunking result to validate | |
| Returns: | |
| List of warning messages | |
| """ | |
| warnings = [] | |
| # Check for empty chunks | |
| empty_chunks = [i for i, chunk in enumerate(result.chunks) if not chunk] | |
| if empty_chunks: | |
| warnings.append(f"Empty chunks found at indices: {empty_chunks}") | |
| # Check for oversized chunks (token-wise) | |
| for i, metadata in enumerate(result.metadata): | |
| if metadata.estimated_tokens > self.max_tokens: | |
| warnings.append( | |
| f"Chunk {i} exceeds token limit: {metadata.estimated_tokens} > {self.max_tokens}" | |
| ) | |
| # Check for very small chunks (except last) | |
| min_sentences = 3 | |
| small_chunks = [ | |
| i for i, metadata in enumerate(result.metadata[:-1]) | |
| if metadata.sentence_count < min_sentences | |
| ] | |
| if small_chunks: | |
| warnings.append(f"Very small chunks found at indices: {small_chunks}") | |
| # Check for gaps in coverage | |
| for i in range(len(result.metadata) - 1): | |
| current_end = result.metadata[i].end_sentence_index | |
| next_start = result.metadata[i + 1].start_sentence_index | |
| expected_gap = result.metadata[i + 1].overlap_sentences | |
| actual_gap = current_end - next_start + 1 | |
| if actual_gap != expected_gap: | |
| warnings.append( | |
| f"Unexpected gap between chunks {i} and {i + 1}: " | |
| f"expected {expected_gap}, got {actual_gap}" | |
| ) | |
| return warnings | |
def create_chunker(
    strategy: str = "token_estimate",
    chunk_size: Optional[int] = None,
    overlap_sentences: int = 3,
    model_name: str = "claude-3-5-sonnet-20241022"
) -> TranscriptChunker:
    """
    Factory function to create a transcript chunker.

    Args:
        strategy: Chunking strategy name (must match a ChunkingStrategy value).
        chunk_size: Size parameter for chunking.
        overlap_sentences: Number of sentences to overlap.
        model_name: Target Anthropic model.

    Returns:
        Configured TranscriptChunker instance.

    Raises:
        ValueError: If ``strategy`` is not a valid ChunkingStrategy value.
    """
    return TranscriptChunker(
        strategy=ChunkingStrategy(strategy),
        chunk_size=chunk_size,
        overlap_sentences=overlap_sentences,
        model_name=model_name
    )