# NOTE: removed a non-code upload-page artifact ("Yeetek's picture / Upload 20
# files / 68171c1 verified") that preceded the module docstring and would not parse.
"""
Text chunking utilities for large transcript processing.
This module provides intelligent chunking strategies to handle large transcripts
while respecting Anthropic model token limits and maintaining context coherence.
"""
import math
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from enum import Enum
from models.input import TranscriptSentence
class ChunkingStrategy(str, Enum):
    """Available chunking strategies for transcript processing."""

    SENTENCE_COUNT = "sentence_count"        # fixed number of sentences per chunk
    TOKEN_ESTIMATE = "token_estimate"        # target estimated-token budget per chunk
    TIME_BASED = "time_based"                # fixed time-window duration per chunk
    SPEAKER_AWARE = "speaker_aware"          # prefer breaking chunks at speaker changes
    SEMANTIC_BOUNDARY = "semantic_boundary"  # prefer breaking chunks at long pauses
@dataclass
class ChunkMetadata:
    """Metadata for a transcript chunk."""

    chunk_id: int                    # 0-based position of the chunk in the sequence
    start_sentence_index: int        # sentence_index of the chunk's first sentence
    end_sentence_index: int          # sentence_index of the chunk's last sentence
    start_time: float                # start_time of the first sentence (seconds)
    end_time: float                  # end_time of the last sentence (seconds)
    sentence_count: int              # number of sentences in the chunk
    estimated_tokens: int            # heuristic token estimate for the chunk's text
    speakers: List[str]              # unique speakers appearing in the chunk
    strategy_used: ChunkingStrategy  # strategy that produced this chunk
    overlap_sentences: int = 0       # sentences shared with the previous chunk
@dataclass
class ChunkingResult:
    """Result of transcript chunking operation."""

    chunks: List[List[TranscriptSentence]]  # the chunked sentences themselves
    metadata: List[ChunkMetadata]           # per-chunk metadata, parallel to `chunks`
    total_chunks: int                       # == len(chunks)
    total_sentences: int                    # number of input sentences that were chunked
    strategy_used: ChunkingStrategy         # strategy that produced the chunks
    overlap_enabled: bool                   # whether sentence overlap was requested
class TranscriptChunker:
    """
    Intelligent transcript chunking for large document processing.

    Handles various chunking strategies while maintaining context coherence
    and respecting token limits for Anthropic models.
    """

    # Token estimation constants (approximate).
    TOKENS_PER_WORD = 1.3   # conservative estimate for English
    TOKENS_PER_CHAR = 0.25  # for non-English languages

    # Model token limits (with safety margin).
    MODEL_TOKEN_LIMITS = {
        "claude-3-5-sonnet-20241022": 180000,  # 200k with margin
        "claude-3-5-haiku-20241022": 180000,   # 200k with margin
        "claude-3-sonnet-20240229": 180000,    # 200k with margin
        "claude-3-haiku-20240307": 180000,     # 200k with margin
    }

    # Default chunk sizes per strategy. Units are strategy-dependent:
    # sentences for count-based strategies, tokens for TOKEN_ESTIMATE,
    # seconds for TIME_BASED.
    DEFAULT_CHUNK_SIZES = {
        ChunkingStrategy.SENTENCE_COUNT: 40,       # smaller for better topic granularity
        ChunkingStrategy.TOKEN_ESTIMATE: 6000,     # reduced for better topic extraction
        ChunkingStrategy.TIME_BASED: 300.0,        # 5 minutes
        ChunkingStrategy.SPEAKER_AWARE: 35,        # smaller for better speaker context
        ChunkingStrategy.SEMANTIC_BOUNDARY: 45,    # smaller for better semantic boundaries
    }

    def __init__(
        self,
        strategy: ChunkingStrategy = ChunkingStrategy.TOKEN_ESTIMATE,
        chunk_size: Optional[int] = None,
        overlap_sentences: int = 3,
        model_name: str = "claude-3-5-sonnet-20241022"
    ):
        """
        Initialize the transcript chunker.

        Args:
            strategy: Chunking strategy to use
            chunk_size: Size parameter for chunking (strategy-dependent);
                falsy values (None, 0) fall back to the strategy default
            overlap_sentences: Number of sentences to overlap between chunks
                (negative values are clamped to 0)
            model_name: Target Anthropic model for token limit calculation
        """
        self.strategy = strategy
        # `or` means 0 (as well as None) selects the strategy default.
        self.chunk_size = chunk_size or self.DEFAULT_CHUNK_SIZES[strategy]
        # A negative overlap would silently skip sentences between chunks.
        self.overlap_sentences = max(0, overlap_sentences)
        self.model_name = model_name
        self.max_tokens = self.MODEL_TOKEN_LIMITS.get(model_name, 180000)

    def chunk_transcript(
        self,
        sentences: List[TranscriptSentence],
        enable_overlap: bool = True
    ) -> ChunkingResult:
        """
        Chunk a transcript using the configured strategy.

        Args:
            sentences: List of transcript sentences to chunk
            enable_overlap: Whether to enable sentence overlap between chunks

        Returns:
            ChunkingResult with chunks and metadata

        Raises:
            ValueError: If the configured strategy is not supported.
        """
        if not sentences:
            return ChunkingResult(
                chunks=[],
                metadata=[],
                total_chunks=0,
                total_sentences=0,
                strategy_used=self.strategy,
                overlap_enabled=enable_overlap
            )

        # Dispatch table instead of an if/elif chain.
        handlers = {
            ChunkingStrategy.SENTENCE_COUNT: self._chunk_by_sentence_count,
            ChunkingStrategy.TOKEN_ESTIMATE: self._chunk_by_token_estimate,
            ChunkingStrategy.TIME_BASED: self._chunk_by_time,
            ChunkingStrategy.SPEAKER_AWARE: self._chunk_speaker_aware,
            ChunkingStrategy.SEMANTIC_BOUNDARY: self._chunk_semantic_boundary,
        }
        handler = handlers.get(self.strategy)
        if handler is None:
            raise ValueError(f"Unsupported chunking strategy: {self.strategy}")
        chunks, metadata = handler(sentences, enable_overlap)

        return ChunkingResult(
            chunks=chunks,
            metadata=metadata,
            total_chunks=len(chunks),
            total_sentences=len(sentences),
            strategy_used=self.strategy,
            overlap_enabled=enable_overlap
        )

    def _build_metadata(
        self,
        chunk_id: int,
        chunk_sentences: List[TranscriptSentence],
        overlap_used: int
    ) -> ChunkMetadata:
        """Build ChunkMetadata for one non-empty chunk (shared by all strategies).

        `overlap_used` is the ACTUAL number of sentences shared with the
        previous chunk, so validate_chunks() sees consistent numbers even when
        the forward-progress clamp reduces the configured overlap.
        """
        return ChunkMetadata(
            chunk_id=chunk_id,
            start_sentence_index=chunk_sentences[0].sentence_index,
            end_sentence_index=chunk_sentences[-1].sentence_index,
            start_time=chunk_sentences[0].start_time,
            end_time=chunk_sentences[-1].end_time,
            sentence_count=len(chunk_sentences),
            estimated_tokens=self._estimate_tokens(chunk_sentences),
            # sorted() for a deterministic speaker order (list(set(...)) was
            # hash-order dependent).
            speakers=sorted({s.speaker for s in chunk_sentences}),
            strategy_used=self.strategy,
            overlap_sentences=overlap_used,
        )

    def _chunk_by_sentence_count(
        self,
        sentences: List[TranscriptSentence],
        enable_overlap: bool
    ) -> Tuple[List[List[TranscriptSentence]], List[ChunkMetadata]]:
        """Chunk by fixed sentence count."""
        chunks: List[List[TranscriptSentence]] = []
        metadata: List[ChunkMetadata] = []
        chunk_size = int(self.chunk_size)
        overlap = self.overlap_sentences if enable_overlap else 0
        i = 0
        chunk_id = 0
        prev_end = 0  # exclusive list index where the previous chunk ended
        while i < len(sentences):
            end_idx = min(i + chunk_size, len(sentences))
            chunk_sentences = sentences[i:end_idx]
            metadata.append(
                self._build_metadata(chunk_id, chunk_sentences, max(0, prev_end - i))
            )
            chunks.append(chunk_sentences)
            prev_end = end_idx
            # BUG FIX: clamp to i + 1 so overlap >= chunk_size cannot stall the
            # loop (the original `end_idx - overlap` could fail to advance).
            i = max(end_idx - overlap, i + 1) if end_idx < len(sentences) else end_idx
            chunk_id += 1
        return chunks, metadata

    def _chunk_by_token_estimate(
        self,
        sentences: List[TranscriptSentence],
        enable_overlap: bool
    ) -> Tuple[List[List[TranscriptSentence]], List[ChunkMetadata]]:
        """Chunk by estimated token count."""
        chunks: List[List[TranscriptSentence]] = []
        metadata: List[ChunkMetadata] = []
        target_tokens = int(self.chunk_size)
        overlap = self.overlap_sentences if enable_overlap else 0
        i = 0
        chunk_id = 0
        prev_end = 0  # exclusive list index where the previous chunk ended
        while i < len(sentences):
            chunk_sentences: List[TranscriptSentence] = []
            current_tokens = 0
            j = i
            while j < len(sentences):
                sentence_tokens = self._estimate_tokens([sentences[j]])
                # Always take at least one sentence so a single oversized
                # sentence still makes progress instead of producing an
                # empty chunk.
                if chunk_sentences and current_tokens + sentence_tokens > target_tokens:
                    break
                chunk_sentences.append(sentences[j])
                current_tokens += sentence_tokens
                j += 1
            metadata.append(
                self._build_metadata(chunk_id, chunk_sentences, max(0, prev_end - i))
            )
            chunks.append(chunk_sentences)
            prev_end = j
            # Step back by the overlap but always advance by at least one
            # sentence so the loop terminates.
            i = max(j - overlap, i + 1) if j < len(sentences) else j
            chunk_id += 1
        return chunks, metadata

    def _chunk_by_time(
        self,
        sentences: List[TranscriptSentence],
        enable_overlap: bool
    ) -> Tuple[List[List[TranscriptSentence]], List[ChunkMetadata]]:
        """Chunk by time duration."""
        chunks: List[List[TranscriptSentence]] = []
        metadata: List[ChunkMetadata] = []
        target_duration = float(self.chunk_size)  # seconds
        overlap = self.overlap_sentences if enable_overlap else 0
        i = 0
        chunk_id = 0
        prev_end = 0  # exclusive list index where the previous chunk ended
        while i < len(sentences):
            chunk_sentences: List[TranscriptSentence] = []
            window_start = sentences[i].start_time
            j = i
            while j < len(sentences):
                # Always take at least one sentence per chunk.
                if chunk_sentences and sentences[j].end_time - window_start > target_duration:
                    break
                chunk_sentences.append(sentences[j])
                j += 1
            metadata.append(
                self._build_metadata(chunk_id, chunk_sentences, max(0, prev_end - i))
            )
            chunks.append(chunk_sentences)
            prev_end = j
            # BUG FIX: the original used max(j - overlap, j), which is always
            # j, so the configured overlap was never applied for this strategy.
            i = max(j - overlap, i + 1) if j < len(sentences) else j
            chunk_id += 1
        return chunks, metadata

    def _chunk_speaker_aware(
        self,
        sentences: List[TranscriptSentence],
        enable_overlap: bool
    ) -> Tuple[List[List[TranscriptSentence]], List[ChunkMetadata]]:
        """Chunk with speaker change awareness."""
        chunks: List[List[TranscriptSentence]] = []
        metadata: List[ChunkMetadata] = []
        target_sentences = int(self.chunk_size)
        overlap = self.overlap_sentences if enable_overlap else 0
        i = 0
        chunk_id = 0
        prev_end = 0  # exclusive list index where the previous chunk ended
        while i < len(sentences):
            chunk_sentences: List[TranscriptSentence] = []
            j = i
            while j < len(sentences) and len(chunk_sentences) < target_sentences:
                chunk_sentences.append(sentences[j])
                j += 1
                # Break early at a speaker change once the chunk is at least
                # half the target size, to keep speaker turns together.
                if (j < len(sentences)
                        and len(chunk_sentences) >= target_sentences // 2
                        and sentences[j].speaker != sentences[j - 1].speaker):
                    break
            metadata.append(
                self._build_metadata(chunk_id, chunk_sentences, max(0, prev_end - i))
            )
            chunks.append(chunk_sentences)
            prev_end = j
            # BUG FIX: was max(j - overlap, j) == j; overlap never applied.
            i = max(j - overlap, i + 1) if j < len(sentences) else j
            chunk_id += 1
        return chunks, metadata

    def _chunk_semantic_boundary(
        self,
        sentences: List[TranscriptSentence],
        enable_overlap: bool
    ) -> Tuple[List[List[TranscriptSentence]], List[ChunkMetadata]]:
        """Chunk with semantic boundary detection (simplified).

        Currently approximates semantic boundaries with pauses in the audio
        timeline; this could be enhanced with NLP techniques in the future.
        """
        chunks: List[List[TranscriptSentence]] = []
        metadata: List[ChunkMetadata] = []
        target_sentences = int(self.chunk_size)
        overlap = self.overlap_sentences if enable_overlap else 0
        i = 0
        chunk_id = 0
        prev_end = 0  # exclusive list index where the previous chunk ended
        while i < len(sentences):
            chunk_sentences: List[TranscriptSentence] = []
            j = i
            while j < len(sentences) and len(chunk_sentences) < target_sentences:
                chunk_sentences.append(sentences[j])
                j += 1
                # Break early at a natural pause (gap > 2 seconds) once the
                # chunk is at least half the target size.
                if (j < len(sentences)
                        and len(chunk_sentences) >= target_sentences // 2
                        and sentences[j].start_time - sentences[j - 1].end_time > 2.0):
                    break
            metadata.append(
                self._build_metadata(chunk_id, chunk_sentences, max(0, prev_end - i))
            )
            chunks.append(chunk_sentences)
            prev_end = j
            # BUG FIX: was max(j - overlap, j) == j; overlap never applied.
            i = max(j - overlap, i + 1) if j < len(sentences) else j
            chunk_id += 1
        return chunks, metadata

    def _estimate_tokens(self, sentences: List[TranscriptSentence]) -> int:
        """Estimate token count for a list of sentences.

        Uses word-based estimation for English and character-based estimation
        otherwise (whitespace word-splitting is unreliable for languages such
        as CJK). Sentences without a language attribute default to word-based.
        """
        total_tokens = 0.0
        for sentence in sentences:
            word_count = len(sentence.text.split())
            char_count = len(sentence.text)
            # `language` is expected to be an enum-like object with a .value
            # (e.g. 'en') — TODO confirm against TranscriptSentence.
            language = getattr(sentence, 'language', None)
            if language and language.value not in ('en',):
                total_tokens += char_count * self.TOKENS_PER_CHAR
            else:
                total_tokens += word_count * self.TOKENS_PER_WORD
        return int(total_tokens)

    def get_optimal_strategy(
        self,
        sentences: List[TranscriptSentence],
        target_chunks: Optional[int] = None
    ) -> ChunkingStrategy:
        """
        Recommend optimal chunking strategy based on transcript characteristics.

        Args:
            sentences: Transcript sentences to analyze
            target_chunks: Desired number of chunks (currently unused; kept
                for interface compatibility)

        Returns:
            Recommended chunking strategy
        """
        if not sentences:
            return ChunkingStrategy.SENTENCE_COUNT

        total_sentences = len(sentences)
        total_duration = sentences[-1].end_time - sentences[0].start_time
        unique_speakers = len({s.speaker for s in sentences})
        estimated_tokens = self._estimate_tokens(sentences)

        # Very large transcripts - token estimation keeps chunks under limits.
        if estimated_tokens > self.max_tokens * 0.8:
            return ChunkingStrategy.TOKEN_ESTIMATE
        # Large transcripts (>200 sentences) - sentence chunking for granularity.
        if total_sentences > 200:
            return ChunkingStrategy.SENTENCE_COUNT
        # Many speakers - keep speaker turns together.
        if unique_speakers > 5:
            return ChunkingStrategy.SPEAKER_AWARE
        # Long recordings (> 1 hour) - time-based chunking.
        if total_duration > 3600:
            return ChunkingStrategy.TIME_BASED
        # Default for small and medium transcripts alike (the original's
        # separate >50-sentence branch returned the same value).
        return ChunkingStrategy.SENTENCE_COUNT

    def validate_chunks(self, result: ChunkingResult) -> List[str]:
        """
        Validate chunking result and return any warnings.

        Args:
            result: Chunking result to validate

        Returns:
            List of warning messages (empty if no issues found)
        """
        warnings: List[str] = []

        # Check for empty chunks.
        empty_chunks = [i for i, chunk in enumerate(result.chunks) if not chunk]
        if empty_chunks:
            warnings.append(f"Empty chunks found at indices: {empty_chunks}")

        # Check for oversized chunks (token-wise).
        for i, metadata in enumerate(result.metadata):
            if metadata.estimated_tokens > self.max_tokens:
                warnings.append(
                    f"Chunk {i} exceeds token limit: {metadata.estimated_tokens} > {self.max_tokens}"
                )

        # Check for very small chunks (the last chunk is allowed to be small).
        min_sentences = 3
        small_chunks = [
            i for i, metadata in enumerate(result.metadata[:-1])
            if metadata.sentence_count < min_sentences
        ]
        if small_chunks:
            warnings.append(f"Very small chunks found at indices: {small_chunks}")

        # Check that recorded overlap matches the actual index overlap between
        # consecutive chunks (assumes sentence_index values are consecutive).
        for i in range(len(result.metadata) - 1):
            current_end = result.metadata[i].end_sentence_index
            next_start = result.metadata[i + 1].start_sentence_index
            expected_gap = result.metadata[i + 1].overlap_sentences
            actual_gap = current_end - next_start + 1
            if actual_gap != expected_gap:
                warnings.append(
                    f"Unexpected gap between chunks {i} and {i + 1}: "
                    f"expected {expected_gap}, got {actual_gap}"
                )

        return warnings
def create_chunker(
    strategy: str = "token_estimate",
    chunk_size: Optional[int] = None,
    overlap_sentences: int = 3,
    model_name: str = "claude-3-5-sonnet-20241022"
) -> TranscriptChunker:
    """
    Factory function to create a transcript chunker.

    Args:
        strategy: Chunking strategy name (must be a valid ChunkingStrategy value)
        chunk_size: Size parameter for chunking
        overlap_sentences: Number of sentences to overlap
        model_name: Target Anthropic model

    Returns:
        Configured TranscriptChunker instance

    Raises:
        ValueError: If `strategy` is not a recognized ChunkingStrategy value.
    """
    return TranscriptChunker(
        strategy=ChunkingStrategy(strategy),
        chunk_size=chunk_size,
        overlap_sentences=overlap_sentences,
        model_name=model_name,
    )