Spaces:
Running
on
Zero
Running
on
Zero
"""
Audio Segment Model

Represents a contiguous portion of audio with speaker and timing information.
"""
| from dataclasses import dataclass | |
| from enum import Enum | |
| from typing import List, Optional | |
class SegmentType(Enum):
    """Kinds of content an audio segment can be classified as.

    Each member's value is the lowercase string form of its name.
    """

    # A single speaker talking.
    SPEECH = "speech"
    # Non-verbal audio.
    NONVERBAL = "nonverbal"
    # No audible content.
    SILENCE = "silence"
    # Multiple speakers talking simultaneously.
    OVERLAP = "overlap"
@dataclass
class AudioSegment:
    """
    Audio segment with time range and speaker information.

    IMPORTANT: This model stores ONLY metadata (timestamps, speaker info,
    classification). Audio data is NEVER stored in AudioSegment instances.
    Audio is extracted on-demand from source files using the stored
    timestamps during concatenation or processing.

    This metadata-only design enables memory-efficient processing of large
    audio files (>1 hour) by avoiding storage of thousands of audio arrays
    in memory.

    Attributes:
        start_time: Beginning timestamp in seconds
        end_time: Ending timestamp in seconds
        speaker_id: Identifier of the speaker in this segment
        confidence: Certainty of speaker identification (0.0-1.0)
        segment_type: Classification of the segment
        audio_file: Path to the source audio file (optional, for reference only)

    Raises:
        ValueError: On construction, if ``start_time`` is negative (or NaN),
            if ``end_time`` is not strictly greater than ``start_time``, if
            ``confidence`` lies outside [0.0, 1.0], or if the instance
            exposes an ``audio`` attribute.

    Usage Pattern:
        # Create segment with metadata only
        segment = AudioSegment(
            start_time=10.5,
            end_time=15.3,
            speaker_id="speaker_00",
            confidence=0.95,
            segment_type=SegmentType.SPEECH
        )

        # Extract audio on-demand when needed
        start_sample = int(segment.start_time * sample_rate)
        end_sample = int(segment.end_time * sample_rate)
        segment_audio = source_audio[start_sample:end_sample]
    """

    start_time: float
    end_time: float
    speaker_id: str
    confidence: float = 1.0
    segment_type: SegmentType = SegmentType.SPEECH
    audio_file: Optional[str] = None

    def __post_init__(self) -> None:
        """Validate audio segment data."""
        # The checks are phrased as "not <valid condition>" so that NaN,
        # which compares False against everything, is rejected instead of
        # silently passing (the confidence check already works this way).
        if not self.start_time >= 0:
            raise ValueError(f"Start time cannot be negative: {self.start_time}")
        if not self.end_time > self.start_time:
            raise ValueError(
                f"End time ({self.end_time}) must be after start time ({self.start_time})"
            )
        if not 0.0 <= self.confidence <= 1.0:
            raise ValueError(f"Confidence must be between 0.0 and 1.0, got {self.confidence}")
        # Metadata-only enforcement: hasattr() sees instance attributes as
        # well as anything a subclass defines at class level.
        if hasattr(self, "audio"):
            raise ValueError(
                "AudioSegment must not contain 'audio' attribute. "
                "Audio data should be extracted on-demand using timestamps."
            )

    @property
    def duration(self) -> float:
        """Duration of the segment in seconds (``end_time - start_time``)."""
        return self.end_time - self.start_time

    def overlaps_with(self, other: "AudioSegment") -> bool:
        """Check if this segment overlaps with another segment.

        Segments that merely touch (one ends exactly where the other
        starts) are not considered overlapping.
        """
        return not (self.end_time <= other.start_time or other.end_time <= self.start_time)

    def contains_time(self, time: float) -> bool:
        """Check if a timestamp falls within this segment (both ends inclusive)."""
        return self.start_time <= time <= self.end_time

    def __repr__(self) -> str:
        """Return a compact summary; replaces the dataclass-generated repr."""
        return (
            f"AudioSegment("
            f"speaker='{self.speaker_id}', "
            f"time={self.start_time:.2f}-{self.end_time:.2f}s, "
            f"duration={self.duration:.2f}s, "
            f"confidence={self.confidence:.2f}, "
            f"type={self.segment_type.value})"
        )
| class SegmentCollection: | |
| """ | |
| Collection of audio segments with utility methods. | |
| Provides methods for filtering, sorting, and analyzing groups of segments. | |
| """ | |
| def __init__(self, segments: List[AudioSegment]): | |
| """Initialize collection with segments.""" | |
| self.segments = segments | |
| def __len__(self) -> int: | |
| """Return number of segments.""" | |
| return len(self.segments) | |
| def __iter__(self): | |
| """Iterate over segments.""" | |
| return iter(self.segments) | |
| def __getitem__(self, index): | |
| """Get segment by index.""" | |
| return self.segments[index] | |
| def total_duration(self) -> float: | |
| """Calculate total duration of all segments.""" | |
| return sum(seg.duration for seg in self.segments) | |
| def filter_by_speaker(self, speaker_id: str) -> "SegmentCollection": | |
| """Filter segments by speaker ID.""" | |
| filtered = [seg for seg in self.segments if seg.speaker_id == speaker_id] | |
| return SegmentCollection(filtered) | |
| def filter_by_type(self, segment_type: SegmentType) -> "SegmentCollection": | |
| """Filter segments by type.""" | |
| filtered = [seg for seg in self.segments if seg.segment_type == segment_type] | |
| return SegmentCollection(filtered) | |
| def sort_by_time(self) -> "SegmentCollection": | |
| """Sort segments by start time.""" | |
| sorted_segments = sorted(self.segments, key=lambda s: s.start_time) | |
| return SegmentCollection(sorted_segments) | |
| def get_speakers(self) -> List[str]: | |
| """Get unique list of speaker IDs.""" | |
| return list(set(seg.speaker_id for seg in self.segments)) | |
| def average_confidence(self) -> float: | |
| """Calculate average confidence across all segments.""" | |
| if not self.segments: | |
| return 0.0 | |
| return sum(seg.confidence for seg in self.segments) / len(self.segments) | |