Spaces:
Running
on
Zero
Running
on
Zero
File size: 5,641 Bytes
0456b70 95e1515 0456b70 95e1515 0456b70 95e1515 0456b70 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
"""
Audio Segment Model
Represents a contiguous portion of audio with speaker and timing information.
"""
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional
class SegmentType(Enum):
"""Classification of audio segment types."""
SPEECH = "speech"
NONVERBAL = "nonverbal"
SILENCE = "silence"
OVERLAP = "overlap" # Multiple speakers talking simultaneously
@dataclass
class AudioSegment:
"""
Audio segment with time range and speaker information.
IMPORTANT: This model stores ONLY metadata (timestamps, speaker info, classification).
Audio data is NEVER stored in AudioSegment instances. Audio is extracted on-demand
from source files using the stored timestamps during concatenation or processing.
This metadata-only design enables memory-efficient processing of large audio files
(>1 hour) by avoiding storage of thousands of audio arrays in memory.
Attributes:
start_time: Beginning timestamp in seconds
end_time: Ending timestamp in seconds
speaker_id: Identifier of the speaker in this segment
confidence: Certainty of speaker identification (0.0-1.0)
segment_type: Classification of the segment
audio_file: Path to the source audio file (optional, for reference only)
Usage Pattern:
# Create segment with metadata only
segment = AudioSegment(
start_time=10.5,
end_time=15.3,
speaker_id="speaker_00",
confidence=0.95,
segment_type=SegmentType.SPEECH
)
# Extract audio on-demand when needed
start_sample = int(segment.start_time * sample_rate)
end_sample = int(segment.end_time * sample_rate)
segment_audio = source_audio[start_sample:end_sample]
"""
start_time: float
end_time: float
speaker_id: str
confidence: float = 1.0
segment_type: SegmentType = SegmentType.SPEECH
audio_file: Optional[str] = None
def __post_init__(self):
"""Validate audio segment data."""
if self.start_time < 0:
raise ValueError(f"Start time cannot be negative: {self.start_time}")
if self.end_time <= self.start_time:
raise ValueError(
f"End time ({self.end_time}) must be after start time ({self.start_time})"
)
if not 0.0 <= self.confidence <= 1.0:
raise ValueError(f"Confidence must be between 0.0 and 1.0, got {self.confidence}")
# Ensure no audio data is accidentally stored (metadata-only enforcement)
if hasattr(self, "audio") or "audio" in self.__dict__:
raise ValueError(
"AudioSegment must not contain 'audio' attribute. "
"Audio data should be extracted on-demand using timestamps."
)
@property
def duration(self) -> float:
"""Calculate duration of the segment in seconds."""
return self.end_time - self.start_time
def overlaps_with(self, other: "AudioSegment") -> bool:
"""Check if this segment overlaps with another segment."""
return not (self.end_time <= other.start_time or other.end_time <= self.start_time)
def contains_time(self, time: float) -> bool:
"""Check if a timestamp falls within this segment."""
return self.start_time <= time <= self.end_time
def __repr__(self) -> str:
return (
f"AudioSegment("
f"speaker='{self.speaker_id}', "
f"time={self.start_time:.2f}-{self.end_time:.2f}s, "
f"duration={self.duration:.2f}s, "
f"confidence={self.confidence:.2f}, "
f"type={self.segment_type.value})"
)
class SegmentCollection:
"""
Collection of audio segments with utility methods.
Provides methods for filtering, sorting, and analyzing groups of segments.
"""
def __init__(self, segments: List[AudioSegment]):
"""Initialize collection with segments."""
self.segments = segments
def __len__(self) -> int:
"""Return number of segments."""
return len(self.segments)
def __iter__(self):
"""Iterate over segments."""
return iter(self.segments)
def __getitem__(self, index):
"""Get segment by index."""
return self.segments[index]
@property
def total_duration(self) -> float:
"""Calculate total duration of all segments."""
return sum(seg.duration for seg in self.segments)
def filter_by_speaker(self, speaker_id: str) -> "SegmentCollection":
"""Filter segments by speaker ID."""
filtered = [seg for seg in self.segments if seg.speaker_id == speaker_id]
return SegmentCollection(filtered)
def filter_by_type(self, segment_type: SegmentType) -> "SegmentCollection":
"""Filter segments by type."""
filtered = [seg for seg in self.segments if seg.segment_type == segment_type]
return SegmentCollection(filtered)
def sort_by_time(self) -> "SegmentCollection":
"""Sort segments by start time."""
sorted_segments = sorted(self.segments, key=lambda s: s.start_time)
return SegmentCollection(sorted_segments)
def get_speakers(self) -> List[str]:
"""Get unique list of speaker IDs."""
return list(set(seg.speaker_id for seg in self.segments))
def average_confidence(self) -> float:
"""Calculate average confidence across all segments."""
if not self.segments:
return 0.0
return sum(seg.confidence for seg in self.segments) / len(self.segments)
|