Spaces:

jcudit
/

voice-tools

Running on Zero

File size: 5,641 Bytes

"""
Audio Segment Model

Represents a contiguous portion of audio with speaker and timing information.
"""

from dataclasses import dataclass
from enum import Enum
from typing import List, Optional


class SegmentType(Enum):
    """Classification of audio segment types."""

    SPEECH = "speech"
    NONVERBAL = "nonverbal"
    SILENCE = "silence"
    OVERLAP = "overlap"  # Multiple speakers talking simultaneously


@dataclass
class AudioSegment:
    """
    Audio segment with time range and speaker information.

    IMPORTANT: This model stores ONLY metadata (timestamps, speaker info, classification).
    Audio data is NEVER stored in AudioSegment instances. Audio is extracted on-demand
    from source files using the stored timestamps during concatenation or processing.

    This metadata-only design enables memory-efficient processing of large audio files
    (>1 hour) by avoiding storage of thousands of audio arrays in memory.

    Attributes:
        start_time: Beginning timestamp in seconds
        end_time: Ending timestamp in seconds
        speaker_id: Identifier of the speaker in this segment
        confidence: Certainty of speaker identification (0.0-1.0)
        segment_type: Classification of the segment
        audio_file: Path to the source audio file (optional, for reference only)

    Usage Pattern:
        # Create segment with metadata only
        segment = AudioSegment(
            start_time=10.5,
            end_time=15.3,
            speaker_id="speaker_00",
            confidence=0.95,
            segment_type=SegmentType.SPEECH
        )

        # Extract audio on-demand when needed
        start_sample = int(segment.start_time * sample_rate)
        end_sample = int(segment.end_time * sample_rate)
        segment_audio = source_audio[start_sample:end_sample]
    """

    start_time: float
    end_time: float
    speaker_id: str
    confidence: float = 1.0
    segment_type: SegmentType = SegmentType.SPEECH
    audio_file: Optional[str] = None

    def __post_init__(self):
        """Validate audio segment data."""
        if self.start_time < 0:
            raise ValueError(f"Start time cannot be negative: {self.start_time}")

        if self.end_time <= self.start_time:
            raise ValueError(
                f"End time ({self.end_time}) must be after start time ({self.start_time})"
            )

        if not 0.0 <= self.confidence <= 1.0:
            raise ValueError(f"Confidence must be between 0.0 and 1.0, got {self.confidence}")

        # Ensure no audio data is accidentally stored (metadata-only enforcement)
        if hasattr(self, "audio") or "audio" in self.__dict__:
            raise ValueError(
                "AudioSegment must not contain 'audio' attribute. "
                "Audio data should be extracted on-demand using timestamps."
            )

    @property
    def duration(self) -> float:
        """Calculate duration of the segment in seconds."""
        return self.end_time - self.start_time

    def overlaps_with(self, other: "AudioSegment") -> bool:
        """Check if this segment overlaps with another segment."""
        return not (self.end_time <= other.start_time or other.end_time <= self.start_time)

    def contains_time(self, time: float) -> bool:
        """Check if a timestamp falls within this segment."""
        return self.start_time <= time <= self.end_time

    def __repr__(self) -> str:
        return (
            f"AudioSegment("
            f"speaker='{self.speaker_id}', "
            f"time={self.start_time:.2f}-{self.end_time:.2f}s, "
            f"duration={self.duration:.2f}s, "
            f"confidence={self.confidence:.2f}, "
            f"type={self.segment_type.value})"
        )


class SegmentCollection:
    """
    Collection of audio segments with utility methods.

    Provides methods for filtering, sorting, and analyzing groups of segments.
    """

    def __init__(self, segments: List[AudioSegment]):
        """Initialize collection with segments."""
        self.segments = segments

    def __len__(self) -> int:
        """Return number of segments."""
        return len(self.segments)

    def __iter__(self):
        """Iterate over segments."""
        return iter(self.segments)

    def __getitem__(self, index):
        """Get segment by index."""
        return self.segments[index]

    @property
    def total_duration(self) -> float:
        """Calculate total duration of all segments."""
        return sum(seg.duration for seg in self.segments)

    def filter_by_speaker(self, speaker_id: str) -> "SegmentCollection":
        """Filter segments by speaker ID."""
        filtered = [seg for seg in self.segments if seg.speaker_id == speaker_id]
        return SegmentCollection(filtered)

    def filter_by_type(self, segment_type: SegmentType) -> "SegmentCollection":
        """Filter segments by type."""
        filtered = [seg for seg in self.segments if seg.segment_type == segment_type]
        return SegmentCollection(filtered)

    def sort_by_time(self) -> "SegmentCollection":
        """Sort segments by start time."""
        sorted_segments = sorted(self.segments, key=lambda s: s.start_time)
        return SegmentCollection(sorted_segments)

    def get_speakers(self) -> List[str]:
        """Get unique list of speaker IDs."""
        return list(set(seg.speaker_id for seg in self.segments))

    def average_confidence(self) -> float:
        """Calculate average confidence across all segments."""
        if not self.segments:
            return 0.0
        return sum(seg.confidence for seg in self.segments) / len(self.segments)