Spaces:

Hedrekao
/

audio-explorers-visualization

Sleeping

File size: 3,898 Bytes

a361db3

"""Data structures for pipeline stages."""

from dataclasses import dataclass, asdict, field
from typing import Optional, List
import json


@dataclass
class TalkerInfo:
    """Per-speaker record produced by the pipeline.

    Attributes:
        id: Integer identifier of the speaker.
        label: Speaker label, e.g. "SPEAKER_00".
        gender: One of "male", "female", "unknown" or "ambiguous".
        mean_f0_hz: Fundamental frequency, in Hz.
        transcript: Transcribed speech for this speaker.
        language: Detected language code (e.g. "en", "da").
        wav_path: Path to the extracted source WAV.
        is_toi: Whether this speaker is flagged as the talker of interest.
        toi_reason: Reasoning behind the talker-of-interest selection.
        direction_deg: Direction of arrival, in degrees.
        energy: Energy level.
        selection_score: Numerical score used for talker-of-interest selection.
    """
    id: int
    label: str
    gender: Optional[str] = None
    mean_f0_hz: Optional[float] = None
    transcript: Optional[str] = None
    language: Optional[str] = None
    wav_path: Optional[str] = None
    is_toi: bool = False
    toi_reason: Optional[str] = None
    direction_deg: Optional[float] = None
    energy: Optional[float] = None
    selection_score: Optional[float] = None

    def to_dict(self):
        """Return the fields as a dict, leaving out every field that is None.

        Dropping unset fields keeps the resulting JSON compact. Falsy but
        non-None values (``False``, ``0``, ``""``) are kept.
        """
        compact = {}
        for name, value in asdict(self).items():
            if value is None:
                continue
            compact[name] = value
        return compact


@dataclass
class PipelineOutput:
    """Complete output from one pipeline execution.

    Holds run-level metadata, the per-speaker results, and the names of the
    methods used by the individual stages. ``to_dict`` / ``to_json`` produce
    a nested, JSON-friendly layout and ``from_dict`` reads that layout back.
    """
    input_file: str
    approach: str                      # "ica" / "ica_deeplearning" / "frankenstein"
    duration_seconds: float
    sample_rate: int
    n_speakers: int
    talker_of_interest: int            # Source index (1-indexed)
    sources: List[TalkerInfo] = field(default_factory=list)

    # Performance metrics
    execution_time_seconds: Optional[float] = None
    separation_method: Optional[str] = None
    doa_method: Optional[str] = None
    gender_method: Optional[str] = None
    asr_model: Optional[str] = None

    # Optional: Processing chain details
    processing_notes: Optional[str] = None

    def to_dict(self) -> dict:
        """Convert to a dictionary suitable for JSON serialization.

        Returns:
            A dict with the run metadata at the top level, the stage method
            names grouped under ``"processing_methods"``, each source
            converted through its own ``to_dict``, and the processing notes
            under ``"notes"``. ``duration_seconds`` and
            ``execution_time_seconds`` are rounded to two decimals.
        """
        # Compare against None explicitly: an execution time of 0.0 is falsy
        # but is still a real measurement and must not be turned into None.
        if self.execution_time_seconds is None:
            execution_time = None
        else:
            execution_time = round(self.execution_time_seconds, 2)

        return {
            "input_file": self.input_file,
            "approach": self.approach,
            "duration_seconds": round(self.duration_seconds, 2),
            "sample_rate": self.sample_rate,
            "n_speakers": self.n_speakers,
            "talker_of_interest": self.talker_of_interest,
            "execution_time_seconds": execution_time,
            "processing_methods": {
                "separation": self.separation_method,
                "direction_of_arrival": self.doa_method,
                "gender_classification": self.gender_method,
                "asr_model": self.asr_model,
            },
            "sources": [s.to_dict() for s in self.sources],
            "notes": self.processing_notes,
        }

    def to_json(self, indent: int = 2) -> str:
        """Serialize ``to_dict()`` to a JSON string.

        Args:
            indent: Indentation width passed to ``json.dumps``.
        """
        return json.dumps(self.to_dict(), indent=indent)

    @classmethod
    def from_dict(cls, data: dict) -> "PipelineOutput":
        """Reconstruct an instance from the layout produced by ``to_dict``.

        Args:
            data: Mapping with the required keys ``input_file``, ``approach``,
                ``duration_seconds``, ``sample_rate``, ``n_speakers`` and
                ``talker_of_interest``. ``sources``, ``processing_methods``,
                ``execution_time_seconds`` and ``notes`` are optional and may
                also be present with a null value.

        Raises:
            KeyError: If one of the required keys is missing.
            TypeError: If a source entry has a key that is not a
                ``TalkerInfo`` field.
        """
        # `or` (rather than a .get default) also covers keys that are present
        # but null, e.g. "processing_methods": null in hand-written JSON.
        methods = data.get("processing_methods") or {}
        sources = [TalkerInfo(**s) for s in data.get("sources") or []]
        return cls(
            input_file=data["input_file"],
            approach=data["approach"],
            duration_seconds=data["duration_seconds"],
            sample_rate=data["sample_rate"],
            n_speakers=data["n_speakers"],
            talker_of_interest=data["talker_of_interest"],
            sources=sources,
            execution_time_seconds=data.get("execution_time_seconds"),
            separation_method=methods.get("separation"),
            doa_method=methods.get("direction_of_arrival"),
            gender_method=methods.get("gender_classification"),
            asr_model=methods.get("asr_model"),
            processing_notes=data.get("notes"),
        )