Spaces:

jcudit
/

voice-tools

Paused

File size: 4,569 Bytes

0456b70

"""
VoiceProfile data model: Reference embedding and speaker identification.

Represents a target voice profile extracted from a reference audio clip.
"""

from dataclasses import dataclass, field
from typing import Optional

import numpy as np


@dataclass
class VoiceProfile:
    """
    Voice profile representing a target speaker.

    Holds a 1-D speaker embedding together with metadata about the
    reference clip it came from, and provides cosine-similarity matching
    plus dict / ``.npz`` (de)serialization.
    """

    # Core identification
    speaker_id: str
    embedding: np.ndarray  # 512-dimensional vector from pyannote

    # Source information
    reference_file: str
    reference_duration: float  # seconds

    # Quality metrics
    embedding_quality: float = 1.0  # 0-1 score indicating embedding confidence
    num_speech_segments: int = 0

    # Metadata
    sample_rate: int = 16000
    created_at: Optional[str] = None

    def __post_init__(self):
        """
        Validate voice profile after initialization.

        Raises:
            ValueError: If the embedding is not 1-dimensional or the
                quality score is outside [0, 1] (NaN included).
        """
        # Accept array-likes (e.g. a plain list) instead of failing with an
        # AttributeError on ``.ndim``; an existing ndarray is kept as-is.
        if not isinstance(self.embedding, np.ndarray):
            self.embedding = np.asarray(self.embedding)

        if self.embedding.ndim != 1:
            raise ValueError("Embedding must be 1-dimensional vector")

        # Positive range test so that NaN (which fails every comparison)
        # is rejected rather than slipping through "< 0 or > 1".
        if not (0 <= self.embedding_quality <= 1):
            raise ValueError("Embedding quality must be between 0 and 1")

    def __eq__(self, other: object) -> bool:
        """
        Compare two profiles field by field.

        Defined explicitly because the dataclass-generated ``__eq__``
        compares the embedding arrays with ``==`` and then raises
        "truth value of an array is ambiguous".
        """
        if other.__class__ is not self.__class__:
            return NotImplemented
        return (
            self.speaker_id == other.speaker_id
            and bool(np.array_equal(self.embedding, other.embedding))
            and self.reference_file == other.reference_file
            and self.reference_duration == other.reference_duration
            and self.embedding_quality == other.embedding_quality
            and self.num_speech_segments == other.num_speech_segments
            and self.sample_rate == other.sample_rate
            and self.created_at == other.created_at
        )

    def similarity(self, other_embedding: np.ndarray) -> float:
        """
        Calculate cosine similarity with another embedding.

        Args:
            other_embedding: Another voice embedding vector. Singleton
                axes (e.g. shape ``(1, N)``) are squeezed away.

        Returns:
            Similarity score clamped to [0, 1] (higher = more similar).
            Returns 0.0 when either vector has zero norm or contains
            non-finite values, since cosine similarity is undefined there.

        Raises:
            ValueError: If the vectors do not have the same 1-D shape.
        """
        reference = np.asarray(self.embedding, dtype=np.float64)
        candidate = np.atleast_1d(
            np.asarray(other_embedding, dtype=np.float64).squeeze()
        )
        if candidate.ndim != 1 or candidate.shape != reference.shape:
            raise ValueError(
                f"Embedding shape mismatch: expected {reference.shape}, "
                f"got {candidate.shape}"
            )

        denominator = float(np.linalg.norm(reference) * np.linalg.norm(candidate))
        # A zero/NaN denominator would yield NaN, and min(1.0, nan) evaluates
        # to 1.0 -- i.e. a degenerate embedding would look like a perfect match.
        if denominator == 0.0 or not np.isfinite(denominator):
            return 0.0

        score = float(np.dot(reference, candidate) / denominator)
        if not np.isfinite(score):
            return 0.0
        return max(0.0, min(1.0, score))  # Clamp to [0, 1]

    def matches(self, other_embedding: np.ndarray, threshold: float = 0.7) -> bool:
        """
        Check if another embedding matches this voice profile.

        Args:
            other_embedding: Voice embedding to compare
            threshold: Similarity threshold for match (default: 0.7)

        Returns:
            True if the similarity is at or above the threshold
        """
        return self.similarity(other_embedding) >= threshold

    def to_dict(self) -> dict:
        """
        Convert voice profile to dictionary.

        Returns:
            Dictionary representation; the embedding is converted to a
            plain (nested) list via ``ndarray.tolist()``.
        """
        return {
            "speaker_id": self.speaker_id,
            "embedding": self.embedding.tolist(),
            "reference_file": self.reference_file,
            "reference_duration": self.reference_duration,
            "embedding_quality": self.embedding_quality,
            "num_speech_segments": self.num_speech_segments,
            "sample_rate": self.sample_rate,
            "created_at": self.created_at,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "VoiceProfile":
        """
        Create voice profile from dictionary.

        Args:
            data: Dictionary representation, as produced by ``to_dict``.
                The caller's dictionary is not modified.

        Returns:
            VoiceProfile instance
        """
        data = data.copy()
        data["embedding"] = np.array(data["embedding"])
        return cls(**data)

    def save(self, file_path: str):
        """
        Save voice profile to file.

        Args:
            file_path: Output file path (.npz format). ``np.savez`` appends
                the ``.npz`` extension when the path does not end with it.
        """
        np.savez(
            file_path,
            speaker_id=self.speaker_id,
            embedding=self.embedding,
            reference_file=self.reference_file,
            reference_duration=self.reference_duration,
            embedding_quality=self.embedding_quality,
            num_speech_segments=self.num_speech_segments,
            sample_rate=self.sample_rate,
            # None cannot be stored as a plain string array; "" marks "unset".
            created_at=self.created_at or "",
        )

    @classmethod
    def load(cls, file_path: str) -> "VoiceProfile":
        """
        Load voice profile from file.

        Args:
            file_path: Input file path (.npz format). If the path does not
                exist but ``file_path + ".npz"`` does, the latter is used,
                mirroring the extension ``save`` adds implicitly.

        Returns:
            VoiceProfile instance
        """
        import os

        path = file_path
        if isinstance(file_path, (str, os.PathLike)):
            resolved = os.fspath(file_path)
            if (
                isinstance(resolved, str)
                and not resolved.endswith(".npz")
                and not os.path.exists(resolved)
                and os.path.exists(resolved + ".npz")
            ):
                path = resolved + ".npz"

        # NOTE(security): allow_pickle=True lets a crafted archive execute
        # arbitrary code while loading. Archives written by save() contain no
        # object arrays, so this could likely be allow_pickle=False -- kept
        # as-is for compatibility with existing files; only load trusted files.
        with np.load(path, allow_pickle=True) as data:
            # Read once and test the Python string rather than relying on the
            # truthiness of a 0-d string array.
            created_at = str(data["created_at"])

            return cls(
                speaker_id=str(data["speaker_id"]),
                embedding=data["embedding"],
                reference_file=str(data["reference_file"]),
                reference_duration=float(data["reference_duration"]),
                embedding_quality=float(data["embedding_quality"]),
                num_speech_segments=int(data["num_speech_segments"]),
                sample_rate=int(data["sample_rate"]),
                created_at=created_at or None,
            )