Spaces:
Running on Zero
Running on Zero
| """ | |
| VoiceProfile data model: Reference embedding and speaker identification. | |
| Represents a target voice profile extracted from a reference audio clip. | |
| """ | |
| from dataclasses import dataclass, field | |
| from typing import Optional | |
| import numpy as np | |
@dataclass
class VoiceProfile:
    """
    Voice profile representing a target speaker.

    Holds a reference embedding vector plus metadata about the audio it was
    extracted from, and offers cosine-similarity matching and (de)serialization
    to ``dict`` and ``.npz``.
    """

    # Core identification
    speaker_id: str
    # 1-D embedding vector. The original note says 512-dimensional from
    # pyannote; only the rank (1-D) is enforced, not the length.
    embedding: np.ndarray

    # Source information
    reference_file: str
    reference_duration: float  # seconds

    # Quality metrics
    embedding_quality: float = 1.0  # 0-1 score indicating embedding confidence
    num_speech_segments: int = 0

    # Metadata
    sample_rate: int = 16000
    created_at: Optional[str] = None

    def __post_init__(self):
        """
        Validate voice profile after initialization.

        Raises:
            ValueError: If the embedding is not 1-dimensional, or if
                ``embedding_quality`` is outside [0, 1] (NaN included).
        """
        # Accept list/tuple input as well; an ndarray is passed through as-is.
        self.embedding = np.asarray(self.embedding)
        if self.embedding.ndim != 1:
            raise ValueError("Embedding must be 1-dimensional vector")
        # Written as a chained comparison so that NaN is rejected too
        # (``nan < 0 or nan > 1`` would be False and slip through).
        if not (0 <= self.embedding_quality <= 1):
            raise ValueError("Embedding quality must be between 0 and 1")

    def similarity(self, other_embedding: np.ndarray) -> float:
        """
        Calculate cosine similarity with another embedding.

        Args:
            other_embedding: Another voice embedding vector with the same
                shape as this profile's embedding.

        Returns:
            Similarity score clamped to [0, 1] (higher = more similar).
            Returns 0.0 when either vector has zero norm or the result is
            not finite, so a degenerate embedding can never look like a
            perfect match.

        Raises:
            ValueError: If the shapes of the two embeddings differ.
        """
        reference = np.asarray(self.embedding, dtype=np.float64)
        candidate = np.asarray(other_embedding, dtype=np.float64)
        if candidate.shape != reference.shape:
            raise ValueError(
                f"Embedding shape mismatch: expected {reference.shape}, "
                f"got {candidate.shape}"
            )

        norm_product = float(np.linalg.norm(reference) * np.linalg.norm(candidate))
        # Cosine similarity is undefined for zero vectors; treat as no match
        # instead of letting NaN be clamped up to 1.0.
        if norm_product == 0.0 or not np.isfinite(norm_product):
            return 0.0

        score = float(np.dot(reference, candidate) / norm_product)
        if not np.isfinite(score):
            return 0.0
        return max(0.0, min(1.0, score))  # Clamp to [0, 1]

    def matches(self, other_embedding: np.ndarray, threshold: float = 0.7) -> bool:
        """
        Check if another embedding matches this voice profile.

        Args:
            other_embedding: Voice embedding to compare
            threshold: Similarity threshold for match (default: 0.7)

        Returns:
            True if the similarity is greater than or equal to ``threshold``
        """
        return self.similarity(other_embedding) >= threshold

    def to_dict(self) -> dict:
        """
        Convert voice profile to dictionary.

        Returns:
            Dictionary representation; the embedding is converted to a
            plain list so the result is JSON-serializable.
        """
        return {
            "speaker_id": self.speaker_id,
            "embedding": self.embedding.tolist(),
            "reference_file": self.reference_file,
            "reference_duration": self.reference_duration,
            "embedding_quality": self.embedding_quality,
            "num_speech_segments": self.num_speech_segments,
            "sample_rate": self.sample_rate,
            "created_at": self.created_at,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "VoiceProfile":
        """
        Create voice profile from dictionary.

        Args:
            data: Dictionary representation, as produced by ``to_dict``.
                The input dictionary is not modified.

        Returns:
            VoiceProfile instance
        """
        fields_in = dict(data)  # shallow copy so the caller's dict is untouched
        fields_in["embedding"] = np.array(fields_in["embedding"])
        return cls(**fields_in)

    def save(self, file_path: str):
        """
        Save voice profile to file.

        Args:
            file_path: Output file path (.npz format). ``numpy.savez``
                appends ``.npz`` when the name does not already end with it,
                so pass the full ``.npz`` name if the same path will be
                handed to ``load``.
        """
        np.savez(
            file_path,
            speaker_id=self.speaker_id,
            embedding=self.embedding,
            reference_file=self.reference_file,
            reference_duration=self.reference_duration,
            embedding_quality=self.embedding_quality,
            num_speech_segments=self.num_speech_segments,
            sample_rate=self.sample_rate,
            # None cannot be stored without pickling; "" is the on-disk
            # stand-in and is mapped back to None by ``load``.
            created_at=self.created_at or "",
        )

    @classmethod
    def load(cls, file_path: str) -> "VoiceProfile":
        """
        Load voice profile from file.

        Args:
            file_path: Input file path (.npz format)

        Returns:
            VoiceProfile instance
        """
        # SECURITY: pickle is disabled on purpose. Files written by ``save``
        # contain only numeric and unicode arrays, and unpickling an
        # untrusted .npz could execute arbitrary code.
        # The context manager closes the underlying archive handle.
        with np.load(file_path, allow_pickle=False) as data:
            created_at = str(data["created_at"])
            return cls(
                speaker_id=str(data["speaker_id"]),
                embedding=data["embedding"],
                reference_file=str(data["reference_file"]),
                reference_duration=float(data["reference_duration"]),
                embedding_quality=float(data["embedding_quality"]),
                num_speech_segments=int(data["num_speech_segments"]),
                sample_rate=int(data["sample_rate"]),
                # Empty string is how ``save`` encodes a missing timestamp.
                created_at=created_at or None,
            )