# voice-tools/src/models/voice_profile.py
# Last commit: jcudit (HF Staff) — "fix: correct gitignore to only exclude
# root-level models directory, not src/models package" (0456b70)
"""
VoiceProfile data model: Reference embedding and speaker identification.
Represents a target voice profile extracted from reference audio clip.
"""
from dataclasses import dataclass, field
from typing import Optional
import numpy as np
@dataclass
class VoiceProfile:
    """
    Voice profile representing a target speaker.

    Contains an embedding vector and metadata for voice identification.
    Instances round-trip through ``to_dict``/``from_dict`` and can be
    persisted with ``save``/``load`` (NumPy ``.npz`` format).
    """
    # Core identification
    speaker_id: str
    embedding: np.ndarray  # 512-dimensional vector from pyannote
    # Source information
    reference_file: str
    reference_duration: float  # seconds
    # Quality metrics
    embedding_quality: float = 1.0  # 0-1 score indicating embedding confidence
    num_speech_segments: int = 0
    # Metadata
    sample_rate: int = 16000
    created_at: Optional[str] = None

    def __post_init__(self):
        """Validate (and lightly normalize) the profile after initialization.

        Raises:
            ValueError: If the embedding is not a 1-dimensional vector, or
                the quality score is outside [0, 1].
        """
        # Coerce array-likes (e.g. plain lists) so the ndim check raises
        # ValueError instead of AttributeError on non-ndarray input.
        self.embedding = np.asarray(self.embedding)
        if self.embedding.ndim != 1:
            raise ValueError("Embedding must be 1-dimensional vector")
        if self.embedding_quality < 0 or self.embedding_quality > 1:
            raise ValueError("Embedding quality must be between 0 and 1")

    def similarity(self, other_embedding: np.ndarray) -> float:
        """
        Calculate cosine similarity with another embedding.

        Args:
            other_embedding: Another voice embedding vector

        Returns:
            Similarity score clamped to [0, 1] (higher = more similar).
            Returns 0.0 when either vector has zero norm (the previous
            scipy-based implementation produced NaN in that case).
        """
        a = np.asarray(self.embedding, dtype=float)
        b = np.asarray(other_embedding, dtype=float)
        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)
        # A zero vector has no direction, so cosine similarity is undefined;
        # treat it as maximally dissimilar rather than returning NaN.
        if norm_a == 0.0 or norm_b == 0.0:
            return 0.0
        similarity = float(np.dot(a, b) / (norm_a * norm_b))
        return max(0.0, min(1.0, similarity))  # Clamp to [0, 1]

    def matches(self, other_embedding: np.ndarray, threshold: float = 0.7) -> bool:
        """
        Check if another embedding matches this voice profile.

        Args:
            other_embedding: Voice embedding to compare
            threshold: Similarity threshold for match (default: 0.7)

        Returns:
            True if cosine similarity is at or above ``threshold``.
        """
        return self.similarity(other_embedding) >= threshold

    def to_dict(self) -> dict:
        """
        Convert voice profile to dictionary (JSON-serializable).

        Returns:
            Dictionary representation; the embedding is a plain list.
        """
        return {
            "speaker_id": self.speaker_id,
            "embedding": self.embedding.tolist(),
            "reference_file": self.reference_file,
            "reference_duration": self.reference_duration,
            "embedding_quality": self.embedding_quality,
            "num_speech_segments": self.num_speech_segments,
            "sample_rate": self.sample_rate,
            "created_at": self.created_at,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "VoiceProfile":
        """
        Create voice profile from dictionary (inverse of ``to_dict``).

        Args:
            data: Dictionary representation

        Returns:
            VoiceProfile instance
        """
        data = data.copy()  # don't mutate the caller's dict
        data["embedding"] = np.array(data["embedding"])
        return cls(**data)

    def save(self, file_path: str):
        """
        Save voice profile to file.

        Args:
            file_path: Output file path (.npz format)
        """
        # np is already imported at module level; no local re-import needed.
        np.savez(
            file_path,
            speaker_id=self.speaker_id,
            embedding=self.embedding,
            reference_file=self.reference_file,
            reference_duration=self.reference_duration,
            embedding_quality=self.embedding_quality,
            num_speech_segments=self.num_speech_segments,
            sample_rate=self.sample_rate,
            # npz cannot store None; empty string round-trips back to None.
            created_at=self.created_at or "",
        )

    @classmethod
    def load(cls, file_path: str) -> "VoiceProfile":
        """
        Load voice profile from file (inverse of ``save``).

        Args:
            file_path: Input file path (.npz format)

        Returns:
            VoiceProfile instance
        """
        # SECURITY: allow_pickle=True executes pickle when loading object
        # arrays — only load profile files from trusted sources. The fields
        # written by save() are plain scalars/arrays and don't need pickle,
        # but the flag is kept for compatibility with existing files.
        data = np.load(file_path, allow_pickle=True)
        created_at = str(data["created_at"])  # 0-d array -> plain str
        return cls(
            speaker_id=str(data["speaker_id"]),
            embedding=data["embedding"],
            reference_file=str(data["reference_file"]),
            reference_duration=float(data["reference_duration"]),
            embedding_quality=float(data["embedding_quality"]),
            num_speech_segments=int(data["num_speech_segments"]),
            sample_rate=int(data["sample_rate"]),
            created_at=created_at if created_at else None,
        )