Spaces:

jcudit
/

voice-tools

Running on Zero

App Files Files Community

voice-tools / src /models /voice_profile.py

jcudit HF Staff

fix: correct gitignore to only exclude root-level models directory, not src/models package

0456b70 2 months ago

raw

history blame contribute delete

4.57 kB

	"""
	VoiceProfile data model: Reference embedding and speaker identification.

	Represents a target voice profile extracted from a reference audio clip.
	"""

	from dataclasses import dataclass, field
	from typing import Optional

	import numpy as np


	@dataclass
	class VoiceProfile:
	    """
	    Voice profile representing a target speaker.

	    Contains embedding vectors and metadata for voice identification.
	    A profile can be compared against other embeddings (``similarity`` /
	    ``matches``), converted to and from a plain dictionary (``to_dict`` /
	    ``from_dict``), and persisted as a NumPy archive (``save`` / ``load``).

	    Raises:
	        ValueError: On construction, if ``embedding`` is not 1-dimensional
	            or ``embedding_quality`` is not within [0, 1] (NaN included).
	    """

	    # Core identification
	    speaker_id: str
	    embedding: np.ndarray  # 512-dimensional vector from pyannote

	    # Source information
	    reference_file: str
	    reference_duration: float  # seconds

	    # Quality metrics
	    embedding_quality: float = 1.0  # 0-1 score indicating embedding confidence
	    num_speech_segments: int = 0

	    # Metadata
	    sample_rate: int = 16000
	    created_at: Optional[str] = None

	    def __post_init__(self):
	        """Validate voice profile after initialization."""
	        # Accept array-likes such as plain lists; an ndarray is kept as-is
	        # (np.asarray does not copy it).
	        self.embedding = np.asarray(self.embedding)
	        if self.embedding.ndim != 1:
	            raise ValueError("Embedding must be 1-dimensional vector")

	        # Chained comparison on purpose: NaN fails it, whereas the
	        # "< 0 or > 1" form would silently let NaN through.
	        if not (0 <= self.embedding_quality <= 1):
	            raise ValueError("Embedding quality must be between 0 and 1")

	    def similarity(self, other_embedding: np.ndarray) -> float:
	        """
	        Calculate cosine similarity with another embedding.

	        Args:
	            other_embedding: Another voice embedding vector. Singleton axes
	                (e.g. shape ``(1, D)``) are squeezed away before comparing.

	        Returns:
	            Similarity score (0-1, higher = more similar). Negative cosine
	            values are clamped to 0. If either vector has zero norm or the
	            result is not finite, 0.0 is returned (no match) instead of NaN.

	        Raises:
	            ValueError: If the embeddings do not have the same shape.
	        """
	        reference = np.asarray(self.embedding, dtype=np.float64)
	        candidate = np.atleast_1d(
	            np.squeeze(np.asarray(other_embedding, dtype=np.float64))
	        )
	        if candidate.shape != reference.shape:
	            raise ValueError(
	                f"Embedding shape mismatch: expected {reference.shape}, "
	                f"got {candidate.shape}"
	            )

	        denominator = float(np.linalg.norm(reference) * np.linalg.norm(candidate))
	        # A zero or non-finite norm makes the cosine undefined. Clamping a
	        # NaN with min()/max() would turn it into 1.0 (a perfect match), so
	        # such inputs are explicitly treated as "not similar".
	        if denominator == 0.0 or not np.isfinite(denominator):
	            return 0.0

	        score = float(np.dot(reference, candidate) / denominator)
	        if not np.isfinite(score):
	            return 0.0
	        return max(0.0, min(1.0, score))  # Clamp to [0, 1]

	    def matches(self, other_embedding: np.ndarray, threshold: float = 0.7) -> bool:
	        """
	        Check if another embedding matches this voice profile.

	        Args:
	            other_embedding: Voice embedding to compare
	            threshold: Similarity threshold for match (default: 0.7)

	        Returns:
	            True if the similarity is at or above the threshold
	        """
	        return self.similarity(other_embedding) >= threshold

	    def to_dict(self) -> dict:
	        """
	        Convert voice profile to dictionary.

	        Returns:
	            Dictionary representation; the embedding is converted to a
	            plain list via ``tolist()``.
	        """
	        return {
	            "speaker_id": self.speaker_id,
	            "embedding": self.embedding.tolist(),
	            "reference_file": self.reference_file,
	            "reference_duration": self.reference_duration,
	            "embedding_quality": self.embedding_quality,
	            "num_speech_segments": self.num_speech_segments,
	            "sample_rate": self.sample_rate,
	            "created_at": self.created_at,
	        }

	    @classmethod
	    def from_dict(cls, data: dict) -> "VoiceProfile":
	        """
	        Create voice profile from dictionary.

	        Args:
	            data: Dictionary representation (as produced by ``to_dict``).
	                The input dictionary is not modified.

	        Returns:
	            VoiceProfile instance
	        """
	        # Work on a shallow copy so the caller's dictionary keeps its list.
	        payload = dict(data)
	        payload["embedding"] = np.array(payload["embedding"])
	        return cls(**payload)

	    def save(self, file_path: str):
	        """
	        Save voice profile to file.

	        Args:
	            file_path: Output file path (.npz format). Note that
	                ``np.savez`` appends ".npz" when the path does not already
	                end with it, so pass the full ".npz" name to ``load``.
	        """
	        np.savez(
	            file_path,
	            speaker_id=self.speaker_id,
	            embedding=self.embedding,
	            reference_file=self.reference_file,
	            reference_duration=self.reference_duration,
	            embedding_quality=self.embedding_quality,
	            num_speech_segments=self.num_speech_segments,
	            sample_rate=self.sample_rate,
	            # None cannot be stored as a plain string array; "" marks "unset".
	            created_at=self.created_at or "",
	        )

	    @classmethod
	    def load(cls, file_path: str) -> "VoiceProfile":
	        """
	        Load voice profile from file.

	        Args:
	            file_path: Input file path (.npz format)

	        Returns:
	            VoiceProfile instance
	        """
	        # SECURITY NOTE(review): allow_pickle=True means a crafted archive can
	        # execute arbitrary code while loading. Only load trusted files, or
	        # switch to allow_pickle=False once no pickled legacy files remain.
	        # The context manager closes the underlying archive file handle.
	        with np.load(file_path, allow_pickle=True) as data:
	            # Read once and unwrap to a Python value; "" (or None) means unset.
	            created_raw = data["created_at"].item()
	            return cls(
	                speaker_id=str(data["speaker_id"]),
	                embedding=data["embedding"],
	                reference_file=str(data["reference_file"]),
	                reference_duration=float(data["reference_duration"]),
	                embedding_quality=float(data["embedding_quality"]),
	                num_speech_segments=int(data["num_speech_segments"]),
	                sample_rate=int(data["sample_rate"]),
	                created_at=str(created_raw) if created_raw else None,
	            )