Spaces:

saadmannan
/

TTS-with-VoiceCloning

Runtime error

App Files Files Community

TTS-with-VoiceCloning / src /speaker_encoder.py

saadmannan

initial commit

5ffccae 2 months ago

raw

history blame contribute delete

8.65 kB

	"""
	Speaker Encoder Module
	Extract speaker embeddings and compute similarity using Resemblyzer
	"""

	import numpy as np
	import librosa
	import torch
	from pathlib import Path
	from typing import Union, Tuple
	import warnings
	warnings.filterwarnings('ignore')

	# Resemblyzer is an optional dependency at import time: when it is missing the
	# module still imports, both names are bound to None, and
	# SpeakerEncoder.__init__ raises ImportError on first use.
	try:
	    from resemblyzer import VoiceEncoder, preprocess_wav
	except ImportError:
	    print("Warning: resemblyzer not installed. Run: pip install resemblyzer")
	    VoiceEncoder = preprocess_wav = None


	class SpeakerEncoder:
	    """
	    Speaker embedding extraction and similarity computation.

	    Thin wrapper around a Resemblyzer ``VoiceEncoder``. Audio decoding and
	    preprocessing are delegated to Resemblyzer's ``preprocess_wav``.

	    Features:
	    - Extract 256-dimensional speaker embeddings
	    - Compute speaker similarity (cosine similarity)
	    - Rank candidates, verify speakers and interpolate embeddings
	    """

	    # Size of the placeholder vector used for files that fail to load, and the
	    # column count of an empty batch result.
	    EMBEDDING_DIM = 256

	    def __init__(self, device: str = "cuda"):
	        """
	        Initialize Speaker Encoder

	        Args:
	            device: Device to run on ('cuda' or 'cpu')

	        Raises:
	            ImportError: If resemblyzer could not be imported.
	            Exception: Whatever ``VoiceEncoder`` raises during construction
	                (printed, then re-raised unchanged).
	        """
	        if VoiceEncoder is None:
	            raise ImportError("resemblyzer not installed. Run: pip install resemblyzer")

	        # The requested device is honoured only when CUDA is available;
	        # otherwise everything runs on CPU.
	        self.device = device if torch.cuda.is_available() else "cpu"

	        print(f"🎯 Initializing Speaker Encoder on {self.device}...")

	        try:
	            self.encoder = VoiceEncoder(device=self.device)
	            print("✓ Speaker Encoder initialized successfully!")

	        except Exception as e:
	            print(f"❌ Error initializing Speaker Encoder: {e}")
	            raise

	    def extract_embedding(
	        self,
	        audio_path: Union[str, Path],
	        normalize: bool = True
	    ) -> np.ndarray:
	        """
	        Extract speaker embedding from audio

	        Args:
	            audio_path: Path to audio file
	            normalize: Normalize the embedding to unit length

	        Returns:
	            Speaker embedding produced by the encoder for this file

	        Raises:
	            FileNotFoundError: If ``audio_path`` does not exist.
	            Exception: Any error from preprocessing or embedding (printed,
	                then re-raised unchanged).
	        """
	        audio_path = Path(audio_path)

	        if not audio_path.exists():
	            raise FileNotFoundError(f"Audio file not found: {audio_path}")

	        try:
	            # Load and preprocess audio
	            wav = preprocess_wav(audio_path)

	            # Extract embedding
	            embedding = self.encoder.embed_utterance(wav)

	            # The epsilon keeps an all-zero embedding from dividing by zero.
	            if normalize:
	                embedding = embedding / (np.linalg.norm(embedding) + 1e-8)

	            return embedding

	        except Exception as e:
	            print(f"❌ Error extracting embedding from {audio_path.name}: {e}")
	            raise

	    def extract_embeddings_batch(
	        self,
	        audio_paths: list,
	        normalize: bool = True
	    ) -> np.ndarray:
	        """
	        Extract embeddings from multiple audio files

	        Files that fail to load are reported and replaced by an all-zero
	        placeholder row, so row ``i`` always corresponds to ``audio_paths[i]``.

	        Args:
	            audio_paths: Audio file paths (any iterable; it is consumed once)
	            normalize: Normalize embeddings

	        Returns:
	            Array of shape (n_files, 256); shape (0, 256) for empty input
	        """
	        audio_paths = list(audio_paths)
	        embeddings = []

	        print(f"📊 Extracting embeddings from {len(audio_paths)} files...")

	        for audio_path in audio_paths:
	            try:
	                emb = self.extract_embedding(audio_path, normalize=normalize)

	            except Exception as e:
	                # Best effort: keep going and mark the failure with zeros.
	                print(f"⚠️ Skipping {audio_path}: {e}")
	                emb = np.zeros(self.EMBEDDING_DIM)  # Placeholder

	            embeddings.append(emb)

	        if not embeddings:
	            # np.array([]) would be 1-D with shape (0,), which breaks the
	            # matrix products in the callers; keep the result 2-D.
	            return np.empty((0, self.EMBEDDING_DIM))

	        return np.array(embeddings)

	    def compute_similarity(
	        self,
	        audio_path1: Union[str, Path],
	        audio_path2: Union[str, Path]
	    ) -> float:
	        """
	        Compute speaker similarity between two audio files

	        Args:
	            audio_path1: First audio file
	            audio_path2: Second audio file

	        Returns:
	            Cosine similarity score as a Python float; higher is more similar.
	            For unit vectors this is mathematically bounded by [-1, 1].
	        """
	        emb1 = self.extract_embedding(audio_path1, normalize=True)
	        emb2 = self.extract_embedding(audio_path2, normalize=True)

	        # Both vectors are unit length, so the dot product is the cosine.
	        similarity = np.dot(emb1, emb2)

	        return float(similarity)

	    def compute_similarity_matrix(
	        self,
	        audio_paths: list
	    ) -> np.ndarray:
	        """
	        Compute pairwise similarity matrix for multiple audio files

	        Files that failed to load contribute all-zero rows and columns
	        (including a zero on the diagonal).

	        Args:
	            audio_paths: List of audio file paths

	        Returns:
	            Similarity matrix of shape (n_files, n_files); (0, 0) when empty
	        """
	        embeddings = self.extract_embeddings_batch(audio_paths, normalize=True)

	        # Rows are unit vectors, so the Gram matrix holds cosine similarities.
	        similarity_matrix = np.dot(embeddings, embeddings.T)

	        return similarity_matrix

	    def find_most_similar(
	        self,
	        query_audio: Union[str, Path],
	        candidate_audios: list,
	        top_k: int = 5
	    ) -> list:
	        """
	        Find most similar speakers to a query audio

	        Args:
	            query_audio: Query audio file
	            candidate_audios: Candidate audio files (any iterable)
	            top_k: Number of top matches to return; negative values are
	                treated as 0 and ``None`` returns every candidate

	        Returns:
	            List of (audio_path, similarity_score) tuples, best match first.
	            Candidates that failed to load appear with a score of 0.0.
	        """
	        candidate_audios = list(candidate_audios)

	        # Extract query embedding
	        query_emb = self.extract_embedding(query_audio, normalize=True)

	        # Extract candidate embeddings, shape (n_candidates, 256)
	        candidate_embs = self.extract_embeddings_batch(candidate_audios, normalize=True)

	        # Compute similarities
	        similarities = np.dot(candidate_embs, query_emb)

	        # A negative slice bound would silently drop the worst matches
	        # instead of limiting the result, so clamp it.
	        limit = None if top_k is None else max(top_k, 0)

	        # Indices sorted by descending similarity, truncated to the limit
	        top_indices = np.argsort(similarities)[::-1][:limit]

	        return [
	            (candidate_audios[idx], float(similarities[idx]))
	            for idx in top_indices
	        ]

	    def verify_speaker(
	        self,
	        audio_path1: Union[str, Path],
	        audio_path2: Union[str, Path],
	        threshold: float = 0.75
	    ) -> Tuple[bool, float]:
	        """
	        Verify if two audio files are from the same speaker

	        Args:
	            audio_path1: First audio file
	            audio_path2: Second audio file
	            threshold: Similarity threshold for same speaker (default: 0.75)

	        Returns:
	            Tuple of (is_same_speaker, similarity_score); the pair counts as
	            the same speaker when the score is at or above the threshold
	        """
	        similarity = self.compute_similarity(audio_path1, audio_path2)
	        is_same = similarity >= threshold

	        return is_same, similarity

	    def interpolate_embeddings(
	        self,
	        audio_path1: Union[str, Path],
	        audio_path2: Union[str, Path],
	        alpha: float = 0.5
	    ) -> np.ndarray:
	        """
	        Interpolate between two speaker embeddings
	        Useful for creating synthetic speaker characteristics

	        Args:
	            audio_path1: First audio file
	            audio_path2: Second audio file
	            alpha: Interpolation factor (0=speaker1, 1=speaker2); the value
	                is not range-checked

	        Returns:
	            Interpolated embedding, re-normalized to unit length
	        """
	        emb1 = self.extract_embedding(audio_path1, normalize=True)
	        emb2 = self.extract_embedding(audio_path2, normalize=True)

	        # Linear interpolation
	        interpolated = (1 - alpha) * emb1 + alpha * emb2

	        # A blend of unit vectors is generally shorter than unit length;
	        # the epsilon guards the case where the blend cancels to zero.
	        interpolated = interpolated / (np.linalg.norm(interpolated) + 1e-8)

	        return interpolated

	    @staticmethod
	    def load_audio(
	        audio_path: Union[str, Path],
	        sr: int = 16000
	    ) -> Tuple[np.ndarray, int]:
	        """
	        Load audio file

	        Args:
	            audio_path: Path to audio file
	            sr: Target sample rate, passed to ``librosa.load``

	        Returns:
	            Tuple of (audio_array, sample_rate) as returned by ``librosa.load``
	        """
	        audio, sample_rate = librosa.load(str(audio_path), sr=sr)
	        return audio, sample_rate

	    def __repr__(self):
	        return f"SpeakerEncoder(device={self.device})"


	def main():
	    """Build a SpeakerEncoder and print a short capability summary."""
	    rule = "=" * 60
	    for header_line in (rule, "Speaker Encoder Demo", rule):
	        print(header_line)

	    # CUDA is requested here; SpeakerEncoder.__init__ falls back to CPU
	    # when torch reports that CUDA is unavailable.
	    demo_encoder = SpeakerEncoder(device="cuda")

	    summary_lines = (
	        "\n✓ Speaker Encoder ready!",
	        " Embedding dimension: 256",
	        " Use for:",
	        " - Extract speaker embeddings",
	        " - Compute speaker similarity",
	        " - Verify speaker identity",
	        " - Interpolate between speakers",
	    )
	    for summary_line in summary_lines:
	        print(summary_line)

	    print("\n" + rule)


	# Run the demo only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()