Spaces:
Runtime error
Runtime error
| """ | |
| Speaker Encoder Module | |
| Extract speaker embeddings and compute similarity using Resemblyzer | |
| """ | |
| import numpy as np | |
| import librosa | |
| import torch | |
| from pathlib import Path | |
| from typing import Union, Tuple | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
| try: | |
| from resemblyzer import VoiceEncoder, preprocess_wav | |
| except ImportError: | |
| print("Warning: resemblyzer not installed. Run: pip install resemblyzer") | |
| VoiceEncoder = None | |
| preprocess_wav = None | |
class SpeakerEncoder:
    """
    Speaker embedding extraction and similarity computation.

    Thin wrapper around Resemblyzer's ``VoiceEncoder``.

    Features:
        - Extract 256-dimensional speaker embeddings
        - Compute speaker similarity (cosine similarity)
        - Support for multiple audio formats
    """

    # Length of one embedding vector. Used for the placeholder rows emitted
    # for unreadable files and for the shape of an empty batch.
    EMBEDDING_DIM = 256

    def __init__(self, device: str = "cuda"):
        """
        Initialize Speaker Encoder.

        Args:
            device: Device to run on ('cuda' or 'cpu'). Falls back to 'cpu'
                whenever CUDA is not available.

        Raises:
            ImportError: If resemblyzer could not be imported.
            Exception: Whatever ``VoiceEncoder`` raises during construction
                is logged and re-raised unchanged.
        """
        if VoiceEncoder is None:
            raise ImportError("resemblyzer not installed. Run: pip install resemblyzer")

        self.device = device if torch.cuda.is_available() else "cpu"
        print(f"🎯 Initializing Speaker Encoder on {self.device}...")

        try:
            self.encoder = VoiceEncoder(device=self.device)
            print("✓ Speaker Encoder initialized successfully!")
        except Exception as e:
            print(f"❌ Error initializing Speaker Encoder: {e}")
            raise

    def extract_embedding(
        self,
        audio_path: Union[str, Path],
        normalize: bool = True
    ) -> np.ndarray:
        """
        Extract speaker embedding from audio.

        Args:
            audio_path: Path to audio file
            normalize: Normalize the embedding to unit length

        Returns:
            256-dimensional speaker embedding

        Raises:
            FileNotFoundError: If ``audio_path`` does not exist.
            Exception: Any error from preprocessing or the encoder is logged
                and re-raised unchanged.
        """
        audio_path = Path(audio_path)
        if not audio_path.exists():
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        try:
            # Load and preprocess audio
            wav = preprocess_wav(audio_path)

            # Extract embedding
            embedding = self.encoder.embed_utterance(wav)

            # Normalize if requested; the epsilon guards against a zero vector
            if normalize:
                embedding = embedding / (np.linalg.norm(embedding) + 1e-8)

            return embedding
        except Exception as e:
            print(f"❌ Error extracting embedding from {audio_path.name}: {e}")
            raise

    def extract_embeddings_batch(
        self,
        audio_paths: list,
        normalize: bool = True
    ) -> np.ndarray:
        """
        Extract embeddings from multiple audio files.

        Files that cannot be processed are reported and replaced by an
        all-zero row so that row ``i`` always corresponds to
        ``audio_paths[i]``.

        Args:
            audio_paths: List of audio file paths
            normalize: Normalize embeddings

        Returns:
            Array of shape (n_files, 256); (0, 256) for an empty list
        """
        embeddings = []
        print(f"📊 Extracting embeddings from {len(audio_paths)} files...")

        for audio_path in audio_paths:
            try:
                emb = self.extract_embedding(audio_path, normalize=normalize)
            except Exception as e:
                # Best effort: keep going and keep row alignment with the input.
                print(f"⚠️ Skipping {audio_path}: {e}")
                emb = np.zeros(self.EMBEDDING_DIM)  # Placeholder
            embeddings.append(emb)

        if not embeddings:
            # np.array([]) would be 1-D with shape (0,), which breaks the
            # matrix products performed by the callers below.
            return np.zeros((0, self.EMBEDDING_DIM))

        return np.array(embeddings)

    def compute_similarity(
        self,
        audio_path1: Union[str, Path],
        audio_path2: Union[str, Path]
    ) -> float:
        """
        Compute speaker similarity between two audio files.

        Args:
            audio_path1: First audio file
            audio_path2: Second audio file

        Returns:
            Cosine similarity score (0-1, higher is more similar)
        """
        # Extract embeddings
        emb1 = self.extract_embedding(audio_path1, normalize=True)
        emb2 = self.extract_embedding(audio_path2, normalize=True)

        # Both vectors are unit length, so the dot product is the cosine
        similarity = np.dot(emb1, emb2)
        return float(similarity)

    def compute_similarity_matrix(
        self,
        audio_paths: list
    ) -> np.ndarray:
        """
        Compute pairwise similarity matrix for multiple audio files.

        Args:
            audio_paths: List of audio file paths

        Returns:
            Similarity matrix of shape (n_files, n_files). Rows and columns
            belonging to files that could not be processed are all zero.
        """
        # Extract all embeddings
        embeddings = self.extract_embeddings_batch(audio_paths, normalize=True)

        # Compute similarity matrix
        similarity_matrix = np.dot(embeddings, embeddings.T)
        return similarity_matrix

    def find_most_similar(
        self,
        query_audio: Union[str, Path],
        candidate_audios: list,
        top_k: int = 5
    ) -> list:
        """
        Find most similar speakers to a query audio.

        Args:
            query_audio: Query audio file
            candidate_audios: List of candidate audio files
            top_k: Number of top matches to return; values below zero are
                treated as zero

        Returns:
            List of (audio_path, similarity_score) tuples, best match first
        """
        # Extract query embedding
        query_emb = self.extract_embedding(query_audio, normalize=True)

        # Extract candidate embeddings
        candidate_embs = self.extract_embeddings_batch(candidate_audios, normalize=True)

        # Compute similarities
        similarities = np.dot(candidate_embs, query_emb)

        # A negative slice bound would drop entries from the tail of the
        # ranking instead of limiting the result, so clamp it.
        if top_k is not None and top_k < 0:
            top_k = 0

        # Get top-k indices
        top_indices = np.argsort(similarities)[::-1][:top_k]

        # Return results
        results = [
            (candidate_audios[idx], float(similarities[idx]))
            for idx in top_indices
        ]
        return results

    def verify_speaker(
        self,
        audio_path1: Union[str, Path],
        audio_path2: Union[str, Path],
        threshold: float = 0.75
    ) -> Tuple[bool, float]:
        """
        Verify if two audio files are from the same speaker.

        Args:
            audio_path1: First audio file
            audio_path2: Second audio file
            threshold: Similarity threshold for same speaker (default: 0.75)

        Returns:
            Tuple of (is_same_speaker, similarity_score)
        """
        similarity = self.compute_similarity(audio_path1, audio_path2)
        is_same = similarity >= threshold
        return is_same, similarity

    def interpolate_embeddings(
        self,
        audio_path1: Union[str, Path],
        audio_path2: Union[str, Path],
        alpha: float = 0.5
    ) -> np.ndarray:
        """
        Interpolate between two speaker embeddings.

        Useful for creating synthetic speaker characteristics.

        Args:
            audio_path1: First audio file
            audio_path2: Second audio file
            alpha: Interpolation factor (0=speaker1, 1=speaker2)

        Returns:
            Interpolated embedding, rescaled to unit length
        """
        emb1 = self.extract_embedding(audio_path1, normalize=True)
        emb2 = self.extract_embedding(audio_path2, normalize=True)

        # Linear interpolation
        interpolated = (1 - alpha) * emb1 + alpha * emb2

        # A blend of two unit vectors is generally shorter than unit length,
        # so normalize again.
        interpolated = interpolated / (np.linalg.norm(interpolated) + 1e-8)
        return interpolated

    @staticmethod
    def load_audio(
        audio_path: Union[str, Path],
        sr: int = 16000
    ) -> Tuple[np.ndarray, int]:
        """
        Load audio file.

        Declared static because it takes no ``self``; without the decorator
        a call on an instance would bind the instance to ``audio_path``.

        Args:
            audio_path: Path to audio file
            sr: Target sample rate

        Returns:
            Tuple of (audio_array, sample_rate)
        """
        audio, sample_rate = librosa.load(str(audio_path), sr=sr)
        return audio, sample_rate

    def __repr__(self):
        return f"SpeakerEncoder(device={self.device})"
def main():
    """Build a SpeakerEncoder and print a short capability summary."""
    rule = "=" * 60

    for header_line in (rule, "Speaker Encoder Demo", rule):
        print(header_line)

    # Constructed only to exercise initialisation and its status messages.
    SpeakerEncoder(device="cuda")

    summary = (
        "\n✓ Speaker Encoder ready!",
        " Embedding dimension: 256",
        " Use for:",
        " - Extract speaker embeddings",
        " - Compute speaker similarity",
        " - Verify speaker identity",
        " - Interpolate between speakers",
        "\n" + rule,
    )
    for summary_line in summary:
        print(summary_line)


# Run the demo only when executed as a script, not on import.
if __name__ == "__main__":
    main()