""" Speaker Encoder Module Extract speaker embeddings and compute similarity using Resemblyzer """ import numpy as np import librosa import torch from pathlib import Path from typing import Union, Tuple import warnings warnings.filterwarnings('ignore') try: from resemblyzer import VoiceEncoder, preprocess_wav except ImportError: print("Warning: resemblyzer not installed. Run: pip install resemblyzer") VoiceEncoder = None preprocess_wav = None class SpeakerEncoder: """ Speaker embedding extraction and similarity computation Features: - Extract 256-dimensional speaker embeddings - Compute speaker similarity (cosine similarity) - Support for multiple audio formats """ def __init__(self, device: str = "cuda"): """ Initialize Speaker Encoder Args: device: Device to run on ('cuda' or 'cpu') """ if VoiceEncoder is None: raise ImportError("resemblyzer not installed. Run: pip install resemblyzer") self.device = device if torch.cuda.is_available() else "cpu" print(f"šŸŽÆ Initializing Speaker Encoder on {self.device}...") try: self.encoder = VoiceEncoder(device=self.device) print("āœ“ Speaker Encoder initialized successfully!") except Exception as e: print(f"āŒ Error initializing Speaker Encoder: {e}") raise def extract_embedding( self, audio_path: Union[str, Path], normalize: bool = True ) -> np.ndarray: """ Extract speaker embedding from audio Args: audio_path: Path to audio file normalize: Normalize the embedding to unit length Returns: 256-dimensional speaker embedding """ audio_path = Path(audio_path) if not audio_path.exists(): raise FileNotFoundError(f"Audio file not found: {audio_path}") try: # Load and preprocess audio wav = preprocess_wav(audio_path) # Extract embedding embedding = self.encoder.embed_utterance(wav) # Normalize if requested if normalize: embedding = embedding / (np.linalg.norm(embedding) + 1e-8) return embedding except Exception as e: print(f"āŒ Error extracting embedding from {audio_path.name}: {e}") raise def extract_embeddings_batch( self, audio_paths: list, normalize: bool = True ) -> np.ndarray: """ Extract embeddings from multiple audio files Args: audio_paths: List of audio file paths normalize: Normalize embeddings Returns: Array of shape (n_files, 256) """ embeddings = [] print(f"šŸ“Š Extracting embeddings from {len(audio_paths)} files...") for audio_path in audio_paths: try: emb = self.extract_embedding(audio_path, normalize=normalize) embeddings.append(emb) except Exception as e: print(f"āš ļø Skipping {audio_path}: {e}") embeddings.append(np.zeros(256)) # Placeholder return np.array(embeddings) def compute_similarity( self, audio_path1: Union[str, Path], audio_path2: Union[str, Path] ) -> float: """ Compute speaker similarity between two audio files Args: audio_path1: First audio file audio_path2: Second audio file Returns: Cosine similarity score (0-1, higher is more similar) """ # Extract embeddings emb1 = self.extract_embedding(audio_path1, normalize=True) emb2 = self.extract_embedding(audio_path2, normalize=True) # Compute cosine similarity similarity = np.dot(emb1, emb2) return float(similarity) def compute_similarity_matrix( self, audio_paths: list ) -> np.ndarray: """ Compute pairwise similarity matrix for multiple audio files Args: audio_paths: List of audio file paths Returns: Similarity matrix of shape (n_files, n_files) """ # Extract all embeddings embeddings = self.extract_embeddings_batch(audio_paths, normalize=True) # Compute similarity matrix similarity_matrix = np.dot(embeddings, embeddings.T) return similarity_matrix def find_most_similar( self, query_audio: Union[str, Path], candidate_audios: list, top_k: int = 5 ) -> list: """ Find most similar speakers to a query audio Args: query_audio: Query audio file candidate_audios: List of candidate audio files top_k: Number of top matches to return Returns: List of (audio_path, similarity_score) tuples """ # Extract query embedding query_emb = self.extract_embedding(query_audio, normalize=True) # Extract candidate embeddings candidate_embs = self.extract_embeddings_batch(candidate_audios, normalize=True) # Compute similarities similarities = np.dot(candidate_embs, query_emb) # Get top-k indices top_indices = np.argsort(similarities)[::-1][:top_k] # Return results results = [ (candidate_audios[idx], float(similarities[idx])) for idx in top_indices ] return results def verify_speaker( self, audio_path1: Union[str, Path], audio_path2: Union[str, Path], threshold: float = 0.75 ) -> Tuple[bool, float]: """ Verify if two audio files are from the same speaker Args: audio_path1: First audio file audio_path2: Second audio file threshold: Similarity threshold for same speaker (default: 0.75) Returns: Tuple of (is_same_speaker, similarity_score) """ similarity = self.compute_similarity(audio_path1, audio_path2) is_same = similarity >= threshold return is_same, similarity def interpolate_embeddings( self, audio_path1: Union[str, Path], audio_path2: Union[str, Path], alpha: float = 0.5 ) -> np.ndarray: """ Interpolate between two speaker embeddings Useful for creating synthetic speaker characteristics Args: audio_path1: First audio file audio_path2: Second audio file alpha: Interpolation factor (0=speaker1, 1=speaker2) Returns: Interpolated embedding """ emb1 = self.extract_embedding(audio_path1, normalize=True) emb2 = self.extract_embedding(audio_path2, normalize=True) # Linear interpolation interpolated = (1 - alpha) * emb1 + alpha * emb2 # Normalize interpolated = interpolated / (np.linalg.norm(interpolated) + 1e-8) return interpolated @staticmethod def load_audio( audio_path: Union[str, Path], sr: int = 16000 ) -> Tuple[np.ndarray, int]: """ Load audio file Args: audio_path: Path to audio file sr: Target sample rate Returns: Tuple of (audio_array, sample_rate) """ audio, sample_rate = librosa.load(str(audio_path), sr=sr) return audio, sample_rate def __repr__(self): return f"SpeakerEncoder(device={self.device})" def main(): """Demo usage of SpeakerEncoder""" print("=" * 60) print("Speaker Encoder Demo") print("=" * 60) # Initialize encoder = SpeakerEncoder(device="cuda") print("\nāœ“ Speaker Encoder ready!") print(" Embedding dimension: 256") print(" Use for:") print(" - Extract speaker embeddings") print(" - Compute speaker similarity") print(" - Verify speaker identity") print(" - Interpolate between speakers") print("\n" + "=" * 60) if __name__ == "__main__": main()