# File size: 8,651 Bytes
# 5ffccae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
"""
Speaker Encoder Module
Extract speaker embeddings and compute similarity using Resemblyzer
"""

import numpy as np
import librosa
import torch
from pathlib import Path
from typing import Union, Tuple
import warnings
warnings.filterwarnings('ignore')

try:
    from resemblyzer import VoiceEncoder, preprocess_wav
except ImportError:
    print("Warning: resemblyzer not installed. Run: pip install resemblyzer")
    VoiceEncoder = None
    preprocess_wav = None


class SpeakerEncoder:
    """
    Speaker embedding extraction and similarity computation

    Wraps a Resemblyzer ``VoiceEncoder`` and exposes helpers to embed audio
    files and compare the resulting vectors.

    Features:
    - Extract 256-dimensional speaker embeddings
    - Compute speaker similarity (cosine similarity)
    - Support for multiple audio formats
    """

    # Length of one embedding vector. Also used for the all-zero placeholder
    # rows and for the shape of an empty batch result.
    EMBEDDING_DIM = 256

    def __init__(self, device: str = "cuda"):
        """
        Initialize Speaker Encoder

        Args:
            device: Device to run on ('cuda' or 'cpu'). Falls back to 'cpu'
                whenever CUDA is not available.

        Raises:
            ImportError: If resemblyzer could not be imported.
            Exception: Any error raised while constructing the encoder is
                printed and re-raised unchanged.
        """
        if VoiceEncoder is None:
            raise ImportError("resemblyzer not installed. Run: pip install resemblyzer")

        # Honour the requested device only when CUDA is actually usable.
        self.device = device if torch.cuda.is_available() else "cpu"

        print(f"🎯 Initializing Speaker Encoder on {self.device}...")

        try:
            self.encoder = VoiceEncoder(device=self.device)
            print("✓ Speaker Encoder initialized successfully!")

        except Exception as e:
            print(f"❌ Error initializing Speaker Encoder: {e}")
            raise

    def extract_embedding(
        self,
        audio_path: Union[str, Path],
        normalize: bool = True
    ) -> np.ndarray:
        """
        Extract speaker embedding from audio

        Args:
            audio_path: Path to audio file
            normalize: Normalize the embedding to unit length

        Returns:
            256-dimensional speaker embedding

        Raises:
            FileNotFoundError: If ``audio_path`` does not exist.
            Exception: Any error from preprocessing or encoding is printed
                and re-raised unchanged.
        """
        audio_path = Path(audio_path)

        if not audio_path.exists():
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        try:
            # Load and preprocess audio
            wav = preprocess_wav(audio_path)

            # Extract embedding
            embedding = self.encoder.embed_utterance(wav)

            # Normalize if requested; the epsilon avoids division by zero
            # for an all-zero embedding.
            if normalize:
                embedding = embedding / (np.linalg.norm(embedding) + 1e-8)

            return embedding

        except Exception as e:
            print(f"❌ Error extracting embedding from {audio_path.name}: {e}")
            raise

    def extract_embeddings_batch(
        self,
        audio_paths: list,
        normalize: bool = True
    ) -> np.ndarray:
        """
        Extract embeddings from multiple audio files

        Files that fail (missing, unreadable, encoder error) are reported on
        stdout and represented by an all-zero row so that row ``i`` of the
        result always corresponds to ``audio_paths[i]``.

        Args:
            audio_paths: List of audio file paths
            normalize: Normalize embeddings

        Returns:
            Array of shape (n_files, 256); shape (0, 256) for an empty list
        """
        embeddings = []

        print(f"📊 Extracting embeddings from {len(audio_paths)} files...")

        for audio_path in audio_paths:
            try:
                emb = self.extract_embedding(audio_path, normalize=normalize)
                embeddings.append(emb)

            except Exception as e:
                print(f"⚠️  Skipping {audio_path}: {e}")
                embeddings.append(np.zeros(self.EMBEDDING_DIM))  # Placeholder

        if not embeddings:
            # np.array([]) would be 1-D with shape (0,), which breaks the
            # matrix products done by callers; keep the result 2-D.
            return np.zeros((0, self.EMBEDDING_DIM))

        return np.array(embeddings)

    def compute_similarity(
        self,
        audio_path1: Union[str, Path],
        audio_path2: Union[str, Path]
    ) -> float:
        """
        Compute speaker similarity between two audio files

        Args:
            audio_path1: First audio file
            audio_path2: Second audio file

        Returns:
            Cosine similarity score of the two unit-normalized embeddings
            (higher is more similar)
        """
        # Extract embeddings
        emb1 = self.extract_embedding(audio_path1, normalize=True)
        emb2 = self.extract_embedding(audio_path2, normalize=True)

        # Both vectors are unit length, so the dot product is the cosine.
        similarity = np.dot(emb1, emb2)

        return float(similarity)

    def compute_similarity_matrix(
        self,
        audio_paths: list
    ) -> np.ndarray:
        """
        Compute pairwise similarity matrix for multiple audio files

        Rows/columns belonging to files that failed to load are all zeros
        (see ``extract_embeddings_batch``).

        Args:
            audio_paths: List of audio file paths

        Returns:
            Similarity matrix of shape (n_files, n_files)
        """
        # Extract all embeddings
        embeddings = self.extract_embeddings_batch(audio_paths, normalize=True)

        # Compute similarity matrix
        similarity_matrix = np.dot(embeddings, embeddings.T)

        return similarity_matrix

    def find_most_similar(
        self,
        query_audio: Union[str, Path],
        candidate_audios: list,
        top_k: int = 5
    ) -> list:
        """
        Find most similar speakers to a query audio

        Candidates that fail to load receive a similarity of 0.0 because
        their embedding is an all-zero placeholder.

        Args:
            query_audio: Query audio file
            candidate_audios: List of candidate audio files
            top_k: Number of top matches to return

        Returns:
            List of (audio_path, similarity_score) tuples, best match first;
            empty when there are no candidates

        Raises:
            ValueError: If ``top_k`` is negative.
        """
        # A negative slice bound would silently drop the *worst* matches
        # instead of limiting the result, so reject it explicitly.
        if top_k is not None and top_k < 0:
            raise ValueError(f"top_k must be non-negative, got {top_k}")

        # Extract query embedding
        query_emb = self.extract_embedding(query_audio, normalize=True)

        # Extract candidate embeddings
        candidate_embs = self.extract_embeddings_batch(candidate_audios, normalize=True)

        # Compute similarities
        similarities = np.dot(candidate_embs, query_emb)

        # Get top-k indices (descending similarity)
        top_indices = np.argsort(similarities)[::-1][:top_k]

        # Return results
        results = [
            (candidate_audios[idx], float(similarities[idx]))
            for idx in top_indices
        ]

        return results

    def verify_speaker(
        self,
        audio_path1: Union[str, Path],
        audio_path2: Union[str, Path],
        threshold: float = 0.75
    ) -> Tuple[bool, float]:
        """
        Verify if two audio files are from the same speaker

        Args:
            audio_path1: First audio file
            audio_path2: Second audio file
            threshold: Similarity threshold for same speaker (default: 0.75)

        Returns:
            Tuple of (is_same_speaker, similarity_score)
        """
        similarity = self.compute_similarity(audio_path1, audio_path2)
        is_same = similarity >= threshold

        return is_same, similarity

    def interpolate_embeddings(
        self,
        audio_path1: Union[str, Path],
        audio_path2: Union[str, Path],
        alpha: float = 0.5
    ) -> np.ndarray:
        """
        Interpolate between two speaker embeddings
        Useful for creating synthetic speaker characteristics

        Args:
            audio_path1: First audio file
            audio_path2: Second audio file
            alpha: Interpolation factor (0=speaker1, 1=speaker2)

        Returns:
            Interpolated embedding, re-normalized to unit length
        """
        emb1 = self.extract_embedding(audio_path1, normalize=True)
        emb2 = self.extract_embedding(audio_path2, normalize=True)

        # Linear interpolation
        interpolated = (1 - alpha) * emb1 + alpha * emb2

        # The blend of two unit vectors is generally not unit length.
        interpolated = interpolated / (np.linalg.norm(interpolated) + 1e-8)

        return interpolated

    @staticmethod
    def load_audio(
        audio_path: Union[str, Path],
        sr: int = 16000
    ) -> Tuple[np.ndarray, int]:
        """
        Load audio file

        Args:
            audio_path: Path to audio file
            sr: Target sample rate

        Returns:
            Tuple of (audio_array, sample_rate)
        """
        audio, sample_rate = librosa.load(str(audio_path), sr=sr)
        return audio, sample_rate

    def __repr__(self):
        return f"SpeakerEncoder(device={self.device})"


def main():
    """Construct a SpeakerEncoder and print a short capability summary."""
    banner = "=" * 60

    print(banner)
    print("Speaker Encoder Demo")
    print(banner)

    # Building the encoder is the demo itself: it prints its own progress
    # messages and raises if the model cannot be set up.
    encoder = SpeakerEncoder(device="cuda")

    summary_lines = (
        "\n✓ Speaker Encoder ready!",
        "   Embedding dimension: 256",
        "   Use for:",
        "   - Extract speaker embeddings",
        "   - Compute speaker similarity",
        "   - Verify speaker identity",
        "   - Interpolate between speakers",
    )
    for line in summary_lines:
        print(line)

    print("\n" + banner)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()