crackuser committed on
Commit
142aa49
·
verified ·
1 Parent(s): 0d1b7fe

Create voice_analyzer.py

Browse files
Files changed (1) hide show
  1. voice_analyzer.py +409 -0
voice_analyzer.py ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import librosa
3
+ from scipy import stats
4
+ from typing import Dict, Tuple
5
+ import parselmouth
6
+ from parselmouth.praat import call
7
+
8
class VoiceAnalyzer:
    """Advanced voice analysis for cloning applications.

    All features are computed at ``self.sample_rate``;
    :meth:`analyze_voice` resamples its input to that rate first.  Each
    ``_analyze_*`` helper returns a flat dict of rounded features which
    :meth:`analyze_voice` merges into a single result.
    """

    def __init__(self):
        # Working sample rate (Hz) assumed by every helper below.
        self.sample_rate = 22050

    def analyze_voice(self, audio: np.ndarray, sr: int) -> Dict:
        """Run every analysis stage on *audio* and merge the results.

        Args:
            audio: Waveform samples.
            sr: Sample rate of *audio* in Hz; the signal is resampled when
                it differs from ``self.sample_rate``.

        Returns:
            One dict holding the keys produced by the basic, pitch,
            formant, spectral, prosody and voice-quality stages.
        """
        if sr != self.sample_rate:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=self.sample_rate)

        stages = (
            self._analyze_basic_properties,
            self._analyze_pitch,
            self._analyze_formants,
            self._analyze_spectral_features,
            self._analyze_prosody,
            self._analyze_voice_quality,
        )
        analysis = {}
        for stage in stages:
            analysis.update(stage(audio))
        return analysis

    def _analyze_basic_properties(self, audio: np.ndarray) -> Dict:
        """Return duration, RMS energy, zero-crossing rate and peak amplitude."""
        duration = len(audio) / self.sample_rate
        rms_energy = np.sqrt(np.mean(audio ** 2))
        zcr = np.mean(librosa.feature.zero_crossing_rate(audio))

        return {
            'duration_seconds': round(duration, 2),
            'rms_energy': round(float(rms_energy), 4),
            'zero_crossing_rate': round(float(zcr), 4),
            'peak_amplitude': round(float(np.max(np.abs(audio))), 4),
        }

    def _analyze_pitch(self, audio: np.ndarray) -> Dict:
        """Summarise the pitch track obtained from ``librosa.piptrack``.

        For every frame the pitch of the strongest-magnitude bin is taken;
        frames whose selected pitch is not positive are ignored.  All
        statistics are 0 when no frame yields a pitch.
        """
        pitches, magnitudes = librosa.piptrack(
            y=audio, sr=self.sample_rate, fmin=50, fmax=400
        )

        # Vectorised form of "argmax bin per frame, keep pitch if > 0".
        strongest_bin = magnitudes.argmax(axis=0)
        frame_pitch = pitches[strongest_bin, np.arange(pitches.shape[1])]
        pitch_values = frame_pitch[frame_pitch > 0]

        if pitch_values.size == 0:
            return {
                'fundamental_frequency_mean_hz': 0,
                'fundamental_frequency_std_hz': 0,
                'fundamental_frequency_range_hz': 0,
                'pitch_median_hz': 0,
            }

        return {
            'fundamental_frequency_mean_hz': round(float(np.mean(pitch_values)), 2),
            'fundamental_frequency_std_hz': round(float(np.std(pitch_values)), 2),
            'fundamental_frequency_range_hz': round(float(np.ptp(pitch_values)), 2),
            'pitch_median_hz': round(float(np.median(pitch_values)), 2),
        }

    def _analyze_formants(self, audio: np.ndarray) -> Dict:
        """Return mean/std of F1-F3 using Praat, with a spectral fallback.

        Formants are tracked with Praat's Burg method through parselmouth
        and sampled on a 10 ms grid (at most 99 points).  Samples where any
        of F1-F3 is NaN are dropped.  If Praat fails, or no usable sample
        remains, :meth:`_estimate_formants_spectral` is used instead.
        """
        samples = []
        try:
            sound = parselmouth.Sound(audio, sampling_frequency=self.sample_rate)
            formant = call(sound, "To Formant (burg)", 0.0025, 5, 5500, 0.025, 50)
            # int(): keep range() happy whatever numeric type Praat hands back.
            n_frames = int(call(formant, "Get number of frames"))

            # NOTE(review): the 10 ms query grid is independent of the 2.5 ms
            # analysis step above, so only roughly the first second is
            # sampled; query times past the end of the sound presumably
            # come back as NaN and are filtered out below - confirm.
            for i in range(1, min(n_frames + 1, 100)):
                time_s = i * 0.01
                f1, f2, f3 = (
                    call(formant, "Get value at time", number, time_s, "Hertz", "Linear")
                    for number in (1, 2, 3)
                )
                if not (np.isnan(f1) or np.isnan(f2) or np.isnan(f3)):
                    samples.append((f1, f2, f3))
        except Exception:
            # Best-effort: any Praat/parselmouth failure falls through to the
            # spectral estimate, but is no longer swallowed silently (and
            # KeyboardInterrupt/SystemExit are no longer caught).
            import logging

            logging.getLogger(__name__).debug(
                "Praat formant analysis failed; using spectral fallback",
                exc_info=True,
            )
            samples = []

        if samples:
            tracks = np.array(samples)  # one row per sample: (F1, F2, F3)
            means = np.mean(tracks, axis=0)
            stds = np.std(tracks, axis=0)
            return {
                'formant_f1_mean_hz': round(float(means[0]), 2),
                'formant_f2_mean_hz': round(float(means[1]), 2),
                'formant_f3_mean_hz': round(float(means[2]), 2),
                'formant_f1_std_hz': round(float(stds[0]), 2),
                'formant_f2_std_hz': round(float(stds[1]), 2),
                'formant_f3_std_hz': round(float(stds[2]), 2),
            }

        # Fallback: estimate formants from spectral peaks
        return self._estimate_formants_spectral(audio)

    def _magnitude_spectrum(self, audio: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Return ``(frequencies_hz, magnitude)`` of the one-sided FFT.

        Empty input yields two empty arrays rather than the ``ValueError``
        ``np.fft.rfft`` raises for zero-length data, so callers can take
        their "nothing to measure" branch.
        """
        if len(audio) == 0:
            return np.zeros(0), np.zeros(0)
        magnitude = np.abs(np.fft.rfft(audio))
        frequencies = np.fft.rfftfreq(len(audio), 1 / self.sample_rate)
        return frequencies, magnitude

    def _estimate_formants_spectral(self, audio: np.ndarray) -> Dict:
        """Estimate F1-F3 as the three lowest strong spectral peaks > 200 Hz.

        A peak must reach 10 % of the largest magnitude and be at least 50
        bins from its neighbours.  Missing formants default to 500, 1500
        and 2500 Hz; the standard deviations are always 0.0 because only a
        single whole-signal spectrum is inspected.
        """
        from scipy.signal import find_peaks

        frequencies, magnitude = self._magnitude_spectrum(audio)

        if magnitude.size:
            peaks, _ = find_peaks(
                magnitude, height=np.max(magnitude) * 0.1, distance=50
            )
            peak_freqs = frequencies[peaks]
        else:
            peak_freqs = np.zeros(0)

        formants = np.sort(peak_freqs[peak_freqs > 200])[:3]
        defaults = (500.0, 1500.0, 2500.0)
        f1, f2, f3 = (
            float(formants[i]) if i < len(formants) else defaults[i]
            for i in range(3)
        )

        return {
            'formant_f1_mean_hz': round(f1, 2),
            'formant_f2_mean_hz': round(f2, 2),
            'formant_f3_mean_hz': round(f3, 2),
            'formant_f1_std_hz': 0.0,
            'formant_f2_std_hz': 0.0,
            'formant_f3_std_hz': 0.0,
        }

    def _analyze_spectral_features(self, audio: np.ndarray) -> Dict:
        """Return librosa spectral centroid/bandwidth/rolloff/contrast and MFCC means."""
        sr = self.sample_rate
        spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0]
        spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
        spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

        return {
            'spectral_centroid_mean_hz': round(float(np.mean(spectral_centroids)), 2),
            'spectral_centroid_std_hz': round(float(np.std(spectral_centroids)), 2),
            'spectral_bandwidth_mean_hz': round(float(np.mean(spectral_bandwidth)), 2),
            'spectral_rolloff_mean_hz': round(float(np.mean(spectral_rolloff)), 2),
            'spectral_contrast_mean': round(float(np.mean(spectral_contrast)), 4),
            # One mean per MFCC coefficient (averaged along axis 1).
            'mfcc_mean': [round(float(x), 4) for x in np.mean(mfccs, axis=1)],
        }

    def _analyze_prosody(self, audio: np.ndarray) -> Dict:
        """Return voiced-time ratios plus simplified jitter and shimmer.

        Voice activity is energy based: 25 ms frames with a 10 ms hop, a
        frame counting as voiced when its energy exceeds 10 % of the mean
        frame energy.  A clip shorter than one frame reports 0.0 for both
        ratios instead of NaN.
        """
        frame_length = int(0.025 * self.sample_rate)  # 25ms frames
        hop_length = int(0.010 * self.sample_rate)    # 10ms hop

        energy = np.array([
            np.sum(audio[start:start + frame_length] ** 2)
            for start in range(0, len(audio) - frame_length + 1, hop_length)
        ])

        if energy.size:
            voiced_frames = energy > (np.mean(energy) * 0.1)
            voiced_ratio = float(np.mean(voiced_frames))
            # Use the real hop duration: int() truncation makes the hop a
            # little shorter than 10 ms, and assuming exactly 10 ms lets
            # the ratio creep above 1 on long, fully voiced recordings.
            voiced_duration = (
                float(np.sum(voiced_frames)) * hop_length / self.sample_rate
            )
        else:
            voiced_ratio = 0.0
            voiced_duration = 0.0

        total_duration = len(audio) / self.sample_rate
        speech_rate = voiced_duration / total_duration if total_duration > 0 else 0.0

        # Jitter and shimmer (simplified estimation)
        pitch_periods = self._extract_pitch_periods(audio)
        if len(pitch_periods) > 3:
            jitter = self._calculate_jitter(pitch_periods)
            shimmer = self._calculate_shimmer(audio, pitch_periods)
        else:
            jitter = 0.0
            shimmer = 0.0

        return {
            'speech_rate_ratio': round(float(speech_rate), 4),
            'voiced_frames_ratio': round(voiced_ratio, 4),
            'jitter_percent': round(float(jitter) * 100, 4),
            'shimmer_percent': round(float(shimmer) * 100, 4),
        }

    def _analyze_voice_quality(self, audio: np.ndarray) -> Dict:
        """Return simplified HNR, spectral tilt and breathiness measures."""
        hnr = self._calculate_hnr(audio)
        spectral_tilt = self._calculate_spectral_tilt(audio)
        breathiness = self._calculate_breathiness(audio)

        return {
            'harmonics_to_noise_ratio_db': round(hnr, 2),
            'spectral_tilt_db_oct': round(spectral_tilt, 2),
            'breathiness_ratio': round(breathiness, 4),
        }

    def _extract_pitch_periods(self, audio: np.ndarray) -> np.ndarray:
        """Return up to 10 autocorrelation peak lags (in samples).

        Only lags matching 50-400 Hz are searched.  The autocorrelation is
        computed through the FFT (O(n log n)); the direct full correlation
        it replaces is O(n^2) and impractical for recordings of more than
        a few seconds.

        NOTE(review): the returned values are lags of whole-signal
        autocorrelation peaks (roughly T, 2T, 3T, ...), not successive
        glottal periods, so the jitter/shimmer derived from them are only
        rough proxies.
        """
        from scipy.signal import find_peaks

        n = len(audio)
        if n == 0:
            return np.zeros(0, dtype=int)

        min_period = int(self.sample_rate / 400)  # 400 Hz max
        max_period = int(self.sample_rate / 50)   # 50 Hz min

        # Zero-pad by at least max_period so circular wrap-around cannot
        # contaminate any lag we look at; a power of two keeps the FFT fast.
        nfft = 1 << (n + max_period - 1).bit_length()
        spectrum = np.fft.rfft(np.asarray(audio, dtype=np.float64), n=nfft)
        autocorr = np.fft.irfft(spectrum * np.conj(spectrum), n=nfft)[:n]

        peaks, _ = find_peaks(autocorr[min_period:max_period])
        return (peaks + min_period)[:10]  # Return up to 10 periods

    def _calculate_jitter(self, pitch_periods: np.ndarray) -> float:
        """Return mean absolute consecutive difference divided by the mean.

        Returns 0.0 for fewer than two values or a non-positive mean.
        """
        if len(pitch_periods) < 2:
            return 0.0

        mean_period = np.mean(pitch_periods)
        if mean_period > 0:
            return float(np.mean(np.abs(np.diff(pitch_periods))) / mean_period)
        return 0.0

    def _calculate_shimmer(self, audio: np.ndarray, pitch_periods: np.ndarray) -> float:
        """Return relative variability of peak amplitudes around each position.

        Each entry of *pitch_periods* is used as a sample index; the peak
        absolute amplitude within +/-50 samples of it is collected, and
        the mean absolute consecutive difference is divided by the mean
        amplitude.  Returns 0.0 when fewer than two amplitudes exist or
        the mean amplitude is not positive.
        """
        if len(pitch_periods) < 2:
            return 0.0

        amplitudes = [
            np.max(np.abs(audio[max(0, period - 50):period + 50]))
            for period in pitch_periods
            if period < len(audio)
        ]
        if len(amplitudes) < 2:
            return 0.0

        mean_amplitude = np.mean(amplitudes)
        if mean_amplitude > 0:
            return float(np.mean(np.abs(np.diff(amplitudes))) / mean_amplitude)
        return 0.0

    def _calculate_hnr(self, audio: np.ndarray) -> float:
        """Return a crude harmonics-to-noise ratio in dB.

        Simplified: power in the lowest quarter of the spectrum is treated
        as "harmonic" and the rest as "noise".  Returns 20.0 when the noise
        band holds no power (including silent or empty input).
        """
        _, magnitude = self._magnitude_spectrum(audio)
        power_spectrum = magnitude ** 2

        split = len(power_spectrum) // 4
        harmonic_power = float(np.sum(power_spectrum[:split]))
        # Sum the noise band directly; "total - harmonic" loses precision
        # (and can dip below zero) when almost all power is harmonic.
        noise_power = float(np.sum(power_spectrum[split:]))

        if noise_power > 0:
            # Clamp so an empty harmonic band gives a large negative value
            # instead of log10(0) = -inf plus a RuntimeWarning.
            floor = float(np.finfo(float).tiny)
            return float(
                10 * (np.log10(max(harmonic_power, floor)) - np.log10(noise_power))
            )

        return 20.0  # High HNR if no noise detected

    def _calculate_spectral_tilt(self, audio: np.ndarray) -> float:
        """Return the spectral tilt in dB per octave over 100-4000 Hz.

        A line is fitted to power (dB) against log10(frequency), whose
        slope is in dB/decade; multiplying by log10(2) converts it to the
        dB/octave unit the ``spectral_tilt_db_oct`` result key advertises.
        Returns 0.0 when 10 or fewer bins fall inside the band.
        """
        frequencies, magnitude = self._magnitude_spectrum(audio)
        power_db = 10 * np.log10(magnitude ** 2 + 1e-10)

        # Focus on speech-relevant frequencies (100-4000 Hz)
        freq_mask = (frequencies >= 100) & (frequencies <= 4000)
        if np.sum(freq_mask) > 10:
            slope_db_per_decade = stats.linregress(
                np.log10(frequencies[freq_mask]),
                power_db[freq_mask],
            )[0]
            return float(slope_db_per_decade * np.log10(2.0))

        return 0.0

    def _calculate_breathiness(self, audio: np.ndarray) -> float:
        """Return the share of spectral power lying in 2000-8000 Hz.

        Returns 0.0 for silent or empty input.
        """
        frequencies, magnitude = self._magnitude_spectrum(audio)
        power_spectrum = magnitude ** 2

        total_power = np.sum(power_spectrum)
        if total_power > 0:
            hf_mask = (frequencies >= 2000) & (frequencies <= 8000)
            return float(np.sum(power_spectrum[hf_mask]) / total_power)

        return 0.0

    @staticmethod
    def _relative_similarity(a: float, b: float) -> float:
        """Return ``1 - |a - b| / max(a, b)`` clipped at 0; needs a, b > 0."""
        return 1 - min(1, abs(a - b) / max(a, b))

    def calculate_similarity(self, audio1: np.ndarray, audio2: np.ndarray, sr: int) -> float:
        """Return the mean feature similarity of two recordings.

        Both clips are analysed with :meth:`analyze_voice`.  Mean F0, mean
        F1-F3 and mean spectral centroid are compared by relative
        difference (only when both values are positive); the MFCC mean
        vectors are compared by cosine similarity, floored at 0.  Returns
        0.5 when no feature could be compared.
        """
        features1 = self.analyze_voice(audio1, sr)
        features2 = self.analyze_voice(audio2, sr)

        scalar_keys = ['fundamental_frequency_mean_hz']
        scalar_keys += [f'formant_f{i}_mean_hz' for i in range(1, 4)]
        scalar_keys.append('spectral_centroid_mean_hz')

        similarity_scores = []
        for key in scalar_keys:
            value1 = features1.get(key, 0)
            value2 = features2.get(key, 0)
            if value1 > 0 and value2 > 0:
                similarity_scores.append(self._relative_similarity(value1, value2))

        # MFCC similarity (cosine)
        mfcc1 = np.array(features1.get('mfcc_mean', []))
        mfcc2 = np.array(features2.get('mfcc_mean', []))
        if len(mfcc1) > 0 and len(mfcc2) > 0:
            norm1 = np.linalg.norm(mfcc1)
            norm2 = np.linalg.norm(mfcc2)
            if norm1 > 0 and norm2 > 0:
                mfcc_sim = np.dot(mfcc1, mfcc2) / (norm1 * norm2)
                similarity_scores.append(max(0, mfcc_sim))

        if similarity_scores:
            return float(np.mean(similarity_scores))
        return 0.5  # Default similarity if no features could be compared