crackuser committed on
Commit
c3607d5
·
verified ·
1 Parent(s): 4ec8b62

Delete voice_analyzer.py

Browse files
Files changed (1) hide show
  1. voice_analyzer.py +0 -409
voice_analyzer.py DELETED
@@ -1,409 +0,0 @@
1
- import numpy as np
2
- import librosa
3
- from scipy import stats
4
- from typing import Dict, Tuple
5
- import parselmouth
6
- from parselmouth.praat import call
7
-
8
class VoiceAnalyzer:
    """Extract descriptive voice features from mono audio for cloning applications.

    All analysis runs at ``self.sample_rate``; ``analyze_voice`` resamples its
    input to that rate first. The private ``_analyze_*`` / ``_calculate_*``
    helpers assume the audio they receive is already at that rate.
    """

    def __init__(self, sample_rate: int = 22050):
        """Create an analyzer.

        Args:
            sample_rate: Rate in Hz at which every analysis is performed.
                Defaults to 22050, the value that used to be hard-coded.

        Raises:
            ValueError: If ``sample_rate`` is not positive.
        """
        if sample_rate <= 0:
            raise ValueError("sample_rate must be positive")
        self.sample_rate = sample_rate

    def analyze_voice(self, audio: np.ndarray, sr: int) -> Dict:
        """Run every feature extractor on ``audio`` and merge the results.

        Args:
            audio: 1-D sample array.
            sr: Sampling rate of ``audio`` in Hz.

        Returns:
            A flat dict combining the basic, pitch, formant, spectral,
            prosodic and voice-quality feature dicts.
        """
        # Resample if needed so every helper can rely on self.sample_rate.
        if sr != self.sample_rate:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=self.sample_rate)

        analysis: Dict = {}
        extractors = (
            self._analyze_basic_properties,
            self._analyze_pitch,
            self._analyze_formants,
            self._analyze_spectral_features,
            self._analyze_prosody,
            self._analyze_voice_quality,
        )
        for extract in extractors:
            analysis.update(extract(audio))
        return analysis

    def _analyze_basic_properties(self, audio: np.ndarray) -> Dict:
        """Return duration, RMS energy, mean zero-crossing rate and peak amplitude."""
        duration = len(audio) / self.sample_rate
        rms_energy = np.sqrt(np.mean(audio ** 2))
        zcr = np.mean(librosa.feature.zero_crossing_rate(audio))

        return {
            'duration_seconds': round(duration, 2),
            'rms_energy': round(float(rms_energy), 4),
            'zero_crossing_rate': round(float(zcr), 4),
            'peak_amplitude': round(float(np.max(np.abs(audio))), 4),
        }

    def _analyze_pitch(self, audio: np.ndarray) -> Dict:
        """Summarise the pitch track (mean, std, range, median) in Hz.

        For every frame returned by ``librosa.piptrack`` the pitch of the
        strongest-magnitude bin is taken; frames whose strongest bin has a
        non-positive pitch are ignored. All statistics are 0.0 when no frame
        qualifies.
        """
        pitches, magnitudes = librosa.piptrack(
            y=audio, sr=self.sample_rate, fmin=50, fmax=400
        )

        # Pick, per frame (column), the pitch at the bin with maximal magnitude.
        strongest_bins = magnitudes.argmax(axis=0)
        frame_pitches = pitches[strongest_bins, np.arange(pitches.shape[1])]
        pitch_values = frame_pitches[frame_pitches > 0]

        if pitch_values.size == 0:
            return {
                'fundamental_frequency_mean_hz': 0.0,
                'fundamental_frequency_std_hz': 0.0,
                'fundamental_frequency_range_hz': 0.0,
                'pitch_median_hz': 0.0,
            }

        return {
            'fundamental_frequency_mean_hz': round(float(np.mean(pitch_values)), 2),
            'fundamental_frequency_std_hz': round(float(np.std(pitch_values)), 2),
            'fundamental_frequency_range_hz': round(float(np.ptp(pitch_values)), 2),
            'pitch_median_hz': round(float(np.median(pitch_values)), 2),
        }

    def _analyze_formants(self, audio: np.ndarray) -> Dict:
        """Return mean and standard deviation of the first three formants in Hz.

        Uses Praat (through parselmouth) and samples at most 100 analysis
        frames spread evenly over the whole formant track. Frames in which
        any of F1-F3 is undefined are skipped. If Praat fails, or no frame
        yields all three formants, the spectral-peak estimate from
        ``_estimate_formants_spectral`` is returned instead.
        """
        try:
            sound = parselmouth.Sound(audio, sampling_frequency=self.sample_rate)
            formant = call(sound, "To Formant (burg)", 0.0025, 5, 5500, 0.025, 50)
            n_frames = int(call(formant, "Get number of frames"))

            tracks = ([], [], [])  # F1, F2, F3 values of the accepted frames
            if n_frames > 0:
                # Spread the sampled frames over the full track instead of
                # only probing the first second at fixed 10 ms steps.
                frame_numbers = np.unique(
                    np.round(
                        np.linspace(1, n_frames, num=min(n_frames, 100))
                    ).astype(int)
                )
                for frame_number in frame_numbers:
                    frame_time = call(
                        formant, "Get time from frame number", int(frame_number)
                    )
                    values = [
                        call(formant, "Get value at time", index, frame_time,
                             "Hertz", "Linear")
                        for index in (1, 2, 3)
                    ]
                    if any(np.isnan(value) for value in values):
                        continue
                    for track, value in zip(tracks, values):
                        track.append(value)

            f1_values, f2_values, f3_values = tracks
            if f1_values:
                return {
                    'formant_f1_mean_hz': round(float(np.mean(f1_values)), 2),
                    'formant_f2_mean_hz': round(float(np.mean(f2_values)), 2),
                    'formant_f3_mean_hz': round(float(np.mean(f3_values)), 2),
                    'formant_f1_std_hz': round(float(np.std(f1_values)), 2),
                    'formant_f2_std_hz': round(float(np.std(f2_values)), 2),
                    'formant_f3_std_hz': round(float(np.std(f3_values)), 2),
                }
        except Exception:
            # Deliberate best effort: any Praat/parselmouth failure falls
            # through to the spectral estimate. KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            pass

        # Fallback: estimate formants from spectral peaks
        return self._estimate_formants_spectral(audio)

    def _estimate_formants_spectral(self, audio: np.ndarray) -> Dict:
        """Estimate F1-F3 as the three lowest strong FFT peaks above 200 Hz.

        A peak must reach 10% of the spectrum maximum and be at least 50 bins
        away from a stronger neighbour. Missing formants fall back to
        500 / 1500 / 2500 Hz. Standard deviations are always 0.0 because this
        is a single whole-signal estimate.
        """
        from scipy.signal import find_peaks

        defaults = (500.0, 1500.0, 2500.0)
        candidates = np.array([])

        if len(audio) > 0:  # rfft rejects empty input
            magnitude = np.abs(np.fft.rfft(audio))
            frequencies = np.fft.rfftfreq(len(audio), 1 / self.sample_rate)
            peaks, _ = find_peaks(
                magnitude, height=np.max(magnitude) * 0.1, distance=50
            )
            # Peak indices are ascending, so the frequencies are already sorted.
            peak_freqs = frequencies[peaks]
            candidates = peak_freqs[peak_freqs > 200][:3]

        estimates = [
            float(candidates[i]) if i < len(candidates) else defaults[i]
            for i in range(3)
        ]

        return {
            'formant_f1_mean_hz': round(estimates[0], 2),
            'formant_f2_mean_hz': round(estimates[1], 2),
            'formant_f3_mean_hz': round(estimates[2], 2),
            'formant_f1_std_hz': 0.0,
            'formant_f2_std_hz': 0.0,
            'formant_f3_std_hz': 0.0,
        }

    def _analyze_spectral_features(self, audio: np.ndarray) -> Dict:
        """Return librosa spectral statistics and the per-coefficient MFCC means."""
        sr = self.sample_rate
        spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0]
        spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
        spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

        return {
            'spectral_centroid_mean_hz': round(float(np.mean(spectral_centroids)), 2),
            'spectral_centroid_std_hz': round(float(np.std(spectral_centroids)), 2),
            'spectral_bandwidth_mean_hz': round(float(np.mean(spectral_bandwidth)), 2),
            'spectral_rolloff_mean_hz': round(float(np.mean(spectral_rolloff)), 2),
            'spectral_contrast_mean': round(float(np.mean(spectral_contrast)), 4),
            # One mean per MFCC coefficient (averaged over frames).
            'mfcc_mean': [round(float(x), 4) for x in np.mean(mfccs, axis=1)],
        }

    def _analyze_prosody(self, audio: np.ndarray) -> Dict:
        """Return energy-based voicing ratios plus simplified jitter and shimmer.

        Voicing is decided per 25 ms frame (10 ms hop): a frame counts as
        voiced when its energy exceeds 10% of the mean frame energy. Audio
        shorter than one frame yields ratios of 0.0 rather than NaN. Jitter
        and shimmer are only computed when more than three pitch-period
        candidates are found; otherwise they are 0.0.
        """
        frame_length = int(0.025 * self.sample_rate)  # 25ms frames
        hop_length = int(0.010 * self.sample_rate)    # 10ms hop

        # Energy-based voice activity detection
        frame_energy = np.array([
            np.sum(audio[start:start + frame_length] ** 2)
            for start in range(0, len(audio) - frame_length + 1, hop_length)
        ])

        if frame_energy.size:
            voiced_frames = frame_energy > (np.mean(frame_energy) * 0.1)
            voiced_ratio = float(np.mean(voiced_frames))
            # Use the real hop duration; int() truncation makes it slightly
            # shorter than a nominal 10 ms.
            voiced_duration = (
                np.count_nonzero(voiced_frames) * hop_length / self.sample_rate
            )
        else:
            voiced_ratio = 0.0
            voiced_duration = 0.0

        total_duration = len(audio) / self.sample_rate
        speech_rate = voiced_duration / total_duration if total_duration > 0 else 0.0

        # Jitter and shimmer (simplified estimation)
        pitch_periods = self._extract_pitch_periods(audio)
        if len(pitch_periods) > 3:
            jitter = self._calculate_jitter(pitch_periods)
            shimmer = self._calculate_shimmer(audio, pitch_periods)
        else:
            jitter = 0.0
            shimmer = 0.0

        return {
            'speech_rate_ratio': round(float(speech_rate), 4),
            'voiced_frames_ratio': round(voiced_ratio, 4),
            'jitter_percent': round(jitter * 100, 4),
            'shimmer_percent': round(shimmer * 100, 4),
        }

    def _analyze_voice_quality(self, audio: np.ndarray) -> Dict:
        """Return simplified HNR (dB), spectral tilt (dB/octave) and breathiness ratio."""
        hnr = self._calculate_hnr(audio)
        spectral_tilt = self._calculate_spectral_tilt(audio)
        breathiness = self._calculate_breathiness(audio)

        return {
            'harmonics_to_noise_ratio_db': round(hnr, 2),
            'spectral_tilt_db_oct': round(spectral_tilt, 2),
            'breathiness_ratio': round(breathiness, 4),
        }

    def _extract_pitch_periods(self, audio: np.ndarray) -> np.ndarray:
        """Return up to 10 autocorrelation peak lags (in samples) in the 50-400 Hz range.

        Only the lags that are actually inspected are computed, so the cost is
        proportional to ``len(audio)`` times the lag window instead of the
        square of the signal length.

        NOTE(review): the returned values are lags of autocorrelation peaks
        (typically multiples of one period), not the lengths or positions of
        consecutive glottal cycles, so the jitter/shimmer derived from them
        are coarse proxies only.
        """
        from scipy.signal import find_peaks

        min_period = int(self.sample_rate / 400)  # 400 Hz max
        max_period = int(self.sample_rate / 50)   # 50 Hz min

        n_samples = len(audio)
        upper = min(max_period, n_samples)
        if upper <= min_period:
            return np.array([], dtype=int)

        # Unnormalised autocorrelation at each lag of interest.
        autocorr = np.array([
            np.dot(audio[:n_samples - lag], audio[lag:])
            for lag in range(min_period, upper)
        ])

        peaks, _ = find_peaks(autocorr)
        return (peaks + min_period)[:10]  # Return up to 10 periods

    def _calculate_jitter(self, pitch_periods: np.ndarray) -> float:
        """Return mean absolute successive difference of the periods over their mean.

        Returns 0.0 for fewer than two periods or a non-positive mean.
        """
        if len(pitch_periods) < 2:
            return 0.0

        mean_period = np.mean(pitch_periods)
        if mean_period <= 0:
            return 0.0

        return float(np.mean(np.abs(np.diff(pitch_periods))) / mean_period)

    def _calculate_shimmer(self, audio: np.ndarray, pitch_periods: np.ndarray) -> float:
        """Return relative variation of the peak amplitude around each candidate.

        Each value in ``pitch_periods`` is used as a sample index; the peak
        absolute amplitude within 50 samples on either side is taken, and the
        mean absolute successive difference is divided by the mean amplitude.
        Returns 0.0 when fewer than two amplitudes are available or the mean
        amplitude is not positive.
        """
        if len(pitch_periods) < 2:
            return 0.0

        amplitudes = [
            np.max(np.abs(audio[max(0, period - 50):period + 50]))
            for period in pitch_periods
            if period < len(audio)
        ]
        if len(amplitudes) < 2:
            return 0.0

        mean_amplitude = np.mean(amplitudes)
        if mean_amplitude <= 0:
            return 0.0

        return float(np.mean(np.abs(np.diff(amplitudes))) / mean_amplitude)

    def _calculate_hnr(self, audio: np.ndarray) -> float:
        """Return a simplified harmonics-to-noise ratio in dB.

        This is a crude proxy: power in the lowest quarter of the spectrum is
        treated as "harmonic" and the rest as "noise". Returns 20.0 when the
        noise band has no power, and is floored at -100 dB so that an empty
        harmonic band does not produce ``-inf``.
        """
        power_spectrum = np.abs(np.fft.rfft(audio)) ** 2

        # Assume harmonic content is in lower frequencies
        split = len(power_spectrum) // 4
        harmonic_power = float(np.sum(power_spectrum[:split]))
        noise_power = float(np.sum(power_spectrum[split:]))

        if noise_power <= 0:
            return 20.0  # High HNR if no noise detected

        hnr_ratio = harmonic_power / noise_power
        return float(10 * np.log10(max(hnr_ratio, 1e-10)))

    def _calculate_spectral_tilt(self, audio: np.ndarray) -> float:
        """Return the spectral tilt in dB per octave over 100-4000 Hz.

        A line is fitted to power (dB) against log10(frequency). That slope is
        in dB per decade, so it is scaled by log10(2) to give dB per octave,
        the unit under which the value is reported. Returns 0.0 when 10 or
        fewer bins fall inside the band.
        """
        power_spectrum = np.abs(np.fft.rfft(audio)) ** 2
        frequencies = np.fft.rfftfreq(len(audio), 1 / self.sample_rate)

        # Small offset keeps log10 finite for empty bins.
        power_db = 10 * np.log10(power_spectrum + 1e-10)

        # Focus on speech-relevant frequencies (100-4000 Hz)
        freq_mask = (frequencies >= 100) & (frequencies <= 4000)
        if np.sum(freq_mask) <= 10:
            return 0.0

        slope_db_per_decade = stats.linregress(
            np.log10(frequencies[freq_mask]),
            power_db[freq_mask],
        )[0]
        return float(slope_db_per_decade * np.log10(2))

    def _calculate_breathiness(self, audio: np.ndarray) -> float:
        """Return the share of spectral power that lies between 2000 and 8000 Hz.

        Returns 0.0 when the signal has no power at all.
        """
        power_spectrum = np.abs(np.fft.rfft(audio)) ** 2
        frequencies = np.fft.rfftfreq(len(audio), 1 / self.sample_rate)

        total_power = np.sum(power_spectrum)
        if total_power <= 0:
            return 0.0

        # High frequency power (2000-8000 Hz)
        hf_mask = (frequencies >= 2000) & (frequencies <= 8000)
        return float(np.sum(power_spectrum[hf_mask]) / total_power)

    @staticmethod
    def _relative_similarity(a: float, b: float) -> float:
        """Return 1 minus the relative difference of two positive values, within [0, 1]."""
        return 1.0 - min(1.0, abs(a - b) / max(a, b))

    def calculate_similarity(self, audio1: np.ndarray, audio2: np.ndarray, sr: int) -> float:
        """Return a similarity score in [0, 1] between two audio samples.

        Both samples are analysed with ``analyze_voice``. The score is the
        mean of: the relative similarity of mean pitch, of each of F1-F3 and
        of the spectral centroid (each only when both values are positive),
        plus the cosine similarity of the MFCC mean vectors clipped at 0.
        Returns 0.5 when no feature could be compared.
        """
        features1 = self.analyze_voice(audio1, sr)
        features2 = self.analyze_voice(audio2, sr)

        similarity_scores = []

        scalar_keys = (
            'fundamental_frequency_mean_hz',
            'formant_f1_mean_hz',
            'formant_f2_mean_hz',
            'formant_f3_mean_hz',
            'spectral_centroid_mean_hz',
        )
        for key in scalar_keys:
            value1 = features1.get(key, 0)
            value2 = features2.get(key, 0)
            if value1 > 0 and value2 > 0:
                similarity_scores.append(self._relative_similarity(value1, value2))

        # MFCC similarity (cosine)
        mfcc1 = np.array(features1.get('mfcc_mean', []))
        mfcc2 = np.array(features2.get('mfcc_mean', []))
        if len(mfcc1) > 0 and len(mfcc2) > 0:
            norm1 = np.linalg.norm(mfcc1)
            norm2 = np.linalg.norm(mfcc2)
            if norm1 > 0 and norm2 > 0:
                mfcc_sim = float(np.dot(mfcc1, mfcc2) / (norm1 * norm2))
                similarity_scores.append(max(0.0, mfcc_sim))

        if not similarity_scores:
            return 0.5  # Default similarity if no features could be compared
        return float(np.mean(similarity_scores))