AbosamraOnFire13 committed on
Commit
6f8acdb
·
verified ·
1 Parent(s): 387b29c

Update infer.py

Browse files
Files changed (1) hide show
  1. infer.py +190 -175
infer.py CHANGED
@@ -1,176 +1,191 @@
1
- import os
2
- import numpy as np
3
- import librosa
4
- import noisereduce as nr
5
- import parselmouth
6
- from parselmouth.praat import call
7
- import joblib
8
- from typing import Dict, Optional
9
-
10
- class SpeakerClassifier:
11
- def __init__(self):
12
- """Initialize models and ensure they're loaded once."""
13
- self.gender_model = joblib.load("stacked_gender_model.joblib")
14
- self.age_model = joblib.load("stacked_age_model.joblib")
15
-
16
- def predict(self, audio_path: str) -> Dict[str, str]:
17
- """
18
- Predict gender and age from an audio file.
19
- Returns: {'gender': 'male/female', 'age': '20s/50s'}
20
- """
21
- features = self._extract_features(audio_path)
22
- if features is None:
23
- return {"error": "Feature extraction failed"}
24
-
25
- # Predict using your models
26
- gender_num = self.gender_model.predict([features])[0]
27
- age_num = self.age_model.predict([features])[0]
28
-
29
- # Map numerical predictions to labels
30
- gender = "male" if gender_num == 0 else "female"
31
- age = "20s" if age_num == 0 else "50s"
32
-
33
- return {"gender": gender, "age": age}
34
-
35
- # --- Your Feature Extraction Functions (adapted) ---
36
- @staticmethod
37
- def _normalize_volume(audio, target_dBFS=-20):
38
- rms = np.sqrt(np.mean(audio**2))
39
- gain = 10**((target_dBFS - 20*np.log10(rms))/20)
40
- return audio * gain
41
-
42
- @staticmethod
43
- def _remove_silence(audio, top_db=20):
44
- intervals = librosa.effects.split(audio, top_db=top_db)
45
- return np.concatenate([audio[start:end] for start, end in intervals])
46
-
47
- @staticmethod
48
- def _equalize_audio(audio, sr, bass_boost=2, treble_boost=1.5):
49
- S = librosa.stft(audio)
50
- freqs = librosa.fft_frequencies(sr=sr)
51
- S[freqs < 250] *= bass_boost
52
- S[freqs > 4000] *= treble_boost
53
- return librosa.istft(S)
54
-
55
- def _preprocess_audio(self, audio, sr, target_sr=16000):
56
- audio = self._remove_silence(audio)
57
- audio = nr.reduce_noise(y=audio, sr=target_sr)
58
- audio = self._normalize_volume(audio)
59
- audio = self._equalize_audio(audio, target_sr)
60
- return audio
61
-
62
- def _extract_formants(self, y, sr):
63
- try:
64
- sound = parselmouth.Sound(y, sampling_frequency=sr)
65
- formant = sound.to_formant_burg(time_step=0.01)
66
-
67
- f1_list, f2_list, f3_list = [], [], []
68
- for t in np.arange(0, sound.duration, 0.01):
69
- try:
70
- f1 = formant.get_value_at_time(1, t)
71
- f2 = formant.get_value_at_time(2, t)
72
- f3 = formant.get_value_at_time(3, t)
73
- if all(v and not np.isnan(v) for v in [f1, f2, f3]):
74
- f1_list.append(f1)
75
- f2_list.append(f2)
76
- f3_list.append(f3)
77
- except Exception:
78
- continue
79
-
80
- features = [
81
- np.mean(f1_list) if f1_list else 0,
82
- np.std(f1_list) if f1_list else 0,
83
- np.median(f1_list) if f1_list else 0,
84
- (np.percentile(f1_list, 75) - np.percentile(f1_list, 25)) if f1_list else 0,
85
- # ... (include all your formant features)
86
- ]
87
- return np.array(features)
88
- except Exception:
89
- return None
90
-
91
- def _calculate_jitter(self, y, sr):
92
- try:
93
- sound = parselmouth.Sound(y, sampling_frequency=sr)
94
- pointProcess = call(sound, "To PointProcess (periodic, cc)", 75, 500)
95
- harmonicity = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
96
-
97
- metrics = np.array([
98
- call(harmonicity, "Get mean", 0, 0),
99
- call(pointProcess, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3),
100
- # ... (include all your jitter/shimmer metrics)
101
- ])
102
- return metrics
103
- except Exception:
104
- return None
105
-
106
- def _extract_features(self, audio_path: str) -> Optional[np.ndarray]:
107
- """Main feature extraction pipeline."""
108
- try:
109
- y, sr = librosa.load(audio_path, sr=16000, duration=7)
110
- y = self._preprocess_audio(y, sr)
111
-
112
- # Extract all feature types
113
- jitter_features = self._calculate_jitter(y, sr)
114
- formant_features = self._extract_formants(y, sr)
115
-
116
- # F0 features
117
- f0, _, _ = librosa.pyin(y, sr=sr, fmin=75, fmax=500, frame_length=1024)
118
- f0 = f0[~np.isnan(f0)]
119
- f0_features = self._get_f0_features(f0) if len(f0) > 0 else self._get_default_f0_features()
120
-
121
- # MFCCs
122
- mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, n_fft=512, hop_length=256)
123
- mfcc_features = np.concatenate([np.mean(mfccs, axis=1), np.std(mfccs, axis=1)])
124
-
125
- # Spectral features
126
- spectral_tilt = self._compute_spectral_tilt(y, sr)
127
- cpp = self._compute_cpp(y, sr)
128
- speaking_rate = self._compute_speaking_rate(y, sr)
129
-
130
- # Combine all features
131
- features = np.concatenate([
132
- [spectral_tilt, cpp, speaking_rate],
133
- mfcc_features,
134
- formant_features,
135
- jitter_features,
136
- f0_features
137
- ])
138
-
139
- return features if not (np.any(np.isnan(features)) or np.any(np.isinf(features))) else None
140
-
141
- except Exception as e:
142
- print(f"Feature extraction error: {str(e)}")
143
- return None
144
-
145
- # Helper methods for feature extraction
146
- @staticmethod
147
- def _get_f0_features(f0):
148
- f0_diff = np.diff(f0)
149
- return np.array([
150
- 0, # is_distorted=False
151
- float(np.mean(f0)),
152
- float(np.std(f0)),
153
- float(np.median(f0)),
154
- float(np.max(f0) - np.min(f0)),
155
- float(np.mean(np.abs(f0_diff)) / np.mean(f0)) if np.mean(f0) > 0 else 0.0
156
- ])
157
-
158
- @staticmethod
159
- def _get_default_f0_features():
160
- return np.array([1, 150.0, 20.0, 150.0, 100.0, 0.1]) # Default values
161
-
162
- @staticmethod
163
- def _compute_spectral_tilt(y, sr):
164
- S = np.abs(librosa.stft(y))
165
- return np.max(S[1:10]) - np.max(S[10:20])
166
-
167
- @staticmethod
168
- def _compute_cpp(y, sr):
169
- cepstrum = np.abs(np.fft.irfft(np.log(np.abs(np.fft.rfft(y)))))
170
- return np.max(cepstrum[10:60])
171
-
172
- @staticmethod
173
- def _compute_speaking_rate(y, sr):
174
- onset_env = librosa.onset.onset_strength(y=y, sr=sr)
175
- peaks = librosa.util.peak_pick(onset_env, 3, 3, 3, 3, 0.5, 10)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  return len(peaks) / (len(y) / sr)
 
1
+ import os
2
+ import numpy as np
3
+ import librosa
4
+ import noisereduce as nr
5
+ import parselmouth
6
+ from parselmouth.praat import call
7
+ import joblib
8
+ from typing import Dict, Optional
9
+
10
class SpeakerClassifier:
    """Predict a speaker's gender and age bracket from a short audio clip.

    Pre-trained stacked models are loaded once at construction time from
    joblib files expected in the working directory.
    """

    def __init__(self):
        """Load the pre-trained gender and age models (done once)."""
        self.gender_model = joblib.load("stacked_gender_model.joblib")
        self.age_model = joblib.load("stacked_age_model.joblib")

    def predict(self, audio_path: str) -> Dict[str, str]:
        """Predict gender and age from an audio file.

        Returns {'gender': 'male'|'female', 'age': '20s'|'50s'} on success,
        or {'error': ...} when feature extraction fails.
        """
        features = self._extract_features(audio_path)
        if features is None:
            return {"error": "Feature extraction failed"}

        gender_num = self.gender_model.predict([features])[0]
        age_num = self.age_model.predict([features])[0]

        # Label mapping used at training time: 0 -> male / 20s, 1 -> female / 50s.
        gender = "male" if gender_num == 0 else "female"
        age = "20s" if age_num == 0 else "50s"

        return {"gender": gender, "age": age}

    # --- Preprocessing helpers ---

    @staticmethod
    def _normalize_volume(audio, target_dBFS=-20):
        """Scale the waveform so its RMS level matches ``target_dBFS``.

        Silent input (zero RMS) is returned unchanged — the original code
        computed log10(0), producing an infinite gain and NaN output.
        """
        rms = np.sqrt(np.mean(audio**2))
        if rms == 0:
            return audio
        gain = 10**((target_dBFS - 20*np.log10(rms))/20)
        return audio * gain

    @staticmethod
    def _remove_silence(audio, top_db=20):
        """Concatenate only the non-silent intervals (>= peak - top_db dB)."""
        intervals = librosa.effects.split(audio, top_db=top_db)
        # Guard: np.concatenate([]) raises when the entire clip is silent.
        if len(intervals) == 0:
            return audio
        return np.concatenate([audio[start:end] for start, end in intervals])

    @staticmethod
    def _equalize_audio(audio, sr, bass_boost=2, treble_boost=1.5):
        """Boost bass (<250 Hz) and treble (>4 kHz) bins in the STFT domain."""
        S = librosa.stft(audio)
        freqs = librosa.fft_frequencies(sr=sr)
        S[freqs < 250] *= bass_boost
        S[freqs > 4000] *= treble_boost
        return librosa.istft(S)

    def _preprocess_audio(self, audio, sr, target_sr=16000):
        """Run the cleanup chain: de-silence -> denoise -> normalize -> EQ.

        ``sr`` is the sample rate of ``audio`` and is now used consistently;
        the original mixed ``sr`` and ``target_sr``, which only worked
        because callers always load audio at 16 kHz.
        """
        audio = self._remove_silence(audio)
        audio = nr.reduce_noise(y=audio, sr=sr)
        audio = self._normalize_volume(audio)
        audio = self._equalize_audio(audio, sr)
        return audio

    # --- Acoustic feature extractors ---

    def _extract_formants(self, y, sr):
        """Return mean/std/median/IQR of F1–F3 tracks (12 values) or None."""
        try:
            sound = parselmouth.Sound(y, sampling_frequency=sr)
            formant = sound.to_formant_burg(time_step=0.01)

            f1_list, f2_list, f3_list = [], [], []
            for t in np.arange(0, sound.duration, 0.01):
                try:
                    f1 = formant.get_value_at_time(1, t)
                    f2 = formant.get_value_at_time(2, t)
                    f3 = formant.get_value_at_time(3, t)
                    # Keep a frame only when all three formants are valid.
                    if all(v and not np.isnan(v) for v in [f1, f2, f3]):
                        f1_list.append(f1)
                        f2_list.append(f2)
                        f3_list.append(f3)
                except Exception:
                    continue

            def _stats(values):
                # mean, std, median, interquartile range (zeros when no frames)
                if not values:
                    return [0, 0, 0, 0]
                return [
                    np.mean(values),
                    np.std(values),
                    np.median(values),
                    np.percentile(values, 75) - np.percentile(values, 25),
                ]

            return np.array(_stats(f1_list) + _stats(f2_list) + _stats(f3_list))
        except Exception:
            return None

    def _calculate_jitter(self, y, sr):
        """Return HNR plus jitter/shimmer metrics from Praat, or None on failure."""
        try:
            sound = parselmouth.Sound(y, sampling_frequency=sr)
            pointProcess = call(sound, "To PointProcess (periodic, cc)", 75, 500)
            harmonicity = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)

            # NOTE(review): the first two metrics are intentionally repeated
            # below — presumably to match the training-time feature order.
            # Confirm against the trained models before de-duplicating.
            metrics = np.array([
                call(harmonicity, "Get mean", 0, 0),
                call(pointProcess, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3),
                call(harmonicity, "Get mean", 0, 0),
                call(pointProcess, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3),
                call(pointProcess, "Get jitter (local, absolute)", 0, 0, 0.0001, 0.02, 1.3),
                call(pointProcess, "Get jitter (rap)", 0, 0, 0.0001, 0.02, 1.3),
                call(pointProcess, "Get jitter (ppq5)", 0, 0, 0.0001, 0.02, 1.3),
                call(pointProcess, "Get jitter (ddp)", 0, 0, 0.0001, 0.02, 1.3),
                call([sound, pointProcess], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6),
                call([sound, pointProcess], "Get shimmer (local_dB)", 0, 0, 0.0001, 0.02, 1.3, 1.6),
                call([sound, pointProcess], "Get shimmer (apq3)", 0, 0, 0.0001, 0.02, 1.3, 1.6),
            ])
            return metrics
        except Exception:
            return None

    def _extract_features(self, audio_path: str) -> Optional[np.ndarray]:
        """Main feature extraction pipeline.

        Loads up to 7 s of audio at 16 kHz, preprocesses it, and concatenates
        spectral, MFCC, formant, jitter/shimmer and F0 features into one
        vector. Returns None when any stage fails or yields NaN/inf.
        """
        try:
            y, sr = librosa.load(audio_path, sr=16000, duration=7)
            y = self._preprocess_audio(y, sr)

            jitter_features = self._calculate_jitter(y, sr)
            formant_features = self._extract_formants(y, sr)
            # Explicit check — the original relied on np.concatenate raising
            # a TypeError on None, caught by the broad except below.
            if jitter_features is None or formant_features is None:
                print("Feature extraction error: jitter/formant extraction failed")
                return None

            # F0 features (voiced frames only).
            f0, _, _ = librosa.pyin(y, sr=sr, fmin=75, fmax=500, frame_length=1024)
            f0 = f0[~np.isnan(f0)]
            f0_features = self._get_f0_features(f0) if len(f0) > 0 else self._get_default_f0_features()

            # MFCC summary statistics (per-coefficient mean and std).
            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, n_fft=512, hop_length=256)
            mfcc_features = np.concatenate([np.mean(mfccs, axis=1), np.std(mfccs, axis=1)])

            # Scalar spectral/prosodic features.
            spectral_tilt = self._compute_spectral_tilt(y, sr)
            cpp = self._compute_cpp(y, sr)
            speaking_rate = self._compute_speaking_rate(y, sr)

            features = np.concatenate([
                [spectral_tilt, cpp, speaking_rate],
                mfcc_features,
                formant_features,
                jitter_features,
                f0_features
            ])

            # Reject vectors containing NaN/inf so the models never see them.
            return features if not (np.any(np.isnan(features)) or np.any(np.isinf(features))) else None

        except Exception as e:
            print(f"Feature extraction error: {str(e)}")
            return None

    # --- Small helper statistics ---

    @staticmethod
    def _get_f0_features(f0):
        """Summarize a voiced-F0 track: flag, mean, std, median, range, rel. jitter."""
        f0_diff = np.diff(f0)
        return np.array([
            0,  # is_distorted=False
            float(np.mean(f0)),
            float(np.std(f0)),
            float(np.median(f0)),
            float(np.max(f0) - np.min(f0)),
            float(np.mean(np.abs(f0_diff)) / np.mean(f0)) if np.mean(f0) > 0 else 0.0
        ])

    @staticmethod
    def _get_default_f0_features():
        """Fallback F0 vector used when no voiced frames were detected."""
        return np.array([1, 150.0, 20.0, 150.0, 100.0, 0.1])

    @staticmethod
    def _compute_spectral_tilt(y, sr):
        """Crude tilt: peak magnitude of low bins minus peak of higher bins."""
        S = np.abs(librosa.stft(y))
        return np.max(S[1:10]) - np.max(S[10:20])

    @staticmethod
    def _compute_cpp(y, sr):
        """Cepstral peak prominence proxy: max cepstrum value in bins 10–60.

        NOTE(review): log(|FFT|) yields -inf for exactly-zero bins; real
        recordings rarely hit this, but a floor may be needed — confirm.
        """
        cepstrum = np.abs(np.fft.irfft(np.log(np.abs(np.fft.rfft(y)))))
        return np.max(cepstrum[10:60])

    @staticmethod
    def _compute_speaking_rate(y, sr):
        """Onset peaks per second as a speaking-rate estimate."""
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        # Fix: peak_pick's parameters are keyword-only in librosa >= 0.10;
        # the original positional call raises TypeError there.
        peaks = librosa.util.peak_pick(
            onset_env, pre_max=3, post_max=3, pre_avg=3, post_avg=3,
            delta=0.5, wait=10,
        )
        return len(peaks) / (len(y) / sr)