AbosamraOnFire13 commited on
Commit
496a6ee
·
verified ·
1 Parent(s): d43a36b

Create infer.py

Browse files
Files changed (1) hide show
  1. infer.py +257 -0
infer.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import numpy as np
4
+ from tqdm import tqdm
5
+ import joblib
6
+ import librosa
7
+ import noisereduce as nr
8
+ import parselmouth
9
+ from parselmouth.praat import call
10
+ from concurrent.futures import ProcessPoolExecutor
11
+
12
def normalize_volume(audio, target_dBFS=-20):
    """Scale *audio* so its RMS level equals *target_dBFS* (dB full scale).

    Args:
        audio: 1-D float array of samples.
        target_dBFS: desired RMS level in dBFS (default -20).

    Returns:
        The gain-adjusted signal. A silent signal is returned unchanged,
        because an RMS of 0 would make ``log10(rms)`` -inf and the gain
        NaN/inf (the original crashed on this edge case).
    """
    rms = np.sqrt(np.mean(audio**2))
    if rms == 0:
        # Silent input: no finite gain can reach the target level.
        return audio
    gain = 10**((target_dBFS - 20*np.log10(rms))/20)
    return audio * gain
16
+
17
def remove_silence(audio, top_db=20):
    """Strip segments quieter than *top_db* dB below the signal peak.

    Args:
        audio: 1-D float waveform.
        top_db: silence threshold in dB below peak (default 20).

    Returns:
        The concatenation of the non-silent intervals. If librosa finds
        no non-silent interval at all, the input is returned unchanged —
        ``np.concatenate([])`` would otherwise raise a ValueError.
    """
    intervals = librosa.effects.split(audio, top_db=top_db)
    if len(intervals) == 0:
        # All-silent clip: best effort, keep the audio as-is.
        return audio
    return np.concatenate([audio[start:end] for start, end in intervals])
20
+
21
def equalize_audio(audio, sr, bass_boost=2, treble_boost=1.5):
    """Apply a crude two-band equalizer in the STFT domain.

    Bins below 250 Hz are multiplied by *bass_boost* and bins above
    4 kHz by *treble_boost*; everything in between passes through.

    Args:
        audio: 1-D float waveform.
        sr: sample rate of *audio* in Hz.
        bass_boost: gain factor for the low band (default 2).
        treble_boost: gain factor for the high band (default 1.5).

    Returns:
        The re-synthesized (inverse-STFT) waveform.
    """
    spectrum = librosa.stft(audio)
    bin_freqs = librosa.fft_frequencies(sr=sr)

    # Amplify low-frequency bins (< 250 Hz).
    spectrum[bin_freqs < 250] *= bass_boost
    # Amplify high-frequency bins (> 4 kHz).
    spectrum[bin_freqs > 4000] *= treble_boost

    return librosa.istft(spectrum)
35
+
36
def preprocess_audio(audio, sr, target_sr=16000):
    """Clean a raw waveform: trim silence, denoise, normalize level, EQ.

    Args:
        audio: 1-D float waveform sampled at *sr* Hz.
        sr: actual sample rate of *audio*.
        target_sr: kept for backward compatibility; the pipeline now
            uses the real sample rate *sr* for rate-dependent steps.

    Returns:
        The processed waveform.
    """
    # Remove silence
    audio = remove_silence(audio)

    # Reduce noise. Bug fix: the original passed target_sr here, which
    # is only correct when the caller happened to load audio at 16 kHz.
    audio = nr.reduce_noise(y=audio, sr=sr)

    # Normalize volume
    audio = normalize_volume(audio)

    # Equalize frequency response (same sr fix as above).
    audio = equalize_audio(audio, sr)

    return audio
51
+
52
def extract_formants(y, sr):
    """Extract formant statistics for F1-F3 using Praat via parselmouth.

    Returns a 12-element array: mean, std, median and IQR for each of
    F1, F2 and F3, sampled every 10 ms over the sound's duration.
    (The previous docstring advertised 20 features including F2/F1 and
    F3/F1 ratios, but no ratios were ever computed — the doc now
    matches the actual output.)

    Args:
        y: 1-D float waveform.
        sr: sample rate of *y* in Hz.

    Returns:
        np.ndarray of 12 floats, or None if formant analysis fails.
    """
    try:
        sound = parselmouth.Sound(y, sampling_frequency=sr)

        # Use Praat's Burg formant tracker, sampled every 10 ms.
        formant = sound.to_formant_burg(time_step=0.01)

        f1_list, f2_list, f3_list = [], [], []
        for t in np.arange(0, sound.duration, 0.01):
            try:
                f1 = formant.get_value_at_time(1, t)
                f2 = formant.get_value_at_time(2, t)
                f3 = formant.get_value_at_time(3, t)
                # Keep only frames where all three formants are defined.
                if f1 and f2 and f3 and not np.isnan(f1) and not np.isnan(f2) and not np.isnan(f3):
                    f1_list.append(f1)
                    f2_list.append(f2)
                    f3_list.append(f3)
            except Exception:
                continue

        # Aggregate mean/std/median/IQR per formant; zeros when no
        # voiced frame produced a complete formant triple.
        features = []
        for values in (f1_list, f2_list, f3_list):
            if values:
                features.extend([
                    np.mean(values),
                    np.std(values),
                    np.median(values),
                    np.percentile(values, 75) - np.percentile(values, 25),  # IQR
                ])
            else:
                features.extend([0, 0, 0, 0])
        return np.array(features)

    except Exception:
        return None
96
def calculate_jitter(y, sr, file_path):
    """Compute HNR plus jitter/shimmer voice-quality metrics with Praat.

    Args:
        y: 1-D float waveform.
        sr: sample rate of *y* in Hz.
        file_path: unused; kept for backward compatibility with callers.

    Returns:
        np.ndarray of 7 metrics in this order:
        [hnr, localJitter, localabsoluteJitter, rapJitter, ddpJitter,
        localShimmer, localdbShimmer], or None if Praat analysis fails.
    """
    try:
        sound = parselmouth.Sound(y, sampling_frequency=sr)
        harmonicity = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
        hnr = call(harmonicity, "Get mean", 0, 0)
        # Fix: the point process was computed twice in the original —
        # one call is sufficient (pitch floor 75 Hz, ceiling 500 Hz).
        pointProcess = call(sound, "To PointProcess (periodic, cc)", 75, 500)
        localJitter = call(pointProcess, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
        localabsoluteJitter = call(pointProcess, "Get jitter (local, absolute)", 0, 0, 0.0001, 0.02, 1.3)
        rapJitter = call(pointProcess, "Get jitter (rap)", 0, 0, 0.0001, 0.02, 1.3)
        ddpJitter = call(pointProcess, "Get jitter (ddp)", 0, 0, 0.0001, 0.02, 1.3)
        localShimmer = call([sound, pointProcess], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
        localdbShimmer = call([sound, pointProcess], "Get shimmer (local_dB)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

        metrics = np.array([
            hnr,                  # Harmonic-to-Noise Ratio (HNR) in dB
            localJitter,          # Local jitter (%)
            localabsoluteJitter,  # Local absolute jitter (seconds)
            rapJitter,            # RAP jitter (%)
            ddpJitter,            # DDP jitter (%)
            localShimmer,         # Local shimmer (%)
            localdbShimmer,       # Local shimmer (dB)
        ])
        return metrics
    except Exception:
        return None
122
+
123
def extract_features(file_path, n_mfcc=13, sr=16000, duration=7):
    """Build the full acoustic feature vector for one audio file.

    Concatenates, in order: spectral tilt, cepstral peak prominence,
    speaking rate, MFCC mean/std (2 * n_mfcc values), formant
    statistics, jitter/shimmer metrics, and pitch (F0) statistics.
    (The previous docstring claimed MFCC-only extraction with
    "fixed-length padding/trimming" — neither was accurate.)

    Args:
        file_path: path to a .wav/.mp3 file.
        n_mfcc: number of MFCC coefficients (default 13).
        sr: sample rate the audio is resampled to (default 16 kHz).
        duration: maximum seconds of audio to load (default 7).

    Returns:
        1-D np.ndarray of features, or None on any failure or if the
        resulting vector contains NaN/inf values.
    """
    try:
        # Load audio (resampled to `sr` Hz) and clean it up.
        y, sr = librosa.load(file_path, sr=sr, duration=duration)
        y = preprocess_audio(y, sr)

        jitter_features = calculate_jitter(y, sr, file_path)
        if jitter_features is None:
            # Praat analysis failed. Previously this surfaced as a
            # TypeError inside np.concatenate that the outer except
            # silently swallowed; the early return makes it explicit.
            return None

        # Pitch (F0) statistics; defaults describe "no pitch found".
        f0_mean = 150.0      # Neutral speech pitch
        f0_std = 20.0        # Moderate variability
        f0_median = 150.0
        f0_range = 100.0     # Max - min
        f0_norm_diff = 0.1   # Normalized mean abs frame-to-frame difference
        is_distorted = 1     # Flag: 1 when pyin finds no voiced frames

        f0, _, _ = librosa.pyin(y, sr=sr, fmin=75, fmax=500, frame_length=1024)
        f0 = f0[~np.isnan(f0)]

        if len(f0) > 0:
            is_distorted = 0
            f0_diff = np.diff(f0)
            f0_mean = float(np.mean(f0))
            f0_std = float(np.std(f0))
            f0_median = float(np.median(f0))
            f0_range = float(np.max(f0) - np.min(f0))
            f0_norm_diff = float(np.mean(np.abs(f0_diff)) / f0_mean) if f0_mean > 0 else 0.0

        # Feature array of plain scalars (avoids ragged-array issues).
        f0_features = np.array([
            float(is_distorted),
            float(f0_mean),
            float(f0_std),
            float(f0_median),
            float(f0_range),
            float(f0_norm_diff),
        ])

        formant_features = extract_formants(y, sr)
        if formant_features is None:
            # Same rationale as the jitter guard above.
            return None

        # MFCCs (shape: [n_mfcc, time_frames]), aggregated over time.
        mfccs = librosa.feature.mfcc(
            y=y, sr=sr, n_mfcc=n_mfcc,
            n_fft=512, hop_length=256
        )
        mfcc_features = np.concatenate([np.mean(mfccs, axis=1), np.std(mfccs, axis=1)])

        # Spectral tilt proxy (H1-H2): difference between the strongest
        # magnitudes in two fixed low-frequency STFT bin ranges.
        def compute_spectral_tilt(y, sr):
            S = np.abs(librosa.stft(y))
            h1 = np.max(S[1:10])   # First harmonic region (skip DC bin)
            h2 = np.max(S[10:20])  # Second harmonic region
            return h1 - h2
        spectral_tilt = compute_spectral_tilt(y, sr)

        # Cepstral Peak Prominence (CPP): peak of the real cepstrum in
        # a quefrency band covering typical speech F0.
        def compute_cpp(y, sr):
            cepstrum = np.abs(np.fft.irfft(np.log(np.abs(np.fft.rfft(y)))))
            return np.max(cepstrum[10:60])  # Peak in typical F0 range
        cpp = compute_cpp(y, sr)

        # Speaking rate: onset-strength peaks per second (syllable proxy).
        def compute_speaking_rate(y, sr):
            onset_env = librosa.onset.onset_strength(y=y, sr=sr)
            peaks = librosa.util.peak_pick(onset_env, pre_max=3, post_max=3, pre_avg=3, post_avg=3, delta=0.5, wait=10)
            return len(peaks) / (len(y) / sr)
        speaking_rate = compute_speaking_rate(y, sr)

        features = np.concatenate([
            [spectral_tilt, cpp, speaking_rate],
            mfcc_features,
            formant_features,
            jitter_features,
            f0_features,
        ])
        # Reject vectors the downstream models cannot consume.
        if np.any(np.isnan(features)) or np.any(np.isinf(features)):
            return None
        return features

    except Exception:
        return None
228
+
229
def process_file(file_path):
    """Run feature extraction on one audio file.

    Returns:
        A (file_path, features) tuple for .wav/.mp3 inputs, where
        features comes from extract_features (and may be None on
        extraction failure); None for any other file type.
    """
    # Guard clause: skip anything that is not a supported audio format.
    if not file_path.lower().endswith(('.wav', '.mp3')):
        return None
    return (file_path, extract_features(file_path))
234
+
235
def testing_pipeline(folder_path):
    """Predict a combined age/gender class for a single audio file.

    Args:
        folder_path: path to the audio file (parameter name kept for
            backward compatibility; a single file path is expected).

    Returns:
        The combined class label ``(age_prediction << 1) + gender_prediction``
        for the one input sample.
    """
    # Load models. Bug fix: the assignments were swapped — the age
    # model file was loaded into model_gender and vice versa.
    model_age = joblib.load("stacked_age_model.joblib")
    model_gender = joblib.load("stacked_gender_model.joblib")

    _, features = process_file(folder_path)
    # Build a single-row frame. Bug fix: `features` is a 1-D numpy
    # array, not a dict, so DataFrame.from_dict(..., orient='index')
    # was the wrong constructor for it.
    features_df = pd.DataFrame([features])
    X = features_df.dropna()

    # Predict with both models and pack into one label (age in the
    # high bit, gender in the low bit).
    y_pred_age = model_age.predict(X)
    y_pred_gender = model_gender.predict(X)
    y_pred_combined = (y_pred_age << 1) + y_pred_gender

    # Note: the original had an unreachable print statement after the
    # return ("Predictions written to predictions.txt") — removed, as
    # no file is ever written here.
    return y_pred_combined[0]
254
+
255
if __name__ == "__main__":
    # CLI entry point: the first argument is the audio file to classify.
    import sys

    _prediction = testing_pipeline(sys.argv[1])