AbosamraOnFire13 commited on
Commit
387b29c
·
verified ·
1 Parent(s): 845b964

Upload folder using huggingface_hub

Browse files
README.MD ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Speaker Gender/Age Recognition
2
+
3
+ Predicts gender (male/female) and age group (20s/50s) from audio.
4
+
## Usage

Start the API server (`python app.py`), then send an audio file to the
`/predict` endpoint:

```python
import requests

with open("audio.wav", "rb") as f:
    response = requests.post(
        "http://localhost:8000/predict",
        files={"file": f},
    )
print(response.json())  # e.g. {"gender": "male", "age": "20s"}
```
app.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# app.py
from fastapi import FastAPI, UploadFile, File
from inference import SpeakerClassifier
import os
import tempfile

app = FastAPI()
# Load the models once at startup so every request reuses them.
classifier = SpeakerClassifier()

@app.post("/predict")
async def predict_audio(file: UploadFile = File(...)):
    """Predict speaker gender and age group from an uploaded audio file.

    Returns the classifier's result dict (e.g. {"gender": ..., "age": ...},
    or {"error": ...} when feature extraction fails).
    """
    # mkstemp instead of f"temp_{file.filename}": the client-supplied
    # filename could contain path separators (path traversal), and two
    # concurrent uploads with the same name would clobber each other.
    suffix = os.path.splitext(file.filename or "")[1]
    fd, temp_path = tempfile.mkstemp(suffix=suffix)
    try:
        with os.fdopen(fd, "wb") as out:
            out.write(await file.read())

        # Predict
        result = classifier.predict(temp_path)
    finally:
        # Clean up even when prediction raises (the original leaked the
        # temp file on any exception).
        os.remove(temp_path)

    return result

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
external_infer.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import subprocess
import sys
import time

def external_infer(path):
    """Run infer.py on *path* in a fresh interpreter and record timing.

    Writes the elapsed wall-clock seconds to time.txt.
    Raises subprocess.CalledProcessError if infer.py exits non-zero
    (check=True), so the success message is only printed on success.
    """
    # perf_counter is monotonic -- unlike time.time it cannot jump if the
    # system clock is adjusted mid-run.
    start_time = time.perf_counter()
    # sys.executable guarantees the same interpreter as this process,
    # rather than whatever bare 'python' resolves to on PATH (which may
    # be missing or a different Python on some systems).
    subprocess.run([sys.executable, 'infer.py', path], check=True)
    elapsed_time = time.perf_counter() - start_time
    with open('time.txt', 'w') as f:
        f.write(str(elapsed_time))

    print("External inference completed successfully")

if __name__ == "__main__":
    external_infer(sys.argv[1])
infer.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import librosa
4
+ import noisereduce as nr
5
+ import parselmouth
6
+ from parselmouth.praat import call
7
+ import joblib
8
+ from typing import Dict, Optional
9
+
10
class SpeakerClassifier:
    """Predict speaker gender (male/female) and age group (20s/50s) from audio.

    Loads two pre-trained stacked models from the working directory and feeds
    them a concatenated feature vector (spectral measures, MFCC statistics,
    formants, jitter/HNR metrics, and F0 statistics).
    """

    def __init__(self):
        """Initialize models and ensure they're loaded once."""
        # Models are loaded from the current working directory; loading here
        # (not per prediction call) keeps predict() cheap.
        self.gender_model = joblib.load("stacked_gender_model.joblib")
        self.age_model = joblib.load("stacked_age_model.joblib")

    def predict(self, audio_path: str) -> Dict[str, str]:
        """
        Predict gender and age from an audio file.
        Returns: {'gender': 'male/female', 'age': '20s/50s'}
        On feature-extraction failure returns {'error': ...} instead.
        """
        features = self._extract_features(audio_path)
        if features is None:
            return {"error": "Feature extraction failed"}

        # Predict using your models
        # The models expect a 2-D array, hence the [features] wrapping.
        gender_num = self.gender_model.predict([features])[0]
        age_num = self.age_model.predict([features])[0]

        # Map numerical predictions to labels
        # (label encoding here: 0 -> male / 20s, anything else -> female / 50s)
        gender = "male" if gender_num == 0 else "female"
        age = "20s" if age_num == 0 else "50s"

        return {"gender": gender, "age": age}

    # --- Your Feature Extraction Functions (adapted) ---
    @staticmethod
    def _normalize_volume(audio, target_dBFS=-20):
        """Scale the signal so its RMS level matches target_dBFS."""
        rms = np.sqrt(np.mean(audio**2))
        # NOTE(review): an all-zero signal makes rms == 0, so log10 yields
        # -inf and the gain becomes NaN; the NaN is only rejected later by
        # the isnan check in _extract_features.
        gain = 10**((target_dBFS - 20*np.log10(rms))/20)
        return audio * gain

    @staticmethod
    def _remove_silence(audio, top_db=20):
        """Drop segments quieter than top_db below peak; keep the rest."""
        intervals = librosa.effects.split(audio, top_db=top_db)
        return np.concatenate([audio[start:end] for start, end in intervals])

    @staticmethod
    def _equalize_audio(audio, sr, bass_boost=2, treble_boost=1.5):
        """Boost low (<250 Hz) and high (>4 kHz) bands in the STFT domain."""
        S = librosa.stft(audio)
        freqs = librosa.fft_frequencies(sr=sr)
        # Boolean row masks select the frequency bins below/above the cutoffs.
        S[freqs < 250] *= bass_boost
        S[freqs > 4000] *= treble_boost
        return librosa.istft(S)

    def _preprocess_audio(self, audio, sr, target_sr=16000):
        """Pipeline: silence removal -> noise reduction -> volume norm -> EQ.

        NOTE(review): the `sr` argument is ignored; every step uses
        target_sr. The only caller loads audio with sr=16000, so the two
        coincide in practice -- confirm before reusing with other rates.
        """
        audio = self._remove_silence(audio)
        audio = nr.reduce_noise(y=audio, sr=target_sr)
        audio = self._normalize_volume(audio)
        audio = self._equalize_audio(audio, target_sr)
        return audio

    def _extract_formants(self, y, sr):
        """Return summary statistics of formants, or None on failure."""
        try:
            sound = parselmouth.Sound(y, sampling_frequency=sr)
            formant = sound.to_formant_burg(time_step=0.01)

            f1_list, f2_list, f3_list = [], [], []
            # Sample the formant tracks every 10 ms across the whole sound.
            for t in np.arange(0, sound.duration, 0.01):
                try:
                    f1 = formant.get_value_at_time(1, t)
                    f2 = formant.get_value_at_time(2, t)
                    f3 = formant.get_value_at_time(3, t)
                    # Keep a frame only when all three formants are defined.
                    if all(v and not np.isnan(v) for v in [f1, f2, f3]):
                        f1_list.append(f1)
                        f2_list.append(f2)
                        f3_list.append(f3)
                except Exception:
                    continue

            # NOTE(review): only F1 statistics are listed; f2_list/f3_list
            # are collected but never used, and the placeholder comment below
            # suggests the feature list was truncated. The resulting vector
            # length must match what the trained models expect -- verify.
            features = [
                np.mean(f1_list) if f1_list else 0,
                np.std(f1_list) if f1_list else 0,
                np.median(f1_list) if f1_list else 0,
                (np.percentile(f1_list, 75) - np.percentile(f1_list, 25)) if f1_list else 0,
                # ... (include all your formant features)
            ]
            return np.array(features)
        except Exception:
            return None

    def _calculate_jitter(self, y, sr):
        """Return voice-quality metrics (HNR mean, local jitter, ...) computed
        via Praat, or None on failure."""
        try:
            sound = parselmouth.Sound(y, sampling_frequency=sr)
            # 75-500 Hz pitch floor/ceiling covers typical adult speech.
            pointProcess = call(sound, "To PointProcess (periodic, cc)", 75, 500)
            harmonicity = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)

            metrics = np.array([
                call(harmonicity, "Get mean", 0, 0),
                call(pointProcess, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3),
                # ... (include all your jitter/shimmer metrics)
            ])
            return metrics
        except Exception:
            return None

    def _extract_features(self, audio_path: str) -> Optional[np.ndarray]:
        """Main feature extraction pipeline."""
        try:
            # Analyze at most the first 7 seconds, resampled to 16 kHz.
            y, sr = librosa.load(audio_path, sr=16000, duration=7)
            y = self._preprocess_audio(y, sr)

            # Extract all feature types
            jitter_features = self._calculate_jitter(y, sr)
            formant_features = self._extract_formants(y, sr)

            # F0 features
            f0, _, _ = librosa.pyin(y, sr=sr, fmin=75, fmax=500, frame_length=1024)
            f0 = f0[~np.isnan(f0)]  # keep voiced frames only
            f0_features = self._get_f0_features(f0) if len(f0) > 0 else self._get_default_f0_features()

            # MFCCs
            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, n_fft=512, hop_length=256)
            mfcc_features = np.concatenate([np.mean(mfccs, axis=1), np.std(mfccs, axis=1)])

            # Spectral features
            spectral_tilt = self._compute_spectral_tilt(y, sr)
            cpp = self._compute_cpp(y, sr)
            speaking_rate = self._compute_speaking_rate(y, sr)

            # Combine all features
            # NOTE(review): jitter_features / formant_features may be None
            # (their helpers return None on error), which makes this
            # concatenate raise; that exception is swallowed below and the
            # method returns None.
            features = np.concatenate([
                [spectral_tilt, cpp, speaking_rate],
                mfcc_features,
                formant_features,
                jitter_features,
                f0_features
            ])

            # Reject vectors containing NaN/inf instead of passing them on.
            return features if not (np.any(np.isnan(features)) or np.any(np.isinf(features))) else None

        except Exception as e:
            print(f"Feature extraction error: {str(e)}")
            return None

    # Helper methods for feature extraction
    @staticmethod
    def _get_f0_features(f0):
        """Summary statistics of the voiced F0 contour."""
        f0_diff = np.diff(f0)
        return np.array([
            0,  # is_distorted=False
            float(np.mean(f0)),
            float(np.std(f0)),
            float(np.median(f0)),
            float(np.max(f0) - np.min(f0)),
            # Normalized mean frame-to-frame F0 change (contour smoothness).
            float(np.mean(np.abs(f0_diff)) / np.mean(f0)) if np.mean(f0) > 0 else 0.0
        ])

    @staticmethod
    def _get_default_f0_features():
        """Fallback F0 vector when no voiced frames were found
        (is_distorted=1, then plausible mean/std/median/range/variability)."""
        return np.array([1, 150.0, 20.0, 150.0, 100.0, 0.1])  # Default values

    @staticmethod
    def _compute_spectral_tilt(y, sr):
        """Crude spectral tilt proxy: peak magnitude of low STFT bins (1-9)
        minus peak of the next band (10-19)."""
        S = np.abs(librosa.stft(y))
        return np.max(S[1:10]) - np.max(S[10:20])

    @staticmethod
    def _compute_cpp(y, sr):
        """Cepstral peak (proxy for cepstral peak prominence): max of the
        real cepstrum magnitude within quefrency bins 10-60."""
        cepstrum = np.abs(np.fft.irfft(np.log(np.abs(np.fft.rfft(y)))))
        return np.max(cepstrum[10:60])

    @staticmethod
    def _compute_speaking_rate(y, sr):
        """Onset peaks per second as a rough speaking-rate estimate."""
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        # NOTE(review): positional peak_pick arguments only work on older
        # librosa; the pinned librosa==0.10.1 makes these parameters
        # keyword-only, so this call may raise TypeError -- verify.
        peaks = librosa.util.peak_pick(onset_env, 3, 3, 3, 3, 0.5, 10)
        return len(peaks) / (len(y) / sr)
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy==1.26.4
2
+ pandas==2.1.4
3
+ librosa==0.10.1
4
+ noisereduce==2.0.0
5
+ tqdm==4.66.1
6
+ joblib==1.3.2
7
+ soundfile==0.12.1
8
+ pydub==0.25.1
9
+ PyYAML==6.0.1
10
+ stopit==1.1.2
11
+ praat-parselmouth
12
+ scikit-learn==1.6.1
13
+ xgboost
stacked_age_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ec42570c191f46973dfdeac070158acde9e227484d784372ed8c503c85dd03
3
+ size 171046812
stacked_gender_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4311ae6a0f789dbda9eb43030d97ea659acd4386fb30d5c07774e7fb5cbb031
3
+ size 81134594