Upload folder using huggingface_hub
Browse files- README.MD +10 -0
- app.py +26 -0
- external_infer.py +15 -0
- infer.py +176 -0
- requirements.txt +13 -0
- stacked_age_model.joblib +3 -0
- stacked_gender_model.joblib +3 -0
README.MD
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Speaker Gender/Age Recognition
|
| 2 |
+
|
| 3 |
+
Predicts gender (male/female) and age group (20s/50s) from audio.
|
| 4 |
+
|
| 5 |
+
## Usage
|
| 6 |
+
```python
|
| 7 |
+
from huggingface_hub import InferenceClient
|
| 8 |
+
|
| 9 |
+
client = InferenceClient("YOUR_USERNAME/speaker-recognition")
|
| 10 |
+
result = client.post(data=open("audio.wav", "rb").read())  # send raw bytes; a file handle is not JSON-serializable
|
app.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# app.py
"""FastAPI wrapper exposing SpeakerClassifier over an HTTP /predict endpoint."""
import os
import tempfile

from fastapi import FastAPI, UploadFile, File

# Fix: the classifier lives in infer.py — this repo has no inference.py,
# so the original `from inference import ...` failed at startup.
from infer import SpeakerClassifier

app = FastAPI()
# Load the (large) joblib models once at startup, not per request.
classifier = SpeakerClassifier()


@app.post("/predict")
async def predict_audio(file: UploadFile = File(...)):
    """Accept an uploaded audio file and return the classifier's prediction.

    Returns the dict produced by SpeakerClassifier.predict, e.g.
    {"gender": "male", "age": "20s"}, or {"error": ...} when feature
    extraction fails.
    """
    # Use a real temp file instead of "temp_<filename>" in the CWD: avoids
    # collisions between concurrent requests and path traversal via a
    # crafted client-supplied filename.
    suffix = os.path.splitext(os.path.basename(file.filename or "upload.wav"))[1]
    fd, temp_path = tempfile.mkstemp(suffix=suffix)
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(await file.read())
        return classifier.predict(temp_path)
    finally:
        # Clean up even when prediction raises (the original leaked the file).
        os.remove(temp_path)


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
|
external_infer.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Run infer.py in a child process and record how long it took."""
import subprocess
import sys
import time


def external_infer(path):
    """Invoke infer.py on *path* in a subprocess and write the elapsed
    wall-clock seconds to time.txt.

    Raises:
        subprocess.CalledProcessError: if infer.py exits non-zero (check=True).
    """
    # perf_counter is monotonic; time.time() can jump with clock adjustments.
    start = time.perf_counter()
    # sys.executable guarantees the same interpreter/venv as this process,
    # unlike a bare "python" whose resolution depends on PATH.
    subprocess.run([sys.executable, "infer.py", path], check=True)
    elapsed = time.perf_counter() - start

    with open("time.txt", "w") as f:
        f.write(str(elapsed))

    print("External inference completed successfully")


if __name__ == "__main__":
    external_infer(sys.argv[1])
|
infer.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SpeakerClassifier:
    """Predict speaker gender (male/female) and age group (20s/50s) from audio.

    Wraps two pre-trained stacked models (loaded from joblib files in the
    working directory) behind a single predict() call that runs a hand-crafted
    acoustic feature pipeline: preprocessing, formants, jitter/HNR, F0 stats,
    MFCCs and spectral measures.
    """

    def __init__(self):
        """Initialize models and ensure they're loaded once."""
        self.gender_model = joblib.load("stacked_gender_model.joblib")
        self.age_model = joblib.load("stacked_age_model.joblib")

    def predict(self, audio_path: str) -> Dict[str, str]:
        """
        Predict gender and age from an audio file.
        Returns: {'gender': 'male/female', 'age': '20s/50s'},
        or {'error': ...} when feature extraction fails.
        """
        features = self._extract_features(audio_path)
        if features is None:
            return {"error": "Feature extraction failed"}

        # The models expect a 2-D array: wrap the single feature vector.
        gender_num = self.gender_model.predict([features])[0]
        age_num = self.age_model.predict([features])[0]

        # Map numerical predictions to labels (0 -> male / 20s).
        gender = "male" if gender_num == 0 else "female"
        age = "20s" if age_num == 0 else "50s"

        return {"gender": gender, "age": age}

    # --- Feature extraction helpers ---
    @staticmethod
    def _normalize_volume(audio, target_dBFS=-20):
        """Scale *audio* so its RMS level sits at target_dBFS."""
        rms = np.sqrt(np.mean(audio**2))
        # Fix: silent input made log10(0) = -inf and produced a non-finite gain.
        if rms == 0:
            return audio
        gain = 10**((target_dBFS - 20*np.log10(rms))/20)
        return audio * gain

    @staticmethod
    def _remove_silence(audio, top_db=20):
        """Concatenate only the non-silent intervals of *audio*."""
        intervals = librosa.effects.split(audio, top_db=top_db)
        # Fix: an all-silent clip yields no intervals and np.concatenate([]) raises.
        if len(intervals) == 0:
            return audio
        return np.concatenate([audio[start:end] for start, end in intervals])

    @staticmethod
    def _equalize_audio(audio, sr, bass_boost=2, treble_boost=1.5):
        """Boost bass (<250 Hz) and treble (>4 kHz) bins in the STFT domain."""
        S = librosa.stft(audio)
        freqs = librosa.fft_frequencies(sr=sr)
        S[freqs < 250] *= bass_boost
        S[freqs > 4000] *= treble_boost
        return librosa.istft(S)

    def _preprocess_audio(self, audio, sr, target_sr=16000):
        """Silence removal -> noise reduction -> RMS normalization -> EQ.

        NOTE(review): *sr* is ignored and *target_sr* is used throughout, so
        callers must already have resampled to 16 kHz (librosa.load(sr=16000)
        in _extract_features does) — confirm before reusing with other rates.
        """
        audio = self._remove_silence(audio)
        audio = nr.reduce_noise(y=audio, sr=target_sr)
        audio = self._normalize_volume(audio)
        audio = self._equalize_audio(audio, target_sr)
        return audio

    def _extract_formants(self, y, sr):
        """Return summary statistics of formants F1-F3, or None on failure."""
        try:
            sound = parselmouth.Sound(y, sampling_frequency=sr)
            formant = sound.to_formant_burg(time_step=0.01)

            f1_list, f2_list, f3_list = [], [], []
            for t in np.arange(0, sound.duration, 0.01):
                try:
                    f1 = formant.get_value_at_time(1, t)
                    f2 = formant.get_value_at_time(2, t)
                    f3 = formant.get_value_at_time(3, t)
                    # Keep a frame only when all three formants are defined.
                    if all(v and not np.isnan(v) for v in [f1, f2, f3]):
                        f1_list.append(f1)
                        f2_list.append(f2)
                        f3_list.append(f3)
                except Exception:
                    continue

            features = [
                np.mean(f1_list) if f1_list else 0,
                np.std(f1_list) if f1_list else 0,
                np.median(f1_list) if f1_list else 0,
                (np.percentile(f1_list, 75) - np.percentile(f1_list, 25)) if f1_list else 0,
                # ... (include all your formant features)
            ]
            return np.array(features)
        except Exception:
            return None

    def _calculate_jitter(self, y, sr):
        """Return HNR and jitter metrics via Praat, or None on failure."""
        try:
            sound = parselmouth.Sound(y, sampling_frequency=sr)
            pointProcess = call(sound, "To PointProcess (periodic, cc)", 75, 500)
            harmonicity = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)

            metrics = np.array([
                call(harmonicity, "Get mean", 0, 0),
                call(pointProcess, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3),
                # ... (include all your jitter/shimmer metrics)
            ])
            return metrics
        except Exception:
            return None

    def _extract_features(self, audio_path: str) -> Optional[np.ndarray]:
        """Main feature extraction pipeline.

        Loads at most 7 s of audio at 16 kHz, preprocesses it, and returns the
        concatenated feature vector, or None when anything fails or the
        vector contains NaN/inf.
        """
        try:
            y, sr = librosa.load(audio_path, sr=16000, duration=7)
            y = self._preprocess_audio(y, sr)

            # Extract all feature types
            jitter_features = self._calculate_jitter(y, sr)
            formant_features = self._extract_formants(y, sr)

            # F0 features (voiced frames only)
            f0, _, _ = librosa.pyin(y, sr=sr, fmin=75, fmax=500, frame_length=1024)
            f0 = f0[~np.isnan(f0)]
            f0_features = self._get_f0_features(f0) if len(f0) > 0 else self._get_default_f0_features()

            # MFCCs: per-coefficient means and standard deviations
            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, n_fft=512, hop_length=256)
            mfcc_features = np.concatenate([np.mean(mfccs, axis=1), np.std(mfccs, axis=1)])

            # Spectral features
            spectral_tilt = self._compute_spectral_tilt(y, sr)
            cpp = self._compute_cpp(y, sr)
            speaking_rate = self._compute_speaking_rate(y, sr)

            # Combine all features. If the jitter/formant helpers returned
            # None, np.concatenate raises and the except below yields None.
            features = np.concatenate([
                [spectral_tilt, cpp, speaking_rate],
                mfcc_features,
                formant_features,
                jitter_features,
                f0_features
            ])

            # Reject any vector containing NaN/inf so the models never see one.
            return features if not (np.any(np.isnan(features)) or np.any(np.isinf(features))) else None

        except Exception as e:
            print(f"Feature extraction error: {str(e)}")
            return None

    # Helper methods for feature extraction
    @staticmethod
    def _get_f0_features(f0):
        """Summary statistics of a (non-empty) voiced-only F0 track."""
        f0_diff = np.diff(f0)
        return np.array([
            0,  # is_distorted=False
            float(np.mean(f0)),
            float(np.std(f0)),
            float(np.median(f0)),
            float(np.max(f0) - np.min(f0)),
            float(np.mean(np.abs(f0_diff)) / np.mean(f0)) if np.mean(f0) > 0 else 0.0
        ])

    @staticmethod
    def _get_default_f0_features():
        """Fallback F0 statistics used when no voiced frames were found."""
        return np.array([1, 150.0, 20.0, 150.0, 100.0, 0.1])  # Default values

    @staticmethod
    def _compute_spectral_tilt(y, sr):
        """Crude tilt proxy: peak of low STFT bins minus peak of the next band."""
        S = np.abs(librosa.stft(y))
        return np.max(S[1:10]) - np.max(S[10:20])

    @staticmethod
    def _compute_cpp(y, sr):
        """Cepstral-peak proxy: max cepstrum value over quefrency bins 10-60.

        NOTE(review): log(0) on an exactly-zero spectral bin yields -inf here;
        the NaN/inf filter in _extract_features catches the fallout.
        """
        cepstrum = np.abs(np.fft.irfft(np.log(np.abs(np.fft.rfft(y)))))
        return np.max(cepstrum[10:60])

    @staticmethod
    def _compute_speaking_rate(y, sr):
        """Onset peaks per second as a speaking-rate estimate."""
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        # Fix: librosa 0.10 (pinned in requirements.txt) made peak_pick's
        # parameters keyword-only; the old positional call raised TypeError,
        # which the broad except in _extract_features silently converted into
        # a failed prediction on every input.
        peaks = librosa.util.peak_pick(
            onset_env, pre_max=3, post_max=3, pre_avg=3, post_avg=3,
            delta=0.5, wait=10
        )
        return len(peaks) / (len(y) / sr)
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy==1.26.4
|
| 2 |
+
pandas==2.1.4
|
| 3 |
+
librosa==0.10.1
|
| 4 |
+
noisereduce==2.0.0
|
| 5 |
+
tqdm==4.66.1
|
| 6 |
+
joblib==1.3.2
|
| 7 |
+
soundfile==0.12.1
|
| 8 |
+
pydub==0.25.1
|
| 9 |
+
PyYAML==6.0.1
|
| 10 |
+
stopit==1.1.2
|
| 11 |
+
praat-parselmouth
|
| 12 |
+
scikit-learn==1.6.1
|
| 13 |
+
xgboost
|
stacked_age_model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a6ec42570c191f46973dfdeac070158acde9e227484d784372ed8c503c85dd03
|
| 3 |
+
size 171046812
|
stacked_gender_model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a4311ae6a0f789dbda9eb43030d97ea659acd4386fb30d5c07774e7fb5cbb031
|
| 3 |
+
size 81134594
|