Spaces:
Running
Running
auralis model & files
Browse files- api/routes.py +55 -0
- audio/__init__.py +0 -0
- audio/feature_extractor.py +22 -0
- audio/preprocessing.py +109 -0
- audio/validators.py +36 -0
- core/__init__.py +0 -0
- core/config.py +29 -0
- data/centroid_healthy.npy +3 -0
- data/fatigue_axis.npy +3 -0
- data/high_percentile.npz +3 -0
- data/low_percentile.npz +3 -0
- data/reference_embeddings_192-d.npy +3 -0
- main.py +34 -0
- model/__init__.py +0 -0
- model/config.yaml +26 -0
- model/ecapa.py +50 -0
- model/scorer.py +69 -0
- models/ecapa_supcon_model.pth +3 -0
- requirements.txt +11 -0
- utils/__init__.py +0 -0
- utils/file_utils.py +8 -0
- utils/logger.py +8 -0
api/routes.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import File, UploadFile, APIRouter
from audio.preprocessing import load_audio, extract_features
from model.ecapa import ECAPAENCODER
# from model.scorer import fatigue_score_0_to_100, prosody_score ## for prosody scoring
from model.scorer import fatigue_score_0_to_100
from fastapi.responses import JSONResponse
import numpy as np
from utils.logger import logger
from utils.file_utils import save_temp_audio
from core.config import LOW_PERCENTILE, HIGH_PERCENTILE, FATIGUE_AXIS, REF_C_H, MAX_DURATION_SEC
# from audio.feature_extractor import get_prosody_stats
from fastapi import HTTPException, status
from audio.validators import validate_audio_duration, validate_audio_file, AudioValidationError


# Calibration artifacts loaded once at import time: healthy-voice centroid,
# fatigue direction vector, and the raw-score percentile range used for scaling.
C_h = np.load(REF_C_H)
fatigue_axis = np.load(FATIGUE_AXIS)
low = float(np.load(LOW_PERCENTILE)["arr_0"])
high = float(np.load(HIGH_PERCENTILE)["arr_0"])

router = APIRouter()

# Encoder is constructed once at import time (loads the model checkpoint).
encoder = ECAPAENCODER()

@router.post("/score")
async def score_voice(file: UploadFile = File(...)):
    """Score an uploaded voice recording for vocal fatigue.

    Returns ``{"fatigue_score": <float in [0, 100]>}``.
    Raises HTTP 400 for invalid audio (format/duration) and HTTP 500 for
    unexpected server errors.
    """
    import os

    path = None
    try:
        # Persist the upload to a temp file so the path-based loaders can read it.
        path = save_temp_audio(file)
        validate_audio_file(
            file_path=path,
            original_filename=file.filename
        )
        wav = load_audio(path)
        # prosody_features = get_prosody_stats(wav)
        # p_score, report = prosody_score(prosody_features)
        features = extract_features(wav)
        wav = wav.squeeze()
        emb = encoder.encode(wav)
        score = float(fatigue_score_0_to_100(emb, C_h, fatigue_axis, low, high))
        # return {"fatigue_score": score, "prosody_score": p_score, "prosody_report": report}
        return {"fatigue_score" : score}

    except AudioValidationError as e:
        logger.warning(str(e))
        raise HTTPException(
            status_code= status.HTTP_400_BAD_REQUEST,
            detail = str(e)
        )
    except Exception as e:
        logger.exception("Unexpected server error")
        raise HTTPException(
            status_code= status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail = "Unexpected server error."
        )
    finally:
        # BUG FIX: the temp file was never removed, so every request leaked a
        # file in the temp directory. Clean up regardless of outcome.
        if path is not None:
            try:
                os.remove(path)
            except OSError:
                pass
|
audio/__init__.py
ADDED
|
File without changes
|
audio/feature_extractor.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# import numpy as np
|
| 2 |
+
# import parselmouth
|
| 3 |
+
# from core.config import SAMPLE_RATE
|
| 4 |
+
|
| 5 |
+
# def get_prosody_stats(waveforms, sr=SAMPLE_RATE):
|
| 6 |
+
# feats = {"pitch_mean" : [], "pitch_std" : [], "jitter" : [], "shimmer" : [], "hnr" : []}
|
| 7 |
+
|
| 8 |
+
# for wav in waveforms:
|
| 9 |
+
# snd = parselmouth.Sound(wav.numpy, sampling_frequency=sr)
|
| 10 |
+
# pitch = snd.to_pitch()
|
| 11 |
+
|
| 12 |
+
# feats["pitch_mean"].append(pitch.mean())
|
| 13 |
+
# feats["pitch_std"].append(pitch.stdev())
|
| 14 |
+
# feats["jitter"].append(snd.get_jitter_local())
|
| 15 |
+
# feats["shimmer"].append(snd.get_shimmer_local())
|
| 16 |
+
# feats["hnr"].append(snd.to_harmonicity().mean())
|
| 17 |
+
|
| 18 |
+
# thresholds = {}
|
| 19 |
+
|
| 20 |
+
# for k, v in feats.items():
|
| 21 |
+
# thresholds[k] = (np.percentile(v, 5), np.percentile(v, 95))
|
| 22 |
+
# return thresholds
|
audio/preprocessing.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
import torchaudio
import torch.nn.functional as F
from core.config import SAMPLE_RATE, DEVICE, N_MELS, TARGET_LEN
from pydub import AudioSegment
import numpy as np

# Shared feature-extraction transforms, built once at import time and moved
# to the configured device. n_fft=400 at 16 kHz is a 25 ms analysis window;
# hop_length=256 gives a 16 ms frame step.
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate = SAMPLE_RATE,
    n_fft = 400,
    hop_length = 256,
    n_mels = N_MELS
).to(DEVICE)

# Converts power mel spectrograms to a log (dB) scale.
amp_to_db = torchaudio.transforms.AmplitudeToDB().to(DEVICE)
|
| 16 |
+
|
| 17 |
+
# def load_audio(path: str) -> torch.Tensor:
|
| 18 |
+
# wav, sr = torchaudio.load(path)
|
| 19 |
+
# if sr != SAMPLE_RATE:
|
| 20 |
+
# wav = torchaudio.transforms.Resample(wav, sr, SAMPLE_RATE)
|
| 21 |
+
# if wav.shape[0] > 1:
|
| 22 |
+
# wav = wav.mean(dim = 0)
|
| 23 |
+
# return wav.to(DEVICE)
|
| 24 |
+
|
| 25 |
+
import torch
|
| 26 |
+
import torchaudio
|
| 27 |
+
import torch.nn.functional as F
|
| 28 |
+
import numpy as np
|
| 29 |
+
from pydub import AudioSegment
|
| 30 |
+
|
| 31 |
+
class AudioLoadError(Exception):
    """Raised when an audio file cannot be decoded or is unusable."""
    pass

def load_audio(path: str) -> torch.Tensor:
    """Load an audio file as a mono float tensor of exactly TARGET_LEN samples.

    Tries torchaudio first and falls back to pydub (ffmpeg) for containers
    torchaudio cannot decode. The result is mono, resampled to SAMPLE_RATE,
    and trimmed/zero-padded to TARGET_LEN samples.

    Raises AudioLoadError for empty, undecodable, or too-short audio.
    """
    waveform = None
    sr = None

    # --- primary loader ---
    try:
        waveform, sr = torchaudio.load(path)
    except Exception:
        # --- fallback loader (pydub / ffmpeg) ---
        try:
            audio = AudioSegment.from_file(path)
            audio = audio.set_channels(1).set_frame_rate(SAMPLE_RATE)
            samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
            if samples.size == 0:
                raise AudioLoadError("Empty audio file")

            # BUG FIX: pydub yields raw integer PCM values (e.g. +/-32768 for
            # 16-bit audio) while torchaudio.load returns floats normalized to
            # [-1, 1]. Scale by the sample width so both paths produce
            # comparable amplitudes downstream.
            samples /= float(1 << (8 * audio.sample_width - 1))

            waveform = torch.from_numpy(samples)
            sr = SAMPLE_RATE

        except Exception as e2:
            raise AudioLoadError(
                f"Failed to decode audio file: {str(e2)}"
            ) from e2

    # ---- sanity checks ----
    if waveform is None or waveform.numel() == 0:
        raise AudioLoadError("Loaded audio is empty")

    # mono: average channels down to one
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    # resample to the pipeline rate
    if sr != SAMPLE_RATE:
        waveform = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(waveform)

    # duration control: reject clips shorter than the analysis window,
    # trim anything longer down to it
    if waveform.numel() < TARGET_LEN:
        raise AudioLoadError("Audio too short for analysis")

    if waveform.numel() > TARGET_LEN:
        waveform = waveform[:TARGET_LEN]
    else:
        waveform = F.pad(waveform, (0, TARGET_LEN - waveform.numel()))

    return waveform.float()
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def waveform_to_mel(waveform: torch.Tensor):
    """Convert a mono waveform [T] into log-mel features.

    waveform: [T]
    returns: [1, T, N_MELS]
    """
    spec = mel_transform(waveform.unsqueeze(0))   # [1, n_mels, frames]
    spec = amp_to_db(spec)
    return spec.transpose(1, 2)                   # [1, frames, n_mels]
|
| 92 |
+
|
| 93 |
+
def pad_time_dim(mel):
    """Zero-pad the time axis (dim 1) so its length is a multiple of 8."""
    remainder = mel.shape[1] % 8
    if remainder:
        mel = F.pad(mel, (0, 0, 0, 8 - remainder))
    return mel
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def extract_features(wav: torch.Tensor) -> torch.Tensor:
    """Compute log-mel features for the encoder.

    wav: [T] waveform tensor (a leading batch dim is added internally).
    returns: [B, T, N_MELS] log-mel features with the time axis padded to a
    multiple of 8.
    """
    mel = mel_transform(wav.unsqueeze(0))
    mel = amp_to_db(mel)
    # BUG FIX: `mel.dim == 4` compared the bound method object to 4 and was
    # always False; the method must be called.
    if mel.dim() == 4:
        mel = mel.squeeze(1)

    # BUG FIX: transpose() is not in-place — the previous code discarded the
    # result, so the tensor was never put in [B, T, N_MELS] layout.
    mel = mel.transpose(1, 2)  # [B, T, N_MELS]
    mel = pad_time_dim(mel)
    return mel
|
audio/validators.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import soundfile as sf
|
| 2 |
+
from core.config import MAX_DURATION_SEC, MIN_DURATION_SEC, ALLOWED_EXTENSIONS
|
| 3 |
+
from utils.logger import logger
|
| 4 |
+
|
| 5 |
+
class AudioValidationError(ValueError):
    """Raised when an uploaded audio file fails format or duration validation."""
    pass
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def validate_audio_duration(filepath: str, max_duration: float = MAX_DURATION_SEC):
    """Check that the audio at *filepath* has an allowed duration.

    Returns the duration in seconds.
    Raises AudioValidationError if the file cannot be read or its duration is
    outside [MIN_DURATION_SEC, max_duration].
    """
    try:
        info = sf.info(filepath)
    except RuntimeError:
        raise AudioValidationError("Invalid or corrupted audio file.")

    duration = info.frames / float(info.samplerate)

    if duration > max_duration or duration < MIN_DURATION_SEC:
        # BUG FIX: this previously referenced the undefined name
        # `original_filename`, so out-of-range audio raised NameError instead
        # of the intended validation error.
        logger.warning(f"Unsupported file length received: {filepath}")
        raise AudioValidationError(
            f"Audio duration {duration:.2f}s invalid. "
            f"Allowed range: {MIN_DURATION_SEC:.2f}s – {max_duration:.2f}s."
        )

    return duration
|
| 25 |
+
|
| 26 |
+
def validate_audio_file(file_path: str, original_filename: str):
    """Validate an upload's extension against ALLOWED_EXTENSIONS, then its duration."""
    ext = "." + original_filename.lower().rsplit(".", 1)[-1]

    if ext not in ALLOWED_EXTENSIONS:
        logger.warning(f"Unsupported file format received: {original_filename}")
        raise AudioValidationError(
            f"Unsupported file type {ext}. Allowed formats are: " + ", ".join(ALLOWED_EXTENSIONS)
        )

    validate_audio_duration(file_path)
|
core/__init__.py
ADDED
|
File without changes
|
core/config.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
import torch
import yaml

# Project root: the directory containing main.py (one level above core/).
BASE_DIR = Path(__file__).resolve().parents[1]

# Model checkpoint and calibration-artifact paths.
# NOTE(review): MODEL_DIR actually points at a .pth file, not a directory.
MODEL_DIR = f"{BASE_DIR}/models/ecapa_supcon_model.pth"
REF_EMB = f"{BASE_DIR}/data/reference_embeddings_192-d.npy"
REF_C_H = f"{BASE_DIR}/data/centroid_healthy.npy"
FATIGUE_AXIS = f"{BASE_DIR}/data/fatigue_axis.npy"
LOW_PERCENTILE = f"{BASE_DIR}/data/low_percentile.npz"
HIGH_PERCENTILE = f"{BASE_DIR}/data/high_percentile.npz"
CONFIG_PATH = f"{BASE_DIR}/model/config.yaml"

# Audio / feature constants shared across the pipeline.
SAMPLE_RATE = 16000
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TARGET_SEC = 5                              # analysis window length (seconds)
N_MELS = 80                                 # mel bands for the spectrogram frontend
TARGET_LEN = SAMPLE_RATE * TARGET_SEC       # samples per analysis window
MAX_DURATION_SEC = 10.0
MIN_DURATION_SEC = 5.0
ALLOWED_EXTENSIONS = {".wav", ".mp3", ".m4a"}

# Parsed model/config.yaml, exposed to the rest of the app at import time.
with open(CONFIG_PATH, "r") as f:
    CONFIG = yaml.safe_load(f)

# NOTE(review): import-time debug prints — consider replacing with logging.
print(f"Model directory is set to: {MODEL_DIR}")
print(f"base dir: {BASE_DIR}")
print(f"ref emb path: {REF_EMB}")
|
data/centroid_healthy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7290aa86b3a3c2d6dc739fc1a305f969d0ab442a81fd65e1bd0157032d0a4bac
|
| 3 |
+
size 896
|
data/fatigue_axis.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4b68e5a5ded25edd95efc6ee0f4a45804c46023a994cc5fa616a3a89100a698f
|
| 3 |
+
size 896
|
data/high_percentile.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e24a1e11b2ea4e347d9452a96be376d8886ec59a2f3cc8b8de3d3102dd1115d1
|
| 3 |
+
size 211
|
data/low_percentile.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5a2e8eb7c090098d787b2a72e9300c77c811815307b808ab05ab4d3bef6ad639
|
| 3 |
+
size 211
|
data/reference_embeddings_192-d.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60ecf6c5194ccf55d9729786aec24c02308a519097cf7fd408d6d6f0bccd9783
|
| 3 |
+
size 57448
|
main.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, APIRouter, Request
from api.routes import router
import time
import logging

logging.basicConfig(
    level = logging.INFO,
    format = "%(asctime)s - %(levelname)s - %(message)s"
)

app = FastAPI(title = "Vocal fatigue scoring API")

# BUG FIX: the middleware type must be the lowercase string "http" —
# Starlette asserts on it, so @app.middleware("HTTP") crashed at import time.
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Log method, path, status code and wall-clock duration for every request."""
    start = time.time()
    response = await call_next(request)
    duration = time.time() - start

    logging.info(
        f"{request.method} {request.url.path} "
        f"status_code = {response.status_code} "
        f"time = {duration:.3f}"
    )
    return response

# Versioned API: scoring endpoints live under /api/v1/voice/...
api_v1 = APIRouter(prefix="/api/v1")
api_v1.include_router(router, prefix="/voice")

app.include_router(api_v1)

@app.get("/health")
def health():
    """Liveness probe."""
    return {"status" : "ok"}
|
| 34 |
+
|
model/__init__.py
ADDED
|
File without changes
|
model/config.yaml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model:
|
| 2 |
+
name: ecapa_fatigue
|
| 3 |
+
version: 1.0.0
|
| 4 |
+
encoder: ecapa_tdnn
|
| 5 |
+
embedding_dim: 192
|
| 6 |
+
sampling_rate: 16000
|
| 7 |
+
N_mels: 80
|
| 8 |
+
scoring:
|
| 9 |
+
method: calibrated_sigmoid
|
| 10 |
+
raw_low_percentile: 5
|
| 11 |
+
raw_high_percentile: 95
|
| 12 |
+
preprocessing:
|
| 13 |
+
mono: True
|
| 14 |
+
target_len: 48000
|
| 15 |
+
prosody_thresholds:
|
| 16 |
+
pitch_mean: [110, 180] # Hz
|
| 17 |
+
pitch_std: [5, 40] # Hz
|
| 18 |
+
jitter: [0.0, 0.5] # %
|
| 19 |
+
shimmer: [0.0, 1.0] # %
|
| 20 |
+
hnr: [10, 30] # dB
|
| 21 |
+
audio:
|
| 22 |
+
min_duration: 5.0 # seconds
|
| 23 |
+
max_duration: 10.0 # seconds
|
| 24 |
+
allow_trim: False
|
| 25 |
+
allowed_formats: ['.wav', '.m4a', '.mp3']
|
| 26 |
+
required_sampling_rate: 16000
|
model/ecapa.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
import torchaudio
from speechbrain.lobes.models.ECAPA_TDNN import ECAPA_TDNN
from core.config import MODEL_DIR, DEVICE, N_MELS
from audio.preprocessing import waveform_to_mel
import numpy as np


class ECAPAENCODER:
    """Wraps a pretrained ECAPA-TDNN model behind a single encode() call."""

    def __init__(self):
        # Architecture must match the training-time configuration exactly so
        # the checkpoint's state dict loads cleanly.
        self.ecapa = ECAPA_TDNN(
            input_size = N_MELS,
            lin_neurons = 192,
            channels = [512, 512, 512],
            kernel_sizes = [5, 3, 3],
            dilations = [1, 2 , 3]
        ).to(DEVICE)

        checkpoint = torch.load(MODEL_DIR, map_location = DEVICE)
        self.ecapa.load_state_dict(checkpoint['ecapa_state_dict'])
        self.ecapa.eval()  # inference mode: disables dropout / BN updates

    @torch.no_grad()
    def encode(self, waveform):
        """
        waveform: Tensor [T]
        returns: np.ndarray [192]
        """
        # ---- safety checks: accept [1, T] but require effectively mono ----
        if waveform.dim() == 2 and waveform.shape[0] == 1:
            waveform = waveform.squeeze(0)
        if waveform.dim() != 1:
            raise ValueError(f"Expected waveform [T], got {waveform.shape}")

        waveform = waveform.float().to(DEVICE)

        mel = waveform_to_mel(waveform)  # [1, frames, n_mels]

        # BUG FIX: removed the leftover per-request debug print of the input
        # shape — it spammed stdout on every scoring call in production.

        emb = self.ecapa(mel)  # [1, 192]
        return emb.squeeze(0).cpu().numpy()
|
model/scorer.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from core.config import LOW_PERCENTILE, HIGH_PERCENTILE, FATIGUE_AXIS, REF_C_H
import numpy as np
from core.config import CONFIG


# Module-level calibration artifacts.
# NOTE(review): these duplicate the loads in api/routes.py and are not used
# by fatigue_score_0_to_100 (which takes them as parameters) — candidates for
# removal; verify no other importer relies on them first.
C_h = np.load(REF_C_H)
fatigue_axis = np.load(FATIGUE_AXIS)
low = float(np.load(LOW_PERCENTILE)["arr_0"])
high = float(np.load(HIGH_PERCENTILE)["arr_0"])
|
| 10 |
+
|
| 11 |
+
# def fatigue_score_0_to_100(emb: np.ndarray) -> float:
|
| 12 |
+
# raw = np.dot(emb - C_h, fatigue_axis)
|
| 13 |
+
# raw = np.clip(raw, low , high)
|
| 14 |
+
# return 100 * (raw - low) / (high - low)
|
| 15 |
+
|
| 16 |
+
def fatigue_score_0_to_100(embedding, C_h, fatigue_axis, raw_low, raw_high, method='sigmoid'):
    """
    Compute a continuous fatigue score (0-100) from an embedding.

    embedding: 192-d numpy array
    C_h: healthy centroid (192-d)
    fatigue_axis: unit vector from healthy -> fatigued (192-d)
    raw_low, raw_high: training percentile values along the fatigue axis
    method: 'linear', 'sigmoid', or 'smooth_linear'

    Returns: float [0, 100]
    """
    # Signed projection of the centered embedding onto the fatigue direction.
    projection = np.dot(embedding - C_h, fatigue_axis)

    # Map into [0, 1] relative to the training range, allowing a small
    # overshoot before clamping so out-of-range inputs saturate gently.
    unit = np.clip((projection - raw_low) / (raw_high - raw_low), -0.05, 1.05)

    if method == 'linear':
        # Simple linear scaling onto [0, 100].
        return float(np.clip(unit * 100, 0, 100))
    if method == 'sigmoid':
        # Gentle sigmoid centred at 0.5; the 0.25 scale controls the slope
        # (larger = smoother).
        squashed = 1 / (1 + np.exp(-(unit - 0.5) / 0.25))
        return float(np.clip(squashed * 100, 0, 100))
    if method == 'smooth_linear':
        # Linear spread with mild sigmoid saturation near 0 and 100.
        stretched = unit * 100
        return float(np.clip(100 / (1 + np.exp(-(stretched - 50) / 10)), 0, 100))
    raise ValueError("method must be 'linear', 'sigmoid', or 'smooth_linear'")
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# def prosody_score(prosody_feats):
|
| 58 |
+
# report = []
|
| 59 |
+
# thresholds = CONFIG['prosody_thresholds']
|
| 60 |
+
|
| 61 |
+
# for feat, val in prosody_feats.items():
|
| 62 |
+
# low, high = thresholds[feat]
|
| 63 |
+
# if val < low:
|
| 64 |
+
# report.append(f"{feat} is low → potential fatigue")
|
| 65 |
+
# elif val > high:
|
| 66 |
+
# report.append(f"{feat} is high → potential fatigue")
|
| 67 |
+
|
| 68 |
+
# score = len(report) # simple count, or map to 0-100 if needed
|
| 69 |
+
# return score, report
|
models/ecapa_supcon_model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:145325909e3e53c13bbb351537117727f4caf34828aea9c2e55b1d0f7262bfc6
|
| 3 |
+
size 9208363
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch==2.1.1
|
| 2 |
+
pytest
|
| 3 |
+
fastapi
|
| 4 |
+
torchaudio==2.1.1
|
| 5 |
+
speechbrain
|
| 6 |
+
numpy==1.26.4
|
| 7 |
+
pathlib
|
| 8 |
+
pydub
|
| 9 |
+
uvicorn
|
| 10 |
+
soundfile
|
| 11 |
+
python-multipart
|
utils/__init__.py
ADDED
|
File without changes
|
utils/file_utils.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tempfile
import shutil
from fastapi import UploadFile

def save_temp_audio(file: UploadFile) -> str:
    """Persist an uploaded file to a named temp file and return its path.

    The caller is responsible for deleting the file when done.
    """
    # Preserve the original extension so downstream decoders can sniff the
    # container format; fall back to ".tmp" for dot-less filenames.
    suffix = file.filename.rsplit(".", 1)[-1] if "." in file.filename else "tmp"
    with tempfile.NamedTemporaryFile(delete=False, suffix=f".{suffix}") as tmp_file:
        # BUG FIX: stream in chunks instead of reading the entire upload into
        # memory with file.file.read().
        shutil.copyfileobj(file.file, tmp_file)
    return tmp_file.name
|
utils/logger.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging

# Configure root logging at import time.
# NOTE(review): basicConfig is a no-op if the root logger was already
# configured (e.g. by main.py), so the effective format depends on import
# order — confirm which module is imported first.
logging.basicConfig(
    level= logging.INFO,
    format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

# Shared application logger used across api/, audio/ and model/ modules.
logger = logging.getLogger("vocal-fatigue-api")
|