Khubaib01 commited on
Commit
36e0dea
·
verified ·
1 Parent(s): f40ee09

auralis model & files

Browse files
api/routes.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import File, UploadFile, APIRouter
2
+ from audio.preprocessing import load_audio, extract_features
3
+ from model.ecapa import ECAPAENCODER
4
+ # from model.scorer import fatigue_score_0_to_100, prosody_score ## for prosody scoring
5
+ from model.scorer import fatigue_score_0_to_100
6
+ from fastapi.responses import JSONResponse
7
+ import numpy as np
8
+ from utils.logger import logger
9
+ from utils.file_utils import save_temp_audio
10
+ from core.config import LOW_PERCENTILE, HIGH_PERCENTILE, FATIGUE_AXIS, REF_C_H, MAX_DURATION_SEC
11
+ # from audio.feature_extractor import get_prosody_stats
12
+ from fastapi import HTTPException, status
13
+ from audio.validators import validate_audio_duration, validate_audio_file, AudioValidationError
14
+
15
+
16
# Calibration artifacts produced at training time (loaded once at import).
C_h = np.load(REF_C_H)                 # healthy-voice centroid (192-d)
fatigue_axis = np.load(FATIGUE_AXIS)   # unit vector healthy -> fatigued
low = float(np.load(LOW_PERCENTILE)["arr_0"])
high = float(np.load(HIGH_PERCENTILE)["arr_0"])

router = APIRouter()

# Heavy model: construct once per process, not per request.
encoder = ECAPAENCODER()


@router.post("/score")
async def score_voice(file: UploadFile = File(...)):
    """Score an uploaded voice recording for vocal fatigue.

    Returns {"fatigue_score": float in [0, 100]}.
    Raises HTTP 400 for invalid/malformed audio, HTTP 500 otherwise.
    """
    import os

    path = None
    try:
        path = save_temp_audio(file)
        validate_audio_file(
            file_path=path,
            original_filename=file.filename,
        )
        wav = load_audio(path)
        # BUG FIX: the original also called extract_features(wav) and
        # discarded the result — dead work on every request; removed.
        emb = encoder.encode(wav.squeeze())
        score = float(fatigue_score_0_to_100(emb, C_h, fatigue_axis, low, high))
        return {"fatigue_score": score}

    except AudioValidationError as e:
        logger.warning(str(e))
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(e),
        )
    except Exception:
        logger.exception("Unexpected server error")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Unexpected server error.",
        )
    finally:
        # BUG FIX: the temp file written by save_temp_audio was never
        # deleted, leaking one file per request.
        if path is not None:
            try:
                os.remove(path)
            except OSError:
                pass
audio/__init__.py ADDED
File without changes
audio/feature_extractor.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import numpy as np
2
+ # import parselmouth
3
+ # from core.config import SAMPLE_RATE
4
+
5
+ # def get_prosody_stats(waveforms, sr=SAMPLE_RATE):
6
+ # feats = {"pitch_mean" : [], "pitch_std" : [], "jitter" : [], "shimmer" : [], "hnr" : []}
7
+
8
+ # for wav in waveforms:
9
+ # snd = parselmouth.Sound(wav.numpy, sampling_frequency=sr)
10
+ # pitch = snd.to_pitch()
11
+
12
+ # feats["pitch_mean"].append(pitch.mean())
13
+ # feats["pitch_std"].append(pitch.stdev())
14
+ # feats["jitter"].append(snd.get_jitter_local())
15
+ # feats["shimmer"].append(snd.get_shimmer_local())
16
+ # feats["hnr"].append(snd.to_harmonicity().mean())
17
+
18
+ # thresholds = {}
19
+
20
+ # for k, v in feats.items():
21
+ # thresholds[k] = (np.percentile(v, 5), np.percentile(v, 95))
22
+ # return thresholds
audio/preprocessing.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio
3
+ import torch.nn.functional as F
4
+ from core.config import SAMPLE_RATE, DEVICE, N_MELS, TARGET_LEN
5
+ from pydub import AudioSegment
6
+ import numpy as np
7
+
8
+ mel_transform = torchaudio.transforms.MelSpectrogram(
9
+ sample_rate = SAMPLE_RATE,
10
+ n_fft = 400,
11
+ hop_length = 256,
12
+ n_mels = N_MELS
13
+ ).to(DEVICE)
14
+
15
+ amp_to_db = torchaudio.transforms.AmplitudeToDB().to(DEVICE)
16
+
17
+ # def load_audio(path: str) -> torch.Tensor:
18
+ # wav, sr = torchaudio.load(path)
19
+ # if sr != SAMPLE_RATE:
20
+ # wav = torchaudio.transforms.Resample(wav, sr, SAMPLE_RATE)
21
+ # if wav.shape[0] > 1:
22
+ # wav = wav.mean(dim = 0)
23
+ # return wav.to(DEVICE)
24
+
25
+ import torch
26
+ import torchaudio
27
+ import torch.nn.functional as F
28
+ import numpy as np
29
+ from pydub import AudioSegment
30
+
31
class AudioLoadError(Exception):
    """Raised when an uploaded file cannot be decoded into usable audio."""
    pass


def load_audio(path: str) -> torch.Tensor:
    """Decode *path* into a mono float waveform of exactly TARGET_LEN samples.

    Tries torchaudio first, then falls back to pydub for containers
    torchaudio cannot read. Raises AudioLoadError on decode failure,
    empty audio, or audio shorter than TARGET_LEN.
    """
    waveform = None
    sr = None

    # --- primary loader ---
    try:
        waveform, sr = torchaudio.load(path)
    except Exception:
        # --- fallback loader (pydub / ffmpeg) ---
        try:
            audio = AudioSegment.from_file(path)
            audio = audio.set_channels(1).set_frame_rate(SAMPLE_RATE)
            samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
            if samples.size == 0:
                raise AudioLoadError("Empty audio file")

            # BUG FIX: pydub yields raw integer PCM values while
            # torchaudio.load yields floats normalized to [-1, 1].
            # Normalize by the sample-width full-scale so both decode
            # paths produce the same amplitude range.
            samples /= float(1 << (8 * audio.sample_width - 1))

            waveform = torch.from_numpy(samples)
            sr = SAMPLE_RATE

        except Exception as e2:
            raise AudioLoadError(
                f"Failed to decode audio file: {str(e2)}"
            ) from e2

    # ---- sanity checks ----
    if waveform is None or waveform.numel() == 0:
        raise AudioLoadError("Loaded audio is empty")

    # mono: average channels down to [T]
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    # resample to the model's expected rate
    if sr != SAMPLE_RATE:
        waveform = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(waveform)

    # duration control: reject short clips, truncate long ones.
    if waveform.numel() < TARGET_LEN:
        raise AudioLoadError("Audio too short for analysis")

    # NOTE: the original also carried an F.pad branch here; it was
    # unreachable because shorter-than-TARGET_LEN audio raises above and
    # padding an exact-length tensor is a no-op. Slicing covers both cases.
    waveform = waveform[:TARGET_LEN]

    return waveform.float()
80
+
81
+
82
+
83
def waveform_to_mel(waveform: torch.Tensor):
    """Convert a 1-D waveform [T] into a log-mel tensor shaped [1, T, N_MELS].

    Uses the module-level mel_transform / amp_to_db transforms.
    """
    batched = waveform.unsqueeze(0)                 # [1, T]
    mel_db = amp_to_db(mel_transform(batched))      # [1, n_mels, frames]
    return mel_db.transpose(1, 2)                   # [1, frames, n_mels]
92
+
93
def pad_time_dim(mel):
    """Right-pad the time dimension (dim 1) so its length is a multiple of 8."""
    frames = mel.shape[1]
    remainder = frames % 8
    if remainder:
        # F.pad pads last dim first: (left, right, top, bottom) -> pad dim 1.
        mel = F.pad(mel, (0, 0, 0, 8 - remainder))
    return mel
99
+
100
+
101
def extract_features(wav: torch.Tensor) -> torch.Tensor:
    """Convert a waveform [T] into a log-mel feature tensor [B, T', N_MELS].

    T' is the frame count padded up to a multiple of 8.
    """
    mel = mel_transform(wav.unsqueeze(0))   # [1, n_mels, frames]
    mel = amp_to_db(mel)
    # BUG FIX: `mel.dim == 4` compared the bound method object to 4
    # (always False); it must be called.
    if mel.dim() == 4:
        mel = mel.squeeze(1)
    # BUG FIX: the transpose result was discarded, so the function
    # returned [B, N_MELS, T] instead of the documented [B, T, N_MELS].
    mel = mel.transpose(1, 2)               # [B, frames, n_mels]
    mel = pad_time_dim(mel)
    return mel
audio/validators.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import soundfile as sf
2
+ from core.config import MAX_DURATION_SEC, MIN_DURATION_SEC, ALLOWED_EXTENSIONS
3
+ from utils.logger import logger
4
+
5
+ class AudioValidationError(ValueError):
6
+ pass
7
+
8
+
9
def validate_audio_duration(filepath: str, max_duration: float = MAX_DURATION_SEC):
    """Return the duration (seconds) of the audio at *filepath*.

    Raises AudioValidationError if the file cannot be read or its duration
    falls outside [MIN_DURATION_SEC, max_duration].
    """
    try:
        info = sf.info(filepath)
    except RuntimeError as e:
        raise AudioValidationError("Invalid or corrupted audio file.") from e

    duration = info.frames / float(info.samplerate)

    if duration > max_duration or duration < MIN_DURATION_SEC:
        # BUG FIX: the original logged `original_filename`, which is not
        # defined in this function and raised NameError on the failure path.
        logger.warning(f"Unsupported file length received: {filepath}")
        raise AudioValidationError(
            f"Audio duration {duration:.2f}s invalid. "
            f"Allowed range: {MIN_DURATION_SEC:.2f}s – {max_duration:.2f}s."
        )

    return duration
25
+
26
def validate_audio_file(file_path: str, original_filename: str):
    """Reject uploads whose extension or duration is unsupported.

    Raises AudioValidationError (from here or from the duration check)
    when the upload is invalid; returns None otherwise.
    """
    # Extension is taken from the client-supplied name, lower-cased.
    extension = "." + original_filename.lower().rsplit(".", 1)[-1]

    if extension not in ALLOWED_EXTENSIONS:
        logger.warning(f"Unsupported file format received: {original_filename}")
        raise AudioValidationError(
            f"Unsupported file type {extension}. Allowed formats are: " + ", ".join(ALLOWED_EXTENSIONS)
        )

    validate_audio_duration(file_path)
core/__init__.py ADDED
File without changes
core/config.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import torch
3
+ import yaml
4
+
5
# Project root (one level above this package).
BASE_DIR = Path(__file__).resolve().parents[1]

# Model and calibration artifact locations.
MODEL_DIR = f"{BASE_DIR}/models/ecapa_supcon_model.pth"
REF_EMB = f"{BASE_DIR}/data/reference_embeddings_192-d.npy"
REF_C_H = f"{BASE_DIR}/data/centroid_healthy.npy"
FATIGUE_AXIS = f"{BASE_DIR}/data/fatigue_axis.npy"
LOW_PERCENTILE = f"{BASE_DIR}/data/low_percentile.npz"
HIGH_PERCENTILE = f"{BASE_DIR}/data/high_percentile.npz"
CONFIG_PATH = f"{BASE_DIR}/model/config.yaml"

# Audio / model constants.
SAMPLE_RATE = 16000
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TARGET_SEC = 5
N_MELS = 80
TARGET_LEN = SAMPLE_RATE * TARGET_SEC  # samples per analysed clip
MAX_DURATION_SEC = 10.0
MIN_DURATION_SEC = 5.0
ALLOWED_EXTENSIONS = {".wav", ".mp3", ".m4a"}

# Structured settings (prosody thresholds, audio limits, ...).
with open(CONFIG_PATH, "r") as f:
    CONFIG = yaml.safe_load(f)

# BUG FIX: removed the three debug print(...) calls that ran on every
# import of this module and polluted stdout in production.
data/centroid_healthy.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7290aa86b3a3c2d6dc739fc1a305f969d0ab442a81fd65e1bd0157032d0a4bac
3
+ size 896
data/fatigue_axis.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b68e5a5ded25edd95efc6ee0f4a45804c46023a994cc5fa616a3a89100a698f
3
+ size 896
data/high_percentile.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e24a1e11b2ea4e347d9452a96be376d8886ec59a2f3cc8b8de3d3102dd1115d1
3
+ size 211
data/low_percentile.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a2e8eb7c090098d787b2a72e9300c77c811815307b808ab05ab4d3bef6ad639
3
+ size 211
data/reference_embeddings_192-d.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60ecf6c5194ccf55d9729786aec24c02308a519097cf7fd408d6d6f0bccd9783
3
+ size 57448
main.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, APIRouter, Request
2
+ from api.routes import router
3
+ import time
4
+ import logging
5
+
6
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

app = FastAPI(title="Vocal fatigue scoring API")


# BUG FIX: Starlette only accepts middleware("http") (lowercase); the
# original @app.middleware("HTTP") fails Starlette's internal assertion
# at import time.
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Log method, path, status code and wall-clock duration per request."""
    start = time.time()
    response = await call_next(request)
    duration = time.time() - start

    logging.info(
        f"{request.method} {request.url.path} "
        f"status_code = {response.status_code} "
        f"time = {duration:.3f}"
    )
    return response


# Version the public API under /api/v1/voice.
api_v1 = APIRouter(prefix="/api/v1")
api_v1.include_router(router, prefix="/voice")

app.include_router(api_v1)


@app.get("/health")
def health():
    """Liveness probe for orchestrators / load balancers."""
    return {"status": "ok"}
34
+
model/__init__.py ADDED
File without changes
model/config.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ name: ecapa_fatigue
3
+ version: 1.0.0
4
+ encoder: ecapa_tdnn
5
+ embedding_dim: 192
6
+ sampling_rate: 16000
7
+ N_mels: 80
8
+ scoring:
9
+ method: calibrated_sigmoid
10
+ raw_low_percentile: 5
11
+ raw_high_percentile: 95
12
+ preprocessing:
13
+ mono: True
14
+ target_len: 48000
15
+ prosody_thresholds:
16
+ pitch_mean: [110, 180] # Hz
17
+ pitch_std: [5, 40] # Hz
18
+ jitter: [0.0, 0.5] # %
19
+ shimmer: [0.0, 1.0] # %
20
+ hnr: [10, 30] # dB
21
+ audio:
22
+ min_duration: 5.0 # seconds
23
+ max_duration: 10.0 # seconds
24
+ allow_trim: False
25
+ allowed_formats: ['.wav', '.m4a', '.mp3']
26
+ required_sampling_rate: 16000
model/ecapa.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio
3
+ from speechbrain.lobes.models.ECAPA_TDNN import ECAPA_TDNN
4
+ from core.config import MODEL_DIR, DEVICE, N_MELS
5
+ from audio.preprocessing import waveform_to_mel
6
+ import numpy as np
7
+
8
+
9
class ECAPAENCODER:
    """Wraps a pretrained ECAPA-TDNN that maps a waveform to a 192-d embedding."""

    def __init__(self):
        self.ecapa = ECAPA_TDNN(
            input_size=N_MELS,
            lin_neurons=192,
            channels=[512, 512, 512],
            kernel_sizes=[5, 3, 3],
            dilations=[1, 2, 3],
        ).to(DEVICE)

        # Load trained weights and freeze the network in eval mode.
        checkpoint = torch.load(MODEL_DIR, map_location=DEVICE)
        self.ecapa.load_state_dict(checkpoint['ecapa_state_dict'])
        self.ecapa.eval()

    @torch.no_grad()
    def encode(self, waveform):
        """
        waveform: Tensor [T] (a leading singleton channel dim is tolerated)
        returns: np.ndarray [192]
        """
        # ---- safety checks ----
        if waveform.dim() == 2 and waveform.shape[0] == 1:
            waveform = waveform.squeeze(0)
        if waveform.dim() != 1:
            raise ValueError(f"Expected waveform [T], got {waveform.shape}")

        waveform = waveform.float().to(DEVICE)

        mel = waveform_to_mel(waveform)  # [1, frames, n_mels]

        # BUG FIX: removed the debug print of the input shape that ran on
        # every request and wrote to stdout in production.

        emb = self.ecapa(mel)  # [1, 192]
        return emb.squeeze(0).cpu().numpy()
model/scorer.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from core.config import LOW_PERCENTILE, HIGH_PERCENTILE, FATIGUE_AXIS, REF_C_H
2
+ import numpy as np
3
+ from core.config import CONFIG
4
+
5
+
6
+ C_h = np.load(REF_C_H)
7
+ fatigue_axis = np.load(FATIGUE_AXIS)
8
+ low = float(np.load(LOW_PERCENTILE)["arr_0"])
9
+ high = float(np.load(HIGH_PERCENTILE)["arr_0"])
10
+
11
+ # def fatigue_score_0_to_100(emb: np.ndarray) -> float:
12
+ # raw = np.dot(emb - C_h, fatigue_axis)
13
+ # raw = np.clip(raw, low , high)
14
+ # return 100 * (raw - low) / (high - low)
15
+
16
def fatigue_score_0_to_100(embedding, C_h, fatigue_axis, raw_low, raw_high, method='sigmoid'):
    """
    Map an embedding to a continuous vocal-fatigue score in [0, 100].

    embedding: 192-d numpy array
    C_h: healthy centroid (192-d)
    fatigue_axis: unit vector from healthy -> fatigued (192-d)
    raw_low, raw_high: training percentile values along the fatigue axis
    method: 'linear', 'sigmoid', or 'smooth_linear'

    Returns: float in [0, 100]
    """
    # Project onto the fatigue axis, then normalize against the training
    # range; a small overshoot beyond [0, 1] is tolerated before clamping
    # so borderline inputs do not pin to the extremes.
    projection = np.dot(embedding - C_h, fatigue_axis)
    unit = (projection - raw_low) / (raw_high - raw_low)
    unit = np.clip(unit, -0.05, 1.05)

    if method == 'linear':
        # Plain linear scaling of the normalized projection.
        raw_score = unit * 100
    elif method == 'sigmoid':
        # Gentle logistic centered at the middle of the training range
        # (midpoint 0.5, slope scale 0.25).
        raw_score = 1 / (1 + np.exp(-(unit - 0.5) / 0.25)) * 100
    elif method == 'smooth_linear':
        # Linear spread with logistic saturation near 0 and 100.
        raw_score = 100 / (1 + np.exp(-(unit * 100 - 50) / 10))
    else:
        raise ValueError("method must be 'linear', 'sigmoid', or 'smooth_linear'")

    # Guarantee a bounded plain float for JSON serialization.
    return float(np.clip(raw_score, 0, 100))
55
+
56
+
57
+ # def prosody_score(prosody_feats):
58
+ # report = []
59
+ # thresholds = CONFIG['prosody_thresholds']
60
+
61
+ # for feat, val in prosody_feats.items():
62
+ # low, high = thresholds[feat]
63
+ # if val < low:
64
+ # report.append(f"{feat} is low → potential fatigue")
65
+ # elif val > high:
66
+ # report.append(f"{feat} is high → potential fatigue")
67
+
68
+ # score = len(report) # simple count, or map to 0-100 if needed
69
+ # return score, report
models/ecapa_supcon_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:145325909e3e53c13bbb351537117727f4caf34828aea9c2e55b1d0f7262bfc6
3
+ size 9208363
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.1.1
2
+ pytest
3
+ fastapi
4
+ torchaudio==2.1.1
5
+ speechbrain
6
+ numpy==1.26.4
7
+ pathlib
8
+ pydub
9
+ uvicorn
10
+ soundfile
11
+ python-multipart
utils/__init__.py ADDED
File without changes
utils/file_utils.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ from fastapi import UploadFile
3
+
4
def save_temp_audio(file: "UploadFile") -> str:
    """Write an uploaded file to a named temporary file and return its path.

    The caller is responsible for deleting the returned file when done.
    """
    import os

    # BUG FIX: the original took everything after the last "." as the
    # suffix, so a dot-less filename became its own bogus extension
    # (e.g. "voice" -> ".voice"). os.path.splitext yields "" in that
    # case and keeps real extensions like ".wav" intact.
    _, suffix = os.path.splitext(file.filename or "")
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_file.write(file.file.read())
    return tmp_file.name
utils/logger.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ logging.basicConfig(
4
+ level= logging.INFO,
5
+ format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
6
+ )
7
+
8
+ logger = logging.getLogger("vocal-fatigue-api")