Spaces:

E5K7
/

InnerVoice

Running

App Files Community

E5K7 committed on Apr 11

Commit

b972f63

1 Parent(s): de5487e

fix: Lightweight baseline extraction — only compute pitch/energy/tempo inline, skip full feature pipeline to prevent OOM on HF

Browse files

Files changed (1) hide show

backend/routes/auth.py +24 -6

backend/routes/auth.py CHANGED Viewed

@@ -133,8 +133,8 @@ async def setup_baseline(
     db: Session = Depends(get_db)
 ):
     """
-    Onboarding step: Takes a neutral reading audio clip, extracts features,
-    and sets the baseline statistics for the user.
     """
     suffix = ".webm" if audio.content_type == "audio/webm" else ".wav"
     tmp_fd, tmp_path = tempfile.mkstemp(suffix=suffix)
@@ -146,11 +146,29 @@ async def setup_baseline(
         try:
             wav_path = convert_to_wav(tmp_path)
-            features = extract_features(wav_path)
-            current_user.baseline_pitch = features.get("pitch_mean", 0.0)
-            current_user.baseline_energy = features.get("energy_raw", 0.0)
-            current_user.baseline_speech_rate = features.get("speech_rate", 0.0)
         except Exception as e:
             print(f"[Baseline] Audio processing failed, using defaults: {e}")
             current_user.baseline_pitch = 150.0

     db: Session = Depends(get_db)
 ):
     """
+    Onboarding step: Takes a neutral reading audio clip, extracts ONLY the 3
+    baseline metrics (pitch, energy, speech_rate) using lightweight librosa calls.
     """
     suffix = ".webm" if audio.content_type == "audio/webm" else ".wav"
     tmp_fd, tmp_path = tempfile.mkstemp(suffix=suffix)
         try:
             wav_path = convert_to_wav(tmp_path)
+            # Lightweight inline extraction — only 3 values needed
+            import librosa
+            import numpy as np
+            y, sr = librosa.load(wav_path, sr=16000, mono=True, duration=15)
+            # Pitch
+            f0, voiced, _ = librosa.pyin(y, fmin=80, fmax=600, sr=sr)
+            f0_clean = f0[voiced == 1] if voiced is not None else np.array([])
+            pitch = float(np.mean(f0_clean)) if len(f0_clean) > 0 else 150.0
+            # Energy
+            rms = librosa.feature.rms(y=y)[0]
+            energy = float(np.sqrt(np.mean(rms ** 2)))
+            # Speech rate (tempo proxy)
+            tempo_arr, _ = librosa.beat.beat_track(y=y, sr=sr)
+            speech_rate = float(tempo_arr) if np.isscalar(tempo_arr) else float(tempo_arr[0])
+            current_user.baseline_pitch = round(pitch, 2)
+            current_user.baseline_energy = round(energy, 6)
+            current_user.baseline_speech_rate = round(speech_rate, 2)
         except Exception as e:
             print(f"[Baseline] Audio processing failed, using defaults: {e}")
             current_user.baseline_pitch = 150.0