E5K7 committed on
Commit
b972f63
·
1 Parent(s): de5487e

fix: Lightweight baseline extraction — only compute pitch/energy/tempo inline, skip full feature pipeline to prevent OOM on HF

Browse files
Files changed (1) hide show
  1. backend/routes/auth.py +24 -6
backend/routes/auth.py CHANGED
@@ -133,8 +133,8 @@ async def setup_baseline(
133
  db: Session = Depends(get_db)
134
  ):
135
  """
136
- Onboarding step: Takes a neutral reading audio clip, extracts features,
137
- and sets the baseline statistics for the user.
138
  """
139
  suffix = ".webm" if audio.content_type == "audio/webm" else ".wav"
140
  tmp_fd, tmp_path = tempfile.mkstemp(suffix=suffix)
@@ -146,11 +146,29 @@ async def setup_baseline(
146
 
147
  try:
148
  wav_path = convert_to_wav(tmp_path)
149
- features = extract_features(wav_path)
150
 
151
- current_user.baseline_pitch = features.get("pitch_mean", 0.0)
152
- current_user.baseline_energy = features.get("energy_raw", 0.0)
153
- current_user.baseline_speech_rate = features.get("speech_rate", 0.0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  except Exception as e:
155
  print(f"[Baseline] Audio processing failed, using defaults: {e}")
156
  current_user.baseline_pitch = 150.0
 
133
  db: Session = Depends(get_db)
134
  ):
135
  """
136
+ Onboarding step: Takes a neutral reading audio clip, extracts ONLY the 3
137
+ baseline metrics (pitch, energy, speech_rate) using lightweight librosa calls.
138
  """
139
  suffix = ".webm" if audio.content_type == "audio/webm" else ".wav"
140
  tmp_fd, tmp_path = tempfile.mkstemp(suffix=suffix)
 
146
 
147
  try:
148
  wav_path = convert_to_wav(tmp_path)
 
149
 
150
+ # Lightweight inline extraction — only 3 values needed
151
+ import librosa
152
+ import numpy as np
153
+ y, sr = librosa.load(wav_path, sr=16000, mono=True, duration=15)
154
+
155
+ # Pitch
156
+ f0, voiced, _ = librosa.pyin(y, fmin=80, fmax=600, sr=sr)
157
+ f0_clean = f0[voiced == 1] if voiced is not None else np.array([])
158
+ pitch = float(np.mean(f0_clean)) if len(f0_clean) > 0 else 150.0
159
+
160
+ # Energy
161
+ rms = librosa.feature.rms(y=y)[0]
162
+ energy = float(np.sqrt(np.mean(rms ** 2)))
163
+
164
+ # Speech rate (tempo proxy)
165
+ tempo_arr, _ = librosa.beat.beat_track(y=y, sr=sr)
166
+ speech_rate = float(tempo_arr) if np.isscalar(tempo_arr) else float(tempo_arr[0])
167
+
168
+ current_user.baseline_pitch = round(pitch, 2)
169
+ current_user.baseline_energy = round(energy, 6)
170
+ current_user.baseline_speech_rate = round(speech_rate, 2)
171
+
172
  except Exception as e:
173
  print(f"[Baseline] Audio processing failed, using defaults: {e}")
174
  current_user.baseline_pitch = 150.0