EurekaPotato committed on
Commit
ea2573a
·
verified ·
1 Parent(s): 7d59fd0

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. handler.py +88 -42
handler.py CHANGED
@@ -44,22 +44,43 @@ class EmotionCNN:
44
  mel_spec_db = np.clip(mel_spec_db, -80, 0)
45
  mel_spec_norm = (mel_spec_db + 80) / 80
46
 
47
- from skimage.transform import resize
48
- mel_resized = resize(mel_spec_norm, (224, 224), mode="constant")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
- from matplotlib import cm
51
- colormap = cm.get_cmap("jet")
52
- rgb = colormap(mel_resized)[:, :, :3]
53
  return np.transpose(rgb, (2, 0, 1)).astype(np.float32)
54
 
55
  def extract_embedding(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
56
- spec_rgb = self.audio_to_spectrogram(audio, sr)
57
- tensor = torch.from_numpy(spec_rgb).unsqueeze(0)
58
- if self.device == "cuda":
59
- tensor = tensor.cuda()
60
- with torch.no_grad():
61
- emb = self.model(tensor)
62
- return emb.cpu().numpy().flatten()
 
 
 
 
63
 
64
 
65
  # ──────────────────────────────────────────────────────────────────────── #
@@ -223,7 +244,10 @@ class AudioFeatureExtractorEndpoint:
223
 
224
  from fastapi import FastAPI, File, UploadFile, Form
225
  from fastapi.middleware.cors import CORSMiddleware
 
 
226
  import base64
 
227
 
228
  app = FastAPI(title="Audio Feature Extraction API", version="1.0.0")
229
  app.add_middleware(
@@ -234,6 +258,22 @@ app.add_middleware(
234
 
235
  extractor = AudioFeatureExtractorEndpoint()
236
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
  @app.get("/")
239
  async def root():
@@ -252,57 +292,63 @@ async def health():
252
  @app.post("/extract-audio-features")
253
  async def extract_audio_features(audio: UploadFile = File(...), transcript: str = Form("")):
254
  """Extract all 17 voice features from uploaded audio file."""
255
- audio_bytes = await audio.read()
256
- y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
257
- features = extractor.extract_all(y, transcript)
258
- return features
 
 
 
 
 
259
 
260
 
261
  @app.post("/extract-audio-features-base64")
262
- async def extract_audio_features_base64(data: dict):
263
  """Extract features from base64-encoded audio (for Vercel serverless calls)."""
264
  import soundfile as sf
265
 
266
- audio_b64 = data.get("audio_base64", "")
267
- transcript = data.get("transcript", "")
268
 
269
  # Handle empty / missing audio β€” return default features
270
  if not audio_b64 or len(audio_b64) < 100:
271
- return {
272
- "v1_snr": 0.0,
273
- "v2_noise_traffic": 0.0, "v2_noise_office": 0.0,
274
- "v2_noise_crowd": 0.0, "v2_noise_wind": 0.0, "v2_noise_clean": 1.0,
275
- "v3_speech_rate": 0.0,
276
- "v4_pitch_mean": 0.0, "v5_pitch_std": 0.0,
277
- "v6_energy_mean": 0.0, "v7_energy_std": 0.0,
278
- "v8_pause_ratio": 0.0, "v9_avg_pause_dur": 0.0, "v10_mid_pause_cnt": 0,
279
- "v11_emotion_stress": 0.0, "v12_emotion_energy": 0.0, "v13_emotion_valence": 0.0,
280
- }
281
 
282
  try:
 
 
 
 
283
  audio_bytes = base64.b64decode(audio_b64)
284
- y, sr = sf.read(io.BytesIO(audio_bytes))
 
 
 
 
 
 
 
 
285
  if len(y.shape) > 1:
286
  y = np.mean(y, axis=1)
287
  if sr != 16000:
288
  y = librosa.resample(y, orig_sr=sr, target_sr=16000)
289
  y = y.astype(np.float32)
290
 
 
 
 
 
291
  features = extractor.extract_all(y, transcript)
 
292
  return features
293
  except Exception as e:
294
- # If audio decoding fails, return defaults rather than 500
295
- return {
296
- "v1_snr": 0.0,
297
- "v2_noise_traffic": 0.0, "v2_noise_office": 0.0,
298
- "v2_noise_crowd": 0.0, "v2_noise_wind": 0.0, "v2_noise_clean": 1.0,
299
- "v3_speech_rate": 0.0,
300
- "v4_pitch_mean": 0.0, "v5_pitch_std": 0.0,
301
- "v6_energy_mean": 0.0, "v7_energy_std": 0.0,
302
- "v8_pause_ratio": 0.0, "v9_avg_pause_dur": 0.0, "v10_mid_pause_cnt": 0,
303
- "v11_emotion_stress": 0.0, "v12_emotion_energy": 0.0, "v13_emotion_valence": 0.0,
304
- "_error": str(e),
305
- }
306
 
307
 
308
  if __name__ == "__main__":
 
44
  mel_spec_db = np.clip(mel_spec_db, -80, 0)
45
  mel_spec_norm = (mel_spec_db + 80) / 80
46
 
47
+ try:
48
+ from skimage.transform import resize
49
+ mel_resized = resize(mel_spec_norm, (224, 224), mode="constant")
50
+ except ImportError:
51
+ # Fallback: simple nearest-neighbor resize with numpy
52
+ mel_resized = np.array(
53
+ [np.interp(np.linspace(0, mel_spec_norm.shape[1]-1, 224),
54
+ np.arange(mel_spec_norm.shape[1]), row)
55
+ for row in np.interp(
56
+ np.linspace(0, mel_spec_norm.shape[0]-1, 224),
57
+ np.arange(mel_spec_norm.shape[0]),
58
+ np.arange(mel_spec_norm.shape[0])
59
+ ).astype(int).__iter__()]
60
+ ) if mel_spec_norm.size > 0 else np.zeros((224, 224))
61
+
62
+ try:
63
+ from matplotlib import cm
64
+ colormap = cm.get_cmap("jet")
65
+ rgb = colormap(mel_resized)[:, :, :3]
66
+ except (ImportError, Exception):
67
+ # Fallback: stack grayscale into 3 channels
68
+ rgb = np.stack([mel_resized] * 3, axis=-1)
69
 
 
 
 
70
  return np.transpose(rgb, (2, 0, 1)).astype(np.float32)
71
 
72
  def extract_embedding(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
73
+ try:
74
+ spec_rgb = self.audio_to_spectrogram(audio, sr)
75
+ tensor = torch.from_numpy(spec_rgb).unsqueeze(0)
76
+ if self.device == "cuda":
77
+ tensor = tensor.cuda()
78
+ with torch.no_grad():
79
+ emb = self.model(tensor)
80
+ return emb.cpu().numpy().flatten()
81
+ except Exception as e:
82
+ print(f"[WARN] EmotionCNN embedding extraction failed: {e}")
83
+ return np.zeros(576) # MobileNetV3-small output size
84
 
85
 
86
  # ──────────────────────────────────────────────────────────────────────── #
 
244
 
245
  from fastapi import FastAPI, File, UploadFile, Form
246
  from fastapi.middleware.cors import CORSMiddleware
247
+ from pydantic import BaseModel
248
+ from typing import Optional
249
  import base64
250
+ import traceback
251
 
252
  app = FastAPI(title="Audio Feature Extraction API", version="1.0.0")
253
  app.add_middleware(
 
258
 
259
  extractor = AudioFeatureExtractorEndpoint()
260
 
261
# Neutral fallback returned whenever audio is missing or cannot be decoded:
# every feature is zeroed except v2_noise_clean, which marks "clean audio".
DEFAULT_AUDIO_FEATURES = {
    "v1_snr": 0.0,
    "v2_noise_traffic": 0.0,
    "v2_noise_office": 0.0,
    "v2_noise_crowd": 0.0,
    "v2_noise_wind": 0.0,
    "v2_noise_clean": 1.0,
    "v3_speech_rate": 0.0,
    "v4_pitch_mean": 0.0,
    "v5_pitch_std": 0.0,
    "v6_energy_mean": 0.0,
    "v7_energy_std": 0.0,
    "v8_pause_ratio": 0.0,
    "v9_avg_pause_dur": 0.0,
    "v10_mid_pause_cnt": 0,
    "v11_emotion_stress": 0.0,
    "v12_emotion_energy": 0.0,
    "v13_emotion_valence": 0.0,
}
271
+
272
+
273
class AudioBase64Request(BaseModel):
    # JSON body for /extract-audio-features-base64.
    # audio_base64: base64-encoded audio bytes; may carry a data-URL prefix
    # ("data:audio/...;base64,"), which the endpoint strips before decoding.
    audio_base64: str = ""
    # Optional transcript forwarded to the feature extractor.
    transcript: str = ""
276
+
277
 
278
  @app.get("/")
279
  async def root():
 
292
@app.post("/extract-audio-features")
async def extract_audio_features(audio: UploadFile = File(...), transcript: str = Form("")):
    """Extract all 17 voice features from uploaded audio file."""
    try:
        # Decode the upload to a 16 kHz mono waveform and run the extractor.
        payload = await audio.read()
        waveform, _rate = librosa.load(io.BytesIO(payload), sr=16000, mono=True)
        return extractor.extract_all(waveform, transcript)
    except Exception as exc:
        # Any decode/extraction failure degrades to defaults instead of a 500.
        print(f"[ERROR] extract_audio_features: {exc}")
        traceback.print_exc()
        return {**DEFAULT_AUDIO_FEATURES, "_error": str(exc)}
304
 
305
 
306
@app.post("/extract-audio-features-base64")
async def extract_audio_features_base64(data: AudioBase64Request):
    """Extract features from base64-encoded audio (for Vercel serverless calls)."""
    import soundfile as sf

    b64_payload = data.audio_base64
    transcript = data.transcript

    # Handle empty / missing audio — return default features
    if not b64_payload or len(b64_payload) < 100:
        print("[INFO] Empty or too-short audio_base64, returning defaults")
        return {**DEFAULT_AUDIO_FEATURES}

    try:
        # Strip data URL prefix if present (e.g. "data:audio/wav;base64,...")
        if "," in b64_payload[:80]:
            b64_payload = b64_payload.split(",", 1)[1]

        raw = base64.b64decode(b64_payload)
        print(f"[INFO] Decoded {len(raw)} bytes of audio")

        # Try soundfile first, fall back to librosa
        try:
            wave, rate = sf.read(io.BytesIO(raw))
        except Exception as sf_err:
            print(f"[WARN] soundfile failed ({sf_err}), trying librosa...")
            wave, rate = librosa.load(io.BytesIO(raw), sr=16000, mono=True)

        # Collapse multi-channel audio to mono, then normalize the sample rate.
        if len(wave.shape) > 1:
            wave = np.mean(wave, axis=1)
        if rate != 16000:
            wave = librosa.resample(wave, orig_sr=rate, target_sr=16000)
        wave = wave.astype(np.float32)

        if len(wave) < 100:
            print("[WARN] Audio too short after decode, returning defaults")
            return {**DEFAULT_AUDIO_FEATURES}

        features = extractor.extract_all(wave, transcript)
        print(f"[OK] Extracted {len(features)} audio features")
        return features
    except Exception as e:
        print(f"[ERROR] extract_audio_features_base64: {e}")
        traceback.print_exc()
        # Return defaults rather than 500
        return {**DEFAULT_AUDIO_FEATURES, "_error": str(e)}
 
 
 
 
 
 
 
 
352
 
353
 
354
  if __name__ == "__main__":