Lior-0618 Claude Opus 4.6 committed on
Commit
e31fc14
·
1 Parent(s): 8e1d7bd

fix: always convert audio to WAV before forwarding to evoxtral API

Browse files

The external Modal API doesn't support WebM/OGG/M4A formats. Convert
all uploads to 16kHz mono WAV via ffmpeg before calling the API.
For transcribe-diarize, the converted WAV is reused for VAD segmentation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. model/voxtral-server/main.py +50 -15
model/voxtral-server/main.py CHANGED
@@ -74,14 +74,17 @@ async def health():
74
 
75
  # ─── External API ──────────────────────────────────────────────────────────────
76
 
77
- async def _call_evoxtral(contents: bytes, filename: str) -> dict:
78
- """Forward audio bytes to the external evoxtral API; return parsed JSON.
79
  Response: {"transcription": "...[laughs]...", "language": "en", "model": "..."}
 
80
  """
 
 
81
  async with httpx.AsyncClient(timeout=300) as client:
82
  r = await client.post(
83
  f"{EVOXTRAL_API}/transcribe",
84
- files={"file": (filename, contents)},
85
  )
86
  if not r.is_success:
87
  raise HTTPException(
@@ -279,7 +282,30 @@ async def transcribe(audio: UploadFile = File(...)):
279
  raise HTTPException(status_code=400, detail=f"Failed to read file: {e}")
280
  _validate_upload(contents)
281
 
282
- result = await _call_evoxtral(contents, filename)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  text = result.get("transcription", "")
284
  lang = result.get("language")
285
 
@@ -311,19 +337,15 @@ async def transcribe_diarize(audio: UploadFile = File(...)):
311
  if suffix not in (".wav", ".mp3", ".flac", ".ogg", ".m4a", ".webm"):
312
  suffix = ".wav"
313
 
314
- # ── Step 1: call external evoxtral API ──────────────────────────────────
315
- t0 = time.perf_counter()
316
- result = await _call_evoxtral(contents, filename)
317
- full_text = result.get("transcription", "")
318
- print(f"[voxtral] {req_id} evoxtral API done {(time.perf_counter()-t0)*1000:.0f}ms text_len={len(full_text)}")
319
-
320
- # ── Step 2: load audio for VAD segmentation ──────────────────────────────
321
  with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
322
  tmp.write(contents)
323
  tmp_path = tmp.name
 
324
  try:
325
  t0 = time.perf_counter()
326
- audio_array = _load_audio(tmp_path, TARGET_SR)
 
327
  print(f"[voxtral] {req_id} load_audio done shape={audio_array.shape} in {(time.perf_counter()-t0)*1000:.0f}ms")
328
  except Exception as e:
329
  raise HTTPException(status_code=400, detail=f"Cannot decode audio: {e}")
@@ -334,17 +356,30 @@ async def transcribe_diarize(audio: UploadFile = File(...)):
334
  except OSError:
335
  pass
336
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
  duration = round(len(audio_array) / TARGET_SR, 3)
338
 
339
- # ── Step 3: VAD sentence segmentation ───────────────────────────────────
340
  t0 = time.perf_counter()
341
  raw_segs, seg_method = _segments_from_vad(audio_array, TARGET_SR)
342
  print(f"[voxtral] {req_id} segmentation done {(time.perf_counter()-t0)*1000:.0f}ms segs={len(raw_segs)}")
343
 
344
- # ── Step 4: distribute text to segments ─────────────────────────────────
345
  segs_with_text = _distribute_text(full_text, raw_segs)
346
 
347
- # ── Step 5: parse emotion from expression tags ──────────────────────────
348
  segments = []
349
  for i, s in enumerate(segs_with_text):
350
  emo = _parse_emotion(s["text"])
 
74
 
75
  # ─── External API ──────────────────────────────────────────────────────────────
76
 
77
+ async def _call_evoxtral(wav_path: str) -> dict:
78
+ """Send a WAV file to the external evoxtral API; return parsed JSON.
79
  Response: {"transcription": "...[laughs]...", "language": "en", "model": "..."}
80
+ Always expects a local WAV file path (already converted/validated).
81
  """
82
+ with open(wav_path, "rb") as f:
83
+ wav_bytes = f.read()
84
  async with httpx.AsyncClient(timeout=300) as client:
85
  r = await client.post(
86
  f"{EVOXTRAL_API}/transcribe",
87
+ files={"file": ("audio.wav", wav_bytes, "audio/wav")},
88
  )
89
  if not r.is_success:
90
  raise HTTPException(
 
282
  raise HTTPException(status_code=400, detail=f"Failed to read file: {e}")
283
  _validate_upload(contents)
284
 
285
+ suffix = os.path.splitext(filename)[1].lower() or ".wav"
286
+ if suffix not in (".wav", ".mp3", ".flac", ".ogg", ".m4a", ".webm"):
287
+ suffix = ".wav"
288
+
289
+ # Save upload, convert to WAV for external API compatibility
290
+ with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
291
+ tmp.write(contents)
292
+ tmp_path = tmp.name
293
+ wav_path = None
294
+ try:
295
+ wav_path = _convert_to_wav_ffmpeg(tmp_path, TARGET_SR)
296
+ result = await _call_evoxtral(wav_path)
297
+ except HTTPException:
298
+ raise
299
+ except Exception as e:
300
+ raise HTTPException(status_code=400, detail=f"Cannot decode audio: {e}")
301
+ finally:
302
+ for p in (tmp_path, wav_path):
303
+ if p and os.path.exists(p):
304
+ try:
305
+ os.unlink(p)
306
+ except OSError:
307
+ pass
308
+
309
  text = result.get("transcription", "")
310
  lang = result.get("language")
311
 
 
337
  if suffix not in (".wav", ".mp3", ".flac", ".ogg", ".m4a", ".webm"):
338
  suffix = ".wav"
339
 
340
+ # Save upload and convert to WAV once β€” reused for both external API and VAD
 
 
 
 
 
 
341
  with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
342
  tmp.write(contents)
343
  tmp_path = tmp.name
344
+ wav_path = None
345
  try:
346
  t0 = time.perf_counter()
347
+ wav_path = _convert_to_wav_ffmpeg(tmp_path, TARGET_SR)
348
+ audio_array = _load_audio(wav_path, TARGET_SR)
349
  print(f"[voxtral] {req_id} load_audio done shape={audio_array.shape} in {(time.perf_counter()-t0)*1000:.0f}ms")
350
  except Exception as e:
351
  raise HTTPException(status_code=400, detail=f"Cannot decode audio: {e}")
 
356
  except OSError:
357
  pass
358
 
359
+ # ── Step 1: call external evoxtral API (send the converted WAV) ──────────
360
+ try:
361
+ t0 = time.perf_counter()
362
+ result = await _call_evoxtral(wav_path)
363
+ full_text = result.get("transcription", "")
364
+ print(f"[voxtral] {req_id} evoxtral API done {(time.perf_counter()-t0)*1000:.0f}ms text_len={len(full_text)}")
365
+ finally:
366
+ if wav_path and os.path.exists(wav_path):
367
+ try:
368
+ os.unlink(wav_path)
369
+ except OSError:
370
+ pass
371
+
372
  duration = round(len(audio_array) / TARGET_SR, 3)
373
 
374
+ # ── Step 2: VAD sentence segmentation ───────────────────────────────────
375
  t0 = time.perf_counter()
376
  raw_segs, seg_method = _segments_from_vad(audio_array, TARGET_SR)
377
  print(f"[voxtral] {req_id} segmentation done {(time.perf_counter()-t0)*1000:.0f}ms segs={len(raw_segs)}")
378
 
379
+ # ── Step 3: distribute text to segments ─────────────────────────────────
380
  segs_with_text = _distribute_text(full_text, raw_segs)
381
 
382
+ # ── Step 4: parse emotion from expression tags ──────────────────────────
383
  segments = []
384
  for i, s in enumerate(segs_with_text):
385
  emo = _parse_emotion(s["text"])