Lior-0618 Claude Opus 4.6 committed on
Commit
8e1d7bd
·
1 Parent(s): f07c505

feat: switch to external evoxtral Modal API (no local model)

Browse files

- Replace local VoxtralForConditionalGeneration+PEFT inference with
HTTP calls to https://yongkang-zou1999--evoxtral-api-evoxtralmodel-web.modal.run
- Remove torch, transformers, peft, accelerate, mistral-common from requirements.txt
- Add httpx for async HTTP client
- Parse inline expression tags ([laughs], [sighs], etc.) from transcription
to derive emotion/valence/arousal per segment
- Remove model weight caching from Dockerfile (no local weights needed)
- Server startup is now instant (no model loading)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Dockerfile CHANGED
@@ -43,12 +43,6 @@ RUN cd demo && NEXT_PUBLIC_API_URL="" npm run build \
43
  COPY nginx.conf /etc/nginx/nginx.conf
44
  COPY supervisord.conf /etc/supervisor/conf.d/app.conf
45
 
46
- # ─── Model weight cache ───────────────────────────────────────────────────────
47
- # /data is persisted across Space restarts on HuggingFace Spaces
48
- RUN mkdir -p /data/models
49
- ENV TRANSFORMERS_CACHE=/data/models
50
- ENV HF_HOME=/data/models
51
-
52
  # HuggingFace Spaces public port
53
  EXPOSE 7860
54
 
 
43
  COPY nginx.conf /etc/nginx/nginx.conf
44
  COPY supervisord.conf /etc/supervisor/conf.d/app.conf
45
 
 
 
 
 
 
 
46
  # HuggingFace Spaces public port
47
  EXPOSE 7860
48
 
model/voxtral-server/main.py CHANGED
@@ -1,180 +1,99 @@
1
  """
2
- Voxtral speech-to-text API (offline transcription + speaker diarization) - Model layer.
3
- Model ID can be overridden with env VOXTRAL_MODEL_ID; default mistralai/Voxtral-Mini-4B-Realtime-2602
 
4
  """
5
  import os
 
6
  import shutil
7
  import subprocess
8
  import tempfile
9
  import time
10
  from contextlib import asynccontextmanager
11
- from typing import Optional
12
 
13
- import torch
14
- import numpy as np
15
  import librosa
16
- import soundfile as sf
17
- from fastapi import FastAPI, File, UploadFile, HTTPException, Query
18
  from fastapi.middleware.cors import CORSMiddleware
19
 
20
- REPO_ID = os.environ.get("VOXTRAL_MODEL_ID", "YongkangZOU/evoxtral-lora")
21
- BASE_MODEL_ID = "mistralai/Voxtral-Mini-3B-2507"
 
 
22
  MAX_UPLOAD_BYTES = int(os.environ.get("MAX_UPLOAD_MB", "100")) * 1024 * 1024
23
- HF_TOKEN = os.environ.get("HF_TOKEN") # optional: enables pyannote speaker diarization
24
-
25
- processor = None
26
- model = None
27
-
28
- # Optional: pyannote pipeline (loaded lazily on first diarize request if HF_TOKEN is set)
29
- _pyannote_pipeline = None
30
- _pyannote_loaded = False
31
- _pyannote_available = False
32
-
33
- try:
34
- from pyannote.audio import Pipeline as _PyannotePipeline
35
- _pyannote_available = True
36
- except ImportError:
37
- pass
38
 
39
 
40
  def _check_ffmpeg():
41
- """Check ffmpeg is available at startup; raise with clear message if not."""
42
  if shutil.which("ffmpeg") is None:
43
  raise RuntimeError(
44
- "ffmpeg not found. WebM (e.g. browser recording) requires ffmpeg to decode.\n"
45
  " macOS: brew install ffmpeg\n"
46
- " Ubuntu: sudo apt install ffmpeg\n"
47
- " Windows: https://ffmpeg.org/download.html\n"
48
- "Then restart this service."
49
- )
50
-
51
-
52
- def _get_pyannote_pipeline():
53
- """Lazy-load pyannote pipeline (requires HF_TOKEN and pyannote.audio installed)."""
54
- global _pyannote_pipeline, _pyannote_loaded
55
- if _pyannote_loaded:
56
- return _pyannote_pipeline
57
- _pyannote_loaded = True
58
- if not _pyannote_available or not HF_TOKEN:
59
- print("[voxtral] pyannote: not available (install pyannote.audio and set HF_TOKEN for real diarization; using VAD+MFCC fallback)")
60
- return None
61
- try:
62
- pipeline = _PyannotePipeline.from_pretrained(
63
- "pyannote/speaker-diarization-3.1",
64
- use_auth_token=HF_TOKEN,
65
  )
66
- if torch.cuda.is_available():
67
- pipeline = pipeline.to(torch.device("cuda"))
68
- elif torch.backends.mps.is_available():
69
- pipeline = pipeline.to(torch.device("mps"))
70
- _pyannote_pipeline = pipeline
71
- print("[voxtral] pyannote speaker-diarization-3.1 loaded")
72
- except Exception as e:
73
- print(f"[voxtral] pyannote load failed: {e} — using VAD+MFCC fallback")
74
- return _pyannote_pipeline
75
 
76
 
77
  @asynccontextmanager
78
  async def lifespan(app: FastAPI):
79
- """On startup: check deps and load model."""
80
- global processor, model
81
-
82
  _check_ffmpeg()
83
  print(f"[voxtral] ffmpeg: {shutil.which('ffmpeg')}")
84
-
85
- if torch.cuda.is_available():
86
- _device = torch.device("cuda")
87
- _dtype = torch.bfloat16
88
- elif torch.backends.mps.is_available():
89
- _device = torch.device("mps")
90
- _dtype = torch.float16 # MPS does not support bfloat16
91
- else:
92
- _device = torch.device("cpu")
93
- _dtype = torch.bfloat16 # halves memory vs float32 (8 GB vs 16 GB); supported on modern x86
94
- print(f"[voxtral] Device: {_device} dtype: {_dtype}")
95
-
96
- print(f"[voxtral] Loading base model: {BASE_MODEL_ID} ...")
97
- print(f"[voxtral] Applying LoRA adapter: {REPO_ID} ...")
98
  try:
99
- from transformers import VoxtralForConditionalGeneration, AutoProcessor
100
- from peft import PeftModel
101
-
102
- processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)
103
- base = VoxtralForConditionalGeneration.from_pretrained(
104
- BASE_MODEL_ID, torch_dtype=_dtype
105
- ).to(_device)
106
- model = PeftModel.from_pretrained(base, REPO_ID)
107
- model.eval()
108
- print(f"[voxtral] Model ready: {BASE_MODEL_ID} + LoRA {REPO_ID} on {_device}")
109
  except Exception as e:
110
- raise RuntimeError(
111
- f"Model load failed: {e}\n"
112
- "Ensure deps are installed: pip install -r requirements.txt\n"
113
- "And sufficient VRAM (recommended ≥16GB) or use CPU (slower)."
114
- ) from e
115
-
116
- # Warm-up: run one silent dummy inference to pre-compile MPS Metal shaders.
117
- print("[voxtral] Warming up (dummy inference)...")
118
- try:
119
- sr = getattr(getattr(processor, "feature_extractor", None), "sampling_rate", 16000)
120
- dummy = np.zeros(sr, dtype=np.float32) # 1 second of silence
121
- with torch.inference_mode():
122
- dummy_inputs = processor(dummy, return_tensors="pt")
123
- dummy_inputs = {
124
- k: (v.to(_device, dtype=_dtype) if v.is_floating_point() else v.to(_device))
125
- for k, v in dummy_inputs.items()
126
- }
127
- model.generate(**dummy_inputs, max_new_tokens=1)
128
- print("[voxtral] Warm-up complete — first request will be fast")
129
- except Exception as e:
130
- print(f"[voxtral] Warm-up skipped: {e}")
131
-
132
  yield
133
 
134
 
135
- app = FastAPI(title="Voxtral Speech-to-Text (Model)", lifespan=lifespan)
136
 
137
  app.add_middleware(
138
  CORSMiddleware,
139
- allow_origins=[
140
- "http://localhost:3000",
141
- "http://127.0.0.1:3000",
142
- ],
143
  allow_methods=["GET", "POST", "OPTIONS"],
144
  allow_headers=["*"],
145
  )
146
 
147
 
148
- @app.get("/debug-inference")
149
- async def debug_inference():
150
- """Run a 1-second silent inference and return full result or traceback."""
151
- import traceback as tb
152
- try:
153
- dummy = np.zeros(16000, dtype=np.float32)
154
- text = _transcribe(dummy)
155
- return {"status": "ok", "text": text}
156
- except Exception as e:
157
- return {"status": "error", "error": str(e), "traceback": tb.format_exc()}
158
-
159
-
160
  @app.get("/health")
161
  async def health():
162
- """Health check: service and dependency status."""
163
  return {
164
  "status": "ok",
165
- "model": REPO_ID,
166
- "model_loaded": model is not None,
167
  "ffmpeg": shutil.which("ffmpeg") is not None,
168
- "pyannote_available": _pyannote_available,
169
- "hf_token_set": bool(HF_TOKEN),
170
  "max_upload_mb": MAX_UPLOAD_BYTES // 1024 // 1024,
 
171
  }
172
 
173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  # ─── Audio helpers ─────────────────────────────────────────────────────────────
175
 
176
  def _convert_to_wav_ffmpeg(path: str, target_sr: int) -> str:
177
- """Convert any format to 16kHz mono WAV with ffmpeg; return path to new file."""
178
  out = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
179
  out.close()
180
  rc = subprocess.run(
@@ -188,14 +107,11 @@ def _convert_to_wav_ffmpeg(path: str, target_sr: int) -> str:
188
  )
189
  if rc.returncode != 0:
190
  os.unlink(out.name)
191
- raise RuntimeError(
192
- f"ffmpeg failed: {rc.stderr.decode(errors='replace')[:500]}"
193
- )
194
  return out.name
195
 
196
 
197
- def load_audio_to_array(file_path: str, target_sr: int) -> np.ndarray:
198
- """Load audio to mono float32 and resample to target_sr."""
199
  lower = file_path.lower()
200
  if lower.endswith((".webm", ".opus", ".m4a", ".ogg")):
201
  wav_path = _convert_to_wav_ffmpeg(file_path, target_sr)
@@ -205,16 +121,13 @@ def load_audio_to_array(file_path: str, target_sr: int) -> np.ndarray:
205
  finally:
206
  if os.path.exists(wav_path):
207
  os.unlink(wav_path)
208
-
209
  try:
210
  y, _ = librosa.load(file_path, sr=target_sr, mono=True)
211
  return y.astype(np.float32)
212
  except Exception as e:
213
- if not os.path.isfile(file_path):
214
- raise
215
  need_ffmpeg = (
216
  "format not recognised" in str(e).lower()
217
- or "nobackenderror" in str(type(e).__name__).lower()
218
  )
219
  if need_ffmpeg:
220
  wav_path = _convert_to_wav_ffmpeg(file_path, target_sr)
@@ -228,9 +141,11 @@ def load_audio_to_array(file_path: str, target_sr: int) -> np.ndarray:
228
 
229
 
230
  def _validate_upload(contents: bytes) -> None:
231
- """Validate upload: non-empty and within size limit."""
232
  if len(contents) == 0:
233
- raise HTTPException(status_code=400, detail="Audio file is empty; record at least 1–2 seconds or choose a valid file")
 
 
 
234
  if len(contents) > MAX_UPLOAD_BYTES:
235
  mb = len(contents) / 1024 / 1024
236
  limit_mb = MAX_UPLOAD_BYTES // 1024 // 1024
@@ -243,29 +158,20 @@ def _validate_upload(contents: bytes) -> None:
243
  # ─── Segmentation helpers ──────────────────────────────────────────────────────
244
 
245
  def _vad_segment(audio: np.ndarray, sr: int) -> list[tuple[int, int]]:
246
- """Split audio into speech segments by silence detection.
247
- Merges gaps < 0.5 s (intra-phrase pauses) and drops segments < 0.3 s.
248
- Returns list of (start_sample, end_sample).
249
- """
250
  intervals = librosa.effects.split(audio, top_db=28, frame_length=2048, hop_length=512)
251
  if len(intervals) == 0:
252
  return [(0, len(audio))]
253
-
254
  merged: list[list[int]] = [[int(intervals[0][0]), int(intervals[0][1])]]
255
  for s, e in intervals[1:]:
256
  if (int(s) - merged[-1][1]) / sr < 0.3:
257
  merged[-1][1] = int(e)
258
  else:
259
  merged.append([int(s), int(e)])
260
-
261
  result = [(s, e) for s, e in merged if (e - s) / sr >= 0.3]
262
  return result if result else [(0, len(audio))]
263
 
264
 
265
  def _segments_from_vad(audio: np.ndarray, sr: int) -> tuple[list[dict], str]:
266
- """Segment audio by silence, assign all segments to SPEAKER_00.
267
- Returns (segments, method_name).
268
- """
269
  intervals = _vad_segment(audio, sr)
270
  segs = [
271
  {"speaker": "SPEAKER_00", "start": round(s / sr, 3), "end": round(e / sr, 3)}
@@ -276,25 +182,17 @@ def _segments_from_vad(audio: np.ndarray, sr: int) -> tuple[list[dict], str]:
276
 
277
 
278
  def _split_sentences(text: str) -> list[str]:
279
- """Split text into sentences at punctuation boundaries (CJK + Latin)."""
280
- import re
281
  parts = re.split(r'(?<=[？！。?!])\s*', text)
282
  return [p for p in parts if p.strip()]
283
 
284
 
285
  def _distribute_text(full_text: str, segs: list[dict]) -> list[dict]:
286
- """Assign complete sentences to segments by time proportion.
287
- Sentences are never split mid-punctuation; each segment gets whole sentences.
288
- Falls back to character-level splitting if no sentence boundaries found.
289
- """
290
  if not full_text or not segs:
291
  return [{**s, "text": ""} for s in segs]
292
-
293
  if len(segs) == 1:
294
  return [{**segs[0], "text": full_text}]
295
 
296
  sentences = _split_sentences(full_text)
297
- # Fallback: split by character if no sentence boundaries
298
  if len(sentences) <= 1:
299
  is_cjk = len(full_text.split()) <= 1
300
  sentences = list(full_text) if is_cjk else full_text.split()
@@ -305,206 +203,97 @@ def _distribute_text(full_text: str, segs: list[dict]) -> list[dict]:
305
 
306
  is_cjk = len(full_text.split()) <= 1 and len(full_text) > 1
307
  sep = "" if is_cjk else " "
308
-
309
- # Assign each sentence to the segment whose cumulative time covers its proportional position
310
  n = len(sentences)
311
  result_texts: list[list[str]] = [[] for _ in segs]
312
-
313
  cumulative = 0.0
314
  for i, seg in enumerate(segs):
315
  cumulative += (seg["end"] - seg["start"]) / total_dur
316
- # Assign sentences whose proportional position falls within this segment's cumulative range
317
  threshold = cumulative * n
318
  while len(result_texts[i]) + sum(len(t) for t in result_texts[:i]) < round(threshold):
319
  idx = sum(len(t) for t in result_texts)
320
  if idx >= n:
321
  break
322
  result_texts[i].append(sentences[idx])
323
-
324
- # Ensure any leftover sentences go to the last segment
325
  assigned = sum(len(t) for t in result_texts)
326
  result_texts[-1].extend(sentences[assigned:])
327
-
328
  return [{**seg, "text": sep.join(texts)} for seg, texts in zip(segs, result_texts)]
329
 
330
 
331
- # ─── Emotion analysis ──────────────────────────────────────────────────────────
332
-
333
- def _emotion_label(valence: float, arousal: float) -> str:
334
- """Map continuous valence/arousal to a discrete emotion label."""
335
- if arousal > 0.3:
336
- if valence > 0.15:
337
- return "Happy" if arousal > 0.6 else "Excited"
338
- elif valence < -0.15:
339
- return "Angry" if arousal > 0.6 else "Anxious"
340
- return "Alert"
341
- elif arousal < -0.2:
342
- if valence > 0.15:
343
- return "Calm"
344
- elif valence < -0.15:
345
- return "Sad"
346
- return "Bored"
347
- else:
348
- if valence > 0.2:
349
- return "Content"
350
- elif valence < -0.2:
351
- return "Frustrated"
352
- return "Neutral"
353
-
354
-
355
- def _analyze_emotion(chunk: np.ndarray, sr: int) -> dict:
356
- """Estimate valence/arousal from acoustic features; return {emotion, valence, arousal}.
357
-
358
- Correlates used:
359
- Arousal ← RMS energy, mean pitch, zero-crossing rate
360
- Valence ← spectral brightness, pitch variation (tonal variety)
361
  """
362
- if len(chunk) < 512:
363
- return {"emotion": "Neutral", "valence": 0.0, "arousal": 0.0}
364
-
365
- try:
366
- # ── Energy ──────────────────────────────────────────────────────────
367
- rms = float(librosa.feature.rms(y=chunk).mean())
368
-
369
- # ── Pitch (YIN) ─────────────────────────────────────────────────────
370
- f0 = librosa.yin(chunk, fmin=60, fmax=450, sr=sr)
371
- voiced = f0[(f0 > 60) & (f0 < 450)]
372
- pitch_mean = float(voiced.mean()) if len(voiced) > 0 else 150.0
373
- pitch_std = float(voiced.std()) if len(voiced) > 0 else 0.0
374
-
375
- # ── Spectral features ────────────────────────────────────────────────
376
- spec_centroid = float(librosa.feature.spectral_centroid(y=chunk, sr=sr).mean())
377
- zcr = float(librosa.feature.zero_crossing_rate(chunk).mean())
378
-
379
- # ── Arousal (0..1 before rescaling) ─────────────────────────────────
380
- rms_n = min(rms / 0.08, 1.0) # typical speech RMS
381
- pitch_n = max(0.0, min((pitch_mean - 80) / 320, 1.0)) # 80–400 Hz
382
- zcr_n = min(zcr / 0.12, 1.0)
383
- arousal_01 = 0.5 * rms_n + 0.35 * pitch_n + 0.15 * zcr_n
384
- arousal = round(arousal_01 * 2 - 1, 3) # → -1..1
385
-
386
- # ── Valence (0..1 before rescaling) ─────────────────────────────────
387
- spec_n = min(spec_centroid / 3500, 1.0) # brighter = warmer
388
- pitch_var_n = min(pitch_std / 60, 1.0) # melodic variety
389
- valence_01 = 0.55 * spec_n + 0.45 * pitch_var_n
390
- valence = round(valence_01 * 2 - 1, 3) # → -1..1
391
-
392
- emotion = _emotion_label(valence, arousal)
393
- return {"emotion": emotion, "valence": valence, "arousal": arousal}
394
-
395
- except Exception as e:
396
- print(f"[voxtral] _analyze_emotion failed: {e}")
397
- return {"emotion": "Neutral", "valence": 0.0, "arousal": 0.0}
398
-
399
-
400
- # ─── Inference helper ──────────────────────────────────────────────────────────
401
-
402
- def _transcribe(audio_array: np.ndarray) -> str:
403
- """Run Voxtral-3B + LoRA inference via chat template; return transcribed text."""
404
- import traceback
405
- audio_sec = round(len(audio_array) / 16000, 2)
406
- model_dtype = next(model.parameters()).dtype
407
- print(f"[_transcribe] START audio={audio_sec}s device={model.device} dtype={model_dtype}", flush=True)
408
-
409
- try:
410
- t0 = time.perf_counter()
411
- inputs = processor(audio_array, return_tensors="pt")
412
- print(f"[_transcribe] processor() OK {(time.perf_counter()-t0)*1000:.0f}ms keys={list(inputs.keys())}", flush=True)
413
- except Exception:
414
- print(f"[_transcribe] processor() FAILED:\n{traceback.format_exc()}", flush=True)
415
- raise
416
-
417
- try:
418
- t0 = time.perf_counter()
419
- # move to device; cast floating tensors to model dtype to avoid dtype mismatch
420
- inputs = {
421
- k: (v.to(model.device, dtype=model_dtype) if v.is_floating_point() else v.to(model.device))
422
- for k, v in inputs.items()
423
- }
424
- input_len = inputs["input_ids"].shape[1]
425
- print(f"[_transcribe] to(device) OK {(time.perf_counter()-t0)*1000:.0f}ms input_len={input_len}", flush=True)
426
- except Exception:
427
- print(f"[_transcribe] to(device) FAILED:\n{traceback.format_exc()}", flush=True)
428
- raise
429
-
430
- try:
431
- t0 = time.perf_counter()
432
- print(f"[_transcribe] calling model.generate ...", flush=True)
433
- with torch.inference_mode():
434
- outputs = model.generate(**inputs, max_new_tokens=1024)
435
- new_tokens = outputs.shape[1] - input_len
436
- print(f"[_transcribe] model.generate OK {(time.perf_counter()-t0)*1000:.0f}ms new_tokens={new_tokens}", flush=True)
437
- except Exception:
438
- print(f"[_transcribe] model.generate FAILED:\n{traceback.format_exc()}", flush=True)
439
- raise
440
-
441
- try:
442
- # For direct processor() call, decode full output (no input prefix to strip)
443
- text = processor.decode(outputs[0], skip_special_tokens=True).strip()
444
- print(f"[_transcribe] decode OK (full) text={repr(text[:200])}", flush=True)
445
- # Also log the new-tokens-only version for comparison
446
- if input_len > 0 and outputs.shape[1] > input_len:
447
- new_only = processor.decode(outputs[0][input_len:], skip_special_tokens=True).strip()
448
- print(f"[_transcribe] decode new-only text={repr(new_only[:200])}", flush=True)
449
- return text
450
- except Exception:
451
- print(f"[_transcribe] decode FAILED:\n{traceback.format_exc()}", flush=True)
452
- raise
453
 
454
 
455
  # ─── Endpoints ─────────────────────────────────────────────────────────────────
456
 
457
  @app.post("/transcribe")
458
  async def transcribe(audio: UploadFile = File(...)):
459
- """
460
- Upload an audio file; return full transcription (offline, single response).
461
- Supported: wav, mp3, flac, ogg, m4a, webm
462
- """
463
  req_start = time.perf_counter()
464
  req_id = f"transcribe-{int(req_start * 1000)}"
465
  filename = audio.filename or "audio.wav"
466
- print(f"[voxtral] {req_id} POST /transcribe received filename={filename}")
467
 
468
  try:
469
  contents = await audio.read()
470
  except Exception as e:
471
  raise HTTPException(status_code=400, detail=f"Failed to read file: {e}")
472
-
473
  _validate_upload(contents)
474
 
475
- suffix = os.path.splitext(filename)[1].lower() or ".wav"
476
- if suffix not in (".wav", ".mp3", ".flac", ".ogg", ".m4a", ".webm"):
477
- suffix = ".wav"
478
-
479
- target_sr = getattr(getattr(processor, "feature_extractor", None), "sampling_rate", 16000)
480
- with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
481
- tmp.write(contents)
482
- tmp_path = tmp.name
483
-
484
- try:
485
- audio_array = load_audio_to_array(tmp_path, target_sr)
486
- except Exception as e:
487
- raise HTTPException(status_code=400, detail=f"Cannot decode audio: {e}")
488
- finally:
489
- if os.path.exists(tmp_path):
490
- try:
491
- os.unlink(tmp_path)
492
- except OSError:
493
- pass
494
 
495
- text = _transcribe(audio_array)
496
  total_ms = (time.perf_counter() - req_start) * 1000
497
  print(f"[voxtral] {req_id} done total={total_ms:.0f}ms text_len={len(text)}")
498
- return {"text": text, "words": [], "languageCode": None}
499
 
500
 
501
  @app.post("/transcribe-diarize")
502
- async def transcribe_diarize(
503
- audio: UploadFile = File(...),
504
- ):
505
  """
506
- Upload audio → transcription + VAD sentence segmentation + per-segment emotion analysis.
507
- Returns structured segments: [{id, speaker, start, end, text, emotion, valence, arousal}]
 
508
  All segments are labelled SPEAKER_00 (single-speaker mode).
509
  """
510
  req_start = time.perf_counter()
@@ -516,22 +305,25 @@ async def transcribe_diarize(
516
  contents = await audio.read()
517
  except Exception as e:
518
  raise HTTPException(status_code=400, detail=f"Failed to read file: {e}")
519
-
520
  _validate_upload(contents)
521
 
522
  suffix = os.path.splitext(filename)[1].lower() or ".wav"
523
  if suffix not in (".wav", ".mp3", ".flac", ".ogg", ".m4a", ".webm"):
524
  suffix = ".wav"
525
 
526
- target_sr = getattr(getattr(processor, "feature_extractor", None), "sampling_rate", 16000)
 
 
 
 
527
 
 
528
  with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
529
  tmp.write(contents)
530
  tmp_path = tmp.name
531
-
532
  try:
533
  t0 = time.perf_counter()
534
- audio_array = load_audio_to_array(tmp_path, target_sr)
535
  print(f"[voxtral] {req_id} load_audio done shape={audio_array.shape} in {(time.perf_counter()-t0)*1000:.0f}ms")
536
  except Exception as e:
537
  raise HTTPException(status_code=400, detail=f"Cannot decode audio: {e}")
@@ -542,29 +334,20 @@ async def transcribe_diarize(
542
  except OSError:
543
  pass
544
 
545
- duration = round(len(audio_array) / target_sr, 3)
546
 
547
- # ── Step 1: full transcription via Voxtral ──────────────────────────────
548
  t0 = time.perf_counter()
549
- full_text = _transcribe(audio_array)
550
- print(f"[voxtral] {req_id} transcription done in {(time.perf_counter()-t0)*1000:.0f}ms text_len={len(full_text)}")
551
 
552
- # ── Step 2: VAD sentence segmentation ───────────────────────────────────
553
- t0 = time.perf_counter()
554
- raw_segs, seg_method = _segments_from_vad(audio_array, target_sr)
555
- print(f"[voxtral] {req_id} segmentation done in {(time.perf_counter()-t0)*1000:.0f}ms segs={len(raw_segs)}")
556
-
557
- # ── Step 3: distribute text proportionally ──────────────────────────────
558
  segs_with_text = _distribute_text(full_text, raw_segs)
559
 
560
- # ── Step 4: emotion analysis per segment ────────────────────────────────
561
- t0 = time.perf_counter()
562
  segments = []
563
  for i, s in enumerate(segs_with_text):
564
- start_sample = int(s["start"] * target_sr)
565
- end_sample = int(s["end"] * target_sr)
566
- chunk = audio_array[start_sample:end_sample]
567
- emo = _analyze_emotion(chunk, target_sr)
568
  segments.append({
569
  "id": i + 1,
570
  "speaker": s["speaker"],
@@ -575,7 +358,6 @@ async def transcribe_diarize(
575
  "valence": emo["valence"],
576
  "arousal": emo["arousal"],
577
  })
578
- print(f"[voxtral] {req_id} emotion analysis done in {(time.perf_counter()-t0)*1000:.0f}ms")
579
 
580
  total_ms = (time.perf_counter() - req_start) * 1000
581
  print(f"[voxtral] {req_id} complete total={total_ms:.0f}ms segments={len(segments)}")
 
1
  """
2
+ Evoxtral speech-to-text API proxy (Model layer).
3
+ Forwards audio to the external Modal evoxtral API, then adds
4
+ VAD segmentation and emotion parsing from inline expression tags.
5
  """
6
  import os
7
+ import re
8
  import shutil
9
  import subprocess
10
  import tempfile
11
  import time
12
  from contextlib import asynccontextmanager
 
13
 
14
+ import httpx
 
15
  import librosa
16
+ import numpy as np
17
+ from fastapi import FastAPI, File, UploadFile, HTTPException
18
  from fastapi.middleware.cors import CORSMiddleware
19
 
20
+ EVOXTRAL_API = os.environ.get(
21
+ "EVOXTRAL_API",
22
+ "https://yongkang-zou1999--evoxtral-api-evoxtralmodel-web.modal.run",
23
+ ).rstrip("/")
24
  MAX_UPLOAD_BYTES = int(os.environ.get("MAX_UPLOAD_MB", "100")) * 1024 * 1024
25
+ TARGET_SR = 16000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
 
28
  def _check_ffmpeg():
 
29
  if shutil.which("ffmpeg") is None:
30
  raise RuntimeError(
31
+ "ffmpeg not found. WebM / M4A / OGG requires ffmpeg to decode.\n"
32
  " macOS: brew install ffmpeg\n"
33
+ " Ubuntu: sudo apt install ffmpeg"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  )
 
 
 
 
 
 
 
 
 
35
 
36
 
37
  @asynccontextmanager
38
  async def lifespan(app: FastAPI):
 
 
 
39
  _check_ffmpeg()
40
  print(f"[voxtral] ffmpeg: {shutil.which('ffmpeg')}")
41
+ print(f"[voxtral] Evoxtral API: {EVOXTRAL_API}")
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  try:
43
+ async with httpx.AsyncClient(timeout=15) as client:
44
+ r = await client.get(f"{EVOXTRAL_API}/health")
45
+ print(f"[voxtral] External API health: {r.status_code} {r.text[:200]}")
 
 
 
 
 
 
 
46
  except Exception as e:
47
+ print(f"[voxtral] External API health check failed: {e} (will retry on first request)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  yield
49
 
50
 
51
+ app = FastAPI(title="Evoxtral Speech-to-Text (Model)", lifespan=lifespan)
52
 
53
  app.add_middleware(
54
  CORSMiddleware,
55
+ allow_origins=["http://localhost:3000", "http://127.0.0.1:3000"],
 
 
 
56
  allow_methods=["GET", "POST", "OPTIONS"],
57
  allow_headers=["*"],
58
  )
59
 
60
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  @app.get("/health")
62
  async def health():
 
63
  return {
64
  "status": "ok",
65
+ "model": "YongkangZOU/evoxtral-lora (external API)",
66
+ "model_loaded": True,
67
  "ffmpeg": shutil.which("ffmpeg") is not None,
68
+ "pyannote_available": False,
69
+ "hf_token_set": False,
70
  "max_upload_mb": MAX_UPLOAD_BYTES // 1024 // 1024,
71
+ "evoxtral_api": EVOXTRAL_API,
72
  }
73
 
74
 
75
+ # ─── External API ──────────────────────────────────────────────────────────────
76
+
77
+ async def _call_evoxtral(contents: bytes, filename: str) -> dict:
78
+ """Forward audio bytes to the external evoxtral API; return parsed JSON.
79
+ Response: {"transcription": "...[laughs]...", "language": "en", "model": "..."}
80
+ """
81
+ async with httpx.AsyncClient(timeout=300) as client:
82
+ r = await client.post(
83
+ f"{EVOXTRAL_API}/transcribe",
84
+ files={"file": (filename, contents)},
85
+ )
86
+ if not r.is_success:
87
+ raise HTTPException(
88
+ status_code=502,
89
+ detail=f"Evoxtral API error {r.status_code}: {r.text[:300]}",
90
+ )
91
+ return r.json()
92
+
93
+
94
  # ─── Audio helpers ─────────────────────────────────────────────────────────────
95
 
96
  def _convert_to_wav_ffmpeg(path: str, target_sr: int) -> str:
 
97
  out = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
98
  out.close()
99
  rc = subprocess.run(
 
107
  )
108
  if rc.returncode != 0:
109
  os.unlink(out.name)
110
+ raise RuntimeError(f"ffmpeg failed: {rc.stderr.decode(errors='replace')[:500]}")
 
 
111
  return out.name
112
 
113
 
114
+ def _load_audio(file_path: str, target_sr: int) -> np.ndarray:
 
115
  lower = file_path.lower()
116
  if lower.endswith((".webm", ".opus", ".m4a", ".ogg")):
117
  wav_path = _convert_to_wav_ffmpeg(file_path, target_sr)
 
121
  finally:
122
  if os.path.exists(wav_path):
123
  os.unlink(wav_path)
 
124
  try:
125
  y, _ = librosa.load(file_path, sr=target_sr, mono=True)
126
  return y.astype(np.float32)
127
  except Exception as e:
 
 
128
  need_ffmpeg = (
129
  "format not recognised" in str(e).lower()
130
+ or "nobackenderror" in type(e).__name__.lower()
131
  )
132
  if need_ffmpeg:
133
  wav_path = _convert_to_wav_ffmpeg(file_path, target_sr)
 
141
 
142
 
143
  def _validate_upload(contents: bytes) -> None:
 
144
  if len(contents) == 0:
145
+ raise HTTPException(
146
+ status_code=400,
147
+ detail="Audio file is empty; record at least 1–2 seconds or choose a valid file",
148
+ )
149
  if len(contents) > MAX_UPLOAD_BYTES:
150
  mb = len(contents) / 1024 / 1024
151
  limit_mb = MAX_UPLOAD_BYTES // 1024 // 1024
 
158
  # ─── Segmentation helpers ──────────────────────────────────────────────────────
159
 
160
  def _vad_segment(audio: np.ndarray, sr: int) -> list[tuple[int, int]]:
 
 
 
 
161
  intervals = librosa.effects.split(audio, top_db=28, frame_length=2048, hop_length=512)
162
  if len(intervals) == 0:
163
  return [(0, len(audio))]
 
164
  merged: list[list[int]] = [[int(intervals[0][0]), int(intervals[0][1])]]
165
  for s, e in intervals[1:]:
166
  if (int(s) - merged[-1][1]) / sr < 0.3:
167
  merged[-1][1] = int(e)
168
  else:
169
  merged.append([int(s), int(e)])
 
170
  result = [(s, e) for s, e in merged if (e - s) / sr >= 0.3]
171
  return result if result else [(0, len(audio))]
172
 
173
 
174
  def _segments_from_vad(audio: np.ndarray, sr: int) -> tuple[list[dict], str]:
 
 
 
175
  intervals = _vad_segment(audio, sr)
176
  segs = [
177
  {"speaker": "SPEAKER_00", "start": round(s / sr, 3), "end": round(e / sr, 3)}
 
182
 
183
 
184
  def _split_sentences(text: str) -> list[str]:
 
 
185
  parts = re.split(r'(?<=[？！。?!])\s*', text)
186
  return [p for p in parts if p.strip()]
187
 
188
 
189
  def _distribute_text(full_text: str, segs: list[dict]) -> list[dict]:
 
 
 
 
190
  if not full_text or not segs:
191
  return [{**s, "text": ""} for s in segs]
 
192
  if len(segs) == 1:
193
  return [{**segs[0], "text": full_text}]
194
 
195
  sentences = _split_sentences(full_text)
 
196
  if len(sentences) <= 1:
197
  is_cjk = len(full_text.split()) <= 1
198
  sentences = list(full_text) if is_cjk else full_text.split()
 
203
 
204
  is_cjk = len(full_text.split()) <= 1 and len(full_text) > 1
205
  sep = "" if is_cjk else " "
 
 
206
  n = len(sentences)
207
  result_texts: list[list[str]] = [[] for _ in segs]
 
208
  cumulative = 0.0
209
  for i, seg in enumerate(segs):
210
  cumulative += (seg["end"] - seg["start"]) / total_dur
 
211
  threshold = cumulative * n
212
  while len(result_texts[i]) + sum(len(t) for t in result_texts[:i]) < round(threshold):
213
  idx = sum(len(t) for t in result_texts)
214
  if idx >= n:
215
  break
216
  result_texts[i].append(sentences[idx])
 
 
217
  assigned = sum(len(t) for t in result_texts)
218
  result_texts[-1].extend(sentences[assigned:])
 
219
  return [{**seg, "text": sep.join(texts)} for seg, texts in zip(segs, result_texts)]
220
 
221
 
222
+ # ─── Emotion parsing from evoxtral expression tags ─────────────────────────────
223
+
224
+ # Maps inline tags like [laughs], [sighs] → (emotion label, valence, arousal)
225
+ _TAG_EMOTIONS: dict[str, tuple[str, float, float]] = {
226
+ "laughs": ("Happy", 0.70, 0.60),
227
+ "laughing": ("Happy", 0.70, 0.60),
228
+ "chuckles": ("Happy", 0.50, 0.30),
229
+ "giggles": ("Happy", 0.60, 0.40),
230
+ "sighs": ("Sad", -0.30, -0.30),
231
+ "sighing": ("Sad", -0.30, -0.30),
232
+ "cries": ("Sad", -0.70, 0.40),
233
+ "crying": ("Sad", -0.70, 0.40),
234
+ "whispers": ("Calm", 0.10, -0.50),
235
+ "whispering":("Calm", 0.10, -0.50),
236
+ "shouts": ("Angry", -0.50, 0.80),
237
+ "shouting": ("Angry", -0.50, 0.80),
238
+ "exclaims": ("Excited", 0.50, 0.70),
239
+ "gasps": ("Surprised", 0.20, 0.70),
240
+ "hesitates": ("Anxious", -0.20, 0.30),
241
+ "stutters": ("Anxious", -0.20, 0.40),
242
+ "mumbles": ("Sad", -0.20, -0.30),
243
+ "claps": ("Happy", 0.60, 0.50),
244
+ "applause": ("Happy", 0.60, 0.50),
245
+ }
246
+
247
+
248
+ def _parse_emotion(text: str) -> dict:
249
+ """Extract the first recognized expression tag from text like [sighs] or [laughs].
250
+ Returns {"emotion": str, "valence": float, "arousal": float}.
251
+ Defaults to Neutral (0, 0) if no known tag is found.
252
  """
253
+ tags = re.findall(r'\[([^\]]+)\]', text.lower())
254
+ for tag in tags:
255
+ tag = tag.strip()
256
+ if tag in _TAG_EMOTIONS:
257
+ label, valence, arousal = _TAG_EMOTIONS[tag]
258
+ return {"emotion": label, "valence": valence, "arousal": arousal}
259
+ # Partial match (e.g. "laughs softly" β†’ "laughs")
260
+ for key, (label, valence, arousal) in _TAG_EMOTIONS.items():
261
+ if key in tag:
262
+ return {"emotion": label, "valence": valence, "arousal": arousal}
263
+ return {"emotion": "Neutral", "valence": 0.0, "arousal": 0.0}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
 
266
  # ─── Endpoints ─────────────────────────────────────────────────────────────────
267
 
268
@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...)):
    """Upload audio → plain transcription (with inline expression tags).

    The uploaded bytes are checked with ``_validate_upload`` and forwarded to
    ``_call_evoxtral``. The response is
    ``{"text": str, "words": [], "languageCode": ...}``: ``words`` is always
    empty, and ``languageCode`` is the upstream ``"language"`` value (``None``
    when absent). Responds with HTTP 400 when the upload cannot be read.
    """
    req_start = time.perf_counter()
    req_id = f"transcribe-{int(req_start * 1000)}"
    filename = audio.filename or "audio.wav"
    print(f"[voxtral] {req_id} POST /transcribe filename={filename}")

    try:
        contents = await audio.read()
    except Exception as e:
        # Chain the cause so the original traceback is kept in server logs.
        raise HTTPException(status_code=400, detail=f"Failed to read file: {e}") from e
    _validate_upload(contents)

    result = await _call_evoxtral(contents, filename)
    # `.get(key, "")` still yields None when the key is present with a JSON
    # null; coerce to "" so len(text) below and the "text" field stay a string.
    text = result.get("transcription") or ""
    lang = result.get("language")

    total_ms = (time.perf_counter() - req_start) * 1000
    print(f"[voxtral] {req_id} done total={total_ms:.0f}ms text_len={len(text)}")
    return {"text": text, "words": [], "languageCode": lang}
289
 
290
 
291
  @app.post("/transcribe-diarize")
292
+ async def transcribe_diarize(audio: UploadFile = File(...)):
 
 
293
  """
294
+ Upload audio → transcription + VAD segmentation + per-segment emotion.
295
+ Transcription is produced by the external evoxtral API (includes expressive tags).
296
+ Emotion is parsed from inline tags like [sighs], [laughs], etc.
297
  All segments are labelled SPEAKER_00 (single-speaker mode).
298
  """
299
  req_start = time.perf_counter()
 
305
  contents = await audio.read()
306
  except Exception as e:
307
  raise HTTPException(status_code=400, detail=f"Failed to read file: {e}")
 
308
  _validate_upload(contents)
309
 
310
  suffix = os.path.splitext(filename)[1].lower() or ".wav"
311
  if suffix not in (".wav", ".mp3", ".flac", ".ogg", ".m4a", ".webm"):
312
  suffix = ".wav"
313
 
314
+ # ── Step 1: call external evoxtral API ──────────────────────────────────
315
+ t0 = time.perf_counter()
316
+ result = await _call_evoxtral(contents, filename)
317
+ full_text = result.get("transcription", "")
318
+ print(f"[voxtral] {req_id} evoxtral API done {(time.perf_counter()-t0)*1000:.0f}ms text_len={len(full_text)}")
319
 
320
+ # ── Step 2: load audio for VAD segmentation ──────────────────────────────
321
  with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
322
  tmp.write(contents)
323
  tmp_path = tmp.name
 
324
  try:
325
  t0 = time.perf_counter()
326
+ audio_array = _load_audio(tmp_path, TARGET_SR)
327
  print(f"[voxtral] {req_id} load_audio done shape={audio_array.shape} in {(time.perf_counter()-t0)*1000:.0f}ms")
328
  except Exception as e:
329
  raise HTTPException(status_code=400, detail=f"Cannot decode audio: {e}")
 
334
  except OSError:
335
  pass
336
 
337
+ duration = round(len(audio_array) / TARGET_SR, 3)
338
 
339
+ # ── Step 3: VAD sentence segmentation ───────────────────────────────────
340
  t0 = time.perf_counter()
341
+ raw_segs, seg_method = _segments_from_vad(audio_array, TARGET_SR)
342
+ print(f"[voxtral] {req_id} segmentation done {(time.perf_counter()-t0)*1000:.0f}ms segs={len(raw_segs)}")
343
 
344
+ # ── Step 4: distribute text to segments ─────────────────────────────────
 
 
 
 
 
345
  segs_with_text = _distribute_text(full_text, raw_segs)
346
 
347
+ # ── Step 5: parse emotion from expression tags ──────────────────────────
 
348
  segments = []
349
  for i, s in enumerate(segs_with_text):
350
+ emo = _parse_emotion(s["text"])
 
 
 
351
  segments.append({
352
  "id": i + 1,
353
  "speaker": s["speaker"],
 
358
  "valence": emo["valence"],
359
  "arousal": emo["arousal"],
360
  })
 
361
 
362
  total_ms = (time.perf_counter() - req_start) * 1000
363
  print(f"[voxtral] {req_id} complete total={total_ms:.0f}ms segments={len(segments)}")
model/voxtral-server/requirements.txt CHANGED
@@ -1,16 +1,8 @@
1
- # Voxtral-Mini-3B-2507 + LoRA adapter (YongkangZOU/evoxtral-lora)
2
  fastapi>=0.115.0
3
  uvicorn[standard]>=0.32.0
4
  python-multipart>=0.0.9
5
- transformers==4.54.0
6
- peft>=0.18.0
7
- torch>=2.0.0
8
- accelerate>=0.33.0
9
- mistral-common[audio]>=1.5.0
10
  librosa>=0.10.0
11
  soundfile>=0.12.0
12
  numpy>=1.24.0
13
- scikit-learn>=1.3.0
14
- # Optional: production-grade speaker diarization (requires HF_TOKEN env var + model license acceptance)
15
- # pip install pyannote.audio>=3.1.0
16
- # Then: export HF_TOKEN=your_token
 
1
+ # Evoxtral API proxy — calls external Modal API for inference (no local model)
2
  fastapi>=0.115.0
3
  uvicorn[standard]>=0.32.0
4
  python-multipart>=0.0.9
5
+ httpx>=0.27.0
 
 
 
 
6
  librosa>=0.10.0
7
  soundfile>=0.12.0
8
  numpy>=1.24.0