Spaces:

mistral-hackaton-2026
/

ethos

Running

Lior-0618 Claude Opus 4.6 committed on Mar 1

Commit

8e1d7bd

1 Parent(s): f07c505

feat: switch to external evoxtral Modal API (no local model)

- Replace local VoxtralForConditionalGeneration+PEFT inference with
HTTP calls to https://yongkang-zou1999--evoxtral-api-evoxtralmodel-web.modal.run
- Remove torch, transformers, peft, accelerate, mistral-common from requirements.txt
- Add httpx for async HTTP client
- Parse inline expression tags ([laughs], [sighs], etc.) from transcription
to derive emotion/valence/arousal per segment
- Remove model weight caching from Dockerfile (no local weights needed)
- Server startup is now instant (no model loading)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (3) hide show

Dockerfile +0 -6
model/voxtral-server/main.py +117 -335
model/voxtral-server/requirements.txt +2 -10

Dockerfile CHANGED Viewed

@@ -43,12 +43,6 @@ RUN cd demo && NEXT_PUBLIC_API_URL="" npm run build \
 COPY nginx.conf /etc/nginx/nginx.conf
 COPY supervisord.conf /etc/supervisor/conf.d/app.conf
-# ─── Model weight cache ───────────────────────────────────────────────────────
-# /data is persisted across Space restarts on HuggingFace Spaces
-RUN mkdir -p /data/models
-ENV TRANSFORMERS_CACHE=/data/models
-ENV HF_HOME=/data/models
 # HuggingFace Spaces public port
 EXPOSE 7860

 COPY nginx.conf /etc/nginx/nginx.conf
 COPY supervisord.conf /etc/supervisor/conf.d/app.conf
 # HuggingFace Spaces public port
 EXPOSE 7860

model/voxtral-server/main.py CHANGED Viewed

@@ -1,180 +1,99 @@
 """
-Voxtral speech-to-text API (offline transcription + speaker diarization) - Model layer.
-Model ID can be overridden with env VOXTRAL_MODEL_ID; default mistralai/Voxtral-Mini-4B-Realtime-2602
 """
 import os
 import shutil
 import subprocess
 import tempfile
 import time
 from contextlib import asynccontextmanager
-from typing import Optional
-import torch
-import numpy as np
 import librosa
-import soundfile as sf
-from fastapi import FastAPI, File, UploadFile, HTTPException, Query
 from fastapi.middleware.cors import CORSMiddleware
-REPO_ID = os.environ.get("VOXTRAL_MODEL_ID", "YongkangZOU/evoxtral-lora")
-BASE_MODEL_ID = "mistralai/Voxtral-Mini-3B-2507"
 MAX_UPLOAD_BYTES = int(os.environ.get("MAX_UPLOAD_MB", "100")) * 1024 * 1024
-HF_TOKEN = os.environ.get("HF_TOKEN")  # optional: enables pyannote speaker diarization
-processor = None
-model = None
-# Optional: pyannote pipeline (loaded lazily on first diarize request if HF_TOKEN is set)
-_pyannote_pipeline = None
-_pyannote_loaded = False
-_pyannote_available = False
-try:
-    from pyannote.audio import Pipeline as _PyannotePipeline
-    _pyannote_available = True
-except ImportError:
-    pass
 def _check_ffmpeg():
-    """Check ffmpeg is available at startup; raise with clear message if not."""
     if shutil.which("ffmpeg") is None:
         raise RuntimeError(
-            "ffmpeg not found. WebM (e.g. browser recording) requires ffmpeg to decode.\n"
             "  macOS:   brew install ffmpeg\n"
-            "  Ubuntu:  sudo apt install ffmpeg\n"
-            "  Windows: https://ffmpeg.org/download.html\n"
-            "Then restart this service."
-        )
-def _get_pyannote_pipeline():
-    """Lazy-load pyannote pipeline (requires HF_TOKEN and pyannote.audio installed)."""
-    global _pyannote_pipeline, _pyannote_loaded
-    if _pyannote_loaded:
-        return _pyannote_pipeline
-    _pyannote_loaded = True
-    if not _pyannote_available or not HF_TOKEN:
-        print("[voxtral] pyannote: not available (install pyannote.audio and set HF_TOKEN for real diarization; using VAD+MFCC fallback)")
-        return None
-    try:
-        pipeline = _PyannotePipeline.from_pretrained(
-            "pyannote/speaker-diarization-3.1",
-            use_auth_token=HF_TOKEN,
         )
-        if torch.cuda.is_available():
-            pipeline = pipeline.to(torch.device("cuda"))
-        elif torch.backends.mps.is_available():
-            pipeline = pipeline.to(torch.device("mps"))
-        _pyannote_pipeline = pipeline
-        print("[voxtral] pyannote speaker-diarization-3.1 loaded")
-    except Exception as e:
-        print(f"[voxtral] pyannote load failed: {e} — using VAD+MFCC fallback")
-    return _pyannote_pipeline
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    """On startup: check deps and load model."""
-    global processor, model
     _check_ffmpeg()
     print(f"[voxtral] ffmpeg: {shutil.which('ffmpeg')}")
-    if torch.cuda.is_available():
-        _device = torch.device("cuda")
-        _dtype = torch.bfloat16
-    elif torch.backends.mps.is_available():
-        _device = torch.device("mps")
-        _dtype = torch.float16   # MPS does not support bfloat16
-    else:
-        _device = torch.device("cpu")
-        _dtype = torch.bfloat16  # halves memory vs float32 (8 GB vs 16 GB); supported on modern x86
-    print(f"[voxtral] Device: {_device}  dtype: {_dtype}")
-    print(f"[voxtral] Loading base model: {BASE_MODEL_ID} ...")
-    print(f"[voxtral] Applying LoRA adapter: {REPO_ID} ...")
     try:
-        from transformers import VoxtralForConditionalGeneration, AutoProcessor
-        from peft import PeftModel
-        processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)
-        base = VoxtralForConditionalGeneration.from_pretrained(
-            BASE_MODEL_ID, torch_dtype=_dtype
-        ).to(_device)
-        model = PeftModel.from_pretrained(base, REPO_ID)
-        model.eval()
-        print(f"[voxtral] Model ready: {BASE_MODEL_ID} + LoRA {REPO_ID} on {_device}")
     except Exception as e:
-        raise RuntimeError(
-            f"Model load failed: {e}\n"
-            "Ensure deps are installed: pip install -r requirements.txt\n"
-            "And sufficient VRAM (recommended ≥16GB) or use CPU (slower)."
-        ) from e
-    # Warm-up: run one silent dummy inference to pre-compile MPS Metal shaders.
-    print("[voxtral] Warming up (dummy inference)...")
-    try:
-        sr = getattr(getattr(processor, "feature_extractor", None), "sampling_rate", 16000)
-        dummy = np.zeros(sr, dtype=np.float32)  # 1 second of silence
-        with torch.inference_mode():
-            dummy_inputs = processor(dummy, return_tensors="pt")
-            dummy_inputs = {
-                k: (v.to(_device, dtype=_dtype) if v.is_floating_point() else v.to(_device))
-                for k, v in dummy_inputs.items()
-            }
-            model.generate(**dummy_inputs, max_new_tokens=1)
-        print("[voxtral] Warm-up complete — first request will be fast")
-    except Exception as e:
-        print(f"[voxtral] Warm-up skipped: {e}")
     yield
-app = FastAPI(title="Voxtral Speech-to-Text (Model)", lifespan=lifespan)
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=[
-        "http://localhost:3000",
-        "http://127.0.0.1:3000",
-    ],
     allow_methods=["GET", "POST", "OPTIONS"],
     allow_headers=["*"],
 )
-@app.get("/debug-inference")
-async def debug_inference():
-    """Run a 1-second silent inference and return full result or traceback."""
-    import traceback as tb
-    try:
-        dummy = np.zeros(16000, dtype=np.float32)
-        text = _transcribe(dummy)
-        return {"status": "ok", "text": text}
-    except Exception as e:
-        return {"status": "error", "error": str(e), "traceback": tb.format_exc()}
 @app.get("/health")
 async def health():
-    """Health check: service and dependency status."""
     return {
         "status": "ok",
-        "model": REPO_ID,
-        "model_loaded": model is not None,
         "ffmpeg": shutil.which("ffmpeg") is not None,
-        "pyannote_available": _pyannote_available,
-        "hf_token_set": bool(HF_TOKEN),
         "max_upload_mb": MAX_UPLOAD_BYTES // 1024 // 1024,
     }
 # ─── Audio helpers ─────────────────────────────────────────────────────────────
 def _convert_to_wav_ffmpeg(path: str, target_sr: int) -> str:
-    """Convert any format to 16kHz mono WAV with ffmpeg; return path to new file."""
     out = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
     out.close()
     rc = subprocess.run(
@@ -188,14 +107,11 @@ def _convert_to_wav_ffmpeg(path: str, target_sr: int) -> str:
     )
     if rc.returncode != 0:
         os.unlink(out.name)
-        raise RuntimeError(
-            f"ffmpeg failed: {rc.stderr.decode(errors='replace')[:500]}"
-        )
     return out.name
-def load_audio_to_array(file_path: str, target_sr: int) -> np.ndarray:
-    """Load audio to mono float32 and resample to target_sr."""
     lower = file_path.lower()
     if lower.endswith((".webm", ".opus", ".m4a", ".ogg")):
         wav_path = _convert_to_wav_ffmpeg(file_path, target_sr)
@@ -205,16 +121,13 @@ def load_audio_to_array(file_path: str, target_sr: int) -> np.ndarray:
         finally:
             if os.path.exists(wav_path):
                 os.unlink(wav_path)
     try:
         y, _ = librosa.load(file_path, sr=target_sr, mono=True)
         return y.astype(np.float32)
     except Exception as e:
-        if not os.path.isfile(file_path):
-            raise
         need_ffmpeg = (
             "format not recognised" in str(e).lower()
-            or "nobackenderror" in str(type(e).__name__).lower()
         )
         if need_ffmpeg:
             wav_path = _convert_to_wav_ffmpeg(file_path, target_sr)
@@ -228,9 +141,11 @@ def load_audio_to_array(file_path: str, target_sr: int) -> np.ndarray:
 def _validate_upload(contents: bytes) -> None:
-    """Validate upload: non-empty and within size limit."""
     if len(contents) == 0:
-        raise HTTPException(status_code=400, detail="Audio file is empty; record at least 1–2 seconds or choose a valid file")
     if len(contents) > MAX_UPLOAD_BYTES:
         mb = len(contents) / 1024 / 1024
         limit_mb = MAX_UPLOAD_BYTES // 1024 // 1024
@@ -243,29 +158,20 @@ def _validate_upload(contents: bytes) -> None:
 # ─── Segmentation helpers ──────────────────────────────────────────────────────
 def _vad_segment(audio: np.ndarray, sr: int) -> list[tuple[int, int]]:
-    """Split audio into speech segments by silence detection.
-    Merges gaps < 0.5 s (intra-phrase pauses) and drops segments < 0.3 s.
-    Returns list of (start_sample, end_sample).
-    """
     intervals = librosa.effects.split(audio, top_db=28, frame_length=2048, hop_length=512)
     if len(intervals) == 0:
         return [(0, len(audio))]
     merged: list[list[int]] = [[int(intervals[0][0]), int(intervals[0][1])]]
     for s, e in intervals[1:]:
         if (int(s) - merged[-1][1]) / sr < 0.3:
             merged[-1][1] = int(e)
         else:
             merged.append([int(s), int(e)])
     result = [(s, e) for s, e in merged if (e - s) / sr >= 0.3]
     return result if result else [(0, len(audio))]
 def _segments_from_vad(audio: np.ndarray, sr: int) -> tuple[list[dict], str]:
-    """Segment audio by silence, assign all segments to SPEAKER_00.
-    Returns (segments, method_name).
-    """
     intervals = _vad_segment(audio, sr)
     segs = [
         {"speaker": "SPEAKER_00", "start": round(s / sr, 3), "end": round(e / sr, 3)}
@@ -276,25 +182,17 @@ def _segments_from_vad(audio: np.ndarray, sr: int) -> tuple[list[dict], str]:
 def _split_sentences(text: str) -> list[str]:
-    """Split text into sentences at punctuation boundaries (CJK + Latin)."""
-    import re
     parts = re.split(r'(?<=[？！。?!])\s*', text)
     return [p for p in parts if p.strip()]
 def _distribute_text(full_text: str, segs: list[dict]) -> list[dict]:
-    """Assign complete sentences to segments by time proportion.
-    Sentences are never split mid-punctuation; each segment gets whole sentences.
-    Falls back to character-level splitting if no sentence boundaries found.
-    """
     if not full_text or not segs:
         return [{**s, "text": ""} for s in segs]
     if len(segs) == 1:
         return [{**segs[0], "text": full_text}]
     sentences = _split_sentences(full_text)
-    # Fallback: split by character if no sentence boundaries
     if len(sentences) <= 1:
         is_cjk = len(full_text.split()) <= 1
         sentences = list(full_text) if is_cjk else full_text.split()
@@ -305,206 +203,97 @@ def _distribute_text(full_text: str, segs: list[dict]) -> list[dict]:
     is_cjk = len(full_text.split()) <= 1 and len(full_text) > 1
     sep = "" if is_cjk else " "
-    # Assign each sentence to the segment whose cumulative time covers its proportional position
     n = len(sentences)
     result_texts: list[list[str]] = [[] for _ in segs]
     cumulative = 0.0
     for i, seg in enumerate(segs):
         cumulative += (seg["end"] - seg["start"]) / total_dur
-        # Assign sentences whose proportional position falls within this segment's cumulative range
         threshold = cumulative * n
         while len(result_texts[i]) + sum(len(t) for t in result_texts[:i]) < round(threshold):
             idx = sum(len(t) for t in result_texts)
             if idx >= n:
                 break
             result_texts[i].append(sentences[idx])
-    # Ensure any leftover sentences go to the last segment
     assigned = sum(len(t) for t in result_texts)
     result_texts[-1].extend(sentences[assigned:])
     return [{**seg, "text": sep.join(texts)} for seg, texts in zip(segs, result_texts)]
-# ─── Emotion analysis ──────────────────────────────────────────────────────────
-def _emotion_label(valence: float, arousal: float) -> str:
-    """Map continuous valence/arousal to a discrete emotion label."""
-    if arousal > 0.3:
-        if valence > 0.15:
-            return "Happy" if arousal > 0.6 else "Excited"
-        elif valence < -0.15:
-            return "Angry" if arousal > 0.6 else "Anxious"
-        return "Alert"
-    elif arousal < -0.2:
-        if valence > 0.15:
-            return "Calm"
-        elif valence < -0.15:
-            return "Sad"
-        return "Bored"
-    else:
-        if valence > 0.2:
-            return "Content"
-        elif valence < -0.2:
-            return "Frustrated"
-        return "Neutral"
-def _analyze_emotion(chunk: np.ndarray, sr: int) -> dict:
-    """Estimate valence/arousal from acoustic features; return {emotion, valence, arousal}.
-    Correlates used:
-      Arousal  ← RMS energy, mean pitch, zero-crossing rate
-      Valence  ← spectral brightness, pitch variation (tonal variety)
     """
-    if len(chunk) < 512:
-        return {"emotion": "Neutral", "valence": 0.0, "arousal": 0.0}
-    try:
-        # ── Energy ──────────────────────────────────────────────────────────
-        rms = float(librosa.feature.rms(y=chunk).mean())
-        # ── Pitch (YIN) ─────────────────────────────────────────────────────
-        f0 = librosa.yin(chunk, fmin=60, fmax=450, sr=sr)
-        voiced = f0[(f0 > 60) & (f0 < 450)]
-        pitch_mean = float(voiced.mean()) if len(voiced) > 0 else 150.0
-        pitch_std  = float(voiced.std())  if len(voiced) > 0 else 0.0
-        # ── Spectral features ────────────────────────────────────────────────
-        spec_centroid = float(librosa.feature.spectral_centroid(y=chunk, sr=sr).mean())
-        zcr           = float(librosa.feature.zero_crossing_rate(chunk).mean())
-        # ── Arousal (0..1 before rescaling) ─────────────────────────────────
-        rms_n   = min(rms / 0.08, 1.0)                    # typical speech RMS
-        pitch_n = max(0.0, min((pitch_mean - 80) / 320, 1.0))  # 80–400 Hz
-        zcr_n   = min(zcr / 0.12, 1.0)
-        arousal_01 = 0.5 * rms_n + 0.35 * pitch_n + 0.15 * zcr_n
-        arousal = round(arousal_01 * 2 - 1, 3)            # → -1..1
-        # ── Valence (0..1 before rescaling) ─────────────────────────────────
-        spec_n     = min(spec_centroid / 3500, 1.0)        # brighter = warmer
-        pitch_var_n = min(pitch_std / 60, 1.0)             # melodic variety
-        valence_01 = 0.55 * spec_n + 0.45 * pitch_var_n
-        valence = round(valence_01 * 2 - 1, 3)            # → -1..1
-        emotion = _emotion_label(valence, arousal)
-        return {"emotion": emotion, "valence": valence, "arousal": arousal}
-    except Exception as e:
-        print(f"[voxtral] _analyze_emotion failed: {e}")
-        return {"emotion": "Neutral", "valence": 0.0, "arousal": 0.0}
-# ─── Inference helper ──────────────────────────────────────────────────────────
-def _transcribe(audio_array: np.ndarray) -> str:
-    """Run Voxtral-3B + LoRA inference via chat template; return transcribed text."""
-    import traceback
-    audio_sec = round(len(audio_array) / 16000, 2)
-    model_dtype = next(model.parameters()).dtype
-    print(f"[_transcribe] START audio={audio_sec}s device={model.device} dtype={model_dtype}", flush=True)
-    try:
-        t0 = time.perf_counter()
-        inputs = processor(audio_array, return_tensors="pt")
-        print(f"[_transcribe] processor() OK {(time.perf_counter()-t0)*1000:.0f}ms keys={list(inputs.keys())}", flush=True)
-    except Exception:
-        print(f"[_transcribe] processor() FAILED:\n{traceback.format_exc()}", flush=True)
-        raise
-    try:
-        t0 = time.perf_counter()
-        # move to device; cast floating tensors to model dtype to avoid dtype mismatch
-        inputs = {
-            k: (v.to(model.device, dtype=model_dtype) if v.is_floating_point() else v.to(model.device))
-            for k, v in inputs.items()
-        }
-        input_len = inputs["input_ids"].shape[1]
-        print(f"[_transcribe] to(device) OK {(time.perf_counter()-t0)*1000:.0f}ms input_len={input_len}", flush=True)
-    except Exception:
-        print(f"[_transcribe] to(device) FAILED:\n{traceback.format_exc()}", flush=True)
-        raise
-    try:
-        t0 = time.perf_counter()
-        print(f"[_transcribe] calling model.generate ...", flush=True)
-        with torch.inference_mode():
-            outputs = model.generate(**inputs, max_new_tokens=1024)
-        new_tokens = outputs.shape[1] - input_len
-        print(f"[_transcribe] model.generate OK {(time.perf_counter()-t0)*1000:.0f}ms new_tokens={new_tokens}", flush=True)
-    except Exception:
-        print(f"[_transcribe] model.generate FAILED:\n{traceback.format_exc()}", flush=True)
-        raise
-    try:
-        # For direct processor() call, decode full output (no input prefix to strip)
-        text = processor.decode(outputs[0], skip_special_tokens=True).strip()
-        print(f"[_transcribe] decode OK (full) text={repr(text[:200])}", flush=True)
-        # Also log the new-tokens-only version for comparison
-        if input_len > 0 and outputs.shape[1] > input_len:
-            new_only = processor.decode(outputs[0][input_len:], skip_special_tokens=True).strip()
-            print(f"[_transcribe] decode new-only text={repr(new_only[:200])}", flush=True)
-        return text
-    except Exception:
-        print(f"[_transcribe] decode FAILED:\n{traceback.format_exc()}", flush=True)
-        raise
 # ─── Endpoints ─────────────────────────────────────────────────────────────────
 @app.post("/transcribe")
 async def transcribe(audio: UploadFile = File(...)):
-    """
-    Upload an audio file; return full transcription (offline, single response).
-    Supported: wav, mp3, flac, ogg, m4a, webm
-    """
     req_start = time.perf_counter()
     req_id = f"transcribe-{int(req_start * 1000)}"
     filename = audio.filename or "audio.wav"
-    print(f"[voxtral] {req_id} POST /transcribe received filename={filename}")
     try:
         contents = await audio.read()
     except Exception as e:
         raise HTTPException(status_code=400, detail=f"Failed to read file: {e}")
     _validate_upload(contents)
-    suffix = os.path.splitext(filename)[1].lower() or ".wav"
-    if suffix not in (".wav", ".mp3", ".flac", ".ogg", ".m4a", ".webm"):
-        suffix = ".wav"
-    target_sr = getattr(getattr(processor, "feature_extractor", None), "sampling_rate", 16000)
-    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
-        tmp.write(contents)
-        tmp_path = tmp.name
-    try:
-        audio_array = load_audio_to_array(tmp_path, target_sr)
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Cannot decode audio: {e}")
-    finally:
-        if os.path.exists(tmp_path):
-            try:
-                os.unlink(tmp_path)
-            except OSError:
-                pass
-    text = _transcribe(audio_array)
     total_ms = (time.perf_counter() - req_start) * 1000
     print(f"[voxtral] {req_id} done total={total_ms:.0f}ms text_len={len(text)}")
-    return {"text": text, "words": [], "languageCode": None}
 @app.post("/transcribe-diarize")
-async def transcribe_diarize(
-    audio: UploadFile = File(...),
-):
     """
-    Upload audio → transcription + VAD sentence segmentation + per-segment emotion analysis.
-    Returns structured segments: [{id, speaker, start, end, text, emotion, valence, arousal}]
     All segments are labelled SPEAKER_00 (single-speaker mode).
     """
     req_start = time.perf_counter()
@@ -516,22 +305,25 @@ async def transcribe_diarize(
         contents = await audio.read()
     except Exception as e:
         raise HTTPException(status_code=400, detail=f"Failed to read file: {e}")
     _validate_upload(contents)
     suffix = os.path.splitext(filename)[1].lower() or ".wav"
     if suffix not in (".wav", ".mp3", ".flac", ".ogg", ".m4a", ".webm"):
         suffix = ".wav"
-    target_sr = getattr(getattr(processor, "feature_extractor", None), "sampling_rate", 16000)
     with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
         tmp.write(contents)
         tmp_path = tmp.name
     try:
         t0 = time.perf_counter()
-        audio_array = load_audio_to_array(tmp_path, target_sr)
         print(f"[voxtral] {req_id} load_audio done shape={audio_array.shape} in {(time.perf_counter()-t0)*1000:.0f}ms")
     except Exception as e:
         raise HTTPException(status_code=400, detail=f"Cannot decode audio: {e}")
@@ -542,29 +334,20 @@ async def transcribe_diarize(
             except OSError:
                 pass
-    duration = round(len(audio_array) / target_sr, 3)
-    # ── Step 1: full transcription via Voxtral ──────────────────────────────
     t0 = time.perf_counter()
-    full_text = _transcribe(audio_array)
-    print(f"[voxtral] {req_id} transcription done in {(time.perf_counter()-t0)*1000:.0f}ms text_len={len(full_text)}")
-    # ── Step 2: VAD sentence segmentation ───────────────────────────────────
-    t0 = time.perf_counter()
-    raw_segs, seg_method = _segments_from_vad(audio_array, target_sr)
-    print(f"[voxtral] {req_id} segmentation done in {(time.perf_counter()-t0)*1000:.0f}ms segs={len(raw_segs)}")
-    # ── Step 3: distribute text proportionally ──────────────────────────────
     segs_with_text = _distribute_text(full_text, raw_segs)
-    # ── Step 4: emotion analysis per segment ────────────────────────────────
-    t0 = time.perf_counter()
     segments = []
     for i, s in enumerate(segs_with_text):
-        start_sample = int(s["start"] * target_sr)
-        end_sample   = int(s["end"]   * target_sr)
-        chunk = audio_array[start_sample:end_sample]
-        emo = _analyze_emotion(chunk, target_sr)
         segments.append({
             "id":      i + 1,
             "speaker": s["speaker"],
@@ -575,7 +358,6 @@ async def transcribe_diarize(
             "valence": emo["valence"],
             "arousal": emo["arousal"],
         })
-    print(f"[voxtral] {req_id} emotion analysis done in {(time.perf_counter()-t0)*1000:.0f}ms")
     total_ms = (time.perf_counter() - req_start) * 1000
     print(f"[voxtral] {req_id} complete total={total_ms:.0f}ms segments={len(segments)}")

 """
+Evoxtral speech-to-text API proxy (Model layer).
+Forwards audio to the external Modal evoxtral API, then adds
+VAD segmentation and emotion parsing from inline expression tags.
 """
 import os
+import re
 import shutil
 import subprocess
 import tempfile
 import time
 from contextlib import asynccontextmanager
+import httpx
 import librosa
+import numpy as np
+from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
+EVOXTRAL_API = os.environ.get(
+    "EVOXTRAL_API",
+    "https://yongkang-zou1999--evoxtral-api-evoxtralmodel-web.modal.run",
+).rstrip("/")
 MAX_UPLOAD_BYTES = int(os.environ.get("MAX_UPLOAD_MB", "100")) * 1024 * 1024
+TARGET_SR = 16000
 def _check_ffmpeg():
     """Raise RuntimeError with install hints when no ffmpeg executable is found."""
     if shutil.which("ffmpeg") is not None:
         return
     install_hint = (
         "ffmpeg not found. WebM / M4A / OGG requires ffmpeg to decode.\n"
         "  macOS:   brew install ffmpeg\n"
         "  Ubuntu:  sudo apt install ffmpeg"
     )
     raise RuntimeError(install_hint)
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Startup hook: require ffmpeg, then probe the external API's /health.

     A failed probe is only logged; startup continues either way.
     """
     _check_ffmpeg()
     ffmpeg_path = shutil.which("ffmpeg")
     print(f"[voxtral] ffmpeg: {ffmpeg_path}")
     print(f"[voxtral] Evoxtral API: {EVOXTRAL_API}")
     health_url = f"{EVOXTRAL_API}/health"
     try:
         async with httpx.AsyncClient(timeout=15) as probe:
             resp = await probe.get(health_url)
             print(f"[voxtral] External API health: {resp.status_code} {resp.text[:200]}")
     except Exception as err:
         # Best-effort probe: the service still starts when the remote API is unreachable.
         print(f"[voxtral] External API health check failed: {err} (will retry on first request)")
     yield
+app = FastAPI(title="Evoxtral Speech-to-Text (Model)", lifespan=lifespan)
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["http://localhost:3000", "http://127.0.0.1:3000"],
     allow_methods=["GET", "POST", "OPTIONS"],
     allow_headers=["*"],
 )
 @app.get("/health")
 async def health():
     """Return a status payload: fixed model info, ffmpeg presence, upload limit, API URL."""
     ffmpeg_present = shutil.which("ffmpeg") is not None
     upload_limit_mb = MAX_UPLOAD_BYTES // 1024 // 1024
     payload = {
         "status": "ok",
         "model": "YongkangZOU/evoxtral-lora (external API)",
         "model_loaded": True,
         "ffmpeg": ffmpeg_present,
         "pyannote_available": False,
         "hf_token_set": False,
         "max_upload_mb": upload_limit_mb,
         "evoxtral_api": EVOXTRAL_API,
     }
     return payload
+# ─── External API ──────────────────────────────────────────────────────────────
async def _call_evoxtral(contents: bytes, filename: str) -> dict:
    """Forward audio bytes to the external evoxtral API; return parsed JSON.

    Expected response shape:
        {"transcription": "...[laughs]...", "language": "en", "model": "..."}

    Args:
        contents: Raw bytes of the uploaded audio file.
        filename: File name sent along in the multipart upload.

    Returns:
        The decoded JSON object returned by the API.

    Raises:
        HTTPException: 504 when the upstream request times out; 502 when the
            request fails for any other transport reason, when the API answers
            with a non-success status, or when the body is not a JSON object.
    """
    try:
        async with httpx.AsyncClient(timeout=300) as client:
            r = await client.post(
                f"{EVOXTRAL_API}/transcribe",
                files={"file": (filename, contents)},
            )
    except httpx.TimeoutException as e:
        # Must come first: TimeoutException is a subclass of HTTPError.
        raise HTTPException(
            status_code=504,
            detail=f"Evoxtral API timed out: {e}",
        ) from e
    except httpx.HTTPError as e:
        # Connection refused, DNS failure, broken stream, ... — report as a
        # gateway error instead of letting it surface as an unhandled 500.
        raise HTTPException(
            status_code=502,
            detail=f"Evoxtral API unreachable: {e}",
        ) from e
    if not r.is_success:
        raise HTTPException(
            status_code=502,
            detail=f"Evoxtral API error {r.status_code}: {r.text[:300]}",
        )
    try:
        payload = r.json()
    except ValueError as e:
        raise HTTPException(
            status_code=502,
            detail=f"Evoxtral API returned invalid JSON: {r.text[:300]}",
        ) from e
    if not isinstance(payload, dict):
        # Callers use .get() on the result, so anything but an object is unusable.
        raise HTTPException(
            status_code=502,
            detail=f"Evoxtral API returned unexpected JSON: {r.text[:300]}",
        )
    return payload
 # ─── Audio helpers ─────────────────────────────────────────────────────────────
 def _convert_to_wav_ffmpeg(path: str, target_sr: int) -> str:
     out = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
     out.close()
     rc = subprocess.run(
     )
     if rc.returncode != 0:
         os.unlink(out.name)
+        raise RuntimeError(f"ffmpeg failed: {rc.stderr.decode(errors='replace')[:500]}")
     return out.name
+def _load_audio(file_path: str, target_sr: int) -> np.ndarray:
     lower = file_path.lower()
     if lower.endswith((".webm", ".opus", ".m4a", ".ogg")):
         wav_path = _convert_to_wav_ffmpeg(file_path, target_sr)
         finally:
             if os.path.exists(wav_path):
                 os.unlink(wav_path)
     try:
         y, _ = librosa.load(file_path, sr=target_sr, mono=True)
         return y.astype(np.float32)
     except Exception as e:
         need_ffmpeg = (
             "format not recognised" in str(e).lower()
+            or "nobackenderror" in type(e).__name__.lower()
         )
         if need_ffmpeg:
             wav_path = _convert_to_wav_ffmpeg(file_path, target_sr)
 def _validate_upload(contents: bytes) -> None:
     if len(contents) == 0:
+        raise HTTPException(
+            status_code=400,
+            detail="Audio file is empty; record at least 1–2 seconds or choose a valid file",
+        )
     if len(contents) > MAX_UPLOAD_BYTES:
         mb = len(contents) / 1024 / 1024
         limit_mb = MAX_UPLOAD_BYTES // 1024 // 1024
 # ─── Segmentation helpers ──────────────────────────────────────────────────────
 def _vad_segment(audio: np.ndarray, sr: int) -> list[tuple[int, int]]:
     """Split audio into non-silent spans given as (start_sample, end_sample).

     Spans separated by a gap shorter than 0.3 s are joined, then spans shorter
     than 0.3 s are dropped. When nothing is detected or nothing survives the
     filter, the whole clip is returned as one span.
     """
     whole_clip = [(0, len(audio))]
     voiced = librosa.effects.split(audio, top_db=28, frame_length=2048, hop_length=512)
     if len(voiced) == 0:
         return whole_clip

     spans: list[list[int]] = []
     for raw_start, raw_end in voiced:
         begin, finish = int(raw_start), int(raw_end)
         if spans and (begin - spans[-1][1]) / sr < 0.3:
             # Short pause: extend the previous span instead of opening a new one.
             spans[-1][1] = finish
         else:
             spans.append([begin, finish])

     kept = [(begin, finish) for begin, finish in spans if (finish - begin) / sr >= 0.3]
     return kept or whole_clip
 def _segments_from_vad(audio: np.ndarray, sr: int) -> tuple[list[dict], str]:
     intervals = _vad_segment(audio, sr)
     segs = [
         {"speaker": "SPEAKER_00", "start": round(s / sr, 3), "end": round(e / sr, 3)}
 def _split_sentences(text: str) -> list[str]:
     """Break text after each sentence-ending mark (？ ！ 。 ? !), dropping blank pieces.

     Whitespace following a mark is consumed; the mark itself stays attached
     to the sentence it ends.
     """
     sentences = []
     for piece in re.split(r'(?<=[？！。?!])\s*', text):
         if piece.strip():
             sentences.append(piece)
     return sentences
 def _distribute_text(full_text: str, segs: list[dict]) -> list[dict]:
     if not full_text or not segs:
         return [{**s, "text": ""} for s in segs]
     if len(segs) == 1:
         return [{**segs[0], "text": full_text}]
     sentences = _split_sentences(full_text)
     if len(sentences) <= 1:
         is_cjk = len(full_text.split()) <= 1
         sentences = list(full_text) if is_cjk else full_text.split()
     is_cjk = len(full_text.split()) <= 1 and len(full_text) > 1
     sep = "" if is_cjk else " "
     n = len(sentences)
     result_texts: list[list[str]] = [[] for _ in segs]
     cumulative = 0.0
     for i, seg in enumerate(segs):
         cumulative += (seg["end"] - seg["start"]) / total_dur
         threshold = cumulative * n
         while len(result_texts[i]) + sum(len(t) for t in result_texts[:i]) < round(threshold):
             idx = sum(len(t) for t in result_texts)
             if idx >= n:
                 break
             result_texts[i].append(sentences[idx])
     assigned = sum(len(t) for t in result_texts)
     result_texts[-1].extend(sentences[assigned:])
     return [{**seg, "text": sep.join(texts)} for seg, texts in zip(segs, result_texts)]
+# ─── Emotion parsing from evoxtral expression tags ─────────────────────────────
+# Maps inline tags like [laughs], [sighs] → (emotion label, valence, arousal)
# Keyword found inside an inline tag such as [laughs] or [sighs]
# → (emotion label, valence, arousal).
_TAG_EMOTIONS: dict[str, tuple[str, float, float]] = {
    "laughs":    ("Happy",     0.70,  0.60),
    "laughing":  ("Happy",     0.70,  0.60),
    "chuckles":  ("Happy",     0.50,  0.30),
    "giggles":   ("Happy",     0.60,  0.40),
    "sighs":     ("Sad",      -0.30, -0.30),
    "sighing":   ("Sad",      -0.30, -0.30),
    "cries":     ("Sad",      -0.70,  0.40),
    "crying":    ("Sad",      -0.70,  0.40),
    "whispers":  ("Calm",      0.10, -0.50),
    "whispering":("Calm",      0.10, -0.50),
    "shouts":    ("Angry",    -0.50,  0.80),
    "shouting":  ("Angry",    -0.50,  0.80),
    "exclaims":  ("Excited",   0.50,  0.70),
    "gasps":     ("Surprised", 0.20,  0.70),
    "hesitates": ("Anxious",  -0.20,  0.30),
    "stutters":  ("Anxious",  -0.20,  0.40),
    "mumbles":   ("Sad",      -0.20, -0.30),
    "claps":     ("Happy",     0.60,  0.50),
    "applause":  ("Happy",     0.60,  0.50),
}

# Bracketed inline tags: "[laughs softly]" captures "laughs softly".
_TAG_PATTERN = re.compile(r'\[([^\]]+)\]')

# Single alternation over every known keyword. re.search returns the leftmost
# match, so the keyword that appears first inside a tag wins regardless of the
# table's insertion order. Longer keywords are listed first so that a longer
# keyword is preferred if two ever start at the same position.
_TAG_KEYWORD_PATTERN = re.compile(
    "|".join(re.escape(key) for key in sorted(_TAG_EMOTIONS, key=len, reverse=True))
)


def _parse_emotion(text: str) -> dict:
    """Extract the first recognized expression tag from text like [sighs] or [laughs].

    Tags are scanned left to right. Within a tag, the keyword that occurs
    earliest decides the emotion, so "[laughs softly]" maps to "laughs" and
    "[sighs, then laughs]" maps to "sighs". Matching is case-insensitive and
    substring-based.

    Args:
        text: Transcription text, possibly containing inline [tags]. Empty or
            None is treated as containing no tags.

    Returns:
        {"emotion": str, "valence": float, "arousal": float}; Neutral (0, 0)
        when no known keyword is found.
    """
    if text:
        for tag in _TAG_PATTERN.findall(text.lower()):
            hit = _TAG_KEYWORD_PATTERN.search(tag)
            if hit is not None:
                label, valence, arousal = _TAG_EMOTIONS[hit.group(0)]
                return {"emotion": label, "valence": valence, "arousal": arousal}
    return {"emotion": "Neutral", "valence": 0.0, "arousal": 0.0}
 # ─── Endpoints ─────────────────────────────────────────────────────────────────
 @app.post("/transcribe")
 async def transcribe(audio: UploadFile = File(...)):
+    """Upload audio → plain transcription (with inline expression tags)."""
     req_start = time.perf_counter()
     req_id = f"transcribe-{int(req_start * 1000)}"
     filename = audio.filename or "audio.wav"
+    print(f"[voxtral] {req_id} POST /transcribe filename={filename}")
     try:
         contents = await audio.read()
     except Exception as e:
         raise HTTPException(status_code=400, detail=f"Failed to read file: {e}")
     _validate_upload(contents)
+    result = await _call_evoxtral(contents, filename)
+    text = result.get("transcription", "")
+    lang = result.get("language")
     total_ms = (time.perf_counter() - req_start) * 1000
     print(f"[voxtral] {req_id} done total={total_ms:.0f}ms text_len={len(text)}")
+    return {"text": text, "words": [], "languageCode": lang}
 @app.post("/transcribe-diarize")
+async def transcribe_diarize(audio: UploadFile = File(...)):
     """
+    Upload audio → transcription + VAD segmentation + per-segment emotion.
+    Transcription is produced by the external evoxtral API (includes expressive tags).
+    Emotion is parsed from inline tags like [sighs], [laughs], etc.
     All segments are labelled SPEAKER_00 (single-speaker mode).
     """
     req_start = time.perf_counter()
         contents = await audio.read()
     except Exception as e:
         raise HTTPException(status_code=400, detail=f"Failed to read file: {e}")
     _validate_upload(contents)
     suffix = os.path.splitext(filename)[1].lower() or ".wav"
     if suffix not in (".wav", ".mp3", ".flac", ".ogg", ".m4a", ".webm"):
         suffix = ".wav"
+    # ── Step 1: call external evoxtral API ──────────────────────────────────
+    t0 = time.perf_counter()
+    result = await _call_evoxtral(contents, filename)
+    full_text = result.get("transcription", "")
+    print(f"[voxtral] {req_id} evoxtral API done {(time.perf_counter()-t0)*1000:.0f}ms text_len={len(full_text)}")
+    # ── Step 2: load audio for VAD segmentation ──────────────────────────────
     with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
         tmp.write(contents)
         tmp_path = tmp.name
     try:
         t0 = time.perf_counter()
+        audio_array = _load_audio(tmp_path, TARGET_SR)
         print(f"[voxtral] {req_id} load_audio done shape={audio_array.shape} in {(time.perf_counter()-t0)*1000:.0f}ms")
     except Exception as e:
         raise HTTPException(status_code=400, detail=f"Cannot decode audio: {e}")
             except OSError:
                 pass
+    duration = round(len(audio_array) / TARGET_SR, 3)
+    # ── Step 3: VAD sentence segmentation ───────────────────────────────────
     t0 = time.perf_counter()
+    raw_segs, seg_method = _segments_from_vad(audio_array, TARGET_SR)
+    print(f"[voxtral] {req_id} segmentation done {(time.perf_counter()-t0)*1000:.0f}ms segs={len(raw_segs)}")
+    # ── Step 4: distribute text to segments ─────────────────────────────────
     segs_with_text = _distribute_text(full_text, raw_segs)
+    # ── Step 5: parse emotion from expression tags ──────────────────────────
     segments = []
     for i, s in enumerate(segs_with_text):
+        emo = _parse_emotion(s["text"])
         segments.append({
             "id":      i + 1,
             "speaker": s["speaker"],
             "valence": emo["valence"],
             "arousal": emo["arousal"],
         })
     total_ms = (time.perf_counter() - req_start) * 1000
     print(f"[voxtral] {req_id} complete total={total_ms:.0f}ms segments={len(segments)}")

model/voxtral-server/requirements.txt CHANGED Viewed

@@ -1,16 +1,8 @@
-# Voxtral-Mini-3B-2507 + LoRA adapter (YongkangZOU/evoxtral-lora)
 fastapi>=0.115.0
 uvicorn[standard]>=0.32.0
 python-multipart>=0.0.9
-transformers==4.54.0
-peft>=0.18.0
-torch>=2.0.0
-accelerate>=0.33.0
-mistral-common[audio]>=1.5.0
 librosa>=0.10.0
 soundfile>=0.12.0
 numpy>=1.24.0
-scikit-learn>=1.3.0
-# Optional: production-grade speaker diarization (requires HF_TOKEN env var + model license acceptance)
-# pip install pyannote.audio>=3.1.0
-# Then: export HF_TOKEN=your_token

+# Evoxtral API proxy — calls external Modal API for inference (no local model)
 fastapi>=0.115.0
 uvicorn[standard]>=0.32.0
 python-multipart>=0.0.9
+httpx>=0.27.0
 librosa>=0.10.0
 soundfile>=0.12.0
 numpy>=1.24.0