Spaces:
Running
Running
feat: switch to external evoxtral Modal API (no local model)
Browse files- Replace local VoxtralForConditionalGeneration+PEFT inference with
HTTP calls to https://yongkang-zou1999--evoxtral-api-evoxtralmodel-web.modal.run
- Remove torch, transformers, peft, accelerate, mistral-common from requirements.txt
- Add httpx for async HTTP client
- Parse inline expression tags ([laughs], [sighs], etc.) from transcription
to derive emotion/valence/arousal per segment
- Remove model weight caching from Dockerfile (no local weights needed)
- Server startup is now instant (no model loading)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- Dockerfile +0 -6
- model/voxtral-server/main.py +117 -335
- model/voxtral-server/requirements.txt +2 -10
Dockerfile
CHANGED
|
@@ -43,12 +43,6 @@ RUN cd demo && NEXT_PUBLIC_API_URL="" npm run build \
|
|
| 43 |
COPY nginx.conf /etc/nginx/nginx.conf
|
| 44 |
COPY supervisord.conf /etc/supervisor/conf.d/app.conf
|
| 45 |
|
| 46 |
-
# βββ Model weight cache βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 47 |
-
# /data is persisted across Space restarts on HuggingFace Spaces
|
| 48 |
-
RUN mkdir -p /data/models
|
| 49 |
-
ENV TRANSFORMERS_CACHE=/data/models
|
| 50 |
-
ENV HF_HOME=/data/models
|
| 51 |
-
|
| 52 |
# HuggingFace Spaces public port
|
| 53 |
EXPOSE 7860
|
| 54 |
|
|
|
|
| 43 |
COPY nginx.conf /etc/nginx/nginx.conf
|
| 44 |
COPY supervisord.conf /etc/supervisor/conf.d/app.conf
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
# HuggingFace Spaces public port
|
| 47 |
EXPOSE 7860
|
| 48 |
|
model/voxtral-server/main.py
CHANGED
|
@@ -1,180 +1,99 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
|
|
|
|
| 4 |
"""
|
| 5 |
import os
|
|
|
|
| 6 |
import shutil
|
| 7 |
import subprocess
|
| 8 |
import tempfile
|
| 9 |
import time
|
| 10 |
from contextlib import asynccontextmanager
|
| 11 |
-
from typing import Optional
|
| 12 |
|
| 13 |
-
import
|
| 14 |
-
import numpy as np
|
| 15 |
import librosa
|
| 16 |
-
import
|
| 17 |
-
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 18 |
from fastapi.middleware.cors import CORSMiddleware
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
| 22 |
MAX_UPLOAD_BYTES = int(os.environ.get("MAX_UPLOAD_MB", "100")) * 1024 * 1024
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
processor = None
|
| 26 |
-
model = None
|
| 27 |
-
|
| 28 |
-
# Optional: pyannote pipeline (loaded lazily on first diarize request if HF_TOKEN is set)
|
| 29 |
-
_pyannote_pipeline = None
|
| 30 |
-
_pyannote_loaded = False
|
| 31 |
-
_pyannote_available = False
|
| 32 |
-
|
| 33 |
-
try:
|
| 34 |
-
from pyannote.audio import Pipeline as _PyannotePipeline
|
| 35 |
-
_pyannote_available = True
|
| 36 |
-
except ImportError:
|
| 37 |
-
pass
|
| 38 |
|
| 39 |
|
| 40 |
def _check_ffmpeg():
|
| 41 |
-
"""Check ffmpeg is available at startup; raise with clear message if not."""
|
| 42 |
if shutil.which("ffmpeg") is None:
|
| 43 |
raise RuntimeError(
|
| 44 |
-
"ffmpeg not found. WebM
|
| 45 |
" macOS: brew install ffmpeg\n"
|
| 46 |
-
" Ubuntu: sudo apt install ffmpeg
|
| 47 |
-
" Windows: https://ffmpeg.org/download.html\n"
|
| 48 |
-
"Then restart this service."
|
| 49 |
-
)
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
def _get_pyannote_pipeline():
|
| 53 |
-
"""Lazy-load pyannote pipeline (requires HF_TOKEN and pyannote.audio installed)."""
|
| 54 |
-
global _pyannote_pipeline, _pyannote_loaded
|
| 55 |
-
if _pyannote_loaded:
|
| 56 |
-
return _pyannote_pipeline
|
| 57 |
-
_pyannote_loaded = True
|
| 58 |
-
if not _pyannote_available or not HF_TOKEN:
|
| 59 |
-
print("[voxtral] pyannote: not available (install pyannote.audio and set HF_TOKEN for real diarization; using VAD+MFCC fallback)")
|
| 60 |
-
return None
|
| 61 |
-
try:
|
| 62 |
-
pipeline = _PyannotePipeline.from_pretrained(
|
| 63 |
-
"pyannote/speaker-diarization-3.1",
|
| 64 |
-
use_auth_token=HF_TOKEN,
|
| 65 |
)
|
| 66 |
-
if torch.cuda.is_available():
|
| 67 |
-
pipeline = pipeline.to(torch.device("cuda"))
|
| 68 |
-
elif torch.backends.mps.is_available():
|
| 69 |
-
pipeline = pipeline.to(torch.device("mps"))
|
| 70 |
-
_pyannote_pipeline = pipeline
|
| 71 |
-
print("[voxtral] pyannote speaker-diarization-3.1 loaded")
|
| 72 |
-
except Exception as e:
|
| 73 |
-
print(f"[voxtral] pyannote load failed: {e} β using VAD+MFCC fallback")
|
| 74 |
-
return _pyannote_pipeline
|
| 75 |
|
| 76 |
|
| 77 |
@asynccontextmanager
|
| 78 |
async def lifespan(app: FastAPI):
|
| 79 |
-
"""On startup: check deps and load model."""
|
| 80 |
-
global processor, model
|
| 81 |
-
|
| 82 |
_check_ffmpeg()
|
| 83 |
print(f"[voxtral] ffmpeg: {shutil.which('ffmpeg')}")
|
| 84 |
-
|
| 85 |
-
if torch.cuda.is_available():
|
| 86 |
-
_device = torch.device("cuda")
|
| 87 |
-
_dtype = torch.bfloat16
|
| 88 |
-
elif torch.backends.mps.is_available():
|
| 89 |
-
_device = torch.device("mps")
|
| 90 |
-
_dtype = torch.float16 # MPS does not support bfloat16
|
| 91 |
-
else:
|
| 92 |
-
_device = torch.device("cpu")
|
| 93 |
-
_dtype = torch.bfloat16 # halves memory vs float32 (8 GB vs 16 GB); supported on modern x86
|
| 94 |
-
print(f"[voxtral] Device: {_device} dtype: {_dtype}")
|
| 95 |
-
|
| 96 |
-
print(f"[voxtral] Loading base model: {BASE_MODEL_ID} ...")
|
| 97 |
-
print(f"[voxtral] Applying LoRA adapter: {REPO_ID} ...")
|
| 98 |
try:
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)
|
| 103 |
-
base = VoxtralForConditionalGeneration.from_pretrained(
|
| 104 |
-
BASE_MODEL_ID, torch_dtype=_dtype
|
| 105 |
-
).to(_device)
|
| 106 |
-
model = PeftModel.from_pretrained(base, REPO_ID)
|
| 107 |
-
model.eval()
|
| 108 |
-
print(f"[voxtral] Model ready: {BASE_MODEL_ID} + LoRA {REPO_ID} on {_device}")
|
| 109 |
except Exception as e:
|
| 110 |
-
|
| 111 |
-
f"Model load failed: {e}\n"
|
| 112 |
-
"Ensure deps are installed: pip install -r requirements.txt\n"
|
| 113 |
-
"And sufficient VRAM (recommended β₯16GB) or use CPU (slower)."
|
| 114 |
-
) from e
|
| 115 |
-
|
| 116 |
-
# Warm-up: run one silent dummy inference to pre-compile MPS Metal shaders.
|
| 117 |
-
print("[voxtral] Warming up (dummy inference)...")
|
| 118 |
-
try:
|
| 119 |
-
sr = getattr(getattr(processor, "feature_extractor", None), "sampling_rate", 16000)
|
| 120 |
-
dummy = np.zeros(sr, dtype=np.float32) # 1 second of silence
|
| 121 |
-
with torch.inference_mode():
|
| 122 |
-
dummy_inputs = processor(dummy, return_tensors="pt")
|
| 123 |
-
dummy_inputs = {
|
| 124 |
-
k: (v.to(_device, dtype=_dtype) if v.is_floating_point() else v.to(_device))
|
| 125 |
-
for k, v in dummy_inputs.items()
|
| 126 |
-
}
|
| 127 |
-
model.generate(**dummy_inputs, max_new_tokens=1)
|
| 128 |
-
print("[voxtral] Warm-up complete β first request will be fast")
|
| 129 |
-
except Exception as e:
|
| 130 |
-
print(f"[voxtral] Warm-up skipped: {e}")
|
| 131 |
-
|
| 132 |
yield
|
| 133 |
|
| 134 |
|
| 135 |
-
app = FastAPI(title="
|
| 136 |
|
| 137 |
app.add_middleware(
|
| 138 |
CORSMiddleware,
|
| 139 |
-
allow_origins=[
|
| 140 |
-
"http://localhost:3000",
|
| 141 |
-
"http://127.0.0.1:3000",
|
| 142 |
-
],
|
| 143 |
allow_methods=["GET", "POST", "OPTIONS"],
|
| 144 |
allow_headers=["*"],
|
| 145 |
)
|
| 146 |
|
| 147 |
|
| 148 |
-
@app.get("/debug-inference")
|
| 149 |
-
async def debug_inference():
|
| 150 |
-
"""Run a 1-second silent inference and return full result or traceback."""
|
| 151 |
-
import traceback as tb
|
| 152 |
-
try:
|
| 153 |
-
dummy = np.zeros(16000, dtype=np.float32)
|
| 154 |
-
text = _transcribe(dummy)
|
| 155 |
-
return {"status": "ok", "text": text}
|
| 156 |
-
except Exception as e:
|
| 157 |
-
return {"status": "error", "error": str(e), "traceback": tb.format_exc()}
|
| 158 |
-
|
| 159 |
-
|
| 160 |
@app.get("/health")
|
| 161 |
async def health():
|
| 162 |
-
"""Health check: service and dependency status."""
|
| 163 |
return {
|
| 164 |
"status": "ok",
|
| 165 |
-
"model":
|
| 166 |
-
"model_loaded":
|
| 167 |
"ffmpeg": shutil.which("ffmpeg") is not None,
|
| 168 |
-
"pyannote_available":
|
| 169 |
-
"hf_token_set":
|
| 170 |
"max_upload_mb": MAX_UPLOAD_BYTES // 1024 // 1024,
|
|
|
|
| 171 |
}
|
| 172 |
|
| 173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
# βββ Audio helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 175 |
|
| 176 |
def _convert_to_wav_ffmpeg(path: str, target_sr: int) -> str:
|
| 177 |
-
"""Convert any format to 16kHz mono WAV with ffmpeg; return path to new file."""
|
| 178 |
out = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
| 179 |
out.close()
|
| 180 |
rc = subprocess.run(
|
|
@@ -188,14 +107,11 @@ def _convert_to_wav_ffmpeg(path: str, target_sr: int) -> str:
|
|
| 188 |
)
|
| 189 |
if rc.returncode != 0:
|
| 190 |
os.unlink(out.name)
|
| 191 |
-
raise RuntimeError(
|
| 192 |
-
f"ffmpeg failed: {rc.stderr.decode(errors='replace')[:500]}"
|
| 193 |
-
)
|
| 194 |
return out.name
|
| 195 |
|
| 196 |
|
| 197 |
-
def
|
| 198 |
-
"""Load audio to mono float32 and resample to target_sr."""
|
| 199 |
lower = file_path.lower()
|
| 200 |
if lower.endswith((".webm", ".opus", ".m4a", ".ogg")):
|
| 201 |
wav_path = _convert_to_wav_ffmpeg(file_path, target_sr)
|
|
@@ -205,16 +121,13 @@ def load_audio_to_array(file_path: str, target_sr: int) -> np.ndarray:
|
|
| 205 |
finally:
|
| 206 |
if os.path.exists(wav_path):
|
| 207 |
os.unlink(wav_path)
|
| 208 |
-
|
| 209 |
try:
|
| 210 |
y, _ = librosa.load(file_path, sr=target_sr, mono=True)
|
| 211 |
return y.astype(np.float32)
|
| 212 |
except Exception as e:
|
| 213 |
-
if not os.path.isfile(file_path):
|
| 214 |
-
raise
|
| 215 |
need_ffmpeg = (
|
| 216 |
"format not recognised" in str(e).lower()
|
| 217 |
-
or "nobackenderror" in
|
| 218 |
)
|
| 219 |
if need_ffmpeg:
|
| 220 |
wav_path = _convert_to_wav_ffmpeg(file_path, target_sr)
|
|
@@ -228,9 +141,11 @@ def load_audio_to_array(file_path: str, target_sr: int) -> np.ndarray:
|
|
| 228 |
|
| 229 |
|
| 230 |
def _validate_upload(contents: bytes) -> None:
|
| 231 |
-
"""Validate upload: non-empty and within size limit."""
|
| 232 |
if len(contents) == 0:
|
| 233 |
-
raise HTTPException(
|
|
|
|
|
|
|
|
|
|
| 234 |
if len(contents) > MAX_UPLOAD_BYTES:
|
| 235 |
mb = len(contents) / 1024 / 1024
|
| 236 |
limit_mb = MAX_UPLOAD_BYTES // 1024 // 1024
|
|
@@ -243,29 +158,20 @@ def _validate_upload(contents: bytes) -> None:
|
|
| 243 |
# βββ Segmentation helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 244 |
|
| 245 |
def _vad_segment(audio: np.ndarray, sr: int) -> list[tuple[int, int]]:
|
| 246 |
-
"""Split audio into speech segments by silence detection.
|
| 247 |
-
Merges gaps < 0.5 s (intra-phrase pauses) and drops segments < 0.3 s.
|
| 248 |
-
Returns list of (start_sample, end_sample).
|
| 249 |
-
"""
|
| 250 |
intervals = librosa.effects.split(audio, top_db=28, frame_length=2048, hop_length=512)
|
| 251 |
if len(intervals) == 0:
|
| 252 |
return [(0, len(audio))]
|
| 253 |
-
|
| 254 |
merged: list[list[int]] = [[int(intervals[0][0]), int(intervals[0][1])]]
|
| 255 |
for s, e in intervals[1:]:
|
| 256 |
if (int(s) - merged[-1][1]) / sr < 0.3:
|
| 257 |
merged[-1][1] = int(e)
|
| 258 |
else:
|
| 259 |
merged.append([int(s), int(e)])
|
| 260 |
-
|
| 261 |
result = [(s, e) for s, e in merged if (e - s) / sr >= 0.3]
|
| 262 |
return result if result else [(0, len(audio))]
|
| 263 |
|
| 264 |
|
| 265 |
def _segments_from_vad(audio: np.ndarray, sr: int) -> tuple[list[dict], str]:
|
| 266 |
-
"""Segment audio by silence, assign all segments to SPEAKER_00.
|
| 267 |
-
Returns (segments, method_name).
|
| 268 |
-
"""
|
| 269 |
intervals = _vad_segment(audio, sr)
|
| 270 |
segs = [
|
| 271 |
{"speaker": "SPEAKER_00", "start": round(s / sr, 3), "end": round(e / sr, 3)}
|
|
@@ -276,25 +182,17 @@ def _segments_from_vad(audio: np.ndarray, sr: int) -> tuple[list[dict], str]:
|
|
| 276 |
|
| 277 |
|
| 278 |
def _split_sentences(text: str) -> list[str]:
|
| 279 |
-
"""Split text into sentences at punctuation boundaries (CJK + Latin)."""
|
| 280 |
-
import re
|
| 281 |
parts = re.split(r'(?<=[οΌοΌγ?!])\s*', text)
|
| 282 |
return [p for p in parts if p.strip()]
|
| 283 |
|
| 284 |
|
| 285 |
def _distribute_text(full_text: str, segs: list[dict]) -> list[dict]:
|
| 286 |
-
"""Assign complete sentences to segments by time proportion.
|
| 287 |
-
Sentences are never split mid-punctuation; each segment gets whole sentences.
|
| 288 |
-
Falls back to character-level splitting if no sentence boundaries found.
|
| 289 |
-
"""
|
| 290 |
if not full_text or not segs:
|
| 291 |
return [{**s, "text": ""} for s in segs]
|
| 292 |
-
|
| 293 |
if len(segs) == 1:
|
| 294 |
return [{**segs[0], "text": full_text}]
|
| 295 |
|
| 296 |
sentences = _split_sentences(full_text)
|
| 297 |
-
# Fallback: split by character if no sentence boundaries
|
| 298 |
if len(sentences) <= 1:
|
| 299 |
is_cjk = len(full_text.split()) <= 1
|
| 300 |
sentences = list(full_text) if is_cjk else full_text.split()
|
|
@@ -305,206 +203,97 @@ def _distribute_text(full_text: str, segs: list[dict]) -> list[dict]:
|
|
| 305 |
|
| 306 |
is_cjk = len(full_text.split()) <= 1 and len(full_text) > 1
|
| 307 |
sep = "" if is_cjk else " "
|
| 308 |
-
|
| 309 |
-
# Assign each sentence to the segment whose cumulative time covers its proportional position
|
| 310 |
n = len(sentences)
|
| 311 |
result_texts: list[list[str]] = [[] for _ in segs]
|
| 312 |
-
|
| 313 |
cumulative = 0.0
|
| 314 |
for i, seg in enumerate(segs):
|
| 315 |
cumulative += (seg["end"] - seg["start"]) / total_dur
|
| 316 |
-
# Assign sentences whose proportional position falls within this segment's cumulative range
|
| 317 |
threshold = cumulative * n
|
| 318 |
while len(result_texts[i]) + sum(len(t) for t in result_texts[:i]) < round(threshold):
|
| 319 |
idx = sum(len(t) for t in result_texts)
|
| 320 |
if idx >= n:
|
| 321 |
break
|
| 322 |
result_texts[i].append(sentences[idx])
|
| 323 |
-
|
| 324 |
-
# Ensure any leftover sentences go to the last segment
|
| 325 |
assigned = sum(len(t) for t in result_texts)
|
| 326 |
result_texts[-1].extend(sentences[assigned:])
|
| 327 |
-
|
| 328 |
return [{**seg, "text": sep.join(texts)} for seg, texts in zip(segs, result_texts)]
|
| 329 |
|
| 330 |
|
| 331 |
-
# βββ Emotion
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
"""
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
pitch_std = float(voiced.std()) if len(voiced) > 0 else 0.0
|
| 374 |
-
|
| 375 |
-
# ββ Spectral features ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 376 |
-
spec_centroid = float(librosa.feature.spectral_centroid(y=chunk, sr=sr).mean())
|
| 377 |
-
zcr = float(librosa.feature.zero_crossing_rate(chunk).mean())
|
| 378 |
-
|
| 379 |
-
# ββ Arousal (0..1 before rescaling) βββββββββββββββββββββββββββββββββ
|
| 380 |
-
rms_n = min(rms / 0.08, 1.0) # typical speech RMS
|
| 381 |
-
pitch_n = max(0.0, min((pitch_mean - 80) / 320, 1.0)) # 80β400 Hz
|
| 382 |
-
zcr_n = min(zcr / 0.12, 1.0)
|
| 383 |
-
arousal_01 = 0.5 * rms_n + 0.35 * pitch_n + 0.15 * zcr_n
|
| 384 |
-
arousal = round(arousal_01 * 2 - 1, 3) # β -1..1
|
| 385 |
-
|
| 386 |
-
# ββ Valence (0..1 before rescaling) βββββββββββββββββββββββββββββββββ
|
| 387 |
-
spec_n = min(spec_centroid / 3500, 1.0) # brighter = warmer
|
| 388 |
-
pitch_var_n = min(pitch_std / 60, 1.0) # melodic variety
|
| 389 |
-
valence_01 = 0.55 * spec_n + 0.45 * pitch_var_n
|
| 390 |
-
valence = round(valence_01 * 2 - 1, 3) # β -1..1
|
| 391 |
-
|
| 392 |
-
emotion = _emotion_label(valence, arousal)
|
| 393 |
-
return {"emotion": emotion, "valence": valence, "arousal": arousal}
|
| 394 |
-
|
| 395 |
-
except Exception as e:
|
| 396 |
-
print(f"[voxtral] _analyze_emotion failed: {e}")
|
| 397 |
-
return {"emotion": "Neutral", "valence": 0.0, "arousal": 0.0}
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
# βββ Inference helper ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 401 |
-
|
| 402 |
-
def _transcribe(audio_array: np.ndarray) -> str:
|
| 403 |
-
"""Run Voxtral-3B + LoRA inference via chat template; return transcribed text."""
|
| 404 |
-
import traceback
|
| 405 |
-
audio_sec = round(len(audio_array) / 16000, 2)
|
| 406 |
-
model_dtype = next(model.parameters()).dtype
|
| 407 |
-
print(f"[_transcribe] START audio={audio_sec}s device={model.device} dtype={model_dtype}", flush=True)
|
| 408 |
-
|
| 409 |
-
try:
|
| 410 |
-
t0 = time.perf_counter()
|
| 411 |
-
inputs = processor(audio_array, return_tensors="pt")
|
| 412 |
-
print(f"[_transcribe] processor() OK {(time.perf_counter()-t0)*1000:.0f}ms keys={list(inputs.keys())}", flush=True)
|
| 413 |
-
except Exception:
|
| 414 |
-
print(f"[_transcribe] processor() FAILED:\n{traceback.format_exc()}", flush=True)
|
| 415 |
-
raise
|
| 416 |
-
|
| 417 |
-
try:
|
| 418 |
-
t0 = time.perf_counter()
|
| 419 |
-
# move to device; cast floating tensors to model dtype to avoid dtype mismatch
|
| 420 |
-
inputs = {
|
| 421 |
-
k: (v.to(model.device, dtype=model_dtype) if v.is_floating_point() else v.to(model.device))
|
| 422 |
-
for k, v in inputs.items()
|
| 423 |
-
}
|
| 424 |
-
input_len = inputs["input_ids"].shape[1]
|
| 425 |
-
print(f"[_transcribe] to(device) OK {(time.perf_counter()-t0)*1000:.0f}ms input_len={input_len}", flush=True)
|
| 426 |
-
except Exception:
|
| 427 |
-
print(f"[_transcribe] to(device) FAILED:\n{traceback.format_exc()}", flush=True)
|
| 428 |
-
raise
|
| 429 |
-
|
| 430 |
-
try:
|
| 431 |
-
t0 = time.perf_counter()
|
| 432 |
-
print(f"[_transcribe] calling model.generate ...", flush=True)
|
| 433 |
-
with torch.inference_mode():
|
| 434 |
-
outputs = model.generate(**inputs, max_new_tokens=1024)
|
| 435 |
-
new_tokens = outputs.shape[1] - input_len
|
| 436 |
-
print(f"[_transcribe] model.generate OK {(time.perf_counter()-t0)*1000:.0f}ms new_tokens={new_tokens}", flush=True)
|
| 437 |
-
except Exception:
|
| 438 |
-
print(f"[_transcribe] model.generate FAILED:\n{traceback.format_exc()}", flush=True)
|
| 439 |
-
raise
|
| 440 |
-
|
| 441 |
-
try:
|
| 442 |
-
# For direct processor() call, decode full output (no input prefix to strip)
|
| 443 |
-
text = processor.decode(outputs[0], skip_special_tokens=True).strip()
|
| 444 |
-
print(f"[_transcribe] decode OK (full) text={repr(text[:200])}", flush=True)
|
| 445 |
-
# Also log the new-tokens-only version for comparison
|
| 446 |
-
if input_len > 0 and outputs.shape[1] > input_len:
|
| 447 |
-
new_only = processor.decode(outputs[0][input_len:], skip_special_tokens=True).strip()
|
| 448 |
-
print(f"[_transcribe] decode new-only text={repr(new_only[:200])}", flush=True)
|
| 449 |
-
return text
|
| 450 |
-
except Exception:
|
| 451 |
-
print(f"[_transcribe] decode FAILED:\n{traceback.format_exc()}", flush=True)
|
| 452 |
-
raise
|
| 453 |
|
| 454 |
|
| 455 |
# βββ Endpoints βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 456 |
|
| 457 |
@app.post("/transcribe")
|
| 458 |
async def transcribe(audio: UploadFile = File(...)):
|
| 459 |
-
"""
|
| 460 |
-
Upload an audio file; return full transcription (offline, single response).
|
| 461 |
-
Supported: wav, mp3, flac, ogg, m4a, webm
|
| 462 |
-
"""
|
| 463 |
req_start = time.perf_counter()
|
| 464 |
req_id = f"transcribe-{int(req_start * 1000)}"
|
| 465 |
filename = audio.filename or "audio.wav"
|
| 466 |
-
print(f"[voxtral] {req_id} POST /transcribe
|
| 467 |
|
| 468 |
try:
|
| 469 |
contents = await audio.read()
|
| 470 |
except Exception as e:
|
| 471 |
raise HTTPException(status_code=400, detail=f"Failed to read file: {e}")
|
| 472 |
-
|
| 473 |
_validate_upload(contents)
|
| 474 |
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
target_sr = getattr(getattr(processor, "feature_extractor", None), "sampling_rate", 16000)
|
| 480 |
-
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
|
| 481 |
-
tmp.write(contents)
|
| 482 |
-
tmp_path = tmp.name
|
| 483 |
-
|
| 484 |
-
try:
|
| 485 |
-
audio_array = load_audio_to_array(tmp_path, target_sr)
|
| 486 |
-
except Exception as e:
|
| 487 |
-
raise HTTPException(status_code=400, detail=f"Cannot decode audio: {e}")
|
| 488 |
-
finally:
|
| 489 |
-
if os.path.exists(tmp_path):
|
| 490 |
-
try:
|
| 491 |
-
os.unlink(tmp_path)
|
| 492 |
-
except OSError:
|
| 493 |
-
pass
|
| 494 |
|
| 495 |
-
text = _transcribe(audio_array)
|
| 496 |
total_ms = (time.perf_counter() - req_start) * 1000
|
| 497 |
print(f"[voxtral] {req_id} done total={total_ms:.0f}ms text_len={len(text)}")
|
| 498 |
-
return {"text": text, "words": [], "languageCode":
|
| 499 |
|
| 500 |
|
| 501 |
@app.post("/transcribe-diarize")
|
| 502 |
-
async def transcribe_diarize(
|
| 503 |
-
audio: UploadFile = File(...),
|
| 504 |
-
):
|
| 505 |
"""
|
| 506 |
-
Upload audio β transcription + VAD
|
| 507 |
-
|
|
|
|
| 508 |
All segments are labelled SPEAKER_00 (single-speaker mode).
|
| 509 |
"""
|
| 510 |
req_start = time.perf_counter()
|
|
@@ -516,22 +305,25 @@ async def transcribe_diarize(
|
|
| 516 |
contents = await audio.read()
|
| 517 |
except Exception as e:
|
| 518 |
raise HTTPException(status_code=400, detail=f"Failed to read file: {e}")
|
| 519 |
-
|
| 520 |
_validate_upload(contents)
|
| 521 |
|
| 522 |
suffix = os.path.splitext(filename)[1].lower() or ".wav"
|
| 523 |
if suffix not in (".wav", ".mp3", ".flac", ".ogg", ".m4a", ".webm"):
|
| 524 |
suffix = ".wav"
|
| 525 |
|
| 526 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
|
|
|
|
| 528 |
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
|
| 529 |
tmp.write(contents)
|
| 530 |
tmp_path = tmp.name
|
| 531 |
-
|
| 532 |
try:
|
| 533 |
t0 = time.perf_counter()
|
| 534 |
-
audio_array =
|
| 535 |
print(f"[voxtral] {req_id} load_audio done shape={audio_array.shape} in {(time.perf_counter()-t0)*1000:.0f}ms")
|
| 536 |
except Exception as e:
|
| 537 |
raise HTTPException(status_code=400, detail=f"Cannot decode audio: {e}")
|
|
@@ -542,29 +334,20 @@ async def transcribe_diarize(
|
|
| 542 |
except OSError:
|
| 543 |
pass
|
| 544 |
|
| 545 |
-
duration = round(len(audio_array) /
|
| 546 |
|
| 547 |
-
# ββ Step
|
| 548 |
t0 = time.perf_counter()
|
| 549 |
-
|
| 550 |
-
print(f"[voxtral] {req_id}
|
| 551 |
|
| 552 |
-
# ββ Step
|
| 553 |
-
t0 = time.perf_counter()
|
| 554 |
-
raw_segs, seg_method = _segments_from_vad(audio_array, target_sr)
|
| 555 |
-
print(f"[voxtral] {req_id} segmentation done in {(time.perf_counter()-t0)*1000:.0f}ms segs={len(raw_segs)}")
|
| 556 |
-
|
| 557 |
-
# ββ Step 3: distribute text proportionally ββββββββββββββββββββββββββββββ
|
| 558 |
segs_with_text = _distribute_text(full_text, raw_segs)
|
| 559 |
|
| 560 |
-
# ββ Step
|
| 561 |
-
t0 = time.perf_counter()
|
| 562 |
segments = []
|
| 563 |
for i, s in enumerate(segs_with_text):
|
| 564 |
-
|
| 565 |
-
end_sample = int(s["end"] * target_sr)
|
| 566 |
-
chunk = audio_array[start_sample:end_sample]
|
| 567 |
-
emo = _analyze_emotion(chunk, target_sr)
|
| 568 |
segments.append({
|
| 569 |
"id": i + 1,
|
| 570 |
"speaker": s["speaker"],
|
|
@@ -575,7 +358,6 @@ async def transcribe_diarize(
|
|
| 575 |
"valence": emo["valence"],
|
| 576 |
"arousal": emo["arousal"],
|
| 577 |
})
|
| 578 |
-
print(f"[voxtral] {req_id} emotion analysis done in {(time.perf_counter()-t0)*1000:.0f}ms")
|
| 579 |
|
| 580 |
total_ms = (time.perf_counter() - req_start) * 1000
|
| 581 |
print(f"[voxtral] {req_id} complete total={total_ms:.0f}ms segments={len(segments)}")
|
|
|
|
| 1 |
"""
|
| 2 |
+
Evoxtral speech-to-text API proxy (Model layer).
|
| 3 |
+
Forwards audio to the external Modal evoxtral API, then adds
|
| 4 |
+
VAD segmentation and emotion parsing from inline expression tags.
|
| 5 |
"""
|
| 6 |
import os
|
| 7 |
+
import re
|
| 8 |
import shutil
|
| 9 |
import subprocess
|
| 10 |
import tempfile
|
| 11 |
import time
|
| 12 |
from contextlib import asynccontextmanager
|
|
|
|
| 13 |
|
| 14 |
+
import httpx
|
|
|
|
| 15 |
import librosa
|
| 16 |
+
import numpy as np
|
| 17 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 18 |
from fastapi.middleware.cors import CORSMiddleware
|
| 19 |
|
| 20 |
+
EVOXTRAL_API = os.environ.get(
|
| 21 |
+
"EVOXTRAL_API",
|
| 22 |
+
"https://yongkang-zou1999--evoxtral-api-evoxtralmodel-web.modal.run",
|
| 23 |
+
).rstrip("/")
|
| 24 |
MAX_UPLOAD_BYTES = int(os.environ.get("MAX_UPLOAD_MB", "100")) * 1024 * 1024
|
| 25 |
+
TARGET_SR = 16000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
def _check_ffmpeg():
|
|
|
|
| 29 |
if shutil.which("ffmpeg") is None:
|
| 30 |
raise RuntimeError(
|
| 31 |
+
"ffmpeg not found. WebM / M4A / OGG requires ffmpeg to decode.\n"
|
| 32 |
" macOS: brew install ffmpeg\n"
|
| 33 |
+
" Ubuntu: sudo apt install ffmpeg"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
|
| 37 |
@asynccontextmanager
|
| 38 |
async def lifespan(app: FastAPI):
|
|
|
|
|
|
|
|
|
|
| 39 |
_check_ffmpeg()
|
| 40 |
print(f"[voxtral] ffmpeg: {shutil.which('ffmpeg')}")
|
| 41 |
+
print(f"[voxtral] Evoxtral API: {EVOXTRAL_API}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
try:
|
| 43 |
+
async with httpx.AsyncClient(timeout=15) as client:
|
| 44 |
+
r = await client.get(f"{EVOXTRAL_API}/health")
|
| 45 |
+
print(f"[voxtral] External API health: {r.status_code} {r.text[:200]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
except Exception as e:
|
| 47 |
+
print(f"[voxtral] External API health check failed: {e} (will retry on first request)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
yield
|
| 49 |
|
| 50 |
|
| 51 |
+
app = FastAPI(title="Evoxtral Speech-to-Text (Model)", lifespan=lifespan)
|
| 52 |
|
| 53 |
app.add_middleware(
|
| 54 |
CORSMiddleware,
|
| 55 |
+
allow_origins=["http://localhost:3000", "http://127.0.0.1:3000"],
|
|
|
|
|
|
|
|
|
|
| 56 |
allow_methods=["GET", "POST", "OPTIONS"],
|
| 57 |
allow_headers=["*"],
|
| 58 |
)
|
| 59 |
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
@app.get("/health")
|
| 62 |
async def health():
|
|
|
|
| 63 |
return {
|
| 64 |
"status": "ok",
|
| 65 |
+
"model": "YongkangZOU/evoxtral-lora (external API)",
|
| 66 |
+
"model_loaded": True,
|
| 67 |
"ffmpeg": shutil.which("ffmpeg") is not None,
|
| 68 |
+
"pyannote_available": False,
|
| 69 |
+
"hf_token_set": False,
|
| 70 |
"max_upload_mb": MAX_UPLOAD_BYTES // 1024 // 1024,
|
| 71 |
+
"evoxtral_api": EVOXTRAL_API,
|
| 72 |
}
|
| 73 |
|
| 74 |
|
| 75 |
+
# βββ External API ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 76 |
+
|
| 77 |
+
async def _call_evoxtral(contents: bytes, filename: str) -> dict:
|
| 78 |
+
"""Forward audio bytes to the external evoxtral API; return parsed JSON.
|
| 79 |
+
Response: {"transcription": "...[laughs]...", "language": "en", "model": "..."}
|
| 80 |
+
"""
|
| 81 |
+
async with httpx.AsyncClient(timeout=300) as client:
|
| 82 |
+
r = await client.post(
|
| 83 |
+
f"{EVOXTRAL_API}/transcribe",
|
| 84 |
+
files={"file": (filename, contents)},
|
| 85 |
+
)
|
| 86 |
+
if not r.is_success:
|
| 87 |
+
raise HTTPException(
|
| 88 |
+
status_code=502,
|
| 89 |
+
detail=f"Evoxtral API error {r.status_code}: {r.text[:300]}",
|
| 90 |
+
)
|
| 91 |
+
return r.json()
|
| 92 |
+
|
| 93 |
+
|
| 94 |
# βββ Audio helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 95 |
|
| 96 |
def _convert_to_wav_ffmpeg(path: str, target_sr: int) -> str:
|
|
|
|
| 97 |
out = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
| 98 |
out.close()
|
| 99 |
rc = subprocess.run(
|
|
|
|
| 107 |
)
|
| 108 |
if rc.returncode != 0:
|
| 109 |
os.unlink(out.name)
|
| 110 |
+
raise RuntimeError(f"ffmpeg failed: {rc.stderr.decode(errors='replace')[:500]}")
|
|
|
|
|
|
|
| 111 |
return out.name
|
| 112 |
|
| 113 |
|
| 114 |
+
def _load_audio(file_path: str, target_sr: int) -> np.ndarray:
|
|
|
|
| 115 |
lower = file_path.lower()
|
| 116 |
if lower.endswith((".webm", ".opus", ".m4a", ".ogg")):
|
| 117 |
wav_path = _convert_to_wav_ffmpeg(file_path, target_sr)
|
|
|
|
| 121 |
finally:
|
| 122 |
if os.path.exists(wav_path):
|
| 123 |
os.unlink(wav_path)
|
|
|
|
| 124 |
try:
|
| 125 |
y, _ = librosa.load(file_path, sr=target_sr, mono=True)
|
| 126 |
return y.astype(np.float32)
|
| 127 |
except Exception as e:
|
|
|
|
|
|
|
| 128 |
need_ffmpeg = (
|
| 129 |
"format not recognised" in str(e).lower()
|
| 130 |
+
or "nobackenderror" in type(e).__name__.lower()
|
| 131 |
)
|
| 132 |
if need_ffmpeg:
|
| 133 |
wav_path = _convert_to_wav_ffmpeg(file_path, target_sr)
|
|
|
|
| 141 |
|
| 142 |
|
| 143 |
def _validate_upload(contents: bytes) -> None:
|
|
|
|
| 144 |
if len(contents) == 0:
|
| 145 |
+
raise HTTPException(
|
| 146 |
+
status_code=400,
|
| 147 |
+
detail="Audio file is empty; record at least 1β2 seconds or choose a valid file",
|
| 148 |
+
)
|
| 149 |
if len(contents) > MAX_UPLOAD_BYTES:
|
| 150 |
mb = len(contents) / 1024 / 1024
|
| 151 |
limit_mb = MAX_UPLOAD_BYTES // 1024 // 1024
|
|
|
|
| 158 |
# βββ Segmentation helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 159 |
|
| 160 |
def _vad_segment(audio: np.ndarray, sr: int) -> list[tuple[int, int]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
intervals = librosa.effects.split(audio, top_db=28, frame_length=2048, hop_length=512)
|
| 162 |
if len(intervals) == 0:
|
| 163 |
return [(0, len(audio))]
|
|
|
|
| 164 |
merged: list[list[int]] = [[int(intervals[0][0]), int(intervals[0][1])]]
|
| 165 |
for s, e in intervals[1:]:
|
| 166 |
if (int(s) - merged[-1][1]) / sr < 0.3:
|
| 167 |
merged[-1][1] = int(e)
|
| 168 |
else:
|
| 169 |
merged.append([int(s), int(e)])
|
|
|
|
| 170 |
result = [(s, e) for s, e in merged if (e - s) / sr >= 0.3]
|
| 171 |
return result if result else [(0, len(audio))]
|
| 172 |
|
| 173 |
|
| 174 |
def _segments_from_vad(audio: np.ndarray, sr: int) -> tuple[list[dict], str]:
|
|
|
|
|
|
|
|
|
|
| 175 |
intervals = _vad_segment(audio, sr)
|
| 176 |
segs = [
|
| 177 |
{"speaker": "SPEAKER_00", "start": round(s / sr, 3), "end": round(e / sr, 3)}
|
|
|
|
| 182 |
|
| 183 |
|
| 184 |
def _split_sentences(text: str) -> list[str]:
|
|
|
|
|
|
|
| 185 |
parts = re.split(r'(?<=[οΌοΌγ?!])\s*', text)
|
| 186 |
return [p for p in parts if p.strip()]
|
| 187 |
|
| 188 |
|
| 189 |
def _distribute_text(full_text: str, segs: list[dict]) -> list[dict]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
if not full_text or not segs:
|
| 191 |
return [{**s, "text": ""} for s in segs]
|
|
|
|
| 192 |
if len(segs) == 1:
|
| 193 |
return [{**segs[0], "text": full_text}]
|
| 194 |
|
| 195 |
sentences = _split_sentences(full_text)
|
|
|
|
| 196 |
if len(sentences) <= 1:
|
| 197 |
is_cjk = len(full_text.split()) <= 1
|
| 198 |
sentences = list(full_text) if is_cjk else full_text.split()
|
|
|
|
| 203 |
|
| 204 |
is_cjk = len(full_text.split()) <= 1 and len(full_text) > 1
|
| 205 |
sep = "" if is_cjk else " "
|
|
|
|
|
|
|
| 206 |
n = len(sentences)
|
| 207 |
result_texts: list[list[str]] = [[] for _ in segs]
|
|
|
|
| 208 |
cumulative = 0.0
|
| 209 |
for i, seg in enumerate(segs):
|
| 210 |
cumulative += (seg["end"] - seg["start"]) / total_dur
|
|
|
|
| 211 |
threshold = cumulative * n
|
| 212 |
while len(result_texts[i]) + sum(len(t) for t in result_texts[:i]) < round(threshold):
|
| 213 |
idx = sum(len(t) for t in result_texts)
|
| 214 |
if idx >= n:
|
| 215 |
break
|
| 216 |
result_texts[i].append(sentences[idx])
|
|
|
|
|
|
|
| 217 |
assigned = sum(len(t) for t in result_texts)
|
| 218 |
result_texts[-1].extend(sentences[assigned:])
|
|
|
|
| 219 |
return [{**seg, "text": sep.join(texts)} for seg, texts in zip(segs, result_texts)]
|
| 220 |
|
| 221 |
|
| 222 |
+
# ─── Emotion parsing from evoxtral expression tags ─────────────────────────────
|
| 223 |
+
|
| 224 |
+
# Maps inline tags like [laughs], [sighs] β (emotion label, valence, arousal)
|
| 225 |
+
_TAG_EMOTIONS: dict[str, tuple[str, float, float]] = {
|
| 226 |
+
"laughs": ("Happy", 0.70, 0.60),
|
| 227 |
+
"laughing": ("Happy", 0.70, 0.60),
|
| 228 |
+
"chuckles": ("Happy", 0.50, 0.30),
|
| 229 |
+
"giggles": ("Happy", 0.60, 0.40),
|
| 230 |
+
"sighs": ("Sad", -0.30, -0.30),
|
| 231 |
+
"sighing": ("Sad", -0.30, -0.30),
|
| 232 |
+
"cries": ("Sad", -0.70, 0.40),
|
| 233 |
+
"crying": ("Sad", -0.70, 0.40),
|
| 234 |
+
"whispers": ("Calm", 0.10, -0.50),
|
| 235 |
+
"whispering":("Calm", 0.10, -0.50),
|
| 236 |
+
"shouts": ("Angry", -0.50, 0.80),
|
| 237 |
+
"shouting": ("Angry", -0.50, 0.80),
|
| 238 |
+
"exclaims": ("Excited", 0.50, 0.70),
|
| 239 |
+
"gasps": ("Surprised", 0.20, 0.70),
|
| 240 |
+
"hesitates": ("Anxious", -0.20, 0.30),
|
| 241 |
+
"stutters": ("Anxious", -0.20, 0.40),
|
| 242 |
+
"mumbles": ("Sad", -0.20, -0.30),
|
| 243 |
+
"claps": ("Happy", 0.60, 0.50),
|
| 244 |
+
"applause": ("Happy", 0.60, 0.50),
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def _parse_emotion(text: str) -> dict:
|
| 249 |
+
"""Extract the first recognized expression tag from text like [sighs] or [laughs].
|
| 250 |
+
Returns {"emotion": str, "valence": float, "arousal": float}.
|
| 251 |
+
Defaults to Neutral (0, 0) if no known tag is found.
|
| 252 |
"""
|
| 253 |
+
tags = re.findall(r'\[([^\]]+)\]', text.lower())
|
| 254 |
+
for tag in tags:
|
| 255 |
+
tag = tag.strip()
|
| 256 |
+
if tag in _TAG_EMOTIONS:
|
| 257 |
+
label, valence, arousal = _TAG_EMOTIONS[tag]
|
| 258 |
+
return {"emotion": label, "valence": valence, "arousal": arousal}
|
| 259 |
+
# Partial match (e.g. "laughs softly" β "laughs")
|
| 260 |
+
for key, (label, valence, arousal) in _TAG_EMOTIONS.items():
|
| 261 |
+
if key in tag:
|
| 262 |
+
return {"emotion": label, "valence": valence, "arousal": arousal}
|
| 263 |
+
return {"emotion": "Neutral", "valence": 0.0, "arousal": 0.0}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
|
| 265 |
|
| 266 |
# ─── Endpoints ─────────────────────────────────────────────────────────────────
|
| 267 |
|
| 268 |
@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...)):
    """Upload audio -> plain transcription (with inline expression tags).

    Proxies the raw upload to the external evoxtral API and returns
    ``{"text", "words", "languageCode"}``.  ``words`` is always returned
    empty (this path produces no word-level timestamps).

    Raises HTTPException(400) when the upload cannot be read or fails
    ``_validate_upload`` (empty / oversized payload).
    """
    req_start = time.perf_counter()
    req_id = f"transcribe-{int(req_start * 1000)}"
    filename = audio.filename or "audio.wav"
    # Fix: log the actual upload name; the previous log line printed a
    # literal placeholder instead of interpolating `filename`.
    print(f"[voxtral] {req_id} POST /transcribe filename={filename}")

    try:
        contents = await audio.read()
    except Exception as e:
        # Chain the cause so the original read failure is preserved.
        raise HTTPException(status_code=400, detail=f"Failed to read file: {e}") from e
    _validate_upload(contents)

    result = await _call_evoxtral(contents, filename)
    text = result.get("transcription", "")
    lang = result.get("language")

    total_ms = (time.perf_counter() - req_start) * 1000
    print(f"[voxtral] {req_id} done total={total_ms:.0f}ms text_len={len(text)}")
    return {"text": text, "words": [], "languageCode": lang}
|
| 289 |
|
| 290 |
|
| 291 |
@app.post("/transcribe-diarize")
|
| 292 |
+
async def transcribe_diarize(audio: UploadFile = File(...)):
|
|
|
|
|
|
|
| 293 |
"""
|
| 294 |
+
Upload audio β transcription + VAD segmentation + per-segment emotion.
|
| 295 |
+
Transcription is produced by the external evoxtral API (includes expressive tags).
|
| 296 |
+
Emotion is parsed from inline tags like [sighs], [laughs], etc.
|
| 297 |
All segments are labelled SPEAKER_00 (single-speaker mode).
|
| 298 |
"""
|
| 299 |
req_start = time.perf_counter()
|
|
|
|
| 305 |
contents = await audio.read()
|
| 306 |
except Exception as e:
|
| 307 |
raise HTTPException(status_code=400, detail=f"Failed to read file: {e}")
|
|
|
|
| 308 |
_validate_upload(contents)
|
| 309 |
|
| 310 |
suffix = os.path.splitext(filename)[1].lower() or ".wav"
|
| 311 |
if suffix not in (".wav", ".mp3", ".flac", ".ogg", ".m4a", ".webm"):
|
| 312 |
suffix = ".wav"
|
| 313 |
|
| 314 |
+
# ββ Step 1: call external evoxtral API ββββββββββββββββββββββββββββββββββ
|
| 315 |
+
t0 = time.perf_counter()
|
| 316 |
+
result = await _call_evoxtral(contents, filename)
|
| 317 |
+
full_text = result.get("transcription", "")
|
| 318 |
+
print(f"[voxtral] {req_id} evoxtral API done {(time.perf_counter()-t0)*1000:.0f}ms text_len={len(full_text)}")
|
| 319 |
|
| 320 |
+
# ββ Step 2: load audio for VAD segmentation ββββββββββββββββββββββββββββββ
|
| 321 |
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
|
| 322 |
tmp.write(contents)
|
| 323 |
tmp_path = tmp.name
|
|
|
|
| 324 |
try:
|
| 325 |
t0 = time.perf_counter()
|
| 326 |
+
audio_array = _load_audio(tmp_path, TARGET_SR)
|
| 327 |
print(f"[voxtral] {req_id} load_audio done shape={audio_array.shape} in {(time.perf_counter()-t0)*1000:.0f}ms")
|
| 328 |
except Exception as e:
|
| 329 |
raise HTTPException(status_code=400, detail=f"Cannot decode audio: {e}")
|
|
|
|
| 334 |
except OSError:
|
| 335 |
pass
|
| 336 |
|
| 337 |
+
duration = round(len(audio_array) / TARGET_SR, 3)
|
| 338 |
|
| 339 |
+
# ββ Step 3: VAD sentence segmentation βββββββββββββββββββββββββββββββββββ
|
| 340 |
t0 = time.perf_counter()
|
| 341 |
+
raw_segs, seg_method = _segments_from_vad(audio_array, TARGET_SR)
|
| 342 |
+
print(f"[voxtral] {req_id} segmentation done {(time.perf_counter()-t0)*1000:.0f}ms segs={len(raw_segs)}")
|
| 343 |
|
| 344 |
+
# ββ Step 4: distribute text to segments βββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
segs_with_text = _distribute_text(full_text, raw_segs)
|
| 346 |
|
| 347 |
+
# ββ Step 5: parse emotion from expression tags ββββββββββββββββββββββββββ
|
|
|
|
| 348 |
segments = []
|
| 349 |
for i, s in enumerate(segs_with_text):
|
| 350 |
+
emo = _parse_emotion(s["text"])
|
|
|
|
|
|
|
|
|
|
| 351 |
segments.append({
|
| 352 |
"id": i + 1,
|
| 353 |
"speaker": s["speaker"],
|
|
|
|
| 358 |
"valence": emo["valence"],
|
| 359 |
"arousal": emo["arousal"],
|
| 360 |
})
|
|
|
|
| 361 |
|
| 362 |
total_ms = (time.perf_counter() - req_start) * 1000
|
| 363 |
print(f"[voxtral] {req_id} complete total={total_ms:.0f}ms segments={len(segments)}")
|
model/voxtral-server/requirements.txt
CHANGED
|
@@ -1,16 +1,8 @@
|
|
| 1 |
-
#
|
| 2 |
fastapi>=0.115.0
|
| 3 |
uvicorn[standard]>=0.32.0
|
| 4 |
python-multipart>=0.0.9
|
| 5 |
-
|
| 6 |
-
peft>=0.18.0
|
| 7 |
-
torch>=2.0.0
|
| 8 |
-
accelerate>=0.33.0
|
| 9 |
-
mistral-common[audio]>=1.5.0
|
| 10 |
librosa>=0.10.0
|
| 11 |
soundfile>=0.12.0
|
| 12 |
numpy>=1.24.0
|
| 13 |
-
scikit-learn>=1.3.0
|
| 14 |
-
# Optional: production-grade speaker diarization (requires HF_TOKEN env var + model license acceptance)
|
| 15 |
-
# pip install pyannote.audio>=3.1.0
|
| 16 |
-
# Then: export HF_TOKEN=your_token
|
|
|
|
| 1 |
+
# Evoxtral API proxy β calls external Modal API for inference (no local model)
|
| 2 |
fastapi>=0.115.0
|
| 3 |
uvicorn[standard]>=0.32.0
|
| 4 |
python-multipart>=0.0.9
|
| 5 |
+
httpx>=0.27.0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
librosa>=0.10.0
|
| 7 |
soundfile>=0.12.0
|
| 8 |
numpy>=1.24.0
|
|
|
|
|
|
|
|
|
|
|
|