""" Transcription Engine Comparison Space — Free CPU. Single-file app: Gradio + FastAPI routes + WhisperLiveKit WebSocket + Voxtral Realtime browser-side transcription (WebGPU via transformers.js) + inline recorder UI (HTML/CSS/JS). FER runs entirely in browser via ONNX (no server cost). The only external file is static/emotion_model_web.onnx (~4.8MB). """ import base64 import logging import os import gc import sys import traceback logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) _CLI_MODE = len(sys.argv) > 1 and not sys.argv[1].startswith("--") if _CLI_MODE: # -- CLI MODE: transcribe + diarize an audio file ------------------------- import numpy as np import torch import librosa from pyannote.audio import Pipeline audio_file = sys.argv[1] print(f"Loading: {audio_file}") audio, _ = librosa.load(audio_file, sr=16000, mono=True) audio = audio.astype(np.float32) print(f"Audio: {len(audio)/16000:.1f}s") # Diarization print("Loading diarization (pyannote)...") pipeline = Pipeline.from_pretrained("models/speaker-diarization-3.1") waveform = torch.tensor(audio).unsqueeze(0) result = pipeline({"waveform": waveform, "sample_rate": 16000}) diar = result.speaker_diarization # Post-processing: merge speakers with similar embeddings (numpy, no sklearn) speaker_labels = sorted(diar.labels()) merge_map = {} if hasattr(result, "speaker_embeddings") and result.speaker_embeddings is not None and len(speaker_labels) > 1: emb = result.speaker_embeddings norms = np.linalg.norm(emb, axis=1, keepdims=True) norms[norms == 0] = 1 sim = (emb / norms) @ (emb / norms).T for i in range(len(speaker_labels)): for j in range(i + 1, len(speaker_labels)): if sim[i][j] >= 0.6: target = merge_map.get(speaker_labels[i], speaker_labels[i]) merge_map[speaker_labels[j]] = target print(f" Merging {speaker_labels[j]} -> {target} (sim: {sim[i][j]:.3f})") merged = [] speakers_seen = set() for turn, _, spk in diar.itertracks(yield_label=True): actual_spk = merge_map.get(spk, spk) speaker_id = int(actual_spk.split("_")[-1]) + 1 speakers_seen.add(speaker_id) merged.append({"start": turn.start, "end": turn.end, "speakers": [speaker_id]}) num_speakers = len(speakers_seen) print(f"Speakers: {num_speakers} | Segments: {len(merged)}\n") # Transcription (Parakeet) print("Running Parakeet TDT v3 (with timestamps)...") import onnx_asr model = onnx_asr.load_model("nemo-parakeet-tdt-0.6b-v3", providers=["CPUExecutionProvider"]).with_timestamps() audio_int16 = (audio * 32767).astype(np.int16) output = model.recognize(audio_int16) del model gc.collect() tokens = output.tokens if hasattr(output, "tokens") else [] timestamps = output.timestamps if hasattr(output, "timestamps") else [] # Reconstruct full words from subword tokens words = [] # list of {"text": str, "start": float, "end": float} current_word = "" current_start = 0.0 current_end = 0.0 for tok, ts in zip(tokens, timestamps): if tok.startswith(" ") or tok.startswith("\n"): if current_word.strip(): words.append({"text": current_word, "start": current_start, "end": current_end}) current_word = tok current_start = ts current_end = ts else: if not current_word: current_start = ts current_word += tok current_end = ts if current_word.strip(): words.append({"text": current_word, "start": current_start, "end": current_end}) # Align each word to speaker with greatest temporal overlap def best_speaker(word_start, word_end): best = None max_overlap = 0 for seg in merged: ov_start = max(word_start, seg["start"]) ov_end = min(word_end, seg["end"]) if ov_start < ov_end: overlap = ov_end - ov_start if overlap > max_overlap: max_overlap = overlap best = " & ".join(f"SPEAKER {s}" for s in seg["speakers"]) return best # Assign speaker to each word, then merge consecutive same-speaker labeled = [] for w in words: spk = best_speaker(w["start"], w["end"] + 0.05) if spk is None: spk = labeled[-1][0] if labeled else "UNKNOWN" labeled.append((spk, w["start"], w["text"])) # Merge consecutive same-speaker words print("=" * 60) print(f"{num_speakers} speakers detected:\n") if labeled: current_spk = labeled[0][0] current_start = labeled[0][1] current_text = labeled[0][2] for spk, ts, txt in labeled[1:]: if spk == current_spk: current_text += txt else: chunk = current_text.strip() if chunk: m, s = divmod(int(current_start), 60) print(f"{current_spk} [{m:02d}:{s:02d}]: {chunk}") current_spk = spk current_start = ts current_text = txt chunk = current_text.strip() if chunk: m, s = divmod(int(current_start), 60) print(f"{current_spk} [{m:02d}:{s:02d}]: {chunk}") else: text = output.text if hasattr(output, "text") else str(output) print(text) print("=" * 60) sys.exit(0) if not _CLI_MODE: import gradio as gr import asyncio from fastapi import WebSocket, WebSocketDisconnect from fastapi.staticfiles import StaticFiles from starlette.requests import Request from starlette.responses import Response from whisperlivekit import TranscriptionEngine, AudioProcessor # -- WhisperLiveKit engine (loaded at startup, ~3-6GB) ------------------- logger.info("Loading TranscriptionEngine (large-v3-turbo model, CPU)...") transcription_engine = TranscriptionEngine( model_size="large-v3-turbo", vac=True, min_chunk_size=1.0, lan="auto", direct_english_translation=False, ) logger.info("TranscriptionEngine ready.") # -- Inline CSS -------------------------------------------------------------- RECORDER_CSS = r""" *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; } :root { --bg: #0f0f0f; --surface: #1a1a2e; --surface2: #16213e; --accent: #e94560; --accent2: #0f3460; --text: #eee; --text-dim: #888; --success: #4ecca3; --warning: #f5a623; --radius: 12px; --font: 'Segoe UI', system-ui, -apple-system, sans-serif; } body { background: var(--bg); color: var(--text); font-family: var(--font); padding: 12px; line-height: 1.5; } .mode-selector { display: flex; gap: 6px; justify-content: center; margin-bottom: 8px; flex-wrap: wrap; } .engine-btn { background: var(--surface); border: 2px solid transparent; border-radius: var(--radius); padding: 8px 14px; cursor: pointer; font-size: 0.85rem; color: var(--text); transition: all 0.2s; } .engine-btn:hover { border-color: var(--accent2); } .engine-btn.active { border-color: var(--accent); background: var(--surface2); } .options-row { display: flex; gap: 16px; justify-content: center; align-items: center; margin-bottom: 8px; flex-wrap: wrap; } .options-row label { font-size: 0.85rem; color: var(--text-dim); cursor: pointer; display: flex; align-items: center; gap: 6px; } .options-row input[type="checkbox"] { accent-color: var(--accent); } .controls { display: flex; align-items: center; justify-content: center; gap: 16px; margin-bottom: 10px; } #recordButton { width: 64px; height: 64px; border-radius: 50%; border: 3px solid var(--accent); background: transparent; cursor: pointer; display: flex; align-items: center; justify-content: center; transition: all 0.3s; flex-shrink: 0; } #recordButton .inner { width: 28px; height: 28px; background: var(--accent); border-radius: 50%; transition: all 0.3s; } #recordButton.recording .inner { border-radius: 4px; width: 24px; height: 24px; } #recordButton:hover { transform: scale(1.05); } .upload-btn { width: 40px; height: 40px; border-radius: 50%; border: 2px solid var(--accent2); background: transparent; color: var(--text-dim); cursor: pointer; display: flex; align-items: center; justify-content: center; transition: all 0.2s; } .upload-btn:hover { border-color: var(--accent); color: var(--text); } .timer { font-size: 1.2rem; font-variant-numeric: tabular-nums; color: var(--text-dim); min-width: 60px; } .timer.recording { color: var(--accent); } #waveCanvas { width: 200px; height: 48px; border-radius: 8px; background: var(--surface); } #status { text-align: center; font-size: 0.85rem; color: var(--text-dim); margin-bottom: 8px; min-height: 1.3em; } #status.error { color: var(--accent); } #status.success { color: var(--success); } .results-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 8px; margin-top: 8px; } /* Panel visibility controlled by JS updateResultsLayout() */ .result-panel { background: var(--surface); border-radius: var(--radius); padding: 12px; min-height: 120px; position: relative; } .copy-btn { position: absolute; top: 6px; right: 6px; background: transparent; border: 1px solid rgba(255,255,255,0.15); color: var(--text-dim); cursor: pointer; padding: 3px 10px; border-radius: 6px; font-size: 0.72rem; z-index: 2; display: flex; align-items: center; gap: 4px; transition: all 0.15s; } .copy-btn:hover { background: rgba(255,255,255,0.1); color: var(--text); border-color: rgba(255,255,255,0.3); } .copy-btn.copied { color: var(--success); border-color: var(--success); } .short-hint { color: var(--text-dim); font-size: 0.8rem; margin-top: 8px; font-style: italic; } .result-panel h3 { font-size: 0.9rem; margin-bottom: 6px; display: flex; align-items: center; gap: 6px; flex-wrap: wrap; padding-right: 50px; } .badge { font-size: 0.7rem; padding: 2px 8px; border-radius: 999px; font-weight: 500; } .badge.realtime { background: var(--success); color: #000; } .badge.browser { background: var(--warning); color: #000; } .timing { font-size: 0.8rem; color: var(--text-dim); margin-bottom: 8px; } .transcript { font-size: 0.85rem; line-height: 1.6; white-space: pre-wrap; word-break: break-word; max-height: 220px; overflow-y: auto; background: rgba(0,0,0,0.25); border-radius: 8px; padding: 10px 12px; font-family: 'SF Mono', 'Cascadia Code', 'Fira Code', 'Consolas', monospace; border: 1px solid rgba(255,255,255,0.06); } .transcript .buffer { color: var(--text-dim); font-style: italic; } .transcript .line { margin-bottom: 4px; } .transcript .timestamp { color: var(--accent2); font-size: 0.75rem; margin-right: 6px; opacity: 0.7; font-variant-numeric: tabular-nums; } .transcript .speaker { color: var(--success); font-weight: 600; font-size: 0.8rem; margin-right: 6px; } .spinner { display: inline-block; width: 20px; height: 20px; border: 2px solid var(--text-dim); border-top-color: var(--accent); border-radius: 50%; animation: spin 0.8s linear infinite; margin-right: 8px; vertical-align: middle; } @keyframes spin { to { transform: rotate(360deg); } } .fer-container { position: relative; display: flex; justify-content: center; margin: 0 auto 8px; } .fer-container.hidden { display: none; } #webcamVideo { width: 240px; height: 180px; border-radius: var(--radius); object-fit: cover; transform: scaleX(-1); background: #000; } .emotion-bars { position: absolute; bottom: 8px; left: 8px; right: 8px; display: flex; flex-direction: column; gap: 2px; background: rgba(0,0,0,0.6); padding: 6px; border-radius: 6px; font-size: 0.65rem; } .emotion-bar { display: flex; align-items: center; gap: 4px; } .emotion-bar .label { width: 55px; text-align: right; flex-shrink: 0; } .emotion-bar .bar { flex: 1; height: 6px; background: rgba(255,255,255,0.15); border-radius: 3px; overflow: hidden; } .emotion-bar .fill { height: 100%; border-radius: 3px; transition: width 0.3s; background: var(--success); } .emotion-bar .pct { width: 30px; text-align: right; font-variant-numeric: tabular-nums; } #webgpuWarning { display: none; text-align: center; padding: 10px 16px; margin-bottom: 12px; background: rgba(233, 69, 96, 0.15); border: 1px solid var(--accent); border-radius: var(--radius); font-size: 0.85rem; color: var(--accent); } .progress-bar-container { width: 100%; background: rgba(255,255,255,0.1); border-radius: 4px; margin: 6px 0; height: 8px; overflow: hidden; } .progress-bar-fill { height: 100%; background: var(--success); border-radius: 4px; transition: width 0.3s; width: 0%; } @media (max-width: 600px) { .results-grid { grid-template-columns: 1fr; } .results-grid .result-panel { display: block !important; } #waveCanvas { width: 120px; } } """ # -- Inline FER JS ----------------------------------------------------------- FER_JS = r""" const FER_LABELS = [ "Anger", "Contempt", "Disgust", "Fear", "Happy", "Neutral", "Sad", "Surprise" ]; const IMAGE_SIZE = 224; const IMAGENET_MEAN = [0.485, 0.456, 0.406]; const IMAGENET_STD = [0.229, 0.224, 0.225]; let ferSession = null; let ferCanvas = null; let ortModule = null; function softmax(scores) { let max = -Infinity; for (let i = 0; i < scores.length; i++) { if (scores[i] > max) max = scores[i]; } const exps = new Float32Array(scores.length); let sum = 0; for (let i = 0; i < scores.length; i++) { exps[i] = Math.exp(scores[i] - max); sum += exps[i]; } for (let i = 0; i < exps.length; i++) { exps[i] /= sum; } return exps; } async function loadFERModel() { if (ferSession) return true; try { const ort = window.ort; if (!ort) { console.error("[FER] onnxruntime-web not loaded"); return false; } ortModule = ort; ort.env.wasm.numThreads = 1; const response = await fetch("/static/emotion_model_web.onnx"); const modelBuffer = await response.arrayBuffer(); ferSession = await ort.InferenceSession.create( new Uint8Array(modelBuffer), { executionProviders: ["wasm"] } ); console.log("[FER] Model loaded"); return true; } catch (err) { console.error("[FER] Failed to load model:", err); return false; } } async function classifyEmotion(videoElement) { if (!ferSession || !ortModule) return null; try { if (!ferCanvas) { ferCanvas = document.createElement("canvas"); ferCanvas.width = IMAGE_SIZE; ferCanvas.height = IMAGE_SIZE; } const ctx = ferCanvas.getContext("2d", { willReadFrequently: true }); if (!ctx) return null; ctx.drawImage(videoElement, 0, 0, IMAGE_SIZE, IMAGE_SIZE); const imageData = ctx.getImageData(0, 0, IMAGE_SIZE, IMAGE_SIZE); const { data } = imageData; const floatData = new Float32Array(1 * 3 * IMAGE_SIZE * IMAGE_SIZE); const pixelCount = IMAGE_SIZE * IMAGE_SIZE; for (let i = 0; i < pixelCount; i++) { const srcIdx = i * 4; floatData[i] = (data[srcIdx] / 255 - IMAGENET_MEAN[0]) / IMAGENET_STD[0]; floatData[pixelCount + i] = (data[srcIdx + 1] / 255 - IMAGENET_MEAN[1]) / IMAGENET_STD[1]; floatData[2 * pixelCount + i] = (data[srcIdx + 2] / 255 - IMAGENET_MEAN[2]) / IMAGENET_STD[2]; } const inputTensor = new ortModule.Tensor("float32", floatData, [1, 3, IMAGE_SIZE, IMAGE_SIZE]); const inputName = ferSession.inputNames[0]; const results = await ferSession.run({ [inputName]: inputTensor }); const outputName = ferSession.outputNames[0]; const output = results[outputName]; if (!output) return null; const rawScores = output.data; const probs = softmax(rawScores); const scores = {}; let maxIdx = 0; let maxVal = probs[0]; for (let i = 0; i < probs.length; i++) { scores[FER_LABELS[i]] = probs[i]; if (probs[i] > maxVal) { maxVal = probs[i]; maxIdx = i; } } return { emotion: FER_LABELS[maxIdx], confidence: maxVal, scores: scores }; } catch (err) { console.error("[FER] Classification error:", err); return null; } } function releaseFER() { if (ferSession) { ferSession.release().catch(() => {}); ferSession = null; } } """ # -- Inline Recorder JS ----------------------------------------------------- RECORDER_JS = r""" // -- State ------------------------------------------------------------------- let activeEngines = new Set(["parakeet"]); let isRecording = false; let websocket = null; let mediaRecorder = null; let audioChunks = []; let micRecorder = null; let micChunks = []; let screenRecorder = null; let screenChunks = []; let mixedStream = null; let micStream = null; let displayStream = null; let audioContext = null; let analyserNode = null; let animFrameId = null; let timerInterval = null; let recordingStartTime = null; let ferInterval = null; let webcamStream = null; // -- Voxtral Realtime state -------------------------------------------------- let voxtralModel = null; let voxtralProcessor = null; let voxtralLoading = false; let voxtralAudioChunks = []; let voxtralAudioLength = 0; // Lazy-concatenate: only rebuild when new chunks arrive let _voxtralCached = new Float32Array(0); let _voxtralCachedLen = 0; function getVoxtralAudio() { if (voxtralAudioLength === _voxtralCachedLen) return _voxtralCached; if (voxtralAudioChunks.length === 0) { _voxtralCached = new Float32Array(0); _voxtralCachedLen = 0; return _voxtralCached; } if (voxtralAudioChunks.length === 1) { _voxtralCached = voxtralAudioChunks[0]; _voxtralCachedLen = voxtralAudioLength; return _voxtralCached; } const combined = new Float32Array(voxtralAudioLength); let offset = 0; for (const chunk of voxtralAudioChunks) { combined.set(chunk, offset); offset += chunk.length; } voxtralAudioChunks = [combined]; _voxtralCached = combined; _voxtralCachedLen = voxtralAudioLength; return combined; } let voxtralIsRunning = false; let voxtralStopRequested = false; let voxtralAudioContext = null; let voxtralWorkletNode = null; let voxtralMicSource = null; let transformersModule = null; const VOXTRAL_MODEL_ID = "onnx-community/Voxtral-Mini-4B-Realtime-2602-ONNX"; const SEGMENTATION_MODEL_ID = "onnx-community/pyannote-segmentation-3.0"; let segmentationModel = null; let segmentationProcessor = null; // -- DOM refs ---------------------------------------------------------------- const modeSelector = document.getElementById("modeSelector"); const recordButton = document.getElementById("recordButton"); const waveCanvas = document.getElementById("waveCanvas"); const timerEl = document.getElementById("timer"); const statusEl = document.getElementById("status"); const resultsGrid = document.getElementById("resultsGrid"); const whisperPanel = document.getElementById("whisperPanel"); const voxtralPanel = document.getElementById("voxtralPanel"); const whisperTranscript = document.getElementById("whisperTranscript"); const voxtralTranscript = document.getElementById("voxtralTranscript"); const whisperTiming = document.getElementById("whisperTiming"); const voxtralTiming = document.getElementById("voxtralTiming"); const screenAudioToggle = document.getElementById("screenAudioToggle"); const ferToggle = document.getElementById("ferToggle"); const diarizeToggle = document.getElementById("diarizeToggle"); const ferContainer = document.getElementById("ferContainer"); const webcamVideo = document.getElementById("webcamVideo"); const emotionBarsEl = document.getElementById("emotionBars"); const webgpuWarning = document.getElementById("webgpuWarning"); const parakeetPanel = document.getElementById("parakeetPanel"); const parakeetTranscript = document.getElementById("parakeetTranscript"); const parakeetTiming = document.getElementById("parakeetTiming"); const nemotronPanel = document.getElementById("nemotronPanel"); const nemotronTranscript = document.getElementById("nemotronTranscript"); const nemotronTiming = document.getElementById("nemotronTiming"); // -- WebGPU check ------------------------------------------------------------ async function checkWebGPU() { if (!navigator.gpu) { webgpuWarning.style.display = "block"; webgpuWarning.textContent = "WebGPU is not supported in this browser. Voxtral Realtime requires WebGPU (Chrome 113+, Edge 113+)."; return false; } try { const adapter = await navigator.gpu.requestAdapter(); if (!adapter) { webgpuWarning.style.display = "block"; webgpuWarning.textContent = "WebGPU adapter not available. Check your GPU drivers."; return false; } return true; } catch (e) { webgpuWarning.style.display = "block"; webgpuWarning.textContent = "WebGPU check failed: " + e.message; return false; } } checkWebGPU(); // -- Engine toggle selector --------------------------------------------------- modeSelector.querySelectorAll(".engine-btn").forEach((btn) => { btn.addEventListener("click", () => { if (isRecording) return; const engine = btn.dataset.engine; if (activeEngines.has(engine)) { if (activeEngines.size > 1) { activeEngines.delete(engine); btn.classList.remove("active"); } } else { activeEngines.add(engine); btn.classList.add("active"); } updateResultsLayout(); }); }); function updateResultsLayout() { const panelMap = { whisper: whisperPanel, voxtral: voxtralPanel, parakeet: parakeetPanel, nemotron: nemotronPanel, }; Object.entries(panelMap).forEach(([key, panel]) => { panel.style.display = activeEngines.has(key) ? '' : 'none'; }); const count = activeEngines.size; resultsGrid.style.gridTemplateColumns = count <= 1 ? '1fr' : '1fr 1fr'; } updateResultsLayout(); // -- FER toggle -------------------------------------------------------------- ferToggle.addEventListener("change", async () => { if (ferToggle.checked) { ferContainer.classList.remove("hidden"); await startWebcam(); await loadFERModel(); startFERLoop(); } else { ferContainer.classList.add("hidden"); stopFERLoop(); stopWebcam(); } }); async function startWebcam() { try { webcamStream = await navigator.mediaDevices.getUserMedia({ video: true }); webcamVideo.srcObject = webcamStream; } catch (err) { console.error("[FER] Webcam error:", err); setStatus("Webcam access denied", "error"); } } function stopWebcam() { if (webcamStream) { webcamStream.getTracks().forEach((t) => t.stop()); webcamStream = null; webcamVideo.srcObject = null; } } function startFERLoop() { if (ferInterval) return; ferInterval = setInterval(async () => { if (!webcamVideo.srcObject) return; const result = await classifyEmotion(webcamVideo); if (result) renderEmotionBars(result.scores); }, 500); } function stopFERLoop() { if (ferInterval) { clearInterval(ferInterval); ferInterval = null; } } function renderEmotionBars(scores) { const labels = Object.keys(scores); let html = ""; for (const label of labels) { const pct = (scores[label] * 100).toFixed(0); html += `
${label}
${pct}%
`; } emotionBarsEl.innerHTML = html; } // -- Voxtral model loading --------------------------------------------------- async function loadVoxtralModel() { if (voxtralModel && voxtralProcessor) return true; if (voxtralLoading) return false; voxtralLoading = true; voxtralTranscript.innerHTML = ' Loading Voxtral Realtime model (WebGPU)... This downloads ~2GB on first use.'; try { if (!transformersModule) { voxtralTranscript.innerHTML = ' Loading transformers.js library...'; transformersModule = await import("https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.0.0-next.7"); } const { VoxtralRealtimeForConditionalGeneration, VoxtralRealtimeProcessor } = transformersModule; voxtralTranscript.innerHTML = ' Downloading & loading Voxtral model (q4f16, WebGPU)...
'; const progressCallback = (progress) => { const bar = document.getElementById("voxtralProgress"); const txt = document.getElementById("voxtralProgressText"); if (bar && progress.progress !== undefined) { bar.style.width = progress.progress.toFixed(1) + "%"; } if (txt && progress.file) { const status = progress.status || ""; const pct = progress.progress !== undefined ? ` (${progress.progress.toFixed(1)}%)` : ""; txt.textContent = `${status} ${progress.file}${pct}`; } }; voxtralProcessor = await VoxtralRealtimeProcessor.from_pretrained(VOXTRAL_MODEL_ID, { progress_callback: progressCallback, }); voxtralModel = await VoxtralRealtimeForConditionalGeneration.from_pretrained(VOXTRAL_MODEL_ID, { dtype: { audio_encoder: "q4f16", embed_tokens: "q4f16", decoder_model_merged: "q4f16", }, device: "webgpu", progress_callback: progressCallback, }); // Load speaker segmentation model for browser-side diarization if (!segmentationModel) { voxtralTranscript.innerHTML = ' Loading speaker segmentation model...'; const { AutoProcessor, AutoModelForAudioFrameClassification } = transformersModule; segmentationProcessor = await AutoProcessor.from_pretrained(SEGMENTATION_MODEL_ID, { progress_callback: progressCallback }); segmentationModel = await AutoModelForAudioFrameClassification.from_pretrained(SEGMENTATION_MODEL_ID, { device: "wasm", dtype: "fp32", progress_callback: progressCallback }); } voxtralTranscript.innerHTML = 'Models loaded. Ready to transcribe.'; voxtralLoading = false; return true; } catch (err) { console.error("[Voxtral] Model loading error:", err); voxtralTranscript.innerHTML = `Failed to load model: ${escapeHtml(err.message)}`; voxtralLoading = false; return false; } } // -- Voxtral audio capture via AudioWorklet ---------------------------------- // Dual-track buffers for Voxtral (when Speaker detection OFF) let voxtralMicChunks = []; let voxtralMicLength = 0; let voxtralScreenChunks = []; let voxtralScreenLength = 0; let voxtralDualTrack = false; function getVoxtralMicAudio() { if (voxtralMicChunks.length === 0) return new Float32Array(0); if (voxtralMicChunks.length === 1) return voxtralMicChunks[0]; const c = new Float32Array(voxtralMicLength); let o = 0; for (const ch of voxtralMicChunks) { c.set(ch, o); o += ch.length; } voxtralMicChunks = [c]; return c; } function getVoxtralScreenAudio() { if (voxtralScreenChunks.length === 0) return new Float32Array(0); if (voxtralScreenChunks.length === 1) return voxtralScreenChunks[0]; const c = new Float32Array(voxtralScreenLength); let o = 0; for (const ch of voxtralScreenChunks) { c.set(ch, o); o += ch.length; } voxtralScreenChunks = [c]; return c; } async function startVoxtralRecording(stream, micOnlyStream, screenOnlyStream) { voxtralAudioChunks = []; voxtralAudioLength = 0; voxtralMicChunks = []; voxtralMicLength = 0; voxtralScreenChunks = []; voxtralScreenLength = 0; voxtralStopRequested = false; voxtralIsRunning = true; voxtralDualTrack = !!(micOnlyStream && screenOnlyStream); voxtralAudioContext = new AudioContext({ sampleRate: 16000 }); const workletCode = `class CaptureProcessor extends AudioWorkletProcessor { process(inputs) { const input = inputs[0]; if (input.length > 0 && input[0].length > 0) { this.port.postMessage(input[0]); } return true; } } registerProcessor("capture-processor", CaptureProcessor);`; const blob = new Blob([workletCode], { type: "application/javascript" }); const url = URL.createObjectURL(blob); await voxtralAudioContext.audioWorklet.addModule(url); URL.revokeObjectURL(url); // Main mixed stream capture (for transcription) voxtralMicSource = voxtralAudioContext.createMediaStreamSource(stream); voxtralWorkletNode = new AudioWorkletNode(voxtralAudioContext, "capture-processor"); voxtralWorkletNode.port.onmessage = (event) => { if (voxtralStopRequested) return; const newData = new Float32Array(event.data); if (newData.length === 0) return; voxtralAudioChunks.push(newData); voxtralAudioLength += newData.length; }; voxtralMicSource.connect(voxtralWorkletNode); const silentGain = voxtralAudioContext.createGain(); silentGain.gain.value = 0; voxtralWorkletNode.connect(silentGain); silentGain.connect(voxtralAudioContext.destination); // Dual-track: separate mic and screen captures if (voxtralDualTrack) { // Mic-only worklet const micSrc = voxtralAudioContext.createMediaStreamSource(micOnlyStream); const micWork = new AudioWorkletNode(voxtralAudioContext, "capture-processor"); micWork.port.onmessage = (event) => { if (voxtralStopRequested) return; const d = new Float32Array(event.data); if (d.length > 0) { voxtralMicChunks.push(d); voxtralMicLength += d.length; } }; micSrc.connect(micWork); micWork.connect(silentGain); // Screen-only worklet const scrSrc = voxtralAudioContext.createMediaStreamSource(screenOnlyStream); const scrWork = new AudioWorkletNode(voxtralAudioContext, "capture-processor"); scrWork.port.onmessage = (event) => { if (voxtralStopRequested) return; const d = new Float32Array(event.data); if (d.length > 0) { voxtralScreenChunks.push(d); voxtralScreenLength += d.length; } }; scrSrc.connect(scrWork); scrWork.connect(silentGain); } // Start the transcription loop (uses mixed stream for real-time) runVoxtralTranscription(); } function stopVoxtralRecording() { voxtralStopRequested = true; if (voxtralWorkletNode) { voxtralWorkletNode.disconnect(); voxtralWorkletNode = null; } if (voxtralMicSource) { voxtralMicSource.disconnect(); voxtralMicSource = null; } if (voxtralAudioContext && voxtralAudioContext.state !== "closed") { voxtralAudioContext.close().catch(() => {}); voxtralAudioContext = null; } } // -- Voxtral streaming transcription loop ------------------------------------ async function runVoxtralTranscription() { if (!voxtralModel || !voxtralProcessor) { console.error("[Voxtral] Model or processor not loaded"); return; } const { BaseStreamer } = transformersModule; const numSamplesFirst = voxtralProcessor.num_samples_first_audio_chunk; const numSamplesPerChunk = voxtralProcessor.num_samples_per_audio_chunk; const { hop_length, n_fft } = voxtralProcessor.feature_extractor.config; const winHalf = Math.floor(n_fft / 2); const samplesPerTok = voxtralProcessor.audio_length_per_tok * hop_length; const voxtralStartTime = Date.now(); let fullText = ""; // Streamer matching reference VoxtralProvider.tsx pattern const tokenizer = voxtralProcessor.tokenizer; const specialIds = new Set(tokenizer.all_special_ids.map(BigInt)); let tokenCache = []; let printLen = 0; let isPrompt = true; function flushDecodedText() { if (tokenCache.length === 0) return; const text = tokenizer.decode(tokenCache, { skip_special_tokens: true }); const printableText = text.slice(printLen); printLen = text.length; if (printableText.length > 0) { fullText += printableText; voxtralTranscript.innerHTML = `
${escapeHtml(fullText)}
streaming...`; voxtralTranscript.scrollTop = voxtralTranscript.scrollHeight; } } const streamer = new (class extends BaseStreamer { put(value) { if (voxtralStopRequested) return; if (isPrompt) { isPrompt = false; return; } const tokens = value[0]; if (tokens.length === 1 && specialIds.has(tokens[0])) return; tokenCache = tokenCache.concat(tokens); flushDecodedText(); } end() { if (voxtralStopRequested) { tokenCache = []; printLen = 0; isPrompt = true; return; } flushDecodedText(); tokenCache = []; printLen = 0; isPrompt = true; } })(); voxtralTranscript.innerHTML = 'Waiting for audio...'; // Wait until we have enough audio for the first chunk while (voxtralAudioLength < numSamplesFirst && !voxtralStopRequested) { await new Promise((r) => setTimeout(r, 100)); } if (voxtralStopRequested) { voxtralIsRunning = false; return; } // Process first chunk to get input_ids and first input_features const voxtralAudioBuffer = getVoxtralAudio(); const firstAudio = voxtralAudioBuffer.subarray(0, numSamplesFirst); const firstChunkInputs = await voxtralProcessor(firstAudio, { is_streaming: true, is_first_audio_chunk: true, }); // Async generator yields input_features ONLY (not full processor output) async function* inputFeaturesGenerator() { yield firstChunkInputs.input_features; let melFrameIdx = voxtralProcessor.num_mel_frames_first_audio_chunk; let startIdx = melFrameIdx * hop_length - winHalf; while (!voxtralStopRequested) { const endNeeded = startIdx + numSamplesPerChunk; while (voxtralAudioLength < endNeeded && !voxtralStopRequested) { await new Promise((r) => setTimeout(r, 50)); } if (voxtralStopRequested) break; // Batch extra available audio (matching reference pattern) const availableSamples = voxtralAudioLength; let batchEndSample = endNeeded; while (batchEndSample + samplesPerTok <= availableSamples) { batchEndSample += samplesPerTok; } const chunkAudio = getVoxtralAudio().slice(startIdx, batchEndSample); const chunkInputs = await voxtralProcessor(chunkAudio, { is_streaming: true, is_first_audio_chunk: false, }); yield chunkInputs.input_features; melFrameIdx += chunkInputs.input_features.dims[2]; startIdx = melFrameIdx * hop_length - winHalf; } } try { voxtralTranscript.innerHTML = 'Transcribing...'; // Pass input_ids and input_features separately (matching reference) await voxtralModel.generate({ input_ids: firstChunkInputs.input_ids, input_features: inputFeaturesGenerator(), max_new_tokens: 4096, streamer: streamer, }); const elapsed = ((Date.now() - voxtralStartTime) / 1000).toFixed(1); voxtralTiming.textContent = `Processing time: ${elapsed}s (real-time, browser)`; if (fullText.trim()) { voxtralTranscript.innerHTML = `
${escapeHtml(fullText)}
`; // Browser-only diarization (Xenova's method, pyannote segmentation ONNX) // Voxtral is fully standalone - no server calls, max 3 speakers // Runs on full audio at end, not chunked if (segmentationModel && segmentationProcessor) { try { voxtralTranscript.innerHTML += '
Analyzing speakers (browser)...
'; const audio16k = getVoxtralAudio(); const inputs = await segmentationProcessor(audio16k); const { logits } = await segmentationModel(inputs); const diarSegs = segmentationProcessor.post_process_speaker_diarization(logits, audio16k.length)[0]; const speakerSet = new Set(); const labeled = []; for (const seg of diarSegs) { const label = segmentationModel.config.id2label[seg.id]; if (label === 'NO_SPEAKER') continue; speakerSet.add(label); labeled.push({start: seg.start, end: seg.end, label}); } if (speakerSet.size >= 2 && labeled.length > 0) { const merged = [labeled[0]]; for (let i = 1; i < labeled.length; i++) { const prev = merged[merged.length - 1]; if (labeled[i].label === prev.label && labeled[i].start - prev.end < 0.5) { prev.end = labeled[i].end; } else { merged.push({...labeled[i]}); } } let diarText = speakerSet.size + ' speakers detected (browser):\n'; for (const seg of merged) { diarText += '\n[' + fmtTime(seg.start) + ' - ' + fmtTime(seg.end) + '] ' + seg.label; } voxtralTranscript.textContent = diarText + '\n\n' + fullText; } } catch (diarErr) { console.warn("[Voxtral Diarization]", diarErr); } } } else { voxtralTranscript.innerHTML = 'No speech detected.'; } } catch (err) { console.error("[Voxtral] Transcription error:", err); voxtralTranscript.innerHTML = `Transcription error: ${escapeHtml(err.message)}`; } // Energy-based source attribution: compare mic vs screen energy per time window if (voxtralDualTrack && voxtralMicLength > 0 && voxtralScreenLength > 0 && fullText.trim()) { try { const micAudio = getVoxtralMicAudio(); const screenAudio = getVoxtralScreenAudio(); const sr = 16000; const windowSize = Math.floor(sr * 0.5); // 0.5s windows // Compute RMS energy per window function rms(buf, start, len) { let sum = 0; const end = Math.min(start + len, buf.length); for (let i = start; i < end; i++) sum += buf[i] * buf[i]; return Math.sqrt(sum / (end - start || 1)); } // Build source timeline const segments = []; const maxLen = Math.max(micAudio.length, screenAudio.length); for (let i = 0; i < maxLen; i += windowSize) { const micE = i < micAudio.length ? rms(micAudio, i, windowSize) : 0; const scrE = i < screenAudio.length ? rms(screenAudio, i, windowSize) : 0; const t = i / sr; if (micE < 0.005 && scrE < 0.005) continue; // silence const src = micE >= scrE ? 'YOU' : 'SCREEN'; if (segments.length > 0 && segments[segments.length - 1].src === src) { segments[segments.length - 1].end = t + 0.5; } else { segments.push({src, start: t, end: t + 0.5}); } } if (segments.length > 1) { // Split transcript proportionally by segment duration const totalDur = segments.reduce((s, seg) => s + (seg.end - seg.start), 0); const words = fullText.trim().split(/\s+/); const totalWords = words.length; let output = '', wordIdx = 0; for (const seg of segments) { const dur = seg.end - seg.start; const nWords = Math.max(1, Math.round(totalWords * dur / totalDur)); const chunk = words.slice(wordIdx, wordIdx + nWords).join(' '); wordIdx += nWords; if (!chunk) continue; const m = Math.floor(seg.start / 60), s = Math.floor(seg.start % 60); output += seg.src + ' [' + String(m).padStart(2,'0') + ':' + String(s).padStart(2,'0') + ']: ' + chunk + '\n'; } if (wordIdx < totalWords) output += words.slice(wordIdx).join(' '); voxtralTranscript.textContent = output.trim(); } } catch (energyErr) { console.warn("[Voxtral Energy]", energyErr); } } voxtralIsRunning = false; } // -- Record button ----------------------------------------------------------- recordButton.addEventListener("click", () => { if (isRecording) { stopRecording(); } else { startRecording(); } }); // -- Start recording --------------------------------------------------------- async function startRecording() { whisperTranscript.innerHTML = ""; voxtralTranscript.innerHTML = ""; whisperTiming.textContent = ""; voxtralTiming.textContent = ""; parakeetTranscript.innerHTML = ""; parakeetTiming.textContent = ""; nemotronTranscript.innerHTML = ""; nemotronTiming.textContent = ""; setStatus("Starting..."); window._parakeetData = null; window._diarSegments = null; window._diarNumSpeakers = 0; // For voxtral, check WebGPU and load model first if (activeEngines.has("voxtral")) { const gpuOk = await checkWebGPU(); if (!gpuOk) { setStatus("WebGPU not available. Cannot use Voxtral Realtime.", "error"); return; } const loaded = await loadVoxtralModel(); if (!loaded) { setStatus("Failed to load Voxtral model.", "error"); return; } } try { micStream = await navigator.mediaDevices.getUserMedia({ audio: true }); if (screenAudioToggle.checked) { try { displayStream = await navigator.mediaDevices.getDisplayMedia({ video: true, audio: true }); displayStream.getVideoTracks().forEach((t) => t.stop()); } catch (err) { console.warn("[Recorder] Screen audio not available:", err); setStatus("Screen audio denied - using mic only", "error"); displayStream = null; } } audioContext = new AudioContext(); const dest = audioContext.createMediaStreamDestination(); const micSource = audioContext.createMediaStreamSource(micStream); micSource.connect(dest); if (displayStream && displayStream.getAudioTracks().length > 0) { const displaySource = audioContext.createMediaStreamSource(displayStream); displaySource.connect(dest); } mixedStream = dest.stream; analyserNode = audioContext.createAnalyser(); analyserNode.fftSize = 256; micSource.connect(analyserNode); const mimeType = MediaRecorder.isTypeSupported("audio/webm;codecs=opus") ? "audio/webm;codecs=opus" : "audio/webm"; audioChunks = []; micChunks = []; screenChunks = []; mediaRecorder = new MediaRecorder(mixedStream, { mimeType }); mediaRecorder.ondataavailable = (e) => { if (e.data.size > 0) { audioChunks.push(e.data); if (activeEngines.has("whisper") && websocket && websocket.readyState === WebSocket.OPEN) { websocket.send(e.data); } } }; // Separate mic/screen recorders for routing-based speaker separation if (!diarizeToggle.checked && displayStream && displayStream.getAudioTracks().length > 0) { micRecorder = new MediaRecorder(micStream, { mimeType }); micRecorder.ondataavailable = (e) => { if (e.data.size > 0) micChunks.push(e.data); }; const screenDest = audioContext.createMediaStreamDestination(); const screenSrc = audioContext.createMediaStreamSource(displayStream); screenSrc.connect(screenDest); screenRecorder = new MediaRecorder(screenDest.stream, { mimeType }); screenRecorder.ondataavailable = (e) => { if (e.data.size > 0) screenChunks.push(e.data); }; micRecorder.start(250); screenRecorder.start(250); } mediaRecorder.onstop = () => { onRecordingStopped(); }; if (activeEngines.has("whisper")) { await connectWebSocket(); } // Start Voxtral recording with its own AudioContext at 16kHz if (activeEngines.has("voxtral")) { if (await checkWebGPU()) { const loaded = voxtralModel ? true : await loadVoxtralModel(); const dualVoxtral = !diarizeToggle.checked && displayStream && displayStream.getAudioTracks().length > 0; if (loaded) await startVoxtralRecording(mixedStream || micStream, dualVoxtral ? micStream : null, dualVoxtral ? displayStream : null); } } mediaRecorder.start(250); isRecording = true; recordButton.classList.add("recording"); recordingStartTime = Date.now(); startTimer(); startWaveform(); setStatus("Recording..."); } catch (err) { console.error("[Recorder] Start error:", err); setStatus("Failed to start: " + err.message, "error"); cleanupStreams(); } } // -- Stop recording ---------------------------------------------------------- function stopRecording() { if (!isRecording) return; isRecording = false; recordButton.classList.remove("recording"); stopTimer(); stopWaveform(); if (websocket && websocket.readyState === WebSocket.OPEN) { websocket.send(new Blob([])); } // Stop Voxtral recording if (activeEngines.has("voxtral")) { stopVoxtralRecording(); } if (mediaRecorder && mediaRecorder.state !== "inactive") { mediaRecorder.stop(); } if (micRecorder && micRecorder.state !== "inactive") { micRecorder.stop(); } if (screenRecorder && screenRecorder.state !== "inactive") { screenRecorder.stop(); } setStatus("Processing..."); } // -- After recording stops --------------------------------------------------- async function onRecordingStopped() { // Let WhisperLiveKit finish in background (don't block batch engines) const whisperDone = (websocket && websocket.readyState === WebSocket.OPEN) ? new Promise((resolve) => { const timeout = setTimeout(() => { resolve(); }, 300000); const origHandler = websocket.onmessage; websocket.onmessage = (event) => { if (origHandler) origHandler(event); try { const data = JSON.parse(event.data); if (data.type === "ready_to_stop") { clearTimeout(timeout); resolve(); } } catch(e) {} }; }).then(() => { if (websocket && websocket.readyState === WebSocket.OPEN) { websocket.close(); } websocket = null; }) : Promise.resolve(); // Batch transcription for Parakeet/Nemotron const baseUrl = (window.location.origin !== 'null' && window.location.host) ? '' : window.parent.location.origin; // Routing-based separation: when Speaker detection OFF + screen audio captured const useRouting = !diarizeToggle.checked && micChunks.length > 0 && screenChunks.length > 0; console.log('[Routing] diarize:', diarizeToggle.checked, 'micChunks:', micChunks.length, 'screenChunks:', screenChunks.length, 'useRouting:', useRouting); const batchEngines = []; if (activeEngines.has('parakeet')) batchEngines.push({endpoint: '/parakeet-transcribe', el: parakeetTranscript, tim: parakeetTiming}); if (activeEngines.has('nemotron')) batchEngines.push({endpoint: '/nemotron-transcribe', el: nemotronTranscript, tim: nemotronTiming}); if (batchEngines.length > 0 && useRouting) { // Dual-track: transcribe mic and screen separately in parallel const micBlob = new Blob(micChunks, { type: 'audio/webm' }); const screenBlob = new Blob(screenChunks, { type: 'audio/webm' }); const promises = batchEngines.map(async ({endpoint, el, tim}) => { el.innerHTML = ' Transcribing mic + screen separately...'; const t0 = Date.now(); try { const [micResp, screenResp] = await Promise.all([ fetch(baseUrl + endpoint, { method: 'POST', body: micBlob }), fetch(baseUrl + endpoint, { method: 'POST', body: screenBlob }) ]); const micData = await micResp.json(); const screenData = await screenResp.json(); const elapsed = ((Date.now() - t0) / 1000).toFixed(1); tim.textContent = 'Processing time: ' + elapsed + 's (dual-track, server CPU)'; // Merge both tracks by timestamps, interleaved const micTokens = micData.tokens || []; const micTimestamps = micData.timestamps || []; const screenTokens = screenData.tokens || []; const screenTimestamps = screenData.timestamps || []; // Build word arrays with source label function buildWords(tokens, timestamps, label) { const words = []; let curWord = '', curStart = 0; for (let i = 0; i < tokens.length; i++) { const tok = tokens[i], ts = timestamps[i]; if (tok.startsWith(' ') || tok.startsWith('\n')) { if (curWord.trim()) words.push({text: curWord.trim(), start: curStart, label}); curWord = tok; curStart = ts; } else { if (!curWord) curStart = ts; curWord += tok; } } if (curWord.trim()) words.push({text: curWord.trim(), start: curStart, label}); return words; } const allWords = [ ...buildWords(micTokens, micTimestamps, 'YOU'), ...buildWords(screenTokens, screenTimestamps, 'SCREEN') ].sort((a, b) => a.start - b.start); // Merge consecutive same-label words into segments let output = ''; if (allWords.length > 0) { let cur = {label: allWords[0].label, start: allWords[0].start, text: allWords[0].text}; for (let i = 1; i < allWords.length; i++) { if (allWords[i].label === cur.label) { cur.text += ' ' + allWords[i].text; } else { const m = Math.floor(cur.start / 60), s = Math.floor(cur.start % 60); output += cur.label + ' [' + String(m).padStart(2,'0') + ':' + String(s).padStart(2,'0') + ']: ' + cur.text + '\n'; cur = {label: allWords[i].label, start: allWords[i].start, text: allWords[i].text}; } } const m = Math.floor(cur.start / 60), s = Math.floor(cur.start % 60); output += cur.label + ' [' + String(m).padStart(2,'0') + ':' + String(s).padStart(2,'0') + ']: ' + cur.text; } el.textContent = output.trim() || (micData.text || '') + '\n' + (screenData.text || '') || 'No speech detected.'; if (endpoint.includes('parakeet') && micData.tokens) window._parakeetData = micData; } catch (err) { el.innerHTML = 'Error: ' + escapeHtml(err.message) + ''; } }); await Promise.all(promises); } else if (batchEngines.length > 0) { // Normal: single mixed audio const blob = new Blob(audioChunks, { type: 'audio/webm' }); const promises = batchEngines.map(async ({endpoint, el, tim}) => { el.innerHTML = ' Transcribing...'; const t0 = Date.now(); try { const resp = await fetch(baseUrl + endpoint, { method: 'POST', body: blob }); const data = await resp.json(); const elapsed = ((Date.now() - t0) / 1000).toFixed(1); tim.textContent = 'Processing time: ' + elapsed + 's (server CPU)'; el.innerHTML = '
' + escapeHtml(data.text) + '
'; if (endpoint.includes('parakeet') && data.tokens) window._parakeetData = data; } catch (err) { el.innerHTML = 'Error: ' + escapeHtml(err.message) + ''; } }); await Promise.all(promises); } // Run diarization on recorded audio (if speaker detection enabled) if (audioChunks.length > 0 && diarizeToggle.checked) { const blob = new Blob(audioChunks, { type: 'audio/webm' }); try { setStatus("Analyzing speakers..."); const diarUrl = baseUrl + '/diarize'; console.log("[Diarization] Posting to:", diarUrl, "blob size:", blob.size); const resp = await fetch(diarUrl, { method: 'POST', body: blob }); if (!resp.ok) { console.warn("[Diarization] Server error:", resp.status, await resp.text().catch(() => '')); } else { const data = await resp.json(); console.log("[Diarization] Result:", data.num_speakers, "speakers,", (data.segments||[]).length, "segments"); if (data.segments && data.segments.length > 0) { window._diarSegments = data.segments; window._diarNumSpeakers = data.num_speakers || 0; applyDiarization(); } } } catch (err) { console.warn("[Diarization] Error:", err); } } // Offer audio download (before cleanup clears chunks) if (audioChunks.length > 0) { const audioBlob = new Blob(audioChunks, { type: 'audio/webm' }); const url = URL.createObjectURL(audioBlob); const dl = document.getElementById('audioDownload'); if (dl) { if (dl.href) URL.revokeObjectURL(dl.href); dl.href = url; dl.download = 'recording_' + new Date().toISOString().slice(0,19).replace(/:/g,'-') + '.webm'; dl.style.display = 'inline-block'; } } // Wait for WhisperLiveKit to finish (runs in parallel with batch engines) await whisperDone; cleanupStreams(); setStatus("Done.", "success"); // Short recording hint for WhisperLiveKit if (activeEngines.has("whisper")) { setTimeout(() => { if (whisperTranscript.textContent.trim() === "") { whisperTranscript.innerHTML = '
Tip: Record for 20+ seconds for best results with large models on CPU
'; } }, 3000); } } // -- WebSocket (WhisperLiveKit) ---------------------------------------------- function connectWebSocket() { return new Promise((resolve, reject) => { // Use parent window's host (iframe from document.write has about:blank origin) const host = window.location.host || window.parent.location.host; const proto = (window.location.protocol === "https:" || window.parent.location.protocol === "https:") ? "wss:" : "ws:"; const wsUrl = proto + "//" + host + "/asr"; websocket = new WebSocket(wsUrl); let whisperStartTime = Date.now(); websocket.onopen = () => { console.log("[WS] Connected"); whisperStartTime = Date.now(); resolve(); }; websocket.onmessage = (event) => { try { const data = JSON.parse(event.data); if (data.type === "config") return; if (data.type === "ready_to_stop") { const elapsed = ((Date.now() - whisperStartTime) / 1000).toFixed(1); whisperTiming.textContent = `Processing time: ${elapsed}s (real-time)`; return; } renderWhisperResults(data); } catch (err) { console.warn("[WS] Parse error:", err); } }; websocket.onerror = (err) => { console.error("[WS] Error:", err); setStatus("WebSocket connection failed", "error"); reject(err); }; websocket.onclose = () => { console.log("[WS] Closed"); }; }); } function fmtTime(s) { if (s == null || isNaN(s) || s < 0) return ""; const m = Math.floor(s / 60), sec = Math.floor(s % 60); return String(m).padStart(2,"0") + ":" + String(sec).padStart(2,"0"); } function applyDiarization() { const segs = window._diarSegments; if (!segs || segs.length === 0) return; const numSpeakers = window._diarNumSpeakers || 0; const panels = [ {el: whisperTranscript, active: activeEngines.has('whisper'), data: null}, // Voxtral excluded: uses browser-only diarization (Xenova method), no server {el: parakeetTranscript, active: activeEngines.has('parakeet'), data: window._parakeetData}, {el: nemotronTranscript, active: activeEngines.has('nemotron'), data: null}, ]; for (const p of panels) { if (!p.active || !p.el.textContent.trim()) continue; // If we have token timestamps (Parakeet), use word-level alignment if (p.data && p.data.tokens && p.data.timestamps && p.data.tokens.length > 0) { // Reconstruct full words from subword tokens const words = []; let curWord = '', curStart = 0, curEnd = 0; for (let i = 0; i < p.data.tokens.length; i++) { const tok = p.data.tokens[i]; const ts = p.data.timestamps[i]; if (tok.startsWith(' ') || tok.startsWith('\n')) { if (curWord.trim()) words.push({text: curWord, start: curStart, end: curEnd}); curWord = tok; curStart = ts; curEnd = ts; } else { if (!curWord) curStart = ts; curWord += tok; curEnd = ts; } } if (curWord.trim()) words.push({text: curWord, start: curStart, end: curEnd}); // Assign speaker by greatest temporal overlap function bestSpeaker(ws, we) { let best = null, maxOv = 0; for (const seg of segs) { const ovS = Math.max(ws, seg.start), ovE = Math.min(we, seg.end); if (ovS < ovE && ovE - ovS > maxOv) { maxOv = ovE - ovS; best = seg.speakers.map(s => 'Speaker ' + s).join(' & '); } } return best; } // Merge consecutive same-speaker words let merged = ''; let cSpk = null, cStart = 0, cText = ''; for (const w of words) { const spk = bestSpeaker(w.start, w.end + 0.05) || cSpk || 'Unknown'; if (spk === cSpk) { cText += w.text; } else { if (cText.trim() && cSpk) merged += '\n[' + fmtTime(cStart) + '] ' + cSpk + ': ' + cText.trim(); cSpk = spk; cStart = w.start; cText = w.text; } } if (cText.trim() && cSpk) merged += '\n[' + fmtTime(cStart) + '] ' + cSpk + ': ' + cText.trim(); p.el.textContent = numSpeakers + ' speakers detected:\n' + merged.trim(); } else { // Fallback: proportional split for engines without timestamps const rawText = p.el.textContent.trim(); const words = rawText.split(/\s+/); const totalWords = words.length; const totalDur = segs.reduce((s, seg) => s + (seg.end - seg.start), 0); if (totalDur <= 0 || totalWords === 0) continue; let merged = ''; let wordIdx = 0; let lastSpeaker = ''; for (const seg of segs) { const dur = seg.end - seg.start; const nWords = Math.max(1, Math.round(totalWords * dur / totalDur)); const chunk = words.slice(wordIdx, wordIdx + nWords).join(' '); wordIdx += nWords; if (!chunk) continue; const speakers = seg.speakers.map(s => 'Speaker ' + s).join(' & '); const start = fmtTime(seg.start); if (speakers !== lastSpeaker) { merged += '\n[' + start + '] ' + speakers + ': ' + chunk; lastSpeaker = speakers; } else { merged += ' ' + chunk; } } if (wordIdx < totalWords) merged += ' ' + words.slice(wordIdx).join(' '); p.el.textContent = numSpeakers + ' speakers detected:\n' + merged.trim(); } } } function renderWhisperResults(data) { if (!data.lines && !data.buffer_transcription) return; let html = ""; if (data.lines) { for (const line of data.lines) { if (!line.text && !line.translation) continue; const tsFmt = fmtTime(line.start); const ts = tsFmt ? `[${tsFmt}]` : ""; // speaker tag only if real diarization is active (multiple speakers detected) const speakerTag = (line.speaker > 0 && data.lines.some(l => l.speaker !== line.speaker)) ? `Speaker ${line.speaker}` : ""; const text = line.text || ""; html += `
${ts}${speakerTag}${escapeHtml(text)}
`; } } if (data.buffer_transcription) { html += `${escapeHtml(data.buffer_transcription)}`; } if (data.buffer_diarization) { html += ` ${escapeHtml(data.buffer_diarization)}`; } if (html) { whisperTranscript.innerHTML = html; whisperTranscript.scrollTop = whisperTranscript.scrollHeight; } } // -- Timer ------------------------------------------------------------------- function startTimer() { timerEl.classList.add("recording"); timerInterval = setInterval(() => { const elapsed = Math.floor((Date.now() - recordingStartTime) / 1000); const mins = String(Math.floor(elapsed / 60)).padStart(2, "0"); const secs = String(elapsed % 60).padStart(2, "0"); timerEl.textContent = `${mins}:${secs}`; }, 500); } function stopTimer() { timerEl.classList.remove("recording"); if (timerInterval) { clearInterval(timerInterval); timerInterval = null; } } // -- Waveform ---------------------------------------------------------------- function startWaveform() { const ctx = waveCanvas.getContext("2d"); const bufferLength = analyserNode.frequencyBinCount; const dataArray = new Uint8Array(bufferLength); function draw() { animFrameId = requestAnimationFrame(draw); analyserNode.getByteTimeDomainData(dataArray); ctx.fillStyle = getComputedStyle(document.documentElement).getPropertyValue("--surface").trim(); ctx.fillRect(0, 0, waveCanvas.width, waveCanvas.height); ctx.lineWidth = 2; ctx.strokeStyle = getComputedStyle(document.documentElement).getPropertyValue("--accent").trim(); ctx.beginPath(); const sliceWidth = waveCanvas.width / bufferLength; let x = 0; for (let i = 0; i < bufferLength; i++) { const v = dataArray[i] / 128.0; const y = (v * waveCanvas.height) / 2; if (i === 0) ctx.moveTo(x, y); else ctx.lineTo(x, y); x += sliceWidth; } ctx.lineTo(waveCanvas.width, waveCanvas.height / 2); ctx.stroke(); } draw(); } function stopWaveform() { if (animFrameId) { cancelAnimationFrame(animFrameId); animFrameId = null; } } // -- Cleanup ----------------------------------------------------------------- function cleanupStreams() { if (micStream) { micStream.getTracks().forEach((t) => t.stop()); micStream = null; } if (displayStream) { displayStream.getTracks().forEach((t) => t.stop()); displayStream = null; } if (audioContext && audioContext.state !== "closed") { audioContext.close().catch(() => {}); audioContext = null; } mixedStream = null; analyserNode = null; audioChunks = []; micChunks = []; screenChunks = []; micRecorder = null; screenRecorder = null; voxtralAudioChunks = []; voxtralAudioLength = 0; voxtralMicChunks = []; voxtralMicLength = 0; voxtralScreenChunks = []; voxtralScreenLength = 0; _voxtralCached = new Float32Array(0); _voxtralCachedLen = 0; } // -- Helpers ----------------------------------------------------------------- function setStatus(text, type = "") { statusEl.textContent = text; statusEl.className = type; } function escapeHtml(str) { const div = document.createElement("div"); div.textContent = str; return div.innerHTML; } // -- File upload -------------------------------------------------------------- document.getElementById('fileUpload').addEventListener('change', async (e) => { const file = e.target.files[0]; if (!file) return; // Clear all panels whisperTranscript.innerHTML = 'File upload - streaming engines not available'; voxtralTranscript.innerHTML = 'File upload - browser engine not available'; parakeetTranscript.innerHTML = ''; parakeetTiming.textContent = ''; nemotronTranscript.innerHTML = ''; nemotronTiming.textContent = ''; setStatus('Processing uploaded file: ' + file.name); const baseUrl = (window.location.origin !== 'null' && window.location.host) ? '' : window.parent.location.origin; const blob = file; // Run batch engines in parallel const batchEngines = []; if (activeEngines.has('parakeet')) batchEngines.push({endpoint: '/parakeet-transcribe', el: parakeetTranscript, tim: parakeetTiming}); if (activeEngines.has('nemotron')) batchEngines.push({endpoint: '/nemotron-transcribe', el: nemotronTranscript, tim: nemotronTiming}); if (batchEngines.length > 0) { const promises = batchEngines.map(async ({endpoint, el, tim}) => { el.innerHTML = ' Transcribing...'; const t0 = Date.now(); try { const resp = await fetch(baseUrl + endpoint, { method: 'POST', body: blob }); const data = await resp.json(); const elapsed = ((Date.now() - t0) / 1000).toFixed(1); tim.textContent = 'Processing time: ' + elapsed + 's (server CPU)'; el.innerHTML = '
' + escapeHtml(data.text) + '
'; if (endpoint.includes('parakeet') && data.tokens) window._parakeetData = data; } catch (err) { el.innerHTML = 'Error: ' + escapeHtml(err.message) + ''; } }); await Promise.all(promises); } // Run diarization (if speaker detection enabled) if (diarizeToggle.checked) try { setStatus('Analyzing speakers...'); console.log('[Upload Diarization] Posting to:', baseUrl + '/diarize', 'size:', blob.size); const resp = await fetch(baseUrl + '/diarize', { method: 'POST', body: blob }); const text = await resp.text(); console.log('[Upload Diarization] Response:', resp.status, text.substring(0, 200)); if (resp.ok) { const data = JSON.parse(text); if (data.error) { console.warn('[Upload Diarization] Server error:', data.error); } else if (data.segments && data.segments.length > 0) { window._diarSegments = data.segments; window._diarNumSpeakers = data.num_speakers || 0; applyDiarization(); } } } catch (err) { console.warn('[Upload Diarization] Error:', err); } setStatus('Done.', 'success'); e.target.value = ''; // Reset file input }); // -- Copy buttons ------------------------------------------------------------- document.querySelectorAll(".copy-btn").forEach((btn) => { btn.addEventListener("click", () => { const panel = btn.closest(".result-panel"); const transcript = panel.querySelector(".transcript"); const text = transcript ? transcript.textContent.trim() : ""; if (!text) return; navigator.clipboard.writeText(text).then(() => { const origHTML = btn.innerHTML; btn.innerHTML = ' Copied!'; btn.classList.add("copied"); setTimeout(() => { btn.innerHTML = origHTML; btn.classList.remove("copied"); }, 1500); }).catch(() => {}); }); }); """ # -- Inline HTML (with embedded CSS + JS) ------------------------------------ RECORDER_HTML = f""" Transcription Comparison

Transcription Comparison - For your meeting notes!

00:00

WhisperLiveKit · Whisper large-v3-turbo Real-time

Voxtral-Mini-4B-Realtime-2602 WebGPU ONNX

Parakeet TDT v3 · 25 languages CPU ONNX

Nemotron Streaming · English only CPU ONNX int8

""" # Base64-encode the recorder HTML so we can embed it in JS without any server route _RECORDER_HTML_B64 = base64.b64encode(RECORDER_HTML.encode("utf-8")).decode("ascii") # Inject loader via trick — Gradio 6 strips