Spaces:
Running
Running
| import { Component, Show, For, createSignal, createEffect, onMount, onCleanup } from 'solid-js'; | |
| import { appStore } from './stores/appStore'; | |
| import { CompactWaveform, ModelLoadingOverlay, DebugPanel, TranscriptionDisplay, SettingsContent } from './components'; | |
| import { getModelDisplayName, MODELS } from './components/ModelLoadingOverlay'; | |
| import { AudioEngine } from './lib/audio'; | |
| import { MelWorkerClient } from './lib/audio/MelWorkerClient'; | |
| import { TranscriptionWorkerClient } from './lib/transcription'; | |
| import { HybridVAD } from './lib/vad'; | |
| import { WindowBuilder } from './lib/transcription/WindowBuilder'; | |
| import { BufferWorkerClient } from './lib/buffer'; | |
| import { TenVADWorkerClient } from './lib/vad/TenVADWorkerClient'; | |
| import type { V4ProcessResult } from './lib/transcription/TranscriptionWorkerClient'; | |
| import type { BufferWorkerConfig, TenVADResult } from './lib/buffer/types'; | |
| import { formatDuration } from './utils/time'; | |
/** Shape of a VAD reading queued for the next animation-frame flush. */
type PendingVadState = {
  isSpeech: boolean;
  energy: number;
  snr: number;
  hybridState: string;
  sileroProbability?: number;
};

// ---- Module-level singletons ----
let audioEngine: AudioEngine | null = null;
let workerClient: TranscriptionWorkerClient | null = null;
let melClient: MelWorkerClient | null = null;

// Signals that expose the audio engine / mel client to reactive consumers
export const [audioEngineSignal, setAudioEngineSignal] = createSignal<AudioEngine | null>(null);
export const [melClientSignal, setMelClientSignal] = createSignal<MelWorkerClient | null>(null);

// ---- Unsubscribe handles for audio-engine callbacks ----
let segmentUnsubscribe: (() => void) | null = null;
let windowUnsubscribe: (() => void) | null = null;
let melChunkUnsubscribe: (() => void) | null = null;
let visualizationUnsubscribe: (() => void) | undefined;

// ---- v4 pipeline instances ----
let hybridVAD: HybridVAD | null = null;
let bufferClient: BufferWorkerClient | null = null;
let tenVADClient: TenVADWorkerClient | null = null;
let windowBuilder: WindowBuilder | null = null;
let v4AudioChunkUnsubscribe: (() => void) | null = null;
let v4MelChunkUnsubscribe: (() => void) | null = null;

// ---- v4 tick-loop bookkeeping ----
let v4TickTimeout: number | undefined;
let v4TickRunning = false;
let v4InferenceBusy = false;
let v4LastInferenceTime = 0;
// Running total of audio samples seen by the v4 chunk handler (written to BufferWorker)
let v4GlobalSampleOffset = 0;

// ---- Per-frame throttling of VAD-driven UI updates ----
// Only the most recent value of each kind is kept until the next animation frame.
let pendingSileroProb: number | null = null;
let sileroUpdateScheduled = false;
let pendingVadState: PendingVadState | null = null;
let vadUpdateScheduled = false;
/**
 * Queue the latest inference-VAD probability for display.
 *
 * Only the newest value is retained; the store is written at most once per
 * animation frame, merging the probability into the current VAD state.
 */
const scheduleSileroUpdate = (prob: number) => {
  pendingSileroProb = prob;
  if (!sileroUpdateScheduled) {
    sileroUpdateScheduled = true;
    requestAnimationFrame(function flushSileroProbability() {
      sileroUpdateScheduled = false;
      if (pendingSileroProb !== null) {
        const previous = appStore.vadState();
        appStore.setVadState({ ...previous, sileroProbability: pendingSileroProb });
      }
    });
  }
};
/**
 * Queue a VAD state snapshot for display.
 *
 * Snapshots arriving within the same animation frame overwrite each other;
 * the survivor is merged into the store once per frame. When the snapshot
 * carries no `sileroProbability`, the value already in the store is kept.
 */
const scheduleVadStateUpdate = (next: {
  isSpeech: boolean;
  energy: number;
  snr: number;
  hybridState: string;
  sileroProbability?: number;
}) => {
  pendingVadState = next;
  if (!vadUpdateScheduled) {
    vadUpdateScheduled = true;
    requestAnimationFrame(function flushVadState() {
      vadUpdateScheduled = false;
      if (pendingVadState) {
        const previous = appStore.vadState();
        const sileroProbability = pendingVadState.sileroProbability ?? previous.sileroProbability;
        appStore.setVadState({
          ...previous,
          ...pendingVadState,
          sileroProbability,
        });
        appStore.setIsSpeechDetected(pendingVadState.isSpeech);
        pendingVadState = null;
      }
    });
  }
};
/**
 * Top application bar: brand mark, current session/model label, and the
 * debug-panel toggle.
 *
 * @param props.onToggleDebug Invoked when the bug-report button is clicked.
 */
const Header: Component<{
  onToggleDebug: () => void;
}> = (props) => {
  // Accessors (not plain values) so the JSX below stays reactive.
  const debugPanelOpen = () => appStore.showDebugPanel();
  const sessionLabel = () => {
    if (appStore.modelState() === 'ready') {
      return getModelDisplayName(appStore.selectedModelId());
    }
    return 'Session';
  };
  const debugButtonClass = () =>
    `p-2 rounded-full transition-colors ${debugPanelOpen() ? 'bg-[var(--color-earthy-muted-green)] text-white' : 'text-[var(--color-earthy-muted-green)] hover:bg-[var(--color-earthy-sage)]/30'}`;
  const debugButtonTitle = () => (debugPanelOpen() ? 'Hide debug panel' : 'Show debug panel');

  return (
    <header class="h-20 flex items-center justify-between px-8 bg-[var(--color-earthy-bg)]/80 backdrop-blur-sm z-30 shrink-0">
      {/* Left: logo + title */}
      <div class="flex items-center gap-6">
        <div class="flex items-center gap-3">
          <div class="w-10 h-10 rounded-full bg-[var(--color-earthy-muted-green)] flex items-center justify-center text-white">
            <span class="material-symbols-outlined text-xl">auto_awesome</span>
          </div>
          <div>
            <h1 class="text-lg font-semibold tracking-tight text-[var(--color-earthy-dark-brown)]">keet</h1>
            <p class="text-[10px] uppercase tracking-[0.2em] text-[var(--color-earthy-soft-brown)] font-medium">{sessionLabel()}</p>
          </div>
        </div>
      </div>
      {/* Right: action buttons */}
      <div class="flex items-center gap-4">
        <button
          type="button"
          onClick={props.onToggleDebug}
          class={debugButtonClass()}
          title={debugButtonTitle()}
          aria-label="Toggle debug panel"
        >
          <span class="material-symbols-outlined">bug_report</span>
        </button>
        <button
          type="button"
          class="p-2 text-[var(--color-earthy-muted-green)] hover:scale-110 transition-transform"
          aria-label="More options"
        >
          <span class="material-symbols-outlined">more_vert</span>
        </button>
      </div>
    </header>
  );
};
| // localStorage key under which the floating control widget's {x, y} position is persisted. | |
| const WIDGET_STORAGE_KEY = 'boncukjs-control-widget-pos'; | |
| // Width (px) assumed for the widget when clamping its x position to the viewport. | |
| // NOTE(review): presumably mirrors the `max-w-2xl` class on the widget container - confirm. | |
| const WIDGET_MAX_W = 672; | |
| // Height (px) assumed for the widget when clamping its y position to the viewport. | |
| const WIDGET_MIN_H = 80; | |
| const App: Component = () => { | |
| const [showModelOverlay, setShowModelOverlay] = createSignal(false); | |
| const [showContextPanel, setShowContextPanel] = createSignal(false); | |
| type SettingsPanelSection = 'full' | 'audio' | 'model'; | |
| const [settingsPanelSection, setSettingsPanelSection] = createSignal<SettingsPanelSection>('full'); | |
| let panelHoverCloseTimeout: number | undefined; | |
| const [workerReady, setWorkerReady] = createSignal(false); | |
| const [widgetPos, setWidgetPos] = createSignal<{ x: number; y: number } | null>(null); | |
| const [isDragging, setIsDragging] = createSignal(false); | |
| const isRecording = () => appStore.recordingState() === 'recording'; | |
| const isModelReady = () => appStore.modelState() === 'ready'; | |
| let dragStart = { x: 0, y: 0 }; | |
| let posStart = { x: 0, y: 0 }; | |
| const [windowHeight, setWindowHeight] = createSignal(typeof window !== 'undefined' ? window.innerHeight : 600); | |
/**
 * Whether the settings panel should open upward.
 * True when the widget position is unknown or its top edge sits in the
 * lower half of the window.
 */
const settingsExpandUp = () => {
  const pos = widgetPos();
  return pos === null ? true : pos.y >= windowHeight() / 2;
};
/**
 * Begin dragging the floating control widget.
 *
 * Installs window-level `mousemove`/`mouseup` listeners for the duration of
 * the drag, keeps the widget inside the viewport, and persists the final
 * position to localStorage when the button is released.
 *
 * @param e The `mousedown` event that started the interaction.
 */
const handleWidgetDragStart = (e: MouseEvent) => {
  // Only the primary button drags. A right/middle press may never deliver a
  // matching `mouseup` (e.g. when a context menu opens), which would leave the
  // widget glued to the cursor.
  if (e.button !== 0) return;
  // Let interactive controls inside the widget handle their own clicks.
  const target = e.target;
  if (target instanceof Element && target.closest('button, select, input')) return;
  e.preventDefault();
  const pos = widgetPos();
  if (!pos) return;
  setIsDragging(true);
  dragStart = { x: e.clientX, y: e.clientY };
  posStart = { ...pos };

  const onMove = (e2: MouseEvent) => {
    const dx = e2.clientX - dragStart.x;
    const dy = e2.clientY - dragStart.y;
    const w = typeof window !== 'undefined' ? window.innerWidth : 800;
    const h = typeof window !== 'undefined' ? window.innerHeight : 600;
    // Clamp so the widget's assumed footprint stays on screen.
    const newX = Math.max(0, Math.min(w - WIDGET_MAX_W, posStart.x + dx));
    const newY = Math.max(0, Math.min(h - WIDGET_MIN_H, posStart.y + dy));
    setWidgetPos({ x: newX, y: newY });
  };

  const onUp = () => {
    setIsDragging(false);
    window.removeEventListener('mousemove', onMove);
    window.removeEventListener('mouseup', onUp);
    const p = widgetPos();
    if (p && typeof localStorage !== 'undefined') {
      try {
        localStorage.setItem(WIDGET_STORAGE_KEY, JSON.stringify(p));
      } catch (_) {
        // Persisting the position is best-effort (storage may be full or blocked).
      }
    }
  };

  window.addEventListener('mousemove', onMove);
  window.addEventListener('mouseup', onUp);
};
// Close the settings panel on Escape while it is open.
createEffect(() => {
  if (!showContextPanel()) return;
  const handler = (e: KeyboardEvent) => {
    if (e.key === 'Escape') {
      e.preventDefault();
      setShowContextPanel(false);
    }
  };
  document.addEventListener('keydown', handler);
  // The return value of a Solid effect is NOT a teardown function (it is fed
  // back as the previous value on the next run), so the listener has to be
  // removed through onCleanup. This runs before the effect re-executes and
  // when the component is disposed.
  onCleanup(() => document.removeEventListener('keydown', handler));
});
// Once the model has finished loading, dismiss the panel if it was opened
// specifically for the model section.
createEffect(() => {
  if (appStore.modelState() !== 'ready') return;
  if (!showContextPanel()) return;
  if (settingsPanelSection() !== 'model') return;
  setShowContextPanel(false);
});
// One-time setup: track window height, restore (or default) the widget
// position, and create the transcription worker client with its callbacks.
onMount(() => {
  const onResize = () => setWindowHeight(window.innerHeight);
  window.addEventListener('resize', onResize);
  // onMount ignores its callback's return value, so teardown must be
  // registered explicitly.
  onCleanup(() => window.removeEventListener('resize', onResize));

  const w = window.innerWidth;
  const h = window.innerHeight;

  // Restore the persisted widget position, validating the stored JSON shape.
  let restored: { x: number; y: number } | null = null;
  try {
    const stored =
      typeof localStorage !== 'undefined' ? localStorage.getItem(WIDGET_STORAGE_KEY) : null;
    if (stored) {
      const parsed: unknown = JSON.parse(stored);
      if (typeof parsed === 'object' && parsed !== null) {
        const { x, y } = parsed as { x?: unknown; y?: unknown };
        if (typeof x === 'number' && typeof y === 'number' && Number.isFinite(x) && Number.isFinite(y)) {
          // Clamp with the same bounds used while dragging, so a position saved
          // in a larger window cannot place the widget off-screen.
          restored = {
            x: Math.max(0, Math.min(w - WIDGET_MAX_W, x)),
            y: Math.max(0, Math.min(h - WIDGET_MIN_H, y)),
          };
        }
      }
    }
  } catch (_) {
    // Storage unavailable or payload corrupt: fall back to the default position.
  }

  setWidgetPos(
    restored ?? {
      x: Math.max(0, (w - WIDGET_MAX_W) / 2),
      y: h - 140,
    },
  );

  // Wire the transcription worker's events into the app store.
  workerClient = new TranscriptionWorkerClient();
  workerClient.onModelProgress = (p) => {
    appStore.setModelProgress(p.progress);
    appStore.setModelMessage(p.message || '');
    if (p.file) appStore.setModelFile(p.file);
  };
  workerClient.onModelStateChange = (s) => {
    appStore.setModelState(s);
  };
  workerClient.onV3Confirmed = (text) => {
    appStore.setTranscript(text);
  };
  workerClient.onV3Pending = (text) => {
    appStore.setPendingText(text);
  };
  workerClient.onError = (msg) => {
    appStore.setErrorMessage(msg);
  };

  appStore.refreshDevices();
  setWorkerReady(true);
});
| // No longer auto-show blocking model overlay; model selection is in the settings panel. | |
| // createEffect(() => { ... setShowModelOverlay(true); }); | |
// Component teardown: cancel timers, detach subscriptions, and dispose workers.
onCleanup(() => {
  clearTimeout(panelHoverCloseTimeout);
  visualizationUnsubscribe?.();
  visualizationUnsubscribe = undefined;
  cleanupV4Pipeline();
  // These are module-level singletons: clear the references after disposing so
  // a later mount does not pick up an already-disposed client through the
  // `if (!melClient)` reuse check.
  melClient?.dispose();
  melClient = null;
  setMelClientSignal(null);
  workerClient?.dispose();
  workerClient = null;
});
| // ---- v4 pipeline tick: periodic window building + inference ---- | |
| let v4TickCount = 0; | |
| let v4ModelNotReadyLogged = false; | |
/**
 * One iteration of the v4 pipeline: decide whether the audio since the mature
 * cursor contains speech, then either flush on prolonged silence or build a
 * window, run inference on its mel features, and publish the result.
 *
 * Returns early (doing nothing) when any collaborator is missing, an inference
 * is already in flight, the model is not ready, or the minimum interval since
 * the last inference has not elapsed.
 */
const v4Tick = async () => {
  // Snapshot the collaborators. The module-level references are nulled by
  // cleanupV4Pipeline(), which can run while this function is suspended at an
  // await; dereferencing the globals afterwards would throw.
  const worker = workerClient;
  const builder = windowBuilder;
  const engine = audioEngine;
  const buffer = bufferClient;
  if (!worker || !builder || !engine || !buffer || v4InferenceBusy) return;

  // True once the pipeline this tick started with has been torn down or replaced.
  const pipelineReplaced = () => bufferClient !== buffer || windowBuilder !== builder;

  // Skip inference if model is not ready (but still allow audio/mel/VAD to process)
  if (appStore.modelState() !== 'ready') {
    if (!v4ModelNotReadyLogged) {
      console.log('[v4Tick] Model not ready yet - audio is being captured and preprocessed');
      v4ModelNotReadyLogged = true;
    }
    return;
  }
  // Reset the flag once model becomes ready
  if (v4ModelNotReadyLogged) {
    console.log('[v4Tick] Model is now ready - starting inference');
    v4ModelNotReadyLogged = false;
    // Initialize the v4 service now that model is ready
    await worker.initV4Service({ debug: false });
    if (pipelineReplaced()) return;
  }

  v4TickCount++;
  const now = performance.now();
  // Use the store's configurable inference interval (minus a small margin for the tick jitter)
  const minInterval = Math.max(200, appStore.v4InferenceIntervalMs() - 100);
  if (now - v4LastInferenceTime < minInterval) return;

  // Check for speech between the mature cursor and the newest sample via the BufferWorker.
  const cursorSample = builder.getMatureCursorFrame(); // frame === sample in our pipeline
  const currentSample = v4GlobalSampleOffset;
  const startSample = cursorSample > 0 ? cursorSample : 0;
  let hasSpeech = false;
  if (currentSample > startSample) {
    // Energy VAD first (always available)
    const energyResult = await buffer.hasSpeech('energyVad', startSample, currentSample, 0.3);
    if (pipelineReplaced()) return;
    if (tenVADClient?.isReady()) {
      // When inference VAD is ready, require BOTH layers to agree (AND logic)
      const inferenceResult = await buffer.hasSpeech('inferenceVad', startSample, currentSample, 0.5);
      if (pipelineReplaced()) return;
      hasSpeech = energyResult.hasSpeech && inferenceResult.hasSpeech;
    } else {
      // Fall back to energy-only if inference VAD is not available
      hasSpeech = energyResult.hasSpeech;
    }
  }

  // Diagnostics: first five ticks, then every 20th
  if (v4TickCount <= 5 || v4TickCount % 20 === 0) {
    const vadState = appStore.vadState();
    const rb = engine.getRingBuffer();
    const rbFrame = rb.getCurrentFrame();
    const rbBase = rb.getBaseFrameOffset();
    console.log(
      `[v4Tick #${v4TickCount}] hasSpeech=${hasSpeech}, vadState=${vadState.hybridState}, ` +
      `energy=${vadState.energy.toFixed(4)}, inferenceVAD=${(vadState.sileroProbability || 0).toFixed(2)}, ` +
      `samples=[${startSample}:${currentSample}], ` +
      `ringBuf=[base=${rbBase}, head=${rbFrame}, avail=${rbFrame - rbBase}]`
    );
  }

  // Periodic buffer worker state dump (every 40 ticks)
  if (v4TickCount % 40 === 0) {
    try {
      const state = await buffer.getState();
      const layerSummary = Object.entries(state.layers)
        .map(([id, l]) => `${id}:${l.fillCount}/${l.maxEntries}@${l.currentSample}`)
        .join(', ');
      console.log(`[v4Tick #${v4TickCount}] BufferState: ${layerSummary}`);
    } catch (_) { /* ignore state query errors */ }
    if (pipelineReplaced()) return;
  }

  if (!hasSpeech) {
    // Check for silence-based flush using BufferWorker
    const silenceDuration = await buffer.getSilenceTailDuration('energyVad', 0.3);
    if (pipelineReplaced()) return;
    if (silenceDuration >= appStore.v4SilenceFlushSec()) {
      // Flush pending sentence via timeout finalization
      try {
        const flushResult = await worker.v4FinalizeTimeout();
        if (flushResult) {
          appStore.setMatureText(flushResult.matureText);
          appStore.setImmatureText(flushResult.immatureText);
          appStore.setMatureCursorTime(flushResult.matureCursorTime);
          appStore.setTranscript(flushResult.fullText);
          appStore.appendV4SentenceEntries(flushResult.matureSentences);
          appStore.setV4MergerStats({
            sentencesFinalized: flushResult.matureSentenceCount,
            cursorUpdates: flushResult.stats?.matureCursorUpdates || 0,
            utterancesProcessed: flushResult.stats?.utterancesProcessed || 0,
          });
          // Advance window builder cursor
          builder.advanceMatureCursorByTime(flushResult.matureCursorTime);
        }
      } catch (err) {
        console.error('[v4Tick] Flush error:', err);
      }
    }
    return;
  }

  // Build window from cursor to current position.
  // (Named `audioWindow` so the global `window` is not shadowed.)
  const audioWindow = builder.buildWindow();
  if (!audioWindow) {
    if (v4TickCount <= 10 || v4TickCount % 20 === 0) {
      const rb = engine.getRingBuffer();
      const rbHead = rb.getCurrentFrame();
      const rbBase = rb.getBaseFrameOffset();
      console.log(
        `[v4Tick #${v4TickCount}] buildWindow=null, ` +
        `ringBuf=[base=${rbBase}, head=${rbHead}, avail=${rbHead - rbBase}], ` +
        `cursor=${builder.getMatureCursorFrame()}`
      );
    }
    return;
  }
  console.log(`[v4Tick #${v4TickCount}] Window [${audioWindow.startFrame}:${audioWindow.endFrame}] ${audioWindow.durationSeconds.toFixed(2)}s (initial=${audioWindow.isInitial})`);

  v4InferenceBusy = true;
  v4LastInferenceTime = now;
  try {
    const inferenceStart = performance.now();

    // Get mel features for the window
    const mel = melClient;
    let features: { features: Float32Array; T: number; melBins: number } | null = null;
    if (mel) {
      features = await mel.getFeatures(audioWindow.startFrame, audioWindow.endFrame);
    }
    // No features available: nothing to infer on (busy flag is released in `finally`).
    if (!features) return;

    // Calculate time offset for absolute timestamps
    const timeOffset = audioWindow.startFrame / 16000;
    // Calculate incremental cache parameters
    const cursorFrame = builder.getMatureCursorFrame();
    const prefixSeconds = cursorFrame > 0 ? (audioWindow.startFrame - cursorFrame) / 16000 : 0;

    const result: V4ProcessResult = await worker.processV4ChunkWithFeatures({
      features: features.features,
      T: features.T,
      melBins: features.melBins,
      timeOffset,
      endTime: audioWindow.endFrame / 16000,
      segmentId: `v4_${Date.now()}`,
      incrementalCache: prefixSeconds > 0 ? {
        cacheKey: 'v4-stream',
        prefixSeconds,
      } : undefined,
    });
    const inferenceMs = performance.now() - inferenceStart;

    // Update UI state
    appStore.setMatureText(result.matureText);
    appStore.setImmatureText(result.immatureText);
    appStore.setTranscript(result.fullText);
    appStore.setPendingText(result.immatureText);
    appStore.appendV4SentenceEntries(result.matureSentences);
    appStore.setInferenceLatency(inferenceMs);

    // Update RTF (skip for a zero-length window to avoid publishing Infinity/NaN)
    const audioDurationMs = audioWindow.durationSeconds * 1000;
    if (audioDurationMs > 0) {
      appStore.setRtf(inferenceMs / audioDurationMs);
    }

    // Advance cursor if merger advanced it
    if (result.matureCursorTime > builder.getMatureCursorTime()) {
      appStore.setMatureCursorTime(result.matureCursorTime);
      builder.advanceMatureCursorByTime(result.matureCursorTime);
      builder.markSentenceEnd(Math.round(result.matureCursorTime * 16000));
    }

    // Update stats
    appStore.setV4MergerStats({
      sentencesFinalized: result.matureSentenceCount,
      cursorUpdates: result.stats?.matureCursorUpdates || 0,
      utterancesProcessed: result.stats?.utterancesProcessed || 0,
    });

    // Update buffer metrics
    const ring = engine.getRingBuffer();
    appStore.setBufferMetrics({
      fillRatio: ring.getFillCount() / ring.getSize(),
      latencyMs: (ring.getFillCount() / 16000) * 1000,
    });

    // Update metrics
    if (result.metrics) {
      appStore.setSystemMetrics({
        throughput: 0,
        modelConfidence: 0,
      });
    }
  } catch (err) {
    console.error('[v4Tick] Inference error:', err);
  } finally {
    v4InferenceBusy = false;
  }
};
| // ---- Cleanup v4 pipeline resources ---- | |
/**
 * Tear down everything the v4 pipeline owns: the tick loop, audio
 * subscriptions, VAD and buffer workers, the window builder, and the
 * per-session counters.
 */
const cleanupV4Pipeline = () => {
  // Stop the tick loop first so no new tick is scheduled during teardown.
  v4TickRunning = false;
  if (v4TickTimeout) {
    clearTimeout(v4TickTimeout);
    v4TickTimeout = undefined;
  }

  // Detach audio subscriptions.
  v4AudioChunkUnsubscribe?.();
  v4AudioChunkUnsubscribe = null;
  v4MelChunkUnsubscribe?.();
  v4MelChunkUnsubscribe = null;

  // Release VAD / buffer resources.
  hybridVAD = null;
  tenVADClient?.dispose();
  tenVADClient = null;
  bufferClient?.dispose();
  bufferClient = null;
  windowBuilder = null;

  // Reset per-session bookkeeping.
  v4InferenceBusy = false;
  v4LastInferenceTime = 0;
  v4GlobalSampleOffset = 0;
};
/**
 * Start or stop a recording session, depending on the current recording state.
 *
 * Stop: updates the UI first, then tears down the audio engine subscriptions
 * and v4 pipeline, asks the worker for the final transcript, and resets the
 * mel client and audio engine. Each step is isolated so that one failing step
 * cannot skip the others.
 *
 * Start: creates or reconfigures the AudioEngine, wires the pipeline for the
 * selected transcription mode (v4 utterance, v3 streaming, or v2 per-segment),
 * starts capture and visualization. If anything fails, everything that was
 * set up is rolled back and the error is surfaced through the store.
 */
const toggleRecording = async () => {
  /** Detach the v2/v3 audio-engine subscriptions and forget their handles. */
  const releaseStreamSubscriptions = () => {
    segmentUnsubscribe?.();
    segmentUnsubscribe = null;
    windowUnsubscribe?.();
    windowUnsubscribe = null;
    melChunkUnsubscribe?.();
    melChunkUnsubscribe = null;
  };

  if (isRecording()) {
    // Update UI immediately so the stop button always takes effect even if cleanup throws
    visualizationUnsubscribe?.();
    visualizationUnsubscribe = undefined;
    appStore.stopRecording();
    appStore.setAudioLevel(0);
    appStore.setBarLevels(new Float32Array(0));

    /** Run one cleanup step; log and continue on failure. */
    const safely = (step: () => void) => {
      try {
        step();
      } catch (err) {
        console.warn('[App] Error during stop recording cleanup:', err);
      }
    };

    safely(() => audioEngine?.stop());
    safely(releaseStreamSubscriptions);
    safely(cleanupV4Pipeline);

    try {
      if (workerClient) {
        const final = await workerClient.finalize();
        let text = '';
        if ('text' in final && typeof final.text === 'string') {
          text = final.text;
        } else if ('fullText' in final && typeof final.fullText === 'string') {
          text = final.fullText;
        }
        appStore.setTranscript(text);
        appStore.setPendingText('');
      }
    } catch (err) {
      console.warn('[App] Error during stop recording cleanup:', err);
    }

    // Always reset, even if finalize failed, so the next session starts clean.
    safely(() => melClient?.reset());
    safely(() => audioEngine?.reset());
    return;
  }

  /** Ensure a mel worker client exists and is initialised; drop it if init fails. */
  const initMelClient = async () => {
    let mel = melClient;
    if (!mel) {
      mel = new MelWorkerClient();
      melClient = mel;
      setMelClientSignal(mel);
    }
    try {
      await mel.init({ nMels: 128 });
    } catch (e) {
      console.warn('[App] Mel worker init failed:', e);
      mel.dispose();
      melClient = null;
      setMelClientSignal(null);
    }
  };

  let engineStarted = false;
  try {
    let engine = audioEngine;
    if (!engine) {
      engine = new AudioEngine({
        sampleRate: 16000,
        deviceId: appStore.selectedDeviceId(),
      });
      audioEngine = engine;
      setAudioEngineSignal(engine);
    } else {
      engine.updateConfig({ deviceId: appStore.selectedDeviceId() });
      engine.reset();
    }

    const mode = appStore.transcriptionMode();

    // v4 mode: Always start audio capture, mel preprocessing, and VAD
    // Inference only runs when model is ready (checked in v4Tick)
    if (mode === 'v4-utterance') {
      // ---- v4: Utterance-based pipeline with BufferWorker + TEN-VAD ----
      // Initialize merger in worker only if model is ready
      if (isModelReady() && workerClient) {
        await workerClient.initV4Service({ debug: false });
      }

      // Initialize mel worker (always needed for preprocessing)
      await initMelClient();

      // Initialize BufferWorker (centralized multi-layer data store)
      bufferClient = new BufferWorkerClient();
      const bufferConfig: BufferWorkerConfig = {
        sampleRate: 16000,
        layers: {
          audio: { hopSamples: 1, entryDimension: 1, maxDurationSec: 120 },
          mel: { hopSamples: 160, entryDimension: 128, maxDurationSec: 120 },
          energyVad: { hopSamples: 1280, entryDimension: 1, maxDurationSec: 120 },
          inferenceVad: { hopSamples: 256, entryDimension: 1, maxDurationSec: 120 },
        },
      };
      await bufferClient.init(bufferConfig);

      // Initialize TEN-VAD worker (inference-based VAD)
      tenVADClient = new TenVADWorkerClient();
      tenVADClient.onResult((result: TenVADResult) => {
        if (!bufferClient) return;
        // Batch-write hop probabilities to inferenceVad (single worker message)
        if (result.hopCount > 0) {
          // Read the last probability before the array is handed to the worker.
          const lastProb = result.probabilities[result.hopCount - 1];
          if (bufferClient.writeBatchTransfer) {
            bufferClient.writeBatchTransfer('inferenceVad', result.probabilities, result.globalSampleOffset);
          } else {
            bufferClient.writeBatch('inferenceVad', result.probabilities, result.globalSampleOffset);
          }
          // Update UI at most once per frame with the latest probability
          scheduleSileroUpdate(lastProb);
        }
      });
      // TEN-VAD init is non-blocking; on failure the pipeline stays energy-only
      const wasmPath = `${import.meta.env.BASE_URL}wasm/`;
      tenVADClient.init({ hopSize: 256, threshold: 0.5, wasmPath }).catch((err) => {
        console.warn('[v4] TEN-VAD init failed, using energy-only:', err);
      });

      // Initialize hybrid VAD for energy-based detection (always runs, fast)
      hybridVAD = new HybridVAD({
        sileroThreshold: 0.5,
        onsetConfirmations: 2,
        offsetConfirmations: 3,
        sampleRate: 16000,
      });
      // Do NOT init Silero in HybridVAD (TEN-VAD replaces it)
      // NOTE: WindowBuilder is created AFTER audioEngine.start() below,
      // because start() may re-create the internal RingBuffer.

      // Reset global sample counter
      v4GlobalSampleOffset = 0;
      // Feed audio chunks to mel worker from the main v4 audio handler below
      v4MelChunkUnsubscribe = null;

      // Process each audio chunk: energy VAD + write to BufferWorker + forward to TEN-VAD
      v4AudioChunkUnsubscribe = engine.onAudioChunk((chunk) => {
        if (!hybridVAD || !bufferClient) return;
        const chunkOffset = v4GlobalSampleOffset;
        v4GlobalSampleOffset += chunk.length;

        // 1. Run energy VAD (synchronous, fast) and write to BufferWorker
        const vadResult = hybridVAD.processEnergyOnly(chunk);
        const energyProb = vadResult.isSpeech ? 0.9 : 0.1;
        bufferClient.writeScalar('energyVad', energyProb);

        // 2. Forward audio to mel worker (copy, keep chunk for TEN-VAD transfer)
        melClient?.pushAudioCopy(chunk);

        // 3. Forward audio to TEN-VAD worker for inference-based VAD (transfer, no copy)
        if (tenVADClient?.isReady()) {
          tenVADClient.processTransfer(chunk, chunkOffset);
        }

        // 4. Update VAD state for UI. While TEN-VAD is ready its own callback
        //    owns the probability, so it is left out of this update.
        const sileroProbability = tenVADClient?.isReady()
          ? undefined
          : (vadResult.sileroProbability || 0);
        scheduleVadStateUpdate({
          isSpeech: vadResult.isSpeech,
          energy: vadResult.energy,
          snr: vadResult.snr || 0,
          hybridState: vadResult.state,
          ...(sileroProbability !== undefined ? { sileroProbability } : {}),
        });
      });

      // Start adaptive inference tick loop (reads interval from appStore)
      // Note: v4Tick internally checks if model is ready before running inference
      v4TickRunning = true;
      const scheduleNextTick = () => {
        if (!v4TickRunning) return;
        v4TickTimeout = window.setTimeout(async () => {
          if (!v4TickRunning) return;
          try {
            await v4Tick();
          } catch (err) {
            // A failed tick must not become an unhandled rejection or end the loop.
            console.error('[v4Tick] Unhandled tick error:', err);
          }
          scheduleNextTick();
        }, appStore.v4InferenceIntervalMs());
      };
      scheduleNextTick();
    } else if (isModelReady() && workerClient) {
      // v3 and v2 modes still require model to be ready
      if (mode === 'v3-streaming') {
        // ---- v3: Fixed-window token streaming (existing) ----
        const windowDur = appStore.streamingWindow();
        const triggerInt = appStore.triggerInterval();
        const overlapDur = Math.max(1.0, windowDur - triggerInt);
        await workerClient.initV3Service({
          windowDuration: windowDur,
          overlapDuration: overlapDur,
          sampleRate: 16000,
          frameStride: appStore.frameStride(),
        });

        await initMelClient();

        melChunkUnsubscribe = engine.onAudioChunk((chunk) => {
          melClient?.pushAudioCopy(chunk);
        });

        windowUnsubscribe = engine.onWindowChunk(
          windowDur,
          overlapDur,
          triggerInt,
          async (audio, startTime) => {
            if (!workerClient) return;
            const start = performance.now();
            let result;
            if (melClient) {
              const startSample = Math.round(startTime * 16000);
              const endSample = startSample + audio.length;
              const melFeatures = await melClient.getFeatures(startSample, endSample);
              if (melFeatures) {
                result = await workerClient.processV3ChunkWithFeatures(
                  melFeatures.features,
                  melFeatures.T,
                  melFeatures.melBins,
                  startTime,
                  overlapDur,
                );
              } else {
                // Features unavailable for this span: fall back to raw audio.
                result = await workerClient.processV3Chunk(audio, startTime);
              }
            } else {
              result = await workerClient.processV3Chunk(audio, startTime);
            }
            const duration = performance.now() - start;
            const stride = appStore.triggerInterval();
            appStore.setRtf(duration / (stride * 1000));
            appStore.setInferenceLatency(duration);
            if (audioEngine) {
              const ring = audioEngine.getRingBuffer();
              appStore.setBufferMetrics({
                fillRatio: ring.getFillCount() / ring.getSize(),
                latencyMs: (ring.getFillCount() / 16000) * 1000,
              });
            }
            appStore.setMergeInfo({
              lcsLength: result.lcsLength,
              anchorValid: result.anchorValid,
              chunkCount: result.chunkCount,
              anchorTokens: result.anchorTokens
            });
          }
        );
      } else {
        // ---- v2: Per-utterance (existing) ----
        await workerClient.initService({ sampleRate: 16000 });
        segmentUnsubscribe = engine.onSpeechSegment(async (segment) => {
          if (workerClient) {
            const start = Date.now();
            const samples = engine.getRingBuffer().read(segment.startFrame, segment.endFrame);
            const result = await workerClient.transcribeSegment(samples);
            if (result.text) appStore.appendTranscript(result.text + ' ');
            appStore.setInferenceLatency(Date.now() - start);
          }
        });
      }
    }

    await engine.start();
    engineStarted = true;

    // Create WindowBuilder AFTER start() so we get the final RingBuffer reference
    // (AudioEngine.init() re-creates the RingBuffer internally)
    if (mode === 'v4-utterance') {
      windowBuilder = new WindowBuilder(
        engine.getRingBuffer(),
        null, // No VADRingBuffer; hasSpeech now goes through BufferWorker
        {
          sampleRate: 16000,
          minDurationSec: 3.0,
          maxDurationSec: 30.0,
          minInitialDurationSec: 1.5,
          useVadBoundaries: false, // VAD boundaries now managed by BufferWorker
          vadSilenceThreshold: 0.3,
          debug: true, // Enable debug logging for diagnostics
        }
      );
    }

    appStore.startRecording();

    // Use same 30fps tick (onVisualizationUpdate throttled to 33ms).
    // Bar levels from AnalyserNode (native FFT, low CPU) instead of mel worker.
    visualizationUnsubscribe = engine.onVisualizationUpdate((_data, metrics) => {
      appStore.setAudioLevel(metrics.currentEnergy);
      if (appStore.transcriptionMode() !== 'v4-utterance') {
        appStore.setIsSpeechDetected(engine.isSpeechActive());
      }
      appStore.setBarLevels(engine.getBarLevels());
    });
  } catch (err) {
    // Roll back whatever was set up before the failure so no worker, timer,
    // subscription, or capture stream is left running without a session.
    try {
      visualizationUnsubscribe?.();
      visualizationUnsubscribe = undefined;
      releaseStreamSubscriptions();
      cleanupV4Pipeline();
      if (engineStarted) audioEngine?.stop();
      if (isRecording()) appStore.stopRecording();
    } catch (rollbackErr) {
      console.warn('[App] Error while rolling back failed recording start:', rollbackErr);
    }
    appStore.setErrorMessage(err instanceof Error ? err.message : String(err));
  }
};
/**
 * Load the model currently selected in the store.
 * No-op when the worker client is missing or the model is already ready or
 * loading. Opens the settings panel while loading and reports failures
 * through the store.
 */
const loadSelectedModel = async () => {
  const client = workerClient;
  if (client === null) return;

  const state = appStore.modelState();
  if (state === 'ready' || state === 'loading') return;

  setShowContextPanel(true);
  try {
    await client.initModel(appStore.selectedModelId());
  } catch (e) {
    console.error('Failed to load model:', e);
    appStore.setModelState('error');
    appStore.setErrorMessage(e instanceof Error ? e.message : String(e));
  }
};
/** Cancel any pending hover-close, then show the settings panel on `section`. */
const openPanelSection = (section: 'audio' | 'model') => {
  clearTimeout(panelHoverCloseTimeout);
  setSettingsPanelSection(section);
  setShowContextPanel(true);
};
/** Open the settings panel focused on the audio section. */
const openPanelForAudio = () => {
  openPanelSection('audio');
};
/** Open the settings panel focused on the model section. */
const openPanelForModel = () => {
  openPanelSection('model');
};
/**
 * Schedule the hover-opened settings panel to close after a short grace period.
 * The close is skipped when the panel is showing the full settings view or a
 * model is currently loading.
 */
const schedulePanelCloseIfHover = () => {
  // Replace any timer that is already pending. Without this, repeated
  // mouse-leave events stack timers and only the newest id is remembered, so
  // a later cancel cannot stop the earlier ones from closing the panel.
  clearTimeout(panelHoverCloseTimeout);
  panelHoverCloseTimeout = window.setTimeout(() => {
    if (settingsPanelSection() !== 'full' && appStore.modelState() !== 'loading') {
      setShowContextPanel(false);
    }
  }, 250);
};
/** Abort a pending hover-close of the settings panel. */
const cancelPanelClose = () => {
  clearTimeout(panelHoverCloseTimeout);
};
/** Mouse-leave handler: start the hover-close timer unless the full settings view is showing. */
const panelMouseLeave = () => {
  if (settingsPanelSection() === 'full') return;
  schedulePanelCloseIfHover();
};
/**
 * Load a model from user-selected local files.
 * Opens the settings panel while loading; on failure the error is logged and
 * surfaced through the store, the same way loadSelectedModel reports it.
 *
 * @param files Files chosen by the user, passed through to the worker client.
 */
const handleLocalLoad = async (files: FileList) => {
  if (!workerClient) return;
  setShowContextPanel(true);
  try {
    await workerClient.initLocalModel(files);
  } catch (e) {
    console.error('Failed to load local model:', e);
    // Previously the failure was only logged, leaving the UI with no
    // indication that the load had failed.
    appStore.setModelState('error');
    appStore.setErrorMessage(e instanceof Error ? e.message : String(e));
  }
};
| return ( | |
| <div class="h-screen flex flex-col overflow-hidden bg-[var(--color-earthy-bg)] selection:bg-[var(--color-earthy-coral)] selection:text-white"> | |
| <ModelLoadingOverlay | |
| isVisible={showModelOverlay()} | |
| state={appStore.modelState()} | |
| progress={appStore.modelProgress()} | |
| message={appStore.modelMessage()} | |
| file={appStore.modelFile()} | |
| backend={appStore.backend()} | |
| selectedModelId={appStore.selectedModelId()} | |
| onModelSelect={(id: string) => appStore.setSelectedModelId(id)} | |
| onStart={() => loadSelectedModel()} | |
| onLocalLoad={handleLocalLoad} | |
| onClose={() => setShowModelOverlay(false)} | |
| /> | |
| <Header | |
| onToggleDebug={() => appStore.setShowDebugPanel(!appStore.showDebugPanel())} | |
| /> | |
| <div class="flex-1 flex overflow-hidden relative"> | |
| <main class="flex-1 overflow-y-auto custom-scrollbar px-4 sm:px-6 lg:px-10 xl:px-14 2xl:px-20 flex flex-col items-center"> | |
| <div class="w-full max-w-[1680px] py-8 md:py-10 lg:py-12"> | |
| <TranscriptionDisplay | |
| confirmedText={appStore.transcriptionMode() === 'v4-utterance' ? appStore.matureText() : appStore.transcript()} | |
| pendingText={appStore.transcriptionMode() === 'v4-utterance' ? appStore.immatureText() : appStore.pendingText()} | |
| sentenceEntries={appStore.v4SentenceEntries()} | |
| isV4Mode={appStore.transcriptionMode() === 'v4-utterance'} | |
| isRecording={isRecording()} | |
| lcsLength={appStore.mergeInfo().lcsLength} | |
| anchorValid={appStore.mergeInfo().anchorValid} | |
| showConfidence={appStore.transcriptionMode() === 'v3-streaming'} | |
| class="min-h-[56vh]" | |
| /> | |
| </div> | |
| </main> | |
| </div> | |
| {/* Draggable floating control widget */} | |
| <div | |
| class={widgetPos() !== null ? 'fixed z-30 w-full max-w-2xl px-6 select-none' : 'absolute bottom-8 left-1/2 -translate-x-1/2 z-30 w-full max-w-2xl px-6'} | |
| style={widgetPos() ? { left: `${widgetPos()!.x}px`, top: `${widgetPos()!.y}px` } : {}} | |
| > | |
| <div class="relative"> | |
| {/* Settings panel: expands up or down depending on bar position vs half screen height */} | |
| <div | |
| class="absolute left-0 right-0 overflow-hidden transition-[max-height] duration-300 ease-out border border-[var(--color-earthy-sage)]/30 bg-[var(--color-earthy-bg)]/95 backdrop-blur-sm shadow-lg" | |
| classList={{ | |
| 'max-h-0': !showContextPanel(), | |
| 'max-h-[70vh]': showContextPanel(), | |
| 'bottom-full rounded-t-2xl border-b-0': settingsExpandUp(), | |
| 'top-full rounded-b-2xl border-t-0': !settingsExpandUp(), | |
| }} | |
| onMouseEnter={cancelPanelClose} | |
| onMouseLeave={panelMouseLeave} | |
| > | |
| <div class="max-h-[70vh] min-h-0 flex flex-col overflow-y-auto custom-scrollbar"> | |
| <SettingsContent | |
| section={settingsPanelSection()} | |
| onClose={() => setShowContextPanel(false)} | |
| onLoadModel={() => loadSelectedModel()} | |
| onLocalLoad={handleLocalLoad} | |
| onOpenDebug={() => appStore.setShowDebugPanel(true)} | |
| onDeviceSelect={(id) => { | |
| if (audioEngine) audioEngine.updateConfig({ deviceId: id }); | |
| }} | |
| audioEngine={audioEngineSignal() ?? undefined} | |
| expandUp={settingsExpandUp} | |
| /> | |
| </div> | |
| </div> | |
| {/* Control bar: steady, fixed position; never moves when settings open */} | |
| <div | |
| class="bg-white/90 backdrop-blur-md shadow-lg border border-[var(--color-earthy-sage)]/30 rounded-2xl overflow-hidden" | |
| onMouseDown={handleWidgetDragStart} | |
| role="presentation" | |
| > | |
| <div class="p-4 flex items-center justify-between gap-6 cursor-grab active:cursor-grabbing"> | |
| <div class="flex items-center gap-2 flex-shrink-0"> | |
| <span class="material-symbols-outlined text-[var(--color-earthy-soft-brown)] text-lg opacity-60" aria-hidden="true">drag_indicator</span> | |
| <div class="flex flex-col min-w-[60px]"> | |
| <span class="text-[10px] uppercase tracking-wider text-[var(--color-earthy-soft-brown)] font-bold">Rec</span> | |
| <span class="font-mono text-sm text-[var(--color-earthy-dark-brown)]">{formatDuration(appStore.sessionDuration())}</span> | |
| </div> | |
| </div> | |
| <div class="flex-1 min-w-0 flex flex-col justify-center gap-1"> | |
| <div class="h-8 flex items-center justify-center gap-1 overflow-hidden opacity-80 abstract-wave"> | |
| <CompactWaveform audioLevel={appStore.audioLevel()} barLevels={appStore.barLevels()} isRecording={isRecording()} /> | |
| </div> | |
| <Show when={appStore.modelState() === 'loading'}> | |
| <div class="flex items-center gap-2 px-1"> | |
| <div class="flex-1 h-1.5 rounded-full overflow-hidden bg-[var(--color-earthy-sage)]/20"> | |
| <div | |
| class="h-full bg-[var(--color-earthy-muted-green)] rounded-full transition-all duration-300" | |
| style={{ width: `${Math.max(0, Math.min(100, appStore.modelProgress()))}%` }} | |
| /> | |
| </div> | |
| <span class="text-[10px] font-mono text-[var(--color-earthy-soft-brown)] tabular-nums">{Math.round(appStore.modelProgress())}%</span> | |
| </div> | |
| </Show> | |
| </div> | |
| <div class="flex items-center gap-2 flex-shrink-0"> | |
| <button | |
| type="button" | |
| onClick={toggleRecording} | |
| onMouseEnter={openPanelForAudio} | |
| onMouseLeave={schedulePanelCloseIfHover} | |
| class={`w-10 h-10 rounded-full flex items-center justify-center transition-colors border ${isRecording() ? 'bg-[var(--color-earthy-coral)] text-white border-[var(--color-earthy-coral)]' : 'text-[var(--color-earthy-dark-brown)] hover:bg-[var(--color-earthy-bg)] border-transparent hover:border-[var(--color-earthy-sage)]/30'}`} | |
| title={isRecording() ? 'Stop recording' : 'Start recording'} | |
| > | |
| <span class="material-symbols-outlined">mic</span> | |
| </button> | |
| <button | |
| type="button" | |
| onClick={() => loadSelectedModel()} | |
| onMouseEnter={openPanelForModel} | |
| onMouseLeave={schedulePanelCloseIfHover} | |
| disabled={appStore.modelState() === 'loading' || appStore.modelState() === 'ready'} | |
| class="w-10 h-10 rounded-full flex items-center justify-center text-[var(--color-earthy-dark-brown)] hover:bg-[var(--color-earthy-bg)] transition-colors border border-transparent hover:border-[var(--color-earthy-sage)]/30 disabled:opacity-40 disabled:cursor-not-allowed relative" | |
| title={appStore.modelState() === 'ready' ? 'Model loaded' : appStore.modelState() === 'loading' ? 'Loading...' : 'Load model'} | |
| > | |
| <Show when={appStore.modelState() === 'loading'} fallback={<span class="material-symbols-outlined">power_settings_new</span>}> | |
| <span class="material-symbols-outlined load-btn-spin">progress_activity</span> | |
| </Show> | |
| </button> | |
| <button | |
| type="button" | |
| onClick={() => { setSettingsPanelSection('full'); setShowContextPanel((v) => !v); }} | |
| class={`w-10 h-10 rounded-full flex items-center justify-center transition-colors border ${showContextPanel() ? 'bg-[var(--color-earthy-sage)]/30 text-[var(--color-earthy-muted-green)] border-[var(--color-earthy-sage)]/50' : 'text-[var(--color-earthy-dark-brown)] hover:bg-[var(--color-earthy-bg)] border-transparent hover:border-[var(--color-earthy-sage)]/30'}`} | |
| title="Settings" | |
| > | |
| <span class="material-symbols-outlined">tune</span> | |
| </button> | |
| <button | |
| type="button" | |
| onClick={() => isRecording() && toggleRecording()} | |
| disabled={!isRecording()} | |
| class="w-10 h-10 rounded-full flex items-center justify-center text-[var(--color-earthy-dark-brown)] hover:bg-[var(--color-earthy-bg)] transition-colors border border-transparent hover:border-[var(--color-earthy-sage)]/30 disabled:opacity-40 disabled:cursor-not-allowed" | |
| title="Pause" | |
| > | |
| <span class="material-symbols-outlined">pause</span> | |
| </button> | |
| <button | |
| type="button" | |
| onClick={() => appStore.copyTranscript()} | |
| class="w-10 h-10 rounded-full flex items-center justify-center text-[var(--color-earthy-dark-brown)] hover:bg-[var(--color-earthy-bg)] transition-colors border border-transparent hover:border-[var(--color-earthy-sage)]/30" | |
| title="Copy transcript" | |
| > | |
| <span class="material-symbols-outlined">content_copy</span> | |
| </button> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| {/* Foldable debug panel (bottom drawer) */} | |
| <Show when={appStore.showDebugPanel()}> | |
| <div class="absolute bottom-0 left-0 right-0 z-20 flex flex-col bg-[var(--color-earthy-bg)] border-t border-[var(--color-earthy-sage)]/30 shadow-[0_-4px_20px_rgba(0,0,0,0.08)] max-h-[70vh] overflow-hidden transition-all"> | |
| <DebugPanel | |
| audioEngine={audioEngineSignal() ?? undefined} | |
| melClient={melClientSignal() ?? undefined} | |
| /> | |
| </div> | |
| </Show> | |
| </div> | |
| ); | |
| }; | |
| export default App; | |