Spaces:
Running
Running
| /** | |
| * Keet v3.0 - App Store | |
| * | |
| * Central state management using SolidJS signals. | |
| * Manages recording state, model status, and transcript. | |
| */ | |
| import { createSignal, createMemo, createRoot, onCleanup } from 'solid-js'; | |
| import type { RecordingState, ModelState, BackendType } from '../types'; | |
| import type { V4SentenceEntry } from '../lib/transcription/TranscriptionWorkerClient'; | |
| export interface DebugToken { | |
| id: string; | |
| text: string; | |
| confidence: number; | |
| } | |
| export interface SystemMetrics { | |
| throughput: number; // tokens/sec | |
| modelConfidence: number; // 0-1 | |
| vramUsage?: string; | |
| } | |
| /** Transcription mode: v2 (per-utterance VAD), v3 (overlapping windows + LCS merge), v4 (utterance-based merger) */ | |
| export type TranscriptionMode = 'v2-utterance' | 'v3-streaming' | 'v4-utterance'; | |
| /** Merge info for v3 streaming mode */ | |
| export interface MergeInfo { | |
| lcsLength: number; | |
| anchorValid: boolean; | |
| chunkCount: number; | |
| anchorTokens?: string[]; | |
| } | |
| /** VAD state for UI display */ | |
| export interface VADState { | |
| isSpeech: boolean; | |
| energy: number; | |
| snr: number; | |
| sileroProbability: number; | |
| hybridState: string; | |
| } | |
| /** Merger stats for v4 mode */ | |
| export interface V4MergerStats { | |
| sentencesFinalized: number; | |
| cursorUpdates: number; | |
| utterancesProcessed: number; | |
| } | |
| export function createAppStore() { | |
| // Recording state | |
| const [recordingState, setRecordingState] = createSignal<RecordingState>('idle'); | |
| const [sessionDuration, setSessionDuration] = createSignal(0); | |
| const [availableDevices, setAvailableDevices] = createSignal<MediaDeviceInfo[]>([]); | |
| const [selectedDeviceId, setSelectedDeviceId] = createSignal(''); | |
| let timerInterval: number | undefined; | |
| // Model state | |
| const [modelState, setModelState] = createSignal<ModelState>('unloaded'); | |
| const [selectedModelId, setSelectedModelId] = createSignal('parakeet-tdt-0.6b-v2'); | |
| const [modelProgress, setModelProgress] = createSignal(0); | |
| const [modelMessage, setModelMessage] = createSignal(''); | |
| const [modelFile, setModelFile] = createSignal(''); | |
| const [backend, setBackend] = createSignal<BackendType>('webgpu'); | |
| const [errorMessage, setErrorMessage] = createSignal<string | null>(null); | |
| // Transcript state | |
| const [transcript, setTranscript] = createSignal(''); | |
| const [pendingText, setPendingText] = createSignal(''); | |
| // Audio state | |
| const [audioLevel, setAudioLevel] = createSignal(0); | |
| const [barLevels, setBarLevels] = createSignal<Float32Array>(new Float32Array(0)); | |
| const [isSpeechDetected, setIsSpeechDetected] = createSignal(false); | |
| // Offline state | |
| const [isOfflineReady, setIsOfflineReady] = createSignal(false); | |
| const [isOnline, setIsOnline] = createSignal(typeof navigator !== 'undefined' ? navigator.onLine : true); | |
| // Debug metrics | |
| const [inferenceLatency, setInferenceLatencyInternal] = createSignal(0); | |
| const [latencySamples, setLatencySamples] = createSignal<number[]>([]); | |
| const LATENCY_SAMPLE_SIZE = 5; | |
| const setInferenceLatency = (v: number) => { | |
| setInferenceLatencyInternal(v); | |
| setLatencySamples(prev => [...prev.slice(1 - LATENCY_SAMPLE_SIZE), v]); | |
| }; | |
| const inferenceLatencyAverage = createMemo(() => { | |
| const s = latencySamples(); | |
| if (s.length === 0) return inferenceLatency(); | |
| return s.reduce((sum, x) => sum + x, 0) / s.length; | |
| }); | |
| const [debugTokens, setDebugTokens] = createSignal<DebugToken[]>([]); | |
| const [systemMetrics, setSystemMetrics] = createSignal<SystemMetrics>({ | |
| throughput: 0, | |
| modelConfidence: 0, | |
| }); | |
| // Transcription mode toggle (v4-utterance is the new default) | |
| const [transcriptionMode, setTranscriptionMode] = createSignal<TranscriptionMode>('v4-utterance'); | |
| const [mergeInfo, setMergeInfo] = createSignal<MergeInfo>({ | |
| lcsLength: 0, | |
| anchorValid: false, | |
| chunkCount: 0, | |
| anchorTokens: [], | |
| }); | |
| // Performance Telemetry | |
| const [rtf, setRtfInternal] = createSignal(0); // Real-Time Factor (Inference/AudioDuration) | |
| const [rtfSamples, setRtfSamples] = createSignal<number[]>([]); // Last N RTF values for RTFx average | |
| const RTF_SAMPLE_SIZE = 10; | |
| const setRtf = (v: number) => { | |
| setRtfInternal(v); | |
| setRtfSamples(prev => [...prev.slice(1 - RTF_SAMPLE_SIZE), v]); | |
| }; | |
| const rtfxAverage = createMemo(() => { | |
| const s = rtfSamples().filter(r => r > 0); | |
| if (s.length === 0) return 0; | |
| return s.reduce((sum, r) => sum + 1 / r, 0) / s.length; | |
| }); | |
| const [bufferMetrics, setBufferMetrics] = createSignal({ | |
| fillRatio: 0, | |
| latencyMs: 0, | |
| }); | |
| // v3 Streaming config | |
| // Window=5s gives ~62 encoder frames (vs 87 for 7s) - 30% less decode work. | |
| // Overlap=3.5s with trigger=1.5s provides enough context for LCS merging | |
| // while giving the transcriber 1.5s headroom per chunk. | |
| const [streamingWindow, setStreamingWindow] = createSignal(5.0); | |
| const [streamingOverlap, setStreamingOverlap] = createSignal(3.5); | |
| const [triggerInterval, setTriggerInterval] = createSignal(1.5); | |
| const [energyThreshold, setEnergyThreshold] = createSignal(0.08); | |
| // Decoder frame stride: 1 = full precision, 2 = halves decoder steps (faster, coarser timestamps) | |
| const [frameStride, setFrameStride] = createSignal(1); | |
| // v4 Pipeline config | |
| const [v4InferenceIntervalMs, setV4InferenceIntervalMs] = createSignal(480); // Transcription tick frequency in ms (320-8000) | |
| const [v4SilenceFlushSec, setV4SilenceFlushSec] = createSignal(1.0); // Silence duration to flush pending sentence | |
| const [sileroThreshold, setSileroThreshold] = createSignal(0.5); // Silero VAD probability threshold | |
| // UI state | |
| const [showDebugPanel, setShowDebugPanel] = createSignal(false); | |
| // v4 Utterance-based state | |
| const [matureText, setMatureText] = createSignal(''); | |
| const [immatureText, setImmatureText] = createSignal(''); | |
| const [matureCursorTime, setMatureCursorTime] = createSignal(0); | |
| const [vadState, setVadState] = createSignal<VADState>({ | |
| isSpeech: false, | |
| energy: 0, | |
| snr: 0, | |
| sileroProbability: 0, | |
| hybridState: 'silence', | |
| }); | |
| const [v4MergerStats, setV4MergerStats] = createSignal<V4MergerStats>({ | |
| sentencesFinalized: 0, | |
| cursorUpdates: 0, | |
| utterancesProcessed: 0, | |
| }); | |
| const [v4SentenceEntries, setV4SentenceEntries] = createSignal<V4SentenceEntry[]>([]); | |
| // Network status listeners (with cleanup to prevent leaks) | |
| if (typeof window !== 'undefined') { | |
| const handleOnline = () => setIsOnline(true); | |
| const handleOffline = () => setIsOnline(false); | |
| window.addEventListener('online', handleOnline); | |
| window.addEventListener('offline', handleOffline); | |
| onCleanup(() => { | |
| window.removeEventListener('online', handleOnline); | |
| window.removeEventListener('offline', handleOffline); | |
| }); | |
| } | |
| // Actions | |
| const startRecording = () => { | |
| setRecordingState('recording'); | |
| setSessionDuration(0); | |
| if (timerInterval) clearInterval(timerInterval); | |
| timerInterval = window.setInterval(() => { | |
| setSessionDuration(prev => prev + 1); | |
| }, 1000); | |
| }; | |
| const stopRecording = () => { | |
| setRecordingState('idle'); | |
| if (timerInterval) { | |
| clearInterval(timerInterval); | |
| timerInterval = undefined; | |
| } | |
| }; | |
| const refreshDevices = async () => { | |
| try { | |
| const devices = await navigator.mediaDevices.enumerateDevices(); | |
| const mics = devices.filter(d => d.kind === 'audioinput'); | |
| setAvailableDevices(mics); | |
| if (mics.length > 0 && !selectedDeviceId()) { | |
| setSelectedDeviceId(mics[0].deviceId); | |
| } | |
| } catch (e) { | |
| console.error('Failed to enum devices:', e); | |
| } | |
| }; | |
| const appendTranscript = (text: string) => { | |
| setTranscript(prev => prev + text); | |
| setPendingText(''); | |
| }; | |
| const clearTranscript = () => { | |
| setTranscript(''); | |
| setPendingText(''); | |
| setMatureText(''); | |
| setImmatureText(''); | |
| setMatureCursorTime(0); | |
| setV4SentenceEntries([]); | |
| }; | |
| const appendV4SentenceEntries = (entries: V4SentenceEntry[]) => { | |
| if (!Array.isArray(entries) || entries.length === 0) return; | |
| setV4SentenceEntries(prev => [...prev, ...entries]); | |
| }; | |
| const clearV4SentenceEntries = () => { | |
| setV4SentenceEntries([]); | |
| }; | |
| const copyTranscript = async () => { | |
| try { | |
| await navigator.clipboard.writeText(transcript()); | |
| return true; | |
| } catch { | |
| return false; | |
| } | |
| }; | |
| return { | |
| // State (readonly) | |
| recordingState, | |
| availableDevices, | |
| selectedDeviceId, | |
| sessionDuration, | |
| modelState, | |
| selectedModelId, | |
| modelProgress, | |
| modelMessage, | |
| modelFile, | |
| backend, | |
| transcript, | |
| pendingText, | |
| audioLevel, | |
| barLevels, | |
| setBarLevels, | |
| isSpeechDetected, | |
| isOfflineReady, | |
| isOnline, | |
| inferenceLatency, | |
| inferenceLatencyAverage, | |
| rtf, | |
| rtfxAverage, | |
| bufferMetrics, | |
| debugTokens, | |
| systemMetrics, | |
| errorMessage, | |
| transcriptionMode, | |
| mergeInfo, | |
| streamingWindow, | |
| streamingOverlap, | |
| triggerInterval, | |
| energyThreshold, | |
| frameStride, | |
| // v4 config | |
| v4InferenceIntervalMs, | |
| v4SilenceFlushSec, | |
| sileroThreshold, | |
| // UI state | |
| showDebugPanel, | |
| // v4 state | |
| matureText, | |
| immatureText, | |
| matureCursorTime, | |
| vadState, | |
| v4MergerStats, | |
| v4SentenceEntries, | |
| // Setters (for internal use) | |
| setRecordingState, | |
| setSessionDuration, | |
| setAvailableDevices, | |
| setSelectedDeviceId, | |
| setModelState, | |
| setSelectedModelId, | |
| setModelProgress, | |
| setModelMessage, | |
| setModelFile, | |
| setBackend, | |
| setErrorMessage, | |
| setTranscript, | |
| setPendingText, | |
| setAudioLevel, | |
| setIsSpeechDetected, | |
| setIsOfflineReady, | |
| setInferenceLatency, | |
| setRtf, | |
| setBufferMetrics, | |
| setDebugTokens, | |
| setSystemMetrics, | |
| setTranscriptionMode, | |
| setMergeInfo, | |
| setStreamingWindow, | |
| setStreamingOverlap, | |
| setTriggerInterval, | |
| setEnergyThreshold, | |
| setFrameStride, | |
| // UI setters | |
| setShowDebugPanel, | |
| // v4 setters | |
| setV4InferenceIntervalMs, | |
| setV4SilenceFlushSec, | |
| setSileroThreshold, | |
| setMatureText, | |
| setImmatureText, | |
| setMatureCursorTime, | |
| setVadState, | |
| setV4MergerStats, | |
| setV4SentenceEntries, | |
| // Actions | |
| startRecording, | |
| stopRecording, | |
| refreshDevices, | |
| appendTranscript, | |
| clearTranscript, | |
| appendV4SentenceEntries, | |
| clearV4SentenceEntries, | |
| copyTranscript, | |
| }; | |
| } | |
| // Create singleton store | |
| export const appStore = createRoot(createAppStore); | |