Spaces:
Running
Running
/**
 * Web Worker for Parakeet ONNX Model Inference
 *
 * Handles model loading and transcription in a separate thread using parakeet.js
 * https://github.com/ysdede/parakeet.js
 */
import { fromHub } from 'parakeet.js';

// Singleton worker state: one model instance per worker.
let model = null;      // parakeet.js model handle once loaded; null until then
let isLoading = false; // true while loadModel() is in flight (re-entrancy guard)
/**
 * Load the Parakeet model using parakeet.js.
 *
 * Posts lifecycle messages ('loading' / 'initiate' / 'progress' / 'done' /
 * 'ready' / 'error') to the main thread while downloading, then runs a
 * warm-up inference before reporting 'ready'.
 *
 * @param {string} [modelVersion='parakeet-tdt-0.6b-v3'] - Hub model identifier.
 * @param {{device?: string}} [options={}] - `device: 'webgpu'` selects the
 *   hybrid WebGPU-encoder + WASM-decoder backend; anything else uses pure WASM.
 * @returns {Promise<{status: string, device?: string, message?: string, error?: string}>}
 */
async function loadModel(modelVersion = 'parakeet-tdt-0.6b-v3', options = {}) {
  if (isLoading) {
    // Fix: also notify the main thread — the dispatcher ignores this return
    // value, so without a postMessage a duplicate 'load' request was silent.
    self.postMessage({ status: 'loading', message: 'Model is already loading...' });
    return { status: 'loading', message: 'Model is already loading...' };
  }
  if (model) {
    // Fix: same as above — make the already-loaded case observable.
    self.postMessage({ status: 'ready', message: 'Model already loaded' });
    return { status: 'ready', message: 'Model already loaded' };
  }
  try {
    isLoading = true;
    // 'webgpu-hybrid' = WebGPU encoder + WASM decoder (best performance);
    // 'wasm' = full WASM (CPU-only) execution.
    const backend = options.device === 'webgpu' ? 'webgpu-hybrid' : 'wasm';
    self.postMessage({
      status: 'loading',
      message: `Loading Parakeet ${modelVersion}... (~2.5GB)`,
    });
    console.log('[Worker] Starting model load with backend:', backend);
    // webgpu-hybrid: FP32 encoder on WebGPU + INT8 decoder on WASM (optimal).
    // wasm: both INT8 on WASM (CPU only).
    const quantization = backend === 'wasm'
      ? { encoderQuant: 'int8', decoderQuant: 'int8', preprocessor: 'nemo128' }
      : { encoderQuant: 'fp32', decoderQuant: 'int8', preprocessor: 'nemo128' };
    console.log('[Worker] Calling fromHub...');
    // Track per-file download lifecycle so exactly one 'initiate' and one
    // 'done' is emitted per file. Fix: the progress callback may fire more
    // than once at 100%, which previously produced duplicate 'done' messages.
    const initiatedFiles = new Set();
    const completedFiles = new Set();
    model = await fromHub(modelVersion, {
      backend,
      ...quantization,
      progress: (progressData) => {
        const { loaded, total, file } = progressData;
        const progress = total > 0 ? Math.round((loaded / total) * 100) : 0;
        if (!initiatedFiles.has(file)) {
          initiatedFiles.add(file);
          self.postMessage({ status: 'initiate', file, progress: 0, total });
        }
        self.postMessage({ status: 'progress', file, progress, total, loaded });
        if (loaded >= total && !completedFiles.has(file)) {
          completedFiles.add(file);
          self.postMessage({ status: 'done', file });
        }
      },
    });
    console.log('[Worker] fromHub completed successfully');
    self.postMessage({
      status: 'loading',
      message: 'Model loaded, warming up...',
    });
    // Warm-up inference (recommended by parakeet.js): 1 second of silence.
    const dummyAudio = new Float32Array(16000);
    await model.transcribe(dummyAudio, 16000);
    self.postMessage({
      status: 'ready',
      message: `Parakeet ${modelVersion} ready!`,
      device: backend,
      modelVersion,
    });
    return { status: 'ready', device: backend };
  } catch (error) {
    console.error('Failed to load model:', error);
    self.postMessage({
      status: 'error',
      message: `Failed to load model: ${error.message}`,
      error: error.toString(),
    });
    return { status: 'error', error: error.toString() };
  } finally {
    isLoading = false;
  }
}
/**
 * Transcribe an audio chunk using the loaded Parakeet model.
 *
 * @param {Float32Array} audio - Mono PCM samples.
 * @param {string|null} [language=null] - Language hint; echoed back in metadata
 *   (parakeet.js itself is not passed this value here).
 * @param {number} [sampleRate=16000] - Sample rate of `audio` in Hz.
 *   Generalized from the previously hard-coded 16 kHz; the default preserves
 *   existing caller behavior.
 * @returns {Promise<object>} `{ text, sentences, words, chunks, metadata }`.
 * @throws {Error} If the model has not been loaded yet.
 */
async function transcribe(audio, language = null, sampleRate = 16000) {
  if (!model) {
    throw new Error('Model not loaded. Call load() first.');
  }
  try {
    const startTime = performance.now();
    // Transcribe with parakeet.js
    const result = await model.transcribe(audio, sampleRate, {
      returnTimestamps: true, // Get word-level timestamps
      returnConfidences: true, // Get confidence scores
      temperature: 1.0, // Greedy decoding
    });
    const endTime = performance.now();
    const latency = (endTime - startTime) / 1000; // seconds
    const audioDuration = audio.length / sampleRate;
    // Speed factor (inverse of traditional RTF). Guard against a zero-latency
    // clock reading producing Infinity/NaN.
    const rtf = latency > 0 ? audioDuration / latency : 0;
    // Convert parakeet.js word format to our sentence format
    const sentences = groupWordsIntoSentences(result.words || []);
    return {
      text: result.utterance_text || '',
      sentences,
      words: result.words || [],
      chunks: result.words || [], // For compatibility
      metadata: {
        latency,
        audioDuration,
        rtf,
        language,
        confidence: result.confidence_scores,
        metrics: result.metrics,
      },
    };
  } catch (error) {
    console.error('Transcription error:', error);
    throw error;
  }
}
/**
 * Group words into sentences based on punctuation.
 *
 * Note: This is a simplified implementation since parakeet.js provides
 * word-level alignments but not sentence-level. The Python implementation uses
 * model-provided sentence boundaries. We split on sentence-ending punctuation
 * (.!?) to approximate sentence boundaries for the progressive streaming
 * window management.
 *
 * Fix: `||` fallbacks treated a legitimate timestamp of exactly 0 as missing
 * (e.g. `word.end_time || word.start_time` discarded an end_time of 0).
 * Nullish coalescing (`??`) only falls back on null/undefined.
 *
 * @param {Array<{text: string, start_time?: number, end_time?: number}>} words
 * @returns {Array<{text: string, start: number, end: number}>}
 */
function groupWordsIntoSentences(words) {
  if (!words || words.length === 0) {
    return [];
  }
  const sentences = [];
  let currentWords = [];
  let currentStart = words[0].start_time ?? 0;
  for (let i = 0; i < words.length; i++) {
    const word = words[i];
    currentWords.push(word.text);
    // Check if this word ends a sentence (only period, question mark,
    // exclamation). Commas are deliberately ignored — they don't end sentences.
    const endsWithTerminalPunctuation = /[.!?]$/.test(word.text);
    if (endsWithTerminalPunctuation || i === words.length - 1) {
      sentences.push({
        text: currentWords.join(' ').trim(),
        start: currentStart,
        end: word.end_time ?? word.start_time ?? 0,
      });
      // Start new sentence if there are more words
      if (i < words.length - 1) {
        currentWords = [];
        currentStart = words[i + 1].start_time ?? word.end_time ?? 0;
      }
    }
  }
  return sentences;
}
/**
 * Message handler: dispatches commands from the main thread.
 *
 * Supported message types:
 *  - 'load':       data = { modelVersion?, options? } — load and warm up the model
 *  - 'transcribe': data = { audio, language? }        — replies with 'transcription'
 *  - 'ping':       liveness check — replies with 'pong'
 * Unknown types and thrown errors are reported back as 'error' messages.
 */
self.onmessage = async (event) => {
  const { type, data } = event.data;
  try {
    switch (type) {
      case 'load':
        await loadModel(data?.modelVersion, data?.options || {});
        break;
      case 'transcribe': {
        // Fix: braces create a block scope for the lexical declaration —
        // `const` directly inside a bare `case` is an ESLint
        // no-case-declarations error and leaks the binding into sibling cases.
        const result = await transcribe(data.audio, data.language);
        self.postMessage({
          status: 'transcription',
          result,
        });
        break;
      }
      case 'ping':
        self.postMessage({ status: 'pong' });
        break;
      default:
        self.postMessage({
          status: 'error',
          message: `Unknown message type: ${type}`,
        });
    }
  } catch (error) {
    self.postMessage({
      status: 'error',
      message: error.message,
      error: error.toString(),
    });
  }
};