// Re-add progress bar functionality for model downloads (commit c54543a, author: andito, HF Staff)
/**
* Web Worker for Parakeet ONNX Model Inference
*
* Handles model loading and transcription in a separate thread using parakeet.js
* https://github.com/ysdede/parakeet.js
*/
import { fromHub } from 'parakeet.js';
// Loaded Parakeet model instance (null until loadModel() resolves)
let model = null;
// Guards against concurrent/duplicate load requests
let isLoading = false;
/**
* Load the Parakeet model using parakeet.js
*/
/**
 * Load the Parakeet model using parakeet.js.
 *
 * Posts lifecycle messages to the main thread while downloading and
 * warming up: 'loading', per-file 'initiate'/'progress'/'done', then
 * 'ready' on success or 'error' on failure.
 *
 * @param {string} [modelVersion='parakeet-tdt-0.6b-v3'] - Hub model id.
 * @param {{ device?: string }} [options] - Pass device: 'webgpu' to select the hybrid backend.
 * @returns {Promise<{status: string, message?: string, device?: string, error?: string}>}
 */
async function loadModel(modelVersion = 'parakeet-tdt-0.6b-v3', options = {}) {
  if (isLoading) {
    return { status: 'loading', message: 'Model is already loading...' };
  }
  if (model) {
    return { status: 'ready', message: 'Model already loaded' };
  }
  try {
    isLoading = true;
    // Use 'webgpu-hybrid' for WebGPU encoder + WASM decoder (best performance)
    // Use 'wasm' for full WASM execution
    const backend = options.device === 'webgpu' ? 'webgpu-hybrid' : 'wasm';
    self.postMessage({
      status: 'loading',
      message: `Loading Parakeet ${modelVersion}... (~2.5GB)`,
    });
    console.log('[Worker] Starting model load with backend:', backend);
    // Load model using parakeet.js fromHub helper
    // webgpu-hybrid: FP32 encoder on WebGPU + INT8 decoder on WASM (optimal)
    // wasm: Both INT8 on WASM (CPU only)
    const quantization = backend === 'wasm'
      ? { encoderQuant: 'int8', decoderQuant: 'int8', preprocessor: 'nemo128' } // WASM: both INT8
      : { encoderQuant: 'fp32', decoderQuant: 'int8', preprocessor: 'nemo128' }; // WebGPU-hybrid: FP32 encoder + INT8 decoder
    console.log('[Worker] Calling fromHub...');
    // Track per-file lifecycle so each file gets exactly one 'initiate'
    // and at most one 'done' message.
    const initiatedFiles = new Set();
    const completedFiles = new Set();
    model = await fromHub(modelVersion, {
      backend,
      ...quantization,
      progress: (progressData) => {
        const { loaded, total, file } = progressData;
        const progress = total > 0 ? Math.round((loaded / total) * 100) : 0;
        // First callback for this file: announce it to the UI.
        if (!initiatedFiles.has(file)) {
          initiatedFiles.add(file);
          self.postMessage({
            status: 'initiate',
            file,
            progress, // actual progress (may already be > 0 on a resumed download)
            total,
          });
        }
        // Send progress update
        self.postMessage({
          status: 'progress',
          file,
          progress,
          total,
          loaded,
        });
        // Send 'done' once per file. Require total > 0 so an unknown size
        // (total === 0) doesn't satisfy loaded >= total on every callback
        // and spam 'done' before the download actually finishes.
        if (total > 0 && loaded >= total && !completedFiles.has(file)) {
          completedFiles.add(file);
          self.postMessage({
            status: 'done',
            file,
          });
        }
      },
    });
    console.log('[Worker] fromHub completed successfully');
    self.postMessage({
      status: 'loading',
      message: 'Model loaded, warming up...',
    });
    // Warm-up inference (recommended by parakeet.js)
    const dummyAudio = new Float32Array(16000); // 1 second of silence
    await model.transcribe(dummyAudio, 16000);
    self.postMessage({
      status: 'ready',
      message: `Parakeet ${modelVersion} ready!`,
      device: backend,
      modelVersion,
    });
    return { status: 'ready', device: backend };
  } catch (error) {
    console.error('Failed to load model:', error);
    self.postMessage({
      status: 'error',
      message: `Failed to load model: ${error.message}`,
      error: error.toString(),
    });
    return { status: 'error', error: error.toString() };
  } finally {
    isLoading = false;
  }
}
/**
* Transcribe audio chunk using Parakeet
*/
/**
 * Transcribe a 16 kHz mono audio buffer with the loaded Parakeet model.
 *
 * @param {Float32Array} audio - PCM samples at 16 kHz.
 * @param {string|null} [language=null] - Language tag, echoed into metadata.
 * @returns {Promise<object>} Result with text, sentences, words, chunks and timing metadata.
 * @throws {Error} If the model has not been loaded yet.
 */
async function transcribe(audio, language = null) {
  if (!model) {
    throw new Error('Model not loaded. Call load() first.');
  }
  try {
    const t0 = performance.now();
    // Run parakeet.js inference with word timestamps and confidences.
    const result = await model.transcribe(audio, 16000, {
      returnTimestamps: true, // Get word-level timestamps
      returnConfidences: true, // Get confidence scores
      temperature: 1.0, // Greedy decoding
    });
    const latency = (performance.now() - t0) / 1000; // seconds
    const audioDuration = audio.length / 16000;
    // Speed factor (inverse of traditional RTF): > 1 means faster than realtime.
    const rtf = audioDuration / latency;
    const words = result.words || [];
    return {
      text: result.utterance_text || '',
      // Convert parakeet.js word format to our sentence format.
      sentences: groupWordsIntoSentences(words),
      words,
      chunks: words, // For compatibility
      metadata: {
        latency,
        audioDuration,
        rtf,
        language,
        confidence: result.confidence_scores,
        metrics: result.metrics,
      },
    };
  } catch (error) {
    console.error('Transcription error:', error);
    throw error;
  }
}
/**
* Group words into sentences based on punctuation
*
* Note: This is a simplified implementation since parakeet.js provides word-level
* alignments but not sentence-level. The Python implementation uses model-provided
* sentence boundaries. We split on sentence-ending punctuation (.!?) to approximate
* sentence boundaries for the progressive streaming window management.
*/
/**
 * Group word-level alignments into approximate sentences.
 *
 * parakeet.js provides word alignments but no sentence boundaries (the
 * Python implementation gets those from the model), so we split on
 * terminal punctuation (. ! ?) to approximate sentence boundaries for
 * the progressive streaming window management. Commas are deliberately
 * ignored — they do not end sentences.
 *
 * @param {Array<{text: string, start_time?: number, end_time?: number}>} words
 * @returns {Array<{text: string, start: number, end: number}>}
 */
function groupWordsIntoSentences(words) {
  if (!words || words.length === 0) {
    return [];
  }
  const sentences = [];
  let currentWords = [];
  // Use ?? (not ||) so a legitimate timestamp of 0 is preserved instead
  // of being treated as missing.
  let currentStart = words[0].start_time ?? 0;
  for (let i = 0; i < words.length; i++) {
    const word = words[i];
    currentWords.push(word.text);
    // A sentence ends on terminal punctuation, or at the final word.
    const endsWithTerminalPunctuation = /[.!?]$/.test(word.text);
    if (endsWithTerminalPunctuation || i === words.length - 1) {
      sentences.push({
        text: currentWords.join(' ').trim(),
        start: currentStart,
        end: word.end_time ?? word.start_time ?? 0,
      });
      // Start a new sentence if there are more words.
      if (i < words.length - 1) {
        currentWords = [];
        currentStart = words[i + 1].start_time ?? word.end_time ?? 0;
      }
    }
  }
  return sentences;
}
/**
* Message handler
*/
/**
 * Message handler — dispatches commands from the main thread.
 *
 * Supported message types:
 *  - 'load':       data = { modelVersion?, options? } — load the model
 *  - 'transcribe': data = { audio, language? }        — run inference, replies with 'transcription'
 *  - 'ping':       health check, answered with { status: 'pong' }
 * Unknown types and thrown errors are reported back as { status: 'error' }.
 */
self.onmessage = async (event) => {
  const { type, data } = event.data;
  try {
    switch (type) {
      case 'load':
        await loadModel(data?.modelVersion, data?.options || {});
        break;
      case 'transcribe': {
        // Braces give the lexical declaration its own block scope
        // (avoids no-case-declarations: a bare `const` in a case leaks
        // into the whole switch scope).
        const result = await transcribe(data.audio, data.language);
        self.postMessage({
          status: 'transcription',
          result,
        });
        break;
      }
      case 'ping':
        self.postMessage({ status: 'pong' });
        break;
      default:
        self.postMessage({
          status: 'error',
          message: `Unknown message type: ${type}`,
        });
    }
  } catch (error) {
    self.postMessage({
      status: 'error',
      message: error.message,
      error: error.toString(),
    });
  }
};