/**
 * Smart Progressive Streaming Handler
 *
 * JavaScript port of STT/smart_progressive_streaming.py
 *
 * Provides frequent partial transcriptions (every `emissionInterval`
 * seconds, 500ms by default) with:
 * - Growing window up to `maxWindowSize` (15s by default) for accuracy
 * - Sentence-boundary-aware window sliding for longer audio
 * - Fixed sentences + active transcription
 */

const DEFAULT_EMISSION_INTERVAL = 0.5; // seconds between partial emissions
const DEFAULT_MAX_WINDOW_SIZE = 15.0; // seconds of audio per transcription window
const DEFAULT_SENTENCE_BUFFER = 2.0; // seconds at the window tail never fixed
const DEFAULT_SAMPLE_RATE = 16000; // samples per second
const MIN_AUDIO_SECONDS = 0.5; // below this, no transcription is attempted

/** Return `value` when it is a number > 0, otherwise `fallback`. */
function positiveOr(value, fallback) {
  return typeof value === 'number' && value > 0 ? value : fallback;
}

/** Return `value` when it is a number >= 0, otherwise `fallback`. */
function nonNegativeOr(value, fallback) {
  return typeof value === 'number' && value >= 0 ? value : fallback;
}

/** Trim a model-provided text field; anything that is not a string becomes ''. */
function cleanText(text) {
  return typeof text === 'string' ? text.trim() : '';
}

/** Read `result.sentences` defensively; a missing list becomes []. */
function sentencesOf(result) {
  return (result && result.sentences) || [];
}

/**
 * Snapshot of the transcription state at one point in time.
 */
export class PartialTranscription {
  /**
   * @param {string} fixedText - Sentences that won't change
   * @param {string} activeText - Current partial transcription
   * @param {number} timestamp - Current position in audio (seconds)
   * @param {boolean} isFinal - True if this is the last update
   */
  constructor(fixedText, activeText, timestamp, isFinal) {
    this.fixedText = fixedText;
    this.activeText = activeText;
    this.timestamp = timestamp;
    this.isFinal = isFinal;
  }
}

/**
 * Smart progressive streaming with sentence-aware window management.
 *
 * Strategy:
 * 1. Emit partial transcriptions at a fixed interval
 * 2. Use a growing window (up to `maxWindowSize`) for better accuracy
 * 3. When the window reaches `maxWindowSize`, slide it using sentence
 *    boundaries:
 *    - Keep completed sentences as "fixed"
 *    - Only re-transcribe the "active" portion
 *
 * The model is expected to expose `transcribe(samples)` returning (a promise
 * of) `{ text, sentences }`, where each sentence has `text` and an `end`
 * time in seconds relative to the start of the window that was passed in.
 */
export class SmartProgressiveStreamingHandler {
  /**
   * @param {{transcribe: Function}} model - Speech-to-text model
   * @param {Object} [options]
   * @param {number} [options.emissionInterval=0.5] - Seconds between
   *   emissions; non-positive values fall back to the default (a zero
   *   interval would never advance the progressive loop)
   * @param {number} [options.maxWindowSize=15] - Maximum window in seconds
   * @param {number} [options.sentenceBuffer=2] - Seconds at the end of the
   *   window whose sentences are never fixed; 0 is a valid value
   * @param {number} [options.sampleRate=16000] - Samples per second
   */
  constructor(model, options = {}) {
    const opts = options || {};
    this.model = model;
    this.emissionInterval = positiveOr(opts.emissionInterval, DEFAULT_EMISSION_INTERVAL);
    this.maxWindowSize = positiveOr(opts.maxWindowSize, DEFAULT_MAX_WINDOW_SIZE);
    this.sentenceBuffer = nonNegativeOr(opts.sentenceBuffer, DEFAULT_SENTENCE_BUFFER);
    this.sampleRate = positiveOr(opts.sampleRate, DEFAULT_SAMPLE_RATE);

    // State for incremental streaming
    this.reset();
  }

  /**
   * Reset state for a new streaming session.
   */
  reset() {
    this.fixedSentences = [];
    this.fixedEndTime = 0.0;
    this.lastTranscribedLength = 0;
    // Active text of the most recent successful transcription, so that a
    // repeated call with an unchanged buffer can report the same state.
    this.lastActiveText = '';
  }

  /**
   * Transcribe audio incrementally (for live streaming).
   *
   * Call this repeatedly with a growing audio buffer. Returns a single
   * PartialTranscription for the current state. When the buffer has not
   * grown since the previous call, the model is not invoked and the
   * previously computed active text is returned.
   *
   * @param {Float32Array} audio - Growing audio buffer
   * @returns {Promise<PartialTranscription>}
   */
  async transcribeIncremental(audio) {
    const currentLength = audio.length;
    const timestamp = currentLength / this.sampleRate;

    // Not enough audio yet to attempt a transcription.
    if (currentLength < this.sampleRate * MIN_AUDIO_SECONDS) {
      return new PartialTranscription(this.fixedSentences.join(' '), '', timestamp, false);
    }

    // No new audio since the last transcription: report the cached state
    // instead of dropping the active text.
    if (currentLength === this.lastTranscribedLength) {
      return new PartialTranscription(
        this.fixedSentences.join(' '),
        this.lastActiveText,
        timestamp,
        false
      );
    }

    // Window runs from the end of the last fixed sentence to the end.
    const windowStartSamples = Math.floor(this.fixedEndTime * this.sampleRate);
    const audioWindow = audio.slice(windowStartSamples);
    const windowDuration = audioWindow.length / this.sampleRate;

    let result = await this.model.transcribe(audioWindow);
    const sentences = sentencesOf(result);

    if (windowDuration >= this.maxWindowSize && sentences.length > 1) {
      // Window is too large: fix the leading sentences that end before the
      // safety buffer at the tail of the window.
      const cutoffTime = windowDuration - this.sentenceBuffer;
      const newlyFixed = [];
      let newFixedEndTime = this.fixedEndTime;

      for (const sentence of sentences) {
        if (!(sentence.end < cutoffTime)) {
          break;
        }
        const text = cleanText(sentence.text);
        if (text) {
          newlyFixed.push(text);
        }
        newFixedEndTime = this.fixedEndTime + sentence.end;
      }

      // Only commit when the fixed point actually moves forward; otherwise
      // the same sentences would be fixed again on the next call.
      if (newFixedEndTime > this.fixedEndTime) {
        this.fixedSentences.push(...newlyFixed);
        this.fixedEndTime = newFixedEndTime;

        // Re-transcribe from the new fixed point.
        const newWindowStartSamples = Math.floor(this.fixedEndTime * this.sampleRate);
        result = await this.model.transcribe(audio.slice(newWindowStartSamples));
      }
    }

    const activeText = cleanText(result && result.text);

    // Record progress only after the model calls succeeded, so that a
    // failed call can be retried with the same buffer.
    this.lastTranscribedLength = currentLength;
    this.lastActiveText = activeText;

    return new PartialTranscription(this.fixedSentences.join(' '), activeText, timestamp, false);
  }

  /**
   * Transcribe audio with smart progressive emissions.
   *
   * Resets the handler, then feeds a growing prefix of `audio` to
   * `transcribeIncremental`, waiting `emissionInterval` seconds between
   * emissions to simulate real time. A last emission with `isFinal = true`
   * repeats the state of the complete buffer.
   *
   * Yields PartialTranscription with:
   * - fixedText: Completed sentences (won't change)
   * - activeText: Current partial transcription
   * - timestamp: Current position
   *
   * @param {Float32Array} audio - Complete audio buffer
   * @yields {PartialTranscription}
   */
  async *transcribeProgressive(audio) {
    const totalDuration = audio.length / this.sampleRate;
    let currentTime = 0;

    this.reset();

    while (currentTime < totalDuration) {
      currentTime += this.emissionInterval;
      const currentSamples = Math.min(Math.floor(currentTime * this.sampleRate), audio.length);

      yield await this.transcribeIncremental(audio.slice(0, currentSamples));

      // Small delay to simulate real-time
      await new Promise((resolve) => setTimeout(resolve, this.emissionInterval * 1000));
    }

    // Final transcription
    const finalResult = await this.transcribeIncremental(audio);
    yield new PartialTranscription(
      finalResult.fixedText,
      finalResult.activeText,
      finalResult.timestamp,
      true
    );
  }

  /**
   * Transcribe audio in batch mode (for uploaded files).
   *
   * Processes as fast as possible with full windows:
   * - Start with maximum window size immediately
   * - No artificial delays between windows
   * - Slide window as soon as current transcription completes
   *
   * A window that reaches the end of the audio is locked entirely and
   * emitted with `isFinal = true`. Earlier windows lock the sentences that
   * end before the tail buffer and resume right after the last of them;
   * when no such sentence exists the whole window is locked and the cursor
   * moves past it, so no audio is transcribed into the fixed text twice.
   *
   * @param {Float32Array} audio - Complete audio buffer
   * @yields {PartialTranscription}
   */
  async *transcribeBatch(audio) {
    const totalDuration = audio.length / this.sampleRate;

    this.reset();

    let processedUpTo = 0; // seconds of audio already finalized

    while (processedUpTo < totalDuration) {
      const windowStart = processedUpTo;
      const windowEnd = Math.min(windowStart + this.maxWindowSize, totalDuration);
      const windowDuration = windowEnd - windowStart;
      // Decided from the window end rather than from windowDuration, which
      // is a floating-point difference and may round below maxWindowSize.
      const isLastWindow = windowEnd >= totalDuration;

      const audioWindow = audio.slice(
        Math.floor(windowStart * this.sampleRate),
        Math.floor(windowEnd * this.sampleRate)
      );

      // Rounding can leave an empty tail; there is nothing to transcribe.
      const result =
        audioWindow.length > 0 ? await this.model.transcribe(audioWindow) : { text: '' };

      if (isLastWindow) {
        const finalText = cleanText(result && result.text);
        if (finalText) {
          this.fixedSentences.push(finalText);
        }
        processedUpTo = windowEnd;
        yield new PartialTranscription(this.fixedSentences.join(' '), '', windowEnd, true);
        continue;
      }

      const cutoffTime = windowDuration - this.sentenceBuffer;
      const sentences = sentencesOf(result);
      const sentencesToFix =
        sentences.length > 1 ? sentences.filter((s) => s.end < cutoffTime) : [];
      const lastFixedEnd =
        sentencesToFix.length > 0 ? sentencesToFix[sentencesToFix.length - 1].end : 0;

      if (lastFixedEnd > 0) {
        // Lock these sentences and resume right after the last one.
        for (const sentence of sentencesToFix) {
          const text = cleanText(sentence.text);
          if (text) {
            this.fixedSentences.push(text);
          }
        }
        processedUpTo = windowStart + lastFixedEnd;

        const activeText = sentences
          .filter((s) => s.end >= cutoffTime)
          .map((s) => s.text)
          .join(' ')
          .trim();

        yield new PartialTranscription(
          this.fixedSentences.join(' '),
          activeText,
          windowEnd,
          false
        );
      } else {
        // No usable sentence boundary: lock the whole window text and skip
        // past the whole window so its audio is not transcribed again.
        const windowText = cleanText(result && result.text);
        if (windowText) {
          this.fixedSentences.push(windowText);
        }
        processedUpTo = windowEnd;

        yield new PartialTranscription(this.fixedSentences.join(' '), '', windowEnd, false);
      }
    }
  }

  /**
   * Get final transcription by combining fixed + active.
   *
   * @param {Float32Array} audio - Complete audio buffer
   * @returns {Promise<string>} Final complete transcription
   */
  async finalize(audio) {
    const result = await this.transcribeIncremental(audio);
    const parts = [];
    if (result.fixedText) {
      parts.push(result.fixedText);
    }
    if (result.activeText) {
      parts.push(result.activeText);
    }
    return parts.join(' ');
  }
}