Spaces:
Running
Running
/**
 * Smart Progressive Streaming Handler
 *
 * JavaScript port of STT/smart_progressive_streaming.py
 *
 * Provides frequent partial transcriptions (every 500 ms by default,
 * configurable through the `emissionInterval` option) with:
 * - Growing window up to 15s for accuracy
 * - Sentence-boundary-aware window sliding for audio > 15s
 * - Fixed sentences + active transcription
 */
/**
 * One snapshot of a streaming transcription.
 *
 * The text is split into a stable part and a part that may still be
 * rewritten by later updates.
 */
export class PartialTranscription {
  /**
   * @param {string} fixedText - Sentences that won't change.
   * @param {string} activeText - Current partial transcription.
   * @param {number} timestamp - Current position in the audio.
   * @param {boolean} [isFinal=false] - True if this is the last update.
   */
  constructor(fixedText, activeText, timestamp, isFinal = false) {
    this.fixedText = fixedText;
    this.activeText = activeText;
    this.timestamp = timestamp;
    this.isFinal = isFinal;
  }
}
/**
 * Smart progressive streaming with sentence-aware window management.
 *
 * Strategy:
 * 1. Emit partial transcriptions at a regular interval (0.5 s by default).
 * 2. Use a growing window (up to `maxWindowSize` seconds) for better accuracy.
 * 3. When the window reaches `maxWindowSize`, slide it using sentence
 *    boundaries:
 *    - Keep completed sentences as "fixed".
 *    - Only re-transcribe the "active" portion.
 *
 * The model passed in must expose `transcribe(audioWindow)`. Its awaited
 * result is read for `text` (string) and, optionally, `sentences` (array of
 * `{ text, end }`). This class treats `end` as seconds measured from the
 * start of the window that was passed to `transcribe`.
 */
export class SmartProgressiveStreamingHandler {
  /**
   * @param {object} model - Object with a `transcribe(Float32Array)` method.
   * @param {object} [options]
   * @param {number} [options.emissionInterval=0.5] - Seconds between partial emissions.
   * @param {number} [options.maxWindowSize=15.0] - Window length (seconds) that triggers sliding.
   * @param {number} [options.sentenceBuffer=2.0] - Trailing seconds of a window never locked as fixed.
   * @param {number} [options.sampleRate=16000] - Samples per second of the audio buffers.
   */
  constructor(model, options = {}) {
    this.model = model;
    // `||` is kept on purpose: any falsy value (including 0) selects the
    // default. A zero emission interval would stall transcribeProgressive.
    this.emissionInterval = options.emissionInterval || 0.5;
    this.maxWindowSize = options.maxWindowSize || 15.0;
    this.sentenceBuffer = options.sentenceBuffer || 2.0;
    this.sampleRate = options.sampleRate || 16000;
    // State for incremental streaming
    this.reset();
  }

  /**
   * Reset state for a new streaming session.
   */
  reset() {
    this.fixedSentences = [];
    this.fixedEndTime = 0.0;
    this.lastTranscribedLength = 0;
    // Active text from the most recent transcription, replayed when a call
    // arrives with no new audio.
    this.lastActiveText = "";
  }

  /**
   * Transcribe audio incrementally (for live streaming).
   *
   * Call this repeatedly with a growing audio buffer. Each call returns a
   * single PartialTranscription for the current state; `isFinal` is always
   * false here.
   *
   * @param {Float32Array} audio - Growing audio buffer.
   * @returns {Promise<PartialTranscription>}
   */
  async transcribeIncremental(audio) {
    const currentLength = audio.length;
    const timestamp = currentLength / this.sampleRate;

    // Buffers shorter than 500 ms in total are not transcribed at all.
    if (currentLength < this.sampleRate * 0.5) {
      return new PartialTranscription(
        this.fixedSentences.join(" "),
        "",
        timestamp,
        false
      );
    }

    // No new audio since the last transcription: replay the last active text
    // instead of reporting an empty one, so repeat calls (e.g. the final
    // emission of transcribeProgressive, or finalize) do not lose text.
    if (currentLength === this.lastTranscribedLength) {
      return new PartialTranscription(
        this.fixedSentences.join(" "),
        this.lastActiveText,
        timestamp,
        false
      );
    }
    this.lastTranscribedLength = currentLength;

    // Window runs from the end of the last fixed sentence to the end of audio.
    const windowStartSamples = Math.floor(this.fixedEndTime * this.sampleRate);
    const audioWindow = audio.slice(windowStartSamples);
    const windowDuration = audioWindow.length / this.sampleRate;

    let result = await this.model.transcribe(audioWindow);

    const windowIsFull = windowDuration >= this.maxWindowSize;
    if (windowIsFull && result.sentences && result.sentences.length > 1) {
      // Lock the leading run of sentences that end before the cutoff.
      const cutoffTime = windowDuration - this.sentenceBuffer;
      const newFixedSentences = [];
      let newFixedEndTime = this.fixedEndTime;
      for (const sentence of result.sentences) {
        if (!(sentence.end < cutoffTime)) {
          break;
        }
        newFixedSentences.push(sentence.text.trim());
        // Sentence ends are window-relative; convert to buffer-relative time.
        newFixedEndTime = this.fixedEndTime + sentence.end;
      }

      if (newFixedSentences.length > 0) {
        this.fixedSentences.push(...newFixedSentences);
        this.fixedEndTime = newFixedEndTime;
        // Re-transcribe from the new fixed point so the active text no
        // longer contains the sentences that were just locked.
        const newWindowStartSamples = Math.floor(this.fixedEndTime * this.sampleRate);
        result = await this.model.transcribe(audio.slice(newWindowStartSamples));
      }
    }

    this.lastActiveText = result.text ? result.text.trim() : "";
    return new PartialTranscription(
      this.fixedSentences.join(" "),
      this.lastActiveText,
      timestamp,
      false
    );
  }

  /**
   * Transcribe a complete buffer with progressive emissions.
   *
   * Resets the handler, then feeds growing prefixes of `audio` to
   * transcribeIncremental, pausing `emissionInterval` seconds between
   * prefixes. The last item yielded has `isFinal === true`.
   *
   * @param {Float32Array} audio - Complete audio buffer.
   * @yields {PartialTranscription}
   */
  async *transcribeProgressive(audio) {
    const totalDuration = audio.length / this.sampleRate;
    let currentTime = 0;
    this.reset();

    while (currentTime < totalDuration) {
      currentTime += this.emissionInterval;
      const currentSamples = Math.min(
        Math.floor(currentTime * this.sampleRate),
        audio.length
      );
      yield await this.transcribeIncremental(audio.slice(0, currentSamples));

      // Delay to simulate real time; skipped after the last chunk so the
      // final result is not held back for no reason.
      if (currentTime < totalDuration) {
        await new Promise((resolve) => setTimeout(resolve, this.emissionInterval * 1000));
      }
    }

    // Final emission. If the loop already covered the whole buffer this
    // reuses the cached state rather than calling the model again.
    const finalResult = await this.transcribeIncremental(audio);
    yield new PartialTranscription(
      finalResult.fixedText,
      finalResult.activeText,
      finalResult.timestamp,
      true
    );
  }

  /**
   * Transcribe audio in batch mode (for uploaded files).
   *
   * Resets the handler, then walks the buffer in windows of up to
   * `maxWindowSize` seconds with no artificial delays:
   * - A window that reaches the end of the audio is locked entirely and
   *   emitted with `isFinal === true`.
   * - Otherwise sentences ending before `windowDuration - sentenceBuffer`
   *   are locked and the next window starts at the end of the last locked
   *   sentence.
   * - If no sentence can be locked, the whole window text is locked and the
   *   next window starts at this window's end, so no audio is transcribed
   *   into the fixed text twice.
   *
   * Empty audio yields a single empty final result.
   *
   * @param {Float32Array} audio - Complete audio buffer.
   * @yields {PartialTranscription}
   */
  async *transcribeBatch(audio) {
    const totalDuration = audio.length / this.sampleRate;
    this.reset();

    if (audio.length === 0) {
      yield new PartialTranscription("", "", 0, true);
      return;
    }

    let processedUpTo = 0; // Seconds of audio already locked as fixed
    while (processedUpTo < totalDuration) {
      const windowStart = processedUpTo;
      const windowEnd = Math.min(windowStart + this.maxWindowSize, totalDuration);
      // Decide "last window" from the end position, not from
      // windowEnd - windowStart, which floating-point rounding can push just
      // below maxWindowSize for a window that is not the last one.
      const isLastWindow = windowEnd >= totalDuration;
      const windowDuration = windowEnd - windowStart;

      const windowStartSamples = Math.floor(windowStart * this.sampleRate);
      const windowEndSamples = isLastWindow
        ? audio.length
        : Math.floor(windowEnd * this.sampleRate);
      const result = await this.model.transcribe(
        audio.slice(windowStartSamples, windowEndSamples)
      );
      const windowText = result.text ? result.text.trim() : "";

      if (isLastWindow) {
        // Nothing follows this window: lock everything.
        if (windowText) {
          this.fixedSentences.push(windowText);
        }
        yield new PartialTranscription(
          this.fixedSentences.join(" "),
          "",
          windowEnd,
          true
        );
        return;
      }

      const cutoffTime = windowDuration - this.sentenceBuffer;
      const sentences =
        result.sentences && result.sentences.length > 1 ? result.sentences : [];
      const sentencesToFix = sentences.filter((s) => s.end < cutoffTime);
      const lastFixedEnd =
        sentencesToFix.length > 0 ? sentencesToFix[sentencesToFix.length - 1].end : 0;

      if (lastFixedEnd > 0) {
        // Lock the completed sentences and restart right after the last one.
        for (const sentence of sentencesToFix) {
          const text = sentence.text.trim();
          if (text) {
            this.fixedSentences.push(text);
          }
        }
        processedUpTo = windowStart + lastFixedEnd;

        const activeText = sentences
          .filter((s) => s.end >= cutoffTime)
          .map((s) => s.text)
          .join(" ")
          .trim();
        yield new PartialTranscription(
          this.fixedSentences.join(" "),
          activeText,
          windowEnd,
          false
        );
      } else {
        // No usable sentence boundary (no timestamps, none before the cutoff,
        // or a boundary that would not advance). The text cannot be split,
        // so lock all of it and continue after this window.
        if (windowText) {
          this.fixedSentences.push(windowText);
        }
        processedUpTo = windowEnd;
        yield new PartialTranscription(
          this.fixedSentences.join(" "),
          "",
          windowEnd,
          false
        );
      }
    }
  }

  /**
   * Get the final transcription by combining fixed and active text.
   *
   * @param {Float32Array} audio - Complete audio buffer.
   * @returns {Promise<string>} Fixed text and active text joined by a space,
   *   omitting whichever is empty.
   */
  async finalize(audio) {
    const result = await this.transcribeIncremental(audio);
    const parts = [];
    if (result.fixedText) parts.push(result.fixedText);
    if (result.activeText) parts.push(result.activeText);
    return parts.join(" ");
  }
}