parakeet-v3-streaming / source /src /utils /progressive-streaming.js
andito's picture
andito HF Staff
Use sentence-based window sliding instead of word-based
c9d9124
/**
* Smart Progressive Streaming Handler
*
* JavaScript port of STT/smart_progressive_streaming.py
*
* Provides frequent partial transcriptions (every 500ms by default, configurable via `emissionInterval`) with:
* - Growing window up to 15s for accuracy
* - Sentence-boundary-aware window sliding for audio > 15s
* - Fixed sentences + active transcription
*/
/**
 * Snapshot of a streaming transcription at one point in time.
 *
 * Plain value holder: the constructor stores its four arguments unchanged.
 */
export class PartialTranscription {
  /**
   * @param {string} fixedText - Sentences that won't change.
   * @param {string} activeText - Current partial transcription.
   * @param {number} timestamp - Current position in the audio.
   * @param {boolean} isFinal - True if this is the last update.
   */
  constructor(fixedText, activeText, timestamp, isFinal) {
    // Shorthand keys keep the same property names and insertion order.
    Object.assign(this, { fixedText, activeText, timestamp, isFinal });
  }
}
/**
 * Smart progressive streaming with sentence-aware window management.
 *
 * Strategy:
 * 1. Emit partial transcriptions at a fixed interval (500ms by default).
 * 2. Use a growing window (up to `maxWindowSize` seconds) for better accuracy.
 * 3. Once the window reaches `maxWindowSize`, slide it using sentence boundaries:
 *    - keep completed sentences as "fixed";
 *    - only re-transcribe the "active" portion.
 */
export class SmartProgressiveStreamingHandler {
  /**
   * @param {object} model - Object with a `transcribe(audio)` method. The awaited
   *   result is read for `text` and, optionally, `sentences` (items with `text`
   *   and `end`; `end` is treated as seconds from the start of the window that
   *   was passed to `transcribe`).
   * @param {object} [options]
   * @param {number} [options.emissionInterval=0.5] - Seconds between emissions
   *   in `transcribeProgressive`.
   * @param {number} [options.maxWindowSize=15.0] - Window length (seconds) at
   *   which sentences start being fixed.
   * @param {number} [options.sentenceBuffer=2.0] - Trailing seconds of a full
   *   window whose sentences are never fixed. `0` is accepted.
   * @param {number} [options.sampleRate=16000] - Samples per second of the audio.
   */
  constructor(model, options = {}) {
    this.model = model;
    // `||` is deliberate for interval, window size and sample rate: 0 is not a
    // usable value for any of them, so it falls back to the default.
    this.emissionInterval = options.emissionInterval || 0.5; // 500ms
    this.maxWindowSize = options.maxWindowSize || 15.0; // 15 seconds
    // An explicit 0 is meaningful here (no safety margin), so honour it.
    this.sentenceBuffer =
      options.sentenceBuffer === 0 ? 0 : options.sentenceBuffer || 2.0; // 2 second buffer
    this.sampleRate = options.sampleRate || 16000;
    // State for incremental streaming
    this.reset();
  }

  /**
   * Reset state for a new streaming session.
   */
  reset() {
    this.fixedSentences = [];
    this.fixedEndTime = 0.0;
    this.lastTranscribedLength = 0;
    // Active text of the most recent transcription; replayed when the caller
    // asks again without supplying new audio.
    this.lastActiveText = "";
  }

  /**
   * Transcribe audio incrementally (for live streaming).
   *
   * Call this repeatedly with a growing audio buffer. Returns a single
   * PartialTranscription for the current state. Buffers shorter than 500ms are
   * not transcribed. A buffer with the same length as the previous call is not
   * re-transcribed; the previous active text is returned instead.
   *
   * @param {Float32Array} audio - Growing audio buffer
   * @returns {Promise<PartialTranscription>}
   */
  async transcribeIncremental(audio) {
    const currentLength = audio.length;
    const timestamp = currentLength / this.sampleRate;

    // Need at least 500ms of audio before transcribing.
    if (currentLength < this.sampleRate * 0.5) {
      return new PartialTranscription(this.fixedSentences.join(" "), "", timestamp, false);
    }

    // No new audio since the last transcription: reuse the cached active text
    // so that finalize() and the last step of transcribeProgressive() do not
    // lose the not-yet-fixed tail.
    if (currentLength === this.lastTranscribedLength) {
      return new PartialTranscription(
        this.fixedSentences.join(" "),
        this.lastActiveText,
        timestamp,
        false
      );
    }
    this.lastTranscribedLength = currentLength;

    // Window runs from the end of the last fixed sentence to the end of the audio.
    const audioWindow = audio.slice(Math.floor(this.fixedEndTime * this.sampleRate));
    const windowDuration = audioWindow.length / this.sampleRate;
    let result = await this.model.transcribe(audioWindow);

    if (
      windowDuration >= this.maxWindowSize &&
      result.sentences &&
      result.sentences.length > 1
    ) {
      // Window is too large - fix the leading sentences that end before the buffer zone.
      const cutoffTime = windowDuration - this.sentenceBuffer;
      const newFixedSentences = [];
      let newFixedEndTime = this.fixedEndTime;
      for (const sentence of result.sentences) {
        // Stop at the first sentence that is not strictly before the cutoff
        // (this also stops on a missing or NaN `end`).
        if (!(sentence.end < cutoffTime)) {
          break;
        }
        newFixedSentences.push(sentence.text.trim());
        // `end` is relative to the window, so offset it by the window start.
        newFixedEndTime = this.fixedEndTime + sentence.end;
      }

      if (newFixedSentences.length > 0) {
        this.fixedSentences.push(...newFixedSentences);
        this.fixedEndTime = newFixedEndTime;
        // Re-transcribe from the new fixed point so the active text excludes
        // what was just fixed.
        const remainingAudio = audio.slice(Math.floor(this.fixedEndTime * this.sampleRate));
        result = await this.model.transcribe(remainingAudio);
      }
    }

    this.lastActiveText = result.text ? result.text.trim() : "";
    return new PartialTranscription(
      this.fixedSentences.join(" "),
      this.lastActiveText,
      timestamp,
      false
    );
  }

  /**
   * Transcribe audio with smart progressive emissions.
   *
   * Resets the handler, then feeds growing prefixes of `audio` (one more
   * `emissionInterval` each step) through transcribeIncremental, waiting
   * `emissionInterval` seconds after each emission. Ends with one update whose
   * `isFinal` is true.
   *
   * Yields PartialTranscription with:
   * - fixedText: Completed sentences (won't change)
   * - activeText: Current partial transcription
   * - timestamp: Current position
   *
   * @param {Float32Array} audio - Complete audio buffer
   * @yields {PartialTranscription}
   */
  async *transcribeProgressive(audio) {
    const totalDuration = audio.length / this.sampleRate;
    let currentTime = 0;
    this.reset();

    while (currentTime < totalDuration) {
      currentTime += this.emissionInterval;
      const currentSamples = Math.min(
        Math.floor(currentTime * this.sampleRate),
        audio.length
      );
      yield await this.transcribeIncremental(audio.slice(0, currentSamples));
      // Small delay to simulate real-time
      await new Promise((resolve) => setTimeout(resolve, this.emissionInterval * 1000));
    }

    // Final transcription. The loop's last step usually covered the whole
    // buffer already; transcribeIncremental then returns its cached result.
    const finalResult = await this.transcribeIncremental(audio);
    yield new PartialTranscription(
      finalResult.fixedText,
      finalResult.activeText,
      finalResult.timestamp,
      true // is_final = true
    );
  }

  /**
   * Transcribe audio in batch mode (for uploaded files).
   *
   * Processes as fast as possible with full `maxWindowSize` windows:
   * - start with the maximum window size immediately;
   * - no artificial delays between windows;
   * - slide the window as soon as the current transcription completes.
   *
   * For a full window with usable sentence timestamps, sentences ending before
   * `windowDuration - sentenceBuffer` are fixed and the next window starts at
   * the end of the last fixed sentence. Otherwise the whole window's text is
   * fixed and the next window starts where this one ended.
   *
   * @param {Float32Array} audio - Complete audio buffer
   * @yields {PartialTranscription}
   */
  async *transcribeBatch(audio) {
    const totalDuration = audio.length / this.sampleRate;
    this.reset();
    let processedUpTo = 0; // Seconds of audio whose text has been fixed

    while (processedUpTo < totalDuration) {
      const windowStart = processedUpTo;
      const windowEnd = Math.min(windowStart + this.maxWindowSize, totalDuration);
      // Compare before subtracting: (windowStart + max) - windowStart can round
      // to slightly below max and misclassify a full window as the last one.
      const isFullWindow = windowStart + this.maxWindowSize <= totalDuration;

      const audioWindow = audio.slice(
        Math.floor(windowStart * this.sampleRate),
        Math.floor(windowEnd * this.sampleRate)
      );
      const result = await this.model.transcribe(audioWindow);

      if (isFullWindow && result.sentences && result.sentences.length > 1) {
        const cutoffTime = windowEnd - windowStart - this.sentenceBuffer;
        const sentencesToFix = result.sentences.filter((s) => s.end < cutoffTime);
        const lastSentenceTime =
          sentencesToFix.length > 0 ? sentencesToFix[sentencesToFix.length - 1].end : 0;

        // Only slide when it moves the window forward; a non-positive end time
        // would otherwise re-process the same window forever.
        if (lastSentenceTime > 0) {
          this.fixedSentences.push(...sentencesToFix.map((s) => s.text.trim()));
          processedUpTo = windowStart + lastSentenceTime;

          // Remaining sentences are reported as active; they are transcribed
          // again as part of the next window.
          const activeText = result.sentences
            .filter((s) => s.end >= cutoffTime)
            .map((s) => s.text)
            .join(" ")
            .trim();
          yield new PartialTranscription(
            this.fixedSentences.join(" "),
            activeText,
            windowEnd,
            false
          );
          continue;
        }
      }

      // Last window, or a full window without usable sentence boundaries: fix
      // the whole window's text and advance past all of it. Advancing by less
      // than the window would transcribe the same audio twice and duplicate it.
      const windowText = result.text ? result.text.trim() : "";
      if (windowText) {
        this.fixedSentences.push(windowText);
      }
      processedUpTo = windowEnd;
      yield new PartialTranscription(
        this.fixedSentences.join(" "),
        "",
        windowEnd,
        windowEnd >= totalDuration // final once the end of the audio is reached
      );
    }
  }

  /**
   * Get the final transcription by combining fixed + active text.
   *
   * @param {Float32Array} audio - Complete audio buffer
   * @returns {Promise<string>} Final complete transcription
   */
  async finalize(audio) {
    const { fixedText, activeText } = await this.transcribeIncremental(audio);
    return [fixedText, activeText].filter(Boolean).join(" ");
  }
}