Spaces:
Running
Running
File size: 9,979 Bytes
b830719 ea6aed8 b830719 c9d9124 b830719 c9d9124 0e8835b c9d9124 b830719 c9d9124 b830719 c9d9124 b830719 b76aacc c9d9124 b76aacc c9d9124 b76aacc c9d9124 b76aacc c9d9124 b76aacc c9d9124 b76aacc c9d9124 b76aacc b830719 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 |
/**
* Smart Progressive Streaming Handler
*
* JavaScript port of STT/smart_progressive_streaming.py
*
* Provides frequent partial transcriptions (every 500ms by default, configurable via `emissionInterval`) with:
* - Growing window up to 15s for accuracy
* - Sentence-boundary-aware window sliding for audio > 15s
* - Fixed sentences + active transcription
*/
/**
 * Immutable-by-convention snapshot of a streaming transcription.
 *
 * Holds the text that is already locked in, the text that may still change,
 * the audio position the snapshot refers to, and a last-update marker.
 */
export class PartialTranscription {
  /**
   * @param {string} fixedText - Sentences that won't change
   * @param {string} activeText - Current partial transcription
   * @param {number} timestamp - Current position in audio
   * @param {boolean} isFinal - True if this is the last update
   */
  constructor(fixedText, activeText, timestamp, isFinal) {
    // Shorthand keys preserve the original property names and insertion order.
    Object.assign(this, { fixedText, activeText, timestamp, isFinal });
  }
}
/**
 * Smart progressive streaming with sentence-aware window management.
 *
 * Strategy:
 * 1. Emit partial transcriptions at a fixed interval (500ms by default).
 * 2. Use a growing window (up to `maxWindowSize` seconds) for better accuracy.
 * 3. When the window reaches `maxWindowSize`, slide it using sentence boundaries:
 *    - keep completed sentences as "fixed";
 *    - only re-transcribe the "active" portion.
 *
 * The model passed in must expose `transcribe(audio)`. Its (awaited) result is
 * read for `text` and, optionally, `sentences` — a list of entries with `text`
 * and `end`, where `end` is treated as seconds from the start of the
 * transcribed window.
 */
export class SmartProgressiveStreamingHandler {
  /**
   * @param {object} model - Object exposing `transcribe(audio)`.
   * @param {object} [options]
   * @param {number} [options.emissionInterval=0.5] - Seconds between emissions.
   * @param {number} [options.maxWindowSize=15.0] - Maximum window length in seconds.
   * @param {number} [options.sentenceBuffer=2.0] - Seconds at the end of a window
   *   that are never locked as fixed.
   * @param {number} [options.sampleRate=16000] - Samples per second of the audio.
   */
  constructor(model, options = {}) {
    this.model = model;
    // Falsy values (including 0) fall back to the defaults.
    this.emissionInterval = options.emissionInterval || 0.5; // 500ms
    this.maxWindowSize = options.maxWindowSize || 15.0; // 15 seconds
    this.sentenceBuffer = options.sentenceBuffer || 2.0; // 2 second buffer
    this.sampleRate = options.sampleRate || 16000;
    // State for incremental streaming
    this.reset();
  }

  /**
   * Reset state for a new streaming session.
   */
  reset() {
    this.fixedSentences = [];
    this.fixedEndTime = 0.0;
    this.lastTranscribedLength = 0;
    // Active text of the last successful transcription, re-emitted when the
    // handler is asked again about a buffer that has not grown.
    this.lastActiveText = "";
  }

  /**
   * Transcribe audio incrementally (for live streaming).
   *
   * Call this repeatedly with a growing audio buffer. Returns a single
   * PartialTranscription describing the current state.
   *
   * @param {Float32Array} audio - Growing audio buffer
   * @returns {Promise<PartialTranscription>}
   */
  async transcribeIncremental(audio) {
    const currentLength = audio.length;
    const timestamp = currentLength / this.sampleRate;

    // Need at least 500ms of audio before attempting a transcription.
    if (currentLength < this.sampleRate * 0.5) {
      return new PartialTranscription(this.fixedSentences.join(" "), "", timestamp, false);
    }

    // No new audio since the last transcription: re-emit the cached active
    // text instead of an empty string, so a repeated call (e.g. the final
    // emission or finalize()) does not lose the active portion.
    if (currentLength === this.lastTranscribedLength) {
      return new PartialTranscription(
        this.fixedSentences.join(" "),
        this.lastActiveText,
        timestamp,
        false
      );
    }

    // Extract window for transcription (from last fixed sentence to end)
    const windowStartSamples = Math.floor(this.fixedEndTime * this.sampleRate);
    const audioWindow = audio.slice(windowStartSamples);
    const windowDuration = audioWindow.length / this.sampleRate;

    let result = await this.model.transcribe(audioWindow);

    const windowIsFull = windowDuration >= this.maxWindowSize;
    if (windowIsFull && result.sentences && result.sentences.length > 1) {
      // Window is too large - lock the leading sentences that end before the
      // safety buffer at the tail of the window.
      const cutoffTime = windowDuration - this.sentenceBuffer;
      const newFixedSentences = [];
      let newFixedEndTime = this.fixedEndTime;
      for (const sentence of result.sentences) {
        if (sentence.end >= cutoffTime) {
          break;
        }
        newFixedSentences.push(sentence.text.trim());
        newFixedEndTime = this.fixedEndTime + sentence.end;
      }

      if (newFixedSentences.length > 0) {
        this.fixedSentences.push(...newFixedSentences);
        this.fixedEndTime = newFixedEndTime;
        // Re-transcribe from the new fixed point
        const newWindowStartSamples = Math.floor(this.fixedEndTime * this.sampleRate);
        result = await this.model.transcribe(audio.slice(newWindowStartSamples));
      }
    }

    const activeText = result.text ? result.text.trim() : "";

    // Record progress only after the model call(s) succeeded, so a failed
    // transcription can be retried with the same buffer.
    this.lastTranscribedLength = currentLength;
    this.lastActiveText = activeText;

    return new PartialTranscription(this.fixedSentences.join(" "), activeText, timestamp, false);
  }

  /**
   * Transcribe audio with smart progressive emissions.
   *
   * Yields PartialTranscription with:
   * - fixedText: Completed sentences (won't change)
   * - activeText: Current partial transcription
   * - timestamp: Current position
   *
   * The last yielded item has `isFinal === true`.
   *
   * @param {Float32Array} audio - Complete audio buffer
   * @yields {PartialTranscription}
   */
  async *transcribeProgressive(audio) {
    const totalDuration = audio.length / this.sampleRate;
    this.reset();

    let step = 0;
    let currentTime = 0;
    while (currentTime < totalDuration) {
      // Derive the position from a step counter to avoid accumulating
      // floating-point drift over long recordings.
      step += 1;
      currentTime = step * this.emissionInterval;
      const currentSamples = Math.min(
        Math.floor(currentTime * this.sampleRate),
        audio.length
      );
      yield await this.transcribeIncremental(audio.slice(0, currentSamples));
      // Small delay to simulate real-time
      await new Promise((resolve) => setTimeout(resolve, this.emissionInterval * 1000));
    }

    // Final transcription. If the loop already covered the full buffer this
    // returns the cached state without another model call.
    const finalResult = await this.transcribeIncremental(audio);
    yield new PartialTranscription(
      finalResult.fixedText,
      finalResult.activeText,
      finalResult.timestamp,
      true
    );
  }

  /**
   * Transcribe audio in batch mode (for uploaded files).
   *
   * Processes as fast as possible with full windows:
   * - Start with maximum window size immediately
   * - No artificial delays between windows
   * - Slide window as soon as current transcription completes
   *
   * The item covering the end of the audio is yielded with `isFinal === true`.
   * Nothing is yielded for an empty buffer.
   *
   * @param {Float32Array} audio - Complete audio buffer
   * @yields {PartialTranscription}
   */
  async *transcribeBatch(audio) {
    const totalDuration = audio.length / this.sampleRate;
    this.reset();

    let processedUpTo = 0; // Seconds of audio whose text has been locked
    while (processedUpTo < totalDuration) {
      const windowStart = processedUpTo;
      const windowEnd = Math.min(windowStart + this.maxWindowSize, totalDuration);
      const windowDuration = windowEnd - windowStart;
      // Compare against the end of the audio rather than the window length:
      // `windowEnd - windowStart` can differ from maxWindowSize by rounding.
      const isLastWindow = windowEnd >= totalDuration;

      const audioWindow = audio.slice(
        Math.floor(windowStart * this.sampleRate),
        Math.floor(windowEnd * this.sampleRate)
      );
      const result = await this.model.transcribe(audioWindow);

      if (!isLastWindow) {
        // Full window - try to slide on a sentence boundary.
        const cutoffTime = windowDuration - this.sentenceBuffer;
        const sentences =
          result.sentences && result.sentences.length > 1 ? result.sentences : [];
        const sentencesToFix = sentences.filter((s) => s.end < cutoffTime);
        const advance =
          sentencesToFix.length > 0 ? sentencesToFix[sentencesToFix.length - 1].end : 0;

        // Only slide when it actually moves the window forward; otherwise the
        // loop would never terminate.
        if (advance > 0) {
          this.fixedSentences.push(...sentencesToFix.map((s) => s.text.trim()));
          processedUpTo = windowStart + advance;
          const activeText = sentences
            .filter((s) => s.end >= cutoffTime)
            .map((s) => s.text)
            .join(" ")
            .trim();
          yield new PartialTranscription(
            this.fixedSentences.join(" "),
            activeText,
            windowEnd,
            false
          );
          continue;
        }
      }

      // Last window, or no usable sentence boundary: the text cannot be split,
      // so lock the whole window and advance past all of it. Advancing by less
      // would transcribe (and append) part of the same audio twice.
      const windowText = result.text ? result.text.trim() : "";
      if (windowText) {
        this.fixedSentences.push(windowText);
      }
      processedUpTo = windowEnd;
      yield new PartialTranscription(
        this.fixedSentences.join(" "),
        "",
        windowEnd,
        isLastWindow
      );
    }
  }

  /**
   * Get final transcription by combining fixed + active.
   *
   * @param {Float32Array} audio - Complete audio buffer
   * @returns {Promise<string>} Final complete transcription
   */
  async finalize(audio) {
    const result = await this.transcribeIncremental(audio);
    return [result.fixedText, result.activeText].filter(Boolean).join(" ");
  }
}
|