File size: 9,979 Bytes
b830719
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea6aed8
 
 
b830719
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c9d9124
 
b830719
 
c9d9124
 
 
0e8835b
c9d9124
 
 
 
 
 
 
 
 
b830719
c9d9124
 
 
b830719
c9d9124
 
 
 
b830719
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b76aacc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c9d9124
 
 
b76aacc
c9d9124
 
 
 
b76aacc
 
c9d9124
 
b76aacc
c9d9124
 
 
b76aacc
 
 
 
 
 
 
 
 
c9d9124
b76aacc
 
 
 
 
 
 
 
 
 
 
 
c9d9124
b76aacc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b830719
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
/**
 * Smart Progressive Streaming Handler
 *
 * JavaScript port of STT/smart_progressive_streaming.py
 *
 * Provides frequent partial transcriptions (every 500ms by default) with:
 * - Growing window up to 15s for accuracy
 * - Sentence-boundary-aware window sliding for audio > 15s
 * - Fixed sentences + active transcription
 */

/**
 * Snapshot of a transcription in progress.
 *
 * Instances are plain value holders with four own properties, assigned in
 * this order: `fixedText`, `activeText`, `timestamp`, `isFinal`.
 */
export class PartialTranscription {
  /**
   * @param {string} fixedText - Sentences that won't change
   * @param {string} activeText - Current partial transcription
   * @param {number} timestamp - Current position in audio
   * @param {boolean} isFinal - True if this is the last update
   */
  constructor(fixedText, activeText, timestamp, isFinal) {
    // Shorthand properties keep the same insertion order as four assignments.
    Object.assign(this, { fixedText, activeText, timestamp, isFinal });
  }
}

/**
 * Smart progressive streaming with sentence-aware window management.
 *
 * Strategy:
 * 1. Emit partial transcriptions every `emissionInterval` seconds
 * 2. Use a growing window (up to `maxWindowSize` seconds) for better accuracy
 * 3. When the window reaches `maxWindowSize`, slide it using sentence boundaries:
 *    - Keep completed sentences as "fixed"
 *    - Only re-transcribe the "active" portion
 *
 * The model is only used through `await model.transcribe(samples)`. The result
 * is read as `{ text, sentences }`, where `sentences` is optional and each
 * entry provides `text` and `end`. `end` is treated as seconds relative to the
 * start of the transcribed window.
 */
export class SmartProgressiveStreamingHandler {
  /**
   * @param {{transcribe: function(Float32Array): Promise<Object>}} model - Transcription backend
   * @param {Object} [options]
   * @param {number} [options.emissionInterval=0.5] - Seconds between emissions in transcribeProgressive
   * @param {number} [options.maxWindowSize=15.0] - Window length (seconds) that triggers sliding
   * @param {number} [options.sentenceBuffer=2.0] - Seconds at the end of a window that are never fixed
   * @param {number} [options.sampleRate=16000] - Samples per second of the audio buffers
   */
  constructor(model, options = {}) {
    const { emissionInterval, maxWindowSize, sentenceBuffer, sampleRate } = options;

    this.model = model;
    // Missing or non-positive values fall back to the defaults: a zero or
    // negative interval would never advance transcribeProgressive, and a zero
    // sample rate or window size would break the duration arithmetic.
    this.emissionInterval = emissionInterval > 0 ? emissionInterval : 0.5;  // 500ms
    this.maxWindowSize = maxWindowSize > 0 ? maxWindowSize : 15.0;  // 15 seconds
    // Unlike the values above, 0 is a meaningful buffer ("fix right up to the
    // window end"), so it must not be replaced by the default.
    this.sentenceBuffer = (sentenceBuffer === 0 || sentenceBuffer > 0) ? sentenceBuffer : 2.0;
    this.sampleRate = sampleRate > 0 ? sampleRate : 16000;

    // State for incremental streaming
    this.reset();
  }

  /**
   * Reset state for a new streaming session.
   */
  reset() {
    this.fixedSentences = [];       // Sentences locked so far
    this.fixedEndTime = 0.0;        // Audio position (seconds) up to which text is fixed
    this.lastTranscribedLength = 0; // Buffer length (samples) of the last successful transcription
    this.lastActiveText = "";       // Active text produced by the last successful transcription
  }

  /**
   * Transcribe audio incrementally (for live streaming).
   *
   * Call this repeatedly with a growing audio buffer. Buffers shorter than
   * 500ms are not transcribed. When the buffer length has not changed since
   * the last successful call, the previous active text is returned without
   * calling the model again.
   *
   * @param {Float32Array} audio - Growing audio buffer
   * @returns {Promise<PartialTranscription>} Current state (isFinal is always false)
   */
  async transcribeIncremental(audio) {
    const currentLength = audio.length;
    const timestamp = currentLength / this.sampleRate;

    // Skip if not enough audio yet (need at least 500ms)
    if (currentLength < this.sampleRate * 0.5) {
      return new PartialTranscription(this.fixedSentences.join(" "), "", timestamp, false);
    }

    // No new audio since the last transcription: replay the cached active
    // text. Returning "" here would drop the tail of the transcription for
    // callers that ask again with the same buffer (finalize, the final
    // emission of transcribeProgressive).
    if (currentLength === this.lastTranscribedLength) {
      return new PartialTranscription(
        this.fixedSentences.join(" "),
        this.lastActiveText,
        timestamp,
        false
      );
    }

    // Extract window for transcription (from last fixed sentence to end)
    const windowStartSamples = Math.floor(this.fixedEndTime * this.sampleRate);
    const audioWindow = audio.slice(windowStartSamples);
    const windowDuration = audioWindow.length / this.sampleRate;

    // Transcribe current window
    let result = await this.model.transcribe(audioWindow);

    if (windowDuration >= this.maxWindowSize && result.sentences && result.sentences.length > 1) {
      // Window is too large - fix the leading sentences that end before the buffer zone
      const cutoffTime = windowDuration - this.sentenceBuffer;
      const newFixedSentences = [];
      let newFixedEndTime = this.fixedEndTime;

      for (const sentence of result.sentences) {
        if (sentence.end >= cutoffTime) {
          break;
        }
        newFixedSentences.push(sentence.text.trim());
        // sentence.end is relative to the window, fixedEndTime to the whole buffer
        newFixedEndTime = this.fixedEndTime + sentence.end;
      }

      if (newFixedSentences.length > 0) {
        this.fixedSentences.push(...newFixedSentences);
        this.fixedEndTime = newFixedEndTime;

        // Re-transcribe from the new fixed point so the active text no
        // longer contains the sentences that were just fixed
        const newWindowStartSamples = Math.floor(this.fixedEndTime * this.sampleRate);
        result = await this.model.transcribe(audio.slice(newWindowStartSamples));
      }
    }

    const activeText = result.text ? result.text.trim() : "";

    // Record progress only after the model call(s) succeeded, so a failed
    // transcription can be retried with the same buffer.
    this.lastTranscribedLength = currentLength;
    this.lastActiveText = activeText;

    return new PartialTranscription(this.fixedSentences.join(" "), activeText, timestamp, false);
  }

  /**
   * Transcribe audio with smart progressive emissions.
   *
   * Resets the handler, then feeds growing prefixes of `audio` to
   * transcribeIncremental, waiting `emissionInterval` seconds after each
   * emission. Ends with one extra emission flagged as final.
   *
   * Yields PartialTranscription with:
   * - fixedText: Completed sentences (won't change)
   * - activeText: Current partial transcription
   * - timestamp: Current position
   *
   * @param {Float32Array} audio - Complete audio buffer
   * @yields {PartialTranscription}
   */
  async *transcribeProgressive(audio) {
    const totalDuration = audio.length / this.sampleRate;
    let currentTime = 0;

    this.reset();

    while (currentTime < totalDuration) {
      currentTime += this.emissionInterval;
      const currentSamples = Math.min(
        Math.floor(currentTime * this.sampleRate),
        audio.length
      );

      yield await this.transcribeIncremental(audio.slice(0, currentSamples));

      // Small delay to simulate real-time
      await new Promise(resolve => setTimeout(resolve, this.emissionInterval * 1000));
    }

    // Final emission. The last loop iteration normally covered the whole
    // buffer already, in which case this returns the cached state.
    const finalResult = await this.transcribeIncremental(audio);
    yield new PartialTranscription(
      finalResult.fixedText,
      finalResult.activeText,
      finalResult.timestamp,
      true  // is_final = true
    );
  }

  /**
   * Transcribe audio in batch mode (for uploaded files).
   *
   * Processes as fast as possible with full windows:
   * - Start with maximum window size immediately
   * - No artificial delays between windows
   * - Slide window as soon as current transcription completes
   *
   * For every window that is not the last one, sentences ending before
   * `windowDuration - sentenceBuffer` are fixed and the next window starts at
   * the end of the last fixed sentence. If no sentence can be fixed, the text
   * of the whole window is fixed and the next window starts at the window end.
   * The window that reaches the end of the audio is fixed entirely and is the
   * only emission flagged as final. Empty audio yields nothing.
   *
   * @param {Float32Array} audio - Complete audio buffer
   * @yields {PartialTranscription}
   */
  async *transcribeBatch(audio) {
    const totalDuration = audio.length / this.sampleRate;
    this.reset();

    let processedUpTo = 0;  // Track how much audio we've finalized

    while (processedUpTo < totalDuration) {
      const windowStart = processedUpTo;
      const windowEnd = Math.min(windowStart + this.maxWindowSize, totalDuration);
      // Decide "last window" from the window end rather than from
      // (windowEnd - windowStart) >= maxWindowSize: that difference can round
      // below maxWindowSize and flag an intermediate window as final.
      const isLastWindow = windowEnd >= totalDuration;

      const audioWindow = audio.slice(
        Math.floor(windowStart * this.sampleRate),
        Math.floor(windowEnd * this.sampleRate)
      );
      const result = await this.model.transcribe(audioWindow);

      if (!isLastWindow && result.sentences && result.sentences.length > 1) {
        const cutoffTime = windowEnd - windowStart - this.sentenceBuffer;
        const sentencesToFix = result.sentences.filter(s => s.end < cutoffTime);
        const lastSentenceTime = sentencesToFix.length > 0
          ? sentencesToFix[sentencesToFix.length - 1].end
          : 0;

        // Only slide on sentence boundaries when that moves the window
        // forward; otherwise the loop would never terminate.
        if (lastSentenceTime > 0) {
          this.fixedSentences.push(...sentencesToFix.map(s => s.text.trim()));
          processedUpTo = windowStart + lastSentenceTime;

          // Remaining sentences are reported as active; they are transcribed
          // again as part of the next window.
          const activeText = result.sentences
            .filter(s => s.end >= cutoffTime)
            .map(s => s.text)
            .join(" ")
            .trim();

          yield new PartialTranscription(
            this.fixedSentences.join(" "),
            activeText,
            windowEnd,
            false
          );
          continue;
        }
      }

      // Last window, or no usable sentence boundary: lock the text of the
      // whole window. The window is consumed up to its end, because the locked
      // text covers all of it; advancing less would transcribe and append the
      // same audio twice.
      const windowText = result.text ? result.text.trim() : "";
      if (windowText) {
        this.fixedSentences.push(windowText);
      }
      processedUpTo = windowEnd;

      yield new PartialTranscription(
        this.fixedSentences.join(" "),
        "",
        windowEnd,
        isLastWindow
      );
    }
  }

  /**
   * Get final transcription by combining fixed + active.
   *
   * @param {Float32Array} audio - Complete audio buffer
   * @returns {Promise<string>} Final complete transcription
   */
  async finalize(audio) {
    const { fixedText, activeText } = await this.transcribeIncremental(audio);
    // Drop empty parts so the result has no leading/trailing separator
    return [fixedText, activeText].filter(Boolean).join(" ");
  }
}