Spaces:

andito
/

parakeet-v3-streaming

Running

File size: 9,979 Bytes

/**
 * Smart Progressive Streaming Handler
 *
 * JavaScript port of STT/smart_progressive_streaming.py
 *
 * Provides frequent partial transcriptions (every 500ms by default) with:
 * - Growing window up to 15s for accuracy
 * - Sentence-boundary-aware window sliding for audio > 15s
 * - Fixed sentences + active transcription
 */

/**
 * One snapshot of a streaming transcription.
 *
 * Plain value holder: the four constructor arguments are stored unchanged
 * as own properties of the instance.
 */
export class PartialTranscription {
  /**
   * @param {string} fixedText - Sentences that won't change
   * @param {string} activeText - Current partial transcription
   * @param {number} timestamp - Current position in audio
   * @param {boolean} isFinal - True if this is the last update
   */
  constructor(fixedText, activeText, timestamp, isFinal) {
    Object.assign(this, { fixedText, activeText, timestamp, isFinal });
  }
}

/**
 * Smart progressive streaming with sentence-aware window management.
 *
 * Strategy:
 * 1. Emit partial transcriptions every `emissionInterval` seconds
 * 2. Re-transcribe a growing window (up to `maxWindowSize` seconds) for
 *    better accuracy
 * 3. Once the window reaches `maxWindowSize`, slide it on sentence boundaries:
 *    - Sentences ending more than `sentenceBuffer` seconds before the end of
 *      the window are kept as "fixed"
 *    - Only the remaining "active" portion is re-transcribed
 *
 * As used by this class, `model` is any object with an async
 * `transcribe(Float32Array)` method resolving to `{ text, sentences }`, where
 * `sentences` is an optional array of `{ text, end }` and `end` is in seconds
 * relative to the start of the window that was passed in.
 */
export class SmartProgressiveStreamingHandler {
  /**
   * @param {object} model - Object exposing `transcribe(audio)`
   * @param {object} [options]
   * @param {number} [options.emissionInterval=0.5] - Seconds between partial results
   * @param {number} [options.maxWindowSize=15.0] - Maximum window length in seconds
   * @param {number} [options.sentenceBuffer=2.0] - Seconds at the end of a full
   *   window whose sentences are never fixed (they may still change)
   * @param {number} [options.sampleRate=16000] - Samples per second of the audio
   */
  constructor(model, options = {}) {
    this.model = model;

    // `||` is deliberate for these three: 0 is not a usable value (it would
    // stall the emission loop or divide by zero), so it falls back to the default.
    this.emissionInterval = options.emissionInterval || 0.5;  // 500ms
    this.maxWindowSize = options.maxWindowSize || 15.0;  // 15 seconds
    this.sampleRate = options.sampleRate || 16000;

    // A buffer of 0 is a meaningful setting, so only fall back to the default
    // when the option is missing.
    this.sentenceBuffer = options.sentenceBuffer == null ? 2.0 : options.sentenceBuffer;

    // State for incremental streaming
    this.reset();
  }

  /**
   * Reset state for a new streaming session.
   */
  reset() {
    this.fixedSentences = [];
    this.fixedEndTime = 0.0;
    this.lastTranscribedLength = 0;
    // Active text of the most recent transcription; returned again when the
    // handler is asked about a buffer it has already transcribed.
    this.lastActiveText = "";
  }

  /**
   * Transcribe audio incrementally (for live streaming).
   *
   * Call this repeatedly with a growing audio buffer. Returns a single
   * PartialTranscription for the current state; `isFinal` is always false.
   *
   * @param {Float32Array} audio - Growing audio buffer
   * @returns {Promise<PartialTranscription>}
   */
  async transcribeIncremental(audio) {
    const currentLength = audio.length;
    const timestamp = currentLength / this.sampleRate;

    // Skip if not enough audio yet (need at least 500ms)
    if (currentLength < this.sampleRate * 0.5) {
      return new PartialTranscription(
        this.fixedSentences.join(" "),
        "",
        timestamp,
        false
      );
    }

    // No new audio since the last transcription: repeat the previous active
    // text instead of dropping it, so a trailing call with the same buffer
    // (the final progressive update, finalize()) still reports everything.
    if (currentLength === this.lastTranscribedLength) {
      return new PartialTranscription(
        this.fixedSentences.join(" "),
        this.lastActiveText,
        timestamp,
        false
      );
    }

    this.lastTranscribedLength = currentLength;

    // Window for transcription: from the end of the last fixed sentence to
    // the end of the buffer
    const windowStartSamples = Math.floor(this.fixedEndTime * this.sampleRate);
    const audioWindow = audio.slice(windowStartSamples);
    const windowDuration = audioWindow.length / this.sampleRate;

    let result = await this.model.transcribe(audioWindow);

    if (windowDuration >= this.maxWindowSize && result.sentences && result.sentences.length > 1) {
      // Window is too large - fix the leading sentences that end before the cutoff
      const cutoffTime = windowDuration - this.sentenceBuffer;

      const newFixedSentences = [];
      let newFixedEndTime = this.fixedEndTime;

      for (const sentence of result.sentences) {
        if (sentence.end < cutoffTime) {
          newFixedSentences.push(sentence.text.trim());
          // sentence.end is relative to the window start
          newFixedEndTime = this.fixedEndTime + sentence.end;
        } else {
          break;
        }
      }

      if (newFixedSentences.length > 0) {
        this.fixedSentences.push(...newFixedSentences);
        this.fixedEndTime = newFixedEndTime;

        // Re-transcribe from the new fixed point so the active text no longer
        // contains the sentences that were just fixed
        const newWindowStartSamples = Math.floor(this.fixedEndTime * this.sampleRate);
        result = await this.model.transcribe(audio.slice(newWindowStartSamples));
      }
    }

    this.lastActiveText = result.text ? result.text.trim() : "";

    return new PartialTranscription(
      this.fixedSentences.join(" "),
      this.lastActiveText,
      timestamp,
      false
    );
  }

  /**
   * Transcribe a complete buffer with smart progressive emissions.
   *
   * Resets the handler, then feeds growing prefixes of `audio` to
   * transcribeIncremental, waiting `emissionInterval` seconds between
   * emissions to simulate real time. The last yielded value has
   * `isFinal === true`.
   *
   * Yields PartialTranscription with:
   * - fixedText: Completed sentences (won't change)
   * - activeText: Current partial transcription
   * - timestamp: Current position
   *
   * @param {Float32Array} audio - Complete audio buffer
   * @yields {PartialTranscription}
   */
  async *transcribeProgressive(audio) {
    const totalDuration = audio.length / this.sampleRate;
    let currentTime = 0;

    this.reset();

    while (currentTime < totalDuration) {
      currentTime += this.emissionInterval;
      const currentSamples = Math.min(
        Math.floor(currentTime * this.sampleRate),
        audio.length
      );

      const currentAudio = audio.slice(0, currentSamples);
      yield await this.transcribeIncremental(currentAudio);

      // Small delay to simulate real-time
      await new Promise(resolve => setTimeout(resolve, this.emissionInterval * 1000));
    }

    // Final transcription. The loop has normally already transcribed the full
    // buffer, in which case this returns the cached state without calling the
    // model again.
    const finalResult = await this.transcribeIncremental(audio);
    yield new PartialTranscription(
      finalResult.fixedText,
      finalResult.activeText,
      finalResult.timestamp,
      true  // is_final = true
    );
  }

  /**
   * Transcribe audio in batch mode (for uploaded files).
   *
   * Processes as fast as possible with full windows:
   * - Start with maximum window size immediately
   * - No artificial delays between windows
   * - Slide window as soon as current transcription completes
   *
   * For a full window, sentences ending before the sentence buffer are fixed
   * and the next window starts at the end of the last fixed sentence. When
   * that is not possible (no usable sentence timestamps), or for the last
   * short window, the whole window text is fixed and the next window starts
   * where this one ended, so no audio is transcribed into the output twice.
   * The update whose window reaches the end of the audio has `isFinal === true`.
   *
   * @param {Float32Array} audio - Complete audio buffer
   * @yields {PartialTranscription}
   */
  async *transcribeBatch(audio) {
    const totalDuration = audio.length / this.sampleRate;
    this.reset();

    let processedUpTo = 0;  // Track how much audio we've finalized

    while (processedUpTo < totalDuration) {
      // Next window: maxWindowSize seconds, or the remaining audio if shorter
      const windowStart = processedUpTo;
      const windowEnd = Math.min(processedUpTo + this.maxWindowSize, totalDuration);
      const windowDuration = windowEnd - windowStart;
      const reachesEnd = windowEnd >= totalDuration;

      const audioWindow = audio.slice(
        Math.floor(windowStart * this.sampleRate),
        Math.floor(windowEnd * this.sampleRate)
      );

      const result = await this.model.transcribe(audioWindow);

      // A window that stops before the end of the audio is full by
      // construction; testing that instead of the computed duration alone
      // avoids misclassifying it when (start + max) - start rounds below max.
      const isFullWindow = !reachesEnd || windowDuration >= this.maxWindowSize;

      if (isFullWindow && result.sentences && result.sentences.length > 1) {
        const cutoffTime = windowDuration - this.sentenceBuffer;
        const sentencesToFix = result.sentences.filter(s => s.end < cutoffTime);
        const lastSentenceTime = sentencesToFix.length > 0
          ? sentencesToFix[sentencesToFix.length - 1].end
          : 0;

        // Only slide on a sentence boundary that actually moves forward;
        // otherwise the loop would never terminate.
        if (lastSentenceTime > 0) {
          this.fixedSentences.push(...sentencesToFix.map(s => s.text.trim()));
          processedUpTo = windowStart + lastSentenceTime;

          // Active text: the sentences that were not fixed
          const activeText = result.sentences
            .filter(s => s.end >= cutoffTime)
            .map(s => s.text)
            .join(" ")
            .trim();

          yield new PartialTranscription(
            this.fixedSentences.join(" "),
            activeText,
            windowEnd,
            false
          );
          continue;
        }
      }

      // Last window, or no sentence boundary to slide on: lock the whole
      // window and continue right after it.
      const windowText = result.text ? result.text.trim() : "";
      if (windowText) {
        // Skipping empty text keeps stray separators out of the joined output
        this.fixedSentences.push(windowText);
      }
      processedUpTo = windowEnd;

      yield new PartialTranscription(
        this.fixedSentences.join(" "),
        "",
        windowEnd,
        reachesEnd
      );
    }
  }

  /**
   * Get final transcription by combining fixed + active.
   *
   * @param {Float32Array} audio - Complete audio buffer
   * @returns {Promise<string>} Final complete transcription
   */
  async finalize(audio) {
    const result = await this.transcribeIncremental(audio);

    const parts = [];
    if (result.fixedText) parts.push(result.fixedText);
    if (result.activeText) parts.push(result.activeText);

    return parts.join(" ");
  }
}