/**
* WindowBuilder.ts
*
* Cursor-based dynamic window construction for transcription.
* Ported from zdasr-main/src/zdasr/window_cursor.py.
*
* Instead of a fixed 5-second window that fires every 1.5 seconds,
* this builder creates windows that:
* - Start at the mature cursor (end of last finalized sentence)
* - Extend to the current audio position
* - Respect min/max duration constraints
* - Optionally use VAD data to align boundaries to silence
* - Never re-transcribe audio before the mature cursor
*/
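/*
 * Usage sketch (illustrative only; `ring`, `vad`, and `ring.read()` are
 * placeholders for whatever IRingBuffer / VADRingBuffer APIs the host app
 * provides; this file does not define a read method):
 *
 *   const builder = new WindowBuilder(ring, vad, { minDurationSec: 2.0 });
 *
 *   // On each transcription tick:
 *   const win = builder.buildWindow();
 *   if (win !== null) {
 *     const audio = ring.read(win.startFrame, win.endFrame); // hypothetical
 *     // ...transcribe `audio`; once a sentence finalizes at frame `sentEnd`:
 *     builder.markSentenceEnd(sentEnd);
 *     builder.advanceMatureCursor(sentEnd);
 *   }
 */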
import type { IRingBuffer } from '../audio/types';
import type { VADRingBuffer } from '../vad/VADRingBuffer';
/** The result of building a transcription window */
export interface TranscriptionWindow {
/** Start frame in global offset */
startFrame: number;
/** End frame in global offset */
endFrame: number;
/** Duration of the window in seconds */
durationSeconds: number;
/** Whether this is an initial (pre-first-sentence) window */
isInitial: boolean;
}
/** Configuration for WindowBuilder */
export interface WindowBuilderConfig {
/** Audio sample rate in Hz (default: 16000) */
sampleRate: number;
/** Minimum window duration in seconds (default: 3.0) */
minDurationSec: number;
/** Maximum window duration in seconds (default: 30.0) */
maxDurationSec: number;
/** Minimum window duration in seconds before the first sentence (default: 1.5) */
minInitialDurationSec: number;
/** Maximum number of recent sentence-end markers to keep (default: 4) */
maxSentences: number;
/** Whether to use VAD for boundary refinement (default: true) */
useVadBoundaries: boolean;
/** VAD silence threshold (default: 0.3) */
vadSilenceThreshold: number;
/** Enable debug logging (default: false) */
debug: boolean;
}
export class WindowBuilder {
private config: WindowBuilderConfig;
private ringBuffer: IRingBuffer;
private vadBuffer: VADRingBuffer | null;
// State
private sentenceEnds: number[] = [];
private matureCursorFrame: number = 0;
private firstSentenceReceived: boolean = false;
constructor(
ringBuffer: IRingBuffer,
vadBuffer: VADRingBuffer | null = null,
config: Partial<WindowBuilderConfig> = {}
) {
this.ringBuffer = ringBuffer;
this.vadBuffer = vadBuffer;
this.config = {
sampleRate: 16000,
minDurationSec: 3.0,
maxDurationSec: 30.0,
minInitialDurationSec: 1.5,
maxSentences: 4,
useVadBoundaries: true,
vadSilenceThreshold: 0.3,
debug: false,
...config,
};
}
// ---- Sentence boundary bookkeeping ----
/**
* Record the end frame of a fully finalized sentence.
*/
markSentenceEnd(frameIdx: number): void {
this.sentenceEnds.push(frameIdx);
// Keep only the most recent N
if (this.sentenceEnds.length > this.config.maxSentences) {
this.sentenceEnds = this.sentenceEnds.slice(-this.config.maxSentences);
}
if (!this.firstSentenceReceived) {
this.firstSentenceReceived = true;
if (this.config.debug) {
console.log(`[WindowBuilder] First sentence received at frame ${frameIdx}`);
}
}
}
// ---- Mature cursor management ----
/**
* Advance the mature cursor to a finalized sentence boundary.
* The mature cursor marks where transcription is considered stable.
*/
advanceMatureCursor(frameIdx: number): void {
if (frameIdx > this.matureCursorFrame) {
const oldCursor = this.matureCursorFrame;
this.matureCursorFrame = frameIdx;
if (this.config.debug) {
const cursorTime = frameIdx / this.config.sampleRate;
console.log(
`[WindowBuilder] Cursor advanced from frame ${oldCursor} to ${frameIdx} (${cursorTime.toFixed(2)}s)`
);
}
if (!this.firstSentenceReceived) {
this.firstSentenceReceived = true;
}
}
}
/**
* Advance the mature cursor using a time value (seconds).
* Converts to frame offset based on sample rate.
*/
advanceMatureCursorByTime(timeSec: number): void {
const frameIdx = Math.round(timeSec * this.config.sampleRate);
this.advanceMatureCursor(frameIdx);
}
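// Example: at the default 16 kHz sample rate, advanceMatureCursorByTime(3.2)
// advances the cursor to Math.round(3.2 * 16000) = frame 51200.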
/**
* Get current mature cursor position in frames.
*/
getMatureCursorFrame(): number {
return this.matureCursorFrame;
}
/**
* Get current mature cursor position in seconds.
*/
getMatureCursorTime(): number {
return this.matureCursorFrame / this.config.sampleRate;
}
// ---- Window building ----
/**
* Build a transcription window from the mature cursor to the current buffer head.
*
* Returns null if:
* - No data in the buffer
* - Not enough audio for minimum duration
* - Start frame >= end frame
*
* The caller should use the returned startFrame/endFrame to extract audio
* from the ring buffer and request mel features from the mel worker.
*/
buildWindow(): TranscriptionWindow | null {
const endFrame = this.ringBuffer.getCurrentFrame();
const baseFrame = this.ringBuffer.getBaseFrameOffset();
if (endFrame === baseFrame) {
return null; // no data
}
const availableFrames = endFrame - baseFrame;
// ---- Initial mode (before first sentence) ----
if (!this.firstSentenceReceived) {
const minInitialFrames = Math.round(
this.config.minInitialDurationSec * this.config.sampleRate
);
if (availableFrames < minInitialFrames) {
if (this.config.debug) {
const availDur = availableFrames / this.config.sampleRate;
console.log(
`[WindowBuilder] Initial mode: waiting (${availDur.toFixed(2)}s / ${this.config.minInitialDurationSec}s)`
);
}
return null;
}
// Start from base, up to max duration
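// (When more audio than maxDurationSec is available here, the oldest audio
// is kept; normal mode below keeps the newest instead.)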
const maxFrames = Math.round(this.config.maxDurationSec * this.config.sampleRate);
const clippedEnd = Math.min(endFrame, baseFrame + maxFrames);
const duration = (clippedEnd - baseFrame) / this.config.sampleRate;
if (this.config.debug) {
console.log(
`[WindowBuilder] Initial window [${baseFrame}:${clippedEnd}] (${duration.toFixed(2)}s)`
);
}
return {
startFrame: baseFrame,
endFrame: clippedEnd,
durationSeconds: duration,
isInitial: true,
};
}
// ---- Normal mode (after first sentence) ----
// Determine start frame from mature cursor or sentence ends
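// Prefer the mature cursor; otherwise fall back to the second-most-recent
// sentence end so the last finalized sentence remains as left context.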
let startFrame: number;
if (this.matureCursorFrame > 0) {
startFrame = this.matureCursorFrame;
} else if (this.sentenceEnds.length >= 2) {
startFrame = this.sentenceEnds[this.sentenceEnds.length - 2];
} else if (this.sentenceEnds.length >= 1) {
startFrame = this.sentenceEnds[0];
} else {
startFrame = baseFrame;
}
// Ensure start frame is within valid buffer range
if (startFrame < baseFrame) {
if (this.config.debug) {
console.log(
`[WindowBuilder] Start frame ${startFrame} < base ${baseFrame}; clipping to base.`
);
}
startFrame = baseFrame;
}
if (startFrame >= endFrame) {
if (this.config.debug) {
console.log('[WindowBuilder] Start frame >= end frame, nothing new to transcribe');
}
return null;
}
let windowFrames = endFrame - startFrame;
// Enforce minimum duration (never extend backward past cursor)
const minFrames = Math.round(this.config.minDurationSec * this.config.sampleRate);
if (windowFrames < minFrames) {
if (this.config.debug) {
const dur = windowFrames / this.config.sampleRate;
console.log(
`[WindowBuilder] Insufficient audio (${dur.toFixed(2)}s < ${this.config.minDurationSec}s). Waiting...`
);
}
return null;
}
// Enforce maximum duration (keep the newest audio, but never move the start
// before the mature cursor)
const maxFrames = Math.round(this.config.maxDurationSec * this.config.sampleRate);
if (windowFrames > maxFrames) {
const proposedStart = endFrame - maxFrames;
if (proposedStart < this.matureCursorFrame) {
startFrame = this.matureCursorFrame;
} else {
startFrame = proposedStart;
}
windowFrames = endFrame - startFrame;
}
// VAD boundary refinement: nudge start to a silence boundary
if (this.config.useVadBoundaries && this.vadBuffer) {
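// Search only the first 0.5 s after the current start for a silence
// boundary, so refinement can trim leading silence without skipping
// far into the window.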
const searchEnd = Math.min(
startFrame + Math.round(this.config.sampleRate * 0.5),
endFrame
);
const vadStart = this.vadBuffer.findSilenceBoundary(
searchEnd,
startFrame,
this.config.vadSilenceThreshold
);
if (vadStart > startFrame) {
const newWindowFrames = endFrame - vadStart;
const newDuration = newWindowFrames / this.config.sampleRate;
if (newDuration >= this.config.minDurationSec) {
if (this.config.debug) {
console.log(
`[WindowBuilder] VAD adjusted start: ${startFrame} -> ${vadStart}`
);
}
startFrame = vadStart;
windowFrames = newWindowFrames;
}
}
}
// Final validation
if (startFrame >= endFrame) {
return null;
}
const durationSeconds = (endFrame - startFrame) / this.config.sampleRate;
if (this.config.debug) {
console.log(
`[WindowBuilder] Window [${startFrame}:${endFrame}] duration=${durationSeconds.toFixed(2)}s cursor=${this.matureCursorFrame}`
);
}
return {
startFrame,
endFrame,
durationSeconds,
isInitial: false,
};
}
/**
* Check if there is extended silence at the buffer tail (for flush decisions).
*/
getSilenceTailDuration(): number {
if (!this.vadBuffer) return 0;
return this.vadBuffer.getSilenceTailDuration(this.config.vadSilenceThreshold);
}
/**
* Check if there is speech in the current pending window.
*/
hasSpeechInPendingWindow(): boolean {
if (!this.vadBuffer) return true; // Assume speech if no VAD buffer
const endFrame = this.ringBuffer.getCurrentFrame();
const startFrame = this.matureCursorFrame > 0
? this.matureCursorFrame
: this.ringBuffer.getBaseFrameOffset();
if (startFrame >= endFrame) return false;
return this.vadBuffer.hasSpeechInRange(startFrame, endFrame, this.config.vadSilenceThreshold);
}
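// Flush heuristic sketch (an assumed policy, not something this class
// enforces): force a final pass when the tail has been silent for ~1 s
// while the pending window still contains speech:
//   if (builder.getSilenceTailDuration() > 1.0 && builder.hasSpeechInPendingWindow()) {
//     forceFlush(); // hypothetical host-app callback
//   }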
/**
* Reset all internal state.
*/
reset(): void {
this.sentenceEnds = [];
this.matureCursorFrame = 0;
this.firstSentenceReceived = false;
if (this.config.debug) {
console.log('[WindowBuilder] Reset');
}
}
/**
* Get the current configuration.
*/
getConfig(): WindowBuilderConfig {
return { ...this.config };
}
}