|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Audio sample rate (Hz) requested from the microphone and AudioContext.
const SAMPLE_RATE = 16000;

// VAD analysis frame: 160 samples = 10 ms at 16 kHz.
const VAD_CHUNK_SAMPLES = 160;

// ASR audio chunk: 320 samples = 20 ms at 16 kHz.
const ASR_CHUNK_SAMPLES = 320;

// Samples accumulated before one batch is posted to the encoder worker
// (5120 samples = 320 ms at 16 kHz).
const ENCODER_BATCH_SAMPLES = 5120;

// Rolling pre-roll kept while idle, in ASR chunks (25 * 20 ms = 500 ms),
// flushed to the encoder when a segment starts.
const PRE_BUFFER_CHUNKS = 25;

// Post-roll sent after a segment offset, in ASR chunks (5 * 20 ms = 100 ms).
const POST_BUFFER_CHUNKS = 5;

// A segment cannot be ended before it has lasted this long.
const MIN_SEGMENT_DURATION_MS = 2000;

// Between these two elapsed times the effective offset threshold ramps from
// its configured value up toward 1.0, so long segments get easier to end.
const OFFSET_RAMP_START_MS = 6000;

const OFFSET_RAMP_END_MS = 8000;

// Consecutive sub-threshold VAD frames (10 ms each) required for an offset.
const OFFSET_CHUNKS_REQUIRED = 10;
|
|
|
|
|
/**
 * Per-model architecture hyperparameters, keyed by model name and forwarded
 * verbatim to the encoder/decoder workers in their 'init' messages.
 * Frozen (outer object and each config) so this shared constant cannot be
 * mutated at runtime; workers receive structured clones, so freezing is safe.
 * Field semantics are consumed by the workers — presumably model dimensions
 * and attention layout; verify against the worker implementations.
 */
const MODEL_CONFIGS = Object.freeze({
  sleeker: Object.freeze({
    dim: 336,
    dec_dim: 288,
    depth: 6,
    encoder_depth: 6,
    n_past: 16,
    n_future: 4,
    nheads: 8,
    head_dim: 36,
    vocab_size: 32768
  }),
  spindlier: Object.freeze({
    dim: 620,
    dec_dim: 512,
    depth: 10,
    encoder_depth: 10,
    n_past: 16,
    n_future: 4,
    nheads: 8,
    head_dim: 64,
    vocab_size: 32768
  })
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Wrapper around the ten_vad Emscripten/WASM voice-activity detector.
 *
 * Lifecycle: construct, await init(), call process() per frame, destroy().
 * process() returns the native speech probability, or -1 before init().
 */
class TenVAD {
  /**
   * @param {number} hopSize - samples per analysis frame given to the native VAD
   * @param {number} threshold - decision threshold passed to ten_vad_create
   */
  constructor(hopSize = 160, threshold = 0.5) {
    this.hopSize = hopSize;
    this.threshold = threshold;
    this.module = null;    // Emscripten module instance
    this.vadHandle = null; // native handle value read back from the WASM heap
    this.audioPtr = null;  // heap buffer for int16 input samples
    this.probPtr = null;   // heap slot for the output probability (float32)
    this.flagPtr = null;   // heap slot for the output speech flag (int32)
    this.ready = false;
  }

  /**
   * Loads the Emscripten glue module, creates the native VAD instance, and
   * allocates the reusable heap buffers.
   * @param {string} wasmUrl - URL of the Emscripten JS glue (.wasm sits beside it)
   * @throws {Error} if ten_vad_create reports a non-zero status
   */
  async init(wasmUrl = './ten_vad.js') {
    const wasmBinaryUrl = wasmUrl.replace('.js', '.wasm');

    // Dynamic import of the glue script; its default export is the factory.
    const { default: createTenVadModule } = await import(wasmUrl);

    this.module = await createTenVadModule({
      // Point Emscripten at the sibling .wasm binary.
      locateFile: (path) => (path.endsWith('.wasm') ? wasmBinaryUrl : path)
    });

    // ten_vad_create writes the handle through an out-pointer.
    const handleOutPtr = this.module._malloc(4);
    const rc = this.module._ten_vad_create(handleOutPtr, this.hopSize, this.threshold);

    if (rc !== 0) {
      this.module._free(handleOutPtr);
      throw new Error(`Failed to create TenVAD instance: ${rc}`);
    }

    // Read the 32-bit handle back, then release the temporary out-pointer.
    this.vadHandle = this.module.HEAP32[handleOutPtr / 4];
    this.module._free(handleOutPtr);

    // Persistent buffers reused on every process() call (2 bytes/sample).
    this.audioPtr = this.module._malloc(this.hopSize * 2);
    this.probPtr = this.module._malloc(4);
    this.flagPtr = this.module._malloc(4);

    this.ready = true;
  }

  /**
   * Runs one VAD frame.
   * @param {Float32Array} audioChunkFloat32 - hopSize samples in [-1, 1]
   * @returns {number} speech probability from the native VAD, or -1 if not ready
   */
  process(audioChunkFloat32) {
    if (!this.ready) return -1;

    // Convert float samples to clamped int16; any shortfall stays zero-filled.
    const frame = new Int16Array(this.hopSize);
    const count = Math.min(this.hopSize, audioChunkFloat32.length);
    for (let i = 0; i < count; i++) {
      frame[i] = Math.max(-32768, Math.min(32767, Math.round(audioChunkFloat32[i] * 32767)));
    }

    // Copy into WASM heap (HEAP16 is indexed in 16-bit units) and run the VAD.
    this.module.HEAP16.set(frame, this.audioPtr / 2);
    this.module._ten_vad_process(this.vadHandle, this.audioPtr, this.hopSize, this.probPtr, this.flagPtr);

    return this.module.HEAPF32[this.probPtr / 4];
  }

  /**
   * Frees the heap buffers and drops the module reference. Idempotent.
   * NOTE(review): the native handle is only nulled, never passed to a
   * ten_vad destroy export — confirm whether the native side leaks.
   */
  destroy() {
    if (!this.ready || !this.module) return;

    this.ready = false;

    try {
      for (const field of ['audioPtr', 'probPtr', 'flagPtr']) {
        if (this[field]) {
          this.module._free(this[field]);
          this[field] = null;
        }
      }

      this.vadHandle = null;
    } catch (e) {
      console.warn('TenVAD cleanup error:', e);
    }

    this.module = null;
  }
}
|
|
|
|
|
|
|
|
/**
 * Energy-based fallback VAD used when the WASM TenVAD fails to load.
 *
 * Tracks a rolling RMS-energy history, estimates the noise floor as the
 * 10th percentile of that history, and maps the resulting SNR through a
 * sigmoid to produce a pseudo speech probability in (0, 1).
 */
class SimpleVAD {
  /**
   * @param {number} sampleRate - accepted for API parity with TenVAD (unused)
   * @param {number} frameSize - samples per frame (stored, informational)
   */
  constructor(sampleRate = 16000, frameSize = 160) {
    this.frameSize = frameSize;
    this.energyHistory = []; // rolling window of recent RMS energies
    this.historySize = 50;   // max entries kept in the window
    this.noiseFloor = 0.001; // adaptive floor, refreshed from the history
    this.ready = true;       // no async setup needed
  }

  // No-op: matches TenVAD's async init() interface.
  async init() {}

  /**
   * Scores one frame of audio.
   * @param {Float32Array} audioChunk - samples in [-1, 1]
   * @returns {number} pseudo speech probability in (0, 1)
   */
  process(audioChunk) {
    // RMS energy of the frame.
    let sumSquares = 0;
    for (const sample of audioChunk) {
      sumSquares += sample * sample;
    }
    const rms = Math.sqrt(sumSquares / audioChunk.length);

    // Maintain the bounded energy history window.
    this.energyHistory.push(rms);
    if (this.energyHistory.length > this.historySize) {
      this.energyHistory.shift();
    }

    // Once enough history exists, take the 10th percentile as the noise
    // floor (|| keeps a non-zero fallback if that percentile is 0).
    if (this.energyHistory.length > 10) {
      const ascending = [...this.energyHistory].sort((a, b) => a - b);
      this.noiseFloor = ascending[Math.floor(ascending.length * 0.1)] || 0.001;
    }

    // Sigmoid over SNR, centered at SNR = 3 (epsilon avoids divide-by-zero).
    const snr = rms / (this.noiseFloor + 1e-10);
    return 1 / (1 + Math.exp(-2 * (snr - 3)));
  }

  // No-op: matches TenVAD's destroy() interface.
  destroy() {}
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Pipelined streaming ASR engine.
 *
 * Captures microphone audio, runs a per-frame VAD (TenVAD with SimpleVAD
 * fallback), drives an onset/offset segment state machine, and streams
 * batched audio to an encoder worker whose features are forwarded to a
 * decoder worker that emits transcripts and live captions.
 *
 * Fixes vs. previous revision:
 *  - numeric config defaults use `??` instead of `||`, so an explicit 0
 *    for onsetThreshold/offsetThreshold/emaAlpha is honored;
 *  - stop() nulls `this.vad` after destroying it;
 *  - `this.scriptNode` is initialized in the constructor (stop() reads it).
 */
class PipelinedStreamingASR {
  /**
   * @param {Object} config
   * @param {string} [config.modelName='sleeker'] - key into MODEL_CONFIGS
   * @param {string} [config.onnxUrl='./models'] - base URL for model files
   * @param {string} [config.backend='wasm'] - requested inference backend
   * @param {number} [config.onsetThreshold=0.5] - EMA VAD prob to open a segment
   * @param {number} [config.offsetThreshold=0.3] - EMA VAD prob to close a segment
   * @param {number} [config.emaAlpha=0.3] - smoothing factor for the VAD EMA
   */
  constructor(config) {
    this.modelName = config.modelName || 'sleeker';
    this.onnxUrl = config.onnxUrl || './models';
    this.backendChoice = config.backend || 'wasm';
    // `??` (not `||`): 0 is a meaningful threshold/alpha and must not be
    // silently replaced by the default.
    this.onsetThreshold = config.onsetThreshold ?? 0.5;
    this.offsetThreshold = config.offsetThreshold ?? 0.3;
    this.emaAlpha = config.emaAlpha ?? 0.3;

    this.cfg = MODEL_CONFIGS[this.modelName];

    // Inference workers.
    this.encoderWorker = null;
    this.decoderWorker = null;
    this.encoderReady = false;
    this.decoderReady = false;

    // VAD instance (set in loadModels).
    this.vad = null;

    // Web Audio graph nodes.
    this.audioContext = null;
    this.sourceNode = null;
    this.workletNode = null;
    this.scriptNode = null; // ScriptProcessor fallback node (stop() reads it)

    // Segment state machine.
    this.running = false;
    this.state = 'idle'; // 'idle' | 'speech'
    this.currentSegmentId = 0;
    this.emaProb = 0;       // exponentially smoothed VAD probability
    this.onsetCounter = 0;  // consecutive frames above onset threshold
    this.offsetCounter = 0; // consecutive frames below offset threshold
    this.segmentStartTime = 0;

    // Sample/chunk buffers.
    this.vadBuffer = [];            // raw samples pending VAD framing
    this.asrBuffer = [];            // raw samples pending ASR chunking
    this.preBuffer = [];            // rolling pre-roll chunks kept while idle
    this.postBufferRemaining = 0;   // post-roll chunks left to send after offset
    this.encoderBatchBuffer = [];   // samples accumulating into encoder batches

    // VAD visualization history (one entry per vadUpdateInterval frames).
    this.vadHistory = [];
    this.vadUpdateCounter = 0;
    this.vadUpdateInterval = 5;     // publish every 5 VAD frames (50 ms)
    this.segmentEvents = [];        // recent {type, time} start/end markers
    this.vadHistoryStartTime = 0;

    // UI callbacks (all optional).
    this.onVadUpdate = null;
    this.onTranscript = null;
    this.onLiveCaption = null;
    this.onStatusUpdate = null;
    this.onQueueUpdate = null;
    this.onBackendUpdate = null;

    // Actual backend reported by the decoder worker once ready.
    this.backend = 'unknown';
  }

  /**
   * Loads the VAD and both workers' models, reporting coarse progress via
   * progressCallback(text) and detailed progress via
   * detailedProgressCallback({completedModels, totalModels, ...}).
   */
  async loadModels(progressCallback, detailedProgressCallback) {
    // 1 VAD + encoder/decoder models; total is fixed at 7.
    const totalModels = 7;
    let completedModels = 0;
    let currentModel = '';
    let currentProgress = { loaded: 0, total: 0 };

    const updateProgress = () => {
      const overallPercent = (completedModels / totalModels) * 100;
      detailedProgressCallback?.({
        completedModels,
        totalModels,
        overallPercent,
        currentModel,
        currentProgress
      });
    };

    // VAD: prefer the WASM TenVAD, fall back to the energy-based SimpleVAD.
    try {
      currentModel = 'VAD';
      progressCallback?.('Loading TenVAD...');
      updateProgress();
      this.vad = new TenVAD(VAD_CHUNK_SAMPLES, 0.5);
      await this.vad.init('./ten_vad.js');
      console.log('Using TenVAD');
      completedModels++;
      updateProgress();
    } catch (e) {
      console.warn('TenVAD failed, using SimpleVAD:', e.message);
      this.vad = new SimpleVAD(SAMPLE_RATE, VAD_CHUNK_SAMPLES);
      await this.vad.init();
      completedModels++;
      updateProgress();
    }

    progressCallback?.('Loading encoder models...');
    await this.initEncoderWorker((model, progress) => {
      currentModel = model;
      currentProgress = progress;
      updateProgress();
    }, () => {
      completedModels++;
      updateProgress();
    });

    progressCallback?.('Loading decoder models...');
    await this.initDecoderWorker((model, progress) => {
      currentModel = model;
      currentProgress = progress;
      updateProgress();
    }, () => {
      completedModels++;
      updateProgress();
    });

    progressCallback?.('Ready!');
  }

  /**
   * Spawns the encoder worker and resolves once it reports 'ready'.
   * Forwards its segment lifecycle and feature messages to the decoder
   * worker (features are transferred, not copied).
   */
  initEncoderWorker(onProgress, onModelDone) {
    return new Promise((resolve, reject) => {
      this.encoderWorker = new Worker('./encoder_worker.js');

      this.encoderWorker.onmessage = (e) => {
        const { type } = e.data;

        switch (type) {
          case 'ready':
            this.encoderReady = true;
            resolve();
            break;
          case 'error':
            reject(new Error(e.data.message));
            break;
          case 'status':
            // Informational only; intentionally ignored.
            break;
          case 'progress':
            onProgress?.(e.data.model, { loaded: e.data.loaded, total: e.data.total, cached: e.data.cached });
            break;
          case 'model_done':
            onModelDone?.(e.data.model);
            break;
          case 'segment_start':
            this.decoderWorker?.postMessage({ type: 'segment_start', data: { segmentId: e.data.segmentId } });
            break;
          case 'segment_end':
            this.decoderWorker?.postMessage({ type: 'segment_end', data: { segmentId: e.data.segmentId } });
            break;
          case 'features':
            // Hand encoder output straight to the decoder, transferring the
            // underlying buffer to avoid a copy.
            this.decoderWorker?.postMessage({
              type: 'features',
              data: {
                segmentId: e.data.segmentId,
                features: e.data.features,
                dims: e.data.dims
              }
            }, [e.data.features.buffer]);
            break;
        }
      };

      this.encoderWorker.postMessage({
        type: 'init',
        data: {
          cfg: this.cfg,
          onnxUrl: this.onnxUrl,
          modelName: this.modelName,
          backend: this.backendChoice
        }
      });
    });
  }

  /**
   * Spawns the decoder worker and resolves once it reports 'ready'.
   * Routes transcript/caption messages to the UI callbacks and records the
   * backend the worker actually selected.
   */
  initDecoderWorker(onProgress, onModelDone) {
    return new Promise((resolve, reject) => {
      this.decoderWorker = new Worker('./decoder_worker.js');

      this.decoderWorker.onmessage = (e) => {
        const { type } = e.data;

        switch (type) {
          case 'ready':
            this.decoderReady = true;
            this.backend = e.data.backend || 'wasm';
            this.onBackendUpdate?.(this.backend);
            resolve();
            break;
          case 'error':
            reject(new Error(e.data.message));
            break;
          case 'status':
            break;
          case 'progress':
            onProgress?.(e.data.model, { loaded: e.data.loaded, total: e.data.total, cached: e.data.cached });
            break;
          case 'model_done':
            onModelDone?.(e.data.model);
            break;
          case 'transcript':
            this.onTranscript?.(e.data.text, e.data.segmentId);
            break;
          case 'live_caption':
            this.onLiveCaption?.(e.data.text);
            break;
        }
      };

      this.decoderWorker.postMessage({
        type: 'init',
        data: {
          cfg: this.cfg,
          onnxUrl: this.onnxUrl,
          modelName: this.modelName,
          backend: this.backendChoice
        }
      });
    });
  }

  /**
   * Opens the microphone, wires the capture graph (AudioWorklet with a
   * ScriptProcessor fallback), and resets the segment state machine.
   * @throws propagates getUserMedia rejections (e.g. permission denied)
   */
  async start() {
    if (this.running) return;

    // Raw capture: disable browser DSP so the VAD sees unprocessed audio.
    const stream = await navigator.mediaDevices.getUserMedia({
      audio: {
        sampleRate: SAMPLE_RATE,
        channelCount: 1,
        echoCancellation: false,
        noiseSuppression: false,
        autoGainControl: false
      }
    });

    this.audioContext = new AudioContext({ sampleRate: SAMPLE_RATE });

    // The browser may not honor the requested rate; log the actual one.
    console.log(`Requested sample rate: ${SAMPLE_RATE}, Actual: ${this.audioContext.sampleRate}`);
    this.sourceNode = this.audioContext.createMediaStreamSource(stream);

    try {
      await this.audioContext.audioWorklet.addModule('./audio_processor.js');
      this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor');

      this.workletNode.port.onmessage = (e) => {
        if (this.running) {
          this.processAudioChunk(e.data.audio);
        }
      };

      this.sourceNode.connect(this.workletNode);
      this.workletNode.connect(this.audioContext.destination);
    } catch (e) {
      // Deprecated but widely supported fallback path.
      console.warn('AudioWorklet not available, using ScriptProcessor');
      const bufferSize = 2048;
      this.scriptNode = this.audioContext.createScriptProcessor(bufferSize, 1, 1);

      this.scriptNode.onaudioprocess = (e) => {
        if (this.running) {
          const inputData = e.inputBuffer.getChannelData(0);
          // Copy: getChannelData's buffer is reused by the audio thread.
          this.processAudioChunk(new Float32Array(inputData));
        }
      };

      this.sourceNode.connect(this.scriptNode);
      this.scriptNode.connect(this.audioContext.destination);
    }

    this.running = true;
    this.state = 'idle';
    this.onsetCounter = 0;
    this.offsetCounter = 0;
    this.emaProb = 0;

    this.onStatusUpdate?.('listening', 'Listening...');
  }

  /** Tears down the audio graph, VAD, and both workers. */
  stop() {
    this.running = false;

    if (this.workletNode) {
      this.workletNode.disconnect();
      this.workletNode = null;
    }
    if (this.scriptNode) {
      this.scriptNode.disconnect();
      this.scriptNode = null;
    }
    if (this.sourceNode) {
      this.sourceNode.disconnect();
      this.sourceNode = null;
    }
    if (this.audioContext) {
      this.audioContext.close();
      this.audioContext = null;
    }
    if (this.vad) {
      this.vad.destroy();
      // Drop the reference so a destroyed VAD can't be processed again.
      this.vad = null;
    }
    if (this.encoderWorker) {
      this.encoderWorker.terminate();
      this.encoderWorker = null;
    }
    if (this.decoderWorker) {
      this.decoderWorker.terminate();
      this.decoderWorker = null;
    }

    this.onStatusUpdate?.('idle', 'Stopped');
  }

  /**
   * Ingests one capture buffer: runs the VAD per 10 ms frame (updating the
   * EMA, the visualization history, and the segment state machine), then
   * routes 20 ms ASR chunks to the encoder / pre-roll / post-roll paths.
   * @param {Float32Array} audioData - mono samples at SAMPLE_RATE
   */
  processAudioChunk(audioData) {
    this.vadBuffer.push(...audioData);
    this.asrBuffer.push(...audioData);

    // VAD path: consume complete 10 ms frames.
    while (this.vadBuffer.length >= VAD_CHUNK_SAMPLES) {
      const vadChunk = new Float32Array(this.vadBuffer.splice(0, VAD_CHUNK_SAMPLES));
      const prob = this.vad.process(vadChunk);

      // Negative means the VAD isn't ready; skip state updates for the frame.
      if (prob >= 0) {
        this.emaProb = this.emaAlpha * prob + (1 - this.emaAlpha) * this.emaProb;

        // Publish history to the UI every vadUpdateInterval frames (50 ms).
        this.vadUpdateCounter++;
        if (this.vadUpdateCounter >= this.vadUpdateInterval) {
          this.vadUpdateCounter = 0;
          const now = Date.now();

          if (this.vadHistory.length === 0) {
            this.vadHistoryStartTime = now;
          }

          this.vadHistory.push(this.emaProb);
          if (this.vadHistory.length > 100) {
            this.vadHistory.shift();
            // Window slid forward by one 50 ms entry.
            this.vadHistoryStartTime += 50;
          }

          // Drop segment markers that scrolled out of the visible window.
          const historyDuration = this.vadHistory.length * 50;
          const historyStart = now - historyDuration;
          this.segmentEvents = this.segmentEvents.filter(e => e.time >= historyStart);

          this.onVadUpdate?.(this.emaProb, this.vadHistory, this.segmentEvents, this.vadHistoryStartTime);
        }

        this.updateSegmentState();
      }
    }

    // ASR path: consume complete 20 ms chunks.
    while (this.asrBuffer.length >= ASR_CHUNK_SAMPLES) {
      const chunkData = this.asrBuffer.splice(0, ASR_CHUNK_SAMPLES);
      const chunk = new Float32Array(chunkData);

      if (this.state === 'speech') {
        this.sendAudioToEncoder(chunk);
      } else {
        // Idle: keep a bounded pre-roll so segment starts don't clip speech.
        this.preBuffer.push(chunk);
        if (this.preBuffer.length > PRE_BUFFER_CHUNKS) {
          this.preBuffer.shift();
        }

        // Post-roll: keep feeding the encoder briefly after the offset.
        if (this.postBufferRemaining > 0) {
          this.sendAudioToEncoder(chunk);
          this.postBufferRemaining--;

          if (this.postBufferRemaining === 0) {
            this.finalizeSegmentEnd();
          }
        }
      }
    }
  }

  /**
   * Accumulates samples and posts a batch to the encoder worker once
   * ENCODER_BATCH_SAMPLES are buffered (or immediately when flush is set).
   * The batch buffer is transferred to avoid a copy.
   */
  sendAudioToEncoder(chunk, flush = false) {
    if (!this.encoderWorker || !this.encoderReady) return;

    this.encoderBatchBuffer.push(...chunk);

    if (this.encoderBatchBuffer.length >= ENCODER_BATCH_SAMPLES || flush) {
      if (this.encoderBatchBuffer.length > 0) {
        const batch = new Float32Array(this.encoderBatchBuffer);
        this.encoderBatchBuffer = [];

        this.encoderWorker.postMessage({
          type: 'audio',
          data: {
            audio: batch,
            segmentId: this.currentSegmentId
          }
        }, [batch.buffer]);
      }
    }
  }

  /**
   * Per-VAD-frame state machine step: debounced onset while idle, and a
   * duration-ramped, debounced offset while in speech.
   */
  updateSegmentState() {
    if (this.state === 'idle') {
      // Require 2 consecutive frames above threshold before starting.
      if (this.emaProb >= this.onsetThreshold) {
        this.onsetCounter++;
        if (this.onsetCounter >= 2) {
          this.startSegment();
        }
      } else {
        this.onsetCounter = 0;
      }
    } else if (this.state === 'speech') {
      const segmentDuration = Date.now() - this.segmentStartTime;

      // After OFFSET_RAMP_START_MS, linearly raise the offset threshold
      // toward 1.0 (reaching it at OFFSET_RAMP_END_MS) so very long
      // segments become progressively easier to end.
      let effectiveOffsetThreshold = this.offsetThreshold;
      if (segmentDuration >= OFFSET_RAMP_START_MS) {
        const rampProgress = Math.min(1.0,
          (segmentDuration - OFFSET_RAMP_START_MS) / (OFFSET_RAMP_END_MS - OFFSET_RAMP_START_MS)
        );
        effectiveOffsetThreshold = this.offsetThreshold + rampProgress * (1.0 - this.offsetThreshold);
      }

      const minDurationMet = segmentDuration >= MIN_SEGMENT_DURATION_MS;

      if (this.emaProb < effectiveOffsetThreshold) {
        this.offsetCounter++;

        if (minDurationMet && this.offsetCounter >= OFFSET_CHUNKS_REQUIRED) {
          this.endSegment();
        }
      } else {
        this.offsetCounter = 0;
      }
    }
  }

  /**
   * Transitions idle -> speech: finalizes any in-flight post-roll, notifies
   * the encoder worker, and flushes the pre-roll into the new segment.
   */
  startSegment() {
    // Speech resumed during a previous segment's post-roll: close that
    // segment out before starting the new one.
    if (this.postBufferRemaining > 0) {
      this.finalizeSegmentEnd();
      this.postBufferRemaining = 0;
    }

    this.currentSegmentId++;
    this.state = 'speech';
    this.onsetCounter = 0;
    this.offsetCounter = 0;
    this.segmentStartTime = Date.now();
    this.encoderBatchBuffer = [];

    // Marker for the VAD visualization.
    this.segmentEvents.push({ type: 'start', time: this.segmentStartTime });

    this.encoderWorker?.postMessage({
      type: 'segment_start',
      data: { segmentId: this.currentSegmentId }
    });

    // Feed the buffered pre-roll so the segment includes leading audio.
    while (this.preBuffer.length > 0) {
      const chunk = this.preBuffer.shift();
      this.sendAudioToEncoder(chunk);
    }

    this.onStatusUpdate?.('recording', 'Recording...');
  }

  /**
   * Transitions speech -> idle and arms the post-roll; the segment is
   * actually finalized once the post-roll chunks have been sent.
   */
  endSegment() {
    this.state = 'idle';
    this.offsetCounter = 0;
    this.postBufferRemaining = POST_BUFFER_CHUNKS;

    // Marker for the VAD visualization.
    this.segmentEvents.push({ type: 'end', time: Date.now() });

    // Supports a POST_BUFFER_CHUNKS of 0 (finalize immediately).
    if (this.postBufferRemaining === 0) {
      this.finalizeSegmentEnd();
    }

    this.onStatusUpdate?.('listening', 'Listening...');
  }

  /**
   * Drains and zero-pads the remaining ASR samples, force-flushes the
   * encoder batch, and tells the encoder worker the segment is over.
   */
  finalizeSegmentEnd() {
    // Send any complete chunks still queued.
    while (this.asrBuffer.length >= ASR_CHUNK_SAMPLES) {
      const chunkData = this.asrBuffer.splice(0, ASR_CHUNK_SAMPLES);
      const chunk = new Float32Array(chunkData);
      this.sendAudioToEncoder(chunk);
    }

    // Zero-pad the final partial chunk to a full chunk.
    if (this.asrBuffer.length > 0) {
      const padded = new Float32Array(ASR_CHUNK_SAMPLES);
      padded.set(this.asrBuffer);
      this.sendAudioToEncoder(padded);
    }
    this.asrBuffer = [];

    // Force out whatever is left in the batch buffer.
    this.sendAudioToEncoder(new Float32Array(0), true);

    this.encoderWorker?.postMessage({
      type: 'segment_end',
      data: { segmentId: this.currentSegmentId }
    });
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * DOM glue for the ASR demo: wires buttons and config inputs to a
 * PipelinedStreamingASR instance, renders the VAD probability graph on a
 * canvas, and displays transcripts, live captions, and loading progress.
 */
class ASRDemoUI {
  constructor() {
    this.asr = null;       // active PipelinedStreamingASR, or null when stopped
    this.vadCanvas = null; // VAD graph canvas element
    this.vadCtx = null;    // its 2D rendering context

    this.initElements();
    this.initCanvas();
    this.bindEvents();
  }

  // Caches every DOM element the UI touches, looked up once by id.
  initElements() {
    this.loadingOverlay = document.getElementById('loadingOverlay');
    this.loadingText = document.getElementById('loadingText');
    this.loadingProgressFill = document.getElementById('loadingProgressFill');
    this.loadingProgressText = document.getElementById('loadingProgressText');
    this.loadingDetails = document.getElementById('loadingDetails');
    this.errorMessage = document.getElementById('errorMessage');
    this.statusDot = document.getElementById('statusDot');
    this.statusText = document.getElementById('statusText');
    this.startBtn = document.getElementById('startBtn');
    this.stopBtn = document.getElementById('stopBtn');
    this.vadBarFill = document.getElementById('vadBarFill');
    this.vadValue = document.getElementById('vadValue');
    this.audioQueueSize = document.getElementById('audioQueueSize');
    this.featuresQueueSize = document.getElementById('featuresQueueSize');
    this.droppedChunksEl = document.getElementById('droppedChunks');
    this.transcriptsList = document.getElementById('transcriptsList');
    this.liveCaption = document.getElementById('liveCaption');
    this.liveCaptionText = document.getElementById('liveCaptionText');
    this.liveCaptionMobile = document.getElementById('liveCaptionMobile');
    this.liveCaptionTextMobile = document.getElementById('liveCaptionTextMobile');
    this.backendBadge = document.getElementById('backendBadge');
    this.modelSelect = document.getElementById('modelSelect');
    this.backendSelect = document.getElementById('backendSelect');
    this.onnxUrl = document.getElementById('onnxUrl');
    this.onsetThreshold = document.getElementById('onsetThreshold');
    this.offsetThreshold = document.getElementById('offsetThreshold');
  }

  // Sizes the VAD canvas backing store for the device pixel ratio.
  // Safe to call repeatedly: assigning width/height resets the context,
  // so the scale() below is applied to a fresh transform each time.
  initCanvas() {
    this.vadCanvas = document.getElementById('vadCanvas');
    this.vadCtx = this.vadCanvas.getContext('2d');

    const rect = this.vadCanvas.getBoundingClientRect();
    this.vadCanvas.width = rect.width * window.devicePixelRatio;
    this.vadCanvas.height = rect.height * window.devicePixelRatio;
    this.vadCtx.scale(window.devicePixelRatio, window.devicePixelRatio);
  }

  // Hooks up start/stop buttons and the collapsible config/VAD sections.
  bindEvents() {
    this.startBtn.addEventListener('click', () => this.handleStart());
    this.stopBtn.addEventListener('click', () => this.handleStop());

    const configSection = document.querySelector('.config-section');
    const vadSection = document.querySelector('.vad-section');

    // Clicking a section header toggles its collapsed state.
    configSection?.querySelector('h3')?.addEventListener('click', () => {
      configSection.classList.toggle('collapsed');
    });

    vadSection?.querySelector('h3')?.addEventListener('click', () => {
      vadSection.classList.toggle('collapsed');

      // Re-measure the canvas when the section becomes visible again,
      // since a collapsed section reports a zero-sized bounding rect.
      if (!vadSection.classList.contains('collapsed')) {
        this.initCanvas();
      }
    });

    // Start collapsed on narrow (mobile-width) viewports.
    if (window.innerWidth <= 768) {
      configSection?.classList.add('collapsed');
    }
  }

  // Builds the config from the form, constructs the ASR engine, wires its
  // callbacks, loads models (with progress UI), and starts capture.
  async handleStart() {
    try {
      this.showLoading('Initializing...');

      const config = {
        modelName: this.modelSelect.value,
        onnxUrl: this.onnxUrl.value || './models',
        backend: this.backendSelect.value,
        onsetThreshold: parseFloat(this.onsetThreshold.value),
        offsetThreshold: parseFloat(this.offsetThreshold.value)
      };

      this.asr = new PipelinedStreamingASR(config);

      this.asr.onVadUpdate = (prob, history, segmentEvents, historyStartTime) => this.updateVadDisplay(prob, history, segmentEvents, historyStartTime);
      this.asr.onTranscript = (text, segmentId) => this.addTranscript(text, segmentId);
      this.asr.onLiveCaption = (text) => this.updateLiveCaption(text);
      this.asr.onStatusUpdate = (status, text) => this.updateStatus(status, text);
      this.asr.onBackendUpdate = (backend) => this.updateBackendBadge(backend);

      await this.asr.loadModels(
        (text) => {
          this.loadingText.textContent = text;
        },
        (progress) => {
          this.updateLoadingProgress(progress);
        }
      );

      await this.asr.start();

      this.hideLoading();
      this.startBtn.disabled = true;
      this.stopBtn.disabled = false;
      this.disableConfig(true);

    } catch (error) {
      console.error('Start error:', error);
      this.hideLoading();
      this.showError(`Failed to start: ${error.message}`);
    }
  }

  // Stops and discards the engine, then restores the idle UI state.
  handleStop() {
    if (this.asr) {
      this.asr.stop();
      this.asr = null;
    }

    this.startBtn.disabled = false;
    this.stopBtn.disabled = true;
    this.disableConfig(false);
    this.updateStatus('idle', 'Ready');
    this.backendBadge.classList.remove('visible');
  }

  /**
   * Redraws the VAD bar and the probability history graph.
   * @param {number} prob - current smoothed VAD probability in [0, 1]
   * @param {number[]} history - recent probabilities, one per 50 ms
   * @param {Array} segmentEvents - recent {type, time} start/end markers
   * @param {number} historyStartTime - wall-clock time of history[0] (unused here)
   */
  updateVadDisplay(prob, history, segmentEvents = [], historyStartTime = 0) {
    this.vadBarFill.style.width = `${prob * 100}%`;
    this.vadValue.textContent = `${Math.round(prob * 100)}%`;

    const ctx = this.vadCtx;
    const rect = this.vadCanvas.getBoundingClientRect();
    const width = rect.width;
    const height = rect.height;

    // Bottom 20px is reserved for the time-axis labels.
    const graphHeight = height - 20;
    const graphTop = 0;

    // Clear to the background color.
    ctx.fillStyle = '#0f0f23';
    ctx.fillRect(0, 0, width, height);

    if (history.length < 2) return;

    // Each history entry covers 50 ms.
    const historyDuration = history.length * 50;
    const now = Date.now();

    // Time grid: tick marks every 100 ms, labels/brighter lines every 500 ms.
    ctx.strokeStyle = '#333';
    ctx.fillStyle = '#666';
    ctx.font = '10px monospace';
    ctx.textAlign = 'center';
    ctx.lineWidth = 1;

    for (let t = 0; t <= historyDuration; t += 100) {
      const x = (t / historyDuration) * width;

      // Short tick below the graph area.
      ctx.beginPath();
      ctx.moveTo(x, graphHeight);
      ctx.lineTo(x, graphHeight + 5);
      ctx.stroke();

      // Full-height grid line; 500 ms lines are brighter.
      if (t % 500 === 0) {
        ctx.strokeStyle = '#444';
      } else {
        ctx.strokeStyle = '#222';
      }
      ctx.beginPath();
      ctx.moveTo(x, graphTop);
      ctx.lineTo(x, graphHeight);
      ctx.stroke();
      ctx.strokeStyle = '#333';

      // Label every 500 ms.
      if (t % 500 === 0) {
        const seconds = (t / 1000).toFixed(1);
        ctx.fillText(seconds + 's', x, height - 2);
      }
    }

    // Segment markers: green for starts, red for ends, positioned by age.
    for (const event of segmentEvents) {
      const eventAge = now - event.time;
      const eventPos = historyDuration - eventAge;
      if (eventPos < 0 || eventPos > historyDuration) continue;

      const x = (eventPos / historyDuration) * width;

      ctx.lineWidth = 2;
      if (event.type === 'start') {
        ctx.strokeStyle = '#00ff88';
      } else {
        ctx.strokeStyle = '#ff4444';
      }
      ctx.beginPath();
      ctx.moveTo(x, graphTop);
      ctx.lineTo(x, graphHeight);
      ctx.stroke();
    }

    // Horizontal threshold guides from the current form values.
    const onsetY = graphHeight * (1 - parseFloat(this.onsetThreshold.value));
    const offsetY = graphHeight * (1 - parseFloat(this.offsetThreshold.value));

    ctx.strokeStyle = '#ff444466';
    ctx.lineWidth = 1;
    ctx.beginPath();
    ctx.moveTo(0, onsetY);
    ctx.lineTo(width, onsetY);
    ctx.stroke();

    ctx.strokeStyle = '#00ff8866';
    ctx.beginPath();
    ctx.moveTo(0, offsetY);
    ctx.lineTo(width, offsetY);
    ctx.stroke();

    // Probability trace (1.0 at the top, 0.0 at the graph bottom).
    ctx.strokeStyle = '#00d4ff';
    ctx.lineWidth = 2;
    ctx.beginPath();

    for (let i = 0; i < history.length; i++) {
      const x = (i / (history.length - 1)) * width;
      const y = graphHeight * (1 - history[i]);
      if (i === 0) {
        ctx.moveTo(x, y);
      } else {
        ctx.lineTo(x, y);
      }
    }
    ctx.stroke();
  }

  // Appends a finished transcript row (HTML-escaped) and scrolls to it.
  addTranscript(text, segmentId) {
    if (!text || !text.trim()) return;

    const item = document.createElement('div');
    item.className = 'transcript-item';
    item.innerHTML = `
      <span class="transcript-duration">#${segmentId}</span>
      <span class="transcript-text">${this.escapeHtml(text)}</span>
    `;
    this.transcriptsList.appendChild(item);
    this.transcriptsList.scrollTop = this.transcriptsList.scrollHeight;
  }

  // Shows the in-progress caption on both desktop and mobile elements;
  // an empty/falsy text restores the placeholder state.
  updateLiveCaption(text) {
    if (text) {
      this.liveCaptionText.textContent = text;
      this.liveCaptionText.classList.remove('placeholder');
      this.liveCaption.classList.add('active');

      this.liveCaptionTextMobile.textContent = text;
      this.liveCaptionTextMobile.classList.remove('placeholder');
      this.liveCaptionMobile.classList.add('active');
    } else {
      this.liveCaptionText.textContent = 'Waiting for speech...';
      this.liveCaptionText.classList.add('placeholder');
      this.liveCaption.classList.remove('active');

      this.liveCaptionTextMobile.textContent = 'Waiting for speech...';
      this.liveCaptionTextMobile.classList.add('placeholder');
      this.liveCaptionMobile.classList.remove('active');
    }
  }

  // Updates the status dot (CSS class) and status label.
  updateStatus(status, text) {
    this.statusDot.className = 'status-dot ' + status;
    this.statusText.textContent = text;
  }

  // Shows which inference backend the decoder worker selected.
  updateBackendBadge(backend) {
    const labels = { 'wasm': 'WASM', 'webgl': 'WebGL', 'webgpu': 'WebGPU' };
    this.backendBadge.textContent = labels[backend] || backend;
    this.backendBadge.className = 'backend-badge visible ' + backend;
  }

  // Resets and reveals the loading overlay.
  showLoading(text) {
    this.loadingText.textContent = text;
    this.loadingProgressFill.style.width = '0%';
    this.loadingProgressText.textContent = '0 / 7 models';
    this.loadingDetails.textContent = '';
    this.loadingOverlay.classList.remove('hidden');
  }

  hideLoading() {
    this.loadingOverlay.classList.add('hidden');
  }

  // Renders overall model-count progress plus per-model byte progress.
  updateLoadingProgress(progress) {
    const { completedModels, totalModels, currentModel, currentProgress } = progress;

    const overallPercent = (completedModels / totalModels) * 100;
    this.loadingProgressFill.style.width = `${overallPercent}%`;
    this.loadingProgressText.textContent = `${completedModels} / ${totalModels} models`;

    if (currentModel && currentProgress.total > 0) {
      if (currentProgress.cached) {
        const sizeMB = (currentProgress.total / (1024 * 1024)).toFixed(1);
        this.loadingDetails.textContent = `${currentModel}: ${sizeMB} MB (cached)`;
      } else {
        const loadedMB = (currentProgress.loaded / (1024 * 1024)).toFixed(1);
        const totalMB = (currentProgress.total / (1024 * 1024)).toFixed(1);
        const percent = Math.round((currentProgress.loaded / currentProgress.total) * 100);
        this.loadingDetails.textContent = `${currentModel}: ${loadedMB} / ${totalMB} MB (${percent}%)`;
      }
    } else if (currentModel) {
      this.loadingDetails.textContent = `Loading ${currentModel}...`;
    }
  }

  showError(message) {
    this.errorMessage.textContent = message;
    this.errorMessage.classList.add('visible');
  }

  // Locks/unlocks the config form while the engine is running.
  disableConfig(disabled) {
    this.modelSelect.disabled = disabled;
    this.backendSelect.disabled = disabled;
    this.onnxUrl.disabled = disabled;
    this.onsetThreshold.disabled = disabled;
    this.offsetThreshold.disabled = disabled;
  }

  // Escapes text for safe interpolation into innerHTML.
  escapeHtml(text) {
    const div = document.createElement('div');
    div.textContent = text;
    return div.innerHTML;
  }
}
|
|
|
|
|
|
|
|
// Boot the demo once the DOM is ready; the instance is exposed on window
// for debugging from the console.
const bootAsrDemo = () => {
  window.asrDemo = new ASRDemoUI();
};
document.addEventListener('DOMContentLoaded', bootAsrDemo);
|
|
|