Manjunath Kudlur committed on
Commit
23932e3
·
1 Parent(s): bb3b9a0

Timeline prettify

Browse files
Files changed (4) hide show
  1. decoder_worker.js +79 -4
  2. encoder_worker.js +22 -1
  3. index.html +2 -2
  4. streaming_asr.js +121 -9
decoder_worker.js CHANGED
@@ -170,6 +170,12 @@ let tokenizer = null;
170
  let accumulatedFeatures = null;
171
  let currentSegmentId = null;
172
 
 
 
 
 
 
 
173
  class MoonshineTokenizer {
174
  constructor() {
175
  this.decoder = null;
@@ -320,7 +326,11 @@ async function decodeAccumulated() {
320
  }
321
  }
322
 
323
- self.onmessage = async function(e) {
 
 
 
 
324
  const { type, data } = e.data;
325
 
326
  switch (type) {
@@ -376,6 +386,9 @@ self.onmessage = async function(e) {
376
  case 'segment_start': {
377
  accumulatedFeatures = null;
378
  currentSegmentId = data.segmentId;
 
 
 
379
  self.postMessage({ type: 'live_caption', text: '' });
380
  break;
381
  }
@@ -383,7 +396,15 @@ self.onmessage = async function(e) {
383
  case 'segment_end': {
384
  if (data.segmentId !== currentSegmentId) break;
385
 
 
 
 
 
 
 
386
  const text = await decodeAccumulated();
 
 
387
  self.postMessage({
388
  type: 'transcript',
389
  segmentId: data.segmentId,
@@ -435,10 +456,64 @@ self.onmessage = async function(e) {
435
  }
436
  }
437
 
438
- // Live caption
439
- const partialText = await decodeAccumulated();
440
- self.postMessage({ type: 'live_caption', text: partialText });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
441
  break;
442
  }
443
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
  };
 
170
  let accumulatedFeatures = null;
171
  let currentSegmentId = null;
172
 
173
+ // Live caption throttling to prevent pipeline backup
174
+ let isDecoding = false;
175
+ let lastDecodeTime = 0;
176
+ let pendingDecode = false;
177
+ const MIN_DECODE_INTERVAL_MS = 500; // Don't decode more often than every 500ms for live captions
178
+
179
  class MoonshineTokenizer {
180
  constructor() {
181
  this.decoder = null;
 
326
  }
327
  }
328
 
329
+ // Message queue for sequential processing
330
+ const messageQueue = [];
331
+ let isProcessingQueue = false;
332
+
333
+ async function processMessage(e) {
334
  const { type, data } = e.data;
335
 
336
  switch (type) {
 
386
  case 'segment_start': {
387
  accumulatedFeatures = null;
388
  currentSegmentId = data.segmentId;
389
+ isDecoding = false;
390
+ lastDecodeTime = 0;
391
+ pendingDecode = false;
392
  self.postMessage({ type: 'live_caption', text: '' });
393
  break;
394
  }
 
396
  case 'segment_end': {
397
  if (data.segmentId !== currentSegmentId) break;
398
 
399
+ // Wait for any in-progress decode to finish before final decode
400
+ while (isDecoding) {
401
+ await new Promise(resolve => setTimeout(resolve, 50));
402
+ }
403
+
404
+ isDecoding = true;
405
  const text = await decodeAccumulated();
406
+ isDecoding = false;
407
+
408
  self.postMessage({
409
  type: 'transcript',
410
  segmentId: data.segmentId,
 
456
  }
457
  }
458
 
459
+ // Live caption with throttling to prevent pipeline backup
460
+ const now = Date.now();
461
+ const timeSinceLastDecode = now - lastDecodeTime;
462
+
463
+ if (isDecoding) {
464
+ // Already decoding, mark that we need another decode when done
465
+ pendingDecode = true;
466
+ } else if (timeSinceLastDecode >= MIN_DECODE_INTERVAL_MS) {
467
+ // Enough time has passed, decode now
468
+ isDecoding = true;
469
+ lastDecodeTime = now;
470
+
471
+ try {
472
+ const partialText = await decodeAccumulated();
473
+ self.postMessage({ type: 'live_caption', text: partialText });
474
+ } finally {
475
+ isDecoding = false;
476
+
477
+ // If there was a pending decode request, schedule it
478
+ if (pendingDecode) {
479
+ pendingDecode = false;
480
+ // Use setTimeout to avoid blocking - decode will happen on next message or timeout
481
+ setTimeout(async () => {
482
+ if (!isDecoding && currentSegmentId !== null) {
483
+ isDecoding = true;
484
+ lastDecodeTime = Date.now();
485
+ try {
486
+ const text = await decodeAccumulated();
487
+ self.postMessage({ type: 'live_caption', text: text });
488
+ } finally {
489
+ isDecoding = false;
490
+ }
491
+ }
492
+ }, MIN_DECODE_INTERVAL_MS);
493
+ }
494
+ }
495
+ } else {
496
+ // Too soon since last decode, mark pending
497
+ pendingDecode = true;
498
+ }
499
  break;
500
  }
501
  }
502
+ }
503
+
504
+ async function processQueue() {
505
+ if (isProcessingQueue) return;
506
+ isProcessingQueue = true;
507
+
508
+ while (messageQueue.length > 0) {
509
+ const msg = messageQueue.shift();
510
+ await processMessage(msg);
511
+ }
512
+
513
+ isProcessingQueue = false;
514
+ }
515
+
516
+ self.onmessage = function(e) {
517
+ messageQueue.push(e);
518
+ processQueue();
519
  };
encoder_worker.js CHANGED
@@ -228,7 +228,11 @@ async function processEncoder(melData, melDims, flush = true) {
228
  return { data: resultData, dims: [1, newOutputCount, encDim] };
229
  }
230
 
231
- self.onmessage = async function(e) {
 
 
 
 
232
  const { type, data } = e.data;
233
 
234
  switch (type) {
@@ -320,4 +324,21 @@ self.onmessage = async function(e) {
320
  break;
321
  }
322
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
  };
 
228
  return { data: resultData, dims: [1, newOutputCount, encDim] };
229
  }
230
 
231
+ // Message queue for sequential processing
232
+ const messageQueue = [];
233
+ let isProcessing = false;
234
+
235
+ async function processMessage(e) {
236
  const { type, data } = e.data;
237
 
238
  switch (type) {
 
324
  break;
325
  }
326
  }
327
+ }
328
+
329
+ async function processQueue() {
330
+ if (isProcessing) return;
331
+ isProcessing = true;
332
+
333
+ while (messageQueue.length > 0) {
334
+ const msg = messageQueue.shift();
335
+ await processMessage(msg);
336
+ }
337
+
338
+ isProcessing = false;
339
+ }
340
+
341
+ self.onmessage = function(e) {
342
+ messageQueue.push(e);
343
+ processQueue();
344
  };
index.html CHANGED
@@ -115,7 +115,7 @@
115
  background: #0f0f23;
116
  border-radius: 5px;
117
  padding: 10px;
118
- height: 120px;
119
  position: relative;
120
  overflow: hidden;
121
  }
@@ -531,7 +531,7 @@
531
  }
532
 
533
  .vad-graph {
534
- height: 80px;
535
  }
536
 
537
  .pipeline-status {
 
115
  background: #0f0f23;
116
  border-radius: 5px;
117
  padding: 10px;
118
+ height: 140px;
119
  position: relative;
120
  overflow: hidden;
121
  }
 
531
  }
532
 
533
  .vad-graph {
534
+ height: 100px;
535
  }
536
 
537
  .pipeline-status {
streaming_asr.js CHANGED
@@ -19,6 +19,8 @@ const ENCODER_BATCH_SAMPLES = 5120; // 320ms - batch size for encoder
19
  const PRE_BUFFER_CHUNKS = 25; // ~500ms at 20ms chunks - capture more audio before onset
20
  const POST_BUFFER_CHUNKS = 5; // ~100ms at 20ms chunks
21
  const MIN_SEGMENT_DURATION_MS = 2000; // Minimum 2 seconds before allowing segment end
 
 
22
  const OFFSET_CHUNKS_REQUIRED = 10; // ~100ms of silence needed to end segment
23
 
24
  const MODEL_CONFIGS = {
@@ -225,6 +227,8 @@ class PipelinedStreamingASR {
225
  this.vadHistory = [];
226
  this.vadUpdateCounter = 0;
227
  this.vadUpdateInterval = 5; // Update display every 5 VAD chunks (50ms)
 
 
228
 
229
  // Callbacks
230
  this.onVadUpdate = null;
@@ -504,9 +508,26 @@ class PipelinedStreamingASR {
504
  this.vadUpdateCounter++;
505
  if (this.vadUpdateCounter >= this.vadUpdateInterval) {
506
  this.vadUpdateCounter = 0;
 
 
 
 
 
 
 
507
  this.vadHistory.push(this.emaProb);
508
- if (this.vadHistory.length > 100) this.vadHistory.shift();
509
- this.onVadUpdate?.(this.emaProb, this.vadHistory);
 
 
 
 
 
 
 
 
 
 
510
  }
511
 
512
  this.updateSegmentState();
@@ -572,11 +593,25 @@ class PipelinedStreamingASR {
572
  this.onsetCounter = 0;
573
  }
574
  } else if (this.state === 'speech') {
575
- // Check if minimum segment duration has passed
576
  const segmentDuration = Date.now() - this.segmentStartTime;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
577
  const minDurationMet = segmentDuration >= MIN_SEGMENT_DURATION_MS;
578
 
579
- if (this.emaProb < this.offsetThreshold) {
580
  this.offsetCounter++;
581
  // Only end segment if minimum duration met AND enough silence chunks
582
  if (minDurationMet && this.offsetCounter >= OFFSET_CHUNKS_REQUIRED) {
@@ -589,6 +624,12 @@ class PipelinedStreamingASR {
589
  }
590
 
591
  startSegment() {
 
 
 
 
 
 
592
  this.currentSegmentId++;
593
  this.state = 'speech';
594
  this.onsetCounter = 0;
@@ -596,6 +637,9 @@ class PipelinedStreamingASR {
596
  this.segmentStartTime = Date.now();
597
  this.encoderBatchBuffer = []; // Reset batch buffer for new segment
598
 
 
 
 
599
  // Tell encoder to start new segment
600
  this.encoderWorker?.postMessage({
601
  type: 'segment_start',
@@ -616,6 +660,9 @@ class PipelinedStreamingASR {
616
  this.offsetCounter = 0;
617
  this.postBufferRemaining = POST_BUFFER_CHUNKS;
618
 
 
 
 
619
  if (this.postBufferRemaining === 0) {
620
  this.finalizeSegmentEnd();
621
  }
@@ -741,7 +788,7 @@ class ASRDemoUI {
741
 
742
  this.asr = new PipelinedStreamingASR(config);
743
 
744
- this.asr.onVadUpdate = (prob, history) => this.updateVadDisplay(prob, history);
745
  this.asr.onTranscript = (text, segmentId) => this.addTranscript(text, segmentId);
746
  this.asr.onLiveCaption = (text) => this.updateLiveCaption(text);
747
  this.asr.onStatusUpdate = (status, text) => this.updateStatus(status, text);
@@ -781,7 +828,7 @@ class ASRDemoUI {
781
  this.updateStatus('idle', 'Ready');
782
  }
783
 
784
- updateVadDisplay(prob, history) {
785
  this.vadBarFill.style.width = `${prob * 100}%`;
786
  this.vadValue.textContent = `${Math.round(prob * 100)}%`;
787
 
@@ -790,15 +837,79 @@ class ASRDemoUI {
790
  const width = rect.width;
791
  const height = rect.height;
792
 
 
 
 
 
793
  ctx.fillStyle = '#0f0f23';
794
  ctx.fillRect(0, 0, width, height);
795
 
796
  if (history.length < 2) return;
797
 
798
- const onsetY = height * (1 - parseFloat(this.onsetThreshold.value));
799
- const offsetY = height * (1 - parseFloat(this.offsetThreshold.value));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
800
 
801
  ctx.strokeStyle = '#ff444466';
 
802
  ctx.beginPath();
803
  ctx.moveTo(0, onsetY);
804
  ctx.lineTo(width, onsetY);
@@ -810,13 +921,14 @@ class ASRDemoUI {
810
  ctx.lineTo(width, offsetY);
811
  ctx.stroke();
812
 
 
813
  ctx.strokeStyle = '#00d4ff';
814
  ctx.lineWidth = 2;
815
  ctx.beginPath();
816
 
817
  for (let i = 0; i < history.length; i++) {
818
  const x = (i / (history.length - 1)) * width;
819
- const y = height * (1 - history[i]);
820
  if (i === 0) {
821
  ctx.moveTo(x, y);
822
  } else {
 
19
  const PRE_BUFFER_CHUNKS = 25; // ~500ms at 20ms chunks - capture more audio before onset
20
  const POST_BUFFER_CHUNKS = 5; // ~100ms at 20ms chunks
21
  const MIN_SEGMENT_DURATION_MS = 2000; // Minimum 2 seconds before allowing segment end
22
+ const OFFSET_RAMP_START_MS = 6000; // Start ramping offset threshold at 6 seconds
23
+ const OFFSET_RAMP_END_MS = 8000; // Reach max offset threshold (1.0) at 8 seconds
24
  const OFFSET_CHUNKS_REQUIRED = 10; // ~100ms of silence needed to end segment
25
 
26
  const MODEL_CONFIGS = {
 
227
  this.vadHistory = [];
228
  this.vadUpdateCounter = 0;
229
  this.vadUpdateInterval = 5; // Update display every 5 VAD chunks (50ms)
230
+ this.segmentEvents = []; // Track segment start/end for visualization
231
+ this.vadHistoryStartTime = 0; // When the current history window started
232
 
233
  // Callbacks
234
  this.onVadUpdate = null;
 
508
  this.vadUpdateCounter++;
509
  if (this.vadUpdateCounter >= this.vadUpdateInterval) {
510
  this.vadUpdateCounter = 0;
511
+ const now = Date.now();
512
+
513
+ // Initialize history start time
514
+ if (this.vadHistory.length === 0) {
515
+ this.vadHistoryStartTime = now;
516
+ }
517
+
518
  this.vadHistory.push(this.emaProb);
519
+ if (this.vadHistory.length > 100) {
520
+ this.vadHistory.shift();
521
+ // Shift the start time by 50ms (one history entry)
522
+ this.vadHistoryStartTime += 50;
523
+ }
524
+
525
+ // Remove old segment events that are outside the history window
526
+ const historyDuration = this.vadHistory.length * 50; // ms
527
+ const historyStart = now - historyDuration;
528
+ this.segmentEvents = this.segmentEvents.filter(e => e.time >= historyStart);
529
+
530
+ this.onVadUpdate?.(this.emaProb, this.vadHistory, this.segmentEvents, this.vadHistoryStartTime);
531
  }
532
 
533
  this.updateSegmentState();
 
593
  this.onsetCounter = 0;
594
  }
595
  } else if (this.state === 'speech') {
 
596
  const segmentDuration = Date.now() - this.segmentStartTime;
597
+
598
+ // Calculate effective offset threshold with gradual ramp
599
+ // Before OFFSET_RAMP_START_MS: use normal offsetThreshold
600
+ // Between OFFSET_RAMP_START_MS and OFFSET_RAMP_END_MS: linearly ramp to 1.0
601
+ // After OFFSET_RAMP_END_MS: use 1.0 (any VAD level will trigger offset)
602
+ let effectiveOffsetThreshold = this.offsetThreshold;
603
+ if (segmentDuration >= OFFSET_RAMP_START_MS) {
604
+ const rampProgress = Math.min(1.0,
605
+ (segmentDuration - OFFSET_RAMP_START_MS) / (OFFSET_RAMP_END_MS - OFFSET_RAMP_START_MS)
606
+ );
607
+ // Lerp from offsetThreshold to 1.0
608
+ effectiveOffsetThreshold = this.offsetThreshold + rampProgress * (1.0 - this.offsetThreshold);
609
+ }
610
+
611
+ // Check if minimum segment duration has passed
612
  const minDurationMet = segmentDuration >= MIN_SEGMENT_DURATION_MS;
613
 
614
+ if (this.emaProb < effectiveOffsetThreshold) {
615
  this.offsetCounter++;
616
  // Only end segment if minimum duration met AND enough silence chunks
617
  if (minDurationMet && this.offsetCounter >= OFFSET_CHUNKS_REQUIRED) {
 
624
  }
625
 
626
  startSegment() {
627
+ // If previous segment wasn't fully finalized, finalize it now
628
+ if (this.postBufferRemaining > 0) {
629
+ this.finalizeSegmentEnd();
630
+ this.postBufferRemaining = 0;
631
+ }
632
+
633
  this.currentSegmentId++;
634
  this.state = 'speech';
635
  this.onsetCounter = 0;
 
637
  this.segmentStartTime = Date.now();
638
  this.encoderBatchBuffer = []; // Reset batch buffer for new segment
639
 
640
+ // Record segment start for visualization
641
+ this.segmentEvents.push({ type: 'start', time: this.segmentStartTime });
642
+
643
  // Tell encoder to start new segment
644
  this.encoderWorker?.postMessage({
645
  type: 'segment_start',
 
660
  this.offsetCounter = 0;
661
  this.postBufferRemaining = POST_BUFFER_CHUNKS;
662
 
663
+ // Record segment end for visualization
664
+ this.segmentEvents.push({ type: 'end', time: Date.now() });
665
+
666
  if (this.postBufferRemaining === 0) {
667
  this.finalizeSegmentEnd();
668
  }
 
788
 
789
  this.asr = new PipelinedStreamingASR(config);
790
 
791
+ this.asr.onVadUpdate = (prob, history, segmentEvents, historyStartTime) => this.updateVadDisplay(prob, history, segmentEvents, historyStartTime);
792
  this.asr.onTranscript = (text, segmentId) => this.addTranscript(text, segmentId);
793
  this.asr.onLiveCaption = (text) => this.updateLiveCaption(text);
794
  this.asr.onStatusUpdate = (status, text) => this.updateStatus(status, text);
 
828
  this.updateStatus('idle', 'Ready');
829
  }
830
 
831
+ updateVadDisplay(prob, history, segmentEvents = [], historyStartTime = 0) {
832
  this.vadBarFill.style.width = `${prob * 100}%`;
833
  this.vadValue.textContent = `${Math.round(prob * 100)}%`;
834
 
 
837
  const width = rect.width;
838
  const height = rect.height;
839
 
840
+ // Leave space for x-axis labels
841
+ const graphHeight = height - 20;
842
+ const graphTop = 0;
843
+
844
  ctx.fillStyle = '#0f0f23';
845
  ctx.fillRect(0, 0, width, height);
846
 
847
  if (history.length < 2) return;
848
 
849
+ const historyDuration = history.length * 50; // ms (each entry is 50ms)
850
+ const now = Date.now();
851
+
852
+ // Draw x-axis ticks (every 0.1 seconds = 100ms)
853
+ ctx.strokeStyle = '#333';
854
+ ctx.fillStyle = '#666';
855
+ ctx.font = '10px monospace';
856
+ ctx.textAlign = 'center';
857
+ ctx.lineWidth = 1;
858
+
859
+ for (let t = 0; t <= historyDuration; t += 100) {
860
+ const x = (t / historyDuration) * width;
861
+
862
+ // Draw tick mark
863
+ ctx.beginPath();
864
+ ctx.moveTo(x, graphHeight);
865
+ ctx.lineTo(x, graphHeight + 5);
866
+ ctx.stroke();
867
+
868
+ // Draw vertical grid line (lighter for minor ticks)
869
+ if (t % 500 === 0) {
870
+ ctx.strokeStyle = '#444';
871
+ } else {
872
+ ctx.strokeStyle = '#222';
873
+ }
874
+ ctx.beginPath();
875
+ ctx.moveTo(x, graphTop);
876
+ ctx.lineTo(x, graphHeight);
877
+ ctx.stroke();
878
+ ctx.strokeStyle = '#333';
879
+
880
+ // Draw label every 0.5 seconds
881
+ if (t % 500 === 0) {
882
+ const seconds = (t / 1000).toFixed(1);
883
+ ctx.fillText(seconds + 's', x, height - 2);
884
+ }
885
+ }
886
+
887
+ // Draw segment events (start = green line, end = red line)
888
+ for (const event of segmentEvents) {
889
+ const eventAge = now - event.time; // ms ago
890
+ const eventPos = historyDuration - eventAge; // position in history
891
+ if (eventPos < 0 || eventPos > historyDuration) continue;
892
+
893
+ const x = (eventPos / historyDuration) * width;
894
+
895
+ ctx.lineWidth = 2;
896
+ if (event.type === 'start') {
897
+ ctx.strokeStyle = '#00ff88'; // Green for start
898
+ } else {
899
+ ctx.strokeStyle = '#ff4444'; // Red for end
900
+ }
901
+ ctx.beginPath();
902
+ ctx.moveTo(x, graphTop);
903
+ ctx.lineTo(x, graphHeight);
904
+ ctx.stroke();
905
+ }
906
+
907
+ // Draw threshold lines
908
+ const onsetY = graphHeight * (1 - parseFloat(this.onsetThreshold.value));
909
+ const offsetY = graphHeight * (1 - parseFloat(this.offsetThreshold.value));
910
 
911
  ctx.strokeStyle = '#ff444466';
912
+ ctx.lineWidth = 1;
913
  ctx.beginPath();
914
  ctx.moveTo(0, onsetY);
915
  ctx.lineTo(width, onsetY);
 
921
  ctx.lineTo(width, offsetY);
922
  ctx.stroke();
923
 
924
+ // Draw VAD probability line
925
  ctx.strokeStyle = '#00d4ff';
926
  ctx.lineWidth = 2;
927
  ctx.beginPath();
928
 
929
  for (let i = 0; i < history.length; i++) {
930
  const x = (i / (history.length - 1)) * width;
931
+ const y = graphHeight * (1 - history[i]);
932
  if (i === 0) {
933
  ctx.moveTo(x, y);
934
  } else {