KevinAHM committed on
Commit
294ea8d
·
1 Parent(s): 4c13925

Add: increased max generation length

Browse files
Files changed (3) hide show
  1. index.html +8 -9
  2. inference-worker.js +102 -62
  3. onnx-streaming.js +47 -41
index.html CHANGED
@@ -76,15 +76,14 @@
76
  </div>
77
 
78
  <div class="textarea-wrap">
79
- <textarea
80
- id="text-input"
81
- placeholder="Type or paste text to synthesize..."
82
- aria-label="Text to synthesize"
83
- maxlength="500"
84
- ></textarea>
85
- <div class="textarea-meta">
86
- <span class="char-count"><span id="char-count">0</span>/500</span>
87
- </div>
88
  </div>
89
 
90
  <!-- Sample Texts -->
 
76
  </div>
77
 
78
  <div class="textarea-wrap">
79
+ <textarea
80
+ id="text-input"
81
+ placeholder="Type or paste text to synthesize..."
82
+ aria-label="Text to synthesize"
83
+ ></textarea>
84
+ <div class="textarea-meta">
85
+ <span class="char-count"><span id="char-count">0</span> chars</span>
86
+ </div>
 
87
  </div>
88
 
89
  <!-- Sample Texts -->
inference-worker.js CHANGED
@@ -19,6 +19,13 @@ const MODELS = {
19
  const SAMPLE_RATE = 24000;
20
  const SAMPLES_PER_FRAME = 1920;
21
  const MAX_FRAMES = 500;
 
 
 
 
 
 
 
22
 
23
  // State
24
  let mimiEncoderSession = null;
@@ -229,63 +236,62 @@ function dedupPunctuation(text) {
229
  return text.replace(/\.\.\.+/g, '[ELLIPSIS]').replace(/,+/g, ',').replace(/[.,]*\.[.,]*/g, '.').replace(/[.,!]*![.,!]*/g, '!').replace(/[.,!?]*\?[.,!?]*/g, '?').replace(/\[ELLIPSIS\]/g, '...');
230
  }
231
 
232
- // Split text into sentence chunks (max 50 tokens each)
233
- function splitIntoBestSentences(text) {
234
- const preparedText = prepareText(text);
235
- if (!preparedText) return [];
236
 
237
- const tokenIds = tokenizerProcessor.encodeIds(preparedText);
 
 
 
 
238
 
239
- // Get end-of-sentence token IDs
240
- const eosTokenIds = tokenizerProcessor.encodeIds('.!...?');
241
- const eosSet = new Set(eosTokenIds);
 
 
 
 
 
242
 
243
- // Find sentence boundaries
244
- const endOfSentenceIndices = [0];
245
- let previousWasEos = false;
 
246
 
247
- for (let i = 0; i < tokenIds.length; i++) {
248
- if (eosSet.has(tokenIds[i])) {
249
- previousWasEos = true;
250
- } else {
251
- if (previousWasEos) {
252
- endOfSentenceIndices.push(i);
253
- }
254
- previousWasEos = false;
255
- }
256
- }
257
- endOfSentenceIndices.push(tokenIds.length);
258
-
259
- // Reconstruct sentences with token counts
260
- const sentences = [];
261
- for (let i = 0; i < endOfSentenceIndices.length - 1; i++) {
262
- const start = endOfSentenceIndices[i];
263
- const end = endOfSentenceIndices[i + 1];
264
- const sentenceTokens = tokenIds.slice(start, end);
265
- const sentenceText = tokenizerProcessor.decodeIds(sentenceTokens).trim();
266
- sentences.push({ tokens: end - start, text: sentenceText });
267
- }
268
 
269
- // Merge into chunks of max 50 tokens
270
- const MAX_TOKENS = 50;
271
  const chunks = [];
272
  let currentChunk = '';
273
- let currentTokens = 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
- for (const { tokens, text } of sentences) {
276
  if (currentChunk === '') {
277
- currentChunk = text;
278
- currentTokens = tokens;
279
  continue;
280
  }
281
 
282
- if (currentTokens + tokens > MAX_TOKENS) {
 
 
283
  chunks.push(currentChunk.trim());
284
- currentChunk = text;
285
- currentTokens = tokens;
286
  } else {
287
- currentChunk += ' ' + text;
288
- currentTokens += tokens;
289
  }
290
  }
291
 
@@ -750,7 +756,7 @@ async function startGeneration(text, voiceName) {
750
  postMessage({ type: 'generation_started', data: { time: performance.now() } });
751
 
752
  try {
753
- // Split text into sentence chunks (max 50 tokens each)
754
  const chunks = splitIntoBestSentences(text);
755
  console.log(`Split into ${chunks.length} chunks:`, chunks);
756
 
@@ -788,8 +794,7 @@ async function startGeneration(text, voiceName) {
788
  }
789
 
790
  async function runGenerationPipeline(voiceEmb, chunks) {
791
- // Initialize state - persists across all chunks
792
- let flowLmState = initState(flowLmMainSession, FLOW_LM_STATE_SHAPES);
793
  let mimiState = initState(mimiDecoderSession, MIMI_DECODER_STATE_SHAPES);
794
  const emptySeq = new ort.Tensor('float32', new Float32Array(0), [1, 0, 32]);
795
  const emptyTextEmb = new ort.Tensor('float32', new Float32Array(0), [1, 0, 1024]);
@@ -798,25 +803,30 @@ async function runGenerationPipeline(voiceEmb, chunks) {
798
  const voiceTensor = new ort.Tensor('float32', voiceEmb.data, voiceEmb.shape);
799
  console.log('Voice embeddings shape:', voiceEmb.shape);
800
 
801
- // Voice conditioning (once for all chunks)
802
- console.log('Running voice conditioning...');
803
- const voiceCondInputs = {
804
- sequence: emptySeq,
805
- text_embeddings: voiceTensor,
806
- ...flowLmState
807
- };
 
808
 
809
- let condResult = await flowLmMainSession.run(voiceCondInputs);
810
 
811
- // Update state from voice conditioning
812
- for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
813
- const outputName = flowLmMainSession.outputNames[i];
814
- if (outputName.startsWith('out_state_')) {
815
- const stateIdx = parseInt(outputName.replace('out_state_', ''));
816
- flowLmState[`state_${stateIdx}`] = condResult[outputName];
 
817
  }
 
818
  }
819
 
 
 
820
  // Streaming parameters
821
  const FIRST_CHUNK_FRAMES = 3;
822
  const NORMAL_CHUNK_FRAMES = 12;
@@ -833,9 +843,18 @@ async function runGenerationPipeline(voiceEmb, chunks) {
833
  for (let chunkIdx = 0; chunkIdx < chunks.length; chunkIdx++) {
834
  if (!isGenerating) break;
835
 
 
 
 
 
 
 
 
836
  const chunkText = chunks[chunkIdx];
837
  console.log(`Processing chunk ${chunkIdx + 1}/${chunks.length}: "${chunkText}"`);
838
 
 
 
839
  // Tokenize this chunk
840
  const tokenIds = tokenizerProcessor.encodeIds(chunkText);
841
  console.log(`Chunk ${chunkIdx + 1} tokens:`, tokenIds.length);
@@ -855,7 +874,7 @@ async function runGenerationPipeline(voiceEmb, chunks) {
855
  ...flowLmState
856
  };
857
 
858
- condResult = await flowLmMainSession.run(textCondInputs);
859
 
860
  // Update state from text conditioning
861
  for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
@@ -873,6 +892,7 @@ async function runGenerationPipeline(voiceEmb, chunks) {
873
  const FRAMES_AFTER_EOS = 3; // Match PyTorch behavior - generate extra frames after EOS
874
  let eosStep = null;
875
 
 
876
  for (let step = 0; step < MAX_FRAMES; step++) {
877
  if (!isGenerating) break;
878
 
@@ -994,18 +1014,38 @@ async function runGenerationPipeline(voiceEmb, chunks) {
994
  decTime: 0,
995
  chunkDuration: audioFloat32.length / SAMPLE_RATE,
996
  isFirst: isFirstAudioChunk,
997
- isLast: isLastChunk
 
998
  }
999
  }, [audioFloat32.buffer]);
1000
 
1001
  isFirstAudioChunk = false;
 
1002
  }
1003
 
1004
  if (shouldStop) {
1005
  console.log(`Chunk ${chunkIdx + 1} EOS at step ${eosStep}, stopped at step ${step}, ${chunkLatents.length} frames`);
 
1006
  break;
1007
  }
1008
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1009
  }
1010
 
1011
  const totalTime = (performance.now() - arStartTime) / 1000;
 
19
  const SAMPLE_RATE = 24000;
20
  const SAMPLES_PER_FRAME = 1920;
21
  const MAX_FRAMES = 500;
22
+ // Text chunking target; lower if long passages hit generation limits.
23
+ const CHUNK_TARGET_TOKENS = 50;
24
+ const CHUNK_GAP_SEC = 0.25;
25
+ // If true, re-run voice conditioning per chunk to avoid stale AR state.
26
+ const RESET_FLOW_STATE_EACH_CHUNK = true;
27
+ // If true, reset decoder state per chunk to avoid carry-over artifacts.
28
+ const RESET_MIMI_STATE_EACH_CHUNK = true;
29
 
30
  // State
31
  let mimiEncoderSession = null;
 
236
  return text.replace(/\.\.\.+/g, '[ELLIPSIS]').replace(/,+/g, ',').replace(/[.,]*\.[.,]*/g, '.').replace(/[.,!]*![.,!]*/g, '!').replace(/[.,!?]*\?[.,!?]*/g, '?').replace(/\[ELLIPSIS\]/g, '...');
237
  }
238
 
239
+ const SENTENCE_SPLIT_RE = /[^.!?]+[.!?]+|[^.!?]+$/g;
 
 
 
240
 
241
+ function splitTextIntoSentences(text) {
242
+ const matches = text.match(SENTENCE_SPLIT_RE);
243
+ if (!matches) return [];
244
+ return matches.map(sentence => sentence.trim()).filter(Boolean);
245
+ }
246
 
247
+ function splitTokenIdsIntoChunks(tokenIds, maxTokens) {
248
+ const chunks = [];
249
+ for (let i = 0; i < tokenIds.length; i += maxTokens) {
250
+ const chunkText = tokenizerProcessor.decodeIds(tokenIds.slice(i, i + maxTokens)).trim();
251
+ if (chunkText) chunks.push(chunkText);
252
+ }
253
+ return chunks;
254
+ }
255
 
256
+ // Split text into sentence chunks (target <= CHUNK_TARGET_TOKENS tokens)
257
+ function splitIntoBestSentences(text) {
258
+ const preparedText = prepareText(text);
259
+ if (!preparedText) return [];
260
 
261
+ const sentences = splitTextIntoSentences(preparedText);
262
+ if (sentences.length === 0) return [];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
 
264
+ // Merge sentences into chunks that stay within the token target
 
265
  const chunks = [];
266
  let currentChunk = '';
267
+ for (const sentenceText of sentences) {
268
+ const sentenceTokenIds = tokenizerProcessor.encodeIds(sentenceText);
269
+ const sentenceTokens = sentenceTokenIds.length;
270
+
271
+ if (sentenceTokens > CHUNK_TARGET_TOKENS) {
272
+ if (currentChunk !== '') {
273
+ chunks.push(currentChunk.trim());
274
+ currentChunk = '';
275
+ }
276
+ const splitChunks = splitTokenIdsIntoChunks(sentenceTokenIds, CHUNK_TARGET_TOKENS);
277
+ for (const splitChunk of splitChunks) {
278
+ if (splitChunk) chunks.push(splitChunk.trim());
279
+ }
280
+ continue;
281
+ }
282
 
 
283
  if (currentChunk === '') {
284
+ currentChunk = sentenceText;
 
285
  continue;
286
  }
287
 
288
+ const combined = `${currentChunk} ${sentenceText}`;
289
+ const combinedTokens = tokenizerProcessor.encodeIds(combined).length;
290
+ if (combinedTokens > CHUNK_TARGET_TOKENS) {
291
  chunks.push(currentChunk.trim());
292
+ currentChunk = sentenceText;
 
293
  } else {
294
+ currentChunk = combined;
 
295
  }
296
  }
297
 
 
756
  postMessage({ type: 'generation_started', data: { time: performance.now() } });
757
 
758
  try {
759
+ // Split text into sentence chunks (target <= CHUNK_TARGET_TOKENS tokens)
760
  const chunks = splitIntoBestSentences(text);
761
  console.log(`Split into ${chunks.length} chunks:`, chunks);
762
 
 
794
  }
795
 
796
  async function runGenerationPipeline(voiceEmb, chunks) {
797
+ // Initialize state - may be reset per chunk
 
798
  let mimiState = initState(mimiDecoderSession, MIMI_DECODER_STATE_SHAPES);
799
  const emptySeq = new ort.Tensor('float32', new Float32Array(0), [1, 0, 32]);
800
  const emptyTextEmb = new ort.Tensor('float32', new Float32Array(0), [1, 0, 1024]);
 
803
  const voiceTensor = new ort.Tensor('float32', voiceEmb.data, voiceEmb.shape);
804
  console.log('Voice embeddings shape:', voiceEmb.shape);
805
 
806
+ async function buildVoiceConditionedState() {
807
+ let flowLmState = initState(flowLmMainSession, FLOW_LM_STATE_SHAPES);
808
+ console.log('Running voice conditioning...');
809
+ const voiceCondInputs = {
810
+ sequence: emptySeq,
811
+ text_embeddings: voiceTensor,
812
+ ...flowLmState
813
+ };
814
 
815
+ let condResult = await flowLmMainSession.run(voiceCondInputs);
816
 
817
+ // Update state from voice conditioning
818
+ for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
819
+ const outputName = flowLmMainSession.outputNames[i];
820
+ if (outputName.startsWith('out_state_')) {
821
+ const stateIdx = parseInt(outputName.replace('out_state_', ''));
822
+ flowLmState[`state_${stateIdx}`] = condResult[outputName];
823
+ }
824
  }
825
+ return flowLmState;
826
  }
827
 
828
+ let flowLmState = await buildVoiceConditionedState();
829
+
830
  // Streaming parameters
831
  const FIRST_CHUNK_FRAMES = 3;
832
  const NORMAL_CHUNK_FRAMES = 12;
 
843
  for (let chunkIdx = 0; chunkIdx < chunks.length; chunkIdx++) {
844
  if (!isGenerating) break;
845
 
846
+ if (RESET_FLOW_STATE_EACH_CHUNK && chunkIdx > 0) {
847
+ flowLmState = await buildVoiceConditionedState();
848
+ }
849
+ if (RESET_MIMI_STATE_EACH_CHUNK && chunkIdx > 0) {
850
+ mimiState = initState(mimiDecoderSession, MIMI_DECODER_STATE_SHAPES);
851
+ }
852
+
853
  const chunkText = chunks[chunkIdx];
854
  console.log(`Processing chunk ${chunkIdx + 1}/${chunks.length}: "${chunkText}"`);
855
 
856
+ let isFirstAudioChunkOfTextChunk = true;
857
+
858
  // Tokenize this chunk
859
  const tokenIds = tokenizerProcessor.encodeIds(chunkText);
860
  console.log(`Chunk ${chunkIdx + 1} tokens:`, tokenIds.length);
 
874
  ...flowLmState
875
  };
876
 
877
+ let condResult = await flowLmMainSession.run(textCondInputs);
878
 
879
  // Update state from text conditioning
880
  for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
 
892
  const FRAMES_AFTER_EOS = 3; // Match PyTorch behavior - generate extra frames after EOS
893
  let eosStep = null;
894
 
895
+ let chunkEnded = false;
896
  for (let step = 0; step < MAX_FRAMES; step++) {
897
  if (!isGenerating) break;
898
 
 
1014
  decTime: 0,
1015
  chunkDuration: audioFloat32.length / SAMPLE_RATE,
1016
  isFirst: isFirstAudioChunk,
1017
+ isLast: isLastChunk,
1018
+ chunkStart: isFirstAudioChunkOfTextChunk
1019
  }
1020
  }, [audioFloat32.buffer]);
1021
 
1022
  isFirstAudioChunk = false;
1023
+ isFirstAudioChunkOfTextChunk = false;
1024
  }
1025
 
1026
  if (shouldStop) {
1027
  console.log(`Chunk ${chunkIdx + 1} EOS at step ${eosStep}, stopped at step ${step}, ${chunkLatents.length} frames`);
1028
+ chunkEnded = true;
1029
  break;
1030
  }
1031
  }
1032
+
1033
+ if (chunkEnded && isGenerating && chunkIdx < chunks.length - 1) {
1034
+ const gapSamples = Math.max(1, Math.floor(CHUNK_GAP_SEC * SAMPLE_RATE));
1035
+ const silence = new Float32Array(gapSamples);
1036
+ postMessage({
1037
+ type: 'audio_chunk',
1038
+ data: silence,
1039
+ metrics: {
1040
+ bbTime: 0,
1041
+ decTime: 0,
1042
+ chunkDuration: gapSamples / SAMPLE_RATE,
1043
+ isFirst: false,
1044
+ isLast: false,
1045
+ isSilence: true
1046
+ }
1047
+ }, [silence.buffer]);
1048
+ }
1049
  }
1050
 
1051
  const totalTime = (performance.now() - arStartTime) / 1000;
onnx-streaming.js CHANGED
@@ -2,8 +2,8 @@
2
  import { PCMPlayerWorklet as PCMPlayer } from './PCMPlayerWorklet.js';
3
 
4
  // Configuration
5
- const SAMPLE_RATE = 24000;
6
- const FADE_SAMPLES = 480; // 20ms fade at 24kHz
7
 
8
  export class PocketTTSStreaming {
9
  constructor() {
@@ -22,7 +22,8 @@ export class PocketTTSStreaming {
22
  // Metrics State
23
  this.generationStartTime = 0;
24
  this.lastChunkFinishTime = 0;
25
- this.rtfMovingAverage = 0;
 
26
 
27
  // Edge optimization state (dynamic LSD)
28
  this.edgeOptimizationApplied = false;
@@ -392,38 +393,40 @@ export class PocketTTSStreaming {
392
  }
393
  }
394
 
395
- stopGeneration() {
396
- if (!this.isGenerating) return;
397
- this.worker.postMessage({ type: 'stop' });
398
- // Handle stop immediately in UI
399
- this.handleStreamEnd();
400
- }
401
-
402
- applyFadeIn(audioData) {
403
- const fadeLen = Math.min(FADE_SAMPLES, audioData.length);
404
- for (let i = 0; i < fadeLen; i++) {
405
- audioData[i] *= i / fadeLen;
406
- }
407
- }
408
-
409
- applyFadeOut(audioData) {
410
- const fadeLen = Math.min(FADE_SAMPLES, audioData.length);
411
- const startIdx = audioData.length - fadeLen;
412
- for (let i = 0; i < fadeLen; i++) {
413
- audioData[startIdx + i] *= 1 - (i / fadeLen);
414
- }
415
- }
416
-
417
- handleAudioChunk(audioData, metrics) {
418
- if (!this.isGenerating) return;
419
-
420
- // Apply fades to prevent pops
421
- if (metrics.isFirst) {
422
- this.applyFadeIn(audioData);
423
- }
424
- if (metrics.isLast) {
425
- this.applyFadeOut(audioData);
426
- }
 
 
427
 
428
  // Play audio
429
  this.player.playAudio(audioData);
@@ -432,12 +435,15 @@ export class PocketTTSStreaming {
432
  const now = performance.now();
433
  let ttfb = 0;
434
 
435
- if (metrics.isFirst) {
436
- ttfb = now - this.generationStartTime;
437
- this.lastChunkFinishTime = now;
438
- } else if (this.lastChunkFinishTime > 0) {
439
- const timeSinceLastChunk = (now - this.lastChunkFinishTime) / 1000;
440
- this.lastChunkFinishTime = now;
 
 
 
441
 
442
  if (timeSinceLastChunk > 0) {
443
  const chunkDurationSec = metrics.chunkDuration;
 
2
  import { PCMPlayerWorklet as PCMPlayer } from './PCMPlayerWorklet.js';
3
 
4
  // Configuration
5
+ const SAMPLE_RATE = 24000;
6
+ const FADE_SAMPLES = 480; // 20ms fade at 24kHz
7
 
8
  export class PocketTTSStreaming {
9
  constructor() {
 
22
  // Metrics State
23
  this.generationStartTime = 0;
24
  this.lastChunkFinishTime = 0;
25
+ this.rtfMovingAverage = 0;
26
+ this.skipNextRtf = false;
27
 
28
  // Edge optimization state (dynamic LSD)
29
  this.edgeOptimizationApplied = false;
 
393
  }
394
  }
395
 
396
+ stopGeneration() {
397
+ if (!this.isGenerating) return;
398
+ this.worker.postMessage({ type: 'stop' });
399
+ // Handle stop immediately in UI
400
+ this.handleStreamEnd();
401
+ }
402
+
403
+ applyFadeIn(audioData) {
404
+ const fadeLen = Math.min(FADE_SAMPLES, audioData.length);
405
+ for (let i = 0; i < fadeLen; i++) {
406
+ audioData[i] *= i / fadeLen;
407
+ }
408
+ }
409
+
410
+ applyFadeOut(audioData) {
411
+ const fadeLen = Math.min(FADE_SAMPLES, audioData.length);
412
+ const startIdx = audioData.length - fadeLen;
413
+ for (let i = 0; i < fadeLen; i++) {
414
+ audioData[startIdx + i] *= 1 - (i / fadeLen);
415
+ }
416
+ }
417
+
418
+ handleAudioChunk(audioData, metrics) {
419
+ if (!this.isGenerating) return;
420
+
421
+ if (metrics.isSilence) {
422
+ this.player.playAudio(audioData);
423
+ this.skipNextRtf = true;
424
+ return;
425
+ }
426
+
427
+ // Apply fade-in at the start of each text chunk
428
+ if (metrics.isFirst || metrics.chunkStart) this.applyFadeIn(audioData);
429
+ if (metrics.isLast) this.applyFadeOut(audioData);
430
 
431
  // Play audio
432
  this.player.playAudio(audioData);
 
435
  const now = performance.now();
436
  let ttfb = 0;
437
 
438
+ if (metrics.isFirst) {
439
+ ttfb = now - this.generationStartTime;
440
+ this.lastChunkFinishTime = now;
441
+ } else if (this.skipNextRtf) {
442
+ this.lastChunkFinishTime = now;
443
+ this.skipNextRtf = false;
444
+ } else if (this.lastChunkFinishTime > 0) {
445
+ const timeSinceLastChunk = (now - this.lastChunkFinishTime) / 1000;
446
+ this.lastChunkFinishTime = now;
447
 
448
  if (timeSinceLastChunk > 0) {
449
  const chunkDurationSec = metrics.chunkDuration;