KevinAHM committed on
Commit
294ea8d
·
1 Parent(s): 4c13925

Add: increased max generation length

Browse files
Files changed (3) hide show
  1. index.html +8 -9
  2. inference-worker.js +102 -62
  3. onnx-streaming.js +47 -41
index.html CHANGED
@@ -76,15 +76,14 @@
76
  </div>
77
 
78
  <div class="textarea-wrap">
79
- <textarea
80
- id="text-input"
81
- placeholder="Type or paste text to synthesize..."
82
- aria-label="Text to synthesize"
83
- maxlength="500"
84
- ></textarea>
85
- <div class="textarea-meta">
86
- <span class="char-count"><span id="char-count">0</span>/500</span>
87
- </div>
88
  </div>
89
 
90
  <!-- Sample Texts -->
 
76
  </div>
77
 
78
  <div class="textarea-wrap">
79
+ <textarea
80
+ id="text-input"
81
+ placeholder="Type or paste text to synthesize..."
82
+ aria-label="Text to synthesize"
83
+ ></textarea>
84
+ <div class="textarea-meta">
85
+ <span class="char-count"><span id="char-count">0</span> chars</span>
86
+ </div>
 
87
  </div>
88
 
89
  <!-- Sample Texts -->
inference-worker.js CHANGED
@@ -19,6 +19,13 @@ const MODELS = {
19
  const SAMPLE_RATE = 24000;
20
  const SAMPLES_PER_FRAME = 1920;
21
  const MAX_FRAMES = 500;
 
 
 
 
 
 
 
22
 
23
  // State
24
  let mimiEncoderSession = null;
@@ -229,63 +236,62 @@ function dedupPunctuation(text) {
229
  return text.replace(/\.\.\.+/g, '[ELLIPSIS]').replace(/,+/g, ',').replace(/[.,]*\.[.,]*/g, '.').replace(/[.,!]*![.,!]*/g, '!').replace(/[.,!?]*\?[.,!?]*/g, '?').replace(/\[ELLIPSIS\]/g, '...');
230
  }
231
 
232
- // Split text into sentence chunks (max 50 tokens each)
233
- function splitIntoBestSentences(text) {
234
- const preparedText = prepareText(text);
235
- if (!preparedText) return [];
236
 
237
- const tokenIds = tokenizerProcessor.encodeIds(preparedText);
 
 
 
 
238
 
239
- // Get end-of-sentence token IDs
240
- const eosTokenIds = tokenizerProcessor.encodeIds('.!...?');
241
- const eosSet = new Set(eosTokenIds);
 
 
 
 
 
242
 
243
- // Find sentence boundaries
244
- const endOfSentenceIndices = [0];
245
- let previousWasEos = false;
 
246
 
247
- for (let i = 0; i < tokenIds.length; i++) {
248
- if (eosSet.has(tokenIds[i])) {
249
- previousWasEos = true;
250
- } else {
251
- if (previousWasEos) {
252
- endOfSentenceIndices.push(i);
253
- }
254
- previousWasEos = false;
255
- }
256
- }
257
- endOfSentenceIndices.push(tokenIds.length);
258
-
259
- // Reconstruct sentences with token counts
260
- const sentences = [];
261
- for (let i = 0; i < endOfSentenceIndices.length - 1; i++) {
262
- const start = endOfSentenceIndices[i];
263
- const end = endOfSentenceIndices[i + 1];
264
- const sentenceTokens = tokenIds.slice(start, end);
265
- const sentenceText = tokenizerProcessor.decodeIds(sentenceTokens).trim();
266
- sentences.push({ tokens: end - start, text: sentenceText });
267
- }
268
 
269
- // Merge into chunks of max 50 tokens
270
- const MAX_TOKENS = 50;
271
  const chunks = [];
272
  let currentChunk = '';
273
- let currentTokens = 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
- for (const { tokens, text } of sentences) {
276
  if (currentChunk === '') {
277
- currentChunk = text;
278
- currentTokens = tokens;
279
  continue;
280
  }
281
 
282
- if (currentTokens + tokens > MAX_TOKENS) {
 
 
283
  chunks.push(currentChunk.trim());
284
- currentChunk = text;
285
- currentTokens = tokens;
286
  } else {
287
- currentChunk += ' ' + text;
288
- currentTokens += tokens;
289
  }
290
  }
291
 
@@ -750,7 +756,7 @@ async function startGeneration(text, voiceName) {
750
  postMessage({ type: 'generation_started', data: { time: performance.now() } });
751
 
752
  try {
753
- // Split text into sentence chunks (max 50 tokens each)
754
  const chunks = splitIntoBestSentences(text);
755
  console.log(`Split into ${chunks.length} chunks:`, chunks);
756
 
@@ -788,8 +794,7 @@ async function startGeneration(text, voiceName) {
788
  }
789
 
790
  async function runGenerationPipeline(voiceEmb, chunks) {
791
- // Initialize state - persists across all chunks
792
- let flowLmState = initState(flowLmMainSession, FLOW_LM_STATE_SHAPES);
793
  let mimiState = initState(mimiDecoderSession, MIMI_DECODER_STATE_SHAPES);
794
  const emptySeq = new ort.Tensor('float32', new Float32Array(0), [1, 0, 32]);
795
  const emptyTextEmb = new ort.Tensor('float32', new Float32Array(0), [1, 0, 1024]);
@@ -798,25 +803,30 @@ async function runGenerationPipeline(voiceEmb, chunks) {
798
  const voiceTensor = new ort.Tensor('float32', voiceEmb.data, voiceEmb.shape);
799
  console.log('Voice embeddings shape:', voiceEmb.shape);
800
 
801
- // Voice conditioning (once for all chunks)
802
- console.log('Running voice conditioning...');
803
- const voiceCondInputs = {
804
- sequence: emptySeq,
805
- text_embeddings: voiceTensor,
806
- ...flowLmState
807
- };
 
808
 
809
- let condResult = await flowLmMainSession.run(voiceCondInputs);
810
 
811
- // Update state from voice conditioning
812
- for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
813
- const outputName = flowLmMainSession.outputNames[i];
814
- if (outputName.startsWith('out_state_')) {
815
- const stateIdx = parseInt(outputName.replace('out_state_', ''));
816
- flowLmState[`state_${stateIdx}`] = condResult[outputName];
 
817
  }
 
818
  }
819
 
 
 
820
  // Streaming parameters
821
  const FIRST_CHUNK_FRAMES = 3;
822
  const NORMAL_CHUNK_FRAMES = 12;
@@ -833,9 +843,18 @@ async function runGenerationPipeline(voiceEmb, chunks) {
833
  for (let chunkIdx = 0; chunkIdx < chunks.length; chunkIdx++) {
834
  if (!isGenerating) break;
835
 
 
 
 
 
 
 
 
836
  const chunkText = chunks[chunkIdx];
837
  console.log(`Processing chunk ${chunkIdx + 1}/${chunks.length}: "${chunkText}"`);
838
 
 
 
839
  // Tokenize this chunk
840
  const tokenIds = tokenizerProcessor.encodeIds(chunkText);
841
  console.log(`Chunk ${chunkIdx + 1} tokens:`, tokenIds.length);
@@ -855,7 +874,7 @@ async function runGenerationPipeline(voiceEmb, chunks) {
855
  ...flowLmState
856
  };
857
 
858
- condResult = await flowLmMainSession.run(textCondInputs);
859
 
860
  // Update state from text conditioning
861
  for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
@@ -873,6 +892,7 @@ async function runGenerationPipeline(voiceEmb, chunks) {
873
  const FRAMES_AFTER_EOS = 3; // Match PyTorch behavior - generate extra frames after EOS
874
  let eosStep = null;
875
 
 
876
  for (let step = 0; step < MAX_FRAMES; step++) {
877
  if (!isGenerating) break;
878
 
@@ -994,18 +1014,38 @@ async function runGenerationPipeline(voiceEmb, chunks) {
994
  decTime: 0,
995
  chunkDuration: audioFloat32.length / SAMPLE_RATE,
996
  isFirst: isFirstAudioChunk,
997
- isLast: isLastChunk
 
998
  }
999
  }, [audioFloat32.buffer]);
1000
 
1001
  isFirstAudioChunk = false;
 
1002
  }
1003
 
1004
  if (shouldStop) {
1005
  console.log(`Chunk ${chunkIdx + 1} EOS at step ${eosStep}, stopped at step ${step}, ${chunkLatents.length} frames`);
 
1006
  break;
1007
  }
1008
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1009
  }
1010
 
1011
  const totalTime = (performance.now() - arStartTime) / 1000;
 
19
  const SAMPLE_RATE = 24000;
20
  const SAMPLES_PER_FRAME = 1920;
21
  const MAX_FRAMES = 500;
22
+ // Text chunking target; lower if long passages hit generation limits.
23
+ const CHUNK_TARGET_TOKENS = 50;
24
+ const CHUNK_GAP_SEC = 0.25;
25
+ // If true, re-run voice conditioning per chunk to avoid stale AR state.
26
+ const RESET_FLOW_STATE_EACH_CHUNK = true;
27
+ // If true, reset decoder state per chunk to avoid carry-over artifacts.
28
+ const RESET_MIMI_STATE_EACH_CHUNK = true;
29
 
30
  // State
31
  let mimiEncoderSession = null;
 
236
  return text.replace(/\.\.\.+/g, '[ELLIPSIS]').replace(/,+/g, ',').replace(/[.,]*\.[.,]*/g, '.').replace(/[.,!]*![.,!]*/g, '!').replace(/[.,!?]*\?[.,!?]*/g, '?').replace(/\[ELLIPSIS\]/g, '...');
237
  }
238
 
239
+ const SENTENCE_SPLIT_RE = /[^.!?]+[.!?]+|[^.!?]+$/g;
 
 
 
240
 
241
+ function splitTextIntoSentences(text) {
242
+ const matches = text.match(SENTENCE_SPLIT_RE);
243
+ if (!matches) return [];
244
+ return matches.map(sentence => sentence.trim()).filter(Boolean);
245
+ }
246
 
247
+ function splitTokenIdsIntoChunks(tokenIds, maxTokens) {
248
+ const chunks = [];
249
+ for (let i = 0; i < tokenIds.length; i += maxTokens) {
250
+ const chunkText = tokenizerProcessor.decodeIds(tokenIds.slice(i, i + maxTokens)).trim();
251
+ if (chunkText) chunks.push(chunkText);
252
+ }
253
+ return chunks;
254
+ }
255
 
256
+ // Split text into sentence chunks (target <= CHUNK_TARGET_TOKENS tokens)
257
+ function splitIntoBestSentences(text) {
258
+ const preparedText = prepareText(text);
259
+ if (!preparedText) return [];
260
 
261
+ const sentences = splitTextIntoSentences(preparedText);
262
+ if (sentences.length === 0) return [];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
 
264
+ // Merge sentences into chunks that stay within the token target
 
265
  const chunks = [];
266
  let currentChunk = '';
267
+ for (const sentenceText of sentences) {
268
+ const sentenceTokenIds = tokenizerProcessor.encodeIds(sentenceText);
269
+ const sentenceTokens = sentenceTokenIds.length;
270
+
271
+ if (sentenceTokens > CHUNK_TARGET_TOKENS) {
272
+ if (currentChunk !== '') {
273
+ chunks.push(currentChunk.trim());
274
+ currentChunk = '';
275
+ }
276
+ const splitChunks = splitTokenIdsIntoChunks(sentenceTokenIds, CHUNK_TARGET_TOKENS);
277
+ for (const splitChunk of splitChunks) {
278
+ if (splitChunk) chunks.push(splitChunk.trim());
279
+ }
280
+ continue;
281
+ }
282
 
 
283
  if (currentChunk === '') {
284
+ currentChunk = sentenceText;
 
285
  continue;
286
  }
287
 
288
+ const combined = `${currentChunk} ${sentenceText}`;
289
+ const combinedTokens = tokenizerProcessor.encodeIds(combined).length;
290
+ if (combinedTokens > CHUNK_TARGET_TOKENS) {
291
  chunks.push(currentChunk.trim());
292
+ currentChunk = sentenceText;
 
293
  } else {
294
+ currentChunk = combined;
 
295
  }
296
  }
297
 
 
756
  postMessage({ type: 'generation_started', data: { time: performance.now() } });
757
 
758
  try {
759
+ // Split text into sentence chunks (target <= CHUNK_TARGET_TOKENS tokens)
760
  const chunks = splitIntoBestSentences(text);
761
  console.log(`Split into ${chunks.length} chunks:`, chunks);
762
 
 
794
  }
795
 
796
  async function runGenerationPipeline(voiceEmb, chunks) {
797
+ // Initialize state - may be reset per chunk
 
798
  let mimiState = initState(mimiDecoderSession, MIMI_DECODER_STATE_SHAPES);
799
  const emptySeq = new ort.Tensor('float32', new Float32Array(0), [1, 0, 32]);
800
  const emptyTextEmb = new ort.Tensor('float32', new Float32Array(0), [1, 0, 1024]);
 
803
  const voiceTensor = new ort.Tensor('float32', voiceEmb.data, voiceEmb.shape);
804
  console.log('Voice embeddings shape:', voiceEmb.shape);
805
 
806
+ async function buildVoiceConditionedState() {
807
+ let flowLmState = initState(flowLmMainSession, FLOW_LM_STATE_SHAPES);
808
+ console.log('Running voice conditioning...');
809
+ const voiceCondInputs = {
810
+ sequence: emptySeq,
811
+ text_embeddings: voiceTensor,
812
+ ...flowLmState
813
+ };
814
 
815
+ let condResult = await flowLmMainSession.run(voiceCondInputs);
816
 
817
+ // Update state from voice conditioning
818
+ for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
819
+ const outputName = flowLmMainSession.outputNames[i];
820
+ if (outputName.startsWith('out_state_')) {
821
+ const stateIdx = parseInt(outputName.replace('out_state_', ''));
822
+ flowLmState[`state_${stateIdx}`] = condResult[outputName];
823
+ }
824
  }
825
+ return flowLmState;
826
  }
827
 
828
+ let flowLmState = await buildVoiceConditionedState();
829
+
830
  // Streaming parameters
831
  const FIRST_CHUNK_FRAMES = 3;
832
  const NORMAL_CHUNK_FRAMES = 12;
 
843
  for (let chunkIdx = 0; chunkIdx < chunks.length; chunkIdx++) {
844
  if (!isGenerating) break;
845
 
846
+ if (RESET_FLOW_STATE_EACH_CHUNK && chunkIdx > 0) {
847
+ flowLmState = await buildVoiceConditionedState();
848
+ }
849
+ if (RESET_MIMI_STATE_EACH_CHUNK && chunkIdx > 0) {
850
+ mimiState = initState(mimiDecoderSession, MIMI_DECODER_STATE_SHAPES);
851
+ }
852
+
853
  const chunkText = chunks[chunkIdx];
854
  console.log(`Processing chunk ${chunkIdx + 1}/${chunks.length}: "${chunkText}"`);
855
 
856
+ let isFirstAudioChunkOfTextChunk = true;
857
+
858
  // Tokenize this chunk
859
  const tokenIds = tokenizerProcessor.encodeIds(chunkText);
860
  console.log(`Chunk ${chunkIdx + 1} tokens:`, tokenIds.length);
 
874
  ...flowLmState
875
  };
876
 
877
+ let condResult = await flowLmMainSession.run(textCondInputs);
878
 
879
  // Update state from text conditioning
880
  for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
 
892
  const FRAMES_AFTER_EOS = 3; // Match PyTorch behavior - generate extra frames after EOS
893
  let eosStep = null;
894
 
895
+ let chunkEnded = false;
896
  for (let step = 0; step < MAX_FRAMES; step++) {
897
  if (!isGenerating) break;
898
 
 
1014
  decTime: 0,
1015
  chunkDuration: audioFloat32.length / SAMPLE_RATE,
1016
  isFirst: isFirstAudioChunk,
1017
+ isLast: isLastChunk,
1018
+ chunkStart: isFirstAudioChunkOfTextChunk
1019
  }
1020
  }, [audioFloat32.buffer]);
1021
 
1022
  isFirstAudioChunk = false;
1023
+ isFirstAudioChunkOfTextChunk = false;
1024
  }
1025
 
1026
  if (shouldStop) {
1027
  console.log(`Chunk ${chunkIdx + 1} EOS at step ${eosStep}, stopped at step ${step}, ${chunkLatents.length} frames`);
1028
+ chunkEnded = true;
1029
  break;
1030
  }
1031
  }
1032
+
1033
+ if (chunkEnded && isGenerating && chunkIdx < chunks.length - 1) {
1034
+ const gapSamples = Math.max(1, Math.floor(CHUNK_GAP_SEC * SAMPLE_RATE));
1035
+ const silence = new Float32Array(gapSamples);
1036
+ postMessage({
1037
+ type: 'audio_chunk',
1038
+ data: silence,
1039
+ metrics: {
1040
+ bbTime: 0,
1041
+ decTime: 0,
1042
+ chunkDuration: gapSamples / SAMPLE_RATE,
1043
+ isFirst: false,
1044
+ isLast: false,
1045
+ isSilence: true
1046
+ }
1047
+ }, [silence.buffer]);
1048
+ }
1049
  }
1050
 
1051
  const totalTime = (performance.now() - arStartTime) / 1000;
onnx-streaming.js CHANGED
@@ -2,8 +2,8 @@
2
  import { PCMPlayerWorklet as PCMPlayer } from './PCMPlayerWorklet.js';
3
 
4
  // Configuration
5
- const SAMPLE_RATE = 24000;
6
- const FADE_SAMPLES = 480; // 20ms fade at 24kHz
7
 
8
  export class PocketTTSStreaming {
9
  constructor() {
@@ -22,7 +22,8 @@ export class PocketTTSStreaming {
22
  // Metrics State
23
  this.generationStartTime = 0;
24
  this.lastChunkFinishTime = 0;
25
- this.rtfMovingAverage = 0;
 
26
 
27
  // Edge optimization state (dynamic LSD)
28
  this.edgeOptimizationApplied = false;
@@ -392,38 +393,40 @@ export class PocketTTSStreaming {
392
  }
393
  }
394
 
395
- stopGeneration() {
396
- if (!this.isGenerating) return;
397
- this.worker.postMessage({ type: 'stop' });
398
- // Handle stop immediately in UI
399
- this.handleStreamEnd();
400
- }
401
-
402
- applyFadeIn(audioData) {
403
- const fadeLen = Math.min(FADE_SAMPLES, audioData.length);
404
- for (let i = 0; i < fadeLen; i++) {
405
- audioData[i] *= i / fadeLen;
406
- }
407
- }
408
-
409
- applyFadeOut(audioData) {
410
- const fadeLen = Math.min(FADE_SAMPLES, audioData.length);
411
- const startIdx = audioData.length - fadeLen;
412
- for (let i = 0; i < fadeLen; i++) {
413
- audioData[startIdx + i] *= 1 - (i / fadeLen);
414
- }
415
- }
416
-
417
- handleAudioChunk(audioData, metrics) {
418
- if (!this.isGenerating) return;
419
-
420
- // Apply fades to prevent pops
421
- if (metrics.isFirst) {
422
- this.applyFadeIn(audioData);
423
- }
424
- if (metrics.isLast) {
425
- this.applyFadeOut(audioData);
426
- }
 
 
427
 
428
  // Play audio
429
  this.player.playAudio(audioData);
@@ -432,12 +435,15 @@ export class PocketTTSStreaming {
432
  const now = performance.now();
433
  let ttfb = 0;
434
 
435
- if (metrics.isFirst) {
436
- ttfb = now - this.generationStartTime;
437
- this.lastChunkFinishTime = now;
438
- } else if (this.lastChunkFinishTime > 0) {
439
- const timeSinceLastChunk = (now - this.lastChunkFinishTime) / 1000;
440
- this.lastChunkFinishTime = now;
 
 
 
441
 
442
  if (timeSinceLastChunk > 0) {
443
  const chunkDurationSec = metrics.chunkDuration;
 
2
  import { PCMPlayerWorklet as PCMPlayer } from './PCMPlayerWorklet.js';
3
 
4
  // Configuration
5
+ const SAMPLE_RATE = 24000;
6
+ const FADE_SAMPLES = 480; // 20ms fade at 24kHz
7
 
8
  export class PocketTTSStreaming {
9
  constructor() {
 
22
  // Metrics State
23
  this.generationStartTime = 0;
24
  this.lastChunkFinishTime = 0;
25
+ this.rtfMovingAverage = 0;
26
+ this.skipNextRtf = false;
27
 
28
  // Edge optimization state (dynamic LSD)
29
  this.edgeOptimizationApplied = false;
 
393
  }
394
  }
395
 
396
+ stopGeneration() {
397
+ if (!this.isGenerating) return;
398
+ this.worker.postMessage({ type: 'stop' });
399
+ // Handle stop immediately in UI
400
+ this.handleStreamEnd();
401
+ }
402
+
403
+ applyFadeIn(audioData) {
404
+ const fadeLen = Math.min(FADE_SAMPLES, audioData.length);
405
+ for (let i = 0; i < fadeLen; i++) {
406
+ audioData[i] *= i / fadeLen;
407
+ }
408
+ }
409
+
410
+ applyFadeOut(audioData) {
411
+ const fadeLen = Math.min(FADE_SAMPLES, audioData.length);
412
+ const startIdx = audioData.length - fadeLen;
413
+ for (let i = 0; i < fadeLen; i++) {
414
+ audioData[startIdx + i] *= 1 - (i / fadeLen);
415
+ }
416
+ }
417
+
418
+ handleAudioChunk(audioData, metrics) {
419
+ if (!this.isGenerating) return;
420
+
421
+ if (metrics.isSilence) {
422
+ this.player.playAudio(audioData);
423
+ this.skipNextRtf = true;
424
+ return;
425
+ }
426
+
427
+ // Apply fade-in at the start of each text chunk
428
+ if (metrics.isFirst || metrics.chunkStart) this.applyFadeIn(audioData);
429
+ if (metrics.isLast) this.applyFadeOut(audioData);
430
 
431
  // Play audio
432
  this.player.playAudio(audioData);
 
435
  const now = performance.now();
436
  let ttfb = 0;
437
 
438
+ if (metrics.isFirst) {
439
+ ttfb = now - this.generationStartTime;
440
+ this.lastChunkFinishTime = now;
441
+ } else if (this.skipNextRtf) {
442
+ this.lastChunkFinishTime = now;
443
+ this.skipNextRtf = false;
444
+ } else if (this.lastChunkFinishTime > 0) {
445
+ const timeSinceLastChunk = (now - this.lastChunkFinishTime) / 1000;
446
+ this.lastChunkFinishTime = now;
447
 
448
  if (timeSinceLastChunk > 0) {
449
  const chunkDurationSec = metrics.chunkDuration;