Add: increased max generation length

Files changed:
- index.html (+8, -9)
- inference-worker.js (+102, -62)
- onnx-streaming.js (+47, -41)
index.html (changed)

@@ -76,15 +76,14 @@
     </div>

     <div class="textarea-wrap">
-      <textarea
-        id="text-input"
-        placeholder="Type or paste text to synthesize..."
-        aria-label="Text to synthesize"
-      …
-    </div>
+      <textarea
+        id="text-input"
+        placeholder="Type or paste text to synthesize..."
+        aria-label="Text to synthesize"
+      ></textarea>
+      <div class="textarea-meta">
+        <span class="char-count"><span id="char-count">0</span> chars</span>
+      </div>
     </div>

     <!-- Sample Texts -->
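The replacement markup closes the textarea explicitly and adds a live character counter. The counter's wiring is not part of this diff; a minimal sketch of what it presumably looks like elsewhere in the app (element IDs taken from the markup above, everything else assumed):

// Hypothetical wiring for the new char counter; not part of this commit's visible diff.
const input = document.getElementById('text-input');
const charCount = document.getElementById('char-count');
input.addEventListener('input', () => {
  charCount.textContent = String(input.value.length);
});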
inference-worker.js (changed)

@@ -19,6 +19,13 @@ const MODELS = {
 const SAMPLE_RATE = 24000;
 const SAMPLES_PER_FRAME = 1920;
 const MAX_FRAMES = 500;
+// Text chunking target; lower if long passages hit generation limits.
+const CHUNK_TARGET_TOKENS = 50;
+const CHUNK_GAP_SEC = 0.25;
+// If true, re-run voice conditioning per chunk to avoid stale AR state.
+const RESET_FLOW_STATE_EACH_CHUNK = true;
+// If true, reset decoder state per chunk to avoid carry-over artifacts.
+const RESET_MIMI_STATE_EACH_CHUNK = true;

 // State
 let mimiEncoderSession = null;
@@ -229,63 +236,62 @@ function dedupPunctuation(text) {
   return text.replace(/\.\.\.+/g, '[ELLIPSIS]').replace(/,+/g, ',').replace(/[.,]*\.[.,]*/g, '.').replace(/[.,!]*![.,!]*/g, '!').replace(/[.,!?]*\?[.,!?]*/g, '?').replace(/\[ELLIPSIS\]/g, '...');
 }

-
-function splitIntoBestSentences(text) {
-  const preparedText = prepareText(text);
-  if (!preparedText) return [];
-
-  …
-      previousWasEos = true;
-    } else {
-      if (previousWasEos) {
-        endOfSentenceIndices.push(i);
-      }
-      previousWasEos = false;
-    }
-  }
-  endOfSentenceIndices.push(tokenIds.length);
-
-  // Reconstruct sentences with token counts
-  const sentences = [];
-  for (let i = 0; i < endOfSentenceIndices.length - 1; i++) {
-    const start = endOfSentenceIndices[i];
-    const end = endOfSentenceIndices[i + 1];
-    const sentenceTokens = tokenIds.slice(start, end);
-    const sentenceText = tokenizerProcessor.decodeIds(sentenceTokens).trim();
-    sentences.push({ tokens: end - start, text: sentenceText });
-  }
-
-  // Merge into chunks
-  const MAX_TOKENS = 50;
+const SENTENCE_SPLIT_RE = /[^.!?]+[.!?]+|[^.!?]+$/g;
+
+function splitTextIntoSentences(text) {
+  const matches = text.match(SENTENCE_SPLIT_RE);
+  if (!matches) return [];
+  return matches.map(sentence => sentence.trim()).filter(Boolean);
+}
+
+function splitTokenIdsIntoChunks(tokenIds, maxTokens) {
+  const chunks = [];
+  for (let i = 0; i < tokenIds.length; i += maxTokens) {
+    const chunkText = tokenizerProcessor.decodeIds(tokenIds.slice(i, i + maxTokens)).trim();
+    if (chunkText) chunks.push(chunkText);
+  }
+  return chunks;
+}
+
+// Split text into sentence chunks (target <= CHUNK_TARGET_TOKENS tokens)
+function splitIntoBestSentences(text) {
+  const preparedText = prepareText(text);
+  if (!preparedText) return [];
+
+  const sentences = splitTextIntoSentences(preparedText);
+  if (sentences.length === 0) return [];
+
+  // Merge sentences into chunks that stay within the token target
   const chunks = [];
   let currentChunk = '';
-  let currentTokens = 0;
-
-  for (const { tokens, text } of sentences) {
+  for (const sentenceText of sentences) {
+    const sentenceTokenIds = tokenizerProcessor.encodeIds(sentenceText);
+    const sentenceTokens = sentenceTokenIds.length;
+
+    if (sentenceTokens > CHUNK_TARGET_TOKENS) {
+      if (currentChunk !== '') {
+        chunks.push(currentChunk.trim());
+        currentChunk = '';
+      }
+      const splitChunks = splitTokenIdsIntoChunks(sentenceTokenIds, CHUNK_TARGET_TOKENS);
+      for (const splitChunk of splitChunks) {
+        if (splitChunk) chunks.push(splitChunk.trim());
+      }
+      continue;
+    }
+
     if (currentChunk === '') {
-      currentChunk = text;
-      currentTokens = tokens;
+      currentChunk = sentenceText;
       continue;
     }

-    if (currentTokens + tokens > MAX_TOKENS) {
+    const combined = `${currentChunk} ${sentenceText}`;
+    const combinedTokens = tokenizerProcessor.encodeIds(combined).length;
+    if (combinedTokens > CHUNK_TARGET_TOKENS) {
       chunks.push(currentChunk.trim());
-      currentChunk = text;
-      currentTokens = tokens;
+      currentChunk = sentenceText;
     } else {
-      currentChunk += ' ' + text;
-      currentTokens += tokens;
+      currentChunk = combined;
     }
   }
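This rewrite replaces the old token-level EOS scan with a regex sentence splitter plus greedy packing under CHUNK_TARGET_TOKENS, hard-splitting oversized sentences at the token level. Together with the per-chunk generation loop below, this is what effectively raises the maximum generation length: each chunk stays within the per-run MAX_FRAMES cap. A standalone sketch of the strategy; the whitespace tokenizer here is a stub standing in for tokenizerProcessor, so real token counts will differ:

const SENTENCE_SPLIT_RE = /[^.!?]+[.!?]+|[^.!?]+$/g;
const CHUNK_TARGET_TOKENS = 50;

// Whitespace stub standing in for tokenizerProcessor.
const stubTokenizer = {
  encodeIds: (text) => text.split(/\s+/).filter(Boolean),
  decodeIds: (ids) => ids.join(' ')
};

function chunkText(text, tokenizer = stubTokenizer) {
  const sentences = (text.match(SENTENCE_SPLIT_RE) || []).map(s => s.trim()).filter(Boolean);
  const chunks = [];
  let current = '';
  for (const sentence of sentences) {
    const ids = tokenizer.encodeIds(sentence);
    if (ids.length > CHUNK_TARGET_TOKENS) {
      // Hard-split a sentence that exceeds the target on its own.
      if (current) { chunks.push(current); current = ''; }
      for (let i = 0; i < ids.length; i += CHUNK_TARGET_TOKENS) {
        chunks.push(tokenizer.decodeIds(ids.slice(i, i + CHUNK_TARGET_TOKENS)));
      }
      continue;
    }
    const combined = current ? `${current} ${sentence}` : sentence;
    if (tokenizer.encodeIds(combined).length > CHUNK_TARGET_TOKENS) {
      chunks.push(current);
      current = sentence;
    } else {
      current = combined;
    }
  }
  if (current) chunks.push(current); // flush the tail
  return chunks;
}

console.log(chunkText('First sentence. Second one! A question? ' + 'word '.repeat(120)));
// -> ['First sentence. Second one! A question?', <50 words>, <50 words>, <20 words>]

One detail worth checking in the full file: the visible hunk never flushes the trailing currentChunk after the loop, so that flush presumably happens just past the hunk's context window (the sketch above does it inline).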
@@ -750,7 +756,7 @@ async function startGeneration(text, voiceName) {
   postMessage({ type: 'generation_started', data: { time: performance.now() } });

   try {
-    // Split text into sentence chunks (…
+    // Split text into sentence chunks (target <= CHUNK_TARGET_TOKENS tokens)
     const chunks = splitIntoBestSentences(text);
     console.log(`Split into ${chunks.length} chunks:`, chunks);

@@ -788,8 +794,7 @@ async function startGeneration(text, voiceName) {
 }

 async function runGenerationPipeline(voiceEmb, chunks) {
-  // Initialize state - …
-  let flowLmState = initState(flowLmMainSession, FLOW_LM_STATE_SHAPES);
+  // Initialize state - may be reset per chunk
   let mimiState = initState(mimiDecoderSession, MIMI_DECODER_STATE_SHAPES);
   const emptySeq = new ort.Tensor('float32', new Float32Array(0), [1, 0, 32]);
   const emptyTextEmb = new ort.Tensor('float32', new Float32Array(0), [1, 0, 1024]);
@@ -798,25 +803,30 @@ async function runGenerationPipeline(voiceEmb, chunks) {
   const voiceTensor = new ort.Tensor('float32', voiceEmb.data, voiceEmb.shape);
   console.log('Voice embeddings shape:', voiceEmb.shape);

-  …
+  async function buildVoiceConditionedState() {
+    let flowLmState = initState(flowLmMainSession, FLOW_LM_STATE_SHAPES);
+    console.log('Running voice conditioning...');
+    const voiceCondInputs = {
+      sequence: emptySeq,
+      text_embeddings: voiceTensor,
+      ...flowLmState
+    };
+
+    let condResult = await flowLmMainSession.run(voiceCondInputs);
+
+    // Update state from voice conditioning
+    for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
+      const outputName = flowLmMainSession.outputNames[i];
+      if (outputName.startsWith('out_state_')) {
+        const stateIdx = parseInt(outputName.replace('out_state_', ''));
+        flowLmState[`state_${stateIdx}`] = condResult[outputName];
+      }
     }
+    return flowLmState;
   }

+  let flowLmState = await buildVoiceConditionedState();
+
   // Streaming parameters
   const FIRST_CHUNK_FRAMES = 3;
   const NORMAL_CHUNK_FRAMES = 12;
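The extracted buildVoiceConditionedState helper threads recurrent state through flowLmMainSession by copying each out_state_N output back onto the matching state_N input before the next run. Since the same pattern appears again below for text conditioning, a generic helper could express it once; a minimal sketch, assuming the out_state_N / state_N naming convention used above:

// Generic state-threading helper for a stateful ORT-Web session, assuming the
// export pairs each 'state_N' input with an 'out_state_N' output (as above).
function carryState(session, result, state) {
  for (const outputName of session.outputNames) {
    if (outputName.startsWith('out_state_')) {
      const idx = outputName.slice('out_state_'.length);
      state[`state_${idx}`] = result[outputName];
    }
  }
  return state;
}

// Usage: const result = await session.run({ sequence, text_embeddings, ...state });
//        state = carryState(session, result, state);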
@@ -833,9 +843,18 @@ async function runGenerationPipeline(voiceEmb, chunks) {
   for (let chunkIdx = 0; chunkIdx < chunks.length; chunkIdx++) {
     if (!isGenerating) break;

+    if (RESET_FLOW_STATE_EACH_CHUNK && chunkIdx > 0) {
+      flowLmState = await buildVoiceConditionedState();
+    }
+    if (RESET_MIMI_STATE_EACH_CHUNK && chunkIdx > 0) {
+      mimiState = initState(mimiDecoderSession, MIMI_DECODER_STATE_SHAPES);
+    }
+
     const chunkText = chunks[chunkIdx];
     console.log(`Processing chunk ${chunkIdx + 1}/${chunks.length}: "${chunkText}"`);

+    let isFirstAudioChunkOfTextChunk = true;
+
     // Tokenize this chunk
     const tokenIds = tokenizerProcessor.encodeIds(chunkText);
     console.log(`Chunk ${chunkIdx + 1} tokens:`, tokenIds.length);
@@ -855,7 +874,7 @@ async function runGenerationPipeline(voiceEmb, chunks) {
       ...flowLmState
     };

-    condResult = await flowLmMainSession.run(textCondInputs);
+    let condResult = await flowLmMainSession.run(textCondInputs);

     // Update state from text conditioning
     for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
@@ -873,6 +892,7 @@ async function runGenerationPipeline(voiceEmb, chunks) {
     const FRAMES_AFTER_EOS = 3; // Match PyTorch behavior - generate extra frames after EOS
     let eosStep = null;

+    let chunkEnded = false;
    for (let step = 0; step < MAX_FRAMES; step++) {
       if (!isGenerating) break;

@@ -994,18 +1014,38 @@ async function runGenerationPipeline(voiceEmb, chunks) {
             decTime: 0,
             chunkDuration: audioFloat32.length / SAMPLE_RATE,
             isFirst: isFirstAudioChunk,
-            isLast: isLastChunk
+            isLast: isLastChunk,
+            chunkStart: isFirstAudioChunkOfTextChunk
           }
         }, [audioFloat32.buffer]);

         isFirstAudioChunk = false;
+        isFirstAudioChunkOfTextChunk = false;
       }

       if (shouldStop) {
         console.log(`Chunk ${chunkIdx + 1} EOS at step ${eosStep}, stopped at step ${step}, ${chunkLatents.length} frames`);
+        chunkEnded = true;
         break;
       }
     }
+
+    if (chunkEnded && isGenerating && chunkIdx < chunks.length - 1) {
+      const gapSamples = Math.max(1, Math.floor(CHUNK_GAP_SEC * SAMPLE_RATE));
+      const silence = new Float32Array(gapSamples);
+      postMessage({
+        type: 'audio_chunk',
+        data: silence,
+        metrics: {
+          bbTime: 0,
+          decTime: 0,
+          chunkDuration: gapSamples / SAMPLE_RATE,
+          isFirst: false,
+          isLast: false,
+          isSilence: true
+        }
+      }, [silence.buffer]);
+    }
   }

   const totalTime = (performance.now() - arStartTime) / 1000;
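Note that the inter-chunk silence travels through the same audio_chunk message path as real audio, tagged isSilence: true so the consumer can play it without letting the artificial pause skew timing metrics (see the skipNextRtf handling in onnx-streaming.js below). At 24 kHz, CHUNK_GAP_SEC = 0.25 comes to 6000 samples of silence.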
onnx-streaming.js (changed)

@@ -2,8 +2,8 @@
 import { PCMPlayerWorklet as PCMPlayer } from './PCMPlayerWorklet.js';

 // Configuration
-const SAMPLE_RATE = 24000;
-const FADE_SAMPLES = 480; // 20ms fade at 24kHz
+const SAMPLE_RATE = 24000;
+const FADE_SAMPLES = 480; // 20ms fade at 24kHz

 export class PocketTTSStreaming {
   constructor() {
@@ -22,7 +22,8 @@ export class PocketTTSStreaming {
     // Metrics State
     this.generationStartTime = 0;
     this.lastChunkFinishTime = 0;
-    this.rtfMovingAverage = 0;
+    this.rtfMovingAverage = 0;
+    this.skipNextRtf = false;

     // Edge optimization state (dynamic LSD)
     this.edgeOptimizationApplied = false;
@@ -392,38 +393,40 @@ export class PocketTTSStreaming {
     }
   }

   stopGeneration() {
     if (!this.isGenerating) return;
     this.worker.postMessage({ type: 'stop' });
     // Handle stop immediately in UI
     this.handleStreamEnd();
   }

   applyFadeIn(audioData) {
     const fadeLen = Math.min(FADE_SAMPLES, audioData.length);
     for (let i = 0; i < fadeLen; i++) {
       audioData[i] *= i / fadeLen;
     }
   }

   applyFadeOut(audioData) {
     const fadeLen = Math.min(FADE_SAMPLES, audioData.length);
     const startIdx = audioData.length - fadeLen;
     for (let i = 0; i < fadeLen; i++) {
       audioData[startIdx + i] *= 1 - (i / fadeLen);
     }
   }

   handleAudioChunk(audioData, metrics) {
     if (!this.isGenerating) return;

-    …
+    if (metrics.isSilence) {
+      this.player.playAudio(audioData);
+      this.skipNextRtf = true;
+      return;
+    }
+
+    // Apply fade-in at the start of each text chunk
+    if (metrics.isFirst || metrics.chunkStart) this.applyFadeIn(audioData);
+    if (metrics.isLast) this.applyFadeOut(audioData);

     // Play audio
     this.player.playAudio(audioData);
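applyFadeIn and applyFadeOut are plain linear ramps over at most FADE_SAMPLES (480 samples = 20 ms at 24 kHz), clamped to the buffer length. A toy check of the ramp behavior on a short buffer:

// Quick check of the linear fade ramps (fadeLen clamps to the buffer length,
// exactly as in applyFadeIn/applyFadeOut above).
const FADE_SAMPLES = 480;

function applyFadeIn(audioData) {
  const fadeLen = Math.min(FADE_SAMPLES, audioData.length);
  for (let i = 0; i < fadeLen; i++) audioData[i] *= i / fadeLen;
}

function applyFadeOut(audioData) {
  const fadeLen = Math.min(FADE_SAMPLES, audioData.length);
  const startIdx = audioData.length - fadeLen;
  for (let i = 0; i < fadeLen; i++) audioData[startIdx + i] *= 1 - (i / fadeLen);
}

const buf = new Float32Array(8).fill(1);
applyFadeIn(buf);   // ramps 0 -> 7/8 across the buffer (fadeLen clamps to 8)
applyFadeOut(buf);  // then ramps what's left back down toward 0
console.log(Array.from(buf, v => v.toFixed(2)));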
@@ -432,12 +435,15 @@ export class PocketTTSStreaming {
     const now = performance.now();
     let ttfb = 0;

     if (metrics.isFirst) {
       ttfb = now - this.generationStartTime;
       this.lastChunkFinishTime = now;
+    } else if (this.skipNextRtf) {
+      this.lastChunkFinishTime = now;
+      this.skipNextRtf = false;
     } else if (this.lastChunkFinishTime > 0) {
       const timeSinceLastChunk = (now - this.lastChunkFinishTime) / 1000;
       this.lastChunkFinishTime = now;

       if (timeSinceLastChunk > 0) {
         const chunkDurationSec = metrics.chunkDuration;
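The metrics branch above derives a per-chunk real-time factor from the wall time between chunks over the chunk's audio duration (values below 1.0 mean generation outpaces playback), and the new skipNextRtf flag excludes the chunk that follows a silence gap so the artificial pause is not counted as generation time. The actual rtfMovingAverage update falls outside this diff; an exponential moving average is one plausible form:

// Hypothetical rtfMovingAverage update (the real rule is outside this diff):
// an exponential moving average of per-chunk RTF.
const RTF_EMA_ALPHA = 0.3; // assumed smoothing factor

function updateRtf(state, timeSinceLastChunkSec, chunkDurationSec) {
  if (chunkDurationSec <= 0) return state.rtfMovingAverage;
  const rtf = timeSinceLastChunkSec / chunkDurationSec; // < 1.0 = faster than real time
  state.rtfMovingAverage = state.rtfMovingAverage === 0
    ? rtf
    : RTF_EMA_ALPHA * rtf + (1 - RTF_EMA_ALPHA) * state.rtfMovingAverage;
  return state.rtfMovingAverage;
}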