| <!DOCTYPE html>
| <html lang="en">
| <!-- Encoding is declared first so the parser never has to guess it. -->
| <meta charset="UTF-8">
| <!-- Without a viewport declaration phones lay the page out on a wide virtual
|      viewport, so the max-width: 720px rules in the stylesheet would not apply. -->
| <meta name="viewport" content="width=device-width, initial-scale=1">
| <title>VibeVoice-Realtime TTS Demo</title>
|
| <style>
|
| /* Design tokens shared by the rules below. */
| :root {
|   --bg: #f5f7fc;
|   --surface: #ffffff;
|   --text-primary: #1f2742;
|   --text-muted: #5d6789;
|   --accent: #5562ff;
|   --accent-strong: #3f4dff;
|   --border: rgba(85, 98, 255, 0.18);
|   --shadow: 0 18px 45px rgba(31, 39, 66, 0.08);
| }
|
| /* Small explanatory note shown under the text panel. */
| .helper-text {
|   color: #8a93b5;
|   font-size: 12px;
| }
|
| /* Padding and borders count towards declared widths everywhere. */
| * {
|   box-sizing: border-box;
| }
|
| /* The page centres a single card horizontally. */
| body {
|   display: flex;
|   justify-content: center;
|   margin: 0;
|   padding: 48px 20px;
|   color: var(--text-primary);
|   font-family: 'Inter', 'Segoe UI', Roboto, Helvetica, sans-serif;
|   background: var(--bg);
| }
|
|
|
| /* The card that holds the whole demo; its children stack vertically. */
| .app-shell {
|   display: flex;
|   flex-direction: column;
|   gap: 28px;
|   width: min(960px, 100%);
|   padding: 36px 40px 44px;
|   border-radius: 20px;
|   background: var(--surface);
|   box-shadow: var(--shadow);
| }
|
| h1 {
|   margin: 0;
|   font-size: 30px;
|   font-weight: 700;
|   letter-spacing: 0.01em;
|   text-align: center;
| }
|
| /* Vertical stacks; only the spacing between children differs. */
| .panel {
|   display: flex;
|   flex-direction: column;
|   gap: 10px;
| }
|
| .field {
|   display: flex;
|   flex-direction: column;
|   gap: 8px;
| }
|
| /* Caption above a control. */
| .field-label {
|   color: var(--text-primary);
|   font-size: 15px;
|   font-weight: 600;
| }
|
|
|
| /* Multi-line prompt editor. */
| .text-input {
|   width: 100%;
|   min-height: 140px;
|   max-height: 240px;
|   padding: 14px 16px;
|   border: 1px solid rgba(31, 39, 66, 0.14);
|   border-radius: 12px;
|   background: #f9faff;
|   font-family: inherit;
|   font-size: 15px;
|   line-height: 1.6;
|   resize: vertical;
|   transition: border-color 0.2s, box-shadow 0.2s;
| }
|
| .text-input:focus {
|   /* A transparent outline instead of `none`: forced-colors (high contrast)
|      modes drop box-shadow but repaint outlines in a system colour, so the
|      focus indicator survives there. Normal rendering is unchanged. */
|   outline: 2px solid transparent;
|   outline-offset: 2px;
|   border-color: var(--accent);
|   box-shadow: 0 0 0 3px rgba(85, 98, 255, 0.18);
|   background: #fff;
| }
|
|
|
| /* Tinted box around the streaming text preview. */
| #streamingPreviewContainer {
|   padding: 18px 20px;
|   border: 1px solid var(--border);
|   border-radius: 14px;
|   background: linear-gradient(135deg, #eef2ff 0%, #f7f9ff 100%);
|   box-shadow: inset 0 1px 2px rgba(85, 98, 255, 0.12);
| }
|
| #streamingPreviewHeader {
|   display: flex;
|   align-items: center;
|   gap: 10px;
|   margin-bottom: 8px;
|   color: var(--text-primary);
|   font-size: 14px;
|   font-weight: 600;
| }
|
| #streamingPreviewNote {
|   color: var(--text-muted);
|   font-size: 12px;
|   font-weight: 400;
| }
|
| /* The preview text itself; pre-wrap keeps the prompt's own line breaks. */
| #streamingPreview {
|   min-height: 70px;
|   padding: 10px 12px;
|   border: 1px solid rgba(85, 98, 255, 0.25);
|   border-radius: 10px;
|   background: rgba(255, 255, 255, 0.9);
|   color: var(--text-primary);
|   font-family: 'Courier New', Courier, monospace;
|   font-size: 14px;
|   line-height: 1.5;
|   white-space: pre-wrap;
| }
|
| /* Blinking caret appended while the script keeps the streaming-active class set. */
| #streamingPreview.streaming-active::after {
|   content: "";
|   display: inline-block;
|   width: 2px;
|   height: 1.1em;
|   margin-left: 2px;
|   vertical-align: bottom;
|   background: var(--accent);
|   animation: previewCaret 0.9s steps(1) infinite;
| }
|
| @keyframes previewCaret {
|   0%, 50% {
|     opacity: 1;
|   }
|   51%, 100% {
|     opacity: 0;
|   }
| }
|
| /* The caret blinks indefinitely; show it steady for users who ask for less motion. */
| @media (prefers-reduced-motion: reduce) {
|   #streamingPreview.streaming-active::after {
|     animation: none;
|   }
| }
|
|
|
| /* Controls area: speaker picker, sliders, action buttons. */
| .control-panel {
|   display: flex;
|   flex-direction: column;
|   gap: 18px;
| }
|
| .inline-field {
|   display: flex;
|   flex-direction: column;
|   gap: 6px;
| }
|
| .select-control {
|   width: 220px;
|   padding: 8px 12px;
|   border: 1px solid rgba(31, 39, 66, 0.14);
|   border-radius: 10px;
|   background: #fbfcff;
|   color: var(--text-primary);
|   font-family: inherit;
|   font-size: 14px;
|   transition: border-color 0.2s, box-shadow 0.2s;
| }
|
| .select-control:focus {
|   /* A transparent outline instead of `none`: forced-colors (high contrast)
|      modes drop box-shadow but repaint outlines in a system colour, so the
|      focus indicator survives there. Normal rendering is unchanged. */
|   outline: 2px solid transparent;
|   outline-offset: 2px;
|   border-color: var(--accent);
|   box-shadow: 0 0 0 3px rgba(85, 98, 255, 0.18);
|   background: #fff;
| }
|
| /* A horizontal run of controls that wraps when space runs out. */
| .control-row {
|   display: flex;
|   flex-wrap: wrap;
|   align-items: center;
|   gap: 20px 28px;
| }
|
| .range-control {
|   display: flex;
|   align-items: center;
|   gap: 12px;
|   color: var(--text-primary);
|   font-size: 14px;
| }
|
| .range-control input[type="range"] {
|   width: 200px;
|   accent-color: var(--accent);
| }
|
| /* Numeric readout beside a slider; the fixed minimum width stops it jittering. */
| .range-value {
|   min-width: 42px;
|   color: var(--text-primary);
|   font-weight: 600;
|   text-align: right;
| }
|
|
|
| /* Primary Start/Stop button. */
| #playback {
|   padding: 10px 24px;
|   border: none;
|   border-radius: 999px;
|   background: var(--accent);
|   color: #fff;
|   font-size: 14px;
|   font-weight: 600;
|   cursor: pointer;
|   box-shadow: 0 8px 16px rgba(85, 98, 255, 0.25);
|   transition: transform 0.15s, box-shadow 0.15s, background 0.15s;
| }
|
| #playback:hover {
|   transform: translateY(-1px);
|   box-shadow: 0 10px 20px rgba(85, 98, 255, 0.28);
| }
|
| #playback:active {
|   transform: translateY(0);
| }
|
| /* The script toggles the `playing` class while a stream is running. */
| #playback.playing {
|   background: var(--accent-strong);
| }
|
| /* Lower-emphasis buttons (Reset Controls, Save). */
| .secondary-btn {
|   padding: 8px 18px;
|   border: 1px solid rgba(31, 39, 66, 0.18);
|   border-radius: 999px;
|   background: #f1f3ff;
|   color: var(--text-primary);
|   font-size: 13px;
|   font-weight: 500;
|   cursor: pointer;
|   transition: background 0.15s, border-color 0.15s;
| }
|
| /* Hover feedback is limited to enabled buttons; previously a disabled Save
|    button still changed colour under the pointer, suggesting it was clickable. */
| .secondary-btn:hover:not(:disabled) {
|   background: #e6e9ff;
|   border-color: rgba(31, 39, 66, 0.26);
| }
|
| .secondary-btn:disabled {
|   opacity: 0.55;
|   cursor: not-allowed;
| }
|
|
|
| /* Row of "label value unit" readouts. */
| .metrics {
|   display: flex;
|   flex-wrap: wrap;
|   gap: 16px 32px;
|   color: var(--text-muted);
|   font-size: 14px;
| }
|
| .metrics span {
|   display: flex;
|   align-items: baseline;
|   gap: 6px;
| }
|
| .metrics span strong {
|   color: var(--text-primary);
|   font-weight: 600;
| }
|
| .metric-unit {
|   color: var(--text-muted);
|   font-size: 13px;
| }
|
| /* Scrollable runtime log. */
| #logOutput {
|   max-height: 260px;
|   margin-top: 0;
|   padding: 16px 18px;
|   overflow-y: auto;
|   border: 1px solid rgba(31, 39, 66, 0.12);
|   border-radius: 12px;
|   background: #f7f9ff;
|   color: var(--text-primary);
|   font-family: 'Fira Code', 'Courier New', Courier, monospace;
|   font-size: 13px;
|   line-height: 1.6;
|   box-shadow: inset 0 1px 2px rgba(15, 23, 42, 0.06);
| }
|
|
|
| /* Narrow screens: tighter card padding, full-width controls, stacked rows. */
| @media (max-width: 720px) {
|   .app-shell {
|     gap: 24px;
|     padding: 28px 20px 36px;
|   }
|
|   .select-control {
|     width: 100%;
|   }
|
|   .control-row {
|     flex-direction: column;
|     align-items: flex-start;
|     gap: 16px;
|   }
|
|   #playback {
|     width: 100%;
|     text-align: center;
|   }
| }
|
| </style>
|
| <body>
|
| <div class="app-shell">
|
| <h1>VibeVoice-Realtime TTS Demo</h1>
|
|
|
| <!-- Prompt editor, followed by the box the script fills word by word while streaming. -->
| <section class="panel">
|   <label class="field">
|     <span class="field-label">Text</span>
|     <textarea id="prompt" class="text-input" rows="4">Enter your text here and click "Start" to instantly hear the VibeVoice-Realtime TTS output audio.</textarea>
|   </label>
|   <div id="streamingPreviewContainer">
|     <div id="streamingPreviewHeader">
|       <span>Streaming Input Text</span>
|     </div>
|     <div id="streamingPreview" aria-live="polite">This area will display the streaming input text in real time.</div>
|   </div>
| </section>
| <!-- Usage note shown between the text panel and the controls. -->
| <span class="helper-text">This demo requires the full text to be provided upfront. The model then receives the text via streaming input during synthesis.<br>
| For non-punctuation special characters, applying text normalization before processing often yields better results.</span>
|
|
|
| <!-- Generation controls: speaker preset, CFG scale, inference steps, and the action buttons. -->
| <section class="panel control-panel">
|   <div class="inline-field">
|     <!-- A real <label for> gives the select an accessible name and makes the
|          caption clickable; the former bare <span> did neither. -->
|     <label class="field-label" for="voiceSelect">Speaker</label>
|     <select id="voiceSelect" class="select-control">
|       <option value="">Loading...</option>
|     </select>
|   </div>
|   <div class="control-row">
|     <label class="range-control">
|       <span>CFG</span>
|       <input id="cfgScale" type="range" min="1" max="3" step="0.05" value="1.5">
|       <span class="range-value" id="cfgValue">1.5</span>
|     </label>
|     <label class="range-control">
|       <span>Inference Steps</span>
|       <input id="inferenceSteps" type="range" min="1" max="20" step="1" value="5">
|       <span class="range-value" id="stepsValue">5</span>
|     </label>
|     <button id="resetControls" type="button" class="secondary-btn">Reset Controls</button>
|   </div>
|   <div class="control-row">
|     <!-- Explicit type: a bare <button> defaults to submit, which would fire if
|          this markup is ever wrapped in a form. -->
|     <button id="playback" type="button">Start</button>
|     <button id="saveAudio" type="button" class="secondary-btn" disabled>Save</button>
|   </div>
| </section>
|
|
|
| <!-- Counters the script updates through #modelGenerated and #playbackElapsed. -->
| <section class="panel">
|   <div class="metrics">
|     <span>Model Generated Audio<strong id="modelGenerated">0.00</strong><span class="metric-unit">s</span></span>
|     <span>Audio Played<strong id="playbackElapsed">0.00</strong><span class="metric-unit">s</span></span>
|   </div>
| </section>
| <!-- Log lines are written here by the script's appendLog(). -->
| <section class="panel">
|   <span class="field-label">Runtime Logs</span>
|   <pre id="logOutput"></pre>
| </section>
|
| </div>
|
|
|
|
|
| <script>
|
| (() => {
|
| const SAMPLE_RATE = 24_000;
|
| const BUFFER_SIZE = 2048;
|
| const PREBUFFER_SEC = 0.1;
|
|
|
| let audioCtx = null;
|
| let scriptNode = null;
|
| let socket = null;
|
| let buffer = new Float32Array(0);
|
| let isPlaying = false;
|
| let hasStartedPlayback = false;
|
| let silentFrameCount = 0;
|
|
|
| const promptInput = document.getElementById('prompt');
|
| const streamingPreview = document.getElementById('streamingPreview');
|
| const controlBtn = document.getElementById('playback');
|
| const cfgSelect = document.getElementById('cfgScale');
|
| const stepsSelect = document.getElementById('inferenceSteps');
|
| const voiceSelect = document.getElementById('voiceSelect');
|
| const cfgValueLabel = document.getElementById('cfgValue');
|
| const stepsValueLabel = document.getElementById('stepsValue');
|
| const modelGeneratedLabel = document.getElementById('modelGenerated');
|
| const playbackElapsedLabel = document.getElementById('playbackElapsed');
|
| const logOutput = document.getElementById('logOutput');
|
| const resetBtn = document.getElementById('resetControls');
|
| const saveBtn = document.getElementById('saveAudio');
|
|
|
| let playbackTimer = null;
|
| let lastPlaybackElapsed = 0;
|
| let playbackSamples = 0;
|
| let modelGeneratedTotal = 0;
|
| let firstBrowserChunkLogged = false;
|
| let playbackStartedLogged = false;
|
| const logEntries = [];
|
| let logSequence = 0;
|
| let recordedChunks = [];
|
| let recordedSamples = 0;
|
| let recordingComplete = false;
|
| let downloadUrl = null;
|
|
|
| const revokeDownloadUrl = () => {
|
| if (downloadUrl) {
|
| URL.revokeObjectURL(downloadUrl);
|
| downloadUrl = null;
|
| }
|
| };
|
|
|
| const updateSaveButtonState = () => {
|
| if (!saveBtn) {
|
| return;
|
| }
|
| saveBtn.disabled = recordedSamples === 0 || !recordingComplete;
|
| };
|
|
|
| const clearRecordedChunks = () => {
|
| recordedChunks = [];
|
| recordedSamples = 0;
|
| recordingComplete = false;
|
| revokeDownloadUrl();
|
| updateSaveButtonState();
|
| };
|
|
|
| const createWavBlob = () => {
|
| if (!recordedSamples) {
|
| return null;
|
| }
|
| const wavBuffer = new ArrayBuffer(44 + recordedSamples * 2);
|
| const view = new DataView(wavBuffer);
|
| const writeString = (offset, str) => {
|
| for (let i = 0; i < str.length; i += 1) {
|
| view.setUint8(offset + i, str.charCodeAt(i));
|
| }
|
| };
|
|
|
| writeString(0, 'RIFF');
|
| view.setUint32(4, 36 + recordedSamples * 2, true);
|
| writeString(8, 'WAVE');
|
| writeString(12, 'fmt ');
|
| view.setUint32(16, 16, true);
|
| view.setUint16(20, 1, true);
|
| view.setUint16(22, 1, true);
|
| view.setUint32(24, SAMPLE_RATE, true);
|
| view.setUint32(28, SAMPLE_RATE * 2, true);
|
| view.setUint16(32, 2, true);
|
| view.setUint16(34, 16, true);
|
| writeString(36, 'data');
|
| view.setUint32(40, recordedSamples * 2, true);
|
|
|
| const pcmData = new Int16Array(wavBuffer, 44, recordedSamples);
|
| let offset = 0;
|
| recordedChunks.forEach(chunk => {
|
| const chunkData = new Int16Array(chunk);
|
| pcmData.set(chunkData, offset);
|
| offset += chunkData.length;
|
| });
|
| return new Blob([wavBuffer], { type: 'audio/wav' });
|
| };
|
|
|
| const updateCfgDisplay = () => {
|
| cfgValueLabel.textContent = Number(cfgSelect.value).toFixed(3);
|
| };
|
|
|
| const updateStepsDisplay = () => {
|
| stepsValueLabel.textContent = Number(stepsSelect.value).toString();
|
| };
|
|
|
| cfgSelect.addEventListener('input', updateCfgDisplay);
|
| stepsSelect.addEventListener('input', updateStepsDisplay);
|
| updateCfgDisplay();
|
| updateStepsDisplay();
|
|
|
| const pad2 = value => value.toString().padStart(2, '0');
|
| const pad3 = value => value.toString().padStart(3, '0');
|
|
|
| const formatLocalTimestamp = () => {
|
| const d = new Date();
|
| const year = d.getFullYear();
|
| const month = pad2(d.getMonth() + 1);
|
| const day = pad2(d.getDate());
|
| const hours = pad2(d.getHours());
|
| const minutes = pad2(d.getMinutes());
|
| const seconds = pad2(d.getSeconds());
|
| const millis = pad3(d.getMilliseconds());
|
| return `${year}-${month}-${day} ${hours}:${minutes}:${seconds}.${millis}`;
|
| };
|
|
|
| const formatSeconds = raw => {
|
| const value = Number(raw);
|
| return Number.isFinite(value) ? value.toFixed(2) : '0.00';
|
| };
|
|
|
| const parseTimestamp = value => {
|
| if (!value) {
|
| return new Date();
|
| }
|
| if (/\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}/.test(value)) {
|
| return new Date(value.replace(' ', 'T'));
|
| }
|
| return new Date(value);
|
| };
|
|
|
| const setModelGenerated = value => {
|
| const numeric = Number(value);
|
| if (!Number.isFinite(numeric)) {
|
| return;
|
| }
|
| modelGeneratedTotal = Math.max(0, numeric);
|
| modelGeneratedLabel.textContent = formatSeconds(modelGeneratedTotal);
|
| };
|
|
|
| const setPlaybackElapsed = value => {
|
| const capped = Math.min(modelGeneratedTotal, Math.max(0, value));
|
| lastPlaybackElapsed = capped;
|
| playbackElapsedLabel.textContent = formatSeconds(lastPlaybackElapsed);
|
| };
|
|
|
| const STREAMING_WPM = 180;
|
| const STREAMING_INTERVAL_MS = 60000 / STREAMING_WPM;
|
| let previewTimeoutId = null;
|
| let previewTokens = [];
|
| let previewIndex = 0;
|
| let previewActive = false;
|
|
|
| const clearPreviewTimer = () => {
|
| if (previewTimeoutId) {
|
| clearTimeout(previewTimeoutId);
|
| previewTimeoutId = null;
|
| }
|
| };
|
|
|
| const setPreviewIdle = message => {
|
| if (!streamingPreview) {
|
| return;
|
| }
|
| streamingPreview.classList.remove('streaming-active');
|
| streamingPreview.textContent = message;
|
| };
|
|
|
| const schedulePreviewTick = () => {
|
| if (!streamingPreview) {
|
| return;
|
| }
|
| if (previewIndex >= previewTokens.length) {
|
| streamingPreview.classList.remove('streaming-active');
|
| return;
|
| }
|
|
|
| streamingPreview.classList.add('streaming-active');
|
|
|
| streamingPreview.textContent += previewTokens[previewIndex];
|
| previewIndex += 1;
|
| previewTimeoutId = setTimeout(schedulePreviewTick, STREAMING_INTERVAL_MS);
|
| };
|
|
|
| const updateStreamingPreview = () => {
|
| if (!streamingPreview) {
|
| return;
|
| }
|
| clearPreviewTimer();
|
| previewIndex = 0;
|
| const source = (promptInput?.value || '').trimEnd();
|
| streamingPreview.textContent = '';
|
| previewTokens = source.match(/\S+\s*/g) || [];
|
| schedulePreviewTick();
|
| };
|
|
|
| const clearLogs = () => {
|
| if (logOutput) {
|
| logOutput.textContent = '';
|
| }
|
| logEntries.length = 0;
|
| modelGeneratedTotal = 0;
|
| setModelGenerated(0);
|
| };
|
|
|
| const appendLog = (message, timestamp) => {
|
| if (!logOutput) {
|
| return;
|
| }
|
| const finalTimestamp = timestamp || formatLocalTimestamp();
|
| const entry = {
|
| timestamp: finalTimestamp,
|
| date: parseTimestamp(finalTimestamp),
|
| message,
|
| seq: logSequence += 1,
|
| };
|
| logEntries.push(entry);
|
| logEntries.sort((a, b) => {
|
| const diff = a.date.getTime() - b.date.getTime();
|
| return diff !== 0 ? diff : a.seq - b.seq;
|
| });
|
| if (logEntries.length > 400) {
|
| logEntries.splice(0, logEntries.length - 400);
|
| }
|
| logOutput.textContent = logEntries
|
| .map(item => `[${item.timestamp}] ${item.message}`)
|
| .join('\n');
|
| logOutput.scrollTop = logOutput.scrollHeight;
|
| };
|
|
|
| const handleSaveClick = () => {
|
| if (!recordedSamples) {
|
| appendLog('[Frontend] Save requested but no audio received yet');
|
| return;
|
| }
|
| const wavBlob = createWavBlob();
|
| if (!wavBlob) {
|
| appendLog('[Error] Failed to assemble WAV data for download');
|
| return;
|
| }
|
| revokeDownloadUrl();
|
| downloadUrl = URL.createObjectURL(wavBlob);
|
| const link = document.createElement('a');
|
| const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
| link.href = downloadUrl;
|
| link.download = `vibevoice_realtime_audio_${timestamp}.wav`;
|
| document.body.appendChild(link);
|
| link.click();
|
| document.body.removeChild(link);
|
| appendLog('[Frontend] Audio download triggered');
|
| };
|
|
|
| const stopPlaybackTimer = () => {
|
| if (playbackTimer) {
|
| clearInterval(playbackTimer);
|
| playbackTimer = null;
|
| }
|
| };
|
|
|
| const startPlaybackTimer = () => {
|
| stopPlaybackTimer();
|
| playbackTimer = setInterval(() => {
|
| setPlaybackElapsed(playbackSamples / SAMPLE_RATE);
|
| }, 250);
|
| };
|
|
|
| const loadVoices = async () => {
|
| try {
|
| voiceSelect.disabled = true;
|
| const response = await fetch('/config');
|
| if (!response.ok) {
|
| throw new Error(`Failed to fetch config: ${response.status}`);
|
| }
|
| const data = await response.json();
|
| const voices = Array.isArray(data.voices) ? data.voices : [];
|
| voiceSelect.innerHTML = '';
|
| if (voices.length === 0) {
|
| const option = document.createElement('option');
|
| option.value = '';
|
| option.textContent = 'No voices available';
|
| voiceSelect.appendChild(option);
|
| voiceSelect.disabled = true;
|
| appendLog('[Error] No voice presets available');
|
| return;
|
| }
|
|
|
| voices.forEach(voice => {
|
| const option = document.createElement('option');
|
| option.value = voice;
|
| option.textContent = voice;
|
| voiceSelect.appendChild(option);
|
| });
|
|
|
| if (data.default_voice && voices.includes(data.default_voice)) {
|
| voiceSelect.value = data.default_voice;
|
| }
|
| voiceSelect.disabled = false;
|
| appendLog(`[Frontend] Loaded ${voices.length} voice presets`);
|
| } catch (err) {
|
| console.error('Failed to load voices', err);
|
| voiceSelect.innerHTML = '';
|
| const option = document.createElement('option');
|
| option.value = '';
|
| option.textContent = 'Load failed';
|
| voiceSelect.appendChild(option);
|
| voiceSelect.disabled = true;
|
| appendLog('[Error] Failed to load voice presets');
|
| }
|
| };
|
|
|
| loadVoices();
|
|
|
| resetBtn.addEventListener('click', () => {
|
| cfgSelect.value = '1.5';
|
| stepsSelect.value = '5';
|
| updateCfgDisplay();
|
| updateStepsDisplay();
|
| appendLog('[Frontend] Controls reset to defaults (CFG=1.5, Steps=5)');
|
| });
|
|
|
| if (promptInput) {
|
| promptInput.addEventListener('input', () => {
|
| if (previewActive) {
|
| updateStreamingPreview();
|
| }
|
| });
|
| }
|
|
|
| const handleLogMessage = raw => {
|
| let payload;
|
| try {
|
| payload = JSON.parse(raw);
|
| } catch (err) {
|
| appendLog(`[Error] Failed to parse log message: ${raw}`);
|
| return;
|
| }
|
| if (!payload || payload.type !== 'log') {
|
| appendLog(`[Log] ${raw}`);
|
| return;
|
| }
|
|
|
| const { event, data = {}, timestamp } = payload;
|
| switch (event) {
|
| case 'backend_request_received': {
|
| const cfg = typeof data.cfg_scale === 'number' ? data.cfg_scale.toFixed(3) : data.cfg_scale;
|
| const steps = data.inference_steps ?? 'default';
|
| const voice = data.voice || 'default';
|
| const textLength = data.text_length ?? 0;
|
| appendLog(`[Backend] Received request`, timestamp);
|
| break;
|
| }
|
| case 'backend_first_chunk_sent':
|
| appendLog('[Backend] Sent first audio chunk', timestamp);
|
| break;
|
| case 'model_progress':
|
| if (typeof data.generated_sec !== 'undefined') {
|
| const generated = Number(data.generated_sec);
|
| if (Number.isFinite(generated)) {
|
| setModelGenerated(generated);
|
| }
|
| }
|
| return;
|
| case 'generation_error':
|
| appendLog(`[Error] Generation error: ${data.message || 'Unknown error'}`, timestamp);
|
| break;
|
| case 'backend_error':
|
| appendLog(`[Error] Backend error: ${data.message || 'Unknown error'}`, timestamp);
|
| break;
|
| case 'client_disconnected':
|
| appendLog('[Frontend] Client disconnected', timestamp);
|
| break;
|
| case 'backend_stream_complete':
|
| appendLog('[Backend] Backend finished', timestamp);
|
| recordingComplete = true;
|
| updateSaveButtonState();
|
| break;
|
| default:
|
| appendLog(`[Log] Event ${event}`, timestamp);
|
| break;
|
| }
|
| };
|
|
|
| const updateButtonLabel = () => {
|
| controlBtn.textContent = isPlaying ? 'Stop' : 'Start';
|
| controlBtn.classList.toggle('playing', isPlaying);
|
| };
|
|
|
| const appendAudio = chunk => {
|
| const merged = new Float32Array(buffer.length + chunk.length);
|
| merged.set(buffer, 0);
|
| merged.set(chunk, buffer.length);
|
| buffer = merged;
|
| };
|
|
|
| const pullAudio = frameCount => {
|
| const available = buffer.length;
|
| if (available === 0) {
|
| return new Float32Array(frameCount);
|
| }
|
| if (available <= frameCount) {
|
| const chunk = buffer;
|
| buffer = new Float32Array(0);
|
| if (chunk.length < frameCount) {
|
| const padded = new Float32Array(frameCount);
|
| padded.set(chunk, 0);
|
| return padded;
|
| }
|
| return chunk;
|
| }
|
| const chunk = buffer.subarray(0, frameCount);
|
| buffer = buffer.subarray(frameCount);
|
| return chunk;
|
| };
|
|
|
| const closeSocket = () => {
|
| if (socket && (socket.readyState === WebSocket.OPEN || socket.readyState === WebSocket.CONNECTING)) {
|
| socket.close();
|
| }
|
| socket = null;
|
| };
|
|
|
| const resetPlaybackFlags = (resetSamples = true) => {
|
| buffer = new Float32Array(0);
|
| if (resetSamples) {
|
| playbackSamples = 0;
|
| setPlaybackElapsed(0);
|
| }
|
| hasStartedPlayback = false;
|
| silentFrameCount = 0;
|
| firstBrowserChunkLogged = false;
|
| playbackStartedLogged = false;
|
| };
|
|
|
| const teardownAudio = () => {
|
| if (scriptNode) {
|
| try { scriptNode.disconnect(); } catch (err) { console.warn('disconnect error', err); }
|
| scriptNode.onaudioprocess = null;
|
| }
|
| if (audioCtx) {
|
| try { audioCtx.close(); } catch (err) { console.warn('audioCtx.close error', err); }
|
| }
|
| audioCtx = null;
|
| scriptNode = null;
|
| };
|
|
|
| const resetState = (resetSamples = true) => {
|
| closeSocket();
|
| teardownAudio();
|
| resetPlaybackFlags(resetSamples);
|
| isPlaying = false;
|
| stopPlaybackTimer();
|
| };
|
|
|
| const createAudioChain = () => {
|
| teardownAudio();
|
| resetPlaybackFlags();
|
| audioCtx = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: SAMPLE_RATE });
|
| scriptNode = audioCtx.createScriptProcessor(BUFFER_SIZE, 0, 1);
|
|
|
| const minBufferSamples = Math.floor(audioCtx.sampleRate * PREBUFFER_SEC);
|
|
|
| scriptNode.onaudioprocess = event => {
|
| const output = event.outputBuffer.getChannelData(0);
|
| const needPrebuffer = !hasStartedPlayback;
|
| const socketClosed = !socket || socket.readyState === WebSocket.CLOSED || socket.readyState === WebSocket.CLOSING;
|
|
|
| if (needPrebuffer) {
|
| if (buffer.length >= minBufferSamples || socketClosed) {
|
| hasStartedPlayback = true;
|
| if (!playbackStartedLogged) {
|
| playbackStartedLogged = true;
|
| appendLog('[Frontend] Browser started to play audio');
|
| startPlaybackTimer();
|
| }
|
| } else {
|
| output.fill(0);
|
| return;
|
| }
|
| }
|
|
|
| const chunk = pullAudio(output.length);
|
| output.set(chunk);
|
|
|
| if (hasStartedPlayback) {
|
| playbackSamples += output.length;
|
| }
|
|
|
| if (socketClosed && buffer.length === 0 && chunk.every(sample => sample === 0)) {
|
| silentFrameCount += 1;
|
| if (silentFrameCount >= 4) {
|
| stop();
|
| }
|
| } else {
|
| silentFrameCount = 0;
|
| }
|
| };
|
|
|
| scriptNode.connect(audioCtx.destination);
|
| };
|
|
|
| const start = () => {
|
| if (isPlaying) {
|
| return;
|
| }
|
|
|
| const textValue = promptInput?.value || '';
|
| const cfgValue = Number(cfgSelect.value);
|
| const stepsValue = Number(stepsSelect.value);
|
| const voiceValue = voiceSelect.value || '';
|
|
|
| clearLogs();
|
| const cfgDisplay = Number.isFinite(cfgValue) ? cfgValue.toFixed(3) : 'default';
|
| const stepsDisplay = Number.isFinite(stepsValue) ? stepsValue : 'default';
|
| appendLog(`[Frontend] Start button clicked, CFG=${cfgDisplay}, Steps=${stepsDisplay}, Speaker=${voiceValue || 'default'}`);
|
| setModelGenerated(0);
|
| setPlaybackElapsed(0);
|
|
|
| resetState(true);
|
| clearRecordedChunks();
|
| isPlaying = true;
|
| previewActive = true;
|
| updateStreamingPreview();
|
| updateButtonLabel();
|
| createAudioChain();
|
|
|
| const params = new URLSearchParams();
|
| params.set('text', textValue);
|
| if (!Number.isNaN(cfgValue)) {
|
| params.set('cfg', cfgValue.toFixed(3));
|
| }
|
| if (!Number.isNaN(stepsValue)) {
|
| params.set('steps', stepsValue.toString());
|
| }
|
| if (voiceValue) {
|
| params.set('voice', voiceValue);
|
| }
|
| const wsUrl = `${location.origin.replace(/^http/, 'ws')}/stream?${params.toString()}`;
|
|
|
| socket = new WebSocket(wsUrl);
|
| socket.binaryType = 'arraybuffer';
|
|
|
| socket.onmessage = event => {
|
| if (typeof event.data === 'string') {
|
| handleLogMessage(event.data);
|
| return;
|
| }
|
|
|
| if (!(event.data instanceof ArrayBuffer)) {
|
| return;
|
| }
|
| const rawBuffer = event.data.slice(0);
|
| const view = new DataView(rawBuffer);
|
| const floatChunk = new Float32Array(view.byteLength / 2);
|
| for (let i = 0; i < floatChunk.length; i += 1) {
|
| floatChunk[i] = view.getInt16(i * 2, true) / 32768;
|
| }
|
| appendAudio(floatChunk);
|
| recordedChunks.push(rawBuffer);
|
| recordedSamples += floatChunk.length;
|
| updateSaveButtonState();
|
|
|
| if (!firstBrowserChunkLogged) {
|
| firstBrowserChunkLogged = true;
|
| appendLog('[Frontend] Received first audio chunk');
|
| }
|
| };
|
|
|
| socket.onerror = err => {
|
| console.error('WebSocket error', err);
|
| appendLog(`[Error] WebSocket error: ${err?.message || err}`);
|
| stop();
|
| };
|
|
|
| socket.onclose = () => {
|
| socket = null;
|
| if (recordedSamples > 0) {
|
| recordingComplete = true;
|
| updateSaveButtonState();
|
| }
|
| };
|
| };
|
|
|
| const stop = () => {
|
| if (!isPlaying) {
|
| resetState(false);
|
| updateButtonLabel();
|
| return;
|
| }
|
| resetState(false);
|
| setPlaybackElapsed(Math.min(lastPlaybackElapsed, modelGeneratedTotal));
|
| appendLog('[Frontend] Playback stopped');
|
| if (recordedSamples > 0) {
|
| recordingComplete = true;
|
| updateSaveButtonState();
|
| }
|
| previewActive = false;
|
| clearPreviewTimer();
|
| streamingPreview?.classList.remove('streaming-active');
|
| updateButtonLabel();
|
| };
|
|
|
| controlBtn.addEventListener('click', () => {
|
| if (isPlaying) {
|
| stop();
|
| } else {
|
| start();
|
| }
|
| });
|
| if (saveBtn) {
|
| saveBtn.addEventListener('click', handleSaveClick);
|
| }
|
| updateButtonLabel();
|
| updateSaveButtonState();
|
| window.addEventListener('beforeunload', () => {
|
| resetState();
|
| clearPreviewTimer();
|
| revokeDownloadUrl();
|
| });
|
| })();
|
| </script>
|
| </body>
|
| </html>
|
|
|