import * as ort from 'onnxruntime-web'; const presetTexts = window.presetTexts || {}; const PLAY_ICON_SVG = ``; const PAUSE_ICON_SVG = ``; const STOP_ICON_SVG = ``; // Lightning background parallax (function initLightningParallax() { if (typeof document === 'undefined') { return; } const runBlink = (className, onComplete) => { let remaining = 1 + Math.round(Math.random()); const blink = () => { if (remaining-- <= 0) { if (typeof onComplete === 'function') { onComplete(); } return; } const wait = 20 + Math.random() * 80; document.body.classList.add(className); setTimeout(() => { document.body.classList.remove(className); setTimeout(blink, wait); }, wait); }; blink(); }; const schedule = () => { setTimeout(() => runBlink('lightning-flicker', schedule), Math.random() * 10000); }; schedule(); })(); function escapeHtml(value) { return value.replace(/[&<>"']/g, (match) => { switch (match) { case '&': return '&'; case '<': return '<'; case '>': return '>'; case '"': return '"'; case "'": return '''; default: return match; } }); } function formatStatValueWithSuffix(value, suffix, options = {}) { const { firstLabel = false } = options; if (value === undefined || value === null) { return ''; } if (!suffix) { const raw = `${value}`; return escapeHtml(raw); } const raw = `${value}`.trim(); if (!raw || raw === '--' || raw === '-' || raw.toLowerCase() === 'error') { return escapeHtml(raw); } const appendSuffix = (segment, includePrefix = false) => { const trimmed = segment.trim(); if (!trimmed) { return ''; } const escapedValue = `${escapeHtml(trimmed)}`; const suffixSpan = `${escapeHtml(suffix)}`; const prefixSpan = includePrefix && firstLabel ? `First` : ''; const segmentClass = includePrefix && firstLabel ? 
'stat-value-segment has-prefix' : 'stat-value-segment'; return `${prefixSpan}${escapedValue}${suffixSpan}`; }; if (raw.includes('/')) { const parts = raw.split('/'); const segments = parts.map((part, index) => appendSuffix(part, index === 0)); return segments.join(' / '); } return appendSuffix(raw); } /** * Unicode text processor */ export class UnicodeProcessor { constructor(indexer) { this.indexer = indexer; } call(textList, lang = null) { const processedTexts = textList.map(t => preprocessText(t, lang)); const textIdsLengths = processedTexts.map(t => t.length); const maxLen = Math.max(...textIdsLengths); const textIds = []; const unsupportedChars = new Set(); for (let i = 0; i < processedTexts.length; i++) { const row = new Array(maxLen).fill(0); const unicodeVals = textToUnicodeValues(processedTexts[i]); for (let j = 0; j < unicodeVals.length; j++) { const indexValue = this.indexer[unicodeVals[j]]; // Check if character is supported (not -1, undefined, or null) if (indexValue === undefined || indexValue === null || indexValue === -1) { unsupportedChars.add(processedTexts[i][j]); row[j] = 0; // Use 0 as fallback } else { row[j] = indexValue; } } textIds.push(row); } const textMask = getTextMask(textIdsLengths); return { textIds, textMask, unsupportedChars: Array.from(unsupportedChars) }; } } const AVAILABLE_LANGS = ["en", "ko", "es", "pt", "fr"]; /** * Language detection based on character patterns and language-specific markers * Returns the detected language code or null if uncertain */ export function detectLanguage(text) { if (!text || text.trim().length < 3) { return null; } // Only consider last 100 characters for efficiency const sampleText = text.length > 100 ? 
text.substring(text.length - 100) : text; // Normalize text for analysis const normalizedText = sampleText.normalize('NFC').toLowerCase(); // Korean detection: Hangul characters (most reliable) const koreanRegex = /[\uAC00-\uD7AF\u1100-\u11FF\u3130-\u318F\uA960-\uA97F\uD7B0-\uD7FF]/g; const koreanMatches = normalizedText.match(koreanRegex) || []; if (koreanMatches.length >= 2) { return 'ko'; } // Scoring system for Latin-based languages const scores = { en: 0, es: 0, fr: 0, pt: 0 }; // 1. Highly distinctive characters (definitive markers) if (/ñ/.test(normalizedText)) scores.es += 15; if (/[¿¡]/.test(normalizedText)) scores.es += 12; if (/ã/.test(normalizedText)) scores.pt += 15; if (/õ/.test(normalizedText)) scores.pt += 15; if (/œ/.test(normalizedText)) scores.fr += 15; if (/[ùû]/.test(normalizedText)) scores.fr += 10; // ç is shared between French and Portuguese if (/ç/.test(normalizedText)) { scores.fr += 4; scores.pt += 4; } // French-specific accent patterns if (/[èêë]/.test(normalizedText)) scores.fr += 5; if (/[àâ]/.test(normalizedText)) scores.fr += 3; if (/[îï]/.test(normalizedText)) scores.fr += 4; if (/ô/.test(normalizedText)) scores.fr += 3; // 2. 
Exclusive stopwords (words unique to one language) const exclusiveWords = { en: ['the', 'is', 'are', 'was', 'were', 'have', 'has', 'been', 'will', 'would', 'could', 'should', 'this', 'that', 'with', 'from', 'they', 'what', 'which', 'there', 'their', 'about', 'these', 'other', 'into', 'just', 'your', 'some', 'than', 'them', 'then', 'only', 'being', 'through', 'after', 'before'], es: ['el', 'los', 'las', 'es', 'está', 'están', 'porque', 'pero', 'muy', 'también', 'más', 'este', 'esta', 'estos', 'estas', 'ese', 'esa', 'yo', 'tú', 'nosotros', 'ellos', 'ellas', 'hola', 'gracias', 'buenos', 'buenas', 'ahora', 'siempre', 'nunca', 'todo', 'nada', 'algo', 'alguien'], fr: ['le', 'les', 'est', 'sont', 'dans', 'ce', 'cette', 'ces', 'il', 'elle', 'ils', 'elles', 'je', 'tu', 'nous', 'vous', 'avec', 'sur', 'ne', 'pas', 'plus', 'tout', 'bien', 'fait', 'être', 'avoir', 'donc', 'car', 'ni', 'jamais', 'toujours', 'rien', 'quelque', 'encore', 'aussi', 'très', 'peu', 'ici'], pt: ['os', 'as', 'é', 'são', 'está', 'estão', 'não', 'na', 'no', 'da', 'do', 'das', 'dos', 'ao', 'aos', 'ele', 'ela', 'eles', 'elas', 'eu', 'nós', 'você', 'vocês', 'seu', 'sua', 'seus', 'suas', 'muito', 'também', 'já', 'foi', 'só', 'mesmo', 'ter', 'até', 'isso', 'olá', 'obrigado', 'obrigada', 'bom', 'boa', 'agora', 'sempre', 'nunca', 'tudo', 'nada', 'algo', 'alguém'] }; // Extract words from text const words = normalizedText.match(/[a-záàâãäåçéèêëíìîïñóòôõöúùûüýÿœæ]+/g) || []; for (const word of words) { for (const [lang, wordList] of Object.entries(exclusiveWords)) { if (wordList.includes(word)) { scores[lang] += 3; } } } // 3. 
Common n-grams (character patterns) const ngramPatterns = { en: [/th/g, /ing/g, /tion/g, /ight/g, /ould/g], es: [/ción/g, /mente/g, /ado/g, /ido/g], fr: [/tion/g, /ment/g, /eau/g, /aux/g, /eux/g, /oir/g, /ais/g, /ait/g, /ont/g], pt: [/ção/g, /ões/g, /mente/g, /ado/g, /ido/g, /nh/g, /lh/g] }; for (const [lang, patterns] of Object.entries(ngramPatterns)) { for (const pattern of patterns) { const matches = normalizedText.match(pattern) || []; scores[lang] += matches.length * 2; } } // 4. French contractions and apostrophes const frenchContractions = /[cdjlmnst]'[aeiouéèêàâîïôûù]/g; const frenchContractionMatches = normalizedText.match(frenchContractions) || []; scores.fr += frenchContractionMatches.length * 5; // 5. Article patterns that help distinguish // "the" is very English, "el/la" Spanish, "le/la" French, "o/a" Portuguese if (/\bthe\b/.test(normalizedText)) scores.en += 5; if (/\b(el|los)\b/.test(normalizedText)) scores.es += 4; if (/\b(le|les)\b/.test(normalizedText)) scores.fr += 4; if (/\b(o|os)\b/.test(normalizedText)) scores.pt += 3; // Find the language with the highest score let maxScore = 0; let detectedLang = null; for (const [lang, score] of Object.entries(scores)) { if (score > maxScore) { maxScore = score; detectedLang = lang; } } // Only return if we have enough confidence (minimum threshold) if (maxScore >= 4) { return detectedLang; } return null; } // Language display names for toast notification const LANGUAGE_NAMES = { 'en': 'English', 'ko': 'Korean', 'es': 'Spanish', 'pt': 'Portuguese', 'fr': 'French' }; export function preprocessText(text, lang = null) { // Normalize unicode characters text = text.normalize('NFKD'); // Remove emojis text = text.replace(/[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu, ''); // Replace various dashes and symbols const replacements 
= { "–": "-", "‑": "-", "—": "-", "_": " ", "\u201C": '"', // " "\u201D": '"', // " "\u2018": "'", // ' "\u2019": "'", // ' "´": "'", "`": "'", "[": " ", "]": " ", "|": " ", "/": " ", // FIXME: `/` should be pronounced. "#": " ", // FIXME: `#` should be pronounced. "→": " ", "←": " ", }; for (const [k, v] of Object.entries(replacements)) { text = text.replaceAll(k, v); } // Remove special symbols text = text.replace(/[♥☆♡©\\]/g, ""); // Replace known expressions const exprReplacements = { "@": " at ", "e.g.,": "for example,", "i.e.,": "that is,", }; for (const [k, v] of Object.entries(exprReplacements)) { text = text.replaceAll(k, v); } // Fix spacing around punctuation text = text.replace(/ ,/g, ","); text = text.replace(/ \./g, "."); text = text.replace(/ !/g, "!"); text = text.replace(/ \?/g, "?"); text = text.replace(/ ;/g, ";"); text = text.replace(/ :/g, ":"); text = text.replace(/ '/g, "'"); // Remove duplicate quotes while (text.includes('""')) { text = text.replace(/""/g, '"'); } while (text.includes("''")) { text = text.replace(/''/g, "'"); } while (text.includes("``")) { text = text.replace(/``/g, "`"); } // Remove extra spaces text = text.replace(/\s+/g, " ").trim(); // If text doesn't end with punctuation, quotes, or closing brackets, add a period if (!/[.!?;:,'"')\]}…。」』】〉》›»]$/.test(text)) { text += "."; } // Add language tags if (lang !== null) { if (!AVAILABLE_LANGS.includes(lang)) { throw new Error(`Invalid language: ${lang}`); } text = `<${lang}>` + text + ``; } else { text = `` + text + ``; } return text; } export function textToUnicodeValues(text) { return Array.from(text).map(char => char.charCodeAt(0)); } export function lengthToMask(lengths, maxLen = null) { maxLen = maxLen || Math.max(...lengths); const mask = []; for (let i = 0; i < lengths.length; i++) { const row = []; for (let j = 0; j < maxLen; j++) { row.push(j < lengths[i] ? 
1.0 : 0.0); } mask.push([row]); } return mask; } export function getTextMask(textIdsLengths) { return lengthToMask(textIdsLengths); } export function getLatentMask(wavLengths, cfgs) { const baseChunkSize = cfgs.ae.base_chunk_size; const chunkCompressFactor = cfgs.ttl.chunk_compress_factor; const latentSize = baseChunkSize * chunkCompressFactor; const latentLengths = wavLengths.map(len => Math.floor((len + latentSize - 1) / latentSize) ); return lengthToMask(latentLengths); } export function sampleNoisyLatent(duration, cfgs) { const sampleRate = cfgs.ae.sample_rate; const baseChunkSize = cfgs.ae.base_chunk_size; const chunkCompressFactor = cfgs.ttl.chunk_compress_factor; const ldim = cfgs.ttl.latent_dim; const wavLenMax = Math.max(...duration.map(d => d[0][0])) * sampleRate; const wavLengths = duration.map(d => Math.floor(d[0][0] * sampleRate)); const chunkSize = baseChunkSize * chunkCompressFactor; const latentLen = Math.floor((wavLenMax + chunkSize - 1) / chunkSize); const latentDim = ldim * chunkCompressFactor; const noisyLatent = []; for (let b = 0; b < duration.length; b++) { const batch = []; for (let d = 0; d < latentDim; d++) { const row = []; for (let t = 0; t < latentLen; t++) { const u1 = Math.random(); const u2 = Math.random(); const randNormal = Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2); row.push(randNormal); } batch.push(row); } noisyLatent.push(batch); } const latentMask = getLatentMask(wavLengths, cfgs); for (let b = 0; b < noisyLatent.length; b++) { for (let d = 0; d < noisyLatent[b].length; d++) { for (let t = 0; t < noisyLatent[b][d].length; t++) { noisyLatent[b][d][t] *= latentMask[b][0][t]; } } } return { noisyLatent, latentMask }; } export async function loadOnnx(onnxPath, opts) { return await ort.InferenceSession.create(onnxPath, opts); } export async function loadOnnxAll(basePath, opts, onProgress) { const models = [ { name: 'Duration Predictor', path: `${basePath}/duration_predictor.onnx`, key: 'dpOrt' }, { name: 'Text 
Encoder', path: `${basePath}/text_encoder.onnx`, key: 'textEncOrt' }, { name: 'Vector Estimator', path: `${basePath}/vector_estimator.onnx`, key: 'vectorEstOrt' }, { name: 'Vocoder', path: `${basePath}/vocoder.onnx`, key: 'vocoderOrt' } ]; const result = {}; let loadedCount = 0; // Load all models in parallel const loadPromises = models.map(async (model) => { const session = await loadOnnx(model.path, opts); loadedCount++; if (onProgress) { onProgress(model.name, loadedCount, models.length); } return { key: model.key, session }; }); // Wait for all models to load const loadedModels = await Promise.all(loadPromises); // Organize results loadedModels.forEach(({ key, session }) => { result[key] = session; }); try { // Download counting await fetch('https://huggingface.co/Supertone/supertonic-2/resolve/main/config.json'); } catch (error) { console.warn('Failed to update download count:', error); } return result; } export async function loadCfgs(basePath) { const response = await fetch(`${basePath}/tts.json`); return await response.json(); } export async function loadProcessors(basePath) { const response = await fetch(`${basePath}/unicode_indexer.json`); const unicodeIndexerData = await response.json(); const textProcessor = new UnicodeProcessor(unicodeIndexerData); return { textProcessor }; } function parseWavFile(buffer) { const view = new DataView(buffer); // Check RIFF header const riff = String.fromCharCode(view.getUint8(0), view.getUint8(1), view.getUint8(2), view.getUint8(3)); if (riff !== 'RIFF') { throw new Error('Not a valid WAV file'); } const wave = String.fromCharCode(view.getUint8(8), view.getUint8(9), view.getUint8(10), view.getUint8(11)); if (wave !== 'WAVE') { throw new Error('Not a valid WAV file'); } let offset = 12; let fmtChunk = null; let dataChunk = null; while (offset < buffer.byteLength) { const chunkId = String.fromCharCode( view.getUint8(offset), view.getUint8(offset + 1), view.getUint8(offset + 2), view.getUint8(offset + 3) ); const chunkSize 
= view.getUint32(offset + 4, true); if (chunkId === 'fmt ') { fmtChunk = { audioFormat: view.getUint16(offset + 8, true), numChannels: view.getUint16(offset + 10, true), sampleRate: view.getUint32(offset + 12, true), bitsPerSample: view.getUint16(offset + 22, true) }; } else if (chunkId === 'data') { dataChunk = { offset: offset + 8, size: chunkSize }; break; } offset += 8 + chunkSize; } if (!fmtChunk || !dataChunk) { throw new Error('Invalid WAV file format'); } const bytesPerSample = fmtChunk.bitsPerSample / 8; const numSamples = Math.floor(dataChunk.size / (bytesPerSample * fmtChunk.numChannels)); const audioData = new Float32Array(numSamples); if (fmtChunk.bitsPerSample === 16) { for (let i = 0; i < numSamples; i++) { let sample = 0; for (let ch = 0; ch < fmtChunk.numChannels; ch++) { const sampleOffset = dataChunk.offset + (i * fmtChunk.numChannels + ch) * 2; sample += view.getInt16(sampleOffset, true); } audioData[i] = (sample / fmtChunk.numChannels) / 32768.0; } } else if (fmtChunk.bitsPerSample === 24) { // Support 24-bit PCM for (let i = 0; i < numSamples; i++) { let sample = 0; for (let ch = 0; ch < fmtChunk.numChannels; ch++) { const sampleOffset = dataChunk.offset + (i * fmtChunk.numChannels + ch) * 3; // Read 3 bytes and convert to signed 24-bit integer const byte1 = view.getUint8(sampleOffset); const byte2 = view.getUint8(sampleOffset + 1); const byte3 = view.getUint8(sampleOffset + 2); let value = (byte3 << 16) | (byte2 << 8) | byte1; // Convert to signed (two's complement) if (value & 0x800000) { value = value - 0x1000000; } sample += value; } audioData[i] = (sample / fmtChunk.numChannels) / 8388608.0; // 2^23 } } else if (fmtChunk.bitsPerSample === 32) { for (let i = 0; i < numSamples; i++) { let sample = 0; for (let ch = 0; ch < fmtChunk.numChannels; ch++) { const sampleOffset = dataChunk.offset + (i * fmtChunk.numChannels + ch) * 4; sample += view.getFloat32(sampleOffset, true); } audioData[i] = sample / fmtChunk.numChannels; } } else { throw new 
Error(`Unsupported bit depth: ${fmtChunk.bitsPerSample}. Supported formats: 16-bit, 24-bit, 32-bit`); } return { sampleRate: fmtChunk.sampleRate, audioData: audioData }; } export function arrayToTensor(array, dims) { const flat = array.flat(Infinity); return new ort.Tensor('float32', Float32Array.from(flat), dims); } export function intArrayToTensor(array, dims) { const flat = array.flat(Infinity); return new ort.Tensor('int64', BigInt64Array.from(flat.map(x => BigInt(x))), dims); } export function writeWavFile(audioData, sampleRate) { const numChannels = 1; const bitsPerSample = 16; const byteRate = sampleRate * numChannels * bitsPerSample / 8; const blockAlign = numChannels * bitsPerSample / 8; const dataSize = audioData.length * bitsPerSample / 8; const buffer = new ArrayBuffer(44 + dataSize); const view = new DataView(buffer); // RIFF header view.setUint8(0, 'R'.charCodeAt(0)); view.setUint8(1, 'I'.charCodeAt(0)); view.setUint8(2, 'F'.charCodeAt(0)); view.setUint8(3, 'F'.charCodeAt(0)); view.setUint32(4, 36 + dataSize, true); view.setUint8(8, 'W'.charCodeAt(0)); view.setUint8(9, 'A'.charCodeAt(0)); view.setUint8(10, 'V'.charCodeAt(0)); view.setUint8(11, 'E'.charCodeAt(0)); // fmt chunk view.setUint8(12, 'f'.charCodeAt(0)); view.setUint8(13, 'm'.charCodeAt(0)); view.setUint8(14, 't'.charCodeAt(0)); view.setUint8(15, ' '.charCodeAt(0)); view.setUint32(16, 16, true); view.setUint16(20, 1, true); // PCM view.setUint16(22, numChannels, true); view.setUint32(24, sampleRate, true); view.setUint32(28, byteRate, true); view.setUint16(32, blockAlign, true); view.setUint16(34, bitsPerSample, true); // data chunk view.setUint8(36, 'd'.charCodeAt(0)); view.setUint8(37, 'a'.charCodeAt(0)); view.setUint8(38, 't'.charCodeAt(0)); view.setUint8(39, 'a'.charCodeAt(0)); view.setUint32(40, dataSize, true); // Write audio data for (let i = 0; i < audioData.length; i++) { const sample = Math.max(-1, Math.min(1, audioData[i])); const intSample = Math.floor(sample * 32767); 
view.setInt16(44 + i * 2, intSample, true); } return buffer; } // Smooth scroll functionality document.addEventListener('DOMContentLoaded', () => { // Smooth scroll for anchor links document.querySelectorAll('a[href^="#"]').forEach(anchor => { anchor.addEventListener('click', function (e) { e.preventDefault(); const href = this.getAttribute('href'); const target = document.querySelector(href); if (target) { // Update URL with anchor if (history.pushState) { history.pushState(null, null, href); } target.scrollIntoView({ behavior: 'smooth', block: 'start' }); } }); }); // Add scroll animation for sections const observerOptions = { threshold: 0.1, rootMargin: '0px 0px -100px 0px' }; const observer = new IntersectionObserver((entries) => { entries.forEach(entry => { if (entry.isIntersecting) { entry.target.style.opacity = '1'; entry.target.style.transform = 'translateY(0)'; } }); }, observerOptions); }); // TTS Demo functionality (async function() { // Check if we're on a page with the TTS demo const demoTextInput = document.getElementById('demoTextInput'); if (!demoTextInput) return; // Configure ONNX Runtime for WebGPU support ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.0/dist/'; ort.env.wasm.numThreads = 1; // Configuration const REF_EMBEDDING_PATHS = { 'F1': 'assets/voice_styles/F1.json', 'F2': 'assets/voice_styles/F2.json', 'F3': 'assets/voice_styles/F3.json', 'F4': 'assets/voice_styles/F4.json', 'F5': 'assets/voice_styles/F5.json', 'M1': 'assets/voice_styles/M1.json', 'M2': 'assets/voice_styles/M2.json', 'M3': 'assets/voice_styles/M3.json', 'M4': 'assets/voice_styles/M4.json', 'M5': 'assets/voice_styles/M5.json' }; // Voice descriptions const VOICE_DESCRIPTIONS = { 'F1': 'Sarah - A calm female voice with a slightly low tone; steady and composed.', 'F2': 'Lily - A bright, cheerful female voice; lively, playful, and youthful with spirited energy.', 'F3': 'Jessica - A clear, professional announcer-style female voice; articulate and 
broadcast-ready.', 'F4': 'Olivia - A crisp, confident female voice; distinct and expressive with strong delivery.', 'F5': 'Emily - A kind, gentle female voice; soft-spoken, calm, and naturally soothing.', 'M1': 'Alex - A lively, upbeat male voice with confident energy and a standard, clear tone.', 'M2': 'James - A deep, robust male voice; calm, composed, and serious with a grounded presence.', 'M3': 'Robert - A polished, authoritative male voice; confident and trustworthy with strong presentation quality.', 'M4': 'Sam - A soft, neutral-toned male voice; gentle and approachable with a youthful, friendly quality.', 'M5': 'Daniel - A warm, soft-spoken male voice; calm and soothing with a natural storytelling quality.' }; // Global state let models = null; let cfgs = null; let processors = null; let currentVoice = 'M3'; // Default to Robert voice // Detect browser language and set initial language function detectBrowserLanguage() { // Get browser language (works in Chrome, Safari, Edge, Firefox, Opera, Samsung Internet) const browserLang = navigator.language || navigator.userLanguage || 'en'; // Extract language code (e.g., 'en-US' -> 'en', 'ko-KR' -> 'ko') const langCode = browserLang.split('-')[0].toLowerCase(); // Supported languages const supportedLangs = ['en', 'es', 'pt', 'fr', 'ko']; // Return detected language if supported, otherwise default to English return supportedLangs.includes(langCode) ? 
langCode : 'en'; } let currentLanguage = detectBrowserLanguage(); // Auto-detect from browser let refEmbeddingCache = {}; // Cache for embeddings let currentStyleTtlTensor = null; let currentStyleDpTensor = null; let modelsLoading = false; // Track if models are currently loading let modelsLoaded = false; // Track if models are fully loaded let modelsLoadPromise = null; // Promise for model loading // UI Elements const demoStatusBox = document.getElementById('demoStatusBox'); const demoStatusText = document.getElementById('demoStatusText'); const wasmWarningBanner = document.getElementById('wasmWarningBanner'); const demoGenerateBtn = document.getElementById('demoGenerateBtn'); const demoTotalSteps = document.getElementById('demoTotalSteps'); const demoSpeed = document.getElementById('demoSpeed'); const demoTotalStepsValue = document.getElementById('demoTotalStepsValue'); const demoSpeedValue = document.getElementById('demoSpeedValue'); const demoResults = document.getElementById('demoResults'); const demoError = document.getElementById('demoError'); const demoCharCount = document.getElementById('demoCharCount'); const demoCharCounter = document.getElementById('demoCharCounter'); const demoCharWarning = document.getElementById('demoCharWarning'); // Text validation constants const MIN_CHARS = 10; const MAX_CHUNK_LENGTH_DEFAULT = 300; // Maximum length for each chunk (default) const MAX_CHUNK_LENGTH_KO = 120; // Maximum length for Korean function getMaxChunkLength() { return currentLanguage === 'ko' ? 
MAX_CHUNK_LENGTH_KO : MAX_CHUNK_LENGTH_DEFAULT; } // Custom audio player state (shared across generations) let audioContext = null; let scheduledSources = []; let audioChunks = []; let totalDuration = 0; let startTime = 0; let pauseTime = 0; let isPaused = false; let isPlaying = false; let animationFrameId = null; let playPauseBtn = null; let progressBar = null; let currentTimeDisplay = null; let durationDisplay = null; let progressFill = null; let firstChunkGenerationTime = 0; // Processing time for first chunk let totalChunks = 0; let nextScheduledTime = 0; // Next time to schedule audio chunk let currentGenerationTextLength = 0; let supertonicPlayerRecord = null; // Supertonic player record for cross-player pause management let isGenerating = false; // Track if speech generation is in progress // Track all custom audio players let customAudioPlayers = []; const isMobileViewport = () => window.matchMedia('(max-width: 768px)').matches; // Check if device actually supports touch (not just viewport size) const isTouchDevice = () => 'ontouchstart' in window || navigator.maxTouchPoints > 0; const trimDecimalsForMobile = (formatted) => { if (!formatted) return formatted; return isMobileViewport() ? 
formatted.replace(/\.\d{2}$/, '') : formatted; }; function pauseAllPlayersExcept(currentPlayer) { customAudioPlayers.forEach(player => { if (player !== currentPlayer && player && typeof player.pausePlayback === 'function') { player.pausePlayback(); } }); } /** * Chunk text into smaller pieces based on sentence boundaries * @param {string} text - The text to chunk * @param {number} maxLen - Maximum length for each chunk * @returns {Array} - Array of text chunks */ function chunkText(text, maxLen = getMaxChunkLength()) { // Split by paragraph (two or more newlines) const paragraphs = text.trim().split(/\n\s*\n+/).filter(p => p.trim()); const chunks = []; for (let paragraph of paragraphs) { paragraph = paragraph.trim(); if (!paragraph) continue; // Split by sentence boundaries (period, question mark, exclamation mark followed by space) // But exclude common abbreviations like Mr., Mrs., Dr., etc. and single capital letters like F. const sentences = paragraph.split(/(?= 0 && progress <= 100) { const clampedProgress = Math.max(0, Math.min(progress, 100)); demoStatusBox.style.setProperty('--status-progress', `${clampedProgress}%`); demoStatusBox.classList.toggle('complete', clampedProgress >= 100); } else if (type === 'success' || type === 'error') { demoStatusBox.style.removeProperty('--status-progress'); demoStatusBox.classList.remove('complete'); } else { demoStatusBox.style.removeProperty('--status-progress'); demoStatusBox.classList.remove('complete'); } } function hideDemoStatus() { demoStatusBox.style.display = 'none'; } function showDemoError(message) { demoError.textContent = message; demoError.classList.add('active'); } function hideDemoError() { demoError.classList.remove('active'); } // Language toast notification const languageToast = document.getElementById('languageToast'); const languageToastMessage = document.getElementById('languageToastMessage'); let languageToastTimeout = null; function showLanguageToast(fromLang, toLang) { if (!languageToast || 
!languageToastMessage) return; const fromName = LANGUAGE_NAMES[fromLang] || fromLang; const toName = LANGUAGE_NAMES[toLang] || toLang; languageToastMessage.innerHTML = `Language auto-detected: ${toName}`; // Clear any existing timeout if (languageToastTimeout) { clearTimeout(languageToastTimeout); } // Show toast languageToast.classList.add('show'); // Hide after 3 seconds languageToastTimeout = setTimeout(() => { languageToast.classList.remove('show'); }, 3000); } function showWasmWarning() { if (wasmWarningBanner) { wasmWarningBanner.style.display = 'flex'; } } // Validate characters in text function validateCharacters(text) { if (!processors || !processors.textProcessor) { return { valid: true, unsupportedChars: [] }; } try { // Extract unique characters to minimize preprocessText calls const uniqueChars = [...new Set(text)]; // Build mapping for unique chars only (much faster for long texts) // For example, Korean '간' -> 'ㄱㅏㄴ', so we map 'ㄱ','ㅏ','ㄴ' -> '간' const processedToOriginal = new Map(); const charToProcessed = new Map(); for (const char of uniqueChars) { const processedChar = preprocessText(char); charToProcessed.set(char, processedChar); // Map each processed character back to its original for (const pc of processedChar) { if (!processedToOriginal.has(pc)) { processedToOriginal.set(pc, new Set()); } processedToOriginal.get(pc).add(char); } } // Build full processed text using cached mappings const fullProcessedText = Array.from(text).map(c => charToProcessed.get(c)).join(''); // Check the entire processed text once (efficient) const { unsupportedChars } = processors.textProcessor.call([fullProcessedText]); // Map unsupported processed chars back to original chars const unsupportedOriginalChars = new Set(); if (unsupportedChars && unsupportedChars.length > 0) { for (const unsupportedChar of unsupportedChars) { const originalChars = processedToOriginal.get(unsupportedChar); if (originalChars) { originalChars.forEach(c => unsupportedOriginalChars.add(c)); 
} } } const unsupportedCharsArray = Array.from(unsupportedOriginalChars); return { valid: unsupportedCharsArray.length === 0, unsupportedChars: unsupportedCharsArray }; } catch (error) { return { valid: true, unsupportedChars: [] }; } } // Update character counter and validate text length function updateCharCounter() { const rawText = demoTextInput.textContent || demoTextInput.innerText || ''; const text = rawText.replace(/\n$/g, ''); // Remove trailing newline that browsers may add const length = text.length; demoCharCount.textContent = length; // Get the actual width of the textarea const textareaWidth = demoTextInput.offsetWidth; // Max width reference: 1280px (container max-width) / 2 (grid column) - padding/gap ≈ 638px // Using 640px as reference for easier calculation const maxWidthRef = 640; // Calculate font size based on width ratio // Original rem values at max-width (640px): // 5rem = 80px @ 16px base → 80/640 = 12.5% // 4rem = 64px → 64/640 = 10% // 3rem = 48px → 48/640 = 7.5% // 2.5rem = 40px → 40/640 = 6.25% // 2rem = 32px → 32/640 = 5% // 1.5rem = 24px → 24/640 = 3.75% // 1rem = 16px → 16/640 = 2.5% // Check if mobile (572px or less) for 2x font size scaling const isMobile = window.innerWidth <= 572; const mobileMultiplier = isMobile ? 
2 : 1; let fontSizeRatio; if (length <= 100) { fontSizeRatio = 0.055 * mobileMultiplier; // 5.5% of width } else if (length <= 200) { fontSizeRatio = 0.04 * mobileMultiplier; // 4% of width } else if (length < 240) { fontSizeRatio = 0.053125 * mobileMultiplier; // ~5.3125% of width (scaled from 2.5rem) } else if (length < 400) { fontSizeRatio = 0.0425 * mobileMultiplier; // ~4.25% of width (scaled from 2rem) } else if (length < 700) { fontSizeRatio = 0.031875 * mobileMultiplier; // ~3.1875% of width (scaled from 1.5rem) } else { fontSizeRatio = 0.025 * mobileMultiplier; // 2.5% of width (minimum stays the same) } // Calculate font size based on actual width const fontSize = textareaWidth * fontSizeRatio; demoTextInput.style.fontSize = `${fontSize}px`; // Remove all status classes demoCharCounter.classList.remove('error', 'warning', 'valid'); // Check for unsupported characters first (only if models are loaded) let hasUnsupportedChars = false; if (models && processors && length > 0) { const validation = validateCharacters(text); if (!validation.valid && validation.unsupportedChars.length > 0) { hasUnsupportedChars = true; const charList = validation.unsupportedChars.slice(0, 5).map(c => `"${c}"`).join(', '); const moreChars = validation.unsupportedChars.length > 5 ? ` and ${validation.unsupportedChars.length - 5} more` : ''; showDemoError(`Unsupported characters detected: ${charList}${moreChars}. 
Please remove them before generating speech.`); } else { hideDemoError(); } } // Update status based on length and character validation if (length < MIN_CHARS) { demoCharCounter.classList.add('error'); demoCharWarning.textContent = '(At least 10 characters)'; demoGenerateBtn.disabled = true; } else if (hasUnsupportedChars) { demoCharCounter.classList.add('error'); demoCharWarning.textContent = '(Unsupported characters)'; demoGenerateBtn.disabled = true; } else { demoCharCounter.classList.add('valid'); demoCharWarning.textContent = ''; // Enable only if models are loaded AND not currently generating demoGenerateBtn.disabled = !models || isGenerating; } } // Validate text input function validateTextInput(text) { if (!text || text.trim().length === 0) { return { valid: false, message: 'Please enter some text.' }; } if (text.length < MIN_CHARS) { return { valid: false, message: `Text must be at least ${MIN_CHARS} characters long. (Currently ${text.length})` }; } return { valid: true }; } // Load pre-extracted style embeddings from JSON async function loadStyleEmbeddings(voice) { try { // Check if already cached if (refEmbeddingCache[voice]) { return refEmbeddingCache[voice]; } const embeddingPath = REF_EMBEDDING_PATHS[voice]; if (!embeddingPath) { throw new Error(`No embedding path configured for voice: ${voice}`); } const response = await fetch(embeddingPath); if (!response.ok) { throw new Error(`Failed to fetch embedding: ${response.statusText}`); } const embeddingData = await response.json(); // Convert JSON data to ONNX tensors // Flatten nested arrays before creating Float32Array const styleTtlData = embeddingData.style_ttl.data.flat(Infinity); const styleTtlTensor = new ort.Tensor( embeddingData.style_ttl.type || 'float32', Float32Array.from(styleTtlData), embeddingData.style_ttl.dims ); const styleDpData = embeddingData.style_dp.data.flat(Infinity); const styleDpTensor = new ort.Tensor( embeddingData.style_dp.type || 'float32', Float32Array.from(styleDpData), 
embeddingData.style_dp.dims ); const embeddings = { styleTtl: styleTtlTensor, styleDp: styleDpTensor }; // Cache the embeddings refEmbeddingCache[voice] = embeddings; return embeddings; } catch (error) { throw error; } } // Switch to a different voice async function switchVoice(voice) { try { const embeddings = await loadStyleEmbeddings(voice); currentStyleTtlTensor = embeddings.styleTtl; currentStyleDpTensor = embeddings.styleDp; currentVoice = voice; // Update active speaker in UI if (typeof window.updateActiveSpeaker === 'function') { window.updateActiveSpeaker(voice); } // Re-validate text after switching voice updateCharCounter(); } catch (error) { showDemoError(`Failed to load voice ${voice}: ${error.message}`); throw error; } } // Check WebGPU support more thoroughly async function checkWebGPUSupport() { try { // Detect iOS/Safari const isIOS = /iPad|iPhone|iPod/.test(navigator.userAgent) || (navigator.platform === 'MacIntel' && navigator.maxTouchPoints > 1); const isSafari = /^((?!chrome|crios|android|edg|firefox).)*safari/i.test(navigator.userAgent); // iOS and Safari have incomplete WebGPU support if (isIOS) { return { supported: false, reason: 'iOS does not support the required WebGPU features' }; } if (isSafari) { // Desktop Safari might work, but check carefully return { supported: false, reason: 'Safari does not support the required WebGPU features' }; } // Check if WebGPU is available in the browser if (!navigator.gpu) { return { supported: false, reason: 'WebGPU not available in this browser' }; } // Request adapter const adapter = await navigator.gpu.requestAdapter(); if (!adapter) { return { supported: false, reason: 'No WebGPU adapter found' }; } // Check adapter info try { const adapterInfo = await adapter.requestAdapterInfo(); } catch (infoError) { // Ignore adapter info errors } // Request device to test if it actually works const device = await adapter.requestDevice(); if (!device) { return { supported: false, reason: 'Failed to create WebGPU 
device' }; } return { supported: true, adapter, device }; } catch (error) { // Handle specific iOS/Safari errors const errorMsg = error.message || ''; if (errorMsg.includes('subgroupMinSize') || errorMsg.includes('subgroup')) { return { supported: false, reason: 'iOS/Safari does not support required WebGPU features (subgroup operations)' }; } return { supported: false, reason: error.message }; } } // Warmup models with dummy inference (no audio playback, no UI updates) async function warmupModels() { try { const dummyText = 'Looking to integrate Supertonic into your product? We offer customized on-device SDK solutions tailored to your business needs. Our lightweight, high-performance TTS technology can be seamlessly integrated into mobile apps, IoT devices, automotive systems, and more. Try it now, and enjoy its speed.'; const totalStep = 5; // Use minimal steps for faster warmup const durationFactor = 1.0; const textList = [dummyText]; const bsz = 1; // Use pre-computed style embeddings const styleTtlTensor = currentStyleTtlTensor; const styleDpTensor = currentStyleDpTensor; // Step 1: Estimate duration const { textIds, textMask } = processors.textProcessor.call(textList, currentLanguage); const textIdsShape = [bsz, textIds[0].length]; const textMaskShape = [bsz, 1, textMask[0][0].length]; const textMaskTensor = arrayToTensor(textMask, textMaskShape); const dpResult = await models.dpOrt.run({ text_ids: intArrayToTensor(textIds, textIdsShape), style_dp: styleDpTensor, text_mask: textMaskTensor }); const durOnnx = Array.from(dpResult.duration.data); for (let i = 0; i < durOnnx.length; i++) { durOnnx[i] *= durationFactor; } const durReshaped = []; for (let b = 0; b < bsz; b++) { durReshaped.push([[durOnnx[b]]]); } // Step 2: Encode text const textEncResult = await models.textEncOrt.run({ text_ids: intArrayToTensor(textIds, textIdsShape), style_ttl: styleTtlTensor, text_mask: textMaskTensor }); const textEmbTensor = textEncResult.text_emb; // Step 3: Denoising let { 
noisyLatent, latentMask } = sampleNoisyLatent(durReshaped, cfgs);
    const latentShape = [bsz, noisyLatent[0].length, noisyLatent[0][0].length];
    const latentMaskShape = [bsz, 1, latentMask[0][0].length];
    const latentMaskTensor = arrayToTensor(latentMask, latentMaskShape);
    const totalStepArray = new Array(bsz).fill(totalStep);
    const scalarShape = [bsz];
    const totalStepTensor = arrayToTensor(totalStepArray, scalarShape);
    // Iterative denoising: each pass feeds the previous output back in.
    for (let step = 0; step < totalStep; step++) {
      const currentStepArray = new Array(bsz).fill(step);
      const vectorEstResult = await models.vectorEstOrt.run({
        noisy_latent: arrayToTensor(noisyLatent, latentShape),
        text_emb: textEmbTensor,
        style_ttl: styleTtlTensor,
        text_mask: textMaskTensor,
        latent_mask: latentMaskTensor,
        total_step: totalStepTensor,
        current_step: arrayToTensor(currentStepArray, scalarShape)
      });
      const denoisedLatent = Array.from(vectorEstResult.denoised_latent.data);
      // Update latent: copy the flat model output back into the
      // nested [batch][dim][time] array for the next iteration.
      let idx = 0;
      for (let b = 0; b < noisyLatent.length; b++) {
        for (let d = 0; d < noisyLatent[b].length; d++) {
          for (let t = 0; t < noisyLatent[b][d].length; t++) {
            noisyLatent[b][d][t] = denoisedLatent[idx++];
          }
        }
      }
    }
    // Step 4: Generate waveform
    const vocoderResult = await models.vocoderOrt.run({ latent: arrayToTensor(noisyLatent, latentShape) });
    // Warmup complete - no need to process the audio further
  } catch (error) {
    console.warn('Warmup failed (non-critical):', error.message);
    // Don't throw - warmup failure shouldn't prevent normal usage
  }
}

// Load models on page load.
// Idempotent: a concurrent call reuses the in-flight promise, and a call
// after completion returns immediately. Also toggles the speaker/language
// pickers while loading is in progress.
async function initializeModels() {
  // If models are already loading, return the existing promise
  if (modelsLoading && modelsLoadPromise) {
    return modelsLoadPromise;
  }
  // If models are already loaded, return immediately
  if (modelsLoaded && models) {
    return;
  }
  modelsLoading = true;
  // Disable speaker selection during model loading
  const speakerItemsForLoading = document.querySelectorAll('.speaker-item[data-voice]');
  speakerItemsForLoading.forEach(item => item.classList.add('disabled'));
  // Disable language selection during model loading
  const languageItemsForLoading = document.querySelectorAll('.speaker-item[data-language]');
  languageItemsForLoading.forEach(item => item.classList.add('disabled'));
  modelsLoadPromise = (async () => {
    try {
      showDemoStatus('Loading configuration...', 'info', 5);
      const basePath = 'assets/onnx';
      // Load config
      cfgs = await loadCfgs(basePath);
      // Check WebGPU support first
      showDemoStatus('Checking WebGPU support...', 'info', 8);
      const webgpuCheck = await checkWebGPUSupport();
      // Determine execution provider based on WebGPU support
      const useWebGPU = webgpuCheck.supported;
      const executionProvider = useWebGPU ? 'webgpu' : 'wasm';
      // If WebGPU is not supported, show subtle warning banner
      if (!useWebGPU) {
        showWasmWarning();
      }
      // Load models with appropriate backend
      const backendName = useWebGPU ? 'WebGPU' : 'WASM';
      showDemoStatus(`${backendName} detected! Loading models...`, 'info', 10);
      // NOTE(review): this local shadows the module-level `modelsLoadPromise`
      // assigned just above — harmless here, but a rename would be clearer.
      const modelsLoadPromise = loadOnnxAll(basePath, {
        executionProviders: [executionProvider],
        graphOptimizationLevel: 'all'
      }, (modelName, current, total) => {
        const progress = 10 + (current / total) * 70; // 10-80% for model loading
        showDemoStatus(`Loading models with ${backendName} (${current}/${total}): ${modelName}...`, 'info', progress);
      });
      // Load processors in parallel with models
      const [loadedModels, loadedProcessors] = await Promise.all([
        modelsLoadPromise,
        loadProcessors(basePath)
      ]);
      models = loadedModels;
      processors = loadedProcessors;
      showDemoStatus('Loading reference embeddings...', 'info', 85);
      // Load pre-extracted embeddings for default voice
      const embeddings = await loadStyleEmbeddings(currentVoice);
      currentStyleTtlTensor = embeddings.styleTtl;
      currentStyleDpTensor = embeddings.styleDp;
      showDemoStatus('Warming up models...', 'info', 90);
      // Warmup step: run inference once in background with dummy text
      await warmupModels();
      hideDemoStatus();
      demoGenerateBtn.disabled = false;
      demoTotalSteps.disabled = false;
      demoSpeed.disabled =
false; // Enable voice toggle buttons after models are loaded const voiceToggleTexts = document.querySelectorAll('.voice-toggle-text'); voiceToggleTexts.forEach(text => text.classList.remove('disabled')); // Validate initial text now that models are loaded updateCharCounter(); // Mark models as loaded modelsLoaded = true; modelsLoading = false; // Re-enable speaker selection after model loading speakerItemsForLoading.forEach(item => item.classList.remove('disabled')); // Re-enable language selection after model loading languageItemsForLoading.forEach(item => item.classList.remove('disabled')); } catch (error) { modelsLoading = false; // Re-enable speaker selection on error too speakerItemsForLoading.forEach(item => item.classList.remove('disabled')); // Re-enable language selection on error too languageItemsForLoading.forEach(item => item.classList.remove('disabled')); showDemoStatus(`Error: ${error.message}`, 'error'); showDemoError(`Failed to initialize: ${error.message}. Check console for details.`); throw error; } })(); return modelsLoadPromise; } // Supertonic synthesis function (extracted for parallel execution) async function generateSupertonicSpeech(text, totalStep, durationFactor) { const supertonicStartTime = Date.now(); try { const textList = [text]; const bsz = 1; const sampleRate = cfgs.ae.sample_rate; // Use pre-computed style embeddings const styleTtlTensor = currentStyleTtlTensor; const styleDpTensor = currentStyleDpTensor; // Step 1: Estimate duration const { textIds, textMask, unsupportedChars } = processors.textProcessor.call(textList, currentLanguage); // Check for unsupported characters if (unsupportedChars && unsupportedChars.length > 0) { const charList = unsupportedChars.map(c => `"${c}"`).join(', '); throw new Error(`Unsupported characters: ${charList}`); } const textIdsShape = [bsz, textIds[0].length]; const textMaskShape = [bsz, 1, textMask[0][0].length]; const textMaskTensor = arrayToTensor(textMask, textMaskShape); const dpResult = 
await models.dpOrt.run({ text_ids: intArrayToTensor(textIds, textIdsShape), style_dp: styleDpTensor, text_mask: textMaskTensor }); const durOnnx = Array.from(dpResult.duration.data); // Apply duration factor to adjust speech length (once) for (let i = 0; i < durOnnx.length; i++) { durOnnx[i] *= durationFactor; } const durReshaped = []; for (let b = 0; b < bsz; b++) { durReshaped.push([[durOnnx[b]]]); } // Step 2: Encode text const textEncResult = await models.textEncOrt.run({ text_ids: intArrayToTensor(textIds, textIdsShape), style_ttl: styleTtlTensor, text_mask: textMaskTensor }); const textEmbTensor = textEncResult.text_emb; // Step 3: Denoising let { noisyLatent, latentMask } = sampleNoisyLatent(durReshaped, cfgs); const latentDim = noisyLatent[0].length; const latentLen = noisyLatent[0][0].length; const latentShape = [bsz, latentDim, latentLen]; const latentMaskShape = [bsz, 1, latentMask[0][0].length]; const latentMaskTensor = arrayToTensor(latentMask, latentMaskShape); // Pre-allocate flat buffer for latent data to avoid repeated allocations const latentBufferSize = bsz * latentDim * latentLen; const latentBuffer = new Float32Array(latentBufferSize); // Initialize latent buffer from noisyLatent let initIdx = 0; for (let b = 0; b < bsz; b++) { for (let d = 0; d < latentDim; d++) { for (let t = 0; t < latentLen; t++) { latentBuffer[initIdx++] = noisyLatent[b][d][t]; } } } // Prepare constant tensors const scalarShape = [bsz]; const totalStepTensor = arrayToTensor(new Array(bsz).fill(totalStep), scalarShape); // Pre-create all step tensors to avoid repeated allocations const stepTensors = []; for (let step = 0; step < totalStep; step++) { stepTensors.push(arrayToTensor(new Array(bsz).fill(step), scalarShape)); } for (let step = 0; step < totalStep; step++) { // Create tensor from pre-allocated buffer const noisyLatentTensor = new ort.Tensor('float32', latentBuffer, latentShape); const vectorEstResult = await models.vectorEstOrt.run({ noisy_latent: 
noisyLatentTensor, text_emb: textEmbTensor, style_ttl: styleTtlTensor, text_mask: textMaskTensor, latent_mask: latentMaskTensor, total_step: totalStepTensor, current_step: stepTensors[step] }); // Copy denoised result directly into pre-allocated buffer const denoisedData = vectorEstResult.denoised_latent.data; latentBuffer.set(denoisedData); } // Step 4: Generate waveform - use latentBuffer directly const vocoderResult = await models.vocoderOrt.run({ latent: new ort.Tensor('float32', latentBuffer, latentShape) }); const wavBatch = vocoderResult.wav_tts.data; const wavLen = Math.floor(sampleRate * durOnnx[0]); // Create a copy of the audio data (not a view) to prevent buffer reuse issues const audioData = wavBatch.slice(0, wavLen); // Calculate times for Supertonic const supertonicEndTime = Date.now(); const supertonicProcessingTime = (supertonicEndTime - supertonicStartTime) / 1000; const audioDurationSec = durOnnx[0]; return { success: true, processingTime: supertonicProcessingTime, audioDuration: audioDurationSec, audioData: audioData, sampleRate: sampleRate, text: text }; } catch (error) { return { success: false, error: error.message, text: text }; } } // Format time: 60초 미만 -> 00.00, 60분 미만 -> 00:00.00, 60분 이상 -> 00:00:00.00 function formatTimeDetailed(seconds) { const hours = Math.floor(seconds / 3600); const mins = Math.floor((seconds % 3600) / 60); const secs = seconds % 60; const ms = Math.floor((secs % 1) * 100); const wholeSecs = Math.floor(secs); if (seconds < 60) { return `${wholeSecs.toString().padStart(2, '0')}.${ms.toString().padStart(2, '0')}`; } else if (seconds < 3600) { return `${mins.toString().padStart(2, '0')}:${wholeSecs.toString().padStart(2, '0')}.${ms.toString().padStart(2, '0')}`; } else { return `${hours.toString().padStart(2, '0')}:${mins.toString().padStart(2, '0')}:${wholeSecs.toString().padStart(2, '0')}.${ms.toString().padStart(2, '0')}`; } } // Generate Supertonic speech with chunking support and progressive playback async 
function generateSupertonicSpeechChunked(text, totalStep, durationFactor, onFirstChunkReady, onChunkAdded) {
  // Split `text` into chunks, synthesize each one sequentially, and report
  // progress through the two optional callbacks so playback can begin before
  // the whole text has been rendered:
  //   onFirstChunkReady(audioData, sampleRate, durationSoFar, text, numChunks, firstChunkTime, processedChars)
  //   onChunkAdded(audioData, sampleRate, durationSoFar, chunkNumber, numChunks, elapsedSec, processedChars)
  const supertonicStartTime = Date.now();
  const sampleRate = cfgs.ae.sample_rate;
  const silenceDuration = 0.3; // 0.3 seconds of silence between chunks
  try {
    // Split text into chunks
    const chunks = chunkText(text);
    const audioDataArrays = [];
    const durations = [];
    const silenceSamples = Math.floor(silenceDuration * sampleRate);
    let firstChunkEndTime = 0;
    let firstChunkTime = 0;
    // Generate speech for each chunk
    for (let i = 0; i < chunks.length; i++) {
      // NOTE(review): this local shadows the module-level chunkText() helper
      // called above; safe here, but a rename would be clearer.
      const chunkText = chunks[i];
      const result = await generateSupertonicSpeech(chunkText, totalStep, durationFactor);
      if (!result.success) {
        throw new Error(`Failed to generate chunk ${i + 1}: ${result.error}`);
      }
      // Use raw Float32Array directly - no WAV encode/decode round-trip
      const audioData = result.audioData;
      audioDataArrays.push(audioData);
      durations.push(result.audioDuration);
      // Progressive playback: pass raw Float32Array directly to callbacks
      if (i === 0 && onFirstChunkReady) {
        // First chunk ready - send it immediately
        firstChunkEndTime = Date.now();
        firstChunkTime = (firstChunkEndTime - supertonicStartTime) / 1000;
        const totalDurationSoFar = result.audioDuration;
        const processedChars = chunks[0].length;
        // Pass raw audio data and sample rate directly
        onFirstChunkReady(audioData, sampleRate, totalDurationSoFar, text, chunks.length, firstChunkTime, processedChars);
      } else if (i > 0 && onChunkAdded) {
        // Subsequent chunks - send just the new chunk.
        // Duration so far includes the inter-chunk silence gaps.
        const totalDurationSoFar = durations.slice(0, i + 1).reduce((sum, dur) => sum + dur, 0) + silenceDuration * i;
        const currentProcessingTime = (Date.now() - supertonicStartTime) / 1000;
        const processedChars = chunks.slice(0, i + 1).reduce((sum, chunk) => sum + chunk.length, 0);
        // Pass raw audio data and sample rate directly
        onChunkAdded(audioData, sampleRate, totalDurationSoFar, i + 1, chunks.length, currentProcessingTime, processedChars);
      }
    }
    // Concatenate all
audio chunks with silence for final result const totalDuration = durations.reduce((sum, dur) => sum + dur, 0) + silenceDuration * (chunks.length - 1); // Calculate total samples needed let totalSamples = 0; for (let i = 0; i < audioDataArrays.length; i++) { totalSamples += audioDataArrays[i].length; if (i < audioDataArrays.length - 1) { totalSamples += silenceSamples; } } const wavCat = new Float32Array(totalSamples); let currentIdx = 0; for (let i = 0; i < audioDataArrays.length; i++) { // Copy audio data const audioData = audioDataArrays[i]; wavCat.set(audioData, currentIdx); currentIdx += audioData.length; // Add silence if not the last chunk if (i < audioDataArrays.length - 1) { // Silence is already zeros in Float32Array, just skip the indices currentIdx += silenceSamples; } } // Create final WAV file const wavBuffer = writeWavFile(wavCat, sampleRate); const blob = new Blob([wavBuffer], { type: 'audio/wav' }); const url = URL.createObjectURL(blob); const supertonicEndTime = Date.now(); const supertonicProcessingTime = (supertonicEndTime - supertonicStartTime) / 1000; return { success: true, processingTime: supertonicProcessingTime, audioDuration: totalDuration, url: url, text: text, firstChunkTime: firstChunkTime }; } catch (error) { return { success: false, error: error.message, text: text }; } } // Main synthesis function async function generateSpeech() { let text = (demoTextInput.textContent || demoTextInput.innerText || '').trim(); // Validate text input const validation = validateTextInput(text); if (!validation.valid) { showDemoError(validation.message); return; } if (!models || !cfgs || !processors) { showDemoError('Models are still loading. Please wait.'); return; } if (!currentStyleTtlTensor || !currentStyleDpTensor) { showDemoError('Reference embeddings are not ready. 
Please wait.'); return; } // Validate characters before generation const charValidation = validateCharacters(text); if (!charValidation.valid && charValidation.unsupportedChars.length > 0) { const charList = charValidation.unsupportedChars.map(c => `"${c}"`).join(', '); showDemoError(`Cannot generate speech: Unsupported characters found: ${charList}`); return; } currentGenerationTextLength = text.length; try { isGenerating = true; demoGenerateBtn.disabled = true; // Disable speaker selection during generation const speakerItemsForGeneration = document.querySelectorAll('.speaker-item[data-voice]'); speakerItemsForGeneration.forEach(item => item.classList.add('disabled')); // Disable language selection during generation const languageItemsForGeneration = document.querySelectorAll('.speaker-item[data-language]'); languageItemsForGeneration.forEach(item => item.classList.add('disabled')); hideDemoError(); hideDemoStatus(); // Hide the status box when starting generation // Clean up previous audio playback if (audioContext) { // Stop all scheduled sources scheduledSources.forEach(source => { try { source.stop(); } catch (e) { // Already stopped } }); scheduledSources = []; // Close audio context if (audioContext.state !== 'closed') { audioContext.close(); } audioContext = null; } // Cancel animation frame if (animationFrameId) { cancelAnimationFrame(animationFrameId); animationFrameId = null; } // Clean up all custom audio players customAudioPlayers.forEach(player => { if (player.cleanup) { player.cleanup(); } }); customAudioPlayers = []; // Reset state audioChunks = []; totalDuration = 0; startTime = 0; pauseTime = 0; isPaused = false; isPlaying = false; firstChunkGenerationTime = 0; // Processing time for first chunk totalChunks = 0; nextScheduledTime = 0; // Next time to schedule audio chunk // Show result shell(s) immediately const createInitialResultItem = (system, titleMain, titleSub, titleColor, includeStatus) => { const titleStatus = includeStatus ? 
`⏳ Running...` : ''; return `
${titleMain} ${titleSub} ${titleStatus}
--
Processing Time
--
Chars/sec
--
RTF
Generating speech...
`; }; const supertonicInitial = createInitialResultItem( 'supertonic', 'Supertonic', 'On-Device', 'var(--accent-yellow)', false ); demoResults.style.display = 'flex'; demoResults.innerHTML = supertonicInitial; const totalStep = parseInt(demoTotalSteps.value); const speed = parseFloat(demoSpeed.value); const durationFactor = speedToDurationFactor(speed); // Track which one finishes first let latestSupertonicProcessedChars = 0; // Helper functions for custom player const formatTime = (seconds, { trimMobile = false } = {}) => { const mins = Math.floor(seconds / 60); const secs = seconds % 60; const secString = secs.toFixed(2).padStart(5, '0'); let formatted = `${mins}:${secString}`; if (trimMobile) { formatted = trimDecimalsForMobile(formatted); } return formatted; }; const updateProgress = () => { if (!isPlaying || !audioContext) return; const currentTime = isPaused ? pauseTime : (audioContext.currentTime - startTime); const progress = totalDuration > 0 ? (currentTime / totalDuration) * 100 : 0; if (progressFill) { progressFill.style.width = `${Math.min(progress, 100)}%`; } if (currentTimeDisplay) { currentTimeDisplay.textContent = formatTime(Math.min(currentTime, totalDuration), { trimMobile: true }); } if (currentTime < totalDuration) { animationFrameId = requestAnimationFrame(updateProgress); } else { // Playback finished isPlaying = false; isPaused = false; if (playPauseBtn) { playPauseBtn.innerHTML = PLAY_ICON_SVG; } } }; const togglePlayPause = () => { if (!audioContext || audioChunks.length === 0) return; if (isPaused) { // Resume from paused position pauseAllPlayersExcept(supertonicPlayerRecord); const seekTime = pauseTime; // Find which chunk we should start from let accumulatedTime = 0; let startChunkIndex = 0; let offsetInChunk = seekTime; for (let i = 0; i < audioChunks.length; i++) { const chunkDuration = audioChunks[i].buffer.duration; if (accumulatedTime + chunkDuration > seekTime) { startChunkIndex = i; offsetInChunk = seekTime - accumulatedTime; 
break; } accumulatedTime += chunkDuration + 0.3; } // Stop any existing sources scheduledSources.forEach(source => { try { source.stop(); } catch (e) { // Already stopped } }); scheduledSources = []; // Resume AudioContext if suspended if (audioContext.state === 'suspended') { audioContext.resume(); } // Reschedule from the pause point startTime = audioContext.currentTime - seekTime; let nextStartTime = audioContext.currentTime; for (let i = startChunkIndex; i < audioChunks.length; i++) { const source = audioContext.createBufferSource(); source.buffer = audioChunks[i].buffer; source.connect(audioContext.destination); if (i === startChunkIndex) { source.start(nextStartTime, offsetInChunk); nextStartTime += (audioChunks[i].buffer.duration - offsetInChunk); } else { source.start(nextStartTime); nextStartTime += audioChunks[i].buffer.duration; } if (i < audioChunks.length - 1) { nextStartTime += 0.3; } scheduledSources.push(source); } nextScheduledTime = nextStartTime; isPaused = false; isPlaying = true; playPauseBtn.innerHTML = PAUSE_ICON_SVG; updateProgress(); } else if (isPlaying) { // Pause playback pauseTime = audioContext.currentTime - startTime; audioContext.suspend(); isPaused = true; playPauseBtn.innerHTML = PLAY_ICON_SVG; if (animationFrameId) { cancelAnimationFrame(animationFrameId); } } else { // Was finished, restart from beginning pauseAllPlayersExcept(supertonicPlayerRecord); pauseTime = 0; // Resume AudioContext if suspended if (audioContext.state === 'suspended') { audioContext.resume(); } // Stop any existing sources scheduledSources.forEach(source => { try { source.stop(); } catch (e) { // Already stopped } }); scheduledSources = []; // Restart from beginning startTime = audioContext.currentTime; let nextStartTime = audioContext.currentTime; for (let i = 0; i < audioChunks.length; i++) { const source = audioContext.createBufferSource(); source.buffer = audioChunks[i].buffer; source.connect(audioContext.destination); source.start(nextStartTime); 
nextStartTime += audioChunks[i].buffer.duration; if (i < audioChunks.length - 1) { nextStartTime += 0.3; } scheduledSources.push(source); } nextScheduledTime = nextStartTime; isPlaying = true; isPaused = false; playPauseBtn.innerHTML = PAUSE_ICON_SVG; updateProgress(); } }; const seekTo = (percentage) => { if (!audioContext || audioChunks.length === 0) return; const seekTime = (percentage / 100) * totalDuration; // Remember current playing state const wasPlaying = isPlaying; const wasPaused = isPaused; // Stop all current sources scheduledSources.forEach(source => { try { source.stop(); } catch (e) { // Already stopped } }); scheduledSources = []; // Cancel animation if (animationFrameId) { cancelAnimationFrame(animationFrameId); } // Find which chunk we should start from let accumulatedTime = 0; let startChunkIndex = 0; let offsetInChunk = seekTime; for (let i = 0; i < audioChunks.length; i++) { const chunkDuration = audioChunks[i].buffer.duration; if (accumulatedTime + chunkDuration > seekTime) { startChunkIndex = i; offsetInChunk = seekTime - accumulatedTime; break; } accumulatedTime += chunkDuration + 0.3; // Include silence } // If paused or finished, just update the pause position if (wasPaused || !wasPlaying) { pauseTime = seekTime; // Update UI if (progressFill) { const progress = (seekTime / totalDuration) * 100; progressFill.style.width = `${Math.min(progress, 100)}%`; } if (currentTimeDisplay) { currentTimeDisplay.textContent = formatTime(seekTime, { trimMobile: true }); } // Set to paused state so play button will resume from seek position isPaused = true; isPlaying = true; // Valid state for playback if (playPauseBtn) { playPauseBtn.innerHTML = PLAY_ICON_SVG; } return; } // Resume AudioContext if it was suspended if (audioContext.state === 'suspended') { audioContext.resume(); } // Reschedule from the seek point startTime = audioContext.currentTime - seekTime; let nextStartTime = audioContext.currentTime; for (let i = startChunkIndex; i < 
audioChunks.length; i++) { const source = audioContext.createBufferSource(); source.buffer = audioChunks[i].buffer; source.connect(audioContext.destination); if (i === startChunkIndex) { // Start from offset source.start(nextStartTime, offsetInChunk); nextStartTime += (audioChunks[i].buffer.duration - offsetInChunk); } else { source.start(nextStartTime); nextStartTime += audioChunks[i].buffer.duration; } // Add silence between chunks if (i < audioChunks.length - 1) { nextStartTime += 0.3; } scheduledSources.push(source); } // Update nextScheduledTime for any future chunks nextScheduledTime = nextStartTime; // Resume playing state isPlaying = true; isPaused = false; if (playPauseBtn) { playPauseBtn.innerHTML = PAUSE_ICON_SVG; } // Restart progress animation updateProgress(); }; // Callback for first chunk ready - create custom player and start playback // Helper function to create AudioBuffer directly from Float32Array const createAudioBufferFromFloat32 = (audioData, sampleRate) => { const audioBuffer = audioContext.createBuffer(1, audioData.length, sampleRate); audioBuffer.getChannelData(0).set(audioData); return audioBuffer; }; const onFirstChunkReady = async (audioData, sampleRate, duration, text, numChunks, firstChunkTime, processedChars) => { totalChunks = numChunks; firstChunkGenerationTime = firstChunkTime; const container = document.getElementById('demoResults'); const textLength = currentGenerationTextLength > 0 ? currentGenerationTextLength : (text ? text.length : 0); const isBatch = textLength >= getMaxChunkLength(); const processingTimeStr = isBatch && firstChunkTime ? `${formatTimeDetailed(firstChunkTime)} / ${formatTimeDetailed(firstChunkTime)}` : formatTimeDetailed(firstChunkTime); const safeInitialChars = typeof processedChars === 'number' ? processedChars : 0; const displayedInitialChars = textLength > 0 ? Math.min(safeInitialChars, textLength) : safeInitialChars; const charsPerSec = firstChunkTime > 0 && displayedInitialChars > 0 ? 
(displayedInitialChars / firstChunkTime).toFixed(1) : '0.0'; const rtf = duration > 0 && firstChunkTime > 0 ? (firstChunkTime / duration).toFixed(3) : '-'; const progressValue = textLength > 0 ? Math.min(100, (displayedInitialChars / textLength) * 100) : 0; const resultItemEl = document.getElementById('supertonic-result'); if (!resultItemEl) { console.warn('Supertonic result container not found.'); return; } resultItemEl.classList.remove('generating'); resultItemEl.style.setProperty('--result-progress', `${progressValue}%`); const titleMainEl = resultItemEl.querySelector('.title-main'); if (titleMainEl) { titleMainEl.textContent = 'Supertonic'; titleMainEl.style.color = 'var(--accent-yellow)'; } const titleSubEl = resultItemEl.querySelector('.title-sub'); if (titleSubEl) { titleSubEl.textContent = 'On-Device'; } const infoContainer = resultItemEl.querySelector('.demo-result-info'); if (infoContainer) { infoContainer.classList.remove('error'); } const timeElInitial = document.getElementById('supertonic-time'); if (timeElInitial) { timeElInitial.innerHTML = formatStatValueWithSuffix(processingTimeStr, 's', { firstLabel: true }); } const cpsElInitial = document.getElementById('supertonic-cps'); if (cpsElInitial) { cpsElInitial.textContent = charsPerSec; } const rtfElInitial = document.getElementById('supertonic-rtf'); if (rtfElInitial) { rtfElInitial.innerHTML = formatStatValueWithSuffix(rtf, 'x'); } const playerContainer = resultItemEl.querySelector('.custom-audio-player'); if (playerContainer) { playerContainer.style.display = ''; playerContainer.innerHTML = `
0:00.00
${formatTime(duration, { trimMobile: true })}
`;
  }
  container.style.display = 'flex';
  latestSupertonicProcessedChars = displayedInitialChars;
  // Get UI elements
  playPauseBtn = document.getElementById('play-pause-btn');
  progressBar = document.getElementById('progress-container');
  currentTimeDisplay = document.getElementById('current-time');
  durationDisplay = document.getElementById('total-duration');
  progressFill = document.getElementById('progress-fill');
  // Initialize Web Audio API (webkitAudioContext covers older Safari)
  audioContext = new (window.AudioContext || window.webkitAudioContext)();
  startTime = audioContext.currentTime;
  totalDuration = duration;
  isPlaying = true;
  isPaused = false;
  // Create Supertonic player record and register it.
  // pausePlayback records the elapsed offset, stops every scheduled source,
  // suspends the context and flips the play/pause UI state.
  // NOTE(review): stopped AudioBufferSourceNodes cannot be restarted; resume
  // presumably re-creates sources from audioChunks at pauseTime — confirm in
  // the resume path (not visible in this section).
  const pausePlayback = () => {
    if (!audioContext || audioContext.state === 'closed') return;
    if (isPlaying) {
      pauseTime = audioContext.currentTime - startTime;
      scheduledSources.forEach(source => {
        try {
          source.stop();
        } catch (e) {
          // Already stopped
        }
      });
      scheduledSources = [];
      audioContext.suspend();
      isPaused = true;
      isPlaying = false;
      if (playPauseBtn) {
        playPauseBtn.innerHTML = PLAY_ICON_SVG;
      }
      if (animationFrameId) {
        cancelAnimationFrame(animationFrameId);
      }
    }
  };
  supertonicPlayerRecord = { audioContext: audioContext, pausePlayback: pausePlayback };
  // Remove old Supertonic player if exists and add new one
  customAudioPlayers = customAudioPlayers.filter(p => p !== supertonicPlayerRecord && p.audioContext !== audioContext);
  customAudioPlayers.push(supertonicPlayerRecord);
  // Pause all other players before starting Supertonic
  pauseAllPlayersExcept(supertonicPlayerRecord);
  // Create AudioBuffer directly from Float32Array - no WAV encode/decode
  const audioBuffer = createAudioBufferFromFloat32(audioData, sampleRate);
  audioChunks.push({ buffer: audioBuffer, duration: audioBuffer.duration });
  // Play first chunk immediately
  const source = audioContext.createBufferSource();
  source.buffer = audioBuffer;
  source.connect(audioContext.destination);
  source.start(audioContext.currentTime);
  scheduledSources.push(source);
  // Set next scheduled time for additional chunks
  nextScheduledTime = audioContext.currentTime + audioBuffer.duration + 0.3; // Add silence gap
  // Setup player controls
  playPauseBtn.addEventListener('click', togglePlayPause);
  progressBar.addEventListener('click', (e) => {
    // Translate click x-position into a 0-100 seek percentage
    const rect = progressBar.getBoundingClientRect();
    const percentage = ((e.clientX - rect.left) / rect.width) * 100;
    seekTo(percentage);
  });
  // Start progress animation
  updateProgress();
};

// Callback for each additional chunk - schedule seamlessly.
// Appends the chunk to the gapless playback queue and refreshes the
// duration / chars-per-second / RTF stats in the result panel.
const onChunkAdded = async (audioData, sampleRate, duration, chunkIndex, totalChunks, currentProcessingTime, processedChars) => {
  if (!audioContext) return;
  // Create AudioBuffer directly from Float32Array - no WAV encode/decode
  const audioBuffer = createAudioBufferFromFloat32(audioData, sampleRate);
  const chunkDuration = audioBuffer.duration;
  audioChunks.push({ buffer: audioBuffer, duration: chunkDuration });
  // Schedule the new chunk at the pre-calculated time
  const source = audioContext.createBufferSource();
  source.buffer = audioBuffer;
  source.connect(audioContext.destination);
  source.start(nextScheduledTime);
  scheduledSources.push(source);
  // Update next scheduled time for the next chunk
  nextScheduledTime = nextScheduledTime + audioBuffer.duration + 0.3; // Add silence gap
  // Update total duration
  totalDuration = duration;
  // Update duration display with smooth animation
  if (durationDisplay) {
    durationDisplay.textContent = formatTime(duration, { trimMobile: true });
    durationDisplay.style.transition = 'color 0.3s';
    durationDisplay.style.color = '#ffffff';
    setTimeout(() => { durationDisplay.style.color = ''; }, 300);
  }
  // Update info display
  const textLengthCandidate = currentGenerationTextLength > 0 ? currentGenerationTextLength : (demoTextInput.textContent || demoTextInput.innerText || '').trim().length;
  const textLength = textLengthCandidate;
  const isBatch = textLength >= getMaxChunkLength();
  const timeEl = document.getElementById('supertonic-time');
  const durationEl = document.getElementById('supertonic-duration');
  const cpsEl = document.getElementById('supertonic-cps');
  const rtfEl = document.getElementById('supertonic-rtf');
  const effectiveProcessedChars = typeof processedChars === 'number' ? processedChars : latestSupertonicProcessedChars;
  // Ignore callbacks that would move the progress counter backwards
  if (effectiveProcessedChars < latestSupertonicProcessedChars) { return; }
  const clampedProcessedChars = textLength > 0 ? Math.min(effectiveProcessedChars, textLength) : effectiveProcessedChars;
  const progressValue = textLength > 0 ? Math.min(100, (clampedProcessedChars / textLength) * 100) : 0;
  if (durationEl) { durationEl.textContent = formatTimeDetailed(duration); }
  if (timeEl && isBatch && firstChunkGenerationTime > 0 && currentProcessingTime) {
    // Batch mode shows "first-chunk time / cumulative time"
    const timeDisplay = `${formatTimeDetailed(firstChunkGenerationTime)} / ${formatTimeDetailed(currentProcessingTime)}`;
    timeEl.innerHTML = formatStatValueWithSuffix(timeDisplay, 's', { firstLabel: true });
  }
  if (cpsEl && currentProcessingTime > 0 && clampedProcessedChars >= 0) {
    const charsPerSec = (clampedProcessedChars / currentProcessingTime).toFixed(1);
    cpsEl.textContent = charsPerSec;
  }
  if (rtfEl && duration > 0 && currentProcessingTime > 0) {
    // RTF = processing time / audio duration (lower is faster than realtime)
    const rtf = (currentProcessingTime / duration).toFixed(3);
    rtfEl.innerHTML = formatStatValueWithSuffix(rtf, 'x');
  }
  const resultItemEl = document.getElementById('supertonic-result');
  if (resultItemEl) { resultItemEl.style.setProperty('--result-progress', `${progressValue}%`); }
  latestSupertonicProcessedChars = clampedProcessedChars;
};

// Start all syntheses simultaneously
const result = await generateSupertonicSpeechChunked( text, totalStep, durationFactor, onFirstChunkReady, onChunkAdded );
if (result.success) {
  // Final stats pass: pin progress to 100% and show totals
  const textLength = result.text ? result.text.length : 0;
  const isBatch = textLength >= getMaxChunkLength();
  const processingTimeStr = isBatch && firstChunkGenerationTime > 0 ? `${formatTimeDetailed(firstChunkGenerationTime)} / ${formatTimeDetailed(result.processingTime)}` : formatTimeDetailed(result.processingTime);
  const charsPerSec = result.processingTime > 0 ? (textLength / result.processingTime).toFixed(1) : '0.0';
  const progressValue = textLength > 0 ? 100 : 0;
  const timeEl = document.getElementById('supertonic-time');
  const durationEl = document.getElementById('supertonic-duration');
  const cpsEl = document.getElementById('supertonic-cps');
  const rtfEl = document.getElementById('supertonic-rtf');
  if (timeEl) timeEl.innerHTML = formatStatValueWithSuffix(processingTimeStr, 's', { firstLabel: true });
  if (durationEl) durationEl.textContent = formatTimeDetailed(result.audioDuration);
  latestSupertonicProcessedChars = textLength;
  if (cpsEl) cpsEl.textContent = charsPerSec;
  if (rtfEl) {
    const rtf = result.audioDuration > 0 ? (result.processingTime / result.audioDuration).toFixed(3) : '-';
    rtfEl.innerHTML = formatStatValueWithSuffix(rtf, 'x');
  }
  const resultItemEl = document.getElementById('supertonic-result');
  if (resultItemEl) { resultItemEl.style.setProperty('--result-progress', `${progressValue}%`); }
  // Final duration update (if custom player was used)
  if (audioContext && audioChunks.length > 0) {
    totalDuration = result.audioDuration;
    if (durationDisplay) {
      durationDisplay.textContent = formatTime(result.audioDuration, { trimMobile: true });
    }
  }
  // Always show download button
  const downloadBtn = document.getElementById('supertonic-download');
  if (downloadBtn) {
    downloadBtn.parentElement.style.display = 'block';
    downloadBtn.onclick = () => downloadDemoAudio(result.url, 'supertonic_speech.wav');
  }
}
} catch (error) {
  showDemoStatus(`Error: ${error.message}`, 'error');
  showDemoError(`Error during synthesis: ${error.message}`);
  console.error('Synthesis error:', error);
  // Restore placeholder
  demoResults.style.display = 'none';
  demoResults.innerHTML = `
🎙️

Your generated speech will appear here

`;
} finally {
  // Always clear the busy flag and re-enable the controls, even on error
  isGenerating = false;
  demoGenerateBtn.disabled = false;
  // Re-enable speaker selection after generation
  const speakerItemsForGeneration = document.querySelectorAll('.speaker-item[data-voice]');
  speakerItemsForGeneration.forEach(item => item.classList.remove('disabled'));
  // Re-enable language selection after generation
  const languageItemsForGeneration = document.querySelectorAll('.speaker-item[data-language]');
  languageItemsForGeneration.forEach(item => item.classList.remove('disabled'));
}
}

// Download handler (make it global)
window.downloadDemoAudio = function(url, filename) {
  const a = document.createElement('a');
  a.href = url;
  a.download = filename;
  a.click();
};

// Helper function to convert speed to durationFactor
// (higher speed => smaller duration factor; offset avoids division by zero)
function speedToDurationFactor(speed, offset=0.05) { return 1 / (speed + offset); }

// Update slider value displays
function updateSliderValues() {
  demoTotalStepsValue.textContent = demoTotalSteps.value + ' Steps';
  // Display speed with 'x' suffix (e.g., 1.0x, 0.7x, 1.5x)
  const speed = parseFloat(demoSpeed.value);
  demoSpeedValue.textContent = speed.toFixed(2) + 'x';
}

// Attach slider event listeners
demoTotalSteps.addEventListener('input', updateSliderValues);
demoSpeed.addEventListener('input', updateSliderValues);
// Initialize slider values
updateSliderValues();
// Attach generate function to button
demoGenerateBtn.addEventListener('click', generateSpeech);

// Preset text items (defined before input listener to share scope)
const presetItems = document.querySelectorAll('.preset-item[data-preset]');
// NOTE(review): freeformBtn is looked up but not referenced in this section —
// confirm it is used elsewhere or remove.
const freeformBtn = document.getElementById('freeformBtn');
let currentPreset = 'quote'; // Initialize with quote
// currentLanguage is already declared above (line 902)
let isPresetChanging = false; // Flag to track if text change is from preset button

// Helper function to update active button state
function updateActiveButton(presetType) {
  // Remove active from all preset items
  presetItems.forEach(item => item.classList.remove('active'));
  // Add active to the specified item
  if (presetType) {
    const targetItem = document.querySelector(`.preset-item[data-preset="${presetType}"]`);
    if (targetItem) {
      targetItem.classList.add('active');
    }
  }
  currentPreset = presetType;
  updateQuoteModeState(presetType === 'quote');
}

// Toggle the quote-mode styling on the results panel
function updateQuoteModeState(isQuote) {
  if (!demoResults) return;
  demoResults.classList.toggle('quote-mode', Boolean(isQuote));
}

// Initialize quote button active state
updateActiveButton('quote');
if (presetTexts.quote && typeof presetTexts.quote === 'object' && presetTexts.quote[currentLanguage]) {
  demoTextInput.textContent = presetTexts.quote[currentLanguage];
  updateCharCounter();
}
presetItems.forEach(item => {
  item.addEventListener('click', () => {
    const presetType = item.getAttribute('data-preset');
    if (presetType === 'freeform') {
      // Freeform item: clear text
      isPresetChanging = true;
      demoTextInput.textContent = '';
      updateCharCounter();
      updateActiveButton('freeform');
      isPresetChanging = false;
    } else {
      // Other preset items: set text
      const preset = presetTexts[presetType];
      if (preset && typeof preset === 'object' && preset[currentLanguage]) {
        const text = preset[currentLanguage];
        isPresetChanging = true;
        demoTextInput.textContent = text;
        updateCharCounter();
        updateActiveButton(presetType);
        isPresetChanging = false;
      } else if (preset && typeof preset === 'string') {
        // Fallback for old format (shouldn't happen, but just in case)
        isPresetChanging = true;
        demoTextInput.textContent = preset;
        updateCharCounter();
        updateActiveButton(presetType);
        isPresetChanging = false;
      }
    }
  });
});

// Handle paste event to remove styles and paste only text
demoTextInput.addEventListener('paste', (e) => {
  e.preventDefault();
  const text = (e.clipboardData || window.clipboardData).getData('text/plain');
  const selection = window.getSelection();
  if (!selection.rangeCount) return;
  const range = selection.getRangeAt(0);
  range.deleteContents();
  const textNode = document.createTextNode(text);
  range.insertNode(textNode);
  range.setStartAfter(textNode);
  range.collapse(true);
  selection.removeAllRanges();
  selection.addRange(range);
  // Trigger input event to update character counter
  demoTextInput.dispatchEvent(new Event('input', { bubbles: true }));
});

// Update character counter on input
let previousTextValue = demoTextInput.textContent || demoTextInput.innerText || '';

// Update left border line height to match demo-input-section height
const demoInputSection = document.querySelector('.demo-input-section');
function updateLeftBorderHeight() {
  if (demoInputSection) {
    const height = demoInputSection.offsetHeight;
    demoInputSection.style.setProperty('--demo-text-input-height', `${height}px`);
  }
}
// Initialize and observe height changes
updateLeftBorderHeight();
const resizeObserver = new ResizeObserver(() => { updateLeftBorderHeight(); });
if (demoInputSection) { resizeObserver.observe(demoInputSection); }

// Auto-calculate text input height for screens wider than 768px.
// Sums the heights/paddings/gaps of everything around the text area and gives
// the remainder of the viewport (minus a fixed reserve) to the input.
function calculateTextInputHeight() {
  if (window.innerWidth <= 768) {
    // Reset to default height for screens 768px and below
    demoTextInput.style.height = '';
    return;
  }
  const viewportHeight = window.innerHeight;
  const interactiveDemoEl = document.querySelector('.interactive-demo');
  const containerEl = document.querySelector('.container');
  const headerWrapperEl = document.querySelector('.demo-header-wrapper');
  const controlsEl = document.querySelector('.demo-controls');
  const inputLabelEl = document.querySelector('.demo-input-label');
  const presetRowEl = document.querySelector('#presetControlsRow');
  const outputSectionEl = document.querySelector('.demo-output-section');
  const contentEl = document.querySelector('.demo-content');
  // Get computed styles for gaps and paddings
  const interactiveDemoStyle = window.getComputedStyle(interactiveDemoEl || document.body);
  const containerStyle = window.getComputedStyle(containerEl || document.body);
  const contentStyle = window.getComputedStyle(contentEl || document.body);
  // Calculate total height of elements above and below text input
  let totalHeight = 0;
  // Interactive demo padding
  const interactiveDemoPaddingTop = parseFloat(interactiveDemoStyle.paddingTop) || 0;
  const interactiveDemoPaddingBottom = parseFloat(interactiveDemoStyle.paddingBottom) || 0;
  totalHeight += interactiveDemoPaddingTop + interactiveDemoPaddingBottom;
  // Container padding
  const containerPaddingTop = parseFloat(containerStyle.paddingTop) || 0;
  const containerPaddingBottom = parseFloat(containerStyle.paddingBottom) || 0;
  totalHeight += containerPaddingTop + containerPaddingBottom;
  // Header wrapper
  if (headerWrapperEl) { totalHeight += headerWrapperEl.offsetHeight; }
  // Demo controls
  if (controlsEl) { totalHeight += controlsEl.offsetHeight; }
  // Demo content gap (top)
  const contentGap = parseFloat(contentStyle.gap) || 0;
  totalHeight += contentGap;
  // Input label
  if (inputLabelEl) { totalHeight += inputLabelEl.offsetHeight; }
  // Preset controls row
  if (presetRowEl) { totalHeight += presetRowEl.offsetHeight; }
  // Demo content gap (bottom)
  totalHeight += contentGap;
  // Output section
  if (outputSectionEl) { totalHeight += outputSectionEl.offsetHeight; }
  // Calculate available height for text input
  const availableHeight = viewportHeight - totalHeight - 275; // Subtract 275px
  // Set minimum height (e.g., 200px) and maximum height
  const minHeight = 200;
  const maxHeight = availableHeight - 20; // 20px buffer
  if (availableHeight > minHeight) {
    demoTextInput.style.height = `${Math.max(minHeight, maxHeight)}px`;
  } else {
    demoTextInput.style.height = `${minHeight}px`;
  }
}
// Calculate on load and resize
calculateTextInputHeight();
window.addEventListener('resize', calculateTextInputHeight);
// Observe elements that might change height
const heightObserver = new ResizeObserver(() => { calculateTextInputHeight(); });
const headerWrapperEl = document.querySelector('.demo-header-wrapper');
const controlsEl = document.querySelector('.demo-controls');
const presetRowEl = document.querySelector('#presetControlsRow');
const outputSectionEl = document.querySelector('.demo-output-section');
if (headerWrapperEl) heightObserver.observe(headerWrapperEl);
if (controlsEl) heightObserver.observe(controlsEl);
if (presetRowEl) heightObserver.observe(presetRowEl);
if (outputSectionEl) heightObserver.observe(outputSectionEl);

// Auto-hide scrollbar functionality
let scrollbarTimeout;
demoTextInput.addEventListener('scroll', () => {
  // Add scrolling class to show scrollbar
  demoTextInput.classList.add('scrolling');
  // Clear existing timeout
  if (scrollbarTimeout) { clearTimeout(scrollbarTimeout); }
  // Hide scrollbar after 1.5 seconds of no scrolling
  scrollbarTimeout = setTimeout(() => { demoTextInput.classList.remove('scrolling'); }, 1500);
});

demoTextInput.addEventListener('input', () => {
  updateCharCounter();
  // If text was modified by user (not from preset button), switch to freeform
  const currentText = demoTextInput.textContent || demoTextInput.innerText || '';
  if (!isPresetChanging && currentText !== previousTextValue) {
    updateActiveButton('freeform');
  }
  if (currentPreset === 'freeform') {
    // Auto-detect language when user is typing (not from preset)
    const detectedLang = detectLanguage(currentText);
    if (detectedLang && detectedLang !== currentLanguage) {
      const previousLang = currentLanguage;
      currentLanguage = detectedLang;
      window.updateActiveLanguage(currentLanguage);
      showLanguageToast(previousLang, detectedLang);
    }
  }
  previousTextValue = currentText;
});

// Update font size when window is resized (for responsive width-based font sizing)
let resizeTimeout;
window.addEventListener('resize', () => {
  clearTimeout(resizeTimeout);
  resizeTimeout = setTimeout(() => { updateCharCounter(); }, 100); // 100ms debounce
});
// Initialize character counter
updateCharCounter();

// Speaker list handler (replaces voice select dropdown)
const speakerList = document.getElementById('speakerList');
const speakerItems = speakerList ? speakerList.querySelectorAll('.speaker-item[data-voice]') : [];
const createVoiceBtn = document.getElementById('createVoiceBtn');
const comingSoonModal = document.getElementById('comingSoonModal');
const comingSoonCloseBtn = document.getElementById('comingSoonCloseBtn');
let voiceSelectDisabled = false;

// Update active speaker item (global function for use in switchVoice)
window.updateActiveSpeaker = function(voice) {
  if (!speakerList || !speakerItems) return;
  speakerItems.forEach(item => {
    if (item.dataset.voice === voice) {
      item.classList.add('active');
    } else {
      item.classList.remove('active');
    }
  });
};
// Initialize active speaker
if (speakerList && speakerItems.length > 0) {
  window.updateActiveSpeaker(currentVoice);
}

// Handle speaker item clicks and hover tooltips
const speakerTooltip = document.getElementById('speakerTooltip');
if (speakerList) {
  speakerItems.forEach(item => {
    // Track if click was triggered by touch event (to prevent double execution)
    let clickFromTouch = false;
    // Click handler
    item.addEventListener('click', async (e) => {
      // On touch devices with mobile viewport, ignore native click events (we'll trigger manually from touchend)
      // PC (even with narrow viewport) should always handle clicks
      if (isTouchDevice() && isMobileViewport() && !clickFromTouch) { return; }
      // Reset flag
      clickFromTouch = false;
      if (voiceSelectDisabled || modelsLoading || isGenerating) return;
      const selectedVoice = item.dataset.voice;
      // If already selected, just auto-generate and play
      if (selectedVoice === currentVoice) {
        const text = (demoTextInput.textContent || demoTextInput.innerText || '').trim();
        if (text.length >= 10 && !isGenerating && models && cfgs && processors) {
          generateSpeech();
        }
        return;
      }
      // Disable all controls while loading
      const wasDisabled = demoGenerateBtn.disabled;
      demoGenerateBtn.disabled = true;
      voiceSelectDisabled = true;
      // Update UI immediately
      window.updateActiveSpeaker(selectedVoice);
      try {
        await switchVoice(selectedVoice);
        // Re-enable if models are loaded
        if (models && cfgs && processors) {
          demoGenerateBtn.disabled = false;
          voiceSelectDisabled = false;
          // Auto-generate and play after voice change
          const text = (demoTextInput.textContent || demoTextInput.innerText || '').trim();
          if (text.length >= 10 && !isGenerating) {
            generateSpeech();
          }
        }
      } catch (error) {
        console.error('Failed to switch voice:', error);
        // Revert selection on error
        window.updateActiveSpeaker(currentVoice);
        voiceSelectDisabled = false;
        if (!wasDisabled) demoGenerateBtn.disabled = false;
      }
    });
    // Hover handler for tooltip
    if (speakerTooltip) {
      // Desktop hover events
      item.addEventListener('mouseenter', (e) => {
        if (isTouchDevice() && isMobileViewport()) return; // Skip on touch devices with mobile viewport
        const voice = item.dataset.voice;
        if (voice && VOICE_DESCRIPTIONS[voice]) {
          speakerTooltip.textContent = VOICE_DESCRIPTIONS[voice];
          speakerTooltip.style.display = 'block';
          updateTooltipPosition(e, speakerTooltip);
        }
      });
      item.addEventListener('mousemove', (e) => {
        if (isTouchDevice() && isMobileViewport()) return; // Skip on touch devices with mobile viewport
        if (speakerTooltip.style.display === 'block') {
          updateTooltipPosition(e, speakerTooltip);
        }
      });
      item.addEventListener('mouseleave', () => {
        if (isTouchDevice() && isMobileViewport()) return; // Skip on touch devices with mobile viewport
        speakerTooltip.style.display = 'none';
      });
      // Mobile touch events: show the tooltip on touch-hold and distinguish
      // a short tap (forwarded as a synthetic click) from a scroll/long-press.
      let touchStartTime = 0;
      let touchHandled = false;
      let touchStartY = 0;
      const TOUCH_MOVE_THRESHOLD = 10; // pixels
      item.addEventListener('touchstart', (e) => {
        if (!isTouchDevice() || !isMobileViewport()) return;
        touchHandled = false;
        const touch = e.touches[0];
        touchStartTime = Date.now();
        touchStartY = touch.clientY;
        const voice = item.dataset.voice;
        if (voice && VOICE_DESCRIPTIONS[voice]) {
          // Prevent default to block text selection
          e.preventDefault();
          // Show tooltip with mobile styling
          speakerTooltip.textContent = VOICE_DESCRIPTIONS[voice];
          speakerTooltip.style.display = 'block';
          updateTooltipPositionMobile(speakerTooltip, touch.clientY);
        }
      }, { passive: false }); // passive:false so preventDefault() is honored
      item.addEventListener('touchmove', (e) => {
        if (!isTouchDevice() || !isMobileViewport()) return;
        const touch = e.touches[0];
        const deltaY = Math.abs(touch.clientY - touchStartY);
        // Check if touch moved significantly
        if (deltaY > TOUCH_MOVE_THRESHOLD) {
          touchHandled = true;
          // Hide tooltip if user moves finger
          speakerTooltip.style.display = 'none';
        }
        // Prevent default to avoid scrolling while showing tooltip
        e.preventDefault();
      }, { passive: false });
      item.addEventListener('touchend', (e) => {
        if (!isTouchDevice() || !isMobileViewport()) return;
        const touchEndTime = Date.now();
        const touchDuration = touchEndTime - touchStartTime;
        // Hide tooltip
        speakerTooltip.style.display = 'none';
        // Always prevent default to avoid text selection
        e.preventDefault();
        // Only allow click if it was a short tap without movement
        if (!touchHandled && touchDuration < 500) {
          // Short tap - trigger click event manually after a small delay
          clickFromTouch = true;
          setTimeout(() => {
            const clickEvent = new MouseEvent('click', { bubbles: true, cancelable: true, view: window });
            item.dispatchEvent(clickEvent);
          }, 50);
        } else {
          // Long press or moved - prevent click
          touchHandled = true;
          e.stopPropagation();
        }
      }, { passive: false });
      item.addEventListener('touchcancel', (e) => {
        if (!isTouchDevice() || !isMobileViewport()) return;
        // Hide tooltip
        speakerTooltip.style.display = 'none';
        touchHandled = true;
        // Prevent default
        e.preventDefault();
      }, { passive: false });
      // Prevent context menu (long press menu)
      item.addEventListener('contextmenu', (e) => {
        if (isTouchDevice() && isMobileViewport()) {
          e.preventDefault();
          return false;
        }
      });
    }
  });
}

// Function to update tooltip position (40px above mouse pointer),
// then nudged back inside the viewport if it overflows any edge.
function updateTooltipPosition(event, tooltip) {
  const x = event.clientX;
  const y = event.clientY - 40; // 40px above mouse pointer
  tooltip.style.left = x + 'px';
  tooltip.style.top = y + 'px';
  // Adjust if tooltip goes off screen
  const tooltipRect = tooltip.getBoundingClientRect();
  const windowWidth = window.innerWidth;
  const windowHeight = window.innerHeight;
  if (tooltipRect.right > windowWidth) {
    tooltip.style.left = (windowWidth - tooltipRect.width - 10) + 'px';
  }
  if (tooltipRect.left < 0) {
    tooltip.style.left = '10px';
  }
  if (tooltipRect.top < 0) {
    // Flip below the pointer when there is no room above
    tooltip.style.top = (event.clientY + 40) + 'px';
  }
  if (tooltipRect.bottom > windowHeight) {
    tooltip.style.top = (windowHeight - tooltipRect.height - 10) + 'px';
  }
}

// Function to update tooltip position for mobile (centered, 75px above touch point)
function updateTooltipPositionMobile(tooltip, touchY) {
  const windowWidth = window.innerWidth;
  const windowHeight = window.innerHeight;
  // NOTE(review): windowWidth is computed but unused here — the 90%/5% styles
  // below handle horizontal centering; confirm before removing.
  // Set mobile-specific styles
  tooltip.style.width = '90%';
  tooltip.style.left = '5%'; // Center: (100% - 90%) / 2 = 5%
  tooltip.style.right = 'auto';
  tooltip.style.marginLeft = '0';
  tooltip.style.marginRight = '0';
  tooltip.style.whiteSpace = 'normal';
  tooltip.style.textAlign = 'center';
  // Position tooltip 75px above touch point (60px + 15px)
  const y = touchY - 75;
  tooltip.style.top = y + 'px';
  // Adjust if tooltip goes off screen
  const tooltipRect = tooltip.getBoundingClientRect();
  if (tooltipRect.top < 10) {
    // If tooltip goes above viewport, position it below touch point instead
    tooltip.style.top = (touchY + 20) + 'px';
  }
  if (tooltipRect.bottom > windowHeight - 10) {
    tooltip.style.top = (windowHeight - tooltipRect.height - 10) + 'px';
  }
}

// Handle "Create your own voice" button
if (createVoiceBtn && comingSoonModal) {
  createVoiceBtn.addEventListener('click', () => { comingSoonModal.classList.add('show'); });
}
// Close modal handlers
if (comingSoonCloseBtn && comingSoonModal) {
  comingSoonCloseBtn.addEventListener('click', () => { comingSoonModal.classList.remove('show'); });
}
if (comingSoonModal) {
  const overlay = comingSoonModal.querySelector('.coming-soon-modal-overlay');
  if (overlay) {
    overlay.addEventListener('click', () => { comingSoonModal.classList.remove('show'); });
  }
}

// Language selection handler
const languageList = document.getElementById('languageList');
const languageItems = languageList ? languageList.querySelectorAll('.speaker-item[data-language]') : [];

// Update active language item (global function for use in language change)
window.updateActiveLanguage = function(language) {
  if (!languageList || !languageItems) return;
  languageItems.forEach(item => {
    if (item.dataset.language === language) {
      item.classList.add('active');
    } else {
      item.classList.remove('active');
    }
  });
};
// Initialize active language
if (languageList && languageItems.length > 0) {
  window.updateActiveLanguage(currentLanguage);
}

// Handle language item clicks
if (languageList) {
  languageItems.forEach(item => {
    item.addEventListener('click', async (e) => {
      // Don't allow language change during model loading or generation
      if (modelsLoading || isGenerating) return;
      const selectedLanguage = item.dataset.language;
      // If already selected, just auto-generate and play
      if (selectedLanguage === currentLanguage) {
        const text = (demoTextInput.textContent || demoTextInput.innerText || '').trim();
        if (text.length >= 10 && !isGenerating && models && cfgs && processors) {
          generateSpeech();
        }
        return;
      }
      // Update language
      currentLanguage = selectedLanguage;
      window.updateActiveLanguage(currentLanguage);
      // Update text if we're on a preset (not freeform)
      if (currentPreset && currentPreset !== 'freeform' && presetTexts[currentPreset]) {
        const preset = presetTexts[currentPreset];
        if (preset && typeof preset === 'object' && preset[currentLanguage]) {
          isPresetChanging = true;
          demoTextInput.textContent = preset[currentLanguage];
          updateCharCounter();
          isPresetChanging = false;
        }
      }
      // Auto-generate and play after language change
      // Wait a bit for UI to update
      await new Promise(resolve => setTimeout(resolve, 100));
      const text = (demoTextInput.textContent || demoTextInput.innerText || '').trim();
      if (text.length >= 10 && !isGenerating && models && cfgs && processors) {
        generateSpeech();
      }
    });
  });
}

// Title animation setup
const demoTitleLeft = document.querySelector('.demo-title-left');
const demoTitleRight = document.querySelector('.demo-title-right');
const demoOutputSection = document.querySelector('.demo-output-section');
// Initialize Text with letters wrapped in spans
// NOTE(review): the per-letter wrapper markup appears to have been stripped
// from this copy (the click handler below queries '.letter' spans that this
// map no longer produces) — confirm against the original source.
if (demoTitleLeft) {
  const text = demoTitleLeft.textContent.trim();
  demoTitleLeft.innerHTML = text.split('').map(char => char === ' ' ? ' ' : `${char}` ).join('');
}
// Text animation on demo-input-section click
if (demoInputSection && demoTitleLeft) {
  demoInputSection.addEventListener('click', () => {
    const letters = demoTitleLeft.querySelectorAll('.letter');
    // Reset all letters
    letters.forEach(letter => { letter.classList.remove('visible'); });
    // Show letters one by one (total 0.25s = 0.125s / 2)
    letters.forEach((letter, index) => {
      setTimeout(() => { letter.classList.add('visible'); }, index * 0.0625 * 1000); // 0.0625s delay between each letter
    });
  });
}
// Speech animation on demo-output-section click
if (demoOutputSection && demoTitleRight) {
  demoOutputSection.addEventListener('click', (event) => {
    if (event.target.closest('#demoGenerateBtn')) { return; }
    demoTitleRight.classList.remove('animate-speech');
    // Trigger reflow so re-adding the class restarts the CSS animation
    void demoTitleRight.offsetWidth;
    demoTitleRight.classList.add('animate-speech');
  });
}
// Initialize models
initializeModels();
})();