Spaces:
Running
Running
| import * as ort from 'onnxruntime-web'; | |
// Preset texts supplied by the hosting page through `window.presetTexts`.
// The `typeof` guard keeps this module importable where no `window` global
// exists (the same kind of guard is used for `document` and `window` further
// down in this file); in a browser the result is unchanged.
const presetTexts = (typeof window !== 'undefined' && window.presetTexts) || {};

// Inline SVG markup (24x24, filled with `currentColor`, hidden from assistive
// technology) for the play, pause and stop icons.
const PLAY_ICON_SVG = `<svg width="24" height="24" viewBox="0 0 24 24" fill="currentColor" aria-hidden="true" focusable="false"><path d="M8 5v14l11-7-11-7z"></path></svg>`;
const PAUSE_ICON_SVG = `<svg width="24" height="24" viewBox="0 0 24 24" fill="currentColor" aria-hidden="true" focusable="false"><path d="M8 6h3v12H8V6zm5 0h3v12h-3V6z"></path></svg>`;
const STOP_ICON_SVG = `<svg width="24" height="24" viewBox="0 0 24 24" fill="currentColor" aria-hidden="true" focusable="false"><path d="M7 7h10v10H7V7z"></path></svg>`;
// Lightning background effect: after a random delay of up to 10 seconds,
// flashes the `lightning-flicker` class on <body> one or two times, then
// schedules the next round. Does nothing when there is no `document`.
(function initLightningParallax() {
  if (typeof document === 'undefined') {
    return;
  }

  /**
   * Toggles `className` on <body> one or two times (picked at random), then
   * calls `onComplete` if it is a function.
   */
  function runBlink(className, onComplete) {
    let flashesLeft = 1 + Math.round(Math.random());

    function flashOnce() {
      const finished = flashesLeft <= 0;
      flashesLeft -= 1;
      if (finished) {
        if (typeof onComplete === 'function') {
          onComplete();
        }
        return;
      }

      // The same delay is used for the "on" phase and for the pause before
      // the next flash.
      const delayMs = 20 + Math.random() * 80;
      document.body.classList.add(className);
      setTimeout(() => {
        document.body.classList.remove(className);
        setTimeout(flashOnce, delayMs);
      }, delayMs);
    }

    flashOnce();
  }

  function schedule() {
    const idleMs = Math.random() * 10000;
    setTimeout(() => runBlink('lightning-flicker', schedule), idleMs);
  }

  schedule();
})();
/**
 * Escapes the five HTML-significant characters (& < > " ') so that `value`
 * can be interpolated into markup as text or as a quoted attribute value.
 *
 * @param {*} value - Value to escape; non-strings are converted with String().
 * @returns {string} The escaped text.
 */
function escapeHtml(value) {
  const entityByChar = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  return String(value).replace(/[&<>"']/g, (match) => entityByChar[match]);
}
/**
 * Produces a human-readable message for any thrown value.
 *
 * @param {*} error - The caught value (Error, string, object, ...).
 * @returns {string} Always a string: the Error message (or its name when the
 *   message is empty), the string itself, 'Unknown error' for null/undefined,
 *   otherwise a JSON or String() rendering of the value.
 */
function getErrorMessage(error) {
  if (error instanceof Error) {
    // An Error with an empty message would otherwise serialise to '{}'.
    return error.message || error.name || 'Unknown error';
  }
  if (typeof error === 'string') {
    return error;
  }
  if (error === undefined || error === null) {
    return 'Unknown error';
  }
  try {
    // JSON.stringify returns undefined (not a string) for functions and
    // symbols, so only accept an actual string result.
    const serialized = JSON.stringify(error);
    if (typeof serialized === 'string') {
      return serialized;
    }
  } catch (_) {
    // Circular structures and BigInt values throw; fall through to String().
  }
  return String(error);
}
/**
 * Renders a stat value as HTML, attaching a unit suffix to each number.
 *
 * - null/undefined values render as ''.
 * - Without a suffix the value is returned HTML-escaped and untrimmed.
 * - Placeholder values ('', '--', '-', 'error' in any case) are returned
 *   trimmed and escaped, without a suffix.
 * - A value containing '/' is split into segments that are rendered
 *   separately and joined with ' / '; with `options.firstLabel` set, the
 *   first segment also gets a "First" prefix label.
 *
 * @param {*} value - Raw stat value.
 * @param {string} suffix - Unit label appended to each segment.
 * @param {{firstLabel?: boolean}} [options]
 * @returns {string} HTML markup.
 */
function formatStatValueWithSuffix(value, suffix, options = {}) {
  const showFirstLabel = options.firstLabel === undefined ? false : options.firstLabel;

  if (value === null || value === undefined) {
    return '';
  }

  const text = `${value}`;
  if (!suffix) {
    return escapeHtml(text);
  }

  const trimmedText = text.trim();
  const isPlaceholder =
    trimmedText === '' ||
    trimmedText === '--' ||
    trimmedText === '-' ||
    trimmedText.toLowerCase() === 'error';
  if (isPlaceholder) {
    return escapeHtml(trimmedText);
  }

  // Renders one "<number><suffix>" segment; empty segments render as ''.
  const renderSegment = (segment, isFirstSegment) => {
    const content = segment.trim();
    if (content === '') {
      return '';
    }
    const withPrefix = isFirstSegment && showFirstLabel;
    const numberSpan = `<span class="stat-value-number">${escapeHtml(content)}</span>`;
    const unitSpan = `<span class="stat-label stat-suffix">${escapeHtml(suffix)}</span>`;
    let prefixSpan = '';
    let wrapperClass = 'stat-value-segment';
    if (withPrefix) {
      prefixSpan = `<span class="stat-label stat-suffix stat-prefix">First</span>`;
      wrapperClass = 'stat-value-segment has-prefix';
    }
    return `<span class="${wrapperClass}">${prefixSpan}${numberSpan}${unitSpan}</span>`;
  };

  if (!trimmedText.includes('/')) {
    return renderSegment(trimmedText, false);
  }
  return trimmedText
    .split('/')
    .map((part, position) => renderSegment(part, position === 0))
    .join(' / ');
}
/**
 * Unicode text processor.
 *
 * Converts a batch of texts into padded rows of token ids by looking each
 * character's Unicode value up in `indexer`.
 */
export class UnicodeProcessor {
  /**
   * @param {Object|Array} indexer - Lookup from Unicode value to token id.
   *   Entries that are missing, null or -1 are treated as unsupported.
   */
  constructor(indexer) {
    this.indexer = indexer;
  }

  /**
   * Preprocesses and indexes a batch of texts.
   *
   * @param {string[]} textList - Raw input texts.
   * @param {string|null} [lang=null] - Language code passed to preprocessText.
   * @returns {{textIds: number[][], textMask: number[][][], unsupportedChars: string[]}}
   *   `textIds` rows are zero-padded to the longest text; `textMask` comes
   *   from getTextMask; `unsupportedChars` lists each character that had no
   *   usable indexer entry (those positions are encoded as 0).
   */
  call(textList, lang = null) {
    const processedTexts = textList.map((t) => preprocessText(t, lang));

    // Ids are produced one per code point, so lengths and character lookups
    // must be per code point too. Using the UTF-16 `.length` / string index
    // would disagree with the id rows whenever a surrogate pair is present.
    const charRows = processedTexts.map((t) => Array.from(t));
    const valueRows = processedTexts.map((t) => textToUnicodeValues(t));
    const textIdsLengths = valueRows.map((values) => values.length);
    const maxLen = Math.max(0, ...textIdsLengths);

    const unsupportedChars = new Set();
    const textIds = valueRows.map((values, i) => {
      const row = new Array(maxLen).fill(0);
      values.forEach((unicodeValue, j) => {
        const indexValue = this.indexer[unicodeValue];
        // Check if character is supported (not -1, undefined, or null)
        if (indexValue === undefined || indexValue === null || indexValue === -1) {
          unsupportedChars.add(charRows[i][j]);
          row[j] = 0; // Use 0 as fallback
        } else {
          row[j] = indexValue;
        }
      });
      return row;
    });

    const textMask = getTextMask(textIdsLengths);
    return { textIds, textMask, unsupportedChars: Array.from(unsupportedChars) };
  }
}
// Language codes accepted by preprocessText and used when picking the
// initial UI language (31 entries).
const AVAILABLE_LANGS = [
  'en', 'ko', 'ja', 'ar', 'bg', 'cs', 'da', 'de',
  'el', 'es', 'et', 'fi', 'fr', 'hi', 'hr', 'hu',
  'id', 'it', 'lt', 'lv', 'nl', 'pl', 'pt', 'ro',
  'ru', 'sk', 'sl', 'sv', 'tr', 'uk', 'vi',
];
/**
 * Language detection: Unicode script first, then Latin scoring.
 *
 * Strategy:
 *   1. Detect non-Latin scripts via Unicode block matches (high confidence).
 *      Hangul -> ko, Hiragana/Katakana -> ja, Arabic -> ar, Devanagari -> hi,
 *      Greek -> el, Cyrillic -> bg/ru/uk via stopword tiebreak.
 *   2. For Latin script, score against a curated set of distinctive
 *      characters, stopwords and n-grams for the most common languages we
 *      cover. Languages without a high-signal scoring rule (e.g. et, fi, hu,
 *      sk, sl, ro, lt, lv, hr, id) fall through; the caller then uses the
 *      dropdown selection.
 *
 * Only the last 200 characters of `text` are examined.
 *
 * The model itself is language-agnostic, so misdetection is graceful:
 * the wrong tag still produces understandable speech.
 *
 * @param {string} text - Text to classify.
 * @returns {string|null} A language code, or null when the trimmed text is
 *   shorter than 3 characters or the best Latin score is below 4.
 */
export function detectLanguage(text) {
  if (!text || text.trim().length < 3) {
    return null;
  }
  const sampleText = text.length > 200 ? text.substring(text.length - 200) : text;
  const normalizedText = sampleText.normalize('NFC').toLowerCase();

  // 1) Non-Latin scripts via Unicode blocks (definitive)
  if (/[\uAC00-\uD7AF\u1100-\u11FF\u3130-\u318F]/.test(normalizedText)) return 'ko';
  if (/[\u3040-\u30FF]/.test(normalizedText)) return 'ja'; // Hiragana/Katakana
  if (/[\u0600-\u06FF\uFB50-\uFDFF\uFE70-\uFEFF]/.test(normalizedText)) return 'ar';
  if (/[\u0900-\u097F]/.test(normalizedText)) return 'hi'; // Devanagari
  if (/[\u0370-\u03FF]/.test(normalizedText)) return 'el'; // Greek

  // Cyrillic: distinguish bg/ru/uk via stopwords + distinctive letters
  if (/[\u0400-\u04FF]/.test(normalizedText)) {
    const cyrText = ` ${normalizedText} `;
    // +2 for every stopword that occurs as a space-delimited word.
    const stopwordScore = (stopwords) =>
      stopwords.reduce((sum, w) => (cyrText.includes(` ${w} `) ? sum + 2 : sum), 0);

    let ru = stopwordScore(['\u0438', '\u043D\u0435', '\u044D\u0442\u043E', '\u0447\u0442\u043E', '\u043A\u0430\u043A', '\u0434\u043B\u044F', '\u043D\u043E', '\u0432\u0441\u0435']);
    let uk = stopwordScore(['\u0442\u0430', '\u043D\u0435', '\u0449\u043E', '\u044F\u043A', '\u0434\u043B\u044F', '\u0430\u043B\u0435', '\u0432\u0441\u0435', '\u0430\u0431\u043E']);
    let bg = stopwordScore(['\u0438', '\u043D\u0430', '\u043D\u0435', '\u0447\u0435', '\u043A\u0430\u0442\u043E', '\u0441\u044A\u0441', '\u0441\u044A\u0449\u043E']);

    if (/[\u0456\u0457\u0454\u0491]/.test(normalizedText)) uk += 6;
    if (/\u044A/.test(normalizedText)) bg += 4;
    if (/[\u044B\u044D\u0451]/.test(normalizedText)) ru += 4;

    if (uk > ru && uk > bg) return 'uk';
    if (bg > ru && bg > uk) return 'bg';
    return 'ru'; // default, including ties
  }

  // 2) Latin script scoring
  const scores = { en: 0, es: 0, fr: 0, pt: 0, de: 0, it: 0, nl: 0, pl: 0, sv: 0, da: 0, tr: 0, vi: 0 };

  // 2a) Highly distinctive characters
  if (/\u00F1/.test(normalizedText)) scores.es += 15;
  if (/[\u00BF\u00A1]/.test(normalizedText)) scores.es += 12;
  if (/\u00E3/.test(normalizedText)) scores.pt += 15;
  if (/\u00F5/.test(normalizedText)) scores.pt += 15;
  if (/\u0153/.test(normalizedText)) scores.fr += 15;
  if (/[\u00F9\u00FB]/.test(normalizedText)) scores.fr += 10;
  if (/\u00E7/.test(normalizedText)) { scores.fr += 4; scores.pt += 4; scores.tr += 4; }
  if (/[\u00E8\u00EA\u00EB]/.test(normalizedText)) scores.fr += 5;
  if (/[\u00E0\u00E2]/.test(normalizedText)) scores.fr += 3;
  if (/[\u00EE\u00EF]/.test(normalizedText)) scores.fr += 4;
  if (/\u00F4/.test(normalizedText)) scores.fr += 3;
  if (/\u00DF/.test(normalizedText)) scores.de += 15;
  if (/[\u00E4\u00F6\u00FC]/.test(normalizedText)) { scores.de += 4; scores.sv += 2; }
  if (/\u00E5/.test(normalizedText)) { scores.sv += 8; scores.da += 8; }
  if (/[\u00E6\u00F8]/.test(normalizedText)) scores.da += 12;
  if (/[\u0105\u0119\u0107\u0142\u0144\u015B\u017A\u017C]/.test(normalizedText)) scores.pl += 12;
  if (/[\u011F\u015F\u0131\u0130]/.test(normalizedText)) scores.tr += 12;
  if (/[\u01A1\u01B0\u0103\u0111]/.test(normalizedText)) scores.vi += 12;
  // Vietnamese tone-marked vowels. Only letters specific to Vietnamese count
  // here (U+1EA0-U+1EF9 plus i/u with tilde and o/u with horn). Plain Latin-1
  // accented vowels are shared with Spanish, French, Portuguese and Italian,
  // and scoring them made short texts in those languages detect as 'vi'.
  if (/[\u1EA0-\u1EF9\u0129\u0169\u01A1\u01B0]/.test(normalizedText)) scores.vi += 6;

  // 2b) Exclusive stopwords (highest signal per language): +3 per occurrence
  const exclusiveWords = {
    en: ['the', 'is', 'are', 'was', 'were', 'have', 'has', 'been', 'will', 'would', 'this', 'that', 'with', 'from', 'they', 'what', 'which', 'there', 'their', 'about', 'these', 'other', 'into', 'just', 'your', 'some', 'than', 'them', 'then', 'only', 'being', 'through', 'after', 'before'],
    es: ['el', 'los', 'las', 'est\u00E1', 'est\u00E1n', 'porque', 'pero', 'muy', 'tambi\u00E9n', 'm\u00E1s', 'este', 'esta', 'estos', 'estas', 'ese', 'esa', 'nosotros', 'ellos', 'ellas', 'hola', 'gracias', 'ahora', 'siempre', 'nunca'],
    fr: ['le', 'les', 'est', 'sont', 'dans', 'ce', 'cette', 'ces', 'elle', 'ils', 'elles', 'nous', 'vous', 'avec', 'sur', 'pas', 'plus', 'tout', 'bien', 'fait', '\u00EAtre', 'avoir', 'donc', 'car', 'jamais', 'toujours', 'aussi', 'tr\u00E8s'],
    pt: ['os', 'as', 's\u00E3o', 'est\u00E3o', 'n\u00E3o', 'na', 'no', 'da', 'do', 'das', 'dos', 'ao', 'aos', 'ele', 'ela', 'eles', 'elas', 'n\u00F3s', 'voc\u00EA', 'voc\u00EAs', 'seu', 'sua', 'muito', 'tamb\u00E9m', 'foi', 'mesmo', 'at\u00E9', 'isso', 'ol\u00E1', 'obrigado', 'obrigada'],
    de: ['der', 'die', 'das', 'und', 'ist', 'sind', 'nicht', 'ich', 'wir', 'sie', 'er', 'mit', 'f\u00FCr', 'auf', 'eine', 'einen', 'einem', 'auch', 'aber', 'doch', 'noch', 'nur', 'sehr', 'so', 'oder', 'wenn', 'weil', 'als'],
    it: ['il', 'la', 'gli', 'le', '\u00E8', 'sono', 'non', 'che', 'di', 'per', 'con', 'una', 'uno', 'noi', 'voi', 'loro', 'questo', 'questa', 'anche', 'ma', 'pi\u00F9', 'molto', 'sempre', 'mai'],
    nl: ['de', 'het', 'een', 'en', 'is', 'zijn', 'niet', 'van', 'voor', 'met', 'op', 'aan', 'om', 'maar', 'ook', 'wel', 'nog', 'als', 'dan', 'wat', 'wie', 'hoe', 'omdat', 'altijd', 'nooit'],
    pl: ['jest', 's\u0105', 'nie', 'si\u0119', 'tak', 'czy', 'ale', 'oraz', 'jak', 'tym', 'tego', 'tej', 'jeszcze', 'tylko', 'bardzo', 'zawsze', 'nigdy'],
    sv: ['\u00E4r', 'och', 'inte', 'det', 'att', 'f\u00F6r', 'p\u00E5', 'med', 'som', 'jag', 'vi', 'ni', 'de', 'eller', 'men', 'ocks\u00E5', 'alltid', 'aldrig', 'bara'],
    da: ['er', 'og', 'ikke', 'det', 'at', 'for', 'p\u00E5', 'med', 'som', 'jeg', 'vi', 'de', 'eller', 'men', 'ogs\u00E5', 'altid', 'aldrig', 'bare'],
    tr: ['ve', 'ile', 'i\u00E7in', 'bir', 'bu', '\u015Fu', 'de\u011Fil', 'gibi', '\u00E7ok', 'ama', 'her', 'hi\u00E7', 'yine', 'daha'],
    vi: ['v\u00E0', 'l\u00E0', 'c\u1EE7a', 'kh\u00F4ng', 'm\u1ED9t', 'nh\u1EEFng', 'n\u00E0y', '\u0111\u00F3', 'c\u0169ng', 'v\u1EDBi', 'nh\u01B0', '\u0111\u1EC3', 'nh\u01B0ng', 'r\u1EA5t', 'lu\u00F4n', 'bao']
  };
  const words = normalizedText.match(/[\p{Letter}']+/gu) || [];
  for (const word of words) {
    for (const [lang, wordList] of Object.entries(exclusiveWords)) {
      if (wordList.includes(word)) {
        scores[lang] += 3;
      }
    }
  }

  // 2c) Common n-grams (character patterns): +2 per match
  const ngramPatterns = {
    en: [/th/g, /ing/g, /tion/g, /ight/g, /ould/g],
    es: [/ción/g, /mente/g, /ado/g, /ido/g],
    fr: [/tion/g, /ment/g, /eau/g, /aux/g, /eux/g, /oir/g, /ais/g, /ait/g, /ont/g],
    pt: [/ção/g, /ões/g, /mente/g, /ado/g, /ido/g, /nh/g, /lh/g],
    de: [/sch/g, /chen/g, /lich/g, /ung/g, /ein/g],
    it: [/zione/g, /mente/g, /ono/g, /are/g, /ere/g],
    nl: [/sch/g, /eer/g, /ijk/g, /aar/g],
    pl: [/cz/g, /sz/g, /rz/g, /dzie/g],
    sv: [/skt/g, /tion/g],
    da: [/skt/g, /tion/g, /tt/g],
    tr: [/lar/g, /ler/g, /siz/g, /lik/g],
    vi: [/ng/g, /nh/g, /th/g]
  };
  for (const [lang, patterns] of Object.entries(ngramPatterns)) {
    for (const pattern of patterns) {
      const matches = normalizedText.match(pattern) || [];
      scores[lang] += matches.length * 2;
    }
  }

  // 2d) French apostrophe contractions: +5 per match
  const frenchContractions = /[cdjlmnst]'[aeiouéèêàâîïôûù]/g;
  const frenchContractionMatches = normalizedText.match(frenchContractions) || [];
  scores.fr += frenchContractionMatches.length * 5;

  // 2e) Definite-article anchors
  if (/\bthe\b/.test(normalizedText)) scores.en += 5;
  if (/\b(el|los)\b/.test(normalizedText)) scores.es += 4;
  if (/\b(le|les)\b/.test(normalizedText)) scores.fr += 4;
  if (/\b(o|os)\b/.test(normalizedText)) scores.pt += 3;
  if (/\b(der|die|das)\b/.test(normalizedText)) scores.de += 5;

  // Pick winner with confidence threshold; on a tie the language listed
  // first in `scores` wins.
  let maxScore = 0;
  let detectedLang = null;
  for (const [lang, score] of Object.entries(scores)) {
    if (score > maxScore) {
      maxScore = score;
      detectedLang = lang;
    }
  }
  return maxScore >= 4 ? detectedLang : null;
}
// English display names keyed by language code, for the toast notification
// (31 languages).
const LANGUAGE_NAMES = {
  en: 'English',
  ko: 'Korean',
  ja: 'Japanese',
  ar: 'Arabic',
  bg: 'Bulgarian',
  cs: 'Czech',
  da: 'Danish',
  de: 'German',
  el: 'Greek',
  es: 'Spanish',
  et: 'Estonian',
  fi: 'Finnish',
  fr: 'French',
  hi: 'Hindi',
  hr: 'Croatian',
  hu: 'Hungarian',
  id: 'Indonesian',
  it: 'Italian',
  lt: 'Lithuanian',
  lv: 'Latvian',
  nl: 'Dutch',
  pl: 'Polish',
  pt: 'Portuguese',
  ro: 'Romanian',
  ru: 'Russian',
  sk: 'Slovak',
  sl: 'Slovenian',
  sv: 'Swedish',
  tr: 'Turkish',
  uk: 'Ukrainian',
  vi: 'Vietnamese',
};
/**
 * Normalises raw input text and wraps it in a language tag.
 *
 * Steps: NFKD normalisation, emoji removal, dash/quote/symbol replacement,
 * expansion of a few known expressions, punctuation spacing fixes, duplicate
 * quote removal, whitespace collapsing, a trailing period when the text does
 * not already end in punctuation/quote/closing bracket, and finally
 * `<lang>...</lang>` (or `<na>...</na>` when `lang` is null).
 *
 * @param {string} text - Raw input text.
 * @param {string|null} [lang=null] - Language code from AVAILABLE_LANGS.
 * @returns {string} The cleaned, tagged text.
 * @throws {Error} If `lang` is non-null and not in AVAILABLE_LANGS.
 */
export function preprocessText(text, lang = null) {
  if (lang !== null && !AVAILABLE_LANGS.includes(lang)) {
    throw new Error(`Invalid language: ${lang}`);
  }

  // U+00B4 (acute accent) must be mapped before normalisation: NFKD expands
  // it to "space + combining acute", which the table below can never match.
  let out = text.replaceAll('\u00B4', "'");

  // Normalize unicode characters
  out = out.normalize('NFKD');

  // Remove emojis
  out = out.replace(/[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu, '');

  // Replace various dashes and symbols
  const replacements = {
    '\u2013': '-', // en dash
    // Hyphen. NFKD has already turned the non-breaking hyphen (U+2011) into
    // U+2010, so this entry covers both.
    '\u2010': '-',
    '\u2014': '-', // em dash
    '_': ' ',
    '\u201C': '"', // left double quotation mark
    '\u201D': '"', // right double quotation mark
    '\u2018': "'", // left single quotation mark
    '\u2019': "'", // right single quotation mark
    '`': "'",
    '[': ' ',
    ']': ' ',
    '|': ' ',
    '/': ' ', // FIXME: `/` should be pronounced.
    '#': ' ', // FIXME: `#` should be pronounced.
    '→': ' ',
    '←': ' ',
  };
  for (const [from, to] of Object.entries(replacements)) {
    out = out.replaceAll(from, to);
  }

  // Remove special symbols
  out = out.replace(/[♥☆♡©\\]/g, '');

  // Replace known expressions
  const exprReplacements = {
    '@': ' at ',
    'e.g.,': 'for example,',
    'i.e.,': 'that is,',
  };
  for (const [from, to] of Object.entries(exprReplacements)) {
    out = out.replaceAll(from, to);
  }

  // Fix spacing around punctuation: drop one space before , . ! ? ; : '
  out = out.replace(/ ([,.!?;:'])/g, '$1');

  // Remove duplicate quotes. Backticks were already mapped to "'" above, so
  // only the two quote characters can still repeat.
  out = out.replace(/"{2,}/g, '"');
  out = out.replace(/'{2,}/g, "'");

  // Remove extra spaces
  out = out.replace(/\s+/g, ' ').trim();

  // If text doesn't end with punctuation, quotes, or closing brackets, add a period
  if (!/[.!?;:,'"')\]}…。」』】〉》›»]$/.test(out)) {
    out += '.';
  }

  // Add language tags
  const tag = lang !== null ? lang : 'na';
  return `<${tag}>${out}</${tag}>`;
}
/**
 * Converts text into one Unicode code point value per character.
 *
 * The string is iterated by code point, so each character outside the Basic
 * Multilingual Plane yields its full code point instead of only the high
 * surrogate of its UTF-16 pair.
 *
 * @param {string} text
 * @returns {number[]} Code point values, one per character.
 */
export function textToUnicodeValues(text) {
  return Array.from(text, (char) => char.codePointAt(0));
}
/**
 * Builds a 0/1 padding mask for a batch of sequence lengths.
 *
 * @param {number[]} lengths - Valid length of each sequence.
 * @param {number|null} [maxLen=null] - Mask width; when null or undefined the
 *   largest entry of `lengths` is used. An explicit 0 is honoured.
 * @returns {number[][][]} One `[row]` per sequence, where `row[j]` is 1.0 for
 *   `j < lengths[i]` and 0.0 otherwise.
 */
export function lengthToMask(lengths, maxLen = null) {
  // `??` rather than `||`, so a caller-supplied width of 0 is not replaced.
  const width = maxLen ?? Math.max(...lengths);
  return lengths.map((length) => {
    const row = [];
    for (let j = 0; j < width; j++) {
      row.push(j < length ? 1.0 : 0.0);
    }
    return [row];
  });
}
/**
 * Builds the padding mask for a batch of texts from their id-row lengths.
 * Delegates to lengthToMask, using the longest text as the mask width.
 *
 * @param {number[]} textIdsLengths - Length of each text.
 * @returns {number[][][]} The mask produced by lengthToMask.
 */
export function getTextMask(textIdsLengths) {
  const textMask = lengthToMask(textIdsLengths);
  return textMask;
}
/**
 * Builds the padding mask over latent frames for a batch of waveforms.
 *
 * Each waveform length is divided by the number of samples covered by one
 * latent frame (`ae.base_chunk_size * ttl.chunk_compress_factor`), rounding
 * up, and the resulting frame counts are passed to lengthToMask.
 *
 * @param {number[]} wavLengths - Waveform lengths in samples.
 * @param {Object} cfgs - Config with `ae.base_chunk_size` and
 *   `ttl.chunk_compress_factor`.
 * @returns {number[][][]} The mask produced by lengthToMask.
 */
export function getLatentMask(wavLengths, cfgs) {
  const { base_chunk_size: baseChunkSize } = cfgs.ae;
  const { chunk_compress_factor: chunkCompressFactor } = cfgs.ttl;
  const samplesPerLatent = baseChunkSize * chunkCompressFactor;

  const toLatentLength = (len) =>
    Math.floor((len + samplesPerLatent - 1) / samplesPerLatent);

  return lengthToMask(wavLengths.map((len) => toLatentLength(len)));
}
/**
 * Samples a masked standard-normal latent for each batch item.
 *
 * @param {number[][][]} duration - Per-item durations, read as
 *   `duration[b][0][0]` and multiplied by `ae.sample_rate` (presumably
 *   seconds; confirm against the duration predictor's output).
 * @param {Object} cfgs - Config with `ae.sample_rate`, `ae.base_chunk_size`,
 *   `ttl.chunk_compress_factor` and `ttl.latent_dim`.
 * @returns {{noisyLatent: number[][][], latentMask: number[][][]}}
 *   `noisyLatent[b][d][t]` is Gaussian noise multiplied by
 *   `latentMask[b][0][t]`, so padded frames are zero.
 */
export function sampleNoisyLatent(duration, cfgs) {
  const sampleRate = cfgs.ae.sample_rate;
  const chunkCompressFactor = cfgs.ttl.chunk_compress_factor;
  const chunkSize = cfgs.ae.base_chunk_size * chunkCompressFactor;
  const latentDim = cfgs.ttl.latent_dim * chunkCompressFactor;

  const durations = duration.map((d) => d[0][0]);
  const wavLenMax = Math.max(...durations) * sampleRate;
  const wavLengths = durations.map((value) => Math.floor(value * sampleRate));
  const latentLen = Math.floor((wavLenMax + chunkSize - 1) / chunkSize);

  const latentMask = getLatentMask(wavLengths, cfgs);

  const noisyLatent = [];
  for (let b = 0; b < duration.length; b++) {
    const maskRow = latentMask[b][0];
    const batch = [];
    for (let d = 0; d < latentDim; d++) {
      const row = [];
      for (let t = 0; t < latentLen; t++) {
        // Box-Muller transform. Math.random() is in [0, 1), so use
        // 1 - Math.random() to keep u1 in (0, 1]; u1 === 0 would give
        // log(0) = -Infinity and an infinite or NaN sample.
        const u1 = 1 - Math.random();
        const u2 = Math.random();
        const randNormal = Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2);
        row.push(randNormal * maskRow[t]);
      }
      batch.push(row);
    }
    noisyLatent.push(batch);
  }

  return { noisyLatent, latentMask };
}
/**
 * Creates an ONNX Runtime inference session for a single model.
 *
 * @param {string} onnxPath - Model location passed to InferenceSession.create.
 * @param {Object} opts - Session options passed through unchanged.
 * @returns {Promise<Object>} The created inference session.
 */
export async function loadOnnx(onnxPath, opts) {
  const session = await ort.InferenceSession.create(onnxPath, opts);
  return session;
}
/**
 * Loads the four ONNX models in parallel.
 *
 * @param {string} basePath - Directory containing the `.onnx` files.
 * @param {Object} opts - Session options passed to loadOnnx.
 * @param {Function} [onProgress] - Called as
 *   `onProgress(modelName, loadedCount, totalCount)` after each model loads.
 * @returns {Promise<{dpOrt: Object, textEncOrt: Object, vectorEstOrt: Object, vocoderOrt: Object}>}
 *   Sessions keyed by role. Rejects if any model fails to load.
 */
export async function loadOnnxAll(basePath, opts, onProgress) {
  const models = [
    { name: 'Duration Predictor', path: `${basePath}/duration_predictor.onnx`, key: 'dpOrt' },
    { name: 'Text Encoder', path: `${basePath}/text_encoder.onnx`, key: 'textEncOrt' },
    { name: 'Vector Estimator', path: `${basePath}/vector_estimator.onnx`, key: 'vectorEstOrt' },
    { name: 'Vocoder', path: `${basePath}/vocoder.onnx`, key: 'vocoderOrt' }
  ];

  // Load all models in parallel
  let loadedCount = 0;
  const entries = await Promise.all(models.map(async (model) => {
    const session = await loadOnnx(model.path, opts);
    loadedCount += 1;
    if (onProgress) {
      onProgress(model.name, loadedCount, models.length);
    }
    return [model.key, session];
  }));
  const result = Object.fromEntries(entries);

  // Download counting. Skip localhost to avoid noisy requests during local
  // testing. `location.hostname` reports IPv6 loopback with brackets, so
  // both spellings are listed.
  const reportFailure = (error) => {
    console.warn('Failed to update download count:', error);
  };
  try {
    const hostname = typeof window !== 'undefined' ? window.location.hostname : '';
    const isLocalhost = ['localhost', '127.0.0.1', '::1', '[::1]'].includes(hostname);
    if (!isLocalhost) {
      // Best-effort and deliberately not awaited: a slow or hanging request
      // must not delay handing the loaded sessions back to the caller.
      fetch('https://huggingface.co/Supertone/supertonic-3/resolve/main/config.json', {
        mode: 'no-cors',
        cache: 'no-store'
      }).catch(reportFailure);
    }
  } catch (error) {
    reportFailure(error);
  }

  return result;
}
/**
 * Fetches and parses the TTS configuration (`tts.json`).
 *
 * @param {string} basePath - Directory containing `tts.json`.
 * @returns {Promise<Object>} The parsed configuration.
 * @throws {Error} If the HTTP response is not OK, with the URL and status in
 *   the message (instead of an opaque JSON parse error on an error page).
 */
export async function loadCfgs(basePath) {
  const url = `${basePath}/tts.json`;
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to load ${url}: ${response.status} ${response.statusText}`);
  }
  return response.json();
}
/**
 * Fetches `unicode_indexer.json` and builds the text processor from it.
 *
 * @param {string} basePath - Directory containing `unicode_indexer.json`.
 * @returns {Promise<{textProcessor: UnicodeProcessor}>}
 * @throws {Error} If the HTTP response is not OK, with the URL and status in
 *   the message.
 */
export async function loadProcessors(basePath) {
  const url = `${basePath}/unicode_indexer.json`;
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to load ${url}: ${response.status} ${response.statusText}`);
  }
  const unicodeIndexerData = await response.json();
  const textProcessor = new UnicodeProcessor(unicodeIndexerData);
  return { textProcessor };
}
/**
 * Parses a RIFF/WAVE file into mono floating-point samples.
 *
 * Supports 16-bit and 24-bit integer PCM, 32-bit integer PCM (format tag 1,
 * also when given through WAVE_FORMAT_EXTENSIBLE) and 32-bit float. Any other
 * 32-bit format tag is read as float. Multi-channel audio is averaged down
 * to one channel.
 *
 * @param {ArrayBuffer} buffer - Complete WAV file contents.
 * @returns {{sampleRate: number, audioData: Float32Array}}
 * @throws {Error} If the RIFF/WAVE header is missing, the `fmt `/`data`
 *   chunks cannot be found or are malformed, or the bit depth is unsupported.
 */
function parseWavFile(buffer) {
  const view = new DataView(buffer);
  const readTag = (at) => String.fromCharCode(
    view.getUint8(at),
    view.getUint8(at + 1),
    view.getUint8(at + 2),
    view.getUint8(at + 3)
  );

  // Check RIFF header
  if (buffer.byteLength < 12 || readTag(0) !== 'RIFF' || readTag(8) !== 'WAVE') {
    throw new Error('Not a valid WAV file');
  }

  let fmtChunk = null;
  let dataChunk = null;
  let offset = 12;
  // Require a full 8-byte chunk header so a truncated file cannot make the
  // DataView read out of bounds.
  while (offset + 8 <= buffer.byteLength) {
    const chunkId = readTag(offset);
    const chunkSize = view.getUint32(offset + 4, true);
    const bodyOffset = offset + 8;

    if (chunkId === 'fmt ') {
      if (chunkSize < 16 || bodyOffset + 16 > buffer.byteLength) {
        throw new Error('Invalid WAV file format');
      }
      fmtChunk = {
        audioFormat: view.getUint16(bodyOffset, true),
        numChannels: view.getUint16(bodyOffset + 2, true),
        sampleRate: view.getUint32(bodyOffset + 4, true),
        bitsPerSample: view.getUint16(bodyOffset + 14, true)
      };
      // WAVE_FORMAT_EXTENSIBLE: the real format code is the first two bytes
      // of the SubFormat GUID, 24 bytes into the chunk body.
      if (fmtChunk.audioFormat === 0xFFFE && chunkSize >= 40 && bodyOffset + 26 <= buffer.byteLength) {
        fmtChunk.audioFormat = view.getUint16(bodyOffset + 24, true);
      }
    } else if (chunkId === 'data') {
      dataChunk = {
        offset: bodyOffset,
        // Clamp to the bytes actually present (truncated or streamed files).
        size: Math.min(chunkSize, buffer.byteLength - bodyOffset)
      };
      break;
    }

    // RIFF chunks are word-aligned: an odd-sized body is followed by a pad byte.
    offset = bodyOffset + chunkSize + (chunkSize % 2);
  }

  if (!fmtChunk || !dataChunk || fmtChunk.numChannels === 0) {
    throw new Error('Invalid WAV file format');
  }

  const { numChannels, bitsPerSample, audioFormat } = fmtChunk;

  // Pick a per-sample reader and the divisor that maps it into [-1, 1).
  let readSample;
  let fullScale;
  if (bitsPerSample === 16) {
    readSample = (at) => view.getInt16(at, true);
    fullScale = 32768.0;
  } else if (bitsPerSample === 24) {
    readSample = (at) => {
      // Read 3 little-endian bytes and sign-extend from 24 bits.
      const value = view.getUint8(at) | (view.getUint8(at + 1) << 8) | (view.getUint8(at + 2) << 16);
      return (value & 0x800000) ? value - 0x1000000 : value;
    };
    fullScale = 8388608.0; // 2^23
  } else if (bitsPerSample === 32) {
    if (audioFormat === 1) {
      // Integer PCM must not be reinterpreted as IEEE float.
      readSample = (at) => view.getInt32(at, true);
      fullScale = 2147483648.0; // 2^31
    } else {
      readSample = (at) => view.getFloat32(at, true);
      fullScale = 1.0;
    }
  } else {
    throw new Error(`Unsupported bit depth: ${bitsPerSample}. Supported formats: 16-bit, 24-bit, 32-bit`);
  }

  const bytesPerSample = bitsPerSample / 8;
  const frameSize = bytesPerSample * numChannels;
  const numSamples = Math.floor(dataChunk.size / frameSize);
  const audioData = new Float32Array(numSamples);

  for (let i = 0; i < numSamples; i++) {
    const frameOffset = dataChunk.offset + i * frameSize;
    let sum = 0;
    for (let ch = 0; ch < numChannels; ch++) {
      sum += readSample(frameOffset + ch * bytesPerSample);
    }
    audioData[i] = (sum / numChannels) / fullScale;
  }

  return {
    sampleRate: fmtChunk.sampleRate,
    audioData: audioData
  };
}
/**
 * Flattens a (possibly nested) array of numbers into a float32 tensor.
 *
 * @param {Array} array - Nested array of numbers; flattened at every depth.
 * @param {number[]} dims - Tensor dimensions passed to ort.Tensor.
 * @returns {Object} A new `ort.Tensor` of type 'float32'.
 */
export function arrayToTensor(array, dims) {
  const values = Float32Array.from(array.flat(Infinity));
  return new ort.Tensor('float32', values, dims);
}
/**
 * Flattens a (possibly nested) array of integers into an int64 tensor.
 *
 * @param {Array} array - Nested array of integers; flattened at every depth.
 *   Each element is converted with BigInt().
 * @param {number[]} dims - Tensor dimensions passed to ort.Tensor.
 * @returns {Object} A new `ort.Tensor` of type 'int64'.
 */
export function intArrayToTensor(array, dims) {
  const values = BigInt64Array.from(array.flat(Infinity), (x) => BigInt(x));
  return new ort.Tensor('int64', values, dims);
}
/**
 * Encodes mono float samples as a 16-bit PCM WAV file.
 *
 * @param {ArrayLike<number>} audioData - Samples; values outside [-1, 1] are
 *   clamped.
 * @param {number} sampleRate - Sample rate in Hz written to the header.
 * @returns {ArrayBuffer} A 44-byte RIFF/WAVE header followed by the samples.
 */
export function writeWavFile(audioData, sampleRate) {
  const numChannels = 1;
  const bitsPerSample = 16;
  const bytesPerSample = bitsPerSample / 8;
  const blockAlign = numChannels * bytesPerSample;
  const byteRate = sampleRate * blockAlign;
  const dataSize = audioData.length * bytesPerSample;
  const headerSize = 44;

  const buffer = new ArrayBuffer(headerSize + dataSize);
  const view = new DataView(buffer);
  const writeTag = (at, tag) => {
    for (let i = 0; i < tag.length; i++) {
      view.setUint8(at + i, tag.charCodeAt(i));
    }
  };

  // RIFF header
  writeTag(0, 'RIFF');
  view.setUint32(4, 36 + dataSize, true);
  writeTag(8, 'WAVE');

  // fmt chunk
  writeTag(12, 'fmt ');
  view.setUint32(16, 16, true);
  view.setUint16(20, 1, true); // PCM
  view.setUint16(22, numChannels, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, byteRate, true);
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, bitsPerSample, true);

  // data chunk
  writeTag(36, 'data');
  view.setUint32(40, dataSize, true);

  // Write audio data. Round to the nearest step: flooring would push every
  // sample toward negative values (a tiny negative sample would become -1).
  for (let i = 0; i < audioData.length; i++) {
    const sample = Math.max(-1, Math.min(1, audioData[i]));
    view.setInt16(headerSize + i * bytesPerSample, Math.round(sample * 32767), true);
  }

  return buffer;
}
// Smooth scroll functionality: once the DOM is ready, in-page anchor links
// (href starting with "#") scroll smoothly to their target and update the
// URL hash. Skipped entirely when there is no `document`.
if (typeof document !== 'undefined') {
  document.addEventListener('DOMContentLoaded', () => {
    // Looks the link target up by id. `querySelector(href)` would throw for
    // href="#" and for ids that are not valid CSS selectors (e.g. "#1abc"),
    // and would misread ids containing "." or ":".
    const findTarget = (href) => {
      const fragment = href.slice(1);
      if (!fragment) {
        return null;
      }
      let decoded = fragment;
      try {
        decoded = decodeURIComponent(fragment);
      } catch (_) {
        // Malformed percent-escape: fall back to the raw fragment.
      }
      return document.getElementById(decoded) || document.getElementById(fragment);
    };

    // Smooth scroll for anchor links
    document.querySelectorAll('a[href^="#"]').forEach((anchor) => {
      anchor.addEventListener('click', (event) => {
        // The default jump is suppressed for every such link, whether or not
        // a target is found.
        event.preventDefault();
        const href = anchor.getAttribute('href');
        if (!href) {
          return;
        }
        const target = findTarget(href);
        if (!target) {
          return;
        }
        // Update URL with anchor
        if (history.pushState) {
          history.pushState(null, null, href);
        }
        target.scrollIntoView({
          behavior: 'smooth',
          block: 'start'
        });
      });
    });
  });
}
| // TTS Demo functionality | |
| (async function() { | |
| // Check if we're on a page with the TTS demo | |
| const demoTextInput = document.getElementById('demoTextInput'); | |
| if (!demoTextInput) return; | |
| // Configure ONNX Runtime WASM assets | |
| ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.0/dist/'; | |
| ort.env.wasm.numThreads = 1; | |
| // Configuration | |
| const REF_EMBEDDING_PATHS = { | |
| 'F1': 'assets/voice_styles/F1.json', | |
| 'F2': 'assets/voice_styles/F2.json', | |
| 'F3': 'assets/voice_styles/F3.json', | |
| 'F4': 'assets/voice_styles/F4.json', | |
| 'F5': 'assets/voice_styles/F5.json', | |
| 'M1': 'assets/voice_styles/M1.json', | |
| 'M2': 'assets/voice_styles/M2.json', | |
| 'M3': 'assets/voice_styles/M3.json', | |
| 'M4': 'assets/voice_styles/M4.json', | |
| 'M5': 'assets/voice_styles/M5.json' | |
| }; | |
| // Voice descriptions | |
| const VOICE_DESCRIPTIONS = { | |
| 'F1': 'Sarah - A calm female voice with a slightly low tone; steady and composed.', | |
| 'F2': 'Lily - A bright, cheerful female voice; lively, playful, and youthful with spirited energy.', | |
| 'F3': 'Jessica - A clear, professional announcer-style female voice; articulate and broadcast-ready.', | |
| 'F4': 'Olivia - A crisp, confident female voice; distinct and expressive with strong delivery.', | |
| 'F5': 'Emily - A kind, gentle female voice; soft-spoken, calm, and naturally soothing.', | |
| 'M1': 'Alex - A lively, upbeat male voice with confident energy and a standard, clear tone.', | |
| 'M2': 'James - A deep, robust male voice; calm, composed, and serious with a grounded presence.', | |
| 'M3': 'Robert - A polished, authoritative male voice; confident and trustworthy with strong presentation quality.', | |
| 'M4': 'Sam - A soft, neutral-toned male voice; gentle and approachable with a youthful, friendly quality.', | |
| 'M5': 'Daniel - A warm, soft-spoken male voice; calm and soothing with a natural storytelling quality.' | |
| }; | |
| // Global state | |
| let models = null; | |
| let cfgs = null; | |
| let processors = null; | |
| let currentVoice = 'M3'; // Default to Robert voice | |
| // Detect browser language and set initial language | |
| function detectBrowserLanguage() { | |
| // Get browser language (works in Chrome, Safari, Edge, Firefox, Opera, Samsung Internet) | |
| const browserLang = navigator.language || navigator.userLanguage || 'en'; | |
| // Extract language code (e.g., 'en-US' -> 'en', 'ko-KR' -> 'ko') | |
| const langCode = browserLang.split('-')[0].toLowerCase(); | |
| // Reuse the 31-language list defined at module top-level | |
| return AVAILABLE_LANGS.includes(langCode) ? langCode : 'en'; | |
| } | |
// Language used for text processing; seeded from the browser locale.
let currentLanguage = detectBrowserLanguage();
// Style embeddings already fetched, keyed by voice id.
let refEmbeddingCache = {};
// Style tensors of the currently selected voice (null until loaded).
let currentStyleTtlTensor = null;
let currentStyleDpTensor = null;
// Model-loading bookkeeping used by initializeModels().
let modelsLoading = false; // true while a load is in flight
let modelsLoaded = false; // true once a load has fully succeeded
let modelsLoadPromise = null; // promise of the most recent load

// UI elements
const demoStatusBox = document.getElementById('demoStatusBox');
const demoStatusText = document.getElementById('demoStatusText');
const demoGenerateBtn = document.getElementById('demoGenerateBtn');
const demoTotalSteps = document.getElementById('demoTotalSteps');
const demoSpeed = document.getElementById('demoSpeed');
const demoTotalStepsValue = document.getElementById('demoTotalStepsValue');
const demoSpeedValue = document.getElementById('demoSpeedValue');
const demoResults = document.getElementById('demoResults');
const demoError = document.getElementById('demoError');
const demoCharCount = document.getElementById('demoCharCount');
const demoCharCounter = document.getElementById('demoCharCounter');
const demoCharWarning = document.getElementById('demoCharWarning');

// Presets whose input text keeps a fixed font size (see updateCharCounter).
const fixedFontPresets = new Set(['paragraph', 'script']);
let currentPreset = 'quote'; // Initial preset

// Text validation constants
const MIN_CHARS = 10; // minimum input length accepted for generation
const MAX_CHUNK_LENGTH_DEFAULT = 300; // maximum chunk length (default)
const MAX_CHUNK_LENGTH_CJK = 120; // maximum chunk length for Korean/Japanese
/**
 * Maximum chunk length for the currently selected language.
 * @returns {number} MAX_CHUNK_LENGTH_CJK for 'ko' and 'ja', otherwise MAX_CHUNK_LENGTH_DEFAULT.
 */
function getMaxChunkLength() {
  const usesShortChunks = ['ko', 'ja'].includes(currentLanguage);
  if (usesShortChunks) {
    return MAX_CHUNK_LENGTH_CJK;
  }
  return MAX_CHUNK_LENGTH_DEFAULT;
}
// State of the custom audio player; shared across generations and reset when a new one starts.
let audioContext = null;
let scheduledSources = []; // sources that get stop()ped when a new generation begins
let audioChunks = [];
let totalDuration = 0;
let startTime = 0;
let pauseTime = 0;
let isPaused = false;
let isPlaying = false;
let animationFrameId = null; // handle passed to cancelAnimationFrame()
// Player UI element references (null until assigned)
let playPauseBtn = null;
let progressBar = null;
let currentTimeDisplay = null;
let durationDisplay = null;
let progressFill = null;
let firstChunkGenerationTime = 0; // Processing time for the first chunk
let totalChunks = 0;
let nextScheduledTime = 0; // Next time at which to schedule an audio chunk
let currentGenerationTextLength = 0;
let supertonicPlayerRecord = null; // Supertonic player record for cross-player pause management
let isGenerating = false; // True while speech generation is in progress
// Every registered custom audio player
let customAudioPlayers = [];
// True when the viewport matches the (max-width: 768px) media query.
const isMobileViewport = () => {
  const query = window.matchMedia('(max-width: 768px)');
  return query.matches;
};

// True when the device reports touch support, independent of viewport size.
const isTouchDevice = () => {
  if ('ontouchstart' in window) {
    return true;
  }
  return navigator.maxTouchPoints > 0;
};

// On mobile viewports, drop a trailing ".dd" fraction from a formatted time string.
const trimDecimalsForMobile = (formatted) => {
  if (!formatted || !isMobileViewport()) {
    return formatted;
  }
  return formatted.replace(/\.\d{2}$/, '');
};
/**
 * Pause every registered custom audio player except the given one.
 * Entries that are falsy or lack a pausePlayback() method are ignored.
 * @param {object} currentPlayer - The player that should keep playing.
 */
function pauseAllPlayersExcept(currentPlayer) {
  const registered = customAudioPlayers;
  for (let i = 0, count = registered.length; i < count; i++) {
    const candidate = registered[i];
    if (candidate === currentPlayer || !candidate) {
      continue;
    }
    if (typeof candidate.pausePlayback !== 'function') {
      continue;
    }
    candidate.pausePlayback();
  }
}
/**
 * Chunk text into smaller pieces based on sentence boundaries.
 *
 * Paragraphs (separated by blank lines) are never merged. Within a paragraph,
 * sentences are greedily packed into chunks of at most `maxLen` characters,
 * joined by a single space. A sentence that is longer than `maxLen` on its own
 * (always the case for text whose sentence punctuation is not followed by
 * whitespace, e.g. Japanese "。") is broken further: first at clause
 * punctuation, then at whitespace, and as a last resort at `maxLen` characters.
 *
 * @param {string} text - The text to chunk
 * @param {number} maxLen - Maximum length for each chunk
 * @returns {Array<string>} - Array of text chunks
 */
function chunkText(text, maxLen = getMaxChunkLength()) {
  // Sentence end: ., ! or ? followed by whitespace, excluding common
  // abbreviations (Mr., Dr., e.g., ...) and single capital initials like "F."
  const SENTENCE_BOUNDARY = /(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+/;
  // Fallback break points inside an over-long sentence: full-width punctuation
  // anywhere, or ASCII punctuation that is followed by whitespace.
  const CLAUSE_BREAK = /[、。，；：！？…]|[,;:.!?](?=\s)/g;

  // Break one sentence into pieces no longer than maxLen.
  const splitOversized = (sentence) => {
    // A non-positive or non-numeric limit cannot be honoured; leave the sentence intact.
    if (!(maxLen >= 1) || sentence.length <= maxLen) {
      return [sentence];
    }
    const limit = Math.floor(maxLen);
    const pieces = [];
    let rest = sentence;
    while (rest.length > limit) {
      // Look one character past the limit so a break exactly at the limit is visible.
      const windowText = rest.slice(0, limit + 1);
      let cut = -1;
      for (const match of windowText.matchAll(CLAUSE_BREAK)) {
        const end = match.index + match[0].length;
        if (end <= limit) {
          cut = end;
        }
      }
      if (cut < 1) {
        for (let i = limit; i >= 1; i--) {
          if (/\s/.test(rest[i])) {
            cut = i;
            break;
          }
        }
      }
      if (cut < 1) {
        cut = limit;
        // Never cut between the two halves of a surrogate pair.
        const lastUnit = rest.charCodeAt(cut - 1);
        if (cut > 1 && lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
          cut -= 1;
        }
      }
      const piece = rest.slice(0, cut).trim();
      if (piece) {
        pieces.push(piece);
      }
      rest = rest.slice(cut).trim();
    }
    if (rest) {
      pieces.push(rest);
    }
    return pieces;
  };

  const chunks = [];
  // Split by paragraph (two or more newlines)
  for (const rawParagraph of text.trim().split(/\n\s*\n+/)) {
    const paragraph = rawParagraph.trim();
    if (!paragraph) {
      continue;
    }
    let currentChunk = '';
    for (const sentence of paragraph.split(SENTENCE_BOUNDARY)) {
      for (const piece of splitOversized(sentence)) {
        // +1 accounts for the joining space
        if (currentChunk.length + piece.length + 1 <= maxLen) {
          currentChunk += (currentChunk ? ' ' : '') + piece;
        } else {
          if (currentChunk) {
            chunks.push(currentChunk.trim());
          }
          currentChunk = piece;
        }
      }
    }
    if (currentChunk) {
      chunks.push(currentChunk.trim());
    }
  }
  return chunks;
}
/**
 * Show the status box with the given message.
 *
 * @param {string} message - HTML markup assigned to the status text element's innerHTML.
 * @param {string} [type='info'] - 'success' and 'error' add a class of the same name; other values add none.
 * @param {?number} [progress=null] - Percentage in [0, 100]; any other value clears the progress indicator.
 */
function showDemoStatus(message, type = 'info', progress = null) {
  demoStatusText.innerHTML = message;

  // Start from the bare class and no progress value, then make the box visible
  demoStatusBox.className = 'demo-status-box';
  demoStatusBox.style.removeProperty('--status-progress');
  demoStatusBox.style.display = '';

  if (type === 'success' || type === 'error') {
    demoStatusBox.classList.add(type);
  }

  const hasProgress = progress !== null && progress >= 0 && progress <= 100;
  if (!hasProgress) {
    demoStatusBox.style.removeProperty('--status-progress');
    demoStatusBox.classList.remove('complete');
    return;
  }

  // Update progress bar
  const percent = Math.max(0, Math.min(progress, 100));
  demoStatusBox.style.setProperty('--status-progress', `${percent}%`);
  demoStatusBox.classList.toggle('complete', percent >= 100);
}
/** Hide the status box. */
function hideDemoStatus() {
  const { style } = demoStatusBox;
  style.display = 'none';
}

/**
 * Show an error message in the error element.
 * @param {string} message - Plain text to display (assigned via textContent).
 */
function showDemoError(message) {
  const errorBox = demoError;
  errorBox.textContent = message;
  errorBox.classList.add('active');
}

/** Hide the error element; its text is left in place. */
function hideDemoError() {
  const errorBox = demoError;
  errorBox.classList.remove('active');
}
// Language toast notification: container, message element, and the pending auto-hide timer
const languageToast = document.getElementById('languageToast');
const languageToastMessage = document.getElementById('languageToastMessage');
let languageToastTimeout = null; // id returned by setTimeout(), or null
/**
 * Show a toast announcing the auto-detected language for three seconds.
 *
 * The message is built from DOM nodes with textContent instead of an HTML
 * string, so a language name or code can never be interpreted as markup.
 *
 * @param {string} fromLang - Previously selected language code (accepted for call-site compatibility; not displayed).
 * @param {string} toLang - Newly selected language code; shown via LANGUAGE_NAMES when known, otherwise verbatim.
 */
function showLanguageToast(fromLang, toLang) {
  if (!languageToast || !languageToastMessage) return;

  const toName = LANGUAGE_NAMES[toLang] || toLang;
  const emphasis = document.createElement('strong');
  emphasis.textContent = toName;
  languageToastMessage.replaceChildren('Language auto-detected: ', emphasis);

  // Restart the auto-hide countdown if a toast is already visible
  if (languageToastTimeout) {
    clearTimeout(languageToastTimeout);
  }

  languageToast.classList.add('show');
  // Hide after 3 seconds
  languageToastTimeout = setTimeout(() => {
    languageToast.classList.remove('show');
  }, 3000);
}
/**
 * Find characters in `text` that the text processor reports as unsupported.
 *
 * Each distinct character is run through preprocessText() once, the processed
 * text is checked in a single textProcessor.call(), and every unsupported
 * processed character is mapped back to the original character(s) that
 * produced it (e.g. Korean '간' -> 'ㄱㅏㄴ', so 'ㄱ', 'ㅏ', 'ㄴ' all map back to '간').
 *
 * Validation is best-effort: when the processors are not loaded yet, or the
 * check itself throws, the text is reported as valid. Failures are logged so
 * they are not lost silently.
 *
 * NOTE(review): the other textProcessor.call() sites in this file pass
 * currentLanguage as a second argument; confirm that omitting it here is intentional.
 *
 * @param {string} text - Text to validate.
 * @returns {{valid: boolean, unsupportedChars: string[]}} Original characters that are unsupported.
 */
function validateCharacters(text) {
  if (!processors || !processors.textProcessor) {
    return { valid: true, unsupportedChars: [] };
  }
  try {
    const processedByChar = new Map(); // original char -> processed string
    const originalsByProcessed = new Map(); // processed char -> Set of original chars

    // Only unique characters are preprocessed (much cheaper for long texts)
    for (const char of new Set(text)) {
      const processed = preprocessText(char);
      processedByChar.set(char, processed);
      for (const processedChar of processed) {
        if (!originalsByProcessed.has(processedChar)) {
          originalsByProcessed.set(processedChar, new Set());
        }
        originalsByProcessed.get(processedChar).add(char);
      }
    }

    // Rebuild the full processed text from the cached per-character results
    const fullProcessedText = Array.from(text, (c) => processedByChar.get(c)).join('');
    const { unsupportedChars } = processors.textProcessor.call([fullProcessedText]);

    const flagged = new Set();
    if (unsupportedChars && unsupportedChars.length > 0) {
      for (const unsupportedChar of unsupportedChars) {
        const originals = originalsByProcessed.get(unsupportedChar);
        if (originals) {
          originals.forEach((c) => flagged.add(c));
        }
      }
    }

    const flaggedList = Array.from(flagged);
    return {
      valid: flaggedList.length === 0,
      unsupportedChars: flaggedList
    };
  } catch (error) {
    // Best-effort: never block the user because the validator itself failed
    console.warn('Character validation failed; treating text as valid:', error);
    return { valid: true, unsupportedChars: [] };
  }
}
/**
 * Refresh everything that depends on the current input text: the character
 * count, the input's font size, the unsupported-character error, the counter
 * status class/warning, and whether the generate button is enabled.
 */
function updateCharCounter() {
  const rawText = demoTextInput.textContent || demoTextInput.innerText || '';
  const text = rawText.replace(/\n$/g, ''); // Remove trailing newline that browsers may add
  const length = text.length;
  demoCharCount.textContent = length;

  if (fixedFontPresets.has(currentPreset)) {
    demoTextInput.style.fontSize = '1.5rem';
  } else {
    // Font size is a fraction of the input's rendered width, chosen by text length.
    // NOTE(review): the tiers are not monotonic (201-239 chars is larger than 101-200) —
    // kept as-is; confirm this is intended.
    let baseRatio;
    if (length <= 100) {
      baseRatio = 0.055; // 5.5% of width
    } else if (length <= 200) {
      baseRatio = 0.04; // 4% of width
    } else if (length < 240) {
      baseRatio = 0.053125; // ~5.3125% of width
    } else if (length < 400) {
      baseRatio = 0.0425; // ~4.25% of width
    } else if (length < 700) {
      baseRatio = 0.031875; // ~3.1875% of width
    } else {
      baseRatio = 0.025; // 2.5% of width
    }
    // Windows 572px wide or narrower get a doubled ratio
    const mobileMultiplier = window.innerWidth <= 572 ? 2 : 1;
    const fontSize = demoTextInput.offsetWidth * (baseRatio * mobileMultiplier);
    demoTextInput.style.fontSize = `${fontSize}px`;
  }

  // Remove all status classes
  demoCharCounter.classList.remove('error', 'warning', 'valid');

  // Check for unsupported characters first (only if models are loaded)
  let hasUnsupportedChars = false;
  if (models && processors && length > 0) {
    const validation = validateCharacters(text);
    if (!validation.valid && validation.unsupportedChars.length > 0) {
      hasUnsupportedChars = true;
      const shown = validation.unsupportedChars.slice(0, 5).map((c) => `"${c}"`).join(', ');
      const hiddenCount = validation.unsupportedChars.length - 5;
      const moreChars = hiddenCount > 0 ? ` and ${hiddenCount} more` : '';
      showDemoError(`Unsupported characters detected: ${shown}${moreChars}. Please remove them before generating speech.`);
    } else {
      hideDemoError();
    }
  }

  // Update status based on length and character validation
  if (length < MIN_CHARS) {
    demoCharCounter.classList.add('error');
    // Derived from MIN_CHARS so the hint cannot drift from the actual limit
    demoCharWarning.textContent = `(At least ${MIN_CHARS} characters)`;
    demoGenerateBtn.disabled = true;
  } else if (hasUnsupportedChars) {
    demoCharCounter.classList.add('error');
    demoCharWarning.textContent = '(Unsupported characters)';
    demoGenerateBtn.disabled = true;
  } else {
    demoCharCounter.classList.add('valid');
    demoCharWarning.textContent = '';
    // Enable only if models are loaded AND not currently generating
    demoGenerateBtn.disabled = !models || isGenerating;
  }
}
/**
 * Check that the input text is present and long enough to synthesize.
 * @param {string} text - Candidate input text.
 * @returns {{valid: boolean, message?: string}} `message` is set only when invalid.
 */
function validateTextInput(text) {
  const isBlank = !text || text.trim().length === 0;
  if (isBlank) {
    return { valid: false, message: 'Please enter some text.' };
  }

  const isTooShort = text.length < MIN_CHARS;
  return isTooShort
    ? { valid: false, message: `Text must be at least ${MIN_CHARS} characters long. (Currently ${text.length})` }
    : { valid: true };
}
/**
 * Load the pre-extracted style embeddings of a voice from its JSON file and
 * convert them to ONNX tensors. Results are cached per voice in refEmbeddingCache.
 *
 * @param {string} voice - Voice id used as key into REF_EMBEDDING_PATHS.
 * @returns {Promise<{styleTtl: ort.Tensor, styleDp: ort.Tensor}>} Tensors built from the file's style_ttl / style_dp entries.
 * @throws {Error} If no path is configured for the voice or the HTTP request fails.
 */
async function loadStyleEmbeddings(voice) {
  const cached = refEmbeddingCache[voice];
  if (cached) {
    return cached;
  }

  const embeddingPath = REF_EMBEDDING_PATHS[voice];
  if (!embeddingPath) {
    throw new Error(`No embedding path configured for voice: ${voice}`);
  }

  const response = await fetch(embeddingPath);
  if (!response.ok) {
    // statusText can be empty, so always include the numeric status
    throw new Error(`Failed to fetch embedding: ${response.status} ${response.statusText}`.trim());
  }
  const embeddingData = await response.json();

  // Each entry carries nested `data`, `dims` and an optional `type`;
  // the nested arrays are flattened before creating the Float32Array.
  const toTensor = (entry) => new ort.Tensor(
    entry.type || 'float32',
    Float32Array.from(entry.data.flat(Infinity)),
    entry.dims
  );

  const embeddings = {
    styleTtl: toTensor(embeddingData.style_ttl),
    styleDp: toTensor(embeddingData.style_dp)
  };
  refEmbeddingCache[voice] = embeddings;
  return embeddings;
}
/**
 * Make `voice` the active voice: load its style embeddings, publish them as the
 * current style tensors, notify the speaker UI and re-validate the input text.
 *
 * @param {string} voice - Voice id to activate.
 * @returns {Promise<void>}
 * @throws Re-throws any failure after showing it through showDemoError().
 */
async function switchVoice(voice) {
  try {
    const loaded = await loadStyleEmbeddings(voice);

    // State is only touched once loading has succeeded
    currentStyleTtlTensor = loaded.styleTtl;
    currentStyleDpTensor = loaded.styleDp;
    currentVoice = voice;

    // The UI hook is optional; it is provided on window when present
    const hasUiHook = typeof window.updateActiveSpeaker === 'function';
    if (hasUiHook) {
      window.updateActiveSpeaker(voice);
    }

    // Re-validate text after switching voice
    updateCharCounter();
  } catch (error) {
    const reason = getErrorMessage(error);
    showDemoError(`Failed to load voice ${voice}: ${reason}`);
    throw error;
  }
}
/**
 * Warm up the models by synthesizing a short dummy sentence once
 * (no audio playback, no UI updates).
 *
 * The warmup goes through generateSupertonicSpeech() with a single denoising
 * step, so it exercises exactly the code path used for real requests instead
 * of a hand-maintained copy of it. The synthesized audio is discarded.
 * Failures are logged and never thrown: a failed warmup must not prevent
 * normal usage.
 *
 * @returns {Promise<void>}
 */
async function warmupModels() {
  const dummyText = 'Hello, this is a quick warmup.';
  const totalStep = 1;
  const durationFactor = 1.0;
  try {
    const result = await generateSupertonicSpeech(dummyText, totalStep, durationFactor);
    // generateSupertonicSpeech reports failures through its result object
    if (!result || !result.success) {
      console.warn('Warmup failed (non-critical):', result ? result.error : 'no result');
    }
  } catch (error) {
    console.warn('Warmup failed (non-critical):', getErrorMessage(error));
    // Don't throw - warmup failure shouldn't prevent normal usage
  }
}
/**
 * Load configuration, ONNX models, processors and the default voice, then warm
 * the models up. Intended to run on page load.
 *
 * Concurrent calls share the in-flight promise; calls made after a successful
 * load return immediately. WebGPU is used when `navigator.gpu` exists in a
 * secure context; if loading with WebGPU fails, loading is retried once with WASM.
 * Speaker and language selection are disabled for the duration of the load and
 * re-enabled afterwards whether it succeeds or fails.
 *
 * @returns {Promise<void>} Rejects with the original error after reporting it in the UI.
 */
async function initializeModels() {
  // If models are already loading, return the existing promise
  if (modelsLoading && modelsLoadPromise) {
    return modelsLoadPromise;
  }
  // If models are already loaded, return immediately
  if (modelsLoaded && models) {
    return;
  }
  modelsLoading = true;

  const speakerItemsForLoading = document.querySelectorAll('.speaker-item[data-voice]');
  const languageItemsForLoading = document.querySelectorAll('.speaker-item[data-language]');
  const setSelectionDisabled = (disabled) => {
    speakerItemsForLoading.forEach((item) => item.classList.toggle('disabled', disabled));
    languageItemsForLoading.forEach((item) => item.classList.toggle('disabled', disabled));
  };
  setSelectionDisabled(true);

  modelsLoadPromise = (async () => {
    try {
      showDemoStatus('<strong>Loading configuration...</strong>', 'info', 5);
      const basePath = 'assets/onnx';
      cfgs = await loadCfgs(basePath);

      let useWebGPU = !!navigator.gpu && window.isSecureContext;
      let executionProvider = useWebGPU ? 'webgpu' : 'wasm';
      let backendName = useWebGPU ? 'WebGPU' : 'WASM';
      showDemoStatus(`<strong>${backendName} detected! Loading models...</strong>`, 'info', 10);

      // Reads executionProvider/backendName at call time, so it can be re-run after a fallback
      const loadBundle = async () => {
        const onnxLoad = loadOnnxAll(basePath, {
          executionProviders: [executionProvider],
          graphOptimizationLevel: 'all'
        }, (modelName, current, total) => {
          const progress = 10 + (current / total) * 70; // 10-80% for model loading
          // The status text is rendered as HTML, so the model name is escaped
          showDemoStatus(`<strong>Loading models with ${backendName} (${current}/${total}):</strong> ${escapeHtml(String(modelName))}...`, 'info', progress);
        });
        // Load processors in parallel with models
        return Promise.all([onnxLoad, loadProcessors(basePath)]);
      };

      let loadedModels;
      let loadedProcessors;
      try {
        [loadedModels, loadedProcessors] = await loadBundle();
      } catch (error) {
        if (!useWebGPU) {
          throw error;
        }
        console.warn('WebGPU model load failed, falling back to WASM:', error);
        useWebGPU = false;
        executionProvider = 'wasm';
        backendName = 'WASM';
        showDemoStatus('<strong>WebGPU unavailable. Falling back to WASM...</strong>', 'info', 10);
        [loadedModels, loadedProcessors] = await loadBundle();
      }
      models = loadedModels;
      processors = loadedProcessors;

      showDemoStatus('<strong>Loading reference embeddings...</strong>', 'info', 85);
      // Load pre-extracted embeddings for default voice
      const embeddings = await loadStyleEmbeddings(currentVoice);
      currentStyleTtlTensor = embeddings.styleTtl;
      currentStyleDpTensor = embeddings.styleDp;

      showDemoStatus('<strong>Warming up models...</strong>', 'info', 90);
      // Warmup step: run inference once in background with dummy text
      await warmupModels();

      hideDemoStatus();
      demoGenerateBtn.disabled = false;
      demoTotalSteps.disabled = false;
      demoSpeed.disabled = false;
      // Enable voice toggle buttons after models are loaded
      document.querySelectorAll('.voice-toggle-text')
        .forEach((toggle) => toggle.classList.remove('disabled'));
      // Validate initial text now that models are loaded
      updateCharCounter();

      modelsLoaded = true;
    } catch (error) {
      const errorMessage = getErrorMessage(error);
      // Rendered as HTML: escape the message. showDemoError() uses textContent and needs no escaping.
      showDemoStatus(`<strong>Error:</strong> ${escapeHtml(String(errorMessage))}`, 'error');
      showDemoError(`Failed to initialize: ${errorMessage}. Check console for details.`);
      throw error;
    } finally {
      // Runs on success and failure alike
      modelsLoading = false;
      setSelectionDisabled(false);
    }
  })();
  return modelsLoadPromise;
}
/**
 * Synthesize one piece of text with the loaded models and the current voice.
 *
 * Pipeline: duration prediction -> text encoding -> `totalStep` denoising
 * iterations -> vocoder. Never throws; failures are reported in the result.
 *
 * @param {string} text - Text to synthesize (a single chunk).
 * @param {number} totalStep - Number of denoising iterations.
 * @param {number} durationFactor - Multiplier applied to the predicted duration.
 * @returns {Promise<object>} On success `{ success: true, processingTime, audioDuration,
 *   audioData, sampleRate, text }` (times in seconds); on failure `{ success: false, error, text }`.
 */
async function generateSupertonicSpeech(text, totalStep, durationFactor) {
  const startedAt = Date.now();
  try {
    const bsz = 1;
    const sampleRate = cfgs.ae.sample_rate;
    // Use pre-computed style embeddings
    const styleTtlTensor = currentStyleTtlTensor;
    const styleDpTensor = currentStyleDpTensor;

    // Step 1: Estimate duration
    const processed = processors.textProcessor.call([text], currentLanguage);
    const { textIds, textMask, unsupportedChars } = processed;
    if (unsupportedChars && unsupportedChars.length > 0) {
      const quoted = unsupportedChars.map((c) => `"${c}"`);
      throw new Error(`Unsupported characters: ${quoted.join(', ')}`);
    }
    const textIdsShape = [bsz, textIds[0].length];
    const textMaskTensor = arrayToTensor(textMask, [bsz, 1, textMask[0][0].length]);
    const dpResult = await models.dpOrt.run({
      text_ids: intArrayToTensor(textIds, textIdsShape),
      style_dp: styleDpTensor,
      text_mask: textMaskTensor
    });
    // Apply duration factor to adjust speech length (once)
    const durations = Array.from(dpResult.duration.data, (d) => d * durationFactor);
    const durReshaped = Array.from({ length: bsz }, (_, b) => [[durations[b]]]);

    // Step 2: Encode text
    const textEncResult = await models.textEncOrt.run({
      text_ids: intArrayToTensor(textIds, textIdsShape),
      style_ttl: styleTtlTensor,
      text_mask: textMaskTensor
    });
    const textEmbTensor = textEncResult.text_emb;

    // Step 3: Denoising
    const { noisyLatent, latentMask } = sampleNoisyLatent(durReshaped, cfgs);
    const latentDim = noisyLatent[0].length;
    const latentLen = noisyLatent[0][0].length;
    const latentShape = [bsz, latentDim, latentLen];
    const latentMaskTensor = arrayToTensor(latentMask, [bsz, 1, latentMask[0][0].length]);

    // One flat buffer holds the latent for every iteration; fill it in [b][d][t] order
    const latentBuffer = new Float32Array(bsz * latentDim * latentLen);
    for (let b = 0; b < bsz; b++) {
      for (let d = 0; d < latentDim; d++) {
        const rowOffset = (b * latentDim + d) * latentLen;
        for (let t = 0; t < latentLen; t++) {
          latentBuffer[rowOffset + t] = noisyLatent[b][d][t];
        }
      }
    }

    // Tensors that are constant across iterations, plus one step-index tensor per iteration
    const scalarShape = [bsz];
    const totalStepTensor = arrayToTensor(new Array(bsz).fill(totalStep), scalarShape);
    const stepTensors = [];
    for (let step = 0; step < totalStep; step++) {
      stepTensors.push(arrayToTensor(new Array(bsz).fill(step), scalarShape));
    }

    for (let step = 0; step < totalStep; step++) {
      const vectorEstResult = await models.vectorEstOrt.run({
        noisy_latent: new ort.Tensor('float32', latentBuffer, latentShape),
        text_emb: textEmbTensor,
        style_ttl: styleTtlTensor,
        text_mask: textMaskTensor,
        latent_mask: latentMaskTensor,
        total_step: totalStepTensor,
        current_step: stepTensors[step]
      });
      // The denoised output becomes the input of the next iteration
      latentBuffer.set(vectorEstResult.denoised_latent.data);
    }

    // Step 4: Generate waveform straight from the latent buffer
    const vocoderResult = await models.vocoderOrt.run({
      latent: new ort.Tensor('float32', latentBuffer, latentShape)
    });
    const audioDuration = durations[0];
    const wavLen = Math.floor(sampleRate * audioDuration);
    // slice() copies (it is not a view), which prevents buffer reuse issues
    const audioData = vocoderResult.wav_tts.data.slice(0, wavLen);

    const processingTime = (Date.now() - startedAt) / 1000;
    return {
      success: true,
      processingTime,
      audioDuration,
      audioData,
      sampleRate,
      text
    };
  } catch (error) {
    return {
      success: false,
      error: getErrorMessage(error),
      text
    };
  }
}
/**
 * Format a duration in seconds with centisecond precision.
 *
 *   under 60 seconds    -> "SS.cc"
 *   under 60 minutes    -> "MM:SS.cc"
 *   60 minutes or more  -> "HH:MM:SS.cc"
 *
 * The value is truncated (not rounded) to centiseconds. Negative, NaN and
 * infinite inputs are rendered as zero instead of producing output such as
 * "-1.-50" or "NaN.NaN".
 *
 * @param {number} seconds - Duration in seconds.
 * @returns {string} The formatted time.
 */
function formatTimeDetailed(seconds) {
  const numeric = Number(seconds);
  const safeSeconds = Number.isFinite(numeric) && numeric > 0 ? numeric : 0;

  // Work in whole centiseconds. The tiny epsilon absorbs binary floating-point
  // error (0.29 * 100 === 28.999999999999996) so 0.29 s shows as ".29", not ".28".
  const totalCentis = Math.floor(safeSeconds * 100 + 1e-7);
  const centis = totalCentis % 100;
  const totalWholeSecs = Math.floor(totalCentis / 100);

  const hours = Math.floor(totalWholeSecs / 3600);
  const mins = Math.floor((totalWholeSecs % 3600) / 60);
  const secs = totalWholeSecs % 60;

  const pad = (value) => String(value).padStart(2, '0');
  const tail = `${pad(secs)}.${pad(centis)}`;

  if (totalWholeSecs < 60) {
    return tail;
  }
  if (totalWholeSecs < 3600) {
    return `${pad(mins)}:${tail}`;
  }
  return `${pad(hours)}:${pad(mins)}:${tail}`;
}
/**
 * Generate speech for arbitrarily long text: split it with chunkText(),
 * synthesize the chunks one after another, report each chunk through the
 * callbacks as soon as it is ready (progressive playback), and finally
 * concatenate everything (0.3 s of silence between chunks) into one WAV blob.
 *
 * Never throws; failures (including text that yields no chunks) are reported
 * as `{ success: false, error, text }`.
 *
 * @param {string} text - Full input text.
 * @param {number} totalStep - Denoising iterations per chunk.
 * @param {number} durationFactor - Duration multiplier per chunk.
 * @param {Function} [onFirstChunkReady] - Called as (audioData, sampleRate, durationSoFar, text,
 *   chunkCount, firstChunkTime, processedChars) after the first chunk.
 * @param {Function} [onChunkAdded] - Called as (audioData, sampleRate, durationSoFar, chunkNumber,
 *   chunkCount, processingTime, processedChars) after every later chunk.
 * @returns {Promise<object>} On success `{ success: true, processingTime, audioDuration, url, text,
 *   firstChunkTime }` (times in seconds, `url` an object URL of the WAV blob).
 */
async function generateSupertonicSpeechChunked(text, totalStep, durationFactor, onFirstChunkReady, onChunkAdded) {
  const supertonicStartTime = Date.now();
  const silenceDuration = 0.3; // 0.3 seconds of silence between chunks
  const elapsedSeconds = () => (Date.now() - supertonicStartTime) / 1000;
  try {
    const sampleRate = cfgs.ae.sample_rate;
    const chunks = chunkText(text);
    if (chunks.length === 0) {
      // Without this guard the result would claim success with a duration of -0.3 s
      throw new Error('No text to synthesize.');
    }
    const silenceSamples = Math.floor(silenceDuration * sampleRate);

    const audioDataArrays = [];
    // Running totals, so per-chunk progress does not re-sum all previous chunks
    let speechDuration = 0;
    let processedChars = 0;
    let totalSamples = 0;
    let firstChunkTime = 0;

    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];
      const result = await generateSupertonicSpeech(chunk, totalStep, durationFactor);
      if (!result.success) {
        throw new Error(`Failed to generate chunk ${i + 1}: ${result.error}`);
      }

      // Raw Float32Array is used directly - no WAV encode/decode round-trip
      const audioData = result.audioData;
      audioDataArrays.push(audioData);
      speechDuration += result.audioDuration;
      processedChars += chunk.length;
      totalSamples += audioData.length + (i > 0 ? silenceSamples : 0);

      if (i === 0) {
        firstChunkTime = elapsedSeconds();
        if (onFirstChunkReady) {
          onFirstChunkReady(audioData, sampleRate, speechDuration, text, chunks.length, firstChunkTime, processedChars);
        }
      } else if (onChunkAdded) {
        // i gaps of silence precede chunk i
        const durationSoFar = speechDuration + silenceDuration * i;
        onChunkAdded(audioData, sampleRate, durationSoFar, i + 1, chunks.length, elapsedSeconds(), processedChars);
      }
    }

    // Concatenate all chunks; the gaps stay zero-filled, which is silence
    const wavCat = new Float32Array(totalSamples);
    let writeIdx = 0;
    audioDataArrays.forEach((audioData, i) => {
      if (i > 0) {
        writeIdx += silenceSamples;
      }
      wavCat.set(audioData, writeIdx);
      writeIdx += audioData.length;
    });

    // Create final WAV file
    const wavBuffer = writeWavFile(wavCat, sampleRate);
    const blob = new Blob([wavBuffer], { type: 'audio/wav' });
    const url = URL.createObjectURL(blob);

    return {
      success: true,
      processingTime: elapsedSeconds(),
      audioDuration: speechDuration + silenceDuration * (chunks.length - 1),
      url: url,
      text: text,
      firstChunkTime: firstChunkTime
    };
  } catch (error) {
    return {
      success: false,
      error: getErrorMessage(error),
      text: text
    };
  }
}
// Main synthesis function
/**
 * Validate the demo text, run chunked Supertonic synthesis on it, and stream
 * the resulting audio through a Web Audio based player while the statistics
 * in the result card are updated chunk by chunk.
 *
 * Side effects visible in this function:
 * - reads the text from `demoTextInput` and the settings from the
 *   `demoTotalSteps` / `demoSpeed` controls;
 * - disables the generate button and the speaker/language items while running
 *   and re-enables them in `finally`;
 * - tears down any previous playback (sources, AudioContext, animation frame,
 *   registered custom players) and resets the shared playback state;
 * - replaces the content of `demoResults` with a fresh result card.
 *
 * @returns {Promise<void>} Resolves once generation finished or failed.
 *   Failures are reported through `showDemoStatus` / `showDemoError`; they are
 *   not rethrown to the caller.
 */
async function generateSpeech() {
    const text = (demoTextInput.textContent || demoTextInput.innerText || '').trim();

    // Validate text input
    const validation = validateTextInput(text);
    if (!validation.valid) {
        showDemoError(validation.message);
        return;
    }

    if (!models || !cfgs || !processors) {
        showDemoError('Models are still loading. Please wait.');
        return;
    }

    if (!currentStyleTtlTensor || !currentStyleDpTensor) {
        showDemoError('Reference embeddings are not ready. Please wait.');
        return;
    }

    // Validate characters before generation
    const charValidation = validateCharacters(text);
    if (!charValidation.valid && charValidation.unsupportedChars.length > 0) {
        const charList = charValidation.unsupportedChars.map(c => `"${c}"`).join(', ');
        showDemoError(`Cannot generate speech: Unsupported characters found: ${charList}`);
        return;
    }

    currentGenerationTextLength = text.length;

    // Seconds of silence the player leaves between two consecutive chunks.
    const CHUNK_GAP_SECONDS = 0.3;

    // Stop every scheduled source node and forget them.
    const stopScheduledSources = () => {
        scheduledSources.forEach((source) => {
            try {
                source.stop();
            } catch (e) {
                // Already stopped
            }
        });
        scheduledSources = [];
    };

    // Enable/disable the speaker and language pickers for the duration of a run.
    const setSelectionDisabled = (disabled) => {
        ['.speaker-item[data-voice]', '.speaker-item[data-language]'].forEach((selector) => {
            document.querySelectorAll(selector).forEach((item) => {
                item.classList.toggle('disabled', disabled);
            });
        });
    };

    try {
        isGenerating = true;
        demoGenerateBtn.disabled = true;
        setSelectionDisabled(true);

        hideDemoError();
        hideDemoStatus(); // Hide the status box when starting generation

        // Clean up previous audio playback
        if (audioContext) {
            stopScheduledSources();
            if (audioContext.state !== 'closed') {
                audioContext.close();
            }
            audioContext = null;
        }

        // Cancel animation frame
        if (animationFrameId) {
            cancelAnimationFrame(animationFrameId);
            animationFrameId = null;
        }

        // Clean up all custom audio players
        customAudioPlayers.forEach((player) => {
            if (player.cleanup) {
                player.cleanup();
            }
        });
        customAudioPlayers = [];

        // Reset state
        audioChunks = [];
        totalDuration = 0;
        startTime = 0;
        pauseTime = 0;
        isPaused = false;
        isPlaying = false;
        firstChunkGenerationTime = 0; // Processing time for first chunk
        totalChunks = 0;
        nextScheduledTime = 0; // Next time to schedule audio chunk

        // Show result shell(s) immediately
        const createInitialResultItem = (system, titleMain, titleSub, titleColor, includeStatus) => {
            const titleStatus = includeStatus
                ? `<span class="title-status status-running" id="${system}-status">⏳ Running...</span>`
                : '';
            return `
<div class="demo-result-item ${system}-result-item generating" id="${system}-result" style="--result-progress: 0%;">
<div class="demo-result-title">
<span class="title-main" style="color: ${titleColor};">${titleMain}</span>
<span class="title-sub">${titleSub}</span>
${titleStatus}
</div>
<div class="demo-result-info">
<!--
<div class="stat">
<div class="stat-value" id="${system}-chars">--</div>
<div class="stat-label">Processed Chars</div>
</div>
-->
<div class="stat">
<div class="stat-value" id="${system}-time">--</div>
<div class="stat-label">Processing Time<span class="stat-arrow stat-arrow--down">↓</span></div>
</div>
<div class="stat">
<div class="stat-value" id="${system}-cps">--</div>
<div class="stat-label">Chars/sec<span class="stat-arrow stat-arrow--up">↑</span></div>
</div>
<div class="stat">
<div class="stat-value" id="${system}-rtf">--</div>
<div class="stat-label">RTF<span class="stat-arrow stat-arrow--down">↓</span></div>
</div>
</div>
<div class="custom-audio-player">
<div class="demo-placeholder-audio">Generating speech...</div>
</div>
</div>
`;
        };

        const supertonicInitial = createInitialResultItem(
            'supertonic',
            'Supertonic',
            'On-Device',
            'var(--accent-yellow)',
            false
        );
        demoResults.style.display = 'flex';
        demoResults.innerHTML = supertonicInitial;

        const totalStep = Number.parseInt(demoTotalSteps.value, 10);
        const speed = parseFloat(demoSpeed.value);
        const durationFactor = speedToDurationFactor(speed);

        // Highest processed-character count shown so far; stale callbacks are ignored
        let latestSupertonicProcessedChars = 0;

        // Helper functions for custom player

        // Format seconds as m:ss.cc. Rounding to centiseconds happens before the
        // minute/second split so that e.g. 59.996 prints 1:00.00, not 0:60.00.
        const formatTime = (seconds, { trimMobile = false } = {}) => {
            const totalCentis = Math.round(seconds * 100);
            const mins = Math.floor(totalCentis / 6000);
            const secString = ((totalCentis % 6000) / 100).toFixed(2).padStart(5, '0');
            const formatted = `${mins}:${secString}`;
            return trimMobile ? trimDecimalsForMobile(formatted) : formatted;
        };

        // Animation-frame loop that moves the progress bar and the elapsed-time label.
        const updateProgress = () => {
            if (!isPlaying || !audioContext) return;
            const currentTime = isPaused ? pauseTime : (audioContext.currentTime - startTime);
            const progress = totalDuration > 0 ? (currentTime / totalDuration) * 100 : 0;
            if (progressFill) {
                progressFill.style.width = `${Math.min(progress, 100)}%`;
            }
            if (currentTimeDisplay) {
                currentTimeDisplay.textContent = formatTime(Math.min(currentTime, totalDuration), { trimMobile: true });
            }
            if (currentTime < totalDuration) {
                animationFrameId = requestAnimationFrame(updateProgress);
            } else {
                // Playback finished
                isPlaying = false;
                isPaused = false;
                if (playPauseBtn) {
                    playPauseBtn.innerHTML = PLAY_ICON_SVG;
                }
            }
        };

        // (Re)schedule every buffered chunk so that playback continues from
        // `seekTime` (seconds on the playback timeline, gaps included).
        // - A chunk that contains `seekTime` starts now, at the matching offset.
        // - A chunk that begins at or after `seekTime` is scheduled for its own
        //   start time, so a position inside a silence gap simply waits out the
        //   rest of the gap instead of passing a negative offset to start().
        // - Chunks that ended before `seekTime` are skipped; seeking past the end
        //   therefore schedules nothing.
        // `nextScheduledTime` is left at the start time of the next, not yet
        // generated chunk (gap included), matching the streaming callbacks.
        const schedulePlaybackFrom = (seekTime) => {
            const now = audioContext.currentTime;
            startTime = now - seekTime;
            let chunkStart = 0;
            audioChunks.forEach(({ buffer }) => {
                const chunkEnd = chunkStart + buffer.duration;
                if (chunkEnd > seekTime) {
                    const source = audioContext.createBufferSource();
                    source.buffer = buffer;
                    source.connect(audioContext.destination);
                    if (chunkStart >= seekTime) {
                        source.start(startTime + chunkStart);
                    } else {
                        source.start(now, seekTime - chunkStart);
                    }
                    scheduledSources.push(source);
                }
                chunkStart = chunkEnd + CHUNK_GAP_SECONDS;
            });
            nextScheduledTime = startTime + chunkStart;
        };

        // Play/pause button handler: pause when running, otherwise resume from
        // the paused position or restart from the beginning after playback ended.
        const togglePlayPause = () => {
            if (!audioContext || audioChunks.length === 0) return;

            if (isPlaying && !isPaused) {
                // Pause playback
                pauseTime = audioContext.currentTime - startTime;
                audioContext.suspend();
                isPaused = true;
                playPauseBtn.innerHTML = PLAY_ICON_SVG;
                if (animationFrameId) {
                    cancelAnimationFrame(animationFrameId);
                }
                return;
            }

            pauseAllPlayersExcept(supertonicPlayerRecord);
            if (!isPaused) {
                // Was finished, restart from beginning
                pauseTime = 0;
            }
            stopScheduledSources();
            // Resume AudioContext if suspended
            if (audioContext.state === 'suspended') {
                audioContext.resume();
            }
            schedulePlaybackFrom(pauseTime);
            isPaused = false;
            isPlaying = true;
            playPauseBtn.innerHTML = PAUSE_ICON_SVG;
            updateProgress();
        };

        // Jump to `percentage` (0-100) of the current total duration.
        const seekTo = (percentage) => {
            if (!audioContext || audioChunks.length === 0) return;

            // Guard against clicks reported outside the bar or a zero-width bar
            const clampedPercentage = Number.isFinite(percentage)
                ? Math.min(100, Math.max(0, percentage))
                : 0;
            const seekTime = (clampedPercentage / 100) * totalDuration;

            // Remember current playing state
            const wasActivelyPlaying = isPlaying && !isPaused;

            stopScheduledSources();

            // Cancel animation
            if (animationFrameId) {
                cancelAnimationFrame(animationFrameId);
            }

            // If paused or finished, just update the pause position
            if (!wasActivelyPlaying) {
                pauseTime = seekTime;
                // Update UI
                if (progressFill) {
                    progressFill.style.width = `${clampedPercentage}%`;
                }
                if (currentTimeDisplay) {
                    currentTimeDisplay.textContent = formatTime(seekTime, { trimMobile: true });
                }
                // Set to paused state so play button will resume from seek position
                isPaused = true;
                isPlaying = true; // Valid state for playback
                if (playPauseBtn) {
                    playPauseBtn.innerHTML = PLAY_ICON_SVG;
                }
                return;
            }

            // Resume AudioContext if it was suspended
            if (audioContext.state === 'suspended') {
                audioContext.resume();
            }

            // Reschedule from the seek point
            schedulePlaybackFrom(seekTime);

            // Resume playing state
            isPlaying = true;
            isPaused = false;
            if (playPauseBtn) {
                playPauseBtn.innerHTML = PAUSE_ICON_SVG;
            }

            // Restart progress animation
            updateProgress();
        };

        // Helper function to create AudioBuffer directly from Float32Array
        const createAudioBufferFromFloat32 = (audioData, sampleRate) => {
            const audioBuffer = audioContext.createBuffer(1, audioData.length, sampleRate);
            audioBuffer.getChannelData(0).set(audioData);
            return audioBuffer;
        };

        // Callback for first chunk ready - create custom player and start playback
        const onFirstChunkReady = async (audioData, sampleRate, duration, text, numChunks, firstChunkTime, processedChars) => {
            totalChunks = numChunks;
            firstChunkGenerationTime = firstChunkTime;

            const container = document.getElementById('demoResults');
            const textLength = currentGenerationTextLength > 0
                ? currentGenerationTextLength
                : (text ? text.length : 0);
            const isBatch = textLength >= getMaxChunkLength();
            const processingTimeStr = isBatch && firstChunkTime
                ? `${formatTimeDetailed(firstChunkTime)} / ${formatTimeDetailed(firstChunkTime)}`
                : formatTimeDetailed(firstChunkTime);
            const safeInitialChars = typeof processedChars === 'number' ? processedChars : 0;
            const displayedInitialChars = textLength > 0 ? Math.min(safeInitialChars, textLength) : safeInitialChars;
            const charsPerSec = firstChunkTime > 0 && displayedInitialChars > 0
                ? (displayedInitialChars / firstChunkTime).toFixed(1)
                : '0.0';
            const rtf = duration > 0 && firstChunkTime > 0 ? (firstChunkTime / duration).toFixed(3) : '-';
            const progressValue = textLength > 0 ? Math.min(100, (displayedInitialChars / textLength) * 100) : 0;

            const resultItemEl = document.getElementById('supertonic-result');
            if (!resultItemEl) {
                console.warn('Supertonic result container not found.');
                return;
            }
            resultItemEl.classList.remove('generating');
            resultItemEl.style.setProperty('--result-progress', `${progressValue}%`);

            const titleMainEl = resultItemEl.querySelector('.title-main');
            if (titleMainEl) {
                titleMainEl.textContent = 'Supertonic';
                titleMainEl.style.color = 'var(--accent-yellow)';
            }
            const titleSubEl = resultItemEl.querySelector('.title-sub');
            if (titleSubEl) {
                titleSubEl.textContent = 'On-Device';
            }
            const infoContainer = resultItemEl.querySelector('.demo-result-info');
            if (infoContainer) {
                infoContainer.classList.remove('error');
            }

            const timeElInitial = document.getElementById('supertonic-time');
            if (timeElInitial) {
                timeElInitial.innerHTML = formatStatValueWithSuffix(processingTimeStr, 's', { firstLabel: true });
            }
            const cpsElInitial = document.getElementById('supertonic-cps');
            if (cpsElInitial) {
                cpsElInitial.textContent = charsPerSec;
            }
            const rtfElInitial = document.getElementById('supertonic-rtf');
            if (rtfElInitial) {
                rtfElInitial.innerHTML = formatStatValueWithSuffix(rtf, 'x');
            }

            const playerContainer = resultItemEl.querySelector('.custom-audio-player');
            if (playerContainer) {
                playerContainer.style.display = '';
                playerContainer.innerHTML = `
<button id="play-pause-btn" class="player-btn">${PAUSE_ICON_SVG}</button>
<div class="time-display" id="current-time">0:00.00</div>
<div class="progress-container" id="progress-container">
<div class="progress-bar">
<div class="progress-fill" id="progress-fill"></div>
</div>
</div>
<div class="time-display" id="total-duration">${formatTime(duration, { trimMobile: true })}</div>
<div class="demo-result-actions" style="display: none;">
<button class="demo-download-btn" id="supertonic-download" aria-label="Download WAV" title="Download WAV">
<svg width="16" height="16" fill="none" stroke="currentColor" stroke-width="2" viewBox="0 0 24 24">
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
<polyline points="7 10 12 15 17 10"/>
<line x1="12" y1="15" x2="12" y2="3"/>
</svg>
</button>
</div>
`;
            }
            container.style.display = 'flex';
            latestSupertonicProcessedChars = displayedInitialChars;

            // Get UI elements
            playPauseBtn = document.getElementById('play-pause-btn');
            progressBar = document.getElementById('progress-container');
            currentTimeDisplay = document.getElementById('current-time');
            durationDisplay = document.getElementById('total-duration');
            progressFill = document.getElementById('progress-fill');

            // Initialize Web Audio API
            audioContext = new (window.AudioContext || window.webkitAudioContext)();
            startTime = audioContext.currentTime;
            totalDuration = duration;
            isPlaying = true;
            isPaused = false;

            // Pause hook handed to the shared player registry
            const pausePlayback = () => {
                if (!audioContext || audioContext.state === 'closed') return;
                if (isPlaying) {
                    pauseTime = audioContext.currentTime - startTime;
                    stopScheduledSources();
                    audioContext.suspend();
                    isPaused = true;
                    isPlaying = false;
                    if (playPauseBtn) {
                        playPauseBtn.innerHTML = PLAY_ICON_SVG;
                    }
                    if (animationFrameId) {
                        cancelAnimationFrame(animationFrameId);
                    }
                }
            };

            // Create Supertonic player record and register it
            supertonicPlayerRecord = {
                audioContext: audioContext,
                pausePlayback: pausePlayback
            };
            // Remove old Supertonic player if exists and add new one
            customAudioPlayers = customAudioPlayers.filter(p => p !== supertonicPlayerRecord && p.audioContext !== audioContext);
            customAudioPlayers.push(supertonicPlayerRecord);

            // Pause all other players before starting Supertonic
            pauseAllPlayersExcept(supertonicPlayerRecord);

            // Create AudioBuffer directly from Float32Array - no WAV encode/decode
            const audioBuffer = createAudioBufferFromFloat32(audioData, sampleRate);
            audioChunks.push({ buffer: audioBuffer, duration: audioBuffer.duration });

            // Play first chunk immediately
            const source = audioContext.createBufferSource();
            source.buffer = audioBuffer;
            source.connect(audioContext.destination);
            source.start(audioContext.currentTime);
            scheduledSources.push(source);

            // Set next scheduled time for additional chunks
            nextScheduledTime = audioContext.currentTime + audioBuffer.duration + CHUNK_GAP_SECONDS; // Add silence gap

            // Setup player controls (skipped if the player markup could not be rendered)
            if (playPauseBtn) {
                playPauseBtn.addEventListener('click', togglePlayPause);
            }
            if (progressBar) {
                progressBar.addEventListener('click', (e) => {
                    const rect = progressBar.getBoundingClientRect();
                    const percentage = ((e.clientX - rect.left) / rect.width) * 100;
                    seekTo(percentage);
                });
            }

            // Start progress animation
            updateProgress();
        };

        // Callback for each additional chunk - schedule seamlessly
        const onChunkAdded = async (audioData, sampleRate, duration, chunkIndex, chunkCount, currentProcessingTime, processedChars) => {
            if (!audioContext) return;

            // Create AudioBuffer directly from Float32Array - no WAV encode/decode
            const audioBuffer = createAudioBufferFromFloat32(audioData, sampleRate);
            const chunkDuration = audioBuffer.duration;
            audioChunks.push({ buffer: audioBuffer, duration: chunkDuration });

            // Schedule the new chunk at the pre-calculated time
            const source = audioContext.createBufferSource();
            source.buffer = audioBuffer;
            source.connect(audioContext.destination);
            source.start(nextScheduledTime);
            scheduledSources.push(source);

            // Update next scheduled time for the next chunk
            nextScheduledTime = nextScheduledTime + audioBuffer.duration + CHUNK_GAP_SECONDS; // Add silence gap

            // Update total duration
            totalDuration = duration;

            // Update duration display with smooth animation
            if (durationDisplay) {
                durationDisplay.textContent = formatTime(duration, { trimMobile: true });
                durationDisplay.style.transition = 'color 0.3s';
                durationDisplay.style.color = '#ffffff';
                setTimeout(() => {
                    durationDisplay.style.color = '';
                }, 300);
            }

            // Update info display
            const textLength = currentGenerationTextLength > 0
                ? currentGenerationTextLength
                : (demoTextInput.textContent || demoTextInput.innerText || '').trim().length;
            const isBatch = textLength >= getMaxChunkLength();
            const timeEl = document.getElementById('supertonic-time');
            const durationEl = document.getElementById('supertonic-duration');
            const cpsEl = document.getElementById('supertonic-cps');
            const rtfEl = document.getElementById('supertonic-rtf');

            const effectiveProcessedChars = typeof processedChars === 'number' ? processedChars : latestSupertonicProcessedChars;
            // Never let the displayed statistics move backwards
            if (effectiveProcessedChars < latestSupertonicProcessedChars) {
                return;
            }
            const clampedProcessedChars = textLength > 0 ? Math.min(effectiveProcessedChars, textLength) : effectiveProcessedChars;
            const progressValue = textLength > 0 ? Math.min(100, (clampedProcessedChars / textLength) * 100) : 0;

            if (durationEl) {
                durationEl.textContent = formatTimeDetailed(duration);
            }
            if (timeEl && isBatch && firstChunkGenerationTime > 0 && currentProcessingTime) {
                const timeDisplay = `${formatTimeDetailed(firstChunkGenerationTime)} / ${formatTimeDetailed(currentProcessingTime)}`;
                timeEl.innerHTML = formatStatValueWithSuffix(timeDisplay, 's', { firstLabel: true });
            }
            if (cpsEl && currentProcessingTime > 0 && clampedProcessedChars >= 0) {
                const charsPerSec = (clampedProcessedChars / currentProcessingTime).toFixed(1);
                cpsEl.textContent = charsPerSec;
            }
            if (rtfEl && duration > 0 && currentProcessingTime > 0) {
                const rtf = (currentProcessingTime / duration).toFixed(3);
                rtfEl.innerHTML = formatStatValueWithSuffix(rtf, 'x');
            }
            const resultItemEl = document.getElementById('supertonic-result');
            if (resultItemEl) {
                resultItemEl.style.setProperty('--result-progress', `${progressValue}%`);
            }
            latestSupertonicProcessedChars = clampedProcessedChars;
        };

        // Run the chunked synthesis; the callbacks above stream audio into the player
        const result = await generateSupertonicSpeechChunked(
            text,
            totalStep,
            durationFactor,
            onFirstChunkReady,
            onChunkAdded
        );

        if (!result.success) {
            // Failures come back as { success: false, error } rather than as an
            // exception; route them into the shared error handling below so the
            // card does not stay on "Generating speech..." forever.
            throw new Error(result.error || 'Speech synthesis failed.');
        }

        const textLength = result.text ? result.text.length : 0;
        const isBatch = textLength >= getMaxChunkLength();
        const processingTimeStr = isBatch && firstChunkGenerationTime > 0
            ? `${formatTimeDetailed(firstChunkGenerationTime)} / ${formatTimeDetailed(result.processingTime)}`
            : formatTimeDetailed(result.processingTime);
        const charsPerSec = result.processingTime > 0 ? (textLength / result.processingTime).toFixed(1) : '0.0';
        const progressValue = textLength > 0 ? 100 : 0;

        const timeEl = document.getElementById('supertonic-time');
        const durationEl = document.getElementById('supertonic-duration');
        const cpsEl = document.getElementById('supertonic-cps');
        const rtfEl = document.getElementById('supertonic-rtf');

        if (timeEl) timeEl.innerHTML = formatStatValueWithSuffix(processingTimeStr, 's', { firstLabel: true });
        if (durationEl) durationEl.textContent = formatTimeDetailed(result.audioDuration);
        latestSupertonicProcessedChars = textLength;
        if (cpsEl) cpsEl.textContent = charsPerSec;
        if (rtfEl) {
            const rtf = result.audioDuration > 0 ? (result.processingTime / result.audioDuration).toFixed(3) : '-';
            rtfEl.innerHTML = formatStatValueWithSuffix(rtf, 'x');
        }
        const resultItemEl = document.getElementById('supertonic-result');
        if (resultItemEl) {
            resultItemEl.style.setProperty('--result-progress', `${progressValue}%`);
        }

        // Final duration update (if custom player was used)
        if (audioContext && audioChunks.length > 0) {
            totalDuration = result.audioDuration;
            if (durationDisplay) {
                durationDisplay.textContent = formatTime(result.audioDuration, { trimMobile: true });
            }
        }

        // Always show download button
        const downloadBtn = document.getElementById('supertonic-download');
        if (downloadBtn) {
            downloadBtn.parentElement.style.display = 'block';
            downloadBtn.onclick = () => downloadDemoAudio(result.url, 'supertonic_speech.wav');
        }
    } catch (error) {
        const errorMessage = getErrorMessage(error);
        // The status box receives HTML, so the message itself must be escaped
        showDemoStatus(`<strong>Error:</strong> ${escapeHtml(String(errorMessage))}`, 'error');
        showDemoError(`Error during synthesis: ${errorMessage}`);
        console.error('Synthesis error:', error);

        // The player UI is removed below; do not leave audio running without controls
        stopScheduledSources();
        if (animationFrameId) {
            cancelAnimationFrame(animationFrameId);
            animationFrameId = null;
        }
        isPlaying = false;
        isPaused = false;

        // Restore placeholder
        demoResults.style.display = 'none';
        demoResults.innerHTML = `
<div class="demo-placeholder">
<div class="demo-placeholder-icon">🎙️</div>
<p>Your generated speech will appear here</p>
</div>
`;
    } finally {
        isGenerating = false;
        demoGenerateBtn.disabled = false;
        // Re-enable speaker and language selection after generation
        setSelectionDisabled(false);
    }
}
// Download handler, attached to window so it is reachable as a global
/**
 * Trigger a download by clicking a temporary anchor element.
 *
 * @param {string} url - Value assigned to the anchor's `href`.
 * @param {string} filename - Value assigned to the anchor's `download` attribute.
 */
window.downloadDemoAudio = function (url, filename) {
    const link = Object.assign(document.createElement('a'), {
        href: url,
        download: filename,
    });
    link.click();
};
/**
 * Convert a playback speed into a duration factor.
 *
 * @param {number} speed - Speed value (the caller passes the parsed speed slider value).
 * @param {number} [offset=0.05] - Amount added to `speed` before inverting.
 * @returns {number} `1 / (speed + offset)`.
 */
function speedToDurationFactor(speed, offset = 0.05) {
    const shiftedSpeed = speed + offset;
    return 1 / shiftedSpeed;
}
/**
 * Refresh the text labels next to the two sliders.
 *
 * The step count is shown as "<value> Steps"; the speed is parsed as a float
 * and shown with two decimals and an 'x' suffix (e.g. 1.00x, 0.70x, 1.50x).
 */
function updateSliderValues() {
    const stepCount = demoTotalSteps.value;
    demoTotalStepsValue.textContent = `${stepCount} Steps`;

    const speedLabel = parseFloat(demoSpeed.value).toFixed(2);
    demoSpeedValue.textContent = `${speedLabel}x`;
}
// Both sliders refresh the value labels whenever they move
for (const slider of [demoTotalSteps, demoSpeed]) {
    slider.addEventListener('input', updateSliderValues);
}

// Show the initial slider values
updateSliderValues();

// The generate button starts a synthesis run
demoGenerateBtn.addEventListener('click', generateSpeech);

// Preset text items (declared before the input listener so both share this scope)
const presetItems = document.querySelectorAll('.preset-item[data-preset]');
const freeformBtn = document.getElementById('freeformBtn');

// currentLanguage is declared earlier in the file and reused here.
// True while a preset button is replacing the text programmatically.
let isPresetChanging = false;
/**
 * Mark one preset item as active and record it as the current preset.
 *
 * Clears the 'active' class from every item in `presetItems`, adds it to the
 * first `.preset-item` whose `data-preset` equals `presetType` (if any),
 * stores `presetType` in `currentPreset`, and switches quote mode on only
 * when the preset is 'quote'.
 *
 * @param {string|null|undefined} presetType - Preset key; a falsy value clears the highlight.
 */
function updateActiveButton(presetType) {
    for (const presetItem of presetItems) {
        presetItem.classList.remove('active');
    }

    const activeItem = presetType
        ? document.querySelector(`.preset-item[data-preset="${presetType}"]`)
        : null;
    if (activeItem) {
        activeItem.classList.add('active');
    }

    currentPreset = presetType;
    updateQuoteModeState(presetType === 'quote');
}
/**
 * Add or remove the 'quote-mode' class on the results container.
 * Does nothing when `demoResults` is falsy.
 *
 * @param {*} isQuote - Coerced to a boolean; truthy adds the class, falsy removes it.
 */
function updateQuoteModeState(isQuote) {
    if (demoResults) {
        const quoteModeEnabled = !!isQuote;
        demoResults.classList.toggle('quote-mode', quoteModeEnabled);
    }
}
// Start with the quote preset highlighted
updateActiveButton('quote');

// Pre-fill the input with the quote text for the current language, when one exists
{
    const quotePreset = presetTexts.quote;
    const hasLocalizedQuote = Boolean(
        quotePreset && typeof quotePreset === 'object' && quotePreset[currentLanguage]
    );
    if (hasLocalizedQuote) {
        demoTextInput.textContent = quotePreset[currentLanguage];
        updateCharCounter();
    }
}
// Clicking a preset item replaces the input text:
// - 'freeform' clears it;
// - a per-language preset object supplies the text for currentLanguage;
// - a plain string preset (old format) is used as-is;
// - anything else leaves the input untouched.
presetItems.forEach((presetItem) => {
    presetItem.addEventListener('click', () => {
        const presetType = presetItem.getAttribute('data-preset');

        let replacementText;
        if (presetType === 'freeform') {
            replacementText = '';
        } else {
            const preset = presetTexts[presetType];
            if (preset && typeof preset === 'object' && preset[currentLanguage]) {
                replacementText = preset[currentLanguage];
            } else if (preset && typeof preset === 'string') {
                // Fallback for old format (shouldn't happen, but just in case)
                replacementText = preset;
            }
        }

        if (replacementText === undefined) {
            return;
        }

        // Flag the change as preset-driven while the text and counters are updated
        isPresetChanging = true;
        demoTextInput.textContent = replacementText;
        updateActiveButton(presetType);
        updateCharCounter();
        isPresetChanging = false;
    });
});
// Paste as plain text: drop any formatting and insert only the clipboard's text
demoTextInput.addEventListener('paste', (event) => {
    event.preventDefault();

    const clipboard = event.clipboardData || window.clipboardData;
    const plainText = clipboard.getData('text/plain');

    const selection = window.getSelection();
    if (selection.rangeCount === 0) {
        return;
    }

    // Replace the selected content with a single text node
    const caretRange = selection.getRangeAt(0);
    caretRange.deleteContents();
    const insertedNode = document.createTextNode(plainText);
    caretRange.insertNode(insertedNode);

    // Collapse the selection to just after the inserted text
    caretRange.setStartAfter(insertedNode);
    caretRange.collapse(true);
    selection.removeAllRanges();
    selection.addRange(caretRange);

    // Trigger input event to update character counter
    demoTextInput.dispatchEvent(new Event('input', { bubbles: true }));
});
// Text content of the input at this point of initialization
// (presumably compared against by the input handler — not visible in this section)
let previousTextValue = demoTextInput.textContent || demoTextInput.innerText || '';

// The left border line follows the height of .demo-input-section
const demoInputSection = document.querySelector('.demo-input-section');

/**
 * Publish the input section's current offsetHeight as the
 * --demo-text-input-height custom property on that section.
 * Does nothing when the section is missing.
 */
function updateLeftBorderHeight() {
    if (!demoInputSection) {
        return;
    }
    demoInputSection.style.setProperty(
        '--demo-text-input-height',
        `${demoInputSection.offsetHeight}px`
    );
}

// Set the value once now, then again whenever the section is resized
updateLeftBorderHeight();
const resizeObserver = new ResizeObserver(updateLeftBorderHeight);
if (demoInputSection) {
    resizeObserver.observe(demoInputSection);
}
/**
 * Size the text input so the demo fits the viewport on screens wider than 768px.
 *
 * At 768px and below the inline height is cleared. Otherwise the function adds
 * up the vertical padding of .interactive-demo and .container, the rendered
 * heights of the header wrapper, controls, input label, preset row and output
 * section, and the .demo-content gap twice, subtracts that sum plus a fixed
 * 275px from the viewport height, and sets the input height to that value
 * minus a 20px buffer, never below 200px. Missing elements contribute 0
 * (padding falls back to document.body's computed style).
 */
function calculateTextInputHeight() {
    if (window.innerWidth <= 768) {
        // Reset to default height for screens 768px and below
        demoTextInput.style.height = '';
        return;
    }

    const MIN_HEIGHT = 200;
    const FIXED_RESERVED = 275;
    const BUFFER = 20;

    // Parse a CSS length, treating anything unparseable as 0
    const toPixels = (cssValue) => parseFloat(cssValue) || 0;
    // Computed style of the first match, or of <body> when nothing matches
    const computedStyleOf = (selector) =>
        window.getComputedStyle(document.querySelector(selector) || document.body);
    // Top plus bottom padding of the element matching `selector`
    const verticalPadding = (selector) => {
        const style = computedStyleOf(selector);
        return toPixels(style.paddingTop) + toPixels(style.paddingBottom);
    };
    // Rendered height of the first match, or 0 when it does not exist
    const renderedHeight = (selector) => {
        const element = document.querySelector(selector);
        return element ? element.offsetHeight : 0;
    };

    const contentGap = toPixels(computedStyleOf('.demo-content').gap);

    // Everything that occupies vertical space around the text input
    const occupiedHeight = [
        verticalPadding('.interactive-demo'),
        verticalPadding('.container'),
        renderedHeight('.demo-header-wrapper'),
        renderedHeight('.demo-controls'),
        contentGap, // gap above the input block
        renderedHeight('.demo-input-label'),
        renderedHeight('#presetControlsRow'),
        contentGap, // gap below the input block
        renderedHeight('.demo-output-section'),
    ].reduce((sum, part) => sum + part, 0);

    const availableHeight = window.innerHeight - occupiedHeight - FIXED_RESERVED;
    const targetHeight = availableHeight > MIN_HEIGHT
        ? Math.max(MIN_HEIGHT, availableHeight - BUFFER)
        : MIN_HEIGHT;
    demoTextInput.style.height = `${targetHeight}px`;
}
// Fit the text input immediately, and again on every window resize.
calculateTextInputHeight();
window.addEventListener('resize', calculateTextInputHeight);

// Re-fit the text input whenever one of the surrounding sections changes size.
const heightObserver = new ResizeObserver(() => calculateTextInputHeight());
const headerWrapperEl = document.querySelector('.demo-header-wrapper');
const controlsEl = document.querySelector('.demo-controls');
const presetRowEl = document.querySelector('#presetControlsRow');
const outputSectionEl = document.querySelector('.demo-output-section');
for (const section of [headerWrapperEl, controlsEl, presetRowEl, outputSectionEl]) {
  // Sections missing from the page are simply not observed.
  if (section) {
    heightObserver.observe(section);
  }
}
// Auto-hiding scrollbar: the `scrolling` class is present while the user
// scrolls and is removed after 1.5 seconds without a scroll event.
let scrollbarTimeout;
demoTextInput.addEventListener('scroll', () => {
  // Restart the countdown on every scroll event (clearing an unset timer is a no-op).
  clearTimeout(scrollbarTimeout);
  demoTextInput.classList.add('scrolling');
  scrollbarTimeout = setTimeout(() => {
    demoTextInput.classList.remove('scrolling');
  }, 1500);
});
// React to edits in the text input: refresh the counter, switch to the
// freeform preset on user edits, and auto-detect the language while in freeform.
demoTextInput.addEventListener('input', () => {
  updateCharCounter();

  const latestText = demoTextInput.textContent || demoTextInput.innerText || '';

  // A change that did not come from a preset button means the user is typing.
  const editedByUser = !isPresetChanging && latestText !== previousTextValue;
  if (editedByUser) {
    updateActiveButton('freeform');
  }

  if (currentPreset === 'freeform') {
    const guessedLang = detectLanguage(latestText);
    const languageChanged = Boolean(guessedLang) && guessedLang !== currentLanguage;
    if (languageChanged) {
      const priorLang = currentLanguage;
      currentLanguage = guessedLang;
      window.updateActiveLanguage(currentLanguage);
      showLanguageToast(priorLang, guessedLang);
    }
  }

  previousTextValue = latestText;
});
// Window resizes re-run the character counter, debounced so it fires once,
// 100ms after the last resize event in a burst.
let resizeTimeout;
window.addEventListener('resize', () => {
  clearTimeout(resizeTimeout);
  resizeTimeout = setTimeout(() => updateCharCounter(), 100);
});

// First run of the character counter.
updateCharCounter();
// Speaker list (takes the place of a voice <select> dropdown).
const speakerList = document.getElementById('speakerList');
const speakerItems = speakerList ? speakerList.querySelectorAll('.speaker-item[data-voice]') : [];
// True while a voice switch is in flight; speaker clicks are ignored meanwhile.
let voiceSelectDisabled = false;

/**
 * Marks the speaker item whose `data-voice` equals `voice` as active and
 * clears the `active` class from every other item. Exposed on `window`.
 * No-op when the speaker list is not in the document.
 */
window.updateActiveSpeaker = function (voice) {
  if (!speakerList) {
    return;
  }
  speakerItems.forEach((entry) => {
    entry.classList.toggle('active', entry.dataset.voice === voice);
  });
};

// Highlight the initially selected voice.
if (speakerList && speakerItems.length > 0) {
  window.updateActiveSpeaker(currentVoice);
}
// Speaker item interactions: click to select a voice, hover tooltip for the
// mouse, and press-to-preview tooltip on touch layouts.
const speakerTooltip = document.getElementById('speakerTooltip');
if (speakerList) {
  const TOUCH_MOVE_THRESHOLD = 10; // px of vertical travel after which a touch no longer counts as a tap
  const TAP_MAX_DURATION_MS = 500; // touches lasting this long or longer are treated as long presses
  const SYNTHETIC_CLICK_DELAY_MS = 50; // delay before a tap is re-dispatched as a click

  // The touch code paths apply only when both checks hold; otherwise the
  // mouse code paths are used.
  const usesTouchLayout = () => isTouchDevice() && isMobileViewport();

  // Current text of the input with surrounding whitespace removed.
  const readInputText = () => (demoTextInput.textContent || demoTextInput.innerText || '').trim();

  speakerItems.forEach((item) => {
    // Set by the touchend handler right before it dispatches a synthetic click,
    // so the click handler can tell that click apart from a native one.
    let clickFromTouch = false;

    // Click: select this voice (or regenerate when it is already selected).
    item.addEventListener('click', async () => {
      // On touch layouts native clicks are ignored because touchend re-dispatches
      // short taps as clicks. That hand-off exists only when the touch handlers
      // below were installed, i.e. when the tooltip element is present; without
      // it, native clicks must be handled here or taps would do nothing.
      if (speakerTooltip && usesTouchLayout() && !clickFromTouch) {
        return;
      }
      clickFromTouch = false;

      if (voiceSelectDisabled || modelsLoading || isGenerating) {
        return;
      }

      const selectedVoice = item.dataset.voice;

      // Already selected: just generate and play again.
      if (selectedVoice === currentVoice) {
        if (readInputText().length >= 10 && !isGenerating && models && cfgs && processors) {
          generateSpeech();
        }
        return;
      }

      // Lock the controls while the voice is being switched.
      const wasDisabled = demoGenerateBtn.disabled;
      demoGenerateBtn.disabled = true;
      voiceSelectDisabled = true;

      // Reflect the choice in the UI right away.
      window.updateActiveSpeaker(selectedVoice);

      try {
        await switchVoice(selectedVoice);

        // The switch is finished, so release the picker lock even when the
        // models are not available; otherwise no later click could get through.
        voiceSelectDisabled = false;

        // The generate button is re-enabled only once models are loaded.
        if (models && cfgs && processors) {
          demoGenerateBtn.disabled = false;
          // Auto-generate and play with the new voice.
          if (readInputText().length >= 10 && !isGenerating) {
            generateSpeech();
          }
        }
      } catch (error) {
        console.error('Failed to switch voice:', error);
        // Roll the highlighted item back to the voice that is still current.
        window.updateActiveSpeaker(currentVoice);
        voiceSelectDisabled = false;
        if (!wasDisabled) {
          demoGenerateBtn.disabled = false;
        }
      }
    });

    // Tooltip behavior needs the tooltip element.
    if (!speakerTooltip) {
      return;
    }

    // --- Mouse hover tooltip (skipped on touch layouts) ---
    item.addEventListener('mouseenter', (e) => {
      if (usesTouchLayout()) return;
      const voice = item.dataset.voice;
      if (voice && VOICE_DESCRIPTIONS[voice]) {
        speakerTooltip.textContent = VOICE_DESCRIPTIONS[voice];
        speakerTooltip.style.display = 'block';
        updateTooltipPosition(e, speakerTooltip);
      }
    });

    item.addEventListener('mousemove', (e) => {
      if (usesTouchLayout()) return;
      if (speakerTooltip.style.display === 'block') {
        updateTooltipPosition(e, speakerTooltip);
      }
    });

    item.addEventListener('mouseleave', () => {
      if (usesTouchLayout()) return;
      speakerTooltip.style.display = 'none';
    });

    // --- Touch tooltip and tap detection (touch layouts only) ---
    let touchStartTime = 0;
    let touchHandled = false; // true once the touch moved too far, was cancelled, or ended as a non-tap
    let touchStartY = 0;

    item.addEventListener('touchstart', (e) => {
      if (!usesTouchLayout()) return;
      touchHandled = false;
      const touch = e.touches[0];
      touchStartTime = Date.now();
      touchStartY = touch.clientY;

      const voice = item.dataset.voice;
      if (voice && VOICE_DESCRIPTIONS[voice]) {
        // Prevent default to block text selection.
        e.preventDefault();
        speakerTooltip.textContent = VOICE_DESCRIPTIONS[voice];
        speakerTooltip.style.display = 'block';
        updateTooltipPositionMobile(speakerTooltip, touch.clientY);
      }
    }, { passive: false });

    item.addEventListener('touchmove', (e) => {
      if (!usesTouchLayout()) return;
      const touch = e.touches[0];
      const deltaY = Math.abs(touch.clientY - touchStartY);

      if (deltaY > TOUCH_MOVE_THRESHOLD) {
        // The finger travelled too far: not a tap, and the tooltip goes away.
        touchHandled = true;
        speakerTooltip.style.display = 'none';
      }

      // Prevent default to avoid scrolling while the tooltip is showing.
      e.preventDefault();
    }, { passive: false });

    item.addEventListener('touchend', (e) => {
      if (!usesTouchLayout()) return;
      const touchDuration = Date.now() - touchStartTime;

      speakerTooltip.style.display = 'none';

      // Always prevent default to avoid text selection.
      e.preventDefault();

      if (!touchHandled && touchDuration < TAP_MAX_DURATION_MS) {
        // Short tap without movement: replay it as a click after a small delay.
        clickFromTouch = true;
        setTimeout(() => {
          const clickEvent = new MouseEvent('click', {
            bubbles: true,
            cancelable: true,
            view: window
          });
          item.dispatchEvent(clickEvent);
        }, SYNTHETIC_CLICK_DELAY_MS);
      } else {
        // Long press or moved: no click is produced.
        touchHandled = true;
        e.stopPropagation();
      }
    }, { passive: false });

    item.addEventListener('touchcancel', (e) => {
      if (!usesTouchLayout()) return;
      speakerTooltip.style.display = 'none';
      touchHandled = true;
      e.preventDefault();
    }, { passive: false });

    // Suppress the long-press context menu on touch layouts.
    item.addEventListener('contextmenu', (e) => {
      if (usesTouchLayout()) {
        e.preventDefault();
      }
    });
  });
}
/**
 * Places `tooltip` at the pointer's x position, 40px above the pointer, then
 * nudges it back inside the viewport.
 *
 * The tooltip is measured once, right after the initial placement, and every
 * overflow check uses that single measurement:
 *   - past the right edge  -> pinned 10px from the right edge
 *   - past the left edge   -> pinned 10px from the left edge
 *   - above the top edge   -> moved to 40px below the pointer
 *   - past the bottom edge -> pinned 10px from the bottom edge
 *
 * @param {{clientX: number, clientY: number}} event - pointer event supplying viewport coordinates
 * @param {HTMLElement} tooltip - element positioned through `style.left` / `style.top`
 */
function updateTooltipPosition(event, tooltip) {
  const { clientX, clientY } = event;

  tooltip.style.left = `${clientX}px`;
  tooltip.style.top = `${clientY - 40}px`;

  const { left, right, top, bottom, width, height } = tooltip.getBoundingClientRect();
  const viewportWidth = window.innerWidth;
  const viewportHeight = window.innerHeight;

  if (right > viewportWidth) {
    tooltip.style.left = `${viewportWidth - width - 10}px`;
  }
  if (left < 0) {
    tooltip.style.left = '10px';
  }
  if (top < 0) {
    tooltip.style.top = `${clientY + 40}px`;
  }
  if (bottom > viewportHeight) {
    tooltip.style.top = `${viewportHeight - height - 10}px`;
  }
}
/**
 * Lays the tooltip out for touch use: 90% wide with a 5% left offset (so it is
 * horizontally centered), centered wrapping text, 75px above the touch point.
 *
 * The tooltip is measured once after that placement. If its top is less than
 * 10px from the viewport top it is moved to 20px below the touch point; if its
 * bottom is within 10px of the viewport bottom it is pinned 10px above the
 * bottom edge.
 *
 * @param {HTMLElement} tooltip - element styled and positioned in place
 * @param {number} touchY - viewport y coordinate of the touch
 */
function updateTooltipPositionMobile(tooltip, touchY) {
  const viewportHeight = window.innerHeight;

  Object.assign(tooltip.style, {
    width: '90%',
    left: '5%', // (100% - 90%) / 2
    right: 'auto',
    marginLeft: '0',
    marginRight: '0',
    whiteSpace: 'normal',
    textAlign: 'center',
    top: `${touchY - 75}px`,
  });

  const { top, bottom, height } = tooltip.getBoundingClientRect();

  if (top < 10) {
    // Not enough room above the finger: show the tooltip below it instead.
    tooltip.style.top = `${touchY + 20}px`;
  }
  if (bottom > viewportHeight - 10) {
    tooltip.style.top = `${viewportHeight - height - 10}px`;
  }
}
// Language picker elements and the set of languages written right-to-left.
const languageList = document.getElementById('languageList');
const languageItems = languageList ? languageList.querySelectorAll('.speaker-item[data-language]') : [];
const rtlLanguages = new Set(['ar']);

/**
 * Applies language and text direction to the page for `language`.
 *
 * Sets `lang`, `dir` and `data-language` on the root element, toggles the
 * `rtl-language` class on <body>, and mirrors `lang`/`dir` onto the text input
 * (plus its `rtl-text` class) and onto the results element when it exists.
 * A falsy `language` is reported as 'en' and treated as left-to-right.
 */
function applyInterfaceDirection(language) {
  const lang = language || 'en';
  const isRtl = rtlLanguages.has(language);
  const direction = isRtl ? 'rtl' : 'ltr';
  const localeAttributes = [['lang', lang], ['dir', direction]];

  const root = document.documentElement;
  root.lang = lang;
  root.dir = direction;
  root.dataset.language = lang;
  document.body.classList.toggle('rtl-language', isRtl);

  for (const [attribute, value] of localeAttributes) {
    demoTextInput.setAttribute(attribute, value);
  }
  demoTextInput.classList.toggle('rtl-text', isRtl);

  if (demoResults) {
    for (const [attribute, value] of localeAttributes) {
      demoResults.setAttribute(attribute, value);
    }
  }
}
/**
 * Switches the interface to `language`: applies language/direction attributes
 * to the page, then marks the matching item in the language list as active and
 * clears the `active` class from the rest. Exposed on `window`.
 */
window.updateActiveLanguage = function (language) {
  applyInterfaceDirection(language);
  if (!languageList) {
    return;
  }
  languageItems.forEach((entry) => {
    entry.classList.toggle('active', entry.dataset.language === language);
  });
};

// Reflect the starting language in the UI.
if (languageList && languageItems.length > 0) {
  window.updateActiveLanguage(currentLanguage);
}
// Language item clicks: switch language, swap in the preset text for that
// language when a preset is active, then generate speech.
if (languageList) {
  // Generates speech when the input holds at least 10 characters, nothing is
  // being generated, and the models, configs and processors are all available.
  const generateIfReady = () => {
    const text = (demoTextInput.textContent || demoTextInput.innerText || '').trim();
    if (text.length >= 10 && !isGenerating && models && cfgs && processors) {
      generateSpeech();
    }
  };

  languageItems.forEach((item) => {
    item.addEventListener('click', async () => {
      // Language changes are blocked during model loading and generation.
      if (modelsLoading || isGenerating) {
        return;
      }

      const selectedLanguage = item.dataset.language;

      // Clicking the current language only re-generates.
      if (selectedLanguage === currentLanguage) {
        generateIfReady();
        return;
      }

      currentLanguage = selectedLanguage;
      window.updateActiveLanguage(currentLanguage);

      // With a non-freeform preset active, show that preset's text in the new language.
      const preset = currentPreset && currentPreset !== 'freeform'
        ? presetTexts[currentPreset]
        : null;
      if (preset && typeof preset === 'object' && preset[currentLanguage]) {
        isPresetChanging = true;
        demoTextInput.textContent = preset[currentLanguage];
        updateCharCounter();
        isPresetChanging = false;
      }

      // Wait 100ms so the UI can update before generation starts.
      await new Promise((resolve) => setTimeout(resolve, 100));
      generateIfReady();
    });
  });
}
// Title animation setup
const demoTitleLeft = document.querySelector('.demo-title-left');
const demoTitleRight = document.querySelector('.demo-title-right');
const demoOutputSection = document.querySelector('.demo-output-section');

// Wrap every letter of the left title in <span class="letter visible"> so the
// letters can be shown and hidden individually; spaces stay plain text.
if (demoTitleLeft) {
  const titleText = demoTitleLeft.textContent.trim();
  const letterNodes = document.createDocumentFragment();

  // Iterating the string walks code points, so characters outside the BMP are
  // kept whole instead of being split into two surrogate halves.
  for (const char of titleText) {
    if (char === ' ') {
      letterNodes.appendChild(document.createTextNode(char));
    } else {
      // Built through the DOM rather than an HTML string so characters such as
      // '&' or '<' in the title are inserted as text, never parsed as markup.
      const letter = document.createElement('span');
      letter.className = 'letter visible';
      letter.textContent = char;
      letterNodes.appendChild(letter);
    }
  }

  demoTitleLeft.textContent = '';
  demoTitleLeft.appendChild(letterNodes);
}
// Text animation on demo-input-section click: hide every title letter, then
// reveal them one at a time, 62.5ms apart.
if (demoInputSection && demoTitleLeft) {
  const LETTER_DELAY_MS = 62.5;
  // Reveal timers of the animation currently running, kept so a new click can
  // cancel them before starting over.
  let pendingLetterTimers = [];

  demoInputSection.addEventListener('click', () => {
    // Without this, timers left over from an earlier click keep firing and
    // reveal letters out of order while the new animation is running.
    pendingLetterTimers.forEach((timerId) => clearTimeout(timerId));

    const letters = demoTitleLeft.querySelectorAll('.letter');

    // Reset all letters.
    letters.forEach((letter) => {
      letter.classList.remove('visible');
    });

    // Reveal letter N after N * 62.5ms.
    pendingLetterTimers = Array.from(letters, (letter, index) =>
      setTimeout(() => {
        letter.classList.add('visible');
      }, index * LETTER_DELAY_MS)
    );
  });
}
// Clicking the output section replays the `animate-speech` animation on the
// right title. Clicks on (or inside) the generate button are left alone.
if (demoOutputSection && demoTitleRight) {
  const replaySpeechAnimation = (event) => {
    const cameFromGenerateButton = event.target.closest('#demoGenerateBtn');
    if (!cameFromGenerateButton) {
      demoTitleRight.classList.remove('animate-speech');
      // Reading offsetWidth forces a reflow, so re-adding the class restarts the animation.
      void demoTitleRight.offsetWidth;
      demoTitleRight.classList.add('animate-speech');
    }
  };
  demoOutputSection.addEventListener('click', replaySpeechAnimation);
}

// Start loading the models.
initializeModels();
| })(); | |
| // v3 cache-bust | |