// index.js

/** Maximum number of characters accepted in the text box. */
const MAX_TEXT_LENGTH = 500;

/** Sample rate (Hz) used when the pipeline output does not report one. */
const FALLBACK_SAMPLE_RATE = 24000;

/** How long a non-loading status message stays visible, in milliseconds. */
const STATUS_HIDE_DELAY_MS = 5000;

/** Model identifier handed to `window.pipeline`. */
const TTS_MODEL_ID = 'onnx-community/Supertonic-TTS-ONNX';

/**
 * Browser UI controller for the Supertonic text-to-speech demo.
 *
 * Wires the page's form controls to a text-to-speech pipeline obtained from
 * `window.pipeline`, converts the generated samples to a 16-bit mono WAV and
 * exposes play / download / reset actions.
 */
class SupertonicTTS {
  /** Looks up the DOM elements, binds the event handlers and probes WebGPU. */
  constructor() {
    this.tts = null;
    this.currentAudio = null;
    this.isGenerating = false;
    this.device = 'cpu';
    // Object URL currently assigned to the <audio> element (revoked on replace).
    this.audioUrl = null;
    // Handle of the pending "hide status" timer, if any.
    this.statusTimer = null;
    // In-flight model load, shared by concurrent initTTS() callers.
    this.loadPromise = null;
    this.voices = {
      'F1': 'https://huggingface.co/datasets/Supertonic/Supertonic-TTS-ONNX/resolve/main/voices/F1.bin',
      'F2': 'https://huggingface.co/datasets/Supertonic/Supertonic-TTS-ONNX/resolve/main/voices/F2.bin',
      'M1': 'https://huggingface.co/datasets/Supertonic/Supertonic-TTS-ONNX/resolve/main/voices/M1.bin',
      'M2': 'https://huggingface.co/datasets/Supertonic/Supertonic-TTS-ONNX/resolve/main/voices/M2.bin'
    };
    this.initElements();
    this.bindEvents();
    this.checkWebGPU();
  }

  /** Caches references to every DOM element the controller touches. */
  initElements() {
    this.textInput = document.getElementById('textInput');
    this.voiceSelect = document.getElementById('voiceSelect');
    this.generateBtn = document.getElementById('generateBtn');
    this.playBtn = document.getElementById('playBtn');
    this.downloadBtn = document.getElementById('downloadBtn');
    this.regenerateBtn = document.getElementById('regenerateBtn');
    this.audioPlayer = document.getElementById('audioPlayer');
    this.deviceToggle = document.getElementById('deviceToggle');
    this.deviceText = document.getElementById('deviceText');
    this.status = document.getElementById('status');
    this.audioSection = document.getElementById('audioSection');
    this.charCount = document.getElementById('charCount');
  }

  /** Attaches the UI event listeners. */
  bindEvents() {
    this.textInput.addEventListener('input', () => this.updateCharCount());
    this.generateBtn.addEventListener('click', () => this.generateSpeech());
    this.playBtn.addEventListener('click', () => this.playAudio());
    this.downloadBtn.addEventListener('click', () => this.downloadAudio());
    this.regenerateBtn.addEventListener('click', () => this.regenerate());
    this.deviceToggle.addEventListener('change', (e) => this.toggleDevice(e.target.checked));
    this.audioPlayer.addEventListener('ended', () => this.updatePlayButton(true));
  }

  /**
   * Enables the GPU toggle only when a WebGPU adapter can actually be obtained.
   *
   * `navigator.gpu` may exist on machines where no adapter is available, so the
   * adapter is requested instead of trusting the property alone. Never rejects.
   *
   * @returns {Promise<void>}
   */
  async checkWebGPU() {
    let adapter = null;
    if (navigator.gpu) {
      try {
        adapter = await navigator.gpu.requestAdapter();
      } catch (error) {
        console.warn('WebGPU adapter request failed:', error);
      }
    }

    if (adapter) {
      this.deviceText.textContent = 'GPU Available';
      this.deviceToggle.disabled = false;
    } else {
      this.deviceText.textContent = 'GPU Not Supported';
    }
  }

  /**
   * Switches between CPU and WebGPU execution.
   *
   * @param {boolean} enabled - `true` selects WebGPU, `false` selects CPU.
   */
  toggleDevice(enabled) {
    this.device = enabled ? 'webgpu' : 'cpu';
    this.deviceText.textContent = enabled ? 'GPU Mode' : 'CPU Mode';
    if (this.tts) {
      // The loaded pipeline is bound to the previous device; drop it so the
      // next generation reloads the model on the newly selected one.
      this.tts = null;
      this.showStatus('Device changed. Please regenerate audio.', 'info');
    }
  }

  /** Refreshes the character counter and the Generate button's enabled state. */
  updateCharCount() {
    const length = this.textInput.value.length;
    this.charCount.textContent = `${length}/${MAX_TEXT_LENGTH}`;
    this.generateBtn.disabled =
      !this.textInput.value.trim() || length > MAX_TEXT_LENGTH || this.isGenerating;
  }

  /**
   * Loads the TTS pipeline into `this.tts`.
   *
   * Concurrent callers share a single in-flight load instead of downloading
   * the model twice. Load failures are reported through the status area and
   * leave `this.tts` unset; the returned promise does not reject for them.
   *
   * @returns {Promise<void>}
   */
  async initTTS() {
    if (this.loadPromise) {
      return this.loadPromise;
    }

    const pending = this.loadModel();
    this.loadPromise = pending;
    try {
      await pending;
    } finally {
      if (this.loadPromise === pending) {
        this.loadPromise = null;
      }
    }
  }

  /**
   * Performs the actual model load: first with fp16 on the selected device,
   * then, if that fails, with fp32 and no explicit device.
   *
   * @returns {Promise<void>}
   */
  async loadModel() {
    try {
      this.showStatus('Loading Supertonic TTS model...', 'loading');
      const options = {
        dtype: 'fp16' // Changed to fp16 for better compatibility
      };
      if (this.device === 'webgpu') {
        options.device = 'webgpu';
      }
      this.tts = await window.pipeline('text-to-speech', TTS_MODEL_ID, options);
      this.showStatus('Model loaded successfully! Ready to generate speech.', 'success');
      this.generateBtn.disabled = false;
    } catch (error) {
      console.error('Failed to load TTS model:', error);
      this.showStatus(`Failed to load model: ${error.message}. Retrying in CPU mode...`, 'error');
      // Fallback to CPU with fp32
      try {
        this.tts = await window.pipeline('text-to-speech', TTS_MODEL_ID, { dtype: 'fp32' });
        this.showStatus('Model loaded in CPU mode!', 'success');
        this.generateBtn.disabled = false;
      } catch (fallbackError) {
        console.error('CPU fallback load failed:', fallbackError);
        this.showStatus('Failed to load model. Please refresh and try again.', 'error');
      }
    }
  }

  /**
   * Downloads a speaker embedding and returns it as 32-bit floats.
   *
   * @param {string} url - Location of the raw embedding file.
   * @returns {Promise<Float32Array>} The decoded embedding.
   * @throws {Error} When the HTTP request does not succeed.
   */
  async loadSpeakerEmbedding(url) {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Failed to fetch speaker embedding (HTTP ${response.status})`);
    }

    const raw = await response.arrayBuffer();
    const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT;
    if (raw.byteLength % bytesPerFloat === 0) {
      return new Float32Array(raw);
    }

    // A Float32Array view requires a byte length that is a multiple of 4;
    // zero-pad the tail rather than letting the constructor throw.
    const padded = new Uint8Array(Math.ceil(raw.byteLength / bytesPerFloat) * bytesPerFloat);
    padded.set(new Uint8Array(raw));
    return new Float32Array(padded.buffer);
  }

  /**
   * Synthesizes the text currently in the input box with the selected voice
   * and loads the result into the audio player.
   *
   * If the model is not loaded yet it is loaded first and generation then
   * proceeds, so the click that triggered the load is not lost.
   *
   * @returns {Promise<void>}
   */
  async generateSpeech() {
    if (this.isGenerating) {
      return;
    }
    if (!this.tts) {
      await this.initTTS();
      // Loading may have failed, or another click may have started meanwhile.
      if (!this.tts || this.isGenerating) {
        return;
      }
    }

    const text = this.textInput.value.trim();
    if (!text) {
      this.showStatus('Please enter some text to convert to speech.', 'error');
      return;
    }

    this.isGenerating = true;
    this.generateBtn.disabled = true;
    this.setButtonLoading(true);

    try {
      this.showStatus('Generating speech...', 'loading');
      const voice = this.voiceSelect.value;
      const speakerEmbeddingUrl = this.voices[voice];
      if (!speakerEmbeddingUrl) {
        throw new Error(`Unknown voice: ${voice}`);
      }

      const speakerEmbedding = await this.loadSpeakerEmbedding(speakerEmbeddingUrl);
      const audio = await this.tts(text, { speaker_embeddings: speakerEmbedding });
      this.currentAudio = audio;

      // Create WAV blob from the generated audio
      const audioBlob = await this.tensorToWavBlob(audio);
      const previousUrl = this.audioUrl;
      this.audioUrl = URL.createObjectURL(audioBlob);
      this.audioPlayer.src = this.audioUrl;
      if (previousUrl) {
        // Release the Blob of the previous generation once the player has moved on.
        URL.revokeObjectURL(previousUrl);
      }

      this.audioSection.classList.remove('hidden');
      this.downloadBtn.disabled = false;
      this.updatePlayButton(true);
      this.showStatus('Speech generated successfully! Click play to listen.', 'success');
    } catch (error) {
      console.error('TTS generation failed:', error);
      this.showStatus(`Generation failed: ${error.message}`, 'error');
    } finally {
      this.isGenerating = false;
      this.setButtonLoading(false);
      this.generateBtn.disabled = false;
    }
  }

  /**
   * Pulls the raw samples and sample rate out of a pipeline result.
   *
   * Accepts a bare `Float32Array`, an object carrying the samples in `audio`
   * or `data` (with an optional `sampling_rate`), or - as a legacy fallback -
   * an object whose `save()` resolves to a typed array of float samples.
   *
   * @param {object} audioOutput - Value returned by the TTS pipeline.
   * @returns {Promise<{samples: Float32Array, sampleRate: number}>}
   * @throws {Error} When no sample data can be found.
   */
  async extractSamples(audioOutput) {
    if (audioOutput instanceof Float32Array) {
      return { samples: audioOutput, sampleRate: FALLBACK_SAMPLE_RATE };
    }
    if (!audioOutput) {
      throw new Error('No audio data to convert.');
    }

    const reportedRate = Number(audioOutput.sampling_rate);
    const sampleRate =
      Number.isFinite(reportedRate) && reportedRate > 0
        ? Math.round(reportedRate)
        : FALLBACK_SAMPLE_RATE;

    const direct = [audioOutput.audio, audioOutput.data].find(
      (candidate) => candidate instanceof Float32Array
    );
    if (direct) {
      return { samples: direct, sampleRate };
    }

    if (typeof audioOutput.save === 'function') {
      const saved = await audioOutput.save();
      if (saved && saved.buffer instanceof ArrayBuffer) {
        return { samples: new Float32Array(saved.buffer), sampleRate };
      }
    }

    throw new Error('Unsupported audio output: no sample data found.');
  }

  /**
   * Encodes a pipeline result as a 16-bit mono PCM WAV blob.
   *
   * @param {object} audioTensor - Value returned by the TTS pipeline.
   * @returns {Promise<Blob>} Blob of type `audio/wav`.
   * @throws {Error} When the result carries no usable sample data.
   */
  async tensorToWavBlob(audioTensor) {
    const { samples, sampleRate } = await this.extractSamples(audioTensor);

    // Clamp to [-1, 1] and scale to the signed 16-bit range.
    const peak = 32767;
    const pcm = new Int16Array(samples.length);
    for (let i = 0; i < samples.length; i++) {
      const clamped = Math.max(-1, Math.min(1, samples[i]));
      pcm[i] = clamped * peak;
    }

    const wavBuffer = this.createWavBuffer(pcm, sampleRate);
    return new Blob([wavBuffer], { type: 'audio/wav' });
  }

  /**
   * Builds a canonical 44-byte-header WAV file (PCM, mono, 16-bit).
   *
   * @param {Int16Array} audioData - PCM samples.
   * @param {number} sampleRate - Samples per second.
   * @returns {ArrayBuffer} The complete WAV file.
   */
  createWavBuffer(audioData, sampleRate) {
    const headerSize = 44;
    const bytesPerSample = 2;
    const dataSize = audioData.length * bytesPerSample;
    const buffer = new ArrayBuffer(headerSize + dataSize);
    const view = new DataView(buffer);

    const writeString = (offset, string) => {
      for (let i = 0; i < string.length; i++) {
        view.setUint8(offset + i, string.charCodeAt(i));
      }
    };

    writeString(0, 'RIFF');
    view.setUint32(4, 36 + dataSize, true); // RIFF chunk size
    writeString(8, 'WAVE');
    writeString(12, 'fmt ');
    view.setUint32(16, 16, true); // fmt chunk size
    view.setUint16(20, 1, true); // audio format: PCM
    view.setUint16(22, 1, true); // channels: mono
    view.setUint32(24, sampleRate, true);
    view.setUint32(28, sampleRate * bytesPerSample, true); // byte rate
    view.setUint16(32, bytesPerSample, true); // block align
    view.setUint16(34, 16, true); // bits per sample
    writeString(36, 'data');
    view.setUint32(40, dataSize, true);

    for (let i = 0; i < audioData.length; i++) {
      view.setInt16(headerSize + i * bytesPerSample, audioData[i], true);
    }
    return buffer;
  }

  /** Toggles playback of the generated audio. */
  playAudio() {
    if (!this.audioPlayer.paused) {
      this.audioPlayer.pause();
      this.updatePlayButton(true);
      return;
    }

    const playback = this.audioPlayer.play();
    this.updatePlayButton(false);
    // play() returns a promise in current browsers; it rejects when playback
    // is blocked or the source cannot be decoded.
    if (playback && typeof playback.catch === 'function') {
      playback.catch((error) => {
        console.error('Audio playback failed:', error);
        this.updatePlayButton(true);
        this.showStatus(`Playback failed: ${error.message}`, 'error');
      });
    }
  }

  /**
   * Updates the play button to reflect the playback state.
   *
   * @param {boolean} isPaused - `true` when audio is not playing.
   */
  updatePlayButton(isPaused) {
    const icon = this.playBtn.querySelector('svg');
    // NOTE(review): both states write empty markup into the icon; the SVG
    // path strings appear to be missing from this file - confirm.
    if (icon) {
      icon.innerHTML = '';
    }
    this.playBtn.title = isPaused ? 'Play audio' : 'Pause audio';
  }

  /**
   * Offers the most recently generated audio as a WAV download.
   *
   * @returns {Promise<void>}
   */
  async downloadAudio() {
    if (!this.currentAudio) {
      return;
    }

    const audioBlob = await this.tensorToWavBlob(this.currentAudio);
    const url = URL.createObjectURL(audioBlob);
    const a = document.createElement('a');
    a.href = url;
    a.download = 'supertonic-speech.wav';
    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);
    URL.revokeObjectURL(url);
  }

  /** Clears the text, hides the audio section and discards the current audio. */
  regenerate() {
    this.textInput.value = '';
    this.audioSection.classList.add('hidden');
    this.downloadBtn.disabled = true;
    this.currentAudio = null;
    this.audioPlayer.src = '';
    if (this.audioUrl) {
      URL.revokeObjectURL(this.audioUrl);
      this.audioUrl = null;
    }
    this.updateCharCount();
  }

  /**
   * Switches the Generate button between its idle and busy appearance.
   *
   * @param {boolean} loading - `true` while speech is being generated.
   */
  setButtonLoading(loading) {
    const spinner = this.generateBtn.querySelector('.spinner');
    const btnText = this.generateBtn.querySelector('.btn-text');
    spinner.style.display = loading ? 'inline-block' : 'none';
    btnText.textContent = loading ? 'Generating...' : 'Generate Speech';
  }

  /**
   * Shows a status message. Every type except `'loading'` hides itself again
   * after a delay.
   *
   * @param {string} message - Text to display.
   * @param {string} type - CSS modifier appended to the `status` class.
   */
  showStatus(message, type) {
    // Cancel the previous auto-hide so it cannot hide this newer message.
    if (this.statusTimer) {
      clearTimeout(this.statusTimer);
      this.statusTimer = null;
    }

    this.status.textContent = message;
    this.status.className = `status ${type}`;
    this.status.classList.remove('hidden');

    if (type !== 'loading') {
      this.statusTimer = setTimeout(() => {
        this.statusTimer = null;
        this.status.classList.add('hidden');
      }, STATUS_HIDE_DELAY_MS);
    }
  }
}

// Initialize the app once the DOM is ready. When the script is executed after
// DOMContentLoaded has already fired (defer/async/module loading), start right away.
const bootSupertonicApp = () => {
  const app = new SupertonicTTS();
  // Auto-initialize TTS model
  setTimeout(() => app.initTTS(), 100);
};

if (typeof document !== 'undefined') {
  if (document.readyState === 'loading') {
    document.addEventListener('DOMContentLoaded', bootSupertonicApp);
  } else {
    bootSupertonicApp();
  }
}