// index.js
/**
 * Browser UI controller for the Supertonic text-to-speech demo.
 *
 * Looks up its DOM elements by id, loads a text-to-speech pipeline through
 * `window.pipeline`, turns the text box contents into audio, and exposes
 * play / download / reset controls for the result.
 */
class SupertonicTTS {
  /**
   * Sets up state, resolves DOM elements, attaches listeners and probes for
   * WebGPU. The model itself is loaded later via {@link SupertonicTTS#initTTS}.
   */
  constructor() {
    this.tts = null;
    this.currentAudio = null;
    this.isGenerating = false;
    this.device = 'cpu';
    // Promise of the model load currently in flight (null when idle).
    this.modelLoading = null;
    // Object URL currently assigned to the audio player (null when none).
    this.audioUrl = null;
    // Handle of the pending "hide status" timer (null when none).
    this.statusTimer = null;
    this.voices = {
      'F1': 'https://huggingface.co/datasets/Supertonic/Supertonic-TTS-ONNX/resolve/main/voices/F1.bin',
      'F2': 'https://huggingface.co/datasets/Supertonic/Supertonic-TTS-ONNX/resolve/main/voices/F2.bin',
      'M1': 'https://huggingface.co/datasets/Supertonic/Supertonic-TTS-ONNX/resolve/main/voices/M1.bin',
      'M2': 'https://huggingface.co/datasets/Supertonic/Supertonic-TTS-ONNX/resolve/main/voices/M2.bin'
    };
    this.initElements();
    this.bindEvents();
    this.checkWebGPU();
  }

  /** Caches references to every DOM element the controller uses. */
  initElements() {
    this.textInput = document.getElementById('textInput');
    this.voiceSelect = document.getElementById('voiceSelect');
    this.generateBtn = document.getElementById('generateBtn');
    this.playBtn = document.getElementById('playBtn');
    this.downloadBtn = document.getElementById('downloadBtn');
    this.regenerateBtn = document.getElementById('regenerateBtn');
    this.audioPlayer = document.getElementById('audioPlayer');
    this.deviceToggle = document.getElementById('deviceToggle');
    this.deviceText = document.getElementById('deviceText');
    this.status = document.getElementById('status');
    this.audioSection = document.getElementById('audioSection');
    this.charCount = document.getElementById('charCount');
  }

  /** Wires UI events to the corresponding controller methods. */
  bindEvents() {
    this.textInput.addEventListener('input', () => this.updateCharCount());
    this.generateBtn.addEventListener('click', () => this.generateSpeech());
    this.playBtn.addEventListener('click', () => this.playAudio());
    this.downloadBtn.addEventListener('click', () => this.downloadAudio());
    this.regenerateBtn.addEventListener('click', () => this.regenerate());
    this.deviceToggle.addEventListener('change', (e) => this.toggleDevice(e.target.checked));
    this.audioPlayer.addEventListener('ended', () => this.updatePlayButton(true));
  }

  /**
   * Reflects WebGPU availability (presence of `navigator.gpu`) in the UI and
   * enables the device toggle when it is available.
   * @returns {Promise<void>}
   */
  async checkWebGPU() {
    if (navigator.gpu) {
      this.deviceText.textContent = 'GPU Available';
      this.deviceToggle.disabled = false;
    } else {
      this.deviceText.textContent = 'GPU Not Supported';
    }
  }

  /**
   * Switches the target device between WebGPU and CPU.
   *
   * A pipeline that was already loaded is dropped so the next generation
   * reloads the model for the newly selected device; otherwise the toggle
   * would have no effect on an existing pipeline.
   * @param {boolean} enabled - true for WebGPU, false for CPU.
   */
  toggleDevice(enabled) {
    this.device = enabled ? 'webgpu' : 'cpu';
    this.deviceText.textContent = enabled ? 'GPU Mode' : 'CPU Mode';
    if (this.tts) {
      this.tts = null;
      this.showStatus('Device changed. Please regenerate audio.', 'info');
    }
  }

  /** Updates the character counter and the enabled state of the generate button. */
  updateCharCount() {
    const maxLength = 500;
    const length = this.textInput.value.length;
    this.charCount.textContent = `${length}/${maxLength}`;
    this.generateBtn.disabled = !this.textInput.value.trim() || length > maxLength || this.isGenerating;
  }

  /**
   * Loads the TTS pipeline into `this.tts`.
   *
   * Concurrent calls share one in-flight load instead of downloading the model
   * twice. Never rejects on a load failure: failures are reported through the
   * status line and leave `this.tts` unset.
   * @returns {Promise<void>}
   */
  async initTTS() {
    if (!this.modelLoading) {
      this.modelLoading = this.loadModel();
    }
    const pending = this.modelLoading;
    try {
      await pending;
    } finally {
      if (this.modelLoading === pending) {
        this.modelLoading = null;
      }
    }
  }

  /**
   * Performs the actual model load: first with fp16 (on WebGPU when selected),
   * then, if that throws, once more with fp32 and no explicit device.
   * @returns {Promise<void>}
   */
  async loadModel() {
    try {
      this.showStatus('Loading Supertonic TTS model...', 'loading');
      const options = {
        dtype: 'fp16'
      };
      if (this.device === 'webgpu') {
        options.device = 'webgpu';
      }
      this.tts = await window.pipeline('text-to-speech', 'onnx-community/Supertonic-TTS-ONNX', options);
      this.showStatus('Model loaded successfully! Ready to generate speech.', 'success');
      this.generateBtn.disabled = false;
    } catch (error) {
      console.error('Failed to load TTS model:', error);
      this.showStatus(`Failed to load model: ${error.message}. Retrying in CPU mode...`, 'error');
      // Fallback: fp32 with the pipeline's default device.
      try {
        this.tts = await window.pipeline('text-to-speech', 'onnx-community/Supertonic-TTS-ONNX', { dtype: 'fp32' });
        this.showStatus('Model loaded in CPU mode!', 'success');
        this.generateBtn.disabled = false;
      } catch (fallbackError) {
        console.error('Fallback model load failed:', fallbackError);
        this.showStatus('Failed to load model. Please refresh and try again.', 'error');
      }
    }
  }

  /**
   * Converts the text box contents to speech and loads the result into the
   * audio player.
   *
   * If the model is not loaded yet it is loaded first and generation then
   * continues in the same call. Re-entrant calls while a generation is running
   * are ignored. Errors are reported through the status line, not thrown.
   * @returns {Promise<void>}
   */
  async generateSpeech() {
    if (this.isGenerating) {
      return;
    }
    const text = this.textInput.value.trim();
    if (!text) {
      this.showStatus('Please enter some text to convert to speech.', 'error');
      return;
    }
    this.isGenerating = true;
    this.generateBtn.disabled = true;
    this.setButtonLoading(true);
    try {
      if (!this.tts) {
        await this.initTTS();
      }
      // Keep a local reference so a device toggle mid-generation cannot null it out.
      const tts = this.tts;
      if (!tts) {
        // The load failed; initTTS has already told the user.
        return;
      }
      // A successful model load re-enables the button; keep it off while generating.
      this.generateBtn.disabled = true;
      this.showStatus('Generating speech...', 'loading');
      const voice = this.voiceSelect.value;
      const speakerEmbeddingUrl = this.voices[voice];
      if (typeof speakerEmbeddingUrl !== 'string') {
        throw new Error(`Unknown voice: ${voice}`);
      }
      const speakerEmbedding = await this.loadSpeakerEmbedding(speakerEmbeddingUrl);
      const audio = await tts(text, {
        speaker_embeddings: speakerEmbedding
      });
      const audioBlob = await this.tensorToWavBlob(audio);
      this.currentAudio = audio;
      // Swap in the new object URL, then release the one it replaces.
      const previousUrl = this.audioUrl;
      this.audioUrl = URL.createObjectURL(audioBlob);
      this.audioPlayer.src = this.audioUrl;
      if (previousUrl) {
        URL.revokeObjectURL(previousUrl);
      }
      this.audioSection.classList.remove('hidden');
      this.downloadBtn.disabled = false;
      this.updatePlayButton(true);
      this.showStatus('Speech generated successfully! Click play to listen.', 'success');
    } catch (error) {
      console.error('TTS generation failed:', error);
      this.showStatus(`Generation failed: ${error.message}`, 'error');
    } finally {
      this.isGenerating = false;
      this.setButtonLoading(false);
      this.generateBtn.disabled = false;
    }
  }

  /**
   * Downloads a voice embedding and returns it as float32 values.
   *
   * The element count is zero-padded up to a multiple of 4, which is what the
   * previous inline code attempted (it failed because it reassigned a `const`).
   * NOTE(review): confirm the model really wants this padding; it changes the
   * embedding length whenever the file is not already a multiple of 4 floats.
   * @param {string} url - Location of the raw float32 embedding file.
   * @returns {Promise<Float32Array>}
   * @throws {Error} When the download fails or the file size is not a whole
   *   number of 4-byte floats.
   */
  async loadSpeakerEmbedding(url) {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Could not download voice embedding (HTTP ${response.status})`);
    }
    const bytes = await response.arrayBuffer();
    if (bytes.byteLength % Float32Array.BYTES_PER_ELEMENT !== 0) {
      throw new Error('Voice embedding file is not a whole number of float32 values');
    }
    const values = new Float32Array(bytes);
    if (values.length % 4 === 0) {
      return values;
    }
    const padded = new Float32Array(Math.ceil(values.length / 4) * 4);
    padded.set(values);
    return padded;
  }

  /**
   * Encodes pipeline output as a 16-bit mono PCM WAV blob.
   *
   * Two input shapes are accepted:
   *  - an object with a Float32Array `audio` property and an optional numeric
   *    `sampling_rate` (presumably what the transformers.js pipeline returns —
   *    TODO confirm against the library version in use);
   *  - otherwise, an object whose `save()` resolves to a typed array, encoded
   *    at 24000 Hz as before.
   * @param {object} audioTensor - Result returned by the TTS pipeline.
   * @returns {Promise<Blob>} Blob with MIME type `audio/wav`.
   */
  async tensorToWavBlob(audioTensor) {
    const fallbackSampleRate = 24000;
    let samples;
    let sampleRate = fallbackSampleRate;
    if (audioTensor && audioTensor.audio instanceof Float32Array) {
      samples = audioTensor.audio;
      if (Number.isFinite(audioTensor.sampling_rate) && audioTensor.sampling_rate > 0) {
        sampleRate = audioTensor.sampling_rate;
      }
    } else {
      const audioData = await audioTensor.save();
      samples = new Float32Array(audioData.buffer);
    }
    // Clamp to [-1, 1] and scale to the signed 16-bit range.
    const maxSample = 32767;
    const pcm = new Int16Array(samples.length);
    for (let i = 0; i < samples.length; i++) {
      const clamped = Math.max(-1, Math.min(1, samples[i]));
      pcm[i] = clamped * maxSample;
    }
    const wavBuffer = this.createWavBuffer(pcm, sampleRate);
    return new Blob([wavBuffer], { type: 'audio/wav' });
  }

  /**
   * Builds a WAV file (44-byte header + little-endian 16-bit mono PCM data).
   * @param {Int16Array} audioData - PCM samples.
   * @param {number} sampleRate - Samples per second written to the header.
   * @returns {ArrayBuffer} The complete WAV file contents.
   */
  createWavBuffer(audioData, sampleRate) {
    const headerSize = 44;
    const bytesPerSample = 2;
    const dataSize = audioData.length * bytesPerSample;
    const buffer = new ArrayBuffer(headerSize + dataSize);
    const view = new DataView(buffer);
    const writeString = (offset, string) => {
      for (let i = 0; i < string.length; i++) {
        view.setUint8(offset + i, string.charCodeAt(i));
      }
    };
    writeString(0, 'RIFF');
    view.setUint32(4, 36 + dataSize, true); // RIFF chunk size = file size - 8
    writeString(8, 'WAVE');
    writeString(12, 'fmt ');
    view.setUint32(16, 16, true); // fmt chunk size
    view.setUint16(20, 1, true); // audio format: PCM
    view.setUint16(22, 1, true); // channels: mono
    view.setUint32(24, sampleRate, true);
    view.setUint32(28, sampleRate * bytesPerSample, true); // byte rate
    view.setUint16(32, bytesPerSample, true); // block align
    view.setUint16(34, 16, true); // bits per sample
    writeString(36, 'data');
    view.setUint32(40, dataSize, true);
    for (let i = 0; i < audioData.length; i++) {
      view.setInt16(headerSize + i * bytesPerSample, audioData[i], true);
    }
    return buffer;
  }

  /**
   * Toggles playback of the audio element and keeps the play button in sync,
   * including when the browser refuses to start playback.
   */
  playAudio() {
    if (!this.audioPlayer.paused) {
      this.audioPlayer.pause();
      this.updatePlayButton(true);
      return;
    }
    const playback = this.audioPlayer.play();
    this.updatePlayButton(false);
    // play() returns a promise in current browsers; it rejects when playback cannot start.
    if (playback && typeof playback.catch === 'function') {
      playback.catch((error) => {
        // An AbortError means pause() interrupted play(); the pause branch already updated the UI.
        if (error && error.name === 'AbortError') {
          return;
        }
        console.error('Audio playback failed:', error);
        this.updatePlayButton(true);
        this.showStatus(`Playback failed: ${error.message}`, 'error');
      });
    }
  }

  /**
   * Updates the play button's icon and tooltip.
   * @param {boolean} isPaused - true to show the "play" state, false for "pause".
   */
  updatePlayButton(isPaused) {
    const icon = this.playBtn.querySelector('svg');
    // NOTE(review): both states clear the icon's contents; the play/pause
    // markup appears to be missing here and should be restored.
    if (icon) {
      icon.innerHTML = '';
    }
    this.playBtn.title = isPaused ? 'Play audio' : 'Pause audio';
  }

  /**
   * Offers the most recently generated audio as a `supertonic-speech.wav`
   * download. Does nothing when no audio has been generated.
   * @returns {Promise<void>}
   */
  async downloadAudio() {
    if (!this.currentAudio) {
      return;
    }
    const audioBlob = await this.tensorToWavBlob(this.currentAudio);
    const url = URL.createObjectURL(audioBlob);
    const a = document.createElement('a');
    a.href = url;
    a.download = 'supertonic-speech.wav';
    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);
    URL.revokeObjectURL(url);
  }

  /** Clears the text box and the generated audio, returning the UI to its initial state. */
  regenerate() {
    this.textInput.value = '';
    this.audioSection.classList.add('hidden');
    this.downloadBtn.disabled = true;
    this.currentAudio = null;
    this.audioPlayer.src = '';
    this.releaseAudioUrl();
    this.updateCharCount();
  }

  /** Revokes the object URL held for the audio player, if any. */
  releaseAudioUrl() {
    if (this.audioUrl) {
      URL.revokeObjectURL(this.audioUrl);
      this.audioUrl = null;
    }
  }

  /**
   * Shows or hides the spinner inside the generate button and swaps its label.
   * @param {boolean} loading - true while a generation is running.
   */
  setButtonLoading(loading) {
    const spinner = this.generateBtn.querySelector('.spinner');
    const btnText = this.generateBtn.querySelector('.btn-text');
    if (loading) {
      spinner.style.display = 'inline-block';
      btnText.textContent = 'Generating...';
    } else {
      spinner.style.display = 'none';
      btnText.textContent = 'Generate Speech';
    }
  }

  /**
   * Displays a status message. Every type except 'loading' hides itself after
   * five seconds; a pending hide from an earlier message is cancelled first so
   * it cannot hide the new one.
   * @param {string} message - Text to show.
   * @param {string} type - CSS modifier class, e.g. 'loading', 'success', 'error', 'info'.
   */
  showStatus(message, type) {
    clearTimeout(this.statusTimer);
    this.statusTimer = null;
    this.status.textContent = message;
    this.status.className = `status ${type}`;
    this.status.classList.remove('hidden');
    if (type !== 'loading') {
      this.statusTimer = setTimeout(() => {
        this.statusTimer = null;
        this.status.classList.add('hidden');
      }, 5000);
    }
  }
}
// Bootstrap: build the controller once the document has been parsed, then
// start loading the TTS model 100 ms later.
document.addEventListener('DOMContentLoaded', function onDomReady() {
  const ttsApp = new SupertonicTTS();
  const startModelLoad = () => ttsApp.initTTS();
  setTimeout(startModelLoad, 100);
});