chatterbox / app.py
abc1181's picture
Update app.py
535dc07 verified
from fastapi import FastAPI, Request, UploadFile, File, Form
from fastapi.responses import StreamingResponse, JSONResponse, HTMLResponse, FileResponse
import torch
import torchaudio
import io
import uvicorn
import os
import base64
import json
import subprocess
import tempfile
import shutil
import httpx
from pathlib import Path
from chatterbox.tts import ChatterboxTTS
app = FastAPI()

# ============================================================
# STARTUP
# ============================================================
print("Loading Chatterbox Multilingual...")

# Run on the GPU when torch reports CUDA support, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Log in to the Hugging Face Hub when HF_TOKEN is set (the original author
# notes this is for faster downloads).
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    from huggingface_hub import login

    login(token=hf_token)

# Load the TTS model once at import time; it is shared through `model`.
model = ChatterboxTTS.from_pretrained(device=device)
print(f"✅ Chatterbox loaded on {device}")

# Default reference sample and on-disk voice library locations.
VOICE_SAMPLE_PATH = "voice_sample.wav"
VOICES_DIR = Path("voices")
VOICES_DIR.mkdir(exist_ok=True)
VOICES_META = VOICES_DIR / "meta.json"

# Configuration read from the environment; each is None when unset.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_REPO_ID = os.getenv("HF_REPO_ID")  # e.g. abc1181/livekit-tts-chatterbox
GROQ_KEY = os.getenv("GROQ_API_KEY")

# Path of the default voice sample, or None when the file is not present.
voice_sample = None
if os.path.exists(VOICE_SAMPLE_PATH):
    voice_sample = VOICE_SAMPLE_PATH
def has_devanagari(text: str) -> bool:
    """Return True if *text* contains a character in U+0900..U+097F.

    An empty string yields False.
    """
    for ch in text:
        if '\u0900' <= ch <= '\u097F':
            return True
    return False
def get_language(text: str) -> str:
    """Return the language code for *text*.

    The result is "hi" when has_devanagari(text) is true and "en" for
    everything else.
    """
    if has_devanagari(text):
        return "hi"
    return "en"
def load_voices_meta() -> dict:
    """Return the parsed JSON stored in VOICES_META.

    When the metadata file does not exist, an empty dict is returned.
    """
    if not VOICES_META.exists():
        return {}
    raw = VOICES_META.read_text()
    return json.loads(raw)
def save_voices_meta(meta: dict):
    """Write *meta* to VOICES_META as JSON indented by two spaces.

    Any existing file content is overwritten.
    """
    serialized = json.dumps(meta, indent=2)
    VOICES_META.write_text(serialized)
def push_to_hf(local_path: str, repo_path: str):
    """Upload a local file to the configured HF Space repo under ``voices/``.

    Args:
        local_path: Path of the file on disk to upload.
        repo_path: Name of the file inside the repo's ``voices/`` folder.

    Returns:
        True when the upload call completed; False when HF_TOKEN or
        HF_REPO_ID is unset, or when the upload raised (the error is printed).
    """
    # Without both credentials there is nowhere to push to.
    if not (HF_TOKEN and HF_REPO_ID):
        return False

    try:
        # Imported inside the try so a missing package is reported like any
        # other push failure.
        from huggingface_hub import HfApi

        HfApi().upload_file(
            path_or_fileobj=local_path,
            path_in_repo=f"voices/{repo_path}",
            repo_id=HF_REPO_ID,
            repo_type="space",
            token=HF_TOKEN,
        )
    except Exception as e:
        # Best effort: a failed push is logged and reported, not raised.
        print(f"HF push failed: {e}")
        return False
    return True
def pull_from_hf():
    """Download voice samples that are missing locally from the HF Space repo.

    Lists the files of the repo named by HF_REPO_ID and downloads every
    ``voices/*.wav`` entry whose file name is not yet present in VOICES_DIR.

    Does nothing when HF_TOKEN or HF_REPO_ID is unset. Failures are printed,
    never raised. A failure to list the repo ends the pull, while a failure
    to download one file only skips that file, so the remaining voices are
    still fetched.
    """
    if not HF_TOKEN or not HF_REPO_ID:
        return

    try:
        # Imported inside the try so a missing package is reported like any
        # other pull failure.
        from huggingface_hub import HfApi

        api = HfApi()
        files = api.list_repo_files(
            repo_id=HF_REPO_ID, repo_type="space", token=HF_TOKEN
        )
    except Exception as e:
        print(f"HF pull failed: {e}")
        return

    for f in files:
        if not (f.startswith("voices/") and f.endswith(".wav")):
            continue

        dest = VOICES_DIR / Path(f).name
        if dest.exists():
            continue

        try:
            # NOTE(review): with local_dir="." the file is expected to land at
            # the repo-relative path (./voices/<name>), which matches `dest`
            # only for files directly under voices/ — confirm for nested paths.
            api.hf_hub_download(
                repo_id=HF_REPO_ID,
                filename=f,
                repo_type="space",
                token=HF_TOKEN,
                local_dir=".",
            )
        except Exception as e:
            # Isolate the failure to this file so the other voices still load.
            print(f"HF pull failed for {f}: {e}")
# Pull voices from the HF repo once, at import time.
# pull_from_hf() returns without doing anything when HF_TOKEN or HF_REPO_ID
# is unset, and prints (rather than raises) on failure.
pull_from_hf()
# ============================================================
# UI
# ============================================================
@app.get("/", response_class=HTMLResponse)
async def ui():
return HTMLResponse(content="""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<title>Cortana TTS Studio</title>
<style>
* { margin:0; padding:0; box-sizing:border-box; }
body {
background:#080808;
color:#fff;
font-family:'Inter',-apple-system,sans-serif;
min-height:100vh;
}
.sidebar {
position:fixed;
left:0; top:0; bottom:0;
width:220px;
background:#0d0d0d;
border-right:1px solid #1a1a1a;
display:flex;
flex-direction:column;
padding:24px 0;
z-index:100;
}
.logo {
padding:0 20px 24px;
border-bottom:1px solid #1a1a1a;
}
.logo h1 {
font-size:1.1rem;
font-weight:700;
letter-spacing:0.2em;
text-transform:uppercase;
background:linear-gradient(135deg,#c0c0c0,#fff);
-webkit-background-clip:text;
-webkit-text-fill-color:transparent;
}
.logo p { font-size:0.7rem; color:#444; margin-top:4px; }
.nav { padding:16px 12px; flex:1; }
.nav-item {
display:flex;
align-items:center;
gap:10px;
padding:10px 12px;
border-radius:8px;
cursor:pointer;
font-size:0.85rem;
color:#555;
transition:all 0.2s;
margin-bottom:2px;
}
.nav-item:hover { background:#151515; color:#888; }
.nav-item.active { background:#151515; color:#fff; }
.nav-icon { font-size:1rem; width:20px; text-align:center; }
.main {
margin-left:220px;
padding:32px;
min-height:100vh;
}
.page { display:none; }
.page.active { display:block; }
.page-title {
font-size:1.4rem;
font-weight:700;
margin-bottom:8px;
}
.page-sub {
color:#444;
font-size:0.85rem;
margin-bottom:28px;
}
.card {
background:#111;
border:1px solid #1a1a1a;
border-radius:14px;
padding:22px;
margin-bottom:20px;
}
.card-title {
font-size:0.7rem;
font-weight:600;
letter-spacing:0.15em;
text-transform:uppercase;
color:#444;
margin-bottom:16px;
display:flex;
align-items:center;
gap:8px;
}
.card-title::before {
content:'';
display:block;
width:3px; height:12px;
background:#c0c0c0;
border-radius:2px;
}
textarea {
width:100%;
background:#0a0a0a;
border:1px solid #1f1f1f;
border-radius:10px;
color:#fff;
font-size:0.95rem;
padding:14px;
resize:vertical;
min-height:130px;
outline:none;
font-family:inherit;
line-height:1.6;
transition:border-color 0.2s;
}
textarea:focus { border-color:#333; }
textarea::placeholder { color:#2a2a2a; }
input[type="text"] {
width:100%;
background:#0a0a0a;
border:1px solid #1f1f1f;
border-radius:8px;
color:#fff;
font-size:0.9rem;
padding:10px 14px;
outline:none;
font-family:inherit;
transition:border-color 0.2s;
}
input[type="text"]:focus { border-color:#333; }
input[type="text"]::placeholder { color:#2a2a2a; }
.grid2 { display:grid; grid-template-columns:1fr 1fr; gap:16px; }
.grid3 { display:grid; grid-template-columns:1fr 1fr 1fr; gap:16px; }
.control-label {
font-size:0.72rem;
color:#444;
text-transform:uppercase;
letter-spacing:0.1em;
margin-bottom:8px;
display:block;
}
.slider-row {
display:flex;
align-items:center;
gap:12px;
}
input[type="range"] {
flex:1;
-webkit-appearance:none;
height:4px;
background:#1f1f1f;
border-radius:2px;
outline:none;
}
input[type="range"]::-webkit-slider-thumb {
-webkit-appearance:none;
width:14px; height:14px;
border-radius:50%;
background:#c0c0c0;
cursor:pointer;
}
.slider-val { font-size:0.8rem; color:#666; width:32px; text-align:right; }
.btn {
background:#151515;
border:1px solid #222;
border-radius:8px;
color:#fff;
font-size:0.9rem;
font-weight:600;
padding:11px 20px;
cursor:pointer;
transition:all 0.2s;
display:inline-flex;
align-items:center;
gap:8px;
}
.btn:hover { border-color:#444; }
.btn:disabled { opacity:0.3; cursor:not-allowed; }
.btn-primary {
width:100%;
justify-content:center;
padding:13px;
margin-top:14px;
font-size:0.95rem;
}
.btn-primary:hover { box-shadow:0 0 20px rgba(192,192,192,0.08); }
.btn-sm { padding:7px 14px; font-size:0.8rem; }
.btn-danger { border-color:#ef444433; color:#ef4444; }
.btn-danger:hover { border-color:#ef4444; background:rgba(239,68,68,0.1); }
.spinner {
width:16px; height:16px;
border:2px solid #333;
border-top-color:#c0c0c0;
border-radius:50%;
animation:spin 0.8s linear infinite;
display:none;
}
.btn.loading .spinner { display:block; }
.btn.loading .btn-label { display:none; }
@keyframes spin { to { transform:rotate(360deg); } }
.audio-player {
display:none;
background:#0a0a0a;
border:1px solid #1a1a1a;
border-radius:10px;
padding:14px;
margin-top:14px;
}
.audio-player.visible { display:block; }
.audio-player audio { width:100%; filter:invert(0.9); }
.dl-btn {
display:inline-flex;
align-items:center;
gap:6px;
margin-top:10px;
background:#151515;
border:1px solid #222;
border-radius:7px;
color:#666;
font-size:0.78rem;
padding:7px 12px;
cursor:pointer;
text-decoration:none;
transition:all 0.2s;
}
.dl-btn:hover { color:#fff; border-color:#444; }
.lang-row {
display:flex;
align-items:center;
gap:10px;
margin-top:8px;
}
.lang-badge {
background:#151515;
border:1px solid #222;
border-radius:999px;
padding:3px 12px;
font-size:0.75rem;
color:#666;
}
.upload-area {
border:1px dashed #1f1f1f;
border-radius:10px;
padding:22px;
text-align:center;
cursor:pointer;
transition:all 0.2s;
background:#0a0a0a;
position:relative;
}
.upload-area:hover { border-color:#333; }
.upload-area.has-file { border-color:#22c55e; border-style:solid; }
.upload-area input[type="file"] {
position:absolute; inset:0;
opacity:0; cursor:pointer;
width:100%; height:100%;
}
.upload-icon { font-size:1.5rem; margin-bottom:6px; }
.upload-label { font-size:0.85rem; color:#444; }
.upload-sub { font-size:0.75rem; color:#2a2a2a; margin-top:4px; }
.file-ok { font-size:0.8rem; color:#22c55e; margin-top:6px; }
.history-list { display:flex; flex-direction:column; gap:8px; }
.hist-item {
background:#0a0a0a;
border:1px solid #151515;
border-radius:9px;
padding:11px 14px;
display:flex;
align-items:center;
gap:12px;
}
.hist-text { flex:1; font-size:0.82rem; color:#555; white-space:nowrap; overflow:hidden; text-overflow:ellipsis; }
.hist-lang {
font-size:0.7rem; color:#333;
background:#111; border:1px solid #1a1a1a;
border-radius:999px; padding:2px 8px;
}
.hist-play {
width:30px; height:30px;
border-radius:50%;
background:#151515; border:1px solid #222;
color:#666; font-size:0.75rem;
cursor:pointer; display:flex; align-items:center; justify-content:center;
transition:all 0.2s; flex-shrink:0;
}
.hist-play:hover { border-color:#c0c0c0; color:#fff; }
.voice-grid {
display:grid;
grid-template-columns:repeat(auto-fill,minmax(180px,1fr));
gap:12px;
}
.voice-card {
background:#0a0a0a;
border:1px solid #1a1a1a;
border-radius:10px;
padding:14px;
cursor:pointer;
transition:all 0.2s;
position:relative;
}
.voice-card:hover { border-color:#333; }
.voice-card.selected { border-color:#c0c0c0; }
.voice-avatar {
width:44px; height:44px;
border-radius:50%;
background:#1a1a1a;
display:flex; align-items:center; justify-content:center;
font-size:1.2rem;
margin-bottom:10px;
}
.voice-name { font-size:0.85rem; font-weight:600; color:#ccc; }
.voice-lang { font-size:0.72rem; color:#444; margin-top:3px; }
.voice-actions {
display:flex; gap:6px; margin-top:10px;
}
.empty-state {
text-align:center;
padding:40px 20px;
color:#2a2a2a;
font-size:0.85rem;
}
.empty-icon { font-size:2rem; margin-bottom:10px; }
.tag {
display:inline-block;
background:#151515; border:1px solid #1f1f1f;
border-radius:6px; padding:4px 10px;
font-size:0.75rem; color:#555;
margin:3px;
}
.progress-bar {
height:4px; background:#1a1a1a;
border-radius:2px; overflow:hidden;
margin-top:10px; display:none;
}
.progress-bar.visible { display:block; }
.progress-fill {
height:100%; background:#c0c0c0;
border-radius:2px;
transition:width 0.3s;
}
.status-log {
background:#0a0a0a; border:1px solid #1a1a1a;
border-radius:8px; padding:12px;
font-family:monospace; font-size:0.78rem;
color:#444; max-height:120px;
overflow-y:auto; margin-top:12px;
display:none;
}
.status-log.visible { display:block; }
.status-line { margin-bottom:4px; }
.status-line.ok { color:#22c55e; }
.status-line.err { color:#ef4444; }
.status-line.info { color:#666; }
.error-msg {
background:rgba(239,68,68,0.08);
border:1px solid rgba(239,68,68,0.15);
border-radius:7px; padding:9px 13px;
font-size:0.82rem; color:#ef4444;
display:none; margin-top:10px;
}
.error-msg.visible { display:block; }
.divider { height:1px; background:#141414; margin:16px 0; }
.row { display:flex; align-items:center; gap:12px; }
select {
background:#0a0a0a; border:1px solid #1f1f1f;
border-radius:8px; color:#888;
padding:9px 12px; outline:none;
font-family:inherit; font-size:0.85rem;
cursor:pointer; width:100%;
}
select:focus { border-color:#333; }
::-webkit-scrollbar { width:4px; }
::-webkit-scrollbar-track { background:#0a0a0a; }
::-webkit-scrollbar-thumb { background:#1f1f1f; border-radius:2px; }
.voice-design-result {
background:#0a0a0a; border:1px solid #1a1a1a;
border-radius:10px; padding:16px;
margin-top:14px; display:none;
}
.voice-design-result.visible { display:block; }
.param-row {
display:flex; justify-content:space-between;
align-items:center; margin-bottom:8px;
}
.param-key { font-size:0.78rem; color:#444; }
.param-val { font-size:0.78rem; color:#888; font-family:monospace; }
</style>
</head>
<body>
<!-- SIDEBAR -->
<div class="sidebar">
<div class="logo">
<h1>Cortana TTS</h1>
<p>Studio</p>
</div>
<div class="nav">
<div class="nav-item active" onclick="showPage('tts')">
<span class="nav-icon">▶</span> Text to Speech
</div>
<div class="nav-item" onclick="showPage('library')">
<span class="nav-icon">🎙</span> Voice Library
</div>
<div class="nav-item" onclick="showPage('design')">
<span class="nav-icon">✨</span> Voice Design
</div>
<div class="nav-item" onclick="showPage('dubbing')">
<span class="nav-icon">🎬</span> Dubbing
</div>
<div class="nav-item" onclick="showPage('api')">
<span class="nav-icon">⚡</span> API
</div>
</div>
</div>
<!-- MAIN -->
<div class="main">
<!-- PAGE: TTS -->
<div class="page active" id="page-tts">
<div class="page-title">Text to Speech</div>
<div class="page-sub">Generate natural speech in English, Hindi or Hinglish</div>
<div class="card">
<div class="card-title">Text Input</div>
<textarea id="ttsText"
placeholder="Type in English, Hindi or Hinglish...&#10;नमस्ते, मैं कोर्टाना हूं।&#10;Arey yaar, kya scene hai?"></textarea>
<div class="lang-row">
Detected: <span class="lang-badge" id="langBadge">English</span>
<span style="color:#2a2a2a;font-size:0.75rem;">Ctrl+Enter to generate</span>
</div>
</div>
<div class="card">
<div class="card-title">Voice</div>
<div class="grid2">
<div>
<span class="control-label">Active Voice</span>
<select id="voiceSelect" onchange="voiceSelectChanged()">
<option value="default">Default Voice</option>
</select>
</div>
<div>
<span class="control-label">Upload One-time Sample</span>
<div class="upload-area" id="ttsUpload" style="padding:12px;">
<input type="file" id="ttsVoiceFile" accept=".wav,.mp3" onchange="ttsFileSelected()"/>
<div style="font-size:0.82rem;color:#333;">Drop WAV/MP3 here</div>
<div class="file-ok" id="ttsFileName"></div>
</div>
</div>
</div>
</div>
<div class="card">
<div class="card-title">Parameters</div>
<div class="grid2">
<div>
<span class="control-label">Emotion / Expressiveness</span>
<div class="slider-row">
<input type="range" id="ttsEmotion" min="0" max="1" step="0.05" value="0.5"/>
<span class="slider-val" id="ttsEmotionVal">0.5</span>
</div>
</div>
<div>
<span class="control-label">Speed</span>
<div class="slider-row">
<input type="range" id="ttsSpeed" min="0.5" max="2.0" step="0.1" value="1.0"/>
<span class="slider-val" id="ttsSpeedVal">1.0</span>
</div>
</div>
</div>
<button class="btn btn-primary" id="ttsBtn" onclick="generateTTS()">
<div class="spinner"></div>
<span class="btn-label">▶ Generate Speech</span>
</button>
<div class="error-msg" id="ttsError"></div>
<div class="audio-player" id="ttsPlayer">
<audio id="ttsAudio" controls></audio><br/>
<a class="dl-btn" id="ttsDL" download="cortana.mp3">↓ Download MP3</a>
<button class="btn btn-sm" style="margin-left:8px;margin-top:10px;"
onclick="saveToHistory()">+ History</button>
</div>
</div>
<div class="card">
<div class="card-title">Generation History</div>
<div class="history-list" id="histList">
<div class="empty-state">
<div class="empty-icon">🎵</div>
Your generations appear here
</div>
</div>
</div>
</div>
<!-- PAGE: VOICE LIBRARY -->
<div class="page" id="page-library">
<div class="page-title">Voice Library</div>
<div class="page-sub">Save, manage and reuse cloned voices permanently</div>
<div class="card">
<div class="card-title">Add New Voice</div>
<div class="grid2">
<div>
<span class="control-label">Voice Name</span>
<input type="text" id="newVoiceName" placeholder="e.g. Cortana English, Raj Hindi..."/>
</div>
<div>
<span class="control-label">Language Tag</span>
<select id="newVoiceLang">
<option value="en">English</option>
<option value="hi">Hindi</option>
<option value="both">English + Hindi</option>
</select>
</div>
</div>
<div style="margin-top:14px;">
<span class="control-label">Voice Sample (5–30 seconds, clean audio)</span>
<div class="upload-area" id="libUploadArea">
<input type="file" id="libVoiceFile" accept=".wav,.mp3" onchange="libFileSelected()"/>
<div class="upload-icon">🎙</div>
<div class="upload-label">Upload WAV or MP3</div>
<div class="upload-sub">No background music — clear speech only</div>
<div class="file-ok" id="libFileName"></div>
</div>
</div>
<button class="btn btn-primary" id="saveVoiceBtn" onclick="saveVoice()">
<div class="spinner"></div>
<span class="btn-label">💾 Save to Library</span>
</button>
<div class="error-msg" id="libError"></div>
</div>
<div class="card">
<div class="card-title">Saved Voices</div>
<div class="voice-grid" id="voiceGrid">
<div class="empty-state" style="grid-column:1/-1;">
<div class="empty-icon">🎙</div>
No voices saved yet — add one above
</div>
</div>
</div>
</div>
<!-- PAGE: VOICE DESIGN -->
<div class="page" id="page-design">
<div class="page-title">Voice Design</div>
<div class="page-sub">Describe a voice in plain words — AI generates the parameters</div>
<div class="card">
<div class="card-title">Describe Your Voice</div>
<textarea id="designPrompt" style="min-height:80px;"
placeholder="e.g. Young Indian woman, warm and friendly, speaks at a medium pace with a hint of excitement&#10;e.g. Deep mature male voice, calm and authoritative, slightly slow&#10;e.g. Energetic young man, fast paced, very expressive"></textarea>
<div style="margin-top:14px;">
<span class="control-label">Preview Text</span>
<textarea id="designPreviewText" style="min-height:60px;"
placeholder="Hello, I am Cortana. How can I assist you today?"></textarea>
</div>
<button class="btn btn-primary" id="designBtn" onclick="designVoice()">
<div class="spinner"></div>
<span class="btn-label">✨ Design Voice</span>
</button>
<div class="error-msg" id="designError"></div>
<div class="voice-design-result" id="designResult">
<div class="card-title">Generated Parameters</div>
<div id="designParams"></div>
<div class="divider"></div>
<div class="audio-player visible" style="margin-top:0;">
<audio id="designAudio" controls></audio><br/>
<a class="dl-btn" id="designDL" download="designed_voice.mp3">↓ Download</a>
<button class="btn btn-sm" style="margin-left:8px;margin-top:10px;"
onclick="saveDesignedVoice()">💾 Save to Library</button>
</div>
</div>
</div>
<div class="card">
<div class="card-title">Example Prompts</div>
<div>
<span class="tag" onclick="setDesignPrompt(this)">Young Indian woman, warm and friendly</span>
<span class="tag" onclick="setDesignPrompt(this)">Deep mature male, calm and authoritative</span>
<span class="tag" onclick="setDesignPrompt(this)">Energetic teen, very expressive and fast</span>
<span class="tag" onclick="setDesignPrompt(this)">Professional newsreader, neutral accent</span>
<span class="tag" onclick="setDesignPrompt(this)">Soft spoken elderly woman, slow and gentle</span>
<span class="tag" onclick="setDesignPrompt(this)">Excited sports commentator, loud and fast</span>
</div>
</div>
</div>
<!-- PAGE: DUBBING -->
<div class="page" id="page-dubbing">
<div class="page-title">Dubbing</div>
<div class="page-sub">Translate and re-voice any video or audio file</div>
<div class="card" style="border-color:#ef444422;">
<div style="font-size:0.82rem;color:#ef4444;margin-bottom:4px;">⚠️ Free CPU Warning</div>
<div style="font-size:0.78rem;color:#555;">
Dubbing on free CPU takes 10–20 minutes per minute of video.
Start with a short clip to test. Upgrade to GPU for faster processing.
</div>
</div>
<div class="card">
<div class="card-title">Upload Media</div>
<div class="upload-area" id="dubUploadArea">
<input type="file" id="dubFile" accept="video/*,audio/*" onchange="dubFileSelected()"/>
<div class="upload-icon">🎬</div>
<div class="upload-label">Upload video or audio file</div>
<div class="upload-sub">MP4, MKV, AVI, MP3, WAV — max 100MB</div>
<div class="file-ok" id="dubFileName"></div>
</div>
<div class="grid2" style="margin-top:16px;">
<div>
<span class="control-label">Source Language</span>
<select id="dubSrcLang">
<option value="auto">Auto Detect</option>
<option value="en">English</option>
<option value="hi">Hindi</option>
<option value="es">Spanish</option>
<option value="fr">French</option>
<option value="de">German</option>
<option value="ja">Japanese</option>
<option value="zh">Chinese</option>
</select>
</div>
<div>
<span class="control-label">Target Language</span>
<select id="dubTgtLang">
<option value="en">English</option>
<option value="hi">Hindi</option>
<option value="es">Spanish</option>
<option value="fr">French</option>
<option value="de">German</option>
<option value="ja">Japanese</option>
<option value="zh">Chinese</option>
</select>
</div>
</div>
<div style="margin-top:14px;">
<span class="control-label">Dubbing Voice (optional)</span>
<select id="dubVoiceSelect">
<option value="default">Default Voice</option>
</select>
</div>
<button class="btn btn-primary" id="dubBtn" onclick="startDubbing()">
<div class="spinner"></div>
<span class="btn-label">🎬 Start Dubbing</span>
</button>
<div class="error-msg" id="dubError"></div>
<div class="progress-bar" id="dubProgress">
<div class="progress-fill" id="dubProgressFill" style="width:0%"></div>
</div>
<div class="status-log" id="dubLog"></div>
<div class="audio-player" id="dubPlayer">
<audio id="dubAudio" controls></audio><br/>
<a class="dl-btn" id="dubDL" download="dubbed.mp3">↓ Download Dubbed Audio</a>
</div>
</div>
</div>
<!-- PAGE: API -->
<div class="page" id="page-api">
<div class="page-title">API Reference</div>
<div class="page-sub">OpenAI-compatible endpoints — drop-in replacement</div>
<div class="card">
<div class="card-title">Text to Speech</div>
<div style="background:#0a0a0a;border:1px solid #1a1a1a;border-radius:8px;padding:14px;font-family:monospace;font-size:0.78rem;color:#555;line-height:1.7;">
POST /v1/audio/speech<br/><br/>
{<br/>
&nbsp;&nbsp;"input": "Hello I am Cortana",<br/>
&nbsp;&nbsp;"emotion": 0.5,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;// 0.0 neutral → 1.0 expressive<br/>
&nbsp;&nbsp;"speed": 1.0&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;// 0.5x to 2.0x<br/>
}<br/><br/>
Returns: audio/mpeg stream
</div>
<button class="btn btn-sm" style="margin-top:10px;" onclick="copyText(0)">Copy</button>
</div>
<div class="card">
<div class="card-title">Voice Cloning (on-the-fly)</div>
<div style="background:#0a0a0a;border:1px solid #1a1a1a;border-radius:8px;padding:14px;font-family:monospace;font-size:0.78rem;color:#555;line-height:1.7;">
POST /v1/audio/speech/clone<br/><br/>
{<br/>
&nbsp;&nbsp;"input": "Hello I am Cortana",<br/>
&nbsp;&nbsp;"voice_b64": "base64_encoded_wav",<br/>
&nbsp;&nbsp;"emotion": 0.5<br/>
}<br/><br/>
Returns: audio/mpeg stream
</div>
<button class="btn btn-sm" style="margin-top:10px;" onclick="copyText(1)">Copy</button>
</div>
<div class="card">
<div class="card-title">List Voices</div>
<div style="background:#0a0a0a;border:1px solid #1a1a1a;border-radius:8px;padding:14px;font-family:monospace;font-size:0.78rem;color:#555;line-height:1.7;">
GET /v1/voices<br/><br/>
Returns: { "voices": [ { "id": "...", "name": "...", "lang": "..." } ] }
</div>
</div>
<div class="card">
<div class="card-title">CURL Example</div>
<div style="background:#0a0a0a;border:1px solid #1a1a1a;border-radius:8px;padding:14px;font-family:monospace;font-size:0.78rem;color:#555;line-height:1.7;" id="curlExample">
curl -X POST "https://YOUR_SPACE.hf.space/v1/audio/speech" \<br/>
&nbsp;&nbsp;-H "Authorization: Bearer YOUR_HF_TOKEN" \<br/>
&nbsp;&nbsp;-H "Content-Type: application/json" \<br/>
&nbsp;&nbsp;-d '{"input": "Hello I am Cortana", "emotion": 0.5}' \<br/>
&nbsp;&nbsp;--output speech.mp3
</div>
</div>
</div>
</div>
<script>
const historyItems = [];
let selectedVoiceId = 'default';
let designedParams = null;
// ============ NAVIGATION ============
function showPage(id) {
document.querySelectorAll('.page').forEach(p => p.classList.remove('active'));
document.querySelectorAll('.nav-item').forEach(n => n.classList.remove('active'));
document.getElementById('page-' + id).classList.add('active');
event.currentTarget.classList.add('active');
if (id === 'library') loadVoiceLibrary();
}
// ============ LANGUAGE DETECTION ============
document.getElementById('ttsText').addEventListener('input', function() {
const text = this.value;
const badge = document.getElementById('langBadge');
if (/[\u0900-\u097F]/.test(text)) {
badge.textContent = 'Hindi';
badge.style.color = '#f97316';
} else if (/\b(hai|hoon|kya|nahi|aur|toh|yaar|arey|bhi)\b/i.test(text)) {
badge.textContent = 'Hinglish';
badge.style.color = '#a855f7';
} else {
badge.textContent = 'English';
badge.style.color = '#666';
}
});
// ============ SLIDERS ============
document.getElementById('ttsEmotion').addEventListener('input', function() {
document.getElementById('ttsEmotionVal').textContent = this.value;
});
document.getElementById('ttsSpeed').addEventListener('input', function() {
document.getElementById('ttsSpeedVal').textContent = this.value;
});
// ============ TTS ============
function ttsFileSelected() {
const f = document.getElementById('ttsVoiceFile').files[0];
if (f) document.getElementById('ttsFileName').textContent = '✅ ' + f.name;
}
async function generateTTS() {
const text = document.getElementById('ttsText').value.trim();
const emotion = document.getElementById('ttsEmotion').value;
const speed = document.getElementById('ttsSpeed').value;
const voiceFile = document.getElementById('ttsVoiceFile').files[0];
const btn = document.getElementById('ttsBtn');
const err = document.getElementById('ttsError');
if (!text) { showError(err, 'Please enter text first.'); return; }
err.classList.remove('visible');
setLoading(btn, true);
try {
let res;
if (voiceFile) {
const b64 = await toB64(voiceFile);
res = await fetch('/v1/audio/speech/clone', {
method:'POST',
headers:{'Content-Type':'application/json'},
body:JSON.stringify({ input:text, emotion:parseFloat(emotion), speed:parseFloat(speed), voice_b64:b64 })
});
} else if (selectedVoiceId !== 'default') {
res = await fetch('/v1/audio/speech/voice/' + selectedVoiceId, {
method:'POST',
headers:{'Content-Type':'application/json'},
body:JSON.stringify({ input:text, emotion:parseFloat(emotion), speed:parseFloat(speed) })
});
} else {
res = await fetch('/v1/audio/speech', {
method:'POST',
headers:{'Content-Type':'application/json'},
body:JSON.stringify({ input:text, emotion:parseFloat(emotion), speed:parseFloat(speed) })
});
}
if (!res.ok) { const e = await res.json(); throw new Error(e.error); }
const blob = await res.blob();
const url = URL.createObjectURL(blob);
setAudio('ttsAudio', 'ttsDL', url);
document.getElementById('ttsPlayer').classList.add('visible');
document.getElementById('ttsAudio').play();
// auto add to history
historyItems.unshift({ text, url, lang: /[\u0900-\u097F]/.test(text) ? 'Hindi' : 'English' });
renderHistory();
} catch(e) {
showError(err, e.message);
} finally {
setLoading(btn, false);
}
}
function renderHistory() {
const list = document.getElementById('histList');
if (!historyItems.length) return;
list.innerHTML = historyItems.slice(0,10).map((h,i) => `
<div class="hist-item">
<div class="hist-text">${h.text}</div>
<div class="hist-lang">${h.lang}</div>
<button class="hist-play" onclick="new Audio('${h.url}').play()">▶</button>
</div>
`).join('');
}
document.getElementById('ttsText').addEventListener('keydown', e => {
if (e.ctrlKey && e.key === 'Enter') generateTTS();
});
// ============ VOICE LIBRARY ============
function libFileSelected() {
const f = document.getElementById('libVoiceFile').files[0];
if (f) {
document.getElementById('libFileName').textContent = '✅ ' + f.name;
document.getElementById('libUploadArea').classList.add('has-file');
}
}
async function saveVoice() {
const name = document.getElementById('newVoiceName').value.trim();
const lang = document.getElementById('newVoiceLang').value;
const file = document.getElementById('libVoiceFile').files[0];
const btn = document.getElementById('saveVoiceBtn');
const err = document.getElementById('libError');
if (!name) { showError(err, 'Please enter a voice name.'); return; }
if (!file) { showError(err, 'Please upload a voice sample.'); return; }
err.classList.remove('visible');
setLoading(btn, true);
try {
const b64 = await toB64(file);
const res = await fetch('/v1/voices', {
method:'POST',
headers:{'Content-Type':'application/json'},
body:JSON.stringify({ name, lang, voice_b64:b64, filename:file.name })
});
if (!res.ok) { const e = await res.json(); throw new Error(e.error); }
document.getElementById('newVoiceName').value = '';
document.getElementById('libVoiceFile').value = '';
document.getElementById('libFileName').textContent = '';
document.getElementById('libUploadArea').classList.remove('has-file');
await loadVoiceLibrary();
} catch(e) {
showError(err, e.message);
} finally {
setLoading(btn, false);
}
}
async function loadVoiceLibrary() {
const res = await fetch('/v1/voices');
const data = await res.json();
const grid = document.getElementById('voiceGrid');
const voices = data.voices || [];
// Update dropdowns
updateVoiceDropdowns(voices);
if (!voices.length) {
grid.innerHTML = '<div class="empty-state" style="grid-column:1/-1;"><div class="empty-icon">🎙</div>No voices saved yet</div>';
return;
}
grid.innerHTML = voices.map(v => `
<div class="voice-card ${selectedVoiceId === v.id ? 'selected' : ''}" onclick="selectVoice('${v.id}')">
<div class="voice-avatar">🎙</div>
<div class="voice-name">${v.name}</div>
<div class="voice-lang">${v.lang.toUpperCase()}</div>
<div class="voice-actions">
<button class="btn btn-sm" onclick="event.stopPropagation();previewVoice('${v.id}')">▶</button>
<button class="btn btn-sm btn-danger" onclick="event.stopPropagation();deleteVoice('${v.id}')">✕</button>
</div>
</div>
`).join('');
}
function updateVoiceDropdowns(voices) {
const opts = '<option value="default">Default Voice</option>' +
voices.map(v => `<option value="${v.id}">${v.name}</option>`).join('');
document.getElementById('voiceSelect').innerHTML = opts;
document.getElementById('dubVoiceSelect').innerHTML = opts;
}
function voiceSelectChanged() {
selectedVoiceId = document.getElementById('voiceSelect').value;
}
function selectVoice(id) {
selectedVoiceId = id;
document.getElementById('voiceSelect').value = id;
loadVoiceLibrary();
}
async function previewVoice(id) {
const res = await fetch('/v1/audio/speech/voice/' + id, {
method:'POST',
headers:{'Content-Type':'application/json'},
body:JSON.stringify({ input:'Hello, I am Cortana. How can I assist you?', emotion:0.5 })
});
if (!res.ok) return;
const blob = await res.blob();
new Audio(URL.createObjectURL(blob)).play();
}
async function deleteVoice(id) {
if (!confirm('Delete this voice?')) return;
await fetch('/v1/voices/' + id, { method:'DELETE' });
await loadVoiceLibrary();
}
// ============ VOICE DESIGN ============
function setDesignPrompt(el) {
document.getElementById('designPrompt').value = el.textContent;
}
/**
 * Send the voice description to /v1/voices/design, show the returned
 * parameters, then synthesize and play a preview using those parameters.
 * Errors from either request are shown in the design error element.
 */
async function designVoice() {
  const prompt = document.getElementById('designPrompt').value.trim();
  const preview = document.getElementById('designPreviewText').value.trim() ||
    'Hello, I am Cortana. How can I assist you today?';
  const btn = document.getElementById('designBtn');
  const err = document.getElementById('designError');
  if (!prompt) { showError(err, 'Please describe the voice first.'); return; }
  err.classList.remove('visible');
  setLoading(btn, true);
  try {
    const res = await fetch('/v1/voices/design', {
      method:'POST',
      headers:{'Content-Type':'application/json'},
      body:JSON.stringify({ prompt, preview_text:preview })
    });
    if (!res.ok) {
      // The error body may not be JSON (e.g. a proxy error page).
      const e = await res.json().catch(() => ({}));
      throw new Error(e.error || 'Voice design failed (HTTP ' + res.status + ')');
    }
    const data = await res.json();
    designedParams = data.params;
    // Show params. Keys and values originate from an LLM reply, so they are
    // inserted as text nodes rather than parsed as HTML.
    const paramsBox = document.getElementById('designParams');
    paramsBox.textContent = '';
    Object.entries(data.params).forEach(([k, v]) => {
      const row = document.createElement('div');
      row.className = 'param-row';
      const keyEl = document.createElement('span');
      keyEl.className = 'param-key';
      keyEl.textContent = k;
      const valEl = document.createElement('span');
      valEl.className = 'param-val';
      valEl.textContent = String(v);
      row.appendChild(keyEl);
      row.appendChild(valEl);
      paramsBox.appendChild(row);
    });
    // Generate preview audio
    const audioRes = await fetch('/v1/audio/speech', {
      method:'POST',
      headers:{'Content-Type':'application/json'},
      body:JSON.stringify({ input:preview, ...data.params })
    });
    if (!audioRes.ok) {
      // Without this check a JSON error body would be loaded as audio.
      const e = await audioRes.json().catch(() => ({}));
      throw new Error(e.error || 'Preview synthesis failed (HTTP ' + audioRes.status + ')');
    }
    const blob = await audioRes.blob();
    const url = URL.createObjectURL(blob);
    setAudio('designAudio', 'designDL', url);
    document.getElementById('designResult').classList.add('visible');
    document.getElementById('designAudio').play();
  } catch(e) {
    showError(err, e.message);
  } finally {
    setLoading(btn, false);
  }
}
/**
 * Prompt for a name and store the most recently designed parameters as a
 * library voice. Reports success only when the server accepted the save.
 */
async function saveDesignedVoice() {
  const name = prompt('Enter a name for this voice:');
  if (!name || !designedParams) return;
  try {
    const res = await fetch('/v1/voices/design/save', {
      method:'POST',
      headers:{'Content-Type':'application/json'},
      body:JSON.stringify({ name, params:designedParams })
    });
    if (!res.ok) {
      const e = await res.json().catch(() => ({}));
      throw new Error(e.error || 'HTTP ' + res.status);
    }
    alert('Voice saved to library!');
  } catch (e) {
    alert('Could not save voice: ' + e.message);
  }
}
// ============ DUBBING ============
/** Show the chosen file's name and mark the dubbing upload area as filled. */
function dubFileSelected() {
  const picked = document.getElementById('dubFile').files[0];
  if (!picked) return;
  const label = document.getElementById('dubFileName');
  label.textContent = '✅ ' + picked.name;
  const area = document.getElementById('dubUploadArea');
  area.classList.add('has-file');
}
/**
 * Run the dubbing flow: base64-encode the selected file, POST it to
 * /v1/dubbing with the chosen languages and voice, and load the returned
 * audio into the dubbing player. Progress and errors are written to the log.
 */
async function startDubbing() {
  const file = document.getElementById('dubFile').files[0];
  const srcLang = document.getElementById('dubSrcLang').value;
  const tgtLang = document.getElementById('dubTgtLang').value;
  const voiceId = document.getElementById('dubVoiceSelect').value;
  const btn = document.getElementById('dubBtn');
  const err = document.getElementById('dubError');
  const log = document.getElementById('dubLog');
  const prog = document.getElementById('dubProgress');
  const fill = document.getElementById('dubProgressFill');
  if (!file) { showError(err, 'Please upload a video or audio file.'); return; }
  err.classList.remove('visible');
  setLoading(btn, true);
  log.classList.add('visible');
  prog.classList.add('visible');
  log.innerHTML = '<div class="status-line info">Starting dubbing pipeline...</div>';
  fill.style.width = '5%';
  try {
    const b64 = await toB64(file);
    fill.style.width = '10%';
    addLog(log, 'Uploading file...', 'info');
    const res = await fetch('/v1/dubbing', {
      method:'POST',
      headers:{'Content-Type':'application/json'},
      body:JSON.stringify({
        file_b64: b64,
        filename: file.name,
        src_lang: srcLang,
        tgt_lang: tgtLang,
        voice_id: voiceId
      })
    });
    fill.style.width = '50%';
    addLog(log, 'Processing...', 'info');
    if (!res.ok) {
      // Large uploads can be rejected by something in front of the app with a
      // non-JSON body; fall back to the HTTP status instead of a parse error.
      const e = await res.json().catch(() => ({}));
      throw new Error(e.error || 'Dubbing failed (HTTP ' + res.status + ')');
    }
    fill.style.width = '90%';
    addLog(log, 'Finalizing audio...', 'info');
    const blob = await res.blob();
    const url = URL.createObjectURL(blob);
    setAudio('dubAudio', 'dubDL', url);
    document.getElementById('dubPlayer').classList.add('visible');
    fill.style.width = '100%';
    addLog(log, '✅ Dubbing complete!', 'ok');
  } catch(e) {
    showError(err, e.message);
    addLog(log, '❌ ' + e.message, 'err');
  } finally {
    setLoading(btn, false);
  }
}
// ============ HELPERS ============
/**
 * Read a File/Blob and resolve with its contents as a base64 string
 * (the part of the data URL after the comma). Rejects if the read fails,
 * so awaiting callers do not hang forever.
 */
function toB64(file) {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    // An empty file yields a data URL with no payload; resolve with ''.
    reader.onload = () => resolve(String(reader.result).split(',')[1] || '');
    reader.onerror = () => reject(reader.error || new Error('Could not read file'));
    reader.readAsDataURL(file);
  });
}
/** Point an audio element and its download link at the same URL. */
function setAudio(audioId, dlId, url) {
  const player = document.getElementById(audioId);
  player.src = url;
  const downloadLink = document.getElementById(dlId);
  downloadLink.href = url;
}
/** Put a warning-prefixed message into the element and make it visible. */
function showError(el, msg) {
  const text = `⚠ ${msg}`;
  el.textContent = text;
  el.classList.add('visible');
}
/** Disable/enable a button and toggle its 'loading' class to match. */
function setLoading(btn, state) {
  const cssClass = 'loading';
  btn.disabled = state;
  btn.classList.toggle(cssClass, state);
}
/**
 * Append one status line to a log element and scroll it into view.
 * The message is inserted as text, so error strings coming back from the
 * server cannot inject markup, and existing lines are not re-parsed.
 */
function addLog(el, msg, type) {
  const line = document.createElement('div');
  line.className = 'status-line ' + type;
  line.textContent = msg;
  el.appendChild(line);
  el.scrollTop = el.scrollHeight;
}
/**
 * Copy the text of the idx-th monospace-styled box on the API page to the
 * clipboard. Does nothing when the box does not exist or the clipboard API
 * is unavailable.
 */
function copyText(idx) {
  const boxes = document.querySelectorAll('#page-api [style*="monospace"]');
  const box = boxes[idx];
  if (!box || !navigator.clipboard) return;
  navigator.clipboard.writeText(box.innerText)
    .catch(e => console.error('Copy failed:', e));
}
// Load voice library on start
loadVoiceLibrary();
</script>
</body>
</html>
""")
# ============================================================
# API ENDPOINTS
# ============================================================
@app.get("/v1")
async def v1_root():
    """Report that the service is reachable, along with its service name."""
    payload = {"status": "ok", "service": "chatterbox-multilingual-tts"}
    return payload
@app.post("/v1/audio/speech")
async def tts(request: Request):
    """Synthesize the posted text with the default voice sample.

    Reads ``input`` (text) and ``emotion`` (default 0.5, passed to the model
    as ``exaggeration``) from the JSON body. Responds with MP3 audio, 400 when
    ``input`` is empty, or 500 carrying the exception text on any failure.
    """
    try:
        payload = await request.json()
        text = payload.get("input", "")
        exaggeration = float(payload.get("emotion", 0.5))
        if text:
            speech = model.generate(
                text,
                audio_prompt_path=voice_sample,
                exaggeration=exaggeration,
                language=get_language(text),
            )
            buffer = io.BytesIO()
            torchaudio.save(buffer, speech, model.sr, format="mp3")
            buffer.seek(0)
            return StreamingResponse(buffer, media_type="audio/mpeg")
        return JSONResponse({"error": "No input text"}, status_code=400)
    except Exception as exc:
        return JSONResponse({"error": str(exc)}, status_code=500)
@app.post("/v1/audio/speech/clone")
async def tts_clone(request: Request):
    """Synthesize the posted text using a voice sample supplied in the request.

    JSON body: ``input`` (text), optional ``voice_b64`` (base64 audio used as
    the voice prompt; the default voice sample is used when absent) and
    optional ``emotion`` (default 0.5, passed as ``exaggeration``).
    Responds with MP3 audio, 400 when ``input`` is empty, or 500 with the
    exception text on failure.
    """
    temp_path = None
    try:
        data = await request.json()
        text = data.get("input", "")
        voice_b64 = data.get("voice_b64", "")
        emotion = float(data.get("emotion", 0.5))
        if not text:
            return JSONResponse({"error": "No input text"}, status_code=400)
        lang = get_language(text)
        if voice_b64:
            voice_bytes = base64.b64decode(voice_b64)
            # A unique file per request: a single fixed path would let two
            # concurrent clone requests overwrite each other's voice sample.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(voice_bytes)
                temp_path = f.name
            prompt_path = temp_path
        else:
            prompt_path = voice_sample
        wav = model.generate(text, audio_prompt_path=prompt_path, exaggeration=emotion, language=lang)
        out = io.BytesIO()
        torchaudio.save(out, wav, model.sr, format="mp3")
        out.seek(0)
        return StreamingResponse(out, media_type="audio/mpeg")
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
    finally:
        # The audio is already fully rendered into memory, so the uploaded
        # sample is no longer needed once we get here.
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
@app.post("/v1/audio/speech/voice/{voice_id}")
async def tts_with_voice(voice_id: str, request: Request):
    """Synthesize the posted text with a voice from the voice library.

    JSON body: ``input`` (text) and optional ``emotion`` (default 0.5, passed
    as ``exaggeration``). Falls back to the default voice sample when the
    voice id is unknown or the entry has no audio file (designed voices are
    stored with ``filename`` set to ``None``).
    Responds with MP3 audio, 400 when ``input`` is empty, or 500 with the
    exception text on failure.
    """
    try:
        data = await request.json()
        text = data.get("input", "")
        emotion = float(data.get("emotion", 0.5))
        if not text:
            return JSONResponse({"error": "No input text"}, status_code=400)
        meta = load_voices_meta()
        voice_info = meta.get(voice_id)
        # Only build a path when the entry really has a file; joining a path
        # with None raises TypeError and would turn every request for a
        # designed voice into a 500.
        if voice_info and voice_info.get("filename"):
            prompt_path = str(VOICES_DIR / voice_info["filename"])
        else:
            prompt_path = voice_sample
        lang = get_language(text)
        wav = model.generate(text, audio_prompt_path=prompt_path, exaggeration=emotion, language=lang)
        out = io.BytesIO()
        torchaudio.save(out, wav, model.sr, format="mp3")
        out.seek(0)
        return StreamingResponse(out, media_type="audio/mpeg")
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
# ============================================================
# VOICE LIBRARY ENDPOINTS
# ============================================================
@app.get("/v1/voices")
async def list_voices():
    """Return the id, name and language of every voice in the metadata file."""
    voices = []
    for voice_id, info in load_voices_meta().items():
        voices.append({"id": voice_id, "name": info["name"], "lang": info["lang"]})
    return {"voices": voices}
@app.post("/v1/voices")
async def add_voice(request: Request):
    """Add an uploaded voice sample to the voice library.

    JSON body: ``name`` (required), ``voice_b64`` (required, base64 audio)
    and ``lang`` (default "en"). The sample is written to the voices
    directory under a generated 8-character id, recorded in the metadata
    file, and both files are pushed to the HF repo via ``push_to_hf``.
    Responds with the new voice's id/name/lang, 400 for missing or
    undecodable input, or 500 with the exception text on failure.
    """
    try:
        data = await request.json()
        name = data.get("name", "").strip()
        lang = data.get("lang", "en")
        voice_b64 = data.get("voice_b64", "")
        if not name or not voice_b64:
            return JSONResponse({"error": "Name and voice sample required"}, status_code=400)
        # Decode before touching the disk so a malformed upload is reported as
        # a client error and leaves no partial state behind.
        try:
            voice_bytes = base64.b64decode(voice_b64)
        except (ValueError, TypeError):
            return JSONResponse({"error": "Voice sample is not valid base64"}, status_code=400)
        import uuid
        voice_id = str(uuid.uuid4())[:8]
        # The stored name is derived from the generated id, never from
        # client-supplied text.
        safe_name = f"{voice_id}.wav"
        local_path = str(VOICES_DIR / safe_name)
        with open(local_path, "wb") as f:
            f.write(voice_bytes)
        meta = load_voices_meta()
        meta[voice_id] = {"name": name, "lang": lang, "filename": safe_name}
        save_voices_meta(meta)
        # Push to HF repo
        push_to_hf(local_path, safe_name)
        push_to_hf(str(VOICES_META), "meta.json")
        return {"id": voice_id, "name": name, "lang": lang}
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
@app.delete("/v1/voices/{voice_id}")
async def delete_voice(voice_id: str):
    """Remove a voice from the library.

    Deletes the voice's audio file when it has one, drops its metadata entry,
    saves the metadata and pushes it to the HF repo. An unknown id is not an
    error. Responds with ``{"deleted": voice_id}`` or 500 with the exception
    text on failure.
    """
    try:
        meta = load_voices_meta()
        if voice_id in meta:
            # Designed voices are stored with filename None; joining a path
            # with None raises TypeError, which made them impossible to delete.
            filename = meta[voice_id].get("filename")
            if filename:
                wav_path = VOICES_DIR / filename
                if wav_path.exists():
                    wav_path.unlink()
            del meta[voice_id]
            save_voices_meta(meta)
            push_to_hf(str(VOICES_META), "meta.json")
        return {"deleted": voice_id}
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
# ============================================================
# VOICE DESIGN ENDPOINTS
# ============================================================
@app.post("/v1/voices/design")
async def design_voice(request: Request):
    """Map a free-text voice description to synthesis parameters via a Groq LLM.

    JSON body: ``prompt`` (the voice description, required) and optional
    ``preview_text``. Responds with ``{"params": ..., "preview_text": ...}``
    where ``params`` is the JSON object produced by the model, 400 when the
    prompt is empty, or 500 when the API key is missing or the call fails.
    """
    try:
        data = await request.json()
        prompt = data.get("prompt", "")
        preview_text = data.get("preview_text", "Hello, I am Cortana.")
        if not isinstance(prompt, str) or not prompt.strip():
            return JSONResponse({"error": "No voice description provided"}, status_code=400)
        if not GROQ_KEY:
            return JSONResponse({"error": "GROQ_API_KEY not set in secrets"}, status_code=500)
        # Ask Groq LLM to map description to Chatterbox parameters
        async with httpx.AsyncClient(timeout=15.0) as client:
            res = await client.post(
                "https://api.groq.com/openai/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {GROQ_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": "llama-3.1-8b-instant",
                    "messages": [
                        {
                            "role": "system",
                            "content": """You are a voice parameter mapper for a TTS system.
Given a voice description, output ONLY a JSON object with these exact fields:
- emotion: float 0.0 to 1.0 (0=neutral/calm, 1=very expressive/excited)
- speed: float 0.5 to 2.0 (0.5=very slow, 1.0=normal, 2.0=very fast)
- description: one sentence summarizing the voice
Examples:
"calm elderly woman" -> {"emotion":0.2,"speed":0.8,"description":"Soft calm elderly female voice"}
"excited sports commentator" -> {"emotion":0.95,"speed":1.6,"description":"Energetic fast sports commentator"}
"professional newsreader" -> {"emotion":0.3,"speed":1.0,"description":"Neutral professional news voice"}
Output ONLY the JSON. No explanation. No markdown."""
                        },
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                    "max_tokens": 100,
                    "temperature": 0.3
                }
            )
        # Surface HTTP failures (bad key, rate limit, ...) as themselves
        # rather than as a KeyError on the missing "choices" field.
        res.raise_for_status()
        result = res.json()
        raw = result["choices"][0]["message"]["content"].strip()
        # The model is told to emit bare JSON, but may still wrap it in a code
        # fence or add stray text; keep only the outermost {...} span.
        start, end = raw.find("{"), raw.rfind("}")
        if start != -1 and end > start:
            raw = raw[start:end + 1]
        params = json.loads(raw)
        if not isinstance(params, dict):
            raise ValueError("Voice design model did not return a JSON object")
        return {"params": params, "preview_text": preview_text}
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
@app.post("/v1/voices/design/save")
async def save_designed_voice(request: Request):
    """Store designed voice parameters as a new library entry.

    Reads ``name`` (default "Designed Voice") and ``params`` (default ``{}``)
    from the JSON body, records them under a generated 8-character id with no
    audio file, saves the metadata and pushes it to the HF repo. Responds
    with the new id and name, or 500 with the exception text on failure.
    """
    try:
        import uuid

        body = await request.json()
        voice_name = body.get("name", "Designed Voice")
        entry = {
            "name": voice_name,
            "lang": "en",
            "filename": None,
            "params": body.get("params", {}),
            "designed": True,
        }
        voice_id = str(uuid.uuid4())[:8]
        library = load_voices_meta()
        library[voice_id] = entry
        save_voices_meta(library)
        push_to_hf(str(VOICES_META), "meta.json")
        return {"id": voice_id, "name": voice_name}
    except Exception as exc:
        return JSONResponse({"error": str(exc)}, status_code=500)
# ============================================================
# DUBBING ENDPOINT
# ============================================================
@app.post("/v1/dubbing")
async def dub_video(request: Request):
    """Dub an uploaded audio/video file into another language.

    Pipeline: decode the base64 upload into a temp directory, extract mono
    16 kHz WAV with ffmpeg, transcribe it with Groq Whisper, translate the
    transcript with a Groq LLM, synthesize the translation with the TTS
    model, and return the result as MP3.

    JSON body: ``file_b64`` (required), ``filename`` (default "input.mp4"),
    ``src_lang`` (default "auto", which sends no language hint to the
    transcription call), ``tgt_lang`` (default "en") and ``voice_id``
    (library voice to use; the default voice sample is used otherwise).
    Responds 400 when no file is supplied, 500 with an error message when the
    API key is missing or any step fails.
    """
    try:
        data = await request.json()
        file_b64 = data.get("file_b64", "")
        filename = data.get("filename", "input.mp4")
        src_lang = data.get("src_lang", "auto")
        tgt_lang = data.get("tgt_lang", "en")
        voice_id = data.get("voice_id", "default")
        if not file_b64:
            return JSONResponse({"error": "No file provided"}, status_code=400)
        if not GROQ_KEY:
            return JSONResponse({"error": "GROQ_API_KEY not set"}, status_code=500)
        tmpdir = tempfile.mkdtemp()
        try:
            # Step 1 — Save uploaded file
            # The filename is client-controlled: keep only its last path
            # component so it cannot point outside tmpdir, and prefix it so it
            # can never be ".." or collide with the extracted "audio.wav".
            safe_name = os.path.basename(str(filename).replace("\\", "/")) or "input.mp4"
            input_path = os.path.join(tmpdir, "input_" + safe_name)
            with open(input_path, "wb") as f:
                f.write(base64.b64decode(file_b64))
            # Step 2 — Extract audio as WAV
            audio_path = os.path.join(tmpdir, "audio.wav")
            subprocess.run([
                "ffmpeg", "-i", input_path,
                "-ar", "16000", "-ac", "1",
                "-y", audio_path
            ], check=True, capture_output=True)
            # Step 3 — Transcribe with Whisper via Groq
            transcribe_form = {
                "model": "whisper-large-v3",
                "response_format": "verbose_json"
            }
            # Only send a language hint when one was chosen; a None value
            # would otherwise go out as an empty form field.
            if src_lang and src_lang != "auto":
                transcribe_form["language"] = src_lang
            async with httpx.AsyncClient(timeout=120.0) as client:
                # Use Groq Whisper for transcription
                with open(audio_path, "rb") as af:
                    trans_res = await client.post(
                        "https://api.groq.com/openai/v1/audio/transcriptions",
                        headers={"Authorization": f"Bearer {GROQ_KEY}"},
                        # The payload is the extracted WAV, so name it as such
                        # instead of reusing the original upload's name.
                        files={"file": ("audio.wav", af, "audio/wav")},
                        data=transcribe_form
                    )
                trans_res.raise_for_status()
                transcript_data = trans_res.json()
                full_text = transcript_data.get("text", "")
                if not full_text:
                    return JSONResponse({"error": "Could not transcribe audio"}, status_code=500)
                # Step 4 — Translate via Groq LLM
                lang_names = {
                    "en": "English", "hi": "Hindi", "es": "Spanish",
                    "fr": "French", "de": "German", "ja": "Japanese", "zh": "Chinese"
                }
                tgt_name = lang_names.get(tgt_lang, tgt_lang)
                trans_response = await client.post(
                    "https://api.groq.com/openai/v1/chat/completions",
                    headers={
                        "Authorization": f"Bearer {GROQ_KEY}",
                        "Content-Type": "application/json"
                    },
                    json={
                        "model": "llama-3.3-70b-versatile",
                        "messages": [
                            {
                                "role": "system",
                                "content": f"Translate the following text to {tgt_name}. Output ONLY the translated text. No explanations."
                            },
                            {"role": "user", "content": full_text}
                        ],
                        "max_tokens": 2000
                    }
                )
                trans_response.raise_for_status()
                translated_text = trans_response.json()["choices"][0]["message"]["content"].strip()
            # Step 5 — Synthesize translated text with Chatterbox
            meta = load_voices_meta()
            voice_info = meta.get(voice_id)
            if voice_info and voice_info.get("filename"):
                prompt_path = str(VOICES_DIR / voice_info["filename"])
            else:
                prompt_path = voice_sample
            lang_code = get_language(translated_text)
            emotion = 0.5
            # Designed voices carry their own emotion setting in "params".
            if voice_info and voice_info.get("params"):
                emotion = float(voice_info["params"].get("emotion", 0.5))
            wav = model.generate(
                translated_text,
                audio_prompt_path=prompt_path,
                exaggeration=emotion,
                language=lang_code
            )
            # Step 6 — Return dubbed audio
            out = io.BytesIO()
            torchaudio.save(out, wav, model.sr, format="mp3")
            out.seek(0)
            return StreamingResponse(out, media_type="audio/mpeg")
        finally:
            # The MP3 is held in memory, so the working directory can go.
            shutil.rmtree(tmpdir, ignore_errors=True)
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
if __name__ == "__main__":
    # When executed as a script, serve the app on all interfaces, port 7860.
    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)