Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, Request, UploadFile, File, Form | |
| from fastapi.responses import StreamingResponse, JSONResponse, HTMLResponse, FileResponse | |
| import torch | |
| import torchaudio | |
| import io | |
| import uvicorn | |
| import os | |
| import base64 | |
| import json | |
| import subprocess | |
| import tempfile | |
| import shutil | |
| import httpx | |
| from pathlib import Path | |
| from chatterbox.tts import ChatterboxTTS | |
app = FastAPI()

# ============================================================
# STARTUP
# ============================================================
# Configuration is read from the environment exactly once, so the Hub login
# below and the push/pull helpers all see the same values.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_REPO_ID = os.getenv("HF_REPO_ID")  # e.g. abc1181/livekit-tts-chatterbox
GROQ_KEY = os.getenv("GROQ_API_KEY")
hf_token = HF_TOKEN  # alias kept for code that still uses the lowercase name

# On-disk layout of the voice library.
VOICE_SAMPLE_PATH = "voice_sample.wav"
VOICES_DIR = Path("voices")
VOICES_DIR.mkdir(exist_ok=True)
VOICES_META = VOICES_DIR / "meta.json"

print("Loading Chatterbox Multilingual...")
device = "cuda" if torch.cuda.is_available() else "cpu"

# Log in to the Hugging Face Hub when a token is configured
# (original note: "Set HF token for faster downloads").
if HF_TOKEN:
    from huggingface_hub import login
    login(token=HF_TOKEN)

model = ChatterboxTTS.from_pretrained(device=device)
print(f"✅ Chatterbox loaded on {device}")

# Path of the bundled voice sample when it exists on disk, otherwise None.
voice_sample = VOICE_SAMPLE_PATH if os.path.exists(VOICE_SAMPLE_PATH) else None
def has_devanagari(text: str) -> bool:
    """Return True when *text* contains a character in U+0900..U+097F."""
    for ch in text:
        if '\u0900' <= ch <= '\u097F':
            return True
    return False
def get_language(text: str) -> str:
    """Return "hi" when *text* contains Devanagari characters, else "en"."""
    if has_devanagari(text):
        return "hi"
    return "en"
def load_voices_meta() -> dict:
    """Load the voice-library metadata stored in ``VOICES_META``.

    Returns:
        The decoded JSON object, or an empty dict when the file is missing,
        is not valid JSON, or does not hold a JSON object.
    """
    try:
        raw = VOICES_META.read_text(encoding="utf-8")
    except FileNotFoundError:
        # Nothing has been saved yet.
        return {}
    try:
        meta = json.loads(raw)
    except json.JSONDecodeError as e:
        # A truncated or hand-edited file must not make every caller raise.
        print(f"Voices meta unreadable, starting empty: {e}")
        return {}
    # Callers are promised a dict; any other JSON value is treated as empty.
    return meta if isinstance(meta, dict) else {}
def save_voices_meta(meta: dict) -> None:
    """Persist *meta* to ``VOICES_META`` as indented JSON.

    The JSON is first written to a sibling ``.tmp`` file which then replaces
    the real file, so a failure part-way through the write does not clobber
    the existing metadata.
    """
    payload = json.dumps(meta, indent=2)
    tmp_path = VOICES_META.with_name(VOICES_META.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    tmp_path.replace(VOICES_META)
def push_to_hf(local_path: str, repo_path: str):
    """Upload *local_path* to the HF Space repo as ``voices/<repo_path>``.

    Returns True when the upload call completes. Returns False when HF_TOKEN
    or HF_REPO_ID is unset, or when the upload raises (the error is printed).
    """
    if not (HF_TOKEN and HF_REPO_ID):
        return False
    try:
        # The import sits inside the try, so an import failure is also
        # reported as a failed push rather than raised.
        from huggingface_hub import HfApi

        HfApi().upload_file(
            path_or_fileobj=local_path,
            path_in_repo=f"voices/{repo_path}",
            repo_id=HF_REPO_ID,
            repo_type="space",
            token=HF_TOKEN,
        )
    except Exception as err:
        print(f"HF push failed: {err}")
        return False
    return True
def pull_from_hf():
    """Download ``voices/*.wav`` files missing locally from the HF Space repo.

    Does nothing when HF_TOKEN or HF_REPO_ID is unset. Files whose name
    already exists in VOICES_DIR are skipped. Errors are printed, never
    raised, and a failed download of one file does not stop the remaining
    files from being pulled.
    """
    if not HF_TOKEN or not HF_REPO_ID:
        return
    try:
        from huggingface_hub import HfApi
        api = HfApi()
        files = api.list_repo_files(repo_id=HF_REPO_ID, repo_type="space", token=HF_TOKEN)
    except Exception as e:
        print(f"HF pull failed: {e}")
        return

    # NOTE(review): only .wav files are fetched here; voices/meta.json is not
    # pulled by this function - confirm the metadata is restored elsewhere.
    for f in files:
        if not (f.startswith("voices/") and f.endswith(".wav")):
            continue
        if (VOICES_DIR / Path(f).name).exists():
            continue
        try:
            # With local_dir=".", the repo-relative path is kept, so
            # "voices/x.wav" is written to ./voices/x.wav.
            api.hf_hub_download(
                repo_id=HF_REPO_ID,
                filename=f,
                repo_type="space",
                token=HF_TOKEN,
                local_dir=".",
            )
        except Exception as e:
            # Isolate the failure so the rest of the library still downloads.
            print(f"HF pull failed for {f}: {e}")
# Pull voices from HF on startup: runs once at import time and fetches any
# voices/*.wav files missing locally (a no-op when HF_TOKEN or HF_REPO_ID is unset).
pull_from_hf()
| # ============================================================ | |
| # UI | |
| # ============================================================ | |
| async def ui(): | |
| return HTMLResponse(content=""" | |
| <!DOCTYPE html> | |
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8"/> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"/> | |
| <title>Cortana TTS Studio</title> | |
| <style> | |
| * { margin:0; padding:0; box-sizing:border-box; } | |
| body { | |
| background:#080808; | |
| color:#fff; | |
| font-family:'Inter',-apple-system,sans-serif; | |
| min-height:100vh; | |
| } | |
| .sidebar { | |
| position:fixed; | |
| left:0; top:0; bottom:0; | |
| width:220px; | |
| background:#0d0d0d; | |
| border-right:1px solid #1a1a1a; | |
| display:flex; | |
| flex-direction:column; | |
| padding:24px 0; | |
| z-index:100; | |
| } | |
| .logo { | |
| padding:0 20px 24px; | |
| border-bottom:1px solid #1a1a1a; | |
| } | |
| .logo h1 { | |
| font-size:1.1rem; | |
| font-weight:700; | |
| letter-spacing:0.2em; | |
| text-transform:uppercase; | |
| background:linear-gradient(135deg,#c0c0c0,#fff); | |
| -webkit-background-clip:text; | |
| -webkit-text-fill-color:transparent; | |
| } | |
| .logo p { font-size:0.7rem; color:#444; margin-top:4px; } | |
| .nav { padding:16px 12px; flex:1; } | |
| .nav-item { | |
| display:flex; | |
| align-items:center; | |
| gap:10px; | |
| padding:10px 12px; | |
| border-radius:8px; | |
| cursor:pointer; | |
| font-size:0.85rem; | |
| color:#555; | |
| transition:all 0.2s; | |
| margin-bottom:2px; | |
| } | |
| .nav-item:hover { background:#151515; color:#888; } | |
| .nav-item.active { background:#151515; color:#fff; } | |
| .nav-icon { font-size:1rem; width:20px; text-align:center; } | |
| .main { | |
| margin-left:220px; | |
| padding:32px; | |
| min-height:100vh; | |
| } | |
| .page { display:none; } | |
| .page.active { display:block; } | |
| .page-title { | |
| font-size:1.4rem; | |
| font-weight:700; | |
| margin-bottom:8px; | |
| } | |
| .page-sub { | |
| color:#444; | |
| font-size:0.85rem; | |
| margin-bottom:28px; | |
| } | |
| .card { | |
| background:#111; | |
| border:1px solid #1a1a1a; | |
| border-radius:14px; | |
| padding:22px; | |
| margin-bottom:20px; | |
| } | |
| .card-title { | |
| font-size:0.7rem; | |
| font-weight:600; | |
| letter-spacing:0.15em; | |
| text-transform:uppercase; | |
| color:#444; | |
| margin-bottom:16px; | |
| display:flex; | |
| align-items:center; | |
| gap:8px; | |
| } | |
| .card-title::before { | |
| content:''; | |
| display:block; | |
| width:3px; height:12px; | |
| background:#c0c0c0; | |
| border-radius:2px; | |
| } | |
| textarea { | |
| width:100%; | |
| background:#0a0a0a; | |
| border:1px solid #1f1f1f; | |
| border-radius:10px; | |
| color:#fff; | |
| font-size:0.95rem; | |
| padding:14px; | |
| resize:vertical; | |
| min-height:130px; | |
| outline:none; | |
| font-family:inherit; | |
| line-height:1.6; | |
| transition:border-color 0.2s; | |
| } | |
| textarea:focus { border-color:#333; } | |
| textarea::placeholder { color:#2a2a2a; } | |
| input[type="text"] { | |
| width:100%; | |
| background:#0a0a0a; | |
| border:1px solid #1f1f1f; | |
| border-radius:8px; | |
| color:#fff; | |
| font-size:0.9rem; | |
| padding:10px 14px; | |
| outline:none; | |
| font-family:inherit; | |
| transition:border-color 0.2s; | |
| } | |
| input[type="text"]:focus { border-color:#333; } | |
| input[type="text"]::placeholder { color:#2a2a2a; } | |
| .grid2 { display:grid; grid-template-columns:1fr 1fr; gap:16px; } | |
| .grid3 { display:grid; grid-template-columns:1fr 1fr 1fr; gap:16px; } | |
| .control-label { | |
| font-size:0.72rem; | |
| color:#444; | |
| text-transform:uppercase; | |
| letter-spacing:0.1em; | |
| margin-bottom:8px; | |
| display:block; | |
| } | |
| .slider-row { | |
| display:flex; | |
| align-items:center; | |
| gap:12px; | |
| } | |
| input[type="range"] { | |
| flex:1; | |
| -webkit-appearance:none; | |
| height:4px; | |
| background:#1f1f1f; | |
| border-radius:2px; | |
| outline:none; | |
| } | |
| input[type="range"]::-webkit-slider-thumb { | |
| -webkit-appearance:none; | |
| width:14px; height:14px; | |
| border-radius:50%; | |
| background:#c0c0c0; | |
| cursor:pointer; | |
| } | |
| .slider-val { font-size:0.8rem; color:#666; width:32px; text-align:right; } | |
| .btn { | |
| background:#151515; | |
| border:1px solid #222; | |
| border-radius:8px; | |
| color:#fff; | |
| font-size:0.9rem; | |
| font-weight:600; | |
| padding:11px 20px; | |
| cursor:pointer; | |
| transition:all 0.2s; | |
| display:inline-flex; | |
| align-items:center; | |
| gap:8px; | |
| } | |
| .btn:hover { border-color:#444; } | |
| .btn:disabled { opacity:0.3; cursor:not-allowed; } | |
| .btn-primary { | |
| width:100%; | |
| justify-content:center; | |
| padding:13px; | |
| margin-top:14px; | |
| font-size:0.95rem; | |
| } | |
| .btn-primary:hover { box-shadow:0 0 20px rgba(192,192,192,0.08); } | |
| .btn-sm { padding:7px 14px; font-size:0.8rem; } | |
| .btn-danger { border-color:#ef444433; color:#ef4444; } | |
| .btn-danger:hover { border-color:#ef4444; background:rgba(239,68,68,0.1); } | |
| .spinner { | |
| width:16px; height:16px; | |
| border:2px solid #333; | |
| border-top-color:#c0c0c0; | |
| border-radius:50%; | |
| animation:spin 0.8s linear infinite; | |
| display:none; | |
| } | |
| .btn.loading .spinner { display:block; } | |
| .btn.loading .btn-label { display:none; } | |
| @keyframes spin { to { transform:rotate(360deg); } } | |
| .audio-player { | |
| display:none; | |
| background:#0a0a0a; | |
| border:1px solid #1a1a1a; | |
| border-radius:10px; | |
| padding:14px; | |
| margin-top:14px; | |
| } | |
| .audio-player.visible { display:block; } | |
| .audio-player audio { width:100%; filter:invert(0.9); } | |
| .dl-btn { | |
| display:inline-flex; | |
| align-items:center; | |
| gap:6px; | |
| margin-top:10px; | |
| background:#151515; | |
| border:1px solid #222; | |
| border-radius:7px; | |
| color:#666; | |
| font-size:0.78rem; | |
| padding:7px 12px; | |
| cursor:pointer; | |
| text-decoration:none; | |
| transition:all 0.2s; | |
| } | |
| .dl-btn:hover { color:#fff; border-color:#444; } | |
| .lang-row { | |
| display:flex; | |
| align-items:center; | |
| gap:10px; | |
| margin-top:8px; | |
| } | |
| .lang-badge { | |
| background:#151515; | |
| border:1px solid #222; | |
| border-radius:999px; | |
| padding:3px 12px; | |
| font-size:0.75rem; | |
| color:#666; | |
| } | |
| .upload-area { | |
| border:1px dashed #1f1f1f; | |
| border-radius:10px; | |
| padding:22px; | |
| text-align:center; | |
| cursor:pointer; | |
| transition:all 0.2s; | |
| background:#0a0a0a; | |
| position:relative; | |
| } | |
| .upload-area:hover { border-color:#333; } | |
| .upload-area.has-file { border-color:#22c55e; border-style:solid; } | |
| .upload-area input[type="file"] { | |
| position:absolute; inset:0; | |
| opacity:0; cursor:pointer; | |
| width:100%; height:100%; | |
| } | |
| .upload-icon { font-size:1.5rem; margin-bottom:6px; } | |
| .upload-label { font-size:0.85rem; color:#444; } | |
| .upload-sub { font-size:0.75rem; color:#2a2a2a; margin-top:4px; } | |
| .file-ok { font-size:0.8rem; color:#22c55e; margin-top:6px; } | |
| .history-list { display:flex; flex-direction:column; gap:8px; } | |
| .hist-item { | |
| background:#0a0a0a; | |
| border:1px solid #151515; | |
| border-radius:9px; | |
| padding:11px 14px; | |
| display:flex; | |
| align-items:center; | |
| gap:12px; | |
| } | |
| .hist-text { flex:1; font-size:0.82rem; color:#555; white-space:nowrap; overflow:hidden; text-overflow:ellipsis; } | |
| .hist-lang { | |
| font-size:0.7rem; color:#333; | |
| background:#111; border:1px solid #1a1a1a; | |
| border-radius:999px; padding:2px 8px; | |
| } | |
| .hist-play { | |
| width:30px; height:30px; | |
| border-radius:50%; | |
| background:#151515; border:1px solid #222; | |
| color:#666; font-size:0.75rem; | |
| cursor:pointer; display:flex; align-items:center; justify-content:center; | |
| transition:all 0.2s; flex-shrink:0; | |
| } | |
| .hist-play:hover { border-color:#c0c0c0; color:#fff; } | |
| .voice-grid { | |
| display:grid; | |
| grid-template-columns:repeat(auto-fill,minmax(180px,1fr)); | |
| gap:12px; | |
| } | |
| .voice-card { | |
| background:#0a0a0a; | |
| border:1px solid #1a1a1a; | |
| border-radius:10px; | |
| padding:14px; | |
| cursor:pointer; | |
| transition:all 0.2s; | |
| position:relative; | |
| } | |
| .voice-card:hover { border-color:#333; } | |
| .voice-card.selected { border-color:#c0c0c0; } | |
| .voice-avatar { | |
| width:44px; height:44px; | |
| border-radius:50%; | |
| background:#1a1a1a; | |
| display:flex; align-items:center; justify-content:center; | |
| font-size:1.2rem; | |
| margin-bottom:10px; | |
| } | |
| .voice-name { font-size:0.85rem; font-weight:600; color:#ccc; } | |
| .voice-lang { font-size:0.72rem; color:#444; margin-top:3px; } | |
| .voice-actions { | |
| display:flex; gap:6px; margin-top:10px; | |
| } | |
| .empty-state { | |
| text-align:center; | |
| padding:40px 20px; | |
| color:#2a2a2a; | |
| font-size:0.85rem; | |
| } | |
| .empty-icon { font-size:2rem; margin-bottom:10px; } | |
| .tag { | |
| display:inline-block; | |
| background:#151515; border:1px solid #1f1f1f; | |
| border-radius:6px; padding:4px 10px; | |
| font-size:0.75rem; color:#555; | |
| margin:3px; | |
| } | |
| .progress-bar { | |
| height:4px; background:#1a1a1a; | |
| border-radius:2px; overflow:hidden; | |
| margin-top:10px; display:none; | |
| } | |
| .progress-bar.visible { display:block; } | |
| .progress-fill { | |
| height:100%; background:#c0c0c0; | |
| border-radius:2px; | |
| transition:width 0.3s; | |
| } | |
| .status-log { | |
| background:#0a0a0a; border:1px solid #1a1a1a; | |
| border-radius:8px; padding:12px; | |
| font-family:monospace; font-size:0.78rem; | |
| color:#444; max-height:120px; | |
| overflow-y:auto; margin-top:12px; | |
| display:none; | |
| } | |
| .status-log.visible { display:block; } | |
| .status-line { margin-bottom:4px; } | |
| .status-line.ok { color:#22c55e; } | |
| .status-line.err { color:#ef4444; } | |
| .status-line.info { color:#666; } | |
| .error-msg { | |
| background:rgba(239,68,68,0.08); | |
| border:1px solid rgba(239,68,68,0.15); | |
| border-radius:7px; padding:9px 13px; | |
| font-size:0.82rem; color:#ef4444; | |
| display:none; margin-top:10px; | |
| } | |
| .error-msg.visible { display:block; } | |
| .divider { height:1px; background:#141414; margin:16px 0; } | |
| .row { display:flex; align-items:center; gap:12px; } | |
| select { | |
| background:#0a0a0a; border:1px solid #1f1f1f; | |
| border-radius:8px; color:#888; | |
| padding:9px 12px; outline:none; | |
| font-family:inherit; font-size:0.85rem; | |
| cursor:pointer; width:100%; | |
| } | |
| select:focus { border-color:#333; } | |
| ::-webkit-scrollbar { width:4px; } | |
| ::-webkit-scrollbar-track { background:#0a0a0a; } | |
| ::-webkit-scrollbar-thumb { background:#1f1f1f; border-radius:2px; } | |
| .voice-design-result { | |
| background:#0a0a0a; border:1px solid #1a1a1a; | |
| border-radius:10px; padding:16px; | |
| margin-top:14px; display:none; | |
| } | |
| .voice-design-result.visible { display:block; } | |
| .param-row { | |
| display:flex; justify-content:space-between; | |
| align-items:center; margin-bottom:8px; | |
| } | |
| .param-key { font-size:0.78rem; color:#444; } | |
| .param-val { font-size:0.78rem; color:#888; font-family:monospace; } | |
| </style> | |
| </head> | |
| <body> | |
| <!-- SIDEBAR --> | |
| <div class="sidebar"> | |
| <div class="logo"> | |
| <h1>Cortana TTS</h1> | |
| <p>Studio</p> | |
| </div> | |
| <div class="nav"> | |
| <div class="nav-item active" onclick="showPage('tts')"> | |
| <span class="nav-icon">▶</span> Text to Speech | |
| </div> | |
| <div class="nav-item" onclick="showPage('library')"> | |
| <span class="nav-icon">🎙</span> Voice Library | |
| </div> | |
| <div class="nav-item" onclick="showPage('design')"> | |
| <span class="nav-icon">✨</span> Voice Design | |
| </div> | |
| <div class="nav-item" onclick="showPage('dubbing')"> | |
| <span class="nav-icon">🎬</span> Dubbing | |
| </div> | |
| <div class="nav-item" onclick="showPage('api')"> | |
| <span class="nav-icon">⚡</span> API | |
| </div> | |
| </div> | |
| </div> | |
| <!-- MAIN --> | |
| <div class="main"> | |
| <!-- PAGE: TTS --> | |
| <div class="page active" id="page-tts"> | |
| <div class="page-title">Text to Speech</div> | |
| <div class="page-sub">Generate natural speech in English, Hindi or Hinglish</div> | |
| <div class="card"> | |
| <div class="card-title">Text Input</div> | |
| <textarea id="ttsText" | |
| placeholder="Type in English, Hindi or Hinglish... नमस्ते, मैं कोर्टाना हूं। Arey yaar, kya scene hai?"></textarea> | |
| <div class="lang-row"> | |
| Detected: <span class="lang-badge" id="langBadge">English</span> | |
| <span style="color:#2a2a2a;font-size:0.75rem;">Ctrl+Enter to generate</span> | |
| </div> | |
| </div> | |
| <div class="card"> | |
| <div class="card-title">Voice</div> | |
| <div class="grid2"> | |
| <div> | |
| <span class="control-label">Active Voice</span> | |
| <select id="voiceSelect" onchange="voiceSelectChanged()"> | |
| <option value="default">Default Voice</option> | |
| </select> | |
| </div> | |
| <div> | |
| <span class="control-label">Upload One-time Sample</span> | |
| <div class="upload-area" id="ttsUpload" style="padding:12px;"> | |
| <input type="file" id="ttsVoiceFile" accept=".wav,.mp3" onchange="ttsFileSelected()"/> | |
| <div style="font-size:0.82rem;color:#333;">Drop WAV/MP3 here</div> | |
| <div class="file-ok" id="ttsFileName"></div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="card"> | |
| <div class="card-title">Parameters</div> | |
| <div class="grid2"> | |
| <div> | |
| <span class="control-label">Emotion / Expressiveness</span> | |
| <div class="slider-row"> | |
| <input type="range" id="ttsEmotion" min="0" max="1" step="0.05" value="0.5"/> | |
| <span class="slider-val" id="ttsEmotionVal">0.5</span> | |
| </div> | |
| </div> | |
| <div> | |
| <span class="control-label">Speed</span> | |
| <div class="slider-row"> | |
| <input type="range" id="ttsSpeed" min="0.5" max="2.0" step="0.1" value="1.0"/> | |
| <span class="slider-val" id="ttsSpeedVal">1.0</span> | |
| </div> | |
| </div> | |
| </div> | |
| <button class="btn btn-primary" id="ttsBtn" onclick="generateTTS()"> | |
| <div class="spinner"></div> | |
| <span class="btn-label">▶ Generate Speech</span> | |
| </button> | |
| <div class="error-msg" id="ttsError"></div> | |
| <div class="audio-player" id="ttsPlayer"> | |
| <audio id="ttsAudio" controls></audio><br/> | |
| <a class="dl-btn" id="ttsDL" download="cortana.mp3">↓ Download MP3</a> | |
| <button class="btn btn-sm" style="margin-left:8px;margin-top:10px;" | |
| onclick="saveToHistory()">+ History</button> | |
| </div> | |
| </div> | |
| <div class="card"> | |
| <div class="card-title">Generation History</div> | |
| <div class="history-list" id="histList"> | |
| <div class="empty-state"> | |
| <div class="empty-icon">🎵</div> | |
| Your generations appear here | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- PAGE: VOICE LIBRARY --> | |
| <div class="page" id="page-library"> | |
| <div class="page-title">Voice Library</div> | |
| <div class="page-sub">Save, manage and reuse cloned voices permanently</div> | |
| <div class="card"> | |
| <div class="card-title">Add New Voice</div> | |
| <div class="grid2"> | |
| <div> | |
| <span class="control-label">Voice Name</span> | |
| <input type="text" id="newVoiceName" placeholder="e.g. Cortana English, Raj Hindi..."/> | |
| </div> | |
| <div> | |
| <span class="control-label">Language Tag</span> | |
| <select id="newVoiceLang"> | |
| <option value="en">English</option> | |
| <option value="hi">Hindi</option> | |
| <option value="both">English + Hindi</option> | |
| </select> | |
| </div> | |
| </div> | |
| <div style="margin-top:14px;"> | |
| <span class="control-label">Voice Sample (5–30 seconds, clean audio)</span> | |
| <div class="upload-area" id="libUploadArea"> | |
| <input type="file" id="libVoiceFile" accept=".wav,.mp3" onchange="libFileSelected()"/> | |
| <div class="upload-icon">🎙</div> | |
| <div class="upload-label">Upload WAV or MP3</div> | |
| <div class="upload-sub">No background music — clear speech only</div> | |
| <div class="file-ok" id="libFileName"></div> | |
| </div> | |
| </div> | |
| <button class="btn btn-primary" id="saveVoiceBtn" onclick="saveVoice()"> | |
| <div class="spinner"></div> | |
| <span class="btn-label">💾 Save to Library</span> | |
| </button> | |
| <div class="error-msg" id="libError"></div> | |
| </div> | |
| <div class="card"> | |
| <div class="card-title">Saved Voices</div> | |
| <div class="voice-grid" id="voiceGrid"> | |
| <div class="empty-state" style="grid-column:1/-1;"> | |
| <div class="empty-icon">🎙</div> | |
| No voices saved yet — add one above | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- PAGE: VOICE DESIGN --> | |
| <div class="page" id="page-design"> | |
| <div class="page-title">Voice Design</div> | |
| <div class="page-sub">Describe a voice in plain words — AI generates the parameters</div> | |
| <div class="card"> | |
| <div class="card-title">Describe Your Voice</div> | |
| <textarea id="designPrompt" style="min-height:80px;" | |
| placeholder="e.g. Young Indian woman, warm and friendly, speaks at a medium pace with a hint of excitement e.g. Deep mature male voice, calm and authoritative, slightly slow e.g. Energetic young man, fast paced, very expressive"></textarea> | |
| <div style="margin-top:14px;"> | |
| <span class="control-label">Preview Text</span> | |
| <textarea id="designPreviewText" style="min-height:60px;" | |
| placeholder="Hello, I am Cortana. How can I assist you today?"></textarea> | |
| </div> | |
| <button class="btn btn-primary" id="designBtn" onclick="designVoice()"> | |
| <div class="spinner"></div> | |
| <span class="btn-label">✨ Design Voice</span> | |
| </button> | |
| <div class="error-msg" id="designError"></div> | |
| <div class="voice-design-result" id="designResult"> | |
| <div class="card-title">Generated Parameters</div> | |
| <div id="designParams"></div> | |
| <div class="divider"></div> | |
| <div class="audio-player visible" style="margin-top:0;"> | |
| <audio id="designAudio" controls></audio><br/> | |
| <a class="dl-btn" id="designDL" download="designed_voice.mp3">↓ Download</a> | |
| <button class="btn btn-sm" style="margin-left:8px;margin-top:10px;" | |
| onclick="saveDesignedVoice()">💾 Save to Library</button> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="card"> | |
| <div class="card-title">Example Prompts</div> | |
| <div> | |
| <span class="tag" onclick="setDesignPrompt(this)">Young Indian woman, warm and friendly</span> | |
| <span class="tag" onclick="setDesignPrompt(this)">Deep mature male, calm and authoritative</span> | |
| <span class="tag" onclick="setDesignPrompt(this)">Energetic teen, very expressive and fast</span> | |
| <span class="tag" onclick="setDesignPrompt(this)">Professional newsreader, neutral accent</span> | |
| <span class="tag" onclick="setDesignPrompt(this)">Soft spoken elderly woman, slow and gentle</span> | |
| <span class="tag" onclick="setDesignPrompt(this)">Excited sports commentator, loud and fast</span> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- PAGE: DUBBING --> | |
| <div class="page" id="page-dubbing"> | |
| <div class="page-title">Dubbing</div> | |
| <div class="page-sub">Translate and re-voice any video or audio file</div> | |
| <div class="card" style="border-color:#ef444422;"> | |
| <div style="font-size:0.82rem;color:#ef4444;margin-bottom:4px;">⚠️ Free CPU Warning</div> | |
| <div style="font-size:0.78rem;color:#555;"> | |
| Dubbing on free CPU takes 10–20 minutes per minute of video. | |
| Start with a short clip to test. Upgrade to GPU for faster processing. | |
| </div> | |
| </div> | |
| <div class="card"> | |
| <div class="card-title">Upload Media</div> | |
| <div class="upload-area" id="dubUploadArea"> | |
| <input type="file" id="dubFile" accept="video/*,audio/*" onchange="dubFileSelected()"/> | |
| <div class="upload-icon">🎬</div> | |
| <div class="upload-label">Upload video or audio file</div> | |
| <div class="upload-sub">MP4, MKV, AVI, MP3, WAV — max 100MB</div> | |
| <div class="file-ok" id="dubFileName"></div> | |
| </div> | |
| <div class="grid2" style="margin-top:16px;"> | |
| <div> | |
| <span class="control-label">Source Language</span> | |
| <select id="dubSrcLang"> | |
| <option value="auto">Auto Detect</option> | |
| <option value="en">English</option> | |
| <option value="hi">Hindi</option> | |
| <option value="es">Spanish</option> | |
| <option value="fr">French</option> | |
| <option value="de">German</option> | |
| <option value="ja">Japanese</option> | |
| <option value="zh">Chinese</option> | |
| </select> | |
| </div> | |
| <div> | |
| <span class="control-label">Target Language</span> | |
| <select id="dubTgtLang"> | |
| <option value="en">English</option> | |
| <option value="hi">Hindi</option> | |
| <option value="es">Spanish</option> | |
| <option value="fr">French</option> | |
| <option value="de">German</option> | |
| <option value="ja">Japanese</option> | |
| <option value="zh">Chinese</option> | |
| </select> | |
| </div> | |
| </div> | |
| <div style="margin-top:14px;"> | |
| <span class="control-label">Dubbing Voice (optional)</span> | |
| <select id="dubVoiceSelect"> | |
| <option value="default">Default Voice</option> | |
| </select> | |
| </div> | |
| <button class="btn btn-primary" id="dubBtn" onclick="startDubbing()"> | |
| <div class="spinner"></div> | |
| <span class="btn-label">🎬 Start Dubbing</span> | |
| </button> | |
| <div class="error-msg" id="dubError"></div> | |
| <div class="progress-bar" id="dubProgress"> | |
| <div class="progress-fill" id="dubProgressFill" style="width:0%"></div> | |
| </div> | |
| <div class="status-log" id="dubLog"></div> | |
| <div class="audio-player" id="dubPlayer"> | |
| <audio id="dubAudio" controls></audio><br/> | |
| <a class="dl-btn" id="dubDL" download="dubbed.mp3">↓ Download Dubbed Audio</a> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- PAGE: API --> | |
| <div class="page" id="page-api"> | |
| <div class="page-title">API Reference</div> | |
| <div class="page-sub">OpenAI-compatible endpoints — drop-in replacement</div> | |
| <div class="card"> | |
| <div class="card-title">Text to Speech</div> | |
| <div style="background:#0a0a0a;border:1px solid #1a1a1a;border-radius:8px;padding:14px;font-family:monospace;font-size:0.78rem;color:#555;line-height:1.7;"> | |
| POST /v1/audio/speech<br/><br/> | |
| {<br/> | |
| "input": "Hello I am Cortana",<br/> | |
| "emotion": 0.5, // 0.0 neutral → 1.0 expressive<br/> | |
| "speed": 1.0 // 0.5x to 2.0x<br/> | |
| }<br/><br/> | |
| Returns: audio/mpeg stream | |
| </div> | |
| <button class="btn btn-sm" style="margin-top:10px;" onclick="copyText(0)">Copy</button> | |
| </div> | |
| <div class="card"> | |
| <div class="card-title">Voice Cloning (on-the-fly)</div> | |
| <div style="background:#0a0a0a;border:1px solid #1a1a1a;border-radius:8px;padding:14px;font-family:monospace;font-size:0.78rem;color:#555;line-height:1.7;"> | |
| POST /v1/audio/speech/clone<br/><br/> | |
| {<br/> | |
| "input": "Hello I am Cortana",<br/> | |
| "voice_b64": "base64_encoded_wav",<br/> | |
| "emotion": 0.5<br/> | |
| }<br/><br/> | |
| Returns: audio/mpeg stream | |
| </div> | |
| <button class="btn btn-sm" style="margin-top:10px;" onclick="copyText(1)">Copy</button> | |
| </div> | |
| <div class="card"> | |
| <div class="card-title">List Voices</div> | |
| <div style="background:#0a0a0a;border:1px solid #1a1a1a;border-radius:8px;padding:14px;font-family:monospace;font-size:0.78rem;color:#555;line-height:1.7;"> | |
| GET /v1/voices<br/><br/> | |
| Returns: { "voices": [ { "id": "...", "name": "...", "lang": "..." } ] } | |
| </div> | |
| </div> | |
| <div class="card"> | |
| <div class="card-title">CURL Example</div> | |
| <div style="background:#0a0a0a;border:1px solid #1a1a1a;border-radius:8px;padding:14px;font-family:monospace;font-size:0.78rem;color:#555;line-height:1.7;" id="curlExample"> | |
| curl -X POST "https://YOUR_SPACE.hf.space/v1/audio/speech" \<br/> | |
| -H "Authorization: Bearer YOUR_HF_TOKEN" \<br/> | |
| -H "Content-Type: application/json" \<br/> | |
| -d '{"input": "Hello I am Cortana", "emotion": 0.5}' \<br/> | |
| --output speech.mp3 | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <script> | |
| const historyItems = []; | |
| let selectedVoiceId = 'default'; | |
| let designedParams = null; | |
| // ============ NAVIGATION ============ | |
| function showPage(id) { | |
| document.querySelectorAll('.page').forEach(p => p.classList.remove('active')); | |
| document.querySelectorAll('.nav-item').forEach(n => n.classList.remove('active')); | |
| document.getElementById('page-' + id).classList.add('active'); | |
| event.currentTarget.classList.add('active'); | |
| if (id === 'library') loadVoiceLibrary(); | |
| } | |
| // ============ LANGUAGE DETECTION ============ | |
| document.getElementById('ttsText').addEventListener('input', function() { | |
| const text = this.value; | |
| const badge = document.getElementById('langBadge'); | |
| if (/[\u0900-\u097F]/.test(text)) { | |
| badge.textContent = 'Hindi'; | |
| badge.style.color = '#f97316'; | |
| } else if (/\b(hai|hoon|kya|nahi|aur|toh|yaar|arey|bhi)\b/i.test(text)) { | |
| badge.textContent = 'Hinglish'; | |
| badge.style.color = '#a855f7'; | |
| } else { | |
| badge.textContent = 'English'; | |
| badge.style.color = '#666'; | |
| } | |
| }); | |
| // ============ SLIDERS ============ | |
| document.getElementById('ttsEmotion').addEventListener('input', function() { | |
| document.getElementById('ttsEmotionVal').textContent = this.value; | |
| }); | |
| document.getElementById('ttsSpeed').addEventListener('input', function() { | |
| document.getElementById('ttsSpeedVal').textContent = this.value; | |
| }); | |
| // ============ TTS ============ | |
| function ttsFileSelected() { | |
| const f = document.getElementById('ttsVoiceFile').files[0]; | |
| if (f) document.getElementById('ttsFileName').textContent = '✅ ' + f.name; | |
| } | |
| async function generateTTS() { | |
| const text = document.getElementById('ttsText').value.trim(); | |
| const emotion = document.getElementById('ttsEmotion').value; | |
| const speed = document.getElementById('ttsSpeed').value; | |
| const voiceFile = document.getElementById('ttsVoiceFile').files[0]; | |
| const btn = document.getElementById('ttsBtn'); | |
| const err = document.getElementById('ttsError'); | |
| if (!text) { showError(err, 'Please enter text first.'); return; } | |
| err.classList.remove('visible'); | |
| setLoading(btn, true); | |
| try { | |
| let res; | |
| if (voiceFile) { | |
| const b64 = await toB64(voiceFile); | |
| res = await fetch('/v1/audio/speech/clone', { | |
| method:'POST', | |
| headers:{'Content-Type':'application/json'}, | |
| body:JSON.stringify({ input:text, emotion:parseFloat(emotion), speed:parseFloat(speed), voice_b64:b64 }) | |
| }); | |
| } else if (selectedVoiceId !== 'default') { | |
| res = await fetch('/v1/audio/speech/voice/' + selectedVoiceId, { | |
| method:'POST', | |
| headers:{'Content-Type':'application/json'}, | |
| body:JSON.stringify({ input:text, emotion:parseFloat(emotion), speed:parseFloat(speed) }) | |
| }); | |
| } else { | |
| res = await fetch('/v1/audio/speech', { | |
| method:'POST', | |
| headers:{'Content-Type':'application/json'}, | |
| body:JSON.stringify({ input:text, emotion:parseFloat(emotion), speed:parseFloat(speed) }) | |
| }); | |
| } | |
| if (!res.ok) { const e = await res.json(); throw new Error(e.error); } | |
| const blob = await res.blob(); | |
| const url = URL.createObjectURL(blob); | |
| setAudio('ttsAudio', 'ttsDL', url); | |
| document.getElementById('ttsPlayer').classList.add('visible'); | |
| document.getElementById('ttsAudio').play(); | |
| // auto add to history | |
| historyItems.unshift({ text, url, lang: /[\u0900-\u097F]/.test(text) ? 'Hindi' : 'English' }); | |
| renderHistory(); | |
| } catch(e) { | |
| showError(err, e.message); | |
| } finally { | |
| setLoading(btn, false); | |
| } | |
| } | |
| function renderHistory() { | |
| const list = document.getElementById('histList'); | |
| if (!historyItems.length) return; | |
| list.innerHTML = historyItems.slice(0,10).map((h,i) => ` | |
| <div class="hist-item"> | |
| <div class="hist-text">${h.text}</div> | |
| <div class="hist-lang">${h.lang}</div> | |
| <button class="hist-play" onclick="new Audio('${h.url}').play()">▶</button> | |
| </div> | |
| `).join(''); | |
| } | |
// Ctrl+Enter inside the text box starts generation.
document.getElementById('ttsText').addEventListener('keydown', function (evt) {
  if (!evt.ctrlKey || evt.key !== 'Enter') return;
  generateTTS();
});
| // ============ VOICE LIBRARY ============ | |
/** Show the picked library sample's name and mark the upload area as filled. */
function libFileSelected() {
  const picked = document.getElementById('libVoiceFile').files[0];
  if (!picked) return;
  document.getElementById('libFileName').textContent = '✅ ' + picked.name;
  document.getElementById('libUploadArea').classList.add('has-file');
}
/**
 * Upload the named voice sample to POST /v1/voices, then reset the form and
 * reload the library. Validation and request errors are shown in #libError.
 */
async function saveVoice() {
  const nameInput = document.getElementById('newVoiceName');
  const fileInput = document.getElementById('libVoiceFile');
  const name = nameInput.value.trim();
  const lang = document.getElementById('newVoiceLang').value;
  const file = fileInput.files[0];
  const btn = document.getElementById('saveVoiceBtn');
  const err = document.getElementById('libError');

  if (!name) {
    showError(err, 'Please enter a voice name.');
    return;
  }
  if (!file) {
    showError(err, 'Please upload a voice sample.');
    return;
  }

  err.classList.remove('visible');
  setLoading(btn, true);
  try {
    const encoded = await toB64(file);
    const payload = { name, lang, voice_b64: encoded, filename: file.name };
    const res = await fetch('/v1/voices', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload)
    });
    if (!res.ok) {
      const failure = await res.json();
      throw new Error(failure.error);
    }
    // Clear the form only after the server accepted the voice.
    nameInput.value = '';
    fileInput.value = '';
    document.getElementById('libFileName').textContent = '';
    document.getElementById('libUploadArea').classList.remove('has-file');
    await loadVoiceLibrary();
  } catch (ex) {
    showError(err, ex.message);
  } finally {
    setLoading(btn, false);
  }
}
/**
 * Fetch GET /v1/voices, refresh both voice dropdowns and redraw the voice
 * grid. The card matching selectedVoiceId gets the 'selected' class.
 */
async function loadVoiceLibrary() {
  const res = await fetch('/v1/voices');
  const data = await res.json();
  const grid = document.getElementById('voiceGrid');
  const voices = data.voices || [];
  // Update dropdowns
  updateVoiceDropdowns(voices);
  if (!voices.length) {
    grid.innerHTML = '<div class="empty-state" style="grid-column:1/-1;"><div class="empty-icon">🎙</div>No voices saved yet</div>';
    return;
  }

  // Small element factory; text always goes through textContent so stored
  // voice names and languages can never inject markup.
  const make = (tag, className, text) => {
    const node = document.createElement(tag);
    node.className = className;
    if (text !== undefined) node.textContent = text;
    return node;
  };

  const cards = voices.map(v => {
    const card = make('div', 'voice-card' + (selectedVoiceId === v.id ? ' selected' : ''));
    card.addEventListener('click', () => selectVoice(v.id));

    const preview = make('button', 'btn btn-sm', '▶');
    preview.addEventListener('click', evt => {
      evt.stopPropagation(); // do not also select the card
      previewVoice(v.id);
    });

    const remove = make('button', 'btn btn-sm btn-danger', '✕');
    remove.addEventListener('click', evt => {
      evt.stopPropagation();
      deleteVoice(v.id);
    });

    const actions = make('div', 'voice-actions');
    actions.append(preview, remove);

    card.append(
      make('div', 'voice-avatar', '🎙'),
      make('div', 'voice-name', v.name),
      make('div', 'voice-lang', String(v.lang || '').toUpperCase()),
      actions
    );
    return card;
  });
  grid.replaceChildren(...cards);
}
/**
 * Rebuild the options of #voiceSelect and #dubVoiceSelect from the voice list.
 * Each dropdown keeps its current selection when that voice still exists;
 * otherwise it falls back to the first option, "Default Voice".
 */
function updateVoiceDropdowns(voices) {
  for (const selectId of ['voiceSelect', 'dubVoiceSelect']) {
    const select = document.getElementById(selectId);
    const previous = select.value;
    // new Option(text, value) treats the name as plain text, never as markup.
    const options = [new Option('Default Voice', 'default')];
    for (const v of voices) options.push(new Option(v.name, v.id));
    select.replaceChildren(...options);
    // Replacing the options resets the selection, so restore it explicitly.
    if (options.some(o => o.value === previous)) select.value = previous;
  }
}
/** Copy the TTS dropdown's current value into selectedVoiceId. */
function voiceSelectChanged() {
  const dropdown = document.getElementById('voiceSelect');
  selectedVoiceId = dropdown.value;
}
/** Make `id` the active voice, mirror it in the dropdown and redraw the grid. */
function selectVoice(id) {
  const dropdown = document.getElementById('voiceSelect');
  selectedVoiceId = id;
  dropdown.value = id;
  loadVoiceLibrary();
}
/**
 * Synthesize a fixed sample sentence with the given voice and play it.
 * A non-OK response is ignored silently.
 */
async function previewVoice(id) {
  const payload = { input: 'Hello, I am Cortana. How can I assist you?', emotion: 0.5 };
  const res = await fetch(`/v1/audio/speech/voice/${id}`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload)
  });
  if (res.ok) {
    const clipUrl = URL.createObjectURL(await res.blob());
    new Audio(clipUrl).play();
  }
}
/**
 * Ask for confirmation, delete the voice via DELETE /v1/voices/{id} and
 * reload the library. A failed delete is reported to the user, and the active
 * selection is reset when the deleted voice was the selected one.
 */
async function deleteVoice(id) {
  if (!confirm('Delete this voice?')) return;
  const res = await fetch('/v1/voices/' + id, { method: 'DELETE' });
  if (!res.ok) {
    alert('Could not delete this voice.');
  } else if (selectedVoiceId === id) {
    // Do not keep pointing at a voice that no longer exists.
    selectedVoiceId = 'default';
    document.getElementById('voiceSelect').value = 'default';
  }
  await loadVoiceLibrary();
}
| // ============ VOICE DESIGN ============ | |
/** Copy the clicked element's text into the voice-design prompt field. */
function setDesignPrompt(el) {
  const promptBox = document.getElementById('designPrompt');
  promptBox.value = el.textContent;
}
/**
 * Send the voice description to POST /v1/voices/design, list the returned
 * parameters, then synthesize and play a preview with them.
 * The parameters are kept in designedParams. Errors are shown in #designError.
 */
async function designVoice() {
  const prompt = document.getElementById('designPrompt').value.trim();
  const preview = document.getElementById('designPreviewText').value.trim() ||
    'Hello, I am Cortana. How can I assist you today?';
  const btn = document.getElementById('designBtn');
  const err = document.getElementById('designError');
  if (!prompt) { showError(err, 'Please describe the voice first.'); return; }
  err.classList.remove('visible');
  setLoading(btn, true);
  try {
    const res = await fetch('/v1/voices/design', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt, preview_text: preview })
    });
    if (!res.ok) { const e = await res.json(); throw new Error(e.error); }
    const data = await res.json();
    designedParams = data.params;

    // Show params. Keys and values come from an LLM, so they are inserted as
    // text (textContent), never as HTML.
    const rows = Object.entries(data.params).map(([k, v]) => {
      const row = document.createElement('div');
      row.className = 'param-row';
      const key = document.createElement('span');
      key.className = 'param-key';
      key.textContent = k;
      const val = document.createElement('span');
      val.className = 'param-val';
      val.textContent = String(v);
      row.append(key, val);
      return row;
    });
    document.getElementById('designParams').replaceChildren(...rows);

    // Generate preview audio
    const audioRes = await fetch('/v1/audio/speech', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ input: preview, ...data.params })
    });
    // Without this check a JSON error body would be handed to the player.
    if (!audioRes.ok) { const e = await audioRes.json(); throw new Error(e.error); }
    const blob = await audioRes.blob();
    const url = URL.createObjectURL(blob);
    setAudio('designAudio', 'designDL', url);
    document.getElementById('designResult').classList.add('visible');
    document.getElementById('designAudio').play();
  } catch (e) {
    showError(err, e.message);
  } finally {
    setLoading(btn, false);
  }
}
/**
 * Ask for a name and store the last designed parameters via
 * POST /v1/voices/design/save. Success is reported only when the server
 * accepted the request; the library is then reloaded so the new voice shows up.
 */
async function saveDesignedVoice() {
  const name = prompt('Enter a name for this voice:');
  if (!name || !designedParams) return;
  try {
    const res = await fetch('/v1/voices/design/save', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ name, params: designedParams })
    });
    if (!res.ok) { const e = await res.json(); throw new Error(e.error); }
  } catch (e) {
    alert('Could not save voice: ' + e.message);
    return;
  }
  alert('Voice saved to library!');
  await loadVoiceLibrary();
}
| // ============ DUBBING ============ | |
/** Show the picked dubbing file's name and mark the upload area as filled. */
function dubFileSelected() {
  const picked = document.getElementById('dubFile').files[0];
  if (!picked) return;
  document.getElementById('dubFileName').textContent = '✅ ' + picked.name;
  document.getElementById('dubUploadArea').classList.add('has-file');
}
/**
 * Upload the selected media file to POST /v1/dubbing and load the returned
 * audio into the dubbing player. Progress is shown through the status log and
 * the progress bar; errors go to both #dubError and the log.
 */
async function startDubbing() {
  const byId = id => document.getElementById(id);
  const file = byId('dubFile').files[0];
  const srcLang = byId('dubSrcLang').value;
  const tgtLang = byId('dubTgtLang').value;
  const voiceId = byId('dubVoiceSelect').value;
  const btn = byId('dubBtn');
  const err = byId('dubError');
  const log = byId('dubLog');
  const prog = byId('dubProgress');
  const fill = byId('dubProgressFill');
  const setProgress = pct => { fill.style.width = pct + '%'; };

  if (!file) {
    showError(err, 'Please upload a video or audio file.');
    return;
  }

  err.classList.remove('visible');
  setLoading(btn, true);
  log.classList.add('visible');
  prog.classList.add('visible');
  log.innerHTML = '<div class="status-line info">Starting dubbing pipeline...</div>';
  setProgress(5);
  try {
    const encoded = await toB64(file);
    setProgress(10);
    addLog(log, 'Uploading file...', 'info');
    const payload = {
      file_b64: encoded,
      filename: file.name,
      src_lang: srcLang,
      tgt_lang: tgtLang,
      voice_id: voiceId
    };
    const res = await fetch('/v1/dubbing', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload)
    });
    setProgress(50);
    addLog(log, 'Processing...', 'info');
    if (!res.ok) {
      const failure = await res.json();
      throw new Error(failure.error);
    }
    setProgress(90);
    addLog(log, 'Finalizing audio...', 'info');
    const dubbedUrl = URL.createObjectURL(await res.blob());
    setAudio('dubAudio', 'dubDL', dubbedUrl);
    byId('dubPlayer').classList.add('visible');
    setProgress(100);
    addLog(log, '✅ Dubbing complete!', 'ok');
  } catch (ex) {
    showError(err, ex.message);
    addLog(log, '❌ ' + ex.message, 'err');
  } finally {
    setLoading(btn, false);
  }
}
| // ============ HELPERS ============ | |
/**
 * Read a File and resolve with its contents as a base64 string (the data URL
 * without its "data:...;base64," prefix). Rejects when the file cannot be
 * read, so callers' catch/finally blocks run instead of waiting forever.
 */
function toB64(file) {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    // An empty file yields a data URL with no payload part; resolve with ''.
    reader.onload = () => resolve(String(reader.result).split(',')[1] || '');
    reader.onerror = () => reject(reader.error || new Error('Could not read file'));
    reader.readAsDataURL(file);
  });
}
/** Point both the audio player and its download link at the same URL. */
function setAudio(audioId, dlId, url) {
  const player = document.getElementById(audioId);
  player.src = url;
  const downloadLink = document.getElementById(dlId);
  downloadLink.href = url;
}
/** Put a warning-prefixed message into `el` and make it visible. */
function showError(el, msg) {
  const warning = '⚠ ' + msg;
  el.textContent = warning;
  el.classList.add('visible');
}
/** Enable or disable a button and switch its 'loading' class to match. */
function setLoading(btn, state) {
  btn.classList.toggle('loading', state);
  btn.disabled = state;
}
/**
 * Append one status line to the log element and scroll it into view.
 * `type` becomes a CSS class next to 'status-line'.
 */
function addLog(el, msg, type) {
  const line = document.createElement('div');
  line.className = 'status-line ' + type;
  // textContent: messages may carry server error text and must not be parsed
  // as HTML. Appending a node also avoids re-parsing the whole log.
  line.textContent = msg;
  el.appendChild(line);
  el.scrollTop = el.scrollHeight;
}
/** Copy the text of the idx-th monospace box on the API page to the clipboard. */
function copyText(idx) {
  const snippet = document.querySelectorAll('#page-api [style*="monospace"]')[idx];
  navigator.clipboard.writeText(snippet.innerText);
}
// Load voice library on start. A failed request is logged instead of
// surfacing as an unhandled promise rejection.
loadVoiceLibrary().catch(e => console.error('Failed to load voice library:', e));
| </script> | |
| </body> | |
| </html> | |
| """) | |
| # ============================================================ | |
| # API ENDPOINTS | |
| # ============================================================ | |
async def v1_root():
    """Return a static payload with the service status and name."""
    payload = {"status": "ok"}
    payload["service"] = "chatterbox-multilingual-tts"
    return payload
async def tts(request: Request):
    """Synthesize the request's ``input`` text with the default voice.

    Reads a JSON body with ``input`` (text) and optional ``emotion``
    (default 0.5, passed to the model as ``exaggeration``).

    Returns:
        An MP3 ``StreamingResponse`` on success, a 400 JSON error when the
        text is empty, or a 500 JSON error carrying the exception message.
    """
    try:
        payload = await request.json()
        text = payload.get("input", "")
        exaggeration = float(payload.get("emotion", 0.5))
        if not text:
            return JSONResponse({"error": "No input text"}, status_code=400)

        wav = model.generate(
            text,
            audio_prompt_path=voice_sample,
            exaggeration=exaggeration,
            language=get_language(text),
        )

        # Encode into an in-memory buffer and rewind it for streaming.
        buffer = io.BytesIO()
        torchaudio.save(buffer, wav, model.sr, format="mp3")
        buffer.seek(0)
        return StreamingResponse(buffer, media_type="audio/mpeg")
    except Exception as exc:
        return JSONResponse({"error": str(exc)}, status_code=500)
async def tts_clone(request: Request):
    """Synthesize text using a voice sample uploaded with the request.

    Reads a JSON body with ``input`` (text), optional ``emotion``
    (default 0.5) and optional ``voice_b64`` (base64 audio used as the voice
    prompt). Without ``voice_b64`` the default voice sample is used.

    The uploaded sample is written to a unique temporary file that is removed
    after synthesis, so concurrent requests cannot overwrite each other's
    sample and no files pile up in the temp directory.

    Returns:
        An MP3 ``StreamingResponse`` on success, a 400 JSON error for empty
        text or malformed base64, or a 500 JSON error carrying the exception
        message.
    """
    try:
        data = await request.json()
        text = data.get("input", "")
        voice_b64 = data.get("voice_b64", "")
        emotion = float(data.get("emotion", 0.5))
        if not text:
            return JSONResponse({"error": "No input text"}, status_code=400)
        lang = get_language(text)

        temp_path = None
        try:
            if voice_b64:
                try:
                    voice_bytes = base64.b64decode(voice_b64)
                except ValueError:  # binascii.Error is a ValueError subclass
                    return JSONResponse(
                        {"error": "voice_b64 is not valid base64"}, status_code=400
                    )
                fd, temp_path = tempfile.mkstemp(suffix=".wav")
                with os.fdopen(fd, "wb") as f:
                    f.write(voice_bytes)
                prompt_path = temp_path
            else:
                prompt_path = voice_sample
            wav = model.generate(
                text,
                audio_prompt_path=prompt_path,
                exaggeration=emotion,
                language=lang,
            )
        finally:
            if temp_path is not None:
                try:
                    os.remove(temp_path)
                except OSError:
                    pass  # best-effort cleanup; the audio is already generated

        out = io.BytesIO()
        torchaudio.save(out, wav, model.sr, format="mp3")
        out.seek(0)
        return StreamingResponse(out, media_type="audio/mpeg")
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
async def tts_with_voice(voice_id: str, request: Request):
    """Synthesize text with a voice from the saved voice library.

    Reads a JSON body with ``input`` (text) and optional ``emotion``
    (default 0.5). The voice's sample file is used as the audio prompt. An
    unknown ``voice_id``, or a voice stored without a sample file (designed
    voices are saved with ``filename`` set to ``None``), falls back to the
    default voice sample.

    Returns:
        An MP3 ``StreamingResponse`` on success, a 400 JSON error when the
        text is empty, or a 500 JSON error carrying the exception message.
    """
    try:
        data = await request.json()
        text = data.get("input", "")
        emotion = float(data.get("emotion", 0.5))
        if not text:
            return JSONResponse({"error": "No input text"}, status_code=400)
        meta = load_voices_meta()
        voice_info = meta.get(voice_id)
        # Guard on the filename too: ``VOICES_DIR / None`` raises TypeError.
        if voice_info and voice_info.get("filename"):
            prompt_path = str(VOICES_DIR / voice_info["filename"])
        else:
            prompt_path = voice_sample
        lang = get_language(text)
        wav = model.generate(text, audio_prompt_path=prompt_path, exaggeration=emotion, language=lang)
        out = io.BytesIO()
        torchaudio.save(out, wav, model.sr, format="mp3")
        out.seek(0)
        return StreamingResponse(out, media_type="audio/mpeg")
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
| # ============================================================ | |
| # VOICE LIBRARY ENDPOINTS | |
| # ============================================================ | |
async def list_voices():
    """Return the id, name and language of every voice in the library."""
    listing = []
    for voice_id, info in load_voices_meta().items():
        listing.append({"id": voice_id, "name": info["name"], "lang": info["lang"]})
    return {"voices": listing}
async def add_voice(request: Request):
    """Store an uploaded voice sample in the voice library.

    Reads a JSON body with ``name``, optional ``lang`` (default ``"en"``) and
    ``voice_b64`` (base64 audio). The sample is saved as ``<id>.wav`` in
    ``VOICES_DIR`` under a generated 8-character id; a client-supplied
    ``filename`` is not used for the stored file. The sample and the updated
    metadata are then passed to ``push_to_hf``.

    Returns:
        ``{"id", "name", "lang"}`` on success, a 400 JSON error when name or
        sample is missing or the sample is not valid base64, or a 500 JSON
        error carrying the exception message.
    """
    try:
        data = await request.json()
        name = data.get("name", "").strip()
        lang = data.get("lang", "en")
        voice_b64 = data.get("voice_b64", "")
        if not name or not voice_b64:
            return JSONResponse({"error": "Name and voice sample required"}, status_code=400)

        # Decode before touching disk so a bad upload leaves no partial state.
        try:
            voice_bytes = base64.b64decode(voice_b64)
        except ValueError:  # binascii.Error is a ValueError subclass
            return JSONResponse({"error": "Voice sample is not valid base64"}, status_code=400)

        import uuid
        voice_id = str(uuid.uuid4())[:8]
        safe_name = f"{voice_id}.wav"
        local_path = str(VOICES_DIR / safe_name)
        with open(local_path, "wb") as f:
            f.write(voice_bytes)

        meta = load_voices_meta()
        meta[voice_id] = {"name": name, "lang": lang, "filename": safe_name}
        save_voices_meta(meta)

        # Push to HF repo
        push_to_hf(local_path, safe_name)
        push_to_hf(str(VOICES_META), "meta.json")
        return {"id": voice_id, "name": name, "lang": lang}
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
async def delete_voice(voice_id: str):
    """Remove a voice from the library, with its sample file if it has one.

    Voices saved without a sample (``filename`` is ``None``, as designed
    voices are) are removed from the metadata only. Deleting an unknown id is
    a no-op that still reports success.

    Returns:
        ``{"deleted": voice_id}``, or a 500 JSON error carrying the exception
        message.
    """
    try:
        meta = load_voices_meta()
        voice_info = meta.pop(voice_id, None)
        if voice_info is not None:
            filename = voice_info.get("filename")
            # ``VOICES_DIR / None`` would raise TypeError, so check first.
            if filename:
                wav_path = VOICES_DIR / filename
                if wav_path.exists():
                    wav_path.unlink()
            save_voices_meta(meta)
            push_to_hf(str(VOICES_META), "meta.json")
        return {"deleted": voice_id}
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
| # ============================================================ | |
| # VOICE DESIGN ENDPOINTS | |
| # ============================================================ | |
async def design_voice(request: Request):
    """Map a free-text voice description to TTS parameters via the Groq LLM.

    Reads a JSON body with ``prompt`` (the description) and optional
    ``preview_text``. The model is asked to answer with a JSON object holding
    ``emotion``, ``speed`` and ``description``.

    Returns:
        ``{"params": <parsed object>, "preview_text": ...}`` on success;
        a 400 JSON error for an empty prompt; a 500 JSON error when
        ``GROQ_API_KEY`` is missing or an unexpected exception occurs; a 502
        JSON error when the Groq API call fails or its reply holds no JSON
        object.
    """
    try:
        data = await request.json()
        prompt = (data.get("prompt") or "").strip()
        preview_text = data.get("preview_text", "Hello, I am Cortana.")
        if not prompt:
            return JSONResponse({"error": "No voice description provided"}, status_code=400)
        if not GROQ_KEY:
            return JSONResponse({"error": "GROQ_API_KEY not set in secrets"}, status_code=500)
        # Ask Groq LLM to map description to Chatterbox parameters
        async with httpx.AsyncClient(timeout=15.0) as client:
            res = await client.post(
                "https://api.groq.com/openai/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {GROQ_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": "llama-3.1-8b-instant",
                    "messages": [
                        {
                            "role": "system",
                            "content": """You are a voice parameter mapper for a TTS system.
Given a voice description, output ONLY a JSON object with these exact fields:
- emotion: float 0.0 to 1.0 (0=neutral/calm, 1=very expressive/excited)
- speed: float 0.5 to 2.0 (0.5=very slow, 1.0=normal, 2.0=very fast)
- description: one sentence summarizing the voice
Examples:
"calm elderly woman" -> {"emotion":0.2,"speed":0.8,"description":"Soft calm elderly female voice"}
"excited sports commentator" -> {"emotion":0.95,"speed":1.6,"description":"Energetic fast sports commentator"}
"professional newsreader" -> {"emotion":0.3,"speed":1.0,"description":"Neutral professional news voice"}
Output ONLY the JSON. No explanation. No markdown."""
                        },
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                    "max_tokens": 100,
                    "temperature": 0.3
                }
            )
        # Report upstream failures as such instead of a KeyError on "choices".
        if res.status_code != 200:
            return JSONResponse(
                {"error": f"Groq API error ({res.status_code}): {res.text[:200]}"},
                status_code=502,
            )
        result = res.json()
        raw = result["choices"][0]["message"]["content"].strip()
        # Models sometimes wrap the object in code fences or add stray text
        # despite the instructions; keep only the outermost {...} span.
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end < start:
            return JSONResponse(
                {"error": "Model did not return JSON parameters"}, status_code=502
            )
        params = json.loads(raw[start:end + 1])
        return {"params": params, "preview_text": preview_text}
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
async def save_designed_voice(request: Request):
    """Save designed voice parameters as a library entry without a sample.

    Reads a JSON body with optional ``name`` (default ``"Designed Voice"``)
    and optional ``params`` (default ``{}``). The entry is stored under a
    generated 8-character id with ``filename`` set to ``None`` and
    ``designed`` set to ``True``, then the metadata is passed to
    ``push_to_hf``.

    Returns:
        ``{"id", "name"}`` on success, or a 500 JSON error carrying the
        exception message.
    """
    try:
        body = await request.json()
        name = body.get("name", "Designed Voice")
        entry = {
            "name": name,
            "lang": "en",
            "filename": None,
            "params": body.get("params", {}),
            "designed": True,
        }

        import uuid
        voice_id = str(uuid.uuid4())[:8]

        library = load_voices_meta()
        library[voice_id] = entry
        save_voices_meta(library)
        push_to_hf(str(VOICES_META), "meta.json")
        return {"id": voice_id, "name": name}
    except Exception as exc:
        return JSONResponse({"error": str(exc)}, status_code=500)
| # ============================================================ | |
| # DUBBING ENDPOINT | |
| # ============================================================ | |
async def dub_video(request: Request):
    """Dub an uploaded audio/video file into another language.

    Reads a JSON body with ``file_b64`` (base64 media), optional ``filename``
    (only its extension is used), ``src_lang`` (default ``"auto"``),
    ``tgt_lang`` (default ``"en"``) and ``voice_id`` (default ``"default"``).

    Pipeline: save the upload, extract 16 kHz mono WAV with ffmpeg, transcribe
    with Groq Whisper, translate with a Groq LLM, synthesize the translation.
    All temporary files live in a directory that is always removed.

    Returns:
        An MP3 ``StreamingResponse`` on success; a 400 JSON error for a
        missing/invalid upload or media ffmpeg cannot decode; a 502 JSON error
        when a Groq API call fails; otherwise a 500 JSON error.
    """
    try:
        data = await request.json()
        file_b64 = data.get("file_b64", "")
        filename = data.get("filename", "input.mp4")
        src_lang = data.get("src_lang", "auto")
        tgt_lang = data.get("tgt_lang", "en")
        voice_id = data.get("voice_id", "default")
        if not file_b64:
            return JSONResponse({"error": "No file provided"}, status_code=400)
        if not GROQ_KEY:
            return JSONResponse({"error": "GROQ_API_KEY not set"}, status_code=500)
        try:
            file_bytes = base64.b64decode(file_b64)
        except ValueError:  # binascii.Error is a ValueError subclass
            return JSONResponse({"error": "file_b64 is not valid base64"}, status_code=400)

        tmpdir = tempfile.mkdtemp()
        try:
            # Step 1 — Save uploaded file.
            # Only the extension of the client-supplied name is kept: joining
            # the raw name could escape tmpdir ("../x", absolute paths) or
            # collide with the extracted "audio.wav".
            suffix = Path(str(filename)).suffix
            input_path = os.path.join(tmpdir, "input" + suffix)
            with open(input_path, "wb") as f:
                f.write(file_bytes)

            # Step 2 — Extract audio as WAV
            audio_path = os.path.join(tmpdir, "audio.wav")
            try:
                subprocess.run([
                    "ffmpeg", "-i", input_path,
                    "-ar", "16000", "-ac", "1",
                    "-y", audio_path
                ], check=True, capture_output=True)
            except subprocess.CalledProcessError as ffmpeg_err:
                detail = ffmpeg_err.stderr.decode(errors="replace")[-300:]
                return JSONResponse(
                    {"error": f"Could not extract audio: {detail}"}, status_code=400
                )

            async with httpx.AsyncClient(timeout=120.0) as client:
                # Step 3 — Transcribe with Whisper via Groq.
                # The language field is sent only when the caller chose one;
                # a None value would go out as an empty form field.
                form = {"model": "whisper-large-v3", "response_format": "verbose_json"}
                if src_lang and src_lang != "auto":
                    form["language"] = src_lang
                with open(audio_path, "rb") as af:
                    trans_res = await client.post(
                        "https://api.groq.com/openai/v1/audio/transcriptions",
                        headers={"Authorization": f"Bearer {GROQ_KEY}"},
                        # The upload is the extracted WAV, so label it as such.
                        files={"file": ("audio.wav", af, "audio/wav")},
                        data=form
                    )
                if trans_res.status_code != 200:
                    return JSONResponse(
                        {"error": f"Transcription failed ({trans_res.status_code}): {trans_res.text[:200]}"},
                        status_code=502,
                    )
                full_text = trans_res.json().get("text", "")
                if not full_text:
                    return JSONResponse({"error": "Could not transcribe audio"}, status_code=500)

                # Step 4 — Translate via Groq LLM
                lang_names = {
                    "en": "English", "hi": "Hindi", "es": "Spanish",
                    "fr": "French", "de": "German", "ja": "Japanese", "zh": "Chinese"
                }
                tgt_name = lang_names.get(tgt_lang, tgt_lang)
                trans_response = await client.post(
                    "https://api.groq.com/openai/v1/chat/completions",
                    headers={
                        "Authorization": f"Bearer {GROQ_KEY}",
                        "Content-Type": "application/json"
                    },
                    json={
                        "model": "llama-3.3-70b-versatile",
                        "messages": [
                            {
                                "role": "system",
                                "content": f"Translate the following text to {tgt_name}. Output ONLY the translated text. No explanations."
                            },
                            {"role": "user", "content": full_text}
                        ],
                        "max_tokens": 2000
                    }
                )
                if trans_response.status_code != 200:
                    return JSONResponse(
                        {"error": f"Translation failed ({trans_response.status_code}): {trans_response.text[:200]}"},
                        status_code=502,
                    )
                translated_text = trans_response.json()["choices"][0]["message"]["content"].strip()

            # Step 5 — Synthesize translated text with Chatterbox
            meta = load_voices_meta()
            voice_info = meta.get(voice_id)
            if voice_info and voice_info.get("filename"):
                prompt_path = str(VOICES_DIR / voice_info["filename"])
            else:
                prompt_path = voice_sample
            lang_code = get_language(translated_text)
            emotion = 0.5
            if voice_info and voice_info.get("params"):
                emotion = float(voice_info["params"].get("emotion", 0.5))
            wav = model.generate(
                translated_text,
                audio_prompt_path=prompt_path,
                exaggeration=emotion,
                language=lang_code
            )

            # Step 6 — Return dubbed audio
            out = io.BytesIO()
            torchaudio.save(out, wav, model.sr, format="mp3")
            out.seek(0)
            return StreamingResponse(out, media_type="audio/mpeg")
        finally:
            shutil.rmtree(tmpdir, ignore_errors=True)
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
if __name__ == "__main__":
    # Start the ASGI server only when this file is run as a script.
    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)