"""FastAPI service exposing Chatterbox text-to-speech, a voice library,
LLM-assisted voice design and a dubbing pipeline.

Importing this module loads the TTS model and, when HF credentials are
configured, pulls previously saved voices from the Hugging Face repo.
"""
import base64
import binascii
import io
import json
import os
import shutil
import subprocess
import tempfile
import uuid
from pathlib import Path

import httpx
import torch
import torchaudio
import uvicorn
from fastapi import FastAPI, Request, UploadFile, File, Form
from fastapi.responses import StreamingResponse, JSONResponse, HTMLResponse, FileResponse
from chatterbox.tts import ChatterboxTTS

app = FastAPI()

# ============================================================
# STARTUP
# ============================================================
HF_TOKEN = os.getenv("HF_TOKEN")
HF_REPO_ID = os.getenv("HF_REPO_ID")  # e.g. abc1181/livekit-tts-chatterbox
GROQ_KEY = os.getenv("GROQ_API_KEY")

print("Loading Chatterbox Multilingual...")
device = "cuda" if torch.cuda.is_available() else "cpu"

# Set HF token for faster downloads
hf_token = HF_TOKEN  # kept as a module-level name for backward compatibility
if hf_token:
    from huggingface_hub import login
    login(token=hf_token)

model = ChatterboxTTS.from_pretrained(device=device)
print(f"✅ Chatterbox loaded on {device}")

VOICE_SAMPLE_PATH = "voice_sample.wav"
VOICES_DIR = Path("voices")
VOICES_DIR.mkdir(exist_ok=True)
VOICES_META = VOICES_DIR / "meta.json"

# Default cloning prompt; None when no sample file ships with the app.
voice_sample = VOICE_SAMPLE_PATH if os.path.exists(VOICE_SAMPLE_PATH) else None

GROQ_CHAT_URL = "https://api.groq.com/openai/v1/chat/completions"
GROQ_TRANSCRIBE_URL = "https://api.groq.com/openai/v1/audio/transcriptions"

_VOICE_DESIGN_SYSTEM_PROMPT = """You are a voice parameter mapper for a TTS system.
Given a voice description, output ONLY a JSON object with these exact fields:
- emotion: float 0.0 to 1.0 (0=neutral/calm, 1=very expressive/excited)
- speed: float 0.5 to 2.0 (0.5=very slow, 1.0=normal, 2.0=very fast)
- description: one sentence summarizing the voice

Examples:
"calm elderly woman" -> {"emotion":0.2,"speed":0.8,"description":"Soft calm elderly female voice"}
"excited sports commentator" -> {"emotion":0.95,"speed":1.6,"description":"Energetic fast sports commentator"}
"professional newsreader" -> {"emotion":0.3,"speed":1.0,"description":"Neutral professional news voice"}

Output ONLY the JSON. No explanation. No markdown."""


def has_devanagari(text: str) -> bool:
    """Return True if *text* contains any character in U+0900..U+097F."""
    return any('\u0900' <= c <= '\u097F' for c in text)


def get_language(text: str) -> str:
    """Return "hi" when *text* contains Devanagari characters, else "en"."""
    return "hi" if has_devanagari(text) else "en"


def load_voices_meta() -> dict:
    """Return the voice metadata stored in VOICES_META, or {} if it is absent."""
    if VOICES_META.exists():
        return json.loads(VOICES_META.read_text())
    return {}


def save_voices_meta(meta: dict):
    """Write *meta* to VOICES_META as indented JSON."""
    VOICES_META.write_text(json.dumps(meta, indent=2))


def push_to_hf(local_path: str, repo_path: str):
    """Upload *local_path* to ``voices/<repo_path>`` in the configured HF repo.

    Returns True on success and False when credentials are missing or the
    upload raises; failures are printed, never propagated.
    """
    if not HF_TOKEN or not HF_REPO_ID:
        return False
    try:
        from huggingface_hub import HfApi
        api = HfApi()
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=f"voices/{repo_path}",
            repo_id=HF_REPO_ID,
            repo_type="space",
            token=HF_TOKEN,
        )
        return True
    except Exception as e:
        print(f"HF push failed: {e}")
        return False


def pull_from_hf():
    """Download saved voices that are missing locally from the HF repo.

    Fetches every ``voices/*.wav`` plus ``voices/meta.json``. The metadata
    file must be restored too: push_to_hf() uploads it, and without it the
    downloaded samples are not listed by the voice endpoints.
    Errors are printed and swallowed so startup continues.
    """
    if not HF_TOKEN or not HF_REPO_ID:
        return
    try:
        from huggingface_hub import HfApi
        api = HfApi()
        files = api.list_repo_files(repo_id=HF_REPO_ID, repo_type="space", token=HF_TOKEN)
        meta_repo_path = f"voices/{VOICES_META.name}"
        for f in files:
            if not f.startswith("voices/"):
                continue
            if not (f.endswith(".wav") or f == meta_repo_path):
                continue
            dest = VOICES_DIR / Path(f).name
            if dest.exists():
                continue
            api.hf_hub_download(
                repo_id=HF_REPO_ID,
                filename=f,
                repo_type="space",
                token=HF_TOKEN,
                local_dir="."
            )
    except Exception as e:
        print(f"HF pull failed: {e}")


def _resolve_voice(voice_id):
    """Return ``(prompt_path, params)`` for a saved voice.

    Unknown ids and designed voices (stored with ``"filename": None``) fall
    back to the default ``voice_sample``; ``params`` is {} when none are stored.
    """
    voice_info = load_voices_meta().get(voice_id) or {}
    filename = voice_info.get("filename")
    prompt_path = str(VOICES_DIR / filename) if filename else voice_sample
    return prompt_path, (voice_info.get("params") or {})


def _synthesize_mp3(text, prompt_path, emotion):
    """Generate speech for *text* and return it as an audio/mpeg response.

    The audio is fully encoded into memory before returning, so callers may
    delete *prompt_path* afterwards.
    """
    lang = get_language(text)
    # NOTE(review): confirm the installed ChatterboxTTS.generate accepts a
    # `language` keyword — the multilingual model may live in a different
    # class with a differently named argument.
    wav = model.generate(text, audio_prompt_path=prompt_path, exaggeration=emotion, language=lang)
    out = io.BytesIO()
    torchaudio.save(out, wav, model.sr, format="mp3")
    out.seek(0)
    return StreamingResponse(out, media_type="audio/mpeg")


def _parse_voice_params(raw):
    """Extract the JSON object from an LLM reply and clamp its numeric fields.

    Tolerates text or markdown fences around the object. ``emotion`` is
    clamped to 0.0-1.0 and ``speed`` to 0.5-2.0, the ranges requested in the
    system prompt. Raises ValueError when no valid JSON object is present.
    """
    start, end = raw.find("{"), raw.rfind("}")
    if start == -1 or end <= start:
        raise ValueError("LLM response did not contain a JSON object")
    params = json.loads(raw[start:end + 1])
    if "emotion" in params:
        params["emotion"] = min(1.0, max(0.0, float(params["emotion"])))
    if "speed" in params:
        params["speed"] = min(2.0, max(0.5, float(params["speed"])))
    return params


# Pull voices from HF on startup
pull_from_hf()


# ============================================================
# UI
# ============================================================
@app.get("/", response_class=HTMLResponse)
async def ui():
    """Serve the single-page studio UI."""
    # Backslashes in the curl example are escaped so they are emitted
    # literally instead of acting as Python line continuations.
    return HTMLResponse(content=""" Cortana TTS Studio
Text to Speech
Generate natural speech in English, Hindi or Hinglish
Text Input
Detected: English Ctrl+Enter to generate
Voice
Active Voice
Upload One-time Sample
Drop WAV/MP3 here
Parameters
Emotion / Expressiveness
0.5
Speed
1.0
Generation History
🎵
Your generations appear here
Voice Library
Save, manage and reuse cloned voices permanently
Add New Voice
Voice Name
Language Tag
Voice Sample (5–30 seconds, clean audio)
🎙
Upload WAV or MP3
No background music — clear speech only
Saved Voices
🎙
No voices saved yet — add one above
Voice Design
Describe a voice in plain words — AI generates the parameters
Describe Your Voice
Preview Text
Generated Parameters

↓ Download
Example Prompts
Young Indian woman, warm and friendly Deep mature male, calm and authoritative Energetic teen, very expressive and fast Professional newsreader, neutral accent Soft spoken elderly woman, slow and gentle Excited sports commentator, loud and fast
Dubbing
Translate and re-voice any video or audio file
⚠️ Free CPU Warning
Dubbing on free CPU takes 10–20 minutes per minute of video. Start with a short clip to test. Upgrade to GPU for faster processing.
Upload Media
🎬
Upload video or audio file
MP4, MKV, AVI, MP3, WAV — max 100MB
Source Language
Target Language
Dubbing Voice (optional)
API Reference
OpenAI-compatible endpoints — drop-in replacement
Text to Speech
POST /v1/audio/speech

{
  "input": "Hello I am Cortana",
  "emotion": 0.5,         // 0.0 neutral → 1.0 expressive
  "speed": 1.0           // 0.5x to 2.0x
}

Returns: audio/mpeg stream
Voice Cloning (on-the-fly)
POST /v1/audio/speech/clone

{
  "input": "Hello I am Cortana",
  "voice_b64": "base64_encoded_wav",
  "emotion": 0.5
}

Returns: audio/mpeg stream
List Voices
GET /v1/voices

Returns: { "voices": [ { "id": "...", "name": "...", "lang": "..." } ] }
CURL Example
curl -X POST "https://YOUR_SPACE.hf.space/v1/audio/speech" \\
  -H "Authorization: Bearer YOUR_HF_TOKEN" \\
  -H "Content-Type: application/json" \\
  -d '{"input": "Hello I am Cortana", "emotion": 0.5}' \\
  --output speech.mp3
""")


# ============================================================
# API ENDPOINTS
# ============================================================
@app.get("/v1")
async def v1_root():
    """Health/identity probe."""
    return {"status": "ok", "service": "chatterbox-multilingual-tts"}


@app.post("/v1/audio/speech")
async def tts(request: Request):
    """Synthesize ``input`` with the default voice; returns an MP3 stream.

    Responds 400 when ``input`` is empty and 500 on any other failure.
    """
    try:
        data = await request.json()
        text = data.get("input", "")
        emotion = float(data.get("emotion", 0.5))
        if not text:
            return JSONResponse({"error": "No input text"}, status_code=400)
        return _synthesize_mp3(text, voice_sample, emotion)
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)


@app.post("/v1/audio/speech/clone")
async def tts_clone(request: Request):
    """Synthesize ``input`` using a one-off base64 voice sample (``voice_b64``).

    Falls back to the default voice when no sample is supplied. Responds 400
    for empty input or undecodable base64, 500 on any other failure.
    """
    temp_path = None
    try:
        data = await request.json()
        text = data.get("input", "")
        voice_b64 = data.get("voice_b64", "")
        emotion = float(data.get("emotion", 0.5))
        if not text:
            return JSONResponse({"error": "No input text"}, status_code=400)

        prompt_path = voice_sample
        if voice_b64:
            try:
                voice_bytes = base64.b64decode(voice_b64)
            except (binascii.Error, ValueError):
                return JSONResponse({"error": "voice_b64 is not valid base64"}, status_code=400)
            # A unique file per request: a shared fixed path would let
            # concurrent requests overwrite each other's sample.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(voice_bytes)
                temp_path = f.name
            prompt_path = temp_path

        return _synthesize_mp3(text, prompt_path, emotion)
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
    finally:
        if temp_path:
            try:
                os.unlink(temp_path)
            except OSError:
                pass


@app.post("/v1/audio/speech/voice/{voice_id}")
async def tts_with_voice(voice_id: str, request: Request):
    """Synthesize ``input`` with a saved voice.

    Unknown ids and designed voices (which have no sample file) use the
    default voice sample. When the request omits ``emotion``, a designed
    voice's stored emotion is used; otherwise 0.5.
    """
    try:
        data = await request.json()
        text = data.get("input", "")
        if not text:
            return JSONResponse({"error": "No input text"}, status_code=400)
        prompt_path, params = _resolve_voice(voice_id)
        emotion = float(data.get("emotion", params.get("emotion", 0.5)))
        return _synthesize_mp3(text, prompt_path, emotion)
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)


# ============================================================
# VOICE LIBRARY ENDPOINTS
# ============================================================
@app.get("/v1/voices")
async def list_voices():
    """Return the id, name and language tag of every saved voice."""
    meta = load_voices_meta()
    voices = [{"id": k, "name": v["name"], "lang": v["lang"]} for k, v in meta.items()]
    return {"voices": voices}


@app.post("/v1/voices")
async def add_voice(request: Request):
    """Save a base64 voice sample under a new 8-character id and push it to HF.

    Responds 400 when ``name`` or ``voice_b64`` is missing or the sample is
    not valid base64.
    """
    try:
        data = await request.json()
        name = data.get("name", "").strip()
        lang = data.get("lang", "en")
        voice_b64 = data.get("voice_b64", "")

        if not name or not voice_b64:
            return JSONResponse({"error": "Name and voice sample required"}, status_code=400)

        try:
            voice_bytes = base64.b64decode(voice_b64)
        except (binascii.Error, ValueError):
            return JSONResponse({"error": "voice_b64 is not valid base64"}, status_code=400)

        voice_id = str(uuid.uuid4())[:8]
        # The stored filename is derived from the generated id, never from
        # client input.
        safe_name = f"{voice_id}.wav"
        local_path = str(VOICES_DIR / safe_name)
        with open(local_path, "wb") as f:
            f.write(voice_bytes)

        meta = load_voices_meta()
        meta[voice_id] = {"name": name, "lang": lang, "filename": safe_name}
        save_voices_meta(meta)

        # Push to HF repo
        push_to_hf(local_path, safe_name)
        push_to_hf(str(VOICES_META), "meta.json")

        return {"id": voice_id, "name": name, "lang": lang}
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)


@app.delete("/v1/voices/{voice_id}")
async def delete_voice(voice_id: str):
    """Remove a voice's metadata and, if it has one, its local sample file.

    Unknown ids are a no-op; the response is the same either way.
    """
    try:
        meta = load_voices_meta()
        if voice_id in meta:
            # Designed voices are stored with filename None — nothing to unlink.
            filename = meta[voice_id].get("filename")
            if filename:
                wav_path = VOICES_DIR / filename
                if wav_path.exists():
                    wav_path.unlink()
            del meta[voice_id]
            save_voices_meta(meta)
            push_to_hf(str(VOICES_META), "meta.json")
        return {"deleted": voice_id}
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)


# ============================================================
# VOICE DESIGN ENDPOINTS
# ============================================================
@app.post("/v1/voices/design")
async def design_voice(request: Request):
    """Map a free-text voice description to TTS parameters via the Groq LLM.

    Returns ``{"params": {...}, "preview_text": ...}``. Responds 400 for an
    empty prompt and 500 when GROQ_API_KEY is unset or the upstream call or
    parsing fails.
    """
    try:
        data = await request.json()
        prompt = data.get("prompt", "")
        preview_text = data.get("preview_text", "Hello, I am Cortana.")

        if not GROQ_KEY:
            return JSONResponse({"error": "GROQ_API_KEY not set in secrets"}, status_code=500)
        if not prompt:
            return JSONResponse({"error": "No prompt provided"}, status_code=400)

        # Ask Groq LLM to map description to Chatterbox parameters
        async with httpx.AsyncClient(timeout=15.0) as client:
            res = await client.post(
                GROQ_CHAT_URL,
                headers={
                    "Authorization": f"Bearer {GROQ_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": "llama-3.1-8b-instant",
                    "messages": [
                        {"role": "system", "content": _VOICE_DESIGN_SYSTEM_PROMPT},
                        {"role": "user", "content": prompt}
                    ],
                    "max_tokens": 100,
                    "temperature": 0.3
                }
            )
        # Surface upstream HTTP errors instead of a KeyError on "choices".
        res.raise_for_status()
        raw = res.json()["choices"][0]["message"]["content"].strip()
        params = _parse_voice_params(raw)

        return {"params": params, "preview_text": preview_text}
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)


@app.post("/v1/voices/design/save")
async def save_designed_voice(request: Request):
    """Persist designed parameters as a voice entry without a sample file."""
    try:
        data = await request.json()
        name = data.get("name", "Designed Voice")
        params = data.get("params", {})

        voice_id = str(uuid.uuid4())[:8]
        meta = load_voices_meta()
        meta[voice_id] = {
            "name": name,
            "lang": "en",
            "filename": None,
            "params": params,
            "designed": True
        }
        save_voices_meta(meta)
        push_to_hf(str(VOICES_META), "meta.json")

        return {"id": voice_id, "name": name}
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)


# ============================================================
# DUBBING ENDPOINT
# ============================================================
@app.post("/v1/dubbing")
async def dub_video(request: Request):
    """Transcribe, translate and re-voice a base64-encoded media file.

    Pipeline: save upload -> ffmpeg extracts 16 kHz mono WAV -> Groq Whisper
    transcription -> Groq LLM translation -> Chatterbox synthesis.
    Returns the dubbed audio as an MP3 stream. Responds 400 when no file is
    given and 500 on any failure; the working directory is always removed.
    """
    try:
        data = await request.json()
        file_b64 = data.get("file_b64", "")
        filename = data.get("filename", "input.mp4")
        src_lang = data.get("src_lang", "auto")
        tgt_lang = data.get("tgt_lang", "en")
        voice_id = data.get("voice_id", "default")

        if not file_b64:
            return JSONResponse({"error": "No file provided"}, status_code=400)
        if not GROQ_KEY:
            return JSONResponse({"error": "GROQ_API_KEY not set"}, status_code=500)

        tmpdir = tempfile.mkdtemp()
        try:
            # Step 1 — Save uploaded file. Only the extension of the
            # client-supplied name is kept: using the name itself would allow
            # path traversal and could collide with audio.wav below.
            input_path = os.path.join(tmpdir, f"input{Path(str(filename)).suffix}")
            with open(input_path, "wb") as f:
                f.write(base64.b64decode(file_b64))

            # Step 2 — Extract audio as WAV
            audio_path = os.path.join(tmpdir, "audio.wav")
            subprocess.run([
                "ffmpeg", "-i", input_path,
                "-ar", "16000", "-ac", "1",
                "-y", audio_path
            ], check=True, capture_output=True)

            # Step 3 — Transcribe with Whisper via Groq. The language field
            # is omitted for auto-detection rather than sent as an empty value.
            form = {"model": "whisper-large-v3", "response_format": "verbose_json"}
            if src_lang and src_lang != "auto":
                form["language"] = src_lang

            async with httpx.AsyncClient(timeout=120.0) as client:
                with open(audio_path, "rb") as af:
                    trans_res = await client.post(
                        GROQ_TRANSCRIBE_URL,
                        headers={"Authorization": f"Bearer {GROQ_KEY}"},
                        # Name the part after the data actually sent (WAV).
                        files={"file": ("audio.wav", af, "audio/wav")},
                        data=form
                    )
                trans_res.raise_for_status()
                full_text = trans_res.json().get("text", "")

                if not full_text:
                    return JSONResponse({"error": "Could not transcribe audio"}, status_code=500)

                # Step 4 — Translate via Groq LLM
                lang_names = {
                    "en": "English", "hi": "Hindi", "es": "Spanish",
                    "fr": "French", "de": "German", "ja": "Japanese", "zh": "Chinese"
                }
                tgt_name = lang_names.get(tgt_lang, tgt_lang)

                trans_response = await client.post(
                    GROQ_CHAT_URL,
                    headers={
                        "Authorization": f"Bearer {GROQ_KEY}",
                        "Content-Type": "application/json"
                    },
                    json={
                        "model": "llama-3.3-70b-versatile",
                        "messages": [
                            {
                                "role": "system",
                                "content": f"Translate the following text to {tgt_name}. Output ONLY the translated text. No explanations."
                            },
                            {"role": "user", "content": full_text}
                        ],
                        "max_tokens": 2000
                    }
                )
                trans_response.raise_for_status()
                translated_text = trans_response.json()["choices"][0]["message"]["content"].strip()

            # Step 5 — Synthesize translated text with Chatterbox
            prompt_path, params = _resolve_voice(voice_id)
            emotion = float(params.get("emotion", 0.5))

            # Step 6 — Return dubbed audio
            return _synthesize_mp3(translated_text, prompt_path, emotion)
        finally:
            shutil.rmtree(tmpdir, ignore_errors=True)

    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)