# app.py - Using Coqui XTTS instead of Parler-TTS
#
# FastAPI service that generates voiceover WAV files from text using
# Coqui XTTS (preferred) with Bark as a fallback, optionally uploading
# the results to an OCI storage endpoint.
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Optional
import logging
import requests
import tempfile
import os
import torch
import numpy as np
import soundfile as sf
import io

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="TTS API", version="1.0.0")

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Configuration
OCI_UPLOAD_API_URL = os.getenv(
    "OCI_UPLOAD_API_URL", "https://yukee1992-oci-video-storage.hf.space"
)

# Global variables
tts_model = None      # dict describing the loaded backend; see load_* helpers
model_loaded = False  # True once any backend has loaded successfully
model_type = "none"   # one of "coqui-xtts", "bark", "none"


# Pydantic models
class VoiceoverRequest(BaseModel):
    """Request payload for POST /api/generate-voiceovers."""
    project_id: str
    voiceover_scenes: List[str]
    upload_to_oci: Optional[bool] = False


@app.on_event("startup")
async def startup_event():
    """Initialize the application: try Coqui XTTS, then fall back to Bark."""
    global model_loaded, model_type
    logger.info("=== TTS API Starting ===")

    # Try Coqui XTTS first (most reliable)
    if await load_coqui_xtts():
        model_loaded = True
        model_type = "coqui-xtts"
        logger.info("✅ Coqui XTTS loaded successfully!")
        return

    # Fallback to Bark
    if await load_bark_model():
        model_loaded = True
        model_type = "bark"
        logger.info("✅ Bark model loaded as fallback!")
        return

    logger.error("❌ All models failed to load")
    model_loaded = False


async def load_coqui_xtts():
    """Load the Coqui XTTS model into the module-global ``tts_model``.

    Tries the transformers hub weights first, then the TTS package.
    Returns True on success, False if neither method worked.
    """
    global tts_model
    try:
        logger.info("Loading Coqui XTTS model...")

        # Method 1: Try using transformers
        try:
            from transformers import AutoProcessor, AutoModel
            processor = AutoProcessor.from_pretrained("coqui/XTTS-v2")
            model = AutoModel.from_pretrained("coqui/XTTS-v2")
            tts_model = {"processor": processor, "model": model, "type": "transformers"}
            return True
        except Exception as e:
            logger.warning(f"Transformers XTTS failed: {e}")

        # Method 2: Try using TTS package
        try:
            from TTS.api import TTS
            tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
            tts_model = {"tts": tts, "type": "coqui"}
            return True
        except Exception as e:
            logger.warning(f"Coqui TTS package failed: {e}")
    except Exception as e:
        logger.error(f"Coqui XTTS loading failed: {e}")
    return False


async def load_bark_model():
    """Load the Bark fallback model into ``tts_model``.

    Returns True on success, False on any loading failure.
    """
    global tts_model
    try:
        from transformers import AutoProcessor, AutoModel
        processor = AutoProcessor.from_pretrained("suno/bark-small")
        model = AutoModel.from_pretrained("suno/bark-small")
        tts_model = {"processor": processor, "model": model, "type": "bark"}
        return True
    except Exception as e:
        logger.error(f"Bark model loading failed: {e}")
        return False


def _make_temp_wav(prefix):
    """Create and return a unique temp .wav path.

    The original code wrote every request to the same fixed filename in
    the shared temp dir, so concurrent requests clobbered each other's
    output; mkstemp gives each call its own file.
    """
    fd, path = tempfile.mkstemp(prefix=prefix, suffix=".wav")
    os.close(fd)  # soundfile / TTS reopen the path themselves
    return path


def generate_voiceover(text, speaker_wav=None):
    """Synthesize ``text`` to a temp WAV file using the loaded backend.

    Args:
        text: The text to speak.
        speaker_wav: Optional reference WAV for voice cloning (Coqui only).

    Returns:
        (path, None) on success, (None, error_message) on failure.
        The caller is responsible for deleting the returned file.
    """
    try:
        if tts_model is None:
            return None, "No model loaded"

        if tts_model["type"] == "coqui":
            # Using Coqui TTS package
            tts = tts_model["tts"]
            temp_file = _make_temp_wav("coqui_generated_")
            tts.tts_to_file(
                text=text,
                speaker_wav=speaker_wav,
                language="en",
                file_path=temp_file
            )
            return temp_file, None

        elif tts_model["type"] == "transformers":
            # Using transformers XTTS
            processor = tts_model["processor"]
            model = tts_model["model"]
            inputs = processor(text=[text], return_tensors="pt")
            with torch.no_grad():
                output = model.generate(**inputs)
            temp_file = _make_temp_wav("xtts_generated_")
            audio_array = output.cpu().numpy().squeeze()
            # 24 kHz matches the XTTS output sample rate used previously
            sf.write(temp_file, audio_array, 24000)
            return temp_file, None

        elif tts_model["type"] == "bark":
            # Using Bark
            processor = tts_model["processor"]
            model = tts_model["model"]
            inputs = processor(text=[text], return_tensors="pt")
            with torch.no_grad():
                speech_values = model.generate(**inputs, do_sample=True)
            temp_file = _make_temp_wav("bark_generated_")
            audio_array = speech_values.cpu().numpy().squeeze()
            sf.write(temp_file, audio_array, 24000)
            return temp_file, None

    except Exception as e:
        return None, str(e)

    return None, "Unknown model type"


def upload_to_oci(file_path, filename, project_id):
    """Upload ``file_path`` to the OCI storage service.

    Returns (response_json, None) on HTTP 200, (None, error_message)
    on a non-200 status or any exception.
    """
    try:
        with open(file_path, 'rb') as f:
            files = {'file': (filename, f)}
            data = {
                'project_id': project_id,
                'subfolder': 'voiceover'
            }
            response = requests.post(
                f"{OCI_UPLOAD_API_URL}/api/upload",
                files=files,
                data=data,
                timeout=30
            )
        if response.status_code == 200:
            return response.json(), None
        else:
            return None, f"Upload failed: {response.status_code}"
    except Exception as e:
        return None, str(e)


@app.get("/")
async def root():
    """Service banner with endpoint directory and model status."""
    return {
        "message": "TTS API with High-Quality Voice Generation",
        "model_loaded": model_loaded,
        "model_type": model_type,
        "supported_models": ["coqui-xtts", "bark"],
        "endpoints": {
            "health": "/health",
            "model_status": "/api/model-status",
            "generate_voiceovers": "/api/generate-voiceovers"
        }
    }


@app.get("/health")
async def health():
    """Health probe: 'degraded' when no TTS backend is loaded."""
    return {
        "status": "healthy" if model_loaded else "degraded",
        "model_loaded": model_loaded,
        "model_type": model_type,
        "quality": "high" if model_type == "coqui-xtts" else "good"
    }


@app.get("/api/model-status")
async def model_status():
    """Get detailed model status"""
    return {
        "model_loaded": model_loaded,
        "model_type": model_type,
        "model_quality": "high" if model_type == "coqui-xtts" else "good",
        "supported_models": ["Coqui XTTS (recommended)", "Bark (fallback)"],
        "message": "Using Coqui XTTS for high-quality voice generation"
    }


@app.post("/api/generate-voiceovers")
async def generate_voiceovers_endpoint(request: VoiceoverRequest):
    """Generate one voiceover per scene; optionally upload each to OCI.

    Returns per-scene results plus aggregate counts. Raises HTTP 503
    when no TTS backend is loaded, HTTP 500 on unexpected failures.
    """
    try:
        if not model_loaded:
            raise HTTPException(status_code=503, detail="No TTS model loaded")

        results = []
        for i, scene_text in enumerate(request.voiceover_scenes, 1):
            try:
                filename = f"voiceover_{i:02d}.wav"
                logger.info(f"Generating voiceover {i} with {model_type}...")

                temp_file, error = generate_voiceover(scene_text)
                if error:
                    results.append({
                        "sequence": i,
                        "status": "error",
                        "error": error,
                        "filename": filename,
                        "model": model_type
                    })
                    continue

                upload_failed = False
                upload_result = None
                try:
                    # Upload to OCI if requested
                    if request.upload_to_oci and OCI_UPLOAD_API_URL:
                        upload_result, upload_error = upload_to_oci(
                            temp_file, filename, request.project_id
                        )
                        if upload_error:
                            upload_failed = True
                            results.append({
                                "sequence": i,
                                "status": "upload_error",
                                "error": upload_error,
                                "filename": filename,
                                "model": model_type
                            })
                finally:
                    # Always clean up: the original leaked the temp file
                    # on the upload-error path.
                    try:
                        os.remove(temp_file)
                    except OSError:
                        pass

                if upload_failed:
                    continue

                results.append({
                    "sequence": i,
                    "status": "success",
                    "filename": filename,
                    "text_preview": scene_text[:100] + "..." if len(scene_text) > 100 else scene_text,
                    "uploaded_to_oci": bool(upload_result),
                    "model": model_type,
                    "quality": "high" if model_type == "coqui-xtts" else "good"
                })

            except Exception as e:
                results.append({
                    "sequence": i,
                    "status": "error",
                    "error": str(e),
                    "filename": f"voiceover_{i:02d}.wav"
                })

        return {
            "status": "processed",
            "project_id": request.project_id,
            "total_scenes": len(request.voiceover_scenes),
            "successful": len([r for r in results if r['status'] == 'success']),
            "failed": len([r for r in results if r['status'] != 'success']),
            "model_type": model_type,
            "voice_quality": "high" if model_type == "coqui-xtts" else "good",
            "results": results
        }

    except HTTPException:
        # Don't re-wrap deliberate HTTP errors (e.g. the 503 above) as 500s,
        # which the original outer handler did.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)