Spaces:
Build error
Build error
| # app.py - Using Coqui XTTS instead of Parler-TTS | |
| from fastapi import FastAPI, HTTPException | |
| from fastapi.responses import JSONResponse | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from pydantic import BaseModel | |
| from typing import List, Optional | |
| import logging | |
| import requests | |
| import tempfile | |
| import os | |
| import torch | |
| import numpy as np | |
| import soundfile as sf | |
| import io | |
# Logging: configure the root logger at INFO and use a module-named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The ASGI application object.
app = FastAPI(title="TTS API", version="1.0.0")

# CORS: accept requests from any origin, with any method and any header.
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
# Configuration: base URL of the upload service.  The environment variable of
# the same name overrides the built-in default.
OCI_UPLOAD_API_URL = os.environ.get(
    "OCI_UPLOAD_API_URL",
    "https://yukee1992-oci-video-storage.hf.space",
)

# Module-level model state, rebound by the loader and startup functions:
#   tts_model    - dict describing the loaded backend, or None when nothing is loaded
#   model_loaded - True once a backend has finished loading
#   model_type   - label reported by the status endpoints ("none" until a model loads)
tts_model, model_loaded, model_type = None, False, "none"
# Request schema for the voiceover generation endpoint.
class VoiceoverRequest(BaseModel):
    # Project identifier; forwarded to the upload service as ``project_id``.
    project_id: str
    # One text per scene; each entry produces one generated audio file.
    voiceover_scenes: List[str]
    # When true, each generated file is uploaded to OCI storage.
    upload_to_oci: Optional[bool] = False
@app.on_event("startup")
async def startup_event():
    """Load a TTS backend when the application starts.

    Coqui XTTS is tried first, with Bark as the fallback.  On success the
    module globals ``model_loaded`` and ``model_type`` are updated and the
    function returns; if every loader fails, an error is logged and
    ``model_loaded`` is left ``False``.

    The coroutine is registered as a FastAPI startup handler.  Without that
    registration nothing calls it, so no model would ever be loaded.
    """
    global model_loaded, model_type

    logger.info("=== TTS API Starting ===")

    # (loader coroutine, reported model type, success log line), in order of
    # preference.  The first loader that returns True wins.
    backends = (
        (load_coqui_xtts, "coqui-xtts", "✅ Coqui XTTS loaded successfully!"),
        (load_bark_model, "bark", "✅ Bark model loaded as fallback!"),
    )
    for loader, backend_name, success_message in backends:
        if await loader():
            model_loaded = True
            model_type = backend_name
            logger.info(success_message)
            return

    logger.error("❌ All models failed to load")
    model_loaded = False
async def load_coqui_xtts():
    """Load the Coqui XTTS v2 model into the module-level ``tts_model``.

    Two loading strategies are attempted in order:

    1. Hugging Face ``transformers`` (``AutoProcessor`` / ``AutoModel`` for
       ``coqui/XTTS-v2``), stored with ``"type": "transformers"``.
    2. The ``TTS`` package (``TTS.api.TTS``), stored with ``"type": "coqui"``.

    Returns:
        True if either strategy succeeded, False otherwise.  A failing
        strategy (including a missing package) is logged as a warning.
    """
    # Declared exactly once, before any assignment.  Repeating the ``global``
    # statement after ``tts_model`` has already been assigned in this function
    # is a SyntaxError ("name 'tts_model' is assigned to before global
    # declaration"), which prevents the whole module from being imported.
    global tts_model

    logger.info("Loading Coqui XTTS model...")

    # Method 1: Try using transformers
    try:
        from transformers import AutoProcessor, AutoModel

        processor = AutoProcessor.from_pretrained("coqui/XTTS-v2")
        model = AutoModel.from_pretrained("coqui/XTTS-v2")
    except Exception as e:
        logger.warning(f"Transformers XTTS failed: {e}")
    else:
        tts_model = {"processor": processor, "model": model, "type": "transformers"}
        return True

    # Method 2: Try using TTS package
    try:
        from TTS.api import TTS

        tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
    except Exception as e:
        logger.warning(f"Coqui TTS package failed: {e}")
    else:
        tts_model = {"tts": tts, "type": "coqui"}
        return True

    return False
async def load_bark_model():
    """Load the ``suno/bark-small`` model as the fallback backend.

    On success the module-level ``tts_model`` is set to a dict holding the
    processor, the model and ``"type": "bark"``.

    Returns:
        True when loading succeeded; False (after logging the error) otherwise.
    """
    global tts_model

    try:
        from transformers import AutoProcessor, AutoModel

        bark_processor = AutoProcessor.from_pretrained("suno/bark-small")
        bark_model = AutoModel.from_pretrained("suno/bark-small")
    except Exception as err:
        logger.error(f"Bark model loading failed: {err}")
        return False

    tts_model = {"processor": bark_processor, "model": bark_model, "type": "bark"}
    return True
def _new_temp_wav(prefix):
    """Create an empty, uniquely named ``.wav`` file in the temp dir and return its path."""
    fd, path = tempfile.mkstemp(prefix=prefix, suffix=".wav")
    os.close(fd)  # only the path is needed; the audio writers reopen the file
    return path


def generate_voiceover(text, speaker_wav=None):
    """Synthesize ``text`` to a WAV file with whichever backend is loaded.

    Args:
        text: Text to speak.
        speaker_wav: Optional reference audio handed to the Coqui ``TTS``
            backend; the transformers and Bark branches do not use it.

    Returns:
        ``(path, None)`` on success, where ``path`` is a temporary WAV file
        that the caller is responsible for deleting, or ``(None, message)``
        on failure.  Backend exceptions are caught, logged and reported
        through ``message``; nothing is raised.
    """
    if tts_model is None:
        return None, "No model loaded"

    backend = tts_model.get("type")
    if backend not in ("coqui", "transformers", "bark"):
        return None, "Unknown model type"

    # Sample rate written to the WAV header by the transformers/Bark branches.
    # NOTE(review): assumes both models emit 24 kHz audio - confirm.
    sample_rate = 24000

    wav_path = None
    try:
        if backend == "coqui":
            # Using Coqui TTS package.  Each call gets its own file so that
            # concurrent requests cannot overwrite each other's audio.
            wav_path = _new_temp_wav("coqui_")
            tts_model["tts"].tts_to_file(
                text=text,
                speaker_wav=speaker_wav,
                language="en",
                file_path=wav_path,
            )
        else:
            processor = tts_model["processor"]
            model = tts_model["model"]
            inputs = processor(text=[text], return_tensors="pt")
            # Bark is generated with sampling enabled; the transformers XTTS
            # branch uses generate() with its defaults.
            generate_kwargs = {"do_sample": True} if backend == "bark" else {}
            with torch.no_grad():
                output = model.generate(**inputs, **generate_kwargs)
            audio_array = output.cpu().numpy().squeeze()
            wav_path = _new_temp_wav("bark_" if backend == "bark" else "xtts_")
            sf.write(wav_path, audio_array, sample_rate)
        return wav_path, None
    except Exception as e:
        logger.exception("Voiceover generation failed")
        # Do not leave an empty or partial file behind.
        if wav_path is not None:
            try:
                os.remove(wav_path)
            except OSError:
                pass
        # Callers test the error for truthiness, so never return "".
        return None, str(e) or repr(e)
def upload_to_oci(file_path, filename, project_id):
    """POST a local file to the OCI upload service.

    The file is sent as multipart field ``file`` under the name ``filename``,
    together with the form fields ``project_id`` and ``subfolder='voiceover'``,
    to ``{OCI_UPLOAD_API_URL}/api/upload`` with a 30 second timeout.

    Returns:
        ``(parsed_json, None)`` when the service answers HTTP 200, otherwise
        ``(None, error_message)``.  Exceptions are caught and returned as the
        error message.
    """
    try:
        with open(file_path, 'rb') as audio:
            form_fields = {'project_id': project_id, 'subfolder': 'voiceover'}
            reply = requests.post(
                f"{OCI_UPLOAD_API_URL}/api/upload",
                files={'file': (filename, audio)},
                data=form_fields,
                timeout=30,
            )
            # Guard clause: anything other than 200 counts as a failed upload.
            if reply.status_code != 200:
                return None, f"Upload failed: {reply.status_code}"
            return reply.json(), None
    except Exception as err:
        return None, str(err)
@app.get("/")
async def root():
    """Describe the service: current model state, supported models and endpoint paths.

    Registered on the application root; without a route decorator the
    function is never reachable over HTTP.
    """
    return {
        "message": "TTS API with High-Quality Voice Generation",
        "model_loaded": model_loaded,
        "model_type": model_type,
        "supported_models": ["coqui-xtts", "bark"],
        "endpoints": {
            "health": "/health",
            "model_status": "/api/model-status",
            "generate_voiceovers": "/api/generate-voiceovers",
        },
    }
@app.get("/health")
async def health():
    """Report service health.

    ``status`` is ``"healthy"`` when a model is loaded and ``"degraded"``
    otherwise.  Registered at ``/health``; without a route decorator the
    function is never reachable over HTTP.
    """
    return {
        "status": "healthy" if model_loaded else "degraded",
        "model_loaded": model_loaded,
        "model_type": model_type,
        "quality": "high" if model_type == "coqui-xtts" else "good",
    }
@app.get("/api/model-status")
async def model_status():
    """Get detailed model status.

    Registered at ``/api/model-status``; without a route decorator the
    function is never reachable over HTTP.  The ``message`` field follows the
    backend that is actually loaded instead of always claiming Coqui XTTS.
    """
    if model_type == "coqui-xtts":
        message = "Using Coqui XTTS for high-quality voice generation"
    elif model_type == "bark":
        message = "Using Bark fallback model for voice generation"
    else:
        message = "No TTS model loaded"

    return {
        "model_loaded": model_loaded,
        "model_type": model_type,
        "model_quality": "high" if model_type == "coqui-xtts" else "good",
        "supported_models": ["Coqui XTTS (recommended)", "Bark (fallback)"],
        "message": message,
    }
@app.post("/api/generate-voiceovers")
async def generate_voiceovers_endpoint(request: VoiceoverRequest):
    """Generate one voiceover per scene and optionally upload each to OCI.

    Scenes are processed in order and numbered from 1.  Each scene yields one
    entry in ``results`` with status ``"success"``, ``"error"`` (generation
    failed) or ``"upload_error"``; a failing scene does not stop the others.
    The temporary audio file of every scene is removed before moving on,
    whatever the outcome.

    Args:
        request: Project id, scene texts and the upload flag.

    Returns:
        A summary dict with per-scene ``results`` and success/failure counts.

    Raises:
        HTTPException: 503 when no TTS model is loaded; 500 for an unexpected
            failure outside the per-scene handling.
    """
    # Checked outside the try block: HTTPException derives from Exception, so
    # raising it inside would let the generic handler below turn the 503
    # into a 500.
    if not model_loaded:
        raise HTTPException(status_code=503, detail="No TTS model loaded")

    try:
        results = []
        for i, scene_text in enumerate(request.voiceover_scenes, 1):
            filename = f"voiceover_{i:02d}.wav"
            temp_file = None
            try:
                logger.info(f"Generating voiceover {i} with {model_type}...")
                # NOTE: generation and upload are plain synchronous calls, so
                # they run on the event loop while this request is served.
                temp_file, error = generate_voiceover(scene_text)
                if error:
                    results.append({
                        "sequence": i,
                        "status": "error",
                        "error": error,
                        "filename": filename,
                        "model": model_type,
                    })
                    continue

                # Upload to OCI if requested
                upload_result = None
                if request.upload_to_oci and OCI_UPLOAD_API_URL:
                    upload_result, upload_error = upload_to_oci(temp_file, filename, request.project_id)
                    if upload_error:
                        results.append({
                            "sequence": i,
                            "status": "upload_error",
                            "error": upload_error,
                            "filename": filename,
                            "model": model_type,
                        })
                        continue

                results.append({
                    "sequence": i,
                    "status": "success",
                    "filename": filename,
                    "text_preview": scene_text[:100] + "..." if len(scene_text) > 100 else scene_text,
                    "uploaded_to_oci": bool(upload_result),
                    "model": model_type,
                    "quality": "high" if model_type == "coqui-xtts" else "good",
                })
            except Exception as e:
                results.append({
                    "sequence": i,
                    "status": "error",
                    "error": str(e),
                    "filename": filename,
                    "model": model_type,
                })
            finally:
                # Clean up on every path (including upload failures), so temp
                # files do not accumulate.  Removal is best effort.
                if temp_file:
                    try:
                        os.remove(temp_file)
                    except OSError:
                        pass

        successful = sum(1 for r in results if r['status'] == 'success')
        return {
            "status": "processed",
            "project_id": request.project_id,
            "total_scenes": len(request.voiceover_scenes),
            "successful": successful,
            "failed": len(results) - successful,
            "model_type": model_type,
            "voice_quality": "high" if model_type == "coqui-xtts" else "good",
            "results": results,
        }
    except HTTPException:
        # Deliberate HTTP errors keep their own status code.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# Script entry point: serve the app on all interfaces, port 7860.
if __name__ == "__main__":
    # Imported here so uvicorn is only required when the file is run directly.
    import uvicorn

    serve_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **serve_options)