# parler-tts-api / app.py
# (Hugging Face Space page artifacts converted to comments so the file parses;
#  commit 125c3d1, verified, by yukee1992 — "Update app.py")
# app.py - Using Coqui XTTS instead of Parler-TTS
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Optional
import logging
import requests
import tempfile
import os
import torch
import numpy as np
import soundfile as sf
import io
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = FastAPI(title="TTS API", version="1.0.0")
# Add CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_methods=["*"],
allow_headers=["*"],
)
# Configuration
OCI_UPLOAD_API_URL = os.getenv("OCI_UPLOAD_API_URL", "https://yukee1992-oci-video-storage.hf.space")
# Global variables
tts_model = None
model_loaded = False
model_type = "none"
# Pydantic models
class VoiceoverRequest(BaseModel):
    """Request payload for POST /api/generate-voiceovers."""
    # Identifier used to group the uploaded files in OCI storage.
    project_id: str
    # One text entry per scene; each becomes a separate voiceover WAV.
    voiceover_scenes: List[str]
    # When True, each generated WAV is uploaded via the OCI upload API.
    upload_to_oci: Optional[bool] = False
@app.on_event("startup")
async def startup_event():
    """Load a TTS backend at startup: Coqui XTTS first, Bark as fallback."""
    global tts_model, model_loaded, model_type

    logger.info("=== TTS API Starting ===")

    # Preference order: Coqui XTTS (most reliable), then Bark.
    loaders = (
        (load_coqui_xtts, "coqui-xtts", "✅ Coqui XTTS loaded successfully!"),
        (load_bark_model, "bark", "✅ Bark model loaded as fallback!"),
    )
    for loader, label, success_message in loaders:
        if await loader():
            model_loaded = True
            model_type = label
            logger.info(success_message)
            return

    logger.error("❌ All models failed to load")
    model_loaded = False
async def load_coqui_xtts():
    """Attempt to load Coqui XTTS via transformers, then via the TTS package.

    On success the module-level ``tts_model`` dict is populated (with a
    ``"type"`` discriminator) and True is returned; otherwise False.
    """
    global tts_model
    try:
        logger.info("Loading Coqui XTTS model...")

        # Method 1: the transformers auto classes.
        try:
            from transformers import AutoProcessor, AutoModel

            tts_model = {
                "processor": AutoProcessor.from_pretrained("coqui/XTTS-v2"),
                "model": AutoModel.from_pretrained("coqui/XTTS-v2"),
                "type": "transformers",
            }
            return True
        except Exception as e:
            logger.warning(f"Transformers XTTS failed: {e}")

        # Method 2: the dedicated Coqui TTS package.
        try:
            from TTS.api import TTS

            tts_model = {
                "tts": TTS("tts_models/multilingual/multi-dataset/xtts_v2"),
                "type": "coqui",
            }
            return True
        except Exception as e:
            logger.warning(f"Coqui TTS package failed: {e}")
    except Exception as e:
        logger.error(f"Coqui XTTS loading failed: {e}")
    return False
async def load_bark_model():
    """Load the small Bark model as a fallback TTS backend.

    Populates the module-level ``tts_model`` dict and returns True on
    success; returns False after logging on any failure.
    """
    global tts_model
    try:
        from transformers import AutoProcessor, AutoModel

        tts_model = {
            "processor": AutoProcessor.from_pretrained("suno/bark-small"),
            "model": AutoModel.from_pretrained("suno/bark-small"),
            "type": "bark",
        }
        return True
    except Exception as e:
        logger.error(f"Bark model loading failed: {e}")
        return False
def generate_voiceover(text, speaker_wav=None):
    """Synthesize *text* to a WAV file using whichever model is loaded.

    Args:
        text: Text to synthesize.
        speaker_wav: Optional reference WAV path for voice cloning
            (used only by the Coqui TTS backend).

    Returns:
        Tuple ``(file_path, None)`` on success, or ``(None, error_message)``
        on failure (including "no model" and unknown model types).
    """
    try:
        if tts_model is None:
            return None, "No model loaded"

        model_kind = tts_model["type"]

        if model_kind == "coqui":
            # Coqui TTS writes straight to a file. Use a unique temp path:
            # the previous fixed filename was a race condition under
            # concurrent requests (two requests clobbering the same file).
            fd, temp_file = tempfile.mkstemp(prefix="coqui_", suffix=".wav")
            os.close(fd)
            tts_model["tts"].tts_to_file(
                text=text,
                speaker_wav=speaker_wav,
                language="en",
                file_path=temp_file
            )
            return temp_file, None

        if model_kind in ("transformers", "bark"):
            # Both backends share the processor -> generate -> write flow;
            # Bark additionally samples during generation.
            processor = tts_model["processor"]
            model = tts_model["model"]
            inputs = processor(text=[text], return_tensors="pt")
            with torch.no_grad():
                if model_kind == "bark":
                    output = model.generate(**inputs, do_sample=True)
                else:
                    output = model.generate(**inputs)
            fd, temp_file = tempfile.mkstemp(
                prefix=f"{model_kind}_", suffix=".wav"
            )
            os.close(fd)
            audio_array = output.cpu().numpy().squeeze()
            # 24000 Hz matches the sample rate the original code used for
            # both backends — presumably the models' native rate; confirm.
            sf.write(temp_file, audio_array, 24000)
            return temp_file, None
    except Exception as e:
        return None, str(e)
    return None, "Unknown model type"
def upload_to_oci(file_path, filename, project_id):
    """Upload a generated WAV to the OCI storage upload API.

    Returns:
        ``(response_json, None)`` on HTTP 200, otherwise
        ``(None, error_message)``.
    """
    try:
        with open(file_path, 'rb') as handle:
            form_fields = {
                'project_id': project_id,
                'subfolder': 'voiceover'
            }
            response = requests.post(
                f"{OCI_UPLOAD_API_URL}/api/upload",
                files={'file': (filename, handle)},
                data=form_fields,
                timeout=30
            )
        if response.status_code == 200:
            return response.json(), None
        return None, f"Upload failed: {response.status_code}"
    except Exception as e:
        return None, str(e)
@app.get("/")
async def root():
    """Landing endpoint: report model status and list available routes."""
    available_endpoints = {
        "health": "/health",
        "model_status": "/api/model-status",
        "generate_voiceovers": "/api/generate-voiceovers",
    }
    return {
        "message": "TTS API with High-Quality Voice Generation",
        "model_loaded": model_loaded,
        "model_type": model_type,
        "supported_models": ["coqui-xtts", "bark"],
        "endpoints": available_endpoints,
    }
@app.get("/health")
async def health():
    """Lightweight health probe; 'degraded' means no TTS model is loaded."""
    status = "healthy" if model_loaded else "degraded"
    quality = "high" if model_type == "coqui-xtts" else "good"
    return {
        "status": status,
        "model_loaded": model_loaded,
        "model_type": model_type,
        "quality": quality,
    }
@app.get("/api/model-status")
async def model_status():
    """Report which TTS backend is loaded and its relative quality."""
    quality = "high" if model_type == "coqui-xtts" else "good"
    return {
        "model_loaded": model_loaded,
        "model_type": model_type,
        "model_quality": quality,
        "supported_models": ["Coqui XTTS (recommended)", "Bark (fallback)"],
        "message": "Using Coqui XTTS for high-quality voice generation",
    }
@app.post("/api/generate-voiceovers")
async def generate_voiceovers_endpoint(request: VoiceoverRequest):
    """Generate a voiceover WAV for every scene in the request.

    Per scene: synthesize -> optionally upload to OCI -> delete the temp
    file. Individual scene failures are recorded in the result list
    instead of aborting the whole batch.

    Raises:
        HTTPException: 503 when no TTS model is loaded; 500 on unexpected
            top-level errors.
    """
    try:
        if not model_loaded:
            raise HTTPException(status_code=503, detail="No TTS model loaded")

        results = []
        for i, scene_text in enumerate(request.voiceover_scenes, 1):
            filename = f"voiceover_{i:02d}.wav"
            try:
                logger.info(f"Generating voiceover {i} with {model_type}...")
                temp_file, error = generate_voiceover(scene_text)
                if error:
                    results.append({
                        "sequence": i,
                        "status": "error",
                        "error": error,
                        "filename": filename,
                        "model": model_type
                    })
                    continue

                try:
                    # Upload to OCI if requested
                    upload_result = None
                    if request.upload_to_oci and OCI_UPLOAD_API_URL:
                        upload_result, upload_error = upload_to_oci(
                            temp_file, filename, request.project_id
                        )
                        if upload_error:
                            results.append({
                                "sequence": i,
                                "status": "upload_error",
                                "error": upload_error,
                                "filename": filename,
                                "model": model_type
                            })
                            continue
                finally:
                    # Always delete the temp file — the original leaked it
                    # on the upload-error path (continue skipped cleanup).
                    # Only swallow filesystem errors, not everything.
                    try:
                        os.remove(temp_file)
                    except OSError:
                        pass

                results.append({
                    "sequence": i,
                    "status": "success",
                    "filename": filename,
                    "text_preview": scene_text[:100] + "..." if len(scene_text) > 100 else scene_text,
                    "uploaded_to_oci": bool(upload_result),
                    "model": model_type,
                    "quality": "high" if model_type == "coqui-xtts" else "good"
                })
            except Exception as e:
                results.append({
                    "sequence": i,
                    "status": "error",
                    "error": str(e),
                    "filename": filename
                })

        successful = len([r for r in results if r['status'] == 'success'])
        return {
            "status": "processed",
            "project_id": request.project_id,
            "total_scenes": len(request.voiceover_scenes),
            "successful": successful,
            "failed": len(results) - successful,
            "model_type": model_type,
            "voice_quality": "high" if model_type == "coqui-xtts" else "good",
            "results": results
        }
    except HTTPException:
        # Let deliberate HTTP errors (e.g. the 503 above) pass through —
        # the original outer handler rewrapped them as generic 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
    # Run the app directly with uvicorn. Port 7860 is the conventional
    # Hugging Face Spaces port — presumably this is deployed as a Space
    # (the OCI_UPLOAD_API_URL default also points at *.hf.space).
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)