# parler-tts-api / app.py
# (Hugging Face Space page artifacts converted to comments so the file parses;
#  commit 125c3d1, verified, by yukee1992 — "Update app.py")
# app.py - Using Coqui XTTS instead of Parler-TTS
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Optional
import logging
import requests
import tempfile
import os
import torch
import numpy as np
import soundfile as sf
import io
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = FastAPI(title="TTS API", version="1.0.0")
# Add CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_methods=["*"],
allow_headers=["*"],
)
# Configuration
OCI_UPLOAD_API_URL = os.getenv("OCI_UPLOAD_API_URL", "https://yukee1992-oci-video-storage.hf.space")
# Global variables
tts_model = None
model_loaded = False
model_type = "none"
# Pydantic models
class VoiceoverRequest(BaseModel):
    """Request payload for POST /api/generate-voiceovers."""
    # Identifier used to group the uploaded files in OCI storage.
    project_id: str
    # One text entry per scene; each becomes a separate voiceover WAV.
    voiceover_scenes: List[str]
    # When True, each generated WAV is uploaded via the OCI upload API.
    upload_to_oci: Optional[bool] = False
@app.on_event("startup")
async def startup_event():
    """Load a TTS backend at startup: Coqui XTTS first, Bark as fallback."""
    global tts_model, model_loaded, model_type

    logger.info("=== TTS API Starting ===")

    # Preference order: Coqui XTTS (most reliable), then Bark.
    loaders = (
        (load_coqui_xtts, "coqui-xtts", "✅ Coqui XTTS loaded successfully!"),
        (load_bark_model, "bark", "✅ Bark model loaded as fallback!"),
    )
    for loader, label, success_message in loaders:
        if await loader():
            model_loaded = True
            model_type = label
            logger.info(success_message)
            return

    logger.error("❌ All models failed to load")
    model_loaded = False
async def load_coqui_xtts():
    """Attempt to load Coqui XTTS via transformers, then via the TTS package.

    On success the module-level ``tts_model`` dict is populated (with a
    ``"type"`` discriminator) and True is returned; otherwise False.
    """
    global tts_model
    try:
        logger.info("Loading Coqui XTTS model...")

        # Method 1: the transformers auto classes.
        try:
            from transformers import AutoProcessor, AutoModel

            tts_model = {
                "processor": AutoProcessor.from_pretrained("coqui/XTTS-v2"),
                "model": AutoModel.from_pretrained("coqui/XTTS-v2"),
                "type": "transformers",
            }
            return True
        except Exception as e:
            logger.warning(f"Transformers XTTS failed: {e}")

        # Method 2: the dedicated Coqui TTS package.
        try:
            from TTS.api import TTS

            tts_model = {
                "tts": TTS("tts_models/multilingual/multi-dataset/xtts_v2"),
                "type": "coqui",
            }
            return True
        except Exception as e:
            logger.warning(f"Coqui TTS package failed: {e}")
    except Exception as e:
        logger.error(f"Coqui XTTS loading failed: {e}")
    return False
async def load_bark_model():
    """Load the small Bark model as a fallback TTS backend.

    Populates the module-level ``tts_model`` dict and returns True on
    success; returns False after logging on any failure.
    """
    global tts_model
    try:
        from transformers import AutoProcessor, AutoModel

        tts_model = {
            "processor": AutoProcessor.from_pretrained("suno/bark-small"),
            "model": AutoModel.from_pretrained("suno/bark-small"),
            "type": "bark",
        }
        return True
    except Exception as e:
        logger.error(f"Bark model loading failed: {e}")
        return False
def generate_voiceover(text, speaker_wav=None):
    """Synthesize *text* to a WAV file using whichever model is loaded.

    Args:
        text: Text to synthesize.
        speaker_wav: Optional reference WAV path for voice cloning
            (used only by the Coqui TTS backend).

    Returns:
        Tuple ``(file_path, None)`` on success, or ``(None, error_message)``
        on failure (including "no model" and unknown model types).
    """
    try:
        if tts_model is None:
            return None, "No model loaded"

        model_kind = tts_model["type"]

        if model_kind == "coqui":
            # Coqui TTS writes straight to a file. Use a unique temp path:
            # the previous fixed filename was a race condition under
            # concurrent requests (two requests clobbering the same file).
            fd, temp_file = tempfile.mkstemp(prefix="coqui_", suffix=".wav")
            os.close(fd)
            tts_model["tts"].tts_to_file(
                text=text,
                speaker_wav=speaker_wav,
                language="en",
                file_path=temp_file
            )
            return temp_file, None

        if model_kind in ("transformers", "bark"):
            # Both backends share the processor -> generate -> write flow;
            # Bark additionally samples during generation.
            processor = tts_model["processor"]
            model = tts_model["model"]
            inputs = processor(text=[text], return_tensors="pt")
            with torch.no_grad():
                if model_kind == "bark":
                    output = model.generate(**inputs, do_sample=True)
                else:
                    output = model.generate(**inputs)
            fd, temp_file = tempfile.mkstemp(
                prefix=f"{model_kind}_", suffix=".wav"
            )
            os.close(fd)
            audio_array = output.cpu().numpy().squeeze()
            # 24000 Hz matches the sample rate the original code used for
            # both backends — presumably the models' native rate; confirm.
            sf.write(temp_file, audio_array, 24000)
            return temp_file, None
    except Exception as e:
        return None, str(e)
    return None, "Unknown model type"
def upload_to_oci(file_path, filename, project_id):
    """Upload a generated WAV to the OCI storage upload API.

    Returns:
        ``(response_json, None)`` on HTTP 200, otherwise
        ``(None, error_message)``.
    """
    try:
        with open(file_path, 'rb') as handle:
            form_fields = {
                'project_id': project_id,
                'subfolder': 'voiceover'
            }
            response = requests.post(
                f"{OCI_UPLOAD_API_URL}/api/upload",
                files={'file': (filename, handle)},
                data=form_fields,
                timeout=30
            )
        if response.status_code == 200:
            return response.json(), None
        return None, f"Upload failed: {response.status_code}"
    except Exception as e:
        return None, str(e)
@app.get("/")
async def root():
    """Landing endpoint: report model status and list available routes."""
    available_endpoints = {
        "health": "/health",
        "model_status": "/api/model-status",
        "generate_voiceovers": "/api/generate-voiceovers",
    }
    return {
        "message": "TTS API with High-Quality Voice Generation",
        "model_loaded": model_loaded,
        "model_type": model_type,
        "supported_models": ["coqui-xtts", "bark"],
        "endpoints": available_endpoints,
    }
@app.get("/health")
async def health():
    """Lightweight health probe; 'degraded' means no TTS model is loaded."""
    status = "healthy" if model_loaded else "degraded"
    quality = "high" if model_type == "coqui-xtts" else "good"
    return {
        "status": status,
        "model_loaded": model_loaded,
        "model_type": model_type,
        "quality": quality,
    }
@app.get("/api/model-status")
async def model_status():
    """Report which TTS backend is loaded and its relative quality."""
    quality = "high" if model_type == "coqui-xtts" else "good"
    return {
        "model_loaded": model_loaded,
        "model_type": model_type,
        "model_quality": quality,
        "supported_models": ["Coqui XTTS (recommended)", "Bark (fallback)"],
        "message": "Using Coqui XTTS for high-quality voice generation",
    }
@app.post("/api/generate-voiceovers")
async def generate_voiceovers_endpoint(request: VoiceoverRequest):
    """Generate a voiceover WAV for every scene in the request.

    Per scene: synthesize -> optionally upload to OCI -> delete the temp
    file. Individual scene failures are recorded in the result list
    instead of aborting the whole batch.

    Raises:
        HTTPException: 503 when no TTS model is loaded; 500 on unexpected
            top-level errors.
    """
    try:
        if not model_loaded:
            raise HTTPException(status_code=503, detail="No TTS model loaded")

        results = []
        for i, scene_text in enumerate(request.voiceover_scenes, 1):
            filename = f"voiceover_{i:02d}.wav"
            try:
                logger.info(f"Generating voiceover {i} with {model_type}...")
                temp_file, error = generate_voiceover(scene_text)
                if error:
                    results.append({
                        "sequence": i,
                        "status": "error",
                        "error": error,
                        "filename": filename,
                        "model": model_type
                    })
                    continue

                try:
                    # Upload to OCI if requested
                    upload_result = None
                    if request.upload_to_oci and OCI_UPLOAD_API_URL:
                        upload_result, upload_error = upload_to_oci(
                            temp_file, filename, request.project_id
                        )
                        if upload_error:
                            results.append({
                                "sequence": i,
                                "status": "upload_error",
                                "error": upload_error,
                                "filename": filename,
                                "model": model_type
                            })
                            continue
                finally:
                    # Always delete the temp file — the original leaked it
                    # on the upload-error path (continue skipped cleanup).
                    # Only swallow filesystem errors, not everything.
                    try:
                        os.remove(temp_file)
                    except OSError:
                        pass

                results.append({
                    "sequence": i,
                    "status": "success",
                    "filename": filename,
                    "text_preview": scene_text[:100] + "..." if len(scene_text) > 100 else scene_text,
                    "uploaded_to_oci": bool(upload_result),
                    "model": model_type,
                    "quality": "high" if model_type == "coqui-xtts" else "good"
                })
            except Exception as e:
                results.append({
                    "sequence": i,
                    "status": "error",
                    "error": str(e),
                    "filename": filename
                })

        successful = len([r for r in results if r['status'] == 'success'])
        return {
            "status": "processed",
            "project_id": request.project_id,
            "total_scenes": len(request.voiceover_scenes),
            "successful": successful,
            "failed": len(results) - successful,
            "model_type": model_type,
            "voice_quality": "high" if model_type == "coqui-xtts" else "good",
            "results": results
        }
    except HTTPException:
        # Let deliberate HTTP errors (e.g. the 503 above) pass through —
        # the original outer handler rewrapped them as generic 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
    # Run the app directly with uvicorn. Port 7860 is the conventional
    # Hugging Face Spaces port — presumably this is deployed as a Space
    # (the OCI_UPLOAD_API_URL default also points at *.hf.space).
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)