Spaces:

CherithCutestory
/

vlengine-xttsv2

Paused

App Files Files Community

vlengine-xttsv2 / app.py

CherithCutestory

Added auth key

a047f6e 3 months ago

raw

history blame contribute delete

7.91 kB

	import os
	os.environ.setdefault("OMP_NUM_THREADS", "4")
	os.environ.setdefault("COQUI_TOS_AGREED", "1")

	import io
	import base64
	import tempfile
	import logging
	import wave
	import numpy as np
	import torch
	import pyrubberband as pyrb
	from contextlib import asynccontextmanager
	from pathlib import Path
	from fastapi import FastAPI, Request, HTTPException
	from fastapi.responses import Response, JSONResponse, HTMLResponse
	from pydantic import BaseModel, Field
	from typing import Optional

	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger("xttsv2-engine")

	BEARER_TOKEN = os.environ.get("API_KEY", "124CC717-7517-47A2-BBD6-54FCAE310297")
	MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
	SAMPLE_RATE = 24000
	BIT_DEPTH = 16
	CHANNELS = 1
	MAX_SECONDS = 30

	CANONICAL_EMOTIONS = [
	"neutral", "happy", "sad", "angry", "calm", "excited",
	"fear", "surprise", "disgust", "confused", "anxious",
	"hopeful", "melancholy", "fearful",
	]

	EMOTION_SPEED_MAP = {
	"neutral": 1.0,
	"happy": 1.05,
	"sad": 0.93,
	"angry": 1.08,
	"calm": 0.92,
	"excited": 1.10,
	"fear": 1.06,
	"surprise": 1.07,
	"disgust": 0.97,
	"confused": 0.96,
	"anxious": 1.04,
	"hopeful": 1.02,
	"melancholy": 0.91,
	"fearful": 1.06,
	}

	EMOTION_PITCH_MAP = {
	"neutral": 0.0,
	"happy": 0.6,
	"sad": -0.5,
	"angry": -0.3,
	"calm": 0.0,
	"excited": 0.8,
	"fear": 0.4,
	"surprise": 0.7,
	"disgust": -0.4,
	"confused": 0.3,
	"anxious": 0.3,
	"hopeful": 0.4,
	"melancholy": -0.5,
	"fearful": 0.4,
	}

	tts_model = None


	def load_model():
	global tts_model
	import torch.serialization
	_original_load = torch.load
	def _patched_load(args, *kwargs):
	kwargs.setdefault("weights_only", False)
	return _original_load(args, *kwargs)
	torch.load = _patched_load

	from TTS.api import TTS

	device = "cuda" if torch.cuda.is_available() else "cpu"
	logger.info(f"Loading XTTSv2 model on {device}...")
	tts_model = TTS(model_name=MODEL_NAME, progress_bar=True).to(device)
	logger.info("XTTSv2 model loaded successfully.")


	@asynccontextmanager
	async def lifespan(app: FastAPI):
	load_model()
	yield


	app = FastAPI(title="XTTSv2 TTS Engine", lifespan=lifespan)


	def verify_auth(request: Request):
	if not BEARER_TOKEN:
	return
	auth = request.headers.get("Authorization", "")
	if auth != f"Bearer {BEARER_TOKEN}":
	raise HTTPException(status_code=401, detail="Unauthorized")


	def numpy_to_wav_bytes(audio_np: np.ndarray, sample_rate: int) -> bytes:
	audio_np = np.clip(audio_np, -1.0, 1.0)
	audio_int16 = (audio_np * 32767).astype(np.int16)

	buf = io.BytesIO()
	with wave.open(buf, "wb") as wf:
	wf.setnchannels(CHANNELS)
	wf.setsampwidth(2)
	wf.setframerate(sample_rate)
	wf.writeframes(audio_int16.tobytes())
	return buf.getvalue()


	class ConvertRequest(BaseModel):
	input_text: str
	builtin_voice_id: Optional[str] = None
	voice_to_clone_sample: Optional[str] = None
	random_seed: Optional[int] = None
	emotion_set: list[str] = Field(default_factory=lambda: ["neutral"])
	intensity: int = Field(default=50, ge=1, le=100)
	volume: int = Field(default=75, ge=1, le=100)
	speed_adjust: float = Field(default=0.0, ge=-5.0, le=5.0)
	pitch_adjust: float = Field(default=0.0, ge=-5.0, le=5.0)


	@app.post("/GetEngineDetails")
	async def get_engine_details(request: Request):
	verify_auth(request)

	return {
	"engine_id": "xttsv2",
	"engine_name": "Coqui XTTSv2",
	"sample_rate": SAMPLE_RATE,
	"bit_depth": BIT_DEPTH,
	"channels": CHANNELS,
	"max_seconds_per_conversion": MAX_SECONDS,
	"supports_voice_cloning": True,
	"builtin_voices": [],
	"supported_emotions": CANONICAL_EMOTIONS,
	"extra_properties": {
	"model": MODEL_NAME,
	"languages": [
	"en", "es", "fr", "de", "it", "pt", "pl", "tr",
	"ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"
	]
	}
	}


	@app.post("/ConvertTextToSpeech")
	async def convert_text_to_speech(request: Request):
	verify_auth(request)

	try:
	body = await request.json()
	req = ConvertRequest(**body)
	except Exception as e:
	return JSONResponse(
	status_code=400,
	content={"error": str(e), "error_code": "INVALID_REQUEST"}
	)

	if not req.input_text.strip():
	return JSONResponse(
	status_code=400,
	content={"error": "Input text is empty", "error_code": "INVALID_REQUEST"}
	)

	if req.random_seed is not None:
	torch.manual_seed(req.random_seed)
	if torch.cuda.is_available():
	torch.cuda.manual_seed(req.random_seed)

	speaker_wav_path = None
	temp_files = []

	try:
	if req.voice_to_clone_sample:
	wav_bytes = base64.b64decode(req.voice_to_clone_sample)
	tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
	tmp.write(wav_bytes)
	tmp.close()
	speaker_wav_path = tmp.name
	temp_files.append(tmp.name)

	dominant_emotion = req.emotion_set[0].lower() if req.emotion_set else "neutral"
	if dominant_emotion not in EMOTION_SPEED_MAP:
	dominant_emotion = "neutral"

	intensity_scale = req.intensity / 50.0

	emotion_speed_raw = EMOTION_SPEED_MAP[dominant_emotion]
	emotion_speed = 1.0 + (emotion_speed_raw - 1.0) * intensity_scale

	emotion_pitch_raw = EMOTION_PITCH_MAP[dominant_emotion]
	emotion_pitch = emotion_pitch_raw * intensity_scale

	base_speed = emotion_speed * (1.0 + (req.speed_adjust / 100.0))
	speed = max(0.5, min(2.0, base_speed))

	total_pitch = emotion_pitch + (req.pitch_adjust * 0.24)

	logger.info(
	f"Emotion: {dominant_emotion}, intensity: {req.intensity}, "
	f"emotion_speed: {emotion_speed:.3f}, emotion_pitch: {emotion_pitch:.2f}, "
	f"final_speed: {speed:.3f}, final_pitch: {total_pitch:.2f}"
	)

	synth_text = req.input_text

	language = "en"

	if speaker_wav_path:
	audio = tts_model.tts(
	text=synth_text,
	speaker_wav=speaker_wav_path,
	language=language,
	speed=speed,
	)
	else:
	audio = tts_model.tts(
	text=synth_text,
	language=language,
	speed=speed,
	)

	audio_np = np.array(audio, dtype=np.float32)

	if abs(total_pitch) > 0.01:
	audio_np = pyrb.pitch_shift(audio_np, SAMPLE_RATE, total_pitch)

	vol_factor = req.volume / 75.0
	audio_np = audio_np * vol_factor

	wav_bytes = numpy_to_wav_bytes(audio_np, SAMPLE_RATE)

	return Response(content=wav_bytes, media_type="audio/wav")

	except Exception as e:
	logger.exception("TTS generation failed")
	return JSONResponse(
	status_code=500,
	content={
	"error": "Audio generation failed",
	"error_code": "GENERATION_FAILED",
	"details": str(e)
	}
	)
	finally:
	for f in temp_files:
	try:
	os.unlink(f)
	except OSError:
	pass


	@app.get("/", response_class=HTMLResponse)
	async def root():
	html_path = Path(__file__).parent / "index.html"
	return HTMLResponse(content=html_path.read_text())


	@app.get("/health")
	async def health():
	return {"status": "ok", "model_loaded": tts_model is not None}


	if __name__ == "__main__":
	import uvicorn
	uvicorn.run(app, host="0.0.0.0", port=7860)