import os os.environ.setdefault("OMP_NUM_THREADS", "4") os.environ.setdefault("HF_HUB_CACHE", "./checkpoints/hf_cache") import io import base64 import tempfile import logging import wave import numpy as np import torch import pyrubberband as pyrb from contextlib import asynccontextmanager from pathlib import Path from fastapi import FastAPI, Request, HTTPException from fastapi.responses import Response, JSONResponse, HTMLResponse from pydantic import BaseModel, Field from typing import Optional logging.basicConfig(level=logging.INFO) logger = logging.getLogger("indextts2-engine") BEARER_TOKEN = os.environ.get("API_KEY", "124CC717-7517-47A2-BBD6-54FCAE310297") SAMPLE_RATE = 22050 BIT_DEPTH = 16 CHANNELS = 1 MAX_SECONDS = 60 MAX_CHARS = 500 VOXLIBRIS_TO_INDEXTTS2_EMOTIONS = { "neutral": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8], "happy": [0.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1], "angry": [0.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1], "sad": [0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.1], "fear": [0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.1], "disgust": [0.0, 0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.1], "melancholy": [0.0, 0.0, 0.2, 0.0, 0.0, 0.6, 0.0, 0.1], "surprise": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.1], "calm": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8], "excited": [0.6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.0], "anxious": [0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.2], "hopeful": [0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3], "tender": [0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5], "proud": [0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.2], "fearful": [0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.1], "confused": [0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.3, 0.3], } EMOTION_SPEED_MAP = { "neutral": 1.0, "happy": 1.02, "sad": 0.97, "angry": 1.04, "fear": 1.03, "fearful": 1.03, "surprise": 1.05, "disgust": 0.98, "excited": 1.03, "calm": 0.96, "confused": 0.98, "anxious": 1.02, "hopeful": 1.01, "melancholy": 0.96, "tender": 0.97, "proud": 1.01, } EMOTION_PITCH_MAP = { "neutral": 0.0, "happy": 0.5, "sad": -0.3, "angry": -0.2, "fear": 0.3, "fearful": 0.3, "surprise": 0.6, "disgust": -0.2, "excited": 0.7, "calm": -0.1, "confused": 0.2, "anxious": 0.3, "hopeful": 0.3, "melancholy": -0.4, "tender": -0.1, "proud": 0.2, } CANONICAL_EMOTIONS = [ "neutral", "happy", "sad", "angry", "fear", "surprise", "disgust", "excited", "calm", "anxious", "hopeful", "melancholy", "tender", "proud", "fearful", "confused", ] tts_model = None def load_model(): global tts_model from indextts.infer_v2 import IndexTTS2 model_dir = os.environ.get("MODEL_DIR", "checkpoints") cfg_path = os.path.join(model_dir, "config.yaml") if not os.path.exists(cfg_path): logger.info( "Model not found locally, downloading IndexTeam/IndexTTS-2...") from huggingface_hub import snapshot_download snapshot_download("IndexTeam/IndexTTS-2", local_dir=model_dir) logger.info("Model download complete.") use_fp16 = os.environ.get("USE_FP16", "true").lower() in ("true", "1", "yes") device = None if torch.cuda.is_available(): device = "cuda:0" elif hasattr(torch, "mps") and torch.backends.mps.is_available(): device = "mps" use_fp16 = False else: device = "cpu" use_fp16 = False logger.info( f"Loading IndexTTS2 model from {model_dir} on {device} (fp16={use_fp16})..." ) tts_model = IndexTTS2( cfg_path=cfg_path, model_dir=model_dir, use_fp16=use_fp16, device=device, ) logger.info("IndexTTS2 model loaded successfully.") @asynccontextmanager async def lifespan(app: FastAPI): load_model() yield app = FastAPI(title="IndexTTS2 Engine", lifespan=lifespan) def verify_auth(request: Request): if not BEARER_TOKEN: return auth = request.headers.get("Authorization", "") if auth != f"Bearer {BEARER_TOKEN}": raise HTTPException(status_code=401, detail="Unauthorized") def numpy_to_wav_bytes(audio_np: np.ndarray, sample_rate: int) -> bytes: audio_np = np.clip(audio_np, -1.0, 1.0) audio_int16 = (audio_np * 32767).astype(np.int16) buf = io.BytesIO() with wave.open(buf, "wb") as wf: wf.setnchannels(CHANNELS) wf.setsampwidth(2) wf.setframerate(sample_rate) wf.writeframes(audio_int16.tobytes()) return buf.getvalue() def blend_emotion_vectors(emotion_set: list[str], intensity: int) -> list[float]: intensity_factor = intensity / 50.0 if not emotion_set or emotion_set == ["neutral"]: base = VOXLIBRIS_TO_INDEXTTS2_EMOTIONS.get("neutral", [0.0] * 7 + [0.8]) return list(base) blended = [0.0] * 8 count = 0 for emo in emotion_set: emo_lower = emo.lower() vec = VOXLIBRIS_TO_INDEXTTS2_EMOTIONS.get(emo_lower) if vec: for i in range(8): blended[i] += vec[i] count += 1 if count == 0: return list(VOXLIBRIS_TO_INDEXTTS2_EMOTIONS["neutral"]) blended = [v / count for v in blended] for i in range(7): blended[i] = blended[i] * intensity_factor calm_remaining = max(0.0, 1.0 - sum(blended[:7])) blended[7] = min(blended[7], calm_remaining) return blended class ConvertRequest(BaseModel): input_text: str builtin_voice_id: Optional[str] = None voice_to_clone_sample: Optional[str] = None random_seed: Optional[int] = None emotion_set: list[str] = Field(default_factory=lambda: ["neutral"]) intensity: int = Field(default=50, ge=1, le=100) volume: int = Field(default=75, ge=1, le=100) speed_adjust: float = Field(default=0.0, ge=-5.0, le=5.0) pitch_adjust: float = Field(default=0.0, ge=-5.0, le=5.0) @app.post("/GetEngineDetails") async def get_engine_details(request: Request): verify_auth(request) return { "engine_id": "indextts2", "engine_name": "IndexTTS2", "sample_rate": SAMPLE_RATE, "bit_depth": BIT_DEPTH, "channels": CHANNELS, "max_seconds_per_conversion": MAX_SECONDS, "supports_voice_cloning": True, "builtin_voices": [], "supported_emotions": CANONICAL_EMOTIONS, "extra_properties": { "model": "IndexTeam/IndexTTS-2", "max_characters": MAX_CHARS, "emotion_control": "8-dimensional emotion vectors via fine-tuned Qwen3", "features": [ "zero-shot voice cloning", "emotion-speaker disentanglement", "duration control", "multilingual (Chinese, English)", ], } } @app.post("/ConvertTextToSpeech") async def convert_text_to_speech(request: Request): verify_auth(request) try: body = await request.json() req = ConvertRequest(**body) except Exception as e: return JSONResponse(status_code=400, content={ "error": str(e), "error_code": "INVALID_REQUEST" }) if not req.input_text.strip(): return JSONResponse(status_code=400, content={ "error": "Input text is empty", "error_code": "INVALID_REQUEST" }) if not req.voice_to_clone_sample: return JSONResponse( status_code=400, content={ "error": "IndexTTS2 requires a voice sample for cloning. " "Please provide a voice_to_clone_sample.", "error_code": "INVALID_REQUEST" }) if req.random_seed is not None and req.random_seed > 0: torch.manual_seed(req.random_seed) if torch.cuda.is_available(): torch.cuda.manual_seed(req.random_seed) temp_files = [] try: try: wav_bytes = base64.b64decode(req.voice_to_clone_sample, validate=True) except Exception: return JSONResponse( status_code=400, content={ "error": "Invalid voice_to_clone_sample: not valid base64", "error_code": "INVALID_REQUEST" }) if len(wav_bytes) < 44: return JSONResponse( status_code=400, content={ "error": "Invalid voice_to_clone_sample: file too small to be valid audio", "error_code": "INVALID_REQUEST" }) tmp_voice = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) tmp_voice.write(wav_bytes) tmp_voice.close() speaker_wav_path = tmp_voice.name temp_files.append(tmp_voice.name) tmp_out = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) tmp_out.close() output_path = tmp_out.name temp_files.append(tmp_out.name) text = req.input_text.strip() if len(text) > MAX_CHARS: truncated = text[:MAX_CHARS] last_space = truncated.rfind(' ') if last_space > MAX_CHARS * 0.6: truncated = truncated[:last_space] text = truncated logger.warning(f"Text truncated to {len(text)} characters") if text and text[-1] not in '.!?;:。!?;:': text += '.' dominant_emotion = req.emotion_set[0].lower( ) if req.emotion_set else "neutral" emo_vector = blend_emotion_vectors(req.emotion_set, req.intensity) emo_vector = tts_model.normalize_emo_vec(emo_vector, apply_bias=True) emotion_speed = EMOTION_SPEED_MAP.get(dominant_emotion, 1.0) emotion_pitch = EMOTION_PITCH_MAP.get(dominant_emotion, 0.0) intensity_factor = req.intensity / 50.0 emotion_speed = 1.0 + (emotion_speed - 1.0) * intensity_factor emotion_pitch = emotion_pitch * intensity_factor is_neutral = all(e.lower() in ("neutral", "calm") for e in req.emotion_set) logger.info(f"Generating with IndexTTS2: emotions={req.emotion_set}, " f"emo_vector={[f'{v:.2f}' for v in emo_vector]}, " f"intensity={req.intensity}, text_len={len(text)}, " f"is_neutral={is_neutral}") kwargs = { "spk_audio_prompt": speaker_wav_path, "text": text, "output_path": output_path, "verbose": False, } if not is_neutral: kwargs["emo_vector"] = emo_vector tts_model.infer(**kwargs) if not os.path.exists(output_path) or os.path.getsize( output_path) == 0: return JSONResponse(status_code=500, content={ "error": "IndexTTS2 produced no output", "error_code": "GENERATION_FAILED" }) import torchaudio wav_tensor, sr = torchaudio.load(output_path) audio_np = wav_tensor.squeeze().numpy().astype(np.float32) if sr != SAMPLE_RATE: import librosa audio_np = librosa.resample(audio_np, orig_sr=sr, target_sr=SAMPLE_RATE) peak = np.max(np.abs(audio_np)) if peak > 0: audio_np = audio_np / peak speed_factor = emotion_speed if req.speed_adjust != 0.0: user_speed = 1.0 + (req.speed_adjust / 100.0) speed_factor = speed_factor * user_speed speed_factor = max(0.5, min(2.0, speed_factor)) if abs(speed_factor - 1.0) > 0.01: audio_np = pyrb.time_stretch(audio_np, SAMPLE_RATE, speed_factor) total_pitch = emotion_pitch if req.pitch_adjust != 0.0: total_pitch += req.pitch_adjust * 0.24 if abs(total_pitch) > 0.01: audio_np = pyrb.pitch_shift(audio_np, SAMPLE_RATE, total_pitch) vol_factor = req.volume / 75.0 audio_np = audio_np * vol_factor wav_bytes_out = numpy_to_wav_bytes(audio_np, SAMPLE_RATE) return Response(content=wav_bytes_out, media_type="audio/wav") except Exception as e: logger.exception("TTS generation failed") return JSONResponse(status_code=500, content={ "error": "Audio generation failed", "error_code": "GENERATION_FAILED", "details": str(e) }) finally: for f in temp_files: try: os.unlink(f) except OSError: pass @app.get("/", response_class=HTMLResponse) async def root(): html_path = Path(__file__).parent / "index.html" if html_path.exists(): return HTMLResponse(content=html_path.read_text()) return HTMLResponse(content="""
VoxLibris-compatible TTS engine powered by IndexTTS2.
POST /GetEngineDetails - Get engine capabilitiesPOST /ConvertTextToSpeech - Convert text to speechGET /health - Health check