Spaces:

ataberkkilavuzcu
/

indextts2-api

Running

App Files Community

ataberkkilavuzcu committed about 19 hours ago

Commit

16e9700

verified ·

1 Parent(s): 8d3e172

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -17

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import uuid
 from pathlib import Path
 from threading import Lock
 from typing import Dict, Optional
 import requests
 import torch
@@ -28,11 +29,16 @@ MODEL_DIR = os.getenv("MODEL_DIR", "/data/indextts2")
 MAX_TEXT_LENGTH = 1000
 DEFAULT_LANGUAGE = "en"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # Job management
 JOBS: Dict[str, Dict[str, str]] = {}
 JOB_LOCK = Lock()
 # Set token in environment before importing
 if HF_TOKEN:
     os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
@@ -45,7 +51,6 @@ if HF_TOKEN:
 # Download model checkpoints from Hugging Face
 os.makedirs(MODEL_DIR, exist_ok=True)
 try:
     from huggingface_hub import snapshot_download
@@ -62,7 +67,7 @@ except Exception as exc:
     print(f"Warning: Could not download model: {exc}")
     # Continue anyway - model might already be present
-# Initialize IndexTTS2
 try:
     from indextts.infer_v2 import IndexTTS2
@@ -72,14 +77,56 @@ try:
             f"Config file not found at {cfg_path}. Model may not be downloaded."
         )
     tts_model = IndexTTS2(
         cfg_path=cfg_path,
         model_dir=MODEL_DIR,
-        use_fp16=False,  # CPU doesn't support FP16
-        use_cuda_kernel=False,  # CPU mode
-        use_deepspeed=False,  # CPU mode
     )
-    print("IndexTTS2 model loaded successfully.")
 except Exception as exc:
     raise RuntimeError(f"Failed to load IndexTTS2 model: {exc}") from exc
@@ -102,8 +149,8 @@ def _require_api_key(x_api_key: Optional[str]):
 def _write_temp_audio_from_url(url: HttpUrl) -> str:
-    """Download audio from URL to temporary file."""
-    response = requests.get(url, stream=True, timeout=30)
     if response.status_code >= 400:
         raise HTTPException(
             status_code=400,
@@ -150,6 +197,8 @@ def _preprocess_audio_wav(
     - convert to mono
     - resample to target_sr
     - peak-normalize to target_peak (avoid clipping)
     """
     wav, sr = torchaudio.load(path)
@@ -204,28 +253,37 @@ def _cleanup_files(*files: str):
 def _run_generate_job(job_id: str, payload: Dict[str, str]):
-    """Background job for TTS generation."""
     speaker_file = None
     output_file = None
     _set_job(job_id, status="processing")
     try:
         speaker_file = _temp_speaker_file(payload["speaker_wav"])
         speaker_file = _preprocess_audio_wav(speaker_file)
         output_file = os.path.join(
             tempfile.gettempdir(),
             f"indextts2-{uuid.uuid4()}.wav"
         )
-        tts_model.infer(
-            spk_audio_prompt=speaker_file,
-            text=payload["text"],
-            output_path=output_file,
-            use_random=False,
-            verbose=False,
-        )
         output_file = _preprocess_audio_wav(output_file)
         if not Path(output_file).exists():
@@ -233,9 +291,13 @@ def _run_generate_job(job_id: str, payload: Dict[str, str]):
                 f"TTS generation failed: output file was not created at {output_file}"
             )
         _cleanup_files(speaker_file)
         _set_job(job_id, status="completed", output_file=output_file)
     except Exception as exc:
         _cleanup_files(speaker_file, output_file)
         _set_job(job_id, status="error", error=str(exc))
@@ -244,7 +306,13 @@ def _run_generate_job(job_id: str, payload: Dict[str, str]):
 def health(x_api_key: Optional[str] = Header(default=None)):
     """Health check endpoint."""
     _require_api_key(x_api_key)
-    return {"status": "ok", "model": "indextts2", "device": DEVICE}
 @app.post("/generate")
@@ -337,6 +405,8 @@ def root():
     """API root with available endpoints."""
     return {
         "name": "indextts2-api",
         "endpoints": [
             "/health",
             "/generate",

 from pathlib import Path
 from threading import Lock
 from typing import Dict, Optional
+import time
 import requests
 import torch
 MAX_TEXT_LENGTH = 1000
 DEFAULT_LANGUAGE = "en"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+USE_GPU = DEVICE == "cuda"
 # Job management
 JOBS: Dict[str, Dict[str, str]] = {}
 JOB_LOCK = Lock()
+# Connection pooling for faster URL downloads
+HTTP_SESSION = requests.Session()
+HTTP_SESSION.headers.update({"User-Agent": "IndexTTS2-API/1.0"})
 # Set token in environment before importing
 if HF_TOKEN:
     os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
 # Download model checkpoints from Hugging Face
 os.makedirs(MODEL_DIR, exist_ok=True)
 try:
     from huggingface_hub import snapshot_download
     print(f"Warning: Could not download model: {exc}")
     # Continue anyway - model might already be present
+# Initialize IndexTTS2 with GPU optimizations if available
 try:
     from indextts.infer_v2 import IndexTTS2
             f"Config file not found at {cfg_path}. Model may not be downloaded."
         )
+    print(f"Loading IndexTTS2 model on {DEVICE}...")
+    load_start = time.time()
     tts_model = IndexTTS2(
         cfg_path=cfg_path,
         model_dir=MODEL_DIR,
+        use_fp16=USE_GPU,  # Enable FP16 on GPU for ~30-40% speedup
+        use_cuda_kernel=USE_GPU,  # Enable CUDA kernels on GPU
+        use_deepspeed=False,  # Keep disabled for stability
     )
+    load_time = time.time() - load_start
+    print(f"IndexTTS2 model loaded successfully in {load_time:.2f}s on {DEVICE}")
+    # Warmup inference to initialize all model components
+    # This moves the initialization cost from first request to startup
+    print("Running warmup inference...")
+    warmup_start = time.time()
+    try:
+        # Create a minimal warmup audio file
+        warmup_audio_path = os.path.join(tempfile.gettempdir(), "warmup.wav")
+        warmup_output_path = os.path.join(tempfile.gettempdir(), "warmup_out.wav")
+        # Generate a short sine wave for warmup (1 second at 24kHz)
+        sample_rate = 24000
+        duration = 1.0
+        t = torch.linspace(0, duration, int(sample_rate * duration))
+        warmup_wav = (0.5 * torch.sin(2 * 3.14159 * 440 * t)).unsqueeze(0)
+        torchaudio.save(warmup_audio_path, warmup_wav, sample_rate)
+        # Run minimal inference with inference_mode for speed
+        with torch.inference_mode():
+            tts_model.infer(
+                spk_audio_prompt=warmup_audio_path,
+                text="Hello.",
+                output_path=warmup_output_path,
+                use_random=False,
+                verbose=False,
+            )
+        # Cleanup warmup files
+        Path(warmup_audio_path).unlink(missing_ok=True)
+        Path(warmup_output_path).unlink(missing_ok=True)
+        warmup_time = time.time() - warmup_start
+        print(f"Warmup complete in {warmup_time:.2f}s - model is ready!")
+    except Exception as warmup_exc:
+        print(f"Warmup failed (non-fatal): {warmup_exc}")
+        # Continue anyway - first request will just be slower
 except Exception as exc:
     raise RuntimeError(f"Failed to load IndexTTS2 model: {exc}") from exc
 def _write_temp_audio_from_url(url: HttpUrl) -> str:
+    """Download audio from URL to temporary file using connection pooling."""
+    response = HTTP_SESSION.get(str(url), stream=True, timeout=30)
     if response.status_code >= 400:
         raise HTTPException(
             status_code=400,
     - convert to mono
     - resample to target_sr
     - peak-normalize to target_peak (avoid clipping)
+    Optimized to minimize disk I/O.
     """
     wav, sr = torchaudio.load(path)
 def _run_generate_job(job_id: str, payload: Dict[str, str]):
+    """Background job for TTS generation with optimizations."""
     speaker_file = None
     output_file = None
     _set_job(job_id, status="processing")
     try:
+        start_time = time.time()
+        # Download/decode speaker audio
         speaker_file = _temp_speaker_file(payload["speaker_wav"])
         speaker_file = _preprocess_audio_wav(speaker_file)
+        prep_time = time.time() - start_time
         output_file = os.path.join(
             tempfile.gettempdir(),
             f"indextts2-{uuid.uuid4()}.wav"
         )
+        # Run inference with torch.inference_mode() for faster execution
+        infer_start = time.time()
+        with torch.inference_mode():
+            tts_model.infer(
+                spk_audio_prompt=speaker_file,
+                text=payload["text"],
+                output_path=output_file,
+                use_random=False,
+                verbose=True,  # Keep verbose for timing info
+            )
+        infer_time = time.time() - infer_start
+        # Post-process output
         output_file = _preprocess_audio_wav(output_file)
         if not Path(output_file).exists():
                 f"TTS generation failed: output file was not created at {output_file}"
             )
+        total_time = time.time() - start_time
+        print(f">> Job {job_id[:8]} completed: prep={prep_time:.2f}s, infer={infer_time:.2f}s, total={total_time:.2f}s")
         _cleanup_files(speaker_file)
         _set_job(job_id, status="completed", output_file=output_file)
     except Exception as exc:
+        print(f">> Job {job_id[:8]} failed: {exc}")
         _cleanup_files(speaker_file, output_file)
         _set_job(job_id, status="error", error=str(exc))
 def health(x_api_key: Optional[str] = Header(default=None)):
     """Health check endpoint."""
     _require_api_key(x_api_key)
+    return {
+        "status": "ok",
+        "model": "indextts2",
+        "device": DEVICE,
+        "gpu_enabled": USE_GPU,
+        "fp16_enabled": USE_GPU,
+    }
 @app.post("/generate")
     """API root with available endpoints."""
     return {
         "name": "indextts2-api",
+        "device": DEVICE,
+        "gpu_enabled": USE_GPU,
         "endpoints": [
             "/health",
             "/generate",