ataberkkilavuzcu committed on
Commit
edcc9a5
·
verified ·
1 Parent(s): b71bca4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -134
app.py CHANGED
@@ -8,6 +8,8 @@ from typing import Dict, Optional
8
 
9
  import requests
10
  import torch
 
 
11
  from fastapi import BackgroundTasks, Body, FastAPI, Header, HTTPException
12
  from fastapi.responses import FileResponse, JSONResponse
13
  from pydantic import BaseModel, Field, HttpUrl
@@ -21,7 +23,8 @@ HF_TOKEN = (
21
  )
22
 
23
  # Model configuration
24
- MODEL_DIR = os.getenv("MODEL_DIR", "./checkpoints")
 
25
  MAX_TEXT_LENGTH = 1000
26
  DEFAULT_LANGUAGE = "en"
27
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -34,73 +37,60 @@ JOB_LOCK = Lock()
34
  if HF_TOKEN:
35
  os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
36
  os.environ["HF_TOKEN"] = HF_TOKEN
 
 
 
 
 
37
 
38
- # Download and initialize OpenVoice model
39
  os.makedirs(MODEL_DIR, exist_ok=True)
40
 
41
- print(f"Initializing OpenVoice on {DEVICE}...")
42
-
43
  try:
44
- # Download checkpoints if needed
45
- if not Path(MODEL_DIR, "checkpoints_v2").exists():
46
- print("Downloading OpenVoice V2 checkpoints...")
47
- from huggingface_hub import snapshot_download
48
-
49
  snapshot_download(
50
- repo_id="myshell-ai/OpenVoice",
51
  local_dir=MODEL_DIR,
52
  token=HF_TOKEN,
53
  )
54
  print("Model download complete.")
 
 
 
 
 
 
 
55
 
56
- # Import OpenVoice modules
57
- from melo.api import TTS
58
- from openvoice import se_extractor
59
- from openvoice.api import ToneColorConverter
60
-
61
- # Initialize base TTS (MeloTTS)
62
- ckpt_converter = f'{MODEL_DIR}/checkpoints_v2/converter'
63
 
64
- # Initialize tone color converter
65
- tone_color_converter = ToneColorConverter(
66
- f'{ckpt_converter}/config.json',
67
- device=DEVICE
 
 
68
  )
69
- tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
70
-
71
- # Initialize base TTS for English
72
- base_speaker_tts = TTS(language='EN', device=DEVICE)
73
- base_speaker = base_speaker_tts.hps.data.spk2id['EN-US']
74
-
75
- print("OpenVoice V2 loaded successfully!")
76
-
77
  except Exception as exc:
78
- print(f"Error loading OpenVoice: {exc}")
79
- print("Trying alternative initialization...")
80
-
81
- try:
82
- # Fallback: Use simpler initialization
83
- from melo.api import TTS
84
-
85
- base_speaker_tts = TTS(language='EN', device=DEVICE)
86
- base_speaker = base_speaker_tts.hps.data.spk2id['EN-US']
87
-
88
- # Mock converter for basic functionality
89
- tone_color_converter = None
90
- print("Loaded base TTS only (voice cloning disabled)")
91
-
92
- except Exception as exc2:
93
- raise RuntimeError(f"Failed to load OpenVoice: {exc2}") from exc2
94
 
95
  # Initialize FastAPI app
96
- app = FastAPI(title="openvoice-api", version="2.0.0")
97
 
98
 
99
  class GenerateRequest(BaseModel):
100
  text: str = Field(..., min_length=1, max_length=MAX_TEXT_LENGTH)
101
  speaker_wav: str = Field(..., description="HTTPS URL or base64-encoded audio")
102
- language: Optional[str] = Field(DEFAULT_LANGUAGE, description="ISO code: en, es, fr, zh, ja, ko")
103
- speed: Optional[float] = Field(1.0, ge=0.5, le=2.0, description="Speech speed (0.5-2.0)")
104
 
105
 
106
  def _require_api_key(x_api_key: Optional[str]):
@@ -150,6 +140,40 @@ def _temp_speaker_file(speaker_wav: str) -> str:
150
  return _write_temp_audio_from_base64(speaker_wav)
151
 
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  def _set_job(job_id: str, **kwargs):
154
  """Thread-safe job update."""
155
  with JOB_LOCK:
@@ -180,90 +204,39 @@ def _cleanup_files(*files: str):
180
 
181
 
182
  def _run_generate_job(job_id: str, payload: Dict[str, str]):
183
- """
184
- Background job for TTS generation using OpenVoice V2.
185
- Two-step process:
186
- 1. Generate base speech with MeloTTS
187
- 2. Apply target voice characteristics with ToneColorConverter
188
- """
189
  speaker_file = None
190
- temp_audio = None
191
  output_file = None
192
  _set_job(job_id, status="processing")
193
 
194
  try:
195
- # Step 1: Generate base speech
196
- temp_audio = os.path.join(
 
 
197
  tempfile.gettempdir(),
198
- f"openvoice-temp-{uuid.uuid4()}.wav"
199
  )
200
 
201
- speed = float(payload.get("speed", 1.0))
202
-
203
- base_speaker_tts.tts_to_file(
204
- payload["text"],
205
- base_speaker,
206
- temp_audio,
207
- speed=speed
208
  )
209
 
210
- # Step 2: Apply voice cloning if converter is available
211
- if tone_color_converter is not None:
212
- try:
213
- # Prepare reference audio
214
- speaker_file = _temp_speaker_file(payload["speaker_wav"])
215
-
216
- # Extract target speaker embedding
217
- target_se, _ = se_extractor.get_se(
218
- speaker_file,
219
- tone_color_converter,
220
- vad=True
221
- )
222
-
223
- # Get source speaker embedding
224
- source_se = torch.load(
225
- f'{MODEL_DIR}/checkpoints_v2/base_speakers/ses/en-us.pth',
226
- map_location=DEVICE
227
- )
228
-
229
- # Apply voice conversion
230
- output_file = os.path.join(
231
- tempfile.gettempdir(),
232
- f"openvoice-{uuid.uuid4()}.wav"
233
- )
234
-
235
- tone_color_converter.convert(
236
- audio_src_path=temp_audio,
237
- src_se=source_se,
238
- tgt_se=target_se,
239
- output_path=output_file,
240
- message="@MyShell"
241
- )
242
-
243
- # Cleanup temp audio
244
- _cleanup_files(speaker_file, temp_audio)
245
-
246
- except Exception as convert_error:
247
- print(f"Voice conversion failed: {convert_error}")
248
- # Fall back to base audio without voice cloning
249
- output_file = temp_audio
250
- temp_audio = None
251
- _cleanup_files(speaker_file)
252
- else:
253
- # No converter available, use base audio
254
- output_file = temp_audio
255
- temp_audio = None
256
 
257
- # Verify output exists
258
  if not Path(output_file).exists():
259
  raise RuntimeError(
260
- f"TTS generation failed: output file was not created"
261
  )
262
 
 
263
  _set_job(job_id, status="completed", output_file=output_file)
264
-
265
  except Exception as exc:
266
- _cleanup_files(speaker_file, temp_audio, output_file)
267
  _set_job(job_id, status="error", error=str(exc))
268
 
269
 
@@ -271,13 +244,7 @@ def _run_generate_job(job_id: str, payload: Dict[str, str]):
271
  def health(x_api_key: Optional[str] = Header(default=None)):
272
  """Health check endpoint."""
273
  _require_api_key(x_api_key)
274
- return {
275
- "status": "ok",
276
- "model": "openvoice-v2",
277
- "device": DEVICE,
278
- "voice_cloning": tone_color_converter is not None,
279
- "supported_languages": ["en", "es", "fr", "zh", "ja", "ko"]
280
- }
281
 
282
 
283
  @app.post("/generate")
@@ -287,7 +254,7 @@ def generate(
287
  x_api_key: Optional[str] = Header(default=None),
288
  ):
289
  """
290
- Generate speech from text using voice cloning with OpenVoice.
291
  Returns job information for async processing.
292
  """
293
  _require_api_key(x_api_key)
@@ -295,7 +262,7 @@ def generate(
295
  job_id = str(uuid.uuid4())
296
  _set_job(job_id, status="queued")
297
 
298
- # Offload the synthesis to background task
299
  background_tasks.add_task(_run_generate_job, job_id, payload.dict())
300
 
301
  return JSONResponse(
@@ -369,20 +336,11 @@ def job_result(
369
  def root():
370
  """API root with available endpoints."""
371
  return {
372
- "name": "openvoice-api",
373
- "version": "2.0.0",
374
- "model": "OpenVoice V2",
375
- "voice_cloning": tone_color_converter is not None,
376
  "endpoints": [
377
  "/health",
378
  "/generate",
379
  "/status/{job_id}",
380
  "/result/{job_id}"
381
  ],
382
- "features": [
383
- "Voice cloning with 3-10s reference audio" if tone_color_converter else "Base TTS only",
384
- "Multi-language support (EN, ES, FR, ZH, JA, KO)",
385
- "Adjustable speech speed (0.5-2.0x)",
386
- "Fast CPU performance"
387
- ]
388
  }
 
8
 
9
  import requests
10
  import torch
11
+ import torchaudio
12
+ from torchaudio.transforms import Resample
13
  from fastapi import BackgroundTasks, Body, FastAPI, Header, HTTPException
14
  from fastapi.responses import FileResponse, JSONResponse
15
  from pydantic import BaseModel, Field, HttpUrl
 
23
  )
24
 
25
  # Model configuration
26
+ MODEL_REPO = "IndexTeam/IndexTTS-2"
27
+ MODEL_DIR = os.getenv("MODEL_DIR", "/data/indextts2")
28
  MAX_TEXT_LENGTH = 1000
29
  DEFAULT_LANGUAGE = "en"
30
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
37
  if HF_TOKEN:
38
  os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
39
  os.environ["HF_TOKEN"] = HF_TOKEN
40
    # Authenticate the hub client as well (some private repos require an
    # explicit login, not just the env vars set above). Optional dependency:
    # silently skip if huggingface_hub is absent.
    try:
        from huggingface_hub import login
        login(token=HF_TOKEN, add_to_git_credential=False)
    except ImportError:
        pass

# Download model checkpoints from Hugging Face
os.makedirs(MODEL_DIR, exist_ok=True)

try:
    from huggingface_hub import snapshot_download

    # Download model if not already present — config.yaml is used as the
    # sentinel for a completed download.
    if not Path(MODEL_DIR, "config.yaml").exists():
        print(f"Downloading IndexTTS2 model from {MODEL_REPO}...")
        snapshot_download(
            repo_id=MODEL_REPO,
            local_dir=MODEL_DIR,
            token=HF_TOKEN,
        )
        print("Model download complete.")
except Exception as exc:
    # Deliberately non-fatal: the init step below re-checks for the config
    # and raises there if the model is truly missing.
    print(f"Warning: Could not download model: {exc}")
    # Continue anyway - model might already be present
64
+
65
# Initialize IndexTTS2 at import time; a failure here is fatal because every
# /generate job depends on the global `tts_model`.
try:
    from indextts.infer_v2 import IndexTTS2

    cfg_path = os.path.join(MODEL_DIR, "config.yaml")
    if not Path(cfg_path).exists():
        raise FileNotFoundError(
            f"Config file not found at {cfg_path}. Model may not be downloaded."
        )

    # GPU-specific features are disabled; presumably this Space runs on CPU
    # (DEVICE may still be "cuda" if available — verify deployment target).
    tts_model = IndexTTS2(
        cfg_path=cfg_path,
        model_dir=MODEL_DIR,
        use_fp16=False,  # CPU doesn't support FP16
        use_cuda_kernel=False,  # CPU mode
        use_deepspeed=False,  # CPU mode
    )
    print("IndexTTS2 model loaded successfully.")
except Exception as exc:
    # Re-raise with context so the container crash log points at model load.
    raise RuntimeError(f"Failed to load IndexTTS2 model: {exc}") from exc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  # Initialize FastAPI app
87
+ app = FastAPI(title="indextts2-api", version="1.0.0")
88
 
89
 
90
class GenerateRequest(BaseModel):
    """Request body for POST /generate."""

    # Text to synthesize; capped at MAX_TEXT_LENGTH (1000) characters.
    text: str = Field(..., min_length=1, max_length=MAX_TEXT_LENGTH)
    # Reference voice sample: an HTTPS URL to download or base64-encoded audio.
    speaker_wav: str = Field(..., description="HTTPS URL or base64-encoded audio")
    # Optional language hint; defaults to DEFAULT_LANGUAGE ("en").
    language: Optional[str] = Field(DEFAULT_LANGUAGE, description="ISO code, default en")
 
94
 
95
 
96
  def _require_api_key(x_api_key: Optional[str]):
 
140
  return _write_temp_audio_from_base64(speaker_wav)
141
 
142
 
143
def _preprocess_audio_wav(
    path: str,
    target_sr: int = 24000,
    target_peak: float = 0.98
) -> str:
    """
    Normalize a WAV file in place to stabilize embeddings and output quality.

    Performs three light steps: downmix to mono, resample to ``target_sr``,
    and attenuate so the absolute peak does not exceed ``target_peak``
    (quiet audio is never amplified). Rewrites the file as 16-bit PCM and
    returns the same ``path``.
    """
    wav, sr = torchaudio.load(path)

    # Downmix: average every channel into a single mono track.
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)

    # Resample only when the source rate differs from the target.
    if sr != target_sr:
        wav = Resample(orig_freq=sr, new_freq=target_sr)(wav)
        sr = target_sr

    # Peak-normalize downward only (scale capped at 1.0) to avoid clipping.
    peak = wav.abs().max().item() if wav.numel() else 0.0
    if peak > 0:
        wav = wav * min(target_peak / peak, 1.0)

    # Overwrite the input file so no extra temp files need cleanup.
    torchaudio.save(path, wav, sr, bits_per_sample=16)
    return path
175
+
176
+
177
  def _set_job(job_id: str, **kwargs):
178
  """Thread-safe job update."""
179
  with JOB_LOCK:
 
204
 
205
 
206
def _run_generate_job(job_id: str, payload: Dict[str, str]):
    """Background job for TTS generation.

    Scheduled via FastAPI BackgroundTasks: marks the job "processing",
    synthesizes speech cloned from the reference audio, and records either
    "completed" (with the output path) or "error" in the job store.
    """
    speaker_file = None  # temp copy of the reference audio
    output_file = None   # final synthesized WAV, served later by /result
    _set_job(job_id, status="processing")

    try:
        # Materialize the reference voice (URL download or base64 decode),
        # then normalize it in place (mono / resample / peak-limit).
        # Kept as two assignments so a failure inside preprocessing still
        # leaves `speaker_file` set for the cleanup in `except`.
        speaker_file = _temp_speaker_file(payload["speaker_wav"])
        speaker_file = _preprocess_audio_wav(speaker_file)

        output_file = os.path.join(
            tempfile.gettempdir(),
            f"indextts2-{uuid.uuid4()}.wav"
        )

        # Voice-cloned synthesis; use_random=False presumably makes output
        # deterministic for a given (text, reference) pair — TODO confirm
        # against the IndexTTS2 inference API.
        tts_model.infer(
            spk_audio_prompt=speaker_file,
            text=payload["text"],
            output_path=output_file,
            use_random=False,
            verbose=False,
        )

        # Apply the same normalization to the generated audio.
        output_file = _preprocess_audio_wav(output_file)

        # Guard against the model silently producing nothing.
        if not Path(output_file).exists():
            raise RuntimeError(
                f"TTS generation failed: output file was not created at {output_file}"
            )

        # Success: drop the reference temp file; the output file is kept
        # until the client fetches it.
        _cleanup_files(speaker_file)
        _set_job(job_id, status="completed", output_file=output_file)
    except Exception as exc:
        # Best-effort removal of any temp files, then surface the failure
        # through the job store for /status polling.
        _cleanup_files(speaker_file, output_file)
        _set_job(job_id, status="error", error=str(exc))
241
 
242
 
 
244
def health(x_api_key: Optional[str] = Header(default=None)):
    """Health probe: validates the API key, then reports model and device."""
    _require_api_key(x_api_key)
    info = {"status": "ok", "model": "indextts2", "device": DEVICE}
    return info
 
 
 
 
 
 
248
 
249
 
250
  @app.post("/generate")
 
254
  x_api_key: Optional[str] = Header(default=None),
255
  ):
256
  """
257
+ Generate speech from text using voice cloning.
258
  Returns job information for async processing.
259
  """
260
  _require_api_key(x_api_key)
 
262
  job_id = str(uuid.uuid4())
263
  _set_job(job_id, status="queued")
264
 
265
+ # Offload the long-running synthesis so the HTTP request stays fast (<100s)
266
  background_tasks.add_task(_run_generate_job, job_id, payload.dict())
267
 
268
  return JSONResponse(
 
336
def root():
    """API root with available endpoints."""
    available = [
        "/health",
        "/generate",
        "/status/{job_id}",
        "/result/{job_id}",
    ]
    return {
        "name": "indextts2-api",
        "endpoints": available,
    }