Spaces:

ataberkkilavuzcu
/

indextts2-api

Running

App Files Files Community

ataberkkilavuzcu committed on 3 days ago

Commit

b71bca4

verified ·

1 Parent(s): 5fc2568

Update app.py

Browse files

Files changed (1) hide show

app.py +99 -131

app.py CHANGED Viewed

@@ -8,8 +8,6 @@ from typing import Dict, Optional
 import requests
 import torch
-import torchaudio
-from torchaudio.transforms import Resample
 from fastapi import BackgroundTasks, Body, FastAPI, Header, HTTPException
 from fastapi.responses import FileResponse, JSONResponse
 from pydantic import BaseModel, Field, HttpUrl
@@ -23,8 +21,7 @@ HF_TOKEN = (
 )
 # Model configuration
-OPENVOICE_REPO = "myshell-ai/OpenVoiceV2"
-MODEL_DIR = os.getenv("MODEL_DIR", "/data/openvoice")
 MAX_TEXT_LENGTH = 1000
 DEFAULT_LANGUAGE = "en"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -37,67 +34,66 @@ JOB_LOCK = Lock()
 if HF_TOKEN:
     os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
     os.environ["HF_TOKEN"] = HF_TOKEN
-    try:
-        from huggingface_hub import login
-        login(token=HF_TOKEN, add_to_git_credential=False)
-    except ImportError:
-        pass
-# Download model checkpoints from Hugging Face
 os.makedirs(MODEL_DIR, exist_ok=True)
 try:
-    from huggingface_hub import snapshot_download
-    # Download OpenVoice model if not already present
-    if not Path(MODEL_DIR, "converter").exists():
-        print(f"Downloading OpenVoice model from {OPENVOICE_REPO}...")
         snapshot_download(
-            repo_id=OPENVOICE_REPO,
             local_dir=MODEL_DIR,
             token=HF_TOKEN,
         )
-        print("OpenVoice model download complete.")
-except Exception as exc:
-    print(f"Warning: Could not download model: {exc}")
-    # Continue anyway - model might already be present
-# Initialize OpenVoice
-try:
     from openvoice import se_extractor
-    from openvoice.api import BaseSpeakerTTS, ToneColorConverter
-    # Initialize base TTS model (MeloTTS)
-    ckpt_converter = os.path.join(MODEL_DIR, "converter")
-    if not Path(ckpt_converter).exists():
-        raise FileNotFoundError(
-            f"Converter checkpoint not found at {ckpt_converter}. Model may not be downloaded."
-        )
-    # Initialize TTS and Tone Color Converter
-    base_speaker_tts = BaseSpeakerTTS(
-        f'{MODEL_DIR}/base_speakers/EN/config.json',
-        device=DEVICE
-    )
     tone_color_converter = ToneColorConverter(
         f'{ckpt_converter}/config.json',
         device=DEVICE
     )
-    # Load source speaker embedding (default voice)
-    source_se = torch.load(
-        f'{MODEL_DIR}/base_speakers/EN/en_default_se.pth',
-        map_location=DEVICE
-    )
-    print("OpenVoice model loaded successfully.")
 except Exception as exc:
-    raise RuntimeError(f"Failed to load OpenVoice model: {exc}") from exc
 # Initialize FastAPI app
-app = FastAPI(title="openvoice-api", version="1.0.0")
 class GenerateRequest(BaseModel):
@@ -154,47 +150,6 @@ def _temp_speaker_file(speaker_wav: str) -> str:
     return _write_temp_audio_from_base64(speaker_wav)
-def _preprocess_audio_wav(
-    path: str,
-    target_sr: int = 24000,
-    target_peak: float = 0.98,
-    min_duration: float = 3.0
-) -> str:
-    """
-    Preprocess audio for optimal voice cloning:
-    - convert to mono
-    - resample to target_sr
-    - peak-normalize to target_peak (avoid clipping)
-    - ensure minimum duration (OpenVoice works better with 3-10s audio)
-    """
-    wav, sr = torchaudio.load(path)
-    # Convert to mono
-    if wav.shape[0] > 1:
-        wav = wav.mean(dim=0, keepdim=True)
-    # Resample if needed
-    if sr != target_sr:
-        resampler = Resample(orig_freq=sr, new_freq=target_sr)
-        wav = resampler(wav)
-        sr = target_sr
-    # Check duration (OpenVoice recommends 3-10 seconds)
-    duration = wav.shape[1] / sr
-    if duration < min_duration:
-        print(f"Warning: Reference audio is {duration:.2f}s. OpenVoice works best with 3-10s audio.")
-    # Peak normalize
-    peak = wav.abs().max().item() if wav.numel() else 0.0
-    if peak > 0:
-        scale = min(target_peak / peak, 1.0)
-        wav = wav * scale
-    # Overwrite input file to avoid extra temp files
-    torchaudio.save(path, wav, sr, bits_per_sample=16)
-    return path
 def _set_job(job_id: str, **kwargs):
     """Thread-safe job update."""
     with JOB_LOCK:
@@ -226,9 +181,9 @@ def _cleanup_files(*files: str):
 def _run_generate_job(job_id: str, payload: Dict[str, str]):
     """
-    Background job for TTS generation using OpenVoice.
     Two-step process:
-    1. Generate base speech with BaseSpeakerTTS
     2. Apply target voice characteristics with ToneColorConverter
     """
     speaker_file = None
@@ -237,18 +192,7 @@ def _run_generate_job(job_id: str, payload: Dict[str, str]):
     _set_job(job_id, status="processing")
     try:
-        # Step 1: Prepare reference audio and extract speaker embedding
-        speaker_file = _temp_speaker_file(payload["speaker_wav"])
-        speaker_file = _preprocess_audio_wav(speaker_file)
-        # Extract target speaker embedding
-        target_se, _ = se_extractor.get_se(
-            speaker_file,
-            tone_color_converter,
-            vad=True  # Voice activity detection for better extraction
-        )
-        # Step 2: Generate base speech with default voice
         temp_audio = os.path.join(
             tempfile.gettempdir(),
             f"openvoice-temp-{uuid.uuid4()}.wav"
@@ -256,39 +200,66 @@ def _run_generate_job(job_id: str, payload: Dict[str, str]):
         speed = float(payload.get("speed", 1.0))
-        base_speaker_tts.tts(
-            text=payload["text"],
-            output_path=temp_audio,
-            speaker='default',
-            language=payload.get("language", "en").upper(),
             speed=speed
         )
-        # Step 3: Apply target voice characteristics
-        output_file = os.path.join(
-            tempfile.gettempdir(),
-            f"openvoice-{uuid.uuid4()}.wav"
-        )
-        # Encode with watermark (set to False if not needed)
-        encode_message = "@MyShell"
-        tone_color_converter.convert(
-            audio_src_path=temp_audio,
-            src_se=source_se,
-            tgt_se=target_se,
-            output_path=output_file,
-            message=encode_message
-        )
         # Verify output exists
         if not Path(output_file).exists():
             raise RuntimeError(
-                f"TTS generation failed: output file was not created at {output_file}"
             )
-        # Cleanup intermediate files
-        _cleanup_files(speaker_file, temp_audio)
         _set_job(job_id, status="completed", output_file=output_file)
     except Exception as exc:
@@ -304,6 +275,7 @@ def health(x_api_key: Optional[str] = Header(default=None)):
         "status": "ok",
         "model": "openvoice-v2",
         "device": DEVICE,
         "supported_languages": ["en", "es", "fr", "zh", "ja", "ko"]
     }
@@ -317,10 +289,6 @@ def generate(
     """
     Generate speech from text using voice cloning with OpenVoice.
     Returns job information for async processing.
-    OpenVoice uses a two-step process:
-    1. Generate base speech with MeloTTS
-    2. Apply voice characteristics from reference audio
     """
     _require_api_key(x_api_key)
@@ -404,6 +372,7 @@ def root():
         "name": "openvoice-api",
         "version": "2.0.0",
         "model": "OpenVoice V2",
         "endpoints": [
             "/health",
             "/generate",
@@ -411,10 +380,9 @@ def root():
             "/result/{job_id}"
         ],
         "features": [
-            "Voice cloning with 3-10s reference audio",
             "Multi-language support (EN, ES, FR, ZH, JA, KO)",
             "Adjustable speech speed (0.5-2.0x)",
-            "Fast CPU performance (5-10x faster than IndexTTS2)"
         ]
     }

 import requests
 import torch
 from fastapi import BackgroundTasks, Body, FastAPI, Header, HTTPException
 from fastapi.responses import FileResponse, JSONResponse
 from pydantic import BaseModel, Field, HttpUrl
 )
 # Model configuration
+MODEL_DIR = os.getenv("MODEL_DIR", "./checkpoints")
 MAX_TEXT_LENGTH = 1000
 DEFAULT_LANGUAGE = "en"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 if HF_TOKEN:
     os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
     os.environ["HF_TOKEN"] = HF_TOKEN
+# Download and initialize OpenVoice model
 os.makedirs(MODEL_DIR, exist_ok=True)
+print(f"Initializing OpenVoice on {DEVICE}...")
 try:
+    # Download checkpoints if needed
+    if not Path(MODEL_DIR, "checkpoints_v2").exists():
+        print("Downloading OpenVoice V2 checkpoints...")
+        from huggingface_hub import snapshot_download
         snapshot_download(
+            repo_id="myshell-ai/OpenVoice",
             local_dir=MODEL_DIR,
             token=HF_TOKEN,
         )
+        print("Model download complete.")
+    # Import OpenVoice modules
+    from melo.api import TTS
     from openvoice import se_extractor
+    from openvoice.api import ToneColorConverter
+    # Locate the tone color converter checkpoint directory
+    ckpt_converter = f'{MODEL_DIR}/checkpoints_v2/converter'
+    # Initialize tone color converter
     tone_color_converter = ToneColorConverter(
         f'{ckpt_converter}/config.json',
         device=DEVICE
     )
+    tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
+    # Initialize base TTS for English
+    base_speaker_tts = TTS(language='EN', device=DEVICE)
+    base_speaker = base_speaker_tts.hps.data.spk2id['EN-US']
+    print("OpenVoice V2 loaded successfully!")
 except Exception as exc:
+    print(f"Error loading OpenVoice: {exc}")
+    print("Trying alternative initialization...")
+    try:
+        # Fallback: Use simpler initialization
+        from melo.api import TTS
+        base_speaker_tts = TTS(language='EN', device=DEVICE)
+        base_speaker = base_speaker_tts.hps.data.spk2id['EN-US']
+        # No converter available: disable voice cloning and serve base TTS only
+        tone_color_converter = None
+        print("Loaded base TTS only (voice cloning disabled)")
+    except Exception as exc2:
+        raise RuntimeError(f"Failed to load OpenVoice: {exc2}") from exc2
 # Initialize FastAPI app
+app = FastAPI(title="openvoice-api", version="2.0.0")
 class GenerateRequest(BaseModel):
     return _write_temp_audio_from_base64(speaker_wav)
 def _set_job(job_id: str, **kwargs):
     """Thread-safe job update."""
     with JOB_LOCK:
 def _run_generate_job(job_id: str, payload: Dict[str, str]):
     """
+    Background job for TTS generation using OpenVoice V2.
     Two-step process:
+    1. Generate base speech with MeloTTS
     2. Apply target voice characteristics with ToneColorConverter
     """
     speaker_file = None
     _set_job(job_id, status="processing")
     try:
+        # Step 1: Generate base speech
         temp_audio = os.path.join(
             tempfile.gettempdir(),
             f"openvoice-temp-{uuid.uuid4()}.wav"
         speed = float(payload.get("speed", 1.0))
+        base_speaker_tts.tts_to_file(
+            payload["text"],
+            base_speaker,
+            temp_audio,
             speed=speed
         )
+        # Step 2: Apply voice cloning if converter is available
+        if tone_color_converter is not None:
+            try:
+                # Prepare reference audio
+                speaker_file = _temp_speaker_file(payload["speaker_wav"])
+                # Extract target speaker embedding
+                target_se, _ = se_extractor.get_se(
+                    speaker_file,
+                    tone_color_converter,
+                    vad=True
+                )
+                # Get source speaker embedding
+                source_se = torch.load(
+                    f'{MODEL_DIR}/checkpoints_v2/base_speakers/ses/en-us.pth',
+                    map_location=DEVICE
+                )
+                # Apply voice conversion
+                output_file = os.path.join(
+                    tempfile.gettempdir(),
+                    f"openvoice-{uuid.uuid4()}.wav"
+                )
+                tone_color_converter.convert(
+                    audio_src_path=temp_audio,
+                    src_se=source_se,
+                    tgt_se=target_se,
+                    output_path=output_file,
+                    message="@MyShell"
+                )
+                # Cleanup temp audio
+                _cleanup_files(speaker_file, temp_audio)
+            except Exception as convert_error:
+                print(f"Voice conversion failed: {convert_error}")
+                # Fall back to base audio without voice cloning
+                output_file = temp_audio
+                temp_audio = None
+                _cleanup_files(speaker_file)
+        else:
+            # No converter available, use base audio
+            output_file = temp_audio
+            temp_audio = None
         # Verify output exists
         if not Path(output_file).exists():
             raise RuntimeError(
+                f"TTS generation failed: output file was not created"
             )
         _set_job(job_id, status="completed", output_file=output_file)
     except Exception as exc:
         "status": "ok",
         "model": "openvoice-v2",
         "device": DEVICE,
+        "voice_cloning": tone_color_converter is not None,
         "supported_languages": ["en", "es", "fr", "zh", "ja", "ko"]
     }
     """
     Generate speech from text using voice cloning with OpenVoice.
     Returns job information for async processing.
     """
     _require_api_key(x_api_key)
         "name": "openvoice-api",
         "version": "2.0.0",
         "model": "OpenVoice V2",
+        "voice_cloning": tone_color_converter is not None,
         "endpoints": [
             "/health",
             "/generate",
             "/result/{job_id}"
         ],
         "features": [
+            "Voice cloning with 3-10s reference audio" if tone_color_converter else "Base TTS only",
             "Multi-language support (EN, ES, FR, ZH, JA, KO)",
             "Adjustable speech speed (0.5-2.0x)",
+            "Fast CPU performance"
         ]
     }