fosters committed · verified
Commit 5df429e · 1 Parent(s): ce8d2b9

Upload app.py

Files changed (1): app.py (+186 -73)
app.py CHANGED
@@ -1,6 +1,11 @@
 """
-Alternative XTTSv2 loader - loads fine-tuned model from Hugging Face Hub
-Use this if your model is hosted on HF Hub instead of locally in the Space
+Optimized XTTSv2 Hugging Face Space
+- DeepSpeed acceleration
+- FP16 inference
+- torch.compile() optimization
+- Speaker latent caching
+- Streaming inference
+- Memory optimization
 """

 import gradio as gr
@@ -8,8 +13,10 @@ import torch
 import os
 import gc
 import hashlib
+import tempfile
 import numpy as np
-from huggingface_hub import snapshot_download
+from pathlib import Path
+from functools import lru_cache
 from typing import Optional, Tuple
 import logging

@@ -17,42 +24,43 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 # ============== Configuration ==============
-# Change this to your HF Hub model repo
-HF_MODEL_REPO = os.environ.get("HF_MODEL_REPO", "your-username/your-xtts-finetuned")
-USE_DEEPSPEED = os.environ.get("USE_DEEPSPEED", "true").lower() == "true"
+MODEL_PATH = os.environ.get("MODEL_PATH", "./model")
+USE_DEEPSPEED = os.environ.get("USE_DEEPSPEED", "false").lower() == "true"  # Disabled by default for stability
 USE_FP16 = os.environ.get("USE_FP16", "true").lower() == "true"
-USE_TORCH_COMPILE = os.environ.get("USE_TORCH_COMPILE", "true").lower() == "true"
-MAX_CACHE_SIZE = int(os.environ.get("MAX_CACHE_SIZE", "10"))
+USE_TORCH_COMPILE = os.environ.get("USE_TORCH_COMPILE", "false").lower() == "true"  # Disabled by default for stability
+MAX_CACHE_SIZE = int(os.environ.get("MAX_CACHE_SIZE", "10"))  # Max cached speakers
 STREAMING_CHUNK_SIZE = int(os.environ.get("STREAMING_CHUNK_SIZE", "20"))

 # ============== Model Loading ==============
-def download_and_load_model():
-    """Download model from HF Hub and load with optimizations"""
+def load_model():
+    """Load XTTSv2 with all optimizations"""
+    from TTS.api import TTS
     from TTS.tts.configs.xtts_config import XttsConfig
     from TTS.tts.models.xtts import Xtts

-    logger.info(f"Downloading model from {HF_MODEL_REPO}...")
+    logger.info("Loading XTTSv2 model...")

-    # Download model files from HF Hub
-    model_path = snapshot_download(
-        repo_id=HF_MODEL_REPO,
-        allow_patterns=["*.pth", "*.json", "*.txt", "vocab.*"],
-        local_dir="./model",
-        local_dir_use_symlinks=False
-    )
-
-    logger.info(f"Model downloaded to {model_path}")
-
-    config = XttsConfig()
-    config.load_json(os.path.join(model_path, "config.json"))
+    # Check if local model exists, otherwise use default from HF Hub
+    local_config = os.path.join(MODEL_PATH, "config.json")

-    model = Xtts.init_from_config(config)
-    model.load_checkpoint(
-        config,
-        checkpoint_dir=model_path,
-        eval=True,
-        use_deepspeed=USE_DEEPSPEED
-    )
+    if os.path.exists(local_config):
+        # Load local/fine-tuned model
+        logger.info(f"Loading local model from {MODEL_PATH}")
+        config = XttsConfig()
+        config.load_json(local_config)
+        model = Xtts.init_from_config(config)
+        model.load_checkpoint(
+            config,
+            checkpoint_dir=MODEL_PATH,
+            eval=True,
+            use_deepspeed=USE_DEEPSPEED
+        )
+    else:
+        # Load default XTTS-v2 from Hugging Face Hub via TTS API
+        logger.info("Loading default coqui/XTTS-v2 model from Hugging Face Hub...")
+        tts_api = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=torch.cuda.is_available())
+        model = tts_api.synthesizer.tts_model
+        config = tts_api.synthesizer.tts_config

     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = model.to(device)
@@ -61,6 +69,7 @@ def download_and_load_model():
     if USE_FP16 and device == "cuda":
         logger.info("Enabling FP16 inference...")
         model.half()
+        # Keep some layers in FP32 for stability
         if hasattr(model, 'gpt'):
             model.gpt.float()

@@ -83,22 +92,26 @@ def download_and_load_model():
     return model, config, device

 # Global model instance
-model, config, device = download_and_load_model()
+model, config, device = load_model()

 # ============== Speaker Caching ==============
 class SpeakerCache:
+    """LRU cache for speaker embeddings with hash-based keys"""
+
     def __init__(self, max_size: int = 10):
         self.max_size = max_size
         self.cache = {}
         self.order = []

     def _hash_audio(self, audio_path: str) -> str:
+        """Create hash from audio file for cache key"""
         with open(audio_path, 'rb') as f:
             return hashlib.md5(f.read()).hexdigest()[:16]

     def get(self, audio_path: str) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
         key = self._hash_audio(audio_path)
         if key in self.cache:
+            # Move to end (most recently used)
             self.order.remove(key)
             self.order.append(key)
             return self.cache[key]
@@ -106,12 +119,15 @@ class SpeakerCache:

     def set(self, audio_path: str, latents: Tuple[torch.Tensor, torch.Tensor]):
         key = self._hash_audio(audio_path)
+
+        # Evict oldest if at capacity
         if len(self.cache) >= self.max_size and key not in self.cache:
             oldest = self.order.pop(0)
             del self.cache[oldest]
             gc.collect()
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
+
         self.cache[key] = latents
         if key not in self.order:
             self.order.append(key)
@@ -128,6 +144,9 @@ speaker_cache = SpeakerCache(max_size=MAX_CACHE_SIZE)
 # ============== Core Functions ==============
 @torch.inference_mode()
 def get_speaker_latents(speaker_wav: str) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Get speaker conditioning with caching"""
+
+    # Check cache first
     cached = speaker_cache.get(speaker_wav)
     if cached is not None:
         logger.info("Using cached speaker latents")
@@ -136,12 +155,13 @@ def get_speaker_latents(speaker_wav: str) -> Tuple[torch.Tensor, torch.Tensor]:
     logger.info("Computing speaker latents...")
     gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
         audio_path=speaker_wav,
-        gpt_cond_len=getattr(config, 'gpt_cond_len', 6),
-        gpt_cond_chunk_len=getattr(config, 'gpt_cond_chunk_len', 3),
-        max_ref_length=getattr(config, 'max_ref_len', 30),
-        sound_norm_refs=getattr(config, 'sound_norm_refs', False),
+        gpt_cond_len=config.gpt_cond_len if hasattr(config, 'gpt_cond_len') else 6,
+        gpt_cond_chunk_len=config.gpt_cond_chunk_len if hasattr(config, 'gpt_cond_chunk_len') else 3,
+        max_ref_length=config.max_ref_len if hasattr(config, 'max_ref_len') else 30,
+        sound_norm_refs=config.sound_norm_refs if hasattr(config, 'sound_norm_refs') else False,
     )

+    # Move to correct device and dtype
     if USE_FP16 and device == "cuda":
         gpt_cond_latent = gpt_cond_latent.half()
         speaker_embedding = speaker_embedding.half()
@@ -162,7 +182,11 @@ def synthesize(
     length_penalty: float = 1.0,
     speed: float = 1.0
 ) -> Optional[Tuple[int, np.ndarray]]:
-    if not text.strip() or not speaker_wav:
+    """Standard synthesis with optimizations"""
+
+    if not text.strip():
+        return None
+    if not speaker_wav:
         return None

     try:
@@ -183,7 +207,8 @@ def synthesize(
         )

         wav = np.array(out["wav"])
-        sample_rate = getattr(config.audio, 'output_sample_rate', 24000)
+        sample_rate = config.audio.output_sample_rate if hasattr(config.audio, 'output_sample_rate') else 24000
+
         return (sample_rate, wav)

     except Exception as e:
@@ -202,6 +227,8 @@ def synthesize_streaming(
     repetition_penalty: float = 5.0,
     speed: float = 1.0
 ):
+    """Streaming synthesis for lower latency"""
+
     if not text.strip() or not speaker_wav:
         return

@@ -222,7 +249,7 @@ def synthesize_streaming(
         enable_text_splitting=True
     )

-    sample_rate = getattr(config.audio, 'output_sample_rate', 24000)
+    sample_rate = config.audio.output_sample_rate if hasattr(config.audio, 'output_sample_rate') else 24000

     for chunk in chunks:
         if chunk is not None:
@@ -234,42 +261,85 @@ def synthesize_streaming(


 def clear_cache():
+    """Clear speaker cache and CUDA memory"""
     speaker_cache.clear()
     return "Cache cleared!"


 # ============== Gradio Interface ==============
 LANGUAGES = [
-    ("English", "en"), ("Spanish", "es"), ("French", "fr"), ("German", "de"),
-    ("Italian", "it"), ("Portuguese", "pt"), ("Polish", "pl"), ("Turkish", "tr"),
-    ("Russian", "ru"), ("Dutch", "nl"), ("Czech", "cs"), ("Arabic", "ar"),
-    ("Chinese", "zh-cn"), ("Japanese", "ja"), ("Hungarian", "hu"), ("Korean", "ko"),
+    ("English", "en"),
+    ("Spanish", "es"),
+    ("French", "fr"),
+    ("German", "de"),
+    ("Italian", "it"),
+    ("Portuguese", "pt"),
+    ("Polish", "pl"),
+    ("Turkish", "tr"),
+    ("Russian", "ru"),
+    ("Dutch", "nl"),
+    ("Czech", "cs"),
+    ("Arabic", "ar"),
+    ("Chinese", "zh-cn"),
+    ("Japanese", "ja"),
+    ("Hungarian", "hu"),
+    ("Korean", "ko"),
     ("Hindi", "hi"),
 ]

-with gr.Blocks(title="🐸 XTTSv2 TTS", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🐸 XTTSv2 Text-to-Speech\nHigh-quality multilingual voice cloning.")
+css = """
+.generate-btn {
+    background: linear-gradient(90deg, #4CAF50 0%, #45a049 100%) !important;
+    border: none !important;
+}
+.generate-btn:hover {
+    background: linear-gradient(90deg, #45a049 0%, #3d8b40 100%) !important;
+}
+footer {visibility: hidden}
+"""
+
+with gr.Blocks(title="🐸 XTTSv2 TTS", css=css, theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🐸 XTTSv2 Text-to-Speech
+
+    High-quality multilingual voice cloning with optimized inference.
+    Upload a reference audio (6+ seconds recommended) and enter your text.
+    """)

     with gr.Tabs():
+        # Standard Tab
         with gr.TabItem("🎙️ Standard"):
             with gr.Row():
-                with gr.Column():
-                    text_input = gr.Textbox(label="Text", placeholder="Enter text...", lines=4)
-                    speaker_wav = gr.Audio(label="Reference Audio", type="filepath")
-                    language = gr.Dropdown(choices=LANGUAGES, value="en", label="Language")
+                with gr.Column(scale=1):
+                    text_input = gr.Textbox(
+                        label="Text to synthesize",
+                        placeholder="Enter text here...",
+                        lines=4,
+                        max_lines=10
+                    )
+                    speaker_wav = gr.Audio(
+                        label="Reference Audio",
+                        type="filepath",
+                        sources=["upload", "microphone"]
+                    )
+                    language = gr.Dropdown(
+                        choices=LANGUAGES,
+                        value="en",
+                        label="Language"
+                    )

-                    with gr.Accordion("Advanced", open=False):
-                        temperature = gr.Slider(0.1, 1.0, value=0.65, label="Temperature")
-                        top_p = gr.Slider(0.1, 1.0, value=0.85, label="Top P")
-                        top_k = gr.Slider(1, 100, value=50, label="Top K")
-                        repetition_penalty = gr.Slider(1.0, 15.0, value=5.0, label="Repetition Penalty")
-                        length_penalty = gr.Slider(0.5, 2.0, value=1.0, label="Length Penalty")
-                        speed = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
+                    with gr.Accordion("Advanced Settings", open=False):
+                        temperature = gr.Slider(0.1, 1.0, value=0.65, step=0.05, label="Temperature")
+                        top_p = gr.Slider(0.1, 1.0, value=0.85, step=0.05, label="Top P")
+                        top_k = gr.Slider(1, 100, value=50, step=1, label="Top K")
+                        repetition_penalty = gr.Slider(1.0, 15.0, value=5.0, step=0.5, label="Repetition Penalty")
+                        length_penalty = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Length Penalty")
+                        speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed")

-                    generate_btn = gr.Button("🔊 Generate", variant="primary")
+                    generate_btn = gr.Button("🔊 Generate Speech", variant="primary", elem_classes=["generate-btn"])

-                with gr.Column():
-                    audio_output = gr.Audio(label="Output")
+                with gr.Column(scale=1):
+                    audio_output = gr.Audio(label="Generated Speech", type="numpy")

             generate_btn.click(
                 fn=synthesize,
@@ -277,28 +347,71 @@ with gr.Blocks(title="🐸 XTTSv2 TTS", theme=gr.themes.Soft()) as demo:
                 outputs=audio_output
             )

-        with gr.TabItem("⚡ Streaming"):
+        # Streaming Tab
+        with gr.TabItem("⚡ Streaming (Low Latency)"):
             with gr.Row():
-                with gr.Column():
-                    text_stream = gr.Textbox(label="Text", lines=4)
-                    speaker_stream = gr.Audio(label="Reference Audio", type="filepath")
-                    lang_stream = gr.Dropdown(choices=LANGUAGES, value="en", label="Language")
-                    stream_btn = gr.Button("⚡ Stream", variant="primary")
+                with gr.Column(scale=1):
+                    text_input_stream = gr.Textbox(
+                        label="Text to synthesize",
+                        placeholder="Enter text here...",
+                        lines=4
+                    )
+                    speaker_wav_stream = gr.Audio(
+                        label="Reference Audio",
+                        type="filepath",
+                        sources=["upload", "microphone"]
+                    )
+                    language_stream = gr.Dropdown(
+                        choices=LANGUAGES,
+                        value="en",
+                        label="Language"
+                    )
+
+                    with gr.Accordion("Advanced Settings", open=False):
+                        temp_stream = gr.Slider(0.1, 1.0, value=0.65, step=0.05, label="Temperature")
+                        top_p_stream = gr.Slider(0.1, 1.0, value=0.85, step=0.05, label="Top P")
+                        top_k_stream = gr.Slider(1, 100, value=50, step=1, label="Top K")
+                        rep_pen_stream = gr.Slider(1.0, 15.0, value=5.0, step=0.5, label="Repetition Penalty")
+                        speed_stream = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed")
+
+                    stream_btn = gr.Button("⚡ Stream Speech", variant="primary")

-                with gr.Column():
-                    audio_stream = gr.Audio(label="Output", streaming=True, autoplay=True)
+                with gr.Column(scale=1):
+                    audio_output_stream = gr.Audio(label="Streaming Output", streaming=True, autoplay=True)

             stream_btn.click(
                 fn=synthesize_streaming,
-                inputs=[text_stream, speaker_stream, lang_stream],
-                outputs=audio_stream
+                inputs=[text_input_stream, speaker_wav_stream, language_stream, temp_stream, top_p_stream, top_k_stream, rep_pen_stream, speed_stream],
+                outputs=audio_output_stream
             )

+        # Settings Tab
         with gr.TabItem("⚙️ Settings"):
-            gr.Markdown(f"**Device**: {device} | **DeepSpeed**: {USE_DEEPSPEED} | **FP16**: {USE_FP16}")
-            clear_btn = gr.Button("🗑️ Clear Cache")
-            status = gr.Textbox(label="Status", interactive=False)
-            clear_btn.click(fn=clear_cache, outputs=status)
+            gr.Markdown(f"""
+            ### Current Configuration
+            - **Device**: {device}
+            - **DeepSpeed**: {'Enabled' if USE_DEEPSPEED else 'Disabled'}
+            - **FP16**: {'Enabled' if USE_FP16 else 'Disabled'}
+            - **torch.compile**: {'Enabled' if USE_TORCH_COMPILE else 'Disabled'}
+            - **Max Cached Speakers**: {MAX_CACHE_SIZE}
+            """)
+
+            clear_cache_btn = gr.Button("🗑️ Clear Speaker Cache")
+            cache_status = gr.Textbox(label="Status", interactive=False)
+
+            clear_cache_btn.click(fn=clear_cache, outputs=cache_status)
+
+            gr.Markdown("""
+            ---
+            **Tips for best results:**
+            - Use clean reference audio with minimal background noise
+            - 6-30 seconds of reference audio works best
+            - Match the language of your text to your reference audio for best quality
+            """)

 if __name__ == "__main__":
-    demo.queue(max_size=10).launch(server_name="0.0.0.0", server_port=7860)
+    demo.queue(max_size=10).launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True
+    )