Upload 2 files

- app.py +23 -29
- requirements.txt +10 -7
app.py
CHANGED

@@ -37,18 +37,18 @@ STREAMING_CHUNK_SIZE = int(os.environ.get("STREAMING_CHUNK_SIZE", "20"))
 # ============== Model Loading ==============
 def load_model():
     """Load XTTSv2 with all optimizations"""
-
+    # Import inside function to prevent early CUDA initialization
     from TTS.tts.configs.xtts_config import XttsConfig
     from TTS.tts.models.xtts import Xtts
+    from TTS.api import TTS
 
     logger.info("Loading XTTSv2 model...")
 
-    # Check if local model exists
+    # Check if local model exists
     local_config = os.path.join(MODEL_PATH, "config.json")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
 
     if os.path.exists(local_config):
-        # Load local/fine-tuned model
-        logger.info(f"Loading local model from {MODEL_PATH}")
         config = XttsConfig()
         config.load_json(local_config)
         model = Xtts.init_from_config(config)
@@ -59,39 +59,29 @@
             use_deepspeed=USE_DEEPSPEED
         )
     else:
-        # …
-        logger.info("Loading default coqui/XTTS-v2 …
-        …
-        …
-        …
-        …
-        …
-        model …
+        # Reverting to the high-level API for Hub loads as it handles weights better
+        logger.info("Loading default coqui/XTTS-v2 from Hub...")
+        # We use the synthesizer directly to access the model object for optimizations
+        tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+        model = tts.synthesizer.tts_model
+        config = tts.synthesizer.tts_config
+
+        model.to(device)
 
-    # FP16 optimization
     if USE_FP16 and device == "cuda":
         logger.info("Enabling FP16 inference...")
         model.half()
-        # Keep some layers in FP32 for stability
-        if hasattr(model, 'gpt'):
-            model.gpt.float()
 
-    # torch.compile for …
+    # Logic for torch.compile (requires Triton for some features)
    if USE_TORCH_COMPILE and hasattr(torch, 'compile'):
-        logger.info("Applying torch.compile()...")
         try:
-            …
-            …
-            …
-                mode="reduce-overhead",
-                fullgraph=False
-            )
+            # We only compile the GPT part as it's the bottleneck
+            model.gpt = torch.compile(model.gpt, mode="reduce-overhead")
+            logger.info("GPT compiled successfully.")
         except Exception as e:
-            logger.warning(f"torch.compile failed: {e}")
+            logger.warning(f"torch.compile failed, skipping: {e}")
 
     model.eval()
-    logger.info(f"Model loaded on {device}")
-
     return model, config, device
 
 # Global model instance
@@ -264,9 +254,13 @@ def synthesize_streaming(
 
 
 def clear_cache():
-    """Clear speaker cache and CUDA memory"""
+    """Clear speaker cache and exhaustively free CUDA memory"""
     speaker_cache.clear()
-    …
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+    return "Cache and VRAM cleared!"
 
 
 # ============== Gradio Interface ==============
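For reference, the new Hub-loading path can be exercised outside the app. The sketch below only mirrors the calls added in this commit (TTS.api.TTS, synthesizer.tts_model, torch.compile on the GPT module); the script name, the bare hasattr guard, and the final print are illustrative assumptions, not part of app.py.

# smoke_load.py: minimal, standalone sketch of the Hub load path above.
import torch
from TTS.api import TTS

device = "cuda" if torch.cuda.is_available() else "cpu"

# Same model string the commit uses; downloads from the Hub on first run
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
model = tts.synthesizer.tts_model    # raw XTTS module, needed for half()/compile()
config = tts.synthesizer.tts_config

if device == "cuda":
    model.half()                     # FP16 roughly halves VRAM
if hasattr(torch, "compile"):
    # Compile only the autoregressive GPT decoder, the latency bottleneck
    model.gpt = torch.compile(model.gpt, mode="reduce-overhead")

model.eval()
print(f"XTTS v2 ready on {device}")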
requirements.txt
CHANGED

@@ -1,9 +1,12 @@
-# Use the …
-…
-coqui-tts>=0.25.3
+# Use the latest stable Gradio to fix the JSON Schema / additionalProperties bug
+gradio>=5.9.1
 
-# …
-…
+# The PyPI package for Coqui TTS is 'tts'
+# We pin versions of transformers/tokenizers because XTTS is sensitive to their breaking changes
+tts==0.22.0
+transformers<=4.43.3
+tokenizers<=0.19.1
 
-# …
-…
+# High-performance inference
+deepspeed>=0.14.0
+huggingface_hub