Upload app.py with huggingface_hub
app.py CHANGED
@@ -13,8 +13,9 @@ import torchaudio
 import gc
 import sys
 import types
+from huggingface_hub import snapshot_download
 
-#
+# 🛡️ 1. CRITICAL COMPATIBILITY MONKEYPATCHES
 print("🛠️ Applying compatibility monkeypatches...")
 if "torchaudio.backend" not in sys.modules:
     backend = types.ModuleType("torchaudio.backend")
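The hunk above leans on the standard `sys.modules` shim pattern: register a stub module under the name a dependency still imports. A minimal sketch of that pattern, assuming only what the two visible lines show:

```python
import sys
import types

import torchaudio

# If a dependency still does `import torchaudio.backend` (removed in newer
# torchaudio releases), registering a stub module under that name before the
# dependency loads keeps the import from failing.
if "torchaudio.backend" not in sys.modules:
    backend = types.ModuleType("torchaudio.backend")
    sys.modules["torchaudio.backend"] = backend
    torchaudio.backend = backend  # keep attribute-style access working too

import torchaudio.backend  # resolves to the stub instead of raising ImportError
```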
@@ -54,13 +55,13 @@ try:
 print("✅ Torchaudio patched")
 except Exception as e: print(f"⚠️ Patch failed: {e}")
 
-# 📦 2.
-print("📦
+# 📦 2. TOP-LEVEL IMPORTS (No engine initialization yet)
+print("📦 Importing AI Libraries...")
 import chatterbox_utils
+# Note: We import the classes, but DO NOT instantiate them on the CPU
 from faster_whisper import WhisperModel
 from TTS.api import TTS
-from df.enhance import init_df
-import deep_translator
+from df.enhance import init_df
 print("✅ Imports Complete")
 
 try:
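The new comments describe the rule that drives the rest of this commit: bind classes at import time, build engines on first use. A minimal sketch of that lazy-singleton rule (`_engine` and `get_engine` are illustrative names, not from app.py):

```python
from faster_whisper import WhisperModel  # cheap: binds the class only

_engine = None  # heavyweight model, deliberately NOT built at import time

def get_engine():
    """Build the engine on first use so module import stays CUDA-free."""
    global _engine
    if _engine is None:
        _engine = WhisperModel("large-v3", device="cuda", compute_type="float16")
    return _engine
```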
@@ -73,57 +74,49 @@ except ImportError:
     if f is None: return lambda x: x
     return f
 
-# FORCE BUILD TRIGGER:
-#
+# FORCE BUILD TRIGGER: 11:05:00 Jan 21 2026
+# v91: No-Instance Startup (Resolved CUDA std::system_error)
 
 os.environ["COQUI_TOS_AGREED"] = "1"
+# MODELS starts empty to ensure a clean CUDA handoff
 MODELS = {"stt": None, "translate": None, "tts": None, "denoiser": None}
 
 def activate_gpu_models(action):
-    """
+    """v91: Direct GPU initialization (Safe & Clean)"""
     global MODELS
 
-    # 1. Faster-Whisper GPU
+    # 1. Faster-Whisper: Initialize directly on GPU
     if action in ["stt", "s2st"]:
-
-
-
-
-
-
-
-
-
-
-
+        if MODELS["stt"] is None:
+            print(f"🎙️ [v91] Initializing Whisper directly on GPU for {action}...")
+            # No CPU instance should exist at this point
+            MODELS["stt"] = WhisperModel(
+                "large-v3",
+                device="cuda",
+                compute_type="float16"
+            )
+            print("✨ Whisper GPU Engine Ready")
+        elif MODELS["stt"].model.device != "cuda":
+            # This case shouldn't happen with No-Instance Startup, but for safety:
+            print("⚠️ Switching Whisper to GPU...")
+            del MODELS["stt"]
             gc.collect()
-
-
-
-
-        MODELS["stt"] = WhisperModel(
-            "large-v3",
-            device="cuda",
-            device_index=0,
-            compute_type="int8_float16", # Better stability on H100/H200 MIG
-            cpu_threads=4,
-            num_workers=1
-        )
-        print("✨ Whisper Activated on GPU")
-    except Exception as e:
-        print(f"❌ Whisper GPU fail: {e}. Falling back to CPU in-session.")
-        MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
-
-    # 2. XTTS-v2 GPU Activation
+            torch.cuda.empty_cache()
+            MODELS["stt"] = WhisperModel("large-v3", device="cuda", compute_type="float16")
+
+    # 2. XTTS-v2: Initialize directly on GPU
     if action in ["tts", "s2st"]:
         if MODELS["tts"] is None:
-
-
-
-
-
-            MODELS["tts"].
-
+            print(f"🚀 [v91] Initializing XTTS directly on GPU for {action}...")
+            MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
+            print("✨ XTTS GPU Engine Ready")
+        else:
+            try:
+                current_dev = str(next(MODELS["tts"].synthesizer.tts_model.parameters()).device)
+                if "cuda" not in current_dev:
+                    print("🔄 Moving XTTS-v2 to GPU...")
+                    MODELS["tts"].to("cuda")
+            except: MODELS["tts"].to("cuda")
 
     # 3. Helpers
     if MODELS["denoiser"] is None:
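The `else` branch infers the XTTS device from its parameters, since a `torch.nn.Module` carries no single `.device` attribute. The same check in isolation (`model_device` is a name introduced here):

```python
import torch

def model_device(model: torch.nn.Module) -> str:
    """Report the device of a module's first parameter."""
    try:
        return str(next(model.parameters()).device)
    except StopIteration:
        return "cpu"  # parameterless module; assume CPU

# A fresh Linear layer lives on the CPU until explicitly moved.
layer = torch.nn.Linear(4, 4)
assert model_device(layer) == "cpu"
```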
@@ -133,17 +126,30 @@ def activate_gpu_models(action):
     chatterbox_utils.load_chatterbox(device="cuda" if torch.cuda.is_available() else "cpu")
 
 def warmup_models():
-    """
-    print("\n🔥 --- SYSTEM STARTUP:
+    """v91: DOWNLOAD ONLY (No engine initialization)"""
+    print("\n🔥 --- SYSTEM STARTUP: NO-INSTANCE WARMUP (v91) ---")
     start = time.time()
     try:
-
-
-
-
+        # 1. Download Whisper (CTranslate2 format)
+        print("📥 Pre-downloading Whisper large-v3 weights...")
+        snapshot_download(repo_id="Systran/faster-whisper-large-v3")
+
+        # 2. Download XTTS-v2
+        print("📥 Pre-downloading XTTS-v2 weights...")
+        snapshot_download(repo_id="coqui/XTTS-v2")
+
+        # 3. Download DeepFilterNet
+        print("📥 Pre-downloading DeepFilterNet...")
+        # DeepFilterNet downloads usually happen via init_df, but we can try to force it
+        # snapshot_download(repo_id="RVoice/DeepFilterNet3")
+
+        # 4. Chatterbox Warmup
         chatterbox_utils.warmup_chatterbox()
-
-
+
+        print(f"✅ --- STARTUP COMPLETE: DATA ON DISK ({time.time()-start:.2f}s) --- \n")
+        print("⚠️ NOTE: No engine instances created on CPU to prevent CUDA conflicts.")
+    except Exception as e:
+        print(f"⚠️ Startup warning: {e}")
 
 def _stt_logic(request_dict):
     audio_bytes = base64.b64decode(request_dict.get("file"))
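The warmup now only populates the local Hugging Face cache: `snapshot_download` fetches a repo's files to disk and returns the directory, so later constructors load from cache instead of downloading inside the GPU worker. A minimal sketch:

```python
from huggingface_hub import snapshot_download

# Fetch the CTranslate2 Whisper weights into the local HF cache without
# constructing any engine (and therefore without touching CUDA).
weights_dir = snapshot_download(repo_id="Systran/faster-whisper-large-v3")
print(f"Whisper weights cached at: {weights_dir}")
```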
@@ -165,6 +171,7 @@ def _tts_logic(text, lang, speaker_wav_b64):
     XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
     clean_lang = lang.strip().lower().split('-')[0]
     mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
+
     if mapped_lang:
         speaker_wav_path = None
         if speaker_wav_b64:
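The mapping normalizes BCP-47-style tags to XTTS codes: lowercase, drop the region subtag, then look up. The same logic in isolation (abridged map; `normalize_lang` is a name introduced here):

```python
XTTS_MAP = {"en": "en", "de": "de", "zh": "zh-cn"}  # abridged from the full map

def normalize_lang(lang: str):
    """'EN-us' -> 'en', 'zh-TW' -> 'zh-cn', unsupported -> None."""
    clean = lang.strip().lower().split('-')[0]
    return XTTS_MAP.get(clean) or ("zh-cn" if clean == "zh" else None)

assert normalize_lang("EN-us") == "en"
assert normalize_lang("zh-TW") == "zh-cn"
assert normalize_lang("ja") is None
```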
@@ -180,6 +187,7 @@ def _tts_logic(text, lang, speaker_wav_b64):
     finally:
         if speaker_wav_path and "default_speaker" not in speaker_wav_path and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
         if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
+
     try:
         temp_ref = None
         if speaker_wav_b64:
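The `finally` block encodes a small cleanup contract: unlink a temp file only if it exists and is not the bundled default speaker. As a standalone helper (a sketch; the function name is introduced here):

```python
import os

def cleanup_temp_wav(path):
    """Unlink a temporary speaker wav, but never the bundled default speaker."""
    if path and "default_speaker" not in path and os.path.exists(path):
        os.unlink(path)

# Safe on all inputs: None, missing files, and the default speaker are no-ops.
cleanup_temp_wav(None)
cleanup_temp_wav("/tmp/does_not_exist.wav")
```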
@@ -195,7 +203,7 @@ def _tts_logic(text, lang, speaker_wav_b64):
 def core_process(request_dict):
     action = request_dict.get("action")
     t0 = time.time()
-    print(f"--- [
+    print(f"--- [v91] 🚀 GPU SESSION START: {action} ---")
     activate_gpu_models(action)
     try:
         if action == "stt": res = _stt_logic(request_dict)
@@ -209,7 +217,7 @@ def core_process(request_dict):
     elif action == "health": res = {"status": "awake"}
     else: res = {"error": f"Unknown action: {action}"}
     finally:
-        print(f"--- [
+        print(f"--- [v91] ✨ END: {action} ({time.time()-t0:.2f}s) ---")
         gc.collect()
         if torch.cuda.is_available(): torch.cuda.empty_cache()
     return res
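Every request ends with the same teardown whether it succeeded or failed: drop Python references, then release cached CUDA blocks back to the driver. As a standalone helper (a sketch; the name is introduced here):

```python
import gc
import torch

def release_gpu_memory():
    """Collect unreachable objects, then free cached CUDA allocator blocks."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Typically called in a `finally:` so teardown runs even after failures.
release_gpu_memory()
```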
|