Spaces:

TGPro1
/

S2ST

Sleeping

App Files Community

TGPro1 committed on Jan 21

Commit

0c0d892

verified ·

1 Parent(s): a333eb5

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +57 -55

app.py CHANGED Viewed

@@ -10,33 +10,15 @@ import traceback
 import json
 import time
 import torchaudio
-import chatterbox_utils
 import gc
-# 🛡️ BULKY IMPORTS AT TOP-LEVEL (v88 Optimization)
-# Pre-loading these into RAM at startup so they are READY when GPU session starts
-print("📦 Pre-loading AI Engines into RAM...")
-from faster_whisper import WhisperModel
-from TTS.api import TTS
-from df.enhance import init_df, enhance, load_audio, save_audio
-import deep_translator
-print("✅ Imports Complete")
-# 🛡️ ZeroGPU Support
-try:
-    import spaces
-    print("✅ ZeroGPU/Spaces detected")
-except ImportError:
-    print("⚠️ Spaces library not found. Using mock decorator for local run.")
-    class spaces:
-        @staticmethod
-        def GPU(duration=60, f=None):
-            if f is None: return lambda x: x
-            return f
-# 🛠️ Monkeypatch torchaudio.backend (DeepFilterNet compatibility)
 import sys
 import types
 if "torchaudio.backend" not in sys.modules:
     backend = types.ModuleType("torchaudio.backend")
     common = types.ModuleType("torchaudio.backend.common")
@@ -49,9 +31,8 @@ if "torchaudio.backend" not in sys.modules:
     sys.modules["torchaudio.backend"] = backend
     sys.modules["torchaudio.backend.common"] = common
-# 🛡️ Torchaudio Compatibility Fix
 if not hasattr(torchaudio, "info"):
-    print("🛠️ Mocking torchaudio.info for compatibility...")
     def mock_info(filepath, **kwargs):
         from types import SimpleNamespace
         import wave
@@ -68,8 +49,48 @@ if not hasattr(torchaudio, "info"):
              return SimpleNamespace(sample_rate=48000, num_frames=0, num_channels=1)
     torchaudio.info = mock_info
-# FORCE BUILD TRIGGER: 10:30:00 Jan 21 2026
-# v88: Mandatory GPU-Only (STT + TTS). Fast Activation + 150s Duration.
 os.environ["COQUI_TOS_AGREED"] = "1"
@@ -77,22 +98,19 @@ os.environ["COQUI_TOS_AGREED"] = "1"
 MODELS = {"stt": None, "translate": None, "tts": None, "denoiser": None}
 def activate_gpu_models(action):
-    """v88: Fast GPU Movement and Activation"""
     global MODELS
     # 1. Faster-Whisper GPU Activation
     if action in ["stt", "s2st"]:
         if MODELS["stt"] is None or MODELS["stt"].model.device != "cuda":
             print(f"🎙️ Activating Whisper on GPU for {action}...")
-            # We re-init to move to CUDA. Since weights are cached, this is fast.
             MODELS["stt"] = WhisperModel("large-v3", device="cuda", compute_type="float16")
     # 2. XTTS-v2 GPU Activation
     if action in ["tts", "s2st"]:
         if MODELS["tts"] is None:
-            print("🔊 Initializing XTTS to RAM...")
             MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
         try:
             current_dev = str(next(MODELS["tts"].synthesizer.tts_model.parameters()).device)
             if "cuda" not in current_dev:
@@ -108,31 +126,24 @@ def activate_gpu_models(action):
     if MODELS["translate"] is None:
         MODELS["translate"] = "active"
-    # Chatterbox (STAY CPU if no GPU available for it, or use CUDA if ONNX allows)
     chatterbox_utils.load_chatterbox(device="cuda" if torch.cuda.is_available() else "cpu")
-    # 🧹 Mem Cleanup
     gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
 def warmup_models():
-    """PRE-LOAD EVERYTHING INTO SYSTEM RAM (CPU)"""
-    print("\n🔥 --- SYSTEM STARTUP: RESIDENT RAM LOADING (v88) ---")
     start = time.time()
     try:
-        print("📥 Pre-loading Whisper large-v3 to RAM...")
         MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
         print("📥 Pre-loading XTTS-v2 to RAM...")
         MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
         print("📥 Pre-loading DeepFilterNet...")
         try: MODELS["denoiser"] = init_df()
         except: pass
         chatterbox_utils.warmup_chatterbox()
-        print(f"✅ --- SYSTEM READY: MODELS IN RAM ({time.time()-start:.2f}s) --- \n")
     except Exception as e:
         print(f"⚠️ Startup warning: {e}")
@@ -163,7 +174,6 @@ def _tts_logic(text, lang, speaker_wav_b64):
     mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
     if mapped_lang:
-        print(f"[v88] GPU Inference: XTTS-v2 for '{mapped_lang}'")
         speaker_wav_path = None
         if speaker_wav_b64:
             sb = base64.b64decode(speaker_wav_b64)
@@ -180,8 +190,6 @@ def _tts_logic(text, lang, speaker_wav_b64):
             if speaker_wav_path and "default_speaker" not in speaker_wav_path and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
             if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
-    # Fallback path
-    print(f"[v88] Inference: Chatterbox Fallback for '{clean_lang}'")
     try:
         temp_ref = None
         if speaker_wav_b64:
@@ -193,23 +201,17 @@ def _tts_logic(text, lang, speaker_wav_b64):
         return {"audio": base64.b64encode(audio_bytes).decode()}
     except Exception as e: return {"error": f"TTS Failure: {str(e)}"}
-# 🚀 AGGRESSIVE GPU SESSION (150s Duration)
 @spaces.GPU(duration=150)
 def core_process(request_dict):
-    """MANDATORY GPU ENTRY POINT (v88)"""
     action = request_dict.get("action")
     t0 = time.time()
-    print(f"--- [v88] 🚀 GPU SESSION START: {action} ---")
-    # v88 Optimization: Only activate models for current action
     activate_gpu_models(action)
     try:
         if action == "stt": res = _stt_logic(request_dict)
         elif action == "translate": res = {"translated": _translate_logic(request_dict.get("text"), request_dict.get("target_lang", "en"))}
         elif action == "tts": res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
         elif action == "s2st":
-            # Direct GPU Pipeline
             stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
             text = stt_res.get("text", "")
             if not text: return {"error": "No speech detected"}
@@ -219,7 +221,7 @@ def core_process(request_dict):
         elif action == "health": res = {"status": "awake"}
         else: res = {"error": f"Unknown action: {action}"}
     finally:
-        print(f"--- [v88] ✨ SESSION END: {action} ({time.time()-t0:.2f}s) ---")
         gc.collect()
         if torch.cuda.is_available(): torch.cuda.empty_cache()
     return res

 import json
 import time
 import torchaudio
 import gc
 import sys
 import types
+# 🛠️ 1. CRITICAL COMPATIBILITY MONKEYPATCHES (v89)
+# These MUST happen before importing df (DeepFilterNet) or other audio tools
+print("🛠️ Applying compatibility monkeypatches...")
+# Patch torchaudio.backend for DeepFilterNet
 if "torchaudio.backend" not in sys.modules:
     backend = types.ModuleType("torchaudio.backend")
     common = types.ModuleType("torchaudio.backend.common")
     sys.modules["torchaudio.backend"] = backend
     sys.modules["torchaudio.backend.common"] = common
+# Mock torchaudio.info
 if not hasattr(torchaudio, "info"):
     def mock_info(filepath, **kwargs):
         from types import SimpleNamespace
         import wave
              return SimpleNamespace(sample_rate=48000, num_frames=0, num_channels=1)
     torchaudio.info = mock_info
+# Patch torchaudio.load
+try:
+    _orig_load = torchaudio.load
+    def patched_load(filepath, *args, **kwargs):
+        try:
+            return _orig_load(filepath, *args, **kwargs)
+        except ImportError as e:
+            if "torchcodec" in str(e).lower():
+                import soundfile as sf
+                data, samplerate = sf.read(filepath)
+                t = torch.from_numpy(data).float()
+                if len(t.shape) == 1: t = t.unsqueeze(0)
+                else: t = t.T
+                return t, samplerate
+            raise e
+    torchaudio.load = patched_load
+    print("✅ Torchaudio patched")
+except Exception as e:
+    print(f"⚠️ Patch failed: {e}")
+# 📦 2. BULKY IMPORTS (After patches)
+print("📦 Pre-loading AI Engines...")
+import chatterbox_utils
+from faster_whisper import WhisperModel
+from TTS.api import TTS
+from df.enhance import init_df, enhance, load_audio, save_audio
+import deep_translator
+print("✅ Imports Complete")
+# 🛡️ ZeroGPU Support
+try:
+    import spaces
+    print("✅ ZeroGPU/Spaces detected")
+except ImportError:
+    class spaces:
+        @staticmethod
+        def GPU(duration=60, f=None):
+            if f is None: return lambda x: x
+            return f
+# FORCE BUILD TRIGGER: 10:45:00 Jan 21 2026
+# v89: Fixed Import Order (Resolved ModuleNotFoundError)
 os.environ["COQUI_TOS_AGREED"] = "1"
 MODELS = {"stt": None, "translate": None, "tts": None, "denoiser": None}
 def activate_gpu_models(action):
+    """Fast GPU Activation"""
     global MODELS
     # 1. Faster-Whisper GPU Activation
     if action in ["stt", "s2st"]:
         if MODELS["stt"] is None or MODELS["stt"].model.device != "cuda":
             print(f"🎙️ Activating Whisper on GPU for {action}...")
             MODELS["stt"] = WhisperModel("large-v3", device="cuda", compute_type="float16")
     # 2. XTTS-v2 GPU Activation
     if action in ["tts", "s2st"]:
         if MODELS["tts"] is None:
             MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
         try:
             current_dev = str(next(MODELS["tts"].synthesizer.tts_model.parameters()).device)
             if "cuda" not in current_dev:
     if MODELS["translate"] is None:
         MODELS["translate"] = "active"
     chatterbox_utils.load_chatterbox(device="cuda" if torch.cuda.is_available() else "cpu")
     gc.collect()
+    if torch.cuda.is_available(): torch.cuda.empty_cache()
 def warmup_models():
+    """PRE-LOAD MODELS INTO SYSTEM RAM (CPU)"""
+    print("\n🔥 --- SYSTEM STARTUP: RAM LOADING (v89) ---")
     start = time.time()
     try:
+        print("📥 Pre-loading Whisper to RAM...")
         MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
         print("📥 Pre-loading XTTS-v2 to RAM...")
         MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
         print("📥 Pre-loading DeepFilterNet...")
         try: MODELS["denoiser"] = init_df()
         except: pass
         chatterbox_utils.warmup_chatterbox()
+        print(f"✅ --- SYSTEM READY ({time.time()-start:.2f}s) --- \n")
     except Exception as e:
         print(f"⚠️ Startup warning: {e}")
     mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
     if mapped_lang:
         speaker_wav_path = None
         if speaker_wav_b64:
             sb = base64.b64decode(speaker_wav_b64)
             if speaker_wav_path and "default_speaker" not in speaker_wav_path and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
             if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
     try:
         temp_ref = None
         if speaker_wav_b64:
         return {"audio": base64.b64encode(audio_bytes).decode()}
     except Exception as e: return {"error": f"TTS Failure: {str(e)}"}
 @spaces.GPU(duration=150)
 def core_process(request_dict):
     action = request_dict.get("action")
     t0 = time.time()
+    print(f"--- [v89] 🚀 GPU SESSION START: {action} ---")
     activate_gpu_models(action)
     try:
         if action == "stt": res = _stt_logic(request_dict)
         elif action == "translate": res = {"translated": _translate_logic(request_dict.get("text"), request_dict.get("target_lang", "en"))}
         elif action == "tts": res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
         elif action == "s2st":
             stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
             text = stt_res.get("text", "")
             if not text: return {"error": "No speech detected"}
         elif action == "health": res = {"status": "awake"}
         else: res = {"error": f"Unknown action: {action}"}
     finally:
+        print(f"--- [v89] ✨ SESSION END: {action} ({time.time()-t0:.2f}s) ---")
         gc.collect()
         if torch.cuda.is_available(): torch.cuda.empty_cache()
     return res