Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
|
@@ -10,6 +10,8 @@ import traceback
|
|
| 10 |
import json
|
| 11 |
import time
|
| 12 |
import torchaudio
|
|
|
|
|
|
|
| 13 |
|
| 14 |
# 🛡️ ZeroGPU Support (v69)
|
| 15 |
# CRITICAL: @spaces.GPU MUST only be used on synchronous functions (def, not async def)
|
|
@@ -59,8 +61,8 @@ if not hasattr(torchaudio, "info"):
|
|
| 59 |
|
| 60 |
from df.enhance import enhance, init_df, load_audio, save_audio
|
| 61 |
|
| 62 |
-
# FORCE BUILD TRIGGER:
|
| 63 |
-
#
|
| 64 |
|
| 65 |
# 🛠️ Monkeypatch torchaudio.load
|
| 66 |
try:
|
|
@@ -100,6 +102,15 @@ def load_models():
|
|
| 100 |
print("⚠️ Falling back to CPU (int8)")
|
| 101 |
MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
if MODELS["translate"] is None:
|
| 104 |
print("🌍 Loading Google Translate...")
|
| 105 |
MODELS["translate"] = "active"
|
|
@@ -168,37 +179,64 @@ def _tts_logic(text, lang, speaker_wav_b64):
|
|
| 168 |
"zh": "zh-cn", "zh-cn": "zh-cn", "zh-tw": "zh-cn"
|
| 169 |
}
|
| 170 |
|
|
|
|
|
|
|
|
|
|
| 171 |
if lang:
|
| 172 |
lang_key = lang.strip().lower()
|
| 173 |
-
|
| 174 |
-
# 2. Try the sub-code split match (e.g. 'en-US' -> 'en')
|
| 175 |
-
# 3. Fallback to the original key if not in map
|
| 176 |
-
lang = XTTS_MAP.get(lang_key) or XTTS_MAP.get(lang_key.split('-')[0]) or lang_key
|
| 177 |
-
|
| 178 |
-
print(f"[v79] TTS mapped language: {lang}")
|
| 179 |
-
speaker_wav_path = None
|
| 180 |
-
if speaker_wav_b64:
|
| 181 |
-
sb = base64.b64decode(speaker_wav_b64)
|
| 182 |
-
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
| 183 |
-
f.write(sb)
|
| 184 |
-
speaker_wav_path = f.name
|
| 185 |
-
else:
|
| 186 |
-
speaker_wav_path = "default_speaker.wav"
|
| 187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
try:
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
-
|
| 193 |
-
MODELS["tts"].tts_to_file(text=text, language=lang, file_path=output_path, speaker_wav=speaker_wav_path)
|
| 194 |
|
| 195 |
-
|
| 196 |
-
audio_b64 = base64.b64encode(f.read()).decode()
|
| 197 |
return {"audio": audio_b64}
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
|
| 202 |
|
| 203 |
@spaces.GPU
|
| 204 |
def core_process(request_dict):
|
|
@@ -233,7 +271,12 @@ def core_process(request_dict):
|
|
| 233 |
else:
|
| 234 |
res = {"error": f"Unknown action: {action}"}
|
| 235 |
|
| 236 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
return res
|
| 238 |
|
| 239 |
return {"error": f"Unknown action: {action}"}
|
|
@@ -274,6 +317,9 @@ def gpu_tts_generator(text, lang, speaker_wav_path):
|
|
| 274 |
finally:
|
| 275 |
if speaker_wav_path and "default_speaker" not in speaker_wav_path:
|
| 276 |
if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
|
|
|
|
|
|
|
|
|
|
| 277 |
|
| 278 |
# --- FastAPI Entry Points ---
|
| 279 |
app = FastAPI()
|
|
@@ -314,7 +360,40 @@ async def api_tts_stream(request: Request):
|
|
| 314 |
|
| 315 |
@app.get("/health")
|
| 316 |
def health():
|
| 317 |
-
return {"status": "ok", "gpu": torch.cuda.is_available()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
|
| 319 |
# --- Gradio UI ---
|
| 320 |
def gradio_fn(req_json):
|
|
|
|
| 10 |
import json
|
| 11 |
import time
|
| 12 |
import torchaudio
|
| 13 |
+
import chatterbox_utils
|
| 14 |
+
import gc
|
| 15 |
|
| 16 |
# 🛡️ ZeroGPU Support (v69)
|
| 17 |
# CRITICAL: @spaces.GPU MUST only be used on synchronous functions (def, not async def)
|
|
|
|
| 61 |
|
| 62 |
from df.enhance import enhance, init_df, load_audio, save_audio
|
| 63 |
|
| 64 |
+
# FORCE BUILD TRIGGER: 09:10:00 Jan 21 2026
|
| 65 |
+
# v81: Stability Optimizations (Memory Management + Cache Clearing)
|
| 66 |
|
| 67 |
# 🛠️ Monkeypatch torchaudio.load
|
| 68 |
try:
|
|
|
|
| 102 |
print("⚠️ Falling back to CPU (int8)")
|
| 103 |
MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
|
| 104 |
|
| 105 |
+
# 🧹 Proactive Memory Cleanup
|
| 106 |
+
gc.collect()
|
| 107 |
+
if torch.cuda.is_available():
|
| 108 |
+
torch.cuda.empty_cache()
|
| 109 |
+
|
| 110 |
+
# Initialize Chatterbox ONNX (High-Speed Fallback)
|
| 111 |
+
# This will load the model if not already loaded internally by chatterbox_utils
|
| 112 |
+
chatterbox_utils.load_chatterbox(device="cuda" if torch.cuda.is_available() else "cpu")
|
| 113 |
+
|
| 114 |
if MODELS["translate"] is None:
|
| 115 |
print("🌍 Loading Google Translate...")
|
| 116 |
MODELS["translate"] = "active"
|
|
|
|
| 179 |
"zh": "zh-cn", "zh-cn": "zh-cn", "zh-tw": "zh-cn"
|
| 180 |
}
|
| 181 |
|
| 182 |
+
XTTS_LANG_CODES = set(XTTS_MAP.values())
|
| 183 |
+
|
| 184 |
+
mapped_lang = None
|
| 185 |
if lang:
|
| 186 |
lang_key = lang.strip().lower()
|
| 187 |
+
mapped_lang = XTTS_MAP.get(lang_key) or XTTS_MAP.get(lang_key.split('-')[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
+
print(f"[v80] TTS Request - Original: {lang}, Mapped: {mapped_lang}")
|
| 190 |
+
|
| 191 |
+
# 🛣️ INTELLIGENT ROUTING
|
| 192 |
+
# Case A: XTTS Support (Voice Cloning)
|
| 193 |
+
if mapped_lang and mapped_lang in XTTS_LANG_CODES:
|
| 194 |
+
print(f"[v80] Using XTTS-v2 for '{mapped_lang}'")
|
| 195 |
+
speaker_wav_path = None
|
| 196 |
+
if speaker_wav_b64:
|
| 197 |
+
sb = base64.b64decode(speaker_wav_b64)
|
| 198 |
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
| 199 |
+
f.write(sb)
|
| 200 |
+
speaker_wav_path = f.name
|
| 201 |
+
else:
|
| 202 |
+
speaker_wav_path = "default_speaker.wav"
|
| 203 |
+
|
| 204 |
+
try:
|
| 205 |
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
|
| 206 |
+
output_path = output_file.name
|
| 207 |
+
|
| 208 |
+
# 🎙️ XTTS Inference
|
| 209 |
+
MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=output_path, speaker_wav=speaker_wav_path)
|
| 210 |
+
|
| 211 |
+
with open(output_path, "rb") as f:
|
| 212 |
+
audio_b64 = base64.b64encode(f.read()).decode()
|
| 213 |
+
return {"audio": audio_b64}
|
| 214 |
+
finally:
|
| 215 |
+
if speaker_wav_path and "default_speaker" not in speaker_wav_path:
|
| 216 |
+
if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
|
| 217 |
+
if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
|
| 218 |
+
|
| 219 |
+
# Case B: Chatterbox ONNX Support (High-Quality Fast Fallback)
|
| 220 |
+
print(f"[v80] Using Chatterbox ONNX Fallback for '{lang}'")
|
| 221 |
try:
|
| 222 |
+
# Use local file if available for cloning in Chatterbox too
|
| 223 |
+
temp_ref = None
|
| 224 |
+
if speaker_wav_b64:
|
| 225 |
+
sb = base64.b64decode(speaker_wav_b64)
|
| 226 |
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
| 227 |
+
f.write(sb); temp_ref = f.name
|
| 228 |
+
|
| 229 |
+
# Chatterbox supports codes like 'fi', 'el', 'da', etc.
|
| 230 |
+
chatter_lang = lang.strip().lower().split('-')[0]
|
| 231 |
+
audio_bytes = chatterbox_utils.run_chatterbox_inference(text, chatter_lang, speaker_wav_path=temp_ref)
|
| 232 |
|
| 233 |
+
if temp_ref and os.path.exists(temp_ref): os.unlink(temp_ref)
|
|
|
|
| 234 |
|
| 235 |
+
audio_b64 = base64.b64encode(audio_bytes).decode()
|
|
|
|
| 236 |
return {"audio": audio_b64}
|
| 237 |
+
except Exception as e:
|
| 238 |
+
print(f"❌ Chatterbox Fallback failed: {e}")
|
| 239 |
+
return {"error": f"TTS Failure: '{lang}' not supported by XTTS or Chatterbox."}
|
|
|
|
| 240 |
|
| 241 |
@spaces.GPU
|
| 242 |
def core_process(request_dict):
|
|
|
|
| 271 |
else:
|
| 272 |
res = {"error": f"Unknown action: {action}"}
|
| 273 |
|
| 274 |
+
finally:
|
| 275 |
+
print(f"--- [v81] ✨ SESSION END: {action} ---")
|
| 276 |
+
gc.collect()
|
| 277 |
+
if torch.cuda.is_available():
|
| 278 |
+
torch.cuda.empty_cache()
|
| 279 |
+
|
| 280 |
return res
|
| 281 |
|
| 282 |
return {"error": f"Unknown action: {action}"}
|
|
|
|
| 317 |
finally:
|
| 318 |
if speaker_wav_path and "default_speaker" not in speaker_wav_path:
|
| 319 |
if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
|
| 320 |
+
gc.collect()
|
| 321 |
+
if torch.cuda.is_available():
|
| 322 |
+
torch.cuda.empty_cache()
|
| 323 |
|
| 324 |
# --- FastAPI Entry Points ---
|
| 325 |
app = FastAPI()
|
|
|
|
| 360 |
|
| 361 |
@app.get("/health")
def health():
    """Lightweight liveness probe.

    Reports service status, whether a CUDA device is visible, and the
    current server wall-clock time (human-readable, via time.ctime()).
    """
    payload = {
        "status": "ok",
        "gpu": torch.cuda.is_available(),
        "time": time.ctime(),
    }
    return payload
|
| 364 |
+
|
| 365 |
+
@app.post("/api/v1/clear_cache")
async def clear_cache():
    """Manual deep cleanup of memory and caches.

    Performs, in order: a Python GC pass, a CUDA allocator cache release
    (when a GPU is visible), and a best-effort sweep of leftover temp
    audio files.

    Returns:
        dict: on success — {"status": "success", "cleaned_files": <int>,
        "duration": <str>, "gpu_memory": <str>}; on failure —
        {"status": "error", "message": <str>}. Never raises.
    """
    try:
        t0 = time.time()
        print("🧹 Manual Cache Clearing Triggered...")

        # 1. Python-level garbage collection.
        gc.collect()

        # 2. Release cached CUDA allocations back to the driver.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # 3. Best-effort removal of leftover temp audio files.
        # NOTE(review): the "tm" prefix matches tempfile's default "tmp"
        # prefix but is broader than that — confirm it isn't deleting
        # unrelated files in the shared temp dir.
        temp_dir = tempfile.gettempdir()
        count = 0
        for name in os.listdir(temp_dir):
            if name.endswith(".wav") or name.startswith("tm"):
                try:
                    os.unlink(os.path.join(temp_dir, name))
                    count += 1
                except OSError:
                    # File in use, already gone, or a directory — skip.
                    # (Was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit.)
                    pass

        return {
            "status": "success",
            "cleaned_files": count,
            "duration": f"{time.time()-t0:.2f}s",
            "gpu_memory": f"{torch.cuda.memory_allocated() / 1024**2:.2f}MB" if torch.cuda.is_available() else "N/A"
        }
    except Exception as e:
        # Boundary handler: report the failure to the caller as JSON.
        return {"status": "error", "message": str(e)}
|
| 397 |
|
| 398 |
# --- Gradio UI ---
|
| 399 |
def gradio_fn(req_json):
|