Upload app.py with huggingface_hub
app.py CHANGED
@@ -1,4 +1,4 @@
-# 🚀 V125: …
+# 🚀 V126: ZEROGPU HOPPER ROBUST (HYBRID ENGINE)
 try:
     import spaces
 except ImportError:
@@ -9,9 +9,6 @@ except ImportError:
         return f
 
 import gradio as gr
-from fastapi import FastAPI, Request
-from fastapi.middleware.cors import CORSMiddleware
-import uvicorn
 import base64
 import torch
 import os
@@ -21,10 +18,13 @@ import time
 import gc
 import traceback
 import soundfile as sf
-from huggingface_hub import snapshot_download
 from transformers import pipeline
 
-# 🛡️ 0. …
+# 🛡️ 0. ENV & MONKEYPATCH (v126)
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" # Stability for MIG
+os.environ["COQUI_TOS_AGREED"] = "1"
+os.environ["PYTHONWARNINGS"] = "ignore"
+
 import torchaudio
 def torchaudio_load_safe(filepath, **kwargs):
     data, sr = sf.read(filepath)
@@ -33,46 +33,39 @@ def torchaudio_load_safe(filepath, **kwargs):
     return tensor, sr
 torchaudio.load = torchaudio_load_safe
 
-#
-import logging
-logging.getLogger("transformers").setLevel(logging.ERROR)
-os.environ["COQUI_TOS_AGREED"] = "1"
-os.environ["PYTHONWARNINGS"] = "ignore"
-
-# 📦 2. GLOBAL MODELS (LAZY LOAD)
+# 📦 1. GLOBAL MODELS (LAZY LOAD)
 MODELS = {"stt": None, "tts": None}
 
-# 🛠️ …
+# 🛠️ 2. CORE PROCESSING (v126: GPU-STT + CPU-TTS)
+# Since XTTS keeps crashing the CUDA context on H200, we move it to CPU.
+# Whisper remains on GPU as it is fully stable and incredibly fast.
+
 @spaces.GPU(duration=120)
 def core_process(request_dict):
     global MODELS
     action = request_dict.get("action")
-    print(f"--- [v125] …
+    print(f"--- [v126] 🛠️ HYBRID ENGINE: {action} ---")
     t1 = time.time()
 
     try:
-        #
+        # GPU PATH: Whisper Large-v3-Turbo
         if action in ["stt", "s2st"] and MODELS["stt"] is None:
-            print("🎙️ Loading Whisper Turbo (v3) [float32]...")
-            model_id = "openai/whisper-large-v3-turbo"
+            print("🎙️ Loading Whisper Turbo (v3) [GPU: float32]...")
             MODELS["stt"] = pipeline(
                 "automatic-speech-recognition",
-                model=model_id,
+                model="openai/whisper-large-v3-turbo",
                 torch_dtype=torch.float32,
                 device="cuda"
            )
 
-        #
+        # CPU PATH: XTTS-v2 (Zero-Crash Stability)
         if action in ["tts", "s2st"] and MODELS["tts"] is None:
-            print("🔊 Loading XTTS-v2 …
+            print("🔊 Loading XTTS-v2 [CPU Path]...")
            from TTS.api import TTS
-            # …
-            tt = TTS(…)
-            print("⚙️ Moving XTTS to CUDA [float32]...")
-            tt.to("cuda") # Manually move. Default is float32.
-            MODELS["tts"] = tt
+            # Running on CPU avoids the persistent cublasSgemm crashes on H200
+            MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
 
-        # 🛠️ …
+        # 🛠️ Execution Logic
         if action == "stt":
            audio_bytes = base64.b64decode(request_dict.get("file"))
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
@@ -114,7 +107,6 @@ def core_process(request_dict):
        try:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
                out_p = out_f.name
-                # v125: Force context to avoid any automatic half-precision casting
                MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_wav_path)
                with open(out_p, "rb") as f: res = {"audio": base64.b64encode(f.read()).decode()}
        finally:
@@ -132,41 +124,41 @@ def core_process(request_dict):
            from deep_translator import GoogleTranslator
            target = request_dict.get("target_lang") or "en"
            trans_t = GoogleTranslator(source='auto', target=target).translate(stt_t)
+            # TTS is already on CPU, so we call it directly
            t_res = core_process.__wrapped__({"action": "tts", "text": trans_t, "lang": target, "speaker_wav": request_dict.get("speaker_wav")})
            res = {"text": stt_t, "translated": trans_t, "audio": t_res.get("audio")}
+        elif action == "health":
+            res = {"status": "awake", "v": "126", "gpu": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None"}
        else: res = {"error": "Invalid action"}
 
    except Exception as e:
-        print(f"❌ [v125] …
+        print(f"❌ [v126] ERROR: {traceback.format_exc()}")
        res = {"error": str(e)}
    finally:
-        print(f"--- [v125] …
+        print(f"--- [v126] ✨ DONE ({time.time()-t1:.1f}s) ---")
        gc.collect()
        if torch.cuda.is_available(): torch.cuda.empty_cache()
    return res
 
-# 🚀 …
-
-app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
-
-@app.post("/api/v1/process")
-async def api_process(request: Request):
+# 🌐 3. GRADIO INTERFACE (v126)
+def handle_api(req_json):
    try:
-        data = …
-
-        return …
-
-
-
-def health(): return {"status": "ok", "v": "125"}
-
-def gradio_fn(req_json):
-    try: return json.dumps(core_process(json.loads(req_json)))
-    except Exception as e: return json.dumps({"error": str(e)})
+        data = json.loads(req_json)
+        # Direct return for health to avoid GPU trigger if not needed
+        if data.get("action") == "health": return json.dumps({"status": "awake", "v": "126"})
+        return json.dumps(core_process(data))
+    except Exception as e:
+        return json.dumps({"error": str(e)})
 
-demo = gr.Interface(
-    …
-    …
+demo = gr.Interface(
+    fn=handle_api,
+    inputs="text",
+    outputs="text",
+    title="🚀 AI Engine v126 (Hopper Robust)",
+    description="STT (GPU) | Translation | TTS (CPU-Fallthrough)"
+)
 
 if __name__ == "__main__":
-    …
+    demo.queue()
+    # demo.launch handles the server and port binding automatically/robustly on HF
+    demo.launch(server_name="0.0.0.0", server_port=7860)
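
Two notes on the diff above. First, the hunk boundaries hide the middle of the torchaudio shim (lines 31-32 of the new file). A minimal sketch of the complete monkeypatch, assuming the hidden lines only reshape soundfile's output into the (channels, frames) float32 tensor that torchaudio callers expect; the reshaping logic is an assumption, not taken from the commit:

import soundfile as sf
import torch
import torchaudio

def torchaudio_load_safe(filepath, **kwargs):
    # soundfile returns a (frames,) or (frames, channels) float64 numpy array
    data, sr = sf.read(filepath)
    tensor = torch.from_numpy(data).float()  # cast to float32
    if tensor.dim() == 1:
        tensor = tensor.unsqueeze(0)  # mono: (frames,) -> (1, frames)
    else:
        tensor = tensor.t()  # (frames, channels) -> (channels, frames)
    return tensor, sr

torchaudio.load = torchaudio_load_safe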
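Second, because gr.Interface exposes handle_api as a plain text-in/text-out endpoint, a client drives the whole engine by exchanging JSON strings. A sketch using gradio_client; the Space id (user/ai-engine-v126) and the file names are placeholders, and /predict is the default api_name a gr.Interface registers:

import base64
import json
from gradio_client import Client

client = Client("user/ai-engine-v126")  # hypothetical Space id

# Liveness check: handle_api answers "health" without waking the GPU
print(client.predict(json.dumps({"action": "health"}), api_name="/predict"))

# Speech-to-speech translation: base64 WAV in; transcript, translation,
# and base64 WAV out
with open("input.wav", "rb") as f:
    payload = {
        "action": "s2st",
        "file": base64.b64encode(f.read()).decode(),
        "target_lang": "en",
    }
result = json.loads(client.predict(json.dumps(payload), api_name="/predict"))

if "error" in result:
    print("engine error:", result["error"])
else:
    print("transcript:", result["text"])
    print("translation:", result["translated"])
    with open("output.wav", "wb") as wav_out:
        wav_out.write(base64.b64decode(result["audio"]))

Routing health through the JSON layer rather than a dedicated HTTP route is what lets the v126 design drop FastAPI and uvicorn entirely: the short-circuit in handle_api returns before core_process runs, so a monitor can poll the Space without triggering the @spaces.GPU allocation.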