TGPro1 committed on
Commit
71c50e8
·
verified ·
1 Parent(s): c4e65cf

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +31 -37
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # πŸš€ v135: ZEROGPU HOPPER ELITE (FP32 STABILITY)
2
  try:
3
  import spaces
4
  except ImportError:
@@ -23,11 +23,14 @@ import traceback
23
  import soundfile as sf
24
  from faster_whisper import WhisperModel
25
 
26
- # πŸ›‘οΈ 0. INFRASTRUCTURE OPTIMIZATION (v135)
27
  os.environ["COQUI_TOS_AGREED"] = "1"
28
  os.environ["PYTHONWARNINGS"] = "ignore"
29
- os.environ["CT2_CUDA_ALLOW_TF32"] = "1" # Leverage H200 TF32 cores
30
- torch.set_float32_matmul_precision('high')
 
 
 
31
 
32
  import torchaudio
33
  def torchaudio_load_safe(filepath, **kwargs):
@@ -37,47 +40,46 @@ def torchaudio_load_safe(filepath, **kwargs):
37
  return tensor, sr
38
  torchaudio.load = torchaudio_load_safe
39
 
40
- # πŸ“¦ 1. GLOBAL MODELS (LAZY CPU LOAD)
41
  MODELS = {"stt": None, "tts": None}
42
 
43
- def get_tts():
 
 
 
 
44
  if MODELS["tts"] is None:
45
- print("πŸ”Š Pre-loading XTTS-v2 (CPU RAM)...")
46
  from TTS.api import TTS
47
- MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
48
- return MODELS["tts"]
49
 
50
- # πŸ› οΈ 2. CORE PROCESSING (v135: FP32 FOR STABILITY)
51
  @spaces.GPU(duration=120)
52
  def core_process(request_dict):
53
  global MODELS
54
  action = request_dict.get("action")
55
- print(f"--- [v135] πŸ› οΈ ELITE ENGINE: {action} ---")
56
  t1 = time.time()
57
 
58
  try:
59
- # πŸŽ™οΈ STT PATH (Fast-Whisper GPU FP32)
 
 
 
60
  if action in ["stt", "s2st"]:
61
- print("⚑ Promoting STT to GPU (FP32 path)...")
62
- # Force float32 to avoid cublasSgemm alignment errors on H200 drivers
63
- gpu_stt = WhisperModel("large-v3-turbo", device="cuda", compute_type="float32")
64
-
65
  audio_bytes = base64.b64decode(request_dict.get("file"))
66
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
67
  f.write(audio_bytes); temp_path = f.name
68
  try:
69
  lang = request_dict.get("lang")
70
- segments, _ = gpu_stt.transcribe(temp_path, language=lang if lang and len(lang) <= 3 else None, beam_size=1)
71
  stt_text = "".join([s.text for s in segments]).strip()
72
  finally:
73
  if os.path.exists(temp_path): os.unlink(temp_path)
74
- del gpu_stt
75
- gc.collect()
76
- torch.cuda.empty_cache()
77
 
78
  if action == "stt": return {"text": stt_text}
79
 
80
- # πŸ”Š TTS PATH (XTTS GPU)
81
  if action in ["tts", "s2st"]:
82
  text = (request_dict.get("text") if action == "tts" else stt_text).strip()
83
  if action == "s2st":
@@ -89,10 +91,6 @@ def core_process(request_dict):
89
  if len(text) < 2 or not any(c.isalnum() for c in text):
90
  return {"audio": ""} if action == "tts" else {"text": stt_text, "translated": "", "audio": ""}
91
 
92
- print("⚑ Promoting TTS to GPU...")
93
- from TTS.api import TTS
94
- gpu_tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
95
-
96
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
97
  raw_lang = (request_dict.get("lang") if action == "tts" else target).strip().lower()
98
  clean_lang = raw_lang.split('-')[0]
@@ -111,14 +109,11 @@ def core_process(request_dict):
111
  try:
112
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
113
  out_p = out_f.name
114
- gpu_tts.tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_wav_path)
115
  with open(out_p, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode()
116
  finally:
117
  if speaker_wav_path and "default" not in speaker_wav_path and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
118
  if 'out_p' in locals() and os.path.exists(out_p): os.unlink(out_p)
119
- del gpu_tts
120
- gc.collect()
121
- torch.cuda.empty_cache()
122
  else:
123
  import chatterbox_utils
124
  audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
@@ -128,11 +123,11 @@ def core_process(request_dict):
128
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
129
 
130
  except Exception as e:
131
- print(f"❌ [v135] ERROR: {traceback.format_exc()}")
132
  return {"error": str(e)}
133
  finally:
134
- print(f"--- [v135] ✨ DONE ({time.time()-t1:.1f}s) ---")
135
- gc.collect()
136
 
137
  # πŸš€ 3. SERVER SETUP
138
  app = FastAPI()
@@ -142,20 +137,19 @@ app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], all
142
  async def api_process(request: Request):
143
  try:
144
  data = await request.json()
145
- if data.get("action") == "health": return {"status": "awake", "v": "135"}
146
  return core_process(data)
147
  except Exception as e: return {"error": str(e)}
148
 
149
  @app.get("/health")
150
- def health(): return {"status": "ok", "v": "135"}
151
 
152
  demo = gr.Interface(
153
  fn=lambda x: json.dumps(core_process(json.loads(x))),
154
- inputs="text", outputs="text", title="πŸš€ AI Engine v135 (H200 FP32)",
155
- description="Optimized for H200 | GPU STT (FP32) | GPU TTS | Zero-Crash"
156
  ).queue()
157
  app = gr.mount_gradio_app(app, demo, path="/")
158
 
159
  if __name__ == "__main__":
160
- # Simplified entry point for Hugging Face compatibility
161
  uvicorn.run(app, host="0.0.0.0", port=7860, log_level="warning")
 
1
+ # πŸš€ v136: ZEROGPU HOPPER ULTIMATE (PERSISTENT GPU)
2
  try:
3
  import spaces
4
  except ImportError:
 
23
  import soundfile as sf
24
  from faster_whisper import WhisperModel
25
 
26
+ # πŸ›‘οΈ 0. INFRASTRUCTURE PURIST (v136)
27
  os.environ["COQUI_TOS_AGREED"] = "1"
28
  os.environ["PYTHONWARNINGS"] = "ignore"
29
+ # Strict CUBLAS stability for H200
30
+ os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
31
+ torch.backends.cuda.matmul.allow_tf32 = False
32
+ torch.backends.cudnn.allow_tf32 = False
33
+ torch.use_deterministic_algorithms(False) # Some kernels might need this, but let's keep it flexible
34
 
35
  import torchaudio
36
  def torchaudio_load_safe(filepath, **kwargs):
 
40
  return tensor, sr
41
  torchaudio.load = torchaudio_load_safe
42
 
43
+ # πŸ“¦ 1. GLOBAL MODELS (SINGLETON PATTERN)
44
  MODELS = {"stt": None, "tts": None}
45
 
46
+ def load_gpu_models():
47
+ global MODELS
48
+ if MODELS["stt"] is None:
49
+ print("πŸŽ™οΈ Loading Faster-Whisper to GPU (Persistent)...")
50
+ MODELS["stt"] = WhisperModel("large-v3-turbo", device="cuda", compute_type="float16")
51
  if MODELS["tts"] is None:
52
+ print("πŸ”Š Loading XTTS-v2 to GPU (Persistent)...")
53
  from TTS.api import TTS
54
+ MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
 
55
 
56
+ # πŸ› οΈ 2. CORE PROCESSING (v136: NO PAGING, NO JITTER)
57
  @spaces.GPU(duration=120)
58
  def core_process(request_dict):
59
  global MODELS
60
  action = request_dict.get("action")
61
+ print(f"--- [v136] πŸ› οΈ PURIST ENGINE: {action} ---")
62
  t1 = time.time()
63
 
64
  try:
65
+ # Load once and keep in VRAM within the worker life
66
+ load_gpu_models()
67
+
68
+ # πŸŽ™οΈ STT PATH
69
  if action in ["stt", "s2st"]:
 
 
 
 
70
  audio_bytes = base64.b64decode(request_dict.get("file"))
71
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
72
  f.write(audio_bytes); temp_path = f.name
73
  try:
74
  lang = request_dict.get("lang")
75
+ segments, _ = MODELS["stt"].transcribe(temp_path, language=lang if lang and len(lang) <= 3 else None, beam_size=1)
76
  stt_text = "".join([s.text for s in segments]).strip()
77
  finally:
78
  if os.path.exists(temp_path): os.unlink(temp_path)
 
 
 
79
 
80
  if action == "stt": return {"text": stt_text}
81
 
82
+ # πŸ”Š TTS PATH
83
  if action in ["tts", "s2st"]:
84
  text = (request_dict.get("text") if action == "tts" else stt_text).strip()
85
  if action == "s2st":
 
91
  if len(text) < 2 or not any(c.isalnum() for c in text):
92
  return {"audio": ""} if action == "tts" else {"text": stt_text, "translated": "", "audio": ""}
93
 
 
 
 
 
94
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
95
  raw_lang = (request_dict.get("lang") if action == "tts" else target).strip().lower()
96
  clean_lang = raw_lang.split('-')[0]
 
109
  try:
110
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
111
  out_p = out_f.name
112
+ MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_wav_path)
113
  with open(out_p, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode()
114
  finally:
115
  if speaker_wav_path and "default" not in speaker_wav_path and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
116
  if 'out_p' in locals() and os.path.exists(out_p): os.unlink(out_p)
 
 
 
117
  else:
118
  import chatterbox_utils
119
  audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
 
123
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
124
 
125
  except Exception as e:
126
+ print(f"❌ [v136] ERROR: {traceback.format_exc()}")
127
  return {"error": str(e)}
128
  finally:
129
+ print(f"--- [v136] ✨ DONE ({time.time()-t1:.1f}s) ---")
130
+ torch.cuda.empty_cache() # Keep models in VRAM, but clear temp buffers
131
 
132
  # πŸš€ 3. SERVER SETUP
133
  app = FastAPI()
 
137
  async def api_process(request: Request):
138
  try:
139
  data = await request.json()
140
+ if data.get("action") == "health": return {"status": "awake", "v": "136"}
141
  return core_process(data)
142
  except Exception as e: return {"error": str(e)}
143
 
144
  @app.get("/health")
145
+ def health(): return {"status": "ok", "v": "136"}
146
 
147
  demo = gr.Interface(
148
  fn=lambda x: json.dumps(core_process(json.loads(x))),
149
+ inputs="text", outputs="text", title="πŸš€ AI Engine v136 (Persistent GPU)",
150
+ description="H200 Native | Fast-Whisper + XTTS-v2 | Full VRAM Mode"
151
  ).queue()
152
  app = gr.mount_gradio_app(app, demo, path="/")
153
 
154
  if __name__ == "__main__":
 
155
  uvicorn.run(app, host="0.0.0.0", port=7860, log_level="warning")