Spaces:

TGPro1
/

S2ST

Sleeping

App Files Community

TGPro1 committed on Jan 21

Commit

81932e5

verified ·

1 Parent(s): ad3d045

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +44 -35

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# 🚀 V119: ZEROGPU HOPPER RESILIENT (STABILITY OVERRIDE)
 try:
     import spaces
 except ImportError:
@@ -18,48 +18,46 @@ import os
 import tempfile
 import json
 import time
-import torchaudio
 import gc
 import sys
-import types
-import logging
 import traceback
-from threading import Thread
-from huggingface_hub import snapshot_download, hf_hub_download
-from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
-# 🛡️ 1. SILENCE & ENV (v119)
 logging.getLogger("transformers").setLevel(logging.ERROR)
 os.environ["COQUI_TOS_AGREED"] = "1"
-os.environ["CT2_VERBOSE"] = "0"
 # 📦 2. GLOBAL MODELS (LAZY LOAD)
-MODELS = {"stt": None, "tts": None, "translate": None}
-# 🛠️ 3. CORE PROCESSING (v119: STABILITY FIRST)
-@spaces.GPU(duration=150)
 def core_process(request_dict):
     global MODELS
     action = request_dict.get("action")
-    print(f"--- [v119] 🚀 PROCESSING: {action} ---")
     t1 = time.time()
     try:
-        # v119: LAZY LOAD INSIDE GPU SESSION (Prevents Startup Hangs)
         if action in ["stt", "s2st"] and MODELS["stt"] is None:
-            print("🎙️ Loading Whisper (Transformers Pipeline, float16)...")
-            # Using Transformers instead of faster-whisper for MIG stability
-            model_id = "openai/whisper-large-v3"
             MODELS["stt"] = pipeline(
                 "automatic-speech-recognition",
                 model=model_id,
-                torch_dtype=torch.float16,
-                device="cuda"
             )
         if action in ["tts", "s2st"] and MODELS["tts"] is None:
-            print("🔊 Loading XTTS-v2 (Native float16)...")
             from TTS.api import TTS
             MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
         # 🛠️ Execute Logic
@@ -68,8 +66,15 @@ def core_process(request_dict):
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                 f.write(audio_bytes); temp_path = f.name
             try:
-                # v119: Transcribe via Transformers
-                result = MODELS["stt"](temp_path, generate_kwargs={"language": request_dict.get("lang")})
                 res = {"text": result["text"].strip()}
             finally:
                 if os.path.exists(temp_path): os.unlink(temp_path)
@@ -81,7 +86,8 @@ def core_process(request_dict):
         elif action == "tts":
             text = request_dict.get("text")
             XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
-            clean_lang = (request_dict.get("lang") or "en").strip().lower().split('-')[0]
             mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
             if mapped_lang:
@@ -90,8 +96,13 @@ def core_process(request_dict):
                     sb = base64.b64decode(request_dict.get("speaker_wav"))
                     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                         f.write(sb); speaker_wav_path = f.name
-                else: speaker_wav_path = "default_speaker.wav"
                 try:
                     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
                         output_path = output_file.name
@@ -109,9 +120,9 @@ def core_process(request_dict):
             print("🔄 Step 1: STT...")
             s_res = core_process.__wrapped__({**request_dict, "action": "stt"})
             text = s_res.get("text", "")
-            print(f"🔄 Step 2: Translation to {request_dict.get('target_lang')}...")
             import deep_translator
-            target = request_dict.get("target_lang")
             translated = deep_translator.GoogleTranslator(source='auto', target=target).translate(text)
             print("🔄 Step 3: TTS...")
             t_res = core_process.__wrapped__({"action": "tts", "text": translated, "lang": target, "speaker_wav": request_dict.get("speaker_wav")})
@@ -119,11 +130,10 @@ def core_process(request_dict):
         else: res = {"error": "Invalid action"}
     except Exception as e:
-        print(f"❌ [v119] ERROR: {traceback.format_exc()}")
         res = {"error": str(e)}
     finally:
-        print(f"--- [v119] ✨ FINISHED IN {time.time()-t1:.2f}s ---")
-        # Aggressive memory cleanup for ZeroGPU
         gc.collect()
         if torch.cuda.is_available(): torch.cuda.empty_cache()
     return res
@@ -136,22 +146,21 @@ app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], all
 async def api_process(request: Request):
     try:
         data = await request.json()
-        if data.get("action") == "health": return {"status": "awake", "v": "119"}
         return core_process(data)
     except Exception as e: return {"error": str(e)}
 @app.get("/health")
-def health(): return {"status": "ok", "v": "119"}
 def gradio_fn(req_json):
     try: return json.dumps(core_process(json.loads(req_json)))
     except Exception as e: return json.dumps({"error": str(e)})
-# Unified UI
-demo = gr.Interface(fn=gradio_fn, inputs="text", outputs="text", title="🚀 AI Engine v119")
 demo.queue()
 app = gr.mount_gradio_app(app, demo, path="/")
 if __name__ == "__main__":
-    print("🚀 [v119] Starting Resilient Server on Port 7860...")
     uvicorn.run(app, host="0.0.0.0", port=7860, log_level="warning")

+# 🚀 V120: ZEROGPU HOPPER TURBO (FLASH ATTENTION ENABLED)
 try:
     import spaces
 except ImportError:
 import tempfile
 import json
 import time
 import gc
 import sys
 import traceback
+from huggingface_hub import snapshot_download
+from transformers import pipeline
+# 🛡️ 1. SILENCE & ENV (v120)
+import logging
 logging.getLogger("transformers").setLevel(logging.ERROR)
 os.environ["COQUI_TOS_AGREED"] = "1"
+os.environ["PYTHONWARNINGS"] = "ignore"
 # 📦 2. GLOBAL MODELS (LAZY LOAD)
+MODELS = {"stt": None, "tts": None}
+# 🛠️ 3. CORE PROCESSING (v120: FLASH SPEED)
+@spaces.GPU(duration=120)
 def core_process(request_dict):
     global MODELS
     action = request_dict.get("action")
+    print(f"--- [v120] ⚡ HOPPER ACTIVATED: {action} ---")
     t1 = time.time()
     try:
+        # v120: Whisper Large-v3-Turbo + Flash Attention 2 (H200 Optimized)
         if action in ["stt", "s2st"] and MODELS["stt"] is None:
+            print("🎙️ Loading Whisper Turbo (v3) + FlashAttention-2...")
+            model_id = "openai/whisper-large-v3-turbo"
             MODELS["stt"] = pipeline(
                 "automatic-speech-recognition",
                 model=model_id,
+                torch_dtype=torch.bfloat16,
+                device="cuda",
+                model_kwargs={"attn_implementation": "flash_attention_2"}
             )
         if action in ["tts", "s2st"] and MODELS["tts"] is None:
+            print("🔊 Loading XTTS-v2 (Hopper BF16 Optimized)...")
             from TTS.api import TTS
+            # Note: XTTS-v2 doesn't natively support bfloat16 in its loader yet, but we'll use gpu=True
             MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
         # 🛠️ Execute Logic
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                 f.write(audio_bytes); temp_path = f.name
             try:
+                # v120: Optimized Transcription
+                lang = request_dict.get("lang")
+                gen_kwargs = {"language": lang} if lang and len(lang) <= 3 else {}
+                result = MODELS["stt"](
+                    temp_path,
+                    chunk_length_s=30,
+                    batch_size=8,
+                    generate_kwargs=gen_kwargs
+                )
                 res = {"text": result["text"].strip()}
             finally:
                 if os.path.exists(temp_path): os.unlink(temp_path)
         elif action == "tts":
             text = request_dict.get("text")
             XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
+            raw_lang = (request_dict.get("lang") or "en").strip().lower()
+            clean_lang = raw_lang.split('-')[0]
             mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
             if mapped_lang:
                     sb = base64.b64decode(request_dict.get("speaker_wav"))
                     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                         f.write(sb); speaker_wav_path = f.name
+                else:
+                    # Use a default speaker if available, or just use the first available
+                    speaker_wav_path = "default_speaker.wav"
+                    if not os.path.exists(speaker_wav_path):
+                        # Fall back to the internal speaker if the default file is not found
+                        speaker_wav_path = None
                 try:
                     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
                         output_path = output_file.name
             print("🔄 Step 1: STT...")
             s_res = core_process.__wrapped__({**request_dict, "action": "stt"})
             text = s_res.get("text", "")
+            print(f"🔄 Step 2: Translation ({request_dict.get('target_lang')})...")
             import deep_translator
+            target = request_dict.get("target_lang") or "en"
             translated = deep_translator.GoogleTranslator(source='auto', target=target).translate(text)
             print("🔄 Step 3: TTS...")
             t_res = core_process.__wrapped__({"action": "tts", "text": translated, "lang": target, "speaker_wav": request_dict.get("speaker_wav")})
         else: res = {"error": "Invalid action"}
     except Exception as e:
+        print(f"❌ [v120] ERROR: {traceback.format_exc()}")
         res = {"error": str(e)}
     finally:
+        print(f"--- [v120] ✨ FINISHED IN {time.time()-t1:.2f}s ---")
         gc.collect()
         if torch.cuda.is_available(): torch.cuda.empty_cache()
     return res
 async def api_process(request: Request):
     try:
         data = await request.json()
+        if data.get("action") == "health": return {"status": "awake", "v": "120"}
         return core_process(data)
     except Exception as e: return {"error": str(e)}
 @app.get("/health")
+def health(): return {"status": "ok", "v": "120"}
 def gradio_fn(req_json):
     try: return json.dumps(core_process(json.loads(req_json)))
     except Exception as e: return json.dumps({"error": str(e)})
+demo = gr.Interface(fn=gradio_fn, inputs="text", outputs="text", title="🚀 AI Engine v120 (Hopper Turbo)")
 demo.queue()
 app = gr.mount_gradio_app(app, demo, path="/")
 if __name__ == "__main__":
+    print("🚀 [v120] Starting Hopper Turbo Engine on Port 7860...")
     uvicorn.run(app, host="0.0.0.0", port=7860, log_level="warning")