Upload app.py with huggingface_hub
app.py
CHANGED
@@ -1,4 +1,4 @@
-# 🚀
+# 🚀 V119: ZEROGPU HOPPER RESILIENT (STABILITY OVERRIDE)
 try:
     import spaces
 except ImportError:
@@ -8,10 +8,9 @@ except ImportError:
         if f is None: return lambda x: x
         return f
 
+import gradio as gr
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
-from contextlib import asynccontextmanager
-import gradio as gr
 import uvicorn
 import base64
 import torch
@@ -25,125 +24,58 @@
 import types
 import logging
 import traceback
+from threading import Thread
 from huggingface_hub import snapshot_download, hf_hub_download
+from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
 
-# 🛡️ 1. SILENCE & ENV (
+# 🛡️ 1. SILENCE & ENV (v119)
 logging.getLogger("transformers").setLevel(logging.ERROR)
-logging.getLogger("TTS").setLevel(logging.ERROR)
-os.environ["CT2_VERBOSE"] = "0"
-os.environ["ORT_LOGGING_LEVEL"] = "3"
 os.environ["COQUI_TOS_AGREED"] = "1"
+os.environ["CT2_VERBOSE"] = "0"
 
-#
-
-try:
-    WHISPER_PATH = snapshot_download("Systran/faster-whisper-large-v3")
-    XTTS_PATH = snapshot_download("coqui/XTTS-v2")
-    print("✅ Assets cached on disk.")
-except Exception as e:
-    print(f"⚠️ Pre-download warning: {e}")
-    WHISPER_PATH = "large-v3"
-
-# 🛠️ 3. COMPATIBILITY PATCHES
-if "torchaudio.backend" not in sys.modules:
-    backend = types.ModuleType("torchaudio.backend")
-    common = types.ModuleType("torchaudio.backend.common")
-    try: common.AudioMetaData = torchaudio.AudioMetaData
-    except AttributeError:
-        class AudioMetaData: pass
-        common.AudioMetaData = AudioMetaData
-    backend.common = common
-    sys.modules["torchaudio.backend"] = backend
-    sys.modules["torchaudio.backend.common"] = common
-
-if not hasattr(torchaudio, "info"):
-    def mock_info(filepath, **kwargs):
-        from types import SimpleNamespace
-        import wave
-        try:
-            with wave.open(filepath, "rb") as f:
-                return SimpleNamespace(sample_rate=f.getframerate(), num_frames=f.getnframes(), num_channels=f.getnchannels(), bits_per_sample=f.getsampwidth() * 8, encoding="PCM_S")
-        except: return SimpleNamespace(sample_rate=48000, num_frames=0, num_channels=1)
-    torchaudio.info = mock_info
-
-# 📦 4. AI LIBRARIES
-import chatterbox_utils
-from faster_whisper import WhisperModel
-from TTS.api import TTS
-from df.enhance import init_df
-import deep_translator
-
-# v118: Hopper Steady. Persistent RAM Init. int8 GPU.
-MODELS = {"stt": None, "translate": None, "tts": None, "denoiser": None}
-
-def activate_gpu_models(action):
-    """v118: Robust GPU Promotion"""
-    global MODELS
-
-    if action in ["stt", "s2st"]:
-        stt_on_gpu = False
-        try: stt_on_gpu = MODELS["stt"] is not None and MODELS["stt"].model.device == "cuda"
-        except: pass
-
-        if not stt_on_gpu:
-            print(f"🎙️ [v118] PROMOTE: Whisper (GPU, int8)...")
-            try:
-                gc.collect(); torch.cuda.empty_cache()
-                MODELS["stt"] = WhisperModel(WHISPER_PATH, device="cuda", compute_type="int8", num_workers=1)
-            except Exception as e:
-                print(f"⚠️ GPU STT Fail: {e}")
-                MODELS["stt"] = WhisperModel(WHISPER_PATH, device="cpu", compute_type="int8")
-
-    if action in ["tts", "s2st"]:
-        tts_on_gpu = False
-        try:
-            params = next(MODELS["tts"].synthesizer.tts_model.parameters())
-            tts_on_gpu = "cuda" in str(params.device)
-        except: pass
-
-        if MODELS["tts"] is not None and not tts_on_gpu:
-            print(f"🔊 [v118] PROMOTE: XTTS to GPU...")
-            try: MODELS["tts"].to("cuda")
-            except: pass
-
-    chatterbox_utils.load_chatterbox(device="cpu")
-    if MODELS["denoiser"] is None:
-        try: MODELS["denoiser"] = init_df()
-        except: pass
-    if MODELS["translate"] is None: MODELS["translate"] = "active"
-
-def release_gpu_models():
-    """v118: Graceful Offload"""
-    global MODELS
-    try:
-        if MODELS["stt"] and MODELS["stt"].model.device == "cuda":
-            MODELS["stt"] = WhisperModel(WHISPER_PATH, device="cpu", compute_type="int8", local_files_only=True)
-        if MODELS["tts"]:
-            try: MODELS["tts"].to("cpu")
-            except: pass
-    except: pass
-    gc.collect()
-    if torch.cuda.is_available(): torch.cuda.empty_cache()
+# 📦 2. GLOBAL MODELS (LAZY LOAD)
+MODELS = {"stt": None, "tts": None, "translate": None}
 
+# 🛠️ 3. CORE PROCESSING (v119: STABILITY FIRST)
 @spaces.GPU(duration=150)
 def core_process(request_dict):
+    global MODELS
     action = request_dict.get("action")
-    print(f"--- [
-
+    print(f"--- [v119] 🚀 PROCESSING: {action} ---")
     t1 = time.time()
-
+
     try:
+        # v119: LAZY LOAD INSIDE GPU SESSION (Prevents Startup Hangs)
+        if action in ["stt", "s2st"] and MODELS["stt"] is None:
+            print("🎙️ Loading Whisper (Transformers Pipeline, float16)...")
+            # Using Transformers instead of faster-whisper for MIG stability
+            model_id = "openai/whisper-large-v3"
+            MODELS["stt"] = pipeline(
+                "automatic-speech-recognition",
+                model=model_id,
+                torch_dtype=torch.float16,
+                device="cuda"
+            )
+
+        if action in ["tts", "s2st"] and MODELS["tts"] is None:
+            print("🔊 Loading XTTS-v2 (Native float16)...")
+            from TTS.api import TTS
+            MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
+
+        # 🛠️ Execute Logic
         if action == "stt":
             audio_bytes = base64.b64decode(request_dict.get("file"))
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                 f.write(audio_bytes); temp_path = f.name
             try:
-
-
+                # v119: Transcribe via Transformers
+                result = MODELS["stt"](temp_path, generate_kwargs={"language": request_dict.get("lang")})
+                res = {"text": result["text"].strip()}
            finally:
                 if os.path.exists(temp_path): os.unlink(temp_path)
 
         elif action == "translate":
+            import deep_translator
             res = {"translated": deep_translator.GoogleTranslator(source='auto', target=request_dict.get("target_lang", "en")).translate(request_dict.get("text"))}
 
         elif action == "tts":
@@ -169,67 +101,57 @@ def core_process(request_dict):
             if speaker_wav_path and "default" not in speaker_wav_path and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
             if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
         else:
+            import chatterbox_utils
             audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
             res = {"audio": base64.b64encode(audio_bytes).decode()}
 
         elif action == "s2st":
-
-
-
-
-
-
-
-
-
-
-
-
-            final_res = core_process.__wrapped__({"action": "tts", "text": translated, "lang": target, "speaker_wav": request_dict.get("speaker_wav")})
-            res = {"text": stt_text, "translated": translated, "audio": final_res.get("audio")}
-            finally:
-                if os.path.exists(temp_path): os.unlink(temp_path)
-        else: res = {"error": f"Unknown action: {action}"}
+            print("🔁 Step 1: STT...")
+            s_res = core_process.__wrapped__({**request_dict, "action": "stt"})
+            text = s_res.get("text", "")
+            print(f"🔁 Step 2: Translation to {request_dict.get('target_lang')}...")
+            import deep_translator
+            target = request_dict.get("target_lang")
+            translated = deep_translator.GoogleTranslator(source='auto', target=target).translate(text)
+            print("🔁 Step 3: TTS...")
+            t_res = core_process.__wrapped__({"action": "tts", "text": translated, "lang": target, "speaker_wav": request_dict.get("speaker_wav")})
+            res = {"text": text, "translated": translated, "audio": t_res.get("audio")}
+        else: res = {"error": "Invalid action"}
+
     except Exception as e:
-        print(f"❌
+        print(f"❌ [v119] ERROR: {traceback.format_exc()}")
         res = {"error": str(e)}
     finally:
-        print(f"--- [
-
+        print(f"--- [v119] ✨ FINISHED IN {time.time()-t1:.2f}s ---")
+        # Aggressive memory cleanup for ZeroGPU
+        gc.collect()
+        if torch.cuda.is_available(): torch.cuda.empty_cache()
     return res
 
-
-
-    print("🔥 [v118] RAM Warming...")
-    MODELS["stt"] = WhisperModel(WHISPER_PATH, device="cpu", compute_type="int8")
-    MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
-    chatterbox_utils.warmup_chatterbox()
-    print("✅ [v118] ENGINE READY.")
-    yield
-
-# 🌐 FastAPI
-app = FastAPI(lifespan=lifespan)
+# 🌐 4. SERVER SETUP
+app = FastAPI()
 app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 
 @app.post("/api/v1/process")
 async def api_process(request: Request):
     try:
-
-        if
-        return core_process(
+        data = await request.json()
+        if data.get("action") == "health": return {"status": "awake", "v": "119"}
+        return core_process(data)
     except Exception as e: return {"error": str(e)}
 
 @app.get("/health")
-def health(): return {"status": "ok", "v": "
+def health(): return {"status": "ok", "v": "119"}
 
 def gradio_fn(req_json):
     try: return json.dumps(core_process(json.loads(req_json)))
     except Exception as e: return json.dumps({"error": str(e)})
 
-# Unified UI
-demo = gr.Interface(fn=gradio_fn, inputs="text", outputs="text", title="🌍 AI Engine
+# Unified UI
+demo = gr.Interface(fn=gradio_fn, inputs="text", outputs="text", title="🌍 AI Engine v119")
 demo.queue()
 app = gr.mount_gradio_app(app, demo, path="/")
 
 if __name__ == "__main__":
+    print("🚀 [v119] Starting Resilient Server on Port 7860...")
     uvicorn.run(app, host="0.0.0.0", port=7860, log_level="warning")
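For context on the commit title: an upload like this one is normally produced with the HfApi client from huggingface_hub. A minimal sketch, assuming a hypothetical Space id your-username/your-space and a token available via huggingface-cli login or the HF_TOKEN environment variable:

import os
from huggingface_hub import HfApi

api = HfApi()  # picks up the cached login or HF_TOKEN from the environment
api.upload_file(
    path_or_fileobj="app.py",            # local file to push
    path_in_repo="app.py",               # destination path inside the repo
    repo_id="your-username/your-space",  # hypothetical Space id
    repo_type="space",
    commit_message="Upload app.py with huggingface_hub",
)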
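To exercise the v119 endpoint added in this diff, a client posts a single JSON payload to /api/v1/process. A minimal sketch, assuming the Space is served at a hypothetical *.hf.space URL and using the requests library on the client side; the action, file, and lang keys mirror what core_process reads:

import base64
import requests

URL = "https://your-username-your-space.hf.space/api/v1/process"  # hypothetical host

with open("sample.wav", "rb") as f:  # any short WAV clip
    payload = {
        "action": "stt",                              # or "translate", "tts", "s2st"
        "file": base64.b64encode(f.read()).decode(),  # core_process expects base64-encoded audio
        "lang": "en",                                 # forwarded to generate_kwargs["language"]
    }

res = requests.post(URL, json=payload, timeout=300).json()
print(res.get("text") or res.get("error"))

The same payload, serialized as a JSON string, also works through the mounted Gradio UI, since gradio_fn simply json.loads its text input and calls core_process.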