TGPro1 committed on
Commit
92f88b0
Β·
verified Β·
1 Parent(s): 2b4125e

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +70 -33
app.py CHANGED
@@ -24,6 +24,24 @@ import soundfile as sf
24
  from faster_whisper import WhisperModel
25
 
26
  # πŸ›‘οΈ 0. INFRASTRUCTURE PURIST (v136)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  os.environ["COQUI_TOS_AGREED"] = "1"
28
  os.environ["PYTHONWARNINGS"] = "ignore"
29
  # Strict CUBLAS stability for H200
@@ -32,33 +50,48 @@ torch.backends.cuda.matmul.allow_tf32 = False
32
  torch.backends.cudnn.allow_tf32 = False
33
  torch.use_deterministic_algorithms(False) # Some kernels might need this, but let's keep it flexible
34
 
35
- import torchaudio
36
- def torchaudio_load_safe(filepath, **kwargs):
37
- data, sr = sf.read(filepath)
38
- if len(data.shape) == 1: tensor = torch.from_numpy(data).float().unsqueeze(0)
39
- else: tensor = torch.from_numpy(data).float().transpose(0, 1)
40
- return tensor, sr
41
- torchaudio.load = torchaudio_load_safe
42
 
43
- # πŸ“¦ 1. GLOBAL MODELS (SINGLETON PATTERN)
44
- MODELS = {"stt": None, "tts": None}
45
 
46
  def load_gpu_models():
 
47
  global MODELS
 
 
48
  if MODELS["stt"] is None:
49
- print("πŸŽ™οΈ Loading Faster-Whisper to GPU (Persistent)...")
50
- MODELS["stt"] = WhisperModel("large-v3-turbo", device="cuda", compute_type="float16")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  if MODELS["tts"] is None:
52
- print("πŸ”Š Loading XTTS-v2 to GPU (Persistent)...")
53
- from TTS.api import TTS
54
- MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
55
 
56
- # πŸ› οΈ 2. CORE PROCESSING (v136: NO PAGING, NO JITTER)
57
  @spaces.GPU(duration=120)
58
  def core_process(request_dict):
59
  global MODELS
60
  action = request_dict.get("action")
61
- print(f"--- [v136] πŸ› οΈ PURIST ENGINE: {action} ---")
62
  t1 = time.time()
63
 
64
  try:
@@ -70,10 +103,12 @@ def core_process(request_dict):
70
  audio_bytes = base64.b64decode(request_dict.get("file"))
71
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
72
  f.write(audio_bytes); temp_path = f.name
 
73
  try:
74
  lang = request_dict.get("lang")
75
- segments, _ = MODELS["stt"].transcribe(temp_path, language=lang if lang and len(lang) <= 3 else None, beam_size=1)
76
- stt_text = "".join([s.text for s in segments]).strip()
 
77
  finally:
78
  if os.path.exists(temp_path): os.unlink(temp_path)
79
 
@@ -82,6 +117,8 @@ def core_process(request_dict):
82
  # πŸ”Š TTS PATH
83
  if action in ["tts", "s2st"]:
84
  text = (request_dict.get("text") if action == "tts" else stt_text).strip()
 
 
85
  if action == "s2st":
86
  from deep_translator import GoogleTranslator
87
  target = request_dict.get("target_lang") or "en"
@@ -123,32 +160,32 @@ def core_process(request_dict):
123
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
124
 
125
  except Exception as e:
126
- print(f"❌ [v136] ERROR: {traceback.format_exc()}")
127
  return {"error": str(e)}
128
  finally:
129
- print(f"--- [v136] ✨ DONE ({time.time()-t1:.1f}s) ---")
130
- torch.cuda.empty_cache() # Keep models in VRAM, but clear temp buffers
131
 
132
- # πŸš€ 3. SERVER SETUP
133
- app = FastAPI()
134
- app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
135
-
136
- @app.post("/api/v1/process")
137
  async def api_process(request: Request):
138
  try:
139
  data = await request.json()
140
- if data.get("action") == "health": return {"status": "awake", "v": "136"}
141
  return core_process(data)
142
  except Exception as e: return {"error": str(e)}
143
 
144
  @app.get("/health")
145
- def health(): return {"status": "ok", "v": "136"}
 
 
 
 
 
 
 
 
 
146
 
147
- demo = gr.Interface(
148
- fn=lambda x: json.dumps(core_process(json.loads(x))),
149
- inputs="text", outputs="text", title="πŸš€ AI Engine v136 (Persistent GPU)",
150
- description="H200 Native | Fast-Whisper + XTTS-v2 | Full VRAM Mode"
151
- ).queue()
152
  app = gr.mount_gradio_app(app, demo, path="/")
153
 
154
  if __name__ == "__main__":
 
24
  from faster_whisper import WhisperModel
25
 
26
  # πŸ›‘οΈ 0. INFRASTRUCTURE PURIST (v136)
27
+ import numpy as np
28
+ import uvicorn
29
+ from fastapi import FastAPI, Request
30
+ from fastapi.middleware.cors import CORSMiddleware
31
+ from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
32
+ from TTS.api import TTS
33
+ import gradio as gr
34
+ import json # Added for gradio interface
35
+
36
+ # ==========================================
37
+ # πŸš€ v137 - HOPPER NATIVE (Transformers + Persistent VRAM)
38
+ # ==========================================
39
+ # Stability Strategy:
40
+ # 1. Revert to 'transformers' pipeline (Native PyTorch kernels for H200).
41
+ # 2. LOAD ONCE, STAY IN VRAM (Singleton Pattern).
42
+ # 3. Force SDPA (Flash Attention) + FP16.
43
+ # 4. Strict GPU-only path inside ZeroGPU context.
44
+
45
  os.environ["COQUI_TOS_AGREED"] = "1"
46
  os.environ["PYTHONWARNINGS"] = "ignore"
47
  # Strict CUBLAS stability for H200
 
50
  torch.backends.cudnn.allow_tf32 = False
51
  torch.use_deterministic_algorithms(False) # Some kernels might need this, but let's keep it flexible
52
 
53
+ app = FastAPI()
54
+ app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 
 
 
 
 
55
 
56
+ MODELS = {"stt": None, "tts": None, "processor": None}
 
57
 
58
def load_gpu_models():
    """Persistent loading into GPU VRAM. Only runs once per worker.

    Populates the module-level ``MODELS`` singleton:
      * ``MODELS["stt"]``       — transformers ASR pipeline around Whisper large-v3-turbo.
      * ``MODELS["processor"]`` — the matching ``AutoProcessor`` (tokenizer + feature extractor).
      * ``MODELS["tts"]``       — Coqui XTTS-v2 moved to CUDA.

    Safe to call repeatedly: each model is loaded at most once per process
    and then kept resident in VRAM.
    """
    global MODELS
    device = "cuda"

    if MODELS["stt"] is None:
        print("--- [v137] πŸ“₯ LOADING NATIVE WHISPER (Large-v3-Turbo) ---")
        model_id = "openai/whisper-large-v3-turbo"
        torch_dtype = torch.float16

        # BUGFIX: attn_implementation must be passed to from_pretrained().
        # The previous code passed it via pipeline(model_kwargs=...), which
        # transformers ignores when a model *instance* (not a checkpoint id)
        # is handed to pipeline() — so SDPA was silently never enabled.
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
            attn_implementation="sdpa",
        ).to(device)
        processor = AutoProcessor.from_pretrained(model_id)
        # Fill the declared-but-previously-unused "processor" slot so other
        # code paths can reach the tokenizer/feature extractor directly.
        MODELS["processor"] = processor

        MODELS["stt"] = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            device=device,
        )
        print("--- [v137] βœ… WHISPER LOADED ---")

    if MODELS["tts"] is None:
        print("--- [v137] πŸ“₯ LOADING XTTS (VRAM STABLE) ---")
        MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
        print("--- [v137] βœ… XTTS LOADED ---")
89
 
 
90
  @spaces.GPU(duration=120)
91
  def core_process(request_dict):
92
  global MODELS
93
  action = request_dict.get("action")
94
+ print(f"--- [v137] πŸ› οΈ HOPPER ENGINE: {action} ---")
95
  t1 = time.time()
96
 
97
  try:
 
103
  audio_bytes = base64.b64decode(request_dict.get("file"))
104
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
105
  f.write(audio_bytes); temp_path = f.name
106
+
107
  try:
108
  lang = request_dict.get("lang")
109
+ # Inference using transformers pipeline
110
+ result = MODELS["stt"](temp_path, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
111
+ stt_text = result["text"].strip()
112
  finally:
113
  if os.path.exists(temp_path): os.unlink(temp_path)
114
 
 
117
  # πŸ”Š TTS PATH
118
  if action in ["tts", "s2st"]:
119
  text = (request_dict.get("text") if action == "tts" else stt_text).strip()
120
+ trans_text = text
121
+
122
  if action == "s2st":
123
  from deep_translator import GoogleTranslator
124
  target = request_dict.get("target_lang") or "en"
 
160
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
161
 
162
  except Exception as e:
163
+ print(f"❌ [v137] ERROR: {traceback.format_exc()}")
164
  return {"error": str(e)}
165
  finally:
166
+ print(f"--- [v137] ✨ DONE ({time.time()-t1:.1f}s) ---")
167
+ torch.cuda.empty_cache()
168
 
169
@app.post("/process")
async def api_process(request: Request):
    """Single JSON entry point for the engine.

    Accepts a request dict; a ``{"action": "health"}`` probe is answered
    immediately without touching the GPU path, every other action is
    forwarded to :func:`core_process`.  Any failure — bad JSON included —
    is reported as ``{"error": <message>}`` rather than an HTTP error.
    """
    try:
        payload = await request.json()
        # Lightweight liveness probe — short-circuit before the GPU worker.
        if payload.get("action") == "health":
            return {"status": "awake", "v": "137"}
        return core_process(payload)
    except Exception as e:
        return {"error": str(e)}
176
 
177
@app.get("/health")
def health():
    """Plain HTTP liveness endpoint — no GPU or model access involved."""
    return {"status": "ok", "v": "137"}
179
+
180
# Gradio interface for debugging
with gr.Blocks() as demo:
    gr.Markdown("## v137 HOPPER NATIVE (H200 Stable)")
    with gr.Row():
        audio_in = gr.Audio(type="filepath", label="Input Audio")
        stt_btn = gr.Button("STT")
    txt_out = gr.Textbox(label="STT Result")

    def _debug_stt(path):
        """Encode the uploaded wav as base64 and run the STT action."""
        # BUGFIX: the previous lambda used open(path, "rb").read() without
        # ever closing the handle — a file-descriptor leak on repeated use.
        with open(path, "rb") as fh:
            encoded = base64.b64encode(fh.read()).decode()
        result = core_process({"action": "stt", "file": encoded})
        # Surface backend errors in the textbox instead of raising KeyError
        # when core_process returns {"error": ...} rather than {"text": ...}.
        return result.get("text", result.get("error", ""))

    stt_btn.click(fn=_debug_stt, inputs=audio_in, outputs=txt_out)
188
 
 
 
 
 
 
189
  app = gr.mount_gradio_app(app, demo, path="/")
190
 
191
  if __name__ == "__main__":