Spaces:

TGPro1
/

S2ST

Running on Zero

App Files Community

TGPro1 committed 27 days ago

Commit

5972208

verified ·

1 Parent(s): 9203a32

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +15 -15

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 try:
     import spaces
 except ImportError:
@@ -27,15 +28,9 @@ from TTS.api import TTS
 # ==========================================
 # 🚀 v137 - HOPPER NATIVE (Transformers + Persistent VRAM)
 # ==========================================
-# Stability Strategy:
-# 1. Revert to 'transformers' pipeline (Native PyTorch kernels for H200).
-# 2. LOAD ONCE, STAY IN VRAM (Singleton Pattern).
-# 3. Force SDPA (Flash Attention) + FP16.
-# 4. Strict GPU-only path inside ZeroGPU context.
 os.environ["COQUI_TOS_AGREED"] = "1"
 os.environ["PYTHONWARNINGS"] = "ignore"
-# Strict CUBLAS stability for H200
 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cudnn.allow_tf32 = False
@@ -53,11 +48,9 @@ def load_gpu_models():
     if MODELS.get("stt") is None:
         print("--- [v137] 📥 LOADING NATIVE WHISPER (Large-v3-Turbo) ---")
         model_id = "openai/whisper-large-v3-turbo"
-        torch_dtype = torch.float16
         # Load model with SDPA (Flash Attention) for H200
         model = AutoModelForSpeechSeq2Seq.from_pretrained(
-            model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
         ).to(device)
         processor = AutoProcessor.from_pretrained(model_id)
@@ -66,7 +59,7 @@ def load_gpu_models():
             model=model,
             tokenizer=processor.tokenizer,
             feature_extractor=processor.feature_extractor,
-            torch_dtype=torch_dtype,
             device=device,
             model_kwargs={"attn_implementation": "sdpa"}
         )
@@ -85,7 +78,6 @@ def core_process(request_dict):
     t1 = time.time()
     try:
-        # Load once and keep in VRAM within the worker life
         load_gpu_models()
         # 🎙️ STT PATH
@@ -96,7 +88,6 @@ def core_process(request_dict):
             try:
                 lang = request_dict.get("lang")
-                # Inference using transformers pipeline
                 result = MODELS["stt"](temp_path, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
                 stt_text = result["text"].strip()
             finally:
@@ -167,7 +158,14 @@ async def api_process(request: Request):
 @app.get("/health")
 def health(): return {"status": "ok", "v": "137"}
-# Gradio interface for debugging
 with gr.Blocks() as demo:
     gr.Markdown("# 🚀 v137 HOPPER NATIVE (H200 Stable)")
     gr.Markdown("Direct GPU path | Transformers Whisper | XTTS-v2 Singleton")
@@ -175,9 +173,11 @@ with gr.Blocks() as demo:
         audio_in = gr.Audio(type="filepath", label="Input Audio")
         stt_btn = gr.Button("STT")
         txt_out = gr.Textbox(label="STT Result")
-    stt_btn.click(fn=lambda x: core_process({"action": "stt", "file": base64.b64encode(open(x, "rb").read()).decode()})["text"], inputs=audio_in, outputs=txt_out)
 app = gr.mount_gradio_app(app, demo, path="/")
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=7860, log_level="warning")

+print("--- [v137-clean] 🚀 BOOTING APP.PY ---")
 try:
     import spaces
 except ImportError:
 # ==========================================
 # 🚀 v137 - HOPPER NATIVE (Transformers + Persistent VRAM)
 # ==========================================
 os.environ["COQUI_TOS_AGREED"] = "1"
 os.environ["PYTHONWARNINGS"] = "ignore"
 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cudnn.allow_tf32 = False
     if MODELS.get("stt") is None:
         print("--- [v137] 📥 LOADING NATIVE WHISPER (Large-v3-Turbo) ---")
         model_id = "openai/whisper-large-v3-turbo"
         # Load model with SDPA (Flash Attention) for H200
         model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True
         ).to(device)
         processor = AutoProcessor.from_pretrained(model_id)
             model=model,
             tokenizer=processor.tokenizer,
             feature_extractor=processor.feature_extractor,
+            torch_dtype=torch.float16,
             device=device,
             model_kwargs={"attn_implementation": "sdpa"}
         )
     t1 = time.time()
     try:
         load_gpu_models()
         # 🎙️ STT PATH
             try:
                 lang = request_dict.get("lang")
                 result = MODELS["stt"](temp_path, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
                 stt_text = result["text"].strip()
             finally:
 @app.get("/health")
 def health(): return {"status": "ok", "v": "137"}
+# Named function for Gradio to avoid lambda schema issues
+def gradio_stt(audio_path):
+    if not audio_path: return ""
+    with open(audio_path, "rb") as f:
+        b64 = base64.b64encode(f.read()).decode()
+    res = core_process({"action": "stt", "file": b64})
+    return res.get("text", f"Error: {res.get('error')}")
 with gr.Blocks() as demo:
     gr.Markdown("# 🚀 v137 HOPPER NATIVE (H200 Stable)")
     gr.Markdown("Direct GPU path | Transformers Whisper | XTTS-v2 Singleton")
         audio_in = gr.Audio(type="filepath", label="Input Audio")
         stt_btn = gr.Button("STT")
         txt_out = gr.Textbox(label="STT Result")
+    stt_btn.click(fn=gradio_stt, inputs=audio_in, outputs=txt_out)
+print("--- [v137-clean] 🔧 MOUNTING GRADIO ---")
 app = gr.mount_gradio_app(app, demo, path="/")
 if __name__ == "__main__":
+    print("--- [v137-clean] 📡 STARTING UVICORN ---")
+    uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info")