TGPro1 committed on
Commit
5972208
Β·
verified Β·
1 Parent(s): 9203a32

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +15 -15
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  try:
2
  import spaces
3
  except ImportError:
@@ -27,15 +28,9 @@ from TTS.api import TTS
27
  # ==========================================
28
  # πŸš€ v137 - HOPPER NATIVE (Transformers + Persistent VRAM)
29
  # ==========================================
30
- # Stability Strategy:
31
- # 1. Revert to 'transformers' pipeline (Native PyTorch kernels for H200).
32
- # 2. LOAD ONCE, STAY IN VRAM (Singleton Pattern).
33
- # 3. Force SDPA (Flash Attention) + FP16.
34
- # 4. Strict GPU-only path inside ZeroGPU context.
35
 
36
  os.environ["COQUI_TOS_AGREED"] = "1"
37
  os.environ["PYTHONWARNINGS"] = "ignore"
38
- # Strict CUBLAS stability for H200
39
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
40
  torch.backends.cuda.matmul.allow_tf32 = False
41
  torch.backends.cudnn.allow_tf32 = False
@@ -53,11 +48,9 @@ def load_gpu_models():
53
  if MODELS.get("stt") is None:
54
  print("--- [v137] πŸ“₯ LOADING NATIVE WHISPER (Large-v3-Turbo) ---")
55
  model_id = "openai/whisper-large-v3-turbo"
56
- torch_dtype = torch.float16
57
-
58
  # Load model with SDPA (Flash Attention) for H200
59
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
60
- model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
61
  ).to(device)
62
  processor = AutoProcessor.from_pretrained(model_id)
63
 
@@ -66,7 +59,7 @@ def load_gpu_models():
66
  model=model,
67
  tokenizer=processor.tokenizer,
68
  feature_extractor=processor.feature_extractor,
69
- torch_dtype=torch_dtype,
70
  device=device,
71
  model_kwargs={"attn_implementation": "sdpa"}
72
  )
@@ -85,7 +78,6 @@ def core_process(request_dict):
85
  t1 = time.time()
86
 
87
  try:
88
- # Load once and keep in VRAM within the worker life
89
  load_gpu_models()
90
 
91
  # πŸŽ™οΈ STT PATH
@@ -96,7 +88,6 @@ def core_process(request_dict):
96
 
97
  try:
98
  lang = request_dict.get("lang")
99
- # Inference using transformers pipeline
100
  result = MODELS["stt"](temp_path, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
101
  stt_text = result["text"].strip()
102
  finally:
@@ -167,7 +158,14 @@ async def api_process(request: Request):
167
  @app.get("/health")
168
  def health(): return {"status": "ok", "v": "137"}
169
 
170
- # Gradio interface for debugging
 
 
 
 
 
 
 
171
  with gr.Blocks() as demo:
172
  gr.Markdown("# πŸš€ v137 HOPPER NATIVE (H200 Stable)")
173
  gr.Markdown("Direct GPU path | Transformers Whisper | XTTS-v2 Singleton")
@@ -175,9 +173,11 @@ with gr.Blocks() as demo:
175
  audio_in = gr.Audio(type="filepath", label="Input Audio")
176
  stt_btn = gr.Button("STT")
177
  txt_out = gr.Textbox(label="STT Result")
178
- stt_btn.click(fn=lambda x: core_process({"action": "stt", "file": base64.b64encode(open(x, "rb").read()).decode()})["text"], inputs=audio_in, outputs=txt_out)
179
 
 
180
  app = gr.mount_gradio_app(app, demo, path="/")
181
 
182
  if __name__ == "__main__":
183
- uvicorn.run(app, host="0.0.0.0", port=7860, log_level="warning")
 
 
1
+ print("--- [v137-clean] πŸš€ BOOTING APP.PY ---")
2
  try:
3
  import spaces
4
  except ImportError:
 
28
  # ==========================================
29
  # πŸš€ v137 - HOPPER NATIVE (Transformers + Persistent VRAM)
30
  # ==========================================
 
 
 
 
 
31
 
32
  os.environ["COQUI_TOS_AGREED"] = "1"
33
  os.environ["PYTHONWARNINGS"] = "ignore"
 
34
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
35
  torch.backends.cuda.matmul.allow_tf32 = False
36
  torch.backends.cudnn.allow_tf32 = False
 
48
  if MODELS.get("stt") is None:
49
  print("--- [v137] πŸ“₯ LOADING NATIVE WHISPER (Large-v3-Turbo) ---")
50
  model_id = "openai/whisper-large-v3-turbo"
 
 
51
  # Load model with SDPA (Flash Attention) for H200
52
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
53
+ model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True
54
  ).to(device)
55
  processor = AutoProcessor.from_pretrained(model_id)
56
 
 
59
  model=model,
60
  tokenizer=processor.tokenizer,
61
  feature_extractor=processor.feature_extractor,
62
+ torch_dtype=torch.float16,
63
  device=device,
64
  model_kwargs={"attn_implementation": "sdpa"}
65
  )
 
78
  t1 = time.time()
79
 
80
  try:
 
81
  load_gpu_models()
82
 
83
  # πŸŽ™οΈ STT PATH
 
88
 
89
  try:
90
  lang = request_dict.get("lang")
 
91
  result = MODELS["stt"](temp_path, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
92
  stt_text = result["text"].strip()
93
  finally:
 
158
  @app.get("/health")
159
  def health(): return {"status": "ok", "v": "137"}
160
 
161
+ # Named function for Gradio to avoid lambda schema issues
162
+ def gradio_stt(audio_path):
163
+ if not audio_path: return ""
164
+ with open(audio_path, "rb") as f:
165
+ b64 = base64.b64encode(f.read()).decode()
166
+ res = core_process({"action": "stt", "file": b64})
167
+ return res.get("text", f"Error: {res.get('error')}")
168
+
169
  with gr.Blocks() as demo:
170
  gr.Markdown("# πŸš€ v137 HOPPER NATIVE (H200 Stable)")
171
  gr.Markdown("Direct GPU path | Transformers Whisper | XTTS-v2 Singleton")
 
173
  audio_in = gr.Audio(type="filepath", label="Input Audio")
174
  stt_btn = gr.Button("STT")
175
  txt_out = gr.Textbox(label="STT Result")
176
+ stt_btn.click(fn=gradio_stt, inputs=audio_in, outputs=txt_out)
177
 
178
+ print("--- [v137-clean] πŸ”§ MOUNTING GRADIO ---")
179
  app = gr.mount_gradio_app(app, demo, path="/")
180
 
181
  if __name__ == "__main__":
182
+ print("--- [v137-clean] πŸ“‘ STARTING UVICORN ---")
183
+ uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info")