TGPro1 committed on
Commit
654e2d9
·
verified ·
1 Parent(s): 659b8ce

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +28 -52
app.py CHANGED
@@ -6,15 +6,15 @@ import torch
6
  import tempfile
7
  import traceback
8
  import gc
9
- from fastapi import FastAPI, Request, Response
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from fastapi.responses import HTMLResponse
12
  import uvicorn
13
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
14
  from TTS.api import TTS
15
 
16
- # --- [v138] πŸš€ ZEROGPU LEGACY-FREE ENGINE ---
17
- print(f"--- [v138] πŸ“‘ BOOTING API ENGINE ---")
18
 
19
  try:
20
  import spaces
@@ -27,12 +27,14 @@ except ImportError:
27
  if f is None: return lambda x: x
28
  return f
29
 
30
- # --- System Config ---
31
  os.environ["COQUI_TOS_AGREED"] = "1"
32
  os.environ["PYTHONWARNINGS"] = "ignore"
 
33
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
34
  torch.backends.cuda.matmul.allow_tf32 = False
35
  torch.backends.cudnn.allow_tf32 = False
 
36
 
37
  app = FastAPI()
38
  app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
@@ -40,15 +42,16 @@ app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], all
40
  MODELS = {"stt": None, "tts": None}
41
 
42
  def load_gpu_models():
43
- """Persistent loading into GPU VRAM. Only runs once per worker."""
44
  global MODELS
45
  device = "cuda"
46
 
47
  if MODELS.get("stt") is None:
48
- print("--- [v138] πŸ“₯ LOADING NATIVE WHISPER (Large-v3-Turbo) ---")
49
  model_id = "openai/whisper-large-v3-turbo"
 
50
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
51
- model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True
52
  ).to(device)
53
  processor = AutoProcessor.from_pretrained(model_id)
54
 
@@ -57,22 +60,24 @@ def load_gpu_models():
57
  model=model,
58
  tokenizer=processor.tokenizer,
59
  feature_extractor=processor.feature_extractor,
60
- torch_dtype=torch.float16,
61
  device=device,
62
- model_kwargs={"attn_implementation": "sdpa"}
 
63
  )
64
- print("--- [v138] βœ… WHISPER LOADED ---")
65
 
66
  if MODELS.get("tts") is None:
67
- print("--- [v138] πŸ“₯ LOADING XTTS (VRAM STABLE) ---")
 
68
  MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
69
- print("--- [v138] βœ… XTTS LOADED ---")
70
 
71
  @spaces.GPU(duration=120)
72
  def core_process(request_dict):
73
  global MODELS
74
  action = request_dict.get("action")
75
- print(f"--- [v138] πŸ› οΈ HOPPER ENGINE: {action} ---")
76
  t1 = time.time()
77
 
78
  try:
@@ -86,7 +91,8 @@ def core_process(request_dict):
86
 
87
  try:
88
  lang = request_dict.get("lang")
89
- result = MODELS["stt"](temp_path, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
 
90
  stt_text = result["text"].strip()
91
  finally:
92
  if os.path.exists(temp_path): os.unlink(temp_path)
@@ -129,60 +135,30 @@ def core_process(request_dict):
129
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
130
 
131
  except Exception as e:
132
- print(f"❌ [v138] ERROR: {traceback.format_exc()}")
133
  return {"error": str(e)}
134
  finally:
135
- print(f"--- [v138] ✨ DONE ({time.time()-t1:.1f}s) ---")
136
  torch.cuda.empty_cache()
137
 
138
- # --- API Endpoints ---
139
  @app.post("/process")
140
  async def api_process(request: Request):
141
  try:
142
  data = await request.json()
143
- if data.get("action") == "health": return {"status": "awake", "v": "138"}
144
  return core_process(data)
145
  except Exception as e: return {"error": str(e)}
146
 
147
  @app.get("/health")
148
- def health(): return {"status": "ok", "v": "138", "gpu": HAS_SPACES}
149
 
150
- # --- Minimal UI ---
151
  @app.get("/", response_class=HTMLResponse)
152
  def root():
153
  return """
154
- <html>
155
- <head>
156
- <title>S2ST v138</title>
157
- <style>
158
- body { font-family: sans-serif; background: #111; color: #eee; text-align: center; padding-top: 50px; }
159
- .card { background: #222; border: 1px solid #444; padding: 20px; border-radius: 10px; display: inline-block; }
160
- button { background: #007bff; color: #fff; border: none; padding: 10px 20px; border-radius: 5px; cursor: pointer; }
161
- #log { margin-top: 20px; color: #aaa; font-family: monospace; }
162
- </style>
163
- </head>
164
- <body>
165
- <div class="card">
166
- <h1>πŸš€ AI Engine v138</h1>
167
- <p>HOPPER NATIVE - FASTAPI ONLY</p>
168
- <button onclick="checkHealth()">Test API Connectivity</button>
169
- <div id="log">Status: Awaiting test...</div>
170
- </div>
171
- <script>
172
- async function checkHealth() {
173
- const log = document.getElementById('log');
174
- log.innerText = 'Checking...';
175
- try {
176
- const res = await fetch('/health');
177
- const data = await res.json();
178
- log.innerText = 'Response: ' + JSON.stringify(data);
179
- } catch (e) {
180
- log.innerText = 'Error: ' + e;
181
- }
182
- }
183
- </script>
184
- </body>
185
- </html>
186
  """
187
 
188
  if __name__ == "__main__":
 
6
  import tempfile
7
  import traceback
8
  import gc
9
+ from fastapi import FastAPI, Request
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from fastapi.responses import HTMLResponse
12
  import uvicorn
13
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
14
  from TTS.api import TTS
15
 
16
+ # --- [v139] πŸš€ H200 SAFEST MODE (FP32 + Standard Attention) ---
17
+ print(f"--- [v139] πŸ“‘ BOOTING SAFEST ENGINE ---")
18
 
19
  try:
20
  import spaces
 
27
  if f is None: return lambda x: x
28
  return f
29
 
30
+ # --- Strict Stability Config ---
31
  os.environ["COQUI_TOS_AGREED"] = "1"
32
  os.environ["PYTHONWARNINGS"] = "ignore"
33
+ # Disable all hardware acceleration that might cause kernel alignment errors
34
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
35
  torch.backends.cuda.matmul.allow_tf32 = False
36
  torch.backends.cudnn.allow_tf32 = False
37
+ torch.backends.cudnn.deterministic = True # Extra safety
38
 
39
  app = FastAPI()
40
  app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 
42
  MODELS = {"stt": None, "tts": None}
43
 
44
  def load_gpu_models():
45
+ """Persistent loading into GPU VRAM (FP32 focus)."""
46
  global MODELS
47
  device = "cuda"
48
 
49
  if MODELS.get("stt") is None:
50
+ print("--- [v139] πŸ“₯ LOADING NATIVE WHISPER (FP32 / No-SDPA) ---")
51
  model_id = "openai/whisper-large-v3-turbo"
52
+ # Force float32 to avoid CUBLAS_STATUS_INVALID_VALUE on H200 MIG
53
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
54
+ model_id, torch_dtype=torch.float32, low_cpu_mem_usage=True, use_safetensors=True
55
  ).to(device)
56
  processor = AutoProcessor.from_pretrained(model_id)
57
 
 
60
  model=model,
61
  tokenizer=processor.tokenizer,
62
  feature_extractor=processor.feature_extractor,
63
+ torch_dtype=torch.float32,
64
  device=device,
65
+ # Explicitly avoid SDPA/Flash Attention to dodge kernel bugs
66
+ model_kwargs={"attn_implementation": "eager"}
67
  )
68
+ print("--- [v139] βœ… WHISPER LOADED (FP32) ---")
69
 
70
  if MODELS.get("tts") is None:
71
+ print("--- [v139] πŸ“₯ LOADING XTTS (SINGLETON) ---")
72
+ # XTTS is generally stable if in VRAM
73
  MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
74
+ print("--- [v139] βœ… XTTS LOADED ---")
75
 
76
  @spaces.GPU(duration=120)
77
  def core_process(request_dict):
78
  global MODELS
79
  action = request_dict.get("action")
80
+ print(f"--- [v139] πŸ› οΈ SAFE ENGINE: {action} ---")
81
  t1 = time.time()
82
 
83
  try:
 
91
 
92
  try:
93
  lang = request_dict.get("lang")
94
+ # batch_size=1 for maximum stability
95
+ result = MODELS["stt"](temp_path, batch_size=1, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
96
  stt_text = result["text"].strip()
97
  finally:
98
  if os.path.exists(temp_path): os.unlink(temp_path)
 
135
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
136
 
137
  except Exception as e:
138
+ print(f"❌ [v139] ERROR: {traceback.format_exc()}")
139
  return {"error": str(e)}
140
  finally:
141
+ print(f"--- [v139] ✨ DONE ({time.time()-t1:.1f}s) ---")
142
  torch.cuda.empty_cache()
143
 
 
144
  @app.post("/process")
145
  async def api_process(request: Request):
146
  try:
147
  data = await request.json()
148
+ if data.get("action") == "health": return {"status": "awake", "v": "139"}
149
  return core_process(data)
150
  except Exception as e: return {"error": str(e)}
151
 
152
  @app.get("/health")
153
+ def health(): return {"status": "ok", "v": "139", "gpu": HAS_SPACES}
154
 
 
155
  @app.get("/", response_class=HTMLResponse)
156
  def root():
157
  return """
158
+ <html><head><title>S2ST v139</title><style>body { font-family: sans-serif; background: #111; color: #eee; text-align: center; padding-top: 50px; }</style></head>
159
+ <body><h1>πŸš€ AI Engine v139 (FP32 SAFE)</h1><p>H200 Native Stability Test</p><div id="log">Awaiting test...</div>
160
+ <script>fetch('/health').then(r=>r.json()).then(d=>document.getElementById('log').innerText=JSON.stringify(d));</script>
161
+ </body></html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  """
163
 
164
  if __name__ == "__main__":