TGPro1 committed on
Commit
4aeda0b
·
verified ·
1 Parent(s): 66d68db

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +37 -40
app.py CHANGED
@@ -18,12 +18,14 @@ import logging
18
  from threading import Thread, Lock
19
  from huggingface_hub import snapshot_download
20
 
21
- # πŸ›‘οΈ 1. SILENCE LOGS & WARNINGS
22
  logging.getLogger("transformers").setLevel(logging.ERROR)
23
  logging.getLogger("TTS").setLevel(logging.ERROR)
24
  logging.getLogger("onnxruntime").setLevel(logging.ERROR)
25
  os.environ["CT2_VERBOSE"] = "0"
26
- os.environ["ORT_LOGGING_LEVEL"] = "3" # Silence ONNX discovery noise
 
 
27
 
28
  # πŸ› οΈ 2. COMPATIBILITY PATCHES
29
  if "torchaudio.backend" not in sys.modules:
@@ -79,8 +81,8 @@ except ImportError:
79
  if f is None: return lambda x: x
80
  return f
81
 
82
- # FORCE BUILD TRIGGER: 12:45:00 Jan 21 2026
83
- # v99: Fix Queue Loop. Health on CPU. Re-enable 4 Workers. Lifespan events.
84
 
85
  os.environ["COQUI_TOS_AGREED"] = "1"
86
  MODELS = {"stt": None, "translate": None, "tts": None, "denoiser": None}
@@ -89,30 +91,31 @@ WARMUP_STATUS = {"complete": False, "in_progress": False}
89
  WARMUP_LOCK = Lock()
90
 
91
  def activate_gpu_models(action):
92
- """v99: Optimized GPU Session Activation"""
93
  global MODELS, WARMUP_STATUS
94
  local_only = WARMUP_STATUS["complete"]
95
 
96
- # 1. Faster-Whisper: Peak Performance Requested
97
  if action in ["stt", "s2st"]:
98
  stt_on_gpu = False
99
  try: stt_on_gpu = MODELS["stt"] is not None and MODELS["stt"].model.device == "cuda"
100
  except: pass
101
  if not stt_on_gpu:
102
- print(f"πŸŽ™οΈ [v99] Activating Whisper (GPU: 4-Workers)...")
103
  try:
104
  if MODELS["stt"]: del MODELS["stt"]
105
  gc.collect(); torch.cuda.empty_cache()
 
106
  MODELS["stt"] = WhisperModel(
107
  "large-v3",
108
  device="cuda",
109
- compute_type="int8_float16",
110
- num_workers=4,
111
  local_files_only=local_only
112
  )
113
  except Exception as e:
114
- print(f"⚠️ GPU Init failed: {e}. Falling back to standard.")
115
- MODELS["stt"] = WhisperModel("large-v3", device="cuda", compute_type="float16", local_files_only=local_only)
116
 
117
  # 2. XTTS-v2
118
  if action in ["tts", "s2st"]:
@@ -122,12 +125,14 @@ def activate_gpu_models(action):
122
  tts_on_gpu = "cuda" in curr
123
  except: pass
124
  if MODELS["tts"] is None or not tts_on_gpu:
125
- print(f"πŸ”Š [v99] Activating XTTS-v2 (GPU)...")
126
- if MODELS["tts"] is None:
127
- MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
128
- else: MODELS["tts"].to("cuda")
 
 
129
 
130
- # 3. Chatterbox GPU-Mode
131
  chatterbox_utils.load_chatterbox(device="cuda" if torch.cuda.is_available() else "cpu")
132
 
133
  # 4. Helpers
@@ -137,9 +142,9 @@ def activate_gpu_models(action):
137
  if MODELS["translate"] is None: MODELS["translate"] = "active"
138
 
139
  def release_gpu_models():
140
- """v99: Persistence in RAM (CPU)"""
141
  global MODELS
142
- print("🧹 [v99] Releasing GPU. Engines staying WARM in RAM.")
143
  try:
144
  if MODELS["stt"] and MODELS["stt"].model.device == "cuda":
145
  del MODELS["stt"]
@@ -153,20 +158,19 @@ def release_gpu_models():
153
  if torch.cuda.is_available(): torch.cuda.empty_cache()
154
 
155
  def warmup_task():
156
- """V99: Resident RAM Pre-loading"""
157
  global WARMUP_STATUS
158
  with WARMUP_LOCK:
159
  if WARMUP_STATUS["complete"] or WARMUP_STATUS["in_progress"]: return
160
  WARMUP_STATUS["in_progress"] = True
161
- print("\nπŸ”₯ --- V99: PEAK WARMUP STARTED ---")
162
  try:
163
  MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
164
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
165
  chatterbox_utils.warmup_chatterbox()
166
  WARMUP_STATUS["complete"] = True
167
- print(f"βœ… --- SYSTEM READY --- \n")
168
- except Exception as e:
169
- print(f"❌ Warmup fail: {e}")
170
  finally: WARMUP_STATUS["in_progress"] = False
171
 
172
  def _stt_logic(request_dict):
@@ -184,7 +188,6 @@ def _translate_logic(text, target_lang):
184
  return deep_translator.GoogleTranslator(source='auto', target=target_lang).translate(text)
185
 
186
  def _tts_logic(text, lang, speaker_wav_b64):
187
- if not text or not text.strip(): return {"error": "Input empty"}
188
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
189
  clean_lang = lang.strip().lower().split('-')[0]
190
  mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
@@ -212,13 +215,13 @@ def _tts_logic(text, lang, speaker_wav_b64):
212
  audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang, speaker_wav_path=temp_ref)
213
  if temp_ref and os.path.exists(temp_ref): os.unlink(temp_ref)
214
  return {"audio": base64.b64encode(audio_bytes).decode()}
215
- except Exception as e: return {"error": f"TTS Failure: {str(e)}"}
216
 
217
  @spaces.GPU(duration=150)
218
  def core_process(request_dict):
219
  action = request_dict.get("action")
220
  t1 = time.time()
221
- print(f"--- [v99] πŸš€ GPU SESSION: {action} ---")
222
  activate_gpu_models(action)
223
  try:
224
  if action == "stt": res = _stt_logic(request_dict)
@@ -231,17 +234,16 @@ def core_process(request_dict):
231
  res = {"text": stt_res.get("text"), "translated": translated, "audio": tts_res.get("audio")}
232
  else: res = {"error": f"Unknown action: {action}"}
233
  finally:
234
- print(f"--- [v99] ✨ SUCCESS: {action} ({time.time()-t1:.2f}s) ---")
235
  release_gpu_models()
236
  return res
237
 
238
  @asynccontextmanager
239
  async def lifespan(app: FastAPI):
240
- # Startup: Background Warmup
241
  Thread(target=warmup_task, daemon=True).start()
242
  yield
243
- # Shutdown logic (optional)
244
- pass
245
 
246
  app = FastAPI(lifespan=lifespan)
247
 
@@ -249,15 +251,14 @@ app = FastAPI(lifespan=lifespan)
249
  async def api_process(request: Request):
250
  try:
251
  req_data = await request.json()
252
- action = req_data.get("action")
253
- # πŸ”₯ V99 CRITICAL FIX: Handle 'health' on CPU to prevent GPU queue loops
254
- if action == "health":
255
  return {"status": "awake", "warm": WARMUP_STATUS["complete"]}
256
  return core_process(req_data)
257
  except Exception as e: return {"error": str(e)}
258
 
259
  @app.get("/health")
260
- def health(): return {"status": "ok", "warm": WARMUP_STATUS["complete"], "time": time.ctime()}
261
 
262
  @app.post("/api/v1/clear_cache")
263
  async def clear_cache():
@@ -269,13 +270,9 @@ async def clear_cache():
269
  try: os.unlink(os.path.join(temp_dir, f))
270
  except: pass
271
  return {"status": "success"}
272
- except Exception as e: return {"status": "error", "message": str(e)}
273
-
274
- def gradio_fn(req_json):
275
- try: return json.dumps(core_process(json.loads(req_json)))
276
- except Exception as e: return json.dumps({"error": str(e)})
277
 
278
- demo = gr.Interface(fn=gradio_fn, inputs="text", outputs="text", title="πŸš€ AI Engine")
279
  app = gr.mount_gradio_app(app, demo, path="/")
280
 
281
  if __name__ == "__main__":
 
18
  from threading import Thread, Lock
19
  from huggingface_hub import snapshot_download
20
 
21
+ # πŸ›‘οΈ 1. SILENCE LOGS & WARNINGS (v100: Absolute Silence)
22
  logging.getLogger("transformers").setLevel(logging.ERROR)
23
  logging.getLogger("TTS").setLevel(logging.ERROR)
24
  logging.getLogger("onnxruntime").setLevel(logging.ERROR)
25
  os.environ["CT2_VERBOSE"] = "0"
26
+ os.environ["ORT_LOGGING_LEVEL"] = "3"
27
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
28
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
29
 
30
  # πŸ› οΈ 2. COMPATIBILITY PATCHES
31
  if "torchaudio.backend" not in sys.modules:
 
81
  if f is None: return lambda x: x
82
  return f
83
 
84
+ # FORCE BUILD TRIGGER: 13:00:00 Jan 21 2026
85
+ # v100: Centennial Stability Update. 1-Worker Lockdown. Lifespan Events.
86
 
87
  os.environ["COQUI_TOS_AGREED"] = "1"
88
  MODELS = {"stt": None, "translate": None, "tts": None, "denoiser": None}
 
91
  WARMUP_LOCK = Lock()
92
 
93
  def activate_gpu_models(action):
94
+ """v100: Stability-First GPU Activation"""
95
  global MODELS, WARMUP_STATUS
96
  local_only = WARMUP_STATUS["complete"]
97
 
98
+ # 1. Faster-Whisper: Lockdown to 1 worker for stability on the H200 MIG
99
  if action in ["stt", "s2st"]:
100
  stt_on_gpu = False
101
  try: stt_on_gpu = MODELS["stt"] is not None and MODELS["stt"].model.device == "cuda"
102
  except: pass
103
  if not stt_on_gpu:
104
+ print(f"πŸŽ™οΈ [v100] Activating Whisper (GPU: Stability Protocol)...")
105
  try:
106
  if MODELS["stt"]: del MODELS["stt"]
107
  gc.collect(); torch.cuda.empty_cache()
108
+ # πŸ›‘οΈ v100: 1-Worker to prevent CUDA deadlocks observed in v99
109
  MODELS["stt"] = WhisperModel(
110
  "large-v3",
111
  device="cuda",
112
+ compute_type="float16",
113
+ num_workers=1,
114
  local_files_only=local_only
115
  )
116
  except Exception as e:
117
+ print(f"⚠️ Stability Init failed: {e}. Falling back to CPU.")
118
+ MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8", local_files_only=True)
119
 
120
  # 2. XTTS-v2
121
  if action in ["tts", "s2st"]:
 
125
  tts_on_gpu = "cuda" in curr
126
  except: pass
127
  if MODELS["tts"] is None or not tts_on_gpu:
128
+ print(f"πŸ”Š [v100] Activating XTTS-v2 (GPU)...")
129
+ try:
130
+ if MODELS["tts"] is None:
131
+ MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
132
+ else: MODELS["tts"].to("cuda")
133
+ except: pass
134
 
135
+ # 3. Chatterbox GPU-Mode (Zero-Latency)
136
  chatterbox_utils.load_chatterbox(device="cuda" if torch.cuda.is_available() else "cpu")
137
 
138
  # 4. Helpers
 
142
  if MODELS["translate"] is None: MODELS["translate"] = "active"
143
 
144
  def release_gpu_models():
145
+ """v100: RAM-Resident Cleanup"""
146
  global MODELS
147
+ print("🧹 [v100] Releasing GPU. Engines resident in RAM.")
148
  try:
149
  if MODELS["stt"] and MODELS["stt"].model.device == "cuda":
150
  del MODELS["stt"]
 
158
  if torch.cuda.is_available(): torch.cuda.empty_cache()
159
 
160
  def warmup_task():
161
+ """Silent Warmup (v100)"""
162
  global WARMUP_STATUS
163
  with WARMUP_LOCK:
164
  if WARMUP_STATUS["complete"] or WARMUP_STATUS["in_progress"]: return
165
  WARMUP_STATUS["in_progress"] = True
166
+ print("\nπŸ”₯ --- V100: STABILITY WARMUP STARTED ---")
167
  try:
168
  MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
169
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
170
  chatterbox_utils.warmup_chatterbox()
171
  WARMUP_STATUS["complete"] = True
172
+ print(f"βœ… --- SYSTEM READY: v100 --- \n")
173
+ except: pass
 
174
  finally: WARMUP_STATUS["in_progress"] = False
175
 
176
  def _stt_logic(request_dict):
 
188
  return deep_translator.GoogleTranslator(source='auto', target=target_lang).translate(text)
189
 
190
  def _tts_logic(text, lang, speaker_wav_b64):
 
191
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
192
  clean_lang = lang.strip().lower().split('-')[0]
193
  mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
 
215
  audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang, speaker_wav_path=temp_ref)
216
  if temp_ref and os.path.exists(temp_ref): os.unlink(temp_ref)
217
  return {"audio": base64.b64encode(audio_bytes).decode()}
218
+ except: return {"error": "TTS Failure"}
219
 
220
  @spaces.GPU(duration=150)
221
  def core_process(request_dict):
222
  action = request_dict.get("action")
223
  t1 = time.time()
224
+ print(f"--- [v100] πŸš€ GPU SESSION: {action} ---")
225
  activate_gpu_models(action)
226
  try:
227
  if action == "stt": res = _stt_logic(request_dict)
 
234
  res = {"text": stt_res.get("text"), "translated": translated, "audio": tts_res.get("audio")}
235
  else: res = {"error": f"Unknown action: {action}"}
236
  finally:
237
+ print(f"--- [v100] ✨ END: {action} ({time.time()-t1:.2f}s) ---")
238
  release_gpu_models()
239
  return res
240
 
241
  @asynccontextmanager
242
  async def lifespan(app: FastAPI):
243
+ # Startup
244
  Thread(target=warmup_task, daemon=True).start()
245
  yield
246
+ # Shutdown
 
247
 
248
  app = FastAPI(lifespan=lifespan)
249
 
 
251
  async def api_process(request: Request):
252
  try:
253
  req_data = await request.json()
254
+ # πŸ₯ v100: LIGHTWEIGHT CPU HEALTH (Prevent Queue Bloat)
255
+ if req_data.get("action") == "health":
 
256
  return {"status": "awake", "warm": WARMUP_STATUS["complete"]}
257
  return core_process(req_data)
258
  except Exception as e: return {"error": str(e)}
259
 
260
  @app.get("/health")
261
+ def health(): return {"status": "ok", "warm": WARMUP_STATUS["complete"], "version": "v100"}
262
 
263
  @app.post("/api/v1/clear_cache")
264
  async def clear_cache():
 
270
  try: os.unlink(os.path.join(temp_dir, f))
271
  except: pass
272
  return {"status": "success"}
273
+ except: return {"status": "error"}
 
 
 
 
274
 
275
+ demo = gr.Interface(fn=lambda x: json.dumps(core_process(json.loads(x))), inputs="text", outputs="text")
276
  app = gr.mount_gradio_app(app, demo, path="/")
277
 
278
  if __name__ == "__main__":