TGPro1 committed on
Commit
f4203ee
·
verified ·
1 Parent(s): e4d48cf

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +46 -67
app.py CHANGED
@@ -61,8 +61,8 @@ if not hasattr(torchaudio, "info"):
61
 
62
  from df.enhance import enhance, init_df, load_audio, save_audio
63
 
64
- # FORCE BUILD TRIGGER: 09:40:00 Jan 21 2026
65
- # v84: Fixed SyntaxError (Missing try block in core_process)
66
 
67
  # πŸ› οΈ Monkeypatch torchaudio.load
68
  try:
@@ -93,7 +93,7 @@ MODELS = {"stt": None, "translate": None, "tts": None, "tokenizer": None, "denoi
93
  def load_models():
94
  global MODELS
95
  if MODELS["stt"] is None:
96
- print("πŸŽ™οΈ Loading Faster-Whisper large-v3...")
97
  from faster_whisper import WhisperModel
98
  if torch.cuda.is_available():
99
  print(f"πŸš€ GPU Detected: {torch.cuda.get_device_name(0)}")
@@ -108,7 +108,6 @@ def load_models():
108
  torch.cuda.empty_cache()
109
 
110
  # Initialize Chatterbox ONNX (High-Speed Fallback)
111
- # This will load the model if not already loaded internally by chatterbox_utils
112
  chatterbox_utils.load_chatterbox(device="cuda" if torch.cuda.is_available() else "cpu")
113
 
114
  if MODELS["translate"] is None:
@@ -123,7 +122,7 @@ def load_models():
123
  except: pass
124
 
125
  if MODELS["tts"] is None:
126
- print("πŸ”Š Loading XTTS-v2...")
127
  from TTS.api import TTS
128
  try:
129
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=torch.cuda.is_available())
@@ -132,6 +131,35 @@ def load_models():
132
  print(f"❌ Failed to load XTTS: {e}")
133
  raise e
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  def _stt_logic(request_dict):
136
  """STT Logic (Runs on GPU when called via core_process)"""
137
  audio_bytes = base64.b64decode(request_dict.get("file"))
@@ -158,8 +186,6 @@ def _tts_logic(text, lang, speaker_wav_b64):
158
  if not text or not text.strip():
159
  return {"error": "TTS Error: Input text is empty"}
160
 
161
- # 🌍 XTTS-v2 COMPLETE 16-LANGUAGE MAPPING (v79)
162
- # This dictionary ensures every officially supported XTTS language code is correctly matched.
163
  XTTS_MAP = {
164
  "en": "en", "en-us": "en", "en-gb": "en",
165
  "de": "de", "de-de": "de",
@@ -180,18 +206,15 @@ def _tts_logic(text, lang, speaker_wav_b64):
180
  }
181
 
182
  XTTS_LANG_CODES = set(XTTS_MAP.values())
183
-
184
  mapped_lang = None
185
  if lang:
186
  lang_key = lang.strip().lower()
187
  mapped_lang = XTTS_MAP.get(lang_key) or XTTS_MAP.get(lang_key.split('-')[0])
188
 
189
- print(f"[v84] TTS Request - Original: {lang}, Mapped: {mapped_lang}")
190
 
191
- # πŸ›£οΈ INTELLIGENT ROUTING
192
- # Case A: XTTS Support (Voice Cloning)
193
  if mapped_lang and mapped_lang in XTTS_LANG_CODES:
194
- print(f"[v84] Using XTTS-v2 for '{mapped_lang}'")
195
  speaker_wav_path = None
196
  if speaker_wav_b64:
197
  sb = base64.b64decode(speaker_wav_b64)
@@ -205,7 +228,6 @@ def _tts_logic(text, lang, speaker_wav_b64):
205
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
206
  output_path = output_file.name
207
 
208
- # πŸŽ™οΈ XTTS Inference
209
  MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=output_path, speaker_wav=speaker_wav_path)
210
 
211
  with open(output_path, "rb") as f:
@@ -216,22 +238,17 @@ def _tts_logic(text, lang, speaker_wav_b64):
216
  if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
217
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
218
 
219
- # Case B: Chatterbox ONNX Support (High-Quality Fast Fallback)
220
- print(f"[v84] Using Chatterbox ONNX Fallback for '{lang}'")
221
  try:
222
- # Use local file if available for cloning in Chatterbox too
223
  temp_ref = None
224
  if speaker_wav_b64:
225
  sb = base64.b64decode(speaker_wav_b64)
226
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
227
  f.write(sb); temp_ref = f.name
228
 
229
- # Chatterbox supports codes like 'fi', 'el', 'da', etc.
230
  chatter_lang = lang.strip().lower().split('-')[0]
231
  audio_bytes = chatterbox_utils.run_chatterbox_inference(text, chatter_lang, speaker_wav_path=temp_ref)
232
-
233
  if temp_ref and os.path.exists(temp_ref): os.unlink(temp_ref)
234
-
235
  audio_b64 = base64.b64encode(audio_bytes).decode()
236
  return {"audio": audio_b64}
237
  except Exception as e:
@@ -240,14 +257,10 @@ def _tts_logic(text, lang, speaker_wav_b64):
240
 
241
  @spaces.GPU
242
  def core_process(request_dict):
243
- """
244
- Unified GPU Entry Point (v84).
245
- This function handles all high-speed tasks inside a single GPU allocation.
246
- The container stays resident on CPU but triggers GPU on demand.
247
- """
248
  action = request_dict.get("action")
249
  t0 = time.time()
250
- print(f"--- [v84] πŸš€ GPU SESSION START: {action} at {time.ctime()} ---")
251
  load_models()
252
 
253
  try:
@@ -258,26 +271,21 @@ def core_process(request_dict):
258
  elif action == "tts":
259
  res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
260
  elif action == "s2st":
261
- # πŸ”— FULL PIPELINE (Single GPU Call)
262
  stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
263
  text = stt_res.get("text", "")
264
  if not text: return {"error": "No speech detected"}
265
-
266
  translated = _translate_logic(text, request_dict.get("target_lang"))
267
-
268
  tts_res = _tts_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
269
  res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
270
  elif action == "health":
271
  res = {"status": "awake", "time": time.ctime()}
272
  else:
273
  res = {"error": f"Unknown action: {action}"}
274
-
275
  finally:
276
- print(f"--- [v84] ✨ SESSION END: {action} (Total: {time.time()-t0:.2f}s) ---")
277
  gc.collect()
278
  if torch.cuda.is_available():
279
  torch.cuda.empty_cache()
280
-
281
  return res
282
 
283
  def create_wav_header(sample_rate=24000, channels=1, bit_depth=16):
@@ -296,13 +304,11 @@ def create_wav_header(sample_rate=24000, channels=1, bit_depth=16):
296
  header.extend((0xFFFFFFFF).to_bytes(4, 'little'))
297
  return bytes(header)
298
 
299
- # πŸš€ Sync Generator for ZeroGPU
300
  @spaces.GPU
301
  def gpu_tts_generator(text, lang, speaker_wav_path):
302
  load_models()
303
  try:
304
  yield bytes(create_wav_header(sample_rate=24000))
305
- # inference_stream is a generator
306
  for chunk in MODELS["tts"].synthesizer.tts_model.inference_stream(
307
  text,
308
  lang,
@@ -320,15 +326,12 @@ def gpu_tts_generator(text, lang, speaker_wav_path):
320
  if torch.cuda.is_available():
321
  torch.cuda.empty_cache()
322
 
323
- # --- FastAPI Entry Points ---
324
  app = FastAPI()
325
 
326
  @app.post("/api/v1/process")
327
  async def api_process(request: Request):
328
- """Async endpoint. Routes to CPU (STT/Translate) or Hybrid (S2ST/TTS)"""
329
  try:
330
  data = await request.json()
331
- # Direct call to the hybrid process
332
  result = core_process(data)
333
  return result
334
  except Exception as e:
@@ -337,7 +340,6 @@ async def api_process(request: Request):
337
 
338
  @app.post("/api/v1/tts_stream")
339
  async def api_tts_stream(request: Request):
340
- """Async entry point for StreamingResponse"""
341
  try:
342
  data = await request.json()
343
  speaker_wav_b64 = data.get("speaker_wav")
@@ -349,11 +351,7 @@ async def api_tts_stream(request: Request):
349
  speaker_wav_path = f.name
350
  else:
351
  speaker_wav_path = "default_speaker.wav"
352
-
353
- return StreamingResponse(
354
- gpu_tts_generator(data.get("text"), data.get("lang"), speaker_wav_path),
355
- media_type="audio/wav"
356
- )
357
  except Exception as e:
358
  return {"error": str(e)}
359
 
@@ -363,46 +361,27 @@ def health():
363
 
364
  @app.post("/api/v1/clear_cache")
365
  async def clear_cache():
366
- """Manual deep cleanup of memory and caches"""
367
  try:
368
  t0 = time.time()
369
- print("🧹 Manual Cache Clearing Triggered...")
370
-
371
- # 1. GC collect
372
  gc.collect()
373
-
374
- # 2. CUDA cache
375
- if torch.cuda.is_available():
376
- torch.cuda.empty_cache()
377
-
378
- # 3. Clean temp files
379
  temp_dir = tempfile.gettempdir()
380
  count = 0
381
  for f in os.listdir(temp_dir):
382
  if f.endswith(".wav") or f.startswith("tm"):
383
- try:
384
- os.unlink(os.path.join(temp_dir, f))
385
- count += 1
386
  except: pass
387
-
388
- return {
389
- "status": "success",
390
- "cleaned_files": count,
391
- "duration": f"{time.time()-t0:.2f}s",
392
- "gpu_memory": f"{torch.cuda.memory_allocated() / 1024**2:.2f}MB" if torch.cuda.is_available() else "N/A"
393
- }
394
  except Exception as e:
395
  return {"status": "error", "message": str(e)}
396
 
397
- # --- Gradio UI ---
398
  def gradio_fn(req_json):
399
- try:
400
- return json.dumps(core_process(json.loads(req_json)))
401
- except Exception as e:
402
- return json.dumps({"error": str(e)})
403
 
404
  demo = gr.Interface(fn=gradio_fn, inputs="text", outputs="text", title="πŸš€ AI Engine")
405
  app = gr.mount_gradio_app(app, demo, path="/")
406
 
407
  if __name__ == "__main__":
 
408
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
61
 
62
  from df.enhance import enhance, init_df, load_audio, save_audio
63
 
64
+ # FORCE BUILD TRIGGER: 10:00:00 Jan 21 2026
65
+ # v85: ZeroGPU Warmup & Pre-caching (Prevents session timeouts)
66
 
67
  # πŸ› οΈ Monkeypatch torchaudio.load
68
  try:
 
93
  def load_models():
94
  global MODELS
95
  if MODELS["stt"] is None:
96
+ print("πŸŽ™οΈ Loading Faster-Whisper large-v3 into Engine...")
97
  from faster_whisper import WhisperModel
98
  if torch.cuda.is_available():
99
  print(f"πŸš€ GPU Detected: {torch.cuda.get_device_name(0)}")
 
108
  torch.cuda.empty_cache()
109
 
110
  # Initialize Chatterbox ONNX (High-Speed Fallback)
 
111
  chatterbox_utils.load_chatterbox(device="cuda" if torch.cuda.is_available() else "cpu")
112
 
113
  if MODELS["translate"] is None:
 
122
  except: pass
123
 
124
  if MODELS["tts"] is None:
125
+ print("πŸ”Š Loading XTTS-v2 into Engine...")
126
  from TTS.api import TTS
127
  try:
128
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=torch.cuda.is_available())
 
131
  print(f"❌ Failed to load XTTS: {e}")
132
  raise e
133
 
134
+ def warmup_models():
135
+ """Download and cache all models on CPU at startup (Prevents GPU timeouts)"""
136
+ print("\nπŸ”₯ --- SYSTEM WARMUP STARTING (CPU) ---")
137
+ start = time.time()
138
+ try:
139
+ # 1. Warmup Whisper
140
+ print("πŸ“₯ Initializing Whisper large-v3 cache...")
141
+ from faster_whisper import WhisperModel
142
+ # Use simple init to trigger download
143
+ _ = WhisperModel("large-v3", device="cpu", compute_type="int8")
144
+
145
+ # 2. Warmup XTTS-v2 (This takes the longest)
146
+ print("πŸ“₯ Initializing XTTS-v2 cache...")
147
+ from TTS.api import TTS
148
+ # Initialize once on CPU just to force download
149
+ _ = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
150
+
151
+ # 3. Warmup DeepFilterNet
152
+ print("πŸ“₯ Initializing DeepFilterNet cache...")
153
+ try: init_df()
154
+ except: pass
155
+
156
+ # 4. Warmup Chatterbox
157
+ chatterbox_utils.warmup_chatterbox()
158
+
159
+ print(f"βœ… --- SYSTEM WARMUP COMPLETE (Time: {time.time()-start:.2f}s) --- \n")
160
+ except Exception as e:
161
+ print(f"⚠️ Warmup warning: {e}")
162
+
163
  def _stt_logic(request_dict):
164
  """STT Logic (Runs on GPU when called via core_process)"""
165
  audio_bytes = base64.b64decode(request_dict.get("file"))
 
186
  if not text or not text.strip():
187
  return {"error": "TTS Error: Input text is empty"}
188
 
 
 
189
  XTTS_MAP = {
190
  "en": "en", "en-us": "en", "en-gb": "en",
191
  "de": "de", "de-de": "de",
 
206
  }
207
 
208
  XTTS_LANG_CODES = set(XTTS_MAP.values())
 
209
  mapped_lang = None
210
  if lang:
211
  lang_key = lang.strip().lower()
212
  mapped_lang = XTTS_MAP.get(lang_key) or XTTS_MAP.get(lang_key.split('-')[0])
213
 
214
+ print(f"[v85] TTS Request - Original: {lang}, Mapped: {mapped_lang}")
215
 
 
 
216
  if mapped_lang and mapped_lang in XTTS_LANG_CODES:
217
+ print(f"[v85] Using XTTS-v2 for '{mapped_lang}'")
218
  speaker_wav_path = None
219
  if speaker_wav_b64:
220
  sb = base64.b64decode(speaker_wav_b64)
 
228
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
229
  output_path = output_file.name
230
 
 
231
  MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=output_path, speaker_wav=speaker_wav_path)
232
 
233
  with open(output_path, "rb") as f:
 
238
  if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
239
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
240
 
241
+ print(f"[v85] Using Chatterbox ONNX Fallback for '{lang}'")
 
242
  try:
 
243
  temp_ref = None
244
  if speaker_wav_b64:
245
  sb = base64.b64decode(speaker_wav_b64)
246
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
247
  f.write(sb); temp_ref = f.name
248
 
 
249
  chatter_lang = lang.strip().lower().split('-')[0]
250
  audio_bytes = chatterbox_utils.run_chatterbox_inference(text, chatter_lang, speaker_wav_path=temp_ref)
 
251
  if temp_ref and os.path.exists(temp_ref): os.unlink(temp_ref)
 
252
  audio_b64 = base64.b64encode(audio_bytes).decode()
253
  return {"audio": audio_b64}
254
  except Exception as e:
 
257
 
258
  @spaces.GPU
259
  def core_process(request_dict):
260
+ """Unified GPU Entry Point (v85)"""
 
 
 
 
261
  action = request_dict.get("action")
262
  t0 = time.time()
263
+ print(f"--- [v85] πŸš€ GPU SESSION START: {action} at {time.ctime()} ---")
264
  load_models()
265
 
266
  try:
 
271
  elif action == "tts":
272
  res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
273
  elif action == "s2st":
 
274
  stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
275
  text = stt_res.get("text", "")
276
  if not text: return {"error": "No speech detected"}
 
277
  translated = _translate_logic(text, request_dict.get("target_lang"))
 
278
  tts_res = _tts_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
279
  res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
280
  elif action == "health":
281
  res = {"status": "awake", "time": time.ctime()}
282
  else:
283
  res = {"error": f"Unknown action: {action}"}
 
284
  finally:
285
+ print(f"--- [v85] ✨ SESSION END: {action} (Total: {time.time()-t0:.2f}s) ---")
286
  gc.collect()
287
  if torch.cuda.is_available():
288
  torch.cuda.empty_cache()
 
289
  return res
290
 
291
  def create_wav_header(sample_rate=24000, channels=1, bit_depth=16):
 
304
  header.extend((0xFFFFFFFF).to_bytes(4, 'little'))
305
  return bytes(header)
306
 
 
307
  @spaces.GPU
308
  def gpu_tts_generator(text, lang, speaker_wav_path):
309
  load_models()
310
  try:
311
  yield bytes(create_wav_header(sample_rate=24000))
 
312
  for chunk in MODELS["tts"].synthesizer.tts_model.inference_stream(
313
  text,
314
  lang,
 
326
  if torch.cuda.is_available():
327
  torch.cuda.empty_cache()
328
 
 
329
  app = FastAPI()
330
 
331
  @app.post("/api/v1/process")
332
  async def api_process(request: Request):
 
333
  try:
334
  data = await request.json()
 
335
  result = core_process(data)
336
  return result
337
  except Exception as e:
 
340
 
341
  @app.post("/api/v1/tts_stream")
342
  async def api_tts_stream(request: Request):
 
343
  try:
344
  data = await request.json()
345
  speaker_wav_b64 = data.get("speaker_wav")
 
351
  speaker_wav_path = f.name
352
  else:
353
  speaker_wav_path = "default_speaker.wav"
354
+ return StreamingResponse(gpu_tts_generator(data.get("text"), data.get("lang"), speaker_wav_path), media_type="audio/wav")
 
 
 
 
355
  except Exception as e:
356
  return {"error": str(e)}
357
 
 
361
 
362
  @app.post("/api/v1/clear_cache")
363
  async def clear_cache():
 
364
  try:
365
  t0 = time.time()
 
 
 
366
  gc.collect()
367
+ if torch.cuda.is_available(): torch.cuda.empty_cache()
 
 
 
 
 
368
  temp_dir = tempfile.gettempdir()
369
  count = 0
370
  for f in os.listdir(temp_dir):
371
  if f.endswith(".wav") or f.startswith("tm"):
372
+ try: os.unlink(os.path.join(temp_dir, f)); count += 1
 
 
373
  except: pass
374
+ return {"status": "success", "cleaned_files": count, "duration": f"{time.time()-t0:.2f}s"}
 
 
 
 
 
 
375
  except Exception as e:
376
  return {"status": "error", "message": str(e)}
377
 
 
378
  def gradio_fn(req_json):
379
+ try: return json.dumps(core_process(json.loads(req_json)))
380
+ except Exception as e: return json.dumps({"error": str(e)})
 
 
381
 
382
  demo = gr.Interface(fn=gradio_fn, inputs="text", outputs="text", title="πŸš€ AI Engine")
383
  app = gr.mount_gradio_app(app, demo, path="/")
384
 
385
  if __name__ == "__main__":
386
+ warmup_models()
387
  uvicorn.run(app, host="0.0.0.0", port=7860)