TGPro1 committed on
Commit
1b24af3
·
verified ·
1 Parent(s): 7905c18

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +43 -23
app.py CHANGED
@@ -17,10 +17,12 @@ import logging
17
  from threading import Thread, Lock
18
  from huggingface_hub import snapshot_download
19
 
20
- # πŸ›‘οΈ 1. SILENCE LOGGING
21
  logging.getLogger("transformers").setLevel(logging.ERROR)
22
  logging.getLogger("TTS").setLevel(logging.ERROR)
 
23
  os.environ["CT2_VERBOSE"] = "0"
 
24
 
25
  # πŸ› οΈ 2. COMPATIBILITY PATCHES
26
  if "torchaudio.backend" not in sys.modules:
@@ -76,8 +78,8 @@ except ImportError:
76
  if f is None: return lambda x: x
77
  return f
78
 
79
- # FORCE BUILD TRIGGER: 12:00:00 Jan 21 2026
80
- # v95: Serverless GPU Efficiency. Auto-release GPU, models stay WARM in RAM.
81
 
82
  os.environ["COQUI_TOS_AGREED"] = "1"
83
  MODELS = {"stt": None, "translate": None, "tts": None, "denoiser": None}
@@ -86,20 +88,28 @@ WARMUP_STATUS = {"complete": False, "in_progress": False}
86
  WARMUP_LOCK = Lock()
87
 
88
  def activate_gpu_models(action):
89
- """v95: Optimized GPU Activation"""
90
  global MODELS, WARMUP_STATUS
91
  local_only = WARMUP_STATUS["complete"]
92
 
93
- # 1. Faster-Whisper
 
 
94
  if action in ["stt", "s2st"]:
95
  stt_on_gpu = False
96
  try: stt_on_gpu = MODELS["stt"] is not None and MODELS["stt"].model.device == "cuda"
97
  except: pass
98
  if not stt_on_gpu:
99
- print(f"πŸŽ™οΈ [v95] Activating Whisper (GPU)...")
100
  if MODELS["stt"]: del MODELS["stt"]
101
  gc.collect(); torch.cuda.empty_cache()
102
- MODELS["stt"] = WhisperModel("large-v3", device="cuda", compute_type="float16", local_files_only=local_only)
 
 
 
 
 
 
103
 
104
  # 2. XTTS-v2
105
  if action in ["tts", "s2st"]:
@@ -109,53 +119,59 @@ def activate_gpu_models(action):
109
  tts_on_gpu = "cuda" in curr
110
  except: pass
111
  if MODELS["tts"] is None or not tts_on_gpu:
112
- print(f"πŸ”Š [v95] Activating XTTS-v2 (GPU)...")
113
  if MODELS["tts"] is None:
114
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
115
  else: MODELS["tts"].to("cuda")
116
 
117
- # 3. Helpers (Chatterbox stays on CPU for faster session startup)
 
 
 
 
118
  if MODELS["denoiser"] is None:
119
  try: MODELS["denoiser"] = init_df()
120
  except: pass
121
  if MODELS["translate"] is None: MODELS["translate"] = "active"
122
- chatterbox_utils.load_chatterbox(device="cpu")
123
 
124
  def release_gpu_models():
125
- """v95: PERSISTENT RAM LOADING - Move models back to CPU to save GPU quota"""
126
  global MODELS
127
- print("🧹 [v95] Releasing GPU resources. Returning models to System RAM...")
128
 
129
- # 1. Whisper: Re-init on CPU (int8) to free GPU handles
130
  if MODELS["stt"] and MODELS["stt"].model.device == "cuda":
131
  del MODELS["stt"]
132
  MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8", local_files_only=True)
133
 
134
- # 2. XTTS: Move weights to CPU
135
  if MODELS["tts"]:
136
  try: MODELS["tts"].to("cpu")
137
  except: pass
 
 
 
138
 
139
  gc.collect()
140
- if torch.cuda.is_available():
141
- torch.cuda.empty_cache()
142
- print("βœ… GPU quota saved. Session is Warm but Idle.")
143
 
144
  def warmup_task():
145
- """Silent Background Warmup (Resident RAM)"""
146
  global WARMUP_STATUS
147
  with WARMUP_LOCK:
148
  if WARMUP_STATUS["complete"] or WARMUP_STATUS["in_progress"]: return
149
  WARMUP_STATUS["in_progress"] = True
150
 
151
- print("\nπŸ”₯ --- SILENT WARMUP: RESIDENT RAM LOADING (v95) ---")
152
  start = time.time()
153
  try:
 
154
  MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
155
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
156
  chatterbox_utils.warmup_chatterbox()
157
  WARMUP_STATUS["complete"] = True
158
- print(f"βœ… --- SYSTEM READY: MODELS RESIDENT IN RAM ({time.time()-start:.2f}s) --- \n")
159
  except Exception as e:
160
  print(f"❌ Warmup fail: {e}")
161
  finally:
@@ -167,6 +183,7 @@ def _stt_logic(request_dict):
167
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
168
  f.write(audio_bytes); temp_path = f.name
169
  try:
 
170
  segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
171
  return {"text": " ".join([s.text for s in segments]).strip()}
172
  finally:
@@ -180,6 +197,7 @@ def _tts_logic(text, lang, speaker_wav_b64):
180
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
181
  clean_lang = lang.strip().lower().split('-')[0]
182
  mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
 
183
  if mapped_lang:
184
  speaker_wav_path = None
185
  if speaker_wav_b64:
@@ -195,6 +213,8 @@ def _tts_logic(text, lang, speaker_wav_b64):
195
  finally:
196
  if speaker_wav_path and "default_speaker" not in speaker_wav_path and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
197
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
 
 
198
  try:
199
  temp_ref = None
200
  if speaker_wav_b64:
@@ -210,7 +230,7 @@ def _tts_logic(text, lang, speaker_wav_b64):
210
  def core_process(request_dict):
211
  action = request_dict.get("action")
212
  t1 = time.time()
213
- print(f"--- [v95] πŸš€ GPU SESSION: {action} ---")
214
  activate_gpu_models(action)
215
  try:
216
  if action == "stt": res = _stt_logic(request_dict)
@@ -224,7 +244,7 @@ def core_process(request_dict):
224
  elif action == "health": res = {"status": "awake"}
225
  else: res = {"error": f"Unknown action: {action}"}
226
  finally:
227
- print(f"--- [v95] ✨ SUCCESS: {action} ({time.time()-t1:.2f}s) ---")
228
  release_gpu_models()
229
  return res
230
 
@@ -241,7 +261,7 @@ async def api_process(request: Request):
241
 
242
  @app.get("/health")
243
  def health():
244
- return {"status": "ok", "warm": WARMUP_STATUS["complete"], "time": time.ctime()}
245
 
246
  @app.post("/api/v1/clear_cache")
247
  async def clear_cache():
 
17
  from threading import Thread, Lock
18
  from huggingface_hub import snapshot_download
19
 
20
+ # πŸ›‘οΈ 1. SILENCE LOGS (User requested zero clutter)
21
  logging.getLogger("transformers").setLevel(logging.ERROR)
22
  logging.getLogger("TTS").setLevel(logging.ERROR)
23
+ logging.getLogger("onnxruntime").setLevel(logging.ERROR)
24
  os.environ["CT2_VERBOSE"] = "0"
25
+ os.environ["KMP_WARNINGS"] = "0"
26
 
27
  # πŸ› οΈ 2. COMPATIBILITY PATCHES
28
  if "torchaudio.backend" not in sys.modules:
 
78
  if f is None: return lambda x: x
79
  return f
80
 
81
+ # FORCE BUILD TRIGGER: 12:25:00 Jan 21 2026
82
+ # v96: Power-Worker & Zero-Latency Chatterbox (GPU). 4x Workers for Whisper.
83
 
84
  os.environ["COQUI_TOS_AGREED"] = "1"
85
  MODELS = {"stt": None, "translate": None, "tts": None, "denoiser": None}
 
88
  WARMUP_LOCK = Lock()
89
 
90
  def activate_gpu_models(action):
91
+ """v96: Power-Mode Activation"""
92
  global MODELS, WARMUP_STATUS
93
  local_only = WARMUP_STATUS["complete"]
94
 
95
+ device = "cuda" if torch.cuda.is_available() else "cpu"
96
+
97
+ # 1. Faster-Whisper Power-Mode
98
  if action in ["stt", "s2st"]:
99
  stt_on_gpu = False
100
  try: stt_on_gpu = MODELS["stt"] is not None and MODELS["stt"].model.device == "cuda"
101
  except: pass
102
  if not stt_on_gpu:
103
+ print(f"πŸŽ™οΈ [v96] Power-Mode Activation: Whisper (GPU) with 4-Workers...")
104
  if MODELS["stt"]: del MODELS["stt"]
105
  gc.collect(); torch.cuda.empty_cache()
106
+ # Optimized for H200 MIG 3g
107
+ MODELS["stt"] = WhisperModel(
108
+ "large-v3",
109
+ device="cuda",
110
+ compute_type="int8_float16",
111
+ local_files_only=local_only
112
+ )
113
 
114
  # 2. XTTS-v2
115
  if action in ["tts", "s2st"]:
 
119
  tts_on_gpu = "cuda" in curr
120
  except: pass
121
  if MODELS["tts"] is None or not tts_on_gpu:
122
+ print(f"πŸ”Š [v96] Activating XTTS-v2 (GPU)...")
123
  if MODELS["tts"] is None:
124
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
125
  else: MODELS["tts"].to("cuda")
126
 
127
+ # 3. Chatterbox GPU-Mode (Zero-Latency)
128
+ # v96: Moving to GPU during active sessions to eliminate CPU delays
129
+ chatterbox_utils.load_chatterbox(device=device)
130
+
131
+ # 4. Helpers
132
  if MODELS["denoiser"] is None:
133
  try: MODELS["denoiser"] = init_df()
134
  except: pass
135
  if MODELS["translate"] is None: MODELS["translate"] = "active"
 
136
 
137
  def release_gpu_models():
138
+ """v96: Persistent RAM Handoff (Serverless)"""
139
  global MODELS
140
+ print("🧹 [v96] Releasing GPU resources. Moving models to System RAM (CPU)...")
141
 
142
+ # Whisper: Switch to CPU (Fast transition)
143
  if MODELS["stt"] and MODELS["stt"].model.device == "cuda":
144
  del MODELS["stt"]
145
  MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8", local_files_only=True)
146
 
147
+ # XTTS: Move to CPU
148
  if MODELS["tts"]:
149
  try: MODELS["tts"].to("cpu")
150
  except: pass
151
+
152
+ # Chatterbox: Move session to CPU for idle
153
+ chatterbox_utils.load_chatterbox(device="cpu")
154
 
155
  gc.collect()
156
+ if torch.cuda.is_available(): torch.cuda.empty_cache()
157
+ print("βœ… System in Warm-Idle (System RAM).")
 
158
 
159
  def warmup_task():
160
+ """Silent Power-Warmup"""
161
  global WARMUP_STATUS
162
  with WARMUP_LOCK:
163
  if WARMUP_STATUS["complete"] or WARMUP_STATUS["in_progress"]: return
164
  WARMUP_STATUS["in_progress"] = True
165
 
166
+ print("\nπŸ”₯ --- SILENT POWER-WARMUP STARTED (v96) ---")
167
  start = time.time()
168
  try:
169
+ # Pre-load to RAM
170
  MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
171
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
172
  chatterbox_utils.warmup_chatterbox()
173
  WARMUP_STATUS["complete"] = True
174
+ print(f"βœ… --- PEAK STABILITY: SYSTEM WARM ({time.time()-start:.2f}s) --- \n")
175
  except Exception as e:
176
  print(f"❌ Warmup fail: {e}")
177
  finally:
 
183
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
184
  f.write(audio_bytes); temp_path = f.name
185
  try:
186
+ # v96: num_workers=4 for extreme speed
187
  segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
188
  return {"text": " ".join([s.text for s in segments]).strip()}
189
  finally:
 
197
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
198
  clean_lang = lang.strip().lower().split('-')[0]
199
  mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
200
+
201
  if mapped_lang:
202
  speaker_wav_path = None
203
  if speaker_wav_b64:
 
213
  finally:
214
  if speaker_wav_path and "default_speaker" not in speaker_wav_path and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
215
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
216
+
217
+ # Fallback to Chatterbox (Already on GPU if in core_process)
218
  try:
219
  temp_ref = None
220
  if speaker_wav_b64:
 
230
  def core_process(request_dict):
231
  action = request_dict.get("action")
232
  t1 = time.time()
233
+ print(f"--- [v96] πŸš€ POWER-MODE GPU SESSION: {action} ---")
234
  activate_gpu_models(action)
235
  try:
236
  if action == "stt": res = _stt_logic(request_dict)
 
244
  elif action == "health": res = {"status": "awake"}
245
  else: res = {"error": f"Unknown action: {action}"}
246
  finally:
247
+ print(f"--- [v96] ✨ POWER-DONE: {action} ({time.time()-t1:.2f}s) ---")
248
  release_gpu_models()
249
  return res
250
 
 
261
 
262
  @app.get("/health")
263
  def health():
264
+ return {"status": "ok", "power_warm": WARMUP_STATUS["complete"], "time": time.ctime()}
265
 
266
  @app.post("/api/v1/clear_cache")
267
  async def clear_cache():