TGPro1 committed on
Commit
2e66333
·
verified ·
1 Parent(s): 4c2d97a

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +57 -67
app.py CHANGED
@@ -13,6 +13,15 @@ import torchaudio
13
  import chatterbox_utils
14
  import gc
15
 
 
 
 
 
 
 
 
 
 
16
  # πŸ›‘οΈ ZeroGPU Support
17
  try:
18
  import spaces
@@ -21,8 +30,9 @@ except ImportError:
21
  print("⚠️ Spaces library not found. Using mock decorator for local run.")
22
  class spaces:
23
  @staticmethod
24
- def GPU(f): return f
25
-
 
26
 
27
  # πŸ› οΈ Monkeypatch torchaudio.backend (DeepFilterNet compatibility)
28
  import sys
@@ -58,100 +68,73 @@ if not hasattr(torchaudio, "info"):
58
  return SimpleNamespace(sample_rate=48000, num_frames=0, num_channels=1)
59
  torchaudio.info = mock_info
60
 
61
- from df.enhance import enhance, init_df, load_audio, save_audio
62
-
63
- # FORCE BUILD TRIGGER: 10:20:00 Jan 21 2026
64
- # v87: Targeted GPU Activation (Only loads what's needed for the specific action)
65
-
66
- # πŸ› οΈ Monkeypatch torchaudio.load
67
- try:
68
- _orig_load = torchaudio.load
69
- def patched_load(filepath, *args, **kwargs):
70
- try:
71
- return _orig_load(filepath, *args, **kwargs)
72
- except ImportError as e:
73
- if "torchcodec" in str(e).lower():
74
- print(f"⚠️ Redirecting load for {filepath} via soundfile")
75
- import soundfile as sf
76
- data, samplerate = sf.read(filepath)
77
- t = torch.from_numpy(data).float()
78
- if len(t.shape) == 1: t = t.unsqueeze(0)
79
- else: t = t.T
80
- return t, samplerate
81
- raise e
82
- torchaudio.load = patched_load
83
- print("βœ… Torchaudio monkeypatched successfully")
84
- except Exception as e:
85
- print(f"⚠️ Failed to monkeypatch torchaudio: {e}")
86
 
87
  os.environ["COQUI_TOS_AGREED"] = "1"
88
 
89
- # Global models (Resident in RAM)
90
  MODELS = {"stt": None, "translate": None, "tts": None, "denoiser": None}
91
 
92
  def activate_gpu_models(action):
93
- """v87: Targetted activation of models on GPU to save time"""
94
  global MODELS
95
 
96
- # 1. Faster-Whisper (Activate only if action needs it)
97
  if action in ["stt", "s2st"]:
98
- is_cuda = False
99
- try:
100
- # Check current device
101
- if hasattr(MODELS["stt"], "model") and MODELS["stt"].model.device == "cuda":
102
- is_cuda = True
103
- except: pass
104
-
105
- if not is_cuda:
106
  print(f"πŸŽ™οΈ Activating Whisper on GPU for {action}...")
107
- from faster_whisper import WhisperModel
108
  MODELS["stt"] = WhisperModel("large-v3", device="cuda", compute_type="float16")
109
 
110
- # 2. XTTS-v2 (Activate only if action needs it)
111
  if action in ["tts", "s2st"]:
112
  if MODELS["tts"] is None:
113
- from TTS.api import TTS
114
  print("πŸ”Š Initializing XTTS to RAM...")
115
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
116
 
117
  try:
118
  current_dev = str(next(MODELS["tts"].synthesizer.tts_model.parameters()).device)
119
  if "cuda" not in current_dev:
120
- print(f"πŸš€ Activating XTTS-v2 on GPU for {action}...")
121
  MODELS["tts"].to("cuda")
122
  except:
123
  MODELS["tts"].to("cuda")
124
 
125
- # 3. Denoiser & Translate & Chatterbox
126
- if action in ["tts", "s2st", "stt"]:
127
- if MODELS["denoiser"] is None:
128
- try: MODELS["denoiser"] = init_df()
129
- except: pass
130
- if MODELS["translate"] is None: MODELS["translate"] = "active"
131
- chatterbox_utils.load_chatterbox(device="cuda" if torch.cuda.is_available() else "cpu")
 
 
132
 
133
- # 🧹 Cleanup
134
  gc.collect()
135
  if torch.cuda.is_available():
136
  torch.cuda.empty_cache()
137
 
138
  def warmup_models():
139
- """Download models at startup to System RAM"""
140
- print("\nπŸ”₯ --- SYSTEM WARMUP: RAM CACHING (v87) ---")
141
  start = time.time()
142
  try:
143
- from faster_whisper import WhisperModel
144
- print("πŸ“₯ Caching Whisper to RAM...")
145
  MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
146
 
147
- from TTS.api import TTS
148
- print("πŸ“₯ Caching XTTS-v2 to RAM...")
149
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
150
 
 
 
 
 
151
  chatterbox_utils.warmup_chatterbox()
152
- print(f"βœ… --- WARMUP COMPLETE ({time.time()-start:.2f}s) --- \n")
153
  except Exception as e:
154
- print(f"⚠️ Warmup warning: {e}")
155
 
156
  def _stt_logic(request_dict):
157
  audio_bytes = base64.b64decode(request_dict.get("file"))
@@ -160,7 +143,8 @@ def _stt_logic(request_dict):
160
  f.write(audio_bytes); temp_path = f.name
161
  try:
162
  segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
163
- return {"text": " ".join([s.text for s in segments]).strip()}
 
164
  finally:
165
  if os.path.exists(temp_path): os.unlink(temp_path)
166
 
@@ -179,7 +163,7 @@ def _tts_logic(text, lang, speaker_wav_b64):
179
  mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
180
 
181
  if mapped_lang:
182
- print(f"[v87] Use XTTS: {mapped_lang}")
183
  speaker_wav_path = None
184
  if speaker_wav_b64:
185
  sb = base64.b64decode(speaker_wav_b64)
@@ -193,11 +177,11 @@ def _tts_logic(text, lang, speaker_wav_b64):
193
  with open(output_path, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode()
194
  return {"audio": audio_b64}
195
  finally:
196
- if speaker_wav_path and "default_speaker" not in speaker_wav_path:
197
- if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
198
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
199
 
200
- print(f"[v87] Use Chatterbox: {clean_lang}")
 
201
  try:
202
  temp_ref = None
203
  if speaker_wav_b64:
@@ -209,27 +193,33 @@ def _tts_logic(text, lang, speaker_wav_b64):
209
  return {"audio": base64.b64encode(audio_bytes).decode()}
210
  except Exception as e: return {"error": f"TTS Failure: {str(e)}"}
211
 
212
- @spaces.GPU
 
213
  def core_process(request_dict):
 
214
  action = request_dict.get("action")
215
  t0 = time.time()
216
- print(f"--- [v87] πŸš€ GPU START: {action} ---")
 
 
217
  activate_gpu_models(action)
 
218
  try:
219
  if action == "stt": res = _stt_logic(request_dict)
220
  elif action == "translate": res = {"translated": _translate_logic(request_dict.get("text"), request_dict.get("target_lang", "en"))}
221
  elif action == "tts": res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
222
  elif action == "s2st":
 
223
  stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
224
  text = stt_res.get("text", "")
225
- if not text: return {"error": "No speech"}
226
  translated = _translate_logic(text, request_dict.get("target_lang"))
227
  tts_res = _tts_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
228
  res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
229
  elif action == "health": res = {"status": "awake"}
230
  else: res = {"error": f"Unknown action: {action}"}
231
  finally:
232
- print(f"--- [v87] ✨ END: {action} ({time.time()-t0:.2f}s) ---")
233
  gc.collect()
234
  if torch.cuda.is_available(): torch.cuda.empty_cache()
235
  return res
 
13
  import chatterbox_utils
14
  import gc
15
 
16
+ # πŸ›‘οΈ BULKY IMPORTS AT TOP-LEVEL (v88 Optimization)
17
+ # Pre-loading these into RAM at startup so they are READY when GPU session starts
18
+ print("πŸ“¦ Pre-loading AI Engines into RAM...")
19
+ from faster_whisper import WhisperModel
20
+ from TTS.api import TTS
21
+ from df.enhance import init_df, enhance, load_audio, save_audio
22
+ import deep_translator
23
+ print("βœ… Imports Complete")
24
+
25
  # πŸ›‘οΈ ZeroGPU Support
26
  try:
27
  import spaces
 
30
  print("⚠️ Spaces library not found. Using mock decorator for local run.")
31
  class spaces:
32
  @staticmethod
33
+ def GPU(duration=60, f=None):
34
+ if f is None: return lambda x: x
35
+ return f
36
 
37
  # πŸ› οΈ Monkeypatch torchaudio.backend (DeepFilterNet compatibility)
38
  import sys
 
68
  return SimpleNamespace(sample_rate=48000, num_frames=0, num_channels=1)
69
  torchaudio.info = mock_info
70
 
71
+ # FORCE BUILD TRIGGER: 10:30:00 Jan 21 2026
72
+ # v88: Mandatory GPU-Only (STT + TTS). Fast Activation + 150s Duration.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  os.environ["COQUI_TOS_AGREED"] = "1"
75
 
76
+ # Global models (Resident in System RAM)
77
  MODELS = {"stt": None, "translate": None, "tts": None, "denoiser": None}
78
 
79
  def activate_gpu_models(action):
80
+ """v88: Fast GPU Movement and Activation"""
81
  global MODELS
82
 
83
+ # 1. Faster-Whisper GPU Activation
84
  if action in ["stt", "s2st"]:
85
+ if MODELS["stt"] is None or MODELS["stt"].model.device != "cuda":
 
 
 
 
 
 
 
86
  print(f"πŸŽ™οΈ Activating Whisper on GPU for {action}...")
87
+ # We re-init to move to CUDA. Since weights are cached, this is fast.
88
  MODELS["stt"] = WhisperModel("large-v3", device="cuda", compute_type="float16")
89
 
90
+ # 2. XTTS-v2 GPU Activation
91
  if action in ["tts", "s2st"]:
92
  if MODELS["tts"] is None:
 
93
  print("πŸ”Š Initializing XTTS to RAM...")
94
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
95
 
96
  try:
97
  current_dev = str(next(MODELS["tts"].synthesizer.tts_model.parameters()).device)
98
  if "cuda" not in current_dev:
99
+ print(f"πŸš€ Moving XTTS-v2 to GPU...")
100
  MODELS["tts"].to("cuda")
101
  except:
102
  MODELS["tts"].to("cuda")
103
 
104
+ # 3. Denoiser & Translate
105
+ if MODELS["denoiser"] is None:
106
+ try: MODELS["denoiser"] = init_df()
107
+ except: pass
108
+ if MODELS["translate"] is None:
109
+ MODELS["translate"] = "active"
110
+
111
+ # Chatterbox (STAY CPU if no GPU available for it, or use CUDA if ONNX allows)
112
+ chatterbox_utils.load_chatterbox(device="cuda" if torch.cuda.is_available() else "cpu")
113
 
114
+ # 🧹 Mem Cleanup
115
  gc.collect()
116
  if torch.cuda.is_available():
117
  torch.cuda.empty_cache()
118
 
119
  def warmup_models():
120
+ """PRE-LOAD EVERYTHING INTO SYSTEM RAM (CPU)"""
121
+ print("\nπŸ”₯ --- SYSTEM STARTUP: RESIDENT RAM LOADING (v88) ---")
122
  start = time.time()
123
  try:
124
+ print("πŸ“₯ Pre-loading Whisper large-v3 to RAM...")
 
125
  MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
126
 
127
+ print("πŸ“₯ Pre-loading XTTS-v2 to RAM...")
 
128
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
129
 
130
+ print("πŸ“₯ Pre-loading DeepFilterNet...")
131
+ try: MODELS["denoiser"] = init_df()
132
+ except: pass
133
+
134
  chatterbox_utils.warmup_chatterbox()
135
+ print(f"βœ… --- SYSTEM READY: MODELS IN RAM ({time.time()-start:.2f}s) --- \n")
136
  except Exception as e:
137
+ print(f"⚠️ Startup warning: {e}")
138
 
139
  def _stt_logic(request_dict):
140
  audio_bytes = base64.b64decode(request_dict.get("file"))
 
143
  f.write(audio_bytes); temp_path = f.name
144
  try:
145
  segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
146
+ text = " ".join([s.text for s in segments]).strip()
147
+ return {"text": text}
148
  finally:
149
  if os.path.exists(temp_path): os.unlink(temp_path)
150
 
 
163
  mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
164
 
165
  if mapped_lang:
166
+ print(f"[v88] GPU Inference: XTTS-v2 for '{mapped_lang}'")
167
  speaker_wav_path = None
168
  if speaker_wav_b64:
169
  sb = base64.b64decode(speaker_wav_b64)
 
177
  with open(output_path, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode()
178
  return {"audio": audio_b64}
179
  finally:
180
+ if speaker_wav_path and "default_speaker" not in speaker_wav_path and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
 
181
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
182
 
183
+ # Fallback path
184
+ print(f"[v88] Inference: Chatterbox Fallback for '{clean_lang}'")
185
  try:
186
  temp_ref = None
187
  if speaker_wav_b64:
 
193
  return {"audio": base64.b64encode(audio_bytes).decode()}
194
  except Exception as e: return {"error": f"TTS Failure: {str(e)}"}
195
 
196
+ # πŸš€ AGGRESSIVE GPU SESSION (150s Duration)
197
+ @spaces.GPU(duration=150)
198
  def core_process(request_dict):
199
+ """MANDATORY GPU ENTRY POINT (v88)"""
200
  action = request_dict.get("action")
201
  t0 = time.time()
202
+ print(f"--- [v88] πŸš€ GPU SESSION START: {action} ---")
203
+
204
+ # v88 Optimization: Only activate models for current action
205
  activate_gpu_models(action)
206
+
207
  try:
208
  if action == "stt": res = _stt_logic(request_dict)
209
  elif action == "translate": res = {"translated": _translate_logic(request_dict.get("text"), request_dict.get("target_lang", "en"))}
210
  elif action == "tts": res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
211
  elif action == "s2st":
212
+ # Direct GPU Pipeline
213
  stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
214
  text = stt_res.get("text", "")
215
+ if not text: return {"error": "No speech detected"}
216
  translated = _translate_logic(text, request_dict.get("target_lang"))
217
  tts_res = _tts_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
218
  res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
219
  elif action == "health": res = {"status": "awake"}
220
  else: res = {"error": f"Unknown action: {action}"}
221
  finally:
222
+ print(f"--- [v88] ✨ SESSION END: {action} ({time.time()-t0:.2f}s) ---")
223
  gc.collect()
224
  if torch.cuda.is_available(): torch.cuda.empty_cache()
225
  return res