TGPro1 committed on
Commit
0c0d892
·
verified ·
1 Parent(s): a333eb5

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +57 -55
app.py CHANGED
@@ -10,33 +10,15 @@ import traceback
10
  import json
11
  import time
12
  import torchaudio
13
- import chatterbox_utils
14
  import gc
15
-
16
- # πŸ›‘οΈ BULKY IMPORTS AT TOP-LEVEL (v88 Optimization)
17
- # Pre-loading these into RAM at startup so they are READY when GPU session starts
18
- print("πŸ“¦ Pre-loading AI Engines into RAM...")
19
- from faster_whisper import WhisperModel
20
- from TTS.api import TTS
21
- from df.enhance import init_df, enhance, load_audio, save_audio
22
- import deep_translator
23
- print("βœ… Imports Complete")
24
-
25
- # πŸ›‘οΈ ZeroGPU Support
26
- try:
27
- import spaces
28
- print("βœ… ZeroGPU/Spaces detected")
29
- except ImportError:
30
- print("⚠️ Spaces library not found. Using mock decorator for local run.")
31
- class spaces:
32
- @staticmethod
33
- def GPU(duration=60, f=None):
34
- if f is None: return lambda x: x
35
- return f
36
-
37
- # πŸ› οΈ Monkeypatch torchaudio.backend (DeepFilterNet compatibility)
38
  import sys
39
  import types
 
 
 
 
 
 
40
  if "torchaudio.backend" not in sys.modules:
41
  backend = types.ModuleType("torchaudio.backend")
42
  common = types.ModuleType("torchaudio.backend.common")
@@ -49,9 +31,8 @@ if "torchaudio.backend" not in sys.modules:
49
  sys.modules["torchaudio.backend"] = backend
50
  sys.modules["torchaudio.backend.common"] = common
51
 
52
- # πŸ›‘οΈ Torchaudio Compatibility Fix
53
  if not hasattr(torchaudio, "info"):
54
- print("πŸ› οΈ Mocking torchaudio.info for compatibility...")
55
  def mock_info(filepath, **kwargs):
56
  from types import SimpleNamespace
57
  import wave
@@ -68,8 +49,48 @@ if not hasattr(torchaudio, "info"):
68
  return SimpleNamespace(sample_rate=48000, num_frames=0, num_channels=1)
69
  torchaudio.info = mock_info
70
 
71
- # FORCE BUILD TRIGGER: 10:30:00 Jan 21 2026
72
- # v88: Mandatory GPU-Only (STT + TTS). Fast Activation + 150s Duration.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  os.environ["COQUI_TOS_AGREED"] = "1"
75
 
@@ -77,22 +98,19 @@ os.environ["COQUI_TOS_AGREED"] = "1"
77
  MODELS = {"stt": None, "translate": None, "tts": None, "denoiser": None}
78
 
79
  def activate_gpu_models(action):
80
- """v88: Fast GPU Movement and Activation"""
81
  global MODELS
82
 
83
  # 1. Faster-Whisper GPU Activation
84
  if action in ["stt", "s2st"]:
85
  if MODELS["stt"] is None or MODELS["stt"].model.device != "cuda":
86
  print(f"πŸŽ™οΈ Activating Whisper on GPU for {action}...")
87
- # We re-init to move to CUDA. Since weights are cached, this is fast.
88
  MODELS["stt"] = WhisperModel("large-v3", device="cuda", compute_type="float16")
89
 
90
  # 2. XTTS-v2 GPU Activation
91
  if action in ["tts", "s2st"]:
92
  if MODELS["tts"] is None:
93
- print("πŸ”Š Initializing XTTS to RAM...")
94
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
95
-
96
  try:
97
  current_dev = str(next(MODELS["tts"].synthesizer.tts_model.parameters()).device)
98
  if "cuda" not in current_dev:
@@ -108,31 +126,24 @@ def activate_gpu_models(action):
108
  if MODELS["translate"] is None:
109
  MODELS["translate"] = "active"
110
 
111
- # Chatterbox (STAY CPU if no GPU available for it, or use CUDA if ONNX allows)
112
  chatterbox_utils.load_chatterbox(device="cuda" if torch.cuda.is_available() else "cpu")
113
-
114
- # 🧹 Mem Cleanup
115
  gc.collect()
116
- if torch.cuda.is_available():
117
- torch.cuda.empty_cache()
118
 
119
  def warmup_models():
120
- """PRE-LOAD EVERYTHING INTO SYSTEM RAM (CPU)"""
121
- print("\nπŸ”₯ --- SYSTEM STARTUP: RESIDENT RAM LOADING (v88) ---")
122
  start = time.time()
123
  try:
124
- print("πŸ“₯ Pre-loading Whisper large-v3 to RAM...")
125
  MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
126
-
127
  print("πŸ“₯ Pre-loading XTTS-v2 to RAM...")
128
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
129
-
130
  print("πŸ“₯ Pre-loading DeepFilterNet...")
131
  try: MODELS["denoiser"] = init_df()
132
  except: pass
133
-
134
  chatterbox_utils.warmup_chatterbox()
135
- print(f"βœ… --- SYSTEM READY: MODELS IN RAM ({time.time()-start:.2f}s) --- \n")
136
  except Exception as e:
137
  print(f"⚠️ Startup warning: {e}")
138
 
@@ -163,7 +174,6 @@ def _tts_logic(text, lang, speaker_wav_b64):
163
  mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
164
 
165
  if mapped_lang:
166
- print(f"[v88] GPU Inference: XTTS-v2 for '{mapped_lang}'")
167
  speaker_wav_path = None
168
  if speaker_wav_b64:
169
  sb = base64.b64decode(speaker_wav_b64)
@@ -180,8 +190,6 @@ def _tts_logic(text, lang, speaker_wav_b64):
180
  if speaker_wav_path and "default_speaker" not in speaker_wav_path and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
181
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
182
 
183
- # Fallback path
184
- print(f"[v88] Inference: Chatterbox Fallback for '{clean_lang}'")
185
  try:
186
  temp_ref = None
187
  if speaker_wav_b64:
@@ -193,23 +201,17 @@ def _tts_logic(text, lang, speaker_wav_b64):
193
  return {"audio": base64.b64encode(audio_bytes).decode()}
194
  except Exception as e: return {"error": f"TTS Failure: {str(e)}"}
195
 
196
- # πŸš€ AGGRESSIVE GPU SESSION (150s Duration)
197
  @spaces.GPU(duration=150)
198
  def core_process(request_dict):
199
- """MANDATORY GPU ENTRY POINT (v88)"""
200
  action = request_dict.get("action")
201
  t0 = time.time()
202
- print(f"--- [v88] πŸš€ GPU SESSION START: {action} ---")
203
-
204
- # v88 Optimization: Only activate models for current action
205
  activate_gpu_models(action)
206
-
207
  try:
208
  if action == "stt": res = _stt_logic(request_dict)
209
  elif action == "translate": res = {"translated": _translate_logic(request_dict.get("text"), request_dict.get("target_lang", "en"))}
210
  elif action == "tts": res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
211
  elif action == "s2st":
212
- # Direct GPU Pipeline
213
  stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
214
  text = stt_res.get("text", "")
215
  if not text: return {"error": "No speech detected"}
@@ -219,7 +221,7 @@ def core_process(request_dict):
219
  elif action == "health": res = {"status": "awake"}
220
  else: res = {"error": f"Unknown action: {action}"}
221
  finally:
222
- print(f"--- [v88] ✨ SESSION END: {action} ({time.time()-t0:.2f}s) ---")
223
  gc.collect()
224
  if torch.cuda.is_available(): torch.cuda.empty_cache()
225
  return res
 
10
  import json
11
  import time
12
  import torchaudio
 
13
  import gc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  import sys
15
  import types
16
+
17
+ # πŸ› οΈ 1. CRITICAL COMPATIBILITY MONKEYPATCHES (v89)
18
+ # These MUST happen before importing df (DeepFilterNet) or other audio tools
19
+ print("πŸ› οΈ Applying compatibility monkeypatches...")
20
+
21
+ # Patch torchaudio.backend for DeepFilterNet
22
  if "torchaudio.backend" not in sys.modules:
23
  backend = types.ModuleType("torchaudio.backend")
24
  common = types.ModuleType("torchaudio.backend.common")
 
31
  sys.modules["torchaudio.backend"] = backend
32
  sys.modules["torchaudio.backend.common"] = common
33
 
34
+ # Mock torchaudio.info
35
  if not hasattr(torchaudio, "info"):
 
36
  def mock_info(filepath, **kwargs):
37
  from types import SimpleNamespace
38
  import wave
 
49
  return SimpleNamespace(sample_rate=48000, num_frames=0, num_channels=1)
50
  torchaudio.info = mock_info
51
 
52
+ # Patch torchaudio.load
53
+ try:
54
+ _orig_load = torchaudio.load
55
+ def patched_load(filepath, *args, **kwargs):
56
+ try:
57
+ return _orig_load(filepath, *args, **kwargs)
58
+ except ImportError as e:
59
+ if "torchcodec" in str(e).lower():
60
+ import soundfile as sf
61
+ data, samplerate = sf.read(filepath)
62
+ t = torch.from_numpy(data).float()
63
+ if len(t.shape) == 1: t = t.unsqueeze(0)
64
+ else: t = t.T
65
+ return t, samplerate
66
+ raise e
67
+ torchaudio.load = patched_load
68
+ print("βœ… Torchaudio patched")
69
+ except Exception as e:
70
+ print(f"⚠️ Patch failed: {e}")
71
+
72
+ # πŸ“¦ 2. BULKY IMPORTS (After patches)
73
+ print("πŸ“¦ Pre-loading AI Engines...")
74
+ import chatterbox_utils
75
+ from faster_whisper import WhisperModel
76
+ from TTS.api import TTS
77
+ from df.enhance import init_df, enhance, load_audio, save_audio
78
+ import deep_translator
79
+ print("βœ… Imports Complete")
80
+
81
+ # πŸ›‘οΈ ZeroGPU Support
82
+ try:
83
+ import spaces
84
+ print("βœ… ZeroGPU/Spaces detected")
85
+ except ImportError:
86
+ class spaces:
87
+ @staticmethod
88
+ def GPU(duration=60, f=None):
89
+ if f is None: return lambda x: x
90
+ return f
91
+
92
+ # FORCE BUILD TRIGGER: 10:45:00 Jan 21 2026
93
+ # v89: Fixed Import Order (Resolved ModuleNotFoundError)
94
 
95
  os.environ["COQUI_TOS_AGREED"] = "1"
96
 
 
98
  MODELS = {"stt": None, "translate": None, "tts": None, "denoiser": None}
99
 
100
  def activate_gpu_models(action):
101
+ """Fast GPU Activation"""
102
  global MODELS
103
 
104
  # 1. Faster-Whisper GPU Activation
105
  if action in ["stt", "s2st"]:
106
  if MODELS["stt"] is None or MODELS["stt"].model.device != "cuda":
107
  print(f"πŸŽ™οΈ Activating Whisper on GPU for {action}...")
 
108
  MODELS["stt"] = WhisperModel("large-v3", device="cuda", compute_type="float16")
109
 
110
  # 2. XTTS-v2 GPU Activation
111
  if action in ["tts", "s2st"]:
112
  if MODELS["tts"] is None:
 
113
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
 
114
  try:
115
  current_dev = str(next(MODELS["tts"].synthesizer.tts_model.parameters()).device)
116
  if "cuda" not in current_dev:
 
126
  if MODELS["translate"] is None:
127
  MODELS["translate"] = "active"
128
 
 
129
  chatterbox_utils.load_chatterbox(device="cuda" if torch.cuda.is_available() else "cpu")
 
 
130
  gc.collect()
131
+ if torch.cuda.is_available(): torch.cuda.empty_cache()
 
132
 
133
  def warmup_models():
134
+ """PRE-LOAD MODELS INTO SYSTEM RAM (CPU)"""
135
+ print("\nπŸ”₯ --- SYSTEM STARTUP: RAM LOADING (v89) ---")
136
  start = time.time()
137
  try:
138
+ print("πŸ“₯ Pre-loading Whisper to RAM...")
139
  MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
 
140
  print("πŸ“₯ Pre-loading XTTS-v2 to RAM...")
141
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
 
142
  print("πŸ“₯ Pre-loading DeepFilterNet...")
143
  try: MODELS["denoiser"] = init_df()
144
  except: pass
 
145
  chatterbox_utils.warmup_chatterbox()
146
+ print(f"βœ… --- SYSTEM READY ({time.time()-start:.2f}s) --- \n")
147
  except Exception as e:
148
  print(f"⚠️ Startup warning: {e}")
149
 
 
174
  mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
175
 
176
  if mapped_lang:
 
177
  speaker_wav_path = None
178
  if speaker_wav_b64:
179
  sb = base64.b64decode(speaker_wav_b64)
 
190
  if speaker_wav_path and "default_speaker" not in speaker_wav_path and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
191
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
192
 
 
 
193
  try:
194
  temp_ref = None
195
  if speaker_wav_b64:
 
201
  return {"audio": base64.b64encode(audio_bytes).decode()}
202
  except Exception as e: return {"error": f"TTS Failure: {str(e)}"}
203
 
 
204
  @spaces.GPU(duration=150)
205
  def core_process(request_dict):
 
206
  action = request_dict.get("action")
207
  t0 = time.time()
208
+ print(f"--- [v89] πŸš€ GPU SESSION START: {action} ---")
 
 
209
  activate_gpu_models(action)
 
210
  try:
211
  if action == "stt": res = _stt_logic(request_dict)
212
  elif action == "translate": res = {"translated": _translate_logic(request_dict.get("text"), request_dict.get("target_lang", "en"))}
213
  elif action == "tts": res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
214
  elif action == "s2st":
 
215
  stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
216
  text = stt_res.get("text", "")
217
  if not text: return {"error": "No speech detected"}
 
221
  elif action == "health": res = {"status": "awake"}
222
  else: res = {"error": f"Unknown action: {action}"}
223
  finally:
224
+ print(f"--- [v89] ✨ SESSION END: {action} ({time.time()-t0:.2f}s) ---")
225
  gc.collect()
226
  if torch.cuda.is_available(): torch.cuda.empty_cache()
227
  return res