TGPro1 committed on
Commit
a80de4d
·
verified ·
1 Parent(s): 0c0d892

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +57 -74
app.py CHANGED
@@ -14,16 +14,12 @@ import gc
14
  import sys
15
  import types
16
 
17
- # πŸ› οΈ 1. CRITICAL COMPATIBILITY MONKEYPATCHES (v89)
18
- # These MUST happen before importing df (DeepFilterNet) or other audio tools
19
  print("πŸ› οΈ Applying compatibility monkeypatches...")
20
-
21
- # Patch torchaudio.backend for DeepFilterNet
22
  if "torchaudio.backend" not in sys.modules:
23
  backend = types.ModuleType("torchaudio.backend")
24
  common = types.ModuleType("torchaudio.backend.common")
25
- try:
26
- common.AudioMetaData = torchaudio.AudioMetaData
27
  except AttributeError:
28
  class AudioMetaData: pass
29
  common.AudioMetaData = AudioMetaData
@@ -31,30 +27,20 @@ if "torchaudio.backend" not in sys.modules:
31
  sys.modules["torchaudio.backend"] = backend
32
  sys.modules["torchaudio.backend.common"] = common
33
 
34
- # Mock torchaudio.info
35
  if not hasattr(torchaudio, "info"):
36
  def mock_info(filepath, **kwargs):
37
  from types import SimpleNamespace
38
  import wave
39
  try:
40
  with wave.open(filepath, "rb") as f:
41
- return SimpleNamespace(
42
- sample_rate=f.getframerate(),
43
- num_frames=f.getnframes(),
44
- num_channels=f.getnchannels(),
45
- bits_per_sample=f.getsampwidth() * 8,
46
- encoding="PCM_S"
47
- )
48
- except:
49
- return SimpleNamespace(sample_rate=48000, num_frames=0, num_channels=1)
50
  torchaudio.info = mock_info
51
 
52
- # Patch torchaudio.load
53
  try:
54
  _orig_load = torchaudio.load
55
  def patched_load(filepath, *args, **kwargs):
56
- try:
57
- return _orig_load(filepath, *args, **kwargs)
58
  except ImportError as e:
59
  if "torchcodec" in str(e).lower():
60
  import soundfile as sf
@@ -66,11 +52,10 @@ try:
66
  raise e
67
  torchaudio.load = patched_load
68
  print("βœ… Torchaudio patched")
69
- except Exception as e:
70
- print(f"⚠️ Patch failed: {e}")
71
 
72
- # πŸ“¦ 2. BULKY IMPORTS (After patches)
73
- print("πŸ“¦ Pre-loading AI Engines...")
74
  import chatterbox_utils
75
  from faster_whisper import WhisperModel
76
  from TTS.api import TTS
@@ -78,7 +63,6 @@ from df.enhance import init_df, enhance, load_audio, save_audio
78
  import deep_translator
79
  print("βœ… Imports Complete")
80
 
81
- # πŸ›‘οΈ ZeroGPU Support
82
  try:
83
  import spaces
84
  print("βœ… ZeroGPU/Spaces detected")
@@ -89,23 +73,46 @@ except ImportError:
89
  if f is None: return lambda x: x
90
  return f
91
 
92
- # FORCE BUILD TRIGGER: 10:45:00 Jan 21 2026
93
- # v89: Fixed Import Order (Resolved ModuleNotFoundError)
94
 
95
  os.environ["COQUI_TOS_AGREED"] = "1"
96
-
97
- # Global models (Resident in System RAM)
98
  MODELS = {"stt": None, "translate": None, "tts": None, "denoiser": None}
99
 
100
  def activate_gpu_models(action):
101
- """Fast GPU Activation"""
102
  global MODELS
103
 
104
  # 1. Faster-Whisper GPU Activation
105
  if action in ["stt", "s2st"]:
106
- if MODELS["stt"] is None or MODELS["stt"].model.device != "cuda":
 
 
 
 
 
 
107
  print(f"πŸŽ™οΈ Activating Whisper on GPU for {action}...")
108
- MODELS["stt"] = WhisperModel("large-v3", device="cuda", compute_type="float16")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  # 2. XTTS-v2 GPU Activation
111
  if action in ["tts", "s2st"]:
@@ -116,36 +123,27 @@ def activate_gpu_models(action):
116
  if "cuda" not in current_dev:
117
  print(f"πŸš€ Moving XTTS-v2 to GPU...")
118
  MODELS["tts"].to("cuda")
119
- except:
120
- MODELS["tts"].to("cuda")
121
 
122
- # 3. Denoiser & Translate
123
  if MODELS["denoiser"] is None:
124
  try: MODELS["denoiser"] = init_df()
125
  except: pass
126
- if MODELS["translate"] is None:
127
- MODELS["translate"] = "active"
128
-
129
  chatterbox_utils.load_chatterbox(device="cuda" if torch.cuda.is_available() else "cpu")
130
- gc.collect()
131
- if torch.cuda.is_available(): torch.cuda.empty_cache()
132
 
133
  def warmup_models():
134
- """PRE-LOAD MODELS INTO SYSTEM RAM (CPU)"""
135
- print("\nπŸ”₯ --- SYSTEM STARTUP: RAM LOADING (v89) ---")
136
  start = time.time()
137
  try:
138
- print("πŸ“₯ Pre-loading Whisper to RAM...")
139
  MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
140
- print("πŸ“₯ Pre-loading XTTS-v2 to RAM...")
141
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
142
- print("πŸ“₯ Pre-loading DeepFilterNet...")
143
  try: MODELS["denoiser"] = init_df()
144
  except: pass
145
  chatterbox_utils.warmup_chatterbox()
146
  print(f"βœ… --- SYSTEM READY ({time.time()-start:.2f}s) --- \n")
147
- except Exception as e:
148
- print(f"⚠️ Startup warning: {e}")
149
 
150
  def _stt_logic(request_dict):
151
  audio_bytes = base64.b64decode(request_dict.get("file"))
@@ -154,8 +152,7 @@ def _stt_logic(request_dict):
154
  f.write(audio_bytes); temp_path = f.name
155
  try:
156
  segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
157
- text = " ".join([s.text for s in segments]).strip()
158
- return {"text": text}
159
  finally:
160
  if os.path.exists(temp_path): os.unlink(temp_path)
161
 
@@ -165,14 +162,9 @@ def _translate_logic(text, target_lang):
165
 
166
  def _tts_logic(text, lang, speaker_wav_b64):
167
  if not text or not text.strip(): return {"error": "Input empty"}
168
- XTTS_MAP = {
169
- "en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl",
170
- "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar",
171
- "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"
172
- }
173
  clean_lang = lang.strip().lower().split('-')[0]
174
  mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
175
-
176
  if mapped_lang:
177
  speaker_wav_path = None
178
  if speaker_wav_b64:
@@ -184,12 +176,10 @@ def _tts_logic(text, lang, speaker_wav_b64):
184
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
185
  output_path = output_file.name
186
  MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=output_path, speaker_wav=speaker_wav_path)
187
- with open(output_path, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode()
188
- return {"audio": audio_b64}
189
  finally:
190
  if speaker_wav_path and "default_speaker" not in speaker_wav_path and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
191
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
192
-
193
  try:
194
  temp_ref = None
195
  if speaker_wav_b64:
@@ -205,7 +195,7 @@ def _tts_logic(text, lang, speaker_wav_b64):
205
  def core_process(request_dict):
206
  action = request_dict.get("action")
207
  t0 = time.time()
208
- print(f"--- [v89] πŸš€ GPU SESSION START: {action} ---")
209
  activate_gpu_models(action)
210
  try:
211
  if action == "stt": res = _stt_logic(request_dict)
@@ -213,29 +203,22 @@ def core_process(request_dict):
213
  elif action == "tts": res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
214
  elif action == "s2st":
215
  stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
216
- text = stt_res.get("text", "")
217
- if not text: return {"error": "No speech detected"}
218
- translated = _translate_logic(text, request_dict.get("target_lang"))
219
  tts_res = _tts_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
220
- res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
221
  elif action == "health": res = {"status": "awake"}
222
  else: res = {"error": f"Unknown action: {action}"}
223
  finally:
224
- print(f"--- [v89] ✨ SESSION END: {action} ({time.time()-t0:.2f}s) ---")
225
  gc.collect()
226
  if torch.cuda.is_available(): torch.cuda.empty_cache()
227
  return res
228
 
229
  app = FastAPI()
230
-
231
  @app.post("/api/v1/process")
232
  async def api_process(request: Request):
233
- try:
234
- data = await request.json()
235
- return core_process(data)
236
- except Exception as e:
237
- traceback.print_exc()
238
- return {"error": str(e)}
239
 
240
  @app.get("/health")
241
  def health(): return {"status": "ok", "gpu": torch.cuda.is_available(), "time": time.ctime()}
@@ -243,14 +226,14 @@ def health(): return {"status": "ok", "gpu": torch.cuda.is_available(), "time":
243
  @app.post("/api/v1/clear_cache")
244
  async def clear_cache():
245
  try:
246
- t0 = time.time(); gc.collect()
247
  if torch.cuda.is_available(): torch.cuda.empty_cache()
248
- temp_dir = tempfile.gettempdir(); count = 0
249
  for f in os.listdir(temp_dir):
250
  if f.endswith(".wav") or f.startswith("tm"):
251
- try: os.unlink(os.path.join(temp_dir, f)); count += 1
252
  except: pass
253
- return {"status": "success", "cleaned_files": count}
254
  except Exception as e: return {"status": "error", "message": str(e)}
255
 
256
  def gradio_fn(req_json):
 
14
  import sys
15
  import types
16
 
17
+ # πŸ› οΈ 1. CRITICAL COMPATIBILITY MONKEYPATCHES
 
18
  print("πŸ› οΈ Applying compatibility monkeypatches...")
 
 
19
  if "torchaudio.backend" not in sys.modules:
20
  backend = types.ModuleType("torchaudio.backend")
21
  common = types.ModuleType("torchaudio.backend.common")
22
+ try: common.AudioMetaData = torchaudio.AudioMetaData
 
23
  except AttributeError:
24
  class AudioMetaData: pass
25
  common.AudioMetaData = AudioMetaData
 
27
  sys.modules["torchaudio.backend"] = backend
28
  sys.modules["torchaudio.backend.common"] = common
29
 
 
30
  if not hasattr(torchaudio, "info"):
31
  def mock_info(filepath, **kwargs):
32
  from types import SimpleNamespace
33
  import wave
34
  try:
35
  with wave.open(filepath, "rb") as f:
36
+ return SimpleNamespace(sample_rate=f.getframerate(), num_frames=f.getnframes(), num_channels=f.getnchannels(), bits_per_sample=f.getsampwidth() * 8, encoding="PCM_S")
37
+ except: return SimpleNamespace(sample_rate=48000, num_frames=0, num_channels=1)
 
 
 
 
 
 
 
38
  torchaudio.info = mock_info
39
 
 
40
  try:
41
  _orig_load = torchaudio.load
42
  def patched_load(filepath, *args, **kwargs):
43
+ try: return _orig_load(filepath, *args, **kwargs)
 
44
  except ImportError as e:
45
  if "torchcodec" in str(e).lower():
46
  import soundfile as sf
 
52
  raise e
53
  torchaudio.load = patched_load
54
  print("βœ… Torchaudio patched")
55
+ except Exception as e: print(f"⚠️ Patch failed: {e}")
 
56
 
57
+ # πŸ“¦ 2. PRE-LOADING (v90 Optimization)
58
+ print("πŸ“¦ Pre-loading AI Engines into RAM...")
59
  import chatterbox_utils
60
  from faster_whisper import WhisperModel
61
  from TTS.api import TTS
 
63
  import deep_translator
64
  print("βœ… Imports Complete")
65
 
 
66
  try:
67
  import spaces
68
  print("βœ… ZeroGPU/Spaces detected")
 
73
  if f is None: return lambda x: x
74
  return f
75
 
76
+ # FORCE BUILD TRIGGER: 10:55:00 Jan 21 2026
77
+ # v90: Fixed Whisper CUDA 'Invalid Argument' crash. (Cleaner GPU Handoff)
78
 
79
  os.environ["COQUI_TOS_AGREED"] = "1"
 
 
80
  MODELS = {"stt": None, "translate": None, "tts": None, "denoiser": None}
81
 
82
  def activate_gpu_models(action):
83
+ """v90: Optimized GPU Activation with clean handoff"""
84
  global MODELS
85
 
86
  # 1. Faster-Whisper GPU Activation
87
  if action in ["stt", "s2st"]:
88
+ stt_on_gpu = False
89
+ try:
90
+ if MODELS["stt"] is not None and hasattr(MODELS["stt"], "model") and MODELS["stt"].model.device == "cuda":
91
+ stt_on_gpu = True
92
+ except: pass
93
+
94
+ if not stt_on_gpu:
95
  print(f"πŸŽ™οΈ Activating Whisper on GPU for {action}...")
96
+ # 🧹 CRITICAL: Clear old instance to avoid "Invalid Argument" CUDA errors
97
+ old_stt = MODELS.pop("stt", None)
98
+ if old_stt: del old_stt
99
+ gc.collect()
100
+ if torch.cuda.is_available(): torch.cuda.empty_cache()
101
+
102
+ # Re-init on GPU with safe parameters for ZeroGPU MIG
103
+ try:
104
+ MODELS["stt"] = WhisperModel(
105
+ "large-v3",
106
+ device="cuda",
107
+ device_index=0,
108
+ compute_type="int8_float16", # Better stability on H100/H200 MIG
109
+ cpu_threads=4,
110
+ num_workers=1
111
+ )
112
+ print("✨ Whisper Activated on GPU")
113
+ except Exception as e:
114
+ print(f"❌ Whisper GPU fail: {e}. Falling back to CPU in-session.")
115
+ MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
116
 
117
  # 2. XTTS-v2 GPU Activation
118
  if action in ["tts", "s2st"]:
 
123
  if "cuda" not in current_dev:
124
  print(f"πŸš€ Moving XTTS-v2 to GPU...")
125
  MODELS["tts"].to("cuda")
126
+ except: MODELS["tts"].to("cuda")
 
127
 
128
+ # 3. Helpers
129
  if MODELS["denoiser"] is None:
130
  try: MODELS["denoiser"] = init_df()
131
  except: pass
132
+ if MODELS["translate"] is None: MODELS["translate"] = "active"
 
 
133
  chatterbox_utils.load_chatterbox(device="cuda" if torch.cuda.is_available() else "cpu")
 
 
134
 
135
  def warmup_models():
136
+ """PRE-LOAD MODELS INTO SYSTEM RAM"""
137
+ print("\nπŸ”₯ --- SYSTEM STARTUP: RAM LOADING (v90) ---")
138
  start = time.time()
139
  try:
 
140
  MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
 
141
  MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
 
142
  try: MODELS["denoiser"] = init_df()
143
  except: pass
144
  chatterbox_utils.warmup_chatterbox()
145
  print(f"βœ… --- SYSTEM READY ({time.time()-start:.2f}s) --- \n")
146
+ except Exception as e: print(f"⚠️ Startup warning: {e}")
 
147
 
148
  def _stt_logic(request_dict):
149
  audio_bytes = base64.b64decode(request_dict.get("file"))
 
152
  f.write(audio_bytes); temp_path = f.name
153
  try:
154
  segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
155
+ return {"text": " ".join([s.text for s in segments]).strip()}
 
156
  finally:
157
  if os.path.exists(temp_path): os.unlink(temp_path)
158
 
 
162
 
163
  def _tts_logic(text, lang, speaker_wav_b64):
164
  if not text or not text.strip(): return {"error": "Input empty"}
165
+ XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
 
 
 
 
166
  clean_lang = lang.strip().lower().split('-')[0]
167
  mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
 
168
  if mapped_lang:
169
  speaker_wav_path = None
170
  if speaker_wav_b64:
 
176
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
177
  output_path = output_file.name
178
  MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=output_path, speaker_wav=speaker_wav_path)
179
+ with open(output_path, "rb") as f: return {"audio": base64.b64encode(f.read()).decode()}
 
180
  finally:
181
  if speaker_wav_path and "default_speaker" not in speaker_wav_path and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
182
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
 
183
  try:
184
  temp_ref = None
185
  if speaker_wav_b64:
 
195
  def core_process(request_dict):
196
  action = request_dict.get("action")
197
  t0 = time.time()
198
+ print(f"--- [v90] πŸš€ GPU SESSION START: {action} ---")
199
  activate_gpu_models(action)
200
  try:
201
  if action == "stt": res = _stt_logic(request_dict)
 
203
  elif action == "tts": res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
204
  elif action == "s2st":
205
  stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
206
+ translated = _translate_logic(stt_res.get("text", ""), request_dict.get("target_lang"))
 
 
207
  tts_res = _tts_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
208
+ res = {"text": stt_res.get("text"), "translated": translated, "audio": tts_res.get("audio")}
209
  elif action == "health": res = {"status": "awake"}
210
  else: res = {"error": f"Unknown action: {action}"}
211
  finally:
212
+ print(f"--- [v90] ✨ END: {action} ({time.time()-t0:.2f}s) ---")
213
  gc.collect()
214
  if torch.cuda.is_available(): torch.cuda.empty_cache()
215
  return res
216
 
217
  app = FastAPI()
 
218
  @app.post("/api/v1/process")
219
  async def api_process(request: Request):
220
+ try: return core_process(await request.json())
221
+ except Exception as e: traceback.print_exc(); return {"error": str(e)}
 
 
 
 
222
 
223
  @app.get("/health")
224
  def health(): return {"status": "ok", "gpu": torch.cuda.is_available(), "time": time.ctime()}
 
226
  @app.post("/api/v1/clear_cache")
227
  async def clear_cache():
228
  try:
229
+ gc.collect()
230
  if torch.cuda.is_available(): torch.cuda.empty_cache()
231
+ temp_dir = tempfile.gettempdir()
232
  for f in os.listdir(temp_dir):
233
  if f.endswith(".wav") or f.startswith("tm"):
234
+ try: os.unlink(os.path.join(temp_dir, f))
235
  except: pass
236
+ return {"status": "success"}
237
  except Exception as e: return {"status": "error", "message": str(e)}
238
 
239
  def gradio_fn(req_json):