TGPro1 committed on
Commit
183c33d
·
verified ·
1 Parent(s): f97199c

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +93 -144
app.py CHANGED
@@ -13,8 +13,7 @@ import torchaudio
13
  import chatterbox_utils
14
  import gc
15
 
16
- # 🛡️ ZeroGPU Support (v69)
17
- # CRITICAL: @spaces.GPU MUST only be used on synchronous functions (def, not async def)
18
  try:
19
  import spaces
20
  print("✅ ZeroGPU/Spaces detected")
@@ -40,7 +39,7 @@ if "torchaudio.backend" not in sys.modules:
40
  sys.modules["torchaudio.backend"] = backend
41
  sys.modules["torchaudio.backend.common"] = common
42
 
43
- # 🛡️ Torchaudio Compatibility Fix (v60)
44
  if not hasattr(torchaudio, "info"):
45
  print("🛠️ Mocking torchaudio.info for compatibility...")
46
  def mock_info(filepath, **kwargs):
@@ -61,8 +60,8 @@ if not hasattr(torchaudio, "info"):
61
 
62
  from df.enhance import enhance, init_df, load_audio, save_audio
63
 
64
- # FORCE BUILD TRIGGER: 10:00:00 Jan 21 2026
65
- # v85: ZeroGPU Warmup & Pre-caching (Prevents session timeouts)
66
 
67
  # 🛠️ Monkeypatch torchaudio.load
68
  try:
@@ -87,88 +86,90 @@ except Exception as e:
87
 
88
  os.environ["COQUI_TOS_AGREED"] = "1"
89
 
90
- # Global models
91
- MODELS = {"stt": None, "translate": None, "tts": None, "tokenizer": None, "denoiser": None}
92
 
93
  def load_models():
 
94
  global MODELS
 
 
95
  if MODELS["stt"] is None:
96
- print("🎙️ Loading Faster-Whisper large-v3 into Engine...")
97
  from faster_whisper import WhisperModel
98
- if torch.cuda.is_available():
99
- print(f"🚀 GPU Detected: {torch.cuda.get_device_name(0)}")
100
- MODELS["stt"] = WhisperModel("large-v3", device="cuda", compute_type="float16")
101
- else:
102
- print("⚠️ Falling back to CPU (int8)")
103
- MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
104
 
105
- # 🧹 Proactive Memory Cleanup
106
- gc.collect()
107
- if torch.cuda.is_available():
108
- torch.cuda.empty_cache()
 
109
 
110
- # Initialize Chatterbox ONNX (High-Speed Fallback)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  chatterbox_utils.load_chatterbox(device="cuda" if torch.cuda.is_available() else "cpu")
112
 
 
113
  if MODELS["translate"] is None:
114
- print("🌍 Loading Google Translate...")
115
  MODELS["translate"] = "active"
116
 
117
- if MODELS["denoiser"] is None:
118
- print("🧹 Loading DeepFilterNet...")
119
- try:
120
- MODELS["denoiser"] = init_df()
121
- print("✨ DeepFilterNet Loaded")
122
- except: pass
123
-
124
- if MODELS["tts"] is None:
125
- print("🔊 Loading XTTS-v2 into Engine...")
126
- from TTS.api import TTS
127
- try:
128
- MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=torch.cuda.is_available())
129
- print(f"✨ XTTS-v2 Loaded (GPU={torch.cuda.is_available()})")
130
- except Exception as e:
131
- print(f"❌ Failed to load XTTS: {e}")
132
- raise e
133
 
134
  def warmup_models():
135
- """Download and cache all models on CPU at startup (Prevents GPU timeouts)"""
136
- print("\n🔥 --- SYSTEM WARMUP STARTING (CPU) ---")
137
  start = time.time()
138
  try:
139
- # 1. Warmup Whisper
140
- print("📥 Initializing Whisper large-v3 cache...")
141
  from faster_whisper import WhisperModel
142
- # Use simple init to trigger download
143
- _ = WhisperModel("large-v3", device="cpu", compute_type="int8")
144
 
145
- # 2. Warmup XTTS-v2 (This takes the longest)
146
- print("📥 Initializing XTTS-v2 cache...")
147
  from TTS.api import TTS
148
- # Initialize once on CPU just to force download
149
- _ = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
150
 
151
- # 3. Warmup DeepFilterNet
152
- print("📥 Initializing DeepFilterNet cache...")
153
- try: init_df()
154
  except: pass
155
 
156
- # 4. Warmup Chatterbox
157
  chatterbox_utils.warmup_chatterbox()
158
 
159
- print(f"✅ --- SYSTEM WARMUP COMPLETE (Time: {time.time()-start:.2f}s) --- \n")
160
  except Exception as e:
161
  print(f"⚠️ Warmup warning: {e}")
162
 
163
  def _stt_logic(request_dict):
164
- """STT Logic (Runs on GPU when called via core_process)"""
165
  audio_bytes = base64.b64decode(request_dict.get("file"))
166
  lang = request_dict.get("lang")
167
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
168
  f.write(audio_bytes)
169
  temp_path = f.name
170
  try:
171
- # Transcribe (Uses GPU if device="cuda" in MODELS)
172
  segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
173
  text = " ".join([s.text for s in segments]).strip()
174
  return {"text": text}
@@ -176,100 +177,68 @@ def _stt_logic(request_dict):
176
  if os.path.exists(temp_path): os.unlink(temp_path)
177
 
178
  def _translate_logic(text, target_lang):
179
- """Translation (CPU/Network)"""
180
  from deep_translator import GoogleTranslator
181
- translated = GoogleTranslator(source='auto', target=target_lang).translate(text)
182
- return translated
183
 
184
  def _tts_logic(text, lang, speaker_wav_b64):
185
- """TTS Logic (Runs on GPU when called via core_process)"""
186
- if not text or not text.strip():
187
- return {"error": "TTS Error: Input text is empty"}
188
 
189
  XTTS_MAP = {
190
- "en": "en", "en-us": "en", "en-gb": "en",
191
- "de": "de", "de-de": "de",
192
- "fr": "fr", "fr-fr": "fr",
193
- "es": "es", "es-es": "es",
194
- "it": "it", "it-it": "it",
195
- "pl": "pl", "pl-pl": "pl",
196
- "pt": "pt", "pt-pt": "pt", "pt-br": "pt",
197
- "tr": "tr", "tr-tr": "tr",
198
- "ru": "ru", "ru-ru": "ru",
199
- "nl": "nl", "nl-nl": "nl",
200
- "cs": "cs", "cs-cz": "cs",
201
- "ar": "ar", "ar-sa": "ar", "ar-eg": "ar",
202
- "hu": "hu", "hu-hu": "hu",
203
- "ko": "ko", "ko-kr": "ko",
204
- "hi": "hi", "hi-in": "hi",
205
- "zh": "zh-cn", "zh-cn": "zh-cn", "zh-tw": "zh-cn"
206
  }
207
 
208
- XTTS_LANG_CODES = set(XTTS_MAP.values())
209
- mapped_lang = None
210
- if lang:
211
- lang_key = lang.strip().lower()
212
- mapped_lang = XTTS_MAP.get(lang_key) or XTTS_MAP.get(lang_key.split('-')[0])
213
 
214
- print(f"[v85] TTS Request - Original: {lang}, Mapped: {mapped_lang}")
215
 
216
- if mapped_lang and mapped_lang in XTTS_LANG_CODES:
217
- print(f"[v85] Using XTTS-v2 for '{mapped_lang}'")
218
  speaker_wav_path = None
219
  if speaker_wav_b64:
220
  sb = base64.b64decode(speaker_wav_b64)
221
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
222
- f.write(sb)
223
- speaker_wav_path = f.name
224
- else:
225
- speaker_wav_path = "default_speaker.wav"
226
 
227
  try:
228
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
229
  output_path = output_file.name
230
-
231
  MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=output_path, speaker_wav=speaker_wav_path)
232
-
233
- with open(output_path, "rb") as f:
234
- audio_b64 = base64.b64encode(f.read()).decode()
235
  return {"audio": audio_b64}
236
  finally:
237
  if speaker_wav_path and "default_speaker" not in speaker_wav_path:
238
  if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
239
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
240
 
241
- print(f"[v85] Using Chatterbox ONNX Fallback for '{lang}'")
 
242
  try:
243
  temp_ref = None
244
  if speaker_wav_b64:
245
  sb = base64.b64decode(speaker_wav_b64)
246
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
247
  f.write(sb); temp_ref = f.name
248
-
249
- chatter_lang = lang.strip().lower().split('-')[0]
250
- audio_bytes = chatterbox_utils.run_chatterbox_inference(text, chatter_lang, speaker_wav_path=temp_ref)
251
  if temp_ref and os.path.exists(temp_ref): os.unlink(temp_ref)
252
- audio_b64 = base64.b64encode(audio_bytes).decode()
253
- return {"audio": audio_b64}
254
  except Exception as e:
255
- print(f" Chatterbox Fallback failed: {e}")
256
- return {"error": f"TTS Failure: '{lang}' not supported by XTTS or Chatterbox."}
257
 
258
  @spaces.GPU
259
  def core_process(request_dict):
260
- """Unified GPU Entry Point (v85)"""
261
  action = request_dict.get("action")
262
  t0 = time.time()
263
- print(f"--- [v85] 🚀 GPU SESSION START: {action} at {time.ctime()} ---")
264
  load_models()
265
-
266
  try:
267
- if action == "stt":
268
- res = _stt_logic(request_dict)
269
- elif action == "translate":
270
- res = {"translated": _translate_logic(request_dict.get("text"), request_dict.get("target_lang", "en"))}
271
- elif action == "tts":
272
- res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
273
  elif action == "s2st":
274
  stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
275
  text = stt_res.get("text", "")
@@ -277,19 +246,15 @@ def core_process(request_dict):
277
  translated = _translate_logic(text, request_dict.get("target_lang"))
278
  tts_res = _tts_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
279
  res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
280
- elif action == "health":
281
- res = {"status": "awake", "time": time.ctime()}
282
- else:
283
- res = {"error": f"Unknown action: {action}"}
284
  finally:
285
- print(f"--- [v85] ✨ SESSION END: {action} (Total: {time.time()-t0:.2f}s) ---")
286
  gc.collect()
287
- if torch.cuda.is_available():
288
- torch.cuda.empty_cache()
289
  return res
290
 
291
  def create_wav_header(sample_rate=24000, channels=1, bit_depth=16):
292
- """Returns a standard WAV header as standard BYTES"""
293
  header = bytearray(b'RIFF')
294
  header.extend((1000000000).to_bytes(4, 'little'))
295
  header.extend(b'WAVEfmt ')
@@ -310,21 +275,14 @@ def gpu_tts_generator(text, lang, speaker_wav_path):
310
  try:
311
  yield bytes(create_wav_header(sample_rate=24000))
312
  for chunk in MODELS["tts"].synthesizer.tts_model.inference_stream(
313
- text,
314
- lang,
315
- *MODELS["tts"].synthesizer.tts_model.get_conditioning_latents(audio_path=[speaker_wav_path]),
316
  stream_chunk_size=20
317
  ):
318
  yield bytes((chunk * 32767).to(torch.int16).cpu().numpy().tobytes())
319
- print("✨ [Generator Complete]")
320
- except Exception as e:
321
- print(f"❌ [Generator Error]: {e}")
322
  finally:
323
- if speaker_wav_path and "default_speaker" not in speaker_wav_path:
324
- if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
325
  gc.collect()
326
- if torch.cuda.is_available():
327
- torch.cuda.empty_cache()
328
 
329
  app = FastAPI()
330
 
@@ -332,8 +290,7 @@ app = FastAPI()
332
  async def api_process(request: Request):
333
  try:
334
  data = await request.json()
335
- result = core_process(data)
336
- return result
337
  except Exception as e:
338
  traceback.print_exc()
339
  return {"error": str(e)}
@@ -343,37 +300,29 @@ async def api_tts_stream(request: Request):
343
  try:
344
  data = await request.json()
345
  speaker_wav_b64 = data.get("speaker_wav")
346
- speaker_wav_path = None
347
  if speaker_wav_b64:
348
  sb = base64.b64decode(speaker_wav_b64)
349
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
350
- f.write(sb)
351
- speaker_wav_path = f.name
352
- else:
353
- speaker_wav_path = "default_speaker.wav"
354
  return StreamingResponse(gpu_tts_generator(data.get("text"), data.get("lang"), speaker_wav_path), media_type="audio/wav")
355
- except Exception as e:
356
- return {"error": str(e)}
357
 
358
  @app.get("/health")
359
- def health():
360
- return {"status": "ok", "gpu": torch.cuda.is_available(), "time": time.ctime()}
361
 
362
  @app.post("/api/v1/clear_cache")
363
  async def clear_cache():
364
  try:
365
- t0 = time.time()
366
- gc.collect()
367
  if torch.cuda.is_available(): torch.cuda.empty_cache()
368
- temp_dir = tempfile.gettempdir()
369
- count = 0
370
  for f in os.listdir(temp_dir):
371
  if f.endswith(".wav") or f.startswith("tm"):
372
  try: os.unlink(os.path.join(temp_dir, f)); count += 1
373
  except: pass
374
- return {"status": "success", "cleaned_files": count, "duration": f"{time.time()-t0:.2f}s"}
375
- except Exception as e:
376
- return {"status": "error", "message": str(e)}
377
 
378
  def gradio_fn(req_json):
379
  try: return json.dumps(core_process(json.loads(req_json)))
 
13
  import chatterbox_utils
14
  import gc
15
 
16
+ # 🛡️ ZeroGPU Support
 
17
  try:
18
  import spaces
19
  print("✅ ZeroGPU/Spaces detected")
 
39
  sys.modules["torchaudio.backend"] = backend
40
  sys.modules["torchaudio.backend.common"] = common
41
 
42
+ # 🛡️ Torchaudio Compatibility Fix
43
  if not hasattr(torchaudio, "info"):
44
  print("🛠️ Mocking torchaudio.info for compatibility...")
45
  def mock_info(filepath, **kwargs):
 
60
 
61
  from df.enhance import enhance, init_df, load_audio, save_audio
62
 
63
+ # FORCE BUILD TRIGGER: 10:10:00 Jan 21 2026
64
+ # v86: Pre-load to CPU RAM + Fast Transfer to GPU (Prevents ZeroGPU timeouts)
65
 
66
  # 🛠️ Monkeypatch torchaudio.load
67
  try:
 
86
 
87
  os.environ["COQUI_TOS_AGREED"] = "1"
88
 
89
+ # Global models (Resident in RAM)
90
+ MODELS = {"stt": None, "translate": None, "tts": None, "denoiser": None}
91
 
92
def load_models():
    """Fast GPU activation: promote the warmup-loaded (CPU) models to the GPU (v86).

    Called at the start of every @spaces.GPU session (see core_process), so it
    must be cheap when everything is already on the right device.
    """
    global MODELS

    target_dev = "cuda" if torch.cuda.is_available() else "cpu"

    # 1. Faster-Whisper. CTranslate2 models are bound to the device they were
    # built for, so a device change requires re-initialization (the disk cache
    # is hot after warmup_models(), so this is fast). FIX: the old guard only
    # checked `is None`, so a warmup-loaded CPU model was never rebuilt for CUDA.
    # Device is tracked in MODELS["stt_device"] (warmup loads on CPU, the default).
    if MODELS["stt"] is None or MODELS.get("stt_device", "cpu") != target_dev:
        print("🎙️ Initializing Faster-Whisper...")
        from faster_whisper import WhisperModel
        ct = "float16" if target_dev == "cuda" else "int8"
        MODELS["stt"] = WhisperModel("large-v3", device=target_dev, compute_type=ct)
        MODELS["stt_device"] = target_dev

    # 2. XTTS-v2: load once on CPU, then transfer with an efficient .to("cuda").
    if MODELS["tts"] is None:
        print("🔊 Loading XTTS-v2 into Engine (CPU Base)...")
        from TTS.api import TTS
        MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)

    if torch.cuda.is_available():
        # Check the underlying model's current device to avoid redundant moves.
        try:
            current_dev = str(next(MODELS["tts"].synthesizer.tts_model.parameters()).device)
            if "cuda" not in current_dev:
                print("🚀 Moving XTTS-v2 to GPU...")
                MODELS["tts"].to("cuda")
        except Exception:
            # Synthesizer internals differ across TTS versions; move unconditionally.
            MODELS["tts"].to("cuda")

    # 3. DeepFilterNet denoiser (best-effort; the app works without it).
    if MODELS["denoiser"] is None:
        try:
            MODELS["denoiser"] = init_df()
        except Exception as e:
            print(f"⚠️ DeepFilterNet unavailable: {e}")

    # 4. Chatterbox ONNX (high-speed TTS fallback).
    chatterbox_utils.load_chatterbox(device=target_dev)

    # 5. Translation is a stateless network service; just mark it active.
    if MODELS["translate"] is None:
        MODELS["translate"] = "active"

    # 🧹 Proactive memory cleanup before inference starts.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
def warmup_models():
    """Pre-load every model into system RAM (CPU) at application startup.

    Keeping weights resident in RAM means the later @spaces.GPU session only
    has to transfer/re-initialize them, preventing ZeroGPU session timeouts.
    Failures are logged but never fatal: the app may still start degraded.
    """
    print("\n🔥 --- SYSTEM WARMUP: RESIDENT RAM LOADING (v86) ---")
    start = time.time()
    try:
        # Load Whisper into RAM (CPU/int8 build; re-done for GPU later if needed).
        print("📥 Pre-loading Whisper to RAM...")  # FIX: was mojibake "���"
        from faster_whisper import WhisperModel
        MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")

        # Load XTTS into RAM (the heaviest download/parse step).
        print("📥 Pre-loading XTTS-v2 to RAM...")
        from TTS.api import TTS
        MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)

        # Load the denoiser (optional; best-effort, but log the failure).
        print("📥 Pre-loading Denoiser...")
        try:
            MODELS["denoiser"] = init_df()
        except Exception as e:
            print(f"⚠️ Denoiser warmup failed: {e}")

        # Pre-download the Chatterbox ONNX weights.
        chatterbox_utils.warmup_chatterbox()

        print(f"✅ --- WARMUP COMPLETE: All models resident in RAM ({time.time()-start:.2f}s) --- \n")
    except Exception as e:
        print(f"⚠️ Warmup warning: {e}")
165
 
166
  def _stt_logic(request_dict):
 
167
  audio_bytes = base64.b64decode(request_dict.get("file"))
168
  lang = request_dict.get("lang")
169
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
170
  f.write(audio_bytes)
171
  temp_path = f.name
172
  try:
 
173
  segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
174
  text = " ".join([s.text for s in segments]).strip()
175
  return {"text": text}
 
177
  if os.path.exists(temp_path): os.unlink(temp_path)
178
 
179
def _translate_logic(text, target_lang):
    """Translate *text* into *target_lang* via Google Translate (source auto-detected)."""
    from deep_translator import GoogleTranslator
    translator = GoogleTranslator(source='auto', target=target_lang)
    return translator.translate(text)
 
182
 
183
  def _tts_logic(text, lang, speaker_wav_b64):
184
+ if not text or not text.strip(): return {"error": "Input empty"}
 
 
185
 
186
  XTTS_MAP = {
187
+ "en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl",
188
+ "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar",
189
+ "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  }
191
 
192
+ clean_lang = lang.strip().lower().split('-')[0]
193
+ mapped_lang = XTTS_MAP.get(clean_lang)
194
+ if clean_lang == "zh": mapped_lang = "zh-cn"
 
 
195
 
196
+ print(f"[v86] TTS: {lang} -> {mapped_lang}")
197
 
198
+ if mapped_lang:
199
+ print(f"[v86] GPU Mode: XTTS-v2")
200
  speaker_wav_path = None
201
  if speaker_wav_b64:
202
  sb = base64.b64decode(speaker_wav_b64)
203
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
204
+ f.write(sb); speaker_wav_path = f.name
205
+ else: speaker_wav_path = "default_speaker.wav"
 
 
206
 
207
  try:
208
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
209
  output_path = output_file.name
 
210
  MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=output_path, speaker_wav=speaker_wav_path)
211
+ with open(output_path, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode()
 
 
212
  return {"audio": audio_b64}
213
  finally:
214
  if speaker_wav_path and "default_speaker" not in speaker_wav_path:
215
  if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
216
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
217
 
218
+ # Fallback to Chatterbox
219
+ print(f"[v86] Fallback Mode: Chatterbox ONNX")
220
  try:
221
  temp_ref = None
222
  if speaker_wav_b64:
223
  sb = base64.b64decode(speaker_wav_b64)
224
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
225
  f.write(sb); temp_ref = f.name
226
+ audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang, speaker_wav_path=temp_ref)
 
 
227
  if temp_ref and os.path.exists(temp_ref): os.unlink(temp_ref)
228
+ return {"audio": base64.b64encode(audio_bytes).decode()}
 
229
  except Exception as e:
230
+ return {"error": f"TTS Failure: {str(e)}"}
 
231
 
232
  @spaces.GPU
233
  def core_process(request_dict):
 
234
  action = request_dict.get("action")
235
  t0 = time.time()
236
+ print(f"--- [v86] 🚀 GPU SESSION START: {action} ---")
237
  load_models()
 
238
  try:
239
+ if action == "stt": res = _stt_logic(request_dict)
240
+ elif action == "translate": res = {"translated": _translate_logic(request_dict.get("text"), request_dict.get("target_lang", "en"))}
241
+ elif action == "tts": res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
 
 
 
242
  elif action == "s2st":
243
  stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
244
  text = stt_res.get("text", "")
 
246
  translated = _translate_logic(text, request_dict.get("target_lang"))
247
  tts_res = _tts_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
248
  res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
249
+ elif action == "health": res = {"status": "awake"}
250
+ else: res = {"error": f"Unknown action: {action}"}
 
 
251
  finally:
252
+ print(f"--- [v86] ✨ SESSION END: {action} ({time.time()-t0:.2f}s) ---")
253
  gc.collect()
254
+ if torch.cuda.is_available(): torch.cuda.empty_cache()
 
255
  return res
256
 
257
  def create_wav_header(sample_rate=24000, channels=1, bit_depth=16):
 
258
  header = bytearray(b'RIFF')
259
  header.extend((1000000000).to_bytes(4, 'little'))
260
  header.extend(b'WAVEfmt ')
 
275
  try:
276
  yield bytes(create_wav_header(sample_rate=24000))
277
  for chunk in MODELS["tts"].synthesizer.tts_model.inference_stream(
278
+ text, lang, *MODELS["tts"].synthesizer.tts_model.get_conditioning_latents(audio_path=[speaker_wav_path]),
 
 
279
  stream_chunk_size=20
280
  ):
281
  yield bytes((chunk * 32767).to(torch.int16).cpu().numpy().tobytes())
 
 
 
282
  finally:
283
+ if speaker_wav_path and "default_speaker" not in speaker_wav_path and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
 
284
  gc.collect()
285
+ if torch.cuda.is_available(): torch.cuda.empty_cache()
 
286
 
287
  app = FastAPI()
288
 
 
290
async def api_process(request: Request):
    """JSON entry point: parse the request body and hand it to the unified GPU pipeline."""
    try:
        payload = await request.json()
        return core_process(payload)
    except Exception as e:
        # Surface the failure to the client as JSON instead of a bare 500.
        traceback.print_exc()
        return {"error": str(e)}
 
300
  try:
301
  data = await request.json()
302
  speaker_wav_b64 = data.get("speaker_wav")
 
303
  if speaker_wav_b64:
304
  sb = base64.b64decode(speaker_wav_b64)
305
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
306
+ f.write(sb); speaker_wav_path = f.name
307
+ else: speaker_wav_path = "default_speaker.wav"
 
 
308
  return StreamingResponse(gpu_tts_generator(data.get("text"), data.get("lang"), speaker_wav_path), media_type="audio/wav")
309
+ except Exception as e: return {"error": str(e)}
 
310
 
311
  @app.get("/health")
312
def health():
    """Liveness probe: reports service status, GPU availability, and server time."""
    return {
        "status": "ok",
        "gpu": torch.cuda.is_available(),
        "time": time.ctime(),
    }
 
313
 
314
  @app.post("/api/v1/clear_cache")
315
  async def clear_cache():
316
  try:
317
+ t0 = time.time(); gc.collect()
 
318
  if torch.cuda.is_available(): torch.cuda.empty_cache()
319
+ temp_dir = tempfile.gettempdir(); count = 0
 
320
  for f in os.listdir(temp_dir):
321
  if f.endswith(".wav") or f.startswith("tm"):
322
  try: os.unlink(os.path.join(temp_dir, f)); count += 1
323
  except: pass
324
+ return {"status": "success", "cleaned_files": count}
325
+ except Exception as e: return {"status": "error", "message": str(e)}
 
326
 
327
  def gradio_fn(req_json):
328
  try: return json.dumps(core_process(json.loads(req_json)))