TGPro1 committed on
Commit
066dbb8
Β·
verified Β·
1 Parent(s): 1b4bc84

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +89 -217
app.py CHANGED
@@ -11,7 +11,8 @@ import json
11
  import time
12
  import torchaudio
13
 
14
- # πŸ›‘οΈ ZeroGPU Support (v68)
 
15
  try:
16
  import spaces
17
  print("βœ… ZeroGPU/Spaces detected")
@@ -23,8 +24,6 @@ except ImportError:
23
 
24
 
25
  # πŸ› οΈ Monkeypatch torchaudio.backend (DeepFilterNet compatibility)
26
- # DeepFilterNet uses older torchaudio API structure (torchaudio.backend.common.AudioMetaData)
27
- # We mock it here before importing df
28
  import sys
29
  import types
30
  if "torchaudio.backend" not in sys.modules:
@@ -40,7 +39,6 @@ if "torchaudio.backend" not in sys.modules:
40
  sys.modules["torchaudio.backend.common"] = common
41
 
42
  # πŸ›‘οΈ Torchaudio Compatibility Fix (v60)
43
- # Ensure .info exists for DeepFilterNet
44
  if not hasattr(torchaudio, "info"):
45
  print("πŸ› οΈ Mocking torchaudio.info for compatibility...")
46
  def mock_info(filepath, **kwargs):
@@ -61,9 +59,9 @@ if not hasattr(torchaudio, "info"):
61
 
62
  from df.enhance import enhance, init_df, load_audio, save_audio
63
 
64
- # FORCE BUILD TRIGGER: 15:10:00 Jan 20 2026
65
 
66
- # πŸ› οΈ Monkeypatch torchaudio.load to bypass TorchCodec requirement
67
  try:
68
  _orig_load = torchaudio.load
69
  def patched_load(filepath, *args, **kwargs):
@@ -71,15 +69,12 @@ try:
71
  return _orig_load(filepath, *args, **kwargs)
72
  except ImportError as e:
73
  if "torchcodec" in str(e).lower():
74
- print(f"⚠️ Redirecting load for {filepath} via soundfile (TorchCodec bypass)")
75
  import soundfile as sf
76
  data, samplerate = sf.read(filepath)
77
- # Convert to torch tensor with correct shape (C, N)
78
  t = torch.from_numpy(data).float()
79
- if len(t.shape) == 1:
80
- t = t.unsqueeze(0)
81
- else:
82
- t = t.T
83
  return t, samplerate
84
  raise e
85
  torchaudio.load = patched_load
@@ -97,232 +92,100 @@ def load_models():
97
  if MODELS["stt"] is None:
98
  print("πŸŽ™οΈ Loading Faster-Whisper large-v3...")
99
  from faster_whisper import WhisperModel
100
-
101
- # 🦾 HYBRID HARDWARE SELECTION (v67)
102
  if torch.cuda.is_available():
103
- print(f"πŸš€ High-Performance GPU Detected: {torch.cuda.get_device_name(0)} (H200/A10G/T4)")
104
- print(f"πŸ’Ύ VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
105
  MODELS["stt"] = WhisperModel("large-v3", device="cuda", compute_type="float16")
106
  else:
107
- print("⚠️ WARNING: GPU NOT DETECTED. Falling back to CPU (int8 optimization).")
108
- # CPU fallback: int8 is necessary for decent speed on CPU
109
  MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
110
 
111
  if MODELS["translate"] is None:
112
- print("🌍 Loading Google Translate (deep-translator)...")
113
- MODELS["translate"] = "deep-translator-active"
114
 
115
  if MODELS["denoiser"] is None:
116
- print("🧹 Loading DeepFilterNet (Voice Cleaner)...")
117
  try:
118
- df_ret = init_df()
119
- if isinstance(df_ret, (list, tuple)) and len(df_ret) > 1:
120
- MODELS["denoiser"] = df_ret[0]
121
- else:
122
- MODELS["denoiser"] = df_ret
123
  print("✨ DeepFilterNet Loaded")
124
- except Exception as e:
125
- print(f"⚠️ Failed to load denoiser: {e}")
126
- try:
127
- MODELS["denoiser"] = init_df()
128
- except:
129
- pass
130
 
131
  if MODELS["tts"] is None:
132
- print("πŸ”Š Loading XTTS-v2 (STRICT GPU PREFERRED)...")
133
  from TTS.api import TTS
134
  try:
135
- MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
136
- print("✨ XTTS-v2 Loaded on GPU")
137
- except Exception as tts_e:
138
- print(f"⚠️ GPU TTS load failed: {tts_e}. Falling back to CPU...")
139
- try:
140
- MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
141
- print("✨ XTTS-v2 Loaded on CPU")
142
- except Exception as cpu_e:
143
- print(f"❌ FATAL: Could not load XTTS-v2 on any hardware: {cpu_e}")
144
- raise cpu_e
145
 
146
  @spaces.GPU
147
  def core_process(request_dict):
148
- """Internal logic used by both FastAPI and Gradio"""
149
  action = request_dict.get("action")
150
- print(f"--- πŸ› οΈ Processing Action: {action} (ZeroGPU Context) ---")
151
- start_time = time.time()
152
-
153
- if action == "health":
154
- return {"status": "ok", "gpu": torch.cuda.is_available(), "timestamp": time.time()}
155
-
156
- print(f"⏳ Loading models for {action}...")
157
  load_models()
158
- print(f"βœ… Models ready for {action} (Load time: {time.time() - start_time:.2f}s)")
159
 
160
  if action == "stt":
161
- audio_b64 = request_dict.get("file")
162
  lang = request_dict.get("lang")
163
- print(f"πŸŽ™οΈ STT: Decoding audio ({len(audio_b64) if audio_b64 else 0} bytes)...")
164
- audio_bytes = base64.b64decode(audio_b64)
165
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
166
  f.write(audio_bytes)
167
  temp_path = f.name
168
  try:
169
- print("πŸš€ Faster-Whisper Transcription Starting (Instant Mode)...")
170
- segments, info = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
171
- text = " ".join([segment.text for segment in segments]).strip()
172
- print(f"✨ Transcription Done: '{text[:50]}...'")
173
  return {"text": text}
174
  finally:
175
  if os.path.exists(temp_path): os.unlink(temp_path)
176
 
177
  elif action == "translate":
178
- text = request_dict.get("text")
179
- target_lang = request_dict.get("target_lang")
180
- print(f"🌍 Translate: '{text[:50]}...' to {target_lang}")
181
-
182
- g_lang_map = {
183
- "en": "en", "fr": "fr", "es": "es", "de": "de",
184
- "ar": "ar", "it": "it", "pt": "pt", "ru": "ru",
185
- "zh": "zh-cn", "ja": "ja", "ko": "ko", "hi": "hi"
186
- }
187
- g_target = g_lang_map.get(target_lang, "en")
188
-
189
  from deep_translator import GoogleTranslator
190
- result = GoogleTranslator(source='auto', target=g_target).translate(text)
191
-
192
- print(f"✨ Translation Done: '{result[:50]}...'")
193
- return {"translated": result}
194
 
195
  elif action == "tts":
196
  text = request_dict.get("text")
197
  lang = request_dict.get("lang")
198
- print(f"πŸ”Š TTS: '{text[:50]}...' in {lang}")
199
  speaker_wav_b64 = request_dict.get("speaker_wav")
200
  speaker_wav_path = None
201
  if speaker_wav_b64:
202
- speaker_bytes = base64.b64decode(speaker_wav_b64)
203
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
204
- f.write(speaker_bytes)
205
  speaker_wav_path = f.name
206
  else:
207
- print("⚠️ No speaker ref provided. Using generated default.")
208
- import wave, struct, math
209
- default_path = "default_speaker.wav"
210
- if not os.path.exists(default_path):
211
- try:
212
- with wave.open(default_path, "w") as wav_file:
213
- wav_file.setnchannels(1)
214
- wav_file.setsampwidth(2)
215
- wav_file.setframerate(24000)
216
- data = [struct.pack('<h', int(math.sin(x/100.0)*3000)) for x in range(24000)]
217
- wav_file.writeframes(b''.join(data))
218
- except: pass
219
- if os.path.exists(default_path):
220
- speaker_wav_path = default_path
221
 
222
  try:
223
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
224
  output_path = output_file.name
225
-
226
- if speaker_wav_path:
227
- MODELS["tts"].tts_to_file(text=text, language=lang, file_path=output_path, speaker_wav=speaker_wav_path)
228
-
229
- if MODELS["denoiser"]:
230
- try:
231
- noisy_audio, _ = load_audio(output_path, sr=48000)
232
- enhanced_audio = enhance(MODELS["denoiser"], noisy_audio, pad=True)
233
- save_audio(output_path, enhanced_audio, 48000)
234
- except: pass
235
-
236
  with open(output_path, "rb") as f:
237
  audio_b64 = base64.b64encode(f.read()).decode()
238
- print("✨ TTS Done")
239
  return {"audio": audio_b64}
240
  finally:
241
- if speaker_wav_path and os.path.exists(speaker_wav_path) and "default_speaker" not in speaker_wav_path:
242
- os.unlink(speaker_wav_path)
243
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
244
 
245
  elif action == "s2st":
246
- audio_b64 = request_dict.get("file")
247
- source_lang = request_dict.get("source_lang")
248
- target_lang = request_dict.get("target_lang")
249
- speaker_wav_b64 = request_dict.get("speaker_wav")
250
-
251
- audio_bytes = base64.b64decode(audio_b64)
252
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
253
- f.write(audio_bytes)
254
- temp_path = f.name
255
 
256
- speaker_wav_path = None
257
- if speaker_wav_b64:
258
- speaker_bytes = base64.b64decode(speaker_wav_b64)
259
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as sf:
260
- sf.write(speaker_bytes)
261
- speaker_wav_path = sf.name
262
- else:
263
- default_path = "default_speaker.wav"
264
- if os.path.exists(default_path):
265
- speaker_wav_path = default_path
266
 
267
- try:
268
- # Padding & Denoising
269
- try:
270
- waveform, sr = torchaudio.load(temp_path)
271
- if MODELS["denoiser"]:
272
- noisy_in, _ = load_audio(temp_path, sr=48000)
273
- clean_in = enhance(MODELS["denoiser"], noisy_in, pad=True)
274
- save_audio(temp_path, clean_in, 48000)
275
- waveform, sr = torchaudio.load(temp_path)
276
-
277
- silence = torch.zeros((waveform.shape[0], int(1.5 * sr)))
278
- padded = torch.cat([waveform, silence], dim=1)
279
- torchaudio.save(temp_path, padded, sr)
280
- except: pass
281
-
282
- # STT
283
- segments, info = MODELS["stt"].transcribe(temp_path, language=source_lang, beam_size=1)
284
- text = " ".join([segment.text for segment in segments]).strip()
285
- if text and not text.endswith(('.', '!', '?', '…')): text += "..."
286
- if not text: return {"error": "No speech detected"}
287
-
288
- # Translate
289
- g_lang_map = {"en": "en", "fr": "fr", "es": "es", "de": "de", "ar": "ar", "it": "it", "pt": "pt", "ru": "ru", "zh": "zh-cn", "ja": "ja", "ko": "ko", "hi": "hi"}
290
- g_target = g_lang_map.get(target_lang, "en")
291
- from deep_translator import GoogleTranslator
292
- translated_text = GoogleTranslator(source='auto', target=g_target).translate(text)
293
 
294
- # TTS
295
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
296
- output_path = output_file.name
297
-
298
- MODELS["tts"].tts_to_file(text=translated_text, language=target_lang, file_path=output_path, speaker_wav=speaker_wav_path)
299
-
300
- with open(output_path, "rb") as o:
301
- audio_out_b64 = base64.b64encode(o.read()).decode()
302
-
303
- return {"text": text, "translated": translated_text, "audio": audio_out_b64}
304
- finally:
305
- if os.path.exists(temp_path): os.unlink(temp_path)
306
- if speaker_wav_path and os.path.exists(speaker_wav_path) and "default_speaker" not in speaker_wav_path:
307
- os.unlink(speaker_wav_path)
308
- if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
309
-
310
  return {"error": f"Unknown action: {action}"}
311
 
312
- # --- FastAPI App ---
313
- app = FastAPI()
314
-
315
- @app.post("/api/v1/process")
316
- async def api_process(request: Request):
317
- try:
318
- data = await request.json()
319
- print(f"πŸ“₯ FastAPI Request: {data.get('action')}")
320
- result = core_process(data)
321
- return result
322
- except Exception as e:
323
- print(f"❌ API Global Error: {traceback.format_exc()}")
324
- return {"error": str(e)}
325
-
326
  def create_wav_header(sample_rate=24000, channels=1, bit_depth=16):
327
  header = bytearray(b'RIFF')
328
  header.extend((1000000000).to_bytes(4, 'little'))
@@ -338,65 +201,74 @@ def create_wav_header(sample_rate=24000, channels=1, bit_depth=16):
338
  header.extend((0xFFFFFFFF).to_bytes(4, 'little'))
339
  return header
340
 
341
- @app.post("/api/v1/tts_stream")
342
  @spaces.GPU
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
  async def api_tts_stream(request: Request):
 
344
  try:
345
- load_models()
346
  data = await request.json()
347
- text = data.get("text")
348
- lang = data.get("lang")
349
  speaker_wav_b64 = data.get("speaker_wav")
350
-
351
  speaker_wav_path = None
352
  if speaker_wav_b64:
353
- speaker_bytes = base64.b64decode(speaker_wav_b64)
354
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
355
- f.write(speaker_bytes)
356
  speaker_wav_path = f.name
357
  else:
358
  speaker_wav_path = "default_speaker.wav"
359
-
360
- def stream_generator():
361
- try:
362
- yield create_wav_header(sample_rate=24000)
363
- for chunk in MODELS["tts"].synthesizer.tts_model.inference_stream(
364
- text,
365
- lang,
366
- *MODELS["tts"].synthesizer.tts_model.get_conditioning_latents(audio_path=[speaker_wav_path]),
367
- stream_chunk_size=20
368
- ):
369
- yield (chunk * 32767).to(torch.int16).cpu().numpy().tobytes()
370
- except Exception as ge:
371
- print(f"❌ [Stream Error]: {ge}")
372
- finally:
373
- if speaker_wav_path and os.path.exists(speaker_wav_path) and "default_speaker" not in speaker_wav_path:
374
- os.unlink(speaker_wav_path)
375
-
376
- return StreamingResponse(stream_generator(), media_type="audio/wav")
377
  except Exception as e:
378
  return {"error": str(e)}
379
 
380
  @app.get("/health")
381
  def health():
382
- return {"status": "ok", "gpu": torch.cuda.is_available(), "timestamp": time.time()}
383
 
384
- # --- Gradio Interface ---
385
  def gradio_fn(req_json):
386
  try:
387
- data = json.loads(req_json)
388
- res = core_process(data)
389
- return json.dumps(res)
390
  except Exception as e:
391
  return json.dumps({"error": str(e)})
392
 
393
- demo = gr.Interface(
394
- fn=gradio_fn,
395
- inputs=gr.Textbox(label="JSON Request"),
396
- outputs=gr.Textbox(label="JSON Response"),
397
- title="πŸš€ Unified AI Engine (H200/XTTS-v2)"
398
- )
399
-
400
  app = gr.mount_gradio_app(app, demo, path="/")
401
 
402
  if __name__ == "__main__":
 
11
  import time
12
  import torchaudio
13
 
14
+ # πŸ›‘οΈ ZeroGPU Support (v69)
15
+ # CRITICAL: @spaces.GPU MUST only be used on synchronous functions (def, not async def)
16
  try:
17
  import spaces
18
  print("βœ… ZeroGPU/Spaces detected")
 
24
 
25
 
26
  # πŸ› οΈ Monkeypatch torchaudio.backend (DeepFilterNet compatibility)
 
 
27
  import sys
28
  import types
29
  if "torchaudio.backend" not in sys.modules:
 
39
  sys.modules["torchaudio.backend.common"] = common
40
 
41
  # πŸ›‘οΈ Torchaudio Compatibility Fix (v60)
 
42
  if not hasattr(torchaudio, "info"):
43
  print("πŸ› οΈ Mocking torchaudio.info for compatibility...")
44
  def mock_info(filepath, **kwargs):
 
59
 
60
  from df.enhance import enhance, init_df, load_audio, save_audio
61
 
62
+ # FORCE BUILD TRIGGER: 15:20:00 Jan 20 2026
63
 
64
+ # πŸ› οΈ Monkeypatch torchaudio.load
65
  try:
66
  _orig_load = torchaudio.load
67
  def patched_load(filepath, *args, **kwargs):
 
69
  return _orig_load(filepath, *args, **kwargs)
70
  except ImportError as e:
71
  if "torchcodec" in str(e).lower():
72
+ print(f"⚠️ Redirecting load for {filepath} via soundfile")
73
  import soundfile as sf
74
  data, samplerate = sf.read(filepath)
 
75
  t = torch.from_numpy(data).float()
76
+ if len(t.shape) == 1: t = t.unsqueeze(0)
77
+ else: t = t.T
 
 
78
  return t, samplerate
79
  raise e
80
  torchaudio.load = patched_load
 
92
  if MODELS["stt"] is None:
93
  print("πŸŽ™οΈ Loading Faster-Whisper large-v3...")
94
  from faster_whisper import WhisperModel
 
 
95
  if torch.cuda.is_available():
96
+ print(f"πŸš€ GPU Detected: {torch.cuda.get_device_name(0)}")
 
97
  MODELS["stt"] = WhisperModel("large-v3", device="cuda", compute_type="float16")
98
  else:
99
+ print("⚠️ Falling back to CPU (int8)")
 
100
  MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
101
 
102
  if MODELS["translate"] is None:
103
+ print("🌍 Loading Google Translate...")
104
+ MODELS["translate"] = "active"
105
 
106
  if MODELS["denoiser"] is None:
107
+ print("🧹 Loading DeepFilterNet...")
108
  try:
109
+ MODELS["denoiser"] = init_df()
 
 
 
 
110
  print("✨ DeepFilterNet Loaded")
111
+ except: pass
 
 
 
 
 
112
 
113
  if MODELS["tts"] is None:
114
+ print("πŸ”Š Loading XTTS-v2...")
115
  from TTS.api import TTS
116
  try:
117
+ MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=torch.cuda.is_available())
118
+ print(f"✨ XTTS-v2 Loaded (GPU={torch.cuda.is_available()})")
119
+ except Exception as e:
120
+ print(f"❌ Failed to load XTTS: {e}")
121
+ raise e
 
 
 
 
 
122
 
123
@spaces.GPU
def core_process(request_dict):
    """Synchronous inference logic with GPU decorator.

    Dispatches on ``request_dict["action"]``:
      - "stt":       {"file": b64 wav, "lang": str}            -> {"text": str}
      - "translate": {"text": str, "target_lang": str}         -> {"translated": str}
      - "tts":       {"text": str, "lang": str, "speaker_wav"?} -> {"audio": b64 wav}
      - "s2st":      chains stt -> translate -> tts
    Returns {"error": ...} for unknown actions or empty transcriptions.
    """
    action = request_dict.get("action")
    print(f"--- 🛠️ Processing Action: {action} (GPU Context) ---")
    load_models()

    if action == "stt":
        audio_bytes = base64.b64decode(request_dict.get("file"))
        lang = request_dict.get("lang")
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(audio_bytes)
            temp_path = f.name
        try:
            # beam_size=1: greedy decode for lowest latency.
            segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
            text = " ".join(s.text for s in segments).strip()
            return {"text": text}
        finally:
            if os.path.exists(temp_path):
                os.unlink(temp_path)

    elif action == "translate":
        from deep_translator import GoogleTranslator
        text = request_dict.get("text")
        target_lang = request_dict.get("target_lang", "en")
        # FIX: deep-translator's GoogleTranslator needs Google language codes
        # ("zh-cn", not "zh"); passing the raw short code raised for Chinese.
        # Normalize known codes, pass unknown ones through unchanged.
        g_lang_map = {
            "en": "en", "fr": "fr", "es": "es", "de": "de",
            "ar": "ar", "it": "it", "pt": "pt", "ru": "ru",
            "zh": "zh-cn", "ja": "ja", "ko": "ko", "hi": "hi",
        }
        g_target = g_lang_map.get(target_lang, target_lang or "en")
        translated = GoogleTranslator(source='auto', target=g_target).translate(text)
        return {"translated": translated}

    elif action == "tts":
        text = request_dict.get("text")
        lang = request_dict.get("lang")
        speaker_wav_b64 = request_dict.get("speaker_wav")
        speaker_wav_path = None
        if speaker_wav_b64:
            sb = base64.b64decode(speaker_wav_b64)
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(sb)
                speaker_wav_path = f.name
        else:
            # Shared fallback reference voice; never deleted below.
            speaker_wav_path = "default_speaker.wav"

        try:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
                output_path = output_file.name
            MODELS["tts"].tts_to_file(text=text, language=lang, file_path=output_path,
                                      speaker_wav=speaker_wav_path)
            with open(output_path, "rb") as f:
                audio_b64 = base64.b64encode(f.read()).decode()
            return {"audio": audio_b64}
        finally:
            # Clean up the per-request speaker file only, never the default voice.
            if speaker_wav_path and "default_speaker" not in speaker_wav_path:
                if os.path.exists(speaker_wav_path):
                    os.unlink(speaker_wav_path)
            if 'output_path' in locals() and os.path.exists(output_path):
                os.unlink(output_path)

    elif action == "s2st":
        # Full S2ST flow: reuse the other actions via recursive dispatch.
        stt_res = core_process({"action": "stt",
                                "file": request_dict.get("file"),
                                "lang": request_dict.get("source_lang")})
        text = stt_res.get("text", "")
        if not text:
            return {"error": "No speech detected"}

        tr_res = core_process({"action": "translate",
                               "text": text,
                               "target_lang": request_dict.get("target_lang")})
        translated = tr_res.get("translated", "")

        tts_res = core_process({"action": "tts",
                                "text": translated,
                                "lang": request_dict.get("target_lang"),
                                "speaker_wav": request_dict.get("speaker_wav")})
        return {"text": text, "translated": translated, "audio": tts_res.get("audio")}

    return {"error": f"Unknown action: {action}"}
188
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  def create_wav_header(sample_rate=24000, channels=1, bit_depth=16):
190
  header = bytearray(b'RIFF')
191
  header.extend((1000000000).to_bytes(4, 'little'))
 
201
  header.extend((0xFFFFFFFF).to_bytes(4, 'little'))
202
  return header
203
 
204
# 🚀 Sync Generator for ZeroGPU
@spaces.GPU
def gpu_tts_generator(text, lang, speaker_wav_path):
    """Yield a WAV header, then raw 16-bit PCM chunks from XTTS streaming synthesis."""
    load_models()
    try:
        yield create_wav_header(sample_rate=24000)
        tts_model = MODELS["tts"].synthesizer.tts_model
        # inference_stream is a generator of float tensors in [-1, 1]
        latents = tts_model.get_conditioning_latents(audio_path=[speaker_wav_path])
        for chunk in tts_model.inference_stream(text, lang, *latents, stream_chunk_size=20):
            pcm = (chunk * 32767).to(torch.int16)
            yield pcm.cpu().numpy().tobytes()
        print("✨ [Generator Complete]")
    except Exception as e:
        print(f"❌ [Generator Error]: {e}")
    finally:
        # Delete the per-request speaker reference; keep the shared default voice.
        if speaker_wav_path and "default_speaker" not in speaker_wav_path:
            if os.path.exists(speaker_wav_path):
                os.unlink(speaker_wav_path)
224
+
225
# --- FastAPI Entry Points ---
app = FastAPI()

@app.post("/api/v1/process")
async def api_process(request: Request):
    """Async endpoint that delegates to the synchronous @spaces.GPU worker."""
    try:
        payload = await request.json()
        return core_process(payload)
    except Exception as e:
        return {"error": str(e)}
237
+
238
@app.post("/api/v1/tts_stream")
async def api_tts_stream(request: Request):
    """Async entry point that streams WAV audio from the sync GPU generator."""
    try:
        data = await request.json()
        b64_ref = data.get("speaker_wav")
        if b64_ref:
            # Persist the caller-supplied reference voice to a temp wav;
            # the generator is responsible for deleting it when done.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(base64.b64decode(b64_ref))
            speaker_wav_path = f.name
        else:
            speaker_wav_path = "default_speaker.wav"
        return StreamingResponse(
            gpu_tts_generator(data.get("text"), data.get("lang"), speaker_wav_path),
            media_type="audio/wav",
        )
    except Exception as e:
        return {"error": str(e)}
259
 
260
@app.get("/health")
def health():
    """Liveness probe; also reports whether CUDA is currently visible."""
    gpu_available = torch.cuda.is_available()
    return {"status": "ok", "gpu": gpu_available}
263
 
264
# --- Gradio UI ---
def gradio_fn(req_json):
    """Parse a JSON request string, run core_process, return a JSON response string."""
    try:
        request_data = json.loads(req_json)
        response = core_process(request_data)
        return json.dumps(response)
    except Exception as e:
        return json.dumps({"error": str(e)})

demo = gr.Interface(fn=gradio_fn, inputs="text", outputs="text", title="🚀 AI Engine")

app = gr.mount_gradio_app(app, demo, path="/")
273
 
274
  if __name__ == "__main__":