TGPro1 committed on
Commit
4258912
·
verified ·
1 Parent(s): 060b891

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +22 -22
app.py CHANGED
@@ -59,8 +59,8 @@ if not hasattr(torchaudio, "info"):
59
 
60
  from df.enhance import enhance, init_df, load_audio, save_audio
61
 
62
- # FORCE BUILD TRIGGER: 07:15:00 Jan 21 2026
63
- # v76: CPU-STT (Instant) + GPU-TTS (High Quality)
64
 
65
  # πŸ› οΈ Monkeypatch torchaudio.load
66
  try:
@@ -122,14 +122,14 @@ def load_models():
122
  raise e
123
 
124
  def _stt_logic(request_dict):
125
- """STT runs on CPU for instant start (no GPU queue wait)"""
126
  audio_bytes = base64.b64decode(request_dict.get("file"))
127
  lang = request_dict.get("lang")
128
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
129
  f.write(audio_bytes)
130
  temp_path = f.name
131
  try:
132
- # ⚑ CPU Transcription: No @spaces.GPU needed
133
  segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
134
  text = " ".join([s.text for s in segments]).strip()
135
  return {"text": text}
@@ -137,15 +137,13 @@ def _stt_logic(request_dict):
137
  if os.path.exists(temp_path): os.unlink(temp_path)
138
 
139
  def _translate_logic(text, target_lang):
140
- """Translation runs on CPU (Instant)"""
141
  from deep_translator import GoogleTranslator
142
  translated = GoogleTranslator(source='auto', target=target_lang).translate(text)
143
  return translated
144
 
145
- @spaces.GPU
146
- def _tts_gpu_logic(text, lang, speaker_wav_b64):
147
- """Only TTS triggers GPU allocation"""
148
- load_models()
149
  if not text or not text.strip():
150
  return {"error": "TTS Error: Input text is empty"}
151
 
@@ -166,7 +164,7 @@ def _tts_gpu_logic(text, lang, speaker_wav_b64):
166
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
167
  output_path = output_file.name
168
 
169
- # πŸŽ™οΈ XTTS Inference on GPU
170
  MODELS["tts"].tts_to_file(text=text, language=lang, file_path=output_path, speaker_wav=speaker_wav_path)
171
 
172
  with open(output_path, "rb") as f:
@@ -177,38 +175,40 @@ def _tts_gpu_logic(text, lang, speaker_wav_b64):
177
  if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
178
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
179
 
 
180
  def core_process(request_dict):
181
- """Unified entry (CPU/Hybrid)"""
 
 
 
 
182
  action = request_dict.get("action")
183
  t0 = time.time()
184
- print(f"--- [v76] πŸ› οΈ Process: {action} at {time.ctime()} ---")
185
- load_models() # Load CPU bits if needed
186
 
187
  if action == "stt":
188
- # ⚑ Instant STT on CPU
189
  res = _stt_logic(request_dict)
190
  elif action == "translate":
191
  res = {"translated": _translate_logic(request_dict.get("text"), request_dict.get("target_lang", "en"))}
192
  elif action == "tts":
193
- # πŸš€ TTS on GPU
194
- res = _tts_gpu_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
195
  elif action == "s2st":
196
- # πŸ”— HYBRID PIPELINE
197
- # Step 1: STT (CPU - Instant)
198
  stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
199
  text = stt_res.get("text", "")
200
  if not text: return {"error": "No speech detected"}
201
 
202
- # Step 2: Translation (CPU - Instant)
203
  translated = _translate_logic(text, request_dict.get("target_lang"))
204
 
205
- # Step 3: TTS (GPU - Quality)
206
- tts_res = _tts_gpu_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
207
  res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
 
 
208
  else:
209
  res = {"error": f"Unknown action: {action}"}
210
 
211
- print(f"--- [v76] βœ… End: {action} (Took {time.time()-t0:.2f}s) ---")
212
  return res
213
 
214
  return {"error": f"Unknown action: {action}"}
 
59
 
60
  from df.enhance import enhance, init_df, load_audio, save_audio
61
 
62
+ # FORCE BUILD TRIGGER: 07:18:00 Jan 21 2026
63
+ # v77: High-Speed GPU Pipeline (STT + TTS on GPU)
64
 
65
  # πŸ› οΈ Monkeypatch torchaudio.load
66
  try:
 
122
  raise e
123
 
124
  def _stt_logic(request_dict):
125
+ """STT Logic (Runs on GPU when called via core_process)"""
126
  audio_bytes = base64.b64decode(request_dict.get("file"))
127
  lang = request_dict.get("lang")
128
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
129
  f.write(audio_bytes)
130
  temp_path = f.name
131
  try:
132
+ # Transcribe (Uses GPU if device="cuda" in MODELS)
133
  segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
134
  text = " ".join([s.text for s in segments]).strip()
135
  return {"text": text}
 
137
  if os.path.exists(temp_path): os.unlink(temp_path)
138
 
139
  def _translate_logic(text, target_lang):
140
+ """Translation (CPU/Network)"""
141
  from deep_translator import GoogleTranslator
142
  translated = GoogleTranslator(source='auto', target=target_lang).translate(text)
143
  return translated
144
 
145
+ def _tts_logic(text, lang, speaker_wav_b64):
146
+ """TTS Logic (Runs on GPU when called via core_process)"""
 
 
147
  if not text or not text.strip():
148
  return {"error": "TTS Error: Input text is empty"}
149
 
 
164
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
165
  output_path = output_file.name
166
 
167
+ # πŸŽ™οΈ XTTS Inference
168
  MODELS["tts"].tts_to_file(text=text, language=lang, file_path=output_path, speaker_wav=speaker_wav_path)
169
 
170
  with open(output_path, "rb") as f:
 
175
  if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
176
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
177
 
178
+ @spaces.GPU
179
  def core_process(request_dict):
180
+ """
181
+ Unified GPU Entry Point (v77).
182
+ This function handles all high-speed tasks inside a single GPU allocation.
183
+ The container stays resident on CPU but triggers GPU on demand.
184
+ """
185
  action = request_dict.get("action")
186
  t0 = time.time()
187
+ print(f"--- [v77] πŸš€ GPU SESSION START: {action} at {time.ctime()} ---")
188
+ load_models()
189
 
190
  if action == "stt":
 
191
  res = _stt_logic(request_dict)
192
  elif action == "translate":
193
  res = {"translated": _translate_logic(request_dict.get("text"), request_dict.get("target_lang", "en"))}
194
  elif action == "tts":
195
+ res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
 
196
  elif action == "s2st":
197
+ # πŸ”— FULL PIPELINE (Single GPU Call)
 
198
  stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
199
  text = stt_res.get("text", "")
200
  if not text: return {"error": "No speech detected"}
201
 
 
202
  translated = _translate_logic(text, request_dict.get("target_lang"))
203
 
204
+ tts_res = _tts_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
 
205
  res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
206
+ elif action == "health":
207
+ res = {"status": "awake", "time": time.ctime()}
208
  else:
209
  res = {"error": f"Unknown action: {action}"}
210
 
211
+ print(f"--- [v77] ✨ GPU SESSION END: {action} (Total: {time.time()-t0:.2f}s) ---")
212
  return res
213
 
214
  return {"error": f"Unknown action: {action}"}