TGPro1 committed on
Commit
060b891
·
verified ·
1 Parent(s): 2934096

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +28 -23
app.py CHANGED
@@ -60,6 +60,7 @@ if not hasattr(torchaudio, "info"):
60
  from df.enhance import enhance, init_df, load_audio, save_audio
61
 
62
  # FORCE BUILD TRIGGER: 07:15:00 Jan 21 2026
 
63
 
64
  # πŸ› οΈ Monkeypatch torchaudio.load
65
  try:
@@ -121,12 +122,14 @@ def load_models():
121
  raise e
122
 
123
  def _stt_logic(request_dict):
 
124
  audio_bytes = base64.b64decode(request_dict.get("file"))
125
  lang = request_dict.get("lang")
126
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
127
  f.write(audio_bytes)
128
  temp_path = f.name
129
  try:
 
130
  segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
131
  text = " ".join([s.text for s in segments]).strip()
132
  return {"text": text}
@@ -134,15 +137,18 @@ def _stt_logic(request_dict):
134
  if os.path.exists(temp_path): os.unlink(temp_path)
135
 
136
  def _translate_logic(text, target_lang):
 
137
  from deep_translator import GoogleTranslator
138
  translated = GoogleTranslator(source='auto', target=target_lang).translate(text)
139
  return translated
140
 
141
- def _tts_logic(text, lang, speaker_wav_b64):
 
 
 
142
  if not text or not text.strip():
143
  return {"error": "TTS Error: Input text is empty"}
144
 
145
- # 🧹 Normalize language code
146
  if lang:
147
  lang = lang.strip().lower()
148
  if '-' in lang: lang = lang.split('-')[0]
@@ -159,7 +165,10 @@ def _tts_logic(text, lang, speaker_wav_b64):
159
  try:
160
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
161
  output_path = output_file.name
 
 
162
  MODELS["tts"].tts_to_file(text=text, language=lang, file_path=output_path, speaker_wav=speaker_wav_path)
 
163
  with open(output_path, "rb") as f:
164
  audio_b64 = base64.b64encode(f.read()).decode()
165
  return {"audio": audio_b64}
@@ -168,35 +177,38 @@ def _tts_logic(text, lang, speaker_wav_b64):
168
  if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
169
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
170
 
171
@spaces.GPU
def core_process(request_dict):
    """Entry point for GPU-bound tasks. Only one GPU allocation per call.

    Dispatches on request_dict["action"]:
      - "stt":  speech-to-text via _stt_logic
      - "tts":  text-to-speech via _tts_logic
      - "s2st": full speech-to-speech pipeline (STT -> translate -> TTS)
    Returns a dict; on failure the dict carries an "error" key.
    """
    action = request_dict.get("action")
    t0 = time.time()
    print(f"--- [v74] πŸ› οΈ GPU Start: {action} at {time.ctime()} ---")
    # Models are loaded inside the GPU-allocated call so weights land on the device.
    load_models()

    if action == "stt":
        res = _stt_logic(request_dict)
    elif action == "tts":
        res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
    elif action == "s2st":
        # COMPACT PIPELINE: stay on the same GPU worker for all steps.
        # Step 1: STT
        stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
        text = stt_res.get("text", "")
        if not text:
            return {"error": "No speech detected"}

        # Step 2: Translation (Google API)
        translated = _translate_logic(text, request_dict.get("target_lang"))

        # Step 3: TTS
        tts_res = _tts_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
        res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
    else:
        res = {"error": f"Unknown GPU action: {action}"}

    print(f"--- [v74] βœ… GPU End: {action} (Took {time.time()-t0:.2f}s) ---")
    return res
    # FIX: removed an unreachable duplicate `return {"error": ...}` that followed
    # `return res` in the original — dead code that could mask future edits.
@@ -243,17 +255,10 @@ app = FastAPI()
243
 
244
  @app.post("/api/v1/process")
245
  async def api_process(request: Request):
246
- """Async endpoint routes to GPU or CPU logic"""
247
  try:
248
  data = await request.json()
249
- action = data.get("action")
250
-
251
- if action == "translate":
252
- # ⚑ CPU OPTIMIZATION: Translation is just a web request, don't waste GPU allocation
253
- translated = _translate_logic(data.get("text"), data.get("target_lang", "en"))
254
- return {"translated": translated}
255
-
256
- # For STT, TTS, S2ST: Trigger ONE GPU allocation
257
  result = core_process(data)
258
  return result
259
  except Exception as e:
 
60
  from df.enhance import enhance, init_df, load_audio, save_audio
61
 
62
  # FORCE BUILD TRIGGER: 07:15:00 Jan 21 2026
63
+ # v76: CPU-STT (Instant) + GPU-TTS (High Quality)
64
 
65
  # πŸ› οΈ Monkeypatch torchaudio.load
66
  try:
 
122
  raise e
123
 
124
  def _stt_logic(request_dict):
125
+ """STT runs on CPU for instant start (no GPU queue wait)"""
126
  audio_bytes = base64.b64decode(request_dict.get("file"))
127
  lang = request_dict.get("lang")
128
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
129
  f.write(audio_bytes)
130
  temp_path = f.name
131
  try:
132
+ # ⚑ CPU Transcription: No @spaces.GPU needed
133
  segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
134
  text = " ".join([s.text for s in segments]).strip()
135
  return {"text": text}
 
137
  if os.path.exists(temp_path): os.unlink(temp_path)
138
 
139
  def _translate_logic(text, target_lang):
140
+ """Translation runs on CPU (Instant)"""
141
  from deep_translator import GoogleTranslator
142
  translated = GoogleTranslator(source='auto', target=target_lang).translate(text)
143
  return translated
144
 
145
+ @spaces.GPU
146
+ def _tts_gpu_logic(text, lang, speaker_wav_b64):
147
+ """Only TTS triggers GPU allocation"""
148
+ load_models()
149
  if not text or not text.strip():
150
  return {"error": "TTS Error: Input text is empty"}
151
 
 
152
  if lang:
153
  lang = lang.strip().lower()
154
  if '-' in lang: lang = lang.split('-')[0]
 
165
  try:
166
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
167
  output_path = output_file.name
168
+
169
+ # πŸŽ™οΈ XTTS Inference on GPU
170
  MODELS["tts"].tts_to_file(text=text, language=lang, file_path=output_path, speaker_wav=speaker_wav_path)
171
+
172
  with open(output_path, "rb") as f:
173
  audio_b64 = base64.b64encode(f.read()).decode()
174
  return {"audio": audio_b64}
 
177
  if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
178
  if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
179
 
 
180
  def core_process(request_dict):
181
+ """Unified entry (CPU/Hybrid)"""
182
  action = request_dict.get("action")
183
  t0 = time.time()
184
+ print(f"--- [v76] πŸ› οΈ Process: {action} at {time.ctime()} ---")
185
+ load_models() # Load CPU bits if needed
186
 
187
  if action == "stt":
188
+ # ⚑ Instant STT on CPU
189
  res = _stt_logic(request_dict)
190
+ elif action == "translate":
191
+ res = {"translated": _translate_logic(request_dict.get("text"), request_dict.get("target_lang", "en"))}
192
  elif action == "tts":
193
+ # πŸš€ TTS on GPU
194
+ res = _tts_gpu_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
195
  elif action == "s2st":
196
+ # πŸ”— HYBRID PIPELINE
197
+ # Step 1: STT (CPU - Instant)
198
  stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
199
  text = stt_res.get("text", "")
200
  if not text: return {"error": "No speech detected"}
201
 
202
+ # Step 2: Translation (CPU - Instant)
203
  translated = _translate_logic(text, request_dict.get("target_lang"))
204
 
205
+ # Step 3: TTS (GPU - Quality)
206
+ tts_res = _tts_gpu_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
207
  res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
208
  else:
209
+ res = {"error": f"Unknown action: {action}"}
210
 
211
+ print(f"--- [v76] βœ… End: {action} (Took {time.time()-t0:.2f}s) ---")
212
  return res
213
 
214
  return {"error": f"Unknown action: {action}"}
 
255
 
256
  @app.post("/api/v1/process")
257
  async def api_process(request: Request):
258
+ """Async endpoint. Routes to CPU (STT/Translate) or Hybrid (S2ST/TTS)"""
259
  try:
260
  data = await request.json()
261
+ # Direct call to the hybrid process
 
 
 
 
 
 
 
262
  result = core_process(data)
263
  return result
264
  except Exception as e: