TGPro1 committed on
Commit
2934096
·
verified ·
1 Parent(s): ae76cda

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +78 -63
app.py CHANGED
@@ -59,7 +59,7 @@ if not hasattr(torchaudio, "info"):
59
 
60
  from df.enhance import enhance, init_df, load_audio, save_audio
61
 
62
- # FORCE BUILD TRIGGER: 15:58:00 Jan 20 2026
63
 
64
  # 🛠️ Monkeypatch torchaudio.load
65
  try:
@@ -120,78 +120,84 @@ def load_models():
120
  print(f"❌ Failed to load XTTS: {e}")
121
  raise e
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  @spaces.GPU
124
  def core_process(request_dict):
125
- """Synchronous inference logic with GPU decorator"""
126
  action = request_dict.get("action")
127
- print(f"--- 🛠️ Processing Action: {action} (GPU Context) ---")
 
128
  load_models()
129
 
130
  if action == "stt":
131
- audio_bytes = base64.b64decode(request_dict.get("file"))
132
- lang = request_dict.get("lang")
133
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
134
- f.write(audio_bytes)
135
- temp_path = f.name
136
- try:
137
- segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
138
- text = " ".join([s.text for s in segments]).strip()
139
- return {"text": text}
140
- finally:
141
- if os.path.exists(temp_path): os.unlink(temp_path)
142
-
143
- elif action == "translate":
144
- from deep_translator import GoogleTranslator
145
- text = request_dict.get("text")
146
- target_lang = request_dict.get("target_lang", "en")
147
- translated = GoogleTranslator(source='auto', target=target_lang).translate(text)
148
- return {"translated": translated}
149
-
150
  elif action == "tts":
151
- text = request_dict.get("text")
152
- lang = request_dict.get("lang")
153
-
154
- if not text or not text.strip():
155
- return {"error": "TTS Error: Input text is empty"}
156
-
157
- # 🧹 Normalize language code
158
- if lang:
159
- lang = lang.strip().lower()
160
- # Map complex codes to 2-letter codes if needed, e.g., 'fr-fr' -> 'fr'
161
- if '-' in lang: lang = lang.split('-')[0]
162
- speaker_wav_b64 = request_dict.get("speaker_wav")
163
- speaker_wav_path = None
164
- if speaker_wav_b64:
165
- sb = base64.b64decode(speaker_wav_b64)
166
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
167
- f.write(sb)
168
- speaker_wav_path = f.name
169
- else:
170
- speaker_wav_path = "default_speaker.wav"
171
-
172
- try:
173
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
174
- output_path = output_file.name
175
- MODELS["tts"].tts_to_file(text=text, language=lang, file_path=output_path, speaker_wav=speaker_wav_path)
176
- with open(output_path, "rb") as f:
177
- audio_b64 = base64.b64encode(f.read()).decode()
178
- return {"audio": audio_b64}
179
- finally:
180
- if speaker_wav_path and "default_speaker" not in speaker_wav_path:
181
- if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
182
- if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
183
-
184
  elif action == "s2st":
185
- # Full S2ST flow
186
- data = core_process({"action": "stt", "file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
187
- text = data.get("text", "")
 
188
  if not text: return {"error": "No speech detected"}
189
 
190
- data_tr = core_process({"action": "translate", "text": text, "target_lang": request_dict.get("target_lang")})
191
- translated = data_tr.get("translated", "")
 
 
 
 
 
 
192
 
193
- data_tts = core_process({"action": "tts", "text": translated, "lang": request_dict.get("target_lang"), "speaker_wav": request_dict.get("speaker_wav")})
194
- return {"text": text, "translated": translated, "audio": data_tts.get("audio")}
195
 
196
  return {"error": f"Unknown action: {action}"}
197
 
@@ -237,12 +243,21 @@ app = FastAPI()
237
 
238
  @app.post("/api/v1/process")
239
  async def api_process(request: Request):
240
- """Async endpoint calls synchronous GPU function"""
241
  try:
242
  data = await request.json()
 
 
 
 
 
 
 
 
243
  result = core_process(data)
244
  return result
245
  except Exception as e:
 
246
  return {"error": str(e)}
247
 
248
  @app.post("/api/v1/tts_stream")
 
59
 
60
  from df.enhance import enhance, init_df, load_audio, save_audio
61
 
62
+ # FORCE BUILD TRIGGER: 07:15:00 Jan 21 2026
63
 
64
  # 🛠️ Monkeypatch torchaudio.load
65
  try:
 
120
  print(f"❌ Failed to load XTTS: {e}")
121
  raise e
122
 
123
def _stt_logic(request_dict):
    """Transcribe a base64-encoded WAV payload with the shared STT model.

    Expects request_dict keys: "file" (base64 WAV bytes) and "lang"
    (language hint passed straight to the model). Returns {"text": transcript}.
    The temporary WAV file is always removed, even on failure.
    """
    wav_bytes = base64.b64decode(request_dict.get("file"))
    language = request_dict.get("lang")
    # Write the decoded audio to disk; the STT backend wants a file path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(wav_bytes)
        wav_path = tmp.name
    try:
        segments, _ = MODELS["stt"].transcribe(wav_path, language=language, beam_size=1)
        transcript = " ".join(seg.text for seg in segments).strip()
        return {"text": transcript}
    finally:
        if os.path.exists(wav_path):
            os.unlink(wav_path)
136
def _translate_logic(text, target_lang):
    """Translate *text* into *target_lang* using Google Translate.

    Pure CPU work (a plain web request) — deliberately kept out of the GPU
    path. Source language is auto-detected by the service.
    """
    # Lazy import keeps module import time down and avoids a hard
    # dependency at startup.
    from deep_translator import GoogleTranslator
    translator = GoogleTranslator(source='auto', target=target_lang)
    return translator.translate(text)
141
def _tts_logic(text, lang, speaker_wav_b64):
    """Synthesize *text* to speech with the shared XTTS model.

    Args:
        text: text to speak; empty/whitespace-only input is rejected.
        lang: language code; normalized to lowercase 2-letter form
              (e.g. 'fr-FR' -> 'fr').
        speaker_wav_b64: optional base64 WAV used for voice cloning; when
              absent, falls back to the bundled "default_speaker.wav".

    Returns {"audio": <base64 WAV>} on success or {"error": ...} on bad input.
    All temporary files are removed in the finally block.
    """
    if not text or not text.strip():
        return {"error": "TTS Error: Input text is empty"}

    # 🧹 Normalize language code
    if lang:
        lang = lang.strip().lower()
        # Map complex codes to 2-letter codes if needed, e.g., 'fr-fr' -> 'fr'
        if '-' in lang:
            lang = lang.split('-')[0]

    if speaker_wav_b64:
        sb = base64.b64decode(speaker_wav_b64)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(sb)
            speaker_wav_path = f.name
    else:
        speaker_wav_path = "default_speaker.wav"

    # FIX: define output_path up front instead of probing locals() in the
    # finally block — that check was fragile and hid the real control flow.
    output_path = None
    try:
        # FIX: close the temp file handle before the model writes to the
        # same path (writing to a still-open NamedTemporaryFile fails on
        # some platforms). We only need the reserved filename here.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
            output_path = output_file.name
        MODELS["tts"].tts_to_file(text=text, language=lang, file_path=output_path, speaker_wav=speaker_wav_path)
        with open(output_path, "rb") as f:
            audio_b64 = base64.b64encode(f.read()).decode()
        return {"audio": audio_b64}
    finally:
        # Never delete the bundled default speaker sample.
        if speaker_wav_path and "default_speaker" not in speaker_wav_path:
            if os.path.exists(speaker_wav_path):
                os.unlink(speaker_wav_path)
        if output_path and os.path.exists(output_path):
            os.unlink(output_path)
171
@spaces.GPU
def core_process(request_dict):
    """Entry point for GPU-bound tasks. Only one GPU allocation per call.

    Dispatches on request_dict["action"]:
      - "stt":  speech-to-text              -> {"text": ...}
      - "tts":  text-to-speech              -> {"audio": ...} or {"error": ...}
      - "s2st": stt -> translate -> tts     -> {"text", "translated", "audio"}
    Any other action returns an {"error": ...} dict. Never raises for an
    unknown action; callers inspect the dict.
    """
    action = request_dict.get("action")
    t0 = time.time()
    print(f"--- [v74] 🛠️ GPU Start: {action} at {time.ctime()} ---")
    load_models()

    if action == "stt":
        res = _stt_logic(request_dict)
    elif action == "tts":
        res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
    elif action == "s2st":
        # 🔗 COMPACT PIPELINE: Stay on the same GPU worker for all steps
        # Step 1: STT
        stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
        text = stt_res.get("text", "")
        if not text:
            # NOTE: early return intentionally skips the "GPU End" log below,
            # matching existing behavior.
            return {"error": "No speech detected"}

        # Step 2: Translation (Google API)
        translated = _translate_logic(text, request_dict.get("target_lang"))

        # Step 3: TTS
        tts_res = _tts_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
        res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
    else:
        res = {"error": f"Unknown GPU action: {action}"}

    print(f"--- [v74] GPU End: {action} (Took {time.time()-t0:.2f}s) ---")
    return res
    # FIX: removed an unreachable trailing `return {"error": ...}` that was
    # left behind after the refactor — the `else` branch above already
    # covers unknown actions.
203
 
 
243
 
244
@app.post("/api/v1/process")
async def api_process(request: Request):
    """Async endpoint routes to GPU or CPU logic"""
    try:
        payload = await request.json()
        requested_action = payload.get("action")

        if requested_action == "translate":
            # ⚡ CPU OPTIMIZATION: Translation is just a web request, don't waste GPU allocation
            translated_text = _translate_logic(payload.get("text"), payload.get("target_lang", "en"))
            return {"translated": translated_text}

        # For STT, TTS, S2ST: Trigger ONE GPU allocation
        return core_process(payload)
    except Exception as e:
        # Boundary handler: log the full stack, surface a plain error dict.
        traceback.print_exc()
        return {"error": str(e)}
262
 
263
  @app.post("/api/v1/tts_stream")