Upload app.py with huggingface_hub
app.py CHANGED
@@ -60,6 +60,7 @@ if not hasattr(torchaudio, "info"):
 from df.enhance import enhance, init_df, load_audio, save_audio
 
 # FORCE BUILD TRIGGER: 07:15:00 Jan 21 2026
+# v76: CPU-STT (Instant) + GPU-TTS (High Quality)
 
 # 🛠️ Monkeypatch torchaudio.load
 try:
@@ -121,12 +122,14 @@ def load_models():
         raise e
 
 def _stt_logic(request_dict):
+    """STT runs on CPU for instant start (no GPU queue wait)"""
     audio_bytes = base64.b64decode(request_dict.get("file"))
     lang = request_dict.get("lang")
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
         f.write(audio_bytes)
         temp_path = f.name
     try:
+        # ⚡ CPU Transcription: No @spaces.GPU needed
         segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
         text = " ".join([s.text for s in segments]).strip()
         return {"text": text}
@@ -134,15 +137,18 @@ def _stt_logic(request_dict):
         if os.path.exists(temp_path): os.unlink(temp_path)
 
 def _translate_logic(text, target_lang):
+    """Translation runs on CPU (Instant)"""
     from deep_translator import GoogleTranslator
     translated = GoogleTranslator(source='auto', target=target_lang).translate(text)
     return translated
 
-def _tts_logic(text, lang, speaker_wav_b64):
+@spaces.GPU
+def _tts_gpu_logic(text, lang, speaker_wav_b64):
+    """Only TTS triggers GPU allocation"""
+    load_models()
     if not text or not text.strip():
         return {"error": "TTS Error: Input text is empty"}
 
-    # 🧹 Normalize language code
     if lang:
         lang = lang.strip().lower()
         if '-' in lang: lang = lang.split('-')[0]
@@ -159,7 +165,10 @@ def _tts_logic(text, lang, speaker_wav_b64):
     try:
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
             output_path = output_file.name
+
+        # XTTS Inference on GPU
         MODELS["tts"].tts_to_file(text=text, language=lang, file_path=output_path, speaker_wav=speaker_wav_path)
+
         with open(output_path, "rb") as f:
             audio_b64 = base64.b64encode(f.read()).decode()
         return {"audio": audio_b64}
@@ -168,35 +177,38 @@ def _tts_logic(text, lang, speaker_wav_b64):
         if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
         if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
 
-@spaces.GPU
 def core_process(request_dict):
-    """
+    """Unified entry (CPU/Hybrid)"""
     action = request_dict.get("action")
     t0 = time.time()
-    print(f"--- [
-    load_models()
+    print(f"--- [v76] 🛠️ Process: {action} at {time.ctime()} ---")
+    load_models() # Load CPU bits if needed
 
     if action == "stt":
+        # ⚡ Instant STT on CPU
        res = _stt_logic(request_dict)
+    elif action == "translate":
+        res = {"translated": _translate_logic(request_dict.get("text"), request_dict.get("target_lang", "en"))}
     elif action == "tts":
-
+        # TTS on GPU
+        res = _tts_gpu_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
     elif action == "s2st":
-        #
-        # Step 1: STT
+        # HYBRID PIPELINE
+        # Step 1: STT (CPU - Instant)
         stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
         text = stt_res.get("text", "")
         if not text: return {"error": "No speech detected"}
 
-        # Step 2: Translation (
+        # Step 2: Translation (CPU - Instant)
         translated = _translate_logic(text, request_dict.get("target_lang"))
 
-        # Step 3: TTS
-        tts_res =
+        # Step 3: TTS (GPU - Quality)
+        tts_res = _tts_gpu_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
         res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
     else:
-        res = {"error": f"Unknown
+        res = {"error": f"Unknown action: {action}"}
 
-    print(f"--- [
+    print(f"--- [v76] ✅ End: {action} (Took {time.time()-t0:.2f}s) ---")
     return res
 
     return {"error": f"Unknown action: {action}"}
@@ -243,17 +255,10 @@ app = FastAPI()
 
 @app.post("/api/v1/process")
 async def api_process(request: Request):
-    """Async endpoint
+    """Async endpoint. Routes to CPU (STT/Translate) or Hybrid (S2ST/TTS)"""
     try:
         data = await request.json()
-
-
-        if action == "translate":
-            # ⚡ CPU OPTIMIZATION: Translation is just a web request, don't waste GPU allocation
-            translated = _translate_logic(data.get("text"), data.get("target_lang", "en"))
-            return {"translated": translated}
-
-        # For STT, TTS, S2ST: Trigger ONE GPU allocation
+        # Direct call to the hybrid process
         result = core_process(data)
         return result
     except Exception as e:
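
For context, a minimal client sketch for the /api/v1/process endpoint this commit routes through core_process. It is an illustration, not part of the commit: BASE_URL is a placeholder for wherever the Space is served and the timeout is arbitrary; the request fields (action, file, source_lang, target_lang, speaker_wav) and response keys (text, translated, audio) come from the handlers above.

# Minimal client sketch (illustration only). BASE_URL is a placeholder for the
# deployed Space; request/response fields mirror core_process() above.
import base64
import requests

BASE_URL = "https://<your-space>.hf.space"  # placeholder, not part of the commit

def b64_file(path):
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode()

# Speech-to-speech translation: STT + translation on CPU, XTTS voice cloning on GPU.
payload = {
    "action": "s2st",
    "file": b64_file("input.wav"),          # source speech, base64-encoded WAV
    "source_lang": "en",                    # language hint for the STT model
    "target_lang": "fr",                    # translation + TTS language
    "speaker_wav": b64_file("speaker.wav"), # reference voice for XTTS
}
resp = requests.post(f"{BASE_URL}/api/v1/process", json=payload, timeout=300).json()

print(resp.get("text"))         # transcription of the source audio
print(resp.get("translated"))   # translated text
with open("output.wav", "wb") as f:
    f.write(base64.b64decode(resp["audio"]))  # synthesized target-language speech

With this split, only the tts and s2st actions touch the @spaces.GPU path (_tts_gpu_logic); stt and translate return without waiting for a GPU allocation.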