Spaces:

TGPro1
/

S2ST

Running on Zero

App Files Files Community

TGPro1 committed 29 days ago

Commit

2934096

verified ·

1 Parent(s): ae76cda

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +78 -63

app.py CHANGED Viewed

@@ -59,7 +59,7 @@ if not hasattr(torchaudio, "info"):
 from df.enhance import enhance, init_df, load_audio, save_audio
-# FORCE BUILD TRIGGER: 15:58:00 Jan 20 2026
 # 🛠️ Monkeypatch torchaudio.load
 try:
@@ -120,78 +120,84 @@ def load_models():
             print(f"❌ Failed to load XTTS: {e}")
             raise e
 @spaces.GPU
 def core_process(request_dict):
-    """Synchronous inference logic with GPU decorator"""
     action = request_dict.get("action")
-    print(f"--- 🛠️ Processing Action: {action} (GPU Context) ---")
     load_models()
     if action == "stt":
-        audio_bytes = base64.b64decode(request_dict.get("file"))
-        lang = request_dict.get("lang")
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            f.write(audio_bytes)
-            temp_path = f.name
-        try:
-            segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
-            text = " ".join([s.text for s in segments]).strip()
-            return {"text": text}
-        finally:
-            if os.path.exists(temp_path): os.unlink(temp_path)
-    elif action == "translate":
-        from deep_translator import GoogleTranslator
-        text = request_dict.get("text")
-        target_lang = request_dict.get("target_lang", "en")
-        translated = GoogleTranslator(source='auto', target=target_lang).translate(text)
-        return {"translated": translated}
     elif action == "tts":
-        text = request_dict.get("text")
-        lang = request_dict.get("lang")
-        if not text or not text.strip():
-            return {"error": "TTS Error: Input text is empty"}
-        # 🧹 Normalize language code
-        if lang:
-            lang = lang.strip().lower()
-            # Map complex codes to 2-letter codes if needed, e.g., 'fr-fr' -> 'fr'
-            if '-' in lang: lang = lang.split('-')[0]
-        speaker_wav_b64 = request_dict.get("speaker_wav")
-        speaker_wav_path = None
-        if speaker_wav_b64:
-            sb = base64.b64decode(speaker_wav_b64)
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-                f.write(sb)
-                speaker_wav_path = f.name
-        else:
-            speaker_wav_path = "default_speaker.wav"
-        try:
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
-                output_path = output_file.name
-            MODELS["tts"].tts_to_file(text=text, language=lang, file_path=output_path, speaker_wav=speaker_wav_path)
-            with open(output_path, "rb") as f:
-                audio_b64 = base64.b64encode(f.read()).decode()
-            return {"audio": audio_b64}
-        finally:
-            if speaker_wav_path and "default_speaker" not in speaker_wav_path:
-                if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
-            if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
     elif action == "s2st":
-        # Full S2ST flow
-        data = core_process({"action": "stt", "file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
-        text = data.get("text", "")
         if not text: return {"error": "No speech detected"}
-        data_tr = core_process({"action": "translate", "text": text, "target_lang": request_dict.get("target_lang")})
-        translated = data_tr.get("translated", "")
-        data_tts = core_process({"action": "tts", "text": translated, "lang": request_dict.get("target_lang"), "speaker_wav": request_dict.get("speaker_wav")})
-        return {"text": text, "translated": translated, "audio": data_tts.get("audio")}
     return {"error": f"Unknown action: {action}"}
@@ -237,12 +243,21 @@ app = FastAPI()
 @app.post("/api/v1/process")
 async def api_process(request: Request):
-    """Async endpoint calls synchronous GPU function"""
     try:
         data = await request.json()
         result = core_process(data)
         return result
     except Exception as e:
         return {"error": str(e)}
 @app.post("/api/v1/tts_stream")

 from df.enhance import enhance, init_df, load_audio, save_audio
+# FORCE BUILD TRIGGER: 07:15:00 Jan 21 2026
 # 🛠️ Monkeypatch torchaudio.load
 try:
             print(f"❌ Failed to load XTTS: {e}")
             raise e
+def _stt_logic(request_dict):
+    audio_bytes = base64.b64decode(request_dict.get("file"))
+    lang = request_dict.get("lang")
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+        f.write(audio_bytes)
+        temp_path = f.name
+    try:
+        segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
+        text = " ".join([s.text for s in segments]).strip()
+        return {"text": text}
+    finally:
+        if os.path.exists(temp_path): os.unlink(temp_path)
+def _translate_logic(text, target_lang):
+    from deep_translator import GoogleTranslator
+    translated = GoogleTranslator(source='auto', target=target_lang).translate(text)
+    return translated
+def _tts_logic(text, lang, speaker_wav_b64):
+    if not text or not text.strip():
+        return {"error": "TTS Error: Input text is empty"}
+    # 🧹 Normalize language code
+    if lang:
+        lang = lang.strip().lower()
+        if '-' in lang: lang = lang.split('-')[0]
+    speaker_wav_path = None
+    if speaker_wav_b64:
+        sb = base64.b64decode(speaker_wav_b64)
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+            f.write(sb)
+            speaker_wav_path = f.name
+    else:
+        speaker_wav_path = "default_speaker.wav"
+    try:
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
+            output_path = output_file.name
+        MODELS["tts"].tts_to_file(text=text, language=lang, file_path=output_path, speaker_wav=speaker_wav_path)
+        with open(output_path, "rb") as f:
+            audio_b64 = base64.b64encode(f.read()).decode()
+        return {"audio": audio_b64}
+    finally:
+        if speaker_wav_path and "default_speaker" not in speaker_wav_path:
+            if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
+        if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
 @spaces.GPU
 def core_process(request_dict):
+    """Entry point for GPU-bound tasks. Only one GPU allocation per call."""
     action = request_dict.get("action")
+    t0 = time.time()
+    print(f"--- [v74] 🛠️ GPU Start: {action} at {time.ctime()} ---")
     load_models()
     if action == "stt":
+        res = _stt_logic(request_dict)
     elif action == "tts":
+        res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
     elif action == "s2st":
+        # 🔗 COMPACT PIPELINE: Stay on the same GPU worker for all steps
+        # Step 1: STT
+        stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
+        text = stt_res.get("text", "")
         if not text: return {"error": "No speech detected"}
+        # Step 2: Translation (Google API)
+        translated = _translate_logic(text, request_dict.get("target_lang"))
+        # Step 3: TTS
+        tts_res = _tts_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
+        res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
+    else:
+        res = {"error": f"Unknown GPU action: {action}"}
+    print(f"--- [v74] ✅ GPU End: {action} (Took {time.time()-t0:.2f}s) ---")
+    return res
     return {"error": f"Unknown action: {action}"}
 @app.post("/api/v1/process")
 async def api_process(request: Request):
+    """Async endpoint routes to GPU or CPU logic"""
     try:
         data = await request.json()
+        action = data.get("action")
+        if action == "translate":
+            # ⚡ CPU OPTIMIZATION: Translation is just a web request, don't waste GPU allocation
+            translated = _translate_logic(data.get("text"), data.get("target_lang", "en"))
+            return {"translated": translated}
+        # For STT, TTS, S2ST: Trigger ONE GPU allocation
         result = core_process(data)
         return result
     except Exception as e:
+        traceback.print_exc()
         return {"error": str(e)}
 @app.post("/api/v1/tts_stream")