Spaces:

TGPro1
/

S2ST

Running on Zero

App Files Files Community

TGPro1 committed 29 days ago

Commit

4258912

verified ·

1 Parent(s): 060b891

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +22 -22

app.py CHANGED Viewed

@@ -59,8 +59,8 @@ if not hasattr(torchaudio, "info"):
 from df.enhance import enhance, init_df, load_audio, save_audio
-# FORCE BUILD TRIGGER: 07:15:00 Jan 21 2026
-# v76: CPU-STT (Instant) + GPU-TTS (High Quality)
 # 🛠️ Monkeypatch torchaudio.load
 try:
@@ -122,14 +122,14 @@ def load_models():
             raise e
 def _stt_logic(request_dict):
-    """STT runs on CPU for instant start (no GPU queue wait)"""
     audio_bytes = base64.b64decode(request_dict.get("file"))
     lang = request_dict.get("lang")
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
         f.write(audio_bytes)
         temp_path = f.name
     try:
-        # ⚡ CPU Transcription: No @spaces.GPU needed
         segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
         text = " ".join([s.text for s in segments]).strip()
         return {"text": text}
@@ -137,15 +137,13 @@ def _stt_logic(request_dict):
         if os.path.exists(temp_path): os.unlink(temp_path)
 def _translate_logic(text, target_lang):
-    """Translation runs on CPU (Instant)"""
     from deep_translator import GoogleTranslator
     translated = GoogleTranslator(source='auto', target=target_lang).translate(text)
     return translated
-@spaces.GPU
-def _tts_gpu_logic(text, lang, speaker_wav_b64):
-    """Only TTS triggers GPU allocation"""
-    load_models()
     if not text or not text.strip():
         return {"error": "TTS Error: Input text is empty"}
@@ -166,7 +164,7 @@ def _tts_gpu_logic(text, lang, speaker_wav_b64):
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
             output_path = output_file.name
-        # 🎙️ XTTS Inference on GPU
         MODELS["tts"].tts_to_file(text=text, language=lang, file_path=output_path, speaker_wav=speaker_wav_path)
         with open(output_path, "rb") as f:
@@ -177,38 +175,40 @@ def _tts_gpu_logic(text, lang, speaker_wav_b64):
             if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
         if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
 def core_process(request_dict):
-    """Unified entry (CPU/Hybrid)"""
     action = request_dict.get("action")
     t0 = time.time()
-    print(f"--- [v76] 🛠️ Process: {action} at {time.ctime()} ---")
-    load_models() # Load CPU bits if needed
     if action == "stt":
-        # ⚡ Instant STT on CPU
         res = _stt_logic(request_dict)
     elif action == "translate":
         res = {"translated": _translate_logic(request_dict.get("text"), request_dict.get("target_lang", "en"))}
     elif action == "tts":
-        # 🚀 TTS on GPU
-        res = _tts_gpu_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
     elif action == "s2st":
-        # 🔗 HYBRID PIPELINE
-        # Step 1: STT (CPU - Instant)
         stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
         text = stt_res.get("text", "")
         if not text: return {"error": "No speech detected"}
-        # Step 2: Translation (CPU - Instant)
         translated = _translate_logic(text, request_dict.get("target_lang"))
-        # Step 3: TTS (GPU - Quality)
-        tts_res = _tts_gpu_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
         res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
     else:
         res = {"error": f"Unknown action: {action}"}
-    print(f"--- [v76] ✅ End: {action} (Took {time.time()-t0:.2f}s) ---")
     return res
     return {"error": f"Unknown action: {action}"}

 from df.enhance import enhance, init_df, load_audio, save_audio
+# FORCE BUILD TRIGGER: 07:18:00 Jan 21 2026
+# v77: High-Speed GPU Pipeline (STT + TTS on GPU)
 # 🛠️ Monkeypatch torchaudio.load
 try:
             raise e
 def _stt_logic(request_dict):
+    """STT Logic (Runs on GPU when called via core_process)"""
     audio_bytes = base64.b64decode(request_dict.get("file"))
     lang = request_dict.get("lang")
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
         f.write(audio_bytes)
         temp_path = f.name
     try:
+        # Transcribe (Uses GPU if device="cuda" in MODELS)
         segments, _ = MODELS["stt"].transcribe(temp_path, language=lang, beam_size=1)
         text = " ".join([s.text for s in segments]).strip()
         return {"text": text}
         if os.path.exists(temp_path): os.unlink(temp_path)
 def _translate_logic(text, target_lang):
+    """Translation (CPU/Network)"""
     from deep_translator import GoogleTranslator
     translated = GoogleTranslator(source='auto', target=target_lang).translate(text)
     return translated
+def _tts_logic(text, lang, speaker_wav_b64):
+    """TTS Logic (Runs on GPU when called via core_process)"""
     if not text or not text.strip():
         return {"error": "TTS Error: Input text is empty"}
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
             output_path = output_file.name
+        # 🎙️ XTTS Inference
         MODELS["tts"].tts_to_file(text=text, language=lang, file_path=output_path, speaker_wav=speaker_wav_path)
         with open(output_path, "rb") as f:
             if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
         if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
+@spaces.GPU
 def core_process(request_dict):
+    """
+    Unified GPU Entry Point (v77).
+    This function handles all high-speed tasks inside a single GPU allocation.
+    The container stays resident on CPU but triggers GPU on demand.
+    """
     action = request_dict.get("action")
     t0 = time.time()
+    print(f"--- [v77] 🚀 GPU SESSION START: {action} at {time.ctime()} ---")
+    load_models()
     if action == "stt":
         res = _stt_logic(request_dict)
     elif action == "translate":
         res = {"translated": _translate_logic(request_dict.get("text"), request_dict.get("target_lang", "en"))}
     elif action == "tts":
+        res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
     elif action == "s2st":
+        # 🔗 FULL PIPELINE (Single GPU Call)
         stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
         text = stt_res.get("text", "")
         if not text: return {"error": "No speech detected"}
         translated = _translate_logic(text, request_dict.get("target_lang"))
+        tts_res = _tts_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
         res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
+    elif action == "health":
+        res = {"status": "awake", "time": time.ctime()}
     else:
         res = {"error": f"Unknown action: {action}"}
+    print(f"--- [v77] ✨ GPU SESSION END: {action} (Total: {time.time()-t0:.2f}s) ---")
     return res
     return {"error": f"Unknown action: {action}"}