File size: 8,710 Bytes
7454cce
 
811f60e
 
 
 
 
6fd2aaf
7bc2a36
69d9eef
811f60e
 
0c19477
0695186
 
 
6fd2aaf
6ce2f3c
6fd2aaf
6ce2f3c
3d20be5
6ce2f3c
 
 
 
 
7bc2a36
6ce2f3c
 
4bfa772
6ce2f3c
 
 
 
 
0695186
0c19477
971c294
6a4b0e8
0c19477
376aa42
6a4b0e8
 
 
 
 
 
ea78b72
 
 
 
 
 
 
 
 
 
 
 
2ebc6b4
 
 
811f60e
 
 
4bfa772
b93eba5
4bfa772
 
 
 
 
 
971c294
7bc2a36
4bfa772
71c50e8
4bfa772
7bc2a36
cb0d204
0695186
7bc2a36
0695186
 
971c294
 
 
 
 
 
0695186
7bc2a36
971c294
7bc2a36
0695186
7bc2a36
 
0695186
7bc2a36
 
0695186
 
0af9862
7bc2a36
0695186
 
 
 
 
971c294
639ffca
4bfa772
 
7bc2a36
4bfa772
7bc2a36
 
0695186
7bc2a36
4bfa772
 
 
 
0695186
7bc2a36
 
 
 
 
 
 
 
4bfa772
 
 
 
 
 
7bc2a36
 
639ffca
d8da089
639ffca
 
 
0695186
639ffca
0695186
639ffca
22c6fab
4bfa772
639ffca
6ce2f3c
4bfa772
 
6ce2f3c
23b6539
7bc2a36
639ffca
4bfa772
 
639ffca
 
4bfa772
22c6fab
639ffca
4bfa772
639ffca
 
6ce2f3c
 
639ffca
7bc2a36
4bfa772
639ffca
0695186
639ffca
 
0695186
639ffca
 
 
 
 
 
 
 
 
4bfa772
639ffca
 
 
 
 
7bc2a36
639ffca
 
 
 
 
4bfa772
 
639ffca
 
 
 
 
71c50e8
639ffca
0695186
639ffca
 
0695186
b0d71b5
 
 
639ffca
32297a1
811f60e
7bc2a36
 
4bfa772
0695186
4bfa772
0af9862
0695186
 
0af9862
7bc2a36
811f60e
 
0695186
e0a0f24
92366fd
811f60e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import os
import sys
import time
import base64
import torch
import tempfile
import traceback
import uvicorn
import gc
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse

# --- [v164] πŸš€ PRO GPU ENGINE (ULTIMATE STABILITY) ---
# This version enforces float32 for STT to avoid CUBLAS errors and uses batch_size=1.
print(f"--- [v164] πŸ“‘ BOOTING ENGINE ---")

# πŸ› οΈ CRITICAL: TORCHAUDIO MONKEYPATCH πŸ› οΈ
import torchaudio
import soundfile as sf
def HeroLoad(filepath, **kwargs):
    """Drop-in replacement for ``torchaudio.load`` backed by soundfile.

    Returns ``(tensor, samplerate)`` with tensor shape ``(channels, frames)``,
    matching torchaudio's convention. On any failure it logs and falls back to
    the original loader saved as ``torchaudio.load_orig``.
    """
    try:
        data, samplerate = sf.read(filepath)
        if data.ndim == 1:
            # Mono: soundfile yields (frames,); torchaudio expects (1, frames).
            data = data.reshape(1, -1)
        else:
            # Multi-channel: soundfile is (frames, channels); transpose to
            # (channels, frames) for torchaudio compatibility.
            data = data.T
        return torch.from_numpy(data).float(), samplerate
    except Exception as e:
        # Version tag fixed from stale [v162] to match the rest of the file.
        print(f"--- [v164] ❌ PATCHED LOAD FAILED: {e} ---")
        return torchaudio.load_orig(filepath, **kwargs)

# Apply the monkeypatch exactly once: stash the original loader under
# `load_orig` (so HeroLoad can fall back to it) and swap in HeroLoad.
# The hasattr guard makes re-imports / hot reloads idempotent.
if not hasattr(torchaudio, 'load_orig'):
    torchaudio.load_orig = torchaudio.load
    torchaudio.load = HeroLoad
    print("--- [v164] 🩹 TORCHAUDIO PATCH APPLIED ---")

from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
from TTS.api import TTS
from deep_translator import GoogleTranslator

try:
    # Optional fallback TTS engine for languages XTTS v2 does not cover.
    import chatterbox_utils
    HAS_CHATTERBOX = True
except ImportError:
    HAS_CHATTERBOX = False

try:
    # Hugging Face Spaces ZeroGPU decorator (allocates a GPU per decorated call).
    import spaces
    HAS_SPACES = True
except ImportError:
    HAS_SPACES = False
    # Local/dev shim: keeps `@spaces.GPU(duration=...)` usable as a no-op decorator.
    class spaces:
        @staticmethod
        def GPU(duration=60):
            def decorator(func):
                return func
            return decorator

os.environ["COQUI_TOS_AGREED"] = "1"  # auto-accept the Coqui license prompt (non-interactive container)
os.environ["PYTHONWARNINGS"] = "ignore"

app = FastAPI()
# Wide-open CORS: the engine is called cross-origin from a browser front end.
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])

# Lazily-populated model cache shared across requests (per worker process).
MODELS = {"stt": None, "tts": None, "gpu_id": 0}

def get_best_gpu():
    """Return the torch device string to run inference on.

    Architectural switch point for future multi-GPU selection; today it
    resolves to the cached `gpu_id` (on ZeroGPU that is the allocated MIG
    instance) or "cpu" when CUDA is absent.
    """
    if torch.cuda.is_available():
        return f"cuda:{MODELS['gpu_id']}"
    return "cpu"

@spaces.GPU(duration=120)
def gpu_stt_full(temp_path, lang):
    global MODELS
    device = get_best_gpu()
    
    if MODELS.get("stt") is None:
        print(f"--- [v164] πŸ“₯ LOADING WHISPER LARGE (FP32) ON {device} ---")
        model_id = "openai/whisper-large-v3-turbo"
        # Using float32 to resolve CUBLAS_STATUS_EXECUTION_FAILED on H200/A10G MIG
        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch.float32).to(device)
        processor = AutoProcessor.from_pretrained(model_id)
        MODELS["stt"] = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            chunk_length_s=30, 
            device=device
        )
    
    print(f"--- [v164] πŸŽ™οΈ WHISPER INFERENCE (TEMP 0, BS 1) ---")
    res = MODELS["stt"](
        temp_path, 
        batch_size=1, # Ultimate stability
        generate_kwargs={
            "language": lang if lang and len(lang) <= 3 else None,
            "temperature": 0.0,
            "return_timestamps": True
        }
    )
    
    # Post-inference cleanup
    torch.cuda.empty_cache()
    gc.collect()
    
    return res["text"].strip()

@spaces.GPU(duration=180)
def gpu_tts_full(text, mapped_lang, speaker_path):
    """Synthesize `text` with XTTS v2 on the GPU and return base64-encoded wav.

    Lazily loads the model into the module-level MODELS cache on first call.
    `mapped_lang` must be an XTTS-supported language code; `speaker_path` is an
    optional reference wav for voice cloning (None uses the default voice).
    """
    global MODELS
    device = "cuda"
    
    if MODELS.get("tts") is None:
        print(f"--- [v164] 📥 LOADING XTTS V2 ON GPU ---")
        MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    else:
        # Re-pin the cached model to the GPU; best-effort because the worker
        # may have been re-allocated a different device between calls.
        try:
            MODELS["tts"].to(device)
        except Exception:  # narrowed from bare except (was swallowing SystemExit etc.)
            pass

    print(f"--- [v164] 🔊 XTTS GPU INFERENCE ---")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
        out_p = out_f.name
    
    try:
        MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_path)
        with open(out_p, "rb") as f:
            audio_b64 = base64.b64encode(f.read()).decode()
    finally:
        # Always remove the temp wav — it was previously leaked when
        # tts_to_file raised before the unlink line.
        if os.path.exists(out_p):
            os.unlink(out_p)
    
    # Cleanup to prevent ZeroGPU worker errors
    torch.cuda.empty_cache()
    gc.collect()
    
    return audio_b64

async def handle_process(request: Request):
    """Handle one engine request.

    Expected JSON body:
      action: "health" | "stt" | "tts" | "s2st"
      file: base64 wav audio (stt/s2st) · text: input text (tts)
      lang / target_lang: language codes · speaker_wav: base64 reference voice
    Always returns a dict; every failure is reported as {"error": ...}
    rather than raised, so the HTTP layer never sees an exception.
    """
    t1 = time.time()
    try:
        data = await request.json()
        action = data.get("action")
        if action == "health": return {"status": "awake", "v": "164"}
        # Guard before action.upper(): a missing action previously raised
        # AttributeError and surfaced as a generic engine error.
        if not action: return {"error": "Missing action"}
        
        print(f"--- [v164] 🛠️ API REQUEST: {action.upper()} ---")
        
        stt_text = ""
        # 🟢 SPEECH-TO-TEXT
        if action in ["stt", "s2st"]:
            audio_b64 = data.get("file")
            if not audio_b64: return {"error": "Missing audio data"}
            
            # Whisper wants a file path, so spill the decoded audio to a temp wav.
            audio_bytes = base64.b64decode(audio_b64)
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(audio_bytes); temp_path = f.name
            try:
                stt_text = gpu_stt_full(temp_path, data.get("lang"))
                print(f"--- [v164] 🎙️ TEXT: {stt_text[:100]}... ---")
            finally:
                if os.path.exists(temp_path): os.unlink(temp_path)
            
            if action == "stt": return {"text": stt_text}

        # 🔵 TEXT-TO-SPEECH
        if action in ["tts", "s2st"]:
            text = (data.get("text") if action == "tts" else stt_text).strip()
            if not text: return {"error": "Input text is empty"}
            
            target = data.get("target_lang") or data.get("lang") or "en"
            trans_text = text
            
            if action == "s2st":
                print(f"--- [v164] 🌏 TRANSLATING TO {target} ---")
                trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
                text = trans_text
                print(f"--- [v164] 📝 TRANS: {text[:100]}... ---")

            # Languages natively supported by XTTS v2; anything else falls back
            # to Chatterbox when that module is installed.
            XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
            clean_lang = target.split('-')[0].lower()
            mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None)
            
            if not mapped_lang:
                if HAS_CHATTERBOX:
                    audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
                    audio_b64 = base64.b64encode(audio_bytes).decode()
                else: return {"error": f"Language {clean_lang} not supported by XTTS/Chatterbox"}
            else:
                # Voice-cloning reference: caller-supplied sample, else a bundled
                # default speaker file, else None (XTTS's built-in default voice).
                speaker_wav_b64 = data.get("speaker_wav")
                speaker_path = None
                if speaker_wav_b64:
                    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                        f.write(base64.b64decode(speaker_wav_b64)); speaker_path = f.name
                else:
                    speaker_path = "default_speaker.wav"
                    if not os.path.exists(speaker_path): speaker_path = None
                
                try:
                    # EXECUTE GPU TTS
                    audio_b64 = gpu_tts_full(text, mapped_lang, speaker_path)
                finally:
                    # Only delete caller-uploaded temp samples, never the bundled default.
                    if speaker_wav_b64 and speaker_path and os.path.exists(speaker_path): os.unlink(speaker_path)
            
            if action == "tts": return {"audio": audio_b64}
            return {"text": stt_text, "translated": trans_text, "audio": audio_b64}

        # Unknown action: previously fell through and the endpoint returned null.
        return {"error": f"Unknown action: {action}"}

    except Exception as e:
        print(f"❌ [v164] ENGINE ERROR: {traceback.format_exc()}")
        return {"error": str(e)}
    finally:
        print(f"--- [v164] ✨ MISSION COMPLETED ({time.time()-t1:.1f}s) ---")

# Legacy /process and versioned /api/v1/process routes share one handler.
@app.post("/process")
@app.post("/api/v1/process")
async def api_process(request: Request): return await handle_process(request)

@app.get("/health")
def health():
    """Readiness probe: engine version plus live CUDA device information."""
    report = {
        "status": "ready",
        "v": "164",
        "gpu": torch.cuda.is_available(),
        "devices": torch.cuda.device_count(),
        "engine": "Full GPU PRO (Stable)",
        "stt": "Whisper-v3-Turbo (FP32-GPU)",
        "tts": "XTTS-v2 (GPU)",
    }
    return report

@app.get("/", response_class=HTMLResponse)
def root():
    """Minimal landing page for a human-visible liveness check."""
    return "<h1>🚀 PRO AI Engine v164 (GPU MODE)</h1>"

# Standalone entry point: serve on 0.0.0.0:7860 (the Hugging Face Spaces default port).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)