import os import sys import time import base64 import torch import tempfile import traceback import uvicorn import gc from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import HTMLResponse # --- [v164] 🚀 PRO GPU ENGINE (ULTIMATE STABILITY) --- # This version enforces float32 for STT to avoid CUBLAS errors and uses batch_size=1. print(f"--- [v164] 📡 BOOTING ENGINE ---") # 🛠️ CRITICAL: TORCHAUDIO MONKEYPATCH 🛠️ import torchaudio import soundfile as sf def HeroLoad(filepath, **kwargs): try: data, samplerate = sf.read(filepath) if len(data.shape) == 1: data = data.reshape(1, -1) else: data = data.T return torch.from_numpy(data).float(), samplerate except Exception as e: print(f"--- [v162] ❌ PATCHED LOAD FAILED: {e} ---") return torchaudio.load_orig(filepath, **kwargs) if not hasattr(torchaudio, 'load_orig'): torchaudio.load_orig = torchaudio.load torchaudio.load = HeroLoad print("--- [v164] 🩹 TORCHAUDIO PATCH APPLIED ---") from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor from TTS.api import TTS from deep_translator import GoogleTranslator try: import chatterbox_utils HAS_CHATTERBOX = True except ImportError: HAS_CHATTERBOX = False try: import spaces HAS_SPACES = True except ImportError: HAS_SPACES = False class spaces: @staticmethod def GPU(duration=60): def decorator(func): return func return decorator os.environ["COQUI_TOS_AGREED"] = "1" os.environ["PYTHONWARNINGS"] = "ignore" app = FastAPI() app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]) MODELS = {"stt": None, "tts": None, "gpu_id": 0} def get_best_gpu(): """Architecture for multi-GPU support (Switch).""" if not torch.cuda.is_available(): return "cpu" # Select GPU with most free memory if multiple exist # For ZeroGPU, this defaults to the allocated MIG instance. return f"cuda:{MODELS['gpu_id']}" @spaces.GPU(duration=120) def gpu_stt_full(temp_path, lang): global MODELS device = get_best_gpu() if MODELS.get("stt") is None: print(f"--- [v164] 📥 LOADING WHISPER LARGE (FP32) ON {device} ---") model_id = "openai/whisper-large-v3-turbo" # Using float32 to resolve CUBLAS_STATUS_EXECUTION_FAILED on H200/A10G MIG model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch.float32).to(device) processor = AutoProcessor.from_pretrained(model_id) MODELS["stt"] = pipeline( "automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, chunk_length_s=30, device=device ) print(f"--- [v164] 🎙️ WHISPER INFERENCE (TEMP 0, BS 1) ---") res = MODELS["stt"]( temp_path, batch_size=1, # Ultimate stability generate_kwargs={ "language": lang if lang and len(lang) <= 3 else None, "temperature": 0.0, "return_timestamps": True } ) # Post-inference cleanup torch.cuda.empty_cache() gc.collect() return res["text"].strip() @spaces.GPU(duration=180) def gpu_tts_full(text, mapped_lang, speaker_path): global MODELS device = "cuda" if MODELS.get("tts") is None: print(f"--- [v164] 📥 LOADING XTTS V2 ON GPU ---") MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) else: try: MODELS["tts"].to(device) except: pass print(f"--- [v164] 🔊 XTTS GPU INFERENCE ---") with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f: out_p = out_f.name MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_path) with open(out_p, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode() if os.path.exists(out_p): os.unlink(out_p) # Cleanup to prevent ZeroGPU worker errors torch.cuda.empty_cache() gc.collect() return audio_b64 async def handle_process(request: Request): t1 = time.time() try: data = await request.json() action = data.get("action") if action == "health": return {"status": "awake", "v": "164"} print(f"--- [v164] 🛠️ API REQUEST: {action.upper()} ---") stt_text = "" # 🟢 SPEECH-TO-TEXT if action in ["stt", "s2st"]: audio_b64 = data.get("file") if not audio_b64: return {"error": "Missing audio data"} audio_bytes = base64.b64decode(audio_b64) with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: f.write(audio_bytes); temp_path = f.name try: stt_text = gpu_stt_full(temp_path, data.get("lang")) print(f"--- [v162] 🎙️ TEXT: {stt_text[:100]}... ---") finally: if os.path.exists(temp_path): os.unlink(temp_path) if action == "stt": return {"text": stt_text} # 🔵 TEXT-TO-SPEECH if action in ["tts", "s2st"]: text = (data.get("text") if action == "tts" else stt_text).strip() if not text: return {"error": "Input text is empty"} target = data.get("target_lang") or data.get("lang") or "en" trans_text = text if action == "s2st": print(f"--- [v164] 🌏 TRANSLATING TO {target} ---") trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text) text = trans_text print(f"--- [v164] 📝 TRANS: {text[:100]}... ---") XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"} clean_lang = target.split('-')[0].lower() mapped_lang = XTTS_MAP.get(clean_lang) or ("zh-cn" if clean_lang == "zh" else None) if not mapped_lang: if HAS_CHATTERBOX: audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang) audio_b64 = base64.b64encode(audio_bytes).decode() else: return {"error": f"Language {clean_lang} not supported by XTTS/Chatterbox"} else: speaker_wav_b64 = data.get("speaker_wav") speaker_path = None if speaker_wav_b64: with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: f.write(base64.b64decode(speaker_wav_b64)); speaker_path = f.name else: speaker_path = "default_speaker.wav" if not os.path.exists(speaker_path): speaker_path = None try: # EXECUTE GPU TTS audio_b64 = gpu_tts_full(text, mapped_lang, speaker_path) finally: if speaker_wav_b64 and speaker_path and os.path.exists(speaker_path): os.unlink(speaker_path) if action == "tts": return {"audio": audio_b64} return {"text": stt_text, "translated": trans_text, "audio": audio_b64} except Exception as e: print(f"❌ [v164] ENGINE ERROR: {traceback.format_exc()}") return {"error": str(e)} finally: print(f"--- [v164] ✨ MISSION COMPLETED ({time.time()-t1:.1f}s) ---") @app.post("/process") @app.post("/api/v1/process") async def api_process(request: Request): return await handle_process(request) @app.get("/health") def health(): return { "status": "ready", "v": "164", "gpu": torch.cuda.is_available(), "devices": torch.cuda.device_count(), "engine": "Full GPU PRO (Stable)", "stt": "Whisper-v3-Turbo (FP32-GPU)", "tts": "XTTS-v2 (GPU)" } @app.get("/", response_class=HTMLResponse) def root(): return "