# S2ST / app.py
# TGPro1's picture
# Upload app.py with huggingface_hub
# 0695186 verified
import os
import sys
import time
import base64
import torch
import tempfile
import traceback
import uvicorn
import gc
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
# --- [v164] 🚀 PRO GPU ENGINE (ULTIMATE STABILITY) ---
# This version enforces float32 for STT to avoid CUBLAS errors and uses batch_size=1.
# Startup banner, printed once at import time before the heavy ML imports run.
print(f"--- [v164] πŸ“‘ BOOTING ENGINE ---")
# 🛠️ CRITICAL: TORCHAUDIO MONKEYPATCH 🛠️
# torchaudio.load is replaced below by a soundfile-based loader (HeroLoad).
import torchaudio
import soundfile as sf
def HeroLoad(filepath, **kwargs):
    """Load audio through ``soundfile`` as a stand-in for ``torchaudio.load``.

    The keyword arguments ``frame_offset``, ``num_frames`` and
    ``channels_first`` are honoured with the meaning ``torchaudio.load``
    gives them; any other keyword is ignored on this path and samples are
    always returned as float32. If the soundfile path fails for any reason,
    the failure is logged and the call is forwarded, with the original
    keyword arguments, to the saved stock loader ``torchaudio.load_orig``.

    Args:
        filepath: Path (or file-like object) accepted by ``soundfile.read``.
        **kwargs: Keyword arguments in ``torchaudio.load`` style.

    Returns:
        Tuple ``(waveform, sample_rate)`` where ``waveform`` is a float32
        tensor shaped ``(channels, frames)`` by default, or
        ``(frames, channels)`` when ``channels_first=False``.
    """
    try:
        start_frame = kwargs.get("frame_offset") or 0
        frame_count = kwargs.get("num_frames")
        if frame_count is None:
            frame_count = -1  # -1 means "read to the end" for soundfile too
        # always_2d yields (frames, channels) even for mono, and float32 is
        # read directly instead of decoding to float64 and casting afterwards.
        data, samplerate = sf.read(
            filepath,
            start=start_frame,
            frames=frame_count,
            dtype="float32",
            always_2d=True,
        )
        waveform = torch.from_numpy(data)
        if kwargs.get("channels_first", True):
            waveform = waveform.t().contiguous()
        return waveform, samplerate
    except Exception as e:
        print(f"--- [v162] ❌ PATCHED LOAD FAILED: {e} ---")
        return torchaudio.load_orig(filepath, **kwargs)
# Stash the stock loader exactly once (the guard keeps a repeated import from
# overwriting it with the already-patched function), then route every
# torchaudio.load call through HeroLoad.
if not hasattr(torchaudio, 'load_orig'):
    torchaudio.load_orig = torchaudio.load
torchaudio.load = HeroLoad
print("--- [v164] 🩹 TORCHAUDIO PATCH APPLIED ---")
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
from TTS.api import TTS
from deep_translator import GoogleTranslator
# Optional dependency: chatterbox_utils is only used for languages that the
# XTTS language map does not cover. Assume it is present, and flip the flag
# if the import turns out to be impossible.
HAS_CHATTERBOX = True
try:
    import chatterbox_utils
except ImportError:
    HAS_CHATTERBOX = False
# Optional dependency: the Hugging Face `spaces` package provides the GPU
# decorator. When it is missing, a no-op stand-in with the same attribute
# name is defined so the decorated functions below still import and run.
try:
    import spaces
    HAS_SPACES = True
except ImportError:
    HAS_SPACES = False

    class spaces:
        """No-op stand-in used when the real ``spaces`` package is absent."""

        @staticmethod
        def GPU(duration=60):
            """Return the decorated function unchanged.

            Works both as ``@spaces.GPU(duration=...)`` and as a bare
            ``@spaces.GPU``. In the bare form Python passes the function
            itself as the first argument, so a callable is handed straight
            back instead of being mistaken for a duration.
            """
            if callable(duration):
                return duration

            def decorator(func):
                return func

            return decorator
# Process-wide environment switches.
# NOTE(review): COQUI_TOS_AGREED presumably lets Coqui TTS skip its licence
# prompt; confirm against the TTS package.
os.environ.update({"COQUI_TOS_AGREED": "1", "PYTHONWARNINGS": "ignore"})

# HTTP application; CORS is left fully open (any origin, method and header).
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Lazily populated model cache: "stt" and "tts" are filled on first use by
# gpu_stt_full / gpu_tts_full; "gpu_id" is the CUDA index read by get_best_gpu.
MODELS = {
    "stt": None,
    "tts": None,
    "gpu_id": 0,
}
def get_best_gpu():
    """Return the torch device string that models should be placed on.

    Yields ``"cpu"`` when CUDA is unavailable, otherwise ``"cuda:<n>"`` with
    ``n`` taken from ``MODELS['gpu_id']``. This is only a hook for future
    multi-GPU switching: no free-memory comparison is performed here.
    """
    if torch.cuda.is_available():
        # On ZeroGPU the configured index refers to the allocated instance.
        return f"cuda:{MODELS['gpu_id']}"
    return "cpu"
@spaces.GPU(duration=120)
def gpu_stt_full(temp_path, lang):
    """Transcribe an audio file with Whisper and return the stripped text.

    The Whisper pipeline is built on first use and cached in
    ``MODELS["stt"]``; later calls reuse it.

    Args:
        temp_path: Path of the audio file to transcribe.
        lang: Language code hint. It is forwarded only when it is non-empty
            and at most three characters long; otherwise ``None`` is passed
            as the language.

    Returns:
        The transcription text with surrounding whitespace removed.
    """
    device = get_best_gpu()
    if MODELS.get("stt") is None:
        print(f"--- [v164] πŸ“₯ LOADING WHISPER LARGE (FP32) ON {device} ---")
        model_id = "openai/whisper-large-v3-turbo"
        # Using float32 to resolve CUBLAS_STATUS_EXECUTION_FAILED on H200/A10G MIG
        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch.float32).to(device)
        processor = AutoProcessor.from_pretrained(model_id)
        MODELS["stt"] = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            chunk_length_s=30,
            device=device,
        )
    print(f"--- [v164] πŸŽ™οΈ WHISPER INFERENCE (TEMP 0, BS 1) ---")
    try:
        res = MODELS["stt"](
            temp_path,
            batch_size=1,  # Ultimate stability
            generate_kwargs={
                "language": lang if lang and len(lang) <= 3 else None,
                "temperature": 0.0,
                "return_timestamps": True,
            },
        )
        return res["text"].strip()
    finally:
        # Runs on failure as well. Collect garbage first so that blocks held
        # by dead tensors are free by the time the CUDA cache is emptied.
        gc.collect()
        torch.cuda.empty_cache()
@spaces.GPU(duration=180)
def gpu_tts_full(text, mapped_lang, speaker_path):
    """Synthesize speech with XTTS v2 and return the WAV as base64 text.

    The TTS model is loaded on first use and cached in ``MODELS["tts"]``;
    on later calls it is moved back onto the CUDA device on a best-effort
    basis.

    Args:
        text: Text to speak.
        mapped_lang: Language code passed to ``tts_to_file`` as ``language``.
        speaker_path: Path of the reference speaker WAV, or ``None``; passed
            to ``tts_to_file`` as ``speaker_wav``.

    Returns:
        The generated WAV file content, base64-encoded as a ``str``.
    """
    device = "cuda"
    if MODELS.get("tts") is None:
        print(f"--- [v164] πŸ“₯ LOADING XTTS V2 ON GPU ---")
        MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    else:
        try:
            MODELS["tts"].to(device)
        except Exception as e:
            # Still best-effort, but the failure is no longer invisible.
            print(f"--- [v164] XTTS DEVICE MOVE SKIPPED: {e} ---")
    print(f"--- [v164] πŸ”Š XTTS GPU INFERENCE ---")
    # Only the file name is needed; the handle is closed before XTTS writes.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
        out_p = out_f.name
    try:
        MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_path)
        with open(out_p, "rb") as f:
            return base64.b64encode(f.read()).decode()
    finally:
        # The temp file is removed even when synthesis or reading fails, and
        # the GPU cleanup still runs if the removal itself raises.
        try:
            if os.path.exists(out_p):
                os.unlink(out_p)
        finally:
            # Cleanup to prevent ZeroGPU worker errors
            gc.collect()
            torch.cuda.empty_cache()
# Request language prefix -> language code handed to XTTS.
_XTTS_LANGUAGES = {
    "en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl",
    "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar",
    "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn",
}

_SUPPORTED_ACTIONS = ("stt", "tts", "s2st")


def _decode_to_temp_wav(payload_b64):
    """Decode base64 audio into a new ``.wav`` temp file and return its path.

    Decoding happens before the file is created, so a malformed payload
    cannot leave an orphaned temp file. The caller must delete the file.
    """
    raw = base64.b64decode(payload_b64)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        handle.write(raw)
        return handle.name


def _remove_file(path):
    """Delete ``path`` if it is set and exists."""
    if path and os.path.exists(path):
        os.unlink(path)


async def handle_process(request: "Request"):
    """Dispatch one JSON API request and return a JSON-serializable dict.

    The body must be a JSON object whose ``action`` is one of:

    * ``"health"`` - returns a liveness marker.
    * ``"stt"``    - transcribes base64 audio from ``file`` (optional ``lang``).
    * ``"tts"``    - speaks ``text`` in ``target_lang`` (falls back to
      ``lang``, then ``"en"``), optionally cloning base64 ``speaker_wav``.
    * ``"s2st"``   - transcribes ``file``, translates the result to the
      target language and speaks the translation.

    Every failure, including validation errors, is reported as
    ``{"error": <message>}`` rather than raised.

    NOTE: the GPU helpers are called synchronously, so the event loop is
    blocked while a request is being processed.
    """
    t1 = time.time()
    try:
        data = await request.json()
        if not isinstance(data, dict):
            return {"error": "Request body must be a JSON object"}

        action = data.get("action")
        if action == "health":
            return {"status": "awake", "v": "164"}
        # A missing action used to crash on None.upper() and an unknown one
        # silently produced a null response; both get an explicit error now.
        if action not in _SUPPORTED_ACTIONS:
            return {"error": f"Unsupported action: {action!r}"}
        print(f"--- [v164] πŸ› οΈ API REQUEST: {action.upper()} ---")

        stt_text = ""
        # SPEECH-TO-TEXT
        if action in ("stt", "s2st"):
            audio_in_b64 = data.get("file")
            if not audio_in_b64:
                return {"error": "Missing audio data"}
            temp_path = _decode_to_temp_wav(audio_in_b64)
            try:
                stt_text = gpu_stt_full(temp_path, data.get("lang"))
                print(f"--- [v162] πŸŽ™οΈ TEXT: {stt_text[:100]}... ---")
            finally:
                _remove_file(temp_path)
            if action == "stt":
                return {"text": stt_text}

        # TEXT-TO-SPEECH (only "tts" and "s2st" reach this point)
        source_text = data.get("text") if action == "tts" else stt_text
        text = (source_text or "").strip()  # tolerate a missing/null "text"
        if not text:
            return {"error": "Input text is empty"}
        target = data.get("target_lang") or data.get("lang") or "en"
        trans_text = text
        if action == "s2st":
            print(f"--- [v164] 🌏 TRANSLATING TO {target} ---")
            trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
            if not trans_text:
                return {"error": "Translation returned no text"}
            text = trans_text
            print(f"--- [v164] πŸ“ TRANS: {text[:100]}... ---")

        clean_lang = target.split('-')[0].lower()
        mapped_lang = _XTTS_LANGUAGES.get(clean_lang)
        if not mapped_lang:
            # Not an XTTS language: fall back to Chatterbox when installed.
            if not HAS_CHATTERBOX:
                return {"error": f"Language {clean_lang} not supported by XTTS/Chatterbox"}
            audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
            audio_b64 = base64.b64encode(audio_bytes).decode()
        else:
            speaker_wav_b64 = data.get("speaker_wav")
            uploaded_speaker = bool(speaker_wav_b64)
            if uploaded_speaker:
                speaker_path = _decode_to_temp_wav(speaker_wav_b64)
            else:
                speaker_path = "default_speaker.wav"
                if not os.path.exists(speaker_path):
                    speaker_path = None
            try:
                # EXECUTE GPU TTS
                audio_b64 = gpu_tts_full(text, mapped_lang, speaker_path)
            finally:
                # Only an uploaded speaker sample is temporary; the bundled
                # default file must never be deleted.
                if uploaded_speaker:
                    _remove_file(speaker_path)

        if action == "tts":
            return {"audio": audio_b64}
        return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
    except Exception as e:
        print(f"❌ [v164] ENGINE ERROR: {traceback.format_exc()}")
        return {"error": str(e)}
    finally:
        print(f"--- [v164] ✨ MISSION COMPLETED ({time.time()-t1:.1f}s) ---")
@app.post("/process")
@app.post("/api/v1/process")
async def api_process(request: Request):
    """POST endpoint (two route aliases) that delegates to handle_process."""
    outcome = await handle_process(request)
    return outcome
@app.get("/health")
def health():
    """Report static engine metadata plus the current CUDA visibility."""
    gpu_present = torch.cuda.is_available()
    device_total = torch.cuda.device_count()
    report = dict(
        status="ready",
        v="164",
        gpu=gpu_present,
        devices=device_total,
        engine="Full GPU PRO (Stable)",
        stt="Whisper-v3-Turbo (FP32-GPU)",
        tts="XTTS-v2 (GPU)",
    )
    return report
@app.get("/", response_class=HTMLResponse)
def root():
    """Serve the landing-page banner as a single HTML heading."""
    banner = "<h1>πŸš€ PRO AI Engine v164 (GPU MODE)</h1>"
    return banner
if __name__ == "__main__":
    # Run as a script: serve the FastAPI app on every interface, port 7860.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )