Spaces:

TGPro1
/

S2ST

Running on Zero

App Files Files Community

S2ST / app.py

TGPro1

Upload app.py with huggingface_hub

0695186 verified 26 days ago

raw

history blame contribute delete

8.71 kB

	import os
	import sys
	import time
	import base64
	import torch
	import tempfile
	import traceback
	import uvicorn
	import gc
	from fastapi import FastAPI, Request
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import HTMLResponse

	# --- [v164] 🚀 PRO GPU ENGINE (ULTIMATE STABILITY) ---
	# Per the author's notes, this revision runs speech-to-text in float32 with
	# batch_size=1 in order to avoid CUBLAS execution errors.
	# Emit a startup banner so the running engine version shows up in the logs.
	print("--- [v164] 📡 BOOTING ENGINE ---")

	# 🛠️ CRITICAL: TORCHAUDIO MONKEYPATCH 🛠️
	import torchaudio
	import soundfile as sf
	def HeroLoad(filepath, **kwargs):
	    """Load an audio file through ``soundfile``; stand-in for ``torchaudio.load``.

	    Args:
	        filepath: Path (or file-like object) handed to ``soundfile.read``.
	        **kwargs: Keyword arguments in ``torchaudio.load`` style. The
	            following are honoured on the soundfile path:
	            ``frame_offset`` (first frame to read, default 0),
	            ``num_frames`` (frames to read, -1 = all, default -1) and
	            ``channels_first`` (default True). Any other keyword is only
	            used if the call falls back to the original loader.

	    Returns:
	        ``(waveform, sample_rate)`` where ``waveform`` is a float32 tensor
	        shaped ``(channels, frames)`` (or ``(frames, channels)`` when
	        ``channels_first`` is False).

	    If reading through soundfile fails for any reason, the error is logged
	    and the call is forwarded unchanged to ``torchaudio.load_orig``.
	    """
	    try:
	        # always_2d=True yields (frames, channels) even for mono input, which
	        # replaces the manual 1-D reshape.
	        samples, sample_rate = sf.read(
	            filepath,
	            frames=kwargs.get("num_frames", -1),
	            start=kwargs.get("frame_offset", 0),
	            always_2d=True,
	        )
	        waveform = torch.from_numpy(samples).float()
	        if kwargs.get("channels_first", True):
	            # soundfile is frames-major; torchaudio's default is channels-major.
	            waveform = waveform.T.contiguous()
	        return waveform, sample_rate
	    except Exception as e:
	        print(f"--- [v162] ❌ PATCHED LOAD FAILED: {e} ---")
	        return torchaudio.load_orig(filepath, **kwargs)

	# Remember the stock loader exactly once (an already-saved ``load_orig`` is
	# kept as is), then route every ``torchaudio.load`` call through HeroLoad.
	torchaudio.load_orig = getattr(torchaudio, "load_orig", torchaudio.load)
	torchaudio.load = HeroLoad
	print("--- [v164] 🩹 TORCHAUDIO PATCH APPLIED ---")

	from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
	from TTS.api import TTS
	from deep_translator import GoogleTranslator

	# Optional dependency: chatterbox_utils is used as the TTS fallback for
	# languages that the XTTS language map does not cover.
	try:
	    import chatterbox_utils
	    HAS_CHATTERBOX = True
	except ImportError:
	    HAS_CHATTERBOX = False

	# Optional dependency: the ``spaces`` package provides the ``@spaces.GPU``
	# decorator. When it is not installed, a no-op stand-in with the same name is
	# defined so the decorated functions below still work.
	try:
	    import spaces
	    HAS_SPACES = True
	except ImportError:
	    HAS_SPACES = False

	    class spaces:
	        """No-op replacement for the ``spaces`` module when it is unavailable."""

	        @staticmethod
	        def GPU(func=None, duration=60, **_ignored):
	            """Return the decorated function unchanged.

	            Works both as ``@spaces.GPU`` (bare) and as
	            ``@spaces.GPU(duration=...)``; ``duration`` and any other
	            keyword are accepted and ignored.
	            """
	            if callable(func):
	                # Bare usage: ``func`` is the function being decorated.
	                return func

	            def decorator(wrapped):
	                return wrapped

	            return decorator

	# Presumably lets Coqui TTS skip its interactive licence prompt when the
	# XTTS model is first fetched — TODO confirm against the TTS package.
	os.environ["COQUI_TOS_AGREED"] = "1"
	# NOTE(review): PYTHONWARNINGS is read at interpreter start-up, so setting it
	# here most likely only affects child processes, not this one.
	os.environ["PYTHONWARNINGS"] = "ignore"

	app = FastAPI()
	# Wildcard CORS: an empty-string entry matches no origin or method, which
	# would make the middleware reject every cross-origin browser request.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],
	    allow_methods=["*"],
	    allow_headers=["*"],
	)

	# Lazily-populated model cache shared by the inference functions, plus the
	# CUDA device index used to build the device string.
	MODELS = {"stt": None, "tts": None, "gpu_id": 0}

	def get_best_gpu():
	    """Return the torch device string that inference should run on.

	    Gives ``"cuda:<n>"`` with ``n`` read from ``MODELS["gpu_id"]`` when CUDA
	    is available, and ``"cpu"`` otherwise. This is only a hook for future
	    multi-GPU selection: no free-memory comparison is performed here.
	    """
	    if torch.cuda.is_available():
	        # Author's note: on ZeroGPU this index refers to the allocated MIG
	        # instance.
	        return "cuda:{}".format(MODELS["gpu_id"])
	    return "cpu"

	@spaces.GPU(duration=120)
	def gpu_stt_full(temp_path, lang):
	    """Transcribe an audio file with Whisper and return the stripped text.

	    Args:
	        temp_path: Path of the audio file to transcribe.
	        lang: Language hint forwarded to Whisper. Only non-empty strings of
	            at most three characters are used; anything else means the
	            language is left as ``None`` for the model to decide.

	    Returns:
	        The ``"text"`` field of the pipeline result, stripped of surrounding
	        whitespace.

	    The Whisper pipeline is built on first use and cached in
	    ``MODELS["stt"]``. Memory cleanup runs even when inference raises.
	    """
	    device = get_best_gpu()

	    if MODELS.get("stt") is None:
	        print(f"--- [v164] 📥 LOADING WHISPER LARGE (FP32) ON {device} ---")
	        model_id = "openai/whisper-large-v3-turbo"
	        # Using float32 to resolve CUBLAS_STATUS_EXECUTION_FAILED on H200/A10G MIG
	        model = AutoModelForSpeechSeq2Seq.from_pretrained(
	            model_id, torch_dtype=torch.float32
	        ).to(device)
	        processor = AutoProcessor.from_pretrained(model_id)
	        MODELS["stt"] = pipeline(
	            "automatic-speech-recognition",
	            model=model,
	            tokenizer=processor.tokenizer,
	            feature_extractor=processor.feature_extractor,
	            chunk_length_s=30,
	            device=device,
	        )

	    language = lang if isinstance(lang, str) and 0 < len(lang) <= 3 else None

	    print(f"--- [v164] 🎙️ WHISPER INFERENCE (TEMP 0, BS 1) ---")
	    try:
	        res = MODELS["stt"](
	            temp_path,
	            batch_size=1,  # Ultimate stability
	            generate_kwargs={
	                "language": language,
	                "temperature": 0.0,
	                "return_timestamps": True,
	            },
	        )
	    finally:
	        # Collect garbage first so tensors it frees can be released from the
	        # CUDA cache; doing this in ``finally`` covers failed inference too.
	        gc.collect()
	        torch.cuda.empty_cache()

	    return res["text"].strip()

	@spaces.GPU(duration=180)
	def gpu_tts_full(text, mapped_lang, speaker_path):
	    """Synthesize ``text`` with XTTS v2 and return the WAV as base64 text.

	    Args:
	        text: Text to speak.
	        mapped_lang: Language code passed to ``tts_to_file`` as ``language``.
	        speaker_path: Path of the reference speaker WAV passed as
	            ``speaker_wav`` (may be ``None``).

	    Returns:
	        The generated WAV file contents, base64-encoded as ``str``.

	    The TTS model is created on first use and cached in ``MODELS["tts"]``.
	    The temporary output file and cached GPU memory are released even when
	    synthesis fails.
	    """
	    # Same device selection as the STT path, so a CPU-only host falls back to
	    # "cpu" instead of failing on a hard-coded "cuda".
	    device = get_best_gpu()

	    if MODELS.get("tts") is None:
	        print(f"--- [v164] 📥 LOADING XTTS V2 ON GPU ---")
	        MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
	    else:
	        # Best effort: make sure the cached model sits on the target device.
	        try:
	            MODELS["tts"].to(device)
	        except Exception as e:
	            print(f"--- [v164] ⚠️ TTS DEVICE MOVE FAILED: {e} ---")

	    print(f"--- [v164] 🔊 XTTS GPU INFERENCE ---")
	    # Reserve a file name only; the handle is closed before XTTS writes to it.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
	        out_p = out_f.name

	    try:
	        MODELS["tts"].tts_to_file(
	            text=text,
	            language=mapped_lang,
	            file_path=out_p,
	            speaker_wav=speaker_path,
	        )
	        with open(out_p, "rb") as f:
	            audio_b64 = base64.b64encode(f.read()).decode()
	    finally:
	        if os.path.exists(out_p):
	            os.unlink(out_p)
	        # Cleanup to prevent ZeroGPU worker errors; gc first so the memory it
	        # frees can be dropped from the CUDA cache.
	        gc.collect()
	        torch.cuda.empty_cache()

	    return audio_b64

	async def handle_process(request: Request):
	    """Dispatch one JSON API request and return the response payload.

	    The request body is a JSON object whose ``action`` selects the work:

	    * ``"health"`` – returns a liveness marker.
	    * ``"stt"``    – transcribes base64 audio in ``file`` (optional ``lang``).
	    * ``"tts"``    – speaks ``text`` in ``target_lang`` / ``lang``
	      (default ``"en"``), optionally cloning base64 ``speaker_wav``.
	    * ``"s2st"``   – transcribes ``file``, translates the text to the target
	      language, then speaks the translation.

	    Returns:
	        A dict with the result (``text`` / ``translated`` / ``audio``), or
	        ``{"error": message}`` for invalid input and for any exception
	        raised while processing (the traceback is printed).

	    NOTE(review): the inference helpers are called synchronously from this
	    coroutine, so the event loop is presumably blocked while they run.
	    """
	    t1 = time.time()
	    try:
	        data = await request.json()
	        if not isinstance(data, dict):
	            return {"error": "Request body must be a JSON object"}

	        action = data.get("action")
	        if action == "health":
	            return {"status": "awake", "v": "164"}
	        # Reject missing/unknown actions explicitly instead of crashing on
	        # ``None.upper()`` or silently returning ``None``.
	        if action not in ("stt", "tts", "s2st"):
	            return {"error": f"Unknown action: {action!r}"}

	        print(f"--- [v164] 🛠️ API REQUEST: {action.upper()} ---")

	        stt_text = ""
	        # 🟢 SPEECH-TO-TEXT
	        if action in ("stt", "s2st"):
	            audio_b64 = data.get("file")
	            if not audio_b64:
	                return {"error": "Missing audio data"}

	            audio_bytes = base64.b64decode(audio_b64)
	            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
	                f.write(audio_bytes)
	                temp_path = f.name
	            try:
	                stt_text = gpu_stt_full(temp_path, data.get("lang"))
	                print(f"--- [v162] 🎙️ TEXT: {stt_text[:100]}... ---")
	            finally:
	                if os.path.exists(temp_path):
	                    os.unlink(temp_path)

	            if action == "stt":
	                return {"text": stt_text}

	        # 🔵 TEXT-TO-SPEECH (reached for "tts" and "s2st" only)
	        raw_text = data.get("text") if action == "tts" else stt_text
	        # A missing or non-string "text" is treated as empty input.
	        text = raw_text.strip() if isinstance(raw_text, str) else ""
	        if not text:
	            return {"error": "Input text is empty"}

	        target = data.get("target_lang") or data.get("lang") or "en"
	        trans_text = text

	        if action == "s2st":
	            print(f"--- [v164] 🌏 TRANSLATING TO {target} ---")
	            trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
	            if not trans_text:
	                # Guard against an empty/None translation before slicing it.
	                return {"error": "Translation returned no text"}
	            text = trans_text
	            print(f"--- [v164] 📝 TRANS: {text[:100]}... ---")

	        XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
	        # Reduce e.g. "pt-BR" to "pt" before the lookup; "zh" maps to "zh-cn".
	        clean_lang = str(target).split('-')[0].lower()
	        mapped_lang = XTTS_MAP.get(clean_lang)

	        if not mapped_lang:
	            # Language outside the XTTS map: use Chatterbox when installed.
	            if not HAS_CHATTERBOX:
	                return {"error": f"Language {clean_lang} not supported by XTTS/Chatterbox"}
	            audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
	            audio_b64 = base64.b64encode(audio_bytes).decode()
	        else:
	            speaker_wav_b64 = data.get("speaker_wav")
	            speaker_path = None
	            if speaker_wav_b64:
	                # Caller-supplied reference voice: written to a temp file
	                # that is removed after synthesis.
	                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
	                    f.write(base64.b64decode(speaker_wav_b64))
	                    speaker_path = f.name
	            else:
	                speaker_path = "default_speaker.wav"
	                if not os.path.exists(speaker_path):
	                    speaker_path = None

	            try:
	                # EXECUTE GPU TTS
	                audio_b64 = gpu_tts_full(text, mapped_lang, speaker_path)
	            finally:
	                # Only delete the temp copy, never the bundled default voice.
	                if speaker_wav_b64 and speaker_path and os.path.exists(speaker_path):
	                    os.unlink(speaker_path)

	        if action == "tts":
	            return {"audio": audio_b64}
	        return {"text": stt_text, "translated": trans_text, "audio": audio_b64}

	    except Exception as e:
	        # Top-level API boundary: log the traceback, report the message.
	        print(f"❌ [v164] ENGINE ERROR: {traceback.format_exc()}")
	        return {"error": str(e)}
	    finally:
	        print(f"--- [v164] ✨ MISSION COMPLETED ({time.time()-t1:.1f}s) ---")

	@app.post("/process")
	@app.post("/api/v1/process")
	async def api_process(request: Request):
	    """POST endpoint (two route aliases) that delegates to ``handle_process``."""
	    payload = await handle_process(request)
	    return payload

	@app.get("/health")
	def health():
	    """Report service status, version tag and CUDA visibility as a dict."""
	    cuda = torch.cuda
	    report = {"status": "ready", "v": "164"}
	    # Live values queried from torch at request time.
	    report["gpu"] = cuda.is_available()
	    report["devices"] = cuda.device_count()
	    # Static descriptions of the engine configuration.
	    report["engine"] = "Full GPU PRO (Stable)"
	    report["stt"] = "Whisper-v3-Turbo (FP32-GPU)"
	    report["tts"] = "XTTS-v2 (GPU)"
	    return report

	@app.get("/", response_class=HTMLResponse)
	def root():
	    """Serve a one-line HTML banner at the site root."""
	    banner = "<h1>🚀 PRO AI Engine v164 (GPU MODE)</h1>"
	    return banner

	# Script entry point: serve the FastAPI app with uvicorn on all interfaces,
	# port 7860.
	if __name__ == "__main__":
	    uvicorn.run(app, port=7860, host="0.0.0.0")