File size: 8,710 Bytes
7454cce 811f60e 6fd2aaf 7bc2a36 69d9eef 811f60e 0c19477 0695186 6fd2aaf 6ce2f3c 6fd2aaf 6ce2f3c 3d20be5 6ce2f3c 7bc2a36 6ce2f3c 4bfa772 6ce2f3c 0695186 0c19477 971c294 6a4b0e8 0c19477 376aa42 6a4b0e8 ea78b72 2ebc6b4 811f60e 4bfa772 b93eba5 4bfa772 971c294 7bc2a36 4bfa772 71c50e8 4bfa772 7bc2a36 cb0d204 0695186 7bc2a36 0695186 971c294 0695186 7bc2a36 971c294 7bc2a36 0695186 7bc2a36 0695186 7bc2a36 0695186 0af9862 7bc2a36 0695186 971c294 639ffca 4bfa772 7bc2a36 4bfa772 7bc2a36 0695186 7bc2a36 4bfa772 0695186 7bc2a36 4bfa772 7bc2a36 639ffca d8da089 639ffca 0695186 639ffca 0695186 639ffca 22c6fab 4bfa772 639ffca 6ce2f3c 4bfa772 6ce2f3c 23b6539 7bc2a36 639ffca 4bfa772 639ffca 4bfa772 22c6fab 639ffca 4bfa772 639ffca 6ce2f3c 639ffca 7bc2a36 4bfa772 639ffca 0695186 639ffca 0695186 639ffca 4bfa772 639ffca 7bc2a36 639ffca 4bfa772 639ffca 71c50e8 639ffca 0695186 639ffca 0695186 b0d71b5 639ffca 32297a1 811f60e 7bc2a36 4bfa772 0695186 4bfa772 0af9862 0695186 0af9862 7bc2a36 811f60e 0695186 e0a0f24 92366fd 811f60e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 |
import os
import sys
import time
import base64
import torch
import tempfile
import traceback
import uvicorn
import gc
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
# --- [v164] PRO GPU ENGINE (ULTIMATE STABILITY) ---
# This version enforces float32 for STT to avoid CUBLAS errors and uses batch_size=1.
# Startup banner, emitted once when this module is imported.
print(f"--- [v164] π‘ BOOTING ENGINE ---")
# CRITICAL: TORCHAUDIO MONKEYPATCH
# torchaudio and soundfile are imported here, before the model libraries below,
# so that torchaudio.load can be replaced by a soundfile-backed loader.
import torchaudio
import soundfile as sf
def HeroLoad(filepath, **kwargs):
    """Load an audio file through ``soundfile`` in place of ``torchaudio.load``.

    Args:
        filepath: Path (or file-like object) accepted by ``soundfile.read``.
        **kwargs: Keyword arguments a caller would pass to ``torchaudio.load``.
            ``frame_offset``, ``num_frames`` and ``channels_first`` are applied
            on the soundfile path; the full set is forwarded untouched to the
            original loader if the soundfile path fails.

    Returns:
        A ``(waveform, sample_rate)`` tuple where ``waveform`` is a float32
        tensor shaped ``(channels, frames)`` (or ``(frames, channels)`` when
        ``channels_first=False``).

    Raises:
        Whatever ``torchaudio.load_orig`` raises when the fallback also fails.
    """
    # Previously these options were silently dropped, so a caller asking for
    # a slice of the file received the whole file instead.
    frame_offset = kwargs.get("frame_offset", 0)
    num_frames = kwargs.get("num_frames", -1)
    channels_first = kwargs.get("channels_first", True)
    try:
        # always_2d gives (frames, channels) even for mono input, and reading
        # float32 directly avoids a float64 intermediate copy.
        data, samplerate = sf.read(
            filepath,
            start=frame_offset,
            frames=num_frames,
            dtype="float32",
            always_2d=True,
        )
        if channels_first:
            data = data.T
        return torch.from_numpy(data).contiguous(), samplerate
    except Exception as e:
        # Best-effort patch: anything soundfile cannot handle is delegated to
        # the loader that was saved as torchaudio.load_orig.
        print(f"--- [v162] β PATCHED LOAD FAILED: {e} ---")
        return torchaudio.load_orig(filepath, **kwargs)
# Install the patch. The original loader is stored on the module as
# ``load_orig`` only when that attribute does not exist yet, so running this
# code a second time cannot overwrite it with the patched function.
# NOTE(review): indentation was reconstructed; the assignment of
# ``torchaudio.load`` is assumed to sit outside the guard — confirm.
if not hasattr(torchaudio, 'load_orig'):
    torchaudio.load_orig = torchaudio.load
torchaudio.load = HeroLoad
print("--- [v164] π©Ή TORCHAUDIO PATCH APPLIED ---")
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
from TTS.api import TTS
from deep_translator import GoogleTranslator
# Optional dependency: ``chatterbox_utils`` is used as a TTS fallback for
# languages outside the XTTS map. The flag records whether it could be imported.
HAS_CHATTERBOX = True
try:
    import chatterbox_utils
except ImportError:
    HAS_CHATTERBOX = False
# Optional dependency: the ``spaces`` package supplies the ``GPU`` decorator
# used on the inference functions. When it is not installed, a no-op stand-in
# with the same name is defined so the decorators still resolve.
try:
    import spaces
    HAS_SPACES = True
except ImportError:
    HAS_SPACES = False

    class spaces:
        """No-op replacement for the ``spaces`` package."""

        @staticmethod
        def GPU(func=None, duration=60):
            """Return the decorated function unchanged.

            Works both as a bare decorator (``@spaces.GPU``) and as a
            decorator factory (``@spaces.GPU(duration=120)``). ``duration``
            is accepted for signature compatibility and otherwise ignored.
            """
            # Bare usage: the function itself arrives as the first argument.
            if callable(func):
                return func

            def decorator(fn):
                return fn

            return decorator
# Process-wide settings.
# NOTE(review): COQUI_TOS_AGREED presumably lets the Coqui model download run
# without an interactive licence prompt — confirm against the TTS package.
os.environ.update({"COQUI_TOS_AGREED": "1", "PYTHONWARNINGS": "ignore"})

# HTTP application; CORS is fully open (any origin, method and header).
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Lazily populated model cache ("stt"/"tts" are filled on first use) plus the
# CUDA device index read by get_best_gpu().
MODELS = {"stt": None, "tts": None, "gpu_id": 0}
def get_best_gpu():
    """Return the torch device string that inference should run on.

    Returns:
        ``"cpu"`` when CUDA is unavailable, otherwise ``"cuda:<index>"`` where
        the index is ``MODELS["gpu_id"]``. An index that is missing, not an
        integer, or outside the range of visible devices falls back to 0.

    No free-memory comparison is performed; the index is purely the configured
    one, which is the hook for switching GPUs on multi-GPU hosts.
    """
    if not torch.cuda.is_available():
        return "cpu"
    gpu_id = MODELS.get("gpu_id", 0)
    # Guard against a stale or mistyped index producing an invalid device.
    if not isinstance(gpu_id, int) or not 0 <= gpu_id < torch.cuda.device_count():
        gpu_id = 0
    return f"cuda:{gpu_id}"
@spaces.GPU(duration=120)
def gpu_stt_full(temp_path, lang):
    """Transcribe an audio file with the cached Whisper pipeline.

    The pipeline is created on first use and stored in ``MODELS["stt"]``.

    Args:
        temp_path: Path of the audio file to transcribe.
        lang: Optional language code. Only the part before the first ``-`` is
            used (so ``"en-US"`` becomes ``"en"``); anything empty, non-string
            or longer than three characters leaves the language unset.

    Returns:
        The transcribed text with surrounding whitespace removed.

    Raises:
        Any exception raised while loading the model or running inference.
    """
    device = get_best_gpu()
    if MODELS.get("stt") is None:
        print(f"--- [v164] π₯ LOADING WHISPER LARGE (FP32) ON {device} ---")
        model_id = "openai/whisper-large-v3-turbo"
        # Using float32 to resolve CUBLAS_STATUS_EXECUTION_FAILED on H200/A10G MIG
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=torch.float32
        ).to(device)
        processor = AutoProcessor.from_pretrained(model_id)
        MODELS["stt"] = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            chunk_length_s=30,
            device=device,
        )

    # Reduce region-tagged codes to their primary subtag instead of dropping
    # them; the three-character limit of the original check is kept.
    language = None
    if isinstance(lang, str) and lang:
        primary = lang.split("-")[0].strip().lower()
        if 0 < len(primary) <= 3:
            language = primary

    print(f"--- [v164] ποΈ WHISPER INFERENCE (TEMP 0, BS 1) ---")
    try:
        res = MODELS["stt"](
            temp_path,
            batch_size=1,  # Ultimate stability
            generate_kwargs={
                "language": language,
                "temperature": 0.0,
                "return_timestamps": True,
            },
        )
    finally:
        # Post-inference cleanup, also on failure. Collect Python garbage
        # first so the freed tensors can actually be released from the cache.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return res["text"].strip()
@spaces.GPU(duration=180)
def gpu_tts_full(text, mapped_lang, speaker_path):
    """Synthesize speech with the cached XTTS v2 model.

    The model is created on first use and stored in ``MODELS["tts"]``.

    Args:
        text: Text to speak.
        mapped_lang: Language code passed to ``tts_to_file`` as ``language``.
        speaker_path: Path passed to ``tts_to_file`` as ``speaker_wav``
            (may be ``None``).

    Returns:
        The generated WAV file content as a base64-encoded ``str``.

    Raises:
        Any exception raised while loading the model or synthesizing.
    """
    # Fall back to CPU instead of failing outright when CUDA is unavailable.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if MODELS.get("tts") is None:
        print(f"--- [v164] π₯ LOADING XTTS V2 ON GPU ---")
        MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    else:
        # Best effort: re-attach the cached model to the device. A failure is
        # reported rather than silently swallowed, then inference is attempted.
        try:
            MODELS["tts"].to(device)
        except Exception as e:
            print(f"--- [v164] XTTS device move failed: {e} ---")

    print(f"--- [v164] π XTTS GPU INFERENCE ---")
    # Reserve a path only; the handle is closed before the model writes to it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
        out_p = out_f.name
    try:
        MODELS["tts"].tts_to_file(
            text=text,
            language=mapped_lang,
            file_path=out_p,
            speaker_wav=speaker_path,
        )
        with open(out_p, "rb") as f:
            audio_b64 = base64.b64encode(f.read()).decode()
    finally:
        # Always remove the temp file, including when synthesis fails.
        if os.path.exists(out_p):
            os.unlink(out_p)
        # Cleanup to prevent ZeroGPU worker errors
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return audio_b64
# Request language prefix -> language code handed to XTTS.
_XTTS_LANGS = {
    "en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl",
    "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar",
    "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn",
}


def _write_temp_wav(payload_b64):
    """Decode a base64 payload into a new temporary .wav file; return its path."""
    # Decode before creating the file so invalid base64 leaves nothing behind.
    raw = base64.b64decode(payload_b64)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(raw)
        return f.name


def _remove_file(path):
    """Delete ``path`` when it is set and exists; otherwise do nothing."""
    if path and os.path.exists(path):
        os.unlink(path)


async def handle_process(request: Request):
    """Dispatch one JSON API request.

    The JSON body must be an object whose ``action`` is one of:

    * ``"health"`` - returns ``{"status": "awake", "v": "164"}``.
    * ``"stt"``    - transcribes base64 audio in ``file``; returns ``{"text"}``.
    * ``"tts"``    - speaks ``text``; returns ``{"audio"}`` (base64).
    * ``"s2st"``   - transcribes ``file``, translates to the target language
      and speaks it; returns ``{"text", "translated", "audio"}``.

    Optional keys: ``lang``, ``target_lang``, ``speaker_wav`` (base64).

    Returns:
        A dict as described above, or ``{"error": <message>}`` for invalid
        input and for any exception raised while processing.
    """
    # NOTE(review): the inference calls below are synchronous and run directly
    # inside this coroutine.
    t1 = time.time()
    try:
        data = await request.json()
        if not isinstance(data, dict):
            return {"error": "Request body must be a JSON object"}
        action = data.get("action")
        if action == "health":
            return {"status": "awake", "v": "164"}
        # Reject missing/unknown actions up front; they previously surfaced as
        # AttributeError / NameError messages from deeper in the handler.
        if action not in ("stt", "tts", "s2st"):
            return {"error": f"Unknown action: {action!r}"}
        print(f"--- [v164] π οΈ API REQUEST: {action.upper()} ---")

        stt_text = ""
        # SPEECH-TO-TEXT
        if action in ("stt", "s2st"):
            audio_b64 = data.get("file")
            if not audio_b64:
                return {"error": "Missing audio data"}
            temp_path = _write_temp_wav(audio_b64)
            try:
                stt_text = gpu_stt_full(temp_path, data.get("lang"))
                print(f"--- [v162] ποΈ TEXT: {stt_text[:100]}... ---")
            finally:
                _remove_file(temp_path)
            if action == "stt":
                return {"text": stt_text}

        # TEXT-TO-SPEECH (reached for "tts" and "s2st" only)
        raw_text = data.get("text") if action == "tts" else stt_text
        text = raw_text.strip() if isinstance(raw_text, str) else ""
        if not text:
            return {"error": "Input text is empty"}

        target = str(data.get("target_lang") or data.get("lang") or "en")
        trans_text = text
        if action == "s2st":
            print(f"--- [v164] π TRANSLATING TO {target} ---")
            trans_text = GoogleTranslator(source='auto', target=target).translate(stt_text)
            if not trans_text:
                return {"error": "Translation returned no text"}
            text = trans_text
            print(f"--- [v164] π TRANS: {text[:100]}... ---")

        clean_lang = target.split('-')[0].lower()
        mapped_lang = _XTTS_LANGS.get(clean_lang)
        if not mapped_lang:
            # Language outside the XTTS map: use Chatterbox when installed.
            if not HAS_CHATTERBOX:
                return {"error": f"Language {clean_lang} not supported by XTTS/Chatterbox"}
            audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
            audio_b64 = base64.b64encode(audio_bytes).decode()
        else:
            speaker_wav_b64 = data.get("speaker_wav")
            if speaker_wav_b64:
                speaker_path = _write_temp_wav(speaker_wav_b64)
            else:
                # No uploaded reference voice: use the bundled default file
                # when it is present, otherwise pass None through.
                speaker_path = "default_speaker.wav"
                if not os.path.exists(speaker_path):
                    speaker_path = None
            try:
                # EXECUTE GPU TTS
                audio_b64 = gpu_tts_full(text, mapped_lang, speaker_path)
            finally:
                # Only the uploaded temp copy is deleted, never the default file.
                if speaker_wav_b64:
                    _remove_file(speaker_path)

        if action == "tts":
            return {"audio": audio_b64}
        return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
    except Exception as e:
        # Top-level boundary: log the traceback, report the message to the client.
        print(f"β [v164] ENGINE ERROR: {traceback.format_exc()}")
        return {"error": str(e)}
    finally:
        print(f"--- [v164] β¨ MISSION COMPLETED ({time.time()-t1:.1f}s) ---")
@app.post("/process")
@app.post("/api/v1/process")
async def api_process(request: Request):
    """POST endpoint (registered under two paths) that delegates to handle_process."""
    result = await handle_process(request)
    return result
@app.get("/health")
def health():
    """Report service status, version, CUDA availability and device count."""
    cuda_ok = torch.cuda.is_available()
    device_total = torch.cuda.device_count()
    report = {"status": "ready", "v": "164"}
    report["gpu"] = cuda_ok
    report["devices"] = device_total
    report["engine"] = "Full GPU PRO (Stable)"
    report["stt"] = "Whisper-v3-Turbo (FP32-GPU)"
    report["tts"] = "XTTS-v2 (GPU)"
    return report
@app.get("/", response_class=HTMLResponse)
def root():
    """Serve a one-line HTML landing page naming the engine version."""
    landing_html = "<h1>π PRO AI Engine v164 (GPU MODE)</h1>"
    return landing_html
if __name__ == "__main__":
    # Run as a script: serve the app with uvicorn on all interfaces, port 7860.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )
|