Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
|
@@ -10,6 +10,8 @@ import traceback
|
|
| 10 |
import json
|
| 11 |
import time
|
| 12 |
import torchaudio
|
|
|
|
|
|
|
| 13 |
|
| 14 |
# 🛡️ ZeroGPU Support (v69)
|
| 15 |
# CRITICAL: @spaces.GPU MUST only be used on synchronous functions (def, not async def)
|
|
@@ -59,8 +61,8 @@ if not hasattr(torchaudio, "info"):
|
|
| 59 |
|
| 60 |
from df.enhance import enhance, init_df, load_audio, save_audio
|
| 61 |
|
| 62 |
-
# FORCE BUILD TRIGGER:
|
| 63 |
-
#
|
| 64 |
|
| 65 |
# 🛠️ Monkeypatch torchaudio.load
|
| 66 |
try:
|
|
@@ -100,6 +102,15 @@ def load_models():
|
|
| 100 |
print("⚠️ Falling back to CPU (int8)")
|
| 101 |
MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
if MODELS["translate"] is None:
|
| 104 |
print("🌍 Loading Google Translate...")
|
| 105 |
MODELS["translate"] = "active"
|
|
@@ -168,37 +179,64 @@ def _tts_logic(text, lang, speaker_wav_b64):
|
|
| 168 |
"zh": "zh-cn", "zh-cn": "zh-cn", "zh-tw": "zh-cn"
|
| 169 |
}
|
| 170 |
|
|
|
|
|
|
|
|
|
|
| 171 |
if lang:
|
| 172 |
lang_key = lang.strip().lower()
|
| 173 |
-
|
| 174 |
-
# 2. Try the sub-code split match (e.g. 'en-US' -> 'en')
|
| 175 |
-
# 3. Fallback to the original key if not in map
|
| 176 |
-
lang = XTTS_MAP.get(lang_key) or XTTS_MAP.get(lang_key.split('-')[0]) or lang_key
|
| 177 |
-
|
| 178 |
-
print(f"[v79] TTS mapped language: {lang}")
|
| 179 |
-
speaker_wav_path = None
|
| 180 |
-
if speaker_wav_b64:
|
| 181 |
-
sb = base64.b64decode(speaker_wav_b64)
|
| 182 |
-
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
| 183 |
-
f.write(sb)
|
| 184 |
-
speaker_wav_path = f.name
|
| 185 |
-
else:
|
| 186 |
-
speaker_wav_path = "default_speaker.wav"
|
| 187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
try:
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
-
|
| 193 |
-
MODELS["tts"].tts_to_file(text=text, language=lang, file_path=output_path, speaker_wav=speaker_wav_path)
|
| 194 |
|
| 195 |
-
|
| 196 |
-
audio_b64 = base64.b64encode(f.read()).decode()
|
| 197 |
return {"audio": audio_b64}
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
|
| 202 |
|
| 203 |
@spaces.GPU
|
| 204 |
def core_process(request_dict):
|
|
@@ -233,7 +271,12 @@ def core_process(request_dict):
|
|
| 233 |
else:
|
| 234 |
res = {"error": f"Unknown action: {action}"}
|
| 235 |
|
| 236 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
return res
|
| 238 |
|
| 239 |
return {"error": f"Unknown action: {action}"}
|
|
@@ -274,6 +317,9 @@ def gpu_tts_generator(text, lang, speaker_wav_path):
|
|
| 274 |
finally:
|
| 275 |
if speaker_wav_path and "default_speaker" not in speaker_wav_path:
|
| 276 |
if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
|
|
|
|
|
|
|
|
|
|
| 277 |
|
| 278 |
# --- FastAPI Entry Points ---
|
| 279 |
app = FastAPI()
|
|
@@ -314,7 +360,40 @@ async def api_tts_stream(request: Request):
|
|
| 314 |
|
| 315 |
@app.get("/health")
|
| 316 |
def health():
|
| 317 |
-
return {"status": "ok", "gpu": torch.cuda.is_available()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
|
| 319 |
# --- Gradio UI ---
|
| 320 |
def gradio_fn(req_json):
|
|
|
|
| 10 |
import json
|
| 11 |
import time
|
| 12 |
import torchaudio
|
| 13 |
+
import chatterbox_utils
|
| 14 |
+
import gc
|
| 15 |
|
| 16 |
# 🛡️ ZeroGPU Support (v69)
|
| 17 |
# CRITICAL: @spaces.GPU MUST only be used on synchronous functions (def, not async def)
|
|
|
|
| 61 |
|
| 62 |
from df.enhance import enhance, init_df, load_audio, save_audio
|
| 63 |
|
| 64 |
+
# FORCE BUILD TRIGGER: 09:10:00 Jan 21 2026
|
| 65 |
+
# v81: Stability Optimizations (Memory Management + Cache Clearing)
|
| 66 |
|
| 67 |
# 🛠️ Monkeypatch torchaudio.load
|
| 68 |
try:
|
|
|
|
| 102 |
print("⚠️ Falling back to CPU (int8)")
|
| 103 |
MODELS["stt"] = WhisperModel("large-v3", device="cpu", compute_type="int8")
|
| 104 |
|
| 105 |
+
# 🧹 Proactive Memory Cleanup
|
| 106 |
+
gc.collect()
|
| 107 |
+
if torch.cuda.is_available():
|
| 108 |
+
torch.cuda.empty_cache()
|
| 109 |
+
|
| 110 |
+
# Initialize Chatterbox ONNX (High-Speed Fallback)
|
| 111 |
+
# This will load the model if not already loaded internally by chatterbox_utils
|
| 112 |
+
chatterbox_utils.load_chatterbox(device="cuda" if torch.cuda.is_available() else "cpu")
|
| 113 |
+
|
| 114 |
if MODELS["translate"] is None:
|
| 115 |
print("🌍 Loading Google Translate...")
|
| 116 |
MODELS["translate"] = "active"
|
|
|
|
| 179 |
"zh": "zh-cn", "zh-cn": "zh-cn", "zh-tw": "zh-cn"
|
| 180 |
}
|
| 181 |
|
| 182 |
+
XTTS_LANG_CODES = set(XTTS_MAP.values())
|
| 183 |
+
|
| 184 |
+
mapped_lang = None
|
| 185 |
if lang:
|
| 186 |
lang_key = lang.strip().lower()
|
| 187 |
+
mapped_lang = XTTS_MAP.get(lang_key) or XTTS_MAP.get(lang_key.split('-')[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
+
print(f"[v80] TTS Request - Original: {lang}, Mapped: {mapped_lang}")
|
| 190 |
+
|
| 191 |
+
# 🛣️ INTELLIGENT ROUTING
|
| 192 |
+
# Case A: XTTS Support (Voice Cloning)
|
| 193 |
+
if mapped_lang and mapped_lang in XTTS_LANG_CODES:
|
| 194 |
+
print(f"[v80] Using XTTS-v2 for '{mapped_lang}'")
|
| 195 |
+
speaker_wav_path = None
|
| 196 |
+
if speaker_wav_b64:
|
| 197 |
+
sb = base64.b64decode(speaker_wav_b64)
|
| 198 |
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
| 199 |
+
f.write(sb)
|
| 200 |
+
speaker_wav_path = f.name
|
| 201 |
+
else:
|
| 202 |
+
speaker_wav_path = "default_speaker.wav"
|
| 203 |
+
|
| 204 |
+
try:
|
| 205 |
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
|
| 206 |
+
output_path = output_file.name
|
| 207 |
+
|
| 208 |
+
# 🎙️ XTTS Inference
|
| 209 |
+
MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=output_path, speaker_wav=speaker_wav_path)
|
| 210 |
+
|
| 211 |
+
with open(output_path, "rb") as f:
|
| 212 |
+
audio_b64 = base64.b64encode(f.read()).decode()
|
| 213 |
+
return {"audio": audio_b64}
|
| 214 |
+
finally:
|
| 215 |
+
if speaker_wav_path and "default_speaker" not in speaker_wav_path:
|
| 216 |
+
if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
|
| 217 |
+
if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
|
| 218 |
+
|
| 219 |
+
# Case B: Chatterbox ONNX Support (High-Quality Fast Fallback)
|
| 220 |
+
print(f"[v80] Using Chatterbox ONNX Fallback for '{lang}'")
|
| 221 |
try:
|
| 222 |
+
# Use local file if available for cloning in Chatterbox too
|
| 223 |
+
temp_ref = None
|
| 224 |
+
if speaker_wav_b64:
|
| 225 |
+
sb = base64.b64decode(speaker_wav_b64)
|
| 226 |
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
| 227 |
+
f.write(sb); temp_ref = f.name
|
| 228 |
+
|
| 229 |
+
# Chatterbox supports codes like 'fi', 'el', 'da', etc.
|
| 230 |
+
chatter_lang = lang.strip().lower().split('-')[0]
|
| 231 |
+
audio_bytes = chatterbox_utils.run_chatterbox_inference(text, chatter_lang, speaker_wav_path=temp_ref)
|
| 232 |
|
| 233 |
+
if temp_ref and os.path.exists(temp_ref): os.unlink(temp_ref)
|
|
|
|
| 234 |
|
| 235 |
+
audio_b64 = base64.b64encode(audio_bytes).decode()
|
|
|
|
| 236 |
return {"audio": audio_b64}
|
| 237 |
+
except Exception as e:
|
| 238 |
+
print(f"❌ Chatterbox Fallback failed: {e}")
|
| 239 |
+
return {"error": f"TTS Failure: '{lang}' not supported by XTTS or Chatterbox."}
|
|
|
|
| 240 |
|
| 241 |
@spaces.GPU
|
| 242 |
def core_process(request_dict):
|
|
|
|
| 271 |
else:
|
| 272 |
res = {"error": f"Unknown action: {action}"}
|
| 273 |
|
| 274 |
+
finally:
|
| 275 |
+
print(f"--- [v81] ✨ SESSION END: {action} ---")
|
| 276 |
+
gc.collect()
|
| 277 |
+
if torch.cuda.is_available():
|
| 278 |
+
torch.cuda.empty_cache()
|
| 279 |
+
|
| 280 |
return res
|
| 281 |
|
| 282 |
return {"error": f"Unknown action: {action}"}
|
|
|
|
| 317 |
finally:
|
| 318 |
if speaker_wav_path and "default_speaker" not in speaker_wav_path:
|
| 319 |
if os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
|
| 320 |
+
gc.collect()
|
| 321 |
+
if torch.cuda.is_available():
|
| 322 |
+
torch.cuda.empty_cache()
|
| 323 |
|
| 324 |
# --- FastAPI Entry Points ---
|
| 325 |
app = FastAPI()
|
|
|
|
| 360 |
|
| 361 |
@app.get("/health")
def health():
    """Lightweight liveness probe.

    Reports service status, whether a CUDA device is visible, and the
    current server wall-clock time (human-readable, via time.ctime()).
    """
    payload = {
        "status": "ok",
        "gpu": torch.cuda.is_available(),
        "time": time.ctime(),
    }
    return payload
|
| 364 |
+
|
| 365 |
+
@app.post("/api/v1/clear_cache")
async def clear_cache():
    """Manual deep cleanup of memory and caches.

    Performs, in order: a Python GC pass, a CUDA allocator cache release
    (when a GPU is visible), and a best-effort sweep of leftover temp
    audio files.

    Returns:
        dict: on success — {"status": "success", "cleaned_files": <int>,
        "duration": <str>, "gpu_memory": <str>}; on failure —
        {"status": "error", "message": <str>}. Never raises.
    """
    try:
        t0 = time.time()
        print("🧹 Manual Cache Clearing Triggered...")

        # 1. Python-level garbage collection.
        gc.collect()

        # 2. Release cached CUDA allocations back to the driver.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # 3. Best-effort removal of leftover temp audio files.
        # NOTE(review): the "tm" prefix matches tempfile's default "tmp"
        # prefix but is broader than that — confirm it isn't deleting
        # unrelated files in the shared temp dir.
        temp_dir = tempfile.gettempdir()
        count = 0
        for name in os.listdir(temp_dir):
            if name.endswith(".wav") or name.startswith("tm"):
                try:
                    os.unlink(os.path.join(temp_dir, name))
                    count += 1
                except OSError:
                    # File in use, already gone, or a directory — skip.
                    # (Was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit.)
                    pass

        return {
            "status": "success",
            "cleaned_files": count,
            "duration": f"{time.time()-t0:.2f}s",
            "gpu_memory": f"{torch.cuda.memory_allocated() / 1024**2:.2f}MB" if torch.cuda.is_available() else "N/A"
        }
    except Exception as e:
        # Boundary handler: report the failure to the caller as JSON.
        return {"status": "error", "message": str(e)}
|
| 397 |
|
| 398 |
# --- Gradio UI ---
|
| 399 |
def gradio_fn(req_json):
|