Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
|
@@ -61,8 +61,8 @@ if not hasattr(torchaudio, "info"):
|
|
| 61 |
|
| 62 |
from df.enhance import enhance, init_df, load_audio, save_audio
|
| 63 |
|
| 64 |
-
# FORCE BUILD TRIGGER: 09:
|
| 65 |
-
#
|
| 66 |
|
| 67 |
# 🛠️ Monkeypatch torchaudio.load
|
| 68 |
try:
|
|
@@ -186,12 +186,12 @@ def _tts_logic(text, lang, speaker_wav_b64):
|
|
| 186 |
lang_key = lang.strip().lower()
|
| 187 |
mapped_lang = XTTS_MAP.get(lang_key) or XTTS_MAP.get(lang_key.split('-')[0])
|
| 188 |
|
| 189 |
-
print(f"[
|
| 190 |
|
| 191 |
# 🗣️ INTELLIGENT ROUTING
|
| 192 |
# Case A: XTTS Support (Voice Cloning)
|
| 193 |
if mapped_lang and mapped_lang in XTTS_LANG_CODES:
|
| 194 |
-
print(f"[
|
| 195 |
speaker_wav_path = None
|
| 196 |
if speaker_wav_b64:
|
| 197 |
sb = base64.b64decode(speaker_wav_b64)
|
|
@@ -217,7 +217,7 @@ def _tts_logic(text, lang, speaker_wav_b64):
|
|
| 217 |
if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
|
| 218 |
|
| 219 |
# Case B: Chatterbox ONNX Support (High-Quality Fast Fallback)
|
| 220 |
-
print(f"[
|
| 221 |
try:
|
| 222 |
# Use local file if available for cloning in Chatterbox too
|
| 223 |
temp_ref = None
|
|
@@ -241,38 +241,39 @@ def _tts_logic(text, lang, speaker_wav_b64):
|
|
| 241 |
@spaces.GPU
|
| 242 |
def core_process(request_dict):
|
| 243 |
"""
|
| 244 |
-
Unified GPU Entry Point (
|
| 245 |
This function handles all high-speed tasks inside a single GPU allocation.
|
| 246 |
The container stays resident on CPU but triggers GPU on demand.
|
| 247 |
"""
|
| 248 |
action = request_dict.get("action")
|
| 249 |
t0 = time.time()
|
| 250 |
-
print(f"--- [
|
| 251 |
load_models()
|
| 252 |
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
|
|
|
| 274 |
finally:
|
| 275 |
-
print(f"--- [
|
| 276 |
gc.collect()
|
| 277 |
if torch.cuda.is_available():
|
| 278 |
torch.cuda.empty_cache()
|
|
|
|
| 61 |
|
| 62 |
from df.enhance import enhance, init_df, load_audio, save_audio
|
| 63 |
|
| 64 |
+
# FORCE BUILD TRIGGER: 09:40:00 Jan 21 2026
|
| 65 |
+
# v84: Fixed SyntaxError (Missing try block in core_process)
|
| 66 |
|
| 67 |
# 🛠️ Monkeypatch torchaudio.load
|
| 68 |
try:
|
|
|
|
| 186 |
lang_key = lang.strip().lower()
|
| 187 |
mapped_lang = XTTS_MAP.get(lang_key) or XTTS_MAP.get(lang_key.split('-')[0])
|
| 188 |
|
| 189 |
+
print(f"[v84] TTS Request - Original: {lang}, Mapped: {mapped_lang}")
|
| 190 |
|
| 191 |
# 🗣️ INTELLIGENT ROUTING
|
| 192 |
# Case A: XTTS Support (Voice Cloning)
|
| 193 |
if mapped_lang and mapped_lang in XTTS_LANG_CODES:
|
| 194 |
+
print(f"[v84] Using XTTS-v2 for '{mapped_lang}'")
|
| 195 |
speaker_wav_path = None
|
| 196 |
if speaker_wav_b64:
|
| 197 |
sb = base64.b64decode(speaker_wav_b64)
|
|
|
|
| 217 |
if 'output_path' in locals() and os.path.exists(output_path): os.unlink(output_path)
|
| 218 |
|
| 219 |
# Case B: Chatterbox ONNX Support (High-Quality Fast Fallback)
|
| 220 |
+
print(f"[v84] Using Chatterbox ONNX Fallback for '{lang}'")
|
| 221 |
try:
|
| 222 |
# Use local file if available for cloning in Chatterbox too
|
| 223 |
temp_ref = None
|
|
|
|
| 241 |
@spaces.GPU
|
| 242 |
def core_process(request_dict):
|
| 243 |
"""
|
| 244 |
+
Unified GPU Entry Point (v84).
|
| 245 |
This function handles all high-speed tasks inside a single GPU allocation.
|
| 246 |
The container stays resident on CPU but triggers GPU on demand.
|
| 247 |
"""
|
| 248 |
action = request_dict.get("action")
|
| 249 |
t0 = time.time()
|
| 250 |
+
print(f"--- [v84] 🚀 GPU SESSION START: {action} at {time.ctime()} ---")
|
| 251 |
load_models()
|
| 252 |
|
| 253 |
+
try:
|
| 254 |
+
if action == "stt":
|
| 255 |
+
res = _stt_logic(request_dict)
|
| 256 |
+
elif action == "translate":
|
| 257 |
+
res = {"translated": _translate_logic(request_dict.get("text"), request_dict.get("target_lang", "en"))}
|
| 258 |
+
elif action == "tts":
|
| 259 |
+
res = _tts_logic(request_dict.get("text"), request_dict.get("lang"), request_dict.get("speaker_wav"))
|
| 260 |
+
elif action == "s2st":
|
| 261 |
+
# 🚀 FULL PIPELINE (Single GPU Call)
|
| 262 |
+
stt_res = _stt_logic({"file": request_dict.get("file"), "lang": request_dict.get("source_lang")})
|
| 263 |
+
text = stt_res.get("text", "")
|
| 264 |
+
if not text: return {"error": "No speech detected"}
|
| 265 |
+
|
| 266 |
+
translated = _translate_logic(text, request_dict.get("target_lang"))
|
| 267 |
+
|
| 268 |
+
tts_res = _tts_logic(translated, request_dict.get("target_lang"), request_dict.get("speaker_wav"))
|
| 269 |
+
res = {"text": text, "translated": translated, "audio": tts_res.get("audio")}
|
| 270 |
+
elif action == "health":
|
| 271 |
+
res = {"status": "awake", "time": time.ctime()}
|
| 272 |
+
else:
|
| 273 |
+
res = {"error": f"Unknown action: {action}"}
|
| 274 |
+
|
| 275 |
finally:
|
| 276 |
+
print(f"--- [v84] ✨ SESSION END: {action} (Total: {time.time()-t0:.2f}s) ---")
|
| 277 |
gc.collect()
|
| 278 |
if torch.cuda.is_available():
|
| 279 |
torch.cuda.empty_cache()
|