TGPro1 committed on
Commit
71c50e8
·
verified ·
1 Parent(s): c4e65cf

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +31 -37
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # πŸš€ v135: ZEROGPU HOPPER ELITE (FP32 STABILITY)
2
  try:
3
  import spaces
4
  except ImportError:
@@ -23,11 +23,14 @@ import traceback
23
  import soundfile as sf
24
  from faster_whisper import WhisperModel
25
 
26
- # πŸ›‘οΈ 0. INFRASTRUCTURE OPTIMIZATION (v135)
27
  os.environ["COQUI_TOS_AGREED"] = "1"
28
  os.environ["PYTHONWARNINGS"] = "ignore"
29
- os.environ["CT2_CUDA_ALLOW_TF32"] = "1" # Leverage H200 TF32 cores
30
- torch.set_float32_matmul_precision('high')
 
 
 
31
 
32
  import torchaudio
33
  def torchaudio_load_safe(filepath, **kwargs):
@@ -37,47 +40,46 @@ def torchaudio_load_safe(filepath, **kwargs):
37
  return tensor, sr
38
  torchaudio.load = torchaudio_load_safe
39
 
40
- # πŸ“¦ 1. GLOBAL MODELS (LAZY CPU LOAD)
41
  MODELS = {"stt": None, "tts": None}
42
 
43
- def get_tts():
 
 
 
 
44
  if MODELS["tts"] is None:
45
- print("πŸ”Š Pre-loading XTTS-v2 (CPU RAM)...")
46
  from TTS.api import TTS
47
- MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
48
- return MODELS["tts"]
49
 
50
- # πŸ› οΈ 2. CORE PROCESSING (v135: FP32 FOR STABILITY)
51
  @spaces.GPU(duration=120)
52
  def core_process(request_dict):
53
  global MODELS
54
  action = request_dict.get("action")
55
- print(f"--- [v135] πŸ› οΈ ELITE ENGINE: {action} ---")
56
  t1 = time.time()
57
 
58
  try:
59
- # πŸŽ™οΈ STT PATH (Fast-Whisper GPU FP32)
 
 
 
60
  if action in ["stt", "s2st"]:
61
- print("⚑ Promoting STT to GPU (FP32 path)...")
62
- # Force float32 to avoid cublasSgemm alignment errors on H200 drivers
63
- gpu_stt = WhisperModel("large-v3-turbo", device="cuda", compute_type="float32")
64
-
65
  audio_bytes = base64.b64decode(request_dict.get("file"))
66
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
67
  f.write(audio_bytes); temp_path = f.name
68
  try:
69
  lang = request_dict.get("lang")
70
- segments, _ = gpu_stt.transcribe(temp_path, language=lang if lang and len(lang) <= 3 else None, beam_size=1)
71
  stt_text = "".join([s.text for s in segments]).strip()
72
  finally:
73
  if os.path.exists(temp_path): os.unlink(temp_path)
74
- del gpu_stt
75
- gc.collect()
76
- torch.cuda.empty_cache()
77
 
78
  if action == "stt": return {"text": stt_text}
79
 
80
- # πŸ”Š TTS PATH (XTTS GPU)
81
  if action in ["tts", "s2st"]:
82
  text = (request_dict.get("text") if action == "tts" else stt_text).strip()
83
  if action == "s2st":
@@ -89,10 +91,6 @@ def core_process(request_dict):
89
  if len(text) < 2 or not any(c.isalnum() for c in text):
90
  return {"audio": ""} if action == "tts" else {"text": stt_text, "translated": "", "audio": ""}
91
 
92
- print("⚑ Promoting TTS to GPU...")
93
- from TTS.api import TTS
94
- gpu_tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
95
-
96
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
97
  raw_lang = (request_dict.get("lang") if action == "tts" else target).strip().lower()
98
  clean_lang = raw_lang.split('-')[0]
@@ -111,14 +109,11 @@ def core_process(request_dict):
111
  try:
112
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
113
  out_p = out_f.name
114
- gpu_tts.tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_wav_path)
115
  with open(out_p, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode()
116
  finally:
117
  if speaker_wav_path and "default" not in speaker_wav_path and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
118
  if 'out_p' in locals() and os.path.exists(out_p): os.unlink(out_p)
119
- del gpu_tts
120
- gc.collect()
121
- torch.cuda.empty_cache()
122
  else:
123
  import chatterbox_utils
124
  audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
@@ -128,11 +123,11 @@ def core_process(request_dict):
128
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
129
 
130
  except Exception as e:
131
- print(f"❌ [v135] ERROR: {traceback.format_exc()}")
132
  return {"error": str(e)}
133
  finally:
134
- print(f"--- [v135] ✨ DONE ({time.time()-t1:.1f}s) ---")
135
- gc.collect()
136
 
137
  # πŸš€ 3. SERVER SETUP
138
  app = FastAPI()
@@ -142,20 +137,19 @@ app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], all
142
  async def api_process(request: Request):
143
  try:
144
  data = await request.json()
145
- if data.get("action") == "health": return {"status": "awake", "v": "135"}
146
  return core_process(data)
147
  except Exception as e: return {"error": str(e)}
148
 
149
  @app.get("/health")
150
- def health(): return {"status": "ok", "v": "135"}
151
 
152
  demo = gr.Interface(
153
  fn=lambda x: json.dumps(core_process(json.loads(x))),
154
- inputs="text", outputs="text", title="πŸš€ AI Engine v135 (H200 FP32)",
155
- description="Optimized for H200 | GPU STT (FP32) | GPU TTS | Zero-Crash"
156
  ).queue()
157
  app = gr.mount_gradio_app(app, demo, path="/")
158
 
159
  if __name__ == "__main__":
160
- # Simplified entry point for Hugging Face compatibility
161
  uvicorn.run(app, host="0.0.0.0", port=7860, log_level="warning")
 
1
+ # πŸš€ v136: ZEROGPU HOPPER ULTIMATE (PERSISTENT GPU)
2
  try:
3
  import spaces
4
  except ImportError:
 
23
  import soundfile as sf
24
  from faster_whisper import WhisperModel
25
 
26
+ # πŸ›‘οΈ 0. INFRASTRUCTURE PURIST (v136)
27
  os.environ["COQUI_TOS_AGREED"] = "1"
28
  os.environ["PYTHONWARNINGS"] = "ignore"
29
+ # Strict CUBLAS stability for H200
30
+ os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
31
+ torch.backends.cuda.matmul.allow_tf32 = False
32
+ torch.backends.cudnn.allow_tf32 = False
33
+ torch.use_deterministic_algorithms(False) # Some kernels might need this, but let's keep it flexible
34
 
35
  import torchaudio
36
  def torchaudio_load_safe(filepath, **kwargs):
 
40
  return tensor, sr
41
  torchaudio.load = torchaudio_load_safe
42
 
43
+ # πŸ“¦ 1. GLOBAL MODELS (SINGLETON PATTERN)
44
  MODELS = {"stt": None, "tts": None}
45
 
46
+ def load_gpu_models():
47
+ global MODELS
48
+ if MODELS["stt"] is None:
49
+ print("πŸŽ™οΈ Loading Faster-Whisper to GPU (Persistent)...")
50
+ MODELS["stt"] = WhisperModel("large-v3-turbo", device="cuda", compute_type="float16")
51
  if MODELS["tts"] is None:
52
+ print("πŸ”Š Loading XTTS-v2 to GPU (Persistent)...")
53
  from TTS.api import TTS
54
+ MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
 
55
 
56
+ # πŸ› οΈ 2. CORE PROCESSING (v136: NO PAGING, NO JITTER)
57
  @spaces.GPU(duration=120)
58
  def core_process(request_dict):
59
  global MODELS
60
  action = request_dict.get("action")
61
+ print(f"--- [v136] πŸ› οΈ PURIST ENGINE: {action} ---")
62
  t1 = time.time()
63
 
64
  try:
65
+ # Load once and keep in VRAM within the worker life
66
+ load_gpu_models()
67
+
68
+ # πŸŽ™οΈ STT PATH
69
  if action in ["stt", "s2st"]:
 
 
 
 
70
  audio_bytes = base64.b64decode(request_dict.get("file"))
71
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
72
  f.write(audio_bytes); temp_path = f.name
73
  try:
74
  lang = request_dict.get("lang")
75
+ segments, _ = MODELS["stt"].transcribe(temp_path, language=lang if lang and len(lang) <= 3 else None, beam_size=1)
76
  stt_text = "".join([s.text for s in segments]).strip()
77
  finally:
78
  if os.path.exists(temp_path): os.unlink(temp_path)
 
 
 
79
 
80
  if action == "stt": return {"text": stt_text}
81
 
82
+ # πŸ”Š TTS PATH
83
  if action in ["tts", "s2st"]:
84
  text = (request_dict.get("text") if action == "tts" else stt_text).strip()
85
  if action == "s2st":
 
91
  if len(text) < 2 or not any(c.isalnum() for c in text):
92
  return {"audio": ""} if action == "tts" else {"text": stt_text, "translated": "", "audio": ""}
93
 
 
 
 
 
94
  XTTS_MAP = {"en": "en", "de": "de", "fr": "fr", "es": "es", "it": "it", "pl": "pl", "pt": "pt", "tr": "tr", "ru": "ru", "nl": "nl", "cs": "cs", "ar": "ar", "hu": "hu", "ko": "ko", "hi": "hi", "zh": "zh-cn"}
95
  raw_lang = (request_dict.get("lang") if action == "tts" else target).strip().lower()
96
  clean_lang = raw_lang.split('-')[0]
 
109
  try:
110
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_f:
111
  out_p = out_f.name
112
+ MODELS["tts"].tts_to_file(text=text, language=mapped_lang, file_path=out_p, speaker_wav=speaker_wav_path)
113
  with open(out_p, "rb") as f: audio_b64 = base64.b64encode(f.read()).decode()
114
  finally:
115
  if speaker_wav_path and "default" not in speaker_wav_path and os.path.exists(speaker_wav_path): os.unlink(speaker_wav_path)
116
  if 'out_p' in locals() and os.path.exists(out_p): os.unlink(out_p)
 
 
 
117
  else:
118
  import chatterbox_utils
119
  audio_bytes = chatterbox_utils.run_chatterbox_inference(text, clean_lang)
 
123
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
124
 
125
  except Exception as e:
126
+ print(f"❌ [v136] ERROR: {traceback.format_exc()}")
127
  return {"error": str(e)}
128
  finally:
129
+ print(f"--- [v136] ✨ DONE ({time.time()-t1:.1f}s) ---")
130
+ torch.cuda.empty_cache() # Keep models in VRAM, but clear temp buffers
131
 
132
  # πŸš€ 3. SERVER SETUP
133
  app = FastAPI()
 
137
  async def api_process(request: Request):
138
  try:
139
  data = await request.json()
140
+ if data.get("action") == "health": return {"status": "awake", "v": "136"}
141
  return core_process(data)
142
  except Exception as e: return {"error": str(e)}
143
 
144
  @app.get("/health")
145
+ def health(): return {"status": "ok", "v": "136"}
146
 
147
  demo = gr.Interface(
148
  fn=lambda x: json.dumps(core_process(json.loads(x))),
149
+ inputs="text", outputs="text", title="πŸš€ AI Engine v136 (Persistent GPU)",
150
+ description="H200 Native | Fast-Whisper + XTTS-v2 | Full VRAM Mode"
151
  ).queue()
152
  app = gr.mount_gradio_app(app, demo, path="/")
153
 
154
  if __name__ == "__main__":
 
155
  uvicorn.run(app, host="0.0.0.0", port=7860, log_level="warning")