TGPro1 committed on
Commit
92f88b0
Β·
verified Β·
1 Parent(s): 2b4125e

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +70 -33
app.py CHANGED
@@ -24,6 +24,24 @@ import soundfile as sf
24
  from faster_whisper import WhisperModel
25
 
26
  # πŸ›‘οΈ 0. INFRASTRUCTURE PURIST (v136)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  os.environ["COQUI_TOS_AGREED"] = "1"
28
  os.environ["PYTHONWARNINGS"] = "ignore"
29
  # Strict CUBLAS stability for H200
@@ -32,33 +50,48 @@ torch.backends.cuda.matmul.allow_tf32 = False
32
  torch.backends.cudnn.allow_tf32 = False
33
  torch.use_deterministic_algorithms(False) # Some kernels might need this, but let's keep it flexible
34
 
35
- import torchaudio
36
- def torchaudio_load_safe(filepath, **kwargs):
37
- data, sr = sf.read(filepath)
38
- if len(data.shape) == 1: tensor = torch.from_numpy(data).float().unsqueeze(0)
39
- else: tensor = torch.from_numpy(data).float().transpose(0, 1)
40
- return tensor, sr
41
- torchaudio.load = torchaudio_load_safe
42
 
43
- # πŸ“¦ 1. GLOBAL MODELS (SINGLETON PATTERN)
44
- MODELS = {"stt": None, "tts": None}
45
 
46
  def load_gpu_models():
 
47
  global MODELS
 
 
48
  if MODELS["stt"] is None:
49
- print("πŸŽ™οΈ Loading Faster-Whisper to GPU (Persistent)...")
50
- MODELS["stt"] = WhisperModel("large-v3-turbo", device="cuda", compute_type="float16")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  if MODELS["tts"] is None:
52
- print("πŸ”Š Loading XTTS-v2 to GPU (Persistent)...")
53
- from TTS.api import TTS
54
- MODELS["tts"] = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
55
 
56
- # πŸ› οΈ 2. CORE PROCESSING (v136: NO PAGING, NO JITTER)
57
  @spaces.GPU(duration=120)
58
  def core_process(request_dict):
59
  global MODELS
60
  action = request_dict.get("action")
61
- print(f"--- [v136] πŸ› οΈ PURIST ENGINE: {action} ---")
62
  t1 = time.time()
63
 
64
  try:
@@ -70,10 +103,12 @@ def core_process(request_dict):
70
  audio_bytes = base64.b64decode(request_dict.get("file"))
71
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
72
  f.write(audio_bytes); temp_path = f.name
 
73
  try:
74
  lang = request_dict.get("lang")
75
- segments, _ = MODELS["stt"].transcribe(temp_path, language=lang if lang and len(lang) <= 3 else None, beam_size=1)
76
- stt_text = "".join([s.text for s in segments]).strip()
 
77
  finally:
78
  if os.path.exists(temp_path): os.unlink(temp_path)
79
 
@@ -82,6 +117,8 @@ def core_process(request_dict):
82
  # πŸ”Š TTS PATH
83
  if action in ["tts", "s2st"]:
84
  text = (request_dict.get("text") if action == "tts" else stt_text).strip()
 
 
85
  if action == "s2st":
86
  from deep_translator import GoogleTranslator
87
  target = request_dict.get("target_lang") or "en"
@@ -123,32 +160,32 @@ def core_process(request_dict):
123
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
124
 
125
  except Exception as e:
126
- print(f"❌ [v136] ERROR: {traceback.format_exc()}")
127
  return {"error": str(e)}
128
  finally:
129
- print(f"--- [v136] ✨ DONE ({time.time()-t1:.1f}s) ---")
130
- torch.cuda.empty_cache() # Keep models in VRAM, but clear temp buffers
131
 
132
- # πŸš€ 3. SERVER SETUP
133
- app = FastAPI()
134
- app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
135
-
136
- @app.post("/api/v1/process")
137
  async def api_process(request: Request):
138
  try:
139
  data = await request.json()
140
- if data.get("action") == "health": return {"status": "awake", "v": "136"}
141
  return core_process(data)
142
  except Exception as e: return {"error": str(e)}
143
 
144
  @app.get("/health")
145
- def health(): return {"status": "ok", "v": "136"}
 
 
 
 
 
 
 
 
 
146
 
147
- demo = gr.Interface(
148
- fn=lambda x: json.dumps(core_process(json.loads(x))),
149
- inputs="text", outputs="text", title="πŸš€ AI Engine v136 (Persistent GPU)",
150
- description="H200 Native | Fast-Whisper + XTTS-v2 | Full VRAM Mode"
151
- ).queue()
152
  app = gr.mount_gradio_app(app, demo, path="/")
153
 
154
  if __name__ == "__main__":
 
24
  from faster_whisper import WhisperModel
25
 
26
  # πŸ›‘οΈ 0. INFRASTRUCTURE PURIST (v136)
27
+ import numpy as np
28
+ import uvicorn
29
+ from fastapi import FastAPI, Request
30
+ from fastapi.middleware.cors import CORSMiddleware
31
+ from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
32
+ from TTS.api import TTS
33
+ import gradio as gr
34
+ import json # Added for gradio interface
35
+
36
+ # ==========================================
37
+ # πŸš€ v137 - HOPPER NATIVE (Transformers + Persistent VRAM)
38
+ # ==========================================
39
+ # Stability Strategy:
40
+ # 1. Revert to 'transformers' pipeline (Native PyTorch kernels for H200).
41
+ # 2. LOAD ONCE, STAY IN VRAM (Singleton Pattern).
42
+ # 3. Force SDPA (Flash Attention) + FP16.
43
+ # 4. Strict GPU-only path inside ZeroGPU context.
44
+
45
  os.environ["COQUI_TOS_AGREED"] = "1"
46
  os.environ["PYTHONWARNINGS"] = "ignore"
47
  # Strict CUBLAS stability for H200
 
50
  torch.backends.cudnn.allow_tf32 = False
51
  torch.use_deterministic_algorithms(False) # Some kernels might need this, but let's keep it flexible
52
 
53
+ app = FastAPI()
54
+ app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 
 
 
 
 
55
 
56
+ MODELS = {"stt": None, "tts": None, "processor": None}
 
57
 
58
def load_gpu_models():
    """Persistent loading into GPU VRAM. Only runs once per worker.

    Populates the module-level ``MODELS`` singleton:
      * ``MODELS["stt"]``       — transformers ASR pipeline around Whisper large-v3-turbo.
      * ``MODELS["processor"]`` — the matching ``AutoProcessor`` (tokenizer + feature extractor).
      * ``MODELS["tts"]``       — Coqui XTTS-v2 moved to CUDA.

    Safe to call repeatedly: each model is loaded at most once per process
    and then kept resident in VRAM.
    """
    global MODELS
    device = "cuda"

    if MODELS["stt"] is None:
        print("--- [v137] πŸ“₯ LOADING NATIVE WHISPER (Large-v3-Turbo) ---")
        model_id = "openai/whisper-large-v3-turbo"
        torch_dtype = torch.float16

        # BUGFIX: attn_implementation must be passed to from_pretrained().
        # The previous code passed it via pipeline(model_kwargs=...), which
        # transformers ignores when a model *instance* (not a checkpoint id)
        # is handed to pipeline() — so SDPA was silently never enabled.
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
            attn_implementation="sdpa",
        ).to(device)
        processor = AutoProcessor.from_pretrained(model_id)
        # Fill the declared-but-previously-unused "processor" slot so other
        # code paths can reach the tokenizer/feature extractor directly.
        MODELS["processor"] = processor

        MODELS["stt"] = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            device=device,
        )
        print("--- [v137] βœ… WHISPER LOADED ---")

    if MODELS["tts"] is None:
        print("--- [v137] πŸ“₯ LOADING XTTS (VRAM STABLE) ---")
        MODELS["tts"] = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
        print("--- [v137] βœ… XTTS LOADED ---")
89
 
 
90
  @spaces.GPU(duration=120)
91
  def core_process(request_dict):
92
  global MODELS
93
  action = request_dict.get("action")
94
+ print(f"--- [v137] πŸ› οΈ HOPPER ENGINE: {action} ---")
95
  t1 = time.time()
96
 
97
  try:
 
103
  audio_bytes = base64.b64decode(request_dict.get("file"))
104
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
105
  f.write(audio_bytes); temp_path = f.name
106
+
107
  try:
108
  lang = request_dict.get("lang")
109
+ # Inference using transformers pipeline
110
+ result = MODELS["stt"](temp_path, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
111
+ stt_text = result["text"].strip()
112
  finally:
113
  if os.path.exists(temp_path): os.unlink(temp_path)
114
 
 
117
  # πŸ”Š TTS PATH
118
  if action in ["tts", "s2st"]:
119
  text = (request_dict.get("text") if action == "tts" else stt_text).strip()
120
+ trans_text = text
121
+
122
  if action == "s2st":
123
  from deep_translator import GoogleTranslator
124
  target = request_dict.get("target_lang") or "en"
 
160
  return {"text": stt_text, "translated": trans_text, "audio": audio_b64}
161
 
162
  except Exception as e:
163
+ print(f"❌ [v137] ERROR: {traceback.format_exc()}")
164
  return {"error": str(e)}
165
  finally:
166
+ print(f"--- [v137] ✨ DONE ({time.time()-t1:.1f}s) ---")
167
+ torch.cuda.empty_cache()
168
 
169
@app.post("/process")
async def api_process(request: Request):
    """Single JSON entry point for the engine.

    Accepts a request dict; a ``{"action": "health"}`` probe is answered
    immediately without touching the GPU path, every other action is
    forwarded to :func:`core_process`.  Any failure — bad JSON included —
    is reported as ``{"error": <message>}`` rather than an HTTP error.
    """
    try:
        payload = await request.json()
        # Lightweight liveness probe — short-circuit before the GPU worker.
        if payload.get("action") == "health":
            return {"status": "awake", "v": "137"}
        return core_process(payload)
    except Exception as e:
        return {"error": str(e)}
176
 
177
@app.get("/health")
def health():
    """Plain HTTP liveness endpoint — no GPU or model access involved."""
    return {"status": "ok", "v": "137"}
179
+
180
# Gradio interface for debugging
with gr.Blocks() as demo:
    gr.Markdown("## v137 HOPPER NATIVE (H200 Stable)")
    with gr.Row():
        audio_in = gr.Audio(type="filepath", label="Input Audio")
        stt_btn = gr.Button("STT")
    txt_out = gr.Textbox(label="STT Result")

    def _debug_stt(path):
        """Encode the uploaded wav as base64 and run the STT action."""
        # BUGFIX: the previous lambda used open(path, "rb").read() without
        # ever closing the handle — a file-descriptor leak on repeated use.
        with open(path, "rb") as fh:
            encoded = base64.b64encode(fh.read()).decode()
        result = core_process({"action": "stt", "file": encoded})
        # Surface backend errors in the textbox instead of raising KeyError
        # when core_process returns {"error": ...} rather than {"text": ...}.
        return result.get("text", result.get("error", ""))

    stt_btn.click(fn=_debug_stt, inputs=audio_in, outputs=txt_out)
188
 
 
 
 
 
 
189
  app = gr.mount_gradio_app(app, demo, path="/")
190
 
191
  if __name__ == "__main__":