Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
try:
|
| 2 |
import spaces
|
| 3 |
except ImportError:
|
|
@@ -27,15 +28,9 @@ from TTS.api import TTS
|
|
| 27 |
# ==========================================
|
| 28 |
# π v137 - HOPPER NATIVE (Transformers + Persistent VRAM)
|
| 29 |
# ==========================================
|
| 30 |
-
# Stability Strategy:
|
| 31 |
-
# 1. Revert to 'transformers' pipeline (Native PyTorch kernels for H200).
|
| 32 |
-
# 2. LOAD ONCE, STAY IN VRAM (Singleton Pattern).
|
| 33 |
-
# 3. Force SDPA (Flash Attention) + FP16.
|
| 34 |
-
# 4. Strict GPU-only path inside ZeroGPU context.
|
| 35 |
|
| 36 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
| 37 |
os.environ["PYTHONWARNINGS"] = "ignore"
|
| 38 |
-
# Strict CUBLAS stability for H200
|
| 39 |
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
|
| 40 |
torch.backends.cuda.matmul.allow_tf32 = False
|
| 41 |
torch.backends.cudnn.allow_tf32 = False
|
|
@@ -53,11 +48,9 @@ def load_gpu_models():
|
|
| 53 |
if MODELS.get("stt") is None:
|
| 54 |
print("--- [v137] π₯ LOADING NATIVE WHISPER (Large-v3-Turbo) ---")
|
| 55 |
model_id = "openai/whisper-large-v3-turbo"
|
| 56 |
-
torch_dtype = torch.float16
|
| 57 |
-
|
| 58 |
# Load model with SDPA (Flash Attention) for H200
|
| 59 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
| 60 |
-
model_id, torch_dtype=
|
| 61 |
).to(device)
|
| 62 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 63 |
|
|
@@ -66,7 +59,7 @@ def load_gpu_models():
|
|
| 66 |
model=model,
|
| 67 |
tokenizer=processor.tokenizer,
|
| 68 |
feature_extractor=processor.feature_extractor,
|
| 69 |
-
torch_dtype=
|
| 70 |
device=device,
|
| 71 |
model_kwargs={"attn_implementation": "sdpa"}
|
| 72 |
)
|
|
@@ -85,7 +78,6 @@ def core_process(request_dict):
|
|
| 85 |
t1 = time.time()
|
| 86 |
|
| 87 |
try:
|
| 88 |
-
# Load once and keep in VRAM within the worker life
|
| 89 |
load_gpu_models()
|
| 90 |
|
| 91 |
# ποΈ STT PATH
|
|
@@ -96,7 +88,6 @@ def core_process(request_dict):
|
|
| 96 |
|
| 97 |
try:
|
| 98 |
lang = request_dict.get("lang")
|
| 99 |
-
# Inference using transformers pipeline
|
| 100 |
result = MODELS["stt"](temp_path, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
|
| 101 |
stt_text = result["text"].strip()
|
| 102 |
finally:
|
|
@@ -167,7 +158,14 @@ async def api_process(request: Request):
|
|
| 167 |
@app.get("/health")
|
| 168 |
def health(): return {"status": "ok", "v": "137"}
|
| 169 |
|
| 170 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
with gr.Blocks() as demo:
|
| 172 |
gr.Markdown("# π v137 HOPPER NATIVE (H200 Stable)")
|
| 173 |
gr.Markdown("Direct GPU path | Transformers Whisper | XTTS-v2 Singleton")
|
|
@@ -175,9 +173,11 @@ with gr.Blocks() as demo:
|
|
| 175 |
audio_in = gr.Audio(type="filepath", label="Input Audio")
|
| 176 |
stt_btn = gr.Button("STT")
|
| 177 |
txt_out = gr.Textbox(label="STT Result")
|
| 178 |
-
stt_btn.click(fn=
|
| 179 |
|
|
|
|
| 180 |
app = gr.mount_gradio_app(app, demo, path="/")
|
| 181 |
|
| 182 |
if __name__ == "__main__":
|
| 183 |
-
|
|
|
|
|
|
| 1 |
+
print("--- [v137-clean] π BOOTING APP.PY ---")
|
| 2 |
try:
|
| 3 |
import spaces
|
| 4 |
except ImportError:
|
|
|
|
| 28 |
# ==========================================
|
| 29 |
# π v137 - HOPPER NATIVE (Transformers + Persistent VRAM)
|
| 30 |
# ==========================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
| 33 |
os.environ["PYTHONWARNINGS"] = "ignore"
|
|
|
|
| 34 |
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
|
| 35 |
torch.backends.cuda.matmul.allow_tf32 = False
|
| 36 |
torch.backends.cudnn.allow_tf32 = False
|
|
|
|
| 48 |
if MODELS.get("stt") is None:
|
| 49 |
print("--- [v137] π₯ LOADING NATIVE WHISPER (Large-v3-Turbo) ---")
|
| 50 |
model_id = "openai/whisper-large-v3-turbo"
|
|
|
|
|
|
|
| 51 |
# Load model with SDPA (Flash Attention) for H200
|
| 52 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
| 53 |
+
model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True
|
| 54 |
).to(device)
|
| 55 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 56 |
|
|
|
|
| 59 |
model=model,
|
| 60 |
tokenizer=processor.tokenizer,
|
| 61 |
feature_extractor=processor.feature_extractor,
|
| 62 |
+
torch_dtype=torch.float16,
|
| 63 |
device=device,
|
| 64 |
model_kwargs={"attn_implementation": "sdpa"}
|
| 65 |
)
|
|
|
|
| 78 |
t1 = time.time()
|
| 79 |
|
| 80 |
try:
|
|
|
|
| 81 |
load_gpu_models()
|
| 82 |
|
| 83 |
# ποΈ STT PATH
|
|
|
|
| 88 |
|
| 89 |
try:
|
| 90 |
lang = request_dict.get("lang")
|
|
|
|
| 91 |
result = MODELS["stt"](temp_path, generate_kwargs={"language": lang if lang and len(lang) <= 3 else None})
|
| 92 |
stt_text = result["text"].strip()
|
| 93 |
finally:
|
|
|
|
| 158 |
@app.get("/health")
|
| 159 |
def health(): return {"status": "ok", "v": "137"}
|
| 160 |
|
| 161 |
+
# Named function for Gradio to avoid lambda schema issues
|
| 162 |
+
def gradio_stt(audio_path):
|
| 163 |
+
if not audio_path: return ""
|
| 164 |
+
with open(audio_path, "rb") as f:
|
| 165 |
+
b64 = base64.b64encode(f.read()).decode()
|
| 166 |
+
res = core_process({"action": "stt", "file": b64})
|
| 167 |
+
return res.get("text", f"Error: {res.get('error')}")
|
| 168 |
+
|
| 169 |
with gr.Blocks() as demo:
|
| 170 |
gr.Markdown("# π v137 HOPPER NATIVE (H200 Stable)")
|
| 171 |
gr.Markdown("Direct GPU path | Transformers Whisper | XTTS-v2 Singleton")
|
|
|
|
| 173 |
audio_in = gr.Audio(type="filepath", label="Input Audio")
|
| 174 |
stt_btn = gr.Button("STT")
|
| 175 |
txt_out = gr.Textbox(label="STT Result")
|
| 176 |
+
stt_btn.click(fn=gradio_stt, inputs=audio_in, outputs=txt_out)
|
| 177 |
|
| 178 |
+
print("--- [v137-clean] π§ MOUNTING GRADIO ---")
|
| 179 |
app = gr.mount_gradio_app(app, demo, path="/")
|
| 180 |
|
| 181 |
if __name__ == "__main__":
|
| 182 |
+
print("--- [v137-clean] π‘ STARTING UVICORN ---")
|
| 183 |
+
uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info")
|