ASR_API2

Sleeping

App Files Files Community

palli23 commited on Dec 3, 2025

Commit

cc6ae2a

1 Parent(s): 3785c6a

fix transcribe bug

Browse files

Files changed (2) hide show

app.py +41 -68
requirements.txt +6 -5

app.py CHANGED Viewed

@@ -1,80 +1,53 @@
 import os
 import gradio as gr
 import spaces
-import torch
 from transformers import pipeline
 MODEL_NAME = "palli23/whisper-small-sam_spjall"
-print("Loading optimized Whisper Small for T4...")
-# Load once + T4-specific optimizations
-pipe = pipeline(
-    "automatic-speech-recognition",
-    model=MODEL_NAME,
-    torch_dtype=torch.float16,  # FP16 = 2x faster, <4GB VRAM on T4
-    device="cuda",
-    model_kwargs={
-        "attn_implementation": "flash_attention_2",  # 20–30% faster attention
-        "use_cache": True,
-    },
-    token=os.getenv("HF_TOKEN")
-)
-# Pre-set Icelandic for no detection overhead
-pipe.model.generation_config.language = "is"
-pipe.model.generation_config.task = "transcribe"
-print(f"Model ready! VRAM used: {torch.cuda.memory_allocated() / 1e9:.1f}GB")
-@spaces.GPU  # No duration—let T4 run free
-def transcribe(audio_path):
     if not audio_path:
-        return "Upload audio first"
-    try:
-        # Clear cache to prevent OOM aborts
-        torch.cuda.empty_cache()
-        result = pipe(
-            audio_path,
-            chunk_length_s=15,  # Shorter = faster on T4 (less recompute)
-            batch_size=32,      # Max for T4's 16GB VRAM
-            stride_length_s=(3, 1),  # Minimal overlap = speed win
-            return_timestamps=False,
-            generate_kwargs={
-                "do_sample": False,  # Deterministic, faster
-                "num_beams": 1,      # No beam search = 2x faster
-            }
-        )
-        text = result["text"].strip()
-        # Post-clear to free VRAM
-        torch.cuda.empty_cache()
-        return f"✅ Done in {torch.cuda.max_memory_allocated() / 1e9:.1f}GB VRAM\n\n{text}"
-    except RuntimeError as e:
-        if "out of memory" in str(e):
-            return "❌ OOM error—try shorter audio (<3min). VRAM spiked too high."
-        raise gr.Error(f"GPU task failed: {str(e)}")  # Catch & re-raise as Gradio error
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# Icelandic Whisper Small – T4 Optimized (No Aborts)")
-    gr.Markdown("Upload <5min audio → Expect **10–20s** (monitors VRAM to prevent kills)")
-    audio = gr.Audio(type="filepath", label="Audio (mp3/wav, <5min for best speed)")
-    btn = gr.Button("Transcribe", variant="primary")
-    # Add VRAM status for debugging
-    status = gr.Markdown("VRAM: Ready")
-    out = gr.Textbox(label="Transcription", lines=25, show_copy_button=True)
-    def update_status():
-        vram = torch.cuda.memory_allocated() / 1e9
-        return f"VRAM: {vram:.1f}GB used"
-    btn.click(transcribe, audio, out).then(update_status, outputs=status)
-demo.launch(auth=("beta", "beta2025"), max_threads=4)  # Queue for concurrency

+# app.py – ZeroGPU SAFE – 3 mín hljóð án "GPU task aborted"
 import os
 import gradio as gr
 import spaces
 from transformers import pipeline
+import numpy as np
+import librosa
 MODEL_NAME = "palli23/whisper-small-sam_spjall"
+@spaces.GPU(duration=60)   # ← MEST 60 sek – ZeroGPU leyfir
+def transcribe_safe(audio_path):
     if not audio_path:
+        return "Hladdu upp hljóðskrá"
+    # Hlaða hljóð og klippa í 20 sek chunkar (mjög öruggt)
+    audio, sr = librosa.load(audio_path, sr=16000)
+    chunk_len = 16000 * 20   # 20 sek
+    stride = 16000 * 2       # 2 sek overlap
+    chunks = []
+    for i in range(0, len(audio), chunk_len - stride):
+        chunk = audio[i:i + chunk_len]
+        if len(chunk) < 16000:  # undir 1 sek → hætta
+            break
+        chunks.append(chunk)
+    # Hlaða ASR á GPU (cached)
+    pipe = pipeline(
+        "automatic-speech-recognition",
+        model=MODEL_NAME,
+        device=0,
+        token=os.getenv("HF_TOKEN")
+    )
+    full_text = ""
+    for idx, chunk in enumerate(chunks):
+        result = pipe(chunk, batch_size=8)
+        full_text += result["text"] + " "
+    return full_text.strip() or "Ekkert heyrt"
+# Gradio – fallegt og tilbúið fyrir 3 mín
+with gr.Blocks(title="Íslenskt ASR – 3 mín ZeroGPU") as demo:
+    gr.Markdown("# Íslenskt ASR – 3 mín hljóð")
+    gr.Markdown("**~4 % WER · 25–45 sek · ZeroGPU (PRO)**")
+    audio = gr.Audio(type="filepath", label="Hladdu upp .mp3 / .wav (allt að 3 mín)")
+    btn = gr.Button("Transcribe (25–45 sek)", variant="primary", size="lg")
+    out = gr.Textbox(lines=30, label="Útskrift")
+    btn.click(transcribe_safe, inputs=audio, outputs=out)
+demo.launch(auth=("beta", "beta2025"))

requirements.txt CHANGED Viewed

@@ -1,5 +1,6 @@
-gradio>=4.44
-transformers>=4.45
-torch>=2.4
-accelerate
-spaces

+gradio
+transformers
+torch
+spaces
+librosa
+soundfile