palli23 commited on
Commit
1f8d8c7
·
verified ·
1 Parent(s): a7eba16

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -40
app.py CHANGED
@@ -1,7 +1,9 @@
1
- # app.py — Íslenskt ASR – ZeroGPU Fixed (no CUDA init at startup, Dec 2025)
2
  import os
3
  os.environ["OMP_NUM_THREADS"] = "1"
4
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "garbage_collection_threshold:0.6,max_split_size_mb:128"
 
 
5
 
6
  import gradio as gr
7
  import spaces
@@ -10,47 +12,37 @@ import torch
10
  import gc
11
 
12
  MODEL_NAME = "palli23/whisper-small-sam_spjall"
13
- pipe = None # Global pipeline – loaded ONLY inside @spaces.GPU
14
 
15
  @spaces.GPU(duration=180, max_batch_size=4)
16
- def get_or_refresh_pipeline():
17
- global pipe
 
 
 
 
 
18
 
19
- # Check if pipeline is broken (now safe inside GPU worker)
20
- if pipe is not None:
21
- try:
22
- _ = pipe.model.device # Quick health check
23
- except Exception:
24
- print("GPU context lost → rebuilding pipeline...")
25
- pipe = None
26
- gc.collect()
27
- if torch.cuda.is_available():
28
- torch.cuda.empty_cache()
29
-
30
- if pipe is None:
31
- print("Loading Whisper model (cold start ~15-25s)...")
32
  pipe = pipeline(
33
  "automatic-speech-recognition",
34
  model=MODEL_NAME,
35
  torch_dtype=torch.float16,
36
- device=0, # CUDA init happens HERE, inside GPU worker
37
  token=os.getenv("HF_TOKEN"),
38
  )
39
- if torch.cuda.is_available():
40
- torch.cuda.empty_cache()
41
-
42
- return pipe
43
-
44
- def transcribe_3min(audio_path):
45
- if not audio_path:
46
- return "Hlaðið upp hljóðskrá (mp3/wav, max 5 mín)"
47
-
48
- global pipe # Safe now, since no CUDA at function level
49
-
50
- try:
51
- current_pipe = get_or_refresh_pipeline() # This triggers GPU context
52
 
53
- result = current_pipe(
 
 
 
 
 
 
 
 
 
54
  audio_path,
55
  chunk_length_s=30,
56
  stride_length_s=(6, 0),
@@ -61,10 +53,12 @@ def transcribe_3min(audio_path):
61
 
62
  text = result["text"].strip()
63
 
64
- # Clean up chunks if present
65
  if "chunks" in result:
66
  del result["chunks"]
67
 
 
 
68
  gc.collect()
69
  if torch.cuda.is_available():
70
  torch.cuda.empty_cache()
@@ -72,8 +66,6 @@ def transcribe_3min(audio_path):
72
  return text if text else "(ekkert tal greint)"
73
 
74
  except torch.cuda.OutOfMemoryError:
75
- print("OOM detected → forcing full pipeline reload")
76
- pipe = None
77
  gc.collect()
78
  if torch.cuda.is_available():
79
  torch.cuda.empty_cache()
@@ -87,7 +79,7 @@ with gr.Blocks(title="Íslenskt ASR") as demo:
87
  gr.Markdown("# Íslenskt ASR – 3–5 mín hljóð")
88
  gr.Markdown("**Whisper-small fínstillt á íslensku spjalli · mjög lágur WER**")
89
  gr.Markdown("**Hafa samband:** pallinr1@protonmail.com")
90
- gr.Markdown("> Keyrt á **ZeroGPU** – fyrsta ræsing tekur 15–30 sek, síðan hröð")
91
 
92
  audio_in = gr.Audio(
93
  type="filepath",
@@ -97,13 +89,14 @@ with gr.Blocks(title="Íslenskt ASR") as demo:
97
  btn = gr.Button("Umrita", variant="primary", size="lg")
98
  output = gr.Textbox(lines=25, label="Texti")
99
 
100
- btn.click(fn=transcribe_3min, inputs=audio_in, outputs=output)
 
101
 
102
  gr.Markdown("""
103
  ### Leiðbeiningar
104
- - Fyrsta umritunin tekur lengur (model hleðst inn á GPU)
105
- Eftir það: 5–15 sek fyrir 3 mín hljóð
106
- - Ef þú færð minnisvillu → bíddu öðruhvolf og prófaðu aftur
107
  """)
108
 
109
  # ————————————————————— Launch —————————————————————
 
1
+ # app.py — Íslenskt ASR – ZeroGPU Fully Stateless Fix (Dec 2025)
2
  import os
3
  os.environ["OMP_NUM_THREADS"] = "1"
4
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "garbage_collection_threshold:0.6,max_split_size_mb:128"
5
+ # Force CPU-only at import to prevent any lazy CUDA init
6
+ os.environ["CUDA_VISIBLE_DEVICES"] = ""
7
 
8
  import gradio as gr
9
  import spaces
 
12
  import gc
13
 
14
  MODEL_NAME = "palli23/whisper-small-sam_spjall"
 
15
 
16
  @spaces.GPU(duration=180, max_batch_size=4)
17
+ def transcribe_3min_gpu(audio_path):
18
+ """
19
+ FULLY SELF-CONTAINED GPU FUNCTION – no globals, no prior CUDA touches.
20
+ Loads model fresh on CPU first, then moves to GPU INSIDE worker.
21
+ """
22
+ if not audio_path:
23
+ return "Hlaðið upp hljóðskrá (mp3/wav, max 5 mín)"
24
 
25
+ try:
26
+ print("Loading Whisper model on CPU first (safe init)...")
27
+ # Load on CPU explicitly to avoid any CUDA during model download/init
 
 
 
 
 
 
 
 
 
 
28
  pipe = pipeline(
29
  "automatic-speech-recognition",
30
  model=MODEL_NAME,
31
  torch_dtype=torch.float16,
32
+ device="cpu", # KEY FIX: CPU first, no CUDA yet
33
  token=os.getenv("HF_TOKEN"),
34
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
+ # Now move to GPU – this happens INSIDE @spaces.GPU worker, safe!
37
+ print("Moving model to GPU...")
38
+ pipe.model = pipe.model.to("cuda")
39
+ pipe.device = "cuda"
40
+ if hasattr(pipe, 'model_decoder'):
41
+ pipe.model_decoder = pipe.model_decoder.to("cuda")
42
+
43
+ # Run inference
44
+ print("Running transcription...")
45
+ result = pipe(
46
  audio_path,
47
  chunk_length_s=30,
48
  stride_length_s=(6, 0),
 
53
 
54
  text = result["text"].strip()
55
 
56
+ # Cleanup chunks
57
  if "chunks" in result:
58
  del result["chunks"]
59
 
60
+ # Aggressive cleanup BEFORE returning
61
+ del pipe
62
  gc.collect()
63
  if torch.cuda.is_available():
64
  torch.cuda.empty_cache()
 
66
  return text if text else "(ekkert tal greint)"
67
 
68
  except torch.cuda.OutOfMemoryError:
 
 
69
  gc.collect()
70
  if torch.cuda.is_available():
71
  torch.cuda.empty_cache()
 
79
  gr.Markdown("# Íslenskt ASR – 3–5 mín hljóð")
80
  gr.Markdown("**Whisper-small fínstillt á íslensku spjalli · mjög lágur WER**")
81
  gr.Markdown("**Hafa samband:** pallinr1@protonmail.com")
82
+ gr.Markdown("> Keyrt á **ZeroGPU** – hver umritun hleðst nýtt (15–30 sek), en örugg og stöðug")
83
 
84
  audio_in = gr.Audio(
85
  type="filepath",
 
89
  btn = gr.Button("Umrita", variant="primary", size="lg")
90
  output = gr.Textbox(lines=25, label="Texti")
91
 
92
+ # Use the GPU-decorated function directly
93
+ btn.click(fn=transcribe_3min_gpu, inputs=audio_in, outputs=output)
94
 
95
  gr.Markdown("""
96
  ### Leiðbeiningar
97
+ - Hver umritun hleðst módelinu nýtt á GPU (ZeroGPU regla)
98
+ - Tími: 15–30 sek (lengur en á venjulegu GPU, en lifir endalaust)
99
+ - Ef villa kemur → bíddu 10 sek og prófaðu aftur
100
  """)
101
 
102
  # ————————————————————— Launch —————————————————————