Spaces:

palli23
/

ASR_API

Running on Zero

App Files Files Community

palli23 committed on Dec 5, 2025

Commit

e4aa950

verified ·

1 Parent(s): 2b95269

Update app.py

Browse files

Whisperx + diarization test

Files changed (1) hide show

app.py +86 -56

app.py CHANGED Viewed

@@ -1,70 +1,100 @@
-# app.py — Íslenskt ASR – 3 mínútur (public, no login, with contact)
 import os
 os.environ["OMP_NUM_THREADS"] = "1"
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
 import gradio as gr
 import spaces
 from transformers import pipeline
-# ——————————————————————————————
-# Model loaded ONCE at startup (global)
-# ——————————————————————————————
-MODEL_NAME = "palli23/whisper-small-sam_spjall"
-@spaces.GPU(duration=180)
-def get_pipe():
-    return pipeline(
-        "automatic-speech-recognition",
-        model=MODEL_NAME,
-        torch_dtype="float16",
-        device=0,
-        token=os.getenv("HF_TOKEN"),
-    )
-pipe = get_pipe()
-# ——————————————————————————————
-# Transcription function
-# ——————————————————————————————
-def transcribe_3min(audio_path):
-    if not audio_path:
-        return "Hladdu upp hljóðskrá"
-    result = pipe(
-        audio_path,
-        chunk_length_s=30,
-        stride_length_s=(6, 0),
-        batch_size=8,
-        return_timestamps=False,
-    )
-    return result["text"]
-# ——————————————————————————————
-# UI — only added your email, nothing else changed
-# ——————————————————————————————
-with gr.Blocks() as demo:  # ← removed 'theme=' (was causing error)
-    gr.Markdown("# Íslenskt ASR – 3 mínútur")
-    gr.Markdown("**Whisper small· mjög lágur WER á prófunarupptökum · allt að 5 mín hljóð**")
-    gr.Markdown("**Hafa samband:** pallinr1@protonmail.com")
-    audio_in = gr.Audio(
-        type="filepath",
-        label="Hladdu upp .mp3 / .wav (max 5 mín)"
-    )
-    btn = gr.Button("Transcribe", variant="primary", size="lg")
-    output = gr.Textbox(lines=30, label="Útskrift")
-    btn.click(fn=transcribe_3min, inputs=audio_in, outputs=output)
-# ——————————————————————————————
-# PUBLIC — NO LOGIN, NO PASSWORD
-# ——————————————————————————————
-demo.launch(
-    auth=None,                    # ← No login
-    share=True,                   # ← Public
-    server_name="0.0.0.0",
-    server_port=7860,
-    show_error=True,
-    quiet=False
-)

+# app.py — Whisper-small + WhisperX Diarization + Timestamps
+# Public, no login, contact email
 import os
 os.environ["OMP_NUM_THREADS"] = "1"
 import gradio as gr
 import spaces
+import whisperx
 from transformers import pipeline
+import torch
+# Keep Space awake
+import threading, time, requests
+def keep_awake():
+    """Ping this Space's own URL every 45 minutes to keep it awake.
+
+    Loops forever, so it is meant to run on a daemon thread. Pinging is
+    best-effort: a failed request is reported and the loop carries on.
+    """
+    while True:
+        time.sleep(45 * 60)
+        host = os.getenv("SPACE_HOST")
+        if not host:
+            # Without SPACE_HOST the URL would be "https://None"; skip the ping.
+            continue
+        try:
+            # A timeout keeps a hung connection from stalling the loop.
+            requests.get(f"https://{host}", timeout=30)
+        except requests.RequestException as exc:
+            # Only network/HTTP errors are swallowed; anything else surfaces.
+            print(f"keep_awake ping failed: {exc}")
+threading.Thread(target=keep_awake, daemon=True).start()
+# Load the fine-tuned Whisper-small ASR pipeline.
+# NOTE(review): `asr` is not referenced anywhere else in the visible file;
+# transcription below goes through the WhisperX "base" model instead.
+# Confirm which model is meant to produce the transcript.
+asr = pipeline(
+    "automatic-speech-recognition",
+    model="palli23/whisper-small-sam_spjall",
+    torch_dtype="float16",
+    device=0,
+    chunk_length_s=30,
+    batch_size=8,
+)
+# WhisperX setup (diarization + timestamps)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+batch_size = 16
+compute_type = "float16"
+# Load WhisperX model
+model = whisperx.load_model("base", device, compute_type=compute_type)
+# Load diarization model.
+# The speaker-count bounds are arguments of the pipeline *call*, not of its
+# constructor, so the pipeline is built without them and a thin wrapper
+# applies them on every invocation.
+_diarization_pipeline = whisperx.DiarizationPipeline(
+    use_auth_token=True,
+    device=device,
+)
+def diarize_model(audio):
+    """Run speaker diarization on *audio*, constrained to 2-5 speakers."""
+    return _diarization_pipeline(audio, min_speakers=2, max_speakers=5)
+def transcribe_with_whisperx(audio_path, use_diarization=False):
+    """Transcribe an audio file and return one timestamped line per segment.
+
+    Args:
+        audio_path: Path of the audio file; a falsy value returns a prompt
+            asking the user to upload a file.
+        use_diarization: When truthy, speakers are assigned to the aligned
+            segments and each line is prefixed with ``[speaker]``.
+
+    Returns:
+        The formatted segments joined with newlines.
+    """
+    if not audio_path:
+        return "Hladdu upp hljóðskrá"
+
+    waveform = whisperx.load_audio(audio_path)
+
+    # First pass: plain transcription with the WhisperX model.
+    transcript = model.transcribe(waveform, batch_size=batch_size)
+
+    # Second pass: align against the detected language to refine timestamps.
+    align_model, align_meta = whisperx.load_align_model(
+        language_code=transcript["language"], device=device
+    )
+    transcript = whisperx.align(
+        transcript["segments"],
+        align_model,
+        align_meta,
+        waveform,
+        device,
+        return_char_alignments=False,
+    )
+
+    # Optional third pass: attach speaker labels to the aligned segments.
+    if use_diarization:
+        speaker_turns = diarize_model(waveform)
+        transcript = whisperx.assign_word_speakers(speaker_turns, transcript)
+
+    rows = []
+    for seg in transcript["segments"]:
+        # Segments without a speaker label fall back to "Unknown".
+        prefix = f"[{seg.get('speaker', 'Unknown')}] " if use_diarization else ""
+        rows.append(f"{prefix}{seg['start']:.1f}s – {seg['end']:.1f}s: {seg['text']}")
+    return "\n".join(rows)
+# UI
+# Single-page Gradio app: an audio upload, a diarization toggle, a button and
+# a text box that receives the transcript.
+with gr.Blocks(title="Íslensk talgreining + WhisperX") as demo:
+    gr.Markdown("# Íslensk talgreining + WhisperX")
+    gr.Markdown("**Whisper-small + diarization + timestamps • pallinr1@protonmail.com**")
+    # type="filepath" hands the handler a path on disk rather than raw samples.
+    audio = gr.Audio(type="filepath", label="Hladdu upp hljóð (max 15 mín)")
+    # Diarization is switched on by default.
+    diarize = gr.Checkbox(label="Virkja diarization (speakers + timestamps)", value=True)
+    btn = gr.Button("Transcribe", variant="primary")
+    out = gr.Textbox(lines=25, label="Útskrift")
+    # The handler gets (audio path, checkbox state) and its return value fills `out`.
+    btn.click(transcribe_with_whisperx, inputs=[audio, diarize], outputs=out)
+# No login is configured (auth=None); share=True requests a public share link.
+demo.launch(auth=None, share=True)