Spaces:

aidn
/

yapper

Running on Zero

App Files Files Community

aidn committed 21 days ago

Commit

dd3c0dd

verified ·

1 Parent(s): ed8b8a1

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -41

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import tempfile
 import numpy as np
 import soundfile as sf
 import torch
-import spaces                          # ← ZeroGPU: muss importiert werden
 import gradio as gr
 from transformers import pipeline as hf_pipeline
@@ -14,28 +14,32 @@ HF_TOKEN = os.environ.get("HF_TOKEN", "")
 ASR_MODELS = {
     "whisper-small  (gut, schnell)": "openai/whisper-small",
     "whisper-large-v3 (beste Qualität)": "openai/whisper-large-v3",
-    "distil-whisper-large-v3 (empfohlen: Qualität+Speed)": "distil-whisper/distil-large-v3",
 }
 _asr_cache: dict = {}
 _diar_pipe = None
-def get_asr(model_key: str):
     model_id = ASR_MODELS[model_key]
     if model_id not in _asr_cache:
         _asr_cache[model_id] = hf_pipeline(
             "automatic-speech-recognition",
             model=model_id,
-            device="cuda",             # ← ZeroGPU: cuda statt cpu
-            torch_dtype=torch.float16, # ← ZeroGPU: float16 statt float32
-            chunk_length_s=30,
             return_timestamps=True,
         )
     return _asr_cache[model_id]
-def get_diar():
     global _diar_pipe
     if _diar_pipe is None:
         if not HF_TOKEN:
@@ -48,11 +52,26 @@ def get_diar():
         _diar_pipe = PyannotePipeline.from_pretrained(
             "pyannote/speaker-diarization-3.1",
             use_auth_token=HF_TOKEN,
-        ).to(torch.device("cuda"))     # ← ZeroGPU: auf GPU verschieben
     return _diar_pipe
-# ── Hilfsfunktionen ────────────────────────────────────────────────────────────
 def merge_with_speakers(chunks: list, diarization) -> list[tuple]:
     merged = []
@@ -94,60 +113,76 @@ def format_diarized(segments: list[tuple]) -> str:
     return "\n\n".join(lines)
-# ── Haupt-Pipeline (mit @spaces.GPU dekoriert) ────────────────────────────────
-# duration=300 = max. 5 Minuten GPU-Zeit pro Call.
-# Passe den Wert an deine längsten Meetings an (300s reicht für ~30 min Audio).
-@spaces.GPU(duration=300)              # ← ZeroGPU: Pflicht-Decorator
-def run_pipeline(tmp_path: str, model_key: str, use_diar: bool):
-    """Läuft komplett auf der GPU. Wird von transcribe() aufgerufen."""
-    asr = get_asr(model_key)
-    result = asr(tmp_path)
     raw_transcript = result["text"].strip()
-    chunks = result.get("chunks", [])
     if not use_diar:
         return raw_transcript, ""
     try:
-        diar = get_diar()
         diarization = diar(tmp_path)
-        segments = merge_with_speakers(chunks, diarization)
-        labeled = format_diarized(segments)
         return raw_transcript, labeled or "(Keine Sprecher erkannt.)"
     except EnvironmentError as e:
         return raw_transcript, f"⚠️ {e}"
     except Exception as e:
         return raw_transcript, f"⚠️ Diarisierung fehlgeschlagen: {e}"
 def transcribe(audio, model_key: str, use_diar: bool):
-    """UI-Handler: Audio vorbereiten, GPU-Funktion aufrufen."""
     if audio is None:
         yield "⚠️ Kein Audio eingegeben.", ""
         return
     sample_rate, audio_data = audio
     if audio_data.ndim > 1:
         audio_data = audio_data.mean(axis=1)
     audio_data = audio_data.astype(np.float32)
     if audio_data.max() > 1.0:
         audio_data /= 32768.0
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-        tmp_path = f.name
-        sf.write(tmp_path, audio_data, sample_rate)
-    yield "⏳ GPU wird angefordert und Pipeline gestartet...", ""
-    try:
-        transcript, labeled = run_pipeline(tmp_path, model_key, use_diar)
-        yield transcript, labeled
-    finally:
-        os.unlink(tmp_path)
-# ── UI ─────────────────────────────────────────────────────────────────────────
 TOKEN_WARNING = (
     "> ⚠️ **Kein `HF_TOKEN` gefunden.**  \n"
@@ -157,12 +192,11 @@ TOKEN_WARNING = (
     "[hf.co/pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)."
 )
-with gr.Blocks(title="Meeting Transcriber (ZeroGPU)") as demo:
     gr.Markdown("# 🎙️ YAPPER · ZeroGPU Edition")
     gr.Markdown(
-        "## Dein Teams Meeting Begleiter des Vertrauens.  \n"
-        "Lade eine Audiodatei hoch **oder** nimm direkt über das Mikrofon auf.  \n"
-        "Läuft auf NVIDIA H200 via ZeroGPU."
     )
     if not HF_TOKEN:
@@ -177,7 +211,7 @@ with gr.Blocks(title="Meeting Transcriber (ZeroGPU)") as demo:
             )
             model_dd = gr.Dropdown(
                 choices=list(ASR_MODELS.keys()),
-                value="distil-whisper-large-v3 (empfohlen: Qualität+Speed)",
                 label="Transkriptionsmodell",
             )
             diar_cb = gr.Checkbox(
@@ -201,9 +235,8 @@ with gr.Blocks(title="Meeting Transcriber (ZeroGPU)") as demo:
     gr.Markdown(
         "---\n"
         "**Hinweise:**  \n"
-        "• ZeroGPU-Quota: PRO-User haben 1.500 Sek/Tag (~50 kurze Meetings).  \n"
-        "• Max. 5 Minuten GPU-Zeit pro Transkription (`duration=300`).  \n"
-        "• Für pyannote musst du die Lizenzbedingungen auf Hugging Face akzeptiert haben."
     )
     run_btn.click(

 import numpy as np
 import soundfile as sf
 import torch
+import spaces
 import gradio as gr
 from transformers import pipeline as hf_pipeline
 ASR_MODELS = {
     "whisper-small  (gut, schnell)": "openai/whisper-small",
     "whisper-large-v3 (beste Qualität)": "openai/whisper-large-v3",
+    "distil-whisper-large-v3 (empfohlen)": "distil-whisper/distil-large-v3",
 }
 _asr_cache: dict = {}
 _diar_pipe = None
+WHISPER_SR = 16_000  # Whisper erwartet immer 16 kHz
+# ── Model Loading ──────────────────────────────────────────────────────────────
def get_asr(model_key: str, device: str, dtype: torch.dtype):
    """Return the speech-recognition pipeline for ``model_key``, building it once.

    Pipelines are cached in the module-level ``_asr_cache`` under their model
    id. The cache key does not include ``device`` or ``dtype``, so those two
    arguments only take effect the first time a given model is loaded.

    Args:
        model_key: A key of ``ASR_MODELS`` (the label shown in the UI dropdown).
        device: Device passed to the pipeline constructor.
        dtype: Torch dtype passed to the pipeline constructor.

    Returns:
        The cached (or freshly created) ASR pipeline object.

    Raises:
        KeyError: If ``model_key`` is not present in ``ASR_MODELS``.
    """
    checkpoint = ASR_MODELS[model_key]
    try:
        return _asr_cache[checkpoint]
    except KeyError:
        pass

    # chunk_length_s is deliberately omitted: callers pass an in-memory
    # array, not a file path.
    asr_pipe = hf_pipeline(
        "automatic-speech-recognition",
        model=checkpoint,
        device=device,
        dtype=dtype,
        return_timestamps=True,
    )
    _asr_cache[checkpoint] = asr_pipe
    return asr_pipe
+def get_diar(device: str):
     global _diar_pipe
     if _diar_pipe is None:
         if not HF_TOKEN:
         _diar_pipe = PyannotePipeline.from_pretrained(
             "pyannote/speaker-diarization-3.1",
             use_auth_token=HF_TOKEN,
+        )
+        if device == "cuda":
+            _diar_pipe = _diar_pipe.to(torch.device("cuda"))
     return _diar_pipe
+# ── Hilfsfunktionen ───────────────────────────────────────────────────────────
def resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample a signal by linear interpolation (no librosa dependency).

    Args:
        audio: Samples to resample. They are handed to ``np.interp``, so a
            1-D array is expected.
        orig_sr: Sample rate of ``audio`` in Hz.
        target_sr: Desired sample rate in Hz.

    Returns:
        ``audio`` itself (same object, not copied or cast) when both rates
        are equal. Otherwise a new ``float32`` array holding
        ``len(audio) * target_sr // orig_sr`` samples; an empty input yields
        an empty ``float32`` array.

    Note:
        No low-pass filter is applied before interpolation, so downsampling
        can introduce aliasing.
    """
    if orig_sr == target_sr:
        return audio

    n_in = len(audio)
    if n_in == 0:
        # np.interp raises ValueError on an empty sample grid.
        return np.zeros(0, dtype=np.float32)

    # Integer arithmetic keeps the output length exact; multiplying by a
    # float ratio and truncating can come out one sample short.
    n_out = n_in * int(target_sr) // int(orig_sr)
    positions = np.linspace(0, n_in - 1, n_out)
    return np.interp(positions, np.arange(n_in), audio).astype(np.float32)
 def merge_with_speakers(chunks: list, diarization) -> list[tuple]:
     merged = []
     return "\n\n".join(lines)
+# ── Haupt-Pipeline ─────────────────────────────────────────────────────────────
@spaces.GPU(duration=300)
def run_pipeline(
    audio_array: np.ndarray,
    sample_rate: int,
    model_key: str,
    use_diar: bool,
):
    """Transcribe ``audio_array`` and optionally label the speakers.

    The waveform is resampled to ``WHISPER_SR`` and handed to the ASR
    pipeline as an in-memory array. When ``use_diar`` is set, the original
    (un-resampled) audio is written to a temporary WAV file for the
    diarization pipeline; the file is removed again in every case.

    Args:
        audio_array: Mono waveform as prepared by the caller.
        sample_rate: Sample rate of ``audio_array`` in Hz.
        model_key: Key of ``ASR_MODELS`` selecting the ASR model.
        use_diar: Whether to run speaker diarization after transcription.

    Returns:
        A ``(transcript, labeled)`` tuple of strings. ``labeled`` is ``""``
        when diarization is disabled, and a warning message when
        diarization fails. Errors raised by the ASR step are not caught here.
    """
    if torch.cuda.is_available():
        device, dtype = "cuda", torch.float16
    else:
        device, dtype = "cpu", torch.float32

    # ASR: pass the array directly, so no audio decoding backend is needed.
    waveform_16k = resample(audio_array, sample_rate, WHISPER_SR)
    asr_payload = {"array": waveform_16k, "sampling_rate": WHISPER_SR}
    recognizer = get_asr(model_key, device, dtype)
    asr_out = recognizer(asr_payload)

    raw_transcript = asr_out["text"].strip()
    chunks = asr_out.get("chunks", [])

    if not use_diar:
        return raw_transcript, ""

    # Diarization: pyannote is given a file on disk.
    wav_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            wav_path = handle.name
            sf.write(wav_path, audio_array, sample_rate)

        diarizer = get_diar(device)
        speaker_turns = diarizer(wav_path)
        labeled = format_diarized(merge_with_speakers(chunks, speaker_turns))
        if not labeled:
            labeled = "(Keine Sprecher erkannt.)"
        return raw_transcript, labeled
    except EnvironmentError as env_err:
        return raw_transcript, f"⚠️ {env_err}"
    except Exception as err:
        return raw_transcript, f"⚠️ Diarisierung fehlgeschlagen: {err}"
    finally:
        # delete=False above means the temp file must be removed by hand.
        if wav_path and os.path.exists(wav_path):
            os.unlink(wav_path)
+# ── Gradio-Handler ────────────────────────────────────────────────────────────
 def transcribe(audio, model_key: str, use_diar: bool):
     if audio is None:
         yield "⚠️ Kein Audio eingegeben.", ""
         return
     sample_rate, audio_data = audio
+    # Mono erzwingen
     if audio_data.ndim > 1:
         audio_data = audio_data.mean(axis=1)
     audio_data = audio_data.astype(np.float32)
+    # 16-bit PCM → float normalisieren
     if audio_data.max() > 1.0:
         audio_data /= 32768.0
+    yield "⏳ GPU wird angefordert, Pipeline startet...", ""
+    transcript, labeled = run_pipeline(audio_data, sample_rate, model_key, use_diar)
+    yield transcript, labeled
+# ── UI ────────────────────────────────────────────────────────────────────────
 TOKEN_WARNING = (
     "> ⚠️ **Kein `HF_TOKEN` gefunden.**  \n"
     "[hf.co/pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)."
 )
+with gr.Blocks(title="🎙️ YAPPER · ZeroGPU Edition") as demo:
     gr.Markdown("# 🎙️ YAPPER · ZeroGPU Edition")
     gr.Markdown(
+        "## Transkription & Speaker-Diarisierung für Teams Meetings.  \n"
+        "Lade eine Datei hoch oder nimm direkt über das Mikrofon auf."
     )
     if not HF_TOKEN:
             )
             model_dd = gr.Dropdown(
                 choices=list(ASR_MODELS.keys()),
+                value="distil-whisper-large-v3 (empfohlen)",
                 label="Transkriptionsmodell",
             )
             diar_cb = gr.Checkbox(
     gr.Markdown(
         "---\n"
         "**Hinweise:**  \n"
+        "• Für pyannote musst du die Lizenzbedingungen auf Hugging Face akzeptiert haben.  \n"
+        "• ZeroGPU-Quota: 1.500 Sek/Tag für PRO-User (reicht für ~50 kurze Meetings)."
     )
     run_btn.click(