Spaces:

aidn
/

yapper

Running on Zero

App Files Community

aidn committed 22 days ago

Commit

a5cc652

verified ·

1 Parent(s): e7cb287

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -17

app.py CHANGED Viewed

@@ -1,8 +1,5 @@
 import os
-import tempfile
 import numpy as np
-import soundfile as sf
 import torch
 import spaces
 import gradio as gr
@@ -53,7 +50,7 @@ def get_diar(device: str):
         _diar_pipe = PyannotePipeline.from_pretrained(
             "pyannote/speaker-diarization-3.1",
-            use_auth_token=HF_TOKEN,
         )
         if device == "cuda":
             _diar_pipe = _diar_pipe.to(torch.device("cuda"))
@@ -110,11 +107,18 @@ def transcribe_audio(audio_16k: np.ndarray, processor, model, device: str, dtype
                 language="de",          # ggf. auf "en" ändern oder weglassen für Auto-Detect
             )
-        # Dekodieren mit Timestamps
-        result = processor.batch_decode(predicted_ids, decode_with_timestamps=True)[0]
-        # Timestamps aus dem Ergebnis extrahieren (Format: <|0.00|> Text <|1.50|>)
         import re
         ts_pattern = re.compile(r"<\|([\d.]+)\|>")
         tokens = ts_pattern.split(result)
@@ -187,15 +191,13 @@ def run_pipeline(audio_array: np.ndarray, sample_rate: int, model_key: str, use_
     if not use_diar:
         return raw_transcript, ""
-    # 3. Diarisierung (pyannote braucht Datei)
-    tmp_path = None
     try:
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            tmp_path = f.name
-            sf.write(tmp_path, audio_array, sample_rate)
         diar        = get_diar(device)
-        diarization = diar(tmp_path)
         segments    = merge_with_speakers(chunks, diarization)
         labeled     = format_diarized(segments)
         return raw_transcript, labeled or "(Keine Sprecher erkannt.)"
@@ -204,9 +206,6 @@ def run_pipeline(audio_array: np.ndarray, sample_rate: int, model_key: str, use_
         return raw_transcript, f"⚠️ {e}"
     except Exception as e:
         return raw_transcript, f"⚠️ Diarisierung fehlgeschlagen: {e}"
-    finally:
-        if tmp_path and os.path.exists(tmp_path):
-            os.unlink(tmp_path)
 # ── Gradio-Handler ────────────────────────────────────────────────────────────
@@ -242,7 +241,7 @@ TOKEN_WARNING = (
 with gr.Blocks(title="🎙️ YAPPER · ZeroGPU Edition") as demo:
     gr.Markdown("# 🎙️ YAPPER · ZeroGPU Edition")
     gr.Markdown(
-        "## Transkription & Speaker-Diarisierung für Teams Meetings.  \n"
         "Lade eine Datei hoch oder nimm direkt über das Mikrofon auf."
     )

 import os
 import numpy as np
 import torch
 import spaces
 import gradio as gr
         _diar_pipe = PyannotePipeline.from_pretrained(
             "pyannote/speaker-diarization-3.1",
+            token=HF_TOKEN,            # ← use_auth_token wurde entfernt
         )
         if device == "cuda":
             _diar_pipe = _diar_pipe.to(torch.device("cuda"))
                 language="de",          # ggf. auf "en" ändern oder weglassen für Auto-Detect
             )
+        # Dekodieren mit Timestamps, Special Tokens filtern
+        result = processor.batch_decode(
+            predicted_ids,
+            decode_with_timestamps=True,
+            skip_special_tokens=False,  # brauchen Timestamp-Tokens
+        )[0]
+        # Nicht-Timestamp Special Tokens entfernen (<|startoftranscript|> etc.)
         import re
+        result = re.sub(r"<\|(?![\d.]+\|)[^>]+\|>", "", result).strip()
+        # Timestamps extrahieren (Format: <|0.00|> Text <|1.50|>)
         ts_pattern = re.compile(r"<\|([\d.]+)\|>")
         tokens = ts_pattern.split(result)
     if not use_diar:
         return raw_transcript, ""
+    # 3. Diarisierung: Tensor-Dict → kein torchcodec nötig
     try:
+        waveform   = torch.tensor(audio_array).unsqueeze(0).float()
+        diar_input = {"waveform": waveform, "sample_rate": sample_rate}
         diar        = get_diar(device)
+        diarization = diar(diar_input)
         segments    = merge_with_speakers(chunks, diarization)
         labeled     = format_diarized(segments)
         return raw_transcript, labeled or "(Keine Sprecher erkannt.)"
         return raw_transcript, f"⚠️ {e}"
     except Exception as e:
         return raw_transcript, f"⚠️ Diarisierung fehlgeschlagen: {e}"
 # ── Gradio-Handler ────────────────────────────────────────────────────────────
 with gr.Blocks(title="🎙️ YAPPER · ZeroGPU Edition") as demo:
     gr.Markdown("# 🎙️ YAPPER · ZeroGPU Edition")
     gr.Markdown(
+        "## Transkription & Speaker-Segmentierung für Teams Meetings.  \n"
         "Lade eine Datei hoch oder nimm direkt über das Mikrofon auf."
     )