Spaces:

aidn
/

yapper

Sleeping

App Files Files Community

aidn committed 29 days ago

Commit

5e00e96

verified ·

1 Parent(s): 2b0351a

Update app.py

Browse files

Files changed (1) hide show

app.py +190 -298

app.py CHANGED Viewed

@@ -1,330 +1,222 @@
-import math
 import os
 import gradio as gr
-from PIL import Image, ImageDraw, ImageFont
-ZONES = [
-    ("Audio Layer", 30, 190, "#dbeafe", "#4a9eed"),
-    ("VAD", 240, 160, "#ede9fe", "#8b5cf6"),
-    ("Transcription", 420, 210, "#dcfce7", "#22c55e"),
-    ("Diarization\n(optional)", 650, 200, "#fef9c3", "#f59e0b"),
-    ("Summarisation", 870, 210, "#ffedd5", "#f97316"),
-    ("Output", 1100, 270, "#d1fae5", "#22c55e"),
-]
-MODEL_OPTIONS = {
-    "transcription": [
-        "distil-whisper-large-v3 (fast)",
-        "whisper-large-v3 (accurate)",
-    ],
-    "summarisation": [
-        "Ollama local LLM (recommended)",
-        "facebook/bart-large-cnn (fallback)",
-    ],
-}
-DESCRIPTIONS = {
-    "Audio Layer": (
-        "**PipeWire / PulseAudio loopback**\n\n"
-        "Creates a virtual sink that captures both your microphone and speaker output "
-        "simultaneously into a single stream. On modern Arch Linux you will typically run "
-        "PipeWire and can use `pw-loopback` or `pactl load-module module-loopback`. "
-        "Python reads the stream via `sounddevice` or `pyaudio`."
-    ),
-    "VAD": (
-        "**silero-vad**\n\n"
-        "Tiny, CPU-friendly voice activity detection model. Acts as a gatekeeper: "
-        "it fires only when someone is actually speaking, chunking the stream into "
-        "speech segments and discarding silence. This keeps downstream models from "
-        "wasting cycles on dead air and reduces latency."
-    ),
-    "Transcription": (
-        "**distil-whisper-large-v3**: faster than full Whisper with strong real-time accuracy. "
-        "Recommended starting point.\n\n"
-        "**whisper-large-v3**: higher accuracy at the cost of more CPU/GPU. "
-        "Switch to this if transcription quality is the bottleneck."
-    ),
-    "Diarization\n(optional)": (
-        "**pyannote/speaker-diarization-3.1**\n\n"
-        "Labels each speech chunk with a speaker ID (for example, Speaker A and Speaker B). "
-        "Requires a Hugging Face token (gated model; request access on the HF Hub). "
-        "Skip this on your first pass and add it after the base pipeline is stable."
-    ),
-    "Summarisation": (
-        "**Ollama (local LLM)**: best output quality, full prompt control, and on-device runtime. "
-        "Recommended if Ollama is running.\n\n"
-        "**facebook/bart-large-cnn**: lighter and faster extractive summariser, good fallback."
-    ),
-    "Output": (
-        "**Summary + Action Items**\n\n"
-        "Final structured output: a concise meeting summary plus extracted action items. "
-        "Can be enriched with speaker attribution when diarization is enabled upstream."
-    ),
 }
-BUILD_STEPS = [
-    ("1", "PipeWire +\nsounddevice", "#bfdbfe", "#4a9eed"),
-    ("2", "silero-vad +\ndistil-whisper", "#ddd6fe", "#8b5cf6"),
-    ("3", "Ollama\nsummarisation", "#fed7aa", "#f97316"),
-    ("4 (opt.)", "pyannote\ndiarization", "#fef08a", "#f59e0b"),
-]
-def _font(bold: bool, size: int) -> ImageFont.FreeTypeFont | ImageFont.ImageFont:
-    if bold:
-        candidates = [
-            "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
-            "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
-        ]
-    else:
-        candidates = [
-            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
-            "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
-        ]
-    for path in candidates:
-        if os.path.exists(path):
-            return ImageFont.truetype(path, size)
-    return ImageFont.load_default()
-def _rbox(draw: ImageDraw.ImageDraw, x: int, y: int, w: int, h: int, fill: str, stroke: str, r: int = 12) -> None:
-    draw.rounded_rectangle([x, y, x + w, y + h], radius=r, fill=fill, outline=stroke, width=2)
-def _center_text(
-    draw: ImageDraw.ImageDraw,
-    x: int,
-    y: int,
-    w: int,
-    lines: list[str],
-    font: ImageFont.FreeTypeFont | ImageFont.ImageFont,
-    color: str = "#1e1e1e",
-    lh: int = 20,
-) -> None:
-    total = len(lines) * lh
-    current_y = y - total // 2
-    for line in lines:
-        left, _, right, _ = draw.textbbox((0, 0), line, font=font)
-        text_width = right - left
-        draw.text((x + (w - text_width) // 2, current_y), line, font=font, fill=color)
-        current_y += lh
-def _arrow(
-    draw: ImageDraw.ImageDraw,
-    x1: int,
-    y1: int,
-    x2: int,
-    y2: int,
-    color: str = "#555",
-    label: str = "",
-    label_font: ImageFont.FreeTypeFont | ImageFont.ImageFont | None = None,
-) -> None:
-    draw.line([(x1, y1), (x2, y2)], fill=color, width=2)
-    angle = math.atan2(y2 - y1, x2 - x1)
-    size = 10
-    for delta in (0.4, -0.4):
-        ax = x2 - size * math.cos(angle - delta)
-        ay = y2 - size * math.sin(angle - delta)
-        draw.line([(x2, y2), (ax, ay)], fill=color, width=2)
-    if label and label_font:
-        mx, my = (x1 + x2) // 2, (y1 + y2) // 2
-        left, _, right, _ = draw.textbbox((0, 0), label, font=label_font)
-        text_width = right - left
-        draw.text((mx - text_width // 2, my - 16), label, font=label_font, fill="#555")
-def generate_diagram(asr_choice: str, sum_choice: str, show_diar: bool) -> Image.Image:
-    width, height = 1400, 900
-    img = Image.new("RGB", (width, height), "#f8f9fa")
-    draw = ImageDraw.Draw(img)
-    font_bold = _font(True, 15)
-    font_regular = _font(False, 13)
-    font_title = _font(True, 22)
-    font_zone_title = _font(True, 13)
-    font_step = _font(True, 12)
-    left, _, right, _ = draw.textbbox((0, 0), "Meeting Summarisation Pipeline", font=font_title)
-    title_width = right - left
-    draw.text(
-        ((width - title_width) // 2, 18),
-        "Meeting Summarisation Pipeline",
-        font=font_title,
-        fill="#1e1e1e",
-    )
-    zone_y, zone_h = 60, 710
-    for label, zone_x, zone_w, zone_fill, zone_stroke in ZONES:
-        if not show_diar and "Diarization" in label:
-            continue
-        _rbox(draw, zone_x, zone_y, zone_w, zone_h, zone_fill, zone_stroke, r=14)
-        for idx, line in enumerate(label.split("\n")):
-            left, _, right, _ = draw.textbbox((0, 0), line, font=font_zone_title)
-            text_width = right - left
-            draw.text(
-                (zone_x + (zone_w - text_width) // 2, zone_y + 6 + idx * 16),
-                line,
-                font=font_zone_title,
-                fill=zone_stroke,
             )
-    _rbox(draw, 45, 130, 160, 60, "#bfdbfe", "#4a9eed")
-    _center_text(draw, 45, 160, 160, ["PipeWire", "Loopback Sink"], font_bold, "#1e3a8a")
-    _arrow(draw, 125, 190, 125, 230, "#4a9eed")
-    _rbox(draw, 45, 230, 160, 60, "#bfdbfe", "#4a9eed")
-    _center_text(draw, 45, 260, 160, ["sounddevice", "/ pyaudio"], font_bold, "#1e3a8a")
-    _rbox(draw, 255, 175, 130, 65, "#ddd6fe", "#8b5cf6")
-    _center_text(draw, 255, 207, 130, ["silero-vad", "voice activity"], font_bold, "#4c1d95")
-    _arrow(draw, 205, 260, 255, 210, "#4a9eed", "raw audio", font_regular)
-    use_fast = "distil" in asr_choice
-    if use_fast:
-        asr_lines = ["distil-whisper-v3", "fast / real-time"]
-    else:
-        asr_lines = ["whisper-large-v3", "high accuracy"]
-    _rbox(draw, 435, 175, 180, 65, "#bbf7d0", "#22c55e")
-    _center_text(draw, 435, 207, 180, asr_lines, font_bold, "#14532d")
-    _arrow(draw, 385, 207, 435, 207, "#8b5cf6", "speech chunks", font_regular)
-    if show_diar:
-        _rbox(draw, 665, 175, 170, 75, "#fef08a", "#f59e0b")
-        _center_text(
-            draw,
-            665,
-            212,
-            170,
-            ["pyannote/", "speaker-diar-3.1", "needs HF token"],
-            font_step,
-            "#78350f",
-            lh=18,
         )
-        _arrow(draw, 615, 207, 665, 207, "#22c55e", "transcript", font_regular)
-        sum_src_x = 835
-    else:
-        draw.line([(615, 207), (650, 207)], fill="#22c55e", width=2)
-        draw.line([(650, 207), (650, 340), (920, 340), (920, 300)], fill="#22c55e", width=2)
-        left, _, right, _ = draw.textbbox((0, 0), "skip diarization", font=font_regular)
-        text_width = right - left
-        draw.text((750 - text_width // 2, 345), "skip diarization", font=font_regular, fill="#15803d")
-        sum_src_x = None
-    use_ollama = "Ollama" in sum_choice
-    if use_ollama:
-        sum_lines = ["Ollama (local LLM)", "recommended"]
-        sum_fill = "#fed7aa"
-    else:
-        sum_lines = ["facebook/", "bart-large-cnn"]
-        sum_fill = "#fde8d8"
-    _rbox(draw, 885, 175, 175, 65, sum_fill, "#f97316")
-    _center_text(draw, 885, 207, 175, sum_lines, font_bold, "#7c2d12")
-    if show_diar and sum_src_x is not None:
-        _arrow(draw, sum_src_x, 207, 885, 207, "#f59e0b", "labelled speech", font_regular)
-    _arrow(draw, 1060, 207, 1115, 207, "#f97316")
-    _rbox(draw, 1115, 165, 235, 75, "#6ee7b7", "#22c55e")
-    _center_text(draw, 1115, 202, 235, ["Summary +", "Action Items"], font_bold, "#064e3b")
-    box_x, box_y = 30, 790
-    draw.rounded_rectangle(
-        [box_x, box_y, box_x + 1340, box_y + 85],
-        radius=10,
-        fill="#f1f5f9",
-        outline="#cbd5e1",
-        width=1,
-    )
-    draw.text((box_x + 14, box_y + 10), "Build Order:", font=font_bold, fill="#1e1e1e")
-    step_x = box_x + 120
-    for num, text, fill, stroke in BUILD_STEPS:
-        _rbox(draw, step_x, box_y + 8, 185, 65, fill, stroke, r=8)
-        lines = [f"Step {num}"] + text.split("\n")
-        y0 = box_y + 14
-        for line in lines:
-            left, _, right, _ = draw.textbbox((0, 0), line, font=font_step)
-            text_width = right - left
-            draw.text((step_x + (185 - text_width) // 2, y0), line, font=font_step, fill="#1e1e1e")
-            y0 += 16
-        if step_x + 185 + 40 < box_x + 1340:
-            _arrow(draw, step_x + 185, box_y + 40, step_x + 225, box_y + 40, "#555")
-        step_x += 225
-    return img
-def show_desc(stage: str | None) -> str:
-    if not stage:
-        return "No description available."
-    return DESCRIPTIONS.get(stage, "No description available.")
-with gr.Blocks(title="Meeting Summarisation Pipeline") as demo:
-    gr.Markdown("## Meeting Summarisation Pipeline Explorer")
     gr.Markdown(
-        "Visualise and configure a local, cross-platform meeting summariser "
-        "built on Hugging Face models and PipeWire. Adjust the options below "
-        "and the diagram will update live."
     )
-    with gr.Row():
-        with gr.Column(scale=3):
-            diagram = gr.Image(
-                value=generate_diagram(
-                    MODEL_OPTIONS["transcription"][0],
-                    MODEL_OPTIONS["summarisation"][0],
-                    True,
-                ),
-                label="Pipeline Diagram",
-                interactive=False,
-            )
         with gr.Column(scale=1):
-            gr.Markdown("### Configuration")
-            asr_dd = gr.Dropdown(
-                choices=MODEL_OPTIONS["transcription"],
-                value=MODEL_OPTIONS["transcription"][0],
-                label="Transcription model",
             )
-            sum_dd = gr.Dropdown(
-                choices=MODEL_OPTIONS["summarisation"],
-                value=MODEL_OPTIONS["summarisation"][0],
-                label="Summarisation model",
             )
-            diar_cb = gr.Checkbox(value=True, label="Include diarization (pyannote)")
-            gr.Markdown("---")
-            gr.Markdown("### Stage Info")
-            stage_dd = gr.Dropdown(
-                choices=list(DESCRIPTIONS.keys()),
-                label="Select a stage to learn more",
-                value=None,
             )
-            stage_info = gr.Markdown("Select a stage above.")
-    for ctrl in (asr_dd, sum_dd, diar_cb):
-        ctrl.change(
-            fn=lambda a, s, dz: generate_diagram(a, s, dz),
-            inputs=[asr_dd, sum_dd, diar_cb],
-            outputs=diagram,
-        )
-    stage_dd.change(fn=show_desc, inputs=stage_dd, outputs=stage_info)
-    gr.Markdown("---")
     gr.Markdown(
-        "**Build order:** PipeWire + sounddevice -> silero-vad + distil-whisper "
-        "-> Ollama summarisation -> pyannote diarization (optional, last)"
     )
 if __name__ == "__main__":
-    demo.launch()

 import os
+import tempfile
+import numpy as np
+import soundfile as sf
+import torch
 import gradio as gr
+from transformers import pipeline as hf_pipeline
+# ── Configuration ──────────────────────────────────────────────────────────────
+# Hugging Face access token read from the environment; an empty string means
+# that no token was configured.
+HF_TOKEN = os.environ.get("HF_TOKEN", "")
+# Maps the human-readable label offered in the model dropdown to the
+# Hugging Face model id that is handed to the ASR pipeline.
+ASR_MODELS = {
+    "whisper-tiny  (schnellste, geringste Qualität)": "openai/whisper-tiny",
+    "whisper-base  (schnell, gut für kurze Aufnahmen)": "openai/whisper-base",
+    "whisper-small (empfohlen für CPU)": "openai/whisper-small",
+    "distil-whisper-large-v3 (langsam, beste Qualität)": "distil-whisper/distil-large-v3",
 }
+# ── Lazy model loading ─────────────────────────────────────────────────────────
+# ASR pipelines that have already been built, keyed by model id.
+_asr_cache: dict = {}
+# Diarization pipeline; stays None until get_diar() creates it.
+_diar_pipe = None
+def get_asr(model_key: str):
+    """Return the ASR pipeline for ``model_key``, building it on first use.
+
+    ``model_key`` must be one of the labels in ``ASR_MODELS``; an unknown
+    label raises ``KeyError``. Pipelines are cached per model id, so every
+    model is instantiated at most once per process. Each pipeline runs on
+    the CPU in float32, processes audio in 30-second chunks and returns
+    timestamps.
+    """
+    model_id = ASR_MODELS[model_key]
+    try:
+        return _asr_cache[model_id]
+    except KeyError:
+        pass
+    # First request for this model: build the pipeline and remember it.
+    pipeline_options = {
+        "model": model_id,
+        "device": "cpu",
+        "torch_dtype": torch.float32,
+        "chunk_length_s": 30,
+        "return_timestamps": True,
+    }
+    asr = hf_pipeline("automatic-speech-recognition", **pipeline_options)
+    _asr_cache[model_id] = asr
+    return asr
+def get_diar():
+    """Return the shared pyannote diarization pipeline, loading it on first use.
+
+    The pipeline is stored in the module-level ``_diar_pipe`` so it is only
+    loaded once.
+
+    Raises:
+        EnvironmentError: if ``HF_TOKEN`` is empty, or if the pipeline could
+            not be loaded with the configured token.
+    """
+    global _diar_pipe
+    if _diar_pipe is None:
+        if not HF_TOKEN:
+            raise EnvironmentError(
+                "HF_TOKEN nicht gesetzt. Füge ihn in den Space-Settings unter "
+                "'Settings → Variables and secrets' hinzu."
             )
+        # Imported lazily so the app starts without loading pyannote.
+        from pyannote.audio import Pipeline as PyannotePipeline
+        loaded = PyannotePipeline.from_pretrained(
+            "pyannote/speaker-diarization-3.1",
+            use_auth_token=HF_TOKEN,
         )
+        # NOTE(review): pyannote.audio 3.x reports an inaccessible (gated)
+        # repository by returning None instead of raising. Fail here with a
+        # readable message rather than later with "'NoneType' object is not
+        # callable"; a failed load is not cached, so the next call retries.
+        if loaded is None:
+            raise EnvironmentError(
+                "pyannote-Pipeline konnte nicht geladen werden. Prüfe den HF_TOKEN "
+                "und akzeptiere die Lizenzbedingungen auf "
+                "hf.co/pyannote/speaker-diarization-3.1."
+            )
+        _diar_pipe = loaded
+    return _diar_pipe
+# ── Helper functions ───────────────────────────────────────────────────────────
+def merge_with_speakers(chunks: list, diarization) -> list[tuple]:
+    """Assign the dominant speaker to every ASR chunk.
+
+    Args:
+        chunks: ASR chunk dicts with a ``"text"`` entry and an optional
+            ``"timestamp"`` entry holding a ``(start, end)`` pair.
+        diarization: object whose ``itertracks(yield_label=True)`` yields
+            ``(turn, track, speaker)`` triples, where ``turn`` has ``start``
+            and ``end`` attributes.
+
+    Returns:
+        One ``(start, end, speaker, text)`` tuple per chunk that has a start
+        time. ``speaker`` is the label of the turn overlapping the chunk the
+        longest, or ``"Unbekannt"`` when no turn overlaps it.
+    """
+    # The speaker turns do not depend on the chunk, so collect them once
+    # instead of re-walking the diarization for every chunk.
+    turns = [
+        (turn.start, turn.end, speaker)
+        for turn, _, speaker in diarization.itertracks(yield_label=True)
+    ]
+    merged = []
+    for chunk in chunks:
+        ts = chunk.get("timestamp", (None, None))
+        start, end = ts if ts else (None, None)
+        if start is None:
+            continue
+        # Fallback in case the last chunk has no end timestamp.
+        end = end or (start + 1.0)
+        best_speaker, best_overlap = "Unbekannt", 0.0
+        for turn_start, turn_end, speaker in turns:
+            overlap = max(0.0, min(end, turn_end) - max(start, turn_start))
+            if overlap > best_overlap:
+                best_overlap = overlap
+                best_speaker = speaker
+        merged.append((start, end, best_speaker, chunk["text"].strip()))
+    return merged
+def format_diarized(segments: list[tuple]) -> str:
+    """Collapse consecutive segments of one speaker into a single paragraph.
+
+    Each segment is a ``(start, end, speaker, text)`` tuple. The result has
+    one ``**speaker** [start]:`` heading per run of the same speaker, followed
+    by the run's texts joined with spaces; runs are separated by a blank
+    line. Returns an empty string when ``segments`` is empty.
+    """
+    def render(who, began, parts):
+        body = " ".join(parts)
+        return f"**{who}** [{began:.1f}s]:\n{body}"
+
+    if not segments:
+        return ""
+    paragraphs = []
+    active, active_start, pending = None, 0.0, []
+    for seg_start, _seg_end, who, said in segments:
+        if who == active:
+            # Same speaker keeps talking: extend the current run.
+            pending.append(said)
+            continue
+        if active is not None:
+            paragraphs.append(render(active, active_start, pending))
+        active, active_start, pending = who, seg_start, [said]
+    # Emit the run that was still open when the input ended.
+    if active and pending:
+        paragraphs.append(render(active, active_start, pending))
+    return "\n\n".join(paragraphs)
+# ── Main pipeline ──────────────────────────────────────────────────────────────
+def _prepare_audio(audio_data):
+    """Return ``audio_data`` as mono float32 samples scaled for WAV output."""
+    # Decide on the scaling from the *original* dtype: signed integer PCM must
+    # always be rescaled, even when the clip is so quiet that its peak is <= 1.
+    int_full_scale = None
+    if np.issubdtype(audio_data.dtype, np.signedinteger):
+        int_full_scale = float(np.iinfo(audio_data.dtype).max) + 1.0
+    # Force mono by averaging the channels.
+    if audio_data.ndim > 1:
+        audio_data = audio_data.mean(axis=1)
+    audio_data = audio_data.astype(np.float32)
+    if int_full_scale is not None:
+        audio_data /= int_full_scale
+    elif np.abs(audio_data).max() > 1.0:
+        # Other samples outside [-1, 1] are treated as 16-bit PCM values.
+        audio_data /= 32768.0
+    return audio_data
+def transcribe(audio, model_key: str, use_diar: bool):
+    """Generator: stream intermediate results to the UI while transcribing.
+
+    Args:
+        audio: ``(sample_rate, samples)`` pair with ``samples`` as a NumPy
+            array, or ``None`` when nothing was recorded or uploaded.
+        model_key: label from ``ASR_MODELS`` selecting the ASR model.
+        use_diar: whether to run speaker diarization after transcription.
+
+    Yields:
+        ``(raw_transcript, diarized_text)`` string pairs. Progress and warning
+        messages are delivered through the same two slots.
+    """
+    if audio is None:
+        yield "⚠️ Kein Audio eingegeben.", ""
+        return
+    sample_rate, audio_data = audio
+    # A zero-length recording has no peak to normalise and nothing to decode.
+    if audio_data.size == 0:
+        yield "⚠️ Kein Audio eingegeben.", ""
+        return
+    audio_data = _prepare_audio(audio_data)
+    # Reserve the path and close the handle immediately, so the file is written
+    # through a single handle and is removed even if writing fails.
+    fd, tmp_path = tempfile.mkstemp(suffix=".wav")
+    os.close(fd)
+    try:
+        sf.write(tmp_path, audio_data, sample_rate)
+        # ── Step 1: transcription ──
+        yield "⏳ Lade ASR-Modell und transkribiere...", ""
+        asr = get_asr(model_key)
+        result = asr(tmp_path)
+        raw_transcript = result["text"].strip()
+        chunks = result.get("chunks", [])
+        if not use_diar:
+            yield raw_transcript, ""
+            return
+        # ── Step 2: diarization ──
+        yield raw_transcript, "⏳ Diarisierung läuft (auf CPU kann das einige Minuten dauern)..."
+        try:
+            diar = get_diar()
+            diarization = diar(tmp_path)
+            segments = merge_with_speakers(chunks, diarization)
+            labeled = format_diarized(segments)
+            yield raw_transcript, labeled or "(Keine Sprecher erkannt.)"
+        except EnvironmentError as e:
+            # Configuration problems already carry a complete message.
+            yield raw_transcript, f"⚠️ {e}"
+        except Exception as e:
+            # Keep the transcript visible even when diarization fails.
+            yield raw_transcript, f"⚠️ Diarisierung fehlgeschlagen: {e}"
+    finally:
+        os.unlink(tmp_path)
+# ── UI ─────────────────────────────────────────────────────────────────────────
+# Markdown blockquote rendered near the top of the page when HF_TOKEN is empty.
+TOKEN_WARNING = (
+    "> ⚠️ **Kein `HF_TOKEN` gefunden.**  \n"
+    "> Diarisierung (pyannote) ist deaktiviert.  \n"
+    "> Füge das Token unter **Settings → Variables and secrets** als `HF_TOKEN` hinzu  \n"
+    "> und akzeptiere die Lizenzbedingungen auf [hf.co/pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)."
+)
+# Gradio page: the first column holds the inputs and the run button, the
+# second column holds the two result text boxes.
+with gr.Blocks(title="Meeting Transcriber") as demo:
+    gr.Markdown("# 🎙️ Meeting Transcriber")
     gr.Markdown(
+        "Lade eine Audiodatei hoch **oder** nimm direkt über das Mikrofon auf.  \n"
+        "Das Audio wird transkribiert und optional nach Sprechern getrennt."
     )
+    # Show the token warning only when no HF_TOKEN was configured.
+    if not HF_TOKEN:
+        gr.Markdown(TOKEN_WARNING)
+    with gr.Row():
         with gr.Column(scale=1):
+            audio_input = gr.Audio(
+                sources=["microphone", "upload"],
+                type="numpy",
+                label="Audio (Mikrofon oder Datei)",
             )
+            model_dd = gr.Dropdown(
+                choices=list(ASR_MODELS.keys()),
+                value="whisper-small (empfohlen für CPU)",
+                label="Transkriptionsmodell",
             )
+            # Without a token the checkbox starts unchecked and is not interactive.
+            diar_cb = gr.Checkbox(
+                value=bool(HF_TOKEN),
+                label="Speaker-Diarisierung aktivieren (pyannote, braucht HF_TOKEN)",
+                interactive=bool(HF_TOKEN),
             )
+            run_btn = gr.Button("▶ Transkribieren", variant="primary")
+        with gr.Column(scale=2):
+            transcript_out = gr.Textbox(
+                label="Rohtranskript (Whisper)",
+                lines=12,
+                show_copy_button=True,
+            )
+            diar_out = gr.Textbox(
+                label="Transkript mit Sprecher-Labels (pyannote)",
+                lines=12,
+                show_copy_button=True,
+                placeholder="Nur sichtbar wenn Diarisierung aktiviert ist.",
+            )
     gr.Markdown(
+        "---\n"
+        "**Hinweise:**  \n"
+        "• Auf Free CPU dauert Whisper-small ~1–2× Echtzeit, Diarisierung ~2–5× Echtzeit.  \n"
+        "• Für pyannote musst du die Lizenzbedingungen auf Hugging Face akzeptiert haben.  \n"
+        "• Das erste Laden der Modelle dauert länger (Download-Cache)."
     )
+    # transcribe is a generator; each pair it yields goes to the two text boxes
+    # listed in outputs, in that order.
+    run_btn.click(
+        fn=transcribe,
+        inputs=[audio_input, model_dd, diar_cb],
+        outputs=[transcript_out, diar_out],
+    )
+# Launch the app only when the file is executed as a script.
 if __name__ == "__main__":
+    demo.launch()