Spaces:

aidn
/

yapper

Running on Zero

App Files Files Community

aidn committed on 21 days ago

Commit

2b0351a

verified ·

1 Parent(s): 8b9e841

Create app.py

Browse files

Files changed (1) hide show

app.py +330 -0

app.py ADDED Viewed

	@@ -0,0 +1,330 @@

+import math
+import os
+import gradio as gr
+from PIL import Image, ImageDraw, ImageFont
+# Background zones, one per pipeline stage, listed left to right.
+# Each entry is (label, x, width, fill colour, outline colour); generate_diagram
+# supplies the shared y position and height. A "\n" in the label splits the
+# zone title across lines, and any label containing "Diarization" is skipped
+# when diarization is switched off.
+ZONES = [
+    ("Audio Layer", 30, 190, "#dbeafe", "#4a9eed"),
+    ("VAD", 240, 160, "#ede9fe", "#8b5cf6"),
+    ("Transcription", 420, 210, "#dcfce7", "#22c55e"),
+    ("Diarization\n(optional)", 650, 200, "#fef9c3", "#f59e0b"),
+    ("Summarisation", 870, 210, "#ffedd5", "#f97316"),
+    ("Output", 1100, 270, "#d1fae5", "#22c55e"),
+]
+# Dropdown choices for the two configurable stages; the first entry of each
+# list is used as the initial selection. generate_diagram tells the variants
+# apart by substring ("distil" / "Ollama"), so keep those words in the labels.
+MODEL_OPTIONS = {
+    "transcription": [
+        "distil-whisper-large-v3 (fast)",
+        "whisper-large-v3 (accurate)",
+    ],
+    "summarisation": [
+        "Ollama local LLM (recommended)",
+        "facebook/bart-large-cnn (fallback)",
+    ],
+}
+DESCRIPTIONS = {
+    "Audio Layer": (
+        "**PipeWire / PulseAudio loopback**\n\n"
+        "Creates a virtual sink that captures both your microphone and speaker output "
+        "simultaneously into a single stream. On modern Arch Linux you will typically run "
+        "PipeWire and can use `pw-loopback` or `pactl load-module module-loopback`. "
+        "Python reads the stream via `sounddevice` or `pyaudio`."
+    ),
+    "VAD": (
+        "**silero-vad**\n\n"
+        "Tiny, CPU-friendly voice activity detection model. Acts as a gatekeeper: "
+        "it fires only when someone is actually speaking, chunking the stream into "
+        "speech segments and discarding silence. This keeps downstream models from "
+        "wasting cycles on dead air and reduces latency."
+    ),
+    "Transcription": (
+        "**distil-whisper-large-v3**: faster than full Whisper with strong real-time accuracy. "
+        "Recommended starting point.\n\n"
+        "**whisper-large-v3**: higher accuracy at the cost of more CPU/GPU. "
+        "Switch to this if transcription quality is the bottleneck."
+    ),
+    "Diarization\n(optional)": (
+        "**pyannote/speaker-diarization-3.1**\n\n"
+        "Labels each speech chunk with a speaker ID (for example, Speaker A and Speaker B). "
+        "Requires a Hugging Face token (gated model; request access on the HF Hub). "
+        "Skip this on your first pass and add it after the base pipeline is stable."
+    ),
+    "Summarisation": (
+        "**Ollama (local LLM)**: best output quality, full prompt control, and on-device runtime. "
+        "Recommended if Ollama is running.\n\n"
+        "**facebook/bart-large-cnn**: lighter and faster extractive summariser, good fallback."
+    ),
+    "Output": (
+        "**Summary + Action Items**\n\n"
+        "Final structured output: a concise meeting summary plus extracted action items. "
+        "Can be enriched with speaker attribution when diarization is enabled upstream."
+    ),
+}
+# Boxes for the "Build Order" strip at the bottom of the diagram, in order.
+# Each entry is (step number, caption, fill colour, outline colour); the number
+# is rendered as "Step <number>" and a "\n" in the caption starts a new line.
+BUILD_STEPS = [
+    ("1", "PipeWire +\nsounddevice", "#bfdbfe", "#4a9eed"),
+    ("2", "silero-vad +\ndistil-whisper", "#ddd6fe", "#8b5cf6"),
+    ("3", "Ollama\nsummarisation", "#fed7aa", "#f97316"),
+    ("4 (opt.)", "pyannote\ndiarization", "#fef08a", "#f59e0b"),
+]
+def _font(bold: bool, size: int) -> ImageFont.FreeTypeFont | ImageFont.ImageFont:
+    if bold:
+        candidates = [
+            "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
+            "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
+        ]
+    else:
+        candidates = [
+            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
+            "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
+        ]
+    for path in candidates:
+        if os.path.exists(path):
+            return ImageFont.truetype(path, size)
+    return ImageFont.load_default()
+def _rbox(draw: ImageDraw.ImageDraw, x: int, y: int, w: int, h: int, fill: str, stroke: str, r: int = 12) -> None:
+    draw.rounded_rectangle([x, y, x + w, y + h], radius=r, fill=fill, outline=stroke, width=2)
+def _center_text(
+    draw: ImageDraw.ImageDraw,
+    x: int,
+    y: int,
+    w: int,
+    lines: list[str],
+    font: ImageFont.FreeTypeFont | ImageFont.ImageFont,
+    color: str = "#1e1e1e",
+    lh: int = 20,
+) -> None:
+    total = len(lines) * lh
+    current_y = y - total // 2
+    for line in lines:
+        left, _, right, _ = draw.textbbox((0, 0), line, font=font)
+        text_width = right - left
+        draw.text((x + (w - text_width) // 2, current_y), line, font=font, fill=color)
+        current_y += lh
+def _arrow(
+    draw: ImageDraw.ImageDraw,
+    x1: int,
+    y1: int,
+    x2: int,
+    y2: int,
+    color: str = "#555",
+    label: str = "",
+    label_font: ImageFont.FreeTypeFont | ImageFont.ImageFont | None = None,
+) -> None:
+    draw.line([(x1, y1), (x2, y2)], fill=color, width=2)
+    angle = math.atan2(y2 - y1, x2 - x1)
+    size = 10
+    for delta in (0.4, -0.4):
+        ax = x2 - size * math.cos(angle - delta)
+        ay = y2 - size * math.sin(angle - delta)
+        draw.line([(x2, y2), (ax, ay)], fill=color, width=2)
+    if label and label_font:
+        mx, my = (x1 + x2) // 2, (y1 + y2) // 2
+        left, _, right, _ = draw.textbbox((0, 0), label, font=label_font)
+        text_width = right - left
+        draw.text((mx - text_width // 2, my - 16), label, font=label_font, fill="#555")
+def generate_diagram(asr_choice: str, sum_choice: str, show_diar: bool) -> Image.Image:
+    width, height = 1400, 900
+    img = Image.new("RGB", (width, height), "#f8f9fa")
+    draw = ImageDraw.Draw(img)
+    font_bold = _font(True, 15)
+    font_regular = _font(False, 13)
+    font_title = _font(True, 22)
+    font_zone_title = _font(True, 13)
+    font_step = _font(True, 12)
+    left, _, right, _ = draw.textbbox((0, 0), "Meeting Summarisation Pipeline", font=font_title)
+    title_width = right - left
+    draw.text(
+        ((width - title_width) // 2, 18),
+        "Meeting Summarisation Pipeline",
+        font=font_title,
+        fill="#1e1e1e",
+    )
+    zone_y, zone_h = 60, 710
+    for label, zone_x, zone_w, zone_fill, zone_stroke in ZONES:
+        if not show_diar and "Diarization" in label:
+            continue
+        _rbox(draw, zone_x, zone_y, zone_w, zone_h, zone_fill, zone_stroke, r=14)
+        for idx, line in enumerate(label.split("\n")):
+            left, _, right, _ = draw.textbbox((0, 0), line, font=font_zone_title)
+            text_width = right - left
+            draw.text(
+                (zone_x + (zone_w - text_width) // 2, zone_y + 6 + idx * 16),
+                line,
+                font=font_zone_title,
+                fill=zone_stroke,
+            )
+    _rbox(draw, 45, 130, 160, 60, "#bfdbfe", "#4a9eed")
+    _center_text(draw, 45, 160, 160, ["PipeWire", "Loopback Sink"], font_bold, "#1e3a8a")
+    _arrow(draw, 125, 190, 125, 230, "#4a9eed")
+    _rbox(draw, 45, 230, 160, 60, "#bfdbfe", "#4a9eed")
+    _center_text(draw, 45, 260, 160, ["sounddevice", "/ pyaudio"], font_bold, "#1e3a8a")
+    _rbox(draw, 255, 175, 130, 65, "#ddd6fe", "#8b5cf6")
+    _center_text(draw, 255, 207, 130, ["silero-vad", "voice activity"], font_bold, "#4c1d95")
+    _arrow(draw, 205, 260, 255, 210, "#4a9eed", "raw audio", font_regular)
+    use_fast = "distil" in asr_choice
+    if use_fast:
+        asr_lines = ["distil-whisper-v3", "fast / real-time"]
+    else:
+        asr_lines = ["whisper-large-v3", "high accuracy"]
+    _rbox(draw, 435, 175, 180, 65, "#bbf7d0", "#22c55e")
+    _center_text(draw, 435, 207, 180, asr_lines, font_bold, "#14532d")
+    _arrow(draw, 385, 207, 435, 207, "#8b5cf6", "speech chunks", font_regular)
+    if show_diar:
+        _rbox(draw, 665, 175, 170, 75, "#fef08a", "#f59e0b")
+        _center_text(
+            draw,
+            665,
+            212,
+            170,
+            ["pyannote/", "speaker-diar-3.1", "needs HF token"],
+            font_step,
+            "#78350f",
+            lh=18,
+        )
+        _arrow(draw, 615, 207, 665, 207, "#22c55e", "transcript", font_regular)
+        sum_src_x = 835
+    else:
+        draw.line([(615, 207), (650, 207)], fill="#22c55e", width=2)
+        draw.line([(650, 207), (650, 340), (920, 340), (920, 300)], fill="#22c55e", width=2)
+        left, _, right, _ = draw.textbbox((0, 0), "skip diarization", font=font_regular)
+        text_width = right - left
+        draw.text((750 - text_width // 2, 345), "skip diarization", font=font_regular, fill="#15803d")
+        sum_src_x = None
+    use_ollama = "Ollama" in sum_choice
+    if use_ollama:
+        sum_lines = ["Ollama (local LLM)", "recommended"]
+        sum_fill = "#fed7aa"
+    else:
+        sum_lines = ["facebook/", "bart-large-cnn"]
+        sum_fill = "#fde8d8"
+    _rbox(draw, 885, 175, 175, 65, sum_fill, "#f97316")
+    _center_text(draw, 885, 207, 175, sum_lines, font_bold, "#7c2d12")
+    if show_diar and sum_src_x is not None:
+        _arrow(draw, sum_src_x, 207, 885, 207, "#f59e0b", "labelled speech", font_regular)
+    _arrow(draw, 1060, 207, 1115, 207, "#f97316")
+    _rbox(draw, 1115, 165, 235, 75, "#6ee7b7", "#22c55e")
+    _center_text(draw, 1115, 202, 235, ["Summary +", "Action Items"], font_bold, "#064e3b")
+    box_x, box_y = 30, 790
+    draw.rounded_rectangle(
+        [box_x, box_y, box_x + 1340, box_y + 85],
+        radius=10,
+        fill="#f1f5f9",
+        outline="#cbd5e1",
+        width=1,
+    )
+    draw.text((box_x + 14, box_y + 10), "Build Order:", font=font_bold, fill="#1e1e1e")
+    step_x = box_x + 120
+    for num, text, fill, stroke in BUILD_STEPS:
+        _rbox(draw, step_x, box_y + 8, 185, 65, fill, stroke, r=8)
+        lines = [f"Step {num}"] + text.split("\n")
+        y0 = box_y + 14
+        for line in lines:
+            left, _, right, _ = draw.textbbox((0, 0), line, font=font_step)
+            text_width = right - left
+            draw.text((step_x + (185 - text_width) // 2, y0), line, font=font_step, fill="#1e1e1e")
+            y0 += 16
+        if step_x + 185 + 40 < box_x + 1340:
+            _arrow(draw, step_x + 185, box_y + 40, step_x + 225, box_y + 40, "#555")
+        step_x += 225
+    return img
+def show_desc(stage: str | None) -> str:
+    if not stage:
+        return "No description available."
+    return DESCRIPTIONS.get(stage, "No description available.")
+# Build the Gradio UI at import time; `demo` is the Blocks app launched at the
+# bottom of the file. Components appear in the order they are created here.
+with gr.Blocks(title="Meeting Summarisation Pipeline") as demo:
+    gr.Markdown("## Meeting Summarisation Pipeline Explorer")
+    gr.Markdown(
+        "Visualise and configure a local, cross-platform meeting summariser "
+        "built on Hugging Face models and PipeWire. Adjust the options below "
+        "and the diagram will update live."
+    )
+    with gr.Row():
+        # Left column: the rendered diagram (given three times the width
+        # weight of the controls column via `scale`).
+        with gr.Column(scale=3):
+            diagram = gr.Image(
+                # Initial render uses the first option of each dropdown with
+                # diarization on, matching the control defaults below.
+                value=generate_diagram(
+                    MODEL_OPTIONS["transcription"][0],
+                    MODEL_OPTIONS["summarisation"][0],
+                    True,
+                ),
+                label="Pipeline Diagram",
+                interactive=False,
+            )
+        # Right column: configuration controls and the stage description panel.
+        with gr.Column(scale=1):
+            gr.Markdown("### Configuration")
+            asr_dd = gr.Dropdown(
+                choices=MODEL_OPTIONS["transcription"],
+                value=MODEL_OPTIONS["transcription"][0],
+                label="Transcription model",
+            )
+            sum_dd = gr.Dropdown(
+                choices=MODEL_OPTIONS["summarisation"],
+                value=MODEL_OPTIONS["summarisation"][0],
+                label="Summarisation model",
+            )
+            diar_cb = gr.Checkbox(value=True, label="Include diarization (pyannote)")
+            gr.Markdown("---")
+            gr.Markdown("### Stage Info")
+            stage_dd = gr.Dropdown(
+                choices=list(DESCRIPTIONS.keys()),
+                label="Select a stage to learn more",
+                value=None,
+            )
+            stage_info = gr.Markdown("Select a stage above.")
+    # A change to any configuration control re-renders the diagram from the
+    # current values of all three controls.
+    for ctrl in (asr_dd, sum_dd, diar_cb):
+        ctrl.change(
+            fn=lambda a, s, dz: generate_diagram(a, s, dz),
+            inputs=[asr_dd, sum_dd, diar_cb],
+            outputs=diagram,
+        )
+    # Selecting a stage swaps the Markdown panel to that stage's description.
+    stage_dd.change(fn=show_desc, inputs=stage_dd, outputs=stage_info)
+    gr.Markdown("---")
+    gr.Markdown(
+        "**Build order:** PipeWire + sounddevice -> silero-vad + distil-whisper "
+        "-> Ollama summarisation -> pyannote diarization (optional, last)"
+    )
+# Launch the app only when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch()