aidn committed on
Commit
2b0351a
·
verified ·
1 Parent(s): 8b9e841

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +330 -0
app.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import os
3
+
4
+ import gradio as gr
5
+ from PIL import Image, ImageDraw, ImageFont
6
+
7
+
8
# Background zones of the diagram, ordered by increasing x position.
# Each entry is (label, x, width, fill colour, outline colour); the shared
# y position and height are supplied by generate_diagram. A "\n" in the label
# splits the zone title across two lines.
ZONES = [
    ("Audio Layer", 30, 190, "#dbeafe", "#4a9eed"),
    ("VAD", 240, 160, "#ede9fe", "#8b5cf6"),
    ("Transcription", 420, 210, "#dcfce7", "#22c55e"),
    ("Diarization\n(optional)", 650, 200, "#fef9c3", "#f59e0b"),
    ("Summarisation", 870, 210, "#ffedd5", "#f97316"),
    ("Output", 1100, 270, "#d1fae5", "#22c55e"),
]
16
+
17
# Dropdown choices for each configurable stage. The first entry of each list
# is used as the default selection. generate_diagram decides which variant to
# draw by looking for the substrings "distil" and "Ollama" in the chosen
# option, so renaming an option can change what is rendered.
MODEL_OPTIONS = {
    "transcription": [
        "distil-whisper-large-v3 (fast)",
        "whisper-large-v3 (accurate)",
    ],
    "summarisation": [
        "Ollama local LLM (recommended)",
        "facebook/bart-large-cnn (fallback)",
    ],
}
27
+
28
# Markdown help text shown in the "Stage Info" panel, keyed by stage label.
# The keys are the same strings used as zone labels in ZONES (including the
# embedded "\n" in the diarization label), so lookups must use them verbatim.
DESCRIPTIONS = {
    "Audio Layer": (
        "**PipeWire / PulseAudio loopback**\n\n"
        "Creates a virtual sink that captures both your microphone and speaker output "
        "simultaneously into a single stream. On modern Arch Linux you will typically run "
        "PipeWire and can use `pw-loopback` or `pactl load-module module-loopback`. "
        "Python reads the stream via `sounddevice` or `pyaudio`."
    ),
    "VAD": (
        "**silero-vad**\n\n"
        "Tiny, CPU-friendly voice activity detection model. Acts as a gatekeeper: "
        "it fires only when someone is actually speaking, chunking the stream into "
        "speech segments and discarding silence. This keeps downstream models from "
        "wasting cycles on dead air and reduces latency."
    ),
    "Transcription": (
        "**distil-whisper-large-v3**: faster than full Whisper with strong real-time accuracy. "
        "Recommended starting point.\n\n"
        "**whisper-large-v3**: higher accuracy at the cost of more CPU/GPU. "
        "Switch to this if transcription quality is the bottleneck."
    ),
    "Diarization\n(optional)": (
        "**pyannote/speaker-diarization-3.1**\n\n"
        "Labels each speech chunk with a speaker ID (for example, Speaker A and Speaker B). "
        "Requires a Hugging Face token (gated model; request access on the HF Hub). "
        "Skip this on your first pass and add it after the base pipeline is stable."
    ),
    "Summarisation": (
        "**Ollama (local LLM)**: best output quality, full prompt control, and on-device runtime. "
        "Recommended if Ollama is running.\n\n"
        # BART is a sequence-to-sequence model that generates new text, i.e. an
        # abstractive summariser; the earlier wording ("extractive") was wrong.
        "**facebook/bart-large-cnn**: lighter and faster abstractive summariser, good fallback."
    ),
    "Output": (
        "**Summary + Action Items**\n\n"
        "Final structured output: a concise meeting summary plus extracted action items. "
        "Can be enriched with speaker attribution when diarization is enabled upstream."
    ),
}
66
+
67
# Boxes drawn, in order, in the "Build Order" strip at the bottom of the
# diagram. Each entry is (step number, caption, fill colour, outline colour);
# the step number is rendered as "Step <number>" and a "\n" in the caption
# splits it across lines.
BUILD_STEPS = [
    ("1", "PipeWire +\nsounddevice", "#bfdbfe", "#4a9eed"),
    ("2", "silero-vad +\ndistil-whisper", "#ddd6fe", "#8b5cf6"),
    ("3", "Ollama\nsummarisation", "#fed7aa", "#f97316"),
    ("4 (opt.)", "pyannote\ndiarization", "#fef08a", "#f59e0b"),
]
73
+
74
+
75
def _font(bold: bool, size: int) -> ImageFont.FreeTypeFont | ImageFont.ImageFont:
    """Return a font of the requested weight and size for drawing labels.

    The candidate TrueType files are tried in order; the first one that exists
    and can be loaded wins. If none is usable, Pillow's built-in default font
    is returned instead so rendering never fails for lack of system fonts.

    Args:
        bold: Select the bold face when true, the regular face otherwise.
        size: Requested font size, passed to Pillow.

    Returns:
        A loaded TrueType font, or Pillow's default font as a last resort.
    """
    if bold:
        candidates = [
            "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
            "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
        ]
    else:
        candidates = [
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
        ]

    for path in candidates:
        if not os.path.exists(path):
            continue
        try:
            return ImageFont.truetype(path, size)
        except OSError:
            # The file exists but cannot be read or parsed; fall through to the
            # next candidate instead of aborting the whole render.
            continue

    try:
        # Newer Pillow releases accept a size for the default font, which keeps
        # the fallback text close to the intended scale.
        return ImageFont.load_default(size=size)
    except TypeError:
        # Older Pillow: load_default() takes no arguments and returns a
        # fixed-size bitmap font.
        return ImageFont.load_default()
92
+
93
+
94
def _rbox(draw: ImageDraw.ImageDraw, x: int, y: int, w: int, h: int, fill: str, stroke: str, r: int = 12) -> None:
    """Draw a filled rounded rectangle with a 2px outline.

    The box has its top-left corner at (x, y) and extends w pixels right and
    h pixels down; r is the corner radius.
    """
    corners = [x, y, x + w, y + h]
    draw.rounded_rectangle(corners, radius=r, fill=fill, outline=stroke, width=2)
96
+
97
+
98
def _center_text(
    draw: ImageDraw.ImageDraw,
    x: int,
    y: int,
    w: int,
    lines: list[str],
    font: ImageFont.FreeTypeFont | ImageFont.ImageFont,
    color: str = "#1e1e1e",
    lh: int = 20,
) -> None:
    """Draw several lines of text centred inside a horizontal span.

    Each line is centred horizontally within the span that starts at x and is
    w pixels wide. Vertically, the stack of lines (lh pixels per line) is
    centred on y.
    """
    top = y - (len(lines) * lh) // 2
    for row, text in enumerate(lines):
        bbox = draw.textbbox((0, 0), text, font=font)
        rendered_width = bbox[2] - bbox[0]
        position = (x + (w - rendered_width) // 2, top + row * lh)
        draw.text(position, text, font=font, fill=color)
115
+
116
+
117
def _arrow(
    draw: ImageDraw.ImageDraw,
    x1: int,
    y1: int,
    x2: int,
    y2: int,
    color: str = "#555",
    label: str = "",
    label_font: ImageFont.FreeTypeFont | ImageFont.ImageFont | None = None,
) -> None:
    """Draw a straight arrow from (x1, y1) to (x2, y2), optionally labelled.

    The arrowhead is two short strokes at the (x2, y2) end, angled 0.4 rad to
    either side of the shaft. When both label and label_font are given, the
    label is drawn centred horizontally on the shaft's midpoint and 16px above
    it, always in "#555" regardless of color.
    """
    tip = (x2, y2)
    draw.line([(x1, y1), tip], fill=color, width=2)

    heading = math.atan2(y2 - y1, x2 - x1)
    head_len = 10
    for spread in (0.4, -0.4):
        barb_end = (
            x2 - head_len * math.cos(heading - spread),
            y2 - head_len * math.sin(heading - spread),
        )
        draw.line([tip, barb_end], fill=color, width=2)

    if not (label and label_font):
        return

    mid_x = (x1 + x2) // 2
    mid_y = (y1 + y2) // 2
    bbox = draw.textbbox((0, 0), label, font=label_font)
    label_width = bbox[2] - bbox[0]
    draw.text((mid_x - label_width // 2, mid_y - 16), label, font=label_font, fill="#555")
141
+
142
+
143
def generate_diagram(asr_choice: str, sum_choice: str, show_diar: bool) -> Image.Image:
    """Render the meeting-summarisation pipeline as a 1400x900 RGB image.

    Args:
        asr_choice: Selected transcription option. Any value containing
            "distil" draws the distil-whisper variant; anything else draws
            whisper-large-v3.
        sum_choice: Selected summarisation option. Any value containing
            "Ollama" draws the Ollama variant; anything else draws
            bart-large-cnn.
        show_diar: Whether to draw the diarization zone and stage. When false,
            the transcript is routed below the stage row and into the bottom
            of the summarisation box.

    Returns:
        A newly created image containing the diagram.
    """
    width, height = 1400, 900
    img = Image.new("RGB", (width, height), "#f8f9fa")
    draw = ImageDraw.Draw(img)

    font_bold = _font(True, 15)
    font_regular = _font(False, 13)
    font_title = _font(True, 22)
    font_zone_title = _font(True, 13)
    font_step = _font(True, 12)

    def _text_width(text: str, font) -> int:
        # Horizontal extent of `text` when rendered with `font`.
        left, _, right, _ = draw.textbbox((0, 0), text, font=font)
        return right - left

    # --- Title -----------------------------------------------------------
    title = "Meeting Summarisation Pipeline"
    draw.text(
        ((width - _text_width(title, font_title)) // 2, 18),
        title,
        font=font_title,
        fill="#1e1e1e",
    )

    # --- Background zones --------------------------------------------------
    zone_y, zone_h = 60, 710
    for label, zone_x, zone_w, zone_fill, zone_stroke in ZONES:
        if not show_diar and "Diarization" in label:
            continue

        _rbox(draw, zone_x, zone_y, zone_w, zone_h, zone_fill, zone_stroke, r=14)
        for idx, line in enumerate(label.split("\n")):
            draw.text(
                (
                    zone_x + (zone_w - _text_width(line, font_zone_title)) // 2,
                    zone_y + 6 + idx * 16,
                ),
                line,
                font=font_zone_title,
                fill=zone_stroke,
            )

    # --- Audio layer -------------------------------------------------------
    _rbox(draw, 45, 130, 160, 60, "#bfdbfe", "#4a9eed")
    _center_text(draw, 45, 160, 160, ["PipeWire", "Loopback Sink"], font_bold, "#1e3a8a")
    _arrow(draw, 125, 190, 125, 230, "#4a9eed")
    _rbox(draw, 45, 230, 160, 60, "#bfdbfe", "#4a9eed")
    _center_text(draw, 45, 260, 160, ["sounddevice", "/ pyaudio"], font_bold, "#1e3a8a")

    # --- Voice activity detection -------------------------------------------
    _rbox(draw, 255, 175, 130, 65, "#ddd6fe", "#8b5cf6")
    _center_text(draw, 255, 207, 130, ["silero-vad", "voice activity"], font_bold, "#4c1d95")
    _arrow(draw, 205, 260, 255, 210, "#4a9eed", "raw audio", font_regular)

    # --- Transcription -------------------------------------------------------
    if "distil" in asr_choice:
        asr_lines = ["distil-whisper-v3", "fast / real-time"]
    else:
        asr_lines = ["whisper-large-v3", "high accuracy"]
    _rbox(draw, 435, 175, 180, 65, "#bbf7d0", "#22c55e")
    _center_text(draw, 435, 207, 180, asr_lines, font_bold, "#14532d")
    _arrow(draw, 385, 207, 435, 207, "#8b5cf6", "speech chunks", font_regular)

    # --- Diarization, or the route that bypasses it ---------------------------
    if show_diar:
        _rbox(draw, 665, 175, 170, 75, "#fef08a", "#f59e0b")
        _center_text(
            draw,
            665,
            212,
            170,
            ["pyannote/", "speaker-diar-3.1", "needs HF token"],
            font_step,
            "#78350f",
            lh=18,
        )
        _arrow(draw, 615, 207, 665, 207, "#22c55e", "transcript", font_regular)
    else:
        # Leave the transcription box, drop below the stage row and run across
        # to underneath the summarisation box. The final upward segment is
        # drawn after that box so its arrowhead sits on top.
        draw.line(
            [(615, 207), (650, 207), (650, 340), (920, 340)],
            fill="#22c55e",
            width=2,
        )
        skip_label = "skip diarization"
        draw.text(
            (750 - _text_width(skip_label, font_regular) // 2, 345),
            skip_label,
            font=font_regular,
            fill="#15803d",
        )

    # --- Summarisation ---------------------------------------------------------
    if "Ollama" in sum_choice:
        sum_lines = ["Ollama (local LLM)", "recommended"]
        sum_fill = "#fed7aa"
    else:
        sum_lines = ["facebook/", "bart-large-cnn"]
        sum_fill = "#fde8d8"

    _rbox(draw, 885, 175, 175, 65, sum_fill, "#f97316")
    _center_text(draw, 885, 207, 175, sum_lines, font_bold, "#7c2d12")
    if show_diar:
        _arrow(draw, 835, 207, 885, 207, "#f59e0b", "labelled speech", font_regular)
    else:
        # Finish the bypass route at the bottom edge of the summarisation box
        # (y = 175 + 65) with an arrowhead. It previously stopped at y = 300,
        # short of the box and without a head.
        _arrow(draw, 920, 340, 920, 240, "#22c55e")
    _arrow(draw, 1060, 207, 1115, 207, "#f97316")

    # --- Output ------------------------------------------------------------------
    _rbox(draw, 1115, 165, 235, 75, "#6ee7b7", "#22c55e")
    _center_text(draw, 1115, 202, 235, ["Summary +", "Action Items"], font_bold, "#064e3b")

    # --- Build-order strip -------------------------------------------------------
    box_x, box_y = 30, 790
    draw.rounded_rectangle(
        [box_x, box_y, box_x + 1340, box_y + 85],
        radius=10,
        fill="#f1f5f9",
        outline="#cbd5e1",
        width=1,
    )
    draw.text((box_x + 14, box_y + 10), "Build Order:", font=font_bold, fill="#1e1e1e")

    step_x = box_x + 120
    last_step = len(BUILD_STEPS) - 1
    for idx, (num, text, fill, stroke) in enumerate(BUILD_STEPS):
        _rbox(draw, step_x, box_y + 8, 185, 65, fill, stroke, r=8)
        y0 = box_y + 14
        for line in [f"Step {num}", *text.split("\n")]:
            draw.text(
                (step_x + (185 - _text_width(line, font_step)) // 2, y0),
                line,
                font=font_step,
                fill="#1e1e1e",
            )
            y0 += 16
        # Connect consecutive steps only: no arrow after the final step, and
        # none that would run past the right edge of the strip.
        if idx < last_step and step_x + 185 + 40 < box_x + 1340:
            _arrow(draw, step_x + 185, box_y + 40, step_x + 225, box_y + 40, "#555")
        step_x += 225

    return img
262
+
263
+
264
def show_desc(stage: str | None) -> str:
    """Return the Markdown description for a stage label.

    A falsy stage (for example None when nothing is selected) or a label with
    no entry in DESCRIPTIONS yields a fixed placeholder message.
    """
    placeholder = "No description available."
    return DESCRIPTIONS.get(stage, placeholder) if stage else placeholder
268
+
269
+
270
# Gradio UI: the diagram on the left, configuration controls and stage help on
# the right. Components are laid out in the order they are created here.
with gr.Blocks(title="Meeting Summarisation Pipeline") as demo:
    gr.Markdown("## Meeting Summarisation Pipeline Explorer")
    gr.Markdown(
        "Visualise and configure a local, cross-platform meeting summariser "
        "built on Hugging Face models and PipeWire. Adjust the options below "
        "and the diagram will update live."
    )

    with gr.Row():
        with gr.Column(scale=3):
            # The initial image is rendered once, while the UI is being built,
            # using the same defaults the controls below start with.
            diagram = gr.Image(
                value=generate_diagram(
                    MODEL_OPTIONS["transcription"][0],
                    MODEL_OPTIONS["summarisation"][0],
                    True,
                ),
                label="Pipeline Diagram",
                interactive=False,
            )

        with gr.Column(scale=1):
            gr.Markdown("### Configuration")
            asr_dd = gr.Dropdown(
                choices=MODEL_OPTIONS["transcription"],
                value=MODEL_OPTIONS["transcription"][0],
                label="Transcription model",
            )
            sum_dd = gr.Dropdown(
                choices=MODEL_OPTIONS["summarisation"],
                value=MODEL_OPTIONS["summarisation"][0],
                label="Summarisation model",
            )
            diar_cb = gr.Checkbox(value=True, label="Include diarization (pyannote)")
            gr.Markdown("---")
            gr.Markdown("### Stage Info")
            # Starts with no selection; the placeholder text below is shown
            # until a stage is picked.
            stage_dd = gr.Dropdown(
                choices=list(DESCRIPTIONS.keys()),
                label="Select a stage to learn more",
                value=None,
            )
            stage_info = gr.Markdown("Select a stage above.")

    # Re-render the diagram whenever any configuration control changes. Every
    # handler reads all three controls, so the image always reflects the full
    # current configuration.
    for ctrl in (asr_dd, sum_dd, diar_cb):
        ctrl.change(
            fn=lambda a, s, dz: generate_diagram(a, s, dz),
            inputs=[asr_dd, sum_dd, diar_cb],
            outputs=diagram,
        )

    # Show the help text for the selected stage.
    stage_dd.change(fn=show_desc, inputs=stage_dd, outputs=stage_info)

    gr.Markdown("---")
    gr.Markdown(
        "**Build order:** PipeWire + sounddevice -> silero-vad + distil-whisper "
        "-> Ollama summarisation -> pyannote diarization (optional, last)"
    )


# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
330
+