aidn committed on
Commit
5e00e96
·
verified ·
1 Parent(s): 2b0351a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +190 -298
app.py CHANGED
@@ -1,330 +1,222 @@
1
- import math
2
  import os
 
3
 
 
 
 
4
  import gradio as gr
5
- from PIL import Image, ImageDraw, ImageFont
6
-
7
-
8
- ZONES = [
9
- ("Audio Layer", 30, 190, "#dbeafe", "#4a9eed"),
10
- ("VAD", 240, 160, "#ede9fe", "#8b5cf6"),
11
- ("Transcription", 420, 210, "#dcfce7", "#22c55e"),
12
- ("Diarization\n(optional)", 650, 200, "#fef9c3", "#f59e0b"),
13
- ("Summarisation", 870, 210, "#ffedd5", "#f97316"),
14
- ("Output", 1100, 270, "#d1fae5", "#22c55e"),
15
- ]
16
-
17
- MODEL_OPTIONS = {
18
- "transcription": [
19
- "distil-whisper-large-v3 (fast)",
20
- "whisper-large-v3 (accurate)",
21
- ],
22
- "summarisation": [
23
- "Ollama local LLM (recommended)",
24
- "facebook/bart-large-cnn (fallback)",
25
- ],
26
- }
27
 
28
- DESCRIPTIONS = {
29
- "Audio Layer": (
30
- "**PipeWire / PulseAudio loopback**\n\n"
31
- "Creates a virtual sink that captures both your microphone and speaker output "
32
- "simultaneously into a single stream. On modern Arch Linux you will typically run "
33
- "PipeWire and can use `pw-loopback` or `pactl load-module module-loopback`. "
34
- "Python reads the stream via `sounddevice` or `pyaudio`."
35
- ),
36
- "VAD": (
37
- "**silero-vad**\n\n"
38
- "Tiny, CPU-friendly voice activity detection model. Acts as a gatekeeper: "
39
- "it fires only when someone is actually speaking, chunking the stream into "
40
- "speech segments and discarding silence. This keeps downstream models from "
41
- "wasting cycles on dead air and reduces latency."
42
- ),
43
- "Transcription": (
44
- "**distil-whisper-large-v3**: faster than full Whisper with strong real-time accuracy. "
45
- "Recommended starting point.\n\n"
46
- "**whisper-large-v3**: higher accuracy at the cost of more CPU/GPU. "
47
- "Switch to this if transcription quality is the bottleneck."
48
- ),
49
- "Diarization\n(optional)": (
50
- "**pyannote/speaker-diarization-3.1**\n\n"
51
- "Labels each speech chunk with a speaker ID (for example, Speaker A and Speaker B). "
52
- "Requires a Hugging Face token (gated model; request access on the HF Hub). "
53
- "Skip this on your first pass and add it after the base pipeline is stable."
54
- ),
55
- "Summarisation": (
56
- "**Ollama (local LLM)**: best output quality, full prompt control, and on-device runtime. "
57
- "Recommended if Ollama is running.\n\n"
58
- "**facebook/bart-large-cnn**: lighter and faster extractive summariser, good fallback."
59
- ),
60
- "Output": (
61
- "**Summary + Action Items**\n\n"
62
- "Final structured output: a concise meeting summary plus extracted action items. "
63
- "Can be enriched with speaker attribution when diarization is enabled upstream."
64
- ),
65
  }
66
 
67
- BUILD_STEPS = [
68
- ("1", "PipeWire +\nsounddevice", "#bfdbfe", "#4a9eed"),
69
- ("2", "silero-vad +\ndistil-whisper", "#ddd6fe", "#8b5cf6"),
70
- ("3", "Ollama\nsummarisation", "#fed7aa", "#f97316"),
71
- ("4 (opt.)", "pyannote\ndiarization", "#fef08a", "#f59e0b"),
72
- ]
73
-
74
-
75
- def _font(bold: bool, size: int) -> ImageFont.FreeTypeFont | ImageFont.ImageFont:
76
- if bold:
77
- candidates = [
78
- "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
79
- "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
80
- ]
81
- else:
82
- candidates = [
83
- "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
84
- "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
85
- ]
86
-
87
- for path in candidates:
88
- if os.path.exists(path):
89
- return ImageFont.truetype(path, size)
90
-
91
- return ImageFont.load_default()
92
-
93
-
94
- def _rbox(draw: ImageDraw.ImageDraw, x: int, y: int, w: int, h: int, fill: str, stroke: str, r: int = 12) -> None:
95
- draw.rounded_rectangle([x, y, x + w, y + h], radius=r, fill=fill, outline=stroke, width=2)
96
-
97
-
98
- def _center_text(
99
- draw: ImageDraw.ImageDraw,
100
- x: int,
101
- y: int,
102
- w: int,
103
- lines: list[str],
104
- font: ImageFont.FreeTypeFont | ImageFont.ImageFont,
105
- color: str = "#1e1e1e",
106
- lh: int = 20,
107
- ) -> None:
108
- total = len(lines) * lh
109
- current_y = y - total // 2
110
- for line in lines:
111
- left, _, right, _ = draw.textbbox((0, 0), line, font=font)
112
- text_width = right - left
113
- draw.text((x + (w - text_width) // 2, current_y), line, font=font, fill=color)
114
- current_y += lh
115
-
116
-
117
- def _arrow(
118
- draw: ImageDraw.ImageDraw,
119
- x1: int,
120
- y1: int,
121
- x2: int,
122
- y2: int,
123
- color: str = "#555",
124
- label: str = "",
125
- label_font: ImageFont.FreeTypeFont | ImageFont.ImageFont | None = None,
126
- ) -> None:
127
- draw.line([(x1, y1), (x2, y2)], fill=color, width=2)
128
-
129
- angle = math.atan2(y2 - y1, x2 - x1)
130
- size = 10
131
- for delta in (0.4, -0.4):
132
- ax = x2 - size * math.cos(angle - delta)
133
- ay = y2 - size * math.sin(angle - delta)
134
- draw.line([(x2, y2), (ax, ay)], fill=color, width=2)
135
-
136
- if label and label_font:
137
- mx, my = (x1 + x2) // 2, (y1 + y2) // 2
138
- left, _, right, _ = draw.textbbox((0, 0), label, font=label_font)
139
- text_width = right - left
140
- draw.text((mx - text_width // 2, my - 16), label, font=label_font, fill="#555")
141
-
142
-
143
- def generate_diagram(asr_choice: str, sum_choice: str, show_diar: bool) -> Image.Image:
144
- width, height = 1400, 900
145
- img = Image.new("RGB", (width, height), "#f8f9fa")
146
- draw = ImageDraw.Draw(img)
147
-
148
- font_bold = _font(True, 15)
149
- font_regular = _font(False, 13)
150
- font_title = _font(True, 22)
151
- font_zone_title = _font(True, 13)
152
- font_step = _font(True, 12)
153
-
154
- left, _, right, _ = draw.textbbox((0, 0), "Meeting Summarisation Pipeline", font=font_title)
155
- title_width = right - left
156
- draw.text(
157
- ((width - title_width) // 2, 18),
158
- "Meeting Summarisation Pipeline",
159
- font=font_title,
160
- fill="#1e1e1e",
161
- )
162
 
163
- zone_y, zone_h = 60, 710
164
- for label, zone_x, zone_w, zone_fill, zone_stroke in ZONES:
165
- if not show_diar and "Diarization" in label:
166
- continue
167
 
168
- _rbox(draw, zone_x, zone_y, zone_w, zone_h, zone_fill, zone_stroke, r=14)
169
- for idx, line in enumerate(label.split("\n")):
170
- left, _, right, _ = draw.textbbox((0, 0), line, font=font_zone_title)
171
- text_width = right - left
172
- draw.text(
173
- (zone_x + (zone_w - text_width) // 2, zone_y + 6 + idx * 16),
174
- line,
175
- font=font_zone_title,
176
- fill=zone_stroke,
177
  )
 
178
 
179
- _rbox(draw, 45, 130, 160, 60, "#bfdbfe", "#4a9eed")
180
- _center_text(draw, 45, 160, 160, ["PipeWire", "Loopback Sink"], font_bold, "#1e3a8a")
181
- _arrow(draw, 125, 190, 125, 230, "#4a9eed")
182
- _rbox(draw, 45, 230, 160, 60, "#bfdbfe", "#4a9eed")
183
- _center_text(draw, 45, 260, 160, ["sounddevice", "/ pyaudio"], font_bold, "#1e3a8a")
184
-
185
- _rbox(draw, 255, 175, 130, 65, "#ddd6fe", "#8b5cf6")
186
- _center_text(draw, 255, 207, 130, ["silero-vad", "voice activity"], font_bold, "#4c1d95")
187
- _arrow(draw, 205, 260, 255, 210, "#4a9eed", "raw audio", font_regular)
188
-
189
- use_fast = "distil" in asr_choice
190
- if use_fast:
191
- asr_lines = ["distil-whisper-v3", "fast / real-time"]
192
- else:
193
- asr_lines = ["whisper-large-v3", "high accuracy"]
194
- _rbox(draw, 435, 175, 180, 65, "#bbf7d0", "#22c55e")
195
- _center_text(draw, 435, 207, 180, asr_lines, font_bold, "#14532d")
196
- _arrow(draw, 385, 207, 435, 207, "#8b5cf6", "speech chunks", font_regular)
197
-
198
- if show_diar:
199
- _rbox(draw, 665, 175, 170, 75, "#fef08a", "#f59e0b")
200
- _center_text(
201
- draw,
202
- 665,
203
- 212,
204
- 170,
205
- ["pyannote/", "speaker-diar-3.1", "needs HF token"],
206
- font_step,
207
- "#78350f",
208
- lh=18,
209
  )
210
- _arrow(draw, 615, 207, 665, 207, "#22c55e", "transcript", font_regular)
211
- sum_src_x = 835
212
- else:
213
- draw.line([(615, 207), (650, 207)], fill="#22c55e", width=2)
214
- draw.line([(650, 207), (650, 340), (920, 340), (920, 300)], fill="#22c55e", width=2)
215
- left, _, right, _ = draw.textbbox((0, 0), "skip diarization", font=font_regular)
216
- text_width = right - left
217
- draw.text((750 - text_width // 2, 345), "skip diarization", font=font_regular, fill="#15803d")
218
- sum_src_x = None
219
-
220
- use_ollama = "Ollama" in sum_choice
221
- if use_ollama:
222
- sum_lines = ["Ollama (local LLM)", "recommended"]
223
- sum_fill = "#fed7aa"
224
- else:
225
- sum_lines = ["facebook/", "bart-large-cnn"]
226
- sum_fill = "#fde8d8"
227
-
228
- _rbox(draw, 885, 175, 175, 65, sum_fill, "#f97316")
229
- _center_text(draw, 885, 207, 175, sum_lines, font_bold, "#7c2d12")
230
- if show_diar and sum_src_x is not None:
231
- _arrow(draw, sum_src_x, 207, 885, 207, "#f59e0b", "labelled speech", font_regular)
232
- _arrow(draw, 1060, 207, 1115, 207, "#f97316")
233
-
234
- _rbox(draw, 1115, 165, 235, 75, "#6ee7b7", "#22c55e")
235
- _center_text(draw, 1115, 202, 235, ["Summary +", "Action Items"], font_bold, "#064e3b")
236
-
237
- box_x, box_y = 30, 790
238
- draw.rounded_rectangle(
239
- [box_x, box_y, box_x + 1340, box_y + 85],
240
- radius=10,
241
- fill="#f1f5f9",
242
- outline="#cbd5e1",
243
- width=1,
244
- )
245
- draw.text((box_x + 14, box_y + 10), "Build Order:", font=font_bold, fill="#1e1e1e")
 
 
 
 
 
 
 
 
 
 
 
246
 
247
- step_x = box_x + 120
248
- for num, text, fill, stroke in BUILD_STEPS:
249
- _rbox(draw, step_x, box_y + 8, 185, 65, fill, stroke, r=8)
250
- lines = [f"Step {num}"] + text.split("\n")
251
- y0 = box_y + 14
252
- for line in lines:
253
- left, _, right, _ = draw.textbbox((0, 0), line, font=font_step)
254
- text_width = right - left
255
- draw.text((step_x + (185 - text_width) // 2, y0), line, font=font_step, fill="#1e1e1e")
256
- y0 += 16
257
- if step_x + 185 + 40 < box_x + 1340:
258
- _arrow(draw, step_x + 185, box_y + 40, step_x + 225, box_y + 40, "#555")
259
- step_x += 225
260
 
261
- return img
 
 
 
 
262
 
 
263
 
264
- def show_desc(stage: str | None) -> str:
265
- if not stage:
266
- return "No description available."
267
- return DESCRIPTIONS.get(stage, "No description available.")
268
 
 
 
 
269
 
270
- with gr.Blocks(title="Meeting Summarisation Pipeline") as demo:
271
- gr.Markdown("## Meeting Summarisation Pipeline Explorer")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  gr.Markdown(
273
- "Visualise and configure a local, cross-platform meeting summariser "
274
- "built on Hugging Face models and PipeWire. Adjust the options below "
275
- "and the diagram will update live."
276
  )
277
 
278
- with gr.Row():
279
- with gr.Column(scale=3):
280
- diagram = gr.Image(
281
- value=generate_diagram(
282
- MODEL_OPTIONS["transcription"][0],
283
- MODEL_OPTIONS["summarisation"][0],
284
- True,
285
- ),
286
- label="Pipeline Diagram",
287
- interactive=False,
288
- )
289
 
 
290
  with gr.Column(scale=1):
291
- gr.Markdown("### Configuration")
292
- asr_dd = gr.Dropdown(
293
- choices=MODEL_OPTIONS["transcription"],
294
- value=MODEL_OPTIONS["transcription"][0],
295
- label="Transcription model",
296
  )
297
- sum_dd = gr.Dropdown(
298
- choices=MODEL_OPTIONS["summarisation"],
299
- value=MODEL_OPTIONS["summarisation"][0],
300
- label="Summarisation model",
301
  )
302
- diar_cb = gr.Checkbox(value=True, label="Include diarization (pyannote)")
303
- gr.Markdown("---")
304
- gr.Markdown("### Stage Info")
305
- stage_dd = gr.Dropdown(
306
- choices=list(DESCRIPTIONS.keys()),
307
- label="Select a stage to learn more",
308
- value=None,
309
  )
310
- stage_info = gr.Markdown("Select a stage above.")
311
 
312
- for ctrl in (asr_dd, sum_dd, diar_cb):
313
- ctrl.change(
314
- fn=lambda a, s, dz: generate_diagram(a, s, dz),
315
- inputs=[asr_dd, sum_dd, diar_cb],
316
- outputs=diagram,
317
- )
318
-
319
- stage_dd.change(fn=show_desc, inputs=stage_dd, outputs=stage_info)
 
 
 
 
320
 
321
- gr.Markdown("---")
322
  gr.Markdown(
323
- "**Build order:** PipeWire + sounddevice -> silero-vad + distil-whisper "
324
- "-> Ollama summarisation -> pyannote diarization (optional, last)"
 
 
 
325
  )
326
 
 
 
 
 
 
327
 
328
  if __name__ == "__main__":
329
- demo.launch()
330
-
 
 
1
import os
import tempfile

import numpy as np
import soundfile as sf
import torch
import gradio as gr
from transformers import pipeline as hf_pipeline

# ── Configuration ──────────────────────────────────────────────────────────────
# Hugging Face access token, read from the environment (set as a Space secret).
# Only required for the gated pyannote diarization model; empty string when
# unset, which disables diarization in the UI below.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
# UI display label → Hugging Face model id.
# The labels double as dropdown choices (and the dropdown's default value
# references one of them verbatim), so insertion order defines dropdown order
# and the key strings must not change.
ASR_MODELS = {
    "whisper-tiny (schnellste, geringste Qualität)": "openai/whisper-tiny",
    "whisper-base (schnell, gut für kurze Aufnahmen)": "openai/whisper-base",
    "whisper-small (empfohlen für CPU)": "openai/whisper-small",
    "distil-whisper-large-v3 (langsam, beste Qualität)": "distil-whisper/distil-large-v3",
}
19
 
20
# ── Lazy model loading ─────────────────────────────────────────────────────────
_asr_cache: dict = {}
_diar_pipe = None


def get_asr(model_key: str):
    """Return a cached ASR pipeline for the model selected by *model_key*.

    The pipeline is built on first use and memoised in ``_asr_cache`` so that
    switching between models in the UI never reloads already-downloaded
    weights.
    """
    model_id = ASR_MODELS[model_key]
    pipe = _asr_cache.get(model_id)
    if pipe is None:
        pipe = hf_pipeline(
            "automatic-speech-recognition",
            model=model_id,
            device="cpu",            # Space runs on CPU hardware
            torch_dtype=torch.float32,
            chunk_length_s=30,       # long-form audio processed in 30 s windows
            return_timestamps=True,  # timestamps are needed for the speaker merge
        )
        _asr_cache[model_id] = pipe
    return pipe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
 
 
 
 
38
 
39
def get_diar():
    """Return the pyannote diarization pipeline, loading it on first use.

    ``pyannote.audio`` is imported lazily so the heavy dependency is only
    pulled in when diarization is actually requested.

    Raises:
        EnvironmentError: if no ``HF_TOKEN`` is configured (the model is gated
            on the Hugging Face Hub).
    """
    global _diar_pipe
    if _diar_pipe is not None:
        return _diar_pipe

    if not HF_TOKEN:
        raise EnvironmentError(
            "HF_TOKEN nicht gesetzt. Füge ihn in den Space-Settings unter "
            "'Settings → Variables and secrets' hinzu."
        )

    from pyannote.audio import Pipeline as PyannotePipeline

    _diar_pipe = PyannotePipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=HF_TOKEN,
    )
    return _diar_pipe
55
+
56
# ── Helper functions ───────────────────────────────────────────────────────────

def merge_with_speakers(chunks: list, diarization) -> list[tuple]:
    """Assign the dominant speaker to every ASR chunk.

    Args:
        chunks: Whisper output chunks, each a dict with a ``"timestamp"``
            ``(start, end)`` pair and a ``"text"`` string.
        diarization: pyannote annotation object exposing
            ``itertracks(yield_label=True)``.

    Returns:
        List of ``(start, end, speaker, text)`` tuples. Chunks without a start
        timestamp are dropped; speaker defaults to ``"Unbekannt"`` when no
        turn overlaps the chunk.
    """
    # Materialise the speaker turns once, instead of re-iterating the whole
    # annotation for every single chunk (it is loop-invariant).
    turns = [
        (turn.start, turn.end, speaker)
        for turn, _, speaker in diarization.itertracks(yield_label=True)
    ]

    merged = []
    for chunk in chunks:
        ts = chunk.get("timestamp", (None, None))
        start, end = ts if ts else (None, None)
        if start is None:
            continue
        # Whisper may omit the end timestamp of the final chunk.
        end = end or (start + 1.0)

        best_speaker, best_overlap = "Unbekannt", 0.0
        for turn_start, turn_end, speaker in turns:
            overlap = max(0.0, min(end, turn_end) - max(start, turn_start))
            if overlap > best_overlap:
                best_overlap = overlap
                best_speaker = speaker

        merged.append((start, end, best_speaker, chunk["text"].strip()))
    return merged
78
+
79
def format_diarized(segments: list[tuple]) -> str:
    """Render labelled segments, merging consecutive chunks of one speaker.

    Each ``(start, end, speaker, text)`` run becomes a Markdown block of the
    form ``**SPEAKER** [START s]:`` followed by the joined chunk texts.
    """
    if not segments:
        return ""

    def render(speaker, start, texts):
        return f"**{speaker}** [{start:.1f}s]:\n{' '.join(texts)}"

    blocks = []
    active_speaker, active_start, active_texts = None, 0.0, []

    for start, _end, speaker, text in segments:
        if speaker == active_speaker:
            # Same speaker keeps talking — extend the current block.
            active_texts.append(text)
        else:
            if active_speaker is not None:
                blocks.append(render(active_speaker, active_start, active_texts))
            active_speaker, active_start, active_texts = speaker, start, [text]

    # Flush the trailing block.
    if active_speaker and active_texts:
        blocks.append(render(active_speaker, active_start, active_texts))

    return "\n\n".join(blocks)
100
 
101
# ── Main pipeline ──────────────────────────────────────────────────────────────

def transcribe(audio, model_key: str, use_diar: bool):
    """Generator: streams intermediate results live to the UI.

    Args:
        audio: Gradio ``(sample_rate, np.ndarray)`` tuple, or ``None`` when
            nothing was recorded/uploaded.
        model_key: key into ``ASR_MODELS`` selecting the Whisper variant.
        use_diar: whether to run pyannote speaker diarization afterwards.

    Yields:
        ``(raw_transcript, diarized_transcript)`` string pairs; status
        messages are yielded while the models are still running.
    """
    if audio is None:
        yield "⚠️ Kein Audio eingegeben.", ""
        return

    sample_rate, audio_data = audio

    # Force mono.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    audio_data = audio_data.astype(np.float32)

    # Normalise (16-bit PCM → float). Use the ABSOLUTE peak: a recording whose
    # loudest sample is negative would otherwise never be rescaled. Guard
    # against empty input, where .max() would raise.
    if audio_data.size and np.abs(audio_data).max() > 1.0:
        audio_data /= 32768.0

    # Create the temp path first, then write after the handle is closed so
    # only one open handle ever touches the file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        tmp_path = f.name
    sf.write(tmp_path, audio_data, sample_rate)

    try:
        # ── Step 1: transcription ──
        yield "⏳ Lade ASR-Modell und transkribiere...", ""

        asr = get_asr(model_key)
        result = asr(tmp_path)
        raw_transcript = result["text"].strip()
        chunks = result.get("chunks", [])

        if not use_diar:
            yield raw_transcript, ""
            return

        # ── Step 2: diarization ──
        yield raw_transcript, "⏳ Diarisierung läuft (auf CPU kann das einige Minuten dauern)..."

        try:
            diar = get_diar()
            diarization = diar(tmp_path)
            segments = merge_with_speakers(chunks, diarization)
            labeled = format_diarized(segments)
            yield raw_transcript, labeled or "(Keine Sprecher erkannt.)"

        except EnvironmentError as e:
            # Missing HF_TOKEN — keep the transcript, explain the failure.
            yield raw_transcript, f"⚠️ {e}"
        except Exception as e:
            # Best-effort: diarization failure must not discard the transcript.
            yield raw_transcript, f"⚠️ Diarisierung fehlgeschlagen: {e}"

    finally:
        os.unlink(tmp_path)
156
# ── UI ─────────────────────────────────────────────────────────────────────────

# Markdown banner shown at the top of the page when the gated pyannote model
# cannot be used because no token is configured.
TOKEN_WARNING = (
    "> ⚠️ **Kein `HF_TOKEN` gefunden.** \n"
    "> Diarisierung (pyannote) ist deaktiviert. \n"
    "> Füge das Token unter **Settings → Variables and secrets** als `HF_TOKEN` hinzu \n"
    "> und akzeptiere die Lizenzbedingungen auf [hf.co/pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)."
)

with gr.Blocks(title="Meeting Transcriber") as demo:
    gr.Markdown("# 🎙️ Meeting Transcriber")
    gr.Markdown(
        "Lade eine Audiodatei hoch **oder** nimm direkt über das Mikrofon auf. \n"
        "Das Audio wird transkribiert und optional nach Sprechern getrennt."
    )

    # Surface the missing-token hint prominently.
    if not HF_TOKEN:
        gr.Markdown(TOKEN_WARNING)

    with gr.Row():
        with gr.Column(scale=1):
            # Input side: audio source, model choice, diarization toggle.
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="numpy",
                label="Audio (Mikrofon oder Datei)",
            )
            model_dd = gr.Dropdown(
                choices=list(ASR_MODELS.keys()),
                value="whisper-small (empfohlen für CPU)",
                label="Transkriptionsmodell",
            )
            # Diarization is pre-checked and editable only when a token exists.
            diar_cb = gr.Checkbox(
                value=bool(HF_TOKEN),
                label="Speaker-Diarisierung aktivieren (pyannote, braucht HF_TOKEN)",
                interactive=bool(HF_TOKEN),
            )
            run_btn = gr.Button("▶ Transkribieren", variant="primary")

        with gr.Column(scale=2):
            # Output side: raw Whisper text and the speaker-labelled variant.
            transcript_out = gr.Textbox(
                label="Rohtranskript (Whisper)",
                lines=12,
                show_copy_button=True,
            )
            diar_out = gr.Textbox(
                label="Transkript mit Sprecher-Labels (pyannote)",
                lines=12,
                show_copy_button=True,
                placeholder="Nur sichtbar wenn Diarisierung aktiviert ist.",
            )

    gr.Markdown(
        "---\n"
        "**Hinweise:** \n"
        "• Auf Free CPU dauert Whisper-small ~1–2× Echtzeit, Diarisierung ~2–5× Echtzeit. \n"
        "• Für pyannote musst du die Lizenzbedingungen auf Hugging Face akzeptiert haben. \n"
        "• Das erste Laden der Modelle dauert länger (Download-Cache)."
    )

    # transcribe is a generator, so both textboxes update live as
    # intermediate results are yielded.
    run_btn.click(
        fn=transcribe,
        inputs=[audio_input, model_dd, diar_cb],
        outputs=[transcript_out, diar_out],
    )

if __name__ == "__main__":
    demo.launch()