GitHub Actions commited on
Commit
f2532fa
·
0 Parent(s):

deploy from GitHub 2026-03-04_03:47:45

Browse files
Files changed (5) hide show
  1. README.md +31 -0
  2. app.py +550 -0
  3. lecture_processor.py +389 -0
  4. requirements.txt +11 -0
  5. transcribe.py +71 -0
README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Lecture Processor
3
+ emoji: "\U0001F393"
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 5.15.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ hardware: zero-a10g
12
+ ---
13
+
14
+ # Lecture Processor
15
+
16
+ Transcribe, summarize, and generate quizzes from lecture recordings using **WhisperX** and a fine-tuned **Gemma 3 4B** model.
17
+
18
+ ## How It Works
19
+
20
+ 1. Paste a YouTube lecture URL
21
+ 2. The pipeline automatically:
22
+ - **Transcribes** speech using WhisperX
23
+ - **Summarizes** the lecture with structured sections (Summary, Key Points, Action Points)
24
+ - **Generates quiz questions** (5 MCQ + 3 short answer)
25
+
26
+ ## Tech Stack
27
+
28
+ - **WhisperX** - Speech-to-text transcription
29
+ - **Gemma 3 4B Instruct** - Fine-tuned with QLoRA for lecture summarization and quiz generation
30
+ - **LoRA Adapter** - [noufwithy/gemma-lecture-adapter](https://huggingface.co/noufwithy/gemma-lecture-adapter)
31
+ - **Gradio** - Web interface with ZeroGPU support
app.py ADDED
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import tempfile
4
+ import time
5
+ import traceback
6
+
7
+ import gradio as gr
8
+ import httpx
9
+ import yt_dlp
10
+
11
# `spaces` is only available when running on Hugging Face Spaces (it provides
# the ZeroGPU decorator).  Locally the import fails, so fall back to a no-op
# stub whose GPU(duration=...) decorator returns the function unchanged.
try:
    import spaces
except ImportError:
    class spaces:
        # Mirrors spaces.GPU's call signature; `duration` is ignored here.
        @staticmethod
        def GPU(duration=60):
            def decorator(fn):
                return fn
            return decorator
20
+
21
+ PROXY_BASE = os.environ.get("PROXY_BASE", "").rstrip("/")
22
+ PROXY_TOKEN = os.environ.get("PROXY_TOKEN", "")
23
+
24
+ from transcribe import transcribe_audio, unload_model as unload_whisper
25
+ from lecture_processor import summarize_lecture, generate_quiz
26
+
27
+ # LANGUAGES = {
28
+ # "Auto-detect": None,
29
+ # "English": "en",
30
+ # "Korean": "ko",
31
+ # "Japanese": "ja",
32
+ # "Chinese": "zh",
33
+ # "Spanish": "es",
34
+ # "French": "fr",
35
+ # "German": "de",
36
+ # "Italian": "it",
37
+ # "Portuguese": "pt",
38
+ # "Russian": "ru",
39
+ # "Arabic": "ar",
40
+ # "Hindi": "hi",
41
+ # }
42
+
43
+
44
+ def get_youtube_video_id(url: str) -> str | None:
45
+ """Extract video ID from various YouTube URL formats."""
46
+ patterns = [
47
+ r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})",
48
+ ]
49
+ for pattern in patterns:
50
+ match = re.search(pattern, url)
51
+ if match:
52
+ return match.group(1)
53
+ return None
54
+
55
+
56
def make_embed_html(video_id: str) -> str:
    """Build the HTML for an embeddable YouTube player iframe."""
    src = f"https://www.youtube.com/embed/{video_id}"
    return (
        f'<iframe width="100%" height="400" src="{src}" '
        'frameborder="0" allowfullscreen></iframe>'
    )
58
+
59
+
60
def download_youtube_audio(url: str) -> str:
    """Download audio from YouTube URL, returns path to wav file.

    yt-dlp writes to <tmp>/audio and the FFmpeg post-processor converts the
    result to <tmp>/audio.wav, which is the path returned here.
    """
    tmp_dir = tempfile.mkdtemp()
    output_path = f"{tmp_dir}/audio.wav"
    ydl_opts = {
        "format": "bestaudio/best",
        # Convert whatever container yt-dlp picks into WAV for transcription.
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "wav",
        }],
        # No extension here: FFmpegExtractAudio appends ".wav" itself.
        "outtmpl": f"{tmp_dir}/audio",
        "quiet": True,
        "no_warnings": True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return output_path
77
+
78
+
79
# UI language labels -> WhisperX language codes.  Currently restricted to
# English; a larger candidate table exists commented out near the imports.
LANGUAGES = {
    "English": "en",
}
82
+
83
+
84
def make_status_html(step: int = 0, timing: str = "", error: str = "") -> str:
    """Render the pipeline progress bar as an HTML snippet.

    Steps: 0=idle, 1=download, 2=transcribe, 3=summarize, 4=quiz, 5=done.
    An error message takes precedence over any step; step 0 renders nothing.
    """
    if error:
        return f'<div class="status-bar error">{error}</div>'
    if not step:
        return ""

    def render_step(idx: int, label: str) -> str:
        # Map each step to its visual state relative to the current step.
        number = idx + 1
        if number < step or step == 5:
            state, icon = "done", "&#10003;"
        elif number == step:
            state, icon = "active", "&#8635;"
        else:
            state, icon = "pending", str(number)
        return f'<div class="step {state}"><span class="num">{icon}</span>{label}</div>'

    steps_html = '<div class="conn"></div>'.join(
        render_step(i, lbl)
        for i, lbl in enumerate(["Download", "Transcribe", "Summarize", "Quiz"])
    )
    timing_html = f'<div class="timing">{timing}</div>' if timing else ""

    return f'<div class="status-bar"><div class="steps">{steps_html}</div>{timing_html}</div>'
110
+
111
+
112
@spaces.GPU(duration=120)
def _run_pipeline(audio_path: str, language: str):
    """Pipeline that yields (transcript, summary, quiz, step, timing) progressively.

    step values match make_status_html: 3=summarizing, 4=quiz, 5=done;
    step 0 signals a terminal failure.  Runs under the ZeroGPU decorator
    with a 120-second budget.
    """
    lang_code = LANGUAGES.get(language)  # None for unknown labels
    timings = {}

    gr.Info("Transcribing audio with WhisperX...")
    try:
        t0 = time.time()
        raw_text = transcribe_audio(audio_path, language=lang_code)
        timings["Transcription"] = time.time() - t0
    except Exception as e:
        # Transcription failure is fatal: surface the error in the transcript box.
        yield f"[Transcription error] {e}", "", "", 0, ""
        return

    if not raw_text:
        yield "(no speech detected)", "", "", 0, ""
        return

    timing_str = " | ".join(f"{k}: {v:.1f}s" for k, v in timings.items())
    # Show the transcript immediately; summarization (step 3) comes next.
    yield raw_text, "", "", 3, timing_str

    # Free Whisper's GPU memory before loading Gemma.
    unload_whisper()

    gr.Info("Generating summary with Gemma...")
    try:
        t0 = time.time()
        summary = summarize_lecture(raw_text)
        timings["Summarization"] = time.time() - t0
    except Exception as e:
        # Non-fatal: keep going and display the error text as the summary.
        print(f"[ERROR] Summarization failed: {e}")
        traceback.print_exc()
        summary = f"[Summarization error] {e}"

    timing_str = " | ".join(f"{k}: {v:.1f}s" for k, v in timings.items())
    yield raw_text, summary, "", 4, timing_str

    gr.Info("Generating quiz with Gemma...")
    try:
        t0 = time.time()
        quiz = generate_quiz(raw_text)
        timings["Quiz Generation"] = time.time() - t0
    except Exception as e:
        # Non-fatal, same policy as summarization.
        print(f"[ERROR] Quiz generation failed: {e}")
        traceback.print_exc()
        quiz = f"[Quiz generation error] {e}"

    timing_str = " | ".join(f"{k}: {v:.1f}s" for k, v in timings.items())
    total = sum(timings.values())
    timing_str += f" | Total: {total:.1f}s"

    yield raw_text, summary, quiz, 5, timing_str
164
+
165
+
166
def fetch_audio_from_proxy(url: str) -> str:
    """Request audio extraction from proxy, save to tmp file, return path.

    Streams the proxy's response body to disk in 8 KiB chunks so large
    audio files never sit fully in memory.  Raises httpx.HTTPStatusError
    on a non-2xx response.
    """
    headers = {"x-proxy-token": PROXY_TOKEN} if PROXY_TOKEN else {}
    with httpx.stream(
        "POST",
        f"{PROXY_BASE}/extract",
        json={"url": url, "audio_format": "best"},
        headers=headers,
        timeout=600,  # long lectures can take minutes to extract
    ) as resp:
        resp.raise_for_status()
        tmp_dir = tempfile.mkdtemp()
        audio_path = f"{tmp_dir}/audio.wav"
        with open(audio_path, "wb") as f:
            for chunk in resp.iter_bytes(chunk_size=8192):
                f.write(chunk)
    return audio_path
183
+
184
+
185
def process_youtube(url: str, language: str):
    """Yields (embed, transcript, summary, quiz, status_html) progressively.

    Drives the whole flow: validate URL -> show embedded player -> fetch
    audio (via proxy when PROXY_BASE is set, otherwise yt-dlp) -> run the
    GPU pipeline, relaying its progressive yields to the UI.
    """
    if not url or not url.strip():
        # Empty input: clear every output component.
        yield "", "", "", "", ""
        return

    url = url.strip()

    video_id = get_youtube_video_id(url)
    if not video_id:
        yield "", "", "", "", make_status_html(error="Please enter a valid YouTube URL")
        return

    embed_html = make_embed_html(video_id)
    # Step 1: show the player and the "Download" step immediately.
    yield embed_html, "", "", "", make_status_html(1)

    try:
        t0 = time.time()
        if PROXY_BASE:
            audio_path = fetch_audio_from_proxy(url)
        else:
            gr.Info("Downloading audio from YouTube...")
            audio_path = download_youtube_audio(url)
        dl_time = time.time() - t0
    except Exception as e:
        yield embed_html, "", "", "", make_status_html(error=f"Download failed: {e}")
        return

    # Step 2: transcription begins inside the GPU pipeline.
    yield embed_html, "", "", "", make_status_html(2, f"Download: {dl_time:.1f}s")

    # Relay the pipeline's progressive results, prefixing the download time.
    for raw_text, summary, quiz, step, timing_str in _run_pipeline(audio_path, language):
        full_timing = f"Download: {dl_time:.1f}s | {timing_str}" if timing_str else ""
        yield embed_html, raw_text, summary, quiz, make_status_html(step, full_timing)
218
+
219
+
220
# Sample lecture URLs wired to the example buttons in the UI.
EXAMPLES = {
    "MIT OpenCourseWare": "https://www.youtube.com/watch?v=7Pq-S557XQU",
    "Stanford CS229": "https://www.youtube.com/watch?v=jGwO_UgTS7I",
}
224
+
225
+ # ---------------------------------------------------------------------------
226
+ # ICL Gradio Theme
227
+ # ---------------------------------------------------------------------------
228
+ _icl_blue = gr.themes.Color(
229
+ c50="#F0F7FC",
230
+ c100="#D4EFFC",
231
+ c200="#A8DFFA",
232
+ c300="#5CC4F0",
233
+ c400="#00ACD7",
234
+ c500="#0091D4",
235
+ c600="#003E74",
236
+ c700="#002147",
237
+ c800="#001A38",
238
+ c900="#001029",
239
+ c950="#000A1A",
240
+ name="icl-blue",
241
+ )
242
+
243
+ _icl_tangerine = gr.themes.Color(
244
+ c50="#FFF5EB",
245
+ c100="#FFE6CC",
246
+ c200="#FFCC99",
247
+ c300="#FFB366",
248
+ c400="#FF9933",
249
+ c500="#EC7300",
250
+ c600="#CC6300",
251
+ c700="#A35000",
252
+ c800="#7A3C00",
253
+ c900="#522800",
254
+ c950="#331900",
255
+ name="icl-tangerine",
256
+ )
257
+
258
+ _icl_grey = gr.themes.Color(
259
+ c50="#F7F8F8",
260
+ c100="#EBEEEE",
261
+ c200="#D5D9D9",
262
+ c300="#B8BCBC",
263
+ c400="#9D9D9D",
264
+ c500="#7A7A7A",
265
+ c600="#5C5C5C",
266
+ c700="#4A4A4A",
267
+ c800="#373A36",
268
+ c900="#2A2D2A",
269
+ c950="#1A1C1A",
270
+ name="icl-grey",
271
+ )
272
+
273
+ ICL_THEME = gr.themes.Base(
274
+ primary_hue=_icl_blue,
275
+ secondary_hue=_icl_tangerine,
276
+ neutral_hue=_icl_grey,
277
+ font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"],
278
+ font_mono=[gr.themes.GoogleFont("Source Code Pro"), "monospace"],
279
+ ).set(
280
+ # Primary buttons – Navy background
281
+ button_primary_background_fill="#002147",
282
+ button_primary_background_fill_dark="#003E74",
283
+ button_primary_background_fill_hover="#003E74",
284
+ button_primary_background_fill_hover_dark="#0091D4",
285
+ button_primary_border_color="#002147",
286
+ button_primary_border_color_dark="#003E74",
287
+ button_primary_border_color_hover="#003E74",
288
+ button_primary_text_color="white",
289
+ button_primary_text_color_dark="white",
290
+ # Secondary buttons – white bg, blue border/text
291
+ button_secondary_background_fill="white",
292
+ button_secondary_background_fill_dark="#1A1C1A",
293
+ button_secondary_background_fill_hover="#D4EFFC",
294
+ button_secondary_background_fill_hover_dark="#001A38",
295
+ button_secondary_border_color="#003E74",
296
+ button_secondary_border_color_dark="#0091D4",
297
+ button_secondary_border_color_hover="#002147",
298
+ button_secondary_text_color="#003E74",
299
+ button_secondary_text_color_dark="#D4EFFC",
300
+ button_secondary_text_color_hover="#002147",
301
+ # Focus & loader
302
+ input_border_color_focus="#00ACD7",
303
+ input_border_color_focus_dark="#00ACD7",
304
+ loader_color="#003E74",
305
+ loader_color_dark="#0091D4",
306
+ )
307
+
308
+ # ---------------------------------------------------------------------------
309
+ # CSS – custom properties + minimal overrides
310
+ # ---------------------------------------------------------------------------
311
+ CSS = """
312
+ :root {
313
+ --icl-navy: #002147;
314
+ --icl-blue: #003E74;
315
+ --icl-process-blue: #0091D4;
316
+ --icl-pool: #00ACD7;
317
+ --icl-light-blue: #D4EFFC;
318
+ --icl-tangerine: #EC7300;
319
+ --icl-violet: #653098;
320
+ --icl-green: #02893B;
321
+ --icl-lime: #BBCE00;
322
+ --icl-red: #B22234;
323
+ --icl-grey: #EBEEEE;
324
+ --icl-cool-grey: #9D9D9D;
325
+ --icl-dark-grey: #373A36;
326
+ --sp-1: 4px; --sp-2: 8px; --sp-3: 12px; --sp-4: 16px;
327
+ --sp-5: 24px; --sp-6: 32px; --sp-7: 48px; --sp-8: 64px;
328
+ }
329
+
330
+ /* Header brand bar */
331
+ .icl-header {
332
+ text-align: center;
333
+ padding: var(--sp-5) var(--sp-4);
334
+ border-bottom: 3px solid var(--icl-navy);
335
+ margin-bottom: var(--sp-5);
336
+ }
337
+ .icl-header img { height: 60px; margin-bottom: var(--sp-2); }
338
+ .dark .icl-header { border-bottom-color: var(--icl-pool); }
339
+
340
+ /* Title & subtitle */
341
+ .main-title { text-align: center; color: var(--icl-navy); margin-bottom: 0 !important; }
342
+ .subtitle { text-align: center; color: var(--icl-blue); margin-top: 0 !important; }
343
+ .dark .main-title { color: var(--icl-light-blue); }
344
+ .dark .subtitle { color: var(--icl-pool); }
345
+
346
+ /* Tab selected override (Gradio tabs need !important) */
347
+ .tabs .tab-nav button.selected {
348
+ border-color: var(--icl-navy) !important;
349
+ color: var(--icl-navy) !important;
350
+ }
351
+ .dark .tabs .tab-nav button.selected {
352
+ border-color: var(--icl-pool) !important;
353
+ color: var(--icl-pool) !important;
354
+ }
355
+
356
+ /* Focus & active states */
357
+ button:focus-visible, input:focus-visible, textarea:focus-visible, select:focus-visible {
358
+ outline: 3px solid var(--icl-pool);
359
+ outline-offset: 2px;
360
+ }
361
+ button:active { transform: scale(0.97); }
362
+
363
+ /* Example buttons – compact inside bordered card */
364
+ .examples-row {
365
+ justify-content: center !important;
366
+ gap: var(--sp-2);
367
+ border: 1px solid var(--icl-light-blue);
368
+ border-radius: 8px;
369
+ padding: var(--sp-3) var(--sp-4);
370
+ background: var(--icl-grey);
371
+ }
372
+ .examples-row > * { flex: 0 0 auto !important; max-width: fit-content !important; }
373
+ .dark .examples-row { background: #1f2937; border-color: var(--icl-blue); }
374
+
375
+ /* Step progress indicator */
376
+ .status-bar {
377
+ padding: var(--sp-3) var(--sp-4);
378
+ border-radius: 8px;
379
+ background: var(--icl-grey);
380
+ border: 1px solid var(--icl-light-blue);
381
+ }
382
+ .status-bar.error {
383
+ background: #f8d7da;
384
+ border-color: #f5c6cb;
385
+ color: #721c24;
386
+ text-align: center;
387
+ font-weight: 500;
388
+ }
389
+ .status-bar .steps {
390
+ display: flex;
391
+ align-items: center;
392
+ justify-content: center;
393
+ gap: 0;
394
+ }
395
+ .status-bar .step {
396
+ display: flex;
397
+ align-items: center;
398
+ gap: 6px;
399
+ padding: 6px 14px;
400
+ border-radius: 20px;
401
+ font-size: 14px;
402
+ font-weight: 500;
403
+ background: var(--icl-light-blue);
404
+ color: var(--icl-blue);
405
+ white-space: nowrap;
406
+ transition: all 0.3s ease;
407
+ }
408
+ .status-bar .step.active {
409
+ background: var(--icl-blue);
410
+ color: white;
411
+ animation: pulse 1.5s ease-in-out infinite;
412
+ }
413
+ .status-bar .step.done {
414
+ background: var(--icl-navy);
415
+ color: white;
416
+ }
417
+ .status-bar .step .num {
418
+ font-weight: 700;
419
+ min-width: 18px;
420
+ text-align: center;
421
+ }
422
+ .status-bar .conn {
423
+ width: 24px;
424
+ height: 2px;
425
+ background: var(--icl-light-blue);
426
+ flex-shrink: 0;
427
+ }
428
+ .status-bar .timing {
429
+ text-align: center;
430
+ margin-top: var(--sp-2);
431
+ font-size: 13px;
432
+ color: var(--icl-blue);
433
+ }
434
+ @keyframes pulse {
435
+ 0%, 100% { opacity: 1; }
436
+ 50% { opacity: 0.6; }
437
+ }
438
+
439
+ /* Dark mode – status bar */
440
+ .dark .status-bar { background: #1f2937; border-color: var(--icl-blue); }
441
+ .dark .status-bar.error { background: #7f1d1d; border-color: #991b1b; color: #fca5a5; }
442
+ .dark .status-bar .step { background: var(--icl-blue); color: var(--icl-light-blue); }
443
+ .dark .status-bar .step.active { background: var(--icl-tangerine); color: white; }
444
+ .dark .status-bar .step.done { background: var(--icl-navy); color: var(--icl-light-blue); }
445
+ .dark .status-bar .conn { background: var(--icl-blue); }
446
+ .dark .status-bar .timing { color: var(--icl-light-blue); }
447
+
448
+ /* Footer */
449
+ .footer {
450
+ text-align: center;
451
+ color: var(--icl-dark-grey);
452
+ font-size: 0.85em;
453
+ margin-top: var(--sp-4);
454
+ }
455
+ .dark .footer { color: var(--icl-cool-grey); }
456
+
457
+ /* Reduced motion */
458
+ @media (prefers-reduced-motion: reduce) {
459
+ *, *::before, *::after {
460
+ animation-duration: 0.01ms !important;
461
+ animation-iteration-count: 1 !important;
462
+ transition-duration: 0.01ms !important;
463
+ }
464
+ }
465
+
466
+ /* Responsive */
467
+ @media (max-width: 768px) {
468
+ .icl-header img { height: 40px; }
469
+ .status-bar .step { padding: 4px 10px; font-size: 12px; }
470
+ .status-bar .conn { width: 12px; }
471
+ }
472
+ @media (max-width: 480px) {
473
+ .icl-header img { height: 32px; }
474
+ .icl-header { padding: var(--sp-3) var(--sp-2); }
475
+ }
476
+ """
477
+
478
+ with gr.Blocks(
479
+ title="Lecture Processor",
480
+ css=CSS,
481
+ theme=ICL_THEME,
482
+ ) as demo:
483
+ gr.HTML("""
484
+ <div class="icl-header">
485
+ <img src="https://upload.wikimedia.org/wikipedia/commons/5/51/Imperial_College_London_crest.svg"
486
+ alt="ICL Crest"
487
+ onerror="this.style.display='none';">
488
+ </div>
489
+ """)
490
+ gr.Markdown("# Lecture Processor", elem_classes="main-title")
491
+ gr.Markdown(
492
+ "Transcribe, summarize, and generate quizzes from lectures",
493
+ elem_classes="subtitle",
494
+ )
495
+
496
+ with gr.Row():
497
+ youtube_input = gr.Textbox(
498
+ label="🔗 YouTube URL",
499
+ placeholder="https://www.youtube.com/watch?v=...",
500
+ scale=3,
501
+ )
502
+ language_dropdown = gr.Dropdown(
503
+ choices=list(LANGUAGES.keys()),
504
+ value="English",
505
+ label="Language",
506
+ scale=1,
507
+ )
508
+
509
+ youtube_btn = gr.Button("▶ Process Lecture", variant="primary", size="lg")
510
+
511
+ gr.Markdown("**Examples:**")
512
+ with gr.Row(elem_classes="examples-row"):
513
+ for name, url in EXAMPLES.items():
514
+ gr.Button(name, variant="secondary", size="sm", min_width=160).click(
515
+ fn=lambda u=url: u, outputs=[youtube_input]
516
+ )
517
+
518
+ status_output = gr.HTML()
519
+ video_embed = gr.HTML()
520
+
521
+ with gr.Tabs():
522
+ with gr.TabItem("Transcript"):
523
+ raw_output = gr.Textbox(
524
+ label="Raw Transcription", lines=12
525
+ )
526
+ with gr.TabItem("Summary"):
527
+ summary_output = gr.Textbox(label="Lecture Summary", lines=12)
528
+ with gr.TabItem("Quiz"):
529
+ quiz_output = gr.Textbox(label="Quiz Questions", lines=12)
530
+
531
+ gr.Markdown(
532
+ "Powered by **WhisperX** & **Gemma 3 4B** | Fine-tuned LoRA adapter",
533
+ elem_classes="footer",
534
+ )
535
+
536
+ outputs = [video_embed, raw_output, summary_output, quiz_output, status_output]
537
+
538
+ youtube_btn.click(
539
+ fn=process_youtube,
540
+ inputs=[youtube_input, language_dropdown],
541
+ outputs=outputs,
542
+ )
543
+ youtube_input.submit(
544
+ fn=process_youtube,
545
+ inputs=[youtube_input, language_dropdown],
546
+ outputs=outputs,
547
+ )
548
+
549
if __name__ == "__main__":
    # 0.0.0.0 makes the app reachable from outside the container;
    # share=True additionally opens a public Gradio tunnel when run locally.
    demo.launch(server_name="0.0.0.0", share=True)
lecture_processor.py ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import traceback
3
+ import torch
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer
5
+ from peft import PeftModel
6
+
7
+ DEFAULT_MODEL = "google/gemma-3-4b-it"
8
+ ADAPTER_PATH = "./gemma-lecture-adapter"
9
+ HUB_ADAPTER_ID = "noufwithy/gemma-lecture-adapter"
10
+
11
+ SUMMARIZE_SYSTEM_PROMPT = """You are a lecture summarization assistant.
12
+ Summarize the following lecture transcription into a comprehensive, structured summary with these sections:
13
+ - **Summary**: A concise overview of what the lecture covered
14
+ - **Key Points**: The main concepts, definitions, and important details covered in the lecture (use bullet points)
15
+ - **Action Points**: Any tasks, assignments, or follow-up actions mentioned by the lecturer
16
+
17
+ Cover ALL topics discussed. Do not omit any major points.
18
+ Output ONLY the summary. No explanations or extra commentary."""
19
+
20
+ # Quiz prompts match the training data format exactly (one question per call)
21
+ MCQ_SYSTEM_PROMPT = """You are an educational quiz generator.
22
+ Based on the following lecture transcription, generate a multiple choice question
23
+ with 4 options labeled A-D and indicate the correct answer.
24
+
25
+ Format:
26
+ Q1. [Question]
27
+ A) [Option]
28
+ B) [Option]
29
+ C) [Option]
30
+ D) [Option]
31
+ Correct Answer: [Letter]
32
+
33
+ Output ONLY the question. No explanations or extra commentary."""
34
+
35
+ SHORT_ANSWER_SYSTEM_PROMPT = """You are an educational quiz generator.
36
+ Based on the following lecture transcription, generate a short answer question
37
+ with the expected answer.
38
+
39
+ Format:
40
+ Q1. [Question]
41
+ Expected Answer: [Brief answer]
42
+
43
+ Output ONLY the question. No explanations or extra commentary."""
44
+
45
+ NUM_MCQ = 5
46
+ NUM_SHORT_ANSWER = 3
47
+
48
+ _model = None
49
+ _tokenizer = None
50
+
51
+
52
def _load_model(model_id: str = DEFAULT_MODEL, adapter_path: str = ADAPTER_PATH):
    """Lazily load and cache the Gemma model + tokenizer.

    Attaches the fine-tuned LoRA adapter (local directory first, then the
    Hub copy); on any failure falls back to the plain base model.

    Returns:
        The (model, tokenizer) pair, cached in module globals.
    """
    global _model, _tokenizer
    # Already loaded in this process -> reuse the cached pair.
    if _model is not None:
        return _model, _tokenizer

    _tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Try local adapter first, then HuggingFace Hub, then base model
    adapter_source = adapter_path if os.path.isdir(adapter_path) else HUB_ADAPTER_ID

    # Load in bfloat16 (bitsandbytes 4-bit/8-bit quantization broken with Gemma 3)
    try:
        print(f"Loading model with LoRA adapter from {adapter_source}...")
        base_model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            dtype=torch.bfloat16,
            attn_implementation="eager",
        )
        _model = PeftModel.from_pretrained(base_model, adapter_source)
        _model.eval()
        print("LoRA adapter loaded successfully on bfloat16 base model.")
    except Exception as e:
        # Adapter missing/incompatible: run the base model instead of failing.
        print(f"LoRA adapter failed ({e}), falling back to base model...")
        traceback.print_exc()
        _model = AutoModelForCausalLM.from_pretrained(
            model_id, device_map="auto", dtype=torch.bfloat16,
        )

    return _model, _tokenizer
82
+
83
+
84
def _generate(messages, max_new_tokens=2048, do_sample=False, temperature=0.7):
    """Generate text using model.generate() directly.

    messages is a chat-format list of {"role", "content"} dicts.  Returns
    only the newly generated text (prompt stripped), whitespace-trimmed.
    """
    model, tokenizer = _load_model()

    # Format chat messages into a string, then tokenize.  add_special_tokens
    # is False because the chat template already inserts them.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    print(f"[DEBUG] input length: {input_ids.shape[-1]} tokens")

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            # Sampling knobs only apply when do_sample is True.
            temperature=temperature if do_sample else None,
            top_p=0.9 if do_sample else None,
            repetition_penalty=1.3,
        )

    # Decode only the new tokens (skip the input)
    new_tokens = outputs[0][input_ids.shape[-1]:]
    print(f"[DEBUG] generated {len(new_tokens)} new tokens")

    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return response.strip()
115
+
116
+
117
+ def _is_good_summary(text: str, transcript: str = "") -> bool:
118
+ """Check if a summary meets minimum quality: long enough, not repetitive, not parroting."""
119
+ if len(text) < 100:
120
+ return False
121
+
122
+ # Check for excessive repetition (same line or sentence repeated 2+ times)
123
+ from collections import Counter
124
+ for chunks in [
125
+ [s.strip() for s in text.split("\n") if s.strip()],
126
+ [s.strip() for s in text.split(".") if s.strip()],
127
+ ]:
128
+ if chunks:
129
+ counts = Counter(chunks)
130
+ most_common_count = counts.most_common(1)[0][1]
131
+ if most_common_count >= 2:
132
+ print(f"[QUALITY] Repetitive output detected ({most_common_count} repeats)")
133
+ return False
134
+
135
+ # Check if summary is just parroting the transcript (high word overlap)
136
+ if transcript:
137
+ summary_words = set(text.lower().split())
138
+ transcript_words = set(transcript.lower().split())
139
+ if summary_words and transcript_words:
140
+ overlap = len(summary_words & transcript_words) / len(summary_words)
141
+ if overlap > 0.85:
142
+ print(f"[QUALITY] Summary too similar to transcript ({overlap:.0%} word overlap)")
143
+ return False
144
+
145
+ # Check if summary has enough key points (at least 3 bullet points)
146
+ bullet_count = text.count("- ")
147
+ has_key_points = "key points" in text.lower()
148
+ if has_key_points and bullet_count < 3:
149
+ print(f"[QUALITY] Summary has too few key points ({bullet_count})")
150
+ return False
151
+
152
+ # Check minimum unique content (summary should have substance)
153
+ unique_lines = set(s.strip() for s in text.split("\n") if s.strip() and len(s.strip()) > 10)
154
+ if len(unique_lines) < 5:
155
+ print(f"[QUALITY] Summary too shallow ({len(unique_lines)} unique lines)")
156
+ return False
157
+
158
+ return True
159
+
160
+
161
def _generate_with_base_fallback(messages, transcript="", **kwargs):
    """Generate with adapter first. If output is bad, retry with base model.

    Quality is judged by _is_good_summary; the transcript (when given) is
    used to detect parroting.  When the cached model is a PeftModel, the
    adapter layers are temporarily disabled for the retry and always
    re-enabled afterwards (the finally clause guards the shared model state).
    """
    result = _generate(messages, **kwargs)

    if _is_good_summary(result, transcript=transcript):
        return result

    # Adapter output is bad, try base model
    model, _ = _load_model()
    if isinstance(model, PeftModel):
        print("[FALLBACK] Adapter output too short or repetitive, retrying with base model...")
        model.disable_adapter_layers()
        try:
            result = _generate(messages, **kwargs)
        finally:
            model.enable_adapter_layers()
        print(f"[FALLBACK] base model response length: {len(result)}")

    return result
180
+
181
+
182
+ def _truncate_transcript(transcript: str, max_words: int = 4000) -> str:
183
+ """Truncate transcript to fit model's effective context (trained on 3072 tokens)."""
184
+ words = transcript.split()
185
+ if len(words) <= max_words:
186
+ return transcript
187
+ print(f"[TRUNCATE] Transcript has {len(words)} words, truncating to {max_words}")
188
+ return " ".join(words[:max_words])
189
+
190
+
191
def summarize_lecture(transcript: str, model: str = DEFAULT_MODEL) -> str:
    """Summarize a lecture transcript using Gemma."""
    if not transcript or not transcript.strip():
        return ""

    clipped = _truncate_transcript(transcript)
    chat = [
        {"role": "system", "content": SUMMARIZE_SYSTEM_PROMPT},
        {"role": "user", "content": f"Lecture transcription:\n\n{clipped}"},
    ]
    # Adapter output is preferred; quality gating falls back to the base model.
    summary = _generate_with_base_fallback(chat, transcript=transcript, do_sample=True, temperature=0.3)
    print(f"[DEBUG summarize] response length: {len(summary)}")
    return summary
205
+
206
+
207
+ def _extract_question_text(result: str) -> str:
208
+ """Extract just the question text (first line after Q number) for dedup comparison."""
209
+ import re
210
+ match = re.search(r'Q\d+\.\s*(.+)', result)
211
+ return match.group(1).strip().lower() if match else result.strip().lower()
212
+
213
+
214
+ def _is_good_quiz_answer(result: str, transcript: str = "") -> bool:
215
+ """Check if a generated quiz question is reasonable quality."""
216
+ # Reject if response doesn't match any expected format (no question generated)
217
+ if "Correct Answer:" not in result and "Expected Answer:" not in result:
218
+ print(f"[QUALITY] Response has no valid question format (missing Correct/Expected Answer)")
219
+ return False
220
+
221
+ # Reject if there's no actual question (Q1. pattern)
222
+ if "Q1." not in result:
223
+ print(f"[QUALITY] Response missing Q1. question marker")
224
+ return False
225
+
226
+ # Short answer: reject if expected answer is just a transcript fragment with no real content
227
+ if "Expected Answer:" in result:
228
+ answer = result.split("Expected Answer:")[-1].strip()
229
+ # Reject vague/pointer answers like "right here", "this arrow", "at this point"
230
+ vague_phrases = ["right here", "this arrow", "at this point", "this one", "over here", "right there"]
231
+ if any(phrase in answer.lower() for phrase in vague_phrases):
232
+ print(f"[QUALITY] Short answer too vague: {answer}")
233
+ return False
234
+ if len(answer.split()) < 2:
235
+ print(f"[QUALITY] Short answer too short: {answer}")
236
+ return False
237
+
238
+ # MCQ: reject if it doesn't have 4 options or has duplicate options
239
+ if "Correct Answer:" in result and "Expected Answer:" not in result:
240
+ import re
241
+ for label in ["A)", "B)", "C)", "D)"]:
242
+ if label not in result:
243
+ print(f"[QUALITY] MCQ missing option {label}")
244
+ return False
245
+ # Reject if options are mostly duplicated
246
+ options = re.findall(r'[A-D]\)\s*(.+)', result)
247
+ unique_options = set(opt.strip().lower() for opt in options)
248
+ if len(unique_options) < 3:
249
+ print(f"[QUALITY] MCQ has duplicate options ({len(unique_options)} unique out of {len(options)})")
250
+ return False
251
+
252
+ return True
253
+
254
+
255
+ def _dedup_mcq_options(result: str) -> str:
256
+ """Remove duplicate MCQ options, keeping unique ones only."""
257
+ import re
258
+ options = re.findall(r'([A-D])\)\s*(.+)', result)
259
+ if len(options) != 4:
260
+ return result
261
+
262
+ seen = {}
263
+ unique = []
264
+ for label, text in options:
265
+ key = text.strip().lower()
266
+ if key not in seen:
267
+ seen[key] = True
268
+ unique.append((label, text.strip()))
269
+
270
+ if len(unique) == len(options):
271
+ return result # no duplicates
272
+
273
+ print(f"[QUALITY] Removed {len(options) - len(unique)} duplicate MCQ option(s)")
274
+ # Rebuild with correct labels
275
+ lines = result.split("\n")
276
+ new_lines = []
277
+ option_idx = 0
278
+ labels = ["A", "B", "C", "D"]
279
+ for line in lines:
280
+ if re.match(r'^[A-D]\)', line):
281
+ if option_idx < len(unique):
282
+ new_lines.append(f"{labels[option_idx]}) {unique[option_idx][1]}")
283
+ option_idx += 1
284
+ else:
285
+ new_lines.append(line)
286
+
287
+ return "\n".join(new_lines)
288
+
289
+
290
def _generate_quiz_with_fallback(messages, transcript="", **kwargs):
    """Generate a quiz question with the LoRA adapter; if the output fails the
    quality check, retry once with the adapter layers disabled (base model)."""
    first_try = _generate(messages, **kwargs)
    if _is_good_quiz_answer(first_try, transcript):
        return first_try

    model, _ = _load_model()
    if not isinstance(model, PeftModel):
        # No adapter loaded, nothing to fall back to — return the attempt as-is.
        return first_try

    print("[FALLBACK] Quiz answer bad, retrying with base model...")
    model.disable_adapter_layers()
    try:
        return _generate(messages, **kwargs)
    finally:
        # Always re-enable the adapter so later calls use the fine-tuned model.
        model.enable_adapter_layers()
307
+
308
+
309
+ def _normalize_words(text: str) -> set[str]:
310
+ """Strip punctuation from words for cleaner comparison."""
311
+ import re
312
+ return set(re.sub(r'[^\w\s]', '', word) for word in text.split() if word.strip())
313
+
314
+
315
def _is_duplicate(result: str, existing_parts: list[str]) -> bool:
    """Return True when the new question's wording mostly repeats an earlier one.

    Two questions count as duplicates when more than 70% of the smaller
    question's (punctuation-stripped) words appear in the other.
    """
    candidate_words = _normalize_words(_extract_question_text(result))
    if not candidate_words:
        return False
    for previous in existing_parts:
        prior_words = _normalize_words(_extract_question_text(previous))
        if not prior_words:
            continue
        overlap = len(candidate_words & prior_words) / min(len(candidate_words), len(prior_words))
        if overlap > 0.7:
            print(f"[QUALITY] Duplicate question detected ({overlap:.0%} word overlap)")
            return True
    return False
330
+
331
+
332
def _generate_one_question(label: str, system_prompt: str, transcript: str,
                           parts: list[str], max_retries: int, is_mcq: bool) -> str | None:
    """Generate one quiz question, retrying when output is low-quality or duplicated.

    Args:
        label: Human-readable tag used in debug logs (e.g. "MCQ 1").
        system_prompt: System prompt selecting the question type.
        transcript: Truncated lecture transcript.
        parts: Questions accepted so far (for duplicate detection and numbering).
        max_retries: Extra attempts allowed after the first one.
        is_mcq: When True, also de-duplicate the generated answer options.

    Returns:
        The renumbered question text, or None if every attempt was bad or a duplicate.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Lecture transcription:\n\n{transcript}"},
    ]
    for attempt in range(1 + max_retries):
        result = _generate_quiz_with_fallback(messages, transcript=transcript, max_new_tokens=256, do_sample=True)
        # Re-validate even after fallback: the base-model retry may still be bad.
        if _is_good_quiz_answer(result, transcript) and not _is_duplicate(result, parts):
            if is_mcq:
                result = _dedup_mcq_options(result)
            # The model is trained to emit "Q1."; renumber to the actual slot.
            return result.replace("Q1.", f"Q{len(parts) + 1}.", 1)
        print(f"[DEBUG quiz] {label} attempt {attempt + 1} was bad or duplicate, retrying...")
    return None


def generate_quiz(transcript: str, model: str = DEFAULT_MODEL) -> str:
    """Generate quiz questions from a lecture transcript using Gemma.

    Generates questions one at a time to match the training format (one
    question per example), then combines them. Bad or duplicate questions are
    retried and, if still unreliable, dropped.

    Args:
        transcript: Raw lecture transcript text.
        model: Kept for interface compatibility; generation uses the globally
            loaded model.

    Returns:
        All accepted questions joined by blank lines, or "" for empty input.
    """
    if not transcript or not transcript.strip():
        return ""

    transcript = _truncate_transcript(transcript)
    parts: list[str] = []
    max_retries = 2  # extra attempts per question if bad or duplicate

    # MCQs first (matches training: one MCQ per example).
    for i in range(NUM_MCQ):
        print(f"[DEBUG quiz] generating MCQ {i + 1}/{NUM_MCQ}...")
        question = _generate_one_question(
            f"MCQ {i + 1}", MCQ_SYSTEM_PROMPT, transcript, parts, max_retries, is_mcq=True
        )
        if question is None:
            print(f"[DEBUG quiz] MCQ {i + 1} dropped (unreliable after {1 + max_retries} attempts)")
        else:
            parts.append(question)

    # Then short-answer questions.
    for i in range(NUM_SHORT_ANSWER):
        print(f"[DEBUG quiz] generating short answer {i + 1}/{NUM_SHORT_ANSWER}...")
        question = _generate_one_question(
            f"short answer {i + 1}", SHORT_ANSWER_SYSTEM_PROMPT, transcript, parts, max_retries, is_mcq=False
        )
        if question is None:
            print(f"[DEBUG quiz] short answer {i + 1} dropped (unreliable after {1 + max_retries} attempts)")
        else:
            parts.append(question)

    combined = "\n\n".join(parts)
    print(f"[DEBUG quiz] total response length: {len(combined)}")
    return combined
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://download.pytorch.org/whl/cpu
2
+ torch
3
+ torchaudio
4
+ whisperx @ git+https://github.com/m-bain/whisperX.git
5
+ transformers
6
+ accelerate
7
+ gradio
8
+ yt-dlp
9
+ httpx
10
+ peft
11
+ spaces
transcribe.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ # Workaround for PyTorch 2.6+ weights_only=True default.
4
+ # pyannote VAD model checkpoints (used by WhisperX) contain omegaconf types
5
+ # and other globals that are not in torch's safe-globals allowlist.
6
+ # This env var tells PyTorch to fall back to weights_only=False when the
7
+ # caller did not explicitly pass weights_only. The pyannote models are
8
+ # published, trusted checkpoints.
9
+ os.environ.setdefault("TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD", "1")
10
+
11
+ import whisperx
12
+
13
+ import gc
14
+ import torch
15
+
16
+ _model = None
17
+ _current_device = None
18
+
19
+
20
def _get_model(device: str | None = None):
    """Load (or reuse) the cached WhisperX model for the given device.

    Args:
        device: "cuda" or "cpu"; None auto-selects based on CUDA availability.
            (Annotation fixed from implicit-Optional ``str = None``, per PEP 484,
            matching the ``str | None`` style used elsewhere in this file.)

    Returns:
        The cached WhisperX ASR model instance.
    """
    global _model, _current_device
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    # Reload when nothing is cached yet or the requested device changed.
    if _model is None or _current_device != device:
        _model = whisperx.load_model(
            "base",
            device=device,
            compute_type="int8",  # int8 keeps memory low and runs on CPU too
        )
        _current_device = device
    return _model
32
+
33
+
34
def unload_model():
    """Free WhisperX model from GPU memory to make room for other models."""
    global _model, _current_device
    if _model is None:
        return  # nothing loaded, nothing to free
    del _model
    _model = None
    _current_device = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    print("[WhisperX] Model unloaded, GPU memory freed.")
45
+
46
+
47
def transcribe_audio(audio_path: str, language: str | None = None, device: str | None = None) -> str:
    """
    Transcribe audio file using WhisperX.

    Args:
        audio_path: Path to audio file (any format supported by ffmpeg).
        language: ISO 639-1 language code (e.g. "en", "ko", "ja").
            None for auto-detection.
        device: "cuda" or "cpu"; None auto-selects. (Annotation fixed from
            implicit-Optional ``str = None``, per PEP 484, matching the
            ``language`` parameter's style.)

    Returns:
        Transcribed text as a single string.
    """
    model = _get_model(device)
    audio = whisperx.load_audio(audio_path)

    transcribe_kwargs = {"batch_size": 16}
    if language:
        # Only pass language when explicitly given; otherwise WhisperX auto-detects.
        transcribe_kwargs["language"] = language

    result = model.transcribe(audio, **transcribe_kwargs)

    # Join all segment texts, skipping segments with missing/empty text.
    segments = result.get("segments", [])
    return " ".join(seg["text"].strip() for seg in segments if seg.get("text"))
71
+ return text