""" The Lecture Whisperer — app.py A Hugging Face Space that transcribes lectures, extracts slide content, syncs them together, and generates mock quizzes. Uses the HF Inference API to keep Space RAM low. """ import os import re import json import time import tempfile import textwrap from pathlib import Path from typing import Optional import gradio as gr import requests from pdf2image import convert_from_path from PIL import Image import base64 from io import BytesIO # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- HF_TOKEN = os.environ.get("HF_TOKEN", "") # Set as a Space secret HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} # Inference API endpoints WHISPER_API = "https://api-inference.huggingface.co/models/openai/whisper-large-v3" QWEN_API = "https://api-inference.huggingface.co/models/Qwen/Qwen2-VL-7B-Instruct" LLAMA_API = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct" # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def pil_to_base64(img: Image.Image, fmt: str = "PNG") -> str: buf = BytesIO() img.save(buf, format=fmt) return base64.b64encode(buf.getvalue()).decode() def hf_post(url: str, payload: dict, retries: int = 3, wait: int = 20) -> dict: """POST to a HF Inference endpoint with simple retry / loading handling.""" for attempt in range(retries): resp = requests.post(url, headers=HEADERS, json=payload, timeout=120) if resp.status_code == 503: data = resp.json() if "estimated_time" in data: secs = min(int(data["estimated_time"]) + 5, 60) else: secs = wait time.sleep(secs) continue resp.raise_for_status() return resp.json() raise RuntimeError(f"HF API unavailable after {retries} retries: {url}") # --------------------------------------------------------------------------- # 1. 
def transcribe_audio(audio_path: str) -> list[dict]:
    """Transcribe an audio file via the Whisper Inference API.

    Returns a list of segments:
        [{"start": float, "end": float, "text": str}, ...]
    An empty or None path yields an empty list.
    """
    if not audio_path:
        return []
    with open(audio_path, "rb") as f:
        audio_bytes = f.read()

    # FIX: Content-Type was hard-coded to audio/mpeg even for .wav/.flac/.ogg
    # uploads; infer it from the file suffix, keeping audio/mpeg as fallback
    # so existing mp3 callers behave exactly as before.
    mime_by_suffix = {
        ".mp3": "audio/mpeg",
        ".wav": "audio/wav",
        ".flac": "audio/flac",
        ".ogg": "audio/ogg",
        ".m4a": "audio/mp4",
    }
    content_type = mime_by_suffix.get(Path(audio_path).suffix.lower(), "audio/mpeg")

    # Whisper via Inference API (binary upload)
    resp = requests.post(
        WHISPER_API,
        headers={**HEADERS, "Content-Type": content_type},
        data=audio_bytes,
        params={"return_timestamps": "true"},
        timeout=300,
    )
    resp.raise_for_status()
    result = resp.json()

    # Normalise output shape
    segments = []
    if "chunks" in result:
        for chunk in result["chunks"]:
            # "timestamp" may be missing, None, or hold None endpoints.
            ts = chunk.get("timestamp", [0, 0]) or [0, 0]
            segments.append({
                "start": ts[0] if ts[0] is not None else 0,
                "end": ts[1] if ts[1] is not None else 0,
                "text": chunk.get("text", "").strip(),
            })
    elif "text" in result:
        # Fallback: single block, no timestamps
        segments.append({"start": 0.0, "end": 0.0, "text": result["text"].strip()})
    return segments


def format_transcript(segments: list[dict]) -> str:
    """Render segments as '[MM:SS] text' lines, one segment per line."""
    lines = []
    for seg in segments:
        ts = f"[{int(seg['start'])//60:02d}:{int(seg['start'])%60:02d}]"
        lines.append(f"{ts} {seg['text']}")
    return "\n".join(lines)


# ---------------------------------------------------------------------------
# 2. PDF → slide images → text extraction
# ---------------------------------------------------------------------------


def pdf_to_images(pdf_path: str) -> "list[Image.Image]":
    """Convert each PDF page to a PIL Image (150 dpi). Empty path -> []."""
    if not pdf_path:
        return []
    return convert_from_path(pdf_path, dpi=150)


def extract_slide_text(img: "Image.Image", slide_num: int) -> str:
    """Ask Qwen2-VL to extract key text/concepts from a slide image.

    On any API failure, returns a bracketed error marker instead of raising,
    so one bad slide does not abort the whole deck (deliberate best-effort).
    """
    b64 = pil_to_base64(img)
    # NOTE(review): payload shape is model/task dependent — confirm the
    # deployed Qwen2-VL endpoint accepts {"image", "question"} inputs.
    payload = {
        "inputs": {
            "image": b64,
            "question": (
                "You are an academic assistant. "
                "Extract ALL key text, equations, definitions, bullet points, "
                "and concepts visible on this lecture slide. "
                "Be thorough and concise. Output plain text only."
            ),
        }
    }
    try:
        result = hf_post(QWEN_API, payload)
        if isinstance(result, list):
            return result[0].get("generated_text", "").strip()
        if isinstance(result, dict):
            return result.get("generated_text", "").strip()
        return str(result)
    except Exception as e:
        return f"[Slide {slide_num}: extraction error — {e}]"


# ---------------------------------------------------------------------------
# 3. Sync Logic
# ---------------------------------------------------------------------------


def build_keyword_index(slide_texts: list[str]) -> list[set]:
    """Build a simple keyword set per slide (lowercase words, len > 3)."""
    stop = {
        "this", "that", "with", "from", "have", "will", "been", "they",
        "their", "there", "were", "what", "when", "which", "also", "into",
        "over", "more", "some", "such", "than", "then", "these", "those",
        "after", "about", "would",
    }
    index = []
    for text in slide_texts:
        words = re.findall(r"[a-z]{4,}", text.lower())
        index.append({w for w in words if w not in stop})
    return index
""" if not slide_texts or not segments: return segments slide_kw_index = build_keyword_index(slide_texts) enriched = [] for seg in segments: seg_words = set(re.findall(r"[a-z]{4,}", seg["text"].lower())) best_slide, best_score = -1, 0 for idx, kw_set in enumerate(slide_kw_index): score = len(seg_words & kw_set) if score > best_score: best_score, best_slide = score, idx seg = dict(seg) seg["slide_idx"] = best_slide if best_score > 0 else -1 seg["match_score"] = best_score enriched.append(seg) return enriched def format_sync_report(synced: list[dict]) -> str: lines = ["## Transcript ↔ Slide Sync Report\n"] for seg in synced: ts = f"[{int(seg['start'])//60:02d}:{int(seg['start'])%60:02d}]" slide_ref = ( f"→ **Slide {seg['slide_idx']+1}** (score: {seg['match_score']})" if seg["slide_idx"] >= 0 else "→ no slide match" ) lines.append(f"`{ts}` {seg['text']}\n {slide_ref}\n") return "\n".join(lines) # --------------------------------------------------------------------------- # 4. Mock Quiz Generation # --------------------------------------------------------------------------- def generate_quiz(transcript_text: str) -> str: """Ask Llama-3-8B to write 5-10 MCQs based strictly on the transcript.""" if not transcript_text.strip(): return "No transcript available. Please process audio first." # Truncate to avoid token overflow clipped = transcript_text[:6000] prompt = textwrap.dedent(f""" <|begin_of_text|> <|start_header_id|>system<|end_header_id|> You are an expert educator. Generate a multiple-choice quiz based ONLY on the lecture transcript provided. Do not add external knowledge. Format each question as: Q1. A)