""" Notebook-friendly prototype helpers for the coursework workflow: PDF -> lecture text -> 5 MCQs (with answers + explanations) Usage in Colab/Jupyter: from notebook_prototype import run_prototype result = run_prototype("/path/to/paper.pdf", mock=True) print(result["lecture_text"]) print(result["mcqs"][0]) Set mock=False to reuse the real Qwen3-VL backend from app.py (same prompts/parsing flow). """ import json import re from pathlib import Path from typing import Any, Dict, List try: from pypdf import PdfReader except Exception: PdfReader = None # type: ignore LECTURE_PROMPT = """ 你是一名课程助教。请阅读论文内容并写一段中文讲解(400-700字),包括: 问题背景、核心方法、实验亮点、局限性与适用场景。 论文内容: {document} """.strip() MCQ_PROMPT = """ 请基于论文内容生成 5 道中文单选题,并严格输出 JSON: { "questions": [ { "question": "...", "options": ["...", "...", "...", "..."], "answer": "A", "explanation": "..." } ] } """.strip() def extract_pdf_text(pdf_path: str, max_chars: int = 16000) -> str: if PdfReader is None: raise RuntimeError("pypdf is not installed.") reader = PdfReader(pdf_path) chunks: List[str] = [] total = 0 for i, page in enumerate(reader.pages, start=1): text = (page.extract_text() or "").strip() if not text: continue part = f"[Page {i}]\\n{text}\\n" chunks.append(part) total += len(part) if total >= max_chars: break if not chunks: return "No extractable text found. For scanned PDFs, convert pages to images and feed them to a VL model." return "\\n".join(chunks)[:max_chars] def _mock_lecture(document: str) -> str: short = re.sub(r"\\s+", " ", document)[:1000] return ( "【Mock讲解】这篇论文主要围绕一个机器学习/生成式AI任务展开,目标是改善现有方法在效果、效率或稳定性上的不足。" "作者通过提出新的模型结构、训练策略或推理流程来解决该问题,并通过实验与基线比较验证方法有效性。" "在阅读时建议重点关注:任务定义、输入输出、方法模块、实验设置、指标、消融实验,以及论文提到的局限性。\\n\\n" f"论文节选:{short}" ) def _mock_mcqs() -> List[Dict[str, Any]]: return [ { "question": "论文讲解中首先应说明什么?", "options": ["问题背景与任务目标", "部署服务器价格", "前端样式颜色", "Git分支命名"], "answer": "A", "explanation": "先解释背景与目标,听众才知道作者为什么提出该方法。", }, { "question": "哪一项更适合用于解释论文方法?", "options": ["按模块/步骤描述输入到输出流程", "只贴公式不解释", "只读摘要", "只看结论"], "answer": "A", "explanation": "方法讲解应结构化呈现,否则难以理解论文贡献点。", }, { "question": "为什么要生成带解析的选择题?", "options": ["支持交互式教学反馈", "为了减少推理时间", "为了替代PDF上传", "为了训练TTS模型"], "answer": "A", "explanation": "解析能帮助学生理解错误原因并形成学习闭环。", }, { "question": "长论文处理通常更稳妥的做法是?", "options": ["分块阅读后汇总", "一次性全部输入且不做控制", "只看标题", "随机抽样一页"], "answer": "A", "explanation": "分块可以降低上下文长度风险并提高稳定性。", }, { "question": "在你的课程Demo里,TTS最主要用于?", "options": ["讲解和错题解析语音输出", "替代VL模型阅读PDF", "生成图片", "训练新LoRA"], "answer": "A", "explanation": "TTS负责文本转语音,增强演示交互体验。", }, ] def run_prototype(pdf_path: str, mock: bool = True) -> Dict[str, Any]: pdf_path = str(Path(pdf_path)) document = extract_pdf_text(pdf_path) lecture_prompt = LECTURE_PROMPT.format(document=document) mcq_prompt = MCQ_PROMPT.format(document=document) if mock: lecture_text = _mock_lecture(document) mcqs = _mock_mcqs() else: # Reuse the same backend implementation as app.py to keep notebook/app behavior aligned. from app import QwenPipelineEngine, parse_mcq_json # local import avoids gradio setup cost until needed engine = QwenPipelineEngine() engine.mock_mode = False engine.ensure_vl_loaded() lecture_text = engine._real_generate_text_from_pdf(pdf_path, lecture_prompt) raw_mcq_json = engine._real_generate_text_from_pdf(pdf_path, mcq_prompt) mcqs = [q.__dict__ for q in parse_mcq_json(raw_mcq_json)] return { "pdf_path": pdf_path, "document_excerpt": document[:2000], "lecture_prompt": lecture_prompt, "mcq_prompt": mcq_prompt, "lecture_text": lecture_text, "mcqs": mcqs, } def pretty_print_mcqs(mcqs: List[Dict[str, Any]]) -> None: for i, q in enumerate(mcqs, start=1): print(f"\\nQ{i}. {q['question']}") for label, opt in zip(["A", "B", "C", "D"], q["options"]): print(f" {label}. {opt}") print(f"Answer: {q['answer']}") print(f"Explanation: {q['explanation']}") if __name__ == "__main__": # Minimal local check (replace with a real PDF path). print("Import this file in a notebook and call run_prototype('/path/to/file.pdf').")