| """ |
| Notebook-friendly prototype helpers for the coursework workflow: |
| PDF -> lecture text -> 5 MCQs (with answers + explanations) |
| |
| Usage in Colab/Jupyter: |
| from notebook_prototype import run_prototype |
| result = run_prototype("/path/to/paper.pdf", mock=True) |
| print(result["lecture_text"]) |
| print(result["mcqs"][0]) |
| |
| Set mock=False to reuse the real Qwen3-VL backend from app.py (same prompts/parsing flow). |
| """ |
|
|
| import json |
| import re |
| from pathlib import Path |
| from typing import Any, Dict, List |
|
|
| try: |
| from pypdf import PdfReader |
| except Exception: |
| PdfReader = None |
|
|
|
|
# Prompt template for the lecture step (the prompt text itself is Chinese).
# It asks a teaching assistant to read the paper and write a 400-700 character
# Chinese explanation covering background, core method, experimental
# highlights, limitations and applicable scenarios.
# ``{document}`` is the only placeholder; it receives the extracted paper text.
LECTURE_PROMPT = "\n".join(
    [
        "你是一名课程助教。请阅读论文内容并写一段中文讲解(400-700字),包括:",
        "问题背景、核心方法、实验亮点、局限性与适用场景。",
        "",
        "论文内容:",
        "{document}",
    ]
)
|
|
|
|
# Prompt template for the quiz step (the prompt text itself is Chinese).
# It asks for 5 Chinese single-choice questions, returned strictly as JSON in
# the shape shown below.
# NOTE: the braces are literal JSON and there is no ``{document}`` placeholder,
# so this string must not be passed through ``str.format()``.
MCQ_PROMPT = "\n".join(
    [
        "请基于论文内容生成 5 道中文单选题,并严格输出 JSON:",
        "{",
        '"questions": [',
        "{",
        '"question": "...",',
        '"options": ["...", "...", "...", "..."],',
        '"answer": "A",',
        '"explanation": "..."',
        "}",
        "]",
        "}",
    ]
)
|
|
|
|
def extract_pdf_text(pdf_path: str, max_chars: int = 16000) -> str:
    """Extract page-tagged plain text from a PDF, capped at ``max_chars``.

    Each page that yields text contributes a ``[Page N]`` header line followed
    by its stripped text. Pages with no extractable text are skipped.

    Args:
        pdf_path: Path of the PDF file handed to ``PdfReader``.
        max_chars: Maximum length of the returned string. Negative values are
            treated as 0.

    Returns:
        The concatenated page text truncated to ``max_chars`` characters, or a
        fixed English notice when no page yields any text.

    Raises:
        RuntimeError: If ``pypdf`` could not be imported.
    """
    if PdfReader is None:
        raise RuntimeError("pypdf is not installed.")

    # A negative limit would make the final slice drop characters from the
    # tail instead of capping the length.
    limit = max(max_chars, 0)

    reader = PdfReader(pdf_path)
    chunks: List[str] = []
    total = 0
    for page_number, page in enumerate(reader.pages, start=1):
        text = (page.extract_text() or "").strip()
        if not text:
            continue
        # Real newlines: the escaped form ("\\n") emitted a literal
        # backslash-n into the extracted document.
        part = f"[Page {page_number}]\n{text}\n"
        chunks.append(part)
        total += len(part)
        # Stop reading pages once enough text is collected; the exact cap is
        # enforced by the slice below.
        if total >= limit:
            break

    if not chunks:
        return "No extractable text found. For scanned PDFs, convert pages to images and feed them to a VL model."
    return "\n".join(chunks)[:limit]
|
|
|
|
| def _mock_lecture(document: str) -> str: |
| short = re.sub(r"\\s+", " ", document)[:1000] |
| return ( |
| "【Mock讲解】这篇论文主要围绕一个机器学习/生成式AI任务展开,目标是改善现有方法在效果、效率或稳定性上的不足。" |
| "作者通过提出新的模型结构、训练策略或推理流程来解决该问题,并通过实验与基线比较验证方法有效性。" |
| "在阅读时建议重点关注:任务定义、输入输出、方法模块、实验设置、指标、消融实验,以及论文提到的局限性。\\n\\n" |
| f"论文节选:{short}" |
| ) |
|
|
|
|
| def _mock_mcqs() -> List[Dict[str, Any]]: |
| return [ |
| { |
| "question": "论文讲解中首先应说明什么?", |
| "options": ["问题背景与任务目标", "部署服务器价格", "前端样式颜色", "Git分支命名"], |
| "answer": "A", |
| "explanation": "先解释背景与目标,听众才知道作者为什么提出该方法。", |
| }, |
| { |
| "question": "哪一项更适合用于解释论文方法?", |
| "options": ["按模块/步骤描述输入到输出流程", "只贴公式不解释", "只读摘要", "只看结论"], |
| "answer": "A", |
| "explanation": "方法讲解应结构化呈现,否则难以理解论文贡献点。", |
| }, |
| { |
| "question": "为什么要生成带解析的选择题?", |
| "options": ["支持交互式教学反馈", "为了减少推理时间", "为了替代PDF上传", "为了训练TTS模型"], |
| "answer": "A", |
| "explanation": "解析能帮助学生理解错误原因并形成学习闭环。", |
| }, |
| { |
| "question": "长论文处理通常更稳妥的做法是?", |
| "options": ["分块阅读后汇总", "一次性全部输入且不做控制", "只看标题", "随机抽样一页"], |
| "answer": "A", |
| "explanation": "分块可以降低上下文长度风险并提高稳定性。", |
| }, |
| { |
| "question": "在你的课程Demo里,TTS最主要用于?", |
| "options": ["讲解和错题解析语音输出", "替代VL模型阅读PDF", "生成图片", "训练新LoRA"], |
| "answer": "A", |
| "explanation": "TTS负责文本转语音,增强演示交互体验。", |
| }, |
| ] |
|
|
|
|
def run_prototype(pdf_path: str, mock: bool = True) -> Dict[str, Any]:
    """Run the PDF -> lecture text -> MCQs prototype flow for one paper.

    Args:
        pdf_path: Path of the PDF to process.
        mock: When True, return canned lecture text and questions. When False,
            import ``QwenPipelineEngine`` and ``parse_mcq_json`` from ``app``
            and generate both with that engine.

    Returns:
        A dict with the keys ``pdf_path``, ``document_excerpt`` (first 2000
        characters of the extracted text), ``lecture_prompt``, ``mcq_prompt``,
        ``lecture_text`` and ``mcqs`` (a list of question dicts).

    Raises:
        RuntimeError: Propagated from ``extract_pdf_text`` when pypdf is
            missing.
    """
    pdf_path = str(Path(pdf_path))
    document = extract_pdf_text(pdf_path)

    # Substitute the placeholder literally instead of calling str.format():
    # MCQ_PROMPT embeds a JSON example with bare braces, which str.format()
    # parses as replacement fields and rejects with KeyError. A template
    # without the placeholder is passed through unchanged.
    lecture_prompt = LECTURE_PROMPT.replace("{document}", document)
    mcq_prompt = MCQ_PROMPT.replace("{document}", document)

    if mock:
        lecture_text = _mock_lecture(document)
        mcqs = _mock_mcqs()
    else:
        # Imported here so that mock mode does not need app.py to be importable.
        from app import QwenPipelineEngine, parse_mcq_json

        engine = QwenPipelineEngine()
        engine.mock_mode = False
        engine.ensure_vl_loaded()
        lecture_text = engine._real_generate_text_from_pdf(pdf_path, lecture_prompt)
        raw_mcq_json = engine._real_generate_text_from_pdf(pdf_path, mcq_prompt)
        # Assumes parse_mcq_json returns objects with an instance __dict__
        # (TODO confirm). Copy it so callers cannot mutate the parsed objects
        # through the returned dicts.
        mcqs = [dict(vars(q)) for q in parse_mcq_json(raw_mcq_json)]

    return {
        "pdf_path": pdf_path,
        "document_excerpt": document[:2000],
        "lecture_prompt": lecture_prompt,
        "mcq_prompt": mcq_prompt,
        "lecture_text": lecture_text,
        "mcqs": mcqs,
    }
|
|
|
|
def pretty_print_mcqs(mcqs: List[Dict[str, Any]]) -> None:
    """Print each question with lettered options, its answer and explanation.

    Args:
        mcqs: Question dicts with the keys ``question``, ``options``,
            ``answer`` and ``explanation``.

    Raises:
        KeyError: If a question dict lacks one of those keys.
    """
    for number, item in enumerate(mcqs, start=1):
        # A real newline separates questions; the escaped form printed a
        # literal backslash-n before each "Q".
        print(f"\nQ{number}. {item['question']}")
        # Letters are derived from the option index so options beyond the
        # fourth are still printed rather than silently dropped.
        for offset, option in enumerate(item["options"]):
            label = chr(ord("A") + offset)
            print(f" {label}. {option}")
        print(f"Answer: {item['answer']}")
        print(f"Explanation: {item['explanation']}")
|
|
|
|
if __name__ == "__main__":
    # Running the file as a script does no processing; it only prints a hint
    # that the helpers are meant to be imported.
    _usage_hint = "Import this file in a notebook and call run_prototype('/path/to/file.pdf')."
    print(_usage_hint)
|
|