"""
Notebook-friendly prototype helpers for the coursework workflow:
PDF -> lecture text -> 5 MCQs (with answers + explanations).

Usage in Colab/Jupyter:

    from notebook_prototype import run_prototype
    result = run_prototype("/path/to/paper.pdf", mock=True)
    print(result["lecture_text"])
    print(result["mcqs"][0])

Set mock=False to reuse the real Qwen3-VL backend from app.py (same prompts/parsing flow).
"""
import json
import re
from pathlib import Path
from typing import Any, Dict, List
try:
from pypdf import PdfReader
except Exception:
PdfReader = None # type: ignore
# Prompt template (Chinese) asking for a 400-700 character lecture-style
# explanation of the paper. ``{document}`` is the placeholder for the paper text.
LECTURE_PROMPT = """
你是一名课程助教。请阅读论文内容并写一段中文讲解(400-700字),包括:
问题背景、核心方法、实验亮点、局限性与适用场景。
论文内容:
{document}
""".strip()
# Prompt (Chinese) asking for five single-choice questions returned as strict
# JSON with "question", "options", "answer" and "explanation" per item.
# NOTE(review): this text has no ``{document}`` field and its JSON example uses
# unescaped braces, so ``str.format()`` on it raises instead of producing a
# prompt. Fill it with ``str.replace()`` or escape the braces as ``{{``/``}}``.
MCQ_PROMPT = """
请基于论文内容生成 5 道中文单选题,并严格输出 JSON:
{
"questions": [
{
"question": "...",
"options": ["...", "...", "...", "..."],
"answer": "A",
"explanation": "..."
}
]
}
""".strip()
def extract_pdf_text(pdf_path: str, max_chars: int = 16000) -> str:
if PdfReader is None:
raise RuntimeError("pypdf is not installed.")
reader = PdfReader(pdf_path)
chunks: List[str] = []
total = 0
for i, page in enumerate(reader.pages, start=1):
text = (page.extract_text() or "").strip()
if not text:
continue
part = f"[Page {i}]\\n{text}\\n"
chunks.append(part)
total += len(part)
if total >= max_chars:
break
if not chunks:
return "No extractable text found. For scanned PDFs, convert pages to images and feed them to a VL model."
return "\\n".join(chunks)[:max_chars]
def _mock_lecture(document: str) -> str:
short = re.sub(r"\\s+", " ", document)[:1000]
return (
"【Mock讲解】这篇论文主要围绕一个机器学习/生成式AI任务展开,目标是改善现有方法在效果、效率或稳定性上的不足。"
"作者通过提出新的模型结构、训练策略或推理流程来解决该问题,并通过实验与基线比较验证方法有效性。"
"在阅读时建议重点关注:任务定义、输入输出、方法模块、实验设置、指标、消融实验,以及论文提到的局限性。\\n\\n"
f"论文节选:{short}"
)
def _mock_mcqs() -> List[Dict[str, Any]]:
return [
{
"question": "论文讲解中首先应说明什么?",
"options": ["问题背景与任务目标", "部署服务器价格", "前端样式颜色", "Git分支命名"],
"answer": "A",
"explanation": "先解释背景与目标,听众才知道作者为什么提出该方法。",
},
{
"question": "哪一项更适合用于解释论文方法?",
"options": ["按模块/步骤描述输入到输出流程", "只贴公式不解释", "只读摘要", "只看结论"],
"answer": "A",
"explanation": "方法讲解应结构化呈现,否则难以理解论文贡献点。",
},
{
"question": "为什么要生成带解析的选择题?",
"options": ["支持交互式教学反馈", "为了减少推理时间", "为了替代PDF上传", "为了训练TTS模型"],
"answer": "A",
"explanation": "解析能帮助学生理解错误原因并形成学习闭环。",
},
{
"question": "长论文处理通常更稳妥的做法是?",
"options": ["分块阅读后汇总", "一次性全部输入且不做控制", "只看标题", "随机抽样一页"],
"answer": "A",
"explanation": "分块可以降低上下文长度风险并提高稳定性。",
},
{
"question": "在你的课程Demo里,TTS最主要用于?",
"options": ["讲解和错题解析语音输出", "替代VL模型阅读PDF", "生成图片", "训练新LoRA"],
"answer": "A",
"explanation": "TTS负责文本转语音,增强演示交互体验。",
},
]
def run_prototype(pdf_path: str, mock: bool = True) -> Dict[str, Any]:
pdf_path = str(Path(pdf_path))
document = extract_pdf_text(pdf_path)
lecture_prompt = LECTURE_PROMPT.format(document=document)
mcq_prompt = MCQ_PROMPT.format(document=document)
if mock:
lecture_text = _mock_lecture(document)
mcqs = _mock_mcqs()
else:
# Reuse the same backend implementation as app.py to keep notebook/app behavior aligned.
from app import QwenPipelineEngine, parse_mcq_json # local import avoids gradio setup cost until needed
engine = QwenPipelineEngine()
engine.mock_mode = False
engine.ensure_vl_loaded()
lecture_text = engine._real_generate_text_from_pdf(pdf_path, lecture_prompt)
raw_mcq_json = engine._real_generate_text_from_pdf(pdf_path, mcq_prompt)
mcqs = [q.__dict__ for q in parse_mcq_json(raw_mcq_json)]
return {
"pdf_path": pdf_path,
"document_excerpt": document[:2000],
"lecture_prompt": lecture_prompt,
"mcq_prompt": mcq_prompt,
"lecture_text": lecture_text,
"mcqs": mcqs,
}
def pretty_print_mcqs(mcqs: List[Dict[str, Any]]) -> None:
    """Print each question with lettered options, its answer and explanation.

    Args:
        mcqs: Question dicts with the keys ``question``, ``options``,
            ``answer`` and ``explanation``. Only the first four options are
            printed, labelled A-D.
    """
    for i, q in enumerate(mcqs, start=1):
        # A real newline puts a blank line before each question; an escaped
        # backslash would print the two characters "\n" instead.
        print(f"\nQ{i}. {q['question']}")
        for label, opt in zip("ABCD", q["options"]):
            print(f" {label}. {opt}")
        print(f"Answer: {q['answer']}")
        print(f"Explanation: {q['explanation']}")
if __name__ == "__main__":
    # Running the module directly only prints a usage hint; no PDF is read.
    # Pass a real PDF path to run_prototype() from a notebook instead.
    _usage_hint = "Import this file in a notebook and call run_prototype('/path/to/file.pdf')."
    print(_usage_hint)