# genai/notebook_prototype.py
# Author: llexieguo — "Initial PDF tutor app" (commit 733c2e2)
"""
Notebook-friendly prototype helpers for the coursework workflow:
PDF -> lecture text -> 5 MCQs (with answers + explanations)
Usage in Colab/Jupyter:
from notebook_prototype import run_prototype
result = run_prototype("/path/to/paper.pdf", mock=True)
print(result["lecture_text"])
print(result["mcqs"][0])
Set mock=False to reuse the real Qwen3-VL backend from app.py (same prompts/parsing flow).
"""
import json
import re
from pathlib import Path
from typing import Any, Dict, List
# Optional dependency: keep the import failure soft so this module can still be
# imported without pypdf; extract_pdf_text() raises RuntimeError when used.
try:
    from pypdf import PdfReader
except Exception:
    PdfReader = None  # type: ignore
# Prompt asking for a 400-700 character Chinese lecture-style explanation; the
# extracted paper text is substituted for {document} via str.format().
LECTURE_PROMPT = """
你是一名课程助教。请阅读论文内容并写一段中文讲解(400-700字),包括:
问题背景、核心方法、实验亮点、局限性与适用场景。
论文内容:
{document}
""".strip()

# Prompt asking for 5 Chinese single-choice questions as strict JSON.
# Fix: the literal braces of the JSON skeleton are doubled ({{ / }}) so that
# MCQ_PROMPT.format(...) treats them as literal characters; with single braces
# the .format() call in run_prototype() raised ValueError/KeyError.
MCQ_PROMPT = """
请基于论文内容生成 5 道中文单选题,并严格输出 JSON:
{{
"questions": [
{{
"question": "...",
"options": ["...", "...", "...", "..."],
"answer": "A",
"explanation": "..."
}}
]
}}
""".strip()
def extract_pdf_text(pdf_path: str, max_chars: int = 16000) -> str:
    """Extract page-tagged text from a PDF, capped at ``max_chars`` characters.

    Args:
        pdf_path: Path to the PDF file on disk.
        max_chars: Soft cap on the total extracted length; reading stops once
            the accumulated text reaches this size.

    Returns:
        Concatenated per-page text with ``[Page N]`` markers, or a fallback
        message when the PDF has no extractable text layer (e.g. scans).

    Raises:
        RuntimeError: If pypdf is not installed.
    """
    if PdfReader is None:
        raise RuntimeError("pypdf is not installed.")
    reader = PdfReader(pdf_path)
    chunks: List[str] = []
    total = 0
    for i, page in enumerate(reader.pages, start=1):
        text = (page.extract_text() or "").strip()
        if not text:
            continue  # skip image-only / empty pages
        # Fix: use real newlines; the original emitted literal "\n" sequences.
        part = f"[Page {i}]\n{text}\n"
        chunks.append(part)
        total += len(part)
        if total >= max_chars:
            break  # bound the prompt size for downstream models
    if not chunks:
        return "No extractable text found. For scanned PDFs, convert pages to images and feed them to a VL model."
    return "\n".join(chunks)[:max_chars]
def _mock_lecture(document: str) -> str:
short = re.sub(r"\\s+", " ", document)[:1000]
return (
"【Mock讲解】这篇论文主要围绕一个机器学习/生成式AI任务展开,目标是改善现有方法在效果、效率或稳定性上的不足。"
"作者通过提出新的模型结构、训练策略或推理流程来解决该问题,并通过实验与基线比较验证方法有效性。"
"在阅读时建议重点关注:任务定义、输入输出、方法模块、实验设置、指标、消融实验,以及论文提到的局限性。\\n\\n"
f"论文节选:{short}"
)
def _mock_mcqs() -> List[Dict[str, Any]]:
return [
{
"question": "论文讲解中首先应说明什么?",
"options": ["问题背景与任务目标", "部署服务器价格", "前端样式颜色", "Git分支命名"],
"answer": "A",
"explanation": "先解释背景与目标,听众才知道作者为什么提出该方法。",
},
{
"question": "哪一项更适合用于解释论文方法?",
"options": ["按模块/步骤描述输入到输出流程", "只贴公式不解释", "只读摘要", "只看结论"],
"answer": "A",
"explanation": "方法讲解应结构化呈现,否则难以理解论文贡献点。",
},
{
"question": "为什么要生成带解析的选择题?",
"options": ["支持交互式教学反馈", "为了减少推理时间", "为了替代PDF上传", "为了训练TTS模型"],
"answer": "A",
"explanation": "解析能帮助学生理解错误原因并形成学习闭环。",
},
{
"question": "长论文处理通常更稳妥的做法是?",
"options": ["分块阅读后汇总", "一次性全部输入且不做控制", "只看标题", "随机抽样一页"],
"answer": "A",
"explanation": "分块可以降低上下文长度风险并提高稳定性。",
},
{
"question": "在你的课程Demo里,TTS最主要用于?",
"options": ["讲解和错题解析语音输出", "替代VL模型阅读PDF", "生成图片", "训练新LoRA"],
"answer": "A",
"explanation": "TTS负责文本转语音,增强演示交互体验。",
},
]
def run_prototype(pdf_path: str, mock: bool = True) -> Dict[str, Any]:
    """End-to-end helper: PDF -> lecture text -> 5 MCQs (with explanations).

    With ``mock=True`` canned outputs are returned; otherwise the Qwen3-VL
    backend from app.py is reused so notebook and app behavior stay aligned.
    """
    resolved = str(Path(pdf_path))
    document = extract_pdf_text(resolved)
    lecture_prompt = LECTURE_PROMPT.format(document=document)
    mcq_prompt = MCQ_PROMPT.format(document=document)
    if mock:
        lecture_text = _mock_lecture(document)
        mcqs = _mock_mcqs()
    else:
        # Import lazily so the gradio/app setup cost is only paid when needed;
        # reusing app.py keeps prompts and parsing identical to the real app.
        from app import QwenPipelineEngine, parse_mcq_json

        engine = QwenPipelineEngine()
        engine.mock_mode = False
        engine.ensure_vl_loaded()
        lecture_text = engine._real_generate_text_from_pdf(resolved, lecture_prompt)
        raw_mcq_json = engine._real_generate_text_from_pdf(resolved, mcq_prompt)
        mcqs = [question.__dict__ for question in parse_mcq_json(raw_mcq_json)]
    result: Dict[str, Any] = {
        "pdf_path": resolved,
        "document_excerpt": document[:2000],
        "lecture_prompt": lecture_prompt,
        "mcq_prompt": mcq_prompt,
        "lecture_text": lecture_text,
        "mcqs": mcqs,
    }
    return result
def pretty_print_mcqs(mcqs: List[Dict[str, Any]]) -> None:
    """Print MCQs in a readable layout: question, A-D options, answer, explanation.

    Args:
        mcqs: Question dicts with "question", "options" (up to four entries),
            "answer", and "explanation" keys, as produced by run_prototype().
    """
    for i, q in enumerate(mcqs, start=1):
        # Fix: emit a real blank line; the original printed a literal "\n".
        print(f"\nQ{i}. {q['question']}")
        for label, opt in zip(["A", "B", "C", "D"], q["options"]):
            print(f" {label}. {opt}")
        print(f"Answer: {q['answer']}")
        print(f"Explanation: {q['explanation']}")
# Running this file directly only prints usage guidance; the intended entry
# point is importing run_prototype() from a notebook.
if __name__ == "__main__":
    # Minimal local check (replace with a real PDF path).
    print("Import this file in a notebook and call run_prototype('/path/to/file.pdf').")