File size: 5,856 Bytes
733c2e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""
Notebook-friendly prototype helpers for the coursework workflow:
PDF -> lecture text -> 5 MCQs (with answers + explanations)

Usage in Colab/Jupyter:
    from notebook_prototype import run_prototype
    result = run_prototype("/path/to/paper.pdf", mock=True)
    print(result["lecture_text"])
    print(result["mcqs"][0])

Set mock=False to reuse the real Qwen3-VL backend from app.py (same prompts/parsing flow).
"""

import json
import re
from pathlib import Path
from typing import Any, Dict, List

try:
    from pypdf import PdfReader
except Exception:
    PdfReader = None  # type: ignore


# Prompt template asking the model for a 400-700 character Chinese lecture-style
# explanation of the paper. The single replacement field {document} is filled by
# run_prototype() via str.format().
LECTURE_PROMPT = """
你是一名课程助教。请阅读论文内容并写一段中文讲解(400-700字),包括:
问题背景、核心方法、实验亮点、局限性与适用场景。

论文内容:
{document}
""".strip()


# Prompt template asking the model for 5 Chinese single-choice questions in
# strict JSON. The literal JSON braces are doubled ({{ }}) so str.format()
# treats them as plain text; only {document} is a real replacement field.
# (With single braces, MCQ_PROMPT.format(document=...) raised ValueError.)
# The paper text is appended so the questions are actually grounded in it,
# mirroring LECTURE_PROMPT.
MCQ_PROMPT = """
请基于论文内容生成 5 道中文单选题,并严格输出 JSON:
{{
  "questions": [
    {{
      "question": "...",
      "options": ["...", "...", "...", "..."],
      "answer": "A",
      "explanation": "..."
    }}
  ]
}}

论文内容:
{document}
""".strip()


def extract_pdf_text(pdf_path: str, max_chars: int = 16000) -> str:
    """Pull plain text from a PDF, page by page, capped at ~``max_chars`` chars.

    Each non-empty page is prefixed with a ``[Page N]`` marker so the model can
    reference locations. Collection stops once the running total reaches the
    cap, and the joined result is sliced to the cap to bound prompt size.

    Raises:
        RuntimeError: if the optional ``pypdf`` dependency is not installed.
    """
    if PdfReader is None:
        raise RuntimeError("pypdf is not installed.")
    collected: List[str] = []
    running_len = 0
    for page_no, page in enumerate(PdfReader(pdf_path).pages, start=1):
        body = (page.extract_text() or "").strip()
        if not body:
            # Scanned / image-only pages yield no extractable text; skip them.
            continue
        piece = f"[Page {page_no}]\n{body}\n"
        collected.append(piece)
        running_len += len(piece)
        if running_len >= max_chars:
            break
    if not collected:
        return (
            "No extractable text found. For scanned PDFs, convert pages to "
            "images and feed them to a VL model."
        )
    return "\n".join(collected)[:max_chars]


def _mock_lecture(document: str) -> str:
    short = re.sub(r"\\s+", " ", document)[:1000]
    return (
        "【Mock讲解】这篇论文主要围绕一个机器学习/生成式AI任务展开,目标是改善现有方法在效果、效率或稳定性上的不足。"
        "作者通过提出新的模型结构、训练策略或推理流程来解决该问题,并通过实验与基线比较验证方法有效性。"
        "在阅读时建议重点关注:任务定义、输入输出、方法模块、实验设置、指标、消融实验,以及论文提到的局限性。\\n\\n"
        f"论文节选:{short}"
    )


def _mock_mcqs() -> List[Dict[str, Any]]:
    return [
        {
            "question": "论文讲解中首先应说明什么?",
            "options": ["问题背景与任务目标", "部署服务器价格", "前端样式颜色", "Git分支命名"],
            "answer": "A",
            "explanation": "先解释背景与目标,听众才知道作者为什么提出该方法。",
        },
        {
            "question": "哪一项更适合用于解释论文方法?",
            "options": ["按模块/步骤描述输入到输出流程", "只贴公式不解释", "只读摘要", "只看结论"],
            "answer": "A",
            "explanation": "方法讲解应结构化呈现,否则难以理解论文贡献点。",
        },
        {
            "question": "为什么要生成带解析的选择题?",
            "options": ["支持交互式教学反馈", "为了减少推理时间", "为了替代PDF上传", "为了训练TTS模型"],
            "answer": "A",
            "explanation": "解析能帮助学生理解错误原因并形成学习闭环。",
        },
        {
            "question": "长论文处理通常更稳妥的做法是?",
            "options": ["分块阅读后汇总", "一次性全部输入且不做控制", "只看标题", "随机抽样一页"],
            "answer": "A",
            "explanation": "分块可以降低上下文长度风险并提高稳定性。",
        },
        {
            "question": "在你的课程Demo里,TTS最主要用于?",
            "options": ["讲解和错题解析语音输出", "替代VL模型阅读PDF", "生成图片", "训练新LoRA"],
            "answer": "A",
            "explanation": "TTS负责文本转语音,增强演示交互体验。",
        },
    ]


def run_prototype(pdf_path: str, mock: bool = True) -> Dict[str, Any]:
    """Run the PDF -> lecture -> MCQ pipeline and return all intermediate artifacts.

    Args:
        pdf_path: Path to the input PDF.
        mock: When True (default), return canned lecture/MCQ content without
            loading any model; when False, reuse the Qwen3-VL backend from
            app.py so notebook and app behavior stay aligned.

    Returns:
        Dict with the pdf path, a 2000-char document excerpt, both rendered
        prompts, the lecture text, and the list of MCQ dicts.
    """
    pdf_path = str(Path(pdf_path))
    document = extract_pdf_text(pdf_path)

    lecture_prompt = LECTURE_PROMPT.format(document=document)
    # NOTE(review): str.format treats any literal "{"/"}" in the template as a
    # replacement field -- confirm MCQ_PROMPT escapes its JSON braces as {{ }},
    # otherwise this line raises ValueError.
    mcq_prompt = MCQ_PROMPT.format(document=document)

    if mock:
        lecture_text = _mock_lecture(document)
        mcqs = _mock_mcqs()
    else:
        # Reuse the same backend implementation as app.py to keep notebook/app behavior aligned.
        from app import QwenPipelineEngine, parse_mcq_json  # local import avoids gradio setup cost until needed

        engine = QwenPipelineEngine()
        engine.mock_mode = False
        engine.ensure_vl_loaded()
        lecture_text = engine._real_generate_text_from_pdf(pdf_path, lecture_prompt)
        raw_mcq_json = engine._real_generate_text_from_pdf(pdf_path, mcq_prompt)
        # parse_mcq_json returns dataclass-like objects; __dict__ flattens them
        # to plain dicts so the mock and real paths share one output shape.
        mcqs = [q.__dict__ for q in parse_mcq_json(raw_mcq_json)]

    return {
        "pdf_path": pdf_path,
        "document_excerpt": document[:2000],
        "lecture_prompt": lecture_prompt,
        "mcq_prompt": mcq_prompt,
        "lecture_text": lecture_text,
        "mcqs": mcqs,
    }


def pretty_print_mcqs(mcqs: List[Dict[str, Any]]) -> None:
    """Print each MCQ with lettered options, then its answer and explanation."""
    labels = ["A", "B", "C", "D"]
    for number, item in enumerate(mcqs, start=1):
        print(f"\nQ{number}. {item['question']}")
        for label, option in zip(labels, item["options"]):
            print(f"  {label}. {option}")
        print(f"Answer: {item['answer']}")
        print(f"Explanation: {item['explanation']}")


if __name__ == "__main__":
    # Minimal local check (replace with a real PDF path). Running this file
    # directly only prints usage guidance; the intended entry point is
    # run_prototype() imported from a notebook.
    print("Import this file in a notebook and call run_prototype('/path/to/file.pdf').")