File size: 5,856 Bytes
733c2e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""
Notebook-friendly prototype helpers for the coursework workflow:
PDF -> lecture text -> 5 MCQs (with answers + explanations)

Usage in Colab/Jupyter:
    from notebook_prototype import run_prototype
    result = run_prototype("/path/to/paper.pdf", mock=True)
    print(result["lecture_text"])
    print(result["mcqs"][0])

Set mock=False to reuse the real Qwen3-VL backend from app.py (same prompts/parsing flow).
"""

import json
import re
from pathlib import Path
from typing import Any, Dict, List

try:
    from pypdf import PdfReader
except Exception:
    PdfReader = None  # type: ignore


# Prompt template asking the model for a 400-700 character Chinese lecture-style
# explanation of the paper. The single replacement field {document} is filled by
# run_prototype() via str.format().
LECTURE_PROMPT = """
你是一名课程助教。请阅读论文内容并写一段中文讲解(400-700字),包括:
问题背景、核心方法、实验亮点、局限性与适用场景。

论文内容:
{document}
""".strip()


# Prompt template asking the model for 5 Chinese single-choice questions in
# strict JSON. The literal JSON braces are doubled ({{ }}) so str.format()
# treats them as plain text; only {document} is a real replacement field.
# (With single braces, MCQ_PROMPT.format(document=...) raised ValueError.)
# The paper text is appended so the questions are actually grounded in it,
# mirroring LECTURE_PROMPT.
MCQ_PROMPT = """
请基于论文内容生成 5 道中文单选题,并严格输出 JSON:
{{
  "questions": [
    {{
      "question": "...",
      "options": ["...", "...", "...", "..."],
      "answer": "A",
      "explanation": "..."
    }}
  ]
}}

论文内容:
{document}
""".strip()


def extract_pdf_text(pdf_path: str, max_chars: int = 16000) -> str:
    """Pull plain text from a PDF, page by page, capped at ~``max_chars`` chars.

    Each non-empty page is prefixed with a ``[Page N]`` marker so the model can
    reference locations. Collection stops once the running total reaches the
    cap, and the joined result is sliced to the cap to bound prompt size.

    Raises:
        RuntimeError: if the optional ``pypdf`` dependency is not installed.
    """
    if PdfReader is None:
        raise RuntimeError("pypdf is not installed.")
    collected: List[str] = []
    running_len = 0
    for page_no, page in enumerate(PdfReader(pdf_path).pages, start=1):
        body = (page.extract_text() or "").strip()
        if not body:
            # Scanned / image-only pages yield no extractable text; skip them.
            continue
        piece = f"[Page {page_no}]\n{body}\n"
        collected.append(piece)
        running_len += len(piece)
        if running_len >= max_chars:
            break
    if not collected:
        return (
            "No extractable text found. For scanned PDFs, convert pages to "
            "images and feed them to a VL model."
        )
    return "\n".join(collected)[:max_chars]


def _mock_lecture(document: str) -> str:
    short = re.sub(r"\\s+", " ", document)[:1000]
    return (
        "【Mock讲解】这篇论文主要围绕一个机器学习/生成式AI任务展开,目标是改善现有方法在效果、效率或稳定性上的不足。"
        "作者通过提出新的模型结构、训练策略或推理流程来解决该问题,并通过实验与基线比较验证方法有效性。"
        "在阅读时建议重点关注:任务定义、输入输出、方法模块、实验设置、指标、消融实验,以及论文提到的局限性。\\n\\n"
        f"论文节选:{short}"
    )


def _mock_mcqs() -> List[Dict[str, Any]]:
    return [
        {
            "question": "论文讲解中首先应说明什么?",
            "options": ["问题背景与任务目标", "部署服务器价格", "前端样式颜色", "Git分支命名"],
            "answer": "A",
            "explanation": "先解释背景与目标,听众才知道作者为什么提出该方法。",
        },
        {
            "question": "哪一项更适合用于解释论文方法?",
            "options": ["按模块/步骤描述输入到输出流程", "只贴公式不解释", "只读摘要", "只看结论"],
            "answer": "A",
            "explanation": "方法讲解应结构化呈现,否则难以理解论文贡献点。",
        },
        {
            "question": "为什么要生成带解析的选择题?",
            "options": ["支持交互式教学反馈", "为了减少推理时间", "为了替代PDF上传", "为了训练TTS模型"],
            "answer": "A",
            "explanation": "解析能帮助学生理解错误原因并形成学习闭环。",
        },
        {
            "question": "长论文处理通常更稳妥的做法是?",
            "options": ["分块阅读后汇总", "一次性全部输入且不做控制", "只看标题", "随机抽样一页"],
            "answer": "A",
            "explanation": "分块可以降低上下文长度风险并提高稳定性。",
        },
        {
            "question": "在你的课程Demo里,TTS最主要用于?",
            "options": ["讲解和错题解析语音输出", "替代VL模型阅读PDF", "生成图片", "训练新LoRA"],
            "answer": "A",
            "explanation": "TTS负责文本转语音,增强演示交互体验。",
        },
    ]


def run_prototype(pdf_path: str, mock: bool = True) -> Dict[str, Any]:
    """Run the PDF -> lecture -> MCQ pipeline and return all intermediate artifacts.

    Args:
        pdf_path: Path to the input PDF.
        mock: When True (default), return canned lecture/MCQ content without
            loading any model; when False, reuse the Qwen3-VL backend from
            app.py so notebook and app behavior stay aligned.

    Returns:
        Dict with the pdf path, a 2000-char document excerpt, both rendered
        prompts, the lecture text, and the list of MCQ dicts.
    """
    pdf_path = str(Path(pdf_path))
    document = extract_pdf_text(pdf_path)

    lecture_prompt = LECTURE_PROMPT.format(document=document)
    # NOTE(review): str.format treats any literal "{"/"}" in the template as a
    # replacement field -- confirm MCQ_PROMPT escapes its JSON braces as {{ }},
    # otherwise this line raises ValueError.
    mcq_prompt = MCQ_PROMPT.format(document=document)

    if mock:
        lecture_text = _mock_lecture(document)
        mcqs = _mock_mcqs()
    else:
        # Reuse the same backend implementation as app.py to keep notebook/app behavior aligned.
        from app import QwenPipelineEngine, parse_mcq_json  # local import avoids gradio setup cost until needed

        engine = QwenPipelineEngine()
        engine.mock_mode = False
        engine.ensure_vl_loaded()
        lecture_text = engine._real_generate_text_from_pdf(pdf_path, lecture_prompt)
        raw_mcq_json = engine._real_generate_text_from_pdf(pdf_path, mcq_prompt)
        # parse_mcq_json returns dataclass-like objects; __dict__ flattens them
        # to plain dicts so the mock and real paths share one output shape.
        mcqs = [q.__dict__ for q in parse_mcq_json(raw_mcq_json)]

    return {
        "pdf_path": pdf_path,
        "document_excerpt": document[:2000],
        "lecture_prompt": lecture_prompt,
        "mcq_prompt": mcq_prompt,
        "lecture_text": lecture_text,
        "mcqs": mcqs,
    }


def pretty_print_mcqs(mcqs: List[Dict[str, Any]]) -> None:
    """Print each MCQ with lettered options, then its answer and explanation."""
    labels = ["A", "B", "C", "D"]
    for number, item in enumerate(mcqs, start=1):
        print(f"\nQ{number}. {item['question']}")
        for label, option in zip(labels, item["options"]):
            print(f"  {label}. {option}")
        print(f"Answer: {item['answer']}")
        print(f"Explanation: {item['explanation']}")


if __name__ == "__main__":
    # Minimal local check (replace with a real PDF path). Running this file
    # directly only prints usage guidance; the intended entry point is
    # run_prototype() imported from a notebook.
    print("Import this file in a notebook and call run_prototype('/path/to/file.pdf').")