File size: 4,697 Bytes
e1c0b77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
978d90d
 
e1c0b77
67da08d
e1c0b77
e984a0a
 
 
 
 
 
e1c0b77
 
e984a0a
 
 
3e7d6d6
 
 
e984a0a
3e7d6d6
490c5f1
3e7d6d6
e1c0b77
 
 
 
 
 
 
978d90d
 
 
 
 
 
c8e8b73
978d90d
c8e8b73
978d90d
c8e8b73
 
 
978d90d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67da08d
c8e8b73
 
8f2e039
c8e8b73
 
 
 
 
978d90d
 
81e2542
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e984a0a
e1c0b77
 
e984a0a
67da08d
e984a0a
 
 
81e2542
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""
core/questioner.py — Generate study questions from text chunks using an LLM.

Responsibility:
    Given a thematic chunk of course material, craft a single, focused
    open-ended question that tests conceptual understanding.  The question
    should be answerable solely from the provided chunk.

The prompt instructs the model to produce exactly one question with no
surrounding commentary so the output can be displayed directly to the student.

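Public API:
    generate_question(chunk: str, language: str = "English", difficulty: str = "Normal") -> str
    generate_mcq(chunk: str, language: str = "English") -> dict
    parse_mcq(raw: str) -> dict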
"""

import re
import json
from model.llm import get_llm
from core.lang import ensure_english

# Maps a difficulty label to the instruction sentence substituted for the
# {difficulty_hint} placeholder of _PROMPT_TEMPLATE. generate_question()
# falls back to the "Normal" entry when given a label that is not a key here.
_DIFFICULTY_HINT = {
    "Easy":   "Ask for simple factual recall (What is X? Define X.).",
    "Normal": "Ask for conceptual understanding (Explain X. Why does X happen?).",
    "Hard":   "Ask for analysis or application (Compare X and Y. How would you apply X to Y?).",
}

# Prompt for a single open-ended question.
# Placeholders filled via str.format(): {difficulty_hint} and {chunk}.
# generate_question() also passes language=... to .format(); there is no
# {language} placeholder here, so that argument has no effect on the prompt.
# The prompt ends with the cue "Question:"; _clean_question() strips an echoed
# "Question:" prefix from the model output.
# NOTE(review): the "β€”" sequence below looks like a mis-decoded em dash;
# confirm the file encoding before changing it, since this text is sent to
# the model verbatim.
_PROMPT_TEMPLATE = """\
You are a university professor creating exam questions.
Given the following excerpt from a course, write ONE focused question.

Difficulty β€” {difficulty_hint}

Rules:
- ONE question only, on ONE concept
- Maximum 25 words
- No sub-questions, no "and", no compound questions
- IMPORTANT: Always write the question in English, even if the source text is in another language
- Output only the question, nothing else

Excerpt:
{chunk}

Question:"""


# Prompt for a four-option multiple-choice question.
# Only placeholder: {chunk}. generate_mcq() also passes language=... to
# .format(), which is ignored because no {language} placeholder exists.
# The labels in the "Output format" section (QUESTION:, A) .. D), CORRECT:,
# EXPLAIN_A: .. EXPLAIN_D:) are exactly what parse_mcq() looks for, so the
# two must be changed together.
# NOTE(review): the "β€”" sequences below look like mis-decoded em dashes;
# confirm the file encoding before changing them, since this text is sent to
# the model verbatim.
_MCQ_TEMPLATE = """\
You are a university professor creating a multiple choice exam question.
Given the following excerpt, write ONE multiple choice question.

Rules:
- One clear question, maximum 25 words
- The question must NOT contain or reveal the answer β€” do not quote the correct option in the question
- Exactly 4 options (A, B, C, D), only ONE is correct
- ALL 4 options MUST be completely different from each other β€” no two options may say the same thing
- All wrong options must be plausible β€” no obviously wrong answers
- IMPORTANT: Write ONLY in English β€” translate all concepts, do NOT quote the source in its original language

For each option, write a 1-sentence explanation of why it is correct or incorrect.

Output format (use EXACTLY these labels, one per line, nothing else):
QUESTION: <question>
A) <option>
B) <option>
C) <option>
D) <option>
CORRECT: <A, B, C or D>
EXPLAIN_A: <explanation>
EXPLAIN_B: <explanation>
EXPLAIN_C: <explanation>
EXPLAIN_D: <explanation>

Excerpt:
{chunk}
"""


def parse_mcq(raw: str) -> dict:
    """Parse the LLM's structured MCQ output into a dict.

    The text is scanned line by line (each line stripped) for the labels
    requested by the MCQ prompt:

    * ``QUESTION: <text>``   -> ``result["question"]``
    * ``A) <text>`` .. ``D)``  -> ``result["choices"][letter]``
    * ``CORRECT: <letter>``  -> ``result["correct"]``
    * ``EXPLAIN_A: <text>``  -> ``result["explanations"][letter]``

    Lines that match none of these are ignored. When a label appears more
    than once, the last occurrence wins.

    Args:
        raw: Raw completion text returned by the model.

    Returns:
        A dict with keys ``"question"`` (str), ``"choices"`` (dict mapping
        letter to option text), ``"correct"`` (one of ``"A"``-``"D"``, or
        ``""`` when no valid letter was found) and ``"explanations"`` (dict
        mapping letter to explanation text). Missing parts keep their empty
        defaults; no exception is raised for malformed input.
    """
    result: dict = {"question": "", "choices": {}, "correct": "", "explanations": {}}
    for line in raw.splitlines():
        line = line.strip()
        if line.startswith("QUESTION:"):
            result["question"] = line[9:].strip()
        # Whitespace after ")" is optional ("A)text"), but the option text
        # itself must be non-empty so a bare "A)" never counts as a choice.
        elif m := re.match(r'^([ABCD])\)\s*(\S.*)', line):
            result["choices"][m.group(1)] = m.group(2)
        elif line.startswith("CORRECT:"):
            # Models often decorate the letter ("(B)", "**B**", "b) ...").
            # Take the first standalone A-D letter instead of blindly using
            # the first character, and leave "" when there is none.
            letter = re.search(r'\b([ABCD])\b', line[8:].upper())
            result["correct"] = letter.group(1) if letter else ""
        # Same leniency as for choices: optional whitespace, non-empty text.
        elif m := re.match(r'^EXPLAIN_([ABCD]):\s*(\S.*)', line):
            result["explanations"][m.group(1)] = m.group(2)
    return result


def generate_mcq(chunk: str, language: str = "English") -> dict:
    """Generate a multiple-choice question from *chunk*.

    The stripped chunk is passed through ``ensure_english`` and inserted into
    the MCQ prompt. The model is sampled up to three times at temperature 0.8;
    the first parsed result with exactly four options that are pairwise
    distinct (compared case-insensitively, ignoring surrounding whitespace)
    is returned.

    Args:
        chunk: Excerpt of course material to base the question on.
        language: Forwarded to the prompt formatter; the MCQ template has no
            ``{language}`` placeholder, so it does not alter the prompt.

    Returns:
        The dict produced by ``parse_mcq``. If no attempt passes the
        four-distinct-options check, the last attempt's parse result is
        returned as-is.
    """
    model = get_llm()
    excerpt = ensure_english(chunk.strip())
    filled_prompt = _MCQ_TEMPLATE.format(chunk=excerpt, language=language)

    parsed: dict = {}
    attempts_left = 3
    while attempts_left > 0:
        attempts_left -= 1
        completion = model.generate(filled_prompt, temperature=0.8).strip()
        parsed = parse_mcq(completion)
        option_texts = list(parsed.get("choices", {}).values())
        if len(option_texts) != 4:
            continue
        normalized = {text.lower().strip() for text in option_texts}
        if len(normalized) == 4:
            break
    return parsed


def _clean_question(raw: str) -> str:
    """Keep only the first question β€” the model can ramble past it."""
    text = raw.strip()
    if text.lower().startswith("question:"):
        text = text[9:].strip()
    qmark = text.find("?")
    if qmark != -1:
        return text[: qmark + 1].strip()
    for marker in ("\nAnswer:", "Answer:", "\nQuestion:"):
        idx = text.find(marker)
        if idx > 0:
            text = text[:idx]
    return text.strip()


def generate_question(chunk: str, language: str = "English", difficulty: str = "Normal") -> str:
    """Generate one open-ended study question from *chunk*.

    Args:
        chunk: Excerpt of course material; it is stripped and passed through
            ``ensure_english`` before being placed in the prompt.
        language: Forwarded to the prompt formatter; the question template
            has no ``{language}`` placeholder, so it does not alter the prompt.
        difficulty: Key into ``_DIFFICULTY_HINT`` ("Easy", "Normal", "Hard").
            Unknown values fall back to the "Normal" hint.

    Returns:
        The model's completion after ``_clean_question`` has trimmed it to a
        single question.
    """
    model = get_llm()
    template_fields = {
        "chunk": ensure_english(chunk.strip()),
        "language": language,
        "difficulty_hint": _DIFFICULTY_HINT.get(difficulty, _DIFFICULTY_HINT["Normal"]),
    }
    filled_prompt = _PROMPT_TEMPLATE.format(**template_fields)
    # The prompt caps questions at 25 words, so 80 new tokens is ample and
    # much cheaper than the 512-token default when the model fails to stop.
    raw_completion = model.generate(filled_prompt, max_new_tokens=80)
    return _clean_question(raw_completion)