Spaces:

build-small-hackathon
/

PaperProf

Running on Zero

Mehdi

fix: pre-translate non-English chunks to English before LLM inference

67da08d 13 days ago

4.7 kB

	"""
	core/questioner.py — Generate study questions from text chunks using an LLM.

	Responsibility:
	Given a thematic chunk of course material, craft a single, focused
	open-ended question that tests conceptual understanding. The question
	should be answerable solely from the provided chunk.

	The prompt instructs the model to produce exactly one question with no
	surrounding commentary so the output can be displayed directly to the student.

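	Public API:
	generate_question(chunk: str, language: str = "English", difficulty: str = "Normal") -> str
	generate_mcq(chunk: str, language: str = "English") -> dict
	parse_mcq(raw: str) -> dict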
	"""

	import re
	import json
	from model.llm import get_llm
	from core.lang import ensure_english

	# Maps a difficulty level name to the instruction sentence that fills the
	# {difficulty_hint} placeholder of _PROMPT_TEMPLATE. generate_question()
	# falls back to the "Normal" entry when given a level that is not a key here.
	_DIFFICULTY_HINT = {
	"Easy": "Ask for simple factual recall (What is X? Define X.).",
	"Normal": "Ask for conceptual understanding (Explain X. Why does X happen?).",
	"Hard": "Ask for analysis or application (Compare X and Y. How would you apply X to Y?).",
	}

	# Prompt for a single open-ended question, filled in with str.format().
	# Placeholders: {difficulty_hint} (a sentence from _DIFFICULTY_HINT) and
	# {chunk} (the course excerpt). The text ends with "Question:" so the model's
	# continuation is the question itself; _clean_question() trims that output.
	_PROMPT_TEMPLATE = """\
	You are a university professor creating exam questions.
	Given the following excerpt from a course, write ONE focused question.

	Difficulty — {difficulty_hint}

	Rules:
	- ONE question only, on ONE concept
	- Maximum 25 words
	- No sub-questions, no "and", no compound questions
	- IMPORTANT: Always write the question in English, even if the source text is in another language
	- Output only the question, nothing else

	Excerpt:
	{chunk}

	Question:"""


	# Prompt for a multiple-choice question, filled in with str.format().
	# The only placeholder is {chunk} (the course excerpt). The labels requested
	# in "Output format" (QUESTION:, A)..D), CORRECT:, EXPLAIN_A:..EXPLAIN_D:)
	# are the ones parse_mcq() looks for, so the two must be kept in sync.
	_MCQ_TEMPLATE = """\
	You are a university professor creating a multiple choice exam question.
	Given the following excerpt, write ONE multiple choice question.

	Rules:
	- One clear question, maximum 25 words
	- The question must NOT contain or reveal the answer — do not quote the correct option in the question
	- Exactly 4 options (A, B, C, D), only ONE is correct
	- ALL 4 options MUST be completely different from each other — no two options may say the same thing
	- All wrong options must be plausible — no obviously wrong answers
	- IMPORTANT: Write ONLY in English — translate all concepts, do NOT quote the source in its original language

	For each option, write a 1-sentence explanation of why it is correct or incorrect.

	Output format (use EXACTLY these labels, one per line, nothing else):
	QUESTION: <question>
	A) <option>
	B) <option>
	C) <option>
	D) <option>
	CORRECT: <A, B, C or D>
	EXPLAIN_A: <explanation>
	EXPLAIN_B: <explanation>
	EXPLAIN_C: <explanation>
	EXPLAIN_D: <explanation>

	Excerpt:
	{chunk}
	"""


	def parse_mcq(raw: str) -> dict:
	    """Parse the LLM's structured MCQ output into a dict.

	    The text is scanned line by line for the labels requested by the MCQ
	    prompt: ``QUESTION:``, ``A)`` .. ``D)``, ``CORRECT:`` and
	    ``EXPLAIN_A:`` .. ``EXPLAIN_D:``. Lines matching none of them are
	    ignored. If a label occurs more than once, the last occurrence wins.

	    Args:
	        raw: Raw text produced by the model.

	    Returns:
	        A dict with four keys: ``question`` (str), ``choices`` (letter ->
	        option text), ``correct`` (one of "A".."D", or "" when no valid
	        letter was found) and ``explanations`` (letter -> explanation).
	        Fields that were not found keep their empty defaults.
	    """
	    question_label = "QUESTION:"
	    correct_label = "CORRECT:"
	    result: dict = {"question": "", "choices": {}, "correct": "", "explanations": {}}
	    for line in raw.splitlines():
	        line = line.strip()
	        if line.startswith(question_label):
	            result["question"] = line[len(question_label):].strip()
	        # Whitespace after "A)" is optional so "A)Paris" is not dropped, but
	        # the option text itself must be non-empty.
	        elif m := re.match(r'^([ABCD])\)\s*(.+)', line):
	            result["choices"][m.group(1)] = m.group(2)
	        elif line.startswith(correct_label):
	            # Take the first standalone A-D letter instead of the first
	            # character, so "(B)", "**B**" or "The answer is B" still give
	            # "B" and anything without a valid letter gives "".
	            answer = line[len(correct_label):].upper()
	            letter = re.search(r'\b([ABCD])\b', answer)
	            result["correct"] = letter.group(1) if letter else ""
	        elif m := re.match(r'^EXPLAIN_([ABCD]):\s*(.+)', line):
	            result["explanations"][m.group(1)] = m.group(2)
	    return result


	def generate_mcq(chunk: str, language: str = "English") -> dict:
	    """Return a multiple-choice question dict generated from chunk.

	    The model is queried up to three times. An attempt is accepted only if
	    the parsed output has a non-empty question, four options that differ
	    from each other (compared case-insensitively, ignoring surrounding
	    whitespace) and a correct letter that names one of those options.

	    Args:
	        chunk: Course excerpt to base the question on. It is stripped and
	            passed through ``ensure_english`` before being inserted into
	            the prompt.
	        language: Accepted for interface compatibility. ``_MCQ_TEMPLATE``
	            has no ``{language}`` placeholder, so the value does not
	            affect the prompt.

	    Returns:
	        The dict produced by ``parse_mcq`` for the first accepted attempt.
	        If no attempt is accepted, the last parsed result is returned as-is
	        and may be incomplete; callers should be prepared for that.
	    """
	    llm = get_llm()
	    # str.format() ignores keyword arguments the template does not reference,
	    # so passing language here is harmless.
	    prompt = _MCQ_TEMPLATE.format(chunk=ensure_english(chunk.strip()), language=language)
	    mcq: dict = {}
	    for _ in range(3):
	        raw = llm.generate(prompt, temperature=0.8).strip()
	        mcq = parse_mcq(raw)
	        choices = mcq.get("choices", {})
	        distinct = {text.lower().strip() for text in choices.values()}
	        if (
	            len(choices) == 4
	            and len(distinct) == 4
	            # Four distinct options are not enough: without a question and
	            # a correct letter pointing at one of them the MCQ is unusable.
	            and mcq.get("question")
	            and mcq.get("correct") in choices
	        ):
	            return mcq
	    return mcq


	def _clean_question(raw: str) -> str:
	"""Keep only the first question — the model can ramble past it."""
	text = raw.strip()
	if text.lower().startswith("question:"):
	text = text[9:].strip()
	qmark = text.find("?")
	if qmark != -1:
	return text[: qmark + 1].strip()
	for marker in ("\nAnswer:", "Answer:", "\nQuestion:"):
	idx = text.find(marker)
	if idx > 0:
	text = text[:idx]
	return text.strip()


	def generate_question(chunk: str, language: str = "English", difficulty: str = "Normal") -> str:
	    """Return a single study question generated from chunk.

	    Args:
	        chunk: Course excerpt to base the question on. It is stripped and
	            passed through ``ensure_english`` before being inserted into
	            the prompt.
	        language: Forwarded to ``str.format``, but ``_PROMPT_TEMPLATE`` has
	            no ``{language}`` placeholder, so it does not affect the prompt.
	        difficulty: Key into ``_DIFFICULTY_HINT``; an unknown value falls
	            back to the "Normal" hint.

	    Returns:
	        The model output after trimming by ``_clean_question``.
	    """
	    llm = get_llm()
	    excerpt = ensure_english(chunk.strip())
	    hint = _DIFFICULTY_HINT.get(difficulty, _DIFFICULTY_HINT["Normal"])
	    prompt = _PROMPT_TEMPLATE.format(chunk=excerpt, language=language, difficulty_hint=hint)
	    # Questions are capped at 25 words, so 80 tokens is plenty, and it is
	    # 6x faster than the default 512 when the model fails to stop.
	    raw = llm.generate(prompt, max_new_tokens=80)
	    return _clean_question(raw)