Spaces:

build-small-hackathon
/

study-partner

Running on Zero

App Files Files Community

study-partner / test_grade_answer.py

nz-nz

Sync from GitHub via hub-sync

6505c2d verified 13 days ago

Raw

History Blame Contribute Delete

6.33 kB

	"""
	NAH-8 — grade_answer strict-JSON grading + parser/repair retry.

	Runs with the real (non-stub) code path but with llm.chat monkeypatched, so no
	model/GPU is needed. Verifies: clean JSON parses, a bad first reply triggers one
	repair retry, and a never-valid model gives a safe default instead of crashing.

	RECALL_STUB=0 python3 test_grade_answer.py
	"""
	import os

	# Set this before `import llm` below, since llm reads its stub flag at import.
	os.environ["RECALL_STUB"] = "0" # exercise the real model path, not the heuristic

	import llm
	import learning_engine as le
	from schema import new_card

	# `llm.STUB` is read once, when `llm` is first imported. Under the full pytest
	# run another test file imports `llm` first (with the stub on), so the env flip
	# above would not take effect — and grade_answer would silently use the
	# heuristic instead of the patched chat. Pin it off here so these tests are
	# robust to import order. (Downstream test files either reload llm or don't
	# depend on STUB.)
	llm.STUB = False


	def _card():
	    """Build the fixture card: a Cell Biology question about mitochondria."""
	    fields = {
	        "question": "What does mitochondria do?",
	        "answer": "It produces ATP, the cell's energy.",
	        "topic": "Cell Biology",
	    }
	    return new_card(**fields)


	def _fake_chat(replies):
	"""Return a chat() that yields the given replies in order."""
	calls = {"n": 0}

	def chat(messages, max_tokens=512):
	i = min(calls["n"], len(replies) - 1)
	calls["n"] += 1
	return replies[i]

	return chat, calls


	def test_clean_json_first_try():
	    """A valid JSON reply is used directly, without a repair call."""
	    reply = '{"score": 5, "explanation": "Spot on.", "missed_concept": ""}'
	    llm.chat, counter = _fake_chat([reply])

	    grade = le.grade_answer(_card(), "It makes ATP energy")

	    assert grade["score"] == 5
	    assert grade["correct"] is True
	    assert grade["explanation"] == "Spot on."
	    assert counter["n"] == 1, "should not retry when first reply is valid"
	    print("ok clean JSON on first try")


	def test_repair_retry_recovers():
	    """A junk first reply is followed by one repair call that yields valid JSON."""
	    junk_reply = "Sure! The student did okay I think, maybe a 2 or 3."
	    # The repair reply is wrapped in a markdown code fence.
	    repaired_reply = (
	        '```json\n{"score": 2, "explanation": "Missed the ATP detail.", '
	        '"missed_concept": "ATP production"}\n```'
	    )
	    llm.chat, counter = _fake_chat([junk_reply, repaired_reply])

	    grade = le.grade_answer(_card(), "it is in the cell")

	    assert grade["score"] == 2
	    assert grade["correct"] is False
	    assert grade["missed_concept"] == "ATP production"
	    assert counter["n"] == 2, "should retry exactly once to repair bad JSON"
	    print("ok repair retry recovers bad first reply")


	def test_safe_default_when_never_valid():
	    """With no JSON in either reply, grading falls back to a neutral default."""
	    unusable_replies = ["no json here", "still no json at all"]
	    llm.chat, counter = _fake_chat(unusable_replies)

	    # The student answer here is a real attempt, not a blank or an "idk".
	    grade = le.grade_answer(_card(), "it makes energy for the cell")

	    assert grade["score"] == 2  # neutral safe default
	    explanation = grade["explanation"].lower()
	    assert "reference" in explanation
	    assert counter["n"] == 2, "tries once + one repair, then gives up"
	    print("ok safe default when model never returns JSON")


	def test_out_of_range_score_rejected():
	    """A score outside 0-5 is treated as unusable, not silently clamped."""
	    out_of_range_reply = '{"score": 99, "explanation": "x"}'
	    unparseable_reply = 'also not valid json'
	    # The call counter is not inspected in this test.
	    llm.chat, _ = _fake_chat([out_of_range_reply, unparseable_reply])

	    grade = le.grade_answer(_card(), "whatever")

	    assert grade["score"] == 2, "out-of-range score should fall through to default"
	    print("ok out-of-range score rejected -> safe default")


	def test_third_person_possessive_rewritten_to_second():
	    """Possessive "the student's ..." comes back as "your ..." in the grade.

	    The model slips into "The student's answer/response" about half the time;
	    the safe possessive swaps are applied to the returned explanation.
	    """
	    reply = (
	        '{"score": 1, "explanation": "The student\'s answer, \'magic\', is wrong.", '
	        '"missed_concept": "the student\'s grasp of the mechanism"}'
	    )
	    llm.chat, _ = _fake_chat([reply])

	    grade = le.grade_answer(_card(), "magic")

	    explanation = grade["explanation"]
	    assert explanation == "Your answer, 'magic', is wrong.", explanation
	    missed = grade["missed_concept"]
	    assert missed == "your grasp of the mechanism", missed
	    print("ok third-person possessive rewritten to second person")


	def test_second_person_leaves_safe_subject_form_alone():
	    """Only possessives are swapped; other phrasing passes through unchanged.

	    A subject form such as "The student identifies..." is left untouched
	    rather than mangled into "You identifies...".
	    """
	    unchanged_phrases = (
	        "The student identifies it.",
	        "Your answer is close.",
	    )
	    for phrase in unchanged_phrases:
	        assert le._to_second_person(phrase) == phrase
	    print("ok subject-form third person left alone (no grammar mangling)")


	def test_empty_answer_short_circuits_to_zero():
	    """Blank answers score 0 without the model ever being called.

	    An empty answer is a miss; the model otherwise ignores the blank input
	    and hallucinates a 4/5 "correct".
	    """
	    # This reply would score 5 if the model were (wrongly) consulted.
	    llm.chat, counter = _fake_chat(['{"score": 5, "explanation": "x"}'])
	    blank_answers = ("", " ", "\n\t")

	    for blank in blank_answers:
	        grade = le.grade_answer(_card(), blank)
	        assert grade["score"] == 0, (blank, grade)
	        assert grade["correct"] is False, (blank, grade)
	        assert "reference answer" in grade["explanation"].lower()

	    assert counter["n"] == 0, "empty answer must not call the model"
	    print("ok empty answer short-circuits to score 0 (no model call)")


	def test_non_answer_short_circuits_to_zero():
	    """Non-answers score 0 with no model call; real attempts still reach it.

	    "idk" / "don't know" / "?" are misses too; the model otherwise ignores
	    them and grades the reference answer, hallucinating a 4/5 "correct".
	    """
	    llm.chat, counter = _fake_chat(['{"score": 5, "explanation": "x"}'])
	    non_answers = ("idk", "I don't know", "don know", "no idea", "?", "...")

	    for non in non_answers:
	        grade = le.grade_answer(_card(), non)
	        assert grade["score"] == 0, (non, grade)
	        assert grade["correct"] is False, (non, grade)

	    assert counter["n"] == 0, "a non-answer must not call the model"

	    # A real attempt that merely contains "no"/"don't know" still reaches the model.
	    llm.chat, counter = _fake_chat(['{"score": 4, "explanation": "Close."}'])
	    le.grade_answer(_card(), "no, it is the stroma")
	    assert counter["n"] == 1, "a real attempt must still be graded by the model"
	    print("ok non-answers ('idk', \"don't know\", '?') short-circuit to score 0")


	if __name__ == "__main__":
	    # Script mode: run each test in definition order. The first failing
	    # assertion aborts the run before the final success line is printed.
	    for run_test in (
	        test_clean_json_first_try,
	        test_repair_retry_recovers,
	        test_safe_default_when_never_valid,
	        test_out_of_range_score_rejected,
	        test_third_person_possessive_rewritten_to_second,
	        test_second_person_leaves_safe_subject_form_alone,
	        test_empty_answer_short_circuits_to_zero,
	        test_non_answer_short_circuits_to_zero,
	    ):
	        run_test()
	    print("\nAll NAH-8 grade_answer tests passed.")