Spaces:

build-small-hackathon
/

JudgeGPT

Sleeping

App Files Files Community

JudgeGPT / tests / test_engine.py

AliIqbal05

Update Judge-GPT code and README

f1c85c1 verified 20 days ago

Raw

History Blame

13.1 kB

	import json
	import re

	import pytest

	from sovereign_bench.engine import JUDGE_NAME, JUROR_PERSONAS, RequiredModelError, run_trial, stream_trial
	from sovereign_bench.llm import ModelCall, ModelResult, build_role_messages, clean_model_text
	from sovereign_bench.models import CasePacket, EvidenceItem, TrialRequest


	def _juror_json(kwargs, vote: str = "liable") -> str:
	    """Serialize the ballot a fake juror returns for one model-runner call.

	    ``kwargs`` is the keyword dict handed to the runner; its ``agent``,
	    ``persona`` and ``evidence_summary`` entries are read.  The ballot cites
	    the first exhibit id that opens a line of the evidence summary, or
	    ``"SOC-E1"`` when no line starts with an id of that shape.
	    """
	    # Exhibit ids: uppercase prefix, dash, one uppercase letter, then digits.
	    first_exhibit = re.search(r"^([A-Z]+-[A-Z]\d+):", kwargs["evidence_summary"], flags=re.M)
	    cited_id = first_exhibit.group(1) if first_exhibit else "SOC-E1"
	    juror_name = kwargs["agent"]
	    worldview = kwargs["persona"]
	    ballot = {
	        "juror": juror_name,
	        "persona": worldview,
	        "vote": vote,
	        "reason": f"{juror_name} applies {worldview} to exhibit {cited_id}.",
	        "evidence_ids": [cited_id],
	    }
	    return json.dumps(ballot)


	def fake_model_runner(**kwargs):
	    """Deterministic stand-in for the model runner used by these tests.

	    Juror calls receive a JSON ballot: jurors at positions 0-3 of
	    ``JUROR_PERSONAS`` vote ``liable`` and the rest vote ``not_liable``.
	    Every other role receives a one-line echo of the agent and task.  The
	    returned ``ModelResult`` also carries a rendered prompt and a
	    ``ModelCall`` marked ``ok`` with a 1 ms latency.
	    """
	    if kwargs["role"] == "juror":
	        seat = list(JUROR_PERSONAS).index(kwargs["agent"])
	        reply = _juror_json(kwargs, vote="liable" if seat < 4 else "not_liable")
	    else:
	        reply = f"{kwargs['agent']} responds to: {kwargs['task']}"

	    # Persona, objective and history are optional for non-juror calls.
	    rendered_prompt = (
	        "SYSTEM:\nFake live model for tests.\n\nUSER:\n"
	        f"Agent: {kwargs['agent']}\nRole: {kwargs['role']}\n"
	        f"Persona: {kwargs.get('persona', '')}\nObjective: {kwargs.get('objective', '')}\n"
	        f"History: {kwargs.get('trial_history', '')}\nTask: {kwargs['task']}\n\nASSISTANT:\n"
	    )
	    call_record = ModelCall(
	        model=kwargs["model"],
	        provider=kwargs.get("provider", "test"),
	        ok=True,
	        latency_ms=1,
	        prompt_hash="test-prompt",
	    )
	    return ModelResult(text=reply, input_text=rendered_prompt, call=call_record)


	def test_cached_cases_emit_sequential_speaker_order():
	    """Both cached cases follow the same speaker script and end with a verdict."""
	    speaker_script = [
	        "Clerk Meridian",
	        JUDGE_NAME,
	        "Mike OSS",
	        "Harvey Vector",
	        JUDGE_NAME,
	        "Mike OSS",
	        "Harvey Vector",
	        "Nemotron Jury",
	    ]
	    speaker_script += list(JUROR_PERSONAS)
	    speaker_script.append(JUDGE_NAME)

	    for case_id in ("socrates", "barnaby"):
	        events = run_trial(TrialRequest(case_id=case_id), model_runner=fake_model_runner)

	        # Only events that carry turns have a speaker; the evidence event has none.
	        first_speakers = [event.turns[0].agent for event in events if event.turns]
	        assert first_speakers == speaker_script

	        evidence_event = next(event for event in events if event.phase == "evidence")
	        assert evidence_event.title == "The Evidence Record"
	        assert evidence_event.turns == []

	        phases = [event.phase for event in events]
	        assert phases.count("deliberation") == 7

	        opening_turn = events[0].turns[0]
	        assert opening_turn.input
	        assert "SYSTEM:" in opening_turn.input

	        closing = events[-1]
	        assert closing.verdict is not None
	        assert closing.votes and len(closing.votes) == 6
	        assert "uncertainty" in closing.verdict.uncertainty.lower()


	def test_no_event_contains_both_lawyers_speaking_together():
	    """No single event may hold turns from both advocates."""
	    advocates = {"Mike OSS", "Harvey Vector"}
	    events = run_trial(TrialRequest(case_id="socrates"), model_runner=fake_model_runner)

	    for event in events:
	        speakers = {turn.agent for turn in event.turns}
	        assert not advocates <= speakers


	def test_juror_vote_events_have_fixed_personas_and_evidence():
	    """Each juror event carries one well-formed vote, and the verdict lists all jurors in order."""
	    events = run_trial(TrialRequest(case_id="socrates"), model_runner=fake_model_runner)
	    juror_events = [
	        event
	        for event in events
	        if event.turns and event.turns[0].agent in JUROR_PERSONAS
	    ]
	    allowed_votes = {"liable", "not_liable", "uncertain"}

	    assert len(juror_events) == 6
	    for event in juror_events:
	        ballot = event.votes[0]
	        spoken = event.turns[0]
	        assert ballot.juror == spoken.agent
	        assert ballot.persona == JUROR_PERSONAS[ballot.juror]
	        assert ballot.vote in allowed_votes
	        assert spoken.content.startswith("I vote ")
	        assert ballot.reason
	        assert ballot.evidence_ids

	    final = events[-1]
	    assert final.phase == "verdict"
	    assert [ballot.juror for ballot in final.votes] == list(JUROR_PERSONAS)


	def test_jurors_are_called_independently_with_personas_and_trial_history():
	    """Every juror gets its own model call with its persona and the trial history."""
	    recorded = []

	    def recording_runner(**kwargs):
	        # Snapshot the kwargs so later assertions see what was passed at call time.
	        recorded.append(dict(kwargs))
	        return fake_model_runner(**kwargs)

	    run_trial(TrialRequest(case_id="socrates"), model_runner=recording_runner)

	    juror_calls = [entry for entry in recorded if entry["role"] == "juror"]
	    assert [entry["agent"] for entry in juror_calls] == list(JUROR_PERSONAS)
	    assert len(juror_calls) == 6
	    for entry in juror_calls:
	        assert entry["persona"] == JUROR_PERSONAS[entry["agent"]]
	        for heading in ("Claimant Opening", "Respondent Opening", "The Evidence Record"):
	            assert heading in entry["trial_history"]
	        assert "historical worldview" in entry["objective"]


	def test_lawyers_and_judge_receive_trial_history_and_objectives():
	    """Advocate hinge answers and the verdict call receive history and side-specific objectives."""
	    recorded = []

	    def recording_runner(**kwargs):
	        recorded.append(dict(kwargs))
	        return fake_model_runner(**kwargs)

	    def hinge_answer_by(agent_name):
	        # First recorded call where this agent is tasked with the hinge question.
	        return next(
	            entry
	            for entry in recorded
	            if entry["agent"] == agent_name and "hinge question" in entry["task"]
	        )

	    run_trial(TrialRequest(case_id="socrates"), model_runner=recording_runner)

	    claimant_answer = hinge_answer_by("Mike OSS")
	    respondent_answer = hinge_answer_by("Harvey Vector")
	    verdict_call = next(entry for entry in recorded if entry["role"] == "verdict writer")

	    assert "The Hinge Question" in claimant_answer["trial_history"]
	    assert "The Hinge Question" in respondent_answer["trial_history"]
	    assert "most favorable to the claimant" in claimant_answer["objective"]
	    assert "most favorable to the respondent" in respondent_answer["objective"]
	    assert all(juror in verdict_call["trial_history"] for juror in JUROR_PERSONAS)
	    assert "do not override the jury" in verdict_call["objective"]


	def test_custom_case_context_and_evidence_reach_lawyer_prompts():
	    """A user-supplied case packet's context and exhibit ids reach the claimant advocate's call."""
	    garage_text = EvidenceItem(
	        id="CUS-F1",
	        title="Garage Text",
	        source="Custom",
	        excerpt="The respondent asked to enter the garage.",
	        supports="claimant",
	        reliability=0.65,
	        note="Supports access.",
	    )
	    neighbor_sighting = EvidenceItem(
	        id="CUS-A1",
	        title="Neighbor Sighting",
	        source="Custom",
	        excerpt="A neighbor saw the bicycle later that day.",
	        supports="respondent",
	        reliability=0.65,
	        note="Supports alternative timing.",
	    )
	    custom = CasePacket(
	        id="custom",
	        title="Custom Trial",
	        subtitle="Entered by user.",
	        claimant="Claimant",
	        respondent="Respondent",
	        charge="Whether the custom record favors the claimant.",
	        setting="A custom courtroom.",
	        context="A bicycle disappeared after a disputed garage visit.",
	        claimant_claim="The claimant says the visit explains the missing bicycle.",
	        respondent_claim="The respondent says the timing and evidence are ambiguous.",
	        source_note="Custom test packet.",
	        evidence=[garage_text, neighbor_sighting],
	    )
	    recorded = []

	    def recording_runner(**kwargs):
	        recorded.append(dict(kwargs))
	        return fake_model_runner(**kwargs)

	    run_trial(TrialRequest(case_id="custom", custom_case=custom), model_runner=recording_runner)

	    claimant_opening = next(
	        entry
	        for entry in recorded
	        if entry["agent"] == "Mike OSS" and entry["role"] == "claimant advocate"
	    )
	    assert "A bicycle disappeared" in claimant_opening["case_summary"]
	    for exhibit_id in ("CUS-F1", "CUS-A1"):
	        assert exhibit_id in claimant_opening["evidence_summary"]


	def test_jury_contract_uses_public_history_personas():
	    """Pin the judge's name and the exact juror-to-persona mapping."""
	    expected_personas = {
	        "Karl Marx": "class power, material conditions, exploitation, institutional incentives",
	        "John Stuart Mill": "liberty, harm principle, utility, individual rights",
	        "Confucius": "social harmony, role duty, ritual order, moral cultivation",
	        "Cleopatra VII": "sovereign pragmatism, diplomacy, survival, legitimacy under pressure",
	        "Niccolo Machiavelli": "political realism, stability, power, consequences over ideals",
	        "Jensen Huang": "technological optimism, operator mindset, systems thinking, innovation tradeoffs",
	    }

	    assert JUDGE_NAME == "Marcus Aurelius"
	    assert JUROR_PERSONAS == expected_personas


	def test_role_prompt_requires_first_person_in_character_speech():
	    """An advocate prompt carries the in-character instructions and the evidence."""
	    messages = build_role_messages(
	        agent="Harvey Vector",
	        role="respondent advocate",
	        case_summary="A short case summary.",
	        evidence_summary="SOC-E1: A record excerpt.",
	        task="Answer the bench for the respondent.",
	    )
	    system_text = messages[0]["content"]
	    user_text = messages[1]["content"]

	    required_in_system = (
	        "Stay fully in character as the assigned Agent and Role.",
	        "Output only the words this character says aloud in court.",
	        "Do not narrate about yourself in the third person.",
	        "Use the case facts and evidence provided below",
	    )
	    required_in_user = (
	        "Speak as Harvey Vector.",
	        "Give only the in-scene court line",
	        "SOC-E1",
	    )

	    for phrase in required_in_system:
	        assert phrase in system_text
	    for phrase in required_in_user:
	        assert phrase in user_text


	def test_juror_vote_prompt_uses_persona_history_and_json_contract():
	    """A juror prompt drops the spoken-line rule and carries persona, history and the JSON instruction."""
	    marx_persona = JUROR_PERSONAS["Karl Marx"]
	    messages = build_role_messages(
	        agent="Karl Marx",
	        role="juror",
	        case_summary="A short case summary.",
	        evidence_summary="SOC-E1: A record excerpt.",
	        trial_history="Mike OSS argued from SOC-E1.",
	        persona=marx_persona,
	        objective="Vote as Karl Marx would after watching the trial.",
	        task="Return one juror vote as JSON.",
	    )
	    system_text = messages[0]["content"]
	    user_text = messages[1]["content"]

	    # The spoken-line instruction given to advocates must be absent for jurors.
	    assert "Output only the words this character says aloud in court." not in system_text
	    assert "You are an individual juror." in system_text
	    for fragment in (
	        marx_persona,
	        "Mike OSS argued from SOC-E1.",
	        "Return only the requested JSON object.",
	    ):
	        assert fragment in user_text


	def test_model_cleaner_extracts_final_speech_after_analysis_channel():
	    """Only the text after the ``final`` marker is kept when an analysis section precedes it."""
	    raw_output = (
	        "analysis\nI should reason about the case first.\n\n"
	        "final\nI stand for the respondent, and SOC-E1 leaves doubt."
	    )

	    cleaned = clean_model_text(raw_output)

	    assert cleaned == "I stand for the respondent, and SOC-E1 leaves doubt."
	    assert "analysis" not in cleaned.lower()


	def test_model_cleaner_rejects_visible_analysis_without_final_speech():
	    """Pulling the first streamed event fails when the model returns only analysis text."""

	    def analysis_runner(**kwargs):
	        leaked_call = ModelCall(
	            model=kwargs["model"],
	            provider=kwargs.get("provider", "test"),
	            ok=True,
	            latency_ms=1,
	            prompt_hash="test-prompt",
	        )
	        return ModelResult(
	            text="analysis: I should think through the case before answering.",
	            input_text="SYSTEM:\nanalysis leak",
	            call=leaked_call,
	        )

	    # The stream is created inside the context so any eager failure is also captured.
	    with pytest.raises(RequiredModelError):
	        next(stream_trial(TrialRequest(case_id="socrates"), model_runner=analysis_runner))


	def test_model_cleaner_removes_instruction_echo_when_dialogue_remains():
	    """An echoed-instruction paragraph is dropped and the spoken line after it is kept."""
	    echoed_instructions = (
	        "I will now announce the case as requested, while maintaining the theatrical but clear tone required. "
	        "I will speak as Clerk Meridian in first person, starting with a pronoun."
	    )
	    spoken_line = "I call The Polis v. Socrates before this court."

	    cleaned = clean_model_text(echoed_instructions + "\n\n" + spoken_line)

	    assert cleaned == "I call The Polis v. Socrates before this court."


	def test_model_cleaner_rejects_instruction_echo_without_dialogue():
	    """Cleaning raises with an ``echoed instructions`` message when nothing but the echo is present."""
	    echo_only = (
	        "I will now announce the case as requested, while maintaining the theatrical but clear tone required. "
	        "I will speak as Clerk Meridian in first person, starting with a pronoun."
	    )

	    with pytest.raises(Exception, match="echoed instructions"):
	        clean_model_text(echo_only)


	def test_required_model_failure_stops_trial_without_canned_dialogue():
	    """A runner that reports a failed call makes the trial raise ``RequiredModelError``."""

	    def failing_runner(**kwargs):
	        # Empty text plus ok=False and an error string models an unreachable backend.
	        offline_call = ModelCall(
	            model=kwargs["model"],
	            provider=kwargs.get("provider", "test"),
	            ok=False,
	            latency_ms=1,
	            prompt_hash="test-prompt",
	            error="offline",
	        )
	        return ModelResult(text="", input_text="SYSTEM:\nfailed", call=offline_call)

	    with pytest.raises(RequiredModelError, match="unavailable"):
	        run_trial(TrialRequest(case_id="socrates"), model_runner=failing_runner)


	def test_invalid_jury_output_stops_trial_without_fallback_votes():
	    """Non-JSON juror output makes the trial raise with an ``invalid JSON`` message."""

	    def invalid_jury_runner(**kwargs):
	        result = fake_model_runner(**kwargs)
	        if kwargs["role"] != "juror":
	            return result
	        # Replace only juror replies; every other role keeps the fake's normal text.
	        result.text = "the jury refuses structured output"
	        return result

	    with pytest.raises(RequiredModelError, match="invalid JSON"):
	        run_trial(TrialRequest(case_id="socrates"), model_runner=invalid_jury_runner)


	def test_live_search_stops_when_query_is_weak():
	    """A live trial with the one-character query ``x`` raises with a ``no fallback case`` message."""
	    weak_live_request = {"case_id": "live", "search_query": "x"}

	    # The request is built inside the context, as in the original call shape.
	    with pytest.raises(RuntimeError, match="no fallback case"):
	        run_trial(TrialRequest(**weak_live_request), model_runner=fake_model_runner)