JudgeGPT / tests /test_engine.py
AliIqbal05's picture
Initialize Judge-GPT Space (#1)
335b794
Raw
History Blame Contribute Delete
5.52 kB
import json
import re
import pytest
from sovereign_bench.engine import JUDGE_NAME, JUROR_PERSONAS, RequiredModelError, run_trial
from sovereign_bench.llm import ModelCall, ModelResult
from sovereign_bench.models import TrialRequest
def _jury_json(evidence_summary: str, vote: str = "liable") -> str:
    """Build the JSON payload the fake model returns for the juror-vote call.

    Exhibit ids are collected from every line of ``evidence_summary`` that
    starts with a label shaped like ``ABC-E<digits>:``. When no such label is
    found, the single id ``"SOC-E1"`` is used instead. Jurors are visited in
    ``JUROR_PERSONAS`` iteration order and exhibits are assigned round-robin,
    so the helper does not depend on a particular number of jurors or
    exhibits.

    Args:
        evidence_summary: Text scanned line by line for exhibit labels.
        vote: Vote recorded for the first four jurors; every later juror
            votes ``"not_liable"``.

    Returns:
        A JSON string of the form ``{"votes": [...]}`` with one entry per
        juror, each carrying ``juror``, ``persona``, ``vote``, ``reason`` and
        ``evidence_ids``.
    """
    exhibit_ids = re.findall(r"^([A-Z]+-E\d+):", evidence_summary, flags=re.M)
    if not exhibit_ids:
        exhibit_ids = ["SOC-E1"]

    votes = []
    for idx, (name, persona) in enumerate(JUROR_PERSONAS.items()):
        # Wrap around the available exhibits instead of padding the list to a
        # hard-coded length, so a larger jury cannot index past the end.
        exhibit = exhibit_ids[idx % len(exhibit_ids)]
        votes.append(
            {
                "juror": name,
                "persona": persona,
                "vote": vote if idx < 4 else "not_liable",
                "reason": f"{name} applies a {persona} lens to exhibit {exhibit}.",
                "evidence_ids": [exhibit],
            }
        )
    return json.dumps({"votes": votes})
def fake_model_runner(**kwargs):
    """Deterministic stand-in for the model runner passed to ``run_trial``.

    For the ``"juror vote generator"`` role the reply is the JSON produced by
    ``_jury_json`` from ``kwargs["evidence_summary"]``; for every other role
    it is a one-line echo of the agent and task. The returned ``ModelResult``
    always reports a successful call with a fixed latency and prompt hash.

    Reads ``agent``, ``role``, ``task`` and ``model`` from ``kwargs`` (plus
    ``evidence_summary`` for the juror role); ``provider`` is optional and
    defaults to ``"test"``.
    """
    if kwargs["role"] == "juror vote generator":
        reply = _jury_json(kwargs["evidence_summary"])
    else:
        reply = f"{kwargs['agent']} responds to: {kwargs['task']}"

    rendered_prompt = (
        f"SYSTEM:\nFake live model for tests.\n\nUSER:\n"
        f"Agent: {kwargs['agent']}\nRole: {kwargs['role']}\nTask: {kwargs['task']}\n\nASSISTANT:\n"
    )

    call_record = ModelCall(
        model=kwargs["model"],
        provider=kwargs.get("provider", "test"),
        ok=True,
        latency_ms=1,
        prompt_hash="test-prompt",
    )
    return ModelResult(text=reply, input_text=rendered_prompt, call=call_record)
def test_cached_cases_emit_sequential_speaker_order():
    """Both cached cases produce events in the fixed courtroom speaking order.

    For each case the first turn of every event must match the expected
    speaker sequence, exactly seven events must be in the ``"deliberation"``
    phase, the first turn must carry the rendered prompt, and the last event
    must hold a verdict, six votes and an uncertainty statement.
    """
    opening = ["Clerk Meridian", JUDGE_NAME]
    lawyers = ["Advocate Auric", "Counsel Sable"]
    expected_speakers = (
        opening
        + lawyers
        + ["Auditor Prism", JUDGE_NAME]
        + lawyers
        + ["Nemotron Jury"]
        + list(JUROR_PERSONAS)
        + [JUDGE_NAME]
    )

    for case_id in ("socrates", "barnaby"):
        events = run_trial(TrialRequest(case_id=case_id), model_runner=fake_model_runner)

        speakers = [event.turns[0].agent for event in events]
        assert speakers == expected_speakers

        phases = [event.phase for event in events]
        assert phases.count("deliberation") == 7

        first_turn = events[0].turns[0]
        assert first_turn.input
        assert "SYSTEM:" in first_turn.input

        closing = events[-1]
        assert closing.verdict is not None
        assert closing.votes and len(closing.votes) == 6
        assert "uncertainty" in closing.verdict.uncertainty.lower()
def test_no_event_contains_both_lawyers_speaking_together():
    """No single event of the socrates trial has turns from both lawyers."""
    both_lawyers = {"Advocate Auric", "Counsel Sable"}
    events = run_trial(TrialRequest(case_id="socrates"), model_runner=fake_model_runner)
    for event in events:
        speakers = {turn.agent for turn in event.turns}
        # ``<=`` on sets is the subset test: fail if both lawyers are present.
        assert not both_lawyers <= speakers
def test_juror_vote_events_have_fixed_personas_and_evidence():
    """Each juror gets one event whose vote matches the speaker and persona.

    The final event must be the ``"verdict"`` phase and list the votes in
    ``JUROR_PERSONAS`` order.
    """
    events = run_trial(TrialRequest(case_id="socrates"), model_runner=fake_model_runner)

    juror_events = []
    for event in events:
        if event.turns[0].agent in JUROR_PERSONAS:
            juror_events.append(event)
    assert len(juror_events) == 6

    allowed_votes = {"liable", "not_liable", "uncertain"}
    for event in juror_events:
        speaker = event.turns[0].agent
        ballot = event.votes[0]
        assert ballot.juror == speaker
        assert ballot.persona == JUROR_PERSONAS[ballot.juror]
        assert ballot.vote in allowed_votes
        assert ballot.reason
        assert ballot.evidence_ids

    final = events[-1]
    assert final.phase == "verdict"
    assert [ballot.juror for ballot in final.votes] == list(JUROR_PERSONAS)
def test_jury_contract_uses_public_history_personas():
    """Pin the judge name and the exact juror-to-persona mapping."""
    expected_personas = {
        "Karl Marx": "class power, material conditions, exploitation, institutional incentives",
        "John Stuart Mill": "liberty, harm principle, utility, individual rights",
        "Confucius": "social harmony, role duty, ritual order, moral cultivation",
        "Cleopatra VII": "sovereign pragmatism, diplomacy, survival, legitimacy under pressure",
        "Niccolo Machiavelli": "political realism, stability, power, consequences over ideals",
        "Jensen Huang": "technological optimism, operator mindset, systems thinking, innovation tradeoffs",
    }

    assert JUDGE_NAME == "Marcus Aurelius"
    assert JUROR_PERSONAS == expected_personas
def test_required_model_failure_stops_trial_without_canned_dialogue():
    """A runner that reports a failed call makes ``run_trial`` raise.

    The expected exception is ``RequiredModelError`` with a message matching
    ``"unavailable"``.
    """

    def failing_runner(**kwargs):
        # Every call comes back empty and flagged as not ok.
        failed_call = ModelCall(
            model=kwargs["model"],
            provider=kwargs.get("provider", "test"),
            ok=False,
            latency_ms=1,
            prompt_hash="test-prompt",
            error="offline",
        )
        return ModelResult(text="", input_text="SYSTEM:\nfailed", call=failed_call)

    request = TrialRequest(case_id="socrates")
    with pytest.raises(RequiredModelError, match="unavailable"):
        run_trial(request, model_runner=failing_runner)
def test_invalid_jury_output_stops_trial_without_fallback_votes():
    """Non-JSON juror output makes ``run_trial`` raise ``RequiredModelError``.

    The raised message must match ``"invalid JSON"``.
    """

    def invalid_jury_runner(**kwargs):
        # Delegate to the normal fake, then corrupt only the juror-vote reply.
        result = fake_model_runner(**kwargs)
        if kwargs["role"] != "juror vote generator":
            return result
        result.text = "the jury refuses structured output"
        return result

    request = TrialRequest(case_id="socrates")
    with pytest.raises(RequiredModelError, match="invalid JSON"):
        run_trial(request, model_runner=invalid_jury_runner)
def test_live_search_stops_when_query_is_weak():
    """A live trial with the one-character query ``"x"`` raises ``RuntimeError``.

    The raised message must match ``"no fallback case"``.
    """
    weak_request = TrialRequest(case_id="live", search_query="x")
    with pytest.raises(RuntimeError, match="no fallback case"):
        run_trial(weak_request, model_runner=fake_model_runner)