import json
import re

import pytest

from sovereign_bench.engine import (
    JUDGE_NAME,
    JUROR_PERSONAS,
    RequiredModelError,
    run_trial,
)
from sovereign_bench.llm import ModelCall, ModelResult
from sovereign_bench.models import TrialRequest


def _jury_json(evidence_summary: str, vote: str = "liable") -> str:
    """Return a JSON string holding one vote per entry in ``JUROR_PERSONAS``.

    Exhibit ids are taken from every line of *evidence_summary* that starts
    with ``<LETTERS>-E<digits>:``. When no line matches, the single id
    ``"SOC-E1"`` is used instead.

    Args:
        evidence_summary: Multi-line text scanned for exhibit ids.
        vote: Vote cast by the first four jurors; every later juror votes
            ``"not_liable"``.

    Returns:
        A JSON document of the form ``{"votes": [...]}``.
    """
    exhibit_pool = re.findall(r"^([A-Z]+-E\d+):", evidence_summary, flags=re.M) or [
        "SOC-E1"
    ]
    pool_size = len(exhibit_pool)
    # Cycle through the exhibits with a modulo instead of padding the list to
    # a fixed multiple: the juror -> exhibit mapping is unchanged for the
    # current jury, and the helper no longer breaks if the jury size changes.
    return json.dumps(
        {
            "votes": [
                {
                    "juror": name,
                    "persona": persona,
                    "vote": vote if idx < 4 else "not_liable",
                    "reason": (
                        f"{name} applies a {persona} lens to exhibit "
                        f"{exhibit_pool[idx % pool_size]}."
                    ),
                    "evidence_ids": [exhibit_pool[idx % pool_size]],
                }
                for idx, (name, persona) in enumerate(JUROR_PERSONAS.items())
            ]
        }
    )


def fake_model_runner(**kwargs):
    """Stand-in model runner that always returns a successful ``ModelResult``.

    Reads ``agent``, ``role``, ``task`` and ``model`` from *kwargs*, plus
    ``evidence_summary`` when the role is ``"juror vote generator"`` and an
    optional ``provider`` (defaults to ``"test"``).

    For the juror-vote role the reply text is the JSON built by
    ``_jury_json``; for every other role it is a short echo of the agent and
    task. ``input_text`` carries a SYSTEM/USER/ASSISTANT style prompt.
    """
    text = (
        _jury_json(kwargs["evidence_summary"])
        if kwargs["role"] == "juror vote generator"
        else f"{kwargs['agent']} responds to: {kwargs['task']}"
    )
    prompt = (
        f"SYSTEM:\nFake live model for tests.\n\nUSER:\n"
        f"Agent: {kwargs['agent']}\nRole: {kwargs['role']}\nTask: {kwargs['task']}\n\nASSISTANT:\n"
    )
    return ModelResult(
        text=text,
        input_text=prompt,
        call=ModelCall(
            model=kwargs["model"],
            provider=kwargs.get("provider", "test"),
            ok=True,
            latency_ms=1,
            prompt_hash="test-prompt",
        ),
    )


def test_cached_cases_emit_sequential_speaker_order():
    """Both cached cases emit one speaker per event, in the expected order."""
    expected_speakers = [
        "Clerk Meridian",
        JUDGE_NAME,
        "Advocate Auric",
        "Counsel Sable",
        "Auditor Prism",
        JUDGE_NAME,
        "Advocate Auric",
        "Counsel Sable",
        "Nemotron Jury",
        # Iterating the mapping yields the juror names in declaration order.
        *JUROR_PERSONAS,
        JUDGE_NAME,
    ]

    for case_id in ["socrates", "barnaby"]:
        events = run_trial(TrialRequest(case_id=case_id), model_runner=fake_model_runner)

        assert [event.turns[0].agent for event in events] == expected_speakers
        assert [event.phase for event in events].count("deliberation") == 7
        # The first turn must expose the prompt that was sent to the model.
        assert events[0].turns[0].input
        assert "SYSTEM:" in events[0].turns[0].input
        # The final event carries the verdict and all six votes.
        assert events[-1].verdict is not None
        assert events[-1].votes and len(events[-1].votes) == 6
        assert "uncertainty" in events[-1].verdict.uncertainty.lower()
def test_no_event_contains_both_lawyers_speaking_together():
    """No single event may contain turns from both lawyers."""
    both_lawyers = {"Advocate Auric", "Counsel Sable"}
    trial_events = run_trial(
        TrialRequest(case_id="socrates"), model_runner=fake_model_runner
    )
    for trial_event in trial_events:
        speakers = {turn.agent for turn in trial_event.turns}
        assert not both_lawyers <= speakers


def test_juror_vote_events_have_fixed_personas_and_evidence():
    """Each juror event carries that juror's own, fully populated vote."""
    allowed_votes = {"liable", "not_liable", "uncertain"}
    trial_events = run_trial(
        TrialRequest(case_id="socrates"), model_runner=fake_model_runner
    )

    # Juror events are those whose first turn is spoken by a known juror.
    by_jurors = [ev for ev in trial_events if ev.turns[0].agent in JUROR_PERSONAS]
    assert len(by_jurors) == 6

    for ev in by_jurors:
        speaker = ev.turns[0].agent
        ballot = ev.votes[0]
        assert ballot.juror == speaker
        assert ballot.persona == JUROR_PERSONAS[ballot.juror]
        assert ballot.vote in allowed_votes
        assert ballot.reason
        assert ballot.evidence_ids

    # The closing event is the verdict and lists votes in persona order.
    closing = trial_events[-1]
    assert closing.phase == "verdict"
    assert [ballot.juror for ballot in closing.votes] == list(JUROR_PERSONAS)


def test_jury_contract_uses_public_history_personas():
    """Pin the judge name and the exact juror-to-persona mapping."""
    expected_personas = {
        "Karl Marx": "class power, material conditions, exploitation, institutional incentives",
        "John Stuart Mill": "liberty, harm principle, utility, individual rights",
        "Confucius": "social harmony, role duty, ritual order, moral cultivation",
        "Cleopatra VII": "sovereign pragmatism, diplomacy, survival, legitimacy under pressure",
        "Niccolo Machiavelli": "political realism, stability, power, consequences over ideals",
        "Jensen Huang": "technological optimism, operator mindset, systems thinking, innovation tradeoffs",
    }
    assert JUDGE_NAME == "Marcus Aurelius"
    assert JUROR_PERSONAS == expected_personas


def test_required_model_failure_stops_trial_without_canned_dialogue():
    """A model call reporting ``ok=False`` makes the trial raise."""

    def failing_runner(**kwargs):
        # Every call reports a failure with empty text.
        failed_call = ModelCall(
            model=kwargs["model"],
            provider=kwargs.get("provider", "test"),
            ok=False,
            latency_ms=1,
            prompt_hash="test-prompt",
            error="offline",
        )
        return ModelResult(text="", input_text="SYSTEM:\nfailed", call=failed_call)

    request = TrialRequest(case_id="socrates")
    with pytest.raises(RequiredModelError, match="unavailable"):
        run_trial(request, model_runner=failing_runner)


def test_invalid_jury_output_stops_trial_without_fallback_votes():
    """Non-JSON text from the juror-vote call makes the trial raise."""

    def invalid_jury_runner(**kwargs):
        outcome = fake_model_runner(**kwargs)
        if kwargs["role"] != "juror vote generator":
            return outcome
        # Only the juror-vote reply is replaced with unparseable text.
        outcome.text = "the jury refuses structured output"
        return outcome

    request = TrialRequest(case_id="socrates")
    with pytest.raises(RequiredModelError, match="invalid JSON"):
        run_trial(request, model_runner=invalid_jury_runner)


def test_live_search_stops_when_query_is_weak():
    """A live case with the one-character query ``"x"`` raises ``RuntimeError``."""
    weak_request = TrialRequest(case_id="live", search_query="x")
    with pytest.raises(RuntimeError, match="no fallback case"):
        run_trial(weak_request, model_runner=fake_model_runner)