"""Tests for the trial engine, role prompts, and model-text cleaning.

All trials run against ``fake_model_runner`` (or small variants of it), so no
live model is contacted.
"""

import json
import re

import pytest

from sovereign_bench.engine import JUDGE_NAME, JUROR_PERSONAS, RequiredModelError, run_trial, stream_trial
from sovereign_bench.llm import ModelCall, ModelResult, build_role_messages, clean_model_text
from sovereign_bench.models import CasePacket, EvidenceItem, TrialRequest


def _juror_json(kwargs, vote: str = "liable") -> str:
    """Return a JSON-encoded juror vote built from the runner keyword arguments.

    The cited exhibit is the first evidence id found at the start of a line in
    ``kwargs["evidence_summary"]`` (uppercase prefix, dash, uppercase letter,
    digits, then a colon); ``"SOC-E1"`` is used when no id matches.
    """
    evidence_ids = re.findall(r"^([A-Z]+-[A-Z]\d+):", kwargs["evidence_summary"], flags=re.M)
    evidence_id = (evidence_ids or ["SOC-E1"])[0]
    return json.dumps(
        {
            "juror": kwargs["agent"],
            "persona": kwargs["persona"],
            "vote": vote,
            "reason": f"{kwargs['agent']} applies {kwargs['persona']} to exhibit {evidence_id}.",
            "evidence_ids": [evidence_id],
        }
    )


def fake_model_runner(**kwargs):
    """Deterministic stand-in for the model runner used by the trial engine.

    Juror calls return a JSON vote: the first four personas (in
    ``JUROR_PERSONAS`` order) vote ``"liable"``, the remaining ones
    ``"not_liable"``. Every other role returns a line echoing the agent and
    task. The returned ``ModelResult`` also carries a synthetic prompt
    transcript as ``input_text``.
    """
    if kwargs["role"] == "juror":
        juror_index = list(JUROR_PERSONAS).index(kwargs["agent"])
        text = _juror_json(kwargs, vote="liable" if juror_index < 4 else "not_liable")
    else:
        text = f"{kwargs['agent']} responds to: {kwargs['task']}"
    prompt = (
        "SYSTEM:\nFake live model for tests.\n\nUSER:\n"
        f"Agent: {kwargs['agent']}\nRole: {kwargs['role']}\n"
        f"Persona: {kwargs.get('persona', '')}\nObjective: {kwargs.get('objective', '')}\n"
        f"History: {kwargs.get('trial_history', '')}\nTask: {kwargs['task']}\n\nASSISTANT:\n"
    )
    return ModelResult(
        text=text,
        input_text=prompt,
        call=ModelCall(
            model=kwargs["model"],
            provider=kwargs.get("provider", "test"),
            ok=True,
            latency_ms=1,
            prompt_hash="test-prompt",
        ),
    )


def _make_recording_runner(calls: list):
    """Return a runner that appends a copy of each call's kwargs to *calls*.

    The actual response is delegated to ``fake_model_runner``.
    """

    def recording_runner(**kwargs):
        calls.append(kwargs.copy())
        return fake_model_runner(**kwargs)

    return recording_runner


def test_cached_cases_emit_sequential_speaker_order():
    """Both cached cases produce the same fixed speaker sequence and a six-vote verdict."""
    expected_speakers = [
        "Clerk Meridian",
        JUDGE_NAME,
        "Mike OSS",
        "Harvey Vector",
        JUDGE_NAME,
        "Mike OSS",
        "Harvey Vector",
        "Nemotron Jury",
        *JUROR_PERSONAS,
        JUDGE_NAME,
    ]

    for case_id in ["socrates", "barnaby"]:
        events = run_trial(TrialRequest(case_id=case_id), model_runner=fake_model_runner)

        assert [event.turns[0].agent for event in events if event.turns] == expected_speakers
        evidence_event = next(event for event in events if event.phase == "evidence")
        assert evidence_event.title == "The Evidence Record"
        assert evidence_event.turns == []
        assert [event.phase for event in events].count("deliberation") == 7
        assert events[0].turns[0].input
        assert "SYSTEM:" in events[0].turns[0].input
        assert events[-1].verdict is not None
        assert events[-1].votes and len(events[-1].votes) == 6
        assert "uncertainty" in events[-1].verdict.uncertainty.lower()


def test_no_event_contains_both_lawyers_speaking_together():
    """No single event has turns from both advocates."""
    events = run_trial(TrialRequest(case_id="socrates"), model_runner=fake_model_runner)

    for event in events:
        agents = {turn.agent for turn in event.turns}
        assert not {"Mike OSS", "Harvey Vector"}.issubset(agents)


def test_juror_vote_events_have_fixed_personas_and_evidence():
    """Each juror event carries one well-formed vote; the final event lists all jurors in order."""
    events = run_trial(TrialRequest(case_id="socrates"), model_runner=fake_model_runner)

    juror_events = [event for event in events if event.turns and event.turns[0].agent in JUROR_PERSONAS]
    assert len(juror_events) == 6
    for event in juror_events:
        vote = event.votes[0]
        assert vote.juror == event.turns[0].agent
        assert vote.persona == JUROR_PERSONAS[vote.juror]
        assert vote.vote in {"liable", "not_liable", "uncertain"}
        assert event.turns[0].content.startswith("I vote ")
        assert vote.reason
        assert vote.evidence_ids

    final = events[-1]
    assert final.phase == "verdict"
    assert [vote.juror for vote in final.votes] == list(JUROR_PERSONAS)


def test_jurors_are_called_independently_with_personas_and_trial_history():
    """Every juror gets its own model call with its persona and the prior trial history."""
    calls = []

    run_trial(TrialRequest(case_id="socrates"), model_runner=_make_recording_runner(calls))

    juror_calls = [call for call in calls if call["role"] == "juror"]
    assert [call["agent"] for call in juror_calls] == list(JUROR_PERSONAS)
    assert len(juror_calls) == 6
    for call in juror_calls:
        assert call["persona"] == JUROR_PERSONAS[call["agent"]]
        assert "Claimant Opening" in call["trial_history"]
        assert "Respondent Opening" in call["trial_history"]
        assert "The Evidence Record" in call["trial_history"]
        assert "historical worldview" in call["objective"]


def test_lawyers_and_judge_receive_trial_history_and_objectives():
    """Advocate answers and the verdict call see the expected history and objectives."""
    calls = []

    run_trial(TrialRequest(case_id="socrates"), model_runner=_make_recording_runner(calls))

    claimant_answer = next(
        call for call in calls if call["agent"] == "Mike OSS" and "hinge question" in call["task"]
    )
    respondent_answer = next(
        call for call in calls if call["agent"] == "Harvey Vector" and "hinge question" in call["task"]
    )
    verdict_call = next(call for call in calls if call["role"] == "verdict writer")
    assert "The Hinge Question" in claimant_answer["trial_history"]
    assert "The Hinge Question" in respondent_answer["trial_history"]
    assert "most favorable to the claimant" in claimant_answer["objective"]
    assert "most favorable to the respondent" in respondent_answer["objective"]
    assert all(name in verdict_call["trial_history"] for name in JUROR_PERSONAS)
    assert "do not override the jury" in verdict_call["objective"]


def test_custom_case_context_and_evidence_reach_lawyer_prompts():
    """A user-supplied case packet's context and evidence ids appear in the claimant's call."""
    custom = CasePacket(
        id="custom",
        title="Custom Trial",
        subtitle="Entered by user.",
        claimant="Claimant",
        respondent="Respondent",
        charge="Whether the custom record favors the claimant.",
        setting="A custom courtroom.",
        context="A bicycle disappeared after a disputed garage visit.",
        claimant_claim="The claimant says the visit explains the missing bicycle.",
        respondent_claim="The respondent says the timing and evidence are ambiguous.",
        source_note="Custom test packet.",
        evidence=[
            EvidenceItem(
                id="CUS-F1",
                title="Garage Text",
                source="Custom",
                excerpt="The respondent asked to enter the garage.",
                supports="claimant",
                reliability=0.65,
                note="Supports access.",
            ),
            EvidenceItem(
                id="CUS-A1",
                title="Neighbor Sighting",
                source="Custom",
                excerpt="A neighbor saw the bicycle later that day.",
                supports="respondent",
                reliability=0.65,
                note="Supports alternative timing.",
            ),
        ],
    )
    calls = []

    run_trial(
        TrialRequest(case_id="custom", custom_case=custom),
        model_runner=_make_recording_runner(calls),
    )

    claimant_opening = next(
        call for call in calls if call["agent"] == "Mike OSS" and call["role"] == "claimant advocate"
    )
    assert "A bicycle disappeared" in claimant_opening["case_summary"]
    assert "CUS-F1" in claimant_opening["evidence_summary"]
    assert "CUS-A1" in claimant_opening["evidence_summary"]


def test_jury_contract_uses_public_history_personas():
    """Pin the judge name and the exact juror-to-persona mapping."""
    assert JUDGE_NAME == "Marcus Aurelius"
    assert JUROR_PERSONAS == {
        "Karl Marx": "class power, material conditions, exploitation, institutional incentives",
        "John Stuart Mill": "liberty, harm principle, utility, individual rights",
        "Confucius": "social harmony, role duty, ritual order, moral cultivation",
        "Cleopatra VII": "sovereign pragmatism, diplomacy, survival, legitimacy under pressure",
        "Niccolo Machiavelli": "political realism, stability, power, consequences over ideals",
        "Jensen Huang": "technological optimism, operator mindset, systems thinking, innovation tradeoffs",
    }


def test_role_prompt_requires_first_person_in_character_speech():
    """Non-juror role prompts contain the in-character speech instructions and the evidence."""
    messages = build_role_messages(
        agent="Harvey Vector",
        role="respondent advocate",
        case_summary="A short case summary.",
        evidence_summary="SOC-E1: A record excerpt.",
        task="Answer the bench for the respondent.",
    )

    system = messages[0]["content"]
    user = messages[1]["content"]
    assert "Stay fully in character as the assigned Agent and Role." in system
    assert "Output only the words this character says aloud in court." in system
    assert "Do not narrate about yourself in the third person." in system
    assert "Use the case facts and evidence provided below" in system
    assert "Speak as Harvey Vector." in user
    assert "Give only the in-scene court line" in user
    assert "SOC-E1" in user


def test_juror_vote_prompt_uses_persona_history_and_json_contract():
    """Juror prompts drop the spoken-line instruction and include persona, history, and JSON rule."""
    messages = build_role_messages(
        agent="Karl Marx",
        role="juror",
        case_summary="A short case summary.",
        evidence_summary="SOC-E1: A record excerpt.",
        trial_history="Mike OSS argued from SOC-E1.",
        persona=JUROR_PERSONAS["Karl Marx"],
        objective="Vote as Karl Marx would after watching the trial.",
        task="Return one juror vote as JSON.",
    )

    system = messages[0]["content"]
    user = messages[1]["content"]
    assert "Output only the words this character says aloud in court." not in system
    assert "You are an individual juror." in system
    assert JUROR_PERSONAS["Karl Marx"] in user
    assert "Mike OSS argued from SOC-E1." in user
    assert "Return only the requested JSON object." in user


def test_model_cleaner_extracts_final_speech_after_analysis_channel():
    """Only the text after the ``final`` marker survives cleaning."""
    text = clean_model_text(
        "analysis\nI should reason about the case first.\n\nfinal\nI stand for the respondent, and SOC-E1 leaves doubt."
    )

    assert text == "I stand for the respondent, and SOC-E1 leaves doubt."
    assert "analysis" not in text.lower()


def test_model_cleaner_rejects_visible_analysis_without_final_speech():
    """A model reply that is only visible analysis aborts the streamed trial."""

    def analysis_runner(**kwargs):
        return ModelResult(
            text="analysis: I should think through the case before answering.",
            input_text="SYSTEM:\nanalysis leak",
            call=ModelCall(
                model=kwargs["model"],
                provider=kwargs.get("provider", "test"),
                ok=True,
                latency_ms=1,
                prompt_hash="test-prompt",
            ),
        )

    with pytest.raises(RequiredModelError):
        # Pulling the first event is enough to trigger the first model call.
        next(stream_trial(TrialRequest(case_id="socrates"), model_runner=analysis_runner))


def test_model_cleaner_removes_instruction_echo_when_dialogue_remains():
    """Echoed instructions are stripped when a real spoken line follows them."""
    text = clean_model_text(
        "I will now announce the case as requested, while maintaining the theatrical but clear tone required. "
        "I will speak as Clerk Meridian in first person, starting with a pronoun.\n\n"
        "I call The Polis v. Socrates before this court."
    )

    assert text == "I call The Polis v. Socrates before this court."


def test_model_cleaner_rejects_instruction_echo_without_dialogue():
    """Text consisting only of echoed instructions is rejected."""
    # NOTE(review): the concrete exception type raised by clean_model_text is
    # not visible from this module; narrow `Exception` once it is confirmed.
    with pytest.raises(Exception, match="echoed instructions"):
        clean_model_text(
            "I will now announce the case as requested, while maintaining the theatrical but clear tone required. "
            "I will speak as Clerk Meridian in first person, starting with a pronoun."
        )


def test_required_model_failure_stops_trial_without_canned_dialogue():
    """A failed model call (``ok=False``) raises instead of producing dialogue."""

    def failing_runner(**kwargs):
        return ModelResult(
            text="",
            input_text="SYSTEM:\nfailed",
            call=ModelCall(
                model=kwargs["model"],
                provider=kwargs.get("provider", "test"),
                ok=False,
                latency_ms=1,
                prompt_hash="test-prompt",
                error="offline",
            ),
        )

    with pytest.raises(RequiredModelError, match="unavailable"):
        run_trial(TrialRequest(case_id="socrates"), model_runner=failing_runner)


def test_invalid_jury_output_stops_trial_without_fallback_votes():
    """Non-JSON juror output raises instead of being replaced by default votes."""

    def invalid_jury_runner(**kwargs):
        result = fake_model_runner(**kwargs)
        if kwargs["role"] == "juror":
            result.text = "the jury refuses structured output"
        return result

    with pytest.raises(RequiredModelError, match="invalid JSON"):
        run_trial(TrialRequest(case_id="socrates"), model_runner=invalid_jury_runner)


def test_live_search_stops_when_query_is_weak():
    """A live case with the one-character query ``"x"`` raises rather than falling back."""
    with pytest.raises(RuntimeError, match="no fallback case"):
        run_trial(TrialRequest(case_id="live", search_query="x"), model_runner=fake_model_runner)