Spaces:
Sleeping
Sleeping
| import json | |
| import re | |
| import pytest | |
| from sovereign_bench.engine import JUDGE_NAME, JUROR_PERSONAS, RequiredModelError, run_trial, stream_trial | |
| from sovereign_bench.llm import ModelCall, ModelResult, build_role_messages, clean_model_text | |
| from sovereign_bench.models import CasePacket, EvidenceItem, TrialRequest | |
def _juror_json(kwargs, vote: str = "liable") -> str:
    """Serialize a fake juror ballot for the given model-runner kwargs.

    The cited exhibit is the first ID that starts a line of
    ``kwargs["evidence_summary"]`` (uppercase letters, a dash, one uppercase
    letter, digits, then a colon). ``"SOC-E1"`` is used when no line matches.

    Args:
        kwargs: Runner keyword arguments; must contain ``evidence_summary``,
            ``agent`` and ``persona``.
        vote: Vote string placed in the ballot.

    Returns:
        The ballot encoded as a JSON object string.
    """
    first_hit = re.search(r"^([A-Z]+-[A-Z]\d+):", kwargs["evidence_summary"], flags=re.M)
    exhibit = first_hit.group(1) if first_hit else "SOC-E1"
    juror = kwargs["agent"]
    persona = kwargs["persona"]
    ballot = {
        "juror": juror,
        "persona": persona,
        "vote": vote,
        "reason": f"{juror} applies {persona} to exhibit {exhibit}.",
        "evidence_ids": [exhibit],
    }
    return json.dumps(ballot)
def fake_model_runner(**kwargs):
    """Deterministic stand-in for the live model runner used by these tests.

    Juror calls receive a JSON ballot: jurors at positions 0-3 of
    ``JUROR_PERSONAS`` vote ``liable``, later ones vote ``not_liable``.
    Every other role receives a plain echo of the agent name and task.

    Returns:
        A ``ModelResult`` carrying the reply text, a synthetic prompt
        transcript, and a successful ``ModelCall`` record.
    """
    if kwargs["role"] == "juror":
        juror_position = list(JUROR_PERSONAS).index(kwargs["agent"])
        ballot_vote = "liable" if juror_position < 4 else "not_liable"
        reply = _juror_json(kwargs, vote=ballot_vote)
    else:
        reply = f"{kwargs['agent']} responds to: {kwargs['task']}"

    # Synthetic transcript in SYSTEM / USER / ASSISTANT layout.
    transcript = "".join(
        [
            "SYSTEM:\nFake live model for tests.\n\nUSER:\n",
            f"Agent: {kwargs['agent']}\n",
            f"Role: {kwargs['role']}\n",
            f"Persona: {kwargs.get('persona', '')}\n",
            f"Objective: {kwargs.get('objective', '')}\n",
            f"History: {kwargs.get('trial_history', '')}\n",
            f"Task: {kwargs['task']}\n\nASSISTANT:\n",
        ]
    )

    call_record = ModelCall(
        model=kwargs["model"],
        provider=kwargs.get("provider", "test"),
        ok=True,
        latency_ms=1,
        prompt_hash="test-prompt",
    )
    return ModelResult(text=reply, input_text=transcript, call=call_record)
def test_cached_cases_emit_sequential_speaker_order():
    """Each cached case yields the fixed speaker sequence and a full verdict event."""
    lawyers = ["Mike OSS", "Harvey Vector"]
    speaker_script = (
        ["Clerk Meridian", JUDGE_NAME]
        + lawyers
        + [JUDGE_NAME]
        + lawyers
        + ["Nemotron Jury"]
        + list(JUROR_PERSONAS)
        + [JUDGE_NAME]
    )

    for case_id in ("socrates", "barnaby"):
        events = run_trial(TrialRequest(case_id=case_id), model_runner=fake_model_runner)

        # Only events that carry turns take part in the speaking order.
        first_speakers = [event.turns[0].agent for event in events if event.turns]
        assert first_speakers == speaker_script

        evidence_event = next(event for event in events if event.phase == "evidence")
        assert evidence_event.title == "The Evidence Record"
        assert evidence_event.turns == []

        deliberation_count = sum(1 for event in events if event.phase == "deliberation")
        assert deliberation_count == 7

        opening_input = events[0].turns[0].input
        assert opening_input
        assert "SYSTEM:" in opening_input

        closing = events[-1]
        assert closing.verdict is not None
        assert closing.votes and len(closing.votes) == 6
        assert "uncertainty" in closing.verdict.uncertainty.lower()
def test_no_event_contains_both_lawyers_speaking_together():
    """No single event may contain turns from both advocates."""
    both_lawyers = frozenset(("Mike OSS", "Harvey Vector"))
    trial_events = run_trial(TrialRequest(case_id="socrates"), model_runner=fake_model_runner)
    for trial_event in trial_events:
        speakers = {turn.agent for turn in trial_event.turns}
        assert not both_lawyers <= speakers
def test_juror_vote_events_have_fixed_personas_and_evidence():
    """Every juror event carries one well-formed vote; the verdict lists all jurors in order."""
    events = run_trial(TrialRequest(case_id="socrates"), model_runner=fake_model_runner)

    juror_events = []
    for event in events:
        if event.turns and event.turns[0].agent in JUROR_PERSONAS:
            juror_events.append(event)
    assert len(juror_events) == 6

    allowed_votes = {"liable", "not_liable", "uncertain"}
    for event in juror_events:
        ballot = event.votes[0]
        spoken_turn = event.turns[0]
        assert ballot.juror == spoken_turn.agent
        assert ballot.persona == JUROR_PERSONAS[ballot.juror]
        assert ballot.vote in allowed_votes
        assert spoken_turn.content.startswith("I vote ")
        assert ballot.reason
        assert ballot.evidence_ids

    verdict_event = events[-1]
    assert verdict_event.phase == "verdict"
    assert [ballot.juror for ballot in verdict_event.votes] == list(JUROR_PERSONAS)
def test_jurors_are_called_independently_with_personas_and_trial_history():
    """Each juror gets its own model call with its persona and the public trial record."""
    recorded = []

    def spy_runner(**call_kwargs):
        # Keep a shallow copy of every call, then delegate to the fake runner.
        recorded.append(dict(call_kwargs))
        return fake_model_runner(**call_kwargs)

    run_trial(TrialRequest(case_id="socrates"), model_runner=spy_runner)

    juror_calls = [entry for entry in recorded if entry["role"] == "juror"]
    assert [entry["agent"] for entry in juror_calls] == list(JUROR_PERSONAS)
    assert len(juror_calls) == 6

    for entry in juror_calls:
        assert entry["persona"] == JUROR_PERSONAS[entry["agent"]]
        history = entry["trial_history"]
        for section_title in ("Claimant Opening", "Respondent Opening", "The Evidence Record"):
            assert section_title in history
        assert "historical worldview" in entry["objective"]
def test_lawyers_and_judge_receive_trial_history_and_objectives():
    """Advocate answers and the verdict call see prior history and role-specific objectives."""
    recorded = []

    def spy_runner(**call_kwargs):
        # Keep a shallow copy of every call, then delegate to the fake runner.
        recorded.append(dict(call_kwargs))
        return fake_model_runner(**call_kwargs)

    run_trial(TrialRequest(case_id="socrates"), model_runner=spy_runner)

    def hinge_answer(agent_name):
        # First recorded call where this agent is tasked with the hinge question.
        return next(
            entry
            for entry in recorded
            if entry["agent"] == agent_name and "hinge question" in entry["task"]
        )

    claimant_answer = hinge_answer("Mike OSS")
    respondent_answer = hinge_answer("Harvey Vector")
    verdict_call = next(entry for entry in recorded if entry["role"] == "verdict writer")

    for answer in (claimant_answer, respondent_answer):
        assert "The Hinge Question" in answer["trial_history"]
    assert "most favorable to the claimant" in claimant_answer["objective"]
    assert "most favorable to the respondent" in respondent_answer["objective"]

    verdict_history = verdict_call["trial_history"]
    absent_jurors = [name for name in JUROR_PERSONAS if name not in verdict_history]
    assert not absent_jurors
    assert "do not override the jury" in verdict_call["objective"]
def test_custom_case_context_and_evidence_reach_lawyer_prompts():
    """A user-supplied case packet's context and exhibit IDs appear in the claimant advocate's call."""
    garage_text = EvidenceItem(
        id="CUS-F1",
        title="Garage Text",
        source="Custom",
        excerpt="The respondent asked to enter the garage.",
        supports="claimant",
        reliability=0.65,
        note="Supports access.",
    )
    neighbor_sighting = EvidenceItem(
        id="CUS-A1",
        title="Neighbor Sighting",
        source="Custom",
        excerpt="A neighbor saw the bicycle later that day.",
        supports="respondent",
        reliability=0.65,
        note="Supports alternative timing.",
    )
    packet = CasePacket(
        id="custom",
        title="Custom Trial",
        subtitle="Entered by user.",
        claimant="Claimant",
        respondent="Respondent",
        charge="Whether the custom record favors the claimant.",
        setting="A custom courtroom.",
        context="A bicycle disappeared after a disputed garage visit.",
        claimant_claim="The claimant says the visit explains the missing bicycle.",
        respondent_claim="The respondent says the timing and evidence are ambiguous.",
        source_note="Custom test packet.",
        evidence=[garage_text, neighbor_sighting],
    )

    recorded = []

    def spy_runner(**call_kwargs):
        # Keep a shallow copy of every call, then delegate to the fake runner.
        recorded.append(dict(call_kwargs))
        return fake_model_runner(**call_kwargs)

    run_trial(TrialRequest(case_id="custom", custom_case=packet), model_runner=spy_runner)

    claimant_opening = next(
        entry
        for entry in recorded
        if entry["agent"] == "Mike OSS" and entry["role"] == "claimant advocate"
    )
    assert "A bicycle disappeared" in claimant_opening["case_summary"]
    for exhibit_id in ("CUS-F1", "CUS-A1"):
        assert exhibit_id in claimant_opening["evidence_summary"]
def test_jury_contract_uses_public_history_personas():
    """Pin the judge name and the exact juror-to-persona mapping."""
    pinned_personas = {
        "Karl Marx": "class power, material conditions, exploitation, institutional incentives",
        "John Stuart Mill": "liberty, harm principle, utility, individual rights",
        "Confucius": "social harmony, role duty, ritual order, moral cultivation",
        "Cleopatra VII": "sovereign pragmatism, diplomacy, survival, legitimacy under pressure",
        "Niccolo Machiavelli": "political realism, stability, power, consequences over ideals",
        "Jensen Huang": "technological optimism, operator mindset, systems thinking, innovation tradeoffs",
    }
    assert JUDGE_NAME == "Marcus Aurelius"
    assert JUROR_PERSONAS == pinned_personas
def test_role_prompt_requires_first_person_in_character_speech():
    """The advocate prompt must carry the in-character speech rules and the evidence."""
    system_message, user_message = build_role_messages(
        agent="Harvey Vector",
        role="respondent advocate",
        case_summary="A short case summary.",
        evidence_summary="SOC-E1: A record excerpt.",
        task="Answer the bench for the respondent.",
    )[:2]
    system_text = system_message["content"]
    user_text = user_message["content"]

    required_in_system = (
        "Stay fully in character as the assigned Agent and Role.",
        "Output only the words this character says aloud in court.",
        "Do not narrate about yourself in the third person.",
        "Use the case facts and evidence provided below",
    )
    for phrase in required_in_system:
        assert phrase in system_text

    required_in_user = (
        "Speak as Harvey Vector.",
        "Give only the in-scene court line",
        "SOC-E1",
    )
    for phrase in required_in_user:
        assert phrase in user_text
def test_juror_vote_prompt_uses_persona_history_and_json_contract():
    """The juror prompt drops the spoken-line rule and carries persona, history and the JSON contract."""
    juror_name = "Karl Marx"
    persona_text = JUROR_PERSONAS[juror_name]
    prompt_messages = build_role_messages(
        agent=juror_name,
        role="juror",
        case_summary="A short case summary.",
        evidence_summary="SOC-E1: A record excerpt.",
        trial_history="Mike OSS argued from SOC-E1.",
        persona=persona_text,
        objective="Vote as Karl Marx would after watching the trial.",
        task="Return one juror vote as JSON.",
    )
    system_text = prompt_messages[0]["content"]
    user_text = prompt_messages[1]["content"]

    # The courtroom-speech instruction must be absent for jurors.
    assert "Output only the words this character says aloud in court." not in system_text
    assert "You are an individual juror." in system_text

    for fragment in (
        persona_text,
        "Mike OSS argued from SOC-E1.",
        "Return only the requested JSON object.",
    ):
        assert fragment in user_text
def test_model_cleaner_extracts_final_speech_after_analysis_channel():
    """Only the text after the ``final`` marker survives cleaning."""
    spoken_line = "I stand for the respondent, and SOC-E1 leaves doubt."
    raw_output = "analysis\nI should reason about the case first.\n\nfinal\n" + spoken_line

    cleaned = clean_model_text(raw_output)

    assert cleaned == spoken_line
    assert "analysis" not in cleaned.lower()
def test_model_cleaner_rejects_visible_analysis_without_final_speech():
    """A reply that is only analysis text makes the first streamed event raise."""

    def analysis_runner(**call_kwargs):
        # Reports a successful call whose text is nothing but leaked analysis.
        call_record = ModelCall(
            model=call_kwargs["model"],
            provider=call_kwargs.get("provider", "test"),
            ok=True,
            latency_ms=1,
            prompt_hash="test-prompt",
        )
        return ModelResult(
            text="analysis: I should think through the case before answering.",
            input_text="SYSTEM:\nanalysis leak",
            call=call_record,
        )

    with pytest.raises(RequiredModelError):
        event_stream = stream_trial(TrialRequest(case_id="socrates"), model_runner=analysis_runner)
        next(event_stream)
def test_model_cleaner_removes_instruction_echo_when_dialogue_remains():
    """Echoed instructions are stripped when a real spoken line follows them."""
    spoken_line = "I call The Polis v. Socrates before this court."
    echoed_instructions = (
        "I will now announce the case as requested, while maintaining the theatrical but clear tone required. "
        "I will speak as Clerk Meridian in first person, starting with a pronoun."
    )

    cleaned = clean_model_text(echoed_instructions + "\n\n" + spoken_line)

    assert cleaned == spoken_line
def test_model_cleaner_rejects_instruction_echo_without_dialogue():
    """Cleaning text that is only echoed instructions raises with an 'echoed instructions' message."""
    echoed_instructions = (
        "I will now announce the case as requested, while maintaining the theatrical but clear tone required. "
        "I will speak as Clerk Meridian in first person, starting with a pronoun."
    )
    # NOTE(review): ``Exception`` is broad; narrow it once the concrete type
    # raised by clean_model_text is confirmed. The ``match`` keeps this specific.
    with pytest.raises(Exception, match="echoed instructions"):
        clean_model_text(echoed_instructions)
def test_required_model_failure_stops_trial_without_canned_dialogue():
    """A failed model call aborts the trial with an 'unavailable' error."""

    def failing_runner(**call_kwargs):
        # Simulates an offline provider: empty text and ok=False.
        failed_call = ModelCall(
            model=call_kwargs["model"],
            provider=call_kwargs.get("provider", "test"),
            ok=False,
            latency_ms=1,
            prompt_hash="test-prompt",
            error="offline",
        )
        return ModelResult(text="", input_text="SYSTEM:\nfailed", call=failed_call)

    with pytest.raises(RequiredModelError, match="unavailable"):
        run_trial(TrialRequest(case_id="socrates"), model_runner=failing_runner)
def test_invalid_jury_output_stops_trial_without_fallback_votes():
    """Non-JSON juror output aborts the trial with an 'invalid JSON' error."""

    def invalid_jury_runner(**call_kwargs):
        # Behaves like the fake runner, but replaces juror replies with prose.
        reply = fake_model_runner(**call_kwargs)
        is_juror_call = call_kwargs["role"] == "juror"
        if is_juror_call:
            reply.text = "the jury refuses structured output"
        return reply

    with pytest.raises(RequiredModelError, match="invalid JSON"):
        run_trial(TrialRequest(case_id="socrates"), model_runner=invalid_jury_runner)
def test_live_search_stops_when_query_is_weak():
    """A live case with the query ``"x"`` raises RuntimeError mentioning 'no fallback case'."""
    expected_message = "no fallback case"
    with pytest.raises(RuntimeError, match=expected_message):
        run_trial(
            TrialRequest(case_id="live", search_query="x"),
            model_runner=fake_model_runner,
        )