Spaces:
Sleeping
Sleeping
File size: 5,524 Bytes
335b794 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 | import json
import re
import pytest
from sovereign_bench.engine import JUDGE_NAME, JUROR_PERSONAS, RequiredModelError, run_trial
from sovereign_bench.llm import ModelCall, ModelResult
from sovereign_bench.models import TrialRequest
def _jury_json(evidence_summary: str, vote: str = "liable") -> str:
    """Build the JSON payload returned by the fake juror-vote model.

    Exhibit ids are read from lines of ``evidence_summary`` that begin with
    an id followed by a colon (for example ``SOC-E1:``). When no such line is
    present, the single id ``"SOC-E1"`` is used. Ids are assigned to jurors
    round-robin, so the payload can be built for any number of personas
    instead of only as many as a fixed-length padded list allows.

    Args:
        evidence_summary: Text scanned for leading ``<LETTERS>-E<digits>:`` ids.
        vote: Vote recorded for the first four jurors; every later juror
            votes ``"not_liable"``.

    Returns:
        A JSON string holding one ``"votes"`` list with an entry per item of
        ``JUROR_PERSONAS``.
    """
    exhibit_ids = re.findall(r"^([A-Z]+-E\d+):", evidence_summary, flags=re.M) or ["SOC-E1"]
    votes = []
    for idx, (name, persona) in enumerate(JUROR_PERSONAS.items()):
        # Wrap around the available ids rather than indexing a list that was
        # padded to a hard-coded length.
        exhibit = exhibit_ids[idx % len(exhibit_ids)]
        votes.append(
            {
                "juror": name,
                "persona": persona,
                "vote": vote if idx < 4 else "not_liable",
                "reason": f"{name} applies a {persona} lens to exhibit {exhibit}.",
                "evidence_ids": [exhibit],
            }
        )
    return json.dumps({"votes": votes})
def fake_model_runner(**kwargs):
    """Return a successful ``ModelResult`` without contacting any model.

    For the ``"juror vote generator"`` role the text is the JSON produced by
    ``_jury_json`` from ``kwargs["evidence_summary"]``; for every other role
    it is a one-line echo of the agent and task. The recorded input text
    always starts with a ``SYSTEM:`` section.
    """
    role = kwargs["role"]
    if role == "juror vote generator":
        reply = _jury_json(kwargs["evidence_summary"])
    else:
        reply = f"{kwargs['agent']} responds to: {kwargs['task']}"

    system_section = "SYSTEM:\nFake live model for tests.\n\nUSER:\n"
    user_section = (
        f"Agent: {kwargs['agent']}\nRole: {role}\nTask: {kwargs['task']}\n\nASSISTANT:\n"
    )

    call_record = ModelCall(
        model=kwargs["model"],
        provider=kwargs.get("provider", "test"),
        ok=True,
        latency_ms=1,
        prompt_hash="test-prompt",
    )
    return ModelResult(text=reply, input_text=system_section + user_section, call=call_record)
def test_cached_cases_emit_sequential_speaker_order():
    """Both cached cases emit one lead speaker per event, in a fixed order."""
    expected_speakers = [
        "Clerk Meridian",
        JUDGE_NAME,
        "Advocate Auric",
        "Counsel Sable",
        "Auditor Prism",
        JUDGE_NAME,
        "Advocate Auric",
        "Counsel Sable",
        "Nemotron Jury",
    ]
    # Each juror speaks once, in persona-table order, before the judge closes.
    expected_speakers.extend(JUROR_PERSONAS)
    expected_speakers.append(JUDGE_NAME)

    for case_id in ("socrates", "barnaby"):
        events = run_trial(TrialRequest(case_id=case_id), model_runner=fake_model_runner)

        speakers = [event.turns[0].agent for event in events]
        assert speakers == expected_speakers

        phases = [event.phase for event in events]
        assert phases.count("deliberation") == 7

        opening_turn = events[0].turns[0]
        assert opening_turn.input
        assert "SYSTEM:" in opening_turn.input

        closing_event = events[-1]
        assert closing_event.verdict is not None
        assert closing_event.votes and len(closing_event.votes) == 6
        assert "uncertainty" in closing_event.verdict.uncertainty.lower()
def test_no_event_contains_both_lawyers_speaking_together():
    """No single event may carry turns from both lawyers."""
    lawyers = {"Advocate Auric", "Counsel Sable"}
    trial_events = run_trial(TrialRequest(case_id="socrates"), model_runner=fake_model_runner)
    for event in trial_events:
        speaking = {turn.agent for turn in event.turns}
        assert not lawyers.issubset(speaking)
def test_juror_vote_events_have_fixed_personas_and_evidence():
    """Each juror event carries that juror's own vote, persona, reason and evidence."""
    allowed_votes = {"liable", "not_liable", "uncertain"}
    events = run_trial(TrialRequest(case_id="socrates"), model_runner=fake_model_runner)

    # Juror events are those whose first turn is spoken by a known juror.
    juror_events = [ev for ev in events if ev.turns[0].agent in JUROR_PERSONAS]
    assert len(juror_events) == 6

    for juror_event in juror_events:
        ballot = juror_event.votes[0]
        assert ballot.juror == juror_event.turns[0].agent
        assert ballot.persona == JUROR_PERSONAS[ballot.juror]
        assert ballot.vote in allowed_votes
        assert ballot.reason
        assert ballot.evidence_ids

    final = events[-1]
    assert final.phase == "verdict"
    final_jurors = [ballot.juror for ballot in final.votes]
    assert final_jurors == list(JUROR_PERSONAS)
def test_jury_contract_uses_public_history_personas():
    """Pin the judge name and the exact juror-to-persona table."""
    expected_personas = {
        "Karl Marx": "class power, material conditions, exploitation, institutional incentives",
        "John Stuart Mill": "liberty, harm principle, utility, individual rights",
        "Confucius": "social harmony, role duty, ritual order, moral cultivation",
        "Cleopatra VII": "sovereign pragmatism, diplomacy, survival, legitimacy under pressure",
        "Niccolo Machiavelli": "political realism, stability, power, consequences over ideals",
        "Jensen Huang": "technological optimism, operator mindset, systems thinking, innovation tradeoffs",
    }
    assert JUDGE_NAME == "Marcus Aurelius"
    assert JUROR_PERSONAS == expected_personas
def test_required_model_failure_stops_trial_without_canned_dialogue():
    """A runner whose calls report ``ok=False`` must abort the trial with an error."""

    def failing_runner(**kwargs):
        # Empty text plus a failed call record, as an offline model would yield.
        failed_call = ModelCall(
            model=kwargs["model"],
            provider=kwargs.get("provider", "test"),
            ok=False,
            latency_ms=1,
            prompt_hash="test-prompt",
            error="offline",
        )
        return ModelResult(text="", input_text="SYSTEM:\nfailed", call=failed_call)

    with pytest.raises(RequiredModelError, match="unavailable"):
        request = TrialRequest(case_id="socrates")
        run_trial(request, model_runner=failing_runner)
def test_invalid_jury_output_stops_trial_without_fallback_votes():
    """Non-JSON juror output must raise instead of being replaced by default votes."""

    def invalid_jury_runner(**kwargs):
        outcome = fake_model_runner(**kwargs)
        if kwargs["role"] != "juror vote generator":
            return outcome
        # Only the juror-vote reply is corrupted; other roles pass through.
        outcome.text = "the jury refuses structured output"
        return outcome

    with pytest.raises(RequiredModelError, match="invalid JSON"):
        request = TrialRequest(case_id="socrates")
        run_trial(request, model_runner=invalid_jury_runner)
def test_live_search_stops_when_query_is_weak():
    """A live case with the one-character query ``"x"`` must raise, not fall back."""
    with pytest.raises(RuntimeError, match="no fallback case"):
        weak_request = TrialRequest(case_id="live", search_query="x")
        run_trial(weak_request, model_runner=fake_model_runner)
|