File size: 5,524 Bytes
335b794
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import json
import re

import pytest

from sovereign_bench.engine import JUDGE_NAME, JUROR_PERSONAS, RequiredModelError, run_trial
from sovereign_bench.llm import ModelCall, ModelResult
from sovereign_bench.models import TrialRequest


def _jury_json(evidence_summary: str, vote: str = "liable") -> str:
    """Build the JSON payload the fake model returns for the juror vote step.

    Exhibit IDs are taken from lines of ``evidence_summary`` that start with an
    ``ABC-E1:``-style label; when none are found, ``"SOC-E1"`` is used. Each
    juror in ``JUROR_PERSONAS`` is assigned one exhibit, cycling through the
    IDs when there are fewer exhibits than jurors.

    Args:
        evidence_summary: Text scanned for exhibit labels at line starts.
        vote: Vote given to the first four jurors; every later juror votes
            ``"not_liable"``.

    Returns:
        A JSON string of the form ``{"votes": [...]}`` with one entry per
        juror, in ``JUROR_PERSONAS`` order.
    """
    evidence_ids = re.findall(r"^([A-Z]+-E\d+):", evidence_summary, flags=re.M) or ["SOC-E1"]

    votes = []
    for idx, (name, persona) in enumerate(JUROR_PERSONAS.items()):
        # Wrap around instead of padding the list to a fixed length, so the
        # helper keeps working for any number of jurors.
        exhibit = evidence_ids[idx % len(evidence_ids)]
        votes.append(
            {
                "juror": name,
                "persona": persona,
                "vote": vote if idx < 4 else "not_liable",
                "reason": f"{name} applies a {persona} lens to exhibit {exhibit}.",
                "evidence_ids": [exhibit],
            }
        )
    return json.dumps({"votes": votes})


def fake_model_runner(**kwargs):
    """Stand-in model runner that returns a deterministic ``ModelResult``.

    Reads ``role``, ``agent``, ``task`` and ``model`` from ``kwargs``, plus
    ``evidence_summary`` when the role is the juror vote generator. The
    ``provider`` key is optional and defaults to ``"test"``.
    """
    # The juror vote step gets JSON; every other role gets a one-line echo.
    if kwargs["role"] == "juror vote generator":
        reply = _jury_json(kwargs["evidence_summary"])
    else:
        reply = f"{kwargs['agent']} responds to: {kwargs['task']}"

    rendered_prompt = (
        "SYSTEM:\nFake live model for tests.\n\nUSER:\n"
        f"Agent: {kwargs['agent']}\n"
        f"Role: {kwargs['role']}\n"
        f"Task: {kwargs['task']}\n\n"
        "ASSISTANT:\n"
    )

    call_record = ModelCall(
        model=kwargs["model"],
        provider=kwargs.get("provider", "test"),
        ok=True,
        latency_ms=1,
        prompt_hash="test-prompt",
    )
    return ModelResult(text=reply, input_text=rendered_prompt, call=call_record)


def test_cached_cases_emit_sequential_speaker_order():
    """Both cached cases produce one lead speaker per event in a fixed order."""
    expected_speakers = (
        ["Clerk Meridian", JUDGE_NAME, "Advocate Auric", "Counsel Sable", "Auditor Prism"]
        + [JUDGE_NAME, "Advocate Auric", "Counsel Sable", "Nemotron Jury"]
        + [*JUROR_PERSONAS]
        + [JUDGE_NAME]
    )

    for case_id in ("socrates", "barnaby"):
        events = run_trial(TrialRequest(case_id=case_id), model_runner=fake_model_runner)

        lead_speakers = [event.turns[0].agent for event in events]
        assert lead_speakers == expected_speakers

        deliberation_count = sum(1 for event in events if event.phase == "deliberation")
        assert deliberation_count == 7

        # The first turn must carry the rendered prompt that was sent to the model.
        opening_turn = events[0].turns[0]
        assert opening_turn.input
        assert "SYSTEM:" in opening_turn.input

        # The last event carries the verdict and the full set of votes.
        closing_event = events[-1]
        assert closing_event.verdict is not None
        assert closing_event.votes and len(closing_event.votes) == 6
        assert "uncertainty" in closing_event.verdict.uncertainty.lower()


def test_no_event_contains_both_lawyers_speaking_together():
    """No single event may include turns from both lawyers."""
    both_lawyers = {"Advocate Auric", "Counsel Sable"}

    for event in run_trial(TrialRequest(case_id="socrates"), model_runner=fake_model_runner):
        speakers = {turn.agent for turn in event.turns}
        assert not both_lawyers <= speakers


def test_juror_vote_events_have_fixed_personas_and_evidence():
    """Each juror event carries a vote matching its speaker, persona and evidence."""
    events = run_trial(TrialRequest(case_id="socrates"), model_runner=fake_model_runner)

    # Juror events are those whose first turn is spoken by a known juror.
    juror_events = []
    for event in events:
        if event.turns[0].agent in JUROR_PERSONAS:
            juror_events.append(event)
    assert len(juror_events) == 6

    allowed_votes = {"liable", "not_liable", "uncertain"}
    for event in juror_events:
        ballot = event.votes[0]
        assert ballot.juror == event.turns[0].agent
        assert ballot.persona == JUROR_PERSONAS[ballot.juror]
        assert ballot.vote in allowed_votes
        assert ballot.reason
        assert ballot.evidence_ids

    # The last event is the verdict and lists the jurors in persona order.
    verdict_event = events[-1]
    assert verdict_event.phase == "verdict"
    assert [ballot.juror for ballot in verdict_event.votes] == list(JUROR_PERSONAS)


def test_jury_contract_uses_public_history_personas():
    """Pin the judge name and the exact juror-to-persona mapping."""
    expected_personas = dict(
        [
            ("Karl Marx", "class power, material conditions, exploitation, institutional incentives"),
            ("John Stuart Mill", "liberty, harm principle, utility, individual rights"),
            ("Confucius", "social harmony, role duty, ritual order, moral cultivation"),
            ("Cleopatra VII", "sovereign pragmatism, diplomacy, survival, legitimacy under pressure"),
            ("Niccolo Machiavelli", "political realism, stability, power, consequences over ideals"),
            (
                "Jensen Huang",
                "technological optimism, operator mindset, systems thinking, innovation tradeoffs",
            ),
        ]
    )

    assert JUDGE_NAME == "Marcus Aurelius"
    assert JUROR_PERSONAS == expected_personas


def test_required_model_failure_stops_trial_without_canned_dialogue():
    """A runner whose calls report ``ok=False`` makes ``run_trial`` raise."""

    def failing_runner(**kwargs):
        # Every call comes back empty and flagged as failed.
        failed_call = ModelCall(
            model=kwargs["model"],
            provider=kwargs.get("provider", "test"),
            ok=False,
            latency_ms=1,
            prompt_hash="test-prompt",
            error="offline",
        )
        return ModelResult(text="", input_text="SYSTEM:\nfailed", call=failed_call)

    with pytest.raises(RequiredModelError, match="unavailable"):
        run_trial(TrialRequest(case_id="socrates"), model_runner=failing_runner)


def test_invalid_jury_output_stops_trial_without_fallback_votes():
    """Non-JSON text from the juror vote step makes ``run_trial`` raise."""

    def invalid_jury_runner(**kwargs):
        outcome = fake_model_runner(**kwargs)
        if kwargs["role"] != "juror vote generator":
            return outcome
        # Only the juror vote response is replaced with unparseable text.
        outcome.text = "the jury refuses structured output"
        return outcome

    with pytest.raises(RequiredModelError, match="invalid JSON"):
        run_trial(TrialRequest(case_id="socrates"), model_runner=invalid_jury_runner)


def test_live_search_stops_when_query_is_weak():
    """A live trial with the one-character query ``"x"`` raises ``RuntimeError``."""
    weak_query = "x"
    with pytest.raises(RuntimeError, match="no fallback case"):
        # The request is built inside the context so any error it raises is still captured.
        request = TrialRequest(case_id="live", search_query=weak_query)
        run_trial(request, model_runner=fake_model_runner)