Spaces:
Sleeping
Sleeping
| import json | |
| import re | |
| import pytest | |
| from sovereign_bench.engine import JUDGE_NAME, JUROR_PERSONAS, RequiredModelError, run_trial | |
| from sovereign_bench.llm import ModelCall, ModelResult | |
| from sovereign_bench.models import TrialRequest | |
def _jury_json(evidence_summary: str, vote: str = "liable") -> str:
    """Build a JSON jury payload with one vote entry per juror persona.

    Exhibit ids are collected from lines of ``evidence_summary`` that begin
    with an id shaped like ``SOC-E1:``. When no such line exists, the single
    id ``"SOC-E1"`` is used instead. Ids are handed out round-robin, so the
    helper keeps working for any number of entries in ``JUROR_PERSONAS``.

    Args:
        evidence_summary: Text scanned line by line for leading exhibit ids.
        vote: Vote assigned to the first four jurors; every later juror
            votes ``"not_liable"``.

    Returns:
        A JSON string of the form ``{"votes": [...]}`` where each entry has
        the keys ``juror``, ``persona``, ``vote``, ``reason`` and
        ``evidence_ids``.
    """
    exhibit_ids = re.findall(r"^([A-Z]+-E\d+):", evidence_summary, flags=re.M) or ["SOC-E1"]

    votes = []
    for idx, (name, persona) in enumerate(JUROR_PERSONAS.items()):
        # Cycle through the exhibits instead of padding the list to a fixed
        # length, so the juror count is not baked into this helper.
        exhibit = exhibit_ids[idx % len(exhibit_ids)]
        votes.append(
            {
                "juror": name,
                "persona": persona,
                "vote": vote if idx < 4 else "not_liable",
                "reason": f"{name} applies a {persona} lens to exhibit {exhibit}.",
                "evidence_ids": [exhibit],
            }
        )
    return json.dumps({"votes": votes})
def fake_model_runner(**kwargs):
    """Stand in for the live model runner and return a successful result.

    For the ``"juror vote generator"`` role the reply is the JSON produced by
    ``_jury_json`` from ``kwargs["evidence_summary"]``; for every other role
    it is a short sentence echoing the agent and task. The returned
    ``ModelResult`` also carries a synthetic prompt and a ``ModelCall``
    marked ``ok=True``.

    Reads ``role``, ``agent``, ``task`` and ``model`` from ``kwargs`` (plus
    ``evidence_summary`` for the juror role); ``provider`` is optional and
    defaults to ``"test"``.
    """
    if kwargs["role"] == "juror vote generator":
        reply = _jury_json(kwargs["evidence_summary"])
    else:
        reply = f"{kwargs['agent']} responds to: {kwargs['task']}"

    transcript = (
        "SYSTEM:\nFake live model for tests.\n\nUSER:\n"
        f"Agent: {kwargs['agent']}\nRole: {kwargs['role']}\nTask: {kwargs['task']}\n\nASSISTANT:\n"
    )
    call_record = ModelCall(
        model=kwargs["model"],
        provider=kwargs.get("provider", "test"),
        ok=True,
        latency_ms=1,
        prompt_hash="test-prompt",
    )
    return ModelResult(text=reply, input_text=transcript, call=call_record)
def test_cached_cases_emit_sequential_speaker_order():
    """Check speaker order and closing-event fields for the cached cases.

    Runs the "socrates" and "barnaby" cases with the fake runner and checks
    that the first turn of each event follows the expected speaker sequence,
    that seven events are in the "deliberation" phase, that the opening turn
    carries a prompt containing "SYSTEM:", and that the last event has a
    verdict, six votes and an uncertainty statement.
    """
    # NOTE(review): indentation was lost in the copy this was reviewed from;
    # all checks are assumed to run once per case -- confirm against the repo.
    speaker_sequence = [
        "Clerk Meridian",
        JUDGE_NAME,
        "Advocate Auric",
        "Counsel Sable",
        "Auditor Prism",
        JUDGE_NAME,
        "Advocate Auric",
        "Counsel Sable",
        "Nemotron Jury",
        *JUROR_PERSONAS,
        JUDGE_NAME,
    ]

    for case_id in ("socrates", "barnaby"):
        request = TrialRequest(case_id=case_id)
        events = run_trial(request, model_runner=fake_model_runner)

        first_speakers = [event.turns[0].agent for event in events]
        assert first_speakers == speaker_sequence

        deliberation_count = sum(1 for event in events if event.phase == "deliberation")
        assert deliberation_count == 7

        opening_prompt = events[0].turns[0].input
        assert opening_prompt
        assert "SYSTEM:" in opening_prompt

        closing = events[-1]
        assert closing.verdict is not None
        assert closing.votes and len(closing.votes) == 6
        assert "uncertainty" in closing.verdict.uncertainty.lower()
def test_no_event_contains_both_lawyers_speaking_together():
    """Check that no single event has turns from both lawyers.

    Runs the "socrates" case with the fake runner and fails if any event's
    turns include both "Advocate Auric" and "Counsel Sable".
    """
    lawyers = {"Advocate Auric", "Counsel Sable"}
    events = run_trial(TrialRequest(case_id="socrates"), model_runner=fake_model_runner)
    for event in events:
        speakers = {turn.agent for turn in event.turns}
        assert not (lawyers <= speakers)
def test_juror_vote_events_have_fixed_personas_and_evidence():
    """Check each juror event's vote and the juror order on the final event.

    Runs the "socrates" case with the fake runner. Every event whose first
    turn is spoken by a name in ``JUROR_PERSONAS`` must carry a first vote
    that matches that juror and persona, uses an allowed vote value, and has
    a non-empty reason and evidence list. The final event must be in the
    "verdict" phase and list votes in ``JUROR_PERSONAS`` order.
    """
    events = run_trial(TrialRequest(case_id="socrates"), model_runner=fake_model_runner)

    per_juror = [ev for ev in events if ev.turns[0].agent in JUROR_PERSONAS]
    assert len(per_juror) == 6

    allowed_votes = {"liable", "not_liable", "uncertain"}
    for ev in per_juror:
        ballot = ev.votes[0]
        assert ballot.juror == ev.turns[0].agent
        assert ballot.persona == JUROR_PERSONAS[ballot.juror]
        assert ballot.vote in allowed_votes
        assert ballot.reason
        assert ballot.evidence_ids

    closing = events[-1]
    assert closing.phase == "verdict"
    assert [ballot.juror for ballot in closing.votes] == list(JUROR_PERSONAS)
def test_jury_contract_uses_public_history_personas():
    """Pin the judge name and the exact juror-to-persona mapping."""
    expected_personas = {
        "Karl Marx": "class power, material conditions, exploitation, institutional incentives",
        "John Stuart Mill": "liberty, harm principle, utility, individual rights",
        "Confucius": "social harmony, role duty, ritual order, moral cultivation",
        "Cleopatra VII": "sovereign pragmatism, diplomacy, survival, legitimacy under pressure",
        "Niccolo Machiavelli": "political realism, stability, power, consequences over ideals",
        "Jensen Huang": "technological optimism, operator mindset, systems thinking, innovation tradeoffs",
    }

    assert JUDGE_NAME == "Marcus Aurelius"
    assert JUROR_PERSONAS == expected_personas
def test_required_model_failure_stops_trial_without_canned_dialogue():
    """Check that a failed model call makes ``run_trial`` raise.

    The runner used here always returns empty text with a ``ModelCall``
    marked ``ok=False`` and ``error="offline"``; the trial is expected to
    raise ``RequiredModelError`` with a message matching "unavailable".
    """

    def offline_runner(**kwargs):
        # Report every call as failed, with no generated text.
        failed_call = ModelCall(
            model=kwargs["model"],
            provider=kwargs.get("provider", "test"),
            ok=False,
            latency_ms=1,
            prompt_hash="test-prompt",
            error="offline",
        )
        return ModelResult(text="", input_text="SYSTEM:\nfailed", call=failed_call)

    with pytest.raises(RequiredModelError, match="unavailable"):
        run_trial(TrialRequest(case_id="socrates"), model_runner=offline_runner)
def test_invalid_jury_output_stops_trial_without_fallback_votes():
    """Check that non-JSON jury output makes ``run_trial`` raise.

    The runner delegates to ``fake_model_runner`` but overwrites the text of
    the "juror vote generator" reply with plain prose; the trial is expected
    to raise ``RequiredModelError`` with a message matching "invalid JSON".
    """

    def malformed_jury_runner(**kwargs):
        outcome = fake_model_runner(**kwargs)
        if kwargs["role"] != "juror vote generator":
            return outcome
        # Replace the structured jury payload with unparseable prose.
        outcome.text = "the jury refuses structured output"
        return outcome

    with pytest.raises(RequiredModelError, match="invalid JSON"):
        run_trial(TrialRequest(case_id="socrates"), model_runner=malformed_jury_runner)
def test_live_search_stops_when_query_is_weak():
    """Check that a "live" case with the query "x" raises ``RuntimeError``.

    The error message is expected to match "no fallback case".
    """
    with pytest.raises(RuntimeError, match="no fallback case"):
        # Build the request inside the context so any error raised while
        # constructing it is still subject to the same expectation.
        weak_request = TrialRequest(case_id="live", search_query="x")
        run_trial(weak_request, model_runner=fake_model_runner)