Spaces:
Sleeping
Sleeping
Sync JudgeGPT app from GitHub
#5
by AliIqbal05 - opened
- .env.example +3 -0
- .gitignore +10 -10
- README.md +173 -105
- app.py +0 -0
- assets/ATTRIBUTION.md +10 -10
- assets/audio/ATTRIBUTION.md +51 -51
- assets/book/README.md +14 -14
- data/README.md +5 -5
- data/agent_trace_sample.json +23 -23
- modal_app.py +212 -193
- requirements.txt +7 -7
- sovereign_bench/__init__.py +6 -6
- sovereign_bench/cases.py +274 -141
- sovereign_bench/engine.py +258 -194
- sovereign_bench/export.py +35 -35
- sovereign_bench/llm.py +296 -209
- sovereign_bench/models.py +88 -86
- sovereign_bench/retrieval.py +70 -70
- tests/test_cases.py +16 -8
- tests/test_engine.py +326 -149
- tests/test_ui_rendering.py +578 -283
.env.example
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Local-only secrets. Do not commit real values.
|
| 2 |
+
HF_TOKEN=
|
| 3 |
+
MODAL_TRIAL_URL=https://ali-j-iqbal24--trial-stream.modal.run
|
.gitignore
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
-
.env
|
| 2 |
-
.env.*
|
| 3 |
-
!.env.example
|
| 4 |
-
__pycache__/
|
| 5 |
-
*.py[cod]
|
| 6 |
-
.venv/
|
| 7 |
-
venv/
|
| 8 |
-
.modal.toml
|
| 9 |
-
.cache/
|
| 10 |
-
artifacts/
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
.env.*
|
| 3 |
+
!.env.example
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.py[cod]
|
| 6 |
+
.venv/
|
| 7 |
+
venv/
|
| 8 |
+
.modal.toml
|
| 9 |
+
.cache/
|
| 10 |
+
artifacts/
|
README.md
CHANGED
|
@@ -1,105 +1,173 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Judge-GPT
|
| 3 |
-
emoji: ⚖️
|
| 4 |
-
colorFrom: yellow
|
| 5 |
-
colorTo: red
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 6.17.3
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
license: mit
|
| 11 |
-
short_description: AI-native miniature trials under 32B.
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
-
|
| 30 |
-
-
|
| 31 |
-
-
|
| 32 |
-
-
|
| 33 |
-
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
``
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
``
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
- `
|
| 92 |
-
- `sovereign_bench/
|
| 93 |
-
- `
|
| 94 |
-
- `
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
```
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Judge-GPT
|
| 3 |
+
emoji: ⚖️
|
| 4 |
+
colorFrom: yellow
|
| 5 |
+
colorTo: red
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 6.17.3
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
short_description: AI-native miniature trials under 32B.
|
| 12 |
+
tags:
|
| 13 |
+
- track:wood
|
| 14 |
+
- sponsor:openai
|
| 15 |
+
- sponsor:nvidia
|
| 16 |
+
- sponsor:modal
|
| 17 |
+
- achievement:offbrand
|
| 18 |
+
- achievement:fieldnotes
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
# Judge-GPT
|
| 22 |
+
|
| 23 |
+
Judge-GPT is a cinematic Gradio courtroom for the Build Small Hackathon's Thousand Token Wood track. It turns a compact evidence packet into a two-minute AI-native trial: a clerk opens the docket, two lawyers argue opposite sides, Marcus Aurelius presides, six fixed-perspective jurors vote, and the court seals a verdict.
|
| 24 |
+
|
| 25 |
+
The point is not legal advice. It is a small-model theater for structured disagreement: evidence is visible, roles are constrained, hidden reasoning is stripped, and every trial leaves a trace of which agent said what.
|
| 26 |
+
|
| 27 |
+
## Submission Links
|
| 28 |
+
|
| 29 |
+
- Hugging Face Space: https://huggingface.co/spaces/build-small-hackathon/JudgeGPT
|
| 30 |
+
- Demo video: https://drive.google.com/drive/folders/10pWJ7NVCsnVV7wOlqm4MGWg4Kmh4rMY2?usp=sharing
|
| 31 |
+
- Social post: TODO paste final public social post URL
|
| 32 |
+
- GitHub repo: https://github.com/aliiqbal24/BuildSmallfinal
|
| 33 |
+
- Field guide validator: https://build-small-hackathon-field-guide.hf.space/submit
|
| 34 |
+
|
| 35 |
+
## What Judges Should Try
|
| 36 |
+
|
| 37 |
+
1. Open the Space and keep the default `Trial of Socrates`.
|
| 38 |
+
2. Click `Begin Trial`.
|
| 39 |
+
3. Watch the courtroom progress from intake to verdict.
|
| 40 |
+
4. Hover the judge, clerk, lawyers, and jurors to inspect model/agent threads.
|
| 41 |
+
5. Open the `Evidence Drawer` and `Juror Panel` tabs after the verdict.
|
| 42 |
+
6. Try `Greg Heffley vs Mom` for a lighter family-court case.
|
| 43 |
+
7. Try `Custom` to write a short dispute and up to three pieces of evidence per side directly into the docket book.
|
| 44 |
+
|
| 45 |
+
## Why It Fits Build Small
|
| 46 |
+
|
| 47 |
+
- **Thousand Token Wood:** the app is whimsical, theatrical, and AI-native rather than a generic chatbot.
|
| 48 |
+
- **Best Use of Codex:** Codex was used throughout implementation, debugging, UI iteration, tests, and commit prep in the connected GitHub repo.
|
| 49 |
+
- **Nemotron Hardware Prize:** Nemotron is a core runtime model for the jury and juror vote generation.
|
| 50 |
+
- **Best Use of Modal:** the Gradio Space delegates live model inference to a Modal GPU streaming endpoint.
|
| 51 |
+
- **Off-Brand:** the UI pushes past stock Gradio with a custom courtroom, animated puppets, docket book, evidence props, audio cues, and verdict staging.
|
| 52 |
+
- **Field Notes:** this README documents the build idea, model choices, runtime architecture, limitations, and submission checklist.
|
| 53 |
+
|
| 54 |
+
## Small-Model Budget
|
| 55 |
+
|
| 56 |
+
Every named model is under the 32B parameter cap.
|
| 57 |
+
|
| 58 |
+
| Role | Model | Budgeted size | Used for |
|
| 59 |
+
| --- | --- | ---: | --- |
|
| 60 |
+
| Presiding advocate | `openai/gpt-oss-20b` | 20B | Judge, claimant lawyer, respondent lawyer, verdict voice |
|
| 61 |
+
| Clerk of style | `openbmb/AgentCPM-Explore` | 4B | Clerk/stage voice |
|
| 62 |
+
| Jury ring | `nvidia/Nemotron-Orchestrator-8B` | 8B | Jury panel and six juror votes |
|
| 63 |
+
|
| 64 |
+
Displayed aggregate budget: 32B. The app does not use a model above 32B.
|
| 65 |
+
|
| 66 |
+
## How It Works
|
| 67 |
+
|
| 68 |
+
Judge-GPT runs a deterministic courtroom sequence over a `CasePacket`:
|
| 69 |
+
|
| 70 |
+
1. Clerk opens the docket.
|
| 71 |
+
2. Judge frames the dispute.
|
| 72 |
+
3. Mike OSS argues for the claimant.
|
| 73 |
+
4. Harvey Vector argues for the respondent.
|
| 74 |
+
5. The evidence record is displayed without adding a third lawyer.
|
| 75 |
+
6. The judge asks a hinge question.
|
| 76 |
+
7. Each lawyer answers from their side.
|
| 77 |
+
8. Nemotron Jury retires the panel.
|
| 78 |
+
9. Six named jurors vote from distinct worldviews.
|
| 79 |
+
10. The judge announces the final verdict.
|
| 80 |
+
|
| 81 |
+
The shipped demo cases are:
|
| 82 |
+
|
| 83 |
+
- `The Polis v. Socrates`
|
| 84 |
+
- `Greg Heffley v. Mom`
|
| 85 |
+
- `Custom`, built from the docket-book fields in the UI
|
| 86 |
+
|
| 87 |
+
## Runtime Architecture
|
| 88 |
+
|
| 89 |
+
- `app.py` renders the Gradio UI, courtroom HTML/CSS, audio hooks, case preview book, and live event stream.
|
| 90 |
+
- `sovereign_bench/engine.py` orchestrates trial phases, model calls, evidence events, jury votes, verdict assembly, and trace metadata.
|
| 91 |
+
- `sovereign_bench/llm.py` builds role prompts, calls Hugging Face-compatible chat models, and rejects hidden reasoning or instruction echoes.
|
| 92 |
+
- `sovereign_bench/cases.py` contains the cached demo case packets.
|
| 93 |
+
- `modal_app.py` hosts the GPU-backed streaming endpoint used by the Space.
|
| 94 |
+
- `tests/` contains engine, case, and rendering regression tests.
|
| 95 |
+
|
| 96 |
+
The Gradio app uses `MODAL_TRIAL_URL` when set, otherwise it uses the built-in deployed Modal endpoint. The Modal app owns the Hugging Face token through a Modal secret named `huggingface`; no real credentials are committed.
|
| 97 |
+
|
| 98 |
+
## Run Locally
|
| 99 |
+
|
| 100 |
+
```powershell
|
| 101 |
+
python -m pip install -r requirements.txt
|
| 102 |
+
python app.py
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
Open:
|
| 106 |
+
|
| 107 |
+
```text
|
| 108 |
+
http://127.0.0.1:7860
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
## Deploy Modal Backend
|
| 112 |
+
|
| 113 |
+
```powershell
|
| 114 |
+
python -m modal deploy modal_app.py
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
After deployment, pre-warm every configured courtroom model in the deployed `sovereign-bench` app so the first trial does not wait for all GPU containers to cold start. Run this after each deploy because deployments reset Modal autoscaler overrides:
|
| 118 |
+
|
| 119 |
+
```powershell
|
| 120 |
+
python -m modal run modal_app.py::warm_models
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
If the endpoint changes, set the Hugging Face Space variable:
|
| 124 |
+
|
| 125 |
+
```text
|
| 126 |
+
MODAL_TRIAL_URL=https://your-modal-endpoint.example
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
## Deploy Hugging Face Space
|
| 130 |
+
|
| 131 |
+
Create or upload this repo as a Gradio Space inside the official Build Small org:
|
| 132 |
+
|
| 133 |
+
```text
|
| 134 |
+
build-small-hackathon/<your-space-name>
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
Space settings:
|
| 138 |
+
|
| 139 |
+
- SDK: Gradio
|
| 140 |
+
- App file: `app.py`
|
| 141 |
+
- Python requirements: `requirements.txt`
|
| 142 |
+
- Optional variable: `MODAL_TRIAL_URL`
|
| 143 |
+
- No Space secret is required if using the hosted Modal endpoint.
|
| 144 |
+
|
| 145 |
+
## Verification
|
| 146 |
+
|
| 147 |
+
```powershell
|
| 148 |
+
python -m pytest
|
| 149 |
+
```
|
| 150 |
+
|
| 151 |
+
Focused checks used during final prep:
|
| 152 |
+
|
| 153 |
+
```powershell
|
| 154 |
+
python -m pytest tests/test_engine.py tests/test_ui_rendering.py
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
## Limitations
|
| 158 |
+
|
| 159 |
+
- Judge-GPT is not legal advice and should not be used for real legal decisions.
|
| 160 |
+
- The demo packets are compact, staged evidence packets, not exhaustive source research.
|
| 161 |
+
- Model, Modal, or retrieval failures stop the current trial instead of substituting fake dialogue.
|
| 162 |
+
- Trial results are not persisted across sessions.
|
| 163 |
+
- Custom trials require a short case context and evidence from both sides.
|
| 164 |
+
|
| 165 |
+
## Final Submission Checklist
|
| 166 |
+
|
| 167 |
+
- [ ] Push the repo to the Build Small Hugging Face org as a Gradio Space.
|
| 168 |
+
- [ ] Confirm the Space launches and can complete `Trial of Socrates`.
|
| 169 |
+
- [ ] Record a short demo video showing the trial flow and verdict.
|
| 170 |
+
- [ ] Confirm the `Demo video` link above is the final public URL.
|
| 171 |
+
- [ ] Publish one social post about the app.
|
| 172 |
+
- [ ] Replace the `Social post` TODO above with the final public URL.
|
| 173 |
+
- [ ] Run the README through the Build Small validator.
|
app.py
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
assets/ATTRIBUTION.md
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
-
# Asset Attribution
|
| 2 |
-
|
| 3 |
-
## `courtroom-dickinson.jpg`
|
| 4 |
-
|
| 5 |
-
- Source: https://commons.wikimedia.org/wiki/File:Dickinson_Law_Courtroom.jpg
|
| 6 |
-
- Description: Penn State University, Dickinson School of Law courtroom
|
| 7 |
-
- Author: Jeremy Hess Photography
|
| 8 |
-
- License: Creative Commons CC0 1.0 Universal Public Domain Dedication
|
| 9 |
-
- Local use: cinematic courtroom background for Sovereign Bench
|
| 10 |
-
|
|
|
|
| 1 |
+
# Asset Attribution
|
| 2 |
+
|
| 3 |
+
## `courtroom-dickinson.jpg`
|
| 4 |
+
|
| 5 |
+
- Source: https://commons.wikimedia.org/wiki/File:Dickinson_Law_Courtroom.jpg
|
| 6 |
+
- Description: Penn State University, Dickinson School of Law courtroom
|
| 7 |
+
- Author: Jeremy Hess Photography
|
| 8 |
+
- License: Creative Commons CC0 1.0 Universal Public Domain Dedication
|
| 9 |
+
- Local use: cinematic courtroom background for Sovereign Bench
|
| 10 |
+
|
assets/audio/ATTRIBUTION.md
CHANGED
|
@@ -1,51 +1,51 @@
|
|
| 1 |
-
# Audio Attribution
|
| 2 |
-
|
| 3 |
-
All selected audio is stored locally in `assets/audio/` for the animated courtroom episode.
|
| 4 |
-
|
| 5 |
-
## Courtroom score and judgement sting
|
| 6 |
-
|
| 7 |
-
- Files: `courtroom.ogg`, `Judgement.ogg`
|
| 8 |
-
- Source: OpenGameArt, "Courtroom and Judgement"
|
| 9 |
-
- Author: Spring Spring
|
| 10 |
-
- License: CC0
|
| 11 |
-
- URL: https://opengameart.org/content/courtroom-and-judgement
|
| 12 |
-
|
| 13 |
-
## Courtroom chatter and crowd reaction
|
| 14 |
-
|
| 15 |
-
- File: `crowd_shouting.ogg`
|
| 16 |
-
- Source: OpenGameArt, "Crowd Shouting/Speaking Ambience"
|
| 17 |
-
- Author: StarNinjas
|
| 18 |
-
- License: CC0
|
| 19 |
-
- URL: https://opengameart.org/content/crowd-shoutingspeaking-ambience
|
| 20 |
-
|
| 21 |
-
## Gavel and wood hits
|
| 22 |
-
|
| 23 |
-
- Files: `wood_hammer_01.ogg`, `wood_hit_03.ogg`
|
| 24 |
-
- Source: OpenGameArt, "100 CC0 metal and wood SFX"
|
| 25 |
-
- Author: rubberduck
|
| 26 |
-
- License: CC0
|
| 27 |
-
- URL: https://opengameart.org/content/100-cc0-metal-and-wood-sfx
|
| 28 |
-
|
| 29 |
-
## Lawyer footsteps
|
| 30 |
-
|
| 31 |
-
- File: `steps_in_wood_floor.wav`
|
| 32 |
-
- Source: OpenGameArt, "Steps in wood floor"
|
| 33 |
-
- Author: mikeask
|
| 34 |
-
- License: CC0
|
| 35 |
-
- URL: https://opengameart.org/content/steps-in-wood-floor
|
| 36 |
-
|
| 37 |
-
## Book and paper movement
|
| 38 |
-
|
| 39 |
-
- Files: `paper_sound_1.mp3`, `paper_sound_4.mp3`
|
| 40 |
-
- Source: OpenGameArt, "Various Paper Sound Effects"
|
| 41 |
-
- Author: Luckius
|
| 42 |
-
- License: CC0
|
| 43 |
-
- URL: https://opengameart.org/content/various-paper-sound-effects
|
| 44 |
-
|
| 45 |
-
## Docket selection UI cue
|
| 46 |
-
|
| 47 |
-
- File: `select_001.ogg`
|
| 48 |
-
- Source: OpenGameArt, "Interface Sounds"
|
| 49 |
-
- Author: Kenney
|
| 50 |
-
- License: CC0
|
| 51 |
-
- URL: https://opengameart.org/content/interface-sounds
|
|
|
|
| 1 |
+
# Audio Attribution
|
| 2 |
+
|
| 3 |
+
All selected audio is stored locally in `assets/audio/` for the animated courtroom episode.
|
| 4 |
+
|
| 5 |
+
## Courtroom score and judgement sting
|
| 6 |
+
|
| 7 |
+
- Files: `courtroom.ogg`, `Judgement.ogg`
|
| 8 |
+
- Source: OpenGameArt, "Courtroom and Judgement"
|
| 9 |
+
- Author: Spring Spring
|
| 10 |
+
- License: CC0
|
| 11 |
+
- URL: https://opengameart.org/content/courtroom-and-judgement
|
| 12 |
+
|
| 13 |
+
## Courtroom chatter and crowd reaction
|
| 14 |
+
|
| 15 |
+
- File: `crowd_shouting.ogg`
|
| 16 |
+
- Source: OpenGameArt, "Crowd Shouting/Speaking Ambience"
|
| 17 |
+
- Author: StarNinjas
|
| 18 |
+
- License: CC0
|
| 19 |
+
- URL: https://opengameart.org/content/crowd-shoutingspeaking-ambience
|
| 20 |
+
|
| 21 |
+
## Gavel and wood hits
|
| 22 |
+
|
| 23 |
+
- Files: `wood_hammer_01.ogg`, `wood_hit_03.ogg`
|
| 24 |
+
- Source: OpenGameArt, "100 CC0 metal and wood SFX"
|
| 25 |
+
- Author: rubberduck
|
| 26 |
+
- License: CC0
|
| 27 |
+
- URL: https://opengameart.org/content/100-cc0-metal-and-wood-sfx
|
| 28 |
+
|
| 29 |
+
## Lawyer footsteps
|
| 30 |
+
|
| 31 |
+
- File: `steps_in_wood_floor.wav`
|
| 32 |
+
- Source: OpenGameArt, "Steps in wood floor"
|
| 33 |
+
- Author: mikeask
|
| 34 |
+
- License: CC0
|
| 35 |
+
- URL: https://opengameart.org/content/steps-in-wood-floor
|
| 36 |
+
|
| 37 |
+
## Book and paper movement
|
| 38 |
+
|
| 39 |
+
- Files: `paper_sound_1.mp3`, `paper_sound_4.mp3`
|
| 40 |
+
- Source: OpenGameArt, "Various Paper Sound Effects"
|
| 41 |
+
- Author: Luckius
|
| 42 |
+
- License: CC0
|
| 43 |
+
- URL: https://opengameart.org/content/various-paper-sound-effects
|
| 44 |
+
|
| 45 |
+
## Docket selection UI cue
|
| 46 |
+
|
| 47 |
+
- File: `select_001.ogg`
|
| 48 |
+
- Source: OpenGameArt, "Interface Sounds"
|
| 49 |
+
- Author: Kenney
|
| 50 |
+
- License: CC0
|
| 51 |
+
- URL: https://opengameart.org/content/interface-sounds
|
assets/book/README.md
CHANGED
|
@@ -1,14 +1,14 @@
|
|
| 1 |
-
# Docket Book Assets
|
| 2 |
-
|
| 3 |
-
These project-bound UI prop assets were generated with the built-in Codex image generation tool, then processed locally from a chroma-key background to transparent PNGs.
|
| 4 |
-
|
| 5 |
-
- `docket-book-open.png`: open docket book used before the trial starts.
|
| 6 |
-
- `docket-book-closed.png`: closed docket book used after the trial begins.
|
| 7 |
-
- `docket-book-open-keyed.png`: preserved chroma-key source.
|
| 8 |
-
- `docket-book-closed-keyed.png`: preserved chroma-key source.
|
| 9 |
-
|
| 10 |
-
Generation prompt summary:
|
| 11 |
-
|
| 12 |
-
- Antique legal docket book, warm parchment or dark leather, gold corner protectors, polished painterly game UI prop, centered with generous padding.
|
| 13 |
-
- No text, no logos, no watermark, no hands, no pen.
|
| 14 |
-
- Generated on a flat `#00ff00` background for local alpha extraction.
|
|
|
|
| 1 |
+
# Docket Book Assets
|
| 2 |
+
|
| 3 |
+
These project-bound UI prop assets were generated with the built-in Codex image generation tool, then processed locally from a chroma-key background to transparent PNGs.
|
| 4 |
+
|
| 5 |
+
- `docket-book-open.png`: open docket book used before the trial starts.
|
| 6 |
+
- `docket-book-closed.png`: closed docket book used after the trial begins.
|
| 7 |
+
- `docket-book-open-keyed.png`: preserved chroma-key source.
|
| 8 |
+
- `docket-book-closed-keyed.png`: preserved chroma-key source.
|
| 9 |
+
|
| 10 |
+
Generation prompt summary:
|
| 11 |
+
|
| 12 |
+
- Antique legal docket book, warm parchment or dark leather, gold corner protectors, polished painterly game UI prop, centered with generous padding.
|
| 13 |
+
- No text, no logos, no watermark, no hands, no pen.
|
| 14 |
+
- Generated on a flat `#00ff00` background for local alpha extraction.
|
data/README.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
# Sovereign Bench Agent Trace Sample
|
| 2 |
-
|
| 3 |
-
This sample contains compact phase-level trace rows from the cached Barnaby Buttons trial. Runtime traces exported by the Gradio app include the full structured `TrialEvent` objects.
|
| 4 |
-
|
| 5 |
-
The trace is synthetic and intended for hackathon demonstration, reproducibility, and UI testing.
|
|
|
|
| 1 |
+
# Sovereign Bench Agent Trace Sample
|
| 2 |
+
|
| 3 |
+
This sample contains compact phase-level trace rows from the cached Barnaby Buttons trial. Runtime traces exported by the Gradio app include the full structured `TrialEvent` objects.
|
| 4 |
+
|
| 5 |
+
The trace is synthetic and intended for hackathon demonstration, reproducibility, and UI testing.
|
data/agent_trace_sample.json
CHANGED
|
@@ -1,23 +1,23 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"phase": "intake",
|
| 4 |
-
"case_id": "barnaby",
|
| 5 |
-
"agent": "Clerk Meridian",
|
| 6 |
-
"model": "openbmb/AgentCPM-Explore",
|
| 7 |
-
"summary": "Opened The People v. Barnaby Buttons and recorded source provenance for cached demo reliability."
|
| 8 |
-
},
|
| 9 |
-
{
|
| 10 |
-
"phase": "evidence",
|
| 11 |
-
"case_id": "barnaby",
|
| 12 |
-
"agent": "Auditor Prism",
|
| 13 |
-
"model": "nvidia/Nemotron-Orchestrator-8B",
|
| 14 |
-
"summary": "Scored ledger ink, crumb trail, calendar motive, and biscuit alibi as directional evidence with uncertainty."
|
| 15 |
-
},
|
| 16 |
-
{
|
| 17 |
-
"phase": "verdict",
|
| 18 |
-
"case_id": "barnaby",
|
| 19 |
-
"agent": "Marcus Aurelius",
|
| 20 |
-
"model": "openai/gpt-oss-20b",
|
| 21 |
-
"summary": "Issued a narrow claimant finding with cited evidence IDs and an explicit uncertainty warning."
|
| 22 |
-
}
|
| 23 |
-
]
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"phase": "intake",
|
| 4 |
+
"case_id": "barnaby",
|
| 5 |
+
"agent": "Clerk Meridian",
|
| 6 |
+
"model": "openbmb/AgentCPM-Explore",
|
| 7 |
+
"summary": "Opened The People v. Barnaby Buttons and recorded source provenance for cached demo reliability."
|
| 8 |
+
},
|
| 9 |
+
{
|
| 10 |
+
"phase": "evidence",
|
| 11 |
+
"case_id": "barnaby",
|
| 12 |
+
"agent": "Auditor Prism",
|
| 13 |
+
"model": "nvidia/Nemotron-Orchestrator-8B",
|
| 14 |
+
"summary": "Scored ledger ink, crumb trail, calendar motive, and biscuit alibi as directional evidence with uncertainty."
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"phase": "verdict",
|
| 18 |
+
"case_id": "barnaby",
|
| 19 |
+
"agent": "Marcus Aurelius",
|
| 20 |
+
"model": "openai/gpt-oss-20b",
|
| 21 |
+
"summary": "Issued a narrow claimant finding with cited evidence IDs and an explicit uncertainty warning."
|
| 22 |
+
}
|
| 23 |
+
]
|
modal_app.py
CHANGED
|
@@ -1,193 +1,212 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import time
|
| 3 |
-
|
| 4 |
-
import modal
|
| 5 |
-
|
| 6 |
-
from sovereign_bench.engine import stream_trial_jsonl
|
| 7 |
-
from sovereign_bench.llm import (
|
| 8 |
-
ModelCall,
|
| 9 |
-
ModelResult,
|
| 10 |
-
build_role_messages,
|
| 11 |
-
messages_hash,
|
| 12 |
-
)
|
| 13 |
-
from sovereign_bench.models import TrialRequest
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
.
|
| 24 |
-
)
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
.
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
"
|
| 35 |
-
"
|
| 36 |
-
"
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
"
|
| 43 |
-
"
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
)
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
|
| 4 |
+
import modal
|
| 5 |
+
|
| 6 |
+
from sovereign_bench.engine import MODEL_BUDGET, stream_trial_jsonl
|
| 7 |
+
from sovereign_bench.llm import (
|
| 8 |
+
ModelCall,
|
| 9 |
+
ModelResult,
|
| 10 |
+
build_role_messages,
|
| 11 |
+
messages_hash,
|
| 12 |
+
)
|
| 13 |
+
from sovereign_bench.models import TrialRequest
|
| 14 |
+
|
| 15 |
+
# Name of the deployed Modal app; warm_models() looks the app up by this name.
MODAL_APP_NAME = "sovereign-bench"
app = modal.App(MODAL_APP_NAME)

# GPU type requested for the vLLM class and the timeout (in seconds) passed to Modal.
GPU_NAME = "H100"
GPU_TIMEOUT_SECONDS = 20 * 60

# Used both as HF_HOME inside the vLLM image and as the mount point of the cache volume.
HF_CACHE_DIR = "/root/.cache/huggingface"

# Model ids taken from MODEL_BUDGET, de-duplicated while keeping first-seen order.
USED_MODEL_IDS = tuple(dict.fromkeys(model_id for _, model_id, _ in MODEL_BUDGET))

# Lightweight CPU image for the web endpoint and the Hugging Face connection check.
image = (
    modal.Image.debian_slim(python_version="3.12")
    .pip_install(
        "fastapi",
        "huggingface_hub",
        "httpx",
        "pydantic",
    )
    .add_local_dir("sovereign_bench", remote_path="/root/sovereign_bench")
)

# Named volume mounted at HF_CACHE_DIR by the GPU class.
model_cache = modal.Volume.from_name("sovereign-bench-model-cache", create_if_missing=True)

# CUDA image with vLLM installed, used by the GPU-backed model class.
vllm_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.1-devel-ubuntu22.04", add_python="3.12")
    .entrypoint([])
    .uv_pip_install(
        "vllm==0.18.1",
        "huggingface_hub[hf_transfer]==0.36.0",
        "transformers",
        "httpx",
        "pydantic",
    )
    .env(
        dict(
            HF_HUB_ENABLE_HF_TRANSFER="1",
            HF_HOME=HF_CACHE_DIR,
            VLLM_WORKER_MULTIPROC_METHOD="spawn",
            VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8="1",
        )
    )
    .add_local_dir("sovereign_bench", remote_path="/root/sovereign_bench")
)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@app.cls(
    image=vllm_image,
    gpu=GPU_NAME,
    secrets=[modal.Secret.from_name("huggingface")],
    volumes={HF_CACHE_DIR: model_cache},
    timeout=GPU_TIMEOUT_SECONDS,
    scaledown_window=10 * 60,
    max_containers=3,
)
class VllmModel:
    """Modal class that loads one vLLM model and answers chat requests with it."""

    # Model id to load; supplied per instance as a Modal parameter.
    model_id: str = modal.parameter()

    @modal.enter()
    def load(self) -> None:
        """Create the vLLM engine for ``self.model_id`` on container entry."""
        from vllm import LLM, SamplingParams

        # Keep the class on the instance so generate() does not re-import vllm.
        self.SamplingParams = SamplingParams
        self.llm = LLM(
            model=self.model_id,
            trust_remote_code=True,
            max_model_len=4096,
            gpu_memory_utilization=0.9,
        )

    @modal.method()
    def generate(self, payload: dict) -> dict:
        """Run one chat completion and return the cleaned text with its latency.

        Args:
            payload: Mapping with a required ``"messages"`` list and optional
                ``"max_tokens"`` (default 120) and ``"temperature"``
                (default 0.45) entries.

        Returns:
            A dict with ``"text"`` (output of ``clean_model_text``) and
            ``"latency_ms"`` (wall-clock milliseconds spent in this call).

        Raises:
            ModelCallError: If no attempt produced non-empty cleaned text and
                ``clean_model_text`` rejected at least one attempt.
        """
        from sovereign_bench.llm import ModelCallError, clean_model_text

        started = time.perf_counter()
        messages = payload["messages"]
        # A token budget of 0 is not usable, so any falsy value falls back to 120.
        max_tokens = int(payload.get("max_tokens") or 120)
        # Only a missing/blank temperature falls back to the default. An
        # explicit 0 must be honored rather than silently replaced by 0.45.
        requested_temperature = payload.get("temperature")
        if requested_temperature is None or requested_temperature == "":
            temperature = 0.45
        else:
            temperature = float(requested_temperature)
        sampling_params = self.SamplingParams(
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.9,
        )
        # Second attempt: same conversation plus a nudge to emit only the final answer.
        retry_messages = messages + [
            {
                "role": "user",
                "content": (
                    "Your previous response did not include visible courtroom dialogue. "
                    "Return only the final answer now. Do not mention prompts, tasks, requirements, or that you are following instructions. "
                    "Do not include <think>, analysis, reasoning, markdown, narration, or notes. /no_think"
                ),
            }
        ]
        last_error: Exception | None = None
        text = ""
        for attempt_messages in (messages, retry_messages):
            outputs = self.llm.chat(
                [attempt_messages],
                sampling_params=sampling_params,
                use_tqdm=False,
                chat_template_kwargs={"enable_thinking": False},
            )
            raw_text = outputs[0].outputs[0].text.strip()
            try:
                text = clean_model_text(raw_text)
                break
            except ModelCallError as exc:
                # Remember the rejection and fall through to the retry prompt.
                last_error = exc
        if not text and last_error:
            raise last_error
        return {
            "text": text,
            "latency_ms": int((time.perf_counter() - started) * 1000),
        }

    @modal.method()
    def warm(self) -> dict:
        """Return a small status dict; calling it forces the container (and model) to start."""
        return {"model": self.model_id, "status": "warm"}
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def modal_gpu_enabled() -> bool:
    """Return whether Modal GPU inference is enabled.

    GPU inference is on unless the ``SOVEREIGN_DISABLE_MODAL_GPU`` environment
    variable is set to ``1``, ``true`` or ``yes``. The comparison ignores case
    and surrounding whitespace, so a value with a trailing newline or padding
    spaces still disables the GPU path.
    """
    flag = os.getenv("SOVEREIGN_DISABLE_MODAL_GPU", "").strip().lower()
    return flag not in {"1", "true", "yes"}
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def modal_gpu_runner(**kwargs) -> ModelResult:
    """Build the role prompt and run it on the Modal GPU vLLM class.

    Required keyword arguments: ``agent``, ``role``, ``case_summary``,
    ``task``, ``evidence_summary`` and ``model``. Optional: ``trial_history``,
    ``persona``, ``objective`` (each defaulting to ``""``) and ``max_tokens``
    (default 120).

    Returns:
        A ``ModelResult`` holding the generated text, a flattened transcript
        of the prompt, and a ``ModelCall`` record describing the invocation.

    Raises:
        RuntimeError: If the Modal GPU path is disabled; there is no fallback provider.
    """
    messages = build_role_messages(
        agent=kwargs["agent"],
        role=kwargs["role"],
        case_summary=kwargs["case_summary"],
        task=kwargs["task"],
        evidence_summary=kwargs["evidence_summary"],
        trial_history=kwargs.get("trial_history", ""),
        persona=kwargs.get("persona", ""),
        objective=kwargs.get("objective", ""),
    )
    requested_model = kwargs["model"]
    prompt_hash = messages_hash(messages)

    if not modal_gpu_enabled():
        raise RuntimeError("Modal GPU is disabled; no provider fallback is allowed.")

    request_payload = {
        "messages": messages,
        "max_tokens": kwargs.get("max_tokens", 120),
        "temperature": 0.45,
    }
    output = VllmModel(model_id=requested_model).generate.remote(request_payload)

    # Flatten the chat messages into "ROLE:\ncontent" sections for the trace.
    transcript = "\n\n".join(
        f"{message.get('role', 'user').upper()}:\n{message.get('content', '')}" for message in messages
    )
    call_record = ModelCall(
        model=requested_model,
        provider="modal-gpu-vllm",
        ok=True,
        latency_ms=output["latency_ms"],
        prompt_hash=prompt_hash,
        requested_model=requested_model,
        runtime="modal-gpu-vllm",
        gpu=GPU_NAME,
    )
    return ModelResult(
        text=output["text"],
        input_text=transcript + "\n\nASSISTANT:\n",
        call=call_record,
    )
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
@app.function(image=image, secrets=[modal.Secret.from_name("huggingface")])
def check_huggingface_connection() -> str:
    """Report whether ``HF_TOKEN`` is present and which account it belongs to.

    Returns:
        A human-readable status line: either the connected account name or a
        notice that the token is not available.
    """
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        # Imported lazily so the module can be loaded without huggingface_hub.
        from huggingface_hub import HfApi

        account = HfApi(token=hf_token).whoami()
        return f"Connected to Hugging Face as {account['name']}."
    return "HF_TOKEN is not available inside Modal."
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
@app.function(
    image=image,
    secrets=[modal.Secret.from_name("huggingface")],
    min_containers=1,
    timeout=GPU_TIMEOUT_SECONDS,
)
@modal.fastapi_endpoint(method="POST", label="trial-stream")
def trial_stream(payload: dict):
    """POST endpoint that validates a trial request and streams it as NDJSON.

    Args:
        payload: Request body, validated into a ``TrialRequest``.

    Returns:
        A ``StreamingResponse`` with media type ``application/x-ndjson`` fed
        by ``stream_trial_jsonl``.
    """
    from fastapi.responses import StreamingResponse

    request = TrialRequest.model_validate(payload)
    # Delay handed to stream_trial_jsonl for each supported speed setting.
    pacing = {"swift": 0.02, "measured": 0.12, "ceremonial": 0.25}
    event_lines = stream_trial_jsonl(
        request,
        delay=pacing[request.speed],
        model_runner=modal_gpu_runner,
    )
    return StreamingResponse(event_lines, media_type="application/x-ndjson")
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
@app.local_entrypoint()
def main():
    """Local entrypoint: run the Hugging Face connection check remotely and print its result."""
    status = check_huggingface_connection.remote()
    print(status)
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
@app.local_entrypoint()
def warm_models():
    """Local entrypoint: keep one container warm for every model in ``USED_MODEL_IDS``.

    For each model id, looks up the deployed ``VllmModel`` class, sets its
    autoscaler to ``min_containers=1``, then calls ``warm`` and prints the reply.
    """
    vllm_cls = modal.Cls.from_name(MODAL_APP_NAME, "VllmModel")
    for model_id in USED_MODEL_IDS:
        instance = vllm_cls(model_id=model_id)
        instance.update_autoscaler(min_containers=1)
        warm_status = instance.warm.remote()
        print(warm_status)
|
requirements.txt
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
-
gradio
|
| 2 |
-
huggingface_hub
|
| 3 |
-
httpx
|
| 4 |
-
modal
|
| 5 |
-
pydantic
|
| 6 |
-
pytest
|
| 7 |
-
python-dotenv
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
huggingface_hub
|
| 3 |
+
httpx
|
| 4 |
+
modal
|
| 5 |
+
pydantic
|
| 6 |
+
pytest
|
| 7 |
+
python-dotenv
|
sovereign_bench/__init__.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
"""Sovereign Bench trial engine package."""
|
| 2 |
-
|
| 3 |
-
from .engine import run_trial, stream_trial
|
| 4 |
-
from .models import TrialRequest
|
| 5 |
-
|
| 6 |
-
__all__ = ["TrialRequest", "run_trial", "stream_trial"]
|
|
|
|
| 1 |
+
"""Sovereign Bench trial engine package."""
|
| 2 |
+
|
| 3 |
+
from .engine import run_trial, stream_trial
|
| 4 |
+
from .models import TrialRequest
|
| 5 |
+
|
| 6 |
+
__all__ = ["TrialRequest", "run_trial", "stream_trial"]
|
sovereign_bench/cases.py
CHANGED
|
@@ -1,141 +1,274 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
from .models import CasePacket, EvidenceItem
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
SOCRATES = CasePacket(
|
| 7 |
-
id="socrates",
|
| 8 |
-
title="The Polis v. Socrates",
|
| 9 |
-
subtitle="A miniature retrial of impiety, civic anxiety, and troublesome questions.",
|
| 10 |
-
claimant="The Athenian Polis",
|
| 11 |
-
respondent="Socrates",
|
| 12 |
-
charge="Corrupting the youth and refusing the sanctioned gods of the city.",
|
| 13 |
-
setting="Athens, 399 BCE, reassembled inside a pocket tribunal.",
|
| 14 |
-
|
| 15 |
-
"
|
| 16 |
-
"and
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
"
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
"
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from .models import CasePacket, EvidenceItem
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
SOCRATES = CasePacket(
|
| 7 |
+
id="socrates",
|
| 8 |
+
title="The Polis v. Socrates",
|
| 9 |
+
subtitle="A miniature retrial of impiety, civic anxiety, and troublesome questions.",
|
| 10 |
+
claimant="The Athenian Polis",
|
| 11 |
+
respondent="Socrates",
|
| 12 |
+
charge="Corrupting the youth and refusing the sanctioned gods of the city.",
|
| 13 |
+
setting="Athens, 399 BCE, reassembled inside a pocket tribunal.",
|
| 14 |
+
context=(
|
| 15 |
+
"Athens has brought Socrates back before a civic court after years of public questioning, "
|
| 16 |
+
"youthful imitators, and anxiety about private religious claims. The city says his method "
|
| 17 |
+
"weakened civic order; Socrates says he served the public by exposing false wisdom."
|
| 18 |
+
),
|
| 19 |
+
claimant_claim=(
|
| 20 |
+
"The city argues that Socrates trained young citizens to mock public authority "
|
| 21 |
+
"and placed private daimonion guidance above civic religion."
|
| 22 |
+
),
|
| 23 |
+
respondent_claim=(
|
| 24 |
+
"Socrates answers that cross-examination was a public service, not corruption, "
|
| 25 |
+
"and that unpopular inquiry should not be confused with civic sabotage."
|
| 26 |
+
),
|
| 27 |
+
source_note=(
|
| 28 |
+
"Cached public-domain style packet derived from Plato's Apology and Crito, "
|
| 29 |
+
"Xenophon's Apology, and common historical summaries. It is not a live scholarly edition."
|
| 30 |
+
),
|
| 31 |
+
evidence=[
|
| 32 |
+
EvidenceItem(
|
| 33 |
+
id="SOC-F1",
|
| 34 |
+
title="Youthful Imitators",
|
| 35 |
+
source="Plato, Apology tradition",
|
| 36 |
+
excerpt=(
|
| 37 |
+
"Young men with leisure reportedly followed Socrates and copied his questioning, "
|
| 38 |
+
"which angered the questioned citizens."
|
| 39 |
+
),
|
| 40 |
+
supports="claimant",
|
| 41 |
+
reliability=0.68,
|
| 42 |
+
note="Supports social effect, but does not prove intentional corruption.",
|
| 43 |
+
),
|
| 44 |
+
EvidenceItem(
|
| 45 |
+
id="SOC-F2",
|
| 46 |
+
title="Public Embarrassment",
|
| 47 |
+
source="Ancient defense tradition",
|
| 48 |
+
excerpt=(
|
| 49 |
+
"Socrates describes testing reputedly wise citizens in public after hearing the "
|
| 50 |
+
"Delphic oracle report."
|
| 51 |
+
),
|
| 52 |
+
supports="claimant",
|
| 53 |
+
reliability=0.74,
|
| 54 |
+
note="Shows a repeated practice that made civic leaders look foolish.",
|
| 55 |
+
),
|
| 56 |
+
EvidenceItem(
|
| 57 |
+
id="SOC-F3",
|
| 58 |
+
title="The Daimonion Suspicion",
|
| 59 |
+
source="Ancient biographical tradition",
|
| 60 |
+
excerpt=(
|
| 61 |
+
"Socrates reports a private divine sign that restrains him from certain actions, "
|
| 62 |
+
"which civic accusers read as religious irregularity."
|
| 63 |
+
),
|
| 64 |
+
supports="claimant",
|
| 65 |
+
reliability=0.64,
|
| 66 |
+
note="Supports the impiety theory if private revelation is treated as civic defiance.",
|
| 67 |
+
),
|
| 68 |
+
EvidenceItem(
|
| 69 |
+
id="SOC-A1",
|
| 70 |
+
title="No Fee, No School",
|
| 71 |
+
source="Ancient defense tradition",
|
| 72 |
+
excerpt=(
|
| 73 |
+
"Socrates distinguishes himself from paid teachers and denies promising technical "
|
| 74 |
+
"instruction or private doctrine."
|
| 75 |
+
),
|
| 76 |
+
supports="respondent",
|
| 77 |
+
reliability=0.72,
|
| 78 |
+
note="Weakens the claim that he operated a formal corrupting academy.",
|
| 79 |
+
),
|
| 80 |
+
EvidenceItem(
|
| 81 |
+
id="SOC-A2",
|
| 82 |
+
title="Oracle as Duty",
|
| 83 |
+
source="Plato, Apology tradition",
|
| 84 |
+
excerpt=(
|
| 85 |
+
"Socrates frames his questioning as obedience to a divine puzzle rather than "
|
| 86 |
+
"contempt for religion."
|
| 87 |
+
),
|
| 88 |
+
supports="respondent",
|
| 89 |
+
reliability=0.78,
|
| 90 |
+
note="Turns the impiety charge into a competing account of piety.",
|
| 91 |
+
),
|
| 92 |
+
EvidenceItem(
|
| 93 |
+
id="SOC-A3",
|
| 94 |
+
title="Cross-Examination as Service",
|
| 95 |
+
source="Defense summary",
|
| 96 |
+
excerpt=(
|
| 97 |
+
"The defense treats uncomfortable questioning as civic improvement, not sabotage "
|
| 98 |
+
"or intentional corruption."
|
| 99 |
+
),
|
| 100 |
+
supports="respondent",
|
| 101 |
+
reliability=0.7,
|
| 102 |
+
note="Gives the jury a public-interest reason to tolerate Socrates.",
|
| 103 |
+
),
|
| 104 |
+
],
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
GREG = CasePacket(
|
| 109 |
+
id="greg",
|
| 110 |
+
title="Greg Heffley v. Mom",
|
| 111 |
+
subtitle="A family-court argument over a diary, embarrassment, and parental good intentions.",
|
| 112 |
+
claimant="Greg Heffley",
|
| 113 |
+
respondent="Susan Heffley",
|
| 114 |
+
charge="Whether Mom wrongfully saddled Greg with an embarrassing diary instead of a normal journal.",
|
| 115 |
+
setting="The Heffley house on the eve of another middle-school year.",
|
| 116 |
+
context=(
|
| 117 |
+
"Greg receives a book from his mom meant to help him record his thoughts, but he objects "
|
| 118 |
+
"that the word diary makes him look childish and vulnerable at school. Mom treats the book "
|
| 119 |
+
"as a harmless tool for reflection; Greg treats it as social evidence waiting to be used "
|
| 120 |
+
"against him."
|
| 121 |
+
),
|
| 122 |
+
claimant_claim=(
|
| 123 |
+
"Greg argues that Mom ignored the obvious social risk of handing a middle-school boy a diary "
|
| 124 |
+
"and failed to respect how easily classmates can turn an object into humiliation."
|
| 125 |
+
),
|
| 126 |
+
respondent_claim=(
|
| 127 |
+
"Mom answers that the writing book is a constructive outlet, that Greg can choose how to use it, "
|
| 128 |
+
"and that parental encouragement is not social sabotage."
|
| 129 |
+
),
|
| 130 |
+
source_note=(
|
| 131 |
+
"Cached demo packet using paraphrased context from the Diary of a Wimpy Kid setup. "
|
| 132 |
+
"No book text is quoted."
|
| 133 |
+
),
|
| 134 |
+
evidence=[
|
| 135 |
+
EvidenceItem(
|
| 136 |
+
id="GRG-F1",
|
| 137 |
+
title="The Label Problem",
|
| 138 |
+
source="Greg's objection",
|
| 139 |
+
excerpt=(
|
| 140 |
+
"Greg objects that diary is the wrong label for a middle-school boy and could be "
|
| 141 |
+
"used to mock him."
|
| 142 |
+
),
|
| 143 |
+
supports="claimant",
|
| 144 |
+
reliability=0.74,
|
| 145 |
+
note="Shows a foreseeable embarrassment risk from Greg's perspective.",
|
| 146 |
+
),
|
| 147 |
+
EvidenceItem(
|
| 148 |
+
id="GRG-F2",
|
| 149 |
+
title="Middle-School Audience",
|
| 150 |
+
source="School context",
|
| 151 |
+
excerpt=(
|
| 152 |
+
"Greg's social world rewards status and punishes anything classmates can frame "
|
| 153 |
+
"as childish."
|
| 154 |
+
),
|
| 155 |
+
supports="claimant",
|
| 156 |
+
reliability=0.7,
|
| 157 |
+
note="Makes the harm plausible even before anyone finds the book.",
|
| 158 |
+
),
|
| 159 |
+
EvidenceItem(
|
| 160 |
+
id="GRG-F3",
|
| 161 |
+
title="Ignored Preference",
|
| 162 |
+
source="Family exchange summary",
|
| 163 |
+
excerpt=(
|
| 164 |
+
"Greg wanted distance from the diary framing, but Mom treated the gift as settled."
|
| 165 |
+
),
|
| 166 |
+
supports="claimant",
|
| 167 |
+
reliability=0.66,
|
| 168 |
+
note="Supports Greg's autonomy argument, though parents often choose school supplies.",
|
| 169 |
+
),
|
| 170 |
+
EvidenceItem(
|
| 171 |
+
id="GRG-A1",
|
| 172 |
+
title="Private Writing Tool",
|
| 173 |
+
source="Mom's purpose",
|
| 174 |
+
excerpt=(
|
| 175 |
+
"Mom intended the book as a private place for Greg to record his thoughts and school year."
|
| 176 |
+
),
|
| 177 |
+
supports="respondent",
|
| 178 |
+
reliability=0.78,
|
| 179 |
+
note="Shows a constructive parental purpose rather than intent to embarrass.",
|
| 180 |
+
),
|
| 181 |
+
EvidenceItem(
|
| 182 |
+
id="GRG-A2",
|
| 183 |
+
title="Greg Controls Disclosure",
|
| 184 |
+
source="Household facts",
|
| 185 |
+
excerpt=(
|
| 186 |
+
"The book is not inherently public; Greg can keep it private and decide what to write."
|
| 187 |
+
),
|
| 188 |
+
supports="respondent",
|
| 189 |
+
reliability=0.68,
|
| 190 |
+
note="Weakens the claim that the gift itself creates inevitable harm.",
|
| 191 |
+
),
|
| 192 |
+
EvidenceItem(
|
| 193 |
+
id="GRG-A3",
|
| 194 |
+
title="Reflection Has Value",
|
| 195 |
+
source="Parenting rationale",
|
| 196 |
+
excerpt=(
|
| 197 |
+
"A journal can help a student process school, family, and growing-up pressures."
|
| 198 |
+
),
|
| 199 |
+
supports="respondent",
|
| 200 |
+
reliability=0.71,
|
| 201 |
+
note="Gives Mom a reasonable-benefit argument even if the branding is awkward.",
|
| 202 |
+
),
|
| 203 |
+
],
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
BARNABY = CasePacket(
|
| 208 |
+
id="barnaby",
|
| 209 |
+
title="The People v. Barnaby Buttons",
|
| 210 |
+
subtitle="The last office mooncake, a tampered snack ledger, and crumbs shaped like intent.",
|
| 211 |
+
claimant="The Breakroom Commonwealth",
|
| 212 |
+
respondent="Barnaby Buttons",
|
| 213 |
+
charge="Theft of the final mooncake and alteration of the communal snack ledger.",
|
| 214 |
+
setting="A fluorescent office kitchen at 4:47 p.m., under the humming republic of the fridge.",
|
| 215 |
+
context=(
|
| 216 |
+
"An office breakroom has lost its final mooncake after a suspicious ledger update and "
|
| 217 |
+
"a trail of crumbs. The commonwealth blames Barnaby Buttons; Barnaby says the evidence "
|
| 218 |
+
"is ordinary office mess and coincidence."
|
| 219 |
+
),
|
| 220 |
+
claimant_claim=(
|
| 221 |
+
"Barnaby removed the final mooncake, changed the snack ledger from '1 mooncake' "
|
| 222 |
+
"to '0 mooncakes', and left the team dessertless."
|
| 223 |
+
),
|
| 224 |
+
respondent_claim=(
|
| 225 |
+
"Barnaby says the mooncake was already abandoned, the ledger pen skipped naturally, "
|
| 226 |
+
"and the crumbs came from an unrelated biscuit."
|
| 227 |
+
),
|
| 228 |
+
source_note="Cached original whimsical packet kept for compatibility with older tests.",
|
| 229 |
+
evidence=[
|
| 230 |
+
EvidenceItem(
|
| 231 |
+
id="BTN-E1",
|
| 232 |
+
title="Ledger Ink Discontinuity",
|
| 233 |
+
source="Clerk's magnifying loupe",
|
| 234 |
+
excerpt="The zero in '0 mooncakes' uses a darker ink than the previous entries.",
|
| 235 |
+
supports="claimant",
|
| 236 |
+
reliability=0.82,
|
| 237 |
+
note="Strong tampering indicator, though pen swaps happen in offices.",
|
| 238 |
+
),
|
| 239 |
+
EvidenceItem(
|
| 240 |
+
id="BTN-E2",
|
| 241 |
+
title="Crumb Constellation",
|
| 242 |
+
source="Breakroom floor survey",
|
| 243 |
+
excerpt="Sesame crumbs form a trail from the pantry shelf to Barnaby's keyboard.",
|
| 244 |
+
supports="claimant",
|
| 245 |
+
reliability=0.71,
|
| 246 |
+
note="Suggestive route evidence, vulnerable to shared-desk contamination.",
|
| 247 |
+
),
|
| 248 |
+
EvidenceItem(
|
| 249 |
+
id="BTN-E3",
|
| 250 |
+
title="Calendar Entry",
|
| 251 |
+
source="Respondent's calendar",
|
| 252 |
+
excerpt="Barnaby had a 4:45 p.m. reminder titled 'Do not forget tea with lunar pastry'.",
|
| 253 |
+
supports="mixed",
|
| 254 |
+
reliability=0.76,
|
| 255 |
+
note="Shows desire and opportunity, but not necessarily theft.",
|
| 256 |
+
),
|
| 257 |
+
EvidenceItem(
|
| 258 |
+
id="BTN-E4",
|
| 259 |
+
title="Biscuit Alibi",
|
| 260 |
+
source="Vending machine receipt",
|
| 261 |
+
excerpt="A receipt shows Barnaby bought a sesame biscuit at 4:39 p.m.",
|
| 262 |
+
supports="respondent",
|
| 263 |
+
reliability=0.67,
|
| 264 |
+
note="Explains crumbs but not ledger alteration.",
|
| 265 |
+
),
|
| 266 |
+
],
|
| 267 |
+
)
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
CASES = {case.id: case for case in (SOCRATES, GREG, BARNABY)}
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def get_case(case_id: str) -> CasePacket:
    """Return the cached case packet registered under *case_id*.

    Unknown ids do not raise: the Socrates packet is returned instead.
    """
    try:
        return CASES[case_id]
    except KeyError:
        return SOCRATES
|
sovereign_bench/engine.py
CHANGED
|
@@ -9,7 +9,7 @@ from collections.abc import Callable, Iterable
|
|
| 9 |
from pydantic import ValidationError
|
| 10 |
|
| 11 |
from .cases import get_case
|
| 12 |
-
from .llm import ModelCall, ModelResult, call_small_model
|
| 13 |
from .models import AgentTurn, CasePacket, JurorVote, TrialEvent, TrialRequest, Verdict
|
| 14 |
from .retrieval import build_live_case
|
| 15 |
|
|
@@ -23,22 +23,22 @@ NEMOTRON_PROVIDER = "featherless-ai"
|
|
| 23 |
MODEL_BUDGET = [
|
| 24 |
("Presiding Advocate", GPT_OSS_MODEL, 20.0),
|
| 25 |
("Clerk of Style", OPENBMB_MODEL, 4.0),
|
| 26 |
-
("
|
| 27 |
]
|
| 28 |
-
TOTAL_PARAMS_B = sum(item[2] for item in MODEL_BUDGET)
|
| 29 |
-
|
| 30 |
-
JUDGE_NAME = "Marcus Aurelius"
|
| 31 |
-
JUDGE_PERSONA = "Stoic duty, restraint, public reason, and disciplined judgment"
|
| 32 |
-
|
| 33 |
-
JUROR_PERSONAS = {
|
| 34 |
-
"Karl Marx": "class power, material conditions, exploitation, institutional incentives",
|
| 35 |
-
"John Stuart Mill": "liberty, harm principle, utility, individual rights",
|
| 36 |
-
"Confucius": "social harmony, role duty, ritual order, moral cultivation",
|
| 37 |
-
"Cleopatra VII": "sovereign pragmatism, diplomacy, survival, legitimacy under pressure",
|
| 38 |
-
"Niccolo Machiavelli": "political realism, stability, power, consequences over ideals",
|
| 39 |
-
"Jensen Huang": "technological optimism, operator mindset, systems thinking, innovation tradeoffs",
|
| 40 |
-
}
|
| 41 |
-
JUROR_NAMES = list(JUROR_PERSONAS)
|
| 42 |
|
| 43 |
|
| 44 |
class RequiredModelError(RuntimeError):
|
|
@@ -60,8 +60,10 @@ def _turn(agent: str, role: str, result: ModelResult, model: str, confidence: fl
|
|
| 60 |
|
| 61 |
|
| 62 |
def _case_summary(packet: CasePacket) -> str:
|
|
|
|
| 63 |
return (
|
| 64 |
f"{packet.title}. Charge: {packet.charge}\n"
|
|
|
|
| 65 |
f"Claimant: {packet.claimant_claim}\n"
|
| 66 |
f"Respondent: {packet.respondent_claim}"
|
| 67 |
)
|
|
@@ -79,12 +81,16 @@ def _call_trace(calls: list[ModelCall]) -> list[dict]:
|
|
| 79 |
|
| 80 |
|
| 81 |
def resolve_case(request: TrialRequest) -> tuple[CasePacket, dict]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
if request.case_id == "live":
|
| 83 |
-
packet = build_live_case(request.search_query, request.hypothetical)
|
| 84 |
-
if packet:
|
| 85 |
-
return packet, {"mode": "live"}
|
| 86 |
-
raise RuntimeError("Live retrieval produced too little usable evidence; no fallback case will be substituted.")
|
| 87 |
-
return get_case(request.case_id), {"mode": "cached"}
|
| 88 |
|
| 89 |
|
| 90 |
def _generate_role(model_runner: ModelRunner | None = None, **kwargs) -> ModelResult:
|
|
@@ -93,15 +99,19 @@ def _generate_role(model_runner: ModelRunner | None = None, **kwargs) -> ModelRe
|
|
| 93 |
return call_small_model(**kwargs)
|
| 94 |
|
| 95 |
|
| 96 |
-
def _required_role(model_runner: ModelRunner | None, model_calls: list[ModelCall], **kwargs) -> ModelResult:
|
| 97 |
-
try:
|
| 98 |
-
result = _generate_role(model_runner, **kwargs)
|
| 99 |
-
except Exception as exc:
|
| 100 |
-
raise RequiredModelError(f"{kwargs.get('agent', 'Model')} unavailable: {exc}") from exc
|
| 101 |
-
model_calls.append(result.call)
|
| 102 |
if not result.call.ok:
|
| 103 |
error = result.call.error or "model call did not complete"
|
| 104 |
raise RequiredModelError(f"{kwargs.get('agent', 'Model')} unavailable: {error}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
if not result.text.strip():
|
| 106 |
raise RequiredModelError(f"{kwargs.get('agent', 'Model')} returned an empty response.")
|
| 107 |
return result
|
|
@@ -132,6 +142,43 @@ def _emit(
|
|
| 132 |
return event
|
| 133 |
|
| 134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
def _extract_json(text: str) -> object:
|
| 136 |
stripped = text.strip()
|
| 137 |
if stripped.startswith("```"):
|
|
@@ -146,38 +193,34 @@ def _extract_json(text: str) -> object:
|
|
| 146 |
return json.loads(match.group(1))
|
| 147 |
|
| 148 |
|
| 149 |
-
def
|
| 150 |
try:
|
| 151 |
data = _extract_json(result.text)
|
| 152 |
except json.JSONDecodeError as exc:
|
| 153 |
-
raise RequiredModelError(f"
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
if
|
| 159 |
-
raise RequiredModelError("
|
| 160 |
|
| 161 |
-
known_evidence = {item.id for item in packet.evidence}
|
| 162 |
-
votes: list[JurorVote] = []
|
| 163 |
try:
|
| 164 |
-
|
| 165 |
-
vote = JurorVote.model_validate(item)
|
| 166 |
-
votes.append(vote)
|
| 167 |
except ValidationError as exc:
|
| 168 |
-
raise RequiredModelError(f"
|
| 169 |
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
return
|
| 181 |
|
| 182 |
|
| 183 |
def _majority_finding(votes: list[JurorVote]) -> str:
|
|
@@ -227,16 +270,13 @@ def _verdict_from_votes(votes: list[JurorVote]) -> Verdict:
|
|
| 227 |
)
|
| 228 |
|
| 229 |
|
| 230 |
-
def
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
"Return JSON
|
| 234 |
-
|
| 235 |
-
"
|
| 236 |
-
|
| 237 |
-
"Vote through the named public-history worldview, not a generic juror role.\n"
|
| 238 |
-
f"{personas}"
|
| 239 |
-
)
|
| 240 |
|
| 241 |
|
| 242 |
def run_trial(request: TrialRequest, model_runner: ModelRunner | None = None) -> list[TrialEvent]:
|
|
@@ -252,6 +292,7 @@ def stream_trial(
|
|
| 252 |
case_summary = _case_summary(packet)
|
| 253 |
evidence_summary = _evidence_summary(packet)
|
| 254 |
model_calls: list[ModelCall] = []
|
|
|
|
| 255 |
hypo = request.hypothetical.strip()
|
| 256 |
hypo_line = f"\n\nUser hypothetical admitted as a blue-ribbon sidebar: {hypo}" if hypo else ""
|
| 257 |
|
|
@@ -263,11 +304,12 @@ def stream_trial(
|
|
| 263 |
model=OPENBMB_MODEL,
|
| 264 |
case_summary=case_summary,
|
| 265 |
evidence_summary=evidence_summary,
|
| 266 |
-
task="Announce the case by name, identify the parties, and read the charge.",
|
| 267 |
provider=OPENBMB_PROVIDER,
|
| 268 |
max_tokens=110,
|
| 269 |
)
|
| 270 |
-
yield
|
|
|
|
| 271 |
packet,
|
| 272 |
source_trace,
|
| 273 |
model_calls,
|
|
@@ -281,48 +323,55 @@ def stream_trial(
|
|
| 281 |
delay,
|
| 282 |
)
|
| 283 |
|
| 284 |
-
judge_open = _required_role(
|
| 285 |
-
model_runner,
|
| 286 |
-
model_calls,
|
| 287 |
-
agent=JUDGE_NAME,
|
| 288 |
-
role="judge",
|
| 289 |
-
model=GPT_OSS_MODEL,
|
| 290 |
-
case_summary=case_summary,
|
| 291 |
-
evidence_summary=evidence_summary,
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
packet,
|
| 301 |
source_trace,
|
| 302 |
model_calls,
|
| 303 |
TrialEvent(
|
| 304 |
-
phase="intake",
|
| 305 |
-
title="The Burden Is Set",
|
| 306 |
-
body="The bench defines how the miniature court will weigh the record.",
|
| 307 |
-
turns=[_turn(JUDGE_NAME, "judge", judge_open, GPT_OSS_MODEL, 0.88)],
|
| 308 |
-
evidence=packet.evidence,
|
| 309 |
-
),
|
| 310 |
-
delay,
|
| 311 |
)
|
| 312 |
|
| 313 |
claimant_opening = _required_role(
|
| 314 |
model_runner,
|
| 315 |
model_calls,
|
| 316 |
-
agent="
|
| 317 |
role="claimant advocate",
|
| 318 |
model=GPT_OSS_MODEL,
|
| 319 |
case_summary=case_summary,
|
| 320 |
evidence_summary=evidence_summary,
|
| 321 |
-
|
|
|
|
|
|
|
| 322 |
provider=OPENAI_PROVIDER,
|
| 323 |
max_tokens=130,
|
| 324 |
)
|
| 325 |
-
yield
|
|
|
|
| 326 |
packet,
|
| 327 |
source_trace,
|
| 328 |
model_calls,
|
|
@@ -330,7 +379,7 @@ def stream_trial(
|
|
| 330 |
phase="claims",
|
| 331 |
title="Claimant Opening",
|
| 332 |
body=packet.claimant_claim,
|
| 333 |
-
turns=[_turn("
|
| 334 |
evidence=packet.evidence,
|
| 335 |
),
|
| 336 |
delay,
|
|
@@ -339,16 +388,19 @@ def stream_trial(
|
|
| 339 |
respondent_opening = _required_role(
|
| 340 |
model_runner,
|
| 341 |
model_calls,
|
| 342 |
-
agent="
|
| 343 |
role="respondent advocate",
|
| 344 |
model=GPT_OSS_MODEL,
|
| 345 |
case_summary=case_summary,
|
| 346 |
evidence_summary=evidence_summary,
|
| 347 |
-
|
|
|
|
|
|
|
| 348 |
provider=OPENAI_PROVIDER,
|
| 349 |
max_tokens=130,
|
| 350 |
)
|
| 351 |
-
yield
|
|
|
|
| 352 |
packet,
|
| 353 |
source_trace,
|
| 354 |
model_calls,
|
|
@@ -356,80 +408,76 @@ def stream_trial(
|
|
| 356 |
phase="opening",
|
| 357 |
title="Respondent Opening",
|
| 358 |
body=packet.respondent_claim,
|
| 359 |
-
turns=[_turn("
|
| 360 |
evidence=packet.evidence,
|
| 361 |
),
|
| 362 |
delay,
|
| 363 |
)
|
| 364 |
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
model_calls,
|
| 368 |
-
agent="Auditor Prism",
|
| 369 |
-
role="evidence auditor",
|
| 370 |
-
model=NEMOTRON_MODEL,
|
| 371 |
-
case_summary=case_summary,
|
| 372 |
-
evidence_summary=evidence_summary,
|
| 373 |
-
task="Present the evidence record. Identify the strongest exhibit and the weakest inference.",
|
| 374 |
-
provider=NEMOTRON_PROVIDER,
|
| 375 |
-
max_tokens=150,
|
| 376 |
-
)
|
| 377 |
-
yield _emit(
|
| 378 |
packet,
|
| 379 |
source_trace,
|
| 380 |
model_calls,
|
| 381 |
TrialEvent(
|
| 382 |
phase="evidence",
|
| 383 |
-
title="The
|
| 384 |
body="\n".join(f"{item.id}: {item.title} | reliability {item.reliability:.2f} | {item.note}" for item in packet.evidence),
|
| 385 |
-
turns=[
|
| 386 |
evidence=packet.evidence,
|
| 387 |
),
|
| 388 |
delay,
|
| 389 |
)
|
| 390 |
|
| 391 |
-
judge_question = _required_role(
|
| 392 |
-
model_runner,
|
| 393 |
-
model_calls,
|
| 394 |
-
agent=JUDGE_NAME,
|
| 395 |
-
role="judge",
|
| 396 |
-
model=GPT_OSS_MODEL,
|
| 397 |
-
case_summary=case_summary,
|
| 398 |
-
evidence_summary=evidence_summary,
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
packet,
|
| 408 |
source_trace,
|
| 409 |
model_calls,
|
| 410 |
TrialEvent(
|
| 411 |
-
phase="questions",
|
| 412 |
-
title="The Hinge Question",
|
| 413 |
-
body="The bench asks the single question that could turn the record.",
|
| 414 |
-
turns=[_turn(JUDGE_NAME, "judge", judge_question, GPT_OSS_MODEL, 0.88)],
|
| 415 |
-
evidence=packet.evidence,
|
| 416 |
-
),
|
| 417 |
-
delay,
|
| 418 |
)
|
| 419 |
|
| 420 |
claimant_answer = _required_role(
|
| 421 |
model_runner,
|
| 422 |
model_calls,
|
| 423 |
-
agent="
|
| 424 |
role="claimant advocate",
|
| 425 |
-
model=GPT_OSS_MODEL,
|
| 426 |
-
case_summary=case_summary,
|
| 427 |
-
evidence_summary=evidence_summary,
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
|
|
|
|
|
|
|
|
|
| 433 |
packet,
|
| 434 |
source_trace,
|
| 435 |
model_calls,
|
|
@@ -437,7 +485,7 @@ def stream_trial(
|
|
| 437 |
phase="questions",
|
| 438 |
title="Claimant Answers the Bench",
|
| 439 |
body="The claimant answers the hinge question.",
|
| 440 |
-
turns=[_turn("
|
| 441 |
evidence=packet.evidence,
|
| 442 |
),
|
| 443 |
delay,
|
|
@@ -446,16 +494,19 @@ def stream_trial(
|
|
| 446 |
respondent_answer = _required_role(
|
| 447 |
model_runner,
|
| 448 |
model_calls,
|
| 449 |
-
agent="
|
| 450 |
role="respondent advocate",
|
| 451 |
-
model=GPT_OSS_MODEL,
|
| 452 |
-
case_summary=case_summary,
|
| 453 |
-
evidence_summary=evidence_summary,
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
|
|
|
|
|
|
|
|
|
| 459 |
packet,
|
| 460 |
source_trace,
|
| 461 |
model_calls,
|
|
@@ -463,7 +514,7 @@ def stream_trial(
|
|
| 463 |
phase="questions",
|
| 464 |
title="Respondent Answers the Bench",
|
| 465 |
body="The respondent answers the hinge question.",
|
| 466 |
-
turns=[_turn("
|
| 467 |
evidence=packet.evidence,
|
| 468 |
),
|
| 469 |
delay,
|
|
@@ -477,11 +528,14 @@ def stream_trial(
|
|
| 477 |
model=NEMOTRON_MODEL,
|
| 478 |
case_summary=case_summary,
|
| 479 |
evidence_summary=evidence_summary,
|
| 480 |
-
|
|
|
|
|
|
|
| 481 |
provider=NEMOTRON_PROVIDER,
|
| 482 |
max_tokens=100,
|
| 483 |
)
|
| 484 |
-
yield
|
|
|
|
| 485 |
packet,
|
| 486 |
source_trace,
|
| 487 |
model_calls,
|
|
@@ -495,26 +549,32 @@ def stream_trial(
|
|
| 495 |
delay,
|
| 496 |
)
|
| 497 |
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 512 |
juror_result = ModelResult(
|
| 513 |
-
text=f"{vote.vote.replace('_', ' ').title()}. {vote.reason}",
|
| 514 |
-
call=
|
| 515 |
-
input_text=
|
| 516 |
)
|
| 517 |
-
yield
|
|
|
|
| 518 |
packet,
|
| 519 |
source_trace,
|
| 520 |
model_calls,
|
|
@@ -530,23 +590,27 @@ def stream_trial(
|
|
| 530 |
)
|
| 531 |
|
| 532 |
verdict = _verdict_from_votes(votes)
|
| 533 |
-
verdict_voice = _required_role(
|
| 534 |
-
model_runner,
|
| 535 |
-
model_calls,
|
| 536 |
-
agent=JUDGE_NAME,
|
| 537 |
-
role="verdict writer",
|
| 538 |
-
model=GPT_OSS_MODEL,
|
| 539 |
-
case_summary=case_summary,
|
| 540 |
-
evidence_summary=evidence_summary,
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
|
|
|
|
|
|
|
|
|
| 546 |
provider=OPENAI_PROVIDER,
|
| 547 |
max_tokens=160,
|
| 548 |
)
|
| 549 |
-
yield
|
|
|
|
| 550 |
packet,
|
| 551 |
source_trace,
|
| 552 |
model_calls,
|
|
@@ -554,13 +618,13 @@ def stream_trial(
|
|
| 554 |
phase="verdict",
|
| 555 |
title="The Court Announces Judgment",
|
| 556 |
body=f"{verdict_voice.text}\n\n{verdict.rationale}\n\nRemedy: {verdict.remedy}",
|
| 557 |
-
verdict=verdict,
|
| 558 |
-
votes=votes,
|
| 559 |
-
evidence=packet.evidence,
|
| 560 |
-
turns=[_turn(JUDGE_NAME, "verdict writer", verdict_voice, GPT_OSS_MODEL, 0.88)],
|
| 561 |
-
),
|
| 562 |
-
delay,
|
| 563 |
-
)
|
| 564 |
|
| 565 |
|
| 566 |
def stream_trial_jsonl(
|
|
|
|
| 9 |
from pydantic import ValidationError
|
| 10 |
|
| 11 |
from .cases import get_case
|
| 12 |
+
from .llm import ModelCall, ModelCallError, ModelResult, call_small_model, clean_model_text
|
| 13 |
from .models import AgentTurn, CasePacket, JurorVote, TrialEvent, TrialRequest, Verdict
|
| 14 |
from .retrieval import build_live_case
|
| 15 |
|
|
|
|
| 23 |
# Roster of (display label, model id, size) entries. The third field is summed
# into TOTAL_PARAMS_B below, so it presumably holds parameter counts in
# billions -- confirm against the model cards.
MODEL_BUDGET = [
    ("Presiding Advocate", GPT_OSS_MODEL, 20.0),
    ("Clerk of Style", OPENBMB_MODEL, 4.0),
    ("Jury Ring", NEMOTRON_MODEL, 8.0),
]
# Sum of the third field of every MODEL_BUDGET entry.
TOTAL_PARAMS_B = sum(item[2] for item in MODEL_BUDGET)

# Display name and worldview description used for the judge agent.
JUDGE_NAME = "Marcus Aurelius"
JUDGE_PERSONA = "Stoic duty, restraint, public reason, and disciplined judgment"

# Juror display name -> short description of that juror's worldview.
JUROR_PERSONAS = {
    "Karl Marx": "class power, material conditions, exploitation, institutional incentives",
    "John Stuart Mill": "liberty, harm principle, utility, individual rights",
    "Confucius": "social harmony, role duty, ritual order, moral cultivation",
    "Cleopatra VII": "sovereign pragmatism, diplomacy, survival, legitimacy under pressure",
    "Niccolo Machiavelli": "political realism, stability, power, consequences over ideals",
    "Jensen Huang": "technological optimism, operator mindset, systems thinking, innovation tradeoffs",
}
# Juror names in the insertion order of JUROR_PERSONAS.
JUROR_NAMES = list(JUROR_PERSONAS)
|
| 42 |
|
| 43 |
|
| 44 |
class RequiredModelError(RuntimeError):
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
def _case_summary(packet: CasePacket) -> str:
|
| 63 |
+
context = packet.context or packet.setting
|
| 64 |
return (
|
| 65 |
f"{packet.title}. Charge: {packet.charge}\n"
|
| 66 |
+
f"Context: {context}\n"
|
| 67 |
f"Claimant: {packet.claimant_claim}\n"
|
| 68 |
f"Respondent: {packet.respondent_claim}"
|
| 69 |
)
|
|
|
|
| 81 |
|
| 82 |
|
| 83 |
def resolve_case(request: TrialRequest) -> tuple[CasePacket, dict]:
    """Pick the case packet for a request and describe where it came from.

    Returns:
        ``(packet, trace)`` where ``trace["mode"]`` is ``"custom"``,
        ``"live"`` or ``"cached"``.

    Raises:
        RuntimeError: If ``case_id`` is ``"custom"`` but no custom case was
            supplied, or if ``case_id`` is ``"live"`` and live retrieval
            returned nothing usable. No fallback case is substituted in
            either situation.
    """
    case_id = request.case_id

    if case_id == "custom":
        custom_packet = request.custom_case
        if custom_packet is not None:
            return custom_packet, {"mode": "custom"}
        raise RuntimeError("Custom case requires trial details and evidence before the court can begin.")

    if case_id != "live":
        # Any other id is looked up among the cached packets.
        return get_case(case_id), {"mode": "cached"}

    live_packet = build_live_case(request.search_query, request.hypothetical)
    if not live_packet:
        raise RuntimeError("Live retrieval produced too little usable evidence; no fallback case will be substituted.")
    return live_packet, {"mode": "live"}
|
| 94 |
|
| 95 |
|
| 96 |
def _generate_role(model_runner: ModelRunner | None = None, **kwargs) -> ModelResult:
|
|
|
|
| 99 |
return call_small_model(**kwargs)
|
| 100 |
|
| 101 |
|
| 102 |
+
def _required_role(model_runner: ModelRunner | None, model_calls: list[ModelCall], **kwargs) -> ModelResult:
    """Run one mandatory role generation and return its cleaned result.

    The call record is appended to *model_calls* as soon as the runner
    returns, before any validation, so a completed-but-failed call is still
    recorded.

    Raises:
        RequiredModelError: if the runner raises, the call record is not
            ``ok``, ``clean_model_text`` rejects the text, or the cleaned
            text is blank.
    """
    speaker = kwargs.get("agent", "Model")

    try:
        outcome = _generate_role(model_runner, **kwargs)
    except Exception as exc:
        raise RequiredModelError(f"{speaker} unavailable: {exc}") from exc

    call_record = outcome.call
    model_calls.append(call_record)
    if not call_record.ok:
        reason = call_record.error or "model call did not complete"
        raise RequiredModelError(f"{speaker} unavailable: {reason}")

    try:
        outcome.text = clean_model_text(outcome.text)
    except ModelCallError as exc:
        raise RequiredModelError(f"{speaker} returned non-dialogue output: {exc}") from exc

    if outcome.text.strip():
        return outcome
    raise RequiredModelError(f"{speaker} returned an empty response.")
|
|
|
|
| 142 |
return event
|
| 143 |
|
| 144 |
|
| 145 |
+
def _record_and_emit(
    events: list[TrialEvent],
    packet: CasePacket,
    source_trace: dict,
    model_calls: list[ModelCall],
    event: TrialEvent,
    delay: float,
) -> TrialEvent:
    """Pass *event* through ``_emit``, remember the result, and return it.

    The event returned by ``_emit`` (not the one passed in) is appended to
    *events*, which callers use as the running trial history.
    """
    events.append(_emit(packet, source_trace, model_calls, event, delay))
    return events[-1]
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def _compact(value: str, limit: int = 420) -> str:
    """Collapse whitespace in *value* and cap the result at *limit* characters.

    Runs of whitespace (including newlines) become single spaces. Text longer
    than *limit* is cut and suffixed with ``"..."`` so the total length never
    exceeds *limit*.

    Args:
        value: Text to normalise.
        limit: Maximum length of the returned string.

    Returns:
        The normalised, possibly truncated text.
    """
    text = " ".join(value.split())
    if len(text) <= limit:
        return text
    if limit <= 3:
        # No room for any text plus the ellipsis; a bare ``limit - 3`` slice
        # would go negative and return more characters than allowed.
        return text[: max(limit, 0)]
    return text[: limit - 3].rstrip() + "..."
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def _trial_history(events: list[TrialEvent]) -> str:
    """Summarise the emitted events as a numbered plain-text transcript.

    Each event contributes at most one headline: its first turn when it has
    turns, otherwise its body when that is non-empty. Every vote on the event
    adds one extra line. Numbering follows the event's position in *events*,
    so events without a headline leave a gap in the numbers.
    """
    if not events:
        return "No trial statements have been made yet."

    transcript: list[str] = []
    for number, entry in enumerate(events, start=1):
        if entry.turns:
            lead = entry.turns[0]  # only the first turn is summarised
            transcript.append(
                f"{number}. {entry.phase} / {entry.title} - {lead.agent} ({lead.role}): {_compact(lead.content)}"
            )
        elif entry.body:
            transcript.append(f"{number}. {entry.phase} / {entry.title}: {_compact(entry.body)}")
        transcript.extend(
            f" Vote - {ballot.juror}: {ballot.vote}; reason: {_compact(ballot.reason, 220)}; evidence: {', '.join(ballot.evidence_ids)}"
            for ballot in entry.votes
        )
    return "\n".join(transcript)
|
| 180 |
+
|
| 181 |
+
|
| 182 |
def _extract_json(text: str) -> object:
|
| 183 |
stripped = text.strip()
|
| 184 |
if stripped.startswith("```"):
|
|
|
|
| 193 |
return json.loads(match.group(1))
|
| 194 |
|
| 195 |
|
| 196 |
+
def _parse_juror_vote(result: ModelResult, packet: CasePacket, juror: str) -> JurorVote:
    """Parse and validate the JSON ballot returned for one juror.

    Accepts either a bare vote object or a ``{"votes": [vote]}`` wrapper
    holding exactly one vote.

    Args:
        result: Model output whose ``text`` should contain the JSON ballot.
        packet: Case packet; its evidence ids define what may be cited.
        juror: Name of the juror the ballot must belong to (a key of
            ``JUROR_PERSONAS``).

    Returns:
        The validated ``JurorVote``.

    Raises:
        RequiredModelError: if the JSON is malformed, fails schema validation,
            names a different juror or persona, has a blank reason, or cites
            no evidence or unknown evidence ids.
    """
    try:
        data = _extract_json(result.text)
    except json.JSONDecodeError as exc:
        raise RequiredModelError(f"{juror} returned invalid JSON: {exc.msg}") from exc

    # Tolerate the batch-style wrapper, but only when it holds a single ballot.
    if isinstance(data, dict) and isinstance(data.get("votes"), list):
        if len(data["votes"]) != 1:
            raise RequiredModelError(f"{juror} must return exactly one vote.")
        data = data["votes"][0]
    if not isinstance(data, dict):
        raise RequiredModelError(f"{juror} vote output must be a JSON object.")

    try:
        vote = JurorVote.model_validate(data)
    except ValidationError as exc:
        raise RequiredModelError(f"{juror} vote schema is invalid: {exc.errors()[0]['msg']}") from exc

    known_evidence = {item.id for item in packet.evidence}
    expected_persona = JUROR_PERSONAS[juror]
    if vote.juror != juror:
        raise RequiredModelError(f"{juror} vote must use juror '{juror}'.")
    # Normalise BOTH sides: folding only the model's value would make any
    # canonical persona containing an uppercase letter impossible to match.
    if vote.persona.strip().lower() != expected_persona.strip().lower():
        raise RequiredModelError(f"{juror} persona must be '{expected_persona}'.")
    if not vote.reason.strip():
        raise RequiredModelError(f"{juror} must include a rationale.")
    if not vote.evidence_ids or any(evidence_id not in known_evidence for evidence_id in vote.evidence_ids):
        raise RequiredModelError(f"{juror} must cite known evidence IDs.")
    return vote
|
| 224 |
|
| 225 |
|
| 226 |
def _majority_finding(votes: list[JurorVote]) -> str:
|
|
|
|
| 270 |
)
|
| 271 |
|
| 272 |
|
| 273 |
+
def _juror_task(juror: str, persona: str) -> str:
    """Build the voting instruction given to one juror.

    The prompt names the juror, states the worldview, and spells out the JSON
    ballot format (keys and allowed vote values) the juror must return.
    """
    instructions = (
        f"After watching the trial, vote as {juror}. Your worldview is: {persona}. ",
        "Return exactly one JSON object with keys juror, persona, vote, reason, and evidence_ids. ",
        "Valid vote values are liable, not_liable, uncertain. The persona value must exactly match your worldview. ",
        "The reason must be one concise sentence grounded in your beliefs and the record. Cite evidence IDs from the record.",
    )
    return "".join(instructions)
|
|
|
|
|
|
|
|
|
|
| 280 |
|
| 281 |
|
| 282 |
def run_trial(request: TrialRequest, model_runner: ModelRunner | None = None) -> list[TrialEvent]:
|
|
|
|
| 292 |
case_summary = _case_summary(packet)
|
| 293 |
evidence_summary = _evidence_summary(packet)
|
| 294 |
model_calls: list[ModelCall] = []
|
| 295 |
+
events: list[TrialEvent] = []
|
| 296 |
hypo = request.hypothetical.strip()
|
| 297 |
hypo_line = f"\n\nUser hypothetical admitted as a blue-ribbon sidebar: {hypo}" if hypo else ""
|
| 298 |
|
|
|
|
| 304 |
model=OPENBMB_MODEL,
|
| 305 |
case_summary=case_summary,
|
| 306 |
evidence_summary=evidence_summary,
|
| 307 |
+
task="Begin with 'I call'. Announce the case by name, identify the parties, and read the charge.",
|
| 308 |
provider=OPENBMB_PROVIDER,
|
| 309 |
max_tokens=110,
|
| 310 |
)
|
| 311 |
+
yield _record_and_emit(
|
| 312 |
+
events,
|
| 313 |
packet,
|
| 314 |
source_trace,
|
| 315 |
model_calls,
|
|
|
|
| 323 |
delay,
|
| 324 |
)
|
| 325 |
|
| 326 |
+
judge_open = _required_role(
|
| 327 |
+
model_runner,
|
| 328 |
+
model_calls,
|
| 329 |
+
agent=JUDGE_NAME,
|
| 330 |
+
role="judge",
|
| 331 |
+
model=GPT_OSS_MODEL,
|
| 332 |
+
case_summary=case_summary,
|
| 333 |
+
evidence_summary=evidence_summary,
|
| 334 |
+
trial_history=_trial_history(events),
|
| 335 |
+
persona=JUDGE_PERSONA,
|
| 336 |
+
objective="Set a fair standard for hearing both sides.",
|
| 337 |
+
task=(
|
| 338 |
+
f"As {JUDGE_NAME}, a Stoic courtroom judge guided by {JUDGE_PERSONA}, explain the proceeding "
|
| 339 |
+
"and the burden of proof in one or two disciplined sentences using I or we."
|
| 340 |
+
),
|
| 341 |
+
provider=OPENAI_PROVIDER,
|
| 342 |
+
max_tokens=110,
|
| 343 |
+
)
|
| 344 |
+
yield _record_and_emit(
|
| 345 |
+
events,
|
| 346 |
packet,
|
| 347 |
source_trace,
|
| 348 |
model_calls,
|
| 349 |
TrialEvent(
|
| 350 |
+
phase="intake",
|
| 351 |
+
title="The Burden Is Set",
|
| 352 |
+
body="The bench defines how the miniature court will weigh the record.",
|
| 353 |
+
turns=[_turn(JUDGE_NAME, "judge", judge_open, GPT_OSS_MODEL, 0.88)],
|
| 354 |
+
evidence=packet.evidence,
|
| 355 |
+
),
|
| 356 |
+
delay,
|
| 357 |
)
|
| 358 |
|
| 359 |
claimant_opening = _required_role(
|
| 360 |
model_runner,
|
| 361 |
model_calls,
|
| 362 |
+
agent="Mike OSS",
|
| 363 |
role="claimant advocate",
|
| 364 |
model=GPT_OSS_MODEL,
|
| 365 |
case_summary=case_summary,
|
| 366 |
evidence_summary=evidence_summary,
|
| 367 |
+
trial_history=_trial_history(events),
|
| 368 |
+
objective="Win the case for the claimant using the strongest fair reading of the record.",
|
| 369 |
+
task="Make the claimant's opening statement alone, speaking as I for the claimant. Cite the strongest claimant-side exhibit.",
|
| 370 |
provider=OPENAI_PROVIDER,
|
| 371 |
max_tokens=130,
|
| 372 |
)
|
| 373 |
+
yield _record_and_emit(
|
| 374 |
+
events,
|
| 375 |
packet,
|
| 376 |
source_trace,
|
| 377 |
model_calls,
|
|
|
|
| 379 |
phase="claims",
|
| 380 |
title="Claimant Opening",
|
| 381 |
body=packet.claimant_claim,
|
| 382 |
+
turns=[_turn("Mike OSS", "claimant advocate", claimant_opening, GPT_OSS_MODEL, 0.88)],
|
| 383 |
evidence=packet.evidence,
|
| 384 |
),
|
| 385 |
delay,
|
|
|
|
| 388 |
respondent_opening = _required_role(
|
| 389 |
model_runner,
|
| 390 |
model_calls,
|
| 391 |
+
agent="Harvey Vector",
|
| 392 |
role="respondent advocate",
|
| 393 |
model=GPT_OSS_MODEL,
|
| 394 |
case_summary=case_summary,
|
| 395 |
evidence_summary=evidence_summary,
|
| 396 |
+
trial_history=_trial_history(events),
|
| 397 |
+
objective="Win the case for the respondent using doubt, context, and the strongest fair reading of the record.",
|
| 398 |
+
task="Make the respondent's opening statement alone, speaking as I for the respondent. Emphasize uncertainty and cite a helpful exhibit.",
|
| 399 |
provider=OPENAI_PROVIDER,
|
| 400 |
max_tokens=130,
|
| 401 |
)
|
| 402 |
+
yield _record_and_emit(
|
| 403 |
+
events,
|
| 404 |
packet,
|
| 405 |
source_trace,
|
| 406 |
model_calls,
|
|
|
|
| 408 |
phase="opening",
|
| 409 |
title="Respondent Opening",
|
| 410 |
body=packet.respondent_claim,
|
| 411 |
+
turns=[_turn("Harvey Vector", "respondent advocate", respondent_opening, GPT_OSS_MODEL, 0.88)],
|
| 412 |
evidence=packet.evidence,
|
| 413 |
),
|
| 414 |
delay,
|
| 415 |
)
|
| 416 |
|
| 417 |
+
yield _record_and_emit(
|
| 418 |
+
events,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 419 |
packet,
|
| 420 |
source_trace,
|
| 421 |
model_calls,
|
| 422 |
TrialEvent(
|
| 423 |
phase="evidence",
|
| 424 |
+
title="The Evidence Record",
|
| 425 |
body="\n".join(f"{item.id}: {item.title} | reliability {item.reliability:.2f} | {item.note}" for item in packet.evidence),
|
| 426 |
+
turns=[],
|
| 427 |
evidence=packet.evidence,
|
| 428 |
),
|
| 429 |
delay,
|
| 430 |
)
|
| 431 |
|
| 432 |
+
judge_question = _required_role(
|
| 433 |
+
model_runner,
|
| 434 |
+
model_calls,
|
| 435 |
+
agent=JUDGE_NAME,
|
| 436 |
+
role="judge",
|
| 437 |
+
model=GPT_OSS_MODEL,
|
| 438 |
+
case_summary=case_summary,
|
| 439 |
+
evidence_summary=evidence_summary,
|
| 440 |
+
trial_history=_trial_history(events),
|
| 441 |
+
persona=JUDGE_PERSONA,
|
| 442 |
+
objective="Ask the question most likely to reveal which side has met its burden.",
|
| 443 |
+
task=(
|
| 444 |
+
f"As {JUDGE_NAME}, ask one sharp hinge question that would change the outcome if answered. "
|
| 445 |
+
"Use Stoic restraint and public reason, speaking from the bench as I or we."
|
| 446 |
+
),
|
| 447 |
+
provider=OPENAI_PROVIDER,
|
| 448 |
+
max_tokens=100,
|
| 449 |
+
)
|
| 450 |
+
yield _record_and_emit(
|
| 451 |
+
events,
|
| 452 |
packet,
|
| 453 |
source_trace,
|
| 454 |
model_calls,
|
| 455 |
TrialEvent(
|
| 456 |
+
phase="questions",
|
| 457 |
+
title="The Hinge Question",
|
| 458 |
+
body="The bench asks the single question that could turn the record.",
|
| 459 |
+
turns=[_turn(JUDGE_NAME, "judge", judge_question, GPT_OSS_MODEL, 0.88)],
|
| 460 |
+
evidence=packet.evidence,
|
| 461 |
+
),
|
| 462 |
+
delay,
|
| 463 |
)
|
| 464 |
|
| 465 |
claimant_answer = _required_role(
|
| 466 |
model_runner,
|
| 467 |
model_calls,
|
| 468 |
+
agent="Mike OSS",
|
| 469 |
role="claimant advocate",
|
| 470 |
+
model=GPT_OSS_MODEL,
|
| 471 |
+
case_summary=case_summary,
|
| 472 |
+
evidence_summary=evidence_summary,
|
| 473 |
+
trial_history=_trial_history(events),
|
| 474 |
+
objective="Answer the judge in the way most favorable to the claimant.",
|
| 475 |
+
task=f"Answer {JUDGE_NAME}'s hinge question as I for the claimant: {judge_question.text}",
|
| 476 |
+
provider=OPENAI_PROVIDER,
|
| 477 |
+
max_tokens=130,
|
| 478 |
+
)
|
| 479 |
+
yield _record_and_emit(
|
| 480 |
+
events,
|
| 481 |
packet,
|
| 482 |
source_trace,
|
| 483 |
model_calls,
|
|
|
|
| 485 |
phase="questions",
|
| 486 |
title="Claimant Answers the Bench",
|
| 487 |
body="The claimant answers the hinge question.",
|
| 488 |
+
turns=[_turn("Mike OSS", "claimant advocate", claimant_answer, GPT_OSS_MODEL, 0.88)],
|
| 489 |
evidence=packet.evidence,
|
| 490 |
),
|
| 491 |
delay,
|
|
|
|
| 494 |
respondent_answer = _required_role(
|
| 495 |
model_runner,
|
| 496 |
model_calls,
|
| 497 |
+
agent="Harvey Vector",
|
| 498 |
role="respondent advocate",
|
| 499 |
+
model=GPT_OSS_MODEL,
|
| 500 |
+
case_summary=case_summary,
|
| 501 |
+
evidence_summary=evidence_summary,
|
| 502 |
+
trial_history=_trial_history(events),
|
| 503 |
+
objective="Answer the judge in the way most favorable to the respondent.",
|
| 504 |
+
task=f"Answer {JUDGE_NAME}'s hinge question as I for the respondent: {judge_question.text}",
|
| 505 |
+
provider=OPENAI_PROVIDER,
|
| 506 |
+
max_tokens=130,
|
| 507 |
+
)
|
| 508 |
+
yield _record_and_emit(
|
| 509 |
+
events,
|
| 510 |
packet,
|
| 511 |
source_trace,
|
| 512 |
model_calls,
|
|
|
|
| 514 |
phase="questions",
|
| 515 |
title="Respondent Answers the Bench",
|
| 516 |
body="The respondent answers the hinge question.",
|
| 517 |
+
turns=[_turn("Harvey Vector", "respondent advocate", respondent_answer, GPT_OSS_MODEL, 0.88)],
|
| 518 |
evidence=packet.evidence,
|
| 519 |
),
|
| 520 |
delay,
|
|
|
|
| 528 |
model=NEMOTRON_MODEL,
|
| 529 |
case_summary=case_summary,
|
| 530 |
evidence_summary=evidence_summary,
|
| 531 |
+
trial_history=_trial_history(events),
|
| 532 |
+
objective="Move the court from arguments into individual jury votes.",
|
| 533 |
+
task="Announce as we, the six named jurors, that we retire to vote. Do not reveal the votes yet.",
|
| 534 |
provider=NEMOTRON_PROVIDER,
|
| 535 |
max_tokens=100,
|
| 536 |
)
|
| 537 |
+
yield _record_and_emit(
|
| 538 |
+
events,
|
| 539 |
packet,
|
| 540 |
source_trace,
|
| 541 |
model_calls,
|
|
|
|
| 549 |
delay,
|
| 550 |
)
|
| 551 |
|
| 552 |
+
votes: list[JurorVote] = []
|
| 553 |
+
for juror, persona in JUROR_PERSONAS.items():
|
| 554 |
+
juror_vote_result = _required_role(
|
| 555 |
+
model_runner,
|
| 556 |
+
model_calls,
|
| 557 |
+
agent=juror,
|
| 558 |
+
role="juror",
|
| 559 |
+
model=NEMOTRON_MODEL,
|
| 560 |
+
case_summary=case_summary,
|
| 561 |
+
evidence_summary=evidence_summary,
|
| 562 |
+
trial_history=_trial_history(events),
|
| 563 |
+
persona=persona,
|
| 564 |
+
objective="Reach the verdict this historical worldview would consider right after watching the trial.",
|
| 565 |
+
task=_juror_task(juror, persona),
|
| 566 |
+
provider=NEMOTRON_PROVIDER,
|
| 567 |
+
max_tokens=220,
|
| 568 |
+
)
|
| 569 |
+
vote = _parse_juror_vote(juror_vote_result, packet, juror)
|
| 570 |
+
votes.append(vote)
|
| 571 |
juror_result = ModelResult(
|
| 572 |
+
text=f"I vote {vote.vote.replace('_', ' ').title()}. {vote.reason}",
|
| 573 |
+
call=juror_vote_result.call,
|
| 574 |
+
input_text=juror_vote_result.input_text,
|
| 575 |
)
|
| 576 |
+
yield _record_and_emit(
|
| 577 |
+
events,
|
| 578 |
packet,
|
| 579 |
source_trace,
|
| 580 |
model_calls,
|
|
|
|
| 590 |
)
|
| 591 |
|
| 592 |
verdict = _verdict_from_votes(votes)
|
| 593 |
+
verdict_voice = _required_role(
|
| 594 |
+
model_runner,
|
| 595 |
+
model_calls,
|
| 596 |
+
agent=JUDGE_NAME,
|
| 597 |
+
role="verdict writer",
|
| 598 |
+
model=GPT_OSS_MODEL,
|
| 599 |
+
case_summary=case_summary,
|
| 600 |
+
evidence_summary=evidence_summary,
|
| 601 |
+
trial_history=_trial_history(events),
|
| 602 |
+
persona=JUDGE_PERSONA,
|
| 603 |
+
objective="Announce the jury result fairly, summarize both sides, and do not override the jury.",
|
| 604 |
+
task=(
|
| 605 |
+
f"As {JUDGE_NAME}, announce the final legal finding after the jury vote with Stoic restraint. "
|
| 606 |
+
f"Finding: {verdict.finding}. "
|
| 607 |
+
f"Jury rationale: {verdict.rationale} Remedy: {verdict.remedy}. Speak as I from the bench and include uncertainty without disclaiming the role."
|
| 608 |
+
),
|
| 609 |
provider=OPENAI_PROVIDER,
|
| 610 |
max_tokens=160,
|
| 611 |
)
|
| 612 |
+
yield _record_and_emit(
|
| 613 |
+
events,
|
| 614 |
packet,
|
| 615 |
source_trace,
|
| 616 |
model_calls,
|
|
|
|
| 618 |
phase="verdict",
|
| 619 |
title="The Court Announces Judgment",
|
| 620 |
body=f"{verdict_voice.text}\n\n{verdict.rationale}\n\nRemedy: {verdict.remedy}",
|
| 621 |
+
verdict=verdict,
|
| 622 |
+
votes=votes,
|
| 623 |
+
evidence=packet.evidence,
|
| 624 |
+
turns=[_turn(JUDGE_NAME, "verdict writer", verdict_voice, GPT_OSS_MODEL, 0.88)],
|
| 625 |
+
),
|
| 626 |
+
delay,
|
| 627 |
+
)
|
| 628 |
|
| 629 |
|
| 630 |
def stream_trial_jsonl(
|
sovereign_bench/export.py
CHANGED
|
@@ -1,35 +1,35 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
import json
|
| 4 |
-
import tempfile
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
|
| 7 |
-
from .models import TrialEvent
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
def write_trace(events: list[TrialEvent]) -> str:
|
| 11 |
-
path = Path(tempfile.gettempdir()) / "sovereign_bench_trace.json"
|
| 12 |
-
path.write_text(
|
| 13 |
-
json.dumps([event.model_dump() for event in events], indent=2, ensure_ascii=True),
|
| 14 |
-
encoding="utf-8",
|
| 15 |
-
)
|
| 16 |
-
return str(path)
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
def write_decree(events: list[TrialEvent]) -> str:
|
| 20 |
-
verdict_event = next((event for event in events if event.verdict), events[-1])
|
| 21 |
-
verdict = verdict_event.verdict
|
| 22 |
-
path = Path(tempfile.gettempdir()) / "sovereign_bench_decree.md"
|
| 23 |
-
if verdict is None:
|
| 24 |
-
text = "# Sovereign Bench Decree\n\nNo verdict was recorded."
|
| 25 |
-
else:
|
| 26 |
-
text = (
|
| 27 |
-
"# Sovereign Bench Decree\n\n"
|
| 28 |
-
f"## Finding\n{verdict.finding}\n\n"
|
| 29 |
-
f"## Decree\n{verdict.decree}\n\n"
|
| 30 |
-
f"## Rationale\n{verdict.rationale}\n\n"
|
| 31 |
-
f"## Remedy\n{verdict.remedy}\n\n"
|
| 32 |
-
f"## Uncertainty\n{verdict.uncertainty}\n"
|
| 33 |
-
)
|
| 34 |
-
path.write_text(text, encoding="utf-8")
|
| 35 |
-
return str(path)
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import tempfile
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
from .models import TrialEvent
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def write_trace(events: list[TrialEvent]) -> str:
    """Dump every event as JSON into the temp directory and return the file path.

    Each event is serialised through its ``model_dump()``; the file is always
    named ``sovereign_bench_trace.json`` and is overwritten on every call.
    """
    payload = [event.model_dump() for event in events]
    serialized = json.dumps(payload, indent=2, ensure_ascii=True)
    destination = Path(tempfile.gettempdir()) / "sovereign_bench_trace.json"
    destination.write_text(serialized, encoding="utf-8")
    return str(destination)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def write_decree(events: list[TrialEvent]) -> str:
    """Write the verdict as a Markdown decree in the temp directory.

    The first event carrying a truthy ``verdict`` supplies the decree. When no
    event has one, including when *events* is empty, a placeholder stating
    that no verdict was recorded is written instead.

    Args:
        events: Trial events, in emission order.

    Returns:
        Path of the written ``sovereign_bench_decree.md`` file.
    """
    # next() evaluates its default eagerly, so a bare ``events[-1]`` raised
    # IndexError on an empty list before the placeholder branch could run.
    fallback_event = events[-1] if events else None
    verdict_event = next((event for event in events if event.verdict), fallback_event)
    verdict = verdict_event.verdict if verdict_event is not None else None
    path = Path(tempfile.gettempdir()) / "sovereign_bench_decree.md"
    if verdict is None:
        text = "# Sovereign Bench Decree\n\nNo verdict was recorded."
    else:
        text = (
            "# Sovereign Bench Decree\n\n"
            f"## Finding\n{verdict.finding}\n\n"
            f"## Decree\n{verdict.decree}\n\n"
            f"## Rationale\n{verdict.rationale}\n\n"
            f"## Remedy\n{verdict.remedy}\n\n"
            f"## Uncertainty\n{verdict.uncertainty}\n"
        )
    path.write_text(text, encoding="utf-8")
    return str(path)
|
sovereign_bench/llm.py
CHANGED
|
@@ -1,209 +1,296 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
import os
|
| 4 |
-
import re
|
| 5 |
-
import time
|
| 6 |
-
from dataclasses import dataclass
|
| 7 |
-
from hashlib import sha256
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
@dataclass
|
| 11 |
-
class ModelCall:
|
| 12 |
-
model: str
|
| 13 |
-
provider: str
|
| 14 |
-
ok: bool
|
| 15 |
-
latency_ms: int
|
| 16 |
-
prompt_hash: str
|
| 17 |
-
error: str | None = None
|
| 18 |
-
requested_model: str | None = None
|
| 19 |
-
runtime: str | None = None
|
| 20 |
-
gpu: str | None = None
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
@dataclass
|
| 24 |
-
class ModelResult:
|
| 25 |
-
text: str
|
| 26 |
-
call: ModelCall
|
| 27 |
-
input_text: str = ""
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
class ModelCallError(RuntimeError):
|
| 31 |
-
pass
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
def _short_error(exc: Exception) -> str:
|
| 35 |
-
message = str(exc).replace("\n", " ").strip()
|
| 36 |
-
return f"{exc.__class__.__name__}: {message[:220]}"
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
def messages_hash(messages: list[dict[str, str]]) -> str:
|
| 40 |
-
joined = "\n".join(f"{item.get('role', '')}:{item.get('content', '')}" for item in messages)
|
| 41 |
-
return sha256(joined.encode("utf-8")).hexdigest()[:16]
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
def _prompt_from_messages(messages: list[dict[str, str]]) -> str:
|
| 45 |
-
return "\n\n".join(f"{item.get('role', 'user').upper()}:\n{item.get('content', '')}" for item in messages) + "\n\nASSISTANT:\n"
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
def _response_text(response: object) -> str:
|
| 49 |
-
choice = response.choices[0] # type: ignore[attr-defined]
|
| 50 |
-
message = choice.message
|
| 51 |
-
for attr in ("content", "reasoning_content", "reasoning"):
|
| 52 |
-
value = getattr(message, attr, None)
|
| 53 |
-
if isinstance(value, str) and value.strip():
|
| 54 |
-
return value.strip()
|
| 55 |
-
if isinstance(value, list):
|
| 56 |
-
pieces = []
|
| 57 |
-
for item in value:
|
| 58 |
-
text = getattr(item, "text", None) or (item.get("text") if isinstance(item, dict) else None)
|
| 59 |
-
if text:
|
| 60 |
-
pieces.append(str(text))
|
| 61 |
-
if pieces:
|
| 62 |
-
return " ".join(pieces).strip()
|
| 63 |
-
if hasattr(message, "model_dump"):
|
| 64 |
-
data = message.model_dump()
|
| 65 |
-
for key in ("content", "reasoning_content", "reasoning"):
|
| 66 |
-
value = data.get(key)
|
| 67 |
-
if isinstance(value, str) and value.strip():
|
| 68 |
-
return value.strip()
|
| 69 |
-
return ""
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
)
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
)
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import re
|
| 5 |
+
import time
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from hashlib import sha256
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
class ModelCall:
    """Metadata recorded for one model invocation."""

    model: str  # model identifier the call was issued against
    provider: str  # inference provider name handed to the client
    ok: bool  # True when the call completed and produced usable text
    latency_ms: int  # elapsed time of the call in milliseconds
    prompt_hash: str  # short digest of the prompt messages (see messages_hash)
    error: str | None = None  # failure description; callers read it when ok is False
    # NOTE(review): the three fields below are never set in this module;
    # presumably filled in by another runtime — confirm against callers.
    requested_model: str | None = None
    runtime: str | None = None
    gpu: str | None = None
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
class ModelResult:
    """Text produced by a model call together with the call's metadata."""

    text: str  # visible model output
    call: ModelCall  # metadata describing the call that produced the text
    input_text: str = ""  # flattened prompt; call_small_model fills this in after the call
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class ModelCallError(RuntimeError):
    """Raised when a model call fails or its output cannot be used as dialogue."""
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _short_error(exc: Exception) -> str:
    """Format *exc* as ``"ClassName: message"`` on one line.

    Newlines in the message become spaces, surrounding whitespace is dropped,
    and the message is cut to its first 220 characters.
    """
    kind = exc.__class__.__name__
    flattened = str(exc).replace("\n", " ").strip()
    clipped = flattened[:220]
    return f"{kind}: {clipped}"
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def messages_hash(messages: list[dict[str, str]]) -> str:
    """Return a 16-hex-character SHA-256 fingerprint of a chat message list.

    Each message contributes one ``role:content`` line (missing keys count as
    empty strings); the lines are newline-joined and hashed as UTF-8.
    """
    rows = [f"{entry.get('role', '')}:{entry.get('content', '')}" for entry in messages]
    digest = sha256()
    digest.update("\n".join(rows).encode("utf-8"))
    return digest.hexdigest()[:16]
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _prompt_from_messages(messages: list[dict[str, str]]) -> str:
    """Flatten chat messages into a single text-completion prompt.

    Every message becomes an upper-cased role header (``USER`` when the role
    is missing) followed by its content; sections are separated by blank
    lines and the prompt ends with an open ``ASSISTANT:`` header.
    """
    sections = []
    for entry in messages:
        header = entry.get("role", "user").upper()
        body = entry.get("content", "")
        sections.append(f"{header}:\n{body}")
    return "\n\n".join(sections) + "\n\nASSISTANT:\n"
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def _response_text(response: object) -> str:
    """Pull the first usable text out of a chat-completion response.

    Looks at the first choice's message and checks ``content``,
    ``reasoning_content`` and ``reasoning`` in that order. A field may be a
    string, or a list of parts exposing ``text`` as an attribute or dict key
    (parts are joined with single spaces). If no attribute yields text and the
    message offers ``model_dump()``, the same keys are checked in the dump.

    Returns:
        The stripped text, or ``""`` when nothing usable is found.
    """
    message = response.choices[0].message  # type: ignore[attr-defined]
    candidate_fields = ("content", "reasoning_content", "reasoning")

    for field_name in candidate_fields:
        payload = getattr(message, field_name, None)
        if isinstance(payload, str):
            if payload.strip():
                return payload.strip()
            continue
        if not isinstance(payload, list):
            continue
        fragments = []
        for part in payload:
            fragment = getattr(part, "text", None)
            if not fragment and isinstance(part, dict):
                fragment = part.get("text")
            if fragment:
                fragments.append(str(fragment))
        if fragments:
            return " ".join(fragments).strip()

    if not hasattr(message, "model_dump"):
        return ""
    dumped = message.model_dump()
    for field_name in candidate_fields:
        payload = dumped.get(field_name)
        if isinstance(payload, str) and payload.strip():
            return payload.strip()
    return ""
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# Phrases that signal the model is talking about its instructions rather than
# speaking in character.
INSTRUCTION_ECHO_RE = re.compile(
    r"(?is)\b("
    r"as requested|"
    r"first[- ]person|"
    r"pronoun|"
    r"1\s*-\s*3 sentences|"
    r"theatrical but clear|"
    r"i will speak as|"
    r"i will now (?:announce|answer|respond|deliver|speak)|"
    r"as the assigned agent|"
    r"the task"
    r")\b"
)


def clean_model_text(text: str) -> str:
    """Strip hidden-reasoning artefacts from raw model output.

    Removes closed ``<think>``, ``<analysis>`` and ``<reasoning>`` sections,
    keeps only what follows a ``final`` / ``assistant_final`` channel label
    when one starts a line, and drops paragraphs that echo the instructions.

    Args:
        text: Raw text returned by the model.

    Returns:
        The visible dialogue, stripped of surrounding whitespace.

    Raises:
        ModelCallError: if a ``<think>`` block is left unclosed, the output is
            analysis rather than dialogue, every paragraph echoes the
            instructions, or nothing visible remains.
    """
    cleaned = re.sub(r"(?is)<think>.*?</think>", "", text).strip()
    if re.search(r"(?i)<think>", cleaned):
        raise ModelCallError("model returned unclosed hidden reasoning")
    cleaned = re.sub(r"(?is)<analysis>.*?</analysis>", "", cleaned).strip()
    cleaned = re.sub(r"(?is)<reasoning>.*?</reasoning>", "", cleaned).strip()
    cleaned = cleaned.replace("</think>", "").strip()

    # The label is matched case-insensitively, but the case-sensitive
    # lookahead rejects a directly following lowercase letter. Without it,
    # dialogue such as "Finally, I rest." was read as the label "final" and
    # cut down to "ly, I rest.".
    channel_match = re.search(
        r"(?ms)^\s*(?i:final|assistant_final)(?![a-z])\s*:?\s*(.+)\Z",
        cleaned,
    )
    if channel_match:
        # This also covers "analysis ... final ..." output: the search finds
        # the first line starting with a final label, so a separate
        # analysis-then-final pattern could never match when this one did not.
        cleaned = channel_match.group(1).strip()
    elif re.search(r"(?im)^\s*(?:analysis|reasoning|assistant_analysis)\s*:?", cleaned):
        raise ModelCallError("model returned hidden analysis instead of courtroom dialogue")

    if re.search(r"(?i)\b(?:analysis|reasoning)\s*:", cleaned[:80]):
        raise ModelCallError("model returned hidden analysis instead of courtroom dialogue")

    # Only the opening of the text is probed for instruction echo; when found,
    # every echoing paragraph is dropped and the remainder is kept.
    if INSTRUCTION_ECHO_RE.search(cleaned[:420]):
        pieces = [piece.strip() for piece in re.split(r"\n\s*\n", cleaned) if piece.strip()]
        dialogue_pieces = [piece for piece in pieces if not INSTRUCTION_ECHO_RE.search(piece)]
        if not dialogue_pieces:
            raise ModelCallError("model echoed instructions instead of courtroom dialogue")
        cleaned = "\n\n".join(dialogue_pieces).strip()
    if not cleaned:
        raise ModelCallError("model returned no visible output")
    return cleaned
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def model_enabled() -> bool:
    """Report whether live model calls are allowed.

    Calls are disabled when the ``SOVEREIGN_DISABLE_LIVE_MODELS`` environment
    variable is ``1``, ``true`` or ``yes`` (case-insensitive).
    """
    kill_switch = os.getenv("SOVEREIGN_DISABLE_LIVE_MODELS", "").lower()
    disabling_values = {"1", "true", "yes"}
    return kill_switch not in disabling_values
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def call_hf_chat_model(
    *,
    model: str,
    messages: list[dict[str, str]],
    provider: str = "auto",
    max_tokens: int = 140,
    temperature: float = 0.45,
) -> ModelResult:
    """Call a Hugging Face hosted model and return its cleaned dialogue.

    Makes up to two attempts: first with *messages* as given, then with an
    extra user message demanding dialogue-only output. Each attempt tries the
    chat-completion endpoint and, if that raises, falls back to plain text
    generation on a flattened prompt.

    Args:
        model: Model identifier passed to ``InferenceClient``.
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
        provider: Inference provider name passed to ``InferenceClient``.
        max_tokens: Generation cap for either endpoint.
        temperature: Sampling temperature for either endpoint.

    Returns:
        A ``ModelResult`` with the cleaned text and an ``ok=True`` call record.

    Raises:
        ModelCallError: if ``HF_TOKEN`` is unset or live models are disabled,
            or if anything inside the call sequence fails (the original
            exception is chained).
    """
    prompt_hash = messages_hash(messages)
    started = time.perf_counter()
    token = os.getenv("HF_TOKEN")
    if not token or not model_enabled():
        raise ModelCallError("HF_TOKEN missing or live models disabled")

    try:
        # Imported lazily, so a missing package surfaces as ModelCallError below.
        from huggingface_hub import InferenceClient

        client = InferenceClient(model=model, provider=provider, token=token, timeout=45.0)
        # Second-attempt prompt: same conversation plus a corrective nudge.
        retry_messages = messages + [
            {
                "role": "user",
                "content": (
                    "Your previous response did not include visible courtroom dialogue. "
                    "Return only the final first-person spoken dialogue now, as the assigned agent. "
                    "Do not mention prompts, tasks, requirements, pronouns, sentence counts, or that you are following instructions. "
                    "Do not include <think>, analysis, reasoning, markdown, narration, or notes. /no_think"
                ),
            }
        ]
        last_error: Exception | None = None
        text = ""
        for attempt_messages in (messages, retry_messages):
            try:
                response = client.chat_completion(
                    messages=attempt_messages,
                    max_tokens=max_tokens,
                    temperature=temperature,
                    top_p=0.9,
                )
                raw_text = _response_text(response)
            except Exception as chat_exc:
                # Chat endpoint failed: retry the same attempt as raw text
                # generation. An error raised here is not caught in the loop;
                # it propagates to the outer handler.
                prompt = _prompt_from_messages(attempt_messages)
                generated = client.text_generation(
                    prompt,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    top_p=0.9,
                    return_full_text=False,
                )
                raw_text = str(generated).strip()
                if not raw_text:
                    # The fallback produced nothing; report the chat failure.
                    raise chat_exc
            try:
                text = clean_model_text(raw_text)
                break
            except ModelCallError as exc:
                # Output was unusable; remember why and move to the next attempt.
                last_error = exc
        if not text:
            raise last_error or RuntimeError("empty model response")
        return ModelResult(
            text=text,
            call=ModelCall(
                model=model,
                provider=provider,
                ok=True,
                latency_ms=int((time.perf_counter() - started) * 1000),
                prompt_hash=prompt_hash,
            ),
        )
    except Exception as exc:
        # Single failure surface for callers: everything becomes ModelCallError
        # carrying the elapsed time and a shortened cause.
        raise ModelCallError(
            f"{model} via {provider} failed after {int((time.perf_counter() - started) * 1000)}ms: {_short_error(exc)}"
        ) from exc
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def call_small_model(
    *,
    agent: str,
    role: str,
    model: str,
    case_summary: str,
    task: str,
    evidence_summary: str,
    trial_history: str = "",
    persona: str = "",
    objective: str = "",
    provider: str = "auto",
    max_tokens: int = 120,
) -> ModelResult:
    """Prompt one courtroom agent and return the model's result.

    The role messages are assembled with ``build_role_messages``, sent through
    ``call_hf_chat_model``, and the flattened prompt is then stored on the
    returned result's ``input_text`` attribute.
    """
    prompt_fields = {
        "agent": agent,
        "role": role,
        "case_summary": case_summary,
        "task": task,
        "evidence_summary": evidence_summary,
        "trial_history": trial_history,
        "persona": persona,
        "objective": objective,
    }
    role_messages = build_role_messages(**prompt_fields)
    outcome = call_hf_chat_model(
        model=model,
        provider=provider,
        messages=role_messages,
        max_tokens=max_tokens,
    )
    # Record exactly what was sent so callers can show the prompt next to the reply.
    outcome.input_text = _prompt_from_messages(role_messages)
    return outcome
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def build_role_messages(
    *,
    agent: str,
    role: str,
    case_summary: str,
    task: str,
    evidence_summary: str,
    trial_history: str = "",
    persona: str = "",
    objective: str = "",
) -> list[dict[str, str]]:
    """Assemble the system and user chat messages for one courtroom agent.

    The system message is a shared preamble plus a role-specific addendum.
    Every role except ``"juror"`` is additionally told to answer with spoken
    first-person dialogue; jurors are told to return only a JSON object.
    Empty ``persona``, ``objective`` and ``trial_history`` values are left out
    of the user message entirely.

    Returns:
        A two-element list: the system message followed by the user message.
    """
    advocate_note = (
        " You are a lawyer trying to win for your side. Use the evidence, the other side's claims, "
        "and the trial record to make the strongest fair argument available."
    )
    judge_note = (
        " You are a fair judge. Consider both sides, the evidence, and the trial record. "
        "At verdict, announce and contextualize the jury result rather than replacing it with your own preferred outcome."
    )
    role_notes = {
        "claimant advocate": advocate_note,
        "respondent advocate": advocate_note,
        "judge": judge_note,
        "verdict writer": judge_note,
        "juror": (
            " You are an individual juror. Decide through your named worldview and the trial transcript, "
            "not a generic juror role. Output only valid JSON for your vote."
        ),
        "juror panel": " You speak for the jury panel procedurally; do not reveal votes before deliberation.",
        "clerk": " You are a procedural courtroom role; present the record clearly without deciding the verdict.",
    }

    system_parts = [
        "You are one AI character in Sovereign Bench, a miniature virtual courtroom. "
        "Stay fully in character as the assigned Agent and Role. "
        "Use the case facts and evidence provided below; cite evidence IDs when relevant. "
        "Do not claim certainty beyond the record. Do not add markdown. "
        "Never reveal hidden reasoning, analysis, or <think> text. "
        "Do not use thinking mode.",
        # Roles without an entry get no addendum beyond the shared preamble.
        role_notes.get(role, ""),
    ]

    if role == "juror":
        # Jurors are the only role that votes, so they answer with JSON instead of speech.
        answer_instruction = (
            "Return only the requested JSON object. "
            "Do not add dialogue, markdown, or commentary."
        )
    else:
        system_parts.append(
            " Output only the words this character says aloud in court. "
            "Use I, me, my, we, or our naturally when the role calls for it. "
            "Do not narrate about yourself in the third person. Do not summarize what the agent would say."
        )
        answer_instruction = (
            f"Speak as {agent}. Give only the in-scene court line, 1-3 concise sentences."
        )

    optional_sections = "".join(
        f"\n{heading}:\n{value}\n"
        for heading, value in (
            ("Persona / worldview", persona),
            ("Objective", objective),
            ("Trial history so far", trial_history),
        )
        if value
    )
    user = (
        f"Agent: {agent}\nRole: {role}\nCase:\n{case_summary}\n\n"
        f"Evidence:\n{evidence_summary}\n"
        f"{optional_sections}\nTask: {task}\n"
        f"{answer_instruction}\n/no_think"
    )
    return [
        {"role": "system", "content": "".join(system_parts)},
        {"role": "user", "content": user},
    ]
|
sovereign_bench/models.py
CHANGED
|
@@ -1,86 +1,88 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
from typing import Literal
|
| 4 |
-
|
| 5 |
-
from pydantic import BaseModel, Field
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
TrialPhase = Literal[
|
| 9 |
-
"intake",
|
| 10 |
-
"claims",
|
| 11 |
-
"opening",
|
| 12 |
-
"evidence",
|
| 13 |
-
"questions",
|
| 14 |
-
"deliberation",
|
| 15 |
-
"verdict",
|
| 16 |
-
"appeal",
|
| 17 |
-
]
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
class EvidenceItem(BaseModel):
|
| 21 |
-
id: str
|
| 22 |
-
title: str
|
| 23 |
-
source: str
|
| 24 |
-
excerpt: str
|
| 25 |
-
supports: Literal["claimant", "respondent", "mixed", "context"]
|
| 26 |
-
reliability: float = Field(ge=0.0, le=1.0)
|
| 27 |
-
note: str
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
class CasePacket(BaseModel):
|
| 31 |
-
id: str
|
| 32 |
-
title: str
|
| 33 |
-
subtitle: str
|
| 34 |
-
claimant: str
|
| 35 |
-
respondent: str
|
| 36 |
-
charge: str
|
| 37 |
-
setting: str
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Literal
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel, Field
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Closed set of phase names; used as the type of TrialEvent.phase.
TrialPhase = Literal[
    "intake",
    "claims",
    "opening",
    "evidence",
    "questions",
    "deliberation",
    "verdict",
    "appeal",
]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# One exhibit in a case packet. All fields are required.
class EvidenceItem(BaseModel):
    # Exhibit identifier (tests and retrieval use forms such as "SOC-E1" / "WEB-E1").
    id: str
    title: str
    source: str
    # Text of the exhibit.
    excerpt: str
    # Which side the exhibit favours; restricted to these four values.
    supports: Literal["claimant", "respondent", "mixed", "context"]
    # Validated to lie in the closed range 0.0-1.0.
    reliability: float = Field(ge=0.0, le=1.0)
    note: str
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Description of a case: the parties, the charge, each side's claim, and the evidence list.
class CasePacket(BaseModel):
    id: str
    title: str
    subtitle: str
    claimant: str
    respondent: str
    charge: str
    setting: str
    # Optional background text; the only field with a default (empty string).
    context: str = ""
    claimant_claim: str
    respondent_claim: str
    source_note: str
    evidence: list[EvidenceItem]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# Parameters for starting a trial; every field has a default.
class TrialRequest(BaseModel):
    # Case to run; defaults to "socrates".
    case_id: str = "socrates"
    # NOTE(review): presumably the inputs for a live-search case - confirm against the engine.
    search_query: str = ""
    hypothetical: str = ""
    # Optional caller-supplied case packet.
    custom_case: CasePacket | None = None
    # Pacing option; restricted to these three values.
    speed: Literal["swift", "measured", "ceremonial"] = "swift"
    # NOTE(review): meaning is not visible in this module - confirm against the engine/UI.
    mind_layer: bool = True
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# A single agent's contribution within a trial event.
class AgentTurn(BaseModel):
    agent: str
    role: str
    content: str
    model: str
    # Validated to lie in the closed range 0.0-1.0.
    confidence: float = Field(ge=0.0, le=1.0)
    # NOTE(review): presumably the prompt text sent to the model - confirm against the engine.
    input: str = ""
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# One juror's vote together with its stated reason and the exhibits it cites.
class JurorVote(BaseModel):
    juror: str
    # Optional persona text; defaults to an empty string.
    persona: str = ""
    # Restricted to these three outcomes.
    vote: Literal["liable", "not_liable", "uncertain"]
    reason: str
    evidence_ids: list[str]
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# Final outcome of a trial. All fields are required.
class Verdict(BaseModel):
    # Unlike JurorVote.vote, the finding may also be "mixed".
    finding: Literal["liable", "not_liable", "mixed", "uncertain"]
    decree: str
    rationale: str
    evidence_ids: list[str]
    uncertainty: str
    remedy: str
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# One emitted step of a trial: a phase, a title/body, and optional attached data.
class TrialEvent(BaseModel):
    phase: TrialPhase
    title: str
    body: str
    # The collections below default to fresh empty containers per instance.
    turns: list[AgentTurn] = Field(default_factory=list)
    evidence: list[EvidenceItem] = Field(default_factory=list)
    votes: list[JurorVote] = Field(default_factory=list)
    # Absent unless the event carries a verdict.
    verdict: Verdict | None = None
    # Free-form dictionary; its keys are not constrained here.
    trace: dict = Field(default_factory=dict)
|
sovereign_bench/retrieval.py
CHANGED
|
@@ -1,70 +1,70 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
import re
|
| 4 |
-
from urllib.parse import quote_plus
|
| 5 |
-
|
| 6 |
-
import httpx
|
| 7 |
-
|
| 8 |
-
from .models import CasePacket, EvidenceItem
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
def _plain_text(html: str) -> str:
|
| 12 |
-
html = re.sub(r"(?is)<script.*?</script>|<style.*?</style>", " ", html)
|
| 13 |
-
html = re.sub(r"(?s)<[^>]+>", " ", html)
|
| 14 |
-
html = re.sub(r"\s+", " ", html)
|
| 15 |
-
return html.strip()
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
def build_live_case(query: str, hypothetical: str = "") -> CasePacket | None:
|
| 19 |
-
clean_query = " ".join(query.split())
|
| 20 |
-
if len(clean_query) < 8:
|
| 21 |
-
return None
|
| 22 |
-
|
| 23 |
-
try:
|
| 24 |
-
url = f"https://r.jina.ai/http://r.jina.ai/http://duckduckgo.com/html/?q={quote_plus(clean_query)}"
|
| 25 |
-
response = httpx.get(url, timeout=8.0, follow_redirects=True)
|
| 26 |
-
text = _plain_text(response.text)
|
| 27 |
-
except Exception:
|
| 28 |
-
return None
|
| 29 |
-
|
| 30 |
-
snippets = [
|
| 31 |
-
segment.strip()
|
| 32 |
-
for segment in re.split(r"(?<=[.!?])\s+", text)
|
| 33 |
-
if 80 <= len(segment.strip()) <= 320 and "http" not in segment[:20].lower()
|
| 34 |
-
]
|
| 35 |
-
unique: list[str] = []
|
| 36 |
-
for snippet in snippets:
|
| 37 |
-
if snippet.lower() not in {item.lower() for item in unique}:
|
| 38 |
-
unique.append(snippet)
|
| 39 |
-
if len(unique) == 4:
|
| 40 |
-
break
|
| 41 |
-
|
| 42 |
-
if len(unique) < 2:
|
| 43 |
-
return None
|
| 44 |
-
|
| 45 |
-
evidence = [
|
| 46 |
-
EvidenceItem(
|
| 47 |
-
id=f"WEB-E{i}",
|
| 48 |
-
title=f"Retrieved fragment {i}",
|
| 49 |
-
source=f"Web retrieval for: {clean_query}",
|
| 50 |
-
excerpt=snippet,
|
| 51 |
-
supports="context" if i == 1 else "mixed",
|
| 52 |
-
reliability=max(0.45, 0.72 - (i * 0.06)),
|
| 53 |
-
note="Live retrieval fragment; the court treats it as context until corroborated.",
|
| 54 |
-
)
|
| 55 |
-
for i, snippet in enumerate(unique, start=1)
|
| 56 |
-
]
|
| 57 |
-
framing = hypothetical.strip() or "the parties dispute how the retrieved facts should be interpreted"
|
| 58 |
-
return CasePacket(
|
| 59 |
-
id="live",
|
| 60 |
-
title=f"Live Search Tribunal: {clean_query[:58]}",
|
| 61 |
-
subtitle="A search-fed miniature proceeding with uncertainty kept visible.",
|
| 62 |
-
claimant="The Search Record",
|
| 63 |
-
respondent="The Counter-Interpretation",
|
| 64 |
-
charge=f"Whether {framing}.",
|
| 65 |
-
setting="A temporary court assembled from retrieved public web fragments.",
|
| 66 |
-
claimant_claim="The retrieved record supports a coherent claim that should be credited.",
|
| 67 |
-
respondent_claim="The retrieved record is incomplete, ambiguous, or overread by the claimant.",
|
| 68 |
-
source_note="Live web retrieval via public search snippets. Treat as unverified context, not ground truth.",
|
| 69 |
-
evidence=evidence,
|
| 70 |
-
)
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from urllib.parse import quote_plus
|
| 5 |
+
|
| 6 |
+
import httpx
|
| 7 |
+
|
| 8 |
+
from .models import CasePacket, EvidenceItem
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _plain_text(html: str) -> str:
    """Reduce an HTML string to whitespace-normalised plain text.

    Script and style elements are dropped together with their contents, every
    remaining tag is replaced by a space, and runs of whitespace are collapsed
    to a single space before the result is trimmed.
    """
    substitutions = (
        r"(?is)<script.*?</script>|<style.*?</style>",  # whole script/style elements
        r"(?s)<[^>]+>",  # any remaining tag
        r"\s+",  # whitespace runs
    )
    cleaned = html
    for pattern in substitutions:
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def build_live_case(query: str, hypothetical: str = "") -> CasePacket | None:
    """Build a temporary case packet from live web-search fragments.

    Args:
        query: Search text. Whitespace is normalised first; queries shorter
            than 8 characters after normalisation are rejected.
        hypothetical: Optional framing used in the charge. When blank, a
            generic "parties dispute..." framing is used instead.

    Returns:
        A ``CasePacket`` with id ``"live"`` holding two to four evidence
        fragments, or ``None`` when the query is too short, the fetch fails
        or returns a non-success HTTP status, or fewer than two usable
        fragments are found.
    """
    clean_query = " ".join(query.split())
    if len(clean_query) < 8:
        return None

    try:
        # NOTE(review): the "r.jina.ai/http://" reader prefix appears twice in this
        # URL; left unchanged here - confirm whether the double wrap is intentional.
        url = f"https://r.jina.ai/http://r.jina.ai/http://duckduckgo.com/html/?q={quote_plus(clean_query)}"
        response = httpx.get(url, timeout=8.0, follow_redirects=True)
        # Without this check, a rate-limit or error page body would be sliced
        # into "evidence" fragments as if it were search output.
        response.raise_for_status()
        text = _plain_text(response.text)
    except Exception:
        # Retrieval is best-effort: any failure means "no live case".
        return None

    unique: list[str] = []
    seen: set[str] = set()
    for segment in re.split(r"(?<=[.!?])\s+", text):
        snippet = segment.strip()
        # Keep sentence-sized fragments and skip ones that open with a URL.
        if not 80 <= len(snippet) <= 320 or "http" in segment[:20].lower():
            continue
        key = snippet.lower()
        if key in seen:
            continue
        seen.add(key)
        unique.append(snippet)
        if len(unique) == 4:
            break

    if len(unique) < 2:
        return None

    evidence = [
        EvidenceItem(
            id=f"WEB-E{i}",
            title=f"Retrieved fragment {i}",
            source=f"Web retrieval for: {clean_query}",
            excerpt=snippet,
            supports="context" if i == 1 else "mixed",
            # Later fragments are scored lower, with a floor of 0.45.
            reliability=max(0.45, 0.72 - (i * 0.06)),
            note="Live retrieval fragment; the court treats it as context until corroborated.",
        )
        for i, snippet in enumerate(unique, start=1)
    ]
    framing = hypothetical.strip() or "the parties dispute how the retrieved facts should be interpreted"
    return CasePacket(
        id="live",
        title=f"Live Search Tribunal: {clean_query[:58]}",
        subtitle="A search-fed miniature proceeding with uncertainty kept visible.",
        claimant="The Search Record",
        respondent="The Counter-Interpretation",
        charge=f"Whether {framing}.",
        setting="A temporary court assembled from retrieved public web fragments.",
        claimant_claim="The retrieved record supports a coherent claim that should be credited.",
        respondent_claim="The retrieved record is incomplete, ambiguous, or overread by the claimant.",
        source_note="Live web retrieval via public search snippets. Treat as unverified context, not ground truth.",
        evidence=evidence,
    )
|
tests/test_cases.py
CHANGED
|
@@ -1,8 +1,16 @@
|
|
| 1 |
-
from sovereign_bench.cases import CASES
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
def test_cached_cases_have_evidence():
|
| 5 |
-
assert {"socrates", "barnaby"} <= set(CASES)
|
| 6 |
-
for case in CASES.values():
|
| 7 |
-
assert len(case.evidence) >= 4
|
| 8 |
-
assert all(item.id and item.excerpt for item in case.evidence)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sovereign_bench.cases import CASES
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def test_cached_cases_have_evidence():
    """The bundled cases exist and each carries at least four populated exhibits."""
    required_ids = {"socrates", "greg", "barnaby"}
    assert required_ids.issubset(CASES)
    for packet in CASES.values():
        exhibits = packet.evidence
        assert len(exhibits) >= 4
        for exhibit in exhibits:
            assert exhibit.id and exhibit.excerpt
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def test_demo_cases_have_book_context_and_three_items_per_side():
    """Demo cases carry context text and at least three exhibits for each side."""
    for case_id in ("socrates", "greg"):
        packet = CASES[case_id]
        assert packet.context
        for side in ("claimant", "respondent"):
            backing = sum(1 for item in packet.evidence if item.supports == side)
            assert backing >= 3
|
tests/test_engine.py
CHANGED
|
@@ -1,149 +1,326 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import re
|
| 3 |
-
|
| 4 |
-
import pytest
|
| 5 |
-
|
| 6 |
-
from sovereign_bench.engine import JUDGE_NAME, JUROR_PERSONAS, RequiredModelError, run_trial
|
| 7 |
-
from sovereign_bench.llm import ModelCall, ModelResult
|
| 8 |
-
from sovereign_bench.models import TrialRequest
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
def
|
| 12 |
-
evidence_ids = re.findall(r"^([A-Z]+-
|
| 13 |
-
|
| 14 |
-
return json.dumps(
|
| 15 |
-
{
|
| 16 |
-
"
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
"
|
| 56 |
-
JUDGE_NAME,
|
| 57 |
-
"
|
| 58 |
-
"
|
| 59 |
-
"
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
assert [event.
|
| 71 |
-
assert [
|
| 72 |
-
assert events[0].turns[0].input
|
| 73 |
-
assert
|
| 74 |
-
assert events[-1].
|
| 75 |
-
assert
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
vote = event.
|
| 94 |
-
assert vote.
|
| 95 |
-
assert vote.
|
| 96 |
-
assert
|
| 97 |
-
assert vote.reason
|
| 98 |
-
assert vote.evidence_ids
|
| 99 |
-
|
| 100 |
-
final = events[-1]
|
| 101 |
-
assert final.phase == "verdict"
|
| 102 |
-
assert [vote.juror for vote in final.votes] == list(JUROR_PERSONAS)
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
def
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
)
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
import pytest
|
| 5 |
+
|
| 6 |
+
from sovereign_bench.engine import JUDGE_NAME, JUROR_PERSONAS, RequiredModelError, run_trial, stream_trial
|
| 7 |
+
from sovereign_bench.llm import ModelCall, ModelResult, build_role_messages, clean_model_text
|
| 8 |
+
from sovereign_bench.models import CasePacket, EvidenceItem, TrialRequest
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _juror_json(kwargs, vote: str = "liable") -> str:
    """Return a JSON juror ballot built from a model-runner kwargs dict.

    The ballot cites the first exhibit ID found at the start of a line in
    ``kwargs["evidence_summary"]``, falling back to ``"SOC-E1"`` when none
    is present.
    """
    first_match = re.search(r"^([A-Z]+-[A-Z]\d+):", kwargs["evidence_summary"], flags=re.M)
    exhibit = first_match.group(1) if first_match else "SOC-E1"
    juror = kwargs["agent"]
    persona = kwargs["persona"]
    ballot = {
        "juror": juror,
        "persona": persona,
        "vote": vote,
        "reason": f"{juror} applies {persona} to exhibit {exhibit}.",
        "evidence_ids": [exhibit],
    }
    return json.dumps(ballot)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def fake_model_runner(**kwargs):
    """Stand-in model runner that answers without calling a real model.

    Jurors get a JSON ballot: the first four names in ``JUROR_PERSONAS`` vote
    "liable", the rest "not_liable". Every other role gets a line that echoes
    the agent name and task.
    """
    if kwargs["role"] == "juror":
        seat = list(JUROR_PERSONAS).index(kwargs["agent"])
        text = _juror_json(kwargs, vote="liable" if seat < 4 else "not_liable")
    else:
        text = f"{kwargs['agent']} responds to: {kwargs['task']}"

    prompt = "\n".join(
        [
            "SYSTEM:",
            "Fake live model for tests.",
            "",
            "USER:",
            f"Agent: {kwargs['agent']}",
            f"Role: {kwargs['role']}",
            f"Persona: {kwargs.get('persona', '')}",
            f"Objective: {kwargs.get('objective', '')}",
            f"History: {kwargs.get('trial_history', '')}",
            f"Task: {kwargs['task']}",
            "",
            "ASSISTANT:",
            "",
        ]
    )
    call_record = ModelCall(
        model=kwargs["model"],
        provider=kwargs.get("provider", "test"),
        ok=True,
        latency_ms=1,
        prompt_hash="test-prompt",
    )
    return ModelResult(text=text, input_text=prompt, call=call_record)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def test_cached_cases_emit_sequential_speaker_order():
    """Cached cases produce one speaker per event in the expected courtroom order."""
    advocates = ["Mike OSS", "Harvey Vector"]
    expected_speakers = [
        "Clerk Meridian",
        JUDGE_NAME,
        *advocates,
        JUDGE_NAME,
        *advocates,
        "Nemotron Jury",
        *JUROR_PERSONAS,
        JUDGE_NAME,
    ]
    for case_id in ("socrates", "barnaby"):
        events = run_trial(TrialRequest(case_id=case_id), model_runner=fake_model_runner)
        opening_turn = events[0].turns[0]
        closing = events[-1]

        speakers = [event.turns[0].agent for event in events if event.turns]
        assert speakers == expected_speakers

        # The evidence record is presented without any agent speaking.
        evidence_event = next(event for event in events if event.phase == "evidence")
        assert evidence_event.title == "The Evidence Record"
        assert evidence_event.turns == []

        assert sum(1 for event in events if event.phase == "deliberation") == 7
        assert opening_turn.input
        assert "SYSTEM:" in opening_turn.input
        assert closing.verdict is not None
        assert closing.votes and len(closing.votes) == 6
        assert "uncertainty" in closing.verdict.uncertainty.lower()
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def test_no_event_contains_both_lawyers_speaking_together():
    """No single event carries turns from both advocates."""
    lawyers = {"Mike OSS", "Harvey Vector"}
    for event in run_trial(TrialRequest(case_id="socrates"), model_runner=fake_model_runner):
        speakers = {turn.agent for turn in event.turns}
        assert not lawyers <= speakers
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def test_juror_vote_events_have_fixed_personas_and_evidence():
    """Each juror event carries that juror's own vote, persona, reason and exhibits."""
    events = run_trial(TrialRequest(case_id="socrates"), model_runner=fake_model_runner)
    juror_events = [
        event
        for event in events
        if event.turns and event.turns[0].agent in JUROR_PERSONAS
    ]
    allowed_votes = {"liable", "not_liable", "uncertain"}

    assert len(juror_events) == 6
    for event in juror_events:
        speaker_turn = event.turns[0]
        ballot = event.votes[0]
        assert ballot.juror == speaker_turn.agent
        assert ballot.persona == JUROR_PERSONAS[ballot.juror]
        assert ballot.vote in allowed_votes
        assert speaker_turn.content.startswith("I vote ")
        assert ballot.reason
        assert ballot.evidence_ids

    closing = events[-1]
    assert closing.phase == "verdict"
    assert [ballot.juror for ballot in closing.votes] == list(JUROR_PERSONAS)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def test_jurors_are_called_independently_with_personas_and_trial_history():
    """Every juror is prompted separately with their persona and the prior record."""
    recorded = []

    def recording_runner(**kwargs):
        recorded.append(dict(kwargs))
        return fake_model_runner(**kwargs)

    run_trial(TrialRequest(case_id="socrates"), model_runner=recording_runner)

    juror_calls = [entry for entry in recorded if entry["role"] == "juror"]
    assert [entry["agent"] for entry in juror_calls] == list(JUROR_PERSONAS)
    assert len(juror_calls) == 6
    for entry in juror_calls:
        history = entry["trial_history"]
        assert entry["persona"] == JUROR_PERSONAS[entry["agent"]]
        assert "Claimant Opening" in history
        assert "Respondent Opening" in history
        assert "The Evidence Record" in history
        assert "historical worldview" in entry["objective"]
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def test_lawyers_and_judge_receive_trial_history_and_objectives():
    """Advocates and the verdict writer are prompted with history and side-specific objectives."""
    recorded = []

    def recording_runner(**kwargs):
        recorded.append(dict(kwargs))
        return fake_model_runner(**kwargs)

    def first_call(matches):
        # First recorded call satisfying the predicate, in call order.
        return next(entry for entry in recorded if matches(entry))

    run_trial(TrialRequest(case_id="socrates"), model_runner=recording_runner)

    claimant_answer = first_call(
        lambda entry: entry["agent"] == "Mike OSS" and "hinge question" in entry["task"]
    )
    respondent_answer = first_call(
        lambda entry: entry["agent"] == "Harvey Vector" and "hinge question" in entry["task"]
    )
    verdict_call = first_call(lambda entry: entry["role"] == "verdict writer")

    assert "The Hinge Question" in claimant_answer["trial_history"]
    assert "The Hinge Question" in respondent_answer["trial_history"]
    assert "most favorable to the claimant" in claimant_answer["objective"]
    assert "most favorable to the respondent" in respondent_answer["objective"]
    for name in JUROR_PERSONAS:
        assert name in verdict_call["trial_history"]
    assert "do not override the jury" in verdict_call["objective"]
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def test_custom_case_context_and_evidence_reach_lawyer_prompts():
    """A user-supplied case's context and exhibit IDs appear in the claimant advocate's prompt inputs."""
    garage_text = EvidenceItem(
        id="CUS-F1",
        title="Garage Text",
        source="Custom",
        excerpt="The respondent asked to enter the garage.",
        supports="claimant",
        reliability=0.65,
        note="Supports access.",
    )
    neighbor_sighting = EvidenceItem(
        id="CUS-A1",
        title="Neighbor Sighting",
        source="Custom",
        excerpt="A neighbor saw the bicycle later that day.",
        supports="respondent",
        reliability=0.65,
        note="Supports alternative timing.",
    )
    packet = CasePacket(
        id="custom",
        title="Custom Trial",
        subtitle="Entered by user.",
        claimant="Claimant",
        respondent="Respondent",
        charge="Whether the custom record favors the claimant.",
        setting="A custom courtroom.",
        context="A bicycle disappeared after a disputed garage visit.",
        claimant_claim="The claimant says the visit explains the missing bicycle.",
        respondent_claim="The respondent says the timing and evidence are ambiguous.",
        source_note="Custom test packet.",
        evidence=[garage_text, neighbor_sighting],
    )
    recorded = []

    def recording_runner(**kwargs):
        recorded.append(dict(kwargs))
        return fake_model_runner(**kwargs)

    run_trial(TrialRequest(case_id="custom", custom_case=packet), model_runner=recording_runner)

    claimant_opening = next(
        entry
        for entry in recorded
        if entry["agent"] == "Mike OSS" and entry["role"] == "claimant advocate"
    )
    assert "A bicycle disappeared" in claimant_opening["case_summary"]
    for exhibit_id in ("CUS-F1", "CUS-A1"):
        assert exhibit_id in claimant_opening["evidence_summary"]
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def test_jury_contract_uses_public_history_personas():
    """The judge name and the juror-to-worldview mapping match the fixed contract."""
    expected_personas = {
        "Karl Marx": "class power, material conditions, exploitation, institutional incentives",
        "John Stuart Mill": "liberty, harm principle, utility, individual rights",
        "Confucius": "social harmony, role duty, ritual order, moral cultivation",
        "Cleopatra VII": "sovereign pragmatism, diplomacy, survival, legitimacy under pressure",
        "Niccolo Machiavelli": "political realism, stability, power, consequences over ideals",
        "Jensen Huang": "technological optimism, operator mindset, systems thinking, innovation tradeoffs",
    }
    assert JUDGE_NAME == "Marcus Aurelius"
    assert JUROR_PERSONAS == expected_personas
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def test_role_prompt_requires_first_person_in_character_speech():
    """An advocate prompt demands spoken, in-character output and includes the evidence."""
    messages = build_role_messages(
        agent="Harvey Vector",
        role="respondent advocate",
        case_summary="A short case summary.",
        evidence_summary="SOC-E1: A record excerpt.",
        task="Answer the bench for the respondent.",
    )
    system = messages[0]["content"]
    user = messages[1]["content"]

    required_in_system = (
        "Stay fully in character as the assigned Agent and Role.",
        "Output only the words this character says aloud in court.",
        "Do not narrate about yourself in the third person.",
        "Use the case facts and evidence provided below",
    )
    required_in_user = (
        "Speak as Harvey Vector.",
        "Give only the in-scene court line",
        "SOC-E1",
    )
    for fragment in required_in_system:
        assert fragment in system
    for fragment in required_in_user:
        assert fragment in user
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def test_juror_vote_prompt_uses_persona_history_and_json_contract():
    """A juror prompt carries persona and history and asks for JSON rather than speech."""
    marx_persona = JUROR_PERSONAS["Karl Marx"]
    messages = build_role_messages(
        agent="Karl Marx",
        role="juror",
        case_summary="A short case summary.",
        evidence_summary="SOC-E1: A record excerpt.",
        trial_history="Mike OSS argued from SOC-E1.",
        persona=marx_persona,
        objective="Vote as Karl Marx would after watching the trial.",
        task="Return one juror vote as JSON.",
    )
    system = messages[0]["content"]
    user = messages[1]["content"]

    # Jurors must not receive the spoken-dialogue instruction.
    assert "Output only the words this character says aloud in court." not in system
    assert "You are an individual juror." in system
    for fragment in (
        marx_persona,
        "Mike OSS argued from SOC-E1.",
        "Return only the requested JSON object.",
    ):
        assert fragment in user
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def test_model_cleaner_extracts_final_speech_after_analysis_channel():
    """Only the text after the final marker survives cleaning."""
    raw_output = (
        "analysis\nI should reason about the case first.\n\nfinal\nI stand for the respondent, and SOC-E1 leaves doubt."
    )
    cleaned = clean_model_text(raw_output)

    assert cleaned == "I stand for the respondent, and SOC-E1 leaves doubt."
    assert "analysis" not in cleaned.lower()
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def test_model_cleaner_rejects_visible_analysis_without_final_speech():
    """A reply that is only leaked analysis makes the stream raise RequiredModelError."""

    def analysis_runner(**kwargs):
        leaked_call = ModelCall(
            model=kwargs["model"],
            provider=kwargs.get("provider", "test"),
            ok=True,
            latency_ms=1,
            prompt_hash="test-prompt",
        )
        return ModelResult(
            text="analysis: I should think through the case before answering.",
            input_text="SYSTEM:\nanalysis leak",
            call=leaked_call,
        )

    with pytest.raises(RequiredModelError):
        stream = stream_trial(TrialRequest(case_id="socrates"), model_runner=analysis_runner)
        next(stream)
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
def test_model_cleaner_removes_instruction_echo_when_dialogue_remains():
    """Echoed instructions are stripped when a real spoken line follows them."""
    echoed_preamble = (
        "I will now announce the case as requested, while maintaining the theatrical but clear tone required. "
        "I will speak as Clerk Meridian in first person, starting with a pronoun.\n\n"
    )
    spoken_line = "I call The Polis v. Socrates before this court."

    assert clean_model_text(echoed_preamble + spoken_line) == spoken_line
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
def test_model_cleaner_rejects_instruction_echo_without_dialogue():
    """Output that is only echoed instructions raises an "echoed instructions" error."""
    echo_only = (
        "I will now announce the case as requested, while maintaining the theatrical but clear tone required. "
        "I will speak as Clerk Meridian in first person, starting with a pronoun."
    )

    with pytest.raises(Exception, match="echoed instructions"):
        clean_model_text(echo_only)
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
def test_required_model_failure_stops_trial_without_canned_dialogue():
    """A failed model call makes ``run_trial`` raise instead of continuing."""

    def failing_runner(**kwargs):
        # Simulate an offline provider: empty text and ok=False.
        failed_call = ModelCall(
            model=kwargs["model"],
            provider=kwargs.get("provider", "test"),
            ok=False,
            latency_ms=1,
            prompt_hash="test-prompt",
            error="offline",
        )
        return ModelResult(text="", input_text="SYSTEM:\nfailed", call=failed_call)

    trial_request = TrialRequest(case_id="socrates")
    with pytest.raises(RequiredModelError, match="unavailable"):
        run_trial(trial_request, model_runner=failing_runner)
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
def test_invalid_jury_output_stops_trial_without_fallback_votes():
    """Non-JSON juror output raises an "invalid JSON" error from ``run_trial``."""

    def invalid_jury_runner(**kwargs):
        # Delegate to the normal fake runner, then corrupt only juror replies.
        result = fake_model_runner(**kwargs)
        if kwargs["role"] != "juror":
            return result
        result.text = "the jury refuses structured output"
        return result

    trial_request = TrialRequest(case_id="socrates")
    with pytest.raises(RequiredModelError, match="invalid JSON"):
        run_trial(trial_request, model_runner=invalid_jury_runner)
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
def test_live_search_stops_when_query_is_weak():
    """A live trial with the one-character query "x" raises a "no fallback case" error."""
    weak_request = TrialRequest(case_id="live", search_query="x")

    with pytest.raises(RuntimeError, match="no fallback case"):
        run_trial(weak_request, model_runner=fake_model_runner)
|
tests/test_ui_rendering.py
CHANGED
|
@@ -1,283 +1,578 @@
|
|
| 1 |
-
import inspect
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
"
|
| 13 |
-
"
|
| 14 |
-
"
|
| 15 |
-
"
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
assert "
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
assert "
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
assert "
|
| 133 |
-
assert "
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
assert html.
|
| 144 |
-
assert "
|
| 145 |
-
assert
|
| 146 |
-
assert ".
|
| 147 |
-
assert ".
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
assert "
|
| 167 |
-
assert "class='
|
| 168 |
-
assert "
|
| 169 |
-
assert "puppet
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
assert "
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
assert "
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
assert "
|
| 217 |
-
assert "
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
assert
|
| 231 |
-
assert
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
assert "
|
| 254 |
-
assert "
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
assert "
|
| 260 |
-
assert "
|
| 261 |
-
assert "
|
| 262 |
-
assert "
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
def
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
assert "
|
| 281 |
-
assert "
|
| 282 |
-
assert "
|
| 283 |
-
assert "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import inspect
|
| 2 |
+
import json
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
from PIL import Image
|
| 6 |
+
|
| 7 |
+
import app
|
| 8 |
+
from sovereign_bench.models import AgentTurn, EvidenceItem, JurorVote, TrialEvent, Verdict
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# CSS class names that the lower-tab renderers must not emit.
OLD_CARD_CLASSES = ["paper-panel", "juror-panel", "mind-panel", "empty-state", "trial-downloads"]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _event_with_lower_tab_data() -> TrialEvent:
    """Build a deliberation event with one jury turn, one exhibit and one vote."""
    jury_turn = AgentTurn(
        agent="Nemotron Jury",
        role="juror panel",
        content="The jurors compare E1 and state their votes.",
        model="nvidia/Nemotron-Orchestrator-8B",
        confidence=0.84,
        input="SYSTEM:\nYou are the jury.\n\nUSER:\nWeigh E1 and explain the vote.",
    )
    exhibit = EvidenceItem(
        id="E1",
        title="Ledger entry",
        source="Archive",
        excerpt="A short exhibit excerpt.",
        supports="claimant",
        reliability=0.82,
        note="Useful but incomplete.",
    )
    marx_vote = JurorVote(
        juror="Karl Marx",
        persona=app.JUROR_PERSONAS["Karl Marx"],
        vote="liable",
        reason="The exhibit supports the claim.",
        evidence_ids=["E1"],
    )
    return TrialEvent(
        phase="deliberation",
        title="Jury weighs the record",
        body="The jury reviews the record.",
        turns=[jury_turn],
        evidence=[exhibit],
        votes=[marx_vote],
        trace={"mode": "test"},
    )
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _speaker_event(agent: str, phase: str = "questions") -> TrialEvent:
    """Build an event in ``phase`` whose only turn is spoken by ``agent``."""
    only_turn = AgentTurn(
        agent=agent,
        role="test speaker",
        content=f"{agent} has the visible floor.",
        model="test-model",
        confidence=0.9,
        input="SYSTEM:\nTest prompt.",
    )
    return TrialEvent(
        phase=phase,
        title=f"{agent} speaks",
        body="A single speaker takes the floor.",
        turns=[only_turn],
    )
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _verdict_event(finding: str = "liable") -> TrialEvent:
    """Build a verdict-phase event carrying ``finding`` and one judge turn."""
    judgment = Verdict(
        finding=finding,
        decree="The court enters the final judgment.",
        rationale="The jury majority decides the record.",
        evidence_ids=["E1"],
        uncertainty="Some uncertainty remains.",
        remedy="Record the judgment.",
    )
    judge_turn = AgentTurn(
        agent=app.JUDGE_NAME,
        role="verdict writer",
        content="The judgment of the court is guilty.",
        model="test-model",
        confidence=0.9,
        input="SYSTEM:\nAnnounce verdict.",
    )
    return TrialEvent(
        phase="verdict",
        title="The Court Announces Judgment",
        body="Judgment is announced.",
        verdict=judgment,
        turns=[judge_turn],
    )
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def test_lower_tab_renderers_emit_plain_text_classes():
    """Evidence, juror and mind renderers use drawer classes, not the old card classes."""
    event = _event_with_lower_tab_data()
    fragments = [
        app.render_evidence([]),
        app.render_evidence([event]),
        app.render_jurors([]),
        app.render_jurors([event]),
        app.render_mind([], True),
        app.render_mind([event], True),
        app.render_mind([event], False),
    ]
    markup = "\n".join(fragments)

    leaked = [class_name for class_name in OLD_CARD_CLASSES if class_name in markup]
    assert not leaked

    for expected_class in ("drawer-text-block", "drawer-empty", "mind-text"):
        assert expected_class in markup
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def test_download_controls_are_not_wired_into_app():
    """The source of ``app.build_app`` contains no download button or labels."""
    builder_source = inspect.getsource(app.build_app)

    for forbidden in ("DownloadButton", "Download decree", "Download agent trace"):
        assert forbidden not in builder_source
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def test_case_dropdown_only_exposes_demo_and_custom_cases():
    """``app.CASE_OPTIONS`` lists exactly the two demo cases and Custom, in order."""
    offered = list(app.CASE_OPTIONS)

    assert offered == ["Trial of Socrates", "Greg Heffley vs Mom", "Custom"]
    for retired_case in ("The People v. Barnaby Buttons", "Live Search Tribunal"):
        assert retired_case not in app.CASE_OPTIONS
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def test_courtroom_splits_six_jurors_between_side_benches():
    """The court markup has a left and a right jury bench holding six juror links."""
    markup = app.render_court([_event_with_lower_tab_data()], started=True)

    assert "jury-benches left" in markup
    assert "jury-benches right" in markup
    assert markup.count("<a class='juror") == 6

    # The left bench must appear before the right bench in the markup.
    left_at = markup.find("jury-benches left")
    right_at = markup.find("jury-benches right")
    assert left_at < right_at

    bench_rules = (
        ".jury-benches.left {\n left: 1%;",
        ".jury-benches.right {\n right: 1%;",
        ".jury-benches.left {\n left: .5%;",
        ".jury-benches.right {\n right: .5%;",
    )
    for rule in bench_rules:
        assert rule in app.CSS
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def test_courtroom_threads_show_model_input_output_on_hover_and_click():
    """The court markup includes tooltip input/output labels and the thread modal."""
    markup = app.render_court([_event_with_lower_tab_data()], started=True)

    expected_snippets = (
        "tooltip-io-label'>Input",
        "tooltip-io-label'>Output",
        "Click to open full thread",
        "class='ai-thread-modal'",
        "thread-block'>SYSTEM:",
        "The jurors compare E1 and state their votes.",
        "href='#ai-thread-karl-marx'",
    )
    for snippet in expected_snippets:
        assert snippet in markup
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def test_courtroom_renders_historical_judge_and_juror_assets():
    """The judge portrait and every juror name and image appear in the court markup."""
    markup = app.render_court([_event_with_lower_tab_data()], started=True)

    judge_markup = (
        "Marcus Aurelius",
        "assets/characters/marcus-aurelius.png",
        "<img class='puppet-portrait' src='/gradio_api/file=assets/characters/marcus-aurelius.png'",
    )
    for snippet in judge_markup:
        assert snippet in markup

    judge_rules = (
        ".puppet.judge::before,\n.puppet.judge::after {\n display: none;\n}",
        ".puppet.judge .mouth {\n display: none;\n}",
    )
    for rule in judge_rules:
        assert rule in app.CSS

    for juror_name, portrait in app.JUROR_IMAGES.items():
        assert juror_name in markup
        assert portrait in markup

    assert markup.count("class='juror-portrait'") == 6
    for retired_class in ("class='juror-face'", "class='juror-body'"):
        assert retired_class not in markup
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def test_courtroom_renders_foreground_fences_and_judge_table_above_characters():
    """Foreground props come after the judge in the markup, with the expected z-index rules."""
    markup = app.render_court([_event_with_lower_tab_data()], started=True)

    assert markup.count("assets/foreground/foregroundFence.png") == 2
    assert "assets/foreground/JudgeTable.png" in markup

    judge_at = markup.find("class='puppet judge")
    props_at = markup.find("class='foreground-props'")
    assert judge_at < props_at

    layering_rules = (
        ".foreground-props {\n position: absolute;\n inset: 0;\n z-index: 13;",
        ".puppet {\n --skin: #c99257;",
        "z-index: 8;",
        ".puppet.clerk {\n left: 43%;\n top: 66%;\n z-index: 14;",
    )
    for rule in layering_rules:
        assert rule in app.CSS
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def test_trial_progress_defaults_to_pretrial_and_renders_all_stages():
    """With no events, pretrial is the current step and every stage label is rendered."""
    markup = app.render_court([])

    assert "class='trial-progress'" in markup
    assert "data-phase='pretrial' aria-current='step'" in markup

    missing_labels = [label for _key, label in app.TRIAL_PROGRESS_STAGES if label not in markup]
    assert not missing_labels
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def test_trial_progress_marks_questions_current():
    """A questions-phase event marks the questions segment as the current step."""
    questions_event = _speaker_event("Mike OSS", phase="questions")
    markup = app.render_court([questions_event], started=True)

    current_segment = "class='trial-progress-segment current' data-phase='questions' aria-current='step'"
    assert current_segment in markup
    assert "data-phase='evidence'" in markup
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def test_trial_progress_marks_deliberation_current():
    """A deliberation event marks deliberation current and questions complete."""
    markup = app.render_court([_event_with_lower_tab_data()], started=True)

    current_segment = "class='trial-progress-segment current' data-phase='deliberation' aria-current='step'"
    finished_segment = "class='trial-progress-segment complete' data-phase='questions'"
    assert current_segment in markup
    assert finished_segment in markup
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def test_trial_progress_marks_verdict_current_and_complete():
    """A verdict-phase event marks verdict as current and complete, deliberation as complete."""
    verdict_event = _speaker_event(app.JUDGE_NAME, phase="verdict")
    markup = app.render_court([verdict_event], started=True)

    current_segment = "class='trial-progress-segment current complete' data-phase='verdict' aria-current='step'"
    finished_segment = "class='trial-progress-segment complete' data-phase='deliberation'"
    assert current_segment in markup
    assert finished_segment in markup
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def test_verdict_popup_renders_only_when_final_verdict_is_revealed():
    """The verdict popup appears only when ``show_verdict_popup=True`` is passed."""
    event = _verdict_event("liable")
    judge_bubble = "class='speech-bubble active-dialogue speaker-judge'"
    popup = "class='verdict-popup'"

    announcement = app.render_court([event], started=True)
    sealed = app.render_court([event], started=True, show_verdict_popup=True)

    # Without the flag: the judge speaks, no popup.
    assert judge_bubble in announcement
    assert popup not in announcement

    # With the flag: the judge bubble stays and the popup shows the finding.
    assert judge_bubble in sealed
    assert popup in sealed
    assert "data-finding='liable'" in sealed
    assert "Verdict: Guilty" in sealed
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
def test_run_ui_reveals_verdict_popup_after_judge_speech(monkeypatch):
    """``run_ui`` shows the judge speech first and the verdict popup in its last output."""
    event = _verdict_event("not_liable")

    def single_event_stream(request):
        return iter([event])

    def zero_duration(text):
        return 0

    monkeypatch.setattr(app, "get_events", single_event_stream)
    monkeypatch.setattr(app, "_reading_duration", zero_duration)

    outputs = list(app.run_ui("Trial of Socrates", "", "", "", "swift", True))
    speech_frame = outputs[1]
    final_frame = outputs[-1]

    assert "class='speech-bubble active-dialogue speaker-judge'" in speech_frame[0]
    assert "class='verdict-popup'" not in speech_frame[0]
    assert final_frame[-1] == "Verdict sealed."
    assert "class='verdict-popup'" in final_frame[0]
    assert "Verdict: Not Guilty" in final_frame[0]
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def test_trial_progress_ignores_unknown_phase_without_extra_segment():
    """An unknown phase ("appeal") adds no segment and marks no step as current."""
    appeal_event = _speaker_event("Clerk Meridian", phase="appeal")
    markup = app.render_court([appeal_event], started=True)

    assert "class='trial-progress'" in markup

    segment_count = markup.count("class='trial-progress-segment")
    assert segment_count == len(app.TRIAL_PROGRESS_STAGES)

    for unexpected in ("aria-current='step'", "class='trial-progress-segment' data-phase='appeal'"):
        assert unexpected not in markup
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def test_trial_progress_css_is_fixed_and_translucent_theme_matched():
    """``app.CSS`` contains the progress bar's positioning, background and blur rules."""
    progress_rules = (
        ".trial-progress {\n position: fixed;\n top: 0;",
        "background: rgba(23, 13, 8, .58);",
        "backdrop-filter: blur(8px);",
        "background: #ffd675;",
        ".trial-progress-abbrev {\n display: inline;",
    )
    for rule in progress_rules:
        assert rule in app.CSS
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def test_foreground_prop_assets_have_real_transparency():
    """Each foreground prop PNG has both fully transparent and fully opaque pixels.

    The image is opened in a ``with`` block so its file handle is closed
    once the alpha histogram has been read, instead of being left to the
    garbage collector.
    """
    prop_paths = [
        Path("assets/foreground/foregroundFence.png"),
        Path("assets/foreground/JudgeTable.png"),
    ]
    for path in prop_paths:
        with Image.open(path) as image:
            # Counts of alpha values 0..255 across the image.
            histogram = image.convert("RGBA").getchannel("A").histogram()

        assert histogram[0] > 0, f"{path} has no fully transparent pixels"
        assert histogram[255] > 0, f"{path} has no fully opaque prop pixels"
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def test_latest_speaker_sets_stage_class_and_speech_bubble():
    """With Mike OSS speaking, the stage shows one ``speaker-auric`` bubble after the props."""
    claims_event = _speaker_event("Mike OSS", phase="claims")
    markup = app.render_court([claims_event], started=True)

    present = (
        "speaker-auric",
        "class='speech-bubble active-dialogue speaker-auric'",
        "data-speaker='Mike OSS'",
        "<strong>Mike OSS</strong>",
        "test speaker",
        "Mike OSS has the visible floor.",
        "puppet auric active walking",
    )
    for snippet in present:
        assert snippet in markup
    assert "puppet sable active" not in markup
    assert markup.count("class='speech-bubble") == 1

    # The bubble markup must come after the foreground props.
    props_at = markup.find("class='foreground-props'")
    bubble_at = markup.find("class='speech-bubble active-dialogue")
    assert props_at < bubble_at

    bubble_rules = (
        ".speech-bubble.active-dialogue,\n.speech-bubble.active-dialogue * {\n color: #141413 !important;\n}",
        "border: 2px solid #141413;",
        "font-size: 12px;",
    )
    for rule in bubble_rules:
        assert rule in app.CSS
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def test_speech_bubble_uses_full_turn_content_not_event_body():
    """The bubble shows the whole turn content, untruncated, and not the event body."""
    repeated = " ".join(["The record speaks plainly"] * 18)
    long_text = repeated + " with a final visible phrase."
    advocate_turn = AgentTurn(
        agent="Harvey Vector",
        role="respondent advocate",
        content=long_text,
        model="test-model",
        confidence=0.9,
    )
    event = TrialEvent(
        phase="questions",
        title="Counsel answers",
        body="Narration only, not spoken dialogue.",
        turns=[advocate_turn],
    )

    markup = app.render_court([event], started=True)
    # Isolate the markup from the bubble's opening tag up to the gallery benches.
    bubble_start = markup.index("<div class='speech-bubble")
    bubble_end = markup.index("<div class='gallery-benches")
    bubble = markup[bubble_start:bubble_end]

    assert "with a final visible phrase." in bubble
    assert "Narration only" not in bubble
    assert "..." not in bubble
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
def test_pending_speaker_renders_single_preparing_bubble():
    """A pending ``SpeakerCue`` renders exactly one bubble flagged as pending."""
    cue = app.SpeakerCue(
        name="Harvey Vector",
        role="respondent advocate",
        text="Harvey Vector is preparing a response.",
        pending=True,
    )

    markup = app.render_court([], started=True, pending_speaker=cue)

    expected_snippets = (
        "class='speech-bubble active-dialogue speaker-sable pending'",
        "data-pending='true'",
        "Harvey Vector is preparing a response.",
        "puppet sable active walking",
    )
    for snippet in expected_snippets:
        assert snippet in markup
    assert markup.count("class='speech-bubble") == 1
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
def test_reading_duration_scales_with_words_and_caps():
    """Reading time is the minimum for short text, grows with words, and caps at the maximum."""
    short_duration = app._reading_duration("short line")
    medium_duration = app._reading_duration("word " * 18)
    long_duration = app._reading_duration("word " * 200)

    assert short_duration == app.MIN_READ_SECONDS
    assert medium_duration > app.MIN_READ_SECONDS
    assert long_duration == app.MAX_READ_SECONDS
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
def test_individual_juror_can_be_active_speaker():
    """A turn spoken by a single juror activates that juror and shows one juror bubble."""
    persona = app.JUROR_PERSONAS["Karl Marx"]
    juror_turn = AgentTurn(
        agent="Karl Marx",
        role="juror",
        content="Liable. E1 carries the record.",
        model="nvidia/Nemotron-Orchestrator-8B",
        confidence=0.86,
        input="SYSTEM:\nJury JSON prompt.",
    )
    juror_vote = JurorVote(
        juror="Karl Marx",
        persona=persona,
        vote="liable",
        reason="E1 carries the record.",
        evidence_ids=["E1"],
    )
    event = TrialEvent(
        phase="deliberation",
        title="Juror Karl Marx Votes",
        body=persona,
        turns=[juror_turn],
        votes=[juror_vote],
    )

    markup = app.render_court([event], started=True)

    expected_snippets = (
        "speaker-karl-marx",
        "<a class='juror active'",
        "class='speech-bubble active-dialogue speaker-karl-marx juror-dialogue'",
        "Liable. E1 carries the record.",
    )
    for snippet in expected_snippets:
        assert snippet in markup
    assert markup.count("class='speech-bubble") == 1
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
def test_juror_speech_bubbles_anchor_above_side_benches():
    """``app.CSS`` contains the juror bubble position rules for both benches."""
    anchor_rules = (
        ".speech-bubble.active-dialogue.juror-dialogue {\n top: 42%;",
        ".speech-bubble.active-dialogue.speaker-karl-marx,\n.speech-bubble.active-dialogue.speaker-john-stuart-mill,\n.speech-bubble.active-dialogue.speaker-confucius {\n left: 1.5%;",
        ".speech-bubble.active-dialogue.speaker-cleopatra-vii,\n.speech-bubble.active-dialogue.speaker-niccolo-machiavelli,\n.speech-bubble.active-dialogue.speaker-jensen-huang {\n right: 1.5%;",
        "--bubble-tail-x: 19%;",
        "--bubble-tail-x: 81%;",
        ".speech-bubble.active-dialogue.juror-dialogue,\n .speech-bubble.active-dialogue.speaker-karl-marx",
        "top: 500px;",
    )
    for rule in anchor_rules:
        assert rule in app.CSS
|
| 383 |
+
|
| 384 |
+
|
| 385 |
+
def test_lawyer_movement_css_is_speaker_specific_not_phase_wide():
    """Advocate puppet selectors are keyed on the speaker class, not on the phase class."""
    for speaker_selector in (".speaker-auric .puppet.auric", ".speaker-sable .puppet.sable"):
        assert speaker_selector in app.CSS

    for phase_selector in (".phase-claims .puppet.auric", ".phase-opening .puppet.sable"):
        assert phase_selector not in app.CSS
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
def test_closed_book_and_key_characters_align_with_judge_table():
    """``app.CSS`` has the expected book, fence, table and puppet rules and none of the forbidden ones."""
    # Each entry is (CSS snippet, whether it must be present), checked in order.
    expectations = [
        (".episode-book {\n position: absolute;\n left: 50%;\n top: 122px;\n z-index: 14;", True),
        ("width: min(980px, calc(100% - 32px));", True),
        (".episode-book.closed {\n top: 50%;\n width: min(163px, 20vw);", True),
        (".foreground-fence {\n bottom: -6.5%;\n width: 47%;", True),
        (".judge-table-foreground {\n left: 50%;\n top: 20%;\n z-index: 1;\n width: 39.1%;", True),
        (".puppet.judge {\n left: 50%;\n top: calc(40% + 156px);", True),
        (".puppet.auric {\n left: 24%;\n top: 87%;", True),
        (".speaker-auric .puppet.auric {\n left: 43%;\n top: 87%;", True),
        (".puppet.sable {\n left: 75%;\n top: 87%;", True),
        (".speaker-sable .puppet.sable {\n left: 75%;\n top: 87%;", True),
        (".puppet.clerk {\n left: 43%;\n top: 66%;", True),
        (".puppet.auditor", False),
        (".episode-book.closed {\n top: 640px;\n width: 140px;", True),
        (".episode-book {\n top: 218px;\n width: min(680px, calc(100% - 20px));", True),
        (".foreground-fence {\n bottom: -66px;\n width: 64%;", True),
        (".judge-table-foreground {\n top: 213px;\n width: 646px;", True),
        (".puppet.auric {\n left: 20%;\n top: 970px;", True),
        (".puppet.sable {\n left: 80%;\n top: 970px;", True),
        (".speaker-sable .puppet.sable {\n left: 80%;\n top: 970px;", True),
        (".puppet.judge {\n top: 576px;", False),
        (".puppet.sable {\n left: 80%;\n top: 640px;", False),
        (".speaker-sable .puppet.sable {\n left: 80%;\n top: 640px;", False),
        (".puppet.clerk {\n left: 35%;\n top: 880px;", True),
        (".speech-bubble.active-dialogue.speaker-auditor", False),
    ]
    for snippet, should_exist in expectations:
        assert (snippet in app.CSS) is should_exist
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
def test_open_docket_book_renders_text_above_book_art():
    """With no events, the court renders the open book and its two headings."""
    markup = app.render_court([])

    expected_snippets = (
        "class='episode-book'",
        "class='book-open-content'",
        "Trial details",
        "Evidence",
    )
    for snippet in expected_snippets:
        assert snippet in markup
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
def test_greg_case_preview_uses_cached_context_and_evidence_columns():
    """The Greg Heffley preview shows the case title and an evidence heading per side."""
    preview = app.render_case_preview("Greg Heffley vs Mom")

    expected_snippets = (
        "Greg Heffley v. Mom",
        "diary",
        "Evidence for Greg Heffley",
        "Evidence for Susan Heffley",
    )
    for snippet in expected_snippets:
        assert snippet in preview
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
def test_custom_case_preview_renders_fillable_book_fields():
    """The Custom preview has a context field and three evidence fields per side."""
    preview = app.render_case_preview("Custom")

    assert "episode-book custom-book" in preview
    assert "book-context-field" in preview
    for field_class in ("book-claimant-field", "book-respondent-field"):
        assert preview.count(field_class) == 3
|
| 444 |
+
|
| 445 |
+
|
| 446 |
+
def test_custom_payload_builds_trial_request_packet(monkeypatch):
    """A custom JSON payload becomes a ``custom`` request with blank evidence rows dropped."""
    captured = {}

    def fake_events(request):
        # Record the request that run_ui builds so it can be inspected below.
        captured["request"] = request
        return iter([_event_with_lower_tab_data()])

    def zero_duration(text):
        return 0

    monkeypatch.setattr(app, "get_events", fake_events)
    monkeypatch.setattr(app, "_reading_duration", zero_duration)

    form_data = {
        "context": "A missing bicycle is traced to a disputed garage visit.",
        "claimant_evidence": ["Garage text", "", "Scuffed tire mark"],
        "respondent_evidence": ["Neighbor saw bike later", "", ""],
    }
    payload = json.dumps(form_data)

    outputs = list(app.run_ui("Custom", "", "", payload, "swift", True))

    assert outputs[-1][-1] == "Verdict sealed."
    request = captured["request"]
    assert request.case_id == "custom"
    assert request.custom_case is not None
    assert request.custom_case.context.startswith("A missing bicycle")
    sides = [item.supports for item in request.custom_case.evidence]
    assert sides == ["claimant", "claimant", "respondent"]
|
| 471 |
+
|
| 472 |
+
|
| 473 |
+
def test_custom_payload_requires_context_and_both_evidence_sides():
    """An empty context makes ``run_ui`` end with the missing-paragraph status message."""
    incomplete_form = {
        "context": "",
        "claimant_evidence": ["Only one side"],
        "respondent_evidence": [],
    }
    payload = json.dumps(incomplete_form)

    outputs = list(app.run_ui("Custom", "", "", payload, "swift", True))

    final_status = outputs[-1][-1]
    assert final_status == "Custom requires a trial details paragraph."
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
def test_run_ui_yields_five_outputs_without_download_status(monkeypatch):
    """Every ``run_ui`` frame has five items, and the statuses never mention downloads."""
    event = _event_with_lower_tab_data()

    def single_event_stream(request):
        return iter([event])

    def zero_duration(text):
        return 0

    monkeypatch.setattr(app, "get_events", single_event_stream)
    monkeypatch.setattr(app, "_reading_duration", zero_duration)

    outputs = list(app.run_ui("Trial of Socrates", "", "", "", "swift", True))

    assert outputs
    frame_sizes = {len(frame) for frame in outputs}
    assert frame_sizes == {5}

    # The last item of each frame is the status line.
    assert outputs[0][-1] == "Clerk Meridian is preparing their response."
    assert outputs[1][-1] == "Step 1: Nemotron Jury - Jury weighs the record"
    final_status = outputs[-1][-1]
    assert final_status == "Verdict sealed."
    assert "download" not in final_status.lower()
|
| 494 |
+
|
| 495 |
+
|
| 496 |
+
def test_run_ui_stops_with_model_unavailable_error(monkeypatch):
    """An error raised by the event stream surfaces as a "Trial stopped" status."""

    def broken_events(request):
        raise RuntimeError("Marcus Aurelius unavailable: offline")
        # Unreachable on purpose: the yield makes this a generator function,
        # so the error is raised on first iteration rather than at call time.
        yield

    monkeypatch.setattr(app, "get_events", broken_events)

    outputs = list(app.run_ui("Trial of Socrates", "", "", "", "swift", True))
    final_frame = outputs[-1]

    assert final_frame[-1] == "Model response required. Trial stopped: Marcus Aurelius unavailable: offline"
    assert "Claimant score" not in final_frame[0]
|
| 507 |
+
|
| 508 |
+
|
| 509 |
+
def test_remote_events_uses_default_modal_endpoint_without_local_token(monkeypatch):
    """With no env overrides, ``get_events`` POSTs to ``DEFAULT_MODAL_TRIAL_URL``."""
    captured = {}

    class FakeResponse:
        # Stand-in for the streaming response: a context manager that yields
        # one JSON-encoded event line.

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, traceback):
            return False

        def raise_for_status(self):
            return None

        def iter_lines(self):
            intake_event = _speaker_event("Clerk Meridian", phase="intake")
            yield json.dumps(intake_event.model_dump())

    def fake_stream(method, endpoint, json, timeout):
        # Parameter names are kept as-is so keyword calls still bind.
        captured.update(method=method, endpoint=endpoint, payload=json, timeout=timeout)
        return FakeResponse()

    for variable in ("MODAL_TRIAL_URL", "HF_TOKEN"):
        monkeypatch.delenv(variable, raising=False)
    monkeypatch.setattr(app.httpx, "stream", fake_stream)

    event_stream = app.get_events(app.TrialRequest(case_id="socrates"), delay=0.0)
    first_event = next(event_stream)

    assert captured["method"] == "POST"
    assert captured["endpoint"] == app.DEFAULT_MODAL_TRIAL_URL
    assert captured["timeout"] == 900.0
    assert first_event.turns[0].agent == "Clerk Meridian"
|
| 543 |
+
|
| 544 |
+
|
| 545 |
+
def test_court_renders_sound_toggle():
    """The court markup includes a sound toggle with ``aria-pressed='false'``."""
    markup = app.render_court([])

    for snippet in ("sound-toggle", "aria-label='Toggle sound'", "aria-pressed='false'"):
        assert snippet in markup
|
| 551 |
+
|
| 552 |
+
|
| 553 |
+
def test_audio_controller_has_score_breathing_and_mute_toggle():
    """``app.APP_JS`` defines the score-breath constants, the mute toggle and the quiet fade."""
    script_snippets = (
        "SCORE_BREATH_INTERVAL_MS = 20000",
        "SCORE_BREATH_DURATION_MS = 5000",
        "toggleMuted()",
        "this.fadeScore(SCORE_QUIET_VOLUME, halfDuration",
    )
    for snippet in script_snippets:
        assert snippet in app.APP_JS
|
| 558 |
+
|
| 559 |
+
|
| 560 |
+
def test_courtroom_background_has_no_overlay_or_character_shadow():
    """``app.CSS`` has the flat background rules and lacks the overlay and drop-shadow rules."""
    # Each entry is (CSS snippet, whether it must be present), checked in order.
    expectations = [
        ("background: #141413 !important;", True),
        ("background-color: #141413 !important;", True),
        ("cover fixed no-repeat", False),
        (".court-episode-stage::before {\n content: \"\";\n display: none;", True),
        (".court-episode-stage::after {\n content: \"\";\n display: none;", True),
        ("url('/gradio_api/file=assets/background/CourtRoom.png') center center / 100% 100% no-repeat", True),
        ("filter: drop-shadow(0 12px 14px", False),
        ("filter: drop-shadow(0 8px 10px", False),
    ]
    for snippet, should_exist in expectations:
        assert (snippet in app.CSS) is should_exist
|
| 569 |
+
|
| 570 |
+
|
| 571 |
+
def test_synthetic_stage_props_do_not_tint_background():
    """``app.CSS`` hides the synthetic stage props and has a transparent, shadowless rule."""
    stage_rules = (
        ".bench-front {\n display: none;",
        ".trial-floor-mark {\n display: none;",
        ".gallery-benches {\n display: none;",
        ".prop-label {\n display: none;",
        ".counsel-table",
        "background: transparent;\n box-shadow: none;",
        ".witness-area",
    )
    for rule in stage_rules:
        assert rule in app.CSS
|