Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitattributes +6 -0
- PIPELINE.md +301 -0
- README.md +266 -1400
- README.pdf +3 -0
- assets/episode.png +0 -0
- assets/formulas-1.png +3 -0
- assets/formulas-2.png +0 -0
- assets/gs.png +0 -0
- assets/hybrid.png +3 -0
- assets/logo.png +0 -0
- assets/reflexion.png +3 -0
- assets/sys arch.png +3 -0
- docs.md +312 -0
- eval-models/deepseek_test_judge_eval.py +478 -0
- eval-models/gemma_test_judge_eval.py +478 -0
- eval-models/llama_test_judge_eval.py +478 -0
- eval-models/mistral_test_judge_eval.py +478 -0
- eval-models/nvidia_test_judge_eval.py +478 -0
- eval-models/qwen_test_judge_eval.py +478 -0
- images/big.png +3 -0
- images/logo.png +0 -0
- images/plot.png +0 -0
- images/table1.png +0 -0
- images/table2.png +0 -0
- images/table3.png +0 -0
- judge_log.txt +513 -0
- memory/reflections_easy.jsonl +1 -1
- memory/reflections_hard.jsonl +3 -3
- memory/reflections_medium.jsonl +1 -1
- memory/wins_easy.jsonl +40 -0
- memory/wins_hard.jsonl +25 -0
- memory/wins_medium.jsonl +34 -10
- model-benchmark-logs/deepseek_judge_log.txt +749 -0
- model-benchmark-logs/gemma_judge_log.txt +0 -0
- model-benchmark-logs/meta_judge_log.txt +826 -0
- model-benchmark-logs/mistral_judge_log.txt +410 -0
- model-benchmark-logs/nvidia_judge_log.txt +545 -0
- runs/metrics.jsonl +0 -0
- server/app.py +622 -85
- server/environment.py +43 -1
- server/generator.py +12 -2
.gitattributes
CHANGED
|
@@ -49,3 +49,9 @@ wheels/urllib3-2.6.3-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
|
| 49 |
wheels/uvloop-0.22.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl filter=lfs diff=lfs merge=lfs -text
|
| 50 |
wheels/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
|
| 51 |
wheels/websockets-16.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
wheels/uvloop-0.22.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl filter=lfs diff=lfs merge=lfs -text
|
| 50 |
wheels/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
|
| 51 |
wheels/websockets-16.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
README.pdf filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
assets/formulas-1.png filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
assets/hybrid.png filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
assets/reflexion.png filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
assets/sys[[:space:]]arch.png filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
images/big.png filter=lfs diff=lfs merge=lfs -text
|
PIPELINE.md
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GraphStrike — End-to-End Pipeline
|
| 2 |
+
|
| 3 |
+
A complete walkthrough of how GraphStrike works, from data generation to scoring.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Pipeline Overview
|
| 8 |
+
|
| 9 |
+
```
|
| 10 |
+
┌─────────────────────────────────────────────────────────────────────┐
|
| 11 |
+
│ BUILD TIME (Docker) │
|
| 12 |
+
│ │
|
| 13 |
+
│ generator.py ──► 150 episode JSONs (50 per difficulty tier) │
|
| 14 |
+
│ │ each with 10 gang + N real + decoys │
|
| 15 |
+
│ │ deterministic by seed │
|
| 16 |
+
│ ▼ │
|
| 17 |
+
│ episodes/easy_000.json ... hard_049.json │
|
| 18 |
+
└─────────────────────────────────────────────────────────────────────┘
|
| 19 |
+
│
|
| 20 |
+
▼
|
| 21 |
+
┌─────────────────────────────────────────────────────────────────────┐
|
| 22 |
+
│ RUNTIME (Environment Server) │
|
| 23 |
+
│ │
|
| 24 |
+
│ FastAPI + Gradio @ port 7860 │
|
| 25 |
+
│ │
|
| 26 |
+
│ POST /reset ──► Load episode, init graph, reveal visible IDs │
|
| 27 |
+
│ POST /step ──► Execute action, update state, return observation │
|
| 28 |
+
│ GET /grader ──► Compute final score after SUBMIT │
|
| 29 |
+
│ POST /baseline ──► Run rule-based agent on all 3 tasks │
|
| 30 |
+
└─────────────────────────────────────────────────────────────────────┘
|
| 31 |
+
│
|
| 32 |
+
▼
|
| 33 |
+
┌─────────────────────────────────────────────────────────────────────┐
|
| 34 |
+
│ INFERENCE (Agent / LLM) │
|
| 35 |
+
│ │
|
| 36 |
+
│ inference.py connects to server via HTTP │
|
| 37 |
+
│ Uses OpenAI-compatible client to call LLM │
|
| 38 |
+
│ Emits structured logs: [START] [STEP] [END] │
|
| 39 |
+
└─────────────────────────────────────────────────────────────────────┘
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
---
|
| 43 |
+
|
| 44 |
+
## Stage 1: Synthetic Data Generation
|
| 45 |
+
|
| 46 |
+
**File:** `server/generator.py`
|
| 47 |
+
|
| 48 |
+
```
|
| 49 |
+
Input: seed (int) + difficulty tier (easy/medium/hard)
|
| 50 |
+
Output: episode JSON with full social graph
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
### What gets generated per episode:
|
| 54 |
+
|
| 55 |
+
| Component | Easy | Medium | Hard |
|
| 56 |
+
|-----------|------|--------|------|
|
| 57 |
+
| Total accounts | 50 | 200 | 1000 |
|
| 58 |
+
| Gang members (target) | 10 | 10 | 10 |
|
| 59 |
+
| Decoy accounts | 0 | 20 | 50 |
|
| 60 |
+
| Celebrity accounts | 2 | 2 | 2 |
|
| 61 |
+
| Zero-edge accounts | 2 | 2 | 2 |
|
| 62 |
+
| Max steps | 30 | 50 | 80 |
|
| 63 |
+
| Evasion events | None | At step 20 | Recurring |
|
| 64 |
+
|
| 65 |
+
### Gang member signals (coordinated):
|
| 66 |
+
- `ip_cluster_id = "ip_gang_{seed}"` — all 10 share one IP
|
| 67 |
+
- `shared_ip_count = 9` — each sees 9 others on same IP
|
| 68 |
+
- `photo_reuse_score ∈ [0.70, 0.95]` — stolen celebrity photos
|
| 69 |
+
- `bio_template_score ∈ [0.60, 0.90]` — copy-paste bios
|
| 70 |
+
- `comment_repeat_score ∈ [0.60, 0.90]` — spam comments
|
| 71 |
+
- `avg_post_hour` clustered within 2h window — coordinated posting
|
| 72 |
+
|
| 73 |
+
### Real account signals (independent):
|
| 74 |
+
- Unique IP per account
|
| 75 |
+
- `photo_reuse_score ∈ [0.0, 0.05]`
|
| 76 |
+
- `bio_template_score ∈ [0.0, 0.08]`
|
| 77 |
+
- `comment_repeat_score ∈ [0.0, 0.08]`
|
| 78 |
+
|
| 79 |
+
---
|
| 80 |
+
|
| 81 |
+
## Stage 2: Environment State Machine
|
| 82 |
+
|
| 83 |
+
**File:** `server/environment.py`
|
| 84 |
+
|
| 85 |
+
```
|
| 86 |
+
POST /reset
|
| 87 |
+
│
|
| 88 |
+
▼
|
| 89 |
+
┌─────────────────┐
|
| 90 |
+
│ EPISODE INIT │
|
| 91 |
+
│ Load JSON graph │
|
| 92 |
+
│ Reveal visible │
|
| 93 |
+
│ IDs to agent │
|
| 94 |
+
└────────┬────────┘
|
| 95 |
+
│
|
| 96 |
+
▼
|
| 97 |
+
┌─────────────────┐ POST /step
|
| 98 |
+
│ AGENT LOOP │◄────────────────┐
|
| 99 |
+
│ │ │
|
| 100 |
+
│ Observation: │ ┌──────────┐ │
|
| 101 |
+
│ - visible_ids │ │ ACTION │ │
|
| 102 |
+
│ - profiles │ │ │ │
|
| 103 |
+
│ - flagged_ids │──►│ INSPECT │──┘
|
| 104 |
+
│ - suspect_ids │ │ INVEST_NET│
|
| 105 |
+
│ - steps_remain │ │ FLAG │
|
| 106 |
+
│ - message/hints │ │ UNFLAG │
|
| 107 |
+
└────────┬────────┘ │ SUBMIT │
|
| 108 |
+
│ └──────────┘
|
| 109 |
+
│ (SUBMIT or steps=0)
|
| 110 |
+
▼
|
| 111 |
+
┌─────────────────┐
|
| 112 |
+
│ EPISODE END │
|
| 113 |
+
│ Compute grader │
|
| 114 |
+
│ score via │
|
| 115 |
+
│ scoring.py │
|
| 116 |
+
└─────────────────┘
|
| 117 |
+
│
|
| 118 |
+
▼
|
| 119 |
+
GET /grader
|
| 120 |
+
→ { score: 0.0-1.0 }
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
### Action Details
|
| 124 |
+
|
| 125 |
+
| Action | Step Cost | Effect |
|
| 126 |
+
|--------|-----------|--------|
|
| 127 |
+
| **INSPECT** | 1 | Reveals full profile + risk scores for one account |
|
| 128 |
+
| **INVESTIGATE_NETWORK** | 1 | 2-hop bidirectional expansion, reveals neighbor IDs |
|
| 129 |
+
| **FLAG** | 0 (free) | Marks account as fake, triggers dual cascade |
|
| 130 |
+
| **UNFLAG** | 0 (free) | Removes flag from account |
|
| 131 |
+
| **SUBMIT** | 0 | Ends episode, triggers grading |
|
| 132 |
+
|
| 133 |
+
### Dual Cascade on FLAG
|
| 134 |
+
|
| 135 |
+
```
|
| 136 |
+
FLAG acc_0012
|
| 137 |
+
│
|
| 138 |
+
├──► Follow-graph cascade
|
| 139 |
+
│ For each neighbor of acc_0012:
|
| 140 |
+
│ if visible AND status=NORMAL → set SUSPECT
|
| 141 |
+
│
|
| 142 |
+
└──► IP cluster cascade
|
| 143 |
+
Find all accounts with same ip_cluster_id:
|
| 144 |
+
if visible AND status=NORMAL → set SUSPECT
|
| 145 |
+
```
|
| 146 |
+
|
| 147 |
+
---
|
| 148 |
+
|
| 149 |
+
## Stage 3: Risk Scoring Engine
|
| 150 |
+
|
| 151 |
+
**File:** `server/scoring.py`
|
| 152 |
+
|
| 153 |
+
All scores are computed at INSPECT time by stateless pure functions.
|
| 154 |
+
|
| 155 |
+
```
|
| 156 |
+
┌─────────────┐
|
| 157 |
+
│ Raw Features │
|
| 158 |
+
│ from profile │
|
| 159 |
+
└──────┬──────┘
|
| 160 |
+
│
|
| 161 |
+
┌────────────┼────────────┐
|
| 162 |
+
▼ ▼ ▼
|
| 163 |
+
┌──────────┐ ┌──────────┐ ┌──────────┐
|
| 164 |
+
│ Node Risk│ │Behavior │ │Graph Risk│
|
| 165 |
+
│ 0.6×photo│ │Risk │ │0.45×flag │
|
| 166 |
+
│+0.4×bio │ │0.55×age │ │ _nbr_rat │
|
| 167 |
+
└────┬─────┘ │+0.45×hr │ │+0.35×mut │
|
| 168 |
+
│ │ cluster │ │+0.20×nbr │
|
| 169 |
+
│ └────┬─────┘ │ photo │
|
| 170 |
+
│ │ └────┬─────┘
|
| 171 |
+
│ │ │
|
| 172 |
+
▼ ▼ ▼
|
| 173 |
+
┌──────────────────────────────────┐ ┌──────────────┐
|
| 174 |
+
│ fake_risk_score │◄───│Hub Legitimacy│
|
| 175 |
+
│ 0.30×node + 0.25×beh + 0.45×graph│ │(discount for │
|
| 176 |
+
│ − 0.25×hub │ │ celebrities) │
|
| 177 |
+
│ clamp [0.0, 1.0] │ └──────────────┘
|
| 178 |
+
└──────────────────────────────────┘
|
| 179 |
+
```
|
| 180 |
+
|
| 181 |
+
### Grader Formula (after SUBMIT)
|
| 182 |
+
|
| 183 |
+
```
|
| 184 |
+
If recall ≥ 0.8 AND precision ≥ 0.7:
|
| 185 |
+
score = 0.55 + 0.20×recall + 0.15×precision + 0.10×efficiency
|
| 186 |
+
|
| 187 |
+
Else (partial credit):
|
| 188 |
+
score = 0.30×recall + 0.10×precision
|
| 189 |
+
|
| 190 |
+
Where:
|
| 191 |
+
recall = TP / 10
|
| 192 |
+
precision = TP / (TP + FP)
|
| 193 |
+
efficiency = (max_steps − steps_used) / max_steps
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
---
|
| 197 |
+
|
| 198 |
+
## Stage 4: Inference Pipeline
|
| 199 |
+
|
| 200 |
+
**File:** `inference.py`
|
| 201 |
+
|
| 202 |
+
### Rule-Based Agent (baseline)
|
| 203 |
+
|
| 204 |
+
```
|
| 205 |
+
while not done:
|
| 206 |
+
1. INSPECT suspects first (auto-elevated by cascade)
|
| 207 |
+
2. FLAG any inspected account with fake_risk ≥ threshold
|
| 208 |
+
(easy: 0.60, medium: 0.50, hard: 0.45)
|
| 209 |
+
3. INSPECT highest-risk uninspected account
|
| 210 |
+
4. SUBMIT when 10 flagged or steps running low
|
| 211 |
+
```
|
| 212 |
+
|
| 213 |
+
### LLM Agent (hybrid policy)
|
| 214 |
+
|
| 215 |
+
```
|
| 216 |
+
┌──────────────┐ ┌───────────────┐ ┌──────────────┐
|
| 217 |
+
│ Observation │────►│ Rule Engine │────►│ α blend │
|
| 218 |
+
│ (from server) │ │ (get_rule_ │ │ │
|
| 219 |
+
│ │ │ action + │ │ if conf>thr: │
|
| 220 |
+
│ │────►│ confidence) │ │ use rule │
|
| 221 |
+
│ │ └───────────────┘ │ else: │
|
| 222 |
+
│ │ │ query LLM │
|
| 223 |
+
│ │ ┌───────────────┐ │ blend │
|
| 224 |
+
│ │────►│ LLM (OpenAI │────►│ actions │
|
| 225 |
+
│ │ │ client) │ └──────┬───────┘
|
| 226 |
+
│ │ └───────────────┘ │
|
| 227 |
+
└──────────────┘ ▼
|
| 228 |
+
┌──────────────┐
|
| 229 |
+
│ Final Action │
|
| 230 |
+
└──────────────┘
|
| 231 |
+
|
| 232 |
+
α caps: easy ≤ 0.50 | medium ≤ 0.70 | hard ≤ 0.85
|
| 233 |
+
(rule engine retains veto power on high-confidence decisions)
|
| 234 |
+
```
|
| 235 |
+
|
| 236 |
+
### Structured Log Format
|
| 237 |
+
|
| 238 |
+
```
|
| 239 |
+
[START] task=easy env=graphstrike model=Qwen/Qwen2.5-72B-Instruct
|
| 240 |
+
[STEP] step=1 action=inspect:acc_0012 reward=0.00 done=false error=null
|
| 241 |
+
[STEP] step=2 action=flag:acc_0012 reward=1.20 done=false error=null
|
| 242 |
+
...
|
| 243 |
+
[STEP] step=15 action=submit reward=12.40 done=true error=null
|
| 244 |
+
[END] success=true steps=15 score=0.910 rewards=0.00,1.20,...,12.40
|
| 245 |
+
```
|
| 246 |
+
|
| 247 |
+
---
|
| 248 |
+
|
| 249 |
+
## Stage 5: Deployment
|
| 250 |
+
|
| 251 |
+
```
|
| 252 |
+
┌─────────────────────────────────────────┐
|
| 253 |
+
│ Hugging Face Spaces │
|
| 254 |
+
│ │
|
| 255 |
+
│ Docker container @ port 7860 │
|
| 256 |
+
│ ┌────────────────────────────────────┐ │
|
| 257 |
+
│ │ FastAPI (API endpoints) │ │
|
| 258 |
+
│ │ /health /tasks /reset /step │ │
|
| 259 |
+
│ │ /state /grader /baseline │ │
|
| 260 |
+
│ │ /metadata /schema /mcp │ │
|
| 261 |
+
│ ├────────────────────────────────────┤ │
|
| 262 |
+
│ │ Gradio UI (mounted at /) │ │
|
| 263 |
+
│ │ Manual testing interface │ │
|
| 264 |
+
│ │ Reset / Step / Grader buttons │ │
|
| 265 |
+
│ └────────────────────────────────────┘ │
|
| 266 |
+
│ │
|
| 267 |
+
│ /web → redirect to / (HF probe compat) │
|
| 268 |
+
└─────────────────────────────────────────┘
|
| 269 |
+
│ ▲
|
| 270 |
+
│ HTTP │ HTTP
|
| 271 |
+
▼ │
|
| 272 |
+
┌─────────────────────┐ ┌────────────┐
|
| 273 |
+
│ inference.py │ │ openenv │
|
| 274 |
+
│ (runs externally) │ │ validate │
|
| 275 |
+
│ LLM ←→ Server │ │ (judging) │
|
| 276 |
+
└─────────────────────┘ └────────────┘
|
| 277 |
+
```
|
| 278 |
+
|
| 279 |
+
---
|
| 280 |
+
|
| 281 |
+
## Quick Start
|
| 282 |
+
|
| 283 |
+
```bash
|
| 284 |
+
# 1. Build & run the environment server
|
| 285 |
+
docker build -f Dockerfile -t graphstrike .
|
| 286 |
+
docker run -p 7860:7860 graphstrike
|
| 287 |
+
|
| 288 |
+
# 2. Verify endpoints
|
| 289 |
+
curl http://localhost:7860/health
|
| 290 |
+
curl http://localhost:7860/tasks
|
| 291 |
+
curl -X POST http://localhost:7860/baseline
|
| 292 |
+
|
| 293 |
+
# 3. Run LLM inference (separate terminal)
|
| 294 |
+
export API_KEY="your-hf-token"
|
| 295 |
+
export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
|
| 296 |
+
python3 inference.py --url http://localhost:7860 --all-tasks
|
| 297 |
+
|
| 298 |
+
# 4. Validate submission
|
| 299 |
+
openenv validate
|
| 300 |
+
openenv validate --url http://localhost:7860
|
| 301 |
+
```
|
README.md
CHANGED
|
@@ -15,783 +15,255 @@ tags:
|
|
| 15 |
- llm-agent
|
| 16 |
base_path: /web
|
| 17 |
---
|
|
|
|
| 18 |
|
| 19 |
-
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
>
|
| 22 |
-
> Live deployment: [huggingface.co/spaces/Pandago/graphstrike](https://huggingface.co/spaces/Pandago/graphstrike)
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
-
|
| 32 |
|
| 33 |
-
|
| 34 |
-
2. [Repository Layout](#2-repository-layout)
|
| 35 |
-
3. [The Problem: How Fake Detection Actually Works](#3-the-problem-how-fake-detection-actually-works)
|
| 36 |
-
4. [Synthetic Data Generation](#4-synthetic-data-generation)
|
| 37 |
-
5. [Data Model — Every Field Explained](#5-data-model--every-field-explained)
|
| 38 |
-
6. [The RL Environment](#6-the-rl-environment)
|
| 39 |
-
7. [Risk Scoring Mathematics](#7-risk-scoring-mathematics)
|
| 40 |
-
8. [Account Status State Machine](#8-account-status-state-machine)
|
| 41 |
-
9. [The LLM Policy (Qwen3 via Bedrock)](#9-the-llm-policy-qwen3-via-bedrock)
|
| 42 |
-
10. [Reflexion — How the Agent Learns](#10-reflexion--how-the-agent-learns)
|
| 43 |
-
11. [Hybrid Policy — The Novel Contribution](#11-hybrid-policy--the-novel-contribution)
|
| 44 |
-
12. [Training Loop End-to-End](#12-training-loop-end-to-end)
|
| 45 |
-
13. [API Reference](#13-api-reference)
|
| 46 |
-
14. [Docker Deployment](#14-docker-deployment)
|
| 47 |
-
15. [Submission Requirements](#15-submission-requirements)
|
| 48 |
-
16. [Verification & Validation](#16-verification--validation)
|
| 49 |
|
| 50 |
-
|
| 51 |
|
| 52 |
-
##
|
| 53 |
|
| 54 |
-
|
| 55 |
-
reinforcement learning environments with a standard microservice interface
|
| 56 |
-
(`/reset`, `/step`, `/state`) so that any agent implementation can plug in.
|
| 57 |
|
| 58 |
-
|
| 59 |
-
single coordinated ring of 10. The ring behaves in a coordinated way — same posting hour,
|
| 60 |
-
same IP subnet, stolen celebrity photos, copy-paste bios. The agent must find
|
| 61 |
-
all 10 by navigating a limited step budget, inspecting accounts, and flagging suspects.
|
| 62 |
|
| 63 |
-
**
|
| 64 |
|
| 65 |
-
|
| 66 |
-
- Fake accounts are mixed with innocent high-signal "decoy" accounts.
|
| 67 |
-
- In hard mode, the gang actively evades — dropping intra-gang follows,
|
| 68 |
-
renaming profiles — while the agent is mid-investigation.
|
| 69 |
-
- The agent cannot see the full network upfront: it must explore via INSPECT and
|
| 70 |
-
INVESTIGATE_NETWORK actions, spending steps to reveal information.
|
| 71 |
-
|
| 72 |
-
**What makes the learning novel:**
|
| 73 |
|
| 74 |
-
-
|
| 75 |
-
- The agent learns via **Reflexion**: post-episode lessons are written back into
|
| 76 |
-
memory and injected into every future prompt.
|
| 77 |
-
- A **dynamic hybrid policy** (α-weighted) blends the LLM with a deterministic
|
| 78 |
-
rule engine, with the blend weight α updating based on recent win rate.
|
| 79 |
-
Rules dominate early; the LLM takes over as it proves itself.
|
| 80 |
|
| 81 |
---
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
├── client.py # HTTP client for talking to the running server
|
| 91 |
-
├── inference.py # Submission: rule-based baseline runner + HTTP client mode
|
| 92 |
-
├── validate.py # Submission: pre-submission validator (24 checks)
|
| 93 |
-
├── train.py # Main training loop (curriculum + hybrid policy)
|
| 94 |
-
├── run.sh # Docker entrypoint: episodes → server → training
|
| 95 |
-
├── requirements.txt # Python dependencies
|
| 96 |
-
│
|
| 97 |
-
├── server/
|
| 98 |
-
│ ├── app.py # FastAPI server: /reset /step /state /health /tasks /grader /baseline
|
| 99 |
-
│ ├── environment.py # Core RL environment — FakeGangEnvironment class
|
| 100 |
-
│ ├── generator.py # Synthetic episode generator (50 per task × 3 tasks = 150 files)
|
| 101 |
-
│ ├── scoring.py # Pure-math risk formula engine (stateless functions)
|
| 102 |
-
│ ├── Dockerfile # Offline pip install via pre-downloaded wheels
|
| 103 |
-
│ └── .dockerignore # Excludes episodes/, memory/, runs/ from build context
|
| 104 |
-
│
|
| 105 |
-
├── agent/
|
| 106 |
-
│ ├── policy.py # LLM policy: formats obs → calls Qwen → parses <action> tag
|
| 107 |
-
│ ├── hybrid_policy.py # Hybrid policy: blends rules + LLM via dynamic α
|
| 108 |
-
│ ├── memory.py # Disk-backed memory: reflections, trajectories, win history, α
|
| 109 |
-
│ └── reflection.py # Post-episode reflection generator (also calls Qwen)
|
| 110 |
-
│
|
| 111 |
-
├── episodes/ # 150 pre-generated JSON episode files (excluded from Docker build)
|
| 112 |
-
├── memory/ # Docker volume: reflections, trajectories, α values (persists)
|
| 113 |
-
└── runs/ # Docker volume: per-episode metrics JSONL (persists)
|
| 114 |
-
```
|
| 115 |
-
|
| 116 |
---
|
| 117 |
|
| 118 |
-
##
|
| 119 |
-
|
| 120 |
-
A real-world fake account detector does **not** read post content. Content is
|
| 121 |
-
expensive to process, multilingual, and easily spoofed. Instead, detection relies
|
| 122 |
-
on three categories of signals that are computed from metadata:
|
| 123 |
-
|
| 124 |
-
### 3.1 Node Signals (per-account features)
|
| 125 |
-
|
| 126 |
-
These are pre-computed by a content pipeline before the agent ever sees the account:
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
| `bio_template_score` | Cosine similarity of bio text to known fake-account bio templates | 0.20–0.90 | 0.00–0.12 |
|
| 132 |
-
| `comment_repeat_score` | Fraction of comments that are copy-pasted across accounts (spam pattern) | 0.60–0.90 | 0.00–0.08 |
|
| 133 |
|
| 134 |
-
|
| 135 |
-
runs offline; the agent operates on numeric features only.
|
| 136 |
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
-
|
| 140 |
-
| -------------------- | ------------------------------------------------ | ---------------------------------------------------------------- |
|
| 141 |
-
| `avg_post_hour` | Mean hour of day for all posts (0–23) | Gang members cluster within ±0.5 hours (coordinated scheduling) |
|
| 142 |
-
| `account_age_days` | Days since account creation | Gang created within same week (base_age ± 7 days) |
|
| 143 |
-
| `shared_ip_count` | How many other accounts share the same IP subnet | 9 (all 10 gang members share one IP) |
|
| 144 |
-
| `ip_cluster_id` | Identifier for the IP subnet | All fakes: `ip_gang_{seed}` |
|
| 145 |
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
-
|
| 149 |
-
has already flagged** — they change dynamically as the investigation progresses:
|
| 150 |
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
| `avg_neighbor_photo_reuse` | Mean photo_reuse_score of already-inspected follows | Gang members cluster; inspecting one reveals others |
|
| 156 |
-
| `post_hour_cluster_score` | How closely posting hour matches the flagged accounts' mean | High → same operation schedule |
|
| 157 |
-
| `suspicious_mutual_ratio` | Fraction of suspicious follows that also follow this account back | Used in hub legitimacy computation |
|
| 158 |
-
|
| 159 |
-
**Why graph signals are the most powerful:** A gang member always has high
|
| 160 |
-
`mutual_follow_rate` and grows `flagged_neighbor_count` as the investigation
|
| 161 |
-
proceeds. A celebrity who is *followed by* fakes but does not *follow them back*
|
| 162 |
-
will have low `mutual_follow_rate` — this is how the hub legitimacy score
|
| 163 |
-
protects against false positives.
|
| 164 |
|
| 165 |
---
|
|
|
|
| 166 |
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
|
| 174 |
-
|
| 175 |
|
| 176 |
-
|
| 177 |
-
| ------ | ------------ | ---- | ------ | ---- | --------- |
|
| 178 |
-
| easy | 50 | 10 | 0 | 40 | 30 |
|
| 179 |
-
| medium | 200 | 10 | 20 | 170 | 50 |
|
| 180 |
-
| hard | 1000 | 10 | 50 | 940 | 80 |
|
| 181 |
|
| 182 |
-
**
|
| 183 |
|
| 184 |
-
|
| 185 |
-
- Tightly clustered `avg_post_hour`: sampled from `gauss(14.0, std)` where std=0.5/1.5/2.5 by task
|
| 186 |
-
- High `photo_reuse_score` and `bio_template_score` (ranges defined per task)
|
| 187 |
-
- `comment_repeat_score` in [0.60, 0.90]
|
| 188 |
-
- `ip_cluster_id = "ip_gang_{seed}"` and `shared_ip_count = 9`
|
| 189 |
-
- Dense intra-gang follow edges (density 0.60–0.80 by task)
|
| 190 |
|
| 191 |
-
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
-
|
| 194 |
-
- `comment_repeat_score`: Beta(1, 20) ≈ mostly 0.00–0.08
|
| 195 |
-
- Each has a unique `ip_cluster_id`
|
| 196 |
|
| 197 |
-
|
|
|
|
|
|
|
| 198 |
|
| 199 |
-
|
| 200 |
-
- `comment_repeat_score` in [0.10, 0.30]
|
| 201 |
-
- They look suspicious but are NOT gang members — they penalise reckless flagging.
|
| 202 |
|
| 203 |
-
|
| 204 |
|
| 205 |
-
-
|
| 206 |
-
- `photo_reuse_score` in [0.00, 0.05], `bio_template_score` in [0.00, 0.05]
|
| 207 |
-
- High `hub_legitimacy_score` → the formula protects them from false positives.
|
| 208 |
|
| 209 |
-
|
| 210 |
-
`following_count=0`, no graph edges. They test whether the agent wastes steps
|
| 211 |
-
on disconnected nodes.
|
| 212 |
|
| 213 |
-
|
| 214 |
|
| 215 |
-
|
| 216 |
|
| 217 |
-
|
| 218 |
-
for g in gang_ids:
|
| 219 |
-
for h in gang_ids:
|
| 220 |
-
if g != h and random() < density:
|
| 221 |
-
g follows h
|
| 222 |
-
```
|
| 223 |
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
### 4.3 Episode JSON Schema
|
| 229 |
-
|
| 230 |
-
```json
|
| 231 |
-
{
|
| 232 |
-
"episode_id": "uuid4",
|
| 233 |
-
"task": "easy",
|
| 234 |
-
"seed": 0,
|
| 235 |
-
"max_steps": 30,
|
| 236 |
-
"win_recall": 0.8,
|
| 237 |
-
"win_precision": 0.7,
|
| 238 |
-
"starting_visible": ["acc_0012", "acc_0037", ...],
|
| 239 |
-
"gang_member_ids": ["acc_0003", "acc_0017", ...],
|
| 240 |
-
"decoy_ids": [],
|
| 241 |
-
"celeb_ids": ["acc_0048", "acc_0049"],
|
| 242 |
-
"zero_edge_ids": ["acc_0046", "acc_0047"],
|
| 243 |
-
"network": {
|
| 244 |
-
"accounts": [
|
| 245 |
-
{
|
| 246 |
-
"id": "acc_0003",
|
| 247 |
-
"is_fake": true,
|
| 248 |
-
"gang_id": "gang_A",
|
| 249 |
-
"features": {
|
| 250 |
-
"follower_count": 3421,
|
| 251 |
-
"following_count": 847,
|
| 252 |
-
"post_count": 214,
|
| 253 |
-
"avg_post_hour": 14.23,
|
| 254 |
-
"photo_reuse_score": 0.8712,
|
| 255 |
-
"bio_template_score": 0.7403,
|
| 256 |
-
"account_age_days": 67,
|
| 257 |
-
"comment_repeat_score": 0.7831,
|
| 258 |
-
"ip_cluster_id": "ip_gang_0",
|
| 259 |
-
"shared_ip_count": 9,
|
| 260 |
-
"name_change_count": 0
|
| 261 |
-
},
|
| 262 |
-
"true_edges": {
|
| 263 |
-
"follows": ["acc_0017", "acc_0029", ...],
|
| 264 |
-
"followed_by": ["acc_0017", "acc_0008", ...]
|
| 265 |
-
}
|
| 266 |
-
}
|
| 267 |
-
]
|
| 268 |
-
},
|
| 269 |
-
"evasion_schedule": []
|
| 270 |
-
}
|
| 271 |
-
```
|
| 272 |
|
| 273 |
---
|
| 274 |
|
| 275 |
-
##
|
| 276 |
|
| 277 |
-
**File:** `
|
| 278 |
-
|
| 279 |
-
### 5.1 ActionType (enum)
|
| 280 |
-
|
| 281 |
-
| Value | Cost | Effect |
|
| 282 |
-
| ----------------------- | ------- | ---------------------------------------------------------------------------------------- |
|
| 283 |
-
| `inspect` | 1 step | Reveals full `AccountProfile` + follow list; adds neighbors to `visible_account_ids` |
|
| 284 |
-
| `investigate_network` | 2 steps | Expands 2 hops from account; only reveals account IDs (no profiles) |
|
| 285 |
-
| `flag` | 0 steps | Marks account as gang member; triggers SUSPECT cascade to visible neighbors |
|
| 286 |
-
| `unflag` | 0 steps | Removes flag; clears CONFIRMED_FAKE status |
|
| 287 |
-
| `submit` | 0 steps | Ends episode; triggers scoring |
|
| 288 |
-
|
| 289 |
-
### 5.2 AccountStatus (enum)
|
| 290 |
|
| 291 |
-
```
|
| 292 |
-
NORMAL → no signal or formula risk < 0.35
|
| 293 |
-
SUSPECT → auto-elevated via dual cascade:
|
| 294 |
-
(1) a flagged account follows this account, OR
|
| 295 |
-
(2) this account shares ip_cluster_id with a flagged account
|
| 296 |
-
CONFIRMED_FAKE → agent explicitly flagged this account
|
| 297 |
-
```
|
| 298 |
|
| 299 |
-
|
| 300 |
-
SUSPECT is set automatically — the agent never sets it manually.
|
| 301 |
|
| 302 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
|
| 304 |
-
```
|
| 305 |
-
|
|
|
|
|
|
|
|
|
|
| 306 |
|
| 307 |
-
|
| 308 |
-
follower_count: int # followers (fakes: 1k-8k, celebs: 100k-5M)
|
| 309 |
-
following_count: int # accounts followed (fakes: 200-2000)
|
| 310 |
-
post_count: int # total posts (fakes: 50-500)
|
| 311 |
|
| 312 |
-
#
|
| 313 |
-
avg_post_hour: float # mean posting hour 0-23 (gang: tightly clustered)
|
| 314 |
-
account_age_days: int # days since creation (gang: same week, within 7 days)
|
| 315 |
|
| 316 |
-
|
| 317 |
-
photo_reuse_score: float # pHash stolen-photo detection
|
| 318 |
-
bio_template_score: float # cosine similarity to fake bio templates
|
| 319 |
-
comment_repeat_score: float # copy-paste spam comment fraction
|
| 320 |
|
| 321 |
-
#
|
| 322 |
-
shared_ip_count: int # accounts sharing same IP subnet (gang: 9)
|
| 323 |
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
|
| 332 |
-
#
|
| 333 |
-
fake_risk_score: float # composite 0-1 (main decision signal)
|
| 334 |
-
node_risk: float # from photo_reuse + bio_template
|
| 335 |
-
behavior_risk: float # from account_age + post_hour_cluster
|
| 336 |
-
graph_risk: float # from flagged_neighbor_ratio + mutual + avg_neighbor
|
| 337 |
-
hub_legitimacy_score: float # celebrity/hub discount
|
| 338 |
|
| 339 |
-
|
| 340 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
|
| 342 |
-
#
|
| 343 |
-
status: AccountStatus # NORMAL / SUSPECT / CONFIRMED_FAKE
|
| 344 |
-
visible_follows: List[str] # follow list revealed by INSPECT
|
| 345 |
-
```
|
| 346 |
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
```python
|
| 350 |
-
done: bool # episode over?
|
| 351 |
-
reward: Optional[float] # only set on terminal step
|
| 352 |
-
visible_accounts: List[AccountProfile] # fully profiled (inspected) accounts
|
| 353 |
-
visible_account_ids: List[str] # all known account IDs (profiled + seen)
|
| 354 |
-
flagged_ids: List[str] # currently flagged by agent
|
| 355 |
-
inspected_ids: List[str] # accounts with full profiles revealed
|
| 356 |
-
suspect_ids: List[str] # auto-elevated SUSPECT accounts (uninspected cascade)
|
| 357 |
-
graph_edges: Dict[str, List[str]] # follow lists for inspected accounts
|
| 358 |
-
steps_remaining: int # budget left
|
| 359 |
-
evasion_triggered: bool # was evasion active this episode?
|
| 360 |
-
evasion_count: int # how many evasion events have fired
|
| 361 |
-
task: str # "easy" / "medium" / "hard"
|
| 362 |
-
message: str # human-readable result / status message
|
| 363 |
-
```
|
| 364 |
|
| 365 |
---
|
| 366 |
|
| 367 |
-
##
|
| 368 |
|
| 369 |
**File:** `server/environment.py`
|
| 370 |
|
| 371 |
-
###
|
| 372 |
|
| 373 |
-
|
| 374 |
-
reset(task, seed)
|
| 375 |
-
└── loads JSON episode file (or generates on the fly)
|
| 376 |
-
└── initialises _visible_ids with starting_visible accounts
|
| 377 |
-
└── returns initial observation (no profiles yet)
|
| 378 |
-
|
| 379 |
-
step(action) [called repeatedly]
|
| 380 |
-
└── INSPECT → _do_inspect() → reveals profile + neighbors
|
| 381 |
-
└── FLAG → _do_flag() → cascades SUSPECT to visible neighbors
|
| 382 |
-
└── UNFLAG → _do_unflag() → clears status
|
| 383 |
-
└── INVESTIGATE_NETWORK → _do_investigate() → reveals 2-hop IDs
|
| 384 |
-
└── SUBMIT → _do_submit() → scores and ends episode
|
| 385 |
-
|
| 386 |
-
If step_count >= max_steps → forced submit (penalty -2.0)
|
| 387 |
-
```
|
| 388 |
-
|
| 389 |
-
### 6.2 Action Mechanics in Detail
|
| 390 |
-
|
| 391 |
-
**INSPECT (1 step):**
|
| 392 |
-
|
| 393 |
-
1. Adds account to `_inspected`
|
| 394 |
-
2. Calls `_build_profile(acc_id)` — computes all 22 features dynamically
|
| 395 |
-
3. Adds all accounts this account follows to `_visible_ids`
|
| 396 |
-
4. Returns updated observation
|
| 397 |
-
|
| 398 |
-
**INVESTIGATE_NETWORK (2 steps):**
|
| 399 |
|
| 400 |
-
1
|
| 401 |
-
2. **Bidirectional 2-hop expansion:** Traverses both `_live_edges` (outgoing follows)
|
| 402 |
-
AND `_reverse_edges` (incoming followers) for the target and each 1-hop neighbor.
|
| 403 |
-
This means the expansion covers:
|
| 404 |
-
- Outgoing: `acc → follows → their follows` AND `acc → follows → their followers`
|
| 405 |
-
- Incoming: `acc ← followers → their follows` AND `acc ← followers ← their followers`
|
| 406 |
-
3. Adds all new account IDs to `_visible_ids` (no full profiles — IDs only)
|
| 407 |
-
4. **Re-cascades SUSPECT** to newly visible accounts via two signals:
|
| 408 |
-
- *Follow-graph cascade:* any newly visible account followed by a flagged account → SUSPECT
|
| 409 |
-
- *IP cluster cascade:* any newly visible account sharing `ip_cluster_id` with a flagged account → SUSPECT (zero false positives — gang shares one IP; real accounts have unique IPs)
|
| 410 |
-
5. Cost: 2 steps, -0.02 score. Returns count of newly discovered IDs.
|
| 411 |
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
1. Adds account to `_flagged`
|
| 415 |
-
2. Sets `_account_statuses[acc_id] = "confirmed_fake"`
|
| 416 |
-
3. **Dual cascade** to SUSPECT:
|
| 417 |
-
- *Cascade 1 — Follow-graph:* For every neighbor in `_live_edges[acc_id]`
|
| 418 |
-
(accounts the flagged user follows), if the neighbor is visible and NORMAL → SUSPECT.
|
| 419 |
-
Gang members follow each other at density 0.70+, so this is high-precision.
|
| 420 |
-
- *Cascade 2 — IP cluster:* Any visible account sharing the same `ip_cluster_id`
|
| 421 |
-
as the flagged account → SUSPECT. Gang members all share `ip_gang_{seed}`;
|
| 422 |
-
real and decoy accounts each have a unique IP cluster. Zero false positives.
|
| 423 |
-
4. Refreshes all already-inspected accounts that follow `acc_id`
|
| 424 |
-
(their `flagged_neighbor_count` just increased, so risk scores change)
|
| 425 |
-
|
| 426 |
-
**SUBMIT:**
|
| 427 |
-
Computes final scores (see §6.3).
|
| 428 |
-
|
| 429 |
-
### 6.3 Reward Function
|
| 430 |
|
| 431 |
```
|
| 432 |
-
tp = len(gang_ids ∩ flagged_ids) # true positives
|
| 433 |
-
fp = len(flagged_ids - gang_ids) # false positives
|
| 434 |
-
fn = len(gang_ids - flagged_ids) # false negatives
|
| 435 |
-
|
| 436 |
base_reward = tp×1.0 − fp×0.5 − fn×0.3
|
| 437 |
|
| 438 |
-
Win condition
|
| 439 |
easy/medium: recall ≥ 0.8 AND precision ≥ 0.7
|
| 440 |
hard: recall ≥ 0.9 AND precision ≥ 0.8
|
| 441 |
|
| 442 |
-
|
| 443 |
-
+5.0
|
| 444 |
-
+3.0
|
| 445 |
-
|
| 446 |
-
+
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
+1.0
|
| 450 |
-
|
| 451 |
-
Hard mode evasion penalty:
|
| 452 |
-
−1.0 × evasion_count
|
| 453 |
-
|
| 454 |
-
Forced submit (ran out of steps):
|
| 455 |
-
−2.0
|
| 456 |
-
|
| 457 |
-
Final score = base_reward + all bonuses/penalties
|
| 458 |
-
```
|
| 459 |
-
|
| 460 |
-
**Example:** Easy task, found 9/10 gang members, flagged 2 innocent accounts,
|
| 461 |
-
25 of the 30 available steps used, submitted voluntarily with 5 steps left (< 50% of the budget):
|
| 462 |
-
|
| 463 |
-
```
|
| 464 |
-
tp=9, fp=2, fn=1
|
| 465 |
-
base = 9×1.0 − 2×0.5 − 1×0.3 = 9 − 1 − 0.3 = 7.7
|
| 466 |
-
recall = 9/10 = 0.90 ≥ 0.8 ✓ precision = 9/11 = 0.82 ≥ 0.7 ✓
|
| 467 |
-
+5.0 win bonus
|
| 468 |
-
0 efficiency bonus (steps_left=5 < 30×0.5=15)
|
| 469 |
-
total = 7.7 + 5.0 = 12.7
|
| 470 |
```
|
| 471 |
|
| 472 |
-
###
|
| 473 |
-
|
| 474 |
-
The `evasion_schedule` in each episode defines trigger points. When
|
| 475 |
-
`step_count >= event["step"]` and the event hasn't fired yet:
|
| 476 |
|
| 477 |
-
**`unfollow_intragang`:**
|
| 478 |
-
|
| 479 |
-
The agent sees `mutual_follow_rate` and `flagged_neighbor_count` drop on
|
| 480 |
-
re-inspection. Hard mode fires this 4 times (steps 15, 30, 45, 60).
|
| 481 |
-
|
| 482 |
-
**`rename_count`:** A random subset of gang members get `name_change_count += 1`.
|
| 483 |
-
This is a visual signal — the agent should notice accounts that have changed
|
| 484 |
-
their name multiple times.
|
| 485 |
|
| 486 |
---
|
| 487 |
|
| 488 |
-
##
|
| 489 |
-
|
| 490 |
-
**File:** `server/scoring.py`
|
| 491 |
|
| 492 |
-
|
| 493 |
-
global state. They are called inside `_build_profile()` every time an account
|
| 494 |
-
is inspected or a neighbor is re-profiled after a FLAG.
|
| 495 |
|
| 496 |
-
|
| 497 |
|
| 498 |
-
|
| 499 |
|
| 500 |
-
|
| 501 |
-
node_risk = 0.60 × photo_reuse_score + 0.40 × bio_template_score
|
| 502 |
-
```
|
| 503 |
-
|
| 504 |
-
Photo reuse gets 60% weight because it is harder to spoof (requires actual
|
| 505 |
-
pHash fingerprint matching against a celebrity photo database).
|
| 506 |
-
|
| 507 |
-
**Example:** Gang member with `photo_reuse=0.87`, `bio_template=0.74`:
|
| 508 |
-
|
| 509 |
-
```
|
| 510 |
-
node_risk = 0.60 × 0.87 + 0.40 × 0.74 = 0.522 + 0.296 = 0.818
|
| 511 |
-
```
|
| 512 |
-
|
| 513 |
-
### 7.2 Behavior Risk
|
| 514 |
-
|
| 515 |
-
Captures temporal anomalies:
|
| 516 |
-
|
| 517 |
-
```
|
| 518 |
-
age_norm = min(1.0, account_age_days / 365.0)
|
| 519 |
-
behavior_risk = 0.55 × (1 − age_norm) + 0.45 × post_hour_cluster_score
|
| 520 |
-
```
|
| 521 |
-
|
| 522 |
-
`(1 − age_norm)` is high for newly created accounts (fakes are created right
|
| 523 |
-
before the operation starts). `post_hour_cluster_score` measures alignment with
|
| 524 |
-
the flagged cluster's mean posting hour (see §7.5).
|
| 525 |
-
|
| 526 |
-
**Example:** Gang member, `account_age_days=67`, `post_hour_cluster_score=0.91`:
|
| 527 |
-
|
| 528 |
-
```
|
| 529 |
-
age_norm = 67/365 = 0.184
|
| 530 |
-
behavior_risk = 0.55×(1−0.184) + 0.45×0.91 = 0.55×0.816 + 0.4095
|
| 531 |
-
= 0.449 + 0.410 = 0.859
|
| 532 |
-
```
|
| 533 |
-
|
| 534 |
-
### 7.3 Graph Risk
|
| 535 |
-
|
| 536 |
-
The most predictive signal once the investigation has started:
|
| 537 |
-
|
| 538 |
-
```
|
| 539 |
-
flagged_neighbor_ratio = flagged_neighbor_count / max(inspected_neighbor_count, 1)
|
| 540 |
-
graph_risk = 0.45 × flagged_neighbor_ratio
|
| 541 |
-
+ 0.35 × mutual_follow_rate
|
| 542 |
-
+ 0.20 × avg_neighbor_photo_reuse
|
| 543 |
-
```
|
| 544 |
-
|
| 545 |
-
`flagged_neighbor_ratio` gets 45% weight — if several of this account's friends
|
| 546 |
-
are already confirmed fakes, this account is very likely fake too.
|
| 547 |
-
|
| 548 |
-
**Example:** After 3 gang members flagged; inspecting a 4th gang member:
|
| 549 |
-
|
| 550 |
-
```
|
| 551 |
-
flagged_neighbor_count = 3 (3 already-flagged accounts in its follow list)
|
| 552 |
-
inspected_neighbor_count = 4 (total inspected follows)
|
| 553 |
-
mutual_follow_rate = 0.78 (gang mutually follow heavily)
|
| 554 |
-
avg_neighbor_photo_reuse = 0.81
|
| 555 |
-
|
| 556 |
-
flagged_neighbor_ratio = 3/4 = 0.75
|
| 557 |
-
graph_risk = 0.45×0.75 + 0.35×0.78 + 0.20×0.81
|
| 558 |
-
= 0.338 + 0.273 + 0.162 = 0.773
|
| 559 |
-
```
|
| 560 |
-
|
| 561 |
-
### 7.4 Hub Legitimacy
|
| 562 |
-
|
| 563 |
-
Protects celebrities and legitimate large accounts from false positives:
|
| 564 |
-
|
| 565 |
-
```
|
| 566 |
-
F_MAX = 1,000,000
|
| 567 |
-
followers_norm = min(1.0, log(1+follower_count) / log(1+F_MAX))
|
| 568 |
-
follow_ratio_norm = min(1.0, (following_count / max(follower_count, 1)) / 5.0)
|
| 569 |
-
age_norm = min(1.0, account_age_days / 365.0)
|
| 570 |
-
|
| 571 |
-
hub_legitimacy = 0.45 × followers_norm
|
| 572 |
-
+ 0.25 × (1 − follow_ratio_norm)
|
| 573 |
-
+ 0.20 × age_norm
|
| 574 |
-
+ 0.10 × (1 − suspicious_mutual_ratio)
|
| 575 |
-
```
|
| 576 |
-
|
| 577 |
-
Four signals of legitimacy:
|
| 578 |
-
|
| 579 |
-
- Large log-scaled follower count (0.45 weight) — genuine celebrities have
|
| 580 |
-
millions; fake accounts peak at ~8,000
|
| 581 |
-
- Low follow-to-follower ratio (0.25 weight) — celebs follow few, are followed
|
| 582 |
-
by many; fakes follow aggressively
|
| 583 |
-
- Old account (0.20 weight) — real celebrities have accounts years old
|
| 584 |
-
- Not mutually following suspicious accounts (0.10 weight) — a celeb being
|
| 585 |
-
*followed by* fakes doesn't make the celeb fake
|
| 586 |
-
|
| 587 |
-
**Example — Celebrity with 2,000,000 followers:**
|
| 588 |
-
|
| 589 |
-
```
|
| 590 |
-
followers_norm = min(1.0, log(2,000,001) / log(1,000,001)) = min(1.0, 14.509/13.816) = min(1.0, 1.050) = 1.0 (capped)
|
| 591 |
-
follow_ratio_norm = (200 / 2,000,000) / 5.0 = 0.00002 ≈ 0.0
|
| 592 |
-
age_norm = min(1.0, 2000/365) = 1.0
|
| 593 |
-
|
| 594 |
-
hub_legitimacy = 0.45×1.0 + 0.25×(1−0.0) + 0.20×1.0 + 0.10×1.0 = 1.00
|
| 595 |
-
```
|
| 596 |
-
|
| 597 |
-
**Example — Gang member:**
|
| 598 |
-
|
| 599 |
-
```
|
| 600 |
-
followers_norm = log(1+3422) / log(1,000,001) = 8.138/13.816 = 0.589
|
| 601 |
-
follow_ratio_norm = min(1.0, (847/3422)/5.0) = 0.0495
|
| 602 |
-
age_norm = 67/365 = 0.184
|
| 603 |
-
|
| 604 |
-
hub_legitimacy = 0.45×0.589 + 0.25×(1−0.0495) + 0.20×0.184 + 0.10×0.9
|
| 605 |
-
= 0.265 + 0.238 + 0.037 + 0.090 = 0.630
|
| 606 |
-
```
|
| 607 |
-
|
| 608 |
-
### 7.5 Post-Hour Cluster Score
|
| 609 |
-
|
| 610 |
-
Computed dynamically inside `environment.py`, not in `scoring.py`:
|
| 611 |
-
|
| 612 |
-
```
|
| 613 |
-
mean_h = average avg_post_hour across all currently flagged accounts
|
| 614 |
-
diff = min(|acc_hour − mean_h|, 24 − |acc_hour − mean_h|) # wrap-around
|
| 615 |
-
post_hour_cluster_score = max(0.0, 1.0 − diff / 6.0)
|
| 616 |
-
```
|
| 617 |
-
|
| 618 |
-
The wrap-around handles the midnight boundary (e.g., 23:00 and 01:00 are 2 hours
|
| 619 |
-
apart, not 22). A score of 1.0 means posting at exactly the same hour as the
|
| 620 |
-
flagged cluster. A score of 0.0 means ≥6 hours away.
|
| 621 |
-
|
| 622 |
-
**Why 6.0 as the divisor:** 6 hours is a generous "different time zone" threshold.
|
| 623 |
-
If you post within 6 hours of the gang's schedule, you get partial credit.
|
| 624 |
-
|
| 625 |
-
**Example:** Gang posts at mean=14.0. Inspecting an account posting at 14.3:
|
| 626 |
-
|
| 627 |
-
```
|
| 628 |
-
diff = |14.3 − 14.0| = 0.3
|
| 629 |
-
post_hour_cluster_score = 1.0 − 0.3/6.0 = 0.950
|
| 630 |
-
```
|
| 631 |
-
|
| 632 |
-
### 7.6 Composite Fake Risk
|
| 633 |
-
|
| 634 |
-
```
|
| 635 |
-
fake_risk = clip(
|
| 636 |
-
0.30 × node_risk
|
| 637 |
-
+ 0.25 × behavior_risk
|
| 638 |
-
+ 0.45 × graph_risk
|
| 639 |
-
− 0.25 × hub_legitimacy,
|
| 640 |
-
0.0, 1.0
|
| 641 |
-
)
|
| 642 |
-
```
|
| 643 |
-
|
| 644 |
-
Weight rationale:
|
| 645 |
-
|
| 646 |
-
- **Graph risk 0.45** — structural signals are hardest for fakes to hide.
|
| 647 |
-
Mutual follow density requires real coordination; once you find one member,
|
| 648 |
-
the whole cluster lights up.
|
| 649 |
-
- **Node risk 0.30** — content signals are strong but can appear on decoys.
|
| 650 |
-
- **Behavior risk 0.25** — temporal clustering is a reliable early signal,
|
| 651 |
-
especially before any flags are set.
|
| 652 |
-
- **Hub legitimacy −0.25** — subtractive discount. A celebrity with 5M followers
|
| 653 |
-
has hub_legitimacy ≈ 1.0, so even if gang members follow them, their risk
|
| 654 |
-
formula produces: `0.30×0.02 + 0.25×0.05 + 0.45×0.10 − 0.25×1.0 ≈ −0.19 → clipped to 0.0`
|
| 655 |
-
|
| 656 |
-
**Full gang member example** (after 3 flags set):
|
| 657 |
-
|
| 658 |
-
```
|
| 659 |
-
node_risk = 0.818 (photo=0.87, bio=0.74)
|
| 660 |
-
behavior_risk = 0.859 (age=67d, cluster_score=0.91)
|
| 661 |
-
graph_risk = 0.773 (ratio=0.75, mutual=0.78, nbr_photo=0.81)
|
| 662 |
-
hub_legitimacy= 0.630 (3k followers, 67d old, no celeb)
|
| 663 |
-
|
| 664 |
-
fake_risk = 0.30×0.818 + 0.25×0.859 + 0.45×0.773 − 0.25×0.630
|
| 665 |
-
= 0.245 + 0.215 + 0.348 − 0.158
|
| 666 |
-
= 0.650
|
| 667 |
-
```
|
| 668 |
-
|
| 669 |
-
### 7.7 Risk Classification
|
| 670 |
-
|
| 671 |
-
```
|
| 672 |
-
fake_risk < 0.35 → "normal"
|
| 673 |
-
0.35 ≤ risk < 0.60 → "suspect"
|
| 674 |
-
risk ≥ 0.60 → "confirmed_fake" (formula-level; explicit flag overrides)
|
| 675 |
-
```
|
| 676 |
-
|
| 677 |
-
### 7.8 Grader Score (Submission Metric)
|
| 678 |
-
|
| 679 |
-
This normalised [0.0, 1.0] score is returned by the `/grader` endpoint:
|
| 680 |
-
|
| 681 |
-
```
|
| 682 |
-
recall = tp / 10
|
| 683 |
-
precision = tp / max(tp + fp, 1)
|
| 684 |
-
efficiency = max(0.0, (max_steps − steps_used) / max_steps)
|
| 685 |
-
|
| 686 |
-
if recall ≥ 0.8 AND precision ≥ 0.7:
|
| 687 |
-
score = 0.55 + 0.20×recall + 0.15×precision + 0.10×efficiency
|
| 688 |
-
else:
|
| 689 |
-
score = 0.30×recall + 0.10×precision
|
| 690 |
-
```
|
| 691 |
-
|
| 692 |
-
**Maximum possible score:** `0.55 + 0.20×1.0 + 0.15×1.0 + 0.10×1.0 = 1.00`
|
| 693 |
-
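(theoretical upper bound: requires all 10 found, no false positives, and efficiency = 1.0, i.e. 0 steps used — unreachable in practice because inspecting and flagging accounts consumes steps)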
|
| 694 |
-
|
| 695 |
-
**Win threshold score:** `0.55 + 0.20×0.8 + 0.15×0.7 + 0.10×0 = 0.55 + 0.16 + 0.105 = 0.815`
|
| 696 |
-
|
| 697 |
-
**Partial credit examples:**
|
| 698 |
-
|
| 699 |
-
- Found 6/10, no false positives: `0.30×0.6 + 0.10×1.0 = 0.18 + 0.10 = 0.28`
|
| 700 |
-
- Found 9/10, 3 false positives: recall=0.9, precision=9/12=0.75 → win: `0.55 + 0.18 + 0.113 = 0.843`
|
| 701 |
-
|
| 702 |
-
---
|
| 703 |
-
|
| 704 |
-
## 8. Account Status State Machine
|
| 705 |
-
|
| 706 |
-
```
|
| 707 |
-
┌──────────────────────────────────────┐
|
| 708 |
-
│ │
|
| 709 |
-
INSPECT INSPECT
|
| 710 |
-
│ │
|
| 711 |
-
▼ ▼
|
| 712 |
-
┌──────────────┐ FLAG cascade ┌──────────────────┐
|
| 713 |
-
│ NORMAL │ ─────────────────► │ SUSPECT │
|
| 714 |
-
└──────────────┘ (neighbor of └──────────────────┘
|
| 715 |
-
│ flagged) │
|
| 716 |
-
│ │
|
| 717 |
-
FLAG(account) FLAG(account)
|
| 718 |
-
│ │
|
| 719 |
-
▼ ▼
|
| 720 |
-
┌──────────────────────────────────────────────────┐
|
| 721 |
-
│ CONFIRMED_FAKE │
|
| 722 |
-
└──────────────────────────────────────────────────┘
|
| 723 |
-
│
|
| 724 |
-
UNFLAG(account)
|
| 725 |
-
│
|
| 726 |
-
▼
|
| 727 |
-
(status cleared → NORMAL)
|
| 728 |
-
```
|
| 729 |
-
|
| 730 |
-
**When FLAG(X) is called:**
|
| 731 |
-
|
| 732 |
-
1. X → CONFIRMED_FAKE
|
| 733 |
-
2. **Dual SUSPECT cascade:**
|
| 734 |
-
- *Follow-graph:* For every account Y that X follows (`_live_edges[X]`):
|
| 735 |
-
if Y is visible AND Y is NORMAL → Y becomes SUSPECT
|
| 736 |
-
- *IP cluster:* For every visible account Z sharing X's `ip_cluster_id`:
|
| 737 |
-
if Z is not flagged AND Z is NORMAL → Z becomes SUSPECT
|
| 738 |
-
(gang members share `ip_gang_{seed}`; real accounts have unique IPs → zero false positives)
|
| 739 |
-
3. All already-inspected accounts that follow X are re-profiled
|
| 740 |
-
(their `flagged_neighbor_count` increases, which raises their `fake_risk_score`)
|
| 741 |
-
|
| 742 |
-
**Why SUSPECT matters:**
|
| 743 |
-
|
| 744 |
-
- The `suspect_ids` field in the observation lists all SUSPECT accounts not yet inspected
|
| 745 |
-
- Both the rule engine and the LLM treat these as highest priority for the next INSPECT
|
| 746 |
-
- This creates an efficient cascade: flag one → inspect suspects → some are gang
|
| 747 |
-
→ flag them → more suspects appear → repeat until cluster is exhausted
|
| 748 |
-
|
| 749 |
-
**Example cascade on easy task:**
|
| 750 |
-
|
| 751 |
-
```
|
| 752 |
-
Step 1: INSPECT acc_0003 (gang member) → no flags yet, fake_risk ≈ 0.45
|
| 753 |
-
Step 2: FLAG acc_0003
|
| 754 |
-
→ acc_0017, acc_0029, acc_0041 become SUSPECT (acc_0003 follows them)
|
| 755 |
-
→ obs.suspect_ids = ["acc_0017", "acc_0029", "acc_0041"]
|
| 756 |
-
Step 3: INSPECT acc_0017 (gang member) → fake_risk now 0.72 (flagged_neighbor_count=1)
|
| 757 |
-
Step 4: FLAG acc_0017
|
| 758 |
-
→ acc_0017 follows acc_0003 (already flagged — skipped), acc_0029 and acc_0041 (already SUSPECT — unchanged), and acc_0055 (NORMAL → SUSPECT)
|
| 759 |
-
→ acc_0003, acc_0017 profiles refreshed (their mutual flags increased)
|
| 760 |
-
Step 5: INSPECT acc_0029 → fake_risk = 0.81 (flagged_neighbor_count=2)
|
| 761 |
-
...
|
| 762 |
-
```
|
| 763 |
-
|
| 764 |
-
Each FLAG makes the next gang member easier to find because their risk score rises.
|
| 765 |
|
| 766 |
---
|
| 767 |
|
| 768 |
-
##
|
| 769 |
|
| 770 |
**File:** `agent/policy.py`
|
| 771 |
|
| 772 |
-
|
| 773 |
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
```python
|
| 777 |
-
MODEL_ID = "qwen.qwen3-next-80b-a3b"
|
| 778 |
-
```
|
| 779 |
-
|
| 780 |
-
Called via the Bedrock Converse API:
|
| 781 |
-
|
| 782 |
-
```python
|
| 783 |
-
client.converse(
|
| 784 |
-
modelId=MODEL_ID,
|
| 785 |
-
messages=[{"role": "user", "content": [{"text": prompt}]}],
|
| 786 |
-
system=[{"text": SYSTEM_PROMPT}],
|
| 787 |
-
inferenceConfig={"maxTokens": 512, "temperature": 0.4}
|
| 788 |
-
)
|
| 789 |
-
```
|
| 790 |
-
|
| 791 |
-
Temperature 0.4 is low enough for consistent action format but high enough to
|
| 792 |
-
avoid degenerate repetition.
|
| 793 |
-
|
| 794 |
-
### 9.2 Prompt Construction
|
| 795 |
|
| 796 |
Every step, the policy builds a prompt from three components:
|
| 797 |
|
|
@@ -803,612 +275,178 @@ Every step, the policy builds a prompt from three components:
|
|
| 803 |
What is your next action?
|
| 804 |
```
|
| 805 |
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
The `_format_observation()` function converts the typed `FakeGangObservation`
|
| 809 |
-
into a text block. Accounts are **sorted by `fake_risk_score` descending**,
|
| 810 |
-
with status badges prepended:
|
| 811 |
-
|
| 812 |
-
```
|
| 813 |
-
TASK: EASY | Steps remaining: 22
|
| 814 |
-
Currently flagged (3/10): acc_0003, acc_0017, acc_0029
|
| 815 |
-
Suspects not yet inspected (4): acc_0041, acc_0055, acc_0062, acc_0078
|
| 816 |
-
|
| 817 |
-
PROFILED ACCOUNTS (sorted by fake_risk_score — highest first):
|
| 818 |
-
[status | risk | node beh graph hub | photo bio mutual | comment ip_count]
|
| 819 |
-
CONFIRMED_FAKE acc_0029 ◀ FLAGGED: risk=0.821 | node=0.82 beh=0.77 graph=0.86 hub=0.63
|
| 820 |
-
SUSPECT acc_0041: risk=0.714 | node=0.79 beh=0.81 graph=0.74 hub=0.65 fnbr=3(!)
|
| 821 |
-
SUSPECT acc_0055: risk=0.681 | node=0.71 beh=0.74 graph=0.69 hub=0.67 fnbr=2(!)
|
| 822 |
-
NORMAL acc_0022: risk=0.121 | node=0.09 beh=0.31 graph=0.03 hub=0.84 [HUB?]
|
| 823 |
-
...
|
| 824 |
-
|
| 825 |
-
KNOWN UNINSPECTED IDs: acc_0062, acc_0078, acc_0091, ...
|
| 826 |
-
|
| 827 |
-
Environment message: Flagged acc_0029 as suspected fake.
|
| 828 |
-
```
|
| 829 |
-
|
| 830 |
-
Key formatting choices:
|
| 831 |
|
| 832 |
-
|
| 833 |
-
actionable graph signal
|
| 834 |
-
- `[HUB?]` appears when `hub_legitimacy_score > 0.70` — warns the LLM not to flag
|
| 835 |
-
- Status badge width is fixed (13 chars) for visual alignment
|
| 836 |
-
|
| 837 |
-
### 9.4 Required Response Format
|
| 838 |
|
| 839 |
```xml
|
| 840 |
<thinking>
|
| 841 |
-
|
| 842 |
-
what signal you're acting on, what your next move is.
|
| 843 |
</thinking>
|
| 844 |
<action>
|
| 845 |
INSPECT acc_0041
|
| 846 |
</action>
|
| 847 |
```
|
| 848 |
|
| 849 |
-
|
| 850 |
-
using regex, then matches to action types. If parsing fails entirely, the
|
| 851 |
-
fallback inspects the highest-scored uninspected account.
|
| 852 |
-
|
| 853 |
-
### 9.5 Retry Logic
|
| 854 |
-
|
| 855 |
-
```python
|
| 856 |
-
for attempt in range(3):
|
| 857 |
-
try:
|
| 858 |
-
raw = invoke_qwen(...)
|
| 859 |
-
action = _parse_action(raw, obs)
|
| 860 |
-
return action, raw
|
| 861 |
-
except Exception as exc:
|
| 862 |
-
wait = 2 ** attempt # 1s, 2s, 4s
|
| 863 |
-
time.sleep(wait)
|
| 864 |
-
|
| 865 |
-
# All retries failed → heuristic fallback
|
| 866 |
-
return _heuristic_fallback(obs), "[FALLBACK]"
|
| 867 |
-
```
|
| 868 |
|
| 869 |
---
|
| 870 |
|
| 871 |
-
##
|
| 872 |
|
| 873 |
**Files:** `agent/reflection.py`, `agent/memory.py`
|
| 874 |
|
| 875 |
-
The agent **cannot** update Qwen3's weights — Bedrock is a black-box API.
|
| 876 |
-
Instead, it learns via **Reflexion**: post-episode lessons are written as text
|
| 877 |
-
and injected into future prompts.
|
| 878 |
|
| 879 |
-
###
|
| 880 |
|
| 881 |
-
|
| 882 |
-
Episode N
|
| 883 |
-
1. LLM acts using: system_prompt + reflections[1..4] + best_trajectory
|
| 884 |
-
2. Episode ends → WIN or LOSS
|
| 885 |
-
3. Post-episode learning:
|
| 886 |
-
If LOSS:
|
| 887 |
-
→ generate_reflection(action_log, outcome) → Qwen writes a lesson
|
| 888 |
-
→ lesson stored to memory/reflections_easy.jsonl
|
| 889 |
-
If WIN:
|
| 890 |
-
→ save trajectory to memory/best_trajectory_easy.json (if better reward)
|
| 891 |
-
→ generate_success_reflection() → Qwen writes what worked
|
| 892 |
-
→ stored to reflections
|
| 893 |
-
|
| 894 |
-
Episode N+1
|
| 895 |
-
→ get_reflections("easy", n=4) returns last 4 lessons
|
| 896 |
-
→ get_best_trajectory("easy") returns best win as few-shot example
|
| 897 |
-
→ both injected into prompt → LLM has learned from its past
|
| 898 |
-
```
|
| 899 |
|
| 900 |
-
### 10.2 Reflection Generation
|
| 901 |
|
| 902 |
-
A separate Qwen3 call is made after each episode with this prompt:
|
| 903 |
|
| 904 |
```
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
|
| 908 |
-
|
| 909 |
-
|
| 910 |
-
|
| 911 |
-
INVESTIGATION LOG:
|
| 912 |
-
1. INSPECT acc_0022
|
| 913 |
-
2. INSPECT acc_0037
|
| 914 |
-
...
|
| 915 |
-
20. SUBMIT
|
| 916 |
-
|
| 917 |
-
Write a 2-3 sentence lesson for your future self based on this case.
|
| 918 |
-
```
|
| 919 |
-
|
| 920 |
-
**Example generated reflection:**
|
| 921 |
-
|
| 922 |
-
> "The starting accounts were all real; I wasted 8 steps inspecting low-signal nodes
|
| 923 |
-
> before pivoting. When photo_reuse and bio_template are both below 0.3 after 3 inspections,
|
| 924 |
-
> immediately use INVESTIGATE_NETWORK to jump to a different graph region.
|
| 925 |
-
> Once I found the first gang member at step 14, I should have cascaded faster
|
| 926 |
-
> via SUSPECT accounts rather than continuing to inspect unknown IDs."
|
| 927 |
-
|
| 928 |
-
This lesson is stored and appears in Episode 13's prompt, causing the agent to
|
| 929 |
-
pivot earlier and follow the cascade more aggressively.
|
| 930 |
-
|
| 931 |
-
### 10.3 Best Trajectory (Few-Shot Example)
|
| 932 |
-
|
| 933 |
-
The first episode that wins is saved as a few-shot example. Every subsequent win
|
| 934 |
-
replaces it only if the reward is higher. The trajectory appears in the prompt as:
|
| 935 |
|
|
|
|
|
|
|
|
|
|
| 936 |
```
|
| 937 |
-
━━━ EXAMPLE SUCCESSFUL CASE (task=easy, reward=+14.20) ━━━
|
| 938 |
-
1. INSPECT acc_0012
|
| 939 |
-
2. INSPECT acc_0037
|
| 940 |
-
3. FLAG acc_0037
|
| 941 |
-
4. INSPECT acc_0041 (suspect — cascaded from acc_0037)
|
| 942 |
-
5. FLAG acc_0041
|
| 943 |
-
...
|
| 944 |
-
→ [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00
|
| 945 |
-
```
|
| 946 |
-
|
| 947 |
-
The LLM sees a concrete example of the exact pattern that leads to a perfect win,
|
| 948 |
-
and mirrors this strategy.
|
| 949 |
-
|
| 950 |
-
### 10.4 Memory Persistence
|
| 951 |
|
| 952 |
-
|
| 953 |
-
|
| 954 |
-
```
|
| 955 |
-
memory/
|
| 956 |
-
├── reflections_easy.jsonl # one JSON entry per reflection
|
| 957 |
-
├── reflections_medium.jsonl
|
| 958 |
-
├── reflections_hard.jsonl
|
| 959 |
-
├── best_trajectory_easy.json # single best win per task
|
| 960 |
-
├── best_trajectory_medium.json
|
| 961 |
-
├── best_trajectory_hard.json
|
| 962 |
-
├── wins_easy.jsonl # episode-level win history (for alpha)
|
| 963 |
-
├── wins_medium.jsonl
|
| 964 |
-
├── wins_hard.jsonl
|
| 965 |
-
├── alpha_easy.json # current α for this task
|
| 966 |
-
├── alpha_medium.json
|
| 967 |
-
└── alpha_hard.json
|
| 968 |
-
```
|
| 969 |
|
| 970 |
-
|
| 971 |
-
learning persists across container restarts and redeployments.
|
| 972 |
|
| 973 |
---
|
| 974 |
|
| 975 |
-
##
|
| 976 |
|
| 977 |
**File:** `agent/hybrid_policy.py`
|
| 978 |
|
| 979 |
-
|
| 980 |
-
engine is always consistent but cannot adapt.** The hybrid policy exploits both:
|
| 981 |
-
rules provide a safety net early while the LLM builds its track record; once the
|
| 982 |
-
LLM proves itself, rules step back.
|
| 983 |
-
|
| 984 |
-
### 11.1 The Problem with Pure LLM
|
| 985 |
-
|
| 986 |
-
In the first few episodes:
|
| 987 |
|
| 988 |
-
|
| 989 |
-
- No successful trajectory to use as a few-shot example
|
| 990 |
-
- The LLM is essentially guessing based only on the system prompt
|
| 991 |
-
- Win rate on `easy` episodes ≈ 30% at episode 1 (single-digit recall)
|
| 992 |
|
| 993 |
-
|
| 994 |
-
~60% win rate on `easy` from episode 1, with zero learning overhead.
|
| 995 |
|
| 996 |
-
###
|
| 997 |
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
- Adapt to the evasion events in hard mode
|
| 1001 |
-
- Prioritise which SUSPECT to inspect based on context
|
| 1002 |
-
- Recognise unusual configurations (e.g., decoys clustered near gang members)
|
| 1003 |
-
- Balance exploration vs. exploitation optimally
|
| 1004 |
-
|
| 1005 |
-
The LLM, given enough reflections, learns these nuances.
|
| 1006 |
-
|
| 1007 |
-
### 11.3 Alpha: The Trust Weight
|
| 1008 |
-
|
| 1009 |
-
α (alpha) is a per-task value in [0.20, cap] representing the agent's current
|
| 1010 |
-
trust in the LLM:
|
| 1011 |
|
| 1012 |
```
|
| 1013 |
reflection_factor = min(1.0, n_reflections / 4.0)
|
| 1014 |
raw = 0.20 + reflection_factor × (0.80 × recent_win_rate + 0.12)
|
| 1015 |
α = clamp(raw, 0.20, cap)
|
| 1016 |
-
|
| 1017 |
-
where:
|
| 1018 |
-
recent_win_rate = fraction of the last 10 episodes for this task that were wins (0.0–1.0)
|
| 1019 |
-
reflection_factor = min(1.0, n_reflections / 4.0)
|
| 1020 |
```
|
| 1021 |
|
| 1022 |
-
|
| 1023 |
-
|
| 1024 |
-
|
| 1025 |
-
|
| 1026 |
-
|
|
| 1027 |
-
| ------ | ------ | ------------------------------------------------------------------ |
|
| 1028 |
-
| easy | 0.50 | Rule engine alone achieves ~91% — LLM should assist, not override |
|
| 1029 |
-
| medium | 0.70 | Decoys require some LLM judgment, but cascade must stay |
|
| 1030 |
-
| hard | 0.85 | LLM needs latitude for evasion adaptation, but safety rules remain |
|
| 1031 |
-
|
| 1032 |
-
`reflection_factor` ensures the LLM must accumulate at least **4 reflections**
|
| 1033 |
-
before it can reach meaningful trust — pure win rate is not enough, because the LLM
|
| 1034 |
-
needs to have demonstrably learned from failures.
|
| 1035 |
|
| 1036 |
**Alpha trajectory over training (easy task, cap=0.50):**
|
| 1037 |
|
| 1038 |
-
| Episode |
|
| 1039 |
-
|
|
| 1040 |
-
| 1
|
| 1041 |
-
| 5
|
| 1042 |
-
| 10
|
| 1043 |
-
| 20
|
| 1044 |
-
|
| 1045 |
-
α starts at 0.20 (rules dominate) and climbs toward the task-specific cap as
|
| 1046 |
-
the LLM wins consistently and accumulates lessons. The cap ensures the rule
|
| 1047 |
-
engine retains veto power over high-confidence structural decisions.
|
| 1048 |
-
|
| 1049 |
-
### 11.4 Rule Action + Confidence
|
| 1050 |
-
|
| 1051 |
-
`get_rule_action(obs)` returns `(FakeGangAction, float)` where the float is
|
| 1052 |
-
the rule's confidence in its own decision:
|
| 1053 |
|
| 1054 |
-
|
| 1055 |
-
| ------------------------------------------------- | --------------------- | ---------------------------------- |
|
| 1056 |
-
| Steps remaining = 0 | SUBMIT | 1.00 |
|
| 1057 |
-
| Uninspected SUSPECT accounts exist | INSPECT suspects[0] | 0.95 |
|
| 1058 |
-
| Inspected account: fake_risk ≥ 0.85 | FLAG that account | 0.95 |
|
| 1059 |
-
| Inspected account: fake_risk in [threshold, 0.85) | FLAG that account | 0.70 + (risk − threshold) × 0.60 |
|
| 1060 |
-
| 10 accounts already flagged | SUBMIT | 0.85 |
|
| 1061 |
-
| Steps remaining ≤ 3 | SUBMIT | 0.90 |
|
| 1062 |
-
| Uninspected accounts available | INSPECT top candidate | 0.30 |
|
| 1063 |
-
| Nothing to do | SUBMIT | 0.75 |
|
| 1064 |
|
| 1065 |
-
|
| 1066 |
|
| 1067 |
-
|
| 1068 |
-
- Direct flag decisions have confidence ≥ 0.70
|
| 1069 |
-
- Exploratory decisions have confidence 0.30 (the rule is just suggesting, not insisting)
|
| 1070 |
|
| 1071 |
-
|
| 1072 |
-
|
| 1073 |
-
|
| 1074 |
-
|
| 1075 |
-
|
| 1076 |
-
|
| 1077 |
-
|
| 1078 |
-
|
| 1079 |
-
|
| 1080 |
-
|
| 1081 |
-
elif rule_conf >= alpha: # rule is confident enough to override
|
| 1082 |
-
mode = f"rule_override(c={rule_conf:.2f},α={alpha:.2f})"
|
| 1083 |
-
final = rule_action
|
| 1084 |
-
|
| 1085 |
-
else: # LLM is trusted; rule doesn't insist
|
| 1086 |
-
mode = f"llm(c={rule_conf:.2f}<α={alpha:.2f})"
|
| 1087 |
-
final = llm_action
|
| 1088 |
-
```
|
| 1089 |
|
| 1090 |
-
**
|
| 1091 |
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
-
- At **α=0.20** (early training, no history):
|
| 1095 |
-
|
| 1096 |
-
- Rules win whenever confidence ≥ 0.20
|
| 1097 |
-
- Even the lowest-confidence rule action, the exploratory INSPECT (confidence=0.30), still beats α=0.20
|
| 1098 |
-
- So rules dominate: ~90% of decisions are rule-driven
|
| 1099 |
-
- Effectively acts like the rule-based baseline agent
|
| 1100 |
-
- At **α=0.50** (moderate trust, mixed results):
|
| 1101 |
-
|
| 1102 |
-
- Rules win when confidence ≥ 0.50
|
| 1103 |
-
- Safety decisions (suspects, forced submit) still override: conf=0.95 > 0.50
|
| 1104 |
-
- Exploratory decisions (conf=0.30) now go to LLM: 0.30 < 0.50
|
| 1105 |
-
- The LLM controls exploration; rules control safety
|
| 1106 |
-
- At **α=0.84** (high trust, consistent wins):
|
| 1107 |
-
|
| 1108 |
-
- Rules win only when confidence ≥ 0.84
|
| 1109 |
-
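- Only rule decisions with confidence ≥ 0.84 still override: forced submit (1.00),
|
| 1110 |
-
uninspected suspects (0.95), flags at fake_risk ≥ 0.85 (0.95), SUBMIT with ≤ 3 steps remaining (0.90), and SUBMIT once 10 accounts are flagged (0.85)
|
| 1111 |
-
- Everything else goes to the LLM, including mid-range flag decisions whose confidence falls below 0.84 and all exploratory INSPECTs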
|
| 1112 |
-
- At **α=cap** (maximum trust for the task):
|
| 1113 |
-
|
| 1114 |
-
- On easy (cap=0.50): rules still override for suspects (0.95), flags (0.70+),
|
| 1115 |
-
and forced submits (1.00) — only exploratory INSPECTs (0.30) go to LLM
|
| 1116 |
-
- On hard (cap=0.85): rules only override the highest-confidence situations
|
| 1117 |
-
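(suspects, forced submit, flags at fake_risk ≥ 0.85, and the ≤ 3-steps-remaining / 10-accounts-flagged SUBMITs, all with confidence ≥ 0.85); LLM controls mid-range flag and exploration decisions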
|
| 1118 |
-
|
| 1119 |
-
### 11.6 Disagreement Examples
|
| 1120 |
-
|
| 1121 |
-
**Example 1 — Early training (α=0.25), LLM exploring, rule insisting on suspect:**
|
| 1122 |
-
|
| 1123 |
-
```
|
| 1124 |
-
Rule: INSPECT acc_0041 (SUSPECT account) confidence=0.95
|
| 1125 |
-
LLM: INSPECT acc_0099 (random exploration)
|
| 1126 |
-
Rule wins: 0.95 ≥ 0.25 → INSPECT acc_0041
|
| 1127 |
-
mode = "rule_override(c=0.95,α=0.25)"
|
| 1128 |
-
```
|
| 1129 |
-
|
| 1130 |
-
**Example 2 — Mid training (α=0.60), LLM flags a high-risk account:**
|
| 1131 |
-
|
| 1132 |
-
```
|
| 1133 |
-
Rule: INSPECT acc_0041 (uninspected suspect) confidence=0.95
|
| 1134 |
-
LLM: FLAG acc_0055 (fake_risk=0.79, already inspected)
|
| 1135 |
-
Rule wins: 0.95 ≥ 0.60 → INSPECT acc_0041
|
| 1136 |
-
mode = "rule_override(c=0.95,α=0.60)"
|
| 1137 |
-
```
|
| 1138 |
-
|
| 1139 |
-
*(Both actions are useful; the rule correctly prioritises cascade suspects
|
| 1140 |
-
before random flags)*
|
| 1141 |
-
|
| 1142 |
-
**Example 3 — High trust (α=0.85), LLM has learned to prioritise smarter:**
|
| 1143 |
-
|
| 1144 |
-
```
|
| 1145 |
-
Rule: INSPECT acc_0041 (exploratory, conf=0.30)
|
| 1146 |
-
LLM: FLAG acc_0055 (fake_risk=0.88, very high confidence)
|
| 1147 |
-
LLM wins: 0.30 < 0.85 → FLAG acc_0055
|
| 1148 |
-
mode = "llm(c=0.30<α=0.85)"
|
| 1149 |
-
```
|
| 1150 |
-
|
| 1151 |
-
**Example 4 — Both agree (most common case in late training):**
|
| 1152 |
-
|
| 1153 |
-
```
|
| 1154 |
-
Rule: INSPECT acc_0041 (SUSPECT, conf=0.95)
|
| 1155 |
-
LLM: INSPECT acc_0041 (LLM also noticed the suspect badge)
|
| 1156 |
-
mode = "agree"
|
| 1157 |
-
```
|
| 1158 |
-
|
| 1159 |
-
### 11.7 Alpha Persistence
|
| 1160 |
-
|
| 1161 |
-
After every episode, `train.py` does:
|
| 1162 |
-
|
| 1163 |
-
```python
|
| 1164 |
-
# Record outcome
|
| 1165 |
-
memory.record_win(task, won, episode_num)
|
| 1166 |
-
|
| 1167 |
-
# Recompute alpha with updated win history (per-task cap applied)
|
| 1168 |
-
new_wr = memory.recent_win_rate(task, n=10)
|
| 1169 |
-
new_alpha = compute_alpha(new_wr, n_reflections, task=current_task)
|
| 1170 |
-
|
| 1171 |
-
# Save for next run (even if container restarts)
|
| 1172 |
-
memory.save_alpha(task, new_alpha)
|
| 1173 |
-
```
|
| 1174 |
-
|
| 1175 |
-
Alpha is stored in `memory/alpha_{task}.json` and loaded at the start of each
|
| 1176 |
-
training run. This means the agent's trust level is preserved across Docker
|
| 1177 |
-
restarts — it doesn't reset to 0.20 every time.
|
| 1178 |
-
|
| 1179 |
-
### 11.8 Mode Logging
|
| 1180 |
-
|
| 1181 |
-
Every episode's metrics include a mode breakdown:
|
| 1182 |
-
|
| 1183 |
-
```json
|
| 1184 |
-
{
|
| 1185 |
-
"alpha_used": 0.42,
|
| 1186 |
-
"mode_agree": 11,
|
| 1187 |
-
"mode_rule": 7,
|
| 1188 |
-
"mode_llm": 4
|
| 1189 |
-
}
|
| 1190 |
-
```
|
| 1191 |
-
|
| 1192 |
-
The training printer shows this per episode:
|
| 1193 |
-
|
| 1194 |
-
```
|
| 1195 |
-
Ep 12 | easy | WIN | reward= +12.40 | recall=1.00 prec=0.91 | steps=21 | wr=60% | α=0.42 | agree=11 rule=7 llm=4
|
| 1196 |
-
```
|
| 1197 |
-
|
| 1198 |
-
You can watch the transition: early episodes have high `rule` counts; later
|
| 1199 |
-
episodes have high `agree` counts (LLM learned to make the same decisions as
|
| 1200 |
-
the rules, but also brings in strategic reasoning the rules can't).
|
| 1201 |
|
| 1202 |
---
|
| 1203 |
|
| 1204 |
-
##
|
| 1205 |
|
| 1206 |
**File:** `train.py`
|
| 1207 |
|
| 1208 |
-
###
|
| 1209 |
|
| 1210 |
-
| Phase | Episodes | Task
|
| 1211 |
-
|
|
| 1212 |
-
| 1
|
| 1213 |
-
| 2
|
| 1214 |
-
| 3
|
| 1215 |
|
| 1216 |
Seeds rotate deterministically: `seed = (episode_num + task_offset) % 50`
|
| 1217 |
-
so the agent sees all 50 pre-generated episodes before revisiting any.
|
| 1218 |
|
| 1219 |
-
###
|
| 1220 |
|
| 1221 |
```
|
| 1222 |
for ep in range(n_episodes):
|
| 1223 |
|
| 1224 |
-
1. DETERMINE TASK
|
| 1225 |
-
|
| 1226 |
-
|
| 1227 |
-
|
| 1228 |
-
|
| 1229 |
-
|
| 1230 |
-
|
| 1231 |
-
|
| 1232 |
-
3. LOAD CONTEXT
|
| 1233 |
-
reflections = memory.get_reflections(task, n=4) # last 4 lessons
|
| 1234 |
-
few_shot = memory.get_best_trajectory(task) # best win so far
|
| 1235 |
-
|
| 1236 |
-
4. RUN EPISODE (hybrid policy)
|
| 1237 |
-
obs = env.reset(task, seed)
|
| 1238 |
-
while not obs.done:
|
| 1239 |
-
rule_action, rule_conf = get_rule_action(obs)
|
| 1240 |
-
llm_action, raw_llm = get_action(obs, reflections, few_shot, α, temperature)
|
| 1241 |
-
final = blend(rule_action, llm_action, rule_conf, alpha)
|
| 1242 |
-
obs = env.step(final)
|
| 1243 |
-
|
| 1244 |
-
5. POST-EPISODE LEARNING
|
| 1245 |
-
memory.record_win(task, won, ep)
|
| 1246 |
-
new_alpha = compute_alpha(updated_wr, n_refs)
|
| 1247 |
-
memory.save_alpha(task, new_alpha)
|
| 1248 |
-
|
| 1249 |
-
if won:
|
| 1250 |
-
memory.add_trajectory(task, action_log, final_msg, reward, ep)
|
| 1251 |
-
if new_best_or_no_refs:
|
| 1252 |
-
reflection = generate_success_reflection(...)
|
| 1253 |
-
memory.add_reflection(task, reflection, ep, reward)
|
| 1254 |
-
else:
|
| 1255 |
-
reflection = generate_reflection(task, action_log, final_msg, ...)
|
| 1256 |
-
memory.add_reflection(task, reflection, ep, reward)
|
| 1257 |
-
|
| 1258 |
-
6. LOG
|
| 1259 |
-
print per-episode stats: task, win/loss, reward, recall, precision,
|
| 1260 |
-
steps, win_rate, α, mode breakdown
|
| 1261 |
```
|
| 1262 |
|
| 1263 |
-
|
| 1264 |
-
|
| 1265 |
-
|
| 1266 |
-
|
| 1267 |
-
```json
|
| 1268 |
-
{
|
| 1269 |
-
"episode": 15,
|
| 1270 |
-
"task": "easy",
|
| 1271 |
-
"seed": 14,
|
| 1272 |
-
"won": true,
|
| 1273 |
-
"reward": 13.20,
|
| 1274 |
-
"steps_used": 23,
|
| 1275 |
-
"recall": 1.00,
|
| 1276 |
-
"precision": 0.91,
|
| 1277 |
-
"action_log": ["INSPECT acc_0022", "INSPECT acc_0037", ...],
|
| 1278 |
-
"final_message": "[WIN] TP=10 FP=1 FN=0 ...",
|
| 1279 |
-
"n_reflections_used": 4,
|
| 1280 |
-
"had_few_shot": true,
|
| 1281 |
-
"alpha_used": 0.52,
|
| 1282 |
-
"mode_agree": 13,
|
| 1283 |
-
"mode_rule": 6,
|
| 1284 |
-
"mode_llm": 4,
|
| 1285 |
-
"timestamp": "2026-04-01T10:23:41"
|
| 1286 |
-
}
|
| 1287 |
-
```
|
| 1288 |
|
| 1289 |
---
|
| 1290 |
|
| 1291 |
-
##
|
| 1292 |
|
| 1293 |
**File:** `server/app.py`
|
| 1294 |
|
| 1295 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1296 |
|
| 1297 |
-
|
| 1298 |
-
{"status": "healthy"}
|
| 1299 |
-
```
|
| 1300 |
|
| 1301 |
-
|
| 1302 |
-
|
| 1303 |
-
|
| 1304 |
-
|
| 1305 |
-
|
| 1306 |
-
"descriptions": {
|
| 1307 |
-
"easy": "50 accounts, 10 fakes, no evasion, 30 steps",
|
| 1308 |
-
"medium": "200 accounts, 10 fakes + 20 decoys, evasion at step 20, 50 steps",
|
| 1309 |
-
"hard": "1000 accounts, 10 fakes + 50 decoys, recurring evasion, 80 steps"
|
| 1310 |
-
},
|
| 1311 |
-
"action_schema": {
|
| 1312 |
-
"action_type": ["inspect", "investigate_network", "flag", "unflag", "submit"],
|
| 1313 |
-
"account_id": "string (required for all actions except submit)"
|
| 1314 |
-
},
|
| 1315 |
-
"score_range": [0.0, 1.0]
|
| 1316 |
-
}
|
| 1317 |
-
```
|
| 1318 |
-
|
| 1319 |
-
### POST /reset
|
| 1320 |
-
|
| 1321 |
-
Request:
|
| 1322 |
-
|
| 1323 |
-
```json
|
| 1324 |
-
{"task": "easy", "seed": 0}
|
| 1325 |
-
```
|
| 1326 |
-
|
| 1327 |
-
Response: `StepResponse` with initial observation.
|
| 1328 |
-
|
| 1329 |
-
### POST /step
|
| 1330 |
-
|
| 1331 |
-
Request: Any `FakeGangAction`:
|
| 1332 |
-
|
| 1333 |
-
```json
|
| 1334 |
-
{"action_type": "inspect", "account_id": "acc_0042"}
|
| 1335 |
-
{"action_type": "flag", "account_id": "acc_0017"}
|
| 1336 |
-
{"action_type": "submit"}
|
| 1337 |
-
```
|
| 1338 |
-
|
| 1339 |
-
Response: `StepResponse` with updated observation, done flag, and reward.
|
| 1340 |
-
|
| 1341 |
-
### GET /state
|
| 1342 |
-
|
| 1343 |
-
Returns current episode metadata:
|
| 1344 |
-
|
| 1345 |
-
```json
|
| 1346 |
-
{
|
| 1347 |
-
"episode_id": "uuid",
|
| 1348 |
-
"step_count": 12,
|
| 1349 |
-
"task": "easy",
|
| 1350 |
-
"score_so_far": -0.12,
|
| 1351 |
-
"evasion_count": 0,
|
| 1352 |
-
"network_size": 50,
|
| 1353 |
-
"gang_size": 10,
|
| 1354 |
-
"episode_seed": 0
|
| 1355 |
-
}
|
| 1356 |
-
```
|
| 1357 |
-
|
| 1358 |
-
### GET /grader
|
| 1359 |
-
|
| 1360 |
-
Returns the normalised grader score after SUBMIT. Error 400 if episode not done.
|
| 1361 |
-
|
| 1362 |
-
```json
|
| 1363 |
-
{"score": 0.91, "task": "easy", "episode_id": "uuid"}
|
| 1364 |
-
```
|
| 1365 |
-
|
| 1366 |
-
### POST /baseline
|
| 1367 |
-
|
| 1368 |
-
Runs the rule-based agent on all three tasks (seed=0) and returns scores:
|
| 1369 |
-
|
| 1370 |
-
```json
|
| 1371 |
-
{
|
| 1372 |
-
"scores": {"easy": 0.91, "medium": 0.906, "hard": 0.9038},
|
| 1373 |
-
"agent": "rule_based"
|
| 1374 |
-
}
|
| 1375 |
-
```
|
| 1376 |
-
|
| 1377 |
-
**Baseline performance across 50 seeds:**
|
| 1378 |
-
|
| 1379 |
-
| Task | Seed=0 score | Win rate (50 seeds) | Mean score (50 seeds) |
|
| 1380 |
-
| ------ | ------------ | ------------------- | --------------------- |
|
| 1381 |
-
| easy | 0.91 | 100% | ~0.91 |
|
| 1382 |
-
| medium | 0.906 | 84% | ~0.77 |
|
| 1383 |
-
| hard | 0.9038 | 52% | ~0.47 |
|
| 1384 |
-
|
| 1385 |
-
The baseline is a deterministic rule-based agent — no LLM, no learning. The
|
| 1386 |
-
difficulty scaling is designed so that easy is consistently solvable, medium
|
| 1387 |
-
requires some luck, and hard genuinely challenges frontier LLM agents via
|
| 1388 |
-
evasion events that destroy graph signals mid-investigation.
|
| 1389 |
|
| 1390 |
---
|
| 1391 |
|
| 1392 |
-
##
|
| 1393 |
-
|
| 1394 |
-
**File:** `server/Dockerfile`
|
| 1395 |
-
|
| 1396 |
-
### 14.1 Build
|
| 1397 |
|
| 1398 |
```bash
|
| 1399 |
-
|
| 1400 |
docker build -f server/Dockerfile -t graphstrike .
|
| 1401 |
-
```
|
| 1402 |
-
|
| 1403 |
-
Build takes ~10 seconds because:
|
| 1404 |
-
|
| 1405 |
-
- The `.dockerignore` excludes `episodes/` (109 MB), `memory/`, `runs/`
|
| 1406 |
-
- Python wheels are pre-downloaded to `wheels/` — no network access during `pip install`
|
| 1407 |
-
- No `apt-get` installs needed (everything is pure Python)
|
| 1408 |
-
|
| 1409 |
-
### 14.2 Run
|
| 1410 |
|
| 1411 |
-
|
| 1412 |
docker run -it \
|
| 1413 |
-e AWS_ACCESS_KEY_ID=your_key \
|
| 1414 |
-e AWS_SECRET_ACCESS_KEY=your_secret \
|
|
@@ -1418,239 +456,67 @@ docker run -it \
|
|
| 1418 |
graphstrike
|
| 1419 |
```
|
| 1420 |
|
| 1421 |
-
The volumes preserve all learning between
|
| 1422 |
-
the agent continues from where it left off (α values, reflections, best trajectories).
|
| 1423 |
|
| 1424 |
-
###
|
| 1425 |
|
| 1426 |
-
| Variable
|
| 1427 |
-
|
|
| 1428 |
-
| `AWS_ACCESS_KEY_ID`
|
| 1429 |
-
| `AWS_SECRET_ACCESS_KEY` | (required)
|
| 1430 |
-
| `AWS_DEFAULT_REGION`
|
| 1431 |
-
| `TRAIN_TASK`
|
| 1432 |
-
| `TRAIN_EPISODES`
|
| 1433 |
-
| `TRAIN_TEMP`
|
| 1434 |
-
| `TRAIN_VERBOSE`
|
| 1435 |
-
| `SERVER_PORT`
|
| 1436 |
|
| 1437 |
-
###
|
| 1438 |
|
| 1439 |
```
|
| 1440 |
-
1. Validate AWS credentials
|
| 1441 |
-
2. python server/generator.py → generates
|
| 1442 |
3. uvicorn server.app:app → starts the environment server
|
| 1443 |
-
4.
|
| 1444 |
5. python train.py → runs the full training loop
|
| 1445 |
```
|
| 1446 |
|
| 1447 |
---
|
| 1448 |
|
| 1449 |
-
## 15. Submission Requirements
|
| 1450 |
-
|
| 1451 |
-
All submission requirements are satisfied. The environment is deployed at
|
| 1452 |
-
[huggingface.co/spaces/Pandago/graphstrike](https://huggingface.co/spaces/Pandago/graphstrike).
|
| 1453 |
-
|
| 1454 |
-
### 15.1 Required Endpoints
|
| 1455 |
-
|
| 1456 |
-
| Endpoint | Method | Status | Description |
|
| 1457 |
-
| ------------- | ------ | ------ | -------------------------------------------------------- |
|
| 1458 |
-
| `/health` | GET | ✅ | Returns `{"status": "healthy"}` |
|
| 1459 |
-
| `/tasks`      | GET    | ✅     | 3 tasks + `action_schema` + `score_range: [0.0, 1.0]`  |
|
| 1460 |
-
| `/reset` | POST | ✅ | Accepts `{task, seed}`, returns initial observation |
|
| 1461 |
-
| `/step` | POST | ✅ | Accepts any valid action, returns updated observation |
|
| 1462 |
-
| `/state` | GET | ✅ | Returns episode metadata (step count, task, score) |
|
| 1463 |
-
| `/grader` | GET | ✅ | Returns normalised [0.0, 1.0] score after SUBMIT |
|
| 1464 |
-
| `/baseline` | POST | ✅ | Runs rule-based agent on all 3 tasks, returns scores |
|
| 1465 |
-
|
| 1466 |
-
### 15.2 /tasks with action_schema
|
| 1467 |
-
|
| 1468 |
-
The `/tasks` endpoint returns the `action_schema` dict listing all valid
|
| 1469 |
-
`action_type` values and the `account_id` field description. Graders can
|
| 1470 |
-
discover the full action space without reading code.
|
| 1471 |
-
|
| 1472 |
-
### 15.3 /grader — Normalised Scoring
|
| 1473 |
-
|
| 1474 |
-
After calling `SUBMIT` (via `/step`), call `GET /grader` to retrieve the
|
| 1475 |
-
normalised [0.0, 1.0] grader score. Returns 400 if the episode is not yet done.
|
| 1476 |
-
|
| 1477 |
-
The score formula (see §7.8) rewards recall, precision, and efficiency.
|
| 1478 |
-
Maximum score 1.0 requires finding all 10 gang members with no false positives
|
| 1479 |
-
and using no steps. The grader is **deterministic** — same actions produce same score.
|
| 1480 |
-
|
| 1481 |
-
### 15.4 /baseline — Reproducible Baseline Agent
|
| 1482 |
-
|
| 1483 |
-
`POST /baseline` imports `inference.py`'s `run_rule_based_episode` and runs it
|
| 1484 |
-
on all three tasks with seed=0. Returns:
|
| 1485 |
-
|
| 1486 |
-
```json
|
| 1487 |
-
{"scores": {"easy": 0.91, "medium": 0.906, "hard": 0.9038}, "agent": "rule_based"}
|
| 1488 |
-
```
|
| 1489 |
-
|
| 1490 |
-
**Reproducibility:** The baseline is fully deterministic — no randomness, no LLM calls.
|
| 1491 |
-
Calling `/baseline` 3+ times in succession produces **identical scores** every time.
|
| 1492 |
-
The evasion flags (`_fired_*` attributes) are properly cleared on `reset()`,
|
| 1493 |
-
ensuring episodes replay identically across runs.
|
| 1494 |
-
|
| 1495 |
-
### 15.5 inference.py
|
| 1496 |
-
|
| 1497 |
-
**Library mode** (used by `/baseline`):
|
| 1498 |
-
|
| 1499 |
-
```python
|
| 1500 |
-
from inference import run_rule_based_episode
|
| 1501 |
-
score = run_rule_based_episode(env, task="easy", seed=0)
|
| 1502 |
-
# Returns float in [0.0, 1.0]
|
| 1503 |
-
```
|
| 1504 |
-
|
| 1505 |
-
**CLI mode** (connect to running server):
|
| 1506 |
-
|
| 1507 |
-
```bash
|
| 1508 |
-
python inference.py --url http://localhost:8000
|
| 1509 |
-
# → {"scores": {"easy": 0.91, "medium": 0.906, "hard": 0.9038}, "agent": "rule_based"}
|
| 1510 |
-
```
|
| 1511 |
-
|
| 1512 |
-
**CLI mode** (local, no server needed):
|
| 1513 |
-
|
| 1514 |
-
```bash
|
| 1515 |
-
python inference.py --local
|
| 1516 |
-
```
|
| 1517 |
-
|
| 1518 |
-
The rule-based strategy:
|
| 1519 |
-
|
| 1520 |
-
1. If SUSPECT accounts are uninspected → INSPECT highest suspect
|
| 1521 |
-
2. If any inspected account has `fake_risk_score ≥ threshold` and not flagged → FLAG it
|
| 1522 |
-
3. If no immediate flag or suspect → INSPECT highest-risk uninspected account
|
| 1523 |
-
4. If steps ≤ 3 or 10 flags placed → SUBMIT
|
| 1524 |
-
|
| 1525 |
-
Thresholds by task: easy=0.60, medium=0.50, hard=0.45.
|
| 1526 |
-
|
| 1527 |
-
### 15.6 validate.py — 24-Point Pre-Submission Validator
|
| 1528 |
-
|
| 1529 |
-
Runs 24 checks split between local (no server) and HTTP:
|
| 1530 |
-
|
| 1531 |
-
```bash
|
| 1532 |
-
python validate.py --local # 9 local checks only
|
| 1533 |
-
python validate.py --url http://... # all 24 checks (requires running server)
|
| 1534 |
-
```
|
| 1535 |
-
|
| 1536 |
-
Checks include:
|
| 1537 |
-
|
| 1538 |
-
- scoring.py math correctness (gang risk ≥ 0.60, celebrity risk < 0.20, perfect score = 1.00)
|
| 1539 |
-
- models.py has all new fields (fake_risk_score, suspect_ids, AccountStatus)
|
| 1540 |
-
- environment.py SUSPECT cascade triggers after FLAG
|
| 1541 |
-
- inference.py runs without error and returns [0,1] float
|
| 1542 |
-
- episodes have new features (comment_repeat_score, shared_ip_count, celeb_ids)
|
| 1543 |
-
- /health reachable
|
| 1544 |
-
- /tasks has action_schema and score_range
|
| 1545 |
-
- /reset works for all three tasks
|
| 1546 |
-
- /step supports INSPECT, FLAG, SUBMIT
|
| 1547 |
-
- /grader returns [0,1] float after SUBMIT
|
| 1548 |
-
- /baseline returns 3 valid scores
|
| 1549 |
-
|
| 1550 |
-
**All 24/24 checks pass.**
|
| 1551 |
-
|
| 1552 |
-
### 15.7 Judging Criteria Alignment
|
| 1553 |
-
|
| 1554 |
-
| Criterion | Weight | How GraphStrike addresses it |
|
| 1555 |
-
| ---------------------------- | ------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| 1556 |
-
| **Domain quality** | 30% | Real-world fraud detection domain; signals modelled on actual Instagram fake-account patterns (IP clustering, photo reuse, bio templates, temporal coordination) |
|
| 1557 |
-
| **Task & grader** | 25% | 3 difficulty tiers with clear win conditions; grader formula rewards recall, precision, and efficiency; partial credit for incomplete investigations |
|
| 1558 |
-
| **Environment design** | 20% | Bidirectional graph, dual cascade (follow + IP), evasion events that destroy signals mid-investigation, decoy accounts that penalise reckless flagging |
|
| 1559 |
-
| **Code quality** | 15% | Typed Pydantic models, stateless scoring functions, 24-point validator, deterministic episode generation by seed |
|
| 1560 |
-
| **Creativity** | 10% | Hybrid rule/LLM policy with dynamic α caps, Reflexion-based learning without fine-tuning, IP cluster cascade as evasion-resistant signal |
|
| 1561 |
-
|
| 1562 |
-
---
|
| 1563 |
-
|
| 1564 |
-
## 16. Verification & Validation
|
| 1565 |
-
|
| 1566 |
-
### Quick smoke test
|
| 1567 |
-
|
| 1568 |
-
```bash
|
| 1569 |
-
cd fake_gang_env
|
| 1570 |
-
|
| 1571 |
-
# Test scoring math
|
| 1572 |
-
python3 -c "
|
| 1573 |
-
import sys; sys.path.insert(0,'server')
|
| 1574 |
-
from scoring import compute_fake_risk, compute_hub_legitimacy, grader_score
|
| 1575 |
-
|
| 1576 |
-
gang_r = compute_fake_risk(0.75, 0.65, 0.85, 0.10)
|
| 1577 |
-
hub = compute_hub_legitimacy(2_000_000, 200, 2000, 0.05)
|
| 1578 |
-
celeb = compute_fake_risk(0.02, 0.02, 0.10, hub)
|
| 1579 |
-
assert gang_r >= 0.60, f'Gang risk too low: {gang_r}'
|
| 1580 |
-
assert celeb < 0.20, f'Celebrity risk too high: {celeb}'
|
| 1581 |
-
assert grader_score(10, 0, 0, 0, 30) == 1.0
|
| 1582 |
-
print(f'Gang risk={gang_r} Celeb risk={celeb} Perfect score=1.0 OK')
|
| 1583 |
-
"
|
| 1584 |
-
|
| 1585 |
-
# Test hybrid policy + cascade
|
| 1586 |
-
python3 -c "
|
| 1587 |
-
import sys, json; sys.path.insert(0,'server')
|
| 1588 |
-
from models import FakeGangAction, ActionType
|
| 1589 |
-
from environment import FakeGangEnvironment
|
| 1590 |
-
from agent.hybrid_policy import get_rule_action, compute_alpha
|
| 1591 |
-
|
| 1592 |
-
env = FakeGangEnvironment()
|
| 1593 |
-
obs = env.reset(task='easy', seed=0)
|
| 1594 |
-
gang = json.loads(open('episodes/easy_000.json').read())['gang_member_ids']
|
| 1595 |
-
obs = env.step(FakeGangAction(action_type=ActionType.INSPECT, account_id=gang[0]))
|
| 1596 |
-
obs = env.step(FakeGangAction(action_type=ActionType.FLAG, account_id=gang[0]))
|
| 1597 |
-
assert len(obs.suspect_ids) > 0, 'Cascade failed'
|
| 1598 |
-
action, conf = get_rule_action(obs)
|
| 1599 |
-
assert action.account_id in obs.suspect_ids, 'Rule not prioritising suspects'
|
| 1600 |
-
print(f'Cascade OK: {len(obs.suspect_ids)} suspects. Rule → INSPECT {action.account_id} (conf={conf:.2f})')
|
| 1601 |
-
a0 = compute_alpha(0, 0, 'easy')
|
| 1602 |
-
a1 = compute_alpha(0.5, 2, 'easy')
|
| 1603 |
-
a2 = compute_alpha(1.0, 4, 'easy')
|
| 1604 |
-
print(f'Alpha (easy, cap=0.50): min={a0} mid={a1} max={a2}')
|
| 1605 |
-
"
|
| 1606 |
-
|
| 1607 |
-
# Full local validate
|
| 1608 |
-
python3 validate.py --local
|
| 1609 |
-
```
|
| 1610 |
|
| 1611 |
-
### Full HTTP validation
|
| 1612 |
|
| 1613 |
```bash
|
| 1614 |
python3 -m uvicorn server.app:app --port 8001 &
|
| 1615 |
sleep 3
|
| 1616 |
python3 validate.py --url http://localhost:8001
|
|
|
|
| 1617 |
```
|
| 1618 |
|
| 1619 |
-
Expected output: `Results: 24/24 passed — all OK`
|
| 1620 |
-
|
| 1621 |
### Deployed Endpoint Verification
|
| 1622 |
|
| 1623 |
-
The live environment at [huggingface.co/spaces/Pandago/graphstrike](https://huggingface.co/spaces/Pandago/graphstrike)
|
| 1624 |
-
responds to all standard OpenEnv endpoints:
|
| 1625 |
-
|
| 1626 |
```bash
|
| 1627 |
-
# Health check
|
| 1628 |
curl https://pandago-graphstrike.hf.space/health
|
| 1629 |
# → {"status": "healthy"}
|
| 1630 |
|
| 1631 |
-
# Task discovery
|
| 1632 |
curl https://pandago-graphstrike.hf.space/tasks
|
| 1633 |
# → {"tasks": ["easy","medium","hard"], "action_schema": {...}, "score_range": [0.0, 1.0]}
|
| 1634 |
|
| 1635 |
-
# Baseline (deterministic, reproducible)
|
| 1636 |
curl -X POST https://pandago-graphstrike.hf.space/baseline
|
| 1637 |
# → {"scores": {"easy": 0.91, "medium": 0.906, "hard": 0.9038}, "agent": "rule_based"}
|
| 1638 |
```
|
| 1639 |
|
| 1640 |
---
|
| 1641 |
|
| 1642 |
-
|
| 1643 |
-
|
| 1644 |

|
| 1645 |
|
| 1646 |
## Developed with ❤️ by Team ComputeXOR
|
| 1647 |
|
| 1648 |
-
|
| 1649 |
### {
|
| 1650 |
|
| 1651 |
-
### [Sai Nivedh](https://github.com/SaiNivedh26) ,
|
| 1652 |
|
| 1653 |
-
### [
|
| 1654 |
|
| 1655 |
### [Sajeev](https://github.com/SajeevSenthil)
|
| 1656 |
|
|
|
|
| 15 |
- llm-agent
|
| 16 |
base_path: /web
|
| 17 |
---
|
| 18 |
+
<br>
|
| 19 |
|
| 20 |
+
<p align="center">
|
| 21 |
+
<img src="assets/logo.png" width="600"/>
|
| 22 |
+
</p>
|
| 23 |
|
| 24 |
+
<br>
|
|
|
|
| 25 |
|
| 26 |
+
<p align="center">
|
| 27 |
+
<img src="https://img.shields.io/badge/Hugging%20Face-FFD21E?style=for-the-badge&logo=huggingface&logoColor=black"/>
|
| 28 |
+
<img src="https://img.shields.io/badge/HF%20Spaces-FFBF00?style=for-the-badge&logo=huggingface&logoColor=black"/>
|
| 29 |
+
<img src="https://img.shields.io/badge/FastAPI-009688?style=for-the-badge&logo=fastapi&logoColor=white"/>
|
| 30 |
+
<img src="https://img.shields.io/badge/Docker-2496ED?style=for-the-badge&logo=docker&logoColor=white"/>
|
| 31 |
+
<img src="https://img.shields.io/badge/Gradio-F97316?style=for-the-badge&logo=gradio&logoColor=white"/>
|
| 32 |
+
<img src="https://img.shields.io/badge/OpenEnv-4B5563?style=for-the-badge&logo=envato&logoColor=white"/>
|
| 33 |
+
<img src="https://img.shields.io/badge/Amazon%20Bedrock-FF9900?style=for-the-badge&logo=amazonaws&logoColor=white"/>
|
| 34 |
+
</p>
|
| 35 |
+
<br>
|
| 36 |
|
| 37 |
+
<h1 align="center">
|
| 38 |
+
</h1>
|
| 39 |
+
<p align="center">
|
| 40 |
+
An OpenEnv-compatible reinforcement learning environment where an LLM agent must identify all 10 members of a coordinated fake account network hidden inside a synthetic social network. The agent learns via Reflexion and a dynamic hybrid rule/LLM policy, not via gradient updates or fine-tuning.
|
| 41 |
+
<br />
|
| 42 |
+
</p>
|
| 43 |
+
</p>
|
| 44 |
|
| 45 |
+
<br>
|
| 46 |
|
| 47 |
+
## Theme
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
+
**SUPPORT**
|
| 50 |
|
| 51 |
+
### Customer Service Agents
|
| 52 |
|
| 53 |
+
Complex environment where agents resolve multi-step queries using external tools and APIs.
|
|
|
|
|
|
|
| 54 |
|
| 55 |
+
## Problem Statement
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
+
**The task:** A social network contains fake accounts organised into a single coordinated ring of 10. The ring behaves in a coordinated way — same posting hour, same IP subnet, stolen celebrity photos, copy-paste bios. The agent must find all 10 by navigating a limited step budget, inspecting accounts, and flagging suspects.
|
| 58 |
|
| 59 |
+
## Proposed Solution
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
+
An OpenEnv-compatible reinforcement learning environment where an LLM agent must identify all 10 members of a coordinated fake account ring hidden inside a synthetic social network. The agent learns via **Reflexion** and a **dynamic hybrid rule/LLM policy** — not via gradient updates or fine-tuning.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
---
|
| 64 |
+
## Novelty Highlights
|
| 65 |
+
|
| 66 |
+
- **Adaptive Hybrid Intelligence (Rules + LLM):** Unlike static ensembles, GraphStrike dynamically blends deterministic rules and LLM reasoning using a trust gate, shifting control as performance improves.
|
| 67 |
+
- **Learning Without Fine-Tuning:** Instead of updating model weights, the agent learns through Reflexion lessons and best-trajectory memory injected into future prompts.
|
| 68 |
+
- **Graph-First Detection Pipeline:** Detection is not account-by-account only; it uses cascade effects, neighbor propagation, and multi-hop graph expansion to uncover coordinated rings.
|
| 69 |
+
- **Math-Grounded Decision Control:** Risk composition, trust calibration, and grader alignment are formula-driven, making behavior interpretable and reproducible.
|
| 70 |
+
- **Adversarial Evasion Benchmarking:** Hard-mode includes timed evasion events, so success reflects robustness under disruption rather than overfitting to static patterns.
|
| 71 |
+
- **Safety-Net by Design:** High-confidence rule overrides prevent catastrophic LLM errors while preserving LLM flexibility for strategic exploration.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
---
|
| 73 |
|
| 74 |
+
## Performance Summary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
+
We evaluate GraphStrike's hybrid rule/LLM policy across multiple frontier models to measure how well each model handles the investigation task. All runs use
|
| 77 |
+
the same inference pipeline (`inference.py`) with identical system prompts and structured logging. Each model ran: (1) seed=0 on all 3 tasks, and
|
| 78 |
+
(2) seeds 0-2 on all 3 tasks for variance measurement.
|
|
|
|
|
|
|
| 79 |
|
| 80 |
+
**Seed=0 scores (single episode per task):**
|
|
|
|
| 81 |
|
| 82 |
+
<p align="center">
|
| 83 |
+
<img src="images/table1.png" alt="Model Performance Table" width="1600"/>
|
| 84 |
+
</p>
|
| 85 |
+
<br>
|
| 86 |
|
| 87 |
+
**3-seed variance scores (mean across seeds 0, 1, 2):**
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
+
<p align="center">
|
| 90 |
+
<img src="images/table2.png" alt="Model Performance Table" width="1600"/>
|
| 91 |
+
</p>
|
| 92 |
+
<br>
|
| 93 |
|
| 94 |
+
**Rule-Based Baseline (no LLM, deterministic)**
|
|
|
|
| 95 |
|
| 96 |
+
<p align="center">
|
| 97 |
+
<img src="images/table3.png" alt="Model Performance Table" width="1600"/>
|
| 98 |
+
</p>
|
| 99 |
+
<br>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
---
|
| 102 |
+
## Table of Contents
|
| 103 |
|
| 104 |
+
1. [What This Is](#1-what-this-is)
|
| 105 |
+
2. [The Problem: How Fake Detection Actually Works](#2-the-problem-how-fake-detection-actually-works)
|
| 106 |
+
3. [Synthetic Data Generation](#3-synthetic-data-generation)
|
| 107 |
+
4. [Data Model](#4-data-model)
|
| 108 |
+
5. [The RL Environment](#5-the-rl-environment)
|
| 109 |
+
6. [Risk Scoring Mathematics](#6-risk-scoring-mathematics)
|
| 110 |
+
8. [The LLM Policy (Qwen3 via Bedrock)](#8-the-llm-policy-qwen3-via-bedrock)
|
| 111 |
+
9. [Reflexion — How the Agent Learns](#9-reflexion--how-the-agent-learns)
|
| 112 |
+
10. [Hybrid Policy — The Novel Contribution](#10-hybrid-policy--the-novel-contribution)
|
| 113 |
+
11. [Training Loop End-to-End](#11-training-loop-end-to-end)
|
| 114 |
+
12. [API Reference](#12-api-reference)
|
| 115 |
+
13. [Docker Deployment](#13-docker-deployment)
|
| 116 |
+
14. [Submission Requirements](#14-submission-requirements)
|
| 117 |
+
15. [Verification & Validation](#15-verification--validation)
|
| 118 |
|
| 119 |
+
---
|
| 120 |
|
| 121 |
+
## 1. What This Is
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
+
This is an **OpenEnv hackathon** submission. OpenEnv is a framework for building RL environments with a standard microservice interface (`/reset`, `/step`, `/state`) so that any agent implementation can plug in.
|
| 124 |
|
| 125 |
+
**What makes this non-trivial:**
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
+
- The network is large (50–1000 accounts depending on difficulty).
|
| 128 |
+
- Fake accounts are mixed with innocent high-signal "decoy" accounts.
|
| 129 |
+
- In hard mode, the gang actively evades — dropping intra-gang follows, renaming profiles — while the agent is mid-investigation.
|
| 130 |
+
- The agent cannot see the full network upfront: it must explore via INSPECT and INVESTIGATE_NETWORK actions, spending steps to reveal information.
|
| 131 |
|
| 132 |
+
**What makes the learning novel:**
|
|
|
|
|
|
|
| 133 |
|
| 134 |
+
- The LLM (inference via AWS Bedrock) cannot be fine-tuned — it is a black-box API.
|
| 135 |
+
- The agent learns via **Reflexion**: post-episode lessons are written back into memory and injected into every future prompt.
|
| 136 |
+
- A **dynamic hybrid policy** (α-weighted) blends the LLM with a deterministic rule engine, with the blend weight α updating based on recent win rate. Rules dominate early; the LLM takes over as it proves itself.
|
| 137 |
|
| 138 |
+
### System Architecture
|
|
|
|
|
|
|
| 139 |
|
| 140 |
+

|
| 141 |
|
| 142 |
+
---
|
|
|
|
|
|
|
| 143 |
|
| 144 |
+
## 2. The Problem: How Fake Detection Actually Works
|
|
|
|
|
|
|
| 145 |
|
| 146 |
+
A real-world fake account detector does **not** read post content. Detection relies on three categories of signals computed from metadata:
|
| 147 |
|
| 148 |
+
### Signal Hierarchy (Node -> Behavioral -> Graph)
|
| 149 |
|
| 150 |
+

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
+
- **Node signals (offline):** content fingerprints like photo reuse, bio-template similarity, and comment repetition provide the first suspicion layer.
|
| 153 |
+
- **Behavioral signals (temporal/device):** coordinated posting hour, account-age clustering, and shared IP subnet add stronger gang-level evidence.
|
| 154 |
+
- **Graph signals (live at INSPECT):** mutual follows, flagged-neighbor growth, and cluster alignment are hardest to evade, so they carry the highest weight in risk scoring.
|
| 155 |
+
- **False-positive control:** high-legitimacy hubs (for example celebrities) are down-weighted through hub-legitimacy discounting.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
---
|
| 158 |
|
| 159 |
+
## 3. Synthetic Data Generation
|
| 160 |
|
| 161 |
+
**File:** `server/generator.py`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
+
Episodes are generated deterministically by seed. 150 episodes are pre-generated (50 per task) and cached as JSON files in `episodes/`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
|
| 165 |
+
### Network Composition
|
|
|
|
| 166 |
|
| 167 |
+
| Task | Network size | Gang | Decoys | Real | Max steps |
|
| 168 |
+
|---|---|---|---|---|---|
|
| 169 |
+
| easy | 50 | 10 | 0 | 40 | 30 |
|
| 170 |
+
| medium | 200 | 10 | 20 | 170 | 50 |
|
| 171 |
+
| hard | 1000 | 10 | 50 | 940 | 80 |
|
| 172 |
|
| 173 |
+
- **Gang accounts:** All 10 share `base_age` (same creation week), tightly clustered `avg_post_hour`, high `photo_reuse_score`/`bio_template_score`, `comment_repeat_score` in [0.60, 0.90], `ip_cluster_id = "ip_gang_{seed}"`, and dense intra-gang follow edges (density 0.60–0.80).
|
| 174 |
+
- **Real accounts:** Log-normal follower distributions, unique IP clusters, low fake scores.
|
| 175 |
+
- **Decoy accounts** (medium/hard): Real accounts with elevated fraud scores (0.20–0.40 range) — they look suspicious but are NOT gang members and penalise reckless flagging.
|
| 176 |
+
- **Celebrity accounts** (2 per episode): 100k–5M followers, very low fake scores, high `hub_legitimacy_score`.
|
| 177 |
+
- **Zero-edge isolates** (2 per episode): No edges — test whether the agent wastes steps on disconnected nodes.
|
| 178 |
|
| 179 |
+
---
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
+
## 4. Data Model
|
|
|
|
|
|
|
| 182 |
|
| 183 |
+
**File:** `models.py`
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
+
### ActionType
|
|
|
|
| 186 |
|
| 187 |
+
| Value | Cost | Effect |
|
| 188 |
+
|---|---|---|
|
| 189 |
+
| `inspect` | 1 step | Reveals full `AccountProfile` + follow list |
|
| 190 |
+
| `investigate_network` | 2 steps | Expands 2 hops; reveals account IDs only |
|
| 191 |
+
| `flag` | 0 steps | Marks account as gang member; triggers SUSPECT cascade |
|
| 192 |
+
| `unflag` | 0 steps | Removes flag; clears CONFIRMED_FAKE status |
|
| 193 |
+
| `submit` | 0 steps | Ends episode; triggers scoring |
|
| 194 |
|
| 195 |
+
### AccountProfile — key fields
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
+
| Category | Fields |
|
| 198 |
+
|---|---|
|
| 199 |
+
| Raw counts | `follower_count`, `following_count`, `post_count` |
|
| 200 |
+
| Temporal | `avg_post_hour`, `account_age_days` |
|
| 201 |
+
| Content pipeline (0–1) | `photo_reuse_score`, `bio_template_score`, `comment_repeat_score` |
|
| 202 |
+
| IP/device | `shared_ip_count`, `ip_cluster_id` |
|
| 203 |
+
| Graph (live at INSPECT) | `mutual_follow_rate`, `flagged_neighbor_count`, `avg_neighbor_photo_reuse`, `post_hour_cluster_score` |
|
| 204 |
+
| Risk breakdown | `fake_risk_score`, `node_risk`, `behavior_risk`, `graph_risk`, `hub_legitimacy_score` |
|
| 205 |
+
| Evasion/status | `name_change_count`, `status` (NORMAL/SUSPECT/CONFIRMED_FAKE) |
|
| 206 |
|
| 207 |
+
### FakeGangObservation — what the agent sees each step
|
|
|
|
|
|
|
|
|
|
| 208 |
|
| 209 |
+
`done`, `reward`, `visible_accounts`, `visible_account_ids`, `flagged_ids`, `inspected_ids`, `suspect_ids`, `graph_edges`, `steps_remaining`, `evasion_triggered`, `evasion_count`, `task`, `message`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
---
|
| 212 |
|
| 213 |
+
## 5. The RL Environment
|
| 214 |
|
| 215 |
**File:** `server/environment.py`
|
| 216 |
|
| 217 |
+
### Episode Lifecycle & Action Mechanics
|
| 218 |
|
| 219 |
+

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
+
**FLAG cascade (dual):** When FLAG(X) is called — (1) every visible account that X follows becomes SUSPECT via the follow-graph, and (2) every visible account sharing X's `ip_cluster_id` becomes SUSPECT. Gang members share `ip_gang_{seed}`; real accounts have unique IPs → zero false positives.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
+
### Reward Function
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
base_reward = tp×1.0 − fp×0.5 − fn×0.3
|
| 227 |
|
| 228 |
+
Win condition:
|
| 229 |
easy/medium: recall ≥ 0.8 AND precision ≥ 0.7
|
| 230 |
hard: recall ≥ 0.9 AND precision ≥ 0.8
|
| 231 |
|
| 232 |
+
Bonuses:
|
| 233 |
+
+5.0 full win
|
| 234 |
+
+3.0 perfect recall
|
| 235 |
+
+2.0 partial win (high recall, low precision)
|
| 236 |
+
+1.0 efficiency (SUBMIT with ≥50% steps remaining)
|
| 237 |
+
−1.0 per evasion event (hard mode)
|
| 238 |
+
−2.0 forced submit (ran out of steps)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
```
|
| 240 |
|
| 241 |
+
### Evasion (hard mode)
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
+
- **`unfollow_intragang`:** 30% of intra-gang edges randomly removed mid-investigation — destroys graph signal. Fires 4 times (steps 15, 30, 45, 60).
|
| 244 |
+
- **`rename_count`:** Random gang members get `name_change_count += 1` — a visual evasion signal.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
|
| 246 |
---
|
| 247 |
|
| 248 |
+
## 6. Risk Scoring Mathematics
|
|
|
|
|
|
|
| 249 |
|
| 250 |
+
**File:** `server/scoring.py` — all functions are stateless and deterministic.
|
|
|
|
|
|
|
| 251 |
|
| 252 |
+

|
| 253 |
|
| 254 |
+

|
| 255 |
|
| 256 |
+

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
|
| 258 |
---
|
| 259 |
|
| 260 |
+
## 8. The LLM Policy (Qwen3 via Bedrock)
|
| 261 |
|
| 262 |
**File:** `agent/policy.py`
|
| 263 |
|
| 264 |
+
**Model:** `qwen.qwen3-next-80b-a3b` via AWS Bedrock Converse API (`maxTokens=512, temperature=0.4`)
|
| 265 |
|
| 266 |
+
### Prompt Structure
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
Every step, the policy builds a prompt from three components:
|
| 269 |
|
|
|
|
| 275 |
What is your next action?
|
| 276 |
```
|
| 277 |
|
| 278 |
+
Accounts in the observation are **sorted by `fake_risk_score` descending**, with status badges prepended. `fnbr=N(!)` highlights when `flagged_neighbor_count > 0`; `[HUB?]` warns the LLM not to flag high-legitimacy accounts.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
|
| 280 |
+
### Required Response Format
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
```xml
|
| 283 |
<thinking>
|
| 284 |
+
Reasoning — which account is most suspicious and why.
|
|
|
|
| 285 |
</thinking>
|
| 286 |
<action>
|
| 287 |
INSPECT acc_0041
|
| 288 |
</action>
|
| 289 |
```
|
| 290 |
|
| 291 |
+
If parsing fails, a heuristic fallback inspects the highest-scored uninspected account. Retries use exponential backoff (1s, 2s, 4s) up to 3 attempts.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
---
|
| 294 |
|
| 295 |
+
## 9. Reflexion — How the Agent Learns
|
| 296 |
|
| 297 |
**Files:** `agent/reflection.py`, `agent/memory.py`
|
| 298 |
|
| 299 |
+
The agent **cannot** update Qwen3's weights — Bedrock is a black-box API. Instead, it learns via **Reflexion**: post-episode lessons are written as text and injected into future prompts.
|
|
|
|
|
|
|
| 300 |
|
| 301 |
+
### Reflexion Learning Loop
|
| 302 |
|
| 303 |
+

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
|
|
|
|
| 305 |
|
|
|
|
| 306 |
|
| 307 |
```
|
| 308 |
+
Episode N:
|
| 309 |
+
1. LLM acts using: system_prompt + reflections[last 4] + best_trajectory
|
| 310 |
+
2. Episode ends → WIN or LOSS
|
| 311 |
+
3. Post-episode:
|
| 312 |
+
LOSS → generate_reflection(action_log, outcome) → lesson stored
|
| 313 |
+
WIN → save trajectory if better reward + generate_success_reflection
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
|
| 315 |
+
Episode N+1:
|
| 316 |
+
→ last 4 reflections + best win trajectory injected into prompt
|
| 317 |
+
→ LLM has learned from its past
|
| 318 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
|
| 320 |
+
**Example generated reflection:**
|
| 321 |
+
> *"The starting accounts were all real; I wasted 8 steps inspecting low-signal nodes before pivoting. When photo_reuse and bio_template are both below 0.3 after 3 inspections, immediately use INVESTIGATE_NETWORK to jump to a different graph region."*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
|
| 323 |
+
All memory persists in a Docker volume (`memory/`) across container restarts — reflections, best trajectories, win history, and α values per task.
|
|
|
|
| 324 |
|
| 325 |
---
|
| 326 |
|
| 327 |
+
## 10. Hybrid Policy — The Novel Contribution
|
| 328 |
|
| 329 |
**File:** `agent/hybrid_policy.py`
|
| 330 |
|
| 331 |
+
**Key insight:** A new LLM agent starts dumb but improves over time. A rule engine is always consistent but cannot adapt. The hybrid policy exploits both — rules provide a safety net early while the LLM builds its track record; once the LLM proves itself, rules step back.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
|
| 333 |
+
### Architecture
|
|
|
|
|
|
|
|
|
|
| 334 |
|
| 335 |
+

|
|
|
|
| 336 |
|
| 337 |
+
### Alpha (α): The Trust Weight
|
| 338 |
|
| 339 |
+
α is a per-task value in [0.20, cap] representing current trust in the LLM:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
|
| 341 |
```
|
| 342 |
reflection_factor = min(1.0, n_reflections / 4.0)
|
| 343 |
raw = 0.20 + reflection_factor × (0.80 × recent_win_rate + 0.12)
|
| 344 |
α = clamp(raw, 0.20, cap)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
```
|
| 346 |
|
| 347 |
+
| Task | α cap | Rationale |
|
| 348 |
+
|---|---|---|
|
| 349 |
+
| easy | 0.50 | Rule engine alone achieves ~91% — LLM should assist, not override |
|
| 350 |
+
| medium | 0.70 | Decoys require some LLM judgment, but cascade must stay |
|
| 351 |
+
| hard | 0.85 | LLM needs latitude for evasion adaptation, but safety rules remain |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
|
| 353 |
**Alpha trajectory over training (easy task, cap=0.50):**
|
| 354 |
|
| 355 |
+
| Episode | Win rate | Reflections | α (capped) |
|
| 356 |
+
|---|---|---|---|
|
| 357 |
+
| 1 | 0% | 0 | 0.20 |
|
| 358 |
+
| 5 | 20% | 4 | 0.48 |
|
| 359 |
+
| 10 | 50% | 9 | **0.50** |
|
| 360 |
+
| 20 | 80% | 19 | **0.50** |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
|
| 362 |
+
<br>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 363 |
|
| 364 |
+

|
| 365 |
|
| 366 |
+
### Rule Confidence Levels
|
|
|
|
|
|
|
| 367 |
|
| 368 |
+
| Situation | Action | Confidence |
|
| 369 |
+
|---|---|---|
|
| 370 |
+
| Steps remaining = 0 | SUBMIT | 1.00 |
|
| 371 |
+
| Uninspected SUSPECT accounts exist | INSPECT suspects[0] | 0.95 |
|
| 372 |
+
| `fake_risk ≥ 0.85` | FLAG that account | 0.95 |
|
| 373 |
+
| `fake_risk` in [threshold, 0.85) | FLAG that account | 0.70+ |
|
| 374 |
+
| 10 accounts already flagged | SUBMIT | 0.85 |
|
| 375 |
+
| Steps remaining ≤ 3 | SUBMIT | 0.90 |
|
| 376 |
+
| Uninspected accounts available | INSPECT top candidate | 0.30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
|
| 378 |
+
At **α=0.20** (early): rules dominate (~90% of decisions). At **α=0.50** (moderate): LLM controls exploration; rules control safety. At **α=0.85** (high): LLM controls most decisions; rules only override forced submits and uninspected suspects.
|
| 379 |
|
| 380 |
+
α is saved to `memory/alpha_{task}.json` and persists across Docker restarts — the agent doesn't reset to 0.20 every time.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
|
| 382 |
---
|
| 383 |
|
| 384 |
+
## 11. Training Loop End-to-End
|
| 385 |
|
| 386 |
**File:** `train.py`
|
| 387 |
|
| 388 |
+
### Curriculum
|
| 389 |
|
| 390 |
+
| Phase | Episodes | Task | Goal |
|
| 391 |
+
|---|---|---|---|
|
| 392 |
+
| 1 | 1–20 | easy | Learn basic signal thresholds, build first reflections |
|
| 393 |
+
| 2 | 21–35 | medium | Handle decoys, learn evasion response |
|
| 394 |
+
| 3 | 36–50 | hard | Feature-only detection, persistent evasion |
|
| 395 |
|
| 396 |
Seeds rotate deterministically: `seed = (episode_num + task_offset) % 50`
|
|
|
|
| 397 |
|
| 398 |
+
### Per-Episode Flow
|
| 399 |
|
| 400 |
```
|
| 401 |
for ep in range(n_episodes):
|
| 402 |
|
| 403 |
+
1. DETERMINE TASK curriculum_task(ep) or fixed task
|
| 404 |
+
2. COMPUTE ALPHA compute_alpha(win_rate, n_reflections, task)
|
| 405 |
+
3. LOAD CONTEXT last 4 reflections + best win trajectory
|
| 406 |
+
4. RUN EPISODE while not obs.done:
|
| 407 |
+
blend(rule_action, llm_action, rule_conf, α)
|
| 408 |
+
→ obs = env.step(final)
|
| 409 |
+
5. POST-EPISODE record_win → update α → generate reflection
|
| 410 |
+
6. LOG task | win/loss | reward | recall | precision | α | modes
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
```
|
| 412 |
|
| 413 |
+
Episode metrics (flushed to `runs/metrics.jsonl` every 5 episodes) include: `episode`, `task`, `won`, `reward`, `recall`, `precision`, `steps_used`, `alpha_used`, `mode_agree`, `mode_rule`, `mode_llm`, `n_reflections_used`.
|
| 414 |
+
|
| 415 |
+
You can watch the transition: early episodes have high `rule` counts; later episodes have high `agree` counts (LLM learned to make the same decisions as the rules, but also brings strategic reasoning the rules can't).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
|
| 417 |
---
|
| 418 |
|
| 419 |
+
## 12. API Reference
|
| 420 |
|
| 421 |
**File:** `server/app.py`
|
| 422 |
|
| 423 |
+
| Endpoint | Method | Description |
|
| 424 |
+
|---|---|---|
|
| 425 |
+
| `/health` | GET | `{"status": "healthy"}` |
|
| 426 |
+
| `/tasks` | GET | Task list + `action_schema` + `score_range: [0.0, 1.0]` |
|
| 427 |
+
| `/reset` | POST | Accepts `{task, seed}` → returns initial observation |
|
| 428 |
+
| `/step` | POST | Accepts any `FakeGangAction` → returns updated observation |
|
| 429 |
+
| `/state` | GET | Current episode metadata (step count, task, score) |
|
| 430 |
+
| `/grader` | GET | Normalised [0.0, 1.0] score after SUBMIT |
|
| 431 |
+
| `/baseline` | POST | Runs rule-based agent on all 3 tasks, returns scores |
|
| 432 |
|
| 433 |
+
**Baseline performance:**
|
|
|
|
|
|
|
| 434 |
|
| 435 |
+
| Task | Seed=0 score | Win rate (50 seeds) | Mean score (50 seeds) |
|
| 436 |
+
|---|---|---|---|
|
| 437 |
+
| easy | 0.91 | 100% | ~0.91 |
|
| 438 |
+
| medium | 0.906 | 84% | ~0.77 |
|
| 439 |
+
| hard | 0.9038 | 52% | ~0.47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 440 |
|
| 441 |
---
|
| 442 |
|
| 443 |
+
## 13. Docker Deployment
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
|
| 445 |
```bash
|
| 446 |
+
# Build
|
| 447 |
docker build -f server/Dockerfile -t graphstrike .
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 448 |
|
| 449 |
+
# Run
|
| 450 |
docker run -it \
|
| 451 |
-e AWS_ACCESS_KEY_ID=your_key \
|
| 452 |
-e AWS_SECRET_ACCESS_KEY=your_secret \
|
|
|
|
| 456 |
graphstrike
|
| 457 |
```
|
| 458 |
|
| 459 |
+
The `memory/` and `runs/` volumes preserve all learning between container restarts.
|
|
|
|
| 460 |
|
| 461 |
+
### Environment Variables
|
| 462 |
|
| 463 |
+
| Variable | Default | Description |
|
| 464 |
+
|---|---|---|
|
| 465 |
+
| `AWS_ACCESS_KEY_ID` | (required) | For Bedrock/Qwen3 access |
|
| 466 |
+
| `AWS_SECRET_ACCESS_KEY` | (required) | For Bedrock/Qwen3 access |
|
| 467 |
+
| `AWS_DEFAULT_REGION` | `us-east-1` | Bedrock region |
|
| 468 |
+
| `TRAIN_TASK` | (curriculum) | Fix to `easy`/`medium`/`hard` |
|
| 469 |
+
| `TRAIN_EPISODES` | `50` | Total training episodes |
|
| 470 |
+
| `TRAIN_TEMP` | `0.4` | LLM sampling temperature |
|
| 471 |
+
| `TRAIN_VERBOSE` | `0` | Set `1` for per-step action logging |
|
| 472 |
+
| `SERVER_PORT` | `8000` | FastAPI port |
|
| 473 |
|
| 474 |
+
### Startup Sequence (`run.sh`)
|
| 475 |
|
| 476 |
```
|
| 477 |
+
1. Validate AWS credentials
|
| 478 |
+
2. python server/generator.py → generates 150 episode JSON files
|
| 479 |
3. uvicorn server.app:app → starts the environment server
|
| 480 |
+
4. Health check polling → waits until /health responds
|
| 481 |
5. python train.py → runs the full training loop
|
| 482 |
```
|
| 483 |
|
| 484 |
---
|
| 485 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 486 |
|
| 487 |
+
### Full HTTP validation
|
| 488 |
|
| 489 |
```bash
|
| 490 |
python3 -m uvicorn server.app:app --port 8001 &
|
| 491 |
sleep 3
|
| 492 |
python3 validate.py --url http://localhost:8001
|
| 493 |
+
# Expected: Results: 24/24 passed — all OK
|
| 494 |
```
|
| 495 |
|
|
|
|
|
|
|
| 496 |
### Deployed Endpoint Verification
|
| 497 |
|
|
|
|
|
|
|
|
|
|
| 498 |
```bash
|
|
|
|
| 499 |
curl https://pandago-graphstrike.hf.space/health
|
| 500 |
# → {"status": "healthy"}
|
| 501 |
|
|
|
|
| 502 |
curl https://pandago-graphstrike.hf.space/tasks
|
| 503 |
# → {"tasks": ["easy","medium","hard"], "action_schema": {...}, "score_range": [0.0, 1.0]}
|
| 504 |
|
|
|
|
| 505 |
curl -X POST https://pandago-graphstrike.hf.space/baseline
|
| 506 |
# → {"scores": {"easy": 0.91, "medium": 0.906, "hard": 0.9038}, "agent": "rule_based"}
|
| 507 |
```
|
| 508 |
|
| 509 |
---
|
| 510 |
|
|
|
|
|
|
|
| 511 |

|
| 512 |
|
| 513 |
## Developed with ❤️ by Team ComputeXOR
|
| 514 |
|
|
|
|
| 515 |
### {
|
| 516 |
|
| 517 |
+
### [Sai Nivedh](https://github.com/SaiNivedh26) ,
|
| 518 |
|
| 519 |
+
### [Charuvarthan](https://github.com/Charuvarthan-T) ,
|
| 520 |
|
| 521 |
### [Sajeev](https://github.com/SajeevSenthil)
|
| 522 |
|
README.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1723ffdcb7e36d47ff546500f86b30ebbb40af3a8616e1818798aaf144e0f5fc
|
| 3 |
+
size 1448893
|
assets/episode.png
ADDED
|
assets/formulas-1.png
ADDED
|
Git LFS Details
|
assets/formulas-2.png
ADDED
|
assets/gs.png
ADDED
|
assets/hybrid.png
ADDED
|
Git LFS Details
|
assets/logo.png
ADDED
|
assets/reflexion.png
ADDED
|
Git LFS Details
|
assets/sys arch.png
ADDED
|
Git LFS Details
|
docs.md
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: GraphStrike
|
| 3 |
+
emoji: 🕵️
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
license: mit
|
| 10 |
+
tags:
|
| 11 |
+
- reinforcement-learning
|
| 12 |
+
- social-network
|
| 13 |
+
- fraud-detection
|
| 14 |
+
- openenv
|
| 15 |
+
- llm-agent
|
| 16 |
+
---
|
| 17 |
+
<br>
|
| 18 |
+
|
| 19 |
+
<p align="center">
|
| 20 |
+
<img src="images/logo.png" width="600"/>
|
| 21 |
+
</p>
|
| 22 |
+
|
| 23 |
+
<br>
|
| 24 |
+
|
| 25 |
+
<p align="center">
|
| 26 |
+
<img src="https://img.shields.io/badge/Hugging%20Face-FFD21E?style=for-the-badge&logo=huggingface&logoColor=black"/>
|
| 27 |
+
<img src="https://img.shields.io/badge/HF%20Spaces-FFBF00?style=for-the-badge&logo=huggingface&logoColor=black"/>
|
| 28 |
+
<img src="https://img.shields.io/badge/FastAPI-009688?style=for-the-badge&logo=fastapi&logoColor=white"/>
|
| 29 |
+
<img src="https://img.shields.io/badge/Docker-2496ED?style=for-the-badge&logo=docker&logoColor=white"/>
|
| 30 |
+
<img src="https://img.shields.io/badge/Gradio-F97316?style=for-the-badge&logo=gradio&logoColor=white"/>
|
| 31 |
+
<img src="https://img.shields.io/badge/OpenEnv-4B5563?style=for-the-badge&logo=envato&logoColor=white"/>
|
| 32 |
+
<img src="https://img.shields.io/badge/Amazon%20Bedrock-FF9900?style=for-the-badge&logo=amazonaws&logoColor=white"/>
|
| 33 |
+
</p>
|
| 34 |
+
<br>
|
| 35 |
+
|
| 36 |
+
<h1 align="center">
|
| 37 |
+
</h1>
|
| 38 |
+
<p align="center">
|
| 39 |
+
An OpenEnv-compatible reinforcement learning environment where an LLM agent must identify all 10 members of a coordinated fake account network hidden inside a synthetic social network. The agent learns via Reflexion and a dynamic hybrid rule/LLM policy, not via gradient updates or fine-tuning.
|
| 40 |
+
<br />
|
| 41 |
+
</p>
|
| 42 |
+
</p>
|
| 43 |
+
|
| 44 |
+
<br>
|
| 45 |
+
<br>
|
| 46 |
+
|
| 47 |
+
### *Deployed Endpoint Verification*
|
| 48 |
+
|
| 49 |
+
The live environment at [huggingface.co/spaces/Pandago/graphstrike](https://huggingface.co/spaces/Pandago/graphstrike)
|
| 50 |
+
responds to all standard OpenEnv endpoints:
|
| 51 |
+
|
| 52 |
+
```bash
|
| 53 |
+
# Health check
|
| 54 |
+
curl https://pandago-graphstrike.hf.space/health
|
| 55 |
+
# → {"status": "healthy"}
|
| 56 |
+
|
| 57 |
+
# Task discovery
|
| 58 |
+
curl https://pandago-graphstrike.hf.space/tasks
|
| 59 |
+
# → {"tasks": ["easy","medium","hard"], "action_schema": {...}, "score_range": [0.0, 1.0]}
|
| 60 |
+
|
| 61 |
+
# Baseline (deterministic, reproducible)
|
| 62 |
+
curl -X POST https://pandago-graphstrike.hf.space/baseline
|
| 63 |
+
# → {"scores": {"easy": 0.91, "medium": 0.906, "hard": 0.9038}, "agent": "rule_based"}
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
---
|
| 67 |
+
|
| 68 |
+
<br>
|
| 69 |
+
|
| 70 |
+
*We evaluate GraphStrike's hybrid rule/LLM policy across multiple frontier models to measure how well each model handles the investigation task. All runs use
|
| 71 |
+
the same inference pipeline (`inference.py`) with identical system prompts and structured logging. Each model ran: (1) seed=0 on all 3 tasks, and
|
| 72 |
+
(2) seeds 0-2 on all 3 tasks for variance measurement.*
|
| 73 |
+
|
| 74 |
+
<br>
|
| 75 |
+
|
| 76 |
+
**Seed=0 scores (single episode per task):**
|
| 77 |
+
|
| 78 |
+
<p align="center">
|
| 79 |
+
<img src="images/table1.png" alt="Model Performance Table" width="1600"/>
|
| 80 |
+
</p>
|
| 81 |
+
<br>
|
| 82 |
+
|
| 83 |
+
**3-seed variance scores (mean across seeds 0, 1, 2):**
|
| 84 |
+
|
| 85 |
+
<p align="center">
|
| 86 |
+
<img src="images/table2.png" alt="Model Performance Table" width="1600"/>
|
| 87 |
+
</p>
|
| 88 |
+
<br>
|
| 89 |
+
|
| 90 |
+
**Rule-Based Baseline (no LLM, deterministic)**
|
| 91 |
+
|
| 92 |
+
<p align="center">
|
| 93 |
+
<img src="images/table3.png" alt="Model Performance Table" width="1600"/>
|
| 94 |
+
</p>
|
| 95 |
+
<br>
|
| 96 |
+
|
| 97 |
+
---
|
| 98 |
+
|
| 99 |
+
**The task:** A social network contains fake accounts organised into a
|
| 100 |
+
single coordinated network of 10. The network behaves in a coordinated way — same posting hour,
|
| 101 |
+
same IP subnet, stolen celebrity photos, copy-paste bios. The agent must find
|
| 102 |
+
all 10 by navigating a limited step budget, inspecting accounts, and flagging suspects.
|
| 103 |
+
|
| 104 |
+
**What makes this non-trivial:** The network is large (50–1000 accounts depending on difficulty). Fake accounts are mixed with innocent high-signal "decoy" accounts. In hard mode, the fake accounts actively evade — dropping intra-gang follows, renaming profiles — while the agent is mid-investigation. The agent cannot see the full network upfront: it must explore via INSPECT and INVESTIGATE_NETWORK actions, spending steps to reveal information.
|
| 105 |
+
|
| 106 |
+
**What makes the learning novel:** The inference LLM (via AWS Bedrock) cannot be fine-tuned — it's a black-box API. The agent learns via Reflexion, i.e., post-episode lessons are written back into memory and injected into every future prompt. A dynamic hybrid policy (α-weighted) blends the LLM with a deterministic rule engine, with the blend weight α updating based on recent win rate. Rules dominate early; the LLM takes over as it proves itself.
|
| 107 |
+
|
| 108 |
+
---
|
| 109 |
+
|
| 110 |
+
## Detection Signals
|
| 111 |
+
|
| 112 |
+
Detection operates entirely on numeric metadata — no content processing. Three signal categories, computed at different points:
|
| 113 |
+
|
| 114 |
+
**Node signals** (pre-computed by content pipeline, static per account):
|
| 115 |
+
|
| 116 |
+
| Feature | Fake range | Real range | Notes |
|
| 117 |
+
|---|---|---|---|
|
| 118 |
+
| `photo_reuse_score` | 0.30–0.95 | 0.00–0.15 | pHash fingerprint match against celebrity photo DB |
|
| 119 |
+
| `bio_template_score` | 0.20–0.90 | 0.00–0.12 | Cosine sim to known fake bio templates |
|
| 120 |
+
| `comment_repeat_score` | 0.60–0.90 | 0.00–0.08 | Copy-paste spam fraction across accounts |
|
| 121 |
+
|
| 122 |
+
**Behavioral signals** (static, from account metadata):
|
| 123 |
+
|
| 124 |
+
| Feature | Fake pattern |
|
| 125 |
+
|---|---|
|
| 126 |
+
| `avg_post_hour` | All 10 gang members cluster within ±0.5 hours — coordinated scheduling |
|
| 127 |
+
| `account_age_days` | Created within the same week (`base_age ± 7 days`) |
|
| 128 |
+
| `shared_ip_count` | 9 — all 10 share one IP subnet (`ip_gang_{seed}`) |
|
| 129 |
+
|
| 130 |
+
**Graph signals** (dynamic — computed at INSPECT time, shift as investigation progresses):
|
| 131 |
+
|
| 132 |
+
| Feature | Why it matters |
|
| 133 |
+
|---|---|
|
| 134 |
+
| `mutual_follow_rate` | Gang members mutually follow each other at 0.6–0.9 density; legitimate hubs don't follow back |
|
| 135 |
+
| `flagged_neighbor_count` | Grows as more gang members are flagged — the cascade signal |
|
| 136 |
+
| `post_hour_cluster_score` | Alignment to mean posting hour of currently-flagged accounts (wrap-around aware) |
|
| 137 |
+
| `suspicious_mutual_ratio` | Used to compute hub legitimacy — protects celebrities from false positives |
|
| 138 |
+
|
| 139 |
+
Graph signals are the most powerful: once one gang member is flagged, `flagged_neighbor_count` rises for all connected members, compounding with each subsequent flag.
|
| 140 |
+
|
| 141 |
+
---
|
| 142 |
+
|
| 143 |
+
## Synthetic Network Composition
|
| 144 |
+
|
| 145 |
+
150 episodes pre-generated deterministically (50 per task). Each episode is a JSON file (`episodes/{task}_{seed:03d}.json`).
|
| 146 |
+
|
| 147 |
+
| Task | Accounts | Gang | Decoys | Max steps | Evasion |
|
| 148 |
+
|---|---|---|---|---|---|
|
| 149 |
+
| easy | 50 | 10 | 0 | 30 | None |
|
| 150 |
+
| medium | 200 | 10 | 20 | 50 | Step 20 (once) |
|
| 151 |
+
| hard | 1000 | 10 | 50 | 80 | Steps 15/30/45/60 |
|
| 152 |
+
|
| 153 |
+
- **Gang:** Dense intra-follow graph (density 0.60–0.80), same IP subnet, tightly clustered post hours (std 0.5/1.5/2.5 by task).
|
| 154 |
+
- **Decoys** (medium/hard only): Real accounts with elevated `photo_reuse` and `bio_template` scores (0.20–0.40). They score as suspicious but are not gang members — they penalise reckless flagging.
|
| 155 |
+
- **Celebrities** (2 per episode): 100k–5M followers, near-zero fake scores. Hub legitimacy formula protects them.
|
| 156 |
+
- **Zero-edge isolates** (2 per episode): `follower_count=0`, no edges. Test whether the agent wastes steps on disconnected nodes.
|
| 157 |
+
|
| 158 |
+
---
|
| 159 |
+
|
| 160 |
+
## Actions
|
| 161 |
+
|
| 162 |
+
| Action | Cost | Effect |
|
| 163 |
+
|---|---|---|
|
| 164 |
+
| `inspect` | 1 step | Reveals full `AccountProfile` (all 22 features), adds neighbors to visible set |
|
| 165 |
+
| `investigate_network` | 2 steps | Bidirectional 2-hop expansion — reveals account IDs only (no profiles); re-cascades SUSPECT |
|
| 166 |
+
| `flag` | 0 steps | Marks account CONFIRMED_FAKE; dual cascade: follow-graph + IP cluster |
|
| 167 |
+
| `unflag` | 0 steps | Clears CONFIRMED_FAKE status |
|
| 168 |
+
| `submit` | 0 steps | Ends episode, triggers scoring |
|
| 169 |
+
|
| 170 |
+
**Dual SUSPECT cascade on FLAG:**
|
| 171 |
+
1. *Follow-graph:* Every visible account that the flagged account follows → SUSPECT (high precision: gang follow density 0.70+).
|
| 172 |
+
2. *IP cluster:* Every visible account sharing the same `ip_cluster_id` → SUSPECT (zero false positives: real accounts each have a unique IP; gang shares `ip_gang_{seed}`).
|
| 173 |
+
|
| 174 |
+
Both mechanisms surface in `obs.suspect_ids` — the agent's highest-priority INSPECT targets.
|
| 175 |
+
|
| 176 |
+
---
|
| 177 |
+
|
| 178 |
+
## Risk Scoring (`server/scoring.py`)
|
| 179 |
+
|
| 180 |
+
All functions are stateless, called inside `_build_profile()` at INSPECT time and on re-profiling after each FLAG.
|
| 181 |
+
|
| 182 |
+
```
|
| 183 |
+
node_risk = 0.60 × photo_reuse + 0.40 × bio_template
|
| 184 |
+
|
| 185 |
+
age_norm = min(1.0, account_age_days / 365)
|
| 186 |
+
behavior_risk = 0.55 × (1 − age_norm) + 0.45 × post_hour_cluster_score
|
| 187 |
+
|
| 188 |
+
flagged_ratio = flagged_neighbor_count / max(inspected_neighbor_count, 1)
|
| 189 |
+
graph_risk = 0.45 × flagged_ratio + 0.35 × mutual_follow_rate + 0.20 × avg_neighbor_photo_reuse
|
| 190 |
+
|
| 191 |
+
hub_legitimacy = 0.45 × log(1+followers)/log(1+1M)
|
| 192 |
+
+ 0.25 × (1 − follow_ratio_norm)
|
| 193 |
+
+ 0.20 × age_norm
|
| 194 |
+
+ 0.10 × (1 − suspicious_mutual_ratio)
|
| 195 |
+
|
| 196 |
+
fake_risk = clip(0.30×node_risk + 0.25×behavior_risk + 0.45×graph_risk − 0.25×hub_legitimacy, 0, 1)
|
| 197 |
+
```
|
| 198 |
+
|
| 199 |
+
**Weight rationale:** Graph risk (0.45) is dominant — structural signals are hardest to fake and compound across the investigation. Hub legitimacy is subtractive — a celebrity with 5M followers produces `hub_legitimacy ≈ 1.0`, making their fake_risk near zero even if gang members follow them.
|
| 200 |
+
|
| 201 |
+
**Classification thresholds:**
|
| 202 |
+
- `fake_risk < 0.35` → normal
|
| 203 |
+
- `0.35 ≤ fake_risk < 0.60` → suspect
|
| 204 |
+
- `fake_risk ≥ 0.60` → confirmed_fake (formula-level; explicit FLAG overrides)
|
| 205 |
+
|
| 206 |
+
**Grader score** (normalised [0.0, 1.0], returned by `/grader`):
|
| 207 |
+
```
|
| 208 |
+
recall = tp / 10
|
| 209 |
+
precision = tp / max(tp + fp, 1)
|
| 210 |
+
efficiency = max(0, (max_steps − steps_used) / max_steps)
|
| 211 |
+
|
| 212 |
+
if recall ≥ 0.8 AND precision ≥ 0.7:
|
| 213 |
+
score = 0.55 + 0.20×recall + 0.15×precision + 0.10×efficiency
|
| 214 |
+
else:
|
| 215 |
+
score = 0.30×recall + 0.10×precision
|
| 216 |
+
```
|
| 217 |
+
Maximum 1.0 (all 10 found, zero false positives, zero steps used). Win threshold ≈ 0.815.
|
| 218 |
+
|
| 219 |
+
---
|
| 220 |
+
|
| 221 |
+
## Hybrid Policy (`agent/hybrid_policy.py`)
|
| 222 |
+
|
| 223 |
+
The agent blends a deterministic rule engine with Qwen3-Next-80B (via AWS Bedrock) using a per-task trust weight α.
|
| 224 |
+
|
| 225 |
+
**Alpha update** (per episode, after win/loss recorded):
|
| 226 |
+
```
|
| 227 |
+
reflection_factor = min(1.0, n_reflections / 4.0)
|
| 228 |
+
raw = 0.20 + reflection_factor × (0.80 × recent_win_rate + 0.12)
|
| 229 |
+
alpha = clamp(raw, 0.20, task_cap)
|
| 230 |
+
```
|
| 231 |
+
|
| 232 |
+
| Task | α cap | Rationale |
|
| 233 |
+
|---|---|---|
|
| 234 |
+
| easy | 0.50 | Rule engine alone hits ~91% — LLM assists, doesn't override |
|
| 235 |
+
| medium | 0.70 | Decoys require LLM judgment, but cascade must stay |
|
| 236 |
+
| hard | 0.85 | LLM needs latitude for evasion adaptation |
|
| 237 |
+
|
| 238 |
+
`reflection_factor` gates α: the LLM must accumulate ≥4 post-episode lessons before reaching meaningful trust, regardless of raw win rate.
|
| 239 |
+
|
| 240 |
+
**Blending decision:**
|
| 241 |
+
```python
|
| 242 |
+
rule_action, rule_conf = get_rule_action(obs) # deterministic, with confidence score
|
| 243 |
+
llm_action, _ = get_action(obs, ...) # Qwen3 via Bedrock
|
| 244 |
+
|
| 245 |
+
if rule_action == llm_action: final = llm_action # agree
|
| 246 |
+
elif rule_conf >= alpha: final = rule_action # rule overrides
|
| 247 |
+
else: final = llm_action # LLM trusted
|
| 248 |
+
```
|
| 249 |
+
|
| 250 |
+
Rule confidences: SUBMIT-forced=1.00, INSPECT-suspect=0.95, FLAG-high-risk=0.95, FLAG-threshold=0.70+, INSPECT-explore=0.30. At `α=0.50` (easy cap), safety decisions (suspects, forced submit) always override; exploration goes to the LLM.
|
| 251 |
+
|
| 252 |
+
**Reflexion learning:** After each episode, Qwen3 generates a 2–3 sentence lesson from the action log and outcome. Lessons are stored in `memory/reflections_{task}.jsonl` and injected into every future prompt (last 4 lessons + best winning trajectory as few-shot example). Memory persists across container restarts via Docker volume.
|
| 253 |
+
|
| 254 |
+
---
|
| 255 |
+
|
| 256 |
+
## API Reference
|
| 257 |
+
|
| 258 |
+
| Endpoint | Method | Description |
|
| 259 |
+
|---|---|---|
|
| 260 |
+
| `/health` | GET | `{"status": "healthy"}` |
|
| 261 |
+
| `/tasks` | GET | Task list + `action_schema` + `score_range: [0.0, 1.0]` |
|
| 262 |
+
| `/reset` | POST | `{task, seed}` → initial observation |
|
| 263 |
+
| `/step` | POST | `{action_type, account_id?}` → updated observation |
|
| 264 |
+
| `/state` | GET | Episode metadata (step count, task, score, evasion count) |
|
| 265 |
+
| `/grader` | GET | Normalised [0.0, 1.0] score after SUBMIT (400 if not done) |
|
| 266 |
+
| `/baseline` | POST | Runs rule-based agent on all 3 tasks, seed=0 |
|
| 267 |
+
| `/metadata` | GET | OpenEnv metadata block |
|
| 268 |
+
| `/schema` | GET | Full JSON schema for actions and observations |
|
| 269 |
+
| `/mcp` | POST | JSON-RPC 2.0 tool discovery (Model Context Protocol) |
|
| 270 |
+
|
| 271 |
+
Live: `https://pandago-graphstrike.hf.space`
|
| 272 |
+
|
| 273 |
+
---
|
| 274 |
+
|
| 275 |
+
## File Structure
|
| 276 |
+
|
| 277 |
+
```
|
| 278 |
+
server/
|
| 279 |
+
app.py — FastAPI + Gradio UI (gr.mount_gradio_app)
|
| 280 |
+
environment.py — Episode lifecycle, action mechanics, cascade logic
|
| 281 |
+
generator.py — Deterministic episode generation (150 JSON files)
|
| 282 |
+
scoring.py — Stateless risk formula functions
|
| 283 |
+
models.py — Pydantic models: AccountProfile, FakeGangObservation, ActionType
|
| 284 |
+
|
| 285 |
+
agent/
|
| 286 |
+
policy.py — Qwen3 prompt construction + action parsing
|
| 287 |
+
hybrid_policy.py — Alpha blending, rule engine with confidence scores
|
| 288 |
+
reflection.py — Post-episode lesson generation
|
| 289 |
+
memory.py — JSONL persistence for reflections, trajectories, alpha
|
| 290 |
+
|
| 291 |
+
inference.py — Submission entrypoint: [START]/[STEP]/[END] structured logs, OpenAI client
|
| 292 |
+
validate.py — 24-point pre-submission validator (local + HTTP)
|
| 293 |
+
train.py — Full training loop with curriculum
|
| 294 |
+
episodes/ — 150 pre-generated JSON episode files (baked into Docker image)
|
| 295 |
+
memory/ — Docker volume: reflections, win history, alpha values
|
| 296 |
+
```
|
| 297 |
+
|
| 298 |
+
---
|
| 299 |
+
|
| 300 |
+
## Baseline Scores
|
| 301 |
+
|
| 302 |
+
| Task | Seed=0 | Win rate (50 seeds) | Mean (50 seeds) |
|
| 303 |
+
|---|---|---|---|
|
| 304 |
+
| easy | 0.910 | 100% | ~0.91 |
|
| 305 |
+
| medium | 0.906 | 84% | ~0.77 |
|
| 306 |
+
| hard | 0.9038 | 52% | ~0.47 |
|
| 307 |
+
|
| 308 |
+
The rule-based baseline (no LLM) is competitive on easy/medium. Hard is the real differentiator — evasion events drop intra-gang edges mid-investigation, destroying graph signals. Frontier LLM agents with accumulated reflections adapt; the rule engine degrades.
|
| 309 |
+
|
| 310 |
+
---
|
| 311 |
+
|
| 312 |
+
*Built by Team ComputeXOR*
|
eval-models/deepseek_test_judge_eval.py
ADDED
|
@@ -0,0 +1,478 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Judge Evaluation Simulator
|
| 4 |
+
==========================
|
| 5 |
+
|
| 6 |
+
Simulates EXACTLY how hackathon judges will evaluate your environment:
|
| 7 |
+
|
| 8 |
+
1. Baseline re-run: POST /baseline → verify scores are stable
|
| 9 |
+
2. Standard Open LLM agent: Run an LLM (via HF router) against all 3 tasks
|
| 10 |
+
3. Score variance check: Run same task multiple seeds, check variance
|
| 11 |
+
|
| 12 |
+
USAGE:
|
| 13 |
+
# Against live HF Space (requires HF_TOKEN):
|
| 14 |
+
export HF_TOKEN="hf_..."
|
| 15 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space
|
| 16 |
+
|
| 17 |
+
# Against local server:
|
| 18 |
+
export HF_TOKEN="hf_..."
|
| 19 |
+
python test_judge_eval.py --url http://localhost:7860
|
| 20 |
+
|
| 21 |
+
# Choose model (default: deepseek.v3.2):
|
| 22 |
+
export MODEL_NAME="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
|
| 23 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space
|
| 24 |
+
|
| 25 |
+
# Just test endpoints (no LLM needed):
|
| 26 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space --endpoints-only
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
import json
|
| 30 |
+
import os
|
| 31 |
+
import sys
|
| 32 |
+
import time
|
| 33 |
+
import urllib.request
|
| 34 |
+
from typing import Dict, List, Optional
|
| 35 |
+
|
| 36 |
+
# ---------------------------------------------------------------------------
|
| 37 |
+
# Config from env vars (same as judges will set)
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
|
| 40 |
+
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 41 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 42 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "deepseek.v3.2")
|
| 43 |
+
|
| 44 |
+
# AWS Bedrock config (use --bedrock flag)
|
| 45 |
+
LLM_BACKEND = "hf" # "hf" or "bedrock"
|
| 46 |
+
BEDROCK_MODEL_ID = os.getenv("BEDROCK_MODEL_ID", "deepseek.v3.2")
|
| 47 |
+
|
| 48 |
+
SYSTEM_PROMPT = """You are an AI detective finding 10 coordinated fake accounts in a social network.
|
| 49 |
+
|
| 50 |
+
ACTIONS (reply with exactly ONE line):
|
| 51 |
+
- INSPECT acc_XXXX — reveal profile (costs 1 step)
|
| 52 |
+
- FLAG acc_XXXX — mark as fake (FREE, no step cost, triggers suspect cascade)
|
| 53 |
+
- SUBMIT — end episode, get scored
|
| 54 |
+
|
| 55 |
+
DECISION RULES (apply top-to-bottom, first match wins):
|
| 56 |
+
1. If suspect_ids lists accounts you haven't inspected → INSPECT the first one
|
| 57 |
+
2. If ANY profiled account has shared_ip_count >= 5 and is NOT flagged → FLAG it immediately
|
| 58 |
+
3. If ANY profiled account has photo_reuse >= 0.50 AND bio_template >= 0.40 and hub < 0.70 and NOT flagged → FLAG it
|
| 59 |
+
4. If ANY profiled account has fake_risk_score >= 0.30 and hub < 0.70 and NOT flagged → FLAG it
|
| 60 |
+
5. If there are uninspected visible accounts and steps > 3 → INSPECT the next one
|
| 61 |
+
6. If you have 10 flags OR steps <= 3 → SUBMIT
|
| 62 |
+
|
| 63 |
+
IMPORTANT:
|
| 64 |
+
- FLAG is FREE (costs 0 steps) — flag aggressively when you see suspicious signals
|
| 65 |
+
- After each FLAG, new suspects appear — always inspect suspects before other accounts
|
| 66 |
+
- hub_legitimacy_score > 0.70 means celebrity — do NOT flag
|
| 67 |
+
- shared_ip_count >= 5 is the strongest gang signal (all 10 share one IP)
|
| 68 |
+
- Do NOT re-inspect already inspected accounts
|
| 69 |
+
|
| 70 |
+
Reply with EXACTLY one line, nothing else:
|
| 71 |
+
FLAG acc_XXXX
|
| 72 |
+
INSPECT acc_XXXX
|
| 73 |
+
SUBMIT"""
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# ---------------------------------------------------------------------------
|
| 77 |
+
# HTTP helpers
|
| 78 |
+
# ---------------------------------------------------------------------------
|
| 79 |
+
|
| 80 |
+
def _retry(fn, retries=3, backoff=3):
|
| 81 |
+
"""Retry a function on network errors."""
|
| 82 |
+
for attempt in range(retries):
|
| 83 |
+
try:
|
| 84 |
+
return fn()
|
| 85 |
+
except OSError as e:
|
| 86 |
+
if attempt == retries - 1:
|
| 87 |
+
raise
|
| 88 |
+
wait = backoff * (attempt + 1)
|
| 89 |
+
print(f" [RETRY] Network error: {e} — retrying in {wait}s ({attempt+1}/{retries})")
|
| 90 |
+
time.sleep(wait)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def http_post(url: str, body: Optional[dict] = None) -> dict:
    """POST ``body`` as JSON to ``url`` and return the decoded JSON reply.

    A missing (or otherwise falsy) ``body`` is sent as an empty JSON object.
    The request runs through ``_retry`` and uses a 120 second timeout.
    """
    payload = json.dumps(body if body else {}).encode()

    def _send():
        request = urllib.request.Request(
            url,
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=120) as reply:
            raw = reply.read()
        return json.loads(raw)

    return _retry(_send)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def http_get(url: str, expect_json: bool = True) -> dict:
    """GET ``url`` and return its JSON body, or a summary for non-JSON replies.

    With ``expect_json=False`` the payload is not parsed; a dict holding the
    HTTP status (``_status``) and the body length in bytes (``_body_len``)
    is returned instead. The request runs through ``_retry`` with a
    120 second timeout.
    """
    def _fetch():
        with urllib.request.urlopen(url, timeout=120) as reply:
            payload = reply.read()
            if expect_json:
                return json.loads(payload)
            return {"_status": reply.status, "_body_len": len(payload)}

    return _retry(_fetch)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# ---------------------------------------------------------------------------
|
| 117 |
+
# LLM call via OpenAI-compatible API
|
| 118 |
+
# ---------------------------------------------------------------------------
|
| 119 |
+
|
| 120 |
+
def _call_hf(prompt: str) -> str:
    """Send one system+user chat turn to the OpenAI-compatible endpoint.

    The client is built from ``API_BASE_URL`` and ``HF_TOKEN``; the request
    targets ``MODEL_NAME`` with temperature 0.3 and at most 256 tokens.
    Returns the first choice's text, stripped ("" when the content is empty).
    """
    from openai import OpenAI

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    router = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    completion = router.chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        temperature=0.3,
        max_tokens=256,
    )
    reply = completion.choices[0].message.content
    return (reply or "").strip()
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _call_bedrock(prompt: str) -> str:
    """Call the LLM via AWS Bedrock and return its text reply.

    Uses ``converse()`` when the installed boto3 client offers it, otherwise
    falls back to ``invoke_model()`` with an OpenAI-style message payload.

    Args:
        prompt: User message; ``SYSTEM_PROMPT`` is sent as the system turn.

    Returns:
        The stripped reply text, or "" when the response carries no text.
    """
    import boto3

    client = boto3.client(
        service_name="bedrock-runtime",
        region_name=os.getenv("AWS_DEFAULT_REGION", "us-east-1"),
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
        # Temporary (STS/SSO) credentials are rejected unless the matching
        # session token travels with the explicit key pair.
        aws_session_token=os.getenv("AWS_SESSION_TOKEN"),
    )

    # Try converse API first (boto3 >= 1.34.x)
    if hasattr(client, "converse"):
        resp = client.converse(
            modelId=BEDROCK_MODEL_ID,
            messages=[{"role": "user", "content": [{"text": prompt}]}],
            system=[{"text": SYSTEM_PROMPT}],
            inferenceConfig={"maxTokens": 256, "temperature": 0.3},
        )
        blocks = resp["output"]["message"]["content"]
        # Content blocks are a union type; a non-text block (e.g. reasoning
        # content) may come first, so pick the first block that has text.
        return next((b["text"] for b in blocks if "text" in b), "").strip()

    # Fallback: invoke_model (works with all boto3 versions)
    body = json.dumps({
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 256,
        "temperature": 0.3,
    })
    resp = client.invoke_model(
        modelId=BEDROCK_MODEL_ID,
        contentType="application/json",
        accept="application/json",
        body=body,
    )
    result = json.loads(resp["body"].read())

    # Handle both OpenAI-style and Bedrock-native response formats
    if "choices" in result:
        return result["choices"][0]["message"]["content"].strip()
    if "content" in result:
        content = result["content"]
        if isinstance(content, list):
            # An empty list previously raised IndexError.
            return content[0].get("text", "").strip() if content else ""
        return str(content).strip()
    if "output" in result:
        return result["output"].get("text", "").strip()
    return str(result).strip()
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def call_llm(prompt: str) -> str:
    """Query the configured backend, retrying up to three times.

    ``LLM_BACKEND == "bedrock"`` selects ``_call_bedrock``; anything else
    selects ``_call_hf``. ``<think>...</think>`` blocks are removed from the
    reply; if nothing remains, the raw reply is returned unchanged. After the
    third failure an empty string is returned instead of raising.
    """
    import re

    backend_call = _call_bedrock if LLM_BACKEND == "bedrock" else _call_hf
    max_attempts = 3
    for attempt_no in range(1, max_attempts + 1):
        try:
            raw = backend_call(prompt)
            if os.getenv("DEBUG_LLM"):
                print(f" [LLM RAW] {raw[:200]}")
            # Drop reasoning blocks emitted by "thinking" models.
            without_reasoning = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
            return without_reasoning or raw
        except Exception as e:
            if attempt_no == max_attempts:
                print(f" [LLM ERROR] {e} (gave up after 3 attempts)")
                return ""
            wait = 3 * attempt_no
            print(f" [LLM RETRY] {e} — retrying in {wait}s")
            time.sleep(wait)
    return ""
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def format_obs(obs: dict) -> str:
    """Render an observation dict as the text prompt shown to the LLM.

    Sections, in order: task/steps header, flagged summary, uninspected
    suspects, accounts with strong fake signals, already-flagged accounts
    (first 5), remaining accounts (first 5), uninspected visible IDs
    (first 8) and the environment message. Missing keys fall back to empty
    values.

    An account is listed under "FLAG THESE" when it is not yet flagged and
    either ``shared_ip_count >= 5``, or ``photo_reuse_score >= 0.50`` with
    ``bio_template_score >= 0.40`` and ``hub_legitimacy_score < 0.70``. The
    hub condition mirrors the photo/bio rule in the system prompt, so a
    celebrity account is not presented as a must-flag target.
    """
    lines = []
    lines.append(f"TASK: {obs.get('task','?').upper()} | Steps remaining: {obs.get('steps_remaining','?')}")

    flagged = obs.get("flagged_ids", [])
    lines.append(f"Flagged ({len(flagged)}/10): {', '.join(flagged) if flagged else 'none'}")

    suspects = obs.get("suspect_ids", [])
    inspected = obs.get("inspected_ids", [])
    # Sets give O(1) membership checks; the lists are kept for ordered output.
    flagged_set = set(flagged)
    inspected_set = set(inspected)

    uninspected_suspects = [s for s in suspects if s not in inspected_set]
    if uninspected_suspects:
        lines.append(f"*** SUSPECTS (uninspected) → INSPECT THESE FIRST: {', '.join(uninspected_suspects)} ***")

    accounts = obs.get("visible_accounts", [])
    if accounts:
        # Split: unflagged accounts that should be flagged vs rest
        unflagged_suspicious = []
        flagged_accs = []
        clean_accs = []
        for a in sorted(accounts, key=lambda x: x.get("fake_risk_score", 0), reverse=True):
            aid = a.get("account_id", "?")
            shared_ip = a.get("shared_ip_count", 0) >= 5
            templated_profile = (
                a.get("photo_reuse_score", 0) >= 0.50
                and a.get("bio_template_score", 0) >= 0.40
                # Without this guard, celebrity hubs were listed as must-flag.
                and a.get("hub_legitimacy_score", 0) < 0.70
            )
            if aid in flagged_set:
                flagged_accs.append(a)
            elif shared_ip or templated_profile:
                unflagged_suspicious.append(a)
            else:
                clean_accs.append(a)

        if unflagged_suspicious:
            lines.append(f"\n!!! ACTION NEEDED — FLAG THESE ({len(unflagged_suspicious)} accounts with strong fake signals):")
            for a in unflagged_suspicious:
                aid = a.get("account_id", "?")
                lines.append(f" → FLAG {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} ip_shared={a.get('shared_ip_count',0)} hub={a.get('hub_legitimacy_score',0):.2f}")

        if flagged_accs:
            lines.append(f"\nALREADY FLAGGED ({len(flagged_accs)}):")
            for a in flagged_accs[:5]:
                lines.append(f" ✓ {a.get('account_id','?')}")

        if clean_accs:
            lines.append(f"\nCLEAN ACCOUNTS ({len(clean_accs)}):")
            for a in clean_accs[:5]:
                aid = a.get("account_id", "?")
                hub = a.get("hub_legitimacy_score", 0)
                hub_mark = " [CELEBRITY]" if hub > 0.70 else ""
                lines.append(f" {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} hub={hub:.2f}{hub_mark}")

    visible = obs.get("visible_account_ids", [])
    uninspected = [i for i in visible if i not in inspected_set]
    if uninspected:
        lines.append(f"\nUninspected IDs ({len(uninspected)}): {', '.join(uninspected[:8])}{'...' if len(uninspected) > 8 else ''}")

    lines.append(f"\nMessage: {obs.get('message', '')}")
    return "\n".join(lines)
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def parse_action(llm_text: str, obs: dict) -> dict:
    """Turn raw LLM output into an action dict for the ``/step`` endpoint.

    The first line that reads as an action wins. Recognised verbs are
    INSPECT, FLAG, INVESTIGATE_NETWORK and UNFLAG (each followed by an
    account id) and a bare SUBMIT. Matching is case-insensitive, and leading
    bullet/markdown decoration and trailing punctuation are ignored. Only
    the first token after the verb is used as the account id, so trailing
    explanations ("FLAG acc_0001 because ...") no longer end up in the id.

    Args:
        llm_text: Text returned by the model; may be empty.
        obs: Current observation, used only for the fallback choice.

    Returns:
        ``{"action_type": ..., "account_id": ...}`` or
        ``{"action_type": "submit"}``. When no line parses, the fallback
        inspects the first uninspected suspect, then the first uninspected
        visible account, and submits if neither exists.
    """
    targeted_verbs = ("INSPECT", "FLAG", "INVESTIGATE_NETWORK", "UNFLAG")
    lead_noise = "-*•>`\"' \t"
    tail_noise = "`*\"'.!,;: \t"

    for raw_line in llm_text.splitlines():
        tokens = raw_line.strip().lstrip(lead_noise).rstrip(tail_noise).split()
        if not tokens:
            continue
        verb = tokens[0].upper()
        if verb in targeted_verbs and len(tokens) > 1:
            account_id = tokens[1].strip(tail_noise).lower()
            if account_id:
                return {"action_type": verb.lower(), "account_id": account_id}
        if verb == "SUBMIT" and len(tokens) == 1:
            return {"action_type": "submit"}

    # Fallback: inspect first uninspected suspect
    inspected = set(obs.get("inspected_ids", []))
    for s in obs.get("suspect_ids", []):
        if s not in inspected:
            return {"action_type": "inspect", "account_id": s}
    for v in obs.get("visible_account_ids", []):
        if v not in inspected:
            return {"action_type": "inspect", "account_id": v}
    return {"action_type": "submit"}
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
# ---------------------------------------------------------------------------
|
| 288 |
+
# Test phases
|
| 289 |
+
# ---------------------------------------------------------------------------
|
| 290 |
+
|
| 291 |
+
def test_endpoints(base_url: str) -> bool:
    """Phase 0: probe every required endpoint once and report the outcome.

    Each probe is a (method, path, JSON body, expect-JSON) tuple executed in
    order; the order matters because ``/reset`` precedes the ``/step`` calls.
    Returns True only when no probe raised.
    """
    banner = "="*60
    print("\n" + banner)
    print("PHASE 0: Endpoint Verification")
    print(banner)

    probes = (
        ("GET", "/health", None, True),
        ("GET", "/tasks", None, True),
        ("GET", "/metadata", None, True),
        ("GET", "/schema", None, True),
        ("GET", "/web", None, False),  # returns HTML, not JSON
        ("POST", "/reset", {"task": "easy", "seed": 0}, True),
        ("GET", "/state", None, True),
        ("POST", "/step", {"action_type": "inspect", "account_id": "acc_0000"}, True),
        ("POST", "/step", {"action_type": "submit"}, True),
        ("GET", "/grader", None, True),
        ("POST", "/mcp", {"jsonrpc": "2.0", "method": "tools/list", "id": 1}, True),
        ("POST", "/baseline", None, True),
    )

    failures = 0
    for verb, route, payload, wants_json in probes:
        target = f"{base_url}{route}"
        try:
            if verb == "GET":
                http_get(target, expect_json=wants_json)
            else:
                http_post(target, payload)
            print(f" ✓ {verb} {route}")
        except Exception as e:
            print(f" ✗ {verb} {route} — {e}")
            failures += 1

    return failures == 0
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def test_baseline_stability(base_url: str) -> bool:
    """Phase 1: call ``POST /baseline`` three times and compare the scores.

    Returns True when all three ``scores`` payloads are equal.
    """
    banner = "="*60
    print("\n" + banner)
    print("PHASE 1: Baseline Stability (3 runs)")
    print(banner)

    runs = []
    for run_no in range(1, 4):
        scores = http_post(f"{base_url}/baseline")["scores"]
        runs.append(scores)
        print(f" Run {run_no}: easy={scores['easy']:.4f} medium={scores['medium']:.4f} hard={scores['hard']:.4f}")

    # Every run must match the first one exactly.
    reference = runs[0]
    stable = all(run == reference for run in runs)
    if not stable:
        print(" ✗ SCORES DIFFER — baseline is non-deterministic!")
    else:
        print(" ✓ All 3 runs identical — baseline is deterministic")
    return stable
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
def test_llm_agent(base_url: str, task: str, seed: int = 0, max_actions: int = 200) -> float:
    """Phase 2: run the LLM agent through one episode and return its score.

    The episode is reset with ``task``/``seed``. Each observation is then
    formatted, sent to the LLM, parsed into an action and posted to
    ``/step`` until the server reports ``done``.

    Args:
        base_url: Environment server URL without a trailing slash.
        task: Task name passed to ``/reset``.
        seed: Episode seed passed to ``/reset``.
        max_actions: Upper bound on LLM-chosen actions. The prompt declares
            FLAG to be free of step cost, so an agent that keeps flagging
            would otherwise never finish; once the bound is exceeded a
            SUBMIT is forced.

    Returns:
        The ``score`` field reported by ``GET /grader``.
    """
    _model = f"Bedrock/{BEDROCK_MODEL_ID}" if LLM_BACKEND == "bedrock" else MODEL_NAME
    print(f"\n --- LLM Agent: task={task}, seed={seed}, model={_model} ---")

    # Reset
    reset_resp = http_post(f"{base_url}/reset", {"task": task, "seed": seed})
    obs = reset_resp.get("observation", reset_resp)
    done = reset_resp.get("done", False)

    step_num = 0
    while not done:
        step_num += 1
        budget_spent = step_num > max_actions
        if budget_spent:
            print(f" [GUARD] {max_actions} actions taken without finishing — forcing SUBMIT")
            action = {"action_type": "submit"}
        else:
            prompt = format_obs(obs)
            llm_text = call_llm(prompt)
            action = parse_action(llm_text, obs)

        action_str = f"{action['action_type'].upper()} {action.get('account_id', '')}".strip()

        step_resp = http_post(f"{base_url}/step", action)
        obs = step_resp.get("observation", step_resp)
        done = step_resp.get("done", False)
        reward = step_resp.get("reward")

        flagged_n = len(obs.get("flagged_ids", []))
        suspects_n = len(obs.get("suspect_ids", []))
        steps_left = obs.get("steps_remaining", "?")

        print(f" Step {step_num:2d}: {action_str:35s} flagged={flagged_n}/10 suspects={suspects_n} steps_left={steps_left}")

        if done and reward is not None:
            msg = step_resp.get("message", obs.get("message", ""))
            print(f" → Episode ended: {msg[:100]}")

        if budget_spent:
            # Never loop again after a forced SUBMIT, whatever the server says.
            break

    # Get grader score
    grader = http_get(f"{base_url}/grader")
    score = grader["score"]
    print(f" ★ GRADER SCORE: {score:.4f}")
    return score
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
def test_llm_all_tasks(base_url: str) -> Dict[str, float]:
    """Phase 2: run the LLM agent once (seed 0) on easy, medium and hard.

    Returns a mapping of task name to grader score.
    """
    banner = "="*60
    print("\n" + banner)
    if LLM_BACKEND == "bedrock":
        _model = f"Bedrock/{BEDROCK_MODEL_ID}"
    else:
        _model = MODEL_NAME
    print(f"PHASE 2: LLM Agent Evaluation (model={_model})")
    print(banner)

    # Dict order follows the tuple, so tasks still run easy → medium → hard.
    scores = {
        task: test_llm_agent(base_url, task=task, seed=0)
        for task in ("easy", "medium", "hard")
    }

    print(f"\n Summary: easy={scores['easy']:.4f} medium={scores['medium']:.4f} hard={scores['hard']:.4f}")
    return scores
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
def test_variance(base_url: str, seeds: Optional[List[int]] = None) -> None:
    """Phase 3: run every task over several seeds and print mean/variance.

    Args:
        base_url: Environment server URL without a trailing slash.
        seeds: Seeds to evaluate per task; defaults to ``[0, 1, 2, 3, 4]``.
            An empty list is reported and skipped (it previously raised
            ZeroDivisionError, reachable via ``--seeds 0``).

    The variance printed is the population variance (divides by the number
    of seeds).
    """
    # None sentinel instead of a shared mutable default list.
    seeds = [0, 1, 2, 3, 4] if seeds is None else list(seeds)

    print("\n" + "="*60)
    print(f"PHASE 3: Score Variance (seeds={seeds})")
    print("="*60)

    if not seeds:
        print(" (no seeds requested — nothing to evaluate)")
        return

    for task in ["easy", "medium", "hard"]:
        task_scores = [test_llm_agent(base_url, task=task, seed=seed) for seed in seeds]

        mean = sum(task_scores) / len(task_scores)
        variance = sum((s - mean) ** 2 for s in task_scores) / len(task_scores)
        print(f"\n {task}: scores={[f'{s:.3f}' for s in task_scores]} mean={mean:.4f} var={variance:.6f}")
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
# ---------------------------------------------------------------------------
|
| 423 |
+
# Main
|
| 424 |
+
# ---------------------------------------------------------------------------
|
| 425 |
+
|
| 426 |
+
if __name__ == "__main__":
    import argparse

    # Command-line interface: one required URL plus four optional switches.
    parser = argparse.ArgumentParser(description="Judge Evaluation Simulator for GraphStrike")
    parser.add_argument("--url", required=True, help="Environment server URL")
    for switch, text in (
        ("--bedrock", "Use AWS Bedrock instead of HF router"),
        ("--endpoints-only", "Only test endpoints (no LLM)"),
        ("--skip-variance", "Skip variance check (faster)"),
    ):
        parser.add_argument(switch, action="store_true", help=text)
    parser.add_argument("--seeds", type=int, default=3, help="Number of seeds for variance check")
    args = parser.parse_args()

    # Module-level rebinding: the helper functions read LLM_BACKEND at call time.
    if args.bedrock:
        LLM_BACKEND = "bedrock"

    base = args.url.rstrip("/")
    using_bedrock = LLM_BACKEND == "bedrock"
    model_display = f"Bedrock/{BEDROCK_MODEL_ID}" if using_bedrock else MODEL_NAME
    have_credentials = bool(HF_TOKEN or os.getenv('AWS_ACCESS_KEY_ID'))

    print("GraphStrike Judge Evaluation Simulator")
    print(f"Target: {base}")
    print(f"Backend: {LLM_BACKEND}")
    print(f"Model: {model_display}")
    print(f"Token: {'set' if have_credentials else 'NOT SET'}")

    # Phase 0: Endpoints
    endpoints_ok = test_endpoints(base)
    if not endpoints_ok:
        print("\n✗ Endpoint check failed. Fix before proceeding.")
        sys.exit(1)

    # Phase 1: Baseline stability
    test_baseline_stability(base)

    if args.endpoints_only:
        print("\n✓ Endpoint-only mode — skipping LLM tests.")
        sys.exit(0)

    # The LLM phases need credentials for whichever backend was selected.
    if using_bedrock:
        if not os.getenv("AWS_ACCESS_KEY_ID"):
            print("\n✗ AWS_ACCESS_KEY_ID not set. Cannot run Bedrock LLM tests.")
            sys.exit(1)
    elif not HF_TOKEN:
        print("\n✗ HF_TOKEN not set. Cannot run LLM agent tests.")
        print(" export HF_TOKEN='hf_...' OR use --bedrock with AWS creds")
        sys.exit(1)

    # Phase 2: LLM on all tasks
    scores = test_llm_all_tasks(base)

    # Phase 3: Variance
    if not args.skip_variance:
        test_variance(base, seeds=list(range(args.seeds)))

    print("\n" + "="*60)
    print("EVALUATION COMPLETE")
    print("="*60)
|
eval-models/gemma_test_judge_eval.py
ADDED
|
@@ -0,0 +1,478 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Judge Evaluation Simulator
|
| 4 |
+
==========================
|
| 5 |
+
|
| 6 |
+
Simulates EXACTLY how hackathon judges will evaluate your environment:
|
| 7 |
+
|
| 8 |
+
1. Baseline re-run: POST /baseline → verify scores are stable
|
| 9 |
+
2. Standard Open LLM agent: Run an LLM (via HF router) against all 3 tasks
|
| 10 |
+
3. Score variance check: Run same task multiple seeds, check variance
|
| 11 |
+
|
| 12 |
+
USAGE:
|
| 13 |
+
# Against live HF Space (requires HF_TOKEN):
|
| 14 |
+
export HF_TOKEN="hf_..."
|
| 15 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space
|
| 16 |
+
|
| 17 |
+
# Against local server:
|
| 18 |
+
export HF_TOKEN="hf_..."
|
| 19 |
+
python test_judge_eval.py --url http://localhost:7860
|
| 20 |
+
|
| 21 |
+
# Choose model (default: google.gemma-3-12b-it):
|
| 22 |
+
export MODEL_NAME="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
|
| 23 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space
|
| 24 |
+
|
| 25 |
+
# Just test endpoints (no LLM needed):
|
| 26 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space --endpoints-only
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
import json
|
| 30 |
+
import os
|
| 31 |
+
import sys
|
| 32 |
+
import time
|
| 33 |
+
import urllib.request
|
| 34 |
+
from typing import Dict, List, Optional
|
| 35 |
+
|
| 36 |
+
# ---------------------------------------------------------------------------
|
| 37 |
+
# Config from env vars (same as judges will set)
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
|
| 40 |
+
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 41 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 42 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "google.gemma-3-12b-it")
|
| 43 |
+
|
| 44 |
+
# AWS Bedrock config (use --bedrock flag)
|
| 45 |
+
LLM_BACKEND = "hf" # "hf" or "bedrock"
|
| 46 |
+
BEDROCK_MODEL_ID = os.getenv("BEDROCK_MODEL_ID", "google.gemma-3-12b-it")
|
| 47 |
+
|
| 48 |
+
SYSTEM_PROMPT = """You are an AI detective finding 10 coordinated fake accounts in a social network.
|
| 49 |
+
|
| 50 |
+
ACTIONS (reply with exactly ONE line):
|
| 51 |
+
- INSPECT acc_XXXX — reveal profile (costs 1 step)
|
| 52 |
+
- FLAG acc_XXXX — mark as fake (FREE, no step cost, triggers suspect cascade)
|
| 53 |
+
- SUBMIT — end episode, get scored
|
| 54 |
+
|
| 55 |
+
DECISION RULES (apply top-to-bottom, first match wins):
|
| 56 |
+
1. If suspect_ids lists accounts you haven't inspected → INSPECT the first one
|
| 57 |
+
2. If ANY profiled account has shared_ip_count >= 5 and is NOT flagged → FLAG it immediately
|
| 58 |
+
3. If ANY profiled account has photo_reuse >= 0.50 AND bio_template >= 0.40 and hub < 0.70 and NOT flagged → FLAG it
|
| 59 |
+
4. If ANY profiled account has fake_risk_score >= 0.30 and hub < 0.70 and NOT flagged → FLAG it
|
| 60 |
+
5. If there are uninspected visible accounts and steps > 3 → INSPECT the next one
|
| 61 |
+
6. If you have 10 flags OR steps <= 3 → SUBMIT
|
| 62 |
+
|
| 63 |
+
IMPORTANT:
|
| 64 |
+
- FLAG is FREE (costs 0 steps) — flag aggressively when you see suspicious signals
|
| 65 |
+
- After each FLAG, new suspects appear — always inspect suspects before other accounts
|
| 66 |
+
- hub_legitimacy_score > 0.70 means celebrity — do NOT flag
|
| 67 |
+
- shared_ip_count >= 5 is the strongest gang signal (all 10 share one IP)
|
| 68 |
+
- Do NOT re-inspect already inspected accounts
|
| 69 |
+
|
| 70 |
+
Reply with EXACTLY one line, nothing else:
|
| 71 |
+
FLAG acc_XXXX
|
| 72 |
+
INSPECT acc_XXXX
|
| 73 |
+
SUBMIT"""
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# ---------------------------------------------------------------------------
|
| 77 |
+
# HTTP helpers
|
| 78 |
+
# ---------------------------------------------------------------------------
|
| 79 |
+
|
| 80 |
+
def _retry(fn, retries=3, backoff=3):
|
| 81 |
+
"""Retry a function on network errors."""
|
| 82 |
+
for attempt in range(retries):
|
| 83 |
+
try:
|
| 84 |
+
return fn()
|
| 85 |
+
except OSError as e:
|
| 86 |
+
if attempt == retries - 1:
|
| 87 |
+
raise
|
| 88 |
+
wait = backoff * (attempt + 1)
|
| 89 |
+
print(f" [RETRY] Network error: {e} — retrying in {wait}s ({attempt+1}/{retries})")
|
| 90 |
+
time.sleep(wait)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def http_post(url: str, body: Optional[dict] = None) -> dict:
    """POST ``body`` as JSON to ``url`` and return the decoded JSON reply.

    Args:
        url: Full URL of the endpoint.
        body: JSON-serialisable payload; a falsy value is sent as ``{}``.

    Returns:
        The response body parsed with ``json.loads``.

    The request uses a 120 second timeout and is wrapped in ``_retry``.
    """
    payload = json.dumps(body if body else {}).encode()

    def _send():
        request = urllib.request.Request(
            url,
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=120) as reply:
            raw = reply.read()
        return json.loads(raw)

    return _retry(_send)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def http_get(url: str, expect_json: bool = True) -> dict:
    """GET ``url`` and return either the parsed JSON or a small summary.

    Args:
        url: Full URL of the endpoint.
        expect_json: When true the body is parsed with ``json.loads``. When
            false (e.g. an HTML page) the body is not parsed and a dict with
            the HTTP status and the body length is returned instead.

    Returns:
        The decoded JSON document, or ``{"_status": ..., "_body_len": ...}``.

    The request uses a 120 second timeout and is wrapped in ``_retry``.
    """
    def _fetch():
        with urllib.request.urlopen(url, timeout=120) as reply:
            raw = reply.read()
            status = reply.status
        if expect_json:
            return json.loads(raw)
        return {"_status": status, "_body_len": len(raw)}

    return _retry(_fetch)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# ---------------------------------------------------------------------------
|
| 117 |
+
# LLM call via OpenAI-compatible API
|
| 118 |
+
# ---------------------------------------------------------------------------
|
| 119 |
+
|
| 120 |
+
def _call_hf(prompt: str) -> str:
    """Call LLM via HF router (OpenAI-compatible).

    Sends SYSTEM_PROMPT as the system message and ``prompt`` as the user
    message to MODEL_NAME at API_BASE_URL, authenticating with HF_TOKEN.

    Returns:
        The first choice's message content, stripped; ``""`` when the
        content is empty or ``None``.
    """
    from openai import OpenAI

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    router = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    completion = router.chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        temperature=0.3,
        max_tokens=256,
    )
    text = completion.choices[0].message.content
    return (text or "").strip()
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _call_bedrock(prompt: str) -> str:
    """Call LLM via AWS Bedrock. Tries converse() first, falls back to invoke_model().

    Credentials and region are read from the standard AWS environment
    variables. ``AWS_SESSION_TOKEN`` is forwarded as well: when the access
    key and secret are passed explicitly but the token is left out,
    temporary (STS / SSO) credentials are rejected by AWS. Unset variables
    are passed as ``None``, which lets boto3 fall back to its default
    credential resolution.

    Args:
        prompt: User message; SYSTEM_PROMPT is sent as the system message.

    Returns:
        The model's text reply, stripped of surrounding whitespace.
    """
    import boto3

    client = boto3.client(
        service_name="bedrock-runtime",
        region_name=os.getenv("AWS_DEFAULT_REGION", "us-east-1"),
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
        aws_session_token=os.getenv("AWS_SESSION_TOKEN"),
    )
    # Try converse API first (boto3 >= 1.34.x)
    if hasattr(client, "converse"):
        resp = client.converse(
            modelId=BEDROCK_MODEL_ID,
            messages=[{"role": "user", "content": [{"text": prompt}]}],
            system=[{"text": SYSTEM_PROMPT}],
            inferenceConfig={"maxTokens": 256, "temperature": 0.3},
        )
        return resp["output"]["message"]["content"][0]["text"].strip()
    # Fallback: invoke_model (works with all boto3 versions)
    body = json.dumps({
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 256,
        "temperature": 0.3,
    })
    resp = client.invoke_model(
        modelId=BEDROCK_MODEL_ID,
        contentType="application/json",
        accept="application/json",
        body=body,
    )
    result = json.loads(resp["body"].read())
    # Handle both OpenAI-style and Bedrock-native response formats
    if "choices" in result:
        return result["choices"][0]["message"]["content"].strip()
    if "content" in result:
        content = result["content"]
        if isinstance(content, list):
            return content[0].get("text", "").strip()
        return str(content).strip()
    if "output" in result:
        return result["output"].get("text", "").strip()
    # Unknown response shape: return its string form rather than failing.
    return str(result).strip()
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def call_llm(prompt: str) -> str:
    """Call LLM with retries. Uses HF router or Bedrock based on LLM_BACKEND.

    Makes up to three attempts, sleeping 3s and then 6s between them. Any
    exception counts as a failed attempt.

    Returns:
        The reply with ``<think>...</think>`` blocks removed (or the raw
        reply if nothing is left after removal); ``""`` when all three
        attempts failed.
    """
    import re

    backend = _call_hf if LLM_BACKEND != "bedrock" else _call_bedrock
    last_attempt = 2
    for attempt in range(last_attempt + 1):
        try:
            raw = backend(prompt)
            if os.getenv("DEBUG_LLM"):
                print(f" [LLM RAW] {raw[:200]}")
            # Strip Qwen3 <think>...</think> reasoning blocks
            without_reasoning = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
            return without_reasoning or raw
        except Exception as e:
            if attempt < last_attempt:
                wait = 3 * (attempt + 1)
                print(f" [LLM RETRY] {e} — retrying in {wait}s")
                time.sleep(wait)
                continue
            print(f" [LLM ERROR] {e} (gave up after 3 attempts)")
            return ""
    return ""
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def format_obs(obs: dict) -> str:
    """Format observation as text for LLM — shows raw signals prominently.

    The text contains, in order: a task / steps header, the flagged ids, the
    suspects that have not been inspected yet, the visible account profiles
    split into "flag these" / "already flagged" / "clean" buckets, the first
    uninspected visible ids, and the server message.

    An unflagged account goes into the "flag these" bucket when
    ``shared_ip_count >= 5``, or when ``photo_reuse_score >= 0.50`` and
    ``bio_template_score >= 0.40`` and ``hub_legitimacy_score < 0.70``. The
    hub condition mirrors rule 3 of SYSTEM_PROMPT; without it a high-hub
    account with reused photo/bio was listed under "FLAG THESE" although the
    system prompt says such accounts must not be flagged.

    Args:
        obs: Observation dict from the environment server. Missing or
            ``None`` list fields are treated as empty.

    Returns:
        The multi-line prompt text.
    """
    flagged = obs.get("flagged_ids") or []
    suspects = obs.get("suspect_ids") or []
    inspected = obs.get("inspected_ids") or []
    # Sets keep the repeated membership tests below cheap.
    flagged_set = set(flagged)
    inspected_set = set(inspected)

    lines = []
    lines.append(f"TASK: {str(obs.get('task') or '?').upper()} | Steps remaining: {obs.get('steps_remaining','?')}")
    lines.append(f"Flagged ({len(flagged)}/10): {', '.join(flagged) if flagged else 'none'}")

    uninspected_suspects = [s for s in suspects if s not in inspected_set]
    if uninspected_suspects:
        lines.append(f"*** SUSPECTS (uninspected) → INSPECT THESE FIRST: {', '.join(uninspected_suspects)} ***")

    accounts = obs.get("visible_accounts") or []
    if accounts:
        # Split: unflagged accounts that should be flagged vs rest
        unflagged_suspicious = []
        flagged_accs = []
        clean_accs = []
        for a in sorted(accounts, key=lambda x: x.get("fake_risk_score", 0), reverse=True):
            aid = a.get("account_id", "?")
            shared_ip_signal = a.get("shared_ip_count", 0) >= 5
            profile_signal = (
                a.get("photo_reuse_score", 0) >= 0.50
                and a.get("bio_template_score", 0) >= 0.40
                # Same hub cut-off as rule 3 of SYSTEM_PROMPT.
                and a.get("hub_legitimacy_score", 0) < 0.70
            )
            if aid in flagged_set:
                flagged_accs.append(a)
            elif shared_ip_signal or profile_signal:
                unflagged_suspicious.append(a)
            else:
                clean_accs.append(a)

        if unflagged_suspicious:
            lines.append(f"\n!!! ACTION NEEDED — FLAG THESE ({len(unflagged_suspicious)} accounts with strong fake signals):")
            for a in unflagged_suspicious:
                aid = a.get("account_id", "?")
                lines.append(f" → FLAG {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} ip_shared={a.get('shared_ip_count',0)} hub={a.get('hub_legitimacy_score',0):.2f}")

        if flagged_accs:
            lines.append(f"\nALREADY FLAGGED ({len(flagged_accs)}):")
            # Only the first five are listed to keep the prompt short.
            for a in flagged_accs[:5]:
                lines.append(f" ✓ {a.get('account_id','?')}")

        if clean_accs:
            lines.append(f"\nCLEAN ACCOUNTS ({len(clean_accs)}):")
            for a in clean_accs[:5]:
                aid = a.get("account_id", "?")
                hub = a.get("hub_legitimacy_score", 0)
                hub_mark = " [CELEBRITY]" if hub > 0.70 else ""
                lines.append(f" {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} hub={hub:.2f}{hub_mark}")

    visible = obs.get("visible_account_ids") or []
    uninspected = [i for i in visible if i not in inspected_set]
    if uninspected:
        lines.append(f"\nUninspected IDs ({len(uninspected)}): {', '.join(uninspected[:8])}{'...' if len(uninspected) > 8 else ''}")

    lines.append(f"\nMessage: {obs.get('message', '')}")
    return "\n".join(lines)
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def parse_action(llm_text: str, obs: dict) -> dict:
    """Parse LLM output to action dict.

    The first line that starts with INSPECT / FLAG / INVESTIGATE_NETWORK /
    UNFLAG followed by a target, or that consists of SUBMIT only, wins.
    Markdown decoration around the line (bullets, ``**``, backticks, quotes)
    and trailing punctuation are ignored, and only the first token after the
    verb is used as the account id. Without that, a reply such as
    ``FLAG acc_0001 because ...`` sent the whole remainder of the line as
    the id.

    Args:
        llm_text: Raw model reply; may be empty.
        obs: Current observation, used only for the fallback.

    Returns:
        ``{"action_type": ..., "account_id": ...}`` for targeted actions or
        ``{"action_type": "submit"}``. When nothing can be parsed, the
        fallback inspects the first uninspected suspect, then the first
        uninspected visible account, and otherwise submits.
    """
    targeted_verbs = ("INSPECT", "FLAG", "INVESTIGATE_NETWORK", "UNFLAG")
    leading_noise = "-*•>#`\"' \t"
    trailing_noise = "*`\"'.,;:! \t"
    token_noise = "`\"'*.,;:!()[]"

    for raw_line in (llm_text or "").splitlines():
        line = raw_line.strip().lstrip(leading_noise).rstrip(trailing_noise)
        tokens = line.split()
        if not tokens:
            continue
        verb = tokens[0].upper()
        if verb in targeted_verbs and len(tokens) > 1:
            account_id = tokens[1].strip(token_noise).lower()
            if account_id:
                return {"action_type": verb.lower(), "account_id": account_id}
        if len(tokens) == 1 and verb == "SUBMIT":
            return {"action_type": "submit"}

    # Fallback: inspect first uninspected suspect
    inspected = set(obs.get("inspected_ids") or [])
    for s in obs.get("suspect_ids") or []:
        if s not in inspected:
            return {"action_type": "inspect", "account_id": s}
    for v in obs.get("visible_account_ids") or []:
        if v not in inspected:
            return {"action_type": "inspect", "account_id": v}
    return {"action_type": "submit"}
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
# ---------------------------------------------------------------------------
|
| 288 |
+
# Test phases
|
| 289 |
+
# ---------------------------------------------------------------------------
|
| 290 |
+
|
| 291 |
+
def test_endpoints(base_url: str) -> bool:
    """Phase 0: Verify all required endpoints respond correctly.

    Each endpoint is requested once, in a fixed order (the /step and /grader
    probes follow the /reset probe). A probe passes when the request
    completes without raising.

    Returns:
        True when every probe passed, False otherwise.
    """
    rule = "="*60
    print("\n" + rule)
    print("PHASE 0: Endpoint Verification")
    print(rule)

    probes = [
        ("GET", "/health", None, True),
        ("GET", "/tasks", None, True),
        ("GET", "/metadata", None, True),
        ("GET", "/schema", None, True),
        ("GET", "/web", None, False),  # returns HTML, not JSON
        ("POST", "/reset", {"task": "easy", "seed": 0}, True),
        ("GET", "/state", None, True),
        ("POST", "/step", {"action_type": "inspect", "account_id": "acc_0000"}, True),
        ("POST", "/step", {"action_type": "submit"}, True),
        ("GET", "/grader", None, True),
        ("POST", "/mcp", {"jsonrpc": "2.0", "method": "tools/list", "id": 1}, True),
        ("POST", "/baseline", None, True),
    ]

    failures = 0
    for method, path, body, expect_json in probes:
        target = f"{base_url}{path}"
        try:
            if method != "GET":
                http_post(target, body)
            else:
                http_get(target, expect_json=expect_json)
            print(f" ✓ {method} {path}")
        except Exception as e:
            print(f" ✗ {method} {path} — {e}")
            failures += 1

    return failures == 0
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def test_baseline_stability(base_url: str) -> bool:
    """Phase 1: Baseline re-run (must produce identical scores).

    Calls POST /baseline three times and compares the ``scores`` objects of
    all runs with the first one.

    Returns:
        True when all three runs returned equal scores.
    """
    rule = "="*60
    print("\n" + rule)
    print("PHASE 1: Baseline Stability (3 runs)")
    print(rule)

    runs = []
    for run_no in range(1, 4):
        scores = http_post(f"{base_url}/baseline")["scores"]
        runs.append(scores)
        print(f" Run {run_no}: easy={scores['easy']:.4f} medium={scores['medium']:.4f} hard={scores['hard']:.4f}")

    # Check all identical
    reference = runs[0]
    stable = all(run == reference for run in runs)
    if not stable:
        print(" ✗ SCORES DIFFER — baseline is non-deterministic!")
    else:
        print(" ✓ All 3 runs identical — baseline is deterministic")
    return stable
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
def test_llm_agent(base_url: str, task: str, seed: int = 0, max_turns: int = 500) -> float:
    """Phase 2: Run an LLM agent against one task (simulates judge's Nemotron run).

    Resets the environment, then repeatedly formats the observation, asks
    the LLM for an action and posts it to /step until the server reports
    ``done``. Finally the score is read from /grader.

    Args:
        base_url: Environment server URL without trailing slash.
        task: Task name passed to /reset.
        seed: Seed passed to /reset.
        max_turns: Upper bound on LLM turns. The loop used to rely on the
            server alone to end the episode; SYSTEM_PROMPT describes FLAG
            as costing no steps, so a model that keeps repeating such an
            action could presumably loop forever (verify against the
            server's step accounting). Once the bound is exceeded a SUBMIT
            is sent instead of asking the LLM again.

    Returns:
        The ``score`` field of the /grader response.
    """
    _model = f"Bedrock/{BEDROCK_MODEL_ID}" if LLM_BACKEND == "bedrock" else MODEL_NAME
    print(f"\n --- LLM Agent: task={task}, seed={seed}, model={_model} ---")

    # Reset
    reset_resp = http_post(f"{base_url}/reset", {"task": task, "seed": seed})
    obs = reset_resp.get("observation", reset_resp)
    done = reset_resp.get("done", False)

    step_num = 0
    while not done:
        step_num += 1
        forced_submit = step_num > max_turns
        if forced_submit:
            print(f" [WARN] Episode still running after {max_turns} turns — forcing SUBMIT")
            action = {"action_type": "submit"}
        else:
            prompt = format_obs(obs)
            llm_text = call_llm(prompt)
            action = parse_action(llm_text, obs)

        action_str = f"{action['action_type'].upper()} {action.get('account_id', '')}".strip()

        step_resp = http_post(f"{base_url}/step", action)
        obs = step_resp.get("observation", step_resp)
        done = step_resp.get("done", False)
        reward = step_resp.get("reward")

        flagged_n = len(obs.get("flagged_ids", []))
        suspects_n = len(obs.get("suspect_ids", []))
        steps_left = obs.get("steps_remaining", "?")

        print(f" Step {step_num:2d}: {action_str:35s} flagged={flagged_n}/10 suspects={suspects_n} steps_left={steps_left}")

        if done and reward is not None:
            msg = step_resp.get("message", obs.get("message", ""))
            print(f" → Episode ended: {msg[:100]}")

        if forced_submit and not done:
            # Even SUBMIT did not end the episode; stop instead of spinning.
            print(" [WARN] Forced SUBMIT did not end the episode — leaving the loop")
            break

    # Get grader score
    grader = http_get(f"{base_url}/grader")
    score = grader["score"]
    print(f" ★ GRADER SCORE: {score:.4f}")
    return score
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
def test_llm_all_tasks(base_url: str) -> Dict[str, float]:
    """Phase 2: Run LLM agent on all 3 tasks.

    Runs ``test_llm_agent`` with seed 0 for "easy", "medium" and "hard", in
    that order, and prints a one-line summary.

    Returns:
        Mapping from task name to grader score.
    """
    rule = "="*60
    print("\n" + rule)
    if LLM_BACKEND == "bedrock":
        _model = f"Bedrock/{BEDROCK_MODEL_ID}"
    else:
        _model = MODEL_NAME
    print(f"PHASE 2: LLM Agent Evaluation (model={_model})")
    print(rule)

    scores = {
        task: test_llm_agent(base_url, task=task, seed=0)
        for task in ("easy", "medium", "hard")
    }

    print(f"\n Summary: easy={scores['easy']:.4f} medium={scores['medium']:.4f} hard={scores['hard']:.4f}")
    return scores
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
def test_variance(base_url: str, seeds: Optional[List[int]] = None) -> None:
    """Phase 3: Score variance check (multiple seeds per task).

    For each of "easy", "medium" and "hard" the LLM agent is run once per
    seed; the per-task scores, their mean and their population variance are
    printed.

    Args:
        base_url: Environment server URL without trailing slash.
        seeds: Seeds to evaluate; ``None`` means ``[0, 1, 2, 3, 4]``. The
            default is built per call instead of being a shared mutable
            default argument. An empty list skips the phase instead of
            failing with ``ZeroDivisionError`` (reachable via ``--seeds 0``).
    """
    if seeds is None:
        seeds = [0, 1, 2, 3, 4]

    print("\n" + "="*60)
    print(f"PHASE 3: Score Variance (seeds={seeds})")
    print("="*60)

    if not seeds:
        print(" (no seeds given — skipping variance check)")
        return

    for task in ["easy", "medium", "hard"]:
        task_scores = [test_llm_agent(base_url, task=task, seed=seed) for seed in seeds]

        mean = sum(task_scores) / len(task_scores)
        variance = sum((s - mean) ** 2 for s in task_scores) / len(task_scores)
        print(f"\n {task}: scores={[f'{s:.3f}' for s in task_scores]} mean={mean:.4f} var={variance:.6f}")
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
# ---------------------------------------------------------------------------
|
| 423 |
+
# Main
|
| 424 |
+
# ---------------------------------------------------------------------------
|
| 425 |
+
|
| 426 |
+
if __name__ == "__main__":
    # Command-line entry point: endpoint check, baseline stability, then the
    # LLM phases unless --endpoints-only is given.
    import argparse

    cli = argparse.ArgumentParser(description="Judge Evaluation Simulator for GraphStrike")
    cli.add_argument("--url", required=True, help="Environment server URL")
    cli.add_argument("--bedrock", action="store_true", help="Use AWS Bedrock instead of HF router")
    cli.add_argument("--endpoints-only", action="store_true", help="Only test endpoints (no LLM)")
    cli.add_argument("--skip-variance", action="store_true", help="Skip variance check (faster)")
    cli.add_argument("--seeds", type=int, default=3, help="Number of seeds for variance check")
    args = cli.parse_args()

    # Rebinds the module-level backend switch that call_llm reads.
    if args.bedrock:
        LLM_BACKEND = "bedrock"

    base = args.url.rstrip("/")
    if LLM_BACKEND == "bedrock":
        model_display = f"Bedrock/{BEDROCK_MODEL_ID}"
    else:
        model_display = MODEL_NAME
    have_credentials = bool(HF_TOKEN or os.getenv('AWS_ACCESS_KEY_ID'))
    for banner_line in (
        "GraphStrike Judge Evaluation Simulator",
        f"Target: {base}",
        f"Backend: {LLM_BACKEND}",
        f"Model: {model_display}",
        f"Token: {'set' if have_credentials else 'NOT SET'}",
    ):
        print(banner_line)

    # Phase 0: Endpoints
    endpoints_ok = test_endpoints(base)
    if not endpoints_ok:
        print("\n✗ Endpoint check failed. Fix before proceeding.")
        sys.exit(1)

    # Phase 1: Baseline stability
    test_baseline_stability(base)

    if args.endpoints_only:
        print("\n✓ Endpoint-only mode — skipping LLM tests.")
        sys.exit(0)

    # The LLM phases need credentials for whichever backend is active.
    if LLM_BACKEND == "bedrock":
        if not os.getenv("AWS_ACCESS_KEY_ID"):
            print("\n✗ AWS_ACCESS_KEY_ID not set. Cannot run Bedrock LLM tests.")
            sys.exit(1)
    elif not HF_TOKEN:
        print("\n✗ HF_TOKEN not set. Cannot run LLM agent tests.")
        print(" export HF_TOKEN='hf_...' OR use --bedrock with AWS creds")
        sys.exit(1)

    # Phase 2: LLM on all tasks
    scores = test_llm_all_tasks(base)

    # Phase 3: Variance
    if not args.skip_variance:
        test_variance(base, seeds=list(range(args.seeds)))

    rule = "="*60
    print("\n" + rule)
    print("EVALUATION COMPLETE")
    print(rule)
|
eval-models/llama_test_judge_eval.py
ADDED
|
@@ -0,0 +1,478 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Judge Evaluation Simulator
|
| 4 |
+
==========================
|
| 5 |
+
|
| 6 |
+
Simulates EXACTLY how hackathon judges will evaluate your environment:
|
| 7 |
+
|
| 8 |
+
1. Baseline re-run: POST /baseline → verify scores are stable
|
| 9 |
+
2. Standard Open LLM agent: Run an LLM (via HF router) against all 3 tasks
|
| 10 |
+
3. Score variance check: Run same task multiple seeds, check variance
|
| 11 |
+
|
| 12 |
+
USAGE:
|
| 13 |
+
# Against live HF Space (requires HF_TOKEN):
|
| 14 |
+
export HF_TOKEN="hf_..."
|
| 15 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space
|
| 16 |
+
|
| 17 |
+
# Against local server:
|
| 18 |
+
export HF_TOKEN="hf_..."
|
| 19 |
+
python test_judge_eval.py --url http://localhost:7860
|
| 20 |
+
|
| 21 |
+
# Choose model (set MODEL_NAME to override this script's built-in default):
|
| 22 |
+
export MODEL_NAME="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
|
| 23 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space
|
| 24 |
+
|
| 25 |
+
# Just test endpoints (no LLM needed):
|
| 26 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space --endpoints-only
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
import json
|
| 30 |
+
import os
|
| 31 |
+
import sys
|
| 32 |
+
import time
|
| 33 |
+
import urllib.request
|
| 34 |
+
from typing import Dict, List, Optional
|
| 35 |
+
|
| 36 |
+
# ---------------------------------------------------------------------------
|
| 37 |
+
# Config from env vars (same as judges will set)
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
|
| 40 |
+
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
# MODEL_NAME is sent to the OpenAI-compatible endpoint at API_BASE_URL, so its
# default has to be a Hugging Face repo id. The Bedrock-style id used before
# ("meta.llama4-scout-17b-instruct-v1:0") only makes sense for
# BEDROCK_MODEL_ID below.
# NOTE(review): confirm this repo id is served by the router for your account.
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-4-Scout-17B-16E-Instruct")

# AWS Bedrock config (use --bedrock flag)
LLM_BACKEND = "hf"  # "hf" or "bedrock"
BEDROCK_MODEL_ID = os.getenv("BEDROCK_MODEL_ID", "meta.llama4-scout-17b-instruct-v1:0")

# System message sent with every LLM call; format_obs() builds the matching
# user message.
SYSTEM_PROMPT = """You are an AI detective finding 10 coordinated fake accounts in a social network.

ACTIONS (reply with exactly ONE line):
- INSPECT acc_XXXX — reveal profile (costs 1 step)
- FLAG acc_XXXX — mark as fake (FREE, no step cost, triggers suspect cascade)
- SUBMIT — end episode, get scored

DECISION RULES (apply top-to-bottom, first match wins):
1. If suspect_ids lists accounts you haven't inspected → INSPECT the first one
2. If ANY profiled account has shared_ip_count >= 5 and is NOT flagged → FLAG it immediately
3. If ANY profiled account has photo_reuse >= 0.50 AND bio_template >= 0.40 and hub < 0.70 and NOT flagged → FLAG it
4. If ANY profiled account has fake_risk_score >= 0.30 and hub < 0.70 and NOT flagged → FLAG it
5. If there are uninspected visible accounts and steps > 3 → INSPECT the next one
6. If you have 10 flags OR steps <= 3 → SUBMIT

IMPORTANT:
- FLAG is FREE (costs 0 steps) — flag aggressively when you see suspicious signals
- After each FLAG, new suspects appear — always inspect suspects before other accounts
- hub_legitimacy_score > 0.70 means celebrity — do NOT flag
- shared_ip_count >= 5 is the strongest gang signal (all 10 share one IP)
- Do NOT re-inspect already inspected accounts

Reply with EXACTLY one line, nothing else:
FLAG acc_XXXX
INSPECT acc_XXXX
SUBMIT"""
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# ---------------------------------------------------------------------------
|
| 77 |
+
# HTTP helpers
|
| 78 |
+
# ---------------------------------------------------------------------------
|
| 79 |
+
|
| 80 |
+
def _retry(fn, retries=3, backoff=3):
    """Call ``fn()`` and retry it when it raises ``OSError``.

    Args:
        fn: Zero-argument callable to invoke.
        retries: Total number of attempts (not "extra" attempts); must be
            at least 1.
        backoff: Base delay in seconds. The pause before the next attempt is
            ``backoff * attempt_number``, i.e. it grows linearly.

    Returns:
        Whatever ``fn()`` returns on its first successful attempt.

    Raises:
        ValueError: If ``retries`` is smaller than 1. Without this guard the
            loop body never ran and ``None`` was returned without ``fn``
            ever being called.
        OSError: Re-raised from the last attempt when every attempt failed.
    """
    if retries < 1:
        raise ValueError(f"retries must be >= 1, got {retries}")
    for attempt in range(retries):
        try:
            return fn()
        except OSError as e:
            # Out of attempts: let the caller see the original error.
            if attempt == retries - 1:
                raise
            wait = backoff * (attempt + 1)
            print(f" [RETRY] Network error: {e} — retrying in {wait}s ({attempt+1}/{retries})")
            time.sleep(wait)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def http_post(url: str, body: Optional[dict] = None) -> dict:
    """POST ``body`` as JSON to ``url`` and return the decoded JSON reply.

    Args:
        url: Full URL of the endpoint.
        body: JSON-serialisable payload; a falsy value is sent as ``{}``.

    Returns:
        The response body parsed with ``json.loads``.

    The request uses a 120 second timeout and is wrapped in ``_retry``.
    """
    payload = json.dumps(body if body else {}).encode()

    def _send():
        request = urllib.request.Request(
            url,
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=120) as reply:
            raw = reply.read()
        return json.loads(raw)

    return _retry(_send)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def http_get(url: str, expect_json: bool = True) -> dict:
    """GET ``url`` and return either the parsed JSON or a small summary.

    Args:
        url: Full URL of the endpoint.
        expect_json: When true the body is parsed with ``json.loads``. When
            false (e.g. an HTML page) the body is not parsed and a dict with
            the HTTP status and the body length is returned instead.

    Returns:
        The decoded JSON document, or ``{"_status": ..., "_body_len": ...}``.

    The request uses a 120 second timeout and is wrapped in ``_retry``.
    """
    def _fetch():
        with urllib.request.urlopen(url, timeout=120) as reply:
            raw = reply.read()
            status = reply.status
        if expect_json:
            return json.loads(raw)
        return {"_status": status, "_body_len": len(raw)}

    return _retry(_fetch)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# ---------------------------------------------------------------------------
|
| 117 |
+
# LLM call via OpenAI-compatible API
|
| 118 |
+
# ---------------------------------------------------------------------------
|
| 119 |
+
|
| 120 |
+
def _call_hf(prompt: str) -> str:
    """Call LLM via HF router (OpenAI-compatible).

    Sends SYSTEM_PROMPT as the system message and ``prompt`` as the user
    message to MODEL_NAME at API_BASE_URL, authenticating with HF_TOKEN.

    Returns:
        The first choice's message content, stripped; ``""`` when the
        content is empty or ``None``.
    """
    from openai import OpenAI

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    router = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    completion = router.chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        temperature=0.3,
        max_tokens=256,
    )
    text = completion.choices[0].message.content
    return (text or "").strip()
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _call_bedrock(prompt: str) -> str:
    """Call LLM via AWS Bedrock. Tries converse() first, falls back to invoke_model().

    Credentials and region are read from the standard AWS environment
    variables. ``AWS_SESSION_TOKEN`` is forwarded as well: when the access
    key and secret are passed explicitly but the token is left out,
    temporary (STS / SSO) credentials are rejected by AWS. Unset variables
    are passed as ``None``, which lets boto3 fall back to its default
    credential resolution.

    Args:
        prompt: User message; SYSTEM_PROMPT is sent as the system message.

    Returns:
        The model's text reply, stripped of surrounding whitespace.
    """
    import boto3

    client = boto3.client(
        service_name="bedrock-runtime",
        region_name=os.getenv("AWS_DEFAULT_REGION", "us-east-1"),
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
        aws_session_token=os.getenv("AWS_SESSION_TOKEN"),
    )
    # Try converse API first (boto3 >= 1.34.x)
    if hasattr(client, "converse"):
        resp = client.converse(
            modelId=BEDROCK_MODEL_ID,
            messages=[{"role": "user", "content": [{"text": prompt}]}],
            system=[{"text": SYSTEM_PROMPT}],
            inferenceConfig={"maxTokens": 256, "temperature": 0.3},
        )
        return resp["output"]["message"]["content"][0]["text"].strip()
    # Fallback: invoke_model (works with all boto3 versions)
    body = json.dumps({
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 256,
        "temperature": 0.3,
    })
    resp = client.invoke_model(
        modelId=BEDROCK_MODEL_ID,
        contentType="application/json",
        accept="application/json",
        body=body,
    )
    result = json.loads(resp["body"].read())
    # Handle both OpenAI-style and Bedrock-native response formats
    if "choices" in result:
        return result["choices"][0]["message"]["content"].strip()
    if "content" in result:
        content = result["content"]
        if isinstance(content, list):
            return content[0].get("text", "").strip()
        return str(content).strip()
    if "output" in result:
        return result["output"].get("text", "").strip()
    # Unknown response shape: return its string form rather than failing.
    return str(result).strip()
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def call_llm(prompt: str) -> str:
    """Call LLM with retries. Uses HF router or Bedrock based on LLM_BACKEND.

    Makes up to three attempts, sleeping 3s and then 6s between them. Any
    exception counts as a failed attempt.

    Returns:
        The reply with ``<think>...</think>`` blocks removed (or the raw
        reply if nothing is left after removal); ``""`` when all three
        attempts failed.
    """
    import re

    backend = _call_hf if LLM_BACKEND != "bedrock" else _call_bedrock
    last_attempt = 2
    for attempt in range(last_attempt + 1):
        try:
            raw = backend(prompt)
            if os.getenv("DEBUG_LLM"):
                print(f" [LLM RAW] {raw[:200]}")
            # Strip Qwen3 <think>...</think> reasoning blocks
            without_reasoning = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
            return without_reasoning or raw
        except Exception as e:
            if attempt < last_attempt:
                wait = 3 * (attempt + 1)
                print(f" [LLM RETRY] {e} — retrying in {wait}s")
                time.sleep(wait)
                continue
            print(f" [LLM ERROR] {e} (gave up after 3 attempts)")
            return ""
    return ""
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def format_obs(obs: dict) -> str:
    """Format observation as text for LLM — shows raw signals prominently.

    The text contains, in order: a task / steps header, the flagged ids, the
    suspects that have not been inspected yet, the visible account profiles
    split into "flag these" / "already flagged" / "clean" buckets, the first
    uninspected visible ids, and the server message.

    An unflagged account goes into the "flag these" bucket when
    ``shared_ip_count >= 5``, or when ``photo_reuse_score >= 0.50`` and
    ``bio_template_score >= 0.40`` and ``hub_legitimacy_score < 0.70``. The
    hub condition mirrors rule 3 of SYSTEM_PROMPT; without it a high-hub
    account with reused photo/bio was listed under "FLAG THESE" although the
    system prompt says such accounts must not be flagged.

    Args:
        obs: Observation dict from the environment server. Missing or
            ``None`` list fields are treated as empty.

    Returns:
        The multi-line prompt text.
    """
    flagged = obs.get("flagged_ids") or []
    suspects = obs.get("suspect_ids") or []
    inspected = obs.get("inspected_ids") or []
    # Sets keep the repeated membership tests below cheap.
    flagged_set = set(flagged)
    inspected_set = set(inspected)

    lines = []
    lines.append(f"TASK: {str(obs.get('task') or '?').upper()} | Steps remaining: {obs.get('steps_remaining','?')}")
    lines.append(f"Flagged ({len(flagged)}/10): {', '.join(flagged) if flagged else 'none'}")

    uninspected_suspects = [s for s in suspects if s not in inspected_set]
    if uninspected_suspects:
        lines.append(f"*** SUSPECTS (uninspected) → INSPECT THESE FIRST: {', '.join(uninspected_suspects)} ***")

    accounts = obs.get("visible_accounts") or []
    if accounts:
        # Split: unflagged accounts that should be flagged vs rest
        unflagged_suspicious = []
        flagged_accs = []
        clean_accs = []
        for a in sorted(accounts, key=lambda x: x.get("fake_risk_score", 0), reverse=True):
            aid = a.get("account_id", "?")
            shared_ip_signal = a.get("shared_ip_count", 0) >= 5
            profile_signal = (
                a.get("photo_reuse_score", 0) >= 0.50
                and a.get("bio_template_score", 0) >= 0.40
                # Same hub cut-off as rule 3 of SYSTEM_PROMPT.
                and a.get("hub_legitimacy_score", 0) < 0.70
            )
            if aid in flagged_set:
                flagged_accs.append(a)
            elif shared_ip_signal or profile_signal:
                unflagged_suspicious.append(a)
            else:
                clean_accs.append(a)

        if unflagged_suspicious:
            lines.append(f"\n!!! ACTION NEEDED — FLAG THESE ({len(unflagged_suspicious)} accounts with strong fake signals):")
            for a in unflagged_suspicious:
                aid = a.get("account_id", "?")
                lines.append(f" → FLAG {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} ip_shared={a.get('shared_ip_count',0)} hub={a.get('hub_legitimacy_score',0):.2f}")

        if flagged_accs:
            lines.append(f"\nALREADY FLAGGED ({len(flagged_accs)}):")
            # Only the first five are listed to keep the prompt short.
            for a in flagged_accs[:5]:
                lines.append(f" ✓ {a.get('account_id','?')}")

        if clean_accs:
            lines.append(f"\nCLEAN ACCOUNTS ({len(clean_accs)}):")
            for a in clean_accs[:5]:
                aid = a.get("account_id", "?")
                hub = a.get("hub_legitimacy_score", 0)
                hub_mark = " [CELEBRITY]" if hub > 0.70 else ""
                lines.append(f" {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} hub={hub:.2f}{hub_mark}")

    visible = obs.get("visible_account_ids") or []
    uninspected = [i for i in visible if i not in inspected_set]
    if uninspected:
        lines.append(f"\nUninspected IDs ({len(uninspected)}): {', '.join(uninspected[:8])}{'...' if len(uninspected) > 8 else ''}")

    lines.append(f"\nMessage: {obs.get('message', '')}")
    return "\n".join(lines)
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def parse_action(llm_text: str, obs: dict) -> dict:
    """Turn raw LLM output into an action dict for POST /step.

    The first line that begins with a recognised verb wins. For the verbs
    that take a target (INSPECT, FLAG, INVESTIGATE_NETWORK, UNFLAG) only the
    first whitespace-separated token after the verb is used as the account
    id, so a chatty reply such as ``FLAG acc_0001 because ...`` still yields
    ``acc_0001`` instead of the whole remainder of the line. A line that is
    exactly ``SUBMIT`` (case-insensitive) produces a submit action.

    If no line matches, fall back to inspecting the first suspect that has
    not been inspected, then the first uninspected visible account, and
    finally to submitting.

    Args:
        llm_text: Raw text returned by the model.
        obs: Current observation; only ``suspect_ids``, ``inspected_ids``
            and ``visible_account_ids`` are read, each defaulting to empty.

    Returns:
        ``{"action_type": ..., "account_id": ...}`` for targeted verbs, or
        ``{"action_type": "submit"}``.
    """
    targeted_verbs = ("INSPECT ", "FLAG ", "INVESTIGATE_NETWORK ", "UNFLAG ")
    for line in llm_text.split("\n"):
        line = line.strip()
        upper = line.upper()
        if upper.startswith(targeted_verbs):
            # The line is stripped and starts with "<VERB> ", so at least two
            # tokens exist; anything after the id is commentary and dropped.
            verb, target = line.split()[:2]
            return {"action_type": verb.lower(), "account_id": target.lower()}
        if upper == "SUBMIT":
            return {"action_type": "submit"}

    # Fallback: inspect first uninspected suspect, then any uninspected account
    inspected = obs.get("inspected_ids", [])
    for candidates in (obs.get("suspect_ids", []), obs.get("visible_account_ids", [])):
        for account_id in candidates:
            if account_id not in inspected:
                return {"action_type": "inspect", "account_id": account_id}
    return {"action_type": "submit"}
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
# ---------------------------------------------------------------------------
|
| 288 |
+
# Test phases
|
| 289 |
+
# ---------------------------------------------------------------------------
|
| 290 |
+
|
| 291 |
+
def test_endpoints(base_url: str) -> bool:
    """Phase 0: probe every required endpoint once and report the outcome.

    Each probe is issued in order (the /reset, /step and /grader calls rely
    on that order). A probe passes when the HTTP helper returns without
    raising; any exception is printed and counted as a failure.

    Returns:
        True when every probe succeeded, False otherwise.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("PHASE 0: Endpoint Verification")
    print(rule)

    # (HTTP method, path, POST body, whether a GET reply is parsed as JSON)
    probes = (
        ("GET", "/health", None, True),
        ("GET", "/tasks", None, True),
        ("GET", "/metadata", None, True),
        ("GET", "/schema", None, True),
        ("GET", "/web", None, False),  # returns HTML, not JSON
        ("POST", "/reset", {"task": "easy", "seed": 0}, True),
        ("GET", "/state", None, True),
        ("POST", "/step", {"action_type": "inspect", "account_id": "acc_0000"}, True),
        ("POST", "/step", {"action_type": "submit"}, True),
        ("GET", "/grader", None, True),
        ("POST", "/mcp", {"jsonrpc": "2.0", "method": "tools/list", "id": 1}, True),
        ("POST", "/baseline", None, True),
    )

    failed = False
    for verb, route, payload, wants_json in probes:
        target = f"{base_url}{route}"
        try:
            if verb != "GET":
                http_post(target, payload)
            else:
                http_get(target, expect_json=wants_json)
            print(f" ✓ {verb} {route}")
        except Exception as e:
            print(f" ✗ {verb} {route} — {e}")
            failed = True

    return not failed
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def test_baseline_stability(base_url: str) -> bool:
    """Phase 1: call POST /baseline three times and compare the scores.

    Returns:
        True when all three ``scores`` payloads are equal, False otherwise.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("PHASE 1: Baseline Stability (3 runs)")
    print(rule)

    runs = []
    for run_no in range(1, 4):
        reply = http_post(f"{base_url}/baseline")
        tiers = reply["scores"]
        runs.append(tiers)
        print(f" Run {run_no}: easy={tiers['easy']:.4f} medium={tiers['medium']:.4f} hard={tiers['hard']:.4f}")

    # Every run is compared against the first one
    reference = runs[0]
    identical = all(run == reference for run in runs)
    if not identical:
        print(" ✗ SCORES DIFFER — baseline is non-deterministic!")
    else:
        print(" ✓ All 3 runs identical — baseline is deterministic")
    return identical
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
def test_llm_agent(base_url: str, task: str, seed: int = 0, max_steps: int = 200) -> float:
    """Phase 2: Run an LLM agent against one task (simulates judge's Nemotron run).

    Resets the environment, then repeatedly formats the observation, asks the
    LLM for an action, and posts it to /step until the server reports
    ``done``. Finally the score is read from GET /grader.

    Args:
        base_url: Environment server URL without a trailing slash.
        task: Task name passed to /reset.
        seed: Seed passed to /reset.
        max_steps: Upper bound on actions sent in one episode. The loop
            otherwise depends entirely on the server reporting ``done``; if
            the model keeps issuing actions that never end the episode, the
            run would not terminate. Once the bound is reached a SUBMIT is
            forced. The default is meant as a generous safety cap.

    Returns:
        The ``score`` field of the /grader response.
    """
    _model = f"Bedrock/{BEDROCK_MODEL_ID}" if LLM_BACKEND == "bedrock" else MODEL_NAME
    print(f"\n --- LLM Agent: task={task}, seed={seed}, model={_model} ---")

    # Reset
    reset_resp = http_post(f"{base_url}/reset", {"task": task, "seed": seed})
    obs = reset_resp.get("observation", reset_resp)
    done = reset_resp.get("done", False)

    step_num = 0
    while not done:
        if step_num >= max_steps:
            # Safety valve: end the episode ourselves instead of looping forever
            print(f" [ABORT] episode still running after {max_steps} actions — forcing SUBMIT")
            http_post(f"{base_url}/step", {"action_type": "submit"})
            break
        step_num += 1
        prompt = format_obs(obs)
        llm_text = call_llm(prompt)
        action = parse_action(llm_text, obs)

        # `or ''` keeps a None account id from being printed as "None"
        action_str = f"{action['action_type'].upper()} {action.get('account_id') or ''}".strip()

        step_resp = http_post(f"{base_url}/step", action)
        obs = step_resp.get("observation", step_resp)
        done = step_resp.get("done", False)
        reward = step_resp.get("reward")

        flagged_n = len(obs.get("flagged_ids", []))
        suspects_n = len(obs.get("suspect_ids", []))
        steps_left = obs.get("steps_remaining", "?")

        print(f" Step {step_num:2d}: {action_str:35s} flagged={flagged_n}/10 suspects={suspects_n} steps_left={steps_left}")

        if done and reward is not None:
            msg = step_resp.get("message", obs.get("message", ""))
            print(f" → Episode ended: {msg[:100]}")

    # Get grader score
    grader = http_get(f"{base_url}/grader")
    score = grader["score"]
    print(f" ★ GRADER SCORE: {score:.4f}")
    return score
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
def test_llm_all_tasks(base_url: str) -> Dict[str, float]:
    """Phase 2: run the LLM agent once (seed 0) on easy, medium and hard.

    Returns:
        Mapping of task name to the grader score returned for that task.
    """
    rule = "=" * 60
    print("\n" + rule)
    if LLM_BACKEND == "bedrock":
        label = f"Bedrock/{BEDROCK_MODEL_ID}"
    else:
        label = MODEL_NAME
    print(f"PHASE 2: LLM Agent Evaluation (model={label})")
    print(rule)

    results = {
        tier: test_llm_agent(base_url, task=tier, seed=0)
        for tier in ("easy", "medium", "hard")
    }

    print(f"\n Summary: easy={results['easy']:.4f} medium={results['medium']:.4f} hard={results['hard']:.4f}")
    return results
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
def test_variance(base_url: str, seeds: List[int] = [0, 1, 2, 3, 4]) -> None:
|
| 406 |
+
"""Phase 3: Score variance check (multiple seeds per task)."""
|
| 407 |
+
print("\n" + "="*60)
|
| 408 |
+
print(f"PHASE 3: Score Variance (seeds={seeds})")
|
| 409 |
+
print("="*60)
|
| 410 |
+
|
| 411 |
+
for task in ["easy", "medium", "hard"]:
|
| 412 |
+
task_scores = []
|
| 413 |
+
for seed in seeds:
|
| 414 |
+
score = test_llm_agent(base_url, task=task, seed=seed)
|
| 415 |
+
task_scores.append(score)
|
| 416 |
+
|
| 417 |
+
mean = sum(task_scores) / len(task_scores)
|
| 418 |
+
variance = sum((s - mean) ** 2 for s in task_scores) / len(task_scores)
|
| 419 |
+
print(f"\n {task}: scores={[f'{s:.3f}' for s in task_scores]} mean={mean:.4f} var={variance:.6f}")
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
# ---------------------------------------------------------------------------
|
| 423 |
+
# Main
|
| 424 |
+
# ---------------------------------------------------------------------------
|
| 425 |
+
|
| 426 |
+
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Judge Evaluation Simulator for GraphStrike")
    parser.add_argument("--url", required=True, help="Environment server URL")
    parser.add_argument("--bedrock", action="store_true", help="Use AWS Bedrock instead of HF router")
    parser.add_argument("--endpoints-only", action="store_true", help="Only test endpoints (no LLM)")
    parser.add_argument("--skip-variance", action="store_true", help="Skip variance check (faster)")
    parser.add_argument("--seeds", type=int, default=3, help="Number of seeds for variance check")
    args = parser.parse_args()

    # The variance phase averages over range(--seeds); reject counts that
    # would leave it with nothing to average, but only when it will run.
    if args.seeds < 1 and not (args.skip_variance or args.endpoints_only):
        parser.error("--seeds must be at least 1")

    if args.bedrock:
        LLM_BACKEND = "bedrock"

    base = args.url.rstrip("/")
    using_bedrock = LLM_BACKEND == "bedrock"
    model_display = f"Bedrock/{BEDROCK_MODEL_ID}" if using_bedrock else MODEL_NAME
    # Report the credential the selected backend will actually use, so an
    # unrelated variable being set does not show up as "set".
    credential = os.getenv("AWS_ACCESS_KEY_ID") if using_bedrock else HF_TOKEN
    print("GraphStrike Judge Evaluation Simulator")
    print(f"Target: {base}")
    print(f"Backend: {LLM_BACKEND}")
    print(f"Model: {model_display}")
    print(f"Token: {'set' if credential else 'NOT SET'}")

    # Phase 0: Endpoints
    if not test_endpoints(base):
        print("\n✗ Endpoint check failed. Fix before proceeding.")
        sys.exit(1)

    # Phase 1: Baseline stability
    test_baseline_stability(base)

    if args.endpoints_only:
        print("\n✓ Endpoint-only mode — skipping LLM tests.")
        sys.exit(0)

    # The LLM phases need credentials for the selected backend
    if using_bedrock:
        if not os.getenv("AWS_ACCESS_KEY_ID"):
            print("\n✗ AWS_ACCESS_KEY_ID not set. Cannot run Bedrock LLM tests.")
            sys.exit(1)
    elif not HF_TOKEN:
        print("\n✗ HF_TOKEN not set. Cannot run LLM agent tests.")
        print(" export HF_TOKEN='hf_...' OR use --bedrock with AWS creds")
        sys.exit(1)

    # Phase 2: LLM on all tasks
    scores = test_llm_all_tasks(base)

    # Phase 3: Variance
    if not args.skip_variance:
        test_variance(base, seeds=list(range(args.seeds)))

    print("\n" + "="*60)
    print("EVALUATION COMPLETE")
    print("="*60)
|
eval-models/mistral_test_judge_eval.py
ADDED
|
@@ -0,0 +1,478 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Judge Evaluation Simulator
|
| 4 |
+
==========================
|
| 5 |
+
|
| 6 |
+
Simulates EXACTLY how hackathon judges will evaluate your environment:
|
| 7 |
+
|
| 8 |
+
1. Baseline re-run: POST /baseline → verify scores are stable
|
| 9 |
+
2. Standard Open LLM agent: Run an LLM (via HF router) against all 3 tasks
|
| 10 |
+
3. Score variance check: Run same task multiple seeds, check variance
|
| 11 |
+
|
| 12 |
+
USAGE:
|
| 13 |
+
# Against live HF Space (requires HF_TOKEN):
|
| 14 |
+
export HF_TOKEN="hf_..."
|
| 15 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space
|
| 16 |
+
|
| 17 |
+
# Against local server:
|
| 18 |
+
export HF_TOKEN="hf_..."
|
| 19 |
+
python test_judge_eval.py --url http://localhost:7860
|
| 20 |
+
|
| 21 |
+
# Choose model (default: mistral.ministral-3-8b-instruct):
|
| 22 |
+
export MODEL_NAME="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
|
| 23 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space
|
| 24 |
+
|
| 25 |
+
# Just test endpoints (no LLM needed):
|
| 26 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space --endpoints-only
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
import json
|
| 30 |
+
import os
|
| 31 |
+
import sys
|
| 32 |
+
import time
|
| 33 |
+
import urllib.request
|
| 34 |
+
from typing import Dict, List, Optional
|
| 35 |
+
|
| 36 |
+
# ---------------------------------------------------------------------------
|
| 37 |
+
# Config from env vars (same as judges will set)
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
|
| 40 |
+
# API key for the OpenAI-compatible endpoint; HF_TOKEN wins over API_KEY.
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("API_KEY")
# Base URL of the OpenAI-compatible endpoint used by the "hf" backend.
API_BASE_URL = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
# Model requested from that endpoint.
MODEL_NAME = os.environ.get("MODEL_NAME", "mistral.ministral-3-8b-instruct")

# AWS Bedrock config (use --bedrock flag)
LLM_BACKEND = "hf"  # "hf" or "bedrock"
BEDROCK_MODEL_ID = os.environ.get("BEDROCK_MODEL_ID", "mistral.ministral-3-8b-instruct")
|
| 47 |
+
|
| 48 |
+
SYSTEM_PROMPT = """You are an AI detective finding 10 coordinated fake accounts in a social network.
|
| 49 |
+
|
| 50 |
+
ACTIONS (reply with exactly ONE line):
|
| 51 |
+
- INSPECT acc_XXXX — reveal profile (costs 1 step)
|
| 52 |
+
- FLAG acc_XXXX — mark as fake (FREE, no step cost, triggers suspect cascade)
|
| 53 |
+
- SUBMIT — end episode, get scored
|
| 54 |
+
|
| 55 |
+
DECISION RULES (apply top-to-bottom, first match wins):
|
| 56 |
+
1. If suspect_ids lists accounts you haven't inspected → INSPECT the first one
|
| 57 |
+
2. If ANY profiled account has shared_ip_count >= 5 and is NOT flagged → FLAG it immediately
|
| 58 |
+
3. If ANY profiled account has photo_reuse >= 0.50 AND bio_template >= 0.40 and hub < 0.70 and NOT flagged → FLAG it
|
| 59 |
+
4. If ANY profiled account has fake_risk_score >= 0.30 and hub < 0.70 and NOT flagged → FLAG it
|
| 60 |
+
5. If there are uninspected visible accounts and steps > 3 → INSPECT the next one
|
| 61 |
+
6. If you have 10 flags OR steps <= 3 → SUBMIT
|
| 62 |
+
|
| 63 |
+
IMPORTANT:
|
| 64 |
+
- FLAG is FREE (costs 0 steps) — flag aggressively when you see suspicious signals
|
| 65 |
+
- After each FLAG, new suspects appear — always inspect suspects before other accounts
|
| 66 |
+
- hub_legitimacy_score > 0.70 means celebrity — do NOT flag
|
| 67 |
+
- shared_ip_count >= 5 is the strongest gang signal (all 10 share one IP)
|
| 68 |
+
- Do NOT re-inspect already inspected accounts
|
| 69 |
+
|
| 70 |
+
Reply with EXACTLY one line, nothing else:
|
| 71 |
+
FLAG acc_XXXX
|
| 72 |
+
INSPECT acc_XXXX
|
| 73 |
+
SUBMIT"""
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# ---------------------------------------------------------------------------
|
| 77 |
+
# HTTP helpers
|
| 78 |
+
# ---------------------------------------------------------------------------
|
| 79 |
+
|
| 80 |
+
def _retry(fn, retries=3, backoff=3):
|
| 81 |
+
"""Retry a function on network errors."""
|
| 82 |
+
for attempt in range(retries):
|
| 83 |
+
try:
|
| 84 |
+
return fn()
|
| 85 |
+
except OSError as e:
|
| 86 |
+
if attempt == retries - 1:
|
| 87 |
+
raise
|
| 88 |
+
wait = backoff * (attempt + 1)
|
| 89 |
+
print(f" [RETRY] Network error: {e} — retrying in {wait}s ({attempt+1}/{retries})")
|
| 90 |
+
time.sleep(wait)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def http_post(url: str, body: Optional[dict] = None) -> dict:
    """POST ``body`` as JSON to ``url`` and return the decoded JSON reply.

    A missing or empty ``body`` is sent as ``{}``. The request uses a
    120-second timeout and is wrapped in ``_retry``.
    """
    payload = json.dumps(body or {}).encode()

    def _send():
        request = urllib.request.Request(
            url,
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=120) as reply:
            return json.loads(reply.read())

    return _retry(_send)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def http_get(url: str, expect_json: bool = True) -> dict:
    """GET ``url`` (120-second timeout, wrapped in ``_retry``).

    Returns the decoded JSON body when ``expect_json`` is true; otherwise a
    small summary dict with the HTTP status and the body length in bytes.
    """
    def _fetch():
        with urllib.request.urlopen(url, timeout=120) as reply:
            raw = reply.read()
            if expect_json:
                return json.loads(raw)
            return {"_status": reply.status, "_body_len": len(raw)}

    return _retry(_fetch)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# ---------------------------------------------------------------------------
|
| 117 |
+
# LLM call via OpenAI-compatible API
|
| 118 |
+
# ---------------------------------------------------------------------------
|
| 119 |
+
|
| 120 |
+
def _call_hf(prompt: str) -> str:
    """Send ``prompt`` to the OpenAI-compatible endpoint and return the reply.

    The system prompt is ``SYSTEM_PROMPT``; the endpoint, key and model come
    from ``API_BASE_URL``, ``HF_TOKEN`` and ``MODEL_NAME``. A missing message
    content is returned as an empty string.
    """
    from openai import OpenAI

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    completion = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN).chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        temperature=0.3,
        max_tokens=256,
    )
    text = completion.choices[0].message.content
    return (text or "").strip()
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _call_bedrock(prompt: str) -> str:
    """Call LLM via AWS Bedrock. Tries converse() first, falls back to invoke_model().

    Credentials are read from the standard AWS environment variables. The
    session token is forwarded together with the key pair: supplying an
    explicit access key and secret without it makes temporary (STS/SSO)
    credentials fail authentication.

    Args:
        prompt: User message; ``SYSTEM_PROMPT`` is sent as the system message.

    Returns:
        The model's text reply, stripped of surrounding whitespace.
    """
    import boto3

    client = boto3.client(
        service_name="bedrock-runtime",
        region_name=os.getenv("AWS_DEFAULT_REGION", "us-east-1"),
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
        aws_session_token=os.getenv("AWS_SESSION_TOKEN"),
    )
    # Try converse API first (boto3 >= 1.34.x)
    if hasattr(client, "converse"):
        resp = client.converse(
            modelId=BEDROCK_MODEL_ID,
            messages=[{"role": "user", "content": [{"text": prompt}]}],
            system=[{"text": SYSTEM_PROMPT}],
            inferenceConfig={"maxTokens": 256, "temperature": 0.3},
        )
        return resp["output"]["message"]["content"][0]["text"].strip()
    # Fallback: invoke_model (works with all boto3 versions)
    body = json.dumps({
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 256,
        "temperature": 0.3,
    })
    resp = client.invoke_model(
        modelId=BEDROCK_MODEL_ID,
        contentType="application/json",
        accept="application/json",
        body=body,
    )
    result = json.loads(resp["body"].read())
    # Handle both OpenAI-style and Bedrock-native response formats
    if "choices" in result:
        return result["choices"][0]["message"]["content"].strip()
    if "content" in result:
        content = result["content"]
        if isinstance(content, list):
            # An empty content list yields "" instead of an IndexError
            return content[0].get("text", "").strip() if content else ""
        return str(content).strip()
    if "output" in result:
        return result["output"].get("text", "").strip()
    # Unknown shape: return the raw payload so the caller can still log it
    return str(result).strip()
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def call_llm(prompt: str) -> str:
    """Query the configured backend, retrying up to three times.

    ``LLM_BACKEND`` selects Bedrock or the HF router. ``<think>...</think>``
    blocks are removed from the reply; if nothing is left afterwards the raw
    reply is returned instead. Setting ``DEBUG_LLM`` prints the first 200
    characters of each raw reply. After the third failure an empty string is
    returned rather than raising.
    """
    import re

    backend = _call_bedrock if LLM_BACKEND == "bedrock" else _call_hf
    total_attempts = 3
    for attempt_no in range(1, total_attempts + 1):
        try:
            raw = backend(prompt)
            if os.getenv("DEBUG_LLM"):
                print(f" [LLM RAW] {raw[:200]}")
            # Strip Qwen3 <think>...</think> reasoning blocks
            without_reasoning = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
            return without_reasoning or raw
        except Exception as e:
            if attempt_no == total_attempts:
                print(f" [LLM ERROR] {e} (gave up after 3 attempts)")
                return ""
            delay = 3 * attempt_no
            print(f" [LLM RETRY] {e} — retrying in {delay}s")
            time.sleep(delay)
    return ""
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def format_obs(obs: dict) -> str:
    """Format observation as text for LLM — shows raw signals prominently.

    Visible accounts are sorted by ``fake_risk_score`` (highest first) and
    split into three groups: already flagged, unflagged with strong fake
    signals ("FLAG THESE"), and the rest ("CLEAN ACCOUNTS"). An account has
    strong signals when ``shared_ip_count >= 5``, or when
    ``photo_reuse_score >= 0.50`` and ``bio_template_score >= 0.40`` and
    ``hub_legitimacy_score < 0.70``. The hub condition mirrors decision
    rule 3 of the system prompt, so a high-legitimacy (celebrity) account is
    not listed under "FLAG THESE" on photo/bio signals alone.

    Args:
        obs: Observation dict from the environment; every key is optional.

    Returns:
        The multi-line prompt text.
    """
    lines = []
    lines.append(f"TASK: {obs.get('task','?').upper()} | Steps remaining: {obs.get('steps_remaining','?')}")

    flagged = obs.get("flagged_ids", [])
    lines.append(f"Flagged ({len(flagged)}/10): {', '.join(flagged) if flagged else 'none'}")

    suspects = obs.get("suspect_ids", [])
    inspected = obs.get("inspected_ids", [])
    uninspected_suspects = [s for s in suspects if s not in inspected]
    if uninspected_suspects:
        lines.append(f"*** SUSPECTS (uninspected) → INSPECT THESE FIRST: {', '.join(uninspected_suspects)} ***")

    accounts = obs.get("visible_accounts", [])
    if accounts:
        # Split: unflagged accounts that should be flagged vs rest
        unflagged_suspicious = []
        flagged_accs = []
        clean_accs = []
        for a in sorted(accounts, key=lambda x: x.get("fake_risk_score", 0), reverse=True):
            aid = a.get("account_id", "?")
            shares_gang_ip = a.get("shared_ip_count", 0) >= 5
            templated_profile = (
                a.get("photo_reuse_score", 0) >= 0.50
                and a.get("bio_template_score", 0) >= 0.40
                # rule 3 of the prompt exempts high-legitimacy hubs
                and a.get("hub_legitimacy_score", 0) < 0.70
            )
            if aid in flagged:
                flagged_accs.append(a)
            elif shares_gang_ip or templated_profile:
                unflagged_suspicious.append(a)
            else:
                clean_accs.append(a)

        if unflagged_suspicious:
            lines.append(f"\n!!! ACTION NEEDED — FLAG THESE ({len(unflagged_suspicious)} accounts with strong fake signals):")
            for a in unflagged_suspicious:
                aid = a.get("account_id", "?")
                lines.append(f" → FLAG {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} ip_shared={a.get('shared_ip_count',0)} hub={a.get('hub_legitimacy_score',0):.2f}")

        if flagged_accs:
            lines.append(f"\nALREADY FLAGGED ({len(flagged_accs)}):")
            # Only the first five are listed to keep the prompt short
            for a in flagged_accs[:5]:
                lines.append(f" ✓ {a.get('account_id','?')}")

        if clean_accs:
            lines.append(f"\nCLEAN ACCOUNTS ({len(clean_accs)}):")
            for a in clean_accs[:5]:
                aid = a.get("account_id", "?")
                hub = a.get("hub_legitimacy_score", 0)
                hub_mark = " [CELEBRITY]" if hub > 0.70 else ""
                lines.append(f" {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} hub={hub:.2f}{hub_mark}")

    visible = obs.get("visible_account_ids", [])
    uninspected = [i for i in visible if i not in inspected]
    if uninspected:
        lines.append(f"\nUninspected IDs ({len(uninspected)}): {', '.join(uninspected[:8])}{'...' if len(uninspected) > 8 else ''}")

    lines.append(f"\nMessage: {obs.get('message', '')}")
    return "\n".join(lines)
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def parse_action(llm_text: str, obs: dict) -> dict:
    """Turn raw LLM output into an action dict for POST /step.

    The first line that begins with a recognised verb wins. For the verbs
    that take a target (INSPECT, FLAG, INVESTIGATE_NETWORK, UNFLAG) only the
    first whitespace-separated token after the verb is used as the account
    id, so a chatty reply such as ``FLAG acc_0001 because ...`` still yields
    ``acc_0001`` instead of the whole remainder of the line. A line that is
    exactly ``SUBMIT`` (case-insensitive) produces a submit action.

    If no line matches, fall back to inspecting the first suspect that has
    not been inspected, then the first uninspected visible account, and
    finally to submitting.

    Args:
        llm_text: Raw text returned by the model.
        obs: Current observation; only ``suspect_ids``, ``inspected_ids``
            and ``visible_account_ids`` are read, each defaulting to empty.

    Returns:
        ``{"action_type": ..., "account_id": ...}`` for targeted verbs, or
        ``{"action_type": "submit"}``.
    """
    targeted_verbs = ("INSPECT ", "FLAG ", "INVESTIGATE_NETWORK ", "UNFLAG ")
    for line in llm_text.split("\n"):
        line = line.strip()
        upper = line.upper()
        if upper.startswith(targeted_verbs):
            # The line is stripped and starts with "<VERB> ", so at least two
            # tokens exist; anything after the id is commentary and dropped.
            verb, target = line.split()[:2]
            return {"action_type": verb.lower(), "account_id": target.lower()}
        if upper == "SUBMIT":
            return {"action_type": "submit"}

    # Fallback: inspect first uninspected suspect, then any uninspected account
    inspected = obs.get("inspected_ids", [])
    for candidates in (obs.get("suspect_ids", []), obs.get("visible_account_ids", [])):
        for account_id in candidates:
            if account_id not in inspected:
                return {"action_type": "inspect", "account_id": account_id}
    return {"action_type": "submit"}
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
# ---------------------------------------------------------------------------
|
| 288 |
+
# Test phases
|
| 289 |
+
# ---------------------------------------------------------------------------
|
| 290 |
+
|
| 291 |
+
def test_endpoints(base_url: str) -> bool:
    """Phase 0: probe every required endpoint once and report the outcome.

    Each probe is issued in order (the /reset, /step and /grader calls rely
    on that order). A probe passes when the HTTP helper returns without
    raising; any exception is printed and counted as a failure.

    Returns:
        True when every probe succeeded, False otherwise.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("PHASE 0: Endpoint Verification")
    print(rule)

    # (HTTP method, path, POST body, whether a GET reply is parsed as JSON)
    probes = (
        ("GET", "/health", None, True),
        ("GET", "/tasks", None, True),
        ("GET", "/metadata", None, True),
        ("GET", "/schema", None, True),
        ("GET", "/web", None, False),  # returns HTML, not JSON
        ("POST", "/reset", {"task": "easy", "seed": 0}, True),
        ("GET", "/state", None, True),
        ("POST", "/step", {"action_type": "inspect", "account_id": "acc_0000"}, True),
        ("POST", "/step", {"action_type": "submit"}, True),
        ("GET", "/grader", None, True),
        ("POST", "/mcp", {"jsonrpc": "2.0", "method": "tools/list", "id": 1}, True),
        ("POST", "/baseline", None, True),
    )

    failed = False
    for verb, route, payload, wants_json in probes:
        target = f"{base_url}{route}"
        try:
            if verb != "GET":
                http_post(target, payload)
            else:
                http_get(target, expect_json=wants_json)
            print(f" ✓ {verb} {route}")
        except Exception as e:
            print(f" ✗ {verb} {route} — {e}")
            failed = True

    return not failed
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def test_baseline_stability(base_url: str) -> bool:
    """Phase 1: call POST /baseline three times and compare the scores.

    Returns:
        True when all three ``scores`` payloads are equal, False otherwise.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("PHASE 1: Baseline Stability (3 runs)")
    print(rule)

    runs = []
    for run_no in range(1, 4):
        reply = http_post(f"{base_url}/baseline")
        tiers = reply["scores"]
        runs.append(tiers)
        print(f" Run {run_no}: easy={tiers['easy']:.4f} medium={tiers['medium']:.4f} hard={tiers['hard']:.4f}")

    # Every run is compared against the first one
    reference = runs[0]
    identical = all(run == reference for run in runs)
    if not identical:
        print(" ✗ SCORES DIFFER — baseline is non-deterministic!")
    else:
        print(" ✓ All 3 runs identical — baseline is deterministic")
    return identical
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
def test_llm_agent(base_url: str, task: str, seed: int = 0, max_steps: int = 200) -> float:
    """Phase 2: Run an LLM agent against one task (simulates judge's Nemotron run).

    Resets the environment, then repeatedly formats the observation, asks the
    LLM for an action, and posts it to /step until the server reports
    ``done``. Finally the score is read from GET /grader.

    Args:
        base_url: Environment server URL without a trailing slash.
        task: Task name passed to /reset.
        seed: Seed passed to /reset.
        max_steps: Upper bound on actions sent in one episode. The system
            prompt describes FLAG as costing no step, so a model that keeps
            issuing such actions might never drive the episode to ``done``.
            Once the bound is reached a SUBMIT is forced. The default is
            meant as a generous safety cap.

    Returns:
        The ``score`` field of the /grader response.
    """
    _model = f"Bedrock/{BEDROCK_MODEL_ID}" if LLM_BACKEND == "bedrock" else MODEL_NAME
    print(f"\n --- LLM Agent: task={task}, seed={seed}, model={_model} ---")

    # Reset
    reset_resp = http_post(f"{base_url}/reset", {"task": task, "seed": seed})
    obs = reset_resp.get("observation", reset_resp)
    done = reset_resp.get("done", False)

    step_num = 0
    while not done:
        if step_num >= max_steps:
            # Safety valve: end the episode ourselves instead of looping forever
            print(f" [ABORT] episode still running after {max_steps} actions — forcing SUBMIT")
            http_post(f"{base_url}/step", {"action_type": "submit"})
            break
        step_num += 1
        prompt = format_obs(obs)
        llm_text = call_llm(prompt)
        action = parse_action(llm_text, obs)

        # `or ''` keeps a None account id from being printed as "None"
        action_str = f"{action['action_type'].upper()} {action.get('account_id') or ''}".strip()

        step_resp = http_post(f"{base_url}/step", action)
        obs = step_resp.get("observation", step_resp)
        done = step_resp.get("done", False)
        reward = step_resp.get("reward")

        flagged_n = len(obs.get("flagged_ids", []))
        suspects_n = len(obs.get("suspect_ids", []))
        steps_left = obs.get("steps_remaining", "?")

        print(f" Step {step_num:2d}: {action_str:35s} flagged={flagged_n}/10 suspects={suspects_n} steps_left={steps_left}")

        if done and reward is not None:
            msg = step_resp.get("message", obs.get("message", ""))
            print(f" → Episode ended: {msg[:100]}")

    # Get grader score
    grader = http_get(f"{base_url}/grader")
    score = grader["score"]
    print(f" ★ GRADER SCORE: {score:.4f}")
    return score
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
def test_llm_all_tasks(base_url: str) -> Dict[str, float]:
    """Phase 2: run the LLM agent once (seed 0) on easy, medium and hard.

    Returns:
        Mapping of task name to the grader score returned for that task.
    """
    rule = "=" * 60
    print("\n" + rule)
    if LLM_BACKEND == "bedrock":
        label = f"Bedrock/{BEDROCK_MODEL_ID}"
    else:
        label = MODEL_NAME
    print(f"PHASE 2: LLM Agent Evaluation (model={label})")
    print(rule)

    results = {
        tier: test_llm_agent(base_url, task=tier, seed=0)
        for tier in ("easy", "medium", "hard")
    }

    print(f"\n Summary: easy={results['easy']:.4f} medium={results['medium']:.4f} hard={results['hard']:.4f}")
    return results
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
def test_variance(base_url: str, seeds: List[int] = [0, 1, 2, 3, 4]) -> None:
|
| 406 |
+
"""Phase 3: Score variance check (multiple seeds per task)."""
|
| 407 |
+
print("\n" + "="*60)
|
| 408 |
+
print(f"PHASE 3: Score Variance (seeds={seeds})")
|
| 409 |
+
print("="*60)
|
| 410 |
+
|
| 411 |
+
for task in ["easy", "medium", "hard"]:
|
| 412 |
+
task_scores = []
|
| 413 |
+
for seed in seeds:
|
| 414 |
+
score = test_llm_agent(base_url, task=task, seed=seed)
|
| 415 |
+
task_scores.append(score)
|
| 416 |
+
|
| 417 |
+
mean = sum(task_scores) / len(task_scores)
|
| 418 |
+
variance = sum((s - mean) ** 2 for s in task_scores) / len(task_scores)
|
| 419 |
+
print(f"\n {task}: scores={[f'{s:.3f}' for s in task_scores]} mean={mean:.4f} var={variance:.6f}")
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
# ---------------------------------------------------------------------------
|
| 423 |
+
# Main
|
| 424 |
+
# ---------------------------------------------------------------------------
|
| 425 |
+
|
| 426 |
+
if __name__ == "__main__":
    import argparse

    # ---- Command-line options -------------------------------------------
    cli = argparse.ArgumentParser(description="Judge Evaluation Simulator for GraphStrike")
    cli.add_argument("--url", required=True, help="Environment server URL")
    cli.add_argument("--bedrock", action="store_true", help="Use AWS Bedrock instead of HF router")
    cli.add_argument("--endpoints-only", action="store_true", help="Only test endpoints (no LLM)")
    cli.add_argument("--skip-variance", action="store_true", help="Skip variance check (faster)")
    cli.add_argument("--seeds", type=int, default=3, help="Number of seeds for variance check")
    args = cli.parse_args()

    # Rebind the module-level backend switch; the LLM helpers read it at call time.
    if args.bedrock:
        LLM_BACKEND = "bedrock"

    use_bedrock = LLM_BACKEND == "bedrock"
    base = args.url.rstrip("/")
    model_display = f"Bedrock/{BEDROCK_MODEL_ID}" if use_bedrock else MODEL_NAME
    have_credentials = bool(HF_TOKEN or os.getenv("AWS_ACCESS_KEY_ID"))

    print("GraphStrike Judge Evaluation Simulator")
    print(f"Target: {base}")
    print(f"Backend: {LLM_BACKEND}")
    print(f"Model: {model_display}")
    print(f"Token: {'set' if have_credentials else 'NOT SET'}")

    # Phase 0: every required endpoint must answer before anything else runs.
    endpoints_ok = test_endpoints(base)
    if not endpoints_ok:
        print("\n✗ Endpoint check failed. Fix before proceeding.")
        sys.exit(1)

    # Phase 1: baseline determinism (result is reported, not enforced).
    test_baseline_stability(base)

    if args.endpoints_only:
        print("\n✓ Endpoint-only mode — skipping LLM tests.")
        sys.exit(0)

    # The LLM phases need credentials for whichever backend is selected.
    if use_bedrock and not os.getenv("AWS_ACCESS_KEY_ID"):
        print("\n✗ AWS_ACCESS_KEY_ID not set. Cannot run Bedrock LLM tests.")
        sys.exit(1)
    if not use_bedrock and not HF_TOKEN:
        print("\n✗ HF_TOKEN not set. Cannot run LLM agent tests.")
        print(" export HF_TOKEN='hf_...' OR use --bedrock with AWS creds")
        sys.exit(1)

    # Phase 2: one LLM run per task.
    scores = test_llm_all_tasks(base)

    # Phase 3: repeat across seeds unless the caller opted out.
    if not args.skip_variance:
        test_variance(base, seeds=list(range(args.seeds)))

    print("\n" + "=" * 60)
    print("EVALUATION COMPLETE")
    print("=" * 60)
|
eval-models/nvidia_test_judge_eval.py
ADDED
|
@@ -0,0 +1,478 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Judge Evaluation Simulator
|
| 4 |
+
==========================
|
| 5 |
+
|
| 6 |
+
Simulates EXACTLY how hackathon judges will evaluate your environment:
|
| 7 |
+
|
| 8 |
+
1. Baseline re-run: POST /baseline → verify scores are stable
|
| 9 |
+
2. Standard Open LLM agent: Run an LLM (via HF router) against all 3 tasks
|
| 10 |
+
3. Score variance check: Run same task multiple seeds, check variance
|
| 11 |
+
|
| 12 |
+
USAGE:
|
| 13 |
+
# Against live HF Space (requires HF_TOKEN):
|
| 14 |
+
export HF_TOKEN="hf_..."
|
| 15 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space
|
| 16 |
+
|
| 17 |
+
# Against local server:
|
| 18 |
+
export HF_TOKEN="hf_..."
|
| 19 |
+
python test_judge_eval.py --url http://localhost:7860
|
| 20 |
+
|
| 21 |
+
# Choose model (default: nvidia.nemotron-super-3-120b):
|
| 22 |
+
export MODEL_NAME="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
|
| 23 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space
|
| 24 |
+
|
| 25 |
+
# Just test endpoints (no LLM needed):
|
| 26 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space --endpoints-only
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
import json
|
| 30 |
+
import os
|
| 31 |
+
import sys
|
| 32 |
+
import time
|
| 33 |
+
import urllib.request
|
| 34 |
+
from typing import Dict, List, Optional
|
| 35 |
+
|
| 36 |
+
# ---------------------------------------------------------------------------
|
| 37 |
+
# Config from env vars (same as judges will set)
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
|
| 40 |
+
# API key for the OpenAI-compatible endpoint; API_KEY is accepted as a fallback name.
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
# Base URL handed to the OpenAI client on the HF-router path.
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
# Model identifier sent on the HF-router path.
# NOTE(review): this default has the same form as the Bedrock model id below
# rather than an "org/name" id — confirm the router accepts it.
MODEL_NAME = os.getenv("MODEL_NAME", "nvidia.nemotron-super-3-120b")

# AWS Bedrock config (use --bedrock flag)
LLM_BACKEND = "hf"  # "hf" or "bedrock"
# Model id passed as modelId on the Bedrock path.
BEDROCK_MODEL_ID = os.getenv("BEDROCK_MODEL_ID", "nvidia.nemotron-super-3-120b")

# System prompt sent with every LLM request on both backends. It defines the
# action grammar (INSPECT / FLAG / SUBMIT) that parse_action() expects back.
SYSTEM_PROMPT = """You are an AI detective finding 10 coordinated fake accounts in a social network.

ACTIONS (reply with exactly ONE line):
- INSPECT acc_XXXX — reveal profile (costs 1 step)
- FLAG acc_XXXX — mark as fake (FREE, no step cost, triggers suspect cascade)
- SUBMIT — end episode, get scored

DECISION RULES (apply top-to-bottom, first match wins):
1. If suspect_ids lists accounts you haven't inspected → INSPECT the first one
2. If ANY profiled account has shared_ip_count >= 5 and is NOT flagged → FLAG it immediately
3. If ANY profiled account has photo_reuse >= 0.50 AND bio_template >= 0.40 and hub < 0.70 and NOT flagged → FLAG it
4. If ANY profiled account has fake_risk_score >= 0.30 and hub < 0.70 and NOT flagged → FLAG it
5. If there are uninspected visible accounts and steps > 3 → INSPECT the next one
6. If you have 10 flags OR steps <= 3 → SUBMIT

IMPORTANT:
- FLAG is FREE (costs 0 steps) — flag aggressively when you see suspicious signals
- After each FLAG, new suspects appear — always inspect suspects before other accounts
- hub_legitimacy_score > 0.70 means celebrity — do NOT flag
- shared_ip_count >= 5 is the strongest gang signal (all 10 share one IP)
- Do NOT re-inspect already inspected accounts

Reply with EXACTLY one line, nothing else:
FLAG acc_XXXX
INSPECT acc_XXXX
SUBMIT"""
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# ---------------------------------------------------------------------------
|
| 77 |
+
# HTTP helpers
|
| 78 |
+
# ---------------------------------------------------------------------------
|
| 79 |
+
|
| 80 |
+
def _retry(fn, retries=3, backoff=3):
|
| 81 |
+
"""Retry a function on network errors."""
|
| 82 |
+
for attempt in range(retries):
|
| 83 |
+
try:
|
| 84 |
+
return fn()
|
| 85 |
+
except OSError as e:
|
| 86 |
+
if attempt == retries - 1:
|
| 87 |
+
raise
|
| 88 |
+
wait = backoff * (attempt + 1)
|
| 89 |
+
print(f" [RETRY] Network error: {e} — retrying in {wait}s ({attempt+1}/{retries})")
|
| 90 |
+
time.sleep(wait)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def http_post(url: str, body: Optional[dict] = None) -> dict:
    """POST ``body`` as JSON to ``url`` and return the decoded JSON response.

    A missing or empty ``body`` is sent as ``{}``. The request uses a 120 s
    timeout and is wrapped in ``_retry``, so ``OSError`` failures are retried.
    """
    def _send():
        payload = json.dumps(body if body else {}).encode()
        request = urllib.request.Request(
            url,
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=120) as response:
            raw = response.read()
        return json.loads(raw)

    return _retry(_send)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def http_get(url: str, expect_json: bool = True) -> dict:
    """GET ``url`` (120 s timeout, retried via ``_retry``).

    With ``expect_json`` true the body is decoded as JSON and returned.
    Otherwise the body is not parsed and a small summary dict with the HTTP
    status (``_status``) and body length in bytes (``_body_len``) is returned.
    """
    def _fetch():
        with urllib.request.urlopen(url, timeout=120) as response:
            payload = response.read()
            if expect_json:
                return json.loads(payload)
            return {"_status": response.status, "_body_len": len(payload)}

    return _retry(_fetch)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# ---------------------------------------------------------------------------
|
| 117 |
+
# LLM call via OpenAI-compatible API
|
| 118 |
+
# ---------------------------------------------------------------------------
|
| 119 |
+
|
| 120 |
+
def _call_hf(prompt: str) -> str:
    """Send ``prompt`` to the OpenAI-compatible endpoint and return the reply text.

    Uses ``API_BASE_URL`` / ``HF_TOKEN`` for the client and ``MODEL_NAME`` as
    the model, with ``SYSTEM_PROMPT`` as the system message. A missing reply
    content is returned as an empty string; the text is whitespace-stripped.
    """
    from openai import OpenAI

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        temperature=0.3,
        max_tokens=256,
    )
    reply = completion.choices[0].message.content
    return (reply or "").strip()
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _call_bedrock(prompt: str) -> str:
    """Send ``prompt`` to AWS Bedrock and return the reply text, stripped.

    Uses ``converse()`` when the boto3 client exposes that method; otherwise
    uses ``invoke_model()`` with an OpenAI-style chat payload. The choice is
    made by attribute presence only — a failing ``converse()`` call is not
    retried through ``invoke_model()``.
    """
    import boto3

    runtime = boto3.client(
        service_name="bedrock-runtime",
        region_name=os.getenv("AWS_DEFAULT_REGION", "us-east-1"),
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    )

    # Preferred path: Converse API, when this boto3 build provides it.
    if hasattr(runtime, "converse"):
        reply = runtime.converse(
            modelId=BEDROCK_MODEL_ID,
            messages=[{"role": "user", "content": [{"text": prompt}]}],
            system=[{"text": SYSTEM_PROMPT}],
            inferenceConfig={"maxTokens": 256, "temperature": 0.3},
        )
        return reply["output"]["message"]["content"][0]["text"].strip()

    # Otherwise: raw invoke_model with a chat-completions style request body.
    request_body = json.dumps({
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 256,
        "temperature": 0.3,
    })
    raw = runtime.invoke_model(
        modelId=BEDROCK_MODEL_ID,
        contentType="application/json",
        accept="application/json",
        body=request_body,
    )
    parsed = json.loads(raw["body"].read())

    # The response shape varies; probe the known layouts in a fixed order.
    if "choices" in parsed:
        return parsed["choices"][0]["message"]["content"].strip()
    if "content" in parsed:
        content = parsed["content"]
        if not isinstance(content, list):
            return str(content).strip()
        return content[0].get("text", "").strip()
    if "output" in parsed:
        return parsed["output"].get("text", "").strip()
    # Unknown layout: return the stringified payload.
    return str(parsed).strip()
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def call_llm(prompt: str) -> str:
    """Query the configured LLM backend, retrying on any exception.

    Dispatches to ``_call_bedrock`` when ``LLM_BACKEND`` is "bedrock" and to
    ``_call_hf`` otherwise. Closed ``<think>...</think>`` blocks are removed
    from the reply; if nothing remains, the unmodified reply is returned
    instead. Three attempts are made, sleeping 3 s then 6 s between them.
    After the third failure an empty string is returned.
    """
    import re

    backend_call = _call_bedrock if LLM_BACKEND == "bedrock" else _call_hf
    max_attempts = 3

    for attempt_no in range(1, max_attempts + 1):
        try:
            raw = backend_call(prompt)
            if os.getenv("DEBUG_LLM"):
                print(f" [LLM RAW] {raw[:200]}")
            # Drop reasoning blocks so only the action line is left.
            without_reasoning = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
            return without_reasoning or raw
        except Exception as exc:
            if attempt_no == max_attempts:
                print(f" [LLM ERROR] {exc} (gave up after 3 attempts)")
                return ""
            pause = 3 * attempt_no
            print(f" [LLM RETRY] {exc} — retrying in {pause}s")
            time.sleep(pause)
    return ""
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def format_obs(obs: dict) -> str:
    """Render an observation dict as the text prompt shown to the LLM.

    The output contains, in order: a task/steps header, the flagged-account
    summary, any suspects not yet inspected, the profiled accounts split into
    "flag these" / "already flagged" / "clean" sections (sorted by descending
    ``fake_risk_score``; the last two sections list at most five entries), up
    to eight uninspected visible IDs, and the environment message.
    """
    flagged = obs.get("flagged_ids", [])
    inspected = obs.get("inspected_ids", [])

    out = [
        f"TASK: {obs.get('task','?').upper()} | Steps remaining: {obs.get('steps_remaining','?')}",
        f"Flagged ({len(flagged)}/10): {', '.join(flagged) if flagged else 'none'}",
    ]

    pending_suspects = [sid for sid in obs.get("suspect_ids", []) if sid not in inspected]
    if pending_suspects:
        out.append(f"*** SUSPECTS (uninspected) → INSPECT THESE FIRST: {', '.join(pending_suspects)} ***")

    profiles = obs.get("visible_accounts", [])
    if profiles:
        to_flag, already_flagged, clean = [], [], []
        ranked = sorted(profiles, key=lambda acc: acc.get("fake_risk_score", 0), reverse=True)
        for acc in ranked:
            if acc.get("account_id", "?") in flagged:
                already_flagged.append(acc)
            elif acc.get("shared_ip_count", 0) >= 5:
                to_flag.append(acc)
            # NOTE(review): hub_legitimacy_score is not consulted here, although
            # SYSTEM_PROMPT rule 3 requires hub < 0.70 — confirm this is intended.
            elif acc.get("photo_reuse_score", 0) >= 0.50 and acc.get("bio_template_score", 0) >= 0.40:
                to_flag.append(acc)
            else:
                clean.append(acc)

        if to_flag:
            out.append(f"\n!!! ACTION NEEDED — FLAG THESE ({len(to_flag)} accounts with strong fake signals):")
            out.extend(
                f" → FLAG {acc.get('account_id', '?')}: risk={acc.get('fake_risk_score',0):.3f} photo={acc.get('photo_reuse_score',0):.2f} bio={acc.get('bio_template_score',0):.2f} ip_shared={acc.get('shared_ip_count',0)} hub={acc.get('hub_legitimacy_score',0):.2f}"
                for acc in to_flag
            )

        if already_flagged:
            out.append(f"\nALREADY FLAGGED ({len(already_flagged)}):")
            out.extend(f" ✓ {acc.get('account_id','?')}" for acc in already_flagged[:5])

        if clean:
            out.append(f"\nCLEAN ACCOUNTS ({len(clean)}):")
            for acc in clean[:5]:
                hub_score = acc.get("hub_legitimacy_score", 0)
                celebrity_tag = " [CELEBRITY]" if hub_score > 0.70 else ""
                out.append(
                    f" {acc.get('account_id', '?')}: risk={acc.get('fake_risk_score',0):.3f} photo={acc.get('photo_reuse_score',0):.2f} bio={acc.get('bio_template_score',0):.2f} hub={hub_score:.2f}{celebrity_tag}"
                )

    not_yet_inspected = [vid for vid in obs.get("visible_account_ids", []) if vid not in inspected]
    if not_yet_inspected:
        ellipsis = "..." if len(not_yet_inspected) > 8 else ""
        out.append(f"\nUninspected IDs ({len(not_yet_inspected)}): {', '.join(not_yet_inspected[:8])}{ellipsis}")

    out.append(f"\nMessage: {obs.get('message', '')}")
    return "\n".join(out)
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def parse_action(llm_text: str, obs: dict) -> dict:
    """Parse LLM output to an action dict.

    Scans ``llm_text`` line by line and returns the first recognised action:

    * ``INSPECT|FLAG|INVESTIGATE_NETWORK|UNFLAG <id>`` →
      ``{"action_type": <verb lowercased>, "account_id": <id lowercased>}``.
      Only the first whitespace-delimited token after the verb is used as the
      id, with surrounding punctuation/markdown removed, so replies such as
      ``FLAG acc_0012 — shares an IP`` or ``FLAG acc_0012.`` still yield
      ``acc_0012`` instead of an id the server cannot match.
    * ``SUBMIT`` (optionally followed by ``.`` or ``!``) → ``{"action_type": "submit"}``.

    If no line matches, falls back to inspecting the first uninspected
    suspect, then the first uninspected visible account, and finally submits.

    Args:
        llm_text: Raw model reply (may be empty).
        obs: Current observation; ``suspect_ids``, ``inspected_ids`` and
            ``visible_account_ids`` are read for the fallback.
    """
    targeted_verbs = ("INSPECT ", "FLAG ", "INVESTIGATE_NETWORK ", "UNFLAG ")
    id_noise = "`'\"*.,;:!()[]{}<>"

    for raw_line in llm_text.split("\n"):
        line = raw_line.strip()
        upper = line.upper()
        if upper.startswith(targeted_verbs):
            tokens = line.split()
            account_id = tokens[1].strip(id_noise).lower() if len(tokens) > 1 else ""
            if not account_id:
                # Verb without a usable id (e.g. "FLAG ..."): keep scanning.
                continue
            return {"action_type": tokens[0].lower(), "account_id": account_id}
        if upper.rstrip(".!") == "SUBMIT":
            return {"action_type": "submit"}

    # Fallback: inspect first uninspected suspect, then any uninspected account.
    inspected = obs.get("inspected_ids", [])
    for candidates in (obs.get("suspect_ids", []), obs.get("visible_account_ids", [])):
        for account in candidates:
            if account not in inspected:
                return {"action_type": "inspect", "account_id": account}
    return {"action_type": "submit"}
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
# ---------------------------------------------------------------------------
|
| 288 |
+
# Test phases
|
| 289 |
+
# ---------------------------------------------------------------------------
|
| 290 |
+
|
| 291 |
+
def test_endpoints(base_url: str) -> bool:
    """Phase 0: probe every required endpoint once and report the outcome.

    Requests are issued in a fixed order (the episode is reset before
    ``/state`` and ``/step`` are exercised). Each probe prints a ✓ or ✗ line;
    any exception counts as a failure. Returns ``True`` only if every probe
    succeeded.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("PHASE 0: Endpoint Verification")
    print(rule)

    # (method, path, JSON body for POST, whether the response must be JSON)
    probes = (
        ("GET", "/health", None, True),
        ("GET", "/tasks", None, True),
        ("GET", "/metadata", None, True),
        ("GET", "/schema", None, True),
        ("GET", "/web", None, False),  # returns HTML, not JSON
        ("POST", "/reset", {"task": "easy", "seed": 0}, True),
        ("GET", "/state", None, True),
        ("POST", "/step", {"action_type": "inspect", "account_id": "acc_0000"}, True),
        ("POST", "/step", {"action_type": "submit"}, True),
        ("GET", "/grader", None, True),
        ("POST", "/mcp", {"jsonrpc": "2.0", "method": "tools/list", "id": 1}, True),
        ("POST", "/baseline", None, True),
    )

    failures = 0
    for verb, route, payload, wants_json in probes:
        target = f"{base_url}{route}"
        try:
            if verb == "GET":
                http_get(target, expect_json=wants_json)
            else:
                http_post(target, payload)
            print(f" ✓ {verb} {route}")
        except Exception as exc:
            print(f" ✗ {verb} {route} — {exc}")
            failures += 1

    return failures == 0
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def test_baseline_stability(base_url: str) -> bool:
    """Phase 1: call ``POST /baseline`` three times and compare the scores.

    Prints the easy/medium/hard scores of each run and returns ``True`` when
    all three ``scores`` payloads are equal, ``False`` otherwise.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("PHASE 1: Baseline Stability (3 runs)")
    print(rule)

    runs = []
    for run_no in range(1, 4):
        payload = http_post(f"{base_url}/baseline")
        run_scores = payload["scores"]
        runs.append(run_scores)
        print(f" Run {run_no}: easy={run_scores['easy']:.4f} medium={run_scores['medium']:.4f} hard={run_scores['hard']:.4f}")

    # Deterministic means every run matches the first one exactly.
    reference = runs[0]
    stable = all(run == reference for run in runs)
    if not stable:
        print(" ✗ SCORES DIFFER — baseline is non-deterministic!")
    else:
        print(" ✓ All 3 runs identical — baseline is deterministic")
    return stable
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
def test_llm_agent(base_url: str, task: str, seed: int = 0, max_actions: int = 200) -> float:
    """Phase 2: Run an LLM agent against one task and return the grader score.

    Resets the environment, then repeatedly formats the observation, asks the
    LLM for an action, and posts it to ``/step`` until the server reports
    ``done``. Finally reads the score from ``GET /grader``.

    Args:
        base_url: Environment server URL (no trailing slash).
        task: Task name passed to ``/reset`` (e.g. "easy").
        seed: Seed passed to ``/reset``.
        max_actions: Safety cap on the number of LLM-chosen actions. The
            system prompt states that FLAG costs no step, so a model that
            keeps repeating free actions would otherwise never end the
            episode. Once the cap is reached a SUBMIT is sent and the loop
            stops.

    Returns:
        The ``score`` field of the ``/grader`` response.
    """
    _model = f"Bedrock/{BEDROCK_MODEL_ID}" if LLM_BACKEND == "bedrock" else MODEL_NAME
    print(f"\n --- LLM Agent: task={task}, seed={seed}, model={_model} ---")

    # Reset
    reset_resp = http_post(f"{base_url}/reset", {"task": task, "seed": seed})
    # Some responses nest the observation; others are the observation itself.
    obs = reset_resp.get("observation", reset_resp)
    done = reset_resp.get("done", False)

    step_num = 0
    while not done:
        step_num += 1
        forced_submit = step_num > max_actions
        if forced_submit:
            print(f" [ABORT] No termination after {max_actions} actions — forcing SUBMIT")
            action = {"action_type": "submit"}
        else:
            prompt = format_obs(obs)
            llm_text = call_llm(prompt)
            action = parse_action(llm_text, obs)

        action_str = f"{action['action_type'].upper()} {action.get('account_id', '')}".strip()

        step_resp = http_post(f"{base_url}/step", action)
        obs = step_resp.get("observation", step_resp)
        done = step_resp.get("done", False)
        reward = step_resp.get("reward")

        flagged_n = len(obs.get("flagged_ids", []))
        suspects_n = len(obs.get("suspect_ids", []))
        steps_left = obs.get("steps_remaining", "?")

        print(f" Step {step_num:2d}: {action_str:35s} flagged={flagged_n}/10 suspects={suspects_n} steps_left={steps_left}")

        if done and reward is not None:
            # Guard against an explicit null message before slicing.
            msg = step_resp.get("message", obs.get("message", "")) or ""
            print(f" → Episode ended: {msg[:100]}")

        if forced_submit:
            # Never loop again after a forced SUBMIT, even if `done` was not reported.
            break

    # Get grader score
    grader = http_get(f"{base_url}/grader")
    score = grader["score"]
    print(f" ★ GRADER SCORE: {score:.4f}")
    return score
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
def test_llm_all_tasks(base_url: str) -> Dict[str, float]:
    """Phase 2: evaluate the LLM agent once on each difficulty level.

    Runs ``test_llm_agent`` with seed 0 for the "easy", "medium" and "hard"
    tasks (in that order), prints a one-line summary, and returns the grader
    scores keyed by task name.
    """
    rule = "=" * 60
    # Label shown in the banner depends on which backend is active.
    if LLM_BACKEND == "bedrock":
        model_label = f"Bedrock/{BEDROCK_MODEL_ID}"
    else:
        model_label = MODEL_NAME

    print("\n" + rule)
    print(f"PHASE 2: LLM Agent Evaluation (model={model_label})")
    print(rule)

    results = {
        difficulty: test_llm_agent(base_url, task=difficulty, seed=0)
        for difficulty in ("easy", "medium", "hard")
    }

    print(f"\n Summary: easy={results['easy']:.4f} medium={results['medium']:.4f} hard={results['hard']:.4f}")
    return results
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
def test_variance(base_url: str, seeds: List[int] = [0, 1, 2, 3, 4]) -> None:
|
| 406 |
+
"""Phase 3: Score variance check (multiple seeds per task)."""
|
| 407 |
+
print("\n" + "="*60)
|
| 408 |
+
print(f"PHASE 3: Score Variance (seeds={seeds})")
|
| 409 |
+
print("="*60)
|
| 410 |
+
|
| 411 |
+
for task in ["easy", "medium", "hard"]:
|
| 412 |
+
task_scores = []
|
| 413 |
+
for seed in seeds:
|
| 414 |
+
score = test_llm_agent(base_url, task=task, seed=seed)
|
| 415 |
+
task_scores.append(score)
|
| 416 |
+
|
| 417 |
+
mean = sum(task_scores) / len(task_scores)
|
| 418 |
+
variance = sum((s - mean) ** 2 for s in task_scores) / len(task_scores)
|
| 419 |
+
print(f"\n {task}: scores={[f'{s:.3f}' for s in task_scores]} mean={mean:.4f} var={variance:.6f}")
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
# ---------------------------------------------------------------------------
|
| 423 |
+
# Main
|
| 424 |
+
# ---------------------------------------------------------------------------
|
| 425 |
+
|
| 426 |
+
if __name__ == "__main__":
    import argparse

    # ---- Command-line options -------------------------------------------
    cli = argparse.ArgumentParser(description="Judge Evaluation Simulator for GraphStrike")
    cli.add_argument("--url", required=True, help="Environment server URL")
    cli.add_argument("--bedrock", action="store_true", help="Use AWS Bedrock instead of HF router")
    cli.add_argument("--endpoints-only", action="store_true", help="Only test endpoints (no LLM)")
    cli.add_argument("--skip-variance", action="store_true", help="Skip variance check (faster)")
    cli.add_argument("--seeds", type=int, default=3, help="Number of seeds for variance check")
    args = cli.parse_args()

    # Rebind the module-level backend switch; the LLM helpers read it at call time.
    if args.bedrock:
        LLM_BACKEND = "bedrock"

    use_bedrock = LLM_BACKEND == "bedrock"
    base = args.url.rstrip("/")
    model_display = f"Bedrock/{BEDROCK_MODEL_ID}" if use_bedrock else MODEL_NAME
    have_credentials = bool(HF_TOKEN or os.getenv("AWS_ACCESS_KEY_ID"))

    print("GraphStrike Judge Evaluation Simulator")
    print(f"Target: {base}")
    print(f"Backend: {LLM_BACKEND}")
    print(f"Model: {model_display}")
    print(f"Token: {'set' if have_credentials else 'NOT SET'}")

    # Phase 0: every required endpoint must answer before anything else runs.
    endpoints_ok = test_endpoints(base)
    if not endpoints_ok:
        print("\n✗ Endpoint check failed. Fix before proceeding.")
        sys.exit(1)

    # Phase 1: baseline determinism (result is reported, not enforced).
    test_baseline_stability(base)

    if args.endpoints_only:
        print("\n✓ Endpoint-only mode — skipping LLM tests.")
        sys.exit(0)

    # The LLM phases need credentials for whichever backend is selected.
    if use_bedrock and not os.getenv("AWS_ACCESS_KEY_ID"):
        print("\n✗ AWS_ACCESS_KEY_ID not set. Cannot run Bedrock LLM tests.")
        sys.exit(1)
    if not use_bedrock and not HF_TOKEN:
        print("\n✗ HF_TOKEN not set. Cannot run LLM agent tests.")
        print(" export HF_TOKEN='hf_...' OR use --bedrock with AWS creds")
        sys.exit(1)

    # Phase 2: one LLM run per task.
    scores = test_llm_all_tasks(base)

    # Phase 3: repeat across seeds unless the caller opted out.
    if not args.skip_variance:
        test_variance(base, seeds=list(range(args.seeds)))

    print("\n" + "=" * 60)
    print("EVALUATION COMPLETE")
    print("=" * 60)
|
eval-models/qwen_test_judge_eval.py
ADDED
|
@@ -0,0 +1,478 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Judge Evaluation Simulator
|
| 4 |
+
==========================
|
| 5 |
+
|
| 6 |
+
Simulates EXACTLY how hackathon judges will evaluate your environment:
|
| 7 |
+
|
| 8 |
+
1. Baseline re-run: POST /baseline → verify scores are stable
|
| 9 |
+
2. Standard Open LLM agent: Run an LLM (via HF router) against all 3 tasks
|
| 10 |
+
3. Score variance check: Run same task multiple seeds, check variance
|
| 11 |
+
|
| 12 |
+
USAGE:
|
| 13 |
+
# Against live HF Space (requires HF_TOKEN):
|
| 14 |
+
export HF_TOKEN="hf_..."
|
| 15 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space
|
| 16 |
+
|
| 17 |
+
# Against local server:
|
| 18 |
+
export HF_TOKEN="hf_..."
|
| 19 |
+
python test_judge_eval.py --url http://localhost:7860
|
| 20 |
+
|
| 21 |
+
# Choose model (default: Qwen/Qwen2.5-72B-Instruct):
|
| 22 |
+
export MODEL_NAME="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
|
| 23 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space
|
| 24 |
+
|
| 25 |
+
# Just test endpoints (no LLM needed):
|
| 26 |
+
python test_judge_eval.py --url https://pandago-graphstrike.hf.space --endpoints-only
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
import json
|
| 30 |
+
import os
|
| 31 |
+
import sys
|
| 32 |
+
import time
|
| 33 |
+
import urllib.request
|
| 34 |
+
from typing import Dict, List, Optional
|
| 35 |
+
|
| 36 |
+
# ---------------------------------------------------------------------------
|
| 37 |
+
# Config from env vars (same as judges will set)
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
|
| 40 |
+
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 41 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 42 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
|
| 43 |
+
|
| 44 |
+
# AWS Bedrock config (use --bedrock flag)
|
| 45 |
+
LLM_BACKEND = "hf" # "hf" or "bedrock"
|
| 46 |
+
BEDROCK_MODEL_ID = os.getenv("BEDROCK_MODEL_ID", "qwen.qwen3-next-80b-a3b")
|
| 47 |
+
|
| 48 |
+
SYSTEM_PROMPT = """You are an AI detective finding 10 coordinated fake accounts in a social network.
|
| 49 |
+
|
| 50 |
+
ACTIONS (reply with exactly ONE line):
|
| 51 |
+
- INSPECT acc_XXXX — reveal profile (costs 1 step)
|
| 52 |
+
- FLAG acc_XXXX — mark as fake (FREE, no step cost, triggers suspect cascade)
|
| 53 |
+
- SUBMIT — end episode, get scored
|
| 54 |
+
|
| 55 |
+
DECISION RULES (apply top-to-bottom, first match wins):
|
| 56 |
+
1. If suspect_ids lists accounts you haven't inspected → INSPECT the first one
|
| 57 |
+
2. If ANY profiled account has shared_ip_count >= 5 and is NOT flagged → FLAG it immediately
|
| 58 |
+
3. If ANY profiled account has photo_reuse >= 0.50 AND bio_template >= 0.40 and hub < 0.70 and NOT flagged → FLAG it
|
| 59 |
+
4. If ANY profiled account has fake_risk_score >= 0.30 and hub < 0.70 and NOT flagged → FLAG it
|
| 60 |
+
5. If there are uninspected visible accounts and steps > 3 → INSPECT the next one
|
| 61 |
+
6. If you have 10 flags OR steps <= 3 → SUBMIT
|
| 62 |
+
|
| 63 |
+
IMPORTANT:
|
| 64 |
+
- FLAG is FREE (costs 0 steps) — flag aggressively when you see suspicious signals
|
| 65 |
+
- After each FLAG, new suspects appear — always inspect suspects before other accounts
|
| 66 |
+
- hub_legitimacy_score > 0.70 means celebrity — do NOT flag
|
| 67 |
+
- shared_ip_count >= 5 is the strongest gang signal (all 10 share one IP)
|
| 68 |
+
- Do NOT re-inspect already inspected accounts
|
| 69 |
+
|
| 70 |
+
Reply with EXACTLY one line, nothing else:
|
| 71 |
+
FLAG acc_XXXX
|
| 72 |
+
INSPECT acc_XXXX
|
| 73 |
+
SUBMIT"""
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# ---------------------------------------------------------------------------
|
| 77 |
+
# HTTP helpers
|
| 78 |
+
# ---------------------------------------------------------------------------
|
| 79 |
+
|
| 80 |
+
def _retry(fn, retries=3, backoff=3):
|
| 81 |
+
"""Retry a function on network errors."""
|
| 82 |
+
for attempt in range(retries):
|
| 83 |
+
try:
|
| 84 |
+
return fn()
|
| 85 |
+
except OSError as e:
|
| 86 |
+
if attempt == retries - 1:
|
| 87 |
+
raise
|
| 88 |
+
wait = backoff * (attempt + 1)
|
| 89 |
+
print(f" [RETRY] Network error: {e} — retrying in {wait}s ({attempt+1}/{retries})")
|
| 90 |
+
time.sleep(wait)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def http_post(url: str, body: Optional[dict] = None) -> dict:
    """POST ``body`` as JSON to ``url`` and return the decoded JSON reply.

    A missing or falsy ``body`` is sent as an empty JSON object. The request
    is issued through ``_retry`` with a 120-second timeout per attempt.
    """
    payload = json.dumps(body if body else {}).encode()

    def _send():
        request = urllib.request.Request(
            url,
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=120) as response:
            return json.loads(response.read())

    return _retry(_send)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def http_get(url: str, expect_json: bool = True) -> dict:
    """GET ``url`` and return the decoded JSON reply.

    When ``expect_json`` is false the body is not parsed; a small summary dict
    with the HTTP status (``_status``) and body length (``_body_len``) is
    returned instead. The request is issued through ``_retry`` with a
    120-second timeout per attempt.
    """
    def _fetch():
        with urllib.request.urlopen(url, timeout=120) as response:
            raw = response.read()
            if expect_json:
                return json.loads(raw)
            return {"_status": response.status, "_body_len": len(raw)}

    return _retry(_fetch)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# ---------------------------------------------------------------------------
|
| 117 |
+
# LLM calls (HF router via OpenAI-compatible API, or AWS Bedrock)
|
| 118 |
+
# ---------------------------------------------------------------------------
|
| 119 |
+
|
| 120 |
+
def _call_hf(prompt: str) -> str:
    """Send ``prompt`` to the configured chat model and return its reply text.

    Uses the OpenAI client pointed at ``API_BASE_URL`` with ``HF_TOKEN`` as
    the API key. ``SYSTEM_PROMPT`` is sent as the system message. A missing
    reply is returned as an empty string; surrounding whitespace is stripped.
    """
    from openai import OpenAI

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    completion = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN).chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        temperature=0.3,
        max_tokens=256,
    )
    reply = completion.choices[0].message.content
    return (reply or "").strip()
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _first_text_block(blocks) -> str:
    """Return the stripped text of the first content block that carries text."""
    for block in blocks or []:
        if isinstance(block, dict) and block.get("text"):
            return block["text"].strip()
    return ""


def _call_bedrock(prompt: str) -> str:
    """Call the LLM via AWS Bedrock and return its reply text.

    Tries the ``converse()`` API when the client exposes it and otherwise
    falls back to ``invoke_model()``. Region and credentials are read from
    ``AWS_DEFAULT_REGION`` (default ``us-east-1``), ``AWS_ACCESS_KEY_ID`` and
    ``AWS_SECRET_ACCESS_KEY``.

    Args:
        prompt: User message; ``SYSTEM_PROMPT`` is sent as the system message.

    Returns:
        The stripped reply text, or an empty string when the response holds
        no text block.
    """
    import boto3

    client = boto3.client(
        service_name="bedrock-runtime",
        region_name=os.getenv("AWS_DEFAULT_REGION", "us-east-1"),
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    )

    # Try converse API first (boto3 >= 1.34.x)
    if hasattr(client, "converse"):
        resp = client.converse(
            modelId=BEDROCK_MODEL_ID,
            messages=[{"role": "user", "content": [{"text": prompt}]}],
            system=[{"text": SYSTEM_PROMPT}],
            inferenceConfig={"maxTokens": 256, "temperature": 0.3},
        )
        # The first content block is not guaranteed to be a text block, so
        # scan for one instead of indexing content[0]["text"] directly.
        return _first_text_block(resp["output"]["message"]["content"])

    # Fallback: invoke_model (works with all boto3 versions)
    body = json.dumps({
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 256,
        "temperature": 0.3,
    })
    resp = client.invoke_model(
        modelId=BEDROCK_MODEL_ID,
        contentType="application/json",
        accept="application/json",
        body=body,
    )
    result = json.loads(resp["body"].read())

    # Handle both OpenAI-style and Bedrock-native response formats
    if "choices" in result:
        # content may be null in OpenAI-style payloads; treat it as empty.
        return (result["choices"][0]["message"]["content"] or "").strip()
    if "content" in result:
        content = result["content"]
        if isinstance(content, list):
            return _first_text_block(content)
        return str(content).strip()
    if "output" in result:
        return result["output"].get("text", "").strip()
    return str(result).strip()
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def call_llm(prompt: str) -> str:
    """Call the LLM with retries and return its cleaned reply.

    Uses ``_call_bedrock`` when ``LLM_BACKEND`` is ``"bedrock"`` and
    ``_call_hf`` otherwise. Up to three attempts are made, waiting 3s and
    then 6s between them. ``<think>...</think>`` blocks are removed from the
    reply; if nothing remains after removal, the raw reply is returned.

    Returns:
        The cleaned reply, or an empty string when all three attempts fail.
    """
    # Imported once per call rather than inside the retry loop's try block,
    # so an import problem is not misreported as an LLM failure.
    import re

    think_block = re.compile(r"<think>.*?</think>", flags=re.DOTALL)
    fn = _call_bedrock if LLM_BACKEND == "bedrock" else _call_hf
    for attempt in range(3):
        try:
            raw = fn(prompt)
            if os.getenv("DEBUG_LLM"):
                print(f" [LLM RAW] {raw[:200]}")
            # Strip Qwen3 <think>...</think> reasoning blocks
            cleaned = think_block.sub("", raw).strip()
            return cleaned if cleaned else raw
        except Exception as e:
            # Deliberate best-effort boundary: any backend failure is
            # retried, and the final one degrades to an empty reply.
            if attempt == 2:
                print(f" [LLM ERROR] {e} (gave up after 3 attempts)")
                return ""
            wait = 3 * (attempt + 1)
            print(f" [LLM RETRY] {e} — retrying in {wait}s")
            time.sleep(wait)
    return ""
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def format_obs(obs: dict) -> str:
    """Format an observation as text for the LLM, with raw signals up front.

    The output lists, in order: task and remaining steps, flagged ids,
    uninspected suspects, visible accounts split into "flag these",
    "already flagged" and "clean" buckets, uninspected visible ids, and the
    environment message.

    An unflagged account lands in the "flag these" bucket when
    ``shared_ip_count >= 5``, or when ``photo_reuse_score >= 0.50`` and
    ``bio_template_score >= 0.40`` and ``hub_legitimacy_score < 0.70``. The
    hub guard mirrors rule 3 of ``SYSTEM_PROMPT`` so that celebrity hubs are
    not presented as must-flag accounts.

    Args:
        obs: Observation dict from the environment; every key is optional.

    Returns:
        The prompt text, lines joined with newlines.
    """
    lines = [
        f"TASK: {obs.get('task','?').upper()} | Steps remaining: {obs.get('steps_remaining','?')}"
    ]

    flagged = obs.get("flagged_ids", [])
    lines.append(f"Flagged ({len(flagged)}/10): {', '.join(flagged) if flagged else 'none'}")

    # Sets keep the repeated membership tests below constant-time.
    flagged_set = set(flagged)
    inspected_set = set(obs.get("inspected_ids", []))

    uninspected_suspects = [s for s in obs.get("suspect_ids", []) if s not in inspected_set]
    if uninspected_suspects:
        lines.append(f"*** SUSPECTS (uninspected) → INSPECT THESE FIRST: {', '.join(uninspected_suspects)} ***")

    accounts = obs.get("visible_accounts", [])
    if accounts:
        # Split: unflagged accounts that should be flagged vs rest
        unflagged_suspicious = []
        flagged_accs = []
        clean_accs = []
        for a in sorted(accounts, key=lambda x: x.get("fake_risk_score", 0), reverse=True):
            if a.get("account_id", "?") in flagged_set:
                flagged_accs.append(a)
                continue
            shares_ip = a.get("shared_ip_count", 0) >= 5
            templated_profile = (
                a.get("photo_reuse_score", 0) >= 0.50
                and a.get("bio_template_score", 0) >= 0.40
                # Same hub guard as SYSTEM_PROMPT rule 3.
                and a.get("hub_legitimacy_score", 0) < 0.70
            )
            if shares_ip or templated_profile:
                unflagged_suspicious.append(a)
            else:
                clean_accs.append(a)

        if unflagged_suspicious:
            lines.append(f"\n!!! ACTION NEEDED — FLAG THESE ({len(unflagged_suspicious)} accounts with strong fake signals):")
            for a in unflagged_suspicious:
                aid = a.get("account_id", "?")
                lines.append(f" → FLAG {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} ip_shared={a.get('shared_ip_count',0)} hub={a.get('hub_legitimacy_score',0):.2f}")

        if flagged_accs:
            lines.append(f"\nALREADY FLAGGED ({len(flagged_accs)}):")
            # Only the first five are listed to keep the prompt short.
            for a in flagged_accs[:5]:
                lines.append(f" ✓ {a.get('account_id','?')}")

        if clean_accs:
            lines.append(f"\nCLEAN ACCOUNTS ({len(clean_accs)}):")
            for a in clean_accs[:5]:
                aid = a.get("account_id", "?")
                hub = a.get("hub_legitimacy_score", 0)
                hub_mark = " [CELEBRITY]" if hub > 0.70 else ""
                lines.append(f" {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} hub={hub:.2f}{hub_mark}")

    visible = obs.get("visible_account_ids", [])
    uninspected = [i for i in visible if i not in inspected_set]
    if uninspected:
        lines.append(f"\nUninspected IDs ({len(uninspected)}): {', '.join(uninspected[:8])}{'...' if len(uninspected) > 8 else ''}")

    lines.append(f"\nMessage: {obs.get('message', '')}")
    return "\n".join(lines)
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def parse_action(llm_text: str, obs: dict) -> dict:
    """Parse LLM output into an action dict.

    The first line that looks like an action wins. ``INSPECT``, ``FLAG``,
    ``INVESTIGATE_NETWORK`` and ``UNFLAG`` take the next whitespace-separated
    token as the account id; surrounding punctuation and markdown markers are
    stripped, so ``FLAG acc_0001.`` and ``INSPECT acc_0007 — reason`` both
    yield a clean id. ``SUBMIT`` is accepted with trailing ``.`` or ``!``.

    When no line parses, the fallback inspects the first uninspected suspect,
    then the first uninspected visible account, and finally submits.

    Args:
        llm_text: Raw model reply; may span several lines.
        obs: Current observation, used only for the fallback.

    Returns:
        ``{"action_type": ..., "account_id": ...}`` for account actions
        (``account_id`` is ``None`` if no usable id follows the verb), or
        ``{"action_type": "submit"}``.
    """
    verbs = ("INSPECT ", "FLAG ", "INVESTIGATE_NETWORK ", "UNFLAG ")
    wrappers = "*`"
    id_junk = "`'\"*.,;:!()[]<>"

    for raw_line in llm_text.split("\n"):
        # Drop markdown emphasis/backticks that models often wrap actions in.
        line = raw_line.strip().strip(wrappers).strip()
        upper = line.upper()
        if upper.startswith(verbs):
            tokens = line.split()
            account_id = tokens[1].strip(id_junk).lower() if len(tokens) > 1 else ""
            return {"action_type": tokens[0].lower(), "account_id": account_id or None}
        if upper.rstrip(".!") == "SUBMIT":
            return {"action_type": "submit"}

    # Fallback: inspect first uninspected suspect
    inspected = set(obs.get("inspected_ids", []))
    for s in obs.get("suspect_ids", []):
        if s not in inspected:
            return {"action_type": "inspect", "account_id": s}
    for v in obs.get("visible_account_ids", []):
        if v not in inspected:
            return {"action_type": "inspect", "account_id": v}
    return {"action_type": "submit"}
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
# ---------------------------------------------------------------------------
|
| 288 |
+
# Test phases
|
| 289 |
+
# ---------------------------------------------------------------------------
|
| 290 |
+
|
| 291 |
+
def test_endpoints(base_url: str) -> bool:
    """Phase 0: hit every required endpoint once and report the outcome.

    Each check is issued in the listed order; a check passes when the request
    completes without raising. Returns ``True`` only if every check passed.
    """
    rule = "="*60
    print("\n" + rule)
    print("PHASE 0: Endpoint Verification")
    print(rule)

    # (method, path, JSON body for POST, whether a GET reply is JSON)
    checks = [
        ("GET", "/health", None, True),
        ("GET", "/tasks", None, True),
        ("GET", "/metadata", None, True),
        ("GET", "/schema", None, True),
        ("GET", "/web", None, False),  # returns HTML, not JSON
        ("POST", "/reset", {"task": "easy", "seed": 0}, True),
        ("GET", "/state", None, True),
        ("POST", "/step", {"action_type": "inspect", "account_id": "acc_0000"}, True),
        ("POST", "/step", {"action_type": "submit"}, True),
        ("GET", "/grader", None, True),
        ("POST", "/mcp", {"jsonrpc": "2.0", "method": "tools/list", "id": 1}, True),
        ("POST", "/baseline", None, True),
    ]

    failures = 0
    for method, path, body, expect_json in checks:
        target = f"{base_url}{path}"
        try:
            if method != "GET":
                http_post(target, body)
            else:
                http_get(target, expect_json=expect_json)
        except Exception as e:
            print(f" ✗ {method} {path} — {e}")
            failures += 1
        else:
            print(f" ✓ {method} {path}")

    return failures == 0
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def test_baseline_stability(base_url: str) -> bool:
    """Phase 1: POST /baseline three times and require identical scores.

    Returns ``True`` when the ``scores`` payload of all three runs compares
    equal, ``False`` otherwise.
    """
    rule = "="*60
    print("\n" + rule)
    print("PHASE 1: Baseline Stability (3 runs)")
    print(rule)

    runs = []
    for run_no in range(1, 4):
        scores = http_post(f"{base_url}/baseline")["scores"]
        runs.append(scores)
        print(f" Run {run_no}: easy={scores['easy']:.4f} medium={scores['medium']:.4f} hard={scores['hard']:.4f}")

    # Stable means no run differs from the first one.
    reference = runs[0]
    stable = not any(other != reference for other in runs)
    if not stable:
        print(" ✗ SCORES DIFFER — baseline is non-deterministic!")
    else:
        print(" ✓ All 3 runs identical — baseline is deterministic")
    return stable
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
def test_llm_agent(base_url: str, task: str, seed: int = 0, max_actions: int = 500) -> float:
    """Phase 2: Run an LLM agent against one task and return the grader score.

    The episode is reset with ``task`` and ``seed``. Each turn the observation
    is formatted, sent to the LLM, parsed into an action and posted to
    ``/step`` until the server reports ``done``.

    Args:
        base_url: Environment server URL without a trailing slash.
        task: Task name passed to ``/reset``.
        seed: Seed passed to ``/reset``.
        max_actions: Safety cap on LLM-chosen actions. The system prompt
            states that FLAG costs no steps, so a model that keeps flagging
            could otherwise keep the episode open indefinitely. Once the cap
            is reached a SUBMIT is forced and the loop ends.

    Returns:
        The ``score`` field reported by ``GET /grader``.
    """
    _model = f"Bedrock/{BEDROCK_MODEL_ID}" if LLM_BACKEND == "bedrock" else MODEL_NAME
    print(f"\n --- LLM Agent: task={task}, seed={seed}, model={_model} ---")

    # Reset
    reset_resp = http_post(f"{base_url}/reset", {"task": task, "seed": seed})
    # Some responses nest the observation, others are the observation itself.
    obs = reset_resp.get("observation", reset_resp)
    done = reset_resp.get("done", False)

    step_num = 0
    while not done:
        step_num += 1
        forced = step_num > max_actions
        if forced:
            print(f" [GUARD] No terminal state after {max_actions} actions — forcing SUBMIT")
            action = {"action_type": "submit"}
        else:
            prompt = format_obs(obs)
            llm_text = call_llm(prompt)
            action = parse_action(llm_text, obs)

        # account_id may be absent or None; never print the literal "None".
        action_str = f"{action['action_type'].upper()} {action.get('account_id') or ''}".strip()

        step_resp = http_post(f"{base_url}/step", action)
        obs = step_resp.get("observation", step_resp)
        done = step_resp.get("done", False)
        reward = step_resp.get("reward")

        flagged_n = len(obs.get("flagged_ids", []))
        suspects_n = len(obs.get("suspect_ids", []))
        steps_left = obs.get("steps_remaining", "?")

        print(f" Step {step_num:2d}: {action_str:35s} flagged={flagged_n}/10 suspects={suspects_n} steps_left={steps_left}")

        if done and reward is not None:
            msg = step_resp.get("message", obs.get("message", "")) or ""
            print(f" → Episode ended: {msg[:100]}")

        if forced:
            # Never loop again after a forced SUBMIT, whatever the server says.
            break

    # Get grader score
    grader = http_get(f"{base_url}/grader")
    score = grader["score"]
    print(f" ★ GRADER SCORE: {score:.4f}")
    return score
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
def test_llm_all_tasks(base_url: str) -> Dict[str, float]:
    """Phase 2: run the LLM agent once per task (seed 0) and collect scores.

    Returns a dict mapping ``"easy"``, ``"medium"`` and ``"hard"`` to the
    grader score of the corresponding run.
    """
    model_label = MODEL_NAME if LLM_BACKEND != "bedrock" else f"Bedrock/{BEDROCK_MODEL_ID}"
    rule = "="*60
    print("\n" + rule)
    print(f"PHASE 2: LLM Agent Evaluation (model={model_label})")
    print(rule)

    scores = {
        task: test_llm_agent(base_url, task=task, seed=0)
        for task in ("easy", "medium", "hard")
    }

    print(f"\n Summary: easy={scores['easy']:.4f} medium={scores['medium']:.4f} hard={scores['hard']:.4f}")
    return scores
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
def test_variance(base_url: str, seeds: List[int] = [0, 1, 2, 3, 4]) -> None:
|
| 406 |
+
"""Phase 3: Score variance check (multiple seeds per task)."""
|
| 407 |
+
print("\n" + "="*60)
|
| 408 |
+
print(f"PHASE 3: Score Variance (seeds={seeds})")
|
| 409 |
+
print("="*60)
|
| 410 |
+
|
| 411 |
+
for task in ["easy", "medium", "hard"]:
|
| 412 |
+
task_scores = []
|
| 413 |
+
for seed in seeds:
|
| 414 |
+
score = test_llm_agent(base_url, task=task, seed=seed)
|
| 415 |
+
task_scores.append(score)
|
| 416 |
+
|
| 417 |
+
mean = sum(task_scores) / len(task_scores)
|
| 418 |
+
variance = sum((s - mean) ** 2 for s in task_scores) / len(task_scores)
|
| 419 |
+
print(f"\n {task}: scores={[f'{s:.3f}' for s in task_scores]} mean={mean:.4f} var={variance:.6f}")
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
# ---------------------------------------------------------------------------
|
| 423 |
+
# Main
|
| 424 |
+
# ---------------------------------------------------------------------------
|
| 425 |
+
|
| 426 |
+
if __name__ == "__main__":
    # Script entry point: parse flags, then run phases 0-3 in order.
    import argparse

    parser = argparse.ArgumentParser(description="Judge Evaluation Simulator for GraphStrike")
    parser.add_argument("--url", required=True, help="Environment server URL")
    parser.add_argument("--bedrock", action="store_true", help="Use AWS Bedrock instead of HF router")
    parser.add_argument("--endpoints-only", action="store_true", help="Only test endpoints (no LLM)")
    parser.add_argument("--skip-variance", action="store_true", help="Skip variance check (faster)")
    parser.add_argument("--seeds", type=int, default=3, help="Number of seeds for variance check")
    args = parser.parse_args()

    # Reject an unusable seed count up front instead of failing in Phase 3,
    # after the costly LLM runs of Phase 2 have already happened.
    if args.seeds < 1 and not (args.endpoints_only or args.skip_variance):
        parser.error("--seeds must be at least 1")

    if args.bedrock:
        # Module-level rebinding: this block runs at module scope, so the
        # functions above see the new backend.
        LLM_BACKEND = "bedrock"

    base = args.url.rstrip("/")
    model_display = f"Bedrock/{BEDROCK_MODEL_ID}" if LLM_BACKEND == "bedrock" else MODEL_NAME
    # Report the credential that the selected backend actually needs.
    if LLM_BACKEND == "bedrock":
        has_credentials = bool(os.getenv("AWS_ACCESS_KEY_ID"))
    else:
        has_credentials = bool(HF_TOKEN)

    print("GraphStrike Judge Evaluation Simulator")
    print(f"Target: {base}")
    print(f"Backend: {LLM_BACKEND}")
    print(f"Model: {model_display}")
    print(f"Token: {'set' if has_credentials else 'NOT SET'}")

    # Phase 0: Endpoints
    if not test_endpoints(base):
        print("\n✗ Endpoint check failed. Fix before proceeding.")
        sys.exit(1)

    # Phase 1: Baseline stability
    test_baseline_stability(base)

    if args.endpoints_only:
        print("\n✓ Endpoint-only mode — skipping LLM tests.")
        sys.exit(0)

    if not has_credentials:
        if LLM_BACKEND == "bedrock":
            print("\n✗ AWS_ACCESS_KEY_ID not set. Cannot run Bedrock LLM tests.")
        else:
            print("\n✗ HF_TOKEN not set. Cannot run LLM agent tests.")
            print(" export HF_TOKEN='hf_...' OR use --bedrock with AWS creds")
        sys.exit(1)

    # Phase 2: LLM on all tasks
    test_llm_all_tasks(base)

    # Phase 3: Variance
    if not args.skip_variance:
        test_variance(base, seeds=list(range(args.seeds)))

    print("\n" + "="*60)
    print("EVALUATION COMPLETE")
    print("="*60)
|
images/big.png
ADDED
|
Git LFS Details
|
images/logo.png
ADDED
|
images/plot.png
ADDED
|
images/table1.png
ADDED
|
images/table2.png
ADDED
|
images/table3.png
ADDED
|
judge_log.txt
ADDED
|
@@ -0,0 +1,513 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$ python3 test_judge_eval.py --url https://pandago-graphstrike.hf.space --bedrock
|
| 2 |
+
GraphStrike Judge Evaluation Simulator
|
| 3 |
+
Target: https://pandago-graphstrike.hf.space
|
| 4 |
+
Backend: bedrock
|
| 5 |
+
Model: Bedrock/qwen.qwen3-next-80b-a3b
|
| 6 |
+
Token: set
|
| 7 |
+
|
| 8 |
+
============================================================
|
| 9 |
+
PHASE 0: Endpoint Verification
|
| 10 |
+
============================================================
|
| 11 |
+
✓ GET /health
|
| 12 |
+
✓ GET /tasks
|
| 13 |
+
✓ GET /metadata
|
| 14 |
+
✓ GET /schema
|
| 15 |
+
✓ GET /web
|
| 16 |
+
✓ POST /reset
|
| 17 |
+
✓ GET /state
|
| 18 |
+
✓ POST /step
|
| 19 |
+
✓ POST /step
|
| 20 |
+
✓ GET /grader
|
| 21 |
+
✓ POST /mcp
|
| 22 |
+
✓ POST /baseline
|
| 23 |
+
|
| 24 |
+
============================================================
|
| 25 |
+
PHASE 1: Baseline Stability (3 runs)
|
| 26 |
+
============================================================
|
| 27 |
+
Run 1: easy=0.9100 medium=0.9060 hard=0.9038
|
| 28 |
+
Run 2: easy=0.9100 medium=0.9060 hard=0.9038
|
| 29 |
+
Run 3: easy=0.9100 medium=0.9060 hard=0.9038
|
| 30 |
+
✓ All 3 runs identical — baseline is deterministic
|
| 31 |
+
|
| 32 |
+
============================================================
|
| 33 |
+
PHASE 2: LLM Agent Evaluation (model=Bedrock/qwen.qwen3-next-80b-a3b)
|
| 34 |
+
============================================================
|
| 35 |
+
|
| 36 |
+
--- LLM Agent: task=easy, seed=0, model=Bedrock/qwen.qwen3-next-80b-a3b ---
|
| 37 |
+
Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
|
| 38 |
+
Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
|
| 39 |
+
Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
|
| 40 |
+
Step 4: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=28
|
| 41 |
+
Step 5: INSPECT acc_0001 flagged=2/10 suspects=8 steps_left=27
|
| 42 |
+
Step 6: FLAG acc_0001 flagged=3/10 suspects=7 steps_left=27
|
| 43 |
+
Step 7: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=26
|
| 44 |
+
Step 8: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=26
|
| 45 |
+
Step 9: INSPECT acc_0012 flagged=4/10 suspects=6 steps_left=25
|
| 46 |
+
Step 10: FLAG acc_0012 flagged=5/10 suspects=5 steps_left=25
|
| 47 |
+
Step 11: INSPECT acc_0000 flagged=5/10 suspects=5 steps_left=24
|
| 48 |
+
Step 12: FLAG acc_0000 flagged=6/10 suspects=4 steps_left=24
|
| 49 |
+
Step 13: INSPECT acc_0027 flagged=6/10 suspects=4 steps_left=23
|
| 50 |
+
Step 14: FLAG acc_0027 flagged=7/10 suspects=3 steps_left=23
|
| 51 |
+
Step 15: INSPECT acc_0047 flagged=7/10 suspects=3 steps_left=22
|
| 52 |
+
Step 16: FLAG acc_0047 flagged=8/10 suspects=2 steps_left=22
|
| 53 |
+
Step 17: INSPECT acc_0007 flagged=8/10 suspects=2 steps_left=21
|
| 54 |
+
Step 18: FLAG acc_0007 flagged=9/10 suspects=1 steps_left=21
|
| 55 |
+
Step 19: INSPECT acc_0028 flagged=9/10 suspects=1 steps_left=20
|
| 56 |
+
Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
|
| 57 |
+
Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=20
|
| 58 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.90
|
| 59 |
+
★ GRADER SCORE: 0.9667
|
| 60 |
+
|
| 61 |
+
--- LLM Agent: task=medium, seed=0, model=Bedrock/qwen.qwen3-next-80b-a3b ---
|
| 62 |
+
Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
|
| 63 |
+
Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
|
| 64 |
+
Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
|
| 65 |
+
Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=46
|
| 66 |
+
Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=45
|
| 67 |
+
Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
|
| 68 |
+
Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
|
| 69 |
+
Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
|
| 70 |
+
Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
|
| 71 |
+
Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
|
| 72 |
+
Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=40
|
| 73 |
+
Step 12: FLAG acc_0181 flagged=2/10 suspects=8 steps_left=40
|
| 74 |
+
Step 13: INSPECT acc_0022 flagged=2/10 suspects=8 steps_left=39
|
| 75 |
+
Step 14: FLAG acc_0022 flagged=3/10 suspects=7 steps_left=39
|
| 76 |
+
Step 15: INSPECT acc_0092 flagged=3/10 suspects=7 steps_left=38
|
| 77 |
+
Step 16: INSPECT acc_0097 flagged=3/10 suspects=7 steps_left=37
|
| 78 |
+
Step 17: FLAG acc_0092 flagged=4/10 suspects=6 steps_left=37
|
| 79 |
+
Step 18: FLAG acc_0097 flagged=5/10 suspects=5 steps_left=37
|
| 80 |
+
Step 19: INSPECT acc_0187 flagged=5/10 suspects=5 steps_left=36
|
| 81 |
+
Step 20: FLAG acc_0187 flagged=6/10 suspects=4 steps_left=36
|
| 82 |
+
Step 21: INSPECT acc_0093 flagged=6/10 suspects=4 steps_left=35
|
| 83 |
+
Step 22: FLAG acc_0093 flagged=7/10 suspects=3 steps_left=35
|
| 84 |
+
Step 23: INSPECT acc_0172 flagged=7/10 suspects=3 steps_left=34
|
| 85 |
+
Step 24: FLAG acc_0172 flagged=8/10 suspects=2 steps_left=34
|
| 86 |
+
Step 25: INSPECT acc_0058 flagged=8/10 suspects=2 steps_left=33
|
| 87 |
+
Step 26: FLAG acc_0058 flagged=9/10 suspects=1 steps_left=33
|
| 88 |
+
Step 27: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=32
|
| 89 |
+
Step 28: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=32
|
| 90 |
+
Step 29: SUBMIT flagged=10/10 suspects=0 steps_left=32
|
| 91 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.82
|
| 92 |
+
★ GRADER SCORE: 0.9640
|
| 93 |
+
|
| 94 |
+
--- LLM Agent: task=hard, seed=0, model=Bedrock/qwen.qwen3-next-80b-a3b ---
|
| 95 |
+
Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
|
| 96 |
+
Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
|
| 97 |
+
Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
|
| 98 |
+
Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
|
| 99 |
+
Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
|
| 100 |
+
Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
|
| 101 |
+
Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
|
| 102 |
+
Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
|
| 103 |
+
Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
|
| 104 |
+
Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
|
| 105 |
+
Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
|
| 106 |
+
Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
|
| 107 |
+
Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
|
| 108 |
+
Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
|
| 109 |
+
Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
|
| 110 |
+
Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
|
| 111 |
+
Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
|
| 112 |
+
Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
|
| 113 |
+
Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
|
| 114 |
+
Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
|
| 115 |
+
Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
|
| 116 |
+
Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
|
| 117 |
+
Step 23: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=59
|
| 118 |
+
Step 24: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=58
|
| 119 |
+
Step 25: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=58
|
| 120 |
+
Step 26: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=57
|
| 121 |
+
Step 27: FLAG acc_0389 flagged=4/10 suspects=6 steps_left=57
|
| 122 |
+
Step 28: INSPECT acc_0160 flagged=4/10 suspects=6 steps_left=56
|
| 123 |
+
Step 29: FLAG acc_0160 flagged=5/10 suspects=5 steps_left=56
|
| 124 |
+
Step 30: INSPECT acc_0549 flagged=5/10 suspects=5 steps_left=55
|
| 125 |
+
Step 31: FLAG acc_0549 flagged=6/10 suspects=4 steps_left=55
|
| 126 |
+
Step 32: INSPECT acc_0658 flagged=6/10 suspects=4 steps_left=54
|
| 127 |
+
Step 33: FLAG acc_0658 flagged=7/10 suspects=3 steps_left=54
|
| 128 |
+
Step 34: INSPECT acc_0290 flagged=7/10 suspects=3 steps_left=53
|
| 129 |
+
Step 35: FLAG acc_0290 flagged=8/10 suspects=2 steps_left=53
|
| 130 |
+
Step 36: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=52
|
| 131 |
+
Step 37: INSPECT acc_0507 flagged=8/10 suspects=2 steps_left=51
|
| 132 |
+
Step 38: FLAG acc_0124 flagged=9/10 suspects=1 steps_left=51
|
| 133 |
+
Step 39: FLAG acc_0507 flagged=10/10 suspects=0 steps_left=51
|
| 134 |
+
Step 40: SUBMIT flagged=10/10 suspects=0 steps_left=51
|
| 135 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.71
|
| 136 |
+
★ GRADER SCORE: 0.9637
|
| 137 |
+
|
| 138 |
+
Summary: easy=0.9667 medium=0.9640 hard=0.9637
|
| 139 |
+
|
| 140 |
+
============================================================
|
| 141 |
+
PHASE 3: Score Variance (seeds=[0, 1, 2])
|
| 142 |
+
============================================================
|
| 143 |
+
|
| 144 |
+
--- LLM Agent: task=easy, seed=0, model=Bedrock/qwen.qwen3-next-80b-a3b ---
|
| 145 |
+
Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
|
| 146 |
+
Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
|
| 147 |
+
Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
|
| 148 |
+
Step 4: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=28
|
| 149 |
+
Step 5: INSPECT acc_0001 flagged=2/10 suspects=8 steps_left=27
|
| 150 |
+
Step 6: FLAG acc_0001 flagged=3/10 suspects=7 steps_left=27
|
| 151 |
+
Step 7: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=26
|
| 152 |
+
Step 8: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=26
|
| 153 |
+
Step 9: INSPECT acc_0012 flagged=4/10 suspects=6 steps_left=25
|
| 154 |
+
Step 10: FLAG acc_0012 flagged=5/10 suspects=5 steps_left=25
|
| 155 |
+
Step 11: INSPECT acc_0000 flagged=5/10 suspects=5 steps_left=24
|
| 156 |
+
Step 12: FLAG acc_0000 flagged=6/10 suspects=4 steps_left=24
|
| 157 |
+
Step 13: INSPECT acc_0027 flagged=6/10 suspects=4 steps_left=23
|
| 158 |
+
Step 14: FLAG acc_0027 flagged=7/10 suspects=3 steps_left=23
|
| 159 |
+
Step 15: INSPECT acc_0047 flagged=7/10 suspects=3 steps_left=22
|
| 160 |
+
Step 16: FLAG acc_0047 flagged=8/10 suspects=2 steps_left=22
|
| 161 |
+
Step 17: INSPECT acc_0007 flagged=8/10 suspects=2 steps_left=21
|
| 162 |
+
Step 18: FLAG acc_0007 flagged=9/10 suspects=1 steps_left=21
|
| 163 |
+
Step 19: INSPECT acc_0028 flagged=9/10 suspects=1 steps_left=20
|
| 164 |
+
Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
|
| 165 |
+
Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=20
|
| 166 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.90
|
| 167 |
+
★ GRADER SCORE: 0.9667
|
| 168 |
+
|
| 169 |
+
--- LLM Agent: task=easy, seed=1, model=Bedrock/qwen.qwen3-next-80b-a3b ---
|
| 170 |
+
Step 1: INSPECT acc_0034 flagged=0/10 suspects=0 steps_left=29
|
| 171 |
+
Step 2: INSPECT acc_0003 flagged=0/10 suspects=0 steps_left=28
|
| 172 |
+
Step 3: INSPECT acc_0049 flagged=0/10 suspects=0 steps_left=27
|
| 173 |
+
Step 4: INSPECT acc_0006 flagged=0/10 suspects=0 steps_left=26
|
| 174 |
+
Step 5: INSPECT acc_0047 flagged=0/10 suspects=0 steps_left=25
|
| 175 |
+
Step 6: FLAG acc_0047 flagged=1/10 suspects=9 steps_left=25
|
| 176 |
+
Step 7: INSPECT acc_0009 flagged=1/10 suspects=9 steps_left=24
|
| 177 |
+
Step 8: FLAG acc_0009 flagged=2/10 suspects=8 steps_left=24
|
| 178 |
+
Step 9: INSPECT acc_0046 flagged=2/10 suspects=8 steps_left=23
|
| 179 |
+
Step 10: FLAG acc_0046 flagged=3/10 suspects=7 steps_left=23
|
| 180 |
+
Step 11: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=22
|
| 181 |
+
Step 12: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=22
|
| 182 |
+
Step 13: INSPECT acc_0021 flagged=4/10 suspects=6 steps_left=21
|
| 183 |
+
Step 14: FLAG acc_0021 flagged=5/10 suspects=5 steps_left=21
|
| 184 |
+
Step 15: INSPECT acc_0002 flagged=5/10 suspects=5 steps_left=20
|
| 185 |
+
Step 16: FLAG acc_0002 flagged=6/10 suspects=4 steps_left=20
|
| 186 |
+
Step 17: INSPECT acc_0048 flagged=6/10 suspects=4 steps_left=19
|
| 187 |
+
Step 18: FLAG acc_0048 flagged=7/10 suspects=3 steps_left=19
|
| 188 |
+
Step 19: INSPECT acc_0029 flagged=7/10 suspects=3 steps_left=18
|
| 189 |
+
Step 20: FLAG acc_0029 flagged=8/10 suspects=2 steps_left=18
|
| 190 |
+
Step 21: INSPECT acc_0015 flagged=8/10 suspects=2 steps_left=17
|
| 191 |
+
Step 22: FLAG acc_0015 flagged=9/10 suspects=1 steps_left=17
|
| 192 |
+
Step 23: INSPECT acc_0005 flagged=9/10 suspects=1 steps_left=16
|
| 193 |
+
Step 24: FLAG acc_0005 flagged=10/10 suspects=0 steps_left=16
|
| 194 |
+
Step 25: SUBMIT flagged=10/10 suspects=0 steps_left=16
|
| 195 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.86
|
| 196 |
+
★ GRADER SCORE: 0.9533
|
| 197 |
+
|
| 198 |
+
--- LLM Agent: task=easy, seed=2, model=Bedrock/qwen.qwen3-next-80b-a3b ---
|
| 199 |
+
Step 1: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=29
|
| 200 |
+
Step 2: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=28
|
| 201 |
+
Step 3: INSPECT acc_0025 flagged=0/10 suspects=0 steps_left=27
|
| 202 |
+
Step 4: INSPECT acc_0026 flagged=0/10 suspects=0 steps_left=26
|
| 203 |
+
Step 5: INSPECT acc_0038 flagged=0/10 suspects=0 steps_left=25
|
| 204 |
+
Step 6: INSPECT acc_0029 flagged=0/10 suspects=0 steps_left=24
|
| 205 |
+
Step 7: FLAG acc_0029 flagged=1/10 suspects=9 steps_left=24
|
| 206 |
+
Step 8: INSPECT acc_0006 flagged=1/10 suspects=9 steps_left=23
|
| 207 |
+
Step 9: FLAG acc_0006 flagged=2/10 suspects=8 steps_left=23
|
| 208 |
+
Step 10: INSPECT acc_0033 flagged=2/10 suspects=8 steps_left=22
|
| 209 |
+
Step 11: FLAG acc_0033 flagged=3/10 suspects=7 steps_left=22
|
| 210 |
+
Step 12: INSPECT acc_0015 flagged=3/10 suspects=7 steps_left=21
|
| 211 |
+
Step 13: FLAG acc_0015 flagged=4/10 suspects=6 steps_left=21
|
| 212 |
+
Step 14: INSPECT acc_0022 flagged=4/10 suspects=6 steps_left=20
|
| 213 |
+
Step 15: FLAG acc_0022 flagged=5/10 suspects=5 steps_left=20
|
| 214 |
+
Step 16: INSPECT acc_0009 flagged=5/10 suspects=5 steps_left=19
|
| 215 |
+
Step 17: FLAG acc_0009 flagged=6/10 suspects=4 steps_left=19
|
| 216 |
+
Step 18: INSPECT acc_0004 flagged=6/10 suspects=4 steps_left=18
|
| 217 |
+
Step 19: FLAG acc_0004 flagged=7/10 suspects=3 steps_left=18
|
| 218 |
+
Step 20: INSPECT acc_0024 flagged=7/10 suspects=3 steps_left=17
|
| 219 |
+
Step 21: FLAG acc_0024 flagged=8/10 suspects=2 steps_left=17
|
| 220 |
+
Step 22: INSPECT acc_0049 flagged=8/10 suspects=2 steps_left=16
|
| 221 |
+
Step 23: FLAG acc_0049 flagged=9/10 suspects=1 steps_left=16
|
| 222 |
+
Step 24: INSPECT acc_0035 flagged=9/10 suspects=1 steps_left=15
|
| 223 |
+
Step 25: FLAG acc_0035 flagged=10/10 suspects=0 steps_left=15
|
| 224 |
+
Step 26: SUBMIT flagged=10/10 suspects=0 steps_left=15
|
| 225 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.85
|
| 226 |
+
★ GRADER SCORE: 0.9500
|
| 227 |
+
|
| 228 |
+
easy: scores=['0.967', '0.953', '0.950'] mean=0.9567 var=0.000052
|
| 229 |
+
|
| 230 |
+
--- LLM Agent: task=medium, seed=0, model=Bedrock/qwen.qwen3-next-80b-a3b ---
|
| 231 |
+
Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
|
| 232 |
+
Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
|
| 233 |
+
Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
|
| 234 |
+
Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=46
|
| 235 |
+
Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=45
|
| 236 |
+
Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
|
| 237 |
+
Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
|
| 238 |
+
Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
|
| 239 |
+
Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
|
| 240 |
+
Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
|
| 241 |
+
Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=40
|
| 242 |
+
Step 12: FLAG acc_0181 flagged=2/10 suspects=8 steps_left=40
|
| 243 |
+
Step 13: INSPECT acc_0022 flagged=2/10 suspects=8 steps_left=39
|
| 244 |
+
Step 14: FLAG acc_0022 flagged=3/10 suspects=7 steps_left=39
|
| 245 |
+
Step 15: INSPECT acc_0092 flagged=3/10 suspects=7 steps_left=38
|
| 246 |
+
Step 16: FLAG acc_0092 flagged=4/10 suspects=6 steps_left=38
|
| 247 |
+
Step 17: INSPECT acc_0097 flagged=4/10 suspects=6 steps_left=37
|
| 248 |
+
Step 18: FLAG acc_0097 flagged=5/10 suspects=5 steps_left=37
|
| 249 |
+
Step 19: INSPECT acc_0187 flagged=5/10 suspects=5 steps_left=36
|
| 250 |
+
Step 20: FLAG acc_0187 flagged=6/10 suspects=4 steps_left=36
|
| 251 |
+
Step 21: INSPECT acc_0093 flagged=6/10 suspects=4 steps_left=35
|
| 252 |
+
Step 22: FLAG acc_0093 flagged=7/10 suspects=3 steps_left=35
|
| 253 |
+
Step 23: INSPECT acc_0172 flagged=7/10 suspects=3 steps_left=34
|
| 254 |
+
Step 24: FLAG acc_0172 flagged=8/10 suspects=2 steps_left=34
|
| 255 |
+
Step 25: INSPECT acc_0058 flagged=8/10 suspects=2 steps_left=33
|
| 256 |
+
Step 26: FLAG acc_0058 flagged=9/10 suspects=1 steps_left=33
|
| 257 |
+
Step 27: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=32
|
| 258 |
+
Step 28: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=32
|
| 259 |
+
Step 29: SUBMIT flagged=10/10 suspects=0 steps_left=32
|
| 260 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.82
|
| 261 |
+
★ GRADER SCORE: 0.9640
|
| 262 |
+
|
| 263 |
+
--- LLM Agent: task=medium, seed=1, model=Bedrock/qwen.qwen3-next-80b-a3b ---
|
| 264 |
+
Step 1: INSPECT acc_0171 flagged=0/10 suspects=0 steps_left=49
|
| 265 |
+
Step 2: INSPECT acc_0099 flagged=0/10 suspects=0 steps_left=48
|
| 266 |
+
Step 3: INSPECT acc_0152 flagged=0/10 suspects=0 steps_left=47
|
| 267 |
+
Step 4: INSPECT acc_0092 flagged=0/10 suspects=0 steps_left=46
|
| 268 |
+
Step 5: INSPECT acc_0078 flagged=0/10 suspects=0 steps_left=45
|
| 269 |
+
Step 6: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=44
|
| 270 |
+
Step 7: INSPECT acc_0012 flagged=0/10 suspects=0 steps_left=43
|
| 271 |
+
Step 8: FLAG acc_0012 flagged=1/10 suspects=8 steps_left=43
|
| 272 |
+
Step 9: INSPECT acc_0033 flagged=1/10 suspects=8 steps_left=42
|
| 273 |
+
Step 10: FLAG acc_0033 flagged=2/10 suspects=8 steps_left=42
|
| 274 |
+
Step 11: INSPECT acc_0174 flagged=2/10 suspects=8 steps_left=41
|
| 275 |
+
Step 12: FLAG acc_0174 flagged=3/10 suspects=7 steps_left=41
|
| 276 |
+
Step 13: INSPECT acc_0187 flagged=3/10 suspects=7 steps_left=40
|
| 277 |
+
Step 14: FLAG acc_0187 flagged=4/10 suspects=6 steps_left=40
|
| 278 |
+
Step 15: INSPECT acc_0079 flagged=4/10 suspects=6 steps_left=39
|
| 279 |
+
Step 16: FLAG acc_0079 flagged=5/10 suspects=5 steps_left=39
|
| 280 |
+
Step 17: INSPECT acc_0032 flagged=5/10 suspects=5 steps_left=38
|
| 281 |
+
Step 18: FLAG acc_0032 flagged=6/10 suspects=4 steps_left=38
|
| 282 |
+
Step 19: INSPECT acc_0023 flagged=6/10 suspects=4 steps_left=37
|
| 283 |
+
Step 20: FLAG acc_0023 flagged=7/10 suspects=3 steps_left=37
|
| 284 |
+
Step 21: INSPECT acc_0146 flagged=7/10 suspects=3 steps_left=36
|
| 285 |
+
Step 22: FLAG acc_0146 flagged=8/10 suspects=2 steps_left=36
|
| 286 |
+
Step 23: INSPECT acc_0019 flagged=8/10 suspects=2 steps_left=35
|
| 287 |
+
Step 24: FLAG acc_0019 flagged=9/10 suspects=1 steps_left=35
|
| 288 |
+
Step 25: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=34
|
| 289 |
+
Step 26: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=34
|
| 290 |
+
Step 27: SUBMIT flagged=10/10 suspects=0 steps_left=34
|
| 291 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.84
|
| 292 |
+
★ GRADER SCORE: 0.9680
|
| 293 |
+
|
| 294 |
+
--- LLM Agent: task=medium, seed=2, model=Bedrock/qwen.qwen3-next-80b-a3b ---
|
| 295 |
+
Step 1: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=49
|
| 296 |
+
Step 2: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=48
|
| 297 |
+
Step 3: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=47
|
| 298 |
+
Step 4: INSPECT acc_0030 flagged=0/10 suspects=0 steps_left=46
|
| 299 |
+
Step 5: INSPECT acc_0041 flagged=0/10 suspects=0 steps_left=45
|
| 300 |
+
Step 6: INSPECT acc_0054 flagged=0/10 suspects=0 steps_left=44
|
| 301 |
+
Step 7: INSPECT acc_0199 flagged=0/10 suspects=0 steps_left=43
|
| 302 |
+
Step 8: INSPECT acc_0181 flagged=0/10 suspects=0 steps_left=42
|
| 303 |
+
Step 9: INSPECT acc_0166 flagged=0/10 suspects=0 steps_left=41
|
| 304 |
+
Step 10: INSPECT acc_0098 flagged=0/10 suspects=0 steps_left=40
|
| 305 |
+
Step 11: INSPECT acc_0121 flagged=0/10 suspects=0 steps_left=39
|
| 306 |
+
Step 12: INSPECT acc_0053 flagged=0/10 suspects=0 steps_left=38
|
| 307 |
+
Step 13: INSPECT acc_0103 flagged=0/10 suspects=0 steps_left=37
|
| 308 |
+
Step 14: INSPECT acc_0000 flagged=0/10 suspects=0 steps_left=36
|
| 309 |
+
Step 15: INSPECT acc_0168 flagged=0/10 suspects=0 steps_left=35
|
| 310 |
+
Step 16: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=34
|
| 311 |
+
Step 17: INSPECT acc_0149 flagged=0/10 suspects=0 steps_left=33
|
| 312 |
+
Step 18: INSPECT acc_0064 flagged=0/10 suspects=0 steps_left=32
|
| 313 |
+
Step 19: INSPECT acc_0016 flagged=0/10 suspects=0 steps_left=31
|
| 314 |
+
Step 20: INSPECT acc_0105 flagged=0/10 suspects=0 steps_left=30
|
| 315 |
+
Step 21: INSPECT acc_0035 flagged=0/10 suspects=0 steps_left=29
|
| 316 |
+
Step 22: FLAG acc_0035 flagged=1/10 suspects=9 steps_left=29
|
| 317 |
+
Step 23: INSPECT acc_0020 flagged=1/10 suspects=9 steps_left=28
|
| 318 |
+
Step 24: FLAG acc_0020 flagged=2/10 suspects=8 steps_left=28
|
| 319 |
+
Step 25: INSPECT acc_0036 flagged=2/10 suspects=8 steps_left=27
|
| 320 |
+
Step 26: FLAG acc_0036 flagged=3/10 suspects=7 steps_left=27
|
| 321 |
+
Step 27: INSPECT acc_0050 flagged=3/10 suspects=7 steps_left=26
|
| 322 |
+
Step 28: FLAG acc_0050 flagged=4/10 suspects=6 steps_left=26
|
| 323 |
+
Step 29: INSPECT acc_0051 flagged=4/10 suspects=6 steps_left=25
|
| 324 |
+
Step 30: FLAG acc_0051 flagged=5/10 suspects=5 steps_left=25
|
| 325 |
+
Step 31: INSPECT acc_0085 flagged=5/10 suspects=5 steps_left=24
|
| 326 |
+
Step 32: FLAG acc_0085 flagged=6/10 suspects=4 steps_left=24
|
| 327 |
+
Step 33: INSPECT acc_0177 flagged=6/10 suspects=4 steps_left=23
|
| 328 |
+
Step 34: FLAG acc_0177 flagged=7/10 suspects=3 steps_left=23
|
| 329 |
+
Step 35: INSPECT acc_0170 flagged=7/10 suspects=3 steps_left=22
|
| 330 |
+
Step 36: FLAG acc_0170 flagged=8/10 suspects=2 steps_left=22
|
| 331 |
+
Step 37: INSPECT acc_0055 flagged=8/10 suspects=2 steps_left=21
|
| 332 |
+
Step 38: FLAG acc_0055 flagged=9/10 suspects=1 steps_left=21
|
| 333 |
+
Step 39: INSPECT acc_0094 flagged=9/10 suspects=1 steps_left=20
|
| 334 |
+
Step 40: FLAG acc_0094 flagged=10/10 suspects=0 steps_left=20
|
| 335 |
+
Step 41: SUBMIT flagged=10/10 suspects=0 steps_left=20
|
| 336 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.70
|
| 337 |
+
★ GRADER SCORE: 0.9400
|
| 338 |
+
|
| 339 |
+
medium: scores=['0.964', '0.968', '0.940'] mean=0.9573 var=0.000153
|
| 340 |
+
|
| 341 |
+
--- LLM Agent: task=hard, seed=0, model=Bedrock/qwen.qwen3-next-80b-a3b ---
|
| 342 |
+
Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
|
| 343 |
+
Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
|
| 344 |
+
Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
|
| 345 |
+
Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
|
| 346 |
+
Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
|
| 347 |
+
Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
|
| 348 |
+
Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
|
| 349 |
+
Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
|
| 350 |
+
Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
|
| 351 |
+
Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
|
| 352 |
+
Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
|
| 353 |
+
Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
|
| 354 |
+
Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
|
| 355 |
+
Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
|
| 356 |
+
Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
|
| 357 |
+
Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
|
| 358 |
+
Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
|
| 359 |
+
Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
|
| 360 |
+
Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
|
| 361 |
+
Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
|
| 362 |
+
Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
|
| 363 |
+
Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
|
| 364 |
+
Step 23: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=59
|
| 365 |
+
Step 24: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=58
|
| 366 |
+
Step 25: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=58
|
| 367 |
+
Step 26: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=57
|
| 368 |
+
Step 27: INSPECT acc_0160 flagged=3/10 suspects=6 steps_left=56
|
| 369 |
+
Step 28: FLAG acc_0389 flagged=4/10 suspects=6 steps_left=56
|
| 370 |
+
Step 29: FLAG acc_0160 flagged=5/10 suspects=5 steps_left=56
|
| 371 |
+
Step 30: INSPECT acc_0549 flagged=5/10 suspects=5 steps_left=55
|
| 372 |
+
Step 31: FLAG acc_0549 flagged=6/10 suspects=4 steps_left=55
|
| 373 |
+
Step 32: INSPECT acc_0658 flagged=6/10 suspects=4 steps_left=54
|
| 374 |
+
Step 33: FLAG acc_0658 flagged=7/10 suspects=3 steps_left=54
|
| 375 |
+
Step 34: INSPECT acc_0290 flagged=7/10 suspects=3 steps_left=53
|
| 376 |
+
Step 35: FLAG acc_0290 flagged=8/10 suspects=2 steps_left=53
|
| 377 |
+
Step 36: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=52
|
| 378 |
+
Step 37: INSPECT acc_0507 flagged=8/10 suspects=2 steps_left=51
|
| 379 |
+
Step 38: FLAG acc_0124 flagged=9/10 suspects=1 steps_left=51
|
| 380 |
+
Step 39: FLAG acc_0507 flagged=10/10 suspects=0 steps_left=51
|
| 381 |
+
Step 40: SUBMIT flagged=10/10 suspects=0 steps_left=51
|
| 382 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.71
|
| 383 |
+
★ GRADER SCORE: 0.9637
|
| 384 |
+
|
| 385 |
+
--- LLM Agent: task=hard, seed=1, model=Bedrock/qwen.qwen3-next-80b-a3b ---
|
| 386 |
+
Step 1: INSPECT acc_0014 flagged=0/10 suspects=0 steps_left=79
|
| 387 |
+
Step 2: INSPECT acc_0835 flagged=0/10 suspects=0 steps_left=78
|
| 388 |
+
Step 3: INSPECT acc_0855 flagged=0/10 suspects=0 steps_left=77
|
| 389 |
+
Step 4: INSPECT acc_0930 flagged=0/10 suspects=0 steps_left=76
|
| 390 |
+
Step 5: INSPECT acc_0336 flagged=0/10 suspects=0 steps_left=75
|
| 391 |
+
Step 6: INSPECT acc_0929 flagged=0/10 suspects=0 steps_left=74
|
| 392 |
+
Step 7: INSPECT acc_0076 flagged=0/10 suspects=0 steps_left=73
|
| 393 |
+
Step 8: INSPECT acc_0543 flagged=0/10 suspects=0 steps_left=72
|
| 394 |
+
Step 9: INSPECT acc_0590 flagged=0/10 suspects=0 steps_left=71
|
| 395 |
+
Step 10: INSPECT acc_0401 flagged=0/10 suspects=0 steps_left=70
|
| 396 |
+
Step 11: INSPECT acc_0322 flagged=0/10 suspects=0 steps_left=69
|
| 397 |
+
Step 12: INSPECT acc_0154 flagged=0/10 suspects=0 steps_left=68
|
| 398 |
+
Step 13: INSPECT acc_0374 flagged=0/10 suspects=0 steps_left=67
|
| 399 |
+
Step 14: INSPECT acc_0549 flagged=0/10 suspects=0 steps_left=66
|
| 400 |
+
Step 15: INSPECT acc_0903 flagged=0/10 suspects=0 steps_left=65
|
| 401 |
+
Step 16: INSPECT acc_0976 flagged=0/10 suspects=0 steps_left=64
|
| 402 |
+
Step 17: INSPECT acc_0620 flagged=0/10 suspects=0 steps_left=63
|
| 403 |
+
Step 18: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=62
|
| 404 |
+
Step 19: INSPECT acc_0222 flagged=0/10 suspects=0 steps_left=61
|
| 405 |
+
Step 20: INSPECT acc_0536 flagged=0/10 suspects=0 steps_left=60
|
| 406 |
+
Step 21: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=59
|
| 407 |
+
Step 22: INSPECT acc_0577 flagged=0/10 suspects=0 steps_left=58
|
| 408 |
+
Step 23: INSPECT acc_0517 flagged=0/10 suspects=0 steps_left=57
|
| 409 |
+
Step 24: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=56
|
| 410 |
+
Step 25: INSPECT acc_0167 flagged=0/10 suspects=0 steps_left=55
|
| 411 |
+
Step 26: INSPECT acc_0697 flagged=0/10 suspects=0 steps_left=54
|
| 412 |
+
Step 27: INSPECT acc_0271 flagged=0/10 suspects=0 steps_left=53
|
| 413 |
+
Step 28: INSPECT acc_0681 flagged=0/10 suspects=0 steps_left=52
|
| 414 |
+
Step 29: INSPECT acc_0530 flagged=0/10 suspects=0 steps_left=51
|
| 415 |
+
Step 30: INSPECT acc_0353 flagged=0/10 suspects=0 steps_left=50
|
| 416 |
+
Step 31: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=49
|
| 417 |
+
Step 32: INSPECT acc_0777 flagged=0/10 suspects=0 steps_left=48
|
| 418 |
+
Step 33: INSPECT acc_0265 flagged=0/10 suspects=0 steps_left=47
|
| 419 |
+
Step 34: INSPECT acc_0788 flagged=0/10 suspects=0 steps_left=46
|
| 420 |
+
Step 35: INSPECT acc_0033 flagged=0/10 suspects=0 steps_left=45
|
| 421 |
+
Step 36: INSPECT acc_0187 flagged=0/10 suspects=0 steps_left=44
|
| 422 |
+
Step 37: INSPECT acc_0445 flagged=0/10 suspects=0 steps_left=43
|
| 423 |
+
Step 38: INSPECT acc_0846 flagged=0/10 suspects=0 steps_left=42
|
| 424 |
+
Step 39: INSPECT acc_0659 flagged=0/10 suspects=0 steps_left=41
|
| 425 |
+
Step 40: INSPECT acc_0768 flagged=0/10 suspects=0 steps_left=40
|
| 426 |
+
Step 41: INSPECT acc_0677 flagged=0/10 suspects=0 steps_left=39
|
| 427 |
+
Step 42: INSPECT acc_0539 flagged=0/10 suspects=0 steps_left=38
|
| 428 |
+
Step 43: INSPECT acc_0742 flagged=0/10 suspects=0 steps_left=37
|
| 429 |
+
Step 44: INSPECT acc_0503 flagged=0/10 suspects=0 steps_left=36
|
| 430 |
+
Step 45: INSPECT acc_0876 flagged=0/10 suspects=0 steps_left=35
|
| 431 |
+
Step 46: INSPECT acc_0639 flagged=0/10 suspects=0 steps_left=34
|
| 432 |
+
Step 47: INSPECT acc_0494 flagged=0/10 suspects=0 steps_left=33
|
| 433 |
+
Step 48: INSPECT acc_0898 flagged=0/10 suspects=0 steps_left=32
|
| 434 |
+
Step 49: INSPECT acc_0553 flagged=0/10 suspects=0 steps_left=31
|
| 435 |
+
Step 50: INSPECT acc_0588 flagged=0/10 suspects=0 steps_left=30
|
| 436 |
+
Step 51: INSPECT acc_0194 flagged=0/10 suspects=0 steps_left=29
|
| 437 |
+
Step 52: INSPECT acc_0810 flagged=0/10 suspects=0 steps_left=28
|
| 438 |
+
Step 53: INSPECT acc_0355 flagged=0/10 suspects=0 steps_left=27
|
| 439 |
+
Step 54: INSPECT acc_0363 flagged=0/10 suspects=0 steps_left=26
|
| 440 |
+
Step 55: INSPECT acc_0221 flagged=0/10 suspects=0 steps_left=25
|
| 441 |
+
Step 56: INSPECT acc_0580 flagged=0/10 suspects=0 steps_left=24
|
| 442 |
+
Step 57: INSPECT acc_0534 flagged=0/10 suspects=0 steps_left=23
|
| 443 |
+
Step 58: INSPECT acc_0778 flagged=0/10 suspects=0 steps_left=22
|
| 444 |
+
Step 59: INSPECT acc_0998 flagged=0/10 suspects=0 steps_left=21
|
| 445 |
+
Step 60: INSPECT acc_0233 flagged=0/10 suspects=0 steps_left=20
|
| 446 |
+
Step 61: INSPECT acc_0052 flagged=0/10 suspects=0 steps_left=19
|
| 447 |
+
Step 62: INSPECT acc_0813 flagged=0/10 suspects=0 steps_left=18
|
| 448 |
+
Step 63: INSPECT acc_0035 flagged=0/10 suspects=0 steps_left=17
|
| 449 |
+
Step 64: INSPECT acc_0667 flagged=0/10 suspects=0 steps_left=16
|
| 450 |
+
Step 65: INSPECT acc_0019 flagged=0/10 suspects=0 steps_left=15
|
| 451 |
+
Step 66: INSPECT acc_0959 flagged=0/10 suspects=0 steps_left=14
|
| 452 |
+
Step 67: INSPECT acc_0212 flagged=0/10 suspects=0 steps_left=13
|
| 453 |
+
Step 68: INSPECT acc_0776 flagged=0/10 suspects=0 steps_left=12
|
| 454 |
+
Step 69: INSPECT acc_0049 flagged=0/10 suspects=0 steps_left=11
|
| 455 |
+
Step 70: INSPECT acc_0434 flagged=0/10 suspects=0 steps_left=10
|
| 456 |
+
Step 71: INSPECT acc_0827 flagged=0/10 suspects=0 steps_left=9
|
| 457 |
+
Step 72: INSPECT acc_0583 flagged=0/10 suspects=0 steps_left=8
|
| 458 |
+
Step 73: INSPECT acc_0065 flagged=0/10 suspects=0 steps_left=7
|
| 459 |
+
Step 74: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=6
|
| 460 |
+
Step 75: INSPECT acc_0761 flagged=0/10 suspects=0 steps_left=5
|
| 461 |
+
Step 76: INSPECT acc_0995 flagged=0/10 suspects=0 steps_left=4
|
| 462 |
+
Step 77: INSPECT acc_0157 flagged=0/10 suspects=0 steps_left=3
|
| 463 |
+
Step 78: INSPECT acc_0936 flagged=0/10 suspects=0 steps_left=2
|
| 464 |
+
Step 79: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=1
|
| 465 |
+
Step 80: INSPECT acc_0691 flagged=0/10 suspects=0 steps_left=0
|
| 466 |
+
→ Episode ended: [LOSS] TP=0 FP=0 FN=10 Recall=0.00 Precision=0.00 Episode reward=-9.80
|
| 467 |
+
★ GRADER SCORE: 0.0000
|
| 468 |
+
|
| 469 |
+
--- LLM Agent: task=hard, seed=2, model=Bedrock/qwen.qwen3-next-80b-a3b ---
|
| 470 |
+
Step 1: INSPECT acc_0813 flagged=0/10 suspects=0 steps_left=79
|
| 471 |
+
Step 2: INSPECT acc_0430 flagged=0/10 suspects=0 steps_left=78
|
| 472 |
+
Step 3: INSPECT acc_0817 flagged=0/10 suspects=0 steps_left=77
|
| 473 |
+
Step 4: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=76
|
| 474 |
+
Step 5: INSPECT acc_0523 flagged=0/10 suspects=0 steps_left=75
|
| 475 |
+
Step 6: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=74
|
| 476 |
+
Step 7: INSPECT acc_0797 flagged=0/10 suspects=0 steps_left=73
|
| 477 |
+
Step 8: INSPECT acc_0478 flagged=0/10 suspects=0 steps_left=72
|
| 478 |
+
Step 9: INSPECT acc_0861 flagged=0/10 suspects=0 steps_left=71
|
| 479 |
+
Step 10: INSPECT acc_0836 flagged=0/10 suspects=0 steps_left=70
|
| 480 |
+
Step 11: INSPECT acc_0926 flagged=0/10 suspects=0 steps_left=69
|
| 481 |
+
Step 12: INSPECT acc_0664 flagged=0/10 suspects=0 steps_left=68
|
| 482 |
+
Step 13: INSPECT acc_0255 flagged=0/10 suspects=0 steps_left=67
|
| 483 |
+
Step 14: INSPECT acc_0938 flagged=0/10 suspects=0 steps_left=66
|
| 484 |
+
Step 15: INSPECT acc_0672 flagged=0/10 suspects=0 steps_left=65
|
| 485 |
+
Step 16: FLAG acc_0672 flagged=1/10 suspects=6 steps_left=65
|
| 486 |
+
Step 17: INSPECT acc_0659 flagged=1/10 suspects=6 steps_left=64
|
| 487 |
+
Step 18: FLAG acc_0659 flagged=2/10 suspects=5 steps_left=64
|
| 488 |
+
Step 19: INSPECT acc_0290 flagged=2/10 suspects=5 steps_left=63
|
| 489 |
+
Step 20: FLAG acc_0290 flagged=3/10 suspects=5 steps_left=63
|
| 490 |
+
Step 21: INSPECT acc_0339 flagged=3/10 suspects=5 steps_left=62
|
| 491 |
+
Step 22: FLAG acc_0339 flagged=4/10 suspects=6 steps_left=62
|
| 492 |
+
Step 23: INSPECT acc_0544 flagged=4/10 suspects=6 steps_left=61
|
| 493 |
+
Step 24: FLAG acc_0544 flagged=5/10 suspects=5 steps_left=61
|
| 494 |
+
Step 25: INSPECT acc_0696 flagged=5/10 suspects=5 steps_left=60
|
| 495 |
+
Step 26: FLAG acc_0696 flagged=6/10 suspects=4 steps_left=60
|
| 496 |
+
Step 27: INSPECT acc_0541 flagged=6/10 suspects=4 steps_left=59
|
| 497 |
+
Step 28: FLAG acc_0541 flagged=7/10 suspects=3 steps_left=59
|
| 498 |
+
Step 29: INSPECT acc_0793 flagged=7/10 suspects=3 steps_left=58
|
| 499 |
+
Step 30: FLAG acc_0793 flagged=8/10 suspects=2 steps_left=58
|
| 500 |
+
Step 31: INSPECT acc_0214 flagged=8/10 suspects=2 steps_left=57
|
| 501 |
+
Step 32: FLAG acc_0214 flagged=9/10 suspects=1 steps_left=57
|
| 502 |
+
Step 33: INSPECT acc_0112 flagged=9/10 suspects=1 steps_left=56
|
| 503 |
+
Step 34: FLAG acc_0112 flagged=10/10 suspects=0 steps_left=56
|
| 504 |
+
Step 35: SUBMIT flagged=10/10 suspects=0 steps_left=56
|
| 505 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.76
|
| 506 |
+
★ GRADER SCORE: 0.9700
|
| 507 |
+
|
| 508 |
+
hard: scores=['0.964', '0.000', '0.970'] mean=0.6446 var=0.207740
|
| 509 |
+
|
| 510 |
+
============================================================
|
| 511 |
+
EVALUATION COMPLETE
|
| 512 |
+
============================================================
|
| 513 |
+
ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$
|
memory/reflections_easy.jsonl
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"episode": 1, "reward": 18.88, "reflection": "
|
|
|
|
| 1 |
+
{"episode": 1, "reward": 18.88, "reflection": "Starting with direct INSPECTs on low-numbered accounts revealed multiple gang members with high fake_risk_score, photo_reuse_score, and comment_repeat_score, confirming a coordinated pattern. Once the first few were FLAGGED, their auto-revealed neighbors automatically became high-priority SUSPECTs, allowing rapid expansion without extra steps\u2014this network-triggered propagation was key to 100% recall with zero false positives. Always begin with targeted INSPECTs to identify clear gang markers, then leverage FLAGging to unlock the full network efficiently."}
|
memory/reflections_hard.jsonl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
-
{"episode": 1, "reward": 17.84, "reflection": "
|
| 2 |
-
{"episode": 2, "reward": 18.76, "reflection": "
|
| 3 |
-
{"episode": 3, "reward": 18.84, "reflection": "
|
|
|
|
| 1 |
+
{"episode": 1, "reward": 17.84, "reflection": "The strategy succeeded by first INSPECTing 10 low-suspicion accounts to map the network\u2019s baseline behavior, confirming that comment_repeat_score and photo_reuse_score consistently spiked in gang members. Once five accounts showed matching red flags, FLAGging them triggered auto-suspect propagation, revealing the full gang without unnecessary INVESTIGATE_NETWORK steps\u2014this minimized steps while maximizing coverage. Future cases should always begin with targeted INSPECTs to validate signals before FLAGging, avoiding premature network expansion."}
|
| 2 |
+
{"episode": 2, "reward": 18.76, "reflection": "By systematically INSPECTing high-risk accounts with elevated fake_risk_score and multiple red flags\u2014comment_repeat_score, shared_ip_count, and photo_reuse_score\u2014we uncovered a tightly coordinated gang operating under identical bio templates. Each confirmed fake account revealed neighbors with identical patterns, making network expansion unnecessary; we prioritized direct inspection over speculative INVESTIGATE_NETWORK, conserving steps while maximizing precision. Future cases should trust low-step inspection of high-signal accounts first, letting flagged neighbors naturally expose the rest."}
|
| 3 |
+
{"episode": 3, "reward": 18.84, "reflection": "We successfully identified the gang by first inspecting 10 low-suspicion accounts to establish baseline behavior, then flagged those showing high comment_repeat_score, shared_ip_count, and photo_reuse_score \u2014 all consistent with coordinated fake accounts. The key was waiting to flag until multiple signals aligned, which prevented false positives and triggered auto-suspect expansion, revealing the full network without unnecessary steps. Always inspect first to calibrate, then flag only when signals converge."}
|
memory/reflections_medium.jsonl
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"episode": 1, "reward": 18.86, "reflection": "
|
|
|
|
| 1 |
+
{"episode": 1, "reward": 18.86, "reflection": "Starting with targeted INSPECTs on low-activity accounts revealed multiple gang members with high comment_repeat_score, photo_reuse_score, and bio_template_score \u2014 all clustered under shared_ip_count > 5. Flagging the first five confirmed gang nodes triggered auto-suspect propagation, letting us identify the full network without unnecessary network expansion. Always begin with direct inspection of suspicious profiles before network moves \u2014 this minimizes steps and maximizes precision."}
|
memory/wins_easy.jsonl
CHANGED
|
@@ -8,3 +8,43 @@
|
|
| 8 |
{"episode": 8, "won": true}
|
| 9 |
{"episode": 9, "won": true}
|
| 10 |
{"episode": 10, "won": true}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
{"episode": 8, "won": true}
|
| 9 |
{"episode": 9, "won": true}
|
| 10 |
{"episode": 10, "won": true}
|
| 11 |
+
{"episode": 11, "won": true}
|
| 12 |
+
{"episode": 12, "won": true}
|
| 13 |
+
{"episode": 13, "won": true}
|
| 14 |
+
{"episode": 14, "won": true}
|
| 15 |
+
{"episode": 15, "won": true}
|
| 16 |
+
{"episode": 16, "won": true}
|
| 17 |
+
{"episode": 17, "won": true}
|
| 18 |
+
{"episode": 18, "won": true}
|
| 19 |
+
{"episode": 19, "won": true}
|
| 20 |
+
{"episode": 20, "won": true}
|
| 21 |
+
{"episode": 21, "won": true}
|
| 22 |
+
{"episode": 22, "won": true}
|
| 23 |
+
{"episode": 23, "won": true}
|
| 24 |
+
{"episode": 24, "won": true}
|
| 25 |
+
{"episode": 25, "won": true}
|
| 26 |
+
{"episode": 26, "won": true}
|
| 27 |
+
{"episode": 27, "won": true}
|
| 28 |
+
{"episode": 28, "won": true}
|
| 29 |
+
{"episode": 29, "won": true}
|
| 30 |
+
{"episode": 30, "won": true}
|
| 31 |
+
{"episode": 31, "won": true}
|
| 32 |
+
{"episode": 32, "won": true}
|
| 33 |
+
{"episode": 33, "won": true}
|
| 34 |
+
{"episode": 34, "won": true}
|
| 35 |
+
{"episode": 35, "won": true}
|
| 36 |
+
{"episode": 36, "won": true}
|
| 37 |
+
{"episode": 37, "won": true}
|
| 38 |
+
{"episode": 38, "won": true}
|
| 39 |
+
{"episode": 39, "won": true}
|
| 40 |
+
{"episode": 40, "won": true}
|
| 41 |
+
{"episode": 41, "won": true}
|
| 42 |
+
{"episode": 42, "won": true}
|
| 43 |
+
{"episode": 43, "won": true}
|
| 44 |
+
{"episode": 44, "won": true}
|
| 45 |
+
{"episode": 45, "won": true}
|
| 46 |
+
{"episode": 46, "won": true}
|
| 47 |
+
{"episode": 47, "won": true}
|
| 48 |
+
{"episode": 48, "won": true}
|
| 49 |
+
{"episode": 49, "won": true}
|
| 50 |
+
{"episode": 50, "won": true}
|
memory/wins_hard.jsonl
CHANGED
|
@@ -8,3 +8,28 @@
|
|
| 8 |
{"episode": 8, "won": true}
|
| 9 |
{"episode": 9, "won": true}
|
| 10 |
{"episode": 10, "won": true}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
{"episode": 8, "won": true}
|
| 9 |
{"episode": 9, "won": true}
|
| 10 |
{"episode": 10, "won": true}
|
| 11 |
+
{"episode": 11, "won": true}
|
| 12 |
+
{"episode": 12, "won": true}
|
| 13 |
+
{"episode": 13, "won": true}
|
| 14 |
+
{"episode": 14, "won": true}
|
| 15 |
+
{"episode": 15, "won": true}
|
| 16 |
+
{"episode": 16, "won": true}
|
| 17 |
+
{"episode": 17, "won": true}
|
| 18 |
+
{"episode": 18, "won": true}
|
| 19 |
+
{"episode": 19, "won": true}
|
| 20 |
+
{"episode": 20, "won": true}
|
| 21 |
+
{"episode": 21, "won": true}
|
| 22 |
+
{"episode": 22, "won": true}
|
| 23 |
+
{"episode": 23, "won": true}
|
| 24 |
+
{"episode": 24, "won": true}
|
| 25 |
+
{"episode": 25, "won": true}
|
| 26 |
+
{"episode": 26, "won": true}
|
| 27 |
+
{"episode": 27, "won": true}
|
| 28 |
+
{"episode": 28, "won": true}
|
| 29 |
+
{"episode": 29, "won": true}
|
| 30 |
+
{"episode": 30, "won": true}
|
| 31 |
+
{"episode": 31, "won": true}
|
| 32 |
+
{"episode": 32, "won": true}
|
| 33 |
+
{"episode": 33, "won": true}
|
| 34 |
+
{"episode": 34, "won": true}
|
| 35 |
+
{"episode": 35, "won": true}
|
memory/wins_medium.jsonl
CHANGED
|
@@ -1,20 +1,44 @@
|
|
| 1 |
{"episode": 1, "won": true}
|
| 2 |
-
{"episode": 1, "won": true}
|
| 3 |
-
{"episode": 2, "won": true}
|
| 4 |
{"episode": 2, "won": true}
|
| 5 |
{"episode": 3, "won": true}
|
| 6 |
-
{"episode": 3, "won": true}
|
| 7 |
{"episode": 4, "won": true}
|
| 8 |
-
{"episode": 4, "won": true}
|
| 9 |
-
{"episode": 5, "won": true}
|
| 10 |
{"episode": 5, "won": true}
|
| 11 |
{"episode": 6, "won": true}
|
| 12 |
-
{"episode": 6, "won": true}
|
| 13 |
-
{"episode": 7, "won": true}
|
| 14 |
{"episode": 7, "won": true}
|
| 15 |
{"episode": 8, "won": true}
|
| 16 |
-
{"episode": 8, "won": true}
|
| 17 |
{"episode": 9, "won": true}
|
| 18 |
-
{"episode": 9, "won": true}
|
| 19 |
-
{"episode": 10, "won": true}
|
| 20 |
{"episode": 10, "won": true}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
{"episode": 1, "won": true}
|
|
|
|
|
|
|
| 2 |
{"episode": 2, "won": true}
|
| 3 |
{"episode": 3, "won": true}
|
|
|
|
| 4 |
{"episode": 4, "won": true}
|
|
|
|
|
|
|
| 5 |
{"episode": 5, "won": true}
|
| 6 |
{"episode": 6, "won": true}
|
|
|
|
|
|
|
| 7 |
{"episode": 7, "won": true}
|
| 8 |
{"episode": 8, "won": true}
|
|
|
|
| 9 |
{"episode": 9, "won": true}
|
|
|
|
|
|
|
| 10 |
{"episode": 10, "won": true}
|
| 11 |
+
{"episode": 11, "won": true}
|
| 12 |
+
{"episode": 12, "won": true}
|
| 13 |
+
{"episode": 13, "won": true}
|
| 14 |
+
{"episode": 14, "won": true}
|
| 15 |
+
{"episode": 15, "won": true}
|
| 16 |
+
{"episode": 16, "won": true}
|
| 17 |
+
{"episode": 17, "won": true}
|
| 18 |
+
{"episode": 18, "won": true}
|
| 19 |
+
{"episode": 19, "won": true}
|
| 20 |
+
{"episode": 20, "won": true}
|
| 21 |
+
{"episode": 21, "won": true}
|
| 22 |
+
{"episode": 22, "won": true}
|
| 23 |
+
{"episode": 23, "won": true}
|
| 24 |
+
{"episode": 24, "won": true}
|
| 25 |
+
{"episode": 25, "won": true}
|
| 26 |
+
{"episode": 26, "won": true}
|
| 27 |
+
{"episode": 27, "won": true}
|
| 28 |
+
{"episode": 28, "won": true}
|
| 29 |
+
{"episode": 29, "won": true}
|
| 30 |
+
{"episode": 30, "won": true}
|
| 31 |
+
{"episode": 31, "won": true}
|
| 32 |
+
{"episode": 32, "won": true}
|
| 33 |
+
{"episode": 33, "won": true}
|
| 34 |
+
{"episode": 34, "won": true}
|
| 35 |
+
{"episode": 35, "won": true}
|
| 36 |
+
{"episode": 36, "won": true}
|
| 37 |
+
{"episode": 37, "won": true}
|
| 38 |
+
{"episode": 38, "won": true}
|
| 39 |
+
{"episode": 39, "won": true}
|
| 40 |
+
{"episode": 40, "won": true}
|
| 41 |
+
{"episode": 41, "won": true}
|
| 42 |
+
{"episode": 42, "won": true}
|
| 43 |
+
{"episode": 43, "won": true}
|
| 44 |
+
{"episode": 44, "won": true}
|
model-benchmark-logs/deepseek_judge_log.txt
ADDED
|
@@ -0,0 +1,749 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$ python3 deepseek_test_judge_eval.py --url https:
|
| 2 |
+
//pandago-graphstrike.hf.space --bedrock
|
| 3 |
+
GraphStrike Judge Evaluation Simulator
|
| 4 |
+
Target: https://pandago-graphstrike.hf.space
|
| 5 |
+
Backend: bedrock
|
| 6 |
+
Model: Bedrock/deepseek.v3.2
|
| 7 |
+
Token: set
|
| 8 |
+
|
| 9 |
+
============================================================
|
| 10 |
+
PHASE 0: Endpoint Verification
|
| 11 |
+
============================================================
|
| 12 |
+
✓ GET /health
|
| 13 |
+
✓ GET /tasks
|
| 14 |
+
✓ GET /metadata
|
| 15 |
+
✓ GET /schema
|
| 16 |
+
✓ GET /web
|
| 17 |
+
✓ POST /reset
|
| 18 |
+
✓ GET /state
|
| 19 |
+
✓ POST /step
|
| 20 |
+
✓ POST /step
|
| 21 |
+
✓ GET /grader
|
| 22 |
+
✓ POST /mcp
|
| 23 |
+
✓ POST /baseline
|
| 24 |
+
|
| 25 |
+
============================================================
|
| 26 |
+
PHASE 1: Baseline Stability (3 runs)
|
| 27 |
+
============================================================
|
| 28 |
+
Run 1: easy=0.9100 medium=0.9060 hard=0.9038
|
| 29 |
+
Run 2: easy=0.9100 medium=0.9060 hard=0.9038
|
| 30 |
+
Run 3: easy=0.9100 medium=0.9060 hard=0.9038
|
| 31 |
+
✓ All 3 runs identical — baseline is deterministic
|
| 32 |
+
|
| 33 |
+
============================================================
|
| 34 |
+
PHASE 2: LLM Agent Evaluation (model=Bedrock/deepseek.v3.2)
|
| 35 |
+
============================================================
|
| 36 |
+
|
| 37 |
+
--- LLM Agent: task=easy, seed=0, model=Bedrock/deepseek.v3.2 ---
|
| 38 |
+
Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
|
| 39 |
+
Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
|
| 40 |
+
Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
|
| 41 |
+
Step 4: INSPECT acc_0001 flagged=1/10 suspects=8 steps_left=27
|
| 42 |
+
Step 5: INSPECT acc_0014 flagged=1/10 suspects=8 steps_left=26
|
| 43 |
+
Step 6: INSPECT acc_0012 flagged=1/10 suspects=8 steps_left=25
|
| 44 |
+
Step 7: INSPECT acc_0000 flagged=1/10 suspects=8 steps_left=24
|
| 45 |
+
Step 8: FLAG acc_0001 flagged=2/10 suspects=8 steps_left=24
|
| 46 |
+
Step 9: INSPECT acc_0027 flagged=2/10 suspects=8 steps_left=23
|
| 47 |
+
Step 10: FLAG acc_0027 flagged=3/10 suspects=7 steps_left=23
|
| 48 |
+
Step 11: INSPECT acc_0047 flagged=3/10 suspects=7 steps_left=22
|
| 49 |
+
Step 12: FLAG acc_0047 flagged=4/10 suspects=6 steps_left=22
|
| 50 |
+
Step 13: INSPECT acc_0007 flagged=4/10 suspects=6 steps_left=21
|
| 51 |
+
Step 14: FLAG acc_0014 flagged=5/10 suspects=5 steps_left=21
|
| 52 |
+
Step 15: INSPECT acc_0028 flagged=5/10 suspects=5 steps_left=20
|
| 53 |
+
Step 16: FLAG acc_0000 flagged=6/10 suspects=4 steps_left=20
|
| 54 |
+
Step 17: FLAG acc_0007 flagged=7/10 suspects=3 steps_left=20
|
| 55 |
+
Step 18: FLAG acc_0012 flagged=8/10 suspects=2 steps_left=20
|
| 56 |
+
Step 19: FLAG acc_0036 flagged=9/10 suspects=1 steps_left=20
|
| 57 |
+
Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
|
| 58 |
+
Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=20
|
| 59 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.90
|
| 60 |
+
★ GRADER SCORE: 0.9667
|
| 61 |
+
|
| 62 |
+
--- LLM Agent: task=medium, seed=0, model=Bedrock/deepseek.v3.2 ---
|
| 63 |
+
Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
|
| 64 |
+
Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
|
| 65 |
+
Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
|
| 66 |
+
Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=46
|
| 67 |
+
Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=45
|
| 68 |
+
Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
|
| 69 |
+
Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
|
| 70 |
+
Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
|
| 71 |
+
Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
|
| 72 |
+
Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
|
| 73 |
+
Step 11: INSPECT acc_0181 flagged=2/10 suspects=8 steps_left=21
|
| 74 |
+
Step 12: INSPECT acc_0028 flagged=3/10 suspects=7 steps_left=19
|
| 75 |
+
Step 13: FLAG acc_0047 flagged=5/10 suspects=5 steps_left=19
|
| 76 |
+
Step 14: FLAG acc_0007 flagged=7/10 suspects=3 steps_left=19
|
| 77 |
+
Step 15: FLAG acc_0012 flagged=8/10 suspects=2 steps_left=19
|
| 78 |
+
Step 16: FLAG acc_0036 flagged=9/10 suspects=1 steps_left=19
|
| 79 |
+
Step 17: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=19
|
| 80 |
+
Step 18: SUBMIT flagged=10/10 suspects=0 steps_left=18
|
| 81 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.88
|
| 82 |
+
★ GRADER SCORE: 0.9600
|
| 83 |
+
|
| 84 |
+
--- LLM Agent: task=hard, seed=0, model=Bedrock/deepseek.v3.2 ---
|
| 85 |
+
Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=76
|
| 86 |
+
Step 2: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=74
|
| 87 |
+
Step 3: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=70
|
| 88 |
+
Step 4: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=67
|
| 89 |
+
Step 5: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=64
|
| 90 |
+
Step 6: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=61
|
| 91 |
+
Step 7: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=59
|
| 92 |
+
Step 8: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=58
|
| 93 |
+
Step 9: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=55
|
| 94 |
+
Step 10: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=52
|
| 95 |
+
Step 11: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=49
|
| 96 |
+
Step 12: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=47
|
| 97 |
+
Step 13: FLAG acc_0439 flagged=2/10 suspects=6 steps_left=45
|
| 98 |
+
Step 14: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=44
|
| 99 |
+
Step 15: INSPECT acc_0160 flagged=3/10 suspects=6 steps_left=41
|
| 100 |
+
Step 16: FLAG acc_0160 flagged=4/10 suspects=6 steps_left=40
|
| 101 |
+
Step 17: INSPECT acc_0658 flagged=6/10 suspects=4 steps_left=39
|
| 102 |
+
Step 18: INSPECT acc_0290 flagged=10/10 suspects=0 steps_left=27
|
| 103 |
+
Step 19: SUBMIT flagged=10/10 suspects=0 steps_left=26
|
| 104 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=14.46
|
| 105 |
+
★ GRADER SCORE: 0.9325
|
| 106 |
+
|
| 107 |
+
Summary: easy=0.9667 medium=0.9600 hard=0.9325
|
| 108 |
+
|
| 109 |
+
============================================================
|
| 110 |
+
PHASE 3: Score Variance (seeds=[0, 1, 2])
|
| 111 |
+
============================================================
|
| 112 |
+
|
| 113 |
+
--- LLM Agent: task=easy, seed=0, model=Bedrock/deepseek.v3.2 ---
|
| 114 |
+
Step 1: INSPECT acc_0043 flagged=10/10 suspects=0 steps_left=17
|
| 115 |
+
Step 2: SUBMIT flagged=10/10 suspects=0 steps_left=10
|
| 116 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.80
|
| 117 |
+
★ GRADER SCORE: 0.9333
|
| 118 |
+
|
| 119 |
+
--- LLM Agent: task=easy, seed=1, model=Bedrock/deepseek.v3.2 ---
|
| 120 |
+
Step 1: INSPECT acc_0034 flagged=1/10 suspects=8 steps_left=28
|
| 121 |
+
Step 2: INSPECT acc_0047 flagged=1/10 suspects=8 steps_left=26
|
| 122 |
+
Step 3: INSPECT acc_0002 flagged=1/10 suspects=8 steps_left=23
|
| 123 |
+
Step 4: FLAG acc_0047 flagged=2/10 suspects=8 steps_left=22
|
| 124 |
+
Step 5: INSPECT acc_0029 flagged=4/10 suspects=6 steps_left=19
|
| 125 |
+
Step 6: FLAG acc_0048 flagged=5/10 suspects=5 steps_left=18
|
| 126 |
+
Step 7: FLAG acc_0005 flagged=6/10 suspects=4 steps_left=17
|
| 127 |
+
Step 8: FLAG acc_0046 flagged=8/10 suspects=2 steps_left=17
|
| 128 |
+
Step 9: INSPECT acc_0014 flagged=9/10 suspects=1 steps_left=16
|
| 129 |
+
Step 10: FLAG acc_0014 flagged=10/10 suspects=0 steps_left=15
|
| 130 |
+
Step 11: SUBMIT flagged=10/10 suspects=0 steps_left=12
|
| 131 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.82
|
| 132 |
+
★ GRADER SCORE: 0.9400
|
| 133 |
+
|
| 134 |
+
--- LLM Agent: task=easy, seed=2, model=Bedrock/deepseek.v3.2 ---
|
| 135 |
+
Step 1: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=17
|
| 136 |
+
Step 2: FLAG acc_0033 flagged=1/10 suspects=9 steps_left=17
|
| 137 |
+
Step 3: INSPECT acc_0029 flagged=6/10 suspects=4 steps_left=7
|
| 138 |
+
Step 4: FLAG acc_0015 flagged=8/10 suspects=2 steps_left=7
|
| 139 |
+
Step 5: FLAG acc_0004 flagged=10/10 suspects=0 steps_left=7
|
| 140 |
+
Step 6: SUBMIT flagged=0/10 suspects=0 steps_left=48
|
| 141 |
+
→ Episode ended: [LOSS] TP=0 FP=0 FN=10 Recall=0.00 Precision=0.00 Episode reward=-2.02
|
| 142 |
+
★ GRADER SCORE: 0.0000
|
| 143 |
+
|
| 144 |
+
easy: scores=['0.933', '0.940', '0.000'] mean=0.6244 var=0.194966
|
| 145 |
+
|
| 146 |
+
--- LLM Agent: task=medium, seed=0, model=Bedrock/deepseek.v3.2 ---
|
| 147 |
+
Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=46
|
| 148 |
+
Step 2: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=49
|
| 149 |
+
Step 3: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=29
|
| 150 |
+
Step 4: INSPECT acc_0043 flagged=1/10 suspects=9 steps_left=47
|
| 151 |
+
Step 5: INSPECT acc_0144 flagged=1/10 suspects=9 steps_left=44
|
| 152 |
+
Step 6: INSPECT acc_0131 flagged=3/10 suspects=7 steps_left=41
|
| 153 |
+
Step 7: INSPECT acc_0181 flagged=5/10 suspects=5 steps_left=36
|
| 154 |
+
Step 8: INSPECT acc_0097 flagged=6/10 suspects=4 steps_left=35
|
| 155 |
+
Step 9: INSPECT acc_0187 flagged=7/10 suspects=3 steps_left=33
|
| 156 |
+
Step 10: FLAG acc_0187 flagged=8/10 suspects=2 steps_left=32
|
| 157 |
+
Step 11: INSPECT acc_0058 flagged=8/10 suspects=2 steps_left=30
|
| 158 |
+
Step 12: FLAG acc_0058 flagged=9/10 suspects=1 steps_left=29
|
| 159 |
+
Step 13: FLAG acc_0093 flagged=10/10 suspects=0 steps_left=27
|
| 160 |
+
Step 14: SUBMIT flagged=10/10 suspects=0 steps_left=26
|
| 161 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.76
|
| 162 |
+
★ GRADER SCORE: 0.9520
|
| 163 |
+
|
| 164 |
+
--- LLM Agent: task=medium, seed=1, model=Bedrock/deepseek.v3.2 ---
|
| 165 |
+
Step 1: INSPECT acc_0171 flagged=0/10 suspects=0 steps_left=47
|
| 166 |
+
Step 2: INSPECT acc_0099 flagged=0/10 suspects=0 steps_left=45
|
| 167 |
+
Step 3: INSPECT acc_0152 flagged=0/10 suspects=0 steps_left=42
|
| 168 |
+
Step 4: INSPECT acc_0078 flagged=0/10 suspects=0 steps_left=39
|
| 169 |
+
Step 5: INSPECT acc_0012 flagged=0/10 suspects=0 steps_left=37
|
| 170 |
+
Step 6: FLAG acc_0012 flagged=1/10 suspects=8 steps_left=36
|
| 171 |
+
Step 7: INSPECT acc_0174 flagged=2/10 suspects=8 steps_left=34
|
| 172 |
+
Step 8: FLAG acc_0174 flagged=3/10 suspects=7 steps_left=34
|
| 173 |
+
Step 9: INSPECT acc_0187 flagged=10/10 suspects=0 steps_left=24
|
| 174 |
+
Step 10: SUBMIT flagged=10/10 suspects=0 steps_left=23
|
| 175 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.73
|
| 176 |
+
★ GRADER SCORE: 0.9460
|
| 177 |
+
|
| 178 |
+
--- LLM Agent: task=medium, seed=2, model=Bedrock/deepseek.v3.2 ---
|
| 179 |
+
Step 1: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=47
|
| 180 |
+
Step 2: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=45
|
| 181 |
+
Step 3: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=43
|
| 182 |
+
Step 4: INSPECT acc_0030 flagged=0/10 suspects=0 steps_left=40
|
| 183 |
+
Step 5: INSPECT acc_0054 flagged=0/10 suspects=0 steps_left=38
|
| 184 |
+
Step 6: INSPECT acc_0199 flagged=0/10 suspects=0 steps_left=35
|
| 185 |
+
Step 7: INSPECT acc_0166 flagged=0/10 suspects=0 steps_left=32
|
| 186 |
+
Step 8: INSPECT acc_0082 flagged=0/10 suspects=0 steps_left=15
|
| 187 |
+
Step 9: INSPECT acc_0016 flagged=0/10 suspects=0 steps_left=28
|
| 188 |
+
Step 10: FLAG acc_0000 flagged=10/10 suspects=0 steps_left=3
|
| 189 |
+
[RETRY] Network error: HTTP Error 400: Bad Request — retrying in 3s (1/3)
|
| 190 |
+
[RETRY] Network error: HTTP Error 400: Bad Request — retrying in 6s (2/3)
|
| 191 |
+
Traceback (most recent call last):
|
| 192 |
+
File "/home/ubuntu/meta/meta-hack-26/deepseek_test_judge_eval.py", line 534, in <module>
|
| 193 |
+
test_variance(base, seeds=list(range(args.seeds)))
|
| 194 |
+
File "/home/ubuntu/meta/meta-hack-26/deepseek_test_judge_eval.py", line 458, in test_varia
|
| 195 |
+
nce
|
| 196 |
+
score = test_llm_agent(base_url, task=task, seed=seed)
|
| 197 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 198 |
+
File "/home/ubuntu/meta/meta-hack-26/deepseek_test_judge_eval.py", line 426, in test_llm_a
|
| 199 |
+
gent
|
| 200 |
+
grader = http_get(f"{base_url}/grader")
|
| 201 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 202 |
+
File "/home/ubuntu/meta/meta-hack-26/deepseek_test_judge_eval.py", line 116, in http_get
|
| 203 |
+
return _retry(_do)
|
| 204 |
+
^^^^^^^^^^^
|
| 205 |
+
File "/home/ubuntu/meta/meta-hack-26/deepseek_test_judge_eval.py", line 85, in _retry
|
| 206 |
+
return fn()
|
| 207 |
+
^^^^
|
| 208 |
+
File "/home/ubuntu/meta/meta-hack-26/deepseek_test_judge_eval.py", line 110, in _do
|
| 209 |
+
with urllib.request.urlopen(url, timeout=120) as resp:
|
| 210 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 211 |
+
File "/usr/lib/python3.12/urllib/request.py", line 215, in urlopen
|
| 212 |
+
return opener.open(url, data, timeout)
|
| 213 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 214 |
+
File "/usr/lib/python3.12/urllib/request.py", line 521, in open
|
| 215 |
+
response = meth(req, response)
|
| 216 |
+
^^^^^^^^^^^^^^^^^^^
|
| 217 |
+
File "/usr/lib/python3.12/urllib/request.py", line 630, in http_response
|
| 218 |
+
response = self.parent.error(
|
| 219 |
+
^^^^^^^^^^^^^^^^^^
|
| 220 |
+
File "/usr/lib/python3.12/urllib/request.py", line 559, in error
|
| 221 |
+
return self._call_chain(*args)
|
| 222 |
+
^^^^^^^^^^^^^^^^^^^^^^^
|
| 223 |
+
File "/usr/lib/python3.12/urllib/request.py", line 492, in _call_chain
|
| 224 |
+
result = func(*args)
|
| 225 |
+
^^^^^^^^^^^
|
| 226 |
+
File "/usr/lib/python3.12/urllib/request.py", line 639, in http_error_default
|
| 227 |
+
raise HTTPError(req.full_url, code, msg, hdrs, fp)
|
| 228 |
+
urllib.error.HTTPError: HTTP Error 400: Bad Request
|
| 229 |
+
ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$ python3 deepseek_test_judge_eval.py --url https:
|
| 230 |
+
//pandago-graphstrike.hf.space --bedrock
|
| 231 |
+
GraphStrike Judge Evaluation Simulator
|
| 232 |
+
Target: https://pandago-graphstrike.hf.space
|
| 233 |
+
Backend: bedrock
|
| 234 |
+
Model: Bedrock/deepseek.v3.2
|
| 235 |
+
Token: set
|
| 236 |
+
|
| 237 |
+
============================================================
|
| 238 |
+
PHASE 0: Endpoint Verification
|
| 239 |
+
============================================================
|
| 240 |
+
✓ GET /health
|
| 241 |
+
✓ GET /tasks
|
| 242 |
+
✓ GET /metadata
|
| 243 |
+
✓ GET /schema
|
| 244 |
+
✓ GET /web
|
| 245 |
+
✓ POST /reset
|
| 246 |
+
✓ GET /state
|
| 247 |
+
✓ POST /step
|
| 248 |
+
✓ POST /step
|
| 249 |
+
✓ GET /grader
|
| 250 |
+
✓ POST /mcp
|
| 251 |
+
✓ POST /baseline
|
| 252 |
+
|
| 253 |
+
============================================================
|
| 254 |
+
PHASE 1: Baseline Stability (3 runs)
|
| 255 |
+
============================================================
|
| 256 |
+
Run 1: easy=0.9100 medium=0.9060 hard=0.9038
|
| 257 |
+
Run 2: easy=0.9100 medium=0.9060 hard=0.9038
|
| 258 |
+
Run 3: easy=0.9100 medium=0.9060 hard=0.9038
|
| 259 |
+
✓ All 3 runs identical — baseline is deterministic
|
| 260 |
+
|
| 261 |
+
============================================================
|
| 262 |
+
PHASE 2: LLM Agent Evaluation (model=Bedrock/deepseek.v3.2)
|
| 263 |
+
============================================================
|
| 264 |
+
|
| 265 |
+
--- LLM Agent: task=easy, seed=0, model=Bedrock/deepseek.v3.2 ---
|
| 266 |
+
Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
|
| 267 |
+
Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
|
| 268 |
+
Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
|
| 269 |
+
Step 4: INSPECT acc_0001 flagged=1/10 suspects=8 steps_left=27
|
| 270 |
+
Step 5: INSPECT acc_0014 flagged=1/10 suspects=8 steps_left=26
|
| 271 |
+
Step 6: INSPECT acc_0012 flagged=1/10 suspects=8 steps_left=25
|
| 272 |
+
Step 7: INSPECT acc_0000 flagged=1/10 suspects=8 steps_left=24
|
| 273 |
+
Step 8: FLAG acc_0001 flagged=2/10 suspects=8 steps_left=24
|
| 274 |
+
Step 9: INSPECT acc_0027 flagged=2/10 suspects=8 steps_left=23
|
| 275 |
+
Step 10: INSPECT acc_0047 flagged=2/10 suspects=8 steps_left=22
|
| 276 |
+
Step 11: FLAG acc_0027 flagged=3/10 suspects=7 steps_left=22
|
| 277 |
+
Step 12: INSPECT acc_0007 flagged=3/10 suspects=7 steps_left=21
|
| 278 |
+
Step 13: FLAG acc_0047 flagged=4/10 suspects=6 steps_left=21
|
| 279 |
+
Step 14: FLAG acc_0014 flagged=1/10 suspects=1 steps_left=76
|
| 280 |
+
Step 15: INSPECT acc_0356 flagged=1/10 suspects=1 steps_left=75
|
| 281 |
+
Step 16: INSPECT acc_0523 flagged=1/10 suspects=1 steps_left=74
|
| 282 |
+
Step 17: INSPECT acc_0113 flagged=1/10 suspects=1 steps_left=73
|
| 283 |
+
Step 18: INSPECT acc_0797 flagged=1/10 suspects=1 steps_left=71
|
| 284 |
+
Step 19: INSPECT acc_0478 flagged=1/10 suspects=1 steps_left=70
|
| 285 |
+
Step 20: INSPECT acc_0861 flagged=1/10 suspects=1 steps_left=69
|
| 286 |
+
Step 21: INSPECT acc_0836 flagged=1/10 suspects=1 steps_left=68
|
| 287 |
+
Step 22: INSPECT acc_0926 flagged=1/10 suspects=1 steps_left=67
|
| 288 |
+
Step 23: INSPECT acc_0664 flagged=1/10 suspects=1 steps_left=64
|
| 289 |
+
Step 24: INSPECT acc_0255 flagged=1/10 suspects=1 steps_left=63
|
| 290 |
+
Step 25: INSPECT acc_0938 flagged=1/10 suspects=1 steps_left=62
|
| 291 |
+
Step 26: INSPECT acc_0672 flagged=1/10 suspects=1 steps_left=61
|
| 292 |
+
Step 27: FLAG acc_0672 flagged=2/10 suspects=7 steps_left=61
|
| 293 |
+
Step 28: INSPECT acc_0659 flagged=2/10 suspects=7 steps_left=60
|
| 294 |
+
Step 29: FLAG acc_0659 flagged=3/10 suspects=6 steps_left=60
|
| 295 |
+
Step 30: INSPECT acc_0290 flagged=3/10 suspects=6 steps_left=59
|
| 296 |
+
Step 31: INSPECT acc_0339 flagged=3/10 suspects=6 steps_left=58
|
| 297 |
+
Step 32: INSPECT acc_0544 flagged=3/10 suspects=6 steps_left=57
|
| 298 |
+
Step 33: INSPECT acc_0696 flagged=3/10 suspects=6 steps_left=56
|
| 299 |
+
Step 34: INSPECT acc_0541 flagged=3/10 suspects=6 steps_left=55
|
| 300 |
+
Step 35: FLAG acc_0290 flagged=4/10 suspects=8 steps_left=55
|
| 301 |
+
Step 36: INSPECT acc_0793 flagged=4/10 suspects=8 steps_left=54
|
| 302 |
+
Step 37: INSPECT acc_0214 flagged=4/10 suspects=8 steps_left=53
|
| 303 |
+
Step 38: INSPECT acc_0112 flagged=4/10 suspects=8 steps_left=52
|
| 304 |
+
Step 39: FLAG acc_0339 flagged=5/10 suspects=7 steps_left=52
|
| 305 |
+
Step 40: FLAG acc_0112 flagged=6/10 suspects=6 steps_left=52
|
| 306 |
+
Step 41: FLAG acc_0696 flagged=7/10 suspects=5 steps_left=52
|
| 307 |
+
Step 42: FLAG acc_0544 flagged=8/10 suspects=4 steps_left=52
|
| 308 |
+
Step 43: FLAG acc_0214 flagged=9/10 suspects=3 steps_left=52
|
| 309 |
+
Step 44: FLAG acc_0793 flagged=10/10 suspects=2 steps_left=52
|
| 310 |
+
Step 45: FLAG acc_0541 flagged=11/10 suspects=1 steps_left=52
|
| 311 |
+
Step 46: SUBMIT flagged=11/10 suspects=1 steps_left=52
|
| 312 |
+
→ Episode ended: [WIN] TP=10 FP=1 FN=0 Recall=1.00 Precision=0.91 Episode reward=17.22
|
| 313 |
+
★ GRADER SCORE: 0.9514
|
| 314 |
+
|
| 315 |
+
--- LLM Agent: task=medium, seed=0, model=Bedrock/deepseek.v3.2 ---
|
| 316 |
+
Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
|
| 317 |
+
Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
|
| 318 |
+
Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
|
| 319 |
+
Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=46
|
| 320 |
+
Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=45
|
| 321 |
+
Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
|
| 322 |
+
Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
|
| 323 |
+
Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
|
| 324 |
+
Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=40
|
| 325 |
+
Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=40
|
| 326 |
+
Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=38
|
| 327 |
+
Step 12: INSPECT acc_0022 flagged=1/10 suspects=8 steps_left=37
|
| 328 |
+
Step 13: FLAG acc_0022 flagged=2/10 suspects=8 steps_left=37
|
| 329 |
+
Step 14: INSPECT acc_0092 flagged=2/10 suspects=8 steps_left=36
|
| 330 |
+
Step 15: INSPECT acc_0097 flagged=2/10 suspects=8 steps_left=35
|
| 331 |
+
Step 16: FLAG acc_0097 flagged=3/10 suspects=7 steps_left=35
|
| 332 |
+
Step 17: FLAG acc_0181 flagged=4/10 suspects=6 steps_left=35
|
| 333 |
+
Step 18: INSPECT acc_0187 flagged=4/10 suspects=6 steps_left=34
|
| 334 |
+
Step 19: INSPECT acc_0093 flagged=4/10 suspects=6 steps_left=33
|
| 335 |
+
Step 20: FLAG acc_0093 flagged=5/10 suspects=5 steps_left=33
|
| 336 |
+
Step 21: FLAG acc_0187 flagged=6/10 suspects=4 steps_left=33
|
| 337 |
+
Step 22: INSPECT acc_0172 flagged=6/10 suspects=4 steps_left=32
|
| 338 |
+
Step 23: FLAG acc_0172 flagged=7/10 suspects=3 steps_left=32
|
| 339 |
+
Step 24: INSPECT acc_0058 flagged=7/10 suspects=3 steps_left=31
|
| 340 |
+
Step 25: FLAG acc_0058 flagged=8/10 suspects=2 steps_left=31
|
| 341 |
+
Step 26: INSPECT acc_0131 flagged=8/10 suspects=2 steps_left=30
|
| 342 |
+
Step 27: FLAG acc_0131 flagged=9/10 suspects=1 steps_left=30
|
| 343 |
+
Step 28: FLAG acc_0092 flagged=10/10 suspects=0 steps_left=30
|
| 344 |
+
Step 29: SUBMIT flagged=10/10 suspects=0 steps_left=30
|
| 345 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.80
|
| 346 |
+
★ GRADER SCORE: 0.9600
|
| 347 |
+
|
| 348 |
+
--- LLM Agent: task=hard, seed=0, model=Bedrock/deepseek.v3.2 ---
|
| 349 |
+
Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
|
| 350 |
+
Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
|
| 351 |
+
Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
|
| 352 |
+
Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
|
| 353 |
+
Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
|
| 354 |
+
Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
|
| 355 |
+
Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
|
| 356 |
+
Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
|
| 357 |
+
Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
|
| 358 |
+
Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
|
| 359 |
+
Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
|
| 360 |
+
Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
|
| 361 |
+
Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
|
| 362 |
+
Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
|
| 363 |
+
Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
|
| 364 |
+
Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
|
| 365 |
+
Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
|
| 366 |
+
Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
|
| 367 |
+
Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
|
| 368 |
+
Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
|
| 369 |
+
Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
|
| 370 |
+
Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
|
| 371 |
+
Step 23: INSPECT acc_0621 flagged=1/10 suspects=5 steps_left=58
|
| 372 |
+
Step 24: FLAG acc_0621 flagged=2/10 suspects=7 steps_left=58
|
| 373 |
+
Step 25: INSPECT acc_0389 flagged=2/10 suspects=7 steps_left=57
|
| 374 |
+
Step 26: INSPECT acc_0160 flagged=2/10 suspects=7 steps_left=56
|
| 375 |
+
Step 27: INSPECT acc_0549 flagged=2/10 suspects=7 steps_left=55
|
| 376 |
+
Step 28: INSPECT acc_0658 flagged=2/10 suspects=7 steps_left=54
|
| 377 |
+
Step 29: INSPECT acc_0290 flagged=2/10 suspects=7 steps_left=53
|
| 378 |
+
Step 30: INSPECT acc_0124 flagged=2/10 suspects=7 steps_left=52
|
| 379 |
+
Step 31: FLAG acc_0160 flagged=3/10 suspects=7 steps_left=52
|
| 380 |
+
Step 32: INSPECT acc_0507 flagged=3/10 suspects=7 steps_left=51
|
| 381 |
+
Step 33: FLAG acc_0237 flagged=4/10 suspects=6 steps_left=51
|
| 382 |
+
Step 34: FLAG acc_0549 flagged=5/10 suspects=5 steps_left=51
|
| 383 |
+
Step 35: FLAG acc_0290 flagged=6/10 suspects=4 steps_left=51
|
| 384 |
+
Step 36: FLAG acc_0389 flagged=8/10 suspects=10 steps_left=51
|
| 385 |
+
Step 37: INSPECT acc_0844 flagged=9/10 suspects=9 steps_left=50
|
| 386 |
+
Step 38: INSPECT acc_0436 flagged=9/10 suspects=9 steps_left=49
|
| 387 |
+
Step 39: INSPECT acc_0870 flagged=9/10 suspects=9 steps_left=48
|
| 388 |
+
Step 40: FLAG acc_0507 flagged=10/10 suspects=8 steps_left=48
|
| 389 |
+
Step 41: INSPECT acc_0142 flagged=10/10 suspects=8 steps_left=47
|
| 390 |
+
Step 42: INSPECT acc_0833 flagged=10/10 suspects=8 steps_left=46
|
| 391 |
+
Step 43: INSPECT acc_0271 flagged=10/10 suspects=8 steps_left=45
|
| 392 |
+
Step 44: FLAG acc_0124 flagged=11/10 suspects=7 steps_left=45
|
| 393 |
+
Step 45: SUBMIT flagged=11/10 suspects=7 steps_left=45
|
| 394 |
+
→ Episode ended: [WIN] TP=10 FP=1 FN=0 Recall=1.00 Precision=0.91 Episode reward=16.15
|
| 395 |
+
★ GRADER SCORE: 0.9426
|
| 396 |
+
|
| 397 |
+
Summary: easy=0.9514 medium=0.9600 hard=0.9426
|
| 398 |
+
|
| 399 |
+
============================================================
|
| 400 |
+
PHASE 3: Score Variance (seeds=[0, 1, 2])
|
| 401 |
+
============================================================
|
| 402 |
+
|
| 403 |
+
--- LLM Agent: task=easy, seed=0, model=Bedrock/deepseek.v3.2 ---
|
| 404 |
+
Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
|
| 405 |
+
Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
|
| 406 |
+
Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
|
| 407 |
+
Step 4: INSPECT acc_0001 flagged=1/10 suspects=8 steps_left=27
|
| 408 |
+
Step 5: INSPECT acc_0014 flagged=1/10 suspects=8 steps_left=26
|
| 409 |
+
Step 6: INSPECT acc_0012 flagged=1/10 suspects=8 steps_left=25
|
| 410 |
+
Step 7: INSPECT acc_0000 flagged=1/10 suspects=8 steps_left=24
|
| 411 |
+
Step 8: FLAG acc_0001 flagged=2/10 suspects=8 steps_left=24
|
| 412 |
+
Step 9: INSPECT acc_0027 flagged=2/10 suspects=8 steps_left=23
|
| 413 |
+
Step 10: FLAG acc_0027 flagged=3/10 suspects=7 steps_left=23
|
| 414 |
+
Step 11: INSPECT acc_0047 flagged=3/10 suspects=7 steps_left=22
|
| 415 |
+
Step 12: FLAG acc_0047 flagged=4/10 suspects=6 steps_left=22
|
| 416 |
+
Step 13: INSPECT acc_0007 flagged=4/10 suspects=6 steps_left=21
|
| 417 |
+
Step 14: INSPECT acc_0028 flagged=4/10 suspects=6 steps_left=20
|
| 418 |
+
Step 15: FLAG acc_0014 flagged=5/10 suspects=5 steps_left=20
|
| 419 |
+
Step 16: FLAG acc_0007 flagged=6/10 suspects=4 steps_left=20
|
| 420 |
+
Step 17: FLAG acc_0000 flagged=7/10 suspects=3 steps_left=20
|
| 421 |
+
Step 18: FLAG acc_0012 flagged=8/10 suspects=2 steps_left=20
|
| 422 |
+
Step 19: FLAG acc_0036 flagged=9/10 suspects=1 steps_left=20
|
| 423 |
+
Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
|
| 424 |
+
Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=20
|
| 425 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.90
|
| 426 |
+
★ GRADER SCORE: 0.9667
|
| 427 |
+
|
| 428 |
+
--- LLM Agent: task=easy, seed=1, model=Bedrock/deepseek.v3.2 ---
|
| 429 |
+
Step 1: INSPECT acc_0034 flagged=0/10 suspects=0 steps_left=29
|
| 430 |
+
Step 2: INSPECT acc_0003 flagged=0/10 suspects=0 steps_left=28
|
| 431 |
+
Step 3: INSPECT acc_0049 flagged=0/10 suspects=0 steps_left=27
|
| 432 |
+
Step 4: INSPECT acc_0006 flagged=0/10 suspects=0 steps_left=26
|
| 433 |
+
Step 5: INSPECT acc_0047 flagged=0/10 suspects=0 steps_left=25
|
| 434 |
+
Step 6: FLAG acc_0047 flagged=1/10 suspects=9 steps_left=25
|
| 435 |
+
Step 7: INSPECT acc_0009 flagged=1/10 suspects=9 steps_left=24
|
| 436 |
+
Step 8: INSPECT acc_0046 flagged=1/10 suspects=9 steps_left=23
|
| 437 |
+
Step 9: FLAG acc_0009 flagged=2/10 suspects=8 steps_left=23
|
| 438 |
+
Step 10: FLAG acc_0046 flagged=3/10 suspects=7 steps_left=23
|
| 439 |
+
Step 11: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=22
|
| 440 |
+
Step 12: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=22
|
| 441 |
+
Step 13: INSPECT acc_0021 flagged=4/10 suspects=6 steps_left=21
|
| 442 |
+
Step 14: FLAG acc_0021 flagged=5/10 suspects=5 steps_left=21
|
| 443 |
+
Step 15: INSPECT acc_0002 flagged=5/10 suspects=5 steps_left=20
|
| 444 |
+
Step 16: FLAG acc_0002 flagged=6/10 suspects=4 steps_left=20
|
| 445 |
+
Step 17: INSPECT acc_0048 flagged=6/10 suspects=4 steps_left=19
|
| 446 |
+
Step 18: INSPECT acc_0029 flagged=6/10 suspects=4 steps_left=18
|
| 447 |
+
Step 19: FLAG acc_0029 flagged=7/10 suspects=3 steps_left=18
|
| 448 |
+
Step 20: INSPECT acc_0015 flagged=7/10 suspects=3 steps_left=17
|
| 449 |
+
Step 21: FLAG acc_0015 flagged=8/10 suspects=2 steps_left=17
|
| 450 |
+
Step 22: FLAG acc_0048 flagged=9/10 suspects=1 steps_left=17
|
| 451 |
+
Step 23: INSPECT acc_0005 flagged=9/10 suspects=1 steps_left=16
|
| 452 |
+
Step 24: FLAG acc_0005 flagged=10/10 suspects=0 steps_left=16
|
| 453 |
+
Step 25: SUBMIT flagged=10/10 suspects=0 steps_left=16
|
| 454 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.86
|
| 455 |
+
★ GRADER SCORE: 0.9533
|
| 456 |
+
|
| 457 |
+
--- LLM Agent: task=easy, seed=2, model=Bedrock/deepseek.v3.2 ---
|
| 458 |
+
Step 1: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=30
|
| 459 |
+
★ GRADER SCORE: 0.0000
|
| 460 |
+
|
| 461 |
+
easy: scores=['0.967', '0.953', '0.000'] mean=0.6400 var=0.204830
|
| 462 |
+
|
| 463 |
+
--- LLM Agent: task=medium, seed=0, model=Bedrock/deepseek.v3.2 ---
|
| 464 |
+
Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
|
| 465 |
+
Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
|
| 466 |
+
Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
|
| 467 |
+
Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=46
|
| 468 |
+
Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=45
|
| 469 |
+
Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
|
| 470 |
+
Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
|
| 471 |
+
Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
|
| 472 |
+
Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
|
| 473 |
+
Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
|
| 474 |
+
Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=40
|
| 475 |
+
Step 12: INSPECT acc_0022 flagged=1/10 suspects=8 steps_left=39
|
| 476 |
+
Step 13: FLAG acc_0022 flagged=2/10 suspects=8 steps_left=39
|
| 477 |
+
Step 14: INSPECT acc_0092 flagged=2/10 suspects=8 steps_left=38
|
| 478 |
+
Step 15: INSPECT acc_0097 flagged=2/10 suspects=8 steps_left=37
|
| 479 |
+
Step 16: FLAG acc_0181 flagged=3/10 suspects=7 steps_left=37
|
| 480 |
+
Step 17: INSPECT acc_0187 flagged=3/10 suspects=7 steps_left=36
|
| 481 |
+
Step 18: FLAG acc_0092 flagged=4/10 suspects=6 steps_left=36
|
| 482 |
+
Step 19: INSPECT acc_0093 flagged=4/10 suspects=6 steps_left=35
|
| 483 |
+
Step 20: FLAG acc_0093 flagged=5/10 suspects=5 steps_left=35
|
| 484 |
+
Step 21: FLAG acc_0097 flagged=6/10 suspects=4 steps_left=35
|
| 485 |
+
Step 22: INSPECT acc_0172 flagged=6/10 suspects=4 steps_left=34
|
| 486 |
+
Step 23: FLAG acc_0187 flagged=7/10 suspects=3 steps_left=34
|
| 487 |
+
Step 24: INSPECT acc_0058 flagged=7/10 suspects=3 steps_left=33
|
| 488 |
+
Step 25: FLAG acc_0058 flagged=8/10 suspects=2 steps_left=33
|
| 489 |
+
Step 26: FLAG acc_0172 flagged=9/10 suspects=1 steps_left=33
|
| 490 |
+
Step 27: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=32
|
| 491 |
+
Step 28: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=32
|
| 492 |
+
Step 29: SUBMIT flagged=10/10 suspects=0 steps_left=32
|
| 493 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.82
|
| 494 |
+
★ GRADER SCORE: 0.9640
|
| 495 |
+
|
| 496 |
+
--- LLM Agent: task=medium, seed=1, model=Bedrock/deepseek.v3.2 ---
|
| 497 |
+
Step 1: INSPECT acc_0171 flagged=0/10 suspects=0 steps_left=49
|
| 498 |
+
Step 2: INSPECT acc_0099 flagged=0/10 suspects=0 steps_left=48
|
| 499 |
+
Step 3: INSPECT acc_0152 flagged=0/10 suspects=0 steps_left=47
|
| 500 |
+
Step 4: INSPECT acc_0092 flagged=0/10 suspects=0 steps_left=46
|
| 501 |
+
Step 5: INSPECT acc_0078 flagged=0/10 suspects=0 steps_left=45
|
| 502 |
+
Step 6: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=44
|
| 503 |
+
Step 7: INSPECT acc_0012 flagged=0/10 suspects=0 steps_left=43
|
| 504 |
+
Step 8: FLAG acc_0012 flagged=1/10 suspects=8 steps_left=43
|
| 505 |
+
Step 9: INSPECT acc_0033 flagged=1/10 suspects=8 steps_left=42
|
| 506 |
+
Step 10: FLAG acc_0033 flagged=2/10 suspects=8 steps_left=42
|
| 507 |
+
Step 11: INSPECT acc_0174 flagged=2/10 suspects=8 steps_left=41
|
| 508 |
+
Step 12: INSPECT acc_0187 flagged=2/10 suspects=8 steps_left=40
|
| 509 |
+
Step 13: FLAG acc_0187 flagged=3/10 suspects=7 steps_left=40
|
| 510 |
+
Step 14: INSPECT acc_0079 flagged=3/10 suspects=7 steps_left=39
|
| 511 |
+
Step 15: INSPECT acc_0032 flagged=3/10 suspects=7 steps_left=38
|
| 512 |
+
Step 16: INSPECT acc_0023 flagged=3/10 suspects=7 steps_left=37
|
| 513 |
+
Step 17: INSPECT acc_0146 flagged=3/10 suspects=7 steps_left=36
|
| 514 |
+
Step 18: FLAG acc_0174 flagged=4/10 suspects=6 steps_left=36
|
| 515 |
+
Step 19: INSPECT acc_0019 flagged=4/10 suspects=6 steps_left=35
|
| 516 |
+
Step 20: FLAG acc_0023 flagged=5/10 suspects=5 steps_left=35
|
| 517 |
+
Step 21: INSPECT acc_0131 flagged=5/10 suspects=5 steps_left=34
|
| 518 |
+
Step 22: FLAG acc_0079 flagged=6/10 suspects=4 steps_left=34
|
| 519 |
+
Step 23: FLAG acc_0019 flagged=7/10 suspects=3 steps_left=34
|
| 520 |
+
Step 24: FLAG acc_0146 flagged=8/10 suspects=2 steps_left=34
|
| 521 |
+
Step 25: FLAG acc_0131 flagged=9/10 suspects=1 steps_left=34
|
| 522 |
+
Step 26: FLAG acc_0032 flagged=10/10 suspects=0 steps_left=34
|
| 523 |
+
Step 27: SUBMIT flagged=10/10 suspects=0 steps_left=34
|
| 524 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.84
|
| 525 |
+
★ GRADER SCORE: 0.9680
|
| 526 |
+
|
| 527 |
+
--- LLM Agent: task=medium, seed=2, model=Bedrock/deepseek.v3.2 ---
|
| 528 |
+
Step 1: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=49
|
| 529 |
+
Step 2: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=48
|
| 530 |
+
Step 3: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=47
|
| 531 |
+
Step 4: INSPECT acc_0030 flagged=0/10 suspects=0 steps_left=46
|
| 532 |
+
Step 5: INSPECT acc_0041 flagged=0/10 suspects=0 steps_left=45
|
| 533 |
+
Step 6: INSPECT acc_0054 flagged=0/10 suspects=0 steps_left=44
|
| 534 |
+
Step 7: INSPECT acc_0199 flagged=0/10 suspects=0 steps_left=43
|
| 535 |
+
Step 8: INSPECT acc_0181 flagged=0/10 suspects=0 steps_left=42
|
| 536 |
+
Step 9: INSPECT acc_0166 flagged=0/10 suspects=0 steps_left=41
|
| 537 |
+
Step 10: INSPECT acc_0098 flagged=0/10 suspects=0 steps_left=40
|
| 538 |
+
Step 11: INSPECT acc_0121 flagged=0/10 suspects=0 steps_left=39
|
| 539 |
+
Step 12: INSPECT acc_0053 flagged=0/10 suspects=0 steps_left=38
|
| 540 |
+
Step 13: INSPECT acc_0103 flagged=0/10 suspects=0 steps_left=37
|
| 541 |
+
Step 14: INSPECT acc_0000 flagged=0/10 suspects=0 steps_left=36
|
| 542 |
+
Step 15: INSPECT acc_0168 flagged=0/10 suspects=0 steps_left=35
|
| 543 |
+
Step 16: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=34
|
| 544 |
+
Step 17: INSPECT acc_0149 flagged=0/10 suspects=0 steps_left=33
|
| 545 |
+
Step 18: INSPECT acc_0064 flagged=0/10 suspects=0 steps_left=32
|
| 546 |
+
Step 19: INSPECT acc_0016 flagged=0/10 suspects=0 steps_left=31
|
| 547 |
+
Step 20: INSPECT acc_0105 flagged=0/10 suspects=0 steps_left=30
|
| 548 |
+
Step 21: INSPECT acc_0035 flagged=0/10 suspects=0 steps_left=29
|
| 549 |
+
Step 22: FLAG acc_0035 flagged=1/10 suspects=9 steps_left=29
|
| 550 |
+
Step 23: INSPECT acc_0020 flagged=1/10 suspects=9 steps_left=28
|
| 551 |
+
Step 24: INSPECT acc_0036 flagged=1/10 suspects=9 steps_left=27
|
| 552 |
+
Step 25: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=27
|
| 553 |
+
Step 26: INSPECT acc_0050 flagged=2/10 suspects=8 steps_left=26
|
| 554 |
+
Step 27: FLAG acc_0020 flagged=3/10 suspects=7 steps_left=26
|
| 555 |
+
Step 28: INSPECT acc_0051 flagged=3/10 suspects=7 steps_left=25
|
| 556 |
+
Step 29: INSPECT acc_0085 flagged=3/10 suspects=7 steps_left=24
|
| 557 |
+
Step 30: FLAG acc_0050 flagged=4/10 suspects=6 steps_left=24
|
| 558 |
+
Step 31: FLAG acc_0085 flagged=5/10 suspects=5 steps_left=24
|
| 559 |
+
Step 32: INSPECT acc_0177 flagged=5/10 suspects=5 steps_left=23
|
| 560 |
+
Step 33: INSPECT acc_0170 flagged=5/10 suspects=5 steps_left=22
|
| 561 |
+
Step 34: FLAG acc_0170 flagged=6/10 suspects=4 steps_left=22
|
| 562 |
+
[LLM RETRY] An error occurred (ValidationException) when calling the Converse operation:
|
| 563 |
+
The model returned the following errors: {"error":{"code":"validation_error","message":"Int
|
| 564 |
+
ernal server error","param":null,"type":"invalid_request_error"}} — retrying in 3s
|
| 565 |
+
Step 35: INSPECT acc_0055 flagged=6/10 suspects=4 steps_left=21
|
| 566 |
+
Step 36: FLAG acc_0177 flagged=7/10 suspects=3 steps_left=21
|
| 567 |
+
Step 37: INSPECT acc_0094 flagged=7/10 suspects=3 steps_left=20
|
| 568 |
+
Step 38: FLAG acc_0094 flagged=8/10 suspects=2 steps_left=20
|
| 569 |
+
Step 39: FLAG acc_0055 flagged=9/10 suspects=1 steps_left=20
|
| 570 |
+
Step 40: FLAG acc_0051 flagged=10/10 suspects=0 steps_left=20
|
| 571 |
+
Step 41: SUBMIT flagged=10/10 suspects=0 steps_left=20
|
| 572 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.70
|
| 573 |
+
★ GRADER SCORE: 0.9400
|
| 574 |
+
|
| 575 |
+
medium: scores=['0.964', '0.968', '0.940'] mean=0.9573 var=0.000153
|
| 576 |
+
|
| 577 |
+
--- LLM Agent: task=hard, seed=0, model=Bedrock/deepseek.v3.2 ---
|
| 578 |
+
Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
|
| 579 |
+
Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
|
| 580 |
+
Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
|
| 581 |
+
Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
|
| 582 |
+
Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
|
| 583 |
+
Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
|
| 584 |
+
Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
|
| 585 |
+
Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
|
| 586 |
+
Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
|
| 587 |
+
Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
|
| 588 |
+
Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
|
| 589 |
+
Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
|
| 590 |
+
Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
|
| 591 |
+
Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
|
| 592 |
+
Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
|
| 593 |
+
Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
|
| 594 |
+
Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
|
| 595 |
+
Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
|
| 596 |
+
Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
|
| 597 |
+
Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
|
| 598 |
+
Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
|
| 599 |
+
Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
|
| 600 |
+
Step 23: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=59
|
| 601 |
+
Step 24: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=58
|
| 602 |
+
Step 25: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=58
|
| 603 |
+
Step 26: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=57
|
| 604 |
+
Step 27: INSPECT acc_0160 flagged=3/10 suspects=6 steps_left=56
|
| 605 |
+
Step 28: INSPECT acc_0549 flagged=3/10 suspects=6 steps_left=55
|
| 606 |
+
Step 29: INSPECT acc_0658 flagged=3/10 suspects=6 steps_left=54
|
| 607 |
+
Step 30: FLAG acc_0160 flagged=4/10 suspects=6 steps_left=54
|
| 608 |
+
Step 31: INSPECT acc_0290 flagged=4/10 suspects=6 steps_left=53
|
| 609 |
+
Step 32: INSPECT acc_0124 flagged=4/10 suspects=6 steps_left=52
|
| 610 |
+
Step 33: INSPECT acc_0507 flagged=4/10 suspects=6 steps_left=51
|
| 611 |
+
Step 34: FLAG acc_0549 flagged=5/10 suspects=5 steps_left=51
|
| 612 |
+
Step 35: FLAG acc_0290 flagged=6/10 suspects=4 steps_left=51
|
| 613 |
+
Step 36: FLAG acc_0389 flagged=7/10 suspects=3 steps_left=51
|
| 614 |
+
Step 37: FLAG acc_0658 flagged=8/10 suspects=2 steps_left=51
|
| 615 |
+
Step 38: FLAG acc_0507 flagged=9/10 suspects=1 steps_left=51
|
| 616 |
+
Step 39: FLAG acc_0124 flagged=10/10 suspects=0 steps_left=51
|
| 617 |
+
Step 40: SUBMIT flagged=10/10 suspects=0 steps_left=51
|
| 618 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.71
|
| 619 |
+
★ GRADER SCORE: 0.9637
|
| 620 |
+
|
| 621 |
+
--- LLM Agent: task=hard, seed=1, model=Bedrock/deepseek.v3.2 ---
|
| 622 |
+
Step 1: INSPECT acc_0014 flagged=0/10 suspects=0 steps_left=79
|
| 623 |
+
Step 2: INSPECT acc_0835 flagged=0/10 suspects=0 steps_left=78
|
| 624 |
+
Step 3: INSPECT acc_0855 flagged=0/10 suspects=0 steps_left=77
|
| 625 |
+
Step 4: INSPECT acc_0930 flagged=0/10 suspects=0 steps_left=76
|
| 626 |
+
Step 5: INSPECT acc_0336 flagged=0/10 suspects=0 steps_left=75
|
| 627 |
+
Step 6: INSPECT acc_0929 flagged=0/10 suspects=0 steps_left=74
|
| 628 |
+
Step 7: INSPECT acc_0076 flagged=0/10 suspects=0 steps_left=73
|
| 629 |
+
Step 8: INSPECT acc_0543 flagged=0/10 suspects=0 steps_left=72
|
| 630 |
+
Step 9: INSPECT acc_0590 flagged=0/10 suspects=0 steps_left=71
|
| 631 |
+
Step 10: INSPECT acc_0401 flagged=0/10 suspects=0 steps_left=70
|
| 632 |
+
Step 11: INSPECT acc_0322 flagged=0/10 suspects=0 steps_left=69
|
| 633 |
+
Step 12: INSPECT acc_0154 flagged=0/10 suspects=0 steps_left=68
|
| 634 |
+
Step 13: INSPECT acc_0374 flagged=0/10 suspects=0 steps_left=67
|
| 635 |
+
Step 14: INSPECT acc_0549 flagged=0/10 suspects=0 steps_left=66
|
| 636 |
+
Step 15: INSPECT acc_0903 flagged=0/10 suspects=0 steps_left=65
|
| 637 |
+
Step 16: INSPECT acc_0976 flagged=0/10 suspects=0 steps_left=64
|
| 638 |
+
Step 17: INSPECT acc_0620 flagged=0/10 suspects=0 steps_left=63
|
| 639 |
+
Step 18: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=62
|
| 640 |
+
Step 19: INSPECT acc_0222 flagged=0/10 suspects=0 steps_left=61
|
| 641 |
+
Step 20: INSPECT acc_0536 flagged=0/10 suspects=0 steps_left=60
|
| 642 |
+
Step 21: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=59
|
| 643 |
+
Step 22: INSPECT acc_0577 flagged=0/10 suspects=0 steps_left=58
|
| 644 |
+
Step 23: INSPECT acc_0517 flagged=0/10 suspects=0 steps_left=57
|
| 645 |
+
Step 24: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=56
|
| 646 |
+
Step 25: INSPECT acc_0167 flagged=0/10 suspects=0 steps_left=55
|
| 647 |
+
Step 26: INSPECT acc_0697 flagged=0/10 suspects=0 steps_left=54
|
| 648 |
+
Step 27: INSPECT acc_0271 flagged=0/10 suspects=0 steps_left=53
|
| 649 |
+
Step 28: INSPECT acc_0681 flagged=0/10 suspects=0 steps_left=52
|
| 650 |
+
Step 29: INSPECT acc_0530 flagged=0/10 suspects=0 steps_left=51
|
| 651 |
+
Step 30: INSPECT acc_0353 flagged=0/10 suspects=0 steps_left=50
|
| 652 |
+
Step 31: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=49
|
| 653 |
+
Step 32: INSPECT acc_0777 flagged=0/10 suspects=0 steps_left=48
|
| 654 |
+
Step 33: INSPECT acc_0265 flagged=0/10 suspects=0 steps_left=47
|
| 655 |
+
Step 34: INSPECT acc_0788 flagged=0/10 suspects=0 steps_left=46
|
| 656 |
+
Step 35: INSPECT acc_0033 flagged=0/10 suspects=0 steps_left=45
|
| 657 |
+
Step 36: INSPECT acc_0187 flagged=0/10 suspects=0 steps_left=44
|
| 658 |
+
Step 37: INSPECT acc_0445 flagged=0/10 suspects=0 steps_left=43
|
| 659 |
+
Step 38: INSPECT acc_0846 flagged=0/10 suspects=0 steps_left=42
|
| 660 |
+
Step 39: INSPECT acc_0659 flagged=0/10 suspects=0 steps_left=41
|
| 661 |
+
Step 40: INSPECT acc_0768 flagged=0/10 suspects=0 steps_left=40
|
| 662 |
+
Step 41: INSPECT acc_0677 flagged=0/10 suspects=0 steps_left=39
|
| 663 |
+
Step 42: INSPECT acc_0539 flagged=0/10 suspects=0 steps_left=38
|
| 664 |
+
Step 43: INSPECT acc_0742 flagged=0/10 suspects=0 steps_left=37
|
| 665 |
+
Step 44: INSPECT acc_0503 flagged=0/10 suspects=0 steps_left=36
|
| 666 |
+
Step 45: INSPECT acc_0876 flagged=0/10 suspects=0 steps_left=35
|
| 667 |
+
Step 46: INSPECT acc_0639 flagged=0/10 suspects=0 steps_left=34
|
| 668 |
+
Step 47: INSPECT acc_0494 flagged=0/10 suspects=0 steps_left=33
|
| 669 |
+
Step 48: INSPECT acc_0898 flagged=0/10 suspects=0 steps_left=32
|
| 670 |
+
Step 49: INSPECT acc_0553 flagged=0/10 suspects=0 steps_left=31
|
| 671 |
+
Step 50: INSPECT acc_0588 flagged=0/10 suspects=0 steps_left=30
|
| 672 |
+
Step 51: INSPECT acc_0194 flagged=0/10 suspects=0 steps_left=29
|
| 673 |
+
Step 52: INSPECT acc_0810 flagged=0/10 suspects=0 steps_left=28
|
| 674 |
+
Step 53: INSPECT acc_0355 flagged=0/10 suspects=0 steps_left=27
|
| 675 |
+
Step 54: INSPECT acc_0363 flagged=0/10 suspects=0 steps_left=26
|
| 676 |
+
Step 55: INSPECT acc_0221 flagged=0/10 suspects=0 steps_left=25
|
| 677 |
+
Step 56: INSPECT acc_0580 flagged=0/10 suspects=0 steps_left=24
|
| 678 |
+
Step 57: INSPECT acc_0534 flagged=0/10 suspects=0 steps_left=23
|
| 679 |
+
Step 58: INSPECT acc_0778 flagged=0/10 suspects=0 steps_left=22
|
| 680 |
+
Step 59: INSPECT acc_0998 flagged=0/10 suspects=0 steps_left=21
|
| 681 |
+
Step 60: INSPECT acc_0233 flagged=0/10 suspects=0 steps_left=20
|
| 682 |
+
Step 61: INSPECT acc_0052 flagged=0/10 suspects=0 steps_left=19
|
| 683 |
+
Step 62: INSPECT acc_0813 flagged=0/10 suspects=0 steps_left=18
|
| 684 |
+
Step 63: INSPECT acc_0035 flagged=0/10 suspects=0 steps_left=17
|
| 685 |
+
Step 64: INSPECT acc_0667 flagged=0/10 suspects=0 steps_left=16
|
| 686 |
+
Step 65: INSPECT acc_0019 flagged=0/10 suspects=0 steps_left=15
|
| 687 |
+
Step 66: INSPECT acc_0959 flagged=0/10 suspects=0 steps_left=14
|
| 688 |
+
Step 67: INSPECT acc_0212 flagged=0/10 suspects=0 steps_left=13
|
| 689 |
+
Step 68: INSPECT acc_0776 flagged=0/10 suspects=0 steps_left=12
|
| 690 |
+
Step 69: INSPECT acc_0049 flagged=0/10 suspects=0 steps_left=11
|
| 691 |
+
Step 70: INSPECT acc_0434 flagged=0/10 suspects=0 steps_left=10
|
| 692 |
+
Step 71: INSPECT acc_0827 flagged=0/10 suspects=0 steps_left=9
|
| 693 |
+
Step 72: INSPECT acc_0583 flagged=0/10 suspects=0 steps_left=8
|
| 694 |
+
Step 73: INSPECT acc_0065 flagged=0/10 suspects=0 steps_left=7
|
| 695 |
+
Step 74: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=6
|
| 696 |
+
Step 75: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=5
|
| 697 |
+
Step 76: INSPECT acc_0761 flagged=0/10 suspects=0 steps_left=4
|
| 698 |
+
Step 77: INSPECT acc_0995 flagged=0/10 suspects=0 steps_left=3
|
| 699 |
+
Step 78: INSPECT acc_0157 flagged=0/10 suspects=0 steps_left=2
|
| 700 |
+
Step 79: INSPECT acc_0936 flagged=0/10 suspects=0 steps_left=1
|
| 701 |
+
Step 80: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=0
|
| 702 |
+
→ Episode ended: [LOSS] TP=0 FP=0 FN=10 Recall=0.00 Precision=0.00 Episode reward=-9.80
|
| 703 |
+
★ GRADER SCORE: 0.0000
|
| 704 |
+
|
| 705 |
+
--- LLM Agent: task=hard, seed=2, model=Bedrock/deepseek.v3.2 ---
|
| 706 |
+
Step 1: INSPECT acc_0813 flagged=0/10 suspects=0 steps_left=79
|
| 707 |
+
Step 2: INSPECT acc_0430 flagged=0/10 suspects=0 steps_left=78
|
| 708 |
+
Step 3: INSPECT acc_0817 flagged=0/10 suspects=0 steps_left=77
|
| 709 |
+
Step 4: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=76
|
| 710 |
+
Step 5: INSPECT acc_0523 flagged=0/10 suspects=0 steps_left=75
|
| 711 |
+
Step 6: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=74
|
| 712 |
+
Step 7: INSPECT acc_0797 flagged=0/10 suspects=0 steps_left=73
|
| 713 |
+
Step 8: INSPECT acc_0478 flagged=0/10 suspects=0 steps_left=72
|
| 714 |
+
Step 9: INSPECT acc_0861 flagged=0/10 suspects=0 steps_left=71
|
| 715 |
+
Step 10: INSPECT acc_0836 flagged=0/10 suspects=0 steps_left=70
|
| 716 |
+
Step 11: INSPECT acc_0926 flagged=0/10 suspects=0 steps_left=69
|
| 717 |
+
Step 12: INSPECT acc_0664 flagged=0/10 suspects=0 steps_left=68
|
| 718 |
+
Step 13: INSPECT acc_0255 flagged=0/10 suspects=0 steps_left=67
|
| 719 |
+
Step 14: INSPECT acc_0938 flagged=0/10 suspects=0 steps_left=66
|
| 720 |
+
Step 15: INSPECT acc_0672 flagged=0/10 suspects=0 steps_left=65
|
| 721 |
+
Step 16: FLAG acc_0672 flagged=1/10 suspects=6 steps_left=65
|
| 722 |
+
Step 17: INSPECT acc_0659 flagged=1/10 suspects=6 steps_left=64
|
| 723 |
+
Step 18: INSPECT acc_0290 flagged=1/10 suspects=6 steps_left=63
|
| 724 |
+
Step 19: FLAG acc_0290 flagged=2/10 suspects=6 steps_left=63
|
| 725 |
+
Step 20: INSPECT acc_0339 flagged=2/10 suspects=6 steps_left=62
|
| 726 |
+
Step 21: FLAG acc_0659 flagged=3/10 suspects=7 steps_left=62
|
| 727 |
+
Step 22: INSPECT acc_0544 flagged=3/10 suspects=7 steps_left=61
|
| 728 |
+
Step 23: FLAG acc_0339 flagged=4/10 suspects=6 steps_left=61
|
| 729 |
+
Step 24: INSPECT acc_0696 flagged=4/10 suspects=6 steps_left=60
|
| 730 |
+
Step 25: FLAG acc_0544 flagged=5/10 suspects=5 steps_left=60
|
| 731 |
+
Step 26: INSPECT acc_0541 flagged=5/10 suspects=5 steps_left=59
|
| 732 |
+
Step 27: FLAG acc_0696 flagged=6/10 suspects=4 steps_left=59
|
| 733 |
+
Step 28: FLAG acc_0541 flagged=7/10 suspects=3 steps_left=59
|
| 734 |
+
Step 29: INSPECT acc_0793 flagged=7/10 suspects=3 steps_left=58
|
| 735 |
+
Step 30: FLAG acc_0793 flagged=8/10 suspects=2 steps_left=58
|
| 736 |
+
Step 31: INSPECT acc_0214 flagged=8/10 suspects=2 steps_left=57
|
| 737 |
+
Step 32: INSPECT acc_0112 flagged=8/10 suspects=2 steps_left=56
|
| 738 |
+
Step 33: FLAG acc_0112 flagged=9/10 suspects=1 steps_left=56
|
| 739 |
+
Step 34: FLAG acc_0214 flagged=10/10 suspects=0 steps_left=56
|
| 740 |
+
Step 35: SUBMIT flagged=10/10 suspects=0 steps_left=56
|
| 741 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.76
|
| 742 |
+
★ GRADER SCORE: 0.9700
|
| 743 |
+
|
| 744 |
+
hard: scores=['0.964', '0.000', '0.970'] mean=0.6446 var=0.207740
|
| 745 |
+
|
| 746 |
+
============================================================
|
| 747 |
+
EVALUATION COMPLETE
|
| 748 |
+
============================================================
|
| 749 |
+
ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$
|
model-benchmark-logs/gemma_judge_log.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
model-benchmark-logs/meta_judge_log.txt
ADDED
|
@@ -0,0 +1,826 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ubuntu@ip-172-31-33-59:~/meta/meta-hack-26/eval-models$ python3 llama_test_judge_eval.py --url https://pandago-graphstrike.hf.space --bedrock
|
| 2 |
+
GraphStrike Judge Evaluation Simulator
|
| 3 |
+
Target: https://pandago-graphstrike.hf.space
|
| 4 |
+
Backend: bedrock
|
| 5 |
+
Model: Bedrock/us.meta.llama4-scout-17b-instruct-v1:0
|
| 6 |
+
Token: set
|
| 7 |
+
|
| 8 |
+
============================================================
|
| 9 |
+
PHASE 0: Endpoint Verification
|
| 10 |
+
============================================================
|
| 11 |
+
✓ GET /health
|
| 12 |
+
✓ GET /tasks
|
| 13 |
+
✓ GET /metadata
|
| 14 |
+
✓ GET /schema
|
| 15 |
+
✓ GET /web
|
| 16 |
+
✓ POST /reset
|
| 17 |
+
✓ GET /state
|
| 18 |
+
✓ POST /step
|
| 19 |
+
✓ POST /step
|
| 20 |
+
✓ GET /grader
|
| 21 |
+
✓ POST /mcp
|
| 22 |
+
✓ POST /baseline
|
| 23 |
+
|
| 24 |
+
============================================================
|
| 25 |
+
PHASE 1: Baseline Stability (3 runs)
|
| 26 |
+
============================================================
|
| 27 |
+
Run 1: easy=0.9100 medium=0.9060 hard=0.9038
|
| 28 |
+
Run 2: easy=0.9100 medium=0.9060 hard=0.9038
|
| 29 |
+
Run 3: easy=0.9100 medium=0.9060 hard=0.9038
|
| 30 |
+
✓ All 3 runs identical — baseline is deterministic
|
| 31 |
+
|
| 32 |
+
============================================================
|
| 33 |
+
PHASE 2: LLM Agent Evaluation (model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0)
|
| 34 |
+
============================================================
|
| 35 |
+
|
| 36 |
+
--- LLM Agent: task=easy, seed=0, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
|
| 37 |
+
Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
|
| 38 |
+
Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
|
| 39 |
+
Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
|
| 40 |
+
Step 4: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=28
|
| 41 |
+
Step 5: INSPECT acc_0001 flagged=2/10 suspects=8 steps_left=27
|
| 42 |
+
Step 6: FLAG acc_0001 flagged=3/10 suspects=7 steps_left=27
|
| 43 |
+
Step 7: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=26
|
| 44 |
+
Step 8: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=26
|
| 45 |
+
Step 9: INSPECT acc_0012 flagged=4/10 suspects=6 steps_left=25
|
| 46 |
+
Step 10: FLAG acc_0012 flagged=5/10 suspects=5 steps_left=25
|
| 47 |
+
Step 11: INSPECT acc_0000 flagged=5/10 suspects=5 steps_left=24
|
| 48 |
+
Step 12: FLAG acc_0000 flagged=6/10 suspects=4 steps_left=24
|
| 49 |
+
Step 13: INSPECT acc_0027 flagged=6/10 suspects=4 steps_left=23
|
| 50 |
+
Step 14: FLAG acc_0027 flagged=7/10 suspects=3 steps_left=23
|
| 51 |
+
Step 15: INSPECT acc_0047 flagged=7/10 suspects=3 steps_left=22
|
| 52 |
+
Step 16: FLAG acc_0047 flagged=8/10 suspects=2 steps_left=22
|
| 53 |
+
Step 17: INSPECT acc_0007 flagged=8/10 suspects=2 steps_left=21
|
| 54 |
+
Step 18: FLAG acc_0007 flagged=9/10 suspects=1 steps_left=21
|
| 55 |
+
Step 19: INSPECT acc_0028 flagged=9/10 suspects=1 steps_left=20
|
| 56 |
+
Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
|
| 57 |
+
Step 21: INSPECT acc_0035 flagged=10/10 suspects=0 steps_left=19
|
| 58 |
+
Step 22: INSPECT acc_0022 flagged=10/10 suspects=0 steps_left=18
|
| 59 |
+
Step 23: INSPECT acc_0041 flagged=10/10 suspects=0 steps_left=17
|
| 60 |
+
Step 24: INSPECT acc_0029 flagged=10/10 suspects=0 steps_left=16
|
| 61 |
+
Step 25: INSPECT acc_0018 flagged=10/10 suspects=0 steps_left=15
|
| 62 |
+
Step 26: INSPECT acc_0019 flagged=10/10 suspects=0 steps_left=14
|
| 63 |
+
Step 27: INSPECT acc_0041 flagged=10/10 suspects=0 steps_left=13
|
| 64 |
+
Step 28: INSPECT acc_0040 flagged=10/10 suspects=0 steps_left=12
|
| 65 |
+
Step 29: INSPECT acc_0026 flagged=10/10 suspects=0 steps_left=11
|
| 66 |
+
Step 30: INSPECT acc_0021 flagged=10/10 suspects=0 steps_left=10
|
| 67 |
+
Step 31: INSPECT acc_0009 flagged=10/10 suspects=0 steps_left=9
|
| 68 |
+
Step 32: INSPECT acc_0033 flagged=10/10 suspects=0 steps_left=8
|
| 69 |
+
Step 33: INSPECT acc_0044 flagged=10/10 suspects=0 steps_left=7
|
| 70 |
+
Step 34: SUBMIT flagged=10/10 suspects=0 steps_left=7
|
| 71 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.77
|
| 72 |
+
★ GRADER SCORE: 0.9233
|
| 73 |
+
|
| 74 |
+
--- LLM Agent: task=medium, seed=0, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
|
| 75 |
+
Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
|
| 76 |
+
Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
|
| 77 |
+
Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
|
| 78 |
+
Step 4: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=46
|
| 79 |
+
Step 5: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=45
|
| 80 |
+
Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
|
| 81 |
+
Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
|
| 82 |
+
Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
|
| 83 |
+
Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
|
| 84 |
+
Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
|
| 85 |
+
Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=40
|
| 86 |
+
Step 12: FLAG acc_0181 flagged=2/10 suspects=8 steps_left=40
|
| 87 |
+
Step 13: INSPECT acc_0022 flagged=2/10 suspects=8 steps_left=39
|
| 88 |
+
Step 14: FLAG acc_0022 flagged=3/10 suspects=7 steps_left=39
|
| 89 |
+
Step 15: INSPECT acc_0092 flagged=3/10 suspects=7 steps_left=38
|
| 90 |
+
Step 16: FLAG acc_0092 flagged=4/10 suspects=6 steps_left=38
|
| 91 |
+
Step 17: INSPECT acc_0097 flagged=4/10 suspects=6 steps_left=37
|
| 92 |
+
Step 18: FLAG acc_0097 flagged=5/10 suspects=5 steps_left=37
|
| 93 |
+
Step 19: INSPECT acc_0187 flagged=5/10 suspects=5 steps_left=36
|
| 94 |
+
Step 20: FLAG acc_0187 flagged=6/10 suspects=4 steps_left=36
|
| 95 |
+
Step 21: INSPECT acc_0093 flagged=6/10 suspects=4 steps_left=35
|
| 96 |
+
Step 22: FLAG acc_0093 flagged=7/10 suspects=3 steps_left=35
|
| 97 |
+
Step 23: INSPECT acc_0172 flagged=7/10 suspects=3 steps_left=34
|
| 98 |
+
Step 24: FLAG acc_0172 flagged=8/10 suspects=2 steps_left=34
|
| 99 |
+
Step 25: INSPECT acc_0058 flagged=8/10 suspects=2 steps_left=33
|
| 100 |
+
Step 26: FLAG acc_0058 flagged=9/10 suspects=1 steps_left=33
|
| 101 |
+
Step 27: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=32
|
| 102 |
+
Step 28: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=32
|
| 103 |
+
Step 29: INSPECT acc_0148 flagged=10/10 suspects=0 steps_left=31
|
| 104 |
+
Step 30: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=30
|
| 105 |
+
Step 31: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=29
|
| 106 |
+
Step 32: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=28
|
| 107 |
+
Step 33: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=27
|
| 108 |
+
Step 34: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=26
|
| 109 |
+
Step 35: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=25
|
| 110 |
+
Step 36: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=24
|
| 111 |
+
Step 37: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=23
|
| 112 |
+
Step 38: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=22
|
| 113 |
+
Step 39: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=21
|
| 114 |
+
Step 40: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=20
|
| 115 |
+
Step 41: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=19
|
| 116 |
+
Step 42: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=18
|
| 117 |
+
Step 43: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=17
|
| 118 |
+
Step 44: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=16
|
| 119 |
+
Step 45: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=15
|
| 120 |
+
Step 46: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=14
|
| 121 |
+
Step 47: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=13
|
| 122 |
+
Step 48: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=12
|
| 123 |
+
Step 49: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=11
|
| 124 |
+
Step 50: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=10
|
| 125 |
+
Step 51: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=9
|
| 126 |
+
Step 52: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=8
|
| 127 |
+
Step 53: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=7
|
| 128 |
+
Step 54: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=6
|
| 129 |
+
Step 55: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=5
|
| 130 |
+
Step 56: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=4
|
| 131 |
+
Step 57: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=3
|
| 132 |
+
Step 58: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=2
|
| 133 |
+
Step 59: SUBMIT flagged=10/10 suspects=0 steps_left=2
|
| 134 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.52
|
| 135 |
+
★ GRADER SCORE: 0.9040
|
| 136 |
+
|
| 137 |
+
--- LLM Agent: task=hard, seed=0, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
|
| 138 |
+
Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
|
| 139 |
+
Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
|
| 140 |
+
Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
|
| 141 |
+
Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
|
| 142 |
+
Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
|
| 143 |
+
Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
|
| 144 |
+
Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
|
| 145 |
+
Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
|
| 146 |
+
Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
|
| 147 |
+
Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
|
| 148 |
+
Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
|
| 149 |
+
Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
|
| 150 |
+
Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
|
| 151 |
+
Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
|
| 152 |
+
Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
|
| 153 |
+
Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
|
| 154 |
+
Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
|
| 155 |
+
Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
|
| 156 |
+
Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
|
| 157 |
+
Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
|
| 158 |
+
Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
|
| 159 |
+
Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
|
| 160 |
+
Step 23: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=59
|
| 161 |
+
Step 24: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=58
|
| 162 |
+
Step 25: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=58
|
| 163 |
+
Step 26: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=57
|
| 164 |
+
Step 27: FLAG acc_0389 flagged=4/10 suspects=6 steps_left=57
|
| 165 |
+
Step 28: INSPECT acc_0160 flagged=4/10 suspects=6 steps_left=56
|
| 166 |
+
Step 29: FLAG acc_0160 flagged=5/10 suspects=5 steps_left=56
|
| 167 |
+
Step 30: INSPECT acc_0549 flagged=5/10 suspects=5 steps_left=55
|
| 168 |
+
Step 31: FLAG acc_0549 flagged=6/10 suspects=4 steps_left=55
|
| 169 |
+
Step 32: INSPECT acc_0658 flagged=6/10 suspects=4 steps_left=54
|
| 170 |
+
Step 33: FLAG acc_0658 flagged=7/10 suspects=3 steps_left=54
|
| 171 |
+
Step 34: INSPECT acc_0290 flagged=7/10 suspects=3 steps_left=53
|
| 172 |
+
Step 35: FLAG acc_0290 flagged=8/10 suspects=2 steps_left=53
|
| 173 |
+
Step 36: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=52
|
| 174 |
+
Step 37: FLAG acc_0124 flagged=9/10 suspects=1 steps_left=52
|
| 175 |
+
Step 38: INSPECT acc_0507 flagged=9/10 suspects=1 steps_left=51
|
| 176 |
+
Step 39: FLAG acc_0507 flagged=10/10 suspects=0 steps_left=51
|
| 177 |
+
Step 40: INSPECT acc_0086 flagged=10/10 suspects=0 steps_left=50
|
| 178 |
+
Step 41: INSPECT acc_0497 flagged=10/10 suspects=0 steps_left=49
|
| 179 |
+
Step 42: INSPECT acc_0610 flagged=10/10 suspects=0 steps_left=48
|
| 180 |
+
Step 43: INSPECT acc_0579 flagged=10/10 suspects=0 steps_left=47
|
| 181 |
+
Step 44: INSPECT acc_0573 flagged=10/10 suspects=0 steps_left=46
|
| 182 |
+
Step 45: INSPECT acc_0479 flagged=10/10 suspects=0 steps_left=45
|
| 183 |
+
Step 46: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=44
|
| 184 |
+
Step 47: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=43
|
| 185 |
+
Step 48: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=42
|
| 186 |
+
Step 49: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=41
|
| 187 |
+
Step 50: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=40
|
| 188 |
+
Step 51: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=39
|
| 189 |
+
Step 52: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=38
|
| 190 |
+
Step 53: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=37
|
| 191 |
+
Step 54: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=36
|
| 192 |
+
Step 55: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=35
|
| 193 |
+
Step 56: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=34
|
| 194 |
+
Step 57: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=33
|
| 195 |
+
Step 58: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=32
|
| 196 |
+
Step 59: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=31
|
| 197 |
+
Step 60: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=30
|
| 198 |
+
Step 61: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=29
|
| 199 |
+
Step 62: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=28
|
| 200 |
+
Step 63: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=27
|
| 201 |
+
Step 64: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=26
|
| 202 |
+
Step 65: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=25
|
| 203 |
+
Step 66: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=24
|
| 204 |
+
Step 67: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=23
|
| 205 |
+
Step 68: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=22
|
| 206 |
+
Step 69: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=21
|
| 207 |
+
Step 70: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=20
|
| 208 |
+
Step 71: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=19
|
| 209 |
+
Step 72: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=18
|
| 210 |
+
Step 73: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=17
|
| 211 |
+
Step 74: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=16
|
| 212 |
+
Step 75: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=15
|
| 213 |
+
Step 76: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=14
|
| 214 |
+
Step 77: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=13
|
| 215 |
+
Step 78: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=12
|
| 216 |
+
Step 79: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=11
|
| 217 |
+
Step 80: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=10
|
| 218 |
+
Step 81: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=9
|
| 219 |
+
Step 82: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=8
|
| 220 |
+
Step 83: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=7
|
| 221 |
+
Step 84: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=6
|
| 222 |
+
Step 85: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=5
|
| 223 |
+
Step 86: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=4
|
| 224 |
+
Step 87: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=3
|
| 225 |
+
Step 88: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=2
|
| 226 |
+
Step 89: SUBMIT flagged=10/10 suspects=0 steps_left=2
|
| 227 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=13.22
|
| 228 |
+
★ GRADER SCORE: 0.9025
|
| 229 |
+
|
| 230 |
+
Summary: easy=0.9233 medium=0.9040 hard=0.9025
|
| 231 |
+
|
| 232 |
+
============================================================
|
| 233 |
+
PHASE 3: Score Variance (seeds=[0, 1, 2])
|
| 234 |
+
============================================================
|
| 235 |
+
|
| 236 |
+
--- LLM Agent: task=easy, seed=0, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
|
| 237 |
+
Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
|
| 238 |
+
Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
|
| 239 |
+
Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
|
| 240 |
+
Step 4: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=28
|
| 241 |
+
Step 5: INSPECT acc_0001 flagged=2/10 suspects=8 steps_left=27
|
| 242 |
+
Step 6: FLAG acc_0001 flagged=3/10 suspects=7 steps_left=27
|
| 243 |
+
Step 7: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=26
|
| 244 |
+
Step 8: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=26
|
| 245 |
+
Step 9: INSPECT acc_0012 flagged=4/10 suspects=6 steps_left=25
|
| 246 |
+
Step 10: FLAG acc_0012 flagged=5/10 suspects=5 steps_left=25
|
| 247 |
+
Step 11: INSPECT acc_0000 flagged=5/10 suspects=5 steps_left=24
|
| 248 |
+
Step 12: FLAG acc_0000 flagged=6/10 suspects=4 steps_left=24
|
| 249 |
+
Step 13: INSPECT acc_0027 flagged=6/10 suspects=4 steps_left=23
|
| 250 |
+
Step 14: FLAG acc_0027 flagged=7/10 suspects=3 steps_left=23
|
| 251 |
+
Step 15: INSPECT acc_0047 flagged=7/10 suspects=3 steps_left=22
|
| 252 |
+
Step 16: FLAG acc_0047 flagged=8/10 suspects=2 steps_left=22
|
| 253 |
+
Step 17: INSPECT acc_0007 flagged=8/10 suspects=2 steps_left=21
|
| 254 |
+
Step 18: FLAG acc_0007 flagged=9/10 suspects=1 steps_left=21
|
| 255 |
+
Step 19: INSPECT acc_0028 flagged=9/10 suspects=1 steps_left=20
|
| 256 |
+
Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
|
| 257 |
+
Step 21: INSPECT acc_0035 flagged=10/10 suspects=0 steps_left=19
|
| 258 |
+
Step 22: INSPECT acc_0022 flagged=10/10 suspects=0 steps_left=18
|
| 259 |
+
Step 23: INSPECT acc_0041 flagged=10/10 suspects=0 steps_left=17
|
| 260 |
+
Step 24: INSPECT acc_0029 flagged=10/10 suspects=0 steps_left=16
|
| 261 |
+
Step 25: INSPECT acc_0018 flagged=10/10 suspects=0 steps_left=15
|
| 262 |
+
Step 26: INSPECT acc_0019 flagged=10/10 suspects=0 steps_left=14
|
| 263 |
+
Step 27: INSPECT acc_0041 flagged=10/10 suspects=0 steps_left=13
|
| 264 |
+
Step 28: INSPECT acc_0040 flagged=10/10 suspects=0 steps_left=12
|
| 265 |
+
Step 29: INSPECT acc_0026 flagged=10/10 suspects=0 steps_left=11
|
| 266 |
+
Step 30: INSPECT acc_0021 flagged=10/10 suspects=0 steps_left=10
|
| 267 |
+
Step 31: INSPECT acc_0009 flagged=10/10 suspects=0 steps_left=9
|
| 268 |
+
Step 32: INSPECT acc_0033 flagged=10/10 suspects=0 steps_left=8
|
| 269 |
+
Step 33: INSPECT acc_0044 flagged=10/10 suspects=0 steps_left=7
|
| 270 |
+
Step 34: SUBMIT flagged=10/10 suspects=0 steps_left=7
|
| 271 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.77
|
| 272 |
+
★ GRADER SCORE: 0.9233
|
| 273 |
+
|
| 274 |
+
--- LLM Agent: task=easy, seed=1, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
|
| 275 |
+
Step 1: INSPECT acc_0034 flagged=0/10 suspects=0 steps_left=29
|
| 276 |
+
Step 2: INSPECT acc_0003 flagged=0/10 suspects=0 steps_left=28
|
| 277 |
+
Step 3: INSPECT acc_0049 flagged=0/10 suspects=0 steps_left=27
|
| 278 |
+
Step 4: INSPECT acc_0006 flagged=0/10 suspects=0 steps_left=26
|
| 279 |
+
Step 5: INSPECT acc_0047 flagged=0/10 suspects=0 steps_left=25
|
| 280 |
+
Step 6: FLAG acc_0047 flagged=1/10 suspects=9 steps_left=25
|
| 281 |
+
Step 7: INSPECT acc_0009 flagged=1/10 suspects=9 steps_left=24
|
| 282 |
+
Step 8: FLAG acc_0009 flagged=2/10 suspects=8 steps_left=24
|
| 283 |
+
Step 9: INSPECT acc_0046 flagged=2/10 suspects=8 steps_left=23
|
| 284 |
+
Step 10: FLAG acc_0046 flagged=3/10 suspects=7 steps_left=23
|
| 285 |
+
Step 11: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=22
|
| 286 |
+
Step 12: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=22
|
| 287 |
+
Step 13: INSPECT acc_0021 flagged=4/10 suspects=6 steps_left=21
|
| 288 |
+
Step 14: FLAG acc_0021 flagged=5/10 suspects=5 steps_left=21
|
| 289 |
+
Step 15: INSPECT acc_0002 flagged=5/10 suspects=5 steps_left=20
|
| 290 |
+
Step 16: FLAG acc_0002 flagged=6/10 suspects=4 steps_left=20
|
| 291 |
+
Step 17: INSPECT acc_0048 flagged=6/10 suspects=4 steps_left=19
|
| 292 |
+
Step 18: FLAG acc_0048 flagged=7/10 suspects=3 steps_left=19
|
| 293 |
+
Step 19: INSPECT acc_0029 flagged=7/10 suspects=3 steps_left=18
|
| 294 |
+
Step 20: FLAG acc_0029 flagged=8/10 suspects=2 steps_left=18
|
| 295 |
+
Step 21: INSPECT acc_0015 flagged=8/10 suspects=2 steps_left=17
|
| 296 |
+
Step 22: FLAG acc_0015 flagged=9/10 suspects=1 steps_left=17
|
| 297 |
+
Step 23: INSPECT acc_0005 flagged=9/10 suspects=1 steps_left=16
|
| 298 |
+
Step 24: FLAG acc_0005 flagged=10/10 suspects=0 steps_left=16
|
| 299 |
+
Step 25: INSPECT acc_0036 flagged=10/10 suspects=0 steps_left=15
|
| 300 |
+
Step 26: INSPECT acc_0027 flagged=10/10 suspects=0 steps_left=14
|
| 301 |
+
Step 27: INSPECT acc_0043 flagged=10/10 suspects=0 steps_left=13
|
| 302 |
+
Step 28: INSPECT acc_0044 flagged=10/10 suspects=0 steps_left=12
|
| 303 |
+
Step 29: INSPECT acc_0038 flagged=10/10 suspects=0 steps_left=11
|
| 304 |
+
Step 30: INSPECT acc_0039 flagged=10/10 suspects=0 steps_left=10
|
| 305 |
+
Step 31: INSPECT acc_0028 flagged=10/10 suspects=0 steps_left=9
|
| 306 |
+
Step 32: INSPECT acc_0022 flagged=10/10 suspects=0 steps_left=8
|
| 307 |
+
Step 33: INSPECT acc_0025 flagged=10/10 suspects=0 steps_left=7
|
| 308 |
+
Step 34: INSPECT acc_0031 flagged=10/10 suspects=0 steps_left=6
|
| 309 |
+
Step 35: INSPECT acc_0007 flagged=10/10 suspects=0 steps_left=5
|
| 310 |
+
Step 36: INSPECT acc_0026 flagged=10/10 suspects=0 steps_left=4
|
| 311 |
+
Step 37: SUBMIT flagged=10/10 suspects=0 steps_left=4
|
| 312 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.74
|
| 313 |
+
★ GRADER SCORE: 0.9133
|
| 314 |
+
|
| 315 |
+
--- LLM Agent: task=easy, seed=2, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
|
| 316 |
+
Step 1: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=29
|
| 317 |
+
Step 2: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=28
|
| 318 |
+
Step 3: INSPECT acc_0025 flagged=0/10 suspects=0 steps_left=27
|
| 319 |
+
Step 4: INSPECT acc_0026 flagged=0/10 suspects=0 steps_left=26
|
| 320 |
+
Step 5: INSPECT acc_0038 flagged=0/10 suspects=0 steps_left=25
|
| 321 |
+
Step 6: INSPECT acc_0029 flagged=0/10 suspects=0 steps_left=24
|
| 322 |
+
Step 7: FLAG acc_0029 flagged=1/10 suspects=9 steps_left=24
|
| 323 |
+
Step 8: INSPECT acc_0006 flagged=1/10 suspects=9 steps_left=23
|
| 324 |
+
Step 9: FLAG acc_0006 flagged=2/10 suspects=8 steps_left=23
|
| 325 |
+
Step 10: INSPECT acc_0033 flagged=2/10 suspects=8 steps_left=22
|
| 326 |
+
Step 11: FLAG acc_0033 flagged=3/10 suspects=7 steps_left=22
|
| 327 |
+
Step 12: INSPECT acc_0015 flagged=3/10 suspects=7 steps_left=21
|
| 328 |
+
Step 13: FLAG acc_0015 flagged=4/10 suspects=6 steps_left=21
|
| 329 |
+
Step 14: INSPECT acc_0022 flagged=4/10 suspects=6 steps_left=20
|
| 330 |
+
Step 15: FLAG acc_0022 flagged=5/10 suspects=5 steps_left=20
|
| 331 |
+
Step 16: INSPECT acc_0009 flagged=5/10 suspects=5 steps_left=19
|
| 332 |
+
Step 17: FLAG acc_0009 flagged=6/10 suspects=4 steps_left=19
|
| 333 |
+
Step 18: INSPECT acc_0004 flagged=6/10 suspects=4 steps_left=18
|
| 334 |
+
Step 19: FLAG acc_0004 flagged=7/10 suspects=3 steps_left=18
|
| 335 |
+
Step 20: INSPECT acc_0024 flagged=7/10 suspects=3 steps_left=17
|
| 336 |
+
Step 21: FLAG acc_0024 flagged=8/10 suspects=2 steps_left=17
|
| 337 |
+
Step 22: INSPECT acc_0049 flagged=8/10 suspects=2 steps_left=16
|
| 338 |
+
Step 23: FLAG acc_0049 flagged=9/10 suspects=1 steps_left=16
|
| 339 |
+
Step 24: INSPECT acc_0035 flagged=9/10 suspects=1 steps_left=15
|
| 340 |
+
Step 25: FLAG acc_0035 flagged=10/10 suspects=0 steps_left=15
|
| 341 |
+
Step 26: INSPECT acc_0044 flagged=10/10 suspects=0 steps_left=14
|
| 342 |
+
Step 27: INSPECT acc_0016 flagged=10/10 suspects=0 steps_left=13
|
| 343 |
+
Step 28: INSPECT acc_0043 flagged=10/10 suspects=0 steps_left=12
|
| 344 |
+
Step 29: INSPECT acc_0003 flagged=10/10 suspects=0 steps_left=11
|
| 345 |
+
Step 30: INSPECT acc_0028 flagged=10/10 suspects=0 steps_left=10
|
| 346 |
+
Step 31: INSPECT acc_0027 flagged=10/10 suspects=0 steps_left=9
|
| 347 |
+
Step 32: INSPECT acc_0023 flagged=10/10 suspects=0 steps_left=8
|
| 348 |
+
Step 33: INSPECT acc_0041 flagged=10/10 suspects=0 steps_left=7
|
| 349 |
+
Step 34: INSPECT acc_0045 flagged=10/10 suspects=0 steps_left=6
|
| 350 |
+
Step 35: INSPECT acc_0039 flagged=10/10 suspects=0 steps_left=5
|
| 351 |
+
Step 36: INSPECT acc_0048 flagged=10/10 suspects=0 steps_left=4
|
| 352 |
+
Step 37: INSPECT acc_0046 flagged=10/10 suspects=0 steps_left=3
|
| 353 |
+
Step 38: SUBMIT flagged=10/10 suspects=0 steps_left=3
|
| 354 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.73
|
| 355 |
+
★ GRADER SCORE: 0.9100
|
| 356 |
+
|
| 357 |
+
easy: scores=['0.923', '0.913', '0.910'] mean=0.9155 var=0.000032
|
| 358 |
+
|
| 359 |
+
--- LLM Agent: task=medium, seed=0, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
|
| 360 |
+
Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
|
| 361 |
+
Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
|
| 362 |
+
Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
|
| 363 |
+
Step 4: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=46
|
| 364 |
+
Step 5: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=45
|
| 365 |
+
Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
|
| 366 |
+
Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
|
| 367 |
+
Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
|
| 368 |
+
Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
|
| 369 |
+
Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
|
| 370 |
+
Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=40
|
| 371 |
+
Step 12: FLAG acc_0181 flagged=2/10 suspects=8 steps_left=40
|
| 372 |
+
Step 13: INSPECT acc_0022 flagged=2/10 suspects=8 steps_left=39
|
| 373 |
+
Step 14: FLAG acc_0022 flagged=3/10 suspects=7 steps_left=39
|
| 374 |
+
Step 15: INSPECT acc_0092 flagged=3/10 suspects=7 steps_left=38
|
| 375 |
+
Step 16: FLAG acc_0092 flagged=4/10 suspects=6 steps_left=38
|
| 376 |
+
Step 17: INSPECT acc_0097 flagged=4/10 suspects=6 steps_left=37
|
| 377 |
+
Step 18: FLAG acc_0097 flagged=5/10 suspects=5 steps_left=37
|
| 378 |
+
Step 19: INSPECT acc_0187 flagged=5/10 suspects=5 steps_left=36
|
| 379 |
+
Step 20: FLAG acc_0187 flagged=6/10 suspects=4 steps_left=36
|
| 380 |
+
Step 21: INSPECT acc_0093 flagged=6/10 suspects=4 steps_left=35
|
| 381 |
+
Step 22: FLAG acc_0093 flagged=7/10 suspects=3 steps_left=35
|
| 382 |
+
Step 23: INSPECT acc_0172 flagged=7/10 suspects=3 steps_left=34
|
| 383 |
+
Step 24: FLAG acc_0172 flagged=8/10 suspects=2 steps_left=34
|
| 384 |
+
Step 25: INSPECT acc_0058 flagged=8/10 suspects=2 steps_left=33
|
| 385 |
+
Step 26: FLAG acc_0058 flagged=9/10 suspects=1 steps_left=33
|
| 386 |
+
Step 27: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=32
|
| 387 |
+
Step 28: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=32
|
| 388 |
+
Step 29: INSPECT acc_0148 flagged=10/10 suspects=0 steps_left=31
|
| 389 |
+
Step 30: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=30
|
| 390 |
+
Step 31: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=29
|
| 391 |
+
Step 32: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=28
|
| 392 |
+
Step 33: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=27
|
| 393 |
+
Step 34: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=26
|
| 394 |
+
Step 35: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=25
|
| 395 |
+
Step 36: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=24
|
| 396 |
+
Step 37: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=23
|
| 397 |
+
Step 38: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=22
|
| 398 |
+
Step 39: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=21
|
| 399 |
+
Step 40: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=20
|
| 400 |
+
Step 41: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=19
|
| 401 |
+
Step 42: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=18
|
| 402 |
+
Step 43: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=17
|
| 403 |
+
Step 44: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=16
|
| 404 |
+
Step 45: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=15
|
| 405 |
+
Step 46: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=14
|
| 406 |
+
Step 47: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=13
|
| 407 |
+
Step 48: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=12
|
| 408 |
+
Step 49: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=11
|
| 409 |
+
Step 50: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=10
|
| 410 |
+
Step 51: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=9
|
| 411 |
+
Step 52: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=8
|
| 412 |
+
Step 53: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=7
|
| 413 |
+
Step 54: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=6
|
| 414 |
+
Step 55: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=5
|
| 415 |
+
Step 56: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=4
|
| 416 |
+
Step 57: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=3
|
| 417 |
+
Step 58: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=2
|
| 418 |
+
Step 59: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=1
|
| 419 |
+
Step 60: SUBMIT flagged=10/10 suspects=0 steps_left=1
|
| 420 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.51
|
| 421 |
+
★ GRADER SCORE: 0.9020
|
| 422 |
+
|
| 423 |
+
--- LLM Agent: task=medium, seed=1, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
|
| 424 |
+
Step 1: INSPECT acc_0171 flagged=0/10 suspects=0 steps_left=49
|
| 425 |
+
Step 2: INSPECT acc_0099 flagged=0/10 suspects=0 steps_left=48
|
| 426 |
+
Step 3: INSPECT acc_0152 flagged=0/10 suspects=0 steps_left=47
|
| 427 |
+
Step 4: INSPECT acc_0092 flagged=0/10 suspects=0 steps_left=46
|
| 428 |
+
Step 5: INSPECT acc_0078 flagged=0/10 suspects=0 steps_left=45
|
| 429 |
+
Step 6: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=44
|
| 430 |
+
Step 7: INSPECT acc_0012 flagged=0/10 suspects=0 steps_left=43
|
| 431 |
+
Step 8: FLAG acc_0012 flagged=1/10 suspects=8 steps_left=43
|
| 432 |
+
Step 9: INSPECT acc_0033 flagged=1/10 suspects=8 steps_left=42
|
| 433 |
+
Step 10: FLAG acc_0033 flagged=2/10 suspects=8 steps_left=42
|
| 434 |
+
Step 11: INSPECT acc_0174 flagged=2/10 suspects=8 steps_left=41
|
| 435 |
+
Step 12: FLAG acc_0174 flagged=3/10 suspects=7 steps_left=41
|
| 436 |
+
Step 13: INSPECT acc_0187 flagged=3/10 suspects=7 steps_left=40
|
| 437 |
+
Step 14: FLAG acc_0187 flagged=4/10 suspects=6 steps_left=40
|
| 438 |
+
Step 15: INSPECT acc_0079 flagged=4/10 suspects=6 steps_left=39
|
| 439 |
+
Step 16: FLAG acc_0079 flagged=5/10 suspects=5 steps_left=39
|
| 440 |
+
Step 17: INSPECT acc_0032 flagged=5/10 suspects=5 steps_left=38
|
| 441 |
+
Step 18: FLAG acc_0032 flagged=6/10 suspects=4 steps_left=38
|
| 442 |
+
Step 19: INSPECT acc_0023 flagged=6/10 suspects=4 steps_left=37
|
| 443 |
+
Step 20: FLAG acc_0023 flagged=7/10 suspects=3 steps_left=37
|
| 444 |
+
Step 21: INSPECT acc_0146 flagged=7/10 suspects=3 steps_left=36
|
| 445 |
+
Step 22: FLAG acc_0146 flagged=8/10 suspects=2 steps_left=36
|
| 446 |
+
Step 23: INSPECT acc_0019 flagged=8/10 suspects=2 steps_left=35
|
| 447 |
+
Step 24: FLAG acc_0019 flagged=9/10 suspects=1 steps_left=35
|
| 448 |
+
Step 25: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=34
|
| 449 |
+
Step 26: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=34
|
| 450 |
+
Step 27: INSPECT acc_0168 flagged=10/10 suspects=0 steps_left=33
|
| 451 |
+
Step 28: INSPECT acc_0198 flagged=10/10 suspects=0 steps_left=32
|
| 452 |
+
Step 29: INSPECT acc_0186 flagged=10/10 suspects=0 steps_left=31
|
| 453 |
+
Step 30: INSPECT acc_0099 flagged=10/10 suspects=0 steps_left=30
|
| 454 |
+
Step 31: INSPECT acc_0084 flagged=10/10 suspects=0 steps_left=29
|
| 455 |
+
Step 32: INSPECT acc_0117 flagged=10/10 suspects=0 steps_left=28
|
| 456 |
+
Step 33: INSPECT acc_0192 flagged=10/10 suspects=0 steps_left=27
|
| 457 |
+
Step 34: INSPECT acc_0025 flagged=10/10 suspects=0 steps_left=26
|
| 458 |
+
Step 35: INSPECT acc_0176 flagged=10/10 suspects=0 steps_left=25
|
| 459 |
+
Step 36: INSPECT acc_0185 flagged=10/10 suspects=0 steps_left=24
|
| 460 |
+
Step 37: INSPECT acc_0027 flagged=10/10 suspects=0 steps_left=23
|
| 461 |
+
Step 38: INSPECT acc_0199 flagged=10/10 suspects=0 steps_left=22
|
| 462 |
+
Step 39: INSPECT acc_0135 flagged=10/10 suspects=0 steps_left=21
|
| 463 |
+
Step 40: INSPECT acc_0082 flagged=10/10 suspects=0 steps_left=20
|
| 464 |
+
Step 41: INSPECT acc_0002 flagged=10/10 suspects=0 steps_left=19
|
| 465 |
+
Step 42: INSPECT acc_0161 flagged=10/10 suspects=0 steps_left=18
|
| 466 |
+
Step 43: INSPECT acc_0067 flagged=10/10 suspects=0 steps_left=17
|
| 467 |
+
Step 44: INSPECT acc_0062 flagged=10/10 suspects=0 steps_left=16
|
| 468 |
+
Step 45: INSPECT acc_0034 flagged=10/10 suspects=0 steps_left=15
|
| 469 |
+
Step 46: INSPECT acc_0010 flagged=10/10 suspects=0 steps_left=14
|
| 470 |
+
Step 47: INSPECT acc_0173 flagged=10/10 suspects=0 steps_left=13
|
| 471 |
+
Step 48: INSPECT acc_0081 flagged=10/10 suspects=0 steps_left=12
|
| 472 |
+
Step 49: INSPECT acc_0132 flagged=10/10 suspects=0 steps_left=11
|
| 473 |
+
Step 50: INSPECT acc_0094 flagged=10/10 suspects=0 steps_left=10
|
| 474 |
+
Step 51: INSPECT acc_0089 flagged=10/10 suspects=0 steps_left=9
|
| 475 |
+
Step 52: INSPECT acc_0046 flagged=10/10 suspects=0 steps_left=8
|
| 476 |
+
Step 53: INSPECT acc_0116 flagged=10/10 suspects=0 steps_left=7
|
| 477 |
+
Step 54: INSPECT acc_0121 flagged=10/10 suspects=0 steps_left=6
|
| 478 |
+
Step 55: INSPECT acc_0156 flagged=10/10 suspects=0 steps_left=5
|
| 479 |
+
Step 56: INSPECT acc_0141 flagged=10/10 suspects=0 steps_left=4
|
| 480 |
+
Step 57: INSPECT acc_0188 flagged=10/10 suspects=0 steps_left=3
|
| 481 |
+
Step 58: SUBMIT flagged=10/10 suspects=0 steps_left=3
|
| 482 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.53
|
| 483 |
+
★ GRADER SCORE: 0.9060
|
| 484 |
+
|
| 485 |
+
--- LLM Agent: task=medium, seed=2, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
|
| 486 |
+
Step 1: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=49
|
| 487 |
+
Step 2: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=48
|
| 488 |
+
Step 3: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=47
|
| 489 |
+
Step 4: INSPECT acc_0030 flagged=0/10 suspects=0 steps_left=46
|
| 490 |
+
Step 5: INSPECT acc_0041 flagged=0/10 suspects=0 steps_left=45
|
| 491 |
+
Step 6: INSPECT acc_0054 flagged=0/10 suspects=0 steps_left=44
|
| 492 |
+
Step 7: INSPECT acc_0199 flagged=0/10 suspects=0 steps_left=43
|
| 493 |
+
Step 8: INSPECT acc_0181 flagged=0/10 suspects=0 steps_left=42
|
| 494 |
+
Step 9: INSPECT acc_0166 flagged=0/10 suspects=0 steps_left=41
|
| 495 |
+
Step 10: INSPECT acc_0098 flagged=0/10 suspects=0 steps_left=40
|
| 496 |
+
Step 11: INSPECT acc_0121 flagged=0/10 suspects=0 steps_left=39
|
| 497 |
+
Step 12: INSPECT acc_0053 flagged=0/10 suspects=0 steps_left=38
|
| 498 |
+
Step 13: INSPECT acc_0103 flagged=0/10 suspects=0 steps_left=37
|
| 499 |
+
Step 14: INSPECT acc_0168 flagged=0/10 suspects=0 steps_left=36
|
| 500 |
+
Step 15: INSPECT acc_0000 flagged=0/10 suspects=0 steps_left=35
|
| 501 |
+
Step 16: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=34
|
| 502 |
+
Step 17: INSPECT acc_0149 flagged=0/10 suspects=0 steps_left=33
|
| 503 |
+
Step 18: INSPECT acc_0064 flagged=0/10 suspects=0 steps_left=32
|
| 504 |
+
Step 19: INSPECT acc_0016 flagged=0/10 suspects=0 steps_left=31
|
| 505 |
+
Step 20: INSPECT acc_0105 flagged=0/10 suspects=0 steps_left=30
|
| 506 |
+
Step 21: INSPECT acc_0035 flagged=0/10 suspects=0 steps_left=29
|
| 507 |
+
Step 22: FLAG acc_0035 flagged=1/10 suspects=9 steps_left=29
|
| 508 |
+
Step 23: INSPECT acc_0020 flagged=1/10 suspects=9 steps_left=28
|
| 509 |
+
Step 24: FLAG acc_0020 flagged=2/10 suspects=8 steps_left=28
|
| 510 |
+
Step 25: INSPECT acc_0036 flagged=2/10 suspects=8 steps_left=27
|
| 511 |
+
Step 26: FLAG acc_0036 flagged=3/10 suspects=7 steps_left=27
|
| 512 |
+
Step 27: INSPECT acc_0050 flagged=3/10 suspects=7 steps_left=26
|
| 513 |
+
Step 28: FLAG acc_0050 flagged=4/10 suspects=6 steps_left=26
|
| 514 |
+
Step 29: INSPECT acc_0051 flagged=4/10 suspects=6 steps_left=25
|
| 515 |
+
Step 30: FLAG acc_0051 flagged=5/10 suspects=5 steps_left=25
|
| 516 |
+
Step 31: INSPECT acc_0085 flagged=5/10 suspects=5 steps_left=24
|
| 517 |
+
Step 32: FLAG acc_0085 flagged=6/10 suspects=4 steps_left=24
|
| 518 |
+
Step 33: INSPECT acc_0177 flagged=6/10 suspects=4 steps_left=23
|
| 519 |
+
Step 34: FLAG acc_0177 flagged=7/10 suspects=3 steps_left=23
|
| 520 |
+
Step 35: INSPECT acc_0170 flagged=7/10 suspects=3 steps_left=22
|
| 521 |
+
Step 36: FLAG acc_0170 flagged=8/10 suspects=2 steps_left=22
|
| 522 |
+
Step 37: INSPECT acc_0055 flagged=8/10 suspects=2 steps_left=21
|
| 523 |
+
Step 38: FLAG acc_0055 flagged=9/10 suspects=1 steps_left=21
|
| 524 |
+
Step 39: INSPECT acc_0094 flagged=9/10 suspects=1 steps_left=20
|
| 525 |
+
Step 40: FLAG acc_0094 flagged=10/10 suspects=0 steps_left=20
|
| 526 |
+
Step 41: INSPECT acc_0185 flagged=10/10 suspects=0 steps_left=19
|
| 527 |
+
Step 42: INSPECT acc_0175 flagged=10/10 suspects=0 steps_left=18
|
| 528 |
+
Step 43: INSPECT acc_0192 flagged=10/10 suspects=0 steps_left=17
|
| 529 |
+
Step 44: INSPECT acc_0013 flagged=10/10 suspects=0 steps_left=16
|
| 530 |
+
Step 45: INSPECT acc_0099 flagged=10/10 suspects=0 steps_left=15
|
| 531 |
+
Step 46: INSPECT acc_0075 flagged=10/10 suspects=0 steps_left=14
|
| 532 |
+
Step 47: INSPECT acc_0058 flagged=10/10 suspects=0 steps_left=13
|
| 533 |
+
Step 48: INSPECT acc_0072 flagged=10/10 suspects=0 steps_left=12
|
| 534 |
+
Step 49: INSPECT acc_0070 flagged=10/10 suspects=0 steps_left=11
|
| 535 |
+
Step 50: INSPECT acc_0071 flagged=10/10 suspects=0 steps_left=10
|
| 536 |
+
Step 51: INSPECT acc_0175 flagged=10/10 suspects=0 steps_left=9
|
| 537 |
+
Step 52: INSPECT acc_0181 flagged=10/10 suspects=0 steps_left=8
|
| 538 |
+
Step 53: INSPECT acc_0175 flagged=10/10 suspects=0 steps_left=7
|
| 539 |
+
Step 54: INSPECT acc_0181 flagged=10/10 suspects=0 steps_left=6
|
| 540 |
+
Step 55: INSPECT acc_0175 flagged=10/10 suspects=0 steps_left=5
|
| 541 |
+
Step 56: INSPECT acc_0181 flagged=10/10 suspects=0 steps_left=4
|
| 542 |
+
Step 57: INSPECT acc_0175 flagged=10/10 suspects=0 steps_left=3
|
| 543 |
+
Step 58: INSPECT acc_0181 flagged=10/10 suspects=0 steps_left=2
|
| 544 |
+
Step 59: INSPECT acc_0175 flagged=10/10 suspects=0 steps_left=1
|
| 545 |
+
Step 60: SUBMIT flagged=10/10 suspects=0 steps_left=1
|
| 546 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.51
|
| 547 |
+
★ GRADER SCORE: 0.9020
|
| 548 |
+
|
| 549 |
+
medium: scores=['0.902', '0.906', '0.902'] mean=0.9033 var=0.000004
|
| 550 |
+
|
| 551 |
+
--- LLM Agent: task=hard, seed=0, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
|
| 552 |
+
Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
|
| 553 |
+
Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
|
| 554 |
+
Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
|
| 555 |
+
Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
|
| 556 |
+
Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
|
| 557 |
+
Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
|
| 558 |
+
Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
|
| 559 |
+
Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
|
| 560 |
+
Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
|
| 561 |
+
Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
|
| 562 |
+
Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
|
| 563 |
+
Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
|
| 564 |
+
Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
|
| 565 |
+
Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
|
| 566 |
+
Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
|
| 567 |
+
Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
|
| 568 |
+
Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
|
| 569 |
+
Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
|
| 570 |
+
Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
|
| 571 |
+
Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
|
| 572 |
+
Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
|
| 573 |
+
Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
|
| 574 |
+
Step 23: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=59
|
| 575 |
+
Step 24: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=58
|
| 576 |
+
Step 25: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=58
|
| 577 |
+
Step 26: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=57
|
| 578 |
+
Step 27: FLAG acc_0389 flagged=4/10 suspects=6 steps_left=57
|
| 579 |
+
Step 28: INSPECT acc_0160 flagged=4/10 suspects=6 steps_left=56
|
| 580 |
+
Step 29: FLAG acc_0160 flagged=5/10 suspects=5 steps_left=56
|
| 581 |
+
Step 30: INSPECT acc_0549 flagged=5/10 suspects=5 steps_left=55
|
| 582 |
+
Step 31: FLAG acc_0549 flagged=6/10 suspects=4 steps_left=55
|
| 583 |
+
Step 32: INSPECT acc_0658 flagged=6/10 suspects=4 steps_left=54
|
| 584 |
+
Step 33: FLAG acc_0658 flagged=7/10 suspects=3 steps_left=54
|
| 585 |
+
Step 34: INSPECT acc_0290 flagged=7/10 suspects=3 steps_left=53
|
| 586 |
+
Step 35: FLAG acc_0290 flagged=8/10 suspects=2 steps_left=53
|
| 587 |
+
Step 36: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=52
|
| 588 |
+
Step 37: INSPECT acc_0507 flagged=8/10 suspects=2 steps_left=51
|
| 589 |
+
Step 38: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=50
|
| 590 |
+
Step 39: FLAG acc_0507 flagged=9/10 suspects=1 steps_left=50
|
| 591 |
+
Step 40: FLAG acc_0124 flagged=10/10 suspects=0 steps_left=50
|
| 592 |
+
Step 41: INSPECT acc_0086 flagged=10/10 suspects=0 steps_left=49
|
| 593 |
+
Step 42: INSPECT acc_0497 flagged=10/10 suspects=0 steps_left=48
|
| 594 |
+
Step 43: INSPECT acc_0610 flagged=10/10 suspects=0 steps_left=47
|
| 595 |
+
Step 44: INSPECT acc_0579 flagged=10/10 suspects=0 steps_left=46
|
| 596 |
+
Step 45: INSPECT acc_0573 flagged=10/10 suspects=0 steps_left=45
|
| 597 |
+
Step 46: INSPECT acc_0479 flagged=10/10 suspects=0 steps_left=44
|
| 598 |
+
Step 47: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=43
|
| 599 |
+
Step 48: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=42
|
| 600 |
+
Step 49: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=41
|
| 601 |
+
Step 50: INSPECT acc_0960 flagged=10/10 suspects=0 steps_left=40
|
| 602 |
+
Step 51: INSPECT acc_0455 flagged=10/10 suspects=0 steps_left=39
|
| 603 |
+
Step 52: INSPECT acc_0248 flagged=10/10 suspects=0 steps_left=38
|
| 604 |
+
Step 53: INSPECT acc_0964 flagged=10/10 suspects=0 steps_left=37
|
| 605 |
+
Step 54: INSPECT acc_0480 flagged=10/10 suspects=0 steps_left=36
|
| 606 |
+
Step 55: INSPECT acc_0154 flagged=10/10 suspects=0 steps_left=35
|
| 607 |
+
Step 56: INSPECT acc_0368 flagged=10/10 suspects=0 steps_left=34
|
| 608 |
+
Step 57: INSPECT acc_0426 flagged=10/10 suspects=0 steps_left=33
|
| 609 |
+
Step 58: INSPECT acc_0810 flagged=10/10 suspects=0 steps_left=32
|
| 610 |
+
Step 59: INSPECT acc_0040 flagged=10/10 suspects=0 steps_left=31
|
| 611 |
+
Step 60: INSPECT acc_0538 flagged=10/10 suspects=0 steps_left=30
|
| 612 |
+
Step 61: INSPECT acc_0940 flagged=10/10 suspects=0 steps_left=29
|
| 613 |
+
Step 62: INSPECT acc_0668 flagged=10/10 suspects=0 steps_left=28
|
| 614 |
+
Step 63: INSPECT acc_0721 flagged=10/10 suspects=0 steps_left=27
|
| 615 |
+
Step 64: INSPECT acc_0787 flagged=10/10 suspects=0 steps_left=26
|
| 616 |
+
Step 65: INSPECT acc_0639 flagged=10/10 suspects=0 steps_left=25
|
| 617 |
+
Step 66: INSPECT acc_0700 flagged=10/10 suspects=0 steps_left=24
|
| 618 |
+
Step 67: INSPECT acc_0353 flagged=10/10 suspects=0 steps_left=23
|
| 619 |
+
Step 68: INSPECT acc_0620 flagged=10/10 suspects=0 steps_left=22
|
| 620 |
+
Step 69: INSPECT acc_0499 flagged=10/10 suspects=0 steps_left=21
|
| 621 |
+
Step 70: INSPECT acc_0207 flagged=10/10 suspects=0 steps_left=20
|
| 622 |
+
Step 71: INSPECT acc_0011 flagged=10/10 suspects=0 steps_left=19
|
| 623 |
+
Step 72: INSPECT acc_0524 flagged=10/10 suspects=0 steps_left=18
|
| 624 |
+
Step 73: INSPECT acc_0553 flagged=10/10 suspects=0 steps_left=17
|
| 625 |
+
Step 74: INSPECT acc_0948 flagged=10/10 suspects=0 steps_left=16
|
| 626 |
+
Step 75: INSPECT acc_0333 flagged=10/10 suspects=0 steps_left=15
|
| 627 |
+
Step 76: INSPECT acc_0574 flagged=10/10 suspects=0 steps_left=14
|
| 628 |
+
Step 77: INSPECT acc_0258 flagged=10/10 suspects=0 steps_left=13
|
| 629 |
+
Step 78: INSPECT acc_0742 flagged=10/10 suspects=0 steps_left=12
|
| 630 |
+
Step 79: INSPECT acc_0354 flagged=10/10 suspects=0 steps_left=11
|
| 631 |
+
Step 80: INSPECT acc_0022 flagged=10/10 suspects=0 steps_left=10
|
| 632 |
+
Step 81: INSPECT acc_0232 flagged=10/10 suspects=0 steps_left=9
|
| 633 |
+
Step 82: INSPECT acc_0123 flagged=10/10 suspects=0 steps_left=8
|
| 634 |
+
Step 83: INSPECT acc_0844 flagged=10/10 suspects=0 steps_left=7
|
| 635 |
+
Step 84: INSPECT acc_0757 flagged=10/10 suspects=0 steps_left=6
|
| 636 |
+
Step 85: INSPECT acc_0653 flagged=10/10 suspects=0 steps_left=5
|
| 637 |
+
Step 86: INSPECT acc_0119 flagged=10/10 suspects=0 steps_left=4
|
| 638 |
+
Step 87: INSPECT acc_0514 flagged=10/10 suspects=0 steps_left=3
|
| 639 |
+
Step 88: SUBMIT flagged=10/10 suspects=0 steps_left=3
|
| 640 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=13.23
|
| 641 |
+
★ GRADER SCORE: 0.9038
|
| 642 |
+
|
| 643 |
+
--- LLM Agent: task=hard, seed=1, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
|
| 644 |
+
Step 1: INSPECT acc_0014 flagged=0/10 suspects=0 steps_left=79
|
| 645 |
+
Step 2: INSPECT acc_0835 flagged=0/10 suspects=0 steps_left=78
|
| 646 |
+
Step 3: INSPECT acc_0855 flagged=0/10 suspects=0 steps_left=77
|
| 647 |
+
Step 4: INSPECT acc_0930 flagged=0/10 suspects=0 steps_left=76
|
| 648 |
+
Step 5: INSPECT acc_0336 flagged=0/10 suspects=0 steps_left=75
|
| 649 |
+
Step 6: INSPECT acc_0929 flagged=0/10 suspects=0 steps_left=74
|
| 650 |
+
Step 7: INSPECT acc_0076 flagged=0/10 suspects=0 steps_left=73
|
| 651 |
+
Step 8: INSPECT acc_0543 flagged=0/10 suspects=0 steps_left=72
|
| 652 |
+
Step 9: INSPECT acc_0590 flagged=0/10 suspects=0 steps_left=71
|
| 653 |
+
Step 10: INSPECT acc_0401 flagged=0/10 suspects=0 steps_left=70
|
| 654 |
+
Step 11: INSPECT acc_0322 flagged=0/10 suspects=0 steps_left=69
|
| 655 |
+
Step 12: INSPECT acc_0154 flagged=0/10 suspects=0 steps_left=68
|
| 656 |
+
Step 13: INSPECT acc_0374 flagged=0/10 suspects=0 steps_left=67
|
| 657 |
+
Step 14: INSPECT acc_0549 flagged=0/10 suspects=0 steps_left=66
|
| 658 |
+
Step 15: INSPECT acc_0903 flagged=0/10 suspects=0 steps_left=65
|
| 659 |
+
Step 16: INSPECT acc_0976 flagged=0/10 suspects=0 steps_left=64
|
| 660 |
+
Step 17: INSPECT acc_0620 flagged=0/10 suspects=0 steps_left=63
|
| 661 |
+
Step 18: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=62
|
| 662 |
+
Step 19: INSPECT acc_0222 flagged=0/10 suspects=0 steps_left=61
|
| 663 |
+
Step 20: INSPECT acc_0536 flagged=0/10 suspects=0 steps_left=60
|
| 664 |
+
Step 21: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=59
|
| 665 |
+
Step 22: INSPECT acc_0577 flagged=0/10 suspects=0 steps_left=58
|
| 666 |
+
Step 23: INSPECT acc_0517 flagged=0/10 suspects=0 steps_left=57
|
| 667 |
+
Step 24: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=56
|
| 668 |
+
Step 25: INSPECT acc_0167 flagged=0/10 suspects=0 steps_left=55
|
| 669 |
+
Step 26: INSPECT acc_0697 flagged=0/10 suspects=0 steps_left=54
|
| 670 |
+
Step 27: INSPECT acc_0271 flagged=0/10 suspects=0 steps_left=53
|
| 671 |
+
Step 28: INSPECT acc_0681 flagged=0/10 suspects=0 steps_left=52
|
| 672 |
+
Step 29: INSPECT acc_0530 flagged=0/10 suspects=0 steps_left=51
|
| 673 |
+
Step 30: INSPECT acc_0353 flagged=0/10 suspects=0 steps_left=50
|
| 674 |
+
Step 31: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=49
|
| 675 |
+
Step 32: INSPECT acc_0777 flagged=0/10 suspects=0 steps_left=48
|
| 676 |
+
Step 33: INSPECT acc_0265 flagged=0/10 suspects=0 steps_left=47
|
| 677 |
+
Step 34: INSPECT acc_0788 flagged=0/10 suspects=0 steps_left=46
|
| 678 |
+
Step 35: INSPECT acc_0033 flagged=0/10 suspects=0 steps_left=45
|
| 679 |
+
Step 36: INSPECT acc_0187 flagged=0/10 suspects=0 steps_left=44
|
| 680 |
+
Step 37: INSPECT acc_0445 flagged=0/10 suspects=0 steps_left=43
|
| 681 |
+
Step 38: INSPECT acc_0846 flagged=0/10 suspects=0 steps_left=42
|
| 682 |
+
Step 39: INSPECT acc_0659 flagged=0/10 suspects=0 steps_left=41
|
| 683 |
+
Step 40: INSPECT acc_0677 flagged=0/10 suspects=0 steps_left=40
|
| 684 |
+
Step 41: INSPECT acc_0768 flagged=0/10 suspects=0 steps_left=39
|
| 685 |
+
Step 42: INSPECT acc_0539 flagged=0/10 suspects=0 steps_left=38
|
| 686 |
+
Step 43: INSPECT acc_0742 flagged=0/10 suspects=0 steps_left=37
|
| 687 |
+
Step 44: INSPECT acc_0503 flagged=0/10 suspects=0 steps_left=36
|
| 688 |
+
Step 45: INSPECT acc_0876 flagged=0/10 suspects=0 steps_left=35
|
| 689 |
+
Step 46: INSPECT acc_0639 flagged=0/10 suspects=0 steps_left=34
|
| 690 |
+
Step 47: INSPECT acc_0494 flagged=0/10 suspects=0 steps_left=33
|
| 691 |
+
Step 48: INSPECT acc_0898 flagged=0/10 suspects=0 steps_left=32
|
| 692 |
+
Step 49: INSPECT acc_0553 flagged=0/10 suspects=0 steps_left=31
|
| 693 |
+
Step 50: INSPECT acc_0588 flagged=0/10 suspects=0 steps_left=30
|
| 694 |
+
Step 51: INSPECT acc_0194 flagged=0/10 suspects=0 steps_left=29
|
| 695 |
+
Step 52: INSPECT acc_0810 flagged=0/10 suspects=0 steps_left=28
|
| 696 |
+
Step 53: INSPECT acc_0355 flagged=0/10 suspects=0 steps_left=27
|
| 697 |
+
Step 54: INSPECT acc_0363 flagged=0/10 suspects=0 steps_left=26
|
| 698 |
+
Step 55: INSPECT acc_0221 flagged=0/10 suspects=0 steps_left=25
|
| 699 |
+
Step 56: INSPECT acc_0580 flagged=0/10 suspects=0 steps_left=24
|
| 700 |
+
Step 57: INSPECT acc_0534 flagged=0/10 suspects=0 steps_left=23
|
| 701 |
+
Step 58: INSPECT acc_0778 flagged=0/10 suspects=0 steps_left=22
|
| 702 |
+
Step 59: INSPECT acc_0998 flagged=0/10 suspects=0 steps_left=21
|
| 703 |
+
Step 60: INSPECT acc_0233 flagged=0/10 suspects=0 steps_left=20
|
| 704 |
+
Step 61: INSPECT acc_0052 flagged=0/10 suspects=0 steps_left=19
|
| 705 |
+
Step 62: INSPECT acc_0813 flagged=0/10 suspects=0 steps_left=18
|
| 706 |
+
Step 63: INSPECT acc_0035 flagged=0/10 suspects=0 steps_left=17
|
| 707 |
+
Step 64: INSPECT acc_0667 flagged=0/10 suspects=0 steps_left=16
|
| 708 |
+
Step 65: INSPECT acc_0019 flagged=0/10 suspects=0 steps_left=15
|
| 709 |
+
Step 66: INSPECT acc_0959 flagged=0/10 suspects=0 steps_left=14
|
| 710 |
+
Step 67: INSPECT acc_0212 flagged=0/10 suspects=0 steps_left=13
|
| 711 |
+
Step 68: INSPECT acc_0776 flagged=0/10 suspects=0 steps_left=12
|
| 712 |
+
Step 69: INSPECT acc_0049 flagged=0/10 suspects=0 steps_left=11
|
| 713 |
+
Step 70: INSPECT acc_0434 flagged=0/10 suspects=0 steps_left=10
|
| 714 |
+
Step 71: INSPECT acc_0827 flagged=0/10 suspects=0 steps_left=9
|
| 715 |
+
Step 72: INSPECT acc_0583 flagged=0/10 suspects=0 steps_left=8
|
| 716 |
+
Step 73: INSPECT acc_0065 flagged=0/10 suspects=0 steps_left=7
|
| 717 |
+
Step 74: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=6
|
| 718 |
+
Step 75: INSPECT acc_0761 flagged=0/10 suspects=0 steps_left=5
|
| 719 |
+
Step 76: INSPECT acc_0995 flagged=0/10 suspects=0 steps_left=4
|
| 720 |
+
Step 77: INSPECT acc_0157 flagged=0/10 suspects=0 steps_left=3
|
| 721 |
+
Step 78: INSPECT acc_0936 flagged=0/10 suspects=0 steps_left=2
|
| 722 |
+
Step 79: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=1
|
| 723 |
+
Step 80: INSPECT acc_0691 flagged=0/10 suspects=0 steps_left=0
|
| 724 |
+
→ Episode ended: [LOSS] TP=0 FP=0 FN=10 Recall=0.00 Precision=0.00 Episode reward=-9.80
|
| 725 |
+
★ GRADER SCORE: 0.0000
|
| 726 |
+
|
| 727 |
+
--- LLM Agent: task=hard, seed=2, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
|
| 728 |
+
Step 1: INSPECT acc_0813 flagged=0/10 suspects=0 steps_left=79
|
| 729 |
+
Step 2: INSPECT acc_0430 flagged=0/10 suspects=0 steps_left=78
|
| 730 |
+
Step 3: INSPECT acc_0817 flagged=0/10 suspects=0 steps_left=77
|
| 731 |
+
Step 4: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=76
|
| 732 |
+
Step 5: INSPECT acc_0523 flagged=0/10 suspects=0 steps_left=75
|
| 733 |
+
Step 6: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=74
|
| 734 |
+
Step 7: INSPECT acc_0797 flagged=0/10 suspects=0 steps_left=73
|
| 735 |
+
Step 8: INSPECT acc_0478 flagged=0/10 suspects=0 steps_left=72
|
| 736 |
+
Step 9: INSPECT acc_0861 flagged=0/10 suspects=0 steps_left=71
|
| 737 |
+
Step 10: INSPECT acc_0836 flagged=0/10 suspects=0 steps_left=70
|
| 738 |
+
Step 11: INSPECT acc_0926 flagged=0/10 suspects=0 steps_left=69
|
| 739 |
+
Step 12: INSPECT acc_0664 flagged=0/10 suspects=0 steps_left=68
|
| 740 |
+
Step 13: INSPECT acc_0255 flagged=0/10 suspects=0 steps_left=67
|
| 741 |
+
Step 14: INSPECT acc_0938 flagged=0/10 suspects=0 steps_left=66
|
| 742 |
+
Step 15: INSPECT acc_0672 flagged=0/10 suspects=0 steps_left=65
|
| 743 |
+
Step 16: FLAG acc_0672 flagged=1/10 suspects=6 steps_left=65
|
| 744 |
+
Step 17: INSPECT acc_0659 flagged=1/10 suspects=6 steps_left=64
|
| 745 |
+
Step 18: FLAG acc_0659 flagged=2/10 suspects=5 steps_left=64
|
| 746 |
+
Step 19: INSPECT acc_0290 flagged=2/10 suspects=5 steps_left=63
|
| 747 |
+
Step 20: FLAG acc_0290 flagged=3/10 suspects=5 steps_left=63
|
| 748 |
+
Step 21: INSPECT acc_0339 flagged=3/10 suspects=5 steps_left=62
|
| 749 |
+
Step 22: FLAG acc_0339 flagged=4/10 suspects=6 steps_left=62
|
| 750 |
+
Step 23: INSPECT acc_0544 flagged=4/10 suspects=6 steps_left=61
|
| 751 |
+
Step 24: FLAG acc_0544 flagged=5/10 suspects=5 steps_left=61
|
| 752 |
+
Step 25: INSPECT acc_0696 flagged=5/10 suspects=5 steps_left=60
|
| 753 |
+
Step 26: FLAG acc_0696 flagged=6/10 suspects=4 steps_left=60
|
| 754 |
+
Step 27: INSPECT acc_0541 flagged=6/10 suspects=4 steps_left=59
|
| 755 |
+
Step 28: FLAG acc_0541 flagged=7/10 suspects=3 steps_left=59
|
| 756 |
+
Step 29: INSPECT acc_0793 flagged=7/10 suspects=3 steps_left=58
|
| 757 |
+
Step 30: FLAG acc_0793 flagged=8/10 suspects=2 steps_left=58
|
| 758 |
+
Step 31: INSPECT acc_0214 flagged=8/10 suspects=2 steps_left=57
|
| 759 |
+
Step 32: FLAG acc_0214 flagged=9/10 suspects=1 steps_left=57
|
| 760 |
+
Step 33: INSPECT acc_0112 flagged=9/10 suspects=1 steps_left=56
|
| 761 |
+
Step 34: FLAG acc_0112 flagged=10/10 suspects=0 steps_left=56
|
| 762 |
+
Step 35: INSPECT acc_0348 flagged=10/10 suspects=0 steps_left=55
|
| 763 |
+
Step 36: INSPECT acc_0721 flagged=10/10 suspects=0 steps_left=54
|
| 764 |
+
Step 37: INSPECT acc_0321 flagged=10/10 suspects=0 steps_left=53
|
| 765 |
+
Step 38: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=52
|
| 766 |
+
Step 39: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=51
|
| 767 |
+
Step 40: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=50
|
| 768 |
+
Step 41: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=49
|
| 769 |
+
Step 42: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=48
|
| 770 |
+
Step 43: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=47
|
| 771 |
+
Step 44: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=46
|
| 772 |
+
Step 45: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=45
|
| 773 |
+
Step 46: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=44
|
| 774 |
+
Step 47: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=43
|
| 775 |
+
Step 48: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=42
|
| 776 |
+
Step 49: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=41
|
| 777 |
+
Step 50: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=40
|
| 778 |
+
Step 51: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=39
|
| 779 |
+
Step 52: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=38
|
| 780 |
+
Step 53: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=37
|
| 781 |
+
Step 54: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=36
|
| 782 |
+
Step 55: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=35
|
| 783 |
+
Step 56: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=34
|
| 784 |
+
Step 57: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=33
|
| 785 |
+
Step 58: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=32
|
| 786 |
+
Step 59: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=31
|
| 787 |
+
Step 60: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=30
|
| 788 |
+
Step 61: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=29
|
| 789 |
+
Step 62: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=28
|
| 790 |
+
Step 63: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=27
|
| 791 |
+
Step 64: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=26
|
| 792 |
+
Step 65: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=25
|
| 793 |
+
Step 66: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=24
|
| 794 |
+
Step 67: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=23
|
| 795 |
+
Step 68: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=22
|
| 796 |
+
Step 69: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=21
|
| 797 |
+
Step 70: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=20
|
| 798 |
+
Step 71: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=19
|
| 799 |
+
Step 72: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=18
|
| 800 |
+
Step 73: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=17
|
| 801 |
+
Step 74: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=16
|
| 802 |
+
Step 75: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=15
|
| 803 |
+
Step 76: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=14
|
| 804 |
+
Step 77: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=13
|
| 805 |
+
Step 78: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=12
|
| 806 |
+
Step 79: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=11
|
| 807 |
+
Step 80: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=10
|
| 808 |
+
Step 81: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=9
|
| 809 |
+
Step 82: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=8
|
| 810 |
+
Step 83: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=7
|
| 811 |
+
Step 84: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=6
|
| 812 |
+
Step 85: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=5
|
| 813 |
+
Step 86: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=4
|
| 814 |
+
Step 87: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=3
|
| 815 |
+
Step 88: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=2
|
| 816 |
+
Step 89: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=1
|
| 817 |
+
Step 90: SUBMIT flagged=10/10 suspects=0 steps_left=1
|
| 818 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=13.21
|
| 819 |
+
★ GRADER SCORE: 0.9012
|
| 820 |
+
|
| 821 |
+
hard: scores=['0.904', '0.000', '0.901'] mean=0.6017 var=0.181003
|
| 822 |
+
|
| 823 |
+
============================================================
|
| 824 |
+
EVALUATION COMPLETE
|
| 825 |
+
============================================================
|
| 826 |
+
ubuntu@ip-172-31-33-59:~/meta/meta-hack-26/eval-models$
|
model-benchmark-logs/mistral_judge_log.txt
ADDED
|
@@ -0,0 +1,410 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$ python3 mistral_test_judge_eval.py --url https://pandago-graphstrike.hf.space --bedrock
|
| 2 |
+
GraphStrike Judge Evaluation Simulator
|
| 3 |
+
Target: https://pandago-graphstrike.hf.space
|
| 4 |
+
Backend: bedrock
|
| 5 |
+
Model: Bedrock/mistral.ministral-3-8b-instruct
|
| 6 |
+
Token: set
|
| 7 |
+
|
| 8 |
+
============================================================
|
| 9 |
+
PHASE 0: Endpoint Verification
|
| 10 |
+
============================================================
|
| 11 |
+
✓ GET /health
|
| 12 |
+
✓ GET /tasks
|
| 13 |
+
✓ GET /metadata
|
| 14 |
+
✓ GET /schema
|
| 15 |
+
✓ GET /web
|
| 16 |
+
✓ POST /reset
|
| 17 |
+
✓ GET /state
|
| 18 |
+
✓ POST /step
|
| 19 |
+
✓ POST /step
|
| 20 |
+
✓ GET /grader
|
| 21 |
+
✓ POST /mcp
|
| 22 |
+
✓ POST /baseline
|
| 23 |
+
|
| 24 |
+
============================================================
|
| 25 |
+
PHASE 1: Baseline Stability (3 runs)
|
| 26 |
+
============================================================
|
| 27 |
+
Run 1: easy=0.9100 medium=0.9060 hard=0.9038
|
| 28 |
+
Run 2: easy=0.9100 medium=0.9060 hard=0.9038
|
| 29 |
+
Run 3: easy=0.9100 medium=0.9060 hard=0.9038
|
| 30 |
+
✓ All 3 runs identical — baseline is deterministic
|
| 31 |
+
|
| 32 |
+
============================================================
|
| 33 |
+
PHASE 2: LLM Agent Evaluation (model=Bedrock/mistral.ministral-3-8b-instruct)
|
| 34 |
+
============================================================
|
| 35 |
+
|
| 36 |
+
--- LLM Agent: task=easy, seed=0, model=Bedrock/mistral.ministral-3-8b-instruct ---
|
| 37 |
+
Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
|
| 38 |
+
Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
|
| 39 |
+
Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
|
| 40 |
+
Step 4: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=28
|
| 41 |
+
Step 5: INSPECT acc_0001 flagged=2/10 suspects=8 steps_left=27
|
| 42 |
+
Step 6: FLAG acc_0001 flagged=3/10 suspects=7 steps_left=27
|
| 43 |
+
Step 7: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=26
|
| 44 |
+
Step 8: INSPECT acc_0012 flagged=3/10 suspects=7 steps_left=25
|
| 45 |
+
Step 9: INSPECT acc_0000 flagged=3/10 suspects=7 steps_left=24
|
| 46 |
+
Step 10: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=24
|
| 47 |
+
Step 11: FLAG acc_0000 flagged=5/10 suspects=5 steps_left=24
|
| 48 |
+
Step 12: INSPECT acc_0027 flagged=5/10 suspects=5 steps_left=23
|
| 49 |
+
Step 13: FLAG acc_0012 flagged=6/10 suspects=4 steps_left=23
|
| 50 |
+
Step 14: FLAG acc_0027 flagged=7/10 suspects=3 steps_left=23
|
| 51 |
+
Step 15: INSPECT acc_0047 flagged=7/10 suspects=3 steps_left=22
|
| 52 |
+
Step 16: FLAG acc_0047 flagged=8/10 suspects=2 steps_left=22
|
| 53 |
+
Step 17: INSPECT acc_0007 flagged=8/10 suspects=2 steps_left=21
|
| 54 |
+
Step 18: FLAG acc_0007 flagged=9/10 suspects=1 steps_left=21
|
| 55 |
+
Step 19: INSPECT acc_0028 flagged=9/10 suspects=1 steps_left=20
|
| 56 |
+
Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
|
| 57 |
+
Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=20
|
| 58 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.90
|
| 59 |
+
★ GRADER SCORE: 0.9667
|
| 60 |
+
|
| 61 |
+
--- LLM Agent: task=medium, seed=0, model=Bedrock/mistral.ministral-3-8b-instruct ---
|
| 62 |
+
Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
|
| 63 |
+
Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
|
| 64 |
+
Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
|
| 65 |
+
Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=46
|
| 66 |
+
Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=45
|
| 67 |
+
Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
|
| 68 |
+
Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
|
| 69 |
+
Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
|
| 70 |
+
Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
|
| 71 |
+
Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
|
| 72 |
+
Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=40
|
| 73 |
+
Step 12: FLAG acc_0181 flagged=2/10 suspects=8 steps_left=40
|
| 74 |
+
Step 13: INSPECT acc_0022 flagged=2/10 suspects=8 steps_left=39
|
| 75 |
+
Step 14: FLAG acc_0022 flagged=3/10 suspects=7 steps_left=39
|
| 76 |
+
Step 15: INSPECT acc_0092 flagged=3/10 suspects=7 steps_left=38
|
| 77 |
+
Step 16: FLAG acc_0092 flagged=4/10 suspects=6 steps_left=38
|
| 78 |
+
Step 17: INSPECT acc_0097 flagged=4/10 suspects=6 steps_left=37
|
| 79 |
+
Step 18: FLAG acc_0097 flagged=5/10 suspects=5 steps_left=37
|
| 80 |
+
Step 19: INSPECT acc_0187 flagged=5/10 suspects=5 steps_left=36
|
| 81 |
+
Step 20: FLAG acc_0187 flagged=6/10 suspects=4 steps_left=36
|
| 82 |
+
Step 21: INSPECT acc_0093 flagged=6/10 suspects=4 steps_left=35
|
| 83 |
+
Step 22: FLAG acc_0093 flagged=7/10 suspects=3 steps_left=35
|
| 84 |
+
Step 23: INSPECT acc_0172 flagged=7/10 suspects=3 steps_left=34
|
| 85 |
+
Step 24: FLAG acc_0172 flagged=8/10 suspects=2 steps_left=34
|
| 86 |
+
Step 25: INSPECT acc_0058 flagged=8/10 suspects=2 steps_left=33
|
| 87 |
+
Step 26: FLAG acc_0058 flagged=9/10 suspects=1 steps_left=33
|
| 88 |
+
Step 27: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=32
|
| 89 |
+
Step 28: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=32
|
| 90 |
+
Step 29: SUBMIT flagged=10/10 suspects=0 steps_left=32
|
| 91 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.82
|
| 92 |
+
★ GRADER SCORE: 0.9640
|
| 93 |
+
|
| 94 |
+
--- LLM Agent: task=hard, seed=0, model=Bedrock/mistral.ministral-3-8b-instruct ---
|
| 95 |
+
Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
|
| 96 |
+
Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
|
| 97 |
+
Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
|
| 98 |
+
Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
|
| 99 |
+
Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
|
| 100 |
+
Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
|
| 101 |
+
Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
|
| 102 |
+
Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
|
| 103 |
+
Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
|
| 104 |
+
Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
|
| 105 |
+
Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
|
| 106 |
+
Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
|
| 107 |
+
Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
|
| 108 |
+
Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
|
| 109 |
+
Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
|
| 110 |
+
Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
|
| 111 |
+
Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
|
| 112 |
+
Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
|
| 113 |
+
Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
|
| 114 |
+
Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
|
| 115 |
+
Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
|
| 116 |
+
Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
|
| 117 |
+
Step 23: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=59
|
| 118 |
+
Step 24: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=58
|
| 119 |
+
Step 25: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=58
|
| 120 |
+
Step 26: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=57
|
| 121 |
+
Step 27: INSPECT acc_0160 flagged=3/10 suspects=6 steps_left=56
|
| 122 |
+
Step 28: FLAG acc_0160 flagged=4/10 suspects=6 steps_left=56
|
| 123 |
+
Step 29: INSPECT acc_0549 flagged=4/10 suspects=6 steps_left=55
|
| 124 |
+
Step 30: FLAG acc_0549 flagged=5/10 suspects=5 steps_left=55
|
| 125 |
+
Step 31: INSPECT acc_0658 flagged=5/10 suspects=5 steps_left=54
|
| 126 |
+
Step 32: FLAG acc_0658 flagged=6/10 suspects=4 steps_left=54
|
| 127 |
+
Step 33: INSPECT acc_0290 flagged=6/10 suspects=4 steps_left=53
|
| 128 |
+
Step 34: FLAG acc_0389 flagged=7/10 suspects=3 steps_left=53
|
| 129 |
+
Step 35: FLAG acc_0290 flagged=8/10 suspects=2 steps_left=53
|
| 130 |
+
Step 36: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=52
|
| 131 |
+
Step 37: FLAG acc_0124 flagged=9/10 suspects=1 steps_left=52
|
| 132 |
+
Step 38: INSPECT acc_0507 flagged=9/10 suspects=1 steps_left=51
|
| 133 |
+
Step 39: FLAG acc_0507 flagged=10/10 suspects=0 steps_left=51
|
| 134 |
+
Step 40: SUBMIT flagged=10/10 suspects=0 steps_left=51
|
| 135 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.71
|
| 136 |
+
★ GRADER SCORE: 0.9637
|
| 137 |
+
|
| 138 |
+
Summary: easy=0.9667 medium=0.9640 hard=0.9637
|
| 139 |
+
|
| 140 |
+
============================================================
|
| 141 |
+
PHASE 3: Score Variance (seeds=[0, 1, 2])
|
| 142 |
+
============================================================
|
| 143 |
+
|
| 144 |
+
--- LLM Agent: task=easy, seed=0, model=Bedrock/mistral.ministral-3-8b-instruct ---
|
| 145 |
+
Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
|
| 146 |
+
Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
|
| 147 |
+
Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
|
| 148 |
+
Step 4: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=28
|
| 149 |
+
Step 5: INSPECT acc_0001 flagged=2/10 suspects=8 steps_left=27
|
| 150 |
+
Step 6: FLAG acc_0001 flagged=3/10 suspects=7 steps_left=27
|
| 151 |
+
Step 7: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=26
|
| 152 |
+
|
| 153 |
+
Step 8: INSPECT acc_0012 flagged=3/10 suspects=7 steps_left=25
|
| 154 |
+
Step 9: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=25
|
| 155 |
+
Step 10: INSPECT acc_0000 flagged=4/10 suspects=6 steps_left=24
|
| 156 |
+
Step 11: FLAG acc_0012 flagged=5/10 suspects=5 steps_left=24
|
| 157 |
+
Step 12: INSPECT acc_0027 flagged=5/10 suspects=5 steps_left=23
|
| 158 |
+
Step 13: FLAG acc_0000 flagged=6/10 suspects=4 steps_left=23
|
| 159 |
+
Step 14: INSPECT acc_0047 flagged=6/10 suspects=4 steps_left=22
|
| 160 |
+
Step 15: FLAG acc_0027 flagged=7/10 suspects=3 steps_left=22
|
| 161 |
+
Step 16: FLAG acc_0047 flagged=8/10 suspects=2 steps_left=22
|
| 162 |
+
Step 17: INSPECT acc_0007 flagged=8/10 suspects=2 steps_left=21
|
| 163 |
+
Step 18: FLAG acc_0007 flagged=9/10 suspects=1 steps_left=21
|
| 164 |
+
Step 19: INSPECT acc_0028 flagged=9/10 suspects=1 steps_left=20
|
| 165 |
+
Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
|
| 166 |
+
Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=20
|
| 167 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.90
|
| 168 |
+
★ GRADER SCORE: 0.9667
|
| 169 |
+
|
| 170 |
+
--- LLM Agent: task=easy, seed=1, model=Bedrock/mistral.ministral-3-8b-instruct ---
|
| 171 |
+
Step 1: INSPECT acc_0034 flagged=0/10 suspects=0 steps_left=29
|
| 172 |
+
Step 2: INSPECT acc_0003 flagged=0/10 suspects=0 steps_left=28
|
| 173 |
+
Step 3: INSPECT acc_0049 flagged=0/10 suspects=0 steps_left=27
|
| 174 |
+
Step 4: INSPECT acc_0006 flagged=0/10 suspects=0 steps_left=26
|
| 175 |
+
Step 5: INSPECT acc_0047 flagged=0/10 suspects=0 steps_left=25
|
| 176 |
+
Step 6: FLAG acc_0047 flagged=1/10 suspects=9 steps_left=25
|
| 177 |
+
Step 7: INSPECT acc_0009 flagged=1/10 suspects=9 steps_left=24
|
| 178 |
+
Step 8: FLAG acc_0009 flagged=2/10 suspects=8 steps_left=24
|
| 179 |
+
Step 9: INSPECT acc_0046 flagged=2/10 suspects=8 steps_left=23
|
| 180 |
+
Step 10: FLAG acc_0046 flagged=3/10 suspects=7 steps_left=23
|
| 181 |
+
Step 11: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=22
|
| 182 |
+
Step 12: INSPECT acc_0021 flagged=3/10 suspects=7 steps_left=21
|
| 183 |
+
Step 13: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=21
|
| 184 |
+
Step 14: INSPECT acc_0002 flagged=4/10 suspects=6 steps_left=20
|
| 185 |
+
Step 15: FLAG acc_0021 flagged=5/10 suspects=5 steps_left=20
|
| 186 |
+
Step 16: INSPECT acc_0048 flagged=5/10 suspects=5 steps_left=19
|
| 187 |
+
Step 17: INSPECT acc_0029 flagged=5/10 suspects=5 steps_left=18
|
| 188 |
+
Step 18: FLAG acc_0029 flagged=6/10 suspects=4 steps_left=18
|
| 189 |
+
Step 19: FLAG acc_0048 flagged=7/10 suspects=3 steps_left=18
|
| 190 |
+
Step 20: FLAG acc_0002 flagged=8/10 suspects=2 steps_left=18
|
| 191 |
+
Step 21: INSPECT acc_0015 flagged=8/10 suspects=2 steps_left=17
|
| 192 |
+
Step 22: FLAG acc_0015 flagged=9/10 suspects=1 steps_left=17
|
| 193 |
+
Step 23: INSPECT acc_0005 flagged=9/10 suspects=1 steps_left=16
|
| 194 |
+
Step 24: FLAG acc_0005 flagged=10/10 suspects=0 steps_left=16
|
| 195 |
+
Step 25: SUBMIT flagged=10/10 suspects=0 steps_left=16
|
| 196 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.86
|
| 197 |
+
★ GRADER SCORE: 0.9533
|
| 198 |
+
|
| 199 |
+
--- LLM Agent: task=easy, seed=2, model=Bedrock/mistral.ministral-3-8b-instruct ---
|
| 200 |
+
Step 1: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=29
|
| 201 |
+
Step 2: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=28
|
| 202 |
+
Step 3: INSPECT acc_0025 flagged=0/10 suspects=0 steps_left=27
|
| 203 |
+
Step 4: INSPECT acc_0026 flagged=0/10 suspects=0 steps_left=26
|
| 204 |
+
Step 5: INSPECT acc_0038 flagged=0/10 suspects=0 steps_left=25
|
| 205 |
+
Step 6: FLAG acc_0029 flagged=1/10 suspects=9 steps_left=25
|
| 206 |
+
Step 7: INSPECT acc_0006 flagged=1/10 suspects=9 steps_left=24
|
| 207 |
+
Step 8: FLAG acc_0006 flagged=2/10 suspects=8 steps_left=24
|
| 208 |
+
Step 9: INSPECT acc_0033 flagged=2/10 suspects=8 steps_left=23
|
| 209 |
+
Step 10: FLAG acc_0033 flagged=3/10 suspects=7 steps_left=23
|
| 210 |
+
Step 11: INSPECT acc_0015 flagged=3/10 suspects=7 steps_left=22
|
| 211 |
+
Step 12: FLAG acc_0015 flagged=4/10 suspects=6 steps_left=22
|
| 212 |
+
Step 13: INSPECT acc_0022 flagged=4/10 suspects=6 steps_left=21
|
| 213 |
+
Step 14: INSPECT acc_0009 flagged=4/10 suspects=6 steps_left=20
|
| 214 |
+
Step 15: FLAG acc_0022 flagged=5/10 suspects=5 steps_left=20
|
| 215 |
+
Step 16: INSPECT acc_0004 flagged=5/10 suspects=5 steps_left=19
|
| 216 |
+
Step 17: FLAG acc_0009 flagged=6/10 suspects=4 steps_left=19
|
| 217 |
+
Step 18: FLAG acc_0004 flagged=7/10 suspects=3 steps_left=19
|
| 218 |
+
Step 19: INSPECT acc_0024 flagged=7/10 suspects=3 steps_left=18
|
| 219 |
+
Step 20: FLAG acc_0024 flagged=8/10 suspects=2 steps_left=18
|
| 220 |
+
Step 21: INSPECT acc_0049 flagged=8/10 suspects=2 steps_left=17
|
| 221 |
+
Step 22: FLAG acc_0049 flagged=9/10 suspects=1 steps_left=17
|
| 222 |
+
Step 23: INSPECT acc_0035 flagged=9/10 suspects=1 steps_left=16
|
| 223 |
+
Step 24: FLAG acc_0035 flagged=10/10 suspects=0 steps_left=16
|
| 224 |
+
Step 25: SUBMIT flagged=10/10 suspects=0 steps_left=16
|
| 225 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.86
|
| 226 |
+
★ GRADER SCORE: 0.9533
|
| 227 |
+
|
| 228 |
+
easy: scores=['0.967', '0.953', '0.953'] mean=0.9578 var=0.000040
|
| 229 |
+
|
| 230 |
+
--- LLM Agent: task=medium, seed=0, model=Bedrock/mistral.ministral-3-8b-instruct ---
|
| 231 |
+
Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
|
| 232 |
+
Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
|
| 233 |
+
Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
|
| 234 |
+
Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=46
|
| 235 |
+
Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=45
|
| 236 |
+
Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
|
| 237 |
+
Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
|
| 238 |
+
Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
|
| 239 |
+
Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
|
| 240 |
+
Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
|
| 241 |
+
Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=40
|
| 242 |
+
Step 12: FLAG acc_0181 flagged=2/10 suspects=8 steps_left=40
|
| 243 |
+
Step 13: INSPECT acc_0022 flagged=2/10 suspects=8 steps_left=39
|
| 244 |
+
Step 14: FLAG acc_0022 flagged=3/10 suspects=7 steps_left=39
|
| 245 |
+
Step 15: INSPECT acc_0092 flagged=3/10 suspects=7 steps_left=38
|
| 246 |
+
Step 16: FLAG acc_0092 flagged=4/10 suspects=6 steps_left=38
|
| 247 |
+
Step 17: INSPECT acc_0097 flagged=4/10 suspects=6 steps_left=37
|
| 248 |
+
Step 18: FLAG acc_0097 flagged=5/10 suspects=5 steps_left=37
|
| 249 |
+
Step 19: INSPECT acc_0187 flagged=5/10 suspects=5 steps_left=36
|
| 250 |
+
Step 20: FLAG acc_0187 flagged=6/10 suspects=4 steps_left=36
|
| 251 |
+
Step 21: INSPECT acc_0093 flagged=6/10 suspects=4 steps_left=35
|
| 252 |
+
Step 22: FLAG acc_0093 flagged=7/10 suspects=3 steps_left=35
|
| 253 |
+
Step 23: INSPECT acc_0172 flagged=7/10 suspects=3 steps_left=34
|
| 254 |
+
Step 24: FLAG acc_0172 flagged=8/10 suspects=2 steps_left=34
|
| 255 |
+
Step 25: INSPECT acc_0058 flagged=8/10 suspects=2 steps_left=33
|
| 256 |
+
Step 26: FLAG acc_0058 flagged=9/10 suspects=1 steps_left=33
|
| 257 |
+
Step 27: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=32
|
| 258 |
+
Step 28: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=32
|
| 259 |
+
Step 29: SUBMIT flagged=10/10 suspects=0 steps_left=32
|
| 260 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.82
|
| 261 |
+
★ GRADER SCORE: 0.9640
|
| 262 |
+
|
| 263 |
+
--- LLM Agent: task=medium, seed=1, model=Bedrock/mistral.ministral-3-8b-instruct ---
|
| 264 |
+
Step 1: INSPECT acc_0171 flagged=0/10 suspects=0 steps_left=49
|
| 265 |
+
Step 2: INSPECT acc_0099 flagged=0/10 suspects=0 steps_left=48
|
| 266 |
+
Step 3: INSPECT acc_0152 flagged=0/10 suspects=0 steps_left=47
|
| 267 |
+
Step 4: INSPECT acc_0092 flagged=0/10 suspects=0 steps_left=46
|
| 268 |
+
Step 5: INSPECT acc_0078 flagged=0/10 suspects=0 steps_left=45
|
| 269 |
+
Step 6: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=44
|
| 270 |
+
Step 7: INSPECT acc_0012 flagged=0/10 suspects=0 steps_left=43
|
| 271 |
+
Step 8: FLAG acc_0012 flagged=1/10 suspects=8 steps_left=43
|
| 272 |
+
Step 9: INSPECT acc_0033 flagged=1/10 suspects=8 steps_left=42
|
| 273 |
+
Step 10: FLAG acc_0033 flagged=2/10 suspects=8 steps_left=42
|
| 274 |
+
Step 11: INSPECT acc_0174 flagged=2/10 suspects=8 steps_left=41
|
| 275 |
+
Step 12: FLAG acc_0174 flagged=3/10 suspects=7 steps_left=41
|
| 276 |
+
Step 13: INSPECT acc_0187 flagged=3/10 suspects=7 steps_left=40
|
| 277 |
+
Step 14: FLAG acc_0187 flagged=4/10 suspects=6 steps_left=40
|
| 278 |
+
Step 15: INSPECT acc_0079 flagged=4/10 suspects=6 steps_left=39
|
| 279 |
+
Step 16: FLAG acc_0032 flagged=5/10 suspects=5 steps_left=39
|
| 280 |
+
Step 17: INSPECT acc_0023 flagged=5/10 suspects=5 steps_left=38
|
| 281 |
+
Step 18: FLAG acc_0023 flagged=6/10 suspects=4 steps_left=38
|
| 282 |
+
Step 19: INSPECT acc_0146 flagged=6/10 suspects=4 steps_left=37
|
| 283 |
+
Step 20: FLAG acc_0079 flagged=7/10 suspects=3 steps_left=37
|
| 284 |
+
Step 21: INSPECT acc_0019 flagged=7/10 suspects=3 steps_left=36
|
| 285 |
+
Step 22: FLAG acc_0146 flagged=8/10 suspects=2 steps_left=36
|
| 286 |
+
Step 23: FLAG acc_0019 flagged=9/10 suspects=1 steps_left=36
|
| 287 |
+
Step 24: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=35
|
| 288 |
+
Step 25: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=35
|
| 289 |
+
Step 26: SUBMIT flagged=10/10 suspects=0 steps_left=35
|
| 290 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.85
|
| 291 |
+
★ GRADER SCORE: 0.9700
|
| 292 |
+
|
| 293 |
+
--- LLM Agent: task=medium, seed=2, model=Bedrock/mistral.ministral-3-8b-instruct ---
|
| 294 |
+
Step 1: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=49
|
| 295 |
+
Step 2: INSPECT acc_0107 flagged=4/10 suspects=6 steps_left=20
|
| 296 |
+
Step 3: FLAG acc_0014 flagged=10/10 suspects=0 steps_left=20
|
| 297 |
+
Step 4: SUBMIT flagged=0/10 suspects=0 steps_left=42
|
| 298 |
+
→ Episode ended: [LOSS] TP=0 FP=0 FN=10 Recall=0.00 Precision=0.00 Episode reward=-2.08
|
| 299 |
+
★ GRADER SCORE: 0.0000
|
| 300 |
+
|
| 301 |
+
medium: scores=['0.964', '0.970', '0.000'] mean=0.6447 var=0.207804
|
| 302 |
+
|
| 303 |
+
--- LLM Agent: task=hard, seed=0, model=Bedrock/mistral.ministral-3-8b-instruct ---
|
| 304 |
+
Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
|
| 305 |
+
Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=31
|
| 306 |
+
Step 3: INSPECT acc_0105 flagged=0/10 suspects=0 steps_left=67
|
| 307 |
+
Step 4: INSPECT acc_0374 flagged=0/10 suspects=0 steps_left=57
|
| 308 |
+
Step 5: INSPECT acc_0577 flagged=10/10 suspects=0 steps_left=0
|
| 309 |
+
★ GRADER SCORE: 0.9000
|
| 310 |
+
|
| 311 |
+
--- LLM Agent: task=hard, seed=1, model=Bedrock/mistral.ministral-3-8b-instruct ---
|
| 312 |
+
Step 1: INSPECT acc_0014 flagged=0/10 suspects=0 steps_left=79
|
| 313 |
+
Step 2: INSPECT acc_0835 flagged=0/10 suspects=0 steps_left=78
|
| 314 |
+
Step 3: INSPECT acc_0855 flagged=0/10 suspects=0 steps_left=77
|
| 315 |
+
Step 4: INSPECT acc_0930 flagged=0/10 suspects=0 steps_left=76
|
| 316 |
+
Step 5: INSPECT acc_0336 flagged=0/10 suspects=0 steps_left=75
|
| 317 |
+
Step 6: INSPECT acc_0929 flagged=0/10 suspects=0 steps_left=74
|
| 318 |
+
Step 7: INSPECT acc_0076 flagged=0/10 suspects=0 steps_left=73
|
| 319 |
+
Step 8: INSPECT acc_0543 flagged=0/10 suspects=0 steps_left=72
|
| 320 |
+
Step 9: INSPECT acc_0590 flagged=0/10 suspects=0 steps_left=71
|
| 321 |
+
Step 10: INSPECT acc_0401 flagged=0/10 suspects=0 steps_left=70
|
| 322 |
+
Step 11: INSPECT acc_0322 flagged=0/10 suspects=0 steps_left=69
|
| 323 |
+
Step 12: INSPECT acc_0154 flagged=0/10 suspects=0 steps_left=68
|
| 324 |
+
Step 13: INSPECT acc_0374 flagged=0/10 suspects=0 steps_left=67
|
| 325 |
+
Step 14: INSPECT acc_0549 flagged=0/10 suspects=0 steps_left=66
|
| 326 |
+
Step 15: INSPECT acc_0903 flagged=0/10 suspects=0 steps_left=65
|
| 327 |
+
Step 16: INSPECT acc_0976 flagged=0/10 suspects=0 steps_left=64
|
| 328 |
+
Step 17: INSPECT acc_0620 flagged=0/10 suspects=0 steps_left=63
|
| 329 |
+
Step 18: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=62
|
| 330 |
+
Step 19: INSPECT acc_0222 flagged=0/10 suspects=0 steps_left=61
|
| 331 |
+
Step 20: INSPECT acc_0536 flagged=0/10 suspects=0 steps_left=60
|
| 332 |
+
Step 21: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=59
|
| 333 |
+
Step 22: INSPECT acc_0577 flagged=0/10 suspects=0 steps_left=58
|
| 334 |
+
Step 23: INSPECT acc_0517 flagged=0/10 suspects=0 steps_left=57
|
| 335 |
+
Step 24: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=56
|
| 336 |
+
Step 25: INSPECT acc_0167 flagged=0/10 suspects=0 steps_left=55
|
| 337 |
+
Step 26: INSPECT acc_0697 flagged=0/10 suspects=0 steps_left=54
|
| 338 |
+
Step 27: INSPECT acc_0271 flagged=0/10 suspects=0 steps_left=53
|
| 339 |
+
Step 28: INSPECT acc_0681 flagged=0/10 suspects=0 steps_left=52
|
| 340 |
+
Step 29: INSPECT acc_0530 flagged=0/10 suspects=0 steps_left=51
|
| 341 |
+
Step 30: INSPECT acc_0353 flagged=0/10 suspects=0 steps_left=50
|
| 342 |
+
Step 31: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=49
|
| 343 |
+
Step 32: INSPECT acc_0777 flagged=0/10 suspects=0 steps_left=48
|
| 344 |
+
Step 33: INSPECT acc_0265 flagged=0/10 suspects=0 steps_left=47
|
| 345 |
+
Step 34: INSPECT acc_0788 flagged=0/10 suspects=0 steps_left=46
|
| 346 |
+
Step 35: INSPECT acc_0033 flagged=0/10 suspects=0 steps_left=45
|
| 347 |
+
Step 36: INSPECT acc_0187 flagged=0/10 suspects=0 steps_left=44
|
| 348 |
+
Step 37: INSPECT acc_0445 flagged=0/10 suspects=0 steps_left=43
|
| 349 |
+
Step 38: INSPECT acc_0846 flagged=0/10 suspects=0 steps_left=42
|
| 350 |
+
Step 39: INSPECT acc_0659 flagged=0/10 suspects=0 steps_left=41
|
| 351 |
+
Step 40: INSPECT acc_0768 flagged=0/10 suspects=0 steps_left=40
|
| 352 |
+
Step 41: INSPECT acc_0677 flagged=0/10 suspects=0 steps_left=39
|
| 353 |
+
Step 42: INSPECT acc_0539 flagged=0/10 suspects=0 steps_left=38
|
| 354 |
+
Step 43: INSPECT acc_0742 flagged=0/10 suspects=0 steps_left=37
|
| 355 |
+
Step 44: INSPECT acc_0503 flagged=0/10 suspects=0 steps_left=36
|
| 356 |
+
Step 45: INSPECT acc_0876 flagged=0/10 suspects=0 steps_left=35
|
| 357 |
+
Step 46: INSPECT acc_0639 flagged=0/10 suspects=0 steps_left=34
|
| 358 |
+
Step 47: INSPECT acc_0494 flagged=0/10 suspects=0 steps_left=33
|
| 359 |
+
Step 48: INSPECT acc_0898 flagged=0/10 suspects=0 steps_left=32
|
| 360 |
+
Step 49: INSPECT acc_0553 flagged=0/10 suspects=0 steps_left=31
|
| 361 |
+
Step 50: INSPECT acc_0588 flagged=0/10 suspects=0 steps_left=30
|
| 362 |
+
Step 51: INSPECT acc_0194 flagged=0/10 suspects=0 steps_left=29
|
| 363 |
+
Step 52: INSPECT acc_0810 flagged=0/10 suspects=0 steps_left=28
|
| 364 |
+
Step 53: INSPECT acc_0355 flagged=0/10 suspects=0 steps_left=27
|
| 365 |
+
Step 54: INSPECT acc_0363 flagged=0/10 suspects=0 steps_left=26
|
| 366 |
+
Step 55: INSPECT acc_0221 flagged=0/10 suspects=0 steps_left=25
|
| 367 |
+
Step 56: INSPECT acc_0580 flagged=0/10 suspects=0 steps_left=24
|
| 368 |
+
Step 57: INSPECT acc_0534 flagged=0/10 suspects=0 steps_left=23
|
| 369 |
+
Step 58: INSPECT acc_0778 flagged=0/10 suspects=0 steps_left=22
|
| 370 |
+
Step 59: INSPECT acc_0998 flagged=0/10 suspects=0 steps_left=21
|
| 371 |
+
Step 60: INSPECT acc_0233 flagged=0/10 suspects=0 steps_left=20
|
| 372 |
+
Step 61: INSPECT acc_0052 flagged=0/10 suspects=0 steps_left=19
|
| 373 |
+
Step 62: INSPECT acc_0813 flagged=4/10 suspects=6 steps_left=21
|
| 374 |
+
Step 63: FLAG acc_0014 flagged=5/10 suspects=5 steps_left=21
|
| 375 |
+
Step 64: FLAG acc_0028 flagged=6/10 suspects=4 steps_left=21
|
| 376 |
+
Step 65: FLAG acc_0000 flagged=7/10 suspects=3 steps_left=21
|
| 377 |
+
Step 66: FLAG acc_0012 flagged=8/10 suspects=2 steps_left=21
|
| 378 |
+
Step 67: FLAG acc_0007 flagged=9/10 suspects=1 steps_left=21
|
| 379 |
+
Step 68: FLAG acc_0036 flagged=10/10 suspects=0 steps_left=21
|
| 380 |
+
Step 69: SUBMIT flagged=10/10 suspects=0 steps_left=21
|
| 381 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.91
|
| 382 |
+
★ GRADER SCORE: 0.9700
|
| 383 |
+
|
| 384 |
+
--- LLM Agent: task=hard, seed=2, model=Bedrock/mistral.ministral-3-8b-instruct ---
|
| 385 |
+
Step 1: INSPECT acc_0813 flagged=0/10 suspects=0 steps_left=79
|
| 386 |
+
Step 2: INSPECT acc_0430 flagged=0/10 suspects=0 steps_left=78
|
| 387 |
+
Step 3: INSPECT acc_0817 flagged=0/10 suspects=0 steps_left=77
|
| 388 |
+
Step 4: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=76
|
| 389 |
+
Step 5: INSPECT acc_0523 flagged=1/10 suspects=1 steps_left=72
|
| 390 |
+
Step 6: INSPECT acc_0797 flagged=1/10 suspects=1 steps_left=66
|
| 391 |
+
Step 7: INSPECT acc_0664 flagged=1/10 suspects=1 steps_left=65
|
| 392 |
+
Step 8: INSPECT acc_0255 flagged=0/10 suspects=0 steps_left=42
|
| 393 |
+
Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
|
| 394 |
+
Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=40
|
| 395 |
+
Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=39
|
| 396 |
+
Step 12: FLAG acc_0181 flagged=7/10 suspects=11 steps_left=51
|
| 397 |
+
Step 13: FLAG acc_0389 flagged=8/10 suspects=10 steps_left=51
|
| 398 |
+
Step 14: FLAG acc_0658 flagged=9/10 suspects=9 steps_left=51
|
| 399 |
+
Step 15: FLAG acc_0507 flagged=10/10 suspects=0 steps_left=16
|
| 400 |
+
Step 16: SUBMIT flagged=0/10 suspects=0 steps_left=30
|
| 401 |
+
→ Episode ended: [LOSS] TP=0 FP=0 FN=10 Recall=0.00 Precision=0.00 Episode reward=-2.00
|
| 402 |
+
★ GRADER SCORE: 0.0000
|
| 403 |
+
|
| 404 |
+
hard: scores=['0.900', '0.970', '0.000'] mean=0.6233 var=0.195089
|
| 405 |
+
|
| 406 |
+
============================================================
|
| 407 |
+
EVALUATION COMPLETE
|
| 408 |
+
============================================================
|
| 409 |
+
ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$
|
| 410 |
+
ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$
|
model-benchmark-logs/nvidia_judge_log.txt
ADDED
|
@@ -0,0 +1,545 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$ python3 nvidia_test_judge_eval.py --url https://
|
| 2 |
+
pandago-graphstrike.hf.space --bedrock
|
| 3 |
+
GraphStrike Judge Evaluation Simulator
|
| 4 |
+
Target: https://pandago-graphstrike.hf.space
|
| 5 |
+
Backend: bedrock
|
| 6 |
+
Model: Bedrock/nvidia.nemotron-super-3-120b
|
| 7 |
+
Token: set
|
| 8 |
+
|
| 9 |
+
============================================================
|
| 10 |
+
PHASE 0: Endpoint Verification
|
| 11 |
+
============================================================
|
| 12 |
+
✓ GET /health
|
| 13 |
+
✓ GET /tasks
|
| 14 |
+
✓ GET /metadata
|
| 15 |
+
✓ GET /schema
|
| 16 |
+
✓ GET /web
|
| 17 |
+
✓ POST /reset
|
| 18 |
+
✓ GET /state
|
| 19 |
+
✓ POST /step
|
| 20 |
+
✓ POST /step
|
| 21 |
+
✓ GET /grader
|
| 22 |
+
✓ POST /mcp
|
| 23 |
+
✓ POST /baseline
|
| 24 |
+
|
| 25 |
+
============================================================
|
| 26 |
+
PHASE 1: Baseline Stability (3 runs)
|
| 27 |
+
============================================================
|
| 28 |
+
Run 1: easy=0.9100 medium=0.9060 hard=0.9038
|
| 29 |
+
Run 2: easy=0.9100 medium=0.9060 hard=0.9038
|
| 30 |
+
Run 3: easy=0.9100 medium=0.9060 hard=0.9038
|
| 31 |
+
✓ All 3 runs identical — baseline is deterministic
|
| 32 |
+
|
| 33 |
+
============================================================
|
| 34 |
+
PHASE 2: LLM Agent Evaluation (model=Bedrock/nvidia.nemotron-super-3-120b)
|
| 35 |
+
============================================================
|
| 36 |
+
|
| 37 |
+
--- LLM Agent: task=easy, seed=0, model=Bedrock/nvidia.nemotron-super-3-120b ---
|
| 38 |
+
Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=27
|
| 39 |
+
Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=27
|
| 40 |
+
Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=25
|
| 41 |
+
Step 4: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=25
|
| 42 |
+
Step 5: INSPECT acc_0047 flagged=2/10 suspects=8 steps_left=24
|
| 43 |
+
Step 6: FLAG acc_0047 flagged=3/10 suspects=7 steps_left=23
|
| 44 |
+
Step 7: INSPECT acc_0007 flagged=3/10 suspects=7 steps_left=22
|
| 45 |
+
Step 8: FLAG acc_0007 flagged=4/10 suspects=6 steps_left=21
|
| 46 |
+
Step 9: INSPECT acc_0001 flagged=4/10 suspects=6 steps_left=20
|
| 47 |
+
Step 10: FLAG acc_0001 flagged=5/10 suspects=5 steps_left=19
|
| 48 |
+
Step 11: INSPECT acc_0014 flagged=5/10 suspects=5 steps_left=18
|
| 49 |
+
Step 12: FLAG acc_0014 flagged=6/10 suspects=4 steps_left=17
|
| 50 |
+
Step 13: INSPECT acc_0012 flagged=6/10 suspects=4 steps_left=16
|
| 51 |
+
Step 14: FLAG acc_0012 flagged=7/10 suspects=3 steps_left=15
|
| 52 |
+
Step 15: INSPECT acc_0000 flagged=7/10 suspects=3 steps_left=14
|
| 53 |
+
Step 16: FLAG acc_0000 flagged=8/10 suspects=2 steps_left=13
|
| 54 |
+
Step 17: INSPECT acc_0027 flagged=8/10 suspects=2 steps_left=12
|
| 55 |
+
Step 18: FLAG acc_0027 flagged=9/10 suspects=1 steps_left=11
|
| 56 |
+
Step 19: INSPECT acc_0028 flagged=9/10 suspects=1 steps_left=10
|
| 57 |
+
Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=9
|
| 58 |
+
Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=9
|
| 59 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.79
|
| 60 |
+
★ GRADER SCORE: 0.9300
|
| 61 |
+
|
| 62 |
+
--- LLM Agent: task=medium, seed=0, model=Bedrock/nvidia.nemotron-super-3-120b ---
|
| 63 |
+
Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=48
|
| 64 |
+
Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=45
|
| 65 |
+
Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=43
|
| 66 |
+
Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=41
|
| 67 |
+
Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=39
|
| 68 |
+
Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=37
|
| 69 |
+
Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=35
|
| 70 |
+
Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=33
|
| 71 |
+
Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=31
|
| 72 |
+
Step 10: FLAG acc_0144 flagged=1/10 suspects=9 steps_left=31
|
| 73 |
+
Step 11: INSPECT acc_0131 flagged=1/10 suspects=9 steps_left=29
|
| 74 |
+
Step 12: FLAG acc_0131 flagged=2/10 suspects=8 steps_left=29
|
| 75 |
+
Step 13: INSPECT acc_0181 flagged=2/10 suspects=8 steps_left=27
|
| 76 |
+
Step 14: FLAG acc_0181 flagged=3/10 suspects=7 steps_left=26
|
| 77 |
+
Step 15: FLAG acc_0022 flagged=4/10 suspects=6 steps_left=26
|
| 78 |
+
Step 16: INSPECT acc_0092 flagged=4/10 suspects=6 steps_left=24
|
| 79 |
+
Step 17: FLAG acc_0092 flagged=5/10 suspects=5 steps_left=24
|
| 80 |
+
Step 18: INSPECT acc_0097 flagged=5/10 suspects=5 steps_left=22
|
| 81 |
+
Step 19: FLAG acc_0097 flagged=6/10 suspects=4 steps_left=22
|
| 82 |
+
Step 20: INSPECT acc_0187 flagged=6/10 suspects=4 steps_left=20
|
| 83 |
+
Step 21: FLAG acc_0187 flagged=7/10 suspects=3 steps_left=20
|
| 84 |
+
Step 22: INSPECT acc_0093 flagged=7/10 suspects=3 steps_left=19
|
| 85 |
+
Step 23: FLAG acc_0093 flagged=8/10 suspects=2 steps_left=18
|
| 86 |
+
Step 24: INSPECT acc_0172 flagged=8/10 suspects=2 steps_left=17
|
| 87 |
+
Step 25: FLAG acc_0172 flagged=9/10 suspects=1 steps_left=16
|
| 88 |
+
Step 26: INSPECT acc_0058 flagged=0/10 suspects=0 steps_left=79
|
| 89 |
+
Step 27: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=78
|
| 90 |
+
Step 28: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=77
|
| 91 |
+
Step 29: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=75
|
| 92 |
+
Step 30: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=73
|
| 93 |
+
Step 31: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=70
|
| 94 |
+
Step 32: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=68
|
| 95 |
+
Step 33: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=67
|
| 96 |
+
Step 34: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=65
|
| 97 |
+
Step 35: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=63
|
| 98 |
+
Step 36: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=61
|
| 99 |
+
Step 37: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=58
|
| 100 |
+
Step 38: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=56
|
| 101 |
+
Step 39: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=54
|
| 102 |
+
Step 40: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=53
|
| 103 |
+
Step 41: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=52
|
| 104 |
+
Step 42: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=50
|
| 105 |
+
Step 43: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=47
|
| 106 |
+
Step 44: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=46
|
| 107 |
+
Step 45: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=44
|
| 108 |
+
Step 46: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=42
|
| 109 |
+
Step 47: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=41
|
| 110 |
+
Step 48: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=40
|
| 111 |
+
Step 49: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=40
|
| 112 |
+
Step 50: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=39
|
| 113 |
+
Step 51: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=39
|
| 114 |
+
Step 52: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=38
|
| 115 |
+
Step 53: FLAG acc_0389 flagged=4/10 suspects=6 steps_left=38
|
| 116 |
+
Step 54: INSPECT acc_0160 flagged=4/10 suspects=6 steps_left=37
|
| 117 |
+
Step 55: FLAG acc_0160 flagged=5/10 suspects=5 steps_left=37
|
| 118 |
+
Step 56: INSPECT acc_0549 flagged=5/10 suspects=5 steps_left=36
|
| 119 |
+
Step 57: FLAG acc_0549 flagged=6/10 suspects=4 steps_left=36
|
| 120 |
+
Step 58: INSPECT acc_0658 flagged=6/10 suspects=4 steps_left=35
|
| 121 |
+
Step 59: FLAG acc_0658 flagged=7/10 suspects=3 steps_left=35
|
| 122 |
+
Step 60: INSPECT acc_0290 flagged=7/10 suspects=3 steps_left=34
|
| 123 |
+
Step 61: FLAG acc_0290 flagged=8/10 suspects=2 steps_left=34
|
| 124 |
+
Step 62: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=33
|
| 125 |
+
Step 63: FLAG acc_0507 flagged=9/10 suspects=1 steps_left=33
|
| 126 |
+
Step 64: FLAG acc_0124 flagged=10/10 suspects=0 steps_left=33
|
| 127 |
+
Step 65: SUBMIT flagged=10/10 suspects=0 steps_left=33
|
| 128 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=14.53
|
| 129 |
+
★ GRADER SCORE: 0.9413
|
| 130 |
+
|
| 131 |
+
--- LLM Agent: task=hard, seed=0, model=Bedrock/nvidia.nemotron-super-3-120b ---
|
| 132 |
+
Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
|
| 133 |
+
Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
|
| 134 |
+
Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
|
| 135 |
+
Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
|
| 136 |
+
Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
|
| 137 |
+
Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
|
| 138 |
+
Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
|
| 139 |
+
Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
|
| 140 |
+
Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
|
| 141 |
+
Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
|
| 142 |
+
Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
|
| 143 |
+
Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
|
| 144 |
+
Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
|
| 145 |
+
Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
|
| 146 |
+
Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
|
| 147 |
+
Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
|
| 148 |
+
Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
|
| 149 |
+
Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
|
| 150 |
+
Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
|
| 151 |
+
Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
|
| 152 |
+
Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
|
| 153 |
+
Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
|
| 154 |
+
Step 23: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=59
|
| 155 |
+
Step 24: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=58
|
| 156 |
+
Step 25: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=58
|
| 157 |
+
Step 26: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=57
|
| 158 |
+
Step 27: FLAG acc_0389 flagged=4/10 suspects=6 steps_left=57
|
| 159 |
+
Step 28: INSPECT acc_0160 flagged=4/10 suspects=6 steps_left=56
|
| 160 |
+
Step 29: FLAG acc_0160 flagged=5/10 suspects=5 steps_left=56
|
| 161 |
+
Step 30: INSPECT acc_0549 flagged=5/10 suspects=5 steps_left=55
|
| 162 |
+
Step 31: FLAG acc_0549 flagged=6/10 suspects=4 steps_left=55
|
| 163 |
+
Step 32: INSPECT acc_0658 flagged=6/10 suspects=4 steps_left=54
|
| 164 |
+
Step 33: FLAG acc_0658 flagged=7/10 suspects=3 steps_left=54
|
| 165 |
+
Step 34: INSPECT acc_0290 flagged=7/10 suspects=3 steps_left=53
|
| 166 |
+
Step 35: FLAG acc_0290 flagged=8/10 suspects=2 steps_left=53
|
| 167 |
+
Step 36: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=52
|
| 168 |
+
Step 37: FLAG acc_0124 flagged=9/10 suspects=1 steps_left=52
|
| 169 |
+
Step 38: INSPECT acc_0507 flagged=9/10 suspects=1 steps_left=51
|
| 170 |
+
Step 39: FLAG acc_0507 flagged=10/10 suspects=0 steps_left=51
|
| 171 |
+
Step 40: SUBMIT flagged=10/10 suspects=0 steps_left=51
|
| 172 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.71
|
| 173 |
+
★ GRADER SCORE: 0.9637
|
| 174 |
+
|
| 175 |
+
Summary: easy=0.9300 medium=0.9413 hard=0.9637
|
| 176 |
+
|
| 177 |
+
============================================================
|
| 178 |
+
PHASE 3: Score Variance (seeds=[0, 1, 2])
|
| 179 |
+
============================================================
|
| 180 |
+
|
| 181 |
+
--- LLM Agent: task=easy, seed=0, model=Bedrock/nvidia.nemotron-super-3-120b ---
|
| 182 |
+
Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
|
| 183 |
+
Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
|
| 184 |
+
Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
|
| 185 |
+
Step 4: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=28
|
| 186 |
+
Step 5: INSPECT acc_0001 flagged=2/10 suspects=8 steps_left=27
|
| 187 |
+
Step 6: FLAG acc_0001 flagged=3/10 suspects=7 steps_left=27
|
| 188 |
+
Step 7: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=26
|
| 189 |
+
Step 8: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=26
|
| 190 |
+
Step 9: INSPECT acc_0012 flagged=4/10 suspects=6 steps_left=25
|
| 191 |
+
Step 10: FLAG acc_0012 flagged=5/10 suspects=5 steps_left=25
|
| 192 |
+
Step 11: INSPECT acc_0000 flagged=5/10 suspects=5 steps_left=24
|
| 193 |
+
Step 12: FLAG acc_0000 flagged=6/10 suspects=4 steps_left=24
|
| 194 |
+
Step 13: INSPECT acc_0027 flagged=6/10 suspects=4 steps_left=23
|
| 195 |
+
Step 14: FLAG acc_0027 flagged=7/10 suspects=3 steps_left=23
|
| 196 |
+
Step 15: INSPECT acc_0047 flagged=7/10 suspects=3 steps_left=22
|
| 197 |
+
Step 16: FLAG acc_0047 flagged=8/10 suspects=2 steps_left=22
|
| 198 |
+
Step 17: INSPECT acc_0007 flagged=8/10 suspects=2 steps_left=21
|
| 199 |
+
Step 18: FLAG acc_0007 flagged=9/10 suspects=1 steps_left=21
|
| 200 |
+
Step 19: INSPECT acc_0028 flagged=9/10 suspects=1 steps_left=20
|
| 201 |
+
Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
|
| 202 |
+
Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=20
|
| 203 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.90
|
| 204 |
+
★ GRADER SCORE: 0.9667
|
| 205 |
+
|
| 206 |
+
--- LLM Agent: task=easy, seed=1, model=Bedrock/nvidia.nemotron-super-3-120b ---
|
| 207 |
+
Step 1: INSPECT acc_0034 flagged=0/10 suspects=0 steps_left=29
|
| 208 |
+
Step 2: INSPECT acc_0003 flagged=0/10 suspects=0 steps_left=28
|
| 209 |
+
Step 3: INSPECT acc_0049 flagged=0/10 suspects=0 steps_left=27
|
| 210 |
+
Step 4: INSPECT acc_0006 flagged=0/10 suspects=0 steps_left=26
|
| 211 |
+
Step 5: INSPECT acc_0047 flagged=0/10 suspects=0 steps_left=25
|
| 212 |
+
Step 6: FLAG acc_0047 flagged=1/10 suspects=9 steps_left=25
|
| 213 |
+
Step 7: INSPECT acc_0009 flagged=1/10 suspects=9 steps_left=24
|
| 214 |
+
Step 8: FLAG acc_0009 flagged=2/10 suspects=8 steps_left=24
|
| 215 |
+
Step 9: INSPECT acc_0046 flagged=2/10 suspects=8 steps_left=23
|
| 216 |
+
Step 10: FLAG acc_0046 flagged=3/10 suspects=7 steps_left=23
|
| 217 |
+
Step 11: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=22
|
| 218 |
+
Step 12: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=22
|
| 219 |
+
Step 13: INSPECT acc_0021 flagged=4/10 suspects=6 steps_left=21
|
| 220 |
+
Step 14: FLAG acc_0021 flagged=5/10 suspects=5 steps_left=21
|
| 221 |
+
Step 15: INSPECT acc_0002 flagged=5/10 suspects=5 steps_left=20
|
| 222 |
+
Step 16: FLAG acc_0002 flagged=6/10 suspects=4 steps_left=20
|
| 223 |
+
Step 17: INSPECT acc_0048 flagged=6/10 suspects=4 steps_left=19
|
| 224 |
+
Step 18: FLAG acc_0048 flagged=7/10 suspects=3 steps_left=19
|
| 225 |
+
Step 19: INSPECT acc_0029 flagged=7/10 suspects=3 steps_left=18
|
| 226 |
+
Step 20: FLAG acc_0029 flagged=8/10 suspects=2 steps_left=18
|
| 227 |
+
Step 21: INSPECT acc_0015 flagged=8/10 suspects=2 steps_left=17
|
| 228 |
+
Step 22: FLAG acc_0015 flagged=9/10 suspects=1 steps_left=17
|
| 229 |
+
Step 23: INSPECT acc_0005 flagged=9/10 suspects=1 steps_left=16
|
| 230 |
+
Step 24: FLAG acc_0005 flagged=10/10 suspects=0 steps_left=16
|
| 231 |
+
Step 25: SUBMIT flagged=10/10 suspects=0 steps_left=16
|
| 232 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.86
|
| 233 |
+
★ GRADER SCORE: 0.9533
|
| 234 |
+
|
| 235 |
+
--- LLM Agent: task=easy, seed=2, model=Bedrock/nvidia.nemotron-super-3-120b ---
|
| 236 |
+
Step 1: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=29
|
| 237 |
+
Step 2: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=28
|
| 238 |
+
Step 3: INSPECT acc_0025 flagged=0/10 suspects=0 steps_left=27
|
| 239 |
+
Step 4: INSPECT acc_0026 flagged=0/10 suspects=0 steps_left=26
|
| 240 |
+
Step 5: INSPECT acc_0038 flagged=0/10 suspects=0 steps_left=25
|
| 241 |
+
Step 6: INSPECT acc_0029 flagged=0/10 suspects=0 steps_left=24
|
| 242 |
+
Step 7: FLAG acc_0029 flagged=1/10 suspects=9 steps_left=24
|
| 243 |
+
Step 8: INSPECT acc_0006 flagged=1/10 suspects=9 steps_left=23
|
| 244 |
+
Step 9: FLAG acc_0006 flagged=2/10 suspects=8 steps_left=23
|
| 245 |
+
Step 10: INSPECT acc_0033 flagged=2/10 suspects=8 steps_left=22
|
| 246 |
+
Step 11: FLAG acc_0033 flagged=3/10 suspects=7 steps_left=22
|
| 247 |
+
Step 12: INSPECT acc_0015 flagged=3/10 suspects=7 steps_left=21
|
| 248 |
+
Step 13: FLAG acc_0015 flagged=4/10 suspects=6 steps_left=21
|
| 249 |
+
Step 14: INSPECT acc_0022 flagged=4/10 suspects=6 steps_left=20
|
| 250 |
+
Step 15: FLAG acc_0022 flagged=5/10 suspects=5 steps_left=20
|
| 251 |
+
Step 16: INSPECT acc_0009 flagged=5/10 suspects=5 steps_left=19
|
| 252 |
+
Step 17: FLAG acc_0009 flagged=6/10 suspects=4 steps_left=19
|
| 253 |
+
Step 18: INSPECT acc_0004 flagged=6/10 suspects=4 steps_left=18
|
| 254 |
+
Step 19: FLAG acc_0004 flagged=7/10 suspects=3 steps_left=18
|
| 255 |
+
Step 20: INSPECT acc_0024 flagged=7/10 suspects=3 steps_left=17
|
| 256 |
+
Step 21: FLAG acc_0024 flagged=8/10 suspects=2 steps_left=17
|
| 257 |
+
Step 22: INSPECT acc_0049 flagged=8/10 suspects=2 steps_left=16
|
| 258 |
+
Step 23: FLAG acc_0049 flagged=9/10 suspects=1 steps_left=16
|
| 259 |
+
Step 24: INSPECT acc_0035 flagged=9/10 suspects=1 steps_left=15
|
| 260 |
+
Step 25: FLAG acc_0035 flagged=10/10 suspects=0 steps_left=15
|
| 261 |
+
Step 26: SUBMIT flagged=10/10 suspects=0 steps_left=15
|
| 262 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.85
|
| 263 |
+
★ GRADER SCORE: 0.9500
|
| 264 |
+
|
| 265 |
+
easy: scores=['0.967', '0.953', '0.950'] mean=0.9567 var=0.000052
|
| 266 |
+
|
| 267 |
+
--- LLM Agent: task=medium, seed=0, model=Bedrock/nvidia.nemotron-super-3-120b ---
|
| 268 |
+
Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
|
| 269 |
+
Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
|
| 270 |
+
Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
|
| 271 |
+
Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=46
|
| 272 |
+
Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=45
|
| 273 |
+
Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
|
| 274 |
+
Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
|
| 275 |
+
Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
|
| 276 |
+
Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
|
| 277 |
+
Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
|
| 278 |
+
Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=40
|
| 279 |
+
Step 12: FLAG acc_0181 flagged=2/10 suspects=8 steps_left=40
|
| 280 |
+
Step 13: INSPECT acc_0022 flagged=2/10 suspects=8 steps_left=39
|
| 281 |
+
Step 14: FLAG acc_0022 flagged=3/10 suspects=7 steps_left=39
|
| 282 |
+
Step 15: INSPECT acc_0092 flagged=3/10 suspects=7 steps_left=38
|
| 283 |
+
Step 16: FLAG acc_0092 flagged=4/10 suspects=6 steps_left=38
|
| 284 |
+
Step 17: INSPECT acc_0097 flagged=4/10 suspects=6 steps_left=37
|
| 285 |
+
Step 18: FLAG acc_0097 flagged=5/10 suspects=5 steps_left=37
|
| 286 |
+
Step 19: INSPECT acc_0187 flagged=5/10 suspects=5 steps_left=36
|
| 287 |
+
Step 20: FLAG acc_0187 flagged=6/10 suspects=4 steps_left=36
|
| 288 |
+
Step 21: INSPECT acc_0093 flagged=6/10 suspects=4 steps_left=35
|
| 289 |
+
Step 22: FLAG acc_0093 flagged=7/10 suspects=3 steps_left=34
|
| 290 |
+
Step 23: INSPECT acc_0172 flagged=7/10 suspects=3 steps_left=33
|
| 291 |
+
Step 24: FLAG acc_0172 flagged=8/10 suspects=2 steps_left=33
|
| 292 |
+
Step 25: INSPECT acc_0058 flagged=8/10 suspects=2 steps_left=31
|
| 293 |
+
Step 26: FLAG acc_0058 flagged=9/10 suspects=1 steps_left=31
|
| 294 |
+
Step 27: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=29
|
| 295 |
+
Step 28: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=29
|
| 296 |
+
Step 29: SUBMIT flagged=10/10 suspects=0 steps_left=29
|
| 297 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.79
|
| 298 |
+
★ GRADER SCORE: 0.9580
|
| 299 |
+
|
| 300 |
+
--- LLM Agent: task=medium, seed=1, model=Bedrock/nvidia.nemotron-super-3-120b ---
|
| 301 |
+
Step 1: INSPECT acc_0171 flagged=0/10 suspects=0 steps_left=49
|
| 302 |
+
Step 2: INSPECT acc_0099 flagged=1/10 suspects=3 steps_left=48
|
| 303 |
+
Step 3: INSPECT acc_0012 flagged=1/10 suspects=3 steps_left=46
|
| 304 |
+
Step 4: FLAG acc_0012 flagged=2/10 suspects=7 steps_left=46
|
| 305 |
+
Step 5: INSPECT acc_0033 flagged=2/10 suspects=7 steps_left=45
|
| 306 |
+
Step 6: FLAG acc_0033 flagged=3/10 suspects=7 steps_left=44
|
| 307 |
+
Step 7: INSPECT acc_0174 flagged=3/10 suspects=7 steps_left=43
|
| 308 |
+
Step 8: FLAG acc_0174 flagged=4/10 suspects=6 steps_left=43
|
| 309 |
+
Step 9: INSPECT acc_0187 flagged=4/10 suspects=6 steps_left=42
|
| 310 |
+
Step 10: FLAG acc_0187 flagged=5/10 suspects=5 steps_left=42
|
| 311 |
+
Step 11: INSPECT acc_0079 flagged=5/10 suspects=5 steps_left=41
|
| 312 |
+
Step 12: FLAG acc_0079 flagged=6/10 suspects=4 steps_left=41
|
| 313 |
+
Step 13: INSPECT acc_0019 flagged=6/10 suspects=4 steps_left=40
|
| 314 |
+
Step 14: FLAG acc_0019 flagged=7/10 suspects=3 steps_left=40
|
| 315 |
+
Step 15: INSPECT acc_0023 flagged=7/10 suspects=3 steps_left=39
|
| 316 |
+
Step 16: FLAG acc_0023 flagged=8/10 suspects=2 steps_left=39
|
| 317 |
+
Step 17: INSPECT acc_0032 flagged=8/10 suspects=2 steps_left=38
|
| 318 |
+
Step 18: FLAG acc_0032 flagged=9/10 suspects=1 steps_left=38
|
| 319 |
+
Step 19: INSPECT acc_0146 flagged=9/10 suspects=1 steps_left=37
|
| 320 |
+
Step 20: FLAG acc_0146 flagged=10/10 suspects=0 steps_left=37
|
| 321 |
+
Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=37
|
| 322 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.87
|
| 323 |
+
★ GRADER SCORE: 0.9740
|
| 324 |
+
|
| 325 |
+
--- LLM Agent: task=medium, seed=2, model=Bedrock/nvidia.nemotron-super-3-120b ---
|
| 326 |
+
Step 1: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=49
|
| 327 |
+
Step 2: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=48
|
| 328 |
+
Step 3: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=47
|
| 329 |
+
Step 4: INSPECT acc_0030 flagged=0/10 suspects=0 steps_left=46
|
| 330 |
+
Step 5: INSPECT acc_0041 flagged=0/10 suspects=0 steps_left=45
|
| 331 |
+
Step 6: INSPECT acc_0054 flagged=0/10 suspects=0 steps_left=44
|
| 332 |
+
Step 7: INSPECT acc_0199 flagged=0/10 suspects=0 steps_left=43
|
| 333 |
+
Step 8: INSPECT acc_0181 flagged=0/10 suspects=0 steps_left=42
|
| 334 |
+
Step 9: INSPECT acc_0166 flagged=0/10 suspects=0 steps_left=41
|
| 335 |
+
Step 10: INSPECT acc_0098 flagged=0/10 suspects=0 steps_left=40
|
| 336 |
+
Step 11: INSPECT acc_0121 flagged=0/10 suspects=0 steps_left=39
|
| 337 |
+
Step 12: INSPECT acc_0053 flagged=0/10 suspects=0 steps_left=38
|
| 338 |
+
Step 13: INSPECT acc_0103 flagged=0/10 suspects=0 steps_left=37
|
| 339 |
+
Step 14: INSPECT acc_0000 flagged=0/10 suspects=0 steps_left=36
|
| 340 |
+
Step 15: INSPECT acc_0168 flagged=0/10 suspects=0 steps_left=35
|
| 341 |
+
Step 16: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=34
|
| 342 |
+
Step 17: INSPECT acc_0149 flagged=0/10 suspects=0 steps_left=33
|
| 343 |
+
Step 18: INSPECT acc_0064 flagged=0/10 suspects=0 steps_left=32
|
| 344 |
+
Step 19: INSPECT acc_0016 flagged=0/10 suspects=0 steps_left=31
|
| 345 |
+
Step 20: INSPECT acc_0105 flagged=0/10 suspects=0 steps_left=30
|
| 346 |
+
Step 21: INSPECT acc_0035 flagged=0/10 suspects=0 steps_left=29
|
| 347 |
+
Step 22: FLAG acc_0035 flagged=1/10 suspects=9 steps_left=29
|
| 348 |
+
Step 23: INSPECT acc_0020 flagged=1/10 suspects=9 steps_left=28
|
| 349 |
+
Step 24: FLAG acc_0020 flagged=2/10 suspects=8 steps_left=28
|
| 350 |
+
Step 25: INSPECT acc_0036 flagged=2/10 suspects=8 steps_left=27
|
| 351 |
+
Step 26: FLAG acc_0036 flagged=3/10 suspects=7 steps_left=27
|
| 352 |
+
Step 27: INSPECT acc_0050 flagged=3/10 suspects=7 steps_left=26
|
| 353 |
+
Step 28: FLAG acc_0050 flagged=4/10 suspects=6 steps_left=26
|
| 354 |
+
Step 29: INSPECT acc_0051 flagged=4/10 suspects=6 steps_left=25
|
| 355 |
+
Step 30: FLAG acc_0051 flagged=5/10 suspects=5 steps_left=25
|
| 356 |
+
Step 31: INSPECT acc_0085 flagged=5/10 suspects=5 steps_left=24
|
| 357 |
+
Step 32: FLAG acc_0085 flagged=6/10 suspects=4 steps_left=24
|
| 358 |
+
Step 33: INSPECT acc_0177 flagged=6/10 suspects=4 steps_left=23
|
| 359 |
+
Step 34: FLAG acc_0177 flagged=7/10 suspects=3 steps_left=23
|
| 360 |
+
Step 35: INSPECT acc_0170 flagged=7/10 suspects=3 steps_left=22
|
| 361 |
+
Step 36: FLAG acc_0170 flagged=8/10 suspects=2 steps_left=22
|
| 362 |
+
Step 37: INSPECT acc_0055 flagged=8/10 suspects=2 steps_left=21
|
| 363 |
+
Step 38: FLAG acc_0055 flagged=9/10 suspects=1 steps_left=21
|
| 364 |
+
Step 39: INSPECT acc_0094 flagged=9/10 suspects=1 steps_left=20
|
| 365 |
+
Step 40: FLAG acc_0094 flagged=10/10 suspects=0 steps_left=20
|
| 366 |
+
Step 41: SUBMIT flagged=10/10 suspects=0 steps_left=20
|
| 367 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.70
|
| 368 |
+
★ GRADER SCORE: 0.9400
|
| 369 |
+
|
| 370 |
+
medium: scores=['0.958', '0.974', '0.940'] mean=0.9573 var=0.000193
|
| 371 |
+
|
| 372 |
+
--- LLM Agent: task=hard, seed=0, model=Bedrock/nvidia.nemotron-super-3-120b ---
|
| 373 |
+
Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
|
| 374 |
+
Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
|
| 375 |
+
Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
|
| 376 |
+
Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
|
| 377 |
+
Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
|
| 378 |
+
Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
|
| 379 |
+
Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
|
| 380 |
+
Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
|
| 381 |
+
Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
|
| 382 |
+
Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
|
| 383 |
+
Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
|
| 384 |
+
Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
|
| 385 |
+
Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
|
| 386 |
+
Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
|
| 387 |
+
Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
|
| 388 |
+
Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
|
| 389 |
+
Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
|
| 390 |
+
Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
|
| 391 |
+
Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
|
| 392 |
+
Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
|
| 393 |
+
Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
|
| 394 |
+
Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
|
| 395 |
+
Step 23: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=59
|
| 396 |
+
Step 24: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=58
|
| 397 |
+
Step 25: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=58
|
| 398 |
+
Step 26: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=57
|
| 399 |
+
Step 27: INSPECT acc_0160 flagged=3/10 suspects=6 steps_left=56
|
| 400 |
+
Step 28: FLAG acc_0389 flagged=4/10 suspects=6 steps_left=56
|
| 401 |
+
Step 29: FLAG acc_0160 flagged=5/10 suspects=5 steps_left=56
|
| 402 |
+
Step 30: INSPECT acc_0549 flagged=5/10 suspects=5 steps_left=55
|
| 403 |
+
Step 31: FLAG acc_0549 flagged=6/10 suspects=4 steps_left=55
|
| 404 |
+
Step 32: INSPECT acc_0658 flagged=6/10 suspects=4 steps_left=54
|
| 405 |
+
Step 33: FLAG acc_0658 flagged=7/10 suspects=3 steps_left=54
|
| 406 |
+
Step 34: INSPECT acc_0290 flagged=7/10 suspects=3 steps_left=53
|
| 407 |
+
Step 35: FLAG acc_0290 flagged=8/10 suspects=2 steps_left=53
|
| 408 |
+
Step 36: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=52
|
| 409 |
+
Step 37: FLAG acc_0124 flagged=9/10 suspects=1 steps_left=52
|
| 410 |
+
Step 38: INSPECT acc_0507 flagged=9/10 suspects=1 steps_left=51
|
| 411 |
+
Step 39: FLAG acc_0507 flagged=10/10 suspects=0 steps_left=51
|
| 412 |
+
Step 40: SUBMIT flagged=10/10 suspects=0 steps_left=51
|
| 413 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.71
|
| 414 |
+
★ GRADER SCORE: 0.9637
|
| 415 |
+
|
| 416 |
+
--- LLM Agent: task=hard, seed=1, model=Bedrock/nvidia.nemotron-super-3-120b ---
|
| 417 |
+
Step 1: INSPECT acc_0014 flagged=0/10 suspects=0 steps_left=79
|
| 418 |
+
Step 2: INSPECT acc_0835 flagged=0/10 suspects=0 steps_left=78
|
| 419 |
+
Step 3: INSPECT acc_0855 flagged=0/10 suspects=0 steps_left=77
|
| 420 |
+
Step 4: INSPECT acc_0930 flagged=0/10 suspects=0 steps_left=76
|
| 421 |
+
Step 5: INSPECT acc_0336 flagged=0/10 suspects=0 steps_left=75
|
| 422 |
+
Step 6: INSPECT acc_0929 flagged=0/10 suspects=0 steps_left=74
|
| 423 |
+
Step 7: INSPECT acc_0076 flagged=0/10 suspects=0 steps_left=73
|
| 424 |
+
Step 8: INSPECT acc_0543 flagged=0/10 suspects=0 steps_left=72
|
| 425 |
+
Step 9: INSPECT acc_0590 flagged=0/10 suspects=0 steps_left=71
|
| 426 |
+
Step 10: INSPECT acc_0401 flagged=0/10 suspects=0 steps_left=70
|
| 427 |
+
Step 11: INSPECT acc_0322 flagged=0/10 suspects=0 steps_left=69
|
| 428 |
+
Step 12: INSPECT acc_0154 flagged=0/10 suspects=0 steps_left=68
|
| 429 |
+
Step 13: INSPECT acc_0374 flagged=0/10 suspects=0 steps_left=67
|
| 430 |
+
Step 14: INSPECT acc_0549 flagged=0/10 suspects=0 steps_left=66
|
| 431 |
+
Step 15: INSPECT acc_0903 flagged=0/10 suspects=0 steps_left=65
|
| 432 |
+
Step 16: INSPECT acc_0976 flagged=0/10 suspects=0 steps_left=64
|
| 433 |
+
Step 17: INSPECT acc_0620 flagged=0/10 suspects=0 steps_left=63
|
| 434 |
+
Step 18: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=62
|
| 435 |
+
Step 19: INSPECT acc_0222 flagged=0/10 suspects=0 steps_left=61
|
| 436 |
+
Step 20: INSPECT acc_0536 flagged=0/10 suspects=0 steps_left=60
|
| 437 |
+
Step 21: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=59
|
| 438 |
+
Step 22: INSPECT acc_0577 flagged=0/10 suspects=0 steps_left=58
|
| 439 |
+
Step 23: INSPECT acc_0517 flagged=0/10 suspects=0 steps_left=57
|
| 440 |
+
Step 24: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=56
|
| 441 |
+
Step 25: INSPECT acc_0167 flagged=0/10 suspects=0 steps_left=55
|
| 442 |
+
Step 26: INSPECT acc_0697 flagged=0/10 suspects=0 steps_left=54
|
| 443 |
+
Step 27: INSPECT acc_0271 flagged=0/10 suspects=0 steps_left=53
|
| 444 |
+
Step 28: INSPECT acc_0681 flagged=0/10 suspects=0 steps_left=52
|
| 445 |
+
Step 29: INSPECT acc_0530 flagged=0/10 suspects=0 steps_left=51
|
| 446 |
+
Step 30: INSPECT acc_0353 flagged=0/10 suspects=0 steps_left=50
|
| 447 |
+
Step 31: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=49
|
| 448 |
+
Step 32: INSPECT acc_0777 flagged=0/10 suspects=0 steps_left=48
|
| 449 |
+
Step 33: INSPECT acc_0265 flagged=0/10 suspects=0 steps_left=47
|
| 450 |
+
Step 34: INSPECT acc_0788 flagged=0/10 suspects=0 steps_left=46
|
| 451 |
+
Step 35: INSPECT acc_0033 flagged=0/10 suspects=0 steps_left=45
|
| 452 |
+
Step 36: INSPECT acc_0187 flagged=0/10 suspects=0 steps_left=44
|
| 453 |
+
Step 37: INSPECT acc_0445 flagged=0/10 suspects=0 steps_left=43
|
| 454 |
+
Step 38: INSPECT acc_0846 flagged=0/10 suspects=0 steps_left=42
|
| 455 |
+
Step 39: INSPECT acc_0659 flagged=0/10 suspects=0 steps_left=41
|
| 456 |
+
Step 40: INSPECT acc_0768 flagged=0/10 suspects=0 steps_left=40
|
| 457 |
+
Step 41: INSPECT acc_0677 flagged=0/10 suspects=0 steps_left=39
|
| 458 |
+
Step 42: INSPECT acc_0539 flagged=0/10 suspects=0 steps_left=38
|
| 459 |
+
Step 43: INSPECT acc_0742 flagged=1/10 suspects=10 steps_left=37
|
| 460 |
+
Step 44: INSPECT acc_0876 flagged=1/10 suspects=10 steps_left=35
|
| 461 |
+
Step 45: INSPECT acc_0884 flagged=1/10 suspects=10 steps_left=33
|
| 462 |
+
Step 46: INSPECT acc_0245 flagged=1/10 suspects=10 steps_left=32
|
| 463 |
+
Step 47: INSPECT acc_0223 flagged=1/10 suspects=10 steps_left=31
|
| 464 |
+
Step 48: INSPECT acc_0647 flagged=1/10 suspects=10 steps_left=30
|
| 465 |
+
Step 49: INSPECT acc_0994 flagged=1/10 suspects=10 steps_left=29
|
| 466 |
+
Step 50: INSPECT acc_0811 flagged=1/10 suspects=10 steps_left=28
|
| 467 |
+
Step 51: INSPECT acc_0794 flagged=1/10 suspects=10 steps_left=27
|
| 468 |
+
Step 52: INSPECT acc_0582 flagged=1/10 suspects=10 steps_left=26
|
| 469 |
+
Step 53: INSPECT acc_0503 flagged=1/10 suspects=10 steps_left=25
|
| 470 |
+
Step 54: FLAG acc_0174 flagged=1/10 suspects=10 steps_left=25
|
| 471 |
+
Step 55: INSPECT acc_0639 flagged=1/10 suspects=10 steps_left=24
|
| 472 |
+
Step 56: INSPECT acc_0494 flagged=1/10 suspects=10 steps_left=23
|
| 473 |
+
Step 57: INSPECT acc_0898 flagged=1/10 suspects=10 steps_left=22
|
| 474 |
+
Step 58: INSPECT acc_0553 flagged=1/10 suspects=10 steps_left=21
|
| 475 |
+
Step 59: INSPECT acc_0588 flagged=1/10 suspects=10 steps_left=20
|
| 476 |
+
Step 60: INSPECT acc_0194 flagged=1/10 suspects=10 steps_left=19
|
| 477 |
+
Step 61: INSPECT acc_0810 flagged=1/10 suspects=10 steps_left=18
|
| 478 |
+
Step 62: INSPECT acc_0355 flagged=1/10 suspects=10 steps_left=17
|
| 479 |
+
Step 63: INSPECT acc_0363 flagged=1/10 suspects=10 steps_left=16
|
| 480 |
+
Step 64: INSPECT acc_0221 flagged=1/10 suspects=10 steps_left=15
|
| 481 |
+
Step 65: INSPECT acc_0580 flagged=1/10 suspects=10 steps_left=14
|
| 482 |
+
Step 66: INSPECT acc_0534 flagged=1/10 suspects=10 steps_left=13
|
| 483 |
+
Step 67: INSPECT acc_0778 flagged=1/10 suspects=10 steps_left=12
|
| 484 |
+
Step 68: INSPECT acc_0998 flagged=1/10 suspects=10 steps_left=11
|
| 485 |
+
Step 69: INSPECT acc_0233 flagged=1/10 suspects=10 steps_left=10
|
| 486 |
+
Step 70: INSPECT acc_0052 flagged=1/10 suspects=10 steps_left=9
|
| 487 |
+
Step 71: INSPECT acc_0813 flagged=1/10 suspects=10 steps_left=8
|
| 488 |
+
Step 72: INSPECT acc_0035 flagged=1/10 suspects=10 steps_left=7
|
| 489 |
+
Step 73: INSPECT acc_0667 flagged=1/10 suspects=10 steps_left=6
|
| 490 |
+
Step 74: INSPECT acc_0019 flagged=1/10 suspects=10 steps_left=5
|
| 491 |
+
Step 75: INSPECT acc_0959 flagged=1/10 suspects=10 steps_left=4
|
| 492 |
+
Step 76: INSPECT acc_0212 flagged=1/10 suspects=10 steps_left=3
|
| 493 |
+
Step 77: INSPECT acc_0776 flagged=1/10 suspects=10 steps_left=2
|
| 494 |
+
Step 78: INSPECT acc_0049 flagged=1/10 suspects=10 steps_left=1
|
| 495 |
+
Step 79: FLAG acc_0174 flagged=1/10 suspects=10 steps_left=1
|
| 496 |
+
Step 80: FLAG acc_0434 flagged=2/10 suspects=18 steps_left=1
|
| 497 |
+
Step 81: INSPECT acc_0369 flagged=2/10 suspects=18 steps_left=0
|
| 498 |
+
→ Episode ended: [LOSS] TP=0 FP=2 FN=10 Recall=0.00 Precision=0.00 Episode reward=-10.80
|
| 499 |
+
★ GRADER SCORE: 0.0000
|
| 500 |
+
|
| 501 |
+
--- LLM Agent: task=hard, seed=2, model=Bedrock/nvidia.nemotron-super-3-120b ---
|
| 502 |
+
Step 1: INSPECT acc_0813 flagged=0/10 suspects=0 steps_left=79
|
| 503 |
+
Step 2: INSPECT acc_0430 flagged=0/10 suspects=0 steps_left=78
|
| 504 |
+
Step 3: INSPECT acc_0817 flagged=0/10 suspects=0 steps_left=77
|
| 505 |
+
Step 4: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=76
|
| 506 |
+
Step 5: INSPECT acc_0523 flagged=0/10 suspects=0 steps_left=75
|
| 507 |
+
Step 6: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=74
|
| 508 |
+
Step 7: INSPECT acc_0797 flagged=0/10 suspects=0 steps_left=73
|
| 509 |
+
Step 8: INSPECT acc_0478 flagged=0/10 suspects=0 steps_left=72
|
| 510 |
+
Step 9: INSPECT acc_0861 flagged=0/10 suspects=0 steps_left=71
|
| 511 |
+
Step 10: INSPECT acc_0836 flagged=0/10 suspects=0 steps_left=70
|
| 512 |
+
Step 11: INSPECT acc_0926 flagged=0/10 suspects=0 steps_left=69
|
| 513 |
+
Step 12: INSPECT acc_0664 flagged=0/10 suspects=0 steps_left=68
|
| 514 |
+
Step 13: INSPECT acc_0255 flagged=0/10 suspects=0 steps_left=67
|
| 515 |
+
Step 14: INSPECT acc_0938 flagged=0/10 suspects=0 steps_left=66
|
| 516 |
+
Step 15: INSPECT acc_0672 flagged=0/10 suspects=0 steps_left=65
|
| 517 |
+
Step 16: FLAG acc_0672 flagged=1/10 suspects=6 steps_left=65
|
| 518 |
+
Step 17: INSPECT acc_0659 flagged=1/10 suspects=6 steps_left=64
|
| 519 |
+
Step 18: FLAG acc_0659 flagged=2/10 suspects=5 steps_left=64
|
| 520 |
+
Step 19: INSPECT acc_0290 flagged=2/10 suspects=5 steps_left=63
|
| 521 |
+
Step 20: FLAG acc_0290 flagged=3/10 suspects=5 steps_left=63
|
| 522 |
+
Step 21: INSPECT acc_0339 flagged=3/10 suspects=5 steps_left=62
|
| 523 |
+
Step 22: FLAG acc_0339 flagged=4/10 suspects=6 steps_left=62
|
| 524 |
+
Step 23: INSPECT acc_0544 flagged=4/10 suspects=6 steps_left=61
|
| 525 |
+
Step 24: FLAG acc_0544 flagged=5/10 suspects=5 steps_left=61
|
| 526 |
+
Step 25: INSPECT acc_0696 flagged=5/10 suspects=5 steps_left=60
|
| 527 |
+
Step 26: FLAG acc_0696 flagged=6/10 suspects=4 steps_left=60
|
| 528 |
+
Step 27: INSPECT acc_0541 flagged=6/10 suspects=4 steps_left=59
|
| 529 |
+
Step 28: FLAG acc_0541 flagged=7/10 suspects=3 steps_left=59
|
| 530 |
+
Step 29: INSPECT acc_0793 flagged=7/10 suspects=3 steps_left=58
|
| 531 |
+
Step 30: FLAG acc_0793 flagged=8/10 suspects=2 steps_left=58
|
| 532 |
+
Step 31: INSPECT acc_0214 flagged=8/10 suspects=2 steps_left=57
|
| 533 |
+
Step 32: FLAG acc_0214 flagged=9/10 suspects=1 steps_left=57
|
| 534 |
+
Step 33: INSPECT acc_0112 flagged=9/10 suspects=1 steps_left=56
|
| 535 |
+
Step 34: FLAG acc_0112 flagged=10/10 suspects=0 steps_left=56
|
| 536 |
+
Step 35: SUBMIT flagged=10/10 suspects=0 steps_left=56
|
| 537 |
+
→ Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.76
|
| 538 |
+
★ GRADER SCORE: 0.9700
|
| 539 |
+
|
| 540 |
+
hard: scores=['0.964', '0.000', '0.970'] mean=0.6446 var=0.207740
|
| 541 |
+
|
| 542 |
+
============================================================
|
| 543 |
+
EVALUATION COMPLETE
|
| 544 |
+
============================================================
|
| 545 |
+
ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$
|
runs/metrics.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
server/app.py
CHANGED
|
@@ -11,8 +11,9 @@ sys.path.insert(0, str(Path(__file__).parent))
|
|
| 11 |
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 12 |
|
| 13 |
from fastapi import FastAPI, HTTPException
|
| 14 |
-
from fastapi.responses import HTMLResponse
|
| 15 |
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
| 16 |
from pydantic import BaseModel
|
| 17 |
from typing import Any, Dict, Optional
|
| 18 |
|
|
@@ -20,7 +21,7 @@ from models import FakeGangAction, FakeGangObservation, FakeGangState, ActionTyp
|
|
| 20 |
from environment import FakeGangEnvironment
|
| 21 |
|
| 22 |
# ---------------------------------------------------------------------------
|
| 23 |
-
# App
|
| 24 |
# ---------------------------------------------------------------------------
|
| 25 |
|
| 26 |
app = FastAPI(
|
|
@@ -28,19 +29,19 @@ app = FastAPI(
|
|
| 28 |
description="RL environment for detecting coordinated fake account rings in social networks.",
|
| 29 |
version="1.0.0",
|
| 30 |
)
|
|
|
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
_env = FakeGangEnvironment()
|
| 38 |
|
| 39 |
-
|
| 40 |
-
# ---------------------------------------------------------------------------
|
| 41 |
-
# Schemas
|
| 42 |
-
# ---------------------------------------------------------------------------
|
| 43 |
-
|
| 44 |
class ResetRequest(BaseModel):
|
| 45 |
task: str = "easy"
|
| 46 |
seed: Optional[int] = None
|
|
@@ -52,7 +53,6 @@ class StepResponse(BaseModel):
|
|
| 52 |
reward: Optional[float]
|
| 53 |
message: str
|
| 54 |
|
| 55 |
-
|
| 56 |
# ---------------------------------------------------------------------------
|
| 57 |
# OpenEnv API endpoints
|
| 58 |
# ---------------------------------------------------------------------------
|
|
@@ -100,9 +100,8 @@ def grader():
|
|
| 100 |
@app.get("/metadata")
|
| 101 |
def metadata():
|
| 102 |
return {
|
| 103 |
-
"name": "graphstrike",
|
| 104 |
"description": "RL environment for detecting coordinated fake account rings in social networks.",
|
| 105 |
-
"version": "1.0.0", "author": "Pandago",
|
| 106 |
"tags": ["social-network", "fraud-detection", "graph", "rl"],
|
| 107 |
}
|
| 108 |
|
|
@@ -137,124 +136,662 @@ def baseline():
|
|
| 137 |
return {"scores": scores, "agent": "rule_based"}
|
| 138 |
|
| 139 |
|
| 140 |
-
# HF Spaces probes
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
return """<!DOCTYPE html>
|
| 145 |
-
<html><head><meta http-equiv="refresh" content="0;url=/"><title>GraphStrike</title></head>
|
| 146 |
-
<body><p>Loading <a href="/">GraphStrike</a>...</p></body></html>"""
|
| 147 |
|
| 148 |
|
| 149 |
# ---------------------------------------------------------------------------
|
| 150 |
-
# Gradio
|
| 151 |
# ---------------------------------------------------------------------------
|
| 152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
try:
|
| 154 |
import gradio as gr
|
| 155 |
|
|
|
|
|
|
|
| 156 |
def _fmt_obs(d: dict) -> str:
|
| 157 |
lines = []
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
if d.get('reward') is not None:
|
| 160 |
-
lines.append(f"**Reward:** {d['reward']:.2f}")
|
| 161 |
fl = d.get('flagged_ids', [])
|
| 162 |
-
lines.append(f"**Flagged ({len(fl)}/10):** {
|
| 163 |
-
su
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
| 165 |
lines.append(f"**Visible:** {len(d.get('visible_account_ids',[]))} IDs | **Inspected:** {len(d.get('inspected_ids',[]))} accounts")
|
| 166 |
if d.get('evasion_triggered'):
|
| 167 |
-
lines.append(f"**Evasion events:** {d.get('evasion_count',0)}")
|
| 168 |
-
lines.append(f"
|
| 169 |
return "\n\n".join(lines)
|
| 170 |
|
| 171 |
-
def
|
| 172 |
accs = d.get("visible_accounts", [])
|
| 173 |
if not accs:
|
| 174 |
-
return
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
def gr_reset(task, seed):
|
| 185 |
try:
|
| 186 |
obs = _env.reset(task=task, seed=int(seed))
|
| 187 |
-
d
|
| 188 |
-
return _fmt_obs(d),
|
| 189 |
except Exception as e:
|
| 190 |
-
return f"**Error:** {e}", "", "{}"
|
| 191 |
|
| 192 |
def gr_step(action_type, account_id):
|
| 193 |
try:
|
| 194 |
-
acc
|
| 195 |
action = FakeGangAction(action_type=ActionType(action_type), account_id=acc)
|
| 196 |
-
obs
|
| 197 |
-
d
|
| 198 |
-
return _fmt_obs(d),
|
| 199 |
except Exception as e:
|
| 200 |
-
return f"**Error:** {e}", "", "{}"
|
| 201 |
|
| 202 |
def gr_grader():
|
| 203 |
if not _env._done:
|
| 204 |
-
return "Episode not complete
|
| 205 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
def gr_baseline():
|
| 208 |
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 209 |
from inference import run_rule_based_episode
|
| 210 |
scores = {t: run_rule_based_episode(_env, task=t, seed=0) for t in ["easy", "medium", "hard"]}
|
| 211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
with gr.Blocks(title="GraphStrike") as demo:
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
|
| 248 |
app = gr.mount_gradio_app(app, demo, path="/")
|
| 249 |
print("[GraphStrike] Gradio UI mounted at /", flush=True)
|
| 250 |
|
| 251 |
except Exception as exc:
|
|
|
|
| 252 |
print(f"[GraphStrike] Gradio unavailable: {exc}", flush=True)
|
|
|
|
| 253 |
|
| 254 |
@app.get("/", response_class=HTMLResponse)
|
| 255 |
def root_fallback():
|
| 256 |
-
return "<html><body><h1>GraphStrike</h1><p>API
|
| 257 |
-
|
| 258 |
|
| 259 |
# ---------------------------------------------------------------------------
|
| 260 |
# Entry point
|
|
|
|
| 11 |
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 12 |
|
| 13 |
from fastapi import FastAPI, HTTPException
|
| 14 |
+
from fastapi.responses import HTMLResponse, RedirectResponse
|
| 15 |
from fastapi.middleware.cors import CORSMiddleware
|
| 16 |
+
from fastapi.staticfiles import StaticFiles
|
| 17 |
from pydantic import BaseModel
|
| 18 |
from typing import Any, Dict, Optional
|
| 19 |
|
|
|
|
| 21 |
from environment import FakeGangEnvironment
|
| 22 |
|
| 23 |
# ---------------------------------------------------------------------------
|
| 24 |
+
# App + environment
|
| 25 |
# ---------------------------------------------------------------------------
|
| 26 |
|
| 27 |
app = FastAPI(
|
|
|
|
| 29 |
description="RL environment for detecting coordinated fake account rings in social networks.",
|
| 30 |
version="1.0.0",
|
| 31 |
)
|
| 32 |
+
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
|
| 33 |
|
| 34 |
+
# Serve images at /assets/* and /images/* (used by the Gradio README tab)
|
| 35 |
+
_PROJECT_ROOT = Path(__file__).parent.parent
|
| 36 |
+
_ASSETS_DIR = _PROJECT_ROOT / "assets"
|
| 37 |
+
_IMAGES_DIR = _PROJECT_ROOT / "images"
|
| 38 |
+
if _ASSETS_DIR.exists():
|
| 39 |
+
app.mount("/assets", StaticFiles(directory=str(_ASSETS_DIR)), name="assets")
|
| 40 |
+
if _IMAGES_DIR.exists():
|
| 41 |
+
app.mount("/images", StaticFiles(directory=str(_IMAGES_DIR)), name="images")
|
| 42 |
|
| 43 |
_env = FakeGangEnvironment()
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
class ResetRequest(BaseModel):
|
| 46 |
task: str = "easy"
|
| 47 |
seed: Optional[int] = None
|
|
|
|
| 53 |
reward: Optional[float]
|
| 54 |
message: str
|
| 55 |
|
|
|
|
| 56 |
# ---------------------------------------------------------------------------
|
| 57 |
# OpenEnv API endpoints
|
| 58 |
# ---------------------------------------------------------------------------
|
|
|
|
| 100 |
@app.get("/metadata")
|
| 101 |
def metadata():
|
| 102 |
return {
|
| 103 |
+
"name": "graphstrike", "version": "1.0.0", "author": "Pandago",
|
| 104 |
"description": "RL environment for detecting coordinated fake account rings in social networks.",
|
|
|
|
| 105 |
"tags": ["social-network", "fraud-detection", "graph", "rl"],
|
| 106 |
}
|
| 107 |
|
|
|
|
| 136 |
return {"scores": scores, "agent": "rule_based"}
|
| 137 |
|
| 138 |
|
| 139 |
+
# HF Spaces probes /web — redirect to root (must be on FastAPI before Gradio mount)
|
| 140 |
+
@app.get("/web", response_class=RedirectResponse)
|
| 141 |
+
def web_redirect():
|
| 142 |
+
return RedirectResponse(url="/")
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
|
| 145 |
# ---------------------------------------------------------------------------
|
| 146 |
+
# Gradio UI
|
| 147 |
# ---------------------------------------------------------------------------
|
| 148 |
|
| 149 |
+
import pandas as pd
|
| 150 |
+
|
| 151 |
+
# ── Benchmark data ───────────────────────────────────────────────────────────
|
| 152 |
+
|
| 153 |
+
BENCH_SEED0 = [
|
| 154 |
+
# [Model, Params, Easy, Medium, Hard, Mean] — sorted by Mean desc
|
| 155 |
+
["Llama 4 Scout 17B", "17B", 0.960, 0.979, 0.976, 0.972],
|
| 156 |
+
["Ministral 3 8B", "8B", 0.967, 0.964, 0.964, 0.965],
|
| 157 |
+
["DeepSeek V3.2", "685B", 0.967, 0.960, 0.933, 0.953],
|
| 158 |
+
["Nemotron Super 3", "49B", 0.930, 0.941, 0.964, 0.945],
|
| 159 |
+
["Rule-Based Baseline","—", 0.910, 0.906, 0.904, 0.907],
|
| 160 |
+
["Gemma 3 12B", "12B", 0.900, 0.908, 0.908, 0.905],
|
| 161 |
+
]
|
| 162 |
+
|
| 163 |
+
BENCH_VARIANCE = [
|
| 164 |
+
# [Model, Easy mean, Easy var, Med mean, Med var, Hard mean, Hard var]
|
| 165 |
+
["Llama 4 Scout 17B", 0.960, 0.000007, 0.979, 0.000001, 0.976, 0.000063],
|
| 166 |
+
["Nemotron Super 3", 0.957, 0.000, 0.957, 0.000, 0.645, 0.208],
|
| 167 |
+
["Ministral 3 8B", 0.958, 0.000, 0.645, 0.208, 0.623, 0.195],
|
| 168 |
+
["DeepSeek V3.2", 0.640, 0.205, 0.957, 0.000, 0.645, 0.208],
|
| 169 |
+
["Gemma 3 12B", 0.912, 0.000, 0.917, 0.000, 0.603, 0.182],
|
| 170 |
+
]
|
| 171 |
+
|
| 172 |
+
PROFILE_HEADERS = ["Account", "Status", "Risk", "Node", "Beh", "Graph", "Hub", "Photo", "Bio", "IP", "F.Nbrs"]
|
| 173 |
+
|
| 174 |
+
# Long-format DataFrame for BarPlot
|
| 175 |
+
_bench_long_rows = []
|
| 176 |
+
for _r in BENCH_SEED0:
|
| 177 |
+
_bench_long_rows += [
|
| 178 |
+
{"Model": _r[0], "Task": "Easy", "Score": _r[2]},
|
| 179 |
+
{"Model": _r[0], "Task": "Medium", "Score": _r[3]},
|
| 180 |
+
{"Model": _r[0], "Task": "Hard", "Score": _r[4]},
|
| 181 |
+
]
|
| 182 |
+
BENCH_LONG_DF = pd.DataFrame(_bench_long_rows)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
# ── HTML table builders ──────────────────────────────────────────────────────
|
| 186 |
+
|
| 187 |
+
def _score_color(s: float) -> str:
|
| 188 |
+
if s >= 0.960: return "#22c55e"
|
| 189 |
+
if s >= 0.930: return "#86efac"
|
| 190 |
+
if s >= 0.910: return "#facc15"
|
| 191 |
+
return "#f97316"
|
| 192 |
+
|
| 193 |
+
def _var_color(v: float) -> str:
|
| 194 |
+
if v < 0.001: return "#22c55e"
|
| 195 |
+
if v < 0.05: return "#facc15"
|
| 196 |
+
return "#f87171"
|
| 197 |
+
|
| 198 |
+
_TH = "padding:11px 16px;font-weight:600;white-space:nowrap;"
|
| 199 |
+
_TD = "padding:10px 16px;white-space:nowrap;"
|
| 200 |
+
_TABLE_WRAP = (
|
| 201 |
+
"overflow-x:auto;border-radius:10px;border:1px solid #1e3a5f;"
|
| 202 |
+
"font-family:'IBM Plex Mono',monospace;font-size:13.5px;"
|
| 203 |
+
)
|
| 204 |
+
_THEAD_BG = "background:#0c2340;"
|
| 205 |
+
|
| 206 |
+
def _leaderboard_html() -> str:
|
| 207 |
+
header = (
|
| 208 |
+
f"<thead><tr style='{_THEAD_BG}'>"
|
| 209 |
+
f"<th style='{_TH}color:#64748b;'>#</th>"
|
| 210 |
+
f"<th style='{_TH}color:#e2e8f0;text-align:left;'>Model</th>"
|
| 211 |
+
f"<th style='{_TH}color:#94a3b8;text-align:center;'>Params</th>"
|
| 212 |
+
f"<th style='{_TH}color:#4ade80;text-align:center;'>Easy</th>"
|
| 213 |
+
f"<th style='{_TH}color:#facc15;text-align:center;'>Medium</th>"
|
| 214 |
+
f"<th style='{_TH}color:#f87171;text-align:center;'>Hard</th>"
|
| 215 |
+
f"<th style='{_TH}color:#c084fc;text-align:center;'>Mean</th>"
|
| 216 |
+
f"</tr></thead>"
|
| 217 |
+
)
|
| 218 |
+
rows = ""
|
| 219 |
+
for i, r in enumerate(BENCH_SEED0):
|
| 220 |
+
bg = "#162032" if i % 2 == 0 else "#0f172a"
|
| 221 |
+
is_base = r[0] == "Rule-Based Baseline"
|
| 222 |
+
name_cell = (
|
| 223 |
+
f"{r[0]} <span style='color:#64748b;font-size:11px;'>(baseline)</span>"
|
| 224 |
+
if is_base else r[0]
|
| 225 |
+
)
|
| 226 |
+
name_color = "#94a3b8" if is_base else "#e2e8f0"
|
| 227 |
+
rows += (
|
| 228 |
+
f"<tr style='background:{bg};'>"
|
| 229 |
+
f"<td style='{_TD}color:#475569;text-align:center;'>{i+1}</td>"
|
| 230 |
+
f"<td style='{_TD}color:{name_color};'>{name_cell}</td>"
|
| 231 |
+
f"<td style='{_TD}color:#64748b;text-align:center;'>{r[1]}</td>"
|
| 232 |
+
+ "".join(
|
| 233 |
+
f"<td style='{_TD}color:{_score_color(r[j])};font-weight:700;"
|
| 234 |
+
f"text-align:center;'>{r[j]:.3f}</td>"
|
| 235 |
+
for j in (2, 3, 4)
|
| 236 |
+
)
|
| 237 |
+
+ f"<td style='{_TD}color:{_score_color(r[5])};font-weight:800;"
|
| 238 |
+
f"font-size:14px;text-align:center;'>{r[5]:.3f}</td>"
|
| 239 |
+
f"</tr>"
|
| 240 |
+
)
|
| 241 |
+
return f"<div style='{_TABLE_WRAP}'><table style='width:100%;border-collapse:collapse;'>{header}<tbody>{rows}</tbody></table></div>"
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def _variance_html() -> str:
|
| 245 |
+
header = (
|
| 246 |
+
f"<thead><tr style='{_THEAD_BG}'>"
|
| 247 |
+
f"<th style='{_TH}color:#e2e8f0;text-align:left;'>Model</th>"
|
| 248 |
+
f"<th style='{_TH}color:#4ade80;text-align:center;'>Easy — mean / var</th>"
|
| 249 |
+
f"<th style='{_TH}color:#facc15;text-align:center;'>Medium — mean / var</th>"
|
| 250 |
+
f"<th style='{_TH}color:#f87171;text-align:center;'>Hard — mean / var</th>"
|
| 251 |
+
f"</tr></thead>"
|
| 252 |
+
)
|
| 253 |
+
rows = ""
|
| 254 |
+
for i, r in enumerate(BENCH_VARIANCE):
|
| 255 |
+
bg = "#162032" if i % 2 == 0 else "#0f172a"
|
| 256 |
+
def cell(mean, var):
|
| 257 |
+
return (
|
| 258 |
+
f"<td style='{_TD}text-align:center;'>"
|
| 259 |
+
f"<span style='color:#e2e8f0;font-weight:600;'>{mean:.3f}</span>"
|
| 260 |
+
f" <span style='color:{_var_color(var)};font-size:11px;'>/ {var:.1e}</span>"
|
| 261 |
+
f"</td>"
|
| 262 |
+
)
|
| 263 |
+
rows += (
|
| 264 |
+
f"<tr style='background:{bg};'>"
|
| 265 |
+
f"<td style='{_TD}color:#e2e8f0;font-weight:500;'>{r[0]}</td>"
|
| 266 |
+
+ cell(r[1], r[2]) + cell(r[3], r[4]) + cell(r[5], r[6])
|
| 267 |
+
+ "</tr>"
|
| 268 |
+
)
|
| 269 |
+
return f"<div style='{_TABLE_WRAP};margin-top:20px;'><table style='width:100%;border-collapse:collapse;'>{header}<tbody>{rows}</tbody></table></div>"
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def _baseline_html() -> str:
    """Render the rule-based baseline results as an HTML table fragment.

    The figures are hard-coded in this function: one entry per task holding
    the label, the seed-0 score, the win-rate text and the accent colour
    used for that row.

    Returns:
        A ``<div>``-wrapped ``<table>`` string with zebra-striped rows.
    """
    # (task label, score at seed 0, win rate over 50 seeds, accent colour)
    baseline = (
        ("Easy", 0.9100, "100%", "#4ade80"),
        ("Medium", 0.9060, "84%", "#facc15"),
        ("Hard", 0.9038, "52%", "#f87171"),
    )

    header = "".join((
        f"<thead><tr style='{_THEAD_BG}'>",
        f"<th style='{_TH}color:#e2e8f0;'>Task</th>",
        f"<th style='{_TH}color:#e2e8f0;text-align:center;'>Score (seed=0)</th>",
        f"<th style='{_TH}color:#e2e8f0;text-align:center;'>Win Rate (50 seeds)</th>",
        f"</tr></thead>",
    ))

    rendered = []
    for idx, (label, value, win_rate, accent) in enumerate(baseline):
        # Even rows get the lighter stripe, odd rows the darker one.
        stripe = "#0f172a" if idx % 2 else "#162032"
        rendered.append(
            f"<tr style='background:{stripe};'>"
            f"<td style='{_TD}color:{accent};font-weight:600;'>{label}</td>"
            f"<td style='{_TD}color:#e2e8f0;font-weight:700;text-align:center;'>{value:.4f}</td>"
            f"<td style='{_TD}color:{accent};font-weight:600;text-align:center;'>{win_rate}</td>"
            f"</tr>"
        )
    rows = "".join(rendered)

    return f"<div style='{_TABLE_WRAP};margin-top:4px;'><table style='width:100%;border-collapse:collapse;'>{header}<tbody>{rows}</tbody></table></div>"
|
| 296 |
+
|
| 297 |
+
|
| 298 |
try:
|
| 299 |
import gradio as gr
|
| 300 |
|
| 301 |
+
# ── Observation / profile helpers ─────────────────────────────────────────
|
| 302 |
+
|
| 303 |
def _fmt_obs(d: dict) -> str:
|
| 304 |
lines = []
|
| 305 |
+
task = d.get('task', '?').upper()
|
| 306 |
+
done = d.get('done', False)
|
| 307 |
+
steps = d.get('steps_remaining', '?')
|
| 308 |
+
state_label = "Done" if done else "In Progress"
|
| 309 |
+
lines.append(f"### Task: **{task}** | Steps remaining: **{steps}** | {state_label}")
|
| 310 |
if d.get('reward') is not None:
|
| 311 |
+
lines.append(f"**Final Reward:** `{d['reward']:.2f}`")
|
| 312 |
fl = d.get('flagged_ids', [])
|
| 313 |
+
lines.append(f"**Flagged ({len(fl)}/10):** " + (" ".join(f"`{f}`" for f in fl) if fl else "*none*"))
|
| 314 |
+
su = d.get('suspect_ids', [])
|
| 315 |
+
ins = set(d.get('inspected_ids', []))
|
| 316 |
+
uninspected_sus = [s for s in su if s not in ins]
|
| 317 |
+
if uninspected_sus:
|
| 318 |
+
lines.append(f"**Suspects — uninspected ({len(uninspected_sus)}):** " + " ".join(f"`{s}`" for s in uninspected_sus))
|
| 319 |
lines.append(f"**Visible:** {len(d.get('visible_account_ids',[]))} IDs | **Inspected:** {len(d.get('inspected_ids',[]))} accounts")
|
| 320 |
if d.get('evasion_triggered'):
|
| 321 |
+
lines.append(f"**Evasion events fired:** {d.get('evasion_count', 0)}")
|
| 322 |
+
lines.append(f"\n> {d.get('message', '')}")
|
| 323 |
return "\n\n".join(lines)
|
| 324 |
|
| 325 |
+
def _profile_rows(d: dict) -> list:
|
| 326 |
accs = d.get("visible_accounts", [])
|
| 327 |
if not accs:
|
| 328 |
+
return []
|
| 329 |
+
STATUS_MAP = {
|
| 330 |
+
"confirmed_fake": "confirmed_fake [flagged]",
|
| 331 |
+
"suspect": "suspect",
|
| 332 |
+
"normal": "normal",
|
| 333 |
+
}
|
| 334 |
+
rows = []
|
| 335 |
+
for a in sorted(accs, key=lambda x: x.get("fake_risk_score", 0), reverse=True)[:40]:
|
| 336 |
+
rows.append([
|
| 337 |
+
a.get("account_id", ""),
|
| 338 |
+
STATUS_MAP.get(a.get("status", ""), a.get("status", "")),
|
| 339 |
+
round(a.get("fake_risk_score", 0), 3),
|
| 340 |
+
round(a.get("node_risk", 0), 3),
|
| 341 |
+
round(a.get("behavior_risk", 0), 3),
|
| 342 |
+
round(a.get("graph_risk", 0), 3),
|
| 343 |
+
round(a.get("hub_legitimacy_score", 0), 3),
|
| 344 |
+
round(a.get("photo_reuse_score", 0), 3),
|
| 345 |
+
round(a.get("bio_template_score", 0), 3),
|
| 346 |
+
a.get("shared_ip_count", 0),
|
| 347 |
+
a.get("flagged_neighbor_count", 0),
|
| 348 |
+
])
|
| 349 |
+
return rows
|
| 350 |
+
|
| 351 |
+
def _fmt_visible_ids(d: dict) -> str:
|
| 352 |
+
ins = set(d.get('inspected_ids', []))
|
| 353 |
+
suspects = set(d.get('suspect_ids', []))
|
| 354 |
+
flagged = set(d.get('flagged_ids', []))
|
| 355 |
+
visible = d.get('visible_account_ids', [])
|
| 356 |
+
if not visible:
|
| 357 |
+
return "*No visible accounts yet.*"
|
| 358 |
+
parts = []
|
| 359 |
+
for vid in visible:
|
| 360 |
+
if vid in flagged:
|
| 361 |
+
parts.append(f"**[F]** `{vid}`")
|
| 362 |
+
elif vid in suspects and vid not in ins:
|
| 363 |
+
parts.append(f"**[S]** `{vid}`")
|
| 364 |
+
elif vid in ins:
|
| 365 |
+
parts.append(f"`{vid}`")
|
| 366 |
+
else:
|
| 367 |
+
parts.append(f"`{vid}`")
|
| 368 |
+
return " ".join(parts)
|
| 369 |
+
|
| 370 |
+
# ── Playground callbacks ──────────────────────────────────────────────────
|
| 371 |
|
| 372 |
def gr_reset(task, seed):
    """Start a new episode on the shared environment and render its first observation.

    Args:
        task: Task name forwarded to ``_env.reset``.
        seed: Episode seed; coerced with ``int()`` before use.

    Returns:
        A 4-tuple ``(observation markdown, profile rows, visible-id markdown,
        raw JSON text)``. If anything raises, the first element is an error
        message and the remaining three are empty placeholders.
    """
    try:
        snapshot = _env.reset(task=task, seed=int(seed)).model_dump()
        # default=str stringifies any value json cannot serialise natively.
        raw = json.dumps(snapshot, indent=2, default=str)
        return _fmt_obs(snapshot), _profile_rows(snapshot), _fmt_visible_ids(snapshot), raw
    except Exception as e:
        return f"**Error:** {e}", [], "", "{}"
|
| 379 |
|
| 380 |
def gr_step(action_type, account_id):
    """Apply one action to the shared environment and render the resulting observation.

    Args:
        action_type: Action name; converted with ``ActionType(action_type)``.
        account_id: Target account id text. Surrounding whitespace is
            stripped, and it is ignored entirely when the action is
            ``"submit"``.

    Returns:
        A 4-tuple ``(observation markdown, profile rows, visible-id markdown,
        raw JSON text)``. If anything raises, the first element is an error
        message and the remaining three are empty placeholders.
    """
    try:
        if action_type == "submit":
            target = None
        else:
            target = account_id.strip()
        action = FakeGangAction(action_type=ActionType(action_type), account_id=target)
        snapshot = _env.step(action).model_dump()
        # default=str stringifies any value json cannot serialise natively.
        raw = json.dumps(snapshot, indent=2, default=str)
        return _fmt_obs(snapshot), _profile_rows(snapshot), _fmt_visible_ids(snapshot), raw
    except Exception as e:
        return f"**Error:** {e}", [], "", "{}"
|
| 389 |
|
| 390 |
def gr_grader():
    """Report the grader score of the shared environment's finished episode.

    Returns:
        Markdown with the last grader score, the task and the episode id, or
        a reminder to submit first while the environment's ``_done`` flag is
        still false.
    """
    if _env._done:
        score_part = f"**Score:** `{_env._last_grader_score:.4f}` | "
        task_part = f"**Task:** {_env._task} | "
        episode_part = f"**Episode:** `{_env._episode_id}`"
        return score_part + task_part + episode_part
    return "Episode not complete — call SUBMIT first."
|
| 398 |
|
| 399 |
def gr_baseline():
    """Run the rule-based baseline agent on every task at seed 0 and summarise the scores.

    The repository root (two directories above this file) is added to
    ``sys.path`` so the top-level ``inference`` module can be imported.

    Returns:
        Markdown with the per-task scores and their mean, or an
        ``**Error:** ...`` message if the import or any episode fails.
    """
    try:
        # Insert the repo root only once: the original inserted it on every
        # click, growing sys.path with duplicate entries.
        repo_root = str(Path(__file__).parent.parent)
        if repo_root not in sys.path:
            sys.path.insert(0, repo_root)
        from inference import run_rule_based_episode

        # NOTE(review): the episodes run on the shared `_env`, so this
        # presumably replaces any in-progress Playground episode — confirm.
        scores = {
            t: run_rule_based_episode(_env, task=t, seed=0)
            for t in ("easy", "medium", "hard")
        }
        # Divide by the number of tasks actually run rather than a literal 3.
        mean = sum(scores.values()) / len(scores)
        return (
            f"**Baseline (rule-based, seed=0)**\n\n"
            f"Easy: `{scores['easy']:.4f}` | Medium: `{scores['medium']:.4f}` | "
            f"Hard: `{scores['hard']:.4f}` | Mean: `{mean:.4f}`"
        )
    except Exception as e:
        # Same error-reporting convention as gr_reset / gr_step.
        return f"**Error:** {e}"
|
| 409 |
+
|
| 410 |
+
# ── Build Gradio UI ───────────────────────────────────────────────────────
|
| 411 |
+
|
| 412 |
+
# ── README content (rendered as styled HTML) ─────────────────────────────
|
| 413 |
+
|
| 414 |
+
_README_HTML = """
|
| 415 |
+
<style>
|
| 416 |
+
.gs-readme { font-family: 'Inter', system-ui, sans-serif; color: #cbd5e1; line-height: 1.7; max-width: 960px; margin: 0 auto; padding: 8px 4px 32px; }
|
| 417 |
+
.gs-readme h2 { color: #e2e8f0; font-size: 1.12em; font-weight: 700; border-bottom: 1px solid #1e3a5f; padding-bottom: 8px; margin: 32px 0 14px; letter-spacing: -0.2px; }
|
| 418 |
+
.gs-readme h3 { color: #7dd3fc; font-size: 0.97em; font-weight: 600; margin: 20px 0 8px; }
|
| 419 |
+
.gs-readme p { margin: 0 0 10px; font-size: 0.92em; }
|
| 420 |
+
.gs-readme code { background: #0c2340; color: #7dd3fc; padding: 2px 7px; border-radius: 4px; font-family: 'IBM Plex Mono', monospace; font-size: 0.84em; }
|
| 421 |
+
.gs-readme pre { background: #0a1628; border: 1px solid #1e3a5f; border-radius: 8px; padding: 14px 18px; overflow-x: auto; margin: 10px 0 16px; }
|
| 422 |
+
.gs-readme pre code { background: none; padding: 0; color: #93c5fd; font-size: 0.82em; }
|
| 423 |
+
.gs-table { width: 100%; border-collapse: collapse; margin: 10px 0 18px; font-size: 0.86em; }
|
| 424 |
+
.gs-table th { background: #0c2340; color: #94a3b8; font-weight: 600; padding: 9px 14px; text-align: left; border-bottom: 1px solid #1e3a5f; }
|
| 425 |
+
.gs-table td { padding: 8px 14px; border-bottom: 1px solid #0f1e30; color: #cbd5e1; }
|
| 426 |
+
.gs-table tr:nth-child(even) td { background: #060e1a; }
|
| 427 |
+
.gs-badge { display:inline-block; padding: 2px 9px; border-radius: 4px; font-size: 0.78em; font-weight: 700; }
|
| 428 |
+
.gs-badge-easy { background:#052e16; color:#4ade80; border:1px solid #166534; }
|
| 429 |
+
.gs-badge-medium { background:#2d1f00; color:#facc15; border:1px solid #92400e; }
|
| 430 |
+
.gs-badge-hard { background:#2d0a0a; color:#f87171; border:1px solid #7f1d1d; }
|
| 431 |
+
.gs-card { background: #0a1628; border: 1px solid #1e3a5f; border-radius: 10px; padding: 16px 20px; margin: 10px 0; }
|
| 432 |
+
.gs-card h3 { margin-top: 0; }
|
| 433 |
+
.gs-formula { background: #050d18; border-left: 3px solid #3b82f6; padding: 12px 18px; border-radius: 0 8px 8px 0; margin: 12px 0; font-family: 'IBM Plex Mono', monospace; font-size: 0.83em; color: #93c5fd; white-space: pre; overflow-x: auto; }
|
| 434 |
+
.gs-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 12px; margin: 14px 0; }
|
| 435 |
+
.gs-stat { background: #0a1628; border: 1px solid #1e3a5f; border-radius: 8px; padding: 14px 16px; text-align: center; }
|
| 436 |
+
.gs-stat-val { font-size: 1.7em; font-weight: 800; color: #38bdf8; font-family: 'IBM Plex Mono', monospace; display: block; }
|
| 437 |
+
.gs-stat-lbl { font-size: 0.77em; color: #64748b; margin-top: 4px; display: block; }
|
| 438 |
+
.gs-img { width: 100%; border-radius: 10px; border: 1px solid #1e3a5f; margin: 14px 0; display: block; background: #0a1628; }
|
| 439 |
+
.gs-img-pair { display: grid; grid-template-columns: 1fr 1fr; gap: 12px; margin: 14px 0; }
|
| 440 |
+
.gs-img-caption { font-size: 0.78em; color: #475569; text-align: center; margin-top: -8px; margin-bottom: 12px; font-style: italic; }
|
| 441 |
+
.gs-divider { border: none; border-top: 1px solid #0f1e30; margin: 28px 0; }
|
| 442 |
+
</style>
|
| 443 |
+
|
| 444 |
+
<div class="gs-readme">
|
| 445 |
+
|
| 446 |
+
<!-- OVERVIEW -->
|
| 447 |
+
<div class="gs-card" style="border-color:#2563eb;margin-bottom:20px;border-width:1px 1px 1px 3px;">
|
| 448 |
+
<h3 style="color:#7dd3fc;font-size:1.05em;">What is GraphStrike?</h3>
|
| 449 |
+
<p>An <strong style="color:#e2e8f0;">OpenEnv-compatible</strong> reinforcement learning environment where an LLM agent
|
| 450 |
+
must identify all 10 members of a coordinated fake account ring hidden inside a synthetic social network.
|
| 451 |
+
The agent learns via <strong>Reflexion</strong> and a <strong>dynamic hybrid rule/LLM policy</strong> — no gradient
|
| 452 |
+
updates, no fine-tuning required.</p>
|
| 453 |
+
<p style="margin:0;">Submitted to the <strong style="color:#e2e8f0;">OpenEnv Hackathon × SCALER School of Technology</strong>.
|
| 454 |
+
Judges deploy this container, run their own LLM agent against it, and score on task quality, environment design,
|
| 455 |
+
code quality, creativity, and domain quality.</p>
|
| 456 |
+
</div>
|
| 457 |
+
|
| 458 |
+
<!-- KEY STATS -->
|
| 459 |
+
<div class="gs-grid">
|
| 460 |
+
<div class="gs-stat"><span class="gs-stat-val">10</span><span class="gs-stat-lbl">Gang members to find per episode</span></div>
|
| 461 |
+
<div class="gs-stat"><span class="gs-stat-val">3</span><span class="gs-stat-lbl">Difficulty tiers (easy / medium / hard)</span></div>
|
| 462 |
+
<div class="gs-stat"><span class="gs-stat-val">150</span><span class="gs-stat-lbl">Pre-generated episodes (50 per task)</span></div>
|
| 463 |
+
<div class="gs-stat"><span class="gs-stat-val">24</span><span class="gs-stat-lbl">Automated validator checks</span></div>
|
| 464 |
+
</div>
|
| 465 |
+
|
| 466 |
+
<!-- SYSTEM ARCHITECTURE -->
|
| 467 |
+
<h2>System Architecture</h2>
|
| 468 |
+
<img src="/assets/sys arch.png" class="gs-img" alt="System Architecture" onerror="this.style.display='none'">
|
| 469 |
+
<p class="gs-img-caption">End-to-end pipeline: episode generation → environment server → hybrid agent → reflexion memory</p>
|
| 470 |
+
|
| 471 |
+
<!-- DIFFICULTY -->
|
| 472 |
+
<h2>Task Difficulty Tiers</h2>
|
| 473 |
+
<table class="gs-table">
|
| 474 |
+
<tr><th>Task</th><th>Network Size</th><th>Gang</th><th>Decoys</th><th>Max Steps</th><th>Win Condition</th><th>Baseline Score</th></tr>
|
| 475 |
+
<tr><td><span class="gs-badge gs-badge-easy">Easy</span></td><td>50 accounts</td><td>10</td><td>0</td><td>30</td><td>Recall ≥ 0.8, Precision ≥ 0.7</td><td>0.910</td></tr>
|
| 476 |
+
<tr><td><span class="gs-badge gs-badge-medium">Medium</span></td><td>200 accounts</td><td>10</td><td>20</td><td>50</td><td>Recall ≥ 0.8, Precision ≥ 0.7</td><td>0.906</td></tr>
|
| 477 |
+
<tr><td><span class="gs-badge gs-badge-hard">Hard</span></td><td>1000 accounts</td><td>10</td><td>50</td><td>80</td><td>Recall ≥ 0.9, Precision ≥ 0.8</td><td>0.904</td></tr>
|
| 478 |
+
</table>
|
| 479 |
+
<p style="font-size:0.84em;color:#64748b;margin-top:-8px;">Hard mode fires 4 evasion events (steps 15, 30, 45, 60) that drop intra-gang follow edges mid-investigation, destroying graph signals.</p>
|
| 480 |
+
|
| 481 |
+
<hr class="gs-divider">
|
| 482 |
+
|
| 483 |
+
<!-- DETECTION SIGNALS -->
|
| 484 |
+
<h2>Detection Signal Hierarchy</h2>
|
| 485 |
+
<img src="/assets/gs.png" class="gs-img" alt="Signal Hierarchy" onerror="this.style.display='none'">
|
| 486 |
+
<p class="gs-img-caption">Node signals (offline) → Behavioral signals (temporal/device) → Graph signals (live at INSPECT) → False-positive control via hub legitimacy</p>
|
| 487 |
+
|
| 488 |
+
<h3>Node Signals (pre-computed offline)</h3>
|
| 489 |
+
<table class="gs-table">
|
| 490 |
+
<tr><th>Feature</th><th>Fake Range</th><th>Real Range</th><th>What it measures</th></tr>
|
| 491 |
+
<tr><td><code>photo_reuse_score</code></td><td>0.30 – 0.95</td><td>0.00 – 0.15</td><td>Stolen celebrity photos via pHash fingerprint matching</td></tr>
|
| 492 |
+
<tr><td><code>bio_template_score</code></td><td>0.20 – 0.90</td><td>0.00 – 0.12</td><td>Cosine similarity to known fake bio templates</td></tr>
|
| 493 |
+
<tr><td><code>comment_repeat_score</code></td><td>0.60 – 0.90</td><td>0.00 – 0.08</td><td>Fraction of copy-pasted spam comments across accounts</td></tr>
|
| 494 |
+
</table>
|
| 495 |
+
|
| 496 |
+
<h3>Behavioral Signals (temporal + device)</h3>
|
| 497 |
+
<table class="gs-table">
|
| 498 |
+
<tr><th>Feature</th><th>Fake Pattern</th></tr>
|
| 499 |
+
<tr><td><code>avg_post_hour</code></td><td>All 10 gang members post within ±0.5h of each other (coordinated scheduling)</td></tr>
|
| 500 |
+
<tr><td><code>account_age_days</code></td><td>Created same week — base_age ± 7 days</td></tr>
|
| 501 |
+
<tr><td><code>shared_ip_count</code></td><td>= 9 for all gang members (one IP subnet per episode, unique seed)</td></tr>
|
| 502 |
+
</table>
|
| 503 |
+
|
| 504 |
+
<h3>Graph Signals (computed live at INSPECT)</h3>
|
| 505 |
+
<table class="gs-table">
|
| 506 |
+
<tr><th>Feature</th><th>Fake Pattern</th></tr>
|
| 507 |
+
<tr><td><code>mutual_follow_rate</code></td><td>0.6 – 0.9 (dense intra-gang mutual follows)</td></tr>
|
| 508 |
+
<tr><td><code>flagged_neighbor_count</code></td><td>Grows as investigation proceeds — strongest late-game signal</td></tr>
|
| 509 |
+
<tr><td><code>avg_neighbor_photo_reuse</code></td><td>High when cluster shares stolen content</td></tr>
|
| 510 |
+
</table>
|
| 511 |
+
|
| 512 |
+
<hr class="gs-divider">
|
| 513 |
+
|
| 514 |
+
<!-- EPISODE FLOW -->
|
| 515 |
+
<h2>Episode Lifecycle & Action Mechanics</h2>
|
| 516 |
+
<img src="/assets/episode.png" class="gs-img" alt="Episode Flow" onerror="this.style.display='none'">
|
| 517 |
+
<p class="gs-img-caption">Episode flow: reset → inspect/flag/investigate loop → dual SUSPECT cascade → submit → grader score</p>
|
| 518 |
+
|
| 519 |
+
<h3>Action Space</h3>
|
| 520 |
+
<table class="gs-table">
|
| 521 |
+
<tr><th>Action</th><th>Step Cost</th><th>Effect</th></tr>
|
| 522 |
+
<tr><td><code>INSPECT acc_XXXX</code></td><td>1 step</td><td>Reveals full AccountProfile + follow list; adds 1-hop neighbors to visible set</td></tr>
|
| 523 |
+
<tr><td><code>INVESTIGATE_NETWORK acc_XXXX</code></td><td>2 steps</td><td>Bidirectional 2-hop expansion (outgoing + incoming edges); re-cascades SUSPECT</td></tr>
|
| 524 |
+
<tr><td><code>FLAG acc_XXXX</code></td><td>FREE</td><td>Marks as fake; triggers dual SUSPECT cascade (follow-graph + IP cluster)</td></tr>
|
| 525 |
+
<tr><td><code>UNFLAG acc_XXXX</code></td><td>FREE</td><td>Removes flag; clears CONFIRMED_FAKE status</td></tr>
|
| 526 |
+
<tr><td><code>SUBMIT</code></td><td>FREE</td><td>Ends episode; triggers grader scoring</td></tr>
|
| 527 |
+
</table>
|
| 528 |
+
|
| 529 |
+
<h3>Dual SUSPECT Cascade (triggered by FLAG)</h3>
|
| 530 |
+
<div style="display:grid;grid-template-columns:1fr 1fr;gap:12px;margin:10px 0;">
|
| 531 |
+
<div class="gs-card">
|
| 532 |
+
<h3 style="color:#4ade80;margin-top:0;">Cascade 1 — Follow-Graph</h3>
|
| 533 |
+
<p style="margin:0;font-size:0.88em;">Every account the flagged member <em>follows</em> (<code>_live_edges</code>) becomes SUSPECT if visible and NORMAL. Gang follow density is 0.70+ so this is high-precision.</p>
|
| 534 |
+
</div>
|
| 535 |
+
<div class="gs-card">
|
| 536 |
+
<h3 style="color:#facc15;margin-top:0;">Cascade 2 — IP Cluster</h3>
|
| 537 |
+
<p style="margin:0;font-size:0.88em;">Every visible account sharing the same <code>ip_cluster_id</code> becomes SUSPECT. Gang shares <code>ip_gang_<seed></code>; real accounts have unique IPs. <strong>Zero false positives.</strong></p>
|
| 538 |
+
</div>
|
| 539 |
+
</div>
|
| 540 |
+
|
| 541 |
+
<hr class="gs-divider">
|
| 542 |
+
|
| 543 |
+
<!-- RISK SCORING -->
|
| 544 |
+
<h2>Risk Scoring Mathematics</h2>
|
| 545 |
+
<img src="/images/big.png" class="gs-img" alt="Risk Scoring Overview" onerror="this.style.display='none'">
|
| 546 |
+
<p class="gs-img-caption">All scoring functions are stateless and deterministic — called inside _build_profile() at every INSPECT</p>
|
| 547 |
+
|
| 548 |
+
<div class="gs-img-pair">
|
| 549 |
+
<div>
|
| 550 |
+
<img src="/assets/formulas-1.png" class="gs-img" alt="Risk Formulas Part 1" onerror="this.style.display='none'">
|
| 551 |
+
<p class="gs-img-caption">Node risk, Behavior risk, Graph risk components</p>
|
| 552 |
+
</div>
|
| 553 |
+
<div>
|
| 554 |
+
<img src="/assets/formulas-2.png" class="gs-img" alt="Risk Formulas Part 2" onerror="this.style.display='none'">
|
| 555 |
+
<p class="gs-img-caption">Hub legitimacy, Composite fake_risk_score formula</p>
|
| 556 |
+
</div>
|
| 557 |
+
</div>
|
| 558 |
+
|
| 559 |
+
<div class="gs-formula">fake_risk = clip(
|
| 560 |
+
0.30 × node_risk ← content signals (photo reuse, bio templates)
|
| 561 |
+
+ 0.25 × behavior_risk ← temporal + age clustering
|
| 562 |
+
+ 0.45 × graph_risk ← structural coordination (highest weight — hardest to fake)
|
| 563 |
+
− 0.25 × hub_legitimacy, ← subtractive: celebrities score ≈ 0 before clip
|
| 564 |
+
0.0, 1.0)</div>
|
| 565 |
+
|
| 566 |
+
<h3>Grader Score Formula</h3>
|
| 567 |
+
<div class="gs-formula">recall = tp / 10
|
| 568 |
+
precision = tp / max(tp + fp, 1)
|
| 569 |
+
efficiency = max(0, (max_steps − steps_used) / max_steps)
|
| 570 |
+
|
| 571 |
+
if recall ≥ 0.8 and precision ≥ 0.7:
|
| 572 |
+
score = 0.55 + 0.20×recall + 0.15×precision + 0.10×efficiency
|
| 573 |
+
else:
|
| 574 |
+
score = 0.30×recall + 0.10×precision
|
| 575 |
+
|
| 576 |
+
# Maximum possible: 1.00 | Win threshold: ~0.815</div>
|
| 577 |
+
|
| 578 |
+
<hr class="gs-divider">
|
| 579 |
+
|
| 580 |
+
<!-- REFLEXION -->
|
| 581 |
+
<h2>Reflexion Learning</h2>
|
| 582 |
+
<img src="/assets/reflexion.png" class="gs-img" alt="Reflexion Learning Loop" onerror="this.style.display='none'">
|
| 583 |
+
<p class="gs-img-caption">Post-episode lessons injected into every future prompt — learning without weight updates</p>
|
| 584 |
+
|
| 585 |
+
<p>The LLM (Qwen3-80B via AWS Bedrock) cannot be fine-tuned — it is a black-box API.
|
| 586 |
+
Instead, a separate Qwen3 call generates a 2–3 sentence lesson after each episode.
|
| 587 |
+
The best winning trajectory is stored as a few-shot example injected into all future prompts.</p>
|
| 588 |
+
|
| 589 |
+
<pre><code>Episode N:
|
| 590 |
+
LLM acts using: system_prompt + reflections[last 4] + best_trajectory
|
| 591 |
+
Episode ends → WIN or LOSS
|
| 592 |
+
LOSS → generate_reflection(action_log, outcome) → lesson stored
|
| 593 |
+
WIN → save trajectory if better reward + generate_success_reflection
|
| 594 |
+
|
| 595 |
+
Episode N+1:
|
| 596 |
+
last 4 reflections + best win trajectory injected into prompt
|
| 597 |
+
→ LLM has learned from its past without any weight updates</code></pre>
|
| 598 |
+
|
| 599 |
+
<hr class="gs-divider">
|
| 600 |
+
|
| 601 |
+
<!-- HYBRID POLICY -->
|
| 602 |
+
<h2>Hybrid Policy — The Novel Contribution</h2>
|
| 603 |
+
<img src="/assets/hybrid.png" class="gs-img" alt="Hybrid Policy Architecture" onerror="this.style.display='none'">
|
| 604 |
+
<p class="gs-img-caption">Dynamic alpha-weighted blend: rules dominate early, LLM earns trust through wins and reflections</p>
|
| 605 |
+
|
| 606 |
+
<p>A <strong>dynamic α-weighted blend</strong> of a deterministic rule engine and the LLM. α represents trust in the LLM —
|
| 607 |
+
starts at 0.20 (rules dominate), climbs as the LLM wins consistently and accumulates reflections, capped per task
|
| 608 |
+
to prevent the LLM from overriding correct high-confidence rule decisions.</p>
|
| 609 |
+
|
| 610 |
+
<div class="gs-formula">reflection_factor = min(1.0, n_reflections / 4.0)
|
| 611 |
+
raw = 0.20 + reflection_factor × (0.80 × recent_win_rate + 0.12)
|
| 612 |
+
alpha = clamp(raw, 0.20, task_cap)
|
| 613 |
+
|
| 614 |
+
Per-task caps: easy → 0.50 | medium → 0.70 | hard → 0.85</div>
|
| 615 |
+
|
| 616 |
+
<img src="/images/plot.png" class="gs-img" alt="Alpha progression over training" onerror="this.style.display='none'">
|
| 617 |
+
<p class="gs-img-caption">Alpha progression: rule-dominated early training → LLM earns authority through wins</p>
|
| 618 |
+
|
| 619 |
+
<h3>Rule Confidence Levels</h3>
|
| 620 |
+
<table class="gs-table">
|
| 621 |
+
<tr><th>Situation</th><th>Rule Action</th><th>Confidence</th></tr>
|
| 622 |
+
<tr><td>Steps remaining = 0</td><td>SUBMIT</td><td>1.00</td></tr>
|
| 623 |
+
<tr><td>Uninspected SUSPECT accounts exist</td><td>INSPECT suspects[0]</td><td>0.95</td></tr>
|
| 624 |
+
<tr><td><code>fake_risk ≥ 0.85</code></td><td>FLAG that account</td><td>0.95</td></tr>
|
| 625 |
+
<tr><td><code>fake_risk</code> in [threshold, 0.85)</td><td>FLAG that account</td><td>0.70 – 0.94</td></tr>
|
| 626 |
+
<tr><td>10 flags placed</td><td>SUBMIT</td><td>0.85</td></tr>
|
| 627 |
+
<tr><td>Steps remaining ≤ 3</td><td>SUBMIT</td><td>0.90</td></tr>
|
| 628 |
+
<tr><td>Uninspected accounts available</td><td>INSPECT top candidate</td><td>0.30</td></tr>
|
| 629 |
+
</table>
|
| 630 |
+
<p style="font-size:0.85em;color:#64748b;">When <code>rule_confidence ≥ alpha</code> the rule engine overrides. At easy cap (0.50), the LLM controls only exploratory INSPECT decisions. At hard cap (0.85), the LLM controls most decisions except forced submits and suspect cascade.</p>
|
| 631 |
+
|
| 632 |
+
</div>
|
| 633 |
+
"""
|
| 634 |
+
|
| 635 |
+
_HEADER_HTML = """
|
| 636 |
+
<style>
|
| 637 |
+
.gr-dataframe th { background:#0c2340!important;color:#94a3b8!important;font-weight:700!important;font-size:12px!important;padding:10px 12px!important;border-bottom:1px solid #1e3a5f!important; }
|
| 638 |
+
.gr-dataframe td { font-size:12.5px!important;padding:8px 12px!important; }
|
| 639 |
+
</style>
|
| 640 |
+
<div style="background:linear-gradient(135deg,#050d1a 0%,#0b1f3a 50%,#060f1e 100%);
|
| 641 |
+
padding:24px 32px 20px;border-radius:12px;
|
| 642 |
+
border:1px solid #1e3a5f;margin-bottom:2px;
|
| 643 |
+
box-shadow:0 4px 24px rgba(0,0,0,0.5);">
|
| 644 |
+
<div style="display:flex;align-items:center;gap:16px;margin-bottom:8px;">
|
| 645 |
+
<div>
|
| 646 |
+
<h1 style="color:#e2e8f0;margin:0;font-size:1.9em;font-weight:800;letter-spacing:-0.5px;
|
| 647 |
+
font-family:'Inter',system-ui,sans-serif;">GraphStrike</h1>
|
| 648 |
+
<p style="color:#475569;margin:3px 0 0;font-size:0.88em;letter-spacing:0.3px;font-family:'IBM Plex Mono',monospace;">
|
| 649 |
+
COORDINATED FAKE ACCOUNT RING DETECTION — OPENENV RL ENVIRONMENT
|
| 650 |
+
</p>
|
| 651 |
+
</div>
|
| 652 |
+
</div>
|
| 653 |
+
<div style="display:flex;gap:10px;flex-wrap:wrap;margin-top:12px;">
|
| 654 |
+
<span style="background:#052e16;color:#4ade80;padding:3px 10px;border-radius:20px;font-size:0.78em;font-weight:600;border:1px solid #166534;">OpenEnv Hackathon</span>
|
| 655 |
+
<span style="background:#0c1a2e;color:#7dd3fc;padding:3px 10px;border-radius:20px;font-size:0.78em;font-weight:600;border:1px solid #1e40af;">Reinforcement Learning</span>
|
| 656 |
+
<span style="background:#1c0533;color:#c084fc;padding:3px 10px;border-radius:20px;font-size:0.78em;font-weight:600;border:1px solid #6b21a8;">Hybrid Policy</span>
|
| 657 |
+
<span style="background:#2d1f00;color:#fbbf24;padding:3px 10px;border-radius:20px;font-size:0.78em;font-weight:600;border:1px solid #92400e;">Reflexion Learning</span>
|
| 658 |
+
<span style="background:#1a0505;color:#f87171;padding:3px 10px;border-radius:20px;font-size:0.78em;font-weight:600;border:1px solid #7f1d1d;">Fraud Detection</span>
|
| 659 |
+
</div>
|
| 660 |
+
</div>"""
|
| 661 |
+
|
| 662 |
+
_FOOTER_HTML = """
|
| 663 |
+
<div style="text-align:center;padding:24px 0 8px;color:#1e3a5f;font-size:12px;
|
| 664 |
+
border-top:1px solid #0f1e30;margin-top:28px;font-family:'IBM Plex Mono',monospace;">
|
| 665 |
+
GraphStrike — OpenEnv Hackathon × SCALER School of Technology |
|
| 666 |
+
<a href="/docs" style="color:#334155;text-decoration:none;">API Docs</a>
|
| 667 |
+
</div>"""
|
| 668 |
|
| 669 |
with gr.Blocks(title="GraphStrike") as demo:
|
| 670 |
+
|
| 671 |
+
gr.HTML(_HEADER_HTML)
|
| 672 |
+
|
| 673 |
+
with gr.Tabs():
|
| 674 |
+
|
| 675 |
+
# ══════════════ TAB 1: README ══════════════
|
| 676 |
+
with gr.Tab("Overview"):
|
| 677 |
+
gr.HTML(_README_HTML)
|
| 678 |
+
|
| 679 |
+
# ══════════════ TAB 2: PLAYGROUND ══════════════
|
| 680 |
+
with gr.Tab("Playground"):
|
| 681 |
+
with gr.Row():
|
| 682 |
+
with gr.Column(scale=1, min_width=220):
|
| 683 |
+
gr.Markdown("**1 — Episode**")
|
| 684 |
+
task_dd = gr.Dropdown(["easy","medium","hard"], value="easy", label="Task")
|
| 685 |
+
seed_in = gr.Number(value=0, label="Seed", precision=0)
|
| 686 |
+
reset_btn = gr.Button("Reset", variant="primary")
|
| 687 |
+
|
| 688 |
+
with gr.Column(scale=1, min_width=220):
|
| 689 |
+
gr.Markdown("**2 — Action**")
|
| 690 |
+
action_dd = gr.Dropdown(
|
| 691 |
+
["inspect","investigate_network","flag","unflag","submit"],
|
| 692 |
+
value="inspect", label="Action")
|
| 693 |
+
acc_in = gr.Textbox(label="Account ID", placeholder="acc_0012")
|
| 694 |
+
step_btn = gr.Button("Step", variant="primary")
|
| 695 |
+
|
| 696 |
+
with gr.Column(scale=1, min_width=180):
|
| 697 |
+
gr.Markdown("**3 — Score**")
|
| 698 |
+
gr.Markdown("<br>", container=False)
|
| 699 |
+
grader_btn = gr.Button("Grader Score", size="sm")
|
| 700 |
+
baseline_btn = gr.Button("Baseline Agent", size="sm")
|
| 701 |
+
gr.Button("API Docs (Swagger)", size="sm", link="/docs", link_target="_blank")
|
| 702 |
+
|
| 703 |
+
obs_md = gr.Markdown(value="*Reset an episode to begin.*")
|
| 704 |
+
|
| 705 |
+
gr.Markdown("**Account Profiles** — sorted by fake risk score (highest first)")
|
| 706 |
+
prof_table = gr.Dataframe(
|
| 707 |
+
headers=PROFILE_HEADERS,
|
| 708 |
+
datatype=["str","str","number","number","number","number",
|
| 709 |
+
"number","number","number","number","number"],
|
| 710 |
+
value=[],
|
| 711 |
+
interactive=False,
|
| 712 |
+
wrap=False,
|
| 713 |
+
column_widths=["110px","160px","70px","70px","70px",
|
| 714 |
+
"70px","70px","70px","70px","55px","70px"],
|
| 715 |
+
)
|
| 716 |
+
|
| 717 |
+
result_md = gr.Markdown(value="")
|
| 718 |
+
|
| 719 |
+
with gr.Accordion("All Visible IDs", open=False):
|
| 720 |
+
vis_md = gr.Markdown(value="")
|
| 721 |
+
with gr.Accordion("Raw JSON", open=False):
|
| 722 |
+
raw_json = gr.Textbox(lines=20, interactive=False)
|
| 723 |
+
|
| 724 |
+
reset_btn.click(gr_reset, [task_dd, seed_in], [obs_md, prof_table, vis_md, raw_json])
|
| 725 |
+
step_btn.click( gr_step, [action_dd, acc_in], [obs_md, prof_table, vis_md, raw_json])
|
| 726 |
+
grader_btn.click(gr_grader, [], result_md)
|
| 727 |
+
baseline_btn.click(gr_baseline,[], result_md)
|
| 728 |
+
|
| 729 |
+
# ══════════════ TAB 3: BENCHMARKS ══════════════
|
| 730 |
+
with gr.Tab("Benchmarks"):
|
| 731 |
+
gr.Markdown(
|
| 732 |
+
"### LLM Agent Evaluation — GraphStrike Environment\n"
|
| 733 |
+
"Agents evaluated with identical system prompts and structured inference. "
|
| 734 |
+
"Grader score range: **0.0 – 1.0** (win threshold ≥ 0.815). "
|
| 735 |
+
"Score colours: "
|
| 736 |
+
"<span style='color:#22c55e'>■</span> ≥0.960 "
|
| 737 |
+
"<span style='color:#86efac'>■</span> ≥0.930 "
|
| 738 |
+
"<span style='color:#facc15'>■</span> ≥0.910 "
|
| 739 |
+
"<span style='color:#f97316'>■</span> below",
|
| 740 |
+
sanitize_html=False,
|
| 741 |
+
)
|
| 742 |
+
|
| 743 |
+
gr.Markdown("#### Leaderboard — Single Seed (seed=0)")
|
| 744 |
+
gr.HTML(_leaderboard_html())
|
| 745 |
+
|
| 746 |
+
gr.Markdown("#### Score Distribution by Task")
|
| 747 |
+
gr.BarPlot(
|
| 748 |
+
value=BENCH_LONG_DF,
|
| 749 |
+
x="Model", y="Score", color="Task",
|
| 750 |
+
title="Agent Scores by Task (seed=0)",
|
| 751 |
+
color_map={"Easy": "#4ade80", "Medium": "#facc15", "Hard": "#f87171"},
|
| 752 |
+
y_lim=[0.50, 1.0],
|
| 753 |
+
x_label_angle=-25,
|
| 754 |
+
height=340,
|
| 755 |
+
)
|
| 756 |
+
|
| 757 |
+
gr.Markdown(
|
| 758 |
+
"#### Stability — 3-Seed Variance Check (seeds 0, 1, 2)\n"
|
| 759 |
+
"Variance colour: "
|
| 760 |
+
"<span style='color:#22c55e'>■</span> stable (<0.001) "
|
| 761 |
+
"<span style='color:#facc15'>■</span> moderate "
|
| 762 |
+
"<span style='color:#f87171'>■</span> high",
|
| 763 |
+
sanitize_html=False,
|
| 764 |
+
)
|
| 765 |
+
gr.HTML(_variance_html())
|
| 766 |
+
|
| 767 |
+
gr.Markdown("#### Rule-Based Baseline (no LLM, deterministic)")
|
| 768 |
+
gr.HTML(_baseline_html())
|
| 769 |
+
|
| 770 |
+
gr.Markdown(
|
| 771 |
+
"#### Key Observations\n"
|
| 772 |
+
"- Hard task is the real differentiator — evasion events destroy graph signals "
|
| 773 |
+
"mid-investigation, requiring adaptive reasoning beyond memorised patterns.\n"
|
| 774 |
+
"- Llama 4 Scout 17B achieves the lowest variance on hard (6e-5), "
|
| 775 |
+
"outperforming models with 40× more parameters.\n"
|
| 776 |
+
"- The rule-based baseline is competitive at mean 0.907, confirming "
|
| 777 |
+
"the environment's signal quality. LLM value is in evasion adaptation.\n"
|
| 778 |
+
"- All frontier models exceed 0.93 on easy/medium — cascade mechanics "
|
| 779 |
+
"are learnable from the structured observation format."
|
| 780 |
+
)
|
| 781 |
+
|
| 782 |
+
gr.HTML(_FOOTER_HTML)
|
| 783 |
|
| 784 |
app = gr.mount_gradio_app(app, demo, path="/")
|
| 785 |
print("[GraphStrike] Gradio UI mounted at /", flush=True)
|
| 786 |
|
| 787 |
except Exception as exc:
|
| 788 |
+
import traceback
|
| 789 |
print(f"[GraphStrike] Gradio unavailable: {exc}", flush=True)
|
| 790 |
+
traceback.print_exc()
|
| 791 |
|
| 792 |
@app.get("/", response_class=HTMLResponse)
def root_fallback():
    """Serve a minimal HTML landing page at ``/`` that links to the Swagger docs.

    NOTE(review): presumably registered only when mounting the Gradio UI
    failed — confirm against the enclosing ``except`` branch.
    """
    page = (
        "<html><body><h1>GraphStrike</h1>"
        "<p>API mode. <a href='/docs'>Swagger</a></p></body></html>"
    )
    return page
|
|
|
|
| 795 |
|
| 796 |
# ---------------------------------------------------------------------------
|
| 797 |
# Entry point
|
server/environment.py
CHANGED
|
@@ -527,11 +527,53 @@ class FakeGangEnvironment(_OpenEnvBase):
|
|
| 527 |
suspicious_mutual_ratio=suspicious_mutual_ratio,
|
| 528 |
)
|
| 529 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 530 |
def _make_observation(
|
| 531 |
self,
|
| 532 |
message: str = "",
|
| 533 |
terminal_reward: Optional[float] = None,
|
| 534 |
) -> FakeGangObservation:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
return FakeGangObservation(
|
| 536 |
done=self._done,
|
| 537 |
reward=terminal_reward,
|
|
@@ -549,7 +591,7 @@ class FakeGangEnvironment(_OpenEnvBase):
|
|
| 549 |
evasion_triggered=self._evasion_triggered,
|
| 550 |
evasion_count=self._evasion_count,
|
| 551 |
task=self._task,
|
| 552 |
-
message=
|
| 553 |
suspect_ids=[
|
| 554 |
sid for sid in self._visible_ids
|
| 555 |
if sid not in self._flagged
|
|
|
|
| 527 |
suspicious_mutual_ratio=suspicious_mutual_ratio,
|
| 528 |
)
|
| 529 |
|
| 530 |
+
def _build_hint(self) -> str:
    """Build a space-joined string of actionable hints from the current state.

    Reads ``_visible_ids``, ``_flagged``, ``_account_statuses``,
    ``_inspected``, ``_profiled``, ``_max_steps``, ``_step_count`` and
    ``_done``; nothing is mutated.

    Up to three hints are produced, in this order:

    1. Inspect the first visible, unflagged account whose status is
       ``"suspect"`` and which has not been inspected yet.
    2. Otherwise, flag the first inspected-but-unflagged account whose
       profile shows strong fake signals.
    3. Submit, either because at least 10 accounts are flagged or because
       3 or fewer steps remain.

    Returns:
        The hints joined by single spaces, or ``""`` when none apply.
    """
    hints = []

    # Hint 1: uninspected suspects take priority over everything else.
    uninspected_suspects = [
        sid for sid in self._visible_ids
        if sid not in self._flagged
        and self._account_statuses.get(sid, "normal") == "suspect"
        and sid not in self._inspected
    ]
    if uninspected_suspects:
        hints.append(
            f"HINT: {len(uninspected_suspects)} SUSPECT accounts need inspection — "
            f"INSPECT {uninspected_suspects[0]} next "
            "(auto-elevated by cascade, likely gang member)."
        )
    else:
        # Hint 2: only relevant when no suspect is waiting for inspection, so
        # the profile scan is skipped otherwise. Only the first match is
        # reported, hence the short-circuiting ``next``.
        def _looks_fake(profile) -> bool:
            # Either a heavily shared IP, or reused photo + templated bio on
            # an account that does not look like a legitimate hub.
            return (
                profile.shared_ip_count >= 5
                or (
                    profile.photo_reuse_score >= 0.50
                    and profile.bio_template_score >= 0.40
                    and profile.hub_legitimacy_score < 0.70
                )
            )

        first_fake = next(
            (
                acc_id for acc_id in self._inspected
                if acc_id not in self._flagged
                and self._profiled.get(acc_id)
                and _looks_fake(self._profiled.get(acc_id))
            ),
            None,
        )
        if first_fake is not None:
            hints.append(
                f"HINT: FLAG {first_fake} — strong fake signals detected "
                "(photo_reuse/bio_template/shared_ip). FLAG is FREE (costs 0 steps)."
            )

    # Hint 3: submit reminder.
    flag_count = len(self._flagged)
    steps_left = max(0, self._max_steps - self._step_count)
    if flag_count >= 10:
        # Report the real count; the previous text always said "10" even
        # when more accounts had been flagged.
        hints.append(
            f"HINT: You have {flag_count} flags — SUBMIT now to end the episode and get scored."
        )
    elif steps_left <= 3 and not self._done:
        hints.append(
            f"HINT: Only {steps_left} steps left — consider SUBMIT to lock in your score."
        )

    return " ".join(hints)
|
| 567 |
+
|
| 568 |
def _make_observation(
|
| 569 |
self,
|
| 570 |
message: str = "",
|
| 571 |
terminal_reward: Optional[float] = None,
|
| 572 |
) -> FakeGangObservation:
|
| 573 |
+
# Append hints to message for agent guidance
|
| 574 |
+
hint = self._build_hint() if not self._done else ""
|
| 575 |
+
full_message = f"{message} {hint}".strip() if hint else message
|
| 576 |
+
|
| 577 |
return FakeGangObservation(
|
| 578 |
done=self._done,
|
| 579 |
reward=terminal_reward,
|
|
|
|
| 591 |
evasion_triggered=self._evasion_triggered,
|
| 592 |
evasion_count=self._evasion_count,
|
| 593 |
task=self._task,
|
| 594 |
+
message=full_message,
|
| 595 |
suspect_ids=[
|
| 596 |
sid for sid in self._visible_ids
|
| 597 |
if sid not in self._flagged
|
server/generator.py
CHANGED
|
@@ -261,9 +261,19 @@ def generate_episode(task: str, seed: int) -> Dict[str, Any]:
|
|
| 261 |
|
| 262 |
_build_edges(rng, accounts, gang_ids, cfg["intra_gang_density"])
|
| 263 |
|
| 264 |
-
# Choose starting visible accounts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
starting_count = cfg["starting_visible"]
|
| 266 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
return {
|
| 269 |
"episode_id": str(uuid.uuid4()),
|
|
|
|
| 261 |
|
| 262 |
_build_edges(rng, accounts, gang_ids, cfg["intra_gang_density"])
|
| 263 |
|
| 264 |
+
# Choose starting visible accounts.
|
| 265 |
+
# Guarantee at least 1 gang member is included so the cascade CAN start
|
| 266 |
+
# regardless of seed. The agent still has to identify WHICH account is fake
|
| 267 |
+
# (requires inspecting profiles) — so difficulty is preserved.
|
| 268 |
+
# Without this, ~31% of easy episodes and ~82% of hard episodes start with
|
| 269 |
+
# zero gang members visible, making score variance seed-luck rather than
|
| 270 |
+
# agent skill.
|
| 271 |
starting_count = cfg["starting_visible"]
|
| 272 |
+
forced_gang = rng.sample(gang_ids, 1) # exactly 1 gang member
|
| 273 |
+
rest_pool = [i for i in all_ids if i not in forced_gang]
|
| 274 |
+
additional = rng.sample(rest_pool, starting_count - 1)
|
| 275 |
+
starting_visible = forced_gang + additional
|
| 276 |
+
rng.shuffle(starting_visible) # don't reveal which is fake
|
| 277 |
|
| 278 |
return {
|
| 279 |
"episode_id": str(uuid.uuid4()),
|