Pandago committed on
Commit
87f2d84
·
verified ·
1 Parent(s): d92e0b0

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -49,3 +49,9 @@ wheels/urllib3-2.6.3-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
49
  wheels/uvloop-0.22.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl filter=lfs diff=lfs merge=lfs -text
50
  wheels/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
51
  wheels/websockets-16.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
49
  wheels/uvloop-0.22.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl filter=lfs diff=lfs merge=lfs -text
50
  wheels/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
51
  wheels/websockets-16.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl filter=lfs diff=lfs merge=lfs -text
52
+ README.pdf filter=lfs diff=lfs merge=lfs -text
53
+ assets/formulas-1.png filter=lfs diff=lfs merge=lfs -text
54
+ assets/hybrid.png filter=lfs diff=lfs merge=lfs -text
55
+ assets/reflexion.png filter=lfs diff=lfs merge=lfs -text
56
+ assets/sys[[:space:]]arch.png filter=lfs diff=lfs merge=lfs -text
57
+ images/big.png filter=lfs diff=lfs merge=lfs -text
PIPELINE.md ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GraphStrike — End-to-End Pipeline
2
+
3
+ A complete walkthrough of how GraphStrike works, from data generation to scoring.
4
+
5
+ ---
6
+
7
+ ## Pipeline Overview
8
+
9
+ ```
10
+ ┌─────────────────────────────────────────────────────────────────────┐
11
+ │ BUILD TIME (Docker) │
12
+ │ │
13
+ │ generator.py ──► 150 episode JSONs (50 per difficulty tier) │
14
+ │ │ each with 10 gang + N real + decoys │
15
+ │ │ deterministic by seed │
16
+ │ ▼ │
17
+ │ episodes/easy_000.json ... hard_049.json │
18
+ └─────────────────────────────────────────────────────────────────────┘
19
+
20
+
21
+ ┌─────────────────────────────────────────────────────────────────────┐
22
+ │ RUNTIME (Environment Server) │
23
+ │ │
24
+ │ FastAPI + Gradio @ port 7860 │
25
+ │ │
26
+ │ POST /reset ──► Load episode, init graph, reveal visible IDs │
27
+ │ POST /step ──► Execute action, update state, return observation │
28
+ │ GET /grader ──► Compute final score after SUBMIT │
29
+ │ POST /baseline ──► Run rule-based agent on all 3 tasks │
30
+ └─────────────────────────────────────────────────────────────────────┘
31
+
32
+
33
+ ┌─────────────────────────────────────────────────────────────────────┐
34
+ │ INFERENCE (Agent / LLM) │
35
+ │ │
36
+ │ inference.py connects to server via HTTP │
37
+ │ Uses OpenAI-compatible client to call LLM │
38
+ │ Emits structured logs: [START] [STEP] [END] │
39
+ └─────────────────────────────────────────────────────────────────────┘
40
+ ```
41
+
42
+ ---
43
+
44
+ ## Stage 1: Synthetic Data Generation
45
+
46
+ **File:** `server/generator.py`
47
+
48
+ ```
49
+ Input: seed (int) + difficulty tier (easy/medium/hard)
50
+ Output: episode JSON with full social graph
51
+ ```
52
+
53
+ ### What gets generated per episode:
54
+
55
+ | Component | Easy | Medium | Hard |
56
+ |-----------|------|--------|------|
57
+ | Total accounts | 50 | 200 | 1000 |
58
+ | Gang members (target) | 10 | 10 | 10 |
59
+ | Decoy accounts | 0 | 20 | 50 |
60
+ | Celebrity accounts | 2 | 2 | 2 |
61
+ | Zero-edge accounts | 2 | 2 | 2 |
62
+ | Max steps | 30 | 50 | 80 |
63
+ | Evasion events | None | At step 20 | Recurring |
64
+
65
+ ### Gang member signals (coordinated):
66
+ - `ip_cluster_id = "ip_gang_{seed}"` — all 10 share one IP
67
+ - `shared_ip_count = 9` — each sees 9 others on same IP
68
+ - `photo_reuse_score ∈ [0.70, 0.95]` — stolen celebrity photos
69
+ - `bio_template_score ∈ [0.60, 0.90]` — copy-paste bios
70
+ - `comment_repeat_score ∈ [0.60, 0.90]` — spam comments
71
+ - `avg_post_hour` clustered within 2h window — coordinated posting
72
+
73
+ ### Real account signals (independent):
74
+ - Unique IP per account
75
+ - `photo_reuse_score ∈ [0.0, 0.05]`
76
+ - `bio_template_score ∈ [0.0, 0.08]`
77
+ - `comment_repeat_score ∈ [0.0, 0.08]`
78
+
79
+ ---
80
+
81
+ ## Stage 2: Environment State Machine
82
+
83
+ **File:** `server/environment.py`
84
+
85
+ ```
86
+ POST /reset
87
+
88
+
89
+ ┌─────────────────┐
90
+ │ EPISODE INIT │
91
+ │ Load JSON graph │
92
+ │ Reveal visible │
93
+ │ IDs to agent │
94
+ └────────┬────────┘
95
+
96
+
97
+ ┌─────────────────┐ POST /step
98
+ │ AGENT LOOP │◄────────────────┐
99
+ │ │ │
100
+ │ Observation: │ ┌──────────┐ │
101
+ │ - visible_ids │ │ ACTION │ │
102
+ │ - profiles │ │ │ │
103
+ │ - flagged_ids │──►│ INSPECT │──┘
104
+ │ - suspect_ids │ │ INVEST_NET│
105
+ │ - steps_remain │ │ FLAG │
106
+ │ - message/hints │ │ UNFLAG │
107
+ └────────┬────────┘ │ SUBMIT │
108
+ │ └──────────┘
109
+ │ (SUBMIT or steps=0)
110
+
111
+ ┌─────────────────┐
112
+ │ EPISODE END │
113
+ │ Compute grader │
114
+ │ score via │
115
+ │ scoring.py │
116
+ └─────────────────┘
117
+
118
+
119
+ GET /grader
120
+ → { score: 0.0-1.0 }
121
+ ```
122
+
123
+ ### Action Details
124
+
125
+ | Action | Step Cost | Effect |
126
+ |--------|-----------|--------|
127
+ | **INSPECT** | 1 | Reveals full profile + risk scores for one account |
128
+ | **INVESTIGATE_NETWORK** | 1 | 2-hop bidirectional expansion, reveals neighbor IDs |
129
+ | **FLAG** | 0 (free) | Marks account as fake, triggers dual cascade |
130
+ | **UNFLAG** | 0 (free) | Removes flag from account |
131
+ | **SUBMIT** | 0 | Ends episode, triggers grading |
132
+
133
+ ### Dual Cascade on FLAG
134
+
135
+ ```
136
+ FLAG acc_0012
137
+
138
+ ├──► Follow-graph cascade
139
+ │ For each neighbor of acc_0012:
140
+ │ if visible AND status=NORMAL → set SUSPECT
141
+
142
+ └──► IP cluster cascade
143
+ Find all accounts with same ip_cluster_id:
144
+ if visible AND status=NORMAL → set SUSPECT
145
+ ```
146
+
147
+ ---
148
+
149
+ ## Stage 3: Risk Scoring Engine
150
+
151
+ **File:** `server/scoring.py`
152
+
153
+ All scores computed at INSPECT time. Stateless pure functions.
154
+
155
+ ```
156
+ ┌─────────────┐
157
+ │ Raw Features │
158
+ │ from profile │
159
+ └──────┬──────┘
160
+
161
+ ┌────────────┼────────────┐
162
+ ▼ ▼ ▼
163
+ ┌──────────┐ ┌──────────┐ ┌──────────┐
164
+ │ Node Risk│ │Behavior │ │Graph Risk│
165
+ │ 0.6×photo│ │Risk │ │0.45×flag │
166
+ │+0.4×bio │ │0.55×age │ │ _nbr_rat │
167
+ └────┬─────┘ │+0.45×hr │ │+0.35×mut │
168
+ │ │ cluster │ │+0.20×nbr │
169
+ │ └────┬─────┘ │ photo │
170
+ │ │ └────┬─────┘
171
+ │ │ │
172
+ ▼ ▼ ▼
173
+ ┌──────────────────────────────────┐ ┌──────────────┐
174
+ │ fake_risk_score │◄───│Hub Legitimacy│
175
+ │ 0.30×node + 0.25×beh + 0.45×graph│ │(discount for │
176
+ │ − 0.25×hub │ │ celebrities) │
177
+ │ clamp [0.0, 1.0] │ └──────────────┘
178
+ └──────────────────────────────────┘
179
+ ```
180
+
181
+ ### Grader Formula (after SUBMIT)
182
+
183
+ ```
184
+ If recall ≥ 0.8 AND precision ≥ 0.7:
185
+ score = 0.55 + 0.20×recall + 0.15×precision + 0.10×efficiency
186
+
187
+ Else (partial credit):
188
+ score = 0.30×recall + 0.10×precision
189
+
190
+ Where:
191
+ recall = TP / 10
192
+ precision = TP / (TP + FP)
193
+ efficiency = (max_steps − steps_used) / max_steps
194
+ ```
195
+
196
+ ---
197
+
198
+ ## Stage 4: Inference Pipeline
199
+
200
+ **File:** `inference.py`
201
+
202
+ ### Rule-Based Agent (baseline)
203
+
204
+ ```
205
+ while not done:
206
+ 1. INSPECT suspects first (auto-elevated by cascade)
207
+ 2. FLAG any inspected account with fake_risk ≥ threshold
208
+ (easy: 0.60, medium: 0.50, hard: 0.45)
209
+ 3. INSPECT highest-risk uninspected account
210
+ 4. SUBMIT when 10 flagged or steps running low
211
+ ```
212
+
213
+ ### LLM Agent (hybrid policy)
214
+
215
+ ```
216
+ ┌──────────────┐ ┌───────────────┐ ┌──────────────┐
217
+ │ Observation │────►│ Rule Engine │────►│ α blend │
218
+ │ (from server) │ │ (get_rule_ │ │ │
219
+ │ │ │ action + │ │ if conf>thr: │
220
+ │ │────►│ confidence) │ │ use rule │
221
+ │ │ └───────────────┘ │ else: │
222
+ │ │ │ query LLM │
223
+ │ │ ┌───────────────┐ │ blend │
224
+ │ │────►│ LLM (OpenAI │────►│ actions │
225
+ │ │ │ client) │ └──────┬───────┘
226
+ │ │ └───────────────┘ │
227
+ └──────────────┘ ▼
228
+ ┌──────────────┐
229
+ │ Final Action │
230
+ └──────────────┘
231
+
232
+ α caps: easy ≤ 0.50 | medium ≤ 0.70 | hard ≤ 0.85
233
+ (rule engine retains veto power on high-confidence decisions)
234
+ ```
235
+
236
+ ### Structured Log Format
237
+
238
+ ```
239
+ [START] task=easy env=graphstrike model=Qwen/Qwen2.5-72B-Instruct
240
+ [STEP] step=1 action=inspect:acc_0012 reward=0.00 done=false error=null
241
+ [STEP] step=2 action=flag:acc_0012 reward=1.20 done=false error=null
242
+ ...
243
+ [STEP] step=15 action=submit reward=12.40 done=true error=null
244
+ [END] success=true steps=15 score=0.910 rewards=0.00,1.20,...,12.40
245
+ ```
246
+
247
+ ---
248
+
249
+ ## Stage 5: Deployment
250
+
251
+ ```
252
+ ┌─────────────────────────────────────────┐
253
+ │ Hugging Face Spaces │
254
+ │ │
255
+ │ Docker container @ port 7860 │
256
+ │ ┌────────────────────────────────────┐ │
257
+ │ │ FastAPI (API endpoints) │ │
258
+ │ │ /health /tasks /reset /step │ │
259
+ │ │ /state /grader /baseline │ │
260
+ │ │ /metadata /schema /mcp │ │
261
+ │ ├────────────────────────────────────┤ │
262
+ │ │ Gradio UI (mounted at /) │ │
263
+ │ │ Manual testing interface │ │
264
+ │ │ Reset / Step / Grader buttons │ │
265
+ │ └────────────────────────────────────┘ │
266
+ │ │
267
+ │ /web → redirect to / (HF probe compat) │
268
+ └─────────────────────────────────────────┘
269
+ │ ▲
270
+ │ HTTP │ HTTP
271
+ ▼ │
272
+ ┌─────────────────────┐ ┌────────────┐
273
+ │ inference.py │ │ openenv │
274
+ │ (runs externally) │ │ validate │
275
+ │ LLM ←→ Server │ │ (judging) │
276
+ └─────────────────────┘ └────────────┘
277
+ ```
278
+
279
+ ---
280
+
281
+ ## Quick Start
282
+
283
+ ```bash
284
+ # 1. Build & run the environment server
285
+ docker build -f Dockerfile -t graphstrike .
286
+ docker run -p 7860:7860 graphstrike
287
+
288
+ # 2. Verify endpoints
289
+ curl http://localhost:7860/health
290
+ curl http://localhost:7860/tasks
291
+ curl -X POST http://localhost:7860/baseline
292
+
293
+ # 3. Run LLM inference (separate terminal)
294
+ export API_KEY="your-hf-token"
295
+ export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
296
+ python3 inference.py --url http://localhost:7860 --all-tasks
297
+
298
+ # 4. Validate submission
299
+ openenv validate
300
+ openenv validate --url http://localhost:7860
301
+ ```
README.md CHANGED
@@ -15,783 +15,255 @@ tags:
15
  - llm-agent
16
  base_path: /web
17
  ---
 
18
 
19
- # GraphStrike : Coordinated Fake Account Ring Detection
 
 
20
 
21
- > **OpenEnv Hackathon × SCALER School of Technology**
22
- > Live deployment: [huggingface.co/spaces/Pandago/graphstrike](https://huggingface.co/spaces/Pandago/graphstrike)
23
 
24
- An OpenEnv-compatible reinforcement learning environment where an LLM agent
25
- must identify all 10 members of a coordinated fake account ring hidden
26
- inside a synthetic social network. The agent learns via **Reflexion** and a
27
- **dynamic hybrid rule/LLM policy**, not via gradient updates or fine-tuning.
 
 
 
 
 
 
28
 
29
- ---
 
 
 
 
 
 
30
 
31
- ## Table of Contents
32
 
33
- 1. [What This Is](#1-what-this-is)
34
- 2. [Repository Layout](#2-repository-layout)
35
- 3. [The Problem: How Fake Detection Actually Works](#3-the-problem-how-fake-detection-actually-works)
36
- 4. [Synthetic Data Generation](#4-synthetic-data-generation)
37
- 5. [Data Model — Every Field Explained](#5-data-model--every-field-explained)
38
- 6. [The RL Environment](#6-the-rl-environment)
39
- 7. [Risk Scoring Mathematics](#7-risk-scoring-mathematics)
40
- 8. [Account Status State Machine](#8-account-status-state-machine)
41
- 9. [The LLM Policy (Qwen3 via Bedrock)](#9-the-llm-policy-qwen3-via-bedrock)
42
- 10. [Reflexion — How the Agent Learns](#10-reflexion--how-the-agent-learns)
43
- 11. [Hybrid Policy — The Novel Contribution](#11-hybrid-policy--the-novel-contribution)
44
- 12. [Training Loop End-to-End](#12-training-loop-end-to-end)
45
- 13. [API Reference](#13-api-reference)
46
- 14. [Docker Deployment](#14-docker-deployment)
47
- 15. [Submission Requirements](#15-submission-requirements)
48
- 16. [Verification & Validation](#16-verification--validation)
49
 
50
- ---
51
 
52
- ## 1. What This Is
53
 
54
- This is an **OpenEnv hackathon** submission. OpenEnv is a framework for building
55
- reinforcement learning environments with a standard microservice interface
56
- (`/reset`, `/step`, `/state`) so that any agent implementation can plug in.
57
 
58
- **The task:** A social network contains fake accounts organised into a
59
- single coordinated ring of 10. The ring behaves in a coordinated way — same posting hour,
60
- same IP subnet, stolen celebrity photos, copy-paste bios. The agent must find
61
- all 10 by navigating a limited step budget, inspecting accounts, and flagging suspects.
62
 
63
- **What makes this non-trivial:**
64
 
65
- - The network is large (50–1000 accounts depending on difficulty).
66
- - Fake accounts are mixed with innocent high-signal "decoy" accounts.
67
- - In hard mode, the gang actively evades — dropping intra-gang follows,
68
- renaming profiles — while the agent is mid-investigation.
69
- - The agent cannot see the full network upfront: it must explore via INSPECT and
70
- INVESTIGATE_NETWORK actions, spending steps to reveal information.
71
-
72
- **What makes the learning novel:**
73
 
74
- The LLM (Qwen3-80B via AWS Bedrock) cannot be fine-tuned — it is a black-box API.
75
- - The agent learns via **Reflexion**: post-episode lessons are written back into
76
- memory and injected into every future prompt.
77
- - A **dynamic hybrid policy** (α-weighted) blends the LLM with a deterministic
78
- rule engine, with the blend weight α updating based on recent win rate.
79
- Rules dominate early; the LLM takes over as it proves itself.
80
 
81
  ---
82
-
83
- ## 2. Repository Layout
84
-
85
- ```
86
- fake_gang_env/
87
-
88
- ├── models.py # All Pydantic types: Action, Observation, State, Profile
89
- ├── bedrock_model.py # AWS Bedrock client — invoke_qwen()
90
- ├── client.py # HTTP client for talking to the running server
91
- ├── inference.py # Submission: rule-based baseline runner + HTTP client mode
92
- ├── validate.py # Submission: pre-submission validator (24 checks)
93
- ├── train.py # Main training loop (curriculum + hybrid policy)
94
- ├── run.sh # Docker entrypoint: episodes → server → training
95
- ├── requirements.txt # Python dependencies
96
-
97
- ├── server/
98
- │ ├── app.py # FastAPI server: /reset /step /state /health /tasks /grader /baseline
99
- │ ├── environment.py # Core RL environment — FakeGangEnvironment class
100
- │ ├── generator.py # Synthetic episode generator (50 per task × 3 tasks = 150 files)
101
- │ ├── scoring.py # Pure-math risk formula engine (stateless functions)
102
- │ ├── Dockerfile # Offline pip install via pre-downloaded wheels
103
- │ └── .dockerignore # Excludes episodes/, memory/, runs/ from build context
104
-
105
- ├── agent/
106
- │ ├── policy.py # LLM policy: formats obs → calls Qwen → parses <action> tag
107
- │ ├── hybrid_policy.py # Hybrid policy: blends rules + LLM via dynamic α
108
- │ ├── memory.py # Disk-backed memory: reflections, trajectories, win history, α
109
- │ └── reflection.py # Post-episode reflection generator (also calls Qwen)
110
-
111
- ├── episodes/ # 150 pre-generated JSON episode files (excluded from Docker build)
112
- ├── memory/ # Docker volume: reflections, trajectories, α values (persists)
113
- └── runs/ # Docker volume: per-episode metrics JSONL (persists)
114
- ```
115
-
116
  ---
117
 
118
- ## 3. The Problem: How Fake Detection Actually Works
119
-
120
- A real-world fake account detector does **not** read post content. Content is
121
- expensive to process, multilingual, and easily spoofed. Instead, detection relies
122
- on three categories of signals that are computed from metadata:
123
-
124
- ### 3.1 Node Signals (per-account features)
125
-
126
- These are pre-computed by a content pipeline before the agent ever sees the account:
127
 
128
- | Feature | What it measures | Fake range | Real range |
129
- | ------------------------ | ---------------------------------------------------------------------------- | ---------- | ---------- |
130
- | `photo_reuse_score` | Fraction of posts using stolen celebrity photos (pHash fingerprint matching) | 0.30–0.95 | 0.00–0.15 |
131
- | `bio_template_score` | Cosine similarity of bio text to known fake-account bio templates | 0.20–0.90 | 0.00–0.12 |
132
- | `comment_repeat_score` | Fraction of comments that are copy-pasted across accounts (spam pattern) | 0.60–0.90 | 0.00–0.08 |
133
 
134
- The agent receives these as numbers (not the raw content). The content pipeline
135
- runs offline; the agent operates on numeric features only.
136
 
137
- ### 3.2 Behavioral Signals (temporal + device)
 
 
 
138
 
139
- | Feature | What it measures | Fake pattern |
140
- | -------------------- | ------------------------------------------------ | ---------------------------------------------------------------- |
141
- | `avg_post_hour` | Mean hour of day for all posts (0–23) | Gang members cluster within ±0.5 hours (coordinated scheduling) |
142
- | `account_age_days` | Days since account creation | Gang created within same week (base_age ± 7 days) |
143
- | `shared_ip_count` | How many other accounts share the same IP subnet | 9 (all 10 gang members share one IP) |
144
- | `ip_cluster_id` | Identifier for the IP subnet | All fakes: `ip_gang_{seed}` |
145
 
146
- ### 3.3 Structural / Graph Signals (derived at INSPECT time)
 
 
 
147
 
148
- These cannot be pre-computed because they depend on **which accounts the agent
149
- has already flagged** — they change dynamically as the investigation progresses:
150
 
151
- | Feature | What it measures | Fake pattern |
152
- | ---------------------------- | ----------------------------------------------------------------- | --------------------------------------------------- |
153
- | `mutual_follow_rate` | Fraction of this account's follows that also follow back | Gang mutually follow each other heavily (0.6–0.9) |
154
- | `flagged_neighbor_count` | How many of this account's follows are currently flagged | High → embedded in the cluster you're tracking |
155
- | `avg_neighbor_photo_reuse` | Mean photo_reuse_score of already-inspected follows | Gang members cluster; inspecting one reveals others |
156
- | `post_hour_cluster_score` | How closely posting hour matches the flagged accounts' mean | High → same operation schedule |
157
- | `suspicious_mutual_ratio` | Fraction of suspicious follows that also follow this account back | Used in hub legitimacy computation |
158
-
159
- **Why graph signals are the most powerful:** A gang member always has high
160
- `mutual_follow_rate` and grows `flagged_neighbor_count` as the investigation
161
- proceeds. A celebrity who is *followed by* fakes but does not *follow them back*
162
- will have low `mutual_follow_rate` — this is how the hub legitimacy score
163
- protects against false positives.
164
 
165
  ---
 
166
 
167
- ## 4. Synthetic Data Generation
168
-
169
- **File:** `server/generator.py`
170
-
171
- Episodes are generated deterministically by seed. 150 episodes are pre-generated
172
- (50 per task) and cached as JSON files in `episodes/`.
 
 
 
 
 
 
 
 
173
 
174
- ### 4.1 Network Composition
175
 
176
- | Task | Network size | Gang | Decoys | Real | Max steps |
177
- | ------ | ------------ | ---- | ------ | ---- | --------- |
178
- | easy | 50 | 10 | 0 | 40 | 30 |
179
- | medium | 200 | 10 | 20 | 170 | 50 |
180
- | hard | 1000 | 10 | 50 | 940 | 80 |
181
 
182
- **Gang accounts:** All 10 share:
183
 
184
- - A `base_age` (same creation week): `account_age_days = base_age + randint(0, 7)`
185
- - Tightly clustered `avg_post_hour`: sampled from `gauss(14.0, std)` where std=0.5/1.5/2.5 by task
186
- - High `photo_reuse_score` and `bio_template_score` (ranges defined per task)
187
- - `comment_repeat_score` in [0.60, 0.90]
188
- - `ip_cluster_id = "ip_gang_{seed}"` and `shared_ip_count = 9`
189
- - Dense intra-gang follow edges (density 0.60–0.80 by task)
190
 
191
- **Real accounts:** Drawn from log-normal distributions mimicking real Instagram stats:
 
 
 
192
 
193
- - `follower_count`: log-normal(μ=6.0, σ=1.5), clipped to [10, 100,000]
194
- - `comment_repeat_score`: Beta(1, 20) ≈ mostly 0.00–0.08
195
- - Each has a unique `ip_cluster_id`
196
 
197
- **Decoy accounts** (medium/hard only): Real accounts with elevated fraud scores:
 
 
198
 
199
- - `photo_reuse_score` in [0.20, 0.40], `bio_template_score` in [0.20, 0.40]
200
- - `comment_repeat_score` in [0.10, 0.30]
201
- - They look suspicious but are NOT gang members — they penalise reckless flagging.
202
 
203
- **Celebrity accounts** (2 per episode): Very high follower counts, very low fake scores:
204
 
205
- - `follower_count` in [100,000, 5,000,000]
206
- - `photo_reuse_score` in [0.00, 0.05], `bio_template_score` in [0.00, 0.05]
207
- - High `hub_legitimacy_score` → the formula protects them from false positives.
208
 
209
- **Zero-edge isolates** (2 per episode): Real accounts with `follower_count=0`,
210
- `following_count=0`, no graph edges. They test whether the agent wastes steps
211
- on disconnected nodes.
212
 
213
- ### 4.2 Edge Generation
214
 
215
- Intra-gang edges are added first with high density:
216
 
217
- ```
218
- for g in gang_ids:
219
- for h in gang_ids:
220
- if g != h and random() < density:
221
- g follows h
222
- ```
223
 
224
- Real and decoy accounts get sparse preferential-attachment edges: each follows
225
- 5–50 random other accounts. This creates a realistic social graph where gang
226
- members are much more tightly interconnected than real users.
227
-
228
- ### 4.3 Episode JSON Schema
229
-
230
- ```json
231
- {
232
- "episode_id": "uuid4",
233
- "task": "easy",
234
- "seed": 0,
235
- "max_steps": 30,
236
- "win_recall": 0.8,
237
- "win_precision": 0.7,
238
- "starting_visible": ["acc_0012", "acc_0037", ...],
239
- "gang_member_ids": ["acc_0003", "acc_0017", ...],
240
- "decoy_ids": [],
241
- "celeb_ids": ["acc_0048", "acc_0049"],
242
- "zero_edge_ids": ["acc_0046", "acc_0047"],
243
- "network": {
244
- "accounts": [
245
- {
246
- "id": "acc_0003",
247
- "is_fake": true,
248
- "gang_id": "gang_A",
249
- "features": {
250
- "follower_count": 3421,
251
- "following_count": 847,
252
- "post_count": 214,
253
- "avg_post_hour": 14.23,
254
- "photo_reuse_score": 0.8712,
255
- "bio_template_score": 0.7403,
256
- "account_age_days": 67,
257
- "comment_repeat_score": 0.7831,
258
- "ip_cluster_id": "ip_gang_0",
259
- "shared_ip_count": 9,
260
- "name_change_count": 0
261
- },
262
- "true_edges": {
263
- "follows": ["acc_0017", "acc_0029", ...],
264
- "followed_by": ["acc_0017", "acc_0008", ...]
265
- }
266
- }
267
- ]
268
- },
269
- "evasion_schedule": []
270
- }
271
- ```
272
 
273
  ---
274
 
275
- ## 5. Data Model — Every Field Explained
276
 
277
- **File:** `models.py`
278
-
279
- ### 5.1 ActionType (enum)
280
-
281
- | Value | Cost | Effect |
282
- | ----------------------- | ------- | ---------------------------------------------------------------------------------------- |
283
- | `inspect` | 1 step | Reveals full `AccountProfile` + follow list; adds neighbors to `visible_account_ids` |
284
- | `investigate_network` | 2 steps | Expands 2 hops from account; only reveals account IDs (no profiles) |
285
- | `flag` | 0 steps | Marks account as gang member; triggers SUSPECT cascade to visible neighbors |
286
- | `unflag` | 0 steps | Removes flag; clears CONFIRMED_FAKE status |
287
- | `submit` | 0 steps | Ends episode; triggers scoring |
288
-
289
- ### 5.2 AccountStatus (enum)
290
 
291
- ```
292
- NORMAL → no signal or formula risk < 0.35
293
- SUSPECT → auto-elevated via dual cascade:
294
- (1) a flagged account follows this account, OR
295
- (2) this account shares ip_cluster_id with a flagged account
296
- CONFIRMED_FAKE → agent explicitly flagged this account
297
- ```
298
 
299
- Transitions are one-directional except UNFLAG which clears CONFIRMED_FAKE.
300
- SUSPECT is set automatically — the agent never sets it manually.
301
 
302
- ### 5.3 AccountProfile — all 22 fields
 
 
 
 
303
 
304
- ```python
305
- account_id: str # "acc_0042"
 
 
 
306
 
307
- # Raw counts
308
- follower_count: int # followers (fakes: 1k-8k, celebs: 100k-5M)
309
- following_count: int # accounts followed (fakes: 200-2000)
310
- post_count: int # total posts (fakes: 50-500)
311
 
312
- # Temporal
313
- avg_post_hour: float # mean posting hour 0-23 (gang: tightly clustered)
314
- account_age_days: int # days since creation (gang: same week, within 7 days)
315
 
316
- # Content pipeline scores (pre-computed, 0-1)
317
- photo_reuse_score: float # pHash stolen-photo detection
318
- bio_template_score: float # cosine similarity to fake bio templates
319
- comment_repeat_score: float # copy-paste spam comment fraction
320
 
321
- # IP / device
322
- shared_ip_count: int # accounts sharing same IP subnet (gang: 9)
323
 
324
- # Graph features (computed live at INSPECT time)
325
- mutual_follow_rate: float # fraction of follows that follow back (fakes: 0.6-0.9)
326
- flagged_neighbor_count: int # follows already flagged (grows as investigation progresses)
327
- avg_neighbor_photo_reuse: float # mean photo_reuse of inspected follows
328
- inspected_neighbor_count: int # denominator for flagged_neighbor_ratio
329
- post_hour_cluster_score: float # hour alignment to flagged cluster mean (0-1)
330
- suspicious_mutual_ratio: float # mutual rate among suspicious follows (for hub legitimacy)
331
 
332
- # Risk breakdown (computed via scoring.py at INSPECT time)
333
- fake_risk_score: float # composite 0-1 (main decision signal)
334
- node_risk: float # from photo_reuse + bio_template
335
- behavior_risk: float # from account_age + post_hour_cluster
336
- graph_risk: float # from flagged_neighbor_ratio + mutual + avg_neighbor
337
- hub_legitimacy_score: float # celebrity/hub discount
338
 
339
- # Evasion tracking
340
- name_change_count: int # incremented by hard-mode evasion events
 
 
 
 
 
 
 
341
 
342
- # Status
343
- status: AccountStatus # NORMAL / SUSPECT / CONFIRMED_FAKE
344
- visible_follows: List[str] # follow list revealed by INSPECT
345
- ```
346
 
347
- ### 5.4 FakeGangObservation — what the agent sees each step
348
-
349
- ```python
350
- done: bool # episode over?
351
- reward: Optional[float] # only set on terminal step
352
- visible_accounts: List[AccountProfile] # fully profiled (inspected) accounts
353
- visible_account_ids: List[str] # all known account IDs (profiled + seen)
354
- flagged_ids: List[str] # currently flagged by agent
355
- inspected_ids: List[str] # accounts with full profiles revealed
356
- suspect_ids: List[str] # auto-elevated SUSPECT accounts (uninspected cascade)
357
- graph_edges: Dict[str, List[str]] # follow lists for inspected accounts
358
- steps_remaining: int # budget left
359
- evasion_triggered: bool # was evasion active this episode?
360
- evasion_count: int # how many evasion events have fired
361
- task: str # "easy" / "medium" / "hard"
362
- message: str # human-readable result / status message
363
- ```
364
 
365
  ---
366
 
367
- ## 6. The RL Environment
368
 
369
  **File:** `server/environment.py`
370
 
371
- ### 6.1 Episode Lifecycle
372
 
373
- ```
374
- reset(task, seed)
375
- └── loads JSON episode file (or generates on the fly)
376
- └── initialises _visible_ids with starting_visible accounts
377
- └── returns initial observation (no profiles yet)
378
-
379
- step(action) [called repeatedly]
380
- └── INSPECT → _do_inspect() → reveals profile + neighbors
381
- └── FLAG → _do_flag() → cascades SUSPECT to visible neighbors
382
- └── UNFLAG → _do_unflag() → clears status
383
- └── INVESTIGATE_NETWORK → _do_investigate() → reveals 2-hop IDs
384
- └── SUBMIT → _do_submit() → scores and ends episode
385
-
386
- If step_count >= max_steps → forced submit (penalty -2.0)
387
- ```
388
-
389
- ### 6.2 Action Mechanics in Detail
390
-
391
- **INSPECT (1 step):**
392
-
393
- 1. Adds account to `_inspected`
394
- 2. Calls `_build_profile(acc_id)` — computes all 22 features dynamically
395
- 3. Adds all accounts this account follows to `_visible_ids`
396
- 4. Returns updated observation
397
-
398
- **INVESTIGATE_NETWORK (2 steps):**
399
 
400
- 1. Adds account to `_inspected` (counts it as seen)
401
- 2. **Bidirectional 2-hop expansion:** Traverses both `_live_edges` (outgoing follows)
402
- AND `_reverse_edges` (incoming followers) for the target and each 1-hop neighbor.
403
- This means the expansion covers:
404
- - Outgoing: `acc → follows → their follows` AND `acc → follows → their followers`
405
- - Incoming: `acc ← followers → their follows` AND `acc ← followers ← their followers`
406
- 3. Adds all new account IDs to `_visible_ids` (no full profiles — IDs only)
407
- 4. **Re-cascades SUSPECT** to newly visible accounts via two signals:
408
- - *Follow-graph cascade:* any newly visible account followed by a flagged account → SUSPECT
409
- - *IP cluster cascade:* any newly visible account sharing `ip_cluster_id` with a flagged account → SUSPECT (zero false positives — gang shares one IP; real accounts have unique IPs)
410
- 5. Cost: 2 steps, -0.02 score. Returns count of newly discovered IDs.
411
 
412
- **FLAG (free):**
413
-
414
- 1. Adds account to `_flagged`
415
- 2. Sets `_account_statuses[acc_id] = "confirmed_fake"`
416
- 3. **Dual cascade** to SUSPECT:
417
- - *Cascade 1 — Follow-graph:* For every neighbor in `_live_edges[acc_id]`
418
- (accounts the flagged user follows), if the neighbor is visible and NORMAL → SUSPECT.
419
- Gang members follow each other at density 0.70+, so this is high-precision.
420
- - *Cascade 2 — IP cluster:* Any visible account sharing the same `ip_cluster_id`
421
- as the flagged account → SUSPECT. Gang members all share `ip_gang_{seed}`;
422
- real and decoy accounts each have a unique IP cluster. Zero false positives.
423
- 4. Refreshes all already-inspected accounts that follow `acc_id`
424
- (their `flagged_neighbor_count` just increased, so risk scores change)
425
-
426
- **SUBMIT:**
427
- Computes final scores (see §6.3).
428
-
429
- ### 6.3 Reward Function
430
 
431
  ```
432
- tp = len(gang_ids ∩ flagged_ids) # true positives
433
- fp = len(flagged_ids - gang_ids) # false positives
434
- fn = len(gang_ids - flagged_ids) # false negatives
435
-
436
  base_reward = tp×1.0 − fp×0.5 − fn×0.3
437
 
438
- Win condition (task-dependent thresholds):
439
  easy/medium: recall ≥ 0.8 AND precision ≥ 0.7
440
  hard: recall ≥ 0.9 AND precision ≥ 0.8
441
 
442
- If WIN:
443
- +5.0 # full win bonus
444
- +3.0 # if perfect recall (tp == len(gang_ids))
445
- elif recall ≥ win_recall:
446
- +2.0 # partial win (high recall, low precision)
447
-
448
- Efficiency bonus (if SUBMIT called voluntarily with ≥50% steps remaining):
449
- +1.0
450
-
451
- Hard mode evasion penalty:
452
- −1.0 × evasion_count
453
-
454
- Forced submit (ran out of steps):
455
- −2.0
456
-
457
- Final score = base_reward + all bonuses/penalties
458
- ```
459
-
460
- **Example:** Easy task, found 9/10 gang members, flagged 2 innocent accounts,
461
- 25 of the 30 available steps used, submitted voluntarily with 5 steps left (< 50%):
462
-
463
- ```
464
- tp=9, fp=2, fn=1
465
- base = 9×1.0 − 2×0.5 − 1×0.3 = 9 − 1 − 0.3 = 7.7
466
- recall = 9/10 = 0.90 ≥ 0.8 ✓ precision = 9/11 = 0.82 ≥ 0.7 ✓
467
- +5.0 win bonus
468
- 0 efficiency bonus (steps_left=5 < 30×0.5=15)
469
- total = 7.7 + 5.0 = 12.7
470
  ```
471
 
472
- ### 6.4 Evasion (hard mode)
473
-
474
- The `evasion_schedule` in each episode defines trigger points. When
475
- `step_count >= event["step"]` and the event hasn't fired yet:
476
 
477
- **`unfollow_intragang`:** A fraction (`drop_rate=0.3`) of intra-gang edges are
478
- randomly removed from `_live_edges`. This destroys the graph signal mid-investigation.
479
- The agent sees `mutual_follow_rate` and `flagged_neighbor_count` drop on
480
- re-inspection. Hard mode fires this 4 times (steps 15, 30, 45, 60).
481
-
482
- **`rename_count`:** A random subset of gang members get `name_change_count += 1`.
483
- This is a visual signal — the agent should notice accounts that have changed
484
- their name multiple times.
485
 
486
  ---
487
 
488
- ## 7. Risk Scoring Mathematics
489
-
490
- **File:** `server/scoring.py`
491
 
492
- All five functions are **stateless and deterministic** — no side effects, no
493
- global state. They are called inside `_build_profile()` every time an account
494
- is inspected or a neighbor is re-profiled after a FLAG.
495
 
496
- ### 7.1 Node Risk
497
 
498
- Captures content-based fakeness signals:
499
 
500
- ```
501
- node_risk = 0.60 × photo_reuse_score + 0.40 × bio_template_score
502
- ```
503
-
504
- Photo reuse gets 60% weight because it is harder to spoof (requires actual
505
- pHash fingerprint matching against a celebrity photo database).
506
-
507
- **Example:** Gang member with `photo_reuse=0.87`, `bio_template=0.74`:
508
-
509
- ```
510
- node_risk = 0.60 × 0.87 + 0.40 × 0.74 = 0.522 + 0.296 = 0.818
511
- ```
512
-
513
- ### 7.2 Behavior Risk
514
-
515
- Captures temporal anomalies:
516
-
517
- ```
518
- age_norm = min(1.0, account_age_days / 365.0)
519
- behavior_risk = 0.55 × (1 − age_norm) + 0.45 × post_hour_cluster_score
520
- ```
521
-
522
- `(1 − age_norm)` is high for newly created accounts (fakes are created right
523
- before the operation starts). `post_hour_cluster_score` measures alignment with
524
- the flagged cluster's mean posting hour (see §7.5).
525
-
526
- **Example:** Gang member, `account_age_days=67`, `post_hour_cluster_score=0.91`:
527
-
528
- ```
529
- age_norm = 67/365 = 0.184
530
- behavior_risk = 0.55×(1−0.184) + 0.45×0.91 = 0.55×0.816 + 0.4095
531
- = 0.449 + 0.410 = 0.859
532
- ```
533
-
534
- ### 7.3 Graph Risk
535
-
536
- The most predictive signal once the investigation has started:
537
-
538
- ```
539
- flagged_neighbor_ratio = flagged_neighbor_count / max(inspected_neighbor_count, 1)
540
- graph_risk = 0.45 × flagged_neighbor_ratio
541
- + 0.35 × mutual_follow_rate
542
- + 0.20 × avg_neighbor_photo_reuse
543
- ```
544
-
545
- `flagged_neighbor_ratio` gets 45% weight — if several of this account's friends
546
- are already confirmed fakes, this account is very likely fake too.
547
-
548
- **Example:** After 3 gang members flagged; inspecting a 4th gang member:
549
-
550
- ```
551
- flagged_neighbor_count = 3 (3 already-flagged accounts in its follow list)
552
- inspected_neighbor_count = 4 (total inspected follows)
553
- mutual_follow_rate = 0.78 (gang mutually follow heavily)
554
- avg_neighbor_photo_reuse = 0.81
555
-
556
- flagged_neighbor_ratio = 3/4 = 0.75
557
- graph_risk = 0.45×0.75 + 0.35×0.78 + 0.20×0.81
558
- = 0.338 + 0.273 + 0.162 = 0.773
559
- ```
560
-
561
- ### 7.4 Hub Legitimacy
562
-
563
- Protects celebrities and legitimate large accounts from false positives:
564
-
565
- ```
566
- F_MAX = 1,000,000
567
- followers_norm = min(1.0, log(1+follower_count) / log(1+F_MAX))
568
- follow_ratio_norm = min(1.0, (following_count / max(follower_count, 1)) / 5.0)
569
- age_norm = min(1.0, account_age_days / 365.0)
570
-
571
- hub_legitimacy = 0.45 × followers_norm
572
- + 0.25 × (1 − follow_ratio_norm)
573
- + 0.20 × age_norm
574
- + 0.10 × (1 − suspicious_mutual_ratio)
575
- ```
576
-
577
- Four signals of legitimacy:
578
-
579
- - Large log-scaled follower count (0.45 weight) — genuine celebrities have
580
- millions; fake accounts peak at ~8,000
581
- - Low follow-to-follower ratio (0.25 weight) — celebs follow few, are followed
582
- by many; fakes follow aggressively
583
- - Old account (0.20 weight) — real celebrities have accounts years old
584
- - Not mutually following suspicious accounts (0.10 weight) — a celeb being
585
- *followed by* fakes doesn't make the celeb fake
586
-
587
- **Example — Celebrity with 2,000,000 followers:**
588
-
589
- ```
590
- followers_norm = log(2,000,001) / log(1,000,001) = 14.509/13.816 = 1.0 (capped)
591
- follow_ratio_norm = (200 / 2,000,000) / 5.0 = 0.00002 ≈ 0.0
592
- age_norm = min(1.0, 2000/365) = 1.0
593
-
594
- hub_legitimacy = 0.45×1.0 + 0.25×(1−0.0) + 0.20×1.0 + 0.10×1.0 = 1.00
595
- ```
596
-
597
- **Example — Gang member:**
598
-
599
- ```
600
- followers_norm = log(1+3422) / log(1,000,001) = 8.138/13.816 = 0.589
601
- follow_ratio_norm = min(1.0, (847/3422)/5.0) = 0.0495
602
- age_norm = 67/365 = 0.184
603
-
604
- hub_legitimacy = 0.45×0.589 + 0.25×(1−0.0495) + 0.20×0.184 + 0.10×0.9
605
- = 0.265 + 0.238 + 0.037 + 0.090 = 0.630
606
- ```
607
-
608
- ### 7.5 Post-Hour Cluster Score
609
-
610
- Computed dynamically inside `environment.py`, not in `scoring.py`:
611
-
612
- ```
613
- mean_h = average avg_post_hour across all currently flagged accounts
614
- diff = min(|acc_hour − mean_h|, 24 − |acc_hour − mean_h|) # wrap-around
615
- post_hour_cluster_score = max(0.0, 1.0 − diff / 6.0)
616
- ```
617
-
618
- The wrap-around handles the midnight boundary (e.g., 23:00 and 01:00 are 2 hours
619
- apart, not 22). A score of 1.0 means posting at exactly the same hour as the
620
- flagged cluster. A score of 0.0 means ≥6 hours away.
621
-
622
- **Why 6.0 as the divisor:** 6 hours is a generous "different time zone" threshold.
623
- If you post within 6 hours of the gang's schedule, you get partial credit.
624
-
625
- **Example:** Gang posts at mean=14.0. Inspecting an account posting at 14.3:
626
-
627
- ```
628
- diff = |14.3 − 14.0| = 0.3
629
- post_hour_cluster_score = 1.0 − 0.3/6.0 = 0.950
630
- ```
631
-
632
- ### 7.6 Composite Fake Risk
633
-
634
- ```
635
- fake_risk = clip(
636
- 0.30 × node_risk
637
- + 0.25 × behavior_risk
638
- + 0.45 × graph_risk
639
- − 0.25 × hub_legitimacy,
640
- 0.0, 1.0
641
- )
642
- ```
643
-
644
- Weight rationale:
645
-
646
- - **Graph risk 0.45** — structural signals are hardest for fakes to hide.
647
- Mutual follow density requires real coordination; once you find one member,
648
- the whole cluster lights up.
649
- - **Node risk 0.30** — content signals are strong but can appear on decoys.
650
- - **Behavior risk 0.25** — temporal clustering is a reliable early signal,
651
- especially before any flags are set.
652
- - **Hub legitimacy −0.25** — subtractive discount. A celebrity with 5M followers
653
- has hub_legitimacy ≈ 1.0, so even if gang members follow them, their risk
654
- formula produces: `0.30×0.02 + 0.25×0.05 + 0.45×0.10 − 0.25×1.0 ≈ −0.19 → clipped to 0.0`
655
-
656
- **Full gang member example** (after 3 flags set):
657
-
658
- ```
659
- node_risk = 0.818 (photo=0.87, bio=0.74)
660
- behavior_risk = 0.859 (age=67d, cluster_score=0.91)
661
- graph_risk = 0.773 (ratio=0.75, mutual=0.78, nbr_photo=0.81)
662
- hub_legitimacy= 0.630 (3k followers, 67d old, no celeb)
663
-
664
- fake_risk = 0.30×0.818 + 0.25×0.859 + 0.45×0.773 − 0.25×0.630
665
- = 0.245 + 0.215 + 0.348 − 0.158
666
- = 0.650
667
- ```
668
-
669
- ### 7.7 Risk Classification
670
-
671
- ```
672
- fake_risk < 0.35 → "normal"
673
- 0.35 ≤ risk < 0.60 → "suspect"
674
- risk ≥ 0.60 → "confirmed_fake" (formula-level; explicit flag overrides)
675
- ```
676
-
677
- ### 7.8 Grader Score (Submission Metric)
678
-
679
- This normalised [0.0, 1.0] score is returned by the `/grader` endpoint:
680
-
681
- ```
682
- recall = tp / 10
683
- precision = tp / max(tp + fp, 1)
684
- efficiency = max(0.0, (max_steps − steps_used) / max_steps)
685
-
686
- if recall ≥ 0.8 AND precision ≥ 0.7:
687
- score = 0.55 + 0.20×recall + 0.15×precision + 0.10×efficiency
688
- else:
689
- score = 0.30×recall + 0.10×precision
690
- ```
691
-
692
- **Maximum possible score:** `0.55 + 0.20×1.0 + 0.15×1.0 + 0.10×1.0 = 1.00`
693
- (requires all 10 found, no false positives, and 0 steps used — perfect play)
694
-
695
- **Win threshold score:** `0.55 + 0.20×0.8 + 0.15×0.7 + 0.10×0 = 0.55 + 0.16 + 0.105 = 0.815`
696
-
697
- **Partial credit examples:**
698
-
699
- - Found 6/10, no false positives: `0.30×0.6 + 0.10×1.0 = 0.18 + 0.10 = 0.28`
700
- - Found 9/10, 3 false positives: recall=0.9, precision=9/12=0.75 → win: `0.55 + 0.18 + 0.113 = 0.843`
701
-
702
- ---
703
-
704
- ## 8. Account Status State Machine
705
-
706
- ```
707
- ┌──────────────────────────────────────┐
708
- │ │
709
- INSPECT INSPECT
710
- │ │
711
- ▼ ▼
712
- ┌──────────────┐ FLAG cascade ┌──────────────────┐
713
- │ NORMAL │ ─────────────────► │ SUSPECT │
714
- └──────────────┘ (neighbor of └──────────────────┘
715
- │ flagged) │
716
- │ │
717
- FLAG(account) FLAG(account)
718
- │ │
719
- ▼ ▼
720
- ┌──────────────────────────────────────────────────┐
721
- │ CONFIRMED_FAKE │
722
- └──────────────────────────────────────────────────┘
723
-
724
- UNFLAG(account)
725
-
726
-
727
- (status cleared → NORMAL)
728
- ```
729
-
730
- **When FLAG(X) is called:**
731
-
732
- 1. X → CONFIRMED_FAKE
733
- 2. **Dual SUSPECT cascade:**
734
- - *Follow-graph:* For every account Y that X follows (`_live_edges[X]`):
735
- if Y is visible AND Y is NORMAL → Y becomes SUSPECT
736
- - *IP cluster:* For every visible account Z sharing X's `ip_cluster_id`:
737
- if Z is not flagged AND Z is NORMAL → Z becomes SUSPECT
738
- (gang members share `ip_gang_{seed}`; real accounts have unique IPs → zero false positives)
739
- 3. All already-inspected accounts that follow X are re-profiled
740
- (their `flagged_neighbor_count` increases, which raises their `fake_risk_score`)
741
-
742
- **Why SUSPECT matters:**
743
-
744
- - The `suspect_ids` field in the observation lists all SUSPECT accounts not yet inspected
745
- - Both the rule engine and the LLM treat these as highest priority for the next INSPECT
746
- - This creates an efficient cascade: flag one → inspect suspects → some are gang
747
- → flag them → more suspects appear → repeat until cluster is exhausted
748
-
749
- **Example cascade on easy task:**
750
-
751
- ```
752
- Step 1: INSPECT acc_0003 (gang member) → no flags yet, fake_risk ≈ 0.45
753
- Step 2: FLAG acc_0003
754
- → acc_0017, acc_0029, acc_0041 become SUSPECT (acc_0003 follows them)
755
- → obs.suspect_ids = ["acc_0017", "acc_0029", "acc_0041"]
756
- Step 3: INSPECT acc_0017 (gang member) → fake_risk now 0.72 (flagged_neighbor_count=1)
757
- Step 4: FLAG acc_0017
758
- → acc_0003 (already flagged), acc_0029, acc_0041, acc_0055 get SUSPECT
759
- → acc_0003, acc_0017 profiles refreshed (their mutual flags increased)
760
- Step 5: INSPECT acc_0029 → fake_risk = 0.81 (flagged_neighbor_count=2)
761
- ...
762
- ```
763
-
764
- Each FLAG makes the next gang member easier to find because their risk score rises.
765
 
766
  ---
767
 
768
- ## 9. The LLM Policy (Qwen3 via Bedrock)
769
 
770
  **File:** `agent/policy.py`
771
 
772
- ### 9.1 Model
773
 
774
- **Qwen3-Next-80B** accessed via AWS Bedrock Marketplace:
775
-
776
- ```python
777
- MODEL_ID = "qwen.qwen3-next-80b-a3b"
778
- ```
779
-
780
- Called via the Bedrock Converse API:
781
-
782
- ```python
783
- client.converse(
784
- modelId=MODEL_ID,
785
- messages=[{"role": "user", "content": [{"text": prompt}]}],
786
- system=[{"text": SYSTEM_PROMPT}],
787
- inferenceConfig={"maxTokens": 512, "temperature": 0.4}
788
- )
789
- ```
790
-
791
- Temperature 0.4 is low enough for consistent action format but high enough to
792
- avoid degenerate repetition.
793
-
794
- ### 9.2 Prompt Construction
795
 
796
  Every step, the policy builds a prompt from three components:
797
 
@@ -803,612 +275,178 @@ Every step, the policy builds a prompt from three components:
803
  What is your next action?
804
  ```
805
 
806
- ### 9.3 Observation Formatting
807
-
808
- The `_format_observation()` function converts the typed `FakeGangObservation`
809
- into a text block. Accounts are **sorted by `fake_risk_score` descending**,
810
- with status badges prepended:
811
-
812
- ```
813
- TASK: EASY | Steps remaining: 22
814
- Currently flagged (3/10): acc_0003, acc_0017, acc_0029
815
- Suspects not yet inspected (4): acc_0041, acc_0055, acc_0062, acc_0078
816
-
817
- PROFILED ACCOUNTS (sorted by fake_risk_score — highest first):
818
- [status | risk | node beh graph hub | photo bio mutual | comment ip_count]
819
- CONFIRMED_FAKE acc_0029 ◀ FLAGGED: risk=0.821 | node=0.82 beh=0.77 graph=0.86 hub=0.63
820
- SUSPECT acc_0041: risk=0.714 | node=0.79 beh=0.81 graph=0.74 hub=0.65 fnbr=3(!)
821
- SUSPECT acc_0055: risk=0.681 | node=0.71 beh=0.74 graph=0.69 hub=0.67 fnbr=2(!)
822
- NORMAL acc_0022: risk=0.121 | node=0.09 beh=0.31 graph=0.03 hub=0.84 [HUB?]
823
- ...
824
-
825
- KNOWN UNINSPECTED IDs: acc_0062, acc_0078, acc_0091, ...
826
-
827
- Environment message: Flagged acc_0029 as suspected fake.
828
- ```
829
-
830
- Key formatting choices:
831
 
832
- - `fnbr=N(!)` highlights when `flagged_neighbor_count > 0` — this is the most
833
- actionable graph signal
834
- - `[HUB?]` appears when `hub_legitimacy_score > 0.70` — warns the LLM not to flag
835
- - Status badge width is fixed (13 chars) for visual alignment
836
-
837
- ### 9.4 Required Response Format
838
 
839
  ```xml
840
  <thinking>
841
- Your reasoning here — which account is most suspicious and why,
842
- what signal you're acting on, what your next move is.
843
  </thinking>
844
  <action>
845
  INSPECT acc_0041
846
  </action>
847
  ```
848
 
849
- The parser (`_parse_action`) extracts the content inside `<action>...</action>`
850
- using regex, then matches to action types. If parsing fails entirely, the
851
- fallback inspects the highest-scored uninspected account.
852
-
853
- ### 9.5 Retry Logic
854
-
855
- ```python
856
- for attempt in range(3):
857
- try:
858
- raw = invoke_qwen(...)
859
- action = _parse_action(raw, obs)
860
- return action, raw
861
- except Exception as exc:
862
- wait = 2 ** attempt # 1s, 2s, 4s
863
- time.sleep(wait)
864
-
865
- # All retries failed → heuristic fallback
866
- return _heuristic_fallback(obs), "[FALLBACK]"
867
- ```
868
 
869
  ---
870
 
871
- ## 10. Reflexion — How the Agent Learns
872
 
873
  **Files:** `agent/reflection.py`, `agent/memory.py`
874
 
875
- The agent **cannot** update Qwen3's weights — Bedrock is a black-box API.
876
- Instead, it learns via **Reflexion**: post-episode lessons are written as text
877
- and injected into future prompts.
878
 
879
- ### 10.1 Learning Loop
880
 
881
- ```
882
- Episode N
883
- 1. LLM acts using: system_prompt + reflections[1..4] + best_trajectory
884
- 2. Episode ends → WIN or LOSS
885
- 3. Post-episode learning:
886
- If LOSS:
887
- → generate_reflection(action_log, outcome) → Qwen writes a lesson
888
- → lesson stored to memory/reflections_easy.jsonl
889
- If WIN:
890
- → save trajectory to memory/best_trajectory_easy.json (if better reward)
891
- → generate_success_reflection() → Qwen writes what worked
892
- → stored to reflections
893
-
894
- Episode N+1
895
- → get_reflections("easy", n=4) returns last 4 lessons
896
- → get_best_trajectory("easy") returns best win as few-shot example
897
- → both injected into prompt → LLM has learned from its past
898
- ```
899
 
900
- ### 10.2 Reflection Generation
901
 
902
- A separate Qwen3 call is made after each episode with this prompt:
903
 
904
  ```
905
- CASE DEBRIEF — Episode 12
906
- Task difficulty: MEDIUM
907
- Outcome: FAILURE
908
- Steps used: 50/50
909
- Result: [LOSS] TP=6 FP=3 FN=4 Recall=0.60 Precision=0.67
910
-
911
- INVESTIGATION LOG:
912
- 1. INSPECT acc_0022
913
- 2. INSPECT acc_0037
914
- ...
915
- 20. SUBMIT
916
-
917
- Write a 2-3 sentence lesson for your future self based on this case.
918
- ```
919
-
920
- **Example generated reflection:**
921
-
922
- > "The starting accounts were all real; I wasted 8 steps inspecting low-signal nodes
923
- > before pivoting. When photo_reuse and bio_template are both below 0.3 after 3 inspections,
924
- > immediately use INVESTIGATE_NETWORK to jump to a different graph region.
925
- > Once I found the first gang member at step 14, I should have cascaded faster
926
- > via SUSPECT accounts rather than continuing to inspect unknown IDs."
927
-
928
- This lesson is stored and appears in Episode 13's prompt, causing the agent to
929
- pivot earlier and follow the cascade more aggressively.
930
-
931
- ### 10.3 Best Trajectory (Few-Shot Example)
932
-
933
- The first episode that wins is saved as a few-shot example. Every subsequent win
934
- replaces it only if the reward is higher. The trajectory appears in the prompt as:
935
 
 
 
 
936
  ```
937
- ━━━ EXAMPLE SUCCESSFUL CASE (task=easy, reward=+14.20) ━━━
938
- 1. INSPECT acc_0012
939
- 2. INSPECT acc_0037
940
- 3. FLAG acc_0037
941
- 4. INSPECT acc_0041 (suspect — cascaded from acc_0037)
942
- 5. FLAG acc_0041
943
- ...
944
- → [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00
945
- ```
946
-
947
- The LLM sees a concrete example of the exact pattern that leads to a perfect win,
948
- and mirrors this strategy.
949
-
950
- ### 10.4 Memory Persistence
951
 
952
- All memory is stored in `memory/` as flat files:
953
-
954
- ```
955
- memory/
956
- ├── reflections_easy.jsonl # one JSON entry per reflection
957
- ├── reflections_medium.jsonl
958
- ├── reflections_hard.jsonl
959
- ├── best_trajectory_easy.json # single best win per task
960
- ├── best_trajectory_medium.json
961
- ├── best_trajectory_hard.json
962
- ├── wins_easy.jsonl # episode-level win history (for alpha)
963
- ├── wins_medium.jsonl
964
- ├── wins_hard.jsonl
965
- ├── alpha_easy.json # current α for this task
966
- ├── alpha_medium.json
967
- └── alpha_hard.json
968
- ```
969
 
970
- The `memory/` directory is a Docker volume (`VOLUME ["/app/memory"]`), so all
971
- learning persists across container restarts and redeployments.
972
 
973
  ---
974
 
975
- ## 11. Hybrid Policy — The Novel Contribution
976
 
977
  **File:** `agent/hybrid_policy.py`
978
 
979
- The key insight: **a new LLM agent starts dumb but improves over time. A rule
980
- engine is always consistent but cannot adapt.** The hybrid policy exploits both:
981
- rules provide a safety net early while the LLM builds its track record; once the
982
- LLM proves itself, rules step back.
983
-
984
- ### 11.1 The Problem with Pure LLM
985
-
986
- In the first few episodes:
987
 
988
- - No reflections have been generated yet
989
- - No successful trajectory to use as a few-shot example
990
- - The LLM is essentially guessing based only on the system prompt
991
- - Win rate on `easy` episodes ≈ 30% at episode 1 (single-digit recall)
992
 
993
- A deterministic rule engine using `fake_risk_score` thresholds would achieve
994
- ~60% win rate on `easy` from episode 1, with zero learning overhead.
995
 
996
- ### 11.2 The Problem with Pure Rules
997
 
998
- Rules use fixed thresholds. They cannot:
999
-
1000
- - Adapt to the evasion events in hard mode
1001
- - Prioritise which SUSPECT to inspect based on context
1002
- - Recognise unusual configurations (e.g., decoys clustered near gang members)
1003
- - Balance exploration vs. exploitation optimally
1004
-
1005
- The LLM, given enough reflections, learns these nuances.
1006
-
1007
- ### 11.3 Alpha: The Trust Weight
1008
-
1009
- α (alpha) is a per-task value in [0.20, cap] representing the agent's current
1010
- trust in the LLM:
1011
 
1012
  ```
1013
  reflection_factor = min(1.0, n_reflections / 4.0)
1014
  raw = 0.20 + reflection_factor × (0.80 × recent_win_rate + 0.12)
1015
  α = clamp(raw, 0.20, cap)
1016
-
1017
- where:
1018
- recent_win_rate = fraction of wins over the last 10 episodes for this task (0.0–1.0)
1019
- reflection_factor = min(1.0, n_reflections / 4.0)
1020
  ```
1021
 
1022
- **Per-task alpha caps** prevent α from climbing so high that the LLM overrides
1023
- correct high-confidence rule-engine decisions (e.g., Priority 2 INSPECT SUSPECT
1024
- at confidence=0.95):
1025
-
1026
- | Task | α cap | Rationale |
1027
- | ------ | ------ | ------------------------------------------------------------------ |
1028
- | easy | 0.50 | Rule engine alone achieves ~91% — LLM should assist, not override |
1029
- | medium | 0.70 | Decoys require some LLM judgment, but cascade must stay |
1030
- | hard | 0.85 | LLM needs latitude for evasion adaptation, but safety rules remain |
1031
-
1032
- `reflection_factor` ensures the LLM must accumulate at least **4 reflections**
1033
- before it can reach meaningful trust — pure win rate is not enough, because the LLM
1034
- needs to have demonstrably learned from failures.
1035
 
1036
  **Alpha trajectory over training (easy task, cap=0.50):**
1037
 
1038
- | Episode | Wins (last 10) | Reflections | reflection_factor | raw | α (capped) |
1039
- | ------- | -------------- | ----------- | ----------------- | ---- | -------------- |
1040
- | 1 | 0/0 → wr=0% | 0 | 0.00 | 0.20 | 0.20 |
1041
- | 5 | 1/5 → wr=20% | 4 | 1.00 | 0.48 | 0.48 |
1042
- | 10 | 5/10 → wr=50% | 9 | 1.00 | 0.72 | **0.50** |
1043
- | 20 | 8/10 → wr=80% | 19 | 1.00 | 0.96 | **0.50** |
1044
-
1045
- α starts at 0.20 (rules dominate) and climbs toward the task-specific cap as
1046
- the LLM wins consistently and accumulates lessons. The cap ensures the rule
1047
- engine retains veto power over high-confidence structural decisions.
1048
-
1049
- ### 11.4 Rule Action + Confidence
1050
-
1051
- `get_rule_action(obs)` returns `(FakeGangAction, float)` where the float is
1052
- the rule's confidence in its own decision:
1053
 
1054
- | Situation | Action | Confidence |
1055
- | ------------------------------------------------- | --------------------- | ---------------------------------- |
1056
- | Steps remaining = 0 | SUBMIT | 1.00 |
1057
- | Uninspected SUSPECT accounts exist | INSPECT suspects[0] | 0.95 |
1058
- | Inspected account: fake_risk ≥ 0.85 | FLAG that account | 0.95 |
1059
- | Inspected account: fake_risk in [threshold, 0.85) | FLAG that account | 0.70 + (risk − threshold) × 0.60 |
1060
- | 10 accounts already flagged | SUBMIT | 0.85 |
1061
- | Steps remaining ≤ 3 | SUBMIT | 0.90 |
1062
- | Uninspected accounts available | INSPECT top candidate | 0.30 |
1063
- | Nothing to do | SUBMIT | 0.75 |
1064
 
1065
- Confidence values are calibrated such that:
1066
 
1067
- - Structural/safety decisions (out of steps, cascade suspects) have confidence ≥ 0.90
1068
- - Direct flag decisions have confidence ≥ 0.70
1069
- - Exploratory decisions have confidence 0.30 (the rule is just suggesting, not insisting)
1070
 
1071
- ### 11.5 Blending Decision
1072
-
1073
- ```python
1074
- rule_action, rule_conf = get_rule_action(obs)
1075
- llm_action, raw_llm = get_action(obs, reflections, few_shot, temperature)
1076
-
1077
- if rule_action == llm_action: # same type AND same account_id
1078
- mode = "agree"
1079
- final = llm_action
1080
-
1081
- elif rule_conf >= alpha: # rule is confident enough to override
1082
- mode = f"rule_override(c={rule_conf:.2f},α={alpha:.2f})"
1083
- final = rule_action
1084
-
1085
- else: # LLM is trusted; rule doesn't insist
1086
- mode = f"llm(c={rule_conf:.2f}<α={alpha:.2f})"
1087
- final = llm_action
1088
- ```
1089
 
1090
- **Why this works mathematically:**
1091
 
1092
- The condition `rule_conf >= alpha` creates a natural threshold system:
1093
-
1094
- - At **α=0.20** (early training, no history):
1095
-
1096
- - Rules win whenever confidence ≥ 0.20
1097
- - Even the exploratory INSPECT (confidence=0.30, the lowest rule confidence) still beats α=0.20
1098
- - So rules dominate: ~90% of decisions are rule-driven
1099
- - Effectively acts like the rule-based baseline agent
1100
- - At **α=0.50** (moderate trust, mixed results):
1101
-
1102
- - Rules win when confidence ≥ 0.50
1103
- - Safety decisions (suspects, forced submit) still override: conf=0.95 > 0.50
1104
- - Exploratory decisions (conf=0.30) now go to LLM: 0.30 < 0.50
1105
- - The LLM controls exploration; rules control safety
1106
- - At **α=0.84** (high trust, consistent wins):
1107
-
1108
- - Rules win only when confidence ≥ 0.84
1109
- - Only the highest-confidence situations still override: forced submit (1.00),
1109
- uninspected suspects (0.95), flags at fake_risk ≥ 0.85 (0.95), low-step SUBMIT (0.90), and 10-flagged SUBMIT (0.85)
1110
- - Everything else goes to the LLM, including lower-confidence flag decisions
1112
- - At **α=cap** (maximum trust for the task):
1113
-
1114
- - On easy (cap=0.50): rules still override for suspects (0.95), flags (0.70+),
1115
- and forced submits (1.00) — only exploratory INSPECTs (0.30) go to LLM
1116
- - On hard (cap=0.85): rules only override the highest-confidence situations
1117
- (suspects, forced submit); LLM controls flag and exploration decisions
1118
-
1119
- ### 11.6 Disagreement Examples
1120
-
1121
- **Example 1 — Early training (α=0.25), LLM exploring, rule insisting on suspect:**
1122
-
1123
- ```
1124
- Rule: INSPECT acc_0041 (SUSPECT account) confidence=0.95
1125
- LLM: INSPECT acc_0099 (random exploration)
1126
- Rule wins: 0.95 ≥ 0.25 → INSPECT acc_0041
1127
- mode = "rule_override(c=0.95,α=0.25)"
1128
- ```
1129
-
1130
- **Example 2 — Mid training (α=0.60), LLM flags a high-risk account:**
1131
-
1132
- ```
1133
- Rule: INSPECT acc_0041 (uninspected suspect) confidence=0.95
1134
- LLM: FLAG acc_0055 (fake_risk=0.79, already inspected)
1135
- Rule wins: 0.95 ≥ 0.60 → INSPECT acc_0041
1136
- mode = "rule_override(c=0.95,α=0.60)"
1137
- ```
1138
-
1139
- *(Both actions are useful; the rule correctly prioritises cascade suspects
1140
- before random flags)*
1141
-
1142
- **Example 3 — High trust (α=0.85), LLM has learned to prioritise smarter:**
1143
-
1144
- ```
1145
- Rule: INSPECT acc_0041 (exploratory, conf=0.30)
1146
- LLM: FLAG acc_0055 (fake_risk=0.88, very high confidence)
1147
- LLM wins: 0.30 < 0.85 → FLAG acc_0055
1148
- mode = "llm(c=0.30<α=0.85)"
1149
- ```
1150
-
1151
- **Example 4 — Both agree (most common case in late training):**
1152
-
1153
- ```
1154
- Rule: INSPECT acc_0041 (SUSPECT, conf=0.95)
1155
- LLM: INSPECT acc_0041 (LLM also noticed the suspect badge)
1156
- mode = "agree"
1157
- ```
1158
-
1159
- ### 11.7 Alpha Persistence
1160
-
1161
- After every episode, `train.py` does:
1162
-
1163
- ```python
1164
- # Record outcome
1165
- memory.record_win(task, won, episode_num)
1166
-
1167
- # Recompute alpha with updated win history (per-task cap applied)
1168
- new_wr = memory.recent_win_rate(task, n=10)
1169
- new_alpha = compute_alpha(new_wr, n_reflections, task=current_task)
1170
-
1171
- # Save for next run (even if container restarts)
1172
- memory.save_alpha(task, new_alpha)
1173
- ```
1174
-
1175
- Alpha is stored in `memory/alpha_{task}.json` and loaded at the start of each
1176
- training run. This means the agent's trust level is preserved across Docker
1177
- restarts — it doesn't reset to 0.20 every time.
1178
-
1179
- ### 11.8 Mode Logging
1180
-
1181
- Every episode's metrics include a mode breakdown:
1182
-
1183
- ```json
1184
- {
1185
- "alpha_used": 0.42,
1186
- "mode_agree": 11,
1187
- "mode_rule": 7,
1188
- "mode_llm": 4
1189
- }
1190
- ```
1191
-
1192
- The training printer shows this per episode:
1193
-
1194
- ```
1195
- Ep 12 | easy | WIN | reward= +12.40 | recall=1.00 prec=0.91 | steps=21 | wr=60% | α=0.42 | agree=11 rule=7 llm=4
1196
- ```
1197
-
1198
- You can watch the transition: early episodes have high `rule` counts; later
1199
- episodes have high `agree` counts (LLM learned to make the same decisions as
1200
- the rules, but also brings in strategic reasoning the rules can't).
1201
 
1202
  ---
1203
 
1204
- ## 12. Training Loop End-to-End
1205
 
1206
  **File:** `train.py`
1207
 
1208
- ### 12.1 Curriculum
1209
 
1210
- | Phase | Episodes | Task | Goal |
1211
- | ----- | -------- | ------ | ------------------------------------------------------ |
1212
- | 1 | 1–20 | easy | Learn basic signal thresholds, build first reflections |
1213
- | 2 | 21–35 | medium | Handle decoys, learn evasion response |
1214
- | 3 | 36–50 | hard | Feature-only detection, persistent evasion |
1215
 
1216
  Seeds rotate deterministically: `seed = (episode_num + task_offset) % 50`
1217
- so the agent sees all 50 pre-generated episodes before revisiting any.
1218
 
1219
- ### 12.2 Per-Episode Flow
1220
 
1221
  ```
1222
  for ep in range(n_episodes):
1223
 
1224
- 1. DETERMINE TASK
1225
- current_task = curriculum_task(ep) or fixed task
1226
-
1227
- 2. COMPUTE ALPHA (per-task cap applied)
1228
- n_refs = memory.reflection_count(current_task)
1229
- wr = memory.recent_win_rate(current_task, n=10)
1230
- alpha = compute_alpha(wr, n_refs, task=current_task) # capped per task
1231
-
1232
- 3. LOAD CONTEXT
1233
- reflections = memory.get_reflections(task, n=4) # last 4 lessons
1234
- few_shot = memory.get_best_trajectory(task) # best win so far
1235
-
1236
- 4. RUN EPISODE (hybrid policy)
1237
- obs = env.reset(task, seed)
1238
- while not obs.done:
1239
- rule_action, rule_conf = get_rule_action(obs)
1240
- llm_action, raw_llm = get_action(obs, reflections, few_shot, α, temperature)
1241
- final = blend(rule_action, llm_action, rule_conf, alpha)
1242
- obs = env.step(final)
1243
-
1244
- 5. POST-EPISODE LEARNING
1245
- memory.record_win(task, won, ep)
1246
- new_alpha = compute_alpha(updated_wr, n_refs, task=current_task)
1247
- memory.save_alpha(task, new_alpha)
1248
-
1249
- if won:
1250
- memory.add_trajectory(task, action_log, final_msg, reward, ep)
1251
- if new_best_or_no_refs:
1252
- reflection = generate_success_reflection(...)
1253
- memory.add_reflection(task, reflection, ep, reward)
1254
- else:
1255
- reflection = generate_reflection(task, action_log, final_msg, ...)
1256
- memory.add_reflection(task, reflection, ep, reward)
1257
-
1258
- 6. LOG
1259
- print per-episode stats: task, win/loss, reward, recall, precision,
1260
- steps, win_rate, α, mode breakdown
1261
  ```
1262
 
1263
- ### 12.3 Metrics Saved
1264
-
1265
- Every 5 episodes, metrics are flushed to `runs/metrics.jsonl`:
1266
-
1267
- ```json
1268
- {
1269
- "episode": 15,
1270
- "task": "easy",
1271
- "seed": 14,
1272
- "won": true,
1273
- "reward": 13.20,
1274
- "steps_used": 23,
1275
- "recall": 1.00,
1276
- "precision": 0.91,
1277
- "action_log": ["INSPECT acc_0022", "INSPECT acc_0037", ...],
1278
- "final_message": "[WIN] TP=10 FP=1 FN=0 ...",
1279
- "n_reflections_used": 4,
1280
- "had_few_shot": true,
1281
- "alpha_used": 0.52,
1282
- "mode_agree": 13,
1283
- "mode_rule": 6,
1284
- "mode_llm": 4,
1285
- "timestamp": "2026-04-01T10:23:41"
1286
- }
1287
- ```
1288
 
1289
  ---
1290
 
1291
- ## 13. API Reference
1292
 
1293
  **File:** `server/app.py`
1294
 
1295
- ### GET /health
 
 
 
 
 
 
 
 
1296
 
1297
- ```json
1298
- {"status": "healthy"}
1299
- ```
1300
 
1301
- ### GET /tasks
1302
-
1303
- ```json
1304
- {
1305
- "tasks": ["easy", "medium", "hard"],
1306
- "descriptions": {
1307
- "easy": "50 accounts, 10 fakes, no evasion, 30 steps",
1308
- "medium": "200 accounts, 10 fakes + 20 decoys, evasion at step 20, 50 steps",
1309
- "hard": "1000 accounts, 10 fakes + 50 decoys, recurring evasion, 80 steps"
1310
- },
1311
- "action_schema": {
1312
- "action_type": ["inspect", "investigate_network", "flag", "unflag", "submit"],
1313
- "account_id": "string (required for all actions except submit)"
1314
- },
1315
- "score_range": [0.0, 1.0]
1316
- }
1317
- ```
1318
-
1319
- ### POST /reset
1320
-
1321
- Request:
1322
-
1323
- ```json
1324
- {"task": "easy", "seed": 0}
1325
- ```
1326
-
1327
- Response: `StepResponse` with initial observation.
1328
-
1329
- ### POST /step
1330
-
1331
- Request: Any `FakeGangAction`:
1332
-
1333
- ```json
1334
- {"action_type": "inspect", "account_id": "acc_0042"}
1335
- {"action_type": "flag", "account_id": "acc_0017"}
1336
- {"action_type": "submit"}
1337
- ```
1338
-
1339
- Response: `StepResponse` with updated observation, done flag, and reward.
1340
-
1341
- ### GET /state
1342
-
1343
- Returns current episode metadata:
1344
-
1345
- ```json
1346
- {
1347
- "episode_id": "uuid",
1348
- "step_count": 12,
1349
- "task": "easy",
1350
- "score_so_far": -0.12,
1351
- "evasion_count": 0,
1352
- "network_size": 50,
1353
- "gang_size": 10,
1354
- "episode_seed": 0
1355
- }
1356
- ```
1357
-
1358
- ### GET /grader
1359
-
1360
- Returns the normalised grader score after SUBMIT. Error 400 if episode not done.
1361
-
1362
- ```json
1363
- {"score": 0.91, "task": "easy", "episode_id": "uuid"}
1364
- ```
1365
-
1366
- ### POST /baseline
1367
-
1368
- Runs the rule-based agent on all three tasks (seed=0) and returns scores:
1369
-
1370
- ```json
1371
- {
1372
- "scores": {"easy": 0.91, "medium": 0.906, "hard": 0.9038},
1373
- "agent": "rule_based"
1374
- }
1375
- ```
1376
-
1377
- **Baseline performance across 50 seeds:**
1378
-
1379
- | Task | Seed=0 score | Win rate (50 seeds) | Mean score (50 seeds) |
1380
- | ------ | ------------ | ------------------- | --------------------- |
1381
- | easy | 0.91 | 100% | ~0.91 |
1382
- | medium | 0.906 | 84% | ~0.77 |
1383
- | hard | 0.9038 | 52% | ~0.47 |
1384
-
1385
- The baseline is a deterministic rule-based agent — no LLM, no learning. The
1386
- difficulty scaling is designed so that easy is consistently solvable, medium
1387
- requires some luck, and hard genuinely challenges frontier LLM agents via
1388
- evasion events that destroy graph signals mid-investigation.
1389
 
1390
  ---
1391
 
1392
- ## 14. Docker Deployment
1393
-
1394
- **File:** `server/Dockerfile`
1395
-
1396
- ### 14.1 Build
1397
 
1398
  ```bash
1399
- cd fake_gang_env
1400
  docker build -f server/Dockerfile -t graphstrike .
1401
- ```
1402
-
1403
- Build takes ~10 seconds because:
1404
-
1405
- - The `.dockerignore` excludes `episodes/` (109 MB), `memory/`, `runs/`
1406
- - Python wheels are pre-downloaded to `wheels/` — no network access during `pip install`
1407
- - No `apt-get` installs needed (every dependency installs from a pre-built wheel)
1408
-
1409
- ### 14.2 Run
1410
 
1411
- ```bash
1412
  docker run -it \
1413
  -e AWS_ACCESS_KEY_ID=your_key \
1414
  -e AWS_SECRET_ACCESS_KEY=your_secret \
@@ -1418,239 +456,67 @@ docker run -it \
1418
  graphstrike
1419
  ```
1420
 
1421
- The volumes preserve all learning between runs. When you restart the container,
1422
- the agent continues from where it left off (α values, reflections, best trajectories).
1423
 
1424
- ### 14.3 Environment Variables
1425
 
1426
- | Variable | Default | Description |
1427
- | ------------------------- | --------------- | ------------------------------------- |
1428
- | `AWS_ACCESS_KEY_ID` | (required) | For Bedrock/Qwen3 access |
1429
- | `AWS_SECRET_ACCESS_KEY` | (required) | For Bedrock/Qwen3 access |
1430
- | `AWS_DEFAULT_REGION` | `us-east-1` | Bedrock region |
1431
- | `TRAIN_TASK` | `` (curriculum) | Fix to `easy`/`medium`/`hard` |
1432
- | `TRAIN_EPISODES` | `50` | Total training episodes |
1433
- | `TRAIN_TEMP` | `0.4` | LLM sampling temperature |
1434
- | `TRAIN_VERBOSE` | `0` | Set `1` for per-step action logging |
1435
- | `SERVER_PORT` | `8000` | FastAPI port |
1436
 
1437
- ### 14.4 Startup Sequence (run.sh)
1438
 
1439
  ```
1440
- 1. Validate AWS credentials (exits if missing)
1441
- 2. python server/generator.py → generates/overwrites 150 episode JSON files (~1s)
1442
  3. uvicorn server.app:app → starts the environment server
1443
- 4. Python urllib health check polls /health until ready (no curl needed)
1444
  5. python train.py → runs the full training loop
1445
  ```
1446
 
1447
  ---
1448
 
1449
- ## 15. Submission Requirements
1450
-
1451
- All submission requirements are satisfied. The environment is deployed at
1452
- [huggingface.co/spaces/Pandago/graphstrike](https://huggingface.co/spaces/Pandago/graphstrike).
1453
-
1454
- ### 15.1 Required Endpoints
1455
-
1456
- | Endpoint | Method | Status | Description |
1457
- | ------------- | ------ | ------ | -------------------------------------------------------- |
1458
- | `/health` | GET | ✅ | Returns `{"status": "healthy"}` |
1459
- | `/tasks` | GET | ✅ | 3 tasks +`action_schema` + `score_range: [0.0, 1.0]` |
1460
- | `/reset` | POST | ✅ | Accepts `{task, seed}`, returns initial observation |
1461
- | `/step` | POST | ✅ | Accepts any valid action, returns updated observation |
1462
- | `/state` | GET | ✅ | Returns episode metadata (step count, task, score) |
1463
- | `/grader` | GET | ✅ | Returns normalised [0.0, 1.0] score after SUBMIT |
1464
- | `/baseline` | POST | ✅ | Runs rule-based agent on all 3 tasks, returns scores |
1465
-
1466
- ### 15.2 /tasks with action_schema
1467
-
1468
- The `/tasks` endpoint returns the `action_schema` dict listing all valid
1469
- `action_type` values and the `account_id` field description. Graders can
1470
- discover the full action space without reading code.
1471
-
1472
- ### 15.3 /grader — Normalised Scoring
1473
-
1474
- After calling `SUBMIT` (via `/step`), call `GET /grader` to retrieve the
1475
- normalised [0.0, 1.0] grader score. Returns 400 if the episode is not yet done.
1476
-
1477
- The score formula (see §7.8) rewards recall, precision, and efficiency.
1478
- Maximum score 1.0 requires finding all 10 gang members with no false positives
1479
- and using no steps. The grader is **deterministic** — same actions produce same score.
1480
-
1481
- ### 15.4 /baseline — Reproducible Baseline Agent
1482
-
1483
- `POST /baseline` imports `inference.py`'s `run_rule_based_episode` and runs it
1484
- on all three tasks with seed=0. Returns:
1485
-
1486
- ```json
1487
- {"scores": {"easy": 0.91, "medium": 0.906, "hard": 0.9038}, "agent": "rule_based"}
1488
- ```
1489
-
1490
- **Reproducibility:** The baseline is fully deterministic — no randomness, no LLM calls.
1491
- Calling `/baseline` 3+ times in succession produces **identical scores** every time.
1492
- The evasion flags (`_fired_*` attributes) are properly cleared on `reset()`,
1493
- ensuring episodes replay identically across runs.
1494
-
1495
- ### 15.5 inference.py
1496
-
1497
- **Library mode** (used by `/baseline`):
1498
-
1499
- ```python
1500
- from inference import run_rule_based_episode
1501
- score = run_rule_based_episode(env, task="easy", seed=0)
1502
- # Returns float in [0.0, 1.0]
1503
- ```
1504
-
1505
- **CLI mode** (connect to running server):
1506
-
1507
- ```bash
1508
- python inference.py --url http://localhost:8000
1509
- # → {"scores": {"easy": 0.91, "medium": 0.906, "hard": 0.9038}, "agent": "rule_based"}
1510
- ```
1511
-
1512
- **CLI mode** (local, no server needed):
1513
-
1514
- ```bash
1515
- python inference.py --local
1516
- ```
1517
-
1518
- The rule-based strategy:
1519
-
1520
- 1. If SUSPECT accounts are uninspected → INSPECT highest suspect
1521
- 2. If any inspected account has `fake_risk_score ≥ threshold` and not flagged → FLAG it
1522
- 3. If no immediate flag or suspect → INSPECT highest-risk uninspected account
1523
- 4. If steps ≤ 3 or 10 flags placed → SUBMIT
1524
-
1525
- Thresholds by task: easy=0.60, medium=0.50, hard=0.45.
1526
-
1527
- ### 15.6 validate.py — 24-Point Pre-Submission Validator
1528
-
1529
- Runs 24 checks split between local (no server) and HTTP:
1530
-
1531
- ```bash
1532
- python validate.py --local # 9 local checks only
1533
- python validate.py --url http://... # all 24 checks (requires running server)
1534
- ```
1535
-
1536
- Checks include:
1537
-
1538
- - scoring.py math correctness (gang risk ≥ 0.60, celebrity risk < 0.20, perfect score = 1.00)
1539
- - models.py has all new fields (fake_risk_score, suspect_ids, AccountStatus)
1540
- - environment.py SUSPECT cascade triggers after FLAG
1541
- - inference.py runs without error and returns [0,1] float
1542
- - episodes have new features (comment_repeat_score, shared_ip_count, celeb_ids)
1543
- - /health reachable
1544
- - /tasks has action_schema and score_range
1545
- - /reset works for all three tasks
1546
- - /step supports INSPECT, FLAG, SUBMIT
1547
- - /grader returns [0,1] float after SUBMIT
1548
- - /baseline returns 3 valid scores
1549
-
1550
- **All 24/24 checks pass.**
1551
-
1552
- ### 15.7 Judging Criteria Alignment
1553
-
1554
- | Criterion | Weight | How GraphStrike addresses it |
1555
- | ---------------------------- | ------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
1556
- | **Domain quality** | 30% | Real-world fraud detection domain; signals modelled on actual Instagram fake-account patterns (IP clustering, photo reuse, bio templates, temporal coordination) |
1557
- | **Task & grader** | 25% | 3 difficulty tiers with clear win conditions; grader formula rewards recall, precision, and efficiency; partial credit for incomplete investigations |
1558
- | **Environment design** | 20% | Bidirectional graph, dual cascade (follow + IP), evasion events that destroy signals mid-investigation, decoy accounts that penalise reckless flagging |
1559
- | **Code quality** | 15% | Typed Pydantic models, stateless scoring functions, 24-point validator, deterministic episode generation by seed |
1560
- | **Creativity** | 10% | Hybrid rule/LLM policy with dynamic α caps, Reflexion-based learning without fine-tuning, IP cluster cascade as evasion-resistant signal |
1561
-
1562
- ---
1563
-
1564
- ## 16. Verification & Validation
1565
-
1566
- ### Quick smoke test
1567
-
1568
- ```bash
1569
- cd fake_gang_env
1570
-
1571
- # Test scoring math
1572
- python3 -c "
1573
- import sys; sys.path.insert(0,'server')
1574
- from scoring import compute_fake_risk, compute_hub_legitimacy, grader_score
1575
-
1576
- gang_r = compute_fake_risk(0.75, 0.65, 0.85, 0.10)
1577
- hub = compute_hub_legitimacy(2_000_000, 200, 2000, 0.05)
1578
- celeb = compute_fake_risk(0.02, 0.02, 0.10, hub)
1579
- assert gang_r >= 0.60, f'Gang risk too low: {gang_r}'
1580
- assert celeb < 0.20, f'Celebrity risk too high: {celeb}'
1581
- assert grader_score(10, 0, 0, 0, 30) == 1.0
1582
- print(f'Gang risk={gang_r} Celeb risk={celeb} Perfect score=1.0 OK')
1583
- "
1584
-
1585
- # Test hybrid policy + cascade
1586
- python3 -c "
1587
- import sys, json; sys.path.insert(0,'server')
1588
- from models import FakeGangAction, ActionType
1589
- from environment import FakeGangEnvironment
1590
- from agent.hybrid_policy import get_rule_action, compute_alpha
1591
-
1592
- env = FakeGangEnvironment()
1593
- obs = env.reset(task='easy', seed=0)
1594
- gang = json.loads(open('episodes/easy_000.json').read())['gang_member_ids']
1595
- obs = env.step(FakeGangAction(action_type=ActionType.INSPECT, account_id=gang[0]))
1596
- obs = env.step(FakeGangAction(action_type=ActionType.FLAG, account_id=gang[0]))
1597
- assert len(obs.suspect_ids) > 0, 'Cascade failed'
1598
- action, conf = get_rule_action(obs)
1599
- assert action.account_id in obs.suspect_ids, 'Rule not prioritising suspects'
1600
- print(f'Cascade OK: {len(obs.suspect_ids)} suspects. Rule → INSPECT {action.account_id} (conf={conf:.2f})')
1601
- a0 = compute_alpha(0, 0, 'easy')
1602
- a1 = compute_alpha(0.5, 2, 'easy')
1603
- a2 = compute_alpha(1.0, 4, 'easy')
1604
- print(f'Alpha (easy, cap=0.50): min={a0} mid={a1} max={a2}')
1605
- "
1606
-
1607
- # Full local validate
1608
- python3 validate.py --local
1609
- ```
1610
 
1611
- ### Full HTTP validation (requires running server)
1612
 
1613
  ```bash
1614
  python3 -m uvicorn server.app:app --port 8001 &
1615
  sleep 3
1616
  python3 validate.py --url http://localhost:8001
 
1617
  ```
1618
 
1619
- Expected output: `Results: 24/24 passed — all OK`
1620
-
1621
  ### Deployed Endpoint Verification
1622
 
1623
- The live environment at [huggingface.co/spaces/Pandago/graphstrike](https://huggingface.co/spaces/Pandago/graphstrike)
1624
- responds to all standard OpenEnv endpoints:
1625
-
1626
  ```bash
1627
- # Health check
1628
  curl https://pandago-graphstrike.hf.space/health
1629
  # → {"status": "healthy"}
1630
 
1631
- # Task discovery
1632
  curl https://pandago-graphstrike.hf.space/tasks
1633
  # → {"tasks": ["easy","medium","hard"], "action_schema": {...}, "score_range": [0.0, 1.0]}
1634
 
1635
- # Baseline (deterministic, reproducible)
1636
  curl -X POST https://pandago-graphstrike.hf.space/baseline
1637
  # → {"scores": {"easy": 0.91, "medium": 0.906, "hard": 0.9038}, "agent": "rule_based"}
1638
  ```
1639
 
1640
  ---
1641
 
1642
-
1643
-
1644
  ![Material wave loading](https://github.com/user-attachments/assets/a08255eb-9647-471d-9881-61871332249f)
1645
 
1646
  ## Developed with ❤️ by Team ComputeXOR
1647
 
1648
-
1649
  ### {
1650
 
1651
- ### [Sai Nivedh](https://github.com/SaiNivedh26) ,
1652
 
1653
- ### [Chaaruvarthan](https://github.com/Charuvarthan-T) ,
1654
 
1655
  ### [Sajeev](https://github.com/SajeevSenthil)
1656
 
 
15
  - llm-agent
16
  base_path: /web
17
  ---
18
+ <br>
19
 
20
+ <p align="center">
21
+ <img src="assets/logo.png" width="600"/>
22
+ </p>
23
 
24
+ <br>
 
25
 
26
+ <p align="center">
27
+ <img src="https://img.shields.io/badge/Hugging%20Face-FFD21E?style=for-the-badge&logo=huggingface&logoColor=black"/>
28
+ <img src="https://img.shields.io/badge/HF%20Spaces-FFBF00?style=for-the-badge&logo=huggingface&logoColor=black"/>
29
+ <img src="https://img.shields.io/badge/FastAPI-009688?style=for-the-badge&logo=fastapi&logoColor=white"/>
30
+ <img src="https://img.shields.io/badge/Docker-2496ED?style=for-the-badge&logo=docker&logoColor=white"/>
31
+ <img src="https://img.shields.io/badge/Gradio-F97316?style=for-the-badge&logo=gradio&logoColor=white"/>
32
+ <img src="https://img.shields.io/badge/OpenEnv-4B5563?style=for-the-badge&logo=envato&logoColor=white"/>
33
+ <img src="https://img.shields.io/badge/Amazon%20Bedrock-FF9900?style=for-the-badge&logo=amazonaws&logoColor=white"/>
34
+ </p>
35
+ <br>
36
 
37
+ <h1 align="center">
38
+ </h1>
39
+ <p align="center">
40
+ An OpenEnv-compatible reinforcement learning environment where an LLM agent must identify all 10 members of a coordinated fake account network hidden inside a synthetic social network. The agent learns via Reflexion and a dynamic hybrid rule/LLM policy, not via gradient updates or fine-tuning.
41
+ <br />
42
+ </p>
43
+ </p>
44
 
45
+ <br>
46
 
47
+ ## Theme
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
+ **SUPPORT**
50
 
51
+ ### Customer Service Agents
52
 
53
+ Complex environment where agents resolve multi-step queries using external tools and APIs.
 
 
54
 
55
+ ## Problem Statement
 
 
 
56
 
57
+ **The task:** A social network contains fake accounts organised into a single coordinated ring of 10. The ring behaves in a coordinated way — same posting hour, same IP subnet, stolen celebrity photos, copy-paste bios. The agent must find all 10 by navigating a limited step budget, inspecting accounts, and flagging suspects.
58
 
59
+ ## Proposed Solution
 
 
 
 
 
 
 
60
 
61
+ An OpenEnv-compatible reinforcement learning environment where an LLM agent must identify all 10 members of a coordinated fake account ring hidden inside a synthetic social network. The agent learns via **Reflexion** and a **dynamic hybrid rule/LLM policy** — not via gradient updates or fine-tuning.
 
 
 
 
 
62
 
63
  ---
64
+ ## Novelty Highlights
65
+
66
+ - **Adaptive Hybrid Intelligence (Rules + LLM):** Unlike static ensembles, GraphStrike dynamically blends deterministic rules and LLM reasoning using a trust gate, shifting control as performance improves.
67
+ - **Learning Without Fine-Tuning:** Instead of updating model weights, the agent learns through Reflexion lessons and best-trajectory memory injected into future prompts.
68
+ - **Graph-First Detection Pipeline:** Detection is not account-by-account only; it uses cascade effects, neighbor propagation, and multi-hop graph expansion to uncover coordinated rings.
69
+ - **Math-Grounded Decision Control:** Risk composition, trust calibration, and grader alignment are formula-driven, making behavior interpretable and reproducible.
70
+ - **Adversarial Evasion Benchmarking:** Hard-mode includes timed evasion events, so success reflects robustness under disruption rather than overfitting to static patterns.
71
+ - **Safety-Net by Design:** High-confidence rule overrides prevent catastrophic LLM errors while preserving LLM flexibility for strategic exploration.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  ---
73
 
74
+ ## Performance Summary
 
 
 
 
 
 
 
 
75
 
76
+ We evaluate GraphStrike's hybrid rule/LLM policy across multiple *frontier models to measure how well each model handles the investigation task. All runs use
77
+ the same inference pipeline (`inference.py`) with identical system prompts and structured logging. Each model ran: (1) seed=0 on all 3 tasks, and
78
+ (2) seeds 0-2 on all 3 tasks for variance measurement.*
 
 
79
 
80
+ **Seed=0 scores (single episode per task):**
 
81
 
82
+ <p align="center">
83
+ <img src="images/table1.png" alt="Model Performance Table" width="1600"/>
84
+ </p>
85
+ <br>
86
 
87
+ **3-seed variance scores (mean across seeds 0, 1, 2):**
 
 
 
 
 
88
 
89
+ <p align="center">
90
+ <img src="images/table2.png" alt="Model Performance Table" width="1600"/>
91
+ </p>
92
+ <br>
93
 
94
+ **Rule-Based Baseline (no LLM, deterministic)**
 
95
 
96
+ <p align="center">
97
+ <img src="images/table3.png" alt="Model Performance Table" width="1600"/>
98
+ </p>
99
+ <br>
 
 
 
 
 
 
 
 
 
100
 
101
  ---
102
+ ## Table of Contents
103
 
104
+ 1. [What This Is](#1-what-this-is)
105
+ 2. [The Problem: How Fake Detection Actually Works](#2-the-problem-how-fake-detection-actually-works)
106
+ 3. [Synthetic Data Generation](#3-synthetic-data-generation)
107
+ 4. [Data Model](#4-data-model)
108
+ 5. [The RL Environment](#5-the-rl-environment)
109
+ 6. [Risk Scoring Mathematics](#6-risk-scoring-mathematics)
110
+ 8. [The LLM Policy (Qwen3 via Bedrock)](#8-the-llm-policy-qwen3-via-bedrock)
111
+ 9. [Reflexion — How the Agent Learns](#9-reflexion--how-the-agent-learns)
112
+ 10. [Hybrid Policy — The Novel Contribution](#10-hybrid-policy--the-novel-contribution)
113
+ 11. [Training Loop End-to-End](#11-training-loop-end-to-end)
114
+ 12. [API Reference](#12-api-reference)
115
+ 13. [Docker Deployment](#13-docker-deployment)
116
+ 14. [Submission Requirements](#14-submission-requirements)
117
+ 15. [Verification & Validation](#15-verification--validation)
118
 
119
+ ---
120
 
121
+ ## 1. What This Is
 
 
 
 
122
 
123
+ This is an **OpenEnv hackathon** submission. OpenEnv is a framework for building RL environments with a standard microservice interface (`/reset`, `/step`, `/state`) so that any agent implementation can plug in.
124
 
125
+ **What makes this non-trivial:**
 
 
 
 
 
126
 
127
+ - The network is large (50–1000 accounts depending on difficulty).
128
+ - Fake accounts are mixed with innocent high-signal "decoy" accounts.
129
+ - In hard mode, the gang actively evades — dropping intra-gang follows, renaming profiles — while the agent is mid-investigation.
130
+ - The agent cannot see the full network upfront: it must explore via INSPECT and INVESTIGATE_NETWORK actions, spending steps to reveal information.
131
 
132
+ **What makes the learning novel:**
 
 
133
 
134
+ - The LLM (inference via AWS Bedrock) cannot be fine-tuned — it is a black-box API.
135
+ - The agent learns via **Reflexion**: post-episode lessons are written back into memory and injected into every future prompt.
136
+ - A **dynamic hybrid policy** (α-weighted) blends the LLM with a deterministic rule engine, with the blend weight α updating based on recent win rate. Rules dominate early; the LLM takes over as it proves itself.
137
 
138
+ ### System Architecture
 
 
139
 
140
+ ![System Architecture](assets/sys%20arch.png)
141
 
142
+ ---
 
 
143
 
144
+ ## 2. The Problem: How Fake Detection Actually Works
 
 
145
 
146
+ A real-world fake account detector does **not** read post content. Detection relies on three categories of signals computed from metadata:
147
 
148
+ ### Signal Hierarchy (Node -> Behavioral -> Graph)
149
 
150
+ ![Signal Hierarchy](assets/gs.png)
 
 
 
 
 
151
 
152
+ - **Node signals (offline):** content fingerprints like photo reuse, bio-template similarity, and comment repetition provide the first suspicion layer.
153
+ - **Behavioral signals (temporal/device):** coordinated posting hour, account-age clustering, and shared IP subnet add stronger gang-level evidence.
154
+ - **Graph signals (live at INSPECT):** mutual follows, flagged-neighbor growth, and cluster alignment are hardest to evade, so they carry the highest weight in risk scoring.
155
+ - **False-positive control:** high-legitimacy hubs (for example celebrities) are down-weighted through hub-legitimacy discounting.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  ---
158
 
159
+ ## 3. Synthetic Data Generation
160
 
161
+ **File:** `server/generator.py`
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
+ Episodes are generated deterministically by seed. 150 episodes are pre-generated (50 per task) and cached as JSON files in `episodes/`.
 
 
 
 
 
 
164
 
165
+ ### Network Composition
 
166
 
167
+ | Task | Network size | Gang | Decoys | Real | Max steps |
168
+ |---|---|---|---|---|---|
169
+ | easy | 50 | 10 | 0 | 40 | 30 |
170
+ | medium | 200 | 10 | 20 | 170 | 50 |
171
+ | hard | 1000 | 10 | 50 | 940 | 80 |
172
 
173
+ - **Gang accounts:** All 10 share `base_age` (same creation week), tightly clustered `avg_post_hour`, high `photo_reuse_score`/`bio_template_score`, `comment_repeat_score` in [0.60, 0.90], `ip_cluster_id = "ip_gang_{seed}"`, and dense intra-gang follow edges (density 0.60–0.80).
174
+ - **Real accounts:** Log-normal follower distributions, unique IP clusters, low fake scores.
175
+ - **Decoy accounts** (medium/hard): Real accounts with elevated fraud scores (0.20–0.40 range) — they look suspicious but are NOT gang members and penalise reckless flagging.
176
+ - **Celebrity accounts** (2 per episode): 100k–5M followers, very low fake scores, high `hub_legitimacy_score`.
177
+ - **Zero-edge isolates** (2 per episode): No edges — test whether the agent wastes steps on disconnected nodes.
178
 
179
+ ---
 
 
 
180
 
181
+ ## 4. Data Model
 
 
182
 
183
+ **File:** `models.py`
 
 
 
184
 
185
+ ### ActionType
 
186
 
187
+ | Value | Cost | Effect |
188
+ |---|---|---|
189
+ | `inspect` | 1 step | Reveals full `AccountProfile` + follow list |
190
+ | `investigate_network` | 2 steps | Expands 2 hops; reveals account IDs only |
191
+ | `flag` | 0 steps | Marks account as gang member; triggers SUSPECT cascade |
192
+ | `unflag` | 0 steps | Removes flag; clears CONFIRMED_FAKE status |
193
+ | `submit` | 0 steps | Ends episode; triggers scoring |
194
 
195
+ ### AccountProfile key fields
 
 
 
 
 
196
 
197
+ | Category | Fields |
198
+ |---|---|
199
+ | Raw counts | `follower_count`, `following_count`, `post_count` |
200
+ | Temporal | `avg_post_hour`, `account_age_days` |
201
+ | Content pipeline (0–1) | `photo_reuse_score`, `bio_template_score`, `comment_repeat_score` |
202
+ | IP/device | `shared_ip_count`, `ip_cluster_id` |
203
+ | Graph (live at INSPECT) | `mutual_follow_rate`, `flagged_neighbor_count`, `avg_neighbor_photo_reuse`, `post_hour_cluster_score` |
204
+ | Risk breakdown | `fake_risk_score`, `node_risk`, `behavior_risk`, `graph_risk`, `hub_legitimacy_score` |
205
+ | Evasion/status | `name_change_count`, `status` (NORMAL/SUSPECT/CONFIRMED_FAKE) |
206
 
207
+ ### FakeGangObservation — what the agent sees each step
 
 
 
208
 
209
+ `done`, `reward`, `visible_accounts`, `visible_account_ids`, `flagged_ids`, `inspected_ids`, `suspect_ids`, `graph_edges`, `steps_remaining`, `evasion_triggered`, `evasion_count`, `task`, `message`
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
  ---
212
 
213
+ ## 5. The RL Environment
214
 
215
  **File:** `server/environment.py`
216
 
217
+ ### Episode Lifecycle & Action Mechanics
218
 
219
+ ![Episode Flow](assets/episode.png)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
+ **FLAG cascade (dual):** When FLAG(X) is called — (1) every visible account that X follows becomes SUSPECT via the follow-graph, and (2) every visible account sharing X's `ip_cluster_id` becomes SUSPECT. Gang members share `ip_gang_{seed}`; real accounts have unique IPs → zero false positives.
 
 
 
 
 
 
 
 
 
 
222
 
223
+ ### Reward Function
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
  ```
 
 
 
 
226
  base_reward = tp×1.0 − fp×0.5 − fn×0.3
227
 
228
+ Win condition:
229
  easy/medium: recall ≥ 0.8 AND precision ≥ 0.7
230
  hard: recall ≥ 0.9 AND precision ≥ 0.8
231
 
232
+ Bonuses:
233
+ +5.0 full win
234
+ +3.0 perfect recall
235
+ +2.0 partial win (high recall, low precision)
236
+ +1.0 efficiency (SUBMIT with ≥50% steps remaining)
237
+ −1.0 per evasion event (hard mode)
238
+ −2.0 forced submit (ran out of steps)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  ```
240
 
241
+ ### Evasion (hard mode)
 
 
 
242
 
243
+ - **`unfollow_intragang`:** 30% of intra-gang edges randomly removed mid-investigation — destroys graph signal. Fires 4 times (steps 15, 30, 45, 60).
244
+ - **`rename_count`:** Random gang members get `name_change_count += 1` — a visual evasion signal.
 
 
 
 
 
 
245
 
246
  ---
247
 
248
+ ## 6. Risk Scoring Mathematics
 
 
249
 
250
+ **File:** `server/scoring.py` — all functions are stateless and deterministic.
 
 
251
 
252
+ ![pipw](images/big.png)
253
 
254
+ ![Risk Scoring Formulas 1](assets/formulas-1.png)
255
 
256
+ ![Risk Scoring Formulas 2](assets/formulas-2.png)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
  ---
259
 
260
+ ## 8. The LLM Policy (Qwen3 via Bedrock)
261
 
262
  **File:** `agent/policy.py`
263
 
264
+ **Model:** `qwen.qwen3-next-80b-a3b` via AWS Bedrock Converse API (`maxTokens=512, temperature=0.4`)
265
 
266
+ ### Prompt Structure
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
 
268
  Every step, the policy builds a prompt from three components:
269
 
 
275
  What is your next action?
276
  ```
277
 
278
+ Accounts in the observation are **sorted by `fake_risk_score` descending**, with status badges prepended. `fnbr=N(!)` highlights when `flagged_neighbor_count > 0`; `[HUB?]` warns the LLM not to flag high-legitimacy accounts.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
 
280
+ ### Required Response Format
 
 
 
 
 
281
 
282
  ```xml
283
  <thinking>
284
+ Reasoning — which account is most suspicious and why.
 
285
  </thinking>
286
  <action>
287
  INSPECT acc_0041
288
  </action>
289
  ```
290
 
291
+ If parsing fails, a heuristic fallback inspects the highest-scored uninspected account. Retries use exponential backoff (1s, 2s, 4s) up to 3 attempts.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
  ---
294
 
295
+ ## 9. Reflexion — How the Agent Learns
296
 
297
  **Files:** `agent/reflection.py`, `agent/memory.py`
298
 
299
+ The agent **cannot** update Qwen3's weights — Bedrock is a black-box API. Instead, it learns via **Reflexion**: post-episode lessons are written as text and injected into future prompts.
 
 
300
 
301
+ ### Reflexion Learning Loop
302
 
303
+ ![Reflexion Learning Loop](assets/reflexion.png)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
 
 
305
 
 
306
 
307
  ```
308
+ Episode N:
309
+ 1. LLM acts using: system_prompt + reflections[last 4] + best_trajectory
310
+ 2. Episode ends → WIN or LOSS
311
+ 3. Post-episode:
312
+ LOSS → generate_reflection(action_log, outcome) → lesson stored
313
+ WIN → save trajectory if better reward + generate_success_reflection
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
+ Episode N+1:
316
+ → last 4 reflections + best win trajectory injected into prompt
317
+ → LLM has learned from its past
318
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
+ **Example generated reflection:**
321
+ > *"The starting accounts were all real; I wasted 8 steps inspecting low-signal nodes before pivoting. When photo_reuse and bio_template are both below 0.3 after 3 inspections, immediately use INVESTIGATE_NETWORK to jump to a different graph region."*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
 
323
+ All memory persists in a Docker volume (`memory/`) across container restarts — reflections, best trajectories, win history, and α values per task.
 
324
 
325
  ---
326
 
327
+ ## 10. Hybrid Policy — The Novel Contribution
328
 
329
  **File:** `agent/hybrid_policy.py`
330
 
331
+ **Key insight:** A new LLM agent starts dumb but improves over time. A rule engine is always consistent but cannot adapt. The hybrid policy exploits both — rules provide a safety net early while the LLM builds its track record; once the LLM proves itself, rules step back.
 
 
 
 
 
 
 
332
 
333
+ ### Architecture
 
 
 
334
 
335
+ ![Hybrid Policy Architecture](assets/hybrid.png)
 
336
 
337
+ ### Alpha (α): The Trust Weight
338
 
339
+ α is a per-task value in [0.20, cap] representing current trust in the LLM:
 
 
 
 
 
 
 
 
 
 
 
 
340
 
341
  ```
342
  reflection_factor = min(1.0, n_reflections / 4.0)
343
  raw = 0.20 + reflection_factor × (0.80 × recent_win_rate + 0.12)
344
  α = clamp(raw, 0.20, cap)
 
 
 
 
345
  ```
346
 
347
+ | Task | α cap | Rationale |
348
+ |---|---|---|
349
+ | easy | 0.50 | Rule engine alone achieves ~91% — LLM should assist, not override |
350
+ | medium | 0.70 | Decoys require some LLM judgment, but cascade must stay |
351
+ | hard | 0.85 | LLM needs latitude for evasion adaptation, but safety rules remain |
 
 
 
 
 
 
 
 
352
 
353
  **Alpha trajectory over training (easy task, cap=0.50):**
354
 
355
+ | Episode | Win rate | Reflections | α (capped) |
356
+ |---|---|---|---|
357
+ | 1 | 0% | 0 | 0.20 |
358
+ | 5 | 20% | 4 | 0.48 |
359
+ | 10 | 50% | 9 | **0.50** |
360
+ | 20 | 80% | 19 | **0.50** |
 
 
 
 
 
 
 
 
 
361
 
362
+ <br>
 
 
 
 
 
 
 
 
 
363
 
364
+ ![System Architecture](images/plot.png)
365
 
366
+ ### Rule Confidence Levels
 
 
367
 
368
+ | Situation | Action | Confidence |
369
+ |---|---|---|
370
+ | Steps remaining = 0 | SUBMIT | 1.00 |
371
+ | Uninspected SUSPECT accounts exist | INSPECT suspects[0] | 0.95 |
372
+ | `fake_risk ≥ 0.85` | FLAG that account | 0.95 |
373
+ | `fake_risk` in [threshold, 0.85) | FLAG that account | 0.70+ |
374
+ | 10 accounts already flagged | SUBMIT | 0.85 |
375
+ | Steps remaining ≤ 3 | SUBMIT | 0.90 |
376
+ | Uninspected accounts available | INSPECT top candidate | 0.30 |
 
 
 
 
 
 
 
 
 
377
 
378
+ At **α=0.20** (early): rules dominate (~90% of decisions). At **α=0.50** (moderate): LLM controls exploration; rules control safety. At **α=0.85** (high): LLM controls most decisions; rules only override forced submits and uninspected suspects.
379
 
380
+ α is saved to `memory/alpha_{task}.json` and persists across Docker restarts — the agent doesn't reset to 0.20 every time.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
 
382
  ---
383
 
384
+ ## 11. Training Loop End-to-End
385
 
386
  **File:** `train.py`
387
 
388
+ ### Curriculum
389
 
390
+ | Phase | Episodes | Task | Goal |
391
+ |---|---|---|---|
392
+ | 1 | 1–20 | easy | Learn basic signal thresholds, build first reflections |
393
+ | 2 | 21–35 | medium | Handle decoys, learn evasion response |
394
+ | 3 | 36–50 | hard | Feature-only detection, persistent evasion |
395
 
396
  Seeds rotate deterministically: `seed = (episode_num + task_offset) % 50`
 
397
 
398
+ ### Per-Episode Flow
399
 
400
  ```
401
  for ep in range(n_episodes):
402
 
403
+ 1. DETERMINE TASK curriculum_task(ep) or fixed task
404
+ 2. COMPUTE ALPHA compute_alpha(win_rate, n_reflections, task)
405
+ 3. LOAD CONTEXT last 4 reflections + best win trajectory
406
+ 4. RUN EPISODE while not obs.done:
407
+ blend(rule_action, llm_action, rule_conf, α)
408
+ obs = env.step(final)
409
+ 5. POST-EPISODE record_win → update α → generate reflection
410
+ 6. LOG task | win/loss | reward | recall | precision | α | modes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  ```
412
 
413
+ Episode metrics (flushed to `runs/metrics.jsonl` every 5 episodes) include: `episode`, `task`, `won`, `reward`, `recall`, `precision`, `steps_used`, `alpha_used`, `mode_agree`, `mode_rule`, `mode_llm`, `n_reflections_used`.
414
+
415
+ You can watch the transition: early episodes have high `rule` counts; later episodes have high `agree` counts (LLM learned to make the same decisions as the rules, but also brings strategic reasoning the rules can't).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
 
417
  ---
418
 
419
+ ## 12. API Reference
420
 
421
  **File:** `server/app.py`
422
 
423
+ | Endpoint | Method | Description |
424
+ |---|---|---|
425
+ | `/health` | GET | `{"status": "healthy"}` |
426
+ | `/tasks` | GET | Task list + `action_schema` + `score_range: [0.0, 1.0]` |
427
+ | `/reset` | POST | Accepts `{task, seed}` → returns initial observation |
428
+ | `/step` | POST | Accepts any `FakeGangAction` → returns updated observation |
429
+ | `/state` | GET | Current episode metadata (step count, task, score) |
430
+ | `/grader` | GET | Normalised [0.0, 1.0] score after SUBMIT |
431
+ | `/baseline` | POST | Runs rule-based agent on all 3 tasks, returns scores |
432
 
433
+ **Baseline performance:**
 
 
434
 
435
+ | Task | Seed=0 score | Win rate (50 seeds) | Mean score (50 seeds) |
436
+ |---|---|---|---|
437
+ | easy | 0.91 | 100% | ~0.91 |
438
+ | medium | 0.906 | 84% | ~0.77 |
439
+ | hard | 0.9038 | 52% | ~0.47 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
 
441
  ---
442
 
443
+ ## 13. Docker Deployment
 
 
 
 
444
 
445
  ```bash
446
+ # Build
447
  docker build -f server/Dockerfile -t graphstrike .
 
 
 
 
 
 
 
 
 
448
 
449
+ # Run
450
  docker run -it \
451
  -e AWS_ACCESS_KEY_ID=your_key \
452
  -e AWS_SECRET_ACCESS_KEY=your_secret \
 
456
  graphstrike
457
  ```
458
 
459
+ The `memory/` and `runs/` volumes preserve all learning between container restarts.
 
460
 
461
+ ### Environment Variables
462
 
463
+ | Variable | Default | Description |
464
+ |---|---|---|
465
+ | `AWS_ACCESS_KEY_ID` | (required) | For Bedrock/Qwen3 access |
466
+ | `AWS_SECRET_ACCESS_KEY` | (required) | For Bedrock/Qwen3 access |
467
+ | `AWS_DEFAULT_REGION` | `us-east-1` | Bedrock region |
468
+ | `TRAIN_TASK` | (curriculum) | Fix to `easy`/`medium`/`hard` |
469
+ | `TRAIN_EPISODES` | `50` | Total training episodes |
470
+ | `TRAIN_TEMP` | `0.4` | LLM sampling temperature |
471
+ | `TRAIN_VERBOSE` | `0` | Set `1` for per-step action logging |
472
+ | `SERVER_PORT` | `8000` | FastAPI port |
473
 
474
+ ### Startup Sequence (`run.sh`)
475
 
476
  ```
477
+ 1. Validate AWS credentials
478
+ 2. python server/generator.py → generates 150 episode JSON files
479
  3. uvicorn server.app:app → starts the environment server
480
+ 4. Health check polling waits until /health responds
481
  5. python train.py → runs the full training loop
482
  ```
483
 
484
  ---
485
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
 
487
+ ### Full HTTP validation
488
 
489
  ```bash
490
  python3 -m uvicorn server.app:app --port 8001 &
491
  sleep 3
492
  python3 validate.py --url http://localhost:8001
493
+ # Expected: Results: 24/24 passed — all OK
494
  ```
495
 
 
 
496
  ### Deployed Endpoint Verification
497
 
 
 
 
498
  ```bash
 
499
  curl https://pandago-graphstrike.hf.space/health
500
  # → {"status": "healthy"}
501
 
 
502
  curl https://pandago-graphstrike.hf.space/tasks
503
  # → {"tasks": ["easy","medium","hard"], "action_schema": {...}, "score_range": [0.0, 1.0]}
504
 
 
505
  curl -X POST https://pandago-graphstrike.hf.space/baseline
506
  # → {"scores": {"easy": 0.91, "medium": 0.906, "hard": 0.9038}, "agent": "rule_based"}
507
  ```
508
 
509
  ---
510
 
 
 
511
  ![Material wave loading](https://github.com/user-attachments/assets/a08255eb-9647-471d-9881-61871332249f)
512
 
513
  ## Developed with ❤️ by Team ComputeXOR
514
 
 
515
  ### {
516
 
517
+ ### [Sai Nivedh](https://github.com/SaiNivedh26) ,
518
 
519
+ ### [Charuvarthan](https://github.com/Charuvarthan-T) ,
520
 
521
  ### [Sajeev](https://github.com/SajeevSenthil)
522
 
README.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1723ffdcb7e36d47ff546500f86b30ebbb40af3a8616e1818798aaf144e0f5fc
3
+ size 1448893
assets/episode.png ADDED
assets/formulas-1.png ADDED

Git LFS Details

  • SHA256: e0efb65c597f87da46edf10f7295a775568a6de84e190b424777b7120d141ae4
  • Pointer size: 131 Bytes
  • Size of remote file: 230 kB
assets/formulas-2.png ADDED
assets/gs.png ADDED
assets/hybrid.png ADDED

Git LFS Details

  • SHA256: bdc17cf51111efce3a756c4c9fad6179d2dcbe54431702f778c805435484cb8e
  • Pointer size: 131 Bytes
  • Size of remote file: 135 kB
assets/logo.png ADDED
assets/reflexion.png ADDED

Git LFS Details

  • SHA256: 16a89e586131c4c1484edb70b37a363bce1ad84d2009b4d930d43ab62f179c35
  • Pointer size: 131 Bytes
  • Size of remote file: 124 kB
assets/sys arch.png ADDED

Git LFS Details

  • SHA256: 02aa4f9f4f0abfaf934b4c2774c3adc1a3e0dc5486955daea6291e154f79f22c
  • Pointer size: 131 Bytes
  • Size of remote file: 205 kB
docs.md ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: GraphStrike
3
+ emoji: 🕵️
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ license: mit
10
+ tags:
11
+ - reinforcement-learning
12
+ - social-network
13
+ - fraud-detection
14
+ - openenv
15
+ - llm-agent
16
+ ---
17
+ <br>
18
+
19
+ <p align="center">
20
+ <img src="images/logo.png" width="600"/>
21
+ </p>
22
+
23
+ <br>
24
+
25
+ <p align="center">
26
+ <img src="https://img.shields.io/badge/Hugging%20Face-FFD21E?style=for-the-badge&logo=huggingface&logoColor=black"/>
27
+ <img src="https://img.shields.io/badge/HF%20Spaces-FFBF00?style=for-the-badge&logo=huggingface&logoColor=black"/>
28
+ <img src="https://img.shields.io/badge/FastAPI-009688?style=for-the-badge&logo=fastapi&logoColor=white"/>
29
+ <img src="https://img.shields.io/badge/Docker-2496ED?style=for-the-badge&logo=docker&logoColor=white"/>
30
+ <img src="https://img.shields.io/badge/Gradio-F97316?style=for-the-badge&logo=gradio&logoColor=white"/>
31
+ <img src="https://img.shields.io/badge/OpenEnv-4B5563?style=for-the-badge&logo=envato&logoColor=white"/>
32
+ <img src="https://img.shields.io/badge/Amazon%20Bedrock-FF9900?style=for-the-badge&logo=amazonaws&logoColor=white"/>
33
+ </p>
34
+ <br>
35
+
36
+ <h1 align="center">
37
+ </h1>
38
+ <p align="center">
39
+ An OpenEnv-compatible reinforcement learning environment where an LLM agent must identify all 10 members of a coordinated fake account network hidden inside a synthetic social network. The agent learns via Reflexion and a dynamic hybrid rule/LLM policy, not via gradient updates or fine-tuning.
40
+ <br />
41
+ </p>
42
+ </p>
43
+
44
+ <br>
45
+ <br>
46
+
47
+ ### *Deployed Endpoint Verification*
48
+
49
+ The live environment at [huggingface.co/spaces/Pandago/graphstrike](https://huggingface.co/spaces/Pandago/graphstrike)
50
+ responds to all standard OpenEnv endpoints:
51
+
52
+ ```bash
53
+ # Health check
54
+ curl https://pandago-graphstrike.hf.space/health
55
+ # → {"status": "healthy"}
56
+
57
+ # Task discovery
58
+ curl https://pandago-graphstrike.hf.space/tasks
59
+ # → {"tasks": ["easy","medium","hard"], "action_schema": {...}, "score_range": [0.0, 1.0]}
60
+
61
+ # Baseline (deterministic, reproducible)
62
+ curl -X POST https://pandago-graphstrike.hf.space/baseline
63
+ # → {"scores": {"easy": 0.91, "medium": 0.906, "hard": 0.9038}, "agent": "rule_based"}
64
+ ```
65
+
66
+ ---
67
+
68
+ <br>
69
+
70
+ We evaluate GraphStrike's hybrid rule/LLM policy across multiple *frontier models to measure how well each model handles the investigation task. All runs use
71
+ the same inference pipeline (`inference.py`) with identical system prompts and structured logging. Each model ran: (1) seed=0 on all 3 tasks, and
72
+ (2) seeds 0-2 on all 3 tasks for variance measurement.*
73
+
74
+ <br>
75
+
76
+ **Seed=0 scores (single episode per task):**
77
+
78
+ <p align="center">
79
+ <img src="images/table1.png" alt="Model Performance Table" width="1600"/>
80
+ </p>
81
+ <br>
82
+
83
+ **3-seed variance scores (mean across seeds 0, 1, 2):**
84
+
85
+ <p align="center">
86
+ <img src="images/table2.png" alt="Model Performance Table" width="1600"/>
87
+ </p>
88
+ <br>
89
+
90
+ **Rule-Based Baseline (no LLM, deterministic)**
91
+
92
+ <p align="center">
93
+ <img src="images/table3.png" alt="Model Performance Table" width="1600"/>
94
+ </p>
95
+ <br>
96
+
97
+ ---
98
+
99
+ **The task:** A social network contains fake accounts organised into a
100
+ single coordinated network of 10. The network behaves in a coordinated way — same posting hour,
101
+ same IP subnet, stolen celebrity photos, copy-paste bios. The agent must find
102
+ all 10 by navigating a limited step budget, inspecting accounts, and flagging suspects.
103
+
104
+ **What makes this non-trivial:** The network is large (50–1000 accounts depending on difficulty). Fake accounts are mixed with innocent high-signal "decoy" accounts. In hard mode, the fake accounts actively evade — dropping intra-gang follows, renaming profiles — while the agent is mid-investigation. The agent cannot see the full network upfront: it must explore via INSPECT and INVESTIGATE_NETWORK actions, spending steps to reveal information.
105
+
106
+ **What makes the learning novel:** The inference LLM (via AWS Bedrock) cannot be fine-tuned — it's a black-box API. The agent learns via Reflexion, i.e., post-episode lessons are written back into memory and injected into every future prompt. A dynamic hybrid policy (α-weighted) blends the LLM with a deterministic rule engine, with the blend weight α updating based on recent win rate. Rules dominate early; the LLM takes over as it proves itself.
107
+
108
+ ---
109
+
110
+ ## Detection Signals
111
+
112
+ Detection operates entirely on numeric metadata — no content processing. Three signal categories, computed at different points:
113
+
114
+ **Node signals** (pre-computed by content pipeline, static per account):
115
+
116
+ | Feature | Fake range | Real range | Notes |
117
+ |---|---|---|---|
118
+ | `photo_reuse_score` | 0.30–0.95 | 0.00–0.15 | pHash fingerprint match against celebrity photo DB |
119
+ | `bio_template_score` | 0.20–0.90 | 0.00–0.12 | Cosine sim to known fake bio templates |
120
+ | `comment_repeat_score` | 0.60–0.90 | 0.00–0.08 | Copy-paste spam fraction across accounts |
121
+
122
+ **Behavioral signals** (static, from account metadata):
123
+
124
+ | Feature | Fake pattern |
125
+ |---|---|
126
+ | `avg_post_hour` | All 10 gang members cluster within ±0.5 hours — coordinated scheduling |
127
+ | `account_age_days` | Created within the same week (`base_age ± 7 days`) |
128
+ | `shared_ip_count` | 9 — all 10 share one IP subnet (`ip_gang_{seed}`) |
129
+
130
+ **Graph signals** (dynamic — computed at INSPECT time, shift as investigation progresses):
131
+
132
+ | Feature | Why it matters |
133
+ |---|---|
134
+ | `mutual_follow_rate` | Gang members mutually follow each other at 0.6–0.9 density; legitimate hubs don't follow back |
135
+ | `flagged_neighbor_count` | Grows as more gang members are flagged — the cascade signal |
136
+ | `post_hour_cluster_score` | Alignment to mean posting hour of currently-flagged accounts (wrap-around aware) |
137
+ | `suspicious_mutual_ratio` | Used to compute hub legitimacy — protects celebrities from false positives |
138
+
139
+ Graph signals are the most powerful: once one gang member is flagged, `flagged_neighbor_count` rises for all connected members, compounding with each subsequent flag.
140
+
141
+ ---
142
+
143
+ ## Synthetic Network Composition
144
+
145
+ 150 episodes pre-generated deterministically (50 per task). Each episode is a JSON file (`episodes/{task}_{seed:03d}.json`).
146
+
147
+ | Task | Accounts | Gang | Decoys | Max steps | Evasion |
148
+ |---|---|---|---|---|---|
149
+ | easy | 50 | 10 | 0 | 30 | None |
150
+ | medium | 200 | 10 | 20 | 50 | Step 20 (once) |
151
+ | hard | 1000 | 10 | 50 | 80 | Steps 15/30/45/60 |
152
+
153
+ - **Gang:** Dense intra-follow graph (density 0.60–0.80), same IP subnet, tightly clustered post hours (std 0.5/1.5/2.5 by task).
154
+ - **Decoys** (medium/hard only): Real accounts with elevated `photo_reuse` and `bio_template` scores (0.20–0.40). They score as suspicious but are not gang members — they penalise reckless flagging.
155
+ - **Celebrities** (2 per episode): 100k–5M followers, near-zero fake scores. Hub legitimacy formula protects them.
156
+ - **Zero-edge isolates** (2 per episode): `follower_count=0`, no edges. Test whether the agent wastes steps on disconnected nodes.
157
+
158
+ ---
159
+
160
+ ## Actions
161
+
162
+ | Action | Cost | Effect |
163
+ |---|---|---|
164
+ | `inspect` | 1 step | Reveals full `AccountProfile` (all 22 features), adds neighbors to visible set |
165
+ | `investigate_network` | 2 steps | Bidirectional 2-hop expansion — reveals account IDs only (no profiles); re-cascades SUSPECT |
166
+ | `flag` | 0 steps | Marks account CONFIRMED_FAKE; dual cascade: follow-graph + IP cluster |
167
+ | `unflag` | 0 steps | Clears CONFIRMED_FAKE status |
168
+ | `submit` | 0 steps | Ends episode, triggers scoring |
169
+
170
+ **Dual SUSPECT cascade on FLAG:**
171
+ 1. *Follow-graph:* Every visible account that the flagged account follows → SUSPECT (high precision: gang follow density 0.70+).
172
+ 2. *IP cluster:* Every visible account sharing the same `ip_cluster_id` → SUSPECT (zero false positives: real accounts each have a unique IP; gang shares `ip_gang_{seed}`).
173
+
174
+ Both mechanisms surface in `obs.suspect_ids` — the agent's highest-priority INSPECT targets.
175
+
176
+ ---
177
+
178
+ ## Risk Scoring (`server/scoring.py`)
179
+
180
+ All functions are stateless, called inside `_build_profile()` at INSPECT time and on re-profiling after each FLAG.
181
+
182
+ ```
183
+ node_risk = 0.60 × photo_reuse + 0.40 × bio_template
184
+
185
+ age_norm = min(1.0, account_age_days / 365)
186
+ behavior_risk = 0.55 × (1 − age_norm) + 0.45 × post_hour_cluster_score
187
+
188
+ flagged_ratio = flagged_neighbor_count / max(inspected_neighbor_count, 1)
189
+ graph_risk = 0.45 × flagged_ratio + 0.35 × mutual_follow_rate + 0.20 × avg_neighbor_photo_reuse
190
+
191
+ hub_legitimacy = 0.45 × log(1+followers)/log(1+1M)
192
+ + 0.25 × (1 − follow_ratio_norm)
193
+ + 0.20 × age_norm
194
+ + 0.10 × (1 − suspicious_mutual_ratio)
195
+
196
+ fake_risk = clip(0.30×node_risk + 0.25×behavior_risk + 0.45×graph_risk − 0.25×hub_legitimacy, 0, 1)
197
+ ```
198
+
199
+ **Weight rationale:** Graph risk (0.45) is dominant — structural signals are hardest to fake and compound across the investigation. Hub legitimacy is subtractive — a celebrity with 5M followers produces `hub_legitimacy ≈ 1.0`, making their fake_risk near zero even if gang members follow them.
200
+
201
+ **Classification thresholds:**
202
+ - `fake_risk < 0.35` → normal
203
+ - `0.35 ≤ fake_risk < 0.60` → suspect
204
+ - `fake_risk ≥ 0.60` → confirmed_fake (formula-level; explicit FLAG overrides)
205
+
206
+ **Grader score** (normalised [0.0, 1.0], returned by `/grader`):
207
+ ```
208
+ recall = tp / 10
209
+ precision = tp / max(tp + fp, 1)
210
+ efficiency = max(0, (max_steps − steps_used) / max_steps)
211
+
212
+ if recall ≥ 0.8 AND precision ≥ 0.7:
213
+ score = 0.55 + 0.20×recall + 0.15×precision + 0.10×efficiency
214
+ else:
215
+ score = 0.30×recall + 0.10×precision
216
+ ```
217
+ Maximum 1.0 (all 10 found, zero false positives, zero steps used). Win threshold ≈ 0.815.
218
+
219
+ ---
220
+
221
+ ## Hybrid Policy (`agent/hybrid_policy.py`)
222
+
223
+ The agent blends a deterministic rule engine with Qwen3-Next-80B (via AWS Bedrock) using a per-task trust weight α.
224
+
225
+ **Alpha update** (per episode, after win/loss recorded):
226
+ ```
227
+ reflection_factor = min(1.0, n_reflections / 4.0)
228
+ raw = 0.20 + reflection_factor × (0.80 × recent_win_rate + 0.12)
229
+ alpha = clamp(raw, 0.20, task_cap)
230
+ ```
231
+
232
+ | Task | α cap | Rationale |
233
+ |---|---|---|
234
+ | easy | 0.50 | Rule engine alone hits ~91% — LLM assists, doesn't override |
235
+ | medium | 0.70 | Decoys require LLM judgment, but cascade must stay |
236
+ | hard | 0.85 | LLM needs latitude for evasion adaptation |
237
+
238
+ `reflection_factor` gates α: the LLM must accumulate ≥4 post-episode lessons before reaching meaningful trust, regardless of raw win rate.
239
+
240
+ **Blending decision:**
241
+ ```python
242
+ rule_action, rule_conf = get_rule_action(obs) # deterministic, with confidence score
243
+ llm_action, _ = get_action(obs, ...) # Qwen3 via Bedrock
244
+
245
+ if rule_action == llm_action: final = llm_action # agree
246
+ elif rule_conf >= alpha: final = rule_action # rule overrides
247
+ else: final = llm_action # LLM trusted
248
+ ```
249
+
250
+ Rule confidences: SUBMIT-forced=1.00, INSPECT-suspect=0.95, FLAG-high-risk=0.95, FLAG-threshold=0.70+, INSPECT-explore=0.30. At `α=0.50` (easy cap), safety decisions (suspects, forced submit) always override; exploration goes to the LLM.
251
+
252
+ **Reflexion learning:** After each episode, Qwen3 generates a 2–3 sentence lesson from the action log and outcome. Lessons are stored in `memory/reflections_{task}.jsonl` and injected into every future prompt (last 4 lessons + best winning trajectory as few-shot example). Memory persists across container restarts via Docker volume.
253
+
254
+ ---
255
+
256
+ ## API Reference
257
+
258
+ | Endpoint | Method | Description |
259
+ |---|---|---|
260
+ | `/health` | GET | `{"status": "healthy"}` |
261
+ | `/tasks` | GET | Task list + `action_schema` + `score_range: [0.0, 1.0]` |
262
+ | `/reset` | POST | `{task, seed}` → initial observation |
263
+ | `/step` | POST | `{action_type, account_id?}` → updated observation |
264
+ | `/state` | GET | Episode metadata (step count, task, score, evasion count) |
265
+ | `/grader` | GET | Normalised [0.0, 1.0] score after SUBMIT (400 if not done) |
266
+ | `/baseline` | POST | Runs rule-based agent on all 3 tasks, seed=0 |
267
+ | `/metadata` | GET | OpenEnv metadata block |
268
+ | `/schema` | GET | Full JSON schema for actions and observations |
269
+ | `/mcp` | POST | JSON-RPC 2.0 tool discovery (Model Context Protocol) |
270
+
271
+ Live: `https://pandago-graphstrike.hf.space`
272
+
273
+ ---
274
+
275
+ ## File Structure
276
+
277
+ ```
278
+ server/
279
+ app.py — FastAPI + Gradio UI (gr.mount_gradio_app)
280
+ environment.py — Episode lifecycle, action mechanics, cascade logic
281
+ generator.py — Deterministic episode generation (150 JSON files)
282
+ scoring.py — Stateless risk formula functions
283
+ models.py — Pydantic models: AccountProfile, FakeGangObservation, ActionType
284
+
285
+ agent/
286
+ policy.py — Qwen3 prompt construction + action parsing
287
+ hybrid_policy.py — Alpha blending, rule engine with confidence scores
288
+ reflection.py — Post-episode lesson generation
289
+ memory.py — JSONL persistence for reflections, trajectories, alpha
290
+
291
+ inference.py — Submission entrypoint: [START]/[STEP]/[END] structured logs, OpenAI client
292
+ validate.py — 24-point pre-submission validator (local + HTTP)
293
+ train.py — Full training loop with curriculum
294
+ episodes/ — 150 pre-generated JSON episode files (baked into Docker image)
295
+ memory/ — Docker volume: reflections, win history, alpha values
296
+ ```
297
+
298
+ ---
299
+
300
+ ## Baseline Scores
301
+
302
+ | Task | Seed=0 | Win rate (50 seeds) | Mean (50 seeds) |
303
+ |---|---|---|---|
304
+ | easy | 0.910 | 100% | ~0.91 |
305
+ | medium | 0.906 | 84% | ~0.77 |
306
+ | hard | 0.9038 | 52% | ~0.47 |
307
+
308
+ The rule-based baseline (no LLM) is competitive on easy/medium. Hard is the real differentiator — evasion events drop intra-gang edges mid-investigation, destroying graph signals. Frontier LLM agents with accumulated reflections adapt; the rule engine degrades.
309
+
310
+ ---
311
+
312
+ *Built by team computeXor*
eval-models/deepseek_test_judge_eval.py ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Judge Evaluation Simulator
4
+ ==========================
5
+
6
+ Simulates EXACTLY how hackathon judges will evaluate your environment:
7
+
8
+ 1. Baseline re-run: POST /baseline → verify scores are stable
9
+ 2. Standard Open LLM agent: Run an LLM (via HF router) against all 3 tasks
10
+ 3. Score variance check: Run same task multiple seeds, check variance
11
+
12
+ USAGE:
13
+ # Against live HF Space (requires HF_TOKEN):
14
+ export HF_TOKEN="hf_..."
15
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space
16
+
17
+ # Against local server:
18
+ export HF_TOKEN="hf_..."
19
+ python test_judge_eval.py --url http://localhost:7860
20
+
21
+ # Choose model (default: deepseek.v3.2):
22
+ export MODEL_NAME="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
23
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space
24
+
25
+ # Just test endpoints (no LLM needed):
26
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space --endpoints-only
27
+ """
28
+
29
+ import json
30
+ import os
31
+ import sys
32
+ import time
33
+ import urllib.request
34
+ from typing import Dict, List, Optional
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Config from env vars (same as judges will set)
38
+ # ---------------------------------------------------------------------------
39
+
40
+ HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
41
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
42
+ MODEL_NAME = os.getenv("MODEL_NAME", "deepseek.v3.2")
43
+
44
+ # AWS Bedrock config (use --bedrock flag)
45
+ LLM_BACKEND = "hf" # "hf" or "bedrock"
46
+ BEDROCK_MODEL_ID = os.getenv("BEDROCK_MODEL_ID", "deepseek.v3.2")
47
+
48
+ SYSTEM_PROMPT = """You are an AI detective finding 10 coordinated fake accounts in a social network.
49
+
50
+ ACTIONS (reply with exactly ONE line):
51
+ - INSPECT acc_XXXX — reveal profile (costs 1 step)
52
+ - FLAG acc_XXXX — mark as fake (FREE, no step cost, triggers suspect cascade)
53
+ - SUBMIT — end episode, get scored
54
+
55
+ DECISION RULES (apply top-to-bottom, first match wins):
56
+ 1. If suspect_ids lists accounts you haven't inspected → INSPECT the first one
57
+ 2. If ANY profiled account has shared_ip_count >= 5 and is NOT flagged → FLAG it immediately
58
+ 3. If ANY profiled account has photo_reuse >= 0.50 AND bio_template >= 0.40 and hub < 0.70 and NOT flagged → FLAG it
59
+ 4. If ANY profiled account has fake_risk_score >= 0.30 and hub < 0.70 and NOT flagged → FLAG it
60
+ 5. If there are uninspected visible accounts and steps > 3 → INSPECT the next one
61
+ 6. If you have 10 flags OR steps <= 3 → SUBMIT
62
+
63
+ IMPORTANT:
64
+ - FLAG is FREE (costs 0 steps) — flag aggressively when you see suspicious signals
65
+ - After each FLAG, new suspects appear — always inspect suspects before other accounts
66
+ - hub_legitimacy_score > 0.70 means celebrity — do NOT flag
67
+ - shared_ip_count >= 5 is the strongest gang signal (all 10 share one IP)
68
+ - Do NOT re-inspect already inspected accounts
69
+
70
+ Reply with EXACTLY one line, nothing else:
71
+ FLAG acc_XXXX
72
+ INSPECT acc_XXXX
73
+ SUBMIT"""
74
+
75
+
76
+ # ---------------------------------------------------------------------------
77
+ # HTTP helpers
78
+ # ---------------------------------------------------------------------------
79
+
80
+ def _retry(fn, retries=3, backoff=3):
81
+ """Retry a function on network errors."""
82
+ for attempt in range(retries):
83
+ try:
84
+ return fn()
85
+ except OSError as e:
86
+ if attempt == retries - 1:
87
+ raise
88
+ wait = backoff * (attempt + 1)
89
+ print(f" [RETRY] Network error: {e} — retrying in {wait}s ({attempt+1}/{retries})")
90
+ time.sleep(wait)
91
+
92
+
93
def http_post(url: str, body: Optional[dict] = None) -> dict:
    """POST ``body`` as JSON to ``url`` and return the decoded JSON reply.

    A missing or empty ``body`` is sent as ``{}``. The request uses a
    120-second timeout and is executed through ``_retry``, so ``OSError``
    failures are retried before being raised.
    """
    def _do():
        payload = json.dumps(body or {}).encode()
        request = urllib.request.Request(
            url,
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=120) as resp:
            raw = resp.read()
        return json.loads(raw)

    return _retry(_do)
104
+
105
+
106
def http_get(url: str, expect_json: bool = True) -> dict:
    """GET ``url`` and return the response as a dict.

    With ``expect_json`` true the body is parsed as JSON and returned.
    Otherwise (e.g. an HTML page) a small summary dict with ``_status`` and
    ``_body_len`` is returned instead of the body. The request uses a
    120-second timeout and is executed through ``_retry``.
    """
    def _do():
        with urllib.request.urlopen(url, timeout=120) as resp:
            payload = resp.read()
            if expect_json:
                return json.loads(payload)
            return {"_status": resp.status, "_body_len": len(payload)}

    return _retry(_do)
114
+
115
+
116
+ # ---------------------------------------------------------------------------
117
+ # LLM call via OpenAI-compatible API
118
+ # ---------------------------------------------------------------------------
119
+
120
def _call_hf(prompt: str) -> str:
    """Send ``prompt`` to the OpenAI-compatible endpoint and return the reply text.

    The client is built from ``API_BASE_URL`` and ``HF_TOKEN``; the request
    uses ``MODEL_NAME`` with ``SYSTEM_PROMPT`` as the system message,
    temperature 0.3 and a 256-token cap. An empty/``None`` reply yields ``""``.
    """
    from openai import OpenAI

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    resp = client.chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        temperature=0.3,
        max_tokens=256,
    )
    reply = resp.choices[0].message.content
    return (reply or "").strip()
134
+
135
+
136
def _call_bedrock(prompt: str) -> str:
    """Send ``prompt`` to AWS Bedrock and return the reply text.

    Uses the client's ``converse()`` method when the installed boto3 exposes
    it; otherwise falls back to ``invoke_model()`` with an OpenAI-style JSON
    body and extracts the text from whichever response layout comes back
    (``choices``, ``content`` or ``output``). Region and credentials are read
    from the standard ``AWS_*`` environment variables.
    """
    import boto3

    client = boto3.client(
        service_name="bedrock-runtime",
        region_name=os.getenv("AWS_DEFAULT_REGION", "us-east-1"),
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    )

    # Preferred path: converse API (boto3 >= 1.34.x)
    if hasattr(client, "converse"):
        reply = client.converse(
            modelId=BEDROCK_MODEL_ID,
            messages=[{"role": "user", "content": [{"text": prompt}]}],
            system=[{"text": SYSTEM_PROMPT}],
            inferenceConfig={"maxTokens": 256, "temperature": 0.3},
        )
        first_part = reply["output"]["message"]["content"][0]
        return first_part["text"].strip()

    # Fallback path: invoke_model with a chat-style JSON payload
    request_body = json.dumps({
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 256,
        "temperature": 0.3,
    })
    raw = client.invoke_model(
        modelId=BEDROCK_MODEL_ID,
        contentType="application/json",
        accept="application/json",
        body=request_body,
    )
    result = json.loads(raw["body"].read())

    # The response may be OpenAI-style or one of the Bedrock-native layouts.
    if "choices" in result:
        return result["choices"][0]["message"]["content"].strip()
    if "content" in result:
        content = result["content"]
        if not isinstance(content, list):
            return str(content).strip()
        return content[0].get("text", "").strip()
    if "output" in result:
        return result["output"].get("text", "").strip()
    return str(result).strip()
181
+
182
+
183
def call_llm(prompt: str) -> str:
    """Query the configured LLM backend, retrying up to three times.

    The backend is ``_call_bedrock`` when ``LLM_BACKEND`` is ``"bedrock"``
    and ``_call_hf`` otherwise. ``<think>...</think>`` blocks are removed
    from the reply; if nothing is left after removal the raw reply is
    returned instead. Any exception triggers a retry after 3s, then 6s; once
    the third attempt fails the error is printed and ``""`` is returned.
    Setting the ``DEBUG_LLM`` environment variable prints the first 200
    characters of each raw reply.
    """
    import re

    backend_call = _call_hf if LLM_BACKEND != "bedrock" else _call_bedrock
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            raw = backend_call(prompt)
            if os.getenv("DEBUG_LLM"):
                print(f" [LLM RAW] {raw[:200]}")
            # Drop reasoning blocks so only the final answer is parsed.
            cleaned = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
            return cleaned or raw
        except Exception as e:
            if attempt == max_attempts - 1:
                print(f" [LLM ERROR] {e} (gave up after 3 attempts)")
                return ""
            wait = 3 * (attempt + 1)
            print(f" [LLM RETRY] {e} — retrying in {wait}s")
            time.sleep(wait)
    return ""
203
+
204
+
205
def format_obs(obs: dict) -> str:
    """Render an observation dict as the text prompt shown to the LLM.

    The output lists, in order: task and remaining steps, the flagged IDs,
    any suspects not yet inspected, the profiled accounts split into
    "flag these" / "already flagged" / "clean" groups (sorted by descending
    ``fake_risk_score``; the last two groups are capped at five entries),
    up to eight uninspected visible IDs, and the environment message.
    Missing keys fall back to empty lists, ``0`` or ``"?"``.
    """
    flagged = obs.get("flagged_ids", [])
    inspected = obs.get("inspected_ids", [])

    lines = [
        f"TASK: {obs.get('task','?').upper()} | Steps remaining: {obs.get('steps_remaining','?')}",
        f"Flagged ({len(flagged)}/10): {', '.join(flagged) if flagged else 'none'}",
    ]

    pending_suspects = [sid for sid in obs.get("suspect_ids", []) if sid not in inspected]
    if pending_suspects:
        lines.append(f"*** SUSPECTS (uninspected) → INSPECT THESE FIRST: {', '.join(pending_suspects)} ***")

    def _strong_signals(acc):
        # Shared IP alone is enough; otherwise both photo and bio must be high.
        if acc.get("shared_ip_count", 0) >= 5:
            return True
        return acc.get("photo_reuse_score", 0) >= 0.50 and acc.get("bio_template_score", 0) >= 0.40

    accounts = obs.get("visible_accounts", [])
    if accounts:
        to_flag, already_flagged, clean = [], [], []
        ranked = sorted(accounts, key=lambda acc: acc.get("fake_risk_score", 0), reverse=True)
        for acc in ranked:
            if acc.get("account_id", "?") in flagged:
                bucket = already_flagged
            elif _strong_signals(acc):
                bucket = to_flag
            else:
                bucket = clean
            bucket.append(acc)

        if to_flag:
            lines.append(f"\n!!! ACTION NEEDED — FLAG THESE ({len(to_flag)} accounts with strong fake signals):")
            for acc in to_flag:
                aid = acc.get("account_id", "?")
                lines.append(f" → FLAG {aid}: risk={acc.get('fake_risk_score',0):.3f} photo={acc.get('photo_reuse_score',0):.2f} bio={acc.get('bio_template_score',0):.2f} ip_shared={acc.get('shared_ip_count',0)} hub={acc.get('hub_legitimacy_score',0):.2f}")

        if already_flagged:
            lines.append(f"\nALREADY FLAGGED ({len(already_flagged)}):")
            lines.extend(f" ✓ {acc.get('account_id','?')}" for acc in already_flagged[:5])

        if clean:
            lines.append(f"\nCLEAN ACCOUNTS ({len(clean)}):")
            for acc in clean[:5]:
                aid = acc.get("account_id", "?")
                hub = acc.get("hub_legitimacy_score", 0)
                hub_mark = " [CELEBRITY]" if hub > 0.70 else ""
                lines.append(f" {aid}: risk={acc.get('fake_risk_score',0):.3f} photo={acc.get('photo_reuse_score',0):.2f} bio={acc.get('bio_template_score',0):.2f} hub={hub:.2f}{hub_mark}")

    uninspected = [vid for vid in obs.get("visible_account_ids", []) if vid not in inspected]
    if uninspected:
        more = "..." if len(uninspected) > 8 else ""
        lines.append(f"\nUninspected IDs ({len(uninspected)}): {', '.join(uninspected[:8])}{more}")

    lines.append(f"\nMessage: {obs.get('message', '')}")
    return "\n".join(lines)
261
+
262
+
263
def parse_action(llm_text: str, obs: dict) -> dict:
    """Turn the LLM's reply into an action dict for the ``/step`` endpoint.

    The reply is scanned line by line; the first line that starts with
    ``INSPECT``, ``FLAG``, ``INVESTIGATE_NETWORK`` or ``UNFLAG`` followed by
    a space (case-insensitive), or that is exactly ``SUBMIT``, decides the
    action. For targeted actions only the first whitespace-delimited token
    after the verb is used as ``account_id`` (lower-cased), so trailing
    commentary such as ``FLAG acc_0001 because ...`` no longer leaks into
    the id.

    If no line matches, the fallback is: inspect the first uninspected
    suspect, else the first uninspected visible account, else submit.

    Args:
        llm_text: Raw (already cleaned) text returned by the model.
        obs: Current observation; ``suspect_ids``, ``inspected_ids`` and
            ``visible_account_ids`` are read for the fallback.

    Returns:
        A dict with ``action_type`` and, for targeted actions, ``account_id``.
    """
    targeted_prefixes = ("INSPECT ", "FLAG ", "INVESTIGATE_NETWORK ", "UNFLAG ")
    for line in llm_text.split("\n"):
        line = line.strip()
        upper = line.upper()
        if upper.startswith(targeted_prefixes):
            # Split on every whitespace run: token 0 is the verb, token 1 the
            # account id; anything after that is model chatter and is dropped.
            tokens = line.split()
            account_id = tokens[1].lower() if len(tokens) > 1 else None
            return {"action_type": tokens[0].lower(), "account_id": account_id}
        if upper == "SUBMIT":
            return {"action_type": "submit"}

    # Fallback: suspects take priority over ordinary visible accounts.
    inspected = obs.get("inspected_ids", [])
    for key in ("suspect_ids", "visible_account_ids"):
        for candidate in obs.get(key, []):
            if candidate not in inspected:
                return {"action_type": "inspect", "account_id": candidate}
    return {"action_type": "submit"}
285
+
286
+
287
+ # ---------------------------------------------------------------------------
288
+ # Test phases
289
+ # ---------------------------------------------------------------------------
290
+
291
def test_endpoints(base_url: str) -> bool:
    """Phase 0: hit every required endpoint once and report which ones fail.

    Returns True only when every request completes without raising.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("PHASE 0: Endpoint Verification")
    print(banner)

    # (HTTP method, path, JSON body for POST, whether the reply is JSON)
    probes = (
        ("GET", "/health", None, True),
        ("GET", "/tasks", None, True),
        ("GET", "/metadata", None, True),
        ("GET", "/schema", None, True),
        ("GET", "/web", None, False),  # returns HTML, not JSON
        ("POST", "/reset", {"task": "easy", "seed": 0}, True),
        ("GET", "/state", None, True),
        ("POST", "/step", {"action_type": "inspect", "account_id": "acc_0000"}, True),
        ("POST", "/step", {"action_type": "submit"}, True),
        ("GET", "/grader", None, True),
        ("POST", "/mcp", {"jsonrpc": "2.0", "method": "tools/list", "id": 1}, True),
        ("POST", "/baseline", None, True),
    )

    failures = 0
    for verb, route, payload, wants_json in probes:
        target = f"{base_url}{route}"
        try:
            if verb == "GET":
                http_get(target, expect_json=wants_json)
            else:
                http_post(target, payload)
        except Exception as err:
            print(f" ✗ {verb} {route} — {err}")
            failures += 1
        else:
            print(f" ✓ {verb} {route}")

    return failures == 0
325
+
326
+
327
def test_baseline_stability(base_url: str) -> bool:
    """Phase 1: call POST /baseline three times and compare the score dicts.

    Returns True when all three responses carry identical ``scores``.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("PHASE 1: Baseline Stability (3 runs)")
    print(banner)

    collected = []
    for run_no in range(1, 4):
        scores = http_post(f"{base_url}/baseline")["scores"]
        collected.append(scores)
        print(f" Run {run_no}: easy={scores['easy']:.4f} medium={scores['medium']:.4f} hard={scores['hard']:.4f}")

    # Every run must match the first one exactly.
    reference = collected[0]
    stable = not any(run != reference for run in collected)
    if not stable:
        print(" ✗ SCORES DIFFER — baseline is non-deterministic!")
    else:
        print(" ✓ All 3 runs identical — baseline is deterministic")
    return stable
347
+
348
+
349
def test_llm_agent(base_url: str, task: str, seed: int = 0, max_steps: int = 200) -> float:
    """Phase 2: run the LLM agent on one task and return the grader score.

    Resets the environment, then loops: format the observation, ask the LLM,
    parse its reply into an action and POST it to /step until the server
    reports ``done``.  Finally reads the score from GET /grader.

    Args:
        base_url: Environment server URL without a trailing slash.
        task: Task name sent to /reset.
        seed: Seed sent to /reset.
        max_steps: Safety cap on agent actions.  The system prompt describes
            FLAG as costing no steps, so a model that keeps flagging could
            otherwise keep the episode open forever; once the cap is hit a
            single SUBMIT is forced and the loop ends.

    Returns:
        The ``score`` field of the /grader response.
    """
    _model = f"Bedrock/{BEDROCK_MODEL_ID}" if LLM_BACKEND == "bedrock" else MODEL_NAME
    print(f"\n --- LLM Agent: task={task}, seed={seed}, model={_model} ---")

    # Reset
    reset_resp = http_post(f"{base_url}/reset", {"task": task, "seed": seed})
    obs = reset_resp.get("observation", reset_resp)
    done = reset_resp.get("done", False)

    step_num = 0
    while not done:
        step_num += 1
        forced = step_num > max_steps
        if forced:
            print(f" [GUARD] Episode still open after {max_steps} actions — forcing SUBMIT")
            action = {"action_type": "submit"}
        else:
            prompt = format_obs(obs)
            llm_text = call_llm(prompt)
            action = parse_action(llm_text, obs)

        # `or ''` keeps a null account id from being printed as "None".
        action_str = f"{action['action_type'].upper()} {action.get('account_id') or ''}".strip()

        step_resp = http_post(f"{base_url}/step", action)
        obs = step_resp.get("observation", step_resp)
        done = step_resp.get("done", False)
        reward = step_resp.get("reward")

        flagged_n = len(obs.get("flagged_ids", []))
        suspects_n = len(obs.get("suspect_ids", []))
        steps_left = obs.get("steps_remaining", "?")

        print(f" Step {step_num:2d}: {action_str:35s} flagged={flagged_n}/10 suspects={suspects_n} steps_left={steps_left}")

        if done and reward is not None:
            msg = step_resp.get("message", obs.get("message", "")) or ""
            print(f" → Episode ended: {str(msg)[:100]}")

        if forced:
            # Never loop again after the forced SUBMIT, whatever the server says.
            break

    # Get grader score
    grader = http_get(f"{base_url}/grader")
    score = grader["score"]
    print(f" ★ GRADER SCORE: {score:.4f}")
    return score
388
+
389
+
390
def test_llm_all_tasks(base_url: str) -> Dict[str, float]:
    """Phase 2: run the LLM agent once (seed 0) on easy, medium and hard.

    Returns a mapping from task name to grader score.
    """
    banner = "=" * 60
    if LLM_BACKEND == "bedrock":
        _model = f"Bedrock/{BEDROCK_MODEL_ID}"
    else:
        _model = MODEL_NAME
    print("\n" + banner)
    print(f"PHASE 2: LLM Agent Evaluation (model={_model})")
    print(banner)

    # Dict comprehensions evaluate in order, so tasks still run easy → hard.
    scores = {
        task: test_llm_agent(base_url, task=task, seed=0)
        for task in ("easy", "medium", "hard")
    }

    print(f"\n Summary: easy={scores['easy']:.4f} medium={scores['medium']:.4f} hard={scores['hard']:.4f}")
    return scores
403
+
404
+
405
def test_variance(base_url: str, seeds: Optional[List[int]] = None) -> None:
    """Phase 3: run every task over several seeds and print mean and variance.

    Args:
        base_url: Environment server URL without a trailing slash.
        seeds: Seeds to evaluate per task.  ``None`` means ``[0, 1, 2, 3, 4]``
            (a ``None`` sentinel replaces the former mutable list default).
            An empty list skips the phase instead of dividing by zero.
    """
    if seeds is None:
        seeds = [0, 1, 2, 3, 4]

    print("\n" + "="*60)
    print(f"PHASE 3: Score Variance (seeds={seeds})")
    print("="*60)

    if not seeds:
        print(" (no seeds given — skipping variance check)")
        return

    for task in ["easy", "medium", "hard"]:
        task_scores = [test_llm_agent(base_url, task=task, seed=seed) for seed in seeds]

        # Population variance (divide by N), as before.
        mean = sum(task_scores) / len(task_scores)
        variance = sum((s - mean) ** 2 for s in task_scores) / len(task_scores)
        print(f"\n {task}: scores={[f'{s:.3f}' for s in task_scores]} mean={mean:.4f} var={variance:.6f}")
420
+
421
+
422
+ # ---------------------------------------------------------------------------
423
+ # Main
424
+ # ---------------------------------------------------------------------------
425
+
426
if __name__ == "__main__":
    # Script entry point: parse CLI flags, then run phases 0-3 in order.
    import argparse

    parser = argparse.ArgumentParser(description="Judge Evaluation Simulator for GraphStrike")
    parser.add_argument("--url", required=True, help="Environment server URL")
    parser.add_argument("--bedrock", action="store_true", help="Use AWS Bedrock instead of HF router")
    parser.add_argument("--endpoints-only", action="store_true", help="Only test endpoints (no LLM)")
    parser.add_argument("--skip-variance", action="store_true", help="Skip variance check (faster)")
    parser.add_argument("--seeds", type=int, default=3, help="Number of seeds for variance check")
    args = parser.parse_args()

    if args.bedrock:
        LLM_BACKEND = "bedrock"

    base = args.url.rstrip("/")
    use_bedrock = LLM_BACKEND == "bedrock"
    model_display = f"Bedrock/{BEDROCK_MODEL_ID}" if use_bedrock else MODEL_NAME
    # Report the credential the selected backend will actually use.
    have_creds = bool(os.getenv("AWS_ACCESS_KEY_ID") if use_bedrock else HF_TOKEN)
    print("GraphStrike Judge Evaluation Simulator")
    print(f"Target: {base}")
    print(f"Backend: {LLM_BACKEND}")
    print(f"Model: {model_display}")
    print(f"Token: {'set' if have_creds else 'NOT SET'}")

    # Phase 0: Endpoints
    if not test_endpoints(base):
        print("\n✗ Endpoint check failed. Fix before proceeding.")
        sys.exit(1)

    # Phase 1: Baseline stability (result feeds the exit status below)
    baseline_ok = test_baseline_stability(base)

    if args.endpoints_only:
        print("\n✓ Endpoint-only mode — skipping LLM tests.")
        sys.exit(0 if baseline_ok else 1)

    if use_bedrock:
        if not os.getenv("AWS_ACCESS_KEY_ID"):
            print("\n✗ AWS_ACCESS_KEY_ID not set. Cannot run Bedrock LLM tests.")
            sys.exit(1)
    elif not HF_TOKEN:
        print("\n✗ HF_TOKEN not set. Cannot run LLM agent tests.")
        print(" export HF_TOKEN='hf_...' OR use --bedrock with AWS creds")
        sys.exit(1)

    # Phase 2: LLM on all tasks
    scores = test_llm_all_tasks(base)

    # Phase 3: Variance (a non-positive --seeds would give an empty seed list)
    if not args.skip_variance and args.seeds > 0:
        test_variance(base, seeds=list(range(args.seeds)))

    print("\n" + "="*60)
    print("EVALUATION COMPLETE")
    print("="*60)

    if not baseline_ok:
        # Surface the Phase 1 failure to callers and CI through the exit code.
        print("\n✗ Baseline scores were not stable (see PHASE 1).")
        sys.exit(1)
eval-models/gemma_test_judge_eval.py ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Judge Evaluation Simulator
4
+ ==========================
5
+
6
+ Simulates EXACTLY how hackathon judges will evaluate your environment:
7
+
8
+ 1. Baseline re-run: POST /baseline → verify scores are stable
9
+ 2. Standard Open LLM agent: Run an LLM (via HF router) against all 3 tasks
10
+ 3. Score variance check: Run same task multiple seeds, check variance
11
+
12
+ USAGE:
13
+ # Against live HF Space (requires HF_TOKEN):
14
+ export HF_TOKEN="hf_..."
15
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space
16
+
17
+ # Against local server:
18
+ export HF_TOKEN="hf_..."
19
+ python test_judge_eval.py --url http://localhost:7860
20
+
21
+ # Choose model (default: google.gemma-3-12b-it):
22
+ export MODEL_NAME="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
23
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space
24
+
25
+ # Just test endpoints (no LLM needed):
26
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space --endpoints-only
27
+ """
28
+
29
+ import json
30
+ import os
31
+ import sys
32
+ import time
33
+ import urllib.request
34
+ from typing import Dict, List, Optional
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Config from env vars (same as judges will set)
38
+ # ---------------------------------------------------------------------------
39
+
40
+ HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
41
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
42
+ MODEL_NAME = os.getenv("MODEL_NAME", "google.gemma-3-12b-it")
43
+
44
+ # AWS Bedrock config (use --bedrock flag)
45
+ LLM_BACKEND = "hf" # "hf" or "bedrock"
46
+ BEDROCK_MODEL_ID = os.getenv("BEDROCK_MODEL_ID", "google.gemma-3-12b-it")
47
+
48
+ SYSTEM_PROMPT = """You are an AI detective finding 10 coordinated fake accounts in a social network.
49
+
50
+ ACTIONS (reply with exactly ONE line):
51
+ - INSPECT acc_XXXX — reveal profile (costs 1 step)
52
+ - FLAG acc_XXXX — mark as fake (FREE, no step cost, triggers suspect cascade)
53
+ - SUBMIT — end episode, get scored
54
+
55
+ DECISION RULES (apply top-to-bottom, first match wins):
56
+ 1. If suspect_ids lists accounts you haven't inspected → INSPECT the first one
57
+ 2. If ANY profiled account has shared_ip_count >= 5 and is NOT flagged → FLAG it immediately
58
+ 3. If ANY profiled account has photo_reuse >= 0.50 AND bio_template >= 0.40 and hub < 0.70 and NOT flagged → FLAG it
59
+ 4. If ANY profiled account has fake_risk_score >= 0.30 and hub < 0.70 and NOT flagged → FLAG it
60
+ 5. If there are uninspected visible accounts and steps > 3 → INSPECT the next one
61
+ 6. If you have 10 flags OR steps <= 3 → SUBMIT
62
+
63
+ IMPORTANT:
64
+ - FLAG is FREE (costs 0 steps) — flag aggressively when you see suspicious signals
65
+ - After each FLAG, new suspects appear — always inspect suspects before other accounts
66
+ - hub_legitimacy_score > 0.70 means celebrity — do NOT flag
67
+ - shared_ip_count >= 5 is the strongest gang signal (all 10 share one IP)
68
+ - Do NOT re-inspect already inspected accounts
69
+
70
+ Reply with EXACTLY one line, nothing else:
71
+ FLAG acc_XXXX
72
+ INSPECT acc_XXXX
73
+ SUBMIT"""
74
+
75
+
76
+ # ---------------------------------------------------------------------------
77
+ # HTTP helpers
78
+ # ---------------------------------------------------------------------------
79
+
80
+ def _retry(fn, retries=3, backoff=3):
81
+ """Retry a function on network errors."""
82
+ for attempt in range(retries):
83
+ try:
84
+ return fn()
85
+ except OSError as e:
86
+ if attempt == retries - 1:
87
+ raise
88
+ wait = backoff * (attempt + 1)
89
+ print(f" [RETRY] Network error: {e} — retrying in {wait}s ({attempt+1}/{retries})")
90
+ time.sleep(wait)
91
+
92
+
93
def http_post(url: str, body: Optional[dict] = None) -> dict:
    """POST ``body`` (or ``{}`` when falsy) as JSON and return the decoded JSON reply.

    The request goes through ``_retry``; each attempt has a 120 s timeout.
    """
    json_headers = {"Content-Type": "application/json"}

    def _send():
        payload = json.dumps(body if body else {}).encode()
        request = urllib.request.Request(
            url,
            data=payload,
            headers=json_headers,
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=120) as response:
            raw = response.read()
        return json.loads(raw)

    return _retry(_send)
104
+
105
+
106
def http_get(url: str, expect_json: bool = True) -> dict:
    """GET ``url`` through ``_retry`` with a 120 s timeout per attempt.

    Returns the decoded JSON body, or — when ``expect_json`` is false — a
    small dict with the HTTP status (``_status``) and body size (``_body_len``).
    """
    def _fetch():
        with urllib.request.urlopen(url, timeout=120) as response:
            raw = response.read()
            status = response.status
        if expect_json:
            return json.loads(raw)
        return {"_status": status, "_body_len": len(raw)}

    return _retry(_fetch)
114
+
115
+
116
+ # ---------------------------------------------------------------------------
117
+ # LLM call via OpenAI-compatible API
118
+ # ---------------------------------------------------------------------------
119
+
120
def _call_hf(prompt: str) -> str:
    """Send ``prompt`` to the OpenAI-compatible endpoint at ``API_BASE_URL``.

    Uses ``MODEL_NAME`` with ``SYSTEM_PROMPT`` as the system message and
    returns the first choice's text, stripped ("" when the content is empty).
    """
    from openai import OpenAI

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    completion = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN).chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        temperature=0.3,
        max_tokens=256,
    )
    text = completion.choices[0].message.content
    if not text:
        text = ""
    return text.strip()
134
+
135
+
136
def _call_bedrock(prompt: str) -> str:
    """Send ``prompt`` to ``BEDROCK_MODEL_ID`` on AWS Bedrock and return the reply text.

    Uses the ``converse`` API when the installed boto3 client exposes it;
    otherwise (older boto3) uses ``invoke_model`` with an OpenAI-style body.
    The choice depends only on what the client offers — a failing
    ``converse`` call is not retried through ``invoke_model``.

    Credentials are read from the standard AWS environment variables.
    ``AWS_SESSION_TOKEN`` is forwarded as well: passing only the key and
    secret explicitly would break temporary (STS/SSO) credentials.
    """
    import boto3
    client = boto3.client(
        service_name="bedrock-runtime",
        region_name=os.getenv("AWS_DEFAULT_REGION", "us-east-1"),
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
        aws_session_token=os.getenv("AWS_SESSION_TOKEN"),
    )
    # Preferred path: converse API (boto3 >= 1.34.x)
    if hasattr(client, "converse"):
        resp = client.converse(
            modelId=BEDROCK_MODEL_ID,
            messages=[{"role": "user", "content": [{"text": prompt}]}],
            system=[{"text": SYSTEM_PROMPT}],
            inferenceConfig={"maxTokens": 256, "temperature": 0.3},
        )
        blocks = resp["output"]["message"]["content"]
        # An empty content list means "no text", not a crash.
        return blocks[0].get("text", "").strip() if blocks else ""

    # Older boto3: invoke_model
    body = json.dumps({
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 256,
        "temperature": 0.3,
    })
    resp = client.invoke_model(
        modelId=BEDROCK_MODEL_ID,
        contentType="application/json",
        accept="application/json",
        body=body,
    )
    result = json.loads(resp["body"].read())
    # Handle both OpenAI-style and Bedrock-native response formats
    if "choices" in result:
        return (result["choices"][0]["message"]["content"] or "").strip()
    if "content" in result:
        content = result["content"]
        if isinstance(content, list):
            return content[0].get("text", "").strip() if content else ""
        return str(content).strip()
    if "output" in result:
        return result["output"].get("text", "").strip()
    return str(result).strip()
181
+
182
+
183
def call_llm(prompt: str) -> str:
    """Query the configured backend, retrying up to three times.

    ``LLM_BACKEND == "bedrock"`` selects ``_call_bedrock``; anything else
    selects ``_call_hf``.  ``<think>...</think>`` blocks are removed from the
    reply; if nothing remains, the raw reply is returned.  After three failed
    attempts the error is printed and "" is returned.
    """
    import re

    if LLM_BACKEND == "bedrock":
        backend = _call_bedrock
    else:
        backend = _call_hf

    max_attempts = 3
    for attempt_no in range(1, max_attempts + 1):
        try:
            raw = backend(prompt)
            if os.getenv("DEBUG_LLM"):
                print(f" [LLM RAW] {raw[:200]}")
            # Strip Qwen3 <think>...</think> reasoning blocks
            without_thoughts = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
            return without_thoughts or raw
        except Exception as e:
            if attempt_no == max_attempts:
                print(f" [LLM ERROR] {e} (gave up after 3 attempts)")
                return ""
            wait = 3 * attempt_no
            print(f" [LLM RETRY] {e} — retrying in {wait}s")
            time.sleep(wait)
    return ""
203
+
204
+
205
def format_obs(obs: dict) -> str:
    """Render an observation dict as the text prompt shown to the LLM.

    Sections, in order: task and steps header, flagged ids, uninspected
    suspects, then the visible accounts (sorted by ``fake_risk_score``,
    highest first) split into "flag these", "already flagged" (first 5) and
    "clean" (first 5), the uninspected ids (first 8) and the server message.

    An unflagged account is listed under "FLAG THESE" when it has
    ``shared_ip_count >= 5``, or when ``photo_reuse_score >= 0.50`` and
    ``bio_template_score >= 0.40`` and ``hub_legitimacy_score < 0.70``.
    The hub condition mirrors rule 3 of the system prompt; without it a
    celebrity hub was advertised for flagging while the prompt forbids it.

    Args:
        obs: Observation returned by /reset or /step; missing keys are
            treated as empty.

    Returns:
        The multi-line prompt text.
    """
    lines = []
    lines.append(f"TASK: {obs.get('task','?').upper()} | Steps remaining: {obs.get('steps_remaining','?')}")

    flagged = obs.get("flagged_ids", [])
    lines.append(f"Flagged ({len(flagged)}/10): {', '.join(flagged) if flagged else 'none'}")

    suspects = obs.get("suspect_ids", [])
    inspected = obs.get("inspected_ids", [])
    # Sets make the repeated membership tests below O(1).
    flagged_set = set(flagged)
    inspected_set = set(inspected)

    uninspected_suspects = [s for s in suspects if s not in inspected_set]
    if uninspected_suspects:
        lines.append(f"*** SUSPECTS (uninspected) → INSPECT THESE FIRST: {', '.join(uninspected_suspects)} ***")

    accounts = obs.get("visible_accounts", [])
    if accounts:
        # Split: unflagged accounts that should be flagged vs rest
        unflagged_suspicious = []
        flagged_accs = []
        clean_accs = []
        for a in sorted(accounts, key=lambda x: x.get("fake_risk_score", 0), reverse=True):
            aid = a.get("account_id", "?")
            shared_ip = a.get("shared_ip_count", 0) >= 5
            templated = (a.get("photo_reuse_score", 0) >= 0.50
                         and a.get("bio_template_score", 0) >= 0.40)
            not_celebrity = a.get("hub_legitimacy_score", 0) < 0.70
            if aid in flagged_set:
                flagged_accs.append(a)
            elif shared_ip or (templated and not_celebrity):
                unflagged_suspicious.append(a)
            else:
                clean_accs.append(a)

        if unflagged_suspicious:
            lines.append(f"\n!!! ACTION NEEDED — FLAG THESE ({len(unflagged_suspicious)} accounts with strong fake signals):")
            for a in unflagged_suspicious:
                aid = a.get("account_id", "?")
                lines.append(f" → FLAG {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} ip_shared={a.get('shared_ip_count',0)} hub={a.get('hub_legitimacy_score',0):.2f}")

        if flagged_accs:
            lines.append(f"\nALREADY FLAGGED ({len(flagged_accs)}):")
            for a in flagged_accs[:5]:
                lines.append(f" ✓ {a.get('account_id','?')}")

        if clean_accs:
            lines.append(f"\nCLEAN ACCOUNTS ({len(clean_accs)}):")
            for a in clean_accs[:5]:
                aid = a.get("account_id", "?")
                hub = a.get("hub_legitimacy_score", 0)
                hub_mark = " [CELEBRITY]" if hub > 0.70 else ""
                lines.append(f" {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} hub={hub:.2f}{hub_mark}")

    visible = obs.get("visible_account_ids", [])
    uninspected = [i for i in visible if i not in inspected_set]
    if uninspected:
        lines.append(f"\nUninspected IDs ({len(uninspected)}): {', '.join(uninspected[:8])}{'...' if len(uninspected) > 8 else ''}")

    lines.append(f"\nMessage: {obs.get('message', '')}")
    return "\n".join(lines)
261
+
262
+
263
def parse_action(llm_text: str, obs: dict) -> dict:
    """Turn raw LLM output into an action dict for the /step endpoint.

    The first line holding a recognised command wins.  Commands are
    ``INSPECT``, ``FLAG``, ``INVESTIGATE_NETWORK`` and ``UNFLAG`` (each
    followed by an account id) plus a bare ``SUBMIT``.  Matching is
    case-insensitive and tolerates markdown wrapping (bullets, bold,
    backticks).  Only the first token after the verb is used as the account
    id, so trailing explanations such as ``FLAG acc_0001 because ...`` no
    longer leak into the id.

    Args:
        llm_text: Raw text returned by the model (may be empty).
        obs: Current observation; used only for the fallback policy.

    Returns:
        ``{"action_type": ..., "account_id": ...}`` for account commands or
        ``{"action_type": "submit"}``.  When nothing parses, falls back to
        inspecting the first uninspected suspect, then the first uninspected
        visible account, and finally to submitting.
    """
    account_verbs = ("INSPECT", "FLAG", "INVESTIGATE_NETWORK", "UNFLAG")
    wrapper_chars = "*`'\"-•> \t"      # markdown decoration around a command
    id_trailers = "*`'\".,;:!()[]"     # punctuation glued onto an account id

    for raw_line in llm_text.split("\n"):
        line = raw_line.strip().strip(wrapper_chars)
        tokens = line.split()
        if not tokens:
            continue
        verb = tokens[0].upper()
        if verb in account_verbs and len(tokens) > 1:
            account_id = tokens[1].strip(id_trailers).lower()
            if account_id:
                return {"action_type": verb.lower(), "account_id": account_id}
        # SUBMIT must stand alone so prose like "submit when done" is ignored.
        if line.upper().rstrip(".!") == "SUBMIT":
            return {"action_type": "submit"}

    # Fallback: inspect first uninspected suspect, then any uninspected account.
    inspected = set(obs.get("inspected_ids") or [])
    for key in ("suspect_ids", "visible_account_ids"):
        for candidate in obs.get(key) or []:
            if candidate not in inspected:
                return {"action_type": "inspect", "account_id": candidate}
    return {"action_type": "submit"}
285
+
286
+
287
+ # ---------------------------------------------------------------------------
288
+ # Test phases
289
+ # ---------------------------------------------------------------------------
290
+
291
def test_endpoints(base_url: str) -> bool:
    """Phase 0: hit every required endpoint once and report which ones fail.

    Returns True only when every request completes without raising.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("PHASE 0: Endpoint Verification")
    print(banner)

    # (HTTP method, path, JSON body for POST, whether the reply is JSON)
    probes = (
        ("GET", "/health", None, True),
        ("GET", "/tasks", None, True),
        ("GET", "/metadata", None, True),
        ("GET", "/schema", None, True),
        ("GET", "/web", None, False),  # returns HTML, not JSON
        ("POST", "/reset", {"task": "easy", "seed": 0}, True),
        ("GET", "/state", None, True),
        ("POST", "/step", {"action_type": "inspect", "account_id": "acc_0000"}, True),
        ("POST", "/step", {"action_type": "submit"}, True),
        ("GET", "/grader", None, True),
        ("POST", "/mcp", {"jsonrpc": "2.0", "method": "tools/list", "id": 1}, True),
        ("POST", "/baseline", None, True),
    )

    failures = 0
    for verb, route, payload, wants_json in probes:
        target = f"{base_url}{route}"
        try:
            if verb == "GET":
                http_get(target, expect_json=wants_json)
            else:
                http_post(target, payload)
        except Exception as err:
            print(f" ✗ {verb} {route} — {err}")
            failures += 1
        else:
            print(f" ✓ {verb} {route}")

    return failures == 0
325
+
326
+
327
def test_baseline_stability(base_url: str) -> bool:
    """Phase 1: call POST /baseline three times and compare the score dicts.

    Returns True when all three responses carry identical ``scores``.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("PHASE 1: Baseline Stability (3 runs)")
    print(banner)

    collected = []
    for run_no in range(1, 4):
        scores = http_post(f"{base_url}/baseline")["scores"]
        collected.append(scores)
        print(f" Run {run_no}: easy={scores['easy']:.4f} medium={scores['medium']:.4f} hard={scores['hard']:.4f}")

    # Every run must match the first one exactly.
    reference = collected[0]
    stable = not any(run != reference for run in collected)
    if not stable:
        print(" ✗ SCORES DIFFER — baseline is non-deterministic!")
    else:
        print(" ✓ All 3 runs identical — baseline is deterministic")
    return stable
347
+
348
+
349
def test_llm_agent(base_url: str, task: str, seed: int = 0, max_steps: int = 200) -> float:
    """Phase 2: run the LLM agent on one task and return the grader score.

    Resets the environment, then loops: format the observation, ask the LLM,
    parse its reply into an action and POST it to /step until the server
    reports ``done``.  Finally reads the score from GET /grader.

    Args:
        base_url: Environment server URL without a trailing slash.
        task: Task name sent to /reset.
        seed: Seed sent to /reset.
        max_steps: Safety cap on agent actions.  The system prompt describes
            FLAG as costing no steps, so a model that keeps flagging could
            otherwise keep the episode open forever; once the cap is hit a
            single SUBMIT is forced and the loop ends.

    Returns:
        The ``score`` field of the /grader response.
    """
    _model = f"Bedrock/{BEDROCK_MODEL_ID}" if LLM_BACKEND == "bedrock" else MODEL_NAME
    print(f"\n --- LLM Agent: task={task}, seed={seed}, model={_model} ---")

    # Reset
    reset_resp = http_post(f"{base_url}/reset", {"task": task, "seed": seed})
    obs = reset_resp.get("observation", reset_resp)
    done = reset_resp.get("done", False)

    step_num = 0
    while not done:
        step_num += 1
        forced = step_num > max_steps
        if forced:
            print(f" [GUARD] Episode still open after {max_steps} actions — forcing SUBMIT")
            action = {"action_type": "submit"}
        else:
            prompt = format_obs(obs)
            llm_text = call_llm(prompt)
            action = parse_action(llm_text, obs)

        # `or ''` keeps a null account id from being printed as "None".
        action_str = f"{action['action_type'].upper()} {action.get('account_id') or ''}".strip()

        step_resp = http_post(f"{base_url}/step", action)
        obs = step_resp.get("observation", step_resp)
        done = step_resp.get("done", False)
        reward = step_resp.get("reward")

        flagged_n = len(obs.get("flagged_ids", []))
        suspects_n = len(obs.get("suspect_ids", []))
        steps_left = obs.get("steps_remaining", "?")

        print(f" Step {step_num:2d}: {action_str:35s} flagged={flagged_n}/10 suspects={suspects_n} steps_left={steps_left}")

        if done and reward is not None:
            msg = step_resp.get("message", obs.get("message", "")) or ""
            print(f" → Episode ended: {str(msg)[:100]}")

        if forced:
            # Never loop again after the forced SUBMIT, whatever the server says.
            break

    # Get grader score
    grader = http_get(f"{base_url}/grader")
    score = grader["score"]
    print(f" ★ GRADER SCORE: {score:.4f}")
    return score
388
+
389
+
390
def test_llm_all_tasks(base_url: str) -> Dict[str, float]:
    """Phase 2: run the LLM agent once (seed 0) on easy, medium and hard.

    Returns a mapping from task name to grader score.
    """
    banner = "=" * 60
    if LLM_BACKEND == "bedrock":
        _model = f"Bedrock/{BEDROCK_MODEL_ID}"
    else:
        _model = MODEL_NAME
    print("\n" + banner)
    print(f"PHASE 2: LLM Agent Evaluation (model={_model})")
    print(banner)

    # Dict comprehensions evaluate in order, so tasks still run easy → hard.
    scores = {
        task: test_llm_agent(base_url, task=task, seed=0)
        for task in ("easy", "medium", "hard")
    }

    print(f"\n Summary: easy={scores['easy']:.4f} medium={scores['medium']:.4f} hard={scores['hard']:.4f}")
    return scores
403
+
404
+
405
def test_variance(base_url: str, seeds: Optional[List[int]] = None) -> None:
    """Phase 3: run every task over several seeds and print mean and variance.

    Args:
        base_url: Environment server URL without a trailing slash.
        seeds: Seeds to evaluate per task.  ``None`` means ``[0, 1, 2, 3, 4]``
            (a ``None`` sentinel replaces the former mutable list default).
            An empty list skips the phase instead of dividing by zero.
    """
    if seeds is None:
        seeds = [0, 1, 2, 3, 4]

    print("\n" + "="*60)
    print(f"PHASE 3: Score Variance (seeds={seeds})")
    print("="*60)

    if not seeds:
        print(" (no seeds given — skipping variance check)")
        return

    for task in ["easy", "medium", "hard"]:
        task_scores = [test_llm_agent(base_url, task=task, seed=seed) for seed in seeds]

        # Population variance (divide by N), as before.
        mean = sum(task_scores) / len(task_scores)
        variance = sum((s - mean) ** 2 for s in task_scores) / len(task_scores)
        print(f"\n {task}: scores={[f'{s:.3f}' for s in task_scores]} mean={mean:.4f} var={variance:.6f}")
420
+
421
+
422
+ # ---------------------------------------------------------------------------
423
+ # Main
424
+ # ---------------------------------------------------------------------------
425
+
426
if __name__ == "__main__":
    # Script entry point: parse CLI flags, then run phases 0-3 in order.
    import argparse

    parser = argparse.ArgumentParser(description="Judge Evaluation Simulator for GraphStrike")
    parser.add_argument("--url", required=True, help="Environment server URL")
    parser.add_argument("--bedrock", action="store_true", help="Use AWS Bedrock instead of HF router")
    parser.add_argument("--endpoints-only", action="store_true", help="Only test endpoints (no LLM)")
    parser.add_argument("--skip-variance", action="store_true", help="Skip variance check (faster)")
    parser.add_argument("--seeds", type=int, default=3, help="Number of seeds for variance check")
    args = parser.parse_args()

    if args.bedrock:
        LLM_BACKEND = "bedrock"

    base = args.url.rstrip("/")
    use_bedrock = LLM_BACKEND == "bedrock"
    model_display = f"Bedrock/{BEDROCK_MODEL_ID}" if use_bedrock else MODEL_NAME
    # Report the credential the selected backend will actually use.
    have_creds = bool(os.getenv("AWS_ACCESS_KEY_ID") if use_bedrock else HF_TOKEN)
    print("GraphStrike Judge Evaluation Simulator")
    print(f"Target: {base}")
    print(f"Backend: {LLM_BACKEND}")
    print(f"Model: {model_display}")
    print(f"Token: {'set' if have_creds else 'NOT SET'}")

    # Phase 0: Endpoints
    if not test_endpoints(base):
        print("\n✗ Endpoint check failed. Fix before proceeding.")
        sys.exit(1)

    # Phase 1: Baseline stability (result feeds the exit status below)
    baseline_ok = test_baseline_stability(base)

    if args.endpoints_only:
        print("\n✓ Endpoint-only mode — skipping LLM tests.")
        sys.exit(0 if baseline_ok else 1)

    if use_bedrock:
        if not os.getenv("AWS_ACCESS_KEY_ID"):
            print("\n✗ AWS_ACCESS_KEY_ID not set. Cannot run Bedrock LLM tests.")
            sys.exit(1)
    elif not HF_TOKEN:
        print("\n✗ HF_TOKEN not set. Cannot run LLM agent tests.")
        print(" export HF_TOKEN='hf_...' OR use --bedrock with AWS creds")
        sys.exit(1)

    # Phase 2: LLM on all tasks
    scores = test_llm_all_tasks(base)

    # Phase 3: Variance (a non-positive --seeds would give an empty seed list)
    if not args.skip_variance and args.seeds > 0:
        test_variance(base, seeds=list(range(args.seeds)))

    print("\n" + "="*60)
    print("EVALUATION COMPLETE")
    print("="*60)

    if not baseline_ok:
        # Surface the Phase 1 failure to callers and CI through the exit code.
        print("\n✗ Baseline scores were not stable (see PHASE 1).")
        sys.exit(1)
eval-models/llama_test_judge_eval.py ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Judge Evaluation Simulator
4
+ ==========================
5
+
6
+ Simulates EXACTLY how hackathon judges will evaluate your environment:
7
+
8
+ 1. Baseline re-run: POST /baseline → verify scores are stable
9
+ 2. Standard Open LLM agent: Run an LLM (via HF router) against all 3 tasks
10
+ 3. Score variance check: Run same task multiple seeds, check variance
11
+
12
+ USAGE:
13
+ # Against live HF Space (requires HF_TOKEN):
14
+ export HF_TOKEN="hf_..."
15
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space
16
+
17
+ # Against local server:
18
+ export HF_TOKEN="hf_..."
19
+ python test_judge_eval.py --url http://localhost:7860
20
+
21
+ # Choose model (default: meta.llama4-scout-17b-instruct-v1:0):
22
+ export MODEL_NAME="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
23
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space
24
+
25
+ # Just test endpoints (no LLM needed):
26
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space --endpoints-only
27
+ """
28
+
29
+ import json
30
+ import os
31
+ import sys
32
+ import time
33
+ import urllib.request
34
+ from typing import Dict, List, Optional
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Config from env vars (same as judges will set)
38
+ # ---------------------------------------------------------------------------
39
+
40
+ HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
41
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
42
+ MODEL_NAME = os.getenv("MODEL_NAME", "meta.llama4-scout-17b-instruct-v1:0")
43
+
44
+ # AWS Bedrock config (use --bedrock flag)
45
+ LLM_BACKEND = "hf" # "hf" or "bedrock"
46
+ BEDROCK_MODEL_ID = os.getenv("BEDROCK_MODEL_ID", "meta.llama4-scout-17b-instruct-v1:0")
47
+
48
+ SYSTEM_PROMPT = """You are an AI detective finding 10 coordinated fake accounts in a social network.
49
+
50
+ ACTIONS (reply with exactly ONE line):
51
+ - INSPECT acc_XXXX — reveal profile (costs 1 step)
52
+ - FLAG acc_XXXX — mark as fake (FREE, no step cost, triggers suspect cascade)
53
+ - SUBMIT — end episode, get scored
54
+
55
+ DECISION RULES (apply top-to-bottom, first match wins):
56
+ 1. If suspect_ids lists accounts you haven't inspected → INSPECT the first one
57
+ 2. If ANY profiled account has shared_ip_count >= 5 and is NOT flagged → FLAG it immediately
58
+ 3. If ANY profiled account has photo_reuse >= 0.50 AND bio_template >= 0.40 and hub < 0.70 and NOT flagged → FLAG it
59
+ 4. If ANY profiled account has fake_risk_score >= 0.30 and hub < 0.70 and NOT flagged → FLAG it
60
+ 5. If there are uninspected visible accounts and steps > 3 → INSPECT the next one
61
+ 6. If you have 10 flags OR steps <= 3 → SUBMIT
62
+
63
+ IMPORTANT:
64
+ - FLAG is FREE (costs 0 steps) — flag aggressively when you see suspicious signals
65
+ - After each FLAG, new suspects appear — always inspect suspects before other accounts
66
+ - hub_legitimacy_score > 0.70 means celebrity — do NOT flag
67
+ - shared_ip_count >= 5 is the strongest gang signal (all 10 share one IP)
68
+ - Do NOT re-inspect already inspected accounts
69
+
70
+ Reply with EXACTLY one line, nothing else:
71
+ FLAG acc_XXXX
72
+ INSPECT acc_XXXX
73
+ SUBMIT"""
74
+
75
+
76
+ # ---------------------------------------------------------------------------
77
+ # HTTP helpers
78
+ # ---------------------------------------------------------------------------
79
+
80
def _retry(fn, retries=3, backoff=3):
    """Call ``fn()`` and retry it on transient network errors.

    Args:
        fn: Zero-argument callable performing the network operation.
        retries: Maximum number of attempts.
        backoff: Base delay in seconds; after failed attempt ``i`` (1-based)
            the helper sleeps ``backoff * i`` seconds.

    Returns:
        The value returned by the first successful ``fn()`` call
        (``None`` when ``retries`` is smaller than 1, since nothing is called).

    Raises:
        OSError: The last error once all attempts are used up, or immediately
            for an HTTP 4xx response other than 408/429.
    """
    from urllib.error import HTTPError

    for attempt in range(retries):
        try:
            return fn()
        except OSError as e:
            # HTTPError is an OSError subclass. A client error such as 404 or 422
            # gives the same answer on every attempt, so sleeping and retrying
            # only delays the failure report. 408 (timeout) and 429 (rate limit)
            # are the client-error codes worth another try.
            permanent = (
                isinstance(e, HTTPError)
                and 400 <= e.code < 500
                and e.code not in (408, 429)
            )
            if permanent or attempt == retries - 1:
                raise
            wait = backoff * (attempt + 1)
            print(f" [RETRY] Network error: {e} — retrying in {wait}s ({attempt+1}/{retries})")
            time.sleep(wait)
91
+
92
+
93
def http_post(url: str, body: Optional[dict] = None) -> dict:
    """POST ``body`` as JSON to ``url`` and return the decoded JSON response.

    A missing or empty ``body`` is sent as ``{}``. The request runs through
    ``_retry`` with a 120-second timeout per attempt.
    """
    payload = json.dumps(body or {}).encode()

    def _send():
        request = urllib.request.Request(
            url,
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=120) as response:
            return json.loads(response.read())

    return _retry(_send)
104
+
105
+
106
def http_get(url: str, expect_json: bool = True) -> dict:
    """GET ``url`` through ``_retry`` (120-second timeout per attempt).

    Returns the decoded JSON body when ``expect_json`` is true; otherwise a
    small summary dict with the HTTP status (``_status``) and the body length
    in bytes (``_body_len``), for endpoints that serve non-JSON content.
    """
    def _fetch():
        with urllib.request.urlopen(url, timeout=120) as response:
            raw = response.read()
            if expect_json:
                return json.loads(raw)
            return {"_status": response.status, "_body_len": len(raw)}

    return _retry(_fetch)
114
+
115
+
116
+ # ---------------------------------------------------------------------------
117
+ # LLM call via OpenAI-compatible API
118
+ # ---------------------------------------------------------------------------
119
+
120
def _call_hf(prompt: str) -> str:
    """Send ``prompt`` to the OpenAI-compatible endpoint and return the reply text.

    Uses API_BASE_URL / HF_TOKEN / MODEL_NAME from module configuration and
    SYSTEM_PROMPT as the system message. A missing reply body yields "".
    """
    # Imported lazily so the endpoint-only checks work without the openai package.
    from openai import OpenAI

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    api = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    completion = api.chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        temperature=0.3,
        max_tokens=256,
    )
    reply = completion.choices[0].message.content
    return (reply or "").strip()
134
+
135
+
136
def _first_text_block(blocks) -> str:
    """Return the stripped text of the first content block that carries text, else ""."""
    for block in blocks or []:
        if isinstance(block, dict) and block.get("text"):
            return str(block["text"]).strip()
    return ""


def _call_bedrock(prompt: str) -> str:
    """Call LLM via AWS Bedrock. Tries converse() first, falls back to invoke_model().

    Args:
        prompt: User message; SYSTEM_PROMPT is sent as the system message.

    Returns:
        The reply text, stripped. An empty content list or a content list
        without any text block yields "" instead of raising.
    """
    # Imported lazily so the HF backend works without boto3 installed.
    import boto3
    client = boto3.client(
        service_name="bedrock-runtime",
        region_name=os.getenv("AWS_DEFAULT_REGION", "us-east-1"),
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    )
    # Try converse API first (boto3 >= 1.34.x)
    if hasattr(client, "converse"):
        resp = client.converse(
            modelId=BEDROCK_MODEL_ID,
            messages=[{"role": "user", "content": [{"text": prompt}]}],
            system=[{"text": SYSTEM_PROMPT}],
            inferenceConfig={"maxTokens": 256, "temperature": 0.3},
        )
        # Scan for the first text block rather than indexing [0]["text"], which
        # raised on an empty list or when the first block had no "text" key.
        return _first_text_block(resp["output"]["message"]["content"])
    # Fallback: invoke_model (works with all boto3 versions)
    body = json.dumps({
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 256,
        "temperature": 0.3,
    })
    resp = client.invoke_model(
        modelId=BEDROCK_MODEL_ID,
        contentType="application/json",
        accept="application/json",
        body=body,
    )
    result = json.loads(resp["body"].read())
    # Handle both OpenAI-style and Bedrock-native response formats
    if "choices" in result:
        # content may be null in OpenAI-style replies; treat that as empty text.
        return (result["choices"][0]["message"]["content"] or "").strip()
    if "content" in result:
        content = result["content"]
        if isinstance(content, list):
            return _first_text_block(content)
        return str(content).strip()
    if "output" in result:
        output = result["output"]
        if isinstance(output, dict):
            return str(output.get("text", "")).strip()
        return str(output).strip()
    return str(result).strip()
181
+
182
+
183
def call_llm(prompt: str) -> str:
    """Query the configured backend, retrying up to three times.

    The backend is ``_call_bedrock`` when LLM_BACKEND is "bedrock", otherwise
    ``_call_hf``. ``<think>...</think>`` blocks are removed from the reply; if
    that leaves nothing, the raw reply is returned instead. When every attempt
    raises, the error is printed and "" is returned.
    """
    import re

    backend = _call_bedrock if LLM_BACKEND == "bedrock" else _call_hf
    last_attempt = 2
    for attempt in range(last_attempt + 1):
        try:
            raw = backend(prompt)
            if os.getenv("DEBUG_LLM"):
                print(f" [LLM RAW] {raw[:200]}")
            # Strip Qwen3 <think>...</think> reasoning blocks
            without_reasoning = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
            return without_reasoning or raw
        except Exception as e:
            if attempt == last_attempt:
                print(f" [LLM ERROR] {e} (gave up after 3 attempts)")
                return ""
            # Linear backoff: 3s after the first failure, 6s after the second.
            wait = 3 * (attempt + 1)
            print(f" [LLM RETRY] {e} — retrying in {wait}s")
            time.sleep(wait)
    return ""
203
+
204
+
205
def format_obs(obs: dict) -> str:
    """Format observation as text for LLM — shows raw signals prominently.

    Args:
        obs: Observation dict from the environment. Keys read here:
            ``task``, ``steps_remaining``, ``flagged_ids``, ``suspect_ids``,
            ``inspected_ids``, ``visible_accounts``, ``visible_account_ids``,
            ``message``. Missing or null lists are treated as empty.

    Returns:
        A multi-line prompt. Profiled accounts are sorted by ``fake_risk_score``
        (highest first) and split into "FLAG THESE", "ALREADY FLAGGED" and
        "CLEAN ACCOUNTS" sections; the last two show at most five entries each.
    """
    # Same cut-off the system prompt uses: the photo/bio rule only applies to
    # accounts with hub < 0.70, and hub > 0.70 is labelled a celebrity.
    celebrity_hub = 0.70

    flagged = obs.get("flagged_ids") or []
    suspects = obs.get("suspect_ids") or []
    inspected = obs.get("inspected_ids") or []
    flagged_set = set(flagged)
    inspected_set = set(inspected)

    lines = [
        f"TASK: {obs.get('task','?').upper()} | Steps remaining: {obs.get('steps_remaining','?')}",
        f"Flagged ({len(flagged)}/10): {', '.join(flagged) if flagged else 'none'}",
    ]

    uninspected_suspects = [s for s in suspects if s not in inspected_set]
    if uninspected_suspects:
        lines.append(f"*** SUSPECTS (uninspected) → INSPECT THESE FIRST: {', '.join(uninspected_suspects)} ***")

    accounts = obs.get("visible_accounts") or []
    if accounts:
        # Split: unflagged accounts that should be flagged vs rest
        unflagged_suspicious = []
        flagged_accs = []
        clean_accs = []
        for a in sorted(accounts, key=lambda x: x.get("fake_risk_score", 0), reverse=True):
            if a.get("account_id", "?") in flagged_set:
                flagged_accs.append(a)
                continue
            shares_gang_ip = a.get("shared_ip_count", 0) >= 5
            # The hub check keeps high-legitimacy accounts out of the
            # "FLAG THESE" section, matching rule 3 of the system prompt.
            templated_profile = (
                a.get("photo_reuse_score", 0) >= 0.50
                and a.get("bio_template_score", 0) >= 0.40
                and a.get("hub_legitimacy_score", 0) < celebrity_hub
            )
            if shares_gang_ip or templated_profile:
                unflagged_suspicious.append(a)
            else:
                clean_accs.append(a)

        if unflagged_suspicious:
            lines.append(f"\n!!! ACTION NEEDED — FLAG THESE ({len(unflagged_suspicious)} accounts with strong fake signals):")
            for a in unflagged_suspicious:
                aid = a.get("account_id", "?")
                lines.append(f" → FLAG {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} ip_shared={a.get('shared_ip_count',0)} hub={a.get('hub_legitimacy_score',0):.2f}")

        if flagged_accs:
            lines.append(f"\nALREADY FLAGGED ({len(flagged_accs)}):")
            for a in flagged_accs[:5]:
                lines.append(f" ✓ {a.get('account_id','?')}")

        if clean_accs:
            lines.append(f"\nCLEAN ACCOUNTS ({len(clean_accs)}):")
            for a in clean_accs[:5]:
                aid = a.get("account_id", "?")
                hub = a.get("hub_legitimacy_score", 0)
                hub_mark = " [CELEBRITY]" if hub > celebrity_hub else ""
                lines.append(f" {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} hub={hub:.2f}{hub_mark}")

    visible = obs.get("visible_account_ids") or []
    uninspected = [i for i in visible if i not in inspected_set]
    if uninspected:
        lines.append(f"\nUninspected IDs ({len(uninspected)}): {', '.join(uninspected[:8])}{'...' if len(uninspected) > 8 else ''}")

    lines.append(f"\nMessage: {obs.get('message', '')}")
    return "\n".join(lines)
261
+
262
+
263
def parse_action(llm_text: str, obs: dict) -> dict:
    """Parse LLM output to action dict.

    The first line that starts with INSPECT / FLAG / INVESTIGATE_NETWORK /
    UNFLAG (case-insensitive) or equals SUBMIT decides the action. For
    account actions the id is the first ``acc_<digits>`` token on the line,
    so trailing explanations or punctuation ("FLAG acc_0007 (shared IP).")
    do not leak into ``account_id``; when no such token exists, the first
    word after the verb is used with surrounding punctuation removed.

    Args:
        llm_text: Raw model reply (may be empty).
        obs: Current observation, used only for the fallback choice.

    Returns:
        ``{"action_type": ..., "account_id": ...}`` for account actions or
        ``{"action_type": "submit"}``. If no line parses, the fallback inspects
        the first uninspected suspect, then the first uninspected visible
        account, and submits when neither exists.
    """
    import re

    account_verbs = ("INSPECT ", "FLAG ", "INVESTIGATE_NETWORK ", "UNFLAG ")
    for line in (llm_text or "").split("\n"):
        line = line.strip()
        upper = line.upper()
        if upper.startswith(account_verbs):
            parts = line.split(maxsplit=1)
            account_id = None
            if len(parts) > 1:
                rest = parts[1].lower()
                found = re.search(r"acc_\d+", rest)
                if found:
                    account_id = found.group(0)
                else:
                    # No canonical id on the line: keep the first word only.
                    account_id = rest.split()[0].strip(".,;:!?\"'`()[]*") or None
            return {"action_type": parts[0].lower(), "account_id": account_id}
        # Accept "SUBMIT." / "SUBMIT!" as well as the bare keyword.
        if upper.rstrip(".!") == "SUBMIT":
            return {"action_type": "submit"}

    # Fallback: inspect first uninspected suspect
    suspects = obs.get("suspect_ids", [])
    inspected = obs.get("inspected_ids", [])
    for s in suspects:
        if s not in inspected:
            return {"action_type": "inspect", "account_id": s}
    visible = obs.get("visible_account_ids", [])
    for v in visible:
        if v not in inspected:
            return {"action_type": "inspect", "account_id": v}
    return {"action_type": "submit"}
285
+
286
+
287
+ # ---------------------------------------------------------------------------
288
+ # Test phases
289
+ # ---------------------------------------------------------------------------
290
+
291
def test_endpoints(base_url: str) -> bool:
    """Phase 0: probe every required endpoint once and report the outcome.

    Each probe is (method, path, POST body, expect_json). A probe passes when
    the request completes and, if JSON is expected, the body decodes. Returns
    True only when every probe passes. The probes run in order because the
    /state, /step and /grader calls follow the /reset issued just before them.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("PHASE 0: Endpoint Verification")
    print(banner)

    probes = (
        ("GET", "/health", None, True),
        ("GET", "/tasks", None, True),
        ("GET", "/metadata", None, True),
        ("GET", "/schema", None, True),
        ("GET", "/web", None, False),  # returns HTML, not JSON
        ("POST", "/reset", {"task": "easy", "seed": 0}, True),
        ("GET", "/state", None, True),
        ("POST", "/step", {"action_type": "inspect", "account_id": "acc_0000"}, True),
        ("POST", "/step", {"action_type": "submit"}, True),
        ("GET", "/grader", None, True),
        ("POST", "/mcp", {"jsonrpc": "2.0", "method": "tools/list", "id": 1}, True),
        ("POST", "/baseline", None, True),
    )

    failures = 0
    for method, path, body, expect_json in probes:
        target = f"{base_url}{path}"
        try:
            if method != "GET":
                http_post(target, body)
            else:
                http_get(target, expect_json=expect_json)
            print(f" ✓ {method} {path}")
        except Exception as e:
            print(f" ✗ {method} {path} — {e}")
            failures += 1

    return failures == 0
325
+
326
+
327
def test_baseline_stability(base_url: str) -> bool:
    """Phase 1: call POST /baseline three times and check the scores never change.

    Returns True when the ``scores`` dict of every run equals that of the first run.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("PHASE 1: Baseline Stability (3 runs)")
    print(banner)

    runs = []
    for run_no in (1, 2, 3):
        scores = http_post(f"{base_url}/baseline")["scores"]
        runs.append(scores)
        print(f" Run {run_no}: easy={scores['easy']:.4f} medium={scores['medium']:.4f} hard={scores['hard']:.4f}")

    # Check all identical
    reference = runs[0]
    stable = all(scores == reference for scores in runs)
    verdict = (
        " ✓ All 3 runs identical — baseline is deterministic"
        if stable
        else " ✗ SCORES DIFFER — baseline is non-deterministic!"
    )
    print(verdict)
    return stable
347
+
348
+
349
def test_llm_agent(base_url: str, task: str, seed: int = 0, max_actions: int = 200) -> float:
    """Phase 2: Run an LLM agent against one task and return the grader score.

    Args:
        base_url: Environment server URL without a trailing slash.
        task: Task name passed to POST /reset.
        seed: Seed passed to POST /reset.
        max_actions: Safety cap on LLM-chosen actions. FLAG costs no steps, so
            a model that keeps repeating it would otherwise never reach a
            terminal state; once the cap is passed a SUBMIT is sent instead.

    Returns:
        The ``score`` field of GET /grader after the episode.
    """
    _model = f"Bedrock/{BEDROCK_MODEL_ID}" if LLM_BACKEND == "bedrock" else MODEL_NAME
    print(f"\n --- LLM Agent: task={task}, seed={seed}, model={_model} ---")

    # Reset
    reset_resp = http_post(f"{base_url}/reset", {"task": task, "seed": seed})
    # Some servers wrap the observation, others return it at the top level.
    obs = reset_resp.get("observation", reset_resp)
    done = reset_resp.get("done", False)

    step_num = 0
    while not done:
        step_num += 1
        forced = step_num > max_actions
        if forced:
            print(f" [WARN] Episode not finished after {max_actions} actions — forcing SUBMIT")
            action = {"action_type": "submit"}
        else:
            prompt = format_obs(obs)
            llm_text = call_llm(prompt)
            action = parse_action(llm_text, obs)

        action_str = f"{action['action_type'].upper()} {action.get('account_id') or ''}".strip()

        step_resp = http_post(f"{base_url}/step", action)
        obs = step_resp.get("observation", step_resp)
        done = step_resp.get("done", False)
        reward = step_resp.get("reward")

        flagged_n = len(obs.get("flagged_ids", []))
        suspects_n = len(obs.get("suspect_ids", []))
        steps_left = obs.get("steps_remaining", "?")

        print(f" Step {step_num:2d}: {action_str:35s} flagged={flagged_n}/10 suspects={suspects_n} steps_left={steps_left}")

        if done and reward is not None:
            msg = step_resp.get("message", obs.get("message", ""))
            # message may be null in the response; never slice None.
            print(f" → Episode ended: {str(msg or '')[:100]}")

        if forced and not done:
            # The forced SUBMIT did not end the episode; stop instead of spinning.
            print(" [WARN] Server did not end the episode after SUBMIT — stopping")
            break

    # Get grader score
    grader = http_get(f"{base_url}/grader")
    score = grader["score"]
    print(f" ★ GRADER SCORE: {score:.4f}")
    return score
388
+
389
+
390
def test_llm_all_tasks(base_url: str) -> Dict[str, float]:
    """Phase 2: run the LLM agent once (seed 0) on easy, medium and hard.

    Returns a dict mapping each task name to its grader score.
    """
    banner = "=" * 60
    print("\n" + banner)
    label = MODEL_NAME if LLM_BACKEND != "bedrock" else f"Bedrock/{BEDROCK_MODEL_ID}"
    print(f"PHASE 2: LLM Agent Evaluation (model={label})")
    print(banner)

    results = {
        task: test_llm_agent(base_url, task=task, seed=0)
        for task in ("easy", "medium", "hard")
    }

    print(f"\n Summary: easy={results['easy']:.4f} medium={results['medium']:.4f} hard={results['hard']:.4f}")
    return results
403
+
404
+
405
def test_variance(base_url: str, seeds: Optional[List[int]] = None) -> None:
    """Phase 3: Score variance check (multiple seeds per task).

    Args:
        base_url: Environment server URL without a trailing slash.
        seeds: Seeds to run for every task; defaults to ``[0, 1, 2, 3, 4]``.
            An empty list skips the phase instead of dividing by zero.

    Prints, per task, the individual scores, their mean and their population
    variance (divided by the number of seeds).
    """
    # A None default avoids sharing one mutable list object between calls.
    if seeds is None:
        seeds = [0, 1, 2, 3, 4]

    print("\n" + "="*60)
    print(f"PHASE 3: Score Variance (seeds={seeds})")
    print("="*60)

    if not seeds:
        print(" (no seeds given — skipping variance check)")
        return

    for task in ["easy", "medium", "hard"]:
        task_scores = [test_llm_agent(base_url, task=task, seed=seed) for seed in seeds]

        mean = sum(task_scores) / len(task_scores)
        variance = sum((s - mean) ** 2 for s in task_scores) / len(task_scores)
        print(f"\n {task}: scores={[f'{s:.3f}' for s in task_scores]} mean={mean:.4f} var={variance:.6f}")
420
+
421
+
422
+ # ---------------------------------------------------------------------------
423
+ # Main
424
+ # ---------------------------------------------------------------------------
425
+
426
# Script entry point: parse CLI flags, then run phases 0-3 in order.
# Kept as flat module-level code because --bedrock rebinds the module global
# LLM_BACKEND that call_llm and the phase banners read.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Judge Evaluation Simulator for GraphStrike")
    parser.add_argument("--url", required=True, help="Environment server URL")
    parser.add_argument("--bedrock", action="store_true", help="Use AWS Bedrock instead of HF router")
    parser.add_argument("--endpoints-only", action="store_true", help="Only test endpoints (no LLM)")
    parser.add_argument("--skip-variance", action="store_true", help="Skip variance check (faster)")
    parser.add_argument("--seeds", type=int, default=3, help="Number of seeds for variance check")
    args = parser.parse_args()

    if args.bedrock:
        LLM_BACKEND = "bedrock"

    base = args.url.rstrip("/")
    model_display = f"Bedrock/{BEDROCK_MODEL_ID}" if LLM_BACKEND == "bedrock" else MODEL_NAME
    # Report the credential the selected backend will use, not "either one".
    credential = os.getenv("AWS_ACCESS_KEY_ID") if LLM_BACKEND == "bedrock" else HF_TOKEN
    print(f"GraphStrike Judge Evaluation Simulator")
    print(f"Target: {base}")
    print(f"Backend: {LLM_BACKEND}")
    print(f"Model: {model_display}")
    print(f"Token: {'set' if credential else 'NOT SET'}")

    # Phase 0: Endpoints
    if not test_endpoints(base):
        print("\n✗ Endpoint check failed. Fix before proceeding.")
        sys.exit(1)

    # Phase 1: Baseline stability (informational; a failure does not stop the run)
    test_baseline_stability(base)

    if args.endpoints_only:
        print("\n✓ Endpoint-only mode — skipping LLM tests.")
        sys.exit(0)

    if LLM_BACKEND == "bedrock":
        if not os.getenv("AWS_ACCESS_KEY_ID"):
            print("\n✗ AWS_ACCESS_KEY_ID not set. Cannot run Bedrock LLM tests.")
            sys.exit(1)
    elif not HF_TOKEN:
        print("\n✗ HF_TOKEN not set. Cannot run LLM agent tests.")
        print(" export HF_TOKEN='hf_...' OR use --bedrock with AWS creds")
        sys.exit(1)

    # Phase 2: LLM on all tasks
    scores = test_llm_all_tasks(base)

    # Phase 3: Variance
    if not args.skip_variance:
        if args.seeds > 0:
            test_variance(base, seeds=list(range(args.seeds)))
        else:
            # range(0) would hand test_variance an empty list and divide by zero.
            print("\n(--seeds must be at least 1 — skipping variance check)")

    print("\n" + "="*60)
    print("EVALUATION COMPLETE")
    print("="*60)
eval-models/mistral_test_judge_eval.py ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Judge Evaluation Simulator
4
+ ==========================
5
+
6
+ Simulates EXACTLY how hackathon judges will evaluate your environment:
7
+
8
+ 1. Baseline re-run: POST /baseline → verify scores are stable
9
+ 2. Standard Open LLM agent: Run an LLM (via HF router) against all 3 tasks
10
+ 3. Score variance check: Run same task multiple seeds, check variance
11
+
12
+ USAGE:
13
+ # Against live HF Space (requires HF_TOKEN):
14
+ export HF_TOKEN="hf_..."
15
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space
16
+
17
+ # Against local server:
18
+ export HF_TOKEN="hf_..."
19
+ python test_judge_eval.py --url http://localhost:7860
20
+
21
+ # Choose model (default: mistral.ministral-3-8b-instruct):
22
+ export MODEL_NAME="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
23
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space
24
+
25
+ # Just test endpoints (no LLM needed):
26
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space --endpoints-only
27
+ """
28
+
29
+ import json
30
+ import os
31
+ import sys
32
+ import time
33
+ import urllib.request
34
+ from typing import Dict, List, Optional
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Config from env vars (same as judges will set)
38
+ # ---------------------------------------------------------------------------
39
+
40
# Credentials/endpoint for the OpenAI-compatible backend, read once at import time.
# API_KEY is accepted as a fallback name when HF_TOKEN is unset or empty.
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
# NOTE(review): this default is the same string as the BEDROCK_MODEL_ID default,
# yet _call_hf sends it to API_BASE_URL — confirm the router accepts this ID.
MODEL_NAME = os.getenv("MODEL_NAME", "mistral.ministral-3-8b-instruct")

# AWS Bedrock config (use --bedrock flag)
LLM_BACKEND = "hf"  # "hf" or "bedrock"
BEDROCK_MODEL_ID = os.getenv("BEDROCK_MODEL_ID", "mistral.ministral-3-8b-instruct")
47
+
48
+ SYSTEM_PROMPT = """You are an AI detective finding 10 coordinated fake accounts in a social network.
49
+
50
+ ACTIONS (reply with exactly ONE line):
51
+ - INSPECT acc_XXXX — reveal profile (costs 1 step)
52
+ - FLAG acc_XXXX — mark as fake (FREE, no step cost, triggers suspect cascade)
53
+ - SUBMIT — end episode, get scored
54
+
55
+ DECISION RULES (apply top-to-bottom, first match wins):
56
+ 1. If suspect_ids lists accounts you haven't inspected → INSPECT the first one
57
+ 2. If ANY profiled account has shared_ip_count >= 5 and is NOT flagged → FLAG it immediately
58
+ 3. If ANY profiled account has photo_reuse >= 0.50 AND bio_template >= 0.40 and hub < 0.70 and NOT flagged → FLAG it
59
+ 4. If ANY profiled account has fake_risk_score >= 0.30 and hub < 0.70 and NOT flagged → FLAG it
60
+ 5. If there are uninspected visible accounts and steps > 3 → INSPECT the next one
61
+ 6. If you have 10 flags OR steps <= 3 → SUBMIT
62
+
63
+ IMPORTANT:
64
+ - FLAG is FREE (costs 0 steps) — flag aggressively when you see suspicious signals
65
+ - After each FLAG, new suspects appear — always inspect suspects before other accounts
66
+ - hub_legitimacy_score > 0.70 means celebrity — do NOT flag
67
+ - shared_ip_count >= 5 is the strongest gang signal (all 10 share one IP)
68
+ - Do NOT re-inspect already inspected accounts
69
+
70
+ Reply with EXACTLY one line, nothing else:
71
+ FLAG acc_XXXX
72
+ INSPECT acc_XXXX
73
+ SUBMIT"""
74
+
75
+
76
+ # ---------------------------------------------------------------------------
77
+ # HTTP helpers
78
+ # ---------------------------------------------------------------------------
79
+
80
def _retry(fn, retries=3, backoff=3):
    """Call ``fn()`` and retry it on transient network errors.

    Args:
        fn: Zero-argument callable performing the network operation.
        retries: Maximum number of attempts.
        backoff: Base delay in seconds; after failed attempt ``i`` (1-based)
            the helper sleeps ``backoff * i`` seconds.

    Returns:
        The value returned by the first successful ``fn()`` call
        (``None`` when ``retries`` is smaller than 1, since nothing is called).

    Raises:
        OSError: The last error once all attempts are used up, or immediately
            for an HTTP 4xx response other than 408/429.
    """
    from urllib.error import HTTPError

    for attempt in range(retries):
        try:
            return fn()
        except OSError as e:
            # HTTPError is an OSError subclass. A client error such as 404 or 422
            # gives the same answer on every attempt, so sleeping and retrying
            # only delays the failure report. 408 (timeout) and 429 (rate limit)
            # are the client-error codes worth another try.
            permanent = (
                isinstance(e, HTTPError)
                and 400 <= e.code < 500
                and e.code not in (408, 429)
            )
            if permanent or attempt == retries - 1:
                raise
            wait = backoff * (attempt + 1)
            print(f" [RETRY] Network error: {e} — retrying in {wait}s ({attempt+1}/{retries})")
            time.sleep(wait)
91
+
92
+
93
def http_post(url: str, body: Optional[dict] = None) -> dict:
    """POST ``body`` as JSON to ``url`` and return the decoded JSON response.

    A missing or empty ``body`` is sent as ``{}``. The request runs through
    ``_retry`` with a 120-second timeout per attempt.
    """
    payload = json.dumps(body or {}).encode()

    def _send():
        request = urllib.request.Request(
            url,
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=120) as response:
            return json.loads(response.read())

    return _retry(_send)
104
+
105
+
106
def http_get(url: str, expect_json: bool = True) -> dict:
    """GET ``url`` through ``_retry`` (120-second timeout per attempt).

    Returns the decoded JSON body when ``expect_json`` is true; otherwise a
    small summary dict with the HTTP status (``_status``) and the body length
    in bytes (``_body_len``), for endpoints that serve non-JSON content.
    """
    def _fetch():
        with urllib.request.urlopen(url, timeout=120) as response:
            raw = response.read()
            if expect_json:
                return json.loads(raw)
            return {"_status": response.status, "_body_len": len(raw)}

    return _retry(_fetch)
114
+
115
+
116
+ # ---------------------------------------------------------------------------
117
+ # LLM call via OpenAI-compatible API
118
+ # ---------------------------------------------------------------------------
119
+
120
def _call_hf(prompt: str) -> str:
    """Send ``prompt`` to the OpenAI-compatible endpoint and return the reply text.

    Uses API_BASE_URL / HF_TOKEN / MODEL_NAME from module configuration and
    SYSTEM_PROMPT as the system message. A missing reply body yields "".
    """
    # Imported lazily so the endpoint-only checks work without the openai package.
    from openai import OpenAI

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    api = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    completion = api.chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        temperature=0.3,
        max_tokens=256,
    )
    reply = completion.choices[0].message.content
    return (reply or "").strip()
134
+
135
+
136
def _first_text_block(blocks) -> str:
    """Return the stripped text of the first content block that carries text, else ""."""
    for block in blocks or []:
        if isinstance(block, dict) and block.get("text"):
            return str(block["text"]).strip()
    return ""


def _call_bedrock(prompt: str) -> str:
    """Call LLM via AWS Bedrock. Tries converse() first, falls back to invoke_model().

    Args:
        prompt: User message; SYSTEM_PROMPT is sent as the system message.

    Returns:
        The reply text, stripped. An empty content list or a content list
        without any text block yields "" instead of raising.
    """
    # Imported lazily so the HF backend works without boto3 installed.
    import boto3
    client = boto3.client(
        service_name="bedrock-runtime",
        region_name=os.getenv("AWS_DEFAULT_REGION", "us-east-1"),
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    )
    # Try converse API first (boto3 >= 1.34.x)
    if hasattr(client, "converse"):
        resp = client.converse(
            modelId=BEDROCK_MODEL_ID,
            messages=[{"role": "user", "content": [{"text": prompt}]}],
            system=[{"text": SYSTEM_PROMPT}],
            inferenceConfig={"maxTokens": 256, "temperature": 0.3},
        )
        # Scan for the first text block rather than indexing [0]["text"], which
        # raised on an empty list or when the first block had no "text" key.
        return _first_text_block(resp["output"]["message"]["content"])
    # Fallback: invoke_model (works with all boto3 versions)
    body = json.dumps({
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 256,
        "temperature": 0.3,
    })
    resp = client.invoke_model(
        modelId=BEDROCK_MODEL_ID,
        contentType="application/json",
        accept="application/json",
        body=body,
    )
    result = json.loads(resp["body"].read())
    # Handle both OpenAI-style and Bedrock-native response formats
    if "choices" in result:
        # content may be null in OpenAI-style replies; treat that as empty text.
        return (result["choices"][0]["message"]["content"] or "").strip()
    if "content" in result:
        content = result["content"]
        if isinstance(content, list):
            return _first_text_block(content)
        return str(content).strip()
    if "output" in result:
        output = result["output"]
        if isinstance(output, dict):
            return str(output.get("text", "")).strip()
        return str(output).strip()
    return str(result).strip()
181
+
182
+
183
def call_llm(prompt: str) -> str:
    """Query the configured backend, retrying up to three times.

    The backend is ``_call_bedrock`` when LLM_BACKEND is "bedrock", otherwise
    ``_call_hf``. ``<think>...</think>`` blocks are removed from the reply; if
    that leaves nothing, the raw reply is returned instead. When every attempt
    raises, the error is printed and "" is returned.
    """
    import re

    backend = _call_bedrock if LLM_BACKEND == "bedrock" else _call_hf
    last_attempt = 2
    for attempt in range(last_attempt + 1):
        try:
            raw = backend(prompt)
            if os.getenv("DEBUG_LLM"):
                print(f" [LLM RAW] {raw[:200]}")
            # Strip Qwen3 <think>...</think> reasoning blocks
            without_reasoning = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
            return without_reasoning or raw
        except Exception as e:
            if attempt == last_attempt:
                print(f" [LLM ERROR] {e} (gave up after 3 attempts)")
                return ""
            # Linear backoff: 3s after the first failure, 6s after the second.
            wait = 3 * (attempt + 1)
            print(f" [LLM RETRY] {e} — retrying in {wait}s")
            time.sleep(wait)
    return ""
203
+
204
+
205
def format_obs(obs: dict) -> str:
    """Format observation as text for LLM — shows raw signals prominently.

    Args:
        obs: Observation dict from the environment. Keys read here:
            ``task``, ``steps_remaining``, ``flagged_ids``, ``suspect_ids``,
            ``inspected_ids``, ``visible_accounts``, ``visible_account_ids``,
            ``message``. Missing or null lists are treated as empty.

    Returns:
        A multi-line prompt. Profiled accounts are sorted by ``fake_risk_score``
        (highest first) and split into "FLAG THESE", "ALREADY FLAGGED" and
        "CLEAN ACCOUNTS" sections; the last two show at most five entries each.
    """
    # Same cut-off the system prompt uses: the photo/bio rule only applies to
    # accounts with hub < 0.70, and hub > 0.70 is labelled a celebrity.
    celebrity_hub = 0.70

    flagged = obs.get("flagged_ids") or []
    suspects = obs.get("suspect_ids") or []
    inspected = obs.get("inspected_ids") or []
    flagged_set = set(flagged)
    inspected_set = set(inspected)

    lines = [
        f"TASK: {obs.get('task','?').upper()} | Steps remaining: {obs.get('steps_remaining','?')}",
        f"Flagged ({len(flagged)}/10): {', '.join(flagged) if flagged else 'none'}",
    ]

    uninspected_suspects = [s for s in suspects if s not in inspected_set]
    if uninspected_suspects:
        lines.append(f"*** SUSPECTS (uninspected) → INSPECT THESE FIRST: {', '.join(uninspected_suspects)} ***")

    accounts = obs.get("visible_accounts") or []
    if accounts:
        # Split: unflagged accounts that should be flagged vs rest
        unflagged_suspicious = []
        flagged_accs = []
        clean_accs = []
        for a in sorted(accounts, key=lambda x: x.get("fake_risk_score", 0), reverse=True):
            if a.get("account_id", "?") in flagged_set:
                flagged_accs.append(a)
                continue
            shares_gang_ip = a.get("shared_ip_count", 0) >= 5
            # The hub check keeps high-legitimacy accounts out of the
            # "FLAG THESE" section, matching rule 3 of the system prompt.
            templated_profile = (
                a.get("photo_reuse_score", 0) >= 0.50
                and a.get("bio_template_score", 0) >= 0.40
                and a.get("hub_legitimacy_score", 0) < celebrity_hub
            )
            if shares_gang_ip or templated_profile:
                unflagged_suspicious.append(a)
            else:
                clean_accs.append(a)

        if unflagged_suspicious:
            lines.append(f"\n!!! ACTION NEEDED — FLAG THESE ({len(unflagged_suspicious)} accounts with strong fake signals):")
            for a in unflagged_suspicious:
                aid = a.get("account_id", "?")
                lines.append(f" → FLAG {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} ip_shared={a.get('shared_ip_count',0)} hub={a.get('hub_legitimacy_score',0):.2f}")

        if flagged_accs:
            lines.append(f"\nALREADY FLAGGED ({len(flagged_accs)}):")
            for a in flagged_accs[:5]:
                lines.append(f" ✓ {a.get('account_id','?')}")

        if clean_accs:
            lines.append(f"\nCLEAN ACCOUNTS ({len(clean_accs)}):")
            for a in clean_accs[:5]:
                aid = a.get("account_id", "?")
                hub = a.get("hub_legitimacy_score", 0)
                hub_mark = " [CELEBRITY]" if hub > celebrity_hub else ""
                lines.append(f" {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} hub={hub:.2f}{hub_mark}")

    visible = obs.get("visible_account_ids") or []
    uninspected = [i for i in visible if i not in inspected_set]
    if uninspected:
        lines.append(f"\nUninspected IDs ({len(uninspected)}): {', '.join(uninspected[:8])}{'...' if len(uninspected) > 8 else ''}")

    lines.append(f"\nMessage: {obs.get('message', '')}")
    return "\n".join(lines)
261
+
262
+
263
def parse_action(llm_text: str, obs: dict) -> dict:
    """Parse LLM output to action dict.

    The first line that starts with INSPECT / FLAG / INVESTIGATE_NETWORK /
    UNFLAG (case-insensitive) or equals SUBMIT decides the action. For
    account actions the id is the first ``acc_<digits>`` token on the line,
    so trailing explanations or punctuation ("FLAG acc_0007 (shared IP).")
    do not leak into ``account_id``; when no such token exists, the first
    word after the verb is used with surrounding punctuation removed.

    Args:
        llm_text: Raw model reply (may be empty).
        obs: Current observation, used only for the fallback choice.

    Returns:
        ``{"action_type": ..., "account_id": ...}`` for account actions or
        ``{"action_type": "submit"}``. If no line parses, the fallback inspects
        the first uninspected suspect, then the first uninspected visible
        account, and submits when neither exists.
    """
    import re

    account_verbs = ("INSPECT ", "FLAG ", "INVESTIGATE_NETWORK ", "UNFLAG ")
    for line in (llm_text or "").split("\n"):
        line = line.strip()
        upper = line.upper()
        if upper.startswith(account_verbs):
            parts = line.split(maxsplit=1)
            account_id = None
            if len(parts) > 1:
                rest = parts[1].lower()
                found = re.search(r"acc_\d+", rest)
                if found:
                    account_id = found.group(0)
                else:
                    # No canonical id on the line: keep the first word only.
                    account_id = rest.split()[0].strip(".,;:!?\"'`()[]*") or None
            return {"action_type": parts[0].lower(), "account_id": account_id}
        # Accept "SUBMIT." / "SUBMIT!" as well as the bare keyword.
        if upper.rstrip(".!") == "SUBMIT":
            return {"action_type": "submit"}

    # Fallback: inspect first uninspected suspect
    suspects = obs.get("suspect_ids", [])
    inspected = obs.get("inspected_ids", [])
    for s in suspects:
        if s not in inspected:
            return {"action_type": "inspect", "account_id": s}
    visible = obs.get("visible_account_ids", [])
    for v in visible:
        if v not in inspected:
            return {"action_type": "inspect", "account_id": v}
    return {"action_type": "submit"}
285
+
286
+
287
+ # ---------------------------------------------------------------------------
288
+ # Test phases
289
+ # ---------------------------------------------------------------------------
290
+
291
def test_endpoints(base_url: str) -> bool:
    """Phase 0: probe every required endpoint once and report the outcome.

    Each probe is (method, path, POST body, expect_json). A probe passes when
    the request completes and, if JSON is expected, the body decodes. Returns
    True only when every probe passes. The probes run in order because the
    /state, /step and /grader calls follow the /reset issued just before them.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("PHASE 0: Endpoint Verification")
    print(banner)

    probes = (
        ("GET", "/health", None, True),
        ("GET", "/tasks", None, True),
        ("GET", "/metadata", None, True),
        ("GET", "/schema", None, True),
        ("GET", "/web", None, False),  # returns HTML, not JSON
        ("POST", "/reset", {"task": "easy", "seed": 0}, True),
        ("GET", "/state", None, True),
        ("POST", "/step", {"action_type": "inspect", "account_id": "acc_0000"}, True),
        ("POST", "/step", {"action_type": "submit"}, True),
        ("GET", "/grader", None, True),
        ("POST", "/mcp", {"jsonrpc": "2.0", "method": "tools/list", "id": 1}, True),
        ("POST", "/baseline", None, True),
    )

    failures = 0
    for method, path, body, expect_json in probes:
        target = f"{base_url}{path}"
        try:
            if method != "GET":
                http_post(target, body)
            else:
                http_get(target, expect_json=expect_json)
            print(f" ✓ {method} {path}")
        except Exception as e:
            print(f" ✗ {method} {path} — {e}")
            failures += 1

    return failures == 0
325
+
326
+
327
def test_baseline_stability(base_url: str) -> bool:
    """Phase 1: call POST /baseline three times and check the scores never change.

    Returns True when the ``scores`` dict of every run equals that of the first run.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("PHASE 1: Baseline Stability (3 runs)")
    print(banner)

    runs = []
    for run_no in (1, 2, 3):
        scores = http_post(f"{base_url}/baseline")["scores"]
        runs.append(scores)
        print(f" Run {run_no}: easy={scores['easy']:.4f} medium={scores['medium']:.4f} hard={scores['hard']:.4f}")

    # Check all identical
    reference = runs[0]
    stable = all(scores == reference for scores in runs)
    verdict = (
        " ✓ All 3 runs identical — baseline is deterministic"
        if stable
        else " ✗ SCORES DIFFER — baseline is non-deterministic!"
    )
    print(verdict)
    return stable
347
+
348
+
349
def test_llm_agent(base_url: str, task: str, seed: int = 0, max_actions: int = 200) -> float:
    """Phase 2: Run an LLM agent against one task and return the grader score.

    Args:
        base_url: Environment server URL without a trailing slash.
        task: Task name passed to POST /reset.
        seed: Seed passed to POST /reset.
        max_actions: Safety cap on LLM-chosen actions. FLAG costs no steps, so
            a model that keeps repeating it would otherwise never reach a
            terminal state; once the cap is passed a SUBMIT is sent instead.

    Returns:
        The ``score`` field of GET /grader after the episode.
    """
    _model = f"Bedrock/{BEDROCK_MODEL_ID}" if LLM_BACKEND == "bedrock" else MODEL_NAME
    print(f"\n --- LLM Agent: task={task}, seed={seed}, model={_model} ---")

    # Reset
    reset_resp = http_post(f"{base_url}/reset", {"task": task, "seed": seed})
    # Some servers wrap the observation, others return it at the top level.
    obs = reset_resp.get("observation", reset_resp)
    done = reset_resp.get("done", False)

    step_num = 0
    while not done:
        step_num += 1
        forced = step_num > max_actions
        if forced:
            print(f" [WARN] Episode not finished after {max_actions} actions — forcing SUBMIT")
            action = {"action_type": "submit"}
        else:
            prompt = format_obs(obs)
            llm_text = call_llm(prompt)
            action = parse_action(llm_text, obs)

        action_str = f"{action['action_type'].upper()} {action.get('account_id') or ''}".strip()

        step_resp = http_post(f"{base_url}/step", action)
        obs = step_resp.get("observation", step_resp)
        done = step_resp.get("done", False)
        reward = step_resp.get("reward")

        flagged_n = len(obs.get("flagged_ids", []))
        suspects_n = len(obs.get("suspect_ids", []))
        steps_left = obs.get("steps_remaining", "?")

        print(f" Step {step_num:2d}: {action_str:35s} flagged={flagged_n}/10 suspects={suspects_n} steps_left={steps_left}")

        if done and reward is not None:
            msg = step_resp.get("message", obs.get("message", ""))
            # message may be null in the response; never slice None.
            print(f" → Episode ended: {str(msg or '')[:100]}")

        if forced and not done:
            # The forced SUBMIT did not end the episode; stop instead of spinning.
            print(" [WARN] Server did not end the episode after SUBMIT — stopping")
            break

    # Get grader score
    grader = http_get(f"{base_url}/grader")
    score = grader["score"]
    print(f" ★ GRADER SCORE: {score:.4f}")
    return score
388
+
389
+
390
def test_llm_all_tasks(base_url: str) -> Dict[str, float]:
    """Phase 2: run the LLM agent once (seed 0) on easy, medium and hard.

    Args:
        base_url: Root URL of the environment server (no trailing slash).

    Returns:
        Mapping of task name to the grader score of that run.
    """
    rule = "=" * 60
    print("\n" + rule)
    if LLM_BACKEND == "bedrock":
        model_label = f"Bedrock/{BEDROCK_MODEL_ID}"
    else:
        model_label = MODEL_NAME
    print(f"PHASE 2: LLM Agent Evaluation (model={model_label})")
    print(rule)

    scores = {
        task: test_llm_agent(base_url, task=task, seed=0)
        for task in ("easy", "medium", "hard")
    }

    print(f"\n Summary: easy={scores['easy']:.4f} medium={scores['medium']:.4f} hard={scores['hard']:.4f}")
    return scores
403
+
404
+
405
+ def test_variance(base_url: str, seeds: List[int] = [0, 1, 2, 3, 4]) -> None:
406
+ """Phase 3: Score variance check (multiple seeds per task)."""
407
+ print("\n" + "="*60)
408
+ print(f"PHASE 3: Score Variance (seeds={seeds})")
409
+ print("="*60)
410
+
411
+ for task in ["easy", "medium", "hard"]:
412
+ task_scores = []
413
+ for seed in seeds:
414
+ score = test_llm_agent(base_url, task=task, seed=seed)
415
+ task_scores.append(score)
416
+
417
+ mean = sum(task_scores) / len(task_scores)
418
+ variance = sum((s - mean) ** 2 for s in task_scores) / len(task_scores)
419
+ print(f"\n {task}: scores={[f'{s:.3f}' for s in task_scores]} mean={mean:.4f} var={variance:.6f}")
420
+
421
+
422
+ # ---------------------------------------------------------------------------
423
+ # Main
424
+ # ---------------------------------------------------------------------------
425
+
426
# Script entry point: parse CLI flags, then run phases 0-3 in order.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Judge Evaluation Simulator for GraphStrike")
    parser.add_argument("--url", required=True, help="Environment server URL")
    parser.add_argument("--bedrock", action="store_true", help="Use AWS Bedrock instead of HF router")
    parser.add_argument("--endpoints-only", action="store_true", help="Only test endpoints (no LLM)")
    parser.add_argument("--skip-variance", action="store_true", help="Skip variance check (faster)")
    parser.add_argument("--seeds", type=int, default=3, help="Number of seeds for variance check")
    args = parser.parse_args()

    # This block runs at module scope, so the assignment rebinds the global
    # that the functions above read when choosing the backend.
    if args.bedrock:
        LLM_BACKEND = "bedrock"

    base = args.url.rstrip("/")
    model_display = f"Bedrock/{BEDROCK_MODEL_ID}" if LLM_BACKEND == "bedrock" else MODEL_NAME
    print(f"GraphStrike Judge Evaluation Simulator")
    print(f"Target: {base}")
    print(f"Backend: {LLM_BACKEND}")
    print(f"Model: {model_display}")
    # Reports "set" when either credential is present, regardless of backend.
    print(f"Token: {'set' if (HF_TOKEN or os.getenv('AWS_ACCESS_KEY_ID')) else 'NOT SET'}")

    # Phase 0: Endpoints
    if not test_endpoints(base):
        print("\n✗ Endpoint check failed. Fix before proceeding.")
        sys.exit(1)

    # Phase 1: Baseline stability
    # The boolean result is not used here; instability is only printed.
    test_baseline_stability(base)

    if args.endpoints_only:
        print("\n✓ Endpoint-only mode — skipping LLM tests.")
        sys.exit(0)

    # Credentials are only required from this point on (LLM phases).
    if LLM_BACKEND == "bedrock":
        if not os.getenv("AWS_ACCESS_KEY_ID"):
            print("\n✗ AWS_ACCESS_KEY_ID not set. Cannot run Bedrock LLM tests.")
            sys.exit(1)
    elif not HF_TOKEN:
        print("\n✗ HF_TOKEN not set. Cannot run LLM agent tests.")
        print(" export HF_TOKEN='hf_...' OR use --bedrock with AWS creds")
        sys.exit(1)

    # Phase 2: LLM on all tasks
    scores = test_llm_all_tasks(base)

    # Phase 3: Variance
    # --seeds N is expanded to the seed list 0..N-1.
    if not args.skip_variance:
        test_variance(base, seeds=list(range(args.seeds)))

    print("\n" + "="*60)
    print("EVALUATION COMPLETE")
    print("="*60)
eval-models/nvidia_test_judge_eval.py ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Judge Evaluation Simulator
4
+ ==========================
5
+
6
+ Simulates EXACTLY how hackathon judges will evaluate your environment:
7
+
8
+ 1. Baseline re-run: POST /baseline → verify scores are stable
9
+ 2. Standard Open LLM agent: Run an LLM (via HF router) against all 3 tasks
10
+ 3. Score variance check: Run same task multiple seeds, check variance
11
+
12
+ USAGE:
13
+ # Against live HF Space (requires HF_TOKEN):
14
+ export HF_TOKEN="hf_..."
15
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space
16
+
17
+ # Against local server:
18
+ export HF_TOKEN="hf_..."
19
+ python test_judge_eval.py --url http://localhost:7860
20
+
21
+ # Choose model (default: nvidia.nemotron-super-3-120b):
22
+ export MODEL_NAME="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
23
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space
24
+
25
+ # Just test endpoints (no LLM needed):
26
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space --endpoints-only
27
+ """
28
+
29
+ import json
30
+ import os
31
+ import sys
32
+ import time
33
+ import urllib.request
34
+ from typing import Dict, List, Optional
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Config from env vars (same as judges will set)
38
+ # ---------------------------------------------------------------------------
39
+
40
# Credential for the OpenAI-compatible endpoint; API_KEY is read as a fallback.
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
# Base URL handed to the OpenAI client in _call_hf.
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
# Model identifier used on the HF-router path.
MODEL_NAME = os.getenv("MODEL_NAME", "nvidia.nemotron-super-3-120b")

# AWS Bedrock config (use --bedrock flag)
LLM_BACKEND = "hf"  # "hf" or "bedrock"
# Model identifier used on the Bedrock path.
BEDROCK_MODEL_ID = os.getenv("BEDROCK_MODEL_ID", "nvidia.nemotron-super-3-120b")
47
+
48
+ SYSTEM_PROMPT = """You are an AI detective finding 10 coordinated fake accounts in a social network.
49
+
50
+ ACTIONS (reply with exactly ONE line):
51
+ - INSPECT acc_XXXX — reveal profile (costs 1 step)
52
+ - FLAG acc_XXXX — mark as fake (FREE, no step cost, triggers suspect cascade)
53
+ - SUBMIT — end episode, get scored
54
+
55
+ DECISION RULES (apply top-to-bottom, first match wins):
56
+ 1. If suspect_ids lists accounts you haven't inspected → INSPECT the first one
57
+ 2. If ANY profiled account has shared_ip_count >= 5 and is NOT flagged → FLAG it immediately
58
+ 3. If ANY profiled account has photo_reuse >= 0.50 AND bio_template >= 0.40 and hub < 0.70 and NOT flagged → FLAG it
59
+ 4. If ANY profiled account has fake_risk_score >= 0.30 and hub < 0.70 and NOT flagged → FLAG it
60
+ 5. If there are uninspected visible accounts and steps > 3 → INSPECT the next one
61
+ 6. If you have 10 flags OR steps <= 3 → SUBMIT
62
+
63
+ IMPORTANT:
64
+ - FLAG is FREE (costs 0 steps) — flag aggressively when you see suspicious signals
65
+ - After each FLAG, new suspects appear — always inspect suspects before other accounts
66
+ - hub_legitimacy_score > 0.70 means celebrity — do NOT flag
67
+ - shared_ip_count >= 5 is the strongest gang signal (all 10 share one IP)
68
+ - Do NOT re-inspect already inspected accounts
69
+
70
+ Reply with EXACTLY one line, nothing else:
71
+ FLAG acc_XXXX
72
+ INSPECT acc_XXXX
73
+ SUBMIT"""
74
+
75
+
76
+ # ---------------------------------------------------------------------------
77
+ # HTTP helpers
78
+ # ---------------------------------------------------------------------------
79
+
80
+ def _retry(fn, retries=3, backoff=3):
81
+ """Retry a function on network errors."""
82
+ for attempt in range(retries):
83
+ try:
84
+ return fn()
85
+ except OSError as e:
86
+ if attempt == retries - 1:
87
+ raise
88
+ wait = backoff * (attempt + 1)
89
+ print(f" [RETRY] Network error: {e} — retrying in {wait}s ({attempt+1}/{retries})")
90
+ time.sleep(wait)
91
+
92
+
93
def http_post(url: str, body: Optional[dict] = None) -> dict:
    """POST ``body`` as JSON to ``url`` and return the decoded JSON reply.

    A missing or empty ``body`` is sent as ``{}``. The request uses a
    120-second timeout and is executed through ``_retry``.
    """
    def _send():
        payload = json.dumps(body or {}).encode()
        request = urllib.request.Request(
            url,
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=120) as response:
            return json.loads(response.read())

    return _retry(_send)
104
+
105
+
106
def http_get(url: str, expect_json: bool = True) -> dict:
    """GET ``url`` (120-second timeout, executed through ``_retry``).

    Returns:
        The decoded JSON body when ``expect_json`` is true; otherwise a small
        dict holding the HTTP status (``_status``) and the body length in
        bytes (``_body_len``).
    """
    def _fetch():
        with urllib.request.urlopen(url, timeout=120) as response:
            raw = response.read()
            if expect_json:
                return json.loads(raw)
            return {"_status": response.status, "_body_len": len(raw)}

    return _retry(_fetch)
114
+
115
+
116
+ # ---------------------------------------------------------------------------
117
+ # LLM call via OpenAI-compatible API
118
+ # ---------------------------------------------------------------------------
119
+
120
def _call_hf(prompt: str) -> str:
    """Send ``prompt`` to the OpenAI-compatible endpoint and return the reply text.

    The system prompt, base URL, token and model come from the module-level
    configuration. A missing or empty reply yields ``""``.
    """
    from openai import OpenAI

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    completion = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN).chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        temperature=0.3,
        max_tokens=256,
    )
    reply = completion.choices[0].message.content
    if not reply:
        return ""
    return reply.strip()
134
+
135
+
136
def _call_bedrock(prompt: str) -> str:
    """Call the LLM via AWS Bedrock and return the reply text.

    Uses ``converse()`` when the installed boto3 client exposes it, otherwise
    falls back to ``invoke_model()`` with an OpenAI-style message payload.

    Credentials are read from ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``
    and ``AWS_SESSION_TOKEN``. The session token is forwarded because
    temporary credentials are rejected when only key and secret are passed
    explicitly. Unset variables are passed as None.
    """
    import boto3
    client = boto3.client(
        service_name="bedrock-runtime",
        region_name=os.getenv("AWS_DEFAULT_REGION", "us-east-1"),
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
        aws_session_token=os.getenv("AWS_SESSION_TOKEN"),
    )
    # Try converse API first (boto3 >= 1.34.x)
    if hasattr(client, "converse"):
        resp = client.converse(
            modelId=BEDROCK_MODEL_ID,
            messages=[{"role": "user", "content": [{"text": prompt}]}],
            system=[{"text": SYSTEM_PROMPT}],
            inferenceConfig={"maxTokens": 256, "temperature": 0.3},
        )
        return resp["output"]["message"]["content"][0]["text"].strip()
    # Fallback: invoke_model (works with all boto3 versions)
    body = json.dumps({
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 256,
        "temperature": 0.3,
    })
    resp = client.invoke_model(
        modelId=BEDROCK_MODEL_ID,
        contentType="application/json",
        accept="application/json",
        body=body,
    )
    result = json.loads(resp["body"].read())
    # Handle both OpenAI-style and Bedrock-native response formats
    if "choices" in result:
        return result["choices"][0]["message"]["content"].strip()
    if "content" in result:
        content = result["content"]
        if isinstance(content, list):
            # An empty content list means "no text", not an IndexError.
            return content[0].get("text", "").strip() if content else ""
        return str(content).strip()
    if "output" in result:
        return result["output"].get("text", "").strip()
    # Unknown shape: return the raw payload as text so the caller can log it.
    return str(result).strip()
181
+
182
+
183
def call_llm(prompt: str) -> str:
    """Query the configured backend, trying up to three times.

    ``LLM_BACKEND`` selects Bedrock or the HF router. ``<think>...</think>``
    blocks are removed from the reply; if nothing is left afterwards the raw
    reply is returned instead. After the third failure an empty string is
    returned rather than raising.
    """
    import re

    backend_call = _call_bedrock if LLM_BACKEND == "bedrock" else _call_hf
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            raw = backend_call(prompt)
            if os.getenv("DEBUG_LLM"):
                print(f" [LLM RAW] {raw[:200]}")
            # Drop reasoning blocks emitted by "thinking" models.
            cleaned = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
            return cleaned or raw
        except Exception as e:
            if attempt == max_attempts - 1:
                print(f" [LLM ERROR] {e} (gave up after 3 attempts)")
                return ""
            wait = 3 * (attempt + 1)
            print(f" [LLM RETRY] {e} — retrying in {wait}s")
            time.sleep(wait)
    return ""
203
+
204
+
205
def _has_strong_fake_signals(account: dict) -> bool:
    """Return True when ``account`` matches SYSTEM_PROMPT rule 2 or rule 3."""
    # Rule 2: a widely shared IP is sufficient on its own.
    if account.get("shared_ip_count", 0) >= 5:
        return True
    # Rule 3: photo reuse + templated bio, but never for high-legitimacy hubs.
    return (
        account.get("photo_reuse_score", 0) >= 0.50
        and account.get("bio_template_score", 0) >= 0.40
        and account.get("hub_legitimacy_score", 0) < 0.70
    )


def format_obs(obs: dict) -> str:
    """Render an observation dict as the text prompt shown to the LLM.

    The text contains, in order: task and remaining steps, flagged ids,
    uninspected suspects, visible accounts grouped into "flag these",
    "already flagged" and "clean" (the last two capped at five entries),
    up to eight uninspected ids, and the server message.

    Accounts matching the photo+bio rule are only listed under "FLAG THESE"
    when ``hub_legitimacy_score`` is below 0.70, matching the system prompt;
    otherwise a celebrity could be presented as an account to flag.

    Args:
        obs: Observation mapping; every key is optional.

    Returns:
        The multi-line prompt text.
    """
    lines = []
    lines.append(f"TASK: {obs.get('task','?').upper()} | Steps remaining: {obs.get('steps_remaining','?')}")

    flagged = obs.get("flagged_ids", [])
    lines.append(f"Flagged ({len(flagged)}/10): {', '.join(flagged) if flagged else 'none'}")

    # Sets make the repeated membership checks below O(1).
    flagged_set = set(flagged)
    inspected = set(obs.get("inspected_ids", []))

    suspects = obs.get("suspect_ids", [])
    uninspected_suspects = [s for s in suspects if s not in inspected]
    if uninspected_suspects:
        lines.append(f"*** SUSPECTS (uninspected) → INSPECT THESE FIRST: {', '.join(uninspected_suspects)} ***")

    accounts = obs.get("visible_accounts", [])
    if accounts:
        # Split: unflagged accounts that should be flagged vs rest
        unflagged_suspicious = []
        flagged_accs = []
        clean_accs = []
        # Highest risk first so the capped lists keep the riskiest entries.
        for a in sorted(accounts, key=lambda x: x.get("fake_risk_score", 0), reverse=True):
            if a.get("account_id", "?") in flagged_set:
                flagged_accs.append(a)
            elif _has_strong_fake_signals(a):
                unflagged_suspicious.append(a)
            else:
                clean_accs.append(a)

        if unflagged_suspicious:
            lines.append(f"\n!!! ACTION NEEDED — FLAG THESE ({len(unflagged_suspicious)} accounts with strong fake signals):")
            for a in unflagged_suspicious:
                aid = a.get("account_id", "?")
                lines.append(f" → FLAG {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} ip_shared={a.get('shared_ip_count',0)} hub={a.get('hub_legitimacy_score',0):.2f}")

        if flagged_accs:
            lines.append(f"\nALREADY FLAGGED ({len(flagged_accs)}):")
            for a in flagged_accs[:5]:
                lines.append(f" ✓ {a.get('account_id','?')}")

        if clean_accs:
            lines.append(f"\nCLEAN ACCOUNTS ({len(clean_accs)}):")
            for a in clean_accs[:5]:
                aid = a.get("account_id", "?")
                hub = a.get("hub_legitimacy_score", 0)
                hub_mark = " [CELEBRITY]" if hub > 0.70 else ""
                lines.append(f" {aid}: risk={a.get('fake_risk_score',0):.3f} photo={a.get('photo_reuse_score',0):.2f} bio={a.get('bio_template_score',0):.2f} hub={hub:.2f}{hub_mark}")

    visible = obs.get("visible_account_ids", [])
    uninspected = [i for i in visible if i not in inspected]
    if uninspected:
        lines.append(f"\nUninspected IDs ({len(uninspected)}): {', '.join(uninspected[:8])}{'...' if len(uninspected) > 8 else ''}")

    lines.append(f"\nMessage: {obs.get('message', '')}")
    return "\n".join(lines)
261
+
262
+
263
def parse_action(llm_text: str, obs: dict) -> dict:
    """Turn the LLM reply into an action dict for POST /step.

    The first line that starts with a known verb wins. Only the first token
    after the verb is taken as the account id (lower-cased, surrounding
    punctuation removed), so trailing explanations such as
    ``FLAG acc_0001 because ...`` no longer end up inside the id.

    When no line parses, the fallback inspects the first uninspected suspect,
    then the first uninspected visible account, and otherwise submits.

    Args:
        llm_text: Raw (possibly multi-line) model reply.
        obs: Current observation, used only for the fallback.

    Returns:
        ``{"action_type": ..., "account_id": ...}`` or ``{"action_type": "submit"}``.
    """
    account_verbs = ("INSPECT", "FLAG", "INVESTIGATE_NETWORK", "UNFLAG")
    for line in llm_text.split("\n"):
        tokens = line.split()
        if not tokens:
            continue
        verb = tokens[0].upper()
        if verb in account_verbs and len(tokens) > 1:
            account_id = tokens[1].strip(".,;:!`'\"*()[]").lower()
            if account_id:
                return {"action_type": verb.lower(), "account_id": account_id}
        if verb == "SUBMIT" and len(tokens) == 1:
            return {"action_type": "submit"}

    # Fallback: inspect first uninspected suspect
    inspected = set(obs.get("inspected_ids", []))
    for candidates in (obs.get("suspect_ids", []), obs.get("visible_account_ids", [])):
        for account_id in candidates:
            if account_id not in inspected:
                return {"action_type": "inspect", "account_id": account_id}
    return {"action_type": "submit"}
285
+
286
+
287
+ # ---------------------------------------------------------------------------
288
+ # Test phases
289
+ # ---------------------------------------------------------------------------
290
+
291
def test_endpoints(base_url: str) -> bool:
    """Phase 0: hit every required endpoint once and report the outcome.

    Args:
        base_url: Root URL of the environment server (no trailing slash).

    Returns:
        True when every request completed without raising, False otherwise.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("PHASE 0: Endpoint Verification")
    print(rule)

    # (method, path, JSON body for POST, whether a JSON reply is expected)
    checks = [
        ("GET", "/health", None, True),
        ("GET", "/tasks", None, True),
        ("GET", "/metadata", None, True),
        ("GET", "/schema", None, True),
        ("GET", "/web", None, False),  # returns HTML, not JSON
        ("POST", "/reset", {"task": "easy", "seed": 0}, True),
        ("GET", "/state", None, True),
        ("POST", "/step", {"action_type": "inspect", "account_id": "acc_0000"}, True),
        ("POST", "/step", {"action_type": "submit"}, True),
        ("GET", "/grader", None, True),
        ("POST", "/mcp", {"jsonrpc": "2.0", "method": "tools/list", "id": 1}, True),
        ("POST", "/baseline", None, True),
    ]

    failures = 0
    for method, path, body, expect_json in checks:
        target = f"{base_url}{path}"
        try:
            if method != "GET":
                http_post(target, body)
            else:
                http_get(target, expect_json=expect_json)
            print(f" ✓ {method} {path}")
        except Exception as e:
            print(f" ✗ {method} {path} — {e}")
            failures += 1

    return failures == 0
325
+
326
+
327
def test_baseline_stability(base_url: str) -> bool:
    """Phase 1: call POST /baseline three times and compare the scores.

    Args:
        base_url: Root URL of the environment server (no trailing slash).

    Returns:
        True when every run returned the same ``scores`` mapping as the
        first run, False otherwise.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("PHASE 1: Baseline Stability (3 runs)")
    print(rule)

    runs = []
    for run_no in range(1, 4):
        result = http_post(f"{base_url}/baseline")
        run_scores = result["scores"]
        runs.append(run_scores)
        print(f" Run {run_no}: easy={run_scores['easy']:.4f} medium={run_scores['medium']:.4f} hard={run_scores['hard']:.4f}")

    # Every run is compared against the first one.
    reference = runs[0]
    stable = all(run_scores == reference for run_scores in runs)
    if not stable:
        print(" ✗ SCORES DIFFER — baseline is non-deterministic!")
    else:
        print(" ✓ All 3 runs identical — baseline is deterministic")
    return stable
347
+
348
+
349
def test_llm_agent(base_url: str, task: str, seed: int = 0, max_steps: int = 200) -> float:
    """Phase 2: run the LLM agent through one episode and return its grader score.

    The episode is reset via POST /reset, then the loop formats the current
    observation, asks the LLM for an action, and posts it to /step until the
    server reports ``done``.

    Args:
        base_url: Root URL of the environment server (no trailing slash).
        task: Task name sent to /reset.
        seed: Seed sent to /reset.
        max_steps: Upper bound on the number of actions sent. FLAG is
            described to the model as costing no steps, so without a cap a
            model that keeps flagging would never finish the episode.

    Returns:
        The ``score`` field of GET /grader after the episode ends.
    """
    _model = f"Bedrock/{BEDROCK_MODEL_ID}" if LLM_BACKEND == "bedrock" else MODEL_NAME
    print(f"\n --- LLM Agent: task={task}, seed={seed}, model={_model} ---")

    # Reset
    reset_resp = http_post(f"{base_url}/reset", {"task": task, "seed": seed})
    # Some responses nest the observation, others are the observation itself.
    obs = reset_resp.get("observation", reset_resp)
    done = reset_resp.get("done", False)

    step_num = 0
    while not done:
        if step_num >= max_steps:
            # Safety valve: end the episode explicitly so the grader has a result.
            print(f" [ABORT] No terminal state after {max_steps} actions — forcing SUBMIT")
            http_post(f"{base_url}/step", {"action_type": "submit"})
            break

        step_num += 1
        prompt = format_obs(obs)
        llm_text = call_llm(prompt)
        action = parse_action(llm_text, obs)

        action_str = f"{action['action_type'].upper()} {action.get('account_id', '')}".strip()

        step_resp = http_post(f"{base_url}/step", action)
        obs = step_resp.get("observation", step_resp)
        done = step_resp.get("done", False)
        reward = step_resp.get("reward")

        flagged_n = len(obs.get("flagged_ids", []))
        suspects_n = len(obs.get("suspect_ids", []))
        steps_left = obs.get("steps_remaining", "?")

        print(f" Step {step_num:2d}: {action_str:35s} flagged={flagged_n}/10 suspects={suspects_n} steps_left={steps_left}")

        if done and reward is not None:
            msg = step_resp.get("message", obs.get("message", ""))
            # The message may be null in the JSON payload; never slice None.
            print(f" → Episode ended: {str(msg or '')[:100]}")

    # Get grader score
    grader = http_get(f"{base_url}/grader")
    score = grader["score"]
    print(f" ★ GRADER SCORE: {score:.4f}")
    return score
388
+
389
+
390
def test_llm_all_tasks(base_url: str) -> Dict[str, float]:
    """Phase 2: run the LLM agent once (seed 0) on easy, medium and hard.

    Args:
        base_url: Root URL of the environment server (no trailing slash).

    Returns:
        Mapping of task name to the grader score of that run.
    """
    rule = "=" * 60
    print("\n" + rule)
    if LLM_BACKEND == "bedrock":
        model_label = f"Bedrock/{BEDROCK_MODEL_ID}"
    else:
        model_label = MODEL_NAME
    print(f"PHASE 2: LLM Agent Evaluation (model={model_label})")
    print(rule)

    scores = {
        task: test_llm_agent(base_url, task=task, seed=0)
        for task in ("easy", "medium", "hard")
    }

    print(f"\n Summary: easy={scores['easy']:.4f} medium={scores['medium']:.4f} hard={scores['hard']:.4f}")
    return scores
403
+
404
+
405
+ def test_variance(base_url: str, seeds: List[int] = [0, 1, 2, 3, 4]) -> None:
406
+ """Phase 3: Score variance check (multiple seeds per task)."""
407
+ print("\n" + "="*60)
408
+ print(f"PHASE 3: Score Variance (seeds={seeds})")
409
+ print("="*60)
410
+
411
+ for task in ["easy", "medium", "hard"]:
412
+ task_scores = []
413
+ for seed in seeds:
414
+ score = test_llm_agent(base_url, task=task, seed=seed)
415
+ task_scores.append(score)
416
+
417
+ mean = sum(task_scores) / len(task_scores)
418
+ variance = sum((s - mean) ** 2 for s in task_scores) / len(task_scores)
419
+ print(f"\n {task}: scores={[f'{s:.3f}' for s in task_scores]} mean={mean:.4f} var={variance:.6f}")
420
+
421
+
422
+ # ---------------------------------------------------------------------------
423
+ # Main
424
+ # ---------------------------------------------------------------------------
425
+
426
# Script entry point: parse CLI flags, then run phases 0-3 in order.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Judge Evaluation Simulator for GraphStrike")
    parser.add_argument("--url", required=True, help="Environment server URL")
    parser.add_argument("--bedrock", action="store_true", help="Use AWS Bedrock instead of HF router")
    parser.add_argument("--endpoints-only", action="store_true", help="Only test endpoints (no LLM)")
    parser.add_argument("--skip-variance", action="store_true", help="Skip variance check (faster)")
    parser.add_argument("--seeds", type=int, default=3, help="Number of seeds for variance check")
    args = parser.parse_args()

    # This block runs at module scope, so the assignment rebinds the global
    # that the functions above read when choosing the backend.
    if args.bedrock:
        LLM_BACKEND = "bedrock"

    base = args.url.rstrip("/")
    model_display = f"Bedrock/{BEDROCK_MODEL_ID}" if LLM_BACKEND == "bedrock" else MODEL_NAME
    print(f"GraphStrike Judge Evaluation Simulator")
    print(f"Target: {base}")
    print(f"Backend: {LLM_BACKEND}")
    print(f"Model: {model_display}")
    # Reports "set" when either credential is present, regardless of backend.
    print(f"Token: {'set' if (HF_TOKEN or os.getenv('AWS_ACCESS_KEY_ID')) else 'NOT SET'}")

    # Phase 0: Endpoints
    if not test_endpoints(base):
        print("\n✗ Endpoint check failed. Fix before proceeding.")
        sys.exit(1)

    # Phase 1: Baseline stability
    # The boolean result is not used here; instability is only printed.
    test_baseline_stability(base)

    if args.endpoints_only:
        print("\n✓ Endpoint-only mode — skipping LLM tests.")
        sys.exit(0)

    # Credentials are only required from this point on (LLM phases).
    if LLM_BACKEND == "bedrock":
        if not os.getenv("AWS_ACCESS_KEY_ID"):
            print("\n✗ AWS_ACCESS_KEY_ID not set. Cannot run Bedrock LLM tests.")
            sys.exit(1)
    elif not HF_TOKEN:
        print("\n✗ HF_TOKEN not set. Cannot run LLM agent tests.")
        print(" export HF_TOKEN='hf_...' OR use --bedrock with AWS creds")
        sys.exit(1)

    # Phase 2: LLM on all tasks
    scores = test_llm_all_tasks(base)

    # Phase 3: Variance
    # --seeds N is expanded to the seed list 0..N-1.
    if not args.skip_variance:
        test_variance(base, seeds=list(range(args.seeds)))

    print("\n" + "="*60)
    print("EVALUATION COMPLETE")
    print("="*60)
eval-models/qwen_test_judge_eval.py ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Judge Evaluation Simulator
4
+ ==========================
5
+
6
+ Simulates EXACTLY how hackathon judges will evaluate your environment:
7
+
8
+ 1. Baseline re-run: POST /baseline → verify scores are stable
9
+ 2. Standard Open LLM agent: Run an LLM (via HF router) against all 3 tasks
10
+ 3. Score variance check: Run same task multiple seeds, check variance
11
+
12
+ USAGE:
13
+ # Against live HF Space (requires HF_TOKEN):
14
+ export HF_TOKEN="hf_..."
15
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space
16
+
17
+ # Against local server:
18
+ export HF_TOKEN="hf_..."
19
+ python test_judge_eval.py --url http://localhost:7860
20
+
21
+ # Choose model (default: Qwen/Qwen2.5-72B-Instruct):
22
+ export MODEL_NAME="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
23
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space
24
+
25
+ # Just test endpoints (no LLM needed):
26
+ python test_judge_eval.py --url https://pandago-graphstrike.hf.space --endpoints-only
27
+ """
28
+
29
+ import json
30
+ import os
31
+ import sys
32
+ import time
33
+ import urllib.request
34
+ from typing import Dict, List, Optional
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Config from env vars (same as judges will set)
38
+ # ---------------------------------------------------------------------------
39
+
40
# Credential for the OpenAI-compatible endpoint; API_KEY is read as a fallback.
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
# Base URL handed to the OpenAI client in _call_hf.
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
# Model identifier used on the HF-router path.
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")

# AWS Bedrock config (use --bedrock flag)
LLM_BACKEND = "hf"  # "hf" or "bedrock"
# Model identifier used on the Bedrock path.
BEDROCK_MODEL_ID = os.getenv("BEDROCK_MODEL_ID", "qwen.qwen3-next-80b-a3b")
47
+
48
+ SYSTEM_PROMPT = """You are an AI detective finding 10 coordinated fake accounts in a social network.
49
+
50
+ ACTIONS (reply with exactly ONE line):
51
+ - INSPECT acc_XXXX — reveal profile (costs 1 step)
52
+ - FLAG acc_XXXX — mark as fake (FREE, no step cost, triggers suspect cascade)
53
+ - SUBMIT — end episode, get scored
54
+
55
+ DECISION RULES (apply top-to-bottom, first match wins):
56
+ 1. If suspect_ids lists accounts you haven't inspected → INSPECT the first one
57
+ 2. If ANY profiled account has shared_ip_count >= 5 and is NOT flagged → FLAG it immediately
58
+ 3. If ANY profiled account has photo_reuse >= 0.50 AND bio_template >= 0.40 and hub < 0.70 and NOT flagged → FLAG it
59
+ 4. If ANY profiled account has fake_risk_score >= 0.30 and hub < 0.70 and NOT flagged → FLAG it
60
+ 5. If there are uninspected visible accounts and steps > 3 → INSPECT the next one
61
+ 6. If you have 10 flags OR steps <= 3 → SUBMIT
62
+
63
+ IMPORTANT:
64
+ - FLAG is FREE (costs 0 steps) — flag aggressively when you see suspicious signals
65
+ - After each FLAG, new suspects appear — always inspect suspects before other accounts
66
+ - hub_legitimacy_score > 0.70 means celebrity — do NOT flag
67
+ - shared_ip_count >= 5 is the strongest gang signal (all 10 share one IP)
68
+ - Do NOT re-inspect already inspected accounts
69
+
70
+ Reply with EXACTLY one line, nothing else:
71
+ FLAG acc_XXXX
72
+ INSPECT acc_XXXX
73
+ SUBMIT"""
74
+
75
+
76
+ # ---------------------------------------------------------------------------
77
+ # HTTP helpers
78
+ # ---------------------------------------------------------------------------
79
+
80
+ def _retry(fn, retries=3, backoff=3):
81
+ """Retry a function on network errors."""
82
+ for attempt in range(retries):
83
+ try:
84
+ return fn()
85
+ except OSError as e:
86
+ if attempt == retries - 1:
87
+ raise
88
+ wait = backoff * (attempt + 1)
89
+ print(f" [RETRY] Network error: {e} — retrying in {wait}s ({attempt+1}/{retries})")
90
+ time.sleep(wait)
91
+
92
+
93
def http_post(url: str, body: Optional[dict] = None) -> dict:
    """POST ``body`` as JSON to ``url`` and return the decoded JSON reply.

    A missing or empty ``body`` is sent as ``{}``. The request uses a
    120-second timeout and is executed through ``_retry``.
    """
    def _send():
        payload = json.dumps(body or {}).encode()
        request = urllib.request.Request(
            url,
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=120) as response:
            return json.loads(response.read())

    return _retry(_send)
104
+
105
+
106
def http_get(url: str, expect_json: bool = True) -> dict:
    """GET ``url`` (120-second timeout, executed through ``_retry``).

    Returns:
        The decoded JSON body when ``expect_json`` is true; otherwise a small
        dict holding the HTTP status (``_status``) and the body length in
        bytes (``_body_len``).
    """
    def _fetch():
        with urllib.request.urlopen(url, timeout=120) as response:
            raw = response.read()
            if expect_json:
                return json.loads(raw)
            return {"_status": response.status, "_body_len": len(raw)}

    return _retry(_fetch)
114
+
115
+
116
+ # ---------------------------------------------------------------------------
117
+ # LLM call via OpenAI-compatible API
118
+ # ---------------------------------------------------------------------------
119
+
120
def _call_hf(prompt: str) -> str:
    """Send ``prompt`` to the OpenAI-compatible endpoint and return the reply text.

    The system prompt, base URL, token and model come from the module-level
    configuration. A missing or empty reply yields ``""``.
    """
    from openai import OpenAI

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    completion = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN).chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        temperature=0.3,
        max_tokens=256,
    )
    reply = completion.choices[0].message.content
    if not reply:
        return ""
    return reply.strip()
134
+
135
+
136
def _call_bedrock(prompt: str) -> str:
    """Call the LLM via AWS Bedrock and return the reply text.

    Uses ``converse()`` when the installed boto3 client exposes it, otherwise
    falls back to ``invoke_model()`` with an OpenAI-style message payload.

    Credentials are read from ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``
    and ``AWS_SESSION_TOKEN``. The session token is forwarded because
    temporary credentials are rejected when only key and secret are passed
    explicitly. Unset variables are passed as None.
    """
    import boto3
    client = boto3.client(
        service_name="bedrock-runtime",
        region_name=os.getenv("AWS_DEFAULT_REGION", "us-east-1"),
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
        aws_session_token=os.getenv("AWS_SESSION_TOKEN"),
    )
    # Try converse API first (boto3 >= 1.34.x)
    if hasattr(client, "converse"):
        resp = client.converse(
            modelId=BEDROCK_MODEL_ID,
            messages=[{"role": "user", "content": [{"text": prompt}]}],
            system=[{"text": SYSTEM_PROMPT}],
            inferenceConfig={"maxTokens": 256, "temperature": 0.3},
        )
        return resp["output"]["message"]["content"][0]["text"].strip()
    # Fallback: invoke_model (works with all boto3 versions)
    body = json.dumps({
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 256,
        "temperature": 0.3,
    })
    resp = client.invoke_model(
        modelId=BEDROCK_MODEL_ID,
        contentType="application/json",
        accept="application/json",
        body=body,
    )
    result = json.loads(resp["body"].read())
    # Handle both OpenAI-style and Bedrock-native response formats
    if "choices" in result:
        return result["choices"][0]["message"]["content"].strip()
    if "content" in result:
        content = result["content"]
        if isinstance(content, list):
            # An empty content list means "no text", not an IndexError.
            return content[0].get("text", "").strip() if content else ""
        return str(content).strip()
    if "output" in result:
        return result["output"].get("text", "").strip()
    # Unknown shape: return the raw payload as text so the caller can log it.
    return str(result).strip()
181
+
182
+
183
def call_llm(prompt: str) -> str:
    """Query the configured LLM backend, retrying on any exception.

    The backend is ``_call_bedrock`` when ``LLM_BACKEND == "bedrock"`` and
    ``_call_hf`` otherwise. Up to three attempts are made, sleeping 3s and
    then 6s between them. ``<think>...</think>`` blocks are removed from the
    reply; if nothing is left after removal the raw reply is returned.

    Returns:
        The cleaned reply, or ``""`` once all three attempts have failed.
    """
    import re

    backend = _call_hf
    if LLM_BACKEND == "bedrock":
        backend = _call_bedrock

    for attempt in range(3):
        is_last_attempt = attempt == 2
        try:
            raw = backend(prompt)
            if os.getenv("DEBUG_LLM"):
                print(f" [LLM RAW] {raw[:200]}")
            # Strip Qwen3 <think>...</think> reasoning blocks
            without_think = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
            return without_think or raw
        except Exception as err:
            if is_last_attempt:
                print(f" [LLM ERROR] {err} (gave up after 3 attempts)")
                return ""
            # Linear back-off: 3s after the first failure, 6s after the second.
            delay = 3 * (attempt + 1)
            print(f" [LLM RETRY] {err} — retrying in {delay}s")
            time.sleep(delay)
    return ""
203
+
204
+
205
def format_obs(obs: dict) -> str:
    """Render an observation dict as the text prompt handed to the LLM.

    Sections, in order: task/steps header, flagged ids, uninspected suspects
    (if any), visible accounts grouped into "flag these" / already flagged /
    clean (if any accounts are visible), uninspected visible ids (first 8),
    and the environment message. Missing keys fall back to defaults.

    Returns:
        The sections joined with newlines.
    """
    flagged = obs.get("flagged_ids", [])
    out = [
        f"TASK: {obs.get('task','?').upper()} | Steps remaining: {obs.get('steps_remaining','?')}",
        f"Flagged ({len(flagged)}/10): {', '.join(flagged) if flagged else 'none'}",
    ]

    suspects = obs.get("suspect_ids", [])
    inspected = obs.get("inspected_ids", [])
    pending_suspects = [sid for sid in suspects if sid not in inspected]
    if pending_suspects:
        out.append(f"*** SUSPECTS (uninspected) → INSPECT THESE FIRST: {', '.join(pending_suspects)} ***")

    accounts = obs.get("visible_accounts", [])
    if accounts:
        # Bucket accounts (highest risk first): already flagged, strong fake
        # signals but not yet flagged, and everything else.
        to_flag, already_flagged, clean = [], [], []
        ranked = sorted(accounts, key=lambda acc: acc.get("fake_risk_score", 0), reverse=True)
        for acc in ranked:
            if acc.get("account_id", "?") in flagged:
                already_flagged.append(acc)
                continue
            # Strong signal: many accounts on one IP, or reused photo AND
            # templated bio together.
            if acc.get("shared_ip_count", 0) >= 5 or (
                acc.get("photo_reuse_score", 0) >= 0.50
                and acc.get("bio_template_score", 0) >= 0.40
            ):
                to_flag.append(acc)
            else:
                clean.append(acc)

        if to_flag:
            out.append(f"\n!!! ACTION NEEDED — FLAG THESE ({len(to_flag)} accounts with strong fake signals):")
            for acc in to_flag:
                aid = acc.get("account_id", "?")
                out.append(f" → FLAG {aid}: risk={acc.get('fake_risk_score',0):.3f} photo={acc.get('photo_reuse_score',0):.2f} bio={acc.get('bio_template_score',0):.2f} ip_shared={acc.get('shared_ip_count',0)} hub={acc.get('hub_legitimacy_score',0):.2f}")

        if already_flagged:
            out.append(f"\nALREADY FLAGGED ({len(already_flagged)}):")
            out.extend(f" ✓ {acc.get('account_id','?')}" for acc in already_flagged[:5])

        if clean:
            out.append(f"\nCLEAN ACCOUNTS ({len(clean)}):")
            for acc in clean[:5]:
                aid = acc.get("account_id", "?")
                hub = acc.get("hub_legitimacy_score", 0)
                hub_mark = ""
                if hub > 0.70:
                    hub_mark = " [CELEBRITY]"
                out.append(f" {aid}: risk={acc.get('fake_risk_score',0):.3f} photo={acc.get('photo_reuse_score',0):.2f} bio={acc.get('bio_template_score',0):.2f} hub={hub:.2f}{hub_mark}")

    visible = obs.get("visible_account_ids", [])
    not_inspected = [vid for vid in visible if vid not in inspected]
    if not_inspected:
        out.append(f"\nUninspected IDs ({len(not_inspected)}): {', '.join(not_inspected[:8])}{'...' if len(not_inspected) > 8 else ''}")

    out.append(f"\nMessage: {obs.get('message', '')}")
    return "\n".join(out)
261
+
262
+
263
def parse_action(llm_text: str, obs: dict) -> dict:
    """Parse LLM output to action dict.

    The first line that starts with ``INSPECT``, ``FLAG``,
    ``INVESTIGATE_NETWORK`` or ``UNFLAG`` (case-insensitive) followed by an
    argument, or that is exactly ``SUBMIT``, decides the action. The account
    id is the first whitespace-delimited token after the verb, lower-cased
    and with surrounding punctuation/markdown characters removed, so trailing
    commentary such as ``FLAG acc_0043 because ...`` still yields a clean id.

    If no line matches, fall back to inspecting the first uninspected suspect,
    then the first uninspected visible account, and finally to ``submit``.

    Args:
        llm_text: Raw text returned by the model.
        obs: Current observation; only ``suspect_ids``, ``inspected_ids`` and
            ``visible_account_ids`` are read, and only for the fallback.

    Returns:
        ``{"action_type": ..., "account_id": ...}`` or
        ``{"action_type": "submit"}``.
    """
    verbs = ("INSPECT ", "FLAG ", "INVESTIGATE_NETWORK ", "UNFLAG ")
    # Characters models commonly wrap around or append to an id.
    wrapping = "`'\"*.,;:!()[]<>"

    for line in llm_text.split("\n"):
        line = line.strip()
        upper = line.upper()
        if upper.startswith(verbs):
            tokens = line.split()
            account_id = tokens[1].strip(wrapping).lower() if len(tokens) > 1 else ""
            if not account_id:
                # Verb with no usable id (e.g. "FLAG ..."): keep scanning
                # rather than emitting an action the server cannot resolve.
                continue
            return {"action_type": tokens[0].lower(), "account_id": account_id}
        if upper == "SUBMIT":
            return {"action_type": "submit"}

    # Fallback: inspect first uninspected suspect
    inspected = obs.get("inspected_ids", [])
    for candidates in (obs.get("suspect_ids", []), obs.get("visible_account_ids", [])):
        for candidate in candidates:
            if candidate not in inspected:
                return {"action_type": "inspect", "account_id": candidate}
    return {"action_type": "submit"}
285
+
286
+
287
+ # ---------------------------------------------------------------------------
288
+ # Test phases
289
+ # ---------------------------------------------------------------------------
290
+
291
def test_endpoints(base_url: str) -> bool:
    """Phase 0: hit every required endpoint once and report pass/fail per call.

    Requests are issued in a fixed order because later ones (``/state``,
    ``/step``, ``/grader``) follow the ``/reset`` call in the list. GET
    requests go through ``http_get`` and everything else through
    ``http_post``; any exception marks that endpoint as failed.

    Returns:
        ``True`` only if no request raised.
    """
    rule = "="*60
    print("\n" + rule)
    print("PHASE 0: Endpoint Verification")
    print(rule)

    # (method, path, JSON body for POST, whether a GET should parse as JSON)
    checks = [
        ("GET", "/health", None, True),
        ("GET", "/tasks", None, True),
        ("GET", "/metadata", None, True),
        ("GET", "/schema", None, True),
        ("GET", "/web", None, False),  # returns HTML, not JSON
        ("POST", "/reset", {"task": "easy", "seed": 0}, True),
        ("GET", "/state", None, True),
        ("POST", "/step", {"action_type": "inspect", "account_id": "acc_0000"}, True),
        ("POST", "/step", {"action_type": "submit"}, True),
        ("GET", "/grader", None, True),
        ("POST", "/mcp", {"jsonrpc": "2.0", "method": "tools/list", "id": 1}, True),
        ("POST", "/baseline", None, True),
    ]

    failures = 0
    for method, path, body, expect_json in checks:
        try:
            if method != "GET":
                http_post(f"{base_url}{path}", body)
            else:
                http_get(f"{base_url}{path}", expect_json=expect_json)
            print(f" ✓ {method} {path}")
        except Exception as exc:
            print(f" ✗ {method} {path} — {exc}")
            failures += 1

    return failures == 0
325
+
326
+
327
def test_baseline_stability(base_url: str) -> bool:
    """Phase 1: POST ``/baseline`` three times and require identical scores.

    Each response must contain a ``scores`` mapping with ``easy``, ``medium``
    and ``hard`` entries; the three mappings are compared for equality.

    Returns:
        ``True`` when all three runs returned equal score mappings.
    """
    rule = "="*60
    print("\n" + rule)
    print("PHASE 1: Baseline Stability (3 runs)")
    print(rule)

    runs = []
    for run_no in range(1, 4):
        scores = http_post(f"{base_url}/baseline")["scores"]
        runs.append(scores)
        print(f" Run {run_no}: easy={scores['easy']:.4f} medium={scores['medium']:.4f} hard={scores['hard']:.4f}")

    # Deterministic means every run matches the first one.
    reference = runs[0]
    stable = all(run == reference for run in runs)
    if not stable:
        print(" ✗ SCORES DIFFER — baseline is non-deterministic!")
    else:
        print(" ✓ All 3 runs identical — baseline is deterministic")
    return stable
347
+
348
+
349
def test_llm_agent(base_url: str, task: str, seed: int = 0) -> float:
    """Phase 2: play one episode of ``task`` with the LLM choosing each action.

    Resets the environment with the given task/seed, then loops: format the
    observation, ask the LLM, parse its reply into an action, POST it to
    ``/step``, and print a one-line progress summary. The loop ends when the
    server reports ``done``; the score is then read from ``/grader``.

    Returns:
        The ``score`` field of the ``/grader`` response.
    """
    if LLM_BACKEND == "bedrock":
        _model = f"Bedrock/{BEDROCK_MODEL_ID}"
    else:
        _model = MODEL_NAME
    print(f"\n --- LLM Agent: task={task}, seed={seed}, model={_model} ---")

    # Reset
    response = http_post(f"{base_url}/reset", {"task": task, "seed": seed})
    # Responses may wrap the observation or be the observation itself.
    obs = response.get("observation", response)
    done = response.get("done", False)

    step_num = 0
    while not done:
        step_num += 1
        llm_text = call_llm(format_obs(obs))
        action = parse_action(llm_text, obs)

        action_str = f"{action['action_type'].upper()} {action.get('account_id', '')}".strip()

        response = http_post(f"{base_url}/step", action)
        obs = response.get("observation", response)
        done = response.get("done", False)
        reward = response.get("reward")

        flagged_n = len(obs.get("flagged_ids", []))
        suspects_n = len(obs.get("suspect_ids", []))
        steps_left = obs.get("steps_remaining", "?")

        print(f" Step {step_num:2d}: {action_str:35s} flagged={flagged_n}/10 suspects={suspects_n} steps_left={steps_left}")

        if done and reward is not None:
            # Prefer a top-level message, else the one inside the observation.
            msg = response.get("message", obs.get("message", ""))
            print(f" → Episode ended: {msg[:100]}")

    # Get grader score
    score = http_get(f"{base_url}/grader")["score"]
    print(f" ★ GRADER SCORE: {score:.4f}")
    return score
388
+
389
+
390
def test_llm_all_tasks(base_url: str) -> Dict[str, float]:
    """Phase 2: run the LLM agent once (seed 0) on easy, medium and hard.

    Returns:
        Mapping of task name to the grader score from ``test_llm_agent``.
    """
    rule = "="*60
    print("\n" + rule)
    if LLM_BACKEND == "bedrock":
        _model = f"Bedrock/{BEDROCK_MODEL_ID}"
    else:
        _model = MODEL_NAME
    print(f"PHASE 2: LLM Agent Evaluation (model={_model})")
    print(rule)

    scores = {
        task: test_llm_agent(base_url, task=task, seed=0)
        for task in ("easy", "medium", "hard")
    }

    print(f"\n Summary: easy={scores['easy']:.4f} medium={scores['medium']:.4f} hard={scores['hard']:.4f}")
    return scores
403
+
404
+
405
def test_variance(base_url: str, seeds: List[int] = [0, 1, 2, 3, 4]) -> None:
    """Phase 3: Score variance check (multiple seeds per task).

    Runs ``test_llm_agent`` once per (task, seed) pair and prints, for each
    task, the individual scores plus their mean and population variance.

    Args:
        base_url: Environment server URL (no trailing slash).
        seeds: Seeds to evaluate for every task. An empty sequence skips the
            phase instead of dividing by zero.
    """
    # Work on a private copy so the shared default list can never be mutated.
    seeds = list(seeds)

    print("\n" + "="*60)
    print(f"PHASE 3: Score Variance (seeds={seeds})")
    print("="*60)

    if not seeds:
        # Previously this fell through to sum()/len() on an empty list and
        # crashed with ZeroDivisionError (reachable via ``--seeds 0``).
        print(" (no seeds given — skipping variance check)")
        return

    for task in ["easy", "medium", "hard"]:
        task_scores = [test_llm_agent(base_url, task=task, seed=seed) for seed in seeds]

        mean = sum(task_scores) / len(task_scores)
        # Population variance (divide by N, not N-1).
        variance = sum((s - mean) ** 2 for s in task_scores) / len(task_scores)
        print(f"\n {task}: scores={[f'{s:.3f}' for s in task_scores]} mean={mean:.4f} var={variance:.6f}")
420
+
421
+
422
+ # ---------------------------------------------------------------------------
423
+ # Main
424
+ # ---------------------------------------------------------------------------
425
+
426
if __name__ == "__main__":
    # Command-line driver: endpoint check -> baseline stability -> LLM agent
    # on all tasks -> optional per-seed variance check.
    import argparse

    parser = argparse.ArgumentParser(description="Judge Evaluation Simulator for GraphStrike")
    parser.add_argument("--url", required=True, help="Environment server URL")
    parser.add_argument("--bedrock", action="store_true", help="Use AWS Bedrock instead of HF router")
    parser.add_argument("--endpoints-only", action="store_true", help="Only test endpoints (no LLM)")
    parser.add_argument("--skip-variance", action="store_true", help="Skip variance check (faster)")
    parser.add_argument("--seeds", type=int, default=3, help="Number of seeds for variance check")
    args = parser.parse_args()

    # Fail fast: with fewer than one seed the variance phase has nothing to
    # average, and that would only surface after the expensive LLM phases.
    if not args.endpoints_only and not args.skip_variance and args.seeds < 1:
        parser.error("--seeds must be at least 1 (or pass --skip-variance)")

    if args.bedrock:
        LLM_BACKEND = "bedrock"

    base = args.url.rstrip("/")
    model_display = f"Bedrock/{BEDROCK_MODEL_ID}" if LLM_BACKEND == "bedrock" else MODEL_NAME

    # Report the credential for the backend actually selected, so this line
    # agrees with the backend-specific check performed before Phase 2.
    if LLM_BACKEND == "bedrock":
        credential_present = bool(os.getenv("AWS_ACCESS_KEY_ID"))
    else:
        credential_present = bool(HF_TOKEN)

    print("GraphStrike Judge Evaluation Simulator")
    print(f"Target: {base}")
    print(f"Backend: {LLM_BACKEND}")
    print(f"Model: {model_display}")
    print(f"Token: {'set' if credential_present else 'NOT SET'}")

    # Phase 0: Endpoints
    if not test_endpoints(base):
        print("\n✗ Endpoint check failed. Fix before proceeding.")
        sys.exit(1)

    # Phase 1: Baseline stability (result is printed but does not gate the run)
    test_baseline_stability(base)

    if args.endpoints_only:
        print("\n✓ Endpoint-only mode — skipping LLM tests.")
        sys.exit(0)

    if LLM_BACKEND == "bedrock":
        if not os.getenv("AWS_ACCESS_KEY_ID"):
            print("\n✗ AWS_ACCESS_KEY_ID not set. Cannot run Bedrock LLM tests.")
            sys.exit(1)
    elif not HF_TOKEN:
        print("\n✗ HF_TOKEN not set. Cannot run LLM agent tests.")
        print(" export HF_TOKEN='hf_...' OR use --bedrock with AWS creds")
        sys.exit(1)

    # Phase 2: LLM on all tasks
    scores = test_llm_all_tasks(base)

    # Phase 3: Variance
    if not args.skip_variance:
        test_variance(base, seeds=list(range(args.seeds)))

    print("\n" + "="*60)
    print("EVALUATION COMPLETE")
    print("="*60)
images/big.png ADDED

Git LFS Details

  • SHA256: bad8255420a67138377fb9c34e4fb73ee715c37fc85714de34950d29cb9f8f74
  • Pointer size: 131 Bytes
  • Size of remote file: 522 kB
images/logo.png ADDED
images/plot.png ADDED
images/table1.png ADDED
images/table2.png ADDED
images/table3.png ADDED
judge_log.txt ADDED
@@ -0,0 +1,513 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$ python3 test_judge_eval.py --url https://pandago-graphstrike.hf.space --bedrock
2
+ GraphStrike Judge Evaluation Simulator
3
+ Target: https://pandago-graphstrike.hf.space
4
+ Backend: bedrock
5
+ Model: Bedrock/qwen.qwen3-next-80b-a3b
6
+ Token: set
7
+
8
+ ============================================================
9
+ PHASE 0: Endpoint Verification
10
+ ============================================================
11
+ ✓ GET /health
12
+ ✓ GET /tasks
13
+ ✓ GET /metadata
14
+ ✓ GET /schema
15
+ ✓ GET /web
16
+ ✓ POST /reset
17
+ ✓ GET /state
18
+ ✓ POST /step
19
+ ✓ POST /step
20
+ ✓ GET /grader
21
+ ✓ POST /mcp
22
+ ✓ POST /baseline
23
+
24
+ ============================================================
25
+ PHASE 1: Baseline Stability (3 runs)
26
+ ============================================================
27
+ Run 1: easy=0.9100 medium=0.9060 hard=0.9038
28
+ Run 2: easy=0.9100 medium=0.9060 hard=0.9038
29
+ Run 3: easy=0.9100 medium=0.9060 hard=0.9038
30
+ ✓ All 3 runs identical — baseline is deterministic
31
+
32
+ ============================================================
33
+ PHASE 2: LLM Agent Evaluation (model=Bedrock/qwen.qwen3-next-80b-a3b)
34
+ ============================================================
35
+
36
+ --- LLM Agent: task=easy, seed=0, model=Bedrock/qwen.qwen3-next-80b-a3b ---
37
+ Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
38
+ Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
39
+ Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
40
+ Step 4: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=28
41
+ Step 5: INSPECT acc_0001 flagged=2/10 suspects=8 steps_left=27
42
+ Step 6: FLAG acc_0001 flagged=3/10 suspects=7 steps_left=27
43
+ Step 7: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=26
44
+ Step 8: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=26
45
+ Step 9: INSPECT acc_0012 flagged=4/10 suspects=6 steps_left=25
46
+ Step 10: FLAG acc_0012 flagged=5/10 suspects=5 steps_left=25
47
+ Step 11: INSPECT acc_0000 flagged=5/10 suspects=5 steps_left=24
48
+ Step 12: FLAG acc_0000 flagged=6/10 suspects=4 steps_left=24
49
+ Step 13: INSPECT acc_0027 flagged=6/10 suspects=4 steps_left=23
50
+ Step 14: FLAG acc_0027 flagged=7/10 suspects=3 steps_left=23
51
+ Step 15: INSPECT acc_0047 flagged=7/10 suspects=3 steps_left=22
52
+ Step 16: FLAG acc_0047 flagged=8/10 suspects=2 steps_left=22
53
+ Step 17: INSPECT acc_0007 flagged=8/10 suspects=2 steps_left=21
54
+ Step 18: FLAG acc_0007 flagged=9/10 suspects=1 steps_left=21
55
+ Step 19: INSPECT acc_0028 flagged=9/10 suspects=1 steps_left=20
56
+ Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
57
+ Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=20
58
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.90
59
+ ★ GRADER SCORE: 0.9667
60
+
61
+ --- LLM Agent: task=medium, seed=0, model=Bedrock/qwen.qwen3-next-80b-a3b ---
62
+ Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
63
+ Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
64
+ Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
65
+ Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=46
66
+ Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=45
67
+ Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
68
+ Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
69
+ Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
70
+ Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
71
+ Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
72
+ Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=40
73
+ Step 12: FLAG acc_0181 flagged=2/10 suspects=8 steps_left=40
74
+ Step 13: INSPECT acc_0022 flagged=2/10 suspects=8 steps_left=39
75
+ Step 14: FLAG acc_0022 flagged=3/10 suspects=7 steps_left=39
76
+ Step 15: INSPECT acc_0092 flagged=3/10 suspects=7 steps_left=38
77
+ Step 16: INSPECT acc_0097 flagged=3/10 suspects=7 steps_left=37
78
+ Step 17: FLAG acc_0092 flagged=4/10 suspects=6 steps_left=37
79
+ Step 18: FLAG acc_0097 flagged=5/10 suspects=5 steps_left=37
80
+ Step 19: INSPECT acc_0187 flagged=5/10 suspects=5 steps_left=36
81
+ Step 20: FLAG acc_0187 flagged=6/10 suspects=4 steps_left=36
82
+ Step 21: INSPECT acc_0093 flagged=6/10 suspects=4 steps_left=35
83
+ Step 22: FLAG acc_0093 flagged=7/10 suspects=3 steps_left=35
84
+ Step 23: INSPECT acc_0172 flagged=7/10 suspects=3 steps_left=34
85
+ Step 24: FLAG acc_0172 flagged=8/10 suspects=2 steps_left=34
86
+ Step 25: INSPECT acc_0058 flagged=8/10 suspects=2 steps_left=33
87
+ Step 26: FLAG acc_0058 flagged=9/10 suspects=1 steps_left=33
88
+ Step 27: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=32
89
+ Step 28: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=32
90
+ Step 29: SUBMIT flagged=10/10 suspects=0 steps_left=32
91
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.82
92
+ ★ GRADER SCORE: 0.9640
93
+
94
+ --- LLM Agent: task=hard, seed=0, model=Bedrock/qwen.qwen3-next-80b-a3b ---
95
+ Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
96
+ Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
97
+ Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
98
+ Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
99
+ Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
100
+ Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
101
+ Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
102
+ Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
103
+ Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
104
+ Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
105
+ Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
106
+ Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
107
+ Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
108
+ Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
109
+ Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
110
+ Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
111
+ Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
112
+ Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
113
+ Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
114
+ Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
115
+ Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
116
+ Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
117
+ Step 23: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=59
118
+ Step 24: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=58
119
+ Step 25: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=58
120
+ Step 26: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=57
121
+ Step 27: FLAG acc_0389 flagged=4/10 suspects=6 steps_left=57
122
+ Step 28: INSPECT acc_0160 flagged=4/10 suspects=6 steps_left=56
123
+ Step 29: FLAG acc_0160 flagged=5/10 suspects=5 steps_left=56
124
+ Step 30: INSPECT acc_0549 flagged=5/10 suspects=5 steps_left=55
125
+ Step 31: FLAG acc_0549 flagged=6/10 suspects=4 steps_left=55
126
+ Step 32: INSPECT acc_0658 flagged=6/10 suspects=4 steps_left=54
127
+ Step 33: FLAG acc_0658 flagged=7/10 suspects=3 steps_left=54
128
+ Step 34: INSPECT acc_0290 flagged=7/10 suspects=3 steps_left=53
129
+ Step 35: FLAG acc_0290 flagged=8/10 suspects=2 steps_left=53
130
+ Step 36: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=52
131
+ Step 37: INSPECT acc_0507 flagged=8/10 suspects=2 steps_left=51
132
+ Step 38: FLAG acc_0124 flagged=9/10 suspects=1 steps_left=51
133
+ Step 39: FLAG acc_0507 flagged=10/10 suspects=0 steps_left=51
134
+ Step 40: SUBMIT flagged=10/10 suspects=0 steps_left=51
135
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.71
136
+ ★ GRADER SCORE: 0.9637
137
+
138
+ Summary: easy=0.9667 medium=0.9640 hard=0.9637
139
+
140
+ ============================================================
141
+ PHASE 3: Score Variance (seeds=[0, 1, 2])
142
+ ============================================================
143
+
144
+ --- LLM Agent: task=easy, seed=0, model=Bedrock/qwen.qwen3-next-80b-a3b ---
145
+ Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
146
+ Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
147
+ Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
148
+ Step 4: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=28
149
+ Step 5: INSPECT acc_0001 flagged=2/10 suspects=8 steps_left=27
150
+ Step 6: FLAG acc_0001 flagged=3/10 suspects=7 steps_left=27
151
+ Step 7: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=26
152
+ Step 8: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=26
153
+ Step 9: INSPECT acc_0012 flagged=4/10 suspects=6 steps_left=25
154
+ Step 10: FLAG acc_0012 flagged=5/10 suspects=5 steps_left=25
155
+ Step 11: INSPECT acc_0000 flagged=5/10 suspects=5 steps_left=24
156
+ Step 12: FLAG acc_0000 flagged=6/10 suspects=4 steps_left=24
157
+ Step 13: INSPECT acc_0027 flagged=6/10 suspects=4 steps_left=23
158
+ Step 14: FLAG acc_0027 flagged=7/10 suspects=3 steps_left=23
159
+ Step 15: INSPECT acc_0047 flagged=7/10 suspects=3 steps_left=22
160
+ Step 16: FLAG acc_0047 flagged=8/10 suspects=2 steps_left=22
161
+ Step 17: INSPECT acc_0007 flagged=8/10 suspects=2 steps_left=21
162
+ Step 18: FLAG acc_0007 flagged=9/10 suspects=1 steps_left=21
163
+ Step 19: INSPECT acc_0028 flagged=9/10 suspects=1 steps_left=20
164
+ Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
165
+ Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=20
166
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.90
167
+ ★ GRADER SCORE: 0.9667
168
+
169
+ --- LLM Agent: task=easy, seed=1, model=Bedrock/qwen.qwen3-next-80b-a3b ---
170
+ Step 1: INSPECT acc_0034 flagged=0/10 suspects=0 steps_left=29
171
+ Step 2: INSPECT acc_0003 flagged=0/10 suspects=0 steps_left=28
172
+ Step 3: INSPECT acc_0049 flagged=0/10 suspects=0 steps_left=27
173
+ Step 4: INSPECT acc_0006 flagged=0/10 suspects=0 steps_left=26
174
+ Step 5: INSPECT acc_0047 flagged=0/10 suspects=0 steps_left=25
175
+ Step 6: FLAG acc_0047 flagged=1/10 suspects=9 steps_left=25
176
+ Step 7: INSPECT acc_0009 flagged=1/10 suspects=9 steps_left=24
177
+ Step 8: FLAG acc_0009 flagged=2/10 suspects=8 steps_left=24
178
+ Step 9: INSPECT acc_0046 flagged=2/10 suspects=8 steps_left=23
179
+ Step 10: FLAG acc_0046 flagged=3/10 suspects=7 steps_left=23
180
+ Step 11: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=22
181
+ Step 12: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=22
182
+ Step 13: INSPECT acc_0021 flagged=4/10 suspects=6 steps_left=21
183
+ Step 14: FLAG acc_0021 flagged=5/10 suspects=5 steps_left=21
184
+ Step 15: INSPECT acc_0002 flagged=5/10 suspects=5 steps_left=20
185
+ Step 16: FLAG acc_0002 flagged=6/10 suspects=4 steps_left=20
186
+ Step 17: INSPECT acc_0048 flagged=6/10 suspects=4 steps_left=19
187
+ Step 18: FLAG acc_0048 flagged=7/10 suspects=3 steps_left=19
188
+ Step 19: INSPECT acc_0029 flagged=7/10 suspects=3 steps_left=18
189
+ Step 20: FLAG acc_0029 flagged=8/10 suspects=2 steps_left=18
190
+ Step 21: INSPECT acc_0015 flagged=8/10 suspects=2 steps_left=17
191
+ Step 22: FLAG acc_0015 flagged=9/10 suspects=1 steps_left=17
192
+ Step 23: INSPECT acc_0005 flagged=9/10 suspects=1 steps_left=16
193
+ Step 24: FLAG acc_0005 flagged=10/10 suspects=0 steps_left=16
194
+ Step 25: SUBMIT flagged=10/10 suspects=0 steps_left=16
195
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.86
196
+ ★ GRADER SCORE: 0.9533
197
+
198
+ --- LLM Agent: task=easy, seed=2, model=Bedrock/qwen.qwen3-next-80b-a3b ---
199
+ Step 1: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=29
200
+ Step 2: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=28
201
+ Step 3: INSPECT acc_0025 flagged=0/10 suspects=0 steps_left=27
202
+ Step 4: INSPECT acc_0026 flagged=0/10 suspects=0 steps_left=26
203
+ Step 5: INSPECT acc_0038 flagged=0/10 suspects=0 steps_left=25
204
+ Step 6: INSPECT acc_0029 flagged=0/10 suspects=0 steps_left=24
205
+ Step 7: FLAG acc_0029 flagged=1/10 suspects=9 steps_left=24
206
+ Step 8: INSPECT acc_0006 flagged=1/10 suspects=9 steps_left=23
207
+ Step 9: FLAG acc_0006 flagged=2/10 suspects=8 steps_left=23
208
+ Step 10: INSPECT acc_0033 flagged=2/10 suspects=8 steps_left=22
209
+ Step 11: FLAG acc_0033 flagged=3/10 suspects=7 steps_left=22
210
+ Step 12: INSPECT acc_0015 flagged=3/10 suspects=7 steps_left=21
211
+ Step 13: FLAG acc_0015 flagged=4/10 suspects=6 steps_left=21
212
+ Step 14: INSPECT acc_0022 flagged=4/10 suspects=6 steps_left=20
213
+ Step 15: FLAG acc_0022 flagged=5/10 suspects=5 steps_left=20
214
+ Step 16: INSPECT acc_0009 flagged=5/10 suspects=5 steps_left=19
215
+ Step 17: FLAG acc_0009 flagged=6/10 suspects=4 steps_left=19
216
+ Step 18: INSPECT acc_0004 flagged=6/10 suspects=4 steps_left=18
217
+ Step 19: FLAG acc_0004 flagged=7/10 suspects=3 steps_left=18
218
+ Step 20: INSPECT acc_0024 flagged=7/10 suspects=3 steps_left=17
219
+ Step 21: FLAG acc_0024 flagged=8/10 suspects=2 steps_left=17
220
+ Step 22: INSPECT acc_0049 flagged=8/10 suspects=2 steps_left=16
221
+ Step 23: FLAG acc_0049 flagged=9/10 suspects=1 steps_left=16
222
+ Step 24: INSPECT acc_0035 flagged=9/10 suspects=1 steps_left=15
223
+ Step 25: FLAG acc_0035 flagged=10/10 suspects=0 steps_left=15
224
+ Step 26: SUBMIT flagged=10/10 suspects=0 steps_left=15
225
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.85
226
+ ★ GRADER SCORE: 0.9500
227
+
228
+ easy: scores=['0.967', '0.953', '0.950'] mean=0.9567 var=0.000052
229
+
230
+ --- LLM Agent: task=medium, seed=0, model=Bedrock/qwen.qwen3-next-80b-a3b ---
231
+ Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
232
+ Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
233
+ Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
234
+ Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=46
235
+ Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=45
236
+ Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
237
+ Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
238
+ Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
239
+ Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
240
+ Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
241
+ Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=40
242
+ Step 12: FLAG acc_0181 flagged=2/10 suspects=8 steps_left=40
243
+ Step 13: INSPECT acc_0022 flagged=2/10 suspects=8 steps_left=39
244
+ Step 14: FLAG acc_0022 flagged=3/10 suspects=7 steps_left=39
245
+ Step 15: INSPECT acc_0092 flagged=3/10 suspects=7 steps_left=38
246
+ Step 16: FLAG acc_0092 flagged=4/10 suspects=6 steps_left=38
247
+ Step 17: INSPECT acc_0097 flagged=4/10 suspects=6 steps_left=37
248
+ Step 18: FLAG acc_0097 flagged=5/10 suspects=5 steps_left=37
249
+ Step 19: INSPECT acc_0187 flagged=5/10 suspects=5 steps_left=36
250
+ Step 20: FLAG acc_0187 flagged=6/10 suspects=4 steps_left=36
251
+ Step 21: INSPECT acc_0093 flagged=6/10 suspects=4 steps_left=35
252
+ Step 22: FLAG acc_0093 flagged=7/10 suspects=3 steps_left=35
253
+ Step 23: INSPECT acc_0172 flagged=7/10 suspects=3 steps_left=34
254
+ Step 24: FLAG acc_0172 flagged=8/10 suspects=2 steps_left=34
255
+ Step 25: INSPECT acc_0058 flagged=8/10 suspects=2 steps_left=33
256
+ Step 26: FLAG acc_0058 flagged=9/10 suspects=1 steps_left=33
257
+ Step 27: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=32
258
+ Step 28: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=32
259
+ Step 29: SUBMIT flagged=10/10 suspects=0 steps_left=32
260
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.82
261
+ ★ GRADER SCORE: 0.9640
262
+
263
+ --- LLM Agent: task=medium, seed=1, model=Bedrock/qwen.qwen3-next-80b-a3b ---
264
+ Step 1: INSPECT acc_0171 flagged=0/10 suspects=0 steps_left=49
265
+ Step 2: INSPECT acc_0099 flagged=0/10 suspects=0 steps_left=48
266
+ Step 3: INSPECT acc_0152 flagged=0/10 suspects=0 steps_left=47
267
+ Step 4: INSPECT acc_0092 flagged=0/10 suspects=0 steps_left=46
268
+ Step 5: INSPECT acc_0078 flagged=0/10 suspects=0 steps_left=45
269
+ Step 6: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=44
270
+ Step 7: INSPECT acc_0012 flagged=0/10 suspects=0 steps_left=43
271
+ Step 8: FLAG acc_0012 flagged=1/10 suspects=8 steps_left=43
272
+ Step 9: INSPECT acc_0033 flagged=1/10 suspects=8 steps_left=42
273
+ Step 10: FLAG acc_0033 flagged=2/10 suspects=8 steps_left=42
274
+ Step 11: INSPECT acc_0174 flagged=2/10 suspects=8 steps_left=41
275
+ Step 12: FLAG acc_0174 flagged=3/10 suspects=7 steps_left=41
276
+ Step 13: INSPECT acc_0187 flagged=3/10 suspects=7 steps_left=40
277
+ Step 14: FLAG acc_0187 flagged=4/10 suspects=6 steps_left=40
278
+ Step 15: INSPECT acc_0079 flagged=4/10 suspects=6 steps_left=39
279
+ Step 16: FLAG acc_0079 flagged=5/10 suspects=5 steps_left=39
280
+ Step 17: INSPECT acc_0032 flagged=5/10 suspects=5 steps_left=38
281
+ Step 18: FLAG acc_0032 flagged=6/10 suspects=4 steps_left=38
282
+ Step 19: INSPECT acc_0023 flagged=6/10 suspects=4 steps_left=37
283
+ Step 20: FLAG acc_0023 flagged=7/10 suspects=3 steps_left=37
284
+ Step 21: INSPECT acc_0146 flagged=7/10 suspects=3 steps_left=36
285
+ Step 22: FLAG acc_0146 flagged=8/10 suspects=2 steps_left=36
286
+ Step 23: INSPECT acc_0019 flagged=8/10 suspects=2 steps_left=35
287
+ Step 24: FLAG acc_0019 flagged=9/10 suspects=1 steps_left=35
288
+ Step 25: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=34
289
+ Step 26: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=34
290
+ Step 27: SUBMIT flagged=10/10 suspects=0 steps_left=34
291
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.84
292
+ ★ GRADER SCORE: 0.9680
293
+
294
+ --- LLM Agent: task=medium, seed=2, model=Bedrock/qwen.qwen3-next-80b-a3b ---
295
+ Step 1: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=49
296
+ Step 2: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=48
297
+ Step 3: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=47
298
+ Step 4: INSPECT acc_0030 flagged=0/10 suspects=0 steps_left=46
299
+ Step 5: INSPECT acc_0041 flagged=0/10 suspects=0 steps_left=45
300
+ Step 6: INSPECT acc_0054 flagged=0/10 suspects=0 steps_left=44
301
+ Step 7: INSPECT acc_0199 flagged=0/10 suspects=0 steps_left=43
302
+ Step 8: INSPECT acc_0181 flagged=0/10 suspects=0 steps_left=42
303
+ Step 9: INSPECT acc_0166 flagged=0/10 suspects=0 steps_left=41
304
+ Step 10: INSPECT acc_0098 flagged=0/10 suspects=0 steps_left=40
305
+ Step 11: INSPECT acc_0121 flagged=0/10 suspects=0 steps_left=39
306
+ Step 12: INSPECT acc_0053 flagged=0/10 suspects=0 steps_left=38
307
+ Step 13: INSPECT acc_0103 flagged=0/10 suspects=0 steps_left=37
308
+ Step 14: INSPECT acc_0000 flagged=0/10 suspects=0 steps_left=36
309
+ Step 15: INSPECT acc_0168 flagged=0/10 suspects=0 steps_left=35
310
+ Step 16: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=34
311
+ Step 17: INSPECT acc_0149 flagged=0/10 suspects=0 steps_left=33
312
+ Step 18: INSPECT acc_0064 flagged=0/10 suspects=0 steps_left=32
313
+ Step 19: INSPECT acc_0016 flagged=0/10 suspects=0 steps_left=31
314
+ Step 20: INSPECT acc_0105 flagged=0/10 suspects=0 steps_left=30
315
+ Step 21: INSPECT acc_0035 flagged=0/10 suspects=0 steps_left=29
316
+ Step 22: FLAG acc_0035 flagged=1/10 suspects=9 steps_left=29
317
+ Step 23: INSPECT acc_0020 flagged=1/10 suspects=9 steps_left=28
318
+ Step 24: FLAG acc_0020 flagged=2/10 suspects=8 steps_left=28
319
+ Step 25: INSPECT acc_0036 flagged=2/10 suspects=8 steps_left=27
320
+ Step 26: FLAG acc_0036 flagged=3/10 suspects=7 steps_left=27
321
+ Step 27: INSPECT acc_0050 flagged=3/10 suspects=7 steps_left=26
322
+ Step 28: FLAG acc_0050 flagged=4/10 suspects=6 steps_left=26
323
+ Step 29: INSPECT acc_0051 flagged=4/10 suspects=6 steps_left=25
324
+ Step 30: FLAG acc_0051 flagged=5/10 suspects=5 steps_left=25
325
+ Step 31: INSPECT acc_0085 flagged=5/10 suspects=5 steps_left=24
326
+ Step 32: FLAG acc_0085 flagged=6/10 suspects=4 steps_left=24
327
+ Step 33: INSPECT acc_0177 flagged=6/10 suspects=4 steps_left=23
328
+ Step 34: FLAG acc_0177 flagged=7/10 suspects=3 steps_left=23
329
+ Step 35: INSPECT acc_0170 flagged=7/10 suspects=3 steps_left=22
330
+ Step 36: FLAG acc_0170 flagged=8/10 suspects=2 steps_left=22
331
+ Step 37: INSPECT acc_0055 flagged=8/10 suspects=2 steps_left=21
332
+ Step 38: FLAG acc_0055 flagged=9/10 suspects=1 steps_left=21
333
+ Step 39: INSPECT acc_0094 flagged=9/10 suspects=1 steps_left=20
334
+ Step 40: FLAG acc_0094 flagged=10/10 suspects=0 steps_left=20
335
+ Step 41: SUBMIT flagged=10/10 suspects=0 steps_left=20
336
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.70
337
+ ★ GRADER SCORE: 0.9400
338
+
339
+ medium: scores=['0.964', '0.968', '0.940'] mean=0.9573 var=0.000153
340
+
341
+ --- LLM Agent: task=hard, seed=0, model=Bedrock/qwen.qwen3-next-80b-a3b ---
342
+ Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
343
+ Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
344
+ Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
345
+ Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
346
+ Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
347
+ Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
348
+ Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
349
+ Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
350
+ Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
351
+ Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
352
+ Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
353
+ Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
354
+ Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
355
+ Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
356
+ Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
357
+ Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
358
+ Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
359
+ Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
360
+ Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
361
+ Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
362
+ Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
363
+ Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
364
+ Step 23: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=59
365
+ Step 24: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=58
366
+ Step 25: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=58
367
+ Step 26: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=57
368
+ Step 27: INSPECT acc_0160 flagged=3/10 suspects=6 steps_left=56
369
+ Step 28: FLAG acc_0389 flagged=4/10 suspects=6 steps_left=56
370
+ Step 29: FLAG acc_0160 flagged=5/10 suspects=5 steps_left=56
371
+ Step 30: INSPECT acc_0549 flagged=5/10 suspects=5 steps_left=55
372
+ Step 31: FLAG acc_0549 flagged=6/10 suspects=4 steps_left=55
373
+ Step 32: INSPECT acc_0658 flagged=6/10 suspects=4 steps_left=54
374
+ Step 33: FLAG acc_0658 flagged=7/10 suspects=3 steps_left=54
375
+ Step 34: INSPECT acc_0290 flagged=7/10 suspects=3 steps_left=53
376
+ Step 35: FLAG acc_0290 flagged=8/10 suspects=2 steps_left=53
377
+ Step 36: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=52
378
+ Step 37: INSPECT acc_0507 flagged=8/10 suspects=2 steps_left=51
379
+ Step 38: FLAG acc_0124 flagged=9/10 suspects=1 steps_left=51
380
+ Step 39: FLAG acc_0507 flagged=10/10 suspects=0 steps_left=51
381
+ Step 40: SUBMIT flagged=10/10 suspects=0 steps_left=51
382
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.71
383
+ ★ GRADER SCORE: 0.9637
384
+
385
+ --- LLM Agent: task=hard, seed=1, model=Bedrock/qwen.qwen3-next-80b-a3b ---
386
+ Step 1: INSPECT acc_0014 flagged=0/10 suspects=0 steps_left=79
387
+ Step 2: INSPECT acc_0835 flagged=0/10 suspects=0 steps_left=78
388
+ Step 3: INSPECT acc_0855 flagged=0/10 suspects=0 steps_left=77
389
+ Step 4: INSPECT acc_0930 flagged=0/10 suspects=0 steps_left=76
390
+ Step 5: INSPECT acc_0336 flagged=0/10 suspects=0 steps_left=75
391
+ Step 6: INSPECT acc_0929 flagged=0/10 suspects=0 steps_left=74
392
+ Step 7: INSPECT acc_0076 flagged=0/10 suspects=0 steps_left=73
393
+ Step 8: INSPECT acc_0543 flagged=0/10 suspects=0 steps_left=72
394
+ Step 9: INSPECT acc_0590 flagged=0/10 suspects=0 steps_left=71
395
+ Step 10: INSPECT acc_0401 flagged=0/10 suspects=0 steps_left=70
396
+ Step 11: INSPECT acc_0322 flagged=0/10 suspects=0 steps_left=69
397
+ Step 12: INSPECT acc_0154 flagged=0/10 suspects=0 steps_left=68
398
+ Step 13: INSPECT acc_0374 flagged=0/10 suspects=0 steps_left=67
399
+ Step 14: INSPECT acc_0549 flagged=0/10 suspects=0 steps_left=66
400
+ Step 15: INSPECT acc_0903 flagged=0/10 suspects=0 steps_left=65
401
+ Step 16: INSPECT acc_0976 flagged=0/10 suspects=0 steps_left=64
402
+ Step 17: INSPECT acc_0620 flagged=0/10 suspects=0 steps_left=63
403
+ Step 18: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=62
404
+ Step 19: INSPECT acc_0222 flagged=0/10 suspects=0 steps_left=61
405
+ Step 20: INSPECT acc_0536 flagged=0/10 suspects=0 steps_left=60
406
+ Step 21: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=59
407
+ Step 22: INSPECT acc_0577 flagged=0/10 suspects=0 steps_left=58
408
+ Step 23: INSPECT acc_0517 flagged=0/10 suspects=0 steps_left=57
409
+ Step 24: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=56
410
+ Step 25: INSPECT acc_0167 flagged=0/10 suspects=0 steps_left=55
411
+ Step 26: INSPECT acc_0697 flagged=0/10 suspects=0 steps_left=54
412
+ Step 27: INSPECT acc_0271 flagged=0/10 suspects=0 steps_left=53
413
+ Step 28: INSPECT acc_0681 flagged=0/10 suspects=0 steps_left=52
414
+ Step 29: INSPECT acc_0530 flagged=0/10 suspects=0 steps_left=51
415
+ Step 30: INSPECT acc_0353 flagged=0/10 suspects=0 steps_left=50
416
+ Step 31: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=49
417
+ Step 32: INSPECT acc_0777 flagged=0/10 suspects=0 steps_left=48
418
+ Step 33: INSPECT acc_0265 flagged=0/10 suspects=0 steps_left=47
419
+ Step 34: INSPECT acc_0788 flagged=0/10 suspects=0 steps_left=46
420
+ Step 35: INSPECT acc_0033 flagged=0/10 suspects=0 steps_left=45
421
+ Step 36: INSPECT acc_0187 flagged=0/10 suspects=0 steps_left=44
422
+ Step 37: INSPECT acc_0445 flagged=0/10 suspects=0 steps_left=43
423
+ Step 38: INSPECT acc_0846 flagged=0/10 suspects=0 steps_left=42
424
+ Step 39: INSPECT acc_0659 flagged=0/10 suspects=0 steps_left=41
425
+ Step 40: INSPECT acc_0768 flagged=0/10 suspects=0 steps_left=40
426
+ Step 41: INSPECT acc_0677 flagged=0/10 suspects=0 steps_left=39
427
+ Step 42: INSPECT acc_0539 flagged=0/10 suspects=0 steps_left=38
428
+ Step 43: INSPECT acc_0742 flagged=0/10 suspects=0 steps_left=37
429
+ Step 44: INSPECT acc_0503 flagged=0/10 suspects=0 steps_left=36
430
+ Step 45: INSPECT acc_0876 flagged=0/10 suspects=0 steps_left=35
431
+ Step 46: INSPECT acc_0639 flagged=0/10 suspects=0 steps_left=34
432
+ Step 47: INSPECT acc_0494 flagged=0/10 suspects=0 steps_left=33
433
+ Step 48: INSPECT acc_0898 flagged=0/10 suspects=0 steps_left=32
434
+ Step 49: INSPECT acc_0553 flagged=0/10 suspects=0 steps_left=31
435
+ Step 50: INSPECT acc_0588 flagged=0/10 suspects=0 steps_left=30
436
+ Step 51: INSPECT acc_0194 flagged=0/10 suspects=0 steps_left=29
437
+ Step 52: INSPECT acc_0810 flagged=0/10 suspects=0 steps_left=28
438
+ Step 53: INSPECT acc_0355 flagged=0/10 suspects=0 steps_left=27
439
+ Step 54: INSPECT acc_0363 flagged=0/10 suspects=0 steps_left=26
440
+ Step 55: INSPECT acc_0221 flagged=0/10 suspects=0 steps_left=25
441
+ Step 56: INSPECT acc_0580 flagged=0/10 suspects=0 steps_left=24
442
+ Step 57: INSPECT acc_0534 flagged=0/10 suspects=0 steps_left=23
443
+ Step 58: INSPECT acc_0778 flagged=0/10 suspects=0 steps_left=22
444
+ Step 59: INSPECT acc_0998 flagged=0/10 suspects=0 steps_left=21
445
+ Step 60: INSPECT acc_0233 flagged=0/10 suspects=0 steps_left=20
446
+ Step 61: INSPECT acc_0052 flagged=0/10 suspects=0 steps_left=19
447
+ Step 62: INSPECT acc_0813 flagged=0/10 suspects=0 steps_left=18
448
+ Step 63: INSPECT acc_0035 flagged=0/10 suspects=0 steps_left=17
449
+ Step 64: INSPECT acc_0667 flagged=0/10 suspects=0 steps_left=16
450
+ Step 65: INSPECT acc_0019 flagged=0/10 suspects=0 steps_left=15
451
+ Step 66: INSPECT acc_0959 flagged=0/10 suspects=0 steps_left=14
452
+ Step 67: INSPECT acc_0212 flagged=0/10 suspects=0 steps_left=13
453
+ Step 68: INSPECT acc_0776 flagged=0/10 suspects=0 steps_left=12
454
+ Step 69: INSPECT acc_0049 flagged=0/10 suspects=0 steps_left=11
455
+ Step 70: INSPECT acc_0434 flagged=0/10 suspects=0 steps_left=10
456
+ Step 71: INSPECT acc_0827 flagged=0/10 suspects=0 steps_left=9
457
+ Step 72: INSPECT acc_0583 flagged=0/10 suspects=0 steps_left=8
458
+ Step 73: INSPECT acc_0065 flagged=0/10 suspects=0 steps_left=7
459
+ Step 74: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=6
460
+ Step 75: INSPECT acc_0761 flagged=0/10 suspects=0 steps_left=5
461
+ Step 76: INSPECT acc_0995 flagged=0/10 suspects=0 steps_left=4
462
+ Step 77: INSPECT acc_0157 flagged=0/10 suspects=0 steps_left=3
463
+ Step 78: INSPECT acc_0936 flagged=0/10 suspects=0 steps_left=2
464
+ Step 79: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=1
465
+ Step 80: INSPECT acc_0691 flagged=0/10 suspects=0 steps_left=0
466
+ → Episode ended: [LOSS] TP=0 FP=0 FN=10 Recall=0.00 Precision=0.00 Episode reward=-9.80
467
+ ★ GRADER SCORE: 0.0000
468
+
469
+ --- LLM Agent: task=hard, seed=2, model=Bedrock/qwen.qwen3-next-80b-a3b ---
470
+ Step 1: INSPECT acc_0813 flagged=0/10 suspects=0 steps_left=79
471
+ Step 2: INSPECT acc_0430 flagged=0/10 suspects=0 steps_left=78
472
+ Step 3: INSPECT acc_0817 flagged=0/10 suspects=0 steps_left=77
473
+ Step 4: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=76
474
+ Step 5: INSPECT acc_0523 flagged=0/10 suspects=0 steps_left=75
475
+ Step 6: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=74
476
+ Step 7: INSPECT acc_0797 flagged=0/10 suspects=0 steps_left=73
477
+ Step 8: INSPECT acc_0478 flagged=0/10 suspects=0 steps_left=72
478
+ Step 9: INSPECT acc_0861 flagged=0/10 suspects=0 steps_left=71
479
+ Step 10: INSPECT acc_0836 flagged=0/10 suspects=0 steps_left=70
480
+ Step 11: INSPECT acc_0926 flagged=0/10 suspects=0 steps_left=69
481
+ Step 12: INSPECT acc_0664 flagged=0/10 suspects=0 steps_left=68
482
+ Step 13: INSPECT acc_0255 flagged=0/10 suspects=0 steps_left=67
483
+ Step 14: INSPECT acc_0938 flagged=0/10 suspects=0 steps_left=66
484
+ Step 15: INSPECT acc_0672 flagged=0/10 suspects=0 steps_left=65
485
+ Step 16: FLAG acc_0672 flagged=1/10 suspects=6 steps_left=65
486
+ Step 17: INSPECT acc_0659 flagged=1/10 suspects=6 steps_left=64
487
+ Step 18: FLAG acc_0659 flagged=2/10 suspects=5 steps_left=64
488
+ Step 19: INSPECT acc_0290 flagged=2/10 suspects=5 steps_left=63
489
+ Step 20: FLAG acc_0290 flagged=3/10 suspects=5 steps_left=63
490
+ Step 21: INSPECT acc_0339 flagged=3/10 suspects=5 steps_left=62
491
+ Step 22: FLAG acc_0339 flagged=4/10 suspects=6 steps_left=62
492
+ Step 23: INSPECT acc_0544 flagged=4/10 suspects=6 steps_left=61
493
+ Step 24: FLAG acc_0544 flagged=5/10 suspects=5 steps_left=61
494
+ Step 25: INSPECT acc_0696 flagged=5/10 suspects=5 steps_left=60
495
+ Step 26: FLAG acc_0696 flagged=6/10 suspects=4 steps_left=60
496
+ Step 27: INSPECT acc_0541 flagged=6/10 suspects=4 steps_left=59
497
+ Step 28: FLAG acc_0541 flagged=7/10 suspects=3 steps_left=59
498
+ Step 29: INSPECT acc_0793 flagged=7/10 suspects=3 steps_left=58
499
+ Step 30: FLAG acc_0793 flagged=8/10 suspects=2 steps_left=58
500
+ Step 31: INSPECT acc_0214 flagged=8/10 suspects=2 steps_left=57
501
+ Step 32: FLAG acc_0214 flagged=9/10 suspects=1 steps_left=57
502
+ Step 33: INSPECT acc_0112 flagged=9/10 suspects=1 steps_left=56
503
+ Step 34: FLAG acc_0112 flagged=10/10 suspects=0 steps_left=56
504
+ Step 35: SUBMIT flagged=10/10 suspects=0 steps_left=56
505
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.76
506
+ ★ GRADER SCORE: 0.9700
507
+
508
+ hard: scores=['0.964', '0.000', '0.970'] mean=0.6446 var=0.207740
509
+
510
+ ============================================================
511
+ EVALUATION COMPLETE
512
+ ============================================================
513
+ ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$
memory/reflections_easy.jsonl CHANGED
@@ -1 +1 @@
1
- {"episode": 1, "reward": 18.88, "reflection": "Started by inspecting multiple low-profile accounts to identify early signals like comment_repeat_score and photo_reuse_score, which revealed coordinated spam patterns. Once a few clear gang members were flagged, their auto-become SUSPECT neighbors were rapidly confirmed and flagged, leveraging network propagation to efficiently uncover all 10 fake accounts without false positives. Always begin with targeted INSPECTs before FLAGging to validate signals and avoid premature assumptions."}
 
1
+ {"episode": 1, "reward": 18.88, "reflection": "Starting with direct INSPECTs on low-numbered accounts revealed multiple gang members with high fake_risk_score, photo_reuse_score, and comment_repeat_score, confirming a coordinated pattern. Once the first few were FLAGGED, their auto-revealed neighbors automatically became high-priority SUSPECTs, allowing rapid expansion without extra steps\u2014this network-triggered propagation was key to 100% recall with zero false positives. Always begin with targeted INSPECTs to identify clear gang markers, then leverage FLAGging to unlock the full network efficiently."}
memory/reflections_hard.jsonl CHANGED
@@ -1,3 +1,3 @@
1
- {"episode": 1, "reward": 17.84, "reflection": "We successfully identified the gang by first inspecting 10 low-suspicion accounts to establish baseline behavior, then flagged those showing high comment_repeat_score, photo_reuse_score, and bio_template_score \u2014 all clustering around shared_ip_count=9. The key was not jumping to flag early, but using initial inspections to confirm coordinated patterns before triggering network suspects, which automatically revealed the full gang without additional steps. Always verify multiple signals together before flagging \u2014 single indicators can be false positives."}
2
- {"episode": 2, "reward": 18.76, "reflection": "Starting with targeted INSPECTs on low-suspicion accounts revealed hidden patterns: multiple accounts shared identical bio templates and reused profile photos, exposing a coordinated gang. Once a single account with high fake_risk_score and shared_ip_count was confirmed, INVESTIGATE_NETWORK was used to expand to all 2-hop neighbors, uncovering the full network without false positives. Repeating this method\u2014inspecting first to validate signals before network expansion\u2014ensured precision and maximized recall."}
3
- {"episode": 3, "reward": 18.84, "reflection": "Started by inspecting multiple low-suspicion accounts to map the network, identifying key gang members through high comment_repeat_score, shared_ip_count, and bio_template_score. Once five clear gang members were confirmed, flagging them triggered automatic SUSPECT status on their neighbors, revealing the full cluster without unnecessary deep dives\u2014repeat this targeted inspection-to-flag pipeline to maximize coverage with minimal steps."}
 
1
+ {"episode": 1, "reward": 17.84, "reflection": "The strategy succeeded by first INSPECTing 10 low-suspicion accounts to map the network\u2019s baseline behavior, confirming that comment_repeat_score and photo_reuse_score consistently spiked in gang members. Once five accounts showed matching red flags, FLAGging them triggered auto-suspect propagation, revealing the full gang without unnecessary INVESTIGATE_NETWORK steps\u2014this minimized steps while maximizing coverage. Future cases should always begin with targeted INSPECTs to validate signals before FLAGging, avoiding premature network expansion."}
2
+ {"episode": 2, "reward": 18.76, "reflection": "By systematically INSPECTing high-risk accounts with elevated fake_risk_score and multiple red flags\u2014comment_repeat_score, shared_ip_count, and photo_reuse_score\u2014we uncovered a tightly coordinated gang operating under identical bio templates. Each confirmed fake account revealed neighbors with identical patterns, making network expansion unnecessary; we prioritized direct inspection over speculative INVESTIGATE_NETWORK, conserving steps while maximizing precision. Future cases should trust low-step inspection of high-signal accounts first, letting flagged neighbors naturally expose the rest."}
3
+ {"episode": 3, "reward": 18.84, "reflection": "We successfully identified the gang by first inspecting 10 low-suspicion accounts to establish baseline behavior, then flagged those showing high comment_repeat_score, shared_ip_count, and photo_reuse_score \u2014 all consistent with coordinated fake accounts. The key was waiting to flag until multiple signals aligned, which prevented false positives and triggered auto-suspect expansion, revealing the full network without unnecessary steps. Always inspect first to calibrate, then flag only when signals converge."}
memory/reflections_medium.jsonl CHANGED
@@ -1 +1 @@
1
- {"episode": 1, "reward": 18.86, "reflection": "The strategy succeeded by first INSPECTing multiple low-suspicion accounts to identify early signals like comment_repeat_score and photo_reuse_score, which revealed coordinated spam patterns. Once a few accounts with high fake_risk_score and shared_ip_count were flagged, the auto-triggered SUSPECT neighbors were immediately flagged without further inspection, efficiently expanding the takedown. Always start with targeted INSPECTs to map the network before FLAGging\u2014this minimizes steps and maximizes recall."}
 
1
+ {"episode": 1, "reward": 18.86, "reflection": "Starting with targeted INSPECTs on low-activity accounts revealed multiple gang members with high comment_repeat_score, photo_reuse_score, and bio_template_score \u2014 all clustered under shared_ip_count > 5. Flagging the first five confirmed gang nodes triggered auto-suspect propagation, letting us identify the full network without unnecessary network expansion. Always begin with direct inspection of suspicious profiles before network moves \u2014 this minimizes steps and maximizes precision."}
memory/wins_easy.jsonl CHANGED
@@ -8,3 +8,43 @@
8
  {"episode": 8, "won": true}
9
  {"episode": 9, "won": true}
10
  {"episode": 10, "won": true}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  {"episode": 8, "won": true}
9
  {"episode": 9, "won": true}
10
  {"episode": 10, "won": true}
11
+ {"episode": 11, "won": true}
12
+ {"episode": 12, "won": true}
13
+ {"episode": 13, "won": true}
14
+ {"episode": 14, "won": true}
15
+ {"episode": 15, "won": true}
16
+ {"episode": 16, "won": true}
17
+ {"episode": 17, "won": true}
18
+ {"episode": 18, "won": true}
19
+ {"episode": 19, "won": true}
20
+ {"episode": 20, "won": true}
21
+ {"episode": 21, "won": true}
22
+ {"episode": 22, "won": true}
23
+ {"episode": 23, "won": true}
24
+ {"episode": 24, "won": true}
25
+ {"episode": 25, "won": true}
26
+ {"episode": 26, "won": true}
27
+ {"episode": 27, "won": true}
28
+ {"episode": 28, "won": true}
29
+ {"episode": 29, "won": true}
30
+ {"episode": 30, "won": true}
31
+ {"episode": 31, "won": true}
32
+ {"episode": 32, "won": true}
33
+ {"episode": 33, "won": true}
34
+ {"episode": 34, "won": true}
35
+ {"episode": 35, "won": true}
36
+ {"episode": 36, "won": true}
37
+ {"episode": 37, "won": true}
38
+ {"episode": 38, "won": true}
39
+ {"episode": 39, "won": true}
40
+ {"episode": 40, "won": true}
41
+ {"episode": 41, "won": true}
42
+ {"episode": 42, "won": true}
43
+ {"episode": 43, "won": true}
44
+ {"episode": 44, "won": true}
45
+ {"episode": 45, "won": true}
46
+ {"episode": 46, "won": true}
47
+ {"episode": 47, "won": true}
48
+ {"episode": 48, "won": true}
49
+ {"episode": 49, "won": true}
50
+ {"episode": 50, "won": true}
memory/wins_hard.jsonl CHANGED
@@ -8,3 +8,28 @@
8
  {"episode": 8, "won": true}
9
  {"episode": 9, "won": true}
10
  {"episode": 10, "won": true}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  {"episode": 8, "won": true}
9
  {"episode": 9, "won": true}
10
  {"episode": 10, "won": true}
11
+ {"episode": 11, "won": true}
12
+ {"episode": 12, "won": true}
13
+ {"episode": 13, "won": true}
14
+ {"episode": 14, "won": true}
15
+ {"episode": 15, "won": true}
16
+ {"episode": 16, "won": true}
17
+ {"episode": 17, "won": true}
18
+ {"episode": 18, "won": true}
19
+ {"episode": 19, "won": true}
20
+ {"episode": 20, "won": true}
21
+ {"episode": 21, "won": true}
22
+ {"episode": 22, "won": true}
23
+ {"episode": 23, "won": true}
24
+ {"episode": 24, "won": true}
25
+ {"episode": 25, "won": true}
26
+ {"episode": 26, "won": true}
27
+ {"episode": 27, "won": true}
28
+ {"episode": 28, "won": true}
29
+ {"episode": 29, "won": true}
30
+ {"episode": 30, "won": true}
31
+ {"episode": 31, "won": true}
32
+ {"episode": 32, "won": true}
33
+ {"episode": 33, "won": true}
34
+ {"episode": 34, "won": true}
35
+ {"episode": 35, "won": true}
memory/wins_medium.jsonl CHANGED
@@ -1,20 +1,44 @@
1
  {"episode": 1, "won": true}
2
- {"episode": 1, "won": true}
3
- {"episode": 2, "won": true}
4
  {"episode": 2, "won": true}
5
  {"episode": 3, "won": true}
6
- {"episode": 3, "won": true}
7
  {"episode": 4, "won": true}
8
- {"episode": 4, "won": true}
9
- {"episode": 5, "won": true}
10
  {"episode": 5, "won": true}
11
  {"episode": 6, "won": true}
12
- {"episode": 6, "won": true}
13
- {"episode": 7, "won": true}
14
  {"episode": 7, "won": true}
15
  {"episode": 8, "won": true}
16
- {"episode": 8, "won": true}
17
  {"episode": 9, "won": true}
18
- {"episode": 9, "won": true}
19
- {"episode": 10, "won": true}
20
  {"episode": 10, "won": true}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {"episode": 1, "won": true}
 
 
2
  {"episode": 2, "won": true}
3
  {"episode": 3, "won": true}
 
4
  {"episode": 4, "won": true}
 
 
5
  {"episode": 5, "won": true}
6
  {"episode": 6, "won": true}
 
 
7
  {"episode": 7, "won": true}
8
  {"episode": 8, "won": true}
 
9
  {"episode": 9, "won": true}
 
 
10
  {"episode": 10, "won": true}
11
+ {"episode": 11, "won": true}
12
+ {"episode": 12, "won": true}
13
+ {"episode": 13, "won": true}
14
+ {"episode": 14, "won": true}
15
+ {"episode": 15, "won": true}
16
+ {"episode": 16, "won": true}
17
+ {"episode": 17, "won": true}
18
+ {"episode": 18, "won": true}
19
+ {"episode": 19, "won": true}
20
+ {"episode": 20, "won": true}
21
+ {"episode": 21, "won": true}
22
+ {"episode": 22, "won": true}
23
+ {"episode": 23, "won": true}
24
+ {"episode": 24, "won": true}
25
+ {"episode": 25, "won": true}
26
+ {"episode": 26, "won": true}
27
+ {"episode": 27, "won": true}
28
+ {"episode": 28, "won": true}
29
+ {"episode": 29, "won": true}
30
+ {"episode": 30, "won": true}
31
+ {"episode": 31, "won": true}
32
+ {"episode": 32, "won": true}
33
+ {"episode": 33, "won": true}
34
+ {"episode": 34, "won": true}
35
+ {"episode": 35, "won": true}
36
+ {"episode": 36, "won": true}
37
+ {"episode": 37, "won": true}
38
+ {"episode": 38, "won": true}
39
+ {"episode": 39, "won": true}
40
+ {"episode": 40, "won": true}
41
+ {"episode": 41, "won": true}
42
+ {"episode": 42, "won": true}
43
+ {"episode": 43, "won": true}
44
+ {"episode": 44, "won": true}
model-benchmark-logs/deepseek_judge_log.txt ADDED
@@ -0,0 +1,749 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$ python3 deepseek_test_judge_eval.py --url https:
2
+ //pandago-graphstrike.hf.space --bedrock
3
+ GraphStrike Judge Evaluation Simulator
4
+ Target: https://pandago-graphstrike.hf.space
5
+ Backend: bedrock
6
+ Model: Bedrock/deepseek.v3.2
7
+ Token: set
8
+
9
+ ============================================================
10
+ PHASE 0: Endpoint Verification
11
+ ============================================================
12
+ ✓ GET /health
13
+ ✓ GET /tasks
14
+ ✓ GET /metadata
15
+ ✓ GET /schema
16
+ ✓ GET /web
17
+ ✓ POST /reset
18
+ ✓ GET /state
19
+ ✓ POST /step
20
+ ✓ POST /step
21
+ ✓ GET /grader
22
+ ✓ POST /mcp
23
+ ✓ POST /baseline
24
+
25
+ ============================================================
26
+ PHASE 1: Baseline Stability (3 runs)
27
+ ============================================================
28
+ Run 1: easy=0.9100 medium=0.9060 hard=0.9038
29
+ Run 2: easy=0.9100 medium=0.9060 hard=0.9038
30
+ Run 3: easy=0.9100 medium=0.9060 hard=0.9038
31
+ ✓ All 3 runs identical — baseline is deterministic
32
+
33
+ ============================================================
34
+ PHASE 2: LLM Agent Evaluation (model=Bedrock/deepseek.v3.2)
35
+ ============================================================
36
+
37
+ --- LLM Agent: task=easy, seed=0, model=Bedrock/deepseek.v3.2 ---
38
+ Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
39
+ Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
40
+ Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
41
+ Step 4: INSPECT acc_0001 flagged=1/10 suspects=8 steps_left=27
42
+ Step 5: INSPECT acc_0014 flagged=1/10 suspects=8 steps_left=26
43
+ Step 6: INSPECT acc_0012 flagged=1/10 suspects=8 steps_left=25
44
+ Step 7: INSPECT acc_0000 flagged=1/10 suspects=8 steps_left=24
45
+ Step 8: FLAG acc_0001 flagged=2/10 suspects=8 steps_left=24
46
+ Step 9: INSPECT acc_0027 flagged=2/10 suspects=8 steps_left=23
47
+ Step 10: FLAG acc_0027 flagged=3/10 suspects=7 steps_left=23
48
+ Step 11: INSPECT acc_0047 flagged=3/10 suspects=7 steps_left=22
49
+ Step 12: FLAG acc_0047 flagged=4/10 suspects=6 steps_left=22
50
+ Step 13: INSPECT acc_0007 flagged=4/10 suspects=6 steps_left=21
51
+ Step 14: FLAG acc_0014 flagged=5/10 suspects=5 steps_left=21
52
+ Step 15: INSPECT acc_0028 flagged=5/10 suspects=5 steps_left=20
53
+ Step 16: FLAG acc_0000 flagged=6/10 suspects=4 steps_left=20
54
+ Step 17: FLAG acc_0007 flagged=7/10 suspects=3 steps_left=20
55
+ Step 18: FLAG acc_0012 flagged=8/10 suspects=2 steps_left=20
56
+ Step 19: FLAG acc_0036 flagged=9/10 suspects=1 steps_left=20
57
+ Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
58
+ Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=20
59
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.90
60
+ ★ GRADER SCORE: 0.9667
61
+
62
+ --- LLM Agent: task=medium, seed=0, model=Bedrock/deepseek.v3.2 ---
63
+ Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
64
+ Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
65
+ Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
66
+ Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=46
67
+ Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=45
68
+ Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
69
+ Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
70
+ Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
71
+ Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
72
+ Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
73
+ Step 11: INSPECT acc_0181 flagged=2/10 suspects=8 steps_left=21
74
+ Step 12: INSPECT acc_0028 flagged=3/10 suspects=7 steps_left=19
75
+ Step 13: FLAG acc_0047 flagged=5/10 suspects=5 steps_left=19
76
+ Step 14: FLAG acc_0007 flagged=7/10 suspects=3 steps_left=19
77
+ Step 15: FLAG acc_0012 flagged=8/10 suspects=2 steps_left=19
78
+ Step 16: FLAG acc_0036 flagged=9/10 suspects=1 steps_left=19
79
+ Step 17: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=19
80
+ Step 18: SUBMIT flagged=10/10 suspects=0 steps_left=18
81
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.88
82
+ ★ GRADER SCORE: 0.9600
83
+
84
+ --- LLM Agent: task=hard, seed=0, model=Bedrock/deepseek.v3.2 ---
85
+ Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=76
86
+ Step 2: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=74
87
+ Step 3: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=70
88
+ Step 4: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=67
89
+ Step 5: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=64
90
+ Step 6: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=61
91
+ Step 7: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=59
92
+ Step 8: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=58
93
+ Step 9: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=55
94
+ Step 10: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=52
95
+ Step 11: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=49
96
+ Step 12: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=47
97
+ Step 13: FLAG acc_0439 flagged=2/10 suspects=6 steps_left=45
98
+ Step 14: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=44
99
+ Step 15: INSPECT acc_0160 flagged=3/10 suspects=6 steps_left=41
100
+ Step 16: FLAG acc_0160 flagged=4/10 suspects=6 steps_left=40
101
+ Step 17: INSPECT acc_0658 flagged=6/10 suspects=4 steps_left=39
102
+ Step 18: INSPECT acc_0290 flagged=10/10 suspects=0 steps_left=27
103
+ Step 19: SUBMIT flagged=10/10 suspects=0 steps_left=26
104
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=14.46
105
+ ★ GRADER SCORE: 0.9325
106
+
107
+ Summary: easy=0.9667 medium=0.9600 hard=0.9325
108
+
109
+ ============================================================
110
+ PHASE 3: Score Variance (seeds=[0, 1, 2])
111
+ ============================================================
112
+
113
+ --- LLM Agent: task=easy, seed=0, model=Bedrock/deepseek.v3.2 ---
114
+ Step 1: INSPECT acc_0043 flagged=10/10 suspects=0 steps_left=17
115
+ Step 2: SUBMIT flagged=10/10 suspects=0 steps_left=10
116
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.80
117
+ ★ GRADER SCORE: 0.9333
118
+
119
+ --- LLM Agent: task=easy, seed=1, model=Bedrock/deepseek.v3.2 ---
120
+ Step 1: INSPECT acc_0034 flagged=1/10 suspects=8 steps_left=28
121
+ Step 2: INSPECT acc_0047 flagged=1/10 suspects=8 steps_left=26
122
+ Step 3: INSPECT acc_0002 flagged=1/10 suspects=8 steps_left=23
123
+ Step 4: FLAG acc_0047 flagged=2/10 suspects=8 steps_left=22
124
+ Step 5: INSPECT acc_0029 flagged=4/10 suspects=6 steps_left=19
125
+ Step 6: FLAG acc_0048 flagged=5/10 suspects=5 steps_left=18
126
+ Step 7: FLAG acc_0005 flagged=6/10 suspects=4 steps_left=17
127
+ Step 8: FLAG acc_0046 flagged=8/10 suspects=2 steps_left=17
128
+ Step 9: INSPECT acc_0014 flagged=9/10 suspects=1 steps_left=16
129
+ Step 10: FLAG acc_0014 flagged=10/10 suspects=0 steps_left=15
130
+ Step 11: SUBMIT flagged=10/10 suspects=0 steps_left=12
131
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.82
132
+ ★ GRADER SCORE: 0.9400
133
+
134
+ --- LLM Agent: task=easy, seed=2, model=Bedrock/deepseek.v3.2 ---
135
+ Step 1: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=17
136
+ Step 2: FLAG acc_0033 flagged=1/10 suspects=9 steps_left=17
137
+ Step 3: INSPECT acc_0029 flagged=6/10 suspects=4 steps_left=7
138
+ Step 4: FLAG acc_0015 flagged=8/10 suspects=2 steps_left=7
139
+ Step 5: FLAG acc_0004 flagged=10/10 suspects=0 steps_left=7
140
+ Step 6: SUBMIT flagged=0/10 suspects=0 steps_left=48
141
+ → Episode ended: [LOSS] TP=0 FP=0 FN=10 Recall=0.00 Precision=0.00 Episode reward=-2.02
142
+ ★ GRADER SCORE: 0.0000
143
+
144
+ easy: scores=['0.933', '0.940', '0.000'] mean=0.6244 var=0.194966
145
+
146
+ --- LLM Agent: task=medium, seed=0, model=Bedrock/deepseek.v3.2 ---
147
+ Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=46
148
+ Step 2: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=49
149
+ Step 3: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=29
150
+ Step 4: INSPECT acc_0043 flagged=1/10 suspects=9 steps_left=47
151
+ Step 5: INSPECT acc_0144 flagged=1/10 suspects=9 steps_left=44
152
+ Step 6: INSPECT acc_0131 flagged=3/10 suspects=7 steps_left=41
153
+ Step 7: INSPECT acc_0181 flagged=5/10 suspects=5 steps_left=36
154
+ Step 8: INSPECT acc_0097 flagged=6/10 suspects=4 steps_left=35
155
+ Step 9: INSPECT acc_0187 flagged=7/10 suspects=3 steps_left=33
156
+ Step 10: FLAG acc_0187 flagged=8/10 suspects=2 steps_left=32
157
+ Step 11: INSPECT acc_0058 flagged=8/10 suspects=2 steps_left=30
158
+ Step 12: FLAG acc_0058 flagged=9/10 suspects=1 steps_left=29
159
+ Step 13: FLAG acc_0093 flagged=10/10 suspects=0 steps_left=27
160
+ Step 14: SUBMIT flagged=10/10 suspects=0 steps_left=26
161
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.76
162
+ ★ GRADER SCORE: 0.9520
163
+
164
+ --- LLM Agent: task=medium, seed=1, model=Bedrock/deepseek.v3.2 ---
165
+ Step 1: INSPECT acc_0171 flagged=0/10 suspects=0 steps_left=47
166
+ Step 2: INSPECT acc_0099 flagged=0/10 suspects=0 steps_left=45
167
+ Step 3: INSPECT acc_0152 flagged=0/10 suspects=0 steps_left=42
168
+ Step 4: INSPECT acc_0078 flagged=0/10 suspects=0 steps_left=39
169
+ Step 5: INSPECT acc_0012 flagged=0/10 suspects=0 steps_left=37
170
+ Step 6: FLAG acc_0012 flagged=1/10 suspects=8 steps_left=36
171
+ Step 7: INSPECT acc_0174 flagged=2/10 suspects=8 steps_left=34
172
+ Step 8: FLAG acc_0174 flagged=3/10 suspects=7 steps_left=34
173
+ Step 9: INSPECT acc_0187 flagged=10/10 suspects=0 steps_left=24
174
+ Step 10: SUBMIT flagged=10/10 suspects=0 steps_left=23
175
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.73
176
+ ★ GRADER SCORE: 0.9460
177
+
178
+ --- LLM Agent: task=medium, seed=2, model=Bedrock/deepseek.v3.2 ---
179
+ Step 1: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=47
180
+ Step 2: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=45
181
+ Step 3: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=43
182
+ Step 4: INSPECT acc_0030 flagged=0/10 suspects=0 steps_left=40
183
+ Step 5: INSPECT acc_0054 flagged=0/10 suspects=0 steps_left=38
184
+ Step 6: INSPECT acc_0199 flagged=0/10 suspects=0 steps_left=35
185
+ Step 7: INSPECT acc_0166 flagged=0/10 suspects=0 steps_left=32
186
+ Step 8: INSPECT acc_0082 flagged=0/10 suspects=0 steps_left=15
187
+ Step 9: INSPECT acc_0016 flagged=0/10 suspects=0 steps_left=28
188
+ Step 10: FLAG acc_0000 flagged=10/10 suspects=0 steps_left=3
189
+ [RETRY] Network error: HTTP Error 400: Bad Request — retrying in 3s (1/3)
190
+ [RETRY] Network error: HTTP Error 400: Bad Request — retrying in 6s (2/3)
191
+ Traceback (most recent call last):
192
+ File "/home/ubuntu/meta/meta-hack-26/deepseek_test_judge_eval.py", line 534, in <module>
193
+ test_variance(base, seeds=list(range(args.seeds)))
194
+ File "/home/ubuntu/meta/meta-hack-26/deepseek_test_judge_eval.py", line 458, in test_varia
195
+ nce
196
+ score = test_llm_agent(base_url, task=task, seed=seed)
197
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
198
+ File "/home/ubuntu/meta/meta-hack-26/deepseek_test_judge_eval.py", line 426, in test_llm_a
199
+ gent
200
+ grader = http_get(f"{base_url}/grader")
201
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
202
+ File "/home/ubuntu/meta/meta-hack-26/deepseek_test_judge_eval.py", line 116, in http_get
203
+ return _retry(_do)
204
+ ^^^^^^^^^^^
205
+ File "/home/ubuntu/meta/meta-hack-26/deepseek_test_judge_eval.py", line 85, in _retry
206
+ return fn()
207
+ ^^^^
208
+ File "/home/ubuntu/meta/meta-hack-26/deepseek_test_judge_eval.py", line 110, in _do
209
+ with urllib.request.urlopen(url, timeout=120) as resp:
210
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
211
+ File "/usr/lib/python3.12/urllib/request.py", line 215, in urlopen
212
+ return opener.open(url, data, timeout)
213
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
214
+ File "/usr/lib/python3.12/urllib/request.py", line 521, in open
215
+ response = meth(req, response)
216
+ ^^^^^^^^^^^^^^^^^^^
217
+ File "/usr/lib/python3.12/urllib/request.py", line 630, in http_response
218
+ response = self.parent.error(
219
+ ^^^^^^^^^^^^^^^^^^
220
+ File "/usr/lib/python3.12/urllib/request.py", line 559, in error
221
+ return self._call_chain(*args)
222
+ ^^^^^^^^^^^^^^^^^^^^^^^
223
+ File "/usr/lib/python3.12/urllib/request.py", line 492, in _call_chain
224
+ result = func(*args)
225
+ ^^^^^^^^^^^
226
+ File "/usr/lib/python3.12/urllib/request.py", line 639, in http_error_default
227
+ raise HTTPError(req.full_url, code, msg, hdrs, fp)
228
+ urllib.error.HTTPError: HTTP Error 400: Bad Request
229
+ ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$ python3 deepseek_test_judge_eval.py --url https:
230
+ //pandago-graphstrike.hf.space --bedrock
231
+ GraphStrike Judge Evaluation Simulator
232
+ Target: https://pandago-graphstrike.hf.space
233
+ Backend: bedrock
234
+ Model: Bedrock/deepseek.v3.2
235
+ Token: set
236
+
237
+ ============================================================
238
+ PHASE 0: Endpoint Verification
239
+ ============================================================
240
+ ✓ GET /health
241
+ ✓ GET /tasks
242
+ ✓ GET /metadata
243
+ ✓ GET /schema
244
+ ✓ GET /web
245
+ ✓ POST /reset
246
+ ✓ GET /state
247
+ ✓ POST /step
248
+ ✓ POST /step
249
+ ✓ GET /grader
250
+ ✓ POST /mcp
251
+ ✓ POST /baseline
252
+
253
+ ============================================================
254
+ PHASE 1: Baseline Stability (3 runs)
255
+ ============================================================
256
+ Run 1: easy=0.9100 medium=0.9060 hard=0.9038
257
+ Run 2: easy=0.9100 medium=0.9060 hard=0.9038
258
+ Run 3: easy=0.9100 medium=0.9060 hard=0.9038
259
+ ✓ All 3 runs identical — baseline is deterministic
260
+
261
+ ============================================================
262
+ PHASE 2: LLM Agent Evaluation (model=Bedrock/deepseek.v3.2)
263
+ ============================================================
264
+
265
+ --- LLM Agent: task=easy, seed=0, model=Bedrock/deepseek.v3.2 ---
266
+ Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
267
+ Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
268
+ Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
269
+ Step 4: INSPECT acc_0001 flagged=1/10 suspects=8 steps_left=27
270
+ Step 5: INSPECT acc_0014 flagged=1/10 suspects=8 steps_left=26
271
+ Step 6: INSPECT acc_0012 flagged=1/10 suspects=8 steps_left=25
272
+ Step 7: INSPECT acc_0000 flagged=1/10 suspects=8 steps_left=24
273
+ Step 8: FLAG acc_0001 flagged=2/10 suspects=8 steps_left=24
274
+ Step 9: INSPECT acc_0027 flagged=2/10 suspects=8 steps_left=23
275
+ Step 10: INSPECT acc_0047 flagged=2/10 suspects=8 steps_left=22
276
+ Step 11: FLAG acc_0027 flagged=3/10 suspects=7 steps_left=22
277
+ Step 12: INSPECT acc_0007 flagged=3/10 suspects=7 steps_left=21
278
+ Step 13: FLAG acc_0047 flagged=4/10 suspects=6 steps_left=21
279
+ Step 14: FLAG acc_0014 flagged=1/10 suspects=1 steps_left=76
280
+ Step 15: INSPECT acc_0356 flagged=1/10 suspects=1 steps_left=75
281
+ Step 16: INSPECT acc_0523 flagged=1/10 suspects=1 steps_left=74
282
+ Step 17: INSPECT acc_0113 flagged=1/10 suspects=1 steps_left=73
283
+ Step 18: INSPECT acc_0797 flagged=1/10 suspects=1 steps_left=71
284
+ Step 19: INSPECT acc_0478 flagged=1/10 suspects=1 steps_left=70
285
+ Step 20: INSPECT acc_0861 flagged=1/10 suspects=1 steps_left=69
286
+ Step 21: INSPECT acc_0836 flagged=1/10 suspects=1 steps_left=68
287
+ Step 22: INSPECT acc_0926 flagged=1/10 suspects=1 steps_left=67
288
+ Step 23: INSPECT acc_0664 flagged=1/10 suspects=1 steps_left=64
289
+ Step 24: INSPECT acc_0255 flagged=1/10 suspects=1 steps_left=63
290
+ Step 25: INSPECT acc_0938 flagged=1/10 suspects=1 steps_left=62
291
+ Step 26: INSPECT acc_0672 flagged=1/10 suspects=1 steps_left=61
292
+ Step 27: FLAG acc_0672 flagged=2/10 suspects=7 steps_left=61
293
+ Step 28: INSPECT acc_0659 flagged=2/10 suspects=7 steps_left=60
294
+ Step 29: FLAG acc_0659 flagged=3/10 suspects=6 steps_left=60
295
+ Step 30: INSPECT acc_0290 flagged=3/10 suspects=6 steps_left=59
296
+ Step 31: INSPECT acc_0339 flagged=3/10 suspects=6 steps_left=58
297
+ Step 32: INSPECT acc_0544 flagged=3/10 suspects=6 steps_left=57
298
+ Step 33: INSPECT acc_0696 flagged=3/10 suspects=6 steps_left=56
299
+ Step 34: INSPECT acc_0541 flagged=3/10 suspects=6 steps_left=55
300
+ Step 35: FLAG acc_0290 flagged=4/10 suspects=8 steps_left=55
301
+ Step 36: INSPECT acc_0793 flagged=4/10 suspects=8 steps_left=54
302
+ Step 37: INSPECT acc_0214 flagged=4/10 suspects=8 steps_left=53
303
+ Step 38: INSPECT acc_0112 flagged=4/10 suspects=8 steps_left=52
304
+ Step 39: FLAG acc_0339 flagged=5/10 suspects=7 steps_left=52
305
+ Step 40: FLAG acc_0112 flagged=6/10 suspects=6 steps_left=52
306
+ Step 41: FLAG acc_0696 flagged=7/10 suspects=5 steps_left=52
307
+ Step 42: FLAG acc_0544 flagged=8/10 suspects=4 steps_left=52
308
+ Step 43: FLAG acc_0214 flagged=9/10 suspects=3 steps_left=52
309
+ Step 44: FLAG acc_0793 flagged=10/10 suspects=2 steps_left=52
310
+ Step 45: FLAG acc_0541 flagged=11/10 suspects=1 steps_left=52
311
+ Step 46: SUBMIT flagged=11/10 suspects=1 steps_left=52
312
+ → Episode ended: [WIN] TP=10 FP=1 FN=0 Recall=1.00 Precision=0.91 Episode reward=17.22
313
+ ★ GRADER SCORE: 0.9514
314
+
315
+ --- LLM Agent: task=medium, seed=0, model=Bedrock/deepseek.v3.2 ---
316
+ Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
317
+ Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
318
+ Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
319
+ Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=46
320
+ Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=45
321
+ Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
322
+ Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
323
+ Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
324
+ Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=40
325
+ Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=40
326
+ Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=38
327
+ Step 12: INSPECT acc_0022 flagged=1/10 suspects=8 steps_left=37
328
+ Step 13: FLAG acc_0022 flagged=2/10 suspects=8 steps_left=37
329
+ Step 14: INSPECT acc_0092 flagged=2/10 suspects=8 steps_left=36
330
+ Step 15: INSPECT acc_0097 flagged=2/10 suspects=8 steps_left=35
331
+ Step 16: FLAG acc_0097 flagged=3/10 suspects=7 steps_left=35
332
+ Step 17: FLAG acc_0181 flagged=4/10 suspects=6 steps_left=35
333
+ Step 18: INSPECT acc_0187 flagged=4/10 suspects=6 steps_left=34
334
+ Step 19: INSPECT acc_0093 flagged=4/10 suspects=6 steps_left=33
335
+ Step 20: FLAG acc_0093 flagged=5/10 suspects=5 steps_left=33
336
+ Step 21: FLAG acc_0187 flagged=6/10 suspects=4 steps_left=33
337
+ Step 22: INSPECT acc_0172 flagged=6/10 suspects=4 steps_left=32
338
+ Step 23: FLAG acc_0172 flagged=7/10 suspects=3 steps_left=32
339
+ Step 24: INSPECT acc_0058 flagged=7/10 suspects=3 steps_left=31
340
+ Step 25: FLAG acc_0058 flagged=8/10 suspects=2 steps_left=31
341
+ Step 26: INSPECT acc_0131 flagged=8/10 suspects=2 steps_left=30
342
+ Step 27: FLAG acc_0131 flagged=9/10 suspects=1 steps_left=30
343
+ Step 28: FLAG acc_0092 flagged=10/10 suspects=0 steps_left=30
344
+ Step 29: SUBMIT flagged=10/10 suspects=0 steps_left=30
345
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.80
346
+ ★ GRADER SCORE: 0.9600
347
+
348
+ --- LLM Agent: task=hard, seed=0, model=Bedrock/deepseek.v3.2 ---
349
+ Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
350
+ Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
351
+ Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
352
+ Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
353
+ Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
354
+ Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
355
+ Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
356
+ Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
357
+ Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
358
+ Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
359
+ Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
360
+ Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
361
+ Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
362
+ Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
363
+ Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
364
+ Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
365
+ Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
366
+ Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
367
+ Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
368
+ Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
369
+ Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
370
+ Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
371
+ Step 23: INSPECT acc_0621 flagged=1/10 suspects=5 steps_left=58
372
+ Step 24: FLAG acc_0621 flagged=2/10 suspects=7 steps_left=58
373
+ Step 25: INSPECT acc_0389 flagged=2/10 suspects=7 steps_left=57
374
+ Step 26: INSPECT acc_0160 flagged=2/10 suspects=7 steps_left=56
375
+ Step 27: INSPECT acc_0549 flagged=2/10 suspects=7 steps_left=55
376
+ Step 28: INSPECT acc_0658 flagged=2/10 suspects=7 steps_left=54
377
+ Step 29: INSPECT acc_0290 flagged=2/10 suspects=7 steps_left=53
378
+ Step 30: INSPECT acc_0124 flagged=2/10 suspects=7 steps_left=52
379
+ Step 31: FLAG acc_0160 flagged=3/10 suspects=7 steps_left=52
380
+ Step 32: INSPECT acc_0507 flagged=3/10 suspects=7 steps_left=51
381
+ Step 33: FLAG acc_0237 flagged=4/10 suspects=6 steps_left=51
382
+ Step 34: FLAG acc_0549 flagged=5/10 suspects=5 steps_left=51
383
+ Step 35: FLAG acc_0290 flagged=6/10 suspects=4 steps_left=51
384
+ Step 36: FLAG acc_0389 flagged=8/10 suspects=10 steps_left=51
385
+ Step 37: INSPECT acc_0844 flagged=9/10 suspects=9 steps_left=50
386
+ Step 38: INSPECT acc_0436 flagged=9/10 suspects=9 steps_left=49
387
+ Step 39: INSPECT acc_0870 flagged=9/10 suspects=9 steps_left=48
388
+ Step 40: FLAG acc_0507 flagged=10/10 suspects=8 steps_left=48
389
+ Step 41: INSPECT acc_0142 flagged=10/10 suspects=8 steps_left=47
390
+ Step 42: INSPECT acc_0833 flagged=10/10 suspects=8 steps_left=46
391
+ Step 43: INSPECT acc_0271 flagged=10/10 suspects=8 steps_left=45
392
+ Step 44: FLAG acc_0124 flagged=11/10 suspects=7 steps_left=45
393
+ Step 45: SUBMIT flagged=11/10 suspects=7 steps_left=45
394
+ → Episode ended: [WIN] TP=10 FP=1 FN=0 Recall=1.00 Precision=0.91 Episode reward=16.15
395
+ ★ GRADER SCORE: 0.9426
396
+
397
+ Summary: easy=0.9514 medium=0.9600 hard=0.9426
398
+
399
+ ============================================================
400
+ PHASE 3: Score Variance (seeds=[0, 1, 2])
401
+ ============================================================
402
+
403
+ --- LLM Agent: task=easy, seed=0, model=Bedrock/deepseek.v3.2 ---
404
+ Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
405
+ Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
406
+ Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
407
+ Step 4: INSPECT acc_0001 flagged=1/10 suspects=8 steps_left=27
408
+ Step 5: INSPECT acc_0014 flagged=1/10 suspects=8 steps_left=26
409
+ Step 6: INSPECT acc_0012 flagged=1/10 suspects=8 steps_left=25
410
+ Step 7: INSPECT acc_0000 flagged=1/10 suspects=8 steps_left=24
411
+ Step 8: FLAG acc_0001 flagged=2/10 suspects=8 steps_left=24
412
+ Step 9: INSPECT acc_0027 flagged=2/10 suspects=8 steps_left=23
413
+ Step 10: FLAG acc_0027 flagged=3/10 suspects=7 steps_left=23
414
+ Step 11: INSPECT acc_0047 flagged=3/10 suspects=7 steps_left=22
415
+ Step 12: FLAG acc_0047 flagged=4/10 suspects=6 steps_left=22
416
+ Step 13: INSPECT acc_0007 flagged=4/10 suspects=6 steps_left=21
417
+ Step 14: INSPECT acc_0028 flagged=4/10 suspects=6 steps_left=20
418
+ Step 15: FLAG acc_0014 flagged=5/10 suspects=5 steps_left=20
419
+ Step 16: FLAG acc_0007 flagged=6/10 suspects=4 steps_left=20
420
+ Step 17: FLAG acc_0000 flagged=7/10 suspects=3 steps_left=20
421
+ Step 18: FLAG acc_0012 flagged=8/10 suspects=2 steps_left=20
422
+ Step 19: FLAG acc_0036 flagged=9/10 suspects=1 steps_left=20
423
+ Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
424
+ Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=20
425
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.90
426
+ ★ GRADER SCORE: 0.9667
427
+
428
+ --- LLM Agent: task=easy, seed=1, model=Bedrock/deepseek.v3.2 ---
429
+ Step 1: INSPECT acc_0034 flagged=0/10 suspects=0 steps_left=29
430
+ Step 2: INSPECT acc_0003 flagged=0/10 suspects=0 steps_left=28
431
+ Step 3: INSPECT acc_0049 flagged=0/10 suspects=0 steps_left=27
432
+ Step 4: INSPECT acc_0006 flagged=0/10 suspects=0 steps_left=26
433
+ Step 5: INSPECT acc_0047 flagged=0/10 suspects=0 steps_left=25
434
+ Step 6: FLAG acc_0047 flagged=1/10 suspects=9 steps_left=25
435
+ Step 7: INSPECT acc_0009 flagged=1/10 suspects=9 steps_left=24
436
+ Step 8: INSPECT acc_0046 flagged=1/10 suspects=9 steps_left=23
437
+ Step 9: FLAG acc_0009 flagged=2/10 suspects=8 steps_left=23
438
+ Step 10: FLAG acc_0046 flagged=3/10 suspects=7 steps_left=23
439
+ Step 11: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=22
440
+ Step 12: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=22
441
+ Step 13: INSPECT acc_0021 flagged=4/10 suspects=6 steps_left=21
442
+ Step 14: FLAG acc_0021 flagged=5/10 suspects=5 steps_left=21
443
+ Step 15: INSPECT acc_0002 flagged=5/10 suspects=5 steps_left=20
444
+ Step 16: FLAG acc_0002 flagged=6/10 suspects=4 steps_left=20
445
+ Step 17: INSPECT acc_0048 flagged=6/10 suspects=4 steps_left=19
446
+ Step 18: INSPECT acc_0029 flagged=6/10 suspects=4 steps_left=18
447
+ Step 19: FLAG acc_0029 flagged=7/10 suspects=3 steps_left=18
448
+ Step 20: INSPECT acc_0015 flagged=7/10 suspects=3 steps_left=17
449
+ Step 21: FLAG acc_0015 flagged=8/10 suspects=2 steps_left=17
450
+ Step 22: FLAG acc_0048 flagged=9/10 suspects=1 steps_left=17
451
+ Step 23: INSPECT acc_0005 flagged=9/10 suspects=1 steps_left=16
452
+ Step 24: FLAG acc_0005 flagged=10/10 suspects=0 steps_left=16
453
+ Step 25: SUBMIT flagged=10/10 suspects=0 steps_left=16
454
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.86
455
+ ★ GRADER SCORE: 0.9533
456
+
457
+ --- LLM Agent: task=easy, seed=2, model=Bedrock/deepseek.v3.2 ---
458
+ Step 1: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=30
459
+ ★ GRADER SCORE: 0.0000
460
+
461
+ easy: scores=['0.967', '0.953', '0.000'] mean=0.6400 var=0.204830
462
+
463
+ --- LLM Agent: task=medium, seed=0, model=Bedrock/deepseek.v3.2 ---
464
+ Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
465
+ Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
466
+ Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
467
+ Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=46
468
+ Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=45
469
+ Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
470
+ Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
471
+ Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
472
+ Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
473
+ Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
474
+ Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=40
475
+ Step 12: INSPECT acc_0022 flagged=1/10 suspects=8 steps_left=39
476
+ Step 13: FLAG acc_0022 flagged=2/10 suspects=8 steps_left=39
477
+ Step 14: INSPECT acc_0092 flagged=2/10 suspects=8 steps_left=38
478
+ Step 15: INSPECT acc_0097 flagged=2/10 suspects=8 steps_left=37
479
+ Step 16: FLAG acc_0181 flagged=3/10 suspects=7 steps_left=37
480
+ Step 17: INSPECT acc_0187 flagged=3/10 suspects=7 steps_left=36
481
+ Step 18: FLAG acc_0092 flagged=4/10 suspects=6 steps_left=36
482
+ Step 19: INSPECT acc_0093 flagged=4/10 suspects=6 steps_left=35
483
+ Step 20: FLAG acc_0093 flagged=5/10 suspects=5 steps_left=35
484
+ Step 21: FLAG acc_0097 flagged=6/10 suspects=4 steps_left=35
485
+ Step 22: INSPECT acc_0172 flagged=6/10 suspects=4 steps_left=34
486
+ Step 23: FLAG acc_0187 flagged=7/10 suspects=3 steps_left=34
487
+ Step 24: INSPECT acc_0058 flagged=7/10 suspects=3 steps_left=33
488
+ Step 25: FLAG acc_0058 flagged=8/10 suspects=2 steps_left=33
489
+ Step 26: FLAG acc_0172 flagged=9/10 suspects=1 steps_left=33
490
+ Step 27: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=32
491
+ Step 28: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=32
492
+ Step 29: SUBMIT flagged=10/10 suspects=0 steps_left=32
493
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.82
494
+ ★ GRADER SCORE: 0.9640
495
+
496
+ --- LLM Agent: task=medium, seed=1, model=Bedrock/deepseek.v3.2 ---
497
+ Step 1: INSPECT acc_0171 flagged=0/10 suspects=0 steps_left=49
498
+ Step 2: INSPECT acc_0099 flagged=0/10 suspects=0 steps_left=48
499
+ Step 3: INSPECT acc_0152 flagged=0/10 suspects=0 steps_left=47
500
+ Step 4: INSPECT acc_0092 flagged=0/10 suspects=0 steps_left=46
501
+ Step 5: INSPECT acc_0078 flagged=0/10 suspects=0 steps_left=45
502
+ Step 6: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=44
503
+ Step 7: INSPECT acc_0012 flagged=0/10 suspects=0 steps_left=43
504
+ Step 8: FLAG acc_0012 flagged=1/10 suspects=8 steps_left=43
505
+ Step 9: INSPECT acc_0033 flagged=1/10 suspects=8 steps_left=42
506
+ Step 10: FLAG acc_0033 flagged=2/10 suspects=8 steps_left=42
507
+ Step 11: INSPECT acc_0174 flagged=2/10 suspects=8 steps_left=41
508
+ Step 12: INSPECT acc_0187 flagged=2/10 suspects=8 steps_left=40
509
+ Step 13: FLAG acc_0187 flagged=3/10 suspects=7 steps_left=40
510
+ Step 14: INSPECT acc_0079 flagged=3/10 suspects=7 steps_left=39
511
+ Step 15: INSPECT acc_0032 flagged=3/10 suspects=7 steps_left=38
512
+ Step 16: INSPECT acc_0023 flagged=3/10 suspects=7 steps_left=37
513
+ Step 17: INSPECT acc_0146 flagged=3/10 suspects=7 steps_left=36
514
+ Step 18: FLAG acc_0174 flagged=4/10 suspects=6 steps_left=36
515
+ Step 19: INSPECT acc_0019 flagged=4/10 suspects=6 steps_left=35
516
+ Step 20: FLAG acc_0023 flagged=5/10 suspects=5 steps_left=35
517
+ Step 21: INSPECT acc_0131 flagged=5/10 suspects=5 steps_left=34
518
+ Step 22: FLAG acc_0079 flagged=6/10 suspects=4 steps_left=34
519
+ Step 23: FLAG acc_0019 flagged=7/10 suspects=3 steps_left=34
520
+ Step 24: FLAG acc_0146 flagged=8/10 suspects=2 steps_left=34
521
+ Step 25: FLAG acc_0131 flagged=9/10 suspects=1 steps_left=34
522
+ Step 26: FLAG acc_0032 flagged=10/10 suspects=0 steps_left=34
523
+ Step 27: SUBMIT flagged=10/10 suspects=0 steps_left=34
524
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.84
525
+ ★ GRADER SCORE: 0.9680
526
+
527
+ --- LLM Agent: task=medium, seed=2, model=Bedrock/deepseek.v3.2 ---
528
+ Step 1: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=49
529
+ Step 2: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=48
530
+ Step 3: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=47
531
+ Step 4: INSPECT acc_0030 flagged=0/10 suspects=0 steps_left=46
532
+ Step 5: INSPECT acc_0041 flagged=0/10 suspects=0 steps_left=45
533
+ Step 6: INSPECT acc_0054 flagged=0/10 suspects=0 steps_left=44
534
+ Step 7: INSPECT acc_0199 flagged=0/10 suspects=0 steps_left=43
535
+ Step 8: INSPECT acc_0181 flagged=0/10 suspects=0 steps_left=42
536
+ Step 9: INSPECT acc_0166 flagged=0/10 suspects=0 steps_left=41
537
+ Step 10: INSPECT acc_0098 flagged=0/10 suspects=0 steps_left=40
538
+ Step 11: INSPECT acc_0121 flagged=0/10 suspects=0 steps_left=39
539
+ Step 12: INSPECT acc_0053 flagged=0/10 suspects=0 steps_left=38
540
+ Step 13: INSPECT acc_0103 flagged=0/10 suspects=0 steps_left=37
541
+ Step 14: INSPECT acc_0000 flagged=0/10 suspects=0 steps_left=36
542
+ Step 15: INSPECT acc_0168 flagged=0/10 suspects=0 steps_left=35
543
+ Step 16: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=34
544
+ Step 17: INSPECT acc_0149 flagged=0/10 suspects=0 steps_left=33
545
+ Step 18: INSPECT acc_0064 flagged=0/10 suspects=0 steps_left=32
546
+ Step 19: INSPECT acc_0016 flagged=0/10 suspects=0 steps_left=31
547
+ Step 20: INSPECT acc_0105 flagged=0/10 suspects=0 steps_left=30
548
+ Step 21: INSPECT acc_0035 flagged=0/10 suspects=0 steps_left=29
549
+ Step 22: FLAG acc_0035 flagged=1/10 suspects=9 steps_left=29
550
+ Step 23: INSPECT acc_0020 flagged=1/10 suspects=9 steps_left=28
551
+ Step 24: INSPECT acc_0036 flagged=1/10 suspects=9 steps_left=27
552
+ Step 25: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=27
553
+ Step 26: INSPECT acc_0050 flagged=2/10 suspects=8 steps_left=26
554
+ Step 27: FLAG acc_0020 flagged=3/10 suspects=7 steps_left=26
555
+ Step 28: INSPECT acc_0051 flagged=3/10 suspects=7 steps_left=25
556
+ Step 29: INSPECT acc_0085 flagged=3/10 suspects=7 steps_left=24
557
+ Step 30: FLAG acc_0050 flagged=4/10 suspects=6 steps_left=24
558
+ Step 31: FLAG acc_0085 flagged=5/10 suspects=5 steps_left=24
559
+ Step 32: INSPECT acc_0177 flagged=5/10 suspects=5 steps_left=23
560
+ Step 33: INSPECT acc_0170 flagged=5/10 suspects=5 steps_left=22
561
+ Step 34: FLAG acc_0170 flagged=6/10 suspects=4 steps_left=22
562
+ [LLM RETRY] An error occurred (ValidationException) when calling the Converse operation:
563
+ The model returned the following errors: {"error":{"code":"validation_error","message":"Int
564
+ ernal server error","param":null,"type":"invalid_request_error"}} — retrying in 3s
565
+ Step 35: INSPECT acc_0055 flagged=6/10 suspects=4 steps_left=21
566
+ Step 36: FLAG acc_0177 flagged=7/10 suspects=3 steps_left=21
567
+ Step 37: INSPECT acc_0094 flagged=7/10 suspects=3 steps_left=20
568
+ Step 38: FLAG acc_0094 flagged=8/10 suspects=2 steps_left=20
569
+ Step 39: FLAG acc_0055 flagged=9/10 suspects=1 steps_left=20
570
+ Step 40: FLAG acc_0051 flagged=10/10 suspects=0 steps_left=20
571
+ Step 41: SUBMIT flagged=10/10 suspects=0 steps_left=20
572
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.70
573
+ ★ GRADER SCORE: 0.9400
574
+
575
+ medium: scores=['0.964', '0.968', '0.940'] mean=0.9573 var=0.000153
576
+
577
+ --- LLM Agent: task=hard, seed=0, model=Bedrock/deepseek.v3.2 ---
578
+ Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
579
+ Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
580
+ Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
581
+ Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
582
+ Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
583
+ Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
584
+ Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
585
+ Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
586
+ Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
587
+ Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
588
+ Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
589
+ Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
590
+ Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
591
+ Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
592
+ Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
593
+ Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
594
+ Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
595
+ Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
596
+ Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
597
+ Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
598
+ Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
599
+ Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
600
+ Step 23: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=59
601
+ Step 24: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=58
602
+ Step 25: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=58
603
+ Step 26: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=57
604
+ Step 27: INSPECT acc_0160 flagged=3/10 suspects=6 steps_left=56
605
+ Step 28: INSPECT acc_0549 flagged=3/10 suspects=6 steps_left=55
606
+ Step 29: INSPECT acc_0658 flagged=3/10 suspects=6 steps_left=54
607
+ Step 30: FLAG acc_0160 flagged=4/10 suspects=6 steps_left=54
608
+ Step 31: INSPECT acc_0290 flagged=4/10 suspects=6 steps_left=53
609
+ Step 32: INSPECT acc_0124 flagged=4/10 suspects=6 steps_left=52
610
+ Step 33: INSPECT acc_0507 flagged=4/10 suspects=6 steps_left=51
611
+ Step 34: FLAG acc_0549 flagged=5/10 suspects=5 steps_left=51
612
+ Step 35: FLAG acc_0290 flagged=6/10 suspects=4 steps_left=51
613
+ Step 36: FLAG acc_0389 flagged=7/10 suspects=3 steps_left=51
614
+ Step 37: FLAG acc_0658 flagged=8/10 suspects=2 steps_left=51
615
+ Step 38: FLAG acc_0507 flagged=9/10 suspects=1 steps_left=51
616
+ Step 39: FLAG acc_0124 flagged=10/10 suspects=0 steps_left=51
617
+ Step 40: SUBMIT flagged=10/10 suspects=0 steps_left=51
618
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.71
619
+ ★ GRADER SCORE: 0.9637
620
+
621
+ --- LLM Agent: task=hard, seed=1, model=Bedrock/deepseek.v3.2 ---
622
+ Step 1: INSPECT acc_0014 flagged=0/10 suspects=0 steps_left=79
623
+ Step 2: INSPECT acc_0835 flagged=0/10 suspects=0 steps_left=78
624
+ Step 3: INSPECT acc_0855 flagged=0/10 suspects=0 steps_left=77
625
+ Step 4: INSPECT acc_0930 flagged=0/10 suspects=0 steps_left=76
626
+ Step 5: INSPECT acc_0336 flagged=0/10 suspects=0 steps_left=75
627
+ Step 6: INSPECT acc_0929 flagged=0/10 suspects=0 steps_left=74
628
+ Step 7: INSPECT acc_0076 flagged=0/10 suspects=0 steps_left=73
629
+ Step 8: INSPECT acc_0543 flagged=0/10 suspects=0 steps_left=72
630
+ Step 9: INSPECT acc_0590 flagged=0/10 suspects=0 steps_left=71
631
+ Step 10: INSPECT acc_0401 flagged=0/10 suspects=0 steps_left=70
632
+ Step 11: INSPECT acc_0322 flagged=0/10 suspects=0 steps_left=69
633
+ Step 12: INSPECT acc_0154 flagged=0/10 suspects=0 steps_left=68
634
+ Step 13: INSPECT acc_0374 flagged=0/10 suspects=0 steps_left=67
635
+ Step 14: INSPECT acc_0549 flagged=0/10 suspects=0 steps_left=66
636
+ Step 15: INSPECT acc_0903 flagged=0/10 suspects=0 steps_left=65
637
+ Step 16: INSPECT acc_0976 flagged=0/10 suspects=0 steps_left=64
638
+ Step 17: INSPECT acc_0620 flagged=0/10 suspects=0 steps_left=63
639
+ Step 18: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=62
640
+ Step 19: INSPECT acc_0222 flagged=0/10 suspects=0 steps_left=61
641
+ Step 20: INSPECT acc_0536 flagged=0/10 suspects=0 steps_left=60
642
+ Step 21: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=59
643
+ Step 22: INSPECT acc_0577 flagged=0/10 suspects=0 steps_left=58
644
+ Step 23: INSPECT acc_0517 flagged=0/10 suspects=0 steps_left=57
645
+ Step 24: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=56
646
+ Step 25: INSPECT acc_0167 flagged=0/10 suspects=0 steps_left=55
647
+ Step 26: INSPECT acc_0697 flagged=0/10 suspects=0 steps_left=54
648
+ Step 27: INSPECT acc_0271 flagged=0/10 suspects=0 steps_left=53
649
+ Step 28: INSPECT acc_0681 flagged=0/10 suspects=0 steps_left=52
650
+ Step 29: INSPECT acc_0530 flagged=0/10 suspects=0 steps_left=51
651
+ Step 30: INSPECT acc_0353 flagged=0/10 suspects=0 steps_left=50
652
+ Step 31: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=49
653
+ Step 32: INSPECT acc_0777 flagged=0/10 suspects=0 steps_left=48
654
+ Step 33: INSPECT acc_0265 flagged=0/10 suspects=0 steps_left=47
655
+ Step 34: INSPECT acc_0788 flagged=0/10 suspects=0 steps_left=46
656
+ Step 35: INSPECT acc_0033 flagged=0/10 suspects=0 steps_left=45
657
+ Step 36: INSPECT acc_0187 flagged=0/10 suspects=0 steps_left=44
658
+ Step 37: INSPECT acc_0445 flagged=0/10 suspects=0 steps_left=43
659
+ Step 38: INSPECT acc_0846 flagged=0/10 suspects=0 steps_left=42
660
+ Step 39: INSPECT acc_0659 flagged=0/10 suspects=0 steps_left=41
661
+ Step 40: INSPECT acc_0768 flagged=0/10 suspects=0 steps_left=40
662
+ Step 41: INSPECT acc_0677 flagged=0/10 suspects=0 steps_left=39
663
+ Step 42: INSPECT acc_0539 flagged=0/10 suspects=0 steps_left=38
664
+ Step 43: INSPECT acc_0742 flagged=0/10 suspects=0 steps_left=37
665
+ Step 44: INSPECT acc_0503 flagged=0/10 suspects=0 steps_left=36
666
+ Step 45: INSPECT acc_0876 flagged=0/10 suspects=0 steps_left=35
667
+ Step 46: INSPECT acc_0639 flagged=0/10 suspects=0 steps_left=34
668
+ Step 47: INSPECT acc_0494 flagged=0/10 suspects=0 steps_left=33
669
+ Step 48: INSPECT acc_0898 flagged=0/10 suspects=0 steps_left=32
670
+ Step 49: INSPECT acc_0553 flagged=0/10 suspects=0 steps_left=31
671
+ Step 50: INSPECT acc_0588 flagged=0/10 suspects=0 steps_left=30
672
+ Step 51: INSPECT acc_0194 flagged=0/10 suspects=0 steps_left=29
673
+ Step 52: INSPECT acc_0810 flagged=0/10 suspects=0 steps_left=28
674
+ Step 53: INSPECT acc_0355 flagged=0/10 suspects=0 steps_left=27
675
+ Step 54: INSPECT acc_0363 flagged=0/10 suspects=0 steps_left=26
676
+ Step 55: INSPECT acc_0221 flagged=0/10 suspects=0 steps_left=25
677
+ Step 56: INSPECT acc_0580 flagged=0/10 suspects=0 steps_left=24
678
+ Step 57: INSPECT acc_0534 flagged=0/10 suspects=0 steps_left=23
679
+ Step 58: INSPECT acc_0778 flagged=0/10 suspects=0 steps_left=22
680
+ Step 59: INSPECT acc_0998 flagged=0/10 suspects=0 steps_left=21
681
+ Step 60: INSPECT acc_0233 flagged=0/10 suspects=0 steps_left=20
682
+ Step 61: INSPECT acc_0052 flagged=0/10 suspects=0 steps_left=19
683
+ Step 62: INSPECT acc_0813 flagged=0/10 suspects=0 steps_left=18
684
+ Step 63: INSPECT acc_0035 flagged=0/10 suspects=0 steps_left=17
685
+ Step 64: INSPECT acc_0667 flagged=0/10 suspects=0 steps_left=16
686
+ Step 65: INSPECT acc_0019 flagged=0/10 suspects=0 steps_left=15
687
+ Step 66: INSPECT acc_0959 flagged=0/10 suspects=0 steps_left=14
688
+ Step 67: INSPECT acc_0212 flagged=0/10 suspects=0 steps_left=13
689
+ Step 68: INSPECT acc_0776 flagged=0/10 suspects=0 steps_left=12
690
+ Step 69: INSPECT acc_0049 flagged=0/10 suspects=0 steps_left=11
691
+ Step 70: INSPECT acc_0434 flagged=0/10 suspects=0 steps_left=10
692
+ Step 71: INSPECT acc_0827 flagged=0/10 suspects=0 steps_left=9
693
+ Step 72: INSPECT acc_0583 flagged=0/10 suspects=0 steps_left=8
694
+ Step 73: INSPECT acc_0065 flagged=0/10 suspects=0 steps_left=7
695
+ Step 74: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=6
696
+ Step 75: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=5
697
+ Step 76: INSPECT acc_0761 flagged=0/10 suspects=0 steps_left=4
698
+ Step 77: INSPECT acc_0995 flagged=0/10 suspects=0 steps_left=3
699
+ Step 78: INSPECT acc_0157 flagged=0/10 suspects=0 steps_left=2
700
+ Step 79: INSPECT acc_0936 flagged=0/10 suspects=0 steps_left=1
701
+ Step 80: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=0
702
+ → Episode ended: [LOSS] TP=0 FP=0 FN=10 Recall=0.00 Precision=0.00 Episode reward=-9.80
703
+ ★ GRADER SCORE: 0.0000
704
+
705
+ --- LLM Agent: task=hard, seed=2, model=Bedrock/deepseek.v3.2 ---
706
+ Step 1: INSPECT acc_0813 flagged=0/10 suspects=0 steps_left=79
707
+ Step 2: INSPECT acc_0430 flagged=0/10 suspects=0 steps_left=78
708
+ Step 3: INSPECT acc_0817 flagged=0/10 suspects=0 steps_left=77
709
+ Step 4: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=76
710
+ Step 5: INSPECT acc_0523 flagged=0/10 suspects=0 steps_left=75
711
+ Step 6: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=74
712
+ Step 7: INSPECT acc_0797 flagged=0/10 suspects=0 steps_left=73
713
+ Step 8: INSPECT acc_0478 flagged=0/10 suspects=0 steps_left=72
714
+ Step 9: INSPECT acc_0861 flagged=0/10 suspects=0 steps_left=71
715
+ Step 10: INSPECT acc_0836 flagged=0/10 suspects=0 steps_left=70
716
+ Step 11: INSPECT acc_0926 flagged=0/10 suspects=0 steps_left=69
717
+ Step 12: INSPECT acc_0664 flagged=0/10 suspects=0 steps_left=68
718
+ Step 13: INSPECT acc_0255 flagged=0/10 suspects=0 steps_left=67
719
+ Step 14: INSPECT acc_0938 flagged=0/10 suspects=0 steps_left=66
720
+ Step 15: INSPECT acc_0672 flagged=0/10 suspects=0 steps_left=65
721
+ Step 16: FLAG acc_0672 flagged=1/10 suspects=6 steps_left=65
722
+ Step 17: INSPECT acc_0659 flagged=1/10 suspects=6 steps_left=64
723
+ Step 18: INSPECT acc_0290 flagged=1/10 suspects=6 steps_left=63
724
+ Step 19: FLAG acc_0290 flagged=2/10 suspects=6 steps_left=63
725
+ Step 20: INSPECT acc_0339 flagged=2/10 suspects=6 steps_left=62
726
+ Step 21: FLAG acc_0659 flagged=3/10 suspects=7 steps_left=62
727
+ Step 22: INSPECT acc_0544 flagged=3/10 suspects=7 steps_left=61
728
+ Step 23: FLAG acc_0339 flagged=4/10 suspects=6 steps_left=61
729
+ Step 24: INSPECT acc_0696 flagged=4/10 suspects=6 steps_left=60
730
+ Step 25: FLAG acc_0544 flagged=5/10 suspects=5 steps_left=60
731
+ Step 26: INSPECT acc_0541 flagged=5/10 suspects=5 steps_left=59
732
+ Step 27: FLAG acc_0696 flagged=6/10 suspects=4 steps_left=59
733
+ Step 28: FLAG acc_0541 flagged=7/10 suspects=3 steps_left=59
734
+ Step 29: INSPECT acc_0793 flagged=7/10 suspects=3 steps_left=58
735
+ Step 30: FLAG acc_0793 flagged=8/10 suspects=2 steps_left=58
736
+ Step 31: INSPECT acc_0214 flagged=8/10 suspects=2 steps_left=57
737
+ Step 32: INSPECT acc_0112 flagged=8/10 suspects=2 steps_left=56
738
+ Step 33: FLAG acc_0112 flagged=9/10 suspects=1 steps_left=56
739
+ Step 34: FLAG acc_0214 flagged=10/10 suspects=0 steps_left=56
740
+ Step 35: SUBMIT flagged=10/10 suspects=0 steps_left=56
741
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.76
742
+ ★ GRADER SCORE: 0.9700
743
+
744
+ hard: scores=['0.964', '0.000', '0.970'] mean=0.6446 var=0.207740
745
+
746
+ ============================================================
747
+ EVALUATION COMPLETE
748
+ ============================================================
749
+ ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$
model-benchmark-logs/gemma_judge_log.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-benchmark-logs/meta_judge_log.txt ADDED
@@ -0,0 +1,826 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ubuntu@ip-172-31-33-59:~/meta/meta-hack-26/eval-models$ python3 llama_test_judge_eval.py --url https://pandago-graphstrike.hf.space --bedrock
2
+ GraphStrike Judge Evaluation Simulator
3
+ Target: https://pandago-graphstrike.hf.space
4
+ Backend: bedrock
5
+ Model: Bedrock/us.meta.llama4-scout-17b-instruct-v1:0
6
+ Token: set
7
+
8
+ ============================================================
9
+ PHASE 0: Endpoint Verification
10
+ ============================================================
11
+ ✓ GET /health
12
+ ✓ GET /tasks
13
+ ✓ GET /metadata
14
+ ✓ GET /schema
15
+ ✓ GET /web
16
+ ✓ POST /reset
17
+ ✓ GET /state
18
+ ✓ POST /step
19
+ ✓ POST /step
20
+ ✓ GET /grader
21
+ ✓ POST /mcp
22
+ ✓ POST /baseline
23
+
24
+ ============================================================
25
+ PHASE 1: Baseline Stability (3 runs)
26
+ ============================================================
27
+ Run 1: easy=0.9100 medium=0.9060 hard=0.9038
28
+ Run 2: easy=0.9100 medium=0.9060 hard=0.9038
29
+ Run 3: easy=0.9100 medium=0.9060 hard=0.9038
30
+ ✓ All 3 runs identical — baseline is deterministic
31
+
32
+ ============================================================
33
+ PHASE 2: LLM Agent Evaluation (model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0)
34
+ ============================================================
35
+
36
+ --- LLM Agent: task=easy, seed=0, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
37
+ Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
38
+ Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
39
+ Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
40
+ Step 4: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=28
41
+ Step 5: INSPECT acc_0001 flagged=2/10 suspects=8 steps_left=27
42
+ Step 6: FLAG acc_0001 flagged=3/10 suspects=7 steps_left=27
43
+ Step 7: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=26
44
+ Step 8: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=26
45
+ Step 9: INSPECT acc_0012 flagged=4/10 suspects=6 steps_left=25
46
+ Step 10: FLAG acc_0012 flagged=5/10 suspects=5 steps_left=25
47
+ Step 11: INSPECT acc_0000 flagged=5/10 suspects=5 steps_left=24
48
+ Step 12: FLAG acc_0000 flagged=6/10 suspects=4 steps_left=24
49
+ Step 13: INSPECT acc_0027 flagged=6/10 suspects=4 steps_left=23
50
+ Step 14: FLAG acc_0027 flagged=7/10 suspects=3 steps_left=23
51
+ Step 15: INSPECT acc_0047 flagged=7/10 suspects=3 steps_left=22
52
+ Step 16: FLAG acc_0047 flagged=8/10 suspects=2 steps_left=22
53
+ Step 17: INSPECT acc_0007 flagged=8/10 suspects=2 steps_left=21
54
+ Step 18: FLAG acc_0007 flagged=9/10 suspects=1 steps_left=21
55
+ Step 19: INSPECT acc_0028 flagged=9/10 suspects=1 steps_left=20
56
+ Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
57
+ Step 21: INSPECT acc_0035 flagged=10/10 suspects=0 steps_left=19
58
+ Step 22: INSPECT acc_0022 flagged=10/10 suspects=0 steps_left=18
59
+ Step 23: INSPECT acc_0041 flagged=10/10 suspects=0 steps_left=17
60
+ Step 24: INSPECT acc_0029 flagged=10/10 suspects=0 steps_left=16
61
+ Step 25: INSPECT acc_0018 flagged=10/10 suspects=0 steps_left=15
62
+ Step 26: INSPECT acc_0019 flagged=10/10 suspects=0 steps_left=14
63
+ Step 27: INSPECT acc_0041 flagged=10/10 suspects=0 steps_left=13
64
+ Step 28: INSPECT acc_0040 flagged=10/10 suspects=0 steps_left=12
65
+ Step 29: INSPECT acc_0026 flagged=10/10 suspects=0 steps_left=11
66
+ Step 30: INSPECT acc_0021 flagged=10/10 suspects=0 steps_left=10
67
+ Step 31: INSPECT acc_0009 flagged=10/10 suspects=0 steps_left=9
68
+ Step 32: INSPECT acc_0033 flagged=10/10 suspects=0 steps_left=8
69
+ Step 33: INSPECT acc_0044 flagged=10/10 suspects=0 steps_left=7
70
+ Step 34: SUBMIT flagged=10/10 suspects=0 steps_left=7
71
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.77
72
+ ★ GRADER SCORE: 0.9233
73
+
74
+ --- LLM Agent: task=medium, seed=0, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
75
+ Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
76
+ Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
77
+ Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
78
+ Step 4: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=46
79
+ Step 5: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=45
80
+ Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
81
+ Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
82
+ Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
83
+ Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
84
+ Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
85
+ Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=40
86
+ Step 12: FLAG acc_0181 flagged=2/10 suspects=8 steps_left=40
87
+ Step 13: INSPECT acc_0022 flagged=2/10 suspects=8 steps_left=39
88
+ Step 14: FLAG acc_0022 flagged=3/10 suspects=7 steps_left=39
89
+ Step 15: INSPECT acc_0092 flagged=3/10 suspects=7 steps_left=38
90
+ Step 16: FLAG acc_0092 flagged=4/10 suspects=6 steps_left=38
91
+ Step 17: INSPECT acc_0097 flagged=4/10 suspects=6 steps_left=37
92
+ Step 18: FLAG acc_0097 flagged=5/10 suspects=5 steps_left=37
93
+ Step 19: INSPECT acc_0187 flagged=5/10 suspects=5 steps_left=36
94
+ Step 20: FLAG acc_0187 flagged=6/10 suspects=4 steps_left=36
95
+ Step 21: INSPECT acc_0093 flagged=6/10 suspects=4 steps_left=35
96
+ Step 22: FLAG acc_0093 flagged=7/10 suspects=3 steps_left=35
97
+ Step 23: INSPECT acc_0172 flagged=7/10 suspects=3 steps_left=34
98
+ Step 24: FLAG acc_0172 flagged=8/10 suspects=2 steps_left=34
99
+ Step 25: INSPECT acc_0058 flagged=8/10 suspects=2 steps_left=33
100
+ Step 26: FLAG acc_0058 flagged=9/10 suspects=1 steps_left=33
101
+ Step 27: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=32
102
+ Step 28: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=32
103
+ Step 29: INSPECT acc_0148 flagged=10/10 suspects=0 steps_left=31
104
+ Step 30: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=30
105
+ Step 31: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=29
106
+ Step 32: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=28
107
+ Step 33: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=27
108
+ Step 34: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=26
109
+ Step 35: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=25
110
+ Step 36: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=24
111
+ Step 37: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=23
112
+ Step 38: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=22
113
+ Step 39: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=21
114
+ Step 40: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=20
115
+ Step 41: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=19
116
+ Step 42: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=18
117
+ Step 43: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=17
118
+ Step 44: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=16
119
+ Step 45: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=15
120
+ Step 46: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=14
121
+ Step 47: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=13
122
+ Step 48: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=12
123
+ Step 49: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=11
124
+ Step 50: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=10
125
+ Step 51: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=9
126
+ Step 52: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=8
127
+ Step 53: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=7
128
+ Step 54: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=6
129
+ Step 55: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=5
130
+ Step 56: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=4
131
+ Step 57: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=3
132
+ Step 58: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=2
133
+ Step 59: SUBMIT flagged=10/10 suspects=0 steps_left=2
134
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.52
135
+ ★ GRADER SCORE: 0.9040
136
+
137
+ --- LLM Agent: task=hard, seed=0, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
138
+ Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
139
+ Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
140
+ Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
141
+ Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
142
+ Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
143
+ Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
144
+ Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
145
+ Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
146
+ Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
147
+ Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
148
+ Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
149
+ Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
150
+ Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
151
+ Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
152
+ Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
153
+ Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
154
+ Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
155
+ Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
156
+ Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
157
+ Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
158
+ Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
159
+ Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
160
+ Step 23: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=59
161
+ Step 24: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=58
162
+ Step 25: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=58
163
+ Step 26: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=57
164
+ Step 27: FLAG acc_0389 flagged=4/10 suspects=6 steps_left=57
165
+ Step 28: INSPECT acc_0160 flagged=4/10 suspects=6 steps_left=56
166
+ Step 29: FLAG acc_0160 flagged=5/10 suspects=5 steps_left=56
167
+ Step 30: INSPECT acc_0549 flagged=5/10 suspects=5 steps_left=55
168
+ Step 31: FLAG acc_0549 flagged=6/10 suspects=4 steps_left=55
169
+ Step 32: INSPECT acc_0658 flagged=6/10 suspects=4 steps_left=54
170
+ Step 33: FLAG acc_0658 flagged=7/10 suspects=3 steps_left=54
171
+ Step 34: INSPECT acc_0290 flagged=7/10 suspects=3 steps_left=53
172
+ Step 35: FLAG acc_0290 flagged=8/10 suspects=2 steps_left=53
173
+ Step 36: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=52
174
+ Step 37: FLAG acc_0124 flagged=9/10 suspects=1 steps_left=52
175
+ Step 38: INSPECT acc_0507 flagged=9/10 suspects=1 steps_left=51
176
+ Step 39: FLAG acc_0507 flagged=10/10 suspects=0 steps_left=51
177
+ Step 40: INSPECT acc_0086 flagged=10/10 suspects=0 steps_left=50
178
+ Step 41: INSPECT acc_0497 flagged=10/10 suspects=0 steps_left=49
179
+ Step 42: INSPECT acc_0610 flagged=10/10 suspects=0 steps_left=48
180
+ Step 43: INSPECT acc_0579 flagged=10/10 suspects=0 steps_left=47
181
+ Step 44: INSPECT acc_0573 flagged=10/10 suspects=0 steps_left=46
182
+ Step 45: INSPECT acc_0479 flagged=10/10 suspects=0 steps_left=45
183
+ Step 46: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=44
184
+ Step 47: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=43
185
+ Step 48: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=42
186
+ Step 49: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=41
187
+ Step 50: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=40
188
+ Step 51: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=39
189
+ Step 52: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=38
190
+ Step 53: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=37
191
+ Step 54: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=36
192
+ Step 55: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=35
193
+ Step 56: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=34
194
+ Step 57: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=33
195
+ Step 58: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=32
196
+ Step 59: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=31
197
+ Step 60: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=30
198
+ Step 61: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=29
199
+ Step 62: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=28
200
+ Step 63: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=27
201
+ Step 64: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=26
202
+ Step 65: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=25
203
+ Step 66: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=24
204
+ Step 67: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=23
205
+ Step 68: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=22
206
+ Step 69: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=21
207
+ Step 70: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=20
208
+ Step 71: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=19
209
+ Step 72: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=18
210
+ Step 73: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=17
211
+ Step 74: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=16
212
+ Step 75: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=15
213
+ Step 76: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=14
214
+ Step 77: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=13
215
+ Step 78: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=12
216
+ Step 79: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=11
217
+ Step 80: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=10
218
+ Step 81: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=9
219
+ Step 82: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=8
220
+ Step 83: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=7
221
+ Step 84: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=6
222
+ Step 85: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=5
223
+ Step 86: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=4
224
+ Step 87: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=3
225
+ Step 88: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=2
226
+ Step 89: SUBMIT flagged=10/10 suspects=0 steps_left=2
227
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=13.22
228
+ ★ GRADER SCORE: 0.9025
229
+
230
+ Summary: easy=0.9233 medium=0.9040 hard=0.9025
231
+
232
+ ============================================================
233
+ PHASE 3: Score Variance (seeds=[0, 1, 2])
234
+ ============================================================
235
+
236
+ --- LLM Agent: task=easy, seed=0, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
237
+ Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
238
+ Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
239
+ Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
240
+ Step 4: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=28
241
+ Step 5: INSPECT acc_0001 flagged=2/10 suspects=8 steps_left=27
242
+ Step 6: FLAG acc_0001 flagged=3/10 suspects=7 steps_left=27
243
+ Step 7: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=26
244
+ Step 8: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=26
245
+ Step 9: INSPECT acc_0012 flagged=4/10 suspects=6 steps_left=25
246
+ Step 10: FLAG acc_0012 flagged=5/10 suspects=5 steps_left=25
247
+ Step 11: INSPECT acc_0000 flagged=5/10 suspects=5 steps_left=24
248
+ Step 12: FLAG acc_0000 flagged=6/10 suspects=4 steps_left=24
249
+ Step 13: INSPECT acc_0027 flagged=6/10 suspects=4 steps_left=23
250
+ Step 14: FLAG acc_0027 flagged=7/10 suspects=3 steps_left=23
251
+ Step 15: INSPECT acc_0047 flagged=7/10 suspects=3 steps_left=22
252
+ Step 16: FLAG acc_0047 flagged=8/10 suspects=2 steps_left=22
253
+ Step 17: INSPECT acc_0007 flagged=8/10 suspects=2 steps_left=21
254
+ Step 18: FLAG acc_0007 flagged=9/10 suspects=1 steps_left=21
255
+ Step 19: INSPECT acc_0028 flagged=9/10 suspects=1 steps_left=20
256
+ Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
257
+ Step 21: INSPECT acc_0035 flagged=10/10 suspects=0 steps_left=19
258
+ Step 22: INSPECT acc_0022 flagged=10/10 suspects=0 steps_left=18
259
+ Step 23: INSPECT acc_0041 flagged=10/10 suspects=0 steps_left=17
260
+ Step 24: INSPECT acc_0029 flagged=10/10 suspects=0 steps_left=16
261
+ Step 25: INSPECT acc_0018 flagged=10/10 suspects=0 steps_left=15
262
+ Step 26: INSPECT acc_0019 flagged=10/10 suspects=0 steps_left=14
263
+ Step 27: INSPECT acc_0041 flagged=10/10 suspects=0 steps_left=13
264
+ Step 28: INSPECT acc_0040 flagged=10/10 suspects=0 steps_left=12
265
+ Step 29: INSPECT acc_0026 flagged=10/10 suspects=0 steps_left=11
266
+ Step 30: INSPECT acc_0021 flagged=10/10 suspects=0 steps_left=10
267
+ Step 31: INSPECT acc_0009 flagged=10/10 suspects=0 steps_left=9
268
+ Step 32: INSPECT acc_0033 flagged=10/10 suspects=0 steps_left=8
269
+ Step 33: INSPECT acc_0044 flagged=10/10 suspects=0 steps_left=7
270
+ Step 34: SUBMIT flagged=10/10 suspects=0 steps_left=7
271
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.77
272
+ ★ GRADER SCORE: 0.9233
273
+
274
+ --- LLM Agent: task=easy, seed=1, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
275
+ Step 1: INSPECT acc_0034 flagged=0/10 suspects=0 steps_left=29
276
+ Step 2: INSPECT acc_0003 flagged=0/10 suspects=0 steps_left=28
277
+ Step 3: INSPECT acc_0049 flagged=0/10 suspects=0 steps_left=27
278
+ Step 4: INSPECT acc_0006 flagged=0/10 suspects=0 steps_left=26
279
+ Step 5: INSPECT acc_0047 flagged=0/10 suspects=0 steps_left=25
280
+ Step 6: FLAG acc_0047 flagged=1/10 suspects=9 steps_left=25
281
+ Step 7: INSPECT acc_0009 flagged=1/10 suspects=9 steps_left=24
282
+ Step 8: FLAG acc_0009 flagged=2/10 suspects=8 steps_left=24
283
+ Step 9: INSPECT acc_0046 flagged=2/10 suspects=8 steps_left=23
284
+ Step 10: FLAG acc_0046 flagged=3/10 suspects=7 steps_left=23
285
+ Step 11: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=22
286
+ Step 12: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=22
287
+ Step 13: INSPECT acc_0021 flagged=4/10 suspects=6 steps_left=21
288
+ Step 14: FLAG acc_0021 flagged=5/10 suspects=5 steps_left=21
289
+ Step 15: INSPECT acc_0002 flagged=5/10 suspects=5 steps_left=20
290
+ Step 16: FLAG acc_0002 flagged=6/10 suspects=4 steps_left=20
291
+ Step 17: INSPECT acc_0048 flagged=6/10 suspects=4 steps_left=19
292
+ Step 18: FLAG acc_0048 flagged=7/10 suspects=3 steps_left=19
293
+ Step 19: INSPECT acc_0029 flagged=7/10 suspects=3 steps_left=18
294
+ Step 20: FLAG acc_0029 flagged=8/10 suspects=2 steps_left=18
295
+ Step 21: INSPECT acc_0015 flagged=8/10 suspects=2 steps_left=17
296
+ Step 22: FLAG acc_0015 flagged=9/10 suspects=1 steps_left=17
297
+ Step 23: INSPECT acc_0005 flagged=9/10 suspects=1 steps_left=16
298
+ Step 24: FLAG acc_0005 flagged=10/10 suspects=0 steps_left=16
299
+ Step 25: INSPECT acc_0036 flagged=10/10 suspects=0 steps_left=15
300
+ Step 26: INSPECT acc_0027 flagged=10/10 suspects=0 steps_left=14
301
+ Step 27: INSPECT acc_0043 flagged=10/10 suspects=0 steps_left=13
302
+ Step 28: INSPECT acc_0044 flagged=10/10 suspects=0 steps_left=12
303
+ Step 29: INSPECT acc_0038 flagged=10/10 suspects=0 steps_left=11
304
+ Step 30: INSPECT acc_0039 flagged=10/10 suspects=0 steps_left=10
305
+ Step 31: INSPECT acc_0028 flagged=10/10 suspects=0 steps_left=9
306
+ Step 32: INSPECT acc_0022 flagged=10/10 suspects=0 steps_left=8
307
+ Step 33: INSPECT acc_0025 flagged=10/10 suspects=0 steps_left=7
308
+ Step 34: INSPECT acc_0031 flagged=10/10 suspects=0 steps_left=6
309
+ Step 35: INSPECT acc_0007 flagged=10/10 suspects=0 steps_left=5
310
+ Step 36: INSPECT acc_0026 flagged=10/10 suspects=0 steps_left=4
311
+ Step 37: SUBMIT flagged=10/10 suspects=0 steps_left=4
312
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.74
313
+ ★ GRADER SCORE: 0.9133
314
+
315
+ --- LLM Agent: task=easy, seed=2, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
316
+ Step 1: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=29
317
+ Step 2: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=28
318
+ Step 3: INSPECT acc_0025 flagged=0/10 suspects=0 steps_left=27
319
+ Step 4: INSPECT acc_0026 flagged=0/10 suspects=0 steps_left=26
320
+ Step 5: INSPECT acc_0038 flagged=0/10 suspects=0 steps_left=25
321
+ Step 6: INSPECT acc_0029 flagged=0/10 suspects=0 steps_left=24
322
+ Step 7: FLAG acc_0029 flagged=1/10 suspects=9 steps_left=24
323
+ Step 8: INSPECT acc_0006 flagged=1/10 suspects=9 steps_left=23
324
+ Step 9: FLAG acc_0006 flagged=2/10 suspects=8 steps_left=23
325
+ Step 10: INSPECT acc_0033 flagged=2/10 suspects=8 steps_left=22
326
+ Step 11: FLAG acc_0033 flagged=3/10 suspects=7 steps_left=22
327
+ Step 12: INSPECT acc_0015 flagged=3/10 suspects=7 steps_left=21
328
+ Step 13: FLAG acc_0015 flagged=4/10 suspects=6 steps_left=21
329
+ Step 14: INSPECT acc_0022 flagged=4/10 suspects=6 steps_left=20
330
+ Step 15: FLAG acc_0022 flagged=5/10 suspects=5 steps_left=20
331
+ Step 16: INSPECT acc_0009 flagged=5/10 suspects=5 steps_left=19
332
+ Step 17: FLAG acc_0009 flagged=6/10 suspects=4 steps_left=19
333
+ Step 18: INSPECT acc_0004 flagged=6/10 suspects=4 steps_left=18
334
+ Step 19: FLAG acc_0004 flagged=7/10 suspects=3 steps_left=18
335
+ Step 20: INSPECT acc_0024 flagged=7/10 suspects=3 steps_left=17
336
+ Step 21: FLAG acc_0024 flagged=8/10 suspects=2 steps_left=17
337
+ Step 22: INSPECT acc_0049 flagged=8/10 suspects=2 steps_left=16
338
+ Step 23: FLAG acc_0049 flagged=9/10 suspects=1 steps_left=16
339
+ Step 24: INSPECT acc_0035 flagged=9/10 suspects=1 steps_left=15
340
+ Step 25: FLAG acc_0035 flagged=10/10 suspects=0 steps_left=15
341
+ Step 26: INSPECT acc_0044 flagged=10/10 suspects=0 steps_left=14
342
+ Step 27: INSPECT acc_0016 flagged=10/10 suspects=0 steps_left=13
343
+ Step 28: INSPECT acc_0043 flagged=10/10 suspects=0 steps_left=12
344
+ Step 29: INSPECT acc_0003 flagged=10/10 suspects=0 steps_left=11
345
+ Step 30: INSPECT acc_0028 flagged=10/10 suspects=0 steps_left=10
346
+ Step 31: INSPECT acc_0027 flagged=10/10 suspects=0 steps_left=9
347
+ Step 32: INSPECT acc_0023 flagged=10/10 suspects=0 steps_left=8
348
+ Step 33: INSPECT acc_0041 flagged=10/10 suspects=0 steps_left=7
349
+ Step 34: INSPECT acc_0045 flagged=10/10 suspects=0 steps_left=6
350
+ Step 35: INSPECT acc_0039 flagged=10/10 suspects=0 steps_left=5
351
+ Step 36: INSPECT acc_0048 flagged=10/10 suspects=0 steps_left=4
352
+ Step 37: INSPECT acc_0046 flagged=10/10 suspects=0 steps_left=3
353
+ Step 38: SUBMIT flagged=10/10 suspects=0 steps_left=3
354
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.73
355
+ ★ GRADER SCORE: 0.9100
356
+
357
+ easy: scores=['0.923', '0.913', '0.910'] mean=0.9155 var=0.000032
358
+
359
+ --- LLM Agent: task=medium, seed=0, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
360
+ Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
361
+ Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
362
+ Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
363
+ Step 4: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=46
364
+ Step 5: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=45
365
+ Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
366
+ Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
367
+ Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
368
+ Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
369
+ Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
370
+ Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=40
371
+ Step 12: FLAG acc_0181 flagged=2/10 suspects=8 steps_left=40
372
+ Step 13: INSPECT acc_0022 flagged=2/10 suspects=8 steps_left=39
373
+ Step 14: FLAG acc_0022 flagged=3/10 suspects=7 steps_left=39
374
+ Step 15: INSPECT acc_0092 flagged=3/10 suspects=7 steps_left=38
375
+ Step 16: FLAG acc_0092 flagged=4/10 suspects=6 steps_left=38
376
+ Step 17: INSPECT acc_0097 flagged=4/10 suspects=6 steps_left=37
377
+ Step 18: FLAG acc_0097 flagged=5/10 suspects=5 steps_left=37
378
+ Step 19: INSPECT acc_0187 flagged=5/10 suspects=5 steps_left=36
379
+ Step 20: FLAG acc_0187 flagged=6/10 suspects=4 steps_left=36
380
+ Step 21: INSPECT acc_0093 flagged=6/10 suspects=4 steps_left=35
381
+ Step 22: FLAG acc_0093 flagged=7/10 suspects=3 steps_left=35
382
+ Step 23: INSPECT acc_0172 flagged=7/10 suspects=3 steps_left=34
383
+ Step 24: FLAG acc_0172 flagged=8/10 suspects=2 steps_left=34
384
+ Step 25: INSPECT acc_0058 flagged=8/10 suspects=2 steps_left=33
385
+ Step 26: FLAG acc_0058 flagged=9/10 suspects=1 steps_left=33
386
+ Step 27: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=32
387
+ Step 28: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=32
388
+ Step 29: INSPECT acc_0148 flagged=10/10 suspects=0 steps_left=31
389
+ Step 30: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=30
390
+ Step 31: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=29
391
+ Step 32: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=28
392
+ Step 33: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=27
393
+ Step 34: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=26
394
+ Step 35: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=25
395
+ Step 36: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=24
396
+ Step 37: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=23
397
+ Step 38: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=22
398
+ Step 39: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=21
399
+ Step 40: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=20
400
+ Step 41: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=19
401
+ Step 42: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=18
402
+ Step 43: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=17
403
+ Step 44: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=16
404
+ Step 45: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=15
405
+ Step 46: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=14
406
+ Step 47: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=13
407
+ Step 48: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=12
408
+ Step 49: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=11
409
+ Step 50: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=10
410
+ Step 51: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=9
411
+ Step 52: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=8
412
+ Step 53: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=7
413
+ Step 54: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=6
414
+ Step 55: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=5
415
+ Step 56: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=4
416
+ Step 57: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=3
417
+ Step 58: INSPECT acc_0078 flagged=10/10 suspects=0 steps_left=2
418
+ Step 59: INSPECT acc_0179 flagged=10/10 suspects=0 steps_left=1
419
+ Step 60: SUBMIT flagged=10/10 suspects=0 steps_left=1
420
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.51
421
+ ★ GRADER SCORE: 0.9020
422
+
423
+ --- LLM Agent: task=medium, seed=1, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
424
+ Step 1: INSPECT acc_0171 flagged=0/10 suspects=0 steps_left=49
425
+ Step 2: INSPECT acc_0099 flagged=0/10 suspects=0 steps_left=48
426
+ Step 3: INSPECT acc_0152 flagged=0/10 suspects=0 steps_left=47
427
+ Step 4: INSPECT acc_0092 flagged=0/10 suspects=0 steps_left=46
428
+ Step 5: INSPECT acc_0078 flagged=0/10 suspects=0 steps_left=45
429
+ Step 6: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=44
430
+ Step 7: INSPECT acc_0012 flagged=0/10 suspects=0 steps_left=43
431
+ Step 8: FLAG acc_0012 flagged=1/10 suspects=8 steps_left=43
432
+ Step 9: INSPECT acc_0033 flagged=1/10 suspects=8 steps_left=42
433
+ Step 10: FLAG acc_0033 flagged=2/10 suspects=8 steps_left=42
434
+ Step 11: INSPECT acc_0174 flagged=2/10 suspects=8 steps_left=41
435
+ Step 12: FLAG acc_0174 flagged=3/10 suspects=7 steps_left=41
436
+ Step 13: INSPECT acc_0187 flagged=3/10 suspects=7 steps_left=40
437
+ Step 14: FLAG acc_0187 flagged=4/10 suspects=6 steps_left=40
438
+ Step 15: INSPECT acc_0079 flagged=4/10 suspects=6 steps_left=39
439
+ Step 16: FLAG acc_0079 flagged=5/10 suspects=5 steps_left=39
440
+ Step 17: INSPECT acc_0032 flagged=5/10 suspects=5 steps_left=38
441
+ Step 18: FLAG acc_0032 flagged=6/10 suspects=4 steps_left=38
442
+ Step 19: INSPECT acc_0023 flagged=6/10 suspects=4 steps_left=37
443
+ Step 20: FLAG acc_0023 flagged=7/10 suspects=3 steps_left=37
444
+ Step 21: INSPECT acc_0146 flagged=7/10 suspects=3 steps_left=36
445
+ Step 22: FLAG acc_0146 flagged=8/10 suspects=2 steps_left=36
446
+ Step 23: INSPECT acc_0019 flagged=8/10 suspects=2 steps_left=35
447
+ Step 24: FLAG acc_0019 flagged=9/10 suspects=1 steps_left=35
448
+ Step 25: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=34
449
+ Step 26: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=34
450
+ Step 27: INSPECT acc_0168 flagged=10/10 suspects=0 steps_left=33
451
+ Step 28: INSPECT acc_0198 flagged=10/10 suspects=0 steps_left=32
452
+ Step 29: INSPECT acc_0186 flagged=10/10 suspects=0 steps_left=31
453
+ Step 30: INSPECT acc_0099 flagged=10/10 suspects=0 steps_left=30
454
+ Step 31: INSPECT acc_0084 flagged=10/10 suspects=0 steps_left=29
455
+ Step 32: INSPECT acc_0117 flagged=10/10 suspects=0 steps_left=28
456
+ Step 33: INSPECT acc_0192 flagged=10/10 suspects=0 steps_left=27
457
+ Step 34: INSPECT acc_0025 flagged=10/10 suspects=0 steps_left=26
458
+ Step 35: INSPECT acc_0176 flagged=10/10 suspects=0 steps_left=25
459
+ Step 36: INSPECT acc_0185 flagged=10/10 suspects=0 steps_left=24
460
+ Step 37: INSPECT acc_0027 flagged=10/10 suspects=0 steps_left=23
461
+ Step 38: INSPECT acc_0199 flagged=10/10 suspects=0 steps_left=22
462
+ Step 39: INSPECT acc_0135 flagged=10/10 suspects=0 steps_left=21
463
+ Step 40: INSPECT acc_0082 flagged=10/10 suspects=0 steps_left=20
464
+ Step 41: INSPECT acc_0002 flagged=10/10 suspects=0 steps_left=19
465
+ Step 42: INSPECT acc_0161 flagged=10/10 suspects=0 steps_left=18
466
+ Step 43: INSPECT acc_0067 flagged=10/10 suspects=0 steps_left=17
467
+ Step 44: INSPECT acc_0062 flagged=10/10 suspects=0 steps_left=16
468
+ Step 45: INSPECT acc_0034 flagged=10/10 suspects=0 steps_left=15
469
+ Step 46: INSPECT acc_0010 flagged=10/10 suspects=0 steps_left=14
470
+ Step 47: INSPECT acc_0173 flagged=10/10 suspects=0 steps_left=13
471
+ Step 48: INSPECT acc_0081 flagged=10/10 suspects=0 steps_left=12
472
+ Step 49: INSPECT acc_0132 flagged=10/10 suspects=0 steps_left=11
473
+ Step 50: INSPECT acc_0094 flagged=10/10 suspects=0 steps_left=10
474
+ Step 51: INSPECT acc_0089 flagged=10/10 suspects=0 steps_left=9
475
+ Step 52: INSPECT acc_0046 flagged=10/10 suspects=0 steps_left=8
476
+ Step 53: INSPECT acc_0116 flagged=10/10 suspects=0 steps_left=7
477
+ Step 54: INSPECT acc_0121 flagged=10/10 suspects=0 steps_left=6
478
+ Step 55: INSPECT acc_0156 flagged=10/10 suspects=0 steps_left=5
479
+ Step 56: INSPECT acc_0141 flagged=10/10 suspects=0 steps_left=4
480
+ Step 57: INSPECT acc_0188 flagged=10/10 suspects=0 steps_left=3
481
+ Step 58: SUBMIT flagged=10/10 suspects=0 steps_left=3
482
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.53
483
+ ★ GRADER SCORE: 0.9060
484
+
485
+ --- LLM Agent: task=medium, seed=2, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
486
+ Step 1: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=49
487
+ Step 2: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=48
488
+ Step 3: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=47
489
+ Step 4: INSPECT acc_0030 flagged=0/10 suspects=0 steps_left=46
490
+ Step 5: INSPECT acc_0041 flagged=0/10 suspects=0 steps_left=45
491
+ Step 6: INSPECT acc_0054 flagged=0/10 suspects=0 steps_left=44
492
+ Step 7: INSPECT acc_0199 flagged=0/10 suspects=0 steps_left=43
493
+ Step 8: INSPECT acc_0181 flagged=0/10 suspects=0 steps_left=42
494
+ Step 9: INSPECT acc_0166 flagged=0/10 suspects=0 steps_left=41
495
+ Step 10: INSPECT acc_0098 flagged=0/10 suspects=0 steps_left=40
496
+ Step 11: INSPECT acc_0121 flagged=0/10 suspects=0 steps_left=39
497
+ Step 12: INSPECT acc_0053 flagged=0/10 suspects=0 steps_left=38
498
+ Step 13: INSPECT acc_0103 flagged=0/10 suspects=0 steps_left=37
499
+ Step 14: INSPECT acc_0168 flagged=0/10 suspects=0 steps_left=36
500
+ Step 15: INSPECT acc_0000 flagged=0/10 suspects=0 steps_left=35
501
+ Step 16: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=34
502
+ Step 17: INSPECT acc_0149 flagged=0/10 suspects=0 steps_left=33
503
+ Step 18: INSPECT acc_0064 flagged=0/10 suspects=0 steps_left=32
504
+ Step 19: INSPECT acc_0016 flagged=0/10 suspects=0 steps_left=31
505
+ Step 20: INSPECT acc_0105 flagged=0/10 suspects=0 steps_left=30
506
+ Step 21: INSPECT acc_0035 flagged=0/10 suspects=0 steps_left=29
507
+ Step 22: FLAG acc_0035 flagged=1/10 suspects=9 steps_left=29
508
+ Step 23: INSPECT acc_0020 flagged=1/10 suspects=9 steps_left=28
509
+ Step 24: FLAG acc_0020 flagged=2/10 suspects=8 steps_left=28
510
+ Step 25: INSPECT acc_0036 flagged=2/10 suspects=8 steps_left=27
511
+ Step 26: FLAG acc_0036 flagged=3/10 suspects=7 steps_left=27
512
+ Step 27: INSPECT acc_0050 flagged=3/10 suspects=7 steps_left=26
513
+ Step 28: FLAG acc_0050 flagged=4/10 suspects=6 steps_left=26
514
+ Step 29: INSPECT acc_0051 flagged=4/10 suspects=6 steps_left=25
515
+ Step 30: FLAG acc_0051 flagged=5/10 suspects=5 steps_left=25
516
+ Step 31: INSPECT acc_0085 flagged=5/10 suspects=5 steps_left=24
517
+ Step 32: FLAG acc_0085 flagged=6/10 suspects=4 steps_left=24
518
+ Step 33: INSPECT acc_0177 flagged=6/10 suspects=4 steps_left=23
519
+ Step 34: FLAG acc_0177 flagged=7/10 suspects=3 steps_left=23
520
+ Step 35: INSPECT acc_0170 flagged=7/10 suspects=3 steps_left=22
521
+ Step 36: FLAG acc_0170 flagged=8/10 suspects=2 steps_left=22
522
+ Step 37: INSPECT acc_0055 flagged=8/10 suspects=2 steps_left=21
523
+ Step 38: FLAG acc_0055 flagged=9/10 suspects=1 steps_left=21
524
+ Step 39: INSPECT acc_0094 flagged=9/10 suspects=1 steps_left=20
525
+ Step 40: FLAG acc_0094 flagged=10/10 suspects=0 steps_left=20
526
+ Step 41: INSPECT acc_0185 flagged=10/10 suspects=0 steps_left=19
527
+ Step 42: INSPECT acc_0175 flagged=10/10 suspects=0 steps_left=18
528
+ Step 43: INSPECT acc_0192 flagged=10/10 suspects=0 steps_left=17
529
+ Step 44: INSPECT acc_0013 flagged=10/10 suspects=0 steps_left=16
530
+ Step 45: INSPECT acc_0099 flagged=10/10 suspects=0 steps_left=15
531
+ Step 46: INSPECT acc_0075 flagged=10/10 suspects=0 steps_left=14
532
+ Step 47: INSPECT acc_0058 flagged=10/10 suspects=0 steps_left=13
533
+ Step 48: INSPECT acc_0072 flagged=10/10 suspects=0 steps_left=12
534
+ Step 49: INSPECT acc_0070 flagged=10/10 suspects=0 steps_left=11
535
+ Step 50: INSPECT acc_0071 flagged=10/10 suspects=0 steps_left=10
536
+ Step 51: INSPECT acc_0175 flagged=10/10 suspects=0 steps_left=9
537
+ Step 52: INSPECT acc_0181 flagged=10/10 suspects=0 steps_left=8
538
+ Step 53: INSPECT acc_0175 flagged=10/10 suspects=0 steps_left=7
539
+ Step 54: INSPECT acc_0181 flagged=10/10 suspects=0 steps_left=6
540
+ Step 55: INSPECT acc_0175 flagged=10/10 suspects=0 steps_left=5
541
+ Step 56: INSPECT acc_0181 flagged=10/10 suspects=0 steps_left=4
542
+ Step 57: INSPECT acc_0175 flagged=10/10 suspects=0 steps_left=3
543
+ Step 58: INSPECT acc_0181 flagged=10/10 suspects=0 steps_left=2
544
+ Step 59: INSPECT acc_0175 flagged=10/10 suspects=0 steps_left=1
545
+ Step 60: SUBMIT flagged=10/10 suspects=0 steps_left=1
546
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.51
547
+ ★ GRADER SCORE: 0.9020
548
+
549
+ medium: scores=['0.902', '0.906', '0.902'] mean=0.9033 var=0.000004
550
+
551
+ --- LLM Agent: task=hard, seed=0, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
552
+ Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
553
+ Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
554
+ Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
555
+ Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
556
+ Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
557
+ Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
558
+ Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
559
+ Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
560
+ Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
561
+ Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
562
+ Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
563
+ Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
564
+ Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
565
+ Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
566
+ Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
567
+ Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
568
+ Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
569
+ Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
570
+ Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
571
+ Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
572
+ Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
573
+ Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
574
+ Step 23: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=59
575
+ Step 24: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=58
576
+ Step 25: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=58
577
+ Step 26: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=57
578
+ Step 27: FLAG acc_0389 flagged=4/10 suspects=6 steps_left=57
579
+ Step 28: INSPECT acc_0160 flagged=4/10 suspects=6 steps_left=56
580
+ Step 29: FLAG acc_0160 flagged=5/10 suspects=5 steps_left=56
581
+ Step 30: INSPECT acc_0549 flagged=5/10 suspects=5 steps_left=55
582
+ Step 31: FLAG acc_0549 flagged=6/10 suspects=4 steps_left=55
583
+ Step 32: INSPECT acc_0658 flagged=6/10 suspects=4 steps_left=54
584
+ Step 33: FLAG acc_0658 flagged=7/10 suspects=3 steps_left=54
585
+ Step 34: INSPECT acc_0290 flagged=7/10 suspects=3 steps_left=53
586
+ Step 35: FLAG acc_0290 flagged=8/10 suspects=2 steps_left=53
587
+ Step 36: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=52
588
+ Step 37: INSPECT acc_0507 flagged=8/10 suspects=2 steps_left=51
589
+ Step 38: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=50
590
+ Step 39: FLAG acc_0507 flagged=9/10 suspects=1 steps_left=50
591
+ Step 40: FLAG acc_0124 flagged=10/10 suspects=0 steps_left=50
592
+ Step 41: INSPECT acc_0086 flagged=10/10 suspects=0 steps_left=49
593
+ Step 42: INSPECT acc_0497 flagged=10/10 suspects=0 steps_left=48
594
+ Step 43: INSPECT acc_0610 flagged=10/10 suspects=0 steps_left=47
595
+ Step 44: INSPECT acc_0579 flagged=10/10 suspects=0 steps_left=46
596
+ Step 45: INSPECT acc_0573 flagged=10/10 suspects=0 steps_left=45
597
+ Step 46: INSPECT acc_0479 flagged=10/10 suspects=0 steps_left=44
598
+ Step 47: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=43
599
+ Step 48: INSPECT acc_0438 flagged=10/10 suspects=0 steps_left=42
600
+ Step 49: INSPECT acc_0287 flagged=10/10 suspects=0 steps_left=41
601
+ Step 50: INSPECT acc_0960 flagged=10/10 suspects=0 steps_left=40
602
+ Step 51: INSPECT acc_0455 flagged=10/10 suspects=0 steps_left=39
603
+ Step 52: INSPECT acc_0248 flagged=10/10 suspects=0 steps_left=38
604
+ Step 53: INSPECT acc_0964 flagged=10/10 suspects=0 steps_left=37
605
+ Step 54: INSPECT acc_0480 flagged=10/10 suspects=0 steps_left=36
606
+ Step 55: INSPECT acc_0154 flagged=10/10 suspects=0 steps_left=35
607
+ Step 56: INSPECT acc_0368 flagged=10/10 suspects=0 steps_left=34
608
+ Step 57: INSPECT acc_0426 flagged=10/10 suspects=0 steps_left=33
609
+ Step 58: INSPECT acc_0810 flagged=10/10 suspects=0 steps_left=32
610
+ Step 59: INSPECT acc_0040 flagged=10/10 suspects=0 steps_left=31
611
+ Step 60: INSPECT acc_0538 flagged=10/10 suspects=0 steps_left=30
612
+ Step 61: INSPECT acc_0940 flagged=10/10 suspects=0 steps_left=29
613
+ Step 62: INSPECT acc_0668 flagged=10/10 suspects=0 steps_left=28
614
+ Step 63: INSPECT acc_0721 flagged=10/10 suspects=0 steps_left=27
615
+ Step 64: INSPECT acc_0787 flagged=10/10 suspects=0 steps_left=26
616
+ Step 65: INSPECT acc_0639 flagged=10/10 suspects=0 steps_left=25
617
+ Step 66: INSPECT acc_0700 flagged=10/10 suspects=0 steps_left=24
618
+ Step 67: INSPECT acc_0353 flagged=10/10 suspects=0 steps_left=23
619
+ Step 68: INSPECT acc_0620 flagged=10/10 suspects=0 steps_left=22
620
+ Step 69: INSPECT acc_0499 flagged=10/10 suspects=0 steps_left=21
621
+ Step 70: INSPECT acc_0207 flagged=10/10 suspects=0 steps_left=20
622
+ Step 71: INSPECT acc_0011 flagged=10/10 suspects=0 steps_left=19
623
+ Step 72: INSPECT acc_0524 flagged=10/10 suspects=0 steps_left=18
624
+ Step 73: INSPECT acc_0553 flagged=10/10 suspects=0 steps_left=17
625
+ Step 74: INSPECT acc_0948 flagged=10/10 suspects=0 steps_left=16
626
+ Step 75: INSPECT acc_0333 flagged=10/10 suspects=0 steps_left=15
627
+ Step 76: INSPECT acc_0574 flagged=10/10 suspects=0 steps_left=14
628
+ Step 77: INSPECT acc_0258 flagged=10/10 suspects=0 steps_left=13
629
+ Step 78: INSPECT acc_0742 flagged=10/10 suspects=0 steps_left=12
630
+ Step 79: INSPECT acc_0354 flagged=10/10 suspects=0 steps_left=11
631
+ Step 80: INSPECT acc_0022 flagged=10/10 suspects=0 steps_left=10
632
+ Step 81: INSPECT acc_0232 flagged=10/10 suspects=0 steps_left=9
633
+ Step 82: INSPECT acc_0123 flagged=10/10 suspects=0 steps_left=8
634
+ Step 83: INSPECT acc_0844 flagged=10/10 suspects=0 steps_left=7
635
+ Step 84: INSPECT acc_0757 flagged=10/10 suspects=0 steps_left=6
636
+ Step 85: INSPECT acc_0653 flagged=10/10 suspects=0 steps_left=5
637
+ Step 86: INSPECT acc_0119 flagged=10/10 suspects=0 steps_left=4
638
+ Step 87: INSPECT acc_0514 flagged=10/10 suspects=0 steps_left=3
639
+ Step 88: SUBMIT flagged=10/10 suspects=0 steps_left=3
640
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=13.23
641
+ ★ GRADER SCORE: 0.9038
642
+
643
+ --- LLM Agent: task=hard, seed=1, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
644
+ Step 1: INSPECT acc_0014 flagged=0/10 suspects=0 steps_left=79
645
+ Step 2: INSPECT acc_0835 flagged=0/10 suspects=0 steps_left=78
646
+ Step 3: INSPECT acc_0855 flagged=0/10 suspects=0 steps_left=77
647
+ Step 4: INSPECT acc_0930 flagged=0/10 suspects=0 steps_left=76
648
+ Step 5: INSPECT acc_0336 flagged=0/10 suspects=0 steps_left=75
649
+ Step 6: INSPECT acc_0929 flagged=0/10 suspects=0 steps_left=74
650
+ Step 7: INSPECT acc_0076 flagged=0/10 suspects=0 steps_left=73
651
+ Step 8: INSPECT acc_0543 flagged=0/10 suspects=0 steps_left=72
652
+ Step 9: INSPECT acc_0590 flagged=0/10 suspects=0 steps_left=71
653
+ Step 10: INSPECT acc_0401 flagged=0/10 suspects=0 steps_left=70
654
+ Step 11: INSPECT acc_0322 flagged=0/10 suspects=0 steps_left=69
655
+ Step 12: INSPECT acc_0154 flagged=0/10 suspects=0 steps_left=68
656
+ Step 13: INSPECT acc_0374 flagged=0/10 suspects=0 steps_left=67
657
+ Step 14: INSPECT acc_0549 flagged=0/10 suspects=0 steps_left=66
658
+ Step 15: INSPECT acc_0903 flagged=0/10 suspects=0 steps_left=65
659
+ Step 16: INSPECT acc_0976 flagged=0/10 suspects=0 steps_left=64
660
+ Step 17: INSPECT acc_0620 flagged=0/10 suspects=0 steps_left=63
661
+ Step 18: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=62
662
+ Step 19: INSPECT acc_0222 flagged=0/10 suspects=0 steps_left=61
663
+ Step 20: INSPECT acc_0536 flagged=0/10 suspects=0 steps_left=60
664
+ Step 21: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=59
665
+ Step 22: INSPECT acc_0577 flagged=0/10 suspects=0 steps_left=58
666
+ Step 23: INSPECT acc_0517 flagged=0/10 suspects=0 steps_left=57
667
+ Step 24: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=56
668
+ Step 25: INSPECT acc_0167 flagged=0/10 suspects=0 steps_left=55
669
+ Step 26: INSPECT acc_0697 flagged=0/10 suspects=0 steps_left=54
670
+ Step 27: INSPECT acc_0271 flagged=0/10 suspects=0 steps_left=53
671
+ Step 28: INSPECT acc_0681 flagged=0/10 suspects=0 steps_left=52
672
+ Step 29: INSPECT acc_0530 flagged=0/10 suspects=0 steps_left=51
673
+ Step 30: INSPECT acc_0353 flagged=0/10 suspects=0 steps_left=50
674
+ Step 31: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=49
675
+ Step 32: INSPECT acc_0777 flagged=0/10 suspects=0 steps_left=48
676
+ Step 33: INSPECT acc_0265 flagged=0/10 suspects=0 steps_left=47
677
+ Step 34: INSPECT acc_0788 flagged=0/10 suspects=0 steps_left=46
678
+ Step 35: INSPECT acc_0033 flagged=0/10 suspects=0 steps_left=45
679
+ Step 36: INSPECT acc_0187 flagged=0/10 suspects=0 steps_left=44
680
+ Step 37: INSPECT acc_0445 flagged=0/10 suspects=0 steps_left=43
681
+ Step 38: INSPECT acc_0846 flagged=0/10 suspects=0 steps_left=42
682
+ Step 39: INSPECT acc_0659 flagged=0/10 suspects=0 steps_left=41
683
+ Step 40: INSPECT acc_0677 flagged=0/10 suspects=0 steps_left=40
684
+ Step 41: INSPECT acc_0768 flagged=0/10 suspects=0 steps_left=39
685
+ Step 42: INSPECT acc_0539 flagged=0/10 suspects=0 steps_left=38
686
+ Step 43: INSPECT acc_0742 flagged=0/10 suspects=0 steps_left=37
687
+ Step 44: INSPECT acc_0503 flagged=0/10 suspects=0 steps_left=36
688
+ Step 45: INSPECT acc_0876 flagged=0/10 suspects=0 steps_left=35
689
+ Step 46: INSPECT acc_0639 flagged=0/10 suspects=0 steps_left=34
690
+ Step 47: INSPECT acc_0494 flagged=0/10 suspects=0 steps_left=33
691
+ Step 48: INSPECT acc_0898 flagged=0/10 suspects=0 steps_left=32
692
+ Step 49: INSPECT acc_0553 flagged=0/10 suspects=0 steps_left=31
693
+ Step 50: INSPECT acc_0588 flagged=0/10 suspects=0 steps_left=30
694
+ Step 51: INSPECT acc_0194 flagged=0/10 suspects=0 steps_left=29
695
+ Step 52: INSPECT acc_0810 flagged=0/10 suspects=0 steps_left=28
696
+ Step 53: INSPECT acc_0355 flagged=0/10 suspects=0 steps_left=27
697
+ Step 54: INSPECT acc_0363 flagged=0/10 suspects=0 steps_left=26
698
+ Step 55: INSPECT acc_0221 flagged=0/10 suspects=0 steps_left=25
699
+ Step 56: INSPECT acc_0580 flagged=0/10 suspects=0 steps_left=24
700
+ Step 57: INSPECT acc_0534 flagged=0/10 suspects=0 steps_left=23
701
+ Step 58: INSPECT acc_0778 flagged=0/10 suspects=0 steps_left=22
702
+ Step 59: INSPECT acc_0998 flagged=0/10 suspects=0 steps_left=21
703
+ Step 60: INSPECT acc_0233 flagged=0/10 suspects=0 steps_left=20
704
+ Step 61: INSPECT acc_0052 flagged=0/10 suspects=0 steps_left=19
705
+ Step 62: INSPECT acc_0813 flagged=0/10 suspects=0 steps_left=18
706
+ Step 63: INSPECT acc_0035 flagged=0/10 suspects=0 steps_left=17
707
+ Step 64: INSPECT acc_0667 flagged=0/10 suspects=0 steps_left=16
708
+ Step 65: INSPECT acc_0019 flagged=0/10 suspects=0 steps_left=15
709
+ Step 66: INSPECT acc_0959 flagged=0/10 suspects=0 steps_left=14
710
+ Step 67: INSPECT acc_0212 flagged=0/10 suspects=0 steps_left=13
711
+ Step 68: INSPECT acc_0776 flagged=0/10 suspects=0 steps_left=12
712
+ Step 69: INSPECT acc_0049 flagged=0/10 suspects=0 steps_left=11
713
+ Step 70: INSPECT acc_0434 flagged=0/10 suspects=0 steps_left=10
714
+ Step 71: INSPECT acc_0827 flagged=0/10 suspects=0 steps_left=9
715
+ Step 72: INSPECT acc_0583 flagged=0/10 suspects=0 steps_left=8
716
+ Step 73: INSPECT acc_0065 flagged=0/10 suspects=0 steps_left=7
717
+ Step 74: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=6
718
+ Step 75: INSPECT acc_0761 flagged=0/10 suspects=0 steps_left=5
719
+ Step 76: INSPECT acc_0995 flagged=0/10 suspects=0 steps_left=4
720
+ Step 77: INSPECT acc_0157 flagged=0/10 suspects=0 steps_left=3
721
+ Step 78: INSPECT acc_0936 flagged=0/10 suspects=0 steps_left=2
722
+ Step 79: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=1
723
+ Step 80: INSPECT acc_0691 flagged=0/10 suspects=0 steps_left=0
724
+ → Episode ended: [LOSS] TP=0 FP=0 FN=10 Recall=0.00 Precision=0.00 Episode reward=-9.80
725
+ ★ GRADER SCORE: 0.0000
726
+
727
+ --- LLM Agent: task=hard, seed=2, model=Bedrock/us.meta.llama4-scout-17b-instruct-v1:0 ---
728
+ Step 1: INSPECT acc_0813 flagged=0/10 suspects=0 steps_left=79
729
+ Step 2: INSPECT acc_0430 flagged=0/10 suspects=0 steps_left=78
730
+ Step 3: INSPECT acc_0817 flagged=0/10 suspects=0 steps_left=77
731
+ Step 4: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=76
732
+ Step 5: INSPECT acc_0523 flagged=0/10 suspects=0 steps_left=75
733
+ Step 6: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=74
734
+ Step 7: INSPECT acc_0797 flagged=0/10 suspects=0 steps_left=73
735
+ Step 8: INSPECT acc_0478 flagged=0/10 suspects=0 steps_left=72
736
+ Step 9: INSPECT acc_0861 flagged=0/10 suspects=0 steps_left=71
737
+ Step 10: INSPECT acc_0836 flagged=0/10 suspects=0 steps_left=70
738
+ Step 11: INSPECT acc_0926 flagged=0/10 suspects=0 steps_left=69
739
+ Step 12: INSPECT acc_0664 flagged=0/10 suspects=0 steps_left=68
740
+ Step 13: INSPECT acc_0255 flagged=0/10 suspects=0 steps_left=67
741
+ Step 14: INSPECT acc_0938 flagged=0/10 suspects=0 steps_left=66
742
+ Step 15: INSPECT acc_0672 flagged=0/10 suspects=0 steps_left=65
743
+ Step 16: FLAG acc_0672 flagged=1/10 suspects=6 steps_left=65
744
+ Step 17: INSPECT acc_0659 flagged=1/10 suspects=6 steps_left=64
745
+ Step 18: FLAG acc_0659 flagged=2/10 suspects=5 steps_left=64
746
+ Step 19: INSPECT acc_0290 flagged=2/10 suspects=5 steps_left=63
747
+ Step 20: FLAG acc_0290 flagged=3/10 suspects=5 steps_left=63
748
+ Step 21: INSPECT acc_0339 flagged=3/10 suspects=5 steps_left=62
749
+ Step 22: FLAG acc_0339 flagged=4/10 suspects=6 steps_left=62
750
+ Step 23: INSPECT acc_0544 flagged=4/10 suspects=6 steps_left=61
751
+ Step 24: FLAG acc_0544 flagged=5/10 suspects=5 steps_left=61
752
+ Step 25: INSPECT acc_0696 flagged=5/10 suspects=5 steps_left=60
753
+ Step 26: FLAG acc_0696 flagged=6/10 suspects=4 steps_left=60
754
+ Step 27: INSPECT acc_0541 flagged=6/10 suspects=4 steps_left=59
755
+ Step 28: FLAG acc_0541 flagged=7/10 suspects=3 steps_left=59
756
+ Step 29: INSPECT acc_0793 flagged=7/10 suspects=3 steps_left=58
757
+ Step 30: FLAG acc_0793 flagged=8/10 suspects=2 steps_left=58
758
+ Step 31: INSPECT acc_0214 flagged=8/10 suspects=2 steps_left=57
759
+ Step 32: FLAG acc_0214 flagged=9/10 suspects=1 steps_left=57
760
+ Step 33: INSPECT acc_0112 flagged=9/10 suspects=1 steps_left=56
761
+ Step 34: FLAG acc_0112 flagged=10/10 suspects=0 steps_left=56
762
+ Step 35: INSPECT acc_0348 flagged=10/10 suspects=0 steps_left=55
763
+ Step 36: INSPECT acc_0721 flagged=10/10 suspects=0 steps_left=54
764
+ Step 37: INSPECT acc_0321 flagged=10/10 suspects=0 steps_left=53
765
+ Step 38: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=52
766
+ Step 39: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=51
767
+ Step 40: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=50
768
+ Step 41: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=49
769
+ Step 42: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=48
770
+ Step 43: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=47
771
+ Step 44: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=46
772
+ Step 45: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=45
773
+ Step 46: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=44
774
+ Step 47: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=43
775
+ Step 48: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=42
776
+ Step 49: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=41
777
+ Step 50: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=40
778
+ Step 51: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=39
779
+ Step 52: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=38
780
+ Step 53: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=37
781
+ Step 54: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=36
782
+ Step 55: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=35
783
+ Step 56: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=34
784
+ Step 57: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=33
785
+ Step 58: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=32
786
+ Step 59: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=31
787
+ Step 60: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=30
788
+ Step 61: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=29
789
+ Step 62: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=28
790
+ Step 63: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=27
791
+ Step 64: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=26
792
+ Step 65: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=25
793
+ Step 66: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=24
794
+ Step 67: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=23
795
+ Step 68: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=22
796
+ Step 69: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=21
797
+ Step 70: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=20
798
+ Step 71: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=19
799
+ Step 72: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=18
800
+ Step 73: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=17
801
+ Step 74: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=16
802
+ Step 75: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=15
803
+ Step 76: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=14
804
+ Step 77: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=13
805
+ Step 78: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=12
806
+ Step 79: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=11
807
+ Step 80: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=10
808
+ Step 81: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=9
809
+ Step 82: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=8
810
+ Step 83: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=7
811
+ Step 84: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=6
812
+ Step 85: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=5
813
+ Step 86: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=4
814
+ Step 87: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=3
815
+ Step 88: INSPECT acc_0349 flagged=10/10 suspects=0 steps_left=2
816
+ Step 89: INSPECT acc_0523 flagged=10/10 suspects=0 steps_left=1
817
+ Step 90: SUBMIT flagged=10/10 suspects=0 steps_left=1
818
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=13.21
819
+ ★ GRADER SCORE: 0.9012
820
+
821
+ hard: scores=['0.904', '0.000', '0.901'] mean=0.6017 var=0.181003
822
+
823
+ ============================================================
824
+ EVALUATION COMPLETE
825
+ ============================================================
826
+ ubuntu@ip-172-31-33-59:~/meta/meta-hack-26/eval-models$
model-benchmark-logs/mistral_judge_log.txt ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$ python3 mistral_test_judge_eval.py --url https://pandago-graphstrike.hf.space --bedrock
2
+ GraphStrike Judge Evaluation Simulator
3
+ Target: https://pandago-graphstrike.hf.space
4
+ Backend: bedrock
5
+ Model: Bedrock/mistral.ministral-3-8b-instruct
6
+ Token: set
7
+
8
+ ============================================================
9
+ PHASE 0: Endpoint Verification
10
+ ============================================================
11
+ ✓ GET /health
12
+ ✓ GET /tasks
13
+ ✓ GET /metadata
14
+ ✓ GET /schema
15
+ ✓ GET /web
16
+ ✓ POST /reset
17
+ ✓ GET /state
18
+ ✓ POST /step
19
+ ✓ POST /step
20
+ ✓ GET /grader
21
+ ✓ POST /mcp
22
+ ✓ POST /baseline
23
+
24
+ ============================================================
25
+ PHASE 1: Baseline Stability (3 runs)
26
+ ============================================================
27
+ Run 1: easy=0.9100 medium=0.9060 hard=0.9038
28
+ Run 2: easy=0.9100 medium=0.9060 hard=0.9038
29
+ Run 3: easy=0.9100 medium=0.9060 hard=0.9038
30
+ ✓ All 3 runs identical — baseline is deterministic
31
+
32
+ ============================================================
33
+ PHASE 2: LLM Agent Evaluation (model=Bedrock/mistral.ministral-3-8b-instruct)
34
+ ============================================================
35
+
36
+ --- LLM Agent: task=easy, seed=0, model=Bedrock/mistral.ministral-3-8b-instruct ---
37
+ Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
38
+ Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
39
+ Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
40
+ Step 4: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=28
41
+ Step 5: INSPECT acc_0001 flagged=2/10 suspects=8 steps_left=27
42
+ Step 6: FLAG acc_0001 flagged=3/10 suspects=7 steps_left=27
43
+ Step 7: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=26
44
+ Step 8: INSPECT acc_0012 flagged=3/10 suspects=7 steps_left=25
45
+ Step 9: INSPECT acc_0000 flagged=3/10 suspects=7 steps_left=24
46
+ Step 10: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=24
47
+ Step 11: FLAG acc_0000 flagged=5/10 suspects=5 steps_left=24
48
+ Step 12: INSPECT acc_0027 flagged=5/10 suspects=5 steps_left=23
49
+ Step 13: FLAG acc_0012 flagged=6/10 suspects=4 steps_left=23
50
+ Step 14: FLAG acc_0027 flagged=7/10 suspects=3 steps_left=23
51
+ Step 15: INSPECT acc_0047 flagged=7/10 suspects=3 steps_left=22
52
+ Step 16: FLAG acc_0047 flagged=8/10 suspects=2 steps_left=22
53
+ Step 17: INSPECT acc_0007 flagged=8/10 suspects=2 steps_left=21
54
+ Step 18: FLAG acc_0007 flagged=9/10 suspects=1 steps_left=21
55
+ Step 19: INSPECT acc_0028 flagged=9/10 suspects=1 steps_left=20
56
+ Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
57
+ Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=20
58
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.90
59
+ ★ GRADER SCORE: 0.9667
60
+
61
+ --- LLM Agent: task=medium, seed=0, model=Bedrock/mistral.ministral-3-8b-instruct ---
62
+ Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
63
+ Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
64
+ Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
65
+ Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=46
66
+ Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=45
67
+ Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
68
+ Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
69
+ Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
70
+ Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
71
+ Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
72
+ Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=40
73
+ Step 12: FLAG acc_0181 flagged=2/10 suspects=8 steps_left=40
74
+ Step 13: INSPECT acc_0022 flagged=2/10 suspects=8 steps_left=39
75
+ Step 14: FLAG acc_0022 flagged=3/10 suspects=7 steps_left=39
76
+ Step 15: INSPECT acc_0092 flagged=3/10 suspects=7 steps_left=38
77
+ Step 16: FLAG acc_0092 flagged=4/10 suspects=6 steps_left=38
78
+ Step 17: INSPECT acc_0097 flagged=4/10 suspects=6 steps_left=37
79
+ Step 18: FLAG acc_0097 flagged=5/10 suspects=5 steps_left=37
80
+ Step 19: INSPECT acc_0187 flagged=5/10 suspects=5 steps_left=36
81
+ Step 20: FLAG acc_0187 flagged=6/10 suspects=4 steps_left=36
82
+ Step 21: INSPECT acc_0093 flagged=6/10 suspects=4 steps_left=35
83
+ Step 22: FLAG acc_0093 flagged=7/10 suspects=3 steps_left=35
84
+ Step 23: INSPECT acc_0172 flagged=7/10 suspects=3 steps_left=34
85
+ Step 24: FLAG acc_0172 flagged=8/10 suspects=2 steps_left=34
86
+ Step 25: INSPECT acc_0058 flagged=8/10 suspects=2 steps_left=33
87
+ Step 26: FLAG acc_0058 flagged=9/10 suspects=1 steps_left=33
88
+ Step 27: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=32
89
+ Step 28: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=32
90
+ Step 29: SUBMIT flagged=10/10 suspects=0 steps_left=32
91
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.82
92
+ ★ GRADER SCORE: 0.9640
93
+
94
+ --- LLM Agent: task=hard, seed=0, model=Bedrock/mistral.ministral-3-8b-instruct ---
95
+ Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
96
+ Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
97
+ Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
98
+ Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
99
+ Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
100
+ Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
101
+ Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
102
+ Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
103
+ Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
104
+ Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
105
+ Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
106
+ Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
107
+ Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
108
+ Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
109
+ Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
110
+ Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
111
+ Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
112
+ Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
113
+ Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
114
+ Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
115
+ Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
116
+ Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
117
+ Step 23: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=59
118
+ Step 24: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=58
119
+ Step 25: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=58
120
+ Step 26: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=57
121
+ Step 27: INSPECT acc_0160 flagged=3/10 suspects=6 steps_left=56
122
+ Step 28: FLAG acc_0160 flagged=4/10 suspects=6 steps_left=56
123
+ Step 29: INSPECT acc_0549 flagged=4/10 suspects=6 steps_left=55
124
+ Step 30: FLAG acc_0549 flagged=5/10 suspects=5 steps_left=55
125
+ Step 31: INSPECT acc_0658 flagged=5/10 suspects=5 steps_left=54
126
+ Step 32: FLAG acc_0658 flagged=6/10 suspects=4 steps_left=54
127
+ Step 33: INSPECT acc_0290 flagged=6/10 suspects=4 steps_left=53
128
+ Step 34: FLAG acc_0389 flagged=7/10 suspects=3 steps_left=53
129
+ Step 35: FLAG acc_0290 flagged=8/10 suspects=2 steps_left=53
130
+ Step 36: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=52
131
+ Step 37: FLAG acc_0124 flagged=9/10 suspects=1 steps_left=52
132
+ Step 38: INSPECT acc_0507 flagged=9/10 suspects=1 steps_left=51
133
+ Step 39: FLAG acc_0507 flagged=10/10 suspects=0 steps_left=51
134
+ Step 40: SUBMIT flagged=10/10 suspects=0 steps_left=51
135
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.71
136
+ ★ GRADER SCORE: 0.9637
137
+
138
+ Summary: easy=0.9667 medium=0.9640 hard=0.9637
139
+
140
+ ============================================================
141
+ PHASE 3: Score Variance (seeds=[0, 1, 2])
142
+ ============================================================
143
+
144
+ --- LLM Agent: task=easy, seed=0, model=Bedrock/mistral.ministral-3-8b-instruct ---
145
+ Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
146
+ Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
147
+ Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
148
+ Step 4: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=28
149
+ Step 5: INSPECT acc_0001 flagged=2/10 suspects=8 steps_left=27
150
+ Step 6: FLAG acc_0001 flagged=3/10 suspects=7 steps_left=27
151
+ Step 7: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=26
152
+
153
+ Step 8: INSPECT acc_0012 flagged=3/10 suspects=7 steps_left=25
154
+ Step 9: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=25
155
+ Step 10: INSPECT acc_0000 flagged=4/10 suspects=6 steps_left=24
156
+ Step 11: FLAG acc_0012 flagged=5/10 suspects=5 steps_left=24
157
+ Step 12: INSPECT acc_0027 flagged=5/10 suspects=5 steps_left=23
158
+ Step 13: FLAG acc_0000 flagged=6/10 suspects=4 steps_left=23
159
+ Step 14: INSPECT acc_0047 flagged=6/10 suspects=4 steps_left=22
160
+ Step 15: FLAG acc_0027 flagged=7/10 suspects=3 steps_left=22
161
+ Step 16: FLAG acc_0047 flagged=8/10 suspects=2 steps_left=22
162
+ Step 17: INSPECT acc_0007 flagged=8/10 suspects=2 steps_left=21
163
+ Step 18: FLAG acc_0007 flagged=9/10 suspects=1 steps_left=21
164
+ Step 19: INSPECT acc_0028 flagged=9/10 suspects=1 steps_left=20
165
+ Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
166
+ Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=20
167
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.90
168
+ ★ GRADER SCORE: 0.9667
169
+
170
+ --- LLM Agent: task=easy, seed=1, model=Bedrock/mistral.ministral-3-8b-instruct ---
171
+ Step 1: INSPECT acc_0034 flagged=0/10 suspects=0 steps_left=29
172
+ Step 2: INSPECT acc_0003 flagged=0/10 suspects=0 steps_left=28
173
+ Step 3: INSPECT acc_0049 flagged=0/10 suspects=0 steps_left=27
174
+ Step 4: INSPECT acc_0006 flagged=0/10 suspects=0 steps_left=26
175
+ Step 5: INSPECT acc_0047 flagged=0/10 suspects=0 steps_left=25
176
+ Step 6: FLAG acc_0047 flagged=1/10 suspects=9 steps_left=25
177
+ Step 7: INSPECT acc_0009 flagged=1/10 suspects=9 steps_left=24
178
+ Step 8: FLAG acc_0009 flagged=2/10 suspects=8 steps_left=24
179
+ Step 9: INSPECT acc_0046 flagged=2/10 suspects=8 steps_left=23
180
+ Step 10: FLAG acc_0046 flagged=3/10 suspects=7 steps_left=23
181
+ Step 11: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=22
182
+ Step 12: INSPECT acc_0021 flagged=3/10 suspects=7 steps_left=21
183
+ Step 13: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=21
184
+ Step 14: INSPECT acc_0002 flagged=4/10 suspects=6 steps_left=20
185
+ Step 15: FLAG acc_0021 flagged=5/10 suspects=5 steps_left=20
186
+ Step 16: INSPECT acc_0048 flagged=5/10 suspects=5 steps_left=19
187
+ Step 17: INSPECT acc_0029 flagged=5/10 suspects=5 steps_left=18
188
+ Step 18: FLAG acc_0029 flagged=6/10 suspects=4 steps_left=18
189
+ Step 19: FLAG acc_0048 flagged=7/10 suspects=3 steps_left=18
190
+ Step 20: FLAG acc_0002 flagged=8/10 suspects=2 steps_left=18
191
+ Step 21: INSPECT acc_0015 flagged=8/10 suspects=2 steps_left=17
192
+ Step 22: FLAG acc_0015 flagged=9/10 suspects=1 steps_left=17
193
+ Step 23: INSPECT acc_0005 flagged=9/10 suspects=1 steps_left=16
194
+ Step 24: FLAG acc_0005 flagged=10/10 suspects=0 steps_left=16
195
+ Step 25: SUBMIT flagged=10/10 suspects=0 steps_left=16
196
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.86
197
+ ★ GRADER SCORE: 0.9533
198
+
199
+ --- LLM Agent: task=easy, seed=2, model=Bedrock/mistral.ministral-3-8b-instruct ---
200
+ Step 1: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=29
201
+ Step 2: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=28
202
+ Step 3: INSPECT acc_0025 flagged=0/10 suspects=0 steps_left=27
203
+ Step 4: INSPECT acc_0026 flagged=0/10 suspects=0 steps_left=26
204
+ Step 5: INSPECT acc_0038 flagged=0/10 suspects=0 steps_left=25
205
+ Step 6: FLAG acc_0029 flagged=1/10 suspects=9 steps_left=25
206
+ Step 7: INSPECT acc_0006 flagged=1/10 suspects=9 steps_left=24
207
+ Step 8: FLAG acc_0006 flagged=2/10 suspects=8 steps_left=24
208
+ Step 9: INSPECT acc_0033 flagged=2/10 suspects=8 steps_left=23
209
+ Step 10: FLAG acc_0033 flagged=3/10 suspects=7 steps_left=23
210
+ Step 11: INSPECT acc_0015 flagged=3/10 suspects=7 steps_left=22
211
+ Step 12: FLAG acc_0015 flagged=4/10 suspects=6 steps_left=22
212
+ Step 13: INSPECT acc_0022 flagged=4/10 suspects=6 steps_left=21
213
+ Step 14: INSPECT acc_0009 flagged=4/10 suspects=6 steps_left=20
214
+ Step 15: FLAG acc_0022 flagged=5/10 suspects=5 steps_left=20
215
+ Step 16: INSPECT acc_0004 flagged=5/10 suspects=5 steps_left=19
216
+ Step 17: FLAG acc_0009 flagged=6/10 suspects=4 steps_left=19
217
+ Step 18: FLAG acc_0004 flagged=7/10 suspects=3 steps_left=19
218
+ Step 19: INSPECT acc_0024 flagged=7/10 suspects=3 steps_left=18
219
+ Step 20: FLAG acc_0024 flagged=8/10 suspects=2 steps_left=18
220
+ Step 21: INSPECT acc_0049 flagged=8/10 suspects=2 steps_left=17
221
+ Step 22: FLAG acc_0049 flagged=9/10 suspects=1 steps_left=17
222
+ Step 23: INSPECT acc_0035 flagged=9/10 suspects=1 steps_left=16
223
+ Step 24: FLAG acc_0035 flagged=10/10 suspects=0 steps_left=16
224
+ Step 25: SUBMIT flagged=10/10 suspects=0 steps_left=16
225
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.86
226
+ ★ GRADER SCORE: 0.9533
227
+
228
+ easy: scores=['0.967', '0.953', '0.953'] mean=0.9578 var=0.000040
229
+
230
+ --- LLM Agent: task=medium, seed=0, model=Bedrock/mistral.ministral-3-8b-instruct ---
231
+ Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
232
+ Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
233
+ Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
234
+ Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=46
235
+ Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=45
236
+ Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
237
+ Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
238
+ Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
239
+ Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
240
+ Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
241
+ Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=40
242
+ Step 12: FLAG acc_0181 flagged=2/10 suspects=8 steps_left=40
243
+ Step 13: INSPECT acc_0022 flagged=2/10 suspects=8 steps_left=39
244
+ Step 14: FLAG acc_0022 flagged=3/10 suspects=7 steps_left=39
245
+ Step 15: INSPECT acc_0092 flagged=3/10 suspects=7 steps_left=38
246
+ Step 16: FLAG acc_0092 flagged=4/10 suspects=6 steps_left=38
247
+ Step 17: INSPECT acc_0097 flagged=4/10 suspects=6 steps_left=37
248
+ Step 18: FLAG acc_0097 flagged=5/10 suspects=5 steps_left=37
249
+ Step 19: INSPECT acc_0187 flagged=5/10 suspects=5 steps_left=36
250
+ Step 20: FLAG acc_0187 flagged=6/10 suspects=4 steps_left=36
251
+ Step 21: INSPECT acc_0093 flagged=6/10 suspects=4 steps_left=35
252
+ Step 22: FLAG acc_0093 flagged=7/10 suspects=3 steps_left=35
253
+ Step 23: INSPECT acc_0172 flagged=7/10 suspects=3 steps_left=34
254
+ Step 24: FLAG acc_0172 flagged=8/10 suspects=2 steps_left=34
255
+ Step 25: INSPECT acc_0058 flagged=8/10 suspects=2 steps_left=33
256
+ Step 26: FLAG acc_0058 flagged=9/10 suspects=1 steps_left=33
257
+ Step 27: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=32
258
+ Step 28: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=32
259
+ Step 29: SUBMIT flagged=10/10 suspects=0 steps_left=32
260
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.82
261
+ ★ GRADER SCORE: 0.9640
262
+
263
+ --- LLM Agent: task=medium, seed=1, model=Bedrock/mistral.ministral-3-8b-instruct ---
264
+ Step 1: INSPECT acc_0171 flagged=0/10 suspects=0 steps_left=49
265
+ Step 2: INSPECT acc_0099 flagged=0/10 suspects=0 steps_left=48
266
+ Step 3: INSPECT acc_0152 flagged=0/10 suspects=0 steps_left=47
267
+ Step 4: INSPECT acc_0092 flagged=0/10 suspects=0 steps_left=46
268
+ Step 5: INSPECT acc_0078 flagged=0/10 suspects=0 steps_left=45
269
+ Step 6: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=44
270
+ Step 7: INSPECT acc_0012 flagged=0/10 suspects=0 steps_left=43
271
+ Step 8: FLAG acc_0012 flagged=1/10 suspects=8 steps_left=43
272
+ Step 9: INSPECT acc_0033 flagged=1/10 suspects=8 steps_left=42
273
+ Step 10: FLAG acc_0033 flagged=2/10 suspects=8 steps_left=42
274
+ Step 11: INSPECT acc_0174 flagged=2/10 suspects=8 steps_left=41
275
+ Step 12: FLAG acc_0174 flagged=3/10 suspects=7 steps_left=41
276
+ Step 13: INSPECT acc_0187 flagged=3/10 suspects=7 steps_left=40
277
+ Step 14: FLAG acc_0187 flagged=4/10 suspects=6 steps_left=40
278
+ Step 15: INSPECT acc_0079 flagged=4/10 suspects=6 steps_left=39
279
+ Step 16: FLAG acc_0032 flagged=5/10 suspects=5 steps_left=39
280
+ Step 17: INSPECT acc_0023 flagged=5/10 suspects=5 steps_left=38
281
+ Step 18: FLAG acc_0023 flagged=6/10 suspects=4 steps_left=38
282
+ Step 19: INSPECT acc_0146 flagged=6/10 suspects=4 steps_left=37
283
+ Step 20: FLAG acc_0079 flagged=7/10 suspects=3 steps_left=37
284
+ Step 21: INSPECT acc_0019 flagged=7/10 suspects=3 steps_left=36
285
+ Step 22: FLAG acc_0146 flagged=8/10 suspects=2 steps_left=36
286
+ Step 23: FLAG acc_0019 flagged=9/10 suspects=1 steps_left=36
287
+ Step 24: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=35
288
+ Step 25: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=35
289
+ Step 26: SUBMIT flagged=10/10 suspects=0 steps_left=35
290
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.85
291
+ ★ GRADER SCORE: 0.9700
292
+
293
+ --- LLM Agent: task=medium, seed=2, model=Bedrock/mistral.ministral-3-8b-instruct ---
294
+ Step 1: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=49
295
+ Step 2: INSPECT acc_0107 flagged=4/10 suspects=6 steps_left=20
296
+ Step 3: FLAG acc_0014 flagged=10/10 suspects=0 steps_left=20
297
+ Step 4: SUBMIT flagged=0/10 suspects=0 steps_left=42
298
+ → Episode ended: [LOSS] TP=0 FP=0 FN=10 Recall=0.00 Precision=0.00 Episode reward=-2.08
299
+ ★ GRADER SCORE: 0.0000
300
+
301
+ medium: scores=['0.964', '0.970', '0.000'] mean=0.6447 var=0.207804
302
+
303
+ --- LLM Agent: task=hard, seed=0, model=Bedrock/mistral.ministral-3-8b-instruct ---
304
+ Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
305
+ Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=31
306
+ Step 3: INSPECT acc_0105 flagged=0/10 suspects=0 steps_left=67
307
+ Step 4: INSPECT acc_0374 flagged=0/10 suspects=0 steps_left=57
308
+ Step 5: INSPECT acc_0577 flagged=10/10 suspects=0 steps_left=0
309
+ ★ GRADER SCORE: 0.9000
310
+
311
+ --- LLM Agent: task=hard, seed=1, model=Bedrock/mistral.ministral-3-8b-instruct ---
312
+ Step 1: INSPECT acc_0014 flagged=0/10 suspects=0 steps_left=79
313
+ Step 2: INSPECT acc_0835 flagged=0/10 suspects=0 steps_left=78
314
+ Step 3: INSPECT acc_0855 flagged=0/10 suspects=0 steps_left=77
315
+ Step 4: INSPECT acc_0930 flagged=0/10 suspects=0 steps_left=76
316
+ Step 5: INSPECT acc_0336 flagged=0/10 suspects=0 steps_left=75
317
+ Step 6: INSPECT acc_0929 flagged=0/10 suspects=0 steps_left=74
318
+ Step 7: INSPECT acc_0076 flagged=0/10 suspects=0 steps_left=73
319
+ Step 8: INSPECT acc_0543 flagged=0/10 suspects=0 steps_left=72
320
+ Step 9: INSPECT acc_0590 flagged=0/10 suspects=0 steps_left=71
321
+ Step 10: INSPECT acc_0401 flagged=0/10 suspects=0 steps_left=70
322
+ Step 11: INSPECT acc_0322 flagged=0/10 suspects=0 steps_left=69
323
+ Step 12: INSPECT acc_0154 flagged=0/10 suspects=0 steps_left=68
324
+ Step 13: INSPECT acc_0374 flagged=0/10 suspects=0 steps_left=67
325
+ Step 14: INSPECT acc_0549 flagged=0/10 suspects=0 steps_left=66
326
+ Step 15: INSPECT acc_0903 flagged=0/10 suspects=0 steps_left=65
327
+ Step 16: INSPECT acc_0976 flagged=0/10 suspects=0 steps_left=64
328
+ Step 17: INSPECT acc_0620 flagged=0/10 suspects=0 steps_left=63
329
+ Step 18: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=62
330
+ Step 19: INSPECT acc_0222 flagged=0/10 suspects=0 steps_left=61
331
+ Step 20: INSPECT acc_0536 flagged=0/10 suspects=0 steps_left=60
332
+ Step 21: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=59
333
+ Step 22: INSPECT acc_0577 flagged=0/10 suspects=0 steps_left=58
334
+ Step 23: INSPECT acc_0517 flagged=0/10 suspects=0 steps_left=57
335
+ Step 24: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=56
336
+ Step 25: INSPECT acc_0167 flagged=0/10 suspects=0 steps_left=55
337
+ Step 26: INSPECT acc_0697 flagged=0/10 suspects=0 steps_left=54
338
+ Step 27: INSPECT acc_0271 flagged=0/10 suspects=0 steps_left=53
339
+ Step 28: INSPECT acc_0681 flagged=0/10 suspects=0 steps_left=52
340
+ Step 29: INSPECT acc_0530 flagged=0/10 suspects=0 steps_left=51
341
+ Step 30: INSPECT acc_0353 flagged=0/10 suspects=0 steps_left=50
342
+ Step 31: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=49
343
+ Step 32: INSPECT acc_0777 flagged=0/10 suspects=0 steps_left=48
344
+ Step 33: INSPECT acc_0265 flagged=0/10 suspects=0 steps_left=47
345
+ Step 34: INSPECT acc_0788 flagged=0/10 suspects=0 steps_left=46
346
+ Step 35: INSPECT acc_0033 flagged=0/10 suspects=0 steps_left=45
347
+ Step 36: INSPECT acc_0187 flagged=0/10 suspects=0 steps_left=44
348
+ Step 37: INSPECT acc_0445 flagged=0/10 suspects=0 steps_left=43
349
+ Step 38: INSPECT acc_0846 flagged=0/10 suspects=0 steps_left=42
350
+ Step 39: INSPECT acc_0659 flagged=0/10 suspects=0 steps_left=41
351
+ Step 40: INSPECT acc_0768 flagged=0/10 suspects=0 steps_left=40
352
+ Step 41: INSPECT acc_0677 flagged=0/10 suspects=0 steps_left=39
353
+ Step 42: INSPECT acc_0539 flagged=0/10 suspects=0 steps_left=38
354
+ Step 43: INSPECT acc_0742 flagged=0/10 suspects=0 steps_left=37
355
+ Step 44: INSPECT acc_0503 flagged=0/10 suspects=0 steps_left=36
356
+ Step 45: INSPECT acc_0876 flagged=0/10 suspects=0 steps_left=35
357
+ Step 46: INSPECT acc_0639 flagged=0/10 suspects=0 steps_left=34
358
+ Step 47: INSPECT acc_0494 flagged=0/10 suspects=0 steps_left=33
359
+ Step 48: INSPECT acc_0898 flagged=0/10 suspects=0 steps_left=32
360
+ Step 49: INSPECT acc_0553 flagged=0/10 suspects=0 steps_left=31
361
+ Step 50: INSPECT acc_0588 flagged=0/10 suspects=0 steps_left=30
362
+ Step 51: INSPECT acc_0194 flagged=0/10 suspects=0 steps_left=29
363
+ Step 52: INSPECT acc_0810 flagged=0/10 suspects=0 steps_left=28
364
+ Step 53: INSPECT acc_0355 flagged=0/10 suspects=0 steps_left=27
365
+ Step 54: INSPECT acc_0363 flagged=0/10 suspects=0 steps_left=26
366
+ Step 55: INSPECT acc_0221 flagged=0/10 suspects=0 steps_left=25
367
+ Step 56: INSPECT acc_0580 flagged=0/10 suspects=0 steps_left=24
368
+ Step 57: INSPECT acc_0534 flagged=0/10 suspects=0 steps_left=23
369
+ Step 58: INSPECT acc_0778 flagged=0/10 suspects=0 steps_left=22
370
+ Step 59: INSPECT acc_0998 flagged=0/10 suspects=0 steps_left=21
371
+ Step 60: INSPECT acc_0233 flagged=0/10 suspects=0 steps_left=20
372
+ Step 61: INSPECT acc_0052 flagged=0/10 suspects=0 steps_left=19
373
+ Step 62: INSPECT acc_0813 flagged=4/10 suspects=6 steps_left=21
374
+ Step 63: FLAG acc_0014 flagged=5/10 suspects=5 steps_left=21
375
+ Step 64: FLAG acc_0028 flagged=6/10 suspects=4 steps_left=21
376
+ Step 65: FLAG acc_0000 flagged=7/10 suspects=3 steps_left=21
377
+ Step 66: FLAG acc_0012 flagged=8/10 suspects=2 steps_left=21
378
+ Step 67: FLAG acc_0007 flagged=9/10 suspects=1 steps_left=21
379
+ Step 68: FLAG acc_0036 flagged=10/10 suspects=0 steps_left=21
380
+ Step 69: SUBMIT flagged=10/10 suspects=0 steps_left=21
381
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.91
382
+ ★ GRADER SCORE: 0.9700
383
+
384
+ --- LLM Agent: task=hard, seed=2, model=Bedrock/mistral.ministral-3-8b-instruct ---
385
+ Step 1: INSPECT acc_0813 flagged=0/10 suspects=0 steps_left=79
386
+ Step 2: INSPECT acc_0430 flagged=0/10 suspects=0 steps_left=78
387
+ Step 3: INSPECT acc_0817 flagged=0/10 suspects=0 steps_left=77
388
+ Step 4: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=76
389
+ Step 5: INSPECT acc_0523 flagged=1/10 suspects=1 steps_left=72
390
+ Step 6: INSPECT acc_0797 flagged=1/10 suspects=1 steps_left=66
391
+ Step 7: INSPECT acc_0664 flagged=1/10 suspects=1 steps_left=65
392
+ Step 8: INSPECT acc_0255 flagged=0/10 suspects=0 steps_left=42
393
+ Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
394
+ Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=40
395
+ Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=39
396
+ Step 12: FLAG acc_0181 flagged=7/10 suspects=11 steps_left=51
397
+ Step 13: FLAG acc_0389 flagged=8/10 suspects=10 steps_left=51
398
+ Step 14: FLAG acc_0658 flagged=9/10 suspects=9 steps_left=51
399
+ Step 15: FLAG acc_0507 flagged=10/10 suspects=0 steps_left=16
400
+ Step 16: SUBMIT flagged=0/10 suspects=0 steps_left=30
401
+ → Episode ended: [LOSS] TP=0 FP=0 FN=10 Recall=0.00 Precision=0.00 Episode reward=-2.00
402
+ ★ GRADER SCORE: 0.0000
403
+
404
+ hard: scores=['0.900', '0.970', '0.000'] mean=0.6233 var=0.195089
405
+
406
+ ============================================================
407
+ EVALUATION COMPLETE
408
+ ============================================================
409
+ ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$
410
+ ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$
model-benchmark-logs/nvidia_judge_log.txt ADDED
@@ -0,0 +1,545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$ python3 nvidia_test_judge_eval.py --url https://
2
+ pandago-graphstrike.hf.space --bedrock
3
+ GraphStrike Judge Evaluation Simulator
4
+ Target: https://pandago-graphstrike.hf.space
5
+ Backend: bedrock
6
+ Model: Bedrock/nvidia.nemotron-super-3-120b
7
+ Token: set
8
+
9
+ ============================================================
10
+ PHASE 0: Endpoint Verification
11
+ ============================================================
12
+ ✓ GET /health
13
+ ✓ GET /tasks
14
+ ✓ GET /metadata
15
+ ✓ GET /schema
16
+ ✓ GET /web
17
+ ✓ POST /reset
18
+ ✓ GET /state
19
+ ✓ POST /step
20
+ ✓ POST /step
21
+ ✓ GET /grader
22
+ ✓ POST /mcp
23
+ ✓ POST /baseline
24
+
25
+ ============================================================
26
+ PHASE 1: Baseline Stability (3 runs)
27
+ ============================================================
28
+ Run 1: easy=0.9100 medium=0.9060 hard=0.9038
29
+ Run 2: easy=0.9100 medium=0.9060 hard=0.9038
30
+ Run 3: easy=0.9100 medium=0.9060 hard=0.9038
31
+ ✓ All 3 runs identical — baseline is deterministic
32
+
33
+ ============================================================
34
+ PHASE 2: LLM Agent Evaluation (model=Bedrock/nvidia.nemotron-super-3-120b)
35
+ ============================================================
36
+
37
+ --- LLM Agent: task=easy, seed=0, model=Bedrock/nvidia.nemotron-super-3-120b ---
38
+ Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=27
39
+ Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=27
40
+ Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=25
41
+ Step 4: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=25
42
+ Step 5: INSPECT acc_0047 flagged=2/10 suspects=8 steps_left=24
43
+ Step 6: FLAG acc_0047 flagged=3/10 suspects=7 steps_left=23
44
+ Step 7: INSPECT acc_0007 flagged=3/10 suspects=7 steps_left=22
45
+ Step 8: FLAG acc_0007 flagged=4/10 suspects=6 steps_left=21
46
+ Step 9: INSPECT acc_0001 flagged=4/10 suspects=6 steps_left=20
47
+ Step 10: FLAG acc_0001 flagged=5/10 suspects=5 steps_left=19
48
+ Step 11: INSPECT acc_0014 flagged=5/10 suspects=5 steps_left=18
49
+ Step 12: FLAG acc_0014 flagged=6/10 suspects=4 steps_left=17
50
+ Step 13: INSPECT acc_0012 flagged=6/10 suspects=4 steps_left=16
51
+ Step 14: FLAG acc_0012 flagged=7/10 suspects=3 steps_left=15
52
+ Step 15: INSPECT acc_0000 flagged=7/10 suspects=3 steps_left=14
53
+ Step 16: FLAG acc_0000 flagged=8/10 suspects=2 steps_left=13
54
+ Step 17: INSPECT acc_0027 flagged=8/10 suspects=2 steps_left=12
55
+ Step 18: FLAG acc_0027 flagged=9/10 suspects=1 steps_left=11
56
+ Step 19: INSPECT acc_0028 flagged=9/10 suspects=1 steps_left=10
57
+ Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=9
58
+ Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=9
59
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.79
60
+ ★ GRADER SCORE: 0.9300
61
+
62
+ --- LLM Agent: task=medium, seed=0, model=Bedrock/nvidia.nemotron-super-3-120b ---
63
+ Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=48
64
+ Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=45
65
+ Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=43
66
+ Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=41
67
+ Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=39
68
+ Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=37
69
+ Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=35
70
+ Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=33
71
+ Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=31
72
+ Step 10: FLAG acc_0144 flagged=1/10 suspects=9 steps_left=31
73
+ Step 11: INSPECT acc_0131 flagged=1/10 suspects=9 steps_left=29
74
+ Step 12: FLAG acc_0131 flagged=2/10 suspects=8 steps_left=29
75
+ Step 13: INSPECT acc_0181 flagged=2/10 suspects=8 steps_left=27
76
+ Step 14: FLAG acc_0181 flagged=3/10 suspects=7 steps_left=26
77
+ Step 15: FLAG acc_0022 flagged=4/10 suspects=6 steps_left=26
78
+ Step 16: INSPECT acc_0092 flagged=4/10 suspects=6 steps_left=24
79
+ Step 17: FLAG acc_0092 flagged=5/10 suspects=5 steps_left=24
80
+ Step 18: INSPECT acc_0097 flagged=5/10 suspects=5 steps_left=22
81
+ Step 19: FLAG acc_0097 flagged=6/10 suspects=4 steps_left=22
82
+ Step 20: INSPECT acc_0187 flagged=6/10 suspects=4 steps_left=20
83
+ Step 21: FLAG acc_0187 flagged=7/10 suspects=3 steps_left=20
84
+ Step 22: INSPECT acc_0093 flagged=7/10 suspects=3 steps_left=19
85
+ Step 23: FLAG acc_0093 flagged=8/10 suspects=2 steps_left=18
86
+ Step 24: INSPECT acc_0172 flagged=8/10 suspects=2 steps_left=17
87
+ Step 25: FLAG acc_0172 flagged=9/10 suspects=1 steps_left=16
88
+ Step 26: INSPECT acc_0058 flagged=0/10 suspects=0 steps_left=79
89
+ Step 27: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=78
90
+ Step 28: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=77
91
+ Step 29: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=75
92
+ Step 30: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=73
93
+ Step 31: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=70
94
+ Step 32: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=68
95
+ Step 33: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=67
96
+ Step 34: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=65
97
+ Step 35: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=63
98
+ Step 36: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=61
99
+ Step 37: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=58
100
+ Step 38: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=56
101
+ Step 39: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=54
102
+ Step 40: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=53
103
+ Step 41: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=52
104
+ Step 42: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=50
105
+ Step 43: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=47
106
+ Step 44: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=46
107
+ Step 45: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=44
108
+ Step 46: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=42
109
+ Step 47: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=41
110
+ Step 48: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=40
111
+ Step 49: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=40
112
+ Step 50: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=39
113
+ Step 51: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=39
114
+ Step 52: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=38
115
+ Step 53: FLAG acc_0389 flagged=4/10 suspects=6 steps_left=38
116
+ Step 54: INSPECT acc_0160 flagged=4/10 suspects=6 steps_left=37
117
+ Step 55: FLAG acc_0160 flagged=5/10 suspects=5 steps_left=37
118
+ Step 56: INSPECT acc_0549 flagged=5/10 suspects=5 steps_left=36
119
+ Step 57: FLAG acc_0549 flagged=6/10 suspects=4 steps_left=36
120
+ Step 58: INSPECT acc_0658 flagged=6/10 suspects=4 steps_left=35
121
+ Step 59: FLAG acc_0658 flagged=7/10 suspects=3 steps_left=35
122
+ Step 60: INSPECT acc_0290 flagged=7/10 suspects=3 steps_left=34
123
+ Step 61: FLAG acc_0290 flagged=8/10 suspects=2 steps_left=34
124
+ Step 62: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=33
125
+ Step 63: FLAG acc_0507 flagged=9/10 suspects=1 steps_left=33
126
+ Step 64: FLAG acc_0124 flagged=10/10 suspects=0 steps_left=33
127
+ Step 65: SUBMIT flagged=10/10 suspects=0 steps_left=33
128
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=14.53
129
+ ★ GRADER SCORE: 0.9413
130
+
131
+ --- LLM Agent: task=hard, seed=0, model=Bedrock/nvidia.nemotron-super-3-120b ---
132
+ Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
133
+ Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
134
+ Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
135
+ Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
136
+ Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
137
+ Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
138
+ Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
139
+ Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
140
+ Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
141
+ Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
142
+ Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
143
+ Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
144
+ Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
145
+ Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
146
+ Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
147
+ Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
148
+ Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
149
+ Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
150
+ Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
151
+ Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
152
+ Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
153
+ Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
154
+ Step 23: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=59
155
+ Step 24: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=58
156
+ Step 25: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=58
157
+ Step 26: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=57
158
+ Step 27: FLAG acc_0389 flagged=4/10 suspects=6 steps_left=57
159
+ Step 28: INSPECT acc_0160 flagged=4/10 suspects=6 steps_left=56
160
+ Step 29: FLAG acc_0160 flagged=5/10 suspects=5 steps_left=56
161
+ Step 30: INSPECT acc_0549 flagged=5/10 suspects=5 steps_left=55
162
+ Step 31: FLAG acc_0549 flagged=6/10 suspects=4 steps_left=55
163
+ Step 32: INSPECT acc_0658 flagged=6/10 suspects=4 steps_left=54
164
+ Step 33: FLAG acc_0658 flagged=7/10 suspects=3 steps_left=54
165
+ Step 34: INSPECT acc_0290 flagged=7/10 suspects=3 steps_left=53
166
+ Step 35: FLAG acc_0290 flagged=8/10 suspects=2 steps_left=53
167
+ Step 36: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=52
168
+ Step 37: FLAG acc_0124 flagged=9/10 suspects=1 steps_left=52
169
+ Step 38: INSPECT acc_0507 flagged=9/10 suspects=1 steps_left=51
170
+ Step 39: FLAG acc_0507 flagged=10/10 suspects=0 steps_left=51
171
+ Step 40: SUBMIT flagged=10/10 suspects=0 steps_left=51
172
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.71
173
+ ★ GRADER SCORE: 0.9637
174
+
175
+ Summary: easy=0.9300 medium=0.9413 hard=0.9637
176
+
177
+ ============================================================
178
+ PHASE 3: Score Variance (seeds=[0, 1, 2])
179
+ ============================================================
180
+
181
+ --- LLM Agent: task=easy, seed=0, model=Bedrock/nvidia.nemotron-super-3-120b ---
182
+ Step 1: INSPECT acc_0043 flagged=0/10 suspects=0 steps_left=29
183
+ Step 2: FLAG acc_0043 flagged=1/10 suspects=8 steps_left=29
184
+ Step 3: INSPECT acc_0036 flagged=1/10 suspects=8 steps_left=28
185
+ Step 4: FLAG acc_0036 flagged=2/10 suspects=8 steps_left=28
186
+ Step 5: INSPECT acc_0001 flagged=2/10 suspects=8 steps_left=27
187
+ Step 6: FLAG acc_0001 flagged=3/10 suspects=7 steps_left=27
188
+ Step 7: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=26
189
+ Step 8: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=26
190
+ Step 9: INSPECT acc_0012 flagged=4/10 suspects=6 steps_left=25
191
+ Step 10: FLAG acc_0012 flagged=5/10 suspects=5 steps_left=25
192
+ Step 11: INSPECT acc_0000 flagged=5/10 suspects=5 steps_left=24
193
+ Step 12: FLAG acc_0000 flagged=6/10 suspects=4 steps_left=24
194
+ Step 13: INSPECT acc_0027 flagged=6/10 suspects=4 steps_left=23
195
+ Step 14: FLAG acc_0027 flagged=7/10 suspects=3 steps_left=23
196
+ Step 15: INSPECT acc_0047 flagged=7/10 suspects=3 steps_left=22
197
+ Step 16: FLAG acc_0047 flagged=8/10 suspects=2 steps_left=22
198
+ Step 17: INSPECT acc_0007 flagged=8/10 suspects=2 steps_left=21
199
+ Step 18: FLAG acc_0007 flagged=9/10 suspects=1 steps_left=21
200
+ Step 19: INSPECT acc_0028 flagged=9/10 suspects=1 steps_left=20
201
+ Step 20: FLAG acc_0028 flagged=10/10 suspects=0 steps_left=20
202
+ Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=20
203
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.90
204
+ ★ GRADER SCORE: 0.9667
205
+
206
+ --- LLM Agent: task=easy, seed=1, model=Bedrock/nvidia.nemotron-super-3-120b ---
207
+ Step 1: INSPECT acc_0034 flagged=0/10 suspects=0 steps_left=29
208
+ Step 2: INSPECT acc_0003 flagged=0/10 suspects=0 steps_left=28
209
+ Step 3: INSPECT acc_0049 flagged=0/10 suspects=0 steps_left=27
210
+ Step 4: INSPECT acc_0006 flagged=0/10 suspects=0 steps_left=26
211
+ Step 5: INSPECT acc_0047 flagged=0/10 suspects=0 steps_left=25
212
+ Step 6: FLAG acc_0047 flagged=1/10 suspects=9 steps_left=25
213
+ Step 7: INSPECT acc_0009 flagged=1/10 suspects=9 steps_left=24
214
+ Step 8: FLAG acc_0009 flagged=2/10 suspects=8 steps_left=24
215
+ Step 9: INSPECT acc_0046 flagged=2/10 suspects=8 steps_left=23
216
+ Step 10: FLAG acc_0046 flagged=3/10 suspects=7 steps_left=23
217
+ Step 11: INSPECT acc_0014 flagged=3/10 suspects=7 steps_left=22
218
+ Step 12: FLAG acc_0014 flagged=4/10 suspects=6 steps_left=22
219
+ Step 13: INSPECT acc_0021 flagged=4/10 suspects=6 steps_left=21
220
+ Step 14: FLAG acc_0021 flagged=5/10 suspects=5 steps_left=21
221
+ Step 15: INSPECT acc_0002 flagged=5/10 suspects=5 steps_left=20
222
+ Step 16: FLAG acc_0002 flagged=6/10 suspects=4 steps_left=20
223
+ Step 17: INSPECT acc_0048 flagged=6/10 suspects=4 steps_left=19
224
+ Step 18: FLAG acc_0048 flagged=7/10 suspects=3 steps_left=19
225
+ Step 19: INSPECT acc_0029 flagged=7/10 suspects=3 steps_left=18
226
+ Step 20: FLAG acc_0029 flagged=8/10 suspects=2 steps_left=18
227
+ Step 21: INSPECT acc_0015 flagged=8/10 suspects=2 steps_left=17
228
+ Step 22: FLAG acc_0015 flagged=9/10 suspects=1 steps_left=17
229
+ Step 23: INSPECT acc_0005 flagged=9/10 suspects=1 steps_left=16
230
+ Step 24: FLAG acc_0005 flagged=10/10 suspects=0 steps_left=16
231
+ Step 25: SUBMIT flagged=10/10 suspects=0 steps_left=16
232
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.86
233
+ ★ GRADER SCORE: 0.9533
234
+
235
+ --- LLM Agent: task=easy, seed=2, model=Bedrock/nvidia.nemotron-super-3-120b ---
236
+ Step 1: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=29
237
+ Step 2: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=28
238
+ Step 3: INSPECT acc_0025 flagged=0/10 suspects=0 steps_left=27
239
+ Step 4: INSPECT acc_0026 flagged=0/10 suspects=0 steps_left=26
240
+ Step 5: INSPECT acc_0038 flagged=0/10 suspects=0 steps_left=25
241
+ Step 6: INSPECT acc_0029 flagged=0/10 suspects=0 steps_left=24
242
+ Step 7: FLAG acc_0029 flagged=1/10 suspects=9 steps_left=24
243
+ Step 8: INSPECT acc_0006 flagged=1/10 suspects=9 steps_left=23
244
+ Step 9: FLAG acc_0006 flagged=2/10 suspects=8 steps_left=23
245
+ Step 10: INSPECT acc_0033 flagged=2/10 suspects=8 steps_left=22
246
+ Step 11: FLAG acc_0033 flagged=3/10 suspects=7 steps_left=22
247
+ Step 12: INSPECT acc_0015 flagged=3/10 suspects=7 steps_left=21
248
+ Step 13: FLAG acc_0015 flagged=4/10 suspects=6 steps_left=21
249
+ Step 14: INSPECT acc_0022 flagged=4/10 suspects=6 steps_left=20
250
+ Step 15: FLAG acc_0022 flagged=5/10 suspects=5 steps_left=20
251
+ Step 16: INSPECT acc_0009 flagged=5/10 suspects=5 steps_left=19
252
+ Step 17: FLAG acc_0009 flagged=6/10 suspects=4 steps_left=19
253
+ Step 18: INSPECT acc_0004 flagged=6/10 suspects=4 steps_left=18
254
+ Step 19: FLAG acc_0004 flagged=7/10 suspects=3 steps_left=18
255
+ Step 20: INSPECT acc_0024 flagged=7/10 suspects=3 steps_left=17
256
+ Step 21: FLAG acc_0024 flagged=8/10 suspects=2 steps_left=17
257
+ Step 22: INSPECT acc_0049 flagged=8/10 suspects=2 steps_left=16
258
+ Step 23: FLAG acc_0049 flagged=9/10 suspects=1 steps_left=16
259
+ Step 24: INSPECT acc_0035 flagged=9/10 suspects=1 steps_left=15
260
+ Step 25: FLAG acc_0035 flagged=10/10 suspects=0 steps_left=15
261
+ Step 26: SUBMIT flagged=10/10 suspects=0 steps_left=15
262
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.85
263
+ ★ GRADER SCORE: 0.9500
264
+
265
+ easy: scores=['0.967', '0.953', '0.950'] mean=0.9567 var=0.000052
266
+
267
+ --- LLM Agent: task=medium, seed=0, model=Bedrock/nvidia.nemotron-super-3-120b ---
268
+ Step 1: INSPECT acc_0008 flagged=0/10 suspects=0 steps_left=49
269
+ Step 2: INSPECT acc_0074 flagged=0/10 suspects=0 steps_left=48
270
+ Step 3: INSPECT acc_0179 flagged=0/10 suspects=0 steps_left=47
271
+ Step 4: INSPECT acc_0096 flagged=0/10 suspects=0 steps_left=46
272
+ Step 5: INSPECT acc_0177 flagged=0/10 suspects=0 steps_left=45
273
+ Step 6: INSPECT acc_0122 flagged=0/10 suspects=0 steps_left=44
274
+ Step 7: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=43
275
+ Step 8: INSPECT acc_0174 flagged=0/10 suspects=0 steps_left=42
276
+ Step 9: INSPECT acc_0144 flagged=0/10 suspects=0 steps_left=41
277
+ Step 10: FLAG acc_0144 flagged=1/10 suspects=8 steps_left=41
278
+ Step 11: INSPECT acc_0181 flagged=1/10 suspects=8 steps_left=40
279
+ Step 12: FLAG acc_0181 flagged=2/10 suspects=8 steps_left=40
280
+ Step 13: INSPECT acc_0022 flagged=2/10 suspects=8 steps_left=39
281
+ Step 14: FLAG acc_0022 flagged=3/10 suspects=7 steps_left=39
282
+ Step 15: INSPECT acc_0092 flagged=3/10 suspects=7 steps_left=38
283
+ Step 16: FLAG acc_0092 flagged=4/10 suspects=6 steps_left=38
284
+ Step 17: INSPECT acc_0097 flagged=4/10 suspects=6 steps_left=37
285
+ Step 18: FLAG acc_0097 flagged=5/10 suspects=5 steps_left=37
286
+ Step 19: INSPECT acc_0187 flagged=5/10 suspects=5 steps_left=36
287
+ Step 20: FLAG acc_0187 flagged=6/10 suspects=4 steps_left=36
288
+ Step 21: INSPECT acc_0093 flagged=6/10 suspects=4 steps_left=35
289
+ Step 22: FLAG acc_0093 flagged=7/10 suspects=3 steps_left=34
290
+ Step 23: INSPECT acc_0172 flagged=7/10 suspects=3 steps_left=33
291
+ Step 24: FLAG acc_0172 flagged=8/10 suspects=2 steps_left=33
292
+ Step 25: INSPECT acc_0058 flagged=8/10 suspects=2 steps_left=31
293
+ Step 26: FLAG acc_0058 flagged=9/10 suspects=1 steps_left=31
294
+ Step 27: INSPECT acc_0131 flagged=9/10 suspects=1 steps_left=29
295
+ Step 28: FLAG acc_0131 flagged=10/10 suspects=0 steps_left=29
296
+ Step 29: SUBMIT flagged=10/10 suspects=0 steps_left=29
297
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.79
298
+ ★ GRADER SCORE: 0.9580
299
+
300
+ --- LLM Agent: task=medium, seed=1, model=Bedrock/nvidia.nemotron-super-3-120b ---
301
+ Step 1: INSPECT acc_0171 flagged=0/10 suspects=0 steps_left=49
302
+ Step 2: INSPECT acc_0099 flagged=1/10 suspects=3 steps_left=48
303
+ Step 3: INSPECT acc_0012 flagged=1/10 suspects=3 steps_left=46
304
+ Step 4: FLAG acc_0012 flagged=2/10 suspects=7 steps_left=46
305
+ Step 5: INSPECT acc_0033 flagged=2/10 suspects=7 steps_left=45
306
+ Step 6: FLAG acc_0033 flagged=3/10 suspects=7 steps_left=44
307
+ Step 7: INSPECT acc_0174 flagged=3/10 suspects=7 steps_left=43
308
+ Step 8: FLAG acc_0174 flagged=4/10 suspects=6 steps_left=43
309
+ Step 9: INSPECT acc_0187 flagged=4/10 suspects=6 steps_left=42
310
+ Step 10: FLAG acc_0187 flagged=5/10 suspects=5 steps_left=42
311
+ Step 11: INSPECT acc_0079 flagged=5/10 suspects=5 steps_left=41
312
+ Step 12: FLAG acc_0079 flagged=6/10 suspects=4 steps_left=41
313
+ Step 13: INSPECT acc_0019 flagged=6/10 suspects=4 steps_left=40
314
+ Step 14: FLAG acc_0019 flagged=7/10 suspects=3 steps_left=40
315
+ Step 15: INSPECT acc_0023 flagged=7/10 suspects=3 steps_left=39
316
+ Step 16: FLAG acc_0023 flagged=8/10 suspects=2 steps_left=39
317
+ Step 17: INSPECT acc_0032 flagged=8/10 suspects=2 steps_left=38
318
+ Step 18: FLAG acc_0032 flagged=9/10 suspects=1 steps_left=38
319
+ Step 19: INSPECT acc_0146 flagged=9/10 suspects=1 steps_left=37
320
+ Step 20: FLAG acc_0146 flagged=10/10 suspects=0 steps_left=37
321
+ Step 21: SUBMIT flagged=10/10 suspects=0 steps_left=37
322
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=18.87
323
+ ★ GRADER SCORE: 0.9740
324
+
325
+ --- LLM Agent: task=medium, seed=2, model=Bedrock/nvidia.nemotron-super-3-120b ---
326
+ Step 1: INSPECT acc_0189 flagged=0/10 suspects=0 steps_left=49
327
+ Step 2: INSPECT acc_0107 flagged=0/10 suspects=0 steps_left=48
328
+ Step 3: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=47
329
+ Step 4: INSPECT acc_0030 flagged=0/10 suspects=0 steps_left=46
330
+ Step 5: INSPECT acc_0041 flagged=0/10 suspects=0 steps_left=45
331
+ Step 6: INSPECT acc_0054 flagged=0/10 suspects=0 steps_left=44
332
+ Step 7: INSPECT acc_0199 flagged=0/10 suspects=0 steps_left=43
333
+ Step 8: INSPECT acc_0181 flagged=0/10 suspects=0 steps_left=42
334
+ Step 9: INSPECT acc_0166 flagged=0/10 suspects=0 steps_left=41
335
+ Step 10: INSPECT acc_0098 flagged=0/10 suspects=0 steps_left=40
336
+ Step 11: INSPECT acc_0121 flagged=0/10 suspects=0 steps_left=39
337
+ Step 12: INSPECT acc_0053 flagged=0/10 suspects=0 steps_left=38
338
+ Step 13: INSPECT acc_0103 flagged=0/10 suspects=0 steps_left=37
339
+ Step 14: INSPECT acc_0000 flagged=0/10 suspects=0 steps_left=36
340
+ Step 15: INSPECT acc_0168 flagged=0/10 suspects=0 steps_left=35
341
+ Step 16: INSPECT acc_0040 flagged=0/10 suspects=0 steps_left=34
342
+ Step 17: INSPECT acc_0149 flagged=0/10 suspects=0 steps_left=33
343
+ Step 18: INSPECT acc_0064 flagged=0/10 suspects=0 steps_left=32
344
+ Step 19: INSPECT acc_0016 flagged=0/10 suspects=0 steps_left=31
345
+ Step 20: INSPECT acc_0105 flagged=0/10 suspects=0 steps_left=30
346
+ Step 21: INSPECT acc_0035 flagged=0/10 suspects=0 steps_left=29
347
+ Step 22: FLAG acc_0035 flagged=1/10 suspects=9 steps_left=29
348
+ Step 23: INSPECT acc_0020 flagged=1/10 suspects=9 steps_left=28
349
+ Step 24: FLAG acc_0020 flagged=2/10 suspects=8 steps_left=28
350
+ Step 25: INSPECT acc_0036 flagged=2/10 suspects=8 steps_left=27
351
+ Step 26: FLAG acc_0036 flagged=3/10 suspects=7 steps_left=27
352
+ Step 27: INSPECT acc_0050 flagged=3/10 suspects=7 steps_left=26
353
+ Step 28: FLAG acc_0050 flagged=4/10 suspects=6 steps_left=26
354
+ Step 29: INSPECT acc_0051 flagged=4/10 suspects=6 steps_left=25
355
+ Step 30: FLAG acc_0051 flagged=5/10 suspects=5 steps_left=25
356
+ Step 31: INSPECT acc_0085 flagged=5/10 suspects=5 steps_left=24
357
+ Step 32: FLAG acc_0085 flagged=6/10 suspects=4 steps_left=24
358
+ Step 33: INSPECT acc_0177 flagged=6/10 suspects=4 steps_left=23
359
+ Step 34: FLAG acc_0177 flagged=7/10 suspects=3 steps_left=23
360
+ Step 35: INSPECT acc_0170 flagged=7/10 suspects=3 steps_left=22
361
+ Step 36: FLAG acc_0170 flagged=8/10 suspects=2 steps_left=22
362
+ Step 37: INSPECT acc_0055 flagged=8/10 suspects=2 steps_left=21
363
+ Step 38: FLAG acc_0055 flagged=9/10 suspects=1 steps_left=21
364
+ Step 39: INSPECT acc_0094 flagged=9/10 suspects=1 steps_left=20
365
+ Step 40: FLAG acc_0094 flagged=10/10 suspects=0 steps_left=20
366
+ Step 41: SUBMIT flagged=10/10 suspects=0 steps_left=20
367
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.70
368
+ ★ GRADER SCORE: 0.9400
369
+
370
+ medium: scores=['0.958', '0.974', '0.940'] mean=0.9573 var=0.000193
371
+
372
+ --- LLM Agent: task=hard, seed=0, model=Bedrock/nvidia.nemotron-super-3-120b ---
373
+ Step 1: INSPECT acc_0704 flagged=0/10 suspects=0 steps_left=79
374
+ Step 2: INSPECT acc_0289 flagged=0/10 suspects=0 steps_left=78
375
+ Step 3: INSPECT acc_0826 flagged=0/10 suspects=0 steps_left=77
376
+ Step 4: INSPECT acc_0927 flagged=0/10 suspects=0 steps_left=76
377
+ Step 5: INSPECT acc_0441 flagged=0/10 suspects=0 steps_left=75
378
+ Step 6: INSPECT acc_0871 flagged=0/10 suspects=0 steps_left=74
379
+ Step 7: INSPECT acc_0880 flagged=0/10 suspects=0 steps_left=73
380
+ Step 8: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=72
381
+ Step 9: INSPECT acc_0939 flagged=0/10 suspects=0 steps_left=71
382
+ Step 10: INSPECT acc_0070 flagged=0/10 suspects=0 steps_left=70
383
+ Step 11: INSPECT acc_0915 flagged=0/10 suspects=0 steps_left=69
384
+ Step 12: INSPECT acc_0443 flagged=0/10 suspects=0 steps_left=68
385
+ Step 13: INSPECT acc_0970 flagged=0/10 suspects=0 steps_left=67
386
+ Step 14: INSPECT acc_0028 flagged=0/10 suspects=0 steps_left=66
387
+ Step 15: INSPECT acc_0792 flagged=0/10 suspects=0 steps_left=65
388
+ Step 16: INSPECT acc_0579 flagged=0/10 suspects=0 steps_left=64
389
+ Step 17: INSPECT acc_0037 flagged=0/10 suspects=0 steps_left=63
390
+ Step 18: INSPECT acc_0295 flagged=0/10 suspects=0 steps_left=62
391
+ Step 19: INSPECT acc_0438 flagged=0/10 suspects=0 steps_left=61
392
+ Step 20: INSPECT acc_0439 flagged=0/10 suspects=0 steps_left=60
393
+ Step 21: FLAG acc_0439 flagged=1/10 suspects=5 steps_left=60
394
+ Step 22: INSPECT acc_0237 flagged=1/10 suspects=5 steps_left=59
395
+ Step 23: FLAG acc_0237 flagged=2/10 suspects=6 steps_left=59
396
+ Step 24: INSPECT acc_0621 flagged=2/10 suspects=6 steps_left=58
397
+ Step 25: FLAG acc_0621 flagged=3/10 suspects=6 steps_left=58
398
+ Step 26: INSPECT acc_0389 flagged=3/10 suspects=6 steps_left=57
399
+ Step 27: INSPECT acc_0160 flagged=3/10 suspects=6 steps_left=56
400
+ Step 28: FLAG acc_0389 flagged=4/10 suspects=6 steps_left=56
401
+ Step 29: FLAG acc_0160 flagged=5/10 suspects=5 steps_left=56
402
+ Step 30: INSPECT acc_0549 flagged=5/10 suspects=5 steps_left=55
403
+ Step 31: FLAG acc_0549 flagged=6/10 suspects=4 steps_left=55
404
+ Step 32: INSPECT acc_0658 flagged=6/10 suspects=4 steps_left=54
405
+ Step 33: FLAG acc_0658 flagged=7/10 suspects=3 steps_left=54
406
+ Step 34: INSPECT acc_0290 flagged=7/10 suspects=3 steps_left=53
407
+ Step 35: FLAG acc_0290 flagged=8/10 suspects=2 steps_left=53
408
+ Step 36: INSPECT acc_0124 flagged=8/10 suspects=2 steps_left=52
409
+ Step 37: FLAG acc_0124 flagged=9/10 suspects=1 steps_left=52
410
+ Step 38: INSPECT acc_0507 flagged=9/10 suspects=1 steps_left=51
411
+ Step 39: FLAG acc_0507 flagged=10/10 suspects=0 steps_left=51
412
+ Step 40: SUBMIT flagged=10/10 suspects=0 steps_left=51
413
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.71
414
+ ★ GRADER SCORE: 0.9637
415
+
416
+ --- LLM Agent: task=hard, seed=1, model=Bedrock/nvidia.nemotron-super-3-120b ---
417
+ Step 1: INSPECT acc_0014 flagged=0/10 suspects=0 steps_left=79
418
+ Step 2: INSPECT acc_0835 flagged=0/10 suspects=0 steps_left=78
419
+ Step 3: INSPECT acc_0855 flagged=0/10 suspects=0 steps_left=77
420
+ Step 4: INSPECT acc_0930 flagged=0/10 suspects=0 steps_left=76
421
+ Step 5: INSPECT acc_0336 flagged=0/10 suspects=0 steps_left=75
422
+ Step 6: INSPECT acc_0929 flagged=0/10 suspects=0 steps_left=74
423
+ Step 7: INSPECT acc_0076 flagged=0/10 suspects=0 steps_left=73
424
+ Step 8: INSPECT acc_0543 flagged=0/10 suspects=0 steps_left=72
425
+ Step 9: INSPECT acc_0590 flagged=0/10 suspects=0 steps_left=71
426
+ Step 10: INSPECT acc_0401 flagged=0/10 suspects=0 steps_left=70
427
+ Step 11: INSPECT acc_0322 flagged=0/10 suspects=0 steps_left=69
428
+ Step 12: INSPECT acc_0154 flagged=0/10 suspects=0 steps_left=68
429
+ Step 13: INSPECT acc_0374 flagged=0/10 suspects=0 steps_left=67
430
+ Step 14: INSPECT acc_0549 flagged=0/10 suspects=0 steps_left=66
431
+ Step 15: INSPECT acc_0903 flagged=0/10 suspects=0 steps_left=65
432
+ Step 16: INSPECT acc_0976 flagged=0/10 suspects=0 steps_left=64
433
+ Step 17: INSPECT acc_0620 flagged=0/10 suspects=0 steps_left=63
434
+ Step 18: INSPECT acc_0017 flagged=0/10 suspects=0 steps_left=62
435
+ Step 19: INSPECT acc_0222 flagged=0/10 suspects=0 steps_left=61
436
+ Step 20: INSPECT acc_0536 flagged=0/10 suspects=0 steps_left=60
437
+ Step 21: INSPECT acc_0112 flagged=0/10 suspects=0 steps_left=59
438
+ Step 22: INSPECT acc_0577 flagged=0/10 suspects=0 steps_left=58
439
+ Step 23: INSPECT acc_0517 flagged=0/10 suspects=0 steps_left=57
440
+ Step 24: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=56
441
+ Step 25: INSPECT acc_0167 flagged=0/10 suspects=0 steps_left=55
442
+ Step 26: INSPECT acc_0697 flagged=0/10 suspects=0 steps_left=54
443
+ Step 27: INSPECT acc_0271 flagged=0/10 suspects=0 steps_left=53
444
+ Step 28: INSPECT acc_0681 flagged=0/10 suspects=0 steps_left=52
445
+ Step 29: INSPECT acc_0530 flagged=0/10 suspects=0 steps_left=51
446
+ Step 30: INSPECT acc_0353 flagged=0/10 suspects=0 steps_left=50
447
+ Step 31: INSPECT acc_0652 flagged=0/10 suspects=0 steps_left=49
448
+ Step 32: INSPECT acc_0777 flagged=0/10 suspects=0 steps_left=48
449
+ Step 33: INSPECT acc_0265 flagged=0/10 suspects=0 steps_left=47
450
+ Step 34: INSPECT acc_0788 flagged=0/10 suspects=0 steps_left=46
451
+ Step 35: INSPECT acc_0033 flagged=0/10 suspects=0 steps_left=45
452
+ Step 36: INSPECT acc_0187 flagged=0/10 suspects=0 steps_left=44
453
+ Step 37: INSPECT acc_0445 flagged=0/10 suspects=0 steps_left=43
454
+ Step 38: INSPECT acc_0846 flagged=0/10 suspects=0 steps_left=42
455
+ Step 39: INSPECT acc_0659 flagged=0/10 suspects=0 steps_left=41
456
+ Step 40: INSPECT acc_0768 flagged=0/10 suspects=0 steps_left=40
457
+ Step 41: INSPECT acc_0677 flagged=0/10 suspects=0 steps_left=39
458
+ Step 42: INSPECT acc_0539 flagged=0/10 suspects=0 steps_left=38
459
+ Step 43: INSPECT acc_0742 flagged=1/10 suspects=10 steps_left=37
460
+ Step 44: INSPECT acc_0876 flagged=1/10 suspects=10 steps_left=35
461
+ Step 45: INSPECT acc_0884 flagged=1/10 suspects=10 steps_left=33
462
+ Step 46: INSPECT acc_0245 flagged=1/10 suspects=10 steps_left=32
463
+ Step 47: INSPECT acc_0223 flagged=1/10 suspects=10 steps_left=31
464
+ Step 48: INSPECT acc_0647 flagged=1/10 suspects=10 steps_left=30
465
+ Step 49: INSPECT acc_0994 flagged=1/10 suspects=10 steps_left=29
466
+ Step 50: INSPECT acc_0811 flagged=1/10 suspects=10 steps_left=28
467
+ Step 51: INSPECT acc_0794 flagged=1/10 suspects=10 steps_left=27
468
+ Step 52: INSPECT acc_0582 flagged=1/10 suspects=10 steps_left=26
469
+ Step 53: INSPECT acc_0503 flagged=1/10 suspects=10 steps_left=25
470
+ Step 54: FLAG acc_0174 flagged=1/10 suspects=10 steps_left=25
471
+ Step 55: INSPECT acc_0639 flagged=1/10 suspects=10 steps_left=24
472
+ Step 56: INSPECT acc_0494 flagged=1/10 suspects=10 steps_left=23
473
+ Step 57: INSPECT acc_0898 flagged=1/10 suspects=10 steps_left=22
474
+ Step 58: INSPECT acc_0553 flagged=1/10 suspects=10 steps_left=21
475
+ Step 59: INSPECT acc_0588 flagged=1/10 suspects=10 steps_left=20
476
+ Step 60: INSPECT acc_0194 flagged=1/10 suspects=10 steps_left=19
477
+ Step 61: INSPECT acc_0810 flagged=1/10 suspects=10 steps_left=18
478
+ Step 62: INSPECT acc_0355 flagged=1/10 suspects=10 steps_left=17
479
+ Step 63: INSPECT acc_0363 flagged=1/10 suspects=10 steps_left=16
480
+ Step 64: INSPECT acc_0221 flagged=1/10 suspects=10 steps_left=15
481
+ Step 65: INSPECT acc_0580 flagged=1/10 suspects=10 steps_left=14
482
+ Step 66: INSPECT acc_0534 flagged=1/10 suspects=10 steps_left=13
483
+ Step 67: INSPECT acc_0778 flagged=1/10 suspects=10 steps_left=12
484
+ Step 68: INSPECT acc_0998 flagged=1/10 suspects=10 steps_left=11
485
+ Step 69: INSPECT acc_0233 flagged=1/10 suspects=10 steps_left=10
486
+ Step 70: INSPECT acc_0052 flagged=1/10 suspects=10 steps_left=9
487
+ Step 71: INSPECT acc_0813 flagged=1/10 suspects=10 steps_left=8
488
+ Step 72: INSPECT acc_0035 flagged=1/10 suspects=10 steps_left=7
489
+ Step 73: INSPECT acc_0667 flagged=1/10 suspects=10 steps_left=6
490
+ Step 74: INSPECT acc_0019 flagged=1/10 suspects=10 steps_left=5
491
+ Step 75: INSPECT acc_0959 flagged=1/10 suspects=10 steps_left=4
492
+ Step 76: INSPECT acc_0212 flagged=1/10 suspects=10 steps_left=3
493
+ Step 77: INSPECT acc_0776 flagged=1/10 suspects=10 steps_left=2
494
+ Step 78: INSPECT acc_0049 flagged=1/10 suspects=10 steps_left=1
495
+ Step 79: FLAG acc_0174 flagged=1/10 suspects=10 steps_left=1
496
+ Step 80: FLAG acc_0434 flagged=2/10 suspects=18 steps_left=1
497
+ Step 81: INSPECT acc_0369 flagged=2/10 suspects=18 steps_left=0
498
+ → Episode ended: [LOSS] TP=0 FP=2 FN=10 Recall=0.00 Precision=0.00 Episode reward=-10.80
499
+ ★ GRADER SCORE: 0.0000
500
+
501
+ --- LLM Agent: task=hard, seed=2, model=Bedrock/nvidia.nemotron-super-3-120b ---
502
+ Step 1: INSPECT acc_0813 flagged=0/10 suspects=0 steps_left=79
503
+ Step 2: INSPECT acc_0430 flagged=0/10 suspects=0 steps_left=78
504
+ Step 3: INSPECT acc_0817 flagged=0/10 suspects=0 steps_left=77
505
+ Step 4: INSPECT acc_0175 flagged=0/10 suspects=0 steps_left=76
506
+ Step 5: INSPECT acc_0523 flagged=0/10 suspects=0 steps_left=75
507
+ Step 6: INSPECT acc_0113 flagged=0/10 suspects=0 steps_left=74
508
+ Step 7: INSPECT acc_0797 flagged=0/10 suspects=0 steps_left=73
509
+ Step 8: INSPECT acc_0478 flagged=0/10 suspects=0 steps_left=72
510
+ Step 9: INSPECT acc_0861 flagged=0/10 suspects=0 steps_left=71
511
+ Step 10: INSPECT acc_0836 flagged=0/10 suspects=0 steps_left=70
512
+ Step 11: INSPECT acc_0926 flagged=0/10 suspects=0 steps_left=69
513
+ Step 12: INSPECT acc_0664 flagged=0/10 suspects=0 steps_left=68
514
+ Step 13: INSPECT acc_0255 flagged=0/10 suspects=0 steps_left=67
515
+ Step 14: INSPECT acc_0938 flagged=0/10 suspects=0 steps_left=66
516
+ Step 15: INSPECT acc_0672 flagged=0/10 suspects=0 steps_left=65
517
+ Step 16: FLAG acc_0672 flagged=1/10 suspects=6 steps_left=65
518
+ Step 17: INSPECT acc_0659 flagged=1/10 suspects=6 steps_left=64
519
+ Step 18: FLAG acc_0659 flagged=2/10 suspects=5 steps_left=64
520
+ Step 19: INSPECT acc_0290 flagged=2/10 suspects=5 steps_left=63
521
+ Step 20: FLAG acc_0290 flagged=3/10 suspects=5 steps_left=63
522
+ Step 21: INSPECT acc_0339 flagged=3/10 suspects=5 steps_left=62
523
+ Step 22: FLAG acc_0339 flagged=4/10 suspects=6 steps_left=62
524
+ Step 23: INSPECT acc_0544 flagged=4/10 suspects=6 steps_left=61
525
+ Step 24: FLAG acc_0544 flagged=5/10 suspects=5 steps_left=61
526
+ Step 25: INSPECT acc_0696 flagged=5/10 suspects=5 steps_left=60
527
+ Step 26: FLAG acc_0696 flagged=6/10 suspects=4 steps_left=60
528
+ Step 27: INSPECT acc_0541 flagged=6/10 suspects=4 steps_left=59
529
+ Step 28: FLAG acc_0541 flagged=7/10 suspects=3 steps_left=59
530
+ Step 29: INSPECT acc_0793 flagged=7/10 suspects=3 steps_left=58
531
+ Step 30: FLAG acc_0793 flagged=8/10 suspects=2 steps_left=58
532
+ Step 31: INSPECT acc_0214 flagged=8/10 suspects=2 steps_left=57
533
+ Step 32: FLAG acc_0214 flagged=9/10 suspects=1 steps_left=57
534
+ Step 33: INSPECT acc_0112 flagged=9/10 suspects=1 steps_left=56
535
+ Step 34: FLAG acc_0112 flagged=10/10 suspects=0 steps_left=56
536
+ Step 35: SUBMIT flagged=10/10 suspects=0 steps_left=56
537
+ → Episode ended: [WIN] TP=10 FP=0 FN=0 Recall=1.00 Precision=1.00 Episode reward=17.76
538
+ ★ GRADER SCORE: 0.9700
539
+
540
+ hard: scores=['0.964', '0.000', '0.970'] mean=0.6446 var=0.207740
541
+
542
+ ============================================================
543
+ EVALUATION COMPLETE
544
+ ============================================================
545
+ ubuntu@ip-172-31-33-59:~/meta/meta-hack-26$
runs/metrics.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
server/app.py CHANGED
@@ -11,8 +11,9 @@ sys.path.insert(0, str(Path(__file__).parent))
11
  sys.path.insert(0, str(Path(__file__).parent.parent))
12
 
13
  from fastapi import FastAPI, HTTPException
14
- from fastapi.responses import HTMLResponse
15
  from fastapi.middleware.cors import CORSMiddleware
 
16
  from pydantic import BaseModel
17
  from typing import Any, Dict, Optional
18
 
@@ -20,7 +21,7 @@ from models import FakeGangAction, FakeGangObservation, FakeGangState, ActionTyp
20
  from environment import FakeGangEnvironment
21
 
22
  # ---------------------------------------------------------------------------
23
- # App
24
  # ---------------------------------------------------------------------------
25
 
26
  app = FastAPI(
@@ -28,19 +29,19 @@ app = FastAPI(
28
  description="RL environment for detecting coordinated fake account rings in social networks.",
29
  version="1.0.0",
30
  )
 
31
 
32
- app.add_middleware(
33
- CORSMiddleware,
34
- allow_origins=["*"], allow_methods=["*"], allow_headers=["*"],
35
- )
 
 
 
 
36
 
37
  _env = FakeGangEnvironment()
38
 
39
-
40
- # ---------------------------------------------------------------------------
41
- # Schemas
42
- # ---------------------------------------------------------------------------
43
-
44
  class ResetRequest(BaseModel):
45
  task: str = "easy"
46
  seed: Optional[int] = None
@@ -52,7 +53,6 @@ class StepResponse(BaseModel):
52
  reward: Optional[float]
53
  message: str
54
 
55
-
56
  # ---------------------------------------------------------------------------
57
  # OpenEnv API endpoints
58
  # ---------------------------------------------------------------------------
@@ -100,9 +100,8 @@ def grader():
100
  @app.get("/metadata")
101
  def metadata():
102
  return {
103
- "name": "graphstrike",
104
  "description": "RL environment for detecting coordinated fake account rings in social networks.",
105
- "version": "1.0.0", "author": "Pandago",
106
  "tags": ["social-network", "fraud-detection", "graph", "rl"],
107
  }
108
 
@@ -137,124 +136,662 @@ def baseline():
137
  return {"scores": scores, "agent": "rule_based"}
138
 
139
 
140
- # HF Spaces probes GET /web to detect if a web UI exists.
141
- # Must return 200 BEFORE Gradio mount (Gradio's catch-all would shadow it).
142
- @app.get("/web", response_class=HTMLResponse)
143
- def web_view():
144
- return """<!DOCTYPE html>
145
- <html><head><meta http-equiv="refresh" content="0;url=/"><title>GraphStrike</title></head>
146
- <body><p>Loading <a href="/">GraphStrike</a>...</p></body></html>"""
147
 
148
 
149
  # ---------------------------------------------------------------------------
150
- # Gradio web interface — mounted at /
151
  # ---------------------------------------------------------------------------
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  try:
154
  import gradio as gr
155
 
 
 
156
  def _fmt_obs(d: dict) -> str:
157
  lines = []
158
- lines.append(f"**Task:** {d.get('task','?')} | **Done:** {d.get('done',False)} | **Steps remaining:** {d.get('steps_remaining','?')}")
 
 
 
 
159
  if d.get('reward') is not None:
160
- lines.append(f"**Reward:** {d['reward']:.2f}")
161
  fl = d.get('flagged_ids', [])
162
- lines.append(f"**Flagged ({len(fl)}/10):** {fl}")
163
- su = d.get('suspect_ids', [])
164
- lines.append(f"**Suspects ({len(su)}):** {su}")
 
 
 
165
  lines.append(f"**Visible:** {len(d.get('visible_account_ids',[]))} IDs | **Inspected:** {len(d.get('inspected_ids',[]))} accounts")
166
  if d.get('evasion_triggered'):
167
- lines.append(f"**Evasion events:** {d.get('evasion_count',0)}")
168
- lines.append(f"**Message:** {d.get('message','')}")
169
  return "\n\n".join(lines)
170
 
171
- def _fmt_profiles(d: dict) -> str:
172
  accs = d.get("visible_accounts", [])
173
  if not accs:
174
- return "No accounts inspected yet. Use **INSPECT** to reveal profiles."
175
- rows = ["| Account | Status | Risk | Node | Beh | Graph | Hub | Photo | Bio | F.Nbrs |",
176
- "|---------|--------|------|------|-----|-------|-----|-------|-----|--------|"]
177
- for a in sorted(accs, key=lambda x: x.get("fake_risk_score",0), reverse=True)[:25]:
178
- rows.append(f"| {a.get('account_id','')} | {a.get('status','?')} | {a.get('fake_risk_score',0):.3f} "
179
- f"| {a.get('node_risk',0):.2f} | {a.get('behavior_risk',0):.2f} | {a.get('graph_risk',0):.2f} "
180
- f"| {a.get('hub_legitimacy_score',0):.2f} | {a.get('photo_reuse_score',0):.2f} "
181
- f"| {a.get('bio_template_score',0):.2f} | {a.get('flagged_neighbor_count',0)} |")
182
- return "\n".join(rows)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
  def gr_reset(task, seed):
185
  try:
186
  obs = _env.reset(task=task, seed=int(seed))
187
- d = obs.model_dump()
188
- return _fmt_obs(d), _fmt_profiles(d), json.dumps(d, indent=2, default=str)
189
  except Exception as e:
190
- return f"**Error:** {e}", "", "{}"
191
 
192
  def gr_step(action_type, account_id):
193
  try:
194
- acc = account_id.strip() if action_type != "submit" else None
195
  action = FakeGangAction(action_type=ActionType(action_type), account_id=acc)
196
- obs = _env.step(action)
197
- d = obs.model_dump()
198
- return _fmt_obs(d), _fmt_profiles(d), json.dumps(d, indent=2, default=str)
199
  except Exception as e:
200
- return f"**Error:** {e}", "", "{}"
201
 
202
  def gr_grader():
203
  if not _env._done:
204
- return "Episode not complete. Call SUBMIT first."
205
- return json.dumps({"score": _env._last_grader_score, "task": _env._task, "episode_id": _env._episode_id}, indent=2)
 
 
 
 
206
 
207
  def gr_baseline():
208
  sys.path.insert(0, str(Path(__file__).parent.parent))
209
  from inference import run_rule_based_episode
210
  scores = {t: run_rule_based_episode(_env, task=t, seed=0) for t in ["easy", "medium", "hard"]}
211
- return json.dumps({"scores": scores, "agent": "rule_based"}, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
213
  with gr.Blocks(title="GraphStrike") as demo:
214
- gr.Markdown(
215
- "# GraphStrike\n"
216
- "### Coordinated Fake Account Ring Detection — OpenEnv RL Environment\n\n"
217
- "Detect all 10 members of a coordinated fake account ring hidden in a social network.\n"
218
- "Use **INSPECT** to reveal profiles, **FLAG** to mark fakes, **SUBMIT** to end.\n\n"
219
- "`/reset` `/step` `/state` `/grader` `/baseline` `/tasks` `/health` — [Swagger](/docs)"
220
- )
221
- with gr.Row():
222
- with gr.Column():
223
- gr.Markdown("#### 1. Start Episode")
224
- task_dd = gr.Dropdown(["easy","medium","hard"], value="easy", label="Task")
225
- seed_in = gr.Number(value=0, label="Seed", precision=0)
226
- reset_btn = gr.Button("Reset Episode", variant="primary", size="lg")
227
- with gr.Column():
228
- gr.Markdown("#### 2. Take Actions")
229
- action_dd = gr.Dropdown(["inspect","investigate_network","flag","unflag","submit"], value="inspect", label="Action Type")
230
- acc_in = gr.Textbox(label="Account ID", placeholder="e.g. acc_0012")
231
- step_btn = gr.Button("Step", variant="primary", size="lg")
232
-
233
- obs_md = gr.Markdown(value="*Click 'Reset Episode' to begin.*")
234
- with gr.Accordion("Account Profiles (sorted by risk)", open=True):
235
- prof_md = gr.Markdown(value="")
236
- with gr.Row():
237
- grader_btn = gr.Button("Get Grader Score")
238
- baseline_btn = gr.Button("Run Baseline (all 3 tasks)")
239
- result_box = gr.Textbox(label="Result", lines=5, interactive=False)
240
- with gr.Accordion("Raw JSON", open=False):
241
- raw_json = gr.Textbox(label="Raw JSON", lines=15, interactive=False)
242
-
243
- reset_btn.click(gr_reset, [task_dd, seed_in], [obs_md, prof_md, raw_json])
244
- step_btn.click(gr_step, [action_dd, acc_in], [obs_md, prof_md, raw_json])
245
- grader_btn.click(gr_grader, [], result_box)
246
- baseline_btn.click(gr_baseline, [], result_box)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
  app = gr.mount_gradio_app(app, demo, path="/")
249
  print("[GraphStrike] Gradio UI mounted at /", flush=True)
250
 
251
  except Exception as exc:
 
252
  print(f"[GraphStrike] Gradio unavailable: {exc}", flush=True)
 
253
 
254
  @app.get("/", response_class=HTMLResponse)
255
  def root_fallback():
256
- return "<html><body><h1>GraphStrike</h1><p>API-only mode. <a href='/docs'>Swagger</a></p></body></html>"
257
-
258
 
259
  # ---------------------------------------------------------------------------
260
  # Entry point
 
11
  sys.path.insert(0, str(Path(__file__).parent.parent))
12
 
13
  from fastapi import FastAPI, HTTPException
14
+ from fastapi.responses import HTMLResponse, RedirectResponse
15
  from fastapi.middleware.cors import CORSMiddleware
16
+ from fastapi.staticfiles import StaticFiles
17
  from pydantic import BaseModel
18
  from typing import Any, Dict, Optional
19
 
 
21
  from environment import FakeGangEnvironment
22
 
23
  # ---------------------------------------------------------------------------
24
+ # App + environment
25
  # ---------------------------------------------------------------------------
26
 
27
  app = FastAPI(
 
29
  description="RL environment for detecting coordinated fake account rings in social networks.",
30
  version="1.0.0",
31
  )
32
+ app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
33
 
34
+ # Serve images at /assets/* and /images/* (used by the Gradio README tab)
35
+ _PROJECT_ROOT = Path(__file__).parent.parent
36
+ _ASSETS_DIR = _PROJECT_ROOT / "assets"
37
+ _IMAGES_DIR = _PROJECT_ROOT / "images"
38
+ if _ASSETS_DIR.exists():
39
+ app.mount("/assets", StaticFiles(directory=str(_ASSETS_DIR)), name="assets")
40
+ if _IMAGES_DIR.exists():
41
+ app.mount("/images", StaticFiles(directory=str(_IMAGES_DIR)), name="images")
42
 
43
  _env = FakeGangEnvironment()
44
 
 
 
 
 
 
45
  class ResetRequest(BaseModel):
46
  task: str = "easy"
47
  seed: Optional[int] = None
 
53
  reward: Optional[float]
54
  message: str
55
 
 
56
  # ---------------------------------------------------------------------------
57
  # OpenEnv API endpoints
58
  # ---------------------------------------------------------------------------
 
100
  @app.get("/metadata")
101
  def metadata():
102
  return {
103
+ "name": "graphstrike", "version": "1.0.0", "author": "Pandago",
104
  "description": "RL environment for detecting coordinated fake account rings in social networks.",
 
105
  "tags": ["social-network", "fraud-detection", "graph", "rl"],
106
  }
107
 
 
136
  return {"scores": scores, "agent": "rule_based"}
137
 
138
 
139
+ # HF Spaces probes /web — redirect to root (must be on FastAPI before Gradio mount)
140
+ @app.get("/web", response_class=RedirectResponse)
141
+ def web_redirect():
142
+ return RedirectResponse(url="/")
 
 
 
143
 
144
 
145
  # ---------------------------------------------------------------------------
146
+ # Gradio UI
147
  # ---------------------------------------------------------------------------
148
 
149
+ import pandas as pd
150
+
151
+ # ── Benchmark data ───────────────────────────────────────────────────────────
152
+
153
+ BENCH_SEED0 = [
154
+ # [Model, Params, Easy, Medium, Hard, Mean] — sorted by Mean desc
155
+ ["Llama 4 Scout 17B", "17B", 0.960, 0.979, 0.976, 0.972],
156
+ ["Ministral 3 8B", "8B", 0.967, 0.964, 0.964, 0.965],
157
+ ["DeepSeek V3.2", "685B", 0.967, 0.960, 0.933, 0.953],
158
+ ["Nemotron Super 3", "49B", 0.930, 0.941, 0.964, 0.945],
159
+ ["Rule-Based Baseline","—", 0.910, 0.906, 0.904, 0.907],
160
+ ["Gemma 3 12B", "12B", 0.900, 0.908, 0.908, 0.905],
161
+ ]
162
+
163
+ BENCH_VARIANCE = [
164
+ # [Model, Easy mean, Easy var, Med mean, Med var, Hard mean, Hard var]
165
+ ["Llama 4 Scout 17B", 0.960, 0.000007, 0.979, 0.000001, 0.976, 0.000063],
166
+ ["Nemotron Super 3", 0.957, 0.000, 0.957, 0.000, 0.645, 0.208],
167
+ ["Ministral 3 8B", 0.958, 0.000, 0.645, 0.208, 0.623, 0.195],
168
+ ["DeepSeek V3.2", 0.640, 0.205, 0.957, 0.000, 0.645, 0.208],
169
+ ["Gemma 3 12B", 0.912, 0.000, 0.917, 0.000, 0.603, 0.182],
170
+ ]
171
+
172
+ PROFILE_HEADERS = ["Account", "Status", "Risk", "Node", "Beh", "Graph", "Hub", "Photo", "Bio", "IP", "F.Nbrs"]
173
+
174
+ # Long-format DataFrame for BarPlot
175
+ _bench_long_rows = []
176
+ for _r in BENCH_SEED0:
177
+ _bench_long_rows += [
178
+ {"Model": _r[0], "Task": "Easy", "Score": _r[2]},
179
+ {"Model": _r[0], "Task": "Medium", "Score": _r[3]},
180
+ {"Model": _r[0], "Task": "Hard", "Score": _r[4]},
181
+ ]
182
+ BENCH_LONG_DF = pd.DataFrame(_bench_long_rows)
183
+
184
+
185
+ # ── HTML table builders ──────────────────────────────────────────────────────
186
+
187
+ def _score_color(s: float) -> str:
188
+ if s >= 0.960: return "#22c55e"
189
+ if s >= 0.930: return "#86efac"
190
+ if s >= 0.910: return "#facc15"
191
+ return "#f97316"
192
+
193
+ def _var_color(v: float) -> str:
194
+ if v < 0.001: return "#22c55e"
195
+ if v < 0.05: return "#facc15"
196
+ return "#f87171"
197
+
198
+ _TH = "padding:11px 16px;font-weight:600;white-space:nowrap;"
199
+ _TD = "padding:10px 16px;white-space:nowrap;"
200
+ _TABLE_WRAP = (
201
+ "overflow-x:auto;border-radius:10px;border:1px solid #1e3a5f;"
202
+ "font-family:'IBM Plex Mono',monospace;font-size:13.5px;"
203
+ )
204
+ _THEAD_BG = "background:#0c2340;"
205
+
206
+ def _leaderboard_html() -> str:
207
+ header = (
208
+ f"<thead><tr style='{_THEAD_BG}'>"
209
+ f"<th style='{_TH}color:#64748b;'>#</th>"
210
+ f"<th style='{_TH}color:#e2e8f0;text-align:left;'>Model</th>"
211
+ f"<th style='{_TH}color:#94a3b8;text-align:center;'>Params</th>"
212
+ f"<th style='{_TH}color:#4ade80;text-align:center;'>Easy</th>"
213
+ f"<th style='{_TH}color:#facc15;text-align:center;'>Medium</th>"
214
+ f"<th style='{_TH}color:#f87171;text-align:center;'>Hard</th>"
215
+ f"<th style='{_TH}color:#c084fc;text-align:center;'>Mean</th>"
216
+ f"</tr></thead>"
217
+ )
218
+ rows = ""
219
+ for i, r in enumerate(BENCH_SEED0):
220
+ bg = "#162032" if i % 2 == 0 else "#0f172a"
221
+ is_base = r[0] == "Rule-Based Baseline"
222
+ name_cell = (
223
+ f"{r[0]} <span style='color:#64748b;font-size:11px;'>(baseline)</span>"
224
+ if is_base else r[0]
225
+ )
226
+ name_color = "#94a3b8" if is_base else "#e2e8f0"
227
+ rows += (
228
+ f"<tr style='background:{bg};'>"
229
+ f"<td style='{_TD}color:#475569;text-align:center;'>{i+1}</td>"
230
+ f"<td style='{_TD}color:{name_color};'>{name_cell}</td>"
231
+ f"<td style='{_TD}color:#64748b;text-align:center;'>{r[1]}</td>"
232
+ + "".join(
233
+ f"<td style='{_TD}color:{_score_color(r[j])};font-weight:700;"
234
+ f"text-align:center;'>{r[j]:.3f}</td>"
235
+ for j in (2, 3, 4)
236
+ )
237
+ + f"<td style='{_TD}color:{_score_color(r[5])};font-weight:800;"
238
+ f"font-size:14px;text-align:center;'>{r[5]:.3f}</td>"
239
+ f"</tr>"
240
+ )
241
+ return f"<div style='{_TABLE_WRAP}'><table style='width:100%;border-collapse:collapse;'>{header}<tbody>{rows}</tbody></table></div>"
242
+
243
+
244
+ def _variance_html() -> str:
245
+ header = (
246
+ f"<thead><tr style='{_THEAD_BG}'>"
247
+ f"<th style='{_TH}color:#e2e8f0;text-align:left;'>Model</th>"
248
+ f"<th style='{_TH}color:#4ade80;text-align:center;'>Easy — mean / var</th>"
249
+ f"<th style='{_TH}color:#facc15;text-align:center;'>Medium — mean / var</th>"
250
+ f"<th style='{_TH}color:#f87171;text-align:center;'>Hard — mean / var</th>"
251
+ f"</tr></thead>"
252
+ )
253
+ rows = ""
254
+ for i, r in enumerate(BENCH_VARIANCE):
255
+ bg = "#162032" if i % 2 == 0 else "#0f172a"
256
+ def cell(mean, var):
257
+ return (
258
+ f"<td style='{_TD}text-align:center;'>"
259
+ f"<span style='color:#e2e8f0;font-weight:600;'>{mean:.3f}</span>"
260
+ f" <span style='color:{_var_color(var)};font-size:11px;'>/ {var:.1e}</span>"
261
+ f"</td>"
262
+ )
263
+ rows += (
264
+ f"<tr style='background:{bg};'>"
265
+ f"<td style='{_TD}color:#e2e8f0;font-weight:500;'>{r[0]}</td>"
266
+ + cell(r[1], r[2]) + cell(r[3], r[4]) + cell(r[5], r[6])
267
+ + "</tr>"
268
+ )
269
+ return f"<div style='{_TABLE_WRAP};margin-top:20px;'><table style='width:100%;border-collapse:collapse;'>{header}<tbody>{rows}</tbody></table></div>"
270
+
271
+
272
+ def _baseline_html() -> str:
273
+ rows_data = [
274
+ ("Easy", 0.9100, "100%", "#4ade80"),
275
+ ("Medium", 0.9060, "84%", "#facc15"),
276
+ ("Hard", 0.9038, "52%", "#f87171"),
277
+ ]
278
+ header = (
279
+ f"<thead><tr style='{_THEAD_BG}'>"
280
+ f"<th style='{_TH}color:#e2e8f0;'>Task</th>"
281
+ f"<th style='{_TH}color:#e2e8f0;text-align:center;'>Score (seed=0)</th>"
282
+ f"<th style='{_TH}color:#e2e8f0;text-align:center;'>Win Rate (50 seeds)</th>"
283
+ f"</tr></thead>"
284
+ )
285
+ rows = ""
286
+ for i, (task, score, wr, col) in enumerate(rows_data):
287
+ bg = "#162032" if i % 2 == 0 else "#0f172a"
288
+ rows += (
289
+ f"<tr style='background:{bg};'>"
290
+ f"<td style='{_TD}color:{col};font-weight:600;'>{task}</td>"
291
+ f"<td style='{_TD}color:#e2e8f0;font-weight:700;text-align:center;'>{score:.4f}</td>"
292
+ f"<td style='{_TD}color:{col};font-weight:600;text-align:center;'>{wr}</td>"
293
+ f"</tr>"
294
+ )
295
+ return f"<div style='{_TABLE_WRAP};margin-top:4px;'><table style='width:100%;border-collapse:collapse;'>{header}<tbody>{rows}</tbody></table></div>"
296
+
297
+
298
  try:
299
  import gradio as gr
300
 
301
+ # ── Observation / profile helpers ─────────────────────────────────────────
302
+
303
  def _fmt_obs(d: dict) -> str:
304
  lines = []
305
+ task = d.get('task', '?').upper()
306
+ done = d.get('done', False)
307
+ steps = d.get('steps_remaining', '?')
308
+ state_label = "Done" if done else "In Progress"
309
+ lines.append(f"### Task: **{task}** | Steps remaining: **{steps}** | {state_label}")
310
  if d.get('reward') is not None:
311
+ lines.append(f"**Final Reward:** `{d['reward']:.2f}`")
312
  fl = d.get('flagged_ids', [])
313
+ lines.append(f"**Flagged ({len(fl)}/10):** " + (" ".join(f"`{f}`" for f in fl) if fl else "*none*"))
314
+ su = d.get('suspect_ids', [])
315
+ ins = set(d.get('inspected_ids', []))
316
+ uninspected_sus = [s for s in su if s not in ins]
317
+ if uninspected_sus:
318
+ lines.append(f"**Suspects — uninspected ({len(uninspected_sus)}):** " + " ".join(f"`{s}`" for s in uninspected_sus))
319
  lines.append(f"**Visible:** {len(d.get('visible_account_ids',[]))} IDs | **Inspected:** {len(d.get('inspected_ids',[]))} accounts")
320
  if d.get('evasion_triggered'):
321
+ lines.append(f"**Evasion events fired:** {d.get('evasion_count', 0)}")
322
+ lines.append(f"\n> {d.get('message', '')}")
323
  return "\n\n".join(lines)
324
 
325
+ def _profile_rows(d: dict) -> list:
326
  accs = d.get("visible_accounts", [])
327
  if not accs:
328
+ return []
329
+ STATUS_MAP = {
330
+ "confirmed_fake": "confirmed_fake [flagged]",
331
+ "suspect": "suspect",
332
+ "normal": "normal",
333
+ }
334
+ rows = []
335
+ for a in sorted(accs, key=lambda x: x.get("fake_risk_score", 0), reverse=True)[:40]:
336
+ rows.append([
337
+ a.get("account_id", ""),
338
+ STATUS_MAP.get(a.get("status", ""), a.get("status", "")),
339
+ round(a.get("fake_risk_score", 0), 3),
340
+ round(a.get("node_risk", 0), 3),
341
+ round(a.get("behavior_risk", 0), 3),
342
+ round(a.get("graph_risk", 0), 3),
343
+ round(a.get("hub_legitimacy_score", 0), 3),
344
+ round(a.get("photo_reuse_score", 0), 3),
345
+ round(a.get("bio_template_score", 0), 3),
346
+ a.get("shared_ip_count", 0),
347
+ a.get("flagged_neighbor_count", 0),
348
+ ])
349
+ return rows
350
+
351
+ def _fmt_visible_ids(d: dict) -> str:
352
+ ins = set(d.get('inspected_ids', []))
353
+ suspects = set(d.get('suspect_ids', []))
354
+ flagged = set(d.get('flagged_ids', []))
355
+ visible = d.get('visible_account_ids', [])
356
+ if not visible:
357
+ return "*No visible accounts yet.*"
358
+ parts = []
359
+ for vid in visible:
360
+ if vid in flagged:
361
+ parts.append(f"**[F]** `{vid}`")
362
+ elif vid in suspects and vid not in ins:
363
+ parts.append(f"**[S]** `{vid}`")
364
+ elif vid in ins:
365
+ parts.append(f"`{vid}`")
366
+ else:
367
+ parts.append(f"`{vid}`")
368
+ return " ".join(parts)
369
+
370
+ # ── Playground callbacks ──────────────────────────────────────────────────
371
 
372
  def gr_reset(task, seed):
373
  try:
374
  obs = _env.reset(task=task, seed=int(seed))
375
+ d = obs.model_dump()
376
+ return _fmt_obs(d), _profile_rows(d), _fmt_visible_ids(d), json.dumps(d, indent=2, default=str)
377
  except Exception as e:
378
+ return f"**Error:** {e}", [], "", "{}"
379
 
380
  def gr_step(action_type, account_id):
381
  try:
382
+ acc = account_id.strip() if action_type != "submit" else None
383
  action = FakeGangAction(action_type=ActionType(action_type), account_id=acc)
384
+ obs = _env.step(action)
385
+ d = obs.model_dump()
386
+ return _fmt_obs(d), _profile_rows(d), _fmt_visible_ids(d), json.dumps(d, indent=2, default=str)
387
  except Exception as e:
388
+ return f"**Error:** {e}", [], "", "{}"
389
 
390
  def gr_grader():
391
  if not _env._done:
392
+ return "Episode not complete. Call SUBMIT first."
393
+ return (
394
+ f"**Score:** `{_env._last_grader_score:.4f}` | "
395
+ f"**Task:** {_env._task} | "
396
+ f"**Episode:** `{_env._episode_id}`"
397
+ )
398
 
399
  def gr_baseline():
400
  sys.path.insert(0, str(Path(__file__).parent.parent))
401
  from inference import run_rule_based_episode
402
  scores = {t: run_rule_based_episode(_env, task=t, seed=0) for t in ["easy", "medium", "hard"]}
403
+ mean = sum(scores.values()) / 3
404
+ return (
405
+ f"**Baseline (rule-based, seed=0)**\n\n"
406
+ f"Easy: `{scores['easy']:.4f}` | Medium: `{scores['medium']:.4f}` | "
407
+ f"Hard: `{scores['hard']:.4f}` | Mean: `{mean:.4f}`"
408
+ )
409
+
410
+ # ── Build Gradio UI ───────────────────────────────────────────────────────
411
+
412
+ # ── README content (rendered as styled HTML) ─────────────────────────────
413
+
414
+ _README_HTML = """
415
+ <style>
416
+ .gs-readme { font-family: 'Inter', system-ui, sans-serif; color: #cbd5e1; line-height: 1.7; max-width: 960px; margin: 0 auto; padding: 8px 4px 32px; }
417
+ .gs-readme h2 { color: #e2e8f0; font-size: 1.12em; font-weight: 700; border-bottom: 1px solid #1e3a5f; padding-bottom: 8px; margin: 32px 0 14px; letter-spacing: -0.2px; }
418
+ .gs-readme h3 { color: #7dd3fc; font-size: 0.97em; font-weight: 600; margin: 20px 0 8px; }
419
+ .gs-readme p { margin: 0 0 10px; font-size: 0.92em; }
420
+ .gs-readme code { background: #0c2340; color: #7dd3fc; padding: 2px 7px; border-radius: 4px; font-family: 'IBM Plex Mono', monospace; font-size: 0.84em; }
421
+ .gs-readme pre { background: #0a1628; border: 1px solid #1e3a5f; border-radius: 8px; padding: 14px 18px; overflow-x: auto; margin: 10px 0 16px; }
422
+ .gs-readme pre code { background: none; padding: 0; color: #93c5fd; font-size: 0.82em; }
423
+ .gs-table { width: 100%; border-collapse: collapse; margin: 10px 0 18px; font-size: 0.86em; }
424
+ .gs-table th { background: #0c2340; color: #94a3b8; font-weight: 600; padding: 9px 14px; text-align: left; border-bottom: 1px solid #1e3a5f; }
425
+ .gs-table td { padding: 8px 14px; border-bottom: 1px solid #0f1e30; color: #cbd5e1; }
426
+ .gs-table tr:nth-child(even) td { background: #060e1a; }
427
+ .gs-badge { display:inline-block; padding: 2px 9px; border-radius: 4px; font-size: 0.78em; font-weight: 700; }
428
+ .gs-badge-easy { background:#052e16; color:#4ade80; border:1px solid #166534; }
429
+ .gs-badge-medium { background:#2d1f00; color:#facc15; border:1px solid #92400e; }
430
+ .gs-badge-hard { background:#2d0a0a; color:#f87171; border:1px solid #7f1d1d; }
431
+ .gs-card { background: #0a1628; border: 1px solid #1e3a5f; border-radius: 10px; padding: 16px 20px; margin: 10px 0; }
432
+ .gs-card h3 { margin-top: 0; }
433
+ .gs-formula { background: #050d18; border-left: 3px solid #3b82f6; padding: 12px 18px; border-radius: 0 8px 8px 0; margin: 12px 0; font-family: 'IBM Plex Mono', monospace; font-size: 0.83em; color: #93c5fd; white-space: pre; overflow-x: auto; }
434
+ .gs-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 12px; margin: 14px 0; }
435
+ .gs-stat { background: #0a1628; border: 1px solid #1e3a5f; border-radius: 8px; padding: 14px 16px; text-align: center; }
436
+ .gs-stat-val { font-size: 1.7em; font-weight: 800; color: #38bdf8; font-family: 'IBM Plex Mono', monospace; display: block; }
437
+ .gs-stat-lbl { font-size: 0.77em; color: #64748b; margin-top: 4px; display: block; }
438
+ .gs-img { width: 100%; border-radius: 10px; border: 1px solid #1e3a5f; margin: 14px 0; display: block; background: #0a1628; }
439
+ .gs-img-pair { display: grid; grid-template-columns: 1fr 1fr; gap: 12px; margin: 14px 0; }
440
+ .gs-img-caption { font-size: 0.78em; color: #475569; text-align: center; margin-top: -8px; margin-bottom: 12px; font-style: italic; }
441
+ .gs-divider { border: none; border-top: 1px solid #0f1e30; margin: 28px 0; }
442
+ </style>
443
+
444
+ <div class="gs-readme">
445
+
446
+ <!-- OVERVIEW -->
447
+ <div class="gs-card" style="border-color:#2563eb;margin-bottom:20px;border-width:1px 1px 1px 3px;">
448
+ <h3 style="color:#7dd3fc;font-size:1.05em;">What is GraphStrike?</h3>
449
+ <p>An <strong style="color:#e2e8f0;">OpenEnv-compatible</strong> reinforcement learning environment where an LLM agent
450
+ must identify all 10 members of a coordinated fake account ring hidden inside a synthetic social network.
451
+ The agent learns via <strong>Reflexion</strong> and a <strong>dynamic hybrid rule/LLM policy</strong> — no gradient
452
+ updates, no fine-tuning required.</p>
453
+ <p style="margin:0;">Submitted to the <strong style="color:#e2e8f0;">OpenEnv Hackathon × SCALER School of Technology</strong>.
454
+ Judges deploy this container, run their own LLM agent against it, and score on task quality, environment design,
455
+ code quality, creativity, and domain quality.</p>
456
+ </div>
457
+
458
+ <!-- KEY STATS -->
459
+ <div class="gs-grid">
460
+ <div class="gs-stat"><span class="gs-stat-val">10</span><span class="gs-stat-lbl">Gang members to find per episode</span></div>
461
+ <div class="gs-stat"><span class="gs-stat-val">3</span><span class="gs-stat-lbl">Difficulty tiers (easy / medium / hard)</span></div>
462
+ <div class="gs-stat"><span class="gs-stat-val">150</span><span class="gs-stat-lbl">Pre-generated episodes (50 per task)</span></div>
463
+ <div class="gs-stat"><span class="gs-stat-val">24</span><span class="gs-stat-lbl">Automated validator checks</span></div>
464
+ </div>
465
+
466
+ <!-- SYSTEM ARCHITECTURE -->
467
+ <h2>System Architecture</h2>
468
+ <img src="/assets/sys%20arch.png" class="gs-img" alt="System Architecture" onerror="this.style.display='none'">
469
+ <p class="gs-img-caption">End-to-end pipeline: episode generation → environment server → hybrid agent → reflexion memory</p>
470
+
471
+ <!-- DIFFICULTY -->
472
+ <h2>Task Difficulty Tiers</h2>
473
+ <table class="gs-table">
474
+ <tr><th>Task</th><th>Network Size</th><th>Gang</th><th>Decoys</th><th>Max Steps</th><th>Win Condition</th><th>Baseline Score</th></tr>
475
+ <tr><td><span class="gs-badge gs-badge-easy">Easy</span></td><td>50 accounts</td><td>10</td><td>0</td><td>30</td><td>Recall ≥ 0.8, Precision ≥ 0.7</td><td>0.910</td></tr>
476
+ <tr><td><span class="gs-badge gs-badge-medium">Medium</span></td><td>200 accounts</td><td>10</td><td>20</td><td>50</td><td>Recall ≥ 0.8, Precision ≥ 0.7</td><td>0.906</td></tr>
477
+ <tr><td><span class="gs-badge gs-badge-hard">Hard</span></td><td>1000 accounts</td><td>10</td><td>50</td><td>80</td><td>Recall ≥ 0.9, Precision ≥ 0.8</td><td>0.904</td></tr>
478
+ </table>
479
+ <p style="font-size:0.84em;color:#64748b;margin-top:-8px;">Hard mode fires 4 evasion events (steps 15, 30, 45, 60) that drop intra-gang follow edges mid-investigation, destroying graph signals.</p>
480
+
481
+ <hr class="gs-divider">
482
+
483
+ <!-- DETECTION SIGNALS -->
484
+ <h2>Detection Signal Hierarchy</h2>
485
+ <img src="/assets/gs.png" class="gs-img" alt="Signal Hierarchy" onerror="this.style.display='none'">
486
+ <p class="gs-img-caption">Node signals (offline) → Behavioral signals (temporal/device) → Graph signals (live at INSPECT) → False-positive control via hub legitimacy</p>
487
+
488
+ <h3>Node Signals (pre-computed offline)</h3>
489
+ <table class="gs-table">
490
+ <tr><th>Feature</th><th>Fake Range</th><th>Real Range</th><th>What it measures</th></tr>
491
+ <tr><td><code>photo_reuse_score</code></td><td>0.30 – 0.95</td><td>0.00 – 0.15</td><td>Stolen celebrity photos via pHash fingerprint matching</td></tr>
492
+ <tr><td><code>bio_template_score</code></td><td>0.20 – 0.90</td><td>0.00 – 0.12</td><td>Cosine similarity to known fake bio templates</td></tr>
493
+ <tr><td><code>comment_repeat_score</code></td><td>0.60 – 0.90</td><td>0.00 – 0.08</td><td>Fraction of copy-pasted spam comments across accounts</td></tr>
494
+ </table>
495
+
496
+ <h3>Behavioral Signals (temporal + device)</h3>
497
+ <table class="gs-table">
498
+ <tr><th>Feature</th><th>Fake Pattern</th></tr>
499
+ <tr><td><code>avg_post_hour</code></td><td>All 10 gang members post within ±0.5h of each other (coordinated scheduling)</td></tr>
500
+ <tr><td><code>account_age_days</code></td><td>Created same week — base_age ± 7 days</td></tr>
501
+ <tr><td><code>shared_ip_count</code></td><td>= 9 for all gang members (one IP subnet per episode, unique seed)</td></tr>
502
+ </table>
503
+
504
+ <h3>Graph Signals (computed live at INSPECT)</h3>
505
+ <table class="gs-table">
506
+ <tr><th>Feature</th><th>Fake Pattern</th></tr>
507
+ <tr><td><code>mutual_follow_rate</code></td><td>0.6 – 0.9 (dense intra-gang mutual follows)</td></tr>
508
+ <tr><td><code>flagged_neighbor_count</code></td><td>Grows as investigation proceeds — strongest late-game signal</td></tr>
509
+ <tr><td><code>avg_neighbor_photo_reuse</code></td><td>High when cluster shares stolen content</td></tr>
510
+ </table>
511
+
512
+ <hr class="gs-divider">
513
+
514
+ <!-- EPISODE FLOW -->
515
+ <h2>Episode Lifecycle &amp; Action Mechanics</h2>
516
+ <img src="/assets/episode.png" class="gs-img" alt="Episode Flow" onerror="this.style.display='none'">
517
+ <p class="gs-img-caption">Episode flow: reset → inspect/flag/investigate loop → dual SUSPECT cascade → submit → grader score</p>
518
+
519
+ <h3>Action Space</h3>
520
+ <table class="gs-table">
521
+ <tr><th>Action</th><th>Step Cost</th><th>Effect</th></tr>
522
+ <tr><td><code>INSPECT acc_XXXX</code></td><td>1 step</td><td>Reveals full AccountProfile + follow list; adds 1-hop neighbors to visible set</td></tr>
523
+ <tr><td><code>INVESTIGATE_NETWORK acc_XXXX</code></td><td>2 steps</td><td>Bidirectional 2-hop expansion (outgoing + incoming edges); re-cascades SUSPECT</td></tr>
524
+ <tr><td><code>FLAG acc_XXXX</code></td><td>FREE</td><td>Marks as fake; triggers dual SUSPECT cascade (follow-graph + IP cluster)</td></tr>
525
+ <tr><td><code>UNFLAG acc_XXXX</code></td><td>FREE</td><td>Removes flag; clears CONFIRMED_FAKE status</td></tr>
526
+ <tr><td><code>SUBMIT</code></td><td>FREE</td><td>Ends episode; triggers grader scoring</td></tr>
527
+ </table>
528
+
529
+ <h3>Dual SUSPECT Cascade (triggered by FLAG)</h3>
530
+ <div style="display:grid;grid-template-columns:1fr 1fr;gap:12px;margin:10px 0;">
531
+ <div class="gs-card">
532
+ <h3 style="color:#4ade80;margin-top:0;">Cascade 1 — Follow-Graph</h3>
533
+ <p style="margin:0;font-size:0.88em;">Every account the flagged member <em>follows</em> (<code>_live_edges</code>) becomes SUSPECT if visible and NORMAL. Gang follow density is 0.70+ so this is high-precision.</p>
534
+ </div>
535
+ <div class="gs-card">
536
+ <h3 style="color:#facc15;margin-top:0;">Cascade 2 — IP Cluster</h3>
537
+ <p style="margin:0;font-size:0.88em;">Every visible account sharing the same <code>ip_cluster_id</code> becomes SUSPECT. Gang shares <code>ip_gang_&lt;seed&gt;</code>; real accounts have unique IPs. <strong>Zero false positives.</strong></p>
538
+ </div>
539
+ </div>
540
+
541
+ <hr class="gs-divider">
542
+
543
+ <!-- RISK SCORING -->
544
+ <h2>Risk Scoring Mathematics</h2>
545
+ <img src="/images/big.png" class="gs-img" alt="Risk Scoring Overview" onerror="this.style.display='none'">
546
+ <p class="gs-img-caption">All scoring functions are stateless and deterministic — called inside _build_profile() at every INSPECT</p>
547
+
548
+ <div class="gs-img-pair">
549
+ <div>
550
+ <img src="/assets/formulas-1.png" class="gs-img" alt="Risk Formulas Part 1" onerror="this.style.display='none'">
551
+ <p class="gs-img-caption">Node risk, Behavior risk, Graph risk components</p>
552
+ </div>
553
+ <div>
554
+ <img src="/assets/formulas-2.png" class="gs-img" alt="Risk Formulas Part 2" onerror="this.style.display='none'">
555
+ <p class="gs-img-caption">Hub legitimacy, Composite fake_risk_score formula</p>
556
+ </div>
557
+ </div>
558
+
559
+ <div class="gs-formula">fake_risk = clip(
560
+ 0.30 × node_risk ← content signals (photo reuse, bio templates)
561
+ + 0.25 × behavior_risk ← temporal + age clustering
562
+ + 0.45 × graph_risk ← structural coordination (highest weight — hardest to fake)
563
+ − 0.25 × hub_legitimacy, ← subtractive: celebrities score ≈ 0 before clip
564
+ 0.0, 1.0)</div>
565
+
566
+ <h3>Grader Score Formula</h3>
567
+ <div class="gs-formula">recall = tp / 10
568
+ precision = tp / max(tp + fp, 1)
569
+ efficiency = max(0, (max_steps − steps_used) / max_steps)
570
+
571
+ if recall ≥ 0.8 and precision ≥ 0.7:
572
+ score = 0.55 + 0.20×recall + 0.15×precision + 0.10×efficiency
573
+ else:
574
+ score = 0.30×recall + 0.10×precision
575
+
576
+ # Maximum possible: 1.00 | Win threshold: ~0.815</div>
577
+
578
+ <hr class="gs-divider">
579
+
580
+ <!-- REFLEXION -->
581
+ <h2>Reflexion Learning</h2>
582
+ <img src="/assets/reflexion.png" class="gs-img" alt="Reflexion Learning Loop" onerror="this.style.display='none'">
583
+ <p class="gs-img-caption">Post-episode lessons injected into every future prompt — learning without weight updates</p>
584
+
585
+ <p>The LLM (Qwen3-80B via AWS Bedrock) cannot be fine-tuned — it is a black-box API.
586
+ Instead, a separate Qwen3 call generates a 2–3 sentence lesson after each episode.
587
+ The best winning trajectory is stored as a few-shot example injected into all future prompts.</p>
588
+
589
+ <pre><code>Episode N:
590
+ LLM acts using: system_prompt + reflections[last 4] + best_trajectory
591
+ Episode ends → WIN or LOSS
592
+ LOSS → generate_reflection(action_log, outcome) → lesson stored
593
+ WIN → save trajectory if better reward + generate_success_reflection
594
+
595
+ Episode N+1:
596
+ last 4 reflections + best win trajectory injected into prompt
597
+ → LLM has learned from its past without any weight updates</code></pre>
598
+
599
+ <hr class="gs-divider">
600
+
601
+ <!-- HYBRID POLICY -->
602
+ <h2>Hybrid Policy — The Novel Contribution</h2>
603
+ <img src="/assets/hybrid.png" class="gs-img" alt="Hybrid Policy Architecture" onerror="this.style.display='none'">
604
+ <p class="gs-img-caption">Dynamic alpha-weighted blend: rules dominate early, LLM earns trust through wins and reflections</p>
605
+
606
+ <p>A <strong>dynamic α-weighted blend</strong> of a deterministic rule engine and the LLM. α represents trust in the LLM —
607
+ starts at 0.20 (rules dominate), climbs as the LLM wins consistently and accumulates reflections, capped per task
608
+ to prevent the LLM from overriding correct high-confidence rule decisions.</p>
609
+
610
+ <div class="gs-formula">reflection_factor = min(1.0, n_reflections / 4.0)
611
+ raw = 0.20 + reflection_factor × (0.80 × recent_win_rate + 0.12)
612
+ alpha = clamp(raw, 0.20, task_cap)
613
+
614
+ Per-task caps: easy → 0.50 | medium → 0.70 | hard → 0.85</div>
615
+
616
+ <img src="/images/plot.png" class="gs-img" alt="Alpha progression over training" onerror="this.style.display='none'">
617
+ <p class="gs-img-caption">Alpha progression: rule-dominated early training → LLM earns authority through wins</p>
618
+
619
+ <h3>Rule Confidence Levels</h3>
620
+ <table class="gs-table">
621
+ <tr><th>Situation</th><th>Rule Action</th><th>Confidence</th></tr>
622
+ <tr><td>Steps remaining = 0</td><td>SUBMIT</td><td>1.00</td></tr>
623
+ <tr><td>Uninspected SUSPECT accounts exist</td><td>INSPECT suspects[0]</td><td>0.95</td></tr>
624
+ <tr><td><code>fake_risk ≥ 0.85</code></td><td>FLAG that account</td><td>0.95</td></tr>
625
+ <tr><td><code>fake_risk</code> in [threshold, 0.85)</td><td>FLAG that account</td><td>0.70 – 0.94</td></tr>
626
+ <tr><td>10 flags placed</td><td>SUBMIT</td><td>0.85</td></tr>
627
+ <tr><td>Steps remaining ≤ 3</td><td>SUBMIT</td><td>0.90</td></tr>
628
+ <tr><td>Uninspected accounts available</td><td>INSPECT top candidate</td><td>0.30</td></tr>
629
+ </table>
630
+ <p style="font-size:0.85em;color:#64748b;">When <code>rule_confidence ≥ alpha</code> the rule engine overrides. At easy cap (0.50), the LLM controls only exploratory INSPECT decisions. At hard cap (0.85), the LLM also controls borderline FLAG decisions (rule confidence below 0.85); submits, SUSPECT inspections, and high-risk flags stay rule-driven.</p>
631
+
632
+ </div>
633
+ """
634
+
635
+ _HEADER_HTML = """
636
+ <style>
637
+ .gr-dataframe th { background:#0c2340!important;color:#94a3b8!important;font-weight:700!important;font-size:12px!important;padding:10px 12px!important;border-bottom:1px solid #1e3a5f!important; }
638
+ .gr-dataframe td { font-size:12.5px!important;padding:8px 12px!important; }
639
+ </style>
640
+ <div style="background:linear-gradient(135deg,#050d1a 0%,#0b1f3a 50%,#060f1e 100%);
641
+ padding:24px 32px 20px;border-radius:12px;
642
+ border:1px solid #1e3a5f;margin-bottom:2px;
643
+ box-shadow:0 4px 24px rgba(0,0,0,0.5);">
644
+ <div style="display:flex;align-items:center;gap:16px;margin-bottom:8px;">
645
+ <div>
646
+ <h1 style="color:#e2e8f0;margin:0;font-size:1.9em;font-weight:800;letter-spacing:-0.5px;
647
+ font-family:'Inter',system-ui,sans-serif;">GraphStrike</h1>
648
+ <p style="color:#475569;margin:3px 0 0;font-size:0.88em;letter-spacing:0.3px;font-family:'IBM Plex Mono',monospace;">
649
+ COORDINATED FAKE ACCOUNT RING DETECTION &mdash; OPENENV RL ENVIRONMENT
650
+ </p>
651
+ </div>
652
+ </div>
653
+ <div style="display:flex;gap:10px;flex-wrap:wrap;margin-top:12px;">
654
+ <span style="background:#052e16;color:#4ade80;padding:3px 10px;border-radius:20px;font-size:0.78em;font-weight:600;border:1px solid #166534;">OpenEnv Hackathon</span>
655
+ <span style="background:#0c1a2e;color:#7dd3fc;padding:3px 10px;border-radius:20px;font-size:0.78em;font-weight:600;border:1px solid #1e40af;">Reinforcement Learning</span>
656
+ <span style="background:#1c0533;color:#c084fc;padding:3px 10px;border-radius:20px;font-size:0.78em;font-weight:600;border:1px solid #6b21a8;">Hybrid Policy</span>
657
+ <span style="background:#2d1f00;color:#fbbf24;padding:3px 10px;border-radius:20px;font-size:0.78em;font-weight:600;border:1px solid #92400e;">Reflexion Learning</span>
658
+ <span style="background:#1a0505;color:#f87171;padding:3px 10px;border-radius:20px;font-size:0.78em;font-weight:600;border:1px solid #7f1d1d;">Fraud Detection</span>
659
+ </div>
660
+ </div>"""
661
+
662
+ _FOOTER_HTML = """
663
+ <div style="text-align:center;padding:24px 0 8px;color:#1e3a5f;font-size:12px;
664
+ border-top:1px solid #0f1e30;margin-top:28px;font-family:'IBM Plex Mono',monospace;">
665
+ GraphStrike &mdash; OpenEnv Hackathon &times; SCALER School of Technology &nbsp;|&nbsp;
666
+ <a href="/docs" style="color:#334155;text-decoration:none;">API Docs</a>
667
+ </div>"""
668
 
669
  with gr.Blocks(title="GraphStrike") as demo:
670
+
671
+ gr.HTML(_HEADER_HTML)
672
+
673
+ with gr.Tabs():
674
+
675
+ # ══════════════ TAB 1: README ══════════════
676
+ with gr.Tab("Overview"):
677
+ gr.HTML(_README_HTML)
678
+
679
+ # ══════════════ TAB 2: PLAYGROUND ══════════════
680
+ with gr.Tab("Playground"):
681
+ with gr.Row():
682
+ with gr.Column(scale=1, min_width=220):
683
+ gr.Markdown("**1 — Episode**")
684
+ task_dd = gr.Dropdown(["easy","medium","hard"], value="easy", label="Task")
685
+ seed_in = gr.Number(value=0, label="Seed", precision=0)
686
+ reset_btn = gr.Button("Reset", variant="primary")
687
+
688
+ with gr.Column(scale=1, min_width=220):
689
+ gr.Markdown("**2 — Action**")
690
+ action_dd = gr.Dropdown(
691
+ ["inspect","investigate_network","flag","unflag","submit"],
692
+ value="inspect", label="Action")
693
+ acc_in = gr.Textbox(label="Account ID", placeholder="acc_0012")
694
+ step_btn = gr.Button("Step", variant="primary")
695
+
696
+ with gr.Column(scale=1, min_width=180):
697
+ gr.Markdown("**3 — Score**")
698
+ gr.Markdown("<br>", container=False)
699
+ grader_btn = gr.Button("Grader Score", size="sm")
700
+ baseline_btn = gr.Button("Baseline Agent", size="sm")
701
+ gr.Button("API Docs (Swagger)", size="sm", link="/docs", link_target="_blank")
702
+
703
+ obs_md = gr.Markdown(value="*Reset an episode to begin.*")
704
+
705
+ gr.Markdown("**Account Profiles** — sorted by fake risk score (highest first)")
706
+ prof_table = gr.Dataframe(
707
+ headers=PROFILE_HEADERS,
708
+ datatype=["str","str","number","number","number","number",
709
+ "number","number","number","number","number"],
710
+ value=[],
711
+ interactive=False,
712
+ wrap=False,
713
+ column_widths=["110px","160px","70px","70px","70px",
714
+ "70px","70px","70px","70px","55px","70px"],
715
+ )
716
+
717
+ result_md = gr.Markdown(value="")
718
+
719
+ with gr.Accordion("All Visible IDs", open=False):
720
+ vis_md = gr.Markdown(value="")
721
+ with gr.Accordion("Raw JSON", open=False):
722
+ raw_json = gr.Textbox(lines=20, interactive=False)
723
+
724
+ reset_btn.click(gr_reset, [task_dd, seed_in], [obs_md, prof_table, vis_md, raw_json])
725
+ step_btn.click( gr_step, [action_dd, acc_in], [obs_md, prof_table, vis_md, raw_json])
726
+ grader_btn.click(gr_grader, [], result_md)
727
+ baseline_btn.click(gr_baseline,[], result_md)
728
+
729
+ # ══════════════ TAB 3: BENCHMARKS ══════════════
730
+ with gr.Tab("Benchmarks"):
731
+ gr.Markdown(
732
+ "### LLM Agent Evaluation — GraphStrike Environment\n"
733
+ "Agents evaluated with identical system prompts and structured inference. "
734
+ "Grader score range: **0.0 – 1.0** (win threshold ≥ 0.815). "
735
+ "Score colours: "
736
+ "<span style='color:#22c55e'>■</span> ≥0.960 &nbsp; "
737
+ "<span style='color:#86efac'>■</span> ≥0.930 &nbsp; "
738
+ "<span style='color:#facc15'>■</span> ≥0.910 &nbsp; "
739
+ "<span style='color:#f97316'>■</span> below",
740
+ sanitize_html=False,
741
+ )
742
+
743
+ gr.Markdown("#### Leaderboard — Single Seed (seed=0)")
744
+ gr.HTML(_leaderboard_html())
745
+
746
+ gr.Markdown("#### Score Distribution by Task")
747
+ gr.BarPlot(
748
+ value=BENCH_LONG_DF,
749
+ x="Model", y="Score", color="Task",
750
+ title="Agent Scores by Task (seed=0)",
751
+ color_map={"Easy": "#4ade80", "Medium": "#facc15", "Hard": "#f87171"},
752
+ y_lim=[0.50, 1.0],
753
+ x_label_angle=-25,
754
+ height=340,
755
+ )
756
+
757
+ gr.Markdown(
758
+ "#### Stability — 3-Seed Variance Check (seeds 0, 1, 2)\n"
759
+ "Variance colour: "
760
+ "<span style='color:#22c55e'>■</span> stable (&lt;0.001) &nbsp; "
761
+ "<span style='color:#facc15'>■</span> moderate &nbsp; "
762
+ "<span style='color:#f87171'>■</span> high",
763
+ sanitize_html=False,
764
+ )
765
+ gr.HTML(_variance_html())
766
+
767
+ gr.Markdown("#### Rule-Based Baseline (no LLM, deterministic)")
768
+ gr.HTML(_baseline_html())
769
+
770
+ gr.Markdown(
771
+ "#### Key Observations\n"
772
+ "- Hard task is the real differentiator — evasion events destroy graph signals "
773
+ "mid-investigation, requiring adaptive reasoning beyond memorised patterns.\n"
774
+ "- Llama 4 Scout 17B achieves the lowest variance on hard (6e-5), "
775
+ "outperforming models with 40× more parameters.\n"
776
+ "- The rule-based baseline is competitive at mean 0.907, confirming "
777
+ "the environment's signal quality. LLM value is in evasion adaptation.\n"
778
+ "- All frontier models exceed 0.93 on easy/medium — cascade mechanics "
779
+ "are learnable from the structured observation format."
780
+ )
781
+
782
+ gr.HTML(_FOOTER_HTML)
783
 
784
  app = gr.mount_gradio_app(app, demo, path="/")
785
  print("[GraphStrike] Gradio UI mounted at /", flush=True)
786
 
787
  except Exception as exc:
788
+ import traceback
789
  print(f"[GraphStrike] Gradio unavailable: {exc}", flush=True)
790
+ traceback.print_exc()
791
 
792
  @app.get("/", response_class=HTMLResponse)
793
  def root_fallback():
794
+ return "<html><body><h1>GraphStrike</h1><p>API mode. <a href='/docs'>Swagger</a></p></body></html>"
 
795
 
796
  # ---------------------------------------------------------------------------
797
  # Entry point
server/environment.py CHANGED
@@ -527,11 +527,53 @@ class FakeGangEnvironment(_OpenEnvBase):
527
  suspicious_mutual_ratio=suspicious_mutual_ratio,
528
  )
529
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
530
  def _make_observation(
531
  self,
532
  message: str = "",
533
  terminal_reward: Optional[float] = None,
534
  ) -> FakeGangObservation:
 
 
 
 
535
  return FakeGangObservation(
536
  done=self._done,
537
  reward=terminal_reward,
@@ -549,7 +591,7 @@ class FakeGangEnvironment(_OpenEnvBase):
549
  evasion_triggered=self._evasion_triggered,
550
  evasion_count=self._evasion_count,
551
  task=self._task,
552
- message=message,
553
  suspect_ids=[
554
  sid for sid in self._visible_ids
555
  if sid not in self._flagged
 
527
  suspicious_mutual_ratio=suspicious_mutual_ratio,
528
  )
529
 
530
+ def _build_hint(self) -> str:
531
+ """Generate actionable hints for the agent based on current state."""
532
+ hints = []
533
+
534
+ # Hint 1: Uninspected suspects (highest priority)
535
+ suspect_ids = [
536
+ sid for sid in self._visible_ids
537
+ if sid not in self._flagged
538
+ and self._account_statuses.get(sid, "normal") == "suspect"
539
+ ]
540
+ uninspected_suspects = [s for s in suspect_ids if s not in self._inspected]
541
+ if uninspected_suspects:
542
+ hints.append(f"HINT: {len(uninspected_suspects)} SUSPECT accounts need inspection — INSPECT {uninspected_suspects[0]} next (auto-elevated by cascade, likely gang member).")
543
+
544
+ # Hint 2: Unflagged accounts with strong fake signals
545
+ unflagged_fakes = []
546
+ for acc_id in self._inspected:
547
+ if acc_id in self._flagged:
548
+ continue
549
+ p = self._profiled.get(acc_id)
550
+ if not p:
551
+ continue
552
+ if (p.shared_ip_count >= 5
553
+ or (p.photo_reuse_score >= 0.50 and p.bio_template_score >= 0.40
554
+ and p.hub_legitimacy_score < 0.70)):
555
+ unflagged_fakes.append(acc_id)
556
+ if unflagged_fakes and not uninspected_suspects:
557
+ hints.append(f"HINT: FLAG {unflagged_fakes[0]} — strong fake signals detected (photo_reuse/bio_template/shared_ip). FLAG is FREE (costs 0 steps).")
558
+
559
+ # Hint 3: Submit reminder
560
+ steps_left = max(0, self._max_steps - self._step_count)
561
+ if len(self._flagged) >= 10:
562
+ hints.append("HINT: You have 10 flags — SUBMIT now to end the episode and get scored.")
563
+ elif steps_left <= 3 and not self._done:
564
+ hints.append(f"HINT: Only {steps_left} steps left — consider SUBMIT to lock in your score.")
565
+
566
+ return " ".join(hints)
567
+
568
  def _make_observation(
569
  self,
570
  message: str = "",
571
  terminal_reward: Optional[float] = None,
572
  ) -> FakeGangObservation:
573
+ # Append hints to message for agent guidance
574
+ hint = self._build_hint() if not self._done else ""
575
+ full_message = f"{message} {hint}".strip() if hint else message
576
+
577
  return FakeGangObservation(
578
  done=self._done,
579
  reward=terminal_reward,
 
591
  evasion_triggered=self._evasion_triggered,
592
  evasion_count=self._evasion_count,
593
  task=self._task,
594
+ message=full_message,
595
  suspect_ids=[
596
  sid for sid in self._visible_ids
597
  if sid not in self._flagged
server/generator.py CHANGED
@@ -261,9 +261,19 @@ def generate_episode(task: str, seed: int) -> Dict[str, Any]:
261
 
262
  _build_edges(rng, accounts, gang_ids, cfg["intra_gang_density"])
263
 
264
- # Choose starting visible accounts (mix of real + maybe 1 gang member)
 
 
 
 
 
 
265
  starting_count = cfg["starting_visible"]
266
- starting_visible = rng.sample(all_ids, starting_count)
 
 
 
 
267
 
268
  return {
269
  "episode_id": str(uuid.uuid4()),
 
261
 
262
  _build_edges(rng, accounts, gang_ids, cfg["intra_gang_density"])
263
 
264
+ # Choose starting visible accounts.
265
+ # Guarantee at least 1 gang member is included so the cascade CAN start
266
+ # regardless of seed. The agent still has to identify WHICH account is fake
267
+ # (requires inspecting profiles) — so difficulty is preserved.
268
+ # Without this, ~31% of easy episodes and ~82% of hard episodes start with
269
+ # zero gang members visible, making score variance seed-luck rather than
270
+ # agent skill.
271
  starting_count = cfg["starting_visible"]
272
+ forced_gang = rng.sample(gang_ids, 1) # exactly 1 gang member
273
+ rest_pool = [i for i in all_ids if i not in forced_gang]
274
+ additional = rng.sample(rest_pool, starting_count - 1)
275
+ starting_visible = forced_gang + additional
276
+ rng.shuffle(starting_visible) # don't reveal which is fake
277
 
278
  return {
279
  "episode_id": str(uuid.uuid4()),