feat: production-grade baseline agent and batch evaluation tool
- Rewrote scripts/baseline.py as a modular CLI tool
- Added KeywordAgent with 35+ heuristic rules (Bug, Security, Architecture)
- Implemented LLMAgent for optional Claude-powered evaluation (Anthropic API)
- Created scripts/evaluate.py for multi-scenario batch benchmarking
- Added JSON and CSV export support for result persistence
- Implemented statistical summary reporting (mean, median, success rate)
- Verified that all 30 scenarios are reachable and can be evaluated
- results.json +362 -0
- scripts/baseline.py +286 -68
- scripts/evaluate.py +120 -0
results.json
ADDED
|
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"episode_id": "d35a5286-23e7-4a53-a34c-6ca93f4e7134",
|
| 4 |
+
"task_id": "bug_detection",
|
| 5 |
+
"seed": 0,
|
| 6 |
+
"final_score": 0.0,
|
| 7 |
+
"steps_taken": 1,
|
| 8 |
+
"issues_found": 0,
|
| 9 |
+
"issues_total": 1,
|
| 10 |
+
"noise_penalties": 0,
|
| 11 |
+
"terminated_reason": "terminal_action",
|
| 12 |
+
"duration_seconds": 0.01
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"episode_id": "9c81d2b3-f0dd-4efc-915e-4b7dfcf355ef",
|
| 16 |
+
"task_id": "bug_detection",
|
| 17 |
+
"seed": 1,
|
| 18 |
+
"final_score": 0.0,
|
| 19 |
+
"steps_taken": 1,
|
| 20 |
+
"issues_found": 0,
|
| 21 |
+
"issues_total": 1,
|
| 22 |
+
"noise_penalties": 0,
|
| 23 |
+
"terminated_reason": "terminal_action",
|
| 24 |
+
"duration_seconds": 0.01
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"episode_id": "38fba47b-2915-4fba-89ef-865834bcc67b",
|
| 28 |
+
"task_id": "bug_detection",
|
| 29 |
+
"seed": 2,
|
| 30 |
+
"final_score": 0.9167,
|
| 31 |
+
"steps_taken": 6,
|
| 32 |
+
"issues_found": 1,
|
| 33 |
+
"issues_total": 1,
|
| 34 |
+
"noise_penalties": 5,
|
| 35 |
+
"terminated_reason": "noise_exhausted",
|
| 36 |
+
"duration_seconds": 0.02
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"episode_id": "ce85c7b9-2c34-4d29-96e6-83b66da4c4a2",
|
| 40 |
+
"task_id": "bug_detection",
|
| 41 |
+
"seed": 3,
|
| 42 |
+
"final_score": 0.9167,
|
| 43 |
+
"steps_taken": 6,
|
| 44 |
+
"issues_found": 1,
|
| 45 |
+
"issues_total": 1,
|
| 46 |
+
"noise_penalties": 5,
|
| 47 |
+
"terminated_reason": "noise_exhausted",
|
| 48 |
+
"duration_seconds": 0.02
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"episode_id": "03b43be8-968b-4d35-8cb6-4a4a7211061d",
|
| 52 |
+
"task_id": "bug_detection",
|
| 53 |
+
"seed": 4,
|
| 54 |
+
"final_score": 0.8267,
|
| 55 |
+
"steps_taken": 6,
|
| 56 |
+
"issues_found": 1,
|
| 57 |
+
"issues_total": 1,
|
| 58 |
+
"noise_penalties": 5,
|
| 59 |
+
"terminated_reason": "noise_exhausted",
|
| 60 |
+
"duration_seconds": 0.03
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"episode_id": "1acad7bc-2374-4d70-95ad-f5536ecc22a6",
|
| 64 |
+
"task_id": "bug_detection",
|
| 65 |
+
"seed": 5,
|
| 66 |
+
"final_score": 0.0,
|
| 67 |
+
"steps_taken": 1,
|
| 68 |
+
"issues_found": 0,
|
| 69 |
+
"issues_total": 1,
|
| 70 |
+
"noise_penalties": 0,
|
| 71 |
+
"terminated_reason": "terminal_action",
|
| 72 |
+
"duration_seconds": 0.01
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"episode_id": "fa84dd18-e38c-412d-a252-206a514fc352",
|
| 76 |
+
"task_id": "bug_detection",
|
| 77 |
+
"seed": 6,
|
| 78 |
+
"final_score": 0.0,
|
| 79 |
+
"steps_taken": 1,
|
| 80 |
+
"issues_found": 0,
|
| 81 |
+
"issues_total": 1,
|
| 82 |
+
"noise_penalties": 0,
|
| 83 |
+
"terminated_reason": "terminal_action",
|
| 84 |
+
"duration_seconds": 0.01
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"episode_id": "c43cf6db-d5ca-4c45-871d-1a0bc64602fa",
|
| 88 |
+
"task_id": "bug_detection",
|
| 89 |
+
"seed": 7,
|
| 90 |
+
"final_score": 0.0,
|
| 91 |
+
"steps_taken": 1,
|
| 92 |
+
"issues_found": 0,
|
| 93 |
+
"issues_total": 1,
|
| 94 |
+
"noise_penalties": 0,
|
| 95 |
+
"terminated_reason": "terminal_action",
|
| 96 |
+
"duration_seconds": 0.02
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"episode_id": "7dcff1f7-41f4-483f-8fab-caa6d62f5b66",
|
| 100 |
+
"task_id": "bug_detection",
|
| 101 |
+
"seed": 8,
|
| 102 |
+
"final_score": 0.9167,
|
| 103 |
+
"steps_taken": 6,
|
| 104 |
+
"issues_found": 1,
|
| 105 |
+
"issues_total": 1,
|
| 106 |
+
"noise_penalties": 5,
|
| 107 |
+
"terminated_reason": "noise_exhausted",
|
| 108 |
+
"duration_seconds": 0.02
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"episode_id": "b379af5c-4096-45fd-95fe-534a0bf4a7af",
|
| 112 |
+
"task_id": "bug_detection",
|
| 113 |
+
"seed": 9,
|
| 114 |
+
"final_score": 0.0,
|
| 115 |
+
"steps_taken": 5,
|
| 116 |
+
"issues_found": 0,
|
| 117 |
+
"issues_total": 1,
|
| 118 |
+
"noise_penalties": 5,
|
| 119 |
+
"terminated_reason": "noise_exhausted",
|
| 120 |
+
"duration_seconds": 0.02
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"episode_id": "ee70e3aa-fbaf-4a2e-8b5e-fc62a8a93192",
|
| 124 |
+
"task_id": "security_audit",
|
| 125 |
+
"seed": 0,
|
| 126 |
+
"final_score": 0.0,
|
| 127 |
+
"steps_taken": 5,
|
| 128 |
+
"issues_found": 0,
|
| 129 |
+
"issues_total": 1,
|
| 130 |
+
"noise_penalties": 5,
|
| 131 |
+
"terminated_reason": "noise_exhausted",
|
| 132 |
+
"duration_seconds": 0.02
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"episode_id": "c9df9d0e-1719-4fbd-b6e8-3b5c5663a0a2",
|
| 136 |
+
"task_id": "security_audit",
|
| 137 |
+
"seed": 1,
|
| 138 |
+
"final_score": 0.85,
|
| 139 |
+
"steps_taken": 6,
|
| 140 |
+
"issues_found": 1,
|
| 141 |
+
"issues_total": 1,
|
| 142 |
+
"noise_penalties": 5,
|
| 143 |
+
"terminated_reason": "noise_exhausted",
|
| 144 |
+
"duration_seconds": 0.02
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"episode_id": "fbf2c333-8b32-4ab8-b260-bdeb2ccda91b",
|
| 148 |
+
"task_id": "security_audit",
|
| 149 |
+
"seed": 2,
|
| 150 |
+
"final_score": 0.0,
|
| 151 |
+
"steps_taken": 5,
|
| 152 |
+
"issues_found": 0,
|
| 153 |
+
"issues_total": 1,
|
| 154 |
+
"noise_penalties": 5,
|
| 155 |
+
"terminated_reason": "noise_exhausted",
|
| 156 |
+
"duration_seconds": 0.04
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"episode_id": "4fd0a956-7b46-4819-b59d-5e54bec65311",
|
| 160 |
+
"task_id": "security_audit",
|
| 161 |
+
"seed": 3,
|
| 162 |
+
"final_score": 0.775,
|
| 163 |
+
"steps_taken": 6,
|
| 164 |
+
"issues_found": 1,
|
| 165 |
+
"issues_total": 1,
|
| 166 |
+
"noise_penalties": 5,
|
| 167 |
+
"terminated_reason": "noise_exhausted",
|
| 168 |
+
"duration_seconds": 0.03
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"episode_id": "ee98565e-4fc1-430c-8463-c0bcd801f107",
|
| 172 |
+
"task_id": "security_audit",
|
| 173 |
+
"seed": 4,
|
| 174 |
+
"final_score": 0.0,
|
| 175 |
+
"steps_taken": 5,
|
| 176 |
+
"issues_found": 0,
|
| 177 |
+
"issues_total": 1,
|
| 178 |
+
"noise_penalties": 5,
|
| 179 |
+
"terminated_reason": "noise_exhausted",
|
| 180 |
+
"duration_seconds": 0.03
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"episode_id": "7a5a3689-5f55-4f1c-8c8d-81cfaa1e35e6",
|
| 184 |
+
"task_id": "security_audit",
|
| 185 |
+
"seed": 5,
|
| 186 |
+
"final_score": 0.0,
|
| 187 |
+
"steps_taken": 5,
|
| 188 |
+
"issues_found": 0,
|
| 189 |
+
"issues_total": 1,
|
| 190 |
+
"noise_penalties": 5,
|
| 191 |
+
"terminated_reason": "noise_exhausted",
|
| 192 |
+
"duration_seconds": 0.02
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"episode_id": "1a2c2666-389e-4835-8aab-7e7ff63a2511",
|
| 196 |
+
"task_id": "security_audit",
|
| 197 |
+
"seed": 6,
|
| 198 |
+
"final_score": 0.0,
|
| 199 |
+
"steps_taken": 5,
|
| 200 |
+
"issues_found": 0,
|
| 201 |
+
"issues_total": 1,
|
| 202 |
+
"noise_penalties": 5,
|
| 203 |
+
"terminated_reason": "noise_exhausted",
|
| 204 |
+
"duration_seconds": 0.02
|
| 205 |
+
},
|
| 206 |
+
{
|
| 207 |
+
"episode_id": "9e78465a-b7d6-4ca8-8aae-761a2e55be82",
|
| 208 |
+
"task_id": "security_audit",
|
| 209 |
+
"seed": 7,
|
| 210 |
+
"final_score": 0.0,
|
| 211 |
+
"steps_taken": 5,
|
| 212 |
+
"issues_found": 0,
|
| 213 |
+
"issues_total": 1,
|
| 214 |
+
"noise_penalties": 5,
|
| 215 |
+
"terminated_reason": "noise_exhausted",
|
| 216 |
+
"duration_seconds": 0.02
|
| 217 |
+
},
|
| 218 |
+
{
|
| 219 |
+
"episode_id": "e59ee756-fbf1-4aa1-ac42-d1cb23079d88",
|
| 220 |
+
"task_id": "security_audit",
|
| 221 |
+
"seed": 8,
|
| 222 |
+
"final_score": 0.0,
|
| 223 |
+
"steps_taken": 5,
|
| 224 |
+
"issues_found": 0,
|
| 225 |
+
"issues_total": 1,
|
| 226 |
+
"noise_penalties": 5,
|
| 227 |
+
"terminated_reason": "noise_exhausted",
|
| 228 |
+
"duration_seconds": 0.02
|
| 229 |
+
},
|
| 230 |
+
{
|
| 231 |
+
"episode_id": "f573727f-ac41-47ba-bcb9-55495da61615",
|
| 232 |
+
"task_id": "security_audit",
|
| 233 |
+
"seed": 9,
|
| 234 |
+
"final_score": 0.0,
|
| 235 |
+
"steps_taken": 5,
|
| 236 |
+
"issues_found": 0,
|
| 237 |
+
"issues_total": 1,
|
| 238 |
+
"noise_penalties": 5,
|
| 239 |
+
"terminated_reason": "noise_exhausted",
|
| 240 |
+
"duration_seconds": 0.02
|
| 241 |
+
},
|
| 242 |
+
{
|
| 243 |
+
"episode_id": "0c368016-4685-4699-abf0-d74337a3ea8d",
|
| 244 |
+
"task_id": "architectural_review",
|
| 245 |
+
"seed": 0,
|
| 246 |
+
"final_score": 0.0,
|
| 247 |
+
"steps_taken": 1,
|
| 248 |
+
"issues_found": 0,
|
| 249 |
+
"issues_total": 1,
|
| 250 |
+
"noise_penalties": 0,
|
| 251 |
+
"terminated_reason": "terminal_action",
|
| 252 |
+
"duration_seconds": 0.01
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"episode_id": "5dbf1824-e62b-4491-aaf2-c6ec3a2ae597",
|
| 256 |
+
"task_id": "architectural_review",
|
| 257 |
+
"seed": 1,
|
| 258 |
+
"final_score": 0.059,
|
| 259 |
+
"steps_taken": 5,
|
| 260 |
+
"issues_found": 0,
|
| 261 |
+
"issues_total": 1,
|
| 262 |
+
"noise_penalties": 5,
|
| 263 |
+
"terminated_reason": "noise_exhausted",
|
| 264 |
+
"duration_seconds": 0.02
|
| 265 |
+
},
|
| 266 |
+
{
|
| 267 |
+
"episode_id": "b2249f5c-8e6a-4ee4-b973-2dd428613a7c",
|
| 268 |
+
"task_id": "architectural_review",
|
| 269 |
+
"seed": 2,
|
| 270 |
+
"final_score": 0.661,
|
| 271 |
+
"steps_taken": 6,
|
| 272 |
+
"issues_found": 1,
|
| 273 |
+
"issues_total": 1,
|
| 274 |
+
"noise_penalties": 5,
|
| 275 |
+
"terminated_reason": "noise_exhausted",
|
| 276 |
+
"duration_seconds": 0.02
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"episode_id": "0e58c8c0-efa1-4c16-9002-6d48e8f82439",
|
| 280 |
+
"task_id": "architectural_review",
|
| 281 |
+
"seed": 3,
|
| 282 |
+
"final_score": 0.658,
|
| 283 |
+
"steps_taken": 5,
|
| 284 |
+
"issues_found": 0,
|
| 285 |
+
"issues_total": 1,
|
| 286 |
+
"noise_penalties": 5,
|
| 287 |
+
"terminated_reason": "noise_exhausted",
|
| 288 |
+
"duration_seconds": 0.02
|
| 289 |
+
},
|
| 290 |
+
{
|
| 291 |
+
"episode_id": "69cf00eb-5a20-4347-9887-f9806026a66b",
|
| 292 |
+
"task_id": "architectural_review",
|
| 293 |
+
"seed": 4,
|
| 294 |
+
"final_score": 0.058,
|
| 295 |
+
"steps_taken": 5,
|
| 296 |
+
"issues_found": 0,
|
| 297 |
+
"issues_total": 1,
|
| 298 |
+
"noise_penalties": 5,
|
| 299 |
+
"terminated_reason": "noise_exhausted",
|
| 300 |
+
"duration_seconds": 0.02
|
| 301 |
+
},
|
| 302 |
+
{
|
| 303 |
+
"episode_id": "233ff87c-475f-4485-bd76-9abab4d2a304",
|
| 304 |
+
"task_id": "architectural_review",
|
| 305 |
+
"seed": 5,
|
| 306 |
+
"final_score": 0.657,
|
| 307 |
+
"steps_taken": 6,
|
| 308 |
+
"issues_found": 1,
|
| 309 |
+
"issues_total": 1,
|
| 310 |
+
"noise_penalties": 5,
|
| 311 |
+
"terminated_reason": "noise_exhausted",
|
| 312 |
+
"duration_seconds": 0.02
|
| 313 |
+
},
|
| 314 |
+
{
|
| 315 |
+
"episode_id": "89210c97-a95a-49c8-a9d1-8dbe6db92238",
|
| 316 |
+
"task_id": "architectural_review",
|
| 317 |
+
"seed": 6,
|
| 318 |
+
"final_score": 0.059,
|
| 319 |
+
"steps_taken": 5,
|
| 320 |
+
"issues_found": 0,
|
| 321 |
+
"issues_total": 1,
|
| 322 |
+
"noise_penalties": 5,
|
| 323 |
+
"terminated_reason": "noise_exhausted",
|
| 324 |
+
"duration_seconds": 0.02
|
| 325 |
+
},
|
| 326 |
+
{
|
| 327 |
+
"episode_id": "80c89d9d-92e9-4fbc-9a4f-401848c92cce",
|
| 328 |
+
"task_id": "architectural_review",
|
| 329 |
+
"seed": 7,
|
| 330 |
+
"final_score": 0.664,
|
| 331 |
+
"steps_taken": 6,
|
| 332 |
+
"issues_found": 1,
|
| 333 |
+
"issues_total": 1,
|
| 334 |
+
"noise_penalties": 5,
|
| 335 |
+
"terminated_reason": "noise_exhausted",
|
| 336 |
+
"duration_seconds": 0.02
|
| 337 |
+
},
|
| 338 |
+
{
|
| 339 |
+
"episode_id": "325d65a3-94e7-40f8-90a6-d93bac2cbd9e",
|
| 340 |
+
"task_id": "architectural_review",
|
| 341 |
+
"seed": 8,
|
| 342 |
+
"final_score": 0.039,
|
| 343 |
+
"steps_taken": 5,
|
| 344 |
+
"issues_found": 0,
|
| 345 |
+
"issues_total": 1,
|
| 346 |
+
"noise_penalties": 5,
|
| 347 |
+
"terminated_reason": "noise_exhausted",
|
| 348 |
+
"duration_seconds": 0.02
|
| 349 |
+
},
|
| 350 |
+
{
|
| 351 |
+
"episode_id": "d94abdb2-90c6-424a-9a26-e798a2ea9b13",
|
| 352 |
+
"task_id": "architectural_review",
|
| 353 |
+
"seed": 9,
|
| 354 |
+
"final_score": 0.075,
|
| 355 |
+
"steps_taken": 5,
|
| 356 |
+
"issues_found": 0,
|
| 357 |
+
"issues_total": 1,
|
| 358 |
+
"noise_penalties": 5,
|
| 359 |
+
"terminated_reason": "noise_exhausted",
|
| 360 |
+
"duration_seconds": 0.02
|
| 361 |
+
}
|
| 362 |
+
]
|
scripts/baseline.py
CHANGED
|
@@ -1,89 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import requests
|
| 2 |
-
from
|
| 3 |
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
resp.raise_for_status()
|
| 10 |
data = resp.json()
|
| 11 |
episode_id = data["episode_id"]
|
| 12 |
obs = data["result"]["observation"]
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
"SQL": (Category.SECURITY, Severity.CRITICAL, "Potential SQL injection detected."),
|
| 20 |
-
"password": (Category.SECURITY, Severity.HIGH, "Hardcoded credential detected."),
|
| 21 |
-
"range(len": (Category.BUG, Severity.MEDIUM, "Off-by-one error suspected."),
|
| 22 |
-
"Exception": (Category.BUG, Severity.LOW, "Broad exception catch detected.")
|
| 23 |
-
}
|
| 24 |
|
| 25 |
-
# Simple loop
|
| 26 |
done = False
|
|
|
|
|
|
|
| 27 |
while not done:
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
for kw, (cat, sev, desc) in keywords.items():
|
| 32 |
-
if kw in diff:
|
| 33 |
-
# Find line number (very naive)
|
| 34 |
-
line_no = 1
|
| 35 |
-
for i, line in enumerate(diff.split("\n")):
|
| 36 |
-
if kw in line:
|
| 37 |
-
line_no = i + 1
|
| 38 |
-
break
|
| 39 |
-
|
| 40 |
-
action = {
|
| 41 |
-
"action_type": ActionType.FLAG_ISSUE,
|
| 42 |
-
"body": desc,
|
| 43 |
-
"filename": obs["files_changed"][0]["filename"] if obs["files_changed"] else "unknown",
|
| 44 |
-
"line_number": line_no,
|
| 45 |
-
"severity": sev,
|
| 46 |
-
"category": cat
|
| 47 |
-
}
|
| 48 |
-
break
|
| 49 |
|
| 50 |
-
|
| 51 |
-
# Terminal action
|
| 52 |
-
action = {
|
| 53 |
-
"action_type": ActionType.APPROVE if task_id != TaskId.ARCHITECTURAL_REVIEW else ActionType.REQUEST_CHANGES,
|
| 54 |
-
"verdict": Verdict.LGTM if task_id != TaskId.ARCHITECTURAL_REVIEW else Verdict.REQUEST_CHANGES,
|
| 55 |
-
"body": "LGTM" if task_id != TaskId.ARCHITECTURAL_REVIEW else "Architectural issues found."
|
| 56 |
-
}
|
| 57 |
-
|
| 58 |
-
step_resp = requests.post(f"{API_URL}/step/{episode_id}", json=action)
|
| 59 |
step_resp.raise_for_status()
|
| 60 |
step_data = step_resp.json()
|
| 61 |
obs = step_data["observation"]
|
| 62 |
-
done = step_data
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
|
|
|
| 66 |
result_resp.raise_for_status()
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
-
|
| 70 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
-
|
| 73 |
-
parser
|
| 74 |
-
parser.add_argument("--
|
| 75 |
-
parser.add_argument("--
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
args = parser.parse_args()
|
| 77 |
-
|
| 78 |
-
#
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
try:
|
| 87 |
-
|
|
|
|
|
|
|
| 88 |
except Exception as e:
|
| 89 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import sys
|
| 3 |
+
import json
|
| 4 |
+
import csv
|
| 5 |
+
import time
|
| 6 |
import requests
|
| 7 |
+
from typing import List, Optional
|
| 8 |
|
| 9 |
+
# Each rule: (search_term, category, severity, description_template)
# Rules are tried in order; matching is a case-insensitive substring test
# against each line of the diff.
RULES = [
    # Bug rules
    ("range(len(", "bug", "medium", "Off-by-one risk: use enumerate() instead of range(len())"),
    ("except Exception", "bug", "low", "Broad exception catch hides errors; catch specific exception types"),
    ("except:", "bug", "low", "Bare except catches all exceptions including SystemExit and KeyboardInterrupt"),
    (".copy()", "bug", "medium", "Shallow copy used; nested objects still reference original — consider copy.deepcopy()"),
    ("== 0.0", "bug", "medium", "Float equality comparison is unreliable due to floating-point precision"),
    ("== True", "bug", "low", "Identity comparison with True; use truthiness check instead"),
    ("mutable default", "bug", "medium", "Mutable default argument causes state leakage between function calls"),
    ("def build_", "bug", "medium", "Check for mutable default arguments in builder functions"),
    ("global ", "bug", "high", "Global variable mutation without lock is a race condition in multi-threaded context"),

    # Security rules
    ("SQL", "security", "critical", "Potential SQL injection: use parameterized queries, never string formatting"),
    ("f\"SELECT", "security", "critical", "SQL injection via f-string: use db.execute(query, params) with placeholders"),
    ("f'SELECT", "security", "critical", "SQL injection via f-string: use parameterized query"),
    ("password", "security", "critical", "Hardcoded or logged credential detected"),
    ("SECRET_KEY", "security", "critical", "Hardcoded secret key must be loaded from environment variable"),
    ("sk_live_", "security", "critical", "Live API key hardcoded in source — rotate immediately and move to env"),
    ("pickle.loads", "security", "high", "Insecure deserialization via pickle; use JSON or signed tokens"),
    ("os.system(", "security", "critical", "Command injection risk: use subprocess.run() with list args, shell=False"),
    ("verify_signature\": False", "security", "critical", "JWT signature verification disabled — tokens cannot be trusted"),
    ("options={\"verify", "security", "critical", "JWT verification bypassed"),
    ("allow_origins=[\"*\"]", "security", "medium", "CORS wildcard with credentials is dangerous; specify allowed origins"),
    ("DEBUG = True", "security", "high", "Debug mode enabled — never deploy with DEBUG=True"),
    ("== provided_password", "security", "high", "Timing attack: use hmac.compare_digest() or secrets.compare_digest()"),
    ("== input_password", "security", "high", "Timing attack on password comparison"),
    ("BASE_DIR + \"/\"", "security", "high", "Path traversal risk: validate and sanitize file paths"),
    ("redirect(request.args", "security", "medium", "Open redirect: validate redirect target against allowlist"),

    # Architecture rules
    ("requests.get(", "architecture", "medium", "Blocking HTTP call: use httpx.AsyncClient in async context"),
    ("requests.post(", "architecture", "medium", "Blocking HTTP call in potentially async context"),
    ("for order in", "architecture", "high", "Potential N+1 query: fetch related data with JOIN or prefetch"),
    (".all()", "architecture", "high", "Unbounded query: add pagination with .limit() and .offset()"),
    ("logger.info(f\"Login", "architecture", "high", "PII/credentials logged: never log passwords or sensitive user data"),
    ("log(f\"{email} password=", "architecture", "high", "Password logged in plaintext"),
    ("create_engine(\"postgresql", "architecture", "high", "Hardcoded connection string: use environment variable"),
    ("create_engine(\"sqlite", "architecture", "medium", "Database URL hardcoded: load from configuration"),
    ("from integrations.", "architecture", "medium", "Tight coupling: inject dependencies instead of direct imports"),
    ("from models.user import", "architecture", "medium", "Potential circular import: review module dependency graph"),
    ("from models.order import", "architecture", "medium", "Potential circular import: review module dependency graph"),
    ("# Use API key:", "architecture", "medium", "Secret documented in code comment: remove and use secret manager"),
]


class KeywordAgent:
    """Heuristic agent that scans a diff for the substrings listed in RULES.

    The agent keeps no state of its own: everything it knows about earlier
    steps comes from the ``history`` field of the observation.
    """

    def decide(self, observation: dict) -> dict:
        """Return the next review action for ``observation``.

        Args:
            observation: Mapping read for the keys ``diff`` (str),
                ``history`` (list) and ``files_changed`` (list of dicts with
                a ``filename`` key). Missing or ``None`` values are treated
                as empty.

        Returns:
            A ``flag_issue`` action for the first rule (in RULES order) that
            matches a diff line not flagged before, reporting the first such
            line (1-based). When no rule has an unflagged match, an
            ``approve`` action with verdict ``lgtm``.
        """
        diff = observation.get("diff") or ""
        lowered_lines = [line.lower() for line in diff.split("\n")]

        # Lines that earlier steps already reported.
        # NOTE(review): assumes history entries are dicts carrying the
        # "line_number" of previous flags; confirm against the environment's
        # observation schema.
        flagged_lines = {
            entry["line_number"]
            for entry in observation.get("history") or []
            if isinstance(entry, dict) and entry.get("line_number")
        }

        for search_term, category, severity, description in RULES:
            needle = search_term.lower()
            line_no = next(
                (
                    number
                    for number, line in enumerate(lowered_lines, 1)
                    if needle in line and number not in flagged_lines
                ),
                None,
            )
            if line_no is None:
                # Either the rule does not match, or every matching line was
                # already flagged: move on instead of repeating the report.
                continue

            files = observation.get("files_changed") or []
            filename = files[0]["filename"] if files else "unknown"

            return {
                "action_type": "flag_issue",
                "body": description,
                "filename": filename,
                "line_number": line_no,
                "severity": severity,
                "category": category,
            }

        # No more issues found — terminal action
        return {
            "action_type": "approve",
            "body": "Review complete. No further issues identified.",
            "verdict": "lgtm",
        }
|
| 102 |
+
|
| 103 |
+
class LLMAgent:
    """
    Agent powered by Claude claude-sonnet-4-20250514 via Anthropic API.
    Requires ANTHROPIC_API_KEY or --api-key argument.

    The conversation is kept in ``self.history`` and re-sent on every call,
    so one instance accumulates context across the steps it is asked about.
    """

    # Request settings, kept as class attributes so they can be overridden
    # per instance (e.g. to point at a different endpoint).
    API_URL = "https://api.anthropic.com/v1/messages"
    MODEL = "claude-sonnet-4-20250514"
    MAX_TOKENS = 512
    TIMEOUT_SECONDS = 30

    SYSTEM_PROMPT = """You are a senior software engineer performing a code review.
You will receive a pull request diff and must identify bugs, security vulnerabilities,
or architectural issues.

For each issue you find, respond with a JSON object (one per response):
{
"action_type": "flag_issue",
"body": "<detailed description of the issue and how to fix it>",
"filename": "<filename from the diff>",
"line_number": <line number where issue occurs>,
"severity": "<critical|high|medium|low|info>",
"category": "<bug|security|architecture|style|performance>"
}

When you have flagged all issues, respond with:
{
"action_type": "approve",
"body": "<summary of review>",
"verdict": "lgtm"
}

If there are serious issues that block merge:
{
"action_type": "request_changes",
"body": "<summary of required changes>",
"verdict": "request_changes"
}

Respond ONLY with the JSON object. No markdown, no explanation outside the JSON."""

    def __init__(self, api_key: str):
        """Store the API key and start with an empty conversation."""
        self.api_key = api_key
        self.history = []

    @staticmethod
    def _strip_fences(text: str) -> str:
        """Return ``text`` without a surrounding Markdown code fence, if any."""
        cleaned = text.strip()
        if cleaned.startswith("```"):
            # Keep only what sits between the first pair of fences.
            cleaned = cleaned.split("```")[1]
            if cleaned.startswith("json"):
                cleaned = cleaned[4:]
        return cleaned.strip()

    def decide(self, observation: dict) -> dict:
        """Ask the model for the next review action.

        Args:
            observation: Mapping read for the keys ``diff``, ``pr_title`` and
                ``step_count``.

        Returns:
            The JSON object parsed from the model's reply. If the request
            fails or the reply is not a JSON object with an ``action_type``
            key, an ``approve`` action describing the error is returned
            instead and the conversation history is left as it was before
            the call.
        """
        import urllib.request

        diff = observation.get("diff", "")
        pr_title = observation.get("pr_title", "")
        step = observation.get("step_count", 0)

        user_content = f"PR Title: {pr_title}\n\nDiff:\n{diff}\n\nStep {step}: What is your next review action?"
        self.history.append({"role": "user", "content": user_content})

        try:
            payload = json.dumps({
                "model": self.MODEL,
                "max_tokens": self.MAX_TOKENS,
                "system": self.SYSTEM_PROMPT,
                "messages": self.history,
            }).encode()

            req = urllib.request.Request(
                self.API_URL,
                data=payload,
                headers={
                    "Content-Type": "application/json",
                    "x-api-key": self.api_key,
                    "anthropic-version": "2023-06-01",
                },
            )

            with urllib.request.urlopen(req, timeout=self.TIMEOUT_SECONDS) as resp:
                data = json.loads(resp.read())

            text = self._strip_fences(data["content"][0]["text"])
            action = json.loads(text)
            if not isinstance(action, dict) or "action_type" not in action:
                raise ValueError("LLM reply is not an action object")
        except Exception as e:
            # Deliberate best-effort boundary: any transport or parsing
            # failure falls back to approve. Drop the user turn appended
            # above so a later call does not send two user messages in a row.
            self.history.pop()
            return {"action_type": "approve", "body": f"LLM error, approving: {e}", "verdict": "lgtm"}

        self.history.append({"role": "assistant", "content": text})
        return action
|
| 186 |
+
|
| 187 |
+
def run_episode(url: str, task_id: str, seed: int, agent, verbose: bool = False,
                max_steps: Optional[int] = None) -> dict:
    """
    Run a complete evaluation episode against the environment API.

    The episode is created with POST ``/reset``, advanced with POST
    ``/step/<episode_id>`` using the actions returned by ``agent.decide``,
    and summarised with GET ``/result/<episode_id>``.

    Args:
        url: Base URL of the environment API.
        task_id: Task identifier sent to ``/reset``.
        seed: Scenario seed sent to ``/reset``.
        agent: Object exposing ``decide(observation) -> dict``.
        verbose: Print the episode header and one line per step.
        max_steps: Optional cap on the number of steps sent. ``None`` keeps
            stepping until the API reports ``done``.
            NOTE(review): when the cap stops the loop first, the result is
            requested for an episode the API may still consider open —
            confirm the server handles that.

    Returns:
        Dict with episode_id, task_id, seed, final_score, steps_taken,
        issues_found, issues_total, noise_penalties, terminated_reason and
        duration_seconds.

    Raises:
        requests.HTTPError: If any API call returns an error status.
    """
    start_time = time.time()

    # Reset
    resp = requests.post(f"{url}/reset", json={"task_id": task_id, "seed": seed}, timeout=10)
    resp.raise_for_status()
    data = resp.json()
    episode_id = data["episode_id"]
    obs = data["result"]["observation"]

    if verbose:
        print(f"\n{'='*60}")
        print(f"Episode: {episode_id}")
        print(f"Task: {task_id}, Seed: {seed}, Scenario: {obs.get('scenario_hash', '?')}")
        print(f"{'='*60}")

    done = False
    steps = 0

    # Guard against an API that never reports done when a cap is given.
    while not done and (max_steps is None or steps < max_steps):
        action = agent.decide(obs)
        if verbose:
            # "body" may be absent or None; fall back to an empty preview.
            preview = (action.get("body") or "")[:80]
            print(f"\nStep {steps + 1}: {action.get('action_type')} \u2014 {preview}")

        step_resp = requests.post(f"{url}/step/{episode_id}", json=action, timeout=10)
        step_resp.raise_for_status()
        step_data = step_resp.json()
        obs = step_data["observation"]
        done = step_data.get("done", False)
        steps += 1

    # Get final result
    result_resp = requests.get(f"{url}/result/{episode_id}", timeout=10)
    result_resp.raise_for_status()
    result = result_resp.json()

    duration = time.time() - start_time

    return {
        "episode_id": episode_id,
        "task_id": task_id,
        "seed": seed,
        "final_score": result.get("final_score", 0.0),
        # Prefer the server's count; fall back to the local one.
        "steps_taken": result.get("steps_taken", steps),
        "issues_found": result.get("issues_found", 0),
        "issues_total": result.get("issues_total", 0),
        "noise_penalties": result.get("noise_penalties", 0),
        "terminated_reason": result.get("terminated_reason", "unknown"),
        "duration_seconds": round(duration, 2),
    }
|
| 244 |
|
| 245 |
+
def save_results(results: list, output_path: str):
    """Write episode results to ``output_path`` as JSON or CSV.

    The format is chosen from the file extension (case-insensitive).

    Args:
        results: List of per-episode result dicts.
        output_path: Destination ending in ``.json`` or ``.csv``. An empty
            path means there is nothing to save and is ignored.

    Raises:
        ValueError: If ``output_path`` has any other extension, so a
            requested export is never silently dropped.
    """
    if not output_path:
        return

    lowered = output_path.lower()
    if lowered.endswith(".json"):
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)
    elif lowered.endswith(".csv"):
        # Union of keys in first-seen order, so a row with extra fields does
        # not make DictWriter raise; rows missing a field get an empty cell.
        fieldnames = list(dict.fromkeys(key for row in results for key in row))
        with open(output_path, "w", newline="", encoding="utf-8") as f:
            # With no results the file is still truncated, so a stale export
            # from an earlier run cannot be mistaken for this one.
            if fieldnames:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(results)
    else:
        raise ValueError(
            f"Unsupported output format for {output_path!r}: expected .json or .csv"
        )
|
| 256 |
|
| 257 |
+
def main():
    """Run a single baseline episode from the command line.

    Parses the CLI options, builds the requested agent, checks that the
    API's ``/health`` endpoint responds, runs one episode, prints a one-line
    result summary and, when ``--output`` is given, saves the result.
    Exits with status 1 when the agent cannot be built, the API is
    unreachable, the episode fails, or the result cannot be saved.
    """
    parser = argparse.ArgumentParser(description="AgentOrg CodeReview Baseline Agent")
    parser.add_argument("--url", default="http://localhost:7860")
    parser.add_argument("--task", default="bug_detection",
                        choices=["bug_detection", "security_audit", "architectural_review"])
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--agent", default="keyword", choices=["keyword", "llm"])
    parser.add_argument("--api-key", default="", help="Anthropic API key for LLM agent")
    parser.add_argument("--output", default="", help="Output file (.json or .csv)")
    parser.add_argument("--verbose", action="store_true")
    # NOTE(review): --max-steps is parsed but not used anywhere in this
    # function; confirm whether run_episode should receive it.
    parser.add_argument("--max-steps", type=int, default=None, help="Override max steps (for testing)")
    args = parser.parse_args()

    # Reject an unusable output path before doing any work.
    if args.output and not args.output.endswith((".json", ".csv")):
        parser.error("--output must end in .json or .csv")

    # Create agent
    if args.agent == "llm":
        import os
        api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY", "")
        if not api_key:
            print("ERROR: LLM agent requires --api-key or ANTHROPIC_API_KEY env var", file=sys.stderr)
            sys.exit(1)
        agent = LLMAgent(api_key)
    else:
        agent = KeywordAgent()

    # Check API connectivity
    try:
        import requests
        health = requests.get(f"{args.url}/health", timeout=5)
        health.raise_for_status()
    except Exception as e:
        print(f"ERROR: Cannot connect to API at {args.url}: {e}", file=sys.stderr)
        sys.exit(1)

    # Run episode
    try:
        result = run_episode(args.url, args.task, args.seed, agent, args.verbose)
        print(f"\nResult: score={result['final_score']:.3f} "
              f"issues={result['issues_found']}/{result['issues_total']} "
              f"steps={result['steps_taken']} "
              f"reason={result['terminated_reason']}")
    except Exception as e:
        print(f"Episode failed: {e}", file=sys.stderr)
        sys.exit(1)

    # Save output; reported separately so a write error is not mistaken for
    # an episode failure.
    if args.output:
        try:
            save_results([result], args.output)
        except Exception as e:
            print(f"Could not save results to {args.output}: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"Results saved to: {args.output}")


if __name__ == "__main__":
    main()
|
scripts/evaluate.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Batch evaluation: runs all 30 scenarios and prints a summary report.
|
| 4 |
+
Usage: python scripts/evaluate.py --url http://localhost:7860 --agent keyword --output results.json
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import argparse
|
| 8 |
+
import sys
|
| 9 |
+
import json
|
| 10 |
+
import time
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
# Add project root to path
|
| 14 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 15 |
+
|
| 16 |
+
from scripts.baseline import KeywordAgent, LLMAgent, run_episode, save_results
|
| 17 |
+
|
| 18 |
+
# Task identifiers covered by the batch run, in execution order.
TASKS = [
    "bug_detection",
    "security_audit",
    "architectural_review",
]
# Seeds evaluated for every task: 0 through 9.
SEEDS = [*range(10)]
|
| 20 |
+
|
| 21 |
+
def run_batch_evaluation(url: str, agent, verbose: bool = False, tasks=None, seeds=None) -> list:
    """Run every task/seed scenario and return one result dict per scenario.

    Args:
        url: Base URL of the environment API, passed through to ``run_episode``.
        agent: Agent instance passed through to ``run_episode``.
        verbose: Passed through to ``run_episode``.
        tasks: Task ids to evaluate; defaults to the module-level ``TASKS``.
        seeds: Seeds to evaluate per task; defaults to the module-level ``SEEDS``.

    Returns:
        A list with exactly one entry per (task, seed) pair, in run order.
        A failed scenario is recorded as a dict with ``task_id``, ``seed``,
        ``final_score`` of 0.0 and an ``error`` message.
    """
    # Resolved at call time so that reassigning the module globals still works.
    tasks = TASKS if tasks is None else tasks
    seeds = SEEDS if seeds is None else seeds
    all_results = []

    for task in tasks:
        print(f"\n\u2500\u2500 Task: {task} \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500")
        for seed in seeds:
            try:
                result = run_episode(url, task, seed, agent, verbose)
                score = result["final_score"]
                # Clamp so an out-of-range score cannot distort the 10-cell bar.
                filled = max(0, min(10, int(score * 10)))
                bar = "\u2588" * filled + "\u2591" * (10 - filled)
                line = f" Seed {seed:2d}: [{bar}] {score:.3f} ({result['issues_found']}/{result['issues_total']} issues)"
            except Exception as e:
                print(f" Seed {seed:2d}: FAILED \u2014 {e}")
                all_results.append({"task_id": task, "seed": seed, "final_score": 0.0, "error": str(e)})
            else:
                # Record only after formatting succeeded, so a scenario is
                # never stored both as a success and as a failure.
                print(line)
                all_results.append(result)

    return all_results
|
| 39 |
+
|
| 40 |
+
def print_summary(results: list):
    """Print per-task and overall score statistics for a batch run.

    Results carrying an ``error`` key are treated as failed scenarios and are
    left out of every statistic; they still count towards the scenario total
    shown on the ``OVERALL`` line.

    Args:
        results: List of result dicts; successful ones provide ``task_id``
            and ``final_score``.
    """
    from collections import defaultdict
    import statistics

    print("\n" + "="*60)
    print("EVALUATION SUMMARY")
    print("="*60)

    # Group the scores of successful scenarios by task, in first-seen order.
    by_task = defaultdict(list)
    for r in results:
        if "error" not in r:
            by_task[r["task_id"]].append(r["final_score"])

    overall_scores = [s for scores in by_task.values() for s in scores]

    for task, scores in by_task.items():
        print(f"\n{task.upper().replace('_', ' ')}")
        print(f" Mean: {statistics.mean(scores):.3f}")
        print(f" Median: {statistics.median(scores):.3f}")
        # stdev needs at least two samples; report 0 for a single score.
        print(f" Stdev: {statistics.stdev(scores) if len(scores) > 1 else 0:.3f}")
        print(f" Best: {max(scores):.3f}")
        print(f" Worst: {min(scores):.3f}")

    if overall_scores:
        # The denominator is the number of scenarios actually attempted,
        # which is fewer than 30 when the run was limited to one task.
        print(f"\nOVERALL ({len(overall_scores)}/{len(results)} scenarios)")
        print(f" Mean score: {statistics.mean(overall_scores):.3f}")
        print(f" Success rate (>0.5): {sum(1 for s in overall_scores if s > 0.5)/len(overall_scores)*100:.1f}%")
    else:
        print("\nNo successful scenarios to summarise.")

    print("="*60)
|
| 71 |
+
|
| 72 |
+
def main():
    """Run the batch evaluation from the command line.

    Parses the CLI options, builds the requested agent, checks that the
    API's ``/health`` endpoint responds, runs every selected scenario,
    prints the summary report and saves the results to ``--output``.
    Exits with status 1 when the agent cannot be built, the API is
    unreachable, or the results cannot be saved.
    """
    # Declared up front: --task narrows the module-level task list that
    # run_batch_evaluation reads.
    global TASKS

    parser = argparse.ArgumentParser(description="Batch evaluation of all 30 CodeReview scenarios")
    parser.add_argument("--url", default="http://localhost:7860")
    parser.add_argument("--agent", default="keyword", choices=["keyword", "llm"])
    parser.add_argument("--api-key", default="")
    parser.add_argument("--output", default="results.json", help="Output file (.json or .csv)")
    parser.add_argument("--verbose", action="store_true")
    # None is the default rather than a choice; listing it in `choices`
    # only adds a meaningless "None" option to the help text.
    parser.add_argument("--task", default=None,
                        choices=["bug_detection", "security_audit", "architectural_review"],
                        help="Run only a specific task (default: all)")
    args = parser.parse_args()

    # Reject an unusable output path before the batch runs, not after.
    if args.output and not args.output.endswith((".json", ".csv")):
        parser.error("--output must end in .json or .csv")

    if args.agent == "llm":
        import os
        api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY", "")
        if not api_key:
            print("ERROR: LLM agent requires --api-key or ANTHROPIC_API_KEY env var", file=sys.stderr)
            sys.exit(1)
        agent = LLMAgent(api_key)
    else:
        agent = KeywordAgent()

    # Check connectivity
    try:
        import requests
        requests.get(f"{args.url}/health", timeout=5).raise_for_status()
    except Exception as e:
        print(f"ERROR: Cannot connect to {args.url}: {e}", file=sys.stderr)
        sys.exit(1)

    if args.task:
        TASKS = [args.task]

    print(f"Running evaluation: {len(TASKS)} task(s), {len(SEEDS)} seeds each")
    print(f"Agent: {args.agent} | API: {args.url}")
    start = time.time()

    results = run_batch_evaluation(args.url, agent, args.verbose)

    print(f"\nCompleted in {time.time()-start:.1f}s")
    print_summary(results)

    if args.output:
        try:
            save_results(results, args.output)
        except Exception as e:
            print(f"ERROR: Could not save results to {args.output}: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"\nResults saved to: {args.output}")


if __name__ == "__main__":
    main()
|