CodeLens / results.json
ArshVerma's picture
Skip DB init in TESTING; add reset_db
c90ac2d
Raw
History Blame Contribute Delete
9.26 kB
[
{
"episode_id": "27c65a00-9315-4d7d-b457-f8d64b0da466",
"task_id": "bug_detection",
"seed": 0,
"final_score": 0.0,
"steps_taken": 1,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 0,
"terminated_reason": "terminal_action",
"duration_seconds": 0.02
},
{
"episode_id": "8bdcee56-8a56-4ec8-88fc-a70e78ab48f2",
"task_id": "bug_detection",
"seed": 1,
"final_score": 0.0,
"steps_taken": 1,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 0,
"terminated_reason": "terminal_action",
"duration_seconds": 0.02
},
{
"episode_id": "d3dcda88-ce7e-4965-9409-4e97c98cf444",
"task_id": "bug_detection",
"seed": 2,
"final_score": 0.9167,
"steps_taken": 6,
"issues_found": 1,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.04
},
{
"episode_id": "822dfb7d-d67b-4dc9-9cfe-866665a9e5b9",
"task_id": "bug_detection",
"seed": 3,
"final_score": 0.9167,
"steps_taken": 6,
"issues_found": 1,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "c35c2096-b7b8-4a54-b0c7-d8fa1e1538bf",
"task_id": "bug_detection",
"seed": 4,
"final_score": 0.8267,
"steps_taken": 6,
"issues_found": 1,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "9b4e5191-57da-4b10-b31e-5b8236b9b1f4",
"task_id": "bug_detection",
"seed": 5,
"final_score": 0.0,
"steps_taken": 1,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 0,
"terminated_reason": "terminal_action",
"duration_seconds": 0.01
},
{
"episode_id": "c1e64a16-512a-4716-b0f9-5d9fe14a142d",
"task_id": "bug_detection",
"seed": 6,
"final_score": 0.0,
"steps_taken": 1,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 0,
"terminated_reason": "terminal_action",
"duration_seconds": 0.02
},
{
"episode_id": "cc8bd066-506e-46de-ba0f-0a446f045945",
"task_id": "bug_detection",
"seed": 7,
"final_score": 0.0,
"steps_taken": 1,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 0,
"terminated_reason": "terminal_action",
"duration_seconds": 0.02
},
{
"episode_id": "20dd7158-a49f-4b6c-adcd-6b76b4274e94",
"task_id": "bug_detection",
"seed": 8,
"final_score": 0.9167,
"steps_taken": 6,
"issues_found": 1,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "14b9d1a5-03b7-45b2-8b23-b6c387bced5b",
"task_id": "bug_detection",
"seed": 9,
"final_score": 0.0,
"steps_taken": 5,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "2b1c4b7f-cf05-4312-89d2-49883b9ed6c1",
"task_id": "security_audit",
"seed": 0,
"final_score": 0.0,
"steps_taken": 5,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "47145114-7a99-4c81-baa9-1f9c4221f48a",
"task_id": "security_audit",
"seed": 1,
"final_score": 1.0,
"steps_taken": 6,
"issues_found": 1,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "5d9d3f74-a127-46b2-a331-45f30f0d2f6f",
"task_id": "security_audit",
"seed": 2,
"final_score": 0.0,
"steps_taken": 5,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "f383afc4-df2d-4a60-abd1-a583cb2de538",
"task_id": "security_audit",
"seed": 3,
"final_score": 0.85,
"steps_taken": 6,
"issues_found": 1,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "df23e18b-c21a-4390-9afa-f77eb1b8050b",
"task_id": "security_audit",
"seed": 4,
"final_score": 0.0,
"steps_taken": 5,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "bcec83af-4aaf-4997-922a-1556ca63fcf3",
"task_id": "security_audit",
"seed": 5,
"final_score": 0.0,
"steps_taken": 5,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "65afedd0-9ad1-45f7-9615-dafeaa390338",
"task_id": "security_audit",
"seed": 6,
"final_score": 0.0,
"steps_taken": 5,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "0e8b65e5-d59c-41fb-8038-1f3b1d4b657b",
"task_id": "security_audit",
"seed": 7,
"final_score": 0.0,
"steps_taken": 5,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "ae477e21-2718-477e-8a11-e46bda49bea7",
"task_id": "security_audit",
"seed": 8,
"final_score": 0.0,
"steps_taken": 5,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.04
},
{
"episode_id": "0cd3d18c-cdf6-4752-9dc8-cade848cee0e",
"task_id": "security_audit",
"seed": 9,
"final_score": 0.0,
"steps_taken": 5,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "d9f79316-57ea-42ad-b191-797ea895951b",
"task_id": "architectural_review",
"seed": 0,
"final_score": 0.0,
"steps_taken": 1,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 0,
"terminated_reason": "terminal_action",
"duration_seconds": 0.01
},
{
"episode_id": "65112da3-d319-4eb3-9774-1c43313fe1ec",
"task_id": "architectural_review",
"seed": 1,
"final_score": 0.059,
"steps_taken": 5,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "c1d6bd63-1abf-403b-97be-cf1962959910",
"task_id": "architectural_review",
"seed": 2,
"final_score": 0.661,
"steps_taken": 6,
"issues_found": 1,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "fab56551-f2ae-42be-88eb-854ef236b29a",
"task_id": "architectural_review",
"seed": 3,
"final_score": 0.658,
"steps_taken": 5,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "8f4b8fa9-f281-4bb2-ae71-7ad6c7ca7fc9",
"task_id": "architectural_review",
"seed": 4,
"final_score": 0.058,
"steps_taken": 5,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "7eaa445d-774d-44b1-80d9-06e978b022df",
"task_id": "architectural_review",
"seed": 5,
"final_score": 0.657,
"steps_taken": 6,
"issues_found": 1,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "103d6e2b-e40d-49ff-8f36-6910d463b48b",
"task_id": "architectural_review",
"seed": 6,
"final_score": 0.059,
"steps_taken": 5,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "5206e9d8-0ee0-482c-ad3d-34adf2ce57a7",
"task_id": "architectural_review",
"seed": 7,
"final_score": 0.664,
"steps_taken": 6,
"issues_found": 1,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.03
},
{
"episode_id": "c147abd7-f887-44a2-b166-5382e292d793",
"task_id": "architectural_review",
"seed": 8,
"final_score": 0.039,
"steps_taken": 5,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.08
},
{
"episode_id": "6fe54153-a878-47d7-a521-5cd70f55f6b8",
"task_id": "architectural_review",
"seed": 9,
"final_score": 0.075,
"steps_taken": 5,
"issues_found": 0,
"issues_total": 1,
"noise_penalties": 5,
"terminated_reason": "noise_exhausted",
"duration_seconds": 0.09
}
]