Spaces:
Configuration error
Configuration error
File size: 4,294 Bytes
e4f3d12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 | {
"summary": {
"total_samples": 2,
"overall_accuracy": 1.0,
"cwe_breakdown": {
"CWE-89": {
"accuracy": 1.0,
"count": 2
}
}
},
"results": [
{
"sample_id": "synthetic-00100",
"gt_vulnerable": true,
"gt_cwe": "CWE-89",
"final_verdict": true,
"is_correct": true,
"total_reward": 0.8999999999999999,
"history": [
{
"step": 0,
"model_output": "<action><action_type>request_context</action_type><file_path>auth.c</file_path></action>",
"parsed_action": "<action><action_type>request_context</action_type><file_path>auth.c</file_path></action>",
"reward": -0.05,
"observation": {
"episode_id": "c98a9f07-18fe-40a3-8c65-161a46fdabbd",
"step_idx": 1,
"diff": "--- a/db.py\n+++ b/db.py\n@@\n- cursor.execute(\"SELECT * FROM users WHERE id = %s\", (user_id,))\n+ cursor.execute(f\"SELECT * FROM users WHERE id = {user_id}\")\n",
"available_files": [
"db.py"
],
"context_snippets": [],
"budget_remaining": 4,
"error": "context_unavailable"
}
},
{
"step": 1,
"model_output": "<action><action_type>verdict</action_type><is_vulnerable>true</is_vulnerable><vuln_type>CWE-89</vuln_type><exploit_sketch>SQL injection in user_id</exploit_sketch></action>",
"parsed_action": "<action><action_type>verdict</action_type><is_vulnerable>true</is_vulnerable><vuln_type>CWE-89</vuln_type><exploit_sketch>SQL injection in user_id</exploit_sketch></action>",
"reward": 0.95,
"observation": {
"episode_id": "c98a9f07-18fe-40a3-8c65-161a46fdabbd",
"step_idx": 2,
"diff": "--- a/db.py\n+++ b/db.py\n@@\n- cursor.execute(\"SELECT * FROM users WHERE id = %s\", (user_id,))\n+ cursor.execute(f\"SELECT * FROM users WHERE id = {user_id}\")\n",
"available_files": [
"db.py"
],
"context_snippets": [],
"budget_remaining": 3,
"error": null
}
}
]
},
{
"sample_id": "synthetic-00101",
"gt_vulnerable": true,
"gt_cwe": "CWE-89",
"final_verdict": true,
"is_correct": true,
"total_reward": 0.8999999999999999,
"history": [
{
"step": 0,
"model_output": "<action><action_type>request_context</action_type><file_path>auth.c</file_path></action>",
"parsed_action": "<action><action_type>request_context</action_type><file_path>auth.c</file_path></action>",
"reward": -0.05,
"observation": {
"episode_id": "299ca2fd-e3e6-4bac-b8a2-d7404a52e07d",
"step_idx": 1,
"diff": "--- a/db.py\n+++ b/db.py\n@@\n- cursor.execute(\"SELECT * FROM users WHERE id = %s\", (user_id,))\n+ cursor.execute(f\"SELECT * FROM users WHERE id = {user_id}\")\n",
"available_files": [
"db.py"
],
"context_snippets": [],
"budget_remaining": 4,
"error": "context_unavailable"
}
},
{
"step": 1,
"model_output": "<action><action_type>verdict</action_type><is_vulnerable>true</is_vulnerable><vuln_type>CWE-89</vuln_type><exploit_sketch>SQL injection in user_id</exploit_sketch></action>",
"parsed_action": "<action><action_type>verdict</action_type><is_vulnerable>true</is_vulnerable><vuln_type>CWE-89</vuln_type><exploit_sketch>SQL injection in user_id</exploit_sketch></action>",
"reward": 0.95,
"observation": {
"episode_id": "299ca2fd-e3e6-4bac-b8a2-d7404a52e07d",
"step_idx": 2,
"diff": "--- a/db.py\n+++ b/db.py\n@@\n- cursor.execute(\"SELECT * FROM users WHERE id = %s\", (user_id,))\n+ cursor.execute(f\"SELECT * FROM users WHERE id = {user_id}\")\n",
"available_files": [
"db.py"
],
"context_snippets": [],
"budget_remaining": 3,
"error": null
}
}
]
}
]
}
|