CVE-Bench (#2)
Browse files
- Add CVE-Bench (a6538dc6d948215f50f2edf1b94a28749e73f49c)
- meta_data.py +7 -1
- results.json +23 -1
meta_data.py
CHANGED
|
@@ -97,4 +97,10 @@ LEADERBOARD_MD['BountyBench'] = """This is a benchmark with 25 systems with comp
|
|
| 97 |
|
| 98 |
Paper: https://arxiv.org/abs/2505.15216
|
| 99 |
Code: https://github.com/bountybench/bountybench
|
| 100 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
Paper: https://arxiv.org/abs/2505.15216
|
| 99 |
Code: https://github.com/bountybench/bountybench
|
| 100 |
+
"""
|
| 101 |
+
|
| 102 |
+
LEADERBOARD_MD["CVE-Bench"] = """A Benchmark for AI Agents' Ability to Exploit Real-World Web Application Vulnerabilities.
|
| 103 |
+
|
| 104 |
+
Paper: https://arxiv.org/abs/2503.17332
|
| 105 |
+
Code: https://github.com/uiuc-kang-lab/cve-bench/
|
| 106 |
+
"""
|
results.json
CHANGED
|
@@ -852,6 +852,28 @@
|
|
| 852 |
"C-Agent: Gemini 2.5": 45,
|
| 853 |
"C-Agent: GPT-4.1": 50
|
| 854 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 855 |
}
|
| 856 |
}
|
| 857 |
-
}
|
|
|
|
| 852 |
"C-Agent: Gemini 2.5": 45,
|
| 853 |
"C-Agent: GPT-4.1": 50
|
| 854 |
}
|
| 855 |
+
},
|
| 856 |
+
"CVE-Bench": {
|
| 857 |
+
"Zero-day Pass@1": {
|
| 858 |
+
"T-Agent + GPT-4o (2024-11-20)": 8.0,
|
| 859 |
+
"AutoGPT + GPT-4o (2024-11-20)": 3.0,
|
| 860 |
+
"Cy-Agent + GPT-4o (2024-11-20)": 1.0
|
| 861 |
+
},
|
| 862 |
+
"Zero-day Pass@5": {
|
| 863 |
+
"T-Agent + GPT-4o (2024-11-20)": 10.0,
|
| 864 |
+
"AutoGPT + GPT-4o (2024-11-20)": 10.0,
|
| 865 |
+
"Cy-Agent + GPT-4o (2024-11-20)": 2.5
|
| 866 |
+
},
|
| 867 |
+
"One-day Pass@1": {
|
| 868 |
+
"T-Agent + GPT-4o (2024-11-20)": 7.0,
|
| 869 |
+
"AutoGPT + GPT-4o (2024-11-20)": 4.5,
|
| 870 |
+
"Cy-Agent + GPT-4o (2024-11-20)": 2.5
|
| 871 |
+
},
|
| 872 |
+
"One-day Pass@5": {
|
| 873 |
+
"T-Agent + GPT-4o (2024-11-20)": 12.5,
|
| 874 |
+
"AutoGPT + GPT-4o (2024-11-20)": 5.0,
|
| 875 |
+
"Cy-Agent + GPT-4o (2024-11-20)": 2.5
|
| 876 |
+
}
|
| 877 |
}
|
| 878 |
}
|
| 879 |
+
}
|