lllouo commited on
Commit
cce4575
·
1 Parent(s): e0e242c

update leaderboard.json

Browse files
Files changed (1) hide show
  1. leaderboard.json +139 -27
leaderboard.json CHANGED
@@ -2,13 +2,21 @@
2
  {
3
  "ID": 1,
4
  "Category": "RA",
 
 
 
 
 
 
 
 
5
  "Benchmark": "ARC_deepseek_r1_denoising",
6
  "WAR": 0.00,
7
  "SED": 0.67,
8
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/ARC/arc_deepseek_r1_denoising)"
9
  },
10
  {
11
- "ID": 2,
12
  "Category": "RA",
13
  "Benchmark": "ARC_wac_gec",
14
  "WAR": 0.00,
@@ -16,7 +24,15 @@
16
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/ARC/arc_wac_gec)"
17
  },
18
  {
19
- "ID": 3,
 
 
 
 
 
 
 
 
20
  "Category": "TG",
21
  "Benchmark": "COQA_deepseek_r1_denoising",
22
  "WAR": 4.18,
@@ -24,7 +40,7 @@
24
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/COQA/coqa_deepseek_r1_denoising)"
25
  },
26
  {
27
- "ID": 4,
28
  "Category": "TG",
29
  "Benchmark": "COQA_wac_gec",
30
  "WAR": 4.70,
@@ -32,7 +48,15 @@
32
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/COQA/coqa_wac_gec)"
33
  },
34
  {
35
- "ID": 5,
 
 
 
 
 
 
 
 
36
  "Category": "TG",
37
  "Benchmark": "DROP_deepseek_r1_denoising",
38
  "WAR": 0.02,
@@ -40,7 +64,7 @@
40
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/DROP/drop_deepseek_r1_denoising)"
41
  },
42
  {
43
- "ID": 6,
44
  "Category": "TG",
45
  "Benchmark": "DROP_wac_gec",
46
  "WAR": 0.64,
@@ -48,7 +72,15 @@
48
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/DROP/drop_wac_gec)"
49
  },
50
  {
51
- "ID": 7,
 
 
 
 
 
 
 
 
52
  "Category": "BT",
53
  "Benchmark": "MRPC_deepseek_r1_denoising",
54
  "WAR": 3.80,
@@ -56,7 +88,7 @@
56
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_deepseek_r1_denoising/mrpc)"
57
  },
58
  {
59
- "ID": 8,
60
  "Category": "BT",
61
  "Benchmark": "MRPC_wac_gec",
62
  "WAR": 1.84,
@@ -64,7 +96,15 @@
64
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_wac_gec/mrpc)"
65
  },
66
  {
67
- "ID": 9,
 
 
 
 
 
 
 
 
68
  "Category": "BT",
69
  "Benchmark": "RTE_deepseek_r1_denoising",
70
  "WAR": 0.36,
@@ -72,7 +112,7 @@
72
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_deepseek_r1_denoising/rte)"
73
  },
74
  {
75
- "ID": 10,
76
  "Category": "BT",
77
  "Benchmark": "RTE_wac_gec",
78
  "WAR": 0.72,
@@ -80,7 +120,15 @@
80
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_wac_gec/rte)"
81
  },
82
  {
83
- "ID": 11,
 
 
 
 
 
 
 
 
84
  "Category": "BT",
85
  "Benchmark": "SST2_deepseek_r1_denoising",
86
  "WAR": 7.22,
@@ -88,7 +136,7 @@
88
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_deepseek_r1_denoising/sst2)"
89
  },
90
  {
91
- "ID": 12,
92
  "Category": "BT",
93
  "Benchmark": "SST2_wac_gec",
94
  "WAR": 5.39,
@@ -96,7 +144,15 @@
96
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_wac_gec/sst2)"
97
  },
98
  {
99
- "ID": 13,
 
 
 
 
 
 
 
 
100
  "Category": "SU",
101
  "Benchmark": "WNLI_deepseek_r1_denoising",
102
  "WAR": 0.00,
@@ -104,7 +160,7 @@
104
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_deepseek_r1_denoising/wnli)"
105
  },
106
  {
107
- "ID": 14,
108
  "Category": "SU",
109
  "Benchmark": "WNLI_wac_gec",
110
  "WAR": 0.00,
@@ -112,7 +168,15 @@
112
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_wac_gec/wnli)"
113
  },
114
  {
115
- "ID": 15,
 
 
 
 
 
 
 
 
116
  "Category": "RA",
117
  "Benchmark": "GSM8K_deepseek_r1_denoising",
118
  "WAR": 0.30,
@@ -120,7 +184,7 @@
120
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GSM8K/gsm8k_deepseek_r1_denoising)"
121
  },
122
  {
123
- "ID": 16,
124
  "Category": "RA",
125
  "Benchmark": "GSM8K_wac_gec",
126
  "WAR": 1.97,
@@ -128,7 +192,15 @@
128
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GSM8K/gsm8k_wac_gec)"
129
  },
130
  {
131
- "ID": 17,
 
 
 
 
 
 
 
 
132
  "Category": "RA",
133
  "Benchmark": "MMLU_deepseek_r1_denoising",
134
  "WAR": 6.56,
@@ -136,7 +208,7 @@
136
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MMLU/mmlu_deepseek_r1_denoising)"
137
  },
138
  {
139
- "ID": 18,
140
  "Category": "RA",
141
  "Benchmark": "MMLU_wac_gec",
142
  "WAR": 2.98,
@@ -144,7 +216,15 @@
144
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MMLU/mmlu_wac_gec)"
145
  },
146
  {
147
- "ID": 19,
 
 
 
 
 
 
 
 
148
  "Category": "ME",
149
  "Benchmark": "MedMCQA_deepseek_r1_denoising",
150
  "WAR": 3.44,
@@ -152,7 +232,7 @@
152
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedMCQA/medmcqa_deepseek_r1_denoising)"
153
  },
154
  {
155
- "ID": 20,
156
  "Category": "ME",
157
  "Benchmark": "MedMCQA_wac_gec",
158
  "WAR": 2.44,
@@ -160,7 +240,15 @@
160
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedMCQA/medmcqa_wac_gec)"
161
  },
162
  {
163
- "ID": 21,
 
 
 
 
 
 
 
 
164
  "Category": "ME",
165
  "Benchmark": "MedQA_deepseek_r1_denoising",
166
  "WAR": 16.26,
@@ -168,7 +256,7 @@
168
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedQA/MedQA_deepseek_r1_denoising)"
169
  },
170
  {
171
- "ID": 22,
172
  "Category": "ME",
173
  "Benchmark": "MedQA_wac_gec",
174
  "WAR": 0.79,
@@ -176,7 +264,15 @@
176
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedQA/MedQA_wac_gec)"
177
  },
178
  {
179
- "ID": 23,
 
 
 
 
 
 
 
 
180
  "Category": "SU",
181
  "Benchmark": "Natural_questions_deepseek_r1_denoising",
182
  "WAR": 0.06,
@@ -184,7 +280,7 @@
184
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Natural_questions/nq_open_deepseek_r1_denoising)"
185
  },
186
  {
187
- "ID": 24,
188
  "Category": "SU",
189
  "Benchmark": "Natural_questions_wac_gec",
190
  "WAR": 0.28,
@@ -192,7 +288,15 @@
192
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Natural_questions/nq_open_wac_gec)"
193
  },
194
  {
195
- "ID": 25,
 
 
 
 
 
 
 
 
196
  "Category": "ME",
197
  "Benchmark": "PubMedQA_deepseek_r1_denoising",
198
  "WAR": 0.20,
@@ -200,7 +304,7 @@
200
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/PubMedQA/pubmed_qa_deepseek_r1_denoising)"
201
  },
202
  {
203
- "ID": 26,
204
  "Category": "ME",
205
  "Benchmark": "PubMedQA_wac_gec",
206
  "WAR": 0.00,
@@ -208,7 +312,15 @@
208
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/PubMedQA/pubmed_qa_wac_gec)"
209
  },
210
  {
211
- "ID": 27,
 
 
 
 
 
 
 
 
212
  "Category": "TG",
213
  "Benchmark": "Truthful_QA_deepseek_r1_denoising",
214
  "WAR": 0.00,
@@ -216,7 +328,7 @@
216
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Truthful_QA/truthful_qa_deepseek_r1_denoising)"
217
  },
218
  {
219
- "ID": 28,
220
  "Category": "TG",
221
  "Benchmark": "Truthful_QA_wac_gec",
222
  "WAR": 0.00,
 
2
  {
3
  "ID": 1,
4
  "Category": "RA",
5
+ "Benchmark": "ARC",
6
+ "WAR": 0.11,
7
+ "SED": 0.67,
8
+ "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/ARC/arc)"
9
+ },
10
+ {
11
+ "ID": 2,
12
+ "Category": "RA",
13
  "Benchmark": "ARC_deepseek_r1_denoising",
14
  "WAR": 0.00,
15
  "SED": 0.67,
16
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/ARC/arc_deepseek_r1_denoising)"
17
  },
18
  {
19
+ "ID": 3,
20
  "Category": "RA",
21
  "Benchmark": "ARC_wac_gec",
22
  "WAR": 0.00,
 
24
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/ARC/arc_wac_gec)"
25
  },
26
  {
27
+ "ID": 4,
28
+ "Category": "TG",
29
+ "Benchmark": "COQA",
30
+ "WAR": 6.79,
31
+ "SED": 2.74,
32
+ "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/COQA/coqa)"
33
+ },
34
+ {
35
+ "ID": 5,
36
  "Category": "TG",
37
  "Benchmark": "COQA_deepseek_r1_denoising",
38
  "WAR": 4.18,
 
40
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/COQA/coqa_deepseek_r1_denoising)"
41
  },
42
  {
43
+ "ID": 6,
44
  "Category": "TG",
45
  "Benchmark": "COQA_wac_gec",
46
  "WAR": 4.70,
 
48
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/COQA/coqa_wac_gec)"
49
  },
50
  {
51
+ "ID": 7,
52
+ "Category": "TG",
53
+ "Benchmark": "DROP",
54
+ "WAR": 1.50,
55
+ "SED": 3.38,
56
+ "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/DROP/drop)"
57
+ },
58
+ {
59
+ "ID": 8,
60
  "Category": "TG",
61
  "Benchmark": "DROP_deepseek_r1_denoising",
62
  "WAR": 0.02,
 
64
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/DROP/drop_deepseek_r1_denoising)"
65
  },
66
  {
67
+ "ID": 9,
68
  "Category": "TG",
69
  "Benchmark": "DROP_wac_gec",
70
  "WAR": 0.64,
 
72
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/DROP/drop_wac_gec)"
73
  },
74
  {
75
+ "ID": 10,
76
+ "Category": "BT",
77
+ "Benchmark": "MRPC",
78
+ "WAR": 100.00,
79
+ "SED": 5.65,
80
+ "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/mrpc)"
81
+ },
82
+ {
83
+ "ID": 11,
84
  "Category": "BT",
85
  "Benchmark": "MRPC_deepseek_r1_denoising",
86
  "WAR": 3.80,
 
88
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_deepseek_r1_denoising/mrpc)"
89
  },
90
  {
91
+ "ID": 12,
92
  "Category": "BT",
93
  "Benchmark": "MRPC_wac_gec",
94
  "WAR": 1.84,
 
96
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_wac_gec/mrpc)"
97
  },
98
  {
99
+ "ID": 13,
100
+ "Category": "BT",
101
+ "Benchmark": "RTE",
102
+ "WAR": 2.17,
103
+ "SED": 4.47,
104
+ "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/rte)"
105
+ },
106
+ {
107
+ "ID": 14,
108
  "Category": "BT",
109
  "Benchmark": "RTE_deepseek_r1_denoising",
110
  "WAR": 0.36,
 
112
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_deepseek_r1_denoising/rte)"
113
  },
114
  {
115
+ "ID": 15,
116
  "Category": "BT",
117
  "Benchmark": "RTE_wac_gec",
118
  "WAR": 0.72,
 
120
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_wac_gec/rte)"
121
  },
122
  {
123
+ "ID": 16,
124
+ "Category": "BT",
125
+ "Benchmark": "SST2",
126
+ "WAR": 98.97,
127
+ "SED": 5.42,
128
+ "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/sst2)"
129
+ },
130
+ {
131
+ "ID": 17,
132
  "Category": "BT",
133
  "Benchmark": "SST2_deepseek_r1_denoising",
134
  "WAR": 7.22,
 
136
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_deepseek_r1_denoising/sst2)"
137
  },
138
  {
139
+ "ID": 18,
140
  "Category": "BT",
141
  "Benchmark": "SST2_wac_gec",
142
  "WAR": 5.39,
 
144
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_wac_gec/sst2)"
145
  },
146
  {
147
+ "ID": 19,
148
+ "Category": "SU",
149
+ "Benchmark": "WNLI",
150
+ "WAR": 0.70,
151
+ "SED": 0.64,
152
+ "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/wnli)"
153
+ },
154
+ {
155
+ "ID": 20,
156
  "Category": "SU",
157
  "Benchmark": "WNLI_deepseek_r1_denoising",
158
  "WAR": 0.00,
 
160
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_deepseek_r1_denoising/wnli)"
161
  },
162
  {
163
+ "ID": 21,
164
  "Category": "SU",
165
  "Benchmark": "WNLI_wac_gec",
166
  "WAR": 0.00,
 
168
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_wac_gec/wnli)"
169
  },
170
  {
171
+ "ID": 22,
172
+ "Category": "RA",
173
+ "Benchmark": "GSM8K",
174
+ "WAR": 25.70,
175
+ "SED": 1.11,
176
+ "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GSM8K/gsm8k)"
177
+ },
178
+ {
179
+ "ID": 23,
180
  "Category": "RA",
181
  "Benchmark": "GSM8K_deepseek_r1_denoising",
182
  "WAR": 0.30,
 
184
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GSM8K/gsm8k_deepseek_r1_denoising)"
185
  },
186
  {
187
+ "ID": 24,
188
  "Category": "RA",
189
  "Benchmark": "GSM8K_wac_gec",
190
  "WAR": 1.97,
 
192
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GSM8K/gsm8k_wac_gec)"
193
  },
194
  {
195
+ "ID": 25,
196
+ "Category": "RA",
197
+ "Benchmark": "MMLU",
198
+ "WAR": 10.06,
199
+ "SED": 2.21,
200
+ "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MMLU/mmlu)"
201
+ },
202
+ {
203
+ "ID": 26,
204
  "Category": "RA",
205
  "Benchmark": "MMLU_deepseek_r1_denoising",
206
  "WAR": 6.56,
 
208
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MMLU/mmlu_deepseek_r1_denoising)"
209
  },
210
  {
211
+ "ID": 27,
212
  "Category": "RA",
213
  "Benchmark": "MMLU_wac_gec",
214
  "WAR": 2.98,
 
216
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MMLU/mmlu_wac_gec)"
217
  },
218
  {
219
+ "ID": 28,
220
+ "Category": "ME",
221
+ "Benchmark": "MedMCQA",
222
+ "WAR": 6.31,
223
+ "SED": 6.18,
224
+ "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedMCQA/medmcqa)"
225
+ },
226
+ {
227
+ "ID": 29,
228
  "Category": "ME",
229
  "Benchmark": "MedMCQA_deepseek_r1_denoising",
230
  "WAR": 3.44,
 
232
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedMCQA/medmcqa_deepseek_r1_denoising)"
233
  },
234
  {
235
+ "ID": 30,
236
  "Category": "ME",
237
  "Benchmark": "MedMCQA_wac_gec",
238
  "WAR": 2.44,
 
240
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedMCQA/medmcqa_wac_gec)"
241
  },
242
  {
243
+ "ID": 31,
244
+ "Category": "ME",
245
+ "Benchmark": "MedQA",
246
+ "WAR": 16.97,
247
+ "SED": 6.49,
248
+ "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedQA/MedQA-USMLE-4-options)"
249
+ },
250
+ {
251
+ "ID": 32,
252
  "Category": "ME",
253
  "Benchmark": "MedQA_deepseek_r1_denoising",
254
  "WAR": 16.26,
 
256
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedQA/MedQA_deepseek_r1_denoising)"
257
  },
258
  {
259
+ "ID": 33,
260
  "Category": "ME",
261
  "Benchmark": "MedQA_wac_gec",
262
  "WAR": 0.79,
 
264
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedQA/MedQA_wac_gec)"
265
  },
266
  {
267
+ "ID": 34,
268
+ "Category": "SU",
269
+ "Benchmark": "Natural_questions",
270
+ "WAR": 0.17,
271
+ "SED": 2.90,
272
+ "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Natural_questions/nq_open)"
273
+ },
274
+ {
275
+ "ID": 35,
276
  "Category": "SU",
277
  "Benchmark": "Natural_questions_deepseek_r1_denoising",
278
  "WAR": 0.06,
 
280
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Natural_questions/nq_open_deepseek_r1_denoising)"
281
  },
282
  {
283
+ "ID": 36,
284
  "Category": "SU",
285
  "Benchmark": "Natural_questions_wac_gec",
286
  "WAR": 0.28,
 
288
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Natural_questions/nq_open_wac_gec)"
289
  },
290
  {
291
+ "ID": 37,
292
+ "Category": "ME",
293
+ "Benchmark": "PubMedQA",
294
+ "WAR": 0.60,
295
+ "SED": 8.15,
296
+ "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/PubMedQA/pubmed_qa)"
297
+ },
298
+ {
299
+ "ID": 38,
300
  "Category": "ME",
301
  "Benchmark": "PubMedQA_deepseek_r1_denoising",
302
  "WAR": 0.20,
 
304
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/PubMedQA/pubmed_qa_deepseek_r1_denoising)"
305
  },
306
  {
307
+ "ID": 39,
308
  "Category": "ME",
309
  "Benchmark": "PubMedQA_wac_gec",
310
  "WAR": 0.00,
 
312
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/PubMedQA/pubmed_qa_wac_gec)"
313
  },
314
  {
315
+ "ID": 40,
316
+ "Category": "TG",
317
+ "Benchmark": "Truthful_QA",
318
+ "WAR": 0.00,
319
+ "SED": 1.75,
320
+ "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Truthful_QA/truthful_qa)"
321
+ },
322
+ {
323
+ "ID": 41,
324
  "Category": "TG",
325
  "Benchmark": "Truthful_QA_deepseek_r1_denoising",
326
  "WAR": 0.00,
 
328
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Truthful_QA/truthful_qa_deepseek_r1_denoising)"
329
  },
330
  {
331
+ "ID": 42,
332
  "Category": "TG",
333
  "Benchmark": "Truthful_QA_wac_gec",
334
  "WAR": 0.00,