Add MDPBench result

#39
Files changed (1) hide show
  1. .eval_results/mdpbench.yaml +0 -198
.eval_results/mdpbench.yaml DELETED
@@ -1,198 +0,0 @@
1
- - dataset:
2
- id: Delores-Lin/MDPBench
3
- task_id: overall
4
- value: 63.9
5
- date: "2026-04-28"
6
- source:
7
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
8
- name: MDPBench leaderboard
9
- user: Delores-Lin
10
- - dataset:
11
- id: Delores-Lin/MDPBench
12
- task_id: digital
13
- value: 80.2
14
- date: "2026-04-28"
15
- source:
16
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
17
- name: MDPBench leaderboard
18
- user: Delores-Lin
19
- - dataset:
20
- id: Delores-Lin/MDPBench
21
- task_id: photographed
22
- value: 58.5
23
- date: "2026-04-28"
24
- source:
25
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
26
- name: MDPBench leaderboard
27
- user: Delores-Lin
28
- - dataset:
29
- id: Delores-Lin/MDPBench
30
- task_id: latin
31
- value: 73.7
32
- date: "2026-04-28"
33
- source:
34
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
35
- name: MDPBench leaderboard
36
- user: Delores-Lin
37
- - dataset:
38
- id: Delores-Lin/MDPBench
39
- task_id: de
40
- value: 72.4
41
- date: "2026-04-28"
42
- source:
43
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
44
- name: MDPBench leaderboard
45
- user: Delores-Lin
46
- - dataset:
47
- id: Delores-Lin/MDPBench
48
- task_id: en
49
- value: 79.1
50
- date: "2026-04-28"
51
- source:
52
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
53
- name: MDPBench leaderboard
54
- user: Delores-Lin
55
- - dataset:
56
- id: Delores-Lin/MDPBench
57
- task_id: es
58
- value: 65.0
59
- date: "2026-04-28"
60
- source:
61
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
62
- name: MDPBench leaderboard
63
- user: Delores-Lin
64
- - dataset:
65
- id: Delores-Lin/MDPBench
66
- task_id: fr
67
- value: 62.1
68
- date: "2026-04-28"
69
- source:
70
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
71
- name: MDPBench leaderboard
72
- user: Delores-Lin
73
- - dataset:
74
- id: Delores-Lin/MDPBench
75
- task_id: id
76
- value: 72.9
77
- date: "2026-04-28"
78
- source:
79
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
80
- name: MDPBench leaderboard
81
- user: Delores-Lin
82
- - dataset:
83
- id: Delores-Lin/MDPBench
84
- task_id: it
85
- value: 82.9
86
- date: "2026-04-28"
87
- source:
88
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
89
- name: MDPBench leaderboard
90
- user: Delores-Lin
91
- - dataset:
92
- id: Delores-Lin/MDPBench
93
- task_id: nl
94
- value: 70.2
95
- date: "2026-04-28"
96
- source:
97
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
98
- name: MDPBench leaderboard
99
- user: Delores-Lin
100
- - dataset:
101
- id: Delores-Lin/MDPBench
102
- task_id: pt
103
- value: 83.8
104
- date: "2026-04-28"
105
- source:
106
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
107
- name: MDPBench leaderboard
108
- user: Delores-Lin
109
- - dataset:
110
- id: Delores-Lin/MDPBench
111
- task_id: vi
112
- value: 74.9
113
- date: "2026-04-28"
114
- source:
115
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
116
- name: MDPBench leaderboard
117
- user: Delores-Lin
118
- - dataset:
119
- id: Delores-Lin/MDPBench
120
- task_id: non_latin
121
- value: 52.8
122
- date: "2026-04-28"
123
- source:
124
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
125
- name: MDPBench leaderboard
126
- user: Delores-Lin
127
- - dataset:
128
- id: Delores-Lin/MDPBench
129
- task_id: ar
130
- value: 64.2
131
- date: "2026-04-28"
132
- source:
133
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
134
- name: MDPBench leaderboard
135
- user: Delores-Lin
136
- - dataset:
137
- id: Delores-Lin/MDPBench
138
- task_id: hi
139
- value: 59.0
140
- date: "2026-04-28"
141
- source:
142
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
143
- name: MDPBench leaderboard
144
- user: Delores-Lin
145
- - dataset:
146
- id: Delores-Lin/MDPBench
147
- task_id: jp
148
- value: 50.5
149
- date: "2026-04-28"
150
- source:
151
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
152
- name: MDPBench leaderboard
153
- user: Delores-Lin
154
- - dataset:
155
- id: Delores-Lin/MDPBench
156
- task_id: ko
157
- value: 41.6
158
- date: "2026-04-28"
159
- source:
160
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
161
- name: MDPBench leaderboard
162
- user: Delores-Lin
163
- - dataset:
164
- id: Delores-Lin/MDPBench
165
- task_id: ru
166
- value: 54.3
167
- date: "2026-04-28"
168
- source:
169
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
170
- name: MDPBench leaderboard
171
- user: Delores-Lin
172
- - dataset:
173
- id: Delores-Lin/MDPBench
174
- task_id: th
175
- value: 51.4
176
- date: "2026-04-28"
177
- source:
178
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
179
- name: MDPBench leaderboard
180
- user: Delores-Lin
181
- - dataset:
182
- id: Delores-Lin/MDPBench
183
- task_id: zh
184
- value: 46.6
185
- date: "2026-04-28"
186
- source:
187
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
188
- name: MDPBench leaderboard
189
- user: Delores-Lin
190
- - dataset:
191
- id: Delores-Lin/MDPBench
192
- task_id: zh_t
193
- value: 54.7
194
- date: "2026-04-28"
195
- source:
196
- url: https://huggingface.co/datasets/Delores-Lin/MDPBench
197
- name: MDPBench leaderboard
198
- user: Delores-Lin