lennarddaw committed
Commit 88b5445 · verified · 1 Parent(s): a2cf9f6

Upload llm_benchmark.json with huggingface_hub

Files changed (1)
  1. llm_benchmark.json +358 -0
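The commit message says the file was pushed with huggingface_hub. As a minimal sketch of what such an upload typically looks like (the repo_id, local path, and token handling below are placeholders and assumptions, not taken from this commit):

# Sketch only: push a local results file to a Hub repo with huggingface_hub.
from huggingface_hub import HfApi

api = HfApi()  # uses the token from HF_TOKEN / `huggingface-cli login`
api.upload_file(
    path_or_fileobj="llm_benchmark.json",   # local results file (assumed name)
    path_in_repo="llm_benchmark.json",      # destination path inside the repo
    repo_id="<user>/<repo>",                # placeholder, not from the commit
    commit_message="Upload llm_benchmark.json with huggingface_hub",
)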
llm_benchmark.json ADDED
@@ -0,0 +1,358 @@
{
  "HowzerSeverity (ours)": {
    "params": "336M",
    "type": "fine-tuned",
    "metrics": {
      "n_valid": 48,
      "n_total": 48,
      "parse_rate": 1.0,
      "tier_f1_weighted": 1.0,
      "tier_f1_macro": 1.0,
      "tier_accuracy": 1.0,
      "score_mae": 0.030167026676734283,
      "tier_f1_low": 1.0,
      "tier_f1_medium": 1.0,
      "tier_f1_high": 1.0,
      "tier_f1_critical": 1.0,
      "critical_to_low": 0,
      "confusion_matrix": [[24, 0, 0, 0], [0, 19, 0, 0], [0, 0, 2, 0], [0, 0, 0, 3]]
    }
  },
  "mDeBERTa XNLI": {
    "params": "~300M",
    "type": "zero-shot-nli",
    "metrics": {
      "n_valid": 48,
      "n_total": 48,
      "parse_rate": 1.0,
      "tier_f1_weighted": 0.45595583494519665,
      "tier_f1_macro": 0.2838491295938104,
      "tier_accuracy": 0.4375,
      "score_mae": 0.16276687665792802,
      "tier_f1_low": 0.6808510638297872,
      "tier_f1_medium": 0.2727272727272727,
      "tier_f1_high": 0.18181818181818182,
      "tier_f1_critical": 0.0,
      "critical_to_low": 0,
      "confusion_matrix": [[16, 0, 8, 0], [7, 3, 7, 2], [0, 0, 2, 0], [0, 0, 3, 0]]
    }
  },
  "BART MNLI": {
    "params": "~300M",
    "type": "zero-shot-nli",
    "metrics": {
      "n_valid": 48,
      "n_total": 48,
      "parse_rate": 1.0,
      "tier_f1_weighted": 0.21050724637681162,
      "tier_f1_macro": 0.14347826086956522,
      "tier_accuracy": 0.16666666666666666,
      "score_mae": 0.23398512904193877,
      "tier_f1_low": 0.26666666666666666,
      "tier_f1_medium": 0.17391304347826086,
      "tier_f1_high": 0.0,
      "tier_f1_critical": 0.13333333333333333,
      "critical_to_low": 0,
      "confusion_matrix": [[4, 2, 6, 12], [1, 2, 4, 12], [1, 0, 0, 1], [0, 0, 1, 2]]
    }
  },
  "German Sentiment BERT": {
    "params": "110M",
    "type": "sentiment-mapped",
    "metrics": {
      "n_valid": 48,
      "n_total": 48,
      "parse_rate": 1.0,
      "tier_f1_weighted": 0.2868131868131868,
      "tier_f1_macro": 0.1695970695970696,
      "tier_accuracy": 0.2708333333333333,
      "score_mae": 0.19046067998939412,
      "tier_f1_low": 0.5641025641025641,
      "tier_f1_medium": 0.0,
      "tier_f1_high": 0.11428571428571428,
      "tier_f1_critical": 0.0,
      "critical_to_low": 0,
      "confusion_matrix": [[11, 0, 13, 0], [4, 0, 15, 0], [0, 0, 2, 0], [0, 0, 3, 0]]
    }
  },
  "nlptown Star Rating": {
    "params": "110M",
    "type": "sentiment-mapped",
    "metrics": {
      "n_valid": 48,
      "n_total": 48,
      "parse_rate": 1.0,
      "tier_f1_weighted": 0.4287094645550528,
      "tier_f1_macro": 0.30458144796380093,
      "tier_accuracy": 0.375,
      "score_mae": 0.21206720113219513,
      "tier_f1_low": 0.6153846153846154,
      "tier_f1_medium": 0.25,
      "tier_f1_high": 0.0,
      "tier_f1_critical": 0.35294117647058826,
      "critical_to_low": 0,
      "confusion_matrix": [[12, 2, 7, 3], [3, 3, 7, 6], [0, 0, 0, 2], [0, 0, 0, 3]]
    }
  },
  "Claude Opus 4.6": {
    "params": "~70B?",
    "type": "llm",
    "metrics": {
      "n_valid": 48,
      "n_total": 48,
      "parse_rate": 1.0,
      "tier_f1_weighted": 0.8849479166666666,
      "tier_f1_macro": 0.818125,
      "tier_accuracy": 0.875,
      "score_mae": 0.06494583333333333,
      "tier_f1_low": 0.96,
      "tier_f1_medium": 0.8125,
      "tier_f1_high": 0.5,
      "tier_f1_critical": 1.0,
      "critical_to_low": 0,
      "confusion_matrix": [[24, 0, 0, 0], [2, 13, 4, 0], [0, 0, 2, 0], [0, 0, 0, 3]]
    }
  },
  "Claude Sonnet 4.6": {
    "params": "~70B?",
    "type": "llm",
    "metrics": {
      "n_valid": 48,
      "n_total": 48,
      "parse_rate": 1.0,
      "tier_f1_weighted": 0.9809684684684684,
      "tier_f1_macro": 0.9432432432432433,
      "tier_accuracy": 0.9791666666666666,
      "score_mae": 0.005687499999999999,
      "tier_f1_low": 1.0,
      "tier_f1_medium": 0.972972972972973,
      "tier_f1_high": 0.8,
      "tier_f1_critical": 1.0,
      "critical_to_low": 0,
      "confusion_matrix": [[24, 0, 0, 0], [0, 18, 1, 0], [0, 0, 2, 0], [0, 0, 0, 3]]
    }
  },
  "Claude Haiku 4.5": {
    "params": "~8B?",
    "type": "llm",
    "metrics": {
      "n_valid": 48,
      "n_total": 48,
      "parse_rate": 1.0,
      "tier_f1_weighted": 0.8112037037037038,
      "tier_f1_macro": 0.7794444444444444,
      "tier_accuracy": 0.8125,
      "score_mae": 0.037641666666666664,
      "tier_f1_low": 0.84,
      "tier_f1_medium": 0.7777777777777778,
      "tier_f1_high": 0.5,
      "tier_f1_critical": 1.0,
      "critical_to_low": 0,
      "confusion_matrix": [[21, 3, 0, 0], [4, 14, 1, 0], [1, 0, 1, 0], [0, 0, 0, 3]]
    }
  },
  "_meta": {
    "n_samples": 48,
    "timestamp": "2026-02-24 18:13:25"
  }
}
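For reference, a minimal sketch of reading this file and printing a small leaderboard sorted by weighted tier F1. It assumes only the key names visible in the JSON above and that the file is stored locally as llm_benchmark.json:

# Sketch only: load the benchmark JSON and summarize per-model metrics.
import json

with open("llm_benchmark.json") as f:
    results = json.load(f)

# Skip the "_meta" block and sort models by weighted tier F1, best first.
rows = [(name, e["params"], e["metrics"])
        for name, e in results.items() if not name.startswith("_")]
rows.sort(key=lambda r: r[2]["tier_f1_weighted"], reverse=True)

print(f"{'model':<25} {'params':>8} {'f1_w':>7} {'acc':>7} {'mae':>7}")
for name, params, m in rows:
    print(f"{name:<25} {params:>8} {m['tier_f1_weighted']:>7.3f} "
          f"{m['tier_accuracy']:>7.3f} {m['score_mae']:>7.3f}")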