hevok commited on
Commit
28c074d
·
verified ·
1 Parent(s): 9bf9226

Upload pawsx_0.4.8_results_2025-03-18T13-56-32.464817.json

Browse files
lm_eval/Qwen__Qwen2.5-7B/0.4.8/pawsx_0.4.8_results_2025-03-18T13-56-32.464817.json ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "pawsx": {
4
+ "acc,none": 0.5892857142857143,
5
+ "acc_stderr,none": 0.004124430121757617,
6
+ "alias": "pawsx"
7
+ },
8
+ "paws_de": {
9
+ "alias": " - paws_de",
10
+ "acc,none": 0.648,
11
+ "acc_stderr,none": 0.010681996654477083
12
+ },
13
+ "paws_en": {
14
+ "alias": " - paws_en",
15
+ "acc,none": 0.6495,
16
+ "acc_stderr,none": 0.01067154233969731
17
+ },
18
+ "paws_es": {
19
+ "alias": " - paws_es",
20
+ "acc,none": 0.6445,
21
+ "acc_stderr,none": 0.010705941508891127
22
+ },
23
+ "paws_fr": {
24
+ "alias": " - paws_fr",
25
+ "acc,none": 0.615,
26
+ "acc_stderr,none": 0.010883323176386978
27
+ },
28
+ "paws_ja": {
29
+ "alias": " - paws_ja",
30
+ "acc,none": 0.473,
31
+ "acc_stderr,none": 0.01116681910502999
32
+ },
33
+ "paws_ko": {
34
+ "alias": " - paws_ko",
35
+ "acc,none": 0.5385,
36
+ "acc_stderr,none": 0.01114993432795706
37
+ },
38
+ "paws_zh": {
39
+ "alias": " - paws_zh",
40
+ "acc,none": 0.5565,
41
+ "acc_stderr,none": 0.011111507899646485
42
+ }
43
+ },
44
+ "groups": {
45
+ "pawsx": {
46
+ "acc,none": 0.5892857142857143,
47
+ "acc_stderr,none": 0.004124430121757617,
48
+ "alias": "pawsx"
49
+ }
50
+ },
51
+ "group_subtasks": {
52
+ "pawsx": [
53
+ "paws_en",
54
+ "paws_de",
55
+ "paws_es",
56
+ "paws_fr",
57
+ "paws_ja",
58
+ "paws_ko",
59
+ "paws_zh"
60
+ ]
61
+ },
62
+ "configs": {
63
+ "paws_de": {
64
+ "task": "paws_de",
65
+ "dataset_path": "paws-x",
66
+ "dataset_name": "de",
67
+ "training_split": "train",
68
+ "validation_split": "validation",
69
+ "test_split": "test",
70
+ "doc_to_text": "",
71
+ "doc_to_target": "label",
72
+ "unsafe_code": false,
73
+ "doc_to_choice": "{{[sentence1+\", richtig? Nein, \"+sentence2, sentence1+\", richtig? Ja, \"+sentence2]}}",
74
+ "description": "",
75
+ "target_delimiter": " ",
76
+ "fewshot_delimiter": "\n\n",
77
+ "num_fewshot": 0,
78
+ "metric_list": [
79
+ {
80
+ "metric": "acc",
81
+ "aggregation": "mean",
82
+ "higher_is_better": true
83
+ }
84
+ ],
85
+ "output_type": "multiple_choice",
86
+ "repeats": 1,
87
+ "should_decontaminate": false,
88
+ "metadata": {
89
+ "version": 1.0
90
+ }
91
+ },
92
+ "paws_en": {
93
+ "task": "paws_en",
94
+ "dataset_path": "paws-x",
95
+ "dataset_name": "en",
96
+ "training_split": "train",
97
+ "validation_split": "validation",
98
+ "test_split": "test",
99
+ "doc_to_text": "",
100
+ "doc_to_target": "label",
101
+ "unsafe_code": false,
102
+ "doc_to_choice": "{{[sentence1+\", right? No, \"+sentence2, sentence1+\", right? Yes, \"+sentence2]}}",
103
+ "description": "",
104
+ "target_delimiter": " ",
105
+ "fewshot_delimiter": "\n\n",
106
+ "num_fewshot": 0,
107
+ "metric_list": [
108
+ {
109
+ "metric": "acc",
110
+ "aggregation": "mean",
111
+ "higher_is_better": true
112
+ }
113
+ ],
114
+ "output_type": "multiple_choice",
115
+ "repeats": 1,
116
+ "should_decontaminate": false,
117
+ "metadata": {
118
+ "version": 1.0
119
+ }
120
+ },
121
+ "paws_es": {
122
+ "task": "paws_es",
123
+ "dataset_path": "paws-x",
124
+ "dataset_name": "es",
125
+ "training_split": "train",
126
+ "validation_split": "validation",
127
+ "test_split": "test",
128
+ "doc_to_text": "",
129
+ "doc_to_target": "label",
130
+ "unsafe_code": false,
131
+ "doc_to_choice": "{{[sentence1+\", verdad? No, \"+sentence2, sentence1+\", verdad? Sí, \"+sentence2]}}",
132
+ "description": "",
133
+ "target_delimiter": " ",
134
+ "fewshot_delimiter": "\n\n",
135
+ "num_fewshot": 0,
136
+ "metric_list": [
137
+ {
138
+ "metric": "acc",
139
+ "aggregation": "mean",
140
+ "higher_is_better": true
141
+ }
142
+ ],
143
+ "output_type": "multiple_choice",
144
+ "repeats": 1,
145
+ "should_decontaminate": false,
146
+ "metadata": {
147
+ "version": 1.0
148
+ }
149
+ },
150
+ "paws_fr": {
151
+ "task": "paws_fr",
152
+ "dataset_path": "paws-x",
153
+ "dataset_name": "fr",
154
+ "training_split": "train",
155
+ "validation_split": "validation",
156
+ "test_split": "test",
157
+ "doc_to_text": "",
158
+ "doc_to_target": "label",
159
+ "unsafe_code": false,
160
+ "doc_to_choice": "{{[sentence1+\", n'est-ce pas? Non, \"+sentence2, sentence1+\", n'est-ce pas? Oui, \"+sentence2]}}",
161
+ "description": "",
162
+ "target_delimiter": " ",
163
+ "fewshot_delimiter": "\n\n",
164
+ "num_fewshot": 0,
165
+ "metric_list": [
166
+ {
167
+ "metric": "acc",
168
+ "aggregation": "mean",
169
+ "higher_is_better": true
170
+ }
171
+ ],
172
+ "output_type": "multiple_choice",
173
+ "repeats": 1,
174
+ "should_decontaminate": false,
175
+ "metadata": {
176
+ "version": 1.0
177
+ }
178
+ },
179
+ "paws_ja": {
180
+ "task": "paws_ja",
181
+ "dataset_path": "paws-x",
182
+ "dataset_name": "ja",
183
+ "training_split": "train",
184
+ "validation_split": "validation",
185
+ "test_split": "test",
186
+ "doc_to_text": "",
187
+ "doc_to_target": "label",
188
+ "unsafe_code": false,
189
+ "doc_to_choice": "{{[sentence1+\", ですね? いいえ, \"+sentence2, sentence1+\", ですね? はい, \"+sentence2]}}",
190
+ "description": "",
191
+ "target_delimiter": " ",
192
+ "fewshot_delimiter": "\n\n",
193
+ "num_fewshot": 0,
194
+ "metric_list": [
195
+ {
196
+ "metric": "acc",
197
+ "aggregation": "mean",
198
+ "higher_is_better": true
199
+ }
200
+ ],
201
+ "output_type": "multiple_choice",
202
+ "repeats": 1,
203
+ "should_decontaminate": false,
204
+ "metadata": {
205
+ "version": 1.0
206
+ }
207
+ },
208
+ "paws_ko": {
209
+ "task": "paws_ko",
210
+ "dataset_path": "paws-x",
211
+ "dataset_name": "ko",
212
+ "training_split": "train",
213
+ "validation_split": "validation",
214
+ "test_split": "test",
215
+ "doc_to_text": "",
216
+ "doc_to_target": "label",
217
+ "unsafe_code": false,
218
+ "doc_to_choice": "{{[sentence1+\", 맞죠? 아니요, \"+sentence2, sentence1+\", 맞죠? 예, \"+sentence2]}}",
219
+ "description": "",
220
+ "target_delimiter": " ",
221
+ "fewshot_delimiter": "\n\n",
222
+ "num_fewshot": 0,
223
+ "metric_list": [
224
+ {
225
+ "metric": "acc",
226
+ "aggregation": "mean",
227
+ "higher_is_better": true
228
+ }
229
+ ],
230
+ "output_type": "multiple_choice",
231
+ "repeats": 1,
232
+ "should_decontaminate": false,
233
+ "metadata": {
234
+ "version": 1.0
235
+ }
236
+ },
237
+ "paws_zh": {
238
+ "task": "paws_zh",
239
+ "dataset_path": "paws-x",
240
+ "dataset_name": "zh",
241
+ "training_split": "train",
242
+ "validation_split": "validation",
243
+ "test_split": "test",
244
+ "doc_to_text": "",
245
+ "doc_to_target": "label",
246
+ "unsafe_code": false,
247
+ "doc_to_choice": "{{[sentence1+\", 对吧? 不是, \"+sentence2, sentence1+\", 对吧? 是, \"+sentence2]}}",
248
+ "description": "",
249
+ "target_delimiter": " ",
250
+ "fewshot_delimiter": "\n\n",
251
+ "num_fewshot": 0,
252
+ "metric_list": [
253
+ {
254
+ "metric": "acc",
255
+ "aggregation": "mean",
256
+ "higher_is_better": true
257
+ }
258
+ ],
259
+ "output_type": "multiple_choice",
260
+ "repeats": 1,
261
+ "should_decontaminate": false,
262
+ "metadata": {
263
+ "version": 1.0
264
+ }
265
+ }
266
+ },
267
+ "versions": {
268
+ "paws_de": 1.0,
269
+ "paws_en": 1.0,
270
+ "paws_es": 1.0,
271
+ "paws_fr": 1.0,
272
+ "paws_ja": 1.0,
273
+ "paws_ko": 1.0,
274
+ "paws_zh": 1.0,
275
+ "pawsx": 0.0
276
+ },
277
+ "n-shot": {
278
+ "paws_de": 0,
279
+ "paws_en": 0,
280
+ "paws_es": 0,
281
+ "paws_fr": 0,
282
+ "paws_ja": 0,
283
+ "paws_ko": 0,
284
+ "paws_zh": 0
285
+ },
286
+ "higher_is_better": {
287
+ "paws_de": {
288
+ "acc": true
289
+ },
290
+ "paws_en": {
291
+ "acc": true
292
+ },
293
+ "paws_es": {
294
+ "acc": true
295
+ },
296
+ "paws_fr": {
297
+ "acc": true
298
+ },
299
+ "paws_ja": {
300
+ "acc": true
301
+ },
302
+ "paws_ko": {
303
+ "acc": true
304
+ },
305
+ "paws_zh": {
306
+ "acc": true
307
+ },
308
+ "pawsx": {
309
+ "acc": true
310
+ }
311
+ },
312
+ "n-samples": {
313
+ "paws_en": {
314
+ "original": 2000,
315
+ "effective": 2000
316
+ },
317
+ "paws_de": {
318
+ "original": 2000,
319
+ "effective": 2000
320
+ },
321
+ "paws_es": {
322
+ "original": 2000,
323
+ "effective": 2000
324
+ },
325
+ "paws_fr": {
326
+ "original": 2000,
327
+ "effective": 2000
328
+ },
329
+ "paws_ja": {
330
+ "original": 2000,
331
+ "effective": 2000
332
+ },
333
+ "paws_ko": {
334
+ "original": 2000,
335
+ "effective": 2000
336
+ },
337
+ "paws_zh": {
338
+ "original": 2000,
339
+ "effective": 2000
340
+ }
341
+ },
342
+ "config": {
343
+ "model": "hf",
344
+ "model_args": "pretrained=Qwen/Qwen2.5-7B,dtype=float32,trust_remote_code=True",
345
+ "model_num_parameters": 7615616512,
346
+ "model_dtype": "torch.float32",
347
+ "model_revision": "main",
348
+ "model_sha": "d149729398750b98c0af14eb82c78cfe92750796",
349
+ "batch_size": "auto",
350
+ "batch_sizes": [
351
+ 32
352
+ ],
353
+ "device": "cuda",
354
+ "use_cache": null,
355
+ "limit": null,
356
+ "bootstrap_iters": 100000,
357
+ "gen_kwargs": null,
358
+ "random_seed": 0,
359
+ "numpy_seed": 1234,
360
+ "torch_seed": 1234,
361
+ "fewshot_seed": 1234
362
+ },
363
+ "git_hash": null,
364
+ "date": 1742304929.7642052,
365
+ "pretty_env_info": "PyTorch version: 2.5.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.16.3\nLibc version: glibc-2.31\n\nPython version: 3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1077-aws-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA L40S\nNvidia driver version: 535.230.02\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 48 bits physical, 48 bits virtual\nCPU(s): 16\nOn-line CPU(s) list: 0-15\nThread(s) per core: 2\nCore(s) per socket: 8\nSocket(s): 1\nNUMA node(s): 1\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7R13 Processor\nStepping: 1\nCPU MHz: 2649.998\nBogoMIPS: 5299.99\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 256 KiB\nL1i cache: 256 KiB\nL2 cache: 4 MiB\nL3 cache: 32 MiB\nNUMA node0 CPU(s): 0-15\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; IBRS_FW; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch topoext invpcid_single ssbd ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru wbnoinvd arat npt nrip_save vaes vpclmulqdq rdpid\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] pytorch-lightning==2.5.0.post0\n[pip3] torch==2.5.1+cu121\n[pip3] torchaudio==2.5.1+cu121\n[pip3] torchmetrics==1.3.1\n[pip3] torchvision==0.20.1+cu121\n[pip3] triton==3.1.0\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] pytorch-lightning 2.5.0.post0 pypi_0 pypi\n[conda] torch 2.5.1+cu121 pypi_0 pypi\n[conda] torchaudio 2.5.1+cu121 pypi_0 pypi\n[conda] torchmetrics 1.3.1 pypi_0 pypi\n[conda] torchvision 0.20.1+cu121 pypi_0 pypi\n[conda] triton 3.1.0 pypi_0 pypi",
366
+ "transformers_version": "4.49.0",
367
+ "upper_git_hash": null,
368
+ "tokenizer_pad_token": [
369
+ "<|endoftext|>",
370
+ "151643"
371
+ ],
372
+ "tokenizer_eos_token": [
373
+ "<|endoftext|>",
374
+ "151643"
375
+ ],
376
+ "tokenizer_bos_token": [
377
+ null,
378
+ "None"
379
+ ],
380
+ "eot_token_id": 151643,
381
+ "max_length": 131072,
382
+ "task_hashes": {},
383
+ "model_source": "hf",
384
+ "model_name": "Qwen/Qwen2.5-7B",
385
+ "model_name_sanitized": "Qwen__Qwen2.5-7B",
386
+ "system_instruction": null,
387
+ "system_instruction_sha": null,
388
+ "fewshot_as_multiturn": false,
389
+ "chat_template": null,
390
+ "chat_template_sha": null,
391
+ "start_time": 106.775862057,
392
+ "end_time": 1381.904321725,
393
+ "total_evaluation_time_seconds": "1275.128459668"
394
+ }