hevok commited on
Commit
be529ae
·
verified ·
1 Parent(s): 4ef0ec3

Upload folder using huggingface_hub

Browse files
lm_eval/meta-llama__Llama-3.2-1B/arc_challenge_0.4.8_results_2025-03-18T11-27-49.934618.json ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_challenge": {
4
+ "alias": "arc_challenge",
5
+ "acc,none": 0.31313993174061433,
6
+ "acc_stderr,none": 0.013552671543623497,
7
+ "acc_norm,none": 0.3626279863481229,
8
+ "acc_norm_stderr,none": 0.014049106564955003
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "arc_challenge": []
13
+ },
14
+ "configs": {
15
+ "arc_challenge": {
16
+ "task": "arc_challenge",
17
+ "tag": [
18
+ "ai2_arc"
19
+ ],
20
+ "dataset_path": "allenai/ai2_arc",
21
+ "dataset_name": "ARC-Challenge",
22
+ "training_split": "train",
23
+ "validation_split": "validation",
24
+ "test_split": "test",
25
+ "doc_to_text": "Question: {{question}}\nAnswer:",
26
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
27
+ "unsafe_code": false,
28
+ "doc_to_choice": "{{choices.text}}",
29
+ "description": "",
30
+ "target_delimiter": " ",
31
+ "fewshot_delimiter": "\n\n",
32
+ "num_fewshot": 0,
33
+ "metric_list": [
34
+ {
35
+ "metric": "acc",
36
+ "aggregation": "mean",
37
+ "higher_is_better": true
38
+ },
39
+ {
40
+ "metric": "acc_norm",
41
+ "aggregation": "mean",
42
+ "higher_is_better": true
43
+ }
44
+ ],
45
+ "output_type": "multiple_choice",
46
+ "repeats": 1,
47
+ "should_decontaminate": true,
48
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
49
+ "metadata": {
50
+ "version": 1.0
51
+ }
52
+ }
53
+ },
54
+ "versions": {
55
+ "arc_challenge": 1.0
56
+ },
57
+ "n-shot": {
58
+ "arc_challenge": 0
59
+ },
60
+ "higher_is_better": {
61
+ "arc_challenge": {
62
+ "acc": true,
63
+ "acc_norm": true
64
+ }
65
+ },
66
+ "n-samples": {
67
+ "arc_challenge": {
68
+ "original": 1172,
69
+ "effective": 1172
70
+ }
71
+ },
72
+ "config": {
73
+ "model": "hf",
74
+ "model_args": "pretrained=meta-llama/Llama-3.2-1B,dtype=float32,trust_remote_code=True",
75
+ "model_num_parameters": 1235814400,
76
+ "model_dtype": "torch.float32",
77
+ "model_revision": "main",
78
+ "model_sha": "4e20de362430cd3b72f300e6b0f18e50e7166e08",
79
+ "batch_size": "auto",
80
+ "batch_sizes": [
81
+ 16
82
+ ],
83
+ "device": "cuda",
84
+ "use_cache": null,
85
+ "limit": null,
86
+ "bootstrap_iters": 100000,
87
+ "gen_kwargs": null,
88
+ "random_seed": 0,
89
+ "numpy_seed": 1234,
90
+ "torch_seed": 1234,
91
+ "fewshot_seed": 1234
92
+ },
93
+ "git_hash": null,
94
+ "date": 1742297154.8578038,
95
+ "pretty_env_info": "PyTorch version: 2.5.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: 14.0.0-1ubuntu1.1\nCMake version: version 3.31.2\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 6 2024, 20:22:13) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-6.6.56+-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.140\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: Tesla T4\nGPU 1: Tesla T4\n\nNvidia driver version: 560.35.03\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.6\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 4\nOn-line CPU(s) list: 0-3\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.00GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 2\nSocket(s): 1\nStepping: 3\nBogoMIPS: 4000.44\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 64 KiB (2 instances)\nL1i cache: 64 KiB (2 instances)\nL2 cache: 2 MiB (2 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-3\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Mitigation; PTE Inversion\nVulnerability Mds: Mitigation; Clear CPU buffers; SMT Host state unknown\nVulnerability Meltdown: Mitigation; PTI\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; IBRS; IBPB conditional; STIBP conditional; RSB filling; PBRSB-eIBRS Not affected; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Mitigation; Clear CPU buffers; SMT Host state unknown\n\nVersions of relevant libraries:\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] onnx==1.17.0\n[pip3] optree==0.13.1\n[pip3] pytorch-ignite==0.5.1\n[pip3] pytorch-lightning==2.5.0.post0\n[pip3] torch==2.5.1+cu121\n[pip3] torchaudio==2.5.1+cu121\n[pip3] torchinfo==1.8.0\n[pip3] torchmetrics==1.6.1\n[pip3] torchsummary==1.5.1\n[pip3] torchtune==0.5.0\n[pip3] torchvision==0.20.1+cu121\n[conda] Could not collect",
96
+ "transformers_version": "4.47.0",
97
+ "upper_git_hash": null,
98
+ "tokenizer_pad_token": [
99
+ "<|end_of_text|>",
100
+ "128001"
101
+ ],
102
+ "tokenizer_eos_token": [
103
+ "<|end_of_text|>",
104
+ "128001"
105
+ ],
106
+ "tokenizer_bos_token": [
107
+ "<|begin_of_text|>",
108
+ "128000"
109
+ ],
110
+ "eot_token_id": 128001,
111
+ "max_length": 131072,
112
+ "task_hashes": {},
113
+ "model_source": "hf",
114
+ "model_name": "meta-llama/Llama-3.2-1B",
115
+ "model_name_sanitized": "meta-llama__Llama-3.2-1B",
116
+ "system_instruction": null,
117
+ "system_instruction_sha": null,
118
+ "fewshot_as_multiturn": false,
119
+ "chat_template": null,
120
+ "chat_template_sha": null,
121
+ "start_time": 3403.099108597,
122
+ "end_time": 3534.480372462,
123
+ "total_evaluation_time_seconds": "131.38126386500016"
124
+ }
lm_eval/meta-llama__Llama-3.2-1B/arc_easy_0.4.8_results_2025-03-18T11-25-21.966280.json ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_easy": {
4
+ "alias": "arc_easy",
5
+ "acc,none": 0.6548821548821548,
6
+ "acc_stderr,none": 0.009755139387152048,
7
+ "acc_norm,none": 0.6047979797979798,
8
+ "acc_norm_stderr,none": 0.010031894052790978
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "arc_easy": []
13
+ },
14
+ "configs": {
15
+ "arc_easy": {
16
+ "task": "arc_easy",
17
+ "tag": [
18
+ "ai2_arc"
19
+ ],
20
+ "dataset_path": "allenai/ai2_arc",
21
+ "dataset_name": "ARC-Easy",
22
+ "training_split": "train",
23
+ "validation_split": "validation",
24
+ "test_split": "test",
25
+ "doc_to_text": "Question: {{question}}\nAnswer:",
26
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
27
+ "unsafe_code": false,
28
+ "doc_to_choice": "{{choices.text}}",
29
+ "description": "",
30
+ "target_delimiter": " ",
31
+ "fewshot_delimiter": "\n\n",
32
+ "num_fewshot": 0,
33
+ "metric_list": [
34
+ {
35
+ "metric": "acc",
36
+ "aggregation": "mean",
37
+ "higher_is_better": true
38
+ },
39
+ {
40
+ "metric": "acc_norm",
41
+ "aggregation": "mean",
42
+ "higher_is_better": true
43
+ }
44
+ ],
45
+ "output_type": "multiple_choice",
46
+ "repeats": 1,
47
+ "should_decontaminate": true,
48
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
49
+ "metadata": {
50
+ "version": 1.0
51
+ }
52
+ }
53
+ },
54
+ "versions": {
55
+ "arc_easy": 1.0
56
+ },
57
+ "n-shot": {
58
+ "arc_easy": 0
59
+ },
60
+ "higher_is_better": {
61
+ "arc_easy": {
62
+ "acc": true,
63
+ "acc_norm": true
64
+ }
65
+ },
66
+ "n-samples": {
67
+ "arc_easy": {
68
+ "original": 2376,
69
+ "effective": 2376
70
+ }
71
+ },
72
+ "config": {
73
+ "model": "hf",
74
+ "model_args": "pretrained=meta-llama/Llama-3.2-1B,dtype=float32,trust_remote_code=True",
75
+ "model_num_parameters": 1235814400,
76
+ "model_dtype": "torch.float32",
77
+ "model_revision": "main",
78
+ "model_sha": "4e20de362430cd3b72f300e6b0f18e50e7166e08",
79
+ "batch_size": "auto",
80
+ "batch_sizes": [
81
+ 16
82
+ ],
83
+ "device": "cuda",
84
+ "use_cache": null,
85
+ "limit": null,
86
+ "bootstrap_iters": 100000,
87
+ "gen_kwargs": null,
88
+ "random_seed": 0,
89
+ "numpy_seed": 1234,
90
+ "torch_seed": 1234,
91
+ "fewshot_seed": 1234
92
+ },
93
+ "git_hash": null,
94
+ "date": 1742296955.8166158,
95
+ "pretty_env_info": "PyTorch version: 2.5.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: 14.0.0-1ubuntu1.1\nCMake version: version 3.31.2\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 6 2024, 20:22:13) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-6.6.56+-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.140\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: Tesla T4\nGPU 1: Tesla T4\n\nNvidia driver version: 560.35.03\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.6\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 4\nOn-line CPU(s) list: 0-3\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.00GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 2\nSocket(s): 1\nStepping: 3\nBogoMIPS: 4000.44\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 64 KiB (2 instances)\nL1i cache: 64 KiB (2 instances)\nL2 cache: 2 MiB (2 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-3\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Mitigation; PTE Inversion\nVulnerability Mds: Mitigation; Clear CPU buffers; SMT Host state unknown\nVulnerability Meltdown: Mitigation; PTI\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; IBRS; IBPB conditional; STIBP conditional; RSB filling; PBRSB-eIBRS Not affected; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Mitigation; Clear CPU buffers; SMT Host state unknown\n\nVersions of relevant libraries:\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] onnx==1.17.0\n[pip3] optree==0.13.1\n[pip3] pytorch-ignite==0.5.1\n[pip3] pytorch-lightning==2.5.0.post0\n[pip3] torch==2.5.1+cu121\n[pip3] torchaudio==2.5.1+cu121\n[pip3] torchinfo==1.8.0\n[pip3] torchmetrics==1.6.1\n[pip3] torchsummary==1.5.1\n[pip3] torchtune==0.5.0\n[pip3] torchvision==0.20.1+cu121\n[conda] Could not collect",
96
+ "transformers_version": "4.47.0",
97
+ "upper_git_hash": null,
98
+ "tokenizer_pad_token": [
99
+ "<|end_of_text|>",
100
+ "128001"
101
+ ],
102
+ "tokenizer_eos_token": [
103
+ "<|end_of_text|>",
104
+ "128001"
105
+ ],
106
+ "tokenizer_bos_token": [
107
+ "<|begin_of_text|>",
108
+ "128000"
109
+ ],
110
+ "eot_token_id": 128001,
111
+ "max_length": 131072,
112
+ "task_hashes": {},
113
+ "model_source": "hf",
114
+ "model_name": "meta-llama/Llama-3.2-1B",
115
+ "model_name_sanitized": "meta-llama__Llama-3.2-1B",
116
+ "system_instruction": null,
117
+ "system_instruction_sha": null,
118
+ "fewshot_as_multiturn": false,
119
+ "chat_template": null,
120
+ "chat_template_sha": null,
121
+ "start_time": 3203.09991243,
122
+ "end_time": 3386.511960146,
123
+ "total_evaluation_time_seconds": "183.41204771599996"
124
+ }
lm_eval/meta-llama__Llama-3.2-1B/glue_0.4.8_results_2025-03-18T13-13-42.753968.json ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "cola": {
4
+ "alias": "cola",
5
+ "mcc,none": -0.001247821773438864,
6
+ "mcc_stderr,none": 0.031084198431639273
7
+ },
8
+ "mnli": {
9
+ "alias": "mnli",
10
+ "acc,none": 0.3584309730005094,
11
+ "acc_stderr,none": 0.0048406249332801745
12
+ },
13
+ "mnli_mismatch": {
14
+ "alias": "mnli_mismatch",
15
+ "acc,none": 0.3558787632221318,
16
+ "acc_stderr,none": 0.0048287641892860445
17
+ },
18
+ "mrpc": {
19
+ "alias": "mrpc",
20
+ "acc,none": 0.5245098039215687,
21
+ "acc_stderr,none": 0.024754284840506457,
22
+ "f1,none": 0.6407407407407407,
23
+ "f1_stderr,none": 0.024133162712869544
24
+ },
25
+ "qnli": {
26
+ "alias": "qnli",
27
+ "acc,none": 0.5172981878088962,
28
+ "acc_stderr,none": 0.006761360548456827
29
+ },
30
+ "qqp": {
31
+ "alias": "qqp",
32
+ "acc,none": 0.5002720751916894,
33
+ "acc_stderr,none": 0.0024867002635363352,
34
+ "f1,none": 0.3914824408168182,
35
+ "f1_stderr,none": 0.003384729521227567
36
+ },
37
+ "rte": {
38
+ "alias": "rte",
39
+ "acc,none": 0.5667870036101083,
40
+ "acc_stderr,none": 0.02982676408213828
41
+ },
42
+ "sst2": {
43
+ "alias": "sst2",
44
+ "acc,none": 0.7041284403669725,
45
+ "acc_stderr,none": 0.015465660633199553
46
+ },
47
+ "wnli": {
48
+ "alias": "wnli",
49
+ "acc,none": 0.4507042253521127,
50
+ "acc_stderr,none": 0.05947027187737998
51
+ }
52
+ },
53
+ "group_subtasks": {
54
+ "wnli": [],
55
+ "cola": [],
56
+ "mnli_mismatch": [],
57
+ "mnli": [],
58
+ "mrpc": [],
59
+ "qnli": [],
60
+ "qqp": [],
61
+ "rte": [],
62
+ "sst2": []
63
+ },
64
+ "configs": {
65
+ "cola": {
66
+ "task": "cola",
67
+ "tag": "glue",
68
+ "dataset_path": "glue",
69
+ "dataset_name": "cola",
70
+ "training_split": "train",
71
+ "validation_split": "validation",
72
+ "doc_to_text": "{{sentence}}\nQuestion: Does this sentence make sense?\nAnswer:",
73
+ "doc_to_target": "label",
74
+ "unsafe_code": false,
75
+ "doc_to_choice": [
76
+ "no",
77
+ "yes"
78
+ ],
79
+ "description": "",
80
+ "target_delimiter": " ",
81
+ "fewshot_delimiter": "\n\n",
82
+ "num_fewshot": 0,
83
+ "metric_list": [
84
+ {
85
+ "metric": "mcc"
86
+ }
87
+ ],
88
+ "output_type": "multiple_choice",
89
+ "repeats": 1,
90
+ "should_decontaminate": true,
91
+ "doc_to_decontamination_query": "sentence",
92
+ "metadata": {
93
+ "version": 1.0
94
+ }
95
+ },
96
+ "mnli": {
97
+ "task": "mnli",
98
+ "tag": "glue",
99
+ "dataset_path": "glue",
100
+ "dataset_name": "mnli",
101
+ "training_split": "train",
102
+ "validation_split": "validation_matched",
103
+ "doc_to_text": "def doc_to_text(doc) -> str:\n return \"{}\\nQuestion: {} True, False or Neither?\\nAnswer:\".format(\n doc[\"premise\"],\n doc[\"hypothesis\"].strip()\n + (\"\" if doc[\"hypothesis\"].strip().endswith(\".\") else \".\"),\n )\n",
104
+ "doc_to_target": "label",
105
+ "unsafe_code": false,
106
+ "doc_to_choice": [
107
+ "True",
108
+ "Neither",
109
+ "False"
110
+ ],
111
+ "description": "",
112
+ "target_delimiter": " ",
113
+ "fewshot_delimiter": "\n\n",
114
+ "num_fewshot": 0,
115
+ "metric_list": [
116
+ {
117
+ "metric": "acc"
118
+ }
119
+ ],
120
+ "output_type": "multiple_choice",
121
+ "repeats": 1,
122
+ "should_decontaminate": false,
123
+ "metadata": {
124
+ "version": 1.0
125
+ }
126
+ },
127
+ "mnli_mismatch": {
128
+ "task": "mnli_mismatch",
129
+ "tag": "glue",
130
+ "dataset_path": "glue",
131
+ "dataset_name": "mnli",
132
+ "training_split": "train",
133
+ "validation_split": "validation_mismatched",
134
+ "doc_to_text": "def doc_to_text(doc) -> str:\n return \"{}\\nQuestion: {} True, False or Neither?\\nAnswer:\".format(\n doc[\"premise\"],\n doc[\"hypothesis\"].strip()\n + (\"\" if doc[\"hypothesis\"].strip().endswith(\".\") else \".\"),\n )\n",
135
+ "doc_to_target": "label",
136
+ "unsafe_code": false,
137
+ "doc_to_choice": [
138
+ "True",
139
+ "Neither",
140
+ "False"
141
+ ],
142
+ "description": "",
143
+ "target_delimiter": " ",
144
+ "fewshot_delimiter": "\n\n",
145
+ "num_fewshot": 0,
146
+ "metric_list": [
147
+ {
148
+ "metric": "acc"
149
+ }
150
+ ],
151
+ "output_type": "multiple_choice",
152
+ "repeats": 1,
153
+ "should_decontaminate": false,
154
+ "metadata": {
155
+ "version": 1.0
156
+ }
157
+ },
158
+ "mrpc": {
159
+ "task": "mrpc",
160
+ "tag": "glue",
161
+ "dataset_path": "glue",
162
+ "dataset_name": "mrpc",
163
+ "training_split": "train",
164
+ "validation_split": "validation",
165
+ "doc_to_text": "Sentence 1: {{sentence1}}\nSentence 2: {{sentence2}}\nQuestion: Do both sentences mean the same thing?\nAnswer:",
166
+ "doc_to_target": "label",
167
+ "unsafe_code": false,
168
+ "doc_to_choice": [
169
+ "no",
170
+ "yes"
171
+ ],
172
+ "description": "",
173
+ "target_delimiter": " ",
174
+ "fewshot_delimiter": "\n\n",
175
+ "num_fewshot": 0,
176
+ "metric_list": [
177
+ {
178
+ "metric": "acc"
179
+ },
180
+ {
181
+ "metric": "f1"
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false,
187
+ "metadata": {
188
+ "version": 1.0
189
+ }
190
+ },
191
+ "qnli": {
192
+ "task": "qnli",
193
+ "tag": "glue",
194
+ "dataset_path": "glue",
195
+ "dataset_name": "qnli",
196
+ "training_split": "train",
197
+ "validation_split": "validation",
198
+ "doc_to_text": "{{question}}\n{{sentence}}\nQuestion: Does this response answer the question?\nAnswer:",
199
+ "doc_to_target": "label",
200
+ "unsafe_code": false,
201
+ "doc_to_choice": [
202
+ "yes",
203
+ "no"
204
+ ],
205
+ "description": "",
206
+ "target_delimiter": " ",
207
+ "fewshot_delimiter": "\n\n",
208
+ "num_fewshot": 0,
209
+ "metric_list": [
210
+ {
211
+ "metric": "acc"
212
+ }
213
+ ],
214
+ "output_type": "multiple_choice",
215
+ "repeats": 1,
216
+ "should_decontaminate": false,
217
+ "metadata": {
218
+ "version": 1.0
219
+ }
220
+ },
221
+ "qqp": {
222
+ "task": "qqp",
223
+ "tag": "glue",
224
+ "dataset_path": "glue",
225
+ "dataset_name": "qqp",
226
+ "training_split": "train",
227
+ "validation_split": "validation",
228
+ "doc_to_text": "Question 1: {{question1}}\nQuestion 2: {{question2}}\nQuestion: Do both questions ask the same thing?\nAnswer:",
229
+ "doc_to_target": "label",
230
+ "unsafe_code": false,
231
+ "doc_to_choice": [
232
+ "no",
233
+ "yes"
234
+ ],
235
+ "description": "",
236
+ "target_delimiter": " ",
237
+ "fewshot_delimiter": "\n\n",
238
+ "num_fewshot": 0,
239
+ "metric_list": [
240
+ {
241
+ "metric": "acc"
242
+ },
243
+ {
244
+ "metric": "f1"
245
+ }
246
+ ],
247
+ "output_type": "multiple_choice",
248
+ "repeats": 1,
249
+ "should_decontaminate": false,
250
+ "metadata": {
251
+ "version": 2.0
252
+ }
253
+ },
254
+ "rte": {
255
+ "task": "rte",
256
+ "tag": "glue",
257
+ "dataset_path": "glue",
258
+ "dataset_name": "rte",
259
+ "training_split": "train",
260
+ "validation_split": "validation",
261
+ "doc_to_text": "{{sentence1}}\nQuestion: {{sentence2}} True or False?\nAnswer:",
262
+ "doc_to_target": "label",
263
+ "unsafe_code": false,
264
+ "doc_to_choice": [
265
+ "True",
266
+ "False"
267
+ ],
268
+ "description": "",
269
+ "target_delimiter": " ",
270
+ "fewshot_delimiter": "\n\n",
271
+ "num_fewshot": 0,
272
+ "metric_list": [
273
+ {
274
+ "metric": "acc"
275
+ }
276
+ ],
277
+ "output_type": "multiple_choice",
278
+ "repeats": 1,
279
+ "should_decontaminate": false,
280
+ "metadata": {
281
+ "version": 1.0
282
+ }
283
+ },
284
+ "sst2": {
285
+ "task": "sst2",
286
+ "tag": "glue",
287
+ "dataset_path": "glue",
288
+ "dataset_name": "sst2",
289
+ "training_split": "train",
290
+ "validation_split": "validation",
291
+ "doc_to_text": "{{sentence}}\nQuestion: Is this sentence positive or negative?\nAnswer:",
292
+ "doc_to_target": "label",
293
+ "unsafe_code": false,
294
+ "doc_to_choice": [
295
+ "negative",
296
+ "positive"
297
+ ],
298
+ "description": "",
299
+ "target_delimiter": " ",
300
+ "fewshot_delimiter": "\n\n",
301
+ "num_fewshot": 0,
302
+ "metric_list": [
303
+ {
304
+ "metric": "acc"
305
+ }
306
+ ],
307
+ "output_type": "multiple_choice",
308
+ "repeats": 1,
309
+ "should_decontaminate": false,
310
+ "metadata": {
311
+ "version": 1.0
312
+ }
313
+ },
314
+ "wnli": {
315
+ "task": "wnli",
316
+ "tag": "glue",
317
+ "dataset_path": "glue",
318
+ "dataset_name": "wnli",
319
+ "training_split": "train",
320
+ "validation_split": "validation",
321
+ "doc_to_text": "{{sentence1}}\nQuestion: {{sentence2}} True or False?\nAnswer:",
322
+ "doc_to_target": "label",
323
+ "unsafe_code": false,
324
+ "doc_to_choice": [
325
+ "False",
326
+ "True"
327
+ ],
328
+ "description": "",
329
+ "target_delimiter": " ",
330
+ "fewshot_delimiter": "\n\n",
331
+ "num_fewshot": 0,
332
+ "metric_list": [
333
+ {
334
+ "metric": "acc"
335
+ }
336
+ ],
337
+ "output_type": "multiple_choice",
338
+ "repeats": 1,
339
+ "should_decontaminate": false,
340
+ "metadata": {
341
+ "version": 2.0
342
+ }
343
+ }
344
+ },
345
+ "versions": {
346
+ "cola": 1.0,
347
+ "mnli": 1.0,
348
+ "mnli_mismatch": 1.0,
349
+ "mrpc": 1.0,
350
+ "qnli": 1.0,
351
+ "qqp": 2.0,
352
+ "rte": 1.0,
353
+ "sst2": 1.0,
354
+ "wnli": 2.0
355
+ },
356
+ "n-shot": {
357
+ "cola": 0,
358
+ "mnli": 0,
359
+ "mnli_mismatch": 0,
360
+ "mrpc": 0,
361
+ "qnli": 0,
362
+ "qqp": 0,
363
+ "rte": 0,
364
+ "sst2": 0,
365
+ "wnli": 0
366
+ },
367
+ "higher_is_better": {
368
+ "cola": {
369
+ "mcc": true
370
+ },
371
+ "mnli": {
372
+ "acc": true
373
+ },
374
+ "mnli_mismatch": {
375
+ "acc": true
376
+ },
377
+ "mrpc": {
378
+ "acc": true,
379
+ "f1": true
380
+ },
381
+ "qnli": {
382
+ "acc": true
383
+ },
384
+ "qqp": {
385
+ "acc": true,
386
+ "f1": true
387
+ },
388
+ "rte": {
389
+ "acc": true
390
+ },
391
+ "sst2": {
392
+ "acc": true
393
+ },
394
+ "wnli": {
395
+ "acc": true
396
+ }
397
+ },
398
+ "n-samples": {
399
+ "sst2": {
400
+ "original": 872,
401
+ "effective": 872
402
+ },
403
+ "rte": {
404
+ "original": 277,
405
+ "effective": 277
406
+ },
407
+ "qqp": {
408
+ "original": 40430,
409
+ "effective": 40430
410
+ },
411
+ "qnli": {
412
+ "original": 5463,
413
+ "effective": 5463
414
+ },
415
+ "mrpc": {
416
+ "original": 408,
417
+ "effective": 408
418
+ },
419
+ "mnli": {
420
+ "original": 9815,
421
+ "effective": 9815
422
+ },
423
+ "mnli_mismatch": {
424
+ "original": 9832,
425
+ "effective": 9832
426
+ },
427
+ "cola": {
428
+ "original": 1043,
429
+ "effective": 1043
430
+ },
431
+ "wnli": {
432
+ "original": 71,
433
+ "effective": 71
434
+ }
435
+ },
436
+ "config": {
437
+ "model": "hf",
438
+ "model_args": "pretrained=meta-llama/Llama-3.2-1B,dtype=float32,trust_remote_code=True",
439
+ "model_num_parameters": 1235814400,
440
+ "model_dtype": "torch.float32",
441
+ "model_revision": "main",
442
+ "model_sha": "4e20de362430cd3b72f300e6b0f18e50e7166e08",
443
+ "batch_size": "auto",
444
+ "batch_sizes": [
445
+ 16
446
+ ],
447
+ "device": "cuda",
448
+ "use_cache": null,
449
+ "limit": null,
450
+ "bootstrap_iters": 100000,
451
+ "gen_kwargs": null,
452
+ "random_seed": 0,
453
+ "numpy_seed": 1234,
454
+ "torch_seed": 1234,
455
+ "fewshot_seed": 1234
456
+ },
457
+ "git_hash": null,
458
+ "date": 1742297302.9341354,
459
+ "pretty_env_info": "PyTorch version: 2.5.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: 14.0.0-1ubuntu1.1\nCMake version: version 3.31.2\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 6 2024, 20:22:13) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-6.6.56+-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.140\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: Tesla T4\nGPU 1: Tesla T4\n\nNvidia driver version: 560.35.03\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.6\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 4\nOn-line CPU(s) list: 0-3\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.00GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 2\nSocket(s): 1\nStepping: 3\nBogoMIPS: 4000.44\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 64 KiB (2 instances)\nL1i cache: 64 KiB (2 instances)\nL2 cache: 2 MiB (2 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-3\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Mitigation; PTE Inversion\nVulnerability Mds: Mitigation; Clear CPU buffers; SMT Host state unknown\nVulnerability Meltdown: Mitigation; PTI\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; IBRS; IBPB conditional; STIBP conditional; RSB filling; PBRSB-eIBRS Not affected; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Mitigation; Clear CPU buffers; SMT Host state unknown\n\nVersions of relevant libraries:\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] onnx==1.17.0\n[pip3] optree==0.13.1\n[pip3] pytorch-ignite==0.5.1\n[pip3] pytorch-lightning==2.5.0.post0\n[pip3] torch==2.5.1+cu121\n[pip3] torchaudio==2.5.1+cu121\n[pip3] torchinfo==1.8.0\n[pip3] torchmetrics==1.6.1\n[pip3] torchsummary==1.5.1\n[pip3] torchtune==0.5.0\n[pip3] torchvision==0.20.1+cu121\n[conda] Could not collect",
460
+ "transformers_version": "4.47.0",
461
+ "upper_git_hash": null,
462
+ "tokenizer_pad_token": [
463
+ "<|end_of_text|>",
464
+ "128001"
465
+ ],
466
+ "tokenizer_eos_token": [
467
+ "<|end_of_text|>",
468
+ "128001"
469
+ ],
470
+ "tokenizer_bos_token": [
471
+ "<|begin_of_text|>",
472
+ "128000"
473
+ ],
474
+ "eot_token_id": 128001,
475
+ "max_length": 131072,
476
+ "task_hashes": {},
477
+ "model_source": "hf",
478
+ "model_name": "meta-llama/Llama-3.2-1B",
479
+ "model_name_sanitized": "meta-llama__Llama-3.2-1B",
480
+ "system_instruction": null,
481
+ "system_instruction_sha": null,
482
+ "fewshot_as_multiturn": false,
483
+ "chat_template": null,
484
+ "chat_template_sha": null,
485
+ "start_time": 3550.32336198,
486
+ "end_time": 9887.299040389,
487
+ "total_evaluation_time_seconds": "6336.9756784090005"
488
+ }
lm_eval/meta-llama__Llama-3.2-1B/hellaswag_0.4.8_results_2025-03-18T11-19-41.666907.json ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.47719577773351923,
6
+ "acc_stderr,none": 0.004984589012289378,
7
+ "acc_norm,none": 0.6366261700856403,
8
+ "acc_norm_stderr,none": 0.004799882248494808
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 0,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0
52
+ }
53
+ }
54
+ },
55
+ "versions": {
56
+ "hellaswag": 1.0
57
+ },
58
+ "n-shot": {
59
+ "hellaswag": 0
60
+ },
61
+ "higher_is_better": {
62
+ "hellaswag": {
63
+ "acc": true,
64
+ "acc_norm": true
65
+ }
66
+ },
67
+ "n-samples": {
68
+ "hellaswag": {
69
+ "original": 10042,
70
+ "effective": 10042
71
+ }
72
+ },
73
+ "config": {
74
+ "model": "hf",
75
+ "model_args": "pretrained=meta-llama/Llama-3.2-1B,dtype=float32,trust_remote_code=True",
76
+ "model_num_parameters": 1235814400,
77
+ "model_dtype": "torch.float32",
78
+ "model_revision": "main",
79
+ "model_sha": "4e20de362430cd3b72f300e6b0f18e50e7166e08",
80
+ "batch_size": "auto",
81
+ "batch_sizes": [
82
+ 32
83
+ ],
84
+ "device": "cuda",
85
+ "use_cache": null,
86
+ "limit": null,
87
+ "bootstrap_iters": 100000,
88
+ "gen_kwargs": null,
89
+ "random_seed": 0,
90
+ "numpy_seed": 1234,
91
+ "torch_seed": 1234,
92
+ "fewshot_seed": 1234
93
+ },
94
+ "git_hash": null,
95
+ "date": 1742295374.8822403,
96
+ "pretty_env_info": "PyTorch version: 2.5.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: 14.0.0-1ubuntu1.1\nCMake version: version 3.31.2\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 6 2024, 20:22:13) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-6.6.56+-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.140\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: Tesla T4\nGPU 1: Tesla T4\n\nNvidia driver version: 560.35.03\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.6\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 4\nOn-line CPU(s) list: 0-3\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.00GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 2\nSocket(s): 1\nStepping: 3\nBogoMIPS: 4000.44\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 64 KiB (2 instances)\nL1i cache: 64 KiB (2 instances)\nL2 cache: 2 MiB (2 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-3\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Mitigation; PTE Inversion\nVulnerability Mds: Mitigation; Clear CPU buffers; SMT Host state unknown\nVulnerability Meltdown: Mitigation; PTI\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; IBRS; IBPB conditional; STIBP conditional; RSB filling; PBRSB-eIBRS Not affected; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Mitigation; Clear CPU buffers; SMT Host state unknown\n\nVersions of relevant libraries:\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] onnx==1.17.0\n[pip3] optree==0.13.1\n[pip3] pytorch-ignite==0.5.1\n[pip3] pytorch-lightning==2.5.0.post0\n[pip3] torch==2.5.1+cu121\n[pip3] torchaudio==2.5.1+cu121\n[pip3] torchinfo==1.8.0\n[pip3] torchmetrics==1.6.1\n[pip3] torchsummary==1.5.1\n[pip3] torchtune==0.5.0\n[pip3] torchvision==0.20.1+cu121\n[conda] Could not collect",
97
+ "transformers_version": "4.47.0",
98
+ "upper_git_hash": null,
99
+ "tokenizer_pad_token": [
100
+ "<|end_of_text|>",
101
+ "128001"
102
+ ],
103
+ "tokenizer_eos_token": [
104
+ "<|end_of_text|>",
105
+ "128001"
106
+ ],
107
+ "tokenizer_bos_token": [
108
+ "<|begin_of_text|>",
109
+ "128000"
110
+ ],
111
+ "eot_token_id": 128001,
112
+ "max_length": 131072,
113
+ "task_hashes": {},
114
+ "model_source": "hf",
115
+ "model_name": "meta-llama/Llama-3.2-1B",
116
+ "model_name_sanitized": "meta-llama__Llama-3.2-1B",
117
+ "system_instruction": null,
118
+ "system_instruction_sha": null,
119
+ "fewshot_as_multiturn": false,
120
+ "chat_template": null,
121
+ "chat_template_sha": null,
122
+ "start_time": 1622.231848667,
123
+ "end_time": 3046.212616711,
124
+ "total_evaluation_time_seconds": "1423.9807680440001"
125
+ }
lm_eval/meta-llama__Llama-3.2-1B/lambada_multilingual_0.4.8_results_2025-03-18T13-58-58.411133.json ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "lambada_openai_mt_de": {
4
+ "alias": "lambada_openai_mt_de",
5
+ "perplexity,none": 87.5065341563224,
6
+ "perplexity_stderr,none": 5.286347594317647,
7
+ "acc,none": 0.3287405394915583,
8
+ "acc_stderr,none": 0.00654461215135277
9
+ },
10
+ "lambada_openai_mt_en": {
11
+ "alias": "lambada_openai_mt_en",
12
+ "perplexity,none": 5.726774984496916,
13
+ "perplexity_stderr,none": 0.13912667370464665,
14
+ "acc,none": 0.6297302542208423,
15
+ "acc_stderr,none": 0.006727418824564933
16
+ },
17
+ "lambada_openai_mt_es": {
18
+ "alias": "lambada_openai_mt_es",
19
+ "perplexity,none": 162.319184283327,
20
+ "perplexity_stderr,none": 9.233247847299765,
21
+ "acc,none": 0.22996312827479137,
22
+ "acc_stderr,none": 0.00586269008864363
23
+ },
24
+ "lambada_openai_mt_fr": {
25
+ "alias": "lambada_openai_mt_fr",
26
+ "perplexity,none": 62.9701795304645,
27
+ "perplexity_stderr,none": 3.6219586820055167,
28
+ "acc,none": 0.3939452745973219,
29
+ "acc_stderr,none": 0.006807473218363964
30
+ },
31
+ "lambada_openai_mt_it": {
32
+ "alias": "lambada_openai_mt_it",
33
+ "perplexity,none": 85.85290564276717,
34
+ "perplexity_stderr,none": 5.223512913330723,
35
+ "acc,none": 0.3696875606442849,
36
+ "acc_stderr,none": 0.006725234475887096
37
+ }
38
+ },
39
+ "group_subtasks": {
40
+ "lambada_openai_mt_en": [],
41
+ "lambada_openai_mt_it": [],
42
+ "lambada_openai_mt_es": [],
43
+ "lambada_openai_mt_de": [],
44
+ "lambada_openai_mt_fr": []
45
+ },
46
+ "configs": {
47
+ "lambada_openai_mt_de": {
48
+ "task": "lambada_openai_mt_de",
49
+ "tag": [
50
+ "lambada_multilingual"
51
+ ],
52
+ "dataset_path": "EleutherAI/lambada_openai",
53
+ "dataset_name": "de",
54
+ "test_split": "test",
55
+ "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
56
+ "doc_to_target": "{{' '+text.split(' ')[-1]}}",
57
+ "unsafe_code": false,
58
+ "description": "",
59
+ "target_delimiter": " ",
60
+ "fewshot_delimiter": "\n\n",
61
+ "num_fewshot": 0,
62
+ "metric_list": [
63
+ {
64
+ "metric": "perplexity",
65
+ "aggregation": "perplexity",
66
+ "higher_is_better": false
67
+ },
68
+ {
69
+ "metric": "acc",
70
+ "aggregation": "mean",
71
+ "higher_is_better": true
72
+ }
73
+ ],
74
+ "output_type": "loglikelihood",
75
+ "repeats": 1,
76
+ "should_decontaminate": true,
77
+ "doc_to_decontamination_query": "{{text}}",
78
+ "metadata": {
79
+ "version": 1.0
80
+ }
81
+ },
82
+ "lambada_openai_mt_en": {
83
+ "task": "lambada_openai_mt_en",
84
+ "tag": [
85
+ "lambada_multilingual"
86
+ ],
87
+ "dataset_path": "EleutherAI/lambada_openai",
88
+ "dataset_name": "en",
89
+ "test_split": "test",
90
+ "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
91
+ "doc_to_target": "{{' '+text.split(' ')[-1]}}",
92
+ "unsafe_code": false,
93
+ "description": "",
94
+ "target_delimiter": " ",
95
+ "fewshot_delimiter": "\n\n",
96
+ "num_fewshot": 0,
97
+ "metric_list": [
98
+ {
99
+ "metric": "perplexity",
100
+ "aggregation": "perplexity",
101
+ "higher_is_better": false
102
+ },
103
+ {
104
+ "metric": "acc",
105
+ "aggregation": "mean",
106
+ "higher_is_better": true
107
+ }
108
+ ],
109
+ "output_type": "loglikelihood",
110
+ "repeats": 1,
111
+ "should_decontaminate": true,
112
+ "doc_to_decontamination_query": "{{text}}",
113
+ "metadata": {
114
+ "version": 1.0
115
+ }
116
+ },
117
+ "lambada_openai_mt_es": {
118
+ "task": "lambada_openai_mt_es",
119
+ "tag": [
120
+ "lambada_multilingual"
121
+ ],
122
+ "dataset_path": "EleutherAI/lambada_openai",
123
+ "dataset_name": "es",
124
+ "test_split": "test",
125
+ "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
126
+ "doc_to_target": "{{' '+text.split(' ')[-1]}}",
127
+ "unsafe_code": false,
128
+ "description": "",
129
+ "target_delimiter": " ",
130
+ "fewshot_delimiter": "\n\n",
131
+ "num_fewshot": 0,
132
+ "metric_list": [
133
+ {
134
+ "metric": "perplexity",
135
+ "aggregation": "perplexity",
136
+ "higher_is_better": false
137
+ },
138
+ {
139
+ "metric": "acc",
140
+ "aggregation": "mean",
141
+ "higher_is_better": true
142
+ }
143
+ ],
144
+ "output_type": "loglikelihood",
145
+ "repeats": 1,
146
+ "should_decontaminate": true,
147
+ "doc_to_decontamination_query": "{{text}}",
148
+ "metadata": {
149
+ "version": 1.0
150
+ }
151
+ },
152
+ "lambada_openai_mt_fr": {
153
+ "task": "lambada_openai_mt_fr",
154
+ "tag": [
155
+ "lambada_multilingual"
156
+ ],
157
+ "dataset_path": "EleutherAI/lambada_openai",
158
+ "dataset_name": "fr",
159
+ "test_split": "test",
160
+ "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
161
+ "doc_to_target": "{{' '+text.split(' ')[-1]}}",
162
+ "unsafe_code": false,
163
+ "description": "",
164
+ "target_delimiter": " ",
165
+ "fewshot_delimiter": "\n\n",
166
+ "num_fewshot": 0,
167
+ "metric_list": [
168
+ {
169
+ "metric": "perplexity",
170
+ "aggregation": "perplexity",
171
+ "higher_is_better": false
172
+ },
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ }
178
+ ],
179
+ "output_type": "loglikelihood",
180
+ "repeats": 1,
181
+ "should_decontaminate": true,
182
+ "doc_to_decontamination_query": "{{text}}",
183
+ "metadata": {
184
+ "version": 1.0
185
+ }
186
+ },
187
+ "lambada_openai_mt_it": {
188
+ "task": "lambada_openai_mt_it",
189
+ "tag": [
190
+ "lambada_multilingual"
191
+ ],
192
+ "dataset_path": "EleutherAI/lambada_openai",
193
+ "dataset_name": "it",
194
+ "test_split": "test",
195
+ "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
196
+ "doc_to_target": "{{' '+text.split(' ')[-1]}}",
197
+ "unsafe_code": false,
198
+ "description": "",
199
+ "target_delimiter": " ",
200
+ "fewshot_delimiter": "\n\n",
201
+ "num_fewshot": 0,
202
+ "metric_list": [
203
+ {
204
+ "metric": "perplexity",
205
+ "aggregation": "perplexity",
206
+ "higher_is_better": false
207
+ },
208
+ {
209
+ "metric": "acc",
210
+ "aggregation": "mean",
211
+ "higher_is_better": true
212
+ }
213
+ ],
214
+ "output_type": "loglikelihood",
215
+ "repeats": 1,
216
+ "should_decontaminate": true,
217
+ "doc_to_decontamination_query": "{{text}}",
218
+ "metadata": {
219
+ "version": 1.0
220
+ }
221
+ }
222
+ },
223
+ "versions": {
224
+ "lambada_openai_mt_de": 1.0,
225
+ "lambada_openai_mt_en": 1.0,
226
+ "lambada_openai_mt_es": 1.0,
227
+ "lambada_openai_mt_fr": 1.0,
228
+ "lambada_openai_mt_it": 1.0
229
+ },
230
+ "n-shot": {
231
+ "lambada_openai_mt_de": 0,
232
+ "lambada_openai_mt_en": 0,
233
+ "lambada_openai_mt_es": 0,
234
+ "lambada_openai_mt_fr": 0,
235
+ "lambada_openai_mt_it": 0
236
+ },
237
+ "higher_is_better": {
238
+ "lambada_openai_mt_de": {
239
+ "perplexity": false,
240
+ "acc": true
241
+ },
242
+ "lambada_openai_mt_en": {
243
+ "perplexity": false,
244
+ "acc": true
245
+ },
246
+ "lambada_openai_mt_es": {
247
+ "perplexity": false,
248
+ "acc": true
249
+ },
250
+ "lambada_openai_mt_fr": {
251
+ "perplexity": false,
252
+ "acc": true
253
+ },
254
+ "lambada_openai_mt_it": {
255
+ "perplexity": false,
256
+ "acc": true
257
+ }
258
+ },
259
+ "n-samples": {
260
+ "lambada_openai_mt_fr": {
261
+ "original": 5153,
262
+ "effective": 5153
263
+ },
264
+ "lambada_openai_mt_de": {
265
+ "original": 5153,
266
+ "effective": 5153
267
+ },
268
+ "lambada_openai_mt_es": {
269
+ "original": 5153,
270
+ "effective": 5153
271
+ },
272
+ "lambada_openai_mt_it": {
273
+ "original": 5153,
274
+ "effective": 5153
275
+ },
276
+ "lambada_openai_mt_en": {
277
+ "original": 5153,
278
+ "effective": 5153
279
+ }
280
+ },
281
+ "config": {
282
+ "model": "hf",
283
+ "model_args": "pretrained=meta-llama/Llama-3.2-1B,dtype=float32,trust_remote_code=True",
284
+ "model_num_parameters": 1235814400,
285
+ "model_dtype": "torch.float32",
286
+ "model_revision": "main",
287
+ "model_sha": "4e20de362430cd3b72f300e6b0f18e50e7166e08",
288
+ "batch_size": "auto",
289
+ "batch_sizes": [
290
+ 16
291
+ ],
292
+ "device": "cuda",
293
+ "use_cache": null,
294
+ "limit": null,
295
+ "bootstrap_iters": 100000,
296
+ "gen_kwargs": null,
297
+ "random_seed": 0,
298
+ "numpy_seed": 1234,
299
+ "torch_seed": 1234,
300
+ "fewshot_seed": 1234
301
+ },
302
+ "git_hash": null,
303
+ "date": 1742304987.5422304,
304
+ "pretty_env_info": "PyTorch version: 2.5.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: 14.0.0-1ubuntu1.1\nCMake version: version 3.31.2\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 6 2024, 20:22:13) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-6.6.56+-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.140\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: Tesla T4\nGPU 1: Tesla T4\n\nNvidia driver version: 560.35.03\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.6\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 4\nOn-line CPU(s) list: 0-3\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.00GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 2\nSocket(s): 1\nStepping: 3\nBogoMIPS: 4000.44\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 64 KiB (2 instances)\nL1i cache: 64 KiB (2 instances)\nL2 cache: 2 MiB (2 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-3\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Mitigation; PTE Inversion\nVulnerability Mds: Mitigation; Clear CPU buffers; SMT Host state unknown\nVulnerability Meltdown: Mitigation; PTI\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; IBRS; IBPB conditional; STIBP conditional; RSB filling; PBRSB-eIBRS Not affected; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Mitigation; Clear CPU buffers; SMT Host state unknown\n\nVersions of relevant libraries:\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] onnx==1.17.0\n[pip3] optree==0.13.1\n[pip3] pytorch-ignite==0.5.1\n[pip3] pytorch-lightning==2.5.0.post0\n[pip3] torch==2.5.1+cu121\n[pip3] torchaudio==2.5.1+cu121\n[pip3] torchinfo==1.8.0\n[pip3] torchmetrics==1.6.1\n[pip3] torchsummary==1.5.1\n[pip3] torchtune==0.5.0\n[pip3] torchvision==0.20.1+cu121\n[conda] Could not collect",
305
+ "transformers_version": "4.47.0",
306
+ "upper_git_hash": null,
307
+ "tokenizer_pad_token": [
308
+ "<|end_of_text|>",
309
+ "128001"
310
+ ],
311
+ "tokenizer_eos_token": [
312
+ "<|end_of_text|>",
313
+ "128001"
314
+ ],
315
+ "tokenizer_bos_token": [
316
+ "<|begin_of_text|>",
317
+ "128000"
318
+ ],
319
+ "eot_token_id": 128001,
320
+ "max_length": 131072,
321
+ "task_hashes": {},
322
+ "model_source": "hf",
323
+ "model_name": "meta-llama/Llama-3.2-1B",
324
+ "model_name_sanitized": "meta-llama__Llama-3.2-1B",
325
+ "system_instruction": null,
326
+ "system_instruction_sha": null,
327
+ "fewshot_as_multiturn": false,
328
+ "chat_template": null,
329
+ "chat_template_sha": null,
330
+ "start_time": 11234.866892502,
331
+ "end_time": 12602.956532537,
332
+ "total_evaluation_time_seconds": "1368.0896400350011"
333
+ }
lm_eval/meta-llama__Llama-3.2-1B/lambada_openai_0.4.8_results_2025-03-18T10-52-22.939329.json ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "lambada_openai": {
4
+ "alias": "lambada_openai",
5
+ "perplexity,none": 5.726775077496861,
6
+ "perplexity_stderr,none": 0.13911667415832324,
7
+ "acc,none": 0.6297302542208423,
8
+ "acc_stderr,none": 0.006727418824564937
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "lambada_openai": []
13
+ },
14
+ "configs": {
15
+ "lambada_openai": {
16
+ "task": "lambada_openai",
17
+ "tag": [
18
+ "lambada"
19
+ ],
20
+ "dataset_path": "EleutherAI/lambada_openai",
21
+ "dataset_name": "default",
22
+ "dataset_kwargs": {
23
+ "trust_remote_code": true
24
+ },
25
+ "test_split": "test",
26
+ "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
27
+ "doc_to_target": "{{' '+text.split(' ')[-1]}}",
28
+ "unsafe_code": false,
29
+ "description": "",
30
+ "target_delimiter": " ",
31
+ "fewshot_delimiter": "\n\n",
32
+ "num_fewshot": 0,
33
+ "metric_list": [
34
+ {
35
+ "metric": "perplexity",
36
+ "aggregation": "perplexity",
37
+ "higher_is_better": false
38
+ },
39
+ {
40
+ "metric": "acc",
41
+ "aggregation": "mean",
42
+ "higher_is_better": true
43
+ }
44
+ ],
45
+ "output_type": "loglikelihood",
46
+ "repeats": 1,
47
+ "should_decontaminate": true,
48
+ "doc_to_decontamination_query": "{{text}}",
49
+ "metadata": {
50
+ "version": 1.0
51
+ }
52
+ }
53
+ },
54
+ "versions": {
55
+ "lambada_openai": 1.0
56
+ },
57
+ "n-shot": {
58
+ "lambada_openai": 0
59
+ },
60
+ "higher_is_better": {
61
+ "lambada_openai": {
62
+ "perplexity": false,
63
+ "acc": true
64
+ }
65
+ },
66
+ "n-samples": {
67
+ "lambada_openai": {
68
+ "original": 5153,
69
+ "effective": 5153
70
+ }
71
+ },
72
+ "config": {
73
+ "model": "hf",
74
+ "model_args": "pretrained=meta-llama/Llama-3.2-1B,dtype=float32,trust_remote_code=True",
75
+ "model_num_parameters": 1235814400,
76
+ "model_dtype": "torch.float32",
77
+ "model_revision": "main",
78
+ "model_sha": "4e20de362430cd3b72f300e6b0f18e50e7166e08",
79
+ "batch_size": "auto",
80
+ "batch_sizes": [
81
+ 16
82
+ ],
83
+ "device": "cuda",
84
+ "use_cache": null,
85
+ "limit": null,
86
+ "bootstrap_iters": 100000,
87
+ "gen_kwargs": null,
88
+ "random_seed": 0,
89
+ "numpy_seed": 1234,
90
+ "torch_seed": 1234,
91
+ "fewshot_seed": 1234
92
+ },
93
+ "git_hash": null,
94
+ "date": 1742294721.9475036,
95
+ "pretty_env_info": "PyTorch version: 2.5.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: 14.0.0-1ubuntu1.1\nCMake version: version 3.31.2\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 6 2024, 20:22:13) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-6.6.56+-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.140\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: Tesla T4\nGPU 1: Tesla T4\n\nNvidia driver version: 560.35.03\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.6\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 4\nOn-line CPU(s) list: 0-3\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.00GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 2\nSocket(s): 1\nStepping: 3\nBogoMIPS: 4000.44\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 64 KiB (2 instances)\nL1i cache: 64 KiB (2 instances)\nL2 cache: 2 MiB (2 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-3\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Mitigation; PTE Inversion\nVulnerability Mds: Mitigation; Clear CPU buffers; SMT Host state unknown\nVulnerability Meltdown: Mitigation; PTI\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; IBRS; IBPB conditional; STIBP conditional; RSB filling; PBRSB-eIBRS Not affected; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Mitigation; Clear CPU buffers; SMT Host state unknown\n\nVersions of relevant libraries:\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] onnx==1.17.0\n[pip3] optree==0.13.1\n[pip3] pytorch-ignite==0.5.1\n[pip3] pytorch-lightning==2.5.0.post0\n[pip3] torch==2.5.1+cu121\n[pip3] torchaudio==2.5.1+cu121\n[pip3] torchinfo==1.8.0\n[pip3] torchmetrics==1.6.1\n[pip3] torchsummary==1.5.1\n[pip3] torchtune==0.5.0\n[pip3] torchvision==0.20.1+cu121\n[conda] Could not collect",
96
+ "transformers_version": "4.47.0",
97
+ "upper_git_hash": null,
98
+ "tokenizer_pad_token": [
99
+ "<|end_of_text|>",
100
+ "128001"
101
+ ],
102
+ "tokenizer_eos_token": [
103
+ "<|end_of_text|>",
104
+ "128001"
105
+ ],
106
+ "tokenizer_bos_token": [
107
+ "<|begin_of_text|>",
108
+ "128000"
109
+ ],
110
+ "eot_token_id": 128001,
111
+ "max_length": 131072,
112
+ "task_hashes": {},
113
+ "model_source": "hf",
114
+ "model_name": "meta-llama/Llama-3.2-1B",
115
+ "model_name_sanitized": "meta-llama__Llama-3.2-1B",
116
+ "system_instruction": null,
117
+ "system_instruction_sha": null,
118
+ "fewshot_as_multiturn": false,
119
+ "chat_template": null,
120
+ "chat_template_sha": null,
121
+ "start_time": 970.501230964,
122
+ "end_time": 1407.484874315,
123
+ "total_evaluation_time_seconds": "436.98364335100007"
124
+ }
lm_eval/meta-llama__Llama-3.2-1B/mmlu_0.4.8_results_2025-03-18T13-35-53.468038.json ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval/meta-llama__Llama-3.2-1B/pawsx_0.4.8_results_2025-03-18T14-16-44.489821.json ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "pawsx": {
4
+ "acc,none": 0.539,
5
+ "acc_stderr,none": 0.0042043710977204725,
6
+ "alias": "pawsx"
7
+ },
8
+ "paws_de": {
9
+ "alias": " - paws_de",
10
+ "acc,none": 0.537,
11
+ "acc_stderr,none": 0.011152474561478175
12
+ },
13
+ "paws_en": {
14
+ "alias": " - paws_en",
15
+ "acc,none": 0.5925,
16
+ "acc_stderr,none": 0.010990098549743105
17
+ },
18
+ "paws_es": {
19
+ "alias": " - paws_es",
20
+ "acc,none": 0.571,
21
+ "acc_stderr,none": 0.01106981347562766
22
+ },
23
+ "paws_fr": {
24
+ "alias": " - paws_fr",
25
+ "acc,none": 0.5515,
26
+ "acc_stderr,none": 0.011123656901911276
27
+ },
28
+ "paws_ja": {
29
+ "alias": " - paws_ja",
30
+ "acc,none": 0.5075,
31
+ "acc_stderr,none": 0.011181877847485998
32
+ },
33
+ "paws_ko": {
34
+ "alias": " - paws_ko",
35
+ "acc,none": 0.5265,
36
+ "acc_stderr,none": 0.011167418260963936
37
+ },
38
+ "paws_zh": {
39
+ "alias": " - paws_zh",
40
+ "acc,none": 0.487,
41
+ "acc_stderr,none": 0.011179355482070377
42
+ }
43
+ },
44
+ "groups": {
45
+ "pawsx": {
46
+ "acc,none": 0.539,
47
+ "acc_stderr,none": 0.0042043710977204725,
48
+ "alias": "pawsx"
49
+ }
50
+ },
51
+ "group_subtasks": {
52
+ "pawsx": [
53
+ "paws_en",
54
+ "paws_de",
55
+ "paws_es",
56
+ "paws_fr",
57
+ "paws_ja",
58
+ "paws_ko",
59
+ "paws_zh"
60
+ ]
61
+ },
62
+ "configs": {
63
+ "paws_de": {
64
+ "task": "paws_de",
65
+ "dataset_path": "paws-x",
66
+ "dataset_name": "de",
67
+ "training_split": "train",
68
+ "validation_split": "validation",
69
+ "test_split": "test",
70
+ "doc_to_text": "",
71
+ "doc_to_target": "label",
72
+ "unsafe_code": false,
73
+ "doc_to_choice": "{{[sentence1+\", richtig? Nein, \"+sentence2, sentence1+\", richtig? Ja, \"+sentence2]}}",
74
+ "description": "",
75
+ "target_delimiter": " ",
76
+ "fewshot_delimiter": "\n\n",
77
+ "num_fewshot": 0,
78
+ "metric_list": [
79
+ {
80
+ "metric": "acc",
81
+ "aggregation": "mean",
82
+ "higher_is_better": true
83
+ }
84
+ ],
85
+ "output_type": "multiple_choice",
86
+ "repeats": 1,
87
+ "should_decontaminate": false,
88
+ "metadata": {
89
+ "version": 1.0
90
+ }
91
+ },
92
+ "paws_en": {
93
+ "task": "paws_en",
94
+ "dataset_path": "paws-x",
95
+ "dataset_name": "en",
96
+ "training_split": "train",
97
+ "validation_split": "validation",
98
+ "test_split": "test",
99
+ "doc_to_text": "",
100
+ "doc_to_target": "label",
101
+ "unsafe_code": false,
102
+ "doc_to_choice": "{{[sentence1+\", right? No, \"+sentence2, sentence1+\", right? Yes, \"+sentence2]}}",
103
+ "description": "",
104
+ "target_delimiter": " ",
105
+ "fewshot_delimiter": "\n\n",
106
+ "num_fewshot": 0,
107
+ "metric_list": [
108
+ {
109
+ "metric": "acc",
110
+ "aggregation": "mean",
111
+ "higher_is_better": true
112
+ }
113
+ ],
114
+ "output_type": "multiple_choice",
115
+ "repeats": 1,
116
+ "should_decontaminate": false,
117
+ "metadata": {
118
+ "version": 1.0
119
+ }
120
+ },
121
+ "paws_es": {
122
+ "task": "paws_es",
123
+ "dataset_path": "paws-x",
124
+ "dataset_name": "es",
125
+ "training_split": "train",
126
+ "validation_split": "validation",
127
+ "test_split": "test",
128
+ "doc_to_text": "",
129
+ "doc_to_target": "label",
130
+ "unsafe_code": false,
131
+ "doc_to_choice": "{{[sentence1+\", verdad? No, \"+sentence2, sentence1+\", verdad? Sí, \"+sentence2]}}",
132
+ "description": "",
133
+ "target_delimiter": " ",
134
+ "fewshot_delimiter": "\n\n",
135
+ "num_fewshot": 0,
136
+ "metric_list": [
137
+ {
138
+ "metric": "acc",
139
+ "aggregation": "mean",
140
+ "higher_is_better": true
141
+ }
142
+ ],
143
+ "output_type": "multiple_choice",
144
+ "repeats": 1,
145
+ "should_decontaminate": false,
146
+ "metadata": {
147
+ "version": 1.0
148
+ }
149
+ },
150
+ "paws_fr": {
151
+ "task": "paws_fr",
152
+ "dataset_path": "paws-x",
153
+ "dataset_name": "fr",
154
+ "training_split": "train",
155
+ "validation_split": "validation",
156
+ "test_split": "test",
157
+ "doc_to_text": "",
158
+ "doc_to_target": "label",
159
+ "unsafe_code": false,
160
+ "doc_to_choice": "{{[sentence1+\", n'est-ce pas? Non, \"+sentence2, sentence1+\", n'est-ce pas? Oui, \"+sentence2]}}",
161
+ "description": "",
162
+ "target_delimiter": " ",
163
+ "fewshot_delimiter": "\n\n",
164
+ "num_fewshot": 0,
165
+ "metric_list": [
166
+ {
167
+ "metric": "acc",
168
+ "aggregation": "mean",
169
+ "higher_is_better": true
170
+ }
171
+ ],
172
+ "output_type": "multiple_choice",
173
+ "repeats": 1,
174
+ "should_decontaminate": false,
175
+ "metadata": {
176
+ "version": 1.0
177
+ }
178
+ },
179
+ "paws_ja": {
180
+ "task": "paws_ja",
181
+ "dataset_path": "paws-x",
182
+ "dataset_name": "ja",
183
+ "training_split": "train",
184
+ "validation_split": "validation",
185
+ "test_split": "test",
186
+ "doc_to_text": "",
187
+ "doc_to_target": "label",
188
+ "unsafe_code": false,
189
+ "doc_to_choice": "{{[sentence1+\", ですね? いいえ, \"+sentence2, sentence1+\", ですね? はい, \"+sentence2]}}",
190
+ "description": "",
191
+ "target_delimiter": " ",
192
+ "fewshot_delimiter": "\n\n",
193
+ "num_fewshot": 0,
194
+ "metric_list": [
195
+ {
196
+ "metric": "acc",
197
+ "aggregation": "mean",
198
+ "higher_is_better": true
199
+ }
200
+ ],
201
+ "output_type": "multiple_choice",
202
+ "repeats": 1,
203
+ "should_decontaminate": false,
204
+ "metadata": {
205
+ "version": 1.0
206
+ }
207
+ },
208
+ "paws_ko": {
209
+ "task": "paws_ko",
210
+ "dataset_path": "paws-x",
211
+ "dataset_name": "ko",
212
+ "training_split": "train",
213
+ "validation_split": "validation",
214
+ "test_split": "test",
215
+ "doc_to_text": "",
216
+ "doc_to_target": "label",
217
+ "unsafe_code": false,
218
+ "doc_to_choice": "{{[sentence1+\", 맞죠? 아니요, \"+sentence2, sentence1+\", 맞죠? 예, \"+sentence2]}}",
219
+ "description": "",
220
+ "target_delimiter": " ",
221
+ "fewshot_delimiter": "\n\n",
222
+ "num_fewshot": 0,
223
+ "metric_list": [
224
+ {
225
+ "metric": "acc",
226
+ "aggregation": "mean",
227
+ "higher_is_better": true
228
+ }
229
+ ],
230
+ "output_type": "multiple_choice",
231
+ "repeats": 1,
232
+ "should_decontaminate": false,
233
+ "metadata": {
234
+ "version": 1.0
235
+ }
236
+ },
237
+ "paws_zh": {
238
+ "task": "paws_zh",
239
+ "dataset_path": "paws-x",
240
+ "dataset_name": "zh",
241
+ "training_split": "train",
242
+ "validation_split": "validation",
243
+ "test_split": "test",
244
+ "doc_to_text": "",
245
+ "doc_to_target": "label",
246
+ "unsafe_code": false,
247
+ "doc_to_choice": "{{[sentence1+\", 对吧? 不是, \"+sentence2, sentence1+\", 对吧? 是, \"+sentence2]}}",
248
+ "description": "",
249
+ "target_delimiter": " ",
250
+ "fewshot_delimiter": "\n\n",
251
+ "num_fewshot": 0,
252
+ "metric_list": [
253
+ {
254
+ "metric": "acc",
255
+ "aggregation": "mean",
256
+ "higher_is_better": true
257
+ }
258
+ ],
259
+ "output_type": "multiple_choice",
260
+ "repeats": 1,
261
+ "should_decontaminate": false,
262
+ "metadata": {
263
+ "version": 1.0
264
+ }
265
+ }
266
+ },
267
+ "versions": {
268
+ "paws_de": 1.0,
269
+ "paws_en": 1.0,
270
+ "paws_es": 1.0,
271
+ "paws_fr": 1.0,
272
+ "paws_ja": 1.0,
273
+ "paws_ko": 1.0,
274
+ "paws_zh": 1.0,
275
+ "pawsx": 0.0
276
+ },
277
+ "n-shot": {
278
+ "paws_de": 0,
279
+ "paws_en": 0,
280
+ "paws_es": 0,
281
+ "paws_fr": 0,
282
+ "paws_ja": 0,
283
+ "paws_ko": 0,
284
+ "paws_zh": 0
285
+ },
286
+ "higher_is_better": {
287
+ "paws_de": {
288
+ "acc": true
289
+ },
290
+ "paws_en": {
291
+ "acc": true
292
+ },
293
+ "paws_es": {
294
+ "acc": true
295
+ },
296
+ "paws_fr": {
297
+ "acc": true
298
+ },
299
+ "paws_ja": {
300
+ "acc": true
301
+ },
302
+ "paws_ko": {
303
+ "acc": true
304
+ },
305
+ "paws_zh": {
306
+ "acc": true
307
+ },
308
+ "pawsx": {
309
+ "acc": true
310
+ }
311
+ },
312
+ "n-samples": {
313
+ "paws_en": {
314
+ "original": 2000,
315
+ "effective": 2000
316
+ },
317
+ "paws_de": {
318
+ "original": 2000,
319
+ "effective": 2000
320
+ },
321
+ "paws_es": {
322
+ "original": 2000,
323
+ "effective": 2000
324
+ },
325
+ "paws_fr": {
326
+ "original": 2000,
327
+ "effective": 2000
328
+ },
329
+ "paws_ja": {
330
+ "original": 2000,
331
+ "effective": 2000
332
+ },
333
+ "paws_ko": {
334
+ "original": 2000,
335
+ "effective": 2000
336
+ },
337
+ "paws_zh": {
338
+ "original": 2000,
339
+ "effective": 2000
340
+ }
341
+ },
342
+ "config": {
343
+ "model": "hf",
344
+ "model_args": "pretrained=meta-llama/Llama-3.2-1B,dtype=float32,trust_remote_code=True",
345
+ "model_num_parameters": 1235814400,
346
+ "model_dtype": "torch.float32",
347
+ "model_revision": "main",
348
+ "model_sha": "4e20de362430cd3b72f300e6b0f18e50e7166e08",
349
+ "batch_size": "auto",
350
+ "batch_sizes": [
351
+ 16
352
+ ],
353
+ "device": "cuda",
354
+ "use_cache": null,
355
+ "limit": null,
356
+ "bootstrap_iters": 100000,
357
+ "gen_kwargs": null,
358
+ "random_seed": 0,
359
+ "numpy_seed": 1234,
360
+ "torch_seed": 1234,
361
+ "fewshot_seed": 1234
362
+ },
363
+ "git_hash": null,
364
+ "date": 1742306371.981645,
365
+ "pretty_env_info": "PyTorch version: 2.5.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: 14.0.0-1ubuntu1.1\nCMake version: version 3.31.2\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 6 2024, 20:22:13) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-6.6.56+-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.140\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: Tesla T4\nGPU 1: Tesla T4\n\nNvidia driver version: 560.35.03\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.6\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 4\nOn-line CPU(s) list: 0-3\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.00GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 2\nSocket(s): 1\nStepping: 3\nBogoMIPS: 4000.44\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 64 KiB (2 instances)\nL1i cache: 64 KiB (2 instances)\nL2 cache: 2 MiB (2 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-3\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Mitigation; PTE Inversion\nVulnerability Mds: Mitigation; Clear CPU buffers; SMT Host state unknown\nVulnerability Meltdown: Mitigation; PTI\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; IBRS; IBPB conditional; STIBP conditional; RSB filling; PBRSB-eIBRS Not affected; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Mitigation; Clear CPU buffers; SMT Host state unknown\n\nVersions of relevant libraries:\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] onnx==1.17.0\n[pip3] optree==0.13.1\n[pip3] pytorch-ignite==0.5.1\n[pip3] pytorch-lightning==2.5.0.post0\n[pip3] torch==2.5.1+cu121\n[pip3] torchaudio==2.5.1+cu121\n[pip3] torchinfo==1.8.0\n[pip3] torchmetrics==1.6.1\n[pip3] torchsummary==1.5.1\n[pip3] torchtune==0.5.0\n[pip3] torchvision==0.20.1+cu121\n[conda] Could not collect",
366
+ "transformers_version": "4.47.0",
367
+ "upper_git_hash": null,
368
+ "tokenizer_pad_token": [
369
+ "<|end_of_text|>",
370
+ "128001"
371
+ ],
372
+ "tokenizer_eos_token": [
373
+ "<|end_of_text|>",
374
+ "128001"
375
+ ],
376
+ "tokenizer_bos_token": [
377
+ "<|begin_of_text|>",
378
+ "128000"
379
+ ],
380
+ "eot_token_id": 128001,
381
+ "max_length": 131072,
382
+ "task_hashes": {},
383
+ "model_source": "hf",
384
+ "model_name": "meta-llama/Llama-3.2-1B",
385
+ "model_name_sanitized": "meta-llama__Llama-3.2-1B",
386
+ "system_instruction": null,
387
+ "system_instruction_sha": null,
388
+ "fewshot_as_multiturn": false,
389
+ "chat_template": null,
390
+ "chat_template_sha": null,
391
+ "start_time": 12619.447145299,
392
+ "end_time": 13669.035210791,
393
+ "total_evaluation_time_seconds": "1049.5880654920002"
394
+ }
lm_eval/meta-llama__Llama-3.2-1B/piqa_0.4.8_results_2025-03-18T11-22-02.759281.json ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "piqa": {
4
+ "alias": "piqa",
5
+ "acc,none": 0.7448313384113167,
6
+ "acc_stderr,none": 0.010171571592521826,
7
+ "acc_norm,none": 0.7459194776931447,
8
+ "acc_norm_stderr,none": 0.010157271999135041
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "piqa": []
13
+ },
14
+ "configs": {
15
+ "piqa": {
16
+ "task": "piqa",
17
+ "dataset_path": "piqa",
18
+ "dataset_kwargs": {
19
+ "trust_remote_code": true
20
+ },
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
24
+ "doc_to_target": "label",
25
+ "unsafe_code": false,
26
+ "doc_to_choice": "{{[sol1, sol2]}}",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 0,
31
+ "metric_list": [
32
+ {
33
+ "metric": "acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "acc_norm",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ }
42
+ ],
43
+ "output_type": "multiple_choice",
44
+ "repeats": 1,
45
+ "should_decontaminate": true,
46
+ "doc_to_decontamination_query": "goal",
47
+ "metadata": {
48
+ "version": 1.0
49
+ }
50
+ }
51
+ },
52
+ "versions": {
53
+ "piqa": 1.0
54
+ },
55
+ "n-shot": {
56
+ "piqa": 0
57
+ },
58
+ "higher_is_better": {
59
+ "piqa": {
60
+ "acc": true,
61
+ "acc_norm": true
62
+ }
63
+ },
64
+ "n-samples": {
65
+ "piqa": {
66
+ "original": 1838,
67
+ "effective": 1838
68
+ }
69
+ },
70
+ "config": {
71
+ "model": "hf",
72
+ "model_args": "pretrained=meta-llama/Llama-3.2-1B,dtype=float32,trust_remote_code=True",
73
+ "model_num_parameters": 1235814400,
74
+ "model_dtype": "torch.float32",
75
+ "model_revision": "main",
76
+ "model_sha": "4e20de362430cd3b72f300e6b0f18e50e7166e08",
77
+ "batch_size": "auto",
78
+ "batch_sizes": [
79
+ 16
80
+ ],
81
+ "device": "cuda",
82
+ "use_cache": null,
83
+ "limit": null,
84
+ "bootstrap_iters": 100000,
85
+ "gen_kwargs": null,
86
+ "random_seed": 0,
87
+ "numpy_seed": 1234,
88
+ "torch_seed": 1234,
89
+ "fewshot_seed": 1234
90
+ },
91
+ "git_hash": null,
92
+ "date": 1742296814.327999,
93
+ "pretty_env_info": "PyTorch version: 2.5.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: 14.0.0-1ubuntu1.1\nCMake version: version 3.31.2\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 6 2024, 20:22:13) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-6.6.56+-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.140\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: Tesla T4\nGPU 1: Tesla T4\n\nNvidia driver version: 560.35.03\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.6\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 4\nOn-line CPU(s) list: 0-3\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.00GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 2\nSocket(s): 1\nStepping: 3\nBogoMIPS: 4000.44\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 64 KiB (2 instances)\nL1i cache: 64 KiB (2 instances)\nL2 cache: 2 MiB (2 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-3\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Mitigation; PTE Inversion\nVulnerability Mds: Mitigation; Clear CPU buffers; SMT Host state unknown\nVulnerability Meltdown: Mitigation; PTI\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; IBRS; IBPB conditional; STIBP conditional; RSB filling; PBRSB-eIBRS Not affected; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Mitigation; Clear CPU buffers; SMT Host state unknown\n\nVersions of relevant libraries:\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] onnx==1.17.0\n[pip3] optree==0.13.1\n[pip3] pytorch-ignite==0.5.1\n[pip3] pytorch-lightning==2.5.0.post0\n[pip3] torch==2.5.1+cu121\n[pip3] torchaudio==2.5.1+cu121\n[pip3] torchinfo==1.8.0\n[pip3] torchmetrics==1.6.1\n[pip3] torchsummary==1.5.1\n[pip3] torchtune==0.5.0\n[pip3] torchvision==0.20.1+cu121\n[conda] Could not collect",
94
+ "transformers_version": "4.47.0",
95
+ "upper_git_hash": null,
96
+ "tokenizer_pad_token": [
97
+ "<|end_of_text|>",
98
+ "128001"
99
+ ],
100
+ "tokenizer_eos_token": [
101
+ "<|end_of_text|>",
102
+ "128001"
103
+ ],
104
+ "tokenizer_bos_token": [
105
+ "<|begin_of_text|>",
106
+ "128000"
107
+ ],
108
+ "eot_token_id": 128001,
109
+ "max_length": 131072,
110
+ "task_hashes": {},
111
+ "model_source": "hf",
112
+ "model_name": "meta-llama/Llama-3.2-1B",
113
+ "model_name_sanitized": "meta-llama__Llama-3.2-1B",
114
+ "system_instruction": null,
115
+ "system_instruction_sha": null,
116
+ "fewshot_as_multiturn": false,
117
+ "chat_template": null,
118
+ "chat_template_sha": null,
119
+ "start_time": 3062.047450193,
120
+ "end_time": 3187.30502146,
121
+ "total_evaluation_time_seconds": "125.257571267"
122
+ }
lm_eval/meta-llama__Llama-3.2-1B/sciq_0.4.8_results_2025-03-18T13-19-28.563679.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "sciq": {
4
+ "alias": "sciq",
5
+ "acc,none": 0.914,
6
+ "acc_stderr,none": 0.008870325962594766,
7
+ "acc_norm,none": 0.883,
8
+ "acc_norm_stderr,none": 0.010169287802713329
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "sciq": []
13
+ },
14
+ "configs": {
15
+ "sciq": {
16
+ "task": "sciq",
17
+ "dataset_path": "sciq",
18
+ "training_split": "train",
19
+ "validation_split": "validation",
20
+ "test_split": "test",
21
+ "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:",
22
+ "doc_to_target": 3,
23
+ "unsafe_code": false,
24
+ "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}",
25
+ "description": "",
26
+ "target_delimiter": " ",
27
+ "fewshot_delimiter": "\n\n",
28
+ "num_fewshot": 0,
29
+ "metric_list": [
30
+ {
31
+ "metric": "acc",
32
+ "aggregation": "mean",
33
+ "higher_is_better": true
34
+ },
35
+ {
36
+ "metric": "acc_norm",
37
+ "aggregation": "mean",
38
+ "higher_is_better": true
39
+ }
40
+ ],
41
+ "output_type": "multiple_choice",
42
+ "repeats": 1,
43
+ "should_decontaminate": true,
44
+ "doc_to_decontamination_query": "{{support}} {{question}}",
45
+ "metadata": {
46
+ "version": 1.0
47
+ }
48
+ }
49
+ },
50
+ "versions": {
51
+ "sciq": 1.0
52
+ },
53
+ "n-shot": {
54
+ "sciq": 0
55
+ },
56
+ "higher_is_better": {
57
+ "sciq": {
58
+ "acc": true,
59
+ "acc_norm": true
60
+ }
61
+ },
62
+ "n-samples": {
63
+ "sciq": {
64
+ "original": 1000,
65
+ "effective": 1000
66
+ }
67
+ },
68
+ "config": {
69
+ "model": "hf",
70
+ "model_args": "pretrained=meta-llama/Llama-3.2-1B,dtype=float32,trust_remote_code=True",
71
+ "model_num_parameters": 1235814400,
72
+ "model_dtype": "torch.float32",
73
+ "model_revision": "main",
74
+ "model_sha": "4e20de362430cd3b72f300e6b0f18e50e7166e08",
75
+ "batch_size": "auto",
76
+ "batch_sizes": [
77
+ 4
78
+ ],
79
+ "device": "cuda",
80
+ "use_cache": null,
81
+ "limit": null,
82
+ "bootstrap_iters": 100000,
83
+ "gen_kwargs": null,
84
+ "random_seed": 0,
85
+ "numpy_seed": 1234,
86
+ "torch_seed": 1234,
87
+ "fewshot_seed": 1234
88
+ },
89
+ "git_hash": null,
90
+ "date": 1742303738.5865788,
91
+ "pretty_env_info": "PyTorch version: 2.5.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: 14.0.0-1ubuntu1.1\nCMake version: version 3.31.2\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 6 2024, 20:22:13) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-6.6.56+-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.140\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: Tesla T4\nGPU 1: Tesla T4\n\nNvidia driver version: 560.35.03\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.6\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 4\nOn-line CPU(s) list: 0-3\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.00GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 2\nSocket(s): 1\nStepping: 3\nBogoMIPS: 4000.44\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 64 KiB (2 instances)\nL1i cache: 64 KiB (2 instances)\nL2 cache: 2 MiB (2 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-3\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Mitigation; PTE Inversion\nVulnerability Mds: Mitigation; Clear CPU buffers; SMT Host state unknown\nVulnerability Meltdown: Mitigation; PTI\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; IBRS; IBPB conditional; STIBP conditional; RSB filling; PBRSB-eIBRS Not affected; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Mitigation; Clear CPU buffers; SMT Host state unknown\n\nVersions of relevant libraries:\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] onnx==1.17.0\n[pip3] optree==0.13.1\n[pip3] pytorch-ignite==0.5.1\n[pip3] pytorch-lightning==2.5.0.post0\n[pip3] torch==2.5.1+cu121\n[pip3] torchaudio==2.5.1+cu121\n[pip3] torchinfo==1.8.0\n[pip3] torchmetrics==1.6.1\n[pip3] torchsummary==1.5.1\n[pip3] torchtune==0.5.0\n[pip3] torchvision==0.20.1+cu121\n[conda] Could not collect",
92
+ "transformers_version": "4.47.0",
93
+ "upper_git_hash": null,
94
+ "tokenizer_pad_token": [
95
+ "<|end_of_text|>",
96
+ "128001"
97
+ ],
98
+ "tokenizer_eos_token": [
99
+ "<|end_of_text|>",
100
+ "128001"
101
+ ],
102
+ "tokenizer_bos_token": [
103
+ "<|begin_of_text|>",
104
+ "128000"
105
+ ],
106
+ "eot_token_id": 128001,
107
+ "max_length": 131072,
108
+ "task_hashes": {},
109
+ "model_source": "hf",
110
+ "model_name": "meta-llama/Llama-3.2-1B",
111
+ "model_name_sanitized": "meta-llama__Llama-3.2-1B",
112
+ "system_instruction": null,
113
+ "system_instruction_sha": null,
114
+ "fewshot_as_multiturn": false,
115
+ "chat_template": null,
116
+ "chat_template_sha": null,
117
+ "start_time": 9986.600789724,
118
+ "end_time": 10233.109467324,
119
+ "total_evaluation_time_seconds": "246.50867760000074"
120
+ }
lm_eval/meta-llama__Llama-3.2-1B/winogrande_0.4.8_results_2025-03-18T13-15-05.925252.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "winogrande": {
4
+ "alias": "winogrande",
5
+ "acc,none": 0.6069455406471981,
6
+ "acc_stderr,none": 0.01372727624910844
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "winogrande": []
11
+ },
12
+ "configs": {
13
+ "winogrande": {
14
+ "task": "winogrande",
15
+ "dataset_path": "winogrande",
16
+ "dataset_name": "winogrande_xl",
17
+ "dataset_kwargs": {
18
+ "trust_remote_code": true
19
+ },
20
+ "training_split": "train",
21
+ "validation_split": "validation",
22
+ "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n",
23
+ "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n",
24
+ "unsafe_code": false,
25
+ "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n",
26
+ "description": "",
27
+ "target_delimiter": " ",
28
+ "fewshot_delimiter": "\n\n",
29
+ "num_fewshot": 0,
30
+ "metric_list": [
31
+ {
32
+ "metric": "acc",
33
+ "aggregation": "mean",
34
+ "higher_is_better": true
35
+ }
36
+ ],
37
+ "output_type": "multiple_choice",
38
+ "repeats": 1,
39
+ "should_decontaminate": true,
40
+ "doc_to_decontamination_query": "sentence",
41
+ "metadata": {
42
+ "version": 1.0
43
+ }
44
+ }
45
+ },
46
+ "versions": {
47
+ "winogrande": 1.0
48
+ },
49
+ "n-shot": {
50
+ "winogrande": 0
51
+ },
52
+ "higher_is_better": {
53
+ "winogrande": {
54
+ "acc": true
55
+ }
56
+ },
57
+ "n-samples": {
58
+ "winogrande": {
59
+ "original": 1267,
60
+ "effective": 1267
61
+ }
62
+ },
63
+ "config": {
64
+ "model": "hf",
65
+ "model_args": "pretrained=meta-llama/Llama-3.2-1B,dtype=float32,trust_remote_code=True",
66
+ "model_num_parameters": 1235814400,
67
+ "model_dtype": "torch.float32",
68
+ "model_revision": "main",
69
+ "model_sha": "4e20de362430cd3b72f300e6b0f18e50e7166e08",
70
+ "batch_size": "auto",
71
+ "batch_sizes": [
72
+ 64
73
+ ],
74
+ "device": "cuda",
75
+ "use_cache": null,
76
+ "limit": null,
77
+ "bootstrap_iters": 100000,
78
+ "gen_kwargs": null,
79
+ "random_seed": 0,
80
+ "numpy_seed": 1234,
81
+ "torch_seed": 1234,
82
+ "fewshot_seed": 1234
83
+ },
84
+ "git_hash": null,
85
+ "date": 1742303656.8517983,
86
+ "pretty_env_info": "PyTorch version: 2.5.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: 14.0.0-1ubuntu1.1\nCMake version: version 3.31.2\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 6 2024, 20:22:13) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-6.6.56+-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.140\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: Tesla T4\nGPU 1: Tesla T4\n\nNvidia driver version: 560.35.03\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.6\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.6\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 4\nOn-line CPU(s) list: 0-3\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) CPU @ 2.00GHz\nCPU family: 6\nModel: 85\nThread(s) per core: 2\nCore(s) per socket: 2\nSocket(s): 1\nStepping: 3\nBogoMIPS: 4000.44\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat md_clear arch_capabilities\nHypervisor vendor: KVM\nVirtualization type: full\nL1d cache: 64 KiB (2 instances)\nL1i cache: 64 KiB (2 instances)\nL2 cache: 2 MiB (2 instances)\nL3 cache: 38.5 MiB (1 instance)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-3\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Mitigation; PTE Inversion\nVulnerability Mds: Mitigation; Clear CPU buffers; SMT Host state unknown\nVulnerability Meltdown: Mitigation; PTI\nVulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; IBRS\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; IBRS; IBPB conditional; STIBP conditional; RSB filling; PBRSB-eIBRS Not affected; BHI SW loop, KVM SW loop\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Mitigation; Clear CPU buffers; SMT Host state unknown\n\nVersions of relevant libraries:\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] onnx==1.17.0\n[pip3] optree==0.13.1\n[pip3] pytorch-ignite==0.5.1\n[pip3] pytorch-lightning==2.5.0.post0\n[pip3] torch==2.5.1+cu121\n[pip3] torchaudio==2.5.1+cu121\n[pip3] torchinfo==1.8.0\n[pip3] torchmetrics==1.6.1\n[pip3] torchsummary==1.5.1\n[pip3] torchtune==0.5.0\n[pip3] torchvision==0.20.1+cu121\n[conda] Could not collect",
87
+ "transformers_version": "4.47.0",
88
+ "upper_git_hash": null,
89
+ "tokenizer_pad_token": [
90
+ "<|end_of_text|>",
91
+ "128001"
92
+ ],
93
+ "tokenizer_eos_token": [
94
+ "<|end_of_text|>",
95
+ "128001"
96
+ ],
97
+ "tokenizer_bos_token": [
98
+ "<|begin_of_text|>",
99
+ "128000"
100
+ ],
101
+ "eot_token_id": 128001,
102
+ "max_length": 131072,
103
+ "task_hashes": {},
104
+ "model_source": "hf",
105
+ "model_name": "meta-llama/Llama-3.2-1B",
106
+ "model_name_sanitized": "meta-llama__Llama-3.2-1B",
107
+ "system_instruction": null,
108
+ "system_instruction_sha": null,
109
+ "fewshot_as_multiturn": false,
110
+ "chat_template": null,
111
+ "chat_template_sha": null,
112
+ "start_time": 9905.208666602,
113
+ "end_time": 9970.470989964,
114
+ "total_evaluation_time_seconds": "65.26232336199973"
115
+ }