khazic commited on
Commit
9d37604
·
verified ·
1 Parent(s): 3e6d033

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -38,3 +38,5 @@ outputs/2025-11-18/exp1_len1024/dream-inst/heat/gsm8k/results.json filter=lfs di
38
  outputs/2025-11-18/exp1_len1024/dream-inst/no_cache/gsm8k/results.json filter=lfs diff=lfs merge=lfs -text
39
  outputs/2025-11-18/exp1_len1024/dream-inst/prefix/gsm8k/results.json filter=lfs diff=lfs merge=lfs -text
40
  outputs/2025-11-18/exp2_random_gen/dream-inst/gsm8k_len256/results.json filter=lfs diff=lfs merge=lfs -text
 
 
 
38
  outputs/2025-11-18/exp1_len1024/dream-inst/no_cache/gsm8k/results.json filter=lfs diff=lfs merge=lfs -text
39
  outputs/2025-11-18/exp1_len1024/dream-inst/prefix/gsm8k/results.json filter=lfs diff=lfs merge=lfs -text
40
  outputs/2025-11-18/exp2_random_gen/dream-inst/gsm8k_len256/results.json filter=lfs diff=lfs merge=lfs -text
41
+ 2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/results.json filter=lfs diff=lfs merge=lfs -text
42
+ 2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/results.json filter=lfs diff=lfs merge=lfs -text
2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/.hydra/config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ generation:
3
+ mask_token_id: 151666
4
+ eot_token_id: 151643
5
+ pad_token_id: 151643
6
+ add_bos_token: true
7
+ alg: maskgit_plus
8
+ name: dream-inst
9
+ path: ${oc.env:DREAM_INST_PATH}
10
+ generation:
11
+ strategy: vanilla
12
+ threshold: null
13
+ factor: null
14
+ alg: maskgit_plus
15
+ gen_length: null
16
+ block_length: null
17
+ steps: null
18
+ temperature: 0.0
19
+ top_p: null
20
+ top_k: null
21
+ debias: false
22
+ output_probs: false
23
+ cache:
24
+ _target_: src.cache.dLLMCache
25
+ kr: 1
26
+ kp: 50
27
+ rou: 0.25
28
+ seed: 1234
29
+ batch_size: 1
30
+ attn_implementation: eager
31
+ dataset:
32
+ name: mmlu_pro
33
+ size: null
34
+ n_shot: null
35
+ system_prompt: null
36
+ batch_size: 1
37
+ mc_num: null
38
+ max_length: 4096
39
+ is_check_greedy: true
40
+ add_bos_token: true
41
+ nll_type: mc
42
+ log_type: ftb
43
+ eval_args:
44
+ log_samples: true
45
+ tasks: ${..dataset.name}
46
+ num_fewshot: ${..dataset.n_shot}
47
+ batch_size: ${..batch_size}
48
+ limit: ${..dataset.size}
49
+ confirm_run_unsafe_code: true
50
+ random_seed: ${..seed}
51
+ fewshot_random_seed: ${..seed}
52
+ numpy_random_seed: ${..seed}
53
+ torch_random_seed: ${..seed}
2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/.hydra/hydra.yaml ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/2025-11-19/exp1_len256/dream-inst/dllm/mmlu_pro
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.run.dir=outputs/2025-11-19/exp1_len256/dream-inst/dllm/mmlu_pro
114
+ - hydra.mode=RUN
115
+ task:
116
+ - dataset.name=mmlu_pro
117
+ - model=dream-inst
118
+ - cache=dllm
119
+ - generation=vanilla
120
+ - batch_size=1
121
+ - seed=1234
122
+ job:
123
+ name: eval
124
+ chdir: null
125
+ override_dirname: batch_size=1,cache=dllm,dataset.name=mmlu_pro,generation=vanilla,model=dream-inst,seed=1234
126
+ id: ???
127
+ num: ???
128
+ config_name: eval
129
+ env_set: {}
130
+ env_copy: []
131
+ config:
132
+ override_dirname:
133
+ kv_sep: '='
134
+ item_sep: ','
135
+ exclude_keys: []
136
+ runtime:
137
+ version: 1.3.2
138
+ version_base: '1.3'
139
+ cwd: /xfr_ceph_sh/liuchonghan/HEAT/heat
140
+ config_sources:
141
+ - path: hydra.conf
142
+ schema: pkg
143
+ provider: hydra
144
+ - path: /xfr_ceph_sh/liuchonghan/HEAT/heat/configs
145
+ schema: file
146
+ provider: main
147
+ - path: ''
148
+ schema: structured
149
+ provider: schema
150
+ output_dir: /xfr_ceph_sh/liuchonghan/HEAT/heat/outputs/2025-11-19/exp1_len256/dream-inst/dllm/mmlu_pro
151
+ choices:
152
+ cache: dllm
153
+ generation: vanilla
154
+ model: dream-inst
155
+ hydra/env: default
156
+ hydra/callbacks: null
157
+ hydra/job_logging: default
158
+ hydra/hydra_logging: default
159
+ hydra/hydra_help: default
160
+ hydra/help: default
161
+ hydra/sweeper: basic
162
+ hydra/launcher: basic
163
+ hydra/output: default
164
+ verbose: false
2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/.hydra/overrides.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ - dataset.name=mmlu_pro
2
+ - model=dream-inst
3
+ - cache=dllm
4
+ - generation=vanilla
5
+ - batch_size=1
6
+ - seed=1234
2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/eval.log ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-11-19 17:39:42,871][accelerate.utils.other][WARNING] - Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
2
+ [2025-11-19 17:39:51,105][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
3
+ [2025-11-19 17:39:51,105][lm_eval.evaluator][INFO] - Using pre-initialized model
4
+ [2025-11-19 17:39:51,156][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
5
+ [2025-11-19 17:39:51,156][lm_eval.evaluator][INFO] - Using pre-initialized model
6
+ [2025-11-19 17:39:51,181][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
7
+ [2025-11-19 17:39:51,181][lm_eval.evaluator][INFO] - Using pre-initialized model
8
+ [2025-11-19 17:39:51,303][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
9
+ [2025-11-19 17:39:51,303][lm_eval.evaluator][INFO] - Using pre-initialized model
10
+ [2025-11-19 17:39:51,307][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
11
+ [2025-11-19 17:39:51,308][lm_eval.evaluator][INFO] - Using pre-initialized model
12
+ [2025-11-19 17:39:51,342][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
13
+ [2025-11-19 17:39:51,342][lm_eval.evaluator][INFO] - Using pre-initialized model
14
+ [2025-11-19 17:39:51,374][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
15
+ [2025-11-19 17:39:51,374][lm_eval.evaluator][INFO] - Using pre-initialized model
16
+ [2025-11-19 17:39:51,561][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
17
+ [2025-11-19 17:39:51,562][lm_eval.evaluator][INFO] - Using pre-initialized model
18
+ [2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
19
+ [2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
20
+ [2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
21
+ [2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
22
+ [2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
23
+ [2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
24
+ [2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
25
+ [2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
26
+ [2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
27
+ [2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
28
+ [2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
29
+ [2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
30
+ [2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
31
+ [2025-11-19 17:40:34,824][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
32
+ [2025-11-19 17:40:34,825][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 5...
33
+ [2025-11-19 17:40:34,831][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
34
+ [2025-11-19 17:40:34,831][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
35
+ [2025-11-19 17:40:34,831][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
36
+ [2025-11-19 17:40:34,831][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
37
+ [2025-11-19 17:40:34,831][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
38
+ [2025-11-19 17:40:34,831][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
39
+ [2025-11-19 17:40:34,832][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
40
+ [2025-11-19 17:40:34,832][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
41
+ [2025-11-19 17:40:34,832][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
42
+ [2025-11-19 17:40:34,832][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
43
+ [2025-11-19 17:40:34,832][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
44
+ [2025-11-19 17:40:34,832][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
45
+ [2025-11-19 17:40:34,832][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
46
+ [2025-11-19 17:40:34,832][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
47
+ [2025-11-19 17:40:34,833][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 6...
48
+ [2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
49
+ [2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
50
+ [2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
51
+ [2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
52
+ [2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
53
+ [2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
54
+ [2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
55
+ [2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
56
+ [2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
57
+ [2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
58
+ [2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
59
+ [2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
60
+ [2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
61
+ [2025-11-19 17:40:34,850][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
62
+ [2025-11-19 17:40:34,851][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 2...
63
+ [2025-11-19 17:40:34,860][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
64
+ [2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
65
+ [2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
66
+ [2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
67
+ [2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
68
+ [2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
69
+ [2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
70
+ [2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
71
+ [2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
72
+ [2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
73
+ [2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
74
+ [2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
75
+ [2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
76
+ [2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
77
+ [2025-11-19 17:40:34,863][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 1...
78
+ [2025-11-19 17:40:34,911][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
79
+ [2025-11-19 17:40:34,911][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
80
+ [2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
81
+ [2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
82
+ [2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
83
+ [2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
84
+ [2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
85
+ [2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
86
+ [2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
87
+ [2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
88
+ [2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
89
+ [2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
90
+ [2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
91
+ [2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
92
+ [2025-11-19 17:40:34,914][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 0...
93
+ [2025-11-19 17:40:35,057][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
94
+ [2025-11-19 17:40:35,057][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
95
+ [2025-11-19 17:40:35,057][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
96
+ [2025-11-19 17:40:35,057][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
97
+ [2025-11-19 17:40:35,057][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
98
+ [2025-11-19 17:40:35,057][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
99
+ [2025-11-19 17:40:35,058][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
100
+ [2025-11-19 17:40:35,058][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
101
+ [2025-11-19 17:40:35,058][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
102
+ [2025-11-19 17:40:35,058][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
103
+ [2025-11-19 17:40:35,058][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
104
+ [2025-11-19 17:40:35,058][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
105
+ [2025-11-19 17:40:35,058][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
106
+ [2025-11-19 17:40:35,058][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
107
+ [2025-11-19 17:40:35,062][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 4...
108
+ [2025-11-19 17:40:35,085][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
109
+ [2025-11-19 17:40:35,085][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
110
+ [2025-11-19 17:40:35,085][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
111
+ [2025-11-19 17:40:35,085][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
112
+ [2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
113
+ [2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
114
+ [2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
115
+ [2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
116
+ [2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
117
+ [2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
118
+ [2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
119
+ [2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
120
+ [2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
121
+ [2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
122
+ [2025-11-19 17:40:35,089][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 7...
123
+ [2025-11-19 17:40:35,119][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
124
+ [2025-11-19 17:40:35,120][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
125
+ [2025-11-19 17:40:35,120][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
126
+ [2025-11-19 17:40:35,120][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
127
+ [2025-11-19 17:40:35,120][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
128
+ [2025-11-19 17:40:35,120][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
129
+ [2025-11-19 17:40:35,121][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
130
+ [2025-11-19 17:40:35,121][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
131
+ [2025-11-19 17:40:35,121][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
132
+ [2025-11-19 17:40:35,121][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
133
+ [2025-11-19 17:40:35,121][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
134
+ [2025-11-19 17:40:35,121][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
135
+ [2025-11-19 17:40:35,121][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
136
+ [2025-11-19 17:40:35,121][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
137
+ [2025-11-19 17:40:35,124][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 3...
138
+ [2025-11-19 17:40:39,984][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 1...
139
+ [2025-11-19 17:40:39,984][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 7...
140
+ [2025-11-19 17:40:39,984][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 5...
141
+ [2025-11-19 17:40:39,984][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 6...
142
+ [2025-11-19 17:40:39,984][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 3...
143
+ [2025-11-19 17:40:39,984][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 4...
144
+ [2025-11-19 17:40:39,985][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 0...
145
+ [2025-11-19 17:40:39,985][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 2...
146
+ [2025-11-19 17:40:40,026][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 6...
147
+ [2025-11-19 17:40:40,026][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 1...
148
+ [2025-11-19 17:40:40,026][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 7...
149
+ [2025-11-19 17:40:40,026][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 4...
150
+ [2025-11-19 17:40:40,026][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 0...
151
+ [2025-11-19 17:40:40,026][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 5...
152
+ [2025-11-19 17:40:40,026][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 2...
153
+ [2025-11-19 17:40:40,026][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 3...
154
+ [2025-11-19 17:40:40,081][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 1...
155
+ [2025-11-19 17:40:40,081][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 2...
156
+ [2025-11-19 17:40:40,081][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 0...
157
+ [2025-11-19 17:40:40,081][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 6...
158
+ [2025-11-19 17:40:40,081][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 5...
159
+ [2025-11-19 17:40:40,081][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 4...
160
+ [2025-11-19 17:40:40,081][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 3...
161
+ [2025-11-19 17:40:40,081][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 7...
162
+ [2025-11-19 17:40:40,132][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 1...
163
+ [2025-11-19 17:40:40,132][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 2...
164
+ [2025-11-19 17:40:40,132][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 0...
165
+ [2025-11-19 17:40:40,132][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 4...
166
+ [2025-11-19 17:40:40,132][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 6...
167
+ [2025-11-19 17:40:40,132][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 7...
168
+ [2025-11-19 17:40:40,132][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 5...
169
+ [2025-11-19 17:40:40,132][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 3...
170
+ [2025-11-19 17:40:40,180][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 6...
171
+ [2025-11-19 17:40:40,180][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 4...
172
+ [2025-11-19 17:40:40,180][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 1...
173
+ [2025-11-19 17:40:40,180][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 7...
174
+ [2025-11-19 17:40:40,180][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 5...
175
+ [2025-11-19 17:40:40,180][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 0...
176
+ [2025-11-19 17:40:40,180][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 2...
177
+ [2025-11-19 17:40:40,180][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 3...
178
+ [2025-11-19 17:40:40,247][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 6...
179
+ [2025-11-19 17:40:40,247][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 0...
180
+ [2025-11-19 17:40:40,247][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 1...
181
+ [2025-11-19 17:40:40,247][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 7...
182
+ [2025-11-19 17:40:40,247][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 4...
183
+ [2025-11-19 17:40:40,247][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 5...
184
+ [2025-11-19 17:40:40,247][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 2...
185
+ [2025-11-19 17:40:40,247][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 3...
186
+ [2025-11-19 17:40:40,281][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 6...
187
+ [2025-11-19 17:40:40,281][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 5...
188
+ [2025-11-19 17:40:40,281][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 7...
189
+ [2025-11-19 17:40:40,281][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 2...
190
+ [2025-11-19 17:40:40,281][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 1...
191
+ [2025-11-19 17:40:40,281][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 0...
192
+ [2025-11-19 17:40:40,282][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 4...
193
+ [2025-11-19 17:40:40,282][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 3...
194
+ [2025-11-19 17:40:40,324][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 2...
195
+ [2025-11-19 17:40:40,324][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 6...
196
+ [2025-11-19 17:40:40,324][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 1...
197
+ [2025-11-19 17:40:40,324][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 4...
198
+ [2025-11-19 17:40:40,324][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 5...
199
+ [2025-11-19 17:40:40,324][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 7...
200
+ [2025-11-19 17:40:40,324][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 0...
201
+ [2025-11-19 17:40:40,324][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 3...
202
+ [2025-11-19 17:40:40,344][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 1...
203
+ [2025-11-19 17:40:40,344][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 6...
204
+ [2025-11-19 17:40:40,344][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 4...
205
+ [2025-11-19 17:40:40,344][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 7...
206
+ [2025-11-19 17:40:40,344][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 5...
207
+ [2025-11-19 17:40:40,344][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 0...
208
+ [2025-11-19 17:40:40,344][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 3...
209
+ [2025-11-19 17:40:40,344][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 2...
210
+ [2025-11-19 17:40:40,365][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 7...
211
+ [2025-11-19 17:40:40,365][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 1...
212
+ [2025-11-19 17:40:40,365][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 6...
213
+ [2025-11-19 17:40:40,365][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 4...
214
+ [2025-11-19 17:40:40,365][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 3...
215
+ [2025-11-19 17:40:40,365][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 5...
216
+ [2025-11-19 17:40:40,365][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 0...
217
+ [2025-11-19 17:40:40,365][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 2...
218
+ [2025-11-19 17:40:40,385][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 7...
219
+ [2025-11-19 17:40:40,385][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 6...
220
+ [2025-11-19 17:40:40,385][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 5...
221
+ [2025-11-19 17:40:40,385][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 1...
222
+ [2025-11-19 17:40:40,385][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 4...
223
+ [2025-11-19 17:40:40,385][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 0...
224
+ [2025-11-19 17:40:40,385][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 3...
225
+ [2025-11-19 17:40:40,385][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 2...
226
+ [2025-11-19 17:40:40,406][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 6...
227
+ [2025-11-19 17:40:40,406][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 3...
228
+ [2025-11-19 17:40:40,406][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 0...
229
+ [2025-11-19 17:40:40,406][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 4...
230
+ [2025-11-19 17:40:40,406][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 5...
231
+ [2025-11-19 17:40:40,406][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 7...
232
+ [2025-11-19 17:40:40,406][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 1...
233
+ [2025-11-19 17:40:40,406][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 2...
234
+ [2025-11-19 17:40:40,426][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 6...
235
+ [2025-11-19 17:40:40,426][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 1...
236
+ [2025-11-19 17:40:40,426][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 7...
237
+ [2025-11-19 17:40:40,426][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 3...
238
+ [2025-11-19 17:40:40,426][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 4...
239
+ [2025-11-19 17:40:40,426][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 0...
240
+ [2025-11-19 17:40:40,426][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 5...
241
+ [2025-11-19 17:40:40,427][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 2...
242
+ [2025-11-19 17:40:40,447][lm_eval.evaluator][INFO] - Running generate_until requests
243
+ [2025-11-19 17:40:40,447][lm_eval.evaluator][INFO] - Running generate_until requests
244
+ [2025-11-19 17:40:40,447][lm_eval.evaluator][INFO] - Running generate_until requests
245
+ [2025-11-19 17:40:40,447][lm_eval.evaluator][INFO] - Running generate_until requests
246
+ [2025-11-19 17:40:40,447][lm_eval.evaluator][INFO] - Running generate_until requests
247
+ [2025-11-19 17:40:40,447][lm_eval.evaluator][INFO] - Running generate_until requests
248
+ [2025-11-19 17:40:40,447][lm_eval.evaluator][INFO] - Running generate_until requests
249
+ [2025-11-19 17:40:40,447][lm_eval.evaluator][INFO] - Running generate_until requests
2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec3caae7416644291124ab09253fed704c145e61302e62b200add7168ef9da8e
3
+ size 10543271
2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/stderr.log ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0
  0%| | 0/12 [00:00<?, ?it/s]
1
  0%| | 0/12 [00:00<?, ?it/s]
 
 
 
2
  0%| | 0/13 [00:00<?, ?it/s]
3
  0%| | 0/13 [00:00<?, ?it/s]
 
 
 
4
  0%| | 0/13 [00:00<?, ?it/s]
 
 
5
  0%| | 0/12 [00:00<?, ?it/s]
 
6
  0%| | 0/12 [00:00<?, ?it/s]
 
7
  0%| | 0/13 [00:00<?, ?it/s]
 
8
  0%| | 0/12 [00:00<?, ?it/s]
9
  0%| | 0/13 [00:00<?, ?it/s]
10
  0%| | 0/12 [00:00<?, ?it/s]
11
  0%| | 0/12 [00:00<?, ?it/s]
12
  0%| | 0/12 [00:00<?, ?it/s]
13
  0%| | 0/13 [00:00<?, ?it/s]
14
  0%| | 0/13 [00:00<?, ?it/s]
 
 
 
 
15
  0%| | 0/13 [00:00<?, ?it/s]
 
 
 
 
16
  0%| | 0/12 [00:00<?, ?it/s]
17
  0%| | 0/12 [00:00<?, ?it/s]
18
  0%| | 0/13 [00:00<?, ?it/s]
19
  0%| | 0/13 [00:00<?, ?it/s]
20
  0%| | 0/12 [00:00<?, ?it/s]
21
  0%| | 0/13 [00:00<?, ?it/s]
22
  0%| | 0/12 [00:00<?, ?it/s]
 
 
 
 
 
 
 
23
  0%| | 0/13 [00:00<?, ?it/s]
 
24
  0%| | 0/12 [00:00<?, ?it/s]
25
  0%| | 0/12 [00:00<?, ?it/s]
26
  0%| | 0/13 [00:00<?, ?it/s]
27
  0%| | 0/13 [00:00<?, ?it/s]
28
  0%| | 0/12 [00:00<?, ?it/s]
29
  0%| | 0/13 [00:00<?, ?it/s]
30
  0%| | 0/12 [00:00<?, ?it/s]
 
 
 
 
 
 
 
31
  0%| | 0/13 [00:00<?, ?it/s]
 
32
  0%| | 0/12 [00:00<?, ?it/s]
33
  0%| | 0/13 [00:00<?, ?it/s]
34
  0%| | 0/13 [00:00<?, ?it/s]
35
  0%| | 0/13 [00:00<?, ?it/s]
36
  0%| | 0/12 [00:00<?, ?it/s]
37
  0%| | 0/12 [00:00<?, ?it/s]
38
  0%| | 0/12 [00:00<?, ?it/s]
 
 
 
 
 
 
 
39
  0%| | 0/13 [00:00<?, ?it/s]
 
40
  0%| | 0/12 [00:00<?, ?it/s]
41
  0%| | 0/12 [00:00<?, ?it/s]
42
  0%| | 0/12 [00:00<?, ?it/s]
43
  0%| | 0/13 [00:00<?, ?it/s]
44
  0%| | 0/12 [00:00<?, ?it/s]
45
  0%| | 0/13 [00:00<?, ?it/s]
46
  0%| | 0/13 [00:00<?, ?it/s]
 
 
 
 
 
 
 
47
  0%| | 0/13 [00:00<?, ?it/s]
 
48
  0%| | 0/13 [00:00<?, ?it/s]
49
  0%| | 0/12 [00:00<?, ?it/s]
50
  0%| | 0/12 [00:00<?, ?it/s]
51
  0%| | 0/13 [00:00<?, ?it/s]
52
  0%| | 0/13 [00:00<?, ?it/s]
53
  0%| | 0/12 [00:00<?, ?it/s]
54
  0%| | 0/12 [00:00<?, ?it/s]
 
 
 
 
55
  0%| | 0/13 [00:00<?, ?it/s]
 
 
 
 
56
  0%| | 0/12 [00:00<?, ?it/s]
57
  0%| | 0/13 [00:00<?, ?it/s]
58
  0%| | 0/13 [00:00<?, ?it/s]
59
  0%| | 0/12 [00:00<?, ?it/s]
60
  0%| | 0/12 [00:00<?, ?it/s]
61
  0%| | 0/13 [00:00<?, ?it/s]
62
  0%| | 0/12 [00:00<?, ?it/s]
63
  0%| | 0/13 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
64
  0%| | 0/12 [00:00<?, ?it/s]
65
  0%| | 0/12 [00:00<?, ?it/s]
66
  0%| | 0/12 [00:00<?, ?it/s]
67
  0%| | 0/13 [00:00<?, ?it/s]
68
  0%| | 0/13 [00:00<?, ?it/s]
69
  0%| | 0/13 [00:00<?, ?it/s]
70
  0%| | 0/13 [00:00<?, ?it/s]
71
  0%| | 0/12 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
72
  0%| | 0/12 [00:00<?, ?it/s]
73
  0%| | 0/12 [00:00<?, ?it/s]
74
  0%| | 0/13 [00:00<?, ?it/s]
75
  0%| | 0/12 [00:00<?, ?it/s]
76
  0%| | 0/13 [00:00<?, ?it/s]
77
  0%| | 0/13 [00:00<?, ?it/s]
78
  0%| | 0/13 [00:00<?, ?it/s]
79
  0%| | 0/12 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
80
  0%| | 0/12 [00:00<?, ?it/s]
81
  0%| | 0/12 [00:00<?, ?it/s]
82
  0%| | 0/13 [00:00<?, ?it/s]
83
  0%| | 0/12 [00:00<?, ?it/s]
84
  0%| | 0/12 [00:00<?, ?it/s]
85
  0%| | 0/13 [00:00<?, ?it/s]
86
  0%| | 0/13 [00:00<?, ?it/s]
87
  0%| | 0/13 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
88
  0%| | 0/12 [00:00<?, ?it/s]
89
  0%| | 0/12 [00:00<?, ?it/s]
90
  0%| | 0/12 [00:00<?, ?it/s]
91
  0%| | 0/13 [00:00<?, ?it/s]
92
  0%| | 0/12 [00:00<?, ?it/s]
93
  0%| | 0/13 [00:00<?, ?it/s]
94
  0%| | 0/13 [00:00<?, ?it/s]
95
  0%| | 0/13 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
96
  0%| | 0/12 [00:00<?, ?it/s]
97
  0%| | 0/13 [00:00<?, ?it/s]
98
  0%| | 0/12 [00:00<?, ?it/s]
99
  0%| | 0/12 [00:00<?, ?it/s]
100
  0%| | 0/12 [00:00<?, ?it/s]
101
  0%| | 0/13 [00:00<?, ?it/s]
102
  0%| | 0/13 [00:00<?, ?it/s]
103
  0%| | 0/13 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
104
  0%| | 0/13 [00:00<?, ?it/s]
105
  0%| | 0/12 [00:00<?, ?it/s]
106
  0%| | 0/12 [00:00<?, ?it/s]
107
  0%| | 0/12 [00:00<?, ?it/s]
108
  0%| | 0/13 [00:00<?, ?it/s]
109
  0%| | 0/12 [00:00<?, ?it/s]
110
  0%| | 0/13 [00:00<?, ?it/s]
111
  0%| | 0/13 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ipex flag is deprecated, will be removed in Accelerate v1.10. From 2.7.0, PyTorch has all needed optimizations for Intel CPU and XPU.
2
+ The following values were not passed to `accelerate launch` and had defaults used instead:
3
+ More than one GPU was found, enabling multi-GPU training.
4
+ If this was unintended please pass in `--num_processes=1`.
5
+ `--mixed_precision` was set to a value of `'no'`
6
+ `--dynamo_backend` was set to a value of `'no'`
7
+ To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.
8
+ [W1119 17:39:27.733661635 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
9
+ 2025-11-19 17:39:36.278 | INFO | src.utils:pre_initialize:603 - {'strategy': 'vanilla', 'threshold': None, 'factor': None, 'alg': 'maskgit_plus', 'gen_length': 256, 'block_length': 32, 'steps': 256, 'temperature': 0.0, 'top_p': 0.9, 'top_k': None, 'debias': False, 'output_probs': False, 'mask_token_id': 151666, 'eot_token_id': 151643, 'pad_token_id': 151643, 'add_bos_token': True, 'sigma': None}
10
+ 2025-11-19 17:39:36.278 | INFO | src.utils:pre_initialize:618 - Using cache with args: {'kp': 50, 'kr': 4}
11
+ [W1119 17:39:37.823449224 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
12
+ [W1119 17:39:39.543581206 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
13
+ [W1119 17:39:40.105350243 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
14
+ [W1119 17:39:40.238904703 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
15
+ [W1119 17:39:40.460865127 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
16
+ [W1119 17:39:40.523301171 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
17
+ [W1119 17:39:40.541752953 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
18
+ [W1119 17:39:40.626767406 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
28
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
29
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
30
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
31
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
32
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
33
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
34
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
35
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
36
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
37
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
38
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
39
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
40
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
41
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
42
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
43
+ 2025-11-19 17:39:51.101 | INFO | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
44
+ 2025-11-19 17:39:51.152 | INFO | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
45
+ 2025-11-19 17:39:51.178 | INFO | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
46
+ 2025-11-19 17:39:51.300 | INFO | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
47
+ 2025-11-19 17:39:51.302 | INFO | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
48
+ 2025-11-19 17:39:51.338 | INFO | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
49
+ 2025-11-19 17:39:51.371 | INFO | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
50
+ 2025-11-19 17:39:51.559 | INFO | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
+
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+
136
+
137
+
138
+
139
+
140
  0%| | 0/12 [00:00<?, ?it/s]
141
  0%| | 0/12 [00:00<?, ?it/s]
142
+
143
+
144
+
145
  0%| | 0/13 [00:00<?, ?it/s]
146
  0%| | 0/13 [00:00<?, ?it/s]
147
+
148
+
149
+
150
  0%| | 0/13 [00:00<?, ?it/s]
151
+
152
+
153
  0%| | 0/12 [00:00<?, ?it/s]
154
+
155
  0%| | 0/12 [00:00<?, ?it/s]
156
+
157
  0%| | 0/13 [00:00<?, ?it/s]
158
+
159
  0%| | 0/12 [00:00<?, ?it/s]
160
  0%| | 0/13 [00:00<?, ?it/s]
161
  0%| | 0/12 [00:00<?, ?it/s]
162
  0%| | 0/12 [00:00<?, ?it/s]
163
  0%| | 0/12 [00:00<?, ?it/s]
164
  0%| | 0/13 [00:00<?, ?it/s]
165
  0%| | 0/13 [00:00<?, ?it/s]
166
+
167
+
168
+
169
+
170
  0%| | 0/13 [00:00<?, ?it/s]
171
+
172
+
173
+
174
+
175
  0%| | 0/12 [00:00<?, ?it/s]
176
  0%| | 0/12 [00:00<?, ?it/s]
177
  0%| | 0/13 [00:00<?, ?it/s]
178
  0%| | 0/13 [00:00<?, ?it/s]
179
  0%| | 0/12 [00:00<?, ?it/s]
180
  0%| | 0/13 [00:00<?, ?it/s]
181
  0%| | 0/12 [00:00<?, ?it/s]
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
  0%| | 0/13 [00:00<?, ?it/s]
190
+
191
  0%| | 0/12 [00:00<?, ?it/s]
192
  0%| | 0/12 [00:00<?, ?it/s]
193
  0%| | 0/13 [00:00<?, ?it/s]
194
  0%| | 0/13 [00:00<?, ?it/s]
195
  0%| | 0/12 [00:00<?, ?it/s]
196
  0%| | 0/13 [00:00<?, ?it/s]
197
  0%| | 0/12 [00:00<?, ?it/s]
198
+
199
+
200
+
201
+
202
+
203
+
204
+
205
  0%| | 0/13 [00:00<?, ?it/s]
206
+
207
  0%| | 0/12 [00:00<?, ?it/s]
208
  0%| | 0/13 [00:00<?, ?it/s]
209
  0%| | 0/13 [00:00<?, ?it/s]
210
  0%| | 0/13 [00:00<?, ?it/s]
211
  0%| | 0/12 [00:00<?, ?it/s]
212
  0%| | 0/12 [00:00<?, ?it/s]
213
  0%| | 0/12 [00:00<?, ?it/s]
214
+
215
+
216
+
217
+
218
+
219
+
220
+
221
  0%| | 0/13 [00:00<?, ?it/s]
222
+
223
  0%| | 0/12 [00:00<?, ?it/s]
224
  0%| | 0/12 [00:00<?, ?it/s]
225
  0%| | 0/12 [00:00<?, ?it/s]
226
  0%| | 0/13 [00:00<?, ?it/s]
227
  0%| | 0/12 [00:00<?, ?it/s]
228
  0%| | 0/13 [00:00<?, ?it/s]
229
  0%| | 0/13 [00:00<?, ?it/s]
230
+
231
+
232
+
233
+
234
+
235
+
236
+
237
  0%| | 0/13 [00:00<?, ?it/s]
238
+
239
  0%| | 0/13 [00:00<?, ?it/s]
240
  0%| | 0/12 [00:00<?, ?it/s]
241
  0%| | 0/12 [00:00<?, ?it/s]
242
  0%| | 0/13 [00:00<?, ?it/s]
243
  0%| | 0/13 [00:00<?, ?it/s]
244
  0%| | 0/12 [00:00<?, ?it/s]
245
  0%| | 0/12 [00:00<?, ?it/s]
246
+
247
+
248
+
249
+
250
  0%| | 0/13 [00:00<?, ?it/s]
251
+
252
+
253
+
254
+
255
  0%| | 0/12 [00:00<?, ?it/s]
256
  0%| | 0/13 [00:00<?, ?it/s]
257
  0%| | 0/13 [00:00<?, ?it/s]
258
  0%| | 0/12 [00:00<?, ?it/s]
259
  0%| | 0/12 [00:00<?, ?it/s]
260
  0%| | 0/13 [00:00<?, ?it/s]
261
  0%| | 0/12 [00:00<?, ?it/s]
262
  0%| | 0/13 [00:00<?, ?it/s]
263
+
264
+
265
+
266
+
267
+
268
+
269
+
270
+
271
  0%| | 0/12 [00:00<?, ?it/s]
272
  0%| | 0/12 [00:00<?, ?it/s]
273
  0%| | 0/12 [00:00<?, ?it/s]
274
  0%| | 0/13 [00:00<?, ?it/s]
275
  0%| | 0/13 [00:00<?, ?it/s]
276
  0%| | 0/13 [00:00<?, ?it/s]
277
  0%| | 0/13 [00:00<?, ?it/s]
278
  0%| | 0/12 [00:00<?, ?it/s]
279
+
280
+
281
+
282
+
283
+
284
+
285
+
286
+
287
  0%| | 0/12 [00:00<?, ?it/s]
288
  0%| | 0/12 [00:00<?, ?it/s]
289
  0%| | 0/13 [00:00<?, ?it/s]
290
  0%| | 0/12 [00:00<?, ?it/s]
291
  0%| | 0/13 [00:00<?, ?it/s]
292
  0%| | 0/13 [00:00<?, ?it/s]
293
  0%| | 0/13 [00:00<?, ?it/s]
294
  0%| | 0/12 [00:00<?, ?it/s]
295
+
296
+
297
+
298
+
299
+
300
+
301
+
302
+
303
  0%| | 0/12 [00:00<?, ?it/s]
304
  0%| | 0/12 [00:00<?, ?it/s]
305
  0%| | 0/13 [00:00<?, ?it/s]
306
  0%| | 0/12 [00:00<?, ?it/s]
307
  0%| | 0/12 [00:00<?, ?it/s]
308
  0%| | 0/13 [00:00<?, ?it/s]
309
  0%| | 0/13 [00:00<?, ?it/s]
310
  0%| | 0/13 [00:00<?, ?it/s]
311
+
312
+
313
+
314
+
315
+
316
+
317
+
318
+
319
  0%| | 0/12 [00:00<?, ?it/s]
320
  0%| | 0/12 [00:00<?, ?it/s]
321
  0%| | 0/12 [00:00<?, ?it/s]
322
  0%| | 0/13 [00:00<?, ?it/s]
323
  0%| | 0/12 [00:00<?, ?it/s]
324
  0%| | 0/13 [00:00<?, ?it/s]
325
  0%| | 0/13 [00:00<?, ?it/s]
326
  0%| | 0/13 [00:00<?, ?it/s]
327
+
328
+
329
+
330
+
331
+
332
+
333
+
334
+
335
  0%| | 0/12 [00:00<?, ?it/s]
336
  0%| | 0/13 [00:00<?, ?it/s]
337
  0%| | 0/12 [00:00<?, ?it/s]
338
  0%| | 0/12 [00:00<?, ?it/s]
339
  0%| | 0/12 [00:00<?, ?it/s]
340
  0%| | 0/13 [00:00<?, ?it/s]
341
  0%| | 0/13 [00:00<?, ?it/s]
342
  0%| | 0/13 [00:00<?, ?it/s]
343
+
344
+
345
+
346
+
347
+
348
+
349
+
350
+
351
  0%| | 0/13 [00:00<?, ?it/s]
352
  0%| | 0/12 [00:00<?, ?it/s]
353
  0%| | 0/12 [00:00<?, ?it/s]
354
  0%| | 0/12 [00:00<?, ?it/s]
355
  0%| | 0/13 [00:00<?, ?it/s]
356
  0%| | 0/12 [00:00<?, ?it/s]
357
  0%| | 0/13 [00:00<?, ?it/s]
358
  0%| | 0/13 [00:00<?, ?it/s]
359
+
360
+
361
+
362
+
363
+
364
+
365
+
366
+
367
+ 2025-11-19 17:40:40.463 | WARNING | src.generation:generate:53 - The arguments ('add_bos_token', 'sigma', 'eot_token_id') are not supported by the generation strategy 'vanilla'.
368
+ 2025-11-19 17:40:40.464 | WARNING | src.generation:generate:53 - The arguments ('eot_token_id', 'sigma', 'add_bos_token') are not supported by the generation strategy 'vanilla'.
369
+ 2025-11-19 17:40:40.465 | WARNING | src.generation:generate:53 - The arguments ('eot_token_id', 'add_bos_token', 'sigma') are not supported by the generation strategy 'vanilla'.
370
+ 2025-11-19 17:40:40.465 | WARNING | src.generation:generate:53 - The arguments ('eot_token_id', 'add_bos_token', 'sigma') are not supported by the generation strategy 'vanilla'.
371
+ 2025-11-19 17:40:40.466 | WARNING | src.generation:generate:53 - The arguments ('add_bos_token', 'eot_token_id', 'sigma') are not supported by the generation strategy 'vanilla'.
372
+ 2025-11-19 17:40:40.467 | WARNING | src.generation:generate:53 - The arguments ('eot_token_id', 'sigma', 'add_bos_token') are not supported by the generation strategy 'vanilla'.
373
+ 2025-11-19 17:40:40.476 | WARNING | src.generation:generate:53 - The arguments ('add_bos_token', 'eot_token_id', 'sigma') are not supported by the generation strategy 'vanilla'.
374
+
375
+ 2025-11-19 18:22:21.069 | INFO | __main__:main:87 - Throughput: 7.56 tokens/sec, Tokens per step: 0.89 tokens/step (full: 19.51 tokens/sec, 1.00 tokens/step), Latency: 13.16 s, Average Input Length: 1360.60 tokens, Peak GPU Memory: 17.24 GB, Total time: 2384.56 s
376
+ 2025-11-19 18:22:21.153 | INFO | __main__:main:108 - Results saved to /xfr_ceph_sh/liuchonghan/HEAT/heat/outputs/2025-11-19/exp1_len256/dream-inst/dllm/mmlu_pro/results.json
377
+ 2025-11-19 18:22:21.153 | INFO | __main__:main:111 - eval time: 2384.56 seconds
378
+ [rank0]:[W1119 18:22:21.341088084 ProcessGroupNCCL.cpp:1479] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/.hydra/config.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ generation:
3
+ mask_token_id: 151666
4
+ eot_token_id: 151643
5
+ pad_token_id: 151643
6
+ add_bos_token: true
7
+ alg: maskgit_plus
8
+ name: dream-inst
9
+ path: ${oc.env:DREAM_INST_PATH}
10
+ generation:
11
+ strategy: vanilla
12
+ threshold: null
13
+ factor: null
14
+ alg: maskgit_plus
15
+ gen_length: null
16
+ block_length: null
17
+ steps: null
18
+ temperature: 0.0
19
+ top_p: null
20
+ top_k: null
21
+ debias: false
22
+ output_probs: false
23
+ cache:
24
+ _target_: src.cache.PrefixCache
25
+ use_dual: false
26
+ seed: 1234
27
+ batch_size: 1
28
+ attn_implementation: eager
29
+ dataset:
30
+ name: mmlu_pro
31
+ size: null
32
+ n_shot: null
33
+ system_prompt: null
34
+ batch_size: 1
35
+ mc_num: null
36
+ max_length: 4096
37
+ is_check_greedy: true
38
+ add_bos_token: true
39
+ nll_type: mc
40
+ log_type: ftb
41
+ eval_args:
42
+ log_samples: true
43
+ tasks: ${..dataset.name}
44
+ num_fewshot: ${..dataset.n_shot}
45
+ batch_size: ${..batch_size}
46
+ limit: ${..dataset.size}
47
+ confirm_run_unsafe_code: true
48
+ random_seed: ${..seed}
49
+ fewshot_random_seed: ${..seed}
50
+ numpy_random_seed: ${..seed}
51
+ torch_random_seed: ${..seed}
2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/.hydra/hydra.yaml ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/2025-11-19/exp1_len256/dream-inst/prefix/mmlu_pro
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.run.dir=outputs/2025-11-19/exp1_len256/dream-inst/prefix/mmlu_pro
114
+ - hydra.mode=RUN
115
+ task:
116
+ - dataset.name=mmlu_pro
117
+ - model=dream-inst
118
+ - cache=prefix
119
+ - generation=vanilla
120
+ - batch_size=1
121
+ - seed=1234
122
+ job:
123
+ name: eval
124
+ chdir: null
125
+ override_dirname: batch_size=1,cache=prefix,dataset.name=mmlu_pro,generation=vanilla,model=dream-inst,seed=1234
126
+ id: ???
127
+ num: ???
128
+ config_name: eval
129
+ env_set: {}
130
+ env_copy: []
131
+ config:
132
+ override_dirname:
133
+ kv_sep: '='
134
+ item_sep: ','
135
+ exclude_keys: []
136
+ runtime:
137
+ version: 1.3.2
138
+ version_base: '1.3'
139
+ cwd: /xfr_ceph_sh/liuchonghan/HEAT/heat
140
+ config_sources:
141
+ - path: hydra.conf
142
+ schema: pkg
143
+ provider: hydra
144
+ - path: /xfr_ceph_sh/liuchonghan/HEAT/heat/configs
145
+ schema: file
146
+ provider: main
147
+ - path: ''
148
+ schema: structured
149
+ provider: schema
150
+ output_dir: /xfr_ceph_sh/liuchonghan/HEAT/heat/outputs/2025-11-19/exp1_len256/dream-inst/prefix/mmlu_pro
151
+ choices:
152
+ cache: prefix
153
+ generation: vanilla
154
+ model: dream-inst
155
+ hydra/env: default
156
+ hydra/callbacks: null
157
+ hydra/job_logging: default
158
+ hydra/hydra_logging: default
159
+ hydra/hydra_help: default
160
+ hydra/help: default
161
+ hydra/sweeper: basic
162
+ hydra/launcher: basic
163
+ hydra/output: default
164
+ verbose: false
2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/.hydra/overrides.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ - dataset.name=mmlu_pro
2
+ - model=dream-inst
3
+ - cache=prefix
4
+ - generation=vanilla
5
+ - batch_size=1
6
+ - seed=1234
2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/eval.log ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-11-19 18:22:51,650][accelerate.utils.other][WARNING] - Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
2
+ [2025-11-19 18:22:59,357][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
3
+ [2025-11-19 18:22:59,358][lm_eval.evaluator][INFO] - Using pre-initialized model
4
+ [2025-11-19 18:23:00,307][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
5
+ [2025-11-19 18:23:00,307][lm_eval.evaluator][INFO] - Using pre-initialized model
6
+ [2025-11-19 18:23:00,571][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
7
+ [2025-11-19 18:23:00,571][lm_eval.evaluator][INFO] - Using pre-initialized model
8
+ [2025-11-19 18:23:00,608][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
9
+ [2025-11-19 18:23:00,608][lm_eval.evaluator][INFO] - Using pre-initialized model
10
+ [2025-11-19 18:23:00,753][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
11
+ [2025-11-19 18:23:00,753][lm_eval.evaluator][INFO] - Using pre-initialized model
12
+ [2025-11-19 18:23:01,333][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
13
+ [2025-11-19 18:23:01,333][lm_eval.evaluator][INFO] - Using pre-initialized model
14
+ [2025-11-19 18:23:02,643][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
15
+ [2025-11-19 18:23:02,643][lm_eval.evaluator][INFO] - Using pre-initialized model
16
+ [2025-11-19 18:23:03,085][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
17
+ [2025-11-19 18:23:03,086][lm_eval.evaluator][INFO] - Using pre-initialized model
18
+ [2025-11-19 18:23:40,845][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
19
+ [2025-11-19 18:23:40,845][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
20
+ [2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
21
+ [2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
22
+ [2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
23
+ [2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
24
+ [2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
25
+ [2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
26
+ [2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
27
+ [2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
28
+ [2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
29
+ [2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
30
+ [2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
31
+ [2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
32
+ [2025-11-19 18:23:40,849][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 1...
33
+ [2025-11-19 18:23:41,778][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
34
+ [2025-11-19 18:23:41,779][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
35
+ [2025-11-19 18:23:41,779][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
36
+ [2025-11-19 18:23:41,779][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
37
+ [2025-11-19 18:23:41,780][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
38
+ [2025-11-19 18:23:41,780][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
39
+ [2025-11-19 18:23:41,780][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
40
+ [2025-11-19 18:23:41,780][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
41
+ [2025-11-19 18:23:41,780][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
42
+ [2025-11-19 18:23:41,780][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
43
+ [2025-11-19 18:23:41,780][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
44
+ [2025-11-19 18:23:41,781][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
45
+ [2025-11-19 18:23:41,781][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
46
+ [2025-11-19 18:23:41,781][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
47
+ [2025-11-19 18:23:41,787][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 6...
48
+ [2025-11-19 18:23:42,857][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
49
+ [2025-11-19 18:23:42,857][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
50
+ [2025-11-19 18:23:42,857][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
51
+ [2025-11-19 18:23:42,858][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
52
+ [2025-11-19 18:23:42,858][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
53
+ [2025-11-19 18:23:42,858][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
54
+ [2025-11-19 18:23:42,858][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
55
+ [2025-11-19 18:23:42,858][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
56
+ [2025-11-19 18:23:42,858][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
57
+ [2025-11-19 18:23:42,859][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
58
+ [2025-11-19 18:23:42,859][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
59
+ [2025-11-19 18:23:42,859][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
60
+ [2025-11-19 18:23:42,859][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
61
+ [2025-11-19 18:23:42,859][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
62
+ [2025-11-19 18:23:42,865][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 7...
63
+ [2025-11-19 18:23:43,755][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
64
+ [2025-11-19 18:23:43,755][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
65
+ [2025-11-19 18:23:43,755][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
66
+ [2025-11-19 18:23:43,755][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
67
+ [2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
68
+ [2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
69
+ [2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
70
+ [2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
71
+ [2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
72
+ [2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
73
+ [2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
74
+ [2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
75
+ [2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
76
+ [2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
77
+ [2025-11-19 18:23:43,759][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 3...
78
+ [2025-11-19 18:23:45,734][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
79
+ [2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
80
+ [2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
81
+ [2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
82
+ [2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
83
+ [2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
84
+ [2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
85
+ [2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
86
+ [2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
87
+ [2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
88
+ [2025-11-19 18:23:45,736][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
89
+ [2025-11-19 18:23:45,736][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
90
+ [2025-11-19 18:23:45,736][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
91
+ [2025-11-19 18:23:45,736][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
92
+ [2025-11-19 18:23:45,738][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 0...
93
+ [2025-11-19 18:23:54,454][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
94
+ [2025-11-19 18:23:54,454][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
95
+ [2025-11-19 18:23:54,454][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
96
+ [2025-11-19 18:23:54,454][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
97
+ [2025-11-19 18:23:54,454][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
98
+ [2025-11-19 18:23:54,454][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
99
+ [2025-11-19 18:23:54,454][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
100
+ [2025-11-19 18:23:54,455][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
101
+ [2025-11-19 18:23:54,455][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
102
+ [2025-11-19 18:23:54,455][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
103
+ [2025-11-19 18:23:54,455][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
104
+ [2025-11-19 18:23:54,455][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
105
+ [2025-11-19 18:23:54,455][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
106
+ [2025-11-19 18:23:54,455][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
107
+ [2025-11-19 18:23:54,458][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 4...
108
+ [2025-11-19 18:23:57,947][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
109
+ [2025-11-19 18:23:57,947][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
110
+ [2025-11-19 18:23:57,947][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
111
+ [2025-11-19 18:23:57,947][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
112
+ [2025-11-19 18:23:57,947][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
113
+ [2025-11-19 18:23:57,947][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
114
+ [2025-11-19 18:23:57,948][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
115
+ [2025-11-19 18:23:57,948][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
116
+ [2025-11-19 18:23:57,948][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
117
+ [2025-11-19 18:23:57,948][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
118
+ [2025-11-19 18:23:57,948][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
119
+ [2025-11-19 18:23:57,948][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
120
+ [2025-11-19 18:23:57,948][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
121
+ [2025-11-19 18:23:57,948][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
122
+ [2025-11-19 18:23:57,951][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 2...
123
+ [2025-11-19 18:23:58,344][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
124
+ [2025-11-19 18:23:58,344][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
125
+ [2025-11-19 18:23:58,344][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
126
+ [2025-11-19 18:23:58,344][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
127
+ [2025-11-19 18:23:58,344][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
128
+ [2025-11-19 18:23:58,344][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
129
+ [2025-11-19 18:23:58,344][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
130
+ [2025-11-19 18:23:58,345][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
131
+ [2025-11-19 18:23:58,345][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
132
+ [2025-11-19 18:23:58,345][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
133
+ [2025-11-19 18:23:58,345][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
134
+ [2025-11-19 18:23:58,345][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
135
+ [2025-11-19 18:23:58,345][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
136
+ [2025-11-19 18:23:58,345][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
137
+ [2025-11-19 18:23:58,347][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 5...
138
+ [2025-11-19 18:24:01,625][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 0...
139
+ [2025-11-19 18:24:01,625][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 4...
140
+ [2025-11-19 18:24:01,626][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 5...
141
+ [2025-11-19 18:24:01,626][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 7...
142
+ [2025-11-19 18:24:01,626][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 1...
143
+ [2025-11-19 18:24:01,626][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 2...
144
+ [2025-11-19 18:24:01,626][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 3...
145
+ [2025-11-19 18:24:01,626][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 6...
146
+ [2025-11-19 18:24:01,669][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 1...
147
+ [2025-11-19 18:24:01,669][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 6...
148
+ [2025-11-19 18:24:01,669][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 0...
149
+ [2025-11-19 18:24:01,669][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 4...
150
+ [2025-11-19 18:24:01,669][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 5...
151
+ [2025-11-19 18:24:01,669][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 2...
152
+ [2025-11-19 18:24:01,669][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 3...
153
+ [2025-11-19 18:24:01,669][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 7...
154
+ [2025-11-19 18:24:01,690][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 0...
155
+ [2025-11-19 18:24:01,690][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 1...
156
+ [2025-11-19 18:24:01,690][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 2...
157
+ [2025-11-19 18:24:01,690][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 5...
158
+ [2025-11-19 18:24:01,690][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 6...
159
+ [2025-11-19 18:24:01,690][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 4...
160
+ [2025-11-19 18:24:01,690][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 7...
161
+ [2025-11-19 18:24:01,690][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 3...
162
+ [2025-11-19 18:24:01,710][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 0...
163
+ [2025-11-19 18:24:01,710][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 1...
164
+ [2025-11-19 18:24:01,710][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 4...
165
+ [2025-11-19 18:24:01,710][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 5...
166
+ [2025-11-19 18:24:01,710][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 6...
167
+ [2025-11-19 18:24:01,710][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 7...
168
+ [2025-11-19 18:24:01,710][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 2...
169
+ [2025-11-19 18:24:01,710][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 3...
170
+ [2025-11-19 18:24:01,730][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 0...
171
+ [2025-11-19 18:24:01,730][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 1...
172
+ [2025-11-19 18:24:01,730][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 7...
173
+ [2025-11-19 18:24:01,730][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 5...
174
+ [2025-11-19 18:24:01,730][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 4...
175
+ [2025-11-19 18:24:01,730][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 6...
176
+ [2025-11-19 18:24:01,730][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 3...
177
+ [2025-11-19 18:24:01,731][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 2...
178
+ [2025-11-19 18:24:01,751][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 0...
179
+ [2025-11-19 18:24:01,751][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 1...
180
+ [2025-11-19 18:24:01,751][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 7...
181
+ [2025-11-19 18:24:01,751][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 2...
182
+ [2025-11-19 18:24:01,751][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 4...
183
+ [2025-11-19 18:24:01,751][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 6...
184
+ [2025-11-19 18:24:01,751][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 5...
185
+ [2025-11-19 18:24:01,751][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 3...
186
+ [2025-11-19 18:24:01,771][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 1...
187
+ [2025-11-19 18:24:01,771][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 7...
188
+ [2025-11-19 18:24:01,771][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 0...
189
+ [2025-11-19 18:24:01,771][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 4...
190
+ [2025-11-19 18:24:01,771][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 6...
191
+ [2025-11-19 18:24:01,771][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 2...
192
+ [2025-11-19 18:24:01,771][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 5...
193
+ [2025-11-19 18:24:01,771][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 3...
194
+ [2025-11-19 18:24:01,791][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 1...
195
+ [2025-11-19 18:24:01,791][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 6...
196
+ [2025-11-19 18:24:01,791][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 4...
197
+ [2025-11-19 18:24:01,791][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 5...
198
+ [2025-11-19 18:24:01,791][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 7...
199
+ [2025-11-19 18:24:01,791][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 0...
200
+ [2025-11-19 18:24:01,791][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 3...
201
+ [2025-11-19 18:24:01,791][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 2...
202
+ [2025-11-19 18:24:01,811][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 1...
203
+ [2025-11-19 18:24:01,811][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 3...
204
+ [2025-11-19 18:24:01,811][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 0...
205
+ [2025-11-19 18:24:01,811][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 4...
206
+ [2025-11-19 18:24:01,811][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 5...
207
+ [2025-11-19 18:24:01,811][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 6...
208
+ [2025-11-19 18:24:01,811][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 7...
209
+ [2025-11-19 18:24:01,811][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 2...
210
+ [2025-11-19 18:24:01,832][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 1...
211
+ [2025-11-19 18:24:01,832][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 0...
212
+ [2025-11-19 18:24:01,832][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 6...
213
+ [2025-11-19 18:24:01,832][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 5...
214
+ [2025-11-19 18:24:01,832][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 7...
215
+ [2025-11-19 18:24:01,832][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 3...
216
+ [2025-11-19 18:24:01,832][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 4...
217
+ [2025-11-19 18:24:01,832][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 2...
218
+ [2025-11-19 18:24:01,852][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 1...
219
+ [2025-11-19 18:24:01,852][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 6...
220
+ [2025-11-19 18:24:01,852][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 3...
221
+ [2025-11-19 18:24:01,852][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 0...
222
+ [2025-11-19 18:24:01,852][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 7...
223
+ [2025-11-19 18:24:01,852][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 5...
224
+ [2025-11-19 18:24:01,852][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 4...
225
+ [2025-11-19 18:24:01,852][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 2...
226
+ [2025-11-19 18:24:01,873][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 2...
227
+ [2025-11-19 18:24:01,873][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 6...
228
+ [2025-11-19 18:24:01,873][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 7...
229
+ [2025-11-19 18:24:01,873][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 1...
230
+ [2025-11-19 18:24:01,873][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 5...
231
+ [2025-11-19 18:24:01,873][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 3...
232
+ [2025-11-19 18:24:01,873][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 4...
233
+ [2025-11-19 18:24:01,873][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 0...
234
+ [2025-11-19 18:24:01,893][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 2...
235
+ [2025-11-19 18:24:01,893][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 1...
236
+ [2025-11-19 18:24:01,893][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 7...
237
+ [2025-11-19 18:24:01,893][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 4...
238
+ [2025-11-19 18:24:01,893][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 6...
239
+ [2025-11-19 18:24:01,893][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 0...
240
+ [2025-11-19 18:24:01,893][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 5...
241
+ [2025-11-19 18:24:01,893][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 3...
242
+ [2025-11-19 18:24:01,913][lm_eval.evaluator][INFO] - Running generate_until requests
243
+ [2025-11-19 18:24:01,913][lm_eval.evaluator][INFO] - Running generate_until requests
244
+ [2025-11-19 18:24:01,913][lm_eval.evaluator][INFO] - Running generate_until requests
245
+ [2025-11-19 18:24:01,913][lm_eval.evaluator][INFO] - Running generate_until requests
246
+ [2025-11-19 18:24:01,913][lm_eval.evaluator][INFO] - Running generate_until requests
247
+ [2025-11-19 18:24:01,913][lm_eval.evaluator][INFO] - Running generate_until requests
248
+ [2025-11-19 18:24:01,913][lm_eval.evaluator][INFO] - Running generate_until requests
249
+ [2025-11-19 18:24:01,913][lm_eval.evaluator][INFO] - Running generate_until requests
2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1230760c961dd9022be0aec4f3f825675b214710c9f5cd95b876255b60bfc0cd
3
+ size 10524080
2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/stderr.log ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0
  0%| | 0/13 [00:00<?, ?it/s]
 
1
  0%| | 0/12 [00:00<?, ?it/s]
 
2
  0%| | 0/12 [00:00<?, ?it/s]
 
3
  0%| | 0/13 [00:00<?, ?it/s]
 
4
  0%| | 0/13 [00:00<?, ?it/s]
 
5
  0%| | 0/12 [00:00<?, ?it/s]
 
6
  0%| | 0/13 [00:00<?, ?it/s]
 
7
  0%| | 0/12 [00:00<?, ?it/s]
 
8
  0%| | 0/12 [00:00<?, ?it/s]
9
  0%| | 0/13 [00:00<?, ?it/s]
10
  0%| | 0/13 [00:00<?, ?it/s]
11
  0%| | 0/13 [00:00<?, ?it/s]
12
  0%| | 0/12 [00:00<?, ?it/s]
13
  0%| | 0/12 [00:00<?, ?it/s]
14
  0%| | 0/13 [00:00<?, ?it/s]
 
 
15
  0%| | 0/12 [00:00<?, ?it/s]
 
 
 
 
 
 
16
  0%| | 0/13 [00:00<?, ?it/s]
17
  0%| | 0/13 [00:00<?, ?it/s]
18
  0%| | 0/13 [00:00<?, ?it/s]
19
  0%| | 0/12 [00:00<?, ?it/s]
20
  0%| | 0/12 [00:00<?, ?it/s]
21
  0%| | 0/12 [00:00<?, ?it/s]
22
  0%| | 0/13 [00:00<?, ?it/s]
23
  0%| | 0/12 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
24
  0%| | 0/13 [00:00<?, ?it/s]
25
  0%| | 0/12 [00:00<?, ?it/s]
26
  0%| | 0/13 [00:00<?, ?it/s]
27
  0%| | 0/12 [00:00<?, ?it/s]
28
  0%| | 0/13 [00:00<?, ?it/s]
29
  0%| | 0/12 [00:00<?, ?it/s]
30
  0%| | 0/12 [00:00<?, ?it/s]
31
  0%| | 0/13 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
32
  0%| | 0/13 [00:00<?, ?it/s]
33
  0%| | 0/12 [00:00<?, ?it/s]
34
  0%| | 0/13 [00:00<?, ?it/s]
35
  0%| | 0/12 [00:00<?, ?it/s]
36
  0%| | 0/12 [00:00<?, ?it/s]
37
  0%| | 0/12 [00:00<?, ?it/s]
38
  0%| | 0/13 [00:00<?, ?it/s]
39
  0%| | 0/13 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
40
  0%| | 0/13 [00:00<?, ?it/s]
41
  0%| | 0/13 [00:00<?, ?it/s]
42
  0%| | 0/12 [00:00<?, ?it/s]
43
  0%| | 0/12 [00:00<?, ?it/s]
44
  0%| | 0/12 [00:00<?, ?it/s]
45
  0%| | 0/12 [00:00<?, ?it/s]
46
  0%| | 0/13 [00:00<?, ?it/s]
47
  0%| | 0/13 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
48
  0%| | 0/13 [00:00<?, ?it/s]
49
  0%| | 0/13 [00:00<?, ?it/s]
50
  0%| | 0/12 [00:00<?, ?it/s]
51
  0%| | 0/13 [00:00<?, ?it/s]
52
  0%| | 0/12 [00:00<?, ?it/s]
53
  0%| | 0/13 [00:00<?, ?it/s]
54
  0%| | 0/12 [00:00<?, ?it/s]
55
  0%| | 0/12 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
56
  0%| | 0/13 [00:00<?, ?it/s]
57
  0%| | 0/13 [00:00<?, ?it/s]
58
  0%| | 0/13 [00:00<?, ?it/s]
59
  0%| | 0/13 [00:00<?, ?it/s]
60
  0%| | 0/12 [00:00<?, ?it/s]
61
  0%| | 0/12 [00:00<?, ?it/s]
62
  0%| | 0/12 [00:00<?, ?it/s]
63
  0%| | 0/12 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
64
  0%| | 0/13 [00:00<?, ?it/s]
65
  0%| | 0/13 [00:00<?, ?it/s]
66
  0%| | 0/12 [00:00<?, ?it/s]
67
  0%| | 0/12 [00:00<?, ?it/s]
68
  0%| | 0/13 [00:00<?, ?it/s]
69
  0%| | 0/12 [00:00<?, ?it/s]
70
  0%| | 0/12 [00:00<?, ?it/s]
71
  0%| | 0/13 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
72
  0%| | 0/13 [00:00<?, ?it/s]
73
  0%| | 0/12 [00:00<?, ?it/s]
74
  0%| | 0/12 [00:00<?, ?it/s]
75
  0%| | 0/13 [00:00<?, ?it/s]
76
  0%| | 0/13 [00:00<?, ?it/s]
77
  0%| | 0/13 [00:00<?, ?it/s]
78
  0%| | 0/12 [00:00<?, ?it/s]
79
  0%| | 0/12 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
80
  0%| | 0/13 [00:00<?, ?it/s]
81
  0%| | 0/13 [00:00<?, ?it/s]
82
  0%| | 0/13 [00:00<?, ?it/s]
83
  0%| | 0/12 [00:00<?, ?it/s]
84
  0%| | 0/12 [00:00<?, ?it/s]
85
  0%| | 0/12 [00:00<?, ?it/s]
86
  0%| | 0/12 [00:00<?, ?it/s]
87
  0%| | 0/13 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
88
  0%| | 0/13 [00:00<?, ?it/s]
89
  0%| | 0/13 [00:00<?, ?it/s]
90
  0%| | 0/12 [00:00<?, ?it/s]
91
  0%| | 0/12 [00:00<?, ?it/s]
92
  0%| | 0/13 [00:00<?, ?it/s]
93
  0%| | 0/12 [00:00<?, ?it/s]
94
  0%| | 0/13 [00:00<?, ?it/s]
95
  0%| | 0/12 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
96
  0%| | 0/12 [00:00<?, ?it/s]
97
  0%| | 0/13 [00:00<?, ?it/s]
98
  0%| | 0/13 [00:00<?, ?it/s]
99
  0%| | 0/13 [00:00<?, ?it/s]
100
  0%| | 0/12 [00:00<?, ?it/s]
101
  0%| | 0/13 [00:00<?, ?it/s]
102
  0%| | 0/12 [00:00<?, ?it/s]
103
  0%| | 0/12 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
104
  0%| | 0/13 [00:00<?, ?it/s]
105
  0%| | 0/12 [00:00<?, ?it/s]
106
  0%| | 0/13 [00:00<?, ?it/s]
107
  0%| | 0/12 [00:00<?, ?it/s]
108
  0%| | 0/13 [00:00<?, ?it/s]
109
  0%| | 0/13 [00:00<?, ?it/s]
110
  0%| | 0/12 [00:00<?, ?it/s]
111
  0%| | 0/12 [00:00<?, ?it/s]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ipex flag is deprecated, will be removed in Accelerate v1.10. From 2.7.0, PyTorch has all needed optimizations for Intel CPU and XPU.
2
+ The following values were not passed to `accelerate launch` and had defaults used instead:
3
+ More than one GPU was found, enabling multi-GPU training.
4
+ If this was unintended please pass in `--num_processes=1`.
5
+ `--mixed_precision` was set to a value of `'no'`
6
+ `--dynamo_backend` was set to a value of `'no'`
7
+ To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.
8
+ [W1119 18:22:37.226195129 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
9
+ 2025-11-19 18:22:45.937 | INFO | src.utils:pre_initialize:603 - {'strategy': 'vanilla', 'threshold': None, 'factor': None, 'alg': 'maskgit_plus', 'gen_length': 256, 'block_length': 32, 'steps': 256, 'temperature': 0.0, 'top_p': 0.9, 'top_k': None, 'debias': False, 'output_probs': False, 'mask_token_id': 151666, 'eot_token_id': 151643, 'pad_token_id': 151643, 'add_bos_token': True, 'sigma': None}
10
+ [W1119 18:22:46.786781829 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
11
+ [W1119 18:22:49.307737806 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
12
+ [W1119 18:22:49.337649472 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
13
+ [W1119 18:22:49.770033713 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
14
+ [W1119 18:22:50.165484231 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
15
+ [W1119 18:22:50.184086372 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
16
+ [W1119 18:22:50.184270291 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
17
+ [W1119 18:22:50.186893603 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
18
+
19
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
20
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
21
+
22
+
23
+
24
+
25
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
26
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
27
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
28
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
29
+
30
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
31
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
32
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
33
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
34
+
35
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
36
+
37
+
38
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
39
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
40
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
41
+ The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
42
+ 2025-11-19 18:22:59.355 | INFO | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
43
+ 2025-11-19 18:23:00.303 | INFO | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
44
+ 2025-11-19 18:23:00.569 | INFO | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
45
+ 2025-11-19 18:23:00.605 | INFO | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
46
+ 2025-11-19 18:23:00.751 | INFO | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
47
+ 2025-11-19 18:23:01.331 | INFO | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
48
+ 2025-11-19 18:23:02.640 | INFO | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
49
+ 2025-11-19 18:23:03.083 | INFO | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
50
+
51
  0%| | 0/13 [00:00<?, ?it/s]
52
+
53
  0%| | 0/12 [00:00<?, ?it/s]
54
+
55
  0%| | 0/12 [00:00<?, ?it/s]
56
+
57
  0%| | 0/13 [00:00<?, ?it/s]
58
+
59
  0%| | 0/13 [00:00<?, ?it/s]
60
+
61
  0%| | 0/12 [00:00<?, ?it/s]
62
+
63
  0%| | 0/13 [00:00<?, ?it/s]
64
+
65
  0%| | 0/12 [00:00<?, ?it/s]
66
+
67
  0%| | 0/12 [00:00<?, ?it/s]
68
  0%| | 0/13 [00:00<?, ?it/s]
69
  0%| | 0/13 [00:00<?, ?it/s]
70
  0%| | 0/13 [00:00<?, ?it/s]
71
  0%| | 0/12 [00:00<?, ?it/s]
72
  0%| | 0/12 [00:00<?, ?it/s]
73
  0%| | 0/13 [00:00<?, ?it/s]
74
+
75
+
76
  0%| | 0/12 [00:00<?, ?it/s]
77
+
78
+
79
+
80
+
81
+
82
+
83
  0%| | 0/13 [00:00<?, ?it/s]
84
  0%| | 0/13 [00:00<?, ?it/s]
85
  0%| | 0/13 [00:00<?, ?it/s]
86
  0%| | 0/12 [00:00<?, ?it/s]
87
  0%| | 0/12 [00:00<?, ?it/s]
88
  0%| | 0/12 [00:00<?, ?it/s]
89
  0%| | 0/13 [00:00<?, ?it/s]
90
  0%| | 0/12 [00:00<?, ?it/s]
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
  0%| | 0/13 [00:00<?, ?it/s]
100
  0%| | 0/12 [00:00<?, ?it/s]
101
  0%| | 0/13 [00:00<?, ?it/s]
102
  0%| | 0/12 [00:00<?, ?it/s]
103
  0%| | 0/13 [00:00<?, ?it/s]
104
  0%| | 0/12 [00:00<?, ?it/s]
105
  0%| | 0/12 [00:00<?, ?it/s]
106
  0%| | 0/13 [00:00<?, ?it/s]
107
+
108
+
109
+
110
+
111
+
112
+
113
+
114
+
115
  0%| | 0/13 [00:00<?, ?it/s]
116
  0%| | 0/12 [00:00<?, ?it/s]
117
  0%| | 0/13 [00:00<?, ?it/s]
118
  0%| | 0/12 [00:00<?, ?it/s]
119
  0%| | 0/12 [00:00<?, ?it/s]
120
  0%| | 0/12 [00:00<?, ?it/s]
121
  0%| | 0/13 [00:00<?, ?it/s]
122
  0%| | 0/13 [00:00<?, ?it/s]
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+
131
  0%| | 0/13 [00:00<?, ?it/s]
132
  0%| | 0/13 [00:00<?, ?it/s]
133
  0%| | 0/12 [00:00<?, ?it/s]
134
  0%| | 0/12 [00:00<?, ?it/s]
135
  0%| | 0/12 [00:00<?, ?it/s]
136
  0%| | 0/12 [00:00<?, ?it/s]
137
  0%| | 0/13 [00:00<?, ?it/s]
138
  0%| | 0/13 [00:00<?, ?it/s]
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
  0%| | 0/13 [00:00<?, ?it/s]
148
  0%| | 0/13 [00:00<?, ?it/s]
149
  0%| | 0/12 [00:00<?, ?it/s]
150
  0%| | 0/13 [00:00<?, ?it/s]
151
  0%| | 0/12 [00:00<?, ?it/s]
152
  0%| | 0/13 [00:00<?, ?it/s]
153
  0%| | 0/12 [00:00<?, ?it/s]
154
  0%| | 0/12 [00:00<?, ?it/s]
155
+
156
+
157
+
158
+
159
+
160
+
161
+
162
+
163
  0%| | 0/13 [00:00<?, ?it/s]
164
  0%| | 0/13 [00:00<?, ?it/s]
165
  0%| | 0/13 [00:00<?, ?it/s]
166
  0%| | 0/13 [00:00<?, ?it/s]
167
  0%| | 0/12 [00:00<?, ?it/s]
168
  0%| | 0/12 [00:00<?, ?it/s]
169
  0%| | 0/12 [00:00<?, ?it/s]
170
  0%| | 0/12 [00:00<?, ?it/s]
171
+
172
+
173
+
174
+
175
+
176
+
177
+
178
+
179
  0%| | 0/13 [00:00<?, ?it/s]
180
  0%| | 0/13 [00:00<?, ?it/s]
181
  0%| | 0/12 [00:00<?, ?it/s]
182
  0%| | 0/12 [00:00<?, ?it/s]
183
  0%| | 0/13 [00:00<?, ?it/s]
184
  0%| | 0/12 [00:00<?, ?it/s]
185
  0%| | 0/12 [00:00<?, ?it/s]
186
  0%| | 0/13 [00:00<?, ?it/s]
187
+
188
+
189
+
190
+
191
+
192
+
193
+
194
+
195
  0%| | 0/13 [00:00<?, ?it/s]
196
  0%| | 0/12 [00:00<?, ?it/s]
197
  0%| | 0/12 [00:00<?, ?it/s]
198
  0%| | 0/13 [00:00<?, ?it/s]
199
  0%| | 0/13 [00:00<?, ?it/s]
200
  0%| | 0/13 [00:00<?, ?it/s]
201
  0%| | 0/12 [00:00<?, ?it/s]
202
  0%| | 0/12 [00:00<?, ?it/s]
203
+
204
+
205
+
206
+
207
+
208
+
209
+
210
+
211
  0%| | 0/13 [00:00<?, ?it/s]
212
  0%| | 0/13 [00:00<?, ?it/s]
213
  0%| | 0/13 [00:00<?, ?it/s]
214
  0%| | 0/12 [00:00<?, ?it/s]
215
  0%| | 0/12 [00:00<?, ?it/s]
216
  0%| | 0/12 [00:00<?, ?it/s]
217
  0%| | 0/12 [00:00<?, ?it/s]
218
  0%| | 0/13 [00:00<?, ?it/s]
219
+
220
+
221
+
222
+
223
+
224
+
225
+
226
+
227
  0%| | 0/13 [00:00<?, ?it/s]
228
  0%| | 0/13 [00:00<?, ?it/s]
229
  0%| | 0/12 [00:00<?, ?it/s]
230
  0%| | 0/12 [00:00<?, ?it/s]
231
  0%| | 0/13 [00:00<?, ?it/s]
232
  0%| | 0/12 [00:00<?, ?it/s]
233
  0%| | 0/13 [00:00<?, ?it/s]
234
  0%| | 0/12 [00:00<?, ?it/s]
235
+
236
+
237
+
238
+
239
+
240
+
241
+
242
+
243
  0%| | 0/12 [00:00<?, ?it/s]
244
  0%| | 0/13 [00:00<?, ?it/s]
245
  0%| | 0/13 [00:00<?, ?it/s]
246
  0%| | 0/13 [00:00<?, ?it/s]
247
  0%| | 0/12 [00:00<?, ?it/s]
248
  0%| | 0/13 [00:00<?, ?it/s]
249
  0%| | 0/12 [00:00<?, ?it/s]
250
  0%| | 0/12 [00:00<?, ?it/s]
251
+
252
+
253
+
254
+
255
+
256
+
257
+
258
+
259
  0%| | 0/13 [00:00<?, ?it/s]
260
  0%| | 0/12 [00:00<?, ?it/s]
261
  0%| | 0/13 [00:00<?, ?it/s]
262
  0%| | 0/12 [00:00<?, ?it/s]
263
  0%| | 0/13 [00:00<?, ?it/s]
264
  0%| | 0/13 [00:00<?, ?it/s]
265
  0%| | 0/12 [00:00<?, ?it/s]
266
  0%| | 0/12 [00:00<?, ?it/s]
267
+
268
+
269
+
270
+
271
+
272
+
273
+
274
+
275
+ 2025-11-19 18:24:01.929 | WARNING | src.generation:generate:53 - The arguments ('add_bos_token', 'sigma', 'eot_token_id') are not supported by the generation strategy 'vanilla'.
276
+ 2025-11-19 18:24:01.929 | WARNING | src.generation:generate:53 - The arguments ('add_bos_token', 'eot_token_id', 'sigma') are not supported by the generation strategy 'vanilla'.
277
+ 2025-11-19 18:24:01.929 | WARNING | src.generation:generate:53 - The arguments ('eot_token_id', 'sigma', 'add_bos_token') are not supported by the generation strategy 'vanilla'.
278
+ 2025-11-19 18:24:01.930 | WARNING | src.generation:generate:53 - The arguments ('sigma', 'eot_token_id', 'add_bos_token') are not supported by the generation strategy 'vanilla'.
279
+ 2025-11-19 18:24:01.930 | WARNING | src.generation:generate:53 - The arguments ('add_bos_token', 'sigma', 'eot_token_id') are not supported by the generation strategy 'vanilla'.
280
+ 2025-11-19 18:24:01.944 | WARNING | src.generation:generate:53 - The arguments ('add_bos_token', 'eot_token_id', 'sigma') are not supported by the generation strategy 'vanilla'.
281
+ 2025-11-19 18:24:01.944 | WARNING | src.generation:generate:53 - The arguments ('add_bos_token', 'sigma', 'eot_token_id') are not supported by the generation strategy 'vanilla'.
282
+
283
+ 2025-11-19 19:00:06.282 | INFO | __main__:main:87 - Throughput: 8.65 tokens/sec, Tokens per step: 0.91 tokens/step (full: 22.65 tokens/sec, 1.00 tokens/step), Latency: 11.33 s, Average Input Length: 1360.60 tokens, Peak GPU Memory: 18.50 GB, Total time: 2126.30 s
284
+ 2025-11-19 19:00:06.383 | INFO | __main__:main:108 - Results saved to /xfr_ceph_sh/liuchonghan/HEAT/heat/outputs/2025-11-19/exp1_len256/dream-inst/prefix/mmlu_pro/results.json
285
+ 2025-11-19 19:00:06.383 | INFO | __main__:main:111 - eval time: 2126.30 seconds
286
+ [rank0]:[W1119 19:00:06.522929613 ProcessGroupNCCL.cpp:1479] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())