Upload folder using huggingface_hub

Browse files

Files changed (13) hide show

.gitattributes +2 -0
2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/.hydra/config.yaml +53 -0
2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/.hydra/hydra.yaml +164 -0
2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/.hydra/overrides.yaml +6 -0
2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/eval.log +249 -0
2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/results.json +3 -0
2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/stderr.log +266 -0
2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/.hydra/config.yaml +51 -0
2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/.hydra/hydra.yaml +164 -0
2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/.hydra/overrides.yaml +6 -0
2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/eval.log +249 -0
2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/results.json +3 -0
2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/stderr.log +174 -0

.gitattributes CHANGED Viewed

@@ -38,3 +38,5 @@ outputs/2025-11-18/exp1_len1024/dream-inst/heat/gsm8k/results.json filter=lfs di
 outputs/2025-11-18/exp1_len1024/dream-inst/no_cache/gsm8k/results.json filter=lfs diff=lfs merge=lfs -text
 outputs/2025-11-18/exp1_len1024/dream-inst/prefix/gsm8k/results.json filter=lfs diff=lfs merge=lfs -text
 outputs/2025-11-18/exp2_random_gen/dream-inst/gsm8k_len256/results.json filter=lfs diff=lfs merge=lfs -text

 outputs/2025-11-18/exp1_len1024/dream-inst/no_cache/gsm8k/results.json filter=lfs diff=lfs merge=lfs -text
 outputs/2025-11-18/exp1_len1024/dream-inst/prefix/gsm8k/results.json filter=lfs diff=lfs merge=lfs -text
 outputs/2025-11-18/exp2_random_gen/dream-inst/gsm8k_len256/results.json filter=lfs diff=lfs merge=lfs -text
+2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/results.json filter=lfs diff=lfs merge=lfs -text
+2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/results.json filter=lfs diff=lfs merge=lfs -text

2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/.hydra/config.yaml ADDED Viewed

	@@ -0,0 +1,53 @@

+model:
+  generation:
+    mask_token_id: 151666
+    eot_token_id: 151643
+    pad_token_id: 151643
+    add_bos_token: true
+    alg: maskgit_plus
+  name: dream-inst
+  path: ${oc.env:DREAM_INST_PATH}
+generation:
+  strategy: vanilla
+  threshold: null
+  factor: null
+  alg: maskgit_plus
+  gen_length: null
+  block_length: null
+  steps: null
+  temperature: 0.0
+  top_p: null
+  top_k: null
+  debias: false
+  output_probs: false
+cache:
+  _target_: src.cache.dLLMCache
+  kr: 1
+  kp: 50
+  rou: 0.25
+seed: 1234
+batch_size: 1
+attn_implementation: eager
+dataset:
+  name: mmlu_pro
+  size: null
+  n_shot: null
+  system_prompt: null
+  batch_size: 1
+mc_num: null
+max_length: 4096
+is_check_greedy: true
+add_bos_token: true
+nll_type: mc
+log_type: ftb
+eval_args:
+  log_samples: true
+  tasks: ${..dataset.name}
+  num_fewshot: ${..dataset.n_shot}
+  batch_size: ${..batch_size}
+  limit: ${..dataset.size}
+  confirm_run_unsafe_code: true
+  random_seed: ${..seed}
+  fewshot_random_seed: ${..seed}
+  numpy_random_seed: ${..seed}
+  torch_random_seed: ${..seed}

2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/.hydra/hydra.yaml ADDED Viewed

	@@ -0,0 +1,164 @@

+hydra:
+  run:
+    dir: outputs/2025-11-19/exp1_len256/dream-inst/dllm/mmlu_pro
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+      Use --hydra-help to view Hydra specific help
+      '
+    template: '${hydra.help.header}
+      == Configuration groups ==
+      Compose your configuration from those groups (group=option)
+      $APP_CONFIG_GROUPS
+      == Config ==
+      Override anything in the config (foo.bar=value)
+      $CONFIG
+      ${hydra.help.footer}
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+      See https://hydra.cc for more info.
+      == Flags ==
+      $FLAGS_HELP
+      == Configuration groups ==
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+      $HYDRA_CONFIG_GROUPS
+      Use ''--cfg hydra'' to Show the Hydra config.
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.run.dir=outputs/2025-11-19/exp1_len256/dream-inst/dllm/mmlu_pro
+    - hydra.mode=RUN
+    task:
+    - dataset.name=mmlu_pro
+    - model=dream-inst
+    - cache=dllm
+    - generation=vanilla
+    - batch_size=1
+    - seed=1234
+  job:
+    name: eval
+    chdir: null
+    override_dirname: batch_size=1,cache=dllm,dataset.name=mmlu_pro,generation=vanilla,model=dream-inst,seed=1234
+    id: ???
+    num: ???
+    config_name: eval
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.3'
+    cwd: /xfr_ceph_sh/liuchonghan/HEAT/heat
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /xfr_ceph_sh/liuchonghan/HEAT/heat/configs
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /xfr_ceph_sh/liuchonghan/HEAT/heat/outputs/2025-11-19/exp1_len256/dream-inst/dllm/mmlu_pro
+    choices:
+      cache: dllm
+      generation: vanilla
+      model: dream-inst
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false

2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/.hydra/overrides.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+- dataset.name=mmlu_pro
+- model=dream-inst
+- cache=dllm
+- generation=vanilla
+- batch_size=1
+- seed=1234

2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/eval.log ADDED Viewed

	@@ -0,0 +1,249 @@

+[2025-11-19 17:39:42,871][accelerate.utils.other][WARNING] - Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
+[2025-11-19 17:39:51,105][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+[2025-11-19 17:39:51,105][lm_eval.evaluator][INFO] - Using pre-initialized model
+[2025-11-19 17:39:51,156][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+[2025-11-19 17:39:51,156][lm_eval.evaluator][INFO] - Using pre-initialized model
+[2025-11-19 17:39:51,181][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+[2025-11-19 17:39:51,181][lm_eval.evaluator][INFO] - Using pre-initialized model
+[2025-11-19 17:39:51,303][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+[2025-11-19 17:39:51,303][lm_eval.evaluator][INFO] - Using pre-initialized model
+[2025-11-19 17:39:51,307][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+[2025-11-19 17:39:51,308][lm_eval.evaluator][INFO] - Using pre-initialized model
+[2025-11-19 17:39:51,342][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+[2025-11-19 17:39:51,342][lm_eval.evaluator][INFO] - Using pre-initialized model
+[2025-11-19 17:39:51,374][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+[2025-11-19 17:39:51,374][lm_eval.evaluator][INFO] - Using pre-initialized model
+[2025-11-19 17:39:51,561][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+[2025-11-19 17:39:51,562][lm_eval.evaluator][INFO] - Using pre-initialized model
+[2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,823][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,824][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,825][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 5...
+[2025-11-19 17:40:34,831][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,831][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,831][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,831][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,831][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,831][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,832][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,832][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,832][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,832][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,832][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,832][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,832][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,832][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,833][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 6...
+[2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,849][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,850][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,851][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 2...
+[2025-11-19 17:40:34,860][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,861][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,863][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 1...
+[2025-11-19 17:40:34,911][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,911][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,912][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:34,914][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 0...
+[2025-11-19 17:40:35,057][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,057][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,057][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,057][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,057][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,057][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,058][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,058][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,058][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,058][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,058][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,058][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,058][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,058][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,062][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 4...
+[2025-11-19 17:40:35,085][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,085][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,085][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,085][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,086][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,089][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 7...
+[2025-11-19 17:40:35,119][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,120][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,120][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,120][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,120][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,120][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,121][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,121][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,121][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,121][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,121][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,121][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,121][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,121][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 17:40:35,124][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 3...
+[2025-11-19 17:40:39,984][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 1...
+[2025-11-19 17:40:39,984][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 7...
+[2025-11-19 17:40:39,984][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 5...
+[2025-11-19 17:40:39,984][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 6...
+[2025-11-19 17:40:39,984][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 3...
+[2025-11-19 17:40:39,984][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 4...
+[2025-11-19 17:40:39,985][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 0...
+[2025-11-19 17:40:39,985][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 2...
+[2025-11-19 17:40:40,026][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 6...
+[2025-11-19 17:40:40,026][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 1...
+[2025-11-19 17:40:40,026][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 7...
+[2025-11-19 17:40:40,026][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 4...
+[2025-11-19 17:40:40,026][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 0...
+[2025-11-19 17:40:40,026][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 5...
+[2025-11-19 17:40:40,026][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 2...
+[2025-11-19 17:40:40,026][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 3...
+[2025-11-19 17:40:40,081][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 1...
+[2025-11-19 17:40:40,081][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 2...
+[2025-11-19 17:40:40,081][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 0...
+[2025-11-19 17:40:40,081][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 6...
+[2025-11-19 17:40:40,081][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 5...
+[2025-11-19 17:40:40,081][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 4...
+[2025-11-19 17:40:40,081][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 3...
+[2025-11-19 17:40:40,081][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 7...
+[2025-11-19 17:40:40,132][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 1...
+[2025-11-19 17:40:40,132][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 2...
+[2025-11-19 17:40:40,132][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 0...
+[2025-11-19 17:40:40,132][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 4...
+[2025-11-19 17:40:40,132][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 6...
+[2025-11-19 17:40:40,132][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 7...
+[2025-11-19 17:40:40,132][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 5...
+[2025-11-19 17:40:40,132][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 3...
+[2025-11-19 17:40:40,180][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 6...
+[2025-11-19 17:40:40,180][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 4...
+[2025-11-19 17:40:40,180][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 1...
+[2025-11-19 17:40:40,180][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 7...
+[2025-11-19 17:40:40,180][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 5...
+[2025-11-19 17:40:40,180][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 0...
+[2025-11-19 17:40:40,180][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 2...
+[2025-11-19 17:40:40,180][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 3...
+[2025-11-19 17:40:40,247][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 6...
+[2025-11-19 17:40:40,247][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 0...
+[2025-11-19 17:40:40,247][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 1...
+[2025-11-19 17:40:40,247][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 7...
+[2025-11-19 17:40:40,247][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 4...
+[2025-11-19 17:40:40,247][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 5...
+[2025-11-19 17:40:40,247][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 2...
+[2025-11-19 17:40:40,247][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 3...
+[2025-11-19 17:40:40,281][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 6...
+[2025-11-19 17:40:40,281][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 5...
+[2025-11-19 17:40:40,281][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 7...
+[2025-11-19 17:40:40,281][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 2...
+[2025-11-19 17:40:40,281][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 1...
+[2025-11-19 17:40:40,281][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 0...
+[2025-11-19 17:40:40,282][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 4...
+[2025-11-19 17:40:40,282][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 3...
+[2025-11-19 17:40:40,324][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 2...
+[2025-11-19 17:40:40,324][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 6...
+[2025-11-19 17:40:40,324][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 1...
+[2025-11-19 17:40:40,324][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 4...
+[2025-11-19 17:40:40,324][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 5...
+[2025-11-19 17:40:40,324][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 7...
+[2025-11-19 17:40:40,324][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 0...
+[2025-11-19 17:40:40,324][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 3...
+[2025-11-19 17:40:40,344][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 1...
+[2025-11-19 17:40:40,344][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 6...
+[2025-11-19 17:40:40,344][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 4...
+[2025-11-19 17:40:40,344][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 7...
+[2025-11-19 17:40:40,344][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 5...
+[2025-11-19 17:40:40,344][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 0...
+[2025-11-19 17:40:40,344][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 3...
+[2025-11-19 17:40:40,344][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 2...
+[2025-11-19 17:40:40,365][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 7...
+[2025-11-19 17:40:40,365][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 1...
+[2025-11-19 17:40:40,365][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 6...
+[2025-11-19 17:40:40,365][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 4...
+[2025-11-19 17:40:40,365][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 3...
+[2025-11-19 17:40:40,365][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 5...
+[2025-11-19 17:40:40,365][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 0...
+[2025-11-19 17:40:40,365][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 2...
+[2025-11-19 17:40:40,385][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 7...
+[2025-11-19 17:40:40,385][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 6...
+[2025-11-19 17:40:40,385][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 5...
+[2025-11-19 17:40:40,385][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 1...
+[2025-11-19 17:40:40,385][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 4...
+[2025-11-19 17:40:40,385][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 0...
+[2025-11-19 17:40:40,385][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 3...
+[2025-11-19 17:40:40,385][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 2...
+[2025-11-19 17:40:40,406][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 6...
+[2025-11-19 17:40:40,406][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 3...
+[2025-11-19 17:40:40,406][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 0...
+[2025-11-19 17:40:40,406][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 4...
+[2025-11-19 17:40:40,406][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 5...
+[2025-11-19 17:40:40,406][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 7...
+[2025-11-19 17:40:40,406][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 1...
+[2025-11-19 17:40:40,406][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 2...
+[2025-11-19 17:40:40,426][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 6...
+[2025-11-19 17:40:40,426][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 1...
+[2025-11-19 17:40:40,426][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 7...
+[2025-11-19 17:40:40,426][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 3...
+[2025-11-19 17:40:40,426][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 4...
+[2025-11-19 17:40:40,426][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 0...
+[2025-11-19 17:40:40,426][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 5...
+[2025-11-19 17:40:40,427][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 2...
+[2025-11-19 17:40:40,447][lm_eval.evaluator][INFO] - Running generate_until requests
+[2025-11-19 17:40:40,447][lm_eval.evaluator][INFO] - Running generate_until requests
+[2025-11-19 17:40:40,447][lm_eval.evaluator][INFO] - Running generate_until requests
+[2025-11-19 17:40:40,447][lm_eval.evaluator][INFO] - Running generate_until requests
+[2025-11-19 17:40:40,447][lm_eval.evaluator][INFO] - Running generate_until requests
+[2025-11-19 17:40:40,447][lm_eval.evaluator][INFO] - Running generate_until requests
+[2025-11-19 17:40:40,447][lm_eval.evaluator][INFO] - Running generate_until requests
+[2025-11-19 17:40:40,447][lm_eval.evaluator][INFO] - Running generate_until requests

2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/results.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ec3caae7416644291124ab09253fed704c145e61302e62b200add7168ef9da8e
+size 10543271

2025-11-1/exp1_len256/dream-inst/dllm/mmlu_pro/stderr.log ADDED Viewed

@@ -0,0 +1,266 @@
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]

+ipex flag is deprecated, will be removed in Accelerate v1.10. From 2.7.0, PyTorch has all needed optimizations for Intel CPU and XPU.
+The following values were not passed to `accelerate launch` and had defaults used instead:
+		More than one GPU was found, enabling multi-GPU training.
+		If this was unintended please pass in `--num_processes=1`.
+	`--mixed_precision` was set to a value of `'no'`
+	`--dynamo_backend` was set to a value of `'no'`
+To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.
+[W1119 17:39:27.733661635 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+2025-11-19 17:39:36.278 | INFO     | src.utils:pre_initialize:603 - {'strategy': 'vanilla', 'threshold': None, 'factor': None, 'alg': 'maskgit_plus', 'gen_length': 256, 'block_length': 32, 'steps': 256, 'temperature': 0.0, 'top_p': 0.9, 'top_k': None, 'debias': False, 'output_probs': False, 'mask_token_id': 151666, 'eot_token_id': 151643, 'pad_token_id': 151643, 'add_bos_token': True, 'sigma': None}
+2025-11-19 17:39:36.278 | INFO     | src.utils:pre_initialize:618 - Using cache with args: {'kp': 50, 'kr': 4}
+[W1119 17:39:37.823449224 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+[W1119 17:39:39.543581206 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+[W1119 17:39:40.105350243 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+[W1119 17:39:40.238904703 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+[W1119 17:39:40.460865127 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+[W1119 17:39:40.523301171 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+[W1119 17:39:40.541752953 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+[W1119 17:39:40.626767406 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+2025-11-19 17:39:51.101 | INFO     | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
+2025-11-19 17:39:51.152 | INFO     | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
+2025-11-19 17:39:51.178 | INFO     | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
+2025-11-19 17:39:51.300 | INFO     | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
+2025-11-19 17:39:51.302 | INFO     | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
+2025-11-19 17:39:51.338 | INFO     | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
+2025-11-19 17:39:51.371 | INFO     | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
+2025-11-19 17:39:51.559 | INFO     | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
+2025-11-19 17:40:40.463 | WARNING  | src.generation:generate:53 - The arguments ('add_bos_token', 'sigma', 'eot_token_id') are not supported by the generation strategy 'vanilla'.
+2025-11-19 17:40:40.464 | WARNING  | src.generation:generate:53 - The arguments ('eot_token_id', 'sigma', 'add_bos_token') are not supported by the generation strategy 'vanilla'.
+2025-11-19 17:40:40.465 | WARNING  | src.generation:generate:53 - The arguments ('eot_token_id', 'add_bos_token', 'sigma') are not supported by the generation strategy 'vanilla'.
+2025-11-19 17:40:40.465 | WARNING  | src.generation:generate:53 - The arguments ('eot_token_id', 'add_bos_token', 'sigma') are not supported by the generation strategy 'vanilla'.
+2025-11-19 17:40:40.466 | WARNING  | src.generation:generate:53 - The arguments ('add_bos_token', 'eot_token_id', 'sigma') are not supported by the generation strategy 'vanilla'.
+2025-11-19 17:40:40.467 | WARNING  | src.generation:generate:53 - The arguments ('eot_token_id', 'sigma', 'add_bos_token') are not supported by the generation strategy 'vanilla'.
+2025-11-19 17:40:40.476 | WARNING  | src.generation:generate:53 - The arguments ('add_bos_token', 'eot_token_id', 'sigma') are not supported by the generation strategy 'vanilla'.
+2025-11-19 18:22:21.069 | INFO     | __main__:main:87 - Throughput: 7.56 tokens/sec, Tokens per step: 0.89 tokens/step (full: 19.51 tokens/sec, 1.00 tokens/step), Latency: 13.16 s, Average Input Length: 1360.60 tokens, Peak GPU Memory: 17.24 GB, Total time: 2384.56 s
+2025-11-19 18:22:21.153 | INFO     | __main__:main:108 - Results saved to /xfr_ceph_sh/liuchonghan/HEAT/heat/outputs/2025-11-19/exp1_len256/dream-inst/dllm/mmlu_pro/results.json
+2025-11-19 18:22:21.153 | INFO     | __main__:main:111 - eval time: 2384.56 seconds
+[rank0]:[W1119 18:22:21.341088084 ProcessGroupNCCL.cpp:1479] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())

2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/.hydra/config.yaml ADDED Viewed

	@@ -0,0 +1,51 @@

+model:
+  generation:
+    mask_token_id: 151666
+    eot_token_id: 151643
+    pad_token_id: 151643
+    add_bos_token: true
+    alg: maskgit_plus
+  name: dream-inst
+  path: ${oc.env:DREAM_INST_PATH}
+generation:
+  strategy: vanilla
+  threshold: null
+  factor: null
+  alg: maskgit_plus
+  gen_length: null
+  block_length: null
+  steps: null
+  temperature: 0.0
+  top_p: null
+  top_k: null
+  debias: false
+  output_probs: false
+cache:
+  _target_: src.cache.PrefixCache
+  use_dual: false
+seed: 1234
+batch_size: 1
+attn_implementation: eager
+dataset:
+  name: mmlu_pro
+  size: null
+  n_shot: null
+  system_prompt: null
+  batch_size: 1
+mc_num: null
+max_length: 4096
+is_check_greedy: true
+add_bos_token: true
+nll_type: mc
+log_type: ftb
+eval_args:
+  log_samples: true
+  tasks: ${..dataset.name}
+  num_fewshot: ${..dataset.n_shot}
+  batch_size: ${..batch_size}
+  limit: ${..dataset.size}
+  confirm_run_unsafe_code: true
+  random_seed: ${..seed}
+  fewshot_random_seed: ${..seed}
+  numpy_random_seed: ${..seed}
+  torch_random_seed: ${..seed}

2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/.hydra/hydra.yaml ADDED Viewed

	@@ -0,0 +1,164 @@

+hydra:
+  run:
+    dir: outputs/2025-11-19/exp1_len256/dream-inst/prefix/mmlu_pro
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+      Use --hydra-help to view Hydra specific help
+      '
+    template: '${hydra.help.header}
+      == Configuration groups ==
+      Compose your configuration from those groups (group=option)
+      $APP_CONFIG_GROUPS
+      == Config ==
+      Override anything in the config (foo.bar=value)
+      $CONFIG
+      ${hydra.help.footer}
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+      See https://hydra.cc for more info.
+      == Flags ==
+      $FLAGS_HELP
+      == Configuration groups ==
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+      $HYDRA_CONFIG_GROUPS
+      Use ''--cfg hydra'' to Show the Hydra config.
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.run.dir=outputs/2025-11-19/exp1_len256/dream-inst/prefix/mmlu_pro
+    - hydra.mode=RUN
+    task:
+    - dataset.name=mmlu_pro
+    - model=dream-inst
+    - cache=prefix
+    - generation=vanilla
+    - batch_size=1
+    - seed=1234
+  job:
+    name: eval
+    chdir: null
+    override_dirname: batch_size=1,cache=prefix,dataset.name=mmlu_pro,generation=vanilla,model=dream-inst,seed=1234
+    id: ???
+    num: ???
+    config_name: eval
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.3'
+    cwd: /xfr_ceph_sh/liuchonghan/HEAT/heat
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /xfr_ceph_sh/liuchonghan/HEAT/heat/configs
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /xfr_ceph_sh/liuchonghan/HEAT/heat/outputs/2025-11-19/exp1_len256/dream-inst/prefix/mmlu_pro
+    choices:
+      cache: prefix
+      generation: vanilla
+      model: dream-inst
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false

2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/.hydra/overrides.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+- dataset.name=mmlu_pro
+- model=dream-inst
+- cache=prefix
+- generation=vanilla
+- batch_size=1
+- seed=1234

2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/eval.log ADDED Viewed

	@@ -0,0 +1,249 @@

+[2025-11-19 18:22:51,650][accelerate.utils.other][WARNING] - Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
+[2025-11-19 18:22:59,357][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+[2025-11-19 18:22:59,358][lm_eval.evaluator][INFO] - Using pre-initialized model
+[2025-11-19 18:23:00,307][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+[2025-11-19 18:23:00,307][lm_eval.evaluator][INFO] - Using pre-initialized model
+[2025-11-19 18:23:00,571][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+[2025-11-19 18:23:00,571][lm_eval.evaluator][INFO] - Using pre-initialized model
+[2025-11-19 18:23:00,608][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+[2025-11-19 18:23:00,608][lm_eval.evaluator][INFO] - Using pre-initialized model
+[2025-11-19 18:23:00,753][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+[2025-11-19 18:23:00,753][lm_eval.evaluator][INFO] - Using pre-initialized model
+[2025-11-19 18:23:01,333][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+[2025-11-19 18:23:01,333][lm_eval.evaluator][INFO] - Using pre-initialized model
+[2025-11-19 18:23:02,643][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+[2025-11-19 18:23:02,643][lm_eval.evaluator][INFO] - Using pre-initialized model
+[2025-11-19 18:23:03,085][lm_eval.evaluator][INFO] - Setting random seed to 1234 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
+[2025-11-19 18:23:03,086][lm_eval.evaluator][INFO] - Using pre-initialized model
+[2025-11-19 18:23:40,845][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:40,845][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:40,846][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:40,849][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 1...
+[2025-11-19 18:23:41,778][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:41,779][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:41,779][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:41,779][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:41,780][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:41,780][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:41,780][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:41,780][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:41,780][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:41,780][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:41,780][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:41,781][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:41,781][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:41,781][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:41,787][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 6...
+[2025-11-19 18:23:42,857][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:42,857][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:42,857][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:42,858][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:42,858][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:42,858][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:42,858][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:42,858][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:42,858][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:42,859][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:42,859][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:42,859][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:42,859][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:42,859][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:42,865][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 7...
+[2025-11-19 18:23:43,755][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:43,755][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:43,755][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:43,755][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:43,756][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:43,759][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 3...
+[2025-11-19 18:23:45,734][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:45,735][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:45,736][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:45,736][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:45,736][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:45,736][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:45,738][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 0...
+[2025-11-19 18:23:54,454][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:54,454][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:54,454][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:54,454][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:54,454][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:54,454][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:54,454][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:54,455][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:54,455][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:54,455][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:54,455][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:54,455][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:54,455][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:54,455][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:54,458][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 4...
+[2025-11-19 18:23:57,947][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:57,947][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:57,947][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:57,947][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:57,947][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:57,947][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:57,948][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:57,948][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:57,948][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:57,948][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:57,948][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:57,948][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:57,948][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:57,948][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:57,951][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 2...
+[2025-11-19 18:23:58,344][lm_eval.evaluator][INFO] - mmlu_pro_biology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:58,344][lm_eval.evaluator][INFO] - mmlu_pro_business: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:58,344][lm_eval.evaluator][INFO] - mmlu_pro_chemistry: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:58,344][lm_eval.evaluator][INFO] - mmlu_pro_computer_science: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:58,344][lm_eval.evaluator][INFO] - mmlu_pro_economics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:58,344][lm_eval.evaluator][INFO] - mmlu_pro_engineering: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:58,344][lm_eval.evaluator][INFO] - mmlu_pro_health: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:58,345][lm_eval.evaluator][INFO] - mmlu_pro_history: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:58,345][lm_eval.evaluator][INFO] - mmlu_pro_law: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:58,345][lm_eval.evaluator][INFO] - mmlu_pro_math: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:58,345][lm_eval.evaluator][INFO] - mmlu_pro_other: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:58,345][lm_eval.evaluator][INFO] - mmlu_pro_philosophy: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:58,345][lm_eval.evaluator][INFO] - mmlu_pro_physics: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:58,345][lm_eval.evaluator][INFO] - mmlu_pro_psychology: Using gen_kwargs: {'until': ['Question:'], 'max_gen_toks': 2048, 'do_sample': False, 'temperature': 0.0}
+[2025-11-19 18:23:58,347][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_biology on rank 5...
+[2025-11-19 18:24:01,625][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 0...
+[2025-11-19 18:24:01,625][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 4...
+[2025-11-19 18:24:01,626][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 5...
+[2025-11-19 18:24:01,626][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 7...
+[2025-11-19 18:24:01,626][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 1...
+[2025-11-19 18:24:01,626][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 2...
+[2025-11-19 18:24:01,626][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 3...
+[2025-11-19 18:24:01,626][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_business on rank 6...
+[2025-11-19 18:24:01,669][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 1...
+[2025-11-19 18:24:01,669][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 6...
+[2025-11-19 18:24:01,669][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 0...
+[2025-11-19 18:24:01,669][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 4...
+[2025-11-19 18:24:01,669][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 5...
+[2025-11-19 18:24:01,669][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 2...
+[2025-11-19 18:24:01,669][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 3...
+[2025-11-19 18:24:01,669][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_chemistry on rank 7...
+[2025-11-19 18:24:01,690][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 0...
+[2025-11-19 18:24:01,690][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 1...
+[2025-11-19 18:24:01,690][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 2...
+[2025-11-19 18:24:01,690][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 5...
+[2025-11-19 18:24:01,690][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 6...
+[2025-11-19 18:24:01,690][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 4...
+[2025-11-19 18:24:01,690][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 7...
+[2025-11-19 18:24:01,690][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_computer_science on rank 3...
+[2025-11-19 18:24:01,710][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 0...
+[2025-11-19 18:24:01,710][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 1...
+[2025-11-19 18:24:01,710][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 4...
+[2025-11-19 18:24:01,710][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 5...
+[2025-11-19 18:24:01,710][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 6...
+[2025-11-19 18:24:01,710][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 7...
+[2025-11-19 18:24:01,710][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 2...
+[2025-11-19 18:24:01,710][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_economics on rank 3...
+[2025-11-19 18:24:01,730][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 0...
+[2025-11-19 18:24:01,730][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 1...
+[2025-11-19 18:24:01,730][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 7...
+[2025-11-19 18:24:01,730][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 5...
+[2025-11-19 18:24:01,730][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 4...
+[2025-11-19 18:24:01,730][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 6...
+[2025-11-19 18:24:01,730][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 3...
+[2025-11-19 18:24:01,731][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_engineering on rank 2...
+[2025-11-19 18:24:01,751][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 0...
+[2025-11-19 18:24:01,751][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 1...
+[2025-11-19 18:24:01,751][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 7...
+[2025-11-19 18:24:01,751][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 2...
+[2025-11-19 18:24:01,751][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 4...
+[2025-11-19 18:24:01,751][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 6...
+[2025-11-19 18:24:01,751][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 5...
+[2025-11-19 18:24:01,751][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_health on rank 3...
+[2025-11-19 18:24:01,771][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 1...
+[2025-11-19 18:24:01,771][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 7...
+[2025-11-19 18:24:01,771][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 0...
+[2025-11-19 18:24:01,771][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 4...
+[2025-11-19 18:24:01,771][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 6...
+[2025-11-19 18:24:01,771][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 2...
+[2025-11-19 18:24:01,771][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 5...
+[2025-11-19 18:24:01,771][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_history on rank 3...
+[2025-11-19 18:24:01,791][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 1...
+[2025-11-19 18:24:01,791][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 6...
+[2025-11-19 18:24:01,791][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 4...
+[2025-11-19 18:24:01,791][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 5...
+[2025-11-19 18:24:01,791][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 7...
+[2025-11-19 18:24:01,791][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 0...
+[2025-11-19 18:24:01,791][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 3...
+[2025-11-19 18:24:01,791][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_law on rank 2...
+[2025-11-19 18:24:01,811][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 1...
+[2025-11-19 18:24:01,811][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 3...
+[2025-11-19 18:24:01,811][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 0...
+[2025-11-19 18:24:01,811][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 4...
+[2025-11-19 18:24:01,811][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 5...
+[2025-11-19 18:24:01,811][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 6...
+[2025-11-19 18:24:01,811][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 7...
+[2025-11-19 18:24:01,811][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_math on rank 2...
+[2025-11-19 18:24:01,832][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 1...
+[2025-11-19 18:24:01,832][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 0...
+[2025-11-19 18:24:01,832][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 6...
+[2025-11-19 18:24:01,832][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 5...
+[2025-11-19 18:24:01,832][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 7...
+[2025-11-19 18:24:01,832][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 3...
+[2025-11-19 18:24:01,832][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 4...
+[2025-11-19 18:24:01,832][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_other on rank 2...
+[2025-11-19 18:24:01,852][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 1...
+[2025-11-19 18:24:01,852][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 6...
+[2025-11-19 18:24:01,852][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 3...
+[2025-11-19 18:24:01,852][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 0...
+[2025-11-19 18:24:01,852][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 7...
+[2025-11-19 18:24:01,852][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 5...
+[2025-11-19 18:24:01,852][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 4...
+[2025-11-19 18:24:01,852][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_philosophy on rank 2...
+[2025-11-19 18:24:01,873][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 2...
+[2025-11-19 18:24:01,873][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 6...
+[2025-11-19 18:24:01,873][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 7...
+[2025-11-19 18:24:01,873][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 1...
+[2025-11-19 18:24:01,873][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 5...
+[2025-11-19 18:24:01,873][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 3...
+[2025-11-19 18:24:01,873][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 4...
+[2025-11-19 18:24:01,873][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_physics on rank 0...
+[2025-11-19 18:24:01,893][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 2...
+[2025-11-19 18:24:01,893][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 1...
+[2025-11-19 18:24:01,893][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 7...
+[2025-11-19 18:24:01,893][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 4...
+[2025-11-19 18:24:01,893][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 6...
+[2025-11-19 18:24:01,893][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 0...
+[2025-11-19 18:24:01,893][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 5...
+[2025-11-19 18:24:01,893][lm_eval.api.task][INFO] - Building contexts for mmlu_pro_psychology on rank 3...
+[2025-11-19 18:24:01,913][lm_eval.evaluator][INFO] - Running generate_until requests
+[2025-11-19 18:24:01,913][lm_eval.evaluator][INFO] - Running generate_until requests
+[2025-11-19 18:24:01,913][lm_eval.evaluator][INFO] - Running generate_until requests
+[2025-11-19 18:24:01,913][lm_eval.evaluator][INFO] - Running generate_until requests
+[2025-11-19 18:24:01,913][lm_eval.evaluator][INFO] - Running generate_until requests
+[2025-11-19 18:24:01,913][lm_eval.evaluator][INFO] - Running generate_until requests
+[2025-11-19 18:24:01,913][lm_eval.evaluator][INFO] - Running generate_until requests
+[2025-11-19 18:24:01,913][lm_eval.evaluator][INFO] - Running generate_until requests

2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/results.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1230760c961dd9022be0aec4f3f825675b214710c9f5cd95b876255b60bfc0cd
+size 10524080

2025-11-1/exp1_len256/dream-inst/prefix/mmlu_pro/stderr.log ADDED Viewed

@@ -0,0 +1,174 @@
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]

+ipex flag is deprecated, will be removed in Accelerate v1.10. From 2.7.0, PyTorch has all needed optimizations for Intel CPU and XPU.
+The following values were not passed to `accelerate launch` and had defaults used instead:
+		More than one GPU was found, enabling multi-GPU training.
+		If this was unintended please pass in `--num_processes=1`.
+	`--mixed_precision` was set to a value of `'no'`
+	`--dynamo_backend` was set to a value of `'no'`
+To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.
+[W1119 18:22:37.226195129 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+2025-11-19 18:22:45.937 | INFO     | src.utils:pre_initialize:603 - {'strategy': 'vanilla', 'threshold': None, 'factor': None, 'alg': 'maskgit_plus', 'gen_length': 256, 'block_length': 32, 'steps': 256, 'temperature': 0.0, 'top_p': 0.9, 'top_k': None, 'debias': False, 'output_probs': False, 'mask_token_id': 151666, 'eot_token_id': 151643, 'pad_token_id': 151643, 'add_bos_token': True, 'sigma': None}
+[W1119 18:22:46.786781829 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+[W1119 18:22:49.307737806 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+[W1119 18:22:49.337649472 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+[W1119 18:22:49.770033713 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+[W1119 18:22:50.165484231 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+[W1119 18:22:50.184086372 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+[W1119 18:22:50.184270291 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+[W1119 18:22:50.186893603 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:29500 (errno: 97 - Address family not supported by protocol).
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
+2025-11-19 18:22:59.355 | INFO     | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
+2025-11-19 18:23:00.303 | INFO     | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
+2025-11-19 18:23:00.569 | INFO     | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
+2025-11-19 18:23:00.605 | INFO     | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
+2025-11-19 18:23:00.751 | INFO     | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
+2025-11-19 18:23:01.331 | INFO     | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
+2025-11-19 18:23:02.640 | INFO     | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
+2025-11-19 18:23:03.083 | INFO     | __main__:overwrite_eval_task:62 - MMLU-Pro dataset is too large, shrink to 100 for faster evaluation.
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/12 [00:00<?, ?it/s]
+2025-11-19 18:24:01.929 | WARNING  | src.generation:generate:53 - The arguments ('add_bos_token', 'sigma', 'eot_token_id') are not supported by the generation strategy 'vanilla'.
+2025-11-19 18:24:01.929 | WARNING  | src.generation:generate:53 - The arguments ('add_bos_token', 'eot_token_id', 'sigma') are not supported by the generation strategy 'vanilla'.
+2025-11-19 18:24:01.929 | WARNING  | src.generation:generate:53 - The arguments ('eot_token_id', 'sigma', 'add_bos_token') are not supported by the generation strategy 'vanilla'.
+2025-11-19 18:24:01.930 | WARNING  | src.generation:generate:53 - The arguments ('sigma', 'eot_token_id', 'add_bos_token') are not supported by the generation strategy 'vanilla'.
+2025-11-19 18:24:01.930 | WARNING  | src.generation:generate:53 - The arguments ('add_bos_token', 'sigma', 'eot_token_id') are not supported by the generation strategy 'vanilla'.
+2025-11-19 18:24:01.944 | WARNING  | src.generation:generate:53 - The arguments ('add_bos_token', 'eot_token_id', 'sigma') are not supported by the generation strategy 'vanilla'.
+2025-11-19 18:24:01.944 | WARNING  | src.generation:generate:53 - The arguments ('add_bos_token', 'sigma', 'eot_token_id') are not supported by the generation strategy 'vanilla'.
+2025-11-19 19:00:06.282 | INFO     | __main__:main:87 - Throughput: 8.65 tokens/sec, Tokens per step: 0.91 tokens/step (full: 22.65 tokens/sec, 1.00 tokens/step), Latency: 11.33 s, Average Input Length: 1360.60 tokens, Peak GPU Memory: 18.50 GB, Total time: 2126.30 s
+2025-11-19 19:00:06.383 | INFO     | __main__:main:108 - Results saved to /xfr_ceph_sh/liuchonghan/HEAT/heat/outputs/2025-11-19/exp1_len256/dream-inst/prefix/mmlu_pro/results.json
+2025-11-19 19:00:06.383 | INFO     | __main__:main:111 - eval time: 2126.30 seconds
+[rank0]:[W1119 19:00:06.522929613 ProcessGroupNCCL.cpp:1479] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())