diff --git "a/lm_harness_eval.md" "b/lm_harness_eval.md" new file mode 100644--- /dev/null +++ "b/lm_harness_eval.md" @@ -0,0 +1,264 @@ +2025-04-28:20:12:03,746 INFO [utils.py:146] Note: detected 128 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable. +2025-04-28:20:12:03,747 INFO [utils.py:149] Note: NumExpr detected 128 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16. +2025-04-28:20:12:03,747 INFO [utils.py:162] NumExpr defaulting to 16 threads. +2025-04-28:20:12:03,855 INFO [config.py:58] PyTorch version 2.7.0a0+git6374332 available. +2025-04-28:20:12:10,122 INFO [__main__.py:132] Verbosity set to INFO +2025-04-28:20:12:13,577 INFO [__main__.py:205] Selected Tasks: ['arc_challenge', 'arc_easy', 'hellaswag', 'mmlu', 'openbookqa', 'piqa', 'pubmedqa', 'race', 'winogrande'] +2025-04-28:20:12:13,578 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +2025-04-28:20:14:19,930 WARNING [task.py:301] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended. +2025-04-28:20:14:19,930 WARNING [task.py:301] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended. +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of arc_challenge from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of arc_easy from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of hellaswag from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_formal_logic from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_high_school_european_history from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_high_school_us_history from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_high_school_world_history from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_international_law from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_jurisprudence from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_logical_fallacies from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_moral_disputes from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_moral_scenarios from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_philosophy from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_prehistory from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_professional_law from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_world_religions from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_business_ethics from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_clinical_knowledge from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_college_medicine from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_global_facts from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_human_aging from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_management from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_marketing from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_medical_genetics from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_miscellaneous from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_nutrition from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_professional_accounting from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_professional_medicine from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_virology from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_econometrics from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_high_school_geography from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_high_school_government_and_politics from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_high_school_macroeconomics from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_high_school_microeconomics from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_high_school_psychology from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_human_sexuality from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_professional_psychology from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_public_relations from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_security_studies from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_sociology from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_us_foreign_policy from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_abstract_algebra from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_anatomy from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_astronomy from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_college_biology from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_college_chemistry from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_college_computer_science from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_college_mathematics from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_college_physics from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_computer_security from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_conceptual_physics from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_electrical_engineering from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_elementary_mathematics from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_high_school_biology from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_high_school_chemistry from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_high_school_computer_science from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_high_school_mathematics from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_high_school_physics from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_high_school_statistics from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of mmlu_machine_learning from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of openbookqa from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of piqa from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of pubmedqa from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of race from None to 0 +2025-04-28:20:14:22,108 WARNING [evaluator.py:143] Overwriting default num_fewshot of winogrande from None to 0 +2025-04-28:20:14:22,108 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:22,476 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:23,210 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:26,396 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:26,455 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:26,532 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:26,627 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:26,738 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:26,795 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:26,844 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:26,919 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:27,079 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:27,489 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:27,633 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:27,782 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:28,490 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:28,569 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:28,614 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:28,735 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:28,815 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:28,861 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:28,965 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:29,013 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:29,121 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:29,167 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:29,531 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:29,671 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:29,801 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:29,926 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:30,002 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:30,055 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:30,148 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:30,237 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:30,416 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:30,526 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:30,779 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:30,840 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:31,560 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:31,611 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:31,725 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:31,818 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:31,865 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:31,911 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:31,974 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:32,045 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:32,111 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:32,157 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:32,203 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:32,249 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:32,296 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:32,341 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:32,449 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:32,516 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:32,689 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:32,833 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:32,927 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:32,973 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:33,097 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:33,167 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:33,266 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:33,317 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:33,405 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:33,988 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:34,007 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:34,102 INFO [task.py:356] Building contexts for task on rank 0... +2025-04-28:20:14:34,122 INFO [evaluator.py:319] Running loglikelihood requests +LlamaForCausalLM( + (model): LlamaModel( + (embed_tokens): Embedding(128256, 2048) + (layers): ModuleList( + (0-15): 16 x MLADecoderLayer( + (mla): DeepseekV3FlashAttention2( + (q_a_proj): Linear(in_features=2048, out_features=1424, bias=False) + (q_a_layernorm): Identity() + (q_b_proj): Linear(in_features=1424, out_features=2048, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=96, bias=False) + (kv_a_layernorm): Identity() + (kv_b_proj): Linear(in_features=64, out_features=3072, bias=False) + (out_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV3YarnRotaryEmbedding() + ) + (mlp): MLP( + (gate_proj): Linear(in_features=2048, out_features=8192, bias=False) + (up_proj): Linear(in_features=2048, out_features=8192, bias=False) + (down_proj): Linear(in_features=8192, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV3RMSNorm() + (post_attention_layernorm): DeepseekV3RMSNorm() + ) + ) + (norm): LlamaRMSNorm() + (rotary_emb): LlamaRotaryEmbedding() + ) + (lm_head): Linear(in_features=2048, out_features=128256, bias=False) +) + 0%| | 0/124414 [00:00