wcs2024 commited on
Commit
adb8cd1
·
verified ·
1 Parent(s): 2718710

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. config.json +28 -0
  3. generation_config.json +6 -0
  4. lm_eval_logs/driver_nohup.log +0 -0
  5. lm_eval_logs/mmlu.log +318 -0
  6. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/results_2026-04-03T16-40-13.638747.json +0 -0
  7. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_abstract_algebra_2026-04-03T16-40-13.638747.jsonl +0 -0
  8. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_anatomy_2026-04-03T16-40-13.638747.jsonl +0 -0
  9. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_astronomy_2026-04-03T16-40-13.638747.jsonl +0 -0
  10. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_business_ethics_2026-04-03T16-40-13.638747.jsonl +0 -0
  11. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_clinical_knowledge_2026-04-03T16-40-13.638747.jsonl +0 -0
  12. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_college_biology_2026-04-03T16-40-13.638747.jsonl +0 -0
  13. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_college_chemistry_2026-04-03T16-40-13.638747.jsonl +0 -0
  14. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_college_computer_science_2026-04-03T16-40-13.638747.jsonl +0 -0
  15. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_college_mathematics_2026-04-03T16-40-13.638747.jsonl +0 -0
  16. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_college_medicine_2026-04-03T16-40-13.638747.jsonl +0 -0
  17. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_college_physics_2026-04-03T16-40-13.638747.jsonl +0 -0
  18. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_computer_security_2026-04-03T16-40-13.638747.jsonl +0 -0
  19. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_conceptual_physics_2026-04-03T16-40-13.638747.jsonl +0 -0
  20. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_econometrics_2026-04-03T16-40-13.638747.jsonl +0 -0
  21. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_electrical_engineering_2026-04-03T16-40-13.638747.jsonl +0 -0
  22. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_elementary_mathematics_2026-04-03T16-40-13.638747.jsonl +0 -0
  23. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_formal_logic_2026-04-03T16-40-13.638747.jsonl +0 -0
  24. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_global_facts_2026-04-03T16-40-13.638747.jsonl +0 -0
  25. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_biology_2026-04-03T16-40-13.638747.jsonl +0 -0
  26. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_chemistry_2026-04-03T16-40-13.638747.jsonl +0 -0
  27. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_computer_science_2026-04-03T16-40-13.638747.jsonl +0 -0
  28. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_european_history_2026-04-03T16-40-13.638747.jsonl +0 -0
  29. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_geography_2026-04-03T16-40-13.638747.jsonl +0 -0
  30. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_government_and_politics_2026-04-03T16-40-13.638747.jsonl +0 -0
  31. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_macroeconomics_2026-04-03T16-40-13.638747.jsonl +0 -0
  32. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_mathematics_2026-04-03T16-40-13.638747.jsonl +0 -0
  33. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_microeconomics_2026-04-03T16-40-13.638747.jsonl +0 -0
  34. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_physics_2026-04-03T16-40-13.638747.jsonl +0 -0
  35. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_psychology_2026-04-03T16-40-13.638747.jsonl +0 -0
  36. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_statistics_2026-04-03T16-40-13.638747.jsonl +0 -0
  37. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_us_history_2026-04-03T16-40-13.638747.jsonl +0 -0
  38. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_world_history_2026-04-03T16-40-13.638747.jsonl +0 -0
  39. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_human_aging_2026-04-03T16-40-13.638747.jsonl +0 -0
  40. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_human_sexuality_2026-04-03T16-40-13.638747.jsonl +0 -0
  41. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_international_law_2026-04-03T16-40-13.638747.jsonl +0 -0
  42. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_jurisprudence_2026-04-03T16-40-13.638747.jsonl +0 -0
  43. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_logical_fallacies_2026-04-03T16-40-13.638747.jsonl +0 -0
  44. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_machine_learning_2026-04-03T16-40-13.638747.jsonl +0 -0
  45. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_management_2026-04-03T16-40-13.638747.jsonl +0 -0
  46. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_marketing_2026-04-03T16-40-13.638747.jsonl +0 -0
  47. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_medical_genetics_2026-04-03T16-40-13.638747.jsonl +0 -0
  48. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_miscellaneous_2026-04-03T16-40-13.638747.jsonl +0 -0
  49. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_moral_disputes_2026-04-03T16-40-13.638747.jsonl +0 -0
  50. lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_moral_scenarios_2026-04-03T16-40-13.638747.jsonl +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_professional_law_2026-04-03T16-40-13.638747.jsonl filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "HuggingFaceH4/zephyr-7b-beta",
3
+ "architectures": [
4
+ "MistralForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 4096,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 14336,
14
+ "max_position_embeddings": 32768,
15
+ "model_type": "mistral",
16
+ "num_attention_heads": 32,
17
+ "num_hidden_layers": 32,
18
+ "num_key_value_heads": 8,
19
+ "pad_token_id": 2,
20
+ "rms_norm_eps": 1e-05,
21
+ "rope_theta": 10000.0,
22
+ "sliding_window": 4096,
23
+ "tie_word_embeddings": false,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.43.0",
26
+ "use_cache": true,
27
+ "vocab_size": 32000
28
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.43.0"
6
+ }
lm_eval_logs/driver_nohup.log ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu.log ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0
  0%| | 0/270 [00:00<?, ?it/s]
1
  30%|███ | 82/270 [00:00<00:00, 819.04it/s]
2
  74%|███████▍ | 200/270 [00:00<00:00, 1029.54it/s]
 
 
3
  0%| | 0/100 [00:00<?, ?it/s]
 
 
4
  0%| | 0/100 [00:00<?, ?it/s]
 
 
5
  0%| | 0/235 [00:00<?, ?it/s]
6
  51%|█████ | 120/235 [00:00<00:00, 1197.33it/s]
 
 
7
  0%| | 0/144 [00:00<?, ?it/s]
8
  83%|████████▎ | 120/144 [00:00<00:00, 1193.77it/s]
 
 
9
  0%| | 0/100 [00:00<?, ?it/s]
 
 
10
  0%| | 0/100 [00:00<?, ?it/s]
 
 
11
  0%| | 0/112 [00:00<?, ?it/s]
 
 
12
  0%| | 0/100 [00:00<?, ?it/s]
 
 
13
  0%| | 0/102 [00:00<?, ?it/s]
 
 
14
  0%| | 0/216 [00:00<?, ?it/s]
15
  55%|█████▌ | 119/216 [00:00<00:00, 1186.72it/s]
 
 
16
  0%| | 0/151 [00:00<?, ?it/s]
17
  79%|███████▉ | 119/151 [00:00<00:00, 1189.56it/s]
 
 
18
  0%| | 0/378 [00:00<?, ?it/s]
19
  31%|███ | 118/378 [00:00<00:00, 1179.87it/s]
20
  63%|██████▎ | 237/378 [00:00<00:00, 1185.01it/s]
21
  94%|█████████▍| 356/378 [00:00<00:00, 1184.24it/s]
 
 
22
  0%| | 0/145 [00:00<?, ?it/s]
23
  83%|████████▎ | 120/145 [00:00<00:00, 1198.54it/s]
 
 
24
  0%| | 0/310 [00:00<?, ?it/s]
25
  39%|███▊ | 120/310 [00:00<00:00, 1194.88it/s]
26
  77%|███████▋ | 240/310 [00:00<00:00, 1189.11it/s]
 
 
27
  0%| | 0/152 [00:00<?, ?it/s]
28
  79%|███████▉ | 120/152 [00:00<00:00, 1194.28it/s]
 
 
29
  0%| | 0/135 [00:00<?, ?it/s]
30
  89%|████████▉ | 120/135 [00:00<00:00, 1199.04it/s]
 
 
31
  0%| | 0/100 [00:00<?, ?it/s]
 
 
32
  0%| | 0/203 [00:00<?, ?it/s]
33
  59%|█████▉ | 120/203 [00:00<00:00, 1194.98it/s]
 
 
34
  0%| | 0/103 [00:00<?, ?it/s]
 
 
35
  0%| | 0/265 [00:00<?, ?it/s]
36
  45%|████▌ | 120/265 [00:00<00:00, 1190.62it/s]
37
  91%|█████████ | 240/265 [00:00<00:00, 1188.99it/s]
 
 
38
  0%| | 0/272 [00:00<?, ?it/s]
39
  44%|████▍ | 120/272 [00:00<00:00, 1192.97it/s]
40
  88%|████████▊ | 240/272 [00:00<00:00, 1191.79it/s]
 
 
41
  0%| | 0/223 [00:00<?, ?it/s]
42
  33%|███▎ | 74/223 [00:00<00:00, 340.71it/s]
43
  87%|████████▋ | 193/223 [00:00<00:00, 676.52it/s]
 
 
44
  0%| | 0/282 [00:00<?, ?it/s]
45
  42%|████▏ | 119/282 [00:00<00:00, 1189.22it/s]
46
  84%|████████▍ | 238/282 [00:00<00:00, 1186.54it/s]
 
 
47
  0%| | 0/173 [00:00<?, ?it/s]
48
  69%|██████▉ | 119/173 [00:00<00:00, 1189.56it/s]
 
 
49
  0%| | 0/100 [00:00<?, ?it/s]
 
 
50
  0%| | 0/783 [00:00<?, ?it/s]
51
  15%|█▌ | 120/783 [00:00<00:00, 1197.11it/s]
52
  31%|███ | 240/783 [00:00<00:00, 1193.18it/s]
53
  46%|████▌ | 360/783 [00:00<00:00, 1190.30it/s]
54
  61%|██████▏ | 480/783 [00:00<00:00, 1191.35it/s]
55
  77%|███████▋ | 600/783 [00:00<00:00, 1189.42it/s]
56
  92%|█████████▏| 720/783 [00:00<00:00, 1190.78it/s]
 
 
57
  0%| | 0/100 [00:00<?, ?it/s]
 
 
58
  0%| | 0/166 [00:00<?, ?it/s]
59
  72%|███████▏ | 120/166 [00:00<00:00, 1198.06it/s]
 
 
60
  0%| | 0/100 [00:00<?, ?it/s]
 
 
61
  0%| | 0/306 [00:00<?, ?it/s]
62
  39%|███▉ | 120/306 [00:00<00:00, 1198.40it/s]
63
  78%|███████▊ | 240/306 [00:00<00:00, 1194.25it/s]
 
 
64
  0%| | 0/234 [00:00<?, ?it/s]
65
  51%|█████▏ | 120/234 [00:00<00:00, 1194.33it/s]
 
 
66
  0%| | 0/612 [00:00<?, ?it/s]
67
  20%|█▉ | 120/612 [00:00<00:00, 1190.56it/s]
68
  39%|███▉ | 240/612 [00:00<00:00, 1191.50it/s]
69
  59%|█████▉ | 360/612 [00:00<00:00, 1193.22it/s]
70
  78%|███████▊ | 480/612 [00:00<00:00, 1192.31it/s]
71
  98%|█████████▊| 600/612 [00:00<00:00, 1193.37it/s]
 
 
72
  0%| | 0/198 [00:00<?, ?it/s]
73
  61%|██████ | 120/198 [00:00<00:00, 1190.72it/s]
 
 
74
  0%| | 0/545 [00:00<?, ?it/s]
75
  22%|██▏ | 121/545 [00:00<00:00, 1200.06it/s]
76
  44%|████▍ | 242/545 [00:00<00:00, 1191.97it/s]
77
  66%|██████▋ | 362/545 [00:00<00:00, 1192.51it/s]
78
  88%|████████▊ | 482/545 [00:00<00:00, 1191.55it/s]
 
 
79
  0%| | 0/100 [00:00<?, ?it/s]
 
 
80
  0%| | 0/238 [00:00<?, ?it/s]
81
  50%|█████ | 120/238 [00:00<00:00, 1191.20it/s]
 
 
82
  0%| | 0/201 [00:00<?, ?it/s]
83
  60%|█████▉ | 120/201 [00:00<00:00, 1191.49it/s]
 
 
84
  0%| | 0/245 [00:00<?, ?it/s]
85
  49%|████▊ | 119/245 [00:00<00:00, 1188.33it/s]
86
  97%|█████████▋| 238/245 [00:00<00:00, 1187.83it/s]
 
 
87
  0%| | 0/110 [00:00<?, ?it/s]
 
 
88
  0%| | 0/131 [00:00<?, ?it/s]
89
  92%|█████████▏| 120/131 [00:00<00:00, 1198.52it/s]
 
 
90
  0%| | 0/193 [00:00<?, ?it/s]
91
  62%|██████▏ | 119/193 [00:00<00:00, 1188.54it/s]
 
 
92
  0%| | 0/114 [00:00<?, ?it/s]
 
 
93
  0%| | 0/390 [00:00<?, ?it/s]
94
  31%|███ | 121/390 [00:00<00:00, 1200.51it/s]
95
  62%|██████▏ | 242/390 [00:00<00:00, 1193.08it/s]
96
  93%|█████████▎| 362/390 [00:00<00:00, 1192.32it/s]
 
 
97
  0%| | 0/126 [00:00<?, ?it/s]
98
  94%|█████████▍| 119/126 [00:00<00:00, 1187.81it/s]
 
 
99
  0%| | 0/346 [00:00<?, ?it/s]
100
  34%|███▍ | 119/346 [00:00<00:00, 1186.30it/s]
101
  69%|██████▉ | 239/346 [00:00<00:00, 1189.05it/s]
 
 
102
  0%| | 0/895 [00:00<?, ?it/s]
103
  13%|█▎ | 120/895 [00:00<00:00, 1190.24it/s]
104
  27%|██▋ | 240/895 [00:00<00:00, 1191.14it/s]
105
  40%|████ | 360/895 [00:00<00:00, 1189.38it/s]
106
  54%|█████▎ | 480/895 [00:00<00:00, 1190.82it/s]
107
  67%|██████▋ | 600/895 [00:00<00:00, 1187.94it/s]
108
  80%|████████ | 719/895 [00:00<00:00, 1187.12it/s]
109
  94%|█████████▎| 839/895 [00:00<00:00, 1189.43it/s]
 
 
110
  0%| | 0/108 [00:00<?, ?it/s]
 
 
111
  0%| | 0/165 [00:00<?, ?it/s]
112
  59%|█████▉ | 97/165 [00:00<00:00, 316.18it/s]
 
 
113
  0%| | 0/1534 [00:00<?, ?it/s]
114
  8%|▊ | 119/1534 [00:00<00:01, 1181.94it/s]
115
  16%|█▌ | 238/1534 [00:00<00:01, 1183.71it/s]
116
  23%|██▎ | 357/1534 [00:00<00:00, 1185.37it/s]
117
  31%|███ | 477/1534 [00:00<00:00, 1188.98it/s]
118
  39%|███▉ | 597/1534 [00:00<00:00, 1189.89it/s]
119
  47%|████▋ | 717/1534 [00:00<00:00, 1192.61it/s]
120
  55%|█████▍ | 837/1534 [00:00<00:00, 1191.78it/s]
121
  62%|██████▏ | 957/1534 [00:00<00:00, 1192.38it/s]
122
  70%|███████ | 1077/1534 [00:00<00:00, 1192.50it/s]
123
  78%|███████▊ | 1197/1534 [00:01<00:00, 1190.77it/s]
124
  86%|████████▌ | 1317/1534 [00:01<00:00, 1190.94it/s]
125
  94%|█████████▎| 1437/1534 [00:01<00:00, 1190.67it/s]
 
 
126
  0%| | 0/324 [00:00<?, ?it/s]
127
  37%|███▋ | 119/324 [00:00<00:00, 1181.62it/s]
128
  73%|███████▎ | 238/324 [00:00<00:00, 1184.06it/s]
 
 
129
  0%| | 0/171 [00:00<?, ?it/s]
130
  70%|███████ | 120/171 [00:00<00:00, 1191.27it/s]
 
 
131
  0%| | 0/237 [00:00<?, ?it/s]
132
  50%|█████ | 119/237 [00:00<00:00, 1188.77it/s]
 
 
133
  0%| | 0/121 [00:00<?, ?it/s]
134
  99%|█████████▉| 120/121 [00:00<00:00, 1193.52it/s]
 
 
135
  0%| | 0/163 [00:00<?, ?it/s]
136
  74%|███████▍ | 121/163 [00:00<00:00, 1205.56it/s]
 
 
137
  0%| | 0/311 [00:00<?, ?it/s]
138
  39%|███▉ | 121/311 [00:00<00:00, 1201.64it/s]
139
  78%|███████▊ | 242/311 [00:00<00:00, 1195.51it/s]
 
 
140
  0%| | 0/204 [00:00<?, ?it/s]
141
  58%|█████▊ | 118/204 [00:00<00:00, 1174.86it/s]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /egr/research-optml/wangc168/anaconda3/envs/SOUL/lib/python3.9/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
2
+ warnings.warn(
3
+ 2026-04-03:16:35:59,252 INFO [__main__.py:279] Verbosity set to INFO
4
+ 2026-04-03:16:35:59,682 INFO [__init__.py:491] `group` and `group_alias` keys in TaskConfigs are deprecated and will be removed in v0.4.5 of lm_eval. The new `tag` field will be used to allow for a shortcut to a group of tasks one does not wish to aggregate metrics across. `group`s which aggregate across subtasks must be only defined in a separate group config file, which will be the official way to create groups that support cross-task aggregation as in `mmlu`. Please see the v0.4.4 patch notes and our documentation: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#advanced-group-configs for more information.
5
+ 2026-04-03:16:36:07,612 INFO [__main__.py:376] Selected Tasks: ['mmlu']
6
+ 2026-04-03:16:36:07,615 INFO [evaluator.py:161] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234
7
+ 2026-04-03:16:36:07,615 INFO [evaluator.py:198] Initializing hf model, with arguments: {'pretrained': '/egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6'}
8
+ 2026-04-03:16:36:07,760 INFO [huggingface.py:130] Using device 'cuda:0'
9
+ 2026-04-03:16:36:07,992 INFO [huggingface.py:366] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}
10
+
11
+ 2026-04-03:16:37:53,518 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
12
+ 2026-04-03:16:37:53,518 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
13
+ 2026-04-03:16:37:53,518 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
14
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
15
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
16
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
17
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
18
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
19
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
20
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
21
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
22
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
23
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
24
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
25
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
26
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
27
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
28
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
29
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
30
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
31
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
32
+ 2026-04-03:16:37:53,519 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
33
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
34
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
35
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
36
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
37
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
38
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
39
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
40
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
41
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
42
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
43
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
44
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
45
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
46
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
47
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
48
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
49
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
50
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
51
+ 2026-04-03:16:37:53,520 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
52
+ 2026-04-03:16:37:53,521 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
53
+ 2026-04-03:16:37:53,521 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
54
+ 2026-04-03:16:37:53,521 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
55
+ 2026-04-03:16:37:53,521 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
56
+ 2026-04-03:16:37:53,521 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
57
+ 2026-04-03:16:37:53,521 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
58
+ 2026-04-03:16:37:53,521 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
59
+ 2026-04-03:16:37:53,521 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
60
+ 2026-04-03:16:37:53,521 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
61
+ 2026-04-03:16:37:53,521 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
62
+ 2026-04-03:16:37:53,521 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
63
+ 2026-04-03:16:37:53,521 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
64
+ 2026-04-03:16:37:53,521 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
65
+ 2026-04-03:16:37:53,521 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
66
+ 2026-04-03:16:37:53,521 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
67
+ 2026-04-03:16:37:53,521 INFO [evaluator.py:279] Setting fewshot random generator seed to 1234
68
+ 2026-04-03:16:37:53,521 WARNING [model.py:422] model.chat_template was called with the chat_template set to False or None. Therefore no chat template will be applied. Make sure this is an intended behavior.
69
+ 2026-04-03:16:37:53,529 INFO [task.py:423] Building contexts for mmlu_high_school_mathematics on rank 0...
70
+
71
  0%| | 0/270 [00:00<?, ?it/s]
72
  30%|███ | 82/270 [00:00<00:00, 819.04it/s]
73
  74%|███████▍ | 200/270 [00:00<00:00, 1029.54it/s]
74
+ 2026-04-03:16:37:53,804 INFO [task.py:423] Building contexts for mmlu_college_chemistry on rank 0...
75
+
76
  0%| | 0/100 [00:00<?, ?it/s]
77
+ 2026-04-03:16:37:53,891 INFO [task.py:423] Building contexts for mmlu_computer_security on rank 0...
78
+
79
  0%| | 0/100 [00:00<?, ?it/s]
80
+ 2026-04-03:16:37:53,978 INFO [task.py:423] Building contexts for mmlu_conceptual_physics on rank 0...
81
+
82
  0%| | 0/235 [00:00<?, ?it/s]
83
  51%|█████ | 120/235 [00:00<00:00, 1197.33it/s]
84
+ 2026-04-03:16:37:54,181 INFO [task.py:423] Building contexts for mmlu_college_biology on rank 0...
85
+
86
  0%| | 0/144 [00:00<?, ?it/s]
87
  83%|████████▎ | 120/144 [00:00<00:00, 1193.77it/s]
88
+ 2026-04-03:16:37:54,306 INFO [task.py:423] Building contexts for mmlu_college_computer_science on rank 0...
89
+
90
  0%| | 0/100 [00:00<?, ?it/s]
91
+ 2026-04-03:16:37:54,392 INFO [task.py:423] Building contexts for mmlu_abstract_algebra on rank 0...
92
+
93
  0%| | 0/100 [00:00<?, ?it/s]
94
+ 2026-04-03:16:37:54,478 INFO [task.py:423] Building contexts for mmlu_machine_learning on rank 0...
95
+
96
  0%| | 0/112 [00:00<?, ?it/s]
97
+ 2026-04-03:16:37:54,575 INFO [task.py:423] Building contexts for mmlu_high_school_computer_science on rank 0...
98
+
99
  0%| | 0/100 [00:00<?, ?it/s]
100
+ 2026-04-03:16:37:54,661 INFO [task.py:423] Building contexts for mmlu_college_physics on rank 0...
101
+
102
  0%| | 0/102 [00:00<?, ?it/s]
103
+ 2026-04-03:16:37:54,750 INFO [task.py:423] Building contexts for mmlu_high_school_statistics on rank 0...
104
+
105
  0%| | 0/216 [00:00<?, ?it/s]
106
  55%|█████▌ | 119/216 [00:00<00:00, 1186.72it/s]
107
+ 2026-04-03:16:37:54,937 INFO [task.py:423] Building contexts for mmlu_high_school_physics on rank 0...
108
+
109
  0%| | 0/151 [00:00<?, ?it/s]
110
  79%|███████▉ | 119/151 [00:00<00:00, 1189.56it/s]
111
+ 2026-04-03:16:37:55,068 INFO [task.py:423] Building contexts for mmlu_elementary_mathematics on rank 0...
112
+
113
  0%| | 0/378 [00:00<?, ?it/s]
114
  31%|███ | 118/378 [00:00<00:00, 1179.87it/s]
115
  63%|██████▎ | 237/378 [00:00<00:00, 1185.01it/s]
116
  94%|█████████▍| 356/378 [00:00<00:00, 1184.24it/s]
117
+ 2026-04-03:16:37:55,396 INFO [task.py:423] Building contexts for mmlu_electrical_engineering on rank 0...
118
+
119
  0%| | 0/145 [00:00<?, ?it/s]
120
  83%|████████▎ | 120/145 [00:00<00:00, 1198.54it/s]
121
+ 2026-04-03:16:37:55,521 INFO [task.py:423] Building contexts for mmlu_high_school_biology on rank 0...
122
+
123
  0%| | 0/310 [00:00<?, ?it/s]
124
  39%|███▊ | 120/310 [00:00<00:00, 1194.88it/s]
125
  77%|███████▋ | 240/310 [00:00<00:00, 1189.11it/s]
126
+ 2026-04-03:16:37:55,790 INFO [task.py:423] Building contexts for mmlu_astronomy on rank 0...
127
+
128
  0%| | 0/152 [00:00<?, ?it/s]
129
  79%|███████▉ | 120/152 [00:00<00:00, 1194.28it/s]
130
+ 2026-04-03:16:37:55,921 INFO [task.py:423] Building contexts for mmlu_anatomy on rank 0...
131
+
132
  0%| | 0/135 [00:00<?, ?it/s]
133
  89%|████████▉ | 120/135 [00:00<00:00, 1199.04it/s]
134
+ 2026-04-03:16:37:56,038 INFO [task.py:423] Building contexts for mmlu_college_mathematics on rank 0...
135
+
136
  0%| | 0/100 [00:00<?, ?it/s]
137
+ 2026-04-03:16:37:56,124 INFO [task.py:423] Building contexts for mmlu_high_school_chemistry on rank 0...
138
+
139
  0%| | 0/203 [00:00<?, ?it/s]
140
  59%|█████▉ | 120/203 [00:00<00:00, 1194.98it/s]
141
+ 2026-04-03:16:37:56,299 INFO [task.py:423] Building contexts for mmlu_management on rank 0...
142
+
143
  0%| | 0/103 [00:00<?, ?it/s]
144
+ 2026-04-03:16:37:56,388 INFO [task.py:423] Building contexts for mmlu_clinical_knowledge on rank 0...
145
+
146
  0%| | 0/265 [00:00<?, ?it/s]
147
  45%|████▌ | 120/265 [00:00<00:00, 1190.62it/s]
148
  91%|█████████ | 240/265 [00:00<00:00, 1188.99it/s]
149
+ 2026-04-03:16:37:56,618 INFO [task.py:423] Building contexts for mmlu_professional_medicine on rank 0...
150
+
151
  0%| | 0/272 [00:00<?, ?it/s]
152
  44%|████▍ | 120/272 [00:00<00:00, 1192.97it/s]
153
  88%|████████▊ | 240/272 [00:00<00:00, 1191.79it/s]
154
+ 2026-04-03:16:37:56,853 INFO [task.py:423] Building contexts for mmlu_human_aging on rank 0...
155
+
156
  0%| | 0/223 [00:00<?, ?it/s]
157
  33%|███▎ | 74/223 [00:00<00:00, 340.71it/s]
158
  87%|████████▋ | 193/223 [00:00<00:00, 676.52it/s]
159
+ 2026-04-03:16:37:57,201 INFO [task.py:423] Building contexts for mmlu_professional_accounting on rank 0...
160
+
161
  0%| | 0/282 [00:00<?, ?it/s]
162
  42%|████▏ | 119/282 [00:00<00:00, 1189.22it/s]
163
  84%|████████▍ | 238/282 [00:00<00:00, 1186.54it/s]
164
+ 2026-04-03:16:37:57,447 INFO [task.py:423] Building contexts for mmlu_college_medicine on rank 0...
165
+
166
  0%| | 0/173 [00:00<?, ?it/s]
167
  69%|██████▉ | 119/173 [00:00<00:00, 1189.56it/s]
168
+ 2026-04-03:16:37:57,596 INFO [task.py:423] Building contexts for mmlu_medical_genetics on rank 0...
169
+
170
  0%| | 0/100 [00:00<?, ?it/s]
171
+ 2026-04-03:16:37:57,683 INFO [task.py:423] Building contexts for mmlu_miscellaneous on rank 0...
172
+
173
  0%| | 0/783 [00:00<?, ?it/s]
174
  15%|█▌ | 120/783 [00:00<00:00, 1197.11it/s]
175
  31%|███ | 240/783 [00:00<00:00, 1193.18it/s]
176
  46%|████▌ | 360/783 [00:00<00:00, 1190.30it/s]
177
  61%|██████▏ | 480/783 [00:00<00:00, 1191.35it/s]
178
  77%|███████▋ | 600/783 [00:00<00:00, 1189.42it/s]
179
  92%|█████████▏| 720/783 [00:00<00:00, 1190.78it/s]
180
+ 2026-04-03:16:37:58,358 INFO [task.py:423] Building contexts for mmlu_global_facts on rank 0...
181
+
182
  0%| | 0/100 [00:00<?, ?it/s]
183
+ 2026-04-03:16:37:58,445 INFO [task.py:423] Building contexts for mmlu_virology on rank 0...
184
+
185
  0%| | 0/166 [00:00<?, ?it/s]
186
  72%|███████▏ | 120/166 [00:00<00:00, 1198.06it/s]
187
+ 2026-04-03:16:37:58,587 INFO [task.py:423] Building contexts for mmlu_business_ethics on rank 0...
188
+
189
  0%| | 0/100 [00:00<?, ?it/s]
190
+ 2026-04-03:16:37:58,674 INFO [task.py:423] Building contexts for mmlu_nutrition on rank 0...
191
+
192
  0%| | 0/306 [00:00<?, ?it/s]
193
  39%|███▉ | 120/306 [00:00<00:00, 1198.40it/s]
194
  78%|███████▊ | 240/306 [00:00<00:00, 1194.25it/s]
195
+ 2026-04-03:16:37:58,937 INFO [task.py:423] Building contexts for mmlu_marketing on rank 0...
196
+
197
  0%| | 0/234 [00:00<?, ?it/s]
198
  51%|█████▏ | 120/234 [00:00<00:00, 1194.33it/s]
199
+ 2026-04-03:16:37:59,139 INFO [task.py:423] Building contexts for mmlu_professional_psychology on rank 0...
200
+
201
  0%| | 0/612 [00:00<?, ?it/s]
202
  20%|█▉ | 120/612 [00:00<00:00, 1190.56it/s]
203
  39%|███▉ | 240/612 [00:00<00:00, 1191.50it/s]
204
  59%|█████▉ | 360/612 [00:00<00:00, 1193.22it/s]
205
  78%|███████▊ | 480/612 [00:00<00:00, 1192.31it/s]
206
  98%|█████████▊| 600/612 [00:00<00:00, 1193.37it/s]
207
+ 2026-04-03:16:37:59,667 INFO [task.py:423] Building contexts for mmlu_high_school_geography on rank 0...
208
+
209
  0%| | 0/198 [00:00<?, ?it/s]
210
  61%|██████ | 120/198 [00:00<00:00, 1190.72it/s]
211
+ 2026-04-03:16:37:59,838 INFO [task.py:423] Building contexts for mmlu_high_school_psychology on rank 0...
212
+
213
  0%| | 0/545 [00:00<?, ?it/s]
214
  22%|██▏ | 121/545 [00:00<00:00, 1200.06it/s]
215
  44%|████▍ | 242/545 [00:00<00:00, 1191.97it/s]
216
  66%|██████▋ | 362/545 [00:00<00:00, 1192.51it/s]
217
  88%|████████▊ | 482/545 [00:00<00:00, 1191.55it/s]
218
+ 2026-04-03:16:38:00,309 INFO [task.py:423] Building contexts for mmlu_us_foreign_policy on rank 0...
219
+
220
  0%| | 0/100 [00:00<?, ?it/s]
221
+ 2026-04-03:16:38:00,396 INFO [task.py:423] Building contexts for mmlu_high_school_microeconomics on rank 0...
222
+
223
  0%| | 0/238 [00:00<?, ?it/s]
224
  50%|█████ | 120/238 [00:00<00:00, 1191.20it/s]
225
+ 2026-04-03:16:38:00,602 INFO [task.py:423] Building contexts for mmlu_sociology on rank 0...
226
+
227
  0%| | 0/201 [00:00<?, ?it/s]
228
  60%|█████▉ | 120/201 [00:00<00:00, 1191.49it/s]
229
+ 2026-04-03:16:38:00,776 INFO [task.py:423] Building contexts for mmlu_security_studies on rank 0...
230
+
231
  0%| | 0/245 [00:00<?, ?it/s]
232
  49%|████▊ | 119/245 [00:00<00:00, 1188.33it/s]
233
  97%|█████████▋| 238/245 [00:00<00:00, 1187.83it/s]
234
+ 2026-04-03:16:38:00,989 INFO [task.py:423] Building contexts for mmlu_public_relations on rank 0...
235
+
236
  0%| | 0/110 [00:00<?, ?it/s]
237
+ 2026-04-03:16:38:01,083 INFO [task.py:423] Building contexts for mmlu_human_sexuality on rank 0...
238
+
239
  0%| | 0/131 [00:00<?, ?it/s]
240
  92%|█████████▏| 120/131 [00:00<00:00, 1198.52it/s]
241
+ 2026-04-03:16:38:01,196 INFO [task.py:423] Building contexts for mmlu_high_school_government_and_politics on rank 0...
242
+
243
  0%| | 0/193 [00:00<?, ?it/s]
244
  62%|██████▏ | 119/193 [00:00<00:00, 1188.54it/s]
245
+ 2026-04-03:16:38:01,364 INFO [task.py:423] Building contexts for mmlu_econometrics on rank 0...
246
+
247
  0%| | 0/114 [00:00<?, ?it/s]
248
+ 2026-04-03:16:38:01,462 INFO [task.py:423] Building contexts for mmlu_high_school_macroeconomics on rank 0...
249
+
250
  0%| | 0/390 [00:00<?, ?it/s]
251
  31%|███ | 121/390 [00:00<00:00, 1200.51it/s]
252
  62%|██████▏ | 242/390 [00:00<00:00, 1193.08it/s]
253
  93%|█████████▎| 362/390 [00:00<00:00, 1192.32it/s]
254
+ 2026-04-03:16:38:01,799 INFO [task.py:423] Building contexts for mmlu_formal_logic on rank 0...
255
+
256
  0%| | 0/126 [00:00<?, ?it/s]
257
  94%|█████████▍| 119/126 [00:00<00:00, 1187.81it/s]
258
+ 2026-04-03:16:38:01,908 INFO [task.py:423] Building contexts for mmlu_moral_disputes on rank 0...
259
+
260
  0%| | 0/346 [00:00<?, ?it/s]
261
  34%|███▍ | 119/346 [00:00<00:00, 1186.30it/s]
262
  69%|██████▉ | 239/346 [00:00<00:00, 1189.05it/s]
263
+ 2026-04-03:16:38:02,208 INFO [task.py:423] Building contexts for mmlu_moral_scenarios on rank 0...
264
+
265
  0%| | 0/895 [00:00<?, ?it/s]
266
  13%|█▎ | 120/895 [00:00<00:00, 1190.24it/s]
267
  27%|██▋ | 240/895 [00:00<00:00, 1191.14it/s]
268
  40%|████ | 360/895 [00:00<00:00, 1189.38it/s]
269
  54%|█████▎ | 480/895 [00:00<00:00, 1190.82it/s]
270
  67%|██████▋ | 600/895 [00:00<00:00, 1187.94it/s]
271
  80%|████████ | 719/895 [00:00<00:00, 1187.12it/s]
272
  94%|█████████▎| 839/895 [00:00<00:00, 1189.43it/s]
273
+ 2026-04-03:16:38:02,982 INFO [task.py:423] Building contexts for mmlu_jurisprudence on rank 0...
274
+
275
  0%| | 0/108 [00:00<?, ?it/s]
276
+ 2026-04-03:16:38:03,075 INFO [task.py:423] Building contexts for mmlu_high_school_european_history on rank 0...
277
+
278
  0%| | 0/165 [00:00<?, ?it/s]
279
  59%|█████▉ | 97/165 [00:00<00:00, 316.18it/s]
280
+ 2026-04-03:16:38:03,445 INFO [task.py:423] Building contexts for mmlu_professional_law on rank 0...
281
+
282
  0%| | 0/1534 [00:00<?, ?it/s]
283
  8%|▊ | 119/1534 [00:00<00:01, 1181.94it/s]
284
  16%|█▌ | 238/1534 [00:00<00:01, 1183.71it/s]
285
  23%|██▎ | 357/1534 [00:00<00:00, 1185.37it/s]
286
  31%|███ | 477/1534 [00:00<00:00, 1188.98it/s]
287
  39%|███▉ | 597/1534 [00:00<00:00, 1189.89it/s]
288
  47%|████▋ | 717/1534 [00:00<00:00, 1192.61it/s]
289
  55%|█████▍ | 837/1534 [00:00<00:00, 1191.78it/s]
290
  62%|██████▏ | 957/1534 [00:00<00:00, 1192.38it/s]
291
  70%|███████ | 1077/1534 [00:00<00:00, 1192.50it/s]
292
  78%|███████▊ | 1197/1534 [00:01<00:00, 1190.77it/s]
293
  86%|████████▌ | 1317/1534 [00:01<00:00, 1190.94it/s]
294
  94%|█████████▎| 1437/1534 [00:01<00:00, 1190.67it/s]
295
+ 2026-04-03:16:38:04,772 INFO [task.py:423] Building contexts for mmlu_prehistory on rank 0...
296
+
297
  0%| | 0/324 [00:00<?, ?it/s]
298
  37%|███▋ | 119/324 [00:00<00:00, 1181.62it/s]
299
  73%|███████▎ | 238/324 [00:00<00:00, 1184.06it/s]
300
+ 2026-04-03:16:38:05,054 INFO [task.py:423] Building contexts for mmlu_world_religions on rank 0...
301
+
302
  0%| | 0/171 [00:00<?, ?it/s]
303
  70%|███████ | 120/171 [00:00<00:00, 1191.27it/s]
304
+ 2026-04-03:16:38:05,202 INFO [task.py:423] Building contexts for mmlu_high_school_world_history on rank 0...
305
+
306
  0%| | 0/237 [00:00<?, ?it/s]
307
  50%|█████ | 119/237 [00:00<00:00, 1188.77it/s]
308
+ 2026-04-03:16:38:05,408 INFO [task.py:423] Building contexts for mmlu_international_law on rank 0...
309
+
310
  0%| | 0/121 [00:00<?, ?it/s]
311
  99%|█████████▉| 120/121 [00:00<00:00, 1193.52it/s]
312
+ 2026-04-03:16:38:05,513 INFO [task.py:423] Building contexts for mmlu_logical_fallacies on rank 0...
313
+
314
  0%| | 0/163 [00:00<?, ?it/s]
315
  74%|███████▍ | 121/163 [00:00<00:00, 1205.56it/s]
316
+ 2026-04-03:16:38:05,652 INFO [task.py:423] Building contexts for mmlu_philosophy on rank 0...
317
+
318
  0%| | 0/311 [00:00<?, ?it/s]
319
  39%|███▉ | 121/311 [00:00<00:00, 1201.64it/s]
320
  78%|███████▊ | 242/311 [00:00<00:00, 1195.51it/s]
321
+ 2026-04-03:16:38:05,920 INFO [task.py:423] Building contexts for mmlu_high_school_us_history on rank 0...
322
+
323
  0%| | 0/204 [00:00<?, ?it/s]
324
  58%|█████▊ | 118/204 [00:00<00:00, 1174.86it/s]
325
+ 2026-04-03:16:38:06,099 INFO [evaluator.py:465] Running loglikelihood requests
326
+
327
+
328
+ 2026-04-03:16:40:08,272 WARNING [huggingface.py:1344] Failed to get model SHA for /egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6 at revision main. Error: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6'. Use `repo_type` argument if needed.
329
+ 2026-04-03:16:40:13,621 INFO [evaluation_tracker.py:206] Saving results aggregated
330
+ 2026-04-03:16:40:13,641 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_abstract_algebra
331
+ 2026-04-03:16:40:13,711 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_anatomy
332
+ 2026-04-03:16:40:13,809 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_astronomy
333
+ 2026-04-03:16:40:13,927 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_business_ethics
334
+ 2026-04-03:16:40:14,011 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_clinical_knowledge
335
+ 2026-04-03:16:40:14,215 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_college_biology
336
+ 2026-04-03:16:40:14,331 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_college_chemistry
337
+ 2026-04-03:16:40:14,423 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_college_computer_science
338
+ 2026-04-03:16:40:14,517 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_college_mathematics
339
+ 2026-04-03:16:40:14,607 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_college_medicine
340
+ 2026-04-03:16:40:14,749 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_college_physics
341
+ 2026-04-03:16:40:14,828 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_computer_security
342
+ 2026-04-03:16:40:14,915 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_conceptual_physics
343
+ 2026-04-03:16:40:15,093 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_econometrics
344
+ 2026-04-03:16:40:15,183 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_electrical_engineering
345
+ 2026-04-03:16:40:15,308 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_elementary_mathematics
346
+ 2026-04-03:16:40:15,632 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_formal_logic
347
+ 2026-04-03:16:40:15,732 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_global_facts
348
+ 2026-04-03:16:40:15,808 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_biology
349
+ 2026-04-03:16:40:16,059 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_chemistry
350
+ 2026-04-03:16:40:16,266 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_computer_science
351
+ 2026-04-03:16:40:16,375 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_european_history
352
+ 2026-04-03:16:40:16,583 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_geography
353
+ 2026-04-03:16:40:16,806 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_government_and_politics
354
+ 2026-04-03:16:40:17,022 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_macroeconomics
355
+ 2026-04-03:16:40:17,385 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_mathematics
356
+ 2026-04-03:16:40:17,684 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_microeconomics
357
+ 2026-04-03:16:40:17,929 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_physics
358
+ 2026-04-03:16:40:18,109 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_psychology
359
+ 2026-04-03:16:40:18,727 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_statistics
360
+ 2026-04-03:16:40:18,908 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_us_history
361
+ 2026-04-03:16:40:19,537 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_world_history
362
+ 2026-04-03:16:40:19,726 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_human_aging
363
+ 2026-04-03:16:40:19,895 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_human_sexuality
364
+ 2026-04-03:16:40:20,007 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_international_law
365
+ 2026-04-03:16:40:20,097 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_jurisprudence
366
+ 2026-04-03:16:40:20,195 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_logical_fallacies
367
+ 2026-04-03:16:40:20,341 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_machine_learning
368
+ 2026-04-03:16:40:20,436 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_management
369
+ 2026-04-03:16:40:20,524 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_marketing
370
+ 2026-04-03:16:40:20,789 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_medical_genetics
371
+ 2026-04-03:16:40:20,897 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_miscellaneous
372
+ 2026-04-03:16:40:21,555 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_moral_disputes
373
+ 2026-04-03:16:40:21,850 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_moral_scenarios
374
+ 2026-04-03:16:40:22,676 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_nutrition
375
+ 2026-04-03:16:40:22,944 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_philosophy
376
+ 2026-04-03:16:40:23,198 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_prehistory
377
+ 2026-04-03:16:40:23,473 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_professional_accounting
378
+ 2026-04-03:16:40:23,719 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_professional_law
379
+ 2026-04-03:16:40:25,243 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_professional_medicine
380
+ 2026-04-03:16:40:25,522 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_professional_psychology
381
+ 2026-04-03:16:40:26,010 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_public_relations
382
+ 2026-04-03:16:40:26,094 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_security_studies
383
+ 2026-04-03:16:40:26,264 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_sociology
384
+ 2026-04-03:16:40:26,404 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_us_foreign_policy
385
+ 2026-04-03:16:40:26,475 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_virology
386
+ 2026-04-03:16:40:26,587 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_world_religions
387
+ hf (pretrained=/egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 16
388
+ | Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
389
+ |---------------------------------------|------:|------|-----:|------|---|-----:|---|-----:|
390
+ |mmlu | 2|none | |acc |↑ |0.5713|± |0.0040|
391
+ | - humanities | 2|none | |acc |↑ |0.5186|± |0.0069|
392
+ | - formal_logic | 1|none | 0|acc |↑ |0.3810|± |0.0434|
393
+ | - high_school_european_history | 1|none | 0|acc |↑ |0.6970|± |0.0359|
394
+ | - high_school_us_history | 1|none | 0|acc |↑ |0.7500|± |0.0304|
395
+ | - high_school_world_history | 1|none | 0|acc |↑ |0.7384|± |0.0286|
396
+ | - international_law | 1|none | 0|acc |↑ |0.7107|± |0.0414|
397
+ | - jurisprudence | 1|none | 0|acc |↑ |0.6944|± |0.0445|
398
+ | - logical_fallacies | 1|none | 0|acc |↑ |0.6810|± |0.0366|
399
+ | - moral_disputes | 1|none | 0|acc |↑ |0.6590|± |0.0255|
400
+ | - moral_scenarios | 1|none | 0|acc |↑ |0.2804|± |0.0150|
401
+ | - philosophy | 1|none | 0|acc |↑ |0.6463|± |0.0272|
402
+ | - prehistory | 1|none | 0|acc |↑ |0.6451|± |0.0266|
403
+ | - professional_law | 1|none | 0|acc |↑ |0.4231|± |0.0126|
404
+ | - world_religions | 1|none | 0|acc |↑ |0.8129|± |0.0299|
405
+ | - other | 2|none | |acc |↑ |0.6350|± |0.0082|
406
+ | - business_ethics | 1|none | 0|acc |↑ |0.5200|± |0.0502|
407
+ | - clinical_knowledge | 1|none | 0|acc |↑ |0.6491|± |0.0294|
408
+ | - college_medicine | 1|none | 0|acc |↑ |0.5896|± |0.0375|
409
+ | - global_facts | 1|none | 0|acc |↑ |0.3000|± |0.0461|
410
+ | - human_aging | 1|none | 0|acc |↑ |0.6368|± |0.0323|
411
+ | - management | 1|none | 0|acc |↑ |0.7476|± |0.0430|
412
+ | - marketing | 1|none | 0|acc |↑ |0.8248|± |0.0249|
413
+ | - medical_genetics | 1|none | 0|acc |↑ |0.6400|± |0.0482|
414
+ | - miscellaneous | 1|none | 0|acc |↑ |0.7778|± |0.0149|
415
+ | - nutrition | 1|none | 0|acc |↑ |0.6601|± |0.0271|
416
+ | - professional_accounting | 1|none | 0|acc |↑ |0.4149|± |0.0294|
417
+ | - professional_medicine | 1|none | 0|acc |↑ |0.5699|± |0.0301|
418
+ | - virology | 1|none | 0|acc |↑ |0.3494|± |0.0371|
419
+ | - social sciences | 2|none | |acc |↑ |0.6783|± |0.0082|
420
+ | - econometrics | 1|none | 0|acc |↑ |0.4298|± |0.0466|
421
+ | - high_school_geography | 1|none | 0|acc |↑ |0.7172|± |0.0321|
422
+ | - high_school_government_and_politics| 1|none | 0|acc |↑ |0.8290|± |0.0272|
423
+ | - high_school_macroeconomics | 1|none | 0|acc |↑ |0.5795|± |0.0250|
424
+ | - high_school_microeconomics | 1|none | 0|acc |↑ |0.6050|± |0.0318|
425
+ | - high_school_psychology | 1|none | 0|acc |↑ |0.7890|± |0.0175|
426
+ | - human_sexuality | 1|none | 0|acc |↑ |0.6565|± |0.0416|
427
+ | - professional_psychology | 1|none | 0|acc |↑ |0.5997|± |0.0198|
428
+ | - public_relations | 1|none | 0|acc |↑ |0.6545|± |0.0455|
429
+ | - security_studies | 1|none | 0|acc |↑ |0.6490|± |0.0306|
430
+ | - sociology | 1|none | 0|acc |↑ |0.8557|± |0.0248|
431
+ | - us_foreign_policy | 1|none | 0|acc |↑ |0.8000|± |0.0402|
432
+ | - stem | 2|none | |acc |↑ |0.4827|± |0.0087|
433
+ | - abstract_algebra | 1|none | 0|acc |↑ |0.3000|± |0.0461|
434
+ | - anatomy | 1|none | 0|acc |↑ |0.5630|± |0.0428|
435
+ | - astronomy | 1|none | 0|acc |↑ |0.5855|± |0.0401|
436
+ | - college_biology | 1|none | 0|acc |↑ |0.6389|± |0.0402|
437
+ | - college_chemistry | 1|none | 0|acc |↑ |0.4900|± |0.0502|
438
+ | - college_computer_science | 1|none | 0|acc |↑ |0.5100|± |0.0502|
439
+ | - college_mathematics | 1|none | 0|acc |↑ |0.3700|± |0.0485|
440
+ | - college_physics | 1|none | 0|acc |↑ |0.5294|± |0.0497|
441
+ | - computer_security | 1|none | 0|acc |↑ |0.4800|± |0.0502|
442
+ | - conceptual_physics | 1|none | 0|acc |↑ |0.4936|± |0.0327|
443
+ | - electrical_engineering | 1|none | 0|acc |↑ |0.5310|± |0.0416|
444
+ | - elementary_mathematics | 1|none | 0|acc |↑ |0.3995|± |0.0252|
445
+ | - high_school_biology | 1|none | 0|acc |↑ |0.6581|± |0.0270|
446
+ | - high_school_chemistry | 1|none | 0|acc |↑ |0.4828|± |0.0352|
447
+ | - high_school_computer_science | 1|none | 0|acc |↑ |0.5700|± |0.0498|
448
+ | - high_school_mathematics | 1|none | 0|acc |↑ |0.3370|± |0.0288|
449
+ | - high_school_physics | 1|none | 0|acc |↑ |0.2914|± |0.0371|
450
+ | - high_school_statistics | 1|none | 0|acc |↑ |0.4907|± |0.0341|
451
+ | - machine_learning | 1|none | 0|acc |↑ |0.4643|± |0.0473|
452
+
453
+ | Groups |Version|Filter|n-shot|Metric| |Value | |Stderr|
454
+ |------------------|------:|------|------|------|---|-----:|---|-----:|
455
+ |mmlu | 2|none | |acc |↑ |0.5713|± |0.0040|
456
+ | - humanities | 2|none | |acc |↑ |0.5186|± |0.0069|
457
+ | - other | 2|none | |acc |↑ |0.6350|± |0.0082|
458
+ | - social sciences| 2|none | |acc |↑ |0.6783|± |0.0082|
459
+ | - stem | 2|none | |acc |↑ |0.4827|± |0.0087|
460
+
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/results_2026-04-03T16-40-13.638747.json ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_abstract_algebra_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_anatomy_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_astronomy_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_business_ethics_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_clinical_knowledge_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_college_biology_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_college_chemistry_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_college_computer_science_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_college_mathematics_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_college_medicine_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_college_physics_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_computer_security_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_conceptual_physics_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_econometrics_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_electrical_engineering_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_elementary_mathematics_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_formal_logic_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_global_facts_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_biology_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_chemistry_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_computer_science_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_european_history_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_geography_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_government_and_politics_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_macroeconomics_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_mathematics_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_microeconomics_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_physics_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_psychology_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_statistics_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_us_history_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_high_school_world_history_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_human_aging_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_human_sexuality_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_international_law_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_jurisprudence_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_logical_fallacies_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_machine_learning_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_management_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_marketing_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_medical_genetics_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_miscellaneous_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_moral_disputes_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
lm_eval_logs/mmlu_results.json/__egr__research-optml__wangc168__Muon_wmdp__rmu_wmdp__models__muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/samples_mmlu_moral_scenarios_2026-04-03T16-40-13.638747.jsonl ADDED
The diff for this file is too large to render. See raw diff