Upload folder using huggingface_hub

#269
Files changed (20) hide show
  1. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-100000.pth.json +1 -0
  2. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-120000.pth.json +1 -0
  3. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-140000.pth.json +1 -0
  4. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-160000.pth.json +1 -0
  5. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-180000.pth.json +1 -0
  6. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-20000.pth.json +1 -0
  7. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-200000.pth.json +1 -0
  8. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-220000.pth.json +1 -0
  9. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-240000.pth.json +1 -0
  10. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-260000.pth.json +1 -0
  11. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-280000.pth.json +1 -0
  12. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-300000.pth.json +1 -0
  13. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-320000.pth.json +1 -0
  14. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-340000.pth.json +1 -0
  15. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-360000.pth.json +121 -0
  16. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-380000.pth.json +1 -0
  17. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-40000.pth.json +1 -0
  18. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-400000.pth.json +121 -0
  19. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-60000.pth.json +1 -0
  20. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-80000.pth.json +1 -0
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-100000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.4421934097532243, "val/accuracy": 0.5000222826760913, "val/perplexity": 11.498233441114053, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3516005995851126, "lambada/accuracy/total": 0.2921195652173913, "lambada/accuracy/openai_last_token": 0.7758152173913043, "lambada/perplexity": 8.978465823431263, "lambada/lm_loss": 3.026403419079208, "lambada/lm_perplexity": 20.622927020638386, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.39607092394674126, "mean_loss": 2.396897004669168, "blimp/accuracy/passive_2": 0.894, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.844, "blimp/accuracy/tough_vs_raising_2": 0.915, "blimp/accuracy/tough_vs_raising_1": 0.616, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.903, "blimp/accuracy/principle_A_reconstruction": 0.426, "blimp/accuracy/wh_vs_that_with_gap": 0.496, "blimp/accuracy/principle_A_domain_2": 0.88, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.887, "blimp/accuracy/principle_A_domain_3": 0.645, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.921, "blimp/accuracy/animate_subject_trans": 0.908, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.927, "blimp/accuracy/distractor_agreement_relative_clause": 0.696, "blimp/accuracy/transitive": 0.875, "blimp/accuracy/sentential_subject_island": 0.303, "blimp/accuracy/adjunct_island": 0.817, "blimp/accuracy/intransitive": 0.803, "blimp/accuracy/existential_there_subject_raising": 0.85, "blimp/accuracy/irregular_past_participle_adjectives": 0.87, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.548, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.314, "blimp/accuracy/only_npi_scope": 0.697, "blimp/accuracy/superlative_quantifiers_2": 0.769, "blimp/accuracy/passive_1": 0.887, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.905, "blimp/accuracy/inchoative": 0.675, "blimp/accuracy/anaphor_gender_agreement": 0.976, "blimp/accuracy/principle_A_c_command": 0.738, "blimp/accuracy/only_npi_licensor_present": 0.515, "blimp/accuracy/expletive_it_object_raising": 0.796, "blimp/accuracy/left_branch_island_simple_question": 0.627, "blimp/accuracy/wh_questions_subject_gap": 0.942, "blimp/accuracy/existential_there_quantifiers_2": 0.501, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.938, "blimp/accuracy/sentential_negation_npi_scope": 0.711, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.845, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.882, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.915, "blimp/accuracy/principle_A_case_2": 0.947, "blimp/accuracy/distractor_agreement_relational_noun": 0.868, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.986, "blimp/accuracy/superlative_quantifiers_1": 0.677, "blimp/accuracy/wh_island": 0.8, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.582, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.979, "blimp/accuracy/irregular_past_participle_verbs": 0.914, "blimp/accuracy/drop_argument": 0.783, "blimp/accuracy/wh_questions_object_gap": 0.812, "blimp/accuracy/animate_subject_passive": 0.8, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.912, "blimp/accuracy/npi_present_2": 0.645, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.958, "blimp/accuracy/anaphor_number_agreement": 0.994, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.958, "blimp/accuracy/existential_there_object_raising": 0.816, "blimp/accuracy/matrix_question_npi_licensor_present": 0.302, "blimp/accuracy/npi_present_1": 0.558, "blimp/accuracy/wh_vs_that_no_gap": 0.976, "blimp/accuracy/left_branch_island_echo_question": 0.436, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.964, "blimp/accuracy/causative": 0.734, "blimp/accuracy/group_average": 0.7911641791044776, "blimp/accuracy/seq_average": 0.7911641791044776, "cbt/accuracy/NE": 0.7920673076923077, "cbt/accuracy/V": 0.9328, "cbt/accuracy/CN": 0.8616, "cbt/accuracy/P": 0.9048, "cbt/accuracy/group_average": 0.8728168269230769, "cbt/accuracy/seq_average": 0.8728491396558623, "hellaswag/accuracy/val": 0.326229834694284, "hellaswag/accuracy/group_average": 0.326229834694284, "hellaswag/accuracy/seq_average": 0.326229834694284, "piqa/accuracy/val": 0.6126224156692056, "piqa/accuracy/group_average": 0.6126224156692056, "piqa/accuracy/seq_average": 0.6126224156692056, "ai2arc/accuracy/ARC-Easy": 0.35306553911205074, "ai2arc/accuracy/ARC-Challenge": 0.2223175965665236, "ai2arc/accuracy/group_average": 0.2876915678392872, "ai2arc/accuracy/seq_average": 0.30991501416430595, "mmlu/accuracy/MMLU": 0.2657847693957812, "mmlu/accuracy/group_average": 0.2657847693957812, "mmlu/accuracy/seq_average": 0.2657847693957812, "openbookqa/accuracy/test": 0.282, "openbookqa/accuracy/group_average": 0.282, "openbookqa/accuracy/seq_average": 0.282, "race/accuracy/test/high": 0.28444825614636937, "race/accuracy/test/middle": 0.3544568245125348, "race/accuracy/group_average": 0.3194525403294521, "race/accuracy/seq_average": 0.30482367247669234, "siqa/accuracy/dev": 0.368474923234391, "siqa/accuracy/group_average": 0.368474923234391, "siqa/accuracy/seq_average": 0.368474923234391, "winogrande/accuracy/dev": 0.4996053670086819, "winogrande/accuracy/group_average": 0.4996053670086819, "winogrande/accuracy/seq_average": 0.4996053670086819, "commonsenseqa/accuracy/dev_rand_split": 0.2571662571662572, "commonsenseqa/accuracy/group_average": 0.2571662571662572, "commonsenseqa/accuracy/seq_average": 0.2571662571662572}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-120000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.415764944893973, "val/accuracy": 0.5039643787202381, "val/perplexity": 11.198333189882343, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4444650211689636, "lambada/accuracy/total": 0.3330745341614907, "lambada/accuracy/openai_last_token": 0.7849378881987578, "lambada/perplexity": 7.579221684881155, "lambada/lm_loss": 2.9939789487138118, "lambada/lm_perplexity": 19.96496422671764, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4185194564408644, "mean_loss": 2.4301149830314683, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.986, "blimp/accuracy/ellipsis_n_bar_1": 0.852, "blimp/accuracy/tough_vs_raising_2": 0.876, "blimp/accuracy/tough_vs_raising_1": 0.657, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.923, "blimp/accuracy/principle_A_reconstruction": 0.466, "blimp/accuracy/wh_vs_that_with_gap": 0.491, "blimp/accuracy/principle_A_domain_2": 0.862, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.902, "blimp/accuracy/principle_A_domain_3": 0.608, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.925, "blimp/accuracy/animate_subject_trans": 0.896, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.916, "blimp/accuracy/distractor_agreement_relative_clause": 0.69, "blimp/accuracy/transitive": 0.88, "blimp/accuracy/sentential_subject_island": 0.377, "blimp/accuracy/adjunct_island": 0.805, "blimp/accuracy/intransitive": 0.789, "blimp/accuracy/existential_there_subject_raising": 0.851, "blimp/accuracy/irregular_past_participle_adjectives": 0.916, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.614, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.359, "blimp/accuracy/only_npi_scope": 0.669, "blimp/accuracy/superlative_quantifiers_2": 0.714, "blimp/accuracy/passive_1": 0.901, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.922, "blimp/accuracy/inchoative": 0.652, "blimp/accuracy/anaphor_gender_agreement": 0.971, "blimp/accuracy/principle_A_c_command": 0.706, "blimp/accuracy/only_npi_licensor_present": 0.616, "blimp/accuracy/expletive_it_object_raising": 0.787, "blimp/accuracy/left_branch_island_simple_question": 0.679, "blimp/accuracy/wh_questions_subject_gap": 0.942, "blimp/accuracy/existential_there_quantifiers_2": 0.382, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.939, "blimp/accuracy/sentential_negation_npi_scope": 0.786, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.796, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.885, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.904, "blimp/accuracy/principle_A_case_2": 0.918, "blimp/accuracy/distractor_agreement_relational_noun": 0.862, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.991, "blimp/accuracy/superlative_quantifiers_1": 0.835, "blimp/accuracy/wh_island": 0.784, "blimp/accuracy/principle_A_domain_1": 0.995, "blimp/accuracy/complex_NP_island": 0.589, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.977, "blimp/accuracy/irregular_past_participle_verbs": 0.917, "blimp/accuracy/drop_argument": 0.733, "blimp/accuracy/wh_questions_object_gap": 0.817, "blimp/accuracy/animate_subject_passive": 0.812, "blimp/accuracy/existential_there_quantifiers_1": 0.961, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.918, "blimp/accuracy/npi_present_2": 0.614, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.952, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.966, "blimp/accuracy/existential_there_object_raising": 0.849, "blimp/accuracy/matrix_question_npi_licensor_present": 0.308, "blimp/accuracy/npi_present_1": 0.563, "blimp/accuracy/wh_vs_that_no_gap": 0.979, "blimp/accuracy/left_branch_island_echo_question": 0.426, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.955, "blimp/accuracy/causative": 0.75, "blimp/accuracy/group_average": 0.7949552238805968, "blimp/accuracy/seq_average": 0.794955223880597, "cbt/accuracy/NE": 0.7912660256410257, "cbt/accuracy/V": 0.9344, "cbt/accuracy/CN": 0.862, "cbt/accuracy/P": 0.9128, "cbt/accuracy/group_average": 0.8751165064102564, "cbt/accuracy/seq_average": 0.8751500600240096, "hellaswag/accuracy/val": 0.3314080860386377, "hellaswag/accuracy/group_average": 0.3314080860386377, "hellaswag/accuracy/seq_average": 0.3314080860386377, "piqa/accuracy/val": 0.6153427638737758, "piqa/accuracy/group_average": 0.6153427638737758, "piqa/accuracy/seq_average": 0.6153427638737758, "ai2arc/accuracy/ARC-Easy": 0.3687103594080338, "ai2arc/accuracy/ARC-Challenge": 0.2317596566523605, "ai2arc/accuracy/group_average": 0.30023500803019715, "ai2arc/accuracy/seq_average": 0.3235127478753541, "mmlu/accuracy/MMLU": 0.26363961387200574, "mmlu/accuracy/group_average": 0.26363961387200574, "mmlu/accuracy/seq_average": 0.26363961387200574, "openbookqa/accuracy/test": 0.292, "openbookqa/accuracy/group_average": 0.292, "openbookqa/accuracy/seq_average": 0.292, "race/accuracy/test/high": 0.2835906232132647, "race/accuracy/test/middle": 0.3593314763231198, "race/accuracy/group_average": 0.32146104976819223, "race/accuracy/seq_average": 0.3056343737332793, "siqa/accuracy/dev": 0.37154554759467756, "siqa/accuracy/group_average": 0.37154554759467756, "siqa/accuracy/seq_average": 0.37154554759467756, "winogrande/accuracy/dev": 0.5074980268350434, "winogrande/accuracy/group_average": 0.5074980268350434, "winogrande/accuracy/seq_average": 0.5074980268350434, "commonsenseqa/accuracy/dev_rand_split": 0.26535626535626533, "commonsenseqa/accuracy/group_average": 0.26535626535626533, "commonsenseqa/accuracy/seq_average": 0.26535626535626533}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-140000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3991331070188493, "val/accuracy": 0.5065792023189484, "val/perplexity": 11.013624607188103, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.305036461871603, "lambada/accuracy/total": 0.30337732919254656, "lambada/accuracy/openai_last_token": 0.7839673913043478, "lambada/perplexity": 8.139248467226396, "lambada/lm_loss": 2.990292564057972, "lambada/lm_perplexity": 19.891501178657855, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.40497826575574747, "mean_loss": 2.352084784445226, "blimp/accuracy/passive_2": 0.906, "blimp/accuracy/determiner_noun_agreement_2": 0.977, "blimp/accuracy/ellipsis_n_bar_1": 0.842, "blimp/accuracy/tough_vs_raising_2": 0.901, "blimp/accuracy/tough_vs_raising_1": 0.599, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.906, "blimp/accuracy/principle_A_reconstruction": 0.43, "blimp/accuracy/wh_vs_that_with_gap": 0.518, "blimp/accuracy/principle_A_domain_2": 0.88, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.892, "blimp/accuracy/principle_A_domain_3": 0.63, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.928, "blimp/accuracy/animate_subject_trans": 0.917, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.889, "blimp/accuracy/distractor_agreement_relative_clause": 0.64, "blimp/accuracy/transitive": 0.869, "blimp/accuracy/sentential_subject_island": 0.386, "blimp/accuracy/adjunct_island": 0.83, "blimp/accuracy/intransitive": 0.74, "blimp/accuracy/existential_there_subject_raising": 0.86, "blimp/accuracy/irregular_past_participle_adjectives": 0.981, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.664, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.345, "blimp/accuracy/only_npi_scope": 0.692, "blimp/accuracy/superlative_quantifiers_2": 0.782, "blimp/accuracy/passive_1": 0.901, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.921, "blimp/accuracy/inchoative": 0.607, "blimp/accuracy/anaphor_gender_agreement": 0.98, "blimp/accuracy/principle_A_c_command": 0.703, "blimp/accuracy/only_npi_licensor_present": 0.674, "blimp/accuracy/expletive_it_object_raising": 0.792, "blimp/accuracy/left_branch_island_simple_question": 0.725, "blimp/accuracy/wh_questions_subject_gap": 0.949, "blimp/accuracy/existential_there_quantifiers_2": 0.476, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.935, "blimp/accuracy/sentential_negation_npi_scope": 0.689, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.782, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.916, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.875, "blimp/accuracy/principle_A_case_2": 0.945, "blimp/accuracy/distractor_agreement_relational_noun": 0.868, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.817, "blimp/accuracy/wh_island": 0.71, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.586, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.978, "blimp/accuracy/irregular_past_participle_verbs": 0.904, "blimp/accuracy/drop_argument": 0.77, "blimp/accuracy/wh_questions_object_gap": 0.86, "blimp/accuracy/animate_subject_passive": 0.794, "blimp/accuracy/existential_there_quantifiers_1": 0.989, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.907, "blimp/accuracy/npi_present_2": 0.531, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.95, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.971, "blimp/accuracy/existential_there_object_raising": 0.87, "blimp/accuracy/matrix_question_npi_licensor_present": 0.349, "blimp/accuracy/npi_present_1": 0.528, "blimp/accuracy/wh_vs_that_no_gap": 0.988, "blimp/accuracy/left_branch_island_echo_question": 0.487, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.968, "blimp/accuracy/causative": 0.707, "blimp/accuracy/group_average": 0.7966119402985075, "blimp/accuracy/seq_average": 0.7966119402985075, "cbt/accuracy/NE": 0.796875, "cbt/accuracy/V": 0.94, "cbt/accuracy/CN": 0.8696, "cbt/accuracy/P": 0.9148, "cbt/accuracy/group_average": 0.88031875, "cbt/accuracy/seq_average": 0.8803521408563425, "hellaswag/accuracy/val": 0.3333001394144593, "hellaswag/accuracy/group_average": 0.3333001394144593, "hellaswag/accuracy/seq_average": 0.3333001394144593, "piqa/accuracy/val": 0.6164309031556039, "piqa/accuracy/group_average": 0.6164309031556039, "piqa/accuracy/seq_average": 0.6164309031556039, "ai2arc/accuracy/ARC-Easy": 0.3758985200845666, "ai2arc/accuracy/ARC-Challenge": 0.23261802575107296, "ai2arc/accuracy/group_average": 0.30425827291781976, "ai2arc/accuracy/seq_average": 0.3286118980169972, "mmlu/accuracy/MMLU": 0.26414015016088666, "mmlu/accuracy/group_average": 0.26414015016088666, "mmlu/accuracy/seq_average": 0.26414015016088666, "openbookqa/accuracy/test": 0.286, "openbookqa/accuracy/group_average": 0.286, "openbookqa/accuracy/seq_average": 0.286, "race/accuracy/test/high": 0.2815894797026873, "race/accuracy/test/middle": 0.36629526462395545, "race/accuracy/group_average": 0.32394237216332133, "race/accuracy/seq_average": 0.3062423996757195, "siqa/accuracy/dev": 0.37615148413510746, "siqa/accuracy/group_average": 0.37615148413510746, "siqa/accuracy/seq_average": 0.37615148413510746, "winogrande/accuracy/dev": 0.500394632991318, "winogrande/accuracy/group_average": 0.500394632991318, "winogrande/accuracy/seq_average": 0.500394632991318, "commonsenseqa/accuracy/dev_rand_split": 0.2710892710892711, "commonsenseqa/accuracy/group_average": 0.2710892710892711, "commonsenseqa/accuracy/seq_average": 0.2710892710892711}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-160000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.377961658296131, "val/accuracy": 0.5092996264260913, "val/perplexity": 10.78290121087871, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4818558781783775, "lambada/accuracy/total": 0.3189052795031056, "lambada/accuracy/openai_last_token": 0.7876552795031055, "lambada/perplexity": 7.543786560384712, "lambada/lm_loss": 2.9683020682516768, "lambada/lm_perplexity": 19.458851728348925, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.41410245296459847, "mean_loss": 2.4299087682372544, "blimp/accuracy/passive_2": 0.902, "blimp/accuracy/determiner_noun_agreement_2": 0.99, "blimp/accuracy/ellipsis_n_bar_1": 0.884, "blimp/accuracy/tough_vs_raising_2": 0.899, "blimp/accuracy/tough_vs_raising_1": 0.612, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.908, "blimp/accuracy/principle_A_reconstruction": 0.485, "blimp/accuracy/wh_vs_that_with_gap": 0.429, "blimp/accuracy/principle_A_domain_2": 0.885, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.901, "blimp/accuracy/principle_A_domain_3": 0.632, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.909, "blimp/accuracy/animate_subject_trans": 0.903, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.908, "blimp/accuracy/distractor_agreement_relative_clause": 0.677, "blimp/accuracy/transitive": 0.885, "blimp/accuracy/sentential_subject_island": 0.384, "blimp/accuracy/adjunct_island": 0.864, "blimp/accuracy/intransitive": 0.768, "blimp/accuracy/existential_there_subject_raising": 0.868, "blimp/accuracy/irregular_past_participle_adjectives": 0.93, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.796, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.321, "blimp/accuracy/only_npi_scope": 0.673, "blimp/accuracy/superlative_quantifiers_2": 0.785, "blimp/accuracy/passive_1": 0.882, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.929, "blimp/accuracy/inchoative": 0.627, "blimp/accuracy/anaphor_gender_agreement": 0.983, "blimp/accuracy/principle_A_c_command": 0.701, "blimp/accuracy/only_npi_licensor_present": 0.628, "blimp/accuracy/expletive_it_object_raising": 0.771, "blimp/accuracy/left_branch_island_simple_question": 0.833, "blimp/accuracy/wh_questions_subject_gap": 0.946, "blimp/accuracy/existential_there_quantifiers_2": 0.547, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.933, "blimp/accuracy/sentential_negation_npi_scope": 0.675, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.808, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.911, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.895, "blimp/accuracy/principle_A_case_2": 0.959, "blimp/accuracy/distractor_agreement_relational_noun": 0.87, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.973, "blimp/accuracy/superlative_quantifiers_1": 0.778, "blimp/accuracy/wh_island": 0.815, "blimp/accuracy/principle_A_domain_1": 0.991, "blimp/accuracy/complex_NP_island": 0.588, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.971, "blimp/accuracy/irregular_past_participle_verbs": 0.928, "blimp/accuracy/drop_argument": 0.711, "blimp/accuracy/wh_questions_object_gap": 0.854, "blimp/accuracy/animate_subject_passive": 0.777, "blimp/accuracy/existential_there_quantifiers_1": 0.975, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.926, "blimp/accuracy/npi_present_2": 0.587, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.958, "blimp/accuracy/anaphor_number_agreement": 0.99, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.969, "blimp/accuracy/existential_there_object_raising": 0.868, "blimp/accuracy/matrix_question_npi_licensor_present": 0.357, "blimp/accuracy/npi_present_1": 0.544, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.439, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.746, "blimp/accuracy/group_average": 0.8028059701492538, "blimp/accuracy/seq_average": 0.8028059701492537, "cbt/accuracy/NE": 0.8112980769230769, "cbt/accuracy/V": 0.938, "cbt/accuracy/CN": 0.8724, "cbt/accuracy/P": 0.9116, "cbt/accuracy/group_average": 0.8833245192307692, "cbt/accuracy/seq_average": 0.8833533413365346, "hellaswag/accuracy/val": 0.34624576777534355, "hellaswag/accuracy/group_average": 0.34624576777534355, "hellaswag/accuracy/seq_average": 0.34624576777534355, "piqa/accuracy/val": 0.6196953210010882, "piqa/accuracy/group_average": 0.6196953210010882, "piqa/accuracy/seq_average": 0.6196953210010882, "ai2arc/accuracy/ARC-Easy": 0.3649048625792812, "ai2arc/accuracy/ARC-Challenge": 0.23004291845493563, "ai2arc/accuracy/group_average": 0.2974738905171084, "ai2arc/accuracy/seq_average": 0.32039660056657226, "mmlu/accuracy/MMLU": 0.26442617089739007, "mmlu/accuracy/group_average": 0.26442617089739007, "mmlu/accuracy/seq_average": 0.26442617089739007, "openbookqa/accuracy/test": 0.296, "openbookqa/accuracy/group_average": 0.296, "openbookqa/accuracy/seq_average": 0.296, "race/accuracy/test/high": 0.28187535734705543, "race/accuracy/test/middle": 0.3551532033426184, "race/accuracy/group_average": 0.3185142803448369, "race/accuracy/seq_average": 0.30320226996351846, "siqa/accuracy/dev": 0.36898669396110545, "siqa/accuracy/group_average": 0.36898669396110545, "siqa/accuracy/seq_average": 0.36898669396110545, "winogrande/accuracy/dev": 0.5082872928176796, "winogrande/accuracy/group_average": 0.5082872928176796, "winogrande/accuracy/seq_average": 0.5082872928176796, "commonsenseqa/accuracy/dev_rand_split": 0.27764127764127766, "commonsenseqa/accuracy/group_average": 0.27764127764127766, "commonsenseqa/accuracy/seq_average": 0.27764127764127766}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-180000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3576752193390376, "val/accuracy": 0.5118204752604166, "val/perplexity": 10.566358411105712, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4043690936165567, "lambada/accuracy/total": 0.3373447204968944, "lambada/accuracy/openai_last_token": 0.7917313664596274, "lambada/perplexity": 7.412574033230972, "lambada/lm_loss": 2.9610225389355556, "lambada/lm_perplexity": 19.317714775282976, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4245825978786555, "mean_loss": 2.381022156477797, "blimp/accuracy/passive_2": 0.904, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.857, "blimp/accuracy/tough_vs_raising_2": 0.872, "blimp/accuracy/tough_vs_raising_1": 0.609, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.897, "blimp/accuracy/principle_A_reconstruction": 0.432, "blimp/accuracy/wh_vs_that_with_gap": 0.424, "blimp/accuracy/principle_A_domain_2": 0.908, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.886, "blimp/accuracy/principle_A_domain_3": 0.627, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.917, "blimp/accuracy/animate_subject_trans": 0.91, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.926, "blimp/accuracy/distractor_agreement_relative_clause": 0.646, "blimp/accuracy/transitive": 0.88, "blimp/accuracy/sentential_subject_island": 0.347, "blimp/accuracy/adjunct_island": 0.867, "blimp/accuracy/intransitive": 0.765, "blimp/accuracy/existential_there_subject_raising": 0.874, "blimp/accuracy/irregular_past_participle_adjectives": 0.954, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.692, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.299, "blimp/accuracy/only_npi_scope": 0.757, "blimp/accuracy/superlative_quantifiers_2": 0.726, "blimp/accuracy/passive_1": 0.897, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.93, "blimp/accuracy/inchoative": 0.611, "blimp/accuracy/anaphor_gender_agreement": 0.972, "blimp/accuracy/principle_A_c_command": 0.697, "blimp/accuracy/only_npi_licensor_present": 0.731, "blimp/accuracy/expletive_it_object_raising": 0.793, "blimp/accuracy/left_branch_island_simple_question": 0.757, "blimp/accuracy/wh_questions_subject_gap": 0.956, "blimp/accuracy/existential_there_quantifiers_2": 0.506, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.937, "blimp/accuracy/sentential_negation_npi_scope": 0.731, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.773, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.917, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.889, "blimp/accuracy/principle_A_case_2": 0.945, "blimp/accuracy/distractor_agreement_relational_noun": 0.845, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.982, "blimp/accuracy/superlative_quantifiers_1": 0.642, "blimp/accuracy/wh_island": 0.732, "blimp/accuracy/principle_A_domain_1": 0.994, "blimp/accuracy/complex_NP_island": 0.575, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.978, "blimp/accuracy/irregular_past_participle_verbs": 0.911, "blimp/accuracy/drop_argument": 0.756, "blimp/accuracy/wh_questions_object_gap": 0.846, "blimp/accuracy/animate_subject_passive": 0.799, "blimp/accuracy/existential_there_quantifiers_1": 0.984, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.9, "blimp/accuracy/npi_present_2": 0.584, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.932, "blimp/accuracy/anaphor_number_agreement": 0.993, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.959, "blimp/accuracy/existential_there_object_raising": 0.862, "blimp/accuracy/matrix_question_npi_licensor_present": 0.42, "blimp/accuracy/npi_present_1": 0.559, "blimp/accuracy/wh_vs_that_no_gap": 0.987, "blimp/accuracy/left_branch_island_echo_question": 0.468, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.965, "blimp/accuracy/causative": 0.723, "blimp/accuracy/group_average": 0.7968656716417915, "blimp/accuracy/seq_average": 0.796865671641791, "cbt/accuracy/NE": 0.8112980769230769, "cbt/accuracy/V": 0.934, "cbt/accuracy/CN": 0.87, "cbt/accuracy/P": 0.9172, "cbt/accuracy/group_average": 0.8831245192307693, "cbt/accuracy/seq_average": 0.8831532613045218, "hellaswag/accuracy/val": 0.3476399123680542, "hellaswag/accuracy/group_average": 0.3476399123680542, "hellaswag/accuracy/seq_average": 0.3476399123680542, "piqa/accuracy/val": 0.6256800870511425, "piqa/accuracy/group_average": 0.6256800870511425, "piqa/accuracy/seq_average": 0.6256800870511425, "ai2arc/accuracy/ARC-Easy": 0.37293868921775897, "ai2arc/accuracy/ARC-Challenge": 0.24034334763948498, "ai2arc/accuracy/group_average": 0.30664101842862196, "ai2arc/accuracy/seq_average": 0.3291784702549575, "mmlu/accuracy/MMLU": 0.26428316052913836, "mmlu/accuracy/group_average": 0.26428316052913836, "mmlu/accuracy/seq_average": 0.26428316052913836, "openbookqa/accuracy/test": 0.288, "openbookqa/accuracy/group_average": 0.288, "openbookqa/accuracy/seq_average": 0.288, "race/accuracy/test/high": 0.2878787878787879, "race/accuracy/test/middle": 0.3593314763231198, "race/accuracy/group_average": 0.3236051321009539, "race/accuracy/seq_average": 0.30867450344548036, "siqa/accuracy/dev": 0.37154554759467756, "siqa/accuracy/group_average": 0.37154554759467756, "siqa/accuracy/seq_average": 0.37154554759467756, "winogrande/accuracy/dev": 0.5067087608524072, "winogrande/accuracy/group_average": 0.5067087608524072, "winogrande/accuracy/seq_average": 0.5067087608524072, "commonsenseqa/accuracy/dev_rand_split": 0.2784602784602785, "commonsenseqa/accuracy/group_average": 0.2784602784602785, "commonsenseqa/accuracy/seq_average": 0.2784602784602785}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-20000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.719937279110863, "val/accuracy": 0.4622599283854167, "val/perplexity": 15.179370151503694, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5517911733307455, "lambada/accuracy/total": 0.21312111801242237, "lambada/accuracy/openai_last_token": 0.7428183229813664, "lambada/perplexity": 14.654807495785061, "lambada/lm_loss": 3.271749276604304, "lambada/lm_perplexity": 26.357405433026777, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3376905231989195, "mean_loss": 2.6358642262208045, "blimp/accuracy/passive_2": 0.878, "blimp/accuracy/determiner_noun_agreement_2": 0.989, "blimp/accuracy/ellipsis_n_bar_1": 0.781, "blimp/accuracy/tough_vs_raising_2": 0.913, "blimp/accuracy/tough_vs_raising_1": 0.544, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.883, "blimp/accuracy/principle_A_reconstruction": 0.382, "blimp/accuracy/wh_vs_that_with_gap": 0.502, "blimp/accuracy/principle_A_domain_2": 0.847, "blimp/accuracy/determiner_noun_agreement_1": 0.987, "blimp/accuracy/ellipsis_n_bar_2": 0.881, "blimp/accuracy/principle_A_domain_3": 0.593, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.919, "blimp/accuracy/animate_subject_trans": 0.878, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.87, "blimp/accuracy/distractor_agreement_relative_clause": 0.505, "blimp/accuracy/transitive": 0.854, "blimp/accuracy/sentential_subject_island": 0.32, "blimp/accuracy/adjunct_island": 0.757, "blimp/accuracy/intransitive": 0.794, "blimp/accuracy/existential_there_subject_raising": 0.833, "blimp/accuracy/irregular_past_participle_adjectives": 0.951, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.259, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.248, "blimp/accuracy/only_npi_scope": 0.617, "blimp/accuracy/superlative_quantifiers_2": 0.604, "blimp/accuracy/passive_1": 0.875, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.875, "blimp/accuracy/inchoative": 0.614, "blimp/accuracy/anaphor_gender_agreement": 0.966, "blimp/accuracy/principle_A_c_command": 0.603, "blimp/accuracy/only_npi_licensor_present": 0.557, "blimp/accuracy/expletive_it_object_raising": 0.742, "blimp/accuracy/left_branch_island_simple_question": 0.362, "blimp/accuracy/wh_questions_subject_gap": 0.931, "blimp/accuracy/existential_there_quantifiers_2": 0.354, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.93, "blimp/accuracy/sentential_negation_npi_scope": 0.625, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.802, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.895, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.851, "blimp/accuracy/principle_A_case_2": 0.929, "blimp/accuracy/distractor_agreement_relational_noun": 0.766, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.999, "blimp/accuracy/superlative_quantifiers_1": 0.653, "blimp/accuracy/wh_island": 0.779, "blimp/accuracy/principle_A_domain_1": 0.986, "blimp/accuracy/complex_NP_island": 0.493, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.961, "blimp/accuracy/irregular_past_participle_verbs": 0.853, "blimp/accuracy/drop_argument": 0.775, "blimp/accuracy/wh_questions_object_gap": 0.766, "blimp/accuracy/animate_subject_passive": 0.786, "blimp/accuracy/existential_there_quantifiers_1": 0.966, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.864, "blimp/accuracy/npi_present_2": 0.574, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.931, "blimp/accuracy/anaphor_number_agreement": 0.984, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.947, "blimp/accuracy/existential_there_object_raising": 0.789, "blimp/accuracy/matrix_question_npi_licensor_present": 0.201, "blimp/accuracy/npi_present_1": 0.528, "blimp/accuracy/wh_vs_that_no_gap": 0.984, "blimp/accuracy/left_branch_island_echo_question": 0.354, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.965, "blimp/accuracy/causative": 0.684, "blimp/accuracy/group_average": 0.7475820895522388, "blimp/accuracy/seq_average": 0.7475820895522388, "cbt/accuracy/NE": 0.7439903846153846, "cbt/accuracy/V": 0.9032, "cbt/accuracy/CN": 0.8016, "cbt/accuracy/P": 0.8804, "cbt/accuracy/group_average": 0.8322975961538461, "cbt/accuracy/seq_average": 0.8323329331732693, "hellaswag/accuracy/val": 0.2892850029874527, "hellaswag/accuracy/group_average": 0.2892850029874527, "hellaswag/accuracy/seq_average": 0.2892850029874527, "piqa/accuracy/val": 0.5685527747551686, "piqa/accuracy/group_average": 0.5685527747551686, "piqa/accuracy/seq_average": 0.5685527747551686, "ai2arc/accuracy/ARC-Easy": 0.3357293868921776, "ai2arc/accuracy/ARC-Challenge": 0.20686695278969958, "ai2arc/accuracy/group_average": 0.2712981698409386, "ai2arc/accuracy/seq_average": 0.29320113314447593, "mmlu/accuracy/MMLU": 0.25827672506256705, "mmlu/accuracy/group_average": 0.25827672506256705, "mmlu/accuracy/seq_average": 0.25827672506256705, "openbookqa/accuracy/test": 0.274, "openbookqa/accuracy/group_average": 0.274, "openbookqa/accuracy/seq_average": 0.274, "race/accuracy/test/high": 0.2667238421955403, "race/accuracy/test/middle": 0.3398328690807799, "race/accuracy/group_average": 0.3032783556381601, "race/accuracy/seq_average": 0.2880016214025132, "siqa/accuracy/dev": 0.36591606960081885, "siqa/accuracy/group_average": 0.36591606960081885, "siqa/accuracy/seq_average": 0.36591606960081885, "winogrande/accuracy/dev": 0.5035516969218626, "winogrande/accuracy/group_average": 0.5035516969218626, "winogrande/accuracy/seq_average": 0.5035516969218626, "commonsenseqa/accuracy/dev_rand_split": 0.25225225225225223, "commonsenseqa/accuracy/group_average": 0.25225225225225223, "commonsenseqa/accuracy/seq_average": 0.25225225225225223}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-200000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3439253549727184, "val/accuracy": 0.5150601826016865, "val/perplexity": 10.42206668523126, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.420701684418672, "lambada/accuracy/total": 0.33676242236024845, "lambada/accuracy/openai_last_token": 0.7913431677018633, "lambada/perplexity": 7.350908743059603, "lambada/lm_loss": 2.9507299958303914, "lambada/lm_perplexity": 19.119906086751083, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4259113024809675, "mean_loss": 2.3823135196956953, "blimp/accuracy/passive_2": 0.913, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.829, "blimp/accuracy/tough_vs_raising_2": 0.879, "blimp/accuracy/tough_vs_raising_1": 0.636, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.912, "blimp/accuracy/principle_A_reconstruction": 0.465, "blimp/accuracy/wh_vs_that_with_gap": 0.473, "blimp/accuracy/principle_A_domain_2": 0.842, "blimp/accuracy/determiner_noun_agreement_1": 0.995, "blimp/accuracy/ellipsis_n_bar_2": 0.908, "blimp/accuracy/principle_A_domain_3": 0.634, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.927, "blimp/accuracy/animate_subject_trans": 0.904, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.91, "blimp/accuracy/distractor_agreement_relative_clause": 0.684, "blimp/accuracy/transitive": 0.876, "blimp/accuracy/sentential_subject_island": 0.388, "blimp/accuracy/adjunct_island": 0.829, "blimp/accuracy/intransitive": 0.766, "blimp/accuracy/existential_there_subject_raising": 0.895, "blimp/accuracy/irregular_past_participle_adjectives": 0.951, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.676, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.355, "blimp/accuracy/only_npi_scope": 0.669, "blimp/accuracy/superlative_quantifiers_2": 0.794, "blimp/accuracy/passive_1": 0.901, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.932, "blimp/accuracy/inchoative": 0.633, "blimp/accuracy/anaphor_gender_agreement": 0.973, "blimp/accuracy/principle_A_c_command": 0.701, "blimp/accuracy/only_npi_licensor_present": 0.898, "blimp/accuracy/expletive_it_object_raising": 0.8, "blimp/accuracy/left_branch_island_simple_question": 0.754, "blimp/accuracy/wh_questions_subject_gap": 0.947, "blimp/accuracy/existential_there_quantifiers_2": 0.462, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.941, "blimp/accuracy/sentential_negation_npi_scope": 0.758, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.84, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.87, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.89, "blimp/accuracy/principle_A_case_2": 0.961, "blimp/accuracy/distractor_agreement_relational_noun": 0.882, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.99, "blimp/accuracy/superlative_quantifiers_1": 0.749, "blimp/accuracy/wh_island": 0.758, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.597, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.976, "blimp/accuracy/irregular_past_participle_verbs": 0.903, "blimp/accuracy/drop_argument": 0.753, "blimp/accuracy/wh_questions_object_gap": 0.828, "blimp/accuracy/animate_subject_passive": 0.797, "blimp/accuracy/existential_there_quantifiers_1": 0.98, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.904, "blimp/accuracy/npi_present_2": 0.567, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.937, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.974, "blimp/accuracy/existential_there_object_raising": 0.865, "blimp/accuracy/matrix_question_npi_licensor_present": 0.367, "blimp/accuracy/npi_present_1": 0.541, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.52, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.969, "blimp/accuracy/causative": 0.764, "blimp/accuracy/group_average": 0.8050149253731341, "blimp/accuracy/seq_average": 0.8050149253731343, "cbt/accuracy/NE": 0.8108974358974359, "cbt/accuracy/V": 0.9396, "cbt/accuracy/CN": 0.8792, "cbt/accuracy/P": 0.9232, "cbt/accuracy/group_average": 0.8882243589743589, "cbt/accuracy/seq_average": 0.8882553021208484, "hellaswag/accuracy/val": 0.35391356303525195, "hellaswag/accuracy/group_average": 0.35391356303525195, "hellaswag/accuracy/seq_average": 0.35391356303525195, "piqa/accuracy/val": 0.6196953210010882, "piqa/accuracy/group_average": 0.6196953210010882, "piqa/accuracy/seq_average": 0.6196953210010882, "ai2arc/accuracy/ARC-Easy": 0.3813953488372093, "ai2arc/accuracy/ARC-Challenge": 0.23862660944206007, "ai2arc/accuracy/group_average": 0.3100109791396347, "ai2arc/accuracy/seq_average": 0.3342776203966006, "mmlu/accuracy/MMLU": 0.2622810153736146, "mmlu/accuracy/group_average": 0.2622810153736146, "mmlu/accuracy/seq_average": 0.2622810153736146, "openbookqa/accuracy/test": 0.286, "openbookqa/accuracy/group_average": 0.286, "openbookqa/accuracy/seq_average": 0.286, "race/accuracy/test/high": 0.2864493996569468, "race/accuracy/test/middle": 0.35863509749303624, "race/accuracy/group_average": 0.32254224857499153, "race/accuracy/seq_average": 0.3074584515605999, "siqa/accuracy/dev": 0.37871033776867963, "siqa/accuracy/group_average": 0.37871033776867963, "siqa/accuracy/seq_average": 0.37871033776867963, "winogrande/accuracy/dev": 0.5011838989739542, "winogrande/accuracy/group_average": 0.5011838989739542, "winogrande/accuracy/seq_average": 0.5011838989739542, "commonsenseqa/accuracy/dev_rand_split": 0.27764127764127766, "commonsenseqa/accuracy/group_average": 0.27764127764127766, "commonsenseqa/accuracy/seq_average": 0.27764127764127766}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-220000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3285449800037203, "val/accuracy": 0.5164698040674603, "val/perplexity": 10.262997796946124, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.421502533906735, "lambada/accuracy/total": 0.3359860248447205, "lambada/accuracy/openai_last_token": 0.7938664596273292, "lambada/perplexity": 7.3488881303397084, "lambada/lm_loss": 2.928779310359278, "lambada/lm_perplexity": 18.704783819318493, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.42622791445609043, "mean_loss": 2.375023756955228, "blimp/accuracy/passive_2": 0.909, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.86, "blimp/accuracy/tough_vs_raising_2": 0.886, "blimp/accuracy/tough_vs_raising_1": 0.637, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.907, "blimp/accuracy/principle_A_reconstruction": 0.475, "blimp/accuracy/wh_vs_that_with_gap": 0.459, "blimp/accuracy/principle_A_domain_2": 0.854, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.896, "blimp/accuracy/principle_A_domain_3": 0.615, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.908, "blimp/accuracy/animate_subject_trans": 0.905, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.9, "blimp/accuracy/distractor_agreement_relative_clause": 0.665, "blimp/accuracy/transitive": 0.873, "blimp/accuracy/sentential_subject_island": 0.416, "blimp/accuracy/adjunct_island": 0.819, "blimp/accuracy/intransitive": 0.798, "blimp/accuracy/existential_there_subject_raising": 0.885, "blimp/accuracy/irregular_past_participle_adjectives": 0.969, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.677, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.36, "blimp/accuracy/only_npi_scope": 0.76, "blimp/accuracy/superlative_quantifiers_2": 0.791, "blimp/accuracy/passive_1": 0.907, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.93, "blimp/accuracy/inchoative": 0.659, "blimp/accuracy/anaphor_gender_agreement": 0.974, "blimp/accuracy/principle_A_c_command": 0.647, "blimp/accuracy/only_npi_licensor_present": 0.647, "blimp/accuracy/expletive_it_object_raising": 0.791, "blimp/accuracy/left_branch_island_simple_question": 0.734, "blimp/accuracy/wh_questions_subject_gap": 0.937, "blimp/accuracy/existential_there_quantifiers_2": 0.547, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.926, "blimp/accuracy/sentential_negation_npi_scope": 0.739, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.819, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.899, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.898, "blimp/accuracy/principle_A_case_2": 0.931, "blimp/accuracy/distractor_agreement_relational_noun": 0.854, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.985, "blimp/accuracy/superlative_quantifiers_1": 0.72, "blimp/accuracy/wh_island": 0.742, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.538, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.964, "blimp/accuracy/irregular_past_participle_verbs": 0.913, "blimp/accuracy/drop_argument": 0.748, "blimp/accuracy/wh_questions_object_gap": 0.837, "blimp/accuracy/animate_subject_passive": 0.786, "blimp/accuracy/existential_there_quantifiers_1": 0.983, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.91, "blimp/accuracy/npi_present_2": 0.574, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.922, "blimp/accuracy/anaphor_number_agreement": 0.993, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.961, "blimp/accuracy/existential_there_object_raising": 0.862, "blimp/accuracy/matrix_question_npi_licensor_present": 0.424, "blimp/accuracy/npi_present_1": 0.546, "blimp/accuracy/wh_vs_that_no_gap": 0.981, "blimp/accuracy/left_branch_island_echo_question": 0.465, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.968, "blimp/accuracy/causative": 0.755, "blimp/accuracy/group_average": 0.8001044776119399, "blimp/accuracy/seq_average": 0.8001044776119403, "cbt/accuracy/NE": 0.8064903846153846, "cbt/accuracy/V": 0.94, "cbt/accuracy/CN": 0.8824, "cbt/accuracy/P": 0.92, "cbt/accuracy/group_average": 0.8872225961538461, "cbt/accuracy/seq_average": 0.8872549019607843, "hellaswag/accuracy/val": 0.35391356303525195, "hellaswag/accuracy/group_average": 0.35391356303525195, "hellaswag/accuracy/seq_average": 0.35391356303525195, "piqa/accuracy/val": 0.6284004352557128, "piqa/accuracy/group_average": 0.6284004352557128, "piqa/accuracy/seq_average": 0.6284004352557128, "ai2arc/accuracy/ARC-Easy": 0.3839323467230444, "ai2arc/accuracy/ARC-Challenge": 0.2317596566523605, "ai2arc/accuracy/group_average": 0.30784600168770243, "ai2arc/accuracy/seq_average": 0.3337110481586402, "mmlu/accuracy/MMLU": 0.26313907758312477, "mmlu/accuracy/group_average": 0.26313907758312477, "mmlu/accuracy/seq_average": 0.26313907758312477, "openbookqa/accuracy/test": 0.278, "openbookqa/accuracy/group_average": 0.278, "openbookqa/accuracy/seq_average": 0.278, "race/accuracy/test/high": 0.28959405374499714, "race/accuracy/test/middle": 0.3628133704735376, "race/accuracy/group_average": 0.32620371210926735, "race/accuracy/seq_average": 0.31090393190109444, "siqa/accuracy/dev": 0.36745138178096215, "siqa/accuracy/group_average": 0.36745138178096215, "siqa/accuracy/seq_average": 0.36745138178096215, "winogrande/accuracy/dev": 0.5098658247829518, "winogrande/accuracy/group_average": 0.5098658247829518, "winogrande/accuracy/seq_average": 0.5098658247829518, "commonsenseqa/accuracy/dev_rand_split": 0.2719082719082719, "commonsenseqa/accuracy/group_average": 0.2719082719082719, "commonsenseqa/accuracy/seq_average": 0.2719082719082719}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-240000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3154403444320435, "val/accuracy": 0.5187891400049603, "val/perplexity": 10.129382353947117, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3017707019118787, "lambada/accuracy/total": 0.3641304347826087, "lambada/accuracy/openai_last_token": 0.8012422360248447, "lambada/perplexity": 6.809268412460182, "lambada/lm_loss": 2.916193744329646, "lambada/lm_perplexity": 18.47084871818785, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.44145978739378455, "mean_loss": 2.308605523171961, "blimp/accuracy/passive_2": 0.911, "blimp/accuracy/determiner_noun_agreement_2": 0.99, "blimp/accuracy/ellipsis_n_bar_1": 0.87, "blimp/accuracy/tough_vs_raising_2": 0.892, "blimp/accuracy/tough_vs_raising_1": 0.596, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.898, "blimp/accuracy/principle_A_reconstruction": 0.447, "blimp/accuracy/wh_vs_that_with_gap": 0.436, "blimp/accuracy/principle_A_domain_2": 0.894, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.892, "blimp/accuracy/principle_A_domain_3": 0.637, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.925, "blimp/accuracy/animate_subject_trans": 0.902, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.921, "blimp/accuracy/distractor_agreement_relative_clause": 0.716, "blimp/accuracy/transitive": 0.877, "blimp/accuracy/sentential_subject_island": 0.338, "blimp/accuracy/adjunct_island": 0.859, "blimp/accuracy/intransitive": 0.765, "blimp/accuracy/existential_there_subject_raising": 0.881, "blimp/accuracy/irregular_past_participle_adjectives": 0.904, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.737, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.314, "blimp/accuracy/only_npi_scope": 0.688, "blimp/accuracy/superlative_quantifiers_2": 0.766, "blimp/accuracy/passive_1": 0.904, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.927, "blimp/accuracy/inchoative": 0.628, "blimp/accuracy/anaphor_gender_agreement": 0.971, "blimp/accuracy/principle_A_c_command": 0.744, "blimp/accuracy/only_npi_licensor_present": 0.632, "blimp/accuracy/expletive_it_object_raising": 0.789, "blimp/accuracy/left_branch_island_simple_question": 0.779, "blimp/accuracy/wh_questions_subject_gap": 0.937, "blimp/accuracy/existential_there_quantifiers_2": 0.498, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.935, "blimp/accuracy/sentential_negation_npi_scope": 0.74, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.794, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.894, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.907, "blimp/accuracy/principle_A_case_2": 0.966, "blimp/accuracy/distractor_agreement_relational_noun": 0.88, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.99, "blimp/accuracy/superlative_quantifiers_1": 0.852, "blimp/accuracy/wh_island": 0.786, "blimp/accuracy/principle_A_domain_1": 0.994, "blimp/accuracy/complex_NP_island": 0.577, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.981, "blimp/accuracy/irregular_past_participle_verbs": 0.944, "blimp/accuracy/drop_argument": 0.745, "blimp/accuracy/wh_questions_object_gap": 0.843, "blimp/accuracy/animate_subject_passive": 0.779, "blimp/accuracy/existential_there_quantifiers_1": 0.967, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.909, "blimp/accuracy/npi_present_2": 0.555, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.956, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.965, "blimp/accuracy/existential_there_object_raising": 0.859, "blimp/accuracy/matrix_question_npi_licensor_present": 0.396, "blimp/accuracy/npi_present_1": 0.497, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.512, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.971, "blimp/accuracy/causative": 0.752, "blimp/accuracy/group_average": 0.8025970149253734, "blimp/accuracy/seq_average": 0.8025970149253732, "cbt/accuracy/NE": 0.8068910256410257, "cbt/accuracy/V": 0.9392, "cbt/accuracy/CN": 0.884, "cbt/accuracy/P": 0.92, "cbt/accuracy/group_average": 0.8875227564102564, "cbt/accuracy/seq_average": 0.8875550220088035, "hellaswag/accuracy/val": 0.35550687114120694, "hellaswag/accuracy/group_average": 0.35550687114120694, "hellaswag/accuracy/seq_average": 0.35550687114120694, "piqa/accuracy/val": 0.6360174102285092, "piqa/accuracy/group_average": 0.6360174102285092, "piqa/accuracy/seq_average": 0.6360174102285092, "ai2arc/accuracy/ARC-Easy": 0.38097251585623676, "ai2arc/accuracy/ARC-Challenge": 0.24034334763948498, "ai2arc/accuracy/group_average": 0.3106579317478609, "ai2arc/accuracy/seq_average": 0.33456090651558074, "mmlu/accuracy/MMLU": 0.2627815516624955, "mmlu/accuracy/group_average": 0.2627815516624955, "mmlu/accuracy/seq_average": 0.2627815516624955, "openbookqa/accuracy/test": 0.28, "openbookqa/accuracy/group_average": 0.28, "openbookqa/accuracy/seq_average": 0.28, "race/accuracy/test/high": 0.2933104631217839, "race/accuracy/test/middle": 0.3607242339832869, "race/accuracy/group_average": 0.3270173485525354, "race/accuracy/seq_average": 0.31293068504256183, "siqa/accuracy/dev": 0.37563971340839303, "siqa/accuracy/group_average": 0.37563971340839303, "siqa/accuracy/seq_average": 0.37563971340839303, "winogrande/accuracy/dev": 0.5090765588003157, "winogrande/accuracy/group_average": 0.5090765588003157, "winogrande/accuracy/seq_average": 0.5090765588003157, "commonsenseqa/accuracy/dev_rand_split": 0.28173628173628174, "commonsenseqa/accuracy/group_average": 0.28173628173628174, "commonsenseqa/accuracy/seq_average": 0.28173628173628174}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-260000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3030453636532737, "val/accuracy": 0.5210716610863095, "val/perplexity": 10.00460376600021, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3419998832370923, "lambada/accuracy/total": 0.3631599378881988, "lambada/accuracy/openai_last_token": 0.8002717391304348, "lambada/perplexity": 6.64826169514925, "lambada/lm_loss": 2.8884980608176205, "lambada/lm_perplexity": 17.96630502952793, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.44211579948725416, "mean_loss": 2.322522623445183, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.987, "blimp/accuracy/ellipsis_n_bar_1": 0.841, "blimp/accuracy/tough_vs_raising_2": 0.891, "blimp/accuracy/tough_vs_raising_1": 0.615, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.911, "blimp/accuracy/principle_A_reconstruction": 0.443, "blimp/accuracy/wh_vs_that_with_gap": 0.428, "blimp/accuracy/principle_A_domain_2": 0.864, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.905, "blimp/accuracy/principle_A_domain_3": 0.629, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.922, "blimp/accuracy/animate_subject_trans": 0.905, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.93, "blimp/accuracy/distractor_agreement_relative_clause": 0.674, "blimp/accuracy/transitive": 0.882, "blimp/accuracy/sentential_subject_island": 0.337, "blimp/accuracy/adjunct_island": 0.858, "blimp/accuracy/intransitive": 0.784, "blimp/accuracy/existential_there_subject_raising": 0.895, "blimp/accuracy/irregular_past_participle_adjectives": 0.903, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.689, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.339, "blimp/accuracy/only_npi_scope": 0.768, "blimp/accuracy/superlative_quantifiers_2": 0.767, "blimp/accuracy/passive_1": 0.888, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.923, "blimp/accuracy/inchoative": 0.64, "blimp/accuracy/anaphor_gender_agreement": 0.985, "blimp/accuracy/principle_A_c_command": 0.663, "blimp/accuracy/only_npi_licensor_present": 0.645, "blimp/accuracy/expletive_it_object_raising": 0.787, "blimp/accuracy/left_branch_island_simple_question": 0.732, "blimp/accuracy/wh_questions_subject_gap": 0.939, "blimp/accuracy/existential_there_quantifiers_2": 0.438, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.945, "blimp/accuracy/sentential_negation_npi_scope": 0.787, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.806, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.884, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.898, "blimp/accuracy/principle_A_case_2": 0.96, "blimp/accuracy/distractor_agreement_relational_noun": 0.852, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.985, "blimp/accuracy/superlative_quantifiers_1": 0.747, "blimp/accuracy/wh_island": 0.761, "blimp/accuracy/principle_A_domain_1": 0.997, "blimp/accuracy/complex_NP_island": 0.57, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.978, "blimp/accuracy/irregular_past_participle_verbs": 0.905, "blimp/accuracy/drop_argument": 0.756, "blimp/accuracy/wh_questions_object_gap": 0.818, "blimp/accuracy/animate_subject_passive": 0.798, "blimp/accuracy/existential_there_quantifiers_1": 0.971, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.935, "blimp/accuracy/npi_present_2": 0.571, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.961, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.967, "blimp/accuracy/existential_there_object_raising": 0.854, "blimp/accuracy/matrix_question_npi_licensor_present": 0.478, "blimp/accuracy/npi_present_1": 0.552, "blimp/accuracy/wh_vs_that_no_gap": 0.985, "blimp/accuracy/left_branch_island_echo_question": 0.464, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.973, "blimp/accuracy/causative": 0.765, "blimp/accuracy/group_average": 0.8004328358208954, "blimp/accuracy/seq_average": 0.8004328358208955, "cbt/accuracy/NE": 0.8257211538461539, "cbt/accuracy/V": 0.944, "cbt/accuracy/CN": 0.8808, "cbt/accuracy/P": 0.922, "cbt/accuracy/group_average": 0.8931302884615385, "cbt/accuracy/seq_average": 0.8931572629051621, "hellaswag/accuracy/val": 0.3625771758613822, "hellaswag/accuracy/group_average": 0.3625771758613822, "hellaswag/accuracy/seq_average": 0.3625771758613822, "piqa/accuracy/val": 0.6235038084874864, "piqa/accuracy/group_average": 0.6235038084874864, "piqa/accuracy/seq_average": 0.6235038084874864, "ai2arc/accuracy/ARC-Easy": 0.3788583509513742, "ai2arc/accuracy/ARC-Challenge": 0.23948497854077253, "ai2arc/accuracy/group_average": 0.3091716647460734, "ai2arc/accuracy/seq_average": 0.3328611898016997, "mmlu/accuracy/MMLU": 0.26557025384340366, "mmlu/accuracy/group_average": 0.26557025384340366, "mmlu/accuracy/seq_average": 0.26557025384340366, "openbookqa/accuracy/test": 0.288, "openbookqa/accuracy/group_average": 0.288, "openbookqa/accuracy/seq_average": 0.288, "race/accuracy/test/high": 0.28959405374499714, "race/accuracy/test/middle": 0.3753481894150418, "race/accuracy/group_average": 0.33247112158001946, "race/accuracy/seq_average": 0.3145520875557357, "siqa/accuracy/dev": 0.3766632548618219, "siqa/accuracy/group_average": 0.3766632548618219, "siqa/accuracy/seq_average": 0.3766632548618219, "winogrande/accuracy/dev": 0.510655090765588, "winogrande/accuracy/group_average": 0.510655090765588, "winogrande/accuracy/seq_average": 0.510655090765588, "commonsenseqa/accuracy/dev_rand_split": 0.27927927927927926, "commonsenseqa/accuracy/group_average": 0.27927927927927926, "commonsenseqa/accuracy/seq_average": 0.27927927927927926}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-280000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.291588435097346, "val/accuracy": 0.523009285094246, "val/perplexity": 9.890635843225144, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3840223987650426, "lambada/accuracy/total": 0.359277950310559, "lambada/accuracy/openai_last_token": 0.8014363354037267, "lambada/perplexity": 6.682879057379743, "lambada/lm_loss": 2.8841469483282802, "lambada/lm_perplexity": 17.88830143958865, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4411436177024025, "mean_loss": 2.3378054169311944, "blimp/accuracy/passive_2": 0.906, "blimp/accuracy/determiner_noun_agreement_2": 0.99, "blimp/accuracy/ellipsis_n_bar_1": 0.825, "blimp/accuracy/tough_vs_raising_2": 0.9, "blimp/accuracy/tough_vs_raising_1": 0.631, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.891, "blimp/accuracy/principle_A_reconstruction": 0.48, "blimp/accuracy/wh_vs_that_with_gap": 0.444, "blimp/accuracy/principle_A_domain_2": 0.877, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.891, "blimp/accuracy/principle_A_domain_3": 0.624, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.919, "blimp/accuracy/animate_subject_trans": 0.899, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.922, "blimp/accuracy/distractor_agreement_relative_clause": 0.692, "blimp/accuracy/transitive": 0.888, "blimp/accuracy/sentential_subject_island": 0.363, "blimp/accuracy/adjunct_island": 0.829, "blimp/accuracy/intransitive": 0.791, "blimp/accuracy/existential_there_subject_raising": 0.884, "blimp/accuracy/irregular_past_participle_adjectives": 0.911, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.713, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.34, "blimp/accuracy/only_npi_scope": 0.722, "blimp/accuracy/superlative_quantifiers_2": 0.802, "blimp/accuracy/passive_1": 0.897, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.915, "blimp/accuracy/inchoative": 0.631, "blimp/accuracy/anaphor_gender_agreement": 0.978, "blimp/accuracy/principle_A_c_command": 0.707, "blimp/accuracy/only_npi_licensor_present": 0.809, "blimp/accuracy/expletive_it_object_raising": 0.78, "blimp/accuracy/left_branch_island_simple_question": 0.759, "blimp/accuracy/wh_questions_subject_gap": 0.94, "blimp/accuracy/existential_there_quantifiers_2": 0.489, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.94, "blimp/accuracy/sentential_negation_npi_scope": 0.742, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.826, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.89, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.9, "blimp/accuracy/principle_A_case_2": 0.957, "blimp/accuracy/distractor_agreement_relational_noun": 0.881, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.983, "blimp/accuracy/superlative_quantifiers_1": 0.762, "blimp/accuracy/wh_island": 0.686, "blimp/accuracy/principle_A_domain_1": 0.994, "blimp/accuracy/complex_NP_island": 0.554, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.977, "blimp/accuracy/irregular_past_participle_verbs": 0.915, "blimp/accuracy/drop_argument": 0.742, "blimp/accuracy/wh_questions_object_gap": 0.83, "blimp/accuracy/animate_subject_passive": 0.791, "blimp/accuracy/existential_there_quantifiers_1": 0.969, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.916, "blimp/accuracy/npi_present_2": 0.583, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.956, "blimp/accuracy/anaphor_number_agreement": 0.993, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.969, "blimp/accuracy/existential_there_object_raising": 0.858, "blimp/accuracy/matrix_question_npi_licensor_present": 0.424, "blimp/accuracy/npi_present_1": 0.583, "blimp/accuracy/wh_vs_that_no_gap": 0.984, "blimp/accuracy/left_branch_island_echo_question": 0.448, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.975, "blimp/accuracy/causative": 0.755, "blimp/accuracy/group_average": 0.803223880597015, "blimp/accuracy/seq_average": 0.8032238805970149, "cbt/accuracy/NE": 0.8253205128205128, "cbt/accuracy/V": 0.942, "cbt/accuracy/CN": 0.8864, "cbt/accuracy/P": 0.924, "cbt/accuracy/group_average": 0.8944301282051281, "cbt/accuracy/seq_average": 0.8944577831132453, "hellaswag/accuracy/val": 0.3632742481577375, "hellaswag/accuracy/group_average": 0.3632742481577375, "hellaswag/accuracy/seq_average": 0.3632742481577375, "piqa/accuracy/val": 0.6289445048966268, "piqa/accuracy/group_average": 0.6289445048966268, "piqa/accuracy/seq_average": 0.6289445048966268, "ai2arc/accuracy/ARC-Easy": 0.38097251585623676, "ai2arc/accuracy/ARC-Challenge": 0.24120171673819743, "ai2arc/accuracy/group_average": 0.3110871162972171, "ai2arc/accuracy/seq_average": 0.3348441926345609, "mmlu/accuracy/MMLU": 0.26557025384340366, "mmlu/accuracy/group_average": 0.26557025384340366, "mmlu/accuracy/seq_average": 0.26557025384340366, "openbookqa/accuracy/test": 0.288, "openbookqa/accuracy/group_average": 0.288, "openbookqa/accuracy/seq_average": 0.288, "race/accuracy/test/high": 0.29073756432247, "race/accuracy/test/middle": 0.366991643454039, "race/accuracy/group_average": 0.3288646038882545, "race/accuracy/seq_average": 0.31293068504256183, "siqa/accuracy/dev": 0.37308085977482086, "siqa/accuracy/group_average": 0.37308085977482086, "siqa/accuracy/seq_average": 0.37308085977482086, "winogrande/accuracy/dev": 0.516179952644041, "winogrande/accuracy/group_average": 0.516179952644041, "winogrande/accuracy/seq_average": 0.516179952644041, "commonsenseqa/accuracy/dev_rand_split": 0.2858312858312858, "commonsenseqa/accuracy/group_average": 0.2858312858312858, "commonsenseqa/accuracy/seq_average": 0.2858312858312858}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-300000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2825099400111606, "val/accuracy": 0.5242435515873016, "val/perplexity": 9.801250112206336, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3021084803231755, "lambada/accuracy/total": 0.35403726708074534, "lambada/accuracy/openai_last_token": 0.7998835403726708, "lambada/perplexity": 6.787367784785684, "lambada/lm_loss": 2.8800061496491454, "lambada/lm_perplexity": 17.814382731478887, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4391404093340235, "mean_loss": 2.292309210167168, "blimp/accuracy/passive_2": 0.906, "blimp/accuracy/determiner_noun_agreement_2": 0.99, "blimp/accuracy/ellipsis_n_bar_1": 0.84, "blimp/accuracy/tough_vs_raising_2": 0.88, "blimp/accuracy/tough_vs_raising_1": 0.615, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.901, "blimp/accuracy/principle_A_reconstruction": 0.454, "blimp/accuracy/wh_vs_that_with_gap": 0.47, "blimp/accuracy/principle_A_domain_2": 0.867, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.882, "blimp/accuracy/principle_A_domain_3": 0.63, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.92, "blimp/accuracy/animate_subject_trans": 0.905, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.935, "blimp/accuracy/distractor_agreement_relative_clause": 0.654, "blimp/accuracy/transitive": 0.883, "blimp/accuracy/sentential_subject_island": 0.39, "blimp/accuracy/adjunct_island": 0.823, "blimp/accuracy/intransitive": 0.806, "blimp/accuracy/existential_there_subject_raising": 0.887, "blimp/accuracy/irregular_past_participle_adjectives": 0.952, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.739, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.36, "blimp/accuracy/only_npi_scope": 0.782, "blimp/accuracy/superlative_quantifiers_2": 0.765, "blimp/accuracy/passive_1": 0.895, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.929, "blimp/accuracy/inchoative": 0.648, "blimp/accuracy/anaphor_gender_agreement": 0.977, "blimp/accuracy/principle_A_c_command": 0.708, "blimp/accuracy/only_npi_licensor_present": 0.743, "blimp/accuracy/expletive_it_object_raising": 0.79, "blimp/accuracy/left_branch_island_simple_question": 0.811, "blimp/accuracy/wh_questions_subject_gap": 0.933, "blimp/accuracy/existential_there_quantifiers_2": 0.411, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.939, "blimp/accuracy/sentential_negation_npi_scope": 0.73, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.798, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.865, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.901, "blimp/accuracy/principle_A_case_2": 0.959, "blimp/accuracy/distractor_agreement_relational_noun": 0.845, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.984, "blimp/accuracy/superlative_quantifiers_1": 0.757, "blimp/accuracy/wh_island": 0.764, "blimp/accuracy/principle_A_domain_1": 0.993, "blimp/accuracy/complex_NP_island": 0.574, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.974, "blimp/accuracy/irregular_past_participle_verbs": 0.915, "blimp/accuracy/drop_argument": 0.752, "blimp/accuracy/wh_questions_object_gap": 0.837, "blimp/accuracy/animate_subject_passive": 0.781, "blimp/accuracy/existential_there_quantifiers_1": 0.973, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.923, "blimp/accuracy/npi_present_2": 0.525, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.955, "blimp/accuracy/anaphor_number_agreement": 0.986, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.973, "blimp/accuracy/existential_there_object_raising": 0.859, "blimp/accuracy/matrix_question_npi_licensor_present": 0.389, "blimp/accuracy/npi_present_1": 0.489, "blimp/accuracy/wh_vs_that_no_gap": 0.978, "blimp/accuracy/left_branch_island_echo_question": 0.474, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974, "blimp/accuracy/causative": 0.744, "blimp/accuracy/group_average": 0.8012686567164179, "blimp/accuracy/seq_average": 0.8012686567164179, "cbt/accuracy/NE": 0.8205128205128205, "cbt/accuracy/V": 0.9416, "cbt/accuracy/CN": 0.8808, "cbt/accuracy/P": 0.926, "cbt/accuracy/group_average": 0.8922282051282051, "cbt/accuracy/seq_average": 0.8922569027611045, "hellaswag/accuracy/val": 0.37024497112129057, "hellaswag/accuracy/group_average": 0.37024497112129057, "hellaswag/accuracy/seq_average": 0.37024497112129057, "piqa/accuracy/val": 0.6294885745375408, "piqa/accuracy/group_average": 0.6294885745375408, "piqa/accuracy/seq_average": 0.6294885745375408, "ai2arc/accuracy/ARC-Easy": 0.3873150105708245, "ai2arc/accuracy/ARC-Challenge": 0.2463519313304721, "ai2arc/accuracy/group_average": 0.3168334709506483, "ai2arc/accuracy/seq_average": 0.3407932011331445, "mmlu/accuracy/MMLU": 0.26542724347515195, "mmlu/accuracy/group_average": 0.26542724347515195, "mmlu/accuracy/seq_average": 0.26542724347515195, "openbookqa/accuracy/test": 0.278, "openbookqa/accuracy/group_average": 0.278, "openbookqa/accuracy/seq_average": 0.278, "race/accuracy/test/high": 0.28987993138936535, "race/accuracy/test/middle": 0.3732590529247911, "race/accuracy/group_average": 0.3315694921570782, "race/accuracy/seq_average": 0.3141467369274422, "siqa/accuracy/dev": 0.37563971340839303, "siqa/accuracy/group_average": 0.37563971340839303, "siqa/accuracy/seq_average": 0.37563971340839303, "winogrande/accuracy/dev": 0.5043409629044988, "winogrande/accuracy/group_average": 0.5043409629044988, "winogrande/accuracy/seq_average": 0.5043409629044988, "commonsenseqa/accuracy/dev_rand_split": 0.2841932841932842, "commonsenseqa/accuracy/group_average": 0.2841932841932842, "commonsenseqa/accuracy/seq_average": 0.2841932841932842}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-320000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2737576923673113, "val/accuracy": 0.5255146329365079, "val/perplexity": 9.715841448098965, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3382627119929156, "lambada/accuracy/total": 0.3515139751552795, "lambada/accuracy/openai_last_token": 0.7979425465838509, "lambada/perplexity": 6.8425000363609, "lambada/lm_loss": 2.8787246235304225, "lambada/lm_perplexity": 17.791567756834958, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4385143040458937, "mean_loss": 2.306010202180113, "blimp/accuracy/passive_2": 0.916, "blimp/accuracy/determiner_noun_agreement_2": 0.99, "blimp/accuracy/ellipsis_n_bar_1": 0.836, "blimp/accuracy/tough_vs_raising_2": 0.897, "blimp/accuracy/tough_vs_raising_1": 0.628, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.913, "blimp/accuracy/principle_A_reconstruction": 0.46, "blimp/accuracy/wh_vs_that_with_gap": 0.44, "blimp/accuracy/principle_A_domain_2": 0.884, "blimp/accuracy/determiner_noun_agreement_1": 0.997, "blimp/accuracy/ellipsis_n_bar_2": 0.886, "blimp/accuracy/principle_A_domain_3": 0.648, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.924, "blimp/accuracy/animate_subject_trans": 0.904, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.91, "blimp/accuracy/distractor_agreement_relative_clause": 0.675, "blimp/accuracy/transitive": 0.882, "blimp/accuracy/sentential_subject_island": 0.34, "blimp/accuracy/adjunct_island": 0.822, "blimp/accuracy/intransitive": 0.774, "blimp/accuracy/existential_there_subject_raising": 0.883, "blimp/accuracy/irregular_past_participle_adjectives": 0.94, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.736, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.346, "blimp/accuracy/only_npi_scope": 0.751, "blimp/accuracy/superlative_quantifiers_2": 0.746, "blimp/accuracy/passive_1": 0.905, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.934, "blimp/accuracy/inchoative": 0.645, "blimp/accuracy/anaphor_gender_agreement": 0.984, "blimp/accuracy/principle_A_c_command": 0.723, "blimp/accuracy/only_npi_licensor_present": 0.636, "blimp/accuracy/expletive_it_object_raising": 0.786, "blimp/accuracy/left_branch_island_simple_question": 0.787, "blimp/accuracy/wh_questions_subject_gap": 0.949, "blimp/accuracy/existential_there_quantifiers_2": 0.457, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.94, "blimp/accuracy/sentential_negation_npi_scope": 0.763, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.816, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.897, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.901, "blimp/accuracy/principle_A_case_2": 0.964, "blimp/accuracy/distractor_agreement_relational_noun": 0.844, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.986, "blimp/accuracy/superlative_quantifiers_1": 0.77, "blimp/accuracy/wh_island": 0.746, "blimp/accuracy/principle_A_domain_1": 0.994, "blimp/accuracy/complex_NP_island": 0.555, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.98, "blimp/accuracy/irregular_past_participle_verbs": 0.935, "blimp/accuracy/drop_argument": 0.74, "blimp/accuracy/wh_questions_object_gap": 0.859, "blimp/accuracy/animate_subject_passive": 0.796, "blimp/accuracy/existential_there_quantifiers_1": 0.969, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.923, "blimp/accuracy/npi_present_2": 0.549, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.96, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.971, "blimp/accuracy/existential_there_object_raising": 0.889, "blimp/accuracy/matrix_question_npi_licensor_present": 0.383, "blimp/accuracy/npi_present_1": 0.538, "blimp/accuracy/wh_vs_that_no_gap": 0.986, "blimp/accuracy/left_branch_island_echo_question": 0.453, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.966, "blimp/accuracy/causative": 0.763, "blimp/accuracy/group_average": 0.8023432835820897, "blimp/accuracy/seq_average": 0.8023432835820895, "cbt/accuracy/NE": 0.8201121794871795, "cbt/accuracy/V": 0.9404, "cbt/accuracy/CN": 0.8864, "cbt/accuracy/P": 0.9272, "cbt/accuracy/group_average": 0.8935280448717949, "cbt/accuracy/seq_average": 0.8935574229691877, "hellaswag/accuracy/val": 0.3684524995020912, "hellaswag/accuracy/group_average": 0.3684524995020912, "hellaswag/accuracy/seq_average": 0.3684524995020912, "piqa/accuracy/val": 0.6430903155603918, "piqa/accuracy/group_average": 0.6430903155603918, "piqa/accuracy/seq_average": 0.6430903155603918, "ai2arc/accuracy/ARC-Easy": 0.386892177589852, "ai2arc/accuracy/ARC-Challenge": 0.24377682403433476, "ai2arc/accuracy/group_average": 0.31533450081209335, "ai2arc/accuracy/seq_average": 0.3396600566572238, "mmlu/accuracy/MMLU": 0.2698605648909546, "mmlu/accuracy/group_average": 0.2698605648909546, "mmlu/accuracy/seq_average": 0.2698605648909546, "openbookqa/accuracy/test": 0.284, "openbookqa/accuracy/group_average": 0.284, "openbookqa/accuracy/seq_average": 0.284, "race/accuracy/test/high": 0.2910234419668382, "race/accuracy/test/middle": 0.36559888579387184, "race/accuracy/group_average": 0.32831116388035503, "race/accuracy/seq_average": 0.3127280097284151, "siqa/accuracy/dev": 0.37563971340839303, "siqa/accuracy/group_average": 0.37563971340839303, "siqa/accuracy/seq_average": 0.37563971340839303, "winogrande/accuracy/dev": 0.5043409629044988, "winogrande/accuracy/group_average": 0.5043409629044988, "winogrande/accuracy/seq_average": 0.5043409629044988, "commonsenseqa/accuracy/dev_rand_split": 0.2784602784602785, "commonsenseqa/accuracy/group_average": 0.2784602784602785, "commonsenseqa/accuracy/seq_average": 0.2784602784602785}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-340000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2645125616164434, "val/accuracy": 0.526610359312996, "val/perplexity": 9.626431165276204, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.270631020113548, "lambada/accuracy/total": 0.390333850931677, "lambada/accuracy/openai_last_token": 0.8059006211180124, "lambada/perplexity": 6.177026235620702, "lambada/lm_loss": 2.8669808365618117, "lambada/lm_perplexity": 17.583849462271587, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4584721051223365, "mean_loss": 2.267571790864996, "blimp/accuracy/passive_2": 0.907, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.849, "blimp/accuracy/tough_vs_raising_2": 0.905, "blimp/accuracy/tough_vs_raising_1": 0.62, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.914, "blimp/accuracy/principle_A_reconstruction": 0.431, "blimp/accuracy/wh_vs_that_with_gap": 0.479, "blimp/accuracy/principle_A_domain_2": 0.873, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.888, "blimp/accuracy/principle_A_domain_3": 0.631, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.92, "blimp/accuracy/animate_subject_trans": 0.904, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.925, "blimp/accuracy/distractor_agreement_relative_clause": 0.672, "blimp/accuracy/transitive": 0.894, "blimp/accuracy/sentential_subject_island": 0.361, "blimp/accuracy/adjunct_island": 0.827, "blimp/accuracy/intransitive": 0.78, "blimp/accuracy/existential_there_subject_raising": 0.899, "blimp/accuracy/irregular_past_participle_adjectives": 0.891, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.754, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.387, "blimp/accuracy/only_npi_scope": 0.76, "blimp/accuracy/superlative_quantifiers_2": 0.834, "blimp/accuracy/passive_1": 0.898, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.93, "blimp/accuracy/inchoative": 0.64, "blimp/accuracy/anaphor_gender_agreement": 0.982, "blimp/accuracy/principle_A_c_command": 0.711, "blimp/accuracy/only_npi_licensor_present": 0.688, "blimp/accuracy/expletive_it_object_raising": 0.78, "blimp/accuracy/left_branch_island_simple_question": 0.807, "blimp/accuracy/wh_questions_subject_gap": 0.94, "blimp/accuracy/existential_there_quantifiers_2": 0.545, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.938, "blimp/accuracy/sentential_negation_npi_scope": 0.756, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.802, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.893, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.901, "blimp/accuracy/principle_A_case_2": 0.956, "blimp/accuracy/distractor_agreement_relational_noun": 0.848, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.984, "blimp/accuracy/superlative_quantifiers_1": 0.772, "blimp/accuracy/wh_island": 0.741, "blimp/accuracy/principle_A_domain_1": 0.993, "blimp/accuracy/complex_NP_island": 0.573, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.979, "blimp/accuracy/irregular_past_participle_verbs": 0.927, "blimp/accuracy/drop_argument": 0.743, "blimp/accuracy/wh_questions_object_gap": 0.846, "blimp/accuracy/animate_subject_passive": 0.799, "blimp/accuracy/existential_there_quantifiers_1": 0.968, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.921, "blimp/accuracy/npi_present_2": 0.587, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.956, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.969, "blimp/accuracy/existential_there_object_raising": 0.847, "blimp/accuracy/matrix_question_npi_licensor_present": 0.427, "blimp/accuracy/npi_present_1": 0.599, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.51, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.968, "blimp/accuracy/causative": 0.775, "blimp/accuracy/group_average": 0.8083582089552239, "blimp/accuracy/seq_average": 0.8083582089552239, "cbt/accuracy/NE": 0.827323717948718, "cbt/accuracy/V": 0.946, "cbt/accuracy/CN": 0.89, "cbt/accuracy/P": 0.9244, "cbt/accuracy/group_average": 0.8969309294871795, "cbt/accuracy/seq_average": 0.8969587835134054, "hellaswag/accuracy/val": 0.37353116908982276, "hellaswag/accuracy/group_average": 0.37353116908982276, "hellaswag/accuracy/seq_average": 0.37353116908982276, "piqa/accuracy/val": 0.6436343852013058, "piqa/accuracy/group_average": 0.6436343852013058, "piqa/accuracy/seq_average": 0.6436343852013058, "ai2arc/accuracy/ARC-Easy": 0.3864693446088795, "ai2arc/accuracy/ARC-Challenge": 0.24377682403433476, "ai2arc/accuracy/group_average": 0.3151230843216071, "ai2arc/accuracy/seq_average": 0.3393767705382436, "mmlu/accuracy/MMLU": 0.2685734715766893, "mmlu/accuracy/group_average": 0.2685734715766893, "mmlu/accuracy/seq_average": 0.2685734715766893, "openbookqa/accuracy/test": 0.274, "openbookqa/accuracy/group_average": 0.274, "openbookqa/accuracy/seq_average": 0.274, "race/accuracy/test/high": 0.2933104631217839, "race/accuracy/test/middle": 0.3767409470752089, "race/accuracy/group_average": 0.3350257050984964, "race/accuracy/seq_average": 0.31759221726793674, "siqa/accuracy/dev": 0.3741044012282497, "siqa/accuracy/group_average": 0.3741044012282497, "siqa/accuracy/seq_average": 0.3741044012282497, "winogrande/accuracy/dev": 0.5114443567482242, "winogrande/accuracy/group_average": 0.5114443567482242, "winogrande/accuracy/seq_average": 0.5114443567482242, "commonsenseqa/accuracy/dev_rand_split": 0.2858312858312858, "commonsenseqa/accuracy/group_average": 0.2858312858312858, "commonsenseqa/accuracy/seq_average": 0.2858312858312858}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-360000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.2589615110367065,
3
+ "val/accuracy": 0.5278514074900794,
4
+ "val/perplexity": 9.573142400128054,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.2915035271496507,
8
+ "lambada/accuracy/total": 0.3734472049689441,
9
+ "lambada/accuracy/openai_last_token": 0.8047360248447205,
10
+ "lambada/perplexity": 6.345234886699006,
11
+ "lambada/lm_loss": 2.854429580755265,
12
+ "lambada/lm_perplexity": 17.364529320400003,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.45064930622951177,
16
+ "mean_loss": 2.2752325190931786,
17
+ "blimp/accuracy/passive_2": 0.91,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.99,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.853,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.896,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.618,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.892,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.463,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.468,
25
+ "blimp/accuracy/principle_A_domain_2": 0.877,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.994,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.891,
28
+ "blimp/accuracy/principle_A_domain_3": 0.631,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.916,
30
+ "blimp/accuracy/animate_subject_trans": 0.901,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.925,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.67,
33
+ "blimp/accuracy/transitive": 0.888,
34
+ "blimp/accuracy/sentential_subject_island": 0.359,
35
+ "blimp/accuracy/adjunct_island": 0.83,
36
+ "blimp/accuracy/intransitive": 0.785,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.882,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.962,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.728,
40
+ "blimp/accuracy/principle_A_case_1": 0.999,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.392,
42
+ "blimp/accuracy/only_npi_scope": 0.814,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.807,
44
+ "blimp/accuracy/passive_1": 0.896,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.915,
46
+ "blimp/accuracy/inchoative": 0.647,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.977,
48
+ "blimp/accuracy/principle_A_c_command": 0.718,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.73,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.761,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.781,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.945,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.446,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.94,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.737,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.806,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.879,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.905,
59
+ "blimp/accuracy/principle_A_case_2": 0.96,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.853,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.971,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.783,
63
+ "blimp/accuracy/wh_island": 0.745,
64
+ "blimp/accuracy/principle_A_domain_1": 0.992,
65
+ "blimp/accuracy/complex_NP_island": 0.566,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.974,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.901,
68
+ "blimp/accuracy/drop_argument": 0.739,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.853,
70
+ "blimp/accuracy/animate_subject_passive": 0.789,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.973,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.901,
73
+ "blimp/accuracy/npi_present_2": 0.579,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.956,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.992,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.969,
77
+ "blimp/accuracy/existential_there_object_raising": 0.842,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.418,
79
+ "blimp/accuracy/npi_present_1": 0.569,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.983,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.498,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.966,
83
+ "blimp/accuracy/causative": 0.757,
84
+ "blimp/accuracy/group_average": 0.8052686567164179,
85
+ "blimp/accuracy/seq_average": 0.8052686567164179,
86
+ "cbt/accuracy/NE": 0.8189102564102564,
87
+ "cbt/accuracy/V": 0.9468,
88
+ "cbt/accuracy/CN": 0.8888,
89
+ "cbt/accuracy/P": 0.9256,
90
+ "cbt/accuracy/group_average": 0.8950275641025642,
91
+ "cbt/accuracy/seq_average": 0.8950580232092837,
92
+ "hellaswag/accuracy/val": 0.3743278231428002,
93
+ "hellaswag/accuracy/group_average": 0.3743278231428002,
94
+ "hellaswag/accuracy/seq_average": 0.3743278231428002,
95
+ "piqa/accuracy/val": 0.6430903155603918,
96
+ "piqa/accuracy/group_average": 0.6430903155603918,
97
+ "piqa/accuracy/seq_average": 0.6430903155603918,
98
+ "ai2arc/accuracy/ARC-Easy": 0.3856236786469345,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.24034334763948498,
100
+ "ai2arc/accuracy/group_average": 0.3129835131432097,
101
+ "ai2arc/accuracy/seq_average": 0.3376770538243626,
102
+ "mmlu/accuracy/MMLU": 0.26835895602431176,
103
+ "mmlu/accuracy/group_average": 0.26835895602431176,
104
+ "mmlu/accuracy/seq_average": 0.26835895602431176,
105
+ "openbookqa/accuracy/test": 0.294,
106
+ "openbookqa/accuracy/group_average": 0.294,
107
+ "openbookqa/accuracy/seq_average": 0.294,
108
+ "race/accuracy/test/high": 0.2938822184105203,
109
+ "race/accuracy/test/middle": 0.3788300835654596,
110
+ "race/accuracy/group_average": 0.33635615098799,
111
+ "race/accuracy/seq_average": 0.31860559383867043,
112
+ "siqa/accuracy/dev": 0.37615148413510746,
113
+ "siqa/accuracy/group_average": 0.37615148413510746,
114
+ "siqa/accuracy/seq_average": 0.37615148413510746,
115
+ "winogrande/accuracy/dev": 0.5082872928176796,
116
+ "winogrande/accuracy/group_average": 0.5082872928176796,
117
+ "winogrande/accuracy/seq_average": 0.5082872928176796,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.2809172809172809,
119
+ "commonsenseqa/accuracy/group_average": 0.2809172809172809,
120
+ "commonsenseqa/accuracy/seq_average": 0.2809172809172809
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-380000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2546873546781994, "val/accuracy": 0.5287204318576388, "val/perplexity": 9.532312611276403, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.340093387580066, "lambada/accuracy/total": 0.39169254658385094, "lambada/accuracy/openai_last_token": 0.8045419254658385, "lambada/perplexity": 6.421798591679047, "lambada/lm_loss": 2.854525177941722, "lambada/lm_perplexity": 17.366189399895386, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4602064892207449, "mean_loss": 2.297390371129133, "blimp/accuracy/passive_2": 0.916, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.843, "blimp/accuracy/tough_vs_raising_2": 0.898, "blimp/accuracy/tough_vs_raising_1": 0.625, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.911, "blimp/accuracy/principle_A_reconstruction": 0.435, "blimp/accuracy/wh_vs_that_with_gap": 0.443, "blimp/accuracy/principle_A_domain_2": 0.867, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.892, "blimp/accuracy/principle_A_domain_3": 0.658, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.911, "blimp/accuracy/animate_subject_trans": 0.902, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.911, "blimp/accuracy/distractor_agreement_relative_clause": 0.667, "blimp/accuracy/transitive": 0.894, "blimp/accuracy/sentential_subject_island": 0.375, "blimp/accuracy/adjunct_island": 0.848, "blimp/accuracy/intransitive": 0.781, "blimp/accuracy/existential_there_subject_raising": 0.881, "blimp/accuracy/irregular_past_participle_adjectives": 0.942, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.758, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.364, "blimp/accuracy/only_npi_scope": 0.785, "blimp/accuracy/superlative_quantifiers_2": 0.803, "blimp/accuracy/passive_1": 0.903, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.928, "blimp/accuracy/inchoative": 0.632, "blimp/accuracy/anaphor_gender_agreement": 0.979, "blimp/accuracy/principle_A_c_command": 0.71, "blimp/accuracy/only_npi_licensor_present": 0.768, "blimp/accuracy/expletive_it_object_raising": 0.785, "blimp/accuracy/left_branch_island_simple_question": 0.838, "blimp/accuracy/wh_questions_subject_gap": 0.948, "blimp/accuracy/existential_there_quantifiers_2": 0.485, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.935, "blimp/accuracy/sentential_negation_npi_scope": 0.756, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.798, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.9, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.892, "blimp/accuracy/principle_A_case_2": 0.964, "blimp/accuracy/distractor_agreement_relational_noun": 0.848, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.981, "blimp/accuracy/superlative_quantifiers_1": 0.777, "blimp/accuracy/wh_island": 0.785, "blimp/accuracy/principle_A_domain_1": 0.992, "blimp/accuracy/complex_NP_island": 0.553, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.976, "blimp/accuracy/irregular_past_participle_verbs": 0.935, "blimp/accuracy/drop_argument": 0.744, "blimp/accuracy/wh_questions_object_gap": 0.853, "blimp/accuracy/animate_subject_passive": 0.797, "blimp/accuracy/existential_there_quantifiers_1": 0.967, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.915, "blimp/accuracy/npi_present_2": 0.568, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.948, "blimp/accuracy/anaphor_number_agreement": 0.99, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.969, "blimp/accuracy/existential_there_object_raising": 0.855, "blimp/accuracy/matrix_question_npi_licensor_present": 0.397, "blimp/accuracy/npi_present_1": 0.548, "blimp/accuracy/wh_vs_that_no_gap": 0.981, "blimp/accuracy/left_branch_island_echo_question": 0.499, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.976, "blimp/accuracy/causative": 0.759, "blimp/accuracy/group_average": 0.8078507462686566, "blimp/accuracy/seq_average": 0.8078507462686567, "cbt/accuracy/NE": 0.8241185897435898, "cbt/accuracy/V": 0.9412, "cbt/accuracy/CN": 0.8912, "cbt/accuracy/P": 0.9264, "cbt/accuracy/group_average": 0.8957296474358974, "cbt/accuracy/seq_average": 0.8957583033213286, "hellaswag/accuracy/val": 0.374726150169289, "hellaswag/accuracy/group_average": 0.374726150169289, "hellaswag/accuracy/seq_average": 0.374726150169289, "piqa/accuracy/val": 0.6528835690968444, "piqa/accuracy/group_average": 0.6528835690968444, "piqa/accuracy/seq_average": 0.6528835690968444, "ai2arc/accuracy/ARC-Easy": 0.3919661733615222, "ai2arc/accuracy/ARC-Challenge": 0.23862660944206007, "ai2arc/accuracy/group_average": 0.3152963914017911, "ai2arc/accuracy/seq_average": 0.3413597733711048, "mmlu/accuracy/MMLU": 0.2690025026814444, "mmlu/accuracy/group_average": 0.2690025026814444, "mmlu/accuracy/seq_average": 0.2690025026814444, "openbookqa/accuracy/test": 0.288, "openbookqa/accuracy/group_average": 0.288, "openbookqa/accuracy/seq_average": 0.288, "race/accuracy/test/high": 0.2970268724985706, "race/accuracy/test/middle": 0.37186629526462395, "race/accuracy/group_average": 0.3344465838815973, "race/accuracy/seq_average": 0.3188082691528172, "siqa/accuracy/dev": 0.3741044012282497, "siqa/accuracy/group_average": 0.3741044012282497, "siqa/accuracy/seq_average": 0.3741044012282497, "winogrande/accuracy/dev": 0.5209155485398579, "winogrande/accuracy/group_average": 0.5209155485398579, "winogrande/accuracy/seq_average": 0.5209155485398579, "commonsenseqa/accuracy/dev_rand_split": 0.2809172809172809, "commonsenseqa/accuracy/group_average": 0.2809172809172809, "commonsenseqa/accuracy/seq_average": 0.2809172809172809}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-40000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.586515154157366, "val/accuracy": 0.4788392082093254, "val/perplexity": 13.283400249479106, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.447929050611413, "lambada/accuracy/total": 0.2783385093167702, "lambada/accuracy/openai_last_token": 0.7630046583850931, "lambada/perplexity": 9.981677522981323, "lambada/lm_loss": 3.147475815221377, "lambada/lm_perplexity": 23.27723432239257, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3785888587630478, "mean_loss": 2.5172221023843893, "blimp/accuracy/passive_2": 0.886, "blimp/accuracy/determiner_noun_agreement_2": 0.982, "blimp/accuracy/ellipsis_n_bar_1": 0.809, "blimp/accuracy/tough_vs_raising_2": 0.871, "blimp/accuracy/tough_vs_raising_1": 0.646, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.894, "blimp/accuracy/principle_A_reconstruction": 0.338, "blimp/accuracy/wh_vs_that_with_gap": 0.487, "blimp/accuracy/principle_A_domain_2": 0.838, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.895, "blimp/accuracy/principle_A_domain_3": 0.643, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.909, "blimp/accuracy/animate_subject_trans": 0.911, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.914, "blimp/accuracy/distractor_agreement_relative_clause": 0.62, "blimp/accuracy/transitive": 0.864, "blimp/accuracy/sentential_subject_island": 0.33, "blimp/accuracy/adjunct_island": 0.75, "blimp/accuracy/intransitive": 0.793, "blimp/accuracy/existential_there_subject_raising": 0.848, "blimp/accuracy/irregular_past_participle_adjectives": 0.845, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.418, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.275, "blimp/accuracy/only_npi_scope": 0.725, "blimp/accuracy/superlative_quantifiers_2": 0.635, "blimp/accuracy/passive_1": 0.884, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.926, "blimp/accuracy/inchoative": 0.649, "blimp/accuracy/anaphor_gender_agreement": 0.963, "blimp/accuracy/principle_A_c_command": 0.628, "blimp/accuracy/only_npi_licensor_present": 0.791, "blimp/accuracy/expletive_it_object_raising": 0.772, "blimp/accuracy/left_branch_island_simple_question": 0.506, "blimp/accuracy/wh_questions_subject_gap": 0.943, "blimp/accuracy/existential_there_quantifiers_2": 0.409, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.936, "blimp/accuracy/sentential_negation_npi_scope": 0.6, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.809, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.912, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.856, "blimp/accuracy/principle_A_case_2": 0.924, "blimp/accuracy/distractor_agreement_relational_noun": 0.859, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.994, "blimp/accuracy/superlative_quantifiers_1": 0.782, "blimp/accuracy/wh_island": 0.843, "blimp/accuracy/principle_A_domain_1": 0.967, "blimp/accuracy/complex_NP_island": 0.582, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.96, "blimp/accuracy/irregular_past_participle_verbs": 0.882, "blimp/accuracy/drop_argument": 0.778, "blimp/accuracy/wh_questions_object_gap": 0.847, "blimp/accuracy/animate_subject_passive": 0.791, "blimp/accuracy/existential_there_quantifiers_1": 0.975, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.907, "blimp/accuracy/npi_present_2": 0.504, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.911, "blimp/accuracy/anaphor_number_agreement": 0.986, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.96, "blimp/accuracy/existential_there_object_raising": 0.845, "blimp/accuracy/matrix_question_npi_licensor_present": 0.257, "blimp/accuracy/npi_present_1": 0.443, "blimp/accuracy/wh_vs_that_no_gap": 0.978, "blimp/accuracy/left_branch_island_echo_question": 0.424, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.977, "blimp/accuracy/causative": 0.702, "blimp/accuracy/group_average": 0.7728059701492535, "blimp/accuracy/seq_average": 0.7728059701492538, "cbt/accuracy/NE": 0.7632211538461539, "cbt/accuracy/V": 0.9168, "cbt/accuracy/CN": 0.832, "cbt/accuracy/P": 0.8844, "cbt/accuracy/group_average": 0.8491052884615384, "cbt/accuracy/seq_average": 0.849139655862345, "hellaswag/accuracy/val": 0.2996415056761601, "hellaswag/accuracy/group_average": 0.2996415056761601, "hellaswag/accuracy/seq_average": 0.2996415056761601, "piqa/accuracy/val": 0.5903155603917302, "piqa/accuracy/group_average": 0.5903155603917302, "piqa/accuracy/seq_average": 0.5903155603917302, "ai2arc/accuracy/ARC-Easy": 0.3420718816067653, "ai2arc/accuracy/ARC-Challenge": 0.21373390557939914, "ai2arc/accuracy/group_average": 0.27790289359308223, "ai2arc/accuracy/seq_average": 0.29971671388101984, "mmlu/accuracy/MMLU": 0.2621380050053629, "mmlu/accuracy/group_average": 0.2621380050053629, "mmlu/accuracy/seq_average": 0.2621380050053629, "openbookqa/accuracy/test": 0.276, "openbookqa/accuracy/group_average": 0.276, "openbookqa/accuracy/seq_average": 0.276, "race/accuracy/test/high": 0.2638650657518582, "race/accuracy/test/middle": 0.34052924791086353, "race/accuracy/group_average": 0.30219715683136084, "race/accuracy/seq_average": 0.28617754357519254, "siqa/accuracy/dev": 0.3556806550665302, "siqa/accuracy/group_average": 0.3556806550665302, "siqa/accuracy/seq_average": 0.3556806550665302, "winogrande/accuracy/dev": 0.5019731649565904, "winogrande/accuracy/group_average": 0.5019731649565904, "winogrande/accuracy/seq_average": 0.5019731649565904, "commonsenseqa/accuracy/dev_rand_split": 0.25552825552825553, "commonsenseqa/accuracy/group_average": 0.25552825552825553, "commonsenseqa/accuracy/seq_average": 0.25552825552825553}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-400000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.250890096028646,
3
+ "val/accuracy": 0.5290110754588294,
4
+ "val/perplexity": 9.496184591891375,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.312096447678086,
8
+ "lambada/accuracy/total": 0.36898291925465837,
9
+ "lambada/accuracy/openai_last_token": 0.8041537267080745,
10
+ "lambada/perplexity": 6.560592995464335,
11
+ "lambada/lm_loss": 2.850388046003985,
12
+ "lambada/lm_perplexity": 17.294491596991527,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.4489969973567439,
16
+ "mean_loss": 2.281493271853366,
17
+ "blimp/accuracy/passive_2": 0.914,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.991,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.824,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.9,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.632,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.908,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.437,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.451,
25
+ "blimp/accuracy/principle_A_domain_2": 0.87,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.994,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.901,
28
+ "blimp/accuracy/principle_A_domain_3": 0.653,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.925,
30
+ "blimp/accuracy/animate_subject_trans": 0.901,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.927,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.676,
33
+ "blimp/accuracy/transitive": 0.893,
34
+ "blimp/accuracy/sentential_subject_island": 0.35,
35
+ "blimp/accuracy/adjunct_island": 0.842,
36
+ "blimp/accuracy/intransitive": 0.789,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.888,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.952,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.727,
40
+ "blimp/accuracy/principle_A_case_1": 1.0,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.38,
42
+ "blimp/accuracy/only_npi_scope": 0.793,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.792,
44
+ "blimp/accuracy/passive_1": 0.903,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.923,
46
+ "blimp/accuracy/inchoative": 0.643,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.982,
48
+ "blimp/accuracy/principle_A_c_command": 0.701,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.663,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.785,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.796,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.936,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.5,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.95,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.747,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.807,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.885,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.895,
59
+ "blimp/accuracy/principle_A_case_2": 0.967,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.863,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.977,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.714,
63
+ "blimp/accuracy/wh_island": 0.752,
64
+ "blimp/accuracy/principle_A_domain_1": 0.99,
65
+ "blimp/accuracy/complex_NP_island": 0.571,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.981,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.918,
68
+ "blimp/accuracy/drop_argument": 0.752,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.859,
70
+ "blimp/accuracy/animate_subject_passive": 0.792,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.978,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.904,
73
+ "blimp/accuracy/npi_present_2": 0.583,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.963,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.99,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.971,
77
+ "blimp/accuracy/existential_there_object_raising": 0.859,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.432,
79
+ "blimp/accuracy/npi_present_1": 0.561,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.98,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.497,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.967,
83
+ "blimp/accuracy/causative": 0.774,
84
+ "blimp/accuracy/group_average": 0.8062835820895523,
85
+ "blimp/accuracy/seq_average": 0.8062835820895522,
86
+ "cbt/accuracy/NE": 0.8165064102564102,
87
+ "cbt/accuracy/V": 0.9448,
88
+ "cbt/accuracy/CN": 0.8924,
89
+ "cbt/accuracy/P": 0.9284,
90
+ "cbt/accuracy/group_average": 0.8955266025641025,
91
+ "cbt/accuracy/seq_average": 0.8955582232893158,
92
+ "hellaswag/accuracy/val": 0.37492531368253335,
93
+ "hellaswag/accuracy/group_average": 0.37492531368253335,
94
+ "hellaswag/accuracy/seq_average": 0.37492531368253335,
95
+ "piqa/accuracy/val": 0.6447225244831338,
96
+ "piqa/accuracy/group_average": 0.6447225244831338,
97
+ "piqa/accuracy/seq_average": 0.6447225244831338,
98
+ "ai2arc/accuracy/ARC-Easy": 0.39323467230443976,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.248068669527897,
100
+ "ai2arc/accuracy/group_average": 0.32065167091616836,
101
+ "ai2arc/accuracy/seq_average": 0.3453257790368272,
102
+ "mmlu/accuracy/MMLU": 0.26850196639256346,
103
+ "mmlu/accuracy/group_average": 0.26850196639256346,
104
+ "mmlu/accuracy/seq_average": 0.26850196639256346,
105
+ "openbookqa/accuracy/test": 0.294,
106
+ "openbookqa/accuracy/group_average": 0.294,
107
+ "openbookqa/accuracy/seq_average": 0.294,
108
+ "race/accuracy/test/high": 0.2933104631217839,
109
+ "race/accuracy/test/middle": 0.3683844011142061,
110
+ "race/accuracy/group_average": 0.330847432117995,
111
+ "race/accuracy/seq_average": 0.3151601134981759,
112
+ "siqa/accuracy/dev": 0.3710337768679631,
113
+ "siqa/accuracy/group_average": 0.3710337768679631,
114
+ "siqa/accuracy/seq_average": 0.3710337768679631,
115
+ "winogrande/accuracy/dev": 0.5090765588003157,
116
+ "winogrande/accuracy/group_average": 0.5090765588003157,
117
+ "winogrande/accuracy/seq_average": 0.5090765588003157,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.2833742833742834,
119
+ "commonsenseqa/accuracy/group_average": 0.2833742833742834,
120
+ "commonsenseqa/accuracy/seq_average": 0.2833742833742834
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-60000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.52168952094184, "val/accuracy": 0.48801967075892855, "val/perplexity": 12.449612786509071, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3707108586471275, "lambada/accuracy/total": 0.26649844720496896, "lambada/accuracy/openai_last_token": 0.7637810559006211, "lambada/perplexity": 10.182359073224125, "lambada/lm_loss": 3.0747436423942354, "lambada/lm_perplexity": 21.64433251096604, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3772590589819488, "mean_loss": 2.4462000686929835, "blimp/accuracy/passive_2": 0.906, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.836, "blimp/accuracy/tough_vs_raising_2": 0.871, "blimp/accuracy/tough_vs_raising_1": 0.593, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.906, "blimp/accuracy/principle_A_reconstruction": 0.434, "blimp/accuracy/wh_vs_that_with_gap": 0.494, "blimp/accuracy/principle_A_domain_2": 0.839, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.899, "blimp/accuracy/principle_A_domain_3": 0.592, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.936, "blimp/accuracy/animate_subject_trans": 0.895, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.923, "blimp/accuracy/distractor_agreement_relative_clause": 0.657, "blimp/accuracy/transitive": 0.856, "blimp/accuracy/sentential_subject_island": 0.378, "blimp/accuracy/adjunct_island": 0.788, "blimp/accuracy/intransitive": 0.732, "blimp/accuracy/existential_there_subject_raising": 0.85, "blimp/accuracy/irregular_past_participle_adjectives": 0.895, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.56, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.292, "blimp/accuracy/only_npi_scope": 0.647, "blimp/accuracy/superlative_quantifiers_2": 0.582, "blimp/accuracy/passive_1": 0.892, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.926, "blimp/accuracy/inchoative": 0.624, "blimp/accuracy/anaphor_gender_agreement": 0.978, "blimp/accuracy/principle_A_c_command": 0.691, "blimp/accuracy/only_npi_licensor_present": 0.796, "blimp/accuracy/expletive_it_object_raising": 0.783, "blimp/accuracy/left_branch_island_simple_question": 0.653, "blimp/accuracy/wh_questions_subject_gap": 0.941, "blimp/accuracy/existential_there_quantifiers_2": 0.541, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.955, "blimp/accuracy/sentential_negation_npi_scope": 0.607, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.805, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.896, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.886, "blimp/accuracy/principle_A_case_2": 0.913, "blimp/accuracy/distractor_agreement_relational_noun": 0.871, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.985, "blimp/accuracy/superlative_quantifiers_1": 0.611, "blimp/accuracy/wh_island": 0.931, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.564, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.966, "blimp/accuracy/irregular_past_participle_verbs": 0.842, "blimp/accuracy/drop_argument": 0.716, "blimp/accuracy/wh_questions_object_gap": 0.827, "blimp/accuracy/animate_subject_passive": 0.792, "blimp/accuracy/existential_there_quantifiers_1": 0.98, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.915, "blimp/accuracy/npi_present_2": 0.567, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.944, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.97, "blimp/accuracy/existential_there_object_raising": 0.814, "blimp/accuracy/matrix_question_npi_licensor_present": 0.279, "blimp/accuracy/npi_present_1": 0.484, "blimp/accuracy/wh_vs_that_no_gap": 0.982, "blimp/accuracy/left_branch_island_echo_question": 0.391, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.977, "blimp/accuracy/causative": 0.742, "blimp/accuracy/group_average": 0.7814029850746267, "blimp/accuracy/seq_average": 0.7814029850746269, "cbt/accuracy/NE": 0.7668269230769231, "cbt/accuracy/V": 0.9276, "cbt/accuracy/CN": 0.8432, "cbt/accuracy/P": 0.8988, "cbt/accuracy/group_average": 0.8591067307692307, "cbt/accuracy/seq_average": 0.8591436574629852, "hellaswag/accuracy/val": 0.31208922525393346, "hellaswag/accuracy/group_average": 0.31208922525393346, "hellaswag/accuracy/seq_average": 0.31208922525393346, "piqa/accuracy/val": 0.5984766050054406, "piqa/accuracy/group_average": 0.5984766050054406, "piqa/accuracy/seq_average": 0.5984766050054406, "ai2arc/accuracy/ARC-Easy": 0.3492600422832981, "ai2arc/accuracy/ARC-Challenge": 0.21630901287553647, "ai2arc/accuracy/group_average": 0.2827845275794173, "ai2arc/accuracy/seq_average": 0.3053824362606232, "mmlu/accuracy/MMLU": 0.2619234894529853, "mmlu/accuracy/group_average": 0.2619234894529853, "mmlu/accuracy/seq_average": 0.2619234894529853, "openbookqa/accuracy/test": 0.3, "openbookqa/accuracy/group_average": 0.3, "openbookqa/accuracy/seq_average": 0.3, "race/accuracy/test/high": 0.2830188679245283, "race/accuracy/test/middle": 0.35097493036211697, "race/accuracy/group_average": 0.31699689914332263, "race/accuracy/seq_average": 0.30279691933522496, "siqa/accuracy/dev": 0.37154554759467756, "siqa/accuracy/group_average": 0.37154554759467756, "siqa/accuracy/seq_average": 0.37154554759467756, "winogrande/accuracy/dev": 0.5122336227308603, "winogrande/accuracy/group_average": 0.5122336227308603, "winogrande/accuracy/seq_average": 0.5122336227308603, "commonsenseqa/accuracy/dev_rand_split": 0.25634725634725636, "commonsenseqa/accuracy/group_average": 0.25634725634725636, "commonsenseqa/accuracy/seq_average": 0.25634725634725636}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly_v2/export/result-model-80000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.472232879154266, "val/accuracy": 0.4951995365203373, "val/perplexity": 11.848874440599944, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4869018933787848, "lambada/accuracy/total": 0.265527950310559, "lambada/accuracy/openai_last_token": 0.7674689440993789, "lambada/perplexity": 9.900565978926997, "lambada/lm_loss": 3.042767824029937, "lambada/lm_perplexity": 20.963185419565264, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.38036374341544815, "mean_loss": 2.4795673862665253, "blimp/accuracy/passive_2": 0.907, "blimp/accuracy/determiner_noun_agreement_2": 0.99, "blimp/accuracy/ellipsis_n_bar_1": 0.839, "blimp/accuracy/tough_vs_raising_2": 0.871, "blimp/accuracy/tough_vs_raising_1": 0.654, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.892, "blimp/accuracy/principle_A_reconstruction": 0.451, "blimp/accuracy/wh_vs_that_with_gap": 0.509, "blimp/accuracy/principle_A_domain_2": 0.857, "blimp/accuracy/determiner_noun_agreement_1": 0.995, "blimp/accuracy/ellipsis_n_bar_2": 0.878, "blimp/accuracy/principle_A_domain_3": 0.63, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.932, "blimp/accuracy/animate_subject_trans": 0.901, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.913, "blimp/accuracy/distractor_agreement_relative_clause": 0.662, "blimp/accuracy/transitive": 0.866, "blimp/accuracy/sentential_subject_island": 0.396, "blimp/accuracy/adjunct_island": 0.751, "blimp/accuracy/intransitive": 0.766, "blimp/accuracy/existential_there_subject_raising": 0.855, "blimp/accuracy/irregular_past_participle_adjectives": 0.867, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.534, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.358, "blimp/accuracy/only_npi_scope": 0.763, "blimp/accuracy/superlative_quantifiers_2": 0.774, "blimp/accuracy/passive_1": 0.891, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.92, "blimp/accuracy/inchoative": 0.654, "blimp/accuracy/anaphor_gender_agreement": 0.979, "blimp/accuracy/principle_A_c_command": 0.688, "blimp/accuracy/only_npi_licensor_present": 0.73, "blimp/accuracy/expletive_it_object_raising": 0.778, "blimp/accuracy/left_branch_island_simple_question": 0.568, "blimp/accuracy/wh_questions_subject_gap": 0.931, "blimp/accuracy/existential_there_quantifiers_2": 0.451, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.95, "blimp/accuracy/sentential_negation_npi_scope": 0.717, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.83, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.871, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.898, "blimp/accuracy/principle_A_case_2": 0.944, "blimp/accuracy/distractor_agreement_relational_noun": 0.874, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987, "blimp/accuracy/superlative_quantifiers_1": 0.668, "blimp/accuracy/wh_island": 0.818, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.562, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.98, "blimp/accuracy/irregular_past_participle_verbs": 0.845, "blimp/accuracy/drop_argument": 0.755, "blimp/accuracy/wh_questions_object_gap": 0.82, "blimp/accuracy/animate_subject_passive": 0.801, "blimp/accuracy/existential_there_quantifiers_1": 0.987, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.916, "blimp/accuracy/npi_present_2": 0.575, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.951, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.956, "blimp/accuracy/existential_there_object_raising": 0.836, "blimp/accuracy/matrix_question_npi_licensor_present": 0.311, "blimp/accuracy/npi_present_1": 0.494, "blimp/accuracy/wh_vs_that_no_gap": 0.971, "blimp/accuracy/left_branch_island_echo_question": 0.36, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.952, "blimp/accuracy/causative": 0.735, "blimp/accuracy/group_average": 0.787179104477612, "blimp/accuracy/seq_average": 0.787179104477612, "cbt/accuracy/NE": 0.7868589743589743, "cbt/accuracy/V": 0.9276, "cbt/accuracy/CN": 0.8496, "cbt/accuracy/P": 0.9048, "cbt/accuracy/group_average": 0.8672147435897437, "cbt/accuracy/seq_average": 0.8672468987595038, "hellaswag/accuracy/val": 0.3234415455088628, "hellaswag/accuracy/group_average": 0.3234415455088628, "hellaswag/accuracy/seq_average": 0.3234415455088628, "piqa/accuracy/val": 0.6055495103373232, "piqa/accuracy/group_average": 0.6055495103373232, "piqa/accuracy/seq_average": 0.6055495103373232, "ai2arc/accuracy/ARC-Easy": 0.3513742071881607, "ai2arc/accuracy/ARC-Challenge": 0.22489270386266094, "ai2arc/accuracy/group_average": 0.2881334555254108, "ai2arc/accuracy/seq_average": 0.3096317280453258, "mmlu/accuracy/MMLU": 0.264926707186271, "mmlu/accuracy/group_average": 0.264926707186271, "mmlu/accuracy/seq_average": 0.264926707186271, "openbookqa/accuracy/test": 0.304, "openbookqa/accuracy/group_average": 0.304, "openbookqa/accuracy/seq_average": 0.304, "race/accuracy/test/high": 0.2773013150371641, "race/accuracy/test/middle": 0.3565459610027855, "race/accuracy/group_average": 0.3169236380199748, "race/accuracy/seq_average": 0.3003648155654641, "siqa/accuracy/dev": 0.3587512794268168, "siqa/accuracy/group_average": 0.3587512794268168, "siqa/accuracy/seq_average": 0.3587512794268168, "winogrande/accuracy/dev": 0.5035516969218626, "winogrande/accuracy/group_average": 0.5035516969218626, "winogrande/accuracy/seq_average": 0.5035516969218626, "commonsenseqa/accuracy/dev_rand_split": 0.2620802620802621, "commonsenseqa/accuracy/group_average": 0.2620802620802621, "commonsenseqa/accuracy/seq_average": 0.2620802620802621}