Upload folder using huggingface_hub

#277
Files changed (20) hide show
  1. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-100000.pth.json +1 -0
  2. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-120000.pth.json +1 -0
  3. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-140000.pth.json +1 -0
  4. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-160000.pth.json +1 -0
  5. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-180000.pth.json +1 -0
  6. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-20000.pth.json +1 -0
  7. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-200000.pth.json +1 -0
  8. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-220000.pth.json +1 -0
  9. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-240000.pth.json +1 -0
  10. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-260000.pth.json +1 -0
  11. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-280000.pth.json +1 -0
  12. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-300000.pth.json +1 -0
  13. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-320000.pth.json +1 -0
  14. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-340000.pth.json +1 -0
  15. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-360000.pth.json +1 -0
  16. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-380000.pth.json +1 -0
  17. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-40000.pth.json +121 -0
  18. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-400000.pth.json +121 -0
  19. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-60000.pth.json +1 -0
  20. Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-80000.pth.json +1 -0
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-100000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.4364231654575894, "val/accuracy": 0.500152103484623, "val/perplexity": 11.432076878515321, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.46998283433618, "lambada/accuracy/total": 0.31754658385093165, "lambada/accuracy/openai_last_token": 0.7829968944099379, "lambada/perplexity": 8.357460342678262, "lambada/lm_loss": 3.024396140073712, "lambada/lm_perplexity": 20.58157257104255, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.40884934366777737, "mean_loss": 2.4532029998968845, "blimp/accuracy/passive_2": 0.902, "blimp/accuracy/determiner_noun_agreement_2": 0.985, "blimp/accuracy/ellipsis_n_bar_1": 0.85, "blimp/accuracy/tough_vs_raising_2": 0.904, "blimp/accuracy/tough_vs_raising_1": 0.59, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.915, "blimp/accuracy/principle_A_reconstruction": 0.462, "blimp/accuracy/wh_vs_that_with_gap": 0.468, "blimp/accuracy/principle_A_domain_2": 0.875, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.908, "blimp/accuracy/principle_A_domain_3": 0.604, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.906, "blimp/accuracy/animate_subject_trans": 0.9, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.914, "blimp/accuracy/distractor_agreement_relative_clause": 0.698, "blimp/accuracy/transitive": 0.885, "blimp/accuracy/sentential_subject_island": 0.299, "blimp/accuracy/adjunct_island": 0.817, "blimp/accuracy/intransitive": 0.761, "blimp/accuracy/existential_there_subject_raising": 0.846, "blimp/accuracy/irregular_past_participle_adjectives": 0.984, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.624, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.364, "blimp/accuracy/only_npi_scope": 0.787, "blimp/accuracy/superlative_quantifiers_2": 0.818, "blimp/accuracy/passive_1": 0.897, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.921, "blimp/accuracy/inchoative": 0.624, "blimp/accuracy/anaphor_gender_agreement": 0.962, "blimp/accuracy/principle_A_c_command": 0.629, "blimp/accuracy/only_npi_licensor_present": 0.821, "blimp/accuracy/expletive_it_object_raising": 0.809, "blimp/accuracy/left_branch_island_simple_question": 0.723, "blimp/accuracy/wh_questions_subject_gap": 0.941, "blimp/accuracy/existential_there_quantifiers_2": 0.566, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.931, "blimp/accuracy/sentential_negation_npi_scope": 0.698, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.842, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.886, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.91, "blimp/accuracy/principle_A_case_2": 0.963, "blimp/accuracy/distractor_agreement_relational_noun": 0.847, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.975, "blimp/accuracy/superlative_quantifiers_1": 0.792, "blimp/accuracy/wh_island": 0.805, "blimp/accuracy/principle_A_domain_1": 0.974, "blimp/accuracy/complex_NP_island": 0.578, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.977, "blimp/accuracy/irregular_past_participle_verbs": 0.909, "blimp/accuracy/drop_argument": 0.749, "blimp/accuracy/wh_questions_object_gap": 0.848, "blimp/accuracy/animate_subject_passive": 0.78, "blimp/accuracy/existential_there_quantifiers_1": 0.992, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.901, "blimp/accuracy/npi_present_2": 0.62, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.951, "blimp/accuracy/anaphor_number_agreement": 0.99, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.953, "blimp/accuracy/existential_there_object_raising": 0.83, "blimp/accuracy/matrix_question_npi_licensor_present": 0.246, "blimp/accuracy/npi_present_1": 0.631, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.493, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.975, "blimp/accuracy/causative": 0.755, "blimp/accuracy/group_average": 0.8019402985074627, "blimp/accuracy/seq_average": 0.8019402985074627, "cbt/accuracy/NE": 0.7980769230769231, "cbt/accuracy/V": 0.926, "cbt/accuracy/CN": 0.862, "cbt/accuracy/P": 0.8992, "cbt/accuracy/group_average": 0.8713192307692308, "cbt/accuracy/seq_average": 0.8713485394157663, "hellaswag/accuracy/val": 0.3269269069906393, "hellaswag/accuracy/group_average": 0.3269269069906393, "hellaswag/accuracy/seq_average": 0.3269269069906393, "piqa/accuracy/val": 0.6186071817192601, "piqa/accuracy/group_average": 0.6186071817192601, "piqa/accuracy/seq_average": 0.6186071817192601, "ai2arc/accuracy/ARC-Easy": 0.3572938689217759, "ai2arc/accuracy/ARC-Challenge": 0.22489270386266094, "ai2arc/accuracy/group_average": 0.29109328639221843, "ai2arc/accuracy/seq_average": 0.31359773371104815, "mmlu/accuracy/MMLU": 0.26692885234179475, "mmlu/accuracy/group_average": 0.26692885234179475, "mmlu/accuracy/seq_average": 0.26692885234179475, "openbookqa/accuracy/test": 0.286, "openbookqa/accuracy/group_average": 0.286, "openbookqa/accuracy/seq_average": 0.286, "race/accuracy/test/high": 0.27530017152658665, "race/accuracy/test/middle": 0.3447075208913649, "race/accuracy/group_average": 0.3100038462089758, "race/accuracy/seq_average": 0.2955006080259424, "siqa/accuracy/dev": 0.3607983623336745, "siqa/accuracy/group_average": 0.3607983623336745, "siqa/accuracy/seq_average": 0.3607983623336745, "winogrande/accuracy/dev": 0.5146014206787688, "winogrande/accuracy/group_average": 0.5146014206787688, "winogrande/accuracy/seq_average": 0.5146014206787688, "commonsenseqa/accuracy/dev_rand_split": 0.2678132678132678, "commonsenseqa/accuracy/group_average": 0.2678132678132678, "commonsenseqa/accuracy/seq_average": 0.2678132678132678}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-120000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.412752666170635, "val/accuracy": 0.5040099128844246, "val/perplexity": 11.16465144394944, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4507620675223216, "lambada/accuracy/total": 0.3371506211180124, "lambada/accuracy/openai_last_token": 0.7905667701863354, "lambada/perplexity": 7.687300419540732, "lambada/lm_loss": 2.998299679631566, "lambada/lm_perplexity": 20.05141409373822, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.42058026700121853, "mean_loss": 2.4317573668464783, "blimp/accuracy/passive_2": 0.919, "blimp/accuracy/determiner_noun_agreement_2": 0.978, "blimp/accuracy/ellipsis_n_bar_1": 0.854, "blimp/accuracy/tough_vs_raising_2": 0.874, "blimp/accuracy/tough_vs_raising_1": 0.614, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.917, "blimp/accuracy/principle_A_reconstruction": 0.451, "blimp/accuracy/wh_vs_that_with_gap": 0.5, "blimp/accuracy/principle_A_domain_2": 0.855, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.883, "blimp/accuracy/principle_A_domain_3": 0.577, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.905, "blimp/accuracy/animate_subject_trans": 0.889, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.91, "blimp/accuracy/distractor_agreement_relative_clause": 0.666, "blimp/accuracy/transitive": 0.876, "blimp/accuracy/sentential_subject_island": 0.347, "blimp/accuracy/adjunct_island": 0.81, "blimp/accuracy/intransitive": 0.773, "blimp/accuracy/existential_there_subject_raising": 0.85, "blimp/accuracy/irregular_past_participle_adjectives": 0.984, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.673, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.39, "blimp/accuracy/only_npi_scope": 0.748, "blimp/accuracy/superlative_quantifiers_2": 0.751, "blimp/accuracy/passive_1": 0.906, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.938, "blimp/accuracy/inchoative": 0.61, "blimp/accuracy/anaphor_gender_agreement": 0.949, "blimp/accuracy/principle_A_c_command": 0.626, "blimp/accuracy/only_npi_licensor_present": 0.788, "blimp/accuracy/expletive_it_object_raising": 0.813, "blimp/accuracy/left_branch_island_simple_question": 0.719, "blimp/accuracy/wh_questions_subject_gap": 0.946, "blimp/accuracy/existential_there_quantifiers_2": 0.559, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.926, "blimp/accuracy/sentential_negation_npi_scope": 0.716, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.835, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.899, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.911, "blimp/accuracy/principle_A_case_2": 0.935, "blimp/accuracy/distractor_agreement_relational_noun": 0.872, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.981, "blimp/accuracy/superlative_quantifiers_1": 0.898, "blimp/accuracy/wh_island": 0.823, "blimp/accuracy/principle_A_domain_1": 0.993, "blimp/accuracy/complex_NP_island": 0.59, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.972, "blimp/accuracy/irregular_past_participle_verbs": 0.908, "blimp/accuracy/drop_argument": 0.728, "blimp/accuracy/wh_questions_object_gap": 0.816, "blimp/accuracy/animate_subject_passive": 0.802, "blimp/accuracy/existential_there_quantifiers_1": 0.985, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.908, "blimp/accuracy/npi_present_2": 0.635, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.928, "blimp/accuracy/anaphor_number_agreement": 0.986, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.964, "blimp/accuracy/existential_there_object_raising": 0.888, "blimp/accuracy/matrix_question_npi_licensor_present": 0.266, "blimp/accuracy/npi_present_1": 0.621, "blimp/accuracy/wh_vs_that_no_gap": 0.984, "blimp/accuracy/left_branch_island_echo_question": 0.446, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.96, "blimp/accuracy/causative": 0.745, "blimp/accuracy/group_average": 0.8023731343283584, "blimp/accuracy/seq_average": 0.8023731343283582, "cbt/accuracy/NE": 0.8040865384615384, "cbt/accuracy/V": 0.9224, "cbt/accuracy/CN": 0.864, "cbt/accuracy/P": 0.9024, "cbt/accuracy/group_average": 0.8732216346153846, "cbt/accuracy/seq_average": 0.873249299719888, "hellaswag/accuracy/val": 0.33519219279028084, "hellaswag/accuracy/group_average": 0.33519219279028084, "hellaswag/accuracy/seq_average": 0.33519219279028084, "piqa/accuracy/val": 0.6251360174102285, "piqa/accuracy/group_average": 0.6251360174102285, "piqa/accuracy/seq_average": 0.6251360174102285, "ai2arc/accuracy/ARC-Easy": 0.3699788583509514, "ai2arc/accuracy/ARC-Challenge": 0.2240343347639485, "ai2arc/accuracy/group_average": 0.29700659655744993, "ai2arc/accuracy/seq_average": 0.3218130311614731, "race/accuracy/test/high": 0.2878787878787879, "race/accuracy/test/middle": 0.3447075208913649, "race/accuracy/group_average": 0.31629315438507644, "race/accuracy/seq_average": 0.30441832184839884, "siqa/accuracy/dev": 0.3679631525076766, "siqa/accuracy/group_average": 0.3679631525076766, "siqa/accuracy/seq_average": 0.3679631525076766, "commonsenseqa/accuracy/dev_rand_split": 0.2710892710892711, "commonsenseqa/accuracy/group_average": 0.2710892710892711, "commonsenseqa/accuracy/seq_average": 0.2710892710892711}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-140000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.391782730344742, "val/accuracy": 0.5074956984747023, "val/perplexity": 10.932967112409012, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.469319408724767, "lambada/accuracy/total": 0.3338509316770186, "lambada/accuracy/openai_last_token": 0.7907608695652174, "lambada/perplexity": 7.49987847015564, "lambada/lm_loss": 2.9802582643112916, "lambada/lm_perplexity": 19.69290196181868, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4206733150758605, "mean_loss": 2.4305510695347543, "blimp/accuracy/passive_2": 0.91, "blimp/accuracy/determiner_noun_agreement_2": 0.977, "blimp/accuracy/ellipsis_n_bar_1": 0.863, "blimp/accuracy/tough_vs_raising_2": 0.9, "blimp/accuracy/tough_vs_raising_1": 0.602, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.937, "blimp/accuracy/principle_A_reconstruction": 0.534, "blimp/accuracy/wh_vs_that_with_gap": 0.504, "blimp/accuracy/principle_A_domain_2": 0.869, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.91, "blimp/accuracy/principle_A_domain_3": 0.595, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.919, "blimp/accuracy/animate_subject_trans": 0.914, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.883, "blimp/accuracy/distractor_agreement_relative_clause": 0.616, "blimp/accuracy/transitive": 0.887, "blimp/accuracy/sentential_subject_island": 0.35, "blimp/accuracy/adjunct_island": 0.801, "blimp/accuracy/intransitive": 0.762, "blimp/accuracy/existential_there_subject_raising": 0.861, "blimp/accuracy/irregular_past_participle_adjectives": 0.894, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.629, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.389, "blimp/accuracy/only_npi_scope": 0.749, "blimp/accuracy/superlative_quantifiers_2": 0.859, "blimp/accuracy/passive_1": 0.913, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.943, "blimp/accuracy/inchoative": 0.595, "blimp/accuracy/anaphor_gender_agreement": 0.952, "blimp/accuracy/principle_A_c_command": 0.566, "blimp/accuracy/only_npi_licensor_present": 0.586, "blimp/accuracy/expletive_it_object_raising": 0.819, "blimp/accuracy/left_branch_island_simple_question": 0.662, "blimp/accuracy/wh_questions_subject_gap": 0.95, "blimp/accuracy/existential_there_quantifiers_2": 0.651, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.939, "blimp/accuracy/sentential_negation_npi_scope": 0.665, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.814, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.92, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.905, "blimp/accuracy/principle_A_case_2": 0.937, "blimp/accuracy/distractor_agreement_relational_noun": 0.838, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.981, "blimp/accuracy/superlative_quantifiers_1": 0.868, "blimp/accuracy/wh_island": 0.743, "blimp/accuracy/principle_A_domain_1": 0.981, "blimp/accuracy/complex_NP_island": 0.584, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.961, "blimp/accuracy/irregular_past_participle_verbs": 0.915, "blimp/accuracy/drop_argument": 0.762, "blimp/accuracy/wh_questions_object_gap": 0.864, "blimp/accuracy/animate_subject_passive": 0.799, "blimp/accuracy/existential_there_quantifiers_1": 0.99, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.925, "blimp/accuracy/npi_present_2": 0.569, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.939, "blimp/accuracy/anaphor_number_agreement": 0.986, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.954, "blimp/accuracy/existential_there_object_raising": 0.844, "blimp/accuracy/matrix_question_npi_licensor_present": 0.298, "blimp/accuracy/npi_present_1": 0.559, "blimp/accuracy/wh_vs_that_no_gap": 0.988, "blimp/accuracy/left_branch_island_echo_question": 0.482, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.968, "blimp/accuracy/causative": 0.751, "blimp/accuracy/group_average": 0.798044776119403, "blimp/accuracy/seq_average": 0.7980447761194029, "cbt/accuracy/NE": 0.7980769230769231, "cbt/accuracy/V": 0.9336, "cbt/accuracy/CN": 0.8688, "cbt/accuracy/P": 0.9112, "cbt/accuracy/group_average": 0.8779192307692307, "cbt/accuracy/seq_average": 0.8779511804721889, "hellaswag/accuracy/val": 0.3396733718382792, "hellaswag/accuracy/group_average": 0.3396733718382792, "hellaswag/accuracy/seq_average": 0.3396733718382792, "piqa/accuracy/val": 0.6196953210010882, "piqa/accuracy/group_average": 0.6196953210010882, "piqa/accuracy/seq_average": 0.6196953210010882, "ai2arc/accuracy/ARC-Easy": 0.3674418604651163, "ai2arc/accuracy/ARC-Challenge": 0.22660944206008585, "ai2arc/accuracy/group_average": 0.29702565126260105, "ai2arc/accuracy/seq_average": 0.32096317280453257, "mmlu/accuracy/MMLU": 0.26056489095459423, "mmlu/accuracy/group_average": 0.26056489095459423, "mmlu/accuracy/seq_average": 0.26056489095459423, "openbookqa/accuracy/test": 0.306, "openbookqa/accuracy/group_average": 0.306, "openbookqa/accuracy/seq_average": 0.306, "race/accuracy/test/high": 0.2815894797026873, "race/accuracy/test/middle": 0.34540389972144847, "race/accuracy/group_average": 0.3134966897120679, "race/accuracy/seq_average": 0.3001621402513174, "siqa/accuracy/dev": 0.3812691914022518, "siqa/accuracy/group_average": 0.3812691914022518, "siqa/accuracy/seq_average": 0.3812691914022518, "winogrande/accuracy/dev": 0.5027624309392266, "winogrande/accuracy/group_average": 0.5027624309392266, "winogrande/accuracy/seq_average": 0.5027624309392266, "commonsenseqa/accuracy/dev_rand_split": 0.2710892710892711, "commonsenseqa/accuracy/group_average": 0.2710892710892711, "commonsenseqa/accuracy/seq_average": 0.2710892710892711}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-160000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3729340084015376, "val/accuracy": 0.5104108537946429, "val/perplexity": 10.728824611755357, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.85900556670953, "lambada/accuracy/total": 0.31638198757763975, "lambada/accuracy/openai_last_token": 0.7868788819875776, "lambada/perplexity": 7.2998206169449, "lambada/lm_loss": 2.9649637621185803, "lambada/lm_perplexity": 19.394000431249367, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.41339642068614135, "mean_loss": 2.615969787555534, "blimp/accuracy/passive_2": 0.92, "blimp/accuracy/determiner_noun_agreement_2": 0.983, "blimp/accuracy/ellipsis_n_bar_1": 0.871, "blimp/accuracy/tough_vs_raising_2": 0.891, "blimp/accuracy/tough_vs_raising_1": 0.61, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.927, "blimp/accuracy/principle_A_reconstruction": 0.427, "blimp/accuracy/wh_vs_that_with_gap": 0.508, "blimp/accuracy/principle_A_domain_2": 0.859, "blimp/accuracy/determiner_noun_agreement_1": 0.996, "blimp/accuracy/ellipsis_n_bar_2": 0.881, "blimp/accuracy/principle_A_domain_3": 0.579, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.909, "blimp/accuracy/animate_subject_trans": 0.904, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.897, "blimp/accuracy/distractor_agreement_relative_clause": 0.677, "blimp/accuracy/transitive": 0.879, "blimp/accuracy/sentential_subject_island": 0.285, "blimp/accuracy/adjunct_island": 0.856, "blimp/accuracy/intransitive": 0.765, "blimp/accuracy/existential_there_subject_raising": 0.864, "blimp/accuracy/irregular_past_participle_adjectives": 0.973, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.653, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.39, "blimp/accuracy/only_npi_scope": 0.755, "blimp/accuracy/superlative_quantifiers_2": 0.804, "blimp/accuracy/passive_1": 0.915, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.94, "blimp/accuracy/inchoative": 0.633, "blimp/accuracy/anaphor_gender_agreement": 0.962, "blimp/accuracy/principle_A_c_command": 0.584, "blimp/accuracy/only_npi_licensor_present": 0.667, "blimp/accuracy/expletive_it_object_raising": 0.814, "blimp/accuracy/left_branch_island_simple_question": 0.682, "blimp/accuracy/wh_questions_subject_gap": 0.937, "blimp/accuracy/existential_there_quantifiers_2": 0.552, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.933, "blimp/accuracy/sentential_negation_npi_scope": 0.707, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.827, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.882, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.928, "blimp/accuracy/principle_A_case_2": 0.95, "blimp/accuracy/distractor_agreement_relational_noun": 0.881, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.98, "blimp/accuracy/superlative_quantifiers_1": 0.804, "blimp/accuracy/wh_island": 0.809, "blimp/accuracy/principle_A_domain_1": 0.983, "blimp/accuracy/complex_NP_island": 0.579, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.967, "blimp/accuracy/irregular_past_participle_verbs": 0.903, "blimp/accuracy/drop_argument": 0.758, "blimp/accuracy/wh_questions_object_gap": 0.826, "blimp/accuracy/animate_subject_passive": 0.796, "blimp/accuracy/existential_there_quantifiers_1": 0.981, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.932, "blimp/accuracy/npi_present_2": 0.526, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.959, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.957, "blimp/accuracy/existential_there_object_raising": 0.913, "blimp/accuracy/matrix_question_npi_licensor_present": 0.288, "blimp/accuracy/npi_present_1": 0.547, "blimp/accuracy/wh_vs_that_no_gap": 0.979, "blimp/accuracy/left_branch_island_echo_question": 0.482, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.956, "blimp/accuracy/causative": 0.764, "blimp/accuracy/group_average": 0.7990298507462685, "blimp/accuracy/seq_average": 0.7990298507462686, "cbt/accuracy/NE": 0.8120993589743589, "cbt/accuracy/V": 0.932, "cbt/accuracy/CN": 0.8796, "cbt/accuracy/P": 0.9132, "cbt/accuracy/group_average": 0.8842248397435897, "cbt/accuracy/seq_average": 0.8842537014805922, "hellaswag/accuracy/val": 0.34773949412467636, "hellaswag/accuracy/group_average": 0.34773949412467636, "hellaswag/accuracy/seq_average": 0.34773949412467636, "piqa/accuracy/val": 0.6256800870511425, "piqa/accuracy/group_average": 0.6256800870511425, "piqa/accuracy/seq_average": 0.6256800870511425, "ai2arc/accuracy/ARC-Easy": 0.3572938689217759, "ai2arc/accuracy/ARC-Challenge": 0.23261802575107296, "ai2arc/accuracy/group_average": 0.29495594733642444, "ai2arc/accuracy/seq_average": 0.3161473087818697, "mmlu/accuracy/MMLU": 0.2643546657132642, "mmlu/accuracy/group_average": 0.2643546657132642, "mmlu/accuracy/seq_average": 0.2643546657132642, "openbookqa/accuracy/test": 0.274, "openbookqa/accuracy/group_average": 0.274, "openbookqa/accuracy/seq_average": 0.274, "race/accuracy/test/high": 0.28101772441395084, "race/accuracy/test/middle": 0.35097493036211697, "race/accuracy/group_average": 0.3159963273880339, "race/accuracy/seq_average": 0.3013781921361978, "siqa/accuracy/dev": 0.36898669396110545, "siqa/accuracy/group_average": 0.36898669396110545, "siqa/accuracy/seq_average": 0.36898669396110545, "winogrande/accuracy/dev": 0.4988161010260458, "winogrande/accuracy/group_average": 0.4988161010260458, "winogrande/accuracy/seq_average": 0.4988161010260458, "commonsenseqa/accuracy/dev_rand_split": 0.26535626535626533, "commonsenseqa/accuracy/group_average": 0.26535626535626533, "commonsenseqa/accuracy/seq_average": 0.26535626535626533}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-180000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3537970648871527, "val/accuracy": 0.5129229833209326, "val/perplexity": 10.525459798022535, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.545559142687306, "lambada/accuracy/total": 0.343944099378882, "lambada/accuracy/openai_last_token": 0.797166149068323, "lambada/perplexity": 7.222484004238777, "lambada/lm_loss": 2.9563118238790773, "lambada/lm_perplexity": 19.226928527391898, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4284335413499073, "mean_loss": 2.449678103787229, "blimp/accuracy/passive_2": 0.912, "blimp/accuracy/determiner_noun_agreement_2": 0.976, "blimp/accuracy/ellipsis_n_bar_1": 0.844, "blimp/accuracy/tough_vs_raising_2": 0.891, "blimp/accuracy/tough_vs_raising_1": 0.598, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.93, "blimp/accuracy/principle_A_reconstruction": 0.328, "blimp/accuracy/wh_vs_that_with_gap": 0.483, "blimp/accuracy/principle_A_domain_2": 0.878, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.905, "blimp/accuracy/principle_A_domain_3": 0.584, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.927, "blimp/accuracy/animate_subject_trans": 0.904, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.926, "blimp/accuracy/distractor_agreement_relative_clause": 0.668, "blimp/accuracy/transitive": 0.903, "blimp/accuracy/sentential_subject_island": 0.33, "blimp/accuracy/adjunct_island": 0.862, "blimp/accuracy/intransitive": 0.773, "blimp/accuracy/existential_there_subject_raising": 0.881, "blimp/accuracy/irregular_past_participle_adjectives": 0.975, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.684, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.344, "blimp/accuracy/only_npi_scope": 0.778, "blimp/accuracy/superlative_quantifiers_2": 0.841, "blimp/accuracy/passive_1": 0.904, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.947, "blimp/accuracy/inchoative": 0.619, "blimp/accuracy/anaphor_gender_agreement": 0.969, "blimp/accuracy/principle_A_c_command": 0.605, "blimp/accuracy/only_npi_licensor_present": 0.756, "blimp/accuracy/expletive_it_object_raising": 0.826, "blimp/accuracy/left_branch_island_simple_question": 0.787, "blimp/accuracy/wh_questions_subject_gap": 0.967, "blimp/accuracy/existential_there_quantifiers_2": 0.557, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.937, "blimp/accuracy/sentential_negation_npi_scope": 0.713, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.825, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.939, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.895, "blimp/accuracy/principle_A_case_2": 0.927, "blimp/accuracy/distractor_agreement_relational_noun": 0.855, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.981, "blimp/accuracy/superlative_quantifiers_1": 0.786, "blimp/accuracy/wh_island": 0.788, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.578, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.976, "blimp/accuracy/irregular_past_participle_verbs": 0.873, "blimp/accuracy/drop_argument": 0.74, "blimp/accuracy/wh_questions_object_gap": 0.874, "blimp/accuracy/animate_subject_passive": 0.82, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.891, "blimp/accuracy/npi_present_2": 0.533, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.942, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.963, "blimp/accuracy/existential_there_object_raising": 0.889, "blimp/accuracy/matrix_question_npi_licensor_present": 0.301, "blimp/accuracy/npi_present_1": 0.5, "blimp/accuracy/wh_vs_that_no_gap": 0.982, "blimp/accuracy/left_branch_island_echo_question": 0.498, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.759, "blimp/accuracy/group_average": 0.8023582089552239, "blimp/accuracy/seq_average": 0.8023582089552239, "cbt/accuracy/NE": 0.8145032051282052, "cbt/accuracy/V": 0.9344, "cbt/accuracy/CN": 0.8808, "cbt/accuracy/P": 0.92, "cbt/accuracy/group_average": 0.8874258012820513, "cbt/accuracy/seq_average": 0.8874549819927972, "hellaswag/accuracy/val": 0.35012945628360886, "hellaswag/accuracy/group_average": 0.35012945628360886, "hellaswag/accuracy/seq_average": 0.35012945628360886, "piqa/accuracy/val": 0.6463547334058759, "piqa/accuracy/group_average": 0.6463547334058759, "piqa/accuracy/seq_average": 0.6463547334058759, "ai2arc/accuracy/ARC-Easy": 0.37970401691331923, "ai2arc/accuracy/ARC-Challenge": 0.22746781115879827, "ai2arc/accuracy/group_average": 0.3035859140360587, "ai2arc/accuracy/seq_average": 0.32946175637393765, "mmlu/accuracy/MMLU": 0.25942080800858064, "mmlu/accuracy/group_average": 0.25942080800858064, "mmlu/accuracy/seq_average": 0.25942080800858064, "openbookqa/accuracy/test": 0.278, "openbookqa/accuracy/group_average": 0.278, "openbookqa/accuracy/seq_average": 0.278, "race/accuracy/test/high": 0.2864493996569468, "race/accuracy/test/middle": 0.35097493036211697, "race/accuracy/group_average": 0.3187121650095319, "race/accuracy/seq_average": 0.3052290231049858, "siqa/accuracy/dev": 0.37871033776867963, "siqa/accuracy/group_average": 0.37871033776867963, "siqa/accuracy/seq_average": 0.37871033776867963, "winogrande/accuracy/dev": 0.5146014206787688, "winogrande/accuracy/group_average": 0.5146014206787688, "winogrande/accuracy/seq_average": 0.5146014206787688, "commonsenseqa/accuracy/dev_rand_split": 0.276003276003276, "commonsenseqa/accuracy/group_average": 0.276003276003276, "commonsenseqa/accuracy/seq_average": 0.276003276003276}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-20000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.7155628507099454, "val/accuracy": 0.46135699559771826, "val/perplexity": 15.113114105423962, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6201310246627525, "lambada/accuracy/total": 0.22437888198757763, "lambada/accuracy/openai_last_token": 0.7484472049689441, "lambada/perplexity": 13.289147115695117, "lambada/lm_loss": 3.262524279066259, "lambada/lm_perplexity": 26.11537650731747, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.34286793879264793, "mean_loss": 2.667846937686349, "blimp/accuracy/passive_2": 0.899, "blimp/accuracy/determiner_noun_agreement_2": 0.982, "blimp/accuracy/ellipsis_n_bar_1": 0.826, "blimp/accuracy/tough_vs_raising_2": 0.879, "blimp/accuracy/tough_vs_raising_1": 0.566, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.858, "blimp/accuracy/principle_A_reconstruction": 0.332, "blimp/accuracy/wh_vs_that_with_gap": 0.43, "blimp/accuracy/principle_A_domain_2": 0.824, "blimp/accuracy/determiner_noun_agreement_1": 0.983, "blimp/accuracy/ellipsis_n_bar_2": 0.876, "blimp/accuracy/principle_A_domain_3": 0.551, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.917, "blimp/accuracy/animate_subject_trans": 0.88, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.861, "blimp/accuracy/distractor_agreement_relative_clause": 0.538, "blimp/accuracy/transitive": 0.852, "blimp/accuracy/sentential_subject_island": 0.333, "blimp/accuracy/adjunct_island": 0.819, "blimp/accuracy/intransitive": 0.791, "blimp/accuracy/existential_there_subject_raising": 0.853, "blimp/accuracy/irregular_past_participle_adjectives": 0.911, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.298, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.195, "blimp/accuracy/only_npi_scope": 0.591, "blimp/accuracy/superlative_quantifiers_2": 0.76, "blimp/accuracy/passive_1": 0.892, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.879, "blimp/accuracy/inchoative": 0.603, "blimp/accuracy/anaphor_gender_agreement": 0.957, "blimp/accuracy/principle_A_c_command": 0.503, "blimp/accuracy/only_npi_licensor_present": 0.492, "blimp/accuracy/expletive_it_object_raising": 0.792, "blimp/accuracy/left_branch_island_simple_question": 0.392, "blimp/accuracy/wh_questions_subject_gap": 0.938, "blimp/accuracy/existential_there_quantifiers_2": 0.4, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.932, "blimp/accuracy/sentential_negation_npi_scope": 0.583, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.787, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.927, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.853, "blimp/accuracy/principle_A_case_2": 0.957, "blimp/accuracy/distractor_agreement_relational_noun": 0.785, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.994, "blimp/accuracy/superlative_quantifiers_1": 0.651, "blimp/accuracy/wh_island": 0.865, "blimp/accuracy/principle_A_domain_1": 0.978, "blimp/accuracy/complex_NP_island": 0.471, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.965, "blimp/accuracy/irregular_past_participle_verbs": 0.817, "blimp/accuracy/drop_argument": 0.776, "blimp/accuracy/wh_questions_object_gap": 0.837, "blimp/accuracy/animate_subject_passive": 0.78, "blimp/accuracy/existential_there_quantifiers_1": 0.978, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.833, "blimp/accuracy/npi_present_2": 0.56, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.923, "blimp/accuracy/anaphor_number_agreement": 0.984, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.945, "blimp/accuracy/existential_there_object_raising": 0.805, "blimp/accuracy/matrix_question_npi_licensor_present": 0.169, "blimp/accuracy/npi_present_1": 0.606, "blimp/accuracy/wh_vs_that_no_gap": 0.982, "blimp/accuracy/left_branch_island_echo_question": 0.397, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.983, "blimp/accuracy/causative": 0.684, "blimp/accuracy/group_average": 0.7501492537313434, "blimp/accuracy/seq_average": 0.7501492537313433, "cbt/accuracy/NE": 0.7524038461538461, "cbt/accuracy/V": 0.9052, "cbt/accuracy/CN": 0.8044, "cbt/accuracy/P": 0.8752, "cbt/accuracy/group_average": 0.8343009615384614, "cbt/accuracy/seq_average": 0.8343337334933973, "hellaswag/accuracy/val": 0.2876916948814977, "hellaswag/accuracy/group_average": 0.2876916948814977, "hellaswag/accuracy/seq_average": 0.2876916948814977, "piqa/accuracy/val": 0.5788900979325353, "piqa/accuracy/group_average": 0.5788900979325353, "piqa/accuracy/seq_average": 0.5788900979325353, "ai2arc/accuracy/ARC-Easy": 0.31670190274841437, "ai2arc/accuracy/ARC-Challenge": 0.21974248927038625, "ai2arc/accuracy/group_average": 0.26822219600940034, "ai2arc/accuracy/seq_average": 0.2847025495750708, "mmlu/accuracy/MMLU": 0.2590632820879514, "mmlu/accuracy/group_average": 0.2590632820879514, "mmlu/accuracy/seq_average": 0.2590632820879514, "openbookqa/accuracy/test": 0.278, "openbookqa/accuracy/group_average": 0.278, "openbookqa/accuracy/seq_average": 0.278, "race/accuracy/test/high": 0.2652944539736993, "race/accuracy/test/middle": 0.3328690807799443, "race/accuracy/group_average": 0.2990817673768218, "race/accuracy/seq_average": 0.2849614916903121, "siqa/accuracy/dev": 0.3669396110542477, "siqa/accuracy/group_average": 0.3669396110542477, "siqa/accuracy/seq_average": 0.3669396110542477, "winogrande/accuracy/dev": 0.5169692186266772, "winogrande/accuracy/group_average": 0.5169692186266772, "winogrande/accuracy/seq_average": 0.5169692186266772, "commonsenseqa/accuracy/dev_rand_split": 0.2416052416052416, "commonsenseqa/accuracy/group_average": 0.2416052416052416, "commonsenseqa/accuracy/seq_average": 0.2416052416052416}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-200000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3394068157862105, "val/accuracy": 0.5149061414930556, "val/perplexity": 10.375080403132076, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.466587990707492, "lambada/accuracy/total": 0.35423136645962733, "lambada/accuracy/openai_last_token": 0.7919254658385093, "lambada/perplexity": 7.0657466028044436, "lambada/lm_loss": 2.945683773935964, "lambada/lm_perplexity": 19.023665827113973, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4345687539763414, "mean_loss": 2.402997403246851, "blimp/accuracy/passive_2": 0.918, "blimp/accuracy/determiner_noun_agreement_2": 0.974, "blimp/accuracy/ellipsis_n_bar_1": 0.845, "blimp/accuracy/tough_vs_raising_2": 0.893, "blimp/accuracy/tough_vs_raising_1": 0.571, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.949, "blimp/accuracy/principle_A_reconstruction": 0.466, "blimp/accuracy/wh_vs_that_with_gap": 0.545, "blimp/accuracy/principle_A_domain_2": 0.864, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.901, "blimp/accuracy/principle_A_domain_3": 0.592, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.92, "blimp/accuracy/animate_subject_trans": 0.907, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.929, "blimp/accuracy/distractor_agreement_relative_clause": 0.734, "blimp/accuracy/transitive": 0.882, "blimp/accuracy/sentential_subject_island": 0.348, "blimp/accuracy/adjunct_island": 0.849, "blimp/accuracy/intransitive": 0.764, "blimp/accuracy/existential_there_subject_raising": 0.88, "blimp/accuracy/irregular_past_participle_adjectives": 0.973, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.731, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.391, "blimp/accuracy/only_npi_scope": 0.683, "blimp/accuracy/superlative_quantifiers_2": 0.799, "blimp/accuracy/passive_1": 0.9, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.924, "blimp/accuracy/inchoative": 0.631, "blimp/accuracy/anaphor_gender_agreement": 0.973, "blimp/accuracy/principle_A_c_command": 0.601, "blimp/accuracy/only_npi_licensor_present": 0.922, "blimp/accuracy/expletive_it_object_raising": 0.813, "blimp/accuracy/left_branch_island_simple_question": 0.829, "blimp/accuracy/wh_questions_subject_gap": 0.94, "blimp/accuracy/existential_there_quantifiers_2": 0.624, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.932, "blimp/accuracy/sentential_negation_npi_scope": 0.739, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.848, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.933, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.926, "blimp/accuracy/principle_A_case_2": 0.952, "blimp/accuracy/distractor_agreement_relational_noun": 0.866, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.985, "blimp/accuracy/superlative_quantifiers_1": 0.746, "blimp/accuracy/wh_island": 0.789, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.627, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.971, "blimp/accuracy/irregular_past_participle_verbs": 0.907, "blimp/accuracy/drop_argument": 0.76, "blimp/accuracy/wh_questions_object_gap": 0.872, "blimp/accuracy/animate_subject_passive": 0.799, "blimp/accuracy/existential_there_quantifiers_1": 0.987, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.903, "blimp/accuracy/npi_present_2": 0.564, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.955, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.969, "blimp/accuracy/existential_there_object_raising": 0.905, "blimp/accuracy/matrix_question_npi_licensor_present": 0.286, "blimp/accuracy/npi_present_1": 0.574, "blimp/accuracy/wh_vs_that_no_gap": 0.985, "blimp/accuracy/left_branch_island_echo_question": 0.531, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.971, "blimp/accuracy/causative": 0.773, "blimp/accuracy/group_average": 0.8132985074626864, "blimp/accuracy/seq_average": 0.8132985074626866, "cbt/accuracy/NE": 0.8189102564102564, "cbt/accuracy/V": 0.9372, "cbt/accuracy/CN": 0.8716, "cbt/accuracy/P": 0.9212, "cbt/accuracy/group_average": 0.8872275641025642, "cbt/accuracy/seq_average": 0.8872549019607843, "hellaswag/accuracy/val": 0.3527185819557857, "hellaswag/accuracy/group_average": 0.3527185819557857, "hellaswag/accuracy/seq_average": 0.3527185819557857, "piqa/accuracy/val": 0.6245919477693145, "piqa/accuracy/group_average": 0.6245919477693145, "piqa/accuracy/seq_average": 0.6245919477693145, "ai2arc/accuracy/ARC-Easy": 0.38224101479915434, "ai2arc/accuracy/ARC-Challenge": 0.2369098712446352, "ai2arc/accuracy/group_average": 0.30957544302189477, "ai2arc/accuracy/seq_average": 0.3342776203966006, "mmlu/accuracy/MMLU": 0.2635681086878799, "mmlu/accuracy/group_average": 0.2635681086878799, "mmlu/accuracy/seq_average": 0.2635681086878799, "openbookqa/accuracy/test": 0.288, "openbookqa/accuracy/group_average": 0.288, "openbookqa/accuracy/seq_average": 0.288, "race/accuracy/test/high": 0.28702115494568325, "race/accuracy/test/middle": 0.34331476323119775, "race/accuracy/group_average": 0.31516795908844053, "race/accuracy/seq_average": 0.3034049452776652, "siqa/accuracy/dev": 0.38024564994882293, "siqa/accuracy/group_average": 0.38024564994882293, "siqa/accuracy/seq_average": 0.38024564994882293, "winogrande/accuracy/dev": 0.5082872928176796, "winogrande/accuracy/group_average": 0.5082872928176796, "winogrande/accuracy/seq_average": 0.5082872928176796, "commonsenseqa/accuracy/dev_rand_split": 0.276003276003276, "commonsenseqa/accuracy/group_average": 0.276003276003276, "commonsenseqa/accuracy/seq_average": 0.276003276003276}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-220000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3243282257564486, "val/accuracy": 0.5172293526785714, "val/perplexity": 10.21981237254703, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4493465068177405, "lambada/accuracy/total": 0.34976708074534163, "lambada/accuracy/openai_last_token": 0.7975543478260869, "lambada/perplexity": 6.95789542825927, "lambada/lm_loss": 2.9241843595669472, "lambada/lm_perplexity": 18.619033418398594, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.43349821671195654, "mean_loss": 2.3868373662870948, "blimp/accuracy/passive_2": 0.916, "blimp/accuracy/determiner_noun_agreement_2": 0.985, "blimp/accuracy/ellipsis_n_bar_1": 0.856, "blimp/accuracy/tough_vs_raising_2": 0.889, "blimp/accuracy/tough_vs_raising_1": 0.622, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.936, "blimp/accuracy/principle_A_reconstruction": 0.458, "blimp/accuracy/wh_vs_that_with_gap": 0.532, "blimp/accuracy/principle_A_domain_2": 0.868, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.904, "blimp/accuracy/principle_A_domain_3": 0.591, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.911, "blimp/accuracy/animate_subject_trans": 0.919, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.915, "blimp/accuracy/distractor_agreement_relative_clause": 0.676, "blimp/accuracy/transitive": 0.893, "blimp/accuracy/sentential_subject_island": 0.375, "blimp/accuracy/adjunct_island": 0.832, "blimp/accuracy/intransitive": 0.782, "blimp/accuracy/existential_there_subject_raising": 0.872, "blimp/accuracy/irregular_past_participle_adjectives": 0.989, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.654, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.423, "blimp/accuracy/only_npi_scope": 0.773, "blimp/accuracy/superlative_quantifiers_2": 0.777, "blimp/accuracy/passive_1": 0.909, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.935, "blimp/accuracy/inchoative": 0.638, "blimp/accuracy/anaphor_gender_agreement": 0.966, "blimp/accuracy/principle_A_c_command": 0.575, "blimp/accuracy/only_npi_licensor_present": 0.667, "blimp/accuracy/expletive_it_object_raising": 0.804, "blimp/accuracy/left_branch_island_simple_question": 0.721, "blimp/accuracy/wh_questions_subject_gap": 0.934, "blimp/accuracy/existential_there_quantifiers_2": 0.6, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.934, "blimp/accuracy/sentential_negation_npi_scope": 0.703, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.855, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.914, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.909, "blimp/accuracy/principle_A_case_2": 0.941, "blimp/accuracy/distractor_agreement_relational_noun": 0.857, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.962, "blimp/accuracy/superlative_quantifiers_1": 0.844, "blimp/accuracy/wh_island": 0.82, "blimp/accuracy/principle_A_domain_1": 0.982, "blimp/accuracy/complex_NP_island": 0.588, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.964, "blimp/accuracy/irregular_past_participle_verbs": 0.879, "blimp/accuracy/drop_argument": 0.758, "blimp/accuracy/wh_questions_object_gap": 0.83, "blimp/accuracy/animate_subject_passive": 0.809, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.889, "blimp/accuracy/npi_present_2": 0.596, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.959, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.962, "blimp/accuracy/existential_there_object_raising": 0.905, "blimp/accuracy/matrix_question_npi_licensor_present": 0.334, "blimp/accuracy/npi_present_1": 0.585, "blimp/accuracy/wh_vs_that_no_gap": 0.988, "blimp/accuracy/left_branch_island_echo_question": 0.511, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.964, "blimp/accuracy/causative": 0.772, "blimp/accuracy/group_average": 0.8071194029850748, "blimp/accuracy/seq_average": 0.8071194029850747, "cbt/accuracy/NE": 0.8145032051282052, "cbt/accuracy/V": 0.9396, "cbt/accuracy/CN": 0.8792, "cbt/accuracy/P": 0.92, "cbt/accuracy/group_average": 0.8883258012820513, "cbt/accuracy/seq_average": 0.8883553421368547, "hellaswag/accuracy/val": 0.3551085441147182, "hellaswag/accuracy/group_average": 0.3551085441147182, "hellaswag/accuracy/seq_average": 0.3551085441147182, "piqa/accuracy/val": 0.6332970620239391, "piqa/accuracy/group_average": 0.6332970620239391, "piqa/accuracy/seq_average": 0.6332970620239391, "ai2arc/accuracy/ARC-Easy": 0.38054968287526425, "ai2arc/accuracy/ARC-Challenge": 0.23261802575107296, "ai2arc/accuracy/group_average": 0.3065838543131686, "ai2arc/accuracy/seq_average": 0.33172804532577904, "mmlu/accuracy/MMLU": 0.26414015016088666, "mmlu/accuracy/group_average": 0.26414015016088666, "mmlu/accuracy/seq_average": 0.26414015016088666, "openbookqa/accuracy/test": 0.272, "openbookqa/accuracy/group_average": 0.272, "openbookqa/accuracy/seq_average": 0.272, "race/accuracy/test/high": 0.2861635220125786, "race/accuracy/test/middle": 0.35863509749303624, "race/accuracy/group_average": 0.3223993097528074, "race/accuracy/seq_average": 0.30725577624645317, "siqa/accuracy/dev": 0.3669396110542477, "siqa/accuracy/group_average": 0.3669396110542477, "siqa/accuracy/seq_average": 0.3669396110542477, "winogrande/accuracy/dev": 0.5130228887134964, "winogrande/accuracy/group_average": 0.5130228887134964, "winogrande/accuracy/seq_average": 0.5130228887134964, "commonsenseqa/accuracy/dev_rand_split": 0.2800982800982801, "commonsenseqa/accuracy/group_average": 0.2800982800982801, "commonsenseqa/accuracy/seq_average": 0.2800982800982801}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-240000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.311644296797495, "val/accuracy": 0.5192221989707341, "val/perplexity": 10.091003625849357, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.440006232409744, "lambada/accuracy/total": 0.359860248447205, "lambada/accuracy/openai_last_token": 0.8000776397515528, "lambada/perplexity": 6.714775589226039, "lambada/lm_loss": 2.90779245405573, "lambada/lm_perplexity": 18.316319786613906, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.43954122370896953, "mean_loss": 2.3758252646036193, "blimp/accuracy/passive_2": 0.917, "blimp/accuracy/determiner_noun_agreement_2": 0.986, "blimp/accuracy/ellipsis_n_bar_1": 0.835, "blimp/accuracy/tough_vs_raising_2": 0.91, "blimp/accuracy/tough_vs_raising_1": 0.603, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.927, "blimp/accuracy/principle_A_reconstruction": 0.442, "blimp/accuracy/wh_vs_that_with_gap": 0.485, "blimp/accuracy/principle_A_domain_2": 0.864, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.905, "blimp/accuracy/principle_A_domain_3": 0.575, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.931, "blimp/accuracy/animate_subject_trans": 0.91, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.908, "blimp/accuracy/distractor_agreement_relative_clause": 0.726, "blimp/accuracy/transitive": 0.897, "blimp/accuracy/sentential_subject_island": 0.313, "blimp/accuracy/adjunct_island": 0.858, "blimp/accuracy/intransitive": 0.779, "blimp/accuracy/existential_there_subject_raising": 0.883, "blimp/accuracy/irregular_past_participle_adjectives": 0.825, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.654, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.407, "blimp/accuracy/only_npi_scope": 0.747, "blimp/accuracy/superlative_quantifiers_2": 0.731, "blimp/accuracy/passive_1": 0.908, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.936, "blimp/accuracy/inchoative": 0.631, "blimp/accuracy/anaphor_gender_agreement": 0.967, "blimp/accuracy/principle_A_c_command": 0.643, "blimp/accuracy/only_npi_licensor_present": 0.841, "blimp/accuracy/expletive_it_object_raising": 0.816, "blimp/accuracy/left_branch_island_simple_question": 0.706, "blimp/accuracy/wh_questions_subject_gap": 0.932, "blimp/accuracy/existential_there_quantifiers_2": 0.629, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.939, "blimp/accuracy/sentential_negation_npi_scope": 0.753, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.822, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.918, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.919, "blimp/accuracy/principle_A_case_2": 0.945, "blimp/accuracy/distractor_agreement_relational_noun": 0.856, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.978, "blimp/accuracy/superlative_quantifiers_1": 0.9, "blimp/accuracy/wh_island": 0.713, "blimp/accuracy/principle_A_domain_1": 0.986, "blimp/accuracy/complex_NP_island": 0.612, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.972, "blimp/accuracy/irregular_past_participle_verbs": 0.942, "blimp/accuracy/drop_argument": 0.754, "blimp/accuracy/wh_questions_object_gap": 0.842, "blimp/accuracy/animate_subject_passive": 0.805, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.9, "blimp/accuracy/npi_present_2": 0.617, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.971, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.957, "blimp/accuracy/existential_there_object_raising": 0.914, "blimp/accuracy/matrix_question_npi_licensor_present": 0.344, "blimp/accuracy/npi_present_1": 0.578, "blimp/accuracy/wh_vs_that_no_gap": 0.978, "blimp/accuracy/left_branch_island_echo_question": 0.508, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.961, "blimp/accuracy/causative": 0.769, "blimp/accuracy/group_average": 0.8081194029850745, "blimp/accuracy/seq_average": 0.8081194029850747, "cbt/accuracy/NE": 0.8197115384615384, "cbt/accuracy/V": 0.9408, "cbt/accuracy/CN": 0.8864, "cbt/accuracy/P": 0.924, "cbt/accuracy/group_average": 0.8927278846153845, "cbt/accuracy/seq_average": 0.8927571028411364, "hellaswag/accuracy/val": 0.3593905596494722, "hellaswag/accuracy/group_average": 0.3593905596494722, "hellaswag/accuracy/seq_average": 0.3593905596494722, "piqa/accuracy/val": 0.6305767138193689, "piqa/accuracy/group_average": 0.6305767138193689, "piqa/accuracy/seq_average": 0.6305767138193689, "ai2arc/accuracy/ARC-Easy": 0.3835095137420719, "ai2arc/accuracy/ARC-Challenge": 0.24377682403433476, "ai2arc/accuracy/group_average": 0.3136431688882033, "ai2arc/accuracy/seq_average": 0.33739376770538243, "mmlu/accuracy/MMLU": 0.2593493028244548, "mmlu/accuracy/group_average": 0.2593493028244548, "mmlu/accuracy/seq_average": 0.2593493028244548, "openbookqa/accuracy/test": 0.28, "openbookqa/accuracy/group_average": 0.28, "openbookqa/accuracy/seq_average": 0.28, "race/accuracy/test/high": 0.2887364208118925, "race/accuracy/test/middle": 0.3579387186629526, "race/accuracy/group_average": 0.32333756973742256, "race/accuracy/seq_average": 0.30887717875962706, "siqa/accuracy/dev": 0.3741044012282497, "siqa/accuracy/group_average": 0.3741044012282497, "siqa/accuracy/seq_average": 0.3741044012282497, "winogrande/accuracy/dev": 0.5209155485398579, "winogrande/accuracy/group_average": 0.5209155485398579, "winogrande/accuracy/seq_average": 0.5209155485398579, "commonsenseqa/accuracy/dev_rand_split": 0.27682227682227684, "commonsenseqa/accuracy/group_average": 0.27682227682227684, "commonsenseqa/accuracy/seq_average": 0.27682227682227684}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-260000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.300623575846354, "val/accuracy": 0.520569816468254, "val/perplexity": 9.980404053698463, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5182093270817156, "lambada/accuracy/total": 0.3548136645962733, "lambada/accuracy/openai_last_token": 0.7958074534161491, "lambada/perplexity": 6.762386451881823, "lambada/lm_loss": 2.889514533915263, "lambada/lm_perplexity": 17.984576579951003, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4376917405322636, "mean_loss": 2.409416451464035, "blimp/accuracy/passive_2": 0.915, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.824, "blimp/accuracy/tough_vs_raising_2": 0.89, "blimp/accuracy/tough_vs_raising_1": 0.608, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.942, "blimp/accuracy/principle_A_reconstruction": 0.447, "blimp/accuracy/wh_vs_that_with_gap": 0.494, "blimp/accuracy/principle_A_domain_2": 0.859, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.905, "blimp/accuracy/principle_A_domain_3": 0.601, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.921, "blimp/accuracy/animate_subject_trans": 0.908, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.921, "blimp/accuracy/distractor_agreement_relative_clause": 0.7, "blimp/accuracy/transitive": 0.884, "blimp/accuracy/sentential_subject_island": 0.344, "blimp/accuracy/adjunct_island": 0.859, "blimp/accuracy/intransitive": 0.782, "blimp/accuracy/existential_there_subject_raising": 0.892, "blimp/accuracy/irregular_past_participle_adjectives": 0.87, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.741, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.382, "blimp/accuracy/only_npi_scope": 0.755, "blimp/accuracy/superlative_quantifiers_2": 0.818, "blimp/accuracy/passive_1": 0.909, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.934, "blimp/accuracy/inchoative": 0.646, "blimp/accuracy/anaphor_gender_agreement": 0.974, "blimp/accuracy/principle_A_c_command": 0.608, "blimp/accuracy/only_npi_licensor_present": 0.714, "blimp/accuracy/expletive_it_object_raising": 0.829, "blimp/accuracy/left_branch_island_simple_question": 0.801, "blimp/accuracy/wh_questions_subject_gap": 0.936, "blimp/accuracy/existential_there_quantifiers_2": 0.594, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.951, "blimp/accuracy/sentential_negation_npi_scope": 0.771, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.841, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.909, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.911, "blimp/accuracy/principle_A_case_2": 0.924, "blimp/accuracy/distractor_agreement_relational_noun": 0.845, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.977, "blimp/accuracy/superlative_quantifiers_1": 0.889, "blimp/accuracy/wh_island": 0.778, "blimp/accuracy/principle_A_domain_1": 0.983, "blimp/accuracy/complex_NP_island": 0.629, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.973, "blimp/accuracy/irregular_past_participle_verbs": 0.915, "blimp/accuracy/drop_argument": 0.763, "blimp/accuracy/wh_questions_object_gap": 0.84, "blimp/accuracy/animate_subject_passive": 0.81, "blimp/accuracy/existential_there_quantifiers_1": 0.99, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.929, "blimp/accuracy/npi_present_2": 0.61, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.966, "blimp/accuracy/anaphor_number_agreement": 0.987, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.965, "blimp/accuracy/existential_there_object_raising": 0.901, "blimp/accuracy/matrix_question_npi_licensor_present": 0.349, "blimp/accuracy/npi_present_1": 0.608, "blimp/accuracy/wh_vs_that_no_gap": 0.986, "blimp/accuracy/left_branch_island_echo_question": 0.524, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.963, "blimp/accuracy/causative": 0.781, "blimp/accuracy/group_average": 0.8127164179104478, "blimp/accuracy/seq_average": 0.8127164179104478, "cbt/accuracy/NE": 0.8197115384615384, "cbt/accuracy/V": 0.9392, "cbt/accuracy/CN": 0.8828, "cbt/accuracy/P": 0.9232, "cbt/accuracy/group_average": 0.8912278846153846, "cbt/accuracy/seq_average": 0.8912565026010404, "hellaswag/accuracy/val": 0.36058554072893845, "hellaswag/accuracy/group_average": 0.36058554072893845, "hellaswag/accuracy/seq_average": 0.36058554072893845, "piqa/accuracy/val": 0.6332970620239391, "piqa/accuracy/group_average": 0.6332970620239391, "piqa/accuracy/seq_average": 0.6332970620239391, "ai2arc/accuracy/ARC-Easy": 0.3788583509513742, "ai2arc/accuracy/ARC-Challenge": 0.2334763948497854, "ai2arc/accuracy/group_average": 0.3061673729005798, "ai2arc/accuracy/seq_average": 0.33087818696883853, "mmlu/accuracy/MMLU": 0.26120843761172685, "mmlu/accuracy/group_average": 0.26120843761172685, "mmlu/accuracy/seq_average": 0.26120843761172685, "openbookqa/accuracy/test": 0.272, "openbookqa/accuracy/group_average": 0.272, "openbookqa/accuracy/seq_average": 0.272, "race/accuracy/test/high": 0.28416237850200116, "race/accuracy/test/middle": 0.3502785515320334, "race/accuracy/group_average": 0.3172204650170173, "race/accuracy/seq_average": 0.3034049452776652, "siqa/accuracy/dev": 0.3725690890481064, "siqa/accuracy/group_average": 0.3725690890481064, "siqa/accuracy/seq_average": 0.3725690890481064, "winogrande/accuracy/dev": 0.5130228887134964, "winogrande/accuracy/group_average": 0.5130228887134964, "winogrande/accuracy/seq_average": 0.5130228887134964, "commonsenseqa/accuracy/dev_rand_split": 0.2727272727272727, "commonsenseqa/accuracy/group_average": 0.2727272727272727, "commonsenseqa/accuracy/seq_average": 0.2727272727272727}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-280000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2870066809275795, "val/accuracy": 0.5226120721726191, "val/perplexity": 9.845423037283895, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5413704629270186, "lambada/accuracy/total": 0.3759704968944099, "lambada/accuracy/openai_last_token": 0.8039596273291926, "lambada/perplexity": 6.317988108823097, "lambada/lm_loss": 2.8818190913605286, "lambada/lm_perplexity": 17.846708462474464, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4492912845335145, "mean_loss": 2.414188571927299, "blimp/accuracy/passive_2": 0.911, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.817, "blimp/accuracy/tough_vs_raising_2": 0.903, "blimp/accuracy/tough_vs_raising_1": 0.598, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.911, "blimp/accuracy/principle_A_reconstruction": 0.46, "blimp/accuracy/wh_vs_that_with_gap": 0.47, "blimp/accuracy/principle_A_domain_2": 0.881, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.905, "blimp/accuracy/principle_A_domain_3": 0.599, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.907, "blimp/accuracy/animate_subject_trans": 0.905, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.903, "blimp/accuracy/distractor_agreement_relative_clause": 0.724, "blimp/accuracy/transitive": 0.892, "blimp/accuracy/sentential_subject_island": 0.318, "blimp/accuracy/adjunct_island": 0.85, "blimp/accuracy/intransitive": 0.785, "blimp/accuracy/existential_there_subject_raising": 0.876, "blimp/accuracy/irregular_past_participle_adjectives": 0.816, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.684, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.353, "blimp/accuracy/only_npi_scope": 0.733, "blimp/accuracy/superlative_quantifiers_2": 0.872, "blimp/accuracy/passive_1": 0.911, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.925, "blimp/accuracy/inchoative": 0.632, "blimp/accuracy/anaphor_gender_agreement": 0.976, "blimp/accuracy/principle_A_c_command": 0.626, "blimp/accuracy/only_npi_licensor_present": 0.837, "blimp/accuracy/expletive_it_object_raising": 0.81, "blimp/accuracy/left_branch_island_simple_question": 0.763, "blimp/accuracy/wh_questions_subject_gap": 0.93, "blimp/accuracy/existential_there_quantifiers_2": 0.616, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.945, "blimp/accuracy/sentential_negation_npi_scope": 0.716, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.839, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.932, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.921, "blimp/accuracy/principle_A_case_2": 0.936, "blimp/accuracy/distractor_agreement_relational_noun": 0.869, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.978, "blimp/accuracy/superlative_quantifiers_1": 0.887, "blimp/accuracy/wh_island": 0.761, "blimp/accuracy/principle_A_domain_1": 0.985, "blimp/accuracy/complex_NP_island": 0.604, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.97, "blimp/accuracy/irregular_past_participle_verbs": 0.904, "blimp/accuracy/drop_argument": 0.752, "blimp/accuracy/wh_questions_object_gap": 0.857, "blimp/accuracy/animate_subject_passive": 0.807, "blimp/accuracy/existential_there_quantifiers_1": 0.99, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.921, "blimp/accuracy/npi_present_2": 0.605, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.962, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.964, "blimp/accuracy/existential_there_object_raising": 0.896, "blimp/accuracy/matrix_question_npi_licensor_present": 0.346, "blimp/accuracy/npi_present_1": 0.637, "blimp/accuracy/wh_vs_that_no_gap": 0.982, "blimp/accuracy/left_branch_island_echo_question": 0.498, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.975, "blimp/accuracy/causative": 0.769, "blimp/accuracy/group_average": 0.8100149253731345, "blimp/accuracy/seq_average": 0.8100149253731344, "cbt/accuracy/NE": 0.8305288461538461, "cbt/accuracy/V": 0.9376, "cbt/accuracy/CN": 0.8824, "cbt/accuracy/P": 0.93, "cbt/accuracy/group_average": 0.8951322115384616, "cbt/accuracy/seq_average": 0.8951580632252901, "hellaswag/accuracy/val": 0.3652658832901812, "hellaswag/accuracy/group_average": 0.3652658832901812, "hellaswag/accuracy/seq_average": 0.3652658832901812, "piqa/accuracy/val": 0.6392818280739935, "piqa/accuracy/group_average": 0.6392818280739935, "piqa/accuracy/seq_average": 0.6392818280739935, "ai2arc/accuracy/ARC-Easy": 0.386046511627907, "ai2arc/accuracy/ARC-Challenge": 0.22832618025751072, "ai2arc/accuracy/group_average": 0.3071863459427089, "ai2arc/accuracy/seq_average": 0.3339943342776204, "mmlu/accuracy/MMLU": 0.25963532356095814, "mmlu/accuracy/group_average": 0.25963532356095814, "mmlu/accuracy/seq_average": 0.25963532356095814, "openbookqa/accuracy/test": 0.28, "openbookqa/accuracy/group_average": 0.28, "openbookqa/accuracy/seq_average": 0.28, "race/accuracy/test/high": 0.2884505431675243, "race/accuracy/test/middle": 0.3530640668523677, "race/accuracy/group_average": 0.320757305009946, "race/accuracy/seq_average": 0.30725577624645317, "siqa/accuracy/dev": 0.37154554759467756, "siqa/accuracy/group_average": 0.37154554759467756, "siqa/accuracy/seq_average": 0.37154554759467756, "winogrande/accuracy/dev": 0.5138121546961326, "winogrande/accuracy/group_average": 0.5138121546961326, "winogrande/accuracy/seq_average": 0.5138121546961326, "commonsenseqa/accuracy/dev_rand_split": 0.2809172809172809, "commonsenseqa/accuracy/group_average": 0.2809172809172809, "commonsenseqa/accuracy/seq_average": 0.2809172809172809}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-300000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2766200474330356, "val/accuracy": 0.524903312562004, "val/perplexity": 9.743691475404292, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.627922863693711, "lambada/accuracy/total": 0.3491847826086957, "lambada/accuracy/openai_last_token": 0.796389751552795, "lambada/perplexity": 6.683031221373312, "lambada/lm_loss": 2.880704140065331, "lambada/lm_perplexity": 17.826821340405633, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4370440475853498, "mean_loss": 2.4522714555633733, "blimp/accuracy/passive_2": 0.915, "blimp/accuracy/determiner_noun_agreement_2": 0.977, "blimp/accuracy/ellipsis_n_bar_1": 0.84, "blimp/accuracy/tough_vs_raising_2": 0.897, "blimp/accuracy/tough_vs_raising_1": 0.595, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.918, "blimp/accuracy/principle_A_reconstruction": 0.504, "blimp/accuracy/wh_vs_that_with_gap": 0.495, "blimp/accuracy/principle_A_domain_2": 0.857, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.9, "blimp/accuracy/principle_A_domain_3": 0.592, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.91, "blimp/accuracy/animate_subject_trans": 0.911, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.919, "blimp/accuracy/distractor_agreement_relative_clause": 0.706, "blimp/accuracy/transitive": 0.881, "blimp/accuracy/sentential_subject_island": 0.347, "blimp/accuracy/adjunct_island": 0.832, "blimp/accuracy/intransitive": 0.791, "blimp/accuracy/existential_there_subject_raising": 0.882, "blimp/accuracy/irregular_past_participle_adjectives": 0.964, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.741, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.412, "blimp/accuracy/only_npi_scope": 0.758, "blimp/accuracy/superlative_quantifiers_2": 0.854, "blimp/accuracy/passive_1": 0.909, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.935, "blimp/accuracy/inchoative": 0.638, "blimp/accuracy/anaphor_gender_agreement": 0.97, "blimp/accuracy/principle_A_c_command": 0.6, "blimp/accuracy/only_npi_licensor_present": 0.61, "blimp/accuracy/expletive_it_object_raising": 0.816, "blimp/accuracy/left_branch_island_simple_question": 0.808, "blimp/accuracy/wh_questions_subject_gap": 0.926, "blimp/accuracy/existential_there_quantifiers_2": 0.547, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.931, "blimp/accuracy/sentential_negation_npi_scope": 0.722, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.81, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.911, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.915, "blimp/accuracy/principle_A_case_2": 0.94, "blimp/accuracy/distractor_agreement_relational_noun": 0.834, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.971, "blimp/accuracy/superlative_quantifiers_1": 0.875, "blimp/accuracy/wh_island": 0.744, "blimp/accuracy/principle_A_domain_1": 0.986, "blimp/accuracy/complex_NP_island": 0.592, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.965, "blimp/accuracy/irregular_past_participle_verbs": 0.92, "blimp/accuracy/drop_argument": 0.76, "blimp/accuracy/wh_questions_object_gap": 0.841, "blimp/accuracy/animate_subject_passive": 0.798, "blimp/accuracy/existential_there_quantifiers_1": 0.993, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.914, "blimp/accuracy/npi_present_2": 0.561, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.968, "blimp/accuracy/anaphor_number_agreement": 0.986, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.969, "blimp/accuracy/existential_there_object_raising": 0.868, "blimp/accuracy/matrix_question_npi_licensor_present": 0.306, "blimp/accuracy/npi_present_1": 0.576, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.507, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.964, "blimp/accuracy/causative": 0.769, "blimp/accuracy/group_average": 0.8063731343283579, "blimp/accuracy/seq_average": 0.8063731343283582, "cbt/accuracy/NE": 0.8265224358974359, "cbt/accuracy/V": 0.9412, "cbt/accuracy/CN": 0.8776, "cbt/accuracy/P": 0.9252, "cbt/accuracy/group_average": 0.892630608974359, "cbt/accuracy/seq_average": 0.89265706282513, "hellaswag/accuracy/val": 0.3670583549093806, "hellaswag/accuracy/group_average": 0.3670583549093806, "hellaswag/accuracy/seq_average": 0.3670583549093806, "piqa/accuracy/val": 0.6332970620239391, "piqa/accuracy/group_average": 0.6332970620239391, "piqa/accuracy/seq_average": 0.6332970620239391, "ai2arc/accuracy/ARC-Easy": 0.39154334038054966, "ai2arc/accuracy/ARC-Challenge": 0.22918454935622318, "ai2arc/accuracy/group_average": 0.3103639448683864, "ai2arc/accuracy/seq_average": 0.3379603399433428, "mmlu/accuracy/MMLU": 0.2599928494815874, "mmlu/accuracy/group_average": 0.2599928494815874, "mmlu/accuracy/seq_average": 0.2599928494815874, "openbookqa/accuracy/test": 0.288, "openbookqa/accuracy/group_average": 0.288, "openbookqa/accuracy/seq_average": 0.288, "race/accuracy/test/high": 0.29245283018867924, "race/accuracy/test/middle": 0.35097493036211697, "race/accuracy/group_average": 0.3217138802753981, "race/accuracy/seq_average": 0.3094852047020673, "siqa/accuracy/dev": 0.36898669396110545, "siqa/accuracy/group_average": 0.36898669396110545, "siqa/accuracy/seq_average": 0.36898669396110545, "winogrande/accuracy/dev": 0.5082872928176796, "winogrande/accuracy/group_average": 0.5082872928176796, "winogrande/accuracy/seq_average": 0.5082872928176796, "commonsenseqa/accuracy/dev_rand_split": 0.28255528255528256, "commonsenseqa/accuracy/group_average": 0.28255528255528256, "commonsenseqa/accuracy/seq_average": 0.28255528255528256}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-320000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.268373035249256, "val/accuracy": 0.5255194769965278, "val/perplexity": 9.663665573951482, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5120783266813858, "lambada/accuracy/total": 0.37461180124223603, "lambada/accuracy/openai_last_token": 0.8045419254658385, "lambada/perplexity": 6.368059241843409, "lambada/lm_loss": 2.8760358544450333, "lambada/lm_perplexity": 17.74379459375303, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4500656391193819, "mean_loss": 2.3902256809653206, "blimp/accuracy/passive_2": 0.916, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.848, "blimp/accuracy/tough_vs_raising_2": 0.896, "blimp/accuracy/tough_vs_raising_1": 0.603, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.934, "blimp/accuracy/principle_A_reconstruction": 0.478, "blimp/accuracy/wh_vs_that_with_gap": 0.501, "blimp/accuracy/principle_A_domain_2": 0.861, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.912, "blimp/accuracy/principle_A_domain_3": 0.595, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.911, "blimp/accuracy/animate_subject_trans": 0.907, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.913, "blimp/accuracy/distractor_agreement_relative_clause": 0.669, "blimp/accuracy/transitive": 0.887, "blimp/accuracy/sentential_subject_island": 0.382, "blimp/accuracy/adjunct_island": 0.857, "blimp/accuracy/intransitive": 0.787, "blimp/accuracy/existential_there_subject_raising": 0.883, "blimp/accuracy/irregular_past_participle_adjectives": 0.906, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.725, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.406, "blimp/accuracy/only_npi_scope": 0.723, "blimp/accuracy/superlative_quantifiers_2": 0.844, "blimp/accuracy/passive_1": 0.906, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.937, "blimp/accuracy/inchoative": 0.641, "blimp/accuracy/anaphor_gender_agreement": 0.969, "blimp/accuracy/principle_A_c_command": 0.617, "blimp/accuracy/only_npi_licensor_present": 0.622, "blimp/accuracy/expletive_it_object_raising": 0.818, "blimp/accuracy/left_branch_island_simple_question": 0.801, "blimp/accuracy/wh_questions_subject_gap": 0.931, "blimp/accuracy/existential_there_quantifiers_2": 0.604, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.941, "blimp/accuracy/sentential_negation_npi_scope": 0.719, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.837, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.93, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.915, "blimp/accuracy/principle_A_case_2": 0.956, "blimp/accuracy/distractor_agreement_relational_noun": 0.834, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.986, "blimp/accuracy/superlative_quantifiers_1": 0.89, "blimp/accuracy/wh_island": 0.8, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.611, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.971, "blimp/accuracy/irregular_past_participle_verbs": 0.914, "blimp/accuracy/drop_argument": 0.761, "blimp/accuracy/wh_questions_object_gap": 0.864, "blimp/accuracy/animate_subject_passive": 0.792, "blimp/accuracy/existential_there_quantifiers_1": 0.989, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.927, "blimp/accuracy/npi_present_2": 0.575, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.969, "blimp/accuracy/anaphor_number_agreement": 0.985, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.973, "blimp/accuracy/existential_there_object_raising": 0.878, "blimp/accuracy/matrix_question_npi_licensor_present": 0.333, "blimp/accuracy/npi_present_1": 0.604, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.487, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.967, "blimp/accuracy/causative": 0.77, "blimp/accuracy/group_average": 0.8106417910447761, "blimp/accuracy/seq_average": 0.8106417910447761, "cbt/accuracy/NE": 0.828125, "cbt/accuracy/V": 0.9396, "cbt/accuracy/CN": 0.8856, "cbt/accuracy/P": 0.9292, "cbt/accuracy/group_average": 0.8956312500000001, "cbt/accuracy/seq_average": 0.8956582633053222, "hellaswag/accuracy/val": 0.3716391157140012, "hellaswag/accuracy/group_average": 0.3716391157140012, "hellaswag/accuracy/seq_average": 0.3716391157140012, "piqa/accuracy/val": 0.6420021762785637, "piqa/accuracy/group_average": 0.6420021762785637, "piqa/accuracy/seq_average": 0.6420021762785637, "ai2arc/accuracy/ARC-Easy": 0.3970401691331924, "ai2arc/accuracy/ARC-Challenge": 0.23261802575107296, "ai2arc/accuracy/group_average": 0.31482909744213267, "ai2arc/accuracy/seq_average": 0.34277620396600567, "mmlu/accuracy/MMLU": 0.2639256346085091, "mmlu/accuracy/group_average": 0.2639256346085091, "mmlu/accuracy/seq_average": 0.2639256346085091, "openbookqa/accuracy/test": 0.284, "openbookqa/accuracy/group_average": 0.284, "openbookqa/accuracy/seq_average": 0.284, "race/accuracy/test/high": 0.29016580903373357, "race/accuracy/test/middle": 0.3635097493036212, "race/accuracy/group_average": 0.3268377791686774, "race/accuracy/seq_average": 0.31151195784353464, "siqa/accuracy/dev": 0.3781985670419652, "siqa/accuracy/group_average": 0.3781985670419652, "siqa/accuracy/seq_average": 0.3781985670419652, "winogrande/accuracy/dev": 0.5138121546961326, "winogrande/accuracy/group_average": 0.5138121546961326, "winogrande/accuracy/seq_average": 0.5138121546961326, "commonsenseqa/accuracy/dev_rand_split": 0.29074529074529076, "commonsenseqa/accuracy/group_average": 0.29074529074529076, "commonsenseqa/accuracy/seq_average": 0.29074529074529076}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-340000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2620675223214284, "val/accuracy": 0.5260755750868056, "val/perplexity": 9.602922913817086, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5732020028629656, "lambada/accuracy/total": 0.39382763975155277, "lambada/accuracy/openai_last_token": 0.8080357142857143, "lambada/perplexity": 6.056061518998879, "lambada/lm_loss": 2.8638397487339264, "lambada/lm_perplexity": 17.528703700939108, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4599516074191792, "mean_loss": 2.417634762592197, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.98, "blimp/accuracy/ellipsis_n_bar_1": 0.851, "blimp/accuracy/tough_vs_raising_2": 0.907, "blimp/accuracy/tough_vs_raising_1": 0.606, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.925, "blimp/accuracy/principle_A_reconstruction": 0.47, "blimp/accuracy/wh_vs_that_with_gap": 0.49, "blimp/accuracy/principle_A_domain_2": 0.869, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.912, "blimp/accuracy/principle_A_domain_3": 0.588, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.917, "blimp/accuracy/animate_subject_trans": 0.917, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.931, "blimp/accuracy/distractor_agreement_relative_clause": 0.729, "blimp/accuracy/transitive": 0.893, "blimp/accuracy/sentential_subject_island": 0.356, "blimp/accuracy/adjunct_island": 0.86, "blimp/accuracy/intransitive": 0.779, "blimp/accuracy/existential_there_subject_raising": 0.882, "blimp/accuracy/irregular_past_participle_adjectives": 0.941, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.737, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.396, "blimp/accuracy/only_npi_scope": 0.732, "blimp/accuracy/superlative_quantifiers_2": 0.866, "blimp/accuracy/passive_1": 0.913, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.939, "blimp/accuracy/inchoative": 0.623, "blimp/accuracy/anaphor_gender_agreement": 0.973, "blimp/accuracy/principle_A_c_command": 0.608, "blimp/accuracy/only_npi_licensor_present": 0.659, "blimp/accuracy/expletive_it_object_raising": 0.819, "blimp/accuracy/left_branch_island_simple_question": 0.822, "blimp/accuracy/wh_questions_subject_gap": 0.935, "blimp/accuracy/existential_there_quantifiers_2": 0.671, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.948, "blimp/accuracy/sentential_negation_npi_scope": 0.749, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.844, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.919, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.919, "blimp/accuracy/principle_A_case_2": 0.946, "blimp/accuracy/distractor_agreement_relational_noun": 0.872, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.976, "blimp/accuracy/superlative_quantifiers_1": 0.877, "blimp/accuracy/wh_island": 0.754, "blimp/accuracy/principle_A_domain_1": 0.985, "blimp/accuracy/complex_NP_island": 0.601, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.968, "blimp/accuracy/irregular_past_participle_verbs": 0.918, "blimp/accuracy/drop_argument": 0.75, "blimp/accuracy/wh_questions_object_gap": 0.856, "blimp/accuracy/animate_subject_passive": 0.81, "blimp/accuracy/existential_there_quantifiers_1": 0.988, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.923, "blimp/accuracy/npi_present_2": 0.602, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.966, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.972, "blimp/accuracy/existential_there_object_raising": 0.875, "blimp/accuracy/matrix_question_npi_licensor_present": 0.354, "blimp/accuracy/npi_present_1": 0.597, "blimp/accuracy/wh_vs_that_no_gap": 0.984, "blimp/accuracy/left_branch_island_echo_question": 0.536, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.971, "blimp/accuracy/causative": 0.784, "blimp/accuracy/group_average": 0.815462686567164, "blimp/accuracy/seq_average": 0.8154626865671641, "cbt/accuracy/NE": 0.8229166666666666, "cbt/accuracy/V": 0.9372, "cbt/accuracy/CN": 0.886, "cbt/accuracy/P": 0.9244, "cbt/accuracy/group_average": 0.8926291666666667, "cbt/accuracy/seq_average": 0.89265706282513, "hellaswag/accuracy/val": 0.3679545907189803, "hellaswag/accuracy/group_average": 0.3679545907189803, "hellaswag/accuracy/seq_average": 0.3679545907189803, "piqa/accuracy/val": 0.6507072905331882, "piqa/accuracy/group_average": 0.6507072905331882, "piqa/accuracy/seq_average": 0.6507072905331882, "ai2arc/accuracy/ARC-Easy": 0.3894291754756871, "ai2arc/accuracy/ARC-Challenge": 0.22918454935622318, "ai2arc/accuracy/group_average": 0.3093068624159552, "ai2arc/accuracy/seq_average": 0.3365439093484419, "mmlu/accuracy/MMLU": 0.26385412942438324, "mmlu/accuracy/group_average": 0.26385412942438324, "mmlu/accuracy/seq_average": 0.26385412942438324, "openbookqa/accuracy/test": 0.274, "openbookqa/accuracy/group_average": 0.274, "openbookqa/accuracy/seq_average": 0.274, "race/accuracy/test/high": 0.29416809605488853, "race/accuracy/test/middle": 0.3565459610027855, "race/accuracy/group_average": 0.325357028528837, "race/accuracy/seq_average": 0.3123226591001216, "siqa/accuracy/dev": 0.37563971340839303, "siqa/accuracy/group_average": 0.37563971340839303, "siqa/accuracy/seq_average": 0.37563971340839303, "winogrande/accuracy/dev": 0.516179952644041, "winogrande/accuracy/group_average": 0.516179952644041, "winogrande/accuracy/seq_average": 0.516179952644041, "commonsenseqa/accuracy/dev_rand_split": 0.28992628992628994, "commonsenseqa/accuracy/group_average": 0.28992628992628994, "commonsenseqa/accuracy/seq_average": 0.28992628992628994}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-360000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2557566809275795, "val/accuracy": 0.5274871341765873, "val/perplexity": 9.542511215214581, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.492807708171584, "lambada/accuracy/total": 0.37985248447204967, "lambada/accuracy/openai_last_token": 0.8053183229813664, "lambada/perplexity": 6.278555740560683, "lambada/lm_loss": 2.851631857068325, "lambada/lm_perplexity": 17.31601606039989, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4536698093243185, "mean_loss": 2.374282194549582, "blimp/accuracy/passive_2": 0.918, "blimp/accuracy/determiner_noun_agreement_2": 0.982, "blimp/accuracy/ellipsis_n_bar_1": 0.844, "blimp/accuracy/tough_vs_raising_2": 0.902, "blimp/accuracy/tough_vs_raising_1": 0.6, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.916, "blimp/accuracy/principle_A_reconstruction": 0.492, "blimp/accuracy/wh_vs_that_with_gap": 0.505, "blimp/accuracy/principle_A_domain_2": 0.87, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.907, "blimp/accuracy/principle_A_domain_3": 0.588, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.91, "blimp/accuracy/animate_subject_trans": 0.908, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.927, "blimp/accuracy/distractor_agreement_relative_clause": 0.684, "blimp/accuracy/transitive": 0.892, "blimp/accuracy/sentential_subject_island": 0.354, "blimp/accuracy/adjunct_island": 0.846, "blimp/accuracy/intransitive": 0.787, "blimp/accuracy/existential_there_subject_raising": 0.88, "blimp/accuracy/irregular_past_participle_adjectives": 0.924, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.718, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.403, "blimp/accuracy/only_npi_scope": 0.685, "blimp/accuracy/superlative_quantifiers_2": 0.842, "blimp/accuracy/passive_1": 0.918, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.933, "blimp/accuracy/inchoative": 0.64, "blimp/accuracy/anaphor_gender_agreement": 0.967, "blimp/accuracy/principle_A_c_command": 0.637, "blimp/accuracy/only_npi_licensor_present": 0.653, "blimp/accuracy/expletive_it_object_raising": 0.8, "blimp/accuracy/left_branch_island_simple_question": 0.78, "blimp/accuracy/wh_questions_subject_gap": 0.937, "blimp/accuracy/existential_there_quantifiers_2": 0.644, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.949, "blimp/accuracy/sentential_negation_npi_scope": 0.741, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.841, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.925, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.914, "blimp/accuracy/principle_A_case_2": 0.946, "blimp/accuracy/distractor_agreement_relational_noun": 0.831, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.979, "blimp/accuracy/superlative_quantifiers_1": 0.864, "blimp/accuracy/wh_island": 0.77, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.615, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.969, "blimp/accuracy/irregular_past_participle_verbs": 0.919, "blimp/accuracy/drop_argument": 0.752, "blimp/accuracy/wh_questions_object_gap": 0.889, "blimp/accuracy/animate_subject_passive": 0.8, "blimp/accuracy/existential_there_quantifiers_1": 0.99, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.916, "blimp/accuracy/npi_present_2": 0.586, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.971, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.973, "blimp/accuracy/existential_there_object_raising": 0.842, "blimp/accuracy/matrix_question_npi_licensor_present": 0.342, "blimp/accuracy/npi_present_1": 0.581, "blimp/accuracy/wh_vs_that_no_gap": 0.982, "blimp/accuracy/left_branch_island_echo_question": 0.514, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.977, "blimp/accuracy/causative": 0.786, "blimp/accuracy/group_average": 0.8108656716417909, "blimp/accuracy/seq_average": 0.810865671641791, "cbt/accuracy/NE": 0.827323717948718, "cbt/accuracy/V": 0.9412, "cbt/accuracy/CN": 0.888, "cbt/accuracy/P": 0.926, "cbt/accuracy/group_average": 0.8956309294871795, "cbt/accuracy/seq_average": 0.8956582633053222, "hellaswag/accuracy/val": 0.37183827922724555, "hellaswag/accuracy/group_average": 0.37183827922724555, "hellaswag/accuracy/seq_average": 0.37183827922724555, "piqa/accuracy/val": 0.6512513601741022, "piqa/accuracy/group_average": 0.6512513601741022, "piqa/accuracy/seq_average": 0.6512513601741022, "ai2arc/accuracy/ARC-Easy": 0.38816067653276953, "ai2arc/accuracy/ARC-Challenge": 0.23948497854077253, "ai2arc/accuracy/group_average": 0.31382282753677104, "ai2arc/accuracy/seq_average": 0.33909348441926346, "mmlu/accuracy/MMLU": 0.26642831605291384, "mmlu/accuracy/group_average": 0.26642831605291384, "mmlu/accuracy/seq_average": 0.26642831605291384, "openbookqa/accuracy/test": 0.284, "openbookqa/accuracy/group_average": 0.284, "openbookqa/accuracy/seq_average": 0.284, "race/accuracy/test/high": 0.292166952544311, "race/accuracy/test/middle": 0.3593314763231198, "race/accuracy/group_average": 0.3257492144337154, "race/accuracy/seq_average": 0.3117146331576814, "siqa/accuracy/dev": 0.37563971340839303, "siqa/accuracy/group_average": 0.37563971340839303, "siqa/accuracy/seq_average": 0.37563971340839303, "winogrande/accuracy/dev": 0.5138121546961326, "winogrande/accuracy/group_average": 0.5138121546961326, "winogrande/accuracy/seq_average": 0.5138121546961326, "commonsenseqa/accuracy/dev_rand_split": 0.2882882882882883, "commonsenseqa/accuracy/group_average": 0.2882882882882883, "commonsenseqa/accuracy/seq_average": 0.2882882882882883}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-380000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2511664496527777, "val/accuracy": 0.528046138702877, "val/perplexity": 9.498809259570276, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4737514709093555, "lambada/accuracy/total": 0.3792701863354037, "lambada/accuracy/openai_last_token": 0.8051242236024845, "lambada/perplexity": 6.267921651278178, "lambada/lm_loss": 2.8486426770983715, "lambada/lm_perplexity": 17.264332656014773, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4536581625191404, "mean_loss": 2.3624589602810664, "blimp/accuracy/passive_2": 0.918, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.844, "blimp/accuracy/tough_vs_raising_2": 0.899, "blimp/accuracy/tough_vs_raising_1": 0.602, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.912, "blimp/accuracy/principle_A_reconstruction": 0.457, "blimp/accuracy/wh_vs_that_with_gap": 0.47, "blimp/accuracy/principle_A_domain_2": 0.868, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.912, "blimp/accuracy/principle_A_domain_3": 0.592, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.916, "blimp/accuracy/animate_subject_trans": 0.907, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.914, "blimp/accuracy/distractor_agreement_relative_clause": 0.698, "blimp/accuracy/transitive": 0.896, "blimp/accuracy/sentential_subject_island": 0.361, "blimp/accuracy/adjunct_island": 0.857, "blimp/accuracy/intransitive": 0.784, "blimp/accuracy/existential_there_subject_raising": 0.885, "blimp/accuracy/irregular_past_participle_adjectives": 0.934, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.73, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.382, "blimp/accuracy/only_npi_scope": 0.744, "blimp/accuracy/superlative_quantifiers_2": 0.839, "blimp/accuracy/passive_1": 0.919, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.939, "blimp/accuracy/inchoative": 0.632, "blimp/accuracy/anaphor_gender_agreement": 0.976, "blimp/accuracy/principle_A_c_command": 0.619, "blimp/accuracy/only_npi_licensor_present": 0.657, "blimp/accuracy/expletive_it_object_raising": 0.809, "blimp/accuracy/left_branch_island_simple_question": 0.818, "blimp/accuracy/wh_questions_subject_gap": 0.938, "blimp/accuracy/existential_there_quantifiers_2": 0.643, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.95, "blimp/accuracy/sentential_negation_npi_scope": 0.726, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.824, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.928, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.904, "blimp/accuracy/principle_A_case_2": 0.937, "blimp/accuracy/distractor_agreement_relational_noun": 0.85, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.975, "blimp/accuracy/superlative_quantifiers_1": 0.883, "blimp/accuracy/wh_island": 0.783, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.603, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.972, "blimp/accuracy/irregular_past_participle_verbs": 0.93, "blimp/accuracy/drop_argument": 0.75, "blimp/accuracy/wh_questions_object_gap": 0.858, "blimp/accuracy/animate_subject_passive": 0.805, "blimp/accuracy/existential_there_quantifiers_1": 0.991, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.909, "blimp/accuracy/npi_present_2": 0.609, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.961, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.969, "blimp/accuracy/existential_there_object_raising": 0.854, "blimp/accuracy/matrix_question_npi_licensor_present": 0.348, "blimp/accuracy/npi_present_1": 0.603, "blimp/accuracy/wh_vs_that_no_gap": 0.986, "blimp/accuracy/left_branch_island_echo_question": 0.53, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.969, "blimp/accuracy/causative": 0.774, "blimp/accuracy/group_average": 0.8120597014925376, "blimp/accuracy/seq_average": 0.8120597014925374, "cbt/accuracy/NE": 0.8253205128205128, "cbt/accuracy/V": 0.9432, "cbt/accuracy/CN": 0.886, "cbt/accuracy/P": 0.9304, "cbt/accuracy/group_average": 0.8962301282051283, "cbt/accuracy/seq_average": 0.8962585034013606, "hellaswag/accuracy/val": 0.37064329814777935, "hellaswag/accuracy/group_average": 0.37064329814777935, "hellaswag/accuracy/seq_average": 0.37064329814777935, "piqa/accuracy/val": 0.6496191512513602, "piqa/accuracy/group_average": 0.6496191512513602, "piqa/accuracy/seq_average": 0.6496191512513602, "ai2arc/accuracy/ARC-Easy": 0.3949260042283298, "ai2arc/accuracy/ARC-Challenge": 0.23261802575107296, "ai2arc/accuracy/group_average": 0.31377201498970136, "ai2arc/accuracy/seq_average": 0.3413597733711048, "mmlu/accuracy/MMLU": 0.26263854129424385, "mmlu/accuracy/group_average": 0.26263854129424385, "mmlu/accuracy/seq_average": 0.26263854129424385, "openbookqa/accuracy/test": 0.282, "openbookqa/accuracy/group_average": 0.282, "openbookqa/accuracy/seq_average": 0.282, "race/accuracy/test/high": 0.29302458547741567, "race/accuracy/test/middle": 0.36142061281337046, "race/accuracy/group_average": 0.3272225991453931, "race/accuracy/seq_average": 0.31293068504256183, "siqa/accuracy/dev": 0.38229273285568066, "siqa/accuracy/group_average": 0.38229273285568066, "siqa/accuracy/seq_average": 0.38229273285568066, "winogrande/accuracy/dev": 0.5193370165745856, "winogrande/accuracy/group_average": 0.5193370165745856, "winogrande/accuracy/seq_average": 0.5193370165745856, "commonsenseqa/accuracy/dev_rand_split": 0.29074529074529076, "commonsenseqa/accuracy/group_average": 0.29074529074529076, "commonsenseqa/accuracy/seq_average": 0.29074529074529076}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-40000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.583443535698785,
3
+ "val/accuracy": 0.47975086030505953,
4
+ "val/perplexity": 13.242661311347701,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.753962356851708,
8
+ "lambada/accuracy/total": 0.29891304347826086,
9
+ "lambada/accuracy/openai_last_token": 0.7668866459627329,
10
+ "lambada/perplexity": 10.079826263256464,
11
+ "lambada/lm_loss": 3.1488050089782784,
12
+ "lambada/lm_perplexity": 23.308194848642618,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.3893319518916602,
16
+ "mean_loss": 2.6687029462752463,
17
+ "blimp/accuracy/passive_2": 0.894,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.976,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.827,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.845,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.637,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.925,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.329,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.398,
25
+ "blimp/accuracy/principle_A_domain_2": 0.835,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.99,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.868,
28
+ "blimp/accuracy/principle_A_domain_3": 0.595,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.896,
30
+ "blimp/accuracy/animate_subject_trans": 0.913,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.904,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.593,
33
+ "blimp/accuracy/transitive": 0.866,
34
+ "blimp/accuracy/sentential_subject_island": 0.351,
35
+ "blimp/accuracy/adjunct_island": 0.801,
36
+ "blimp/accuracy/intransitive": 0.749,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.853,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.905,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.463,
40
+ "blimp/accuracy/principle_A_case_1": 1.0,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.214,
42
+ "blimp/accuracy/only_npi_scope": 0.711,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.567,
44
+ "blimp/accuracy/passive_1": 0.888,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.925,
46
+ "blimp/accuracy/inchoative": 0.597,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.964,
48
+ "blimp/accuracy/principle_A_c_command": 0.593,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.848,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.795,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.497,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.947,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.439,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.918,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.692,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.806,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.933,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.885,
59
+ "blimp/accuracy/principle_A_case_2": 0.944,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.843,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.986,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.688,
63
+ "blimp/accuracy/wh_island": 0.896,
64
+ "blimp/accuracy/principle_A_domain_1": 0.977,
65
+ "blimp/accuracy/complex_NP_island": 0.519,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.958,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.884,
68
+ "blimp/accuracy/drop_argument": 0.765,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.864,
70
+ "blimp/accuracy/animate_subject_passive": 0.802,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.971,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.916,
73
+ "blimp/accuracy/npi_present_2": 0.513,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.928,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.975,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.968,
77
+ "blimp/accuracy/existential_there_object_raising": 0.848,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.222,
79
+ "blimp/accuracy/npi_present_1": 0.462,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.984,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.432,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.988,
83
+ "blimp/accuracy/causative": 0.719,
84
+ "blimp/accuracy/group_average": 0.7714029850746269,
85
+ "blimp/accuracy/seq_average": 0.7714029850746269,
86
+ "cbt/accuracy/NE": 0.7724358974358975,
87
+ "cbt/accuracy/V": 0.922,
88
+ "cbt/accuracy/CN": 0.8348,
89
+ "cbt/accuracy/P": 0.89,
90
+ "cbt/accuracy/group_average": 0.8548089743589744,
91
+ "cbt/accuracy/seq_average": 0.8548419367747099,
92
+ "hellaswag/accuracy/val": 0.30342561242780325,
93
+ "hellaswag/accuracy/group_average": 0.30342561242780325,
94
+ "hellaswag/accuracy/seq_average": 0.30342561242780325,
95
+ "piqa/accuracy/val": 0.5930359085963003,
96
+ "piqa/accuracy/group_average": 0.5930359085963003,
97
+ "piqa/accuracy/seq_average": 0.5930359085963003,
98
+ "ai2arc/accuracy/ARC-Easy": 0.3382663847780127,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.21201716738197424,
100
+ "ai2arc/accuracy/group_average": 0.27514177607999346,
101
+ "ai2arc/accuracy/seq_average": 0.29660056657223793,
102
+ "mmlu/accuracy/MMLU": 0.2580622095101895,
103
+ "mmlu/accuracy/group_average": 0.2580622095101895,
104
+ "mmlu/accuracy/seq_average": 0.2580622095101895,
105
+ "openbookqa/accuracy/test": 0.294,
106
+ "openbookqa/accuracy/group_average": 0.294,
107
+ "openbookqa/accuracy/seq_average": 0.294,
108
+ "race/accuracy/test/high": 0.2672955974842767,
109
+ "race/accuracy/test/middle": 0.3286908077994429,
110
+ "race/accuracy/group_average": 0.2979932026418598,
111
+ "race/accuracy/seq_average": 0.28516416700445885,
112
+ "siqa/accuracy/dev": 0.35516888433981575,
113
+ "siqa/accuracy/group_average": 0.35516888433981575,
114
+ "siqa/accuracy/seq_average": 0.35516888433981575,
115
+ "winogrande/accuracy/dev": 0.5138121546961326,
116
+ "winogrande/accuracy/group_average": 0.5138121546961326,
117
+ "winogrande/accuracy/seq_average": 0.5138121546961326,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.25307125307125306,
119
+ "commonsenseqa/accuracy/group_average": 0.25307125307125306,
120
+ "commonsenseqa/accuracy/seq_average": 0.25307125307125306
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-400000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.2471437000093006,
3
+ "val/accuracy": 0.5288938492063492,
4
+ "val/perplexity": 9.460674682362416,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.506588195421681,
8
+ "lambada/accuracy/total": 0.37558229813664595,
9
+ "lambada/accuracy/openai_last_token": 0.8043478260869565,
10
+ "lambada/perplexity": 6.24110344487834,
11
+ "lambada/lm_loss": 2.8454761839907974,
12
+ "lambada/lm_perplexity": 17.209751726326914,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.4522380736714976,
16
+ "mean_loss": 2.3768659477154905,
17
+ "blimp/accuracy/passive_2": 0.915,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.987,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.846,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.905,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.611,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.934,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.442,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.475,
25
+ "blimp/accuracy/principle_A_domain_2": 0.874,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.992,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.915,
28
+ "blimp/accuracy/principle_A_domain_3": 0.592,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.914,
30
+ "blimp/accuracy/animate_subject_trans": 0.918,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.919,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.684,
33
+ "blimp/accuracy/transitive": 0.884,
34
+ "blimp/accuracy/sentential_subject_island": 0.352,
35
+ "blimp/accuracy/adjunct_island": 0.856,
36
+ "blimp/accuracy/intransitive": 0.783,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.884,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.904,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.743,
40
+ "blimp/accuracy/principle_A_case_1": 1.0,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.392,
42
+ "blimp/accuracy/only_npi_scope": 0.714,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.833,
44
+ "blimp/accuracy/passive_1": 0.925,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.94,
46
+ "blimp/accuracy/inchoative": 0.642,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.976,
48
+ "blimp/accuracy/principle_A_c_command": 0.579,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.61,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.81,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.816,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.933,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.636,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.943,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.742,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.834,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.919,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.916,
59
+ "blimp/accuracy/principle_A_case_2": 0.951,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.842,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.981,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.818,
63
+ "blimp/accuracy/wh_island": 0.805,
64
+ "blimp/accuracy/principle_A_domain_1": 0.984,
65
+ "blimp/accuracy/complex_NP_island": 0.616,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.972,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.904,
68
+ "blimp/accuracy/drop_argument": 0.759,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.855,
70
+ "blimp/accuracy/animate_subject_passive": 0.812,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.994,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.914,
73
+ "blimp/accuracy/npi_present_2": 0.612,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.975,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.987,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.968,
77
+ "blimp/accuracy/existential_there_object_raising": 0.864,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.355,
79
+ "blimp/accuracy/npi_present_1": 0.609,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.98,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.529,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974,
83
+ "blimp/accuracy/causative": 0.777,
84
+ "blimp/accuracy/group_average": 0.8108358208955223,
85
+ "blimp/accuracy/seq_average": 0.8108358208955224,
86
+ "cbt/accuracy/NE": 0.8201121794871795,
87
+ "cbt/accuracy/V": 0.9404,
88
+ "cbt/accuracy/CN": 0.8896,
89
+ "cbt/accuracy/P": 0.9324,
90
+ "cbt/accuracy/group_average": 0.8956280448717949,
91
+ "cbt/accuracy/seq_average": 0.8956582633053222,
92
+ "hellaswag/accuracy/val": 0.3752240589523999,
93
+ "hellaswag/accuracy/group_average": 0.3752240589523999,
94
+ "hellaswag/accuracy/seq_average": 0.3752240589523999,
95
+ "piqa/accuracy/val": 0.6490750816104461,
96
+ "piqa/accuracy/group_average": 0.6490750816104461,
97
+ "piqa/accuracy/seq_average": 0.6490750816104461,
98
+ "ai2arc/accuracy/ARC-Easy": 0.39154334038054966,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.23090128755364808,
100
+ "ai2arc/accuracy/group_average": 0.3112223139670989,
101
+ "ai2arc/accuracy/seq_average": 0.3385269121813031,
102
+ "mmlu/accuracy/MMLU": 0.26349660350375403,
103
+ "mmlu/accuracy/group_average": 0.26349660350375403,
104
+ "mmlu/accuracy/seq_average": 0.26349660350375403,
105
+ "openbookqa/accuracy/test": 0.284,
106
+ "openbookqa/accuracy/group_average": 0.284,
107
+ "openbookqa/accuracy/seq_average": 0.284,
108
+ "race/accuracy/test/high": 0.292166952544311,
109
+ "race/accuracy/test/middle": 0.36142061281337046,
110
+ "race/accuracy/group_average": 0.32679378267884074,
111
+ "race/accuracy/seq_average": 0.3123226591001216,
112
+ "siqa/accuracy/dev": 0.3766632548618219,
113
+ "siqa/accuracy/group_average": 0.3766632548618219,
114
+ "siqa/accuracy/seq_average": 0.3766632548618219,
115
+ "winogrande/accuracy/dev": 0.5209155485398579,
116
+ "winogrande/accuracy/group_average": 0.5209155485398579,
117
+ "winogrande/accuracy/seq_average": 0.5209155485398579,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.28501228501228504,
119
+ "commonsenseqa/accuracy/group_average": 0.28501228501228504,
120
+ "commonsenseqa/accuracy/seq_average": 0.28501228501228504
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-60000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.5147608196924605, "val/accuracy": 0.4903002542162698, "val/perplexity": 12.363651283522634, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6585227065945265, "lambada/accuracy/total": 0.28959627329192544, "lambada/accuracy/openai_last_token": 0.7686335403726708, "lambada/perplexity": 9.73550481731993, "lambada/lm_loss": 3.0766951345484563, "lambada/lm_perplexity": 21.686612497157228, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.38994826375409763, "mean_loss": 2.586641884244994, "blimp/accuracy/passive_2": 0.897, "blimp/accuracy/determiner_noun_agreement_2": 0.982, "blimp/accuracy/ellipsis_n_bar_1": 0.845, "blimp/accuracy/tough_vs_raising_2": 0.854, "blimp/accuracy/tough_vs_raising_1": 0.612, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.932, "blimp/accuracy/principle_A_reconstruction": 0.394, "blimp/accuracy/wh_vs_that_with_gap": 0.436, "blimp/accuracy/principle_A_domain_2": 0.855, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.896, "blimp/accuracy/principle_A_domain_3": 0.6, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.922, "blimp/accuracy/animate_subject_trans": 0.898, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.91, "blimp/accuracy/distractor_agreement_relative_clause": 0.666, "blimp/accuracy/transitive": 0.889, "blimp/accuracy/sentential_subject_island": 0.247, "blimp/accuracy/adjunct_island": 0.781, "blimp/accuracy/intransitive": 0.755, "blimp/accuracy/existential_there_subject_raising": 0.878, "blimp/accuracy/irregular_past_participle_adjectives": 0.826, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.474, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.322, "blimp/accuracy/only_npi_scope": 0.728, "blimp/accuracy/superlative_quantifiers_2": 0.63, "blimp/accuracy/passive_1": 0.878, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.942, "blimp/accuracy/inchoative": 0.623, "blimp/accuracy/anaphor_gender_agreement": 0.967, "blimp/accuracy/principle_A_c_command": 0.569, "blimp/accuracy/only_npi_licensor_present": 0.923, "blimp/accuracy/expletive_it_object_raising": 0.813, "blimp/accuracy/left_branch_island_simple_question": 0.561, "blimp/accuracy/wh_questions_subject_gap": 0.925, "blimp/accuracy/existential_there_quantifiers_2": 0.543, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.946, "blimp/accuracy/sentential_negation_npi_scope": 0.684, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.801, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.852, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.914, "blimp/accuracy/principle_A_case_2": 0.942, "blimp/accuracy/distractor_agreement_relational_noun": 0.906, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.983, "blimp/accuracy/superlative_quantifiers_1": 0.655, "blimp/accuracy/wh_island": 0.908, "blimp/accuracy/principle_A_domain_1": 0.985, "blimp/accuracy/complex_NP_island": 0.527, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.959, "blimp/accuracy/irregular_past_participle_verbs": 0.877, "blimp/accuracy/drop_argument": 0.735, "blimp/accuracy/wh_questions_object_gap": 0.819, "blimp/accuracy/animate_subject_passive": 0.791, "blimp/accuracy/existential_there_quantifiers_1": 0.979, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.906, "blimp/accuracy/npi_present_2": 0.581, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.915, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.967, "blimp/accuracy/existential_there_object_raising": 0.819, "blimp/accuracy/matrix_question_npi_licensor_present": 0.222, "blimp/accuracy/npi_present_1": 0.634, "blimp/accuracy/wh_vs_that_no_gap": 0.982, "blimp/accuracy/left_branch_island_echo_question": 0.41, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.75, "blimp/accuracy/group_average": 0.7816865671641792, "blimp/accuracy/seq_average": 0.7816865671641791, "cbt/accuracy/NE": 0.7796474358974359, "cbt/accuracy/V": 0.922, "cbt/accuracy/CN": 0.8372, "cbt/accuracy/P": 0.8996, "cbt/accuracy/group_average": 0.859611858974359, "cbt/accuracy/seq_average": 0.8596438575430172, "hellaswag/accuracy/val": 0.31388169687313283, "hellaswag/accuracy/group_average": 0.31388169687313283, "hellaswag/accuracy/seq_average": 0.31388169687313283, "piqa/accuracy/val": 0.6017410228509249, "piqa/accuracy/group_average": 0.6017410228509249, "piqa/accuracy/seq_average": 0.6017410228509249, "ai2arc/accuracy/ARC-Easy": 0.36109936575052853, "ai2arc/accuracy/ARC-Challenge": 0.20686695278969958, "ai2arc/accuracy/group_average": 0.2839831592701141, "ai2arc/accuracy/seq_average": 0.3101983002832861, "mmlu/accuracy/MMLU": 0.26721487307829817, "mmlu/accuracy/group_average": 0.26721487307829817, "mmlu/accuracy/seq_average": 0.26721487307829817, "openbookqa/accuracy/test": 0.288, "openbookqa/accuracy/group_average": 0.288, "openbookqa/accuracy/seq_average": 0.288, "race/accuracy/test/high": 0.2773013150371641, "race/accuracy/test/middle": 0.34401114206128136, "race/accuracy/group_average": 0.3106562285492227, "race/accuracy/seq_average": 0.29671665991082286, "siqa/accuracy/dev": 0.3664278403275333, "siqa/accuracy/group_average": 0.3664278403275333, "siqa/accuracy/seq_average": 0.3664278403275333, "winogrande/accuracy/dev": 0.5209155485398579, "winogrande/accuracy/group_average": 0.5209155485398579, "winogrande/accuracy/seq_average": 0.5209155485398579, "commonsenseqa/accuracy/dev_rand_split": 0.26371826371826373, "commonsenseqa/accuracy/group_average": 0.26371826371826373, "commonsenseqa/accuracy/seq_average": 0.26371826371826373}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_sigmoid_standardlb_v2/export/result-model-80000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.468655056423611, "val/accuracy": 0.4957614474826389, "val/perplexity": 11.806557015557754, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5147530691964284, "lambada/accuracy/total": 0.2791149068322981, "lambada/accuracy/openai_last_token": 0.7754270186335404, "lambada/perplexity": 9.574216599234415, "lambada/lm_loss": 3.0446928632640704, "lambada/lm_perplexity": 21.003579241342003, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.38743817715746853, "mean_loss": 2.4917040628100198, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.833, "blimp/accuracy/tough_vs_raising_2": 0.855, "blimp/accuracy/tough_vs_raising_1": 0.63, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.918, "blimp/accuracy/principle_A_reconstruction": 0.456, "blimp/accuracy/wh_vs_that_with_gap": 0.491, "blimp/accuracy/principle_A_domain_2": 0.852, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.888, "blimp/accuracy/principle_A_domain_3": 0.583, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.907, "blimp/accuracy/animate_subject_trans": 0.89, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.9, "blimp/accuracy/distractor_agreement_relative_clause": 0.621, "blimp/accuracy/transitive": 0.867, "blimp/accuracy/sentential_subject_island": 0.312, "blimp/accuracy/adjunct_island": 0.806, "blimp/accuracy/intransitive": 0.768, "blimp/accuracy/existential_there_subject_raising": 0.85, "blimp/accuracy/irregular_past_participle_adjectives": 0.899, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.572, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.3, "blimp/accuracy/only_npi_scope": 0.691, "blimp/accuracy/superlative_quantifiers_2": 0.72, "blimp/accuracy/passive_1": 0.903, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.918, "blimp/accuracy/inchoative": 0.607, "blimp/accuracy/anaphor_gender_agreement": 0.961, "blimp/accuracy/principle_A_c_command": 0.627, "blimp/accuracy/only_npi_licensor_present": 0.77, "blimp/accuracy/expletive_it_object_raising": 0.804, "blimp/accuracy/left_branch_island_simple_question": 0.616, "blimp/accuracy/wh_questions_subject_gap": 0.943, "blimp/accuracy/existential_there_quantifiers_2": 0.568, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.938, "blimp/accuracy/sentential_negation_npi_scope": 0.719, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.803, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.877, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.905, "blimp/accuracy/principle_A_case_2": 0.927, "blimp/accuracy/distractor_agreement_relational_noun": 0.862, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.98, "blimp/accuracy/superlative_quantifiers_1": 0.784, "blimp/accuracy/wh_island": 0.826, "blimp/accuracy/principle_A_domain_1": 0.982, "blimp/accuracy/complex_NP_island": 0.497, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.966, "blimp/accuracy/irregular_past_participle_verbs": 0.854, "blimp/accuracy/drop_argument": 0.738, "blimp/accuracy/wh_questions_object_gap": 0.832, "blimp/accuracy/animate_subject_passive": 0.789, "blimp/accuracy/existential_there_quantifiers_1": 0.984, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.917, "blimp/accuracy/npi_present_2": 0.626, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.937, "blimp/accuracy/anaphor_number_agreement": 0.981, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.957, "blimp/accuracy/existential_there_object_raising": 0.837, "blimp/accuracy/matrix_question_npi_licensor_present": 0.253, "blimp/accuracy/npi_present_1": 0.637, "blimp/accuracy/wh_vs_that_no_gap": 0.988, "blimp/accuracy/left_branch_island_echo_question": 0.389, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.973, "blimp/accuracy/causative": 0.737, "blimp/accuracy/group_average": 0.7868208955223881, "blimp/accuracy/seq_average": 0.7868208955223881, "cbt/accuracy/NE": 0.796875, "cbt/accuracy/V": 0.9248, "cbt/accuracy/CN": 0.854, "cbt/accuracy/P": 0.9104, "cbt/accuracy/group_average": 0.87151875, "cbt/accuracy/seq_average": 0.8715486194477791, "hellaswag/accuracy/val": 0.3189603664608644, "hellaswag/accuracy/group_average": 0.3189603664608644, "hellaswag/accuracy/seq_average": 0.3189603664608644, "piqa/accuracy/val": 0.6207834602829162, "piqa/accuracy/group_average": 0.6207834602829162, "piqa/accuracy/seq_average": 0.6207834602829162, "ai2arc/accuracy/ARC-Easy": 0.34334038054968286, "ai2arc/accuracy/ARC-Challenge": 0.2206008583690987, "ai2arc/accuracy/group_average": 0.2819706194593908, "ai2arc/accuracy/seq_average": 0.3028328611898017, "mmlu/accuracy/MMLU": 0.2642116553450125, "mmlu/accuracy/group_average": 0.2642116553450125, "mmlu/accuracy/seq_average": 0.2642116553450125, "openbookqa/accuracy/test": 0.278, "openbookqa/accuracy/group_average": 0.278, "openbookqa/accuracy/seq_average": 0.278, "race/accuracy/test/high": 0.2775871926815323, "race/accuracy/test/middle": 0.3516713091922006, "race/accuracy/group_average": 0.31462925093686644, "race/accuracy/seq_average": 0.2991487636805837, "siqa/accuracy/dev": 0.3633572159672467, "siqa/accuracy/group_average": 0.3633572159672467, "siqa/accuracy/seq_average": 0.3633572159672467, "winogrande/accuracy/dev": 0.510655090765588, "winogrande/accuracy/group_average": 0.510655090765588, "winogrande/accuracy/seq_average": 0.510655090765588, "commonsenseqa/accuracy/dev_rand_split": 0.2710892710892711, "commonsenseqa/accuracy/group_average": 0.2710892710892711, "commonsenseqa/accuracy/seq_average": 0.2710892710892711}