DavidNguyen commited on
Commit
e8591ac
·
verified ·
1 Parent(s): 0c26476

06ac304c18c243283f61724d5bcd40229a92cc6dd9592e6ed62d41cb88909d4b

Browse files
Files changed (20) hide show
  1. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-100000.pth.json +1 -0
  2. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-120000.pth.json +1 -0
  3. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-140000.pth.json +1 -0
  4. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-160000.pth.json +1 -0
  5. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-180000.pth.json +1 -0
  6. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-20000.pth.json +1 -0
  7. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-200000.pth.json +1 -0
  8. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-220000.pth.json +1 -0
  9. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-240000.pth.json +1 -0
  10. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-260000.pth.json +1 -0
  11. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-280000.pth.json +1 -0
  12. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-300000.pth.json +1 -0
  13. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-320000.pth.json +1 -0
  14. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-340000.pth.json +1 -0
  15. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-360000.pth.json +1 -0
  16. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-380000.pth.json +1 -0
  17. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-40000.pth.json +1 -0
  18. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-400000.pth.json +1 -0
  19. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-60000.pth.json +1 -0
  20. Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-80000.pth.json +1 -0
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-100000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.4664105612134177, "val/accuracy": 0.49588448660714285, "val/perplexity": 11.780086971942987, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6781388750727873, "lambada/accuracy/total": 0.30124223602484473, "lambada/accuracy/openai_last_token": 0.7787267080745341, "lambada/perplexity": 9.116728943957694, "lambada/lm_loss": 3.0431960470432444, "lambada/lm_perplexity": 20.972164260330068, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3985633613159938, "mean_loss": 2.5722747181431025, "blimp/accuracy/passive_2": 0.891, "blimp/accuracy/determiner_noun_agreement_2": 0.982, "blimp/accuracy/ellipsis_n_bar_1": 0.828, "blimp/accuracy/tough_vs_raising_2": 0.909, "blimp/accuracy/tough_vs_raising_1": 0.606, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.877, "blimp/accuracy/principle_A_reconstruction": 0.216, "blimp/accuracy/wh_vs_that_with_gap": 0.523, "blimp/accuracy/principle_A_domain_2": 0.915, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.929, "blimp/accuracy/principle_A_domain_3": 0.707, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.918, "blimp/accuracy/animate_subject_trans": 0.906, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.911, "blimp/accuracy/distractor_agreement_relative_clause": 0.637, "blimp/accuracy/transitive": 0.863, "blimp/accuracy/sentential_subject_island": 0.371, "blimp/accuracy/adjunct_island": 0.845, "blimp/accuracy/intransitive": 0.776, "blimp/accuracy/existential_there_subject_raising": 0.873, "blimp/accuracy/irregular_past_participle_adjectives": 0.938, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.567, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.312, "blimp/accuracy/only_npi_scope": 0.781, "blimp/accuracy/superlative_quantifiers_2": 0.761, "blimp/accuracy/passive_1": 0.881, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.89, "blimp/accuracy/inchoative": 0.645, "blimp/accuracy/anaphor_gender_agreement": 0.976, "blimp/accuracy/principle_A_c_command": 0.59, "blimp/accuracy/only_npi_licensor_present": 0.773, "blimp/accuracy/expletive_it_object_raising": 0.809, "blimp/accuracy/left_branch_island_simple_question": 0.68, "blimp/accuracy/wh_questions_subject_gap": 0.947, "blimp/accuracy/existential_there_quantifiers_2": 0.594, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.952, "blimp/accuracy/sentential_negation_npi_scope": 0.608, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.797, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.868, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.913, "blimp/accuracy/principle_A_case_2": 0.914, "blimp/accuracy/distractor_agreement_relational_noun": 0.839, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.965, "blimp/accuracy/superlative_quantifiers_1": 0.663, "blimp/accuracy/wh_island": 0.747, "blimp/accuracy/principle_A_domain_1": 0.972, "blimp/accuracy/complex_NP_island": 0.552, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.962, "blimp/accuracy/irregular_past_participle_verbs": 0.923, "blimp/accuracy/drop_argument": 0.762, "blimp/accuracy/wh_questions_object_gap": 0.82, "blimp/accuracy/animate_subject_passive": 0.807, "blimp/accuracy/existential_there_quantifiers_1": 0.998, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.882, "blimp/accuracy/npi_present_2": 0.579, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.944, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.95, "blimp/accuracy/existential_there_object_raising": 0.824, "blimp/accuracy/matrix_question_npi_licensor_present": 0.311, "blimp/accuracy/npi_present_1": 0.52, "blimp/accuracy/wh_vs_that_no_gap": 0.972, "blimp/accuracy/left_branch_island_echo_question": 0.429, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.968, "blimp/accuracy/causative": 0.746, "blimp/accuracy/group_average": 0.7879253731343286, "blimp/accuracy/seq_average": 0.7879253731343283, "cbt/accuracy/NE": 0.7964743589743589, "cbt/accuracy/V": 0.9232, "cbt/accuracy/CN": 0.8584, "cbt/accuracy/P": 0.896, "cbt/accuracy/group_average": 0.8685185897435898, "cbt/accuracy/seq_average": 0.868547418967587, "hellaswag/accuracy/val": 0.3229436367257518, "hellaswag/accuracy/group_average": 0.3229436367257518, "hellaswag/accuracy/seq_average": 0.3229436367257518, "piqa/accuracy/val": 0.5990206746463548, "piqa/accuracy/group_average": 0.5990206746463548, "piqa/accuracy/seq_average": 0.5990206746463548, "ai2arc/accuracy/ARC-Easy": 0.35433403805496827, "ai2arc/accuracy/ARC-Challenge": 0.22832618025751072, "ai2arc/accuracy/group_average": 0.29133010915623947, "ai2arc/accuracy/seq_average": 0.31274787535410764, "race/accuracy/test/high": 0.28130360205831906, "race/accuracy/test/middle": 0.34401114206128136, "race/accuracy/group_average": 0.3126573720598002, "race/accuracy/seq_average": 0.2995541143088772, "siqa/accuracy/dev": 0.35977482088024565, "siqa/accuracy/group_average": 0.35977482088024565, "siqa/accuracy/seq_average": 0.35977482088024565, "commonsenseqa/accuracy/dev_rand_split": 0.2596232596232596, "commonsenseqa/accuracy/group_average": 0.2596232596232596, "commonsenseqa/accuracy/seq_average": 0.2596232596232596}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-120000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.4421248663039434, "val/accuracy": 0.49870566716269843, "val/perplexity": 11.49744533954327, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5755429475203804, "lambada/accuracy/total": 0.3187111801242236, "lambada/accuracy/openai_last_token": 0.7853260869565217, "lambada/perplexity": 8.165192069714994, "lambada/lm_loss": 3.017661907585545, "lambada/lm_perplexity": 20.443437116545, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.40870842364346105, "mean_loss": 2.508833906912162, "blimp/accuracy/passive_2": 0.918, "blimp/accuracy/determiner_noun_agreement_2": 0.985, "blimp/accuracy/ellipsis_n_bar_1": 0.823, "blimp/accuracy/tough_vs_raising_2": 0.884, "blimp/accuracy/tough_vs_raising_1": 0.602, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.911, "blimp/accuracy/principle_A_reconstruction": 0.34, "blimp/accuracy/wh_vs_that_with_gap": 0.461, "blimp/accuracy/principle_A_domain_2": 0.892, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.928, "blimp/accuracy/principle_A_domain_3": 0.632, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.923, "blimp/accuracy/animate_subject_trans": 0.891, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.891, "blimp/accuracy/distractor_agreement_relative_clause": 0.643, "blimp/accuracy/transitive": 0.868, "blimp/accuracy/sentential_subject_island": 0.383, "blimp/accuracy/adjunct_island": 0.874, "blimp/accuracy/intransitive": 0.749, "blimp/accuracy/existential_there_subject_raising": 0.892, "blimp/accuracy/irregular_past_participle_adjectives": 0.929, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.53, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.316, "blimp/accuracy/only_npi_scope": 0.644, "blimp/accuracy/superlative_quantifiers_2": 0.766, "blimp/accuracy/passive_1": 0.891, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.92, "blimp/accuracy/inchoative": 0.624, "blimp/accuracy/anaphor_gender_agreement": 0.969, "blimp/accuracy/principle_A_c_command": 0.616, "blimp/accuracy/only_npi_licensor_present": 0.625, "blimp/accuracy/expletive_it_object_raising": 0.798, "blimp/accuracy/left_branch_island_simple_question": 0.656, "blimp/accuracy/wh_questions_subject_gap": 0.935, "blimp/accuracy/existential_there_quantifiers_2": 0.532, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.942, "blimp/accuracy/sentential_negation_npi_scope": 0.706, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.775, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.883, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.913, "blimp/accuracy/principle_A_case_2": 0.924, "blimp/accuracy/distractor_agreement_relational_noun": 0.808, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.99, "blimp/accuracy/superlative_quantifiers_1": 0.78, "blimp/accuracy/wh_island": 0.785, "blimp/accuracy/principle_A_domain_1": 0.986, "blimp/accuracy/complex_NP_island": 0.592, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.975, "blimp/accuracy/irregular_past_participle_verbs": 0.899, "blimp/accuracy/drop_argument": 0.718, "blimp/accuracy/wh_questions_object_gap": 0.816, "blimp/accuracy/animate_subject_passive": 0.813, "blimp/accuracy/existential_there_quantifiers_1": 0.991, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.901, "blimp/accuracy/npi_present_2": 0.596, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.916, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.952, "blimp/accuracy/existential_there_object_raising": 0.853, "blimp/accuracy/matrix_question_npi_licensor_present": 0.319, "blimp/accuracy/npi_present_1": 0.505, "blimp/accuracy/wh_vs_that_no_gap": 0.972, "blimp/accuracy/left_branch_island_echo_question": 0.386, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.957, "blimp/accuracy/causative": 0.758, "blimp/accuracy/group_average": 0.7855820895522388, "blimp/accuracy/seq_average": 0.7855820895522388, "cbt/accuracy/NE": 0.796875, "cbt/accuracy/V": 0.9232, "cbt/accuracy/CN": 0.86, "cbt/accuracy/P": 0.9108, "cbt/accuracy/group_average": 0.87271875, "cbt/accuracy/seq_average": 0.8727490996398559, "hellaswag/accuracy/val": 0.32951603266281615, "hellaswag/accuracy/group_average": 0.32951603266281615, "hellaswag/accuracy/seq_average": 0.32951603266281615, "piqa/accuracy/val": 0.6044613710554951, "piqa/accuracy/group_average": 0.6044613710554951, "piqa/accuracy/seq_average": 0.6044613710554951, "ai2arc/accuracy/ARC-Easy": 0.36194503171247355, "ai2arc/accuracy/ARC-Challenge": 0.2094420600858369, "ai2arc/accuracy/group_average": 0.28569354589915524, "ai2arc/accuracy/seq_average": 0.311614730878187, "race/accuracy/test/high": 0.29016580903373357, "race/accuracy/test/middle": 0.35097493036211697, "race/accuracy/group_average": 0.3205703696979253, "race/accuracy/seq_average": 0.30786380218889337, "siqa/accuracy/dev": 0.36438075742067555, "siqa/accuracy/group_average": 0.36438075742067555, "siqa/accuracy/seq_average": 0.36438075742067555, "commonsenseqa/accuracy/dev_rand_split": 0.25061425061425063, "commonsenseqa/accuracy/group_average": 0.25061425061425063, "commonsenseqa/accuracy/seq_average": 0.25061425061425063}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-140000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.420013912140377, "val/accuracy": 0.5032687717013888, "val/perplexity": 11.246015769943604, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5039229304153725, "lambada/accuracy/total": 0.3105590062111801, "lambada/accuracy/openai_last_token": 0.7839673913043478, "lambada/perplexity": 8.281849354559121, "lambada/lm_loss": 3.010106063114048, "lambada/lm_perplexity": 20.289551784167354, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.40691388895628444, "mean_loss": 2.4619684212778745, "blimp/accuracy/passive_2": 0.907, "blimp/accuracy/determiner_noun_agreement_2": 0.981, "blimp/accuracy/ellipsis_n_bar_1": 0.851, "blimp/accuracy/tough_vs_raising_2": 0.916, "blimp/accuracy/tough_vs_raising_1": 0.553, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.908, "blimp/accuracy/principle_A_reconstruction": 0.383, "blimp/accuracy/wh_vs_that_with_gap": 0.511, "blimp/accuracy/principle_A_domain_2": 0.895, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.922, "blimp/accuracy/principle_A_domain_3": 0.657, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.929, "blimp/accuracy/animate_subject_trans": 0.911, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.891, "blimp/accuracy/distractor_agreement_relative_clause": 0.639, "blimp/accuracy/transitive": 0.899, "blimp/accuracy/sentential_subject_island": 0.415, "blimp/accuracy/adjunct_island": 0.877, "blimp/accuracy/intransitive": 0.77, "blimp/accuracy/existential_there_subject_raising": 0.907, "blimp/accuracy/irregular_past_participle_adjectives": 0.675, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.524, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.361, "blimp/accuracy/only_npi_scope": 0.688, "blimp/accuracy/superlative_quantifiers_2": 0.824, "blimp/accuracy/passive_1": 0.893, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.928, "blimp/accuracy/inchoative": 0.615, "blimp/accuracy/anaphor_gender_agreement": 0.967, "blimp/accuracy/principle_A_c_command": 0.614, "blimp/accuracy/only_npi_licensor_present": 0.419, "blimp/accuracy/expletive_it_object_raising": 0.827, "blimp/accuracy/left_branch_island_simple_question": 0.561, "blimp/accuracy/wh_questions_subject_gap": 0.954, "blimp/accuracy/existential_there_quantifiers_2": 0.582, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.954, "blimp/accuracy/sentential_negation_npi_scope": 0.672, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.775, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.889, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.908, "blimp/accuracy/principle_A_case_2": 0.931, "blimp/accuracy/distractor_agreement_relational_noun": 0.855, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987, "blimp/accuracy/superlative_quantifiers_1": 0.809, "blimp/accuracy/wh_island": 0.747, "blimp/accuracy/principle_A_domain_1": 0.984, "blimp/accuracy/complex_NP_island": 0.625, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.97, "blimp/accuracy/irregular_past_participle_verbs": 0.923, "blimp/accuracy/drop_argument": 0.757, "blimp/accuracy/wh_questions_object_gap": 0.855, "blimp/accuracy/animate_subject_passive": 0.819, "blimp/accuracy/existential_there_quantifiers_1": 0.988, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.933, "blimp/accuracy/npi_present_2": 0.501, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.939, "blimp/accuracy/anaphor_number_agreement": 0.982, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.958, "blimp/accuracy/existential_there_object_raising": 0.854, "blimp/accuracy/matrix_question_npi_licensor_present": 0.279, "blimp/accuracy/npi_present_1": 0.412, "blimp/accuracy/wh_vs_that_no_gap": 0.981, "blimp/accuracy/left_branch_island_echo_question": 0.451, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.961, "blimp/accuracy/causative": 0.754, "blimp/accuracy/group_average": 0.7850298507462685, "blimp/accuracy/seq_average": 0.7850298507462686, "cbt/accuracy/NE": 0.8016826923076923, "cbt/accuracy/V": 0.9276, "cbt/accuracy/CN": 0.864, "cbt/accuracy/P": 0.9052, "cbt/accuracy/group_average": 0.874620673076923, "cbt/accuracy/seq_average": 0.8746498599439776, "hellaswag/accuracy/val": 0.3343955387373033, "hellaswag/accuracy/group_average": 0.3343955387373033, "hellaswag/accuracy/seq_average": 0.3343955387373033, "piqa/accuracy/val": 0.6088139281828074, "piqa/accuracy/group_average": 0.6088139281828074, "piqa/accuracy/seq_average": 0.6088139281828074, "ai2arc/accuracy/ARC-Easy": 0.36363636363636365, "ai2arc/accuracy/ARC-Challenge": 0.22746781115879827, "ai2arc/accuracy/group_average": 0.295552087397581, "ai2arc/accuracy/seq_average": 0.31869688385269124, "race/accuracy/test/high": 0.28730703259005147, "race/accuracy/test/middle": 0.34958217270194986, "race/accuracy/group_average": 0.31844460264600066, "race/accuracy/seq_average": 0.30543169841913254, "siqa/accuracy/dev": 0.37563971340839303, "siqa/accuracy/group_average": 0.37563971340839303, "siqa/accuracy/seq_average": 0.37563971340839303, "commonsenseqa/accuracy/dev_rand_split": 0.2678132678132678, "commonsenseqa/accuracy/group_average": 0.2678132678132678, "commonsenseqa/accuracy/seq_average": 0.2678132678132678}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-160000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.400789291139633, "val/accuracy": 0.5053284660218254, "val/perplexity": 11.031880310604988, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.907242295164499, "lambada/accuracy/total": 0.30085403726708076, "lambada/accuracy/openai_last_token": 0.7820263975155279, "lambada/perplexity": 8.1774234604529, "lambada/lm_loss": 2.986163027513132, "lambada/lm_perplexity": 19.80952786998715, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.40309125164445303, "mean_loss": 2.654015793152066, "blimp/accuracy/passive_2": 0.9, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.872, "blimp/accuracy/tough_vs_raising_2": 0.918, "blimp/accuracy/tough_vs_raising_1": 0.568, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.934, "blimp/accuracy/principle_A_reconstruction": 0.308, "blimp/accuracy/wh_vs_that_with_gap": 0.423, "blimp/accuracy/principle_A_domain_2": 0.88, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.926, "blimp/accuracy/principle_A_domain_3": 0.612, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.936, "blimp/accuracy/animate_subject_trans": 0.893, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.889, "blimp/accuracy/distractor_agreement_relative_clause": 0.648, "blimp/accuracy/transitive": 0.881, "blimp/accuracy/sentential_subject_island": 0.423, "blimp/accuracy/adjunct_island": 0.884, "blimp/accuracy/intransitive": 0.74, "blimp/accuracy/existential_there_subject_raising": 0.886, "blimp/accuracy/irregular_past_participle_adjectives": 0.775, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.558, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.298, "blimp/accuracy/only_npi_scope": 0.745, "blimp/accuracy/superlative_quantifiers_2": 0.885, "blimp/accuracy/passive_1": 0.895, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.923, "blimp/accuracy/inchoative": 0.624, "blimp/accuracy/anaphor_gender_agreement": 0.969, "blimp/accuracy/principle_A_c_command": 0.612, "blimp/accuracy/only_npi_licensor_present": 0.684, "blimp/accuracy/expletive_it_object_raising": 0.801, "blimp/accuracy/left_branch_island_simple_question": 0.653, "blimp/accuracy/wh_questions_subject_gap": 0.943, "blimp/accuracy/existential_there_quantifiers_2": 0.506, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.945, "blimp/accuracy/sentential_negation_npi_scope": 0.654, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.839, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.894, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.911, "blimp/accuracy/principle_A_case_2": 0.928, "blimp/accuracy/distractor_agreement_relational_noun": 0.854, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.794, "blimp/accuracy/wh_island": 0.815, "blimp/accuracy/principle_A_domain_1": 0.982, "blimp/accuracy/complex_NP_island": 0.587, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.963, "blimp/accuracy/irregular_past_participle_verbs": 0.902, "blimp/accuracy/drop_argument": 0.738, "blimp/accuracy/wh_questions_object_gap": 0.853, "blimp/accuracy/animate_subject_passive": 0.799, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.908, "blimp/accuracy/npi_present_2": 0.531, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.921, "blimp/accuracy/anaphor_number_agreement": 0.987, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.966, "blimp/accuracy/existential_there_object_raising": 0.864, "blimp/accuracy/matrix_question_npi_licensor_present": 0.272, "blimp/accuracy/npi_present_1": 0.471, "blimp/accuracy/wh_vs_that_no_gap": 0.979, "blimp/accuracy/left_branch_island_echo_question": 0.393, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.968, "blimp/accuracy/causative": 0.747, "blimp/accuracy/group_average": 0.7882089552238802, "blimp/accuracy/seq_average": 0.7882089552238806, "cbt/accuracy/NE": 0.8000801282051282, "cbt/accuracy/V": 0.9308, "cbt/accuracy/CN": 0.8748, "cbt/accuracy/P": 0.9072, "cbt/accuracy/group_average": 0.8782200320512821, "cbt/accuracy/seq_average": 0.8782513005202081, "hellaswag/accuracy/val": 0.34156542521410077, "hellaswag/accuracy/group_average": 0.34156542521410077, "hellaswag/accuracy/seq_average": 0.34156542521410077, "piqa/accuracy/val": 0.6229597388465724, "piqa/accuracy/group_average": 0.6229597388465724, "piqa/accuracy/seq_average": 0.6229597388465724, "ai2arc/accuracy/ARC-Easy": 0.3767441860465116, "ai2arc/accuracy/ARC-Challenge": 0.21630901287553647, "ai2arc/accuracy/group_average": 0.29652659946102405, "ai2arc/accuracy/seq_average": 0.32379603399433426, "race/accuracy/test/high": 0.2830188679245283, "race/accuracy/test/middle": 0.3565459610027855, "race/accuracy/group_average": 0.3197824144636569, "race/accuracy/seq_average": 0.30441832184839884, "siqa/accuracy/dev": 0.37001023541453426, "siqa/accuracy/group_average": 0.37001023541453426, "siqa/accuracy/seq_average": 0.37001023541453426, "commonsenseqa/accuracy/dev_rand_split": 0.26617526617526616, "commonsenseqa/accuracy/group_average": 0.26617526617526616, "commonsenseqa/accuracy/seq_average": 0.26617526617526616}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-180000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.381462702675471, "val/accuracy": 0.5085497659350199, "val/perplexity": 10.820718788436235, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.781669853636937, "lambada/accuracy/total": 0.3297748447204969, "lambada/accuracy/openai_last_token": 0.7903726708074534, "lambada/perplexity": 7.825589193661386, "lambada/lm_loss": 2.978314891370167, "lambada/lm_perplexity": 19.654668472011664, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4191623053277584, "mean_loss": 2.5815662781562043, "blimp/accuracy/passive_2": 0.905, "blimp/accuracy/determiner_noun_agreement_2": 0.986, "blimp/accuracy/ellipsis_n_bar_1": 0.867, "blimp/accuracy/tough_vs_raising_2": 0.891, "blimp/accuracy/tough_vs_raising_1": 0.603, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.939, "blimp/accuracy/principle_A_reconstruction": 0.262, "blimp/accuracy/wh_vs_that_with_gap": 0.426, "blimp/accuracy/principle_A_domain_2": 0.909, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.914, "blimp/accuracy/principle_A_domain_3": 0.657, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.933, "blimp/accuracy/animate_subject_trans": 0.913, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.908, "blimp/accuracy/distractor_agreement_relative_clause": 0.654, "blimp/accuracy/transitive": 0.892, "blimp/accuracy/sentential_subject_island": 0.454, "blimp/accuracy/adjunct_island": 0.86, "blimp/accuracy/intransitive": 0.73, "blimp/accuracy/existential_there_subject_raising": 0.885, "blimp/accuracy/irregular_past_participle_adjectives": 0.916, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.53, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.317, "blimp/accuracy/only_npi_scope": 0.598, "blimp/accuracy/superlative_quantifiers_2": 0.78, "blimp/accuracy/passive_1": 0.871, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.923, "blimp/accuracy/inchoative": 0.606, "blimp/accuracy/anaphor_gender_agreement": 0.966, "blimp/accuracy/principle_A_c_command": 0.645, "blimp/accuracy/only_npi_licensor_present": 0.777, "blimp/accuracy/expletive_it_object_raising": 0.826, "blimp/accuracy/left_branch_island_simple_question": 0.585, "blimp/accuracy/wh_questions_subject_gap": 0.942, "blimp/accuracy/existential_there_quantifiers_2": 0.593, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.94, "blimp/accuracy/sentential_negation_npi_scope": 0.641, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.739, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.893, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.917, "blimp/accuracy/principle_A_case_2": 0.934, "blimp/accuracy/distractor_agreement_relational_noun": 0.841, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987, "blimp/accuracy/superlative_quantifiers_1": 0.657, "blimp/accuracy/wh_island": 0.8, "blimp/accuracy/principle_A_domain_1": 0.99, "blimp/accuracy/complex_NP_island": 0.61, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.975, "blimp/accuracy/irregular_past_participle_verbs": 0.872, "blimp/accuracy/drop_argument": 0.724, "blimp/accuracy/wh_questions_object_gap": 0.865, "blimp/accuracy/animate_subject_passive": 0.795, "blimp/accuracy/existential_there_quantifiers_1": 0.992, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.886, "blimp/accuracy/npi_present_2": 0.495, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.924, "blimp/accuracy/anaphor_number_agreement": 0.99, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.949, "blimp/accuracy/existential_there_object_raising": 0.902, "blimp/accuracy/matrix_question_npi_licensor_present": 0.366, "blimp/accuracy/npi_present_1": 0.471, "blimp/accuracy/wh_vs_that_no_gap": 0.981, "blimp/accuracy/left_branch_island_echo_question": 0.399, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.97, "blimp/accuracy/causative": 0.731, "blimp/accuracy/group_average": 0.7864328358208955, "blimp/accuracy/seq_average": 0.7864328358208955, "cbt/accuracy/NE": 0.8108974358974359, "cbt/accuracy/V": 0.9288, "cbt/accuracy/CN": 0.8724, "cbt/accuracy/P": 0.9152, "cbt/accuracy/group_average": 0.881824358974359, "cbt/accuracy/seq_average": 0.8818527410964386, "hellaswag/accuracy/val": 0.3434574785899223, "hellaswag/accuracy/group_average": 0.3434574785899223, "hellaswag/accuracy/seq_average": 0.3434574785899223, "piqa/accuracy/val": 0.6224156692056583, "piqa/accuracy/group_average": 0.6224156692056583, "piqa/accuracy/seq_average": 0.6224156692056583, "ai2arc/accuracy/ARC-Easy": 0.38012684989429174, "ai2arc/accuracy/ARC-Challenge": 0.2317596566523605, "ai2arc/accuracy/group_average": 0.3059432532733261, "ai2arc/accuracy/seq_average": 0.3311614730878187, "race/accuracy/test/high": 0.28673527730131504, "race/accuracy/test/middle": 0.3488857938718663, "race/accuracy/group_average": 0.31781053558659067, "race/accuracy/seq_average": 0.30482367247669234, "siqa/accuracy/dev": 0.3618219037871034, "siqa/accuracy/group_average": 0.3618219037871034, "siqa/accuracy/seq_average": 0.3618219037871034, "commonsenseqa/accuracy/dev_rand_split": 0.27436527436527436, "commonsenseqa/accuracy/group_average": 0.27436527436527436, "commonsenseqa/accuracy/seq_average": 0.27436527436527436}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-20000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.759589059012277, "val/accuracy": 0.4554966517857143, "val/perplexity": 15.793351479090099, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.600937932174398, "lambada/accuracy/total": 0.21215062111801242, "lambada/accuracy/openai_last_token": 0.7455357142857143, "lambada/perplexity": 14.948803385663068, "lambada/lm_loss": 3.3013689517100633, "lambada/lm_perplexity": 27.149780230602207, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.33382363645186336, "mean_loss": 2.6802634955933375, "blimp/accuracy/passive_2": 0.873, "blimp/accuracy/determiner_noun_agreement_2": 0.981, "blimp/accuracy/ellipsis_n_bar_1": 0.746, "blimp/accuracy/tough_vs_raising_2": 0.891, "blimp/accuracy/tough_vs_raising_1": 0.542, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.883, "blimp/accuracy/principle_A_reconstruction": 0.449, "blimp/accuracy/wh_vs_that_with_gap": 0.466, "blimp/accuracy/principle_A_domain_2": 0.855, "blimp/accuracy/determiner_noun_agreement_1": 0.967, "blimp/accuracy/ellipsis_n_bar_2": 0.905, "blimp/accuracy/principle_A_domain_3": 0.584, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.923, "blimp/accuracy/animate_subject_trans": 0.887, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.857, "blimp/accuracy/distractor_agreement_relative_clause": 0.541, "blimp/accuracy/transitive": 0.843, "blimp/accuracy/sentential_subject_island": 0.351, "blimp/accuracy/adjunct_island": 0.832, "blimp/accuracy/intransitive": 0.759, "blimp/accuracy/existential_there_subject_raising": 0.865, "blimp/accuracy/irregular_past_participle_adjectives": 0.874, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.211, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.192, "blimp/accuracy/only_npi_scope": 0.6, "blimp/accuracy/superlative_quantifiers_2": 0.707, "blimp/accuracy/passive_1": 0.861, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.851, "blimp/accuracy/inchoative": 0.596, "blimp/accuracy/anaphor_gender_agreement": 0.961, "blimp/accuracy/principle_A_c_command": 0.543, "blimp/accuracy/only_npi_licensor_present": 0.446, "blimp/accuracy/expletive_it_object_raising": 0.723, "blimp/accuracy/left_branch_island_simple_question": 0.228, "blimp/accuracy/wh_questions_subject_gap": 0.914, "blimp/accuracy/existential_there_quantifiers_2": 0.239, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.934, "blimp/accuracy/sentential_negation_npi_scope": 0.618, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.719, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.899, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.857, "blimp/accuracy/principle_A_case_2": 0.946, "blimp/accuracy/distractor_agreement_relational_noun": 0.732, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.999, "blimp/accuracy/superlative_quantifiers_1": 0.63, "blimp/accuracy/wh_island": 0.824, "blimp/accuracy/principle_A_domain_1": 0.968, "blimp/accuracy/complex_NP_island": 0.463, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.963, "blimp/accuracy/irregular_past_participle_verbs": 0.827, "blimp/accuracy/drop_argument": 0.784, "blimp/accuracy/wh_questions_object_gap": 0.781, "blimp/accuracy/animate_subject_passive": 0.755, "blimp/accuracy/existential_there_quantifiers_1": 0.962, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.847, "blimp/accuracy/npi_present_2": 0.603, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.892, "blimp/accuracy/anaphor_number_agreement": 0.973, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.933, "blimp/accuracy/existential_there_object_raising": 0.762, "blimp/accuracy/matrix_question_npi_licensor_present": 0.224, "blimp/accuracy/npi_present_1": 0.609, "blimp/accuracy/wh_vs_that_no_gap": 0.975, "blimp/accuracy/left_branch_island_echo_question": 0.339, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.975, "blimp/accuracy/causative": 0.68, "blimp/accuracy/group_average": 0.7375820895522389, "blimp/accuracy/seq_average": 0.7375820895522388, "cbt/accuracy/NE": 0.7391826923076923, "cbt/accuracy/V": 0.9012, "cbt/accuracy/CN": 0.802, "cbt/accuracy/P": 0.8704, "cbt/accuracy/group_average": 0.8281956730769231, "cbt/accuracy/seq_average": 0.8282312925170068, "hellaswag/accuracy/val": 0.28599880501892055, "hellaswag/accuracy/group_average": 0.28599880501892055, "hellaswag/accuracy/seq_average": 0.28599880501892055, "piqa/accuracy/val": 0.558215451577802, "piqa/accuracy/group_average": 0.558215451577802, "piqa/accuracy/seq_average": 0.558215451577802, "ai2arc/accuracy/ARC-Easy": 0.32389006342494714, "ai2arc/accuracy/ARC-Challenge": 0.20085836909871244, "ai2arc/accuracy/group_average": 0.2623742162618298, "ai2arc/accuracy/seq_average": 0.28328611898017, "race/accuracy/test/high": 0.2718696397941681, "race/accuracy/test/middle": 0.3370473537604457, "race/accuracy/group_average": 0.3044584967773069, "race/accuracy/seq_average": 0.2908390758005675, "siqa/accuracy/dev": 0.3638689866939611, "siqa/accuracy/group_average": 0.3638689866939611, "siqa/accuracy/seq_average": 0.3638689866939611, "commonsenseqa/accuracy/dev_rand_split": 0.24897624897624898, "commonsenseqa/accuracy/group_average": 0.24897624897624898, "commonsenseqa/accuracy/seq_average": 0.24897624897624898}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-200000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.365791199699281, "val/accuracy": 0.5103498186383929, "val/perplexity": 10.65246371026837, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.519452397127329, "lambada/accuracy/total": 0.32492236024844723, "lambada/accuracy/openai_last_token": 0.7851319875776398, "lambada/perplexity": 7.9994273234907745, "lambada/lm_loss": 2.978696756295795, "lambada/lm_perplexity": 19.662175333738347, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.41763608944342007, "mean_loss": 2.442621798413305, "blimp/accuracy/passive_2": 0.927, "blimp/accuracy/determiner_noun_agreement_2": 0.983, "blimp/accuracy/ellipsis_n_bar_1": 0.849, "blimp/accuracy/tough_vs_raising_2": 0.895, "blimp/accuracy/tough_vs_raising_1": 0.597, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.927, "blimp/accuracy/principle_A_reconstruction": 0.334, "blimp/accuracy/wh_vs_that_with_gap": 0.455, "blimp/accuracy/principle_A_domain_2": 0.885, "blimp/accuracy/determiner_noun_agreement_1": 0.986, "blimp/accuracy/ellipsis_n_bar_2": 0.927, "blimp/accuracy/principle_A_domain_3": 0.658, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.941, "blimp/accuracy/animate_subject_trans": 0.899, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.9, "blimp/accuracy/distractor_agreement_relative_clause": 0.677, "blimp/accuracy/transitive": 0.875, "blimp/accuracy/sentential_subject_island": 0.453, "blimp/accuracy/adjunct_island": 0.861, "blimp/accuracy/intransitive": 0.75, "blimp/accuracy/existential_there_subject_raising": 0.886, "blimp/accuracy/irregular_past_participle_adjectives": 0.971, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.577, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.343, "blimp/accuracy/only_npi_scope": 0.577, "blimp/accuracy/superlative_quantifiers_2": 0.764, "blimp/accuracy/passive_1": 0.9, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.894, "blimp/accuracy/inchoative": 0.637, "blimp/accuracy/anaphor_gender_agreement": 0.983, "blimp/accuracy/principle_A_c_command": 0.683, "blimp/accuracy/only_npi_licensor_present": 0.694, "blimp/accuracy/expletive_it_object_raising": 0.81, "blimp/accuracy/left_branch_island_simple_question": 0.704, "blimp/accuracy/wh_questions_subject_gap": 0.944, "blimp/accuracy/existential_there_quantifiers_2": 0.585, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.951, "blimp/accuracy/sentential_negation_npi_scope": 0.678, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.852, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.884, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.924, "blimp/accuracy/principle_A_case_2": 0.936, "blimp/accuracy/distractor_agreement_relational_noun": 0.828, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.986, "blimp/accuracy/superlative_quantifiers_1": 0.64, "blimp/accuracy/wh_island": 0.799, "blimp/accuracy/principle_A_domain_1": 0.99, "blimp/accuracy/complex_NP_island": 0.649, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.979, "blimp/accuracy/irregular_past_participle_verbs": 0.895, "blimp/accuracy/drop_argument": 0.742, "blimp/accuracy/wh_questions_object_gap": 0.879, "blimp/accuracy/animate_subject_passive": 0.797, "blimp/accuracy/existential_there_quantifiers_1": 0.992, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.905, "blimp/accuracy/npi_present_2": 0.534, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.921, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.962, "blimp/accuracy/existential_there_object_raising": 0.863, "blimp/accuracy/matrix_question_npi_licensor_present": 0.304, "blimp/accuracy/npi_present_1": 0.559, "blimp/accuracy/wh_vs_that_no_gap": 0.982, "blimp/accuracy/left_branch_island_echo_question": 0.445, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.97, "blimp/accuracy/causative": 0.753, "blimp/accuracy/group_average": 0.7958358208955223, "blimp/accuracy/seq_average": 0.7958358208955224, "cbt/accuracy/NE": 0.8020833333333334, "cbt/accuracy/V": 0.9312, "cbt/accuracy/CN": 0.876, "cbt/accuracy/P": 0.9124, "cbt/accuracy/group_average": 0.8804208333333333, "cbt/accuracy/seq_average": 0.8804521808723489, "hellaswag/accuracy/val": 0.35132443736307506, "hellaswag/accuracy/group_average": 0.35132443736307506, "hellaswag/accuracy/seq_average": 0.35132443736307506, "piqa/accuracy/val": 0.6191512513601741, "piqa/accuracy/group_average": 0.6191512513601741, "piqa/accuracy/seq_average": 0.6191512513601741, "ai2arc/accuracy/ARC-Easy": 0.3704016913319239, "ai2arc/accuracy/ARC-Challenge": 0.22832618025751072, "ai2arc/accuracy/group_average": 0.2993639357947173, "ai2arc/accuracy/seq_average": 0.3235127478753541, "race/accuracy/test/high": 0.2918810748999428, "race/accuracy/test/middle": 0.3635097493036212, "race/accuracy/group_average": 0.327695412101782, "race/accuracy/seq_average": 0.3127280097284151, "siqa/accuracy/dev": 0.38536335721596726, "siqa/accuracy/group_average": 0.38536335721596726, "siqa/accuracy/seq_average": 0.38536335721596726, "commonsenseqa/accuracy/dev_rand_split": 0.27682227682227684, "commonsenseqa/accuracy/group_average": 0.27682227682227684, "commonsenseqa/accuracy/seq_average": 0.27682227682227684}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-220000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3480672684926835, "val/accuracy": 0.5140332418774801, "val/perplexity": 10.465323505303918, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.463605797809103, "lambada/accuracy/total": 0.328804347826087, "lambada/accuracy/openai_last_token": 0.7894021739130435, "lambada/perplexity": 7.65652673991561, "lambada/lm_loss": 2.953479004238753, "lambada/lm_perplexity": 19.172539180613548, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4214187948517836, "mean_loss": 2.405836533150893, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.858, "blimp/accuracy/tough_vs_raising_2": 0.888, "blimp/accuracy/tough_vs_raising_1": 0.612, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.897, "blimp/accuracy/principle_A_reconstruction": 0.313, "blimp/accuracy/wh_vs_that_with_gap": 0.418, "blimp/accuracy/principle_A_domain_2": 0.891, "blimp/accuracy/determiner_noun_agreement_1": 0.984, "blimp/accuracy/ellipsis_n_bar_2": 0.915, "blimp/accuracy/principle_A_domain_3": 0.653, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.924, "blimp/accuracy/animate_subject_trans": 0.903, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.897, "blimp/accuracy/distractor_agreement_relative_clause": 0.649, "blimp/accuracy/transitive": 0.88, "blimp/accuracy/sentential_subject_island": 0.429, "blimp/accuracy/adjunct_island": 0.875, "blimp/accuracy/intransitive": 0.777, "blimp/accuracy/existential_there_subject_raising": 0.881, "blimp/accuracy/irregular_past_participle_adjectives": 0.979, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.55, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.307, "blimp/accuracy/only_npi_scope": 0.72, "blimp/accuracy/superlative_quantifiers_2": 0.758, "blimp/accuracy/passive_1": 0.896, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.919, "blimp/accuracy/inchoative": 0.63, "blimp/accuracy/anaphor_gender_agreement": 0.975, "blimp/accuracy/principle_A_c_command": 0.609, "blimp/accuracy/only_npi_licensor_present": 0.743, "blimp/accuracy/expletive_it_object_raising": 0.807, "blimp/accuracy/left_branch_island_simple_question": 0.607, "blimp/accuracy/wh_questions_subject_gap": 0.922, "blimp/accuracy/existential_there_quantifiers_2": 0.556, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.951, "blimp/accuracy/sentential_negation_npi_scope": 0.693, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.8, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.872, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.925, "blimp/accuracy/principle_A_case_2": 0.949, "blimp/accuracy/distractor_agreement_relational_noun": 0.832, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.978, "blimp/accuracy/superlative_quantifiers_1": 0.73, "blimp/accuracy/wh_island": 0.806, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.622, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.965, "blimp/accuracy/irregular_past_participle_verbs": 0.89, "blimp/accuracy/drop_argument": 0.733, "blimp/accuracy/wh_questions_object_gap": 0.855, "blimp/accuracy/animate_subject_passive": 0.791, "blimp/accuracy/existential_there_quantifiers_1": 0.992, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.885, "blimp/accuracy/npi_present_2": 0.577, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.935, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.964, "blimp/accuracy/existential_there_object_raising": 0.858, "blimp/accuracy/matrix_question_npi_licensor_present": 0.35, "blimp/accuracy/npi_present_1": 0.525, "blimp/accuracy/wh_vs_that_no_gap": 0.978, "blimp/accuracy/left_branch_island_echo_question": 0.43, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.97, "blimp/accuracy/causative": 0.747, "blimp/accuracy/group_average": 0.7926119402985071, "blimp/accuracy/seq_average": 0.7926119402985075, "cbt/accuracy/NE": 0.8108974358974359, "cbt/accuracy/V": 0.936, "cbt/accuracy/CN": 0.8784, "cbt/accuracy/P": 0.9108, "cbt/accuracy/group_average": 0.884024358974359, "cbt/accuracy/seq_average": 0.8840536214485795, "hellaswag/accuracy/val": 0.35301732722565227, "hellaswag/accuracy/group_average": 0.35301732722565227, "hellaswag/accuracy/seq_average": 0.35301732722565227, "piqa/accuracy/val": 0.6289445048966268, "piqa/accuracy/group_average": 0.6289445048966268, "piqa/accuracy/seq_average": 0.6289445048966268, "ai2arc/accuracy/ARC-Easy": 0.3771670190274841, "ai2arc/accuracy/ARC-Challenge": 0.21630901287553647, "ai2arc/accuracy/group_average": 0.2967380159515103, "ai2arc/accuracy/seq_average": 0.3240793201133145, "race/accuracy/test/high": 0.2875929102344197, "race/accuracy/test/middle": 0.36002785515320335, "race/accuracy/group_average": 0.3238103826938115, "race/accuracy/seq_average": 0.30867450344548036, "siqa/accuracy/dev": 0.3705220061412487, "siqa/accuracy/group_average": 0.3705220061412487, "siqa/accuracy/seq_average": 0.3705220061412487, "commonsenseqa/accuracy/dev_rand_split": 0.2841932841932842, "commonsenseqa/accuracy/group_average": 0.2841932841932842, "commonsenseqa/accuracy/seq_average": 0.2841932841932842}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-240000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3359447660900297, "val/accuracy": 0.5155319940476191, "val/perplexity": 10.339223464241224, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.379752781080163, "lambada/accuracy/total": 0.33676242236024845, "lambada/accuracy/openai_last_token": 0.7917313664596274, "lambada/perplexity": 7.822700963274279, "lambada/lm_loss": 2.932708747587808, "lambada/lm_perplexity": 18.77842768793121, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.42614720820393376, "mean_loss": 2.357848773585096, "blimp/accuracy/passive_2": 0.907, "blimp/accuracy/determiner_noun_agreement_2": 0.985, "blimp/accuracy/ellipsis_n_bar_1": 0.875, "blimp/accuracy/tough_vs_raising_2": 0.913, "blimp/accuracy/tough_vs_raising_1": 0.594, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.921, "blimp/accuracy/principle_A_reconstruction": 0.374, "blimp/accuracy/wh_vs_that_with_gap": 0.424, "blimp/accuracy/principle_A_domain_2": 0.917, "blimp/accuracy/determiner_noun_agreement_1": 0.987, "blimp/accuracy/ellipsis_n_bar_2": 0.909, "blimp/accuracy/principle_A_domain_3": 0.677, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.947, "blimp/accuracy/animate_subject_trans": 0.911, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.906, "blimp/accuracy/distractor_agreement_relative_clause": 0.674, "blimp/accuracy/transitive": 0.884, "blimp/accuracy/sentential_subject_island": 0.379, "blimp/accuracy/adjunct_island": 0.886, "blimp/accuracy/intransitive": 0.76, "blimp/accuracy/existential_there_subject_raising": 0.888, "blimp/accuracy/irregular_past_participle_adjectives": 0.947, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.466, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.336, "blimp/accuracy/only_npi_scope": 0.597, "blimp/accuracy/superlative_quantifiers_2": 0.76, "blimp/accuracy/passive_1": 0.891, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.904, "blimp/accuracy/inchoative": 0.607, "blimp/accuracy/anaphor_gender_agreement": 0.967, "blimp/accuracy/principle_A_c_command": 0.631, "blimp/accuracy/only_npi_licensor_present": 0.487, "blimp/accuracy/expletive_it_object_raising": 0.799, "blimp/accuracy/left_branch_island_simple_question": 0.56, "blimp/accuracy/wh_questions_subject_gap": 0.931, "blimp/accuracy/existential_there_quantifiers_2": 0.57, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.959, "blimp/accuracy/sentential_negation_npi_scope": 0.731, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.786, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.879, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.918, "blimp/accuracy/principle_A_case_2": 0.95, "blimp/accuracy/distractor_agreement_relational_noun": 0.827, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.988, "blimp/accuracy/superlative_quantifiers_1": 0.799, "blimp/accuracy/wh_island": 0.816, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.671, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.972, "blimp/accuracy/irregular_past_participle_verbs": 0.908, "blimp/accuracy/drop_argument": 0.753, "blimp/accuracy/wh_questions_object_gap": 0.852, "blimp/accuracy/animate_subject_passive": 0.792, "blimp/accuracy/existential_there_quantifiers_1": 0.992, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.919, "blimp/accuracy/npi_present_2": 0.584, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.93, "blimp/accuracy/anaphor_number_agreement": 0.987, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.961, "blimp/accuracy/existential_there_object_raising": 0.885, "blimp/accuracy/matrix_question_npi_licensor_present": 0.336, "blimp/accuracy/npi_present_1": 0.474, "blimp/accuracy/wh_vs_that_no_gap": 0.977, "blimp/accuracy/left_branch_island_echo_question": 0.372, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.959, "blimp/accuracy/causative": 0.775, "blimp/accuracy/group_average": 0.789731343283582, "blimp/accuracy/seq_average": 0.7897313432835821, "cbt/accuracy/NE": 0.8104967948717948, "cbt/accuracy/V": 0.9388, "cbt/accuracy/CN": 0.8764, "cbt/accuracy/P": 0.9136, "cbt/accuracy/group_average": 0.8848241987179486, "cbt/accuracy/seq_average": 0.8848539415766307, "hellaswag/accuracy/val": 0.35232025492929697, "hellaswag/accuracy/group_average": 0.35232025492929697, "hellaswag/accuracy/seq_average": 0.35232025492929697, "piqa/accuracy/val": 0.6284004352557128, "piqa/accuracy/group_average": 0.6284004352557128, "piqa/accuracy/seq_average": 0.6284004352557128, "ai2arc/accuracy/ARC-Easy": 0.38266384778012685, "ai2arc/accuracy/ARC-Challenge": 0.23948497854077253, "ai2arc/accuracy/group_average": 0.3110744131604497, "ai2arc/accuracy/seq_average": 0.33541076487252125, "race/accuracy/test/high": 0.29559748427672955, "race/accuracy/test/middle": 0.3607242339832869, "race/accuracy/group_average": 0.32816085913000825, "race/accuracy/seq_average": 0.3145520875557357, "siqa/accuracy/dev": 0.37922210849539406, "siqa/accuracy/group_average": 0.37922210849539406, "siqa/accuracy/seq_average": 0.37922210849539406, "commonsenseqa/accuracy/dev_rand_split": 0.2833742833742834, "commonsenseqa/accuracy/group_average": 0.2833742833742834, "commonsenseqa/accuracy/seq_average": 0.2833742833742834}}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-260000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3236176021515376, "val/accuracy": 0.5178242032490079, "val/perplexity": 10.212552512456822, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4122257588072595, "lambada/accuracy/total": 0.3402562111801242, "lambada/accuracy/openai_last_token": 0.7923136645962733, "lambada/perplexity": 7.512679335686159, "lambada/lm_loss": 2.9116786761177798, "lambada/lm_perplexity": 18.3876395651524, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.42904020721456604, "mean_loss": 2.3679216804793985, "blimp/accuracy/passive_2": 0.906, "blimp/accuracy/determiner_noun_agreement_2": 0.989, "blimp/accuracy/ellipsis_n_bar_1": 0.863, "blimp/accuracy/tough_vs_raising_2": 0.893, "blimp/accuracy/tough_vs_raising_1": 0.608, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.886, "blimp/accuracy/principle_A_reconstruction": 0.444, "blimp/accuracy/wh_vs_that_with_gap": 0.469, "blimp/accuracy/principle_A_domain_2": 0.914, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.916, "blimp/accuracy/principle_A_domain_3": 0.675, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.931, "blimp/accuracy/animate_subject_trans": 0.917, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.914, "blimp/accuracy/distractor_agreement_relative_clause": 0.652, "blimp/accuracy/transitive": 0.88, "blimp/accuracy/sentential_subject_island": 0.399, "blimp/accuracy/adjunct_island": 0.88, "blimp/accuracy/intransitive": 0.784, "blimp/accuracy/existential_there_subject_raising": 0.901, "blimp/accuracy/irregular_past_participle_adjectives": 0.937, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.542, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.36, "blimp/accuracy/only_npi_scope": 0.692, "blimp/accuracy/superlative_quantifiers_2": 0.784, "blimp/accuracy/passive_1": 0.888, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.925, "blimp/accuracy/inchoative": 0.64, "blimp/accuracy/anaphor_gender_agreement": 0.976, "blimp/accuracy/principle_A_c_command": 0.653, "blimp/accuracy/only_npi_licensor_present": 0.743, "blimp/accuracy/expletive_it_object_raising": 0.81, "blimp/accuracy/left_branch_island_simple_question": 0.647, "blimp/accuracy/wh_questions_subject_gap": 0.923, "blimp/accuracy/existential_there_quantifiers_2": 0.559, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.952, "blimp/accuracy/sentential_negation_npi_scope": 0.725, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.795, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.88, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.904, "blimp/accuracy/principle_A_case_2": 0.946, "blimp/accuracy/distractor_agreement_relational_noun": 0.815, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.983, "blimp/accuracy/superlative_quantifiers_1": 0.777, "blimp/accuracy/wh_island": 0.802, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.658, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.978, "blimp/accuracy/irregular_past_participle_verbs": 0.894, "blimp/accuracy/drop_argument": 0.746, "blimp/accuracy/wh_questions_object_gap": 0.832, "blimp/accuracy/animate_subject_passive": 0.811, "blimp/accuracy/existential_there_quantifiers_1": 0.993, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.907, "blimp/accuracy/npi_present_2": 0.62, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.955, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.956, "blimp/accuracy/existential_there_object_raising": 0.855, "blimp/accuracy/matrix_question_npi_licensor_present": 0.384, "blimp/accuracy/npi_present_1": 0.585, "blimp/accuracy/wh_vs_that_no_gap": 0.975, "blimp/accuracy/left_branch_island_echo_question": 0.409, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.968, "blimp/accuracy/causative": 0.761, "blimp/accuracy/group_average": 0.8020149253731345, "blimp/accuracy/seq_average": 0.8020149253731343, "cbt/accuracy/NE": 0.8096955128205128, "cbt/accuracy/V": 0.9352, "cbt/accuracy/CN": 0.8884, "cbt/accuracy/P": 0.9204, "cbt/accuracy/group_average": 0.8884238782051281, "cbt/accuracy/seq_average": 0.8884553821528611, "hellaswag/accuracy/val": 0.35988846843258315, "hellaswag/accuracy/group_average": 0.35988846843258315, "hellaswag/accuracy/seq_average": 0.35988846843258315, "piqa/accuracy/val": 0.6387377584330794, "piqa/accuracy/group_average": 0.6387377584330794, "piqa/accuracy/seq_average": 0.6387377584330794, "ai2arc/accuracy/ARC-Easy": 0.37378435517970404, "ai2arc/accuracy/ARC-Challenge": 0.23862660944206007, "ai2arc/accuracy/group_average": 0.30620548231088207, "ai2arc/accuracy/seq_average": 0.3291784702549575, "race/accuracy/test/high": 0.28673527730131504, "race/accuracy/test/middle": 0.3530640668523677, "race/accuracy/group_average": 0.31989967207684133, "race/accuracy/seq_average": 0.3060397243615728, "siqa/accuracy/dev": 0.38024564994882293, "siqa/accuracy/group_average": 0.38024564994882293, "siqa/accuracy/seq_average": 0.38024564994882293, "commonsenseqa/accuracy/dev_rand_split": 0.27682227682227684, "commonsenseqa/accuracy/group_average": 0.27682227682227684, "commonsenseqa/accuracy/seq_average": 0.27682227682227684}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-280000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3105870806981645, "val/accuracy": 0.5198732406374008, "val/perplexity": 10.080340891757931, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4289634183326863, "lambada/accuracy/total": 0.3373447204968944, "lambada/accuracy/openai_last_token": 0.7946428571428571, "lambada/perplexity": 7.387379791202243, "lambada/lm_loss": 2.9000765825132055, "lambada/lm_perplexity": 18.17553724446678, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4286089805671476, "mean_loss": 2.3697752495154254, "blimp/accuracy/passive_2": 0.916, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.843, "blimp/accuracy/tough_vs_raising_2": 0.894, "blimp/accuracy/tough_vs_raising_1": 0.616, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.899, "blimp/accuracy/principle_A_reconstruction": 0.409, "blimp/accuracy/wh_vs_that_with_gap": 0.436, "blimp/accuracy/principle_A_domain_2": 0.904, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.931, "blimp/accuracy/principle_A_domain_3": 0.681, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.946, "blimp/accuracy/animate_subject_trans": 0.906, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.914, "blimp/accuracy/distractor_agreement_relative_clause": 0.649, "blimp/accuracy/transitive": 0.889, "blimp/accuracy/sentential_subject_island": 0.37, "blimp/accuracy/adjunct_island": 0.883, "blimp/accuracy/intransitive": 0.785, "blimp/accuracy/existential_there_subject_raising": 0.874, "blimp/accuracy/irregular_past_participle_adjectives": 0.861, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.573, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.348, "blimp/accuracy/only_npi_scope": 0.619, "blimp/accuracy/superlative_quantifiers_2": 0.888, "blimp/accuracy/passive_1": 0.892, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.913, "blimp/accuracy/inchoative": 0.636, "blimp/accuracy/anaphor_gender_agreement": 0.983, "blimp/accuracy/principle_A_c_command": 0.637, "blimp/accuracy/only_npi_licensor_present": 0.746, "blimp/accuracy/expletive_it_object_raising": 0.797, "blimp/accuracy/left_branch_island_simple_question": 0.672, "blimp/accuracy/wh_questions_subject_gap": 0.938, "blimp/accuracy/existential_there_quantifiers_2": 0.509, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.96, "blimp/accuracy/sentential_negation_npi_scope": 0.659, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.835, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.892, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.909, "blimp/accuracy/principle_A_case_2": 0.931, "blimp/accuracy/distractor_agreement_relational_noun": 0.82, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.988, "blimp/accuracy/superlative_quantifiers_1": 0.801, "blimp/accuracy/wh_island": 0.802, "blimp/accuracy/principle_A_domain_1": 0.992, "blimp/accuracy/complex_NP_island": 0.603, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.976, "blimp/accuracy/irregular_past_participle_verbs": 0.898, "blimp/accuracy/drop_argument": 0.749, "blimp/accuracy/wh_questions_object_gap": 0.864, "blimp/accuracy/animate_subject_passive": 0.803, "blimp/accuracy/existential_there_quantifiers_1": 0.993, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.897, "blimp/accuracy/npi_present_2": 0.588, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.94, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.961, "blimp/accuracy/existential_there_object_raising": 0.865, "blimp/accuracy/matrix_question_npi_licensor_present": 0.378, "blimp/accuracy/npi_present_1": 0.541, "blimp/accuracy/wh_vs_that_no_gap": 0.976, "blimp/accuracy/left_branch_island_echo_question": 0.437, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.971, "blimp/accuracy/causative": 0.763, "blimp/accuracy/group_average": 0.7987761194029848, "blimp/accuracy/seq_average": 0.7987761194029851, "cbt/accuracy/NE": 0.8112980769230769, "cbt/accuracy/V": 0.9356, "cbt/accuracy/CN": 0.882, "cbt/accuracy/P": 0.9188, "cbt/accuracy/group_average": 0.8869245192307692, "cbt/accuracy/seq_average": 0.8869547819127651, "hellaswag/accuracy/val": 0.35929097789285, "hellaswag/accuracy/group_average": 0.35929097789285, "hellaswag/accuracy/seq_average": 0.35929097789285, "piqa/accuracy/val": 0.6289445048966268, "piqa/accuracy/group_average": 0.6289445048966268, "piqa/accuracy/seq_average": 0.6289445048966268, "ai2arc/accuracy/ARC-Easy": 0.3839323467230444, "ai2arc/accuracy/ARC-Challenge": 0.24206008583690988, "ai2arc/accuracy/group_average": 0.31299621627997715, "ai2arc/accuracy/seq_average": 0.3371104815864023, "race/accuracy/test/high": 0.2918810748999428, "race/accuracy/test/middle": 0.3649025069637883, "race/accuracy/group_average": 0.32839179093186555, "race/accuracy/seq_average": 0.3131333603567086, "siqa/accuracy/dev": 0.37717502558853633, "siqa/accuracy/group_average": 0.37717502558853633, "siqa/accuracy/seq_average": 0.37717502558853633, "commonsenseqa/accuracy/dev_rand_split": 0.2727272727272727, "commonsenseqa/accuracy/group_average": 0.2727272727272727, "commonsenseqa/accuracy/seq_average": 0.2727272727272727}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-300000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.301231989784846, "val/accuracy": 0.5205911303323413, "val/perplexity": 9.986478118221905, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4577822477921196, "lambada/accuracy/total": 0.3489906832298137, "lambada/accuracy/openai_last_token": 0.7985248447204969, "lambada/perplexity": 7.18487432045619, "lambada/lm_loss": 2.898306594058203, "lambada/lm_perplexity": 18.143395207289338, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.43479090678107746, "mean_loss": 2.379507118788483, "blimp/accuracy/passive_2": 0.91, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.857, "blimp/accuracy/tough_vs_raising_2": 0.885, "blimp/accuracy/tough_vs_raising_1": 0.597, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.907, "blimp/accuracy/principle_A_reconstruction": 0.424, "blimp/accuracy/wh_vs_that_with_gap": 0.422, "blimp/accuracy/principle_A_domain_2": 0.902, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.92, "blimp/accuracy/principle_A_domain_3": 0.673, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.939, "blimp/accuracy/animate_subject_trans": 0.912, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.917, "blimp/accuracy/distractor_agreement_relative_clause": 0.642, "blimp/accuracy/transitive": 0.883, "blimp/accuracy/sentential_subject_island": 0.446, "blimp/accuracy/adjunct_island": 0.875, "blimp/accuracy/intransitive": 0.77, "blimp/accuracy/existential_there_subject_raising": 0.892, "blimp/accuracy/irregular_past_participle_adjectives": 0.975, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.66, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.332, "blimp/accuracy/only_npi_scope": 0.714, "blimp/accuracy/superlative_quantifiers_2": 0.848, "blimp/accuracy/passive_1": 0.894, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.931, "blimp/accuracy/inchoative": 0.625, "blimp/accuracy/anaphor_gender_agreement": 0.975, "blimp/accuracy/principle_A_c_command": 0.63, "blimp/accuracy/only_npi_licensor_present": 0.743, "blimp/accuracy/expletive_it_object_raising": 0.809, "blimp/accuracy/left_branch_island_simple_question": 0.786, "blimp/accuracy/wh_questions_subject_gap": 0.928, "blimp/accuracy/existential_there_quantifiers_2": 0.508, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.953, "blimp/accuracy/sentential_negation_npi_scope": 0.65, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.802, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.89, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.912, "blimp/accuracy/principle_A_case_2": 0.948, "blimp/accuracy/distractor_agreement_relational_noun": 0.821, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.984, "blimp/accuracy/superlative_quantifiers_1": 0.81, "blimp/accuracy/wh_island": 0.809, "blimp/accuracy/principle_A_domain_1": 0.99, "blimp/accuracy/complex_NP_island": 0.623, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.972, "blimp/accuracy/irregular_past_participle_verbs": 0.892, "blimp/accuracy/drop_argument": 0.729, "blimp/accuracy/wh_questions_object_gap": 0.863, "blimp/accuracy/animate_subject_passive": 0.799, "blimp/accuracy/existential_there_quantifiers_1": 0.995, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.902, "blimp/accuracy/npi_present_2": 0.532, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.934, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.964, "blimp/accuracy/existential_there_object_raising": 0.843, "blimp/accuracy/matrix_question_npi_licensor_present": 0.359, "blimp/accuracy/npi_present_1": 0.484, "blimp/accuracy/wh_vs_that_no_gap": 0.973, "blimp/accuracy/left_branch_island_echo_question": 0.441, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.961, "blimp/accuracy/causative": 0.76, "blimp/accuracy/group_average": 0.801417910447761, "blimp/accuracy/seq_average": 0.8014179104477612, "cbt/accuracy/NE": 0.8068910256410257, "cbt/accuracy/V": 0.94, "cbt/accuracy/CN": 0.8872, "cbt/accuracy/P": 0.9136, "cbt/accuracy/group_average": 0.8869227564102564, "cbt/accuracy/seq_average": 0.8869547819127651, "hellaswag/accuracy/val": 0.36188010356502687, "hellaswag/accuracy/group_average": 0.36188010356502687, "hellaswag/accuracy/seq_average": 0.36188010356502687, "piqa/accuracy/val": 0.6371055495103374, "piqa/accuracy/group_average": 0.6371055495103374, "piqa/accuracy/seq_average": 0.6371055495103374, "ai2arc/accuracy/ARC-Easy": 0.38054968287526425, "ai2arc/accuracy/ARC-Challenge": 0.22489270386266094, "ai2arc/accuracy/group_average": 0.3027211933689626, "ai2arc/accuracy/seq_average": 0.3291784702549575, "race/accuracy/test/high": 0.2938822184105203, "race/accuracy/test/middle": 0.3732590529247911, "race/accuracy/group_average": 0.3335706356676557, "race/accuracy/seq_average": 0.31698419132549654, "siqa/accuracy/dev": 0.37768679631525076, "siqa/accuracy/group_average": 0.37768679631525076, "siqa/accuracy/seq_average": 0.37768679631525076, "commonsenseqa/accuracy/dev_rand_split": 0.27682227682227684, "commonsenseqa/accuracy/group_average": 0.27682227682227684, "commonsenseqa/accuracy/seq_average": 0.27682227682227684}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-320000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.292751978314112, "val/accuracy": 0.5225035652281746, "val/perplexity": 9.902150723201173, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4101539753979035, "lambada/accuracy/total": 0.3515139751552795, "lambada/accuracy/openai_last_token": 0.7985248447204969, "lambada/perplexity": 6.937050640237116, "lambada/lm_loss": 2.8965135575144103, "lambada/lm_perplexity": 18.110892584557302, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.43700877019172707, "mean_loss": 2.3514529768560077, "blimp/accuracy/passive_2": 0.915, "blimp/accuracy/determiner_noun_agreement_2": 0.985, "blimp/accuracy/ellipsis_n_bar_1": 0.849, "blimp/accuracy/tough_vs_raising_2": 0.904, "blimp/accuracy/tough_vs_raising_1": 0.595, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.93, "blimp/accuracy/principle_A_reconstruction": 0.468, "blimp/accuracy/wh_vs_that_with_gap": 0.441, "blimp/accuracy/principle_A_domain_2": 0.894, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.925, "blimp/accuracy/principle_A_domain_3": 0.688, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.939, "blimp/accuracy/animate_subject_trans": 0.907, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.924, "blimp/accuracy/distractor_agreement_relative_clause": 0.623, "blimp/accuracy/transitive": 0.89, "blimp/accuracy/sentential_subject_island": 0.423, "blimp/accuracy/adjunct_island": 0.893, "blimp/accuracy/intransitive": 0.77, "blimp/accuracy/existential_there_subject_raising": 0.888, "blimp/accuracy/irregular_past_participle_adjectives": 0.992, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.616, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.345, "blimp/accuracy/only_npi_scope": 0.676, "blimp/accuracy/superlative_quantifiers_2": 0.789, "blimp/accuracy/passive_1": 0.906, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.918, "blimp/accuracy/inchoative": 0.624, "blimp/accuracy/anaphor_gender_agreement": 0.972, "blimp/accuracy/principle_A_c_command": 0.614, "blimp/accuracy/only_npi_licensor_present": 0.729, "blimp/accuracy/expletive_it_object_raising": 0.801, "blimp/accuracy/left_branch_island_simple_question": 0.75, "blimp/accuracy/wh_questions_subject_gap": 0.936, "blimp/accuracy/existential_there_quantifiers_2": 0.619, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.964, "blimp/accuracy/sentential_negation_npi_scope": 0.66, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.823, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.884, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.909, "blimp/accuracy/principle_A_case_2": 0.951, "blimp/accuracy/distractor_agreement_relational_noun": 0.822, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.99, "blimp/accuracy/superlative_quantifiers_1": 0.8, "blimp/accuracy/wh_island": 0.826, "blimp/accuracy/principle_A_domain_1": 0.983, "blimp/accuracy/complex_NP_island": 0.637, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.974, "blimp/accuracy/irregular_past_participle_verbs": 0.909, "blimp/accuracy/drop_argument": 0.744, "blimp/accuracy/wh_questions_object_gap": 0.873, "blimp/accuracy/animate_subject_passive": 0.795, "blimp/accuracy/existential_there_quantifiers_1": 0.99, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.914, "blimp/accuracy/npi_present_2": 0.577, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.939, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.962, "blimp/accuracy/existential_there_object_raising": 0.893, "blimp/accuracy/matrix_question_npi_licensor_present": 0.339, "blimp/accuracy/npi_present_1": 0.526, "blimp/accuracy/wh_vs_that_no_gap": 0.975, "blimp/accuracy/left_branch_island_echo_question": 0.422, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.958, "blimp/accuracy/causative": 0.759, "blimp/accuracy/group_average": 0.8047014925373133, "blimp/accuracy/seq_average": 0.8047014925373134, "cbt/accuracy/NE": 0.8088942307692307, "cbt/accuracy/V": 0.9444, "cbt/accuracy/CN": 0.8844, "cbt/accuracy/P": 0.918, "cbt/accuracy/group_average": 0.8889235576923077, "cbt/accuracy/seq_average": 0.8889555822328932, "hellaswag/accuracy/val": 0.36566421031667, "hellaswag/accuracy/group_average": 0.36566421031667, "hellaswag/accuracy/seq_average": 0.36566421031667, "piqa/accuracy/val": 0.6447225244831338, "piqa/accuracy/group_average": 0.6447225244831338, "piqa/accuracy/seq_average": 0.6447225244831338, "ai2arc/accuracy/ARC-Easy": 0.387737843551797, "ai2arc/accuracy/ARC-Challenge": 0.23605150214592274, "ai2arc/accuracy/group_average": 0.31189467284885986, "ai2arc/accuracy/seq_average": 0.3376770538243626, "race/accuracy/test/high": 0.29845626072041165, "race/accuracy/test/middle": 0.3753481894150418, "race/accuracy/group_average": 0.3369022250677267, "race/accuracy/seq_average": 0.32083502229428457, "siqa/accuracy/dev": 0.38178096212896623, "siqa/accuracy/group_average": 0.38178096212896623, "siqa/accuracy/seq_average": 0.38178096212896623, "commonsenseqa/accuracy/dev_rand_split": 0.2751842751842752, "commonsenseqa/accuracy/group_average": 0.2751842751842752, "commonsenseqa/accuracy/seq_average": 0.2751842751842752}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-340000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.283391316731771, "val/accuracy": 0.5235392252604166, "val/perplexity": 9.809892513934493, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3941532869516693, "lambada/accuracy/total": 0.37111801242236025, "lambada/accuracy/openai_last_token": 0.8002717391304348, "lambada/perplexity": 6.812602741863024, "lambada/lm_loss": 2.8877848796156416, "lambada/lm_perplexity": 17.9534963665036, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4473286188413884, "mean_loss": 2.33877230184172, "blimp/accuracy/passive_2": 0.921, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.863, "blimp/accuracy/tough_vs_raising_2": 0.898, "blimp/accuracy/tough_vs_raising_1": 0.589, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.924, "blimp/accuracy/principle_A_reconstruction": 0.414, "blimp/accuracy/wh_vs_that_with_gap": 0.44, "blimp/accuracy/principle_A_domain_2": 0.904, "blimp/accuracy/determiner_noun_agreement_1": 0.986, "blimp/accuracy/ellipsis_n_bar_2": 0.926, "blimp/accuracy/principle_A_domain_3": 0.666, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.934, "blimp/accuracy/animate_subject_trans": 0.909, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.936, "blimp/accuracy/distractor_agreement_relative_clause": 0.624, "blimp/accuracy/transitive": 0.889, "blimp/accuracy/sentential_subject_island": 0.391, "blimp/accuracy/adjunct_island": 0.885, "blimp/accuracy/intransitive": 0.769, "blimp/accuracy/existential_there_subject_raising": 0.893, "blimp/accuracy/irregular_past_participle_adjectives": 0.922, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.666, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.347, "blimp/accuracy/only_npi_scope": 0.634, "blimp/accuracy/superlative_quantifiers_2": 0.867, "blimp/accuracy/passive_1": 0.891, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.926, "blimp/accuracy/inchoative": 0.627, "blimp/accuracy/anaphor_gender_agreement": 0.977, "blimp/accuracy/principle_A_c_command": 0.617, "blimp/accuracy/only_npi_licensor_present": 0.652, "blimp/accuracy/expletive_it_object_raising": 0.822, "blimp/accuracy/left_branch_island_simple_question": 0.786, "blimp/accuracy/wh_questions_subject_gap": 0.926, "blimp/accuracy/existential_there_quantifiers_2": 0.599, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.957, "blimp/accuracy/sentential_negation_npi_scope": 0.669, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.818, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.887, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.911, "blimp/accuracy/principle_A_case_2": 0.936, "blimp/accuracy/distractor_agreement_relational_noun": 0.834, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.985, "blimp/accuracy/superlative_quantifiers_1": 0.72, "blimp/accuracy/wh_island": 0.813, "blimp/accuracy/principle_A_domain_1": 0.986, "blimp/accuracy/complex_NP_island": 0.617, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.975, "blimp/accuracy/irregular_past_participle_verbs": 0.906, "blimp/accuracy/drop_argument": 0.729, "blimp/accuracy/wh_questions_object_gap": 0.858, "blimp/accuracy/animate_subject_passive": 0.805, "blimp/accuracy/existential_there_quantifiers_1": 0.987, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.923, "blimp/accuracy/npi_present_2": 0.58, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.947, "blimp/accuracy/anaphor_number_agreement": 0.995, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.961, "blimp/accuracy/existential_there_object_raising": 0.862, "blimp/accuracy/matrix_question_npi_licensor_present": 0.38, "blimp/accuracy/npi_present_1": 0.495, "blimp/accuracy/wh_vs_that_no_gap": 0.978, "blimp/accuracy/left_branch_island_echo_question": 0.45, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.973, "blimp/accuracy/causative": 0.775, "blimp/accuracy/group_average": 0.8017910447761194, "blimp/accuracy/seq_average": 0.8017910447761194, "cbt/accuracy/NE": 0.8157051282051282, "cbt/accuracy/V": 0.942, "cbt/accuracy/CN": 0.8884, "cbt/accuracy/P": 0.9208, "cbt/accuracy/group_average": 0.891726282051282, "cbt/accuracy/seq_average": 0.8917567026810724, "hellaswag/accuracy/val": 0.36805417247560246, "hellaswag/accuracy/group_average": 0.36805417247560246, "hellaswag/accuracy/seq_average": 0.36805417247560246, "piqa/accuracy/val": 0.6376496191512514, "piqa/accuracy/group_average": 0.6376496191512514, "piqa/accuracy/seq_average": 0.6376496191512514, "ai2arc/accuracy/ARC-Easy": 0.386892177589852, "ai2arc/accuracy/ARC-Challenge": 0.2369098712446352, "ai2arc/accuracy/group_average": 0.3119010244172436, "ai2arc/accuracy/seq_average": 0.33739376770538243, "race/accuracy/test/high": 0.29445397369925674, "race/accuracy/test/middle": 0.37047353760445684, "race/accuracy/group_average": 0.3324637556518568, "race/accuracy/seq_average": 0.3165788406972031, "siqa/accuracy/dev": 0.37717502558853633, "siqa/accuracy/group_average": 0.37717502558853633, "siqa/accuracy/seq_average": 0.37717502558853633, "commonsenseqa/accuracy/dev_rand_split": 0.2719082719082719, "commonsenseqa/accuracy/group_average": 0.2719082719082719, "commonsenseqa/accuracy/seq_average": 0.2719082719082719}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-360000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2767794170076887, "val/accuracy": 0.5248664977058531, "val/perplexity": 9.745244447115212, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.2681564425829777, "lambada/accuracy/total": 0.359860248447205, "lambada/accuracy/openai_last_token": 0.8002717391304348, "lambada/perplexity": 6.724242889087986, "lambada/lm_loss": 2.8697817031699504, "lambada/lm_perplexity": 17.63316851487507, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.44236337307652907, "mean_loss": 2.272467929795333, "blimp/accuracy/passive_2": 0.918, "blimp/accuracy/determiner_noun_agreement_2": 0.99, "blimp/accuracy/ellipsis_n_bar_1": 0.857, "blimp/accuracy/tough_vs_raising_2": 0.884, "blimp/accuracy/tough_vs_raising_1": 0.615, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.924, "blimp/accuracy/principle_A_reconstruction": 0.388, "blimp/accuracy/wh_vs_that_with_gap": 0.456, "blimp/accuracy/principle_A_domain_2": 0.91, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.917, "blimp/accuracy/principle_A_domain_3": 0.678, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.932, "blimp/accuracy/animate_subject_trans": 0.901, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.92, "blimp/accuracy/distractor_agreement_relative_clause": 0.62, "blimp/accuracy/transitive": 0.89, "blimp/accuracy/sentential_subject_island": 0.413, "blimp/accuracy/adjunct_island": 0.904, "blimp/accuracy/intransitive": 0.787, "blimp/accuracy/existential_there_subject_raising": 0.893, "blimp/accuracy/irregular_past_participle_adjectives": 0.964, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.647, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.378, "blimp/accuracy/only_npi_scope": 0.616, "blimp/accuracy/superlative_quantifiers_2": 0.851, "blimp/accuracy/passive_1": 0.89, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.916, "blimp/accuracy/inchoative": 0.631, "blimp/accuracy/anaphor_gender_agreement": 0.975, "blimp/accuracy/principle_A_c_command": 0.656, "blimp/accuracy/only_npi_licensor_present": 0.7, "blimp/accuracy/expletive_it_object_raising": 0.798, "blimp/accuracy/left_branch_island_simple_question": 0.784, "blimp/accuracy/wh_questions_subject_gap": 0.943, "blimp/accuracy/existential_there_quantifiers_2": 0.591, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.967, "blimp/accuracy/sentential_negation_npi_scope": 0.66, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.831, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.859, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.911, "blimp/accuracy/principle_A_case_2": 0.945, "blimp/accuracy/distractor_agreement_relational_noun": 0.792, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.982, "blimp/accuracy/superlative_quantifiers_1": 0.793, "blimp/accuracy/wh_island": 0.796, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.648, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.972, "blimp/accuracy/irregular_past_participle_verbs": 0.878, "blimp/accuracy/drop_argument": 0.732, "blimp/accuracy/wh_questions_object_gap": 0.872, "blimp/accuracy/animate_subject_passive": 0.791, "blimp/accuracy/existential_there_quantifiers_1": 0.989, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.914, "blimp/accuracy/npi_present_2": 0.568, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.944, "blimp/accuracy/anaphor_number_agreement": 0.994, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.964, "blimp/accuracy/existential_there_object_raising": 0.86, "blimp/accuracy/matrix_question_npi_licensor_present": 0.359, "blimp/accuracy/npi_present_1": 0.503, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.512, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.959, "blimp/accuracy/causative": 0.771, "blimp/accuracy/group_average": 0.8038805970149253, "blimp/accuracy/seq_average": 0.8038805970149254, "cbt/accuracy/NE": 0.8149038461538461, "cbt/accuracy/V": 0.9412, "cbt/accuracy/CN": 0.8864, "cbt/accuracy/P": 0.9216, "cbt/accuracy/group_average": 0.8910259615384615, "cbt/accuracy/seq_average": 0.8910564225690276, "hellaswag/accuracy/val": 0.37004580760804623, "hellaswag/accuracy/group_average": 0.37004580760804623, "hellaswag/accuracy/seq_average": 0.37004580760804623, "piqa/accuracy/val": 0.6490750816104461, "piqa/accuracy/group_average": 0.6490750816104461, "piqa/accuracy/seq_average": 0.6490750816104461, "ai2arc/accuracy/ARC-Easy": 0.3885835095137421, "ai2arc/accuracy/ARC-Challenge": 0.2334763948497854, "ai2arc/accuracy/group_average": 0.31102995218176377, "ai2arc/accuracy/seq_average": 0.33739376770538243, "race/accuracy/test/high": 0.29416809605488853, "race/accuracy/test/middle": 0.36629526462395545, "race/accuracy/group_average": 0.33023168033942196, "race/accuracy/seq_average": 0.3151601134981759, "siqa/accuracy/dev": 0.37922210849539406, "siqa/accuracy/group_average": 0.37922210849539406, "siqa/accuracy/seq_average": 0.37922210849539406, "commonsenseqa/accuracy/dev_rand_split": 0.27927927927927926, "commonsenseqa/accuracy/group_average": 0.27927927927927926, "commonsenseqa/accuracy/seq_average": 0.27927927927927926}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-380000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2729613289000494, "val/accuracy": 0.5252133324032738, "val/perplexity": 9.708107186966249, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.260025498289499, "lambada/accuracy/total": 0.3771350931677019, "lambada/accuracy/openai_last_token": 0.8043478260869565, "lambada/perplexity": 5.973427437785395, "lambada/lm_loss": 2.867901450562897, "lambada/lm_perplexity": 17.600044853987878, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.45117421278548786, "mean_loss": 2.266493413594774, "blimp/accuracy/passive_2": 0.928, "blimp/accuracy/determiner_noun_agreement_2": 0.987, "blimp/accuracy/ellipsis_n_bar_1": 0.848, "blimp/accuracy/tough_vs_raising_2": 0.894, "blimp/accuracy/tough_vs_raising_1": 0.599, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.931, "blimp/accuracy/principle_A_reconstruction": 0.394, "blimp/accuracy/wh_vs_that_with_gap": 0.418, "blimp/accuracy/principle_A_domain_2": 0.909, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.928, "blimp/accuracy/principle_A_domain_3": 0.676, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.941, "blimp/accuracy/animate_subject_trans": 0.91, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.92, "blimp/accuracy/distractor_agreement_relative_clause": 0.614, "blimp/accuracy/transitive": 0.892, "blimp/accuracy/sentential_subject_island": 0.425, "blimp/accuracy/adjunct_island": 0.894, "blimp/accuracy/intransitive": 0.775, "blimp/accuracy/existential_there_subject_raising": 0.887, "blimp/accuracy/irregular_past_participle_adjectives": 0.983, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.698, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.362, "blimp/accuracy/only_npi_scope": 0.681, "blimp/accuracy/superlative_quantifiers_2": 0.817, "blimp/accuracy/passive_1": 0.898, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.921, "blimp/accuracy/inchoative": 0.625, "blimp/accuracy/anaphor_gender_agreement": 0.979, "blimp/accuracy/principle_A_c_command": 0.64, "blimp/accuracy/only_npi_licensor_present": 0.793, "blimp/accuracy/expletive_it_object_raising": 0.818, "blimp/accuracy/left_branch_island_simple_question": 0.819, "blimp/accuracy/wh_questions_subject_gap": 0.95, "blimp/accuracy/existential_there_quantifiers_2": 0.661, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.957, "blimp/accuracy/sentential_negation_npi_scope": 0.684, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.807, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.894, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.907, "blimp/accuracy/principle_A_case_2": 0.94, "blimp/accuracy/distractor_agreement_relational_noun": 0.801, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.983, "blimp/accuracy/superlative_quantifiers_1": 0.768, "blimp/accuracy/wh_island": 0.823, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.616, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.983, "blimp/accuracy/irregular_past_participle_verbs": 0.907, "blimp/accuracy/drop_argument": 0.725, "blimp/accuracy/wh_questions_object_gap": 0.883, "blimp/accuracy/animate_subject_passive": 0.805, "blimp/accuracy/existential_there_quantifiers_1": 0.992, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.916, "blimp/accuracy/npi_present_2": 0.572, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.938, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.959, "blimp/accuracy/existential_there_object_raising": 0.877, "blimp/accuracy/matrix_question_npi_licensor_present": 0.357, "blimp/accuracy/npi_present_1": 0.508, "blimp/accuracy/wh_vs_that_no_gap": 0.979, "blimp/accuracy/left_branch_island_echo_question": 0.469, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.958, "blimp/accuracy/causative": 0.766, "blimp/accuracy/group_average": 0.8082835820895522, "blimp/accuracy/seq_average": 0.8082835820895522, "cbt/accuracy/NE": 0.8193108974358975, "cbt/accuracy/V": 0.94, "cbt/accuracy/CN": 0.89, "cbt/accuracy/P": 0.922, "cbt/accuracy/group_average": 0.8928277243589744, "cbt/accuracy/seq_average": 0.8928571428571429, "hellaswag/accuracy/val": 0.37054371639115713, "hellaswag/accuracy/group_average": 0.37054371639115713, "hellaswag/accuracy/seq_average": 0.37054371639115713, "piqa/accuracy/val": 0.6463547334058759, "piqa/accuracy/group_average": 0.6463547334058759, "piqa/accuracy/seq_average": 0.6463547334058759, "ai2arc/accuracy/ARC-Easy": 0.3919661733615222, "ai2arc/accuracy/ARC-Challenge": 0.2429184549356223, "ai2arc/accuracy/group_average": 0.31744231414857227, "ai2arc/accuracy/seq_average": 0.34277620396600567, "race/accuracy/test/high": 0.29445397369925674, "race/accuracy/test/middle": 0.3774373259052925, "race/accuracy/group_average": 0.3359456498022746, "race/accuracy/seq_average": 0.31860559383867043, "siqa/accuracy/dev": 0.37871033776867963, "siqa/accuracy/group_average": 0.37871033776867963, "siqa/accuracy/seq_average": 0.37871033776867963, "commonsenseqa/accuracy/dev_rand_split": 0.28746928746928746, "commonsenseqa/accuracy/group_average": 0.28746928746928746, "commonsenseqa/accuracy/seq_average": 0.28746928746928746}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-40000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.6197362021794395, "val/accuracy": 0.4738071986607143, "val/perplexity": 13.732100609019962, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5792435355808423, "lambada/accuracy/total": 0.2705745341614907, "lambada/accuracy/openai_last_token": 0.765333850931677, "lambada/perplexity": 10.978750696122303, "lambada/lm_loss": 3.1780865581450897, "lambada/lm_perplexity": 24.000785479984902, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3721908664111025, "mean_loss": 2.599489868880141, "blimp/accuracy/passive_2": 0.89, "blimp/accuracy/determiner_noun_agreement_2": 0.981, "blimp/accuracy/ellipsis_n_bar_1": 0.842, "blimp/accuracy/tough_vs_raising_2": 0.864, "blimp/accuracy/tough_vs_raising_1": 0.584, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.901, "blimp/accuracy/principle_A_reconstruction": 0.307, "blimp/accuracy/wh_vs_that_with_gap": 0.408, "blimp/accuracy/principle_A_domain_2": 0.861, "blimp/accuracy/determiner_noun_agreement_1": 0.984, "blimp/accuracy/ellipsis_n_bar_2": 0.928, "blimp/accuracy/principle_A_domain_3": 0.628, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.926, "blimp/accuracy/animate_subject_trans": 0.9, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.911, "blimp/accuracy/distractor_agreement_relative_clause": 0.689, "blimp/accuracy/transitive": 0.864, "blimp/accuracy/sentential_subject_island": 0.346, "blimp/accuracy/adjunct_island": 0.847, "blimp/accuracy/intransitive": 0.75, "blimp/accuracy/existential_there_subject_raising": 0.847, "blimp/accuracy/irregular_past_participle_adjectives": 0.965, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.394, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.199, "blimp/accuracy/only_npi_scope": 0.749, "blimp/accuracy/superlative_quantifiers_2": 0.931, "blimp/accuracy/passive_1": 0.873, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.904, "blimp/accuracy/inchoative": 0.581, "blimp/accuracy/anaphor_gender_agreement": 0.965, "blimp/accuracy/principle_A_c_command": 0.586, "blimp/accuracy/only_npi_licensor_present": 0.788, "blimp/accuracy/expletive_it_object_raising": 0.777, "blimp/accuracy/left_branch_island_simple_question": 0.426, "blimp/accuracy/wh_questions_subject_gap": 0.936, "blimp/accuracy/existential_there_quantifiers_2": 0.32, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.938, "blimp/accuracy/sentential_negation_npi_scope": 0.703, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.712, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.895, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.881, "blimp/accuracy/principle_A_case_2": 0.934, "blimp/accuracy/distractor_agreement_relational_noun": 0.838, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.643, "blimp/accuracy/wh_island": 0.777, "blimp/accuracy/principle_A_domain_1": 0.953, "blimp/accuracy/complex_NP_island": 0.566, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.953, "blimp/accuracy/irregular_past_participle_verbs": 0.868, "blimp/accuracy/drop_argument": 0.757, "blimp/accuracy/wh_questions_object_gap": 0.848, "blimp/accuracy/animate_subject_passive": 0.794, "blimp/accuracy/existential_there_quantifiers_1": 0.978, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.862, "blimp/accuracy/npi_present_2": 0.581, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.918, "blimp/accuracy/anaphor_number_agreement": 0.987, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.959, "blimp/accuracy/existential_there_object_raising": 0.813, "blimp/accuracy/matrix_question_npi_licensor_present": 0.32, "blimp/accuracy/npi_present_1": 0.513, "blimp/accuracy/wh_vs_that_no_gap": 0.985, "blimp/accuracy/left_branch_island_echo_question": 0.467, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.981, "blimp/accuracy/causative": 0.718, "blimp/accuracy/group_average": 0.7730000000000001, "blimp/accuracy/seq_average": 0.773, "cbt/accuracy/NE": 0.7572115384615384, "cbt/accuracy/V": 0.9128, "cbt/accuracy/CN": 0.8264, "cbt/accuracy/P": 0.89, "cbt/accuracy/group_average": 0.8466028846153847, "cbt/accuracy/seq_average": 0.8466386554621849, "hellaswag/accuracy/val": 0.3009360685122486, "hellaswag/accuracy/group_average": 0.3009360685122486, "hellaswag/accuracy/seq_average": 0.3009360685122486, "piqa/accuracy/val": 0.5903155603917302, "piqa/accuracy/group_average": 0.5903155603917302, "piqa/accuracy/seq_average": 0.5903155603917302, "ai2arc/accuracy/ARC-Easy": 0.3361522198731501, "ai2arc/accuracy/ARC-Challenge": 0.2111587982832618, "ai2arc/accuracy/group_average": 0.27365550907820596, "ai2arc/accuracy/seq_average": 0.29490084985835696, "race/accuracy/test/high": 0.27072612921669525, "race/accuracy/test/middle": 0.33356545961002787, "race/accuracy/group_average": 0.30214579441336153, "race/accuracy/seq_average": 0.28901499797324687, "siqa/accuracy/dev": 0.3607983623336745, "siqa/accuracy/group_average": 0.3607983623336745, "siqa/accuracy/seq_average": 0.3607983623336745, "commonsenseqa/accuracy/dev_rand_split": 0.2547092547092547, "commonsenseqa/accuracy/group_average": 0.2547092547092547, "commonsenseqa/accuracy/seq_average": 0.2547092547092547}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-400000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2683173285590277, "val/accuracy": 0.5268351236979166, "val/perplexity": 9.66312725812092, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.2578974184782608, "lambada/accuracy/total": 0.35248447204968947, "lambada/accuracy/openai_last_token": 0.797748447204969, "lambada/perplexity": 6.849806810128367, "lambada/lm_loss": 2.8690962844385592, "lambada/lm_perplexity": 17.621086551956054, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.43965979787380305, "mean_loss": 2.263107373518644, "blimp/accuracy/passive_2": 0.931, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.855, "blimp/accuracy/tough_vs_raising_2": 0.896, "blimp/accuracy/tough_vs_raising_1": 0.617, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.917, "blimp/accuracy/principle_A_reconstruction": 0.401, "blimp/accuracy/wh_vs_that_with_gap": 0.427, "blimp/accuracy/principle_A_domain_2": 0.904, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.932, "blimp/accuracy/principle_A_domain_3": 0.681, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.945, "blimp/accuracy/animate_subject_trans": 0.907, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.914, "blimp/accuracy/distractor_agreement_relative_clause": 0.649, "blimp/accuracy/transitive": 0.89, "blimp/accuracy/sentential_subject_island": 0.414, "blimp/accuracy/adjunct_island": 0.902, "blimp/accuracy/intransitive": 0.785, "blimp/accuracy/existential_there_subject_raising": 0.894, "blimp/accuracy/irregular_past_participle_adjectives": 0.975, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.688, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.367, "blimp/accuracy/only_npi_scope": 0.679, "blimp/accuracy/superlative_quantifiers_2": 0.806, "blimp/accuracy/passive_1": 0.905, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.923, "blimp/accuracy/inchoative": 0.626, "blimp/accuracy/anaphor_gender_agreement": 0.983, "blimp/accuracy/principle_A_c_command": 0.613, "blimp/accuracy/only_npi_licensor_present": 0.612, "blimp/accuracy/expletive_it_object_raising": 0.807, "blimp/accuracy/left_branch_island_simple_question": 0.798, "blimp/accuracy/wh_questions_subject_gap": 0.931, "blimp/accuracy/existential_there_quantifiers_2": 0.534, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.965, "blimp/accuracy/sentential_negation_npi_scope": 0.707, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.814, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.9, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.908, "blimp/accuracy/principle_A_case_2": 0.956, "blimp/accuracy/distractor_agreement_relational_noun": 0.811, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.981, "blimp/accuracy/superlative_quantifiers_1": 0.69, "blimp/accuracy/wh_island": 0.835, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.638, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.978, "blimp/accuracy/irregular_past_participle_verbs": 0.879, "blimp/accuracy/drop_argument": 0.724, "blimp/accuracy/wh_questions_object_gap": 0.877, "blimp/accuracy/animate_subject_passive": 0.807, "blimp/accuracy/existential_there_quantifiers_1": 0.989, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.903, "blimp/accuracy/npi_present_2": 0.589, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.951, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.966, "blimp/accuracy/existential_there_object_raising": 0.865, "blimp/accuracy/matrix_question_npi_licensor_present": 0.373, "blimp/accuracy/npi_present_1": 0.518, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.46, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.967, "blimp/accuracy/causative": 0.763, "blimp/accuracy/group_average": 0.8038358208955223, "blimp/accuracy/seq_average": 0.8038358208955224, "cbt/accuracy/NE": 0.8241185897435898, "cbt/accuracy/V": 0.9368, "cbt/accuracy/CN": 0.8876, "cbt/accuracy/P": 0.9252, "cbt/accuracy/group_average": 0.8934296474358974, "cbt/accuracy/seq_average": 0.8934573829531812, "hellaswag/accuracy/val": 0.3719378609838678, "hellaswag/accuracy/group_average": 0.3719378609838678, "hellaswag/accuracy/seq_average": 0.3719378609838678, "piqa/accuracy/val": 0.6420021762785637, "piqa/accuracy/group_average": 0.6420021762785637, "piqa/accuracy/seq_average": 0.6420021762785637, "ai2arc/accuracy/ARC-Easy": 0.3898520084566596, "ai2arc/accuracy/ARC-Challenge": 0.23862660944206007, "ai2arc/accuracy/group_average": 0.31423930894935986, "ai2arc/accuracy/seq_average": 0.33994334277620397, "race/accuracy/test/high": 0.29416809605488853, "race/accuracy/test/middle": 0.3649025069637883, "race/accuracy/group_average": 0.3295353015093384, "race/accuracy/seq_average": 0.31475476286988247, "siqa/accuracy/dev": 0.37768679631525076, "siqa/accuracy/group_average": 0.37768679631525076, "siqa/accuracy/seq_average": 0.37768679631525076, "commonsenseqa/accuracy/dev_rand_split": 0.2833742833742834, "commonsenseqa/accuracy/group_average": 0.2833742833742834, "commonsenseqa/accuracy/seq_average": 0.2833742833742834}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-60000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.5485709054129466, "val/accuracy": 0.48475186786954366, "val/perplexity": 12.788814291790336, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5533259610952057, "lambada/accuracy/total": 0.2534937888198758, "lambada/accuracy/openai_last_token": 0.7577639751552795, "lambada/perplexity": 11.13376569615238, "lambada/lm_loss": 3.1077741727307586, "lambada/lm_perplexity": 22.371194527898428, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3691228283447097, "mean_loss": 2.550948433254076, "blimp/accuracy/passive_2": 0.881, "blimp/accuracy/determiner_noun_agreement_2": 0.981, "blimp/accuracy/ellipsis_n_bar_1": 0.841, "blimp/accuracy/tough_vs_raising_2": 0.843, "blimp/accuracy/tough_vs_raising_1": 0.579, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.878, "blimp/accuracy/principle_A_reconstruction": 0.199, "blimp/accuracy/wh_vs_that_with_gap": 0.488, "blimp/accuracy/principle_A_domain_2": 0.875, "blimp/accuracy/determiner_noun_agreement_1": 0.98, "blimp/accuracy/ellipsis_n_bar_2": 0.898, "blimp/accuracy/principle_A_domain_3": 0.613, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.94, "blimp/accuracy/animate_subject_trans": 0.898, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.891, "blimp/accuracy/distractor_agreement_relative_clause": 0.652, "blimp/accuracy/transitive": 0.856, "blimp/accuracy/sentential_subject_island": 0.386, "blimp/accuracy/adjunct_island": 0.86, "blimp/accuracy/intransitive": 0.719, "blimp/accuracy/existential_there_subject_raising": 0.871, "blimp/accuracy/irregular_past_participle_adjectives": 0.898, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.445, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.268, "blimp/accuracy/only_npi_scope": 0.686, "blimp/accuracy/superlative_quantifiers_2": 0.601, "blimp/accuracy/passive_1": 0.865, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.919, "blimp/accuracy/inchoative": 0.596, "blimp/accuracy/anaphor_gender_agreement": 0.978, "blimp/accuracy/principle_A_c_command": 0.58, "blimp/accuracy/only_npi_licensor_present": 0.773, "blimp/accuracy/expletive_it_object_raising": 0.789, "blimp/accuracy/left_branch_island_simple_question": 0.538, "blimp/accuracy/wh_questions_subject_gap": 0.921, "blimp/accuracy/existential_there_quantifiers_2": 0.466, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.947, "blimp/accuracy/sentential_negation_npi_scope": 0.648, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.75, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.829, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.893, "blimp/accuracy/principle_A_case_2": 0.931, "blimp/accuracy/distractor_agreement_relational_noun": 0.823, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987, "blimp/accuracy/superlative_quantifiers_1": 0.544, "blimp/accuracy/wh_island": 0.883, "blimp/accuracy/principle_A_domain_1": 0.986, "blimp/accuracy/complex_NP_island": 0.563, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.97, "blimp/accuracy/irregular_past_participle_verbs": 0.88, "blimp/accuracy/drop_argument": 0.721, "blimp/accuracy/wh_questions_object_gap": 0.8, "blimp/accuracy/animate_subject_passive": 0.817, "blimp/accuracy/existential_there_quantifiers_1": 0.98, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.89, "blimp/accuracy/npi_present_2": 0.642, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.907, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.958, "blimp/accuracy/existential_there_object_raising": 0.88, "blimp/accuracy/matrix_question_npi_licensor_present": 0.303, "blimp/accuracy/npi_present_1": 0.522, "blimp/accuracy/wh_vs_that_no_gap": 0.962, "blimp/accuracy/left_branch_island_echo_question": 0.456, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.963, "blimp/accuracy/causative": 0.687, "blimp/accuracy/group_average": 0.7696268656716416, "blimp/accuracy/seq_average": 0.7696268656716417, "cbt/accuracy/NE": 0.7740384615384616, "cbt/accuracy/V": 0.9184, "cbt/accuracy/CN": 0.836, "cbt/accuracy/P": 0.8996, "cbt/accuracy/group_average": 0.8570096153846154, "cbt/accuracy/seq_average": 0.8570428171268507, "hellaswag/accuracy/val": 0.3073093009360685, "hellaswag/accuracy/group_average": 0.3073093009360685, "hellaswag/accuracy/seq_average": 0.3073093009360685, "piqa/accuracy/val": 0.5973884657236126, "piqa/accuracy/group_average": 0.5973884657236126, "piqa/accuracy/seq_average": 0.5973884657236126, "ai2arc/accuracy/ARC-Easy": 0.34545454545454546, "ai2arc/accuracy/ARC-Challenge": 0.2128755364806867, "ai2arc/accuracy/group_average": 0.2791650409676161, "ai2arc/accuracy/seq_average": 0.301699716713881, "race/accuracy/test/high": 0.2775871926815323, "race/accuracy/test/middle": 0.3293871866295265, "race/accuracy/group_average": 0.3034871896555294, "race/accuracy/seq_average": 0.29266315362788814, "siqa/accuracy/dev": 0.36438075742067555, "siqa/accuracy/group_average": 0.36438075742067555, "siqa/accuracy/seq_average": 0.36438075742067555, "commonsenseqa/accuracy/dev_rand_split": 0.2620802620802621, "commonsenseqa/accuracy/group_average": 0.2620802620802621, "commonsenseqa/accuracy/seq_average": 0.2620802620802621}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_660M_standardlb/export/result-model-80000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.498806423611111, "val/accuracy": 0.4925692119295635, "val/perplexity": 12.16796189784298, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.620023928814053, "lambada/accuracy/total": 0.266110248447205, "lambada/accuracy/openai_last_token": 0.765916149068323, "lambada/perplexity": 10.338889135005392, "lambada/lm_loss": 3.078159997737097, "lambada/lm_perplexity": 21.718403696684717, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.37933973018838424, "mean_loss": 2.559415176212582, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.992, "blimp/accuracy/ellipsis_n_bar_1": 0.802, "blimp/accuracy/tough_vs_raising_2": 0.863, "blimp/accuracy/tough_vs_raising_1": 0.613, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.911, "blimp/accuracy/principle_A_reconstruction": 0.288, "blimp/accuracy/wh_vs_that_with_gap": 0.519, "blimp/accuracy/principle_A_domain_2": 0.899, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.912, "blimp/accuracy/principle_A_domain_3": 0.664, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.913, "blimp/accuracy/animate_subject_trans": 0.898, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.89, "blimp/accuracy/distractor_agreement_relative_clause": 0.677, "blimp/accuracy/transitive": 0.867, "blimp/accuracy/sentential_subject_island": 0.441, "blimp/accuracy/adjunct_island": 0.863, "blimp/accuracy/intransitive": 0.704, "blimp/accuracy/existential_there_subject_raising": 0.889, "blimp/accuracy/irregular_past_participle_adjectives": 0.898, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.472, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.325, "blimp/accuracy/only_npi_scope": 0.646, "blimp/accuracy/superlative_quantifiers_2": 0.669, "blimp/accuracy/passive_1": 0.897, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.892, "blimp/accuracy/inchoative": 0.557, "blimp/accuracy/anaphor_gender_agreement": 0.977, "blimp/accuracy/principle_A_c_command": 0.609, "blimp/accuracy/only_npi_licensor_present": 0.506, "blimp/accuracy/expletive_it_object_raising": 0.795, "blimp/accuracy/left_branch_island_simple_question": 0.534, "blimp/accuracy/wh_questions_subject_gap": 0.922, "blimp/accuracy/existential_there_quantifiers_2": 0.556, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.947, "blimp/accuracy/sentential_negation_npi_scope": 0.67, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.811, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.853, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.916, "blimp/accuracy/principle_A_case_2": 0.941, "blimp/accuracy/distractor_agreement_relational_noun": 0.854, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.975, "blimp/accuracy/superlative_quantifiers_1": 0.552, "blimp/accuracy/wh_island": 0.85, "blimp/accuracy/principle_A_domain_1": 0.979, "blimp/accuracy/complex_NP_island": 0.581, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.965, "blimp/accuracy/irregular_past_participle_verbs": 0.875, "blimp/accuracy/drop_argument": 0.732, "blimp/accuracy/wh_questions_object_gap": 0.835, "blimp/accuracy/animate_subject_passive": 0.804, "blimp/accuracy/existential_there_quantifiers_1": 0.992, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.904, "blimp/accuracy/npi_present_2": 0.618, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.936, "blimp/accuracy/anaphor_number_agreement": 0.985, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.949, "blimp/accuracy/existential_there_object_raising": 0.834, "blimp/accuracy/matrix_question_npi_licensor_present": 0.298, "blimp/accuracy/npi_present_1": 0.56, "blimp/accuracy/wh_vs_that_no_gap": 0.978, "blimp/accuracy/left_branch_island_echo_question": 0.494, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.953, "blimp/accuracy/causative": 0.733, "blimp/accuracy/group_average": 0.7781791044776123, "blimp/accuracy/seq_average": 0.778179104477612, "cbt/accuracy/NE": 0.7824519230769231, "cbt/accuracy/V": 0.9252, "cbt/accuracy/CN": 0.8436, "cbt/accuracy/P": 0.8988, "cbt/accuracy/group_average": 0.8625129807692308, "cbt/accuracy/seq_average": 0.8625450180072028, "hellaswag/accuracy/val": 0.31736705835490936, "hellaswag/accuracy/group_average": 0.31736705835490936, "hellaswag/accuracy/seq_average": 0.31736705835490936, "piqa/accuracy/val": 0.602829162132753, "piqa/accuracy/group_average": 0.602829162132753, "piqa/accuracy/seq_average": 0.602829162132753, "ai2arc/accuracy/ARC-Easy": 0.35348837209302325, "ai2arc/accuracy/ARC-Challenge": 0.21974248927038625, "ai2arc/accuracy/group_average": 0.28661543068170475, "ai2arc/accuracy/seq_average": 0.3093484419263456, "race/accuracy/test/high": 0.27844482561463696, "race/accuracy/test/middle": 0.3502785515320334, "race/accuracy/group_average": 0.3143616885733352, "race/accuracy/seq_average": 0.29935143899473043, "siqa/accuracy/dev": 0.3618219037871034, "siqa/accuracy/group_average": 0.3618219037871034, "siqa/accuracy/seq_average": 0.3618219037871034, "commonsenseqa/accuracy/dev_rand_split": 0.25225225225225223, "commonsenseqa/accuracy/group_average": 0.25225225225225223, "commonsenseqa/accuracy/seq_average": 0.25225225225225223}