DavidNguyen commited on
Commit
1a2df65
·
verified ·
1 Parent(s): 745e881

59d9a71623350d4d564442458f8a1cbad5d3a5c4dc9e00d508979c74d5853496

Browse files
Files changed (20) hide show
  1. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-100000.pth.json +1 -0
  2. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-120000.pth.json +1 -0
  3. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-140000.pth.json +1 -0
  4. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-160000.pth.json +1 -0
  5. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-180000.pth.json +1 -0
  6. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-20000.pth.json +1 -0
  7. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-200000.pth.json +1 -0
  8. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-220000.pth.json +1 -0
  9. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-240000.pth.json +1 -0
  10. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-260000.pth.json +1 -0
  11. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-280000.pth.json +1 -0
  12. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-300000.pth.json +1 -0
  13. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-320000.pth.json +1 -0
  14. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-340000.pth.json +1 -0
  15. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-360000.pth.json +1 -0
  16. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-380000.pth.json +1 -0
  17. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-40000.pth.json +1 -0
  18. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-400000.pth.json +1 -0
  19. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-60000.pth.json +1 -0
  20. Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-80000.pth.json +1 -0
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-100000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.4436538938492065, "val/accuracy": 0.49998934306795634, "val/perplexity": 11.515038697104089, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.52237848317401, "lambada/accuracy/total": 0.3204580745341615, "lambada/accuracy/openai_last_token": 0.7841614906832298, "lambada/perplexity": 8.66151602626163, "lambada/lm_loss": 3.0348728059790933, "lambada/lm_perplexity": 20.79833230771316, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4102237088010589, "mean_loss": 2.483016188511608, "blimp/accuracy/passive_2": 0.896, "blimp/accuracy/determiner_noun_agreement_2": 0.982, "blimp/accuracy/ellipsis_n_bar_1": 0.805, "blimp/accuracy/tough_vs_raising_2": 0.883, "blimp/accuracy/tough_vs_raising_1": 0.617, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.924, "blimp/accuracy/principle_A_reconstruction": 0.367, "blimp/accuracy/wh_vs_that_with_gap": 0.574, "blimp/accuracy/principle_A_domain_2": 0.883, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.894, "blimp/accuracy/principle_A_domain_3": 0.64, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.936, "blimp/accuracy/animate_subject_trans": 0.912, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.901, "blimp/accuracy/distractor_agreement_relative_clause": 0.665, "blimp/accuracy/transitive": 0.859, "blimp/accuracy/sentential_subject_island": 0.352, "blimp/accuracy/adjunct_island": 0.798, "blimp/accuracy/intransitive": 0.801, "blimp/accuracy/existential_there_subject_raising": 0.854, "blimp/accuracy/irregular_past_participle_adjectives": 0.903, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.569, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.31, "blimp/accuracy/only_npi_scope": 0.692, "blimp/accuracy/superlative_quantifiers_2": 0.722, "blimp/accuracy/passive_1": 0.887, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.928, "blimp/accuracy/inchoative": 0.678, "blimp/accuracy/anaphor_gender_agreement": 0.98, "blimp/accuracy/principle_A_c_command": 0.646, "blimp/accuracy/only_npi_licensor_present": 0.625, "blimp/accuracy/expletive_it_object_raising": 0.805, "blimp/accuracy/left_branch_island_simple_question": 0.611, "blimp/accuracy/wh_questions_subject_gap": 0.918, "blimp/accuracy/existential_there_quantifiers_2": 0.55, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.95, "blimp/accuracy/sentential_negation_npi_scope": 0.757, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.836, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.842, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.91, "blimp/accuracy/principle_A_case_2": 0.95, "blimp/accuracy/distractor_agreement_relational_noun": 0.887, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987, "blimp/accuracy/superlative_quantifiers_1": 0.562, "blimp/accuracy/wh_island": 0.708, "blimp/accuracy/principle_A_domain_1": 0.975, "blimp/accuracy/complex_NP_island": 0.545, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.978, "blimp/accuracy/irregular_past_participle_verbs": 0.879, "blimp/accuracy/drop_argument": 0.754, "blimp/accuracy/wh_questions_object_gap": 0.795, "blimp/accuracy/animate_subject_passive": 0.785, "blimp/accuracy/existential_there_quantifiers_1": 0.981, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.912, "blimp/accuracy/npi_present_2": 0.567, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.958, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.968, "blimp/accuracy/existential_there_object_raising": 0.804, "blimp/accuracy/matrix_question_npi_licensor_present": 0.28, "blimp/accuracy/npi_present_1": 0.655, "blimp/accuracy/wh_vs_that_no_gap": 0.976, "blimp/accuracy/left_branch_island_echo_question": 0.546, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.969, "blimp/accuracy/causative": 0.726, "blimp/accuracy/group_average": 0.7878955223880597, "blimp/accuracy/seq_average": 0.7878955223880597, "cbt/accuracy/NE": 0.7920673076923077, "cbt/accuracy/V": 0.926, "cbt/accuracy/CN": 0.8676, "cbt/accuracy/P": 0.9036, "cbt/accuracy/group_average": 0.8723168269230769, "cbt/accuracy/seq_average": 0.8723489395758304, "hellaswag/accuracy/val": 0.3285202150965943, "hellaswag/accuracy/group_average": 0.3285202150965943, "hellaswag/accuracy/seq_average": 0.3285202150965943, "piqa/accuracy/val": 0.6093579978237215, "piqa/accuracy/group_average": 0.6093579978237215, "piqa/accuracy/seq_average": 0.6093579978237215, "ai2arc/accuracy/ARC-Easy": 0.360676532769556, "ai2arc/accuracy/ARC-Challenge": 0.22489270386266094, "ai2arc/accuracy/group_average": 0.29278461831610847, "ai2arc/accuracy/seq_average": 0.31586402266288954, "race/accuracy/test/high": 0.28187535734705543, "race/accuracy/test/middle": 0.346100278551532, "race/accuracy/group_average": 0.31398781794929376, "race/accuracy/seq_average": 0.3005674908796109, "siqa/accuracy/dev": 0.3679631525076766, "siqa/accuracy/group_average": 0.3679631525076766, "siqa/accuracy/seq_average": 0.3679631525076766, "commonsenseqa/accuracy/dev_rand_split": 0.26371826371826373, "commonsenseqa/accuracy/group_average": 0.26371826371826373, "commonsenseqa/accuracy/seq_average": 0.26371826371826373}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-120000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.4208512079148066, "val/accuracy": 0.5036388578869048, "val/perplexity": 11.255435954617012, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4739034901494565, "lambada/accuracy/total": 0.3458850931677019, "lambada/accuracy/openai_last_token": 0.7888198757763976, "lambada/perplexity": 7.708660550777633, "lambada/lm_loss": 3.0079031372639404, "lambada/lm_perplexity": 20.244904601327143, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4247619755273033, "mean_loss": 2.4473773490321316, "blimp/accuracy/passive_2": 0.9, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.825, "blimp/accuracy/tough_vs_raising_2": 0.87, "blimp/accuracy/tough_vs_raising_1": 0.593, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.915, "blimp/accuracy/principle_A_reconstruction": 0.402, "blimp/accuracy/wh_vs_that_with_gap": 0.521, "blimp/accuracy/principle_A_domain_2": 0.862, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.926, "blimp/accuracy/principle_A_domain_3": 0.604, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.934, "blimp/accuracy/animate_subject_trans": 0.897, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.915, "blimp/accuracy/distractor_agreement_relative_clause": 0.675, "blimp/accuracy/transitive": 0.85, "blimp/accuracy/sentential_subject_island": 0.366, "blimp/accuracy/adjunct_island": 0.848, "blimp/accuracy/intransitive": 0.752, "blimp/accuracy/existential_there_subject_raising": 0.854, "blimp/accuracy/irregular_past_participle_adjectives": 0.906, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.68, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.329, "blimp/accuracy/only_npi_scope": 0.589, "blimp/accuracy/superlative_quantifiers_2": 0.728, "blimp/accuracy/passive_1": 0.895, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.925, "blimp/accuracy/inchoative": 0.636, "blimp/accuracy/anaphor_gender_agreement": 0.979, "blimp/accuracy/principle_A_c_command": 0.628, "blimp/accuracy/only_npi_licensor_present": 0.525, "blimp/accuracy/expletive_it_object_raising": 0.766, "blimp/accuracy/left_branch_island_simple_question": 0.738, "blimp/accuracy/wh_questions_subject_gap": 0.954, "blimp/accuracy/existential_there_quantifiers_2": 0.482, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.947, "blimp/accuracy/sentential_negation_npi_scope": 0.764, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.801, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.889, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.914, "blimp/accuracy/principle_A_case_2": 0.952, "blimp/accuracy/distractor_agreement_relational_noun": 0.866, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.986, "blimp/accuracy/superlative_quantifiers_1": 0.703, "blimp/accuracy/wh_island": 0.802, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.587, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.979, "blimp/accuracy/irregular_past_participle_verbs": 0.903, "blimp/accuracy/drop_argument": 0.715, "blimp/accuracy/wh_questions_object_gap": 0.822, "blimp/accuracy/animate_subject_passive": 0.817, "blimp/accuracy/existential_there_quantifiers_1": 0.98, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.911, "blimp/accuracy/npi_present_2": 0.648, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.929, "blimp/accuracy/anaphor_number_agreement": 0.987, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.966, "blimp/accuracy/existential_there_object_raising": 0.833, "blimp/accuracy/matrix_question_npi_licensor_present": 0.334, "blimp/accuracy/npi_present_1": 0.593, "blimp/accuracy/wh_vs_that_no_gap": 0.981, "blimp/accuracy/left_branch_island_echo_question": 0.518, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.968, "blimp/accuracy/causative": 0.766, "blimp/accuracy/group_average": 0.7924477611940298, "blimp/accuracy/seq_average": 0.7924477611940298, "cbt/accuracy/NE": 0.7988782051282052, "cbt/accuracy/V": 0.9276, "cbt/accuracy/CN": 0.8664, "cbt/accuracy/P": 0.9092, "cbt/accuracy/group_average": 0.8755195512820513, "cbt/accuracy/seq_average": 0.8755502200880352, "hellaswag/accuracy/val": 0.3337980481975702, "hellaswag/accuracy/group_average": 0.3337980481975702, "hellaswag/accuracy/seq_average": 0.3337980481975702, "piqa/accuracy/val": 0.6240478781284005, "piqa/accuracy/group_average": 0.6240478781284005, "piqa/accuracy/seq_average": 0.6240478781284005, "ai2arc/accuracy/ARC-Easy": 0.3758985200845666, "ai2arc/accuracy/ARC-Challenge": 0.23090128755364808, "ai2arc/accuracy/group_average": 0.3033999038191073, "ai2arc/accuracy/seq_average": 0.32804532577903683, "race/accuracy/test/high": 0.2887364208118925, "race/accuracy/test/middle": 0.36629526462395545, "race/accuracy/group_average": 0.327515842717924, "race/accuracy/seq_average": 0.31130928252938794, "siqa/accuracy/dev": 0.37615148413510746, "siqa/accuracy/group_average": 0.37615148413510746, "siqa/accuracy/seq_average": 0.37615148413510746, "commonsenseqa/accuracy/dev_rand_split": 0.2702702702702703, "commonsenseqa/accuracy/group_average": 0.2702702702702703, "commonsenseqa/accuracy/seq_average": 0.2702702702702703}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-140000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.400276547386533, "val/accuracy": 0.5057101779513888, "val/perplexity": 11.026225232817579, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3573642517468945, "lambada/accuracy/total": 0.3266692546583851, "lambada/accuracy/openai_last_token": 0.7905667701863354, "lambada/perplexity": 7.996995723739071, "lambada/lm_loss": 2.98772270215793, "lambada/lm_perplexity": 19.84044839504278, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.416189716304887, "mean_loss": 2.3788203995667137, "blimp/accuracy/passive_2": 0.904, "blimp/accuracy/determiner_noun_agreement_2": 0.978, "blimp/accuracy/ellipsis_n_bar_1": 0.835, "blimp/accuracy/tough_vs_raising_2": 0.872, "blimp/accuracy/tough_vs_raising_1": 0.65, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.898, "blimp/accuracy/principle_A_reconstruction": 0.399, "blimp/accuracy/wh_vs_that_with_gap": 0.54, "blimp/accuracy/principle_A_domain_2": 0.892, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.898, "blimp/accuracy/principle_A_domain_3": 0.66, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.945, "blimp/accuracy/animate_subject_trans": 0.915, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.893, "blimp/accuracy/distractor_agreement_relative_clause": 0.654, "blimp/accuracy/transitive": 0.882, "blimp/accuracy/sentential_subject_island": 0.319, "blimp/accuracy/adjunct_island": 0.838, "blimp/accuracy/intransitive": 0.759, "blimp/accuracy/existential_there_subject_raising": 0.881, "blimp/accuracy/irregular_past_participle_adjectives": 0.978, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.662, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.341, "blimp/accuracy/only_npi_scope": 0.65, "blimp/accuracy/superlative_quantifiers_2": 0.671, "blimp/accuracy/passive_1": 0.903, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.926, "blimp/accuracy/inchoative": 0.641, "blimp/accuracy/anaphor_gender_agreement": 0.978, "blimp/accuracy/principle_A_c_command": 0.594, "blimp/accuracy/only_npi_licensor_present": 0.587, "blimp/accuracy/expletive_it_object_raising": 0.763, "blimp/accuracy/left_branch_island_simple_question": 0.661, "blimp/accuracy/wh_questions_subject_gap": 0.947, "blimp/accuracy/existential_there_quantifiers_2": 0.558, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.947, "blimp/accuracy/sentential_negation_npi_scope": 0.785, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.812, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.885, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.908, "blimp/accuracy/principle_A_case_2": 0.945, "blimp/accuracy/distractor_agreement_relational_noun": 0.895, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.988, "blimp/accuracy/superlative_quantifiers_1": 0.717, "blimp/accuracy/wh_island": 0.744, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.602, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.977, "blimp/accuracy/irregular_past_participle_verbs": 0.869, "blimp/accuracy/drop_argument": 0.766, "blimp/accuracy/wh_questions_object_gap": 0.835, "blimp/accuracy/animate_subject_passive": 0.783, "blimp/accuracy/existential_there_quantifiers_1": 0.989, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.928, "blimp/accuracy/npi_present_2": 0.569, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.955, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.963, "blimp/accuracy/existential_there_object_raising": 0.812, "blimp/accuracy/matrix_question_npi_licensor_present": 0.401, "blimp/accuracy/npi_present_1": 0.57, "blimp/accuracy/wh_vs_that_no_gap": 0.986, "blimp/accuracy/left_branch_island_echo_question": 0.536, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.976, "blimp/accuracy/causative": 0.736, "blimp/accuracy/group_average": 0.7957910447761193, "blimp/accuracy/seq_average": 0.7957910447761194, "cbt/accuracy/NE": 0.7980769230769231, "cbt/accuracy/V": 0.9352, "cbt/accuracy/CN": 0.8664, "cbt/accuracy/P": 0.9156, "cbt/accuracy/group_average": 0.8788192307692307, "cbt/accuracy/seq_average": 0.8788515406162465, "hellaswag/accuracy/val": 0.3385779725154352, "hellaswag/accuracy/group_average": 0.3385779725154352, "hellaswag/accuracy/seq_average": 0.3385779725154352, "piqa/accuracy/val": 0.6164309031556039, "piqa/accuracy/group_average": 0.6164309031556039, "piqa/accuracy/seq_average": 0.6164309031556039, "ai2arc/accuracy/ARC-Easy": 0.3627906976744186, "ai2arc/accuracy/ARC-Challenge": 0.2334763948497854, "ai2arc/accuracy/group_average": 0.29813354626210203, "ai2arc/accuracy/seq_average": 0.32011331444759206, "race/accuracy/test/high": 0.28987993138936535, "race/accuracy/test/middle": 0.3544568245125348, "race/accuracy/group_average": 0.3221683779509501, "race/accuracy/seq_average": 0.30867450344548036, "siqa/accuracy/dev": 0.37308085977482086, "siqa/accuracy/group_average": 0.37308085977482086, "siqa/accuracy/seq_average": 0.37308085977482086, "commonsenseqa/accuracy/dev_rand_split": 0.2727272727272727, "commonsenseqa/accuracy/group_average": 0.2727272727272727, "commonsenseqa/accuracy/seq_average": 0.2727272727272727}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-160000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3806600419301835, "val/accuracy": 0.5089673239087301, "val/perplexity": 10.812036906997793, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3977236540421196, "lambada/accuracy/total": 0.33462732919254656, "lambada/accuracy/openai_last_token": 0.7884316770186336, "lambada/perplexity": 7.694164831524178, "lambada/lm_loss": 2.9711021491295155, "lambada/lm_perplexity": 19.513414441333115, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.42179732655063834, "mean_loss": 2.389191847986152, "blimp/accuracy/passive_2": 0.897, "blimp/accuracy/determiner_noun_agreement_2": 0.989, "blimp/accuracy/ellipsis_n_bar_1": 0.841, "blimp/accuracy/tough_vs_raising_2": 0.856, "blimp/accuracy/tough_vs_raising_1": 0.622, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.921, "blimp/accuracy/principle_A_reconstruction": 0.403, "blimp/accuracy/wh_vs_that_with_gap": 0.49, "blimp/accuracy/principle_A_domain_2": 0.894, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.889, "blimp/accuracy/principle_A_domain_3": 0.607, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.939, "blimp/accuracy/animate_subject_trans": 0.907, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.912, "blimp/accuracy/distractor_agreement_relative_clause": 0.668, "blimp/accuracy/transitive": 0.868, "blimp/accuracy/sentential_subject_island": 0.374, "blimp/accuracy/adjunct_island": 0.856, "blimp/accuracy/intransitive": 0.76, "blimp/accuracy/existential_there_subject_raising": 0.876, "blimp/accuracy/irregular_past_participle_adjectives": 0.947, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.673, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.321, "blimp/accuracy/only_npi_scope": 0.659, "blimp/accuracy/superlative_quantifiers_2": 0.799, "blimp/accuracy/passive_1": 0.906, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.945, "blimp/accuracy/inchoative": 0.643, "blimp/accuracy/anaphor_gender_agreement": 0.978, "blimp/accuracy/principle_A_c_command": 0.61, "blimp/accuracy/only_npi_licensor_present": 0.481, "blimp/accuracy/expletive_it_object_raising": 0.757, "blimp/accuracy/left_branch_island_simple_question": 0.705, "blimp/accuracy/wh_questions_subject_gap": 0.927, "blimp/accuracy/existential_there_quantifiers_2": 0.512, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.944, "blimp/accuracy/sentential_negation_npi_scope": 0.779, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.849, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.889, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.906, "blimp/accuracy/principle_A_case_2": 0.95, "blimp/accuracy/distractor_agreement_relational_noun": 0.902, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.982, "blimp/accuracy/superlative_quantifiers_1": 0.787, "blimp/accuracy/wh_island": 0.818, "blimp/accuracy/principle_A_domain_1": 0.982, "blimp/accuracy/complex_NP_island": 0.563, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.974, "blimp/accuracy/irregular_past_participle_verbs": 0.905, "blimp/accuracy/drop_argument": 0.735, "blimp/accuracy/wh_questions_object_gap": 0.822, "blimp/accuracy/animate_subject_passive": 0.769, "blimp/accuracy/existential_there_quantifiers_1": 0.985, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.914, "blimp/accuracy/npi_present_2": 0.547, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.957, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.968, "blimp/accuracy/existential_there_object_raising": 0.824, "blimp/accuracy/matrix_question_npi_licensor_present": 0.34, "blimp/accuracy/npi_present_1": 0.516, "blimp/accuracy/wh_vs_that_no_gap": 0.982, "blimp/accuracy/left_branch_island_echo_question": 0.457, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.965, "blimp/accuracy/causative": 0.738, "blimp/accuracy/group_average": 0.793507462686567, "blimp/accuracy/seq_average": 0.7935074626865671, "cbt/accuracy/NE": 0.8092948717948718, "cbt/accuracy/V": 0.9276, "cbt/accuracy/CN": 0.8804, "cbt/accuracy/P": 0.9108, "cbt/accuracy/group_average": 0.8820237179487179, "cbt/accuracy/seq_average": 0.8820528211284514, "hellaswag/accuracy/val": 0.34047002589125674, "hellaswag/accuracy/group_average": 0.34047002589125674, "hellaswag/accuracy/seq_average": 0.34047002589125674, "piqa/accuracy/val": 0.6240478781284005, "piqa/accuracy/group_average": 0.6240478781284005, "piqa/accuracy/seq_average": 0.6240478781284005, "ai2arc/accuracy/ARC-Easy": 0.36659619450317127, "ai2arc/accuracy/ARC-Challenge": 0.22746781115879827, "ai2arc/accuracy/group_average": 0.29703200283098474, "ai2arc/accuracy/seq_average": 0.3206798866855524, "race/accuracy/test/high": 0.29073756432247, "race/accuracy/test/middle": 0.3579387186629526, "race/accuracy/group_average": 0.32433814149271134, "race/accuracy/seq_average": 0.31029590595865425, "siqa/accuracy/dev": 0.372057318321392, "siqa/accuracy/group_average": 0.372057318321392, "siqa/accuracy/seq_average": 0.372057318321392, "commonsenseqa/accuracy/dev_rand_split": 0.28255528255528256, "commonsenseqa/accuracy/group_average": 0.28255528255528256, "commonsenseqa/accuracy/seq_average": 0.28255528255528256}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-180000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3608567979600696, "val/accuracy": 0.5122254386780754, "val/perplexity": 10.600029646567169, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4632685880483307, "lambada/accuracy/total": 0.3427795031055901, "lambada/accuracy/openai_last_token": 0.7928959627329193, "lambada/perplexity": 7.499034838691734, "lambada/lm_loss": 2.962471163698003, "lambada/lm_perplexity": 19.34571917429598, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4275024708918327, "mean_loss": 2.4120626930042004, "blimp/accuracy/passive_2": 0.903, "blimp/accuracy/determiner_noun_agreement_2": 0.982, "blimp/accuracy/ellipsis_n_bar_1": 0.815, "blimp/accuracy/tough_vs_raising_2": 0.844, "blimp/accuracy/tough_vs_raising_1": 0.656, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.914, "blimp/accuracy/principle_A_reconstruction": 0.309, "blimp/accuracy/wh_vs_that_with_gap": 0.507, "blimp/accuracy/principle_A_domain_2": 0.892, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.919, "blimp/accuracy/principle_A_domain_3": 0.665, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.936, "blimp/accuracy/animate_subject_trans": 0.922, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.916, "blimp/accuracy/distractor_agreement_relative_clause": 0.699, "blimp/accuracy/transitive": 0.862, "blimp/accuracy/sentential_subject_island": 0.34, "blimp/accuracy/adjunct_island": 0.87, "blimp/accuracy/intransitive": 0.769, "blimp/accuracy/existential_there_subject_raising": 0.862, "blimp/accuracy/irregular_past_participle_adjectives": 0.986, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.673, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.337, "blimp/accuracy/only_npi_scope": 0.603, "blimp/accuracy/superlative_quantifiers_2": 0.748, "blimp/accuracy/passive_1": 0.9, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.932, "blimp/accuracy/inchoative": 0.637, "blimp/accuracy/anaphor_gender_agreement": 0.976, "blimp/accuracy/principle_A_c_command": 0.624, "blimp/accuracy/only_npi_licensor_present": 0.582, "blimp/accuracy/expletive_it_object_raising": 0.759, "blimp/accuracy/left_branch_island_simple_question": 0.697, "blimp/accuracy/wh_questions_subject_gap": 0.953, "blimp/accuracy/existential_there_quantifiers_2": 0.437, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.938, "blimp/accuracy/sentential_negation_npi_scope": 0.789, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.801, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.91, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.886, "blimp/accuracy/principle_A_case_2": 0.95, "blimp/accuracy/distractor_agreement_relational_noun": 0.872, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.677, "blimp/accuracy/wh_island": 0.821, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.574, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.973, "blimp/accuracy/irregular_past_participle_verbs": 0.891, "blimp/accuracy/drop_argument": 0.752, "blimp/accuracy/wh_questions_object_gap": 0.845, "blimp/accuracy/animate_subject_passive": 0.771, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.906, "blimp/accuracy/npi_present_2": 0.519, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.942, "blimp/accuracy/anaphor_number_agreement": 0.99, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.964, "blimp/accuracy/existential_there_object_raising": 0.856, "blimp/accuracy/matrix_question_npi_licensor_present": 0.363, "blimp/accuracy/npi_present_1": 0.5, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.449, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.763, "blimp/accuracy/group_average": 0.7915671641791042, "blimp/accuracy/seq_average": 0.7915671641791044, "cbt/accuracy/NE": 0.8112980769230769, "cbt/accuracy/V": 0.932, "cbt/accuracy/CN": 0.8816, "cbt/accuracy/P": 0.9176, "cbt/accuracy/group_average": 0.8856245192307692, "cbt/accuracy/seq_average": 0.8856542617046819, "hellaswag/accuracy/val": 0.3474407488548098, "hellaswag/accuracy/group_average": 0.3474407488548098, "hellaswag/accuracy/seq_average": 0.3474407488548098, "piqa/accuracy/val": 0.6267682263329706, "piqa/accuracy/group_average": 0.6267682263329706, "piqa/accuracy/seq_average": 0.6267682263329706, "ai2arc/accuracy/ARC-Easy": 0.3687103594080338, "ai2arc/accuracy/ARC-Challenge": 0.2257510729613734, "ai2arc/accuracy/group_average": 0.2972307161847036, "ai2arc/accuracy/seq_average": 0.32152974504249293, "race/accuracy/test/high": 0.2978845054316752, "race/accuracy/test/middle": 0.3628133704735376, "race/accuracy/group_average": 0.33034893795260645, "race/accuracy/seq_average": 0.3167815160113498, "siqa/accuracy/dev": 0.3679631525076766, "siqa/accuracy/group_average": 0.3679631525076766, "siqa/accuracy/seq_average": 0.3679631525076766, "commonsenseqa/accuracy/dev_rand_split": 0.27682227682227684, "commonsenseqa/accuracy/group_average": 0.27682227682227684, "commonsenseqa/accuracy/seq_average": 0.27682227682227684}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-20000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.7174210321335566, "val/accuracy": 0.46215045262896826, "val/perplexity": 15.141223121044918, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5852727475373642, "lambada/accuracy/total": 0.2107919254658385, "lambada/accuracy/openai_last_token": 0.748641304347826, "lambada/perplexity": 14.51631128861096, "lambada/lm_loss": 3.2801830437306054, "lambada/lm_perplexity": 26.58063767369056, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.33647118904740336, "mean_loss": 2.6513468898354606, "blimp/accuracy/passive_2": 0.907, "blimp/accuracy/determiner_noun_agreement_2": 0.989, "blimp/accuracy/ellipsis_n_bar_1": 0.813, "blimp/accuracy/tough_vs_raising_2": 0.862, "blimp/accuracy/tough_vs_raising_1": 0.605, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.893, "blimp/accuracy/principle_A_reconstruction": 0.435, "blimp/accuracy/wh_vs_that_with_gap": 0.47, "blimp/accuracy/principle_A_domain_2": 0.865, "blimp/accuracy/determiner_noun_agreement_1": 0.98, "blimp/accuracy/ellipsis_n_bar_2": 0.908, "blimp/accuracy/principle_A_domain_3": 0.575, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.926, "blimp/accuracy/animate_subject_trans": 0.897, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.89, "blimp/accuracy/distractor_agreement_relative_clause": 0.544, "blimp/accuracy/transitive": 0.851, "blimp/accuracy/sentential_subject_island": 0.367, "blimp/accuracy/adjunct_island": 0.794, "blimp/accuracy/intransitive": 0.777, "blimp/accuracy/existential_there_subject_raising": 0.859, "blimp/accuracy/irregular_past_participle_adjectives": 0.989, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.349, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.17, "blimp/accuracy/only_npi_scope": 0.604, "blimp/accuracy/superlative_quantifiers_2": 0.605, "blimp/accuracy/passive_1": 0.885, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.864, "blimp/accuracy/inchoative": 0.642, "blimp/accuracy/anaphor_gender_agreement": 0.967, "blimp/accuracy/principle_A_c_command": 0.522, "blimp/accuracy/only_npi_licensor_present": 0.565, "blimp/accuracy/expletive_it_object_raising": 0.76, "blimp/accuracy/left_branch_island_simple_question": 0.414, "blimp/accuracy/wh_questions_subject_gap": 0.913, "blimp/accuracy/existential_there_quantifiers_2": 0.286, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.935, "blimp/accuracy/sentential_negation_npi_scope": 0.718, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.814, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.932, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.883, "blimp/accuracy/principle_A_case_2": 0.943, "blimp/accuracy/distractor_agreement_relational_noun": 0.846, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.486, "blimp/accuracy/wh_island": 0.737, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.503, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.963, "blimp/accuracy/irregular_past_participle_verbs": 0.855, "blimp/accuracy/drop_argument": 0.762, "blimp/accuracy/wh_questions_object_gap": 0.811, "blimp/accuracy/animate_subject_passive": 0.811, "blimp/accuracy/existential_there_quantifiers_1": 0.974, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.865, "blimp/accuracy/npi_present_2": 0.598, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.924, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.952, "blimp/accuracy/existential_there_object_raising": 0.805, "blimp/accuracy/matrix_question_npi_licensor_present": 0.096, "blimp/accuracy/npi_present_1": 0.594, "blimp/accuracy/wh_vs_that_no_gap": 0.979, "blimp/accuracy/left_branch_island_echo_question": 0.473, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.987, "blimp/accuracy/causative": 0.709, "blimp/accuracy/group_average": 0.7562985074626866, "blimp/accuracy/seq_average": 0.7562985074626866, "cbt/accuracy/NE": 0.7431891025641025, "cbt/accuracy/V": 0.9064, "cbt/accuracy/CN": 0.8024, "cbt/accuracy/P": 0.8732, "cbt/accuracy/group_average": 0.8312972756410255, "cbt/accuracy/seq_average": 0.8313325330132053, "hellaswag/accuracy/val": 0.2874925313682533, "hellaswag/accuracy/group_average": 0.2874925313682533, "hellaswag/accuracy/seq_average": 0.2874925313682533, "piqa/accuracy/val": 0.5723612622415669, "piqa/accuracy/group_average": 0.5723612622415669, "piqa/accuracy/seq_average": 0.5723612622415669, "ai2arc/accuracy/ARC-Easy": 0.31839323467230446, "ai2arc/accuracy/ARC-Challenge": 0.21802575107296138, "ai2arc/accuracy/group_average": 0.2682094928726329, "ai2arc/accuracy/seq_average": 0.28526912181303116, "race/accuracy/test/high": 0.2672955974842767, "race/accuracy/test/middle": 0.3412256267409471, "race/accuracy/group_average": 0.3042606121126119, "race/accuracy/seq_average": 0.2888123226591001, "siqa/accuracy/dev": 0.36898669396110545, "siqa/accuracy/group_average": 0.36898669396110545, "siqa/accuracy/seq_average": 0.36898669396110545, "commonsenseqa/accuracy/dev_rand_split": 0.2497952497952498, "commonsenseqa/accuracy/group_average": 0.2497952497952498, "commonsenseqa/accuracy/seq_average": 0.2497952497952498}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-200000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3466634598989335, "val/accuracy": 0.5151338123139881, "val/perplexity": 10.450642501301552, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.32799970733453, "lambada/accuracy/total": 0.34627329192546585, "lambada/accuracy/openai_last_token": 0.7911490683229814, "lambada/perplexity": 7.539690419688911, "lambada/lm_loss": 2.953883393432971, "lambada/lm_perplexity": 19.180293916143853, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.430703552119727, "mean_loss": 2.3373315836167317, "blimp/accuracy/passive_2": 0.912, "blimp/accuracy/determiner_noun_agreement_2": 0.981, "blimp/accuracy/ellipsis_n_bar_1": 0.788, "blimp/accuracy/tough_vs_raising_2": 0.859, "blimp/accuracy/tough_vs_raising_1": 0.624, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.941, "blimp/accuracy/principle_A_reconstruction": 0.364, "blimp/accuracy/wh_vs_that_with_gap": 0.522, "blimp/accuracy/principle_A_domain_2": 0.863, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.913, "blimp/accuracy/principle_A_domain_3": 0.678, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.946, "blimp/accuracy/animate_subject_trans": 0.905, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.923, "blimp/accuracy/distractor_agreement_relative_clause": 0.715, "blimp/accuracy/transitive": 0.857, "blimp/accuracy/sentential_subject_island": 0.391, "blimp/accuracy/adjunct_island": 0.826, "blimp/accuracy/intransitive": 0.74, "blimp/accuracy/existential_there_subject_raising": 0.871, "blimp/accuracy/irregular_past_participle_adjectives": 0.984, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.736, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.332, "blimp/accuracy/only_npi_scope": 0.593, "blimp/accuracy/superlative_quantifiers_2": 0.761, "blimp/accuracy/passive_1": 0.911, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.928, "blimp/accuracy/inchoative": 0.617, "blimp/accuracy/anaphor_gender_agreement": 0.985, "blimp/accuracy/principle_A_c_command": 0.634, "blimp/accuracy/only_npi_licensor_present": 0.794, "blimp/accuracy/expletive_it_object_raising": 0.772, "blimp/accuracy/left_branch_island_simple_question": 0.788, "blimp/accuracy/wh_questions_subject_gap": 0.947, "blimp/accuracy/existential_there_quantifiers_2": 0.496, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.948, "blimp/accuracy/sentential_negation_npi_scope": 0.772, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.865, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.88, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.915, "blimp/accuracy/principle_A_case_2": 0.945, "blimp/accuracy/distractor_agreement_relational_noun": 0.877, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.985, "blimp/accuracy/superlative_quantifiers_1": 0.749, "blimp/accuracy/wh_island": 0.833, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.583, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.975, "blimp/accuracy/irregular_past_participle_verbs": 0.894, "blimp/accuracy/drop_argument": 0.738, "blimp/accuracy/wh_questions_object_gap": 0.832, "blimp/accuracy/animate_subject_passive": 0.781, "blimp/accuracy/existential_there_quantifiers_1": 0.981, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.898, "blimp/accuracy/npi_present_2": 0.578, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.947, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.967, "blimp/accuracy/existential_there_object_raising": 0.838, "blimp/accuracy/matrix_question_npi_licensor_present": 0.346, "blimp/accuracy/npi_present_1": 0.615, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.589, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.977, "blimp/accuracy/causative": 0.76, "blimp/accuracy/group_average": 0.8047611940298506, "blimp/accuracy/seq_average": 0.8047611940298508, "cbt/accuracy/NE": 0.8149038461538461, "cbt/accuracy/V": 0.9356, "cbt/accuracy/CN": 0.8796, "cbt/accuracy/P": 0.9176, "cbt/accuracy/group_average": 0.8869259615384615, "cbt/accuracy/seq_average": 0.8869547819127651, "hellaswag/accuracy/val": 0.34783907588129853, "hellaswag/accuracy/group_average": 0.34783907588129853, "hellaswag/accuracy/seq_average": 0.34783907588129853, "piqa/accuracy/val": 0.6267682263329706, "piqa/accuracy/group_average": 0.6267682263329706, "piqa/accuracy/seq_average": 0.6267682263329706, "ai2arc/accuracy/ARC-Easy": 0.37251585623678646, "ai2arc/accuracy/ARC-Challenge": 0.22746781115879827, "ai2arc/accuracy/group_average": 0.29999183369779236, "ai2arc/accuracy/seq_average": 0.3246458923512748, "race/accuracy/test/high": 0.292166952544311, "race/accuracy/test/middle": 0.3635097493036212, "race/accuracy/group_average": 0.32783835092396607, "race/accuracy/seq_average": 0.31293068504256183, "siqa/accuracy/dev": 0.372057318321392, "siqa/accuracy/group_average": 0.372057318321392, "siqa/accuracy/seq_average": 0.372057318321392, "commonsenseqa/accuracy/dev_rand_split": 0.2800982800982801, "commonsenseqa/accuracy/group_average": 0.2800982800982801, "commonsenseqa/accuracy/seq_average": 0.2800982800982801}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-220000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3320390004960316, "val/accuracy": 0.51617431640625, "val/perplexity": 10.298919640845535, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3780092985733696, "lambada/accuracy/total": 0.35403726708074534, "lambada/accuracy/openai_last_token": 0.7954192546583851, "lambada/perplexity": 7.160020246025561, "lambada/lm_loss": 2.9337226015844515, "lambada/lm_perplexity": 18.797475926327973, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.43510579174349767, "mean_loss": 2.3550241495347004, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.99, "blimp/accuracy/ellipsis_n_bar_1": 0.861, "blimp/accuracy/tough_vs_raising_2": 0.844, "blimp/accuracy/tough_vs_raising_1": 0.656, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.921, "blimp/accuracy/principle_A_reconstruction": 0.394, "blimp/accuracy/wh_vs_that_with_gap": 0.542, "blimp/accuracy/principle_A_domain_2": 0.887, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.916, "blimp/accuracy/principle_A_domain_3": 0.653, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.937, "blimp/accuracy/animate_subject_trans": 0.914, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.903, "blimp/accuracy/distractor_agreement_relative_clause": 0.682, "blimp/accuracy/transitive": 0.858, "blimp/accuracy/sentential_subject_island": 0.381, "blimp/accuracy/adjunct_island": 0.833, "blimp/accuracy/intransitive": 0.775, "blimp/accuracy/existential_there_subject_raising": 0.869, "blimp/accuracy/irregular_past_participle_adjectives": 0.983, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.629, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.361, "blimp/accuracy/only_npi_scope": 0.689, "blimp/accuracy/superlative_quantifiers_2": 0.692, "blimp/accuracy/passive_1": 0.902, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.927, "blimp/accuracy/inchoative": 0.648, "blimp/accuracy/anaphor_gender_agreement": 0.98, "blimp/accuracy/principle_A_c_command": 0.623, "blimp/accuracy/only_npi_licensor_present": 0.746, "blimp/accuracy/expletive_it_object_raising": 0.788, "blimp/accuracy/left_branch_island_simple_question": 0.645, "blimp/accuracy/wh_questions_subject_gap": 0.923, "blimp/accuracy/existential_there_quantifiers_2": 0.516, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.949, "blimp/accuracy/sentential_negation_npi_scope": 0.789, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.827, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.862, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.898, "blimp/accuracy/principle_A_case_2": 0.949, "blimp/accuracy/distractor_agreement_relational_noun": 0.885, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.975, "blimp/accuracy/superlative_quantifiers_1": 0.584, "blimp/accuracy/wh_island": 0.856, "blimp/accuracy/principle_A_domain_1": 0.985, "blimp/accuracy/complex_NP_island": 0.567, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.968, "blimp/accuracy/irregular_past_participle_verbs": 0.877, "blimp/accuracy/drop_argument": 0.731, "blimp/accuracy/wh_questions_object_gap": 0.804, "blimp/accuracy/animate_subject_passive": 0.781, "blimp/accuracy/existential_there_quantifiers_1": 0.976, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.896, "blimp/accuracy/npi_present_2": 0.601, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.947, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.968, "blimp/accuracy/existential_there_object_raising": 0.847, "blimp/accuracy/matrix_question_npi_licensor_present": 0.385, "blimp/accuracy/npi_present_1": 0.614, "blimp/accuracy/wh_vs_that_no_gap": 0.978, "blimp/accuracy/left_branch_island_echo_question": 0.476, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.963, "blimp/accuracy/causative": 0.765, "blimp/accuracy/group_average": 0.7980447761194032, "blimp/accuracy/seq_average": 0.7980447761194029, "cbt/accuracy/NE": 0.8100961538461539, "cbt/accuracy/V": 0.9368, "cbt/accuracy/CN": 0.8868, "cbt/accuracy/P": 0.9196, "cbt/accuracy/group_average": 0.8883240384615385, "cbt/accuracy/seq_average": 0.8883553421368547, "hellaswag/accuracy/val": 0.35012945628360886, "hellaswag/accuracy/group_average": 0.35012945628360886, "hellaswag/accuracy/seq_average": 0.35012945628360886, "piqa/accuracy/val": 0.6256800870511425, "piqa/accuracy/group_average": 0.6256800870511425, "piqa/accuracy/seq_average": 0.6256800870511425, "ai2arc/accuracy/ARC-Easy": 0.3792811839323467, "ai2arc/accuracy/ARC-Challenge": 0.2334763948497854, "ai2arc/accuracy/group_average": 0.30637878939106605, "ai2arc/accuracy/seq_average": 0.3311614730878187, "race/accuracy/test/high": 0.292166952544311, "race/accuracy/test/middle": 0.38091922005571033, "race/accuracy/group_average": 0.3365430863000107, "race/accuracy/seq_average": 0.31799756789623024, "siqa/accuracy/dev": 0.37768679631525076, "siqa/accuracy/group_average": 0.37768679631525076, "siqa/accuracy/seq_average": 0.37768679631525076, "commonsenseqa/accuracy/dev_rand_split": 0.2858312858312858, "commonsenseqa/accuracy/group_average": 0.2858312858312858, "commonsenseqa/accuracy/seq_average": 0.2858312858312858}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-240000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.318146236359127, "val/accuracy": 0.517516121031746, "val/perplexity": 10.156828484269868, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3315024050126163, "lambada/accuracy/total": 0.37247670807453415, "lambada/accuracy/openai_last_token": 0.8022127329192547, "lambada/perplexity": 6.6602017354152645, "lambada/lm_loss": 2.918840607799054, "lambada/lm_perplexity": 18.51980329236516, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4449964145531401, "mean_loss": 2.3248243206858716, "blimp/accuracy/passive_2": 0.904, "blimp/accuracy/determiner_noun_agreement_2": 0.987, "blimp/accuracy/ellipsis_n_bar_1": 0.857, "blimp/accuracy/tough_vs_raising_2": 0.885, "blimp/accuracy/tough_vs_raising_1": 0.608, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.918, "blimp/accuracy/principle_A_reconstruction": 0.391, "blimp/accuracy/wh_vs_that_with_gap": 0.5, "blimp/accuracy/principle_A_domain_2": 0.9, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.928, "blimp/accuracy/principle_A_domain_3": 0.651, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.946, "blimp/accuracy/animate_subject_trans": 0.906, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.915, "blimp/accuracy/distractor_agreement_relative_clause": 0.754, "blimp/accuracy/transitive": 0.88, "blimp/accuracy/sentential_subject_island": 0.369, "blimp/accuracy/adjunct_island": 0.858, "blimp/accuracy/intransitive": 0.785, "blimp/accuracy/existential_there_subject_raising": 0.876, "blimp/accuracy/irregular_past_participle_adjectives": 0.964, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.684, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.322, "blimp/accuracy/only_npi_scope": 0.621, "blimp/accuracy/superlative_quantifiers_2": 0.806, "blimp/accuracy/passive_1": 0.9, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.929, "blimp/accuracy/inchoative": 0.655, "blimp/accuracy/anaphor_gender_agreement": 0.978, "blimp/accuracy/principle_A_c_command": 0.671, "blimp/accuracy/only_npi_licensor_present": 0.845, "blimp/accuracy/expletive_it_object_raising": 0.779, "blimp/accuracy/left_branch_island_simple_question": 0.727, "blimp/accuracy/wh_questions_subject_gap": 0.95, "blimp/accuracy/existential_there_quantifiers_2": 0.556, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.95, "blimp/accuracy/sentential_negation_npi_scope": 0.783, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.813, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.896, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.919, "blimp/accuracy/principle_A_case_2": 0.952, "blimp/accuracy/distractor_agreement_relational_noun": 0.893, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.979, "blimp/accuracy/superlative_quantifiers_1": 0.757, "blimp/accuracy/wh_island": 0.847, "blimp/accuracy/principle_A_domain_1": 0.986, "blimp/accuracy/complex_NP_island": 0.586, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.981, "blimp/accuracy/irregular_past_participle_verbs": 0.895, "blimp/accuracy/drop_argument": 0.753, "blimp/accuracy/wh_questions_object_gap": 0.841, "blimp/accuracy/animate_subject_passive": 0.791, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.916, "blimp/accuracy/npi_present_2": 0.595, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.947, "blimp/accuracy/anaphor_number_agreement": 0.993, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.963, "blimp/accuracy/existential_there_object_raising": 0.843, "blimp/accuracy/matrix_question_npi_licensor_present": 0.367, "blimp/accuracy/npi_present_1": 0.567, "blimp/accuracy/wh_vs_that_no_gap": 0.981, "blimp/accuracy/left_branch_island_echo_question": 0.479, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974, "blimp/accuracy/causative": 0.765, "blimp/accuracy/group_average": 0.8088656716417909, "blimp/accuracy/seq_average": 0.808865671641791, "cbt/accuracy/NE": 0.8201121794871795, "cbt/accuracy/V": 0.9408, "cbt/accuracy/CN": 0.8892, "cbt/accuracy/P": 0.9128, "cbt/accuracy/group_average": 0.8907280448717949, "cbt/accuracy/seq_average": 0.8907563025210085, "hellaswag/accuracy/val": 0.35261900019916353, "hellaswag/accuracy/group_average": 0.35261900019916353, "hellaswag/accuracy/seq_average": 0.35261900019916353, "piqa/accuracy/val": 0.6398258977149075, "piqa/accuracy/group_average": 0.6398258977149075, "piqa/accuracy/seq_average": 0.6398258977149075, "ai2arc/accuracy/ARC-Easy": 0.3708245243128964, "ai2arc/accuracy/ARC-Challenge": 0.22832618025751072, "ai2arc/accuracy/group_average": 0.2995753522852036, "ai2arc/accuracy/seq_average": 0.32379603399433426, "race/accuracy/test/high": 0.29588336192109777, "race/accuracy/test/middle": 0.3725626740947075, "race/accuracy/group_average": 0.33422301800790266, "race/accuracy/seq_average": 0.318200243210377, "siqa/accuracy/dev": 0.3664278403275333, "siqa/accuracy/group_average": 0.3664278403275333, "siqa/accuracy/seq_average": 0.3664278403275333, "commonsenseqa/accuracy/dev_rand_split": 0.276003276003276, "commonsenseqa/accuracy/group_average": 0.276003276003276, "commonsenseqa/accuracy/seq_average": 0.276003276003276}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-260000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3059869191003224, "val/accuracy": 0.5199410574776786, "val/perplexity": 10.034076188835163, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.260860466809006, "lambada/accuracy/total": 0.3759704968944099, "lambada/accuracy/openai_last_token": 0.8014363354037267, "lambada/perplexity": 6.571177127912787, "lambada/lm_loss": 2.890877514075049, "lambada/lm_perplexity": 18.009105913711878, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4479557771860443, "mean_loss": 2.2834236929546643, "blimp/accuracy/passive_2": 0.892, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.805, "blimp/accuracy/tough_vs_raising_2": 0.857, "blimp/accuracy/tough_vs_raising_1": 0.659, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.932, "blimp/accuracy/principle_A_reconstruction": 0.368, "blimp/accuracy/wh_vs_that_with_gap": 0.524, "blimp/accuracy/principle_A_domain_2": 0.884, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.925, "blimp/accuracy/principle_A_domain_3": 0.663, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.947, "blimp/accuracy/animate_subject_trans": 0.91, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.916, "blimp/accuracy/distractor_agreement_relative_clause": 0.714, "blimp/accuracy/transitive": 0.875, "blimp/accuracy/sentential_subject_island": 0.365, "blimp/accuracy/adjunct_island": 0.839, "blimp/accuracy/intransitive": 0.759, "blimp/accuracy/existential_there_subject_raising": 0.881, "blimp/accuracy/irregular_past_participle_adjectives": 0.992, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.696, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.333, "blimp/accuracy/only_npi_scope": 0.597, "blimp/accuracy/superlative_quantifiers_2": 0.815, "blimp/accuracy/passive_1": 0.907, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.945, "blimp/accuracy/inchoative": 0.64, "blimp/accuracy/anaphor_gender_agreement": 0.981, "blimp/accuracy/principle_A_c_command": 0.687, "blimp/accuracy/only_npi_licensor_present": 0.503, "blimp/accuracy/expletive_it_object_raising": 0.778, "blimp/accuracy/left_branch_island_simple_question": 0.728, "blimp/accuracy/wh_questions_subject_gap": 0.948, "blimp/accuracy/existential_there_quantifiers_2": 0.54, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.955, "blimp/accuracy/sentential_negation_npi_scope": 0.785, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.81, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.875, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.911, "blimp/accuracy/principle_A_case_2": 0.933, "blimp/accuracy/distractor_agreement_relational_noun": 0.899, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.985, "blimp/accuracy/superlative_quantifiers_1": 0.747, "blimp/accuracy/wh_island": 0.837, "blimp/accuracy/principle_A_domain_1": 0.985, "blimp/accuracy/complex_NP_island": 0.57, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.985, "blimp/accuracy/irregular_past_participle_verbs": 0.872, "blimp/accuracy/drop_argument": 0.749, "blimp/accuracy/wh_questions_object_gap": 0.828, "blimp/accuracy/animate_subject_passive": 0.784, "blimp/accuracy/existential_there_quantifiers_1": 0.968, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.921, "blimp/accuracy/npi_present_2": 0.658, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.958, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.976, "blimp/accuracy/existential_there_object_raising": 0.829, "blimp/accuracy/matrix_question_npi_licensor_present": 0.449, "blimp/accuracy/npi_present_1": 0.648, "blimp/accuracy/wh_vs_that_no_gap": 0.977, "blimp/accuracy/left_branch_island_echo_question": 0.471, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.759, "blimp/accuracy/group_average": 0.804, "blimp/accuracy/seq_average": 0.804, "cbt/accuracy/NE": 0.8205128205128205, "cbt/accuracy/V": 0.9376, "cbt/accuracy/CN": 0.8896, "cbt/accuracy/P": 0.92, "cbt/accuracy/group_average": 0.8919282051282051, "cbt/accuracy/seq_average": 0.8919567827130852, "hellaswag/accuracy/val": 0.3571997610037841, "hellaswag/accuracy/group_average": 0.3571997610037841, "hellaswag/accuracy/seq_average": 0.3571997610037841, "piqa/accuracy/val": 0.6300326441784548, "piqa/accuracy/group_average": 0.6300326441784548, "piqa/accuracy/seq_average": 0.6300326441784548, "ai2arc/accuracy/ARC-Easy": 0.37293868921775897, "ai2arc/accuracy/ARC-Challenge": 0.23776824034334765, "ai2arc/accuracy/group_average": 0.3053534647805533, "ai2arc/accuracy/seq_average": 0.328328611898017, "race/accuracy/test/high": 0.2861635220125786, "race/accuracy/test/middle": 0.37186629526462395, "race/accuracy/group_average": 0.3290149086386013, "race/accuracy/seq_average": 0.3111066072152412, "siqa/accuracy/dev": 0.3679631525076766, "siqa/accuracy/group_average": 0.3679631525076766, "siqa/accuracy/seq_average": 0.3679631525076766, "commonsenseqa/accuracy/dev_rand_split": 0.27354627354627353, "commonsenseqa/accuracy/group_average": 0.27354627354627353, "commonsenseqa/accuracy/seq_average": 0.27354627354627353}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-280000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2941119481646823, "val/accuracy": 0.5220840696304564, "val/perplexity": 9.915626510896043, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.290635387349573, "lambada/accuracy/total": 0.3761645962732919, "lambada/accuracy/openai_last_token": 0.8039596273291926, "lambada/perplexity": 6.517616348836059, "lambada/lm_loss": 2.8839167293504837, "lambada/lm_perplexity": 17.88418368712728, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.44912433295187415, "mean_loss": 2.2923736677571274, "blimp/accuracy/passive_2": 0.894, "blimp/accuracy/determiner_noun_agreement_2": 0.986, "blimp/accuracy/ellipsis_n_bar_1": 0.834, "blimp/accuracy/tough_vs_raising_2": 0.886, "blimp/accuracy/tough_vs_raising_1": 0.639, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.925, "blimp/accuracy/principle_A_reconstruction": 0.392, "blimp/accuracy/wh_vs_that_with_gap": 0.511, "blimp/accuracy/principle_A_domain_2": 0.888, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.924, "blimp/accuracy/principle_A_domain_3": 0.66, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.937, "blimp/accuracy/animate_subject_trans": 0.897, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.917, "blimp/accuracy/distractor_agreement_relative_clause": 0.711, "blimp/accuracy/transitive": 0.881, "blimp/accuracy/sentential_subject_island": 0.363, "blimp/accuracy/adjunct_island": 0.834, "blimp/accuracy/intransitive": 0.769, "blimp/accuracy/existential_there_subject_raising": 0.874, "blimp/accuracy/irregular_past_participle_adjectives": 0.927, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.677, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.329, "blimp/accuracy/only_npi_scope": 0.585, "blimp/accuracy/superlative_quantifiers_2": 0.787, "blimp/accuracy/passive_1": 0.894, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.931, "blimp/accuracy/inchoative": 0.658, "blimp/accuracy/anaphor_gender_agreement": 0.977, "blimp/accuracy/principle_A_c_command": 0.671, "blimp/accuracy/only_npi_licensor_present": 0.677, "blimp/accuracy/expletive_it_object_raising": 0.769, "blimp/accuracy/left_branch_island_simple_question": 0.707, "blimp/accuracy/wh_questions_subject_gap": 0.947, "blimp/accuracy/existential_there_quantifiers_2": 0.553, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.957, "blimp/accuracy/sentential_negation_npi_scope": 0.731, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.822, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.893, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.911, "blimp/accuracy/principle_A_case_2": 0.94, "blimp/accuracy/distractor_agreement_relational_noun": 0.894, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.986, "blimp/accuracy/superlative_quantifiers_1": 0.711, "blimp/accuracy/wh_island": 0.835, "blimp/accuracy/principle_A_domain_1": 0.99, "blimp/accuracy/complex_NP_island": 0.578, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.991, "blimp/accuracy/irregular_past_participle_verbs": 0.891, "blimp/accuracy/drop_argument": 0.737, "blimp/accuracy/wh_questions_object_gap": 0.839, "blimp/accuracy/animate_subject_passive": 0.792, "blimp/accuracy/existential_there_quantifiers_1": 0.97, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.906, "blimp/accuracy/npi_present_2": 0.605, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.954, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.969, "blimp/accuracy/existential_there_object_raising": 0.822, "blimp/accuracy/matrix_question_npi_licensor_present": 0.415, "blimp/accuracy/npi_present_1": 0.614, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.507, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.765, "blimp/accuracy/group_average": 0.8024925373134326, "blimp/accuracy/seq_average": 0.8024925373134328, "cbt/accuracy/NE": 0.8233173076923077, "cbt/accuracy/V": 0.9416, "cbt/accuracy/CN": 0.8908, "cbt/accuracy/P": 0.916, "cbt/accuracy/group_average": 0.8929293269230769, "cbt/accuracy/seq_average": 0.8929571828731493, "hellaswag/accuracy/val": 0.35899223262298346, "hellaswag/accuracy/group_average": 0.35899223262298346, "hellaswag/accuracy/seq_average": 0.35899223262298346, "piqa/accuracy/val": 0.6305767138193689, "piqa/accuracy/group_average": 0.6305767138193689, "piqa/accuracy/seq_average": 0.6305767138193689, "ai2arc/accuracy/ARC-Easy": 0.3758985200845666, "ai2arc/accuracy/ARC-Challenge": 0.23004291845493563, "ai2arc/accuracy/group_average": 0.3029707192697511, "ai2arc/accuracy/seq_average": 0.3277620396600567, "race/accuracy/test/high": 0.2933104631217839, "race/accuracy/test/middle": 0.36559888579387184, "race/accuracy/group_average": 0.32945467445782783, "race/accuracy/seq_average": 0.31434941224158897, "siqa/accuracy/dev": 0.3710337768679631, "siqa/accuracy/group_average": 0.3710337768679631, "siqa/accuracy/seq_average": 0.3710337768679631, "commonsenseqa/accuracy/dev_rand_split": 0.27927927927927926, "commonsenseqa/accuracy/group_average": 0.27927927927927926, "commonsenseqa/accuracy/seq_average": 0.27927927927927926}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-300000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2852373880053323, "val/accuracy": 0.5237630208333334, "val/perplexity": 9.828019000946783, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3717890200407608, "lambada/accuracy/total": 0.36490683229813664, "lambada/accuracy/openai_last_token": 0.8035714285714286, "lambada/perplexity": 6.65080014028089, "lambada/lm_loss": 2.8812355556579914, "lambada/lm_perplexity": 17.836297308849186, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.44433492656573503, "mean_loss": 2.3285132040230465, "blimp/accuracy/passive_2": 0.896, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.834, "blimp/accuracy/tough_vs_raising_2": 0.865, "blimp/accuracy/tough_vs_raising_1": 0.627, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.939, "blimp/accuracy/principle_A_reconstruction": 0.398, "blimp/accuracy/wh_vs_that_with_gap": 0.536, "blimp/accuracy/principle_A_domain_2": 0.888, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.92, "blimp/accuracy/principle_A_domain_3": 0.692, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.941, "blimp/accuracy/animate_subject_trans": 0.921, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.914, "blimp/accuracy/distractor_agreement_relative_clause": 0.688, "blimp/accuracy/transitive": 0.875, "blimp/accuracy/sentential_subject_island": 0.386, "blimp/accuracy/adjunct_island": 0.833, "blimp/accuracy/intransitive": 0.766, "blimp/accuracy/existential_there_subject_raising": 0.867, "blimp/accuracy/irregular_past_participle_adjectives": 0.985, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.706, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.4, "blimp/accuracy/only_npi_scope": 0.674, "blimp/accuracy/superlative_quantifiers_2": 0.837, "blimp/accuracy/passive_1": 0.892, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.933, "blimp/accuracy/inchoative": 0.65, "blimp/accuracy/anaphor_gender_agreement": 0.981, "blimp/accuracy/principle_A_c_command": 0.668, "blimp/accuracy/only_npi_licensor_present": 0.761, "blimp/accuracy/expletive_it_object_raising": 0.786, "blimp/accuracy/left_branch_island_simple_question": 0.735, "blimp/accuracy/wh_questions_subject_gap": 0.943, "blimp/accuracy/existential_there_quantifiers_2": 0.527, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.959, "blimp/accuracy/sentential_negation_npi_scope": 0.737, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.843, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.872, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.918, "blimp/accuracy/principle_A_case_2": 0.942, "blimp/accuracy/distractor_agreement_relational_noun": 0.856, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.984, "blimp/accuracy/superlative_quantifiers_1": 0.765, "blimp/accuracy/wh_island": 0.838, "blimp/accuracy/principle_A_domain_1": 0.991, "blimp/accuracy/complex_NP_island": 0.564, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.978, "blimp/accuracy/irregular_past_participle_verbs": 0.886, "blimp/accuracy/drop_argument": 0.746, "blimp/accuracy/wh_questions_object_gap": 0.837, "blimp/accuracy/animate_subject_passive": 0.783, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.918, "blimp/accuracy/npi_present_2": 0.599, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.955, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.974, "blimp/accuracy/existential_there_object_raising": 0.847, "blimp/accuracy/matrix_question_npi_licensor_present": 0.418, "blimp/accuracy/npi_present_1": 0.627, "blimp/accuracy/wh_vs_that_no_gap": 0.974, "blimp/accuracy/left_branch_island_echo_question": 0.495, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.96, "blimp/accuracy/causative": 0.766, "blimp/accuracy/group_average": 0.8097164179104476, "blimp/accuracy/seq_average": 0.8097164179104478, "cbt/accuracy/NE": 0.8269230769230769, "cbt/accuracy/V": 0.9416, "cbt/accuracy/CN": 0.8864, "cbt/accuracy/P": 0.9204, "cbt/accuracy/group_average": 0.8938307692307692, "cbt/accuracy/seq_average": 0.8938575430172069, "hellaswag/accuracy/val": 0.3648675562636925, "hellaswag/accuracy/group_average": 0.3648675562636925, "hellaswag/accuracy/seq_average": 0.3648675562636925, "piqa/accuracy/val": 0.6311207834602829, "piqa/accuracy/group_average": 0.6311207834602829, "piqa/accuracy/seq_average": 0.6311207834602829, "ai2arc/accuracy/ARC-Easy": 0.3856236786469345, "ai2arc/accuracy/ARC-Challenge": 0.23776824034334765, "ai2arc/accuracy/group_average": 0.31169595949514106, "ai2arc/accuracy/seq_average": 0.33682719546742207, "race/accuracy/test/high": 0.29588336192109777, "race/accuracy/test/middle": 0.36629526462395545, "race/accuracy/group_average": 0.3310893132725266, "race/accuracy/seq_average": 0.31637616538305635, "siqa/accuracy/dev": 0.37001023541453426, "siqa/accuracy/group_average": 0.37001023541453426, "siqa/accuracy/seq_average": 0.37001023541453426, "commonsenseqa/accuracy/dev_rand_split": 0.27764127764127766, "commonsenseqa/accuracy/group_average": 0.27764127764127766, "commonsenseqa/accuracy/seq_average": 0.27764127764127766}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-320000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2766522604321677, "val/accuracy": 0.5242416139632936, "val/perplexity": 9.744005353984791, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3273525830381407, "lambada/accuracy/total": 0.3717003105590062, "lambada/accuracy/openai_last_token": 0.8057065217391305, "lambada/perplexity": 6.561656918470149, "lambada/lm_loss": 2.877079828103163, "lambada/lm_perplexity": 17.762328320582178, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4479709622611499, "mean_loss": 2.3020024217351542, "blimp/accuracy/passive_2": 0.902, "blimp/accuracy/determiner_noun_agreement_2": 0.986, "blimp/accuracy/ellipsis_n_bar_1": 0.847, "blimp/accuracy/tough_vs_raising_2": 0.885, "blimp/accuracy/tough_vs_raising_1": 0.633, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.926, "blimp/accuracy/principle_A_reconstruction": 0.433, "blimp/accuracy/wh_vs_that_with_gap": 0.506, "blimp/accuracy/principle_A_domain_2": 0.886, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.936, "blimp/accuracy/principle_A_domain_3": 0.693, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.934, "blimp/accuracy/animate_subject_trans": 0.909, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.909, "blimp/accuracy/distractor_agreement_relative_clause": 0.697, "blimp/accuracy/transitive": 0.889, "blimp/accuracy/sentential_subject_island": 0.393, "blimp/accuracy/adjunct_island": 0.828, "blimp/accuracy/intransitive": 0.791, "blimp/accuracy/existential_there_subject_raising": 0.882, "blimp/accuracy/irregular_past_participle_adjectives": 0.901, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.693, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.338, "blimp/accuracy/only_npi_scope": 0.558, "blimp/accuracy/superlative_quantifiers_2": 0.845, "blimp/accuracy/passive_1": 0.904, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.939, "blimp/accuracy/inchoative": 0.658, "blimp/accuracy/anaphor_gender_agreement": 0.982, "blimp/accuracy/principle_A_c_command": 0.643, "blimp/accuracy/only_npi_licensor_present": 0.668, "blimp/accuracy/expletive_it_object_raising": 0.772, "blimp/accuracy/left_branch_island_simple_question": 0.749, "blimp/accuracy/wh_questions_subject_gap": 0.951, "blimp/accuracy/existential_there_quantifiers_2": 0.564, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.959, "blimp/accuracy/sentential_negation_npi_scope": 0.734, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.85, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.885, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.917, "blimp/accuracy/principle_A_case_2": 0.953, "blimp/accuracy/distractor_agreement_relational_noun": 0.886, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.72, "blimp/accuracy/wh_island": 0.836, "blimp/accuracy/principle_A_domain_1": 0.985, "blimp/accuracy/complex_NP_island": 0.568, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.972, "blimp/accuracy/irregular_past_participle_verbs": 0.896, "blimp/accuracy/drop_argument": 0.751, "blimp/accuracy/wh_questions_object_gap": 0.849, "blimp/accuracy/animate_subject_passive": 0.782, "blimp/accuracy/existential_there_quantifiers_1": 0.97, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.92, "blimp/accuracy/npi_present_2": 0.639, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.951, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.974, "blimp/accuracy/existential_there_object_raising": 0.85, "blimp/accuracy/matrix_question_npi_licensor_present": 0.413, "blimp/accuracy/npi_present_1": 0.664, "blimp/accuracy/wh_vs_that_no_gap": 0.981, "blimp/accuracy/left_branch_island_echo_question": 0.483, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.971, "blimp/accuracy/causative": 0.774, "blimp/accuracy/group_average": 0.8079999999999997, "blimp/accuracy/seq_average": 0.808, "cbt/accuracy/NE": 0.8257211538461539, "cbt/accuracy/V": 0.9432, "cbt/accuracy/CN": 0.888, "cbt/accuracy/P": 0.9252, "cbt/accuracy/group_average": 0.8955302884615384, "cbt/accuracy/seq_average": 0.8955582232893158, "hellaswag/accuracy/val": 0.36227843059151565, "hellaswag/accuracy/group_average": 0.36227843059151565, "hellaswag/accuracy/seq_average": 0.36227843059151565, "piqa/accuracy/val": 0.6409140369967355, "piqa/accuracy/group_average": 0.6409140369967355, "piqa/accuracy/seq_average": 0.6409140369967355, "ai2arc/accuracy/ARC-Easy": 0.3813953488372093, "ai2arc/accuracy/ARC-Challenge": 0.23433476394849787, "ai2arc/accuracy/group_average": 0.3078650563928536, "ai2arc/accuracy/seq_average": 0.3328611898016997, "race/accuracy/test/high": 0.2964551172098342, "race/accuracy/test/middle": 0.37604456824512533, "race/accuracy/group_average": 0.33624984272747976, "race/accuracy/seq_average": 0.3196189704094041, "siqa/accuracy/dev": 0.372057318321392, "siqa/accuracy/group_average": 0.372057318321392, "siqa/accuracy/seq_average": 0.372057318321392, "commonsenseqa/accuracy/dev_rand_split": 0.27764127764127766, "commonsenseqa/accuracy/group_average": 0.27764127764127766, "commonsenseqa/accuracy/seq_average": 0.27764127764127766}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-340000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2680601089719743, "val/accuracy": 0.5254729740203373, "val/perplexity": 9.660642032156042, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.31619888210889, "lambada/accuracy/total": 0.40042701863354035, "lambada/accuracy/openai_last_token": 0.811141304347826, "lambada/perplexity": 6.113701686798763, "lambada/lm_loss": 2.8670654335240737, "lambada/lm_perplexity": 17.585337065443436, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.46294999632693884, "mean_loss": 2.292129495540432, "blimp/accuracy/passive_2": 0.911, "blimp/accuracy/determiner_noun_agreement_2": 0.985, "blimp/accuracy/ellipsis_n_bar_1": 0.836, "blimp/accuracy/tough_vs_raising_2": 0.879, "blimp/accuracy/tough_vs_raising_1": 0.632, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.931, "blimp/accuracy/principle_A_reconstruction": 0.437, "blimp/accuracy/wh_vs_that_with_gap": 0.516, "blimp/accuracy/principle_A_domain_2": 0.883, "blimp/accuracy/determiner_noun_agreement_1": 0.996, "blimp/accuracy/ellipsis_n_bar_2": 0.933, "blimp/accuracy/principle_A_domain_3": 0.697, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.95, "blimp/accuracy/animate_subject_trans": 0.92, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.924, "blimp/accuracy/distractor_agreement_relative_clause": 0.708, "blimp/accuracy/transitive": 0.879, "blimp/accuracy/sentential_subject_island": 0.369, "blimp/accuracy/adjunct_island": 0.845, "blimp/accuracy/intransitive": 0.765, "blimp/accuracy/existential_there_subject_raising": 0.884, "blimp/accuracy/irregular_past_participle_adjectives": 0.864, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.722, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.348, "blimp/accuracy/only_npi_scope": 0.589, "blimp/accuracy/superlative_quantifiers_2": 0.872, "blimp/accuracy/passive_1": 0.901, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.945, "blimp/accuracy/inchoative": 0.628, "blimp/accuracy/anaphor_gender_agreement": 0.981, "blimp/accuracy/principle_A_c_command": 0.637, "blimp/accuracy/only_npi_licensor_present": 0.619, "blimp/accuracy/expletive_it_object_raising": 0.786, "blimp/accuracy/left_branch_island_simple_question": 0.748, "blimp/accuracy/wh_questions_subject_gap": 0.951, "blimp/accuracy/existential_there_quantifiers_2": 0.586, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.958, "blimp/accuracy/sentential_negation_npi_scope": 0.76, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.836, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.879, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.918, "blimp/accuracy/principle_A_case_2": 0.953, "blimp/accuracy/distractor_agreement_relational_noun": 0.878, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987, "blimp/accuracy/superlative_quantifiers_1": 0.778, "blimp/accuracy/wh_island": 0.814, "blimp/accuracy/principle_A_domain_1": 0.986, "blimp/accuracy/complex_NP_island": 0.601, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.987, "blimp/accuracy/irregular_past_participle_verbs": 0.898, "blimp/accuracy/drop_argument": 0.735, "blimp/accuracy/wh_questions_object_gap": 0.849, "blimp/accuracy/animate_subject_passive": 0.79, "blimp/accuracy/existential_there_quantifiers_1": 0.964, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.918, "blimp/accuracy/npi_present_2": 0.64, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.958, "blimp/accuracy/anaphor_number_agreement": 0.993, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.973, "blimp/accuracy/existential_there_object_raising": 0.833, "blimp/accuracy/matrix_question_npi_licensor_present": 0.457, "blimp/accuracy/npi_present_1": 0.615, "blimp/accuracy/wh_vs_that_no_gap": 0.982, "blimp/accuracy/left_branch_island_echo_question": 0.48, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.976, "blimp/accuracy/causative": 0.774, "blimp/accuracy/group_average": 0.8093582089552238, "blimp/accuracy/seq_average": 0.8093582089552239, "cbt/accuracy/NE": 0.8277243589743589, "cbt/accuracy/V": 0.9416, "cbt/accuracy/CN": 0.8912, "cbt/accuracy/P": 0.9224, "cbt/accuracy/group_average": 0.8957310897435897, "cbt/accuracy/seq_average": 0.8957583033213286, "hellaswag/accuracy/val": 0.3685520812587134, "hellaswag/accuracy/group_average": 0.3685520812587134, "hellaswag/accuracy/seq_average": 0.3685520812587134, "piqa/accuracy/val": 0.6381936887921654, "piqa/accuracy/group_average": 0.6381936887921654, "piqa/accuracy/seq_average": 0.6381936887921654, "ai2arc/accuracy/ARC-Easy": 0.3775898520084567, "ai2arc/accuracy/ARC-Challenge": 0.24721030042918454, "ai2arc/accuracy/group_average": 0.3124000762188206, "ai2arc/accuracy/seq_average": 0.33456090651558074, "race/accuracy/test/high": 0.28959405374499714, "race/accuracy/test/middle": 0.37604456824512533, "race/accuracy/group_average": 0.33281931099506123, "race/accuracy/seq_average": 0.31475476286988247, "siqa/accuracy/dev": 0.37154554759467756, "siqa/accuracy/group_average": 0.37154554759467756, "siqa/accuracy/seq_average": 0.37154554759467756, "commonsenseqa/accuracy/dev_rand_split": 0.276003276003276, "commonsenseqa/accuracy/group_average": 0.276003276003276, "commonsenseqa/accuracy/seq_average": 0.276003276003276}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-360000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.261893378363715, "val/accuracy": 0.5271926153273809, "val/perplexity": 9.601250768416493, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.2946964998422943, "lambada/accuracy/total": 0.39285714285714285, "lambada/accuracy/openai_last_token": 0.8099767080745341, "lambada/perplexity": 6.179482793062585, "lambada/lm_loss": 2.8565397904300145, "lambada/lm_perplexity": 17.40121080737189, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.46002487909226186, "mean_loss": 2.278294939103005, "blimp/accuracy/passive_2": 0.901, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.833, "blimp/accuracy/tough_vs_raising_2": 0.884, "blimp/accuracy/tough_vs_raising_1": 0.635, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.931, "blimp/accuracy/principle_A_reconstruction": 0.429, "blimp/accuracy/wh_vs_that_with_gap": 0.523, "blimp/accuracy/principle_A_domain_2": 0.89, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.92, "blimp/accuracy/principle_A_domain_3": 0.671, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.943, "blimp/accuracy/animate_subject_trans": 0.91, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.915, "blimp/accuracy/distractor_agreement_relative_clause": 0.683, "blimp/accuracy/transitive": 0.876, "blimp/accuracy/sentential_subject_island": 0.351, "blimp/accuracy/adjunct_island": 0.849, "blimp/accuracy/intransitive": 0.775, "blimp/accuracy/existential_there_subject_raising": 0.885, "blimp/accuracy/irregular_past_participle_adjectives": 0.902, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.663, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.392, "blimp/accuracy/only_npi_scope": 0.57, "blimp/accuracy/superlative_quantifiers_2": 0.874, "blimp/accuracy/passive_1": 0.886, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.936, "blimp/accuracy/inchoative": 0.651, "blimp/accuracy/anaphor_gender_agreement": 0.98, "blimp/accuracy/principle_A_c_command": 0.657, "blimp/accuracy/only_npi_licensor_present": 0.661, "blimp/accuracy/expletive_it_object_raising": 0.754, "blimp/accuracy/left_branch_island_simple_question": 0.697, "blimp/accuracy/wh_questions_subject_gap": 0.951, "blimp/accuracy/existential_there_quantifiers_2": 0.582, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.959, "blimp/accuracy/sentential_negation_npi_scope": 0.724, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.832, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.878, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.922, "blimp/accuracy/principle_A_case_2": 0.943, "blimp/accuracy/distractor_agreement_relational_noun": 0.871, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.984, "blimp/accuracy/superlative_quantifiers_1": 0.767, "blimp/accuracy/wh_island": 0.813, "blimp/accuracy/principle_A_domain_1": 0.984, "blimp/accuracy/complex_NP_island": 0.573, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.977, "blimp/accuracy/irregular_past_participle_verbs": 0.908, "blimp/accuracy/drop_argument": 0.746, "blimp/accuracy/wh_questions_object_gap": 0.848, "blimp/accuracy/animate_subject_passive": 0.786, "blimp/accuracy/existential_there_quantifiers_1": 0.982, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.92, "blimp/accuracy/npi_present_2": 0.621, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.95, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.97, "blimp/accuracy/existential_there_object_raising": 0.805, "blimp/accuracy/matrix_question_npi_licensor_present": 0.424, "blimp/accuracy/npi_present_1": 0.585, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.491, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.966, "blimp/accuracy/causative": 0.761, "blimp/accuracy/group_average": 0.804492537313433, "blimp/accuracy/seq_average": 0.8044925373134328, "cbt/accuracy/NE": 0.8293269230769231, "cbt/accuracy/V": 0.944, "cbt/accuracy/CN": 0.8908, "cbt/accuracy/P": 0.928, "cbt/accuracy/group_average": 0.8980317307692307, "cbt/accuracy/seq_average": 0.8980592236894758, "hellaswag/accuracy/val": 0.3685520812587134, "hellaswag/accuracy/group_average": 0.3685520812587134, "hellaswag/accuracy/seq_average": 0.3685520812587134, "piqa/accuracy/val": 0.6420021762785637, "piqa/accuracy/group_average": 0.6420021762785637, "piqa/accuracy/seq_average": 0.6420021762785637, "ai2arc/accuracy/ARC-Easy": 0.3775898520084567, "ai2arc/accuracy/ARC-Challenge": 0.2240343347639485, "ai2arc/accuracy/group_average": 0.3008120933862026, "ai2arc/accuracy/seq_average": 0.32691218130311617, "race/accuracy/test/high": 0.29245283018867924, "race/accuracy/test/middle": 0.3732590529247911, "race/accuracy/group_average": 0.33285594155673515, "race/accuracy/seq_average": 0.31597081475476285, "siqa/accuracy/dev": 0.3751279426816786, "siqa/accuracy/group_average": 0.3751279426816786, "siqa/accuracy/seq_average": 0.3751279426816786, "commonsenseqa/accuracy/dev_rand_split": 0.276003276003276, "commonsenseqa/accuracy/group_average": 0.276003276003276, "commonsenseqa/accuracy/seq_average": 0.276003276003276}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-380000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2571287609281994, "val/accuracy": 0.5277225554935516, "val/perplexity": 9.5556132905009, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.283029876140334, "lambada/accuracy/total": 0.3918866459627329, "lambada/accuracy/openai_last_token": 0.8092003105590062, "lambada/perplexity": 6.290781348541941, "lambada/lm_loss": 2.8560877335976183, "lambada/lm_perplexity": 17.39334624888208, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4598046007281423, "mean_loss": 2.2700793185342665, "blimp/accuracy/passive_2": 0.913, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.835, "blimp/accuracy/tough_vs_raising_2": 0.871, "blimp/accuracy/tough_vs_raising_1": 0.625, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.941, "blimp/accuracy/principle_A_reconstruction": 0.41, "blimp/accuracy/wh_vs_that_with_gap": 0.478, "blimp/accuracy/principle_A_domain_2": 0.883, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.931, "blimp/accuracy/principle_A_domain_3": 0.686, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.947, "blimp/accuracy/animate_subject_trans": 0.916, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.919, "blimp/accuracy/distractor_agreement_relative_clause": 0.7, "blimp/accuracy/transitive": 0.878, "blimp/accuracy/sentential_subject_island": 0.373, "blimp/accuracy/adjunct_island": 0.859, "blimp/accuracy/intransitive": 0.777, "blimp/accuracy/existential_there_subject_raising": 0.881, "blimp/accuracy/irregular_past_participle_adjectives": 0.902, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.684, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.347, "blimp/accuracy/only_npi_scope": 0.606, "blimp/accuracy/superlative_quantifiers_2": 0.847, "blimp/accuracy/passive_1": 0.897, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.933, "blimp/accuracy/inchoative": 0.631, "blimp/accuracy/anaphor_gender_agreement": 0.981, "blimp/accuracy/principle_A_c_command": 0.67, "blimp/accuracy/only_npi_licensor_present": 0.664, "blimp/accuracy/expletive_it_object_raising": 0.779, "blimp/accuracy/left_branch_island_simple_question": 0.761, "blimp/accuracy/wh_questions_subject_gap": 0.964, "blimp/accuracy/existential_there_quantifiers_2": 0.505, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.964, "blimp/accuracy/sentential_negation_npi_scope": 0.735, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.827, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.884, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.918, "blimp/accuracy/principle_A_case_2": 0.944, "blimp/accuracy/distractor_agreement_relational_noun": 0.88, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.977, "blimp/accuracy/superlative_quantifiers_1": 0.782, "blimp/accuracy/wh_island": 0.839, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.571, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.974, "blimp/accuracy/irregular_past_participle_verbs": 0.911, "blimp/accuracy/drop_argument": 0.74, "blimp/accuracy/wh_questions_object_gap": 0.851, "blimp/accuracy/animate_subject_passive": 0.786, "blimp/accuracy/existential_there_quantifiers_1": 0.973, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.917, "blimp/accuracy/npi_present_2": 0.63, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.963, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.969, "blimp/accuracy/existential_there_object_raising": 0.825, "blimp/accuracy/matrix_question_npi_licensor_present": 0.427, "blimp/accuracy/npi_present_1": 0.622, "blimp/accuracy/wh_vs_that_no_gap": 0.982, "blimp/accuracy/left_branch_island_echo_question": 0.479, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.97, "blimp/accuracy/causative": 0.766, "blimp/accuracy/group_average": 0.8067611940298508, "blimp/accuracy/seq_average": 0.8067611940298507, "cbt/accuracy/NE": 0.827323717948718, "cbt/accuracy/V": 0.944, "cbt/accuracy/CN": 0.8892, "cbt/accuracy/P": 0.9272, "cbt/accuracy/group_average": 0.8969309294871795, "cbt/accuracy/seq_average": 0.8969587835134054, "hellaswag/accuracy/val": 0.37004580760804623, "hellaswag/accuracy/group_average": 0.37004580760804623, "hellaswag/accuracy/seq_average": 0.37004580760804623, "piqa/accuracy/val": 0.6392818280739935, "piqa/accuracy/group_average": 0.6392818280739935, "piqa/accuracy/seq_average": 0.6392818280739935, "ai2arc/accuracy/ARC-Easy": 0.3864693446088795, "ai2arc/accuracy/ARC-Challenge": 0.22832618025751072, "ai2arc/accuracy/group_average": 0.3073977624331951, "ai2arc/accuracy/seq_average": 0.3342776203966006, "race/accuracy/test/high": 0.2935963407661521, "race/accuracy/test/middle": 0.3774373259052925, "race/accuracy/group_average": 0.3355168333357223, "race/accuracy/seq_average": 0.31799756789623024, "siqa/accuracy/dev": 0.3735926305015353, "siqa/accuracy/group_average": 0.3735926305015353, "siqa/accuracy/seq_average": 0.3735926305015353, "commonsenseqa/accuracy/dev_rand_split": 0.28665028665028663, "commonsenseqa/accuracy/group_average": 0.28665028665028663, "commonsenseqa/accuracy/seq_average": 0.28665028665028663}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-40000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.5870511493985613, "val/accuracy": 0.47924417162698413, "val/perplexity": 13.290521997240674, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.521425993546196, "lambada/accuracy/total": 0.27212732919254656, "lambada/accuracy/openai_last_token": 0.7645574534161491, "lambada/perplexity": 10.580871609837153, "lambada/lm_loss": 3.1651568422442136, "lambada/lm_perplexity": 23.69245971998854, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3756857504097654, "mean_loss": 2.5542385714723785, "blimp/accuracy/passive_2": 0.889, "blimp/accuracy/determiner_noun_agreement_2": 0.989, "blimp/accuracy/ellipsis_n_bar_1": 0.84, "blimp/accuracy/tough_vs_raising_2": 0.851, "blimp/accuracy/tough_vs_raising_1": 0.667, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.922, "blimp/accuracy/principle_A_reconstruction": 0.408, "blimp/accuracy/wh_vs_that_with_gap": 0.492, "blimp/accuracy/principle_A_domain_2": 0.833, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.892, "blimp/accuracy/principle_A_domain_3": 0.628, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.94, "blimp/accuracy/animate_subject_trans": 0.913, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.913, "blimp/accuracy/distractor_agreement_relative_clause": 0.661, "blimp/accuracy/transitive": 0.86, "blimp/accuracy/sentential_subject_island": 0.364, "blimp/accuracy/adjunct_island": 0.812, "blimp/accuracy/intransitive": 0.775, "blimp/accuracy/existential_there_subject_raising": 0.87, "blimp/accuracy/irregular_past_participle_adjectives": 0.949, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.576, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.196, "blimp/accuracy/only_npi_scope": 0.63, "blimp/accuracy/superlative_quantifiers_2": 0.676, "blimp/accuracy/passive_1": 0.897, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.913, "blimp/accuracy/inchoative": 0.659, "blimp/accuracy/anaphor_gender_agreement": 0.973, "blimp/accuracy/principle_A_c_command": 0.574, "blimp/accuracy/only_npi_licensor_present": 0.379, "blimp/accuracy/expletive_it_object_raising": 0.733, "blimp/accuracy/left_branch_island_simple_question": 0.567, "blimp/accuracy/wh_questions_subject_gap": 0.948, "blimp/accuracy/existential_there_quantifiers_2": 0.459, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.95, "blimp/accuracy/sentential_negation_npi_scope": 0.747, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.79, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.917, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.888, "blimp/accuracy/principle_A_case_2": 0.944, "blimp/accuracy/distractor_agreement_relational_noun": 0.878, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.991, "blimp/accuracy/superlative_quantifiers_1": 0.638, "blimp/accuracy/wh_island": 0.796, "blimp/accuracy/principle_A_domain_1": 0.986, "blimp/accuracy/complex_NP_island": 0.549, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.961, "blimp/accuracy/irregular_past_participle_verbs": 0.876, "blimp/accuracy/drop_argument": 0.792, "blimp/accuracy/wh_questions_object_gap": 0.829, "blimp/accuracy/animate_subject_passive": 0.784, "blimp/accuracy/existential_there_quantifiers_1": 0.981, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.921, "blimp/accuracy/npi_present_2": 0.516, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.933, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.964, "blimp/accuracy/existential_there_object_raising": 0.808, "blimp/accuracy/matrix_question_npi_licensor_present": 0.2, "blimp/accuracy/npi_present_1": 0.507, "blimp/accuracy/wh_vs_that_no_gap": 0.986, "blimp/accuracy/left_branch_island_echo_question": 0.528, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.985, "blimp/accuracy/causative": 0.716, "blimp/accuracy/group_average": 0.7759701492537312, "blimp/accuracy/seq_average": 0.7759701492537313, "cbt/accuracy/NE": 0.7592147435897436, "cbt/accuracy/V": 0.9228, "cbt/accuracy/CN": 0.832, "cbt/accuracy/P": 0.8936, "cbt/accuracy/group_average": 0.8519036858974358, "cbt/accuracy/seq_average": 0.8519407763105242, "hellaswag/accuracy/val": 0.3020314678350926, "hellaswag/accuracy/group_average": 0.3020314678350926, "hellaswag/accuracy/seq_average": 0.3020314678350926, "piqa/accuracy/val": 0.5963003264417845, "piqa/accuracy/group_average": 0.5963003264417845, "piqa/accuracy/seq_average": 0.5963003264417845, "ai2arc/accuracy/ARC-Easy": 0.33699788583509516, "ai2arc/accuracy/ARC-Challenge": 0.2128755364806867, "ai2arc/accuracy/group_average": 0.2749367111578909, "ai2arc/accuracy/seq_average": 0.29603399433427763, "race/accuracy/test/high": 0.2741566609491138, "race/accuracy/test/middle": 0.3447075208913649, "race/accuracy/group_average": 0.3094320909202394, "race/accuracy/seq_average": 0.29468990676935547, "siqa/accuracy/dev": 0.36591606960081885, "siqa/accuracy/group_average": 0.36591606960081885, "siqa/accuracy/seq_average": 0.36591606960081885, "commonsenseqa/accuracy/dev_rand_split": 0.257985257985258, "commonsenseqa/accuracy/group_average": 0.257985257985258, "commonsenseqa/accuracy/seq_average": 0.257985257985258}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-400000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2533070397755455, "val/accuracy": 0.5283280629960317, "val/perplexity": 9.519164094756576, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.282633716275233, "lambada/accuracy/total": 0.3843167701863354, "lambada/accuracy/openai_last_token": 0.8092003105590062, "lambada/perplexity": 6.237563227990842, "lambada/lm_loss": 2.8528693822902675, "lambada/lm_perplexity": 17.337458331957613, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.45632241659118356, "mean_loss": 2.2679703780253893, "blimp/accuracy/passive_2": 0.906, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.815, "blimp/accuracy/tough_vs_raising_2": 0.881, "blimp/accuracy/tough_vs_raising_1": 0.632, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.944, "blimp/accuracy/principle_A_reconstruction": 0.409, "blimp/accuracy/wh_vs_that_with_gap": 0.517, "blimp/accuracy/principle_A_domain_2": 0.882, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.933, "blimp/accuracy/principle_A_domain_3": 0.687, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.942, "blimp/accuracy/animate_subject_trans": 0.912, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.914, "blimp/accuracy/distractor_agreement_relative_clause": 0.665, "blimp/accuracy/transitive": 0.884, "blimp/accuracy/sentential_subject_island": 0.371, "blimp/accuracy/adjunct_island": 0.843, "blimp/accuracy/intransitive": 0.768, "blimp/accuracy/existential_there_subject_raising": 0.89, "blimp/accuracy/irregular_past_participle_adjectives": 0.898, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.694, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.339, "blimp/accuracy/only_npi_scope": 0.596, "blimp/accuracy/superlative_quantifiers_2": 0.823, "blimp/accuracy/passive_1": 0.901, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.927, "blimp/accuracy/inchoative": 0.641, "blimp/accuracy/anaphor_gender_agreement": 0.981, "blimp/accuracy/principle_A_c_command": 0.64, "blimp/accuracy/only_npi_licensor_present": 0.612, "blimp/accuracy/expletive_it_object_raising": 0.782, "blimp/accuracy/left_branch_island_simple_question": 0.744, "blimp/accuracy/wh_questions_subject_gap": 0.946, "blimp/accuracy/existential_there_quantifiers_2": 0.496, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.969, "blimp/accuracy/sentential_negation_npi_scope": 0.756, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.836, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.894, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.914, "blimp/accuracy/principle_A_case_2": 0.958, "blimp/accuracy/distractor_agreement_relational_noun": 0.878, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987, "blimp/accuracy/superlative_quantifiers_1": 0.762, "blimp/accuracy/wh_island": 0.839, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.572, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.973, "blimp/accuracy/irregular_past_participle_verbs": 0.894, "blimp/accuracy/drop_argument": 0.738, "blimp/accuracy/wh_questions_object_gap": 0.838, "blimp/accuracy/animate_subject_passive": 0.79, "blimp/accuracy/existential_there_quantifiers_1": 0.969, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.909, "blimp/accuracy/npi_present_2": 0.636, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.957, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.973, "blimp/accuracy/existential_there_object_raising": 0.834, "blimp/accuracy/matrix_question_npi_licensor_present": 0.433, "blimp/accuracy/npi_present_1": 0.637, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.489, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.977, "blimp/accuracy/causative": 0.782, "blimp/accuracy/group_average": 0.8051194029850746, "blimp/accuracy/seq_average": 0.8051194029850747, "cbt/accuracy/NE": 0.8285256410256411, "cbt/accuracy/V": 0.9464, "cbt/accuracy/CN": 0.8904, "cbt/accuracy/P": 0.924, "cbt/accuracy/group_average": 0.8973314102564103, "cbt/accuracy/seq_average": 0.897358943577431, "hellaswag/accuracy/val": 0.369946225851424, "hellaswag/accuracy/group_average": 0.369946225851424, "hellaswag/accuracy/seq_average": 0.369946225851424, "piqa/accuracy/val": 0.6338411316648531, "piqa/accuracy/group_average": 0.6338411316648531, "piqa/accuracy/seq_average": 0.6338411316648531, "ai2arc/accuracy/ARC-Easy": 0.3835095137420719, "ai2arc/accuracy/ARC-Challenge": 0.22832618025751072, "ai2arc/accuracy/group_average": 0.3059178469997913, "ai2arc/accuracy/seq_average": 0.3322946175637394, "race/accuracy/test/high": 0.29273870783304745, "race/accuracy/test/middle": 0.3725626740947075, "race/accuracy/group_average": 0.33265069096387745, "race/accuracy/seq_average": 0.31597081475476285, "siqa/accuracy/dev": 0.37717502558853633, "siqa/accuracy/group_average": 0.37717502558853633, "siqa/accuracy/seq_average": 0.37717502558853633, "commonsenseqa/accuracy/dev_rand_split": 0.2882882882882883, "commonsenseqa/accuracy/group_average": 0.2882882882882883, "commonsenseqa/accuracy/seq_average": 0.2882882882882883}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-60000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.5212530711340526, "val/accuracy": 0.48851473369295634, "val/perplexity": 12.444180340982506, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4765987040833655, "lambada/accuracy/total": 0.25562888198757766, "lambada/accuracy/openai_last_token": 0.7597049689440993, "lambada/perplexity": 10.461236984167671, "lambada/lm_loss": 3.091603581253942, "lambada/lm_perplexity": 22.012348277861356, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.372071807840267, "mean_loss": 2.498925887608709, "blimp/accuracy/passive_2": 0.898, "blimp/accuracy/determiner_noun_agreement_2": 0.986, "blimp/accuracy/ellipsis_n_bar_1": 0.812, "blimp/accuracy/tough_vs_raising_2": 0.845, "blimp/accuracy/tough_vs_raising_1": 0.604, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.889, "blimp/accuracy/principle_A_reconstruction": 0.38, "blimp/accuracy/wh_vs_that_with_gap": 0.509, "blimp/accuracy/principle_A_domain_2": 0.824, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.885, "blimp/accuracy/principle_A_domain_3": 0.597, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.935, "blimp/accuracy/animate_subject_trans": 0.896, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.899, "blimp/accuracy/distractor_agreement_relative_clause": 0.62, "blimp/accuracy/transitive": 0.858, "blimp/accuracy/sentential_subject_island": 0.424, "blimp/accuracy/adjunct_island": 0.807, "blimp/accuracy/intransitive": 0.73, "blimp/accuracy/existential_there_subject_raising": 0.857, "blimp/accuracy/irregular_past_participle_adjectives": 0.841, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.51, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.273, "blimp/accuracy/only_npi_scope": 0.557, "blimp/accuracy/superlative_quantifiers_2": 0.53, "blimp/accuracy/passive_1": 0.882, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.916, "blimp/accuracy/inchoative": 0.616, "blimp/accuracy/anaphor_gender_agreement": 0.975, "blimp/accuracy/principle_A_c_command": 0.598, "blimp/accuracy/only_npi_licensor_present": 0.789, "blimp/accuracy/expletive_it_object_raising": 0.758, "blimp/accuracy/left_branch_island_simple_question": 0.548, "blimp/accuracy/wh_questions_subject_gap": 0.921, "blimp/accuracy/existential_there_quantifiers_2": 0.533, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.951, "blimp/accuracy/sentential_negation_npi_scope": 0.703, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.839, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.853, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.905, "blimp/accuracy/principle_A_case_2": 0.937, "blimp/accuracy/distractor_agreement_relational_noun": 0.872, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.992, "blimp/accuracy/superlative_quantifiers_1": 0.485, "blimp/accuracy/wh_island": 0.849, "blimp/accuracy/principle_A_domain_1": 0.98, "blimp/accuracy/complex_NP_island": 0.554, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.976, "blimp/accuracy/irregular_past_participle_verbs": 0.86, "blimp/accuracy/drop_argument": 0.74, "blimp/accuracy/wh_questions_object_gap": 0.784, "blimp/accuracy/animate_subject_passive": 0.797, "blimp/accuracy/existential_there_quantifiers_1": 0.98, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.917, "blimp/accuracy/npi_present_2": 0.566, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.929, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.971, "blimp/accuracy/existential_there_object_raising": 0.784, "blimp/accuracy/matrix_question_npi_licensor_present": 0.249, "blimp/accuracy/npi_present_1": 0.558, "blimp/accuracy/wh_vs_that_no_gap": 0.969, "blimp/accuracy/left_branch_island_echo_question": 0.48, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.967, "blimp/accuracy/causative": 0.72, "blimp/accuracy/group_average": 0.7708805970149253, "blimp/accuracy/seq_average": 0.7708805970149254, "cbt/accuracy/NE": 0.7844551282051282, "cbt/accuracy/V": 0.9212, "cbt/accuracy/CN": 0.8392, "cbt/accuracy/P": 0.9016, "cbt/accuracy/group_average": 0.8616137820512821, "cbt/accuracy/seq_average": 0.8616446578631453, "hellaswag/accuracy/val": 0.3145787691694881, "hellaswag/accuracy/group_average": 0.3145787691694881, "hellaswag/accuracy/seq_average": 0.3145787691694881, "piqa/accuracy/val": 0.5963003264417845, "piqa/accuracy/group_average": 0.5963003264417845, "piqa/accuracy/seq_average": 0.5963003264417845, "ai2arc/accuracy/ARC-Easy": 0.35391120507399576, "ai2arc/accuracy/ARC-Challenge": 0.21716738197424892, "ai2arc/accuracy/group_average": 0.28553929352412233, "ai2arc/accuracy/seq_average": 0.3087818696883853, "race/accuracy/test/high": 0.2847341337907376, "race/accuracy/test/middle": 0.3530640668523677, "race/accuracy/group_average": 0.31889910032155266, "race/accuracy/seq_average": 0.3046209971625456, "siqa/accuracy/dev": 0.3694984646878199, "siqa/accuracy/group_average": 0.3694984646878199, "siqa/accuracy/seq_average": 0.3694984646878199, "commonsenseqa/accuracy/dev_rand_split": 0.266994266994267, "commonsenseqa/accuracy/group_average": 0.266994266994267, "commonsenseqa/accuracy/seq_average": 0.266994266994267}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_33_experts/export/result-model-80000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.4752994113498263, "val/accuracy": 0.49515206473214285, "val/perplexity": 11.88526516369816, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.517614139533191, "lambada/accuracy/total": 0.28959627329192544, "lambada/accuracy/openai_last_token": 0.7738742236024845, "lambada/perplexity": 9.505356860414015, "lambada/lm_loss": 3.0635331043889846, "lambada/lm_perplexity": 21.403042918848794, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.39237416901203415, "mean_loss": 2.4964567754415086, "blimp/accuracy/passive_2": 0.905, "blimp/accuracy/determiner_noun_agreement_2": 0.985, "blimp/accuracy/ellipsis_n_bar_1": 0.829, "blimp/accuracy/tough_vs_raising_2": 0.817, "blimp/accuracy/tough_vs_raising_1": 0.67, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.938, "blimp/accuracy/principle_A_reconstruction": 0.349, "blimp/accuracy/wh_vs_that_with_gap": 0.562, "blimp/accuracy/principle_A_domain_2": 0.864, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.904, "blimp/accuracy/principle_A_domain_3": 0.648, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.933, "blimp/accuracy/animate_subject_trans": 0.899, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.901, "blimp/accuracy/distractor_agreement_relative_clause": 0.725, "blimp/accuracy/transitive": 0.864, "blimp/accuracy/sentential_subject_island": 0.434, "blimp/accuracy/adjunct_island": 0.832, "blimp/accuracy/intransitive": 0.781, "blimp/accuracy/existential_there_subject_raising": 0.876, "blimp/accuracy/irregular_past_participle_adjectives": 0.958, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.656, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.336, "blimp/accuracy/only_npi_scope": 0.704, "blimp/accuracy/superlative_quantifiers_2": 0.73, "blimp/accuracy/passive_1": 0.884, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.924, "blimp/accuracy/inchoative": 0.648, "blimp/accuracy/anaphor_gender_agreement": 0.983, "blimp/accuracy/principle_A_c_command": 0.629, "blimp/accuracy/only_npi_licensor_present": 0.626, "blimp/accuracy/expletive_it_object_raising": 0.752, "blimp/accuracy/left_branch_island_simple_question": 0.656, "blimp/accuracy/wh_questions_subject_gap": 0.914, "blimp/accuracy/existential_there_quantifiers_2": 0.557, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.949, "blimp/accuracy/sentential_negation_npi_scope": 0.801, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.843, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.845, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.908, "blimp/accuracy/principle_A_case_2": 0.926, "blimp/accuracy/distractor_agreement_relational_noun": 0.901, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.99, "blimp/accuracy/superlative_quantifiers_1": 0.491, "blimp/accuracy/wh_island": 0.758, "blimp/accuracy/principle_A_domain_1": 0.985, "blimp/accuracy/complex_NP_island": 0.576, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.986, "blimp/accuracy/irregular_past_participle_verbs": 0.849, "blimp/accuracy/drop_argument": 0.755, "blimp/accuracy/wh_questions_object_gap": 0.805, "blimp/accuracy/animate_subject_passive": 0.8, "blimp/accuracy/existential_there_quantifiers_1": 0.992, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.924, "blimp/accuracy/npi_present_2": 0.571, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.938, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.967, "blimp/accuracy/existential_there_object_raising": 0.81, "blimp/accuracy/matrix_question_npi_licensor_present": 0.267, "blimp/accuracy/npi_present_1": 0.553, "blimp/accuracy/wh_vs_that_no_gap": 0.968, "blimp/accuracy/left_branch_island_echo_question": 0.462, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.971, "blimp/accuracy/causative": 0.713, "blimp/accuracy/group_average": 0.7904328358208955, "blimp/accuracy/seq_average": 0.7904328358208955, "cbt/accuracy/NE": 0.7880608974358975, "cbt/accuracy/V": 0.9228, "cbt/accuracy/CN": 0.8608, "cbt/accuracy/P": 0.9032, "cbt/accuracy/group_average": 0.8687152243589744, "cbt/accuracy/seq_average": 0.8687474989995998, "hellaswag/accuracy/val": 0.3205536745668194, "hellaswag/accuracy/group_average": 0.3205536745668194, "hellaswag/accuracy/seq_average": 0.3205536745668194, "piqa/accuracy/val": 0.6039173014145811, "piqa/accuracy/group_average": 0.6039173014145811, "piqa/accuracy/seq_average": 0.6039173014145811, "ai2arc/accuracy/ARC-Easy": 0.3674418604651163, "ai2arc/accuracy/ARC-Challenge": 0.22660944206008585, "ai2arc/accuracy/group_average": 0.29702565126260105, "ai2arc/accuracy/seq_average": 0.32096317280453257, "race/accuracy/test/high": 0.27501429388221843, "race/accuracy/test/middle": 0.3579387186629526, "race/accuracy/group_average": 0.3164765062725855, "race/accuracy/seq_average": 0.2991487636805837, "siqa/accuracy/dev": 0.36898669396110545, "siqa/accuracy/group_average": 0.36898669396110545, "siqa/accuracy/seq_average": 0.36898669396110545, "commonsenseqa/accuracy/dev_rand_split": 0.276003276003276, "commonsenseqa/accuracy/group_average": 0.276003276003276, "commonsenseqa/accuracy/seq_average": 0.276003276003276}