Upload folder using huggingface_hub

#319
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_lb0001/export/result-model-10000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 3.0136551629929316, "val/accuracy": 0.4246099562872024, "val/perplexity": 20.361689365967496, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 3.001561703889266, "lambada/accuracy/total": 0.1768245341614907, "lambada/accuracy/openai_last_token": 0.720108695652174, "lambada/perplexity": 27.0789815952356, "lambada/lm_loss": 3.558235660168625, "lambada/lm_perplexity": 35.101212013282954, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3007172452243465, "mean_loss": 3.007608433441099, "blimp/accuracy/passive_2": 0.863, "blimp/accuracy/determiner_noun_agreement_2": 0.973, "blimp/accuracy/ellipsis_n_bar_1": 0.765, "blimp/accuracy/tough_vs_raising_2": 0.838, "blimp/accuracy/tough_vs_raising_1": 0.521, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.859, "blimp/accuracy/principle_A_reconstruction": 0.433, "blimp/accuracy/wh_vs_that_with_gap": 0.495, "blimp/accuracy/principle_A_domain_2": 0.762, "blimp/accuracy/determiner_noun_agreement_1": 0.98, "blimp/accuracy/ellipsis_n_bar_2": 0.856, "blimp/accuracy/principle_A_domain_3": 0.514, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.888, "blimp/accuracy/animate_subject_trans": 0.888, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.758, "blimp/accuracy/distractor_agreement_relative_clause": 0.513, "blimp/accuracy/transitive": 0.831, "blimp/accuracy/sentential_subject_island": 0.379, "blimp/accuracy/adjunct_island": 0.784, "blimp/accuracy/intransitive": 0.726, "blimp/accuracy/existential_there_subject_raising": 0.855, "blimp/accuracy/irregular_past_participle_adjectives": 0.951, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.119, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.143, "blimp/accuracy/only_npi_scope": 0.352, "blimp/accuracy/superlative_quantifiers_2": 0.76, "blimp/accuracy/passive_1": 0.872, 
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.899, "blimp/accuracy/inchoative": 0.566, "blimp/accuracy/anaphor_gender_agreement": 0.93, "blimp/accuracy/principle_A_c_command": 0.473, "blimp/accuracy/only_npi_licensor_present": 0.468, "blimp/accuracy/expletive_it_object_raising": 0.755, "blimp/accuracy/left_branch_island_simple_question": 0.155, "blimp/accuracy/wh_questions_subject_gap": 0.904, "blimp/accuracy/existential_there_quantifiers_2": 0.31, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.922, "blimp/accuracy/sentential_negation_npi_scope": 0.387, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.77, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.849, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.825, "blimp/accuracy/principle_A_case_2": 0.921, "blimp/accuracy/distractor_agreement_relational_noun": 0.761, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.995, "blimp/accuracy/superlative_quantifiers_1": 0.674, "blimp/accuracy/wh_island": 0.784, "blimp/accuracy/principle_A_domain_1": 0.949, "blimp/accuracy/complex_NP_island": 0.615, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.935, "blimp/accuracy/irregular_past_participle_verbs": 0.806, "blimp/accuracy/drop_argument": 0.738, "blimp/accuracy/wh_questions_object_gap": 0.759, "blimp/accuracy/animate_subject_passive": 0.729, "blimp/accuracy/existential_there_quantifiers_1": 0.954, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.856, "blimp/accuracy/npi_present_2": 0.666, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.828, "blimp/accuracy/anaphor_number_agreement": 0.973, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.94, "blimp/accuracy/existential_there_object_raising": 0.791, "blimp/accuracy/matrix_question_npi_licensor_present": 0.061, "blimp/accuracy/npi_present_1": 0.572, "blimp/accuracy/wh_vs_that_no_gap": 0.963, "blimp/accuracy/left_branch_island_echo_question": 0.347, 
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.987, "blimp/accuracy/causative": 0.654, "blimp/accuracy/group_average": 0.7186417910447762, "blimp/accuracy/seq_average": 0.7186417910447761, "cbt/accuracy/NE": 0.6858974358974359, "cbt/accuracy/V": 0.8664, "cbt/accuracy/CN": 0.7332, "cbt/accuracy/P": 0.8344, "cbt/accuracy/group_average": 0.779974358974359, "cbt/accuracy/seq_average": 0.7800120048019208, "hellaswag/accuracy/val": 0.27524397530372435, "hellaswag/accuracy/group_average": 0.27524397530372435, "hellaswag/accuracy/seq_average": 0.27524397530372435, "piqa/accuracy/val": 0.5500544069640914, "piqa/accuracy/group_average": 0.5500544069640914, "piqa/accuracy/seq_average": 0.5500544069640914, "ai2arc/accuracy/ARC-Easy": 0.31374207188160674, "ai2arc/accuracy/ARC-Challenge": 0.20686695278969958, "ai2arc/accuracy/group_average": 0.26030451233565316, "ai2arc/accuracy/seq_average": 0.2784702549575071, "race/accuracy/test/high": 0.24842767295597484, "race/accuracy/test/middle": 0.3203342618384401, "race/accuracy/group_average": 0.2843809673972075, "race/accuracy/seq_average": 0.26935549250101337, "siqa/accuracy/dev": 0.3474923234390993, "siqa/accuracy/group_average": 0.3474923234390993, "siqa/accuracy/seq_average": 0.3474923234390993, "commonsenseqa/accuracy/dev_rand_split": 0.23832923832923833, "commonsenseqa/accuracy/group_average": 0.23832923832923833, "commonsenseqa/accuracy/seq_average": 0.23832923832923833}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_lb0001/export/result-model-100000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.600516425238715, "val/accuracy": 0.48130580357142855, "val/perplexity": 13.4706928447955, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7820383391765335, "lambada/accuracy/total": 0.2517468944099379, "lambada/accuracy/openai_last_token": 0.7637810559006211, "lambada/perplexity": 12.212507449164173, "lambada/lm_loss": 3.164976418059304, "lambada/lm_perplexity": 23.688185412860843, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.36652634899068326, "mean_loss": 2.691277382207624, "blimp/accuracy/passive_2": 0.898, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.838, "blimp/accuracy/tough_vs_raising_2": 0.862, "blimp/accuracy/tough_vs_raising_1": 0.6, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.922, "blimp/accuracy/principle_A_reconstruction": 0.399, "blimp/accuracy/wh_vs_that_with_gap": 0.562, "blimp/accuracy/principle_A_domain_2": 0.826, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.889, "blimp/accuracy/principle_A_domain_3": 0.575, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.925, "blimp/accuracy/animate_subject_trans": 0.902, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.882, "blimp/accuracy/distractor_agreement_relative_clause": 0.712, "blimp/accuracy/transitive": 0.88, "blimp/accuracy/sentential_subject_island": 0.299, "blimp/accuracy/adjunct_island": 0.8, "blimp/accuracy/intransitive": 0.784, "blimp/accuracy/existential_there_subject_raising": 0.889, "blimp/accuracy/irregular_past_participle_adjectives": 0.979, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.343, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.284, "blimp/accuracy/only_npi_scope": 0.644, "blimp/accuracy/superlative_quantifiers_2": 0.799, "blimp/accuracy/passive_1": 0.898, 
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.918, "blimp/accuracy/inchoative": 0.608, "blimp/accuracy/anaphor_gender_agreement": 0.917, "blimp/accuracy/principle_A_c_command": 0.573, "blimp/accuracy/only_npi_licensor_present": 0.569, "blimp/accuracy/expletive_it_object_raising": 0.78, "blimp/accuracy/left_branch_island_simple_question": 0.428, "blimp/accuracy/wh_questions_subject_gap": 0.904, "blimp/accuracy/existential_there_quantifiers_2": 0.429, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.939, "blimp/accuracy/sentential_negation_npi_scope": 0.599, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.821, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.862, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.9, "blimp/accuracy/principle_A_case_2": 0.952, "blimp/accuracy/distractor_agreement_relational_noun": 0.829, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.98, "blimp/accuracy/superlative_quantifiers_1": 0.666, "blimp/accuracy/wh_island": 0.799, "blimp/accuracy/principle_A_domain_1": 0.967, "blimp/accuracy/complex_NP_island": 0.605, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.97, "blimp/accuracy/irregular_past_participle_verbs": 0.891, "blimp/accuracy/drop_argument": 0.749, "blimp/accuracy/wh_questions_object_gap": 0.778, "blimp/accuracy/animate_subject_passive": 0.801, "blimp/accuracy/existential_there_quantifiers_1": 0.985, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.888, "blimp/accuracy/npi_present_2": 0.62, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.917, "blimp/accuracy/anaphor_number_agreement": 0.977, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.955, "blimp/accuracy/existential_there_object_raising": 0.841, "blimp/accuracy/matrix_question_npi_licensor_present": 0.233, "blimp/accuracy/npi_present_1": 0.563, "blimp/accuracy/wh_vs_that_no_gap": 0.957, "blimp/accuracy/left_branch_island_echo_question": 0.485, 
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.965, "blimp/accuracy/causative": 0.712, "blimp/accuracy/group_average": 0.7715820895522386, "blimp/accuracy/seq_average": 0.7715820895522388, "cbt/accuracy/NE": 0.7616185897435898, "cbt/accuracy/V": 0.9112, "cbt/accuracy/CN": 0.8216, "cbt/accuracy/P": 0.89, "cbt/accuracy/group_average": 0.8461046474358975, "cbt/accuracy/seq_average": 0.8461384553821528, "hellaswag/accuracy/val": 0.29267078271260705, "hellaswag/accuracy/group_average": 0.29267078271260705, "hellaswag/accuracy/seq_average": 0.29267078271260705, "piqa/accuracy/val": 0.5816104461371056, "piqa/accuracy/group_average": 0.5816104461371056, "piqa/accuracy/seq_average": 0.5816104461371056, "ai2arc/accuracy/ARC-Easy": 0.33276955602537, "ai2arc/accuracy/ARC-Challenge": 0.2145922746781116, "ai2arc/accuracy/group_average": 0.27368091535174077, "ai2arc/accuracy/seq_average": 0.29376770538243624, "race/accuracy/test/high": 0.2730131503716409, "race/accuracy/test/middle": 0.346100278551532, "race/accuracy/group_average": 0.3095567144615865, "race/accuracy/seq_average": 0.294284556141062, "siqa/accuracy/dev": 0.3546571136131013, "siqa/accuracy/group_average": 0.3546571136131013, "siqa/accuracy/seq_average": 0.3546571136131013, "commonsenseqa/accuracy/dev_rand_split": 0.25061425061425063, "commonsenseqa/accuracy/group_average": 0.25061425061425063, "commonsenseqa/accuracy/seq_average": 0.25061425061425063}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_lb0001/export/result-model-20000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.8688117133246527, "val/accuracy": 0.4423895941840278, "val/perplexity": 17.61607281314417, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.688451352326766, "lambada/accuracy/total": 0.1939052795031056, "lambada/accuracy/openai_last_token": 0.7305900621118012, "lambada/perplexity": 17.94126846112682, "lambada/lm_loss": 3.393954338406611, "lambada/lm_perplexity": 29.783493730992074, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3181474368435667, "mean_loss": 2.7786315328257096, "blimp/accuracy/passive_2": 0.852, "blimp/accuracy/determiner_noun_agreement_2": 0.97, "blimp/accuracy/ellipsis_n_bar_1": 0.779, "blimp/accuracy/tough_vs_raising_2": 0.834, "blimp/accuracy/tough_vs_raising_1": 0.567, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.886, "blimp/accuracy/principle_A_reconstruction": 0.475, "blimp/accuracy/wh_vs_that_with_gap": 0.464, "blimp/accuracy/principle_A_domain_2": 0.799, "blimp/accuracy/determiner_noun_agreement_1": 0.986, "blimp/accuracy/ellipsis_n_bar_2": 0.863, "blimp/accuracy/principle_A_domain_3": 0.57, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.883, "blimp/accuracy/animate_subject_trans": 0.889, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.848, "blimp/accuracy/distractor_agreement_relative_clause": 0.635, "blimp/accuracy/transitive": 0.858, "blimp/accuracy/sentential_subject_island": 0.33, "blimp/accuracy/adjunct_island": 0.824, "blimp/accuracy/intransitive": 0.708, "blimp/accuracy/existential_there_subject_raising": 0.846, "blimp/accuracy/irregular_past_participle_adjectives": 0.929, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.198, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.147, "blimp/accuracy/only_npi_scope": 0.499, "blimp/accuracy/superlative_quantifiers_2": 0.764, "blimp/accuracy/passive_1": 0.884, 
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.897, "blimp/accuracy/inchoative": 0.554, "blimp/accuracy/anaphor_gender_agreement": 0.84, "blimp/accuracy/principle_A_c_command": 0.528, "blimp/accuracy/only_npi_licensor_present": 0.551, "blimp/accuracy/expletive_it_object_raising": 0.752, "blimp/accuracy/left_branch_island_simple_question": 0.278, "blimp/accuracy/wh_questions_subject_gap": 0.884, "blimp/accuracy/existential_there_quantifiers_2": 0.413, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.898, "blimp/accuracy/sentential_negation_npi_scope": 0.511, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.784, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.862, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.878, "blimp/accuracy/principle_A_case_2": 0.932, "blimp/accuracy/distractor_agreement_relational_noun": 0.853, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.984, "blimp/accuracy/superlative_quantifiers_1": 0.615, "blimp/accuracy/wh_island": 0.873, "blimp/accuracy/principle_A_domain_1": 0.975, "blimp/accuracy/complex_NP_island": 0.643, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.958, "blimp/accuracy/irregular_past_participle_verbs": 0.828, "blimp/accuracy/drop_argument": 0.735, "blimp/accuracy/wh_questions_object_gap": 0.778, "blimp/accuracy/animate_subject_passive": 0.755, "blimp/accuracy/existential_there_quantifiers_1": 0.985, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.863, "blimp/accuracy/npi_present_2": 0.615, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.908, "blimp/accuracy/anaphor_number_agreement": 0.965, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.947, "blimp/accuracy/existential_there_object_raising": 0.816, "blimp/accuracy/matrix_question_npi_licensor_present": 0.112, "blimp/accuracy/npi_present_1": 0.601, "blimp/accuracy/wh_vs_that_no_gap": 0.958, "blimp/accuracy/left_branch_island_echo_question": 0.385, 
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.985, "blimp/accuracy/causative": 0.678, "blimp/accuracy/group_average": 0.7412537313432835, "blimp/accuracy/seq_average": 0.7412537313432835, "cbt/accuracy/NE": 0.7047275641025641, "cbt/accuracy/V": 0.878, "cbt/accuracy/CN": 0.7688, "cbt/accuracy/P": 0.8568, "cbt/accuracy/group_average": 0.8020818910256411, "cbt/accuracy/seq_average": 0.8021208483393357, "hellaswag/accuracy/val": 0.27394941246763593, "hellaswag/accuracy/group_average": 0.27394941246763593, "hellaswag/accuracy/seq_average": 0.27394941246763593, "piqa/accuracy/val": 0.55930359085963, "piqa/accuracy/group_average": 0.55930359085963, "piqa/accuracy/seq_average": 0.55930359085963, "ai2arc/accuracy/ARC-Easy": 0.3175475687103594, "ai2arc/accuracy/ARC-Challenge": 0.19484978540772532, "ai2arc/accuracy/group_average": 0.25619867705904237, "ai2arc/accuracy/seq_average": 0.2770538243626062, "race/accuracy/test/high": 0.2641509433962264, "race/accuracy/test/middle": 0.3293871866295265, "race/accuracy/group_average": 0.2967690650128765, "race/accuracy/seq_average": 0.28313741386299146, "siqa/accuracy/dev": 0.34953940634595704, "siqa/accuracy/group_average": 0.34953940634595704, "siqa/accuracy/seq_average": 0.34953940634595704, "commonsenseqa/accuracy/dev_rand_split": 0.23832923832923833, "commonsenseqa/accuracy/group_average": 0.23832923832923833, "commonsenseqa/accuracy/seq_average": 0.23832923832923833}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_lb0001/export/result-model-30000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.7964564732142856, "val/accuracy": 0.45245264446924605, "val/perplexity": 16.386477847218877, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.889732597777562, "lambada/accuracy/total": 0.19972826086956522, "lambada/accuracy/openai_last_token": 0.7418478260869565, "lambada/perplexity": 15.944925219404283, "lambada/lm_loss": 3.3557300658743863, "lambada/lm_perplexity": 28.666525004634355, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3260904526694056, "mean_loss": 2.843094535495924, "blimp/accuracy/passive_2": 0.876, "blimp/accuracy/determiner_noun_agreement_2": 0.977, "blimp/accuracy/ellipsis_n_bar_1": 0.815, "blimp/accuracy/tough_vs_raising_2": 0.885, "blimp/accuracy/tough_vs_raising_1": 0.588, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.911, "blimp/accuracy/principle_A_reconstruction": 0.372, "blimp/accuracy/wh_vs_that_with_gap": 0.609, "blimp/accuracy/principle_A_domain_2": 0.8, "blimp/accuracy/determiner_noun_agreement_1": 0.979, "blimp/accuracy/ellipsis_n_bar_2": 0.872, "blimp/accuracy/principle_A_domain_3": 0.543, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.895, "blimp/accuracy/animate_subject_trans": 0.888, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.896, "blimp/accuracy/distractor_agreement_relative_clause": 0.616, "blimp/accuracy/transitive": 0.863, "blimp/accuracy/sentential_subject_island": 0.34, "blimp/accuracy/adjunct_island": 0.824, "blimp/accuracy/intransitive": 0.796, "blimp/accuracy/existential_there_subject_raising": 0.894, "blimp/accuracy/irregular_past_participle_adjectives": 0.933, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.257, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.238, "blimp/accuracy/only_npi_scope": 0.624, "blimp/accuracy/superlative_quantifiers_2": 0.776, "blimp/accuracy/passive_1": 0.867, 
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.909, "blimp/accuracy/inchoative": 0.639, "blimp/accuracy/anaphor_gender_agreement": 0.933, "blimp/accuracy/principle_A_c_command": 0.493, "blimp/accuracy/only_npi_licensor_present": 0.446, "blimp/accuracy/expletive_it_object_raising": 0.781, "blimp/accuracy/left_branch_island_simple_question": 0.352, "blimp/accuracy/wh_questions_subject_gap": 0.857, "blimp/accuracy/existential_there_quantifiers_2": 0.419, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.917, "blimp/accuracy/sentential_negation_npi_scope": 0.513, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.804, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.824, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.916, "blimp/accuracy/principle_A_case_2": 0.962, "blimp/accuracy/distractor_agreement_relational_noun": 0.791, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.977, "blimp/accuracy/superlative_quantifiers_1": 0.776, "blimp/accuracy/wh_island": 0.765, "blimp/accuracy/principle_A_domain_1": 0.973, "blimp/accuracy/complex_NP_island": 0.622, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.953, "blimp/accuracy/irregular_past_participle_verbs": 0.887, "blimp/accuracy/drop_argument": 0.782, "blimp/accuracy/wh_questions_object_gap": 0.694, "blimp/accuracy/animate_subject_passive": 0.797, "blimp/accuracy/existential_there_quantifiers_1": 0.964, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.875, "blimp/accuracy/npi_present_2": 0.58, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.931, "blimp/accuracy/anaphor_number_agreement": 0.977, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.938, "blimp/accuracy/existential_there_object_raising": 0.802, "blimp/accuracy/matrix_question_npi_licensor_present": 0.251, "blimp/accuracy/npi_present_1": 0.469, "blimp/accuracy/wh_vs_that_no_gap": 0.933, "blimp/accuracy/left_branch_island_echo_question": 0.429, 
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.968, "blimp/accuracy/causative": 0.7, "blimp/accuracy/group_average": 0.7542238805970147, "blimp/accuracy/seq_average": 0.754223880597015, "cbt/accuracy/NE": 0.7347756410256411, "cbt/accuracy/V": 0.8928, "cbt/accuracy/CN": 0.786, "cbt/accuracy/P": 0.872, "cbt/accuracy/group_average": 0.8213939102564103, "cbt/accuracy/seq_average": 0.8214285714285714, "hellaswag/accuracy/val": 0.2778331009759012, "hellaswag/accuracy/group_average": 0.2778331009759012, "hellaswag/accuracy/seq_average": 0.2778331009759012, "piqa/accuracy/val": 0.5690968443960827, "piqa/accuracy/group_average": 0.5690968443960827, "piqa/accuracy/seq_average": 0.5690968443960827, "ai2arc/accuracy/ARC-Easy": 0.32346723044397463, "ai2arc/accuracy/ARC-Challenge": 0.21030042918454936, "ai2arc/accuracy/group_average": 0.266883829814262, "ai2arc/accuracy/seq_average": 0.28611898016997167, "race/accuracy/test/high": 0.27215551743853633, "race/accuracy/test/middle": 0.334958217270195, "race/accuracy/group_average": 0.30355686735436566, "race/accuracy/seq_average": 0.290433725172274, "siqa/accuracy/dev": 0.3500511770726714, "siqa/accuracy/group_average": 0.3500511770726714, "siqa/accuracy/seq_average": 0.3500511770726714, "commonsenseqa/accuracy/dev_rand_split": 0.25552825552825553, "commonsenseqa/accuracy/group_average": 0.25552825552825553, "commonsenseqa/accuracy/seq_average": 0.25552825552825553}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_lb0001/export/result-model-40000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.7437225826202876, "val/accuracy": 0.4607001410590278, "val/perplexity": 15.544744117564305, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7624013201790567, "lambada/accuracy/total": 0.22088509316770186, "lambada/accuracy/openai_last_token": 0.7445652173913043, "lambada/perplexity": 15.0265730518372, "lambada/lm_loss": 3.2996150517943645, "lambada/lm_perplexity": 27.102203967542433, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.34079261711336484, "mean_loss": 2.753061951399672, "blimp/accuracy/passive_2": 0.872, "blimp/accuracy/determiner_noun_agreement_2": 0.983, "blimp/accuracy/ellipsis_n_bar_1": 0.863, "blimp/accuracy/tough_vs_raising_2": 0.855, "blimp/accuracy/tough_vs_raising_1": 0.612, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.875, "blimp/accuracy/principle_A_reconstruction": 0.362, "blimp/accuracy/wh_vs_that_with_gap": 0.568, "blimp/accuracy/principle_A_domain_2": 0.818, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.89, "blimp/accuracy/principle_A_domain_3": 0.559, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.909, "blimp/accuracy/animate_subject_trans": 0.895, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.878, "blimp/accuracy/distractor_agreement_relative_clause": 0.65, "blimp/accuracy/transitive": 0.87, "blimp/accuracy/sentential_subject_island": 0.342, "blimp/accuracy/adjunct_island": 0.826, "blimp/accuracy/intransitive": 0.778, "blimp/accuracy/existential_there_subject_raising": 0.864, "blimp/accuracy/irregular_past_participle_adjectives": 0.947, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.285, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.248, "blimp/accuracy/only_npi_scope": 0.504, "blimp/accuracy/superlative_quantifiers_2": 0.732, "blimp/accuracy/passive_1": 0.87, 
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.905, "blimp/accuracy/inchoative": 0.582, "blimp/accuracy/anaphor_gender_agreement": 0.904, "blimp/accuracy/principle_A_c_command": 0.563, "blimp/accuracy/only_npi_licensor_present": 0.495, "blimp/accuracy/expletive_it_object_raising": 0.773, "blimp/accuracy/left_branch_island_simple_question": 0.352, "blimp/accuracy/wh_questions_subject_gap": 0.892, "blimp/accuracy/existential_there_quantifiers_2": 0.355, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.933, "blimp/accuracy/sentential_negation_npi_scope": 0.518, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.803, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.856, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.884, "blimp/accuracy/principle_A_case_2": 0.959, "blimp/accuracy/distractor_agreement_relational_noun": 0.822, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.993, "blimp/accuracy/superlative_quantifiers_1": 0.786, "blimp/accuracy/wh_island": 0.745, "blimp/accuracy/principle_A_domain_1": 0.966, "blimp/accuracy/complex_NP_island": 0.624, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.961, "blimp/accuracy/irregular_past_participle_verbs": 0.877, "blimp/accuracy/drop_argument": 0.737, "blimp/accuracy/wh_questions_object_gap": 0.751, "blimp/accuracy/animate_subject_passive": 0.764, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.87, "blimp/accuracy/npi_present_2": 0.533, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.913, "blimp/accuracy/anaphor_number_agreement": 0.974, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.951, "blimp/accuracy/existential_there_object_raising": 0.821, "blimp/accuracy/matrix_question_npi_licensor_present": 0.207, "blimp/accuracy/npi_present_1": 0.487, "blimp/accuracy/wh_vs_that_no_gap": 0.964, "blimp/accuracy/left_branch_island_echo_question": 0.458, 
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.973, "blimp/accuracy/causative": 0.674, "blimp/accuracy/group_average": 0.753044776119403, "blimp/accuracy/seq_average": 0.753044776119403, "cbt/accuracy/NE": 0.7435897435897436, "cbt/accuracy/V": 0.8916, "cbt/accuracy/CN": 0.796, "cbt/accuracy/P": 0.87, "cbt/accuracy/group_average": 0.8252974358974359, "cbt/accuracy/seq_average": 0.8253301320528211, "hellaswag/accuracy/val": 0.2811192989444334, "hellaswag/accuracy/group_average": 0.2811192989444334, "hellaswag/accuracy/seq_average": 0.2811192989444334, "piqa/accuracy/val": 0.573993471164309, "piqa/accuracy/group_average": 0.573993471164309, "piqa/accuracy/seq_average": 0.573993471164309, "ai2arc/accuracy/ARC-Easy": 0.3276955602536998, "ai2arc/accuracy/ARC-Challenge": 0.20085836909871244, "ai2arc/accuracy/group_average": 0.2642769646762061, "ai2arc/accuracy/seq_average": 0.2858356940509915, "race/accuracy/test/high": 0.2672955974842767, "race/accuracy/test/middle": 0.33565459610027853, "race/accuracy/group_average": 0.30147509679227763, "race/accuracy/seq_average": 0.28719092014592623, "siqa/accuracy/dev": 0.3587512794268168, "siqa/accuracy/group_average": 0.3587512794268168, "siqa/accuracy/seq_average": 0.3587512794268168, "commonsenseqa/accuracy/dev_rand_split": 0.24815724815724816, "commonsenseqa/accuracy/group_average": 0.24815724815724816, "commonsenseqa/accuracy/seq_average": 0.24815724815724816}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_lb0001/export/result-model-50000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.7020379929315474, "val/accuracy": 0.4664422898065476, "val/perplexity": 14.91008743480593, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "mean_accuracy": 0.4664422898065476, "mean_loss": 2.7020379929315474, "boolq/accuracy/dev": 0.6165137614678899, "boolq/accuracy/group_average": 0.6165137614678899, "boolq/accuracy/seq_average": 0.6165137614678899}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_lb0001/export/result-model-60000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.6685878208705356, "val/accuracy": 0.4708203512524802, "val/perplexity": 14.41959176135687, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7531376240416345, "lambada/accuracy/total": 0.23505434782608695, "lambada/accuracy/openai_last_token": 0.7552406832298136, "lambada/perplexity": 13.56148286695966, "lambada/lm_loss": 3.223827817261436, "lambada/lm_perplexity": 25.12410684429072, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3529373495392836, "mean_loss": 2.710862722456085, "blimp/accuracy/passive_2": 0.889, "blimp/accuracy/determiner_noun_agreement_2": 0.982, "blimp/accuracy/ellipsis_n_bar_1": 0.829, "blimp/accuracy/tough_vs_raising_2": 0.884, "blimp/accuracy/tough_vs_raising_1": 0.583, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.912, "blimp/accuracy/principle_A_reconstruction": 0.407, "blimp/accuracy/wh_vs_that_with_gap": 0.52, "blimp/accuracy/principle_A_domain_2": 0.799, "blimp/accuracy/determiner_noun_agreement_1": 0.98, "blimp/accuracy/ellipsis_n_bar_2": 0.889, "blimp/accuracy/principle_A_domain_3": 0.563, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.918, "blimp/accuracy/animate_subject_trans": 0.905, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.871, "blimp/accuracy/distractor_agreement_relative_clause": 0.695, "blimp/accuracy/transitive": 0.873, "blimp/accuracy/sentential_subject_island": 0.326, "blimp/accuracy/adjunct_island": 0.789, "blimp/accuracy/intransitive": 0.756, "blimp/accuracy/existential_there_subject_raising": 0.898, "blimp/accuracy/irregular_past_participle_adjectives": 0.952, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.298, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.257, "blimp/accuracy/only_npi_scope": 0.678, "blimp/accuracy/superlative_quantifiers_2": 0.665, "blimp/accuracy/passive_1": 0.887, 
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.904, "blimp/accuracy/inchoative": 0.618, "blimp/accuracy/anaphor_gender_agreement": 0.918, "blimp/accuracy/principle_A_c_command": 0.538, "blimp/accuracy/only_npi_licensor_present": 0.406, "blimp/accuracy/expletive_it_object_raising": 0.771, "blimp/accuracy/left_branch_island_simple_question": 0.345, "blimp/accuracy/wh_questions_subject_gap": 0.911, "blimp/accuracy/existential_there_quantifiers_2": 0.41, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.937, "blimp/accuracy/sentential_negation_npi_scope": 0.548, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.776, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.86, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.888, "blimp/accuracy/principle_A_case_2": 0.949, "blimp/accuracy/distractor_agreement_relational_noun": 0.818, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.692, "blimp/accuracy/wh_island": 0.797, "blimp/accuracy/principle_A_domain_1": 0.977, "blimp/accuracy/complex_NP_island": 0.597, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.958, "blimp/accuracy/irregular_past_participle_verbs": 0.876, "blimp/accuracy/drop_argument": 0.752, "blimp/accuracy/wh_questions_object_gap": 0.782, "blimp/accuracy/animate_subject_passive": 0.793, "blimp/accuracy/existential_there_quantifiers_1": 0.975, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.892, "blimp/accuracy/npi_present_2": 0.584, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.915, "blimp/accuracy/anaphor_number_agreement": 0.976, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.946, "blimp/accuracy/existential_there_object_raising": 0.812, "blimp/accuracy/matrix_question_npi_licensor_present": 0.235, "blimp/accuracy/npi_present_1": 0.481, "blimp/accuracy/wh_vs_that_no_gap": 0.965, "blimp/accuracy/left_branch_island_echo_question": 0.423, 
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.969, "blimp/accuracy/causative": 0.702, "blimp/accuracy/group_average": 0.7565671641791044, "blimp/accuracy/seq_average": 0.7565671641791045, "cbt/accuracy/NE": 0.7495993589743589, "cbt/accuracy/V": 0.8968, "cbt/accuracy/CN": 0.812, "cbt/accuracy/P": 0.8868, "cbt/accuracy/group_average": 0.8362998397435897, "cbt/accuracy/seq_average": 0.8363345338135254, "hellaswag/accuracy/val": 0.2839075881298546, "hellaswag/accuracy/group_average": 0.2839075881298546, "hellaswag/accuracy/seq_average": 0.2839075881298546, "piqa/accuracy/val": 0.5685527747551686, "piqa/accuracy/group_average": 0.5685527747551686, "piqa/accuracy/seq_average": 0.5685527747551686, "ai2arc/accuracy/ARC-Easy": 0.34249471458773784, "ai2arc/accuracy/ARC-Challenge": 0.20429184549356222, "ai2arc/accuracy/group_average": 0.27339328004065, "ai2arc/accuracy/seq_average": 0.29688385269121814, "race/accuracy/test/high": 0.27215551743853633, "race/accuracy/test/middle": 0.32729805013927576, "race/accuracy/group_average": 0.29972678378890605, "race/accuracy/seq_average": 0.2882042967166599, "siqa/accuracy/dev": 0.34698055271238487, "siqa/accuracy/group_average": 0.34698055271238487, "siqa/accuracy/seq_average": 0.34698055271238487, "commonsenseqa/accuracy/dev_rand_split": 0.24897624897624898, "commonsenseqa/accuracy/group_average": 0.24897624897624898, "commonsenseqa/accuracy/seq_average": 0.24897624897624898}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_lb0001/export/result-model-70000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.641061207604787, "val/accuracy": 0.47516450427827384, "val/perplexity": 14.028082419327244, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "mean_accuracy": 0.47516450427827384, "mean_loss": 2.641061207604787, "boolq/accuracy/dev": 0.6198776758409786, "boolq/accuracy/group_average": 0.6198776758409786, "boolq/accuracy/seq_average": 0.6198776758409786}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_lb0001/export/result-model-80000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.619595724438864, "val/accuracy": 0.47819301060267855, "val/perplexity": 13.730171690041313, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "mean_accuracy": 0.47819301060267855, "mean_loss": 2.619595724438864, "boolq/accuracy/dev": 0.618960244648318, "boolq/accuracy/group_average": 0.618960244648318, "boolq/accuracy/seq_average": 0.618960244648318}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_lb0001/export/result-model-90000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.6069149441189237, "val/accuracy": 0.48042224702380953, "val/perplexity": 13.557161668482745, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "mean_accuracy": 0.48042224702380953, "mean_loss": 2.6069149441189237, "boolq/accuracy/dev": 0.618960244648318, "boolq/accuracy/group_average": 0.618960244648318, "boolq/accuracy/seq_average": 0.618960244648318}