DavidNguyen commited on
Commit
2fc2e2c
·
verified ·
1 Parent(s): b9eeb23

bf30748275ec30039347d5e1f8867759321c00093d1353fb19f391a85ee9d9c5

Browse files
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std0006/export/result-model-10000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 3.028562999906994, "val/accuracy": 0.42321971106150796, "val/perplexity": 20.667512023575842, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.735074440144604, "lambada/accuracy/total": 0.16168478260869565, "lambada/accuracy/openai_last_token": 0.7158385093167702, "lambada/perplexity": 24.595714180537048, "lambada/lm_loss": 3.55616975225867, "lambada/lm_perplexity": 35.028770995736, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.2924522468351018, "mean_loss": 2.881818720025799, "blimp/accuracy/passive_2": 0.886, "blimp/accuracy/determiner_noun_agreement_2": 0.955, "blimp/accuracy/ellipsis_n_bar_1": 0.724, "blimp/accuracy/tough_vs_raising_2": 0.807, "blimp/accuracy/tough_vs_raising_1": 0.546, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.874, "blimp/accuracy/principle_A_reconstruction": 0.425, "blimp/accuracy/wh_vs_that_with_gap": 0.382, "blimp/accuracy/principle_A_domain_2": 0.843, "blimp/accuracy/determiner_noun_agreement_1": 0.964, "blimp/accuracy/ellipsis_n_bar_2": 0.881, "blimp/accuracy/principle_A_domain_3": 0.508, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.849, "blimp/accuracy/animate_subject_trans": 0.87, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.733, "blimp/accuracy/distractor_agreement_relative_clause": 0.424, "blimp/accuracy/transitive": 0.829, "blimp/accuracy/sentential_subject_island": 0.271, "blimp/accuracy/adjunct_island": 0.671, "blimp/accuracy/intransitive": 0.711, "blimp/accuracy/existential_there_subject_raising": 0.807, "blimp/accuracy/irregular_past_participle_adjectives": 0.934, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.217, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.178, "blimp/accuracy/only_npi_scope": 0.628, "blimp/accuracy/superlative_quantifiers_2": 0.668, "blimp/accuracy/passive_1": 0.893, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.86, "blimp/accuracy/inchoative": 0.515, "blimp/accuracy/anaphor_gender_agreement": 0.918, "blimp/accuracy/principle_A_c_command": 0.5, "blimp/accuracy/only_npi_licensor_present": 0.542, "blimp/accuracy/expletive_it_object_raising": 0.76, "blimp/accuracy/left_branch_island_simple_question": 0.271, "blimp/accuracy/wh_questions_subject_gap": 0.894, "blimp/accuracy/existential_there_quantifiers_2": 0.264, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.895, "blimp/accuracy/sentential_negation_npi_scope": 0.431, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.754, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.897, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.818, "blimp/accuracy/principle_A_case_2": 0.929, "blimp/accuracy/distractor_agreement_relational_noun": 0.7, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987, "blimp/accuracy/superlative_quantifiers_1": 0.699, "blimp/accuracy/wh_island": 0.7, "blimp/accuracy/principle_A_domain_1": 0.986, "blimp/accuracy/complex_NP_island": 0.535, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.905, "blimp/accuracy/irregular_past_participle_verbs": 0.847, "blimp/accuracy/drop_argument": 0.739, "blimp/accuracy/wh_questions_object_gap": 0.713, "blimp/accuracy/animate_subject_passive": 0.744, "blimp/accuracy/existential_there_quantifiers_1": 0.961, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.842, "blimp/accuracy/npi_present_2": 0.675, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.846, "blimp/accuracy/anaphor_number_agreement": 0.956, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.905, "blimp/accuracy/existential_there_object_raising": 0.754, "blimp/accuracy/matrix_question_npi_licensor_present": 0.072, "blimp/accuracy/npi_present_1": 0.565, "blimp/accuracy/wh_vs_that_no_gap": 0.957, "blimp/accuracy/left_branch_island_echo_question": 0.471, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.967, "blimp/accuracy/causative": 0.61, "blimp/accuracy/group_average": 0.7143582089552236, "blimp/accuracy/seq_average": 0.7143582089552238, "cbt/accuracy/NE": 0.6842948717948718, "cbt/accuracy/V": 0.86, "cbt/accuracy/CN": 0.7324, "cbt/accuracy/P": 0.8372, "cbt/accuracy/group_average": 0.778473717948718, "cbt/accuracy/seq_average": 0.7785114045618248, "hellaswag/accuracy/val": 0.26926906990639315, "hellaswag/accuracy/group_average": 0.26926906990639315, "hellaswag/accuracy/seq_average": 0.26926906990639315, "piqa/accuracy/val": 0.5489662676822633, "piqa/accuracy/group_average": 0.5489662676822633, "piqa/accuracy/seq_average": 0.5489662676822633, "ai2arc/accuracy/ARC-Easy": 0.3124735729386892, "ai2arc/accuracy/ARC-Challenge": 0.2094420600858369, "ai2arc/accuracy/group_average": 0.2609578165122631, "ai2arc/accuracy/seq_average": 0.2784702549575071, "race/accuracy/test/high": 0.25443110348770726, "race/accuracy/test/middle": 0.32590529247910865, "race/accuracy/group_average": 0.290168197983408, "race/accuracy/seq_average": 0.2752330766112687, "siqa/accuracy/dev": 0.353121801432958, "siqa/accuracy/group_average": 0.353121801432958, "siqa/accuracy/seq_average": 0.353121801432958, "commonsenseqa/accuracy/dev_rand_split": 0.2325962325962326, "commonsenseqa/accuracy/group_average": 0.2325962325962326, "commonsenseqa/accuracy/seq_average": 0.2325962325962326}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std0006/export/result-model-100000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.6122664194258434, "val/accuracy": 0.47871616908482145, "val/perplexity": 13.629906958085757, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.619633265904018, "lambada/accuracy/total": 0.26048136645962733, "lambada/accuracy/openai_last_token": 0.7639751552795031, "lambada/perplexity": 11.736824417331295, "lambada/lm_loss": 3.1732623116378083, "lambada/lm_perplexity": 23.88527861527466, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3695987677722244, "mean_loss": 2.6159498426649304, "blimp/accuracy/passive_2": 0.906, "blimp/accuracy/determiner_noun_agreement_2": 0.982, "blimp/accuracy/ellipsis_n_bar_1": 0.82, "blimp/accuracy/tough_vs_raising_2": 0.85, "blimp/accuracy/tough_vs_raising_1": 0.599, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.916, "blimp/accuracy/principle_A_reconstruction": 0.285, "blimp/accuracy/wh_vs_that_with_gap": 0.532, "blimp/accuracy/principle_A_domain_2": 0.846, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.924, "blimp/accuracy/principle_A_domain_3": 0.603, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.908, "blimp/accuracy/animate_subject_trans": 0.902, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.881, "blimp/accuracy/distractor_agreement_relative_clause": 0.566, "blimp/accuracy/transitive": 0.863, "blimp/accuracy/sentential_subject_island": 0.325, "blimp/accuracy/adjunct_island": 0.782, "blimp/accuracy/intransitive": 0.795, "blimp/accuracy/existential_there_subject_raising": 0.875, "blimp/accuracy/irregular_past_participle_adjectives": 0.931, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.316, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.267, "blimp/accuracy/only_npi_scope": 0.724, "blimp/accuracy/superlative_quantifiers_2": 0.744, "blimp/accuracy/passive_1": 0.892, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.888, "blimp/accuracy/inchoative": 0.615, "blimp/accuracy/anaphor_gender_agreement": 0.953, "blimp/accuracy/principle_A_c_command": 0.587, "blimp/accuracy/only_npi_licensor_present": 0.668, "blimp/accuracy/expletive_it_object_raising": 0.768, "blimp/accuracy/left_branch_island_simple_question": 0.416, "blimp/accuracy/wh_questions_subject_gap": 0.899, "blimp/accuracy/existential_there_quantifiers_2": 0.52, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.946, "blimp/accuracy/sentential_negation_npi_scope": 0.643, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.809, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.864, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.906, "blimp/accuracy/principle_A_case_2": 0.965, "blimp/accuracy/distractor_agreement_relational_noun": 0.763, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.985, "blimp/accuracy/superlative_quantifiers_1": 0.729, "blimp/accuracy/wh_island": 0.752, "blimp/accuracy/principle_A_domain_1": 0.992, "blimp/accuracy/complex_NP_island": 0.511, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.953, "blimp/accuracy/irregular_past_participle_verbs": 0.886, "blimp/accuracy/drop_argument": 0.78, "blimp/accuracy/wh_questions_object_gap": 0.787, "blimp/accuracy/animate_subject_passive": 0.779, "blimp/accuracy/existential_there_quantifiers_1": 0.983, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.891, "blimp/accuracy/npi_present_2": 0.511, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.923, "blimp/accuracy/anaphor_number_agreement": 0.987, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.958, "blimp/accuracy/existential_there_object_raising": 0.802, "blimp/accuracy/matrix_question_npi_licensor_present": 0.25, "blimp/accuracy/npi_present_1": 0.519, "blimp/accuracy/wh_vs_that_no_gap": 0.966, "blimp/accuracy/left_branch_island_echo_question": 0.42, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.966, "blimp/accuracy/causative": 0.711, "blimp/accuracy/group_average": 0.7652686567164181, "blimp/accuracy/seq_average": 0.7652686567164179, "cbt/accuracy/NE": 0.7544070512820513, "cbt/accuracy/V": 0.9088, "cbt/accuracy/CN": 0.8144, "cbt/accuracy/P": 0.8908, "cbt/accuracy/group_average": 0.8421017628205129, "cbt/accuracy/seq_average": 0.8421368547418968, "hellaswag/accuracy/val": 0.29038040231029677, "hellaswag/accuracy/group_average": 0.29038040231029677, "hellaswag/accuracy/seq_average": 0.29038040231029677, "piqa/accuracy/val": 0.5826985854189336, "piqa/accuracy/group_average": 0.5826985854189336, "piqa/accuracy/seq_average": 0.5826985854189336, "ai2arc/accuracy/ARC-Easy": 0.3276955602536998, "ai2arc/accuracy/ARC-Challenge": 0.21201716738197424, "ai2arc/accuracy/group_average": 0.269856363817837, "ai2arc/accuracy/seq_average": 0.2895184135977337, "race/accuracy/test/high": 0.2727272727272727, "race/accuracy/test/middle": 0.3398328690807799, "race/accuracy/group_average": 0.3062800709040263, "race/accuracy/seq_average": 0.29225780299959464, "siqa/accuracy/dev": 0.3526100307062436, "siqa/accuracy/group_average": 0.3526100307062436, "siqa/accuracy/seq_average": 0.3526100307062436, "commonsenseqa/accuracy/dev_rand_split": 0.26044226044226043, "commonsenseqa/accuracy/group_average": 0.26044226044226043, "commonsenseqa/accuracy/seq_average": 0.26044226044226043}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std0006/export/result-model-20000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.8826192220052085, "val/accuracy": 0.4410303509424603, "val/perplexity": 17.860993875158048, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6884145795928767, "lambada/accuracy/total": 0.1859472049689441, "lambada/accuracy/openai_last_token": 0.7302018633540373, "lambada/perplexity": 18.416513600965327, "lambada/lm_loss": 3.407392430312275, "lambada/lm_perplexity": 30.18642832955101, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3134887779557022, "mean_loss": 2.785516900799043, "blimp/accuracy/passive_2": 0.876, "blimp/accuracy/determiner_noun_agreement_2": 0.961, "blimp/accuracy/ellipsis_n_bar_1": 0.762, "blimp/accuracy/tough_vs_raising_2": 0.86, "blimp/accuracy/tough_vs_raising_1": 0.585, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.88, "blimp/accuracy/principle_A_reconstruction": 0.369, "blimp/accuracy/wh_vs_that_with_gap": 0.484, "blimp/accuracy/principle_A_domain_2": 0.792, "blimp/accuracy/determiner_noun_agreement_1": 0.972, "blimp/accuracy/ellipsis_n_bar_2": 0.906, "blimp/accuracy/principle_A_domain_3": 0.534, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.898, "blimp/accuracy/animate_subject_trans": 0.876, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.801, "blimp/accuracy/distractor_agreement_relative_clause": 0.429, "blimp/accuracy/transitive": 0.844, "blimp/accuracy/sentential_subject_island": 0.351, "blimp/accuracy/adjunct_island": 0.747, "blimp/accuracy/intransitive": 0.689, "blimp/accuracy/existential_there_subject_raising": 0.827, "blimp/accuracy/irregular_past_participle_adjectives": 0.809, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.327, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.187, "blimp/accuracy/only_npi_scope": 0.706, "blimp/accuracy/superlative_quantifiers_2": 0.753, "blimp/accuracy/passive_1": 0.909, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.912, "blimp/accuracy/inchoative": 0.546, "blimp/accuracy/anaphor_gender_agreement": 0.88, "blimp/accuracy/principle_A_c_command": 0.519, "blimp/accuracy/only_npi_licensor_present": 0.639, "blimp/accuracy/expletive_it_object_raising": 0.744, "blimp/accuracy/left_branch_island_simple_question": 0.422, "blimp/accuracy/wh_questions_subject_gap": 0.885, "blimp/accuracy/existential_there_quantifiers_2": 0.355, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.893, "blimp/accuracy/sentential_negation_npi_scope": 0.437, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.769, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.903, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.859, "blimp/accuracy/principle_A_case_2": 0.95, "blimp/accuracy/distractor_agreement_relational_noun": 0.788, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.988, "blimp/accuracy/superlative_quantifiers_1": 0.683, "blimp/accuracy/wh_island": 0.727, "blimp/accuracy/principle_A_domain_1": 0.992, "blimp/accuracy/complex_NP_island": 0.521, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.927, "blimp/accuracy/irregular_past_participle_verbs": 0.868, "blimp/accuracy/drop_argument": 0.732, "blimp/accuracy/wh_questions_object_gap": 0.775, "blimp/accuracy/animate_subject_passive": 0.774, "blimp/accuracy/existential_there_quantifiers_1": 0.985, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.881, "blimp/accuracy/npi_present_2": 0.613, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.855, "blimp/accuracy/anaphor_number_agreement": 0.965, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.928, "blimp/accuracy/existential_there_object_raising": 0.748, "blimp/accuracy/matrix_question_npi_licensor_present": 0.183, "blimp/accuracy/npi_present_1": 0.537, "blimp/accuracy/wh_vs_that_no_gap": 0.964, "blimp/accuracy/left_branch_island_echo_question": 0.312, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.98, "blimp/accuracy/causative": 0.669, "blimp/accuracy/group_average": 0.7349402985074622, "blimp/accuracy/seq_average": 0.7349402985074627, "cbt/accuracy/NE": 0.7059294871794872, "cbt/accuracy/V": 0.8732, "cbt/accuracy/CN": 0.7624, "cbt/accuracy/P": 0.852, "cbt/accuracy/group_average": 0.7983823717948718, "cbt/accuracy/seq_average": 0.7984193677470989, "hellaswag/accuracy/val": 0.2757418840868353, "hellaswag/accuracy/group_average": 0.2757418840868353, "hellaswag/accuracy/seq_average": 0.2757418840868353, "piqa/accuracy/val": 0.5560391730141458, "piqa/accuracy/group_average": 0.5560391730141458, "piqa/accuracy/seq_average": 0.5560391730141458, "ai2arc/accuracy/ARC-Easy": 0.3141649048625793, "ai2arc/accuracy/ARC-Challenge": 0.18969957081545064, "ai2arc/accuracy/group_average": 0.25193223783901497, "ai2arc/accuracy/seq_average": 0.27308781869688387, "race/accuracy/test/high": 0.25757575757575757, "race/accuracy/test/middle": 0.32381615598885793, "race/accuracy/group_average": 0.29069595678230775, "race/accuracy/seq_average": 0.27685447912444267, "siqa/accuracy/dev": 0.35823950870010235, "siqa/accuracy/group_average": 0.35823950870010235, "siqa/accuracy/seq_average": 0.35823950870010235, "commonsenseqa/accuracy/dev_rand_split": 0.23996723996723995, "commonsenseqa/accuracy/group_average": 0.23996723996723995, "commonsenseqa/accuracy/seq_average": 0.23996723996723995}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std0006/export/result-model-30000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.8063238234747026, "val/accuracy": 0.45164756169394843, "val/perplexity": 16.548969325429177, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.691093113111413, "lambada/accuracy/total": 0.2123447204968944, "lambada/accuracy/openai_last_token": 0.7397127329192547, "lambada/perplexity": 16.280034554191626, "lambada/lm_loss": 3.361711194036105, "lambada/lm_perplexity": 28.838496945070982, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3319961410954214, "mean_loss": 2.748708468293058, "blimp/accuracy/passive_2": 0.902, "blimp/accuracy/determiner_noun_agreement_2": 0.968, "blimp/accuracy/ellipsis_n_bar_1": 0.75, "blimp/accuracy/tough_vs_raising_2": 0.843, "blimp/accuracy/tough_vs_raising_1": 0.645, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.918, "blimp/accuracy/principle_A_reconstruction": 0.515, "blimp/accuracy/wh_vs_that_with_gap": 0.554, "blimp/accuracy/principle_A_domain_2": 0.825, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.905, "blimp/accuracy/principle_A_domain_3": 0.526, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.899, "blimp/accuracy/animate_subject_trans": 0.88, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.841, "blimp/accuracy/distractor_agreement_relative_clause": 0.501, "blimp/accuracy/transitive": 0.815, "blimp/accuracy/sentential_subject_island": 0.316, "blimp/accuracy/adjunct_island": 0.788, "blimp/accuracy/intransitive": 0.766, "blimp/accuracy/existential_there_subject_raising": 0.831, "blimp/accuracy/irregular_past_participle_adjectives": 0.841, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.286, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.258, "blimp/accuracy/only_npi_scope": 0.803, "blimp/accuracy/superlative_quantifiers_2": 0.674, "blimp/accuracy/passive_1": 0.876, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.905, "blimp/accuracy/inchoative": 0.62, "blimp/accuracy/anaphor_gender_agreement": 0.957, "blimp/accuracy/principle_A_c_command": 0.538, "blimp/accuracy/only_npi_licensor_present": 0.368, "blimp/accuracy/expletive_it_object_raising": 0.804, "blimp/accuracy/left_branch_island_simple_question": 0.35, "blimp/accuracy/wh_questions_subject_gap": 0.831, "blimp/accuracy/existential_there_quantifiers_2": 0.364, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.928, "blimp/accuracy/sentential_negation_npi_scope": 0.578, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.777, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.893, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.882, "blimp/accuracy/principle_A_case_2": 0.954, "blimp/accuracy/distractor_agreement_relational_noun": 0.782, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.982, "blimp/accuracy/superlative_quantifiers_1": 0.738, "blimp/accuracy/wh_island": 0.713, "blimp/accuracy/principle_A_domain_1": 0.973, "blimp/accuracy/complex_NP_island": 0.516, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.933, "blimp/accuracy/irregular_past_participle_verbs": 0.844, "blimp/accuracy/drop_argument": 0.763, "blimp/accuracy/wh_questions_object_gap": 0.723, "blimp/accuracy/animate_subject_passive": 0.799, "blimp/accuracy/existential_there_quantifiers_1": 0.977, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.864, "blimp/accuracy/npi_present_2": 0.503, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.892, "blimp/accuracy/anaphor_number_agreement": 0.985, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.938, "blimp/accuracy/existential_there_object_raising": 0.782, "blimp/accuracy/matrix_question_npi_licensor_present": 0.252, "blimp/accuracy/npi_present_1": 0.43, "blimp/accuracy/wh_vs_that_no_gap": 0.946, "blimp/accuracy/left_branch_island_echo_question": 0.381, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.957, "blimp/accuracy/causative": 0.669, "blimp/accuracy/group_average": 0.7433134328358209, "blimp/accuracy/seq_average": 0.7433134328358209, "cbt/accuracy/NE": 0.7255608974358975, "cbt/accuracy/V": 0.8884, "cbt/accuracy/CN": 0.7776, "cbt/accuracy/P": 0.866, "cbt/accuracy/group_average": 0.8143902243589743, "cbt/accuracy/seq_average": 0.8144257703081232, "hellaswag/accuracy/val": 0.2822146982672774, "hellaswag/accuracy/group_average": 0.2822146982672774, "hellaswag/accuracy/seq_average": 0.2822146982672774, "piqa/accuracy/val": 0.5647442872687704, "piqa/accuracy/group_average": 0.5647442872687704, "piqa/accuracy/seq_average": 0.5647442872687704, "ai2arc/accuracy/ARC-Easy": 0.31797040169133195, "ai2arc/accuracy/ARC-Challenge": 0.19914163090128756, "ai2arc/accuracy/group_average": 0.25855601629630975, "ai2arc/accuracy/seq_average": 0.27875354107648725, "race/accuracy/test/high": 0.26357918810749, "race/accuracy/test/middle": 0.33008356545961004, "race/accuracy/group_average": 0.29683137678355, "race/accuracy/seq_average": 0.28293473854884477, "siqa/accuracy/dev": 0.35209825997952915, "siqa/accuracy/group_average": 0.35209825997952915, "siqa/accuracy/seq_average": 0.35209825997952915, "commonsenseqa/accuracy/dev_rand_split": 0.23669123669123668, "commonsenseqa/accuracy/group_average": 0.23669123669123668, "commonsenseqa/accuracy/seq_average": 0.23669123669123668}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std0006/export/result-model-40000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.7537585061693948, "val/accuracy": 0.45876832992311506, "val/perplexity": 15.701535437974526, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5706117997258344, "lambada/accuracy/total": 0.2280667701863354, "lambada/accuracy/openai_last_token": 0.7467003105590062, "lambada/perplexity": 15.329036429403285, "lambada/lm_loss": 3.2972694291724634, "lambada/lm_perplexity": 27.038706923976797, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.34341755005472524, "mean_loss": 2.6621851529476146, "blimp/accuracy/passive_2": 0.88, "blimp/accuracy/determiner_noun_agreement_2": 0.971, "blimp/accuracy/ellipsis_n_bar_1": 0.813, "blimp/accuracy/tough_vs_raising_2": 0.858, "blimp/accuracy/tough_vs_raising_1": 0.637, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.916, "blimp/accuracy/principle_A_reconstruction": 0.267, "blimp/accuracy/wh_vs_that_with_gap": 0.502, "blimp/accuracy/principle_A_domain_2": 0.814, "blimp/accuracy/determiner_noun_agreement_1": 0.979, "blimp/accuracy/ellipsis_n_bar_2": 0.899, "blimp/accuracy/principle_A_domain_3": 0.554, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.907, "blimp/accuracy/animate_subject_trans": 0.885, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.883, "blimp/accuracy/distractor_agreement_relative_clause": 0.559, "blimp/accuracy/transitive": 0.835, "blimp/accuracy/sentential_subject_island": 0.282, "blimp/accuracy/adjunct_island": 0.819, "blimp/accuracy/intransitive": 0.783, "blimp/accuracy/existential_there_subject_raising": 0.845, "blimp/accuracy/irregular_past_participle_adjectives": 0.852, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.331, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.239, "blimp/accuracy/only_npi_scope": 0.741, "blimp/accuracy/superlative_quantifiers_2": 0.592, "blimp/accuracy/passive_1": 0.874, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.882, "blimp/accuracy/inchoative": 0.605, "blimp/accuracy/anaphor_gender_agreement": 0.935, "blimp/accuracy/principle_A_c_command": 0.536, "blimp/accuracy/only_npi_licensor_present": 0.648, "blimp/accuracy/expletive_it_object_raising": 0.757, "blimp/accuracy/left_branch_island_simple_question": 0.429, "blimp/accuracy/wh_questions_subject_gap": 0.901, "blimp/accuracy/existential_there_quantifiers_2": 0.349, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.933, "blimp/accuracy/sentential_negation_npi_scope": 0.596, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.772, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.918, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.878, "blimp/accuracy/principle_A_case_2": 0.96, "blimp/accuracy/distractor_agreement_relational_noun": 0.778, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.983, "blimp/accuracy/superlative_quantifiers_1": 0.79, "blimp/accuracy/wh_island": 0.691, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.52, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.946, "blimp/accuracy/irregular_past_participle_verbs": 0.88, "blimp/accuracy/drop_argument": 0.786, "blimp/accuracy/wh_questions_object_gap": 0.772, "blimp/accuracy/animate_subject_passive": 0.766, "blimp/accuracy/existential_there_quantifiers_1": 0.977, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.872, "blimp/accuracy/npi_present_2": 0.504, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.906, "blimp/accuracy/anaphor_number_agreement": 0.978, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.953, "blimp/accuracy/existential_there_object_raising": 0.798, "blimp/accuracy/matrix_question_npi_licensor_present": 0.188, "blimp/accuracy/npi_present_1": 0.447, "blimp/accuracy/wh_vs_that_no_gap": 0.965, "blimp/accuracy/left_branch_island_echo_question": 0.436, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.649, "blimp/accuracy/group_average": 0.7486865671641793, "blimp/accuracy/seq_average": 0.7486865671641791, "cbt/accuracy/NE": 0.7323717948717948, "cbt/accuracy/V": 0.8928, "cbt/accuracy/CN": 0.7868, "cbt/accuracy/P": 0.8704, "cbt/accuracy/group_average": 0.8205929487179487, "cbt/accuracy/seq_average": 0.8206282513005202, "hellaswag/accuracy/val": 0.28450507866958774, "hellaswag/accuracy/group_average": 0.28450507866958774, "hellaswag/accuracy/seq_average": 0.28450507866958774, "piqa/accuracy/val": 0.5761697497279652, "piqa/accuracy/group_average": 0.5761697497279652, "piqa/accuracy/seq_average": 0.5761697497279652, "ai2arc/accuracy/ARC-Easy": 0.3298097251585624, "ai2arc/accuracy/ARC-Challenge": 0.19570815450643778, "ai2arc/accuracy/group_average": 0.2627589398325001, "ai2arc/accuracy/seq_average": 0.2855524079320113, "race/accuracy/test/high": 0.26815323041738137, "race/accuracy/test/middle": 0.33913649025069637, "race/accuracy/group_average": 0.30364486033403887, "race/accuracy/seq_average": 0.2888123226591001, "siqa/accuracy/dev": 0.372057318321392, "siqa/accuracy/group_average": 0.372057318321392, "siqa/accuracy/seq_average": 0.372057318321392, "commonsenseqa/accuracy/dev_rand_split": 0.25552825552825553, "commonsenseqa/accuracy/group_average": 0.25552825552825553, "commonsenseqa/accuracy/seq_average": 0.25552825552825553}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std0006/export/result-model-50000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.7112567235553073, "val/accuracy": 0.4644349113343254, "val/perplexity": 15.048175032681335, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.645667224196914, "lambada/accuracy/total": 0.23311335403726707, "lambada/accuracy/openai_last_token": 0.7552406832298136, "lambada/perplexity": 13.71216661786763, "lambada/lm_loss": 3.2802709985081884, "lambada/lm_perplexity": 26.582975670582655, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.34877413268579627, "mean_loss": 2.678461973876111, "blimp/accuracy/passive_2": 0.888, "blimp/accuracy/determiner_noun_agreement_2": 0.966, "blimp/accuracy/ellipsis_n_bar_1": 0.819, "blimp/accuracy/tough_vs_raising_2": 0.838, "blimp/accuracy/tough_vs_raising_1": 0.594, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.896, "blimp/accuracy/principle_A_reconstruction": 0.294, "blimp/accuracy/wh_vs_that_with_gap": 0.497, "blimp/accuracy/principle_A_domain_2": 0.861, "blimp/accuracy/determiner_noun_agreement_1": 0.983, "blimp/accuracy/ellipsis_n_bar_2": 0.908, "blimp/accuracy/principle_A_domain_3": 0.55, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.872, "blimp/accuracy/animate_subject_trans": 0.87, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.84, "blimp/accuracy/distractor_agreement_relative_clause": 0.535, "blimp/accuracy/transitive": 0.837, "blimp/accuracy/sentential_subject_island": 0.306, "blimp/accuracy/adjunct_island": 0.793, "blimp/accuracy/intransitive": 0.774, "blimp/accuracy/existential_there_subject_raising": 0.861, "blimp/accuracy/irregular_past_participle_adjectives": 0.858, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.364, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.238, "blimp/accuracy/only_npi_scope": 0.729, "blimp/accuracy/superlative_quantifiers_2": 0.731, "blimp/accuracy/passive_1": 0.874, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.884, "blimp/accuracy/inchoative": 0.604, "blimp/accuracy/anaphor_gender_agreement": 0.93, "blimp/accuracy/principle_A_c_command": 0.568, "blimp/accuracy/only_npi_licensor_present": 0.592, "blimp/accuracy/expletive_it_object_raising": 0.791, "blimp/accuracy/left_branch_island_simple_question": 0.396, "blimp/accuracy/wh_questions_subject_gap": 0.886, "blimp/accuracy/existential_there_quantifiers_2": 0.434, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.919, "blimp/accuracy/sentential_negation_npi_scope": 0.539, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.797, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.887, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.883, "blimp/accuracy/principle_A_case_2": 0.959, "blimp/accuracy/distractor_agreement_relational_noun": 0.734, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.988, "blimp/accuracy/superlative_quantifiers_1": 0.653, "blimp/accuracy/wh_island": 0.764, "blimp/accuracy/principle_A_domain_1": 0.992, "blimp/accuracy/complex_NP_island": 0.478, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.932, "blimp/accuracy/irregular_past_participle_verbs": 0.836, "blimp/accuracy/drop_argument": 0.772, "blimp/accuracy/wh_questions_object_gap": 0.807, "blimp/accuracy/animate_subject_passive": 0.782, "blimp/accuracy/existential_there_quantifiers_1": 0.98, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.908, "blimp/accuracy/npi_present_2": 0.527, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.898, "blimp/accuracy/anaphor_number_agreement": 0.985, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.942, "blimp/accuracy/existential_there_object_raising": 0.819, "blimp/accuracy/matrix_question_npi_licensor_present": 0.195, "blimp/accuracy/npi_present_1": 0.531, "blimp/accuracy/wh_vs_that_no_gap": 0.978, "blimp/accuracy/left_branch_island_echo_question": 0.469, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.975, "blimp/accuracy/causative": 0.645, "blimp/accuracy/group_average": 0.7497761194029852, "blimp/accuracy/seq_average": 0.749776119402985, "cbt/accuracy/NE": 0.7327724358974359, "cbt/accuracy/V": 0.9004, "cbt/accuracy/CN": 0.7976, "cbt/accuracy/P": 0.8824, "cbt/accuracy/group_average": 0.828293108974359, "cbt/accuracy/seq_average": 0.8283313325330132, "hellaswag/accuracy/val": 0.28510256920932087, "hellaswag/accuracy/group_average": 0.28510256920932087, "hellaswag/accuracy/seq_average": 0.28510256920932087, "piqa/accuracy/val": 0.5767138193688792, "piqa/accuracy/group_average": 0.5767138193688792, "piqa/accuracy/seq_average": 0.5767138193688792, "ai2arc/accuracy/ARC-Easy": 0.3230443974630021, "ai2arc/accuracy/ARC-Challenge": 0.2034334763948498, "ai2arc/accuracy/group_average": 0.26323893692892597, "ai2arc/accuracy/seq_average": 0.28356940509915013, "race/accuracy/test/high": 0.2672955974842767, "race/accuracy/test/middle": 0.3474930362116992, "race/accuracy/group_average": 0.30739431684798796, "race/accuracy/seq_average": 0.29063640048642075, "siqa/accuracy/dev": 0.35363357215967245, "siqa/accuracy/group_average": 0.35363357215967245, "siqa/accuracy/seq_average": 0.35363357215967245, "commonsenseqa/accuracy/dev_rand_split": 0.24406224406224405, "commonsenseqa/accuracy/group_average": 0.24406224406224405, "commonsenseqa/accuracy/seq_average": 0.24406224406224405}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std0006/export/result-model-60000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.6793312193855408, "val/accuracy": 0.46882362971230157, "val/perplexity": 14.575342329216605, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6293422154017856, "lambada/accuracy/total": 0.23524844720496896, "lambada/accuracy/openai_last_token": 0.7538819875776398, "lambada/perplexity": 13.814436701668063, "lambada/lm_loss": 3.2271723461673845, "lambada/lm_perplexity": 25.208275820195425, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3520360384586353, "mean_loss": 2.6543367173936634, "blimp/accuracy/passive_2": 0.899, "blimp/accuracy/determiner_noun_agreement_2": 0.978, "blimp/accuracy/ellipsis_n_bar_1": 0.818, "blimp/accuracy/tough_vs_raising_2": 0.882, "blimp/accuracy/tough_vs_raising_1": 0.569, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.91, "blimp/accuracy/principle_A_reconstruction": 0.285, "blimp/accuracy/wh_vs_that_with_gap": 0.525, "blimp/accuracy/principle_A_domain_2": 0.843, "blimp/accuracy/determiner_noun_agreement_1": 0.986, "blimp/accuracy/ellipsis_n_bar_2": 0.901, "blimp/accuracy/principle_A_domain_3": 0.587, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.91, "blimp/accuracy/animate_subject_trans": 0.904, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.875, "blimp/accuracy/distractor_agreement_relative_clause": 0.572, "blimp/accuracy/transitive": 0.858, "blimp/accuracy/sentential_subject_island": 0.325, "blimp/accuracy/adjunct_island": 0.783, "blimp/accuracy/intransitive": 0.806, "blimp/accuracy/existential_there_subject_raising": 0.858, "blimp/accuracy/irregular_past_participle_adjectives": 0.98, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.308, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.282, "blimp/accuracy/only_npi_scope": 0.745, "blimp/accuracy/superlative_quantifiers_2": 0.639, "blimp/accuracy/passive_1": 0.88, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.895, "blimp/accuracy/inchoative": 0.645, "blimp/accuracy/anaphor_gender_agreement": 0.961, "blimp/accuracy/principle_A_c_command": 0.583, "blimp/accuracy/only_npi_licensor_present": 0.573, "blimp/accuracy/expletive_it_object_raising": 0.741, "blimp/accuracy/left_branch_island_simple_question": 0.352, "blimp/accuracy/wh_questions_subject_gap": 0.901, "blimp/accuracy/existential_there_quantifiers_2": 0.367, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.931, "blimp/accuracy/sentential_negation_npi_scope": 0.626, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.779, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.869, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.898, "blimp/accuracy/principle_A_case_2": 0.964, "blimp/accuracy/distractor_agreement_relational_noun": 0.746, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.992, "blimp/accuracy/superlative_quantifiers_1": 0.741, "blimp/accuracy/wh_island": 0.763, "blimp/accuracy/principle_A_domain_1": 0.991, "blimp/accuracy/complex_NP_island": 0.488, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.947, "blimp/accuracy/irregular_past_participle_verbs": 0.888, "blimp/accuracy/drop_argument": 0.797, "blimp/accuracy/wh_questions_object_gap": 0.801, "blimp/accuracy/animate_subject_passive": 0.785, "blimp/accuracy/existential_there_quantifiers_1": 0.965, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.885, "blimp/accuracy/npi_present_2": 0.521, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.925, "blimp/accuracy/anaphor_number_agreement": 0.985, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.95, "blimp/accuracy/existential_there_object_raising": 0.79, "blimp/accuracy/matrix_question_npi_licensor_present": 0.238, "blimp/accuracy/npi_present_1": 0.496, "blimp/accuracy/wh_vs_that_no_gap": 0.968, "blimp/accuracy/left_branch_island_echo_question": 0.43, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.958, "blimp/accuracy/causative": 0.698, "blimp/accuracy/group_average": 0.757328358208955, "blimp/accuracy/seq_average": 0.7573283582089553, "cbt/accuracy/NE": 0.7387820512820513, "cbt/accuracy/V": 0.8984, "cbt/accuracy/CN": 0.7992, "cbt/accuracy/P": 0.8876, "cbt/accuracy/group_average": 0.8309955128205128, "cbt/accuracy/seq_average": 0.831032412965186, "hellaswag/accuracy/val": 0.28589922326229833, "hellaswag/accuracy/group_average": 0.28589922326229833, "hellaswag/accuracy/seq_average": 0.28589922326229833, "piqa/accuracy/val": 0.5707290533188248, "piqa/accuracy/group_average": 0.5707290533188248, "piqa/accuracy/seq_average": 0.5707290533188248, "ai2arc/accuracy/ARC-Easy": 0.32515856236786467, "ai2arc/accuracy/ARC-Challenge": 0.21030042918454936, "ai2arc/accuracy/group_average": 0.267729495776207, "ai2arc/accuracy/seq_average": 0.28725212464589234, "race/accuracy/test/high": 0.26472269868496284, "race/accuracy/test/middle": 0.3384401114206128, "race/accuracy/group_average": 0.30158140505278785, "race/accuracy/seq_average": 0.28617754357519254, "siqa/accuracy/dev": 0.3546571136131013, "siqa/accuracy/group_average": 0.3546571136131013, "siqa/accuracy/seq_average": 0.3546571136131013, "commonsenseqa/accuracy/dev_rand_split": 0.25307125307125306, "commonsenseqa/accuracy/group_average": 0.25307125307125306, "commonsenseqa/accuracy/seq_average": 0.25307125307125306}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std0006/export/result-model-70000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.652193341936384, "val/accuracy": 0.47311159164186506, "val/perplexity": 14.185117362573461, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5909475006672165, "lambada/accuracy/total": 0.24572981366459629, "lambada/accuracy/openai_last_token": 0.7544642857142857, "lambada/perplexity": 13.001072036266239, "lambada/lm_loss": 3.216675711124893, "lambada/lm_perplexity": 24.9450576183242, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3594207026532307, "mean_loss": 2.6215704213018003, "blimp/accuracy/passive_2": 0.903, "blimp/accuracy/determiner_noun_agreement_2": 0.979, "blimp/accuracy/ellipsis_n_bar_1": 0.829, "blimp/accuracy/tough_vs_raising_2": 0.845, "blimp/accuracy/tough_vs_raising_1": 0.62, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.913, "blimp/accuracy/principle_A_reconstruction": 0.263, "blimp/accuracy/wh_vs_that_with_gap": 0.531, "blimp/accuracy/principle_A_domain_2": 0.845, "blimp/accuracy/determiner_noun_agreement_1": 0.983, "blimp/accuracy/ellipsis_n_bar_2": 0.92, "blimp/accuracy/principle_A_domain_3": 0.602, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.901, "blimp/accuracy/animate_subject_trans": 0.898, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.878, "blimp/accuracy/distractor_agreement_relative_clause": 0.589, "blimp/accuracy/transitive": 0.849, "blimp/accuracy/sentential_subject_island": 0.326, "blimp/accuracy/adjunct_island": 0.815, "blimp/accuracy/intransitive": 0.761, "blimp/accuracy/existential_there_subject_raising": 0.87, "blimp/accuracy/irregular_past_participle_adjectives": 0.863, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.388, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.31, "blimp/accuracy/only_npi_scope": 0.688, "blimp/accuracy/superlative_quantifiers_2": 0.727, "blimp/accuracy/passive_1": 0.905, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.893, "blimp/accuracy/inchoative": 0.609, "blimp/accuracy/anaphor_gender_agreement": 0.966, "blimp/accuracy/principle_A_c_command": 0.567, "blimp/accuracy/only_npi_licensor_present": 0.619, "blimp/accuracy/expletive_it_object_raising": 0.759, "blimp/accuracy/left_branch_island_simple_question": 0.482, "blimp/accuracy/wh_questions_subject_gap": 0.897, "blimp/accuracy/existential_there_quantifiers_2": 0.419, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.93, "blimp/accuracy/sentential_negation_npi_scope": 0.601, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.811, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.886, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.902, "blimp/accuracy/principle_A_case_2": 0.957, "blimp/accuracy/distractor_agreement_relational_noun": 0.827, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.991, "blimp/accuracy/superlative_quantifiers_1": 0.796, "blimp/accuracy/wh_island": 0.756, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.51, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.953, "blimp/accuracy/irregular_past_participle_verbs": 0.881, "blimp/accuracy/drop_argument": 0.766, "blimp/accuracy/wh_questions_object_gap": 0.81, "blimp/accuracy/animate_subject_passive": 0.782, "blimp/accuracy/existential_there_quantifiers_1": 0.993, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.896, "blimp/accuracy/npi_present_2": 0.448, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.914, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.948, "blimp/accuracy/existential_there_object_raising": 0.787, "blimp/accuracy/matrix_question_npi_licensor_present": 0.228, "blimp/accuracy/npi_present_1": 0.459, "blimp/accuracy/wh_vs_that_no_gap": 0.971, "blimp/accuracy/left_branch_island_echo_question": 0.482, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.687, "blimp/accuracy/group_average": 0.7632238805970148, "blimp/accuracy/seq_average": 0.763223880597015, "cbt/accuracy/NE": 0.7471955128205128, "cbt/accuracy/V": 0.9016, "cbt/accuracy/CN": 0.7968, "cbt/accuracy/P": 0.8892, "cbt/accuracy/group_average": 0.8336988782051282, "cbt/accuracy/seq_average": 0.8337334933973589, "hellaswag/accuracy/val": 0.28818960366460866, "hellaswag/accuracy/group_average": 0.28818960366460866, "hellaswag/accuracy/seq_average": 0.28818960366460866, "piqa/accuracy/val": 0.5772578890097932, "piqa/accuracy/group_average": 0.5772578890097932, "piqa/accuracy/seq_average": 0.5772578890097932, "ai2arc/accuracy/ARC-Easy": 0.3331923890063425, "ai2arc/accuracy/ARC-Challenge": 0.2111587982832618, "ai2arc/accuracy/group_average": 0.27217559364480215, "ai2arc/accuracy/seq_average": 0.2929178470254957, "race/accuracy/test/high": 0.26472269868496284, "race/accuracy/test/middle": 0.34331476323119775, "race/accuracy/group_average": 0.3040187309580803, "race/accuracy/seq_average": 0.2875962707742197, "siqa/accuracy/dev": 0.36131013306038895, "siqa/accuracy/group_average": 0.36131013306038895, "siqa/accuracy/seq_average": 0.36131013306038895, "commonsenseqa/accuracy/dev_rand_split": 0.26126126126126126, "commonsenseqa/accuracy/group_average": 0.26126126126126126, "commonsenseqa/accuracy/seq_average": 0.26126126126126126}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std0006/export/result-model-80000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.6311536516462053, "val/accuracy": 0.47644624255952384, "val/perplexity": 13.889784635552175, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5041306774068324, "lambada/accuracy/total": 0.2521350931677019, "lambada/accuracy/openai_last_token": 0.7577639751552795, "lambada/perplexity": 12.239647203829351, "lambada/lm_loss": 3.1924376980003015, "lambada/lm_perplexity": 24.347707518207987, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.36429066786361286, "mean_loss": 2.567642164526519, "blimp/accuracy/passive_2": 0.917, "blimp/accuracy/determiner_noun_agreement_2": 0.98, "blimp/accuracy/ellipsis_n_bar_1": 0.826, "blimp/accuracy/tough_vs_raising_2": 0.842, "blimp/accuracy/tough_vs_raising_1": 0.627, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.923, "blimp/accuracy/principle_A_reconstruction": 0.295, "blimp/accuracy/wh_vs_that_with_gap": 0.512, "blimp/accuracy/principle_A_domain_2": 0.858, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.916, "blimp/accuracy/principle_A_domain_3": 0.624, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.904, "blimp/accuracy/animate_subject_trans": 0.904, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.887, "blimp/accuracy/distractor_agreement_relative_clause": 0.554, "blimp/accuracy/transitive": 0.858, "blimp/accuracy/sentential_subject_island": 0.332, "blimp/accuracy/adjunct_island": 0.786, "blimp/accuracy/intransitive": 0.802, "blimp/accuracy/existential_there_subject_raising": 0.862, "blimp/accuracy/irregular_past_participle_adjectives": 0.942, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.295, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.279, "blimp/accuracy/only_npi_scope": 0.735, "blimp/accuracy/superlative_quantifiers_2": 0.754, "blimp/accuracy/passive_1": 0.899, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.897, "blimp/accuracy/inchoative": 0.64, "blimp/accuracy/anaphor_gender_agreement": 0.959, "blimp/accuracy/principle_A_c_command": 0.578, "blimp/accuracy/only_npi_licensor_present": 0.711, "blimp/accuracy/expletive_it_object_raising": 0.765, "blimp/accuracy/left_branch_island_simple_question": 0.374, "blimp/accuracy/wh_questions_subject_gap": 0.904, "blimp/accuracy/existential_there_quantifiers_2": 0.41, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.934, "blimp/accuracy/sentential_negation_npi_scope": 0.663, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.796, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.876, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.874, "blimp/accuracy/principle_A_case_2": 0.964, "blimp/accuracy/distractor_agreement_relational_noun": 0.793, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.991, "blimp/accuracy/superlative_quantifiers_1": 0.781, "blimp/accuracy/wh_island": 0.792, "blimp/accuracy/principle_A_domain_1": 0.983, "blimp/accuracy/complex_NP_island": 0.498, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.943, "blimp/accuracy/irregular_past_participle_verbs": 0.863, "blimp/accuracy/drop_argument": 0.795, "blimp/accuracy/wh_questions_object_gap": 0.799, "blimp/accuracy/animate_subject_passive": 0.771, "blimp/accuracy/existential_there_quantifiers_1": 0.974, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.885, "blimp/accuracy/npi_present_2": 0.54, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.912, "blimp/accuracy/anaphor_number_agreement": 0.984, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.953, "blimp/accuracy/existential_there_object_raising": 0.798, "blimp/accuracy/matrix_question_npi_licensor_present": 0.272, "blimp/accuracy/npi_present_1": 0.535, "blimp/accuracy/wh_vs_that_no_gap": 0.97, "blimp/accuracy/left_branch_island_echo_question": 0.415, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.966, "blimp/accuracy/causative": 0.689, "blimp/accuracy/group_average": 0.7663880597014923, "blimp/accuracy/seq_average": 0.7663880597014925, "cbt/accuracy/NE": 0.7431891025641025, "cbt/accuracy/V": 0.902, "cbt/accuracy/CN": 0.8104, "cbt/accuracy/P": 0.8884, "cbt/accuracy/group_average": 0.8359972756410257, "cbt/accuracy/seq_average": 0.8360344137655062, "hellaswag/accuracy/val": 0.28799044015136427, "hellaswag/accuracy/group_average": 0.28799044015136427, "hellaswag/accuracy/seq_average": 0.28799044015136427, "piqa/accuracy/val": 0.5897714907508161, "piqa/accuracy/group_average": 0.5897714907508161, "piqa/accuracy/seq_average": 0.5897714907508161, "ai2arc/accuracy/ARC-Easy": 0.3399577167019027, "ai2arc/accuracy/ARC-Challenge": 0.20772532188841203, "ai2arc/accuracy/group_average": 0.2738415192951574, "ai2arc/accuracy/seq_average": 0.2963172804532578, "race/accuracy/test/high": 0.27072612921669525, "race/accuracy/test/middle": 0.3398328690807799, "race/accuracy/group_average": 0.3052794991487376, "race/accuracy/seq_average": 0.2908390758005675, "siqa/accuracy/dev": 0.3633572159672467, "siqa/accuracy/group_average": 0.3633572159672467, "siqa/accuracy/seq_average": 0.3633572159672467, "commonsenseqa/accuracy/dev_rand_split": 0.26535626535626533, "commonsenseqa/accuracy/group_average": 0.26535626535626533, "commonsenseqa/accuracy/seq_average": 0.26535626535626533}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std0006/export/result-model-90000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.619107200985863, "val/accuracy": 0.4775051540798611, "val/perplexity": 13.723465817278425, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5411983513684007, "lambada/accuracy/total": 0.25407608695652173, "lambada/accuracy/openai_last_token": 0.7598990683229814, "lambada/perplexity": 12.0125206162249, "lambada/lm_loss": 3.1678107557116277, "lambada/lm_perplexity": 23.755420967813002, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.36579062051819144, "mean_loss": 2.580152776177132, "blimp/accuracy/passive_2": 0.901, "blimp/accuracy/determiner_noun_agreement_2": 0.983, "blimp/accuracy/ellipsis_n_bar_1": 0.812, "blimp/accuracy/tough_vs_raising_2": 0.874, "blimp/accuracy/tough_vs_raising_1": 0.605, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.925, "blimp/accuracy/principle_A_reconstruction": 0.279, "blimp/accuracy/wh_vs_that_with_gap": 0.525, "blimp/accuracy/principle_A_domain_2": 0.837, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.924, "blimp/accuracy/principle_A_domain_3": 0.585, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.906, "blimp/accuracy/animate_subject_trans": 0.899, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.894, "blimp/accuracy/distractor_agreement_relative_clause": 0.582, "blimp/accuracy/transitive": 0.867, "blimp/accuracy/sentential_subject_island": 0.325, "blimp/accuracy/adjunct_island": 0.768, "blimp/accuracy/intransitive": 0.795, "blimp/accuracy/existential_there_subject_raising": 0.884, "blimp/accuracy/irregular_past_participle_adjectives": 0.928, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.336, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.284, "blimp/accuracy/only_npi_scope": 0.761, "blimp/accuracy/superlative_quantifiers_2": 0.647, "blimp/accuracy/passive_1": 0.885, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.877, "blimp/accuracy/inchoative": 0.65, "blimp/accuracy/anaphor_gender_agreement": 0.953, "blimp/accuracy/principle_A_c_command": 0.61, "blimp/accuracy/only_npi_licensor_present": 0.717, "blimp/accuracy/expletive_it_object_raising": 0.783, "blimp/accuracy/left_branch_island_simple_question": 0.404, "blimp/accuracy/wh_questions_subject_gap": 0.912, "blimp/accuracy/existential_there_quantifiers_2": 0.432, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.931, "blimp/accuracy/sentential_negation_npi_scope": 0.618, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.793, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.864, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.899, "blimp/accuracy/principle_A_case_2": 0.966, "blimp/accuracy/distractor_agreement_relational_noun": 0.796, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.981, "blimp/accuracy/superlative_quantifiers_1": 0.786, "blimp/accuracy/wh_island": 0.745, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.537, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.958, "blimp/accuracy/irregular_past_participle_verbs": 0.888, "blimp/accuracy/drop_argument": 0.78, "blimp/accuracy/wh_questions_object_gap": 0.809, "blimp/accuracy/animate_subject_passive": 0.778, "blimp/accuracy/existential_there_quantifiers_1": 0.983, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.891, "blimp/accuracy/npi_present_2": 0.528, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.937, "blimp/accuracy/anaphor_number_agreement": 0.99, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.952, "blimp/accuracy/existential_there_object_raising": 0.804, "blimp/accuracy/matrix_question_npi_licensor_present": 0.253, "blimp/accuracy/npi_present_1": 0.515, "blimp/accuracy/wh_vs_that_no_gap": 0.967, "blimp/accuracy/left_branch_island_echo_question": 0.416, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.965, "blimp/accuracy/causative": 0.691, "blimp/accuracy/group_average": 0.7663432835820893, "blimp/accuracy/seq_average": 0.7663432835820896, "cbt/accuracy/NE": 0.7564102564102564, "cbt/accuracy/V": 0.904, "cbt/accuracy/CN": 0.812, "cbt/accuracy/P": 0.894, "cbt/accuracy/group_average": 0.8416025641025642, "cbt/accuracy/seq_average": 0.8416366546618648, "hellaswag/accuracy/val": 0.29038040231029677, "hellaswag/accuracy/group_average": 0.29038040231029677, "hellaswag/accuracy/seq_average": 0.29038040231029677, "piqa/accuracy/val": 0.5875952121871599, "piqa/accuracy/group_average": 0.5875952121871599, "piqa/accuracy/seq_average": 0.5875952121871599, "ai2arc/accuracy/ARC-Easy": 0.33446088794926004, "ai2arc/accuracy/ARC-Challenge": 0.21545064377682405, "ai2arc/accuracy/group_average": 0.27495576586304205, "ai2arc/accuracy/seq_average": 0.2951841359773371, "race/accuracy/test/high": 0.2718696397941681, "race/accuracy/test/middle": 0.3398328690807799, "race/accuracy/group_average": 0.305851254437474, "race/accuracy/seq_average": 0.29164977705715445, "siqa/accuracy/dev": 0.3587512794268168, "siqa/accuracy/group_average": 0.3587512794268168, "siqa/accuracy/seq_average": 0.3587512794268168, "commonsenseqa/accuracy/dev_rand_split": 0.26126126126126126, "commonsenseqa/accuracy/group_average": 0.26126126126126126, "commonsenseqa/accuracy/seq_average": 0.26126126126126126}