Upload folder using huggingface_hub

#524
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb_postln/export/result-model-10000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 3.256010509672619, "val/accuracy": 0.39602515811011907, "val/perplexity": 25.945819793300284, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.889905846637228, "lambada/accuracy/total": 0.1469332298136646, "lambada/accuracy/openai_last_token": 0.7069099378881988, "lambada/perplexity": 36.4411387800352, "lambada/lm_loss": 3.8096153765567644, "lambada/lm_perplexity": 45.13307628829967, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.27147919396189185, "mean_loss": 3.0729581781549236, "blimp/accuracy/passive_2": 0.843, "blimp/accuracy/determiner_noun_agreement_2": 0.95, "blimp/accuracy/ellipsis_n_bar_1": 0.643, "blimp/accuracy/tough_vs_raising_2": 0.793, "blimp/accuracy/tough_vs_raising_1": 0.408, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.842, "blimp/accuracy/principle_A_reconstruction": 0.564, "blimp/accuracy/wh_vs_that_with_gap": 0.31, "blimp/accuracy/principle_A_domain_2": 0.728, "blimp/accuracy/determiner_noun_agreement_1": 0.967, "blimp/accuracy/ellipsis_n_bar_2": 0.807, "blimp/accuracy/principle_A_domain_3": 0.47, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.823, "blimp/accuracy/animate_subject_trans": 0.767, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.719, "blimp/accuracy/distractor_agreement_relative_clause": 0.444, "blimp/accuracy/transitive": 0.754, "blimp/accuracy/sentential_subject_island": 0.354, "blimp/accuracy/adjunct_island": 0.62, "blimp/accuracy/intransitive": 0.629, "blimp/accuracy/existential_there_subject_raising": 0.743, "blimp/accuracy/irregular_past_participle_adjectives": 0.866, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.109, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.126, "blimp/accuracy/only_npi_scope": 0.788, "blimp/accuracy/superlative_quantifiers_2": 0.54, "blimp/accuracy/passive_1": 0.837, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.803, "blimp/accuracy/inchoative": 0.464, "blimp/accuracy/anaphor_gender_agreement": 0.749, "blimp/accuracy/principle_A_c_command": 0.647, "blimp/accuracy/only_npi_licensor_present": 0.382, "blimp/accuracy/expletive_it_object_raising": 0.718, "blimp/accuracy/left_branch_island_simple_question": 0.122, "blimp/accuracy/wh_questions_subject_gap": 0.861, "blimp/accuracy/existential_there_quantifiers_2": 0.537, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.867, "blimp/accuracy/sentential_negation_npi_scope": 0.361, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.793, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.893, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.726, "blimp/accuracy/principle_A_case_2": 0.853, "blimp/accuracy/distractor_agreement_relational_noun": 0.542, "blimp/accuracy/sentential_negation_npi_licensor_present": 1.0, "blimp/accuracy/superlative_quantifiers_1": 0.55, "blimp/accuracy/wh_island": 0.644, "blimp/accuracy/principle_A_domain_1": 0.98, "blimp/accuracy/complex_NP_island": 0.502, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.848, "blimp/accuracy/irregular_past_participle_verbs": 0.784, "blimp/accuracy/drop_argument": 0.714, "blimp/accuracy/wh_questions_object_gap": 0.674, "blimp/accuracy/animate_subject_passive": 0.737, "blimp/accuracy/existential_there_quantifiers_1": 0.909, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.809, "blimp/accuracy/npi_present_2": 0.529, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.779, "blimp/accuracy/anaphor_number_agreement": 0.957, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.897, "blimp/accuracy/existential_there_object_raising": 0.739, "blimp/accuracy/matrix_question_npi_licensor_present": 0.059, "blimp/accuracy/npi_present_1": 0.504, "blimp/accuracy/wh_vs_that_no_gap": 0.956, "blimp/accuracy/left_branch_island_echo_question": 0.308, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.966, "blimp/accuracy/causative": 0.581, "blimp/accuracy/group_average": 0.6744477611940299, "blimp/accuracy/seq_average": 0.6744477611940298, "cbt/accuracy/NE": 0.6682692307692307, "cbt/accuracy/V": 0.828, "cbt/accuracy/CN": 0.6808, "cbt/accuracy/P": 0.784, "cbt/accuracy/group_average": 0.7402673076923076, "cbt/accuracy/seq_average": 0.740296118447379, "hellaswag/accuracy/val": 0.2695678151762597, "hellaswag/accuracy/group_average": 0.2695678151762597, "hellaswag/accuracy/seq_average": 0.2695678151762597, "piqa/accuracy/val": 0.5478781284004353, "piqa/accuracy/group_average": 0.5478781284004353, "piqa/accuracy/seq_average": 0.5478781284004353, "ai2arc/accuracy/ARC-Easy": 0.29471458773784354, "ai2arc/accuracy/ARC-Challenge": 0.19742489270386265, "ai2arc/accuracy/group_average": 0.2460697402208531, "ai2arc/accuracy/seq_average": 0.26260623229461755, "race/accuracy/test/high": 0.25728987993138935, "race/accuracy/test/middle": 0.32172701949860727, "race/accuracy/group_average": 0.28950844971499834, "race/accuracy/seq_average": 0.2760437778678557, "siqa/accuracy/dev": 0.3556806550665302, "siqa/accuracy/group_average": 0.3556806550665302, "siqa/accuracy/seq_average": 0.3556806550665302, "commonsenseqa/accuracy/dev_rand_split": 0.2285012285012285, "commonsenseqa/accuracy/group_average": 0.2285012285012285, "commonsenseqa/accuracy/seq_average": 0.2285012285012285}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb_postln/export/result-model-100000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.7842617943173362, "val/accuracy": 0.4566853841145833, "val/perplexity": 16.187863492088137, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.788019594938859, "lambada/accuracy/total": 0.21312111801242237, "lambada/accuracy/openai_last_token": 0.7465062111801242, "lambada/perplexity": 16.654925042080798, "lambada/lm_loss": 3.360022691463149, "lambada/lm_perplexity": 28.789844155518466, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3349032510635028, "mean_loss": 2.7861406946280978, "blimp/accuracy/passive_2": 0.886, "blimp/accuracy/determiner_noun_agreement_2": 0.968, "blimp/accuracy/ellipsis_n_bar_1": 0.762, "blimp/accuracy/tough_vs_raising_2": 0.861, "blimp/accuracy/tough_vs_raising_1": 0.571, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.914, "blimp/accuracy/principle_A_reconstruction": 0.373, "blimp/accuracy/wh_vs_that_with_gap": 0.464, "blimp/accuracy/principle_A_domain_2": 0.815, "blimp/accuracy/determiner_noun_agreement_1": 0.978, "blimp/accuracy/ellipsis_n_bar_2": 0.895, "blimp/accuracy/principle_A_domain_3": 0.52, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.894, "blimp/accuracy/animate_subject_trans": 0.836, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.856, "blimp/accuracy/distractor_agreement_relative_clause": 0.513, "blimp/accuracy/transitive": 0.843, "blimp/accuracy/sentential_subject_island": 0.388, "blimp/accuracy/adjunct_island": 0.763, "blimp/accuracy/intransitive": 0.77, "blimp/accuracy/existential_there_subject_raising": 0.829, "blimp/accuracy/irregular_past_participle_adjectives": 0.909, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.279, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.169, "blimp/accuracy/only_npi_scope": 0.806, "blimp/accuracy/superlative_quantifiers_2": 0.673, "blimp/accuracy/passive_1": 0.897, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.9, "blimp/accuracy/inchoative": 0.587, "blimp/accuracy/anaphor_gender_agreement": 0.861, "blimp/accuracy/principle_A_c_command": 0.64, "blimp/accuracy/only_npi_licensor_present": 0.565, "blimp/accuracy/expletive_it_object_raising": 0.759, "blimp/accuracy/left_branch_island_simple_question": 0.273, "blimp/accuracy/wh_questions_subject_gap": 0.933, "blimp/accuracy/existential_there_quantifiers_2": 0.466, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.9, "blimp/accuracy/sentential_negation_npi_scope": 0.532, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.817, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.89, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.845, "blimp/accuracy/principle_A_case_2": 0.934, "blimp/accuracy/distractor_agreement_relational_noun": 0.758, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.483, "blimp/accuracy/wh_island": 0.817, "blimp/accuracy/principle_A_domain_1": 0.981, "blimp/accuracy/complex_NP_island": 0.461, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.95, "blimp/accuracy/irregular_past_participle_verbs": 0.88, "blimp/accuracy/drop_argument": 0.753, "blimp/accuracy/wh_questions_object_gap": 0.758, "blimp/accuracy/animate_subject_passive": 0.796, "blimp/accuracy/existential_there_quantifiers_1": 0.98, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.853, "blimp/accuracy/npi_present_2": 0.55, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.899, "blimp/accuracy/anaphor_number_agreement": 0.968, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.951, "blimp/accuracy/existential_there_object_raising": 0.82, "blimp/accuracy/matrix_question_npi_licensor_present": 0.169, "blimp/accuracy/npi_present_1": 0.523, "blimp/accuracy/wh_vs_that_no_gap": 0.963, "blimp/accuracy/left_branch_island_echo_question": 0.28, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.984, "blimp/accuracy/causative": 0.671, "blimp/accuracy/group_average": 0.7398656716417912, "blimp/accuracy/seq_average": 0.7398656716417911, "cbt/accuracy/NE": 0.7387820512820513, "cbt/accuracy/V": 0.8848, "cbt/accuracy/CN": 0.7804, "cbt/accuracy/P": 0.8736, "cbt/accuracy/group_average": 0.8193955128205129, "cbt/accuracy/seq_average": 0.8194277711084433, "hellaswag/accuracy/val": 0.2815176259709221, "hellaswag/accuracy/group_average": 0.2815176259709221, "hellaswag/accuracy/seq_average": 0.2815176259709221, "piqa/accuracy/val": 0.5756256800870512, "piqa/accuracy/group_average": 0.5756256800870512, "piqa/accuracy/seq_average": 0.5756256800870512, "ai2arc/accuracy/ARC-Easy": 0.30697674418604654, "ai2arc/accuracy/ARC-Challenge": 0.2145922746781116, "ai2arc/accuracy/group_average": 0.2607845094320791, "ai2arc/accuracy/seq_average": 0.27648725212464587, "race/accuracy/test/high": 0.2655803316180675, "race/accuracy/test/middle": 0.32590529247910865, "race/accuracy/group_average": 0.2957428120485881, "race/accuracy/seq_average": 0.28313741386299146, "siqa/accuracy/dev": 0.35670419651995905, "siqa/accuracy/group_average": 0.35670419651995905, "siqa/accuracy/seq_average": 0.35670419651995905, "commonsenseqa/accuracy/dev_rand_split": 0.24324324324324326, "commonsenseqa/accuracy/group_average": 0.24324324324324326, "commonsenseqa/accuracy/seq_average": 0.24324324324324326}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb_postln/export/result-model-20000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 3.0811060345362105, "val/accuracy": 0.41661531963045634, "val/perplexity": 21.782481254269683, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.710398608853358, "lambada/accuracy/total": 0.16770186335403728, "lambada/accuracy/openai_last_token": 0.7232142857142857, "lambada/perplexity": 24.723021316005735, "lambada/lm_loss": 3.6096392623074474, "lambada/lm_perplexity": 36.952720171165446, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.2921585914922468, "mean_loss": 2.8957523216947845, "blimp/accuracy/passive_2": 0.841, "blimp/accuracy/determiner_noun_agreement_2": 0.959, "blimp/accuracy/ellipsis_n_bar_1": 0.711, "blimp/accuracy/tough_vs_raising_2": 0.788, "blimp/accuracy/tough_vs_raising_1": 0.472, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.863, "blimp/accuracy/principle_A_reconstruction": 0.591, "blimp/accuracy/wh_vs_that_with_gap": 0.362, "blimp/accuracy/principle_A_domain_2": 0.772, "blimp/accuracy/determiner_noun_agreement_1": 0.976, "blimp/accuracy/ellipsis_n_bar_2": 0.874, "blimp/accuracy/principle_A_domain_3": 0.488, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.869, "blimp/accuracy/animate_subject_trans": 0.759, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.766, "blimp/accuracy/distractor_agreement_relative_clause": 0.522, "blimp/accuracy/transitive": 0.797, "blimp/accuracy/sentential_subject_island": 0.348, "blimp/accuracy/adjunct_island": 0.708, "blimp/accuracy/intransitive": 0.65, "blimp/accuracy/existential_there_subject_raising": 0.785, "blimp/accuracy/irregular_past_participle_adjectives": 0.922, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.249, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.114, "blimp/accuracy/only_npi_scope": 0.569, "blimp/accuracy/superlative_quantifiers_2": 0.801, "blimp/accuracy/passive_1": 0.87, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.826, "blimp/accuracy/inchoative": 0.481, "blimp/accuracy/anaphor_gender_agreement": 0.824, "blimp/accuracy/principle_A_c_command": 0.595, "blimp/accuracy/only_npi_licensor_present": 0.174, "blimp/accuracy/expletive_it_object_raising": 0.722, "blimp/accuracy/left_branch_island_simple_question": 0.253, "blimp/accuracy/wh_questions_subject_gap": 0.904, "blimp/accuracy/existential_there_quantifiers_2": 0.491, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.899, "blimp/accuracy/sentential_negation_npi_scope": 0.345, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.782, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.924, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.817, "blimp/accuracy/principle_A_case_2": 0.888, "blimp/accuracy/distractor_agreement_relational_noun": 0.696, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.996, "blimp/accuracy/superlative_quantifiers_1": 0.487, "blimp/accuracy/wh_island": 0.729, "blimp/accuracy/principle_A_domain_1": 0.985, "blimp/accuracy/complex_NP_island": 0.469, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.914, "blimp/accuracy/irregular_past_participle_verbs": 0.791, "blimp/accuracy/drop_argument": 0.742, "blimp/accuracy/wh_questions_object_gap": 0.744, "blimp/accuracy/animate_subject_passive": 0.726, "blimp/accuracy/existential_there_quantifiers_1": 0.921, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.841, "blimp/accuracy/npi_present_2": 0.478, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.835, "blimp/accuracy/anaphor_number_agreement": 0.958, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.938, "blimp/accuracy/existential_there_object_raising": 0.733, "blimp/accuracy/matrix_question_npi_licensor_present": 0.091, "blimp/accuracy/npi_present_1": 0.505, "blimp/accuracy/wh_vs_that_no_gap": 0.955, "blimp/accuracy/left_branch_island_echo_question": 0.38, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.988, "blimp/accuracy/causative": 0.607, "blimp/accuracy/group_average": 0.699402985074627, "blimp/accuracy/seq_average": 0.6994029850746268, "cbt/accuracy/NE": 0.6870993589743589, "cbt/accuracy/V": 0.8536, "cbt/accuracy/CN": 0.7204, "cbt/accuracy/P": 0.8172, "cbt/accuracy/group_average": 0.7695748397435898, "cbt/accuracy/seq_average": 0.7696078431372549, "hellaswag/accuracy/val": 0.2722565226050587, "hellaswag/accuracy/group_average": 0.2722565226050587, "hellaswag/accuracy/seq_average": 0.2722565226050587, "piqa/accuracy/val": 0.5500544069640914, "piqa/accuracy/group_average": 0.5500544069640914, "piqa/accuracy/seq_average": 0.5500544069640914, "ai2arc/accuracy/ARC-Easy": 0.3019027484143763, "ai2arc/accuracy/ARC-Challenge": 0.2094420600858369, "ai2arc/accuracy/group_average": 0.2556724042501066, "ai2arc/accuracy/seq_average": 0.27138810198300284, "race/accuracy/test/high": 0.26329331046312177, "race/accuracy/test/middle": 0.32729805013927576, "race/accuracy/group_average": 0.2952956803011988, "race/accuracy/seq_average": 0.2819213619781111, "siqa/accuracy/dev": 0.3587512794268168, "siqa/accuracy/group_average": 0.3587512794268168, "siqa/accuracy/seq_average": 0.3587512794268168, "commonsenseqa/accuracy/dev_rand_split": 0.23505323505323505, "commonsenseqa/accuracy/group_average": 0.23505323505323505, "commonsenseqa/accuracy/seq_average": 0.23505323505323505}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb_postln/export/result-model-30000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.9902484227740573, "val/accuracy": 0.4281684027777778, "val/perplexity": 19.89062316163538, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7823946935049495, "lambada/accuracy/total": 0.15760869565217392, "lambada/accuracy/openai_last_token": 0.7214673913043478, "lambada/perplexity": 23.986739844570696, "lambada/lm_loss": 3.561705379741531, "lambada/lm_perplexity": 35.22321491116061, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.29288854921497587, "mean_loss": 2.886321558139503, "blimp/accuracy/passive_2": 0.871, "blimp/accuracy/determiner_noun_agreement_2": 0.962, "blimp/accuracy/ellipsis_n_bar_1": 0.761, "blimp/accuracy/tough_vs_raising_2": 0.819, "blimp/accuracy/tough_vs_raising_1": 0.514, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.895, "blimp/accuracy/principle_A_reconstruction": 0.433, "blimp/accuracy/wh_vs_that_with_gap": 0.482, "blimp/accuracy/principle_A_domain_2": 0.797, "blimp/accuracy/determiner_noun_agreement_1": 0.978, "blimp/accuracy/ellipsis_n_bar_2": 0.845, "blimp/accuracy/principle_A_domain_3": 0.516, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.868, "blimp/accuracy/animate_subject_trans": 0.829, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.792, "blimp/accuracy/distractor_agreement_relative_clause": 0.497, "blimp/accuracy/transitive": 0.814, "blimp/accuracy/sentential_subject_island": 0.399, "blimp/accuracy/adjunct_island": 0.744, "blimp/accuracy/intransitive": 0.754, "blimp/accuracy/existential_there_subject_raising": 0.823, "blimp/accuracy/irregular_past_participle_adjectives": 0.941, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.219, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.18, "blimp/accuracy/only_npi_scope": 0.815, "blimp/accuracy/superlative_quantifiers_2": 0.686, "blimp/accuracy/passive_1": 0.88, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.869, "blimp/accuracy/inchoative": 0.568, "blimp/accuracy/anaphor_gender_agreement": 0.838, "blimp/accuracy/principle_A_c_command": 0.64, "blimp/accuracy/only_npi_licensor_present": 0.49, "blimp/accuracy/expletive_it_object_raising": 0.748, "blimp/accuracy/left_branch_island_simple_question": 0.207, "blimp/accuracy/wh_questions_subject_gap": 0.91, "blimp/accuracy/existential_there_quantifiers_2": 0.406, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.907, "blimp/accuracy/sentential_negation_npi_scope": 0.436, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.797, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.852, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.786, "blimp/accuracy/principle_A_case_2": 0.933, "blimp/accuracy/distractor_agreement_relational_noun": 0.65, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.998, "blimp/accuracy/superlative_quantifiers_1": 0.553, "blimp/accuracy/wh_island": 0.821, "blimp/accuracy/principle_A_domain_1": 0.965, "blimp/accuracy/complex_NP_island": 0.504, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.921, "blimp/accuracy/irregular_past_participle_verbs": 0.845, "blimp/accuracy/drop_argument": 0.782, "blimp/accuracy/wh_questions_object_gap": 0.71, "blimp/accuracy/animate_subject_passive": 0.795, "blimp/accuracy/existential_there_quantifiers_1": 0.933, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.875, "blimp/accuracy/npi_present_2": 0.55, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.852, "blimp/accuracy/anaphor_number_agreement": 0.967, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.945, "blimp/accuracy/existential_there_object_raising": 0.764, "blimp/accuracy/matrix_question_npi_licensor_present": 0.122, "blimp/accuracy/npi_present_1": 0.544, "blimp/accuracy/wh_vs_that_no_gap": 0.958, "blimp/accuracy/left_branch_island_echo_question": 0.339, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.969, "blimp/accuracy/causative": 0.603, "blimp/accuracy/group_average": 0.723373134328358, "blimp/accuracy/seq_average": 0.7233731343283583, "cbt/accuracy/NE": 0.7103365384615384, "cbt/accuracy/V": 0.8624, "cbt/accuracy/CN": 0.7528, "cbt/accuracy/P": 0.8384, "cbt/accuracy/group_average": 0.7909841346153846, "cbt/accuracy/seq_average": 0.7910164065626251, "hellaswag/accuracy/val": 0.2722565226050587, "hellaswag/accuracy/group_average": 0.2722565226050587, "hellaswag/accuracy/seq_average": 0.2722565226050587, "piqa/accuracy/val": 0.5554951033732318, "piqa/accuracy/group_average": 0.5554951033732318, "piqa/accuracy/seq_average": 0.5554951033732318, "ai2arc/accuracy/ARC-Easy": 0.3023255813953488, "ai2arc/accuracy/ARC-Challenge": 0.21545064377682405, "ai2arc/accuracy/group_average": 0.25888811258608646, "ai2arc/accuracy/seq_average": 0.27365439093484417, "race/accuracy/test/high": 0.2555746140651801, "race/accuracy/test/middle": 0.32729805013927576, "race/accuracy/group_average": 0.29143633210222797, "race/accuracy/seq_average": 0.27644912849614917, "siqa/accuracy/dev": 0.3618219037871034, "siqa/accuracy/group_average": 0.3618219037871034, "siqa/accuracy/seq_average": 0.3618219037871034, "commonsenseqa/accuracy/dev_rand_split": 0.23996723996723995, "commonsenseqa/accuracy/group_average": 0.23996723996723995, "commonsenseqa/accuracy/seq_average": 0.23996723996723995}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb_postln/export/result-model-40000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.9300742981925842, "val/accuracy": 0.4361795092385913, "val/perplexity": 18.729021977461816, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7307655855735637, "lambada/accuracy/total": 0.1766304347826087, "lambada/accuracy/openai_last_token": 0.7300077639751553, "lambada/perplexity": 22.068302413293733, "lambada/lm_loss": 3.485583954223168, "lambada/lm_perplexity": 32.64148267718823, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3064049720106, "mean_loss": 2.8304199418830738, "blimp/accuracy/passive_2": 0.853, "blimp/accuracy/determiner_noun_agreement_2": 0.959, "blimp/accuracy/ellipsis_n_bar_1": 0.755, "blimp/accuracy/tough_vs_raising_2": 0.853, "blimp/accuracy/tough_vs_raising_1": 0.489, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.858, "blimp/accuracy/principle_A_reconstruction": 0.415, "blimp/accuracy/wh_vs_that_with_gap": 0.446, "blimp/accuracy/principle_A_domain_2": 0.752, "blimp/accuracy/determiner_noun_agreement_1": 0.982, "blimp/accuracy/ellipsis_n_bar_2": 0.879, "blimp/accuracy/principle_A_domain_3": 0.524, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.87, "blimp/accuracy/animate_subject_trans": 0.816, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.809, "blimp/accuracy/distractor_agreement_relative_clause": 0.541, "blimp/accuracy/transitive": 0.829, "blimp/accuracy/sentential_subject_island": 0.397, "blimp/accuracy/adjunct_island": 0.677, "blimp/accuracy/intransitive": 0.733, "blimp/accuracy/existential_there_subject_raising": 0.832, "blimp/accuracy/irregular_past_participle_adjectives": 0.936, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.246, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.149, "blimp/accuracy/only_npi_scope": 0.787, "blimp/accuracy/superlative_quantifiers_2": 0.498, "blimp/accuracy/passive_1": 0.878, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.871, "blimp/accuracy/inchoative": 0.549, "blimp/accuracy/anaphor_gender_agreement": 0.867, "blimp/accuracy/principle_A_c_command": 0.668, "blimp/accuracy/only_npi_licensor_present": 0.442, "blimp/accuracy/expletive_it_object_raising": 0.746, "blimp/accuracy/left_branch_island_simple_question": 0.261, "blimp/accuracy/wh_questions_subject_gap": 0.926, "blimp/accuracy/existential_there_quantifiers_2": 0.35, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.889, "blimp/accuracy/sentential_negation_npi_scope": 0.479, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.789, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.927, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.811, "blimp/accuracy/principle_A_case_2": 0.927, "blimp/accuracy/distractor_agreement_relational_noun": 0.685, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.994, "blimp/accuracy/superlative_quantifiers_1": 0.442, "blimp/accuracy/wh_island": 0.741, "blimp/accuracy/principle_A_domain_1": 0.992, "blimp/accuracy/complex_NP_island": 0.422, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.949, "blimp/accuracy/irregular_past_participle_verbs": 0.87, "blimp/accuracy/drop_argument": 0.743, "blimp/accuracy/wh_questions_object_gap": 0.758, "blimp/accuracy/animate_subject_passive": 0.76, "blimp/accuracy/existential_there_quantifiers_1": 0.963, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.848, "blimp/accuracy/npi_present_2": 0.437, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.853, "blimp/accuracy/anaphor_number_agreement": 0.963, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.956, "blimp/accuracy/existential_there_object_raising": 0.784, "blimp/accuracy/matrix_question_npi_licensor_present": 0.143, "blimp/accuracy/npi_present_1": 0.427, "blimp/accuracy/wh_vs_that_no_gap": 0.978, "blimp/accuracy/left_branch_island_echo_question": 0.296, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.985, "blimp/accuracy/causative": 0.635, "blimp/accuracy/group_average": 0.7147611940298506, "blimp/accuracy/seq_average": 0.7147611940298507, "cbt/accuracy/NE": 0.7199519230769231, "cbt/accuracy/V": 0.8692, "cbt/accuracy/CN": 0.7512, "cbt/accuracy/P": 0.8584, "cbt/accuracy/group_average": 0.7996879807692308, "cbt/accuracy/seq_average": 0.7997198879551821, "hellaswag/accuracy/val": 0.2748456482772356, "hellaswag/accuracy/group_average": 0.2748456482772356, "hellaswag/accuracy/seq_average": 0.2748456482772356, "piqa/accuracy/val": 0.5642002176278563, "piqa/accuracy/group_average": 0.5642002176278563, "piqa/accuracy/seq_average": 0.5642002176278563, "ai2arc/accuracy/ARC-Easy": 0.2989429175475687, "ai2arc/accuracy/ARC-Challenge": 0.21373390557939914, "ai2arc/accuracy/group_average": 0.2563384115634839, "ai2arc/accuracy/seq_average": 0.2708215297450425, "race/accuracy/test/high": 0.25871926815323043, "race/accuracy/test/middle": 0.32172701949860727, "race/accuracy/group_average": 0.29022314382591885, "race/accuracy/seq_average": 0.27705715443858936, "siqa/accuracy/dev": 0.3561924257932446, "siqa/accuracy/group_average": 0.3561924257932446, "siqa/accuracy/seq_average": 0.3561924257932446, "commonsenseqa/accuracy/dev_rand_split": 0.2416052416052416, "commonsenseqa/accuracy/group_average": 0.2416052416052416, "commonsenseqa/accuracy/seq_average": 0.2416052416052416}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb_postln/export/result-model-50000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.886021205357143, "val/accuracy": 0.4430929516989087, "val/perplexity": 17.92186015329987, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6944382945943324, "lambada/accuracy/total": 0.20128105590062112, "lambada/accuracy/openai_last_token": 0.7410714285714286, "lambada/perplexity": 19.36618728945894, "lambada/lm_loss": 3.4743736276169566, "lambada/lm_perplexity": 32.277604403363576, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.32218700379976495, "mean_loss": 2.7902297499757376, "blimp/accuracy/passive_2": 0.866, "blimp/accuracy/determiner_noun_agreement_2": 0.975, "blimp/accuracy/ellipsis_n_bar_1": 0.756, "blimp/accuracy/tough_vs_raising_2": 0.846, "blimp/accuracy/tough_vs_raising_1": 0.533, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.9, "blimp/accuracy/principle_A_reconstruction": 0.386, "blimp/accuracy/wh_vs_that_with_gap": 0.431, "blimp/accuracy/principle_A_domain_2": 0.815, "blimp/accuracy/determiner_noun_agreement_1": 0.986, "blimp/accuracy/ellipsis_n_bar_2": 0.864, "blimp/accuracy/principle_A_domain_3": 0.535, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.875, "blimp/accuracy/animate_subject_trans": 0.831, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.836, "blimp/accuracy/distractor_agreement_relative_clause": 0.489, "blimp/accuracy/transitive": 0.837, "blimp/accuracy/sentential_subject_island": 0.394, "blimp/accuracy/adjunct_island": 0.767, "blimp/accuracy/intransitive": 0.716, "blimp/accuracy/existential_there_subject_raising": 0.841, "blimp/accuracy/irregular_past_participle_adjectives": 0.931, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.187, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.114, "blimp/accuracy/only_npi_scope": 0.775, "blimp/accuracy/superlative_quantifiers_2": 0.673, "blimp/accuracy/passive_1": 0.881, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.851, "blimp/accuracy/inchoative": 0.537, "blimp/accuracy/anaphor_gender_agreement": 0.799, "blimp/accuracy/principle_A_c_command": 0.636, "blimp/accuracy/only_npi_licensor_present": 0.356, "blimp/accuracy/expletive_it_object_raising": 0.738, "blimp/accuracy/left_branch_island_simple_question": 0.164, "blimp/accuracy/wh_questions_subject_gap": 0.919, "blimp/accuracy/existential_there_quantifiers_2": 0.397, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.891, "blimp/accuracy/sentential_negation_npi_scope": 0.4, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.839, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.898, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.807, "blimp/accuracy/principle_A_case_2": 0.946, "blimp/accuracy/distractor_agreement_relational_noun": 0.699, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.983, "blimp/accuracy/superlative_quantifiers_1": 0.37, "blimp/accuracy/wh_island": 0.789, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.507, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.948, "blimp/accuracy/irregular_past_participle_verbs": 0.849, "blimp/accuracy/drop_argument": 0.738, "blimp/accuracy/wh_questions_object_gap": 0.744, "blimp/accuracy/animate_subject_passive": 0.784, "blimp/accuracy/existential_there_quantifiers_1": 0.969, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.872, "blimp/accuracy/npi_present_2": 0.517, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.877, "blimp/accuracy/anaphor_number_agreement": 0.949, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.944, "blimp/accuracy/existential_there_object_raising": 0.794, "blimp/accuracy/matrix_question_npi_licensor_present": 0.169, "blimp/accuracy/npi_present_1": 0.481, "blimp/accuracy/wh_vs_that_no_gap": 0.966, "blimp/accuracy/left_branch_island_echo_question": 0.353, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.977, "blimp/accuracy/causative": 0.648, "blimp/accuracy/group_average": 0.718417910447761, "blimp/accuracy/seq_average": 0.7184179104477612, "cbt/accuracy/NE": 0.7207532051282052, "cbt/accuracy/V": 0.8688, "cbt/accuracy/CN": 0.7648, "cbt/accuracy/P": 0.8548, "cbt/accuracy/group_average": 0.8022883012820513, "cbt/accuracy/seq_average": 0.8023209283713485, "hellaswag/accuracy/val": 0.27504481179047996, "hellaswag/accuracy/group_average": 0.27504481179047996, "hellaswag/accuracy/seq_average": 0.27504481179047996, "piqa/accuracy/val": 0.5565832426550599, "piqa/accuracy/group_average": 0.5565832426550599, "piqa/accuracy/seq_average": 0.5565832426550599, "ai2arc/accuracy/ARC-Easy": 0.3095137420718816, "ai2arc/accuracy/ARC-Challenge": 0.21201716738197424, "ai2arc/accuracy/group_average": 0.2607654547269279, "ai2arc/accuracy/seq_average": 0.2773371104815864, "race/accuracy/test/high": 0.26300743281875355, "race/accuracy/test/middle": 0.3328690807799443, "race/accuracy/group_average": 0.29793825679934893, "race/accuracy/seq_average": 0.2833400891771382, "siqa/accuracy/dev": 0.3526100307062436, "siqa/accuracy/group_average": 0.3526100307062436, "siqa/accuracy/seq_average": 0.3526100307062436, "commonsenseqa/accuracy/dev_rand_split": 0.2334152334152334, "commonsenseqa/accuracy/group_average": 0.2334152334152334, "commonsenseqa/accuracy/seq_average": 0.2334152334152334}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb_postln/export/result-model-60000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.8495362296937006, "val/accuracy": 0.4467996264260913, "val/perplexity": 17.279766139553928, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.77426772976514, "lambada/accuracy/total": 0.19642857142857142, "lambada/accuracy/openai_last_token": 0.7404891304347826, "lambada/perplexity": 18.986052586471416, "lambada/lm_loss": 3.418875400191086, "lambada/lm_perplexity": 30.535055986462016, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.32161409892733134, "mean_loss": 2.81190197972942, "blimp/accuracy/passive_2": 0.873, "blimp/accuracy/determiner_noun_agreement_2": 0.964, "blimp/accuracy/ellipsis_n_bar_1": 0.758, "blimp/accuracy/tough_vs_raising_2": 0.864, "blimp/accuracy/tough_vs_raising_1": 0.534, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.881, "blimp/accuracy/principle_A_reconstruction": 0.488, "blimp/accuracy/wh_vs_that_with_gap": 0.451, "blimp/accuracy/principle_A_domain_2": 0.809, "blimp/accuracy/determiner_noun_agreement_1": 0.984, "blimp/accuracy/ellipsis_n_bar_2": 0.868, "blimp/accuracy/principle_A_domain_3": 0.542, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.898, "blimp/accuracy/animate_subject_trans": 0.821, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.836, "blimp/accuracy/distractor_agreement_relative_clause": 0.503, "blimp/accuracy/transitive": 0.841, "blimp/accuracy/sentential_subject_island": 0.413, "blimp/accuracy/adjunct_island": 0.773, "blimp/accuracy/intransitive": 0.721, "blimp/accuracy/existential_there_subject_raising": 0.819, "blimp/accuracy/irregular_past_participle_adjectives": 0.967, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.271, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.175, "blimp/accuracy/only_npi_scope": 0.809, "blimp/accuracy/superlative_quantifiers_2": 0.701, "blimp/accuracy/passive_1": 0.885, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.87, "blimp/accuracy/inchoative": 0.594, "blimp/accuracy/anaphor_gender_agreement": 0.871, "blimp/accuracy/principle_A_c_command": 0.617, "blimp/accuracy/only_npi_licensor_present": 0.49, "blimp/accuracy/expletive_it_object_raising": 0.74, "blimp/accuracy/left_branch_island_simple_question": 0.29, "blimp/accuracy/wh_questions_subject_gap": 0.922, "blimp/accuracy/existential_there_quantifiers_2": 0.312, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.908, "blimp/accuracy/sentential_negation_npi_scope": 0.508, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.816, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.87, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.804, "blimp/accuracy/principle_A_case_2": 0.936, "blimp/accuracy/distractor_agreement_relational_noun": 0.678, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.995, "blimp/accuracy/superlative_quantifiers_1": 0.524, "blimp/accuracy/wh_island": 0.852, "blimp/accuracy/principle_A_domain_1": 0.974, "blimp/accuracy/complex_NP_island": 0.489, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.937, "blimp/accuracy/irregular_past_participle_verbs": 0.871, "blimp/accuracy/drop_argument": 0.749, "blimp/accuracy/wh_questions_object_gap": 0.759, "blimp/accuracy/animate_subject_passive": 0.804, "blimp/accuracy/existential_there_quantifiers_1": 0.966, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.86, "blimp/accuracy/npi_present_2": 0.566, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.87, "blimp/accuracy/anaphor_number_agreement": 0.961, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.945, "blimp/accuracy/existential_there_object_raising": 0.813, "blimp/accuracy/matrix_question_npi_licensor_present": 0.157, "blimp/accuracy/npi_present_1": 0.483, "blimp/accuracy/wh_vs_that_no_gap": 0.975, "blimp/accuracy/left_branch_island_echo_question": 0.329, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.981, "blimp/accuracy/causative": 0.659, "blimp/accuracy/group_average": 0.7342388059701492, "blimp/accuracy/seq_average": 0.7342388059701492, "cbt/accuracy/NE": 0.7291666666666666, "cbt/accuracy/V": 0.8752, "cbt/accuracy/CN": 0.7668, "cbt/accuracy/P": 0.854, "cbt/accuracy/group_average": 0.8062916666666666, "cbt/accuracy/seq_average": 0.8063225290116046, "hellaswag/accuracy/val": 0.2771360286795459, "hellaswag/accuracy/group_average": 0.2771360286795459, "hellaswag/accuracy/seq_average": 0.2771360286795459, "piqa/accuracy/val": 0.558215451577802, "piqa/accuracy/group_average": 0.558215451577802, "piqa/accuracy/seq_average": 0.558215451577802, "ai2arc/accuracy/ARC-Easy": 0.3035940803382664, "ai2arc/accuracy/ARC-Challenge": 0.20858369098712445, "ai2arc/accuracy/group_average": 0.25608888566269544, "ai2arc/accuracy/seq_average": 0.27223796033994335, "race/accuracy/test/high": 0.2607204116638079, "race/accuracy/test/middle": 0.3412256267409471, "race/accuracy/group_average": 0.3009730192023775, "race/accuracy/seq_average": 0.28415079043372515, "siqa/accuracy/dev": 0.36131013306038895, "siqa/accuracy/group_average": 0.36131013306038895, "siqa/accuracy/seq_average": 0.36131013306038895, "commonsenseqa/accuracy/dev_rand_split": 0.24242424242424243, "commonsenseqa/accuracy/group_average": 0.24242424242424243, "commonsenseqa/accuracy/seq_average": 0.24242424242424243}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb_postln/export/result-model-70000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.823054722377232, "val/accuracy": 0.45099070715525796, "val/perplexity": 16.828177648226344, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7036075947447595, "lambada/accuracy/total": 0.19778726708074534, "lambada/accuracy/openai_last_token": 0.7439829192546584, "lambada/perplexity": 17.37339804829553, "lambada/lm_loss": 3.3987665838347945, "lambada/lm_perplexity": 29.92716462507873, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3243889871180017, "mean_loss": 2.763331158560996, "blimp/accuracy/passive_2": 0.886, "blimp/accuracy/determiner_noun_agreement_2": 0.97, "blimp/accuracy/ellipsis_n_bar_1": 0.774, "blimp/accuracy/tough_vs_raising_2": 0.83, "blimp/accuracy/tough_vs_raising_1": 0.59, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.832, "blimp/accuracy/principle_A_reconstruction": 0.416, "blimp/accuracy/wh_vs_that_with_gap": 0.463, "blimp/accuracy/principle_A_domain_2": 0.818, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.896, "blimp/accuracy/principle_A_domain_3": 0.534, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.882, "blimp/accuracy/animate_subject_trans": 0.829, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.838, "blimp/accuracy/distractor_agreement_relative_clause": 0.538, "blimp/accuracy/transitive": 0.829, "blimp/accuracy/sentential_subject_island": 0.345, "blimp/accuracy/adjunct_island": 0.701, "blimp/accuracy/intransitive": 0.768, "blimp/accuracy/existential_there_subject_raising": 0.837, "blimp/accuracy/irregular_past_participle_adjectives": 0.907, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.225, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.179, "blimp/accuracy/only_npi_scope": 0.737, "blimp/accuracy/superlative_quantifiers_2": 0.692, "blimp/accuracy/passive_1": 0.887, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.871, "blimp/accuracy/inchoative": 0.579, "blimp/accuracy/anaphor_gender_agreement": 0.858, "blimp/accuracy/principle_A_c_command": 0.622, "blimp/accuracy/only_npi_licensor_present": 0.484, "blimp/accuracy/expletive_it_object_raising": 0.753, "blimp/accuracy/left_branch_island_simple_question": 0.2, "blimp/accuracy/wh_questions_subject_gap": 0.923, "blimp/accuracy/existential_there_quantifiers_2": 0.43, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.89, "blimp/accuracy/sentential_negation_npi_scope": 0.527, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.809, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.887, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.83, "blimp/accuracy/principle_A_case_2": 0.95, "blimp/accuracy/distractor_agreement_relational_noun": 0.708, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.996, "blimp/accuracy/superlative_quantifiers_1": 0.417, "blimp/accuracy/wh_island": 0.767, "blimp/accuracy/principle_A_domain_1": 0.979, "blimp/accuracy/complex_NP_island": 0.422, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.951, "blimp/accuracy/irregular_past_participle_verbs": 0.861, "blimp/accuracy/drop_argument": 0.768, "blimp/accuracy/wh_questions_object_gap": 0.744, "blimp/accuracy/animate_subject_passive": 0.786, "blimp/accuracy/existential_there_quantifiers_1": 0.977, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.789, "blimp/accuracy/npi_present_2": 0.537, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.887, "blimp/accuracy/anaphor_number_agreement": 0.971, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.953, "blimp/accuracy/existential_there_object_raising": 0.796, "blimp/accuracy/matrix_question_npi_licensor_present": 0.18, "blimp/accuracy/npi_present_1": 0.487, "blimp/accuracy/wh_vs_that_no_gap": 0.954, "blimp/accuracy/left_branch_island_echo_question": 0.267, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.979, "blimp/accuracy/causative": 0.644, "blimp/accuracy/group_average": 0.7257910447761194, "blimp/accuracy/seq_average": 0.7257910447761194, "cbt/accuracy/NE": 0.7283653846153846, "cbt/accuracy/V": 0.8824, "cbt/accuracy/CN": 0.7696, "cbt/accuracy/P": 0.8592, "cbt/accuracy/group_average": 0.8098913461538462, "cbt/accuracy/seq_average": 0.8099239695878351, "hellaswag/accuracy/val": 0.2808205536745668, "hellaswag/accuracy/group_average": 0.2808205536745668, "hellaswag/accuracy/seq_average": 0.2808205536745668, "piqa/accuracy/val": 0.5647442872687704, "piqa/accuracy/group_average": 0.5647442872687704, "piqa/accuracy/seq_average": 0.5647442872687704, "ai2arc/accuracy/ARC-Easy": 0.30613107822410146, "ai2arc/accuracy/ARC-Challenge": 0.20085836909871244, "ai2arc/accuracy/group_average": 0.25349472366140696, "ai2arc/accuracy/seq_average": 0.27138810198300284, "race/accuracy/test/high": 0.2564322469982847, "race/accuracy/test/middle": 0.3321727019498607, "race/accuracy/group_average": 0.2943024744740727, "race/accuracy/seq_average": 0.27847588163761655, "siqa/accuracy/dev": 0.36489252814739, "siqa/accuracy/group_average": 0.36489252814739, "siqa/accuracy/seq_average": 0.36489252814739, "commonsenseqa/accuracy/dev_rand_split": 0.24488124488124488, "commonsenseqa/accuracy/group_average": 0.24488124488124488, "commonsenseqa/accuracy/seq_average": 0.24488124488124488}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb_postln/export/result-model-80000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.8020692855592757, "val/accuracy": 0.4536268446180556, "val/perplexity": 16.478710673003054, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6619753630264946, "lambada/accuracy/total": 0.20535714285714285, "lambada/accuracy/openai_last_token": 0.7410714285714286, "lambada/perplexity": 17.368127243106436, "lambada/lm_loss": 3.378603815015337, "lambada/lm_perplexity": 29.329792697054522, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3294919937375992, "mean_loss": 2.732022324292885, "blimp/accuracy/passive_2": 0.878, "blimp/accuracy/determiner_noun_agreement_2": 0.972, "blimp/accuracy/ellipsis_n_bar_1": 0.763, "blimp/accuracy/tough_vs_raising_2": 0.838, "blimp/accuracy/tough_vs_raising_1": 0.601, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.907, "blimp/accuracy/principle_A_reconstruction": 0.426, "blimp/accuracy/wh_vs_that_with_gap": 0.457, "blimp/accuracy/principle_A_domain_2": 0.823, "blimp/accuracy/determiner_noun_agreement_1": 0.982, "blimp/accuracy/ellipsis_n_bar_2": 0.89, "blimp/accuracy/principle_A_domain_3": 0.537, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.9, "blimp/accuracy/animate_subject_trans": 0.845, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.837, "blimp/accuracy/distractor_agreement_relative_clause": 0.555, "blimp/accuracy/transitive": 0.837, "blimp/accuracy/sentential_subject_island": 0.387, "blimp/accuracy/adjunct_island": 0.739, "blimp/accuracy/intransitive": 0.77, "blimp/accuracy/existential_there_subject_raising": 0.815, "blimp/accuracy/irregular_past_participle_adjectives": 0.923, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.223, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.164, "blimp/accuracy/only_npi_scope": 0.768, "blimp/accuracy/superlative_quantifiers_2": 0.744, "blimp/accuracy/passive_1": 0.895, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.883, "blimp/accuracy/inchoative": 0.601, "blimp/accuracy/anaphor_gender_agreement": 0.858, "blimp/accuracy/principle_A_c_command": 0.657, "blimp/accuracy/only_npi_licensor_present": 0.39, "blimp/accuracy/expletive_it_object_raising": 0.749, "blimp/accuracy/left_branch_island_simple_question": 0.258, "blimp/accuracy/wh_questions_subject_gap": 0.944, "blimp/accuracy/existential_there_quantifiers_2": 0.344, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.908, "blimp/accuracy/sentential_negation_npi_scope": 0.557, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.831, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.9, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.849, "blimp/accuracy/principle_A_case_2": 0.938, "blimp/accuracy/distractor_agreement_relational_noun": 0.783, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.992, "blimp/accuracy/superlative_quantifiers_1": 0.527, "blimp/accuracy/wh_island": 0.822, "blimp/accuracy/principle_A_domain_1": 0.983, "blimp/accuracy/complex_NP_island": 0.449, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.948, "blimp/accuracy/irregular_past_participle_verbs": 0.873, "blimp/accuracy/drop_argument": 0.767, "blimp/accuracy/wh_questions_object_gap": 0.776, "blimp/accuracy/animate_subject_passive": 0.794, "blimp/accuracy/existential_there_quantifiers_1": 0.965, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.867, "blimp/accuracy/npi_present_2": 0.573, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.895, "blimp/accuracy/anaphor_number_agreement": 0.967, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.948, "blimp/accuracy/existential_there_object_raising": 0.787, "blimp/accuracy/matrix_question_npi_licensor_present": 0.196, "blimp/accuracy/npi_present_1": 0.532, "blimp/accuracy/wh_vs_that_no_gap": 0.973, "blimp/accuracy/left_branch_island_echo_question": 0.266, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.985, "blimp/accuracy/causative": 0.658, "blimp/accuracy/group_average": 0.7383432835820896, "blimp/accuracy/seq_average": 0.7383432835820896, "cbt/accuracy/NE": 0.7371794871794872, "cbt/accuracy/V": 0.8804, "cbt/accuracy/CN": 0.7748, "cbt/accuracy/P": 0.8648, "cbt/accuracy/group_average": 0.8142948717948717, "cbt/accuracy/seq_average": 0.8143257302921169, "hellaswag/accuracy/val": 0.2815176259709221, "hellaswag/accuracy/group_average": 0.2815176259709221, "hellaswag/accuracy/seq_average": 0.2815176259709221, "piqa/accuracy/val": 0.5718171926006529, "piqa/accuracy/group_average": 0.5718171926006529, "piqa/accuracy/seq_average": 0.5718171926006529, "ai2arc/accuracy/ARC-Easy": 0.31374207188160674, "ai2arc/accuracy/ARC-Challenge": 0.20515021459227467, "ai2arc/accuracy/group_average": 0.2594461432369407, "ai2arc/accuracy/seq_average": 0.27790368271954674, "race/accuracy/test/high": 0.26758147512864494, "race/accuracy/test/middle": 0.33913649025069637, "race/accuracy/group_average": 0.30335898268967065, "race/accuracy/seq_average": 0.2884069720308067, "siqa/accuracy/dev": 0.36131013306038895, "siqa/accuracy/group_average": 0.36131013306038895, "siqa/accuracy/seq_average": 0.36131013306038895, "commonsenseqa/accuracy/dev_rand_split": 0.2547092547092547, "commonsenseqa/accuracy/group_average": 0.2547092547092547, "commonsenseqa/accuracy/seq_average": 0.2547092547092547}
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb_postln/export/result-model-90000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.7901739695715526, "val/accuracy": 0.4555906265500992, "val/perplexity": 16.283852450217957, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.8317389636306287, "lambada/accuracy/total": 0.20399844720496896, "lambada/accuracy/openai_last_token": 0.7437888198757764, "lambada/perplexity": 17.66163331382681, "lambada/lm_loss": 3.351313559144471, "lambada/lm_perplexity": 28.54019827080228, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3297945368775341, "mean_loss": 2.8109564666010907, "blimp/accuracy/passive_2": 0.879, "blimp/accuracy/determiner_noun_agreement_2": 0.975, "blimp/accuracy/ellipsis_n_bar_1": 0.769, "blimp/accuracy/tough_vs_raising_2": 0.827, "blimp/accuracy/tough_vs_raising_1": 0.572, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.924, "blimp/accuracy/principle_A_reconstruction": 0.394, "blimp/accuracy/wh_vs_that_with_gap": 0.451, "blimp/accuracy/principle_A_domain_2": 0.825, "blimp/accuracy/determiner_noun_agreement_1": 0.98, "blimp/accuracy/ellipsis_n_bar_2": 0.88, "blimp/accuracy/principle_A_domain_3": 0.528, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.888, "blimp/accuracy/animate_subject_trans": 0.861, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.857, "blimp/accuracy/distractor_agreement_relative_clause": 0.531, "blimp/accuracy/transitive": 0.841, "blimp/accuracy/sentential_subject_island": 0.395, "blimp/accuracy/adjunct_island": 0.773, "blimp/accuracy/intransitive": 0.775, "blimp/accuracy/existential_there_subject_raising": 0.836, "blimp/accuracy/irregular_past_participle_adjectives": 0.848, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.202, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.162, "blimp/accuracy/only_npi_scope": 0.763, "blimp/accuracy/superlative_quantifiers_2": 0.696, "blimp/accuracy/passive_1": 0.895, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.887, "blimp/accuracy/inchoative": 0.603, "blimp/accuracy/anaphor_gender_agreement": 0.861, "blimp/accuracy/principle_A_c_command": 0.659, "blimp/accuracy/only_npi_licensor_present": 0.509, "blimp/accuracy/expletive_it_object_raising": 0.754, "blimp/accuracy/left_branch_island_simple_question": 0.224, "blimp/accuracy/wh_questions_subject_gap": 0.933, "blimp/accuracy/existential_there_quantifiers_2": 0.352, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.905, "blimp/accuracy/sentential_negation_npi_scope": 0.49, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.801, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.887, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.832, "blimp/accuracy/principle_A_case_2": 0.932, "blimp/accuracy/distractor_agreement_relational_noun": 0.776, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.984, "blimp/accuracy/superlative_quantifiers_1": 0.5, "blimp/accuracy/wh_island": 0.827, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.486, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.951, "blimp/accuracy/irregular_past_participle_verbs": 0.886, "blimp/accuracy/drop_argument": 0.761, "blimp/accuracy/wh_questions_object_gap": 0.768, "blimp/accuracy/animate_subject_passive": 0.779, "blimp/accuracy/existential_there_quantifiers_1": 0.985, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.867, "blimp/accuracy/npi_present_2": 0.584, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.895, "blimp/accuracy/anaphor_number_agreement": 0.971, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.948, "blimp/accuracy/existential_there_object_raising": 0.819, "blimp/accuracy/matrix_question_npi_licensor_present": 0.212, "blimp/accuracy/npi_present_1": 0.574, "blimp/accuracy/wh_vs_that_no_gap": 0.97, "blimp/accuracy/left_branch_island_echo_question": 0.283, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.982, "blimp/accuracy/causative": 0.66, "blimp/accuracy/group_average": 0.7375074626865673, "blimp/accuracy/seq_average": 0.7375074626865672, "cbt/accuracy/NE": 0.7319711538461539, "cbt/accuracy/V": 0.886, "cbt/accuracy/CN": 0.7764, "cbt/accuracy/P": 0.8664, "cbt/accuracy/group_average": 0.8151927884615385, "cbt/accuracy/seq_average": 0.8152260904361744, "hellaswag/accuracy/val": 0.28211511651065524, "hellaswag/accuracy/group_average": 0.28211511651065524, "hellaswag/accuracy/seq_average": 0.28211511651065524, "piqa/accuracy/val": 0.5658324265505985, "piqa/accuracy/group_average": 0.5658324265505985, "piqa/accuracy/seq_average": 0.5658324265505985, "ai2arc/accuracy/ARC-Easy": 0.30782241014799155, "ai2arc/accuracy/ARC-Challenge": 0.2128755364806867, "ai2arc/accuracy/group_average": 0.26034897331433915, "ai2arc/accuracy/seq_average": 0.27648725212464587, "race/accuracy/test/high": 0.2701543739279588, "race/accuracy/test/middle": 0.33008356545961004, "race/accuracy/group_average": 0.30011896969378443, "race/accuracy/seq_average": 0.2875962707742197, "siqa/accuracy/dev": 0.35209825997952915, "siqa/accuracy/group_average": 0.35209825997952915, "siqa/accuracy/seq_average": 0.35209825997952915, "commonsenseqa/accuracy/dev_rand_split": 0.24242424242424243, "commonsenseqa/accuracy/group_average": 0.24242424242424243, "commonsenseqa/accuracy/seq_average": 0.24242424242424243}