e4972c24622ac8ab111b1fcefe4609edde86014d2eed59db971daff866a37217
Browse files- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-10000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-100000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-20000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-30000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-40000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-50000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-60000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-70000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-80000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-90000.pth.json +1 -0
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-10000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 3.2031133742559526, "val/accuracy": 0.402557857452877, "val/perplexity": 24.609028166671084, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.832875885578416, "lambada/accuracy/total": 0.15003881987577639, "lambada/accuracy/openai_last_token": 0.7074922360248447, "lambada/perplexity": 32.92056403583473, "lambada/lm_loss": 3.7366795035116245, "lambada/lm_perplexity": 41.95843575966647, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.2762983386643267, "mean_loss": 3.017994629917184, "blimp/accuracy/passive_2": 0.873, "blimp/accuracy/determiner_noun_agreement_2": 0.928, "blimp/accuracy/ellipsis_n_bar_1": 0.618, "blimp/accuracy/tough_vs_raising_2": 0.797, "blimp/accuracy/tough_vs_raising_1": 0.454, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.891, "blimp/accuracy/principle_A_reconstruction": 0.502, "blimp/accuracy/wh_vs_that_with_gap": 0.26, "blimp/accuracy/principle_A_domain_2": 0.681, "blimp/accuracy/determiner_noun_agreement_1": 0.948, "blimp/accuracy/ellipsis_n_bar_2": 0.816, "blimp/accuracy/principle_A_domain_3": 0.473, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.849, "blimp/accuracy/animate_subject_trans": 0.803, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.742, "blimp/accuracy/distractor_agreement_relative_clause": 0.439, "blimp/accuracy/transitive": 0.754, "blimp/accuracy/sentential_subject_island": 0.392, "blimp/accuracy/adjunct_island": 0.6, "blimp/accuracy/intransitive": 0.649, "blimp/accuracy/existential_there_subject_raising": 0.76, "blimp/accuracy/irregular_past_participle_adjectives": 0.991, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.167, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.115, "blimp/accuracy/only_npi_scope": 0.801, "blimp/accuracy/superlative_quantifiers_2": 0.53, "blimp/accuracy/passive_1": 0.873, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.871, "blimp/accuracy/inchoative": 0.512, "blimp/accuracy/anaphor_gender_agreement": 0.859, "blimp/accuracy/principle_A_c_command": 0.647, "blimp/accuracy/only_npi_licensor_present": 0.577, "blimp/accuracy/expletive_it_object_raising": 0.779, "blimp/accuracy/left_branch_island_simple_question": 0.253, "blimp/accuracy/wh_questions_subject_gap": 0.892, "blimp/accuracy/existential_there_quantifiers_2": 0.282, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.867, "blimp/accuracy/sentential_negation_npi_scope": 0.456, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.58, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.923, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.771, "blimp/accuracy/principle_A_case_2": 0.793, "blimp/accuracy/distractor_agreement_relational_noun": 0.594, "blimp/accuracy/sentential_negation_npi_licensor_present": 1.0, "blimp/accuracy/superlative_quantifiers_1": 0.814, "blimp/accuracy/wh_island": 0.73, "blimp/accuracy/principle_A_domain_1": 0.984, "blimp/accuracy/complex_NP_island": 0.541, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.905, "blimp/accuracy/irregular_past_participle_verbs": 0.857, "blimp/accuracy/drop_argument": 0.716, "blimp/accuracy/wh_questions_object_gap": 0.747, "blimp/accuracy/animate_subject_passive": 0.726, "blimp/accuracy/existential_there_quantifiers_1": 0.893, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.897, "blimp/accuracy/npi_present_2": 0.574, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.782, "blimp/accuracy/anaphor_number_agreement": 0.972, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.906, "blimp/accuracy/existential_there_object_raising": 0.784, "blimp/accuracy/matrix_question_npi_licensor_present": 0.117, "blimp/accuracy/npi_present_1": 0.549, "blimp/accuracy/wh_vs_that_no_gap": 0.948, "blimp/accuracy/left_branch_island_echo_question": 0.357, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.964, "blimp/accuracy/causative": 0.609, "blimp/accuracy/group_average": 0.6975223880597013, "blimp/accuracy/seq_average": 0.6975223880597015, "cbt/accuracy/NE": 0.6722756410256411, "cbt/accuracy/V": 0.8412, "cbt/accuracy/CN": 0.6972, "cbt/accuracy/P": 0.8048, "cbt/accuracy/group_average": 0.7538689102564102, "cbt/accuracy/seq_average": 0.7539015606242497, "hellaswag/accuracy/val": 0.2709619597689703, "hellaswag/accuracy/group_average": 0.2709619597689703, "hellaswag/accuracy/seq_average": 0.2709619597689703, "piqa/accuracy/val": 0.5386289445048966, "piqa/accuracy/group_average": 0.5386289445048966, "piqa/accuracy/seq_average": 0.5386289445048966, "ai2arc/accuracy/ARC-Easy": 0.29471458773784354, "ai2arc/accuracy/ARC-Challenge": 0.20858369098712445, "ai2arc/accuracy/group_average": 0.251649139362484, "ai2arc/accuracy/seq_average": 0.26628895184135976, "race/accuracy/test/high": 0.2518582046883934, "race/accuracy/test/middle": 0.31963788300835655, "race/accuracy/group_average": 0.28574804384837493, "race/accuracy/seq_average": 0.2715849209566275, "siqa/accuracy/dev": 0.3500511770726714, "siqa/accuracy/group_average": 0.3500511770726714, "siqa/accuracy/seq_average": 0.3500511770726714, "commonsenseqa/accuracy/dev_rand_split": 0.23177723177723178, "commonsenseqa/accuracy/group_average": 0.23177723177723178, "commonsenseqa/accuracy/seq_average": 0.23177723177723178}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-100000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.773337954566592, "val/accuracy": 0.45831105065724204, "val/perplexity": 16.01199220914337, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6838583620438663, "lambada/accuracy/total": 0.2016692546583851, "lambada/accuracy/openai_last_token": 0.7447593167701864, "lambada/perplexity": 17.153668581375232, "lambada/lm_loss": 3.347194052867128, "lambada/lm_perplexity": 28.422868580992876, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.32999015265781356, "mean_loss": 2.7285981583052292, "blimp/accuracy/passive_2": 0.893, "blimp/accuracy/determiner_noun_agreement_2": 0.962, "blimp/accuracy/ellipsis_n_bar_1": 0.747, "blimp/accuracy/tough_vs_raising_2": 0.857, "blimp/accuracy/tough_vs_raising_1": 0.517, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.92, "blimp/accuracy/principle_A_reconstruction": 0.22, "blimp/accuracy/wh_vs_that_with_gap": 0.458, "blimp/accuracy/principle_A_domain_2": 0.779, "blimp/accuracy/determiner_noun_agreement_1": 0.981, "blimp/accuracy/ellipsis_n_bar_2": 0.895, "blimp/accuracy/principle_A_domain_3": 0.502, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.899, "blimp/accuracy/animate_subject_trans": 0.852, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.871, "blimp/accuracy/distractor_agreement_relative_clause": 0.535, "blimp/accuracy/transitive": 0.844, "blimp/accuracy/sentential_subject_island": 0.295, "blimp/accuracy/adjunct_island": 0.75, "blimp/accuracy/intransitive": 0.794, "blimp/accuracy/existential_there_subject_raising": 0.851, "blimp/accuracy/irregular_past_participle_adjectives": 0.97, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.245, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.172, "blimp/accuracy/only_npi_scope": 0.743, "blimp/accuracy/superlative_quantifiers_2": 0.713, "blimp/accuracy/passive_1": 0.9, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.895, "blimp/accuracy/inchoative": 0.611, "blimp/accuracy/anaphor_gender_agreement": 0.94, "blimp/accuracy/principle_A_c_command": 0.593, "blimp/accuracy/only_npi_licensor_present": 0.618, "blimp/accuracy/expletive_it_object_raising": 0.765, "blimp/accuracy/left_branch_island_simple_question": 0.27, "blimp/accuracy/wh_questions_subject_gap": 0.927, "blimp/accuracy/existential_there_quantifiers_2": 0.483, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.926, "blimp/accuracy/sentential_negation_npi_scope": 0.441, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.788, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.908, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.868, "blimp/accuracy/principle_A_case_2": 0.94, "blimp/accuracy/distractor_agreement_relational_noun": 0.749, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.964, "blimp/accuracy/superlative_quantifiers_1": 0.643, "blimp/accuracy/wh_island": 0.733, "blimp/accuracy/principle_A_domain_1": 0.979, "blimp/accuracy/complex_NP_island": 0.562, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.94, "blimp/accuracy/irregular_past_participle_verbs": 0.895, "blimp/accuracy/drop_argument": 0.753, "blimp/accuracy/wh_questions_object_gap": 0.774, "blimp/accuracy/animate_subject_passive": 0.767, "blimp/accuracy/existential_there_quantifiers_1": 0.971, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.885, "blimp/accuracy/npi_present_2": 0.566, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.916, "blimp/accuracy/anaphor_number_agreement": 0.98, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.945, "blimp/accuracy/existential_there_object_raising": 0.8, "blimp/accuracy/matrix_question_npi_licensor_present": 0.263, "blimp/accuracy/npi_present_1": 0.565, "blimp/accuracy/wh_vs_that_no_gap": 0.946, "blimp/accuracy/left_branch_island_echo_question": 0.261, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.967, "blimp/accuracy/causative": 0.69, "blimp/accuracy/group_average": 0.7410746268656714, "blimp/accuracy/seq_average": 0.7410746268656716, "cbt/accuracy/NE": 0.7327724358974359, "cbt/accuracy/V": 0.89, "cbt/accuracy/CN": 0.786, "cbt/accuracy/P": 0.874, "cbt/accuracy/group_average": 0.820693108974359, "cbt/accuracy/seq_average": 0.8207282913165266, "hellaswag/accuracy/val": 0.2811192989444334, "hellaswag/accuracy/group_average": 0.2811192989444334, "hellaswag/accuracy/seq_average": 0.2811192989444334, "piqa/accuracy/val": 0.5745375408052231, "piqa/accuracy/group_average": 0.5745375408052231, "piqa/accuracy/seq_average": 0.5745375408052231, "ai2arc/accuracy/ARC-Easy": 0.3281183932346723, "ai2arc/accuracy/ARC-Challenge": 0.20858369098712445, "ai2arc/accuracy/group_average": 0.2683510421108984, "ai2arc/accuracy/seq_average": 0.2886685552407932, "race/accuracy/test/high": 0.2567181246426529, "race/accuracy/test/middle": 0.3426183844011142, "race/accuracy/group_average": 0.29966825452188356, "race/accuracy/seq_average": 0.2817186866639643, "siqa/accuracy/dev": 0.36591606960081885, "siqa/accuracy/group_average": 0.36591606960081885, "siqa/accuracy/seq_average": 0.36591606960081885, "commonsenseqa/accuracy/dev_rand_split": 0.24488124488124488, "commonsenseqa/accuracy/group_average": 0.24488124488124488, "commonsenseqa/accuracy/seq_average": 0.24488124488124488}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-20000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 3.047113085549975, "val/accuracy": 0.4216153583829365, "val/perplexity": 21.054474135541287, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.692286900111607, "lambada/accuracy/total": 0.1702251552795031, "lambada/accuracy/openai_last_token": 0.7272903726708074, "lambada/perplexity": 25.056111745912684, "lambada/lm_loss": 3.5711213940897433, "lambada/lm_perplexity": 35.55644358912299, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.2959202568312198, "mean_loss": 2.869699992830791, "blimp/accuracy/passive_2": 0.86, "blimp/accuracy/determiner_noun_agreement_2": 0.959, "blimp/accuracy/ellipsis_n_bar_1": 0.652, "blimp/accuracy/tough_vs_raising_2": 0.814, "blimp/accuracy/tough_vs_raising_1": 0.547, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.869, "blimp/accuracy/principle_A_reconstruction": 0.535, "blimp/accuracy/wh_vs_that_with_gap": 0.401, "blimp/accuracy/principle_A_domain_2": 0.781, "blimp/accuracy/determiner_noun_agreement_1": 0.979, "blimp/accuracy/ellipsis_n_bar_2": 0.858, "blimp/accuracy/principle_A_domain_3": 0.497, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.844, "blimp/accuracy/animate_subject_trans": 0.784, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.817, "blimp/accuracy/distractor_agreement_relative_clause": 0.47, "blimp/accuracy/transitive": 0.787, "blimp/accuracy/sentential_subject_island": 0.395, "blimp/accuracy/adjunct_island": 0.639, "blimp/accuracy/intransitive": 0.699, "blimp/accuracy/existential_there_subject_raising": 0.778, "blimp/accuracy/irregular_past_participle_adjectives": 0.917, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.197, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.158, "blimp/accuracy/only_npi_scope": 0.603, "blimp/accuracy/superlative_quantifiers_2": 0.868, "blimp/accuracy/passive_1": 0.873, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.859, "blimp/accuracy/inchoative": 0.53, "blimp/accuracy/anaphor_gender_agreement": 0.845, "blimp/accuracy/principle_A_c_command": 0.582, "blimp/accuracy/only_npi_licensor_present": 0.112, "blimp/accuracy/expletive_it_object_raising": 0.732, "blimp/accuracy/left_branch_island_simple_question": 0.287, "blimp/accuracy/wh_questions_subject_gap": 0.884, "blimp/accuracy/existential_there_quantifiers_2": 0.568, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.902, "blimp/accuracy/sentential_negation_npi_scope": 0.411, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.673, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.908, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.814, "blimp/accuracy/principle_A_case_2": 0.865, "blimp/accuracy/distractor_agreement_relational_noun": 0.692, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.62, "blimp/accuracy/wh_island": 0.813, "blimp/accuracy/principle_A_domain_1": 0.977, "blimp/accuracy/complex_NP_island": 0.518, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.884, "blimp/accuracy/irregular_past_participle_verbs": 0.793, "blimp/accuracy/drop_argument": 0.73, "blimp/accuracy/wh_questions_object_gap": 0.738, "blimp/accuracy/animate_subject_passive": 0.73, "blimp/accuracy/existential_there_quantifiers_1": 0.96, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.883, "blimp/accuracy/npi_present_2": 0.538, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.892, "blimp/accuracy/anaphor_number_agreement": 0.975, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.934, "blimp/accuracy/existential_there_object_raising": 0.741, "blimp/accuracy/matrix_question_npi_licensor_present": 0.149, "blimp/accuracy/npi_present_1": 0.383, "blimp/accuracy/wh_vs_that_no_gap": 0.938, "blimp/accuracy/left_branch_island_echo_question": 0.286, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.965, "blimp/accuracy/causative": 0.637, "blimp/accuracy/group_average": 0.706358208955224, "blimp/accuracy/seq_average": 0.7063582089552238, "cbt/accuracy/NE": 0.6887019230769231, "cbt/accuracy/V": 0.8552, "cbt/accuracy/CN": 0.732, "cbt/accuracy/P": 0.8336, "cbt/accuracy/group_average": 0.7773754807692308, "cbt/accuracy/seq_average": 0.7774109643857543, "hellaswag/accuracy/val": 0.27235610436168095, "hellaswag/accuracy/group_average": 0.27235610436168095, "hellaswag/accuracy/seq_average": 0.27235610436168095, "piqa/accuracy/val": 0.5478781284004353, "piqa/accuracy/group_average": 0.5478781284004353, "piqa/accuracy/seq_average": 0.5478781284004353, "ai2arc/accuracy/ARC-Easy": 0.2989429175475687, "ai2arc/accuracy/ARC-Challenge": 0.1982832618025751, "ai2arc/accuracy/group_average": 0.2486130896750719, "ai2arc/accuracy/seq_average": 0.26572237960339945, "race/accuracy/test/high": 0.25586049170954833, "race/accuracy/test/middle": 0.33147632311977715, "race/accuracy/group_average": 0.2936684074146627, "race/accuracy/seq_average": 0.2778678556951763, "siqa/accuracy/dev": 0.36438075742067555, "siqa/accuracy/group_average": 0.36438075742067555, "siqa/accuracy/seq_average": 0.36438075742067555, "commonsenseqa/accuracy/dev_rand_split": 0.2334152334152334, "commonsenseqa/accuracy/group_average": 0.2334152334152334, "commonsenseqa/accuracy/seq_average": 0.2334152334152334}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-30000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.9684857565259177, "val/accuracy": 0.4321511889260913, "val/perplexity": 19.46242641954599, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7600296683933423, "lambada/accuracy/total": 0.16964285714285715, "lambada/accuracy/openai_last_token": 0.7253493788819876, "lambada/perplexity": 22.860236508956753, "lambada/lm_loss": 3.521664271107461, "lambada/lm_perplexity": 33.840701726017386, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3008970230344742, "mean_loss": 2.8642577124596302, "blimp/accuracy/passive_2": 0.88, "blimp/accuracy/determiner_noun_agreement_2": 0.954, "blimp/accuracy/ellipsis_n_bar_1": 0.749, "blimp/accuracy/tough_vs_raising_2": 0.859, "blimp/accuracy/tough_vs_raising_1": 0.505, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.889, "blimp/accuracy/principle_A_reconstruction": 0.5, "blimp/accuracy/wh_vs_that_with_gap": 0.456, "blimp/accuracy/principle_A_domain_2": 0.767, "blimp/accuracy/determiner_noun_agreement_1": 0.965, "blimp/accuracy/ellipsis_n_bar_2": 0.864, "blimp/accuracy/principle_A_domain_3": 0.506, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.871, "blimp/accuracy/animate_subject_trans": 0.821, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.834, "blimp/accuracy/distractor_agreement_relative_clause": 0.481, "blimp/accuracy/transitive": 0.806, "blimp/accuracy/sentential_subject_island": 0.363, "blimp/accuracy/adjunct_island": 0.68, "blimp/accuracy/intransitive": 0.753, "blimp/accuracy/existential_there_subject_raising": 0.84, "blimp/accuracy/irregular_past_participle_adjectives": 0.957, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.31, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.163, "blimp/accuracy/only_npi_scope": 0.766, "blimp/accuracy/superlative_quantifiers_2": 0.59, "blimp/accuracy/passive_1": 0.856, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.871, "blimp/accuracy/inchoative": 0.59, "blimp/accuracy/anaphor_gender_agreement": 0.898, "blimp/accuracy/principle_A_c_command": 0.546, "blimp/accuracy/only_npi_licensor_present": 0.288, "blimp/accuracy/expletive_it_object_raising": 0.758, "blimp/accuracy/left_branch_island_simple_question": 0.378, "blimp/accuracy/wh_questions_subject_gap": 0.892, "blimp/accuracy/existential_there_quantifiers_2": 0.479, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.896, "blimp/accuracy/sentential_negation_npi_scope": 0.471, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.73, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.89, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.853, "blimp/accuracy/principle_A_case_2": 0.899, "blimp/accuracy/distractor_agreement_relational_noun": 0.655, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.991, "blimp/accuracy/superlative_quantifiers_1": 0.496, "blimp/accuracy/wh_island": 0.578, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.539, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.938, "blimp/accuracy/irregular_past_participle_verbs": 0.854, "blimp/accuracy/drop_argument": 0.769, "blimp/accuracy/wh_questions_object_gap": 0.678, "blimp/accuracy/animate_subject_passive": 0.773, "blimp/accuracy/existential_there_quantifiers_1": 0.957, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.873, "blimp/accuracy/npi_present_2": 0.598, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.887, "blimp/accuracy/anaphor_number_agreement": 0.979, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.932, "blimp/accuracy/existential_there_object_raising": 0.775, "blimp/accuracy/matrix_question_npi_licensor_present": 0.19, "blimp/accuracy/npi_present_1": 0.337, "blimp/accuracy/wh_vs_that_no_gap": 0.929, "blimp/accuracy/left_branch_island_echo_question": 0.399, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.965, "blimp/accuracy/causative": 0.633, "blimp/accuracy/group_average": 0.7184477611940302, "blimp/accuracy/seq_average": 0.7184477611940299, "cbt/accuracy/NE": 0.7067307692307693, "cbt/accuracy/V": 0.8656, "cbt/accuracy/CN": 0.7532, "cbt/accuracy/P": 0.8428, "cbt/accuracy/group_average": 0.7920826923076923, "cbt/accuracy/seq_average": 0.7921168467386954, "hellaswag/accuracy/val": 0.27365066719776937, "hellaswag/accuracy/group_average": 0.27365066719776937, "hellaswag/accuracy/seq_average": 0.27365066719776937, "piqa/accuracy/val": 0.5696409140369967, "piqa/accuracy/group_average": 0.5696409140369967, "piqa/accuracy/seq_average": 0.5696409140369967, "ai2arc/accuracy/ARC-Easy": 0.2989429175475687, "ai2arc/accuracy/ARC-Challenge": 0.20429184549356222, "ai2arc/accuracy/group_average": 0.25161738152056545, "ai2arc/accuracy/seq_average": 0.26770538243626063, "race/accuracy/test/high": 0.259576901086335, "race/accuracy/test/middle": 0.32590529247910865, "race/accuracy/group_average": 0.29274109678272187, "race/accuracy/seq_average": 0.27888123226591, "siqa/accuracy/dev": 0.35209825997952915, "siqa/accuracy/group_average": 0.35209825997952915, "siqa/accuracy/seq_average": 0.35209825997952915, "commonsenseqa/accuracy/dev_rand_split": 0.23013923013923013, "commonsenseqa/accuracy/group_average": 0.23013923013923013, "commonsenseqa/accuracy/seq_average": 0.23013923013923013}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-40000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.914554172092014, "val/accuracy": 0.4394754076760913, "val/perplexity": 18.44058924050239, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7433970196646933, "lambada/accuracy/total": 0.16750776397515527, "lambada/accuracy/openai_last_token": 0.7290372670807453, "lambada/perplexity": 23.1343219184905, "lambada/lm_loss": 3.4627858956737456, "lambada/lm_perplexity": 31.9057388758219, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3034915858256233, "mean_loss": 2.8289755958783536, "blimp/accuracy/passive_2": 0.875, "blimp/accuracy/determiner_noun_agreement_2": 0.962, "blimp/accuracy/ellipsis_n_bar_1": 0.741, "blimp/accuracy/tough_vs_raising_2": 0.852, "blimp/accuracy/tough_vs_raising_1": 0.509, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.873, "blimp/accuracy/principle_A_reconstruction": 0.344, "blimp/accuracy/wh_vs_that_with_gap": 0.458, "blimp/accuracy/principle_A_domain_2": 0.768, "blimp/accuracy/determiner_noun_agreement_1": 0.975, "blimp/accuracy/ellipsis_n_bar_2": 0.864, "blimp/accuracy/principle_A_domain_3": 0.51, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.884, "blimp/accuracy/animate_subject_trans": 0.844, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.889, "blimp/accuracy/distractor_agreement_relative_clause": 0.545, "blimp/accuracy/transitive": 0.807, "blimp/accuracy/sentential_subject_island": 0.343, "blimp/accuracy/adjunct_island": 0.727, "blimp/accuracy/intransitive": 0.749, "blimp/accuracy/existential_there_subject_raising": 0.833, "blimp/accuracy/irregular_past_participle_adjectives": 0.952, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.253, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.168, "blimp/accuracy/only_npi_scope": 0.727, "blimp/accuracy/superlative_quantifiers_2": 0.71, "blimp/accuracy/passive_1": 0.884, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.859, "blimp/accuracy/inchoative": 0.576, "blimp/accuracy/anaphor_gender_agreement": 0.886, "blimp/accuracy/principle_A_c_command": 0.611, "blimp/accuracy/only_npi_licensor_present": 0.44, "blimp/accuracy/expletive_it_object_raising": 0.774, "blimp/accuracy/left_branch_island_simple_question": 0.371, "blimp/accuracy/wh_questions_subject_gap": 0.899, "blimp/accuracy/existential_there_quantifiers_2": 0.544, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.921, "blimp/accuracy/sentential_negation_npi_scope": 0.51, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.729, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.913, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.873, "blimp/accuracy/principle_A_case_2": 0.932, "blimp/accuracy/distractor_agreement_relational_noun": 0.739, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.532, "blimp/accuracy/wh_island": 0.624, "blimp/accuracy/principle_A_domain_1": 0.985, "blimp/accuracy/complex_NP_island": 0.529, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.954, "blimp/accuracy/irregular_past_participle_verbs": 0.859, "blimp/accuracy/drop_argument": 0.756, "blimp/accuracy/wh_questions_object_gap": 0.711, "blimp/accuracy/animate_subject_passive": 0.768, "blimp/accuracy/existential_there_quantifiers_1": 0.969, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.877, "blimp/accuracy/npi_present_2": 0.57, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.913, "blimp/accuracy/anaphor_number_agreement": 0.967, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.951, "blimp/accuracy/existential_there_object_raising": 0.776, "blimp/accuracy/matrix_question_npi_licensor_present": 0.2, "blimp/accuracy/npi_present_1": 0.488, "blimp/accuracy/wh_vs_that_no_gap": 0.952, "blimp/accuracy/left_branch_island_echo_question": 0.338, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.968, "blimp/accuracy/causative": 0.641, "blimp/accuracy/group_average": 0.7305671641791047, "blimp/accuracy/seq_average": 0.7305671641791045, "cbt/accuracy/NE": 0.7203525641025641, "cbt/accuracy/V": 0.8736, "cbt/accuracy/CN": 0.7444, "cbt/accuracy/P": 0.864, "cbt/accuracy/group_average": 0.800588141025641, "cbt/accuracy/seq_average": 0.8006202480992397, "hellaswag/accuracy/val": 0.27285401314479185, "hellaswag/accuracy/group_average": 0.27285401314479185, "hellaswag/accuracy/seq_average": 0.27285401314479185, "piqa/accuracy/val": 0.5642002176278563, "piqa/accuracy/group_average": 0.5642002176278563, "piqa/accuracy/seq_average": 0.5642002176278563, "ai2arc/accuracy/ARC-Easy": 0.3175475687103594, "ai2arc/accuracy/ARC-Challenge": 0.2034334763948498, "ai2arc/accuracy/group_average": 0.2604905225526046, "ai2arc/accuracy/seq_average": 0.2798866855524079, "race/accuracy/test/high": 0.258147512864494, "race/accuracy/test/middle": 0.3245125348189415, "race/accuracy/group_average": 0.29133002384171774, "race/accuracy/seq_average": 0.27746250506688286, "siqa/accuracy/dev": 0.3490276356192426, "siqa/accuracy/group_average": 0.3490276356192426, "siqa/accuracy/seq_average": 0.3490276356192426, "commonsenseqa/accuracy/dev_rand_split": 0.23505323505323505, "commonsenseqa/accuracy/group_average": 0.23505323505323505, "commonsenseqa/accuracy/seq_average": 0.23505323505323505}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-50000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.872371128627232, "val/accuracy": 0.44425165085565477, "val/perplexity": 17.678887457671276, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6651740222243787, "lambada/accuracy/total": 0.19274068322981366, "lambada/accuracy/openai_last_token": 0.7369953416149069, "lambada/perplexity": 18.828556574913257, "lambada/lm_loss": 3.443039294155115, "lambada/lm_perplexity": 31.28188869580971, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3184961670427342, "mean_loss": 2.7687725754258055, "blimp/accuracy/passive_2": 0.885, "blimp/accuracy/determiner_noun_agreement_2": 0.974, "blimp/accuracy/ellipsis_n_bar_1": 0.769, "blimp/accuracy/tough_vs_raising_2": 0.847, "blimp/accuracy/tough_vs_raising_1": 0.493, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.894, "blimp/accuracy/principle_A_reconstruction": 0.379, "blimp/accuracy/wh_vs_that_with_gap": 0.451, "blimp/accuracy/principle_A_domain_2": 0.805, "blimp/accuracy/determiner_noun_agreement_1": 0.98, "blimp/accuracy/ellipsis_n_bar_2": 0.879, "blimp/accuracy/principle_A_domain_3": 0.524, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.898, "blimp/accuracy/animate_subject_trans": 0.825, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.858, "blimp/accuracy/distractor_agreement_relative_clause": 0.495, "blimp/accuracy/transitive": 0.799, "blimp/accuracy/sentential_subject_island": 0.364, "blimp/accuracy/adjunct_island": 0.718, "blimp/accuracy/intransitive": 0.78, "blimp/accuracy/existential_there_subject_raising": 0.847, "blimp/accuracy/irregular_past_participle_adjectives": 0.967, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.214, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.161, "blimp/accuracy/only_npi_scope": 0.806, "blimp/accuracy/superlative_quantifiers_2": 0.729, "blimp/accuracy/passive_1": 0.891, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.881, "blimp/accuracy/inchoative": 0.581, "blimp/accuracy/anaphor_gender_agreement": 0.919, "blimp/accuracy/principle_A_c_command": 0.571, "blimp/accuracy/only_npi_licensor_present": 0.694, "blimp/accuracy/expletive_it_object_raising": 0.785, "blimp/accuracy/left_branch_island_simple_question": 0.23, "blimp/accuracy/wh_questions_subject_gap": 0.93, "blimp/accuracy/existential_there_quantifiers_2": 0.48, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.916, "blimp/accuracy/sentential_negation_npi_scope": 0.424, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.74, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.908, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.852, "blimp/accuracy/principle_A_case_2": 0.911, "blimp/accuracy/distractor_agreement_relational_noun": 0.679, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.994, "blimp/accuracy/superlative_quantifiers_1": 0.655, "blimp/accuracy/wh_island": 0.633, "blimp/accuracy/principle_A_domain_1": 0.985, "blimp/accuracy/complex_NP_island": 0.542, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.958, "blimp/accuracy/irregular_past_participle_verbs": 0.86, "blimp/accuracy/drop_argument": 0.747, "blimp/accuracy/wh_questions_object_gap": 0.77, "blimp/accuracy/animate_subject_passive": 0.762, "blimp/accuracy/existential_there_quantifiers_1": 0.967, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.888, "blimp/accuracy/npi_present_2": 0.547, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.894, "blimp/accuracy/anaphor_number_agreement": 0.974, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.939, "blimp/accuracy/existential_there_object_raising": 0.814, "blimp/accuracy/matrix_question_npi_licensor_present": 0.214, "blimp/accuracy/npi_present_1": 0.566, "blimp/accuracy/wh_vs_that_no_gap": 0.944, "blimp/accuracy/left_branch_island_echo_question": 0.365, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.971, "blimp/accuracy/causative": 0.634, "blimp/accuracy/group_average": 0.7366567164179104, "blimp/accuracy/seq_average": 0.7366567164179104, "cbt/accuracy/NE": 0.71875, "cbt/accuracy/V": 0.8836, "cbt/accuracy/CN": 0.7624, "cbt/accuracy/P": 0.8576, "cbt/accuracy/group_average": 0.8055875, "cbt/accuracy/seq_average": 0.8056222488995598, "hellaswag/accuracy/val": 0.276638119896435, "hellaswag/accuracy/group_average": 0.276638119896435, "hellaswag/accuracy/seq_average": 0.276638119896435, "piqa/accuracy/val": 0.559847660500544, "piqa/accuracy/group_average": 0.559847660500544, "piqa/accuracy/seq_average": 0.559847660500544, "ai2arc/accuracy/ARC-Easy": 0.3099365750528541, "ai2arc/accuracy/ARC-Challenge": 0.2, "ai2arc/accuracy/group_average": 0.2549682875264271, "ai2arc/accuracy/seq_average": 0.27365439093484417, "race/accuracy/test/high": 0.25443110348770726, "race/accuracy/test/middle": 0.3266016713091922, "race/accuracy/group_average": 0.29051638739844976, "race/accuracy/seq_average": 0.2754357519254155, "siqa/accuracy/dev": 0.3577277379733879, "siqa/accuracy/group_average": 0.3577277379733879, "siqa/accuracy/seq_average": 0.3577277379733879, "commonsenseqa/accuracy/dev_rand_split": 0.23505323505323505, "commonsenseqa/accuracy/group_average": 0.23505323505323505, "commonsenseqa/accuracy/seq_average": 0.23505323505323505}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-60000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.8366447327628967, "val/accuracy": 0.45001026940724204, "val/perplexity": 17.058433805227867, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.760892690338703, "lambada/accuracy/total": 0.17565993788819875, "lambada/accuracy/openai_last_token": 0.7373835403726708, "lambada/perplexity": 19.783593474179945, "lambada/lm_loss": 3.3985858692737057, "lambada/lm_perplexity": 29.92175683930641, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3128351036477204, "mean_loss": 2.7987687115508, "blimp/accuracy/passive_2": 0.875, "blimp/accuracy/determiner_noun_agreement_2": 0.96, "blimp/accuracy/ellipsis_n_bar_1": 0.766, "blimp/accuracy/tough_vs_raising_2": 0.864, "blimp/accuracy/tough_vs_raising_1": 0.504, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.912, "blimp/accuracy/principle_A_reconstruction": 0.212, "blimp/accuracy/wh_vs_that_with_gap": 0.453, "blimp/accuracy/principle_A_domain_2": 0.764, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.882, "blimp/accuracy/principle_A_domain_3": 0.493, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.909, "blimp/accuracy/animate_subject_trans": 0.842, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.887, "blimp/accuracy/distractor_agreement_relative_clause": 0.513, "blimp/accuracy/transitive": 0.829, "blimp/accuracy/sentential_subject_island": 0.328, "blimp/accuracy/adjunct_island": 0.763, "blimp/accuracy/intransitive": 0.787, "blimp/accuracy/existential_there_subject_raising": 0.836, "blimp/accuracy/irregular_past_participle_adjectives": 0.974, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.193, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.19, "blimp/accuracy/only_npi_scope": 0.709, "blimp/accuracy/superlative_quantifiers_2": 0.71, "blimp/accuracy/passive_1": 0.869, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.901, "blimp/accuracy/inchoative": 0.615, "blimp/accuracy/anaphor_gender_agreement": 0.935, "blimp/accuracy/principle_A_c_command": 0.598, "blimp/accuracy/only_npi_licensor_present": 0.328, "blimp/accuracy/expletive_it_object_raising": 0.766, "blimp/accuracy/left_branch_island_simple_question": 0.245, "blimp/accuracy/wh_questions_subject_gap": 0.933, "blimp/accuracy/existential_there_quantifiers_2": 0.397, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.927, "blimp/accuracy/sentential_negation_npi_scope": 0.443, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.781, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.896, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.867, "blimp/accuracy/principle_A_case_2": 0.948, "blimp/accuracy/distractor_agreement_relational_noun": 0.706, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.731, "blimp/accuracy/wh_island": 0.676, "blimp/accuracy/principle_A_domain_1": 0.986, "blimp/accuracy/complex_NP_island": 0.565, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.954, "blimp/accuracy/irregular_past_participle_verbs": 0.898, "blimp/accuracy/drop_argument": 0.765, "blimp/accuracy/wh_questions_object_gap": 0.761, "blimp/accuracy/animate_subject_passive": 0.79, "blimp/accuracy/existential_there_quantifiers_1": 0.966, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.884, "blimp/accuracy/npi_present_2": 0.557, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.914, "blimp/accuracy/anaphor_number_agreement": 0.98, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.949, "blimp/accuracy/existential_there_object_raising": 0.812, "blimp/accuracy/matrix_question_npi_licensor_present": 0.188, "blimp/accuracy/npi_present_1": 0.483, "blimp/accuracy/wh_vs_that_no_gap": 0.959, "blimp/accuracy/left_branch_island_echo_question": 0.327, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.661, "blimp/accuracy/group_average": 0.7322686567164179, "blimp/accuracy/seq_average": 0.7322686567164179, "cbt/accuracy/NE": 0.71875, "cbt/accuracy/V": 0.8784, "cbt/accuracy/CN": 0.7824, "cbt/accuracy/P": 0.8664, "cbt/accuracy/group_average": 0.8114875, "cbt/accuracy/seq_average": 0.8115246098439376, "hellaswag/accuracy/val": 0.27544313881696875, "hellaswag/accuracy/group_average": 0.27544313881696875, "hellaswag/accuracy/seq_average": 0.27544313881696875, "piqa/accuracy/val": 0.5652883569096845, "piqa/accuracy/group_average": 0.5652883569096845, "piqa/accuracy/seq_average": 0.5652883569096845, "ai2arc/accuracy/ARC-Easy": 0.3107822410147992, "ai2arc/accuracy/ARC-Challenge": 0.20429184549356222, "ai2arc/accuracy/group_average": 0.2575370432541807, "ai2arc/accuracy/seq_average": 0.2756373937677054, "race/accuracy/test/high": 0.25871926815323043, "race/accuracy/test/middle": 0.34052924791086353, "race/accuracy/group_average": 0.29962425803204695, "race/accuracy/seq_average": 0.28252938792055127, "siqa/accuracy/dev": 0.3561924257932446, "siqa/accuracy/group_average": 0.3561924257932446, "siqa/accuracy/seq_average": 0.3561924257932446, "commonsenseqa/accuracy/dev_rand_split": 0.24488124488124488, "commonsenseqa/accuracy/group_average": 0.24488124488124488, "commonsenseqa/accuracy/seq_average": 0.24488124488124488}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-70000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.8124016655815973, "val/accuracy": 0.453704349578373, "val/perplexity": 16.649857629041854, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.784775443699049, "lambada/accuracy/total": 0.18808229813664595, "lambada/accuracy/openai_last_token": 0.7404891304347826, "lambada/perplexity": 18.370605342226593, "lambada/lm_loss": 3.38492699131732, "lambada/lm_perplexity": 29.515837725055412, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.32089332385750946, "mean_loss": 2.798588554640323, "blimp/accuracy/passive_2": 0.892, "blimp/accuracy/determiner_noun_agreement_2": 0.974, "blimp/accuracy/ellipsis_n_bar_1": 0.774, "blimp/accuracy/tough_vs_raising_2": 0.847, "blimp/accuracy/tough_vs_raising_1": 0.544, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.88, "blimp/accuracy/principle_A_reconstruction": 0.194, "blimp/accuracy/wh_vs_that_with_gap": 0.471, "blimp/accuracy/principle_A_domain_2": 0.759, "blimp/accuracy/determiner_noun_agreement_1": 0.979, "blimp/accuracy/ellipsis_n_bar_2": 0.888, "blimp/accuracy/principle_A_domain_3": 0.515, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.908, "blimp/accuracy/animate_subject_trans": 0.86, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.875, "blimp/accuracy/distractor_agreement_relative_clause": 0.523, "blimp/accuracy/transitive": 0.821, "blimp/accuracy/sentential_subject_island": 0.315, "blimp/accuracy/adjunct_island": 0.714, "blimp/accuracy/intransitive": 0.804, "blimp/accuracy/existential_there_subject_raising": 0.832, "blimp/accuracy/irregular_past_participle_adjectives": 0.838, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.242, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.202, "blimp/accuracy/only_npi_scope": 0.677, "blimp/accuracy/superlative_quantifiers_2": 0.655, "blimp/accuracy/passive_1": 0.886, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.896, "blimp/accuracy/inchoative": 0.602, "blimp/accuracy/anaphor_gender_agreement": 0.948, "blimp/accuracy/principle_A_c_command": 0.613, "blimp/accuracy/only_npi_licensor_present": 0.506, "blimp/accuracy/expletive_it_object_raising": 0.769, "blimp/accuracy/left_branch_island_simple_question": 0.251, "blimp/accuracy/wh_questions_subject_gap": 0.918, "blimp/accuracy/existential_there_quantifiers_2": 0.488, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.929, "blimp/accuracy/sentential_negation_npi_scope": 0.471, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.798, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.898, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.875, "blimp/accuracy/principle_A_case_2": 0.928, "blimp/accuracy/distractor_agreement_relational_noun": 0.767, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.993, "blimp/accuracy/superlative_quantifiers_1": 0.625, "blimp/accuracy/wh_island": 0.713, "blimp/accuracy/principle_A_domain_1": 0.981, "blimp/accuracy/complex_NP_island": 0.539, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.959, "blimp/accuracy/irregular_past_participle_verbs": 0.879, "blimp/accuracy/drop_argument": 0.754, "blimp/accuracy/wh_questions_object_gap": 0.753, "blimp/accuracy/animate_subject_passive": 0.778, "blimp/accuracy/existential_there_quantifiers_1": 0.966, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.825, "blimp/accuracy/npi_present_2": 0.48, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.914, "blimp/accuracy/anaphor_number_agreement": 0.978, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.948, "blimp/accuracy/existential_there_object_raising": 0.8, "blimp/accuracy/matrix_question_npi_licensor_present": 0.225, "blimp/accuracy/npi_present_1": 0.49, "blimp/accuracy/wh_vs_that_no_gap": 0.942, "blimp/accuracy/left_branch_island_echo_question": 0.332, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.968, "blimp/accuracy/causative": 0.669, "blimp/accuracy/group_average": 0.7318955223880598, "blimp/accuracy/seq_average": 0.7318955223880597, "cbt/accuracy/NE": 0.7299679487179487, "cbt/accuracy/V": 0.882, "cbt/accuracy/CN": 0.7708, "cbt/accuracy/P": 0.8632, "cbt/accuracy/group_average": 0.8114919871794871, "cbt/accuracy/seq_average": 0.8115246098439376, "hellaswag/accuracy/val": 0.2804222266480781, "hellaswag/accuracy/group_average": 0.2804222266480781, "hellaswag/accuracy/seq_average": 0.2804222266480781, "piqa/accuracy/val": 0.5783460282916213, "piqa/accuracy/group_average": 0.5783460282916213, "piqa/accuracy/seq_average": 0.5783460282916213, "ai2arc/accuracy/ARC-Easy": 0.32684989429175476, "ai2arc/accuracy/ARC-Challenge": 0.20429184549356222, "ai2arc/accuracy/group_average": 0.2655708698926585, "ai2arc/accuracy/seq_average": 0.2864022662889518, "race/accuracy/test/high": 0.2524299599771298, "race/accuracy/test/middle": 0.33565459610027853, "race/accuracy/group_average": 0.2940422780387042, "race/accuracy/seq_average": 0.2766518038102959, "siqa/accuracy/dev": 0.35516888433981575, "siqa/accuracy/group_average": 0.35516888433981575, "siqa/accuracy/seq_average": 0.35516888433981575, "commonsenseqa/accuracy/dev_rand_split": 0.25061425061425063, "commonsenseqa/accuracy/group_average": 0.25061425061425063, "commonsenseqa/accuracy/seq_average": 0.25061425061425063}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-80000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.791228763640873, "val/accuracy": 0.45579601469494047, "val/perplexity": 16.301037623023976, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.674167609362869, "lambada/accuracy/total": 0.1906055900621118, "lambada/accuracy/openai_last_token": 0.7422360248447205, "lambada/perplexity": 17.97027802837003, "lambada/lm_loss": 3.3627774099465295, "lambada/lm_perplexity": 28.86926140721159, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3232008023785261, "mean_loss": 2.732698186501871, "blimp/accuracy/passive_2": 0.883, "blimp/accuracy/determiner_noun_agreement_2": 0.968, "blimp/accuracy/ellipsis_n_bar_1": 0.786, "blimp/accuracy/tough_vs_raising_2": 0.869, "blimp/accuracy/tough_vs_raising_1": 0.533, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.924, "blimp/accuracy/principle_A_reconstruction": 0.246, "blimp/accuracy/wh_vs_that_with_gap": 0.472, "blimp/accuracy/principle_A_domain_2": 0.789, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.889, "blimp/accuracy/principle_A_domain_3": 0.524, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.907, "blimp/accuracy/animate_subject_trans": 0.856, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.869, "blimp/accuracy/distractor_agreement_relative_clause": 0.536, "blimp/accuracy/transitive": 0.827, "blimp/accuracy/sentential_subject_island": 0.291, "blimp/accuracy/adjunct_island": 0.744, "blimp/accuracy/intransitive": 0.793, "blimp/accuracy/existential_there_subject_raising": 0.845, "blimp/accuracy/irregular_past_participle_adjectives": 0.98, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.205, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.18, "blimp/accuracy/only_npi_scope": 0.725, "blimp/accuracy/superlative_quantifiers_2": 0.764, "blimp/accuracy/passive_1": 0.896, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.91, "blimp/accuracy/inchoative": 0.621, "blimp/accuracy/anaphor_gender_agreement": 0.935, "blimp/accuracy/principle_A_c_command": 0.584, "blimp/accuracy/only_npi_licensor_present": 0.678, "blimp/accuracy/expletive_it_object_raising": 0.754, "blimp/accuracy/left_branch_island_simple_question": 0.247, "blimp/accuracy/wh_questions_subject_gap": 0.926, "blimp/accuracy/existential_there_quantifiers_2": 0.395, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.921, "blimp/accuracy/sentential_negation_npi_scope": 0.476, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.779, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.895, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.868, "blimp/accuracy/principle_A_case_2": 0.936, "blimp/accuracy/distractor_agreement_relational_noun": 0.747, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.982, "blimp/accuracy/superlative_quantifiers_1": 0.587, "blimp/accuracy/wh_island": 0.671, "blimp/accuracy/principle_A_domain_1": 0.981, "blimp/accuracy/complex_NP_island": 0.538, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.956, "blimp/accuracy/irregular_past_participle_verbs": 0.868, "blimp/accuracy/drop_argument": 0.773, "blimp/accuracy/wh_questions_object_gap": 0.762, "blimp/accuracy/animate_subject_passive": 0.765, "blimp/accuracy/existential_there_quantifiers_1": 0.963, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.894, "blimp/accuracy/npi_present_2": 0.57, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.91, "blimp/accuracy/anaphor_number_agreement": 0.98, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.943, "blimp/accuracy/existential_there_object_raising": 0.807, "blimp/accuracy/matrix_question_npi_licensor_present": 0.251, "blimp/accuracy/npi_present_1": 0.626, "blimp/accuracy/wh_vs_that_no_gap": 0.953, "blimp/accuracy/left_branch_island_echo_question": 0.326, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.97, "blimp/accuracy/causative": 0.69, "blimp/accuracy/group_average": 0.7421492537313432, "blimp/accuracy/seq_average": 0.7421492537313433, "cbt/accuracy/NE": 0.734375, "cbt/accuracy/V": 0.8836, "cbt/accuracy/CN": 0.776, "cbt/accuracy/P": 0.868, "cbt/accuracy/group_average": 0.81549375, "cbt/accuracy/seq_average": 0.8155262104841937, "hellaswag/accuracy/val": 0.27753435570603463, "hellaswag/accuracy/group_average": 0.27753435570603463, "hellaswag/accuracy/seq_average": 0.27753435570603463, "piqa/accuracy/val": 0.573449401523395, "piqa/accuracy/group_average": 0.573449401523395, "piqa/accuracy/seq_average": 0.573449401523395, "ai2arc/accuracy/ARC-Easy": 0.3293868921775899, "ai2arc/accuracy/ARC-Challenge": 0.20429184549356222, "ai2arc/accuracy/group_average": 0.266839368835576, "ai2arc/accuracy/seq_average": 0.28810198300283285, "race/accuracy/test/high": 0.25871926815323043, "race/accuracy/test/middle": 0.3412256267409471, "race/accuracy/group_average": 0.29997244744708873, "race/accuracy/seq_average": 0.282732063234698, "siqa/accuracy/dev": 0.3602865916069601, "siqa/accuracy/group_average": 0.3602865916069601, "siqa/accuracy/seq_average": 0.3602865916069601, "commonsenseqa/accuracy/dev_rand_split": 0.23996723996723995, "commonsenseqa/accuracy/group_average": 0.23996723996723995, "commonsenseqa/accuracy/seq_average": 0.23996723996723995}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_postln/export/result-model-90000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.7798265729631697, "val/accuracy": 0.45793902684771826, "val/perplexity": 16.1162257163793, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7218074443177405, "lambada/accuracy/total": 0.19701086956521738, "lambada/accuracy/openai_last_token": 0.7428183229813664, "lambada/perplexity": 17.52029096732474, "lambada/lm_loss": 3.3421573547159693, "lambada/lm_perplexity": 28.280071087174598, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.32747494820646783, "mean_loss": 2.750817008640455, "blimp/accuracy/passive_2": 0.878, "blimp/accuracy/determiner_noun_agreement_2": 0.973, "blimp/accuracy/ellipsis_n_bar_1": 0.767, "blimp/accuracy/tough_vs_raising_2": 0.865, "blimp/accuracy/tough_vs_raising_1": 0.514, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.917, "blimp/accuracy/principle_A_reconstruction": 0.235, "blimp/accuracy/wh_vs_that_with_gap": 0.472, "blimp/accuracy/principle_A_domain_2": 0.776, "blimp/accuracy/determiner_noun_agreement_1": 0.982, "blimp/accuracy/ellipsis_n_bar_2": 0.883, "blimp/accuracy/principle_A_domain_3": 0.494, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.916, "blimp/accuracy/animate_subject_trans": 0.868, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.873, "blimp/accuracy/distractor_agreement_relative_clause": 0.537, "blimp/accuracy/transitive": 0.833, "blimp/accuracy/sentential_subject_island": 0.302, "blimp/accuracy/adjunct_island": 0.747, "blimp/accuracy/intransitive": 0.788, "blimp/accuracy/existential_there_subject_raising": 0.854, "blimp/accuracy/irregular_past_participle_adjectives": 0.975, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.201, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.192, "blimp/accuracy/only_npi_scope": 0.749, "blimp/accuracy/superlative_quantifiers_2": 0.598, "blimp/accuracy/passive_1": 0.886, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.898, "blimp/accuracy/inchoative": 0.605, "blimp/accuracy/anaphor_gender_agreement": 0.936, "blimp/accuracy/principle_A_c_command": 0.609, "blimp/accuracy/only_npi_licensor_present": 0.719, "blimp/accuracy/expletive_it_object_raising": 0.771, "blimp/accuracy/left_branch_island_simple_question": 0.246, "blimp/accuracy/wh_questions_subject_gap": 0.917, "blimp/accuracy/existential_there_quantifiers_2": 0.466, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.926, "blimp/accuracy/sentential_negation_npi_scope": 0.444, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.767, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.91, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.873, "blimp/accuracy/principle_A_case_2": 0.939, "blimp/accuracy/distractor_agreement_relational_noun": 0.755, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.969, "blimp/accuracy/superlative_quantifiers_1": 0.688, "blimp/accuracy/wh_island": 0.728, "blimp/accuracy/principle_A_domain_1": 0.982, "blimp/accuracy/complex_NP_island": 0.555, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.949, "blimp/accuracy/irregular_past_participle_verbs": 0.897, "blimp/accuracy/drop_argument": 0.756, "blimp/accuracy/wh_questions_object_gap": 0.772, "blimp/accuracy/animate_subject_passive": 0.758, "blimp/accuracy/existential_there_quantifiers_1": 0.972, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.886, "blimp/accuracy/npi_present_2": 0.592, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.908, "blimp/accuracy/anaphor_number_agreement": 0.982, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.946, "blimp/accuracy/existential_there_object_raising": 0.813, "blimp/accuracy/matrix_question_npi_licensor_present": 0.272, "blimp/accuracy/npi_present_1": 0.587, "blimp/accuracy/wh_vs_that_no_gap": 0.949, "blimp/accuracy/left_branch_island_echo_question": 0.265, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.958, "blimp/accuracy/causative": 0.696, "blimp/accuracy/group_average": 0.7423283582089552, "blimp/accuracy/seq_average": 0.7423283582089553, "cbt/accuracy/NE": 0.7299679487179487, "cbt/accuracy/V": 0.8908, "cbt/accuracy/CN": 0.788, "cbt/accuracy/P": 0.8712, "cbt/accuracy/group_average": 0.8199919871794872, "cbt/accuracy/seq_average": 0.8200280112044818, "hellaswag/accuracy/val": 0.27972515435172274, "hellaswag/accuracy/group_average": 0.27972515435172274, "hellaswag/accuracy/seq_average": 0.27972515435172274, "piqa/accuracy/val": 0.5788900979325353, "piqa/accuracy/group_average": 0.5788900979325353, "piqa/accuracy/seq_average": 0.5788900979325353, "ai2arc/accuracy/ARC-Easy": 0.3276955602536998, "ai2arc/accuracy/ARC-Challenge": 0.20515021459227467, "ai2arc/accuracy/group_average": 0.26642288742298725, "ai2arc/accuracy/seq_average": 0.28725212464589234, "race/accuracy/test/high": 0.2624356775300172, "race/accuracy/test/middle": 0.3363509749303621, "race/accuracy/group_average": 0.29939332623018966, "race/accuracy/seq_average": 0.28394811511957846, "siqa/accuracy/dev": 0.3607983623336745, "siqa/accuracy/group_average": 0.3607983623336745, "siqa/accuracy/seq_average": 0.3607983623336745, "commonsenseqa/accuracy/dev_rand_split": 0.25061425061425063, "commonsenseqa/accuracy/group_average": 0.25061425061425063, "commonsenseqa/accuracy/seq_average": 0.25061425061425063}
|