Upload folder using huggingface_hub
#298
by
DavidNguyen
- opened
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-100000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-120000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-140000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-160000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-180000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-20000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-200000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-220000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-240000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-260000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-280000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-300000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-320000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-340000.pth.json +121 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-360000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-380000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-40000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-400000.pth.json +121 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-60000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-80000.pth.json +121 -0
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-100000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.4435906788659474, "val/accuracy": 0.49988180493551587, "val/perplexity": 11.514310797132957, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.414007719999515, "lambada/accuracy/total": 0.3253105590062112, "lambada/accuracy/openai_last_token": 0.7859083850931677, "lambada/perplexity": 8.445781014354319, "lambada/lm_loss": 3.0301547574398184, "lambada/lm_perplexity": 20.700435887977115, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.41259618197086356, "mean_loss": 2.4287991994327314, "blimp/accuracy/passive_2": 0.899, "blimp/accuracy/determiner_noun_agreement_2": 0.985, "blimp/accuracy/ellipsis_n_bar_1": 0.859, "blimp/accuracy/tough_vs_raising_2": 0.89, "blimp/accuracy/tough_vs_raising_1": 0.613, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.909, "blimp/accuracy/principle_A_reconstruction": 0.439, "blimp/accuracy/wh_vs_that_with_gap": 0.477, "blimp/accuracy/principle_A_domain_2": 0.891, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.91, "blimp/accuracy/principle_A_domain_3": 0.606, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.931, "blimp/accuracy/animate_subject_trans": 0.891, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.914, "blimp/accuracy/distractor_agreement_relative_clause": 0.685, "blimp/accuracy/transitive": 0.886, "blimp/accuracy/sentential_subject_island": 0.345, "blimp/accuracy/adjunct_island": 0.889, "blimp/accuracy/intransitive": 0.746, "blimp/accuracy/existential_there_subject_raising": 0.909, "blimp/accuracy/irregular_past_participle_adjectives": 0.879, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.648, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.354, "blimp/accuracy/only_npi_scope": 0.605, "blimp/accuracy/superlative_quantifiers_2": 0.808, "blimp/accuracy/passive_1": 0.892, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.901, "blimp/accuracy/inchoative": 0.63, "blimp/accuracy/anaphor_gender_agreement": 0.953, "blimp/accuracy/principle_A_c_command": 0.695, "blimp/accuracy/only_npi_licensor_present": 0.646, "blimp/accuracy/expletive_it_object_raising": 0.778, "blimp/accuracy/left_branch_island_simple_question": 0.755, "blimp/accuracy/wh_questions_subject_gap": 0.911, "blimp/accuracy/existential_there_quantifiers_2": 0.602, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.94, "blimp/accuracy/sentential_negation_npi_scope": 0.698, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.849, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.871, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.922, "blimp/accuracy/principle_A_case_2": 0.934, "blimp/accuracy/distractor_agreement_relational_noun": 0.825, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.967, "blimp/accuracy/superlative_quantifiers_1": 0.596, "blimp/accuracy/wh_island": 0.743, "blimp/accuracy/principle_A_domain_1": 0.985, "blimp/accuracy/complex_NP_island": 0.605, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.975, "blimp/accuracy/irregular_past_participle_verbs": 0.886, "blimp/accuracy/drop_argument": 0.73, "blimp/accuracy/wh_questions_object_gap": 0.815, "blimp/accuracy/animate_subject_passive": 0.798, "blimp/accuracy/existential_there_quantifiers_1": 0.979, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.9, "blimp/accuracy/npi_present_2": 0.581, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.966, "blimp/accuracy/anaphor_number_agreement": 0.985, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.953, "blimp/accuracy/existential_there_object_raising": 0.857, "blimp/accuracy/matrix_question_npi_licensor_present": 0.307, "blimp/accuracy/npi_present_1": 0.663, "blimp/accuracy/wh_vs_that_no_gap": 0.971, "blimp/accuracy/left_branch_island_echo_question": 0.436, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.951, "blimp/accuracy/causative": 0.733, "blimp/accuracy/group_average": 0.7931940298507462, "blimp/accuracy/seq_average": 0.7931940298507463, "cbt/accuracy/NE": 0.8008814102564102, "cbt/accuracy/V": 0.9264, "cbt/accuracy/CN": 0.8692, "cbt/accuracy/P": 0.9044, "cbt/accuracy/group_average": 0.8752203525641025, "cbt/accuracy/seq_average": 0.875250100040016, "hellaswag/accuracy/val": 0.32642899820752835, "hellaswag/accuracy/group_average": 0.32642899820752835, "hellaswag/accuracy/seq_average": 0.32642899820752835, "piqa/accuracy/val": 0.6153427638737758, "piqa/accuracy/group_average": 0.6153427638737758, "piqa/accuracy/seq_average": 0.6153427638737758, "ai2arc/accuracy/ARC-Easy": 0.3568710359408034, "ai2arc/accuracy/ARC-Challenge": 0.21974248927038625, "ai2arc/accuracy/group_average": 0.28830676260559485, "ai2arc/accuracy/seq_average": 0.311614730878187, "mmlu/accuracy/MMLU": 0.2639256346085091, "mmlu/accuracy/group_average": 0.2639256346085091, "mmlu/accuracy/seq_average": 0.2639256346085091, "openbookqa/accuracy/test": 0.28, "openbookqa/accuracy/group_average": 0.28, "openbookqa/accuracy/seq_average": 0.28, "race/accuracy/test/high": 0.27530017152658665, "race/accuracy/test/middle": 0.3565459610027855, "race/accuracy/group_average": 0.3159230662646861, "race/accuracy/seq_average": 0.298946088366437, "siqa/accuracy/dev": 0.3602865916069601, "siqa/accuracy/group_average": 0.3602865916069601, "siqa/accuracy/seq_average": 0.3602865916069601, "winogrande/accuracy/dev": 0.5248618784530387, "winogrande/accuracy/group_average": 0.5248618784530387, "winogrande/accuracy/seq_average": 0.5248618784530387, "commonsenseqa/accuracy/dev_rand_split": 0.2628992628992629, "commonsenseqa/accuracy/group_average": 0.2628992628992629, "commonsenseqa/accuracy/seq_average": 0.2628992628992629}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-120000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.4185422867063493, "val/accuracy": 0.5034983801463294, "val/perplexity": 11.229478018777323, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4556990913722827, "lambada/accuracy/total": 0.3449145962732919, "lambada/accuracy/openai_last_token": 0.7925077639751553, "lambada/perplexity": 7.2889780422434525, "lambada/lm_loss": 3.0061548873472277, "lambada/lm_perplexity": 20.209542368560957, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.42420648820981066, "mean_loss": 2.437120689039316, "blimp/accuracy/passive_2": 0.916, "blimp/accuracy/determiner_noun_agreement_2": 0.979, "blimp/accuracy/ellipsis_n_bar_1": 0.862, "blimp/accuracy/tough_vs_raising_2": 0.874, "blimp/accuracy/tough_vs_raising_1": 0.599, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.905, "blimp/accuracy/principle_A_reconstruction": 0.355, "blimp/accuracy/wh_vs_that_with_gap": 0.406, "blimp/accuracy/principle_A_domain_2": 0.867, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.921, "blimp/accuracy/principle_A_domain_3": 0.56, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.92, "blimp/accuracy/animate_subject_trans": 0.894, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.865, "blimp/accuracy/distractor_agreement_relative_clause": 0.693, "blimp/accuracy/transitive": 0.874, "blimp/accuracy/sentential_subject_island": 0.348, "blimp/accuracy/adjunct_island": 0.876, "blimp/accuracy/intransitive": 0.738, "blimp/accuracy/existential_there_subject_raising": 0.875, "blimp/accuracy/irregular_past_participle_adjectives": 0.881, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.761, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.301, "blimp/accuracy/only_npi_scope": 0.663, "blimp/accuracy/superlative_quantifiers_2": 0.796, "blimp/accuracy/passive_1": 0.916, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.901, "blimp/accuracy/inchoative": 0.61, "blimp/accuracy/anaphor_gender_agreement": 0.945, "blimp/accuracy/principle_A_c_command": 0.725, "blimp/accuracy/only_npi_licensor_present": 0.621, "blimp/accuracy/expletive_it_object_raising": 0.786, "blimp/accuracy/left_branch_island_simple_question": 0.858, "blimp/accuracy/wh_questions_subject_gap": 0.936, "blimp/accuracy/existential_there_quantifiers_2": 0.549, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.939, "blimp/accuracy/sentential_negation_npi_scope": 0.756, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.824, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.924, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.905, "blimp/accuracy/principle_A_case_2": 0.909, "blimp/accuracy/distractor_agreement_relational_noun": 0.837, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.98, "blimp/accuracy/superlative_quantifiers_1": 0.838, "blimp/accuracy/wh_island": 0.802, "blimp/accuracy/principle_A_domain_1": 0.999, "blimp/accuracy/complex_NP_island": 0.607, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.972, "blimp/accuracy/irregular_past_participle_verbs": 0.869, "blimp/accuracy/drop_argument": 0.722, "blimp/accuracy/wh_questions_object_gap": 0.833, "blimp/accuracy/animate_subject_passive": 0.809, "blimp/accuracy/existential_there_quantifiers_1": 0.969, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.897, "blimp/accuracy/npi_present_2": 0.66, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.929, "blimp/accuracy/anaphor_number_agreement": 0.986, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.963, "blimp/accuracy/existential_there_object_raising": 0.851, "blimp/accuracy/matrix_question_npi_licensor_present": 0.257, "blimp/accuracy/npi_present_1": 0.663, "blimp/accuracy/wh_vs_that_no_gap": 0.984, "blimp/accuracy/left_branch_island_echo_question": 0.528, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.955, "blimp/accuracy/causative": 0.753, "blimp/accuracy/group_average": 0.7983134328358207, "blimp/accuracy/seq_average": 0.7983134328358209, "cbt/accuracy/NE": 0.8064903846153846, "cbt/accuracy/V": 0.9288, "cbt/accuracy/CN": 0.8732, "cbt/accuracy/P": 0.9076, "cbt/accuracy/group_average": 0.8790225961538461, "cbt/accuracy/seq_average": 0.8790516206482593, "hellaswag/accuracy/val": 0.3313085042820155, "hellaswag/accuracy/group_average": 0.3313085042820155, "hellaswag/accuracy/seq_average": 0.3313085042820155, "piqa/accuracy/val": 0.6186071817192601, "piqa/accuracy/group_average": 0.6186071817192601, "piqa/accuracy/seq_average": 0.6186071817192601, "ai2arc/accuracy/ARC-Easy": 0.3657505285412262, "ai2arc/accuracy/ARC-Challenge": 0.22145922746781116, "ai2arc/accuracy/group_average": 0.2936048780045187, "ai2arc/accuracy/seq_average": 0.3181303116147309, "mmlu/accuracy/MMLU": 0.2647121916338935, "mmlu/accuracy/group_average": 0.2647121916338935, "mmlu/accuracy/seq_average": 0.2647121916338935, "openbookqa/accuracy/test": 0.296, "openbookqa/accuracy/group_average": 0.296, "openbookqa/accuracy/seq_average": 0.296, "race/accuracy/test/high": 0.2801600914808462, "race/accuracy/test/middle": 0.3488857938718663, "race/accuracy/group_average": 0.3145229426763563, "race/accuracy/seq_average": 0.3001621402513174, "siqa/accuracy/dev": 0.36489252814739, "siqa/accuracy/group_average": 0.36489252814739, "siqa/accuracy/seq_average": 0.36489252814739, "winogrande/accuracy/dev": 0.500394632991318, "winogrande/accuracy/group_average": 0.500394632991318, "winogrande/accuracy/seq_average": 0.500394632991318, "commonsenseqa/accuracy/dev_rand_split": 0.26126126126126126, "commonsenseqa/accuracy/group_average": 0.26126126126126126, "commonsenseqa/accuracy/seq_average": 0.26126126126126126}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-140000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.3978223043774802, "val/accuracy": 0.5064677889384921, "val/perplexity": 10.99919737665364, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4366089246287848, "lambada/accuracy/total": 0.327445652173913, "lambada/accuracy/openai_last_token": 0.7905667701863354, "lambada/perplexity": 8.162550756718513, "lambada/lm_loss": 2.9888242960380333, "lambada/lm_perplexity": 19.862316554277026, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4169567205562026, "mean_loss": 2.4172156145031325, "blimp/accuracy/passive_2": 0.908, "blimp/accuracy/determiner_noun_agreement_2": 0.983, "blimp/accuracy/ellipsis_n_bar_1": 0.869, "blimp/accuracy/tough_vs_raising_2": 0.884, "blimp/accuracy/tough_vs_raising_1": 0.613, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.885, "blimp/accuracy/principle_A_reconstruction": 0.336, "blimp/accuracy/wh_vs_that_with_gap": 0.447, "blimp/accuracy/principle_A_domain_2": 0.877, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.899, "blimp/accuracy/principle_A_domain_3": 0.58, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.907, "blimp/accuracy/animate_subject_trans": 0.907, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.866, "blimp/accuracy/distractor_agreement_relative_clause": 0.682, "blimp/accuracy/transitive": 0.894, "blimp/accuracy/sentential_subject_island": 0.312, "blimp/accuracy/adjunct_island": 0.893, "blimp/accuracy/intransitive": 0.751, "blimp/accuracy/existential_there_subject_raising": 0.913, "blimp/accuracy/irregular_past_participle_adjectives": 0.892, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.609, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.303, "blimp/accuracy/only_npi_scope": 0.788, "blimp/accuracy/superlative_quantifiers_2": 0.752, "blimp/accuracy/passive_1": 0.898, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.917, "blimp/accuracy/inchoative": 0.627, "blimp/accuracy/anaphor_gender_agreement": 0.966, "blimp/accuracy/principle_A_c_command": 0.627, "blimp/accuracy/only_npi_licensor_present": 0.79, "blimp/accuracy/expletive_it_object_raising": 0.774, "blimp/accuracy/left_branch_island_simple_question": 0.696, "blimp/accuracy/wh_questions_subject_gap": 0.946, "blimp/accuracy/existential_there_quantifiers_2": 0.483, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.946, "blimp/accuracy/sentential_negation_npi_scope": 0.679, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.844, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.897, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.901, "blimp/accuracy/principle_A_case_2": 0.92, "blimp/accuracy/distractor_agreement_relational_noun": 0.869, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.983, "blimp/accuracy/superlative_quantifiers_1": 0.74, "blimp/accuracy/wh_island": 0.752, "blimp/accuracy/principle_A_domain_1": 0.986, "blimp/accuracy/complex_NP_island": 0.634, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.975, "blimp/accuracy/irregular_past_participle_verbs": 0.886, "blimp/accuracy/drop_argument": 0.753, "blimp/accuracy/wh_questions_object_gap": 0.851, "blimp/accuracy/animate_subject_passive": 0.807, "blimp/accuracy/existential_there_quantifiers_1": 0.971, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.904, "blimp/accuracy/npi_present_2": 0.567, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.936, "blimp/accuracy/anaphor_number_agreement": 0.987, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.964, "blimp/accuracy/existential_there_object_raising": 0.847, "blimp/accuracy/matrix_question_npi_licensor_present": 0.335, "blimp/accuracy/npi_present_1": 0.579, "blimp/accuracy/wh_vs_that_no_gap": 0.987, "blimp/accuracy/left_branch_island_echo_question": 0.435, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.975, "blimp/accuracy/causative": 0.741, "blimp/accuracy/group_average": 0.792805970149254, "blimp/accuracy/seq_average": 0.7928059701492537, "cbt/accuracy/NE": 0.8040865384615384, "cbt/accuracy/V": 0.9344, "cbt/accuracy/CN": 0.87, "cbt/accuracy/P": 0.912, "cbt/accuracy/group_average": 0.8801216346153846, "cbt/accuracy/seq_average": 0.8801520608243297, "hellaswag/accuracy/val": 0.3375821549492133, "hellaswag/accuracy/group_average": 0.3375821549492133, "hellaswag/accuracy/seq_average": 0.3375821549492133, "piqa/accuracy/val": 0.6147986942328618, "piqa/accuracy/group_average": 0.6147986942328618, "piqa/accuracy/seq_average": 0.6147986942328618, "ai2arc/accuracy/ARC-Easy": 0.3627906976744186, "ai2arc/accuracy/ARC-Challenge": 0.2334763948497854, "ai2arc/accuracy/group_average": 0.29813354626210203, "ai2arc/accuracy/seq_average": 0.32011331444759206, "mmlu/accuracy/MMLU": 0.2598498391133357, "mmlu/accuracy/group_average": 0.2598498391133357, "mmlu/accuracy/seq_average": 0.2598498391133357, "openbookqa/accuracy/test": 0.288, "openbookqa/accuracy/group_average": 0.288, "openbookqa/accuracy/seq_average": 0.288, "race/accuracy/test/high": 0.27787307032590053, "race/accuracy/test/middle": 0.35097493036211697, "race/accuracy/group_average": 0.31442400034400875, "race/accuracy/seq_average": 0.2991487636805837, "siqa/accuracy/dev": 0.37001023541453426, "siqa/accuracy/group_average": 0.37001023541453426, "siqa/accuracy/seq_average": 0.37001023541453426, "winogrande/accuracy/dev": 0.5011838989739542, "winogrande/accuracy/group_average": 0.5011838989739542, "winogrande/accuracy/seq_average": 0.5011838989739542, "commonsenseqa/accuracy/dev_rand_split": 0.26617526617526616, "commonsenseqa/accuracy/group_average": 0.26617526617526616, "commonsenseqa/accuracy/seq_average": 0.26617526617526616}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-160000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.380401126922123, "val/accuracy": 0.508758060515873, "val/perplexity": 10.80923787074675, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4365998262204, "lambada/accuracy/total": 0.33152173913043476, "lambada/accuracy/openai_last_token": 0.7886257763975155, "lambada/perplexity": 7.40539426216436, "lambada/lm_loss": 2.971350439904204, "lambada/lm_perplexity": 19.518260043655854, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4201398998231539, "mean_loss": 2.4085004765712617, "blimp/accuracy/passive_2": 0.921, "blimp/accuracy/determiner_noun_agreement_2": 0.981, "blimp/accuracy/ellipsis_n_bar_1": 0.868, "blimp/accuracy/tough_vs_raising_2": 0.869, "blimp/accuracy/tough_vs_raising_1": 0.603, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.898, "blimp/accuracy/principle_A_reconstruction": 0.363, "blimp/accuracy/wh_vs_that_with_gap": 0.468, "blimp/accuracy/principle_A_domain_2": 0.895, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.899, "blimp/accuracy/principle_A_domain_3": 0.624, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.942, "blimp/accuracy/animate_subject_trans": 0.891, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.909, "blimp/accuracy/distractor_agreement_relative_clause": 0.683, "blimp/accuracy/transitive": 0.883, "blimp/accuracy/sentential_subject_island": 0.309, "blimp/accuracy/adjunct_island": 0.886, "blimp/accuracy/intransitive": 0.755, "blimp/accuracy/existential_there_subject_raising": 0.886, "blimp/accuracy/irregular_past_participle_adjectives": 0.913, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.635, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.316, "blimp/accuracy/only_npi_scope": 0.721, "blimp/accuracy/superlative_quantifiers_2": 0.81, "blimp/accuracy/passive_1": 0.91, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.932, "blimp/accuracy/inchoative": 0.64, "blimp/accuracy/anaphor_gender_agreement": 0.968, "blimp/accuracy/principle_A_c_command": 0.688, "blimp/accuracy/only_npi_licensor_present": 0.574, "blimp/accuracy/expletive_it_object_raising": 0.767, "blimp/accuracy/left_branch_island_simple_question": 0.762, "blimp/accuracy/wh_questions_subject_gap": 0.925, "blimp/accuracy/existential_there_quantifiers_2": 0.474, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.939, "blimp/accuracy/sentential_negation_npi_scope": 0.728, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.842, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.883, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.914, "blimp/accuracy/principle_A_case_2": 0.935, "blimp/accuracy/distractor_agreement_relational_noun": 0.83, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.985, "blimp/accuracy/superlative_quantifiers_1": 0.742, "blimp/accuracy/wh_island": 0.839, "blimp/accuracy/principle_A_domain_1": 0.992, "blimp/accuracy/complex_NP_island": 0.613, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.968, "blimp/accuracy/irregular_past_participle_verbs": 0.887, "blimp/accuracy/drop_argument": 0.715, "blimp/accuracy/wh_questions_object_gap": 0.826, "blimp/accuracy/animate_subject_passive": 0.782, "blimp/accuracy/existential_there_quantifiers_1": 0.976, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.93, "blimp/accuracy/npi_present_2": 0.559, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.965, "blimp/accuracy/anaphor_number_agreement": 0.994, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.963, "blimp/accuracy/existential_there_object_raising": 0.876, "blimp/accuracy/matrix_question_npi_licensor_present": 0.313, "blimp/accuracy/npi_present_1": 0.529, "blimp/accuracy/wh_vs_that_no_gap": 0.97, "blimp/accuracy/left_branch_island_echo_question": 0.433, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.953, "blimp/accuracy/causative": 0.753, "blimp/accuracy/group_average": 0.7938955223880598, "blimp/accuracy/seq_average": 0.7938955223880597, "cbt/accuracy/NE": 0.8112980769230769, "cbt/accuracy/V": 0.9356, "cbt/accuracy/CN": 0.8788, "cbt/accuracy/P": 0.912, "cbt/accuracy/group_average": 0.8844245192307691, "cbt/accuracy/seq_average": 0.884453781512605, "hellaswag/accuracy/val": 0.3397729535949014, "hellaswag/accuracy/group_average": 0.3397729535949014, "hellaswag/accuracy/seq_average": 0.3397729535949014, "piqa/accuracy/val": 0.6273122959738846, "piqa/accuracy/group_average": 0.6273122959738846, "piqa/accuracy/seq_average": 0.6273122959738846, "ai2arc/accuracy/ARC-Easy": 0.360676532769556, "ai2arc/accuracy/ARC-Challenge": 0.2317596566523605, "ai2arc/accuracy/group_average": 0.2962180947109583, "ai2arc/accuracy/seq_average": 0.3181303116147309, "mmlu/accuracy/MMLU": 0.2675008938148016, "mmlu/accuracy/group_average": 0.2675008938148016, "mmlu/accuracy/seq_average": 0.2675008938148016, "openbookqa/accuracy/test": 0.28, "openbookqa/accuracy/group_average": 0.28, "openbookqa/accuracy/seq_average": 0.28, "race/accuracy/test/high": 0.27844482561463696, "race/accuracy/test/middle": 0.3467966573816156, "race/accuracy/group_average": 0.31262074149812624, "race/accuracy/seq_average": 0.29833806242399674, "siqa/accuracy/dev": 0.3741044012282497, "siqa/accuracy/group_average": 0.3741044012282497, "siqa/accuracy/seq_average": 0.3741044012282497, "winogrande/accuracy/dev": 0.4988161010260458, "winogrande/accuracy/group_average": 0.4988161010260458, "winogrande/accuracy/seq_average": 0.4988161010260458, "commonsenseqa/accuracy/dev_rand_split": 0.266994266994267, "commonsenseqa/accuracy/group_average": 0.266994266994267, "commonsenseqa/accuracy/seq_average": 0.266994266994267}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-180000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.3594217451791915, "val/accuracy": 0.5116422138516865, "val/perplexity": 10.584828954052167, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3822523792337926, "lambada/accuracy/total": 0.35229037267080743, "lambada/accuracy/openai_last_token": 0.797748447204969, "lambada/perplexity": 7.606355550412874, "lambada/lm_loss": 2.9619059617024575, "lambada/lm_perplexity": 19.3347880246582, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.43196629326124697, "mean_loss": 2.3708370622064923, "blimp/accuracy/passive_2": 0.917, "blimp/accuracy/determiner_noun_agreement_2": 0.987, "blimp/accuracy/ellipsis_n_bar_1": 0.854, "blimp/accuracy/tough_vs_raising_2": 0.859, "blimp/accuracy/tough_vs_raising_1": 0.64, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.918, "blimp/accuracy/principle_A_reconstruction": 0.288, "blimp/accuracy/wh_vs_that_with_gap": 0.465, "blimp/accuracy/principle_A_domain_2": 0.892, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.905, "blimp/accuracy/principle_A_domain_3": 0.61, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.945, "blimp/accuracy/animate_subject_trans": 0.901, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.918, "blimp/accuracy/distractor_agreement_relative_clause": 0.678, "blimp/accuracy/transitive": 0.906, "blimp/accuracy/sentential_subject_island": 0.292, "blimp/accuracy/adjunct_island": 0.868, "blimp/accuracy/intransitive": 0.751, "blimp/accuracy/existential_there_subject_raising": 0.889, "blimp/accuracy/irregular_past_participle_adjectives": 0.956, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.767, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.339, "blimp/accuracy/only_npi_scope": 0.709, "blimp/accuracy/superlative_quantifiers_2": 0.747, "blimp/accuracy/passive_1": 0.892, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.932, "blimp/accuracy/inchoative": 0.606, "blimp/accuracy/anaphor_gender_agreement": 0.967, "blimp/accuracy/principle_A_c_command": 0.687, "blimp/accuracy/only_npi_licensor_present": 0.546, "blimp/accuracy/expletive_it_object_raising": 0.761, "blimp/accuracy/left_branch_island_simple_question": 0.854, "blimp/accuracy/wh_questions_subject_gap": 0.937, "blimp/accuracy/existential_there_quantifiers_2": 0.537, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.936, "blimp/accuracy/sentential_negation_npi_scope": 0.725, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.811, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.934, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.894, "blimp/accuracy/principle_A_case_2": 0.931, "blimp/accuracy/distractor_agreement_relational_noun": 0.828, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.98, "blimp/accuracy/superlative_quantifiers_1": 0.685, "blimp/accuracy/wh_island": 0.774, "blimp/accuracy/principle_A_domain_1": 0.994, "blimp/accuracy/complex_NP_island": 0.608, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.977, "blimp/accuracy/irregular_past_participle_verbs": 0.899, "blimp/accuracy/drop_argument": 0.754, "blimp/accuracy/wh_questions_object_gap": 0.847, "blimp/accuracy/animate_subject_passive": 0.794, "blimp/accuracy/existential_there_quantifiers_1": 0.975, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.899, "blimp/accuracy/npi_present_2": 0.555, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.931, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.956, "blimp/accuracy/existential_there_object_raising": 0.853, "blimp/accuracy/matrix_question_npi_licensor_present": 0.367, "blimp/accuracy/npi_present_1": 0.561, "blimp/accuracy/wh_vs_that_no_gap": 0.978, "blimp/accuracy/left_branch_island_echo_question": 0.508, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.961, "blimp/accuracy/causative": 0.755, "blimp/accuracy/group_average": 0.7965373134328356, "blimp/accuracy/seq_average": 0.7965373134328358, "cbt/accuracy/NE": 0.8044871794871795, "cbt/accuracy/V": 0.934, "cbt/accuracy/CN": 0.8792, "cbt/accuracy/P": 0.9168, "cbt/accuracy/group_average": 0.8836217948717948, "cbt/accuracy/seq_average": 0.8836534613845538, "hellaswag/accuracy/val": 0.34734116709818763, "hellaswag/accuracy/group_average": 0.34734116709818763, "hellaswag/accuracy/seq_average": 0.34734116709818763, "piqa/accuracy/val": 0.6327529923830251, "piqa/accuracy/group_average": 0.6327529923830251, "piqa/accuracy/seq_average": 0.6327529923830251, "ai2arc/accuracy/ARC-Easy": 0.3788583509513742, "ai2arc/accuracy/ARC-Challenge": 0.22660944206008585, "ai2arc/accuracy/group_average": 0.30273389650573, "ai2arc/accuracy/seq_average": 0.3286118980169972, "mmlu/accuracy/MMLU": 0.25927779764032893, "mmlu/accuracy/group_average": 0.25927779764032893, "mmlu/accuracy/seq_average": 0.25927779764032893, "openbookqa/accuracy/test": 0.28, "openbookqa/accuracy/group_average": 0.28, "openbookqa/accuracy/seq_average": 0.28, "race/accuracy/test/high": 0.29130931961120643, "race/accuracy/test/middle": 0.36768802228412256, "race/accuracy/group_average": 0.3294986709476645, "race/accuracy/seq_average": 0.313538710985002, "siqa/accuracy/dev": 0.3705220061412487, "siqa/accuracy/group_average": 0.3705220061412487, "siqa/accuracy/seq_average": 0.3705220061412487, "winogrande/accuracy/dev": 0.4988161010260458, "winogrande/accuracy/group_average": 0.4988161010260458, "winogrande/accuracy/seq_average": 0.4988161010260458, "commonsenseqa/accuracy/dev_rand_split": 0.26453726453726456, "commonsenseqa/accuracy/group_average": 0.26453726453726456, "commonsenseqa/accuracy/seq_average": 0.26453726453726456}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-20000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.7214597671750993, "val/accuracy": 0.4614684089781746, "val/perplexity": 15.202498162975186, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6877632851926436, "lambada/accuracy/total": 0.2138975155279503, "lambada/accuracy/openai_last_token": 0.7468944099378882, "lambada/perplexity": 14.989512899768428, "lambada/lm_loss": 3.2719860433345973, "lambada/lm_perplexity": 26.36364672856658, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.33768296225306244, "mean_loss": 2.7046115261838715, "blimp/accuracy/passive_2": 0.882, "blimp/accuracy/determiner_noun_agreement_2": 0.983, "blimp/accuracy/ellipsis_n_bar_1": 0.818, "blimp/accuracy/tough_vs_raising_2": 0.864, "blimp/accuracy/tough_vs_raising_1": 0.554, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.866, "blimp/accuracy/principle_A_reconstruction": 0.509, "blimp/accuracy/wh_vs_that_with_gap": 0.478, "blimp/accuracy/principle_A_domain_2": 0.865, "blimp/accuracy/determiner_noun_agreement_1": 0.986, "blimp/accuracy/ellipsis_n_bar_2": 0.907, "blimp/accuracy/principle_A_domain_3": 0.616, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.909, "blimp/accuracy/animate_subject_trans": 0.89, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.883, "blimp/accuracy/distractor_agreement_relative_clause": 0.524, "blimp/accuracy/transitive": 0.836, "blimp/accuracy/sentential_subject_island": 0.35, "blimp/accuracy/adjunct_island": 0.806, "blimp/accuracy/intransitive": 0.769, "blimp/accuracy/existential_there_subject_raising": 0.866, "blimp/accuracy/irregular_past_participle_adjectives": 0.866, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.323, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.23, "blimp/accuracy/only_npi_scope": 0.54, "blimp/accuracy/superlative_quantifiers_2": 0.611, "blimp/accuracy/passive_1": 0.886, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.869, "blimp/accuracy/inchoative": 0.623, "blimp/accuracy/anaphor_gender_agreement": 0.962, "blimp/accuracy/principle_A_c_command": 0.536, "blimp/accuracy/only_npi_licensor_present": 0.788, "blimp/accuracy/expletive_it_object_raising": 0.715, "blimp/accuracy/left_branch_island_simple_question": 0.457, "blimp/accuracy/wh_questions_subject_gap": 0.917, "blimp/accuracy/existential_there_quantifiers_2": 0.36, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.948, "blimp/accuracy/sentential_negation_npi_scope": 0.588, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.804, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.893, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.87, "blimp/accuracy/principle_A_case_2": 0.927, "blimp/accuracy/distractor_agreement_relational_noun": 0.711, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.99, "blimp/accuracy/superlative_quantifiers_1": 0.603, "blimp/accuracy/wh_island": 0.809, "blimp/accuracy/principle_A_domain_1": 0.969, "blimp/accuracy/complex_NP_island": 0.515, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.971, "blimp/accuracy/irregular_past_participle_verbs": 0.869, "blimp/accuracy/drop_argument": 0.757, "blimp/accuracy/wh_questions_object_gap": 0.745, "blimp/accuracy/animate_subject_passive": 0.793, "blimp/accuracy/existential_there_quantifiers_1": 0.971, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.849, "blimp/accuracy/npi_present_2": 0.62, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.918, "blimp/accuracy/anaphor_number_agreement": 0.983, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.946, "blimp/accuracy/existential_there_object_raising": 0.803, "blimp/accuracy/matrix_question_npi_licensor_present": 0.174, "blimp/accuracy/npi_present_1": 0.623, "blimp/accuracy/wh_vs_that_no_gap": 0.968, "blimp/accuracy/left_branch_island_echo_question": 0.453, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.975, "blimp/accuracy/causative": 0.7, "blimp/accuracy/group_average": 0.7550447761194028, "blimp/accuracy/seq_average": 0.755044776119403, "cbt/accuracy/NE": 0.7520032051282052, "cbt/accuracy/V": 0.898, "cbt/accuracy/CN": 0.806, "cbt/accuracy/P": 0.8712, "cbt/accuracy/group_average": 0.8318008012820512, "cbt/accuracy/seq_average": 0.8318327330932372, "hellaswag/accuracy/val": 0.28689504082852024, "hellaswag/accuracy/group_average": 0.28689504082852024, "hellaswag/accuracy/seq_average": 0.28689504082852024, "piqa/accuracy/val": 0.5761697497279652, "piqa/accuracy/group_average": 0.5761697497279652, "piqa/accuracy/seq_average": 0.5761697497279652, "ai2arc/accuracy/ARC-Easy": 0.32473572938689216, "ai2arc/accuracy/ARC-Challenge": 0.1982832618025751, "ai2arc/accuracy/group_average": 0.26150949559473363, "ai2arc/accuracy/seq_average": 0.2830028328611898, "mmlu/accuracy/MMLU": 0.26313907758312477, "mmlu/accuracy/group_average": 0.26313907758312477, "mmlu/accuracy/seq_average": 0.26313907758312477, "openbookqa/accuracy/test": 0.282, "openbookqa/accuracy/group_average": 0.282, "openbookqa/accuracy/seq_average": 0.282, "race/accuracy/test/high": 0.2670097198399085, "race/accuracy/test/middle": 0.3293871866295265, "race/accuracy/group_average": 0.2981984532347175, "race/accuracy/seq_average": 0.28516416700445885, "siqa/accuracy/dev": 0.3541453428863869, "siqa/accuracy/group_average": 0.3541453428863869, "siqa/accuracy/seq_average": 0.3541453428863869, "winogrande/accuracy/dev": 0.505130228887135, "winogrande/accuracy/group_average": 0.505130228887135, "winogrande/accuracy/seq_average": 0.505130228887135, "commonsenseqa/accuracy/dev_rand_split": 0.25634725634725636, "commonsenseqa/accuracy/group_average": 0.25634725634725636, "commonsenseqa/accuracy/seq_average": 0.25634725634725636}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-200000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.3440958658854165, "val/accuracy": 0.5141766260540674, "val/perplexity": 10.423843912848005, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.363063835949631, "lambada/accuracy/total": 0.34142080745341613, "lambada/accuracy/openai_last_token": 0.7930900621118012, "lambada/perplexity": 8.002005484452583, "lambada/lm_loss": 2.95954667552272, "lambada/lm_perplexity": 19.289225495140865, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4277987167537418, "mean_loss": 2.353579850917524, "blimp/accuracy/passive_2": 0.91, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.854, "blimp/accuracy/tough_vs_raising_2": 0.856, "blimp/accuracy/tough_vs_raising_1": 0.62, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.911, "blimp/accuracy/principle_A_reconstruction": 0.426, "blimp/accuracy/wh_vs_that_with_gap": 0.482, "blimp/accuracy/principle_A_domain_2": 0.879, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.89, "blimp/accuracy/principle_A_domain_3": 0.611, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.94, "blimp/accuracy/animate_subject_trans": 0.908, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.927, "blimp/accuracy/distractor_agreement_relative_clause": 0.708, "blimp/accuracy/transitive": 0.873, "blimp/accuracy/sentential_subject_island": 0.371, "blimp/accuracy/adjunct_island": 0.867, "blimp/accuracy/intransitive": 0.744, "blimp/accuracy/existential_there_subject_raising": 0.897, "blimp/accuracy/irregular_past_participle_adjectives": 0.919, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.707, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.379, "blimp/accuracy/only_npi_scope": 0.658, "blimp/accuracy/superlative_quantifiers_2": 0.734, "blimp/accuracy/passive_1": 0.899, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.912, "blimp/accuracy/inchoative": 0.594, "blimp/accuracy/anaphor_gender_agreement": 0.973, "blimp/accuracy/principle_A_c_command": 0.678, "blimp/accuracy/only_npi_licensor_present": 0.874, "blimp/accuracy/expletive_it_object_raising": 0.792, "blimp/accuracy/left_branch_island_simple_question": 0.84, "blimp/accuracy/wh_questions_subject_gap": 0.931, "blimp/accuracy/existential_there_quantifiers_2": 0.541, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.938, "blimp/accuracy/sentential_negation_npi_scope": 0.716, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.847, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.88, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.886, "blimp/accuracy/principle_A_case_2": 0.924, "blimp/accuracy/distractor_agreement_relational_noun": 0.817, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.976, "blimp/accuracy/superlative_quantifiers_1": 0.63, "blimp/accuracy/wh_island": 0.783, "blimp/accuracy/principle_A_domain_1": 0.994, "blimp/accuracy/complex_NP_island": 0.681, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.971, "blimp/accuracy/irregular_past_participle_verbs": 0.903, "blimp/accuracy/drop_argument": 0.724, "blimp/accuracy/wh_questions_object_gap": 0.857, "blimp/accuracy/animate_subject_passive": 0.8, "blimp/accuracy/existential_there_quantifiers_1": 0.985, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.915, "blimp/accuracy/npi_present_2": 0.553, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.946, "blimp/accuracy/anaphor_number_agreement": 0.99, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.968, "blimp/accuracy/existential_there_object_raising": 0.859, "blimp/accuracy/matrix_question_npi_licensor_present": 0.354, "blimp/accuracy/npi_present_1": 0.637, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.542, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.954, "blimp/accuracy/causative": 0.762, "blimp/accuracy/group_average": 0.8038507462686567, "blimp/accuracy/seq_average": 0.8038507462686567, "cbt/accuracy/NE": 0.8100961538461539, "cbt/accuracy/V": 0.9396, "cbt/accuracy/CN": 0.882, "cbt/accuracy/P": 0.9192, "cbt/accuracy/group_average": 0.8877240384615385, "cbt/accuracy/seq_average": 0.8877551020408163, "hellaswag/accuracy/val": 0.35172276438956385, "hellaswag/accuracy/group_average": 0.35172276438956385, "hellaswag/accuracy/seq_average": 0.35172276438956385, "piqa/accuracy/val": 0.6371055495103374, "piqa/accuracy/group_average": 0.6371055495103374, "piqa/accuracy/seq_average": 0.6371055495103374, "ai2arc/accuracy/ARC-Easy": 0.3767441860465116, "ai2arc/accuracy/ARC-Challenge": 0.2257510729613734, "ai2arc/accuracy/group_average": 0.3012476295039425, "ai2arc/accuracy/seq_average": 0.32691218130311617, "mmlu/accuracy/MMLU": 0.26235252055774044, "mmlu/accuracy/group_average": 0.26235252055774044, "mmlu/accuracy/seq_average": 0.26235252055774044, "openbookqa/accuracy/test": 0.278, "openbookqa/accuracy/group_average": 0.278, "openbookqa/accuracy/seq_average": 0.278, "race/accuracy/test/high": 0.28444825614636937, "race/accuracy/test/middle": 0.3628133704735376, "race/accuracy/group_average": 0.32363081330995347, "race/accuracy/seq_average": 0.30725577624645317, "siqa/accuracy/dev": 0.37717502558853633, "siqa/accuracy/group_average": 0.37717502558853633, "siqa/accuracy/seq_average": 0.37717502558853633, "winogrande/accuracy/dev": 0.5067087608524072, "winogrande/accuracy/group_average": 0.5067087608524072, "winogrande/accuracy/seq_average": 0.5067087608524072, "commonsenseqa/accuracy/dev_rand_split": 0.26863226863226863, "commonsenseqa/accuracy/group_average": 0.26863226863226863, "commonsenseqa/accuracy/seq_average": 0.26863226863226863}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-220000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.3291797940693204, "val/accuracy": 0.5159146747891865, "val/perplexity": 10.269514960677588, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4173195406516887, "lambada/accuracy/total": 0.3513198757763975, "lambada/accuracy/openai_last_token": 0.7983307453416149, "lambada/perplexity": 7.055594567274998, "lambada/lm_loss": 2.9339596302248445, "lambada/lm_perplexity": 18.801931994576645, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.43361727528279204, "mean_loss": 2.3732496673605046, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.986, "blimp/accuracy/ellipsis_n_bar_1": 0.873, "blimp/accuracy/tough_vs_raising_2": 0.844, "blimp/accuracy/tough_vs_raising_1": 0.632, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.905, "blimp/accuracy/principle_A_reconstruction": 0.397, "blimp/accuracy/wh_vs_that_with_gap": 0.476, "blimp/accuracy/principle_A_domain_2": 0.888, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.906, "blimp/accuracy/principle_A_domain_3": 0.605, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.917, "blimp/accuracy/animate_subject_trans": 0.911, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.897, "blimp/accuracy/distractor_agreement_relative_clause": 0.706, "blimp/accuracy/transitive": 0.882, "blimp/accuracy/sentential_subject_island": 0.404, "blimp/accuracy/adjunct_island": 0.867, "blimp/accuracy/intransitive": 0.769, "blimp/accuracy/existential_there_subject_raising": 0.887, "blimp/accuracy/irregular_past_participle_adjectives": 0.912, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.642, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.353, "blimp/accuracy/only_npi_scope": 0.719, "blimp/accuracy/superlative_quantifiers_2": 0.66, "blimp/accuracy/passive_1": 0.895, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.912, "blimp/accuracy/inchoative": 0.604, "blimp/accuracy/anaphor_gender_agreement": 0.967, "blimp/accuracy/principle_A_c_command": 0.68, "blimp/accuracy/only_npi_licensor_present": 0.646, "blimp/accuracy/expletive_it_object_raising": 0.8, "blimp/accuracy/left_branch_island_simple_question": 0.751, "blimp/accuracy/wh_questions_subject_gap": 0.947, "blimp/accuracy/existential_there_quantifiers_2": 0.399, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.95, "blimp/accuracy/sentential_negation_npi_scope": 0.712, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.833, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.918, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.897, "blimp/accuracy/principle_A_case_2": 0.906, "blimp/accuracy/distractor_agreement_relational_noun": 0.833, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.977, "blimp/accuracy/superlative_quantifiers_1": 0.685, "blimp/accuracy/wh_island": 0.809, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.598, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.971, "blimp/accuracy/irregular_past_participle_verbs": 0.897, "blimp/accuracy/drop_argument": 0.717, "blimp/accuracy/wh_questions_object_gap": 0.853, "blimp/accuracy/animate_subject_passive": 0.792, "blimp/accuracy/existential_there_quantifiers_1": 0.974, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.888, "blimp/accuracy/npi_present_2": 0.58, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.925, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.963, "blimp/accuracy/existential_there_object_raising": 0.846, "blimp/accuracy/matrix_question_npi_licensor_present": 0.336, "blimp/accuracy/npi_present_1": 0.6, "blimp/accuracy/wh_vs_that_no_gap": 0.971, "blimp/accuracy/left_branch_island_echo_question": 0.451, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.94, "blimp/accuracy/causative": 0.755, "blimp/accuracy/group_average": 0.7925820895522383, "blimp/accuracy/seq_average": 0.7925820895522389, "cbt/accuracy/NE": 0.8185096153846154, "cbt/accuracy/V": 0.9352, "cbt/accuracy/CN": 0.8804, "cbt/accuracy/P": 0.9252, "cbt/accuracy/group_average": 0.8898274038461538, "cbt/accuracy/seq_average": 0.8898559423769508, "hellaswag/accuracy/val": 0.35700059749053975, "hellaswag/accuracy/group_average": 0.35700059749053975, "hellaswag/accuracy/seq_average": 0.35700059749053975, "piqa/accuracy/val": 0.6332970620239391, "piqa/accuracy/group_average": 0.6332970620239391, "piqa/accuracy/seq_average": 0.6332970620239391, "ai2arc/accuracy/ARC-Easy": 0.3788583509513742, "ai2arc/accuracy/ARC-Challenge": 0.2240343347639485, "ai2arc/accuracy/group_average": 0.30144634285766136, "ai2arc/accuracy/seq_average": 0.3277620396600567, "mmlu/accuracy/MMLU": 0.2619949946371112, "mmlu/accuracy/group_average": 0.2619949946371112, "mmlu/accuracy/seq_average": 0.2619949946371112, "openbookqa/accuracy/test": 0.268, "openbookqa/accuracy/group_average": 0.268, "openbookqa/accuracy/seq_average": 0.268, "race/accuracy/test/high": 0.2864493996569468, "race/accuracy/test/middle": 0.3593314763231198, "race/accuracy/group_average": 0.3228904379900333, "race/accuracy/seq_average": 0.30766112687474667, "siqa/accuracy/dev": 0.368474923234391, "siqa/accuracy/group_average": 0.368474923234391, "siqa/accuracy/seq_average": 0.368474923234391, "winogrande/accuracy/dev": 0.500394632991318, "winogrande/accuracy/group_average": 0.500394632991318, "winogrande/accuracy/seq_average": 0.500394632991318, "commonsenseqa/accuracy/dev_rand_split": 0.2727272727272727, "commonsenseqa/accuracy/group_average": 0.2727272727272727, "commonsenseqa/accuracy/seq_average": 0.2727272727272727}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-240000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.314915490528894, "val/accuracy": 0.5184190538194444, "val/perplexity": 10.12406730301678, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3526321316357723, "lambada/accuracy/total": 0.3511257763975155, "lambada/accuracy/openai_last_token": 0.797748447204969, "lambada/perplexity": 6.933118277255691, "lambada/lm_loss": 2.9107763958236865, "lambada/lm_perplexity": 18.371056242845395, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.43477241510847997, "mean_loss": 2.333773811082333, "blimp/accuracy/passive_2": 0.906, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.863, "blimp/accuracy/tough_vs_raising_2": 0.884, "blimp/accuracy/tough_vs_raising_1": 0.601, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.898, "blimp/accuracy/principle_A_reconstruction": 0.413, "blimp/accuracy/wh_vs_that_with_gap": 0.446, "blimp/accuracy/principle_A_domain_2": 0.89, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.906, "blimp/accuracy/principle_A_domain_3": 0.618, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.937, "blimp/accuracy/animate_subject_trans": 0.904, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.906, "blimp/accuracy/distractor_agreement_relative_clause": 0.733, "blimp/accuracy/transitive": 0.883, "blimp/accuracy/sentential_subject_island": 0.341, "blimp/accuracy/adjunct_island": 0.868, "blimp/accuracy/intransitive": 0.762, "blimp/accuracy/existential_there_subject_raising": 0.903, "blimp/accuracy/irregular_past_participle_adjectives": 0.875, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.623, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.333, "blimp/accuracy/only_npi_scope": 0.698, "blimp/accuracy/superlative_quantifiers_2": 0.721, "blimp/accuracy/passive_1": 0.898, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.93, "blimp/accuracy/inchoative": 0.614, "blimp/accuracy/anaphor_gender_agreement": 0.975, "blimp/accuracy/principle_A_c_command": 0.68, "blimp/accuracy/only_npi_licensor_present": 0.695, "blimp/accuracy/expletive_it_object_raising": 0.792, "blimp/accuracy/left_branch_island_simple_question": 0.71, "blimp/accuracy/wh_questions_subject_gap": 0.925, "blimp/accuracy/existential_there_quantifiers_2": 0.479, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.947, "blimp/accuracy/sentential_negation_npi_scope": 0.726, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.833, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.894, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.901, "blimp/accuracy/principle_A_case_2": 0.943, "blimp/accuracy/distractor_agreement_relational_noun": 0.848, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.974, "blimp/accuracy/superlative_quantifiers_1": 0.844, "blimp/accuracy/wh_island": 0.829, "blimp/accuracy/principle_A_domain_1": 0.995, "blimp/accuracy/complex_NP_island": 0.616, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.975, "blimp/accuracy/irregular_past_participle_verbs": 0.903, "blimp/accuracy/drop_argument": 0.731, "blimp/accuracy/wh_questions_object_gap": 0.845, "blimp/accuracy/animate_subject_passive": 0.81, "blimp/accuracy/existential_there_quantifiers_1": 0.97, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.91, "blimp/accuracy/npi_present_2": 0.555, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.969, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.957, "blimp/accuracy/existential_there_object_raising": 0.872, "blimp/accuracy/matrix_question_npi_licensor_present": 0.36, "blimp/accuracy/npi_present_1": 0.554, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.443, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.959, "blimp/accuracy/causative": 0.74, "blimp/accuracy/group_average": 0.7978955223880595, "blimp/accuracy/seq_average": 0.7978955223880597, "cbt/accuracy/NE": 0.8125, "cbt/accuracy/V": 0.9412, "cbt/accuracy/CN": 0.8824, "cbt/accuracy/P": 0.926, "cbt/accuracy/group_average": 0.890525, "cbt/accuracy/seq_average": 0.8905562224889956, "hellaswag/accuracy/val": 0.35590519816769567, "hellaswag/accuracy/group_average": 0.35590519816769567, "hellaswag/accuracy/seq_average": 0.35590519816769567, "piqa/accuracy/val": 0.6338411316648531, "piqa/accuracy/group_average": 0.6338411316648531, "piqa/accuracy/seq_average": 0.6338411316648531, "ai2arc/accuracy/ARC-Easy": 0.3767441860465116, "ai2arc/accuracy/ARC-Challenge": 0.2429184549356223, "ai2arc/accuracy/group_average": 0.309831320491067, "ai2arc/accuracy/seq_average": 0.33257790368271956, "mmlu/accuracy/MMLU": 0.2624240257418663, "mmlu/accuracy/group_average": 0.2624240257418663, "mmlu/accuracy/seq_average": 0.2624240257418663, "openbookqa/accuracy/test": 0.268, "openbookqa/accuracy/group_average": 0.268, "openbookqa/accuracy/seq_average": 0.268, "race/accuracy/test/high": 0.29245283018867924, "race/accuracy/test/middle": 0.36142061281337046, "race/accuracy/group_average": 0.3269367215010248, "race/accuracy/seq_average": 0.31252533441426833, "siqa/accuracy/dev": 0.37717502558853633, "siqa/accuracy/group_average": 0.37717502558853633, "siqa/accuracy/seq_average": 0.37717502558853633, "winogrande/accuracy/dev": 0.5059194948697711, "winogrande/accuracy/group_average": 0.5059194948697711, "winogrande/accuracy/seq_average": 0.5059194948697711, "commonsenseqa/accuracy/dev_rand_split": 0.266994266994267, "commonsenseqa/accuracy/group_average": 0.266994266994267, "commonsenseqa/accuracy/seq_average": 0.266994266994267}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-260000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.303532918294271, "val/accuracy": 0.5205116877480159, "val/perplexity": 10.009482746285755, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3750993242915373, "lambada/accuracy/total": 0.3687888198757764, "lambada/accuracy/openai_last_token": 0.8000776397515528, "lambada/perplexity": 6.975504465511719, "lambada/lm_loss": 2.8943284415510946, "lambada/lm_perplexity": 18.071361389912358, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.44465025381189616, "mean_loss": 2.339316121292904, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.859, "blimp/accuracy/tough_vs_raising_2": 0.887, "blimp/accuracy/tough_vs_raising_1": 0.596, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.895, "blimp/accuracy/principle_A_reconstruction": 0.364, "blimp/accuracy/wh_vs_that_with_gap": 0.462, "blimp/accuracy/principle_A_domain_2": 0.891, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.915, "blimp/accuracy/principle_A_domain_3": 0.606, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.939, "blimp/accuracy/animate_subject_trans": 0.902, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.914, "blimp/accuracy/distractor_agreement_relative_clause": 0.738, "blimp/accuracy/transitive": 0.882, "blimp/accuracy/sentential_subject_island": 0.316, "blimp/accuracy/adjunct_island": 0.866, "blimp/accuracy/intransitive": 0.749, "blimp/accuracy/existential_there_subject_raising": 0.904, "blimp/accuracy/irregular_past_participle_adjectives": 0.888, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.711, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.349, "blimp/accuracy/only_npi_scope": 0.617, "blimp/accuracy/superlative_quantifiers_2": 0.808, "blimp/accuracy/passive_1": 0.905, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.931, "blimp/accuracy/inchoative": 0.622, "blimp/accuracy/anaphor_gender_agreement": 0.978, "blimp/accuracy/principle_A_c_command": 0.704, "blimp/accuracy/only_npi_licensor_present": 0.686, "blimp/accuracy/expletive_it_object_raising": 0.805, "blimp/accuracy/left_branch_island_simple_question": 0.782, "blimp/accuracy/wh_questions_subject_gap": 0.942, "blimp/accuracy/existential_there_quantifiers_2": 0.51, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.954, "blimp/accuracy/sentential_negation_npi_scope": 0.752, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.813, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.894, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.893, "blimp/accuracy/principle_A_case_2": 0.918, "blimp/accuracy/distractor_agreement_relational_noun": 0.864, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.983, "blimp/accuracy/superlative_quantifiers_1": 0.894, "blimp/accuracy/wh_island": 0.803, "blimp/accuracy/principle_A_domain_1": 0.995, "blimp/accuracy/complex_NP_island": 0.657, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.975, "blimp/accuracy/irregular_past_participle_verbs": 0.912, "blimp/accuracy/drop_argument": 0.726, "blimp/accuracy/wh_questions_object_gap": 0.829, "blimp/accuracy/animate_subject_passive": 0.81, "blimp/accuracy/existential_there_quantifiers_1": 0.969, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.911, "blimp/accuracy/npi_present_2": 0.597, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.97, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.966, "blimp/accuracy/existential_there_object_raising": 0.857, "blimp/accuracy/matrix_question_npi_licensor_present": 0.362, "blimp/accuracy/npi_present_1": 0.6, "blimp/accuracy/wh_vs_that_no_gap": 0.981, "blimp/accuracy/left_branch_island_echo_question": 0.453, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.956, "blimp/accuracy/causative": 0.754, "blimp/accuracy/group_average": 0.8037611940298507, "blimp/accuracy/seq_average": 0.8037611940298508, "cbt/accuracy/NE": 0.828125, "cbt/accuracy/V": 0.9376, "cbt/accuracy/CN": 0.8832, "cbt/accuracy/P": 0.9272, "cbt/accuracy/group_average": 0.89403125, "cbt/accuracy/seq_average": 0.8940576230492197, "hellaswag/accuracy/val": 0.35929097789285, "hellaswag/accuracy/group_average": 0.35929097789285, "hellaswag/accuracy/seq_average": 0.35929097789285, "piqa/accuracy/val": 0.6409140369967355, "piqa/accuracy/group_average": 0.6409140369967355, "piqa/accuracy/seq_average": 0.6409140369967355, "ai2arc/accuracy/ARC-Easy": 0.3788583509513742, "ai2arc/accuracy/ARC-Challenge": 0.23004291845493563, "ai2arc/accuracy/group_average": 0.3044506347031549, "ai2arc/accuracy/seq_average": 0.32974504249291786, "mmlu/accuracy/MMLU": 0.2609939220593493, "mmlu/accuracy/group_average": 0.2609939220593493, "mmlu/accuracy/seq_average": 0.2609939220593493, "openbookqa/accuracy/test": 0.276, "openbookqa/accuracy/group_average": 0.276, "openbookqa/accuracy/seq_average": 0.276, "race/accuracy/test/high": 0.2893081761006289, "race/accuracy/test/middle": 0.3649025069637883, "race/accuracy/group_average": 0.3271053415322086, "race/accuracy/seq_average": 0.31130928252938794, "siqa/accuracy/dev": 0.3705220061412487, "siqa/accuracy/group_average": 0.3705220061412487, "siqa/accuracy/seq_average": 0.3705220061412487, "winogrande/accuracy/dev": 0.4996053670086819, "winogrande/accuracy/group_average": 0.4996053670086819, "winogrande/accuracy/seq_average": 0.4996053670086819, "commonsenseqa/accuracy/dev_rand_split": 0.26535626535626533, "commonsenseqa/accuracy/group_average": 0.26535626535626533, "commonsenseqa/accuracy/seq_average": 0.26535626535626533}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-280000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.2922164674789185, "val/accuracy": 0.5227467370411706, "val/perplexity": 9.896849433772832, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.338406201475155, "lambada/accuracy/total": 0.38528726708074534, "lambada/accuracy/openai_last_token": 0.8018245341614907, "lambada/perplexity": 6.266475327407352, "lambada/lm_loss": 2.8854536178165917, "lambada/lm_perplexity": 17.911690815041215, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.45401700206095796, "mean_loss": 2.3153113344770366, "blimp/accuracy/passive_2": 0.913, "blimp/accuracy/determiner_noun_agreement_2": 0.983, "blimp/accuracy/ellipsis_n_bar_1": 0.862, "blimp/accuracy/tough_vs_raising_2": 0.872, "blimp/accuracy/tough_vs_raising_1": 0.611, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.875, "blimp/accuracy/principle_A_reconstruction": 0.396, "blimp/accuracy/wh_vs_that_with_gap": 0.437, "blimp/accuracy/principle_A_domain_2": 0.89, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.915, "blimp/accuracy/principle_A_domain_3": 0.615, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.93, "blimp/accuracy/animate_subject_trans": 0.889, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.911, "blimp/accuracy/distractor_agreement_relative_clause": 0.725, "blimp/accuracy/transitive": 0.893, "blimp/accuracy/sentential_subject_island": 0.315, "blimp/accuracy/adjunct_island": 0.868, "blimp/accuracy/intransitive": 0.756, "blimp/accuracy/existential_there_subject_raising": 0.911, "blimp/accuracy/irregular_past_participle_adjectives": 0.832, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.677, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.327, "blimp/accuracy/only_npi_scope": 0.728, "blimp/accuracy/superlative_quantifiers_2": 0.81, "blimp/accuracy/passive_1": 0.899, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.903, "blimp/accuracy/inchoative": 0.617, "blimp/accuracy/anaphor_gender_agreement": 0.977, "blimp/accuracy/principle_A_c_command": 0.688, "blimp/accuracy/only_npi_licensor_present": 0.744, "blimp/accuracy/expletive_it_object_raising": 0.791, "blimp/accuracy/left_branch_island_simple_question": 0.772, "blimp/accuracy/wh_questions_subject_gap": 0.926, "blimp/accuracy/existential_there_quantifiers_2": 0.511, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.947, "blimp/accuracy/sentential_negation_npi_scope": 0.731, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.839, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.912, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.884, "blimp/accuracy/principle_A_case_2": 0.91, "blimp/accuracy/distractor_agreement_relational_noun": 0.857, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.983, "blimp/accuracy/superlative_quantifiers_1": 0.877, "blimp/accuracy/wh_island": 0.814, "blimp/accuracy/principle_A_domain_1": 0.996, "blimp/accuracy/complex_NP_island": 0.62, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.978, "blimp/accuracy/irregular_past_participle_verbs": 0.898, "blimp/accuracy/drop_argument": 0.714, "blimp/accuracy/wh_questions_object_gap": 0.843, "blimp/accuracy/animate_subject_passive": 0.809, "blimp/accuracy/existential_there_quantifiers_1": 0.97, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.897, "blimp/accuracy/npi_present_2": 0.584, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.952, "blimp/accuracy/anaphor_number_agreement": 0.994, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.962, "blimp/accuracy/existential_there_object_raising": 0.846, "blimp/accuracy/matrix_question_npi_licensor_present": 0.411, "blimp/accuracy/npi_present_1": 0.57, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.446, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.967, "blimp/accuracy/causative": 0.741, "blimp/accuracy/group_average": 0.8013432835820893, "blimp/accuracy/seq_average": 0.8013432835820895, "cbt/accuracy/NE": 0.8329326923076923, "cbt/accuracy/V": 0.9412, "cbt/accuracy/CN": 0.8916, "cbt/accuracy/P": 0.9284, "cbt/accuracy/group_average": 0.898533173076923, "cbt/accuracy/seq_average": 0.8985594237695078, "hellaswag/accuracy/val": 0.364070902210715, "hellaswag/accuracy/group_average": 0.364070902210715, "hellaswag/accuracy/seq_average": 0.364070902210715, "piqa/accuracy/val": 0.6425462459194777, "piqa/accuracy/group_average": 0.6425462459194777, "piqa/accuracy/seq_average": 0.6425462459194777, "ai2arc/accuracy/ARC-Easy": 0.37970401691331923, "ai2arc/accuracy/ARC-Challenge": 0.2317596566523605, "ai2arc/accuracy/group_average": 0.3057318367828399, "ai2arc/accuracy/seq_average": 0.33087818696883853, "mmlu/accuracy/MMLU": 0.26020736503396497, "mmlu/accuracy/group_average": 0.26020736503396497, "mmlu/accuracy/seq_average": 0.26020736503396497, "openbookqa/accuracy/test": 0.272, "openbookqa/accuracy/group_average": 0.272, "openbookqa/accuracy/seq_average": 0.272, "race/accuracy/test/high": 0.2827329902801601, "race/accuracy/test/middle": 0.3683844011142061, "race/accuracy/group_average": 0.3255586956971831, "race/accuracy/seq_average": 0.30766112687474667, "siqa/accuracy/dev": 0.3766632548618219, "siqa/accuracy/group_average": 0.3766632548618219, "siqa/accuracy/seq_average": 0.3766632548618219, "winogrande/accuracy/dev": 0.5082872928176796, "winogrande/accuracy/group_average": 0.5082872928176796, "winogrande/accuracy/seq_average": 0.5082872928176796, "commonsenseqa/accuracy/dev_rand_split": 0.26453726453726456, "commonsenseqa/accuracy/group_average": 0.26453726453726456, "commonsenseqa/accuracy/seq_average": 0.26453726453726456}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-300000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.28228517562624, "val/accuracy": 0.523619636656746, "val/perplexity": 9.799047387809685, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.42976995432599, "lambada/accuracy/total": 0.359472049689441, "lambada/accuracy/openai_last_token": 0.7981366459627329, "lambada/perplexity": 6.901439941437564, "lambada/lm_loss": 2.887644233892005, "lambada/lm_perplexity": 17.950971461578032, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.44154584317309353, "mean_loss": 2.356027564976115, "blimp/accuracy/passive_2": 0.918, "blimp/accuracy/determiner_noun_agreement_2": 0.986, "blimp/accuracy/ellipsis_n_bar_1": 0.852, "blimp/accuracy/tough_vs_raising_2": 0.87, "blimp/accuracy/tough_vs_raising_1": 0.589, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.895, "blimp/accuracy/principle_A_reconstruction": 0.389, "blimp/accuracy/wh_vs_that_with_gap": 0.463, "blimp/accuracy/principle_A_domain_2": 0.895, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.897, "blimp/accuracy/principle_A_domain_3": 0.611, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.928, "blimp/accuracy/animate_subject_trans": 0.906, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.924, "blimp/accuracy/distractor_agreement_relative_clause": 0.728, "blimp/accuracy/transitive": 0.885, "blimp/accuracy/sentential_subject_island": 0.324, "blimp/accuracy/adjunct_island": 0.855, "blimp/accuracy/intransitive": 0.766, "blimp/accuracy/existential_there_subject_raising": 0.911, "blimp/accuracy/irregular_past_participle_adjectives": 0.899, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.677, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.375, "blimp/accuracy/only_npi_scope": 0.731, "blimp/accuracy/superlative_quantifiers_2": 0.823, "blimp/accuracy/passive_1": 0.901, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.935, "blimp/accuracy/inchoative": 0.618, "blimp/accuracy/anaphor_gender_agreement": 0.97, "blimp/accuracy/principle_A_c_command": 0.711, "blimp/accuracy/only_npi_licensor_present": 0.745, "blimp/accuracy/expletive_it_object_raising": 0.806, "blimp/accuracy/left_branch_island_simple_question": 0.78, "blimp/accuracy/wh_questions_subject_gap": 0.933, "blimp/accuracy/existential_there_quantifiers_2": 0.461, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.94, "blimp/accuracy/sentential_negation_npi_scope": 0.714, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.829, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.904, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.906, "blimp/accuracy/principle_A_case_2": 0.927, "blimp/accuracy/distractor_agreement_relational_noun": 0.836, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.967, "blimp/accuracy/superlative_quantifiers_1": 0.846, "blimp/accuracy/wh_island": 0.798, "blimp/accuracy/principle_A_domain_1": 0.996, "blimp/accuracy/complex_NP_island": 0.574, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.979, "blimp/accuracy/irregular_past_participle_verbs": 0.921, "blimp/accuracy/drop_argument": 0.712, "blimp/accuracy/wh_questions_object_gap": 0.845, "blimp/accuracy/animate_subject_passive": 0.798, "blimp/accuracy/existential_there_quantifiers_1": 0.979, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.901, "blimp/accuracy/npi_present_2": 0.581, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.968, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.97, "blimp/accuracy/existential_there_object_raising": 0.844, "blimp/accuracy/matrix_question_npi_licensor_present": 0.348, "blimp/accuracy/npi_present_1": 0.561, "blimp/accuracy/wh_vs_that_no_gap": 0.982, "blimp/accuracy/left_branch_island_echo_question": 0.456, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.955, "blimp/accuracy/causative": 0.716, "blimp/accuracy/group_average": 0.8013432835820896, "blimp/accuracy/seq_average": 0.8013432835820895, "cbt/accuracy/NE": 0.8301282051282052, "cbt/accuracy/V": 0.9416, "cbt/accuracy/CN": 0.8856, "cbt/accuracy/P": 0.924, "cbt/accuracy/group_average": 0.8953320512820513, "cbt/accuracy/seq_average": 0.8953581432573029, "hellaswag/accuracy/val": 0.3662617008564031, "hellaswag/accuracy/group_average": 0.3662617008564031, "hellaswag/accuracy/seq_average": 0.3662617008564031, "piqa/accuracy/val": 0.6332970620239391, "piqa/accuracy/group_average": 0.6332970620239391, "piqa/accuracy/seq_average": 0.6332970620239391, "ai2arc/accuracy/ARC-Easy": 0.386892177589852, "ai2arc/accuracy/ARC-Challenge": 0.22317596566523606, "ai2arc/accuracy/group_average": 0.30503407162754403, "ai2arc/accuracy/seq_average": 0.3328611898016997, "mmlu/accuracy/MMLU": 0.26020736503396497, "mmlu/accuracy/group_average": 0.26020736503396497, "mmlu/accuracy/seq_average": 0.26020736503396497, "openbookqa/accuracy/test": 0.278, "openbookqa/accuracy/group_average": 0.278, "openbookqa/accuracy/seq_average": 0.278, "race/accuracy/test/high": 0.2850200114351058, "race/accuracy/test/middle": 0.36559888579387184, "race/accuracy/group_average": 0.32530944861448885, "race/accuracy/seq_average": 0.3084718281313336, "siqa/accuracy/dev": 0.37308085977482086, "siqa/accuracy/group_average": 0.37308085977482086, "siqa/accuracy/seq_average": 0.37308085977482086, "winogrande/accuracy/dev": 0.4988161010260458, "winogrande/accuracy/group_average": 0.4988161010260458, "winogrande/accuracy/seq_average": 0.4988161010260458, "commonsenseqa/accuracy/dev_rand_split": 0.26863226863226863, "commonsenseqa/accuracy/group_average": 0.26863226863226863, "commonsenseqa/accuracy/seq_average": 0.26863226863226863}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-320000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.2745269290984624, "val/accuracy": 0.525177486359127, "val/perplexity": 9.723318105505957, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.400702624587539, "lambada/accuracy/total": 0.37461180124223603, "lambada/accuracy/openai_last_token": 0.8041537267080745, "lambada/perplexity": 6.273224290997466, "lambada/lm_loss": 2.880101746835602, "lambada/lm_perplexity": 17.81608581775031, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4498946438006815, "mean_loss": 2.337614776843001, "blimp/accuracy/passive_2": 0.922, "blimp/accuracy/determiner_noun_agreement_2": 0.987, "blimp/accuracy/ellipsis_n_bar_1": 0.857, "blimp/accuracy/tough_vs_raising_2": 0.879, "blimp/accuracy/tough_vs_raising_1": 0.607, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.903, "blimp/accuracy/principle_A_reconstruction": 0.362, "blimp/accuracy/wh_vs_that_with_gap": 0.453, "blimp/accuracy/principle_A_domain_2": 0.896, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.904, "blimp/accuracy/principle_A_domain_3": 0.633, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.937, "blimp/accuracy/animate_subject_trans": 0.904, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.911, "blimp/accuracy/distractor_agreement_relative_clause": 0.702, "blimp/accuracy/transitive": 0.887, "blimp/accuracy/sentential_subject_island": 0.371, "blimp/accuracy/adjunct_island": 0.865, "blimp/accuracy/intransitive": 0.764, "blimp/accuracy/existential_there_subject_raising": 0.902, "blimp/accuracy/irregular_past_participle_adjectives": 0.858, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.713, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.346, "blimp/accuracy/only_npi_scope": 0.628, "blimp/accuracy/superlative_quantifiers_2": 0.782, "blimp/accuracy/passive_1": 0.917, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.925, "blimp/accuracy/inchoative": 0.611, "blimp/accuracy/anaphor_gender_agreement": 0.974, "blimp/accuracy/principle_A_c_command": 0.689, "blimp/accuracy/only_npi_licensor_present": 0.488, "blimp/accuracy/expletive_it_object_raising": 0.805, "blimp/accuracy/left_branch_island_simple_question": 0.855, "blimp/accuracy/wh_questions_subject_gap": 0.937, "blimp/accuracy/existential_there_quantifiers_2": 0.52, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.951, "blimp/accuracy/sentential_negation_npi_scope": 0.729, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.857, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.897, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.891, "blimp/accuracy/principle_A_case_2": 0.922, "blimp/accuracy/distractor_agreement_relational_noun": 0.815, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.976, "blimp/accuracy/superlative_quantifiers_1": 0.854, "blimp/accuracy/wh_island": 0.815, "blimp/accuracy/principle_A_domain_1": 0.993, "blimp/accuracy/complex_NP_island": 0.618, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.98, "blimp/accuracy/irregular_past_participle_verbs": 0.932, "blimp/accuracy/drop_argument": 0.742, "blimp/accuracy/wh_questions_object_gap": 0.854, "blimp/accuracy/animate_subject_passive": 0.792, "blimp/accuracy/existential_there_quantifiers_1": 0.981, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.909, "blimp/accuracy/npi_present_2": 0.559, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.962, "blimp/accuracy/anaphor_number_agreement": 0.99, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.961, "blimp/accuracy/existential_there_object_raising": 0.853, "blimp/accuracy/matrix_question_npi_licensor_present": 0.368, "blimp/accuracy/npi_present_1": 0.603, "blimp/accuracy/wh_vs_that_no_gap": 0.979, "blimp/accuracy/left_branch_island_echo_question": 0.542, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.949, "blimp/accuracy/causative": 0.753, "blimp/accuracy/group_average": 0.8016268656716418, "blimp/accuracy/seq_average": 0.8016268656716418, "cbt/accuracy/NE": 0.8309294871794872, "cbt/accuracy/V": 0.9424, "cbt/accuracy/CN": 0.8896, "cbt/accuracy/P": 0.926, "cbt/accuracy/group_average": 0.8972323717948718, "cbt/accuracy/seq_average": 0.8972589035614246, "hellaswag/accuracy/val": 0.37223660625373434, "hellaswag/accuracy/group_average": 0.37223660625373434, "hellaswag/accuracy/seq_average": 0.37223660625373434, "piqa/accuracy/val": 0.6425462459194777, "piqa/accuracy/group_average": 0.6425462459194777, "piqa/accuracy/seq_average": 0.6425462459194777, "ai2arc/accuracy/ARC-Easy": 0.3775898520084567, "ai2arc/accuracy/ARC-Challenge": 0.22746781115879827, "ai2arc/accuracy/group_average": 0.3025288315836275, "ai2arc/accuracy/seq_average": 0.32804532577903683, "mmlu/accuracy/MMLU": 0.26335359313550233, "mmlu/accuracy/group_average": 0.26335359313550233, "mmlu/accuracy/seq_average": 0.26335359313550233, "openbookqa/accuracy/test": 0.28, "openbookqa/accuracy/group_average": 0.28, "openbookqa/accuracy/seq_average": 0.28, "race/accuracy/test/high": 0.28959405374499714, "race/accuracy/test/middle": 0.36002785515320335, "race/accuracy/group_average": 0.32481095444910024, "race/accuracy/seq_average": 0.3100932306445075, "siqa/accuracy/dev": 0.3679631525076766, "siqa/accuracy/group_average": 0.3679631525076766, "siqa/accuracy/seq_average": 0.3679631525076766, "winogrande/accuracy/dev": 0.4972375690607735, "winogrande/accuracy/group_average": 0.4972375690607735, "winogrande/accuracy/seq_average": 0.4972375690607735, "commonsenseqa/accuracy/dev_rand_split": 0.26044226044226043, "commonsenseqa/accuracy/group_average": 0.26044226044226043, "commonsenseqa/accuracy/seq_average": 0.26044226044226043}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-340000.pth.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.2654825846354165,
|
| 3 |
+
"val/accuracy": 0.5260261656746031,
|
| 4 |
+
"val/perplexity": 9.635773555531326,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.3353969029017856,
|
| 8 |
+
"lambada/accuracy/total": 0.390916149068323,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.8045419254658385,
|
| 10 |
+
"lambada/perplexity": 6.41524536323411,
|
| 11 |
+
"lambada/lm_loss": 2.8689352097271317,
|
| 12 |
+
"lambada/lm_perplexity": 17.618248469102483,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.45847115737146305,
|
| 16 |
+
"mean_loss": 2.3004397437686013,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.915,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.987,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.857,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.902,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.593,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.913,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.411,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.475,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.884,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.994,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.919,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.649,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.939,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.911,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.932,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.711,
|
| 33 |
+
"blimp/accuracy/transitive": 0.884,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.337,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.868,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.761,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.904,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.909,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.724,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.375,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.658,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.836,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.903,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.935,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.624,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.977,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.703,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.595,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.797,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.803,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.938,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.543,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.948,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.746,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.83,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.88,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.895,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.897,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.84,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.972,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.871,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.801,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.993,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.625,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.981,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.923,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.71,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.851,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.817,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.98,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.919,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.585,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.974,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.993,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.971,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.844,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.356,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.569,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.985,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.519,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.954,
|
| 83 |
+
"blimp/accuracy/causative": 0.764,
|
| 84 |
+
"blimp/accuracy/group_average": 0.8068507462686568,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.8068507462686567,
|
| 86 |
+
"cbt/accuracy/NE": 0.8297275641025641,
|
| 87 |
+
"cbt/accuracy/V": 0.9436,
|
| 88 |
+
"cbt/accuracy/CN": 0.8908,
|
| 89 |
+
"cbt/accuracy/P": 0.9288,
|
| 90 |
+
"cbt/accuracy/group_average": 0.898231891025641,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8982593037214885,
|
| 92 |
+
"hellaswag/accuracy/val": 0.3701453893646684,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.3701453893646684,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.3701453893646684,
|
| 95 |
+
"piqa/accuracy/val": 0.6485310119695321,
|
| 96 |
+
"piqa/accuracy/group_average": 0.6485310119695321,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.6485310119695321,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.3839323467230444,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.23433476394849787,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.30913355533577114,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.33456090651558074,
|
| 102 |
+
"mmlu/accuracy/MMLU": 0.2644976760815159,
|
| 103 |
+
"mmlu/accuracy/group_average": 0.2644976760815159,
|
| 104 |
+
"mmlu/accuracy/seq_average": 0.2644976760815159,
|
| 105 |
+
"openbookqa/accuracy/test": 0.274,
|
| 106 |
+
"openbookqa/accuracy/group_average": 0.274,
|
| 107 |
+
"openbookqa/accuracy/seq_average": 0.274,
|
| 108 |
+
"race/accuracy/test/high": 0.28959405374499714,
|
| 109 |
+
"race/accuracy/test/middle": 0.36629526462395545,
|
| 110 |
+
"race/accuracy/group_average": 0.3279446591844763,
|
| 111 |
+
"race/accuracy/seq_average": 0.31191730847182814,
|
| 112 |
+
"siqa/accuracy/dev": 0.37615148413510746,
|
| 113 |
+
"siqa/accuracy/group_average": 0.37615148413510746,
|
| 114 |
+
"siqa/accuracy/seq_average": 0.37615148413510746,
|
| 115 |
+
"winogrande/accuracy/dev": 0.5067087608524072,
|
| 116 |
+
"winogrande/accuracy/group_average": 0.5067087608524072,
|
| 117 |
+
"winogrande/accuracy/seq_average": 0.5067087608524072,
|
| 118 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.2678132678132678,
|
| 119 |
+
"commonsenseqa/accuracy/group_average": 0.2678132678132678,
|
| 120 |
+
"commonsenseqa/accuracy/seq_average": 0.2678132678132678
|
| 121 |
+
}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-360000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.2600211491660467, "val/accuracy": 0.527220710875496, "val/perplexity": 9.583291843251617, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.379837510008249, "lambada/accuracy/total": 0.38082298136645965, "lambada/accuracy/openai_last_token": 0.8039596273291926, "lambada/perplexity": 6.558668080738002, "lambada/lm_loss": 2.854366984241393, "lambada/lm_perplexity": 17.363442395418733, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.45402184612097785, "mean_loss": 2.319929329587148, "blimp/accuracy/passive_2": 0.91, "blimp/accuracy/determiner_noun_agreement_2": 0.986, "blimp/accuracy/ellipsis_n_bar_1": 0.853, "blimp/accuracy/tough_vs_raising_2": 0.893, "blimp/accuracy/tough_vs_raising_1": 0.584, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.904, "blimp/accuracy/principle_A_reconstruction": 0.409, "blimp/accuracy/wh_vs_that_with_gap": 0.456, "blimp/accuracy/principle_A_domain_2": 0.908, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.9, "blimp/accuracy/principle_A_domain_3": 0.63, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.938, "blimp/accuracy/animate_subject_trans": 0.902, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.916, "blimp/accuracy/distractor_agreement_relative_clause": 0.706, "blimp/accuracy/transitive": 0.876, "blimp/accuracy/sentential_subject_island": 0.34, "blimp/accuracy/adjunct_island": 0.875, "blimp/accuracy/intransitive": 0.755, "blimp/accuracy/existential_there_subject_raising": 0.903, "blimp/accuracy/irregular_past_participle_adjectives": 0.891, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.732, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.37, "blimp/accuracy/only_npi_scope": 0.648, "blimp/accuracy/superlative_quantifiers_2": 0.78, "blimp/accuracy/passive_1": 0.9, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.916, "blimp/accuracy/inchoative": 0.613, "blimp/accuracy/anaphor_gender_agreement": 0.973, "blimp/accuracy/principle_A_c_command": 0.718, "blimp/accuracy/only_npi_licensor_present": 0.543, "blimp/accuracy/expletive_it_object_raising": 0.793, "blimp/accuracy/left_branch_island_simple_question": 0.821, "blimp/accuracy/wh_questions_subject_gap": 0.942, "blimp/accuracy/existential_there_quantifiers_2": 0.546, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.955, "blimp/accuracy/sentential_negation_npi_scope": 0.729, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.815, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.902, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.883, "blimp/accuracy/principle_A_case_2": 0.918, "blimp/accuracy/distractor_agreement_relational_noun": 0.814, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.975, "blimp/accuracy/superlative_quantifiers_1": 0.866, "blimp/accuracy/wh_island": 0.783, "blimp/accuracy/principle_A_domain_1": 0.996, "blimp/accuracy/complex_NP_island": 0.608, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.978, "blimp/accuracy/irregular_past_participle_verbs": 0.913, "blimp/accuracy/drop_argument": 0.716, "blimp/accuracy/wh_questions_object_gap": 0.857, "blimp/accuracy/animate_subject_passive": 0.796, "blimp/accuracy/existential_there_quantifiers_1": 0.982, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.892, "blimp/accuracy/npi_present_2": 0.558, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.962, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.967, "blimp/accuracy/existential_there_object_raising": 0.838, "blimp/accuracy/matrix_question_npi_licensor_present": 0.394, "blimp/accuracy/npi_present_1": 0.589, "blimp/accuracy/wh_vs_that_no_gap": 0.978, "blimp/accuracy/left_branch_island_echo_question": 0.54, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.966, "blimp/accuracy/causative": 0.754, "blimp/accuracy/group_average": 0.8020746268656715, "blimp/accuracy/seq_average": 0.8020746268656717, "cbt/accuracy/NE": 0.827323717948718, "cbt/accuracy/V": 0.9428, "cbt/accuracy/CN": 0.8928, "cbt/accuracy/P": 0.928, "cbt/accuracy/group_average": 0.8977309294871795, "cbt/accuracy/seq_average": 0.8977591036414566, "hellaswag/accuracy/val": 0.37502489543915557, "hellaswag/accuracy/group_average": 0.37502489543915557, "hellaswag/accuracy/seq_average": 0.37502489543915557, "piqa/accuracy/val": 0.6490750816104461, "piqa/accuracy/group_average": 0.6490750816104461, "piqa/accuracy/seq_average": 0.6490750816104461, "ai2arc/accuracy/ARC-Easy": 0.37505285412262157, "ai2arc/accuracy/ARC-Challenge": 0.2317596566523605, "ai2arc/accuracy/group_average": 0.30340625538749105, "ai2arc/accuracy/seq_average": 0.3277620396600567, "mmlu/accuracy/MMLU": 0.26170897390060777, "mmlu/accuracy/group_average": 0.26170897390060777, "mmlu/accuracy/seq_average": 0.26170897390060777, "openbookqa/accuracy/test": 0.274, "openbookqa/accuracy/group_average": 0.274, "openbookqa/accuracy/seq_average": 0.274, "race/accuracy/test/high": 0.28987993138936535, "race/accuracy/test/middle": 0.3649025069637883, "race/accuracy/group_average": 0.3273912191765768, "race/accuracy/seq_average": 0.3117146331576814, "siqa/accuracy/dev": 0.37563971340839303, "siqa/accuracy/group_average": 0.37563971340839303, "siqa/accuracy/seq_average": 0.37563971340839303, "winogrande/accuracy/dev": 0.4988161010260458, "winogrande/accuracy/group_average": 0.4988161010260458, "winogrande/accuracy/seq_average": 0.4988161010260458, "commonsenseqa/accuracy/dev_rand_split": 0.26617526617526616, "commonsenseqa/accuracy/group_average": 0.26617526617526616, "commonsenseqa/accuracy/seq_average": 0.26617526617526616}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-380000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.255331614660838, "val/accuracy": 0.5282960921999008, "val/perplexity": 9.538455877551673, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3658928723068713, "lambada/accuracy/total": 0.38528726708074534, "lambada/accuracy/openai_last_token": 0.8053183229813664, "lambada/perplexity": 6.557504748004316, "lambada/lm_loss": 2.8554835069971904, "lambada/lm_perplexity": 17.382839900835183, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4567916796403231, "mean_loss": 2.3106122434838547, "blimp/accuracy/passive_2": 0.919, "blimp/accuracy/determiner_noun_agreement_2": 0.986, "blimp/accuracy/ellipsis_n_bar_1": 0.857, "blimp/accuracy/tough_vs_raising_2": 0.888, "blimp/accuracy/tough_vs_raising_1": 0.578, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.917, "blimp/accuracy/principle_A_reconstruction": 0.385, "blimp/accuracy/wh_vs_that_with_gap": 0.434, "blimp/accuracy/principle_A_domain_2": 0.887, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.913, "blimp/accuracy/principle_A_domain_3": 0.63, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.933, "blimp/accuracy/animate_subject_trans": 0.893, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.919, "blimp/accuracy/distractor_agreement_relative_clause": 0.71, "blimp/accuracy/transitive": 0.896, "blimp/accuracy/sentential_subject_island": 0.329, "blimp/accuracy/adjunct_island": 0.861, "blimp/accuracy/intransitive": 0.771, "blimp/accuracy/existential_there_subject_raising": 0.903, "blimp/accuracy/irregular_past_participle_adjectives": 0.864, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.737, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.348, "blimp/accuracy/only_npi_scope": 0.661, "blimp/accuracy/superlative_quantifiers_2": 0.784, "blimp/accuracy/passive_1": 0.899, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.915, "blimp/accuracy/inchoative": 0.612, "blimp/accuracy/anaphor_gender_agreement": 0.976, "blimp/accuracy/principle_A_c_command": 0.714, "blimp/accuracy/only_npi_licensor_present": 0.561, "blimp/accuracy/expletive_it_object_raising": 0.797, "blimp/accuracy/left_branch_island_simple_question": 0.835, "blimp/accuracy/wh_questions_subject_gap": 0.938, "blimp/accuracy/existential_there_quantifiers_2": 0.572, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.95, "blimp/accuracy/sentential_negation_npi_scope": 0.703, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.819, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.901, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.886, "blimp/accuracy/principle_A_case_2": 0.896, "blimp/accuracy/distractor_agreement_relational_noun": 0.843, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.97, "blimp/accuracy/superlative_quantifiers_1": 0.874, "blimp/accuracy/wh_island": 0.775, "blimp/accuracy/principle_A_domain_1": 0.995, "blimp/accuracy/complex_NP_island": 0.609, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.979, "blimp/accuracy/irregular_past_participle_verbs": 0.943, "blimp/accuracy/drop_argument": 0.707, "blimp/accuracy/wh_questions_object_gap": 0.86, "blimp/accuracy/animate_subject_passive": 0.806, "blimp/accuracy/existential_there_quantifiers_1": 0.979, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.91, "blimp/accuracy/npi_present_2": 0.55, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.964, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.97, "blimp/accuracy/existential_there_object_raising": 0.849, "blimp/accuracy/matrix_question_npi_licensor_present": 0.384, "blimp/accuracy/npi_present_1": 0.55, "blimp/accuracy/wh_vs_that_no_gap": 0.982, "blimp/accuracy/left_branch_island_echo_question": 0.509, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.966, "blimp/accuracy/causative": 0.745, "blimp/accuracy/group_average": 0.8011641791044771, "blimp/accuracy/seq_average": 0.8011641791044776, "cbt/accuracy/NE": 0.827323717948718, "cbt/accuracy/V": 0.948, "cbt/accuracy/CN": 0.8912, "cbt/accuracy/P": 0.9284, "cbt/accuracy/group_average": 0.8987309294871795, "cbt/accuracy/seq_average": 0.8987595038015206, "hellaswag/accuracy/val": 0.373132842063334, "hellaswag/accuracy/group_average": 0.373132842063334, "hellaswag/accuracy/seq_average": 0.373132842063334, "piqa/accuracy/val": 0.6430903155603918, "piqa/accuracy/group_average": 0.6430903155603918, "piqa/accuracy/seq_average": 0.6430903155603918, "ai2arc/accuracy/ARC-Easy": 0.38012684989429174, "ai2arc/accuracy/ARC-Challenge": 0.23605150214592274, "ai2arc/accuracy/group_average": 0.3080891760201072, "ai2arc/accuracy/seq_average": 0.33257790368271956, "mmlu/accuracy/MMLU": 0.261136932427601, "mmlu/accuracy/group_average": 0.261136932427601, "mmlu/accuracy/seq_average": 0.261136932427601, "openbookqa/accuracy/test": 0.272, "openbookqa/accuracy/group_average": 0.272, "openbookqa/accuracy/seq_average": 0.272, "race/accuracy/test/high": 0.2887364208118925, "race/accuracy/test/middle": 0.37047353760445684, "race/accuracy/group_average": 0.32960497920817466, "race/accuracy/seq_average": 0.31252533441426833, "siqa/accuracy/dev": 0.37615148413510746, "siqa/accuracy/group_average": 0.37615148413510746, "siqa/accuracy/seq_average": 0.37615148413510746, "winogrande/accuracy/dev": 0.5019731649565904, "winogrande/accuracy/group_average": 0.5019731649565904, "winogrande/accuracy/seq_average": 0.5019731649565904, "commonsenseqa/accuracy/dev_rand_split": 0.266994266994267, "commonsenseqa/accuracy/group_average": 0.266994266994267, "commonsenseqa/accuracy/seq_average": 0.266994266994267}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-40000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.590641082279266, "val/accuracy": 0.4779546828497024, "val/perplexity": 13.338319823320697, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.664239350312985, "lambada/accuracy/total": 0.2909549689440994, "lambada/accuracy/openai_last_token": 0.7649456521739131, "lambada/perplexity": 11.671392716337088, "lambada/lm_loss": 3.158754187942497, "lambada/lm_perplexity": 23.541249680208068, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3844548258969009, "mean_loss": 2.6274402162961255, "blimp/accuracy/passive_2": 0.899, "blimp/accuracy/determiner_noun_agreement_2": 0.981, "blimp/accuracy/ellipsis_n_bar_1": 0.815, "blimp/accuracy/tough_vs_raising_2": 0.819, "blimp/accuracy/tough_vs_raising_1": 0.592, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.89, "blimp/accuracy/principle_A_reconstruction": 0.275, "blimp/accuracy/wh_vs_that_with_gap": 0.391, "blimp/accuracy/principle_A_domain_2": 0.848, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.912, "blimp/accuracy/principle_A_domain_3": 0.598, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.929, "blimp/accuracy/animate_subject_trans": 0.896, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.905, "blimp/accuracy/distractor_agreement_relative_clause": 0.608, "blimp/accuracy/transitive": 0.865, "blimp/accuracy/sentential_subject_island": 0.36, "blimp/accuracy/adjunct_island": 0.828, "blimp/accuracy/intransitive": 0.714, "blimp/accuracy/existential_there_subject_raising": 0.873, "blimp/accuracy/irregular_past_participle_adjectives": 0.892, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.432, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.235, "blimp/accuracy/only_npi_scope": 0.628, "blimp/accuracy/superlative_quantifiers_2": 0.589, "blimp/accuracy/passive_1": 0.896, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.893, "blimp/accuracy/inchoative": 0.566, "blimp/accuracy/anaphor_gender_agreement": 0.973, "blimp/accuracy/principle_A_c_command": 0.618, "blimp/accuracy/only_npi_licensor_present": 0.457, "blimp/accuracy/expletive_it_object_raising": 0.748, "blimp/accuracy/left_branch_island_simple_question": 0.533, "blimp/accuracy/wh_questions_subject_gap": 0.938, "blimp/accuracy/existential_there_quantifiers_2": 0.553, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.951, "blimp/accuracy/sentential_negation_npi_scope": 0.621, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.818, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.898, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.874, "blimp/accuracy/principle_A_case_2": 0.926, "blimp/accuracy/distractor_agreement_relational_noun": 0.815, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.97, "blimp/accuracy/superlative_quantifiers_1": 0.546, "blimp/accuracy/wh_island": 0.769, "blimp/accuracy/principle_A_domain_1": 0.979, "blimp/accuracy/complex_NP_island": 0.598, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.965, "blimp/accuracy/irregular_past_participle_verbs": 0.88, "blimp/accuracy/drop_argument": 0.746, "blimp/accuracy/wh_questions_object_gap": 0.826, "blimp/accuracy/animate_subject_passive": 0.801, "blimp/accuracy/existential_there_quantifiers_1": 0.98, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.87, "blimp/accuracy/npi_present_2": 0.552, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.922, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.964, "blimp/accuracy/existential_there_object_raising": 0.837, "blimp/accuracy/matrix_question_npi_licensor_present": 0.224, "blimp/accuracy/npi_present_1": 0.485, "blimp/accuracy/wh_vs_that_no_gap": 0.979, "blimp/accuracy/left_branch_island_echo_question": 0.434, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.98, "blimp/accuracy/causative": 0.709, "blimp/accuracy/group_average": 0.7588656716417909, "blimp/accuracy/seq_average": 0.7588656716417911, "cbt/accuracy/NE": 0.7712339743589743, "cbt/accuracy/V": 0.9124, "cbt/accuracy/CN": 0.8392, "cbt/accuracy/P": 0.8916, "cbt/accuracy/group_average": 0.8536084935897436, "cbt/accuracy/seq_average": 0.853641456582633, "hellaswag/accuracy/val": 0.3021310495917148, "hellaswag/accuracy/group_average": 0.3021310495917148, "hellaswag/accuracy/seq_average": 0.3021310495917148, "piqa/accuracy/val": 0.5903155603917302, "piqa/accuracy/group_average": 0.5903155603917302, "piqa/accuracy/seq_average": 0.5903155603917302, "ai2arc/accuracy/ARC-Easy": 0.33657505285412265, "ai2arc/accuracy/ARC-Challenge": 0.2094420600858369, "ai2arc/accuracy/group_average": 0.2730085564699798, "ai2arc/accuracy/seq_average": 0.29461756373937675, "mmlu/accuracy/MMLU": 0.264926707186271, "mmlu/accuracy/group_average": 0.264926707186271, "mmlu/accuracy/seq_average": 0.264926707186271, "openbookqa/accuracy/test": 0.272, "openbookqa/accuracy/group_average": 0.272, "openbookqa/accuracy/seq_average": 0.272, "race/accuracy/test/high": 0.2701543739279588, "race/accuracy/test/middle": 0.34401114206128136, "race/accuracy/group_average": 0.3070827579946201, "race/accuracy/seq_average": 0.29164977705715445, "siqa/accuracy/dev": 0.36131013306038895, "siqa/accuracy/group_average": 0.36131013306038895, "siqa/accuracy/seq_average": 0.36131013306038895, "winogrande/accuracy/dev": 0.5074980268350434, "winogrande/accuracy/group_average": 0.5074980268350434, "winogrande/accuracy/seq_average": 0.5074980268350434, "commonsenseqa/accuracy/dev_rand_split": 0.25634725634725636, "commonsenseqa/accuracy/group_average": 0.25634725634725636, "commonsenseqa/accuracy/seq_average": 0.25634725634725636}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-400000.pth.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.252078828357515,
|
| 3 |
+
"val/accuracy": 0.5291428338913691,
|
| 4 |
+
"val/perplexity": 9.507479725631901,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.3892655461471275,
|
| 8 |
+
"lambada/accuracy/total": 0.37131211180124224,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.8029891304347826,
|
| 10 |
+
"lambada/perplexity": 6.572569725278648,
|
| 11 |
+
"lambada/lm_loss": 2.8536624460727102,
|
| 12 |
+
"lambada/lm_perplexity": 17.351213495880522,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.45022747284630565,
|
| 16 |
+
"mean_loss": 2.3206721872523213,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.919,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.989,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.858,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.875,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.616,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.915,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.359,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.473,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.899,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.994,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.907,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.642,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.935,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.902,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.937,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.7,
|
| 33 |
+
"blimp/accuracy/transitive": 0.894,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.348,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.866,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.764,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.903,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.868,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.76,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.365,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.628,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.776,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.901,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.905,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.614,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.973,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.692,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.643,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.788,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.844,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.937,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.527,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.954,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.739,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.841,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.905,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.883,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.923,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.83,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.971,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.851,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.79,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.994,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.619,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.98,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.909,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.719,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.859,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.803,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.988,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.893,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.568,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.974,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.99,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.971,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.861,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.378,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.572,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.987,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.535,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.963,
|
| 83 |
+
"blimp/accuracy/causative": 0.748,
|
| 84 |
+
"blimp/accuracy/group_average": 0.804686567164179,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.8046865671641791,
|
| 86 |
+
"cbt/accuracy/NE": 0.828926282051282,
|
| 87 |
+
"cbt/accuracy/V": 0.9452,
|
| 88 |
+
"cbt/accuracy/CN": 0.886,
|
| 89 |
+
"cbt/accuracy/P": 0.9328,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8982315705128205,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8982593037214885,
|
| 92 |
+
"hellaswag/accuracy/val": 0.37492531368253335,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.37492531368253335,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.37492531368253335,
|
| 95 |
+
"piqa/accuracy/val": 0.6436343852013058,
|
| 96 |
+
"piqa/accuracy/group_average": 0.6436343852013058,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.6436343852013058,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.38224101479915434,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.23090128755364808,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.3065711511764012,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.3322946175637394,
|
| 102 |
+
"mmlu/accuracy/MMLU": 0.2653557382910261,
|
| 103 |
+
"mmlu/accuracy/group_average": 0.2653557382910261,
|
| 104 |
+
"mmlu/accuracy/seq_average": 0.2653557382910261,
|
| 105 |
+
"openbookqa/accuracy/test": 0.278,
|
| 106 |
+
"openbookqa/accuracy/group_average": 0.278,
|
| 107 |
+
"openbookqa/accuracy/seq_average": 0.278,
|
| 108 |
+
"race/accuracy/test/high": 0.2915951972555746,
|
| 109 |
+
"race/accuracy/test/middle": 0.36908077994428967,
|
| 110 |
+
"race/accuracy/group_average": 0.33033798859993213,
|
| 111 |
+
"race/accuracy/seq_average": 0.3141467369274422,
|
| 112 |
+
"siqa/accuracy/dev": 0.3741044012282497,
|
| 113 |
+
"siqa/accuracy/group_average": 0.3741044012282497,
|
| 114 |
+
"siqa/accuracy/seq_average": 0.3741044012282497,
|
| 115 |
+
"winogrande/accuracy/dev": 0.5122336227308603,
|
| 116 |
+
"winogrande/accuracy/group_average": 0.5122336227308603,
|
| 117 |
+
"winogrande/accuracy/seq_average": 0.5122336227308603,
|
| 118 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.26535626535626533,
|
| 119 |
+
"commonsenseqa/accuracy/group_average": 0.26535626535626533,
|
| 120 |
+
"commonsenseqa/accuracy/seq_average": 0.26535626535626533
|
| 121 |
+
}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-60000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.5216820126488093, "val/accuracy": 0.4887627495659722, "val/perplexity": 12.449519311519067, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5009371360636647, "lambada/accuracy/total": 0.28396739130434784, "lambada/accuracy/openai_last_token": 0.7699922360248447, "lambada/perplexity": 10.952960582848524, "lambada/lm_loss": 3.077561533158866, "lambada/lm_perplexity": 21.7054098899309, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.38636507043516, "mean_loss": 2.511309574356237, "blimp/accuracy/passive_2": 0.887, "blimp/accuracy/determiner_noun_agreement_2": 0.985, "blimp/accuracy/ellipsis_n_bar_1": 0.817, "blimp/accuracy/tough_vs_raising_2": 0.831, "blimp/accuracy/tough_vs_raising_1": 0.622, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.877, "blimp/accuracy/principle_A_reconstruction": 0.371, "blimp/accuracy/wh_vs_that_with_gap": 0.549, "blimp/accuracy/principle_A_domain_2": 0.868, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.898, "blimp/accuracy/principle_A_domain_3": 0.59, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.932, "blimp/accuracy/animate_subject_trans": 0.887, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.905, "blimp/accuracy/distractor_agreement_relative_clause": 0.682, "blimp/accuracy/transitive": 0.874, "blimp/accuracy/sentential_subject_island": 0.364, "blimp/accuracy/adjunct_island": 0.833, "blimp/accuracy/intransitive": 0.735, "blimp/accuracy/existential_there_subject_raising": 0.889, "blimp/accuracy/irregular_past_participle_adjectives": 0.877, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.451, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.34, "blimp/accuracy/only_npi_scope": 0.514, "blimp/accuracy/superlative_quantifiers_2": 0.551, "blimp/accuracy/passive_1": 0.887, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.906, "blimp/accuracy/inchoative": 0.583, "blimp/accuracy/anaphor_gender_agreement": 0.969, "blimp/accuracy/principle_A_c_command": 0.637, "blimp/accuracy/only_npi_licensor_present": 0.531, "blimp/accuracy/expletive_it_object_raising": 0.737, "blimp/accuracy/left_branch_island_simple_question": 0.552, "blimp/accuracy/wh_questions_subject_gap": 0.923, "blimp/accuracy/existential_there_quantifiers_2": 0.464, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.939, "blimp/accuracy/sentential_negation_npi_scope": 0.66, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.797, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.833, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.896, "blimp/accuracy/principle_A_case_2": 0.926, "blimp/accuracy/distractor_agreement_relational_noun": 0.83, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.973, "blimp/accuracy/superlative_quantifiers_1": 0.473, "blimp/accuracy/wh_island": 0.801, "blimp/accuracy/principle_A_domain_1": 0.996, "blimp/accuracy/complex_NP_island": 0.592, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.979, "blimp/accuracy/irregular_past_participle_verbs": 0.863, "blimp/accuracy/drop_argument": 0.728, "blimp/accuracy/wh_questions_object_gap": 0.783, "blimp/accuracy/animate_subject_passive": 0.787, "blimp/accuracy/existential_there_quantifiers_1": 0.968, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.874, "blimp/accuracy/npi_present_2": 0.633, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.944, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.955, "blimp/accuracy/existential_there_object_raising": 0.836, "blimp/accuracy/matrix_question_npi_licensor_present": 0.261, "blimp/accuracy/npi_present_1": 0.603, "blimp/accuracy/wh_vs_that_no_gap": 0.964, "blimp/accuracy/left_branch_island_echo_question": 0.384, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.96, "blimp/accuracy/causative": 0.74, "blimp/accuracy/group_average": 0.7652985074626866, "blimp/accuracy/seq_average": 0.7652985074626866, "cbt/accuracy/NE": 0.780448717948718, "cbt/accuracy/V": 0.928, "cbt/accuracy/CN": 0.8456, "cbt/accuracy/P": 0.9008, "cbt/accuracy/group_average": 0.8637121794871796, "cbt/accuracy/seq_average": 0.8637454981992797, "hellaswag/accuracy/val": 0.31398127862975506, "hellaswag/accuracy/group_average": 0.31398127862975506, "hellaswag/accuracy/seq_average": 0.31398127862975506, "piqa/accuracy/val": 0.5990206746463548, "piqa/accuracy/group_average": 0.5990206746463548, "piqa/accuracy/seq_average": 0.5990206746463548, "ai2arc/accuracy/ARC-Easy": 0.35306553911205074, "ai2arc/accuracy/ARC-Challenge": 0.22145922746781116, "ai2arc/accuracy/group_average": 0.2872623832899309, "ai2arc/accuracy/seq_average": 0.3096317280453258, "mmlu/accuracy/MMLU": 0.2627100464783697, "mmlu/accuracy/group_average": 0.2627100464783697, "mmlu/accuracy/seq_average": 0.2627100464783697, "openbookqa/accuracy/test": 0.278, "openbookqa/accuracy/group_average": 0.278, "openbookqa/accuracy/seq_average": 0.278, "race/accuracy/test/high": 0.274442538593482, "race/accuracy/test/middle": 0.34540389972144847, "race/accuracy/group_average": 0.30992321915746524, "race/accuracy/seq_average": 0.29509525739764897, "siqa/accuracy/dev": 0.3602865916069601, "siqa/accuracy/group_average": 0.3602865916069601, "siqa/accuracy/seq_average": 0.3602865916069601, "winogrande/accuracy/dev": 0.5169692186266772, "winogrande/accuracy/group_average": 0.5169692186266772, "winogrande/accuracy/seq_average": 0.5169692186266772, "commonsenseqa/accuracy/dev_rand_split": 0.2620802620802621, "commonsenseqa/accuracy/group_average": 0.2620802620802621, "commonsenseqa/accuracy/seq_average": 0.2620802620802621}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb/export/result-model-80000.pth.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.474331325954861,
|
| 3 |
+
"val/accuracy": 0.4955831860739087,
|
| 4 |
+
"val/perplexity": 11.87376477965298,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.4638778023097827,
|
| 8 |
+
"lambada/accuracy/total": 0.29580745341614906,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.7754270186335404,
|
| 10 |
+
"lambada/perplexity": 9.1022369700753,
|
| 11 |
+
"lambada/lm_loss": 3.0485617992595277,
|
| 12 |
+
"lambada/lm_perplexity": 21.084998145806363,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.3956953197450289,
|
| 16 |
+
"mean_loss": 2.469104564132322,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.91,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.987,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.819,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.857,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.667,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.908,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.549,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.483,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.889,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.994,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.911,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.63,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.921,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.894,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.899,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.69,
|
| 33 |
+
"blimp/accuracy/transitive": 0.875,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.348,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.876,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.746,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.875,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.836,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.634,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.366,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.692,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.788,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.894,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.927,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.603,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.975,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.674,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.692,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.76,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.719,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.931,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.463,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.956,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.726,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.851,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.841,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.919,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.933,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.863,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.956,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.54,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.717,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.996,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.624,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.979,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.89,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.748,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.816,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.785,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.988,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.888,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.588,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.954,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.988,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.97,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.809,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.273,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.576,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.97,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.366,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.945,
|
| 83 |
+
"blimp/accuracy/causative": 0.732,
|
| 84 |
+
"blimp/accuracy/group_average": 0.789089552238806,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.789089552238806,
|
| 86 |
+
"cbt/accuracy/NE": 0.7904647435897436,
|
| 87 |
+
"cbt/accuracy/V": 0.9308,
|
| 88 |
+
"cbt/accuracy/CN": 0.8592,
|
| 89 |
+
"cbt/accuracy/P": 0.9028,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8708161858974359,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8708483393357342,
|
| 92 |
+
"hellaswag/accuracy/val": 0.31935869348735313,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.31935869348735313,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.31935869348735313,
|
| 95 |
+
"piqa/accuracy/val": 0.5957562568008705,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5957562568008705,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5957562568008705,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.3420718816067653,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.21974248927038625,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.2809071854385758,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.301699716713881,
|
| 102 |
+
"mmlu/accuracy/MMLU": 0.26163746871648197,
|
| 103 |
+
"mmlu/accuracy/group_average": 0.26163746871648197,
|
| 104 |
+
"mmlu/accuracy/seq_average": 0.26163746871648197,
|
| 105 |
+
"openbookqa/accuracy/test": 0.284,
|
| 106 |
+
"openbookqa/accuracy/group_average": 0.284,
|
| 107 |
+
"openbookqa/accuracy/seq_average": 0.284,
|
| 108 |
+
"race/accuracy/test/high": 0.27787307032590053,
|
| 109 |
+
"race/accuracy/test/middle": 0.3649025069637883,
|
| 110 |
+
"race/accuracy/group_average": 0.3213877886448444,
|
| 111 |
+
"race/accuracy/seq_average": 0.30320226996351846,
|
| 112 |
+
"siqa/accuracy/dev": 0.36284544524053225,
|
| 113 |
+
"siqa/accuracy/group_average": 0.36284544524053225,
|
| 114 |
+
"siqa/accuracy/seq_average": 0.36284544524053225,
|
| 115 |
+
"winogrande/accuracy/dev": 0.5153906866614049,
|
| 116 |
+
"winogrande/accuracy/group_average": 0.5153906866614049,
|
| 117 |
+
"winogrande/accuracy/seq_average": 0.5153906866614049,
|
| 118 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.2547092547092547,
|
| 119 |
+
"commonsenseqa/accuracy/group_average": 0.2547092547092547,
|
| 120 |
+
"commonsenseqa/accuracy/seq_average": 0.2547092547092547
|
| 121 |
+
}
|