73bebc12bb080ae84c7647473cae93e6ced99d3dbd319ef725f99f5e449c99fd
Browse files- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-10000.pth.json +112 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-100000.pth.json +112 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-20000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-30000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-40000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-50000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-60000.pth.json +112 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-70000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-80000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-90000.pth.json +1 -0
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-10000.pth.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 3.01871575249566,
|
| 3 |
+
"val/accuracy": 0.4236605205233135,
|
| 4 |
+
"val/perplexity": 20.464992684813527,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.7049215565557065,
|
| 8 |
+
"lambada/accuracy/total": 0.1653726708074534,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.720108695652174,
|
| 10 |
+
"lambada/perplexity": 23.920566937013117,
|
| 11 |
+
"lambada/lm_loss": 3.554632107058449,
|
| 12 |
+
"lambada/lm_perplexity": 34.974950563113914,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.29451659566538346,
|
| 16 |
+
"mean_loss": 2.861818654525683,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.882,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.964,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.706,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.836,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.504,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.896,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.461,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.39,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.732,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.969,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.872,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.537,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.844,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.876,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.743,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.406,
|
| 33 |
+
"blimp/accuracy/transitive": 0.817,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.407,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.709,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.729,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.813,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.968,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.206,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.145,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.64,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.636,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.89,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.886,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.584,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.938,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.549,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.555,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.767,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.205,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.862,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.353,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.886,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.449,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.763,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.816,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.817,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.945,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.717,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.985,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.678,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.763,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.969,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.527,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.928,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.851,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.73,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.707,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.741,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.987,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.855,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.602,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.812,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.978,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.911,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.765,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.165,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.633,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.94,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.301,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.961,
|
| 83 |
+
"blimp/accuracy/causative": 0.654,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7181044776119401,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.7181044776119403,
|
| 86 |
+
"cbt/accuracy/NE": 0.6838942307692307,
|
| 87 |
+
"cbt/accuracy/V": 0.8648,
|
| 88 |
+
"cbt/accuracy/CN": 0.7356,
|
| 89 |
+
"cbt/accuracy/P": 0.8372,
|
| 90 |
+
"cbt/accuracy/group_average": 0.7803735576923077,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.7804121648659463,
|
| 92 |
+
"hellaswag/accuracy/val": 0.2727544313881697,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.2727544313881697,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.2727544313881697,
|
| 95 |
+
"piqa/accuracy/val": 0.5571273122959739,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5571273122959739,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5571273122959739,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.3090909090909091,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.20429184549356222,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.2566913772922357,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.2745042492917847,
|
| 102 |
+
"race/accuracy/test/high": 0.25128644939965694,
|
| 103 |
+
"race/accuracy/test/middle": 0.32103064066852366,
|
| 104 |
+
"race/accuracy/group_average": 0.28615854503409033,
|
| 105 |
+
"race/accuracy/seq_average": 0.2715849209566275,
|
| 106 |
+
"siqa/accuracy/dev": 0.3587512794268168,
|
| 107 |
+
"siqa/accuracy/group_average": 0.3587512794268168,
|
| 108 |
+
"siqa/accuracy/seq_average": 0.3587512794268168,
|
| 109 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.23177723177723178,
|
| 110 |
+
"commonsenseqa/accuracy/group_average": 0.23177723177723178,
|
| 111 |
+
"commonsenseqa/accuracy/seq_average": 0.23177723177723178
|
| 112 |
+
}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-100000.pth.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.603516593812004,
|
| 3 |
+
"val/accuracy": 0.48022557818700395,
|
| 4 |
+
"val/perplexity": 13.51116787973311,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.669328582953222,
|
| 8 |
+
"lambada/accuracy/total": 0.25601708074534163,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.7635869565217391,
|
| 10 |
+
"lambada/perplexity": 11.557441177211011,
|
| 11 |
+
"lambada/lm_loss": 3.1691284005766103,
|
| 12 |
+
"lambada/lm_perplexity": 23.78674280726548,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.3681213294661728,
|
| 16 |
+
"mean_loss": 2.636422588382613,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.904,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.972,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.794,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.858,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.567,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.905,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.353,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.512,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.849,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.986,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.903,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.603,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.907,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.893,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.884,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.544,
|
| 33 |
+
"blimp/accuracy/transitive": 0.876,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.285,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.78,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.759,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.88,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.978,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.49,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.202,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.703,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.812,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.888,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.885,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.626,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.95,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.603,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.581,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.801,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.556,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.909,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.388,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.947,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.664,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.828,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.844,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.894,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.963,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.743,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.985,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.68,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.723,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.977,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.529,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.972,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.912,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.732,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.784,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.794,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.977,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.877,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.589,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.93,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.986,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.957,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.839,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.263,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.622,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.975,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.464,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.976,
|
| 83 |
+
"blimp/accuracy/causative": 0.717,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7690895522388058,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.7690895522388059,
|
| 86 |
+
"cbt/accuracy/NE": 0.7636217948717948,
|
| 87 |
+
"cbt/accuracy/V": 0.9096,
|
| 88 |
+
"cbt/accuracy/CN": 0.8248,
|
| 89 |
+
"cbt/accuracy/P": 0.8892,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8468054487179486,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8468387354941976,
|
| 92 |
+
"hellaswag/accuracy/val": 0.29267078271260705,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.29267078271260705,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.29267078271260705,
|
| 95 |
+
"piqa/accuracy/val": 0.5903155603917302,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5903155603917302,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5903155603917302,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.3302325581395349,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.20257510729613734,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.26640383271783613,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.28810198300283285,
|
| 102 |
+
"race/accuracy/test/high": 0.27072612921669525,
|
| 103 |
+
"race/accuracy/test/middle": 0.34192200557103064,
|
| 104 |
+
"race/accuracy/group_average": 0.306324067393863,
|
| 105 |
+
"race/accuracy/seq_average": 0.2914471017430077,
|
| 106 |
+
"siqa/accuracy/dev": 0.3602865916069601,
|
| 107 |
+
"siqa/accuracy/group_average": 0.3602865916069601,
|
| 108 |
+
"siqa/accuracy/seq_average": 0.3602865916069601,
|
| 109 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.26371826371826373,
|
| 110 |
+
"commonsenseqa/accuracy/group_average": 0.26371826371826373,
|
| 111 |
+
"commonsenseqa/accuracy/seq_average": 0.26371826371826373
|
| 112 |
+
}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-20000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.877354939778646, "val/accuracy": 0.4417182074652778, "val/perplexity": 17.76721561673222, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.687665856402853, "lambada/accuracy/total": 0.18303571428571427, "lambada/accuracy/openai_last_token": 0.7333074534161491, "lambada/perplexity": 18.255754615329064, "lambada/lm_loss": 3.405477641680216, "lambada/lm_perplexity": 30.128683002434276, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.31237696087549605, "mean_loss": 2.7825103980907495, "blimp/accuracy/passive_2": 0.87, "blimp/accuracy/determiner_noun_agreement_2": 0.947, "blimp/accuracy/ellipsis_n_bar_1": 0.765, "blimp/accuracy/tough_vs_raising_2": 0.857, "blimp/accuracy/tough_vs_raising_1": 0.497, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.87, "blimp/accuracy/principle_A_reconstruction": 0.47, "blimp/accuracy/wh_vs_that_with_gap": 0.428, "blimp/accuracy/principle_A_domain_2": 0.816, "blimp/accuracy/determiner_noun_agreement_1": 0.976, "blimp/accuracy/ellipsis_n_bar_2": 0.867, "blimp/accuracy/principle_A_domain_3": 0.577, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.866, "blimp/accuracy/animate_subject_trans": 0.877, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.83, "blimp/accuracy/distractor_agreement_relative_clause": 0.475, "blimp/accuracy/transitive": 0.845, "blimp/accuracy/sentential_subject_island": 0.458, "blimp/accuracy/adjunct_island": 0.686, "blimp/accuracy/intransitive": 0.694, "blimp/accuracy/existential_there_subject_raising": 0.851, "blimp/accuracy/irregular_past_participle_adjectives": 0.903, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.404, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.191, "blimp/accuracy/only_npi_scope": 0.641, "blimp/accuracy/superlative_quantifiers_2": 0.873, "blimp/accuracy/passive_1": 0.873, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.897, "blimp/accuracy/inchoative": 0.554, "blimp/accuracy/anaphor_gender_agreement": 0.862, "blimp/accuracy/principle_A_c_command": 0.572, "blimp/accuracy/only_npi_licensor_present": 0.471, "blimp/accuracy/expletive_it_object_raising": 0.78, "blimp/accuracy/left_branch_island_simple_question": 0.476, "blimp/accuracy/wh_questions_subject_gap": 0.895, "blimp/accuracy/existential_there_quantifiers_2": 0.502, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.885, "blimp/accuracy/sentential_negation_npi_scope": 0.467, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.789, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.849, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.863, "blimp/accuracy/principle_A_case_2": 0.965, "blimp/accuracy/distractor_agreement_relational_noun": 0.765, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.986, "blimp/accuracy/superlative_quantifiers_1": 0.631, "blimp/accuracy/wh_island": 0.828, "blimp/accuracy/principle_A_domain_1": 0.977, "blimp/accuracy/complex_NP_island": 0.492, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.925, "blimp/accuracy/irregular_past_participle_verbs": 0.858, "blimp/accuracy/drop_argument": 0.689, "blimp/accuracy/wh_questions_object_gap": 0.76, "blimp/accuracy/animate_subject_passive": 0.769, "blimp/accuracy/existential_there_quantifiers_1": 0.978, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.845, "blimp/accuracy/npi_present_2": 0.619, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.896, "blimp/accuracy/anaphor_number_agreement": 0.972, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.934, "blimp/accuracy/existential_there_object_raising": 0.76, "blimp/accuracy/matrix_question_npi_licensor_present": 0.201, "blimp/accuracy/npi_present_1": 0.58, "blimp/accuracy/wh_vs_that_no_gap": 0.947, "blimp/accuracy/left_branch_island_echo_question": 0.366, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.973, "blimp/accuracy/causative": 0.677, "blimp/accuracy/group_average": 0.7412238805970146, "blimp/accuracy/seq_average": 0.7412238805970149, "cbt/accuracy/NE": 0.7059294871794872, "cbt/accuracy/V": 0.8772, "cbt/accuracy/CN": 0.7644, "cbt/accuracy/P": 0.864, "cbt/accuracy/group_average": 0.8028823717948718, "cbt/accuracy/seq_average": 0.8029211684673869, "hellaswag/accuracy/val": 0.2758414658434575, "hellaswag/accuracy/group_average": 0.2758414658434575, "hellaswag/accuracy/seq_average": 0.2758414658434575, "piqa/accuracy/val": 0.5767138193688792, "piqa/accuracy/group_average": 0.5767138193688792, "piqa/accuracy/seq_average": 0.5767138193688792, "ai2arc/accuracy/ARC-Easy": 0.3099365750528541, "ai2arc/accuracy/ARC-Challenge": 0.20772532188841203, "ai2arc/accuracy/group_average": 0.25883094847063304, "ai2arc/accuracy/seq_average": 0.2762039660056657, "race/accuracy/test/high": 0.2552887364208119, "race/accuracy/test/middle": 0.32590529247910865, "race/accuracy/group_average": 0.2905970144499603, "race/accuracy/seq_average": 0.275841102553709, "siqa/accuracy/dev": 0.3694984646878199, "siqa/accuracy/group_average": 0.3694984646878199, "siqa/accuracy/seq_average": 0.3694984646878199, "commonsenseqa/accuracy/dev_rand_split": 0.2457002457002457, "commonsenseqa/accuracy/group_average": 0.2457002457002457, "commonsenseqa/accuracy/seq_average": 0.2457002457002457}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-30000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.8010045611669145, "val/accuracy": 0.4515448676215278, "val/perplexity": 16.461174724927293, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6126781013441382, "lambada/accuracy/total": 0.19934006211180125, "lambada/accuracy/openai_last_token": 0.7377717391304348, "lambada/perplexity": 15.94638679663136, "lambada/lm_loss": 3.3561931141990313, "lambada/lm_perplexity": 28.679802064734133, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3254424648666645, "mean_loss": 2.7068413312555264, "blimp/accuracy/passive_2": 0.873, "blimp/accuracy/determiner_noun_agreement_2": 0.966, "blimp/accuracy/ellipsis_n_bar_1": 0.741, "blimp/accuracy/tough_vs_raising_2": 0.845, "blimp/accuracy/tough_vs_raising_1": 0.582, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.897, "blimp/accuracy/principle_A_reconstruction": 0.471, "blimp/accuracy/wh_vs_that_with_gap": 0.524, "blimp/accuracy/principle_A_domain_2": 0.797, "blimp/accuracy/determiner_noun_agreement_1": 0.975, "blimp/accuracy/ellipsis_n_bar_2": 0.847, "blimp/accuracy/principle_A_domain_3": 0.546, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.874, "blimp/accuracy/animate_subject_trans": 0.876, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.825, "blimp/accuracy/distractor_agreement_relative_clause": 0.487, "blimp/accuracy/transitive": 0.857, "blimp/accuracy/sentential_subject_island": 0.388, "blimp/accuracy/adjunct_island": 0.71, "blimp/accuracy/intransitive": 0.746, "blimp/accuracy/existential_there_subject_raising": 0.86, "blimp/accuracy/irregular_past_participle_adjectives": 0.978, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.346, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.209, "blimp/accuracy/only_npi_scope": 0.765, "blimp/accuracy/superlative_quantifiers_2": 0.585, "blimp/accuracy/passive_1": 0.873, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.873, "blimp/accuracy/inchoative": 0.62, "blimp/accuracy/anaphor_gender_agreement": 0.952, "blimp/accuracy/principle_A_c_command": 0.567, "blimp/accuracy/only_npi_licensor_present": 0.541, "blimp/accuracy/expletive_it_object_raising": 0.798, "blimp/accuracy/left_branch_island_simple_question": 0.363, "blimp/accuracy/wh_questions_subject_gap": 0.851, "blimp/accuracy/existential_there_quantifiers_2": 0.296, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.904, "blimp/accuracy/sentential_negation_npi_scope": 0.644, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.811, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.839, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.849, "blimp/accuracy/principle_A_case_2": 0.95, "blimp/accuracy/distractor_agreement_relational_noun": 0.715, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.986, "blimp/accuracy/superlative_quantifiers_1": 0.718, "blimp/accuracy/wh_island": 0.709, "blimp/accuracy/principle_A_domain_1": 0.944, "blimp/accuracy/complex_NP_island": 0.497, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.923, "blimp/accuracy/irregular_past_participle_verbs": 0.881, "blimp/accuracy/drop_argument": 0.763, "blimp/accuracy/wh_questions_object_gap": 0.636, "blimp/accuracy/animate_subject_passive": 0.787, "blimp/accuracy/existential_there_quantifiers_1": 0.973, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.859, "blimp/accuracy/npi_present_2": 0.586, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.891, "blimp/accuracy/anaphor_number_agreement": 0.979, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.922, "blimp/accuracy/existential_there_object_raising": 0.796, "blimp/accuracy/matrix_question_npi_licensor_present": 0.255, "blimp/accuracy/npi_present_1": 0.569, "blimp/accuracy/wh_vs_that_no_gap": 0.923, "blimp/accuracy/left_branch_island_echo_question": 0.325, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.957, "blimp/accuracy/causative": 0.714, "blimp/accuracy/group_average": 0.7404328358208955, "blimp/accuracy/seq_average": 0.7404328358208955, "cbt/accuracy/NE": 0.7307692307692307, "cbt/accuracy/V": 0.886, "cbt/accuracy/CN": 0.7864, "cbt/accuracy/P": 0.8704, "cbt/accuracy/group_average": 0.8183923076923076, "cbt/accuracy/seq_average": 0.8184273709483794, "hellaswag/accuracy/val": 0.28231428002389963, "hellaswag/accuracy/group_average": 0.28231428002389963, "hellaswag/accuracy/seq_average": 0.28231428002389963, "piqa/accuracy/val": 0.5788900979325353, "piqa/accuracy/group_average": 0.5788900979325353, "piqa/accuracy/seq_average": 0.5788900979325353, "ai2arc/accuracy/ARC-Easy": 0.32600422832980974, "ai2arc/accuracy/ARC-Challenge": 0.20772532188841203, "ai2arc/accuracy/group_average": 0.2668647751091109, "ai2arc/accuracy/seq_average": 0.2869688385269122, "race/accuracy/test/high": 0.259576901086335, "race/accuracy/test/middle": 0.32729805013927576, "race/accuracy/group_average": 0.2934374756128054, "race/accuracy/seq_average": 0.2792865828942035, "siqa/accuracy/dev": 0.3679631525076766, "siqa/accuracy/group_average": 0.3679631525076766, "siqa/accuracy/seq_average": 0.3679631525076766, "commonsenseqa/accuracy/dev_rand_split": 0.24488124488124488, "commonsenseqa/accuracy/group_average": 0.24488124488124488, "commonsenseqa/accuracy/seq_average": 0.24488124488124488}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-40000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.7482169015066966, "val/accuracy": 0.4594251844618056, "val/perplexity": 15.61476438347703, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5934531644264363, "lambada/accuracy/total": 0.2080745341614907, "lambada/accuracy/openai_last_token": 0.7422360248447205, "lambada/perplexity": 15.670183870221386, "lambada/lm_loss": 3.2963399277560805, "lambada/lm_perplexity": 27.01358608433046, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.33374985931164813, "mean_loss": 2.6708350329665667, "blimp/accuracy/passive_2": 0.878, "blimp/accuracy/determiner_noun_agreement_2": 0.961, "blimp/accuracy/ellipsis_n_bar_1": 0.789, "blimp/accuracy/tough_vs_raising_2": 0.853, "blimp/accuracy/tough_vs_raising_1": 0.569, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.898, "blimp/accuracy/principle_A_reconstruction": 0.271, "blimp/accuracy/wh_vs_that_with_gap": 0.506, "blimp/accuracy/principle_A_domain_2": 0.805, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.887, "blimp/accuracy/principle_A_domain_3": 0.584, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.888, "blimp/accuracy/animate_subject_trans": 0.883, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.873, "blimp/accuracy/distractor_agreement_relative_clause": 0.516, "blimp/accuracy/transitive": 0.858, "blimp/accuracy/sentential_subject_island": 0.299, "blimp/accuracy/adjunct_island": 0.77, "blimp/accuracy/intransitive": 0.749, "blimp/accuracy/existential_there_subject_raising": 0.882, "blimp/accuracy/irregular_past_participle_adjectives": 0.913, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.46, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.23, "blimp/accuracy/only_npi_scope": 0.606, "blimp/accuracy/superlative_quantifiers_2": 0.734, "blimp/accuracy/passive_1": 0.876, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.857, "blimp/accuracy/inchoative": 0.607, "blimp/accuracy/anaphor_gender_agreement": 0.951, "blimp/accuracy/principle_A_c_command": 0.59, "blimp/accuracy/only_npi_licensor_present": 0.711, "blimp/accuracy/expletive_it_object_raising": 0.795, "blimp/accuracy/left_branch_island_simple_question": 0.523, "blimp/accuracy/wh_questions_subject_gap": 0.883, "blimp/accuracy/existential_there_quantifiers_2": 0.33, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.922, "blimp/accuracy/sentential_negation_npi_scope": 0.617, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.801, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.848, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.88, "blimp/accuracy/principle_A_case_2": 0.964, "blimp/accuracy/distractor_agreement_relational_noun": 0.707, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.988, "blimp/accuracy/superlative_quantifiers_1": 0.716, "blimp/accuracy/wh_island": 0.693, "blimp/accuracy/principle_A_domain_1": 0.966, "blimp/accuracy/complex_NP_island": 0.544, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.95, "blimp/accuracy/irregular_past_participle_verbs": 0.909, "blimp/accuracy/drop_argument": 0.736, "blimp/accuracy/wh_questions_object_gap": 0.714, "blimp/accuracy/animate_subject_passive": 0.759, "blimp/accuracy/existential_there_quantifiers_1": 0.982, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.857, "blimp/accuracy/npi_present_2": 0.527, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.915, "blimp/accuracy/anaphor_number_agreement": 0.978, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.949, "blimp/accuracy/existential_there_object_raising": 0.788, "blimp/accuracy/matrix_question_npi_licensor_present": 0.253, "blimp/accuracy/npi_present_1": 0.566, "blimp/accuracy/wh_vs_that_no_gap": 0.949, "blimp/accuracy/left_branch_island_echo_question": 0.359, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.973, "blimp/accuracy/causative": 0.686, "blimp/accuracy/group_average": 0.750283582089552, "blimp/accuracy/seq_average": 0.7502835820895523, "cbt/accuracy/NE": 0.7311698717948718, "cbt/accuracy/V": 0.8968, "cbt/accuracy/CN": 0.788, "cbt/accuracy/P": 0.8764, "cbt/accuracy/group_average": 0.8230924679487179, "cbt/accuracy/seq_average": 0.8231292517006803, "hellaswag/accuracy/val": 0.2847042421828321, "hellaswag/accuracy/group_average": 0.2847042421828321, "hellaswag/accuracy/seq_average": 0.2847042421828321, "piqa/accuracy/val": 0.5788900979325353, "piqa/accuracy/group_average": 0.5788900979325353, "piqa/accuracy/seq_average": 0.5788900979325353, "ai2arc/accuracy/ARC-Easy": 0.32642706131078225, "ai2arc/accuracy/ARC-Challenge": 0.19914163090128756, "ai2arc/accuracy/group_average": 0.2627843461060349, "ai2arc/accuracy/seq_average": 0.28441926345609064, "race/accuracy/test/high": 0.2655803316180675, "race/accuracy/test/middle": 0.33774373259052926, "race/accuracy/group_average": 0.30166203210429837, "race/accuracy/seq_average": 0.28658289420348604, "siqa/accuracy/dev": 0.372057318321392, "siqa/accuracy/group_average": 0.372057318321392, "siqa/accuracy/seq_average": 0.372057318321392, "commonsenseqa/accuracy/dev_rand_split": 0.25225225225225223, "commonsenseqa/accuracy/group_average": 0.25225225225225223, "commonsenseqa/accuracy/seq_average": 0.25225225225225223}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-50000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.7065531412760415, "val/accuracy": 0.4652952163938492, "val/perplexity": 14.977560903133552, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7031426281662463, "lambada/accuracy/total": 0.22593167701863354, "lambada/accuracy/openai_last_token": 0.7534937888198758, "lambada/perplexity": 14.005009783680974, "lambada/lm_loss": 3.280149474513485, "lambada/lm_perplexity": 26.57974539746989, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3456134467062414, "mean_loss": 2.704847884721144, "blimp/accuracy/passive_2": 0.878, "blimp/accuracy/determiner_noun_agreement_2": 0.971, "blimp/accuracy/ellipsis_n_bar_1": 0.767, "blimp/accuracy/tough_vs_raising_2": 0.884, "blimp/accuracy/tough_vs_raising_1": 0.519, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.901, "blimp/accuracy/principle_A_reconstruction": 0.396, "blimp/accuracy/wh_vs_that_with_gap": 0.502, "blimp/accuracy/principle_A_domain_2": 0.848, "blimp/accuracy/determiner_noun_agreement_1": 0.981, "blimp/accuracy/ellipsis_n_bar_2": 0.895, "blimp/accuracy/principle_A_domain_3": 0.588, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.896, "blimp/accuracy/animate_subject_trans": 0.876, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.849, "blimp/accuracy/distractor_agreement_relative_clause": 0.53, "blimp/accuracy/transitive": 0.852, "blimp/accuracy/sentential_subject_island": 0.366, "blimp/accuracy/adjunct_island": 0.788, "blimp/accuracy/intransitive": 0.745, "blimp/accuracy/existential_there_subject_raising": 0.883, "blimp/accuracy/irregular_past_participle_adjectives": 0.977, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.305, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.19, "blimp/accuracy/only_npi_scope": 0.73, "blimp/accuracy/superlative_quantifiers_2": 0.731, "blimp/accuracy/passive_1": 0.883, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.857, "blimp/accuracy/inchoative": 0.602, "blimp/accuracy/anaphor_gender_agreement": 0.946, "blimp/accuracy/principle_A_c_command": 0.622, "blimp/accuracy/only_npi_licensor_present": 0.612, "blimp/accuracy/expletive_it_object_raising": 0.798, "blimp/accuracy/left_branch_island_simple_question": 0.37, "blimp/accuracy/wh_questions_subject_gap": 0.915, "blimp/accuracy/existential_there_quantifiers_2": 0.42, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.904, "blimp/accuracy/sentential_negation_npi_scope": 0.596, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.827, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.884, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.864, "blimp/accuracy/principle_A_case_2": 0.969, "blimp/accuracy/distractor_agreement_relational_noun": 0.732, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987, "blimp/accuracy/superlative_quantifiers_1": 0.685, "blimp/accuracy/wh_island": 0.72, "blimp/accuracy/principle_A_domain_1": 0.98, "blimp/accuracy/complex_NP_island": 0.54, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.959, "blimp/accuracy/irregular_past_participle_verbs": 0.901, "blimp/accuracy/drop_argument": 0.736, "blimp/accuracy/wh_questions_object_gap": 0.808, "blimp/accuracy/animate_subject_passive": 0.789, "blimp/accuracy/existential_there_quantifiers_1": 0.994, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.884, "blimp/accuracy/npi_present_2": 0.576, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.931, "blimp/accuracy/anaphor_number_agreement": 0.979, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.94, "blimp/accuracy/existential_there_object_raising": 0.816, "blimp/accuracy/matrix_question_npi_licensor_present": 0.271, "blimp/accuracy/npi_present_1": 0.584, "blimp/accuracy/wh_vs_that_no_gap": 0.964, "blimp/accuracy/left_branch_island_echo_question": 0.407, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974, "blimp/accuracy/causative": 0.686, "blimp/accuracy/group_average": 0.7576119402985075, "blimp/accuracy/seq_average": 0.7576119402985074, "cbt/accuracy/NE": 0.7375801282051282, "cbt/accuracy/V": 0.8992, "cbt/accuracy/CN": 0.798, "cbt/accuracy/P": 0.8744, "cbt/accuracy/group_average": 0.8272950320512821, "cbt/accuracy/seq_average": 0.8273309323729492, "hellaswag/accuracy/val": 0.28560047799243177, "hellaswag/accuracy/group_average": 0.28560047799243177, "hellaswag/accuracy/seq_average": 0.28560047799243177, "piqa/accuracy/val": 0.5826985854189336, "piqa/accuracy/group_average": 0.5826985854189336, "piqa/accuracy/seq_average": 0.5826985854189336, "ai2arc/accuracy/ARC-Easy": 0.321353065539112, "ai2arc/accuracy/ARC-Challenge": 0.2128755364806867, "ai2arc/accuracy/group_average": 0.26711430100989936, "ai2arc/accuracy/seq_average": 0.2855524079320113, "race/accuracy/test/high": 0.26786735277301316, "race/accuracy/test/middle": 0.3279944289693593, "race/accuracy/group_average": 0.29793089087118624, "race/accuracy/seq_average": 0.2853668423186056, "siqa/accuracy/dev": 0.36284544524053225, "siqa/accuracy/group_average": 0.36284544524053225, "siqa/accuracy/seq_average": 0.36284544524053225, "commonsenseqa/accuracy/dev_rand_split": 0.25307125307125306, "commonsenseqa/accuracy/group_average": 0.25307125307125306, "commonsenseqa/accuracy/seq_average": 0.25307125307125306}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-60000.pth.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.671809605189732,
|
| 3 |
+
"val/accuracy": 0.4705403645833333,
|
| 4 |
+
"val/perplexity": 14.466123493335513,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.6152883967997864,
|
| 8 |
+
"lambada/accuracy/total": 0.22496118012422361,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.7525232919254659,
|
| 10 |
+
"lambada/perplexity": 13.588685271191228,
|
| 11 |
+
"lambada/lm_loss": 3.222259005347056,
|
| 12 |
+
"lambada/lm_perplexity": 25.084722747333092,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.3477507723537785,
|
| 16 |
+
"mean_loss": 2.643549000994759,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.888,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.974,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.769,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.861,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.55,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.879,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.335,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.491,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.831,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.989,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.893,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.579,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.904,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.884,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.881,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.583,
|
| 33 |
+
"blimp/accuracy/transitive": 0.864,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.338,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.729,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.743,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.888,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.953,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.402,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.208,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.719,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.658,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.876,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.867,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.646,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.954,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.609,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.471,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.775,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.467,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.918,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.366,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.933,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.684,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.807,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.857,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.862,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.966,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.712,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.99,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.691,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.782,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.988,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.528,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.953,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.902,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.741,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.757,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.803,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.985,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.878,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.603,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.911,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.98,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.948,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.819,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.256,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.62,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.962,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.365,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.973,
|
| 83 |
+
"blimp/accuracy/causative": 0.72,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7569850746268658,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.7569850746268657,
|
| 86 |
+
"cbt/accuracy/NE": 0.7487980769230769,
|
| 87 |
+
"cbt/accuracy/V": 0.8972,
|
| 88 |
+
"cbt/accuracy/CN": 0.816,
|
| 89 |
+
"cbt/accuracy/P": 0.8852,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8367995192307691,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8368347338935574,
|
| 92 |
+
"hellaswag/accuracy/val": 0.29067914758016333,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.29067914758016333,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.29067914758016333,
|
| 95 |
+
"piqa/accuracy/val": 0.5816104461371056,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5816104461371056,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5816104461371056,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.3276955602536998,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.21373390557939914,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.27071473291654946,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.29008498583569403,
|
| 102 |
+
"race/accuracy/test/high": 0.26500857632933106,
|
| 103 |
+
"race/accuracy/test/middle": 0.3293871866295265,
|
| 104 |
+
"race/accuracy/group_average": 0.29719788147942877,
|
| 105 |
+
"race/accuracy/seq_average": 0.2837454398054317,
|
| 106 |
+
"siqa/accuracy/dev": 0.3623336745138178,
|
| 107 |
+
"siqa/accuracy/group_average": 0.3623336745138178,
|
| 108 |
+
"siqa/accuracy/seq_average": 0.3623336745138178,
|
| 109 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.25552825552825553,
|
| 110 |
+
"commonsenseqa/accuracy/group_average": 0.25552825552825553,
|
| 111 |
+
"commonsenseqa/accuracy/seq_average": 0.25552825552825553
|
| 112 |
+
}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-70000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6450330946180554, "val/accuracy": 0.4744311135912698, "val/perplexity": 14.083911177072899, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6619717615731755, "lambada/accuracy/total": 0.24495341614906832, "lambada/accuracy/openai_last_token": 0.7575698757763976, "lambada/perplexity": 12.76574531095774, "lambada/lm_loss": 3.211762370304564, "lambada/lm_perplexity": 24.82279465423036, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3596922648701691, "mean_loss": 2.6535024280956154, "blimp/accuracy/passive_2": 0.898, "blimp/accuracy/determiner_noun_agreement_2": 0.966, "blimp/accuracy/ellipsis_n_bar_1": 0.803, "blimp/accuracy/tough_vs_raising_2": 0.87, "blimp/accuracy/tough_vs_raising_1": 0.577, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.904, "blimp/accuracy/principle_A_reconstruction": 0.301, "blimp/accuracy/wh_vs_that_with_gap": 0.505, "blimp/accuracy/principle_A_domain_2": 0.841, "blimp/accuracy/determiner_noun_agreement_1": 0.986, "blimp/accuracy/ellipsis_n_bar_2": 0.903, "blimp/accuracy/principle_A_domain_3": 0.611, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.893, "blimp/accuracy/animate_subject_trans": 0.889, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.871, "blimp/accuracy/distractor_agreement_relative_clause": 0.579, "blimp/accuracy/transitive": 0.883, "blimp/accuracy/sentential_subject_island": 0.303, "blimp/accuracy/adjunct_island": 0.782, "blimp/accuracy/intransitive": 0.761, "blimp/accuracy/existential_there_subject_raising": 0.874, "blimp/accuracy/irregular_past_participle_adjectives": 0.979, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.472, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.22, "blimp/accuracy/only_npi_scope": 0.79, "blimp/accuracy/superlative_quantifiers_2": 0.759, "blimp/accuracy/passive_1": 0.887, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.859, "blimp/accuracy/inchoative": 0.635, "blimp/accuracy/anaphor_gender_agreement": 0.959, "blimp/accuracy/principle_A_c_command": 0.593, "blimp/accuracy/only_npi_licensor_present": 0.671, "blimp/accuracy/expletive_it_object_raising": 0.799, "blimp/accuracy/left_branch_island_simple_question": 0.54, "blimp/accuracy/wh_questions_subject_gap": 0.926, "blimp/accuracy/existential_there_quantifiers_2": 0.348, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.929, "blimp/accuracy/sentential_negation_npi_scope": 0.657, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.85, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.858, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.892, "blimp/accuracy/principle_A_case_2": 0.959, "blimp/accuracy/distractor_agreement_relational_noun": 0.792, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.988, "blimp/accuracy/superlative_quantifiers_1": 0.736, "blimp/accuracy/wh_island": 0.72, "blimp/accuracy/principle_A_domain_1": 0.975, "blimp/accuracy/complex_NP_island": 0.517, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.955, "blimp/accuracy/irregular_past_participle_verbs": 0.911, "blimp/accuracy/drop_argument": 0.742, "blimp/accuracy/wh_questions_object_gap": 0.772, "blimp/accuracy/animate_subject_passive": 0.784, "blimp/accuracy/existential_there_quantifiers_1": 0.972, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.886, "blimp/accuracy/npi_present_2": 0.589, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.921, "blimp/accuracy/anaphor_number_agreement": 0.98, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.947, "blimp/accuracy/existential_there_object_raising": 0.839, "blimp/accuracy/matrix_question_npi_licensor_present": 0.236, "blimp/accuracy/npi_present_1": 0.603, "blimp/accuracy/wh_vs_that_no_gap": 0.968, "blimp/accuracy/left_branch_island_echo_question": 0.451, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.977, "blimp/accuracy/causative": 0.726, "blimp/accuracy/group_average": 0.7696865671641789, "blimp/accuracy/seq_average": 0.7696865671641792, "cbt/accuracy/NE": 0.7524038461538461, "cbt/accuracy/V": 0.9036, "cbt/accuracy/CN": 0.8124, "cbt/accuracy/P": 0.8828, "cbt/accuracy/group_average": 0.8378009615384615, "cbt/accuracy/seq_average": 0.8378351340536214, "hellaswag/accuracy/val": 0.29396534554869547, "hellaswag/accuracy/group_average": 0.29396534554869547, "hellaswag/accuracy/seq_average": 0.29396534554869547, "piqa/accuracy/val": 0.5875952121871599, "piqa/accuracy/group_average": 0.5875952121871599, "piqa/accuracy/seq_average": 0.5875952121871599, "ai2arc/accuracy/ARC-Easy": 0.3293868921775899, "ai2arc/accuracy/ARC-Challenge": 0.20257510729613734, "ai2arc/accuracy/group_average": 0.2659809997368636, "ai2arc/accuracy/seq_average": 0.28753541076487255, "race/accuracy/test/high": 0.2644368210405946, "race/accuracy/test/middle": 0.334958217270195, "race/accuracy/group_average": 0.2996975191553948, "race/accuracy/seq_average": 0.2849614916903121, "siqa/accuracy/dev": 0.36284544524053225, "siqa/accuracy/group_average": 0.36284544524053225, "siqa/accuracy/seq_average": 0.36284544524053225, "commonsenseqa/accuracy/dev_rand_split": 0.2628992628992629, "commonsenseqa/accuracy/group_average": 0.2628992628992629, "commonsenseqa/accuracy/seq_average": 0.2628992628992629}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-80000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6233917720734126, "val/accuracy": 0.4771544441344246, "val/perplexity": 13.782391126835273, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.60029289292993, "lambada/accuracy/total": 0.2521350931677019, "lambada/accuracy/openai_last_token": 0.7616459627329193, "lambada/perplexity": 12.105005054481381, "lambada/lm_loss": 3.1827236355788733, "lambada/lm_perplexity": 24.112337418328277, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3646447686510632, "mean_loss": 2.6118423325016713, "blimp/accuracy/passive_2": 0.908, "blimp/accuracy/determiner_noun_agreement_2": 0.974, "blimp/accuracy/ellipsis_n_bar_1": 0.787, "blimp/accuracy/tough_vs_raising_2": 0.856, "blimp/accuracy/tough_vs_raising_1": 0.584, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.896, "blimp/accuracy/principle_A_reconstruction": 0.35, "blimp/accuracy/wh_vs_that_with_gap": 0.492, "blimp/accuracy/principle_A_domain_2": 0.845, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.892, "blimp/accuracy/principle_A_domain_3": 0.605, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.897, "blimp/accuracy/animate_subject_trans": 0.883, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.895, "blimp/accuracy/distractor_agreement_relative_clause": 0.583, "blimp/accuracy/transitive": 0.877, "blimp/accuracy/sentential_subject_island": 0.321, "blimp/accuracy/adjunct_island": 0.791, "blimp/accuracy/intransitive": 0.781, "blimp/accuracy/existential_there_subject_raising": 0.875, "blimp/accuracy/irregular_past_participle_adjectives": 0.979, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.409, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.232, "blimp/accuracy/only_npi_scope": 0.681, "blimp/accuracy/superlative_quantifiers_2": 0.797, "blimp/accuracy/passive_1": 0.887, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.878, "blimp/accuracy/inchoative": 0.624, "blimp/accuracy/anaphor_gender_agreement": 0.954, "blimp/accuracy/principle_A_c_command": 0.622, "blimp/accuracy/only_npi_licensor_present": 0.531, "blimp/accuracy/expletive_it_object_raising": 0.785, "blimp/accuracy/left_branch_island_simple_question": 0.485, "blimp/accuracy/wh_questions_subject_gap": 0.924, "blimp/accuracy/existential_there_quantifiers_2": 0.35, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.929, "blimp/accuracy/sentential_negation_npi_scope": 0.68, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.835, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.831, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.891, "blimp/accuracy/principle_A_case_2": 0.962, "blimp/accuracy/distractor_agreement_relational_noun": 0.727, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987, "blimp/accuracy/superlative_quantifiers_1": 0.792, "blimp/accuracy/wh_island": 0.758, "blimp/accuracy/principle_A_domain_1": 0.967, "blimp/accuracy/complex_NP_island": 0.562, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.963, "blimp/accuracy/irregular_past_participle_verbs": 0.909, "blimp/accuracy/drop_argument": 0.759, "blimp/accuracy/wh_questions_object_gap": 0.783, "blimp/accuracy/animate_subject_passive": 0.794, "blimp/accuracy/existential_there_quantifiers_1": 0.978, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.87, "blimp/accuracy/npi_present_2": 0.6, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.933, "blimp/accuracy/anaphor_number_agreement": 0.984, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.958, "blimp/accuracy/existential_there_object_raising": 0.824, "blimp/accuracy/matrix_question_npi_licensor_present": 0.324, "blimp/accuracy/npi_present_1": 0.62, "blimp/accuracy/wh_vs_that_no_gap": 0.963, "blimp/accuracy/left_branch_island_echo_question": 0.396, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.976, "blimp/accuracy/causative": 0.712, "blimp/accuracy/group_average": 0.7685074626865671, "blimp/accuracy/seq_average": 0.7685074626865671, "cbt/accuracy/NE": 0.7604166666666666, "cbt/accuracy/V": 0.908, "cbt/accuracy/CN": 0.8172, "cbt/accuracy/P": 0.8824, "cbt/accuracy/group_average": 0.8420041666666667, "cbt/accuracy/seq_average": 0.8420368147258903, "hellaswag/accuracy/val": 0.2953594901414061, "hellaswag/accuracy/group_average": 0.2953594901414061, "hellaswag/accuracy/seq_average": 0.2953594901414061, "piqa/accuracy/val": 0.588683351468988, "piqa/accuracy/group_average": 0.588683351468988, "piqa/accuracy/seq_average": 0.588683351468988, "ai2arc/accuracy/ARC-Easy": 0.33403805496828753, "ai2arc/accuracy/ARC-Challenge": 0.2094420600858369, "ai2arc/accuracy/group_average": 0.2717400575270622, "ai2arc/accuracy/seq_average": 0.2929178470254957, "race/accuracy/test/high": 0.2655803316180675, "race/accuracy/test/middle": 0.33356545961002787, "race/accuracy/group_average": 0.29957289561404765, "race/accuracy/seq_average": 0.2853668423186056, "siqa/accuracy/dev": 0.3623336745138178, "siqa/accuracy/group_average": 0.3623336745138178, "siqa/accuracy/seq_average": 0.3623336745138178, "commonsenseqa/accuracy/dev_rand_split": 0.26044226044226043, "commonsenseqa/accuracy/group_average": 0.26044226044226043, "commonsenseqa/accuracy/seq_average": 0.26044226044226043}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_tcmoe/export/result-model-90000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6112525576636907, "val/accuracy": 0.47884889632936506, "val/perplexity": 13.616095119429911, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.662150317837733, "lambada/accuracy/total": 0.24611801242236025, "lambada/accuracy/openai_last_token": 0.7597049689440993, "lambada/perplexity": 11.89158931127403, "lambada/lm_loss": 3.1628039147488223, "lambada/lm_perplexity": 23.63677861257303, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.36248345437586266, "mean_loss": 2.6367014377507116, "blimp/accuracy/passive_2": 0.894, "blimp/accuracy/determiner_noun_agreement_2": 0.972, "blimp/accuracy/ellipsis_n_bar_1": 0.796, "blimp/accuracy/tough_vs_raising_2": 0.871, "blimp/accuracy/tough_vs_raising_1": 0.571, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.9, "blimp/accuracy/principle_A_reconstruction": 0.326, "blimp/accuracy/wh_vs_that_with_gap": 0.497, "blimp/accuracy/principle_A_domain_2": 0.858, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.897, "blimp/accuracy/principle_A_domain_3": 0.597, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.896, "blimp/accuracy/animate_subject_trans": 0.876, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.89, "blimp/accuracy/distractor_agreement_relative_clause": 0.578, "blimp/accuracy/transitive": 0.869, "blimp/accuracy/sentential_subject_island": 0.325, "blimp/accuracy/adjunct_island": 0.783, "blimp/accuracy/intransitive": 0.768, "blimp/accuracy/existential_there_subject_raising": 0.883, "blimp/accuracy/irregular_past_participle_adjectives": 0.981, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.396, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.211, "blimp/accuracy/only_npi_scope": 0.742, "blimp/accuracy/superlative_quantifiers_2": 0.625, "blimp/accuracy/passive_1": 0.883, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.887, "blimp/accuracy/inchoative": 0.624, "blimp/accuracy/anaphor_gender_agreement": 0.958, "blimp/accuracy/principle_A_c_command": 0.615, "blimp/accuracy/only_npi_licensor_present": 0.549, "blimp/accuracy/expletive_it_object_raising": 0.793, "blimp/accuracy/left_branch_island_simple_question": 0.449, "blimp/accuracy/wh_questions_subject_gap": 0.927, "blimp/accuracy/existential_there_quantifiers_2": 0.373, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.942, "blimp/accuracy/sentential_negation_npi_scope": 0.622, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.836, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.859, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.889, "blimp/accuracy/principle_A_case_2": 0.952, "blimp/accuracy/distractor_agreement_relational_noun": 0.775, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.982, "blimp/accuracy/superlative_quantifiers_1": 0.756, "blimp/accuracy/wh_island": 0.759, "blimp/accuracy/principle_A_domain_1": 0.976, "blimp/accuracy/complex_NP_island": 0.537, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.962, "blimp/accuracy/irregular_past_participle_verbs": 0.912, "blimp/accuracy/drop_argument": 0.74, "blimp/accuracy/wh_questions_object_gap": 0.78, "blimp/accuracy/animate_subject_passive": 0.797, "blimp/accuracy/existential_there_quantifiers_1": 0.985, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.869, "blimp/accuracy/npi_present_2": 0.629, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.943, "blimp/accuracy/anaphor_number_agreement": 0.983, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.952, "blimp/accuracy/existential_there_object_raising": 0.836, "blimp/accuracy/matrix_question_npi_licensor_present": 0.294, "blimp/accuracy/npi_present_1": 0.609, "blimp/accuracy/wh_vs_that_no_gap": 0.965, "blimp/accuracy/left_branch_island_echo_question": 0.43, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.976, "blimp/accuracy/causative": 0.73, "blimp/accuracy/group_average": 0.76610447761194, "blimp/accuracy/seq_average": 0.7661044776119403, "cbt/accuracy/NE": 0.7640224358974359, "cbt/accuracy/V": 0.9112, "cbt/accuracy/CN": 0.824, "cbt/accuracy/P": 0.8868, "cbt/accuracy/group_average": 0.846505608974359, "cbt/accuracy/seq_average": 0.8465386154461785, "hellaswag/accuracy/val": 0.29416450906193986, "hellaswag/accuracy/group_average": 0.29416450906193986, "hellaswag/accuracy/seq_average": 0.29416450906193986, "piqa/accuracy/val": 0.5875952121871599, "piqa/accuracy/group_average": 0.5875952121871599, "piqa/accuracy/seq_average": 0.5875952121871599, "ai2arc/accuracy/ARC-Easy": 0.33657505285412265, "ai2arc/accuracy/ARC-Challenge": 0.20772532188841203, "ai2arc/accuracy/group_average": 0.27215018737126734, "ai2arc/accuracy/seq_average": 0.29405099150141645, "race/accuracy/test/high": 0.274442538593482, "race/accuracy/test/middle": 0.33008356545961004, "race/accuracy/group_average": 0.302263052026546, "race/accuracy/seq_average": 0.29063640048642075, "siqa/accuracy/dev": 0.36591606960081885, "siqa/accuracy/group_average": 0.36591606960081885, "siqa/accuracy/seq_average": 0.36591606960081885, "commonsenseqa/accuracy/dev_rand_split": 0.266994266994267, "commonsenseqa/accuracy/group_average": 0.266994266994267, "commonsenseqa/accuracy/seq_average": 0.266994266994267}
|