1b8152d875db5fac3e5490e634ee15335ecd67f7cea83853f58c6eaf0243dc6e
Browse files- Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-10000.pth.json +112 -0
- Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-100000.pth.json +112 -0
- Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-20000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-30000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-40000.pth.json +112 -0
- Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-50000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-60000.pth.json +112 -0
- Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-70000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-80000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-90000.pth.json +1 -0
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-10000.pth.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 3.082154289124504,
|
| 3 |
+
"val/accuracy": 0.4153355189732143,
|
| 4 |
+
"val/perplexity": 21.805326812077322,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.7326447860054346,
|
| 8 |
+
"lambada/accuracy/total": 0.14460403726708074,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.7109860248447205,
|
| 10 |
+
"lambada/perplexity": 27.812959440862006,
|
| 11 |
+
"lambada/lm_loss": 3.6232970926431887,
|
| 12 |
+
"lambada/lm_perplexity": 37.460876410448684,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.2799697781201475,
|
| 16 |
+
"mean_loss": 2.9073995375649693,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.868,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.965,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.692,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.774,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.5,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.859,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.333,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.36,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.741,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.98,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.884,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.49,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.855,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.858,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.773,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.486,
|
| 33 |
+
"blimp/accuracy/transitive": 0.792,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.48,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.763,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.691,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.768,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.986,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.135,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.145,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.556,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.785,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.883,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.853,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.538,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.929,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.459,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.421,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.767,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.143,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.917,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.346,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.903,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.393,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.747,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.888,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.79,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.942,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.66,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.998,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.645,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.779,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.951,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.554,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.899,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.796,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.743,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.735,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.725,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.96,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.891,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.583,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.836,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.971,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.938,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.718,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.047,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.492,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.957,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.351,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.977,
|
| 83 |
+
"blimp/accuracy/causative": 0.64,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7057313432835821,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.7057313432835821,
|
| 86 |
+
"cbt/accuracy/NE": 0.6875,
|
| 87 |
+
"cbt/accuracy/V": 0.8544,
|
| 88 |
+
"cbt/accuracy/CN": 0.716,
|
| 89 |
+
"cbt/accuracy/P": 0.8212,
|
| 90 |
+
"cbt/accuracy/group_average": 0.7697750000000001,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.7698079231692677,
|
| 92 |
+
"hellaswag/accuracy/val": 0.26628161720772753,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.26628161720772753,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.26628161720772753,
|
| 95 |
+
"piqa/accuracy/val": 0.5631120783460283,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5631120783460283,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5631120783460283,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.30528541226215644,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.2034334763948498,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.2543594443285031,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.271671388101983,
|
| 102 |
+
"race/accuracy/test/high": 0.2567181246426529,
|
| 103 |
+
"race/accuracy/test/middle": 0.31545961002785516,
|
| 104 |
+
"race/accuracy/group_average": 0.28608886733525407,
|
| 105 |
+
"race/accuracy/seq_average": 0.2738143494122416,
|
| 106 |
+
"siqa/accuracy/dev": 0.3638689866939611,
|
| 107 |
+
"siqa/accuracy/group_average": 0.3638689866939611,
|
| 108 |
+
"siqa/accuracy/seq_average": 0.3638689866939611,
|
| 109 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.23095823095823095,
|
| 110 |
+
"commonsenseqa/accuracy/group_average": 0.23095823095823095,
|
| 111 |
+
"commonsenseqa/accuracy/seq_average": 0.23095823095823095
|
| 112 |
+
}
|
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-100000.pth.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.6378416031125993,
|
| 3 |
+
"val/accuracy": 0.4758814251612103,
|
| 4 |
+
"val/perplexity": 13.982990170707938,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.591333803923234,
|
| 8 |
+
"lambada/accuracy/total": 0.24572981366459629,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.7556288819875776,
|
| 10 |
+
"lambada/perplexity": 12.418063515018053,
|
| 11 |
+
"lambada/lm_loss": 3.2057600278248044,
|
| 12 |
+
"lambada/lm_perplexity": 24.674246005337423,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.3608056194129033,
|
| 16 |
+
"mean_loss": 2.6145877035179166,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.91,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.985,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.843,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.848,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.636,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.889,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.423,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.541,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.781,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.987,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.903,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.544,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.925,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.898,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.898,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.684,
|
| 33 |
+
"blimp/accuracy/transitive": 0.847,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.423,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.825,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.788,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.845,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.846,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.252,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.247,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.619,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.841,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.894,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.902,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.619,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.964,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.614,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.674,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.794,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.338,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.916,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.43,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.934,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.569,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.824,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.868,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.875,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.965,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.799,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.989,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.748,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.782,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.976,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.563,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.964,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.887,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.766,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.79,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.788,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.957,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.899,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.535,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.929,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.984,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.956,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.851,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.16,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.525,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.975,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.378,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.978,
|
| 83 |
+
"blimp/accuracy/causative": 0.687,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7652835820895522,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.7652835820895523,
|
| 86 |
+
"cbt/accuracy/NE": 0.7588141025641025,
|
| 87 |
+
"cbt/accuracy/V": 0.9068,
|
| 88 |
+
"cbt/accuracy/CN": 0.8128,
|
| 89 |
+
"cbt/accuracy/P": 0.8864,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8412035256410257,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8412364945978391,
|
| 92 |
+
"hellaswag/accuracy/val": 0.29336785500896234,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.29336785500896234,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.29336785500896234,
|
| 95 |
+
"piqa/accuracy/val": 0.5826985854189336,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5826985854189336,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5826985854189336,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.3226215644820296,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.2094420600858369,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.26603181228393324,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.28526912181303116,
|
| 102 |
+
"race/accuracy/test/high": 0.26300743281875355,
|
| 103 |
+
"race/accuracy/test/middle": 0.3307799442896936,
|
| 104 |
+
"race/accuracy/group_average": 0.2968936885542236,
|
| 105 |
+
"race/accuracy/seq_average": 0.282732063234698,
|
| 106 |
+
"siqa/accuracy/dev": 0.3546571136131013,
|
| 107 |
+
"siqa/accuracy/group_average": 0.3546571136131013,
|
| 108 |
+
"siqa/accuracy/seq_average": 0.3546571136131013,
|
| 109 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.24488124488124488,
|
| 110 |
+
"commonsenseqa/accuracy/group_average": 0.24488124488124488,
|
| 111 |
+
"commonsenseqa/accuracy/seq_average": 0.24488124488124488
|
| 112 |
+
}
|
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-20000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.927763439360119, "val/accuracy": 0.4344094897073413, "val/perplexity": 18.68579182023476, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.665010250873447, "lambada/accuracy/total": 0.1907996894409938, "lambada/accuracy/openai_last_token": 0.7327251552795031, "lambada/perplexity": 19.304637599854928, "lambada/lm_loss": 3.4492595417288254, "lambada/lm_perplexity": 31.477076216079823, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.31260458957416754, "mean_loss": 2.796386845116783, "blimp/accuracy/passive_2": 0.899, "blimp/accuracy/determiner_noun_agreement_2": 0.949, "blimp/accuracy/ellipsis_n_bar_1": 0.781, "blimp/accuracy/tough_vs_raising_2": 0.823, "blimp/accuracy/tough_vs_raising_1": 0.575, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.876, "blimp/accuracy/principle_A_reconstruction": 0.477, "blimp/accuracy/wh_vs_that_with_gap": 0.513, "blimp/accuracy/principle_A_domain_2": 0.78, "blimp/accuracy/determiner_noun_agreement_1": 0.975, "blimp/accuracy/ellipsis_n_bar_2": 0.906, "blimp/accuracy/principle_A_domain_3": 0.524, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.861, "blimp/accuracy/animate_subject_trans": 0.871, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.807, "blimp/accuracy/distractor_agreement_relative_clause": 0.601, "blimp/accuracy/transitive": 0.84, "blimp/accuracy/sentential_subject_island": 0.432, "blimp/accuracy/adjunct_island": 0.73, "blimp/accuracy/intransitive": 0.695, "blimp/accuracy/existential_there_subject_raising": 0.832, "blimp/accuracy/irregular_past_participle_adjectives": 0.891, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.188, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.214, "blimp/accuracy/only_npi_scope": 0.499, "blimp/accuracy/superlative_quantifiers_2": 0.759, "blimp/accuracy/passive_1": 0.895, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.879, "blimp/accuracy/inchoative": 0.531, "blimp/accuracy/anaphor_gender_agreement": 0.911, "blimp/accuracy/principle_A_c_command": 0.535, "blimp/accuracy/only_npi_licensor_present": 0.442, "blimp/accuracy/expletive_it_object_raising": 0.766, "blimp/accuracy/left_branch_island_simple_question": 0.225, "blimp/accuracy/wh_questions_subject_gap": 0.885, "blimp/accuracy/existential_there_quantifiers_2": 0.533, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.886, "blimp/accuracy/sentential_negation_npi_scope": 0.377, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.788, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.874, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.834, "blimp/accuracy/principle_A_case_2": 0.964, "blimp/accuracy/distractor_agreement_relational_noun": 0.76, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.969, "blimp/accuracy/superlative_quantifiers_1": 0.657, "blimp/accuracy/wh_island": 0.734, "blimp/accuracy/principle_A_domain_1": 0.983, "blimp/accuracy/complex_NP_island": 0.568, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.905, "blimp/accuracy/irregular_past_participle_verbs": 0.892, "blimp/accuracy/drop_argument": 0.732, "blimp/accuracy/wh_questions_object_gap": 0.775, "blimp/accuracy/animate_subject_passive": 0.752, "blimp/accuracy/existential_there_quantifiers_1": 0.982, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.889, "blimp/accuracy/npi_present_2": 0.587, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.881, "blimp/accuracy/anaphor_number_agreement": 0.961, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.932, "blimp/accuracy/existential_there_object_raising": 0.716, "blimp/accuracy/matrix_question_npi_licensor_present": 0.101, "blimp/accuracy/npi_present_1": 0.553, "blimp/accuracy/wh_vs_that_no_gap": 0.951, "blimp/accuracy/left_branch_island_echo_question": 0.332, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.973, "blimp/accuracy/causative": 0.693, "blimp/accuracy/group_average": 0.7298656716417911, "blimp/accuracy/seq_average": 0.7298656716417911, "cbt/accuracy/NE": 0.6955128205128205, "cbt/accuracy/V": 0.8748, "cbt/accuracy/CN": 0.7608, "cbt/accuracy/P": 0.8508, "cbt/accuracy/group_average": 0.7954782051282051, "cbt/accuracy/seq_average": 0.7955182072829131, "hellaswag/accuracy/val": 0.2741485759808803, "hellaswag/accuracy/group_average": 0.2741485759808803, "hellaswag/accuracy/seq_average": 0.2741485759808803, "piqa/accuracy/val": 0.5625680087051143, "piqa/accuracy/group_average": 0.5625680087051143, "piqa/accuracy/seq_average": 0.5625680087051143, "ai2arc/accuracy/ARC-Easy": 0.29978858350951376, "ai2arc/accuracy/ARC-Challenge": 0.20429184549356222, "ai2arc/accuracy/group_average": 0.252040214501538, "ai2arc/accuracy/seq_average": 0.26827195467422094, "race/accuracy/test/high": 0.25443110348770726, "race/accuracy/test/middle": 0.31337047353760444, "race/accuracy/group_average": 0.2839007885126559, "race/accuracy/seq_average": 0.2715849209566275, "siqa/accuracy/dev": 0.3633572159672467, "siqa/accuracy/group_average": 0.3633572159672467, "siqa/accuracy/seq_average": 0.3633572159672467, "commonsenseqa/accuracy/dev_rand_split": 0.23423423423423423, "commonsenseqa/accuracy/group_average": 0.23423423423423423, "commonsenseqa/accuracy/seq_average": 0.23423423423423423}
|
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-30000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.8461814759269592, "val/accuracy": 0.4464498852926587, "val/perplexity": 17.221893906830505, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6310036226829387, "lambada/accuracy/total": 0.1999223602484472, "lambada/accuracy/openai_last_token": 0.7404891304347826, "lambada/perplexity": 17.055114946895998, "lambada/lm_loss": 3.4029719950049446, "lambada/lm_perplexity": 30.053285667027353, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.323186122770553, "mean_loss": 2.7385925493049488, "blimp/accuracy/passive_2": 0.87, "blimp/accuracy/determiner_noun_agreement_2": 0.969, "blimp/accuracy/ellipsis_n_bar_1": 0.802, "blimp/accuracy/tough_vs_raising_2": 0.85, "blimp/accuracy/tough_vs_raising_1": 0.603, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.88, "blimp/accuracy/principle_A_reconstruction": 0.352, "blimp/accuracy/wh_vs_that_with_gap": 0.612, "blimp/accuracy/principle_A_domain_2": 0.742, "blimp/accuracy/determiner_noun_agreement_1": 0.983, "blimp/accuracy/ellipsis_n_bar_2": 0.897, "blimp/accuracy/principle_A_domain_3": 0.538, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.881, "blimp/accuracy/animate_subject_trans": 0.894, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.851, "blimp/accuracy/distractor_agreement_relative_clause": 0.632, "blimp/accuracy/transitive": 0.828, "blimp/accuracy/sentential_subject_island": 0.496, "blimp/accuracy/adjunct_island": 0.798, "blimp/accuracy/intransitive": 0.758, "blimp/accuracy/existential_there_subject_raising": 0.859, "blimp/accuracy/irregular_past_participle_adjectives": 0.906, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.179, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.298, "blimp/accuracy/only_npi_scope": 0.683, "blimp/accuracy/superlative_quantifiers_2": 0.736, "blimp/accuracy/passive_1": 0.89, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.868, "blimp/accuracy/inchoative": 0.598, "blimp/accuracy/anaphor_gender_agreement": 0.949, "blimp/accuracy/principle_A_c_command": 0.563, "blimp/accuracy/only_npi_licensor_present": 0.53, "blimp/accuracy/expletive_it_object_raising": 0.769, "blimp/accuracy/left_branch_island_simple_question": 0.226, "blimp/accuracy/wh_questions_subject_gap": 0.856, "blimp/accuracy/existential_there_quantifiers_2": 0.37, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.908, "blimp/accuracy/sentential_negation_npi_scope": 0.48, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.818, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.813, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.845, "blimp/accuracy/principle_A_case_2": 0.957, "blimp/accuracy/distractor_agreement_relational_noun": 0.789, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.979, "blimp/accuracy/superlative_quantifiers_1": 0.846, "blimp/accuracy/wh_island": 0.676, "blimp/accuracy/principle_A_domain_1": 0.976, "blimp/accuracy/complex_NP_island": 0.515, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.946, "blimp/accuracy/irregular_past_participle_verbs": 0.827, "blimp/accuracy/drop_argument": 0.769, "blimp/accuracy/wh_questions_object_gap": 0.711, "blimp/accuracy/animate_subject_passive": 0.808, "blimp/accuracy/existential_there_quantifiers_1": 0.977, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.86, "blimp/accuracy/npi_present_2": 0.571, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.918, "blimp/accuracy/anaphor_number_agreement": 0.972, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.937, "blimp/accuracy/existential_there_object_raising": 0.737, "blimp/accuracy/matrix_question_npi_licensor_present": 0.128, "blimp/accuracy/npi_present_1": 0.53, "blimp/accuracy/wh_vs_that_no_gap": 0.918, "blimp/accuracy/left_branch_island_echo_question": 0.219, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.937, "blimp/accuracy/causative": 0.691, "blimp/accuracy/group_average": 0.7399104477611937, "blimp/accuracy/seq_average": 0.7399104477611941, "cbt/accuracy/NE": 0.7263621794871795, "cbt/accuracy/V": 0.8844, "cbt/accuracy/CN": 0.7764, "cbt/accuracy/P": 0.8556, "cbt/accuracy/group_average": 0.8106905448717949, "cbt/accuracy/seq_average": 0.8107242897158864, "hellaswag/accuracy/val": 0.27673770165305717, "hellaswag/accuracy/group_average": 0.27673770165305717, "hellaswag/accuracy/seq_average": 0.27673770165305717, "piqa/accuracy/val": 0.5799782372143635, "piqa/accuracy/group_average": 0.5799782372143635, "piqa/accuracy/seq_average": 0.5799782372143635, "ai2arc/accuracy/ARC-Easy": 0.3090909090909091, "ai2arc/accuracy/ARC-Challenge": 0.21373390557939914, "ai2arc/accuracy/group_average": 0.26141240733515414, "ai2arc/accuracy/seq_average": 0.2776203966005666, "race/accuracy/test/high": 0.26043453401943967, "race/accuracy/test/middle": 0.318941504178273, "race/accuracy/group_average": 0.28968801909885633, "race/accuracy/seq_average": 0.27746250506688286, "siqa/accuracy/dev": 0.3623336745138178, "siqa/accuracy/group_average": 0.3623336745138178, "siqa/accuracy/seq_average": 0.3623336745138178, "commonsenseqa/accuracy/dev_rand_split": 0.23505323505323505, "commonsenseqa/accuracy/group_average": 0.23505323505323505, "commonsenseqa/accuracy/seq_average": 0.23505323505323505}
|
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-40000.pth.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.7919435046968006,
|
| 3 |
+
"val/accuracy": 0.45302230592757936,
|
| 4 |
+
"val/perplexity": 16.312692808591017,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.6135883212829967,
|
| 8 |
+
"lambada/accuracy/total": 0.19157608695652173,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.7393245341614907,
|
| 10 |
+
"lambada/perplexity": 16.95823182935926,
|
| 11 |
+
"lambada/lm_loss": 3.3357095115573507,
|
| 12 |
+
"lambada/lm_perplexity": 28.09831223030425,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.3222991964420505,
|
| 16 |
+
"mean_loss": 2.7027659129898987,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.878,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.973,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.822,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.87,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.592,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.88,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.344,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.533,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.762,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.987,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.893,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.538,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.88,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.896,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.875,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.73,
|
| 33 |
+
"blimp/accuracy/transitive": 0.851,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.411,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.773,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.743,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.858,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.857,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.132,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.217,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.541,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.789,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.897,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.867,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.591,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.966,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.551,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.783,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.793,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.187,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.908,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.359,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.912,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.558,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.807,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.889,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.876,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.959,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.817,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.993,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.793,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.709,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.952,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.538,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.953,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.901,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.782,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.764,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.756,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.965,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.874,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.49,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.915,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.98,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.948,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.811,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.098,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.456,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.97,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.313,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.969,
|
| 83 |
+
"blimp/accuracy/causative": 0.687,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7457014925373137,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.7457014925373134,
|
| 86 |
+
"cbt/accuracy/NE": 0.7287660256410257,
|
| 87 |
+
"cbt/accuracy/V": 0.8904,
|
| 88 |
+
"cbt/accuracy/CN": 0.78,
|
| 89 |
+
"cbt/accuracy/P": 0.8556,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8136915064102563,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8137254901960784,
|
| 92 |
+
"hellaswag/accuracy/val": 0.2802230631348337,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.2802230631348337,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.2802230631348337,
|
| 95 |
+
"piqa/accuracy/val": 0.5832426550598476,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5832426550598476,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5832426550598476,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.3120507399577167,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.20085836909871244,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.2564545545282146,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.2753541076487252,
|
| 102 |
+
"race/accuracy/test/high": 0.26157804459691253,
|
| 103 |
+
"race/accuracy/test/middle": 0.3321727019498607,
|
| 104 |
+
"race/accuracy/group_average": 0.29687537327338664,
|
| 105 |
+
"race/accuracy/seq_average": 0.2821240372922578,
|
| 106 |
+
"siqa/accuracy/dev": 0.3572159672466735,
|
| 107 |
+
"siqa/accuracy/group_average": 0.3572159672466735,
|
| 108 |
+
"siqa/accuracy/seq_average": 0.3572159672466735,
|
| 109 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.23587223587223588,
|
| 110 |
+
"commonsenseqa/accuracy/group_average": 0.23587223587223588,
|
| 111 |
+
"commonsenseqa/accuracy/seq_average": 0.23587223587223588
|
| 112 |
+
}
|
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-50000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.744597904265873, "val/accuracy": 0.4605761331225198, "val/perplexity": 15.558356725401492, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.608001708984375, "lambada/accuracy/total": 0.21486801242236025, "lambada/accuracy/openai_last_token": 0.749805900621118, "lambada/perplexity": 14.954680203004754, "lambada/lm_loss": 3.318650318895724, "lambada/lm_perplexity": 27.62304308828747, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.33772207277244004, "mean_loss": 2.6762998066251242, "blimp/accuracy/passive_2": 0.896, "blimp/accuracy/determiner_noun_agreement_2": 0.98, "blimp/accuracy/ellipsis_n_bar_1": 0.826, "blimp/accuracy/tough_vs_raising_2": 0.861, "blimp/accuracy/tough_vs_raising_1": 0.574, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.892, "blimp/accuracy/principle_A_reconstruction": 0.26, "blimp/accuracy/wh_vs_that_with_gap": 0.487, "blimp/accuracy/principle_A_domain_2": 0.801, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.904, "blimp/accuracy/principle_A_domain_3": 0.561, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.886, "blimp/accuracy/animate_subject_trans": 0.876, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.86, "blimp/accuracy/distractor_agreement_relative_clause": 0.637, "blimp/accuracy/transitive": 0.838, "blimp/accuracy/sentential_subject_island": 0.437, "blimp/accuracy/adjunct_island": 0.727, "blimp/accuracy/intransitive": 0.757, "blimp/accuracy/existential_there_subject_raising": 0.849, "blimp/accuracy/irregular_past_participle_adjectives": 0.882, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.216, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.189, "blimp/accuracy/only_npi_scope": 0.625, "blimp/accuracy/superlative_quantifiers_2": 0.767, "blimp/accuracy/passive_1": 0.894, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.889, "blimp/accuracy/inchoative": 0.577, "blimp/accuracy/anaphor_gender_agreement": 0.95, "blimp/accuracy/principle_A_c_command": 0.574, "blimp/accuracy/only_npi_licensor_present": 0.409, "blimp/accuracy/expletive_it_object_raising": 0.793, "blimp/accuracy/left_branch_island_simple_question": 0.249, "blimp/accuracy/wh_questions_subject_gap": 0.93, "blimp/accuracy/existential_there_quantifiers_2": 0.279, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.912, "blimp/accuracy/sentential_negation_npi_scope": 0.459, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.827, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.903, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.847, "blimp/accuracy/principle_A_case_2": 0.951, "blimp/accuracy/distractor_agreement_relational_noun": 0.78, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.988, "blimp/accuracy/superlative_quantifiers_1": 0.713, "blimp/accuracy/wh_island": 0.793, "blimp/accuracy/principle_A_domain_1": 0.982, "blimp/accuracy/complex_NP_island": 0.534, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.954, "blimp/accuracy/irregular_past_participle_verbs": 0.845, "blimp/accuracy/drop_argument": 0.75, "blimp/accuracy/wh_questions_object_gap": 0.828, "blimp/accuracy/animate_subject_passive": 0.798, "blimp/accuracy/existential_there_quantifiers_1": 0.961, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.894, "blimp/accuracy/npi_present_2": 0.556, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.91, "blimp/accuracy/anaphor_number_agreement": 0.976, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.94, "blimp/accuracy/existential_there_object_raising": 0.833, "blimp/accuracy/matrix_question_npi_licensor_present": 0.151, "blimp/accuracy/npi_present_1": 0.468, "blimp/accuracy/wh_vs_that_no_gap": 0.978, "blimp/accuracy/left_branch_island_echo_question": 0.35, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.98, "blimp/accuracy/causative": 0.69, "blimp/accuracy/group_average": 0.7413880597014924, "blimp/accuracy/seq_average": 0.7413880597014926, "cbt/accuracy/NE": 0.7303685897435898, "cbt/accuracy/V": 0.8976, "cbt/accuracy/CN": 0.7972, "cbt/accuracy/P": 0.87, "cbt/accuracy/group_average": 0.8237921474358975, "cbt/accuracy/seq_average": 0.8238295318127251, "hellaswag/accuracy/val": 0.2846046604262099, "hellaswag/accuracy/group_average": 0.2846046604262099, "hellaswag/accuracy/seq_average": 0.2846046604262099, "piqa/accuracy/val": 0.5712731229597389, "piqa/accuracy/group_average": 0.5712731229597389, "piqa/accuracy/seq_average": 0.5712731229597389, "ai2arc/accuracy/ARC-Easy": 0.3120507399577167, "ai2arc/accuracy/ARC-Challenge": 0.21201716738197424, "ai2arc/accuracy/group_average": 0.2620339536698455, "ai2arc/accuracy/seq_average": 0.2790368271954674, "race/accuracy/test/high": 0.2564322469982847, "race/accuracy/test/middle": 0.3279944289693593, "race/accuracy/group_average": 0.292213337983822, "race/accuracy/seq_average": 0.2772598297527361, "siqa/accuracy/dev": 0.35977482088024565, "siqa/accuracy/group_average": 0.35977482088024565, "siqa/accuracy/seq_average": 0.35977482088024565, "commonsenseqa/accuracy/dev_rand_split": 0.23587223587223588, "commonsenseqa/accuracy/group_average": 0.23587223587223588, "commonsenseqa/accuracy/seq_average": 0.23587223587223588}
|
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-60000.pth.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.7088145906963046,
|
| 3 |
+
"val/accuracy": 0.4653746589781746,
|
| 4 |
+
"val/perplexity": 15.011470227214083,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.5780768542556287,
|
| 8 |
+
"lambada/accuracy/total": 0.22243788819875776,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.750194099378882,
|
| 10 |
+
"lambada/perplexity": 14.345165224725555,
|
| 11 |
+
"lambada/lm_loss": 3.264494879230292,
|
| 12 |
+
"lambada/lm_perplexity": 26.16689021233424,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.34390627358846615,
|
| 16 |
+
"mean_loss": 2.643445722475967,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.905,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.975,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.842,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.858,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.609,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.89,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.407,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.55,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.782,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.983,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.895,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.578,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.902,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.891,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.879,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.691,
|
| 33 |
+
"blimp/accuracy/transitive": 0.858,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.435,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.793,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.784,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.845,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.98,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.223,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.24,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.688,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.669,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.891,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.892,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.626,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.972,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.617,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.586,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.775,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.27,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.924,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.344,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.919,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.498,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.821,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.862,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.865,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.948,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.809,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.993,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.831,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.78,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.981,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.558,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.951,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.893,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.782,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.801,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.791,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.949,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.914,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.493,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.916,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.977,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.944,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.785,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.133,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.462,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.978,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.32,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.978,
|
| 83 |
+
"blimp/accuracy/causative": 0.696,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7563731343283583,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.7563731343283582,
|
| 86 |
+
"cbt/accuracy/NE": 0.7451923076923077,
|
| 87 |
+
"cbt/accuracy/V": 0.8992,
|
| 88 |
+
"cbt/accuracy/CN": 0.7976,
|
| 89 |
+
"cbt/accuracy/P": 0.8748,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8291980769230769,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8292316926770709,
|
| 92 |
+
"hellaswag/accuracy/val": 0.286795459071898,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.286795459071898,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.286795459071898,
|
| 95 |
+
"piqa/accuracy/val": 0.5854189336235038,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5854189336235038,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5854189336235038,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.3150105708245243,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.20429184549356222,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.2596512081590433,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.2784702549575071,
|
| 102 |
+
"race/accuracy/test/high": 0.26014865637507145,
|
| 103 |
+
"race/accuracy/test/middle": 0.33008356545961004,
|
| 104 |
+
"race/accuracy/group_average": 0.29511611091734075,
|
| 105 |
+
"race/accuracy/seq_average": 0.2805026347790839,
|
| 106 |
+
"siqa/accuracy/dev": 0.3490276356192426,
|
| 107 |
+
"siqa/accuracy/group_average": 0.3490276356192426,
|
| 108 |
+
"siqa/accuracy/seq_average": 0.3490276356192426,
|
| 109 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.23505323505323505,
|
| 110 |
+
"commonsenseqa/accuracy/group_average": 0.23505323505323505,
|
| 111 |
+
"commonsenseqa/accuracy/seq_average": 0.23505323505323505
|
| 112 |
+
}
|
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-70000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6803521050347223, "val/accuracy": 0.46969749813988093, "val/perplexity": 14.590229684883312, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6440372822447595, "lambada/accuracy/total": 0.23020186335403728, "lambada/accuracy/openai_last_token": 0.7542701863354038, "lambada/perplexity": 13.618648521677962, "lambada/lm_loss": 3.2510871683233042, "lambada/lm_perplexity": 25.818393604575284, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3499496807469591, "mean_loss": 2.6621946936397407, "blimp/accuracy/passive_2": 0.905, "blimp/accuracy/determiner_noun_agreement_2": 0.98, "blimp/accuracy/ellipsis_n_bar_1": 0.843, "blimp/accuracy/tough_vs_raising_2": 0.823, "blimp/accuracy/tough_vs_raising_1": 0.619, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.856, "blimp/accuracy/principle_A_reconstruction": 0.394, "blimp/accuracy/wh_vs_that_with_gap": 0.541, "blimp/accuracy/principle_A_domain_2": 0.789, "blimp/accuracy/determiner_noun_agreement_1": 0.987, "blimp/accuracy/ellipsis_n_bar_2": 0.909, "blimp/accuracy/principle_A_domain_3": 0.572, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.91, "blimp/accuracy/animate_subject_trans": 0.906, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.899, "blimp/accuracy/distractor_agreement_relative_clause": 0.694, "blimp/accuracy/transitive": 0.863, "blimp/accuracy/sentential_subject_island": 0.427, "blimp/accuracy/adjunct_island": 0.784, "blimp/accuracy/intransitive": 0.762, "blimp/accuracy/existential_there_subject_raising": 0.841, "blimp/accuracy/irregular_past_participle_adjectives": 0.87, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.226, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.304, "blimp/accuracy/only_npi_scope": 0.684, "blimp/accuracy/superlative_quantifiers_2": 0.813, "blimp/accuracy/passive_1": 0.894, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.881, "blimp/accuracy/inchoative": 0.587, "blimp/accuracy/anaphor_gender_agreement": 0.962, "blimp/accuracy/principle_A_c_command": 0.605, "blimp/accuracy/only_npi_licensor_present": 0.402, "blimp/accuracy/expletive_it_object_raising": 0.782, "blimp/accuracy/left_branch_island_simple_question": 0.282, "blimp/accuracy/wh_questions_subject_gap": 0.92, "blimp/accuracy/existential_there_quantifiers_2": 0.307, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.926, "blimp/accuracy/sentential_negation_npi_scope": 0.552, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.835, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.839, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.865, "blimp/accuracy/principle_A_case_2": 0.956, "blimp/accuracy/distractor_agreement_relational_noun": 0.797, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.994, "blimp/accuracy/superlative_quantifiers_1": 0.829, "blimp/accuracy/wh_island": 0.756, "blimp/accuracy/principle_A_domain_1": 0.976, "blimp/accuracy/complex_NP_island": 0.553, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.958, "blimp/accuracy/irregular_past_participle_verbs": 0.863, "blimp/accuracy/drop_argument": 0.763, "blimp/accuracy/wh_questions_object_gap": 0.788, "blimp/accuracy/animate_subject_passive": 0.789, "blimp/accuracy/existential_there_quantifiers_1": 0.959, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.904, "blimp/accuracy/npi_present_2": 0.47, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.914, "blimp/accuracy/anaphor_number_agreement": 0.979, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.954, "blimp/accuracy/existential_there_object_raising": 0.843, "blimp/accuracy/matrix_question_npi_licensor_present": 0.148, "blimp/accuracy/npi_present_1": 0.469, "blimp/accuracy/wh_vs_that_no_gap": 0.974, "blimp/accuracy/left_branch_island_echo_question": 0.326, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.967, "blimp/accuracy/causative": 0.695, "blimp/accuracy/group_average": 0.7531940298507461, "blimp/accuracy/seq_average": 0.7531940298507462, "cbt/accuracy/NE": 0.7455929487179487, "cbt/accuracy/V": 0.8984, "cbt/accuracy/CN": 0.7996, "cbt/accuracy/P": 0.8752, "cbt/accuracy/group_average": 0.8296982371794871, "cbt/accuracy/seq_average": 0.8297318927571028, "hellaswag/accuracy/val": 0.2865962955586537, "hellaswag/accuracy/group_average": 0.2865962955586537, "hellaswag/accuracy/seq_average": 0.2865962955586537, "piqa/accuracy/val": 0.5865070729053319, "piqa/accuracy/group_average": 0.5865070729053319, "piqa/accuracy/seq_average": 0.5865070729053319, "ai2arc/accuracy/ARC-Easy": 0.32642706131078225, "ai2arc/accuracy/ARC-Challenge": 0.20515021459227467, "ai2arc/accuracy/group_average": 0.26578863795152846, "ai2arc/accuracy/seq_average": 0.2864022662889518, "race/accuracy/test/high": 0.2612921669525443, "race/accuracy/test/middle": 0.3286908077994429, "race/accuracy/group_average": 0.29499148737599357, "race/accuracy/seq_average": 0.2809079854073774, "siqa/accuracy/dev": 0.3602865916069601, "siqa/accuracy/group_average": 0.3602865916069601, "siqa/accuracy/seq_average": 0.3602865916069601, "commonsenseqa/accuracy/dev_rand_split": 0.23095823095823095, "commonsenseqa/accuracy/group_average": 0.23095823095823095, "commonsenseqa/accuracy/seq_average": 0.23095823095823095}
|
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-80000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6567273821149553, "val/accuracy": 0.4727124410962302, "val/perplexity": 14.24957928066503, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5068359375, "lambada/accuracy/total": 0.23486024844720496, "lambada/accuracy/openai_last_token": 0.7585403726708074, "lambada/perplexity": 13.102069321724693, "lambada/lm_loss": 3.226651154896998, "lambada/lm_perplexity": 25.195140910094015, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.35378634477171755, "mean_loss": 2.5817816598074774, "blimp/accuracy/passive_2": 0.908, "blimp/accuracy/determiner_noun_agreement_2": 0.987, "blimp/accuracy/ellipsis_n_bar_1": 0.838, "blimp/accuracy/tough_vs_raising_2": 0.86, "blimp/accuracy/tough_vs_raising_1": 0.608, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.905, "blimp/accuracy/principle_A_reconstruction": 0.417, "blimp/accuracy/wh_vs_that_with_gap": 0.515, "blimp/accuracy/principle_A_domain_2": 0.818, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.886, "blimp/accuracy/principle_A_domain_3": 0.589, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.912, "blimp/accuracy/animate_subject_trans": 0.905, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.887, "blimp/accuracy/distractor_agreement_relative_clause": 0.709, "blimp/accuracy/transitive": 0.852, "blimp/accuracy/sentential_subject_island": 0.42, "blimp/accuracy/adjunct_island": 0.81, "blimp/accuracy/intransitive": 0.777, "blimp/accuracy/existential_there_subject_raising": 0.841, "blimp/accuracy/irregular_past_participle_adjectives": 0.831, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.228, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.243, "blimp/accuracy/only_npi_scope": 0.65, "blimp/accuracy/superlative_quantifiers_2": 0.849, "blimp/accuracy/passive_1": 0.9, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.887, "blimp/accuracy/inchoative": 0.603, "blimp/accuracy/anaphor_gender_agreement": 0.97, "blimp/accuracy/principle_A_c_command": 0.617, "blimp/accuracy/only_npi_licensor_present": 0.564, "blimp/accuracy/expletive_it_object_raising": 0.794, "blimp/accuracy/left_branch_island_simple_question": 0.276, "blimp/accuracy/wh_questions_subject_gap": 0.906, "blimp/accuracy/existential_there_quantifiers_2": 0.242, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.922, "blimp/accuracy/sentential_negation_npi_scope": 0.568, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.827, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.878, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.866, "blimp/accuracy/principle_A_case_2": 0.954, "blimp/accuracy/distractor_agreement_relational_noun": 0.798, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.751, "blimp/accuracy/wh_island": 0.819, "blimp/accuracy/principle_A_domain_1": 0.974, "blimp/accuracy/complex_NP_island": 0.591, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.968, "blimp/accuracy/irregular_past_participle_verbs": 0.875, "blimp/accuracy/drop_argument": 0.763, "blimp/accuracy/wh_questions_object_gap": 0.796, "blimp/accuracy/animate_subject_passive": 0.78, "blimp/accuracy/existential_there_quantifiers_1": 0.957, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.882, "blimp/accuracy/npi_present_2": 0.54, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.935, "blimp/accuracy/anaphor_number_agreement": 0.98, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.954, "blimp/accuracy/existential_there_object_raising": 0.831, "blimp/accuracy/matrix_question_npi_licensor_present": 0.183, "blimp/accuracy/npi_present_1": 0.508, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.324, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974, "blimp/accuracy/causative": 0.698, "blimp/accuracy/group_average": 0.7590149253731341, "blimp/accuracy/seq_average": 0.7590149253731343, "cbt/accuracy/NE": 0.7580128205128205, "cbt/accuracy/V": 0.9024, "cbt/accuracy/CN": 0.8064, "cbt/accuracy/P": 0.8796, "cbt/accuracy/group_average": 0.8366032051282051, "cbt/accuracy/seq_average": 0.8366346538615446, "hellaswag/accuracy/val": 0.28818960366460866, "hellaswag/accuracy/group_average": 0.28818960366460866, "hellaswag/accuracy/seq_average": 0.28818960366460866, "piqa/accuracy/val": 0.5826985854189336, "piqa/accuracy/group_average": 0.5826985854189336, "piqa/accuracy/seq_average": 0.5826985854189336, "ai2arc/accuracy/ARC-Easy": 0.32600422832980974, "ai2arc/accuracy/ARC-Challenge": 0.21716738197424892, "ai2arc/accuracy/group_average": 0.27158580515202935, "ai2arc/accuracy/seq_average": 0.29008498583569403, "race/accuracy/test/high": 0.26186392224128074, "race/accuracy/test/middle": 0.33913649025069637, "race/accuracy/group_average": 0.30050020624598855, "race/accuracy/seq_average": 0.2843534657478719, "siqa/accuracy/dev": 0.34851586489252817, "siqa/accuracy/group_average": 0.34851586489252817, "siqa/accuracy/seq_average": 0.34851586489252817, "commonsenseqa/accuracy/dev_rand_split": 0.23996723996723995, "commonsenseqa/accuracy/group_average": 0.23996723996723995, "commonsenseqa/accuracy/seq_average": 0.23996723996723995}
|
Pretrain_language_model/save/slimpajama_xmoe_no_attmoe_154M_standard_lb/export/result-model-90000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.645333668542287, "val/accuracy": 0.4747556656125992, "val/perplexity": 14.088145069790919, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.625901500630823, "lambada/accuracy/total": 0.23680124223602483, "lambada/accuracy/openai_last_token": 0.7536878881987578, "lambada/perplexity": 12.904226120983314, "lambada/lm_loss": 3.200719139191069, "lambada/lm_perplexity": 24.55017884618719, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.355778453924312, "mean_loss": 2.635617584586555, "blimp/accuracy/passive_2": 0.908, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.822, "blimp/accuracy/tough_vs_raising_2": 0.858, "blimp/accuracy/tough_vs_raising_1": 0.619, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.899, "blimp/accuracy/principle_A_reconstruction": 0.409, "blimp/accuracy/wh_vs_that_with_gap": 0.495, "blimp/accuracy/principle_A_domain_2": 0.8, "blimp/accuracy/determiner_noun_agreement_1": 0.984, "blimp/accuracy/ellipsis_n_bar_2": 0.895, "blimp/accuracy/principle_A_domain_3": 0.567, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.905, "blimp/accuracy/animate_subject_trans": 0.894, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.891, "blimp/accuracy/distractor_agreement_relative_clause": 0.708, "blimp/accuracy/transitive": 0.86, "blimp/accuracy/sentential_subject_island": 0.443, "blimp/accuracy/adjunct_island": 0.801, "blimp/accuracy/intransitive": 0.792, "blimp/accuracy/existential_there_subject_raising": 0.865, "blimp/accuracy/irregular_past_participle_adjectives": 0.883, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.242, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.235, "blimp/accuracy/only_npi_scope": 0.617, "blimp/accuracy/superlative_quantifiers_2": 0.8, "blimp/accuracy/passive_1": 0.889, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.895, "blimp/accuracy/inchoative": 0.61, "blimp/accuracy/anaphor_gender_agreement": 0.969, "blimp/accuracy/principle_A_c_command": 0.618, "blimp/accuracy/only_npi_licensor_present": 0.526, "blimp/accuracy/expletive_it_object_raising": 0.796, "blimp/accuracy/left_branch_island_simple_question": 0.283, "blimp/accuracy/wh_questions_subject_gap": 0.919, "blimp/accuracy/existential_there_quantifiers_2": 0.376, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.928, "blimp/accuracy/sentential_negation_npi_scope": 0.543, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.836, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.848, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.893, "blimp/accuracy/principle_A_case_2": 0.964, "blimp/accuracy/distractor_agreement_relational_noun": 0.827, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.992, "blimp/accuracy/superlative_quantifiers_1": 0.812, "blimp/accuracy/wh_island": 0.788, "blimp/accuracy/principle_A_domain_1": 0.977, "blimp/accuracy/complex_NP_island": 0.575, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.961, "blimp/accuracy/irregular_past_participle_verbs": 0.906, "blimp/accuracy/drop_argument": 0.762, "blimp/accuracy/wh_questions_object_gap": 0.802, "blimp/accuracy/animate_subject_passive": 0.791, "blimp/accuracy/existential_there_quantifiers_1": 0.975, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.899, "blimp/accuracy/npi_present_2": 0.559, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.928, "blimp/accuracy/anaphor_number_agreement": 0.987, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.957, "blimp/accuracy/existential_there_object_raising": 0.844, "blimp/accuracy/matrix_question_npi_licensor_present": 0.198, "blimp/accuracy/npi_present_1": 0.557, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.363, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.977, "blimp/accuracy/causative": 0.708, "blimp/accuracy/group_average": 0.7640895522388056, "blimp/accuracy/seq_average": 0.7640895522388059, "cbt/accuracy/NE": 0.750801282051282, "cbt/accuracy/V": 0.9084, "cbt/accuracy/CN": 0.8084, "cbt/accuracy/P": 0.8844, "cbt/accuracy/group_average": 0.8380003205128205, "cbt/accuracy/seq_average": 0.8380352140856343, "hellaswag/accuracy/val": 0.29117705636327423, "hellaswag/accuracy/group_average": 0.29117705636327423, "hellaswag/accuracy/seq_average": 0.29117705636327423, "piqa/accuracy/val": 0.5919477693144722, "piqa/accuracy/group_average": 0.5919477693144722, "piqa/accuracy/seq_average": 0.5919477693144722, "ai2arc/accuracy/ARC-Easy": 0.32558139534883723, "ai2arc/accuracy/ARC-Challenge": 0.21630901287553647, "ai2arc/accuracy/group_average": 0.27094520411218687, "ai2arc/accuracy/seq_average": 0.2895184135977337, "race/accuracy/test/high": 0.26500857632933106, "race/accuracy/test/middle": 0.3307799442896936, "race/accuracy/group_average": 0.2978942603095123, "race/accuracy/seq_average": 0.28415079043372515, "siqa/accuracy/dev": 0.35516888433981575, "siqa/accuracy/group_average": 0.35516888433981575, "siqa/accuracy/seq_average": 0.35516888433981575, "commonsenseqa/accuracy/dev_rand_split": 0.24078624078624078, "commonsenseqa/accuracy/group_average": 0.24078624078624078, "commonsenseqa/accuracy/seq_average": 0.24078624078624078}
|