Upload folder using huggingface_hub
#421
by
DavidNguyen
- opened
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-10000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-100000.pth.json +112 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-20000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-30000.pth.json +112 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-40000.pth.json +112 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-50000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-60000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-70000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-80000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-90000.pth.json +1 -0
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-10000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 3.1148235987103177, "val/accuracy": 0.41378348214285715, "val/perplexity": 22.52945577788976, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.78603216135724, "lambada/accuracy/total": 0.13451086956521738, "lambada/accuracy/openai_last_token": 0.7096273291925466, "lambada/perplexity": 34.304580879461504, "lambada/lm_loss": 3.6331751076954024, "lambada/lm_perplexity": 37.83274917006765, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.27414717585403725, "mean_loss": 2.9504278800337786, "blimp/accuracy/passive_2": 0.896, "blimp/accuracy/determiner_noun_agreement_2": 0.966, "blimp/accuracy/ellipsis_n_bar_1": 0.646, "blimp/accuracy/tough_vs_raising_2": 0.82, "blimp/accuracy/tough_vs_raising_1": 0.434, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.864, "blimp/accuracy/principle_A_reconstruction": 0.336, "blimp/accuracy/wh_vs_that_with_gap": 0.44, "blimp/accuracy/principle_A_domain_2": 0.707, "blimp/accuracy/determiner_noun_agreement_1": 0.964, "blimp/accuracy/ellipsis_n_bar_2": 0.845, "blimp/accuracy/principle_A_domain_3": 0.523, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.858, "blimp/accuracy/animate_subject_trans": 0.841, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.733, "blimp/accuracy/distractor_agreement_relative_clause": 0.451, "blimp/accuracy/transitive": 0.781, "blimp/accuracy/sentential_subject_island": 0.345, "blimp/accuracy/adjunct_island": 0.711, "blimp/accuracy/intransitive": 0.675, "blimp/accuracy/existential_there_subject_raising": 0.833, "blimp/accuracy/irregular_past_participle_adjectives": 0.938, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.191, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.163, "blimp/accuracy/only_npi_scope": 0.468, "blimp/accuracy/superlative_quantifiers_2": 0.754, "blimp/accuracy/passive_1": 0.895, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.864, "blimp/accuracy/inchoative": 0.509, "blimp/accuracy/anaphor_gender_agreement": 0.906, "blimp/accuracy/principle_A_c_command": 0.465, "blimp/accuracy/only_npi_licensor_present": 0.18, "blimp/accuracy/expletive_it_object_raising": 0.744, "blimp/accuracy/left_branch_island_simple_question": 0.224, "blimp/accuracy/wh_questions_subject_gap": 0.827, "blimp/accuracy/existential_there_quantifiers_2": 0.461, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.923, "blimp/accuracy/sentential_negation_npi_scope": 0.314, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.728, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.808, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.786, "blimp/accuracy/principle_A_case_2": 0.881, "blimp/accuracy/distractor_agreement_relational_noun": 0.704, "blimp/accuracy/sentential_negation_npi_licensor_present": 1.0, "blimp/accuracy/superlative_quantifiers_1": 0.593, "blimp/accuracy/wh_island": 0.707, "blimp/accuracy/principle_A_domain_1": 0.973, "blimp/accuracy/complex_NP_island": 0.497, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.937, "blimp/accuracy/irregular_past_participle_verbs": 0.763, "blimp/accuracy/drop_argument": 0.749, "blimp/accuracy/wh_questions_object_gap": 0.636, "blimp/accuracy/animate_subject_passive": 0.752, "blimp/accuracy/existential_there_quantifiers_1": 0.912, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.847, "blimp/accuracy/npi_present_2": 0.525, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.796, "blimp/accuracy/anaphor_number_agreement": 0.967, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.906, "blimp/accuracy/existential_there_object_raising": 0.808, "blimp/accuracy/matrix_question_npi_licensor_present": 0.053, "blimp/accuracy/npi_present_1": 0.514, "blimp/accuracy/wh_vs_that_no_gap": 0.948, "blimp/accuracy/left_branch_island_echo_question": 0.428, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.951, "blimp/accuracy/causative": 0.649, "blimp/accuracy/group_average": 0.6912388059701494, "blimp/accuracy/seq_average": 0.6912388059701493, "cbt/accuracy/NE": 0.6642628205128205, "cbt/accuracy/V": 0.8456, "cbt/accuracy/CN": 0.698, "cbt/accuracy/P": 0.8252, "cbt/accuracy/group_average": 0.7582657051282051, "cbt/accuracy/seq_average": 0.7583033213285314, "hellaswag/accuracy/val": 0.269169488149771, "hellaswag/accuracy/group_average": 0.269169488149771, "hellaswag/accuracy/seq_average": 0.269169488149771, "piqa/accuracy/val": 0.5544069640914037, "piqa/accuracy/group_average": 0.5544069640914037, "piqa/accuracy/seq_average": 0.5544069640914037, "ai2arc/accuracy/ARC-Easy": 0.29809725158562367, "ai2arc/accuracy/ARC-Challenge": 0.20085836909871244, "ai2arc/accuracy/group_average": 0.24947781034216804, "ai2arc/accuracy/seq_average": 0.2660056657223796, "race/accuracy/test/high": 0.2550028587764437, "race/accuracy/test/middle": 0.33147632311977715, "race/accuracy/group_average": 0.2932395909481104, "race/accuracy/seq_average": 0.2772598297527361, "siqa/accuracy/dev": 0.3526100307062436, "siqa/accuracy/group_average": 0.3526100307062436, "siqa/accuracy/seq_average": 0.3526100307062436, "commonsenseqa/accuracy/dev_rand_split": 0.24078624078624078, "commonsenseqa/accuracy/group_average": 0.24078624078624078, "commonsenseqa/accuracy/seq_average": 0.24078624078624078}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-100000.pth.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.6843065534319197,
|
| 3 |
+
"val/accuracy": 0.46967134021577384,
|
| 4 |
+
"val/perplexity": 14.648040224337864,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.4490627502062305,
|
| 8 |
+
"lambada/accuracy/total": 0.22748447204968944,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.7548524844720497,
|
| 10 |
+
"lambada/perplexity": 13.870468192241985,
|
| 11 |
+
"lambada/lm_loss": 3.2376076954022026,
|
| 12 |
+
"lambada/lm_perplexity": 25.47271031547893,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.34857790613273165,
|
| 16 |
+
"mean_loss": 2.566684651819075,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.907,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.975,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.802,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.857,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.567,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.896,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.4,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.565,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.765,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.983,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.883,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.501,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.9,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.89,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.869,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.562,
|
| 33 |
+
"blimp/accuracy/transitive": 0.843,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.296,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.775,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.79,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.888,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.983,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.254,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.278,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.733,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.798,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.895,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.898,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.639,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.941,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.622,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.547,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.771,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.26,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.908,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.33,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.941,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.652,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.794,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.87,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.908,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.958,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.811,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.993,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.617,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.792,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.983,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.534,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.958,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.826,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.816,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.769,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.787,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.968,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.872,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.568,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.914,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.983,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.953,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.83,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.165,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.504,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.97,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.436,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.977,
|
| 83 |
+
"blimp/accuracy/causative": 0.694,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7554328358208955,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.7554328358208955,
|
| 86 |
+
"cbt/accuracy/NE": 0.7439903846153846,
|
| 87 |
+
"cbt/accuracy/V": 0.9028,
|
| 88 |
+
"cbt/accuracy/CN": 0.8044,
|
| 89 |
+
"cbt/accuracy/P": 0.8824,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8333975961538461,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8334333733493398,
|
| 92 |
+
"hellaswag/accuracy/val": 0.28540131447918743,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.28540131447918743,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.28540131447918743,
|
| 95 |
+
"piqa/accuracy/val": 0.5761697497279652,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5761697497279652,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5761697497279652,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.33488372093023255,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.21974248927038625,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.2773131051003094,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.29688385269121814,
|
| 102 |
+
"race/accuracy/test/high": 0.2624356775300172,
|
| 103 |
+
"race/accuracy/test/middle": 0.3412256267409471,
|
| 104 |
+
"race/accuracy/group_average": 0.3018306521354821,
|
| 105 |
+
"race/accuracy/seq_average": 0.2853668423186056,
|
| 106 |
+
"siqa/accuracy/dev": 0.3607983623336745,
|
| 107 |
+
"siqa/accuracy/group_average": 0.3607983623336745,
|
| 108 |
+
"siqa/accuracy/seq_average": 0.3607983623336745,
|
| 109 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.25143325143325146,
|
| 110 |
+
"commonsenseqa/accuracy/group_average": 0.25143325143325146,
|
| 111 |
+
"commonsenseqa/accuracy/seq_average": 0.25143325143325146
|
| 112 |
+
}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-20000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.962588113451761, "val/accuracy": 0.43260459294394843, "val/perplexity": 19.347981783692912, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6079476871845886, "lambada/accuracy/total": 0.15838509316770186, "lambada/accuracy/openai_last_token": 0.7199145962732919, "lambada/perplexity": 24.16518713186894, "lambada/lm_loss": 3.4705702408689385, "lambada/lm_perplexity": 32.155073354709984, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.29549484305582513, "mean_loss": 2.785267900318175, "blimp/accuracy/passive_2": 0.873, "blimp/accuracy/determiner_noun_agreement_2": 0.954, "blimp/accuracy/ellipsis_n_bar_1": 0.759, "blimp/accuracy/tough_vs_raising_2": 0.822, "blimp/accuracy/tough_vs_raising_1": 0.497, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.861, "blimp/accuracy/principle_A_reconstruction": 0.424, "blimp/accuracy/wh_vs_that_with_gap": 0.463, "blimp/accuracy/principle_A_domain_2": 0.797, "blimp/accuracy/determiner_noun_agreement_1": 0.97, "blimp/accuracy/ellipsis_n_bar_2": 0.86, "blimp/accuracy/principle_A_domain_3": 0.574, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.858, "blimp/accuracy/animate_subject_trans": 0.867, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.837, "blimp/accuracy/distractor_agreement_relative_clause": 0.447, "blimp/accuracy/transitive": 0.821, "blimp/accuracy/sentential_subject_island": 0.322, "blimp/accuracy/adjunct_island": 0.619, "blimp/accuracy/intransitive": 0.725, "blimp/accuracy/existential_there_subject_raising": 0.854, "blimp/accuracy/irregular_past_participle_adjectives": 0.889, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.108, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.19, "blimp/accuracy/only_npi_scope": 0.601, "blimp/accuracy/superlative_quantifiers_2": 0.869, "blimp/accuracy/passive_1": 0.903, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.858, "blimp/accuracy/inchoative": 0.55, "blimp/accuracy/anaphor_gender_agreement": 0.85, "blimp/accuracy/principle_A_c_command": 0.474, "blimp/accuracy/only_npi_licensor_present": 0.313, "blimp/accuracy/expletive_it_object_raising": 0.731, "blimp/accuracy/left_branch_island_simple_question": 0.144, "blimp/accuracy/wh_questions_subject_gap": 0.876, "blimp/accuracy/existential_there_quantifiers_2": 0.391, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.897, "blimp/accuracy/sentential_negation_npi_scope": 0.418, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.843, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.867, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.822, "blimp/accuracy/principle_A_case_2": 0.937, "blimp/accuracy/distractor_agreement_relational_noun": 0.765, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.995, "blimp/accuracy/superlative_quantifiers_1": 0.545, "blimp/accuracy/wh_island": 0.78, "blimp/accuracy/principle_A_domain_1": 0.977, "blimp/accuracy/complex_NP_island": 0.508, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.932, "blimp/accuracy/irregular_past_participle_verbs": 0.806, "blimp/accuracy/drop_argument": 0.746, "blimp/accuracy/wh_questions_object_gap": 0.729, "blimp/accuracy/animate_subject_passive": 0.754, "blimp/accuracy/existential_there_quantifiers_1": 0.978, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.839, "blimp/accuracy/npi_present_2": 0.529, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.833, "blimp/accuracy/anaphor_number_agreement": 0.952, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.906, "blimp/accuracy/existential_there_object_raising": 0.755, "blimp/accuracy/matrix_question_npi_licensor_present": 0.05, "blimp/accuracy/npi_present_1": 0.482, "blimp/accuracy/wh_vs_that_no_gap": 0.963, "blimp/accuracy/left_branch_island_echo_question": 0.39, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.978, "blimp/accuracy/causative": 0.656, "blimp/accuracy/group_average": 0.7101940298507462, "blimp/accuracy/seq_average": 0.7101940298507463, "cbt/accuracy/NE": 0.6895032051282052, "cbt/accuracy/V": 0.8672, "cbt/accuracy/CN": 0.746, "cbt/accuracy/P": 0.8412, "cbt/accuracy/group_average": 0.7859758012820512, "cbt/accuracy/seq_average": 0.786014405762305, "hellaswag/accuracy/val": 0.27145986855208126, "hellaswag/accuracy/group_average": 0.27145986855208126, "hellaswag/accuracy/seq_average": 0.27145986855208126, "piqa/accuracy/val": 0.5669205658324266, "piqa/accuracy/group_average": 0.5669205658324266, "piqa/accuracy/seq_average": 0.5669205658324266, "ai2arc/accuracy/ARC-Easy": 0.30739957716701904, "ai2arc/accuracy/ARC-Challenge": 0.1965665236051502, "ai2arc/accuracy/group_average": 0.2519830503860846, "ai2arc/accuracy/seq_average": 0.2708215297450425, "race/accuracy/test/high": 0.252715837621498, "race/accuracy/test/middle": 0.32729805013927576, "race/accuracy/group_average": 0.2900069438803869, "race/accuracy/seq_average": 0.2744223753546818, "siqa/accuracy/dev": 0.3556806550665302, "siqa/accuracy/group_average": 0.3556806550665302, "siqa/accuracy/seq_average": 0.3556806550665302, "commonsenseqa/accuracy/dev_rand_split": 0.24488124488124488, "commonsenseqa/accuracy/group_average": 0.24488124488124488, "commonsenseqa/accuracy/seq_average": 0.24488124488124488}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-30000.pth.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.8807765415736606,
|
| 3 |
+
"val/accuracy": 0.44323149181547616,
|
| 4 |
+
"val/perplexity": 17.828112075884437,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.608717260893828,
|
| 8 |
+
"lambada/accuracy/total": 0.17798913043478262,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.7323369565217391,
|
| 10 |
+
"lambada/perplexity": 20.861003584231046,
|
| 11 |
+
"lambada/lm_loss": 3.4353615443604486,
|
| 12 |
+
"lambada/lm_perplexity": 31.0426338261074,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.3106103111251294,
|
| 16 |
+
"mean_loss": 2.744746901233744,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.876,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.951,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.742,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.852,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.582,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.876,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.413,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.592,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.796,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.983,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.866,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.523,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.863,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.861,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.868,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.474,
|
| 33 |
+
"blimp/accuracy/transitive": 0.831,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.313,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.713,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.774,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.882,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.981,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.187,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.269,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.856,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.7,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.886,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.865,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.621,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.92,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.557,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.381,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.779,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.204,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.845,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.338,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.925,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.541,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.8,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.799,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.844,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.946,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.746,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.993,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.628,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.78,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.975,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.508,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.919,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.796,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.807,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.652,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.8,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.953,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.852,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.508,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.885,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.97,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.947,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.779,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.148,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.491,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.949,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.367,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.946,
|
| 83 |
+
"blimp/accuracy/causative": 0.674,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7305671641791045,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.7305671641791045,
|
| 86 |
+
"cbt/accuracy/NE": 0.7151442307692307,
|
| 87 |
+
"cbt/accuracy/V": 0.8716,
|
| 88 |
+
"cbt/accuracy/CN": 0.764,
|
| 89 |
+
"cbt/accuracy/P": 0.8604,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8027860576923078,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8028211284513805,
|
| 92 |
+
"hellaswag/accuracy/val": 0.27165903206532566,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.27165903206532566,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.27165903206532566,
|
| 95 |
+
"piqa/accuracy/val": 0.5652883569096845,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5652883569096845,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5652883569096845,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.30782241014799155,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.1905579399141631,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.24919017503107732,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.26912181303116145,
|
| 102 |
+
"race/accuracy/test/high": 0.25900514579759865,
|
| 103 |
+
"race/accuracy/test/middle": 0.3245125348189415,
|
| 104 |
+
"race/accuracy/group_average": 0.29175884030827004,
|
| 105 |
+
"race/accuracy/seq_average": 0.27807053100932305,
|
| 106 |
+
"siqa/accuracy/dev": 0.35670419651995905,
|
| 107 |
+
"siqa/accuracy/group_average": 0.35670419651995905,
|
| 108 |
+
"siqa/accuracy/seq_average": 0.35670419651995905,
|
| 109 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.23914823914823916,
|
| 110 |
+
"commonsenseqa/accuracy/group_average": 0.23914823914823916,
|
| 111 |
+
"commonsenseqa/accuracy/seq_average": 0.23914823914823916
|
| 112 |
+
}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-40000.pth.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.8288164895678323,
|
| 3 |
+
"val/accuracy": 0.4496256510416667,
|
| 4 |
+
"val/perplexity": 16.925417557924394,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.5296702888441382,
|
| 8 |
+
"lambada/accuracy/total": 0.18012422360248448,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.7366071428571429,
|
| 10 |
+
"lambada/perplexity": 20.131109968885948,
|
| 11 |
+
"lambada/lm_loss": 3.3560838997468947,
|
| 12 |
+
"lambada/lm_perplexity": 28.67666998690144,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.3148749373220756,
|
| 16 |
+
"mean_loss": 2.6792433892059853,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.883,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.97,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.745,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.868,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.524,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.858,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.322,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.564,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.779,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.977,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.88,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.546,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.888,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.883,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.851,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.517,
|
| 33 |
+
"blimp/accuracy/transitive": 0.82,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.315,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.754,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.762,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.867,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.986,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.192,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.261,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.679,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.771,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.887,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.878,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.613,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.923,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.548,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.603,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.776,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.232,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.907,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.292,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.94,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.529,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.788,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.894,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.885,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.944,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.772,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.999,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.686,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.736,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.972,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.475,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.958,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.822,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.802,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.738,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.76,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.962,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.863,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.532,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.896,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.959,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.942,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.836,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.144,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.427,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.96,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.432,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.973,
|
| 83 |
+
"blimp/accuracy/causative": 0.671,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7375820895522388,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.7375820895522388,
|
| 86 |
+
"cbt/accuracy/NE": 0.7275641025641025,
|
| 87 |
+
"cbt/accuracy/V": 0.8796,
|
| 88 |
+
"cbt/accuracy/CN": 0.7728,
|
| 89 |
+
"cbt/accuracy/P": 0.8612,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8102910256410256,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8103241296518607,
|
| 92 |
+
"hellaswag/accuracy/val": 0.27853017327225654,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.27853017327225654,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.27853017327225654,
|
| 95 |
+
"piqa/accuracy/val": 0.5761697497279652,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5761697497279652,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5761697497279652,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.30824524312896406,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.20085836909871244,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.25455180611383826,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.27280453257790366,
|
| 102 |
+
"race/accuracy/test/high": 0.2612921669525443,
|
| 103 |
+
"race/accuracy/test/middle": 0.33008356545961004,
|
| 104 |
+
"race/accuracy/group_average": 0.2956878662060772,
|
| 105 |
+
"race/accuracy/seq_average": 0.2813133360356709,
|
| 106 |
+
"siqa/accuracy/dev": 0.35516888433981575,
|
| 107 |
+
"siqa/accuracy/group_average": 0.35516888433981575,
|
| 108 |
+
"siqa/accuracy/seq_average": 0.35516888433981575,
|
| 109 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.24815724815724816,
|
| 110 |
+
"commonsenseqa/accuracy/group_average": 0.24815724815724816,
|
| 111 |
+
"commonsenseqa/accuracy/seq_average": 0.24815724815724816
|
| 112 |
+
}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-50000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.785560002402654, "val/accuracy": 0.4557068839905754, "val/perplexity": 16.20889235432367, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.567660124405571, "lambada/accuracy/total": 0.20011645962732919, "lambada/accuracy/openai_last_token": 0.7459239130434783, "lambada/perplexity": 17.668960608240095, "lambada/lm_loss": 3.352461489465127, "lambada/lm_perplexity": 28.572979241290106, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3279116718089523, "mean_loss": 2.6766100634041123, "blimp/accuracy/passive_2": 0.89, "blimp/accuracy/determiner_noun_agreement_2": 0.979, "blimp/accuracy/ellipsis_n_bar_1": 0.793, "blimp/accuracy/tough_vs_raising_2": 0.899, "blimp/accuracy/tough_vs_raising_1": 0.474, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.852, "blimp/accuracy/principle_A_reconstruction": 0.356, "blimp/accuracy/wh_vs_that_with_gap": 0.562, "blimp/accuracy/principle_A_domain_2": 0.83, "blimp/accuracy/determiner_noun_agreement_1": 0.98, "blimp/accuracy/ellipsis_n_bar_2": 0.884, "blimp/accuracy/principle_A_domain_3": 0.534, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.891, "blimp/accuracy/animate_subject_trans": 0.873, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.852, "blimp/accuracy/distractor_agreement_relative_clause": 0.5, "blimp/accuracy/transitive": 0.822, "blimp/accuracy/sentential_subject_island": 0.348, "blimp/accuracy/adjunct_island": 0.742, "blimp/accuracy/intransitive": 0.779, "blimp/accuracy/existential_there_subject_raising": 0.873, "blimp/accuracy/irregular_past_participle_adjectives": 0.975, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.263, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.244, "blimp/accuracy/only_npi_scope": 0.74, "blimp/accuracy/superlative_quantifiers_2": 0.748, "blimp/accuracy/passive_1": 0.898, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.882, "blimp/accuracy/inchoative": 0.581, "blimp/accuracy/anaphor_gender_agreement": 0.891, "blimp/accuracy/principle_A_c_command": 0.601, "blimp/accuracy/only_npi_licensor_present": 0.522, "blimp/accuracy/expletive_it_object_raising": 0.78, "blimp/accuracy/left_branch_island_simple_question": 0.242, "blimp/accuracy/wh_questions_subject_gap": 0.906, "blimp/accuracy/existential_there_quantifiers_2": 0.356, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.933, "blimp/accuracy/sentential_negation_npi_scope": 0.526, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.826, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.876, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.872, "blimp/accuracy/principle_A_case_2": 0.968, "blimp/accuracy/distractor_agreement_relational_noun": 0.75, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.996, "blimp/accuracy/superlative_quantifiers_1": 0.802, "blimp/accuracy/wh_island": 0.646, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.557, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.951, "blimp/accuracy/irregular_past_participle_verbs": 0.809, "blimp/accuracy/drop_argument": 0.78, "blimp/accuracy/wh_questions_object_gap": 0.757, "blimp/accuracy/animate_subject_passive": 0.782, "blimp/accuracy/existential_there_quantifiers_1": 0.979, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.883, "blimp/accuracy/npi_present_2": 0.552, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.906, "blimp/accuracy/anaphor_number_agreement": 0.966, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.943, "blimp/accuracy/existential_there_object_raising": 0.838, "blimp/accuracy/matrix_question_npi_licensor_present": 0.16, "blimp/accuracy/npi_present_1": 0.494, "blimp/accuracy/wh_vs_that_no_gap": 0.968, "blimp/accuracy/left_branch_island_echo_question": 0.441, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.981, "blimp/accuracy/causative": 0.699, "blimp/accuracy/group_average": 0.7458208955223882, "blimp/accuracy/seq_average": 0.7458208955223881, "cbt/accuracy/NE": 0.733573717948718, "cbt/accuracy/V": 0.8912, "cbt/accuracy/CN": 0.7808, "cbt/accuracy/P": 0.8696, "cbt/accuracy/group_average": 0.8187934294871796, "cbt/accuracy/seq_average": 0.8188275310124049, "hellaswag/accuracy/val": 0.27753435570603463, "hellaswag/accuracy/group_average": 0.27753435570603463, "hellaswag/accuracy/seq_average": 0.27753435570603463, "piqa/accuracy/val": 0.5690968443960827, "piqa/accuracy/group_average": 0.5690968443960827, "piqa/accuracy/seq_average": 0.5690968443960827, "ai2arc/accuracy/ARC-Easy": 0.31543340380549684, "ai2arc/accuracy/ARC-Challenge": 0.20858369098712445, "ai2arc/accuracy/group_average": 0.26200854739631063, "ai2arc/accuracy/seq_average": 0.28016997167138813, "race/accuracy/test/high": 0.259576901086335, "race/accuracy/test/middle": 0.34331476323119775, "race/accuracy/group_average": 0.3014458321587664, "race/accuracy/seq_average": 0.28394811511957846, "siqa/accuracy/dev": 0.35516888433981575, "siqa/accuracy/group_average": 0.35516888433981575, "siqa/accuracy/seq_average": 0.35516888433981575, "commonsenseqa/accuracy/dev_rand_split": 0.24406224406224405, "commonsenseqa/accuracy/group_average": 0.24406224406224405, "commonsenseqa/accuracy/seq_average": 0.24406224406224405}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-60000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.750306628999256, "val/accuracy": 0.4605916341145833, "val/perplexity": 15.647429104194348, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5169154575892856, "lambada/accuracy/total": 0.20244565217391305, "lambada/accuracy/openai_last_token": 0.7443711180124224, "lambada/perplexity": 17.1382783594021, "lambada/lm_loss": 3.2917387778876614, "lambada/lm_perplexity": 26.88957803486005, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.33151864314424817, "mean_loss": 2.6336110432942705, "blimp/accuracy/passive_2": 0.902, "blimp/accuracy/determiner_noun_agreement_2": 0.981, "blimp/accuracy/ellipsis_n_bar_1": 0.787, "blimp/accuracy/tough_vs_raising_2": 0.862, "blimp/accuracy/tough_vs_raising_1": 0.553, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.888, "blimp/accuracy/principle_A_reconstruction": 0.299, "blimp/accuracy/wh_vs_that_with_gap": 0.575, "blimp/accuracy/principle_A_domain_2": 0.815, "blimp/accuracy/determiner_noun_agreement_1": 0.984, "blimp/accuracy/ellipsis_n_bar_2": 0.877, "blimp/accuracy/principle_A_domain_3": 0.525, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.9, "blimp/accuracy/animate_subject_trans": 0.888, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.837, "blimp/accuracy/distractor_agreement_relative_clause": 0.519, "blimp/accuracy/transitive": 0.824, "blimp/accuracy/sentential_subject_island": 0.311, "blimp/accuracy/adjunct_island": 0.779, "blimp/accuracy/intransitive": 0.783, "blimp/accuracy/existential_there_subject_raising": 0.876, "blimp/accuracy/irregular_past_participle_adjectives": 0.99, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.221, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.275, "blimp/accuracy/only_npi_scope": 0.753, "blimp/accuracy/superlative_quantifiers_2": 0.756, "blimp/accuracy/passive_1": 0.878, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.907, "blimp/accuracy/inchoative": 0.632, "blimp/accuracy/anaphor_gender_agreement": 0.919, "blimp/accuracy/principle_A_c_command": 0.606, "blimp/accuracy/only_npi_licensor_present": 0.468, "blimp/accuracy/expletive_it_object_raising": 0.756, "blimp/accuracy/left_branch_island_simple_question": 0.231, "blimp/accuracy/wh_questions_subject_gap": 0.894, "blimp/accuracy/existential_there_quantifiers_2": 0.306, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.941, "blimp/accuracy/sentential_negation_npi_scope": 0.614, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.8, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.835, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.871, "blimp/accuracy/principle_A_case_2": 0.96, "blimp/accuracy/distractor_agreement_relational_noun": 0.772, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.998, "blimp/accuracy/superlative_quantifiers_1": 0.778, "blimp/accuracy/wh_island": 0.757, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.496, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.949, "blimp/accuracy/irregular_past_participle_verbs": 0.838, "blimp/accuracy/drop_argument": 0.808, "blimp/accuracy/wh_questions_object_gap": 0.722, "blimp/accuracy/animate_subject_passive": 0.786, "blimp/accuracy/existential_there_quantifiers_1": 0.961, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.901, "blimp/accuracy/npi_present_2": 0.565, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.902, "blimp/accuracy/anaphor_number_agreement": 0.969, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.952, "blimp/accuracy/existential_there_object_raising": 0.818, "blimp/accuracy/matrix_question_npi_licensor_present": 0.152, "blimp/accuracy/npi_present_1": 0.507, "blimp/accuracy/wh_vs_that_no_gap": 0.957, "blimp/accuracy/left_branch_island_echo_question": 0.428, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.966, "blimp/accuracy/causative": 0.69, "blimp/accuracy/group_average": 0.7468358208955224, "blimp/accuracy/seq_average": 0.7468358208955224, "cbt/accuracy/NE": 0.7359775641025641, "cbt/accuracy/V": 0.8928, "cbt/accuracy/CN": 0.7868, "cbt/accuracy/P": 0.8708, "cbt/accuracy/group_average": 0.821594391025641, "cbt/accuracy/seq_average": 0.8216286514605843, "hellaswag/accuracy/val": 0.28072097191794465, "hellaswag/accuracy/group_average": 0.28072097191794465, "hellaswag/accuracy/seq_average": 0.28072097191794465, "piqa/accuracy/val": 0.5669205658324266, "piqa/accuracy/group_average": 0.5669205658324266, "piqa/accuracy/seq_average": 0.5669205658324266, "ai2arc/accuracy/ARC-Easy": 0.333615221987315, "ai2arc/accuracy/ARC-Challenge": 0.2094420600858369, "ai2arc/accuracy/group_average": 0.271528641036576, "ai2arc/accuracy/seq_average": 0.2926345609065156, "race/accuracy/test/high": 0.26157804459691253, "race/accuracy/test/middle": 0.33913649025069637, "race/accuracy/group_average": 0.3003572674238044, "race/accuracy/seq_average": 0.28415079043372515, "siqa/accuracy/dev": 0.35363357215967245, "siqa/accuracy/group_average": 0.35363357215967245, "siqa/accuracy/seq_average": 0.35363357215967245, "commonsenseqa/accuracy/dev_rand_split": 0.2457002457002457, "commonsenseqa/accuracy/group_average": 0.2457002457002457, "commonsenseqa/accuracy/seq_average": 0.2457002457002457}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-70000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.725154331752232, "val/accuracy": 0.46472071087549605, "val/perplexity": 15.25876865765346, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.496397788480202, "lambada/accuracy/total": 0.22127329192546583, "lambada/accuracy/openai_last_token": 0.749417701863354, "lambada/perplexity": 15.864925438567163, "lambada/lm_loss": 3.2748634426490555, "lambada/lm_perplexity": 26.43961471028496, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.34299700140048095, "mean_loss": 2.6107760601162173, "blimp/accuracy/passive_2": 0.901, "blimp/accuracy/determiner_noun_agreement_2": 0.976, "blimp/accuracy/ellipsis_n_bar_1": 0.804, "blimp/accuracy/tough_vs_raising_2": 0.844, "blimp/accuracy/tough_vs_raising_1": 0.55, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.884, "blimp/accuracy/principle_A_reconstruction": 0.337, "blimp/accuracy/wh_vs_that_with_gap": 0.582, "blimp/accuracy/principle_A_domain_2": 0.781, "blimp/accuracy/determiner_noun_agreement_1": 0.987, "blimp/accuracy/ellipsis_n_bar_2": 0.888, "blimp/accuracy/principle_A_domain_3": 0.536, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.893, "blimp/accuracy/animate_subject_trans": 0.89, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.867, "blimp/accuracy/distractor_agreement_relative_clause": 0.528, "blimp/accuracy/transitive": 0.829, "blimp/accuracy/sentential_subject_island": 0.313, "blimp/accuracy/adjunct_island": 0.764, "blimp/accuracy/intransitive": 0.799, "blimp/accuracy/existential_there_subject_raising": 0.887, "blimp/accuracy/irregular_past_participle_adjectives": 0.904, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.233, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.267, "blimp/accuracy/only_npi_scope": 0.746, "blimp/accuracy/superlative_quantifiers_2": 0.717, "blimp/accuracy/passive_1": 0.896, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.871, "blimp/accuracy/inchoative": 0.619, "blimp/accuracy/anaphor_gender_agreement": 0.944, "blimp/accuracy/principle_A_c_command": 0.604, "blimp/accuracy/only_npi_licensor_present": 0.591, "blimp/accuracy/expletive_it_object_raising": 0.769, "blimp/accuracy/left_branch_island_simple_question": 0.238, "blimp/accuracy/wh_questions_subject_gap": 0.898, "blimp/accuracy/existential_there_quantifiers_2": 0.316, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.945, "blimp/accuracy/sentential_negation_npi_scope": 0.546, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.811, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.869, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.885, "blimp/accuracy/principle_A_case_2": 0.961, "blimp/accuracy/distractor_agreement_relational_noun": 0.765, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.998, "blimp/accuracy/superlative_quantifiers_1": 0.749, "blimp/accuracy/wh_island": 0.838, "blimp/accuracy/principle_A_domain_1": 0.981, "blimp/accuracy/complex_NP_island": 0.494, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.958, "blimp/accuracy/irregular_past_participle_verbs": 0.831, "blimp/accuracy/drop_argument": 0.805, "blimp/accuracy/wh_questions_object_gap": 0.723, "blimp/accuracy/animate_subject_passive": 0.784, "blimp/accuracy/existential_there_quantifiers_1": 0.972, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.871, "blimp/accuracy/npi_present_2": 0.522, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.891, "blimp/accuracy/anaphor_number_agreement": 0.972, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.953, "blimp/accuracy/existential_there_object_raising": 0.824, "blimp/accuracy/matrix_question_npi_licensor_present": 0.153, "blimp/accuracy/npi_present_1": 0.446, "blimp/accuracy/wh_vs_that_no_gap": 0.969, "blimp/accuracy/left_branch_island_echo_question": 0.397, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.981, "blimp/accuracy/causative": 0.705, "blimp/accuracy/group_average": 0.7470447761194029, "blimp/accuracy/seq_average": 0.747044776119403, "cbt/accuracy/NE": 0.7423878205128205, "cbt/accuracy/V": 0.8908, "cbt/accuracy/CN": 0.7996, "cbt/accuracy/P": 0.87, "cbt/accuracy/group_average": 0.8256969551282052, "cbt/accuracy/seq_average": 0.8257302921168468, "hellaswag/accuracy/val": 0.28052180840470026, "hellaswag/accuracy/group_average": 0.28052180840470026, "hellaswag/accuracy/seq_average": 0.28052180840470026, "piqa/accuracy/val": 0.5783460282916213, "piqa/accuracy/group_average": 0.5783460282916213, "piqa/accuracy/seq_average": 0.5783460282916213, "ai2arc/accuracy/ARC-Easy": 0.3276955602536998, "ai2arc/accuracy/ARC-Challenge": 0.2094420600858369, "ai2arc/accuracy/group_average": 0.26856881016976836, "ai2arc/accuracy/seq_average": 0.2886685552407932, "race/accuracy/test/high": 0.2644368210405946, "race/accuracy/test/middle": 0.34331476323119775, "race/accuracy/group_average": 0.3038757921358962, "race/accuracy/seq_average": 0.287393595460073, "siqa/accuracy/dev": 0.3561924257932446, "siqa/accuracy/group_average": 0.3561924257932446, "siqa/accuracy/seq_average": 0.3561924257932446, "commonsenseqa/accuracy/dev_rand_split": 0.24897624897624898, "commonsenseqa/accuracy/group_average": 0.24897624897624898, "commonsenseqa/accuracy/seq_average": 0.24897624897624898}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-80000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.7039664132254466, "val/accuracy": 0.4672638423859127, "val/perplexity": 14.938868091681279, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4350987783870344, "lambada/accuracy/total": 0.21622670807453417, "lambada/accuracy/openai_last_token": 0.749805900621118, "lambada/perplexity": 15.697112520875962, "lambada/lm_loss": 3.256284152432994, "lambada/lm_perplexity": 25.952920650553498, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3417452752302234, "mean_loss": 2.5695325958062405, "blimp/accuracy/passive_2": 0.908, "blimp/accuracy/determiner_noun_agreement_2": 0.979, "blimp/accuracy/ellipsis_n_bar_1": 0.791, "blimp/accuracy/tough_vs_raising_2": 0.864, "blimp/accuracy/tough_vs_raising_1": 0.581, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.887, "blimp/accuracy/principle_A_reconstruction": 0.384, "blimp/accuracy/wh_vs_that_with_gap": 0.573, "blimp/accuracy/principle_A_domain_2": 0.781, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.879, "blimp/accuracy/principle_A_domain_3": 0.55, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.892, "blimp/accuracy/animate_subject_trans": 0.89, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.864, "blimp/accuracy/distractor_agreement_relative_clause": 0.536, "blimp/accuracy/transitive": 0.838, "blimp/accuracy/sentential_subject_island": 0.321, "blimp/accuracy/adjunct_island": 0.737, "blimp/accuracy/intransitive": 0.806, "blimp/accuracy/existential_there_subject_raising": 0.864, "blimp/accuracy/irregular_past_participle_adjectives": 0.959, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.215, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.273, "blimp/accuracy/only_npi_scope": 0.724, "blimp/accuracy/superlative_quantifiers_2": 0.836, "blimp/accuracy/passive_1": 0.891, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.89, "blimp/accuracy/inchoative": 0.642, "blimp/accuracy/anaphor_gender_agreement": 0.939, "blimp/accuracy/principle_A_c_command": 0.591, "blimp/accuracy/only_npi_licensor_present": 0.388, "blimp/accuracy/expletive_it_object_raising": 0.764, "blimp/accuracy/left_branch_island_simple_question": 0.23, "blimp/accuracy/wh_questions_subject_gap": 0.915, "blimp/accuracy/existential_there_quantifiers_2": 0.26, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.942, "blimp/accuracy/sentential_negation_npi_scope": 0.601, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.802, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.853, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.912, "blimp/accuracy/principle_A_case_2": 0.955, "blimp/accuracy/distractor_agreement_relational_noun": 0.776, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.59, "blimp/accuracy/wh_island": 0.81, "blimp/accuracy/principle_A_domain_1": 0.977, "blimp/accuracy/complex_NP_island": 0.521, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.961, "blimp/accuracy/irregular_past_participle_verbs": 0.814, "blimp/accuracy/drop_argument": 0.81, "blimp/accuracy/wh_questions_object_gap": 0.774, "blimp/accuracy/animate_subject_passive": 0.776, "blimp/accuracy/existential_there_quantifiers_1": 0.96, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.878, "blimp/accuracy/npi_present_2": 0.595, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.912, "blimp/accuracy/anaphor_number_agreement": 0.978, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.95, "blimp/accuracy/existential_there_object_raising": 0.841, "blimp/accuracy/matrix_question_npi_licensor_present": 0.213, "blimp/accuracy/npi_present_1": 0.575, "blimp/accuracy/wh_vs_that_no_gap": 0.966, "blimp/accuracy/left_branch_island_echo_question": 0.399, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974, "blimp/accuracy/causative": 0.711, "blimp/accuracy/group_average": 0.7500597014925375, "blimp/accuracy/seq_average": 0.7500597014925373, "cbt/accuracy/NE": 0.749198717948718, "cbt/accuracy/V": 0.9, "cbt/accuracy/CN": 0.7912, "cbt/accuracy/P": 0.8748, "cbt/accuracy/group_average": 0.8287996794871795, "cbt/accuracy/seq_average": 0.8288315326130452, "hellaswag/accuracy/val": 0.2826130252937662, "hellaswag/accuracy/group_average": 0.2826130252937662, "hellaswag/accuracy/seq_average": 0.2826130252937662, "piqa/accuracy/val": 0.5750816104461371, "piqa/accuracy/group_average": 0.5750816104461371, "piqa/accuracy/seq_average": 0.5750816104461371, "ai2arc/accuracy/ARC-Easy": 0.3298097251585624, "ai2arc/accuracy/ARC-Challenge": 0.21373390557939914, "ai2arc/accuracy/group_average": 0.27177181536898076, "ai2arc/accuracy/seq_average": 0.2915014164305949, "race/accuracy/test/high": 0.2627215551743854, "race/accuracy/test/middle": 0.33913649025069637, "race/accuracy/group_average": 0.30092902271254085, "race/accuracy/seq_average": 0.2849614916903121, "siqa/accuracy/dev": 0.3572159672466735, "siqa/accuracy/group_average": 0.3572159672466735, "siqa/accuracy/seq_average": 0.3572159672466735, "commonsenseqa/accuracy/dev_rand_split": 0.24815724815724816, "commonsenseqa/accuracy/group_average": 0.24815724815724816, "commonsenseqa/accuracy/seq_average": 0.24815724815724816}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_remoe/export/result-model-90000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6921638609871033, "val/accuracy": 0.4691908094618056, "val/perplexity": 14.763587733153033, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4907431276688663, "lambada/accuracy/total": 0.22088509316770186, "lambada/accuracy/openai_last_token": 0.751358695652174, "lambada/perplexity": 15.158555629557807, "lambada/lm_loss": 3.2329125215809853, "lambada/lm_perplexity": 25.35339184253002, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.34503795131475373, "mean_loss": 2.5914534943279848, "blimp/accuracy/passive_2": 0.893, "blimp/accuracy/determiner_noun_agreement_2": 0.977, "blimp/accuracy/ellipsis_n_bar_1": 0.813, "blimp/accuracy/tough_vs_raising_2": 0.86, "blimp/accuracy/tough_vs_raising_1": 0.543, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.891, "blimp/accuracy/principle_A_reconstruction": 0.346, "blimp/accuracy/wh_vs_that_with_gap": 0.579, "blimp/accuracy/principle_A_domain_2": 0.781, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.883, "blimp/accuracy/principle_A_domain_3": 0.515, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.882, "blimp/accuracy/animate_subject_trans": 0.887, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.874, "blimp/accuracy/distractor_agreement_relative_clause": 0.584, "blimp/accuracy/transitive": 0.833, "blimp/accuracy/sentential_subject_island": 0.314, "blimp/accuracy/adjunct_island": 0.754, "blimp/accuracy/intransitive": 0.798, "blimp/accuracy/existential_there_subject_raising": 0.88, "blimp/accuracy/irregular_past_participle_adjectives": 0.972, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.212, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.283, "blimp/accuracy/only_npi_scope": 0.717, "blimp/accuracy/superlative_quantifiers_2": 0.65, "blimp/accuracy/passive_1": 0.887, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.889, "blimp/accuracy/inchoative": 0.646, "blimp/accuracy/anaphor_gender_agreement": 0.948, "blimp/accuracy/principle_A_c_command": 0.636, "blimp/accuracy/only_npi_licensor_present": 0.439, "blimp/accuracy/expletive_it_object_raising": 0.783, "blimp/accuracy/left_branch_island_simple_question": 0.228, "blimp/accuracy/wh_questions_subject_gap": 0.905, "blimp/accuracy/existential_there_quantifiers_2": 0.328, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.934, "blimp/accuracy/sentential_negation_npi_scope": 0.617, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.806, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.855, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.907, "blimp/accuracy/principle_A_case_2": 0.949, "blimp/accuracy/distractor_agreement_relational_noun": 0.813, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.994, "blimp/accuracy/superlative_quantifiers_1": 0.673, "blimp/accuracy/wh_island": 0.793, "blimp/accuracy/principle_A_domain_1": 0.984, "blimp/accuracy/complex_NP_island": 0.519, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.95, "blimp/accuracy/irregular_past_participle_verbs": 0.833, "blimp/accuracy/drop_argument": 0.8, "blimp/accuracy/wh_questions_object_gap": 0.756, "blimp/accuracy/animate_subject_passive": 0.783, "blimp/accuracy/existential_there_quantifiers_1": 0.973, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.881, "blimp/accuracy/npi_present_2": 0.589, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.928, "blimp/accuracy/anaphor_number_agreement": 0.98, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.952, "blimp/accuracy/existential_there_object_raising": 0.836, "blimp/accuracy/matrix_question_npi_licensor_present": 0.181, "blimp/accuracy/npi_present_1": 0.503, "blimp/accuracy/wh_vs_that_no_gap": 0.968, "blimp/accuracy/left_branch_island_echo_question": 0.388, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.702, "blimp/accuracy/group_average": 0.7494626865671642, "blimp/accuracy/seq_average": 0.7494626865671642, "cbt/accuracy/NE": 0.7455929487179487, "cbt/accuracy/V": 0.894, "cbt/accuracy/CN": 0.8048, "cbt/accuracy/P": 0.8764, "cbt/accuracy/group_average": 0.830198237179487, "cbt/accuracy/seq_average": 0.8302320928371348, "hellaswag/accuracy/val": 0.27972515435172274, "hellaswag/accuracy/group_average": 0.27972515435172274, "hellaswag/accuracy/seq_average": 0.27972515435172274, "piqa/accuracy/val": 0.573993471164309, "piqa/accuracy/group_average": 0.573993471164309, "piqa/accuracy/seq_average": 0.573993471164309, "ai2arc/accuracy/ARC-Easy": 0.3331923890063425, "ai2arc/accuracy/ARC-Challenge": 0.21630901287553647, "ai2arc/accuracy/group_average": 0.2747507009409395, "ai2arc/accuracy/seq_average": 0.29461756373937675, "race/accuracy/test/high": 0.27101200686106347, "race/accuracy/test/middle": 0.3384401114206128, "race/accuracy/group_average": 0.30472605914083817, "race/accuracy/seq_average": 0.29063640048642075, "siqa/accuracy/dev": 0.3572159672466735, "siqa/accuracy/group_average": 0.3572159672466735, "siqa/accuracy/seq_average": 0.3572159672466735, "commonsenseqa/accuracy/dev_rand_split": 0.25634725634725636, "commonsenseqa/accuracy/group_average": 0.25634725634725636, "commonsenseqa/accuracy/seq_average": 0.25634725634725636}
|