Upload folder using huggingface_hub
#272
by
DavidNguyen
- opened
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb_v2/export/result-model-10000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb_v2/export/result-model-100000.pth.json +121 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb_v2/export/result-model-20000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb_v2/export/result-model-30000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb_v2/export/result-model-40000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb_v2/export/result-model-60000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb_v2/export/result-model-70000.pth.json +1 -0
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb_v2/export/result-model-10000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 3.027585953000992, "val/accuracy": 0.42340862940228174, "val/perplexity": 20.647328756503384, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7719552176339284, "lambada/accuracy/total": 0.15838509316770186, "lambada/accuracy/openai_last_token": 0.718361801242236, "lambada/perplexity": 24.03160614755204, "lambada/lm_loss": 3.5617760941350007, "lambada/lm_perplexity": 35.225705787508474, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.2908968612849918, "mean_loss": 2.8997705853174605, "blimp/accuracy/passive_2": 0.861, "blimp/accuracy/determiner_noun_agreement_2": 0.955, "blimp/accuracy/ellipsis_n_bar_1": 0.74, "blimp/accuracy/tough_vs_raising_2": 0.798, "blimp/accuracy/tough_vs_raising_1": 0.565, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.895, "blimp/accuracy/principle_A_reconstruction": 0.288, "blimp/accuracy/wh_vs_that_with_gap": 0.432, "blimp/accuracy/principle_A_domain_2": 0.828, "blimp/accuracy/determiner_noun_agreement_1": 0.968, "blimp/accuracy/ellipsis_n_bar_2": 0.872, "blimp/accuracy/principle_A_domain_3": 0.536, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.865, "blimp/accuracy/animate_subject_trans": 0.884, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.726, "blimp/accuracy/distractor_agreement_relative_clause": 0.37, "blimp/accuracy/transitive": 0.791, "blimp/accuracy/sentential_subject_island": 0.431, "blimp/accuracy/adjunct_island": 0.706, "blimp/accuracy/intransitive": 0.713, "blimp/accuracy/existential_there_subject_raising": 0.823, "blimp/accuracy/irregular_past_participle_adjectives": 0.889, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.184, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.174, "blimp/accuracy/only_npi_scope": 0.586, "blimp/accuracy/superlative_quantifiers_2": 0.683, "blimp/accuracy/passive_1": 0.876, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.879, "blimp/accuracy/inchoative": 0.528, "blimp/accuracy/anaphor_gender_agreement": 0.883, "blimp/accuracy/principle_A_c_command": 0.453, "blimp/accuracy/only_npi_licensor_present": 0.462, "blimp/accuracy/expletive_it_object_raising": 0.738, "blimp/accuracy/left_branch_island_simple_question": 0.26, "blimp/accuracy/wh_questions_subject_gap": 0.901, "blimp/accuracy/existential_there_quantifiers_2": 0.366, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.891, "blimp/accuracy/sentential_negation_npi_scope": 0.431, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.759, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.865, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.833, "blimp/accuracy/principle_A_case_2": 0.914, "blimp/accuracy/distractor_agreement_relational_noun": 0.694, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.986, "blimp/accuracy/superlative_quantifiers_1": 0.616, "blimp/accuracy/wh_island": 0.774, "blimp/accuracy/principle_A_domain_1": 0.983, "blimp/accuracy/complex_NP_island": 0.521, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.922, "blimp/accuracy/irregular_past_participle_verbs": 0.814, "blimp/accuracy/drop_argument": 0.766, "blimp/accuracy/wh_questions_object_gap": 0.716, "blimp/accuracy/animate_subject_passive": 0.764, "blimp/accuracy/existential_there_quantifiers_1": 0.975, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.876, "blimp/accuracy/npi_present_2": 0.617, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.821, "blimp/accuracy/anaphor_number_agreement": 0.969, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.911, "blimp/accuracy/existential_there_object_raising": 0.79, "blimp/accuracy/matrix_question_npi_licensor_present": 0.074, "blimp/accuracy/npi_present_1": 0.58, "blimp/accuracy/wh_vs_that_no_gap": 0.957, "blimp/accuracy/left_branch_island_echo_question": 0.463, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.969, "blimp/accuracy/causative": 0.636, "blimp/accuracy/group_average": 0.7133731343283581, "blimp/accuracy/seq_average": 0.7133731343283582, "cbt/accuracy/NE": 0.6846955128205128, "cbt/accuracy/V": 0.86, "cbt/accuracy/CN": 0.7304, "cbt/accuracy/P": 0.83, "cbt/accuracy/group_average": 0.7762738782051282, "cbt/accuracy/seq_average": 0.7763105242096838, "hellaswag/accuracy/val": 0.2727544313881697, "hellaswag/accuracy/group_average": 0.2727544313881697, "hellaswag/accuracy/seq_average": 0.2727544313881697, "piqa/accuracy/val": 0.5467899891186072, "piqa/accuracy/group_average": 0.5467899891186072, "piqa/accuracy/seq_average": 0.5467899891186072, "ai2arc/accuracy/ARC-Easy": 0.30021141649048627, "ai2arc/accuracy/ARC-Challenge": 0.21030042918454936, "ai2arc/accuracy/group_average": 0.2552559228375178, "ai2arc/accuracy/seq_average": 0.2705382436260623, "mmlu/accuracy/MMLU": 0.26235252055774044, "mmlu/accuracy/group_average": 0.26235252055774044, "mmlu/accuracy/seq_average": 0.26235252055774044, "openbookqa/accuracy/test": 0.26, "openbookqa/accuracy/group_average": 0.26, "openbookqa/accuracy/seq_average": 0.26, "race/accuracy/test/high": 0.25443110348770726, "race/accuracy/test/middle": 0.30571030640668523, "race/accuracy/group_average": 0.2800707049471962, "race/accuracy/seq_average": 0.26935549250101337, "siqa/accuracy/dev": 0.35363357215967245, "siqa/accuracy/group_average": 0.35363357215967245, "siqa/accuracy/seq_average": 0.35363357215967245, "winogrande/accuracy/dev": 0.5098658247829518, "winogrande/accuracy/group_average": 0.5098658247829518, "winogrande/accuracy/seq_average": 0.5098658247829518, "commonsenseqa/accuracy/dev_rand_split": 0.23996723996723995, "commonsenseqa/accuracy/group_average": 0.23996723996723995, "commonsenseqa/accuracy/seq_average": 0.23996723996723995}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb_v2/export/result-model-100000.pth.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.6105455671037947,
|
| 3 |
+
"val/accuracy": 0.47938077411954366,
|
| 4 |
+
"val/perplexity": 13.606472070820919,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.5885574625145575,
|
| 8 |
+
"lambada/accuracy/total": 0.24922360248447206,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.7593167701863354,
|
| 10 |
+
"lambada/perplexity": 12.0800010668659,
|
| 11 |
+
"lambada/lm_loss": 3.176875031428619,
|
| 12 |
+
"lambada/lm_perplexity": 23.97172549418872,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.3643021883020079,
|
| 16 |
+
"mean_loss": 2.599551514809176,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.911,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.983,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.779,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.846,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.628,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.923,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.244,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.542,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.833,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.983,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.93,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.504,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.91,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.907,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.886,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.601,
|
| 33 |
+
"blimp/accuracy/transitive": 0.869,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.268,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.768,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.814,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.867,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.956,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.38,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.268,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.61,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.872,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.893,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.921,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.642,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.933,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.6,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.57,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.747,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.444,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.901,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.344,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.939,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.659,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.793,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.827,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.914,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.952,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.783,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.991,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.67,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.795,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.985,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.472,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.962,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.909,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.778,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.757,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.798,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.977,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.895,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.612,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.92,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.981,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.947,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.843,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.284,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.637,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.963,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.395,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.968,
|
| 83 |
+
"blimp/accuracy/causative": 0.703,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7639701492537316,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.7639701492537313,
|
| 86 |
+
"cbt/accuracy/NE": 0.7672275641025641,
|
| 87 |
+
"cbt/accuracy/V": 0.9072,
|
| 88 |
+
"cbt/accuracy/CN": 0.8216,
|
| 89 |
+
"cbt/accuracy/P": 0.8908,
|
| 90 |
+
"cbt/accuracy/group_average": 0.846706891025641,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8467386954781913,
|
| 92 |
+
"hellaswag/accuracy/val": 0.29187412865962953,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.29187412865962953,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.29187412865962953,
|
| 95 |
+
"piqa/accuracy/val": 0.5772578890097932,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5772578890097932,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5772578890097932,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.32515856236786467,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.20772532188841203,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.2664419421281383,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.2864022662889518,
|
| 102 |
+
"mmlu/accuracy/MMLU": 0.2582052198784412,
|
| 103 |
+
"mmlu/accuracy/group_average": 0.2582052198784412,
|
| 104 |
+
"mmlu/accuracy/seq_average": 0.2582052198784412,
|
| 105 |
+
"openbookqa/accuracy/test": 0.258,
|
| 106 |
+
"openbookqa/accuracy/group_average": 0.258,
|
| 107 |
+
"openbookqa/accuracy/seq_average": 0.258,
|
| 108 |
+
"race/accuracy/test/high": 0.26758147512864494,
|
| 109 |
+
"race/accuracy/test/middle": 0.3342618384401114,
|
| 110 |
+
"race/accuracy/group_average": 0.3009216567843782,
|
| 111 |
+
"race/accuracy/seq_average": 0.2869882448317795,
|
| 112 |
+
"siqa/accuracy/dev": 0.3526100307062436,
|
| 113 |
+
"siqa/accuracy/group_average": 0.3526100307062436,
|
| 114 |
+
"siqa/accuracy/seq_average": 0.3526100307062436,
|
| 115 |
+
"winogrande/accuracy/dev": 0.5122336227308603,
|
| 116 |
+
"winogrande/accuracy/group_average": 0.5122336227308603,
|
| 117 |
+
"winogrande/accuracy/seq_average": 0.5122336227308603,
|
| 118 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.2538902538902539,
|
| 119 |
+
"commonsenseqa/accuracy/group_average": 0.2538902538902539,
|
| 120 |
+
"commonsenseqa/accuracy/seq_average": 0.2538902538902539
|
| 121 |
+
}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb_v2/export/result-model-20000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.880667308020213, "val/accuracy": 0.4415166945684524, "val/perplexity": 17.826164754209497, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6459052992163237, "lambada/accuracy/total": 0.19701086956521738, "lambada/accuracy/openai_last_token": 0.7393245341614907, "lambada/perplexity": 17.4742443200684, "lambada/lm_loss": 3.414981394257363, "lambada/lm_perplexity": 30.416383502897077, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.31926378206683487, "mean_loss": 2.763286303618268, "blimp/accuracy/passive_2": 0.866, "blimp/accuracy/determiner_noun_agreement_2": 0.96, "blimp/accuracy/ellipsis_n_bar_1": 0.77, "blimp/accuracy/tough_vs_raising_2": 0.837, "blimp/accuracy/tough_vs_raising_1": 0.605, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.894, "blimp/accuracy/principle_A_reconstruction": 0.312, "blimp/accuracy/wh_vs_that_with_gap": 0.493, "blimp/accuracy/principle_A_domain_2": 0.801, "blimp/accuracy/determiner_noun_agreement_1": 0.98, "blimp/accuracy/ellipsis_n_bar_2": 0.898, "blimp/accuracy/principle_A_domain_3": 0.548, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.871, "blimp/accuracy/animate_subject_trans": 0.889, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.825, "blimp/accuracy/distractor_agreement_relative_clause": 0.45, "blimp/accuracy/transitive": 0.839, "blimp/accuracy/sentential_subject_island": 0.376, "blimp/accuracy/adjunct_island": 0.727, "blimp/accuracy/intransitive": 0.708, "blimp/accuracy/existential_there_subject_raising": 0.82, "blimp/accuracy/irregular_past_participle_adjectives": 0.852, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.238, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.179, "blimp/accuracy/only_npi_scope": 0.645, "blimp/accuracy/superlative_quantifiers_2": 0.828, "blimp/accuracy/passive_1": 0.878, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.895, "blimp/accuracy/inchoative": 0.537, "blimp/accuracy/anaphor_gender_agreement": 0.871, "blimp/accuracy/principle_A_c_command": 0.526, "blimp/accuracy/only_npi_licensor_present": 0.562, "blimp/accuracy/expletive_it_object_raising": 0.733, "blimp/accuracy/left_branch_island_simple_question": 0.334, "blimp/accuracy/wh_questions_subject_gap": 0.884, "blimp/accuracy/existential_there_quantifiers_2": 0.308, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.889, "blimp/accuracy/sentential_negation_npi_scope": 0.499, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.79, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.879, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.881, "blimp/accuracy/principle_A_case_2": 0.923, "blimp/accuracy/distractor_agreement_relational_noun": 0.786, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.994, "blimp/accuracy/superlative_quantifiers_1": 0.521, "blimp/accuracy/wh_island": 0.832, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.539, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.945, "blimp/accuracy/irregular_past_participle_verbs": 0.881, "blimp/accuracy/drop_argument": 0.755, "blimp/accuracy/wh_questions_object_gap": 0.748, "blimp/accuracy/animate_subject_passive": 0.761, "blimp/accuracy/existential_there_quantifiers_1": 0.994, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.874, "blimp/accuracy/npi_present_2": 0.602, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.898, "blimp/accuracy/anaphor_number_agreement": 0.969, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.933, "blimp/accuracy/existential_there_object_raising": 0.77, "blimp/accuracy/matrix_question_npi_licensor_present": 0.089, "blimp/accuracy/npi_present_1": 0.534, "blimp/accuracy/wh_vs_that_no_gap": 0.942, "blimp/accuracy/left_branch_island_echo_question": 0.389, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.982, "blimp/accuracy/causative": 0.691, "blimp/accuracy/group_average": 0.731582089552239, "blimp/accuracy/seq_average": 0.7315820895522388, "cbt/accuracy/NE": 0.703125, "cbt/accuracy/V": 0.8844, "cbt/accuracy/CN": 0.7648, "cbt/accuracy/P": 0.8572, "cbt/accuracy/group_average": 0.80238125, "cbt/accuracy/seq_average": 0.802420968387355, "hellaswag/accuracy/val": 0.2751443935471022, "hellaswag/accuracy/group_average": 0.2751443935471022, "hellaswag/accuracy/seq_average": 0.2751443935471022, "piqa/accuracy/val": 0.5609357997823722, "piqa/accuracy/group_average": 0.5609357997823722, "piqa/accuracy/seq_average": 0.5609357997823722, "ai2arc/accuracy/ARC-Easy": 0.3116279069767442, "ai2arc/accuracy/ARC-Challenge": 0.19570815450643778, "ai2arc/accuracy/group_average": 0.253668030741591, "ai2arc/accuracy/seq_average": 0.273371104815864, "mmlu/accuracy/MMLU": 0.262567036110118, "mmlu/accuracy/group_average": 0.262567036110118, "mmlu/accuracy/seq_average": 0.262567036110118, "openbookqa/accuracy/test": 0.266, "openbookqa/accuracy/group_average": 0.266, "openbookqa/accuracy/seq_average": 0.266, "race/accuracy/test/high": 0.2584333905088622, "race/accuracy/test/middle": 0.3363509749303621, "race/accuracy/group_average": 0.29739218271961215, "race/accuracy/seq_average": 0.28111066072152413, "siqa/accuracy/dev": 0.3577277379733879, "siqa/accuracy/group_average": 0.3577277379733879, "siqa/accuracy/seq_average": 0.3577277379733879, "winogrande/accuracy/dev": 0.516179952644041, "winogrande/accuracy/group_average": 0.516179952644041, "winogrande/accuracy/seq_average": 0.516179952644041, "commonsenseqa/accuracy/dev_rand_split": 0.23013923013923013, "commonsenseqa/accuracy/group_average": 0.23013923013923013, "commonsenseqa/accuracy/seq_average": 0.23013923013923013}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb_v2/export/result-model-30000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.8055223737444197, "val/accuracy": 0.45195079985119047, "val/perplexity": 16.535711471888074, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7081181307016693, "lambada/accuracy/total": 0.21777950310559005, "lambada/accuracy/openai_last_token": 0.7402950310559007, "lambada/perplexity": 16.4259478671092, "lambada/lm_loss": 3.363486649122513, "lambada/lm_perplexity": 28.88974388101112, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.33486515147839024, "mean_loss": 2.7568202522230445, "blimp/accuracy/passive_2": 0.873, "blimp/accuracy/determiner_noun_agreement_2": 0.978, "blimp/accuracy/ellipsis_n_bar_1": 0.757, "blimp/accuracy/tough_vs_raising_2": 0.827, "blimp/accuracy/tough_vs_raising_1": 0.621, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.928, "blimp/accuracy/principle_A_reconstruction": 0.259, "blimp/accuracy/wh_vs_that_with_gap": 0.531, "blimp/accuracy/principle_A_domain_2": 0.805, "blimp/accuracy/determiner_noun_agreement_1": 0.976, "blimp/accuracy/ellipsis_n_bar_2": 0.889, "blimp/accuracy/principle_A_domain_3": 0.538, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.892, "blimp/accuracy/animate_subject_trans": 0.886, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.809, "blimp/accuracy/distractor_agreement_relative_clause": 0.526, "blimp/accuracy/transitive": 0.837, "blimp/accuracy/sentential_subject_island": 0.334, "blimp/accuracy/adjunct_island": 0.76, "blimp/accuracy/intransitive": 0.796, "blimp/accuracy/existential_there_subject_raising": 0.837, "blimp/accuracy/irregular_past_participle_adjectives": 0.865, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.294, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.272, "blimp/accuracy/only_npi_scope": 0.685, "blimp/accuracy/superlative_quantifiers_2": 0.852, "blimp/accuracy/passive_1": 0.874, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.881, "blimp/accuracy/inchoative": 0.641, "blimp/accuracy/anaphor_gender_agreement": 0.927, "blimp/accuracy/principle_A_c_command": 0.563, "blimp/accuracy/only_npi_licensor_present": 0.452, "blimp/accuracy/expletive_it_object_raising": 0.784, "blimp/accuracy/left_branch_island_simple_question": 0.38, "blimp/accuracy/wh_questions_subject_gap": 0.87, "blimp/accuracy/existential_there_quantifiers_2": 0.286, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.925, "blimp/accuracy/sentential_negation_npi_scope": 0.546, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.764, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.854, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.868, "blimp/accuracy/principle_A_case_2": 0.956, "blimp/accuracy/distractor_agreement_relational_noun": 0.785, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.98, "blimp/accuracy/superlative_quantifiers_1": 0.604, "blimp/accuracy/wh_island": 0.755, "blimp/accuracy/principle_A_domain_1": 0.956, "blimp/accuracy/complex_NP_island": 0.517, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.93, "blimp/accuracy/irregular_past_participle_verbs": 0.873, "blimp/accuracy/drop_argument": 0.785, "blimp/accuracy/wh_questions_object_gap": 0.678, "blimp/accuracy/animate_subject_passive": 0.81, "blimp/accuracy/existential_there_quantifiers_1": 0.967, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.888, "blimp/accuracy/npi_present_2": 0.591, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.889, "blimp/accuracy/anaphor_number_agreement": 0.975, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.935, "blimp/accuracy/existential_there_object_raising": 0.787, "blimp/accuracy/matrix_question_npi_licensor_present": 0.171, "blimp/accuracy/npi_present_1": 0.504, "blimp/accuracy/wh_vs_that_no_gap": 0.943, "blimp/accuracy/left_branch_island_echo_question": 0.354, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.949, "blimp/accuracy/causative": 0.676, "blimp/accuracy/group_average": 0.7388059701492535, "blimp/accuracy/seq_average": 0.7388059701492538, "cbt/accuracy/NE": 0.734375, "cbt/accuracy/V": 0.8856, "cbt/accuracy/CN": 0.7848, "cbt/accuracy/P": 0.8616, "cbt/accuracy/group_average": 0.8165937500000001, "cbt/accuracy/seq_average": 0.8166266506602641, "hellaswag/accuracy/val": 0.2771360286795459, "hellaswag/accuracy/group_average": 0.2771360286795459, "hellaswag/accuracy/seq_average": 0.2771360286795459, "piqa/accuracy/val": 0.5854189336235038, "piqa/accuracy/group_average": 0.5854189336235038, "piqa/accuracy/seq_average": 0.5854189336235038, "ai2arc/accuracy/ARC-Easy": 0.3099365750528541, "ai2arc/accuracy/ARC-Challenge": 0.20772532188841203, "ai2arc/accuracy/group_average": 0.25883094847063304, "ai2arc/accuracy/seq_average": 0.2762039660056657, "mmlu/accuracy/MMLU": 0.25727565248480516, "mmlu/accuracy/group_average": 0.25727565248480516, "mmlu/accuracy/seq_average": 0.25727565248480516, "openbookqa/accuracy/test": 0.258, "openbookqa/accuracy/group_average": 0.258, "openbookqa/accuracy/seq_average": 0.258, "race/accuracy/test/high": 0.258147512864494, "race/accuracy/test/middle": 0.33356545961002787, "race/accuracy/group_average": 0.2958564862372609, "race/accuracy/seq_average": 0.28009728415079044, "siqa/accuracy/dev": 0.34698055271238487, "siqa/accuracy/group_average": 0.34698055271238487, "siqa/accuracy/seq_average": 0.34698055271238487, "winogrande/accuracy/dev": 0.5090765588003157, "winogrande/accuracy/group_average": 0.5090765588003157, "winogrande/accuracy/seq_average": 0.5090765588003157, "commonsenseqa/accuracy/dev_rand_split": 0.22604422604422605, "commonsenseqa/accuracy/group_average": 0.22604422604422605, "commonsenseqa/accuracy/seq_average": 0.22604422604422605}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb_v2/export/result-model-40000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.754936823769221, "val/accuracy": 0.4590299091641865, "val/perplexity": 15.72004773806797, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6254700844332297, "lambada/accuracy/total": 0.21777950310559005, "lambada/accuracy/openai_last_token": 0.7476708074534162, "lambada/perplexity": 15.749754975663437, "lambada/lm_loss": 3.2934883043631307, "lambada/lm_perplexity": 26.93666323994265, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.33840470613488827, "mean_loss": 2.6902034541012254, "blimp/accuracy/passive_2": 0.874, "blimp/accuracy/determiner_noun_agreement_2": 0.974, "blimp/accuracy/ellipsis_n_bar_1": 0.79, "blimp/accuracy/tough_vs_raising_2": 0.838, "blimp/accuracy/tough_vs_raising_1": 0.608, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.898, "blimp/accuracy/principle_A_reconstruction": 0.279, "blimp/accuracy/wh_vs_that_with_gap": 0.499, "blimp/accuracy/principle_A_domain_2": 0.796, "blimp/accuracy/determiner_noun_agreement_1": 0.982, "blimp/accuracy/ellipsis_n_bar_2": 0.904, "blimp/accuracy/principle_A_domain_3": 0.562, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.927, "blimp/accuracy/animate_subject_trans": 0.906, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.858, "blimp/accuracy/distractor_agreement_relative_clause": 0.6, "blimp/accuracy/transitive": 0.845, "blimp/accuracy/sentential_subject_island": 0.29, "blimp/accuracy/adjunct_island": 0.824, "blimp/accuracy/intransitive": 0.79, "blimp/accuracy/existential_there_subject_raising": 0.844, "blimp/accuracy/irregular_past_participle_adjectives": 0.78, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.391, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.262, "blimp/accuracy/only_npi_scope": 0.514, "blimp/accuracy/superlative_quantifiers_2": 0.746, "blimp/accuracy/passive_1": 0.87, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.892, "blimp/accuracy/inchoative": 0.62, "blimp/accuracy/anaphor_gender_agreement": 0.94, "blimp/accuracy/principle_A_c_command": 0.645, "blimp/accuracy/only_npi_licensor_present": 0.54, "blimp/accuracy/expletive_it_object_raising": 0.762, "blimp/accuracy/left_branch_island_simple_question": 0.491, "blimp/accuracy/wh_questions_subject_gap": 0.893, "blimp/accuracy/existential_there_quantifiers_2": 0.351, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.928, "blimp/accuracy/sentential_negation_npi_scope": 0.581, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.794, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.84, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.892, "blimp/accuracy/principle_A_case_2": 0.941, "blimp/accuracy/distractor_agreement_relational_noun": 0.825, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.726, "blimp/accuracy/wh_island": 0.792, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.477, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.947, "blimp/accuracy/irregular_past_participle_verbs": 0.898, "blimp/accuracy/drop_argument": 0.768, "blimp/accuracy/wh_questions_object_gap": 0.714, "blimp/accuracy/animate_subject_passive": 0.779, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.877, "blimp/accuracy/npi_present_2": 0.519, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.91, "blimp/accuracy/anaphor_number_agreement": 0.977, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.95, "blimp/accuracy/existential_there_object_raising": 0.807, "blimp/accuracy/matrix_question_npi_licensor_present": 0.157, "blimp/accuracy/npi_present_1": 0.491, "blimp/accuracy/wh_vs_that_no_gap": 0.968, "blimp/accuracy/left_branch_island_echo_question": 0.408, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.966, "blimp/accuracy/causative": 0.668, "blimp/accuracy/group_average": 0.7486119402985074, "blimp/accuracy/seq_average": 0.7486119402985074, "cbt/accuracy/NE": 0.7395833333333334, "cbt/accuracy/V": 0.8892, "cbt/accuracy/CN": 0.7916, "cbt/accuracy/P": 0.868, "cbt/accuracy/group_average": 0.8220958333333332, "cbt/accuracy/seq_average": 0.8221288515406162, "hellaswag/accuracy/val": 0.28211511651065524, "hellaswag/accuracy/group_average": 0.28211511651065524, "hellaswag/accuracy/seq_average": 0.28211511651065524, "piqa/accuracy/val": 0.5658324265505985, "piqa/accuracy/group_average": 0.5658324265505985, "piqa/accuracy/seq_average": 0.5658324265505985, "ai2arc/accuracy/ARC-Easy": 0.3145877378435518, "ai2arc/accuracy/ARC-Challenge": 0.19914163090128756, "ai2arc/accuracy/group_average": 0.2568646843724197, "ai2arc/accuracy/seq_average": 0.27648725212464587, "mmlu/accuracy/MMLU": 0.2591347872720772, "mmlu/accuracy/group_average": 0.2591347872720772, "mmlu/accuracy/seq_average": 0.2591347872720772, "openbookqa/accuracy/test": 0.25, "openbookqa/accuracy/group_average": 0.25, "openbookqa/accuracy/seq_average": 0.25, "race/accuracy/test/high": 0.2584333905088622, "race/accuracy/test/middle": 0.3307799442896936, "race/accuracy/group_average": 0.2946066673992779, "race/accuracy/seq_average": 0.27948925820835024, "siqa/accuracy/dev": 0.35516888433981575, "siqa/accuracy/group_average": 0.35516888433981575, "siqa/accuracy/seq_average": 0.35516888433981575, "winogrande/accuracy/dev": 0.5059194948697711, "winogrande/accuracy/group_average": 0.5059194948697711, "winogrande/accuracy/seq_average": 0.5059194948697711, "commonsenseqa/accuracy/dev_rand_split": 0.257985257985258, "commonsenseqa/accuracy/group_average": 0.257985257985258, "commonsenseqa/accuracy/seq_average": 0.257985257985258}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb_v2/export/result-model-60000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.677303738064236, "val/accuracy": 0.46979147290426587, "val/perplexity": 14.545821031944616, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.575543137070555, "lambada/accuracy/total": 0.23466614906832298, "lambada/accuracy/openai_last_token": 0.7548524844720497, "lambada/perplexity": 13.650760529750542, "lambada/lm_loss": 3.229637397542701, "lambada/lm_perplexity": 25.270492166916192, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.35222881098629444, "mean_loss": 2.6264234375673956, "blimp/accuracy/passive_2": 0.898, "blimp/accuracy/determiner_noun_agreement_2": 0.987, "blimp/accuracy/ellipsis_n_bar_1": 0.785, "blimp/accuracy/tough_vs_raising_2": 0.865, "blimp/accuracy/tough_vs_raising_1": 0.577, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.923, "blimp/accuracy/principle_A_reconstruction": 0.22, "blimp/accuracy/wh_vs_that_with_gap": 0.527, "blimp/accuracy/principle_A_domain_2": 0.798, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.909, "blimp/accuracy/principle_A_domain_3": 0.539, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.93, "blimp/accuracy/animate_subject_trans": 0.907, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.87, "blimp/accuracy/distractor_agreement_relative_clause": 0.579, "blimp/accuracy/transitive": 0.87, "blimp/accuracy/sentential_subject_island": 0.278, "blimp/accuracy/adjunct_island": 0.753, "blimp/accuracy/intransitive": 0.795, "blimp/accuracy/existential_there_subject_raising": 0.845, "blimp/accuracy/irregular_past_participle_adjectives": 0.937, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.322, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.293, "blimp/accuracy/only_npi_scope": 0.716, "blimp/accuracy/superlative_quantifiers_2": 0.761, "blimp/accuracy/passive_1": 0.88, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.914, "blimp/accuracy/inchoative": 0.671, "blimp/accuracy/anaphor_gender_agreement": 0.938, "blimp/accuracy/principle_A_c_command": 0.627, "blimp/accuracy/only_npi_licensor_present": 0.569, "blimp/accuracy/expletive_it_object_raising": 0.746, "blimp/accuracy/left_branch_island_simple_question": 0.36, "blimp/accuracy/wh_questions_subject_gap": 0.909, "blimp/accuracy/existential_there_quantifiers_2": 0.252, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.938, "blimp/accuracy/sentential_negation_npi_scope": 0.57, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.757, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.85, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.887, "blimp/accuracy/principle_A_case_2": 0.948, "blimp/accuracy/distractor_agreement_relational_noun": 0.759, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.993, "blimp/accuracy/superlative_quantifiers_1": 0.753, "blimp/accuracy/wh_island": 0.814, "blimp/accuracy/principle_A_domain_1": 0.986, "blimp/accuracy/complex_NP_island": 0.492, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.954, "blimp/accuracy/irregular_past_participle_verbs": 0.892, "blimp/accuracy/drop_argument": 0.765, "blimp/accuracy/wh_questions_object_gap": 0.772, "blimp/accuracy/animate_subject_passive": 0.792, "blimp/accuracy/existential_there_quantifiers_1": 0.963, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.887, "blimp/accuracy/npi_present_2": 0.608, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.907, "blimp/accuracy/anaphor_number_agreement": 0.985, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.945, "blimp/accuracy/existential_there_object_raising": 0.82, "blimp/accuracy/matrix_question_npi_licensor_present": 0.241, "blimp/accuracy/npi_present_1": 0.583, "blimp/accuracy/wh_vs_that_no_gap": 0.968, "blimp/accuracy/left_branch_island_echo_question": 0.411, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.96, "blimp/accuracy/causative": 0.689, "blimp/accuracy/group_average": 0.7555820895522387, "blimp/accuracy/seq_average": 0.7555820895522388, "cbt/accuracy/NE": 0.7504006410256411, "cbt/accuracy/V": 0.8984, "cbt/accuracy/CN": 0.8096, "cbt/accuracy/P": 0.8812, "cbt/accuracy/group_average": 0.8349001602564103, "cbt/accuracy/seq_average": 0.8349339735894358, "hellaswag/accuracy/val": 0.2849034056960765, "hellaswag/accuracy/group_average": 0.2849034056960765, "hellaswag/accuracy/seq_average": 0.2849034056960765, "piqa/accuracy/val": 0.5674646354733406, "piqa/accuracy/group_average": 0.5674646354733406, "piqa/accuracy/seq_average": 0.5674646354733406, "ai2arc/accuracy/ARC-Easy": 0.32346723044397463, "ai2arc/accuracy/ARC-Challenge": 0.20257510729613734, "ai2arc/accuracy/group_average": 0.263021168870056, "ai2arc/accuracy/seq_average": 0.28356940509915013, "mmlu/accuracy/MMLU": 0.2619234894529853, "mmlu/accuracy/group_average": 0.2619234894529853, "mmlu/accuracy/seq_average": 0.2619234894529853, "openbookqa/accuracy/test": 0.264, "openbookqa/accuracy/group_average": 0.264, "openbookqa/accuracy/seq_average": 0.264, "race/accuracy/test/high": 0.2658662092624357, "race/accuracy/test/middle": 0.3231197771587744, "race/accuracy/group_average": 0.29449299321060507, "race/accuracy/seq_average": 0.28252938792055127, "siqa/accuracy/dev": 0.35056294779938585, "siqa/accuracy/group_average": 0.35056294779938585, "siqa/accuracy/seq_average": 0.35056294779938585, "winogrande/accuracy/dev": 0.510655090765588, "winogrande/accuracy/group_average": 0.510655090765588, "winogrande/accuracy/seq_average": 0.510655090765588, "commonsenseqa/accuracy/dev_rand_split": 0.24897624897624898, "commonsenseqa/accuracy/group_average": 0.24897624897624898, "commonsenseqa/accuracy/seq_average": 0.24897624897624898}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb_v2/export/result-model-70000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6513274662078374, "val/accuracy": 0.4739176432291667, "val/perplexity": 14.172840129788627, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6206831843216225, "lambada/accuracy/total": 0.23563664596273293, "lambada/accuracy/openai_last_token": 0.7548524844720497, "lambada/perplexity": 12.653600410893773, "lambada/lm_loss": 3.216387615447795, "lambada/lm_perplexity": 24.9378720901689, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3547771445959498, "mean_loss": 2.6360053252647297, "blimp/accuracy/passive_2": 0.897, "blimp/accuracy/determiner_noun_agreement_2": 0.979, "blimp/accuracy/ellipsis_n_bar_1": 0.794, "blimp/accuracy/tough_vs_raising_2": 0.816, "blimp/accuracy/tough_vs_raising_1": 0.627, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.904, "blimp/accuracy/principle_A_reconstruction": 0.3, "blimp/accuracy/wh_vs_that_with_gap": 0.519, "blimp/accuracy/principle_A_domain_2": 0.825, "blimp/accuracy/determiner_noun_agreement_1": 0.982, "blimp/accuracy/ellipsis_n_bar_2": 0.923, "blimp/accuracy/principle_A_domain_3": 0.55, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.913, "blimp/accuracy/animate_subject_trans": 0.905, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.87, "blimp/accuracy/distractor_agreement_relative_clause": 0.602, "blimp/accuracy/transitive": 0.873, "blimp/accuracy/sentential_subject_island": 0.3, "blimp/accuracy/adjunct_island": 0.749, "blimp/accuracy/intransitive": 0.788, "blimp/accuracy/existential_there_subject_raising": 0.852, "blimp/accuracy/irregular_past_participle_adjectives": 0.883, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.431, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.281, "blimp/accuracy/only_npi_scope": 0.695, "blimp/accuracy/superlative_quantifiers_2": 0.843, "blimp/accuracy/passive_1": 0.884, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.903, "blimp/accuracy/inchoative": 0.629, "blimp/accuracy/anaphor_gender_agreement": 0.956, "blimp/accuracy/principle_A_c_command": 0.595, "blimp/accuracy/only_npi_licensor_present": 0.594, "blimp/accuracy/expletive_it_object_raising": 0.737, "blimp/accuracy/left_branch_island_simple_question": 0.504, "blimp/accuracy/wh_questions_subject_gap": 0.897, "blimp/accuracy/existential_there_quantifiers_2": 0.341, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.937, "blimp/accuracy/sentential_negation_npi_scope": 0.631, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.814, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.836, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.886, "blimp/accuracy/principle_A_case_2": 0.949, "blimp/accuracy/distractor_agreement_relational_noun": 0.806, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.995, "blimp/accuracy/superlative_quantifiers_1": 0.688, "blimp/accuracy/wh_island": 0.818, "blimp/accuracy/principle_A_domain_1": 0.994, "blimp/accuracy/complex_NP_island": 0.489, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.964, "blimp/accuracy/irregular_past_participle_verbs": 0.906, "blimp/accuracy/drop_argument": 0.748, "blimp/accuracy/wh_questions_object_gap": 0.767, "blimp/accuracy/animate_subject_passive": 0.771, "blimp/accuracy/existential_there_quantifiers_1": 0.983, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.874, "blimp/accuracy/npi_present_2": 0.516, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.914, "blimp/accuracy/anaphor_number_agreement": 0.982, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.938, "blimp/accuracy/existential_there_object_raising": 0.83, "blimp/accuracy/matrix_question_npi_licensor_present": 0.222, "blimp/accuracy/npi_present_1": 0.526, "blimp/accuracy/wh_vs_that_no_gap": 0.967, "blimp/accuracy/left_branch_island_echo_question": 0.416, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.966, "blimp/accuracy/causative": 0.691, "blimp/accuracy/group_average": 0.7606716417910446, "blimp/accuracy/seq_average": 0.7606716417910447, "cbt/accuracy/NE": 0.7536057692307693, "cbt/accuracy/V": 0.9036, "cbt/accuracy/CN": 0.8116, "cbt/accuracy/P": 0.8812, "cbt/accuracy/group_average": 0.8375014423076923, "cbt/accuracy/seq_average": 0.8375350140056023, "hellaswag/accuracy/val": 0.2885879306910974, "hellaswag/accuracy/group_average": 0.2885879306910974, "hellaswag/accuracy/seq_average": 0.2885879306910974, "piqa/accuracy/val": 0.5685527747551686, "piqa/accuracy/group_average": 0.5685527747551686, "piqa/accuracy/seq_average": 0.5685527747551686, "ai2arc/accuracy/ARC-Easy": 0.3310782241014799, "ai2arc/accuracy/ARC-Challenge": 0.2111587982832618, "ai2arc/accuracy/group_average": 0.27111851119237085, "ai2arc/accuracy/seq_average": 0.2915014164305949, "mmlu/accuracy/MMLU": 0.25841973543081875, "mmlu/accuracy/group_average": 0.25841973543081875, "mmlu/accuracy/seq_average": 0.25841973543081875, "openbookqa/accuracy/test": 0.264, "openbookqa/accuracy/group_average": 0.264, "openbookqa/accuracy/seq_average": 0.264, "race/accuracy/test/high": 0.2612921669525443, "race/accuracy/test/middle": 0.3286908077994429, "race/accuracy/group_average": 0.29499148737599357, "race/accuracy/seq_average": 0.2809079854073774, "siqa/accuracy/dev": 0.3577277379733879, "siqa/accuracy/group_average": 0.3577277379733879, "siqa/accuracy/seq_average": 0.3577277379733879, "winogrande/accuracy/dev": 0.5035516969218626, "winogrande/accuracy/group_average": 0.5035516969218626, "winogrande/accuracy/seq_average": 0.5035516969218626, "commonsenseqa/accuracy/dev_rand_split": 0.2457002457002457, "commonsenseqa/accuracy/group_average": 0.2457002457002457, "commonsenseqa/accuracy/seq_average": 0.2457002457002457}
|