Upload folder using huggingface_hub
#295
by
DavidNguyen
- opened
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-10000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-100000.pth.json +121 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-20000.pth.json +121 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-30000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-40000.pth.json +121 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-50000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-60000.pth.json +121 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-70000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-80000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-90000.pth.json +1 -0
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-10000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 3.026703123062376, "val/accuracy": 0.42361207992311506, "val/perplexity": 20.629108720304792, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.718575424289111, "lambada/accuracy/total": 0.1578027950310559, "lambada/accuracy/openai_last_token": 0.719332298136646, "lambada/perplexity": 23.935558561617572, "lambada/lm_loss": 3.5599650199466972, "lambada/lm_perplexity": 35.161967156108844, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.2907074374770855, "mean_loss": 2.872639273675744, "blimp/accuracy/passive_2": 0.869, "blimp/accuracy/determiner_noun_agreement_2": 0.945, "blimp/accuracy/ellipsis_n_bar_1": 0.695, "blimp/accuracy/tough_vs_raising_2": 0.846, "blimp/accuracy/tough_vs_raising_1": 0.526, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.89, "blimp/accuracy/principle_A_reconstruction": 0.426, "blimp/accuracy/wh_vs_that_with_gap": 0.416, "blimp/accuracy/principle_A_domain_2": 0.78, "blimp/accuracy/determiner_noun_agreement_1": 0.97, "blimp/accuracy/ellipsis_n_bar_2": 0.872, "blimp/accuracy/principle_A_domain_3": 0.508, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.881, "blimp/accuracy/animate_subject_trans": 0.871, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.739, "blimp/accuracy/distractor_agreement_relative_clause": 0.459, "blimp/accuracy/transitive": 0.828, "blimp/accuracy/sentential_subject_island": 0.325, "blimp/accuracy/adjunct_island": 0.691, "blimp/accuracy/intransitive": 0.717, "blimp/accuracy/existential_there_subject_raising": 0.782, "blimp/accuracy/irregular_past_participle_adjectives": 0.953, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.191, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.143, "blimp/accuracy/only_npi_scope": 0.708, "blimp/accuracy/superlative_quantifiers_2": 0.769, "blimp/accuracy/passive_1": 0.856, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.872, "blimp/accuracy/inchoative": 0.538, "blimp/accuracy/anaphor_gender_agreement": 0.936, "blimp/accuracy/principle_A_c_command": 0.473, "blimp/accuracy/only_npi_licensor_present": 0.768, "blimp/accuracy/expletive_it_object_raising": 0.742, "blimp/accuracy/left_branch_island_simple_question": 0.203, "blimp/accuracy/wh_questions_subject_gap": 0.898, "blimp/accuracy/existential_there_quantifiers_2": 0.427, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.885, "blimp/accuracy/sentential_negation_npi_scope": 0.364, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.746, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.866, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.843, "blimp/accuracy/principle_A_case_2": 0.953, "blimp/accuracy/distractor_agreement_relational_noun": 0.701, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.996, "blimp/accuracy/superlative_quantifiers_1": 0.671, "blimp/accuracy/wh_island": 0.687, "blimp/accuracy/principle_A_domain_1": 0.952, "blimp/accuracy/complex_NP_island": 0.562, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.93, "blimp/accuracy/irregular_past_participle_verbs": 0.774, "blimp/accuracy/drop_argument": 0.74, "blimp/accuracy/wh_questions_object_gap": 0.733, "blimp/accuracy/animate_subject_passive": 0.746, "blimp/accuracy/existential_there_quantifiers_1": 0.971, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.838, "blimp/accuracy/npi_present_2": 0.594, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.851, "blimp/accuracy/anaphor_number_agreement": 0.975, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.906, "blimp/accuracy/existential_there_object_raising": 0.727, "blimp/accuracy/matrix_question_npi_licensor_present": 0.074, "blimp/accuracy/npi_present_1": 0.549, "blimp/accuracy/wh_vs_that_no_gap": 0.962, "blimp/accuracy/left_branch_island_echo_question": 0.563, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.976, "blimp/accuracy/causative": 0.637, "blimp/accuracy/group_average": 0.7206716417910447, "blimp/accuracy/seq_average": 0.7206716417910448, "cbt/accuracy/NE": 0.6919070512820513, "cbt/accuracy/V": 0.8524, "cbt/accuracy/CN": 0.7304, "cbt/accuracy/P": 0.842, "cbt/accuracy/group_average": 0.7791767628205128, "cbt/accuracy/seq_average": 0.7792116846738696, "hellaswag/accuracy/val": 0.27235610436168095, "hellaswag/accuracy/group_average": 0.27235610436168095, "hellaswag/accuracy/seq_average": 0.27235610436168095, "piqa/accuracy/val": 0.5511425462459195, "piqa/accuracy/group_average": 0.5511425462459195, "piqa/accuracy/seq_average": 0.5511425462459195, "ai2arc/accuracy/ARC-Easy": 0.3145877378435518, "ai2arc/accuracy/ARC-Challenge": 0.20257510729613734, "ai2arc/accuracy/group_average": 0.25858142256984457, "ai2arc/accuracy/seq_average": 0.2776203966005666, "mmlu/accuracy/MMLU": 0.2634250983196282, "mmlu/accuracy/group_average": 0.2634250983196282, "mmlu/accuracy/seq_average": 0.2634250983196282, "openbookqa/accuracy/test": 0.26, "openbookqa/accuracy/group_average": 0.26, "openbookqa/accuracy/seq_average": 0.26, "race/accuracy/test/high": 0.26157804459691253, "race/accuracy/test/middle": 0.32590529247910865, "race/accuracy/group_average": 0.2937416685380106, "race/accuracy/seq_average": 0.2802999594649372, "siqa/accuracy/dev": 0.34646878198567044, "siqa/accuracy/group_average": 0.34646878198567044, "siqa/accuracy/seq_average": 0.34646878198567044, "winogrande/accuracy/dev": 0.5043409629044988, "winogrande/accuracy/group_average": 0.5043409629044988, "winogrande/accuracy/seq_average": 0.5043409629044988, "commonsenseqa/accuracy/dev_rand_split": 0.23587223587223588, "commonsenseqa/accuracy/group_average": 0.23587223587223588, "commonsenseqa/accuracy/seq_average": 0.23587223587223588}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-100000.pth.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.6108267647879466,
|
| 3 |
+
"val/accuracy": 0.47982545882936506,
|
| 4 |
+
"val/perplexity": 13.610298717253553,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.616000536806095,
|
| 8 |
+
"lambada/accuracy/total": 0.2542701863354037,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.7624223602484472,
|
| 10 |
+
"lambada/perplexity": 11.732446061898097,
|
| 11 |
+
"lambada/lm_loss": 3.17926517792789,
|
| 12 |
+
"lambada/lm_perplexity": 24.029089957386283,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.3670478225823844,
|
| 16 |
+
"mean_loss": 2.6134136507970207,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.921,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.987,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.845,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.871,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.64,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.923,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.327,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.548,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.779,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.99,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.91,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.55,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.918,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.907,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.888,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.664,
|
| 33 |
+
"blimp/accuracy/transitive": 0.879,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.333,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.801,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.797,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.851,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.953,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.401,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.278,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.61,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.821,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.904,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.898,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.629,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.971,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.57,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.446,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.768,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.447,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.907,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.525,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.926,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.595,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.844,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.883,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.901,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.959,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.824,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.991,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.848,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.777,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.984,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.545,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.963,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.908,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.792,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.773,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.802,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.98,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.878,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.601,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.926,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.991,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.948,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.835,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.244,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.569,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.974,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.442,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.979,
|
| 83 |
+
"blimp/accuracy/causative": 0.708,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7738358208955224,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.7738358208955224,
|
| 86 |
+
"cbt/accuracy/NE": 0.7568108974358975,
|
| 87 |
+
"cbt/accuracy/V": 0.9088,
|
| 88 |
+
"cbt/accuracy/CN": 0.8176,
|
| 89 |
+
"cbt/accuracy/P": 0.886,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8423027243589745,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8423369347739096,
|
| 92 |
+
"hellaswag/accuracy/val": 0.29127663811989646,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.29127663811989646,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.29127663811989646,
|
| 95 |
+
"piqa/accuracy/val": 0.5892274211099021,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5892274211099021,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5892274211099021,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.32727272727272727,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.21373390557939914,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.2705033164260632,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.2898016997167139,
|
| 102 |
+
"mmlu/accuracy/MMLU": 0.2637111190561316,
|
| 103 |
+
"mmlu/accuracy/group_average": 0.2637111190561316,
|
| 104 |
+
"mmlu/accuracy/seq_average": 0.2637111190561316,
|
| 105 |
+
"openbookqa/accuracy/test": 0.272,
|
| 106 |
+
"openbookqa/accuracy/group_average": 0.272,
|
| 107 |
+
"openbookqa/accuracy/seq_average": 0.272,
|
| 108 |
+
"race/accuracy/test/high": 0.26786735277301316,
|
| 109 |
+
"race/accuracy/test/middle": 0.3530640668523677,
|
| 110 |
+
"race/accuracy/group_average": 0.3104657098126904,
|
| 111 |
+
"race/accuracy/seq_average": 0.29266315362788814,
|
| 112 |
+
"siqa/accuracy/dev": 0.3490276356192426,
|
| 113 |
+
"siqa/accuracy/group_average": 0.3490276356192426,
|
| 114 |
+
"siqa/accuracy/seq_average": 0.3490276356192426,
|
| 115 |
+
"winogrande/accuracy/dev": 0.5090765588003157,
|
| 116 |
+
"winogrande/accuracy/group_average": 0.5090765588003157,
|
| 117 |
+
"winogrande/accuracy/seq_average": 0.5090765588003157,
|
| 118 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.24897624897624898,
|
| 119 |
+
"commonsenseqa/accuracy/group_average": 0.24897624897624898,
|
| 120 |
+
"commonsenseqa/accuracy/seq_average": 0.24897624897624898
|
| 121 |
+
}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-20000.pth.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.8830077156187994,
|
| 3 |
+
"val/accuracy": 0.441619388640873,
|
| 4 |
+
"val/perplexity": 17.867934105241176,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.6007561535568713,
|
| 8 |
+
"lambada/accuracy/total": 0.18847049689440995,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.7325310559006211,
|
| 10 |
+
"lambada/perplexity": 17.842352575679747,
|
| 11 |
+
"lambada/lm_loss": 3.4032095429859703,
|
| 12 |
+
"lambada/lm_perplexity": 30.06042561236697,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.3150449427676415,
|
| 16 |
+
"mean_loss": 2.7418819345878354,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.878,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.963,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.75,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.885,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.537,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.893,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.505,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.529,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.837,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.978,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.895,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.569,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.878,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.882,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.8,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.555,
|
| 33 |
+
"blimp/accuracy/transitive": 0.848,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.377,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.699,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.705,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.822,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.902,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.29,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 0.999,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.187,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.789,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.927,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.889,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.891,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.525,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.924,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.466,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.767,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.733,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.373,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.879,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.416,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.901,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.486,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.786,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.886,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.867,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.933,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.818,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.993,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.702,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.823,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.991,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.588,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.906,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.868,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.734,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.751,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.733,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.986,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.864,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.607,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.861,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.963,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.926,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.765,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.121,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.573,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.959,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.371,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.982,
|
| 83 |
+
"blimp/accuracy/causative": 0.691,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7489104477611941,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.748910447761194,
|
| 86 |
+
"cbt/accuracy/NE": 0.7071314102564102,
|
| 87 |
+
"cbt/accuracy/V": 0.8764,
|
| 88 |
+
"cbt/accuracy/CN": 0.7684,
|
| 89 |
+
"cbt/accuracy/P": 0.85,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8004828525641026,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8005202080832333,
|
| 92 |
+
"hellaswag/accuracy/val": 0.2741485759808803,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.2741485759808803,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.2741485759808803,
|
| 95 |
+
"piqa/accuracy/val": 0.5544069640914037,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5544069640914037,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5544069640914037,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.30655391120507397,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.2,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.253276955602537,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.27138810198300284,
|
| 102 |
+
"mmlu/accuracy/MMLU": 0.26414015016088666,
|
| 103 |
+
"mmlu/accuracy/group_average": 0.26414015016088666,
|
| 104 |
+
"mmlu/accuracy/seq_average": 0.26414015016088666,
|
| 105 |
+
"openbookqa/accuracy/test": 0.282,
|
| 106 |
+
"openbookqa/accuracy/group_average": 0.282,
|
| 107 |
+
"openbookqa/accuracy/seq_average": 0.282,
|
| 108 |
+
"race/accuracy/test/high": 0.26043453401943967,
|
| 109 |
+
"race/accuracy/test/middle": 0.32520891364902504,
|
| 110 |
+
"race/accuracy/group_average": 0.29282172383423233,
|
| 111 |
+
"race/accuracy/seq_average": 0.2792865828942035,
|
| 112 |
+
"siqa/accuracy/dev": 0.35363357215967245,
|
| 113 |
+
"siqa/accuracy/group_average": 0.35363357215967245,
|
| 114 |
+
"siqa/accuracy/seq_average": 0.35363357215967245,
|
| 115 |
+
"winogrande/accuracy/dev": 0.5074980268350434,
|
| 116 |
+
"winogrande/accuracy/group_average": 0.5074980268350434,
|
| 117 |
+
"winogrande/accuracy/seq_average": 0.5074980268350434,
|
| 118 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.22932022932022933,
|
| 119 |
+
"commonsenseqa/accuracy/group_average": 0.22932022932022933,
|
| 120 |
+
"commonsenseqa/accuracy/seq_average": 0.22932022932022933
|
| 121 |
+
}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-30000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.807494148375496, "val/accuracy": 0.4513685438368056, "val/perplexity": 16.56834833396943, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5816548033530666, "lambada/accuracy/total": 0.2047748447204969, "lambada/accuracy/openai_last_token": 0.7393245341614907, "lambada/perplexity": 16.395440770895988, "lambada/lm_loss": 3.357957831173838, "lambada/lm_perplexity": 28.730458482245506, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.32807169427865124, "mean_loss": 2.6945744758642816, "blimp/accuracy/passive_2": 0.909, "blimp/accuracy/determiner_noun_agreement_2": 0.964, "blimp/accuracy/ellipsis_n_bar_1": 0.814, "blimp/accuracy/tough_vs_raising_2": 0.881, "blimp/accuracy/tough_vs_raising_1": 0.596, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.911, "blimp/accuracy/principle_A_reconstruction": 0.466, "blimp/accuracy/wh_vs_that_with_gap": 0.582, "blimp/accuracy/principle_A_domain_2": 0.773, "blimp/accuracy/determiner_noun_agreement_1": 0.976, "blimp/accuracy/ellipsis_n_bar_2": 0.885, "blimp/accuracy/principle_A_domain_3": 0.543, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.897, "blimp/accuracy/animate_subject_trans": 0.882, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.829, "blimp/accuracy/distractor_agreement_relative_clause": 0.584, "blimp/accuracy/transitive": 0.848, "blimp/accuracy/sentential_subject_island": 0.328, "blimp/accuracy/adjunct_island": 0.757, "blimp/accuracy/intransitive": 0.763, "blimp/accuracy/existential_there_subject_raising": 0.842, "blimp/accuracy/irregular_past_participle_adjectives": 0.919, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.214, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.229, "blimp/accuracy/only_npi_scope": 0.716, "blimp/accuracy/superlative_quantifiers_2": 0.81, "blimp/accuracy/passive_1": 0.883, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.913, "blimp/accuracy/inchoative": 0.609, "blimp/accuracy/anaphor_gender_agreement": 0.958, "blimp/accuracy/principle_A_c_command": 0.521, "blimp/accuracy/only_npi_licensor_present": 0.195, "blimp/accuracy/expletive_it_object_raising": 0.762, "blimp/accuracy/left_branch_island_simple_question": 0.263, "blimp/accuracy/wh_questions_subject_gap": 0.86, "blimp/accuracy/existential_there_quantifiers_2": 0.363, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.909, "blimp/accuracy/sentential_negation_npi_scope": 0.479, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.802, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.864, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.908, "blimp/accuracy/principle_A_case_2": 0.914, "blimp/accuracy/distractor_agreement_relational_noun": 0.753, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.72, "blimp/accuracy/wh_island": 0.756, "blimp/accuracy/principle_A_domain_1": 0.991, "blimp/accuracy/complex_NP_island": 0.526, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.93, "blimp/accuracy/irregular_past_participle_verbs": 0.86, "blimp/accuracy/drop_argument": 0.778, "blimp/accuracy/wh_questions_object_gap": 0.716, "blimp/accuracy/animate_subject_passive": 0.795, "blimp/accuracy/existential_there_quantifiers_1": 0.978, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.891, "blimp/accuracy/npi_present_2": 0.583, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.914, "blimp/accuracy/anaphor_number_agreement": 0.978, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.927, "blimp/accuracy/existential_there_object_raising": 0.79, "blimp/accuracy/matrix_question_npi_licensor_present": 0.211, "blimp/accuracy/npi_present_1": 0.473, "blimp/accuracy/wh_vs_that_no_gap": 0.957, "blimp/accuracy/left_branch_island_echo_question": 0.382, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.674, "blimp/accuracy/group_average": 0.7413880597014925, "blimp/accuracy/seq_average": 0.7413880597014926, "cbt/accuracy/NE": 0.735176282051282, "cbt/accuracy/V": 0.8948, "cbt/accuracy/CN": 0.7812, "cbt/accuracy/P": 0.8648, "cbt/accuracy/group_average": 0.8189940705128205, "cbt/accuracy/seq_average": 0.8190276110444178, "hellaswag/accuracy/val": 0.27823142800239, "hellaswag/accuracy/group_average": 0.27823142800239, "hellaswag/accuracy/seq_average": 0.27823142800239, "piqa/accuracy/val": 0.5696409140369967, "piqa/accuracy/group_average": 0.5696409140369967, "piqa/accuracy/seq_average": 0.5696409140369967, "ai2arc/accuracy/ARC-Easy": 0.3086680761099366, "ai2arc/accuracy/ARC-Challenge": 0.2111587982832618, "ai2arc/accuracy/group_average": 0.2599134371965992, "ai2arc/accuracy/seq_average": 0.27648725212464587, "mmlu/accuracy/MMLU": 0.26499821237039684, "mmlu/accuracy/group_average": 0.26499821237039684, "mmlu/accuracy/seq_average": 0.26499821237039684, "openbookqa/accuracy/test": 0.28, "openbookqa/accuracy/group_average": 0.28, "openbookqa/accuracy/seq_average": 0.28, "race/accuracy/test/high": 0.2641509433962264, "race/accuracy/test/middle": 0.3342618384401114, "race/accuracy/group_average": 0.2992063909181689, "race/accuracy/seq_average": 0.28455614106201865, "siqa/accuracy/dev": 0.3546571136131013, "siqa/accuracy/group_average": 0.3546571136131013, "siqa/accuracy/seq_average": 0.3546571136131013, "winogrande/accuracy/dev": 0.5027624309392266, "winogrande/accuracy/group_average": 0.5027624309392266, "winogrande/accuracy/seq_average": 0.5027624309392266, "commonsenseqa/accuracy/dev_rand_split": 0.24078624078624078, "commonsenseqa/accuracy/group_average": 0.24078624078624078, "commonsenseqa/accuracy/seq_average": 0.24078624078624078}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-40000.pth.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.754880148266989,
|
| 3 |
+
"val/accuracy": 0.45884583488343256,
|
| 4 |
+
"val/perplexity": 15.719156821714105,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.5966694517905666,
|
| 8 |
+
"lambada/accuracy/total": 0.2105978260869565,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.7472826086956522,
|
| 10 |
+
"lambada/perplexity": 15.714297672985598,
|
| 11 |
+
"lambada/lm_loss": 3.290272894701554,
|
| 12 |
+
"lambada/lm_perplexity": 26.850189930770128,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.33472183048519455,
|
| 16 |
+
"mean_loss": 2.675774800028778,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.879,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.98,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.831,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.857,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.622,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.922,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.22,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.523,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.807,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.98,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.901,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.559,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.923,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.883,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.869,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.658,
|
| 33 |
+
"blimp/accuracy/transitive": 0.844,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.305,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.807,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.77,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.825,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.804,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.349,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.192,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.648,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.879,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.875,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.879,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.615,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.954,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.562,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.714,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.748,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.441,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.915,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.371,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.918,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.536,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.815,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.911,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.896,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.957,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.799,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.996,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.749,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.733,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.985,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.561,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.958,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.899,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.794,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.755,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.773,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.983,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.876,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.607,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.919,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.987,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.942,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.827,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.176,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.541,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.985,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.418,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.989,
|
| 83 |
+
"blimp/accuracy/causative": 0.685,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7594179104477611,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.7594179104477612,
|
| 86 |
+
"cbt/accuracy/NE": 0.7375801282051282,
|
| 87 |
+
"cbt/accuracy/V": 0.8924,
|
| 88 |
+
"cbt/accuracy/CN": 0.7892,
|
| 89 |
+
"cbt/accuracy/P": 0.8708,
|
| 90 |
+
"cbt/accuracy/group_average": 0.822495032051282,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8225290116046419,
|
| 92 |
+
"hellaswag/accuracy/val": 0.28251344353714397,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.28251344353714397,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.28251344353714397,
|
| 95 |
+
"piqa/accuracy/val": 0.5723612622415669,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5723612622415669,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5723612622415669,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.3171247357293869,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.1965665236051502,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.25684562966726854,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.2773371104815864,
|
| 102 |
+
"mmlu/accuracy/MMLU": 0.2602788702180908,
|
| 103 |
+
"mmlu/accuracy/group_average": 0.2602788702180908,
|
| 104 |
+
"mmlu/accuracy/seq_average": 0.2602788702180908,
|
| 105 |
+
"openbookqa/accuracy/test": 0.28,
|
| 106 |
+
"openbookqa/accuracy/group_average": 0.28,
|
| 107 |
+
"openbookqa/accuracy/seq_average": 0.28,
|
| 108 |
+
"race/accuracy/test/high": 0.2667238421955403,
|
| 109 |
+
"race/accuracy/test/middle": 0.36002785515320335,
|
| 110 |
+
"race/accuracy/group_average": 0.31337584867437185,
|
| 111 |
+
"race/accuracy/seq_average": 0.2938792055127685,
|
| 112 |
+
"siqa/accuracy/dev": 0.3618219037871034,
|
| 113 |
+
"siqa/accuracy/group_average": 0.3618219037871034,
|
| 114 |
+
"siqa/accuracy/seq_average": 0.3618219037871034,
|
| 115 |
+
"winogrande/accuracy/dev": 0.4956590370955012,
|
| 116 |
+
"winogrande/accuracy/group_average": 0.4956590370955012,
|
| 117 |
+
"winogrande/accuracy/seq_average": 0.4956590370955012,
|
| 118 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.24815724815724816,
|
| 119 |
+
"commonsenseqa/accuracy/group_average": 0.24815724815724816,
|
| 120 |
+
"commonsenseqa/accuracy/seq_average": 0.24815724815724816
|
| 121 |
+
}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-50000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.7114097958519343, "val/accuracy": 0.4654444134424603, "val/perplexity": 15.050478667700487, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6101730062354425, "lambada/accuracy/total": 0.2284549689440994, "lambada/accuracy/openai_last_token": 0.7536878881987578, "lambada/perplexity": 13.857038591860729, "lambada/lm_loss": 3.282701740307414, "lambada/lm_perplexity": 26.647670617194198, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.34694969119327984, "mean_loss": 2.6607914010436886, "blimp/accuracy/passive_2": 0.897, "blimp/accuracy/determiner_noun_agreement_2": 0.977, "blimp/accuracy/ellipsis_n_bar_1": 0.836, "blimp/accuracy/tough_vs_raising_2": 0.887, "blimp/accuracy/tough_vs_raising_1": 0.573, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.905, "blimp/accuracy/principle_A_reconstruction": 0.35, "blimp/accuracy/wh_vs_that_with_gap": 0.509, "blimp/accuracy/principle_A_domain_2": 0.817, "blimp/accuracy/determiner_noun_agreement_1": 0.982, "blimp/accuracy/ellipsis_n_bar_2": 0.906, "blimp/accuracy/principle_A_domain_3": 0.554, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.893, "blimp/accuracy/animate_subject_trans": 0.895, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.843, "blimp/accuracy/distractor_agreement_relative_clause": 0.65, "blimp/accuracy/transitive": 0.859, "blimp/accuracy/sentential_subject_island": 0.366, "blimp/accuracy/adjunct_island": 0.773, "blimp/accuracy/intransitive": 0.751, "blimp/accuracy/existential_there_subject_raising": 0.842, "blimp/accuracy/irregular_past_participle_adjectives": 0.881, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.384, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.186, "blimp/accuracy/only_npi_scope": 0.683, "blimp/accuracy/superlative_quantifiers_2": 0.724, "blimp/accuracy/passive_1": 0.887, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.904, "blimp/accuracy/inchoative": 0.608, "blimp/accuracy/anaphor_gender_agreement": 0.969, "blimp/accuracy/principle_A_c_command": 0.558, "blimp/accuracy/only_npi_licensor_present": 0.338, "blimp/accuracy/expletive_it_object_raising": 0.793, "blimp/accuracy/left_branch_island_simple_question": 0.499, "blimp/accuracy/wh_questions_subject_gap": 0.924, "blimp/accuracy/existential_there_quantifiers_2": 0.454, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.904, "blimp/accuracy/sentential_negation_npi_scope": 0.484, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.852, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.902, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.894, "blimp/accuracy/principle_A_case_2": 0.959, "blimp/accuracy/distractor_agreement_relational_noun": 0.784, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.734, "blimp/accuracy/wh_island": 0.812, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.52, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.948, "blimp/accuracy/irregular_past_participle_verbs": 0.902, "blimp/accuracy/drop_argument": 0.775, "blimp/accuracy/wh_questions_object_gap": 0.814, "blimp/accuracy/animate_subject_passive": 0.78, "blimp/accuracy/existential_there_quantifiers_1": 0.991, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.889, "blimp/accuracy/npi_present_2": 0.583, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.909, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.942, "blimp/accuracy/existential_there_object_raising": 0.849, "blimp/accuracy/matrix_question_npi_licensor_present": 0.185, "blimp/accuracy/npi_present_1": 0.55, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.428, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.989, "blimp/accuracy/causative": 0.673, "blimp/accuracy/group_average": 0.7591791044776118, "blimp/accuracy/seq_average": 0.7591791044776119, "cbt/accuracy/NE": 0.7411858974358975, "cbt/accuracy/V": 0.9008, "cbt/accuracy/CN": 0.7964, "cbt/accuracy/P": 0.8752, "cbt/accuracy/group_average": 0.8283964743589743, "cbt/accuracy/seq_average": 0.8284313725490197, "hellaswag/accuracy/val": 0.28560047799243177, "hellaswag/accuracy/group_average": 0.28560047799243177, "hellaswag/accuracy/seq_average": 0.28560047799243177, "piqa/accuracy/val": 0.5680087051142546, "piqa/accuracy/group_average": 0.5680087051142546, "piqa/accuracy/seq_average": 0.5680087051142546, "ai2arc/accuracy/ARC-Easy": 0.31881606765327697, "ai2arc/accuracy/ARC-Challenge": 0.2111587982832618, "ai2arc/accuracy/group_average": 0.2649874329682694, "ai2arc/accuracy/seq_average": 0.28328611898017, "mmlu/accuracy/MMLU": 0.2608509116910976, "mmlu/accuracy/group_average": 0.2608509116910976, "mmlu/accuracy/seq_average": 0.2608509116910976, "openbookqa/accuracy/test": 0.278, "openbookqa/accuracy/group_average": 0.278, "openbookqa/accuracy/seq_average": 0.278, "race/accuracy/test/high": 0.26186392224128074, "race/accuracy/test/middle": 0.3370473537604457, "race/accuracy/group_average": 0.2994556380008632, "race/accuracy/seq_average": 0.2837454398054317, "siqa/accuracy/dev": 0.3572159672466735, "siqa/accuracy/group_average": 0.3572159672466735, "siqa/accuracy/seq_average": 0.3572159672466735, "winogrande/accuracy/dev": 0.5043409629044988, "winogrande/accuracy/group_average": 0.5043409629044988, "winogrande/accuracy/seq_average": 0.5043409629044988, "commonsenseqa/accuracy/dev_rand_split": 0.24733824733824733, "commonsenseqa/accuracy/group_average": 0.24733824733824733, "commonsenseqa/accuracy/seq_average": 0.24733824733824733}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-60000.pth.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.677216302780878,
|
| 3 |
+
"val/accuracy": 0.47003561352926587,
|
| 4 |
+
"val/perplexity": 14.544549269560274,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.593755307404891,
|
| 8 |
+
"lambada/accuracy/total": 0.23000776397515527,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.75097049689441,
|
| 10 |
+
"lambada/perplexity": 13.518963970837525,
|
| 11 |
+
"lambada/lm_loss": 3.231360209691748,
|
| 12 |
+
"lambada/lm_perplexity": 25.314066001822276,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.35002168875221057,
|
| 16 |
+
"mean_loss": 2.635485805092885,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.895,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.982,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.848,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.894,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.567,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.916,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.285,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.523,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.809,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.987,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.907,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.559,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.91,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.905,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.854,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.664,
|
| 33 |
+
"blimp/accuracy/transitive": 0.858,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.298,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.8,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.774,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.813,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.952,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.291,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.252,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.712,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.862,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.887,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.904,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.645,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.97,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.588,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.579,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.774,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.345,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.896,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.32,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.925,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.605,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.835,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.873,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.892,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.963,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.773,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.995,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.834,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.756,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.996,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.536,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.949,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.895,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.78,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.767,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.785,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.978,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.9,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.6,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.92,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.986,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.94,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.816,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.221,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.549,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.971,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.446,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.969,
|
| 83 |
+
"blimp/accuracy/causative": 0.708,
|
| 84 |
+
"blimp/accuracy/group_average": 0.764,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.764,
|
| 86 |
+
"cbt/accuracy/NE": 0.7479967948717948,
|
| 87 |
+
"cbt/accuracy/V": 0.9048,
|
| 88 |
+
"cbt/accuracy/CN": 0.8044,
|
| 89 |
+
"cbt/accuracy/P": 0.8764,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8333991987179487,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8334333733493398,
|
| 92 |
+
"hellaswag/accuracy/val": 0.285700059749054,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.285700059749054,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.285700059749054,
|
| 95 |
+
"piqa/accuracy/val": 0.5772578890097932,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5772578890097932,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5772578890097932,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.33446088794926004,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.21802575107296138,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.2762433195111107,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.29603399433427763,
|
| 102 |
+
"mmlu/accuracy/MMLU": 0.2621380050053629,
|
| 103 |
+
"mmlu/accuracy/group_average": 0.2621380050053629,
|
| 104 |
+
"mmlu/accuracy/seq_average": 0.2621380050053629,
|
| 105 |
+
"openbookqa/accuracy/test": 0.28,
|
| 106 |
+
"openbookqa/accuracy/group_average": 0.28,
|
| 107 |
+
"openbookqa/accuracy/seq_average": 0.28,
|
| 108 |
+
"race/accuracy/test/high": 0.2612921669525443,
|
| 109 |
+
"race/accuracy/test/middle": 0.3516713091922006,
|
| 110 |
+
"race/accuracy/group_average": 0.30648173807237244,
|
| 111 |
+
"race/accuracy/seq_average": 0.2875962707742197,
|
| 112 |
+
"siqa/accuracy/dev": 0.3556806550665302,
|
| 113 |
+
"siqa/accuracy/group_average": 0.3556806550665302,
|
| 114 |
+
"siqa/accuracy/seq_average": 0.3556806550665302,
|
| 115 |
+
"winogrande/accuracy/dev": 0.4980268350434096,
|
| 116 |
+
"winogrande/accuracy/group_average": 0.4980268350434096,
|
| 117 |
+
"winogrande/accuracy/seq_average": 0.4980268350434096,
|
| 118 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.24815724815724816,
|
| 119 |
+
"commonsenseqa/accuracy/group_average": 0.24815724815724816,
|
| 120 |
+
"commonsenseqa/accuracy/seq_average": 0.24815724815724816
|
| 121 |
+
}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-70000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.651888892764137, "val/accuracy": 0.47424122643849204, "val/perplexity": 14.180799372671775, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5581873544254656, "lambada/accuracy/total": 0.2484472049689441, "lambada/accuracy/openai_last_token": 0.7554347826086957, "lambada/perplexity": 12.619442516702218, "lambada/lm_loss": 3.211777822709063, "lambada/lm_perplexity": 24.823178229057724, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.36134421570371805, "mean_loss": 2.6050381235948015, "blimp/accuracy/passive_2": 0.913, "blimp/accuracy/determiner_noun_agreement_2": 0.983, "blimp/accuracy/ellipsis_n_bar_1": 0.835, "blimp/accuracy/tough_vs_raising_2": 0.845, "blimp/accuracy/tough_vs_raising_1": 0.626, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.893, "blimp/accuracy/principle_A_reconstruction": 0.339, "blimp/accuracy/wh_vs_that_with_gap": 0.542, "blimp/accuracy/principle_A_domain_2": 0.798, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.909, "blimp/accuracy/principle_A_domain_3": 0.583, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.902, "blimp/accuracy/animate_subject_trans": 0.909, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.871, "blimp/accuracy/distractor_agreement_relative_clause": 0.644, "blimp/accuracy/transitive": 0.857, "blimp/accuracy/sentential_subject_island": 0.33, "blimp/accuracy/adjunct_island": 0.826, "blimp/accuracy/intransitive": 0.787, "blimp/accuracy/existential_there_subject_raising": 0.841, "blimp/accuracy/irregular_past_participle_adjectives": 0.812, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.436, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.243, "blimp/accuracy/only_npi_scope": 0.579, "blimp/accuracy/superlative_quantifiers_2": 0.692, "blimp/accuracy/passive_1": 0.9, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.9, "blimp/accuracy/inchoative": 0.612, "blimp/accuracy/anaphor_gender_agreement": 0.976, "blimp/accuracy/principle_A_c_command": 0.57, "blimp/accuracy/only_npi_licensor_present": 0.505, "blimp/accuracy/expletive_it_object_raising": 0.752, "blimp/accuracy/left_branch_island_simple_question": 0.524, "blimp/accuracy/wh_questions_subject_gap": 0.895, "blimp/accuracy/existential_there_quantifiers_2": 0.443, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.921, "blimp/accuracy/sentential_negation_npi_scope": 0.565, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.853, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.893, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.896, "blimp/accuracy/principle_A_case_2": 0.957, "blimp/accuracy/distractor_agreement_relational_noun": 0.818, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.996, "blimp/accuracy/superlative_quantifiers_1": 0.842, "blimp/accuracy/wh_island": 0.797, "blimp/accuracy/principle_A_domain_1": 0.985, "blimp/accuracy/complex_NP_island": 0.543, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.95, "blimp/accuracy/irregular_past_participle_verbs": 0.907, "blimp/accuracy/drop_argument": 0.764, "blimp/accuracy/wh_questions_object_gap": 0.771, "blimp/accuracy/animate_subject_passive": 0.785, "blimp/accuracy/existential_there_quantifiers_1": 0.975, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.888, "blimp/accuracy/npi_present_2": 0.562, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.911, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.937, "blimp/accuracy/existential_there_object_raising": 0.851, "blimp/accuracy/matrix_question_npi_licensor_present": 0.272, "blimp/accuracy/npi_present_1": 0.544, "blimp/accuracy/wh_vs_that_no_gap": 0.968, "blimp/accuracy/left_branch_island_echo_question": 0.473, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.979, "blimp/accuracy/causative": 0.709, "blimp/accuracy/group_average": 0.7665373134328357, "blimp/accuracy/seq_average": 0.7665373134328358, "cbt/accuracy/NE": 0.75, "cbt/accuracy/V": 0.9024, "cbt/accuracy/CN": 0.8144, "cbt/accuracy/P": 0.8776, "cbt/accuracy/group_average": 0.8361000000000001, "cbt/accuracy/seq_average": 0.8361344537815126, "hellaswag/accuracy/val": 0.28689504082852024, "hellaswag/accuracy/group_average": 0.28689504082852024, "hellaswag/accuracy/seq_average": 0.28689504082852024, "piqa/accuracy/val": 0.5859630032644179, "piqa/accuracy/group_average": 0.5859630032644179, "piqa/accuracy/seq_average": 0.5859630032644179, "ai2arc/accuracy/ARC-Easy": 0.32642706131078225, "ai2arc/accuracy/ARC-Challenge": 0.20600858369098712, "ai2arc/accuracy/group_average": 0.26621782250088466, "ai2arc/accuracy/seq_average": 0.28668555240793203, "mmlu/accuracy/MMLU": 0.2642116553450125, "mmlu/accuracy/group_average": 0.2642116553450125, "mmlu/accuracy/seq_average": 0.2642116553450125, "openbookqa/accuracy/test": 0.278, "openbookqa/accuracy/group_average": 0.278, "openbookqa/accuracy/seq_average": 0.278, "race/accuracy/test/high": 0.2624356775300172, "race/accuracy/test/middle": 0.3488857938718663, "race/accuracy/group_average": 0.30566073570094177, "race/accuracy/seq_average": 0.2875962707742197, "siqa/accuracy/dev": 0.35363357215967245, "siqa/accuracy/group_average": 0.35363357215967245, "siqa/accuracy/seq_average": 0.35363357215967245, "winogrande/accuracy/dev": 0.5153906866614049, "winogrande/accuracy/group_average": 0.5153906866614049, "winogrande/accuracy/seq_average": 0.5153906866614049, "commonsenseqa/accuracy/dev_rand_split": 0.2457002457002457, "commonsenseqa/accuracy/group_average": 0.2457002457002457, "commonsenseqa/accuracy/seq_average": 0.2457002457002457}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-80000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6294446672712053, "val/accuracy": 0.4769103035094246, "val/perplexity": 13.866067482532282, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4874447650790956, "lambada/accuracy/total": 0.24805900621118013, "lambada/accuracy/openai_last_token": 0.7581521739130435, "lambada/perplexity": 12.173992762296985, "lambada/lm_loss": 3.193227865871034, "lambada/lm_perplexity": 24.366953897348676, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.36248465486030235, "mean_loss": 2.5584447161751505, "blimp/accuracy/passive_2": 0.917, "blimp/accuracy/determiner_noun_agreement_2": 0.981, "blimp/accuracy/ellipsis_n_bar_1": 0.828, "blimp/accuracy/tough_vs_raising_2": 0.863, "blimp/accuracy/tough_vs_raising_1": 0.622, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.921, "blimp/accuracy/principle_A_reconstruction": 0.259, "blimp/accuracy/wh_vs_that_with_gap": 0.531, "blimp/accuracy/principle_A_domain_2": 0.808, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.904, "blimp/accuracy/principle_A_domain_3": 0.577, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.916, "blimp/accuracy/animate_subject_trans": 0.901, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.876, "blimp/accuracy/distractor_agreement_relative_clause": 0.69, "blimp/accuracy/transitive": 0.871, "blimp/accuracy/sentential_subject_island": 0.313, "blimp/accuracy/adjunct_island": 0.82, "blimp/accuracy/intransitive": 0.801, "blimp/accuracy/existential_there_subject_raising": 0.863, "blimp/accuracy/irregular_past_participle_adjectives": 0.964, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.362, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.251, "blimp/accuracy/only_npi_scope": 0.631, "blimp/accuracy/superlative_quantifiers_2": 0.873, "blimp/accuracy/passive_1": 0.909, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.896, "blimp/accuracy/inchoative": 0.616, "blimp/accuracy/anaphor_gender_agreement": 0.977, "blimp/accuracy/principle_A_c_command": 0.567, "blimp/accuracy/only_npi_licensor_present": 0.603, "blimp/accuracy/expletive_it_object_raising": 0.755, "blimp/accuracy/left_branch_island_simple_question": 0.421, "blimp/accuracy/wh_questions_subject_gap": 0.911, "blimp/accuracy/existential_there_quantifiers_2": 0.432, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.93, "blimp/accuracy/sentential_negation_npi_scope": 0.561, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.829, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.886, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.899, "blimp/accuracy/principle_A_case_2": 0.963, "blimp/accuracy/distractor_agreement_relational_noun": 0.804, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.995, "blimp/accuracy/superlative_quantifiers_1": 0.809, "blimp/accuracy/wh_island": 0.773, "blimp/accuracy/principle_A_domain_1": 0.977, "blimp/accuracy/complex_NP_island": 0.556, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.961, "blimp/accuracy/irregular_past_participle_verbs": 0.893, "blimp/accuracy/drop_argument": 0.777, "blimp/accuracy/wh_questions_object_gap": 0.787, "blimp/accuracy/animate_subject_passive": 0.796, "blimp/accuracy/existential_there_quantifiers_1": 0.976, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.887, "blimp/accuracy/npi_present_2": 0.573, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.923, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.948, "blimp/accuracy/existential_there_object_raising": 0.835, "blimp/accuracy/matrix_question_npi_licensor_present": 0.241, "blimp/accuracy/npi_present_1": 0.56, "blimp/accuracy/wh_vs_that_no_gap": 0.973, "blimp/accuracy/left_branch_island_echo_question": 0.449, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.98, "blimp/accuracy/causative": 0.692, "blimp/accuracy/group_average": 0.770776119402985, "blimp/accuracy/seq_average": 0.7707761194029851, "cbt/accuracy/NE": 0.7528044871794872, "cbt/accuracy/V": 0.9072, "cbt/accuracy/CN": 0.818, "cbt/accuracy/P": 0.8804, "cbt/accuracy/group_average": 0.8396011217948718, "cbt/accuracy/seq_average": 0.8396358543417367, "hellaswag/accuracy/val": 0.2905795658235411, "hellaswag/accuracy/group_average": 0.2905795658235411, "hellaswag/accuracy/seq_average": 0.2905795658235411, "piqa/accuracy/val": 0.5848748639825898, "piqa/accuracy/group_average": 0.5848748639825898, "piqa/accuracy/seq_average": 0.5848748639825898, "ai2arc/accuracy/ARC-Easy": 0.3399577167019027, "ai2arc/accuracy/ARC-Challenge": 0.2128755364806867, "ai2arc/accuracy/group_average": 0.27641662659129473, "ai2arc/accuracy/seq_average": 0.2980169971671388, "mmlu/accuracy/MMLU": 0.26035037540221667, "mmlu/accuracy/group_average": 0.26035037540221667, "mmlu/accuracy/seq_average": 0.26035037540221667, "openbookqa/accuracy/test": 0.276, "openbookqa/accuracy/group_average": 0.276, "openbookqa/accuracy/seq_average": 0.276, "race/accuracy/test/high": 0.26758147512864494, "race/accuracy/test/middle": 0.3502785515320334, "race/accuracy/group_average": 0.3089300133303392, "race/accuracy/seq_average": 0.29164977705715445, "siqa/accuracy/dev": 0.3561924257932446, "siqa/accuracy/group_average": 0.3561924257932446, "siqa/accuracy/seq_average": 0.3561924257932446, "winogrande/accuracy/dev": 0.5067087608524072, "winogrande/accuracy/group_average": 0.5067087608524072, "winogrande/accuracy/seq_average": 0.5067087608524072, "commonsenseqa/accuracy/dev_rand_split": 0.24897624897624898, "commonsenseqa/accuracy/group_average": 0.24897624897624898, "commonsenseqa/accuracy/seq_average": 0.24897624897624898}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-90000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6175365145244296, "val/accuracy": 0.4781581333705357, "val/perplexity": 13.701927474734823, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5651116223068713, "lambada/accuracy/total": 0.24456521739130435, "lambada/accuracy/openai_last_token": 0.7591226708074534, "lambada/perplexity": 12.081295702875193, "lambada/lm_loss": 3.1670897308033994, "lambada/lm_perplexity": 23.738298891053454, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.36136167538092, "mean_loss": 2.59132406841565, "blimp/accuracy/passive_2": 0.915, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.824, "blimp/accuracy/tough_vs_raising_2": 0.881, "blimp/accuracy/tough_vs_raising_1": 0.63, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.904, "blimp/accuracy/principle_A_reconstruction": 0.346, "blimp/accuracy/wh_vs_that_with_gap": 0.576, "blimp/accuracy/principle_A_domain_2": 0.808, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.916, "blimp/accuracy/principle_A_domain_3": 0.565, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.903, "blimp/accuracy/animate_subject_trans": 0.899, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.884, "blimp/accuracy/distractor_agreement_relative_clause": 0.675, "blimp/accuracy/transitive": 0.869, "blimp/accuracy/sentential_subject_island": 0.331, "blimp/accuracy/adjunct_island": 0.804, "blimp/accuracy/intransitive": 0.803, "blimp/accuracy/existential_there_subject_raising": 0.857, "blimp/accuracy/irregular_past_participle_adjectives": 0.952, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.332, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.283, "blimp/accuracy/only_npi_scope": 0.654, "blimp/accuracy/superlative_quantifiers_2": 0.699, "blimp/accuracy/passive_1": 0.905, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.911, "blimp/accuracy/inchoative": 0.624, "blimp/accuracy/anaphor_gender_agreement": 0.973, "blimp/accuracy/principle_A_c_command": 0.582, "blimp/accuracy/only_npi_licensor_present": 0.623, "blimp/accuracy/expletive_it_object_raising": 0.766, "blimp/accuracy/left_branch_island_simple_question": 0.395, "blimp/accuracy/wh_questions_subject_gap": 0.909, "blimp/accuracy/existential_there_quantifiers_2": 0.489, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.92, "blimp/accuracy/sentential_negation_npi_scope": 0.531, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.839, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.88, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.907, "blimp/accuracy/principle_A_case_2": 0.967, "blimp/accuracy/distractor_agreement_relational_noun": 0.807, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.991, "blimp/accuracy/superlative_quantifiers_1": 0.888, "blimp/accuracy/wh_island": 0.772, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.565, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.952, "blimp/accuracy/irregular_past_participle_verbs": 0.887, "blimp/accuracy/drop_argument": 0.79, "blimp/accuracy/wh_questions_object_gap": 0.784, "blimp/accuracy/animate_subject_passive": 0.8, "blimp/accuracy/existential_there_quantifiers_1": 0.98, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.895, "blimp/accuracy/npi_present_2": 0.624, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.942, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.948, "blimp/accuracy/existential_there_object_raising": 0.853, "blimp/accuracy/matrix_question_npi_licensor_present": 0.281, "blimp/accuracy/npi_present_1": 0.609, "blimp/accuracy/wh_vs_that_no_gap": 0.969, "blimp/accuracy/left_branch_island_echo_question": 0.444, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.978, "blimp/accuracy/causative": 0.724, "blimp/accuracy/group_average": 0.775597014925373, "blimp/accuracy/seq_average": 0.7755970149253731, "cbt/accuracy/NE": 0.7672275641025641, "cbt/accuracy/V": 0.9096, "cbt/accuracy/CN": 0.8256, "cbt/accuracy/P": 0.888, "cbt/accuracy/group_average": 0.847606891025641, "cbt/accuracy/seq_average": 0.847639055622249, "hellaswag/accuracy/val": 0.29087831109340767, "hellaswag/accuracy/group_average": 0.29087831109340767, "hellaswag/accuracy/seq_average": 0.29087831109340767, "piqa/accuracy/val": 0.5821545157780196, "piqa/accuracy/group_average": 0.5821545157780196, "piqa/accuracy/seq_average": 0.5821545157780196, "ai2arc/accuracy/ARC-Easy": 0.33699788583509516, "ai2arc/accuracy/ARC-Challenge": 0.21030042918454936, "ai2arc/accuracy/group_average": 0.27364915750982227, "ai2arc/accuracy/seq_average": 0.2951841359773371, "mmlu/accuracy/MMLU": 0.2619234894529853, "mmlu/accuracy/group_average": 0.2619234894529853, "mmlu/accuracy/seq_average": 0.2619234894529853, "openbookqa/accuracy/test": 0.286, "openbookqa/accuracy/group_average": 0.286, "openbookqa/accuracy/seq_average": 0.286, "race/accuracy/test/high": 0.26758147512864494, "race/accuracy/test/middle": 0.34401114206128136, "race/accuracy/group_average": 0.30579630859496315, "race/accuracy/seq_average": 0.2898256992298338, "siqa/accuracy/dev": 0.3572159672466735, "siqa/accuracy/group_average": 0.3572159672466735, "siqa/accuracy/seq_average": 0.3572159672466735, "winogrande/accuracy/dev": 0.5067087608524072, "winogrande/accuracy/group_average": 0.5067087608524072, "winogrande/accuracy/seq_average": 0.5067087608524072, "commonsenseqa/accuracy/dev_rand_split": 0.25143325143325146, "commonsenseqa/accuracy/group_average": 0.25143325143325146, "commonsenseqa/accuracy/seq_average": 0.25143325143325146}
|