Upload folder using huggingface_hub
#292
by
DavidNguyen
- opened
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-10000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-100000.pth.json +121 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-20000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-30000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-40000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-50000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-60000.pth.json +121 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-70000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-80000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-90000.pth.json +1 -0
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-10000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 3.0204678490048362, "val/accuracy": 0.4241081116691468, "val/perplexity": 20.500880757559507, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6939651773583075, "lambada/accuracy/total": 0.15993788819875776, "lambada/accuracy/openai_last_token": 0.7154503105590062, "lambada/perplexity": 26.33982290855866, "lambada/lm_loss": 3.550844696525252, "lambada/lm_perplexity": 34.84273659927775, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.2920229999339523, "mean_loss": 2.857216513181572, "blimp/accuracy/passive_2": 0.864, "blimp/accuracy/determiner_noun_agreement_2": 0.962, "blimp/accuracy/ellipsis_n_bar_1": 0.72, "blimp/accuracy/tough_vs_raising_2": 0.838, "blimp/accuracy/tough_vs_raising_1": 0.552, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.867, "blimp/accuracy/principle_A_reconstruction": 0.357, "blimp/accuracy/wh_vs_that_with_gap": 0.449, "blimp/accuracy/principle_A_domain_2": 0.809, "blimp/accuracy/determiner_noun_agreement_1": 0.979, "blimp/accuracy/ellipsis_n_bar_2": 0.855, "blimp/accuracy/principle_A_domain_3": 0.509, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.874, "blimp/accuracy/animate_subject_trans": 0.871, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.765, "blimp/accuracy/distractor_agreement_relative_clause": 0.428, "blimp/accuracy/transitive": 0.82, "blimp/accuracy/sentential_subject_island": 0.331, "blimp/accuracy/adjunct_island": 0.709, "blimp/accuracy/intransitive": 0.724, "blimp/accuracy/existential_there_subject_raising": 0.791, "blimp/accuracy/irregular_past_participle_adjectives": 0.944, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.224, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.158, "blimp/accuracy/only_npi_scope": 0.438, "blimp/accuracy/superlative_quantifiers_2": 0.784, "blimp/accuracy/passive_1": 0.887, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.87, "blimp/accuracy/inchoative": 0.574, "blimp/accuracy/anaphor_gender_agreement": 0.934, "blimp/accuracy/principle_A_c_command": 0.414, "blimp/accuracy/only_npi_licensor_present": 0.379, "blimp/accuracy/expletive_it_object_raising": 0.768, "blimp/accuracy/left_branch_island_simple_question": 0.281, "blimp/accuracy/wh_questions_subject_gap": 0.902, "blimp/accuracy/existential_there_quantifiers_2": 0.359, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.917, "blimp/accuracy/sentential_negation_npi_scope": 0.43, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.724, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.917, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.816, "blimp/accuracy/principle_A_case_2": 0.93, "blimp/accuracy/distractor_agreement_relational_noun": 0.704, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987, "blimp/accuracy/superlative_quantifiers_1": 0.665, "blimp/accuracy/wh_island": 0.774, "blimp/accuracy/principle_A_domain_1": 0.965, "blimp/accuracy/complex_NP_island": 0.437, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.941, "blimp/accuracy/irregular_past_participle_verbs": 0.787, "blimp/accuracy/drop_argument": 0.72, "blimp/accuracy/wh_questions_object_gap": 0.735, "blimp/accuracy/animate_subject_passive": 0.737, "blimp/accuracy/existential_there_quantifiers_1": 0.928, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.85, "blimp/accuracy/npi_present_2": 0.573, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.861, "blimp/accuracy/anaphor_number_agreement": 0.966, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.919, "blimp/accuracy/existential_there_object_raising": 0.81, "blimp/accuracy/matrix_question_npi_licensor_present": 0.106, "blimp/accuracy/npi_present_1": 0.59, "blimp/accuracy/wh_vs_that_no_gap": 0.972, "blimp/accuracy/left_branch_island_echo_question": 0.405, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.977, "blimp/accuracy/causative": 0.654, "blimp/accuracy/group_average": 0.7132388059701495, "blimp/accuracy/seq_average": 0.7132388059701492, "cbt/accuracy/NE": 0.6967147435897436, "cbt/accuracy/V": 0.86, "cbt/accuracy/CN": 0.7288, "cbt/accuracy/P": 0.8352, "cbt/accuracy/group_average": 0.7801786858974359, "cbt/accuracy/seq_average": 0.7802120848339336, "hellaswag/accuracy/val": 0.27106154152559253, "hellaswag/accuracy/group_average": 0.27106154152559253, "hellaswag/accuracy/seq_average": 0.27106154152559253, "piqa/accuracy/val": 0.5505984766050055, "piqa/accuracy/group_average": 0.5505984766050055, "piqa/accuracy/seq_average": 0.5505984766050055, "ai2arc/accuracy/ARC-Easy": 0.3035940803382664, "ai2arc/accuracy/ARC-Challenge": 0.1982832618025751, "ai2arc/accuracy/group_average": 0.2509386710704208, "ai2arc/accuracy/seq_average": 0.2688385269121813, "mmlu/accuracy/MMLU": 0.26349660350375403, "mmlu/accuracy/group_average": 0.26349660350375403, "mmlu/accuracy/seq_average": 0.26349660350375403, "openbookqa/accuracy/test": 0.252, "openbookqa/accuracy/group_average": 0.252, "openbookqa/accuracy/seq_average": 0.252, "race/accuracy/test/high": 0.24556889651229274, "race/accuracy/test/middle": 0.32172701949860727, "race/accuracy/group_average": 0.28364795800545, "race/accuracy/seq_average": 0.2677340899878395, "siqa/accuracy/dev": 0.35977482088024565, "siqa/accuracy/group_average": 0.35977482088024565, "siqa/accuracy/seq_average": 0.35977482088024565, "winogrande/accuracy/dev": 0.5114443567482242, "winogrande/accuracy/group_average": 0.5114443567482242, "winogrande/accuracy/seq_average": 0.5114443567482242, "commonsenseqa/accuracy/dev_rand_split": 0.24242424242424243, "commonsenseqa/accuracy/group_average": 0.24242424242424243, "commonsenseqa/accuracy/seq_average": 0.24242424242424243}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-100000.pth.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.5970943390376986,
|
| 3 |
+
"val/accuracy": 0.48109654017857145,
|
| 4 |
+
"val/perplexity": 13.424673758243264,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.4715672842464094,
|
| 8 |
+
"lambada/accuracy/total": 0.25485248447204967,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.766498447204969,
|
| 10 |
+
"lambada/perplexity": 11.817325805806146,
|
| 11 |
+
"lambada/lm_loss": 3.157572359157881,
|
| 12 |
+
"lambada/lm_perplexity": 23.51344438749415,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.36797451232531053,
|
| 16 |
+
"mean_loss": 2.5343308116420538,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.908,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.979,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.801,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.909,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.54,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.919,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.337,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.538,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.814,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.985,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.91,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.544,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.922,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.901,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.883,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.643,
|
| 33 |
+
"blimp/accuracy/transitive": 0.864,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.373,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.821,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.787,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.866,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.975,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.403,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.311,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.66,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.787,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.894,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.92,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.631,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.958,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.585,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.54,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.852,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.485,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.924,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.412,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.948,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.665,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.777,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.869,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.906,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.967,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.788,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.995,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.66,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.834,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.97,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.554,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.961,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.907,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.735,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.792,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.796,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.979,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.885,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.539,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.932,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.992,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.957,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.851,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.216,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.518,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.984,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.482,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.976,
|
| 83 |
+
"blimp/accuracy/causative": 0.711,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7720447761194028,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.772044776119403,
|
| 86 |
+
"cbt/accuracy/NE": 0.7592147435897436,
|
| 87 |
+
"cbt/accuracy/V": 0.9128,
|
| 88 |
+
"cbt/accuracy/CN": 0.8216,
|
| 89 |
+
"cbt/accuracy/P": 0.8824,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8440036858974359,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8440376150460184,
|
| 92 |
+
"hellaswag/accuracy/val": 0.29376618203545113,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.29376618203545113,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.29376618203545113,
|
| 95 |
+
"piqa/accuracy/val": 0.5914036996735582,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5914036996735582,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5914036996735582,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.32515856236786467,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.21630901287553647,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.2707337876217006,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.2892351274787535,
|
| 102 |
+
"mmlu/accuracy/MMLU": 0.2622810153736146,
|
| 103 |
+
"mmlu/accuracy/group_average": 0.2622810153736146,
|
| 104 |
+
"mmlu/accuracy/seq_average": 0.2622810153736146,
|
| 105 |
+
"openbookqa/accuracy/test": 0.272,
|
| 106 |
+
"openbookqa/accuracy/group_average": 0.272,
|
| 107 |
+
"openbookqa/accuracy/seq_average": 0.272,
|
| 108 |
+
"race/accuracy/test/high": 0.2687249857061178,
|
| 109 |
+
"race/accuracy/test/middle": 0.34331476323119775,
|
| 110 |
+
"race/accuracy/group_average": 0.30601987446865775,
|
| 111 |
+
"race/accuracy/seq_average": 0.290433725172274,
|
| 112 |
+
"siqa/accuracy/dev": 0.3556806550665302,
|
| 113 |
+
"siqa/accuracy/group_average": 0.3556806550665302,
|
| 114 |
+
"siqa/accuracy/seq_average": 0.3556806550665302,
|
| 115 |
+
"winogrande/accuracy/dev": 0.5193370165745856,
|
| 116 |
+
"winogrande/accuracy/group_average": 0.5193370165745856,
|
| 117 |
+
"winogrande/accuracy/seq_average": 0.5193370165745856,
|
| 118 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.2547092547092547,
|
| 119 |
+
"commonsenseqa/accuracy/group_average": 0.2547092547092547,
|
| 120 |
+
"commonsenseqa/accuracy/seq_average": 0.2547092547092547
|
| 121 |
+
}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-20000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.8716275654141863, "val/accuracy": 0.4424709743923611, "val/perplexity": 17.665746973305993, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.584707698466615, "lambada/accuracy/total": 0.18691770186335405, "lambada/accuracy/openai_last_token": 0.734083850931677, "lambada/perplexity": 18.351961933930525, "lambada/lm_loss": 3.3938823144873362, "lambada/lm_perplexity": 29.781348684292134, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3146943381278576, "mean_loss": 2.7281676319404005, "blimp/accuracy/passive_2": 0.865, "blimp/accuracy/determiner_noun_agreement_2": 0.966, "blimp/accuracy/ellipsis_n_bar_1": 0.803, "blimp/accuracy/tough_vs_raising_2": 0.905, "blimp/accuracy/tough_vs_raising_1": 0.483, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.891, "blimp/accuracy/principle_A_reconstruction": 0.351, "blimp/accuracy/wh_vs_that_with_gap": 0.417, "blimp/accuracy/principle_A_domain_2": 0.809, "blimp/accuracy/determiner_noun_agreement_1": 0.986, "blimp/accuracy/ellipsis_n_bar_2": 0.868, "blimp/accuracy/principle_A_domain_3": 0.568, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.838, "blimp/accuracy/animate_subject_trans": 0.865, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.828, "blimp/accuracy/distractor_agreement_relative_clause": 0.574, "blimp/accuracy/transitive": 0.833, "blimp/accuracy/sentential_subject_island": 0.413, "blimp/accuracy/adjunct_island": 0.723, "blimp/accuracy/intransitive": 0.684, "blimp/accuracy/existential_there_subject_raising": 0.845, "blimp/accuracy/irregular_past_participle_adjectives": 0.938, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.263, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.149, "blimp/accuracy/only_npi_scope": 0.604, "blimp/accuracy/superlative_quantifiers_2": 0.722, "blimp/accuracy/passive_1": 0.878, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.896, "blimp/accuracy/inchoative": 0.532, "blimp/accuracy/anaphor_gender_agreement": 0.93, "blimp/accuracy/principle_A_c_command": 0.537, "blimp/accuracy/only_npi_licensor_present": 0.271, "blimp/accuracy/expletive_it_object_raising": 0.781, "blimp/accuracy/left_branch_island_simple_question": 0.366, "blimp/accuracy/wh_questions_subject_gap": 0.929, "blimp/accuracy/existential_there_quantifiers_2": 0.299, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.923, "blimp/accuracy/sentential_negation_npi_scope": 0.63, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.828, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.947, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.854, "blimp/accuracy/principle_A_case_2": 0.949, "blimp/accuracy/distractor_agreement_relational_noun": 0.797, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.993, "blimp/accuracy/superlative_quantifiers_1": 0.53, "blimp/accuracy/wh_island": 0.827, "blimp/accuracy/principle_A_domain_1": 0.97, "blimp/accuracy/complex_NP_island": 0.474, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.948, "blimp/accuracy/irregular_past_participle_verbs": 0.86, "blimp/accuracy/drop_argument": 0.707, "blimp/accuracy/wh_questions_object_gap": 0.831, "blimp/accuracy/animate_subject_passive": 0.76, "blimp/accuracy/existential_there_quantifiers_1": 0.98, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.828, "blimp/accuracy/npi_present_2": 0.554, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.901, "blimp/accuracy/anaphor_number_agreement": 0.975, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.952, "blimp/accuracy/existential_there_object_raising": 0.798, "blimp/accuracy/matrix_question_npi_licensor_present": 0.129, "blimp/accuracy/npi_present_1": 0.555, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.441, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.983, "blimp/accuracy/causative": 0.677, "blimp/accuracy/group_average": 0.7342388059701492, "blimp/accuracy/seq_average": 0.7342388059701492, "cbt/accuracy/NE": 0.7027243589743589, "cbt/accuracy/V": 0.8748, "cbt/accuracy/CN": 0.7656, "cbt/accuracy/P": 0.8524, "cbt/accuracy/group_average": 0.7988810897435898, "cbt/accuracy/seq_average": 0.7989195678271308, "hellaswag/accuracy/val": 0.27544313881696875, "hellaswag/accuracy/group_average": 0.27544313881696875, "hellaswag/accuracy/seq_average": 0.27544313881696875, "piqa/accuracy/val": 0.558759521218716, "piqa/accuracy/group_average": 0.558759521218716, "piqa/accuracy/seq_average": 0.558759521218716, "ai2arc/accuracy/ARC-Easy": 0.3200845665961945, "ai2arc/accuracy/ARC-Challenge": 0.19914163090128756, "ai2arc/accuracy/group_average": 0.25961309874874106, "ai2arc/accuracy/seq_average": 0.28016997167138813, "mmlu/accuracy/MMLU": 0.2656417590275295, "mmlu/accuracy/group_average": 0.2656417590275295, "mmlu/accuracy/seq_average": 0.2656417590275295, "openbookqa/accuracy/test": 0.276, "openbookqa/accuracy/group_average": 0.276, "openbookqa/accuracy/seq_average": 0.276, "race/accuracy/test/high": 0.2535734705546026, "race/accuracy/test/middle": 0.3293871866295265, "race/accuracy/group_average": 0.2914803285920645, "race/accuracy/seq_average": 0.2756384272395622, "siqa/accuracy/dev": 0.3587512794268168, "siqa/accuracy/group_average": 0.3587512794268168, "siqa/accuracy/seq_average": 0.3587512794268168, "winogrande/accuracy/dev": 0.5209155485398579, "winogrande/accuracy/group_average": 0.5209155485398579, "winogrande/accuracy/seq_average": 0.5209155485398579, "commonsenseqa/accuracy/dev_rand_split": 0.2416052416052416, "commonsenseqa/accuracy/group_average": 0.2416052416052416, "commonsenseqa/accuracy/seq_average": 0.2416052416052416}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-30000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.7919728112599205, "val/accuracy": 0.45356677827380953, "val/perplexity": 16.313170884557813, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.581803031589674, "lambada/accuracy/total": 0.20710403726708074, "lambada/accuracy/openai_last_token": 0.7406832298136646, "lambada/perplexity": 16.31306204373071, "lambada/lm_loss": 3.3470754098291957, "lambada/lm_perplexity": 28.41949660555234, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3303354077704451, "mean_loss": 2.6868879214247974, "blimp/accuracy/passive_2": 0.882, "blimp/accuracy/determiner_noun_agreement_2": 0.975, "blimp/accuracy/ellipsis_n_bar_1": 0.745, "blimp/accuracy/tough_vs_raising_2": 0.883, "blimp/accuracy/tough_vs_raising_1": 0.509, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.927, "blimp/accuracy/principle_A_reconstruction": 0.278, "blimp/accuracy/wh_vs_that_with_gap": 0.593, "blimp/accuracy/principle_A_domain_2": 0.813, "blimp/accuracy/determiner_noun_agreement_1": 0.981, "blimp/accuracy/ellipsis_n_bar_2": 0.871, "blimp/accuracy/principle_A_domain_3": 0.582, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.871, "blimp/accuracy/animate_subject_trans": 0.877, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.842, "blimp/accuracy/distractor_agreement_relative_clause": 0.582, "blimp/accuracy/transitive": 0.84, "blimp/accuracy/sentential_subject_island": 0.332, "blimp/accuracy/adjunct_island": 0.821, "blimp/accuracy/intransitive": 0.795, "blimp/accuracy/existential_there_subject_raising": 0.871, "blimp/accuracy/irregular_past_participle_adjectives": 0.973, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.386, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.238, "blimp/accuracy/only_npi_scope": 0.715, "blimp/accuracy/superlative_quantifiers_2": 0.742, "blimp/accuracy/passive_1": 0.869, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.909, "blimp/accuracy/inchoative": 0.611, "blimp/accuracy/anaphor_gender_agreement": 0.946, "blimp/accuracy/principle_A_c_command": 0.548, "blimp/accuracy/only_npi_licensor_present": 0.338, "blimp/accuracy/expletive_it_object_raising": 0.794, "blimp/accuracy/left_branch_island_simple_question": 0.426, "blimp/accuracy/wh_questions_subject_gap": 0.909, "blimp/accuracy/existential_there_quantifiers_2": 0.36, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.922, "blimp/accuracy/sentential_negation_npi_scope": 0.556, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.801, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.913, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.864, "blimp/accuracy/principle_A_case_2": 0.955, "blimp/accuracy/distractor_agreement_relational_noun": 0.781, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.66, "blimp/accuracy/wh_island": 0.852, "blimp/accuracy/principle_A_domain_1": 0.965, "blimp/accuracy/complex_NP_island": 0.454, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.949, "blimp/accuracy/irregular_past_participle_verbs": 0.859, "blimp/accuracy/drop_argument": 0.786, "blimp/accuracy/wh_questions_object_gap": 0.761, "blimp/accuracy/animate_subject_passive": 0.805, "blimp/accuracy/existential_there_quantifiers_1": 0.978, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.873, "blimp/accuracy/npi_present_2": 0.549, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.914, "blimp/accuracy/anaphor_number_agreement": 0.98, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.946, "blimp/accuracy/existential_there_object_raising": 0.824, "blimp/accuracy/matrix_question_npi_licensor_present": 0.175, "blimp/accuracy/npi_present_1": 0.47, "blimp/accuracy/wh_vs_that_no_gap": 0.959, "blimp/accuracy/left_branch_island_echo_question": 0.436, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.973, "blimp/accuracy/causative": 0.696, "blimp/accuracy/group_average": 0.7504328358208954, "blimp/accuracy/seq_average": 0.7504328358208955, "cbt/accuracy/NE": 0.7295673076923077, "cbt/accuracy/V": 0.8864, "cbt/accuracy/CN": 0.786, "cbt/accuracy/P": 0.8652, "cbt/accuracy/group_average": 0.816791826923077, "cbt/accuracy/seq_average": 0.816826730692277, "hellaswag/accuracy/val": 0.2819159529974109, "hellaswag/accuracy/group_average": 0.2819159529974109, "hellaswag/accuracy/seq_average": 0.2819159529974109, "piqa/accuracy/val": 0.5669205658324266, "piqa/accuracy/group_average": 0.5669205658324266, "piqa/accuracy/seq_average": 0.5669205658324266, "ai2arc/accuracy/ARC-Easy": 0.30570824524312895, "ai2arc/accuracy/ARC-Challenge": 0.2094420600858369, "ai2arc/accuracy/group_average": 0.25757515266448294, "ai2arc/accuracy/seq_average": 0.2739376770538244, "mmlu/accuracy/MMLU": 0.2619234894529853, "mmlu/accuracy/group_average": 0.2619234894529853, "mmlu/accuracy/seq_average": 0.2619234894529853, "openbookqa/accuracy/test": 0.278, "openbookqa/accuracy/group_average": 0.278, "openbookqa/accuracy/seq_average": 0.278, "race/accuracy/test/high": 0.2564322469982847, "race/accuracy/test/middle": 0.32590529247910865, "race/accuracy/group_average": 0.29116876973869665, "race/accuracy/seq_average": 0.2766518038102959, "siqa/accuracy/dev": 0.3618219037871034, "siqa/accuracy/group_average": 0.3618219037871034, "siqa/accuracy/seq_average": 0.3618219037871034, "winogrande/accuracy/dev": 0.5209155485398579, "winogrande/accuracy/group_average": 0.5209155485398579, "winogrande/accuracy/seq_average": 0.5209155485398579, "commonsenseqa/accuracy/dev_rand_split": 0.23996723996723995, "commonsenseqa/accuracy/group_average": 0.23996723996723995, "commonsenseqa/accuracy/seq_average": 0.23996723996723995}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-40000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.743745834108383, "val/accuracy": 0.4599880642361111, "val/perplexity": 15.545105560199122, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.643252165421196, "lambada/accuracy/total": 0.22282608695652173, "lambada/accuracy/openai_last_token": 0.7488354037267081, "lambada/perplexity": 15.624953813329096, "lambada/lm_loss": 3.283404169949211, "lambada/lm_perplexity": 26.66639530653907, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.34140707559631644, "mean_loss": 2.693498999764789, "blimp/accuracy/passive_2": 0.872, "blimp/accuracy/determiner_noun_agreement_2": 0.969, "blimp/accuracy/ellipsis_n_bar_1": 0.812, "blimp/accuracy/tough_vs_raising_2": 0.9, "blimp/accuracy/tough_vs_raising_1": 0.534, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.897, "blimp/accuracy/principle_A_reconstruction": 0.284, "blimp/accuracy/wh_vs_that_with_gap": 0.477, "blimp/accuracy/principle_A_domain_2": 0.792, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.869, "blimp/accuracy/principle_A_domain_3": 0.554, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.885, "blimp/accuracy/animate_subject_trans": 0.893, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.858, "blimp/accuracy/distractor_agreement_relative_clause": 0.623, "blimp/accuracy/transitive": 0.841, "blimp/accuracy/sentential_subject_island": 0.351, "blimp/accuracy/adjunct_island": 0.816, "blimp/accuracy/intransitive": 0.768, "blimp/accuracy/existential_there_subject_raising": 0.864, "blimp/accuracy/irregular_past_participle_adjectives": 0.887, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.435, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.213, "blimp/accuracy/only_npi_scope": 0.505, "blimp/accuracy/superlative_quantifiers_2": 0.793, "blimp/accuracy/passive_1": 0.869, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.906, "blimp/accuracy/inchoative": 0.596, "blimp/accuracy/anaphor_gender_agreement": 0.956, "blimp/accuracy/principle_A_c_command": 0.547, "blimp/accuracy/only_npi_licensor_present": 0.488, "blimp/accuracy/expletive_it_object_raising": 0.823, "blimp/accuracy/left_branch_island_simple_question": 0.503, "blimp/accuracy/wh_questions_subject_gap": 0.917, "blimp/accuracy/existential_there_quantifiers_2": 0.421, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.942, "blimp/accuracy/sentential_negation_npi_scope": 0.596, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.79, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.916, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.882, "blimp/accuracy/principle_A_case_2": 0.966, "blimp/accuracy/distractor_agreement_relational_noun": 0.768, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.996, "blimp/accuracy/superlative_quantifiers_1": 0.665, "blimp/accuracy/wh_island": 0.81, "blimp/accuracy/principle_A_domain_1": 0.961, "blimp/accuracy/complex_NP_island": 0.494, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.945, "blimp/accuracy/irregular_past_participle_verbs": 0.869, "blimp/accuracy/drop_argument": 0.744, "blimp/accuracy/wh_questions_object_gap": 0.771, "blimp/accuracy/animate_subject_passive": 0.782, "blimp/accuracy/existential_there_quantifiers_1": 0.979, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.84, "blimp/accuracy/npi_present_2": 0.513, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.912, "blimp/accuracy/anaphor_number_agreement": 0.99, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.95, "blimp/accuracy/existential_there_object_raising": 0.836, "blimp/accuracy/matrix_question_npi_licensor_present": 0.138, "blimp/accuracy/npi_present_1": 0.455, "blimp/accuracy/wh_vs_that_no_gap": 0.986, "blimp/accuracy/left_branch_island_echo_question": 0.484, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.984, "blimp/accuracy/causative": 0.682, "blimp/accuracy/group_average": 0.7514776119402987, "blimp/accuracy/seq_average": 0.7514776119402985, "cbt/accuracy/NE": 0.7439903846153846, "cbt/accuracy/V": 0.9012, "cbt/accuracy/CN": 0.7924, "cbt/accuracy/P": 0.868, "cbt/accuracy/group_average": 0.8263975961538461, "cbt/accuracy/seq_average": 0.8264305722288916, "hellaswag/accuracy/val": 0.28281218880701053, "hellaswag/accuracy/group_average": 0.28281218880701053, "hellaswag/accuracy/seq_average": 0.28281218880701053, "piqa/accuracy/val": 0.5805223068552775, "piqa/accuracy/group_average": 0.5805223068552775, "piqa/accuracy/seq_average": 0.5805223068552775, "ai2arc/accuracy/ARC-Easy": 0.32684989429175476, "ai2arc/accuracy/ARC-Challenge": 0.20600858369098712, "ai2arc/accuracy/group_average": 0.26642923899137094, "ai2arc/accuracy/seq_average": 0.2869688385269122, "mmlu/accuracy/MMLU": 0.2629245620307472, "mmlu/accuracy/group_average": 0.2629245620307472, "mmlu/accuracy/seq_average": 0.2629245620307472, "openbookqa/accuracy/test": 0.276, "openbookqa/accuracy/group_average": 0.276, "openbookqa/accuracy/seq_average": 0.276, "race/accuracy/test/high": 0.25986277873070324, "race/accuracy/test/middle": 0.32590529247910865, "race/accuracy/group_average": 0.29288403560490595, "race/accuracy/seq_average": 0.27908390758005674, "siqa/accuracy/dev": 0.36745138178096215, "siqa/accuracy/group_average": 0.36745138178096215, "siqa/accuracy/seq_average": 0.36745138178096215, "winogrande/accuracy/dev": 0.5201262825572218, "winogrande/accuracy/group_average": 0.5201262825572218, "winogrande/accuracy/seq_average": 0.5201262825572218, "commonsenseqa/accuracy/dev_rand_split": 0.2596232596232596, "commonsenseqa/accuracy/group_average": 0.2596232596232596, "commonsenseqa/accuracy/seq_average": 0.2596232596232596}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-50000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.697447035047743, "val/accuracy": 0.4668036566840278, "val/perplexity": 14.841792740335189, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5743703901397517, "lambada/accuracy/total": 0.2375776397515528, "lambada/accuracy/openai_last_token": 0.7573757763975155, "lambada/perplexity": 13.529056769026356, "lambada/lm_loss": 3.2669285019862886, "lambada/lm_perplexity": 26.23064810166654, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3521906482177903, "mean_loss": 2.6359087125937473, "blimp/accuracy/passive_2": 0.879, "blimp/accuracy/determiner_noun_agreement_2": 0.98, "blimp/accuracy/ellipsis_n_bar_1": 0.779, "blimp/accuracy/tough_vs_raising_2": 0.91, "blimp/accuracy/tough_vs_raising_1": 0.507, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.904, "blimp/accuracy/principle_A_reconstruction": 0.243, "blimp/accuracy/wh_vs_that_with_gap": 0.484, "blimp/accuracy/principle_A_domain_2": 0.81, "blimp/accuracy/determiner_noun_agreement_1": 0.98, "blimp/accuracy/ellipsis_n_bar_2": 0.898, "blimp/accuracy/principle_A_domain_3": 0.554, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.905, "blimp/accuracy/animate_subject_trans": 0.906, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.862, "blimp/accuracy/distractor_agreement_relative_clause": 0.632, "blimp/accuracy/transitive": 0.842, "blimp/accuracy/sentential_subject_island": 0.381, "blimp/accuracy/adjunct_island": 0.815, "blimp/accuracy/intransitive": 0.777, "blimp/accuracy/existential_there_subject_raising": 0.859, "blimp/accuracy/irregular_past_participle_adjectives": 0.833, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.391, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.224, "blimp/accuracy/only_npi_scope": 0.676, "blimp/accuracy/superlative_quantifiers_2": 0.722, "blimp/accuracy/passive_1": 0.886, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.902, "blimp/accuracy/inchoative": 0.6, "blimp/accuracy/anaphor_gender_agreement": 0.937, "blimp/accuracy/principle_A_c_command": 0.584, "blimp/accuracy/only_npi_licensor_present": 0.431, "blimp/accuracy/expletive_it_object_raising": 0.809, "blimp/accuracy/left_branch_island_simple_question": 0.466, "blimp/accuracy/wh_questions_subject_gap": 0.942, "blimp/accuracy/existential_there_quantifiers_2": 0.401, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.936, "blimp/accuracy/sentential_negation_npi_scope": 0.503, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.791, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.909, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.881, "blimp/accuracy/principle_A_case_2": 0.968, "blimp/accuracy/distractor_agreement_relational_noun": 0.784, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.99, "blimp/accuracy/superlative_quantifiers_1": 0.56, "blimp/accuracy/wh_island": 0.832, "blimp/accuracy/principle_A_domain_1": 0.975, "blimp/accuracy/complex_NP_island": 0.501, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.964, "blimp/accuracy/irregular_past_participle_verbs": 0.87, "blimp/accuracy/drop_argument": 0.737, "blimp/accuracy/wh_questions_object_gap": 0.787, "blimp/accuracy/animate_subject_passive": 0.794, "blimp/accuracy/existential_there_quantifiers_1": 0.989, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.903, "blimp/accuracy/npi_present_2": 0.505, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.918, "blimp/accuracy/anaphor_number_agreement": 0.985, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.946, "blimp/accuracy/existential_there_object_raising": 0.811, "blimp/accuracy/matrix_question_npi_licensor_present": 0.189, "blimp/accuracy/npi_present_1": 0.517, "blimp/accuracy/wh_vs_that_no_gap": 0.985, "blimp/accuracy/left_branch_island_echo_question": 0.516, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.986, "blimp/accuracy/causative": 0.683, "blimp/accuracy/group_average": 0.7526268656716417, "blimp/accuracy/seq_average": 0.7526268656716418, "cbt/accuracy/NE": 0.7451923076923077, "cbt/accuracy/V": 0.9004, "cbt/accuracy/CN": 0.7956, "cbt/accuracy/P": 0.872, "cbt/accuracy/group_average": 0.8282980769230769, "cbt/accuracy/seq_average": 0.8283313325330132, "hellaswag/accuracy/val": 0.28291177056363276, "hellaswag/accuracy/group_average": 0.28291177056363276, "hellaswag/accuracy/seq_average": 0.28291177056363276, "piqa/accuracy/val": 0.5816104461371056, "piqa/accuracy/group_average": 0.5816104461371056, "piqa/accuracy/seq_average": 0.5816104461371056, "ai2arc/accuracy/ARC-Easy": 0.3221987315010571, "ai2arc/accuracy/ARC-Challenge": 0.21373390557939914, "ai2arc/accuracy/group_average": 0.2679663185402281, "ai2arc/accuracy/seq_average": 0.2864022662889518, "mmlu/accuracy/MMLU": 0.2592062924562031, "mmlu/accuracy/group_average": 0.2592062924562031, "mmlu/accuracy/seq_average": 0.2592062924562031, "openbookqa/accuracy/test": 0.276, "openbookqa/accuracy/group_average": 0.276, "openbookqa/accuracy/seq_average": 0.276, "race/accuracy/test/high": 0.2658662092624357, "race/accuracy/test/middle": 0.3384401114206128, "race/accuracy/group_average": 0.3021531603415243, "race/accuracy/seq_average": 0.2869882448317795, "siqa/accuracy/dev": 0.36591606960081885, "siqa/accuracy/group_average": 0.36591606960081885, "siqa/accuracy/seq_average": 0.36591606960081885, "winogrande/accuracy/dev": 0.5130228887134964, "winogrande/accuracy/group_average": 0.5130228887134964, "winogrande/accuracy/seq_average": 0.5130228887134964, "commonsenseqa/accuracy/dev_rand_split": 0.25307125307125306, "commonsenseqa/accuracy/group_average": 0.25307125307125306, "commonsenseqa/accuracy/seq_average": 0.25307125307125306}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-60000.pth.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.665284171937004,
|
| 3 |
+
"val/accuracy": 0.4712069072420635,
|
| 4 |
+
"val/perplexity": 14.372033094237437,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.5804676506089868,
|
| 8 |
+
"lambada/accuracy/total": 0.22554347826086957,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.749805900621118,
|
| 10 |
+
"lambada/perplexity": 13.829430105848155,
|
| 11 |
+
"lambada/lm_loss": 3.213903706900887,
|
| 12 |
+
"lambada/lm_perplexity": 24.876005563747977,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.34837519275146656,
|
| 16 |
+
"mean_loss": 2.6228759112729954,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.898,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.977,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.808,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.928,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.528,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.905,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.294,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.509,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.813,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.984,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.887,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.559,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.892,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.898,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.881,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.647,
|
| 33 |
+
"blimp/accuracy/transitive": 0.857,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.371,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.817,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.794,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.868,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.96,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.405,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.261,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.698,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.697,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.872,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.902,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.63,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.961,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.563,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.493,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.827,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.488,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.928,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.382,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.945,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.614,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.792,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.906,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.898,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.97,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.751,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.991,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.64,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.878,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.972,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.454,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.952,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.887,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.76,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.788,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.813,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.969,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.88,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.558,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.933,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.988,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.946,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.823,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.233,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.519,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.977,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.477,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.981,
|
| 83 |
+
"blimp/accuracy/causative": 0.692,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7637164179104475,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.7637164179104478,
|
| 86 |
+
"cbt/accuracy/NE": 0.7403846153846154,
|
| 87 |
+
"cbt/accuracy/V": 0.9008,
|
| 88 |
+
"cbt/accuracy/CN": 0.8156,
|
| 89 |
+
"cbt/accuracy/P": 0.88,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8341961538461539,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.834233693477391,
|
| 92 |
+
"hellaswag/accuracy/val": 0.29038040231029677,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.29038040231029677,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.29038040231029677,
|
| 95 |
+
"piqa/accuracy/val": 0.5772578890097932,
|
| 96 |
+
"piqa/accuracy/group_average": 0.5772578890097932,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.5772578890097932,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.3285412262156448,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.21888412017167383,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.27371267319365933,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.2923512747875354,
|
| 102 |
+
"mmlu/accuracy/MMLU": 0.26170897390060777,
|
| 103 |
+
"mmlu/accuracy/group_average": 0.26170897390060777,
|
| 104 |
+
"mmlu/accuracy/seq_average": 0.26170897390060777,
|
| 105 |
+
"openbookqa/accuracy/test": 0.274,
|
| 106 |
+
"openbookqa/accuracy/group_average": 0.274,
|
| 107 |
+
"openbookqa/accuracy/seq_average": 0.274,
|
| 108 |
+
"race/accuracy/test/high": 0.2672955974842767,
|
| 109 |
+
"race/accuracy/test/middle": 0.3321727019498607,
|
| 110 |
+
"race/accuracy/group_average": 0.2997341497170687,
|
| 111 |
+
"race/accuracy/seq_average": 0.28617754357519254,
|
| 112 |
+
"siqa/accuracy/dev": 0.35363357215967245,
|
| 113 |
+
"siqa/accuracy/group_average": 0.35363357215967245,
|
| 114 |
+
"siqa/accuracy/seq_average": 0.35363357215967245,
|
| 115 |
+
"winogrande/accuracy/dev": 0.5146014206787688,
|
| 116 |
+
"winogrande/accuracy/group_average": 0.5146014206787688,
|
| 117 |
+
"winogrande/accuracy/seq_average": 0.5146014206787688,
|
| 118 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.257985257985258,
|
| 119 |
+
"commonsenseqa/accuracy/group_average": 0.257985257985258,
|
| 120 |
+
"commonsenseqa/accuracy/seq_average": 0.257985257985258
|
| 121 |
+
}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-70000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6388641841827876, "val/accuracy": 0.47556849888392855, "val/perplexity": 13.9972962250647, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.565198815387228, "lambada/accuracy/total": 0.22612577639751552, "lambada/accuracy/openai_last_token": 0.7558229813664596, "lambada/perplexity": 13.305536159590453, "lambada/lm_loss": 3.2003852101107966, "lambada/lm_perplexity": 24.541982196170622, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.35084713764072206, "mean_loss": 2.602031499785008, "blimp/accuracy/passive_2": 0.893, "blimp/accuracy/determiner_noun_agreement_2": 0.977, "blimp/accuracy/ellipsis_n_bar_1": 0.794, "blimp/accuracy/tough_vs_raising_2": 0.894, "blimp/accuracy/tough_vs_raising_1": 0.565, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.906, "blimp/accuracy/principle_A_reconstruction": 0.339, "blimp/accuracy/wh_vs_that_with_gap": 0.528, "blimp/accuracy/principle_A_domain_2": 0.806, "blimp/accuracy/determiner_noun_agreement_1": 0.979, "blimp/accuracy/ellipsis_n_bar_2": 0.887, "blimp/accuracy/principle_A_domain_3": 0.574, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.892, "blimp/accuracy/animate_subject_trans": 0.904, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.87, "blimp/accuracy/distractor_agreement_relative_clause": 0.636, "blimp/accuracy/transitive": 0.861, "blimp/accuracy/sentential_subject_island": 0.383, "blimp/accuracy/adjunct_island": 0.796, "blimp/accuracy/intransitive": 0.8, "blimp/accuracy/existential_there_subject_raising": 0.856, "blimp/accuracy/irregular_past_participle_adjectives": 0.94, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.453, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.289, "blimp/accuracy/only_npi_scope": 0.712, "blimp/accuracy/superlative_quantifiers_2": 0.775, "blimp/accuracy/passive_1": 0.875, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.923, "blimp/accuracy/inchoative": 0.628, "blimp/accuracy/anaphor_gender_agreement": 0.955, "blimp/accuracy/principle_A_c_command": 0.585, "blimp/accuracy/only_npi_licensor_present": 0.554, "blimp/accuracy/expletive_it_object_raising": 0.835, "blimp/accuracy/left_branch_island_simple_question": 0.541, "blimp/accuracy/wh_questions_subject_gap": 0.921, "blimp/accuracy/existential_there_quantifiers_2": 0.351, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.937, "blimp/accuracy/sentential_negation_npi_scope": 0.618, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.793, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.892, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.892, "blimp/accuracy/principle_A_case_2": 0.977, "blimp/accuracy/distractor_agreement_relational_noun": 0.78, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.99, "blimp/accuracy/superlative_quantifiers_1": 0.563, "blimp/accuracy/wh_island": 0.796, "blimp/accuracy/principle_A_domain_1": 0.98, "blimp/accuracy/complex_NP_island": 0.517, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.95, "blimp/accuracy/irregular_past_participle_verbs": 0.875, "blimp/accuracy/drop_argument": 0.757, "blimp/accuracy/wh_questions_object_gap": 0.778, "blimp/accuracy/animate_subject_passive": 0.792, "blimp/accuracy/existential_there_quantifiers_1": 0.98, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.869, "blimp/accuracy/npi_present_2": 0.519, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.91, "blimp/accuracy/anaphor_number_agreement": 0.986, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.955, "blimp/accuracy/existential_there_object_raising": 0.827, "blimp/accuracy/matrix_question_npi_licensor_present": 0.192, "blimp/accuracy/npi_present_1": 0.49, "blimp/accuracy/wh_vs_that_no_gap": 0.981, "blimp/accuracy/left_branch_island_echo_question": 0.513, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.979, "blimp/accuracy/causative": 0.701, "blimp/accuracy/group_average": 0.7651641791044774, "blimp/accuracy/seq_average": 0.7651641791044776, "cbt/accuracy/NE": 0.7520032051282052, "cbt/accuracy/V": 0.9052, "cbt/accuracy/CN": 0.8112, "cbt/accuracy/P": 0.888, "cbt/accuracy/group_average": 0.8391008012820512, "cbt/accuracy/seq_average": 0.8391356542617047, "hellaswag/accuracy/val": 0.28908583947420835, "hellaswag/accuracy/group_average": 0.28908583947420835, "hellaswag/accuracy/seq_average": 0.28908583947420835, "piqa/accuracy/val": 0.5767138193688792, "piqa/accuracy/group_average": 0.5767138193688792, "piqa/accuracy/seq_average": 0.5767138193688792, "ai2arc/accuracy/ARC-Easy": 0.32684989429175476, "ai2arc/accuracy/ARC-Challenge": 0.21630901287553647, "ai2arc/accuracy/group_average": 0.2715794535836456, "ai2arc/accuracy/seq_average": 0.29036827195467424, "mmlu/accuracy/MMLU": 0.2627100464783697, "mmlu/accuracy/group_average": 0.2627100464783697, "mmlu/accuracy/seq_average": 0.2627100464783697, "openbookqa/accuracy/test": 0.264, "openbookqa/accuracy/group_average": 0.264, "openbookqa/accuracy/seq_average": 0.264, "race/accuracy/test/high": 0.2698684962835906, "race/accuracy/test/middle": 0.3412256267409471, "race/accuracy/group_average": 0.30554706151226885, "race/accuracy/seq_average": 0.29063640048642075, "siqa/accuracy/dev": 0.35056294779938585, "siqa/accuracy/group_average": 0.35056294779938585, "siqa/accuracy/seq_average": 0.35056294779938585, "winogrande/accuracy/dev": 0.5201262825572218, "winogrande/accuracy/group_average": 0.5201262825572218, "winogrande/accuracy/seq_average": 0.5201262825572218, "commonsenseqa/accuracy/dev_rand_split": 0.26126126126126126, "commonsenseqa/accuracy/group_average": 0.26126126126126126, "commonsenseqa/accuracy/seq_average": 0.26126126126126126}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-80000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.617030552455357, "val/accuracy": 0.4783082992311508, "val/perplexity": 13.694996572694052, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4720504476416925, "lambada/accuracy/total": 0.2542701863354037, "lambada/accuracy/openai_last_token": 0.7618400621118012, "lambada/perplexity": 12.100692094117893, "lambada/lm_loss": 3.1811027045374547, "lambada/lm_perplexity": 24.073284641605433, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.36628924278327724, "mean_loss": 2.544540500048525, "blimp/accuracy/passive_2": 0.908, "blimp/accuracy/determiner_noun_agreement_2": 0.974, "blimp/accuracy/ellipsis_n_bar_1": 0.776, "blimp/accuracy/tough_vs_raising_2": 0.902, "blimp/accuracy/tough_vs_raising_1": 0.562, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.921, "blimp/accuracy/principle_A_reconstruction": 0.32, "blimp/accuracy/wh_vs_that_with_gap": 0.519, "blimp/accuracy/principle_A_domain_2": 0.803, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.9, "blimp/accuracy/principle_A_domain_3": 0.578, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.899, "blimp/accuracy/animate_subject_trans": 0.902, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.889, "blimp/accuracy/distractor_agreement_relative_clause": 0.661, "blimp/accuracy/transitive": 0.873, "blimp/accuracy/sentential_subject_island": 0.383, "blimp/accuracy/adjunct_island": 0.805, "blimp/accuracy/intransitive": 0.794, "blimp/accuracy/existential_there_subject_raising": 0.877, "blimp/accuracy/irregular_past_participle_adjectives": 0.973, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.435, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.285, "blimp/accuracy/only_npi_scope": 0.697, "blimp/accuracy/superlative_quantifiers_2": 0.837, "blimp/accuracy/passive_1": 0.895, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.918, "blimp/accuracy/inchoative": 0.62, "blimp/accuracy/anaphor_gender_agreement": 0.969, "blimp/accuracy/principle_A_c_command": 0.595, "blimp/accuracy/only_npi_licensor_present": 0.684, "blimp/accuracy/expletive_it_object_raising": 0.834, "blimp/accuracy/left_branch_island_simple_question": 0.53, "blimp/accuracy/wh_questions_subject_gap": 0.911, "blimp/accuracy/existential_there_quantifiers_2": 0.327, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.928, "blimp/accuracy/sentential_negation_npi_scope": 0.608, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.774, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.89, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.898, "blimp/accuracy/principle_A_case_2": 0.971, "blimp/accuracy/distractor_agreement_relational_noun": 0.789, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.996, "blimp/accuracy/superlative_quantifiers_1": 0.645, "blimp/accuracy/wh_island": 0.875, "blimp/accuracy/principle_A_domain_1": 0.974, "blimp/accuracy/complex_NP_island": 0.492, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.954, "blimp/accuracy/irregular_past_participle_verbs": 0.894, "blimp/accuracy/drop_argument": 0.765, "blimp/accuracy/wh_questions_object_gap": 0.814, "blimp/accuracy/animate_subject_passive": 0.798, "blimp/accuracy/existential_there_quantifiers_1": 0.97, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.871, "blimp/accuracy/npi_present_2": 0.546, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.927, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.956, "blimp/accuracy/existential_there_object_raising": 0.83, "blimp/accuracy/matrix_question_npi_licensor_present": 0.222, "blimp/accuracy/npi_present_1": 0.513, "blimp/accuracy/wh_vs_that_no_gap": 0.978, "blimp/accuracy/left_branch_island_echo_question": 0.489, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.971, "blimp/accuracy/causative": 0.723, "blimp/accuracy/group_average": 0.7730895522388059, "blimp/accuracy/seq_average": 0.7730895522388059, "cbt/accuracy/NE": 0.7544070512820513, "cbt/accuracy/V": 0.9072, "cbt/accuracy/CN": 0.8148, "cbt/accuracy/P": 0.8844, "cbt/accuracy/group_average": 0.8402017628205128, "cbt/accuracy/seq_average": 0.8402360944377751, "hellaswag/accuracy/val": 0.2898824935271858, "hellaswag/accuracy/group_average": 0.2898824935271858, "hellaswag/accuracy/seq_average": 0.2898824935271858, "piqa/accuracy/val": 0.5854189336235038, "piqa/accuracy/group_average": 0.5854189336235038, "piqa/accuracy/seq_average": 0.5854189336235038, "ai2arc/accuracy/ARC-Easy": 0.33699788583509516, "ai2arc/accuracy/ARC-Challenge": 0.21802575107296138, "ai2arc/accuracy/group_average": 0.2775118184540283, "ai2arc/accuracy/seq_average": 0.29773371104815866, "mmlu/accuracy/MMLU": 0.26235252055774044, "mmlu/accuracy/group_average": 0.26235252055774044, "mmlu/accuracy/seq_average": 0.26235252055774044, "openbookqa/accuracy/test": 0.27, "openbookqa/accuracy/group_average": 0.27, "openbookqa/accuracy/seq_average": 0.27, "race/accuracy/test/high": 0.2730131503716409, "race/accuracy/test/middle": 0.3516713091922006, "race/accuracy/group_average": 0.3123422297819207, "race/accuracy/seq_average": 0.2959059586542359, "siqa/accuracy/dev": 0.3577277379733879, "siqa/accuracy/group_average": 0.3577277379733879, "siqa/accuracy/seq_average": 0.3577277379733879, "winogrande/accuracy/dev": 0.5130228887134964, "winogrande/accuracy/group_average": 0.5130228887134964, "winogrande/accuracy/seq_average": 0.5130228887134964, "commonsenseqa/accuracy/dev_rand_split": 0.26044226044226043, "commonsenseqa/accuracy/group_average": 0.26044226044226043, "commonsenseqa/accuracy/seq_average": 0.26044226044226043}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_sigmoidonly/export/result-model-90000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6044389028397816, "val/accuracy": 0.4801093207465278, "val/perplexity": 13.523635100275662, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5004244028411295, "lambada/accuracy/total": 0.25271739130434784, "lambada/accuracy/openai_last_token": 0.7626164596273292, "lambada/perplexity": 11.894440782385615, "lambada/lm_loss": 3.154970855193684, "lambada/lm_perplexity": 23.452353567168107, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3664133560254378, "mean_loss": 2.552431652840456, "blimp/accuracy/passive_2": 0.896, "blimp/accuracy/determiner_noun_agreement_2": 0.979, "blimp/accuracy/ellipsis_n_bar_1": 0.79, "blimp/accuracy/tough_vs_raising_2": 0.917, "blimp/accuracy/tough_vs_raising_1": 0.529, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.935, "blimp/accuracy/principle_A_reconstruction": 0.346, "blimp/accuracy/wh_vs_that_with_gap": 0.528, "blimp/accuracy/principle_A_domain_2": 0.809, "blimp/accuracy/determiner_noun_agreement_1": 0.984, "blimp/accuracy/ellipsis_n_bar_2": 0.893, "blimp/accuracy/principle_A_domain_3": 0.56, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.913, "blimp/accuracy/animate_subject_trans": 0.897, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.887, "blimp/accuracy/distractor_agreement_relative_clause": 0.661, "blimp/accuracy/transitive": 0.857, "blimp/accuracy/sentential_subject_island": 0.387, "blimp/accuracy/adjunct_island": 0.809, "blimp/accuracy/intransitive": 0.779, "blimp/accuracy/existential_there_subject_raising": 0.872, "blimp/accuracy/irregular_past_participle_adjectives": 0.92, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.427, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.286, "blimp/accuracy/only_npi_scope": 0.647, "blimp/accuracy/superlative_quantifiers_2": 0.674, "blimp/accuracy/passive_1": 0.882, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.901, "blimp/accuracy/inchoative": 0.629, "blimp/accuracy/anaphor_gender_agreement": 0.967, "blimp/accuracy/principle_A_c_command": 0.587, "blimp/accuracy/only_npi_licensor_present": 0.64, "blimp/accuracy/expletive_it_object_raising": 0.824, "blimp/accuracy/left_branch_island_simple_question": 0.512, "blimp/accuracy/wh_questions_subject_gap": 0.916, "blimp/accuracy/existential_there_quantifiers_2": 0.451, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.95, "blimp/accuracy/sentential_negation_npi_scope": 0.651, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.776, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.869, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.91, "blimp/accuracy/principle_A_case_2": 0.963, "blimp/accuracy/distractor_agreement_relational_noun": 0.799, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.993, "blimp/accuracy/superlative_quantifiers_1": 0.66, "blimp/accuracy/wh_island": 0.859, "blimp/accuracy/principle_A_domain_1": 0.972, "blimp/accuracy/complex_NP_island": 0.523, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.964, "blimp/accuracy/irregular_past_participle_verbs": 0.906, "blimp/accuracy/drop_argument": 0.756, "blimp/accuracy/wh_questions_object_gap": 0.797, "blimp/accuracy/animate_subject_passive": 0.793, "blimp/accuracy/existential_there_quantifiers_1": 0.988, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.891, "blimp/accuracy/npi_present_2": 0.545, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.944, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.958, "blimp/accuracy/existential_there_object_raising": 0.842, "blimp/accuracy/matrix_question_npi_licensor_present": 0.232, "blimp/accuracy/npi_present_1": 0.508, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.48, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.98, "blimp/accuracy/causative": 0.723, "blimp/accuracy/group_average": 0.7715671641791044, "blimp/accuracy/seq_average": 0.7715671641791044, "cbt/accuracy/NE": 0.7568108974358975, "cbt/accuracy/V": 0.91, "cbt/accuracy/CN": 0.8204, "cbt/accuracy/P": 0.8884, "cbt/accuracy/group_average": 0.8439027243589743, "cbt/accuracy/seq_average": 0.843937575030012, "hellaswag/accuracy/val": 0.2884883489344752, "hellaswag/accuracy/group_average": 0.2884883489344752, "hellaswag/accuracy/seq_average": 0.2884883489344752, "piqa/accuracy/val": 0.5854189336235038, "piqa/accuracy/group_average": 0.5854189336235038, "piqa/accuracy/seq_average": 0.5854189336235038, "ai2arc/accuracy/ARC-Easy": 0.33742071881606767, "ai2arc/accuracy/ARC-Challenge": 0.21201716738197424, "ai2arc/accuracy/group_average": 0.27471894309902095, "ai2arc/accuracy/seq_average": 0.29603399433427763, "mmlu/accuracy/MMLU": 0.25827672506256705, "mmlu/accuracy/group_average": 0.25827672506256705, "mmlu/accuracy/seq_average": 0.25827672506256705, "openbookqa/accuracy/test": 0.272, "openbookqa/accuracy/group_average": 0.272, "openbookqa/accuracy/seq_average": 0.272, "race/accuracy/test/high": 0.27072612921669525, "race/accuracy/test/middle": 0.3467966573816156, "race/accuracy/group_average": 0.3087613932991554, "race/accuracy/seq_average": 0.29286582894203483, "siqa/accuracy/dev": 0.3592630501535312, "siqa/accuracy/group_average": 0.3592630501535312, "siqa/accuracy/seq_average": 0.3592630501535312, "winogrande/accuracy/dev": 0.5074980268350434, "winogrande/accuracy/group_average": 0.5074980268350434, "winogrande/accuracy/seq_average": 0.5074980268350434, "commonsenseqa/accuracy/dev_rand_split": 0.2588042588042588, "commonsenseqa/accuracy/group_average": 0.2588042588042588, "commonsenseqa/accuracy/seq_average": 0.2588042588042588}
|