Upload folder using huggingface_hub

#295
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-10000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 3.026703123062376, "val/accuracy": 0.42361207992311506, "val/perplexity": 20.629108720304792, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.718575424289111, "lambada/accuracy/total": 0.1578027950310559, "lambada/accuracy/openai_last_token": 0.719332298136646, "lambada/perplexity": 23.935558561617572, "lambada/lm_loss": 3.5599650199466972, "lambada/lm_perplexity": 35.161967156108844, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.2907074374770855, "mean_loss": 2.872639273675744, "blimp/accuracy/passive_2": 0.869, "blimp/accuracy/determiner_noun_agreement_2": 0.945, "blimp/accuracy/ellipsis_n_bar_1": 0.695, "blimp/accuracy/tough_vs_raising_2": 0.846, "blimp/accuracy/tough_vs_raising_1": 0.526, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.89, "blimp/accuracy/principle_A_reconstruction": 0.426, "blimp/accuracy/wh_vs_that_with_gap": 0.416, "blimp/accuracy/principle_A_domain_2": 0.78, "blimp/accuracy/determiner_noun_agreement_1": 0.97, "blimp/accuracy/ellipsis_n_bar_2": 0.872, "blimp/accuracy/principle_A_domain_3": 0.508, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.881, "blimp/accuracy/animate_subject_trans": 0.871, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.739, "blimp/accuracy/distractor_agreement_relative_clause": 0.459, "blimp/accuracy/transitive": 0.828, "blimp/accuracy/sentential_subject_island": 0.325, "blimp/accuracy/adjunct_island": 0.691, "blimp/accuracy/intransitive": 0.717, "blimp/accuracy/existential_there_subject_raising": 0.782, "blimp/accuracy/irregular_past_participle_adjectives": 0.953, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.191, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.143, "blimp/accuracy/only_npi_scope": 0.708, "blimp/accuracy/superlative_quantifiers_2": 0.769, "blimp/accuracy/passive_1": 0.856, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.872, "blimp/accuracy/inchoative": 0.538, "blimp/accuracy/anaphor_gender_agreement": 0.936, "blimp/accuracy/principle_A_c_command": 0.473, "blimp/accuracy/only_npi_licensor_present": 0.768, "blimp/accuracy/expletive_it_object_raising": 0.742, "blimp/accuracy/left_branch_island_simple_question": 0.203, "blimp/accuracy/wh_questions_subject_gap": 0.898, "blimp/accuracy/existential_there_quantifiers_2": 0.427, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.885, "blimp/accuracy/sentential_negation_npi_scope": 0.364, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.746, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.866, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.843, "blimp/accuracy/principle_A_case_2": 0.953, "blimp/accuracy/distractor_agreement_relational_noun": 0.701, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.996, "blimp/accuracy/superlative_quantifiers_1": 0.671, "blimp/accuracy/wh_island": 0.687, "blimp/accuracy/principle_A_domain_1": 0.952, "blimp/accuracy/complex_NP_island": 0.562, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.93, "blimp/accuracy/irregular_past_participle_verbs": 0.774, "blimp/accuracy/drop_argument": 0.74, "blimp/accuracy/wh_questions_object_gap": 0.733, "blimp/accuracy/animate_subject_passive": 0.746, "blimp/accuracy/existential_there_quantifiers_1": 0.971, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.838, "blimp/accuracy/npi_present_2": 0.594, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.851, "blimp/accuracy/anaphor_number_agreement": 0.975, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.906, "blimp/accuracy/existential_there_object_raising": 0.727, "blimp/accuracy/matrix_question_npi_licensor_present": 0.074, "blimp/accuracy/npi_present_1": 0.549, "blimp/accuracy/wh_vs_that_no_gap": 0.962, "blimp/accuracy/left_branch_island_echo_question": 0.563, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.976, "blimp/accuracy/causative": 0.637, "blimp/accuracy/group_average": 0.7206716417910447, "blimp/accuracy/seq_average": 0.7206716417910448, "cbt/accuracy/NE": 0.6919070512820513, "cbt/accuracy/V": 0.8524, "cbt/accuracy/CN": 0.7304, "cbt/accuracy/P": 0.842, "cbt/accuracy/group_average": 0.7791767628205128, "cbt/accuracy/seq_average": 0.7792116846738696, "hellaswag/accuracy/val": 0.27235610436168095, "hellaswag/accuracy/group_average": 0.27235610436168095, "hellaswag/accuracy/seq_average": 0.27235610436168095, "piqa/accuracy/val": 0.5511425462459195, "piqa/accuracy/group_average": 0.5511425462459195, "piqa/accuracy/seq_average": 0.5511425462459195, "ai2arc/accuracy/ARC-Easy": 0.3145877378435518, "ai2arc/accuracy/ARC-Challenge": 0.20257510729613734, "ai2arc/accuracy/group_average": 0.25858142256984457, "ai2arc/accuracy/seq_average": 0.2776203966005666, "mmlu/accuracy/MMLU": 0.2634250983196282, "mmlu/accuracy/group_average": 0.2634250983196282, "mmlu/accuracy/seq_average": 0.2634250983196282, "openbookqa/accuracy/test": 0.26, "openbookqa/accuracy/group_average": 0.26, "openbookqa/accuracy/seq_average": 0.26, "race/accuracy/test/high": 0.26157804459691253, "race/accuracy/test/middle": 0.32590529247910865, "race/accuracy/group_average": 0.2937416685380106, "race/accuracy/seq_average": 0.2802999594649372, "siqa/accuracy/dev": 0.34646878198567044, "siqa/accuracy/group_average": 0.34646878198567044, "siqa/accuracy/seq_average": 0.34646878198567044, "winogrande/accuracy/dev": 0.5043409629044988, "winogrande/accuracy/group_average": 0.5043409629044988, "winogrande/accuracy/seq_average": 0.5043409629044988, "commonsenseqa/accuracy/dev_rand_split": 0.23587223587223588, "commonsenseqa/accuracy/group_average": 0.23587223587223588, "commonsenseqa/accuracy/seq_average": 0.23587223587223588}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-100000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.6108267647879466,
3
+ "val/accuracy": 0.47982545882936506,
4
+ "val/perplexity": 13.610298717253553,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.616000536806095,
8
+ "lambada/accuracy/total": 0.2542701863354037,
9
+ "lambada/accuracy/openai_last_token": 0.7624223602484472,
10
+ "lambada/perplexity": 11.732446061898097,
11
+ "lambada/lm_loss": 3.17926517792789,
12
+ "lambada/lm_perplexity": 24.029089957386283,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.3670478225823844,
16
+ "mean_loss": 2.6134136507970207,
17
+ "blimp/accuracy/passive_2": 0.921,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.987,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.845,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.871,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.64,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.923,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.327,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.548,
25
+ "blimp/accuracy/principle_A_domain_2": 0.779,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.99,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.91,
28
+ "blimp/accuracy/principle_A_domain_3": 0.55,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.918,
30
+ "blimp/accuracy/animate_subject_trans": 0.907,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.888,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.664,
33
+ "blimp/accuracy/transitive": 0.879,
34
+ "blimp/accuracy/sentential_subject_island": 0.333,
35
+ "blimp/accuracy/adjunct_island": 0.801,
36
+ "blimp/accuracy/intransitive": 0.797,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.851,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.953,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.401,
40
+ "blimp/accuracy/principle_A_case_1": 1.0,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.278,
42
+ "blimp/accuracy/only_npi_scope": 0.61,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.821,
44
+ "blimp/accuracy/passive_1": 0.904,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.898,
46
+ "blimp/accuracy/inchoative": 0.629,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.971,
48
+ "blimp/accuracy/principle_A_c_command": 0.57,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.446,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.768,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.447,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.907,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.525,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.926,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.595,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.844,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.883,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.901,
59
+ "blimp/accuracy/principle_A_case_2": 0.959,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.824,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.991,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.848,
63
+ "blimp/accuracy/wh_island": 0.777,
64
+ "blimp/accuracy/principle_A_domain_1": 0.984,
65
+ "blimp/accuracy/complex_NP_island": 0.545,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.963,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.908,
68
+ "blimp/accuracy/drop_argument": 0.792,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.773,
70
+ "blimp/accuracy/animate_subject_passive": 0.802,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.98,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.878,
73
+ "blimp/accuracy/npi_present_2": 0.601,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.926,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.991,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.948,
77
+ "blimp/accuracy/existential_there_object_raising": 0.835,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.244,
79
+ "blimp/accuracy/npi_present_1": 0.569,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.974,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.442,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.979,
83
+ "blimp/accuracy/causative": 0.708,
84
+ "blimp/accuracy/group_average": 0.7738358208955224,
85
+ "blimp/accuracy/seq_average": 0.7738358208955224,
86
+ "cbt/accuracy/NE": 0.7568108974358975,
87
+ "cbt/accuracy/V": 0.9088,
88
+ "cbt/accuracy/CN": 0.8176,
89
+ "cbt/accuracy/P": 0.886,
90
+ "cbt/accuracy/group_average": 0.8423027243589745,
91
+ "cbt/accuracy/seq_average": 0.8423369347739096,
92
+ "hellaswag/accuracy/val": 0.29127663811989646,
93
+ "hellaswag/accuracy/group_average": 0.29127663811989646,
94
+ "hellaswag/accuracy/seq_average": 0.29127663811989646,
95
+ "piqa/accuracy/val": 0.5892274211099021,
96
+ "piqa/accuracy/group_average": 0.5892274211099021,
97
+ "piqa/accuracy/seq_average": 0.5892274211099021,
98
+ "ai2arc/accuracy/ARC-Easy": 0.32727272727272727,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.21373390557939914,
100
+ "ai2arc/accuracy/group_average": 0.2705033164260632,
101
+ "ai2arc/accuracy/seq_average": 0.2898016997167139,
102
+ "mmlu/accuracy/MMLU": 0.2637111190561316,
103
+ "mmlu/accuracy/group_average": 0.2637111190561316,
104
+ "mmlu/accuracy/seq_average": 0.2637111190561316,
105
+ "openbookqa/accuracy/test": 0.272,
106
+ "openbookqa/accuracy/group_average": 0.272,
107
+ "openbookqa/accuracy/seq_average": 0.272,
108
+ "race/accuracy/test/high": 0.26786735277301316,
109
+ "race/accuracy/test/middle": 0.3530640668523677,
110
+ "race/accuracy/group_average": 0.3104657098126904,
111
+ "race/accuracy/seq_average": 0.29266315362788814,
112
+ "siqa/accuracy/dev": 0.3490276356192426,
113
+ "siqa/accuracy/group_average": 0.3490276356192426,
114
+ "siqa/accuracy/seq_average": 0.3490276356192426,
115
+ "winogrande/accuracy/dev": 0.5090765588003157,
116
+ "winogrande/accuracy/group_average": 0.5090765588003157,
117
+ "winogrande/accuracy/seq_average": 0.5090765588003157,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.24897624897624898,
119
+ "commonsenseqa/accuracy/group_average": 0.24897624897624898,
120
+ "commonsenseqa/accuracy/seq_average": 0.24897624897624898
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-20000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.8830077156187994,
3
+ "val/accuracy": 0.441619388640873,
4
+ "val/perplexity": 17.867934105241176,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.6007561535568713,
8
+ "lambada/accuracy/total": 0.18847049689440995,
9
+ "lambada/accuracy/openai_last_token": 0.7325310559006211,
10
+ "lambada/perplexity": 17.842352575679747,
11
+ "lambada/lm_loss": 3.4032095429859703,
12
+ "lambada/lm_perplexity": 30.06042561236697,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.3150449427676415,
16
+ "mean_loss": 2.7418819345878354,
17
+ "blimp/accuracy/passive_2": 0.878,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.963,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.75,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.885,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.537,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.893,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.505,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.529,
25
+ "blimp/accuracy/principle_A_domain_2": 0.837,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.978,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.895,
28
+ "blimp/accuracy/principle_A_domain_3": 0.569,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.878,
30
+ "blimp/accuracy/animate_subject_trans": 0.882,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.8,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.555,
33
+ "blimp/accuracy/transitive": 0.848,
34
+ "blimp/accuracy/sentential_subject_island": 0.377,
35
+ "blimp/accuracy/adjunct_island": 0.699,
36
+ "blimp/accuracy/intransitive": 0.705,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.822,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.902,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.29,
40
+ "blimp/accuracy/principle_A_case_1": 0.999,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.187,
42
+ "blimp/accuracy/only_npi_scope": 0.789,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.927,
44
+ "blimp/accuracy/passive_1": 0.889,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.891,
46
+ "blimp/accuracy/inchoative": 0.525,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.924,
48
+ "blimp/accuracy/principle_A_c_command": 0.466,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.767,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.733,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.373,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.879,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.416,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.901,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.486,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.786,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.886,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.867,
59
+ "blimp/accuracy/principle_A_case_2": 0.933,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.818,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.993,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.702,
63
+ "blimp/accuracy/wh_island": 0.823,
64
+ "blimp/accuracy/principle_A_domain_1": 0.991,
65
+ "blimp/accuracy/complex_NP_island": 0.588,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.906,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.868,
68
+ "blimp/accuracy/drop_argument": 0.734,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.751,
70
+ "blimp/accuracy/animate_subject_passive": 0.733,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.986,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.864,
73
+ "blimp/accuracy/npi_present_2": 0.607,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.861,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.963,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.926,
77
+ "blimp/accuracy/existential_there_object_raising": 0.765,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.121,
79
+ "blimp/accuracy/npi_present_1": 0.573,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.959,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.371,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.982,
83
+ "blimp/accuracy/causative": 0.691,
84
+ "blimp/accuracy/group_average": 0.7489104477611941,
85
+ "blimp/accuracy/seq_average": 0.748910447761194,
86
+ "cbt/accuracy/NE": 0.7071314102564102,
87
+ "cbt/accuracy/V": 0.8764,
88
+ "cbt/accuracy/CN": 0.7684,
89
+ "cbt/accuracy/P": 0.85,
90
+ "cbt/accuracy/group_average": 0.8004828525641026,
91
+ "cbt/accuracy/seq_average": 0.8005202080832333,
92
+ "hellaswag/accuracy/val": 0.2741485759808803,
93
+ "hellaswag/accuracy/group_average": 0.2741485759808803,
94
+ "hellaswag/accuracy/seq_average": 0.2741485759808803,
95
+ "piqa/accuracy/val": 0.5544069640914037,
96
+ "piqa/accuracy/group_average": 0.5544069640914037,
97
+ "piqa/accuracy/seq_average": 0.5544069640914037,
98
+ "ai2arc/accuracy/ARC-Easy": 0.30655391120507397,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.2,
100
+ "ai2arc/accuracy/group_average": 0.253276955602537,
101
+ "ai2arc/accuracy/seq_average": 0.27138810198300284,
102
+ "mmlu/accuracy/MMLU": 0.26414015016088666,
103
+ "mmlu/accuracy/group_average": 0.26414015016088666,
104
+ "mmlu/accuracy/seq_average": 0.26414015016088666,
105
+ "openbookqa/accuracy/test": 0.282,
106
+ "openbookqa/accuracy/group_average": 0.282,
107
+ "openbookqa/accuracy/seq_average": 0.282,
108
+ "race/accuracy/test/high": 0.26043453401943967,
109
+ "race/accuracy/test/middle": 0.32520891364902504,
110
+ "race/accuracy/group_average": 0.29282172383423233,
111
+ "race/accuracy/seq_average": 0.2792865828942035,
112
+ "siqa/accuracy/dev": 0.35363357215967245,
113
+ "siqa/accuracy/group_average": 0.35363357215967245,
114
+ "siqa/accuracy/seq_average": 0.35363357215967245,
115
+ "winogrande/accuracy/dev": 0.5074980268350434,
116
+ "winogrande/accuracy/group_average": 0.5074980268350434,
117
+ "winogrande/accuracy/seq_average": 0.5074980268350434,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.22932022932022933,
119
+ "commonsenseqa/accuracy/group_average": 0.22932022932022933,
120
+ "commonsenseqa/accuracy/seq_average": 0.22932022932022933
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-30000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.807494148375496, "val/accuracy": 0.4513685438368056, "val/perplexity": 16.56834833396943, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5816548033530666, "lambada/accuracy/total": 0.2047748447204969, "lambada/accuracy/openai_last_token": 0.7393245341614907, "lambada/perplexity": 16.395440770895988, "lambada/lm_loss": 3.357957831173838, "lambada/lm_perplexity": 28.730458482245506, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.32807169427865124, "mean_loss": 2.6945744758642816, "blimp/accuracy/passive_2": 0.909, "blimp/accuracy/determiner_noun_agreement_2": 0.964, "blimp/accuracy/ellipsis_n_bar_1": 0.814, "blimp/accuracy/tough_vs_raising_2": 0.881, "blimp/accuracy/tough_vs_raising_1": 0.596, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.911, "blimp/accuracy/principle_A_reconstruction": 0.466, "blimp/accuracy/wh_vs_that_with_gap": 0.582, "blimp/accuracy/principle_A_domain_2": 0.773, "blimp/accuracy/determiner_noun_agreement_1": 0.976, "blimp/accuracy/ellipsis_n_bar_2": 0.885, "blimp/accuracy/principle_A_domain_3": 0.543, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.897, "blimp/accuracy/animate_subject_trans": 0.882, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.829, "blimp/accuracy/distractor_agreement_relative_clause": 0.584, "blimp/accuracy/transitive": 0.848, "blimp/accuracy/sentential_subject_island": 0.328, "blimp/accuracy/adjunct_island": 0.757, "blimp/accuracy/intransitive": 0.763, "blimp/accuracy/existential_there_subject_raising": 0.842, "blimp/accuracy/irregular_past_participle_adjectives": 0.919, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.214, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.229, "blimp/accuracy/only_npi_scope": 0.716, "blimp/accuracy/superlative_quantifiers_2": 0.81, "blimp/accuracy/passive_1": 0.883, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.913, "blimp/accuracy/inchoative": 0.609, "blimp/accuracy/anaphor_gender_agreement": 0.958, "blimp/accuracy/principle_A_c_command": 0.521, "blimp/accuracy/only_npi_licensor_present": 0.195, "blimp/accuracy/expletive_it_object_raising": 0.762, "blimp/accuracy/left_branch_island_simple_question": 0.263, "blimp/accuracy/wh_questions_subject_gap": 0.86, "blimp/accuracy/existential_there_quantifiers_2": 0.363, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.909, "blimp/accuracy/sentential_negation_npi_scope": 0.479, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.802, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.864, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.908, "blimp/accuracy/principle_A_case_2": 0.914, "blimp/accuracy/distractor_agreement_relational_noun": 0.753, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.72, "blimp/accuracy/wh_island": 0.756, "blimp/accuracy/principle_A_domain_1": 0.991, "blimp/accuracy/complex_NP_island": 0.526, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.93, "blimp/accuracy/irregular_past_participle_verbs": 0.86, "blimp/accuracy/drop_argument": 0.778, "blimp/accuracy/wh_questions_object_gap": 0.716, "blimp/accuracy/animate_subject_passive": 0.795, "blimp/accuracy/existential_there_quantifiers_1": 0.978, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.891, "blimp/accuracy/npi_present_2": 0.583, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.914, "blimp/accuracy/anaphor_number_agreement": 0.978, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.927, "blimp/accuracy/existential_there_object_raising": 0.79, "blimp/accuracy/matrix_question_npi_licensor_present": 0.211, "blimp/accuracy/npi_present_1": 0.473, "blimp/accuracy/wh_vs_that_no_gap": 0.957, "blimp/accuracy/left_branch_island_echo_question": 0.382, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.674, "blimp/accuracy/group_average": 0.7413880597014925, "blimp/accuracy/seq_average": 0.7413880597014926, "cbt/accuracy/NE": 0.735176282051282, "cbt/accuracy/V": 0.8948, "cbt/accuracy/CN": 0.7812, "cbt/accuracy/P": 0.8648, "cbt/accuracy/group_average": 0.8189940705128205, "cbt/accuracy/seq_average": 0.8190276110444178, "hellaswag/accuracy/val": 0.27823142800239, "hellaswag/accuracy/group_average": 0.27823142800239, "hellaswag/accuracy/seq_average": 0.27823142800239, "piqa/accuracy/val": 0.5696409140369967, "piqa/accuracy/group_average": 0.5696409140369967, "piqa/accuracy/seq_average": 0.5696409140369967, "ai2arc/accuracy/ARC-Easy": 0.3086680761099366, "ai2arc/accuracy/ARC-Challenge": 0.2111587982832618, "ai2arc/accuracy/group_average": 0.2599134371965992, "ai2arc/accuracy/seq_average": 0.27648725212464587, "mmlu/accuracy/MMLU": 0.26499821237039684, "mmlu/accuracy/group_average": 0.26499821237039684, "mmlu/accuracy/seq_average": 0.26499821237039684, "openbookqa/accuracy/test": 0.28, "openbookqa/accuracy/group_average": 0.28, "openbookqa/accuracy/seq_average": 0.28, "race/accuracy/test/high": 0.2641509433962264, "race/accuracy/test/middle": 0.3342618384401114, "race/accuracy/group_average": 0.2992063909181689, "race/accuracy/seq_average": 0.28455614106201865, "siqa/accuracy/dev": 0.3546571136131013, "siqa/accuracy/group_average": 0.3546571136131013, "siqa/accuracy/seq_average": 0.3546571136131013, "winogrande/accuracy/dev": 0.5027624309392266, "winogrande/accuracy/group_average": 0.5027624309392266, "winogrande/accuracy/seq_average": 0.5027624309392266, "commonsenseqa/accuracy/dev_rand_split": 0.24078624078624078, "commonsenseqa/accuracy/group_average": 0.24078624078624078, "commonsenseqa/accuracy/seq_average": 0.24078624078624078}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-40000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.754880148266989,
3
+ "val/accuracy": 0.45884583488343256,
4
+ "val/perplexity": 15.719156821714105,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.5966694517905666,
8
+ "lambada/accuracy/total": 0.2105978260869565,
9
+ "lambada/accuracy/openai_last_token": 0.7472826086956522,
10
+ "lambada/perplexity": 15.714297672985598,
11
+ "lambada/lm_loss": 3.290272894701554,
12
+ "lambada/lm_perplexity": 26.850189930770128,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.33472183048519455,
16
+ "mean_loss": 2.675774800028778,
17
+ "blimp/accuracy/passive_2": 0.879,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.98,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.831,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.857,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.622,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.922,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.22,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.523,
25
+ "blimp/accuracy/principle_A_domain_2": 0.807,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.98,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.901,
28
+ "blimp/accuracy/principle_A_domain_3": 0.559,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.923,
30
+ "blimp/accuracy/animate_subject_trans": 0.883,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.869,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.658,
33
+ "blimp/accuracy/transitive": 0.844,
34
+ "blimp/accuracy/sentential_subject_island": 0.305,
35
+ "blimp/accuracy/adjunct_island": 0.807,
36
+ "blimp/accuracy/intransitive": 0.77,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.825,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.804,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.349,
40
+ "blimp/accuracy/principle_A_case_1": 1.0,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.192,
42
+ "blimp/accuracy/only_npi_scope": 0.648,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.879,
44
+ "blimp/accuracy/passive_1": 0.875,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.879,
46
+ "blimp/accuracy/inchoative": 0.615,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.954,
48
+ "blimp/accuracy/principle_A_c_command": 0.562,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.714,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.748,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.441,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.915,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.371,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.918,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.536,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.815,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.911,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.896,
59
+ "blimp/accuracy/principle_A_case_2": 0.957,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.799,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.996,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.749,
63
+ "blimp/accuracy/wh_island": 0.733,
64
+ "blimp/accuracy/principle_A_domain_1": 0.985,
65
+ "blimp/accuracy/complex_NP_island": 0.561,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.958,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.899,
68
+ "blimp/accuracy/drop_argument": 0.794,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.755,
70
+ "blimp/accuracy/animate_subject_passive": 0.773,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.983,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.876,
73
+ "blimp/accuracy/npi_present_2": 0.607,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.919,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.987,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.942,
77
+ "blimp/accuracy/existential_there_object_raising": 0.827,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.176,
79
+ "blimp/accuracy/npi_present_1": 0.541,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.985,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.418,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.989,
83
+ "blimp/accuracy/causative": 0.685,
84
+ "blimp/accuracy/group_average": 0.7594179104477611,
85
+ "blimp/accuracy/seq_average": 0.7594179104477612,
86
+ "cbt/accuracy/NE": 0.7375801282051282,
87
+ "cbt/accuracy/V": 0.8924,
88
+ "cbt/accuracy/CN": 0.7892,
89
+ "cbt/accuracy/P": 0.8708,
90
+ "cbt/accuracy/group_average": 0.822495032051282,
91
+ "cbt/accuracy/seq_average": 0.8225290116046419,
92
+ "hellaswag/accuracy/val": 0.28251344353714397,
93
+ "hellaswag/accuracy/group_average": 0.28251344353714397,
94
+ "hellaswag/accuracy/seq_average": 0.28251344353714397,
95
+ "piqa/accuracy/val": 0.5723612622415669,
96
+ "piqa/accuracy/group_average": 0.5723612622415669,
97
+ "piqa/accuracy/seq_average": 0.5723612622415669,
98
+ "ai2arc/accuracy/ARC-Easy": 0.3171247357293869,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.1965665236051502,
100
+ "ai2arc/accuracy/group_average": 0.25684562966726854,
101
+ "ai2arc/accuracy/seq_average": 0.2773371104815864,
102
+ "mmlu/accuracy/MMLU": 0.2602788702180908,
103
+ "mmlu/accuracy/group_average": 0.2602788702180908,
104
+ "mmlu/accuracy/seq_average": 0.2602788702180908,
105
+ "openbookqa/accuracy/test": 0.28,
106
+ "openbookqa/accuracy/group_average": 0.28,
107
+ "openbookqa/accuracy/seq_average": 0.28,
108
+ "race/accuracy/test/high": 0.2667238421955403,
109
+ "race/accuracy/test/middle": 0.36002785515320335,
110
+ "race/accuracy/group_average": 0.31337584867437185,
111
+ "race/accuracy/seq_average": 0.2938792055127685,
112
+ "siqa/accuracy/dev": 0.3618219037871034,
113
+ "siqa/accuracy/group_average": 0.3618219037871034,
114
+ "siqa/accuracy/seq_average": 0.3618219037871034,
115
+ "winogrande/accuracy/dev": 0.4956590370955012,
116
+ "winogrande/accuracy/group_average": 0.4956590370955012,
117
+ "winogrande/accuracy/seq_average": 0.4956590370955012,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.24815724815724816,
119
+ "commonsenseqa/accuracy/group_average": 0.24815724815724816,
120
+ "commonsenseqa/accuracy/seq_average": 0.24815724815724816
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-50000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.7114097958519343, "val/accuracy": 0.4654444134424603, "val/perplexity": 15.050478667700487, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6101730062354425, "lambada/accuracy/total": 0.2284549689440994, "lambada/accuracy/openai_last_token": 0.7536878881987578, "lambada/perplexity": 13.857038591860729, "lambada/lm_loss": 3.282701740307414, "lambada/lm_perplexity": 26.647670617194198, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.34694969119327984, "mean_loss": 2.6607914010436886, "blimp/accuracy/passive_2": 0.897, "blimp/accuracy/determiner_noun_agreement_2": 0.977, "blimp/accuracy/ellipsis_n_bar_1": 0.836, "blimp/accuracy/tough_vs_raising_2": 0.887, "blimp/accuracy/tough_vs_raising_1": 0.573, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.905, "blimp/accuracy/principle_A_reconstruction": 0.35, "blimp/accuracy/wh_vs_that_with_gap": 0.509, "blimp/accuracy/principle_A_domain_2": 0.817, "blimp/accuracy/determiner_noun_agreement_1": 0.982, "blimp/accuracy/ellipsis_n_bar_2": 0.906, "blimp/accuracy/principle_A_domain_3": 0.554, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.893, "blimp/accuracy/animate_subject_trans": 0.895, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.843, "blimp/accuracy/distractor_agreement_relative_clause": 0.65, "blimp/accuracy/transitive": 0.859, "blimp/accuracy/sentential_subject_island": 0.366, "blimp/accuracy/adjunct_island": 0.773, "blimp/accuracy/intransitive": 0.751, "blimp/accuracy/existential_there_subject_raising": 0.842, "blimp/accuracy/irregular_past_participle_adjectives": 0.881, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.384, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.186, "blimp/accuracy/only_npi_scope": 0.683, "blimp/accuracy/superlative_quantifiers_2": 0.724, "blimp/accuracy/passive_1": 0.887, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.904, "blimp/accuracy/inchoative": 0.608, "blimp/accuracy/anaphor_gender_agreement": 0.969, "blimp/accuracy/principle_A_c_command": 0.558, "blimp/accuracy/only_npi_licensor_present": 0.338, "blimp/accuracy/expletive_it_object_raising": 0.793, "blimp/accuracy/left_branch_island_simple_question": 0.499, "blimp/accuracy/wh_questions_subject_gap": 0.924, "blimp/accuracy/existential_there_quantifiers_2": 0.454, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.904, "blimp/accuracy/sentential_negation_npi_scope": 0.484, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.852, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.902, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.894, "blimp/accuracy/principle_A_case_2": 0.959, "blimp/accuracy/distractor_agreement_relational_noun": 0.784, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.734, "blimp/accuracy/wh_island": 0.812, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.52, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.948, "blimp/accuracy/irregular_past_participle_verbs": 0.902, "blimp/accuracy/drop_argument": 0.775, "blimp/accuracy/wh_questions_object_gap": 0.814, "blimp/accuracy/animate_subject_passive": 0.78, "blimp/accuracy/existential_there_quantifiers_1": 0.991, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.889, "blimp/accuracy/npi_present_2": 0.583, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.909, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.942, "blimp/accuracy/existential_there_object_raising": 0.849, "blimp/accuracy/matrix_question_npi_licensor_present": 0.185, "blimp/accuracy/npi_present_1": 0.55, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.428, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.989, "blimp/accuracy/causative": 0.673, "blimp/accuracy/group_average": 0.7591791044776118, "blimp/accuracy/seq_average": 0.7591791044776119, "cbt/accuracy/NE": 0.7411858974358975, "cbt/accuracy/V": 0.9008, "cbt/accuracy/CN": 0.7964, "cbt/accuracy/P": 0.8752, "cbt/accuracy/group_average": 0.8283964743589743, "cbt/accuracy/seq_average": 0.8284313725490197, "hellaswag/accuracy/val": 0.28560047799243177, "hellaswag/accuracy/group_average": 0.28560047799243177, "hellaswag/accuracy/seq_average": 0.28560047799243177, "piqa/accuracy/val": 0.5680087051142546, "piqa/accuracy/group_average": 0.5680087051142546, "piqa/accuracy/seq_average": 0.5680087051142546, "ai2arc/accuracy/ARC-Easy": 0.31881606765327697, "ai2arc/accuracy/ARC-Challenge": 0.2111587982832618, "ai2arc/accuracy/group_average": 0.2649874329682694, "ai2arc/accuracy/seq_average": 0.28328611898017, "mmlu/accuracy/MMLU": 0.2608509116910976, "mmlu/accuracy/group_average": 0.2608509116910976, "mmlu/accuracy/seq_average": 0.2608509116910976, "openbookqa/accuracy/test": 0.278, "openbookqa/accuracy/group_average": 0.278, "openbookqa/accuracy/seq_average": 0.278, "race/accuracy/test/high": 0.26186392224128074, "race/accuracy/test/middle": 0.3370473537604457, "race/accuracy/group_average": 0.2994556380008632, "race/accuracy/seq_average": 0.2837454398054317, "siqa/accuracy/dev": 0.3572159672466735, "siqa/accuracy/group_average": 0.3572159672466735, "siqa/accuracy/seq_average": 0.3572159672466735, "winogrande/accuracy/dev": 0.5043409629044988, "winogrande/accuracy/group_average": 0.5043409629044988, "winogrande/accuracy/seq_average": 0.5043409629044988, "commonsenseqa/accuracy/dev_rand_split": 0.24733824733824733, "commonsenseqa/accuracy/group_average": 0.24733824733824733, "commonsenseqa/accuracy/seq_average": 0.24733824733824733}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-60000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.677216302780878,
3
+ "val/accuracy": 0.47003561352926587,
4
+ "val/perplexity": 14.544549269560274,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.593755307404891,
8
+ "lambada/accuracy/total": 0.23000776397515527,
9
+ "lambada/accuracy/openai_last_token": 0.75097049689441,
10
+ "lambada/perplexity": 13.518963970837525,
11
+ "lambada/lm_loss": 3.231360209691748,
12
+ "lambada/lm_perplexity": 25.314066001822276,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.35002168875221057,
16
+ "mean_loss": 2.635485805092885,
17
+ "blimp/accuracy/passive_2": 0.895,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.982,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.848,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.894,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.567,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.916,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.285,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.523,
25
+ "blimp/accuracy/principle_A_domain_2": 0.809,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.987,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.907,
28
+ "blimp/accuracy/principle_A_domain_3": 0.559,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.91,
30
+ "blimp/accuracy/animate_subject_trans": 0.905,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.854,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.664,
33
+ "blimp/accuracy/transitive": 0.858,
34
+ "blimp/accuracy/sentential_subject_island": 0.298,
35
+ "blimp/accuracy/adjunct_island": 0.8,
36
+ "blimp/accuracy/intransitive": 0.774,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.813,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.952,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.291,
40
+ "blimp/accuracy/principle_A_case_1": 1.0,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.252,
42
+ "blimp/accuracy/only_npi_scope": 0.712,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.862,
44
+ "blimp/accuracy/passive_1": 0.887,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.904,
46
+ "blimp/accuracy/inchoative": 0.645,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.97,
48
+ "blimp/accuracy/principle_A_c_command": 0.588,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.579,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.774,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.345,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.896,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.32,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.925,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.605,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.835,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.873,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.892,
59
+ "blimp/accuracy/principle_A_case_2": 0.963,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.773,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.995,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.834,
63
+ "blimp/accuracy/wh_island": 0.756,
64
+ "blimp/accuracy/principle_A_domain_1": 0.996,
65
+ "blimp/accuracy/complex_NP_island": 0.536,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.949,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.895,
68
+ "blimp/accuracy/drop_argument": 0.78,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.767,
70
+ "blimp/accuracy/animate_subject_passive": 0.785,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.978,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.9,
73
+ "blimp/accuracy/npi_present_2": 0.6,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.92,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.986,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.94,
77
+ "blimp/accuracy/existential_there_object_raising": 0.816,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.221,
79
+ "blimp/accuracy/npi_present_1": 0.549,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.971,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.446,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.969,
83
+ "blimp/accuracy/causative": 0.708,
84
+ "blimp/accuracy/group_average": 0.764,
85
+ "blimp/accuracy/seq_average": 0.764,
86
+ "cbt/accuracy/NE": 0.7479967948717948,
87
+ "cbt/accuracy/V": 0.9048,
88
+ "cbt/accuracy/CN": 0.8044,
89
+ "cbt/accuracy/P": 0.8764,
90
+ "cbt/accuracy/group_average": 0.8333991987179487,
91
+ "cbt/accuracy/seq_average": 0.8334333733493398,
92
+ "hellaswag/accuracy/val": 0.285700059749054,
93
+ "hellaswag/accuracy/group_average": 0.285700059749054,
94
+ "hellaswag/accuracy/seq_average": 0.285700059749054,
95
+ "piqa/accuracy/val": 0.5772578890097932,
96
+ "piqa/accuracy/group_average": 0.5772578890097932,
97
+ "piqa/accuracy/seq_average": 0.5772578890097932,
98
+ "ai2arc/accuracy/ARC-Easy": 0.33446088794926004,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.21802575107296138,
100
+ "ai2arc/accuracy/group_average": 0.2762433195111107,
101
+ "ai2arc/accuracy/seq_average": 0.29603399433427763,
102
+ "mmlu/accuracy/MMLU": 0.2621380050053629,
103
+ "mmlu/accuracy/group_average": 0.2621380050053629,
104
+ "mmlu/accuracy/seq_average": 0.2621380050053629,
105
+ "openbookqa/accuracy/test": 0.28,
106
+ "openbookqa/accuracy/group_average": 0.28,
107
+ "openbookqa/accuracy/seq_average": 0.28,
108
+ "race/accuracy/test/high": 0.2612921669525443,
109
+ "race/accuracy/test/middle": 0.3516713091922006,
110
+ "race/accuracy/group_average": 0.30648173807237244,
111
+ "race/accuracy/seq_average": 0.2875962707742197,
112
+ "siqa/accuracy/dev": 0.3556806550665302,
113
+ "siqa/accuracy/group_average": 0.3556806550665302,
114
+ "siqa/accuracy/seq_average": 0.3556806550665302,
115
+ "winogrande/accuracy/dev": 0.4980268350434096,
116
+ "winogrande/accuracy/group_average": 0.4980268350434096,
117
+ "winogrande/accuracy/seq_average": 0.4980268350434096,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.24815724815724816,
119
+ "commonsenseqa/accuracy/group_average": 0.24815724815724816,
120
+ "commonsenseqa/accuracy/seq_average": 0.24815724815724816
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-70000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.651888892764137, "val/accuracy": 0.47424122643849204, "val/perplexity": 14.180799372671775, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5581873544254656, "lambada/accuracy/total": 0.2484472049689441, "lambada/accuracy/openai_last_token": 0.7554347826086957, "lambada/perplexity": 12.619442516702218, "lambada/lm_loss": 3.211777822709063, "lambada/lm_perplexity": 24.823178229057724, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.36134421570371805, "mean_loss": 2.6050381235948015, "blimp/accuracy/passive_2": 0.913, "blimp/accuracy/determiner_noun_agreement_2": 0.983, "blimp/accuracy/ellipsis_n_bar_1": 0.835, "blimp/accuracy/tough_vs_raising_2": 0.845, "blimp/accuracy/tough_vs_raising_1": 0.626, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.893, "blimp/accuracy/principle_A_reconstruction": 0.339, "blimp/accuracy/wh_vs_that_with_gap": 0.542, "blimp/accuracy/principle_A_domain_2": 0.798, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.909, "blimp/accuracy/principle_A_domain_3": 0.583, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.902, "blimp/accuracy/animate_subject_trans": 0.909, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.871, "blimp/accuracy/distractor_agreement_relative_clause": 0.644, "blimp/accuracy/transitive": 0.857, "blimp/accuracy/sentential_subject_island": 0.33, "blimp/accuracy/adjunct_island": 0.826, "blimp/accuracy/intransitive": 0.787, "blimp/accuracy/existential_there_subject_raising": 0.841, "blimp/accuracy/irregular_past_participle_adjectives": 0.812, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.436, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.243, "blimp/accuracy/only_npi_scope": 0.579, "blimp/accuracy/superlative_quantifiers_2": 0.692, "blimp/accuracy/passive_1": 0.9, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.9, "blimp/accuracy/inchoative": 0.612, "blimp/accuracy/anaphor_gender_agreement": 0.976, "blimp/accuracy/principle_A_c_command": 0.57, "blimp/accuracy/only_npi_licensor_present": 0.505, "blimp/accuracy/expletive_it_object_raising": 0.752, "blimp/accuracy/left_branch_island_simple_question": 0.524, "blimp/accuracy/wh_questions_subject_gap": 0.895, "blimp/accuracy/existential_there_quantifiers_2": 0.443, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.921, "blimp/accuracy/sentential_negation_npi_scope": 0.565, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.853, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.893, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.896, "blimp/accuracy/principle_A_case_2": 0.957, "blimp/accuracy/distractor_agreement_relational_noun": 0.818, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.996, "blimp/accuracy/superlative_quantifiers_1": 0.842, "blimp/accuracy/wh_island": 0.797, "blimp/accuracy/principle_A_domain_1": 0.985, "blimp/accuracy/complex_NP_island": 0.543, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.95, "blimp/accuracy/irregular_past_participle_verbs": 0.907, "blimp/accuracy/drop_argument": 0.764, "blimp/accuracy/wh_questions_object_gap": 0.771, "blimp/accuracy/animate_subject_passive": 0.785, "blimp/accuracy/existential_there_quantifiers_1": 0.975, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.888, "blimp/accuracy/npi_present_2": 0.562, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.911, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.937, "blimp/accuracy/existential_there_object_raising": 0.851, "blimp/accuracy/matrix_question_npi_licensor_present": 0.272, "blimp/accuracy/npi_present_1": 0.544, "blimp/accuracy/wh_vs_that_no_gap": 0.968, "blimp/accuracy/left_branch_island_echo_question": 0.473, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.979, "blimp/accuracy/causative": 0.709, "blimp/accuracy/group_average": 0.7665373134328357, "blimp/accuracy/seq_average": 0.7665373134328358, "cbt/accuracy/NE": 0.75, "cbt/accuracy/V": 0.9024, "cbt/accuracy/CN": 0.8144, "cbt/accuracy/P": 0.8776, "cbt/accuracy/group_average": 0.8361000000000001, "cbt/accuracy/seq_average": 0.8361344537815126, "hellaswag/accuracy/val": 0.28689504082852024, "hellaswag/accuracy/group_average": 0.28689504082852024, "hellaswag/accuracy/seq_average": 0.28689504082852024, "piqa/accuracy/val": 0.5859630032644179, "piqa/accuracy/group_average": 0.5859630032644179, "piqa/accuracy/seq_average": 0.5859630032644179, "ai2arc/accuracy/ARC-Easy": 0.32642706131078225, "ai2arc/accuracy/ARC-Challenge": 0.20600858369098712, "ai2arc/accuracy/group_average": 0.26621782250088466, "ai2arc/accuracy/seq_average": 0.28668555240793203, "mmlu/accuracy/MMLU": 0.2642116553450125, "mmlu/accuracy/group_average": 0.2642116553450125, "mmlu/accuracy/seq_average": 0.2642116553450125, "openbookqa/accuracy/test": 0.278, "openbookqa/accuracy/group_average": 0.278, "openbookqa/accuracy/seq_average": 0.278, "race/accuracy/test/high": 0.2624356775300172, "race/accuracy/test/middle": 0.3488857938718663, "race/accuracy/group_average": 0.30566073570094177, "race/accuracy/seq_average": 0.2875962707742197, "siqa/accuracy/dev": 0.35363357215967245, "siqa/accuracy/group_average": 0.35363357215967245, "siqa/accuracy/seq_average": 0.35363357215967245, "winogrande/accuracy/dev": 0.5153906866614049, "winogrande/accuracy/group_average": 0.5153906866614049, "winogrande/accuracy/seq_average": 0.5153906866614049, "commonsenseqa/accuracy/dev_rand_split": 0.2457002457002457, "commonsenseqa/accuracy/group_average": 0.2457002457002457, "commonsenseqa/accuracy/seq_average": 0.2457002457002457}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-80000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.6294446672712053, "val/accuracy": 0.4769103035094246, "val/perplexity": 13.866067482532282, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4874447650790956, "lambada/accuracy/total": 0.24805900621118013, "lambada/accuracy/openai_last_token": 0.7581521739130435, "lambada/perplexity": 12.173992762296985, "lambada/lm_loss": 3.193227865871034, "lambada/lm_perplexity": 24.366953897348676, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.36248465486030235, "mean_loss": 2.5584447161751505, "blimp/accuracy/passive_2": 0.917, "blimp/accuracy/determiner_noun_agreement_2": 0.981, "blimp/accuracy/ellipsis_n_bar_1": 0.828, "blimp/accuracy/tough_vs_raising_2": 0.863, "blimp/accuracy/tough_vs_raising_1": 0.622, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.921, "blimp/accuracy/principle_A_reconstruction": 0.259, "blimp/accuracy/wh_vs_that_with_gap": 0.531, "blimp/accuracy/principle_A_domain_2": 0.808, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.904, "blimp/accuracy/principle_A_domain_3": 0.577, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.916, "blimp/accuracy/animate_subject_trans": 0.901, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.876, "blimp/accuracy/distractor_agreement_relative_clause": 0.69, "blimp/accuracy/transitive": 0.871, "blimp/accuracy/sentential_subject_island": 0.313, "blimp/accuracy/adjunct_island": 0.82, "blimp/accuracy/intransitive": 0.801, "blimp/accuracy/existential_there_subject_raising": 0.863, "blimp/accuracy/irregular_past_participle_adjectives": 0.964, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.362, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.251, "blimp/accuracy/only_npi_scope": 0.631, "blimp/accuracy/superlative_quantifiers_2": 0.873, "blimp/accuracy/passive_1": 0.909, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.896, "blimp/accuracy/inchoative": 0.616, "blimp/accuracy/anaphor_gender_agreement": 0.977, "blimp/accuracy/principle_A_c_command": 0.567, "blimp/accuracy/only_npi_licensor_present": 0.603, "blimp/accuracy/expletive_it_object_raising": 0.755, "blimp/accuracy/left_branch_island_simple_question": 0.421, "blimp/accuracy/wh_questions_subject_gap": 0.911, "blimp/accuracy/existential_there_quantifiers_2": 0.432, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.93, "blimp/accuracy/sentential_negation_npi_scope": 0.561, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.829, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.886, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.899, "blimp/accuracy/principle_A_case_2": 0.963, "blimp/accuracy/distractor_agreement_relational_noun": 0.804, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.995, "blimp/accuracy/superlative_quantifiers_1": 0.809, "blimp/accuracy/wh_island": 0.773, "blimp/accuracy/principle_A_domain_1": 0.977, "blimp/accuracy/complex_NP_island": 0.556, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.961, "blimp/accuracy/irregular_past_participle_verbs": 0.893, "blimp/accuracy/drop_argument": 0.777, "blimp/accuracy/wh_questions_object_gap": 0.787, "blimp/accuracy/animate_subject_passive": 0.796, "blimp/accuracy/existential_there_quantifiers_1": 0.976, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.887, "blimp/accuracy/npi_present_2": 0.573, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.923, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.948, "blimp/accuracy/existential_there_object_raising": 0.835, "blimp/accuracy/matrix_question_npi_licensor_present": 0.241, "blimp/accuracy/npi_present_1": 0.56, "blimp/accuracy/wh_vs_that_no_gap": 0.973, "blimp/accuracy/left_branch_island_echo_question": 0.449, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.98, "blimp/accuracy/causative": 0.692, "blimp/accuracy/group_average": 0.770776119402985, "blimp/accuracy/seq_average": 0.7707761194029851, "cbt/accuracy/NE": 0.7528044871794872, "cbt/accuracy/V": 0.9072, "cbt/accuracy/CN": 0.818, "cbt/accuracy/P": 0.8804, "cbt/accuracy/group_average": 0.8396011217948718, "cbt/accuracy/seq_average": 0.8396358543417367, "hellaswag/accuracy/val": 0.2905795658235411, "hellaswag/accuracy/group_average": 0.2905795658235411, "hellaswag/accuracy/seq_average": 0.2905795658235411, "piqa/accuracy/val": 0.5848748639825898, "piqa/accuracy/group_average": 0.5848748639825898, "piqa/accuracy/seq_average": 0.5848748639825898, "ai2arc/accuracy/ARC-Easy": 0.3399577167019027, "ai2arc/accuracy/ARC-Challenge": 0.2128755364806867, "ai2arc/accuracy/group_average": 0.27641662659129473, "ai2arc/accuracy/seq_average": 0.2980169971671388, "mmlu/accuracy/MMLU": 0.26035037540221667, "mmlu/accuracy/group_average": 0.26035037540221667, "mmlu/accuracy/seq_average": 0.26035037540221667, "openbookqa/accuracy/test": 0.276, "openbookqa/accuracy/group_average": 0.276, "openbookqa/accuracy/seq_average": 0.276, "race/accuracy/test/high": 0.26758147512864494, "race/accuracy/test/middle": 0.3502785515320334, "race/accuracy/group_average": 0.3089300133303392, "race/accuracy/seq_average": 0.29164977705715445, "siqa/accuracy/dev": 0.3561924257932446, "siqa/accuracy/group_average": 0.3561924257932446, "siqa/accuracy/seq_average": 0.3561924257932446, "winogrande/accuracy/dev": 0.5067087608524072, "winogrande/accuracy/group_average": 0.5067087608524072, "winogrande/accuracy/seq_average": 0.5067087608524072, "commonsenseqa/accuracy/dev_rand_split": 0.24897624897624898, "commonsenseqa/accuracy/group_average": 0.24897624897624898, "commonsenseqa/accuracy/seq_average": 0.24897624897624898}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_sigmoid_standard_lb/export/result-model-90000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.6175365145244296, "val/accuracy": 0.4781581333705357, "val/perplexity": 13.701927474734823, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5651116223068713, "lambada/accuracy/total": 0.24456521739130435, "lambada/accuracy/openai_last_token": 0.7591226708074534, "lambada/perplexity": 12.081295702875193, "lambada/lm_loss": 3.1670897308033994, "lambada/lm_perplexity": 23.738298891053454, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.36136167538092, "mean_loss": 2.59132406841565, "blimp/accuracy/passive_2": 0.915, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.824, "blimp/accuracy/tough_vs_raising_2": 0.881, "blimp/accuracy/tough_vs_raising_1": 0.63, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.904, "blimp/accuracy/principle_A_reconstruction": 0.346, "blimp/accuracy/wh_vs_that_with_gap": 0.576, "blimp/accuracy/principle_A_domain_2": 0.808, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.916, "blimp/accuracy/principle_A_domain_3": 0.565, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.903, "blimp/accuracy/animate_subject_trans": 0.899, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.884, "blimp/accuracy/distractor_agreement_relative_clause": 0.675, "blimp/accuracy/transitive": 0.869, "blimp/accuracy/sentential_subject_island": 0.331, "blimp/accuracy/adjunct_island": 0.804, "blimp/accuracy/intransitive": 0.803, "blimp/accuracy/existential_there_subject_raising": 0.857, "blimp/accuracy/irregular_past_participle_adjectives": 0.952, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.332, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.283, "blimp/accuracy/only_npi_scope": 0.654, "blimp/accuracy/superlative_quantifiers_2": 0.699, "blimp/accuracy/passive_1": 0.905, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.911, "blimp/accuracy/inchoative": 0.624, "blimp/accuracy/anaphor_gender_agreement": 0.973, "blimp/accuracy/principle_A_c_command": 0.582, "blimp/accuracy/only_npi_licensor_present": 0.623, "blimp/accuracy/expletive_it_object_raising": 0.766, "blimp/accuracy/left_branch_island_simple_question": 0.395, "blimp/accuracy/wh_questions_subject_gap": 0.909, "blimp/accuracy/existential_there_quantifiers_2": 0.489, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.92, "blimp/accuracy/sentential_negation_npi_scope": 0.531, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.839, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.88, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.907, "blimp/accuracy/principle_A_case_2": 0.967, "blimp/accuracy/distractor_agreement_relational_noun": 0.807, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.991, "blimp/accuracy/superlative_quantifiers_1": 0.888, "blimp/accuracy/wh_island": 0.772, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.565, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.952, "blimp/accuracy/irregular_past_participle_verbs": 0.887, "blimp/accuracy/drop_argument": 0.79, "blimp/accuracy/wh_questions_object_gap": 0.784, "blimp/accuracy/animate_subject_passive": 0.8, "blimp/accuracy/existential_there_quantifiers_1": 0.98, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.895, "blimp/accuracy/npi_present_2": 0.624, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.942, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.948, "blimp/accuracy/existential_there_object_raising": 0.853, "blimp/accuracy/matrix_question_npi_licensor_present": 0.281, "blimp/accuracy/npi_present_1": 0.609, "blimp/accuracy/wh_vs_that_no_gap": 0.969, "blimp/accuracy/left_branch_island_echo_question": 0.444, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.978, "blimp/accuracy/causative": 0.724, "blimp/accuracy/group_average": 0.775597014925373, "blimp/accuracy/seq_average": 0.7755970149253731, "cbt/accuracy/NE": 0.7672275641025641, "cbt/accuracy/V": 0.9096, "cbt/accuracy/CN": 0.8256, "cbt/accuracy/P": 0.888, "cbt/accuracy/group_average": 0.847606891025641, "cbt/accuracy/seq_average": 0.847639055622249, "hellaswag/accuracy/val": 0.29087831109340767, "hellaswag/accuracy/group_average": 0.29087831109340767, "hellaswag/accuracy/seq_average": 0.29087831109340767, "piqa/accuracy/val": 0.5821545157780196, "piqa/accuracy/group_average": 0.5821545157780196, "piqa/accuracy/seq_average": 0.5821545157780196, "ai2arc/accuracy/ARC-Easy": 0.33699788583509516, "ai2arc/accuracy/ARC-Challenge": 0.21030042918454936, "ai2arc/accuracy/group_average": 0.27364915750982227, "ai2arc/accuracy/seq_average": 0.2951841359773371, "mmlu/accuracy/MMLU": 0.2619234894529853, "mmlu/accuracy/group_average": 0.2619234894529853, "mmlu/accuracy/seq_average": 0.2619234894529853, "openbookqa/accuracy/test": 0.286, "openbookqa/accuracy/group_average": 0.286, "openbookqa/accuracy/seq_average": 0.286, "race/accuracy/test/high": 0.26758147512864494, "race/accuracy/test/middle": 0.34401114206128136, "race/accuracy/group_average": 0.30579630859496315, "race/accuracy/seq_average": 0.2898256992298338, "siqa/accuracy/dev": 0.3572159672466735, "siqa/accuracy/group_average": 0.3572159672466735, "siqa/accuracy/seq_average": 0.3572159672466735, "winogrande/accuracy/dev": 0.5067087608524072, "winogrande/accuracy/group_average": 0.5067087608524072, "winogrande/accuracy/seq_average": 0.5067087608524072, "commonsenseqa/accuracy/dev_rand_split": 0.25143325143325146, "commonsenseqa/accuracy/group_average": 0.25143325143325146, "commonsenseqa/accuracy/seq_average": 0.25143325143325146}