Upload folder using huggingface_hub

#288
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb/export/result-model-10000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 3.026036095997644,
3
+ "val/accuracy": 0.42390659877232145,
4
+ "val/perplexity": 20.615353134651023,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.8078854009971854,
8
+ "lambada/accuracy/total": 0.1622670807453416,
9
+ "lambada/accuracy/openai_last_token": 0.7164208074534162,
10
+ "lambada/perplexity": 24.909094620287416,
11
+ "lambada/lm_loss": 3.564238264553546,
12
+ "lambada/lm_perplexity": 35.312544340058516,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.29308683975883154,
16
+ "mean_loss": 2.9169607484974147,
17
+ "blimp/accuracy/passive_2": 0.869,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.95,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.698,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.778,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.571,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.885,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.326,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.429,
25
+ "blimp/accuracy/principle_A_domain_2": 0.838,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.966,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.854,
28
+ "blimp/accuracy/principle_A_domain_3": 0.523,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.863,
30
+ "blimp/accuracy/animate_subject_trans": 0.858,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.748,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.435,
33
+ "blimp/accuracy/transitive": 0.829,
34
+ "blimp/accuracy/sentential_subject_island": 0.385,
35
+ "blimp/accuracy/adjunct_island": 0.734,
36
+ "blimp/accuracy/intransitive": 0.709,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.813,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.892,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.182,
40
+ "blimp/accuracy/principle_A_case_1": 1.0,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.188,
42
+ "blimp/accuracy/only_npi_scope": 0.653,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.677,
44
+ "blimp/accuracy/passive_1": 0.88,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.872,
46
+ "blimp/accuracy/inchoative": 0.518,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.926,
48
+ "blimp/accuracy/principle_A_c_command": 0.476,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.656,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.769,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.195,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.918,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.51,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.906,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.409,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.759,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.862,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.837,
59
+ "blimp/accuracy/principle_A_case_2": 0.872,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.754,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.973,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.744,
63
+ "blimp/accuracy/wh_island": 0.776,
64
+ "blimp/accuracy/principle_A_domain_1": 0.971,
65
+ "blimp/accuracy/complex_NP_island": 0.513,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.906,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.813,
68
+ "blimp/accuracy/drop_argument": 0.737,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.714,
70
+ "blimp/accuracy/animate_subject_passive": 0.795,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.974,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.861,
73
+ "blimp/accuracy/npi_present_2": 0.598,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.828,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.972,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.909,
77
+ "blimp/accuracy/existential_there_object_raising": 0.728,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.065,
79
+ "blimp/accuracy/npi_present_1": 0.529,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.966,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.44,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.981,
83
+ "blimp/accuracy/causative": 0.648,
84
+ "blimp/accuracy/group_average": 0.7195970149253731,
85
+ "blimp/accuracy/seq_average": 0.7195970149253731,
86
+ "cbt/accuracy/NE": 0.6915064102564102,
87
+ "cbt/accuracy/V": 0.86,
88
+ "cbt/accuracy/CN": 0.7328,
89
+ "cbt/accuracy/P": 0.8376,
90
+ "cbt/accuracy/group_average": 0.7804766025641026,
91
+ "cbt/accuracy/seq_average": 0.7805122048819528,
92
+ "hellaswag/accuracy/val": 0.26926906990639315,
93
+ "hellaswag/accuracy/group_average": 0.26926906990639315,
94
+ "hellaswag/accuracy/seq_average": 0.26926906990639315,
95
+ "piqa/accuracy/val": 0.5495103373231773,
96
+ "piqa/accuracy/group_average": 0.5495103373231773,
97
+ "piqa/accuracy/seq_average": 0.5495103373231773,
98
+ "ai2arc/accuracy/ARC-Easy": 0.31331923890063423,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.21030042918454936,
100
+ "ai2arc/accuracy/group_average": 0.2618098340425918,
101
+ "ai2arc/accuracy/seq_average": 0.2793201133144476,
102
+ "mmlu/accuracy/MMLU": 0.2580622095101895,
103
+ "mmlu/accuracy/group_average": 0.2580622095101895,
104
+ "mmlu/accuracy/seq_average": 0.2580622095101895,
105
+ "openbookqa/accuracy/test": 0.256,
106
+ "openbookqa/accuracy/group_average": 0.256,
107
+ "openbookqa/accuracy/seq_average": 0.256,
108
+ "race/accuracy/test/high": 0.2578616352201258,
109
+ "race/accuracy/test/middle": 0.318941504178273,
110
+ "race/accuracy/group_average": 0.2884015696991994,
111
+ "race/accuracy/seq_average": 0.2756384272395622,
112
+ "siqa/accuracy/dev": 0.3705220061412487,
113
+ "siqa/accuracy/group_average": 0.3705220061412487,
114
+ "siqa/accuracy/seq_average": 0.3705220061412487,
115
+ "winogrande/accuracy/dev": 0.510655090765588,
116
+ "winogrande/accuracy/group_average": 0.510655090765588,
117
+ "winogrande/accuracy/seq_average": 0.510655090765588,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.23587223587223588,
119
+ "commonsenseqa/accuracy/group_average": 0.23587223587223588,
120
+ "commonsenseqa/accuracy/seq_average": 0.23587223587223588
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb/export/result-model-100000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.6119626968625993,
3
+ "val/accuracy": 0.479095943390377,
4
+ "val/perplexity": 13.625767876405735,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.546885993910132,
8
+ "lambada/accuracy/total": 0.25271739130434784,
9
+ "lambada/accuracy/openai_last_token": 0.7618400621118012,
10
+ "lambada/perplexity": 12.005568880054728,
11
+ "lambada/lm_loss": 3.1711634036775673,
12
+ "lambada/lm_perplexity": 23.835198189344226,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.36590666734736244,
16
+ "mean_loss": 2.5794243453863657,
17
+ "blimp/accuracy/passive_2": 0.905,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.976,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.799,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.888,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.596,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.905,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.309,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.509,
25
+ "blimp/accuracy/principle_A_domain_2": 0.785,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.988,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.91,
28
+ "blimp/accuracy/principle_A_domain_3": 0.526,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.937,
30
+ "blimp/accuracy/animate_subject_trans": 0.902,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.879,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.634,
33
+ "blimp/accuracy/transitive": 0.876,
34
+ "blimp/accuracy/sentential_subject_island": 0.29,
35
+ "blimp/accuracy/adjunct_island": 0.782,
36
+ "blimp/accuracy/intransitive": 0.787,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.854,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.941,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.479,
40
+ "blimp/accuracy/principle_A_case_1": 1.0,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.227,
42
+ "blimp/accuracy/only_npi_scope": 0.721,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.793,
44
+ "blimp/accuracy/passive_1": 0.899,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.914,
46
+ "blimp/accuracy/inchoative": 0.624,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.94,
48
+ "blimp/accuracy/principle_A_c_command": 0.619,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.808,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.78,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.558,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.935,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.495,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.925,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.604,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.818,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.858,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.909,
59
+ "blimp/accuracy/principle_A_case_2": 0.961,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.804,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.978,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.809,
63
+ "blimp/accuracy/wh_island": 0.767,
64
+ "blimp/accuracy/principle_A_domain_1": 0.978,
65
+ "blimp/accuracy/complex_NP_island": 0.576,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.965,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.865,
68
+ "blimp/accuracy/drop_argument": 0.78,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.808,
70
+ "blimp/accuracy/animate_subject_passive": 0.794,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.981,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.844,
73
+ "blimp/accuracy/npi_present_2": 0.645,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.919,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.983,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.951,
77
+ "blimp/accuracy/existential_there_object_raising": 0.826,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.265,
79
+ "blimp/accuracy/npi_present_1": 0.588,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.971,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.453,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974,
83
+ "blimp/accuracy/causative": 0.699,
84
+ "blimp/accuracy/group_average": 0.7771343283582091,
85
+ "blimp/accuracy/seq_average": 0.777134328358209,
86
+ "cbt/accuracy/NE": 0.7576121794871795,
87
+ "cbt/accuracy/V": 0.9008,
88
+ "cbt/accuracy/CN": 0.8176,
89
+ "cbt/accuracy/P": 0.8912,
90
+ "cbt/accuracy/group_average": 0.8418030448717949,
91
+ "cbt/accuracy/seq_average": 0.8418367346938775,
92
+ "hellaswag/accuracy/val": 0.29426409081856203,
93
+ "hellaswag/accuracy/group_average": 0.29426409081856203,
94
+ "hellaswag/accuracy/seq_average": 0.29426409081856203,
95
+ "piqa/accuracy/val": 0.5794341675734495,
96
+ "piqa/accuracy/group_average": 0.5794341675734495,
97
+ "piqa/accuracy/seq_average": 0.5794341675734495,
98
+ "ai2arc/accuracy/ARC-Easy": 0.32684989429175476,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.21201716738197424,
100
+ "ai2arc/accuracy/group_average": 0.2694335308368645,
101
+ "ai2arc/accuracy/seq_average": 0.28895184135977336,
102
+ "mmlu/accuracy/MMLU": 0.2607079013228459,
103
+ "mmlu/accuracy/group_average": 0.2607079013228459,
104
+ "mmlu/accuracy/seq_average": 0.2607079013228459,
105
+ "openbookqa/accuracy/test": 0.274,
106
+ "openbookqa/accuracy/group_average": 0.274,
107
+ "openbookqa/accuracy/seq_average": 0.274,
108
+ "race/accuracy/test/high": 0.25757575757575757,
109
+ "race/accuracy/test/middle": 0.3447075208913649,
110
+ "race/accuracy/group_average": 0.30114163923356124,
111
+ "race/accuracy/seq_average": 0.28293473854884477,
112
+ "siqa/accuracy/dev": 0.3561924257932446,
113
+ "siqa/accuracy/group_average": 0.3561924257932446,
114
+ "siqa/accuracy/seq_average": 0.3561924257932446,
115
+ "winogrande/accuracy/dev": 0.510655090765588,
116
+ "winogrande/accuracy/group_average": 0.510655090765588,
117
+ "winogrande/accuracy/seq_average": 0.510655090765588,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.24651924651924653,
119
+ "commonsenseqa/accuracy/group_average": 0.24651924651924653,
120
+ "commonsenseqa/accuracy/seq_average": 0.24651924651924653
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb/export/result-model-20000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.8806944347563244,
3
+ "val/accuracy": 0.44141012524801587,
4
+ "val/perplexity": 17.826648326435496,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.6214900994152757,
8
+ "lambada/accuracy/total": 0.1935170807453416,
9
+ "lambada/accuracy/openai_last_token": 0.7358307453416149,
10
+ "lambada/perplexity": 18.64794897523847,
11
+ "lambada/lm_loss": 3.406234809500662,
12
+ "lambada/lm_perplexity": 30.151504110290105,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.31746360299667875,
16
+ "mean_loss": 2.7510922670858,
17
+ "blimp/accuracy/passive_2": 0.87,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.949,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.743,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.846,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.57,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.891,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.373,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.434,
25
+ "blimp/accuracy/principle_A_domain_2": 0.837,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.976,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.897,
28
+ "blimp/accuracy/principle_A_domain_3": 0.548,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.853,
30
+ "blimp/accuracy/animate_subject_trans": 0.865,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.826,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.477,
33
+ "blimp/accuracy/transitive": 0.826,
34
+ "blimp/accuracy/sentential_subject_island": 0.356,
35
+ "blimp/accuracy/adjunct_island": 0.725,
36
+ "blimp/accuracy/intransitive": 0.73,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.81,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.846,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.247,
40
+ "blimp/accuracy/principle_A_case_1": 1.0,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.151,
42
+ "blimp/accuracy/only_npi_scope": 0.694,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.841,
44
+ "blimp/accuracy/passive_1": 0.893,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.912,
46
+ "blimp/accuracy/inchoative": 0.55,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.896,
48
+ "blimp/accuracy/principle_A_c_command": 0.481,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.759,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.776,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.289,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.906,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.397,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.899,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.447,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.78,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.903,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.876,
59
+ "blimp/accuracy/principle_A_case_2": 0.92,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.789,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.962,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.632,
63
+ "blimp/accuracy/wh_island": 0.699,
64
+ "blimp/accuracy/principle_A_domain_1": 0.986,
65
+ "blimp/accuracy/complex_NP_island": 0.539,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.92,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.873,
68
+ "blimp/accuracy/drop_argument": 0.735,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.775,
70
+ "blimp/accuracy/animate_subject_passive": 0.754,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.987,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.838,
73
+ "blimp/accuracy/npi_present_2": 0.597,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.886,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.968,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.941,
77
+ "blimp/accuracy/existential_there_object_raising": 0.759,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.112,
79
+ "blimp/accuracy/npi_present_1": 0.492,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.957,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.342,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.981,
83
+ "blimp/accuracy/causative": 0.665,
84
+ "blimp/accuracy/group_average": 0.7321492537313434,
85
+ "blimp/accuracy/seq_average": 0.7321492537313433,
86
+ "cbt/accuracy/NE": 0.6991185897435898,
87
+ "cbt/accuracy/V": 0.8768,
88
+ "cbt/accuracy/CN": 0.7712,
89
+ "cbt/accuracy/P": 0.8552,
90
+ "cbt/accuracy/group_average": 0.8005796474358974,
91
+ "cbt/accuracy/seq_average": 0.8006202480992397,
92
+ "hellaswag/accuracy/val": 0.27942640908185623,
93
+ "hellaswag/accuracy/group_average": 0.27942640908185623,
94
+ "hellaswag/accuracy/seq_average": 0.27942640908185623,
95
+ "piqa/accuracy/val": 0.55930359085963,
96
+ "piqa/accuracy/group_average": 0.55930359085963,
97
+ "piqa/accuracy/seq_average": 0.55930359085963,
98
+ "ai2arc/accuracy/ARC-Easy": 0.3150105708245243,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.2034334763948498,
100
+ "ai2arc/accuracy/group_average": 0.25922202360968705,
101
+ "ai2arc/accuracy/seq_average": 0.2781869688385269,
102
+ "mmlu/accuracy/MMLU": 0.26363961387200574,
103
+ "mmlu/accuracy/group_average": 0.26363961387200574,
104
+ "mmlu/accuracy/seq_average": 0.26363961387200574,
105
+ "openbookqa/accuracy/test": 0.276,
106
+ "openbookqa/accuracy/group_average": 0.276,
107
+ "openbookqa/accuracy/seq_average": 0.276,
108
+ "race/accuracy/test/high": 0.25414522584333904,
109
+ "race/accuracy/test/middle": 0.32729805013927576,
110
+ "race/accuracy/group_average": 0.2907216379913074,
111
+ "race/accuracy/seq_average": 0.2754357519254155,
112
+ "siqa/accuracy/dev": 0.3623336745138178,
113
+ "siqa/accuracy/group_average": 0.3623336745138178,
114
+ "siqa/accuracy/seq_average": 0.3623336745138178,
115
+ "winogrande/accuracy/dev": 0.5074980268350434,
116
+ "winogrande/accuracy/group_average": 0.5074980268350434,
117
+ "winogrande/accuracy/seq_average": 0.5074980268350434,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.2285012285012285,
119
+ "commonsenseqa/accuracy/group_average": 0.2285012285012285,
120
+ "commonsenseqa/accuracy/seq_average": 0.2285012285012285
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb/export/result-model-30000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.8053150479755704,
3
+ "val/accuracy": 0.4509800502232143,
4
+ "val/perplexity": 16.532283548154435,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.7774893245341614,
8
+ "lambada/accuracy/total": 0.20031055900621117,
9
+ "lambada/accuracy/openai_last_token": 0.7358307453416149,
10
+ "lambada/perplexity": 17.452773071925026,
11
+ "lambada/lm_loss": 3.367414702727166,
12
+ "lambada/lm_perplexity": 29.003447514383748,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.32564530461471275,
16
+ "mean_loss": 2.791402186254866,
17
+ "blimp/accuracy/passive_2": 0.872,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.96,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.771,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.828,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.604,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.883,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.361,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.486,
25
+ "blimp/accuracy/principle_A_domain_2": 0.847,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.97,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.881,
28
+ "blimp/accuracy/principle_A_domain_3": 0.562,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.89,
30
+ "blimp/accuracy/animate_subject_trans": 0.884,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.849,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.526,
33
+ "blimp/accuracy/transitive": 0.841,
34
+ "blimp/accuracy/sentential_subject_island": 0.395,
35
+ "blimp/accuracy/adjunct_island": 0.782,
36
+ "blimp/accuracy/intransitive": 0.799,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.811,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.863,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.263,
40
+ "blimp/accuracy/principle_A_case_1": 1.0,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.215,
42
+ "blimp/accuracy/only_npi_scope": 0.751,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.833,
44
+ "blimp/accuracy/passive_1": 0.899,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.889,
46
+ "blimp/accuracy/inchoative": 0.646,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.935,
48
+ "blimp/accuracy/principle_A_c_command": 0.558,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.641,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.799,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.332,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.898,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.316,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.91,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.599,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.773,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.912,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.889,
59
+ "blimp/accuracy/principle_A_case_2": 0.908,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.81,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.98,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.712,
63
+ "blimp/accuracy/wh_island": 0.781,
64
+ "blimp/accuracy/principle_A_domain_1": 0.984,
65
+ "blimp/accuracy/complex_NP_island": 0.458,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.94,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.845,
68
+ "blimp/accuracy/drop_argument": 0.781,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.766,
70
+ "blimp/accuracy/animate_subject_passive": 0.814,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.963,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.865,
73
+ "blimp/accuracy/npi_present_2": 0.601,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.91,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.972,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.937,
77
+ "blimp/accuracy/existential_there_object_raising": 0.815,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.246,
79
+ "blimp/accuracy/npi_present_1": 0.541,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.944,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.329,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.97,
83
+ "blimp/accuracy/causative": 0.682,
84
+ "blimp/accuracy/group_average": 0.7504029850746267,
85
+ "blimp/accuracy/seq_average": 0.7504029850746269,
86
+ "cbt/accuracy/NE": 0.7347756410256411,
87
+ "cbt/accuracy/V": 0.892,
88
+ "cbt/accuracy/CN": 0.7872,
89
+ "cbt/accuracy/P": 0.8632,
90
+ "cbt/accuracy/group_average": 0.8192939102564103,
91
+ "cbt/accuracy/seq_average": 0.819327731092437,
92
+ "hellaswag/accuracy/val": 0.28141804421429994,
93
+ "hellaswag/accuracy/group_average": 0.28141804421429994,
94
+ "hellaswag/accuracy/seq_average": 0.28141804421429994,
95
+ "piqa/accuracy/val": 0.5783460282916213,
96
+ "piqa/accuracy/group_average": 0.5783460282916213,
97
+ "piqa/accuracy/seq_average": 0.5783460282916213,
98
+ "ai2arc/accuracy/ARC-Easy": 0.31839323467230446,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.21630901287553647,
100
+ "ai2arc/accuracy/group_average": 0.26735112377392045,
101
+ "ai2arc/accuracy/seq_average": 0.2847025495750708,
102
+ "mmlu/accuracy/MMLU": 0.26349660350375403,
103
+ "mmlu/accuracy/group_average": 0.26349660350375403,
104
+ "mmlu/accuracy/seq_average": 0.26349660350375403,
105
+ "openbookqa/accuracy/test": 0.268,
106
+ "openbookqa/accuracy/group_average": 0.268,
107
+ "openbookqa/accuracy/seq_average": 0.268,
108
+ "race/accuracy/test/high": 0.2641509433962264,
109
+ "race/accuracy/test/middle": 0.3224233983286908,
110
+ "race/accuracy/group_average": 0.2932871708624586,
111
+ "race/accuracy/seq_average": 0.28111066072152413,
112
+ "siqa/accuracy/dev": 0.3654042988741044,
113
+ "siqa/accuracy/group_average": 0.3654042988741044,
114
+ "siqa/accuracy/seq_average": 0.3654042988741044,
115
+ "winogrande/accuracy/dev": 0.500394632991318,
116
+ "winogrande/accuracy/group_average": 0.500394632991318,
117
+ "winogrande/accuracy/seq_average": 0.500394632991318,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.2325962325962326,
119
+ "commonsenseqa/accuracy/group_average": 0.2325962325962326,
120
+ "commonsenseqa/accuracy/seq_average": 0.2325962325962326
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb/export/result-model-40000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.756097460549975,
3
+ "val/accuracy": 0.45806109716021826,
4
+ "val/perplexity": 15.738303595828631,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.634979248046875,
8
+ "lambada/accuracy/total": 0.21506211180124224,
9
+ "lambada/accuracy/openai_last_token": 0.7439829192546584,
10
+ "lambada/perplexity": 15.870491689993184,
11
+ "lambada/lm_loss": 3.2971033813003907,
12
+ "lambada/lm_perplexity": 27.03421757696205,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.33656160448073025,
16
+ "mean_loss": 2.6955383542984253,
17
+ "blimp/accuracy/passive_2": 0.873,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.98,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.803,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.842,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.584,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.885,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.28,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.47,
25
+ "blimp/accuracy/principle_A_domain_2": 0.827,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.987,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.9,
28
+ "blimp/accuracy/principle_A_domain_3": 0.582,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.91,
30
+ "blimp/accuracy/animate_subject_trans": 0.879,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.884,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.613,
33
+ "blimp/accuracy/transitive": 0.852,
34
+ "blimp/accuracy/sentential_subject_island": 0.32,
35
+ "blimp/accuracy/adjunct_island": 0.8,
36
+ "blimp/accuracy/intransitive": 0.801,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.854,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.906,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.288,
40
+ "blimp/accuracy/principle_A_case_1": 1.0,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.201,
42
+ "blimp/accuracy/only_npi_scope": 0.747,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.724,
44
+ "blimp/accuracy/passive_1": 0.892,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.877,
46
+ "blimp/accuracy/inchoative": 0.639,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.939,
48
+ "blimp/accuracy/principle_A_c_command": 0.553,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.806,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.758,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.331,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.922,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.32,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.935,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.541,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.786,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.9,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.894,
59
+ "blimp/accuracy/principle_A_case_2": 0.927,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.861,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.991,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.767,
63
+ "blimp/accuracy/wh_island": 0.716,
64
+ "blimp/accuracy/principle_A_domain_1": 0.993,
65
+ "blimp/accuracy/complex_NP_island": 0.515,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.956,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.878,
68
+ "blimp/accuracy/drop_argument": 0.775,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.765,
70
+ "blimp/accuracy/animate_subject_passive": 0.793,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.982,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.875,
73
+ "blimp/accuracy/npi_present_2": 0.536,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.899,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.979,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.944,
77
+ "blimp/accuracy/existential_there_object_raising": 0.839,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.181,
79
+ "blimp/accuracy/npi_present_1": 0.499,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.963,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.442,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974,
83
+ "blimp/accuracy/causative": 0.712,
84
+ "blimp/accuracy/group_average": 0.7559253731343284,
85
+ "blimp/accuracy/seq_average": 0.7559253731343284,
86
+ "cbt/accuracy/NE": 0.7391826923076923,
87
+ "cbt/accuracy/V": 0.8924,
88
+ "cbt/accuracy/CN": 0.79,
89
+ "cbt/accuracy/P": 0.8656,
90
+ "cbt/accuracy/group_average": 0.8217956730769231,
91
+ "cbt/accuracy/seq_average": 0.821828731492597,
92
+ "hellaswag/accuracy/val": 0.2842063333997212,
93
+ "hellaswag/accuracy/group_average": 0.2842063333997212,
94
+ "hellaswag/accuracy/seq_average": 0.2842063333997212,
95
+ "piqa/accuracy/val": 0.5663764961915125,
96
+ "piqa/accuracy/group_average": 0.5663764961915125,
97
+ "piqa/accuracy/seq_average": 0.5663764961915125,
98
+ "ai2arc/accuracy/ARC-Easy": 0.320507399577167,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.21030042918454936,
100
+ "ai2arc/accuracy/group_average": 0.2654039143808582,
101
+ "ai2arc/accuracy/seq_average": 0.2841359773371105,
102
+ "mmlu/accuracy/MMLU": 0.2598498391133357,
103
+ "mmlu/accuracy/group_average": 0.2598498391133357,
104
+ "mmlu/accuracy/seq_average": 0.2598498391133357,
105
+ "openbookqa/accuracy/test": 0.272,
106
+ "openbookqa/accuracy/group_average": 0.272,
107
+ "openbookqa/accuracy/seq_average": 0.272,
108
+ "race/accuracy/test/high": 0.2684391080617496,
109
+ "race/accuracy/test/middle": 0.33565459610027853,
110
+ "race/accuracy/group_average": 0.30204685208101406,
111
+ "race/accuracy/seq_average": 0.2880016214025132,
112
+ "siqa/accuracy/dev": 0.36284544524053225,
113
+ "siqa/accuracy/group_average": 0.36284544524053225,
114
+ "siqa/accuracy/seq_average": 0.36284544524053225,
115
+ "winogrande/accuracy/dev": 0.5090765588003157,
116
+ "winogrande/accuracy/group_average": 0.5090765588003157,
117
+ "winogrande/accuracy/seq_average": 0.5090765588003157,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.25307125307125306,
119
+ "commonsenseqa/accuracy/group_average": 0.25307125307125306,
120
+ "commonsenseqa/accuracy/seq_average": 0.25307125307125306
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb/export/result-model-50000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.7115478515625,
3
+ "val/accuracy": 0.4647148980034722,
4
+ "val/perplexity": 15.0525566156603,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.704208468798525,
8
+ "lambada/accuracy/total": 0.23718944099378883,
9
+ "lambada/accuracy/openai_last_token": 0.7591226708074534,
10
+ "lambada/perplexity": 13.7247949506867,
11
+ "lambada/lm_loss": 3.279500997334853,
12
+ "lambada/lm_perplexity": 26.562514626650426,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.3509521694986305,
16
+ "mean_loss": 2.7078781601805124,
17
+ "blimp/accuracy/passive_2": 0.877,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.979,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.805,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.864,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.554,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.913,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.329,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.467,
25
+ "blimp/accuracy/principle_A_domain_2": 0.842,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.985,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.903,
28
+ "blimp/accuracy/principle_A_domain_3": 0.547,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.918,
30
+ "blimp/accuracy/animate_subject_trans": 0.882,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.889,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.561,
33
+ "blimp/accuracy/transitive": 0.862,
34
+ "blimp/accuracy/sentential_subject_island": 0.361,
35
+ "blimp/accuracy/adjunct_island": 0.771,
36
+ "blimp/accuracy/intransitive": 0.799,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.841,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.745,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.339,
40
+ "blimp/accuracy/principle_A_case_1": 1.0,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.177,
42
+ "blimp/accuracy/only_npi_scope": 0.635,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.698,
44
+ "blimp/accuracy/passive_1": 0.899,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.902,
46
+ "blimp/accuracy/inchoative": 0.616,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.927,
48
+ "blimp/accuracy/principle_A_c_command": 0.568,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.726,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.775,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.362,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.918,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.415,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.936,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.582,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.8,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.89,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.903,
59
+ "blimp/accuracy/principle_A_case_2": 0.953,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.843,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.706,
63
+ "blimp/accuracy/wh_island": 0.807,
64
+ "blimp/accuracy/principle_A_domain_1": 0.987,
65
+ "blimp/accuracy/complex_NP_island": 0.499,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.959,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.878,
68
+ "blimp/accuracy/drop_argument": 0.756,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.816,
70
+ "blimp/accuracy/animate_subject_passive": 0.801,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.982,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.913,
73
+ "blimp/accuracy/npi_present_2": 0.639,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.913,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.987,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.943,
77
+ "blimp/accuracy/existential_there_object_raising": 0.829,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.162,
79
+ "blimp/accuracy/npi_present_1": 0.61,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.966,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.384,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.981,
83
+ "blimp/accuracy/causative": 0.711,
84
+ "blimp/accuracy/group_average": 0.7578208955223882,
85
+ "blimp/accuracy/seq_average": 0.7578208955223881,
86
+ "cbt/accuracy/NE": 0.7423878205128205,
87
+ "cbt/accuracy/V": 0.898,
88
+ "cbt/accuracy/CN": 0.788,
89
+ "cbt/accuracy/P": 0.876,
90
+ "cbt/accuracy/group_average": 0.826096955128205,
91
+ "cbt/accuracy/seq_average": 0.8261304521808723,
92
+ "hellaswag/accuracy/val": 0.2861979685321649,
93
+ "hellaswag/accuracy/group_average": 0.2861979685321649,
94
+ "hellaswag/accuracy/seq_average": 0.2861979685321649,
95
+ "piqa/accuracy/val": 0.5680087051142546,
96
+ "piqa/accuracy/group_average": 0.5680087051142546,
97
+ "piqa/accuracy/seq_average": 0.5680087051142546,
98
+ "ai2arc/accuracy/ARC-Easy": 0.32473572938689216,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.2094420600858369,
100
+ "ai2arc/accuracy/group_average": 0.26708889473636455,
101
+ "ai2arc/accuracy/seq_average": 0.28668555240793203,
102
+ "mmlu/accuracy/MMLU": 0.2604933857704684,
103
+ "mmlu/accuracy/group_average": 0.2604933857704684,
104
+ "mmlu/accuracy/seq_average": 0.2604933857704684,
105
+ "openbookqa/accuracy/test": 0.284,
106
+ "openbookqa/accuracy/group_average": 0.284,
107
+ "openbookqa/accuracy/seq_average": 0.284,
108
+ "race/accuracy/test/high": 0.25900514579759865,
109
+ "race/accuracy/test/middle": 0.33913649025069637,
110
+ "race/accuracy/group_average": 0.2990708180241475,
111
+ "race/accuracy/seq_average": 0.2823267126064045,
112
+ "siqa/accuracy/dev": 0.36591606960081885,
113
+ "siqa/accuracy/group_average": 0.36591606960081885,
114
+ "siqa/accuracy/seq_average": 0.36591606960081885,
115
+ "winogrande/accuracy/dev": 0.5122336227308603,
116
+ "winogrande/accuracy/group_average": 0.5122336227308603,
117
+ "winogrande/accuracy/seq_average": 0.5122336227308603,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.24488124488124488,
119
+ "commonsenseqa/accuracy/group_average": 0.24488124488124488,
120
+ "commonsenseqa/accuracy/seq_average": 0.24488124488124488
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb/export/result-model-60000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.6786939832899304,
3
+ "val/accuracy": 0.4689989846850198,
4
+ "val/perplexity": 14.566057353653534,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.5141347565265915,
8
+ "lambada/accuracy/total": 0.23388975155279504,
9
+ "lambada/accuracy/openai_last_token": 0.7542701863354038,
10
+ "lambada/perplexity": 13.526006134511944,
11
+ "lambada/lm_loss": 3.219389834308319,
12
+ "lambada/lm_perplexity": 25.012853539140185,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.35144436811890745,
16
+ "mean_loss": 2.5964143699082607,
17
+ "blimp/accuracy/passive_2": 0.896,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.983,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.814,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.884,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.549,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.896,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.303,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.478,
25
+ "blimp/accuracy/principle_A_domain_2": 0.795,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.988,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.899,
28
+ "blimp/accuracy/principle_A_domain_3": 0.527,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.939,
30
+ "blimp/accuracy/animate_subject_trans": 0.9,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.866,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.666,
33
+ "blimp/accuracy/transitive": 0.856,
34
+ "blimp/accuracy/sentential_subject_island": 0.337,
35
+ "blimp/accuracy/adjunct_island": 0.791,
36
+ "blimp/accuracy/intransitive": 0.771,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.87,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.98,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.38,
40
+ "blimp/accuracy/principle_A_case_1": 0.999,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.201,
42
+ "blimp/accuracy/only_npi_scope": 0.746,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.734,
44
+ "blimp/accuracy/passive_1": 0.877,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.92,
46
+ "blimp/accuracy/inchoative": 0.633,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.94,
48
+ "blimp/accuracy/principle_A_c_command": 0.643,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.696,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.769,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.465,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.923,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.377,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.924,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.615,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.809,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.86,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.886,
59
+ "blimp/accuracy/principle_A_case_2": 0.96,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.819,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.994,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.811,
63
+ "blimp/accuracy/wh_island": 0.804,
64
+ "blimp/accuracy/principle_A_domain_1": 0.983,
65
+ "blimp/accuracy/complex_NP_island": 0.605,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.957,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.871,
68
+ "blimp/accuracy/drop_argument": 0.771,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.76,
70
+ "blimp/accuracy/animate_subject_passive": 0.786,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.992,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.885,
73
+ "blimp/accuracy/npi_present_2": 0.602,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.906,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.985,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.945,
77
+ "blimp/accuracy/existential_there_object_raising": 0.787,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.252,
79
+ "blimp/accuracy/npi_present_1": 0.565,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.966,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.447,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.978,
83
+ "blimp/accuracy/causative": 0.725,
84
+ "blimp/accuracy/group_average": 0.7692686567164178,
85
+ "blimp/accuracy/seq_average": 0.7692686567164179,
86
+ "cbt/accuracy/NE": 0.7347756410256411,
87
+ "cbt/accuracy/V": 0.8944,
88
+ "cbt/accuracy/CN": 0.8092,
89
+ "cbt/accuracy/P": 0.8812,
90
+ "cbt/accuracy/group_average": 0.8298939102564102,
91
+ "cbt/accuracy/seq_average": 0.8299319727891157,
92
+ "hellaswag/accuracy/val": 0.2901812387970524,
93
+ "hellaswag/accuracy/group_average": 0.2901812387970524,
94
+ "hellaswag/accuracy/seq_average": 0.2901812387970524,
95
+ "piqa/accuracy/val": 0.5767138193688792,
96
+ "piqa/accuracy/group_average": 0.5767138193688792,
97
+ "piqa/accuracy/seq_average": 0.5767138193688792,
98
+ "ai2arc/accuracy/ARC-Easy": 0.3192389006342495,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.2,
100
+ "ai2arc/accuracy/group_average": 0.25961945031712474,
101
+ "ai2arc/accuracy/seq_average": 0.2798866855524079,
102
+ "mmlu/accuracy/MMLU": 0.261136932427601,
103
+ "mmlu/accuracy/group_average": 0.261136932427601,
104
+ "mmlu/accuracy/seq_average": 0.261136932427601,
105
+ "openbookqa/accuracy/test": 0.27,
106
+ "openbookqa/accuracy/group_average": 0.27,
107
+ "openbookqa/accuracy/seq_average": 0.27,
108
+ "race/accuracy/test/high": 0.26043453401943967,
109
+ "race/accuracy/test/middle": 0.33774373259052926,
110
+ "race/accuracy/group_average": 0.29908913330498443,
111
+ "race/accuracy/seq_average": 0.28293473854884477,
112
+ "siqa/accuracy/dev": 0.3556806550665302,
113
+ "siqa/accuracy/group_average": 0.3556806550665302,
114
+ "siqa/accuracy/seq_average": 0.3556806550665302,
115
+ "winogrande/accuracy/dev": 0.5059194948697711,
116
+ "winogrande/accuracy/group_average": 0.5059194948697711,
117
+ "winogrande/accuracy/seq_average": 0.5059194948697711,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.24733824733824733,
119
+ "commonsenseqa/accuracy/group_average": 0.24733824733824733,
120
+ "commonsenseqa/accuracy/seq_average": 0.24733824733824733
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb/export/result-model-70000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.65325685531374,
3
+ "val/accuracy": 0.47258068266369047,
4
+ "val/perplexity": 14.20021144960697,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.5280458438470497,
8
+ "lambada/accuracy/total": 0.24592391304347827,
9
+ "lambada/accuracy/openai_last_token": 0.7552406832298136,
10
+ "lambada/perplexity": 12.934206584598552,
11
+ "lambada/lm_loss": 3.2085472225481486,
12
+ "lambada/lm_perplexity": 24.743113863087796,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.35925229785358437,
16
+ "mean_loss": 2.590651349580395,
17
+ "blimp/accuracy/passive_2": 0.881,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.98,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.81,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.863,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.553,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.909,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.323,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.473,
25
+ "blimp/accuracy/principle_A_domain_2": 0.76,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.985,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.904,
28
+ "blimp/accuracy/principle_A_domain_3": 0.522,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.919,
30
+ "blimp/accuracy/animate_subject_trans": 0.902,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.861,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.664,
33
+ "blimp/accuracy/transitive": 0.862,
34
+ "blimp/accuracy/sentential_subject_island": 0.3,
35
+ "blimp/accuracy/adjunct_island": 0.76,
36
+ "blimp/accuracy/intransitive": 0.76,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.859,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.958,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.367,
40
+ "blimp/accuracy/principle_A_case_1": 0.999,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.229,
42
+ "blimp/accuracy/only_npi_scope": 0.693,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.62,
44
+ "blimp/accuracy/passive_1": 0.888,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.891,
46
+ "blimp/accuracy/inchoative": 0.612,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.95,
48
+ "blimp/accuracy/principle_A_c_command": 0.632,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.785,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.761,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.465,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.92,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.434,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.921,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.596,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.827,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.896,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.905,
59
+ "blimp/accuracy/principle_A_case_2": 0.954,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.833,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.992,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.749,
63
+ "blimp/accuracy/wh_island": 0.799,
64
+ "blimp/accuracy/principle_A_domain_1": 0.986,
65
+ "blimp/accuracy/complex_NP_island": 0.569,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.975,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.846,
68
+ "blimp/accuracy/drop_argument": 0.75,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.809,
70
+ "blimp/accuracy/animate_subject_passive": 0.779,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.988,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.882,
73
+ "blimp/accuracy/npi_present_2": 0.577,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.914,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.988,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.941,
77
+ "blimp/accuracy/existential_there_object_raising": 0.81,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.226,
79
+ "blimp/accuracy/npi_present_1": 0.553,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.972,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.445,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972,
83
+ "blimp/accuracy/causative": 0.697,
84
+ "blimp/accuracy/group_average": 0.7642537313432837,
85
+ "blimp/accuracy/seq_average": 0.7642537313432836,
86
+ "cbt/accuracy/NE": 0.7371794871794872,
87
+ "cbt/accuracy/V": 0.8976,
88
+ "cbt/accuracy/CN": 0.8128,
89
+ "cbt/accuracy/P": 0.886,
90
+ "cbt/accuracy/group_average": 0.8333948717948718,
91
+ "cbt/accuracy/seq_average": 0.8334333733493398,
92
+ "hellaswag/accuracy/val": 0.28719378609838675,
93
+ "hellaswag/accuracy/group_average": 0.28719378609838675,
94
+ "hellaswag/accuracy/seq_average": 0.28719378609838675,
95
+ "piqa/accuracy/val": 0.5772578890097932,
96
+ "piqa/accuracy/group_average": 0.5772578890097932,
97
+ "piqa/accuracy/seq_average": 0.5772578890097932,
98
+ "ai2arc/accuracy/ARC-Easy": 0.33403805496828753,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.20429184549356222,
100
+ "ai2arc/accuracy/group_average": 0.2691649502309249,
101
+ "ai2arc/accuracy/seq_average": 0.29121813031161475,
102
+ "mmlu/accuracy/MMLU": 0.2609939220593493,
103
+ "mmlu/accuracy/group_average": 0.2609939220593493,
104
+ "mmlu/accuracy/seq_average": 0.2609939220593493,
105
+ "openbookqa/accuracy/test": 0.268,
106
+ "openbookqa/accuracy/group_average": 0.268,
107
+ "openbookqa/accuracy/seq_average": 0.268,
108
+ "race/accuracy/test/high": 0.2612921669525443,
109
+ "race/accuracy/test/middle": 0.34192200557103064,
110
+ "race/accuracy/group_average": 0.30160708626178745,
111
+ "race/accuracy/seq_average": 0.2847588163761654,
112
+ "siqa/accuracy/dev": 0.3592630501535312,
113
+ "siqa/accuracy/group_average": 0.3592630501535312,
114
+ "siqa/accuracy/seq_average": 0.3592630501535312,
115
+ "winogrande/accuracy/dev": 0.5035516969218626,
116
+ "winogrande/accuracy/group_average": 0.5035516969218626,
117
+ "winogrande/accuracy/seq_average": 0.5035516969218626,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.257985257985258,
119
+ "commonsenseqa/accuracy/group_average": 0.257985257985258,
120
+ "commonsenseqa/accuracy/seq_average": 0.257985257985258
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb/export/result-model-80000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.630677238343254,
3
+ "val/accuracy": 0.4760829380580357,
4
+ "val/perplexity": 13.883168933406056,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.499687431761937,
8
+ "lambada/accuracy/total": 0.24941770186335405,
9
+ "lambada/accuracy/openai_last_token": 0.7608695652173914,
10
+ "lambada/perplexity": 12.631674255586239,
11
+ "lambada/lm_loss": 3.1846876623812,
12
+ "lambada/lm_perplexity": 24.15974123122619,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.36275031996069484,
16
+ "mean_loss": 2.5651823350525955,
17
+ "blimp/accuracy/passive_2": 0.898,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.98,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.818,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.871,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.603,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.911,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.27,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.498,
25
+ "blimp/accuracy/principle_A_domain_2": 0.791,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.989,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.9,
28
+ "blimp/accuracy/principle_A_domain_3": 0.538,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.935,
30
+ "blimp/accuracy/animate_subject_trans": 0.896,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.889,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.658,
33
+ "blimp/accuracy/transitive": 0.882,
34
+ "blimp/accuracy/sentential_subject_island": 0.299,
35
+ "blimp/accuracy/adjunct_island": 0.788,
36
+ "blimp/accuracy/intransitive": 0.792,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.855,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.934,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.355,
40
+ "blimp/accuracy/principle_A_case_1": 0.999,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.233,
42
+ "blimp/accuracy/only_npi_scope": 0.712,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.757,
44
+ "blimp/accuracy/passive_1": 0.901,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.896,
46
+ "blimp/accuracy/inchoative": 0.644,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.94,
48
+ "blimp/accuracy/principle_A_c_command": 0.643,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.866,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.766,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.464,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.912,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.395,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.927,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.663,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.802,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.859,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.9,
59
+ "blimp/accuracy/principle_A_case_2": 0.954,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.832,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.772,
63
+ "blimp/accuracy/wh_island": 0.772,
64
+ "blimp/accuracy/principle_A_domain_1": 0.973,
65
+ "blimp/accuracy/complex_NP_island": 0.56,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.968,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.852,
68
+ "blimp/accuracy/drop_argument": 0.776,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.785,
70
+ "blimp/accuracy/animate_subject_passive": 0.787,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.991,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.872,
73
+ "blimp/accuracy/npi_present_2": 0.641,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.927,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.986,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.951,
77
+ "blimp/accuracy/existential_there_object_raising": 0.831,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.285,
79
+ "blimp/accuracy/npi_present_1": 0.621,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.962,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.413,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.966,
83
+ "blimp/accuracy/causative": 0.705,
84
+ "blimp/accuracy/group_average": 0.7731044776119402,
85
+ "blimp/accuracy/seq_average": 0.7731044776119403,
86
+ "cbt/accuracy/NE": 0.7524038461538461,
87
+ "cbt/accuracy/V": 0.8992,
88
+ "cbt/accuracy/CN": 0.8164,
89
+ "cbt/accuracy/P": 0.8848,
90
+ "cbt/accuracy/group_average": 0.8382009615384616,
91
+ "cbt/accuracy/seq_average": 0.8382352941176471,
92
+ "hellaswag/accuracy/val": 0.29267078271260705,
93
+ "hellaswag/accuracy/group_average": 0.29267078271260705,
94
+ "hellaswag/accuracy/seq_average": 0.29267078271260705,
95
+ "piqa/accuracy/val": 0.5701849836779108,
96
+ "piqa/accuracy/group_average": 0.5701849836779108,
97
+ "piqa/accuracy/seq_average": 0.5701849836779108,
98
+ "ai2arc/accuracy/ARC-Easy": 0.33192389006342493,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.20772532188841203,
100
+ "ai2arc/accuracy/group_average": 0.2698246059759185,
101
+ "ai2arc/accuracy/seq_average": 0.29093484419263455,
102
+ "mmlu/accuracy/MMLU": 0.2615659635323561,
103
+ "mmlu/accuracy/group_average": 0.2615659635323561,
104
+ "mmlu/accuracy/seq_average": 0.2615659635323561,
105
+ "openbookqa/accuracy/test": 0.294,
106
+ "openbookqa/accuracy/group_average": 0.294,
107
+ "openbookqa/accuracy/seq_average": 0.294,
108
+ "race/accuracy/test/high": 0.2670097198399085,
109
+ "race/accuracy/test/middle": 0.346100278551532,
110
+ "race/accuracy/group_average": 0.30655499919572027,
111
+ "race/accuracy/seq_average": 0.29002837454398056,
112
+ "siqa/accuracy/dev": 0.3587512794268168,
113
+ "siqa/accuracy/group_average": 0.3587512794268168,
114
+ "siqa/accuracy/seq_average": 0.3587512794268168,
115
+ "winogrande/accuracy/dev": 0.5090765588003157,
116
+ "winogrande/accuracy/group_average": 0.5090765588003157,
117
+ "winogrande/accuracy/seq_average": 0.5090765588003157,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.2497952497952498,
119
+ "commonsenseqa/accuracy/group_average": 0.2497952497952498,
120
+ "commonsenseqa/accuracy/seq_average": 0.2497952497952498
121
+ }
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_standard_lb/export/result-model-90000.pth.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.6191844637431796,
3
+ "val/accuracy": 0.4788682725694444,
4
+ "val/perplexity": 13.724526171049808,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.505079944681677,
8
+ "lambada/accuracy/total": 0.24631211180124224,
9
+ "lambada/accuracy/openai_last_token": 0.7569875776397516,
10
+ "lambada/perplexity": 12.18903683885384,
11
+ "lambada/lm_loss": 3.1642302502556197,
12
+ "lambada/lm_perplexity": 23.67051664433095,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.36259019218534333,
16
+ "mean_loss": 2.5621322042124284,
17
+ "blimp/accuracy/passive_2": 0.89,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.976,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.797,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.886,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.589,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.914,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.322,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.5,
25
+ "blimp/accuracy/principle_A_domain_2": 0.783,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.986,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.904,
28
+ "blimp/accuracy/principle_A_domain_3": 0.535,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.928,
30
+ "blimp/accuracy/animate_subject_trans": 0.895,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.896,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.679,
33
+ "blimp/accuracy/transitive": 0.879,
34
+ "blimp/accuracy/sentential_subject_island": 0.292,
35
+ "blimp/accuracy/adjunct_island": 0.775,
36
+ "blimp/accuracy/intransitive": 0.767,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.846,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.922,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.402,
40
+ "blimp/accuracy/principle_A_case_1": 1.0,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.236,
42
+ "blimp/accuracy/only_npi_scope": 0.713,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.635,
44
+ "blimp/accuracy/passive_1": 0.885,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.913,
46
+ "blimp/accuracy/inchoative": 0.625,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.958,
48
+ "blimp/accuracy/principle_A_c_command": 0.657,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.848,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.788,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.504,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.923,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.466,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.928,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.614,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.815,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.85,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.923,
59
+ "blimp/accuracy/principle_A_case_2": 0.957,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.833,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.821,
63
+ "blimp/accuracy/wh_island": 0.767,
64
+ "blimp/accuracy/principle_A_domain_1": 0.976,
65
+ "blimp/accuracy/complex_NP_island": 0.55,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.962,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.888,
68
+ "blimp/accuracy/drop_argument": 0.774,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.806,
70
+ "blimp/accuracy/animate_subject_passive": 0.793,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.991,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.871,
73
+ "blimp/accuracy/npi_present_2": 0.66,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.927,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.992,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.945,
77
+ "blimp/accuracy/existential_there_object_raising": 0.83,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.273,
79
+ "blimp/accuracy/npi_present_1": 0.626,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.969,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.47,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.97,
83
+ "blimp/accuracy/causative": 0.71,
84
+ "blimp/accuracy/group_average": 0.7760298507462686,
85
+ "blimp/accuracy/seq_average": 0.7760298507462686,
86
+ "cbt/accuracy/NE": 0.7528044871794872,
87
+ "cbt/accuracy/V": 0.9004,
88
+ "cbt/accuracy/CN": 0.8188,
89
+ "cbt/accuracy/P": 0.8924,
90
+ "cbt/accuracy/group_average": 0.8411011217948717,
91
+ "cbt/accuracy/seq_average": 0.8411364545818327,
92
+ "hellaswag/accuracy/val": 0.29336785500896234,
93
+ "hellaswag/accuracy/group_average": 0.29336785500896234,
94
+ "hellaswag/accuracy/seq_average": 0.29336785500896234,
95
+ "piqa/accuracy/val": 0.5805223068552775,
96
+ "piqa/accuracy/group_average": 0.5805223068552775,
97
+ "piqa/accuracy/seq_average": 0.5805223068552775,
98
+ "ai2arc/accuracy/ARC-Easy": 0.3281183932346723,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.20772532188841203,
100
+ "ai2arc/accuracy/group_average": 0.2679218575615422,
101
+ "ai2arc/accuracy/seq_average": 0.288385269121813,
102
+ "mmlu/accuracy/MMLU": 0.25927779764032893,
103
+ "mmlu/accuracy/group_average": 0.25927779764032893,
104
+ "mmlu/accuracy/seq_average": 0.25927779764032893,
105
+ "openbookqa/accuracy/test": 0.28,
106
+ "openbookqa/accuracy/group_average": 0.28,
107
+ "openbookqa/accuracy/seq_average": 0.28,
108
+ "race/accuracy/test/high": 0.2641509433962264,
109
+ "race/accuracy/test/middle": 0.3447075208913649,
110
+ "race/accuracy/group_average": 0.3044292321437957,
111
+ "race/accuracy/seq_average": 0.2875962707742197,
112
+ "siqa/accuracy/dev": 0.3592630501535312,
113
+ "siqa/accuracy/group_average": 0.3592630501535312,
114
+ "siqa/accuracy/seq_average": 0.3592630501535312,
115
+ "winogrande/accuracy/dev": 0.5074980268350434,
116
+ "winogrande/accuracy/group_average": 0.5074980268350434,
117
+ "winogrande/accuracy/seq_average": 0.5074980268350434,
118
+ "commonsenseqa/accuracy/dev_rand_split": 0.2538902538902539,
119
+ "commonsenseqa/accuracy/group_average": 0.2538902538902539,
120
+ "commonsenseqa/accuracy/seq_average": 0.2538902538902539
121
+ }