Upload folder using huggingface_hub

#363
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_33_experts/export/result-model-10000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 3.0193529885912698, "val/accuracy": 0.42433093843005953, "val/perplexity": 20.47803787283959, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7976781240901594, "lambada/accuracy/total": 0.1626552795031056, "lambada/accuracy/openai_last_token": 0.7119565217391305, "lambada/perplexity": 25.716909525850493, "lambada/lm_loss": 3.569134057728088, "lambada/lm_perplexity": 35.4858511445869, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.29349310896658254, "mean_loss": 2.9085155563407143, "blimp/accuracy/passive_2": 0.87, "blimp/accuracy/determiner_noun_agreement_2": 0.965, "blimp/accuracy/ellipsis_n_bar_1": 0.709, "blimp/accuracy/tough_vs_raising_2": 0.786, "blimp/accuracy/tough_vs_raising_1": 0.632, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.884, "blimp/accuracy/principle_A_reconstruction": 0.549, "blimp/accuracy/wh_vs_that_with_gap": 0.452, "blimp/accuracy/principle_A_domain_2": 0.827, "blimp/accuracy/determiner_noun_agreement_1": 0.983, "blimp/accuracy/ellipsis_n_bar_2": 0.892, "blimp/accuracy/principle_A_domain_3": 0.565, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.875, "blimp/accuracy/animate_subject_trans": 0.884, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.752, "blimp/accuracy/distractor_agreement_relative_clause": 0.462, "blimp/accuracy/transitive": 0.793, "blimp/accuracy/sentential_subject_island": 0.304, "blimp/accuracy/adjunct_island": 0.82, "blimp/accuracy/intransitive": 0.7, "blimp/accuracy/existential_there_subject_raising": 0.82, "blimp/accuracy/irregular_past_participle_adjectives": 0.906, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.286, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.202, "blimp/accuracy/only_npi_scope": 0.554, "blimp/accuracy/superlative_quantifiers_2": 0.617, "blimp/accuracy/passive_1": 0.877, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.874, "blimp/accuracy/inchoative": 0.526, "blimp/accuracy/anaphor_gender_agreement": 0.928, "blimp/accuracy/principle_A_c_command": 0.439, "blimp/accuracy/only_npi_licensor_present": 0.276, "blimp/accuracy/expletive_it_object_raising": 0.775, "blimp/accuracy/left_branch_island_simple_question": 0.319, "blimp/accuracy/wh_questions_subject_gap": 0.908, "blimp/accuracy/existential_there_quantifiers_2": 0.407, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.884, "blimp/accuracy/sentential_negation_npi_scope": 0.419, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.805, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.882, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.801, "blimp/accuracy/principle_A_case_2": 0.934, "blimp/accuracy/distractor_agreement_relational_noun": 0.734, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.993, "blimp/accuracy/superlative_quantifiers_1": 0.729, "blimp/accuracy/wh_island": 0.862, "blimp/accuracy/principle_A_domain_1": 0.992, "blimp/accuracy/complex_NP_island": 0.638, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.943, "blimp/accuracy/irregular_past_participle_verbs": 0.758, "blimp/accuracy/drop_argument": 0.71, "blimp/accuracy/wh_questions_object_gap": 0.78, "blimp/accuracy/animate_subject_passive": 0.736, "blimp/accuracy/existential_there_quantifiers_1": 0.96, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.849, "blimp/accuracy/npi_present_2": 0.535, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.847, "blimp/accuracy/anaphor_number_agreement": 0.979, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.926, "blimp/accuracy/existential_there_object_raising": 0.786, "blimp/accuracy/matrix_question_npi_licensor_present": 0.061, "blimp/accuracy/npi_present_1": 0.503, "blimp/accuracy/wh_vs_that_no_gap": 0.962, "blimp/accuracy/left_branch_island_echo_question": 0.509, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.962, "blimp/accuracy/causative": 0.608, "blimp/accuracy/group_average": 0.7239552238805971, "blimp/accuracy/seq_average": 0.723955223880597, "cbt/accuracy/NE": 0.6854967948717948, "cbt/accuracy/V": 0.862, "cbt/accuracy/CN": 0.7308, "cbt/accuracy/P": 0.8364, "cbt/accuracy/group_average": 0.7786741987179486, "cbt/accuracy/seq_average": 0.7787114845938375, "hellaswag/accuracy/val": 0.2722565226050587, "hellaswag/accuracy/group_average": 0.2722565226050587, "hellaswag/accuracy/seq_average": 0.2722565226050587, "piqa/accuracy/val": 0.5467899891186072, "piqa/accuracy/group_average": 0.5467899891186072, "piqa/accuracy/seq_average": 0.5467899891186072, "ai2arc/accuracy/ARC-Easy": 0.3150105708245243, "ai2arc/accuracy/ARC-Challenge": 0.2034334763948498, "ai2arc/accuracy/group_average": 0.25922202360968705, "ai2arc/accuracy/seq_average": 0.2781869688385269, "race/accuracy/test/high": 0.2535734705546026, "race/accuracy/test/middle": 0.32729805013927576, "race/accuracy/group_average": 0.2904357603469392, "race/accuracy/seq_average": 0.27503040129712203, "siqa/accuracy/dev": 0.35670419651995905, "siqa/accuracy/group_average": 0.35670419651995905, "siqa/accuracy/seq_average": 0.35670419651995905, "commonsenseqa/accuracy/dev_rand_split": 0.23587223587223588, "commonsenseqa/accuracy/group_average": 0.23587223587223588, "commonsenseqa/accuracy/seq_average": 0.23587223587223588}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_33_experts/export/result-model-100000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.609346662248884, "val/accuracy": 0.4792683919270833, "val/perplexity": 13.590168980277008, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.648307468580163, "lambada/accuracy/total": 0.2511645962732919, "lambada/accuracy/openai_last_token": 0.7624223602484472, "lambada/perplexity": 12.533536360437177, "lambada/lm_loss": 3.1833163269582125, "lambada/lm_perplexity": 24.126632828816643, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.36521649410018764, "mean_loss": 2.628827065414524, "blimp/accuracy/passive_2": 0.911, "blimp/accuracy/determiner_noun_agreement_2": 0.99, "blimp/accuracy/ellipsis_n_bar_1": 0.815, "blimp/accuracy/tough_vs_raising_2": 0.867, "blimp/accuracy/tough_vs_raising_1": 0.607, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.916, "blimp/accuracy/principle_A_reconstruction": 0.247, "blimp/accuracy/wh_vs_that_with_gap": 0.535, "blimp/accuracy/principle_A_domain_2": 0.832, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.907, "blimp/accuracy/principle_A_domain_3": 0.577, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.917, "blimp/accuracy/animate_subject_trans": 0.892, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.88, "blimp/accuracy/distractor_agreement_relative_clause": 0.681, "blimp/accuracy/transitive": 0.874, "blimp/accuracy/sentential_subject_island": 0.286, "blimp/accuracy/adjunct_island": 0.839, "blimp/accuracy/intransitive": 0.809, "blimp/accuracy/existential_there_subject_raising": 0.897, "blimp/accuracy/irregular_past_participle_adjectives": 0.911, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.567, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.298, "blimp/accuracy/only_npi_scope": 0.57, "blimp/accuracy/superlative_quantifiers_2": 0.834, "blimp/accuracy/passive_1": 0.9, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.908, "blimp/accuracy/inchoative": 0.633, "blimp/accuracy/anaphor_gender_agreement": 0.958, "blimp/accuracy/principle_A_c_command": 0.561, "blimp/accuracy/only_npi_licensor_present": 0.37, "blimp/accuracy/expletive_it_object_raising": 0.768, "blimp/accuracy/left_branch_island_simple_question": 0.597, "blimp/accuracy/wh_questions_subject_gap": 0.92, "blimp/accuracy/existential_there_quantifiers_2": 0.591, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.943, "blimp/accuracy/sentential_negation_npi_scope": 0.713, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.8, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.837, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.894, "blimp/accuracy/principle_A_case_2": 0.939, "blimp/accuracy/distractor_agreement_relational_noun": 0.785, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.503, "blimp/accuracy/wh_island": 0.752, "blimp/accuracy/principle_A_domain_1": 0.98, "blimp/accuracy/complex_NP_island": 0.605, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.979, "blimp/accuracy/irregular_past_participle_verbs": 0.878, "blimp/accuracy/drop_argument": 0.747, "blimp/accuracy/wh_questions_object_gap": 0.774, "blimp/accuracy/animate_subject_passive": 0.792, "blimp/accuracy/existential_there_quantifiers_1": 0.984, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.867, "blimp/accuracy/npi_present_2": 0.591, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.946, "blimp/accuracy/anaphor_number_agreement": 0.987, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.945, "blimp/accuracy/existential_there_object_raising": 0.861, "blimp/accuracy/matrix_question_npi_licensor_present": 0.165, "blimp/accuracy/npi_present_1": 0.554, "blimp/accuracy/wh_vs_that_no_gap": 0.957, "blimp/accuracy/left_branch_island_echo_question": 0.522, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.967, "blimp/accuracy/causative": 0.708, "blimp/accuracy/group_average": 0.7704925373134327, "blimp/accuracy/seq_average": 0.7704925373134328, "cbt/accuracy/NE": 0.7632211538461539, "cbt/accuracy/V": 0.9064, "cbt/accuracy/CN": 0.8232, "cbt/accuracy/P": 0.8828, "cbt/accuracy/group_average": 0.8439052884615384, "cbt/accuracy/seq_average": 0.843937575030012, "hellaswag/accuracy/val": 0.29117705636327423, "hellaswag/accuracy/group_average": 0.29117705636327423, "hellaswag/accuracy/seq_average": 0.29117705636327423, "piqa/accuracy/val": 0.5810663764961915, "piqa/accuracy/group_average": 0.5810663764961915, "piqa/accuracy/seq_average": 0.5810663764961915, "ai2arc/accuracy/ARC-Easy": 0.33276955602537, "ai2arc/accuracy/ARC-Challenge": 0.20600858369098712, "ai2arc/accuracy/group_average": 0.26938906985817856, "ai2arc/accuracy/seq_average": 0.29093484419263455, "race/accuracy/test/high": 0.2687249857061178, "race/accuracy/test/middle": 0.3384401114206128, "race/accuracy/group_average": 0.3035825485633653, "race/accuracy/seq_average": 0.28901499797324687, "siqa/accuracy/dev": 0.36489252814739, "siqa/accuracy/group_average": 0.36489252814739, "siqa/accuracy/seq_average": 0.36489252814739, "commonsenseqa/accuracy/dev_rand_split": 0.25143325143325146, "commonsenseqa/accuracy/group_average": 0.25143325143325146, "commonsenseqa/accuracy/seq_average": 0.25143325143325146}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_33_experts/export/result-model-20000.pth.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.8779788547092013,
3
+ "val/accuracy": 0.44091796875,
4
+ "val/perplexity": 17.778304306669796,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.6696091172117624,
8
+ "lambada/accuracy/total": 0.1906055900621118,
9
+ "lambada/accuracy/openai_last_token": 0.7303959627329193,
10
+ "lambada/perplexity": 22.08173634475889,
11
+ "lambada/lm_loss": 3.4162537295294926,
12
+ "lambada/lm_perplexity": 30.455107970507406,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.3157617794060559,
16
+ "mean_loss": 2.7737939859604817,
17
+ "blimp/accuracy/passive_2": 0.87,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.96,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.778,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.831,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.614,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.882,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.678,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.467,
25
+ "blimp/accuracy/principle_A_domain_2": 0.806,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.979,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.89,
28
+ "blimp/accuracy/principle_A_domain_3": 0.567,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.877,
30
+ "blimp/accuracy/animate_subject_trans": 0.875,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.811,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.552,
33
+ "blimp/accuracy/transitive": 0.837,
34
+ "blimp/accuracy/sentential_subject_island": 0.284,
35
+ "blimp/accuracy/adjunct_island": 0.805,
36
+ "blimp/accuracy/intransitive": 0.721,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.846,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.935,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.408,
40
+ "blimp/accuracy/principle_A_case_1": 1.0,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.17,
42
+ "blimp/accuracy/only_npi_scope": 0.593,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.648,
44
+ "blimp/accuracy/passive_1": 0.874,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.866,
46
+ "blimp/accuracy/inchoative": 0.538,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.912,
48
+ "blimp/accuracy/principle_A_c_command": 0.459,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.249,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.772,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.468,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.892,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.49,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.894,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.475,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.79,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.889,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.84,
59
+ "blimp/accuracy/principle_A_case_2": 0.898,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.74,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.475,
63
+ "blimp/accuracy/wh_island": 0.896,
64
+ "blimp/accuracy/principle_A_domain_1": 0.986,
65
+ "blimp/accuracy/complex_NP_island": 0.567,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.941,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.836,
68
+ "blimp/accuracy/drop_argument": 0.733,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.815,
70
+ "blimp/accuracy/animate_subject_passive": 0.777,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.982,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.857,
73
+ "blimp/accuracy/npi_present_2": 0.583,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.897,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.973,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.923,
77
+ "blimp/accuracy/existential_there_object_raising": 0.784,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.077,
79
+ "blimp/accuracy/npi_present_1": 0.551,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.982,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.433,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.979,
83
+ "blimp/accuracy/causative": 0.672,
84
+ "blimp/accuracy/group_average": 0.7378507462686564,
85
+ "blimp/accuracy/seq_average": 0.7378507462686568,
86
+ "cbt/accuracy/NE": 0.6999198717948718,
87
+ "cbt/accuracy/V": 0.8776,
88
+ "cbt/accuracy/CN": 0.76,
89
+ "cbt/accuracy/P": 0.8632,
90
+ "cbt/accuracy/group_average": 0.800179967948718,
91
+ "cbt/accuracy/seq_average": 0.800220088035214,
92
+ "hellaswag/accuracy/val": 0.28201553475403307,
93
+ "hellaswag/accuracy/group_average": 0.28201553475403307,
94
+ "hellaswag/accuracy/seq_average": 0.28201553475403307,
95
+ "piqa/accuracy/val": 0.5527747551686616,
96
+ "piqa/accuracy/group_average": 0.5527747551686616,
97
+ "piqa/accuracy/seq_average": 0.5527747551686616,
98
+ "ai2arc/accuracy/ARC-Easy": 0.30613107822410146,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.20772532188841203,
100
+ "ai2arc/accuracy/group_average": 0.2569282000562567,
101
+ "ai2arc/accuracy/seq_average": 0.27365439093484417,
102
+ "race/accuracy/test/high": 0.252715837621498,
103
+ "race/accuracy/test/middle": 0.3328690807799443,
104
+ "race/accuracy/group_average": 0.29279245920072117,
105
+ "race/accuracy/seq_average": 0.2760437778678557,
106
+ "siqa/accuracy/dev": 0.3633572159672467,
107
+ "siqa/accuracy/group_average": 0.3633572159672467,
108
+ "siqa/accuracy/seq_average": 0.3633572159672467,
109
+ "commonsenseqa/accuracy/dev_rand_split": 0.24406224406224405,
110
+ "commonsenseqa/accuracy/group_average": 0.24406224406224405,
111
+ "commonsenseqa/accuracy/seq_average": 0.24406224406224405
112
+ }
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_33_experts/export/result-model-30000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.8066788930741566, "val/accuracy": 0.4506312779017857, "val/perplexity": 16.554846404663273, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.621950706339771, "lambada/accuracy/total": 0.21195652173913043, "lambada/accuracy/openai_last_token": 0.7371894409937888, "lambada/perplexity": 16.762477855495113, "lambada/lm_loss": 3.3654079853835968, "lambada/lm_perplexity": 28.945304151392328, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3312938998204581, "mean_loss": 2.7143147997069637, "blimp/accuracy/passive_2": 0.896, "blimp/accuracy/determiner_noun_agreement_2": 0.98, "blimp/accuracy/ellipsis_n_bar_1": 0.786, "blimp/accuracy/tough_vs_raising_2": 0.87, "blimp/accuracy/tough_vs_raising_1": 0.646, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.88, "blimp/accuracy/principle_A_reconstruction": 0.482, "blimp/accuracy/wh_vs_that_with_gap": 0.554, "blimp/accuracy/principle_A_domain_2": 0.772, "blimp/accuracy/determiner_noun_agreement_1": 0.984, "blimp/accuracy/ellipsis_n_bar_2": 0.87, "blimp/accuracy/principle_A_domain_3": 0.546, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.907, "blimp/accuracy/animate_subject_trans": 0.867, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.812, "blimp/accuracy/distractor_agreement_relative_clause": 0.633, "blimp/accuracy/transitive": 0.825, "blimp/accuracy/sentential_subject_island": 0.281, "blimp/accuracy/adjunct_island": 0.852, "blimp/accuracy/intransitive": 0.752, "blimp/accuracy/existential_there_subject_raising": 0.867, "blimp/accuracy/irregular_past_participle_adjectives": 0.871, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.527, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.277, "blimp/accuracy/only_npi_scope": 0.706, "blimp/accuracy/superlative_quantifiers_2": 0.719, "blimp/accuracy/passive_1": 0.873, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.873, "blimp/accuracy/inchoative": 0.597, "blimp/accuracy/anaphor_gender_agreement": 0.952, "blimp/accuracy/principle_A_c_command": 0.529, "blimp/accuracy/only_npi_licensor_present": 0.509, "blimp/accuracy/expletive_it_object_raising": 0.799, "blimp/accuracy/left_branch_island_simple_question": 0.537, "blimp/accuracy/wh_questions_subject_gap": 0.884, "blimp/accuracy/existential_there_quantifiers_2": 0.477, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.911, "blimp/accuracy/sentential_negation_npi_scope": 0.638, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.792, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.85, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.847, "blimp/accuracy/principle_A_case_2": 0.925, "blimp/accuracy/distractor_agreement_relational_noun": 0.775, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.992, "blimp/accuracy/superlative_quantifiers_1": 0.674, "blimp/accuracy/wh_island": 0.754, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.577, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.964, "blimp/accuracy/irregular_past_participle_verbs": 0.883, "blimp/accuracy/drop_argument": 0.75, "blimp/accuracy/wh_questions_object_gap": 0.757, "blimp/accuracy/animate_subject_passive": 0.818, "blimp/accuracy/existential_there_quantifiers_1": 0.985, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.853, "blimp/accuracy/npi_present_2": 0.555, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.889, "blimp/accuracy/anaphor_number_agreement": 0.984, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.934, "blimp/accuracy/existential_there_object_raising": 0.784, "blimp/accuracy/matrix_question_npi_licensor_present": 0.102, "blimp/accuracy/npi_present_1": 0.525, "blimp/accuracy/wh_vs_that_no_gap": 0.961, "blimp/accuracy/left_branch_island_echo_question": 0.422, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.955, "blimp/accuracy/causative": 0.66, "blimp/accuracy/group_average": 0.7566567164179103, "blimp/accuracy/seq_average": 0.7566567164179104, "cbt/accuracy/NE": 0.7375801282051282, "cbt/accuracy/V": 0.8892, "cbt/accuracy/CN": 0.7824, "cbt/accuracy/P": 0.8632, "cbt/accuracy/group_average": 0.8180950320512821, "cbt/accuracy/seq_average": 0.8181272509003601, "hellaswag/accuracy/val": 0.2804222266480781, "hellaswag/accuracy/group_average": 0.2804222266480781, "hellaswag/accuracy/seq_average": 0.2804222266480781, "piqa/accuracy/val": 0.5690968443960827, "piqa/accuracy/group_average": 0.5690968443960827, "piqa/accuracy/seq_average": 0.5690968443960827, "ai2arc/accuracy/ARC-Easy": 0.31881606765327697, "ai2arc/accuracy/ARC-Challenge": 0.1982832618025751, "ai2arc/accuracy/group_average": 0.25854966472792607, "ai2arc/accuracy/seq_average": 0.2790368271954674, "race/accuracy/test/high": 0.26329331046312177, "race/accuracy/test/middle": 0.3328690807799443, "race/accuracy/group_average": 0.298081195621533, "race/accuracy/seq_average": 0.28354276449128496, "siqa/accuracy/dev": 0.3623336745138178, "siqa/accuracy/group_average": 0.3623336745138178, "siqa/accuracy/seq_average": 0.3623336745138178, "commonsenseqa/accuracy/dev_rand_split": 0.2375102375102375, "commonsenseqa/accuracy/group_average": 0.2375102375102375, "commonsenseqa/accuracy/seq_average": 0.2375102375102375}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_33_experts/export/result-model-40000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.754640367296007, "val/accuracy": 0.45817638578869047, "val/perplexity": 15.71538811887805, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.529570395902077, "lambada/accuracy/total": 0.21195652173913043, "lambada/accuracy/openai_last_token": 0.7402950310559007, "lambada/perplexity": 16.018655883617395, "lambada/lm_loss": 3.3054434892472218, "lambada/lm_perplexity": 27.260628704052912, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3350664537639104, "mean_loss": 2.642105381599042, "blimp/accuracy/passive_2": 0.896, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.818, "blimp/accuracy/tough_vs_raising_2": 0.875, "blimp/accuracy/tough_vs_raising_1": 0.587, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.906, "blimp/accuracy/principle_A_reconstruction": 0.325, "blimp/accuracy/wh_vs_that_with_gap": 0.505, "blimp/accuracy/principle_A_domain_2": 0.797, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.895, "blimp/accuracy/principle_A_domain_3": 0.568, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.924, "blimp/accuracy/animate_subject_trans": 0.889, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.856, "blimp/accuracy/distractor_agreement_relative_clause": 0.694, "blimp/accuracy/transitive": 0.845, "blimp/accuracy/sentential_subject_island": 0.248, "blimp/accuracy/adjunct_island": 0.829, "blimp/accuracy/intransitive": 0.766, "blimp/accuracy/existential_there_subject_raising": 0.856, "blimp/accuracy/irregular_past_participle_adjectives": 0.859, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.52, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.269, "blimp/accuracy/only_npi_scope": 0.552, "blimp/accuracy/superlative_quantifiers_2": 0.614, "blimp/accuracy/passive_1": 0.893, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.888, "blimp/accuracy/inchoative": 0.589, "blimp/accuracy/anaphor_gender_agreement": 0.963, "blimp/accuracy/principle_A_c_command": 0.527, "blimp/accuracy/only_npi_licensor_present": 0.474, "blimp/accuracy/expletive_it_object_raising": 0.809, "blimp/accuracy/left_branch_island_simple_question": 0.54, "blimp/accuracy/wh_questions_subject_gap": 0.926, "blimp/accuracy/existential_there_quantifiers_2": 0.426, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.944, "blimp/accuracy/sentential_negation_npi_scope": 0.647, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.802, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.857, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.876, "blimp/accuracy/principle_A_case_2": 0.943, "blimp/accuracy/distractor_agreement_relational_noun": 0.812, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.994, "blimp/accuracy/superlative_quantifiers_1": 0.488, "blimp/accuracy/wh_island": 0.744, "blimp/accuracy/principle_A_domain_1": 0.983, "blimp/accuracy/complex_NP_island": 0.568, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.974, "blimp/accuracy/irregular_past_participle_verbs": 0.867, "blimp/accuracy/drop_argument": 0.748, "blimp/accuracy/wh_questions_object_gap": 0.766, "blimp/accuracy/animate_subject_passive": 0.782, "blimp/accuracy/existential_there_quantifiers_1": 0.987, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.856, "blimp/accuracy/npi_present_2": 0.502, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.92, "blimp/accuracy/anaphor_number_agreement": 0.984, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.952, "blimp/accuracy/existential_there_object_raising": 0.821, "blimp/accuracy/matrix_question_npi_licensor_present": 0.093, "blimp/accuracy/npi_present_1": 0.47, "blimp/accuracy/wh_vs_that_no_gap": 0.97, "blimp/accuracy/left_branch_island_echo_question": 0.474, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.97, "blimp/accuracy/causative": 0.682, "blimp/accuracy/group_average": 0.7519253731343282, "blimp/accuracy/seq_average": 0.7519253731343284, "cbt/accuracy/NE": 0.7271634615384616, "cbt/accuracy/V": 0.8932, "cbt/accuracy/CN": 0.7948, "cbt/accuracy/P": 0.868, "cbt/accuracy/group_average": 0.8207908653846153, "cbt/accuracy/seq_average": 0.820828331332533, "hellaswag/accuracy/val": 0.27723561043616807, "hellaswag/accuracy/group_average": 0.27723561043616807, "hellaswag/accuracy/seq_average": 0.27723561043616807, "piqa/accuracy/val": 0.5680087051142546, "piqa/accuracy/group_average": 0.5680087051142546, "piqa/accuracy/seq_average": 0.5680087051142546, "ai2arc/accuracy/ARC-Easy": 0.31839323467230446, "ai2arc/accuracy/ARC-Challenge": 0.20686695278969958, "ai2arc/accuracy/group_average": 0.262630093731002, "ai2arc/accuracy/seq_average": 0.28158640226628895, "race/accuracy/test/high": 0.2652944539736993, "race/accuracy/test/middle": 0.32172701949860727, "race/accuracy/group_average": 0.29351073673615324, "race/accuracy/seq_average": 0.2817186866639643, "siqa/accuracy/dev": 0.35209825997952915, "siqa/accuracy/group_average": 0.35209825997952915, "siqa/accuracy/seq_average": 0.35209825997952915, "commonsenseqa/accuracy/dev_rand_split": 0.25143325143325146, "commonsenseqa/accuracy/group_average": 0.25143325143325146, "commonsenseqa/accuracy/seq_average": 0.25143325143325146}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_33_experts/export/result-model-50000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.711526295495412, "val/accuracy": 0.4646044534350198, "val/perplexity": 15.05223214523721, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.675644015673525, "lambada/accuracy/total": 0.23699534161490685, "lambada/accuracy/openai_last_token": 0.751358695652174, "lambada/perplexity": 14.025293823640384, "lambada/lm_loss": 3.283100098057292, "lambada/lm_perplexity": 26.65828803792627, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3507998975249633, "mean_loss": 2.6935851555844685, "blimp/accuracy/passive_2": 0.901, "blimp/accuracy/determiner_noun_agreement_2": 0.993, "blimp/accuracy/ellipsis_n_bar_1": 0.809, "blimp/accuracy/tough_vs_raising_2": 0.872, "blimp/accuracy/tough_vs_raising_1": 0.57, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.908, "blimp/accuracy/principle_A_reconstruction": 0.372, "blimp/accuracy/wh_vs_that_with_gap": 0.506, "blimp/accuracy/principle_A_domain_2": 0.824, "blimp/accuracy/determiner_noun_agreement_1": 0.987, "blimp/accuracy/ellipsis_n_bar_2": 0.894, "blimp/accuracy/principle_A_domain_3": 0.554, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.91, "blimp/accuracy/animate_subject_trans": 0.899, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.859, "blimp/accuracy/distractor_agreement_relative_clause": 0.686, "blimp/accuracy/transitive": 0.855, "blimp/accuracy/sentential_subject_island": 0.266, "blimp/accuracy/adjunct_island": 0.826, "blimp/accuracy/intransitive": 0.764, "blimp/accuracy/existential_there_subject_raising": 0.883, "blimp/accuracy/irregular_past_participle_adjectives": 0.854, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.556, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.264, "blimp/accuracy/only_npi_scope": 0.59, "blimp/accuracy/superlative_quantifiers_2": 0.612, "blimp/accuracy/passive_1": 0.886, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.889, "blimp/accuracy/inchoative": 0.584, "blimp/accuracy/anaphor_gender_agreement": 0.968, "blimp/accuracy/principle_A_c_command": 0.574, "blimp/accuracy/only_npi_licensor_present": 0.352, "blimp/accuracy/expletive_it_object_raising": 0.815, "blimp/accuracy/left_branch_island_simple_question": 0.575, "blimp/accuracy/wh_questions_subject_gap": 0.936, "blimp/accuracy/existential_there_quantifiers_2": 0.561, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.927, "blimp/accuracy/sentential_negation_npi_scope": 0.641, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.824, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.874, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.873, "blimp/accuracy/principle_A_case_2": 0.925, "blimp/accuracy/distractor_agreement_relational_noun": 0.798, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.985, "blimp/accuracy/superlative_quantifiers_1": 0.583, "blimp/accuracy/wh_island": 0.799, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.609, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.965, "blimp/accuracy/irregular_past_participle_verbs": 0.841, "blimp/accuracy/drop_argument": 0.728, "blimp/accuracy/wh_questions_object_gap": 0.823, "blimp/accuracy/animate_subject_passive": 0.797, "blimp/accuracy/existential_there_quantifiers_1": 0.988, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.883, "blimp/accuracy/npi_present_2": 0.522, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.919, "blimp/accuracy/anaphor_number_agreement": 0.985, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.94, "blimp/accuracy/existential_there_object_raising": 0.838, "blimp/accuracy/matrix_question_npi_licensor_present": 0.151, "blimp/accuracy/npi_present_1": 0.452, "blimp/accuracy/wh_vs_that_no_gap": 0.97, "blimp/accuracy/left_branch_island_echo_question": 0.472, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.971, "blimp/accuracy/causative": 0.706, "blimp/accuracy/group_average": 0.7601641791044776, "blimp/accuracy/seq_average": 0.7601641791044776, "cbt/accuracy/NE": 0.7347756410256411, "cbt/accuracy/V": 0.898, "cbt/accuracy/CN": 0.7916, "cbt/accuracy/P": 0.8748, "cbt/accuracy/group_average": 0.8247939102564102, "cbt/accuracy/seq_average": 0.8248299319727891, "hellaswag/accuracy/val": 0.2877912766381199, "hellaswag/accuracy/group_average": 0.2877912766381199, "hellaswag/accuracy/seq_average": 0.2877912766381199, "piqa/accuracy/val": 0.5772578890097932, "piqa/accuracy/group_average": 0.5772578890097932, "piqa/accuracy/seq_average": 0.5772578890097932, "ai2arc/accuracy/ARC-Easy": 0.3217758985200846, "ai2arc/accuracy/ARC-Challenge": 0.21545064377682405, "ai2arc/accuracy/group_average": 0.26861327114845435, "ai2arc/accuracy/seq_average": 0.28668555240793203, "race/accuracy/test/high": 0.2610062893081761, "race/accuracy/test/middle": 0.3328690807799443, "race/accuracy/group_average": 0.2969376850440602, "race/accuracy/seq_average": 0.2819213619781111, "siqa/accuracy/dev": 0.35516888433981575, "siqa/accuracy/group_average": 0.35516888433981575, "siqa/accuracy/seq_average": 0.35516888433981575, "commonsenseqa/accuracy/dev_rand_split": 0.24242424242424243, "commonsenseqa/accuracy/group_average": 0.24242424242424243, "commonsenseqa/accuracy/seq_average": 0.24242424242424243}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_33_experts/export/result-model-60000.pth.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "val/loss": 2.6765243288070435,
3
+ "val/accuracy": 0.4695347377232143,
4
+ "val/perplexity": 14.534488301370109,
5
+ "val/time_since_best_loss": 0,
6
+ "val/time_since_best_accuracy": 0,
7
+ "lambada/loss": 2.63063949679736,
8
+ "lambada/accuracy/total": 0.21777950310559005,
9
+ "lambada/accuracy/openai_last_token": 0.7476708074534162,
10
+ "lambada/perplexity": 14.60334217178118,
11
+ "lambada/lm_loss": 3.237729481302067,
12
+ "lambada/lm_perplexity": 25.475812721337494,
13
+ "lambada/time_since_best_loss": 0,
14
+ "lambada/time_since_best_accuracy": 0,
15
+ "mean_accuracy": 0.34365712041440216,
16
+ "mean_loss": 2.653581912802202,
17
+ "blimp/accuracy/passive_2": 0.9,
18
+ "blimp/accuracy/determiner_noun_agreement_2": 0.988,
19
+ "blimp/accuracy/ellipsis_n_bar_1": 0.814,
20
+ "blimp/accuracy/tough_vs_raising_2": 0.877,
21
+ "blimp/accuracy/tough_vs_raising_1": 0.597,
22
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.905,
23
+ "blimp/accuracy/principle_A_reconstruction": 0.336,
24
+ "blimp/accuracy/wh_vs_that_with_gap": 0.52,
25
+ "blimp/accuracy/principle_A_domain_2": 0.799,
26
+ "blimp/accuracy/determiner_noun_agreement_1": 0.989,
27
+ "blimp/accuracy/ellipsis_n_bar_2": 0.889,
28
+ "blimp/accuracy/principle_A_domain_3": 0.564,
29
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.915,
30
+ "blimp/accuracy/animate_subject_trans": 0.896,
31
+ "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.868,
32
+ "blimp/accuracy/distractor_agreement_relative_clause": 0.707,
33
+ "blimp/accuracy/transitive": 0.854,
34
+ "blimp/accuracy/sentential_subject_island": 0.297,
35
+ "blimp/accuracy/adjunct_island": 0.838,
36
+ "blimp/accuracy/intransitive": 0.77,
37
+ "blimp/accuracy/existential_there_subject_raising": 0.873,
38
+ "blimp/accuracy/irregular_past_participle_adjectives": 0.872,
39
+ "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.445,
40
+ "blimp/accuracy/principle_A_case_1": 1.0,
41
+ "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.305,
42
+ "blimp/accuracy/only_npi_scope": 0.62,
43
+ "blimp/accuracy/superlative_quantifiers_2": 0.814,
44
+ "blimp/accuracy/passive_1": 0.872,
45
+ "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.907,
46
+ "blimp/accuracy/inchoative": 0.62,
47
+ "blimp/accuracy/anaphor_gender_agreement": 0.97,
48
+ "blimp/accuracy/principle_A_c_command": 0.559,
49
+ "blimp/accuracy/only_npi_licensor_present": 0.257,
50
+ "blimp/accuracy/expletive_it_object_raising": 0.783,
51
+ "blimp/accuracy/left_branch_island_simple_question": 0.49,
52
+ "blimp/accuracy/wh_questions_subject_gap": 0.935,
53
+ "blimp/accuracy/existential_there_quantifiers_2": 0.369,
54
+ "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.935,
55
+ "blimp/accuracy/sentential_negation_npi_scope": 0.686,
56
+ "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.806,
57
+ "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.866,
58
+ "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.877,
59
+ "blimp/accuracy/principle_A_case_2": 0.934,
60
+ "blimp/accuracy/distractor_agreement_relational_noun": 0.773,
61
+ "blimp/accuracy/sentential_negation_npi_licensor_present": 0.996,
62
+ "blimp/accuracy/superlative_quantifiers_1": 0.592,
63
+ "blimp/accuracy/wh_island": 0.84,
64
+ "blimp/accuracy/principle_A_domain_1": 0.988,
65
+ "blimp/accuracy/complex_NP_island": 0.612,
66
+ "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.971,
67
+ "blimp/accuracy/irregular_past_participle_verbs": 0.886,
68
+ "blimp/accuracy/drop_argument": 0.76,
69
+ "blimp/accuracy/wh_questions_object_gap": 0.821,
70
+ "blimp/accuracy/animate_subject_passive": 0.787,
71
+ "blimp/accuracy/existential_there_quantifiers_1": 0.98,
72
+ "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.892,
73
+ "blimp/accuracy/npi_present_2": 0.567,
74
+ "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.913,
75
+ "blimp/accuracy/anaphor_number_agreement": 0.985,
76
+ "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.945,
77
+ "blimp/accuracy/existential_there_object_raising": 0.798,
78
+ "blimp/accuracy/matrix_question_npi_licensor_present": 0.178,
79
+ "blimp/accuracy/npi_present_1": 0.53,
80
+ "blimp/accuracy/wh_vs_that_no_gap": 0.97,
81
+ "blimp/accuracy/left_branch_island_echo_question": 0.434,
82
+ "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.968,
83
+ "blimp/accuracy/causative": 0.724,
84
+ "blimp/accuracy/group_average": 0.7616119402985074,
85
+ "blimp/accuracy/seq_average": 0.7616119402985074,
86
+ "cbt/accuracy/NE": 0.7447916666666666,
87
+ "cbt/accuracy/V": 0.9008,
88
+ "cbt/accuracy/CN": 0.812,
89
+ "cbt/accuracy/P": 0.8788,
90
+ "cbt/accuracy/group_average": 0.8340979166666667,
91
+ "cbt/accuracy/seq_average": 0.8341336534613846,
92
+ "hellaswag/accuracy/val": 0.2863971320454093,
93
+ "hellaswag/accuracy/group_average": 0.2863971320454093,
94
+ "hellaswag/accuracy/seq_average": 0.2863971320454093,
95
+ "piqa/accuracy/val": 0.5685527747551686,
96
+ "piqa/accuracy/group_average": 0.5685527747551686,
97
+ "piqa/accuracy/seq_average": 0.5685527747551686,
98
+ "ai2arc/accuracy/ARC-Easy": 0.333615221987315,
99
+ "ai2arc/accuracy/ARC-Challenge": 0.20686695278969958,
100
+ "ai2arc/accuracy/group_average": 0.2702410873885073,
101
+ "ai2arc/accuracy/seq_average": 0.29178470254957506,
102
+ "race/accuracy/test/high": 0.2644368210405946,
103
+ "race/accuracy/test/middle": 0.3447075208913649,
104
+ "race/accuracy/group_average": 0.30457217096597977,
105
+ "race/accuracy/seq_average": 0.2877989460883664,
106
+ "siqa/accuracy/dev": 0.3546571136131013,
107
+ "siqa/accuracy/group_average": 0.3546571136131013,
108
+ "siqa/accuracy/seq_average": 0.3546571136131013,
109
+ "commonsenseqa/accuracy/dev_rand_split": 0.2457002457002457,
110
+ "commonsenseqa/accuracy/group_average": 0.2457002457002457,
111
+ "commonsenseqa/accuracy/seq_average": 0.2457002457002457
112
+ }
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_33_experts/export/result-model-70000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.650483873155382, "val/accuracy": 0.47308737134176587, "val/perplexity": 14.160889061949224, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6131271457079777, "lambada/accuracy/total": 0.23796583850931677, "lambada/accuracy/openai_last_token": 0.74902950310559, "lambada/perplexity": 13.266013478759401, "lambada/lm_loss": 3.225909439481051, "lambada/lm_perplexity": 25.176460214410444, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.35552660492554133, "mean_loss": 2.63180550943168, "blimp/accuracy/passive_2": 0.917, "blimp/accuracy/determiner_noun_agreement_2": 0.989, "blimp/accuracy/ellipsis_n_bar_1": 0.818, "blimp/accuracy/tough_vs_raising_2": 0.853, "blimp/accuracy/tough_vs_raising_1": 0.607, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.82, "blimp/accuracy/principle_A_reconstruction": 0.268, "blimp/accuracy/wh_vs_that_with_gap": 0.508, "blimp/accuracy/principle_A_domain_2": 0.807, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.894, "blimp/accuracy/principle_A_domain_3": 0.571, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.918, "blimp/accuracy/animate_subject_trans": 0.9, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.888, "blimp/accuracy/distractor_agreement_relative_clause": 0.698, "blimp/accuracy/transitive": 0.86, "blimp/accuracy/sentential_subject_island": 0.312, "blimp/accuracy/adjunct_island": 0.827, "blimp/accuracy/intransitive": 0.78, "blimp/accuracy/existential_there_subject_raising": 0.881, "blimp/accuracy/irregular_past_participle_adjectives": 0.935, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.457, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.29, "blimp/accuracy/only_npi_scope": 0.611, "blimp/accuracy/superlative_quantifiers_2": 0.718, "blimp/accuracy/passive_1": 0.902, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.896, "blimp/accuracy/inchoative": 0.603, "blimp/accuracy/anaphor_gender_agreement": 0.976, "blimp/accuracy/principle_A_c_command": 0.549, "blimp/accuracy/only_npi_licensor_present": 0.368, "blimp/accuracy/expletive_it_object_raising": 0.773, "blimp/accuracy/left_branch_island_simple_question": 0.489, "blimp/accuracy/wh_questions_subject_gap": 0.921, "blimp/accuracy/existential_there_quantifiers_2": 0.544, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.936, "blimp/accuracy/sentential_negation_npi_scope": 0.726, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.803, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.851, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.896, "blimp/accuracy/principle_A_case_2": 0.938, "blimp/accuracy/distractor_agreement_relational_noun": 0.792, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.608, "blimp/accuracy/wh_island": 0.753, "blimp/accuracy/principle_A_domain_1": 0.985, "blimp/accuracy/complex_NP_island": 0.578, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.979, "blimp/accuracy/irregular_past_participle_verbs": 0.892, "blimp/accuracy/drop_argument": 0.762, "blimp/accuracy/wh_questions_object_gap": 0.8, "blimp/accuracy/animate_subject_passive": 0.784, "blimp/accuracy/existential_there_quantifiers_1": 0.993, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.864, "blimp/accuracy/npi_present_2": 0.544, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.926, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.95, "blimp/accuracy/existential_there_object_raising": 0.821, "blimp/accuracy/matrix_question_npi_licensor_present": 0.183, "blimp/accuracy/npi_present_1": 0.511, "blimp/accuracy/wh_vs_that_no_gap": 0.961, "blimp/accuracy/left_branch_island_echo_question": 0.487, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.967, "blimp/accuracy/causative": 0.727, "blimp/accuracy/group_average": 0.7633432835820895, "blimp/accuracy/seq_average": 0.7633432835820896, "cbt/accuracy/NE": 0.7504006410256411, "cbt/accuracy/V": 0.9032, "cbt/accuracy/CN": 0.8168, "cbt/accuracy/P": 0.878, "cbt/accuracy/group_average": 0.8371001602564103, "cbt/accuracy/seq_average": 0.8371348539415766, "hellaswag/accuracy/val": 0.28719378609838675, "hellaswag/accuracy/group_average": 0.28719378609838675, "hellaswag/accuracy/seq_average": 0.28719378609838675, "piqa/accuracy/val": 0.5810663764961915, "piqa/accuracy/group_average": 0.5810663764961915, "piqa/accuracy/seq_average": 0.5810663764961915, "ai2arc/accuracy/ARC-Easy": 0.3302325581395349, "ai2arc/accuracy/ARC-Challenge": 0.2111587982832618, "ai2arc/accuracy/group_average": 0.27069567821139834, "ai2arc/accuracy/seq_average": 0.29093484419263455, "race/accuracy/test/high": 0.259576901086335, "race/accuracy/test/middle": 0.3321727019498607, "race/accuracy/group_average": 0.29587480151809786, "race/accuracy/seq_average": 0.28070531009323063, "siqa/accuracy/dev": 0.35670419651995905, "siqa/accuracy/group_average": 0.35670419651995905, "siqa/accuracy/seq_average": 0.35670419651995905, "commonsenseqa/accuracy/dev_rand_split": 0.25634725634725636, "commonsenseqa/accuracy/group_average": 0.25634725634725636, "commonsenseqa/accuracy/seq_average": 0.25634725634725636}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_33_experts/export/result-model-80000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.6302894713386658, "val/accuracy": 0.4754396468874008, "val/perplexity": 13.877786542198857, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.568061591675563, "lambada/accuracy/total": 0.2453416149068323, "lambada/accuracy/openai_last_token": 0.7560170807453416, "lambada/perplexity": 12.916188960854681, "lambada/lm_loss": 3.1977878966291757, "lambada/lm_perplexity": 24.47832168388822, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.36039063089711654, "mean_loss": 2.5991755315071146, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.987, "blimp/accuracy/ellipsis_n_bar_1": 0.817, "blimp/accuracy/tough_vs_raising_2": 0.868, "blimp/accuracy/tough_vs_raising_1": 0.637, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.92, "blimp/accuracy/principle_A_reconstruction": 0.289, "blimp/accuracy/wh_vs_that_with_gap": 0.481, "blimp/accuracy/principle_A_domain_2": 0.823, "blimp/accuracy/determiner_noun_agreement_1": 0.995, "blimp/accuracy/ellipsis_n_bar_2": 0.895, "blimp/accuracy/principle_A_domain_3": 0.605, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.918, "blimp/accuracy/animate_subject_trans": 0.906, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.888, "blimp/accuracy/distractor_agreement_relative_clause": 0.716, "blimp/accuracy/transitive": 0.866, "blimp/accuracy/sentential_subject_island": 0.31, "blimp/accuracy/adjunct_island": 0.82, "blimp/accuracy/intransitive": 0.785, "blimp/accuracy/existential_there_subject_raising": 0.896, "blimp/accuracy/irregular_past_participle_adjectives": 0.901, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.509, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.271, "blimp/accuracy/only_npi_scope": 0.604, "blimp/accuracy/superlative_quantifiers_2": 0.786, "blimp/accuracy/passive_1": 0.91, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.906, "blimp/accuracy/inchoative": 0.625, "blimp/accuracy/anaphor_gender_agreement": 0.967, "blimp/accuracy/principle_A_c_command": 0.578, "blimp/accuracy/only_npi_licensor_present": 0.325, "blimp/accuracy/expletive_it_object_raising": 0.773, "blimp/accuracy/left_branch_island_simple_question": 0.52, "blimp/accuracy/wh_questions_subject_gap": 0.932, "blimp/accuracy/existential_there_quantifiers_2": 0.443, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.939, "blimp/accuracy/sentential_negation_npi_scope": 0.72, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.819, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.868, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.89, "blimp/accuracy/principle_A_case_2": 0.938, "blimp/accuracy/distractor_agreement_relational_noun": 0.836, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.991, "blimp/accuracy/superlative_quantifiers_1": 0.611, "blimp/accuracy/wh_island": 0.771, "blimp/accuracy/principle_A_domain_1": 0.973, "blimp/accuracy/complex_NP_island": 0.598, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.98, "blimp/accuracy/irregular_past_participle_verbs": 0.849, "blimp/accuracy/drop_argument": 0.763, "blimp/accuracy/wh_questions_object_gap": 0.796, "blimp/accuracy/animate_subject_passive": 0.803, "blimp/accuracy/existential_there_quantifiers_1": 0.988, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.866, "blimp/accuracy/npi_present_2": 0.596, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.945, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.958, "blimp/accuracy/existential_there_object_raising": 0.837, "blimp/accuracy/matrix_question_npi_licensor_present": 0.2, "blimp/accuracy/npi_present_1": 0.54, "blimp/accuracy/wh_vs_that_no_gap": 0.967, "blimp/accuracy/left_branch_island_echo_question": 0.487, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.965, "blimp/accuracy/causative": 0.731, "blimp/accuracy/group_average": 0.7701343283582087, "blimp/accuracy/seq_average": 0.7701343283582089, "cbt/accuracy/NE": 0.7568108974358975, "cbt/accuracy/V": 0.9104, "cbt/accuracy/CN": 0.8212, "cbt/accuracy/P": 0.8848, "cbt/accuracy/group_average": 0.8433027243589744, "cbt/accuracy/seq_average": 0.8433373349339736, "hellaswag/accuracy/val": 0.2905795658235411, "hellaswag/accuracy/group_average": 0.2905795658235411, "hellaswag/accuracy/seq_average": 0.2905795658235411, "piqa/accuracy/val": 0.5865070729053319, "piqa/accuracy/group_average": 0.5865070729053319, "piqa/accuracy/seq_average": 0.5865070729053319, "ai2arc/accuracy/ARC-Easy": 0.3391120507399577, "ai2arc/accuracy/ARC-Challenge": 0.21373390557939914, "ai2arc/accuracy/group_average": 0.2764229781596784, "ai2arc/accuracy/seq_average": 0.29773371104815866, "race/accuracy/test/high": 0.26300743281875355, "race/accuracy/test/middle": 0.3426183844011142, "race/accuracy/group_average": 0.3028129086099339, "race/accuracy/seq_average": 0.28617754357519254, "siqa/accuracy/dev": 0.3587512794268168, "siqa/accuracy/group_average": 0.3587512794268168, "siqa/accuracy/seq_average": 0.3587512794268168, "commonsenseqa/accuracy/dev_rand_split": 0.25225225225225223, "commonsenseqa/accuracy/group_average": 0.25225225225225223, "commonsenseqa/accuracy/seq_average": 0.25225225225225223}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_33_experts/export/result-model-90000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.617249019562252, "val/accuracy": 0.4780321878100198, "val/perplexity": 13.697988805814578, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5965621663916925, "lambada/accuracy/total": 0.24786490683229814, "lambada/accuracy/openai_last_token": 0.7567934782608695, "lambada/perplexity": 12.60696785379666, "lambada/lm_loss": 3.174572361253122, "lambada/lm_perplexity": 23.916590020605387, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.362948547321159, "mean_loss": 2.606905592976972, "blimp/accuracy/passive_2": 0.909, "blimp/accuracy/determiner_noun_agreement_2": 0.994, "blimp/accuracy/ellipsis_n_bar_1": 0.802, "blimp/accuracy/tough_vs_raising_2": 0.869, "blimp/accuracy/tough_vs_raising_1": 0.613, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.917, "blimp/accuracy/principle_A_reconstruction": 0.264, "blimp/accuracy/wh_vs_that_with_gap": 0.492, "blimp/accuracy/principle_A_domain_2": 0.806, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.912, "blimp/accuracy/principle_A_domain_3": 0.584, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.925, "blimp/accuracy/animate_subject_trans": 0.898, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.884, "blimp/accuracy/distractor_agreement_relative_clause": 0.694, "blimp/accuracy/transitive": 0.87, "blimp/accuracy/sentential_subject_island": 0.303, "blimp/accuracy/adjunct_island": 0.827, "blimp/accuracy/intransitive": 0.789, "blimp/accuracy/existential_there_subject_raising": 0.907, "blimp/accuracy/irregular_past_participle_adjectives": 0.896, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.519, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.278, "blimp/accuracy/only_npi_scope": 0.598, "blimp/accuracy/superlative_quantifiers_2": 0.706, "blimp/accuracy/passive_1": 0.895, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.907, "blimp/accuracy/inchoative": 0.644, "blimp/accuracy/anaphor_gender_agreement": 0.974, "blimp/accuracy/principle_A_c_command": 0.583, "blimp/accuracy/only_npi_licensor_present": 0.437, "blimp/accuracy/expletive_it_object_raising": 0.797, "blimp/accuracy/left_branch_island_simple_question": 0.553, "blimp/accuracy/wh_questions_subject_gap": 0.939, "blimp/accuracy/existential_there_quantifiers_2": 0.467, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.942, "blimp/accuracy/sentential_negation_npi_scope": 0.7, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.803, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.878, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.895, "blimp/accuracy/principle_A_case_2": 0.926, "blimp/accuracy/distractor_agreement_relational_noun": 0.839, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.993, "blimp/accuracy/superlative_quantifiers_1": 0.693, "blimp/accuracy/wh_island": 0.778, "blimp/accuracy/principle_A_domain_1": 0.978, "blimp/accuracy/complex_NP_island": 0.588, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.977, "blimp/accuracy/irregular_past_participle_verbs": 0.886, "blimp/accuracy/drop_argument": 0.762, "blimp/accuracy/wh_questions_object_gap": 0.796, "blimp/accuracy/animate_subject_passive": 0.797, "blimp/accuracy/existential_there_quantifiers_1": 0.994, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.869, "blimp/accuracy/npi_present_2": 0.597, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.941, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.954, "blimp/accuracy/existential_there_object_raising": 0.856, "blimp/accuracy/matrix_question_npi_licensor_present": 0.203, "blimp/accuracy/npi_present_1": 0.563, "blimp/accuracy/wh_vs_that_no_gap": 0.966, "blimp/accuracy/left_branch_island_echo_question": 0.467, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.971, "blimp/accuracy/causative": 0.724, "blimp/accuracy/group_average": 0.7726567164179103, "blimp/accuracy/seq_average": 0.7726567164179104, "cbt/accuracy/NE": 0.7548076923076923, "cbt/accuracy/V": 0.91, "cbt/accuracy/CN": 0.82, "cbt/accuracy/P": 0.8868, "cbt/accuracy/group_average": 0.842901923076923, "cbt/accuracy/seq_average": 0.842937174869948, "hellaswag/accuracy/val": 0.29067914758016333, "hellaswag/accuracy/group_average": 0.29067914758016333, "hellaswag/accuracy/seq_average": 0.29067914758016333, "piqa/accuracy/val": 0.5745375408052231, "piqa/accuracy/group_average": 0.5745375408052231, "piqa/accuracy/seq_average": 0.5745375408052231, "ai2arc/accuracy/ARC-Easy": 0.33488372093023255, "ai2arc/accuracy/ARC-Challenge": 0.21802575107296138, "ai2arc/accuracy/group_average": 0.276454736001597, "ai2arc/accuracy/seq_average": 0.2963172804532578, "race/accuracy/test/high": 0.2655803316180675, "race/accuracy/test/middle": 0.33913649025069637, "race/accuracy/group_average": 0.3023584109343819, "race/accuracy/seq_average": 0.2869882448317795, "siqa/accuracy/dev": 0.3607983623336745, "siqa/accuracy/group_average": 0.3607983623336745, "siqa/accuracy/seq_average": 0.3607983623336745, "commonsenseqa/accuracy/dev_rand_split": 0.25061425061425063, "commonsenseqa/accuracy/group_average": 0.25061425061425063, "commonsenseqa/accuracy/seq_average": 0.25061425061425063}