Upload folder using huggingface_hub
#511
by
DavidNguyen
- opened
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-10000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-100000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-20000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-30000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-40000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-50000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-60000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-70000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-80000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-90000.pth.json +1 -0
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-10000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 3.0319296216207836, "val/accuracy": 0.42333984375, "val/perplexity": 20.737208974128396, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.758099289414305, "lambada/accuracy/total": 0.17100155279503104, "lambada/accuracy/openai_last_token": 0.7127329192546584, "lambada/perplexity": 24.99166296951254, "lambada/lm_loss": 3.572635729730636, "lambada/lm_perplexity": 35.61032876873988, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.2971706982725155, "mean_loss": 2.8950144555175443, "blimp/accuracy/passive_2": 0.873, "blimp/accuracy/determiner_noun_agreement_2": 0.952, "blimp/accuracy/ellipsis_n_bar_1": 0.699, "blimp/accuracy/tough_vs_raising_2": 0.794, "blimp/accuracy/tough_vs_raising_1": 0.547, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.842, "blimp/accuracy/principle_A_reconstruction": 0.45, "blimp/accuracy/wh_vs_that_with_gap": 0.442, "blimp/accuracy/principle_A_domain_2": 0.729, "blimp/accuracy/determiner_noun_agreement_1": 0.975, "blimp/accuracy/ellipsis_n_bar_2": 0.859, "blimp/accuracy/principle_A_domain_3": 0.505, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.879, "blimp/accuracy/animate_subject_trans": 0.877, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.753, "blimp/accuracy/distractor_agreement_relative_clause": 0.455, "blimp/accuracy/transitive": 0.819, "blimp/accuracy/sentential_subject_island": 0.337, "blimp/accuracy/adjunct_island": 0.716, "blimp/accuracy/intransitive": 0.702, "blimp/accuracy/existential_there_subject_raising": 0.822, "blimp/accuracy/irregular_past_participle_adjectives": 0.985, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.228, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.157, "blimp/accuracy/only_npi_scope": 0.696, "blimp/accuracy/superlative_quantifiers_2": 0.568, "blimp/accuracy/passive_1": 0.878, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.854, "blimp/accuracy/inchoative": 0.543, "blimp/accuracy/anaphor_gender_agreement": 0.925, "blimp/accuracy/principle_A_c_command": 0.497, "blimp/accuracy/only_npi_licensor_present": 0.336, "blimp/accuracy/expletive_it_object_raising": 0.746, "blimp/accuracy/left_branch_island_simple_question": 0.295, "blimp/accuracy/wh_questions_subject_gap": 0.853, "blimp/accuracy/existential_there_quantifiers_2": 0.364, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.901, "blimp/accuracy/sentential_negation_npi_scope": 0.449, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.794, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.832, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.797, "blimp/accuracy/principle_A_case_2": 0.943, "blimp/accuracy/distractor_agreement_relational_noun": 0.744, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.502, "blimp/accuracy/wh_island": 0.572, "blimp/accuracy/principle_A_domain_1": 0.993, "blimp/accuracy/complex_NP_island": 0.526, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.931, "blimp/accuracy/irregular_past_participle_verbs": 0.772, "blimp/accuracy/drop_argument": 0.752, "blimp/accuracy/wh_questions_object_gap": 0.681, "blimp/accuracy/animate_subject_passive": 0.75, "blimp/accuracy/existential_there_quantifiers_1": 0.962, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.856, "blimp/accuracy/npi_present_2": 0.556, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.789, "blimp/accuracy/anaphor_number_agreement": 0.977, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.915, "blimp/accuracy/existential_there_object_raising": 0.748, "blimp/accuracy/matrix_question_npi_licensor_present": 0.055, "blimp/accuracy/npi_present_1": 0.508, "blimp/accuracy/wh_vs_that_no_gap": 0.948, "blimp/accuracy/left_branch_island_echo_question": 0.344, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.966, "blimp/accuracy/causative": 0.657, "blimp/accuracy/group_average": 0.7040149253731345, "blimp/accuracy/seq_average": 0.7040149253731344, "cbt/accuracy/NE": 0.686698717948718, "cbt/accuracy/V": 0.8608, "cbt/accuracy/CN": 0.7372, "cbt/accuracy/P": 0.8308, "cbt/accuracy/group_average": 0.7788746794871795, "cbt/accuracy/seq_average": 0.7789115646258503, "hellaswag/accuracy/val": 0.26767576180043817, "hellaswag/accuracy/group_average": 0.26767576180043817, "hellaswag/accuracy/seq_average": 0.26767576180043817, "piqa/accuracy/val": 0.5516866158868335, "piqa/accuracy/group_average": 0.5516866158868335, "piqa/accuracy/seq_average": 0.5516866158868335, "ai2arc/accuracy/ARC-Easy": 0.30274841437632133, "ai2arc/accuracy/ARC-Challenge": 0.20686695278969958, "ai2arc/accuracy/group_average": 0.2548076835830104, "ai2arc/accuracy/seq_average": 0.2711048158640227, "race/accuracy/test/high": 0.2524299599771298, "race/accuracy/test/middle": 0.32172701949860727, "race/accuracy/group_average": 0.28707848973786854, "race/accuracy/seq_average": 0.27259829752736114, "siqa/accuracy/dev": 0.3664278403275333, "siqa/accuracy/group_average": 0.3664278403275333, "siqa/accuracy/seq_average": 0.3664278403275333, "commonsenseqa/accuracy/dev_rand_split": 0.2375102375102375, "commonsenseqa/accuracy/group_average": 0.2375102375102375, "commonsenseqa/accuracy/seq_average": 0.2375102375102375}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-100000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6213160923549106, "val/accuracy": 0.47800990513392855, "val/perplexity": 13.75381296691493, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.562402950310559, "lambada/accuracy/total": 0.2511645962732919, "lambada/accuracy/openai_last_token": 0.7610636645962733, "lambada/perplexity": 12.223178490708115, "lambada/lm_loss": 3.1869670229973686, "lambada/lm_perplexity": 24.214872802381873, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3645872507036102, "mean_loss": 2.5918595213327347, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.991, "blimp/accuracy/ellipsis_n_bar_1": 0.798, "blimp/accuracy/tough_vs_raising_2": 0.878, "blimp/accuracy/tough_vs_raising_1": 0.587, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.898, "blimp/accuracy/principle_A_reconstruction": 0.431, "blimp/accuracy/wh_vs_that_with_gap": 0.47, "blimp/accuracy/principle_A_domain_2": 0.765, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.906, "blimp/accuracy/principle_A_domain_3": 0.537, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.94, "blimp/accuracy/animate_subject_trans": 0.909, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.912, "blimp/accuracy/distractor_agreement_relative_clause": 0.606, "blimp/accuracy/transitive": 0.868, "blimp/accuracy/sentential_subject_island": 0.357, "blimp/accuracy/adjunct_island": 0.8, "blimp/accuracy/intransitive": 0.768, "blimp/accuracy/existential_there_subject_raising": 0.893, "blimp/accuracy/irregular_past_participle_adjectives": 0.959, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.434, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.178, "blimp/accuracy/only_npi_scope": 0.63, "blimp/accuracy/superlative_quantifiers_2": 0.869, "blimp/accuracy/passive_1": 0.893, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.899, "blimp/accuracy/inchoative": 0.604, "blimp/accuracy/anaphor_gender_agreement": 0.968, "blimp/accuracy/principle_A_c_command": 0.603, "blimp/accuracy/only_npi_licensor_present": 0.45, "blimp/accuracy/expletive_it_object_raising": 0.794, "blimp/accuracy/left_branch_island_simple_question": 0.502, "blimp/accuracy/wh_questions_subject_gap": 0.92, "blimp/accuracy/existential_there_quantifiers_2": 0.389, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.941, "blimp/accuracy/sentential_negation_npi_scope": 0.648, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.785, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.841, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.874, "blimp/accuracy/principle_A_case_2": 0.947, "blimp/accuracy/distractor_agreement_relational_noun": 0.766, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.971, "blimp/accuracy/superlative_quantifiers_1": 0.533, "blimp/accuracy/wh_island": 0.763, "blimp/accuracy/principle_A_domain_1": 0.978, "blimp/accuracy/complex_NP_island": 0.49, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.979, "blimp/accuracy/irregular_past_participle_verbs": 0.892, "blimp/accuracy/drop_argument": 0.761, "blimp/accuracy/wh_questions_object_gap": 0.801, "blimp/accuracy/animate_subject_passive": 0.787, "blimp/accuracy/existential_there_quantifiers_1": 0.98, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.876, "blimp/accuracy/npi_present_2": 0.601, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.946, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.962, "blimp/accuracy/existential_there_object_raising": 0.838, "blimp/accuracy/matrix_question_npi_licensor_present": 0.19, "blimp/accuracy/npi_present_1": 0.539, "blimp/accuracy/wh_vs_that_no_gap": 0.972, "blimp/accuracy/left_branch_island_echo_question": 0.476, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.975, "blimp/accuracy/causative": 0.722, "blimp/accuracy/group_average": 0.7631940298507462, "blimp/accuracy/seq_average": 0.7631940298507462, "cbt/accuracy/NE": 0.7536057692307693, "cbt/accuracy/V": 0.9092, "cbt/accuracy/CN": 0.8132, "cbt/accuracy/P": 0.8976, "cbt/accuracy/group_average": 0.8434014423076923, "cbt/accuracy/seq_average": 0.84343737494998, "hellaswag/accuracy/val": 0.2921728739294961, "hellaswag/accuracy/group_average": 0.2921728739294961, "hellaswag/accuracy/seq_average": 0.2921728739294961, "piqa/accuracy/val": 0.5745375408052231, "piqa/accuracy/group_average": 0.5745375408052231, "piqa/accuracy/seq_average": 0.5745375408052231, "ai2arc/accuracy/ARC-Easy": 0.32727272727272727, "ai2arc/accuracy/ARC-Challenge": 0.22660944206008585, "ai2arc/accuracy/group_average": 0.27694108466640655, "ai2arc/accuracy/seq_average": 0.29405099150141645, "race/accuracy/test/high": 0.2747284162378502, "race/accuracy/test/middle": 0.33008356545961004, "race/accuracy/group_average": 0.3024059908487301, "race/accuracy/seq_average": 0.2908390758005675, "siqa/accuracy/dev": 0.35823950870010235, "siqa/accuracy/group_average": 0.35823950870010235, "siqa/accuracy/seq_average": 0.35823950870010235, "commonsenseqa/accuracy/dev_rand_split": 0.24651924651924653, "commonsenseqa/accuracy/group_average": 0.24651924651924653, "commonsenseqa/accuracy/seq_average": 0.24651924651924653}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-20000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.893124777173239, "val/accuracy": 0.43974764384920634, "val/perplexity": 18.04962262166305, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6317398355614325, "lambada/accuracy/total": 0.19914596273291926, "lambada/accuracy/openai_last_token": 0.7309782608695652, "lambada/perplexity": 18.64376195348997, "lambada/lm_loss": 3.422492834274795, "lambada/lm_perplexity": 30.645714568126312, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3194468032910628, "mean_loss": 2.7624323063673355, "blimp/accuracy/passive_2": 0.876, "blimp/accuracy/determiner_noun_agreement_2": 0.971, "blimp/accuracy/ellipsis_n_bar_1": 0.749, "blimp/accuracy/tough_vs_raising_2": 0.868, "blimp/accuracy/tough_vs_raising_1": 0.579, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.875, "blimp/accuracy/principle_A_reconstruction": 0.595, "blimp/accuracy/wh_vs_that_with_gap": 0.404, "blimp/accuracy/principle_A_domain_2": 0.753, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.874, "blimp/accuracy/principle_A_domain_3": 0.534, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.901, "blimp/accuracy/animate_subject_trans": 0.882, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.84, "blimp/accuracy/distractor_agreement_relative_clause": 0.497, "blimp/accuracy/transitive": 0.844, "blimp/accuracy/sentential_subject_island": 0.45, "blimp/accuracy/adjunct_island": 0.698, "blimp/accuracy/intransitive": 0.707, "blimp/accuracy/existential_there_subject_raising": 0.846, "blimp/accuracy/irregular_past_participle_adjectives": 0.844, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.316, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.123, "blimp/accuracy/only_npi_scope": 0.617, "blimp/accuracy/superlative_quantifiers_2": 0.665, "blimp/accuracy/passive_1": 0.895, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.871, "blimp/accuracy/inchoative": 0.541, "blimp/accuracy/anaphor_gender_agreement": 0.901, "blimp/accuracy/principle_A_c_command": 0.574, "blimp/accuracy/only_npi_licensor_present": 0.398, "blimp/accuracy/expletive_it_object_raising": 0.749, "blimp/accuracy/left_branch_island_simple_question": 0.381, "blimp/accuracy/wh_questions_subject_gap": 0.889, "blimp/accuracy/existential_there_quantifiers_2": 0.441, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.91, "blimp/accuracy/sentential_negation_npi_scope": 0.535, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.788, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.891, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.847, "blimp/accuracy/principle_A_case_2": 0.936, "blimp/accuracy/distractor_agreement_relational_noun": 0.817, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.542, "blimp/accuracy/wh_island": 0.783, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.481, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.952, "blimp/accuracy/irregular_past_participle_verbs": 0.841, "blimp/accuracy/drop_argument": 0.746, "blimp/accuracy/wh_questions_object_gap": 0.764, "blimp/accuracy/animate_subject_passive": 0.744, "blimp/accuracy/existential_there_quantifiers_1": 0.991, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.862, "blimp/accuracy/npi_present_2": 0.596, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.888, "blimp/accuracy/anaphor_number_agreement": 0.969, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.942, "blimp/accuracy/existential_there_object_raising": 0.771, "blimp/accuracy/matrix_question_npi_licensor_present": 0.072, "blimp/accuracy/npi_present_1": 0.489, "blimp/accuracy/wh_vs_that_no_gap": 0.968, "blimp/accuracy/left_branch_island_echo_question": 0.397, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.981, "blimp/accuracy/causative": 0.671, "blimp/accuracy/group_average": 0.7325970149253731, "blimp/accuracy/seq_average": 0.7325970149253731, "cbt/accuracy/NE": 0.7003205128205128, "cbt/accuracy/V": 0.8792, "cbt/accuracy/CN": 0.7504, "cbt/accuracy/P": 0.8604, "cbt/accuracy/group_average": 0.7975801282051282, "cbt/accuracy/seq_average": 0.7976190476190477, "hellaswag/accuracy/val": 0.2755427205735909, "hellaswag/accuracy/group_average": 0.2755427205735909, "hellaswag/accuracy/seq_average": 0.2755427205735909, "piqa/accuracy/val": 0.559847660500544, "piqa/accuracy/group_average": 0.559847660500544, "piqa/accuracy/seq_average": 0.559847660500544, "ai2arc/accuracy/ARC-Easy": 0.31543340380549684, "ai2arc/accuracy/ARC-Challenge": 0.19914163090128756, "ai2arc/accuracy/group_average": 0.25728751735339217, "ai2arc/accuracy/seq_average": 0.2770538243626062, "race/accuracy/test/high": 0.259576901086335, "race/accuracy/test/middle": 0.3370473537604457, "race/accuracy/group_average": 0.29831212742339036, "race/accuracy/seq_average": 0.2821240372922578, "siqa/accuracy/dev": 0.3572159672466735, "siqa/accuracy/group_average": 0.3572159672466735, "siqa/accuracy/seq_average": 0.3572159672466735, "commonsenseqa/accuracy/dev_rand_split": 0.23996723996723995, "commonsenseqa/accuracy/group_average": 0.23996723996723995, "commonsenseqa/accuracy/seq_average": 0.23996723996723995}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-30000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.8185858348059276, "val/accuracy": 0.45048595610119047, "val/perplexity": 16.753142201651208, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6766850252329193, "lambada/accuracy/total": 0.19157608695652173, "lambada/accuracy/openai_last_token": 0.7327251552795031, "lambada/perplexity": 17.693644782736925, "lambada/lm_loss": 3.3877233527212995, "lambada/lm_perplexity": 29.598490183689172, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3210310215288561, "mean_loss": 2.7476354300194235, "blimp/accuracy/passive_2": 0.886, "blimp/accuracy/determiner_noun_agreement_2": 0.969, "blimp/accuracy/ellipsis_n_bar_1": 0.787, "blimp/accuracy/tough_vs_raising_2": 0.874, "blimp/accuracy/tough_vs_raising_1": 0.571, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.895, "blimp/accuracy/principle_A_reconstruction": 0.491, "blimp/accuracy/wh_vs_that_with_gap": 0.497, "blimp/accuracy/principle_A_domain_2": 0.781, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.892, "blimp/accuracy/principle_A_domain_3": 0.523, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.903, "blimp/accuracy/animate_subject_trans": 0.883, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.821, "blimp/accuracy/distractor_agreement_relative_clause": 0.463, "blimp/accuracy/transitive": 0.835, "blimp/accuracy/sentential_subject_island": 0.45, "blimp/accuracy/adjunct_island": 0.766, "blimp/accuracy/intransitive": 0.755, "blimp/accuracy/existential_there_subject_raising": 0.835, "blimp/accuracy/irregular_past_participle_adjectives": 0.926, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.299, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.176, "blimp/accuracy/only_npi_scope": 0.613, "blimp/accuracy/superlative_quantifiers_2": 0.6, "blimp/accuracy/passive_1": 0.864, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.882, "blimp/accuracy/inchoative": 0.61, "blimp/accuracy/anaphor_gender_agreement": 0.946, "blimp/accuracy/principle_A_c_command": 0.557, "blimp/accuracy/only_npi_licensor_present": 0.218, "blimp/accuracy/expletive_it_object_raising": 0.786, "blimp/accuracy/left_branch_island_simple_question": 0.374, "blimp/accuracy/wh_questions_subject_gap": 0.874, "blimp/accuracy/existential_there_quantifiers_2": 0.428, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.922, "blimp/accuracy/sentential_negation_npi_scope": 0.568, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.806, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.843, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.849, "blimp/accuracy/principle_A_case_2": 0.94, "blimp/accuracy/distractor_agreement_relational_noun": 0.69, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.982, "blimp/accuracy/superlative_quantifiers_1": 0.724, "blimp/accuracy/wh_island": 0.731, "blimp/accuracy/principle_A_domain_1": 0.977, "blimp/accuracy/complex_NP_island": 0.493, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.959, "blimp/accuracy/irregular_past_participle_verbs": 0.833, "blimp/accuracy/drop_argument": 0.758, "blimp/accuracy/wh_questions_object_gap": 0.699, "blimp/accuracy/animate_subject_passive": 0.803, "blimp/accuracy/existential_there_quantifiers_1": 0.977, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.872, "blimp/accuracy/npi_present_2": 0.566, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.872, "blimp/accuracy/anaphor_number_agreement": 0.983, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.945, "blimp/accuracy/existential_there_object_raising": 0.794, "blimp/accuracy/matrix_question_npi_licensor_present": 0.134, "blimp/accuracy/npi_present_1": 0.491, "blimp/accuracy/wh_vs_that_no_gap": 0.948, "blimp/accuracy/left_branch_island_echo_question": 0.404, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.688, "blimp/accuracy/group_average": 0.7348955223880597, "blimp/accuracy/seq_average": 0.7348955223880597, "cbt/accuracy/NE": 0.71875, "cbt/accuracy/V": 0.8864, "cbt/accuracy/CN": 0.7804, "cbt/accuracy/P": 0.8712, "cbt/accuracy/group_average": 0.8141875000000001, "cbt/accuracy/seq_average": 0.8142256902761105, "hellaswag/accuracy/val": 0.27992431786496713, "hellaswag/accuracy/group_average": 0.27992431786496713, "hellaswag/accuracy/seq_average": 0.27992431786496713, "piqa/accuracy/val": 0.5636561479869423, "piqa/accuracy/group_average": 0.5636561479869423, "piqa/accuracy/seq_average": 0.5636561479869423, "ai2arc/accuracy/ARC-Easy": 0.3150105708245243, "ai2arc/accuracy/ARC-Challenge": 0.21030042918454936, "ai2arc/accuracy/group_average": 0.26265550000453686, "ai2arc/accuracy/seq_average": 0.2804532577903683, "race/accuracy/test/high": 0.26014865637507145, "race/accuracy/test/middle": 0.32520891364902504, "race/accuracy/group_average": 0.29267878501204825, "race/accuracy/seq_average": 0.27908390758005674, "siqa/accuracy/dev": 0.35670419651995905, "siqa/accuracy/group_average": 0.35670419651995905, "siqa/accuracy/seq_average": 0.35670419651995905, "commonsenseqa/accuracy/dev_rand_split": 0.24324324324324326, "commonsenseqa/accuracy/group_average": 0.24324324324324326, "commonsenseqa/accuracy/seq_average": 0.24324324324324326}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-40000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.7666374085441467, "val/accuracy": 0.45724632626488093, "val/perplexity": 15.905061764690968, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.617066566988548, "lambada/accuracy/total": 0.1999223602484472, "lambada/accuracy/openai_last_token": 0.7391304347826086, "lambada/perplexity": 17.07057988089369, "lambada/lm_loss": 3.30933356660353, "lambada/lm_perplexity": 27.366881189696937, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3285843432566641, "mean_loss": 2.6918519877663476, "blimp/accuracy/passive_2": 0.881, "blimp/accuracy/determiner_noun_agreement_2": 0.986, "blimp/accuracy/ellipsis_n_bar_1": 0.783, "blimp/accuracy/tough_vs_raising_2": 0.864, "blimp/accuracy/tough_vs_raising_1": 0.579, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.877, "blimp/accuracy/principle_A_reconstruction": 0.389, "blimp/accuracy/wh_vs_that_with_gap": 0.445, "blimp/accuracy/principle_A_domain_2": 0.76, "blimp/accuracy/determiner_noun_agreement_1": 0.987, "blimp/accuracy/ellipsis_n_bar_2": 0.877, "blimp/accuracy/principle_A_domain_3": 0.532, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.924, "blimp/accuracy/animate_subject_trans": 0.911, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.885, "blimp/accuracy/distractor_agreement_relative_clause": 0.564, "blimp/accuracy/transitive": 0.848, "blimp/accuracy/sentential_subject_island": 0.389, "blimp/accuracy/adjunct_island": 0.76, "blimp/accuracy/intransitive": 0.714, "blimp/accuracy/existential_there_subject_raising": 0.856, "blimp/accuracy/irregular_past_participle_adjectives": 0.878, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.349, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.167, "blimp/accuracy/only_npi_scope": 0.551, "blimp/accuracy/superlative_quantifiers_2": 0.715, "blimp/accuracy/passive_1": 0.875, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.856, "blimp/accuracy/inchoative": 0.58, "blimp/accuracy/anaphor_gender_agreement": 0.945, "blimp/accuracy/principle_A_c_command": 0.589, "blimp/accuracy/only_npi_licensor_present": 0.572, "blimp/accuracy/expletive_it_object_raising": 0.771, "blimp/accuracy/left_branch_island_simple_question": 0.405, "blimp/accuracy/wh_questions_subject_gap": 0.914, "blimp/accuracy/existential_there_quantifiers_2": 0.382, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.921, "blimp/accuracy/sentential_negation_npi_scope": 0.603, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.758, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.839, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.861, "blimp/accuracy/principle_A_case_2": 0.941, "blimp/accuracy/distractor_agreement_relational_noun": 0.786, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.972, "blimp/accuracy/superlative_quantifiers_1": 0.685, "blimp/accuracy/wh_island": 0.739, "blimp/accuracy/principle_A_domain_1": 0.992, "blimp/accuracy/complex_NP_island": 0.486, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.974, "blimp/accuracy/irregular_past_participle_verbs": 0.851, "blimp/accuracy/drop_argument": 0.735, "blimp/accuracy/wh_questions_object_gap": 0.745, "blimp/accuracy/animate_subject_passive": 0.762, "blimp/accuracy/existential_there_quantifiers_1": 0.969, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.867, "blimp/accuracy/npi_present_2": 0.545, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.912, "blimp/accuracy/anaphor_number_agreement": 0.985, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.947, "blimp/accuracy/existential_there_object_raising": 0.799, "blimp/accuracy/matrix_question_npi_licensor_present": 0.12, "blimp/accuracy/npi_present_1": 0.476, "blimp/accuracy/wh_vs_that_no_gap": 0.964, "blimp/accuracy/left_branch_island_echo_question": 0.382, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.967, "blimp/accuracy/causative": 0.686, "blimp/accuracy/group_average": 0.7407313432835818, "blimp/accuracy/seq_average": 0.7407313432835821, "cbt/accuracy/NE": 0.7311698717948718, "cbt/accuracy/V": 0.8824, "cbt/accuracy/CN": 0.7792, "cbt/accuracy/P": 0.8732, "cbt/accuracy/group_average": 0.816492467948718, "cbt/accuracy/seq_average": 0.8165266106442577, "hellaswag/accuracy/val": 0.28052180840470026, "hellaswag/accuracy/group_average": 0.28052180840470026, "hellaswag/accuracy/seq_average": 0.28052180840470026, "piqa/accuracy/val": 0.5690968443960827, "piqa/accuracy/group_average": 0.5690968443960827, "piqa/accuracy/seq_average": 0.5690968443960827, "ai2arc/accuracy/ARC-Easy": 0.32389006342494714, "ai2arc/accuracy/ARC-Challenge": 0.22145922746781116, "ai2arc/accuracy/group_average": 0.27267464544637915, "ai2arc/accuracy/seq_average": 0.29008498583569403, "race/accuracy/test/high": 0.26472269868496284, "race/accuracy/test/middle": 0.3203342618384401, "race/accuracy/group_average": 0.2925284802617015, "race/accuracy/seq_average": 0.2809079854073774, "siqa/accuracy/dev": 0.3592630501535312, "siqa/accuracy/group_average": 0.3592630501535312, "siqa/accuracy/seq_average": 0.3592630501535312, "commonsenseqa/accuracy/dev_rand_split": 0.25061425061425063, "commonsenseqa/accuracy/group_average": 0.25061425061425063, "commonsenseqa/accuracy/seq_average": 0.25061425061425063}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-50000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.721283685593378, "val/accuracy": 0.4642973400297619, "val/perplexity": 15.19982151871333, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6417672293526784, "lambada/accuracy/total": 0.22069099378881987, "lambada/accuracy/openai_last_token": 0.7507763975155279, "lambada/perplexity": 14.613887772041215, "lambada/lm_loss": 3.2964931422752644, "lambada/lm_perplexity": 27.01772527501776, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3424941669092909, "mean_loss": 2.681525457473028, "blimp/accuracy/passive_2": 0.897, "blimp/accuracy/determiner_noun_agreement_2": 0.989, "blimp/accuracy/ellipsis_n_bar_1": 0.817, "blimp/accuracy/tough_vs_raising_2": 0.895, "blimp/accuracy/tough_vs_raising_1": 0.534, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.89, "blimp/accuracy/principle_A_reconstruction": 0.444, "blimp/accuracy/wh_vs_that_with_gap": 0.418, "blimp/accuracy/principle_A_domain_2": 0.791, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.873, "blimp/accuracy/principle_A_domain_3": 0.545, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.911, "blimp/accuracy/animate_subject_trans": 0.904, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.876, "blimp/accuracy/distractor_agreement_relative_clause": 0.555, "blimp/accuracy/transitive": 0.856, "blimp/accuracy/sentential_subject_island": 0.388, "blimp/accuracy/adjunct_island": 0.759, "blimp/accuracy/intransitive": 0.731, "blimp/accuracy/existential_there_subject_raising": 0.857, "blimp/accuracy/irregular_past_participle_adjectives": 0.893, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.38, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.138, "blimp/accuracy/only_npi_scope": 0.652, "blimp/accuracy/superlative_quantifiers_2": 0.687, "blimp/accuracy/passive_1": 0.882, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.866, "blimp/accuracy/inchoative": 0.563, "blimp/accuracy/anaphor_gender_agreement": 0.941, "blimp/accuracy/principle_A_c_command": 0.618, "blimp/accuracy/only_npi_licensor_present": 0.505, "blimp/accuracy/expletive_it_object_raising": 0.811, "blimp/accuracy/left_branch_island_simple_question": 0.426, "blimp/accuracy/wh_questions_subject_gap": 0.932, "blimp/accuracy/existential_there_quantifiers_2": 0.409, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.929, "blimp/accuracy/sentential_negation_npi_scope": 0.523, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.793, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.891, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.848, "blimp/accuracy/principle_A_case_2": 0.953, "blimp/accuracy/distractor_agreement_relational_noun": 0.79, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.99, "blimp/accuracy/superlative_quantifiers_1": 0.655, "blimp/accuracy/wh_island": 0.821, "blimp/accuracy/principle_A_domain_1": 0.986, "blimp/accuracy/complex_NP_island": 0.483, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.968, "blimp/accuracy/irregular_past_participle_verbs": 0.866, "blimp/accuracy/drop_argument": 0.72, "blimp/accuracy/wh_questions_object_gap": 0.807, "blimp/accuracy/animate_subject_passive": 0.791, "blimp/accuracy/existential_there_quantifiers_1": 0.978, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.89, "blimp/accuracy/npi_present_2": 0.56, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.914, "blimp/accuracy/anaphor_number_agreement": 0.984, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.95, "blimp/accuracy/existential_there_object_raising": 0.82, "blimp/accuracy/matrix_question_npi_licensor_present": 0.151, "blimp/accuracy/npi_present_1": 0.525, "blimp/accuracy/wh_vs_that_no_gap": 0.974, "blimp/accuracy/left_branch_island_echo_question": 0.393, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.981, "blimp/accuracy/causative": 0.692, "blimp/accuracy/group_average": 0.7499701492537312, "blimp/accuracy/seq_average": 0.7499701492537313, "cbt/accuracy/NE": 0.7407852564102564, "cbt/accuracy/V": 0.9004, "cbt/accuracy/CN": 0.7932, "cbt/accuracy/P": 0.8724, "cbt/accuracy/group_average": 0.8266963141025641, "cbt/accuracy/seq_average": 0.8267306922769108, "hellaswag/accuracy/val": 0.2848038239394543, "hellaswag/accuracy/group_average": 0.2848038239394543, "hellaswag/accuracy/seq_average": 0.2848038239394543, "piqa/accuracy/val": 0.5750816104461371, "piqa/accuracy/group_average": 0.5750816104461371, "piqa/accuracy/seq_average": 0.5750816104461371, "ai2arc/accuracy/ARC-Easy": 0.3221987315010571, "ai2arc/accuracy/ARC-Challenge": 0.22145922746781116, "ai2arc/accuracy/group_average": 0.27182897948443413, "ai2arc/accuracy/seq_average": 0.28895184135977336, "race/accuracy/test/high": 0.2741566609491138, "race/accuracy/test/middle": 0.3245125348189415, "race/accuracy/group_average": 0.2993345978840276, "race/accuracy/seq_average": 0.2888123226591001, "siqa/accuracy/dev": 0.35977482088024565, "siqa/accuracy/group_average": 0.35977482088024565, "siqa/accuracy/seq_average": 0.35977482088024565, "commonsenseqa/accuracy/dev_rand_split": 0.2457002457002457, "commonsenseqa/accuracy/group_average": 0.2457002457002457, "commonsenseqa/accuracy/seq_average": 0.2457002457002457}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-60000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6883423820374506, "val/accuracy": 0.4684816390749008, "val/perplexity": 14.707276657733274, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6278732015479425, "lambada/accuracy/total": 0.22573757763975155, "lambada/accuracy/openai_last_token": 0.7472826086956522, "lambada/perplexity": 14.229284086385562, "lambada/lm_loss": 3.244442634388776, "lambada/lm_perplexity": 25.647411090762592, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3471096083573262, "mean_loss": 2.6581077917926965, "blimp/accuracy/passive_2": 0.886, "blimp/accuracy/determiner_noun_agreement_2": 0.992, "blimp/accuracy/ellipsis_n_bar_1": 0.805, "blimp/accuracy/tough_vs_raising_2": 0.874, "blimp/accuracy/tough_vs_raising_1": 0.546, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.9, "blimp/accuracy/principle_A_reconstruction": 0.359, "blimp/accuracy/wh_vs_that_with_gap": 0.454, "blimp/accuracy/principle_A_domain_2": 0.763, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.885, "blimp/accuracy/principle_A_domain_3": 0.542, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.928, "blimp/accuracy/animate_subject_trans": 0.906, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.905, "blimp/accuracy/distractor_agreement_relative_clause": 0.576, "blimp/accuracy/transitive": 0.853, "blimp/accuracy/sentential_subject_island": 0.407, "blimp/accuracy/adjunct_island": 0.791, "blimp/accuracy/intransitive": 0.751, "blimp/accuracy/existential_there_subject_raising": 0.851, "blimp/accuracy/irregular_past_participle_adjectives": 0.899, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.369, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.179, "blimp/accuracy/only_npi_scope": 0.676, "blimp/accuracy/superlative_quantifiers_2": 0.582, "blimp/accuracy/passive_1": 0.879, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.883, "blimp/accuracy/inchoative": 0.623, "blimp/accuracy/anaphor_gender_agreement": 0.953, "blimp/accuracy/principle_A_c_command": 0.619, "blimp/accuracy/only_npi_licensor_present": 0.483, "blimp/accuracy/expletive_it_object_raising": 0.785, "blimp/accuracy/left_branch_island_simple_question": 0.412, "blimp/accuracy/wh_questions_subject_gap": 0.918, "blimp/accuracy/existential_there_quantifiers_2": 0.36, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.939, "blimp/accuracy/sentential_negation_npi_scope": 0.659, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.743, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.854, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.86, "blimp/accuracy/principle_A_case_2": 0.959, "blimp/accuracy/distractor_agreement_relational_noun": 0.729, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.981, "blimp/accuracy/superlative_quantifiers_1": 0.644, "blimp/accuracy/wh_island": 0.785, "blimp/accuracy/principle_A_domain_1": 0.984, "blimp/accuracy/complex_NP_island": 0.522, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.973, "blimp/accuracy/irregular_past_participle_verbs": 0.871, "blimp/accuracy/drop_argument": 0.761, "blimp/accuracy/wh_questions_object_gap": 0.807, "blimp/accuracy/animate_subject_passive": 0.781, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.894, "blimp/accuracy/npi_present_2": 0.6, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.914, "blimp/accuracy/anaphor_number_agreement": 0.984, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.954, "blimp/accuracy/existential_there_object_raising": 0.833, "blimp/accuracy/matrix_question_npi_licensor_present": 0.172, "blimp/accuracy/npi_present_1": 0.561, "blimp/accuracy/wh_vs_that_no_gap": 0.972, "blimp/accuracy/left_branch_island_echo_question": 0.394, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.971, "blimp/accuracy/causative": 0.713, "blimp/accuracy/group_average": 0.7519999999999999, "blimp/accuracy/seq_average": 0.752, "cbt/accuracy/NE": 0.734375, "cbt/accuracy/V": 0.8988, "cbt/accuracy/CN": 0.7988, "cbt/accuracy/P": 0.8856, "cbt/accuracy/group_average": 0.82939375, "cbt/accuracy/seq_average": 0.8294317727090836, "hellaswag/accuracy/val": 0.2848038239394543, "hellaswag/accuracy/group_average": 0.2848038239394543, "hellaswag/accuracy/seq_average": 0.2848038239394543, "piqa/accuracy/val": 0.5805223068552775, "piqa/accuracy/group_average": 0.5805223068552775, "piqa/accuracy/seq_average": 0.5805223068552775, "ai2arc/accuracy/ARC-Easy": 0.32558139534883723, "ai2arc/accuracy/ARC-Challenge": 0.21373390557939914, "ai2arc/accuracy/group_average": 0.26965765046411816, "ai2arc/accuracy/seq_average": 0.2886685552407932, "race/accuracy/test/high": 0.2695826186392224, "race/accuracy/test/middle": 0.3384401114206128, "race/accuracy/group_average": 0.3040113650299176, "race/accuracy/seq_average": 0.28962302391568706, "siqa/accuracy/dev": 0.3592630501535312, "siqa/accuracy/group_average": 0.3592630501535312, "siqa/accuracy/seq_average": 0.3592630501535312, "commonsenseqa/accuracy/dev_rand_split": 0.24897624897624898, "commonsenseqa/accuracy/group_average": 0.24897624897624898, "commonsenseqa/accuracy/seq_average": 0.24897624897624898}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-70000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6621776762462797, "val/accuracy": 0.47294689360119047, "val/perplexity": 14.327455710936619, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.661405385651203, "lambada/accuracy/total": 0.23563664596273293, "lambada/accuracy/openai_last_token": 0.7564052795031055, "lambada/perplexity": 13.692228892807982, "lambada/lm_loss": 3.231233709498986, "lambada/lm_perplexity": 25.310863970126533, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3542917697819617, "mean_loss": 2.6617915309487414, "blimp/accuracy/passive_2": 0.891, "blimp/accuracy/determiner_noun_agreement_2": 0.99, "blimp/accuracy/ellipsis_n_bar_1": 0.803, "blimp/accuracy/tough_vs_raising_2": 0.869, "blimp/accuracy/tough_vs_raising_1": 0.576, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.863, "blimp/accuracy/principle_A_reconstruction": 0.423, "blimp/accuracy/wh_vs_that_with_gap": 0.447, "blimp/accuracy/principle_A_domain_2": 0.749, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.9, "blimp/accuracy/principle_A_domain_3": 0.55, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.928, "blimp/accuracy/animate_subject_trans": 0.902, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.911, "blimp/accuracy/distractor_agreement_relative_clause": 0.588, "blimp/accuracy/transitive": 0.853, "blimp/accuracy/sentential_subject_island": 0.403, "blimp/accuracy/adjunct_island": 0.783, "blimp/accuracy/intransitive": 0.734, "blimp/accuracy/existential_there_subject_raising": 0.872, "blimp/accuracy/irregular_past_participle_adjectives": 0.885, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.415, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.192, "blimp/accuracy/only_npi_scope": 0.681, "blimp/accuracy/superlative_quantifiers_2": 0.74, "blimp/accuracy/passive_1": 0.874, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.866, "blimp/accuracy/inchoative": 0.581, "blimp/accuracy/anaphor_gender_agreement": 0.972, "blimp/accuracy/principle_A_c_command": 0.616, "blimp/accuracy/only_npi_licensor_present": 0.505, "blimp/accuracy/expletive_it_object_raising": 0.798, "blimp/accuracy/left_branch_island_simple_question": 0.498, "blimp/accuracy/wh_questions_subject_gap": 0.92, "blimp/accuracy/existential_there_quantifiers_2": 0.441, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.929, "blimp/accuracy/sentential_negation_npi_scope": 0.669, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.784, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.853, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.853, "blimp/accuracy/principle_A_case_2": 0.949, "blimp/accuracy/distractor_agreement_relational_noun": 0.744, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.988, "blimp/accuracy/superlative_quantifiers_1": 0.681, "blimp/accuracy/wh_island": 0.8, "blimp/accuracy/principle_A_domain_1": 0.984, "blimp/accuracy/complex_NP_island": 0.542, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.983, "blimp/accuracy/irregular_past_participle_verbs": 0.883, "blimp/accuracy/drop_argument": 0.727, "blimp/accuracy/wh_questions_object_gap": 0.822, "blimp/accuracy/animate_subject_passive": 0.769, "blimp/accuracy/existential_there_quantifiers_1": 0.992, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.868, "blimp/accuracy/npi_present_2": 0.607, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.932, "blimp/accuracy/anaphor_number_agreement": 0.984, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.949, "blimp/accuracy/existential_there_object_raising": 0.82, "blimp/accuracy/matrix_question_npi_licensor_present": 0.139, "blimp/accuracy/npi_present_1": 0.507, "blimp/accuracy/wh_vs_that_no_gap": 0.97, "blimp/accuracy/left_branch_island_echo_question": 0.487, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974, "blimp/accuracy/causative": 0.717, "blimp/accuracy/group_average": 0.7599552238805968, "blimp/accuracy/seq_average": 0.759955223880597, "cbt/accuracy/NE": 0.7467948717948718, "cbt/accuracy/V": 0.9064, "cbt/accuracy/CN": 0.8124, "cbt/accuracy/P": 0.8884, "cbt/accuracy/group_average": 0.838498717948718, "cbt/accuracy/seq_average": 0.8385354141656662, "hellaswag/accuracy/val": 0.28380800637323245, "hellaswag/accuracy/group_average": 0.28380800637323245, "hellaswag/accuracy/seq_average": 0.28380800637323245, "piqa/accuracy/val": 0.5903155603917302, "piqa/accuracy/group_average": 0.5903155603917302, "piqa/accuracy/seq_average": 0.5903155603917302, "ai2arc/accuracy/ARC-Easy": 0.32684989429175476, "ai2arc/accuracy/ARC-Challenge": 0.21545064377682405, "ai2arc/accuracy/group_average": 0.2711502690342894, "ai2arc/accuracy/seq_average": 0.29008498583569403, "race/accuracy/test/high": 0.27501429388221843, "race/accuracy/test/middle": 0.32381615598885793, "race/accuracy/group_average": 0.2994152249355382, "race/accuracy/seq_average": 0.2892176732873936, "siqa/accuracy/dev": 0.36131013306038895, "siqa/accuracy/group_average": 0.36131013306038895, "siqa/accuracy/seq_average": 0.36131013306038895, "commonsenseqa/accuracy/dev_rand_split": 0.24651924651924653, "commonsenseqa/accuracy/group_average": 0.24651924651924653, "commonsenseqa/accuracy/seq_average": 0.24651924651924653}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-80000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6403324187748014, "val/accuracy": 0.47529141865079366, "val/perplexity": 14.017862634038943, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5343993761524652, "lambada/accuracy/total": 0.24340062111801242, "lambada/accuracy/openai_last_token": 0.7600931677018633, "lambada/perplexity": 12.85606549880434, "lambada/lm_loss": 3.202845809098376, "lambada/lm_perplexity": 24.602444528987164, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.35934601988440307, "mean_loss": 2.5873658974636333, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.805, "blimp/accuracy/tough_vs_raising_2": 0.878, "blimp/accuracy/tough_vs_raising_1": 0.571, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.897, "blimp/accuracy/principle_A_reconstruction": 0.409, "blimp/accuracy/wh_vs_that_with_gap": 0.427, "blimp/accuracy/principle_A_domain_2": 0.768, "blimp/accuracy/determiner_noun_agreement_1": 0.987, "blimp/accuracy/ellipsis_n_bar_2": 0.895, "blimp/accuracy/principle_A_domain_3": 0.543, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.938, "blimp/accuracy/animate_subject_trans": 0.906, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.911, "blimp/accuracy/distractor_agreement_relative_clause": 0.615, "blimp/accuracy/transitive": 0.861, "blimp/accuracy/sentential_subject_island": 0.402, "blimp/accuracy/adjunct_island": 0.791, "blimp/accuracy/intransitive": 0.759, "blimp/accuracy/existential_there_subject_raising": 0.874, "blimp/accuracy/irregular_past_participle_adjectives": 0.924, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.409, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.167, "blimp/accuracy/only_npi_scope": 0.669, "blimp/accuracy/superlative_quantifiers_2": 0.727, "blimp/accuracy/passive_1": 0.895, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.881, "blimp/accuracy/inchoative": 0.606, "blimp/accuracy/anaphor_gender_agreement": 0.956, "blimp/accuracy/principle_A_c_command": 0.629, "blimp/accuracy/only_npi_licensor_present": 0.675, "blimp/accuracy/expletive_it_object_raising": 0.783, "blimp/accuracy/left_branch_island_simple_question": 0.499, "blimp/accuracy/wh_questions_subject_gap": 0.925, "blimp/accuracy/existential_there_quantifiers_2": 0.338, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.936, "blimp/accuracy/sentential_negation_npi_scope": 0.632, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.773, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.831, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.871, "blimp/accuracy/principle_A_case_2": 0.949, "blimp/accuracy/distractor_agreement_relational_noun": 0.783, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.98, "blimp/accuracy/superlative_quantifiers_1": 0.641, "blimp/accuracy/wh_island": 0.794, "blimp/accuracy/principle_A_domain_1": 0.972, "blimp/accuracy/complex_NP_island": 0.519, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.985, "blimp/accuracy/irregular_past_participle_verbs": 0.868, "blimp/accuracy/drop_argument": 0.753, "blimp/accuracy/wh_questions_object_gap": 0.809, "blimp/accuracy/animate_subject_passive": 0.779, "blimp/accuracy/existential_there_quantifiers_1": 0.982, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.882, "blimp/accuracy/npi_present_2": 0.596, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.933, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.956, "blimp/accuracy/existential_there_object_raising": 0.81, "blimp/accuracy/matrix_question_npi_licensor_present": 0.211, "blimp/accuracy/npi_present_1": 0.55, "blimp/accuracy/wh_vs_that_no_gap": 0.974, "blimp/accuracy/left_branch_island_echo_question": 0.426, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974, "blimp/accuracy/causative": 0.71, "blimp/accuracy/group_average": 0.7625373134328357, "blimp/accuracy/seq_average": 0.7625373134328358, "cbt/accuracy/NE": 0.750801282051282, "cbt/accuracy/V": 0.9052, "cbt/accuracy/CN": 0.8108, "cbt/accuracy/P": 0.892, "cbt/accuracy/group_average": 0.8397003205128205, "cbt/accuracy/seq_average": 0.8397358943577431, "hellaswag/accuracy/val": 0.2878908583947421, "hellaswag/accuracy/group_average": 0.2878908583947421, "hellaswag/accuracy/seq_average": 0.2878908583947421, "piqa/accuracy/val": 0.5767138193688792, "piqa/accuracy/group_average": 0.5767138193688792, "piqa/accuracy/seq_average": 0.5767138193688792, "ai2arc/accuracy/ARC-Easy": 0.3315010570824524, "ai2arc/accuracy/ARC-Challenge": 0.22317596566523606, "ai2arc/accuracy/group_average": 0.27733851137384424, "ai2arc/accuracy/seq_average": 0.2957507082152975, "race/accuracy/test/high": 0.2747284162378502, "race/accuracy/test/middle": 0.3342618384401114, "race/accuracy/group_average": 0.3044951273389808, "race/accuracy/seq_average": 0.2920551276854479, "siqa/accuracy/dev": 0.3587512794268168, "siqa/accuracy/group_average": 0.3587512794268168, "siqa/accuracy/seq_average": 0.3587512794268168, "commonsenseqa/accuracy/dev_rand_split": 0.24733824733824733, "commonsenseqa/accuracy/group_average": 0.24733824733824733, "commonsenseqa/accuracy/seq_average": 0.24733824733824733}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_17_experts/export/result-model-90000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6296197800409224, "val/accuracy": 0.47628929501488093, "val/perplexity": 13.868495820624531, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6128232967779503, "lambada/accuracy/total": 0.2482531055900621, "lambada/accuracy/openai_last_token": 0.7628105590062112, "lambada/perplexity": 12.495747921842565, "lambada/lm_loss": 3.18274642132788, "lambada/lm_perplexity": 24.112886842256152, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3622712003024715, "mean_loss": 2.6212215384094364, "blimp/accuracy/passive_2": 0.899, "blimp/accuracy/determiner_noun_agreement_2": 0.991, "blimp/accuracy/ellipsis_n_bar_1": 0.794, "blimp/accuracy/tough_vs_raising_2": 0.871, "blimp/accuracy/tough_vs_raising_1": 0.561, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.904, "blimp/accuracy/principle_A_reconstruction": 0.42, "blimp/accuracy/wh_vs_that_with_gap": 0.473, "blimp/accuracy/principle_A_domain_2": 0.794, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.897, "blimp/accuracy/principle_A_domain_3": 0.532, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.932, "blimp/accuracy/animate_subject_trans": 0.901, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.908, "blimp/accuracy/distractor_agreement_relative_clause": 0.598, "blimp/accuracy/transitive": 0.858, "blimp/accuracy/sentential_subject_island": 0.379, "blimp/accuracy/adjunct_island": 0.782, "blimp/accuracy/intransitive": 0.777, "blimp/accuracy/existential_there_subject_raising": 0.89, "blimp/accuracy/irregular_past_participle_adjectives": 0.901, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.486, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.19, "blimp/accuracy/only_npi_scope": 0.65, "blimp/accuracy/superlative_quantifiers_2": 0.576, "blimp/accuracy/passive_1": 0.877, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.871, "blimp/accuracy/inchoative": 0.611, "blimp/accuracy/anaphor_gender_agreement": 0.97, "blimp/accuracy/principle_A_c_command": 0.609, "blimp/accuracy/only_npi_licensor_present": 0.717, "blimp/accuracy/expletive_it_object_raising": 0.793, "blimp/accuracy/left_branch_island_simple_question": 0.528, "blimp/accuracy/wh_questions_subject_gap": 0.919, "blimp/accuracy/existential_there_quantifiers_2": 0.346, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.941, "blimp/accuracy/sentential_negation_npi_scope": 0.635, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.777, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.84, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.872, "blimp/accuracy/principle_A_case_2": 0.948, "blimp/accuracy/distractor_agreement_relational_noun": 0.791, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.978, "blimp/accuracy/superlative_quantifiers_1": 0.634, "blimp/accuracy/wh_island": 0.78, "blimp/accuracy/principle_A_domain_1": 0.984, "blimp/accuracy/complex_NP_island": 0.488, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.973, "blimp/accuracy/irregular_past_participle_verbs": 0.886, "blimp/accuracy/drop_argument": 0.764, "blimp/accuracy/wh_questions_object_gap": 0.793, "blimp/accuracy/animate_subject_passive": 0.789, "blimp/accuracy/existential_there_quantifiers_1": 0.982, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.876, "blimp/accuracy/npi_present_2": 0.608, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.933, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.957, "blimp/accuracy/existential_there_object_raising": 0.826, "blimp/accuracy/matrix_question_npi_licensor_present": 0.218, "blimp/accuracy/npi_present_1": 0.574, "blimp/accuracy/wh_vs_that_no_gap": 0.969, "blimp/accuracy/left_branch_island_echo_question": 0.396, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.967, "blimp/accuracy/causative": 0.723, "blimp/accuracy/group_average": 0.7624477611940298, "blimp/accuracy/seq_average": 0.7624477611940299, "cbt/accuracy/NE": 0.7455929487179487, "cbt/accuracy/V": 0.9072, "cbt/accuracy/CN": 0.814, "cbt/accuracy/P": 0.892, "cbt/accuracy/group_average": 0.8396982371794871, "cbt/accuracy/seq_average": 0.8397358943577431, "hellaswag/accuracy/val": 0.2893845847440749, "hellaswag/accuracy/group_average": 0.2893845847440749, "hellaswag/accuracy/seq_average": 0.2893845847440749, "piqa/accuracy/val": 0.5788900979325353, "piqa/accuracy/group_average": 0.5788900979325353, "piqa/accuracy/seq_average": 0.5788900979325353, "ai2arc/accuracy/ARC-Easy": 0.3289640591966173, "ai2arc/accuracy/ARC-Challenge": 0.21545064377682405, "ai2arc/accuracy/group_average": 0.2722073514867207, "ai2arc/accuracy/seq_average": 0.2915014164305949, "race/accuracy/test/high": 0.2730131503716409, "race/accuracy/test/middle": 0.3293871866295265, "race/accuracy/group_average": 0.30120016850058373, "race/accuracy/seq_average": 0.2894203486015403, "siqa/accuracy/dev": 0.3592630501535312, "siqa/accuracy/group_average": 0.3592630501535312, "siqa/accuracy/seq_average": 0.3592630501535312, "commonsenseqa/accuracy/dev_rand_split": 0.25061425061425063, "commonsenseqa/accuracy/group_average": 0.25061425061425063, "commonsenseqa/accuracy/seq_average": 0.25061425061425063}
|