Upload folder using huggingface_hub
#302
by
DavidNguyen
- opened
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-100000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-120000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-140000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-160000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-180000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-20000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-200000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-220000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-240000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-260000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-280000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-300000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-320000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-340000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-360000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-380000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-40000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-400000.pth.json +121 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-60000.pth.json +1 -0
- Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-80000.pth.json +1 -0
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-100000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.4397750127883184, "val/accuracy": 0.5006801060267857, "val/perplexity": 11.47045974570516, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.352559154818517, "lambada/accuracy/total": 0.31502329192546585, "lambada/accuracy/openai_last_token": 0.7835791925465838, "lambada/perplexity": 8.307423622117142, "lambada/lm_loss": 3.0196044947963703, "lambada/lm_perplexity": 20.483188874160135, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4078516989761258, "mean_loss": 2.3961670838034177, "blimp/accuracy/passive_2": 0.895, "blimp/accuracy/determiner_noun_agreement_2": 0.989, "blimp/accuracy/ellipsis_n_bar_1": 0.855, "blimp/accuracy/tough_vs_raising_2": 0.87, "blimp/accuracy/tough_vs_raising_1": 0.616, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.909, "blimp/accuracy/principle_A_reconstruction": 0.366, "blimp/accuracy/wh_vs_that_with_gap": 0.528, "blimp/accuracy/principle_A_domain_2": 0.877, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.885, "blimp/accuracy/principle_A_domain_3": 0.628, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.908, "blimp/accuracy/animate_subject_trans": 0.903, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.895, "blimp/accuracy/distractor_agreement_relative_clause": 0.697, "blimp/accuracy/transitive": 0.871, "blimp/accuracy/sentential_subject_island": 0.312, "blimp/accuracy/adjunct_island": 0.815, "blimp/accuracy/intransitive": 0.809, "blimp/accuracy/existential_there_subject_raising": 0.873, "blimp/accuracy/irregular_past_participle_adjectives": 0.956, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.588, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.315, "blimp/accuracy/only_npi_scope": 0.654, "blimp/accuracy/superlative_quantifiers_2": 0.787, "blimp/accuracy/passive_1": 0.908, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.912, "blimp/accuracy/inchoative": 0.664, "blimp/accuracy/anaphor_gender_agreement": 0.965, "blimp/accuracy/principle_A_c_command": 0.748, "blimp/accuracy/only_npi_licensor_present": 0.572, "blimp/accuracy/expletive_it_object_raising": 0.789, "blimp/accuracy/left_branch_island_simple_question": 0.7, "blimp/accuracy/wh_questions_subject_gap": 0.918, "blimp/accuracy/existential_there_quantifiers_2": 0.567, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.934, "blimp/accuracy/sentential_negation_npi_scope": 0.642, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.834, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.885, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.905, "blimp/accuracy/principle_A_case_2": 0.934, "blimp/accuracy/distractor_agreement_relational_noun": 0.862, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.983, "blimp/accuracy/superlative_quantifiers_1": 0.746, "blimp/accuracy/wh_island": 0.796, "blimp/accuracy/principle_A_domain_1": 0.984, "blimp/accuracy/complex_NP_island": 0.554, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.957, "blimp/accuracy/irregular_past_participle_verbs": 0.902, "blimp/accuracy/drop_argument": 0.77, "blimp/accuracy/wh_questions_object_gap": 0.835, "blimp/accuracy/animate_subject_passive": 0.795, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.888, "blimp/accuracy/npi_present_2": 0.593, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.957, "blimp/accuracy/anaphor_number_agreement": 0.994, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.963, "blimp/accuracy/existential_there_object_raising": 0.812, "blimp/accuracy/matrix_question_npi_licensor_present": 0.262, "blimp/accuracy/npi_present_1": 0.54, "blimp/accuracy/wh_vs_that_no_gap": 0.963, "blimp/accuracy/left_branch_island_echo_question": 0.45, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.966, "blimp/accuracy/causative": 0.747, "blimp/accuracy/group_average": 0.7907014925373136, "blimp/accuracy/seq_average": 0.7907014925373135, "cbt/accuracy/NE": 0.7844551282051282, "cbt/accuracy/V": 0.9308, "cbt/accuracy/CN": 0.8632, "cbt/accuracy/P": 0.9052, "cbt/accuracy/group_average": 0.870913782051282, "cbt/accuracy/seq_average": 0.8709483793517407, "hellaswag/accuracy/val": 0.3236407090221072, "hellaswag/accuracy/group_average": 0.3236407090221072, "hellaswag/accuracy/seq_average": 0.3236407090221072, "piqa/accuracy/val": 0.6175190424374319, "piqa/accuracy/group_average": 0.6175190424374319, "piqa/accuracy/seq_average": 0.6175190424374319, "ai2arc/accuracy/ARC-Easy": 0.3581395348837209, "ai2arc/accuracy/ARC-Challenge": 0.2223175965665236, "ai2arc/accuracy/group_average": 0.29022856572512223, "ai2arc/accuracy/seq_average": 0.313314447592068, "mmlu/accuracy/MMLU": 0.2662138005005363, "mmlu/accuracy/group_average": 0.2662138005005363, "mmlu/accuracy/seq_average": 0.2662138005005363, "openbookqa/accuracy/test": 0.268, "openbookqa/accuracy/group_average": 0.268, "openbookqa/accuracy/seq_average": 0.268, "race/accuracy/test/high": 0.2830188679245283, "race/accuracy/test/middle": 0.35863509749303624, "race/accuracy/group_average": 0.3208269827087823, "race/accuracy/seq_average": 0.3050263477908391, "siqa/accuracy/dev": 0.3751279426816786, "siqa/accuracy/group_average": 0.3751279426816786, "siqa/accuracy/seq_average": 0.3751279426816786, "winogrande/accuracy/dev": 0.5043409629044988, "winogrande/accuracy/group_average": 0.5043409629044988, "winogrande/accuracy/seq_average": 0.5043409629044988, "commonsenseqa/accuracy/dev_rand_split": 0.2628992628992629, "commonsenseqa/accuracy/group_average": 0.2628992628992629, "commonsenseqa/accuracy/seq_average": 0.2628992628992629}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-120000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.4153575594463046, "val/accuracy": 0.5042288643973214, "val/perplexity": 11.193772081030408, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3773318462490294, "lambada/accuracy/total": 0.34413819875776397, "lambada/accuracy/openai_last_token": 0.7934782608695652, "lambada/perplexity": 7.450283135387957, "lambada/lm_loss": 2.9988371191565237, "lambada/lm_perplexity": 20.062193412560024, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4241835315775427, "mean_loss": 2.396344702847667, "blimp/accuracy/passive_2": 0.91, "blimp/accuracy/determiner_noun_agreement_2": 0.99, "blimp/accuracy/ellipsis_n_bar_1": 0.868, "blimp/accuracy/tough_vs_raising_2": 0.906, "blimp/accuracy/tough_vs_raising_1": 0.623, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.928, "blimp/accuracy/principle_A_reconstruction": 0.463, "blimp/accuracy/wh_vs_that_with_gap": 0.468, "blimp/accuracy/principle_A_domain_2": 0.845, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.924, "blimp/accuracy/principle_A_domain_3": 0.585, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.926, "blimp/accuracy/animate_subject_trans": 0.889, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.875, "blimp/accuracy/distractor_agreement_relative_clause": 0.727, "blimp/accuracy/transitive": 0.876, "blimp/accuracy/sentential_subject_island": 0.32, "blimp/accuracy/adjunct_island": 0.843, "blimp/accuracy/intransitive": 0.793, "blimp/accuracy/existential_there_subject_raising": 0.886, "blimp/accuracy/irregular_past_participle_adjectives": 0.972, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.655, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.242, "blimp/accuracy/only_npi_scope": 0.71, "blimp/accuracy/superlative_quantifiers_2": 0.721, "blimp/accuracy/passive_1": 0.906, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.924, "blimp/accuracy/inchoative": 0.672, "blimp/accuracy/anaphor_gender_agreement": 0.971, "blimp/accuracy/principle_A_c_command": 0.696, "blimp/accuracy/only_npi_licensor_present": 0.608, "blimp/accuracy/expletive_it_object_raising": 0.786, "blimp/accuracy/left_branch_island_simple_question": 0.708, "blimp/accuracy/wh_questions_subject_gap": 0.942, "blimp/accuracy/existential_there_quantifiers_2": 0.594, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.935, "blimp/accuracy/sentential_negation_npi_scope": 0.72, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.82, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.922, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.903, "blimp/accuracy/principle_A_case_2": 0.932, "blimp/accuracy/distractor_agreement_relational_noun": 0.862, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.993, "blimp/accuracy/superlative_quantifiers_1": 0.851, "blimp/accuracy/wh_island": 0.823, "blimp/accuracy/principle_A_domain_1": 0.993, "blimp/accuracy/complex_NP_island": 0.545, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.972, "blimp/accuracy/irregular_past_participle_verbs": 0.901, "blimp/accuracy/drop_argument": 0.774, "blimp/accuracy/wh_questions_object_gap": 0.832, "blimp/accuracy/animate_subject_passive": 0.816, "blimp/accuracy/existential_there_quantifiers_1": 0.981, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.895, "blimp/accuracy/npi_present_2": 0.595, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.945, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.972, "blimp/accuracy/existential_there_object_raising": 0.826, "blimp/accuracy/matrix_question_npi_licensor_present": 0.345, "blimp/accuracy/npi_present_1": 0.531, "blimp/accuracy/wh_vs_that_no_gap": 0.976, "blimp/accuracy/left_branch_island_echo_question": 0.42, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.971, "blimp/accuracy/causative": 0.772, "blimp/accuracy/group_average": 0.7993582089552241, "blimp/accuracy/seq_average": 0.7993582089552239, "cbt/accuracy/NE": 0.7860576923076923, "cbt/accuracy/V": 0.9324, "cbt/accuracy/CN": 0.864, "cbt/accuracy/P": 0.9128, "cbt/accuracy/group_average": 0.873814423076923, "cbt/accuracy/seq_average": 0.8738495398159264, "hellaswag/accuracy/val": 0.3343955387373033, "hellaswag/accuracy/group_average": 0.3343955387373033, "hellaswag/accuracy/seq_average": 0.3343955387373033, "piqa/accuracy/val": 0.6186071817192601, "piqa/accuracy/group_average": 0.6186071817192601, "piqa/accuracy/seq_average": 0.6186071817192601, "ai2arc/accuracy/ARC-Easy": 0.3678646934460888, "ai2arc/accuracy/ARC-Challenge": 0.22489270386266094, "ai2arc/accuracy/group_average": 0.2963786986543749, "ai2arc/accuracy/seq_average": 0.3206798866855524, "mmlu/accuracy/MMLU": 0.26514122273864854, "mmlu/accuracy/group_average": 0.26514122273864854, "mmlu/accuracy/seq_average": 0.26514122273864854, "openbookqa/accuracy/test": 0.274, "openbookqa/accuracy/group_average": 0.274, "openbookqa/accuracy/seq_average": 0.274, "race/accuracy/test/high": 0.2875929102344197, "race/accuracy/test/middle": 0.3593314763231198, "race/accuracy/group_average": 0.32346219327876974, "race/accuracy/seq_average": 0.3084718281313336, "siqa/accuracy/dev": 0.3664278403275333, "siqa/accuracy/group_average": 0.3664278403275333, "siqa/accuracy/seq_average": 0.3664278403275333, "winogrande/accuracy/dev": 0.5011838989739542, "winogrande/accuracy/group_average": 0.5011838989739542, "winogrande/accuracy/seq_average": 0.5011838989739542, "commonsenseqa/accuracy/dev_rand_split": 0.26371826371826373, "commonsenseqa/accuracy/group_average": 0.26371826371826373, "commonsenseqa/accuracy/seq_average": 0.26371826371826373}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-140000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.3941524445064486, "val/accuracy": 0.5073058113219246, "val/perplexity": 10.958905840952601, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3986207950189247, "lambada/accuracy/total": 0.3202639751552795, "lambada/accuracy/openai_last_token": 0.7870729813664596, "lambada/perplexity": 7.92205058333687, "lambada/lm_loss": 2.9916513536561604, "lambada/lm_perplexity": 19.918547914802247, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.41378489323860207, "mean_loss": 2.3963866197626866, "blimp/accuracy/passive_2": 0.916, "blimp/accuracy/determiner_noun_agreement_2": 0.986, "blimp/accuracy/ellipsis_n_bar_1": 0.879, "blimp/accuracy/tough_vs_raising_2": 0.913, "blimp/accuracy/tough_vs_raising_1": 0.581, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.915, "blimp/accuracy/principle_A_reconstruction": 0.451, "blimp/accuracy/wh_vs_that_with_gap": 0.502, "blimp/accuracy/principle_A_domain_2": 0.889, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.908, "blimp/accuracy/principle_A_domain_3": 0.643, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.919, "blimp/accuracy/animate_subject_trans": 0.913, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.881, "blimp/accuracy/distractor_agreement_relative_clause": 0.679, "blimp/accuracy/transitive": 0.881, "blimp/accuracy/sentential_subject_island": 0.346, "blimp/accuracy/adjunct_island": 0.867, "blimp/accuracy/intransitive": 0.764, "blimp/accuracy/existential_there_subject_raising": 0.885, "blimp/accuracy/irregular_past_participle_adjectives": 0.985, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.671, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.302, "blimp/accuracy/only_npi_scope": 0.742, "blimp/accuracy/superlative_quantifiers_2": 0.689, "blimp/accuracy/passive_1": 0.92, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.918, "blimp/accuracy/inchoative": 0.648, "blimp/accuracy/anaphor_gender_agreement": 0.973, "blimp/accuracy/principle_A_c_command": 0.738, "blimp/accuracy/only_npi_licensor_present": 0.667, "blimp/accuracy/expletive_it_object_raising": 0.774, "blimp/accuracy/left_branch_island_simple_question": 0.728, "blimp/accuracy/wh_questions_subject_gap": 0.932, "blimp/accuracy/existential_there_quantifiers_2": 0.494, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.944, "blimp/accuracy/sentential_negation_npi_scope": 0.699, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.809, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.908, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.897, "blimp/accuracy/principle_A_case_2": 0.948, "blimp/accuracy/distractor_agreement_relational_noun": 0.898, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.995, "blimp/accuracy/superlative_quantifiers_1": 0.8, "blimp/accuracy/wh_island": 0.767, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.543, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.961, "blimp/accuracy/irregular_past_participle_verbs": 0.9, "blimp/accuracy/drop_argument": 0.771, "blimp/accuracy/wh_questions_object_gap": 0.861, "blimp/accuracy/animate_subject_passive": 0.796, "blimp/accuracy/existential_there_quantifiers_1": 0.975, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.873, "blimp/accuracy/npi_present_2": 0.558, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.937, "blimp/accuracy/anaphor_number_agreement": 0.99, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.966, "blimp/accuracy/existential_there_object_raising": 0.817, "blimp/accuracy/matrix_question_npi_licensor_present": 0.292, "blimp/accuracy/npi_present_1": 0.53, "blimp/accuracy/wh_vs_that_no_gap": 0.974, "blimp/accuracy/left_branch_island_echo_question": 0.483, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.965, "blimp/accuracy/causative": 0.748, "blimp/accuracy/group_average": 0.7982835820895521, "blimp/accuracy/seq_average": 0.7982835820895522, "cbt/accuracy/NE": 0.8000801282051282, "cbt/accuracy/V": 0.9356, "cbt/accuracy/CN": 0.864, "cbt/accuracy/P": 0.9152, "cbt/accuracy/group_average": 0.878720032051282, "cbt/accuracy/seq_average": 0.8787515006002401, "hellaswag/accuracy/val": 0.3403704441346345, "hellaswag/accuracy/group_average": 0.3403704441346345, "hellaswag/accuracy/seq_average": 0.3403704441346345, "piqa/accuracy/val": 0.6245919477693145, "piqa/accuracy/group_average": 0.6245919477693145, "piqa/accuracy/seq_average": 0.6245919477693145, "ai2arc/accuracy/ARC-Easy": 0.37293868921775897, "ai2arc/accuracy/ARC-Challenge": 0.21802575107296138, "ai2arc/accuracy/group_average": 0.2954822201453602, "ai2arc/accuracy/seq_average": 0.3218130311614731, "mmlu/accuracy/MMLU": 0.2650697175545227, "mmlu/accuracy/group_average": 0.2650697175545227, "mmlu/accuracy/seq_average": 0.2650697175545227, "openbookqa/accuracy/test": 0.298, "openbookqa/accuracy/group_average": 0.298, "openbookqa/accuracy/seq_average": 0.298, "race/accuracy/test/high": 0.2801600914808462, "race/accuracy/test/middle": 0.3593314763231198, "race/accuracy/group_average": 0.319745783901983, "race/accuracy/seq_average": 0.30320226996351846, "siqa/accuracy/dev": 0.3735926305015353, "siqa/accuracy/group_average": 0.3735926305015353, "siqa/accuracy/seq_average": 0.3735926305015353, "winogrande/accuracy/dev": 0.4980268350434096, "winogrande/accuracy/group_average": 0.4980268350434096, "winogrande/accuracy/seq_average": 0.4980268350434096, "commonsenseqa/accuracy/dev_rand_split": 0.26617526617526616, "commonsenseqa/accuracy/group_average": 0.26617526617526616, "commonsenseqa/accuracy/seq_average": 0.26617526617526616}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-160000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.376746525840154, "val/accuracy": 0.5096232096354166, "val/perplexity": 10.769806515153912, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4334676991338315, "lambada/accuracy/total": 0.328027950310559, "lambada/accuracy/openai_last_token": 0.7895962732919255, "lambada/perplexity": 7.566114164127005, "lambada/lm_loss": 2.9740132798897045, "lambada/lm_perplexity": 19.57030330763765, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4188255799729878, "mean_loss": 2.4051071124869927, "blimp/accuracy/passive_2": 0.917, "blimp/accuracy/determiner_noun_agreement_2": 0.983, "blimp/accuracy/ellipsis_n_bar_1": 0.88, "blimp/accuracy/tough_vs_raising_2": 0.914, "blimp/accuracy/tough_vs_raising_1": 0.61, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.893, "blimp/accuracy/principle_A_reconstruction": 0.378, "blimp/accuracy/wh_vs_that_with_gap": 0.512, "blimp/accuracy/principle_A_domain_2": 0.875, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.875, "blimp/accuracy/principle_A_domain_3": 0.607, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.911, "blimp/accuracy/animate_subject_trans": 0.897, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.906, "blimp/accuracy/distractor_agreement_relative_clause": 0.722, "blimp/accuracy/transitive": 0.884, "blimp/accuracy/sentential_subject_island": 0.318, "blimp/accuracy/adjunct_island": 0.882, "blimp/accuracy/intransitive": 0.818, "blimp/accuracy/existential_there_subject_raising": 0.879, "blimp/accuracy/irregular_past_participle_adjectives": 0.954, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.708, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.353, "blimp/accuracy/only_npi_scope": 0.738, "blimp/accuracy/superlative_quantifiers_2": 0.783, "blimp/accuracy/passive_1": 0.91, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.918, "blimp/accuracy/inchoative": 0.682, "blimp/accuracy/anaphor_gender_agreement": 0.972, "blimp/accuracy/principle_A_c_command": 0.696, "blimp/accuracy/only_npi_licensor_present": 0.57, "blimp/accuracy/expletive_it_object_raising": 0.773, "blimp/accuracy/left_branch_island_simple_question": 0.741, "blimp/accuracy/wh_questions_subject_gap": 0.929, "blimp/accuracy/existential_there_quantifiers_2": 0.521, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.944, "blimp/accuracy/sentential_negation_npi_scope": 0.729, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.859, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.89, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.9, "blimp/accuracy/principle_A_case_2": 0.954, "blimp/accuracy/distractor_agreement_relational_noun": 0.903, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.836, "blimp/accuracy/wh_island": 0.821, "blimp/accuracy/principle_A_domain_1": 0.986, "blimp/accuracy/complex_NP_island": 0.542, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.957, "blimp/accuracy/irregular_past_participle_verbs": 0.909, "blimp/accuracy/drop_argument": 0.772, "blimp/accuracy/wh_questions_object_gap": 0.84, "blimp/accuracy/animate_subject_passive": 0.798, "blimp/accuracy/existential_there_quantifiers_1": 0.985, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.898, "blimp/accuracy/npi_present_2": 0.538, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.947, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.964, "blimp/accuracy/existential_there_object_raising": 0.855, "blimp/accuracy/matrix_question_npi_licensor_present": 0.338, "blimp/accuracy/npi_present_1": 0.454, "blimp/accuracy/wh_vs_that_no_gap": 0.975, "blimp/accuracy/left_branch_island_echo_question": 0.429, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.963, "blimp/accuracy/causative": 0.764, "blimp/accuracy/group_average": 0.8006119402985075, "blimp/accuracy/seq_average": 0.8006119402985075, "cbt/accuracy/NE": 0.811698717948718, "cbt/accuracy/V": 0.9316, "cbt/accuracy/CN": 0.8748, "cbt/accuracy/P": 0.914, "cbt/accuracy/group_average": 0.8830246794871796, "cbt/accuracy/seq_average": 0.8830532212885154, "hellaswag/accuracy/val": 0.3428599880501892, "hellaswag/accuracy/group_average": 0.3428599880501892, "hellaswag/accuracy/seq_average": 0.3428599880501892, "piqa/accuracy/val": 0.6289445048966268, "piqa/accuracy/group_average": 0.6289445048966268, "piqa/accuracy/seq_average": 0.6289445048966268, "ai2arc/accuracy/ARC-Easy": 0.3704016913319239, "ai2arc/accuracy/ARC-Challenge": 0.23004291845493563, "ai2arc/accuracy/group_average": 0.30022230489342977, "ai2arc/accuracy/seq_average": 0.3240793201133145, "mmlu/accuracy/MMLU": 0.26864497676081517, "mmlu/accuracy/group_average": 0.26864497676081517, "mmlu/accuracy/seq_average": 0.26864497676081517, "openbookqa/accuracy/test": 0.268, "openbookqa/accuracy/group_average": 0.268, "openbookqa/accuracy/seq_average": 0.268, "race/accuracy/test/high": 0.2830188679245283, "race/accuracy/test/middle": 0.3607242339832869, "race/accuracy/group_average": 0.3218715509539076, "race/accuracy/seq_average": 0.3056343737332793, "siqa/accuracy/dev": 0.3766632548618219, "siqa/accuracy/group_average": 0.3766632548618219, "siqa/accuracy/seq_average": 0.3766632548618219, "winogrande/accuracy/dev": 0.4988161010260458, "winogrande/accuracy/group_average": 0.4988161010260458, "winogrande/accuracy/seq_average": 0.4988161010260458, "commonsenseqa/accuracy/dev_rand_split": 0.26617526617526616, "commonsenseqa/accuracy/group_average": 0.26617526617526616, "commonsenseqa/accuracy/seq_average": 0.26617526617526616}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-180000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.356868198939732, "val/accuracy": 0.5129627046130952, "val/perplexity": 10.557834584235279, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.373153023097826, "lambada/accuracy/total": 0.343944099378882, "lambada/accuracy/openai_last_token": 0.7925077639751553, "lambada/perplexity": 7.23822184526625, "lambada/lm_loss": 2.9621500619155476, "lambada/lm_perplexity": 19.339508226612782, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4284534019959886, "mean_loss": 2.3650106110187794, "blimp/accuracy/passive_2": 0.909, "blimp/accuracy/determiner_noun_agreement_2": 0.979, "blimp/accuracy/ellipsis_n_bar_1": 0.852, "blimp/accuracy/tough_vs_raising_2": 0.904, "blimp/accuracy/tough_vs_raising_1": 0.61, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.92, "blimp/accuracy/principle_A_reconstruction": 0.427, "blimp/accuracy/wh_vs_that_with_gap": 0.485, "blimp/accuracy/principle_A_domain_2": 0.892, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.919, "blimp/accuracy/principle_A_domain_3": 0.619, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.915, "blimp/accuracy/animate_subject_trans": 0.906, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.903, "blimp/accuracy/distractor_agreement_relative_clause": 0.668, "blimp/accuracy/transitive": 0.894, "blimp/accuracy/sentential_subject_island": 0.393, "blimp/accuracy/adjunct_island": 0.886, "blimp/accuracy/intransitive": 0.782, "blimp/accuracy/existential_there_subject_raising": 0.888, "blimp/accuracy/irregular_past_participle_adjectives": 0.984, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.729, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.294, "blimp/accuracy/only_npi_scope": 0.726, "blimp/accuracy/superlative_quantifiers_2": 0.823, "blimp/accuracy/passive_1": 0.906, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.911, "blimp/accuracy/inchoative": 0.63, "blimp/accuracy/anaphor_gender_agreement": 0.972, "blimp/accuracy/principle_A_c_command": 0.718, "blimp/accuracy/only_npi_licensor_present": 0.637, "blimp/accuracy/expletive_it_object_raising": 0.8, "blimp/accuracy/left_branch_island_simple_question": 0.796, "blimp/accuracy/wh_questions_subject_gap": 0.959, "blimp/accuracy/existential_there_quantifiers_2": 0.435, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.938, "blimp/accuracy/sentential_negation_npi_scope": 0.698, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.833, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.927, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.904, "blimp/accuracy/principle_A_case_2": 0.958, "blimp/accuracy/distractor_agreement_relational_noun": 0.856, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.73, "blimp/accuracy/wh_island": 0.774, "blimp/accuracy/principle_A_domain_1": 0.992, "blimp/accuracy/complex_NP_island": 0.588, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.96, "blimp/accuracy/irregular_past_participle_verbs": 0.879, "blimp/accuracy/drop_argument": 0.758, "blimp/accuracy/wh_questions_object_gap": 0.873, "blimp/accuracy/animate_subject_passive": 0.805, "blimp/accuracy/existential_there_quantifiers_1": 0.98, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.88, "blimp/accuracy/npi_present_2": 0.525, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.932, "blimp/accuracy/anaphor_number_agreement": 0.987, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.97, "blimp/accuracy/existential_there_object_raising": 0.844, "blimp/accuracy/matrix_question_npi_licensor_present": 0.404, "blimp/accuracy/npi_present_1": 0.511, "blimp/accuracy/wh_vs_that_no_gap": 0.986, "blimp/accuracy/left_branch_island_echo_question": 0.438, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.979, "blimp/accuracy/causative": 0.757, "blimp/accuracy/group_average": 0.801910447761194, "blimp/accuracy/seq_average": 0.801910447761194, "cbt/accuracy/NE": 0.811698717948718, "cbt/accuracy/V": 0.9372, "cbt/accuracy/CN": 0.8744, "cbt/accuracy/P": 0.9224, "cbt/accuracy/group_average": 0.8864246794871795, "cbt/accuracy/seq_average": 0.8864545818327331, "hellaswag/accuracy/val": 0.34793865763792076, "hellaswag/accuracy/group_average": 0.34793865763792076, "hellaswag/accuracy/seq_average": 0.34793865763792076, "piqa/accuracy/val": 0.6224156692056583, "piqa/accuracy/group_average": 0.6224156692056583, "piqa/accuracy/seq_average": 0.6224156692056583, "ai2arc/accuracy/ARC-Easy": 0.3792811839323467, "ai2arc/accuracy/ARC-Challenge": 0.22918454935622318, "ai2arc/accuracy/group_average": 0.30423286664428495, "ai2arc/accuracy/seq_average": 0.32974504249291786, "mmlu/accuracy/MMLU": 0.2630675723989989, "mmlu/accuracy/group_average": 0.2630675723989989, "mmlu/accuracy/seq_average": 0.2630675723989989, "openbookqa/accuracy/test": 0.268, "openbookqa/accuracy/group_average": 0.268, "openbookqa/accuracy/seq_average": 0.268, "race/accuracy/test/high": 0.2884505431675243, "race/accuracy/test/middle": 0.35584958217270196, "race/accuracy/group_average": 0.32215006267011315, "race/accuracy/seq_average": 0.3080664775030401, "siqa/accuracy/dev": 0.37308085977482086, "siqa/accuracy/group_average": 0.37308085977482086, "siqa/accuracy/seq_average": 0.37308085977482086, "winogrande/accuracy/dev": 0.5043409629044988, "winogrande/accuracy/group_average": 0.5043409629044988, "winogrande/accuracy/seq_average": 0.5043409629044988, "commonsenseqa/accuracy/dev_rand_split": 0.28746928746928746, "commonsenseqa/accuracy/group_average": 0.28746928746928746, "commonsenseqa/accuracy/seq_average": 0.28746928746928746}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-20000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.716859605577257, "val/accuracy": 0.4622667100694444, "val/perplexity": 15.132724822098487, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.525919280437209, "lambada/accuracy/total": 0.2127329192546584, "lambada/accuracy/openai_last_token": 0.7439829192546584, "lambada/perplexity": 14.631784073964514, "lambada/lm_loss": 3.276849451979307, "lambada/lm_perplexity": 26.492176208324377, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3374998146620514, "mean_loss": 2.6213894430072333, "blimp/accuracy/passive_2": 0.889, "blimp/accuracy/determiner_noun_agreement_2": 0.99, "blimp/accuracy/ellipsis_n_bar_1": 0.788, "blimp/accuracy/tough_vs_raising_2": 0.903, "blimp/accuracy/tough_vs_raising_1": 0.612, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.866, "blimp/accuracy/principle_A_reconstruction": 0.351, "blimp/accuracy/wh_vs_that_with_gap": 0.488, "blimp/accuracy/principle_A_domain_2": 0.865, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.894, "blimp/accuracy/principle_A_domain_3": 0.602, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.914, "blimp/accuracy/animate_subject_trans": 0.892, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.88, "blimp/accuracy/distractor_agreement_relative_clause": 0.533, "blimp/accuracy/transitive": 0.853, "blimp/accuracy/sentential_subject_island": 0.336, "blimp/accuracy/adjunct_island": 0.734, "blimp/accuracy/intransitive": 0.804, "blimp/accuracy/existential_there_subject_raising": 0.858, "blimp/accuracy/irregular_past_participle_adjectives": 0.996, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.199, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.243, "blimp/accuracy/only_npi_scope": 0.688, "blimp/accuracy/superlative_quantifiers_2": 0.537, "blimp/accuracy/passive_1": 0.877, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.884, "blimp/accuracy/inchoative": 0.651, "blimp/accuracy/anaphor_gender_agreement": 0.967, "blimp/accuracy/principle_A_c_command": 0.606, "blimp/accuracy/only_npi_licensor_present": 0.545, "blimp/accuracy/expletive_it_object_raising": 0.748, "blimp/accuracy/left_branch_island_simple_question": 0.26, "blimp/accuracy/wh_questions_subject_gap": 0.918, "blimp/accuracy/existential_there_quantifiers_2": 0.326, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.933, "blimp/accuracy/sentential_negation_npi_scope": 0.645, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.799, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.861, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.875, "blimp/accuracy/principle_A_case_2": 0.967, "blimp/accuracy/distractor_agreement_relational_noun": 0.821, "blimp/accuracy/sentential_negation_npi_licensor_present": 1.0, "blimp/accuracy/superlative_quantifiers_1": 0.616, "blimp/accuracy/wh_island": 0.793, "blimp/accuracy/principle_A_domain_1": 0.974, "blimp/accuracy/complex_NP_island": 0.492, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.959, "blimp/accuracy/irregular_past_participle_verbs": 0.869, "blimp/accuracy/drop_argument": 0.786, "blimp/accuracy/wh_questions_object_gap": 0.766, "blimp/accuracy/animate_subject_passive": 0.81, "blimp/accuracy/existential_there_quantifiers_1": 0.967, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.866, "blimp/accuracy/npi_present_2": 0.635, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.925, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.951, "blimp/accuracy/existential_there_object_raising": 0.778, "blimp/accuracy/matrix_question_npi_licensor_present": 0.164, "blimp/accuracy/npi_present_1": 0.538, "blimp/accuracy/wh_vs_that_no_gap": 0.965, "blimp/accuracy/left_branch_island_echo_question": 0.324, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.971, "blimp/accuracy/causative": 0.708, "blimp/accuracy/group_average": 0.749746268656716, "blimp/accuracy/seq_average": 0.7497462686567165, "cbt/accuracy/NE": 0.7411858974358975, "cbt/accuracy/V": 0.8984, "cbt/accuracy/CN": 0.8128, "cbt/accuracy/P": 0.8736, "cbt/accuracy/group_average": 0.8314964743589744, "cbt/accuracy/seq_average": 0.8315326130452181, "hellaswag/accuracy/val": 0.2892850029874527, "hellaswag/accuracy/group_average": 0.2892850029874527, "hellaswag/accuracy/seq_average": 0.2892850029874527, "piqa/accuracy/val": 0.5718171926006529, "piqa/accuracy/group_average": 0.5718171926006529, "piqa/accuracy/seq_average": 0.5718171926006529, "ai2arc/accuracy/ARC-Easy": 0.32515856236786467, "ai2arc/accuracy/ARC-Challenge": 0.21030042918454936, "ai2arc/accuracy/group_average": 0.267729495776207, "ai2arc/accuracy/seq_average": 0.28725212464589234, "mmlu/accuracy/MMLU": 0.2624240257418663, "mmlu/accuracy/group_average": 0.2624240257418663, "mmlu/accuracy/seq_average": 0.2624240257418663, "openbookqa/accuracy/test": 0.292, "openbookqa/accuracy/group_average": 0.292, "openbookqa/accuracy/seq_average": 0.292, "race/accuracy/test/high": 0.2664379645511721, "race/accuracy/test/middle": 0.33913649025069637, "race/accuracy/group_average": 0.3027872274009342, "race/accuracy/seq_average": 0.2875962707742197, "siqa/accuracy/dev": 0.36438075742067555, "siqa/accuracy/group_average": 0.36438075742067555, "siqa/accuracy/seq_average": 0.36438075742067555, "winogrande/accuracy/dev": 0.4972375690607735, "winogrande/accuracy/group_average": 0.4972375690607735, "winogrande/accuracy/seq_average": 0.4972375690607735, "commonsenseqa/accuracy/dev_rand_split": 0.2538902538902539, "commonsenseqa/accuracy/group_average": 0.2538902538902539, "commonsenseqa/accuracy/seq_average": 0.2538902538902539}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-200000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.3412674192398315, "val/accuracy": 0.5147879464285714, "val/perplexity": 10.394402283165988, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3932182359399263, "lambada/accuracy/total": 0.3392857142857143, "lambada/accuracy/openai_last_token": 0.7921195652173914, "lambada/perplexity": 7.456193987610843, "lambada/lm_loss": 2.953567791762613, "lambada/lm_perplexity": 19.17424153846645, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.42703683035714285, "mean_loss": 2.367242827589879, "blimp/accuracy/passive_2": 0.925, "blimp/accuracy/determiner_noun_agreement_2": 0.99, "blimp/accuracy/ellipsis_n_bar_1": 0.845, "blimp/accuracy/tough_vs_raising_2": 0.89, "blimp/accuracy/tough_vs_raising_1": 0.609, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.93, "blimp/accuracy/principle_A_reconstruction": 0.371, "blimp/accuracy/wh_vs_that_with_gap": 0.491, "blimp/accuracy/principle_A_domain_2": 0.844, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.908, "blimp/accuracy/principle_A_domain_3": 0.625, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.92, "blimp/accuracy/animate_subject_trans": 0.9, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.896, "blimp/accuracy/distractor_agreement_relative_clause": 0.728, "blimp/accuracy/transitive": 0.885, "blimp/accuracy/sentential_subject_island": 0.355, "blimp/accuracy/adjunct_island": 0.905, "blimp/accuracy/intransitive": 0.76, "blimp/accuracy/existential_there_subject_raising": 0.882, "blimp/accuracy/irregular_past_participle_adjectives": 0.956, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.745, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.325, "blimp/accuracy/only_npi_scope": 0.735, "blimp/accuracy/superlative_quantifiers_2": 0.735, "blimp/accuracy/passive_1": 0.913, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.893, "blimp/accuracy/inchoative": 0.633, "blimp/accuracy/anaphor_gender_agreement": 0.975, "blimp/accuracy/principle_A_c_command": 0.763, "blimp/accuracy/only_npi_licensor_present": 0.963, "blimp/accuracy/expletive_it_object_raising": 0.811, "blimp/accuracy/left_branch_island_simple_question": 0.804, "blimp/accuracy/wh_questions_subject_gap": 0.952, "blimp/accuracy/existential_there_quantifiers_2": 0.418, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.954, "blimp/accuracy/sentential_negation_npi_scope": 0.714, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.857, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.906, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.9, "blimp/accuracy/principle_A_case_2": 0.95, "blimp/accuracy/distractor_agreement_relational_noun": 0.888, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.797, "blimp/accuracy/wh_island": 0.776, "blimp/accuracy/principle_A_domain_1": 0.991, "blimp/accuracy/complex_NP_island": 0.639, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.965, "blimp/accuracy/irregular_past_participle_verbs": 0.911, "blimp/accuracy/drop_argument": 0.757, "blimp/accuracy/wh_questions_object_gap": 0.875, "blimp/accuracy/animate_subject_passive": 0.793, "blimp/accuracy/existential_there_quantifiers_1": 0.975, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.874, "blimp/accuracy/npi_present_2": 0.586, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.93, "blimp/accuracy/anaphor_number_agreement": 0.993, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.972, "blimp/accuracy/existential_there_object_raising": 0.847, "blimp/accuracy/matrix_question_npi_licensor_present": 0.362, "blimp/accuracy/npi_present_1": 0.558, "blimp/accuracy/wh_vs_that_no_gap": 0.984, "blimp/accuracy/left_branch_island_echo_question": 0.48, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.97, "blimp/accuracy/causative": 0.777, "blimp/accuracy/group_average": 0.8095671641791046, "blimp/accuracy/seq_average": 0.8095671641791045, "cbt/accuracy/NE": 0.8165064102564102, "cbt/accuracy/V": 0.9304, "cbt/accuracy/CN": 0.8704, "cbt/accuracy/P": 0.9212, "cbt/accuracy/group_average": 0.8846266025641025, "cbt/accuracy/seq_average": 0.8846538615446179, "hellaswag/accuracy/val": 0.355008962358096, "hellaswag/accuracy/group_average": 0.355008962358096, "hellaswag/accuracy/seq_average": 0.355008962358096, "piqa/accuracy/val": 0.6360174102285092, "piqa/accuracy/group_average": 0.6360174102285092, "piqa/accuracy/seq_average": 0.6360174102285092, "ai2arc/accuracy/ARC-Easy": 0.3835095137420719, "ai2arc/accuracy/ARC-Challenge": 0.23004291845493563, "ai2arc/accuracy/group_average": 0.30677621609850375, "ai2arc/accuracy/seq_average": 0.3328611898016997, "mmlu/accuracy/MMLU": 0.2666428316052914, "mmlu/accuracy/group_average": 0.2666428316052914, "mmlu/accuracy/seq_average": 0.2666428316052914, "openbookqa/accuracy/test": 0.268, "openbookqa/accuracy/group_average": 0.268, "openbookqa/accuracy/seq_average": 0.268, "race/accuracy/test/high": 0.28530588907947396, "race/accuracy/test/middle": 0.37186629526462395, "race/accuracy/group_average": 0.3285860921720489, "race/accuracy/seq_average": 0.310498581272801, "siqa/accuracy/dev": 0.38689866939611056, "siqa/accuracy/group_average": 0.38689866939611056, "siqa/accuracy/seq_average": 0.38689866939611056, "winogrande/accuracy/dev": 0.5130228887134964, "winogrande/accuracy/group_average": 0.5130228887134964, "winogrande/accuracy/seq_average": 0.5130228887134964, "commonsenseqa/accuracy/dev_rand_split": 0.2809172809172809, "commonsenseqa/accuracy/group_average": 0.2809172809172809, "commonsenseqa/accuracy/seq_average": 0.2809172809172809}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-220000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.3262520441933283, "val/accuracy": 0.5172593858506944, "val/perplexity": 10.239492360302666, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.42662361097632, "lambada/accuracy/total": 0.36024844720496896, "lambada/accuracy/openai_last_token": 0.8014363354037267, "lambada/perplexity": 7.0394231933883065, "lambada/lm_loss": 2.92813082095197, "lambada/lm_perplexity": 18.69265789733661, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4387539165278317, "mean_loss": 2.376437827584824, "blimp/accuracy/passive_2": 0.925, "blimp/accuracy/determiner_noun_agreement_2": 0.987, "blimp/accuracy/ellipsis_n_bar_1": 0.895, "blimp/accuracy/tough_vs_raising_2": 0.888, "blimp/accuracy/tough_vs_raising_1": 0.616, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.904, "blimp/accuracy/principle_A_reconstruction": 0.457, "blimp/accuracy/wh_vs_that_with_gap": 0.487, "blimp/accuracy/principle_A_domain_2": 0.864, "blimp/accuracy/determiner_noun_agreement_1": 0.995, "blimp/accuracy/ellipsis_n_bar_2": 0.915, "blimp/accuracy/principle_A_domain_3": 0.602, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.909, "blimp/accuracy/animate_subject_trans": 0.912, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.874, "blimp/accuracy/distractor_agreement_relative_clause": 0.644, "blimp/accuracy/transitive": 0.892, "blimp/accuracy/sentential_subject_island": 0.379, "blimp/accuracy/adjunct_island": 0.868, "blimp/accuracy/intransitive": 0.769, "blimp/accuracy/existential_there_subject_raising": 0.887, "blimp/accuracy/irregular_past_participle_adjectives": 0.984, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.681, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.321, "blimp/accuracy/only_npi_scope": 0.761, "blimp/accuracy/superlative_quantifiers_2": 0.821, "blimp/accuracy/passive_1": 0.914, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.91, "blimp/accuracy/inchoative": 0.648, "blimp/accuracy/anaphor_gender_agreement": 0.969, "blimp/accuracy/principle_A_c_command": 0.695, "blimp/accuracy/only_npi_licensor_present": 0.825, "blimp/accuracy/expletive_it_object_raising": 0.787, "blimp/accuracy/left_branch_island_simple_question": 0.753, "blimp/accuracy/wh_questions_subject_gap": 0.943, "blimp/accuracy/existential_there_quantifiers_2": 0.464, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.945, "blimp/accuracy/sentential_negation_npi_scope": 0.756, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.847, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.902, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.894, "blimp/accuracy/principle_A_case_2": 0.945, "blimp/accuracy/distractor_agreement_relational_noun": 0.862, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.783, "blimp/accuracy/wh_island": 0.783, "blimp/accuracy/principle_A_domain_1": 0.983, "blimp/accuracy/complex_NP_island": 0.569, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.96, "blimp/accuracy/irregular_past_participle_verbs": 0.901, "blimp/accuracy/drop_argument": 0.744, "blimp/accuracy/wh_questions_object_gap": 0.861, "blimp/accuracy/animate_subject_passive": 0.801, "blimp/accuracy/existential_there_quantifiers_1": 0.982, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.874, "blimp/accuracy/npi_present_2": 0.594, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.915, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.967, "blimp/accuracy/existential_there_object_raising": 0.829, "blimp/accuracy/matrix_question_npi_licensor_present": 0.351, "blimp/accuracy/npi_present_1": 0.579, "blimp/accuracy/wh_vs_that_no_gap": 0.977, "blimp/accuracy/left_branch_island_echo_question": 0.434, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.967, "blimp/accuracy/causative": 0.77, "blimp/accuracy/group_average": 0.8044477611940299, "blimp/accuracy/seq_average": 0.8044477611940298, "cbt/accuracy/NE": 0.8108974358974359, "cbt/accuracy/V": 0.9372, "cbt/accuracy/CN": 0.8776, "cbt/accuracy/P": 0.922, "cbt/accuracy/group_average": 0.8869243589743591, "cbt/accuracy/seq_average": 0.8869547819127651, "hellaswag/accuracy/val": 0.3529177454690301, "hellaswag/accuracy/group_average": 0.3529177454690301, "hellaswag/accuracy/seq_average": 0.3529177454690301, "piqa/accuracy/val": 0.6371055495103374, "piqa/accuracy/group_average": 0.6371055495103374, "piqa/accuracy/seq_average": 0.6371055495103374, "ai2arc/accuracy/ARC-Easy": 0.3780126849894292, "ai2arc/accuracy/ARC-Challenge": 0.23004291845493563, "ai2arc/accuracy/group_average": 0.3040278017221824, "ai2arc/accuracy/seq_average": 0.3291784702549575, "mmlu/accuracy/MMLU": 0.2685734715766893, "mmlu/accuracy/group_average": 0.2685734715766893, "mmlu/accuracy/seq_average": 0.2685734715766893, "openbookqa/accuracy/test": 0.252, "openbookqa/accuracy/group_average": 0.252, "openbookqa/accuracy/seq_average": 0.252, "race/accuracy/test/high": 0.2887364208118925, "race/accuracy/test/middle": 0.36629526462395545, "race/accuracy/group_average": 0.327515842717924, "race/accuracy/seq_average": 0.31130928252938794, "siqa/accuracy/dev": 0.36745138178096215, "siqa/accuracy/group_average": 0.36745138178096215, "siqa/accuracy/seq_average": 0.36745138178096215, "winogrande/accuracy/dev": 0.505130228887135, "winogrande/accuracy/group_average": 0.505130228887135, "winogrande/accuracy/seq_average": 0.505130228887135, "commonsenseqa/accuracy/dev_rand_split": 0.27764127764127766, "commonsenseqa/accuracy/group_average": 0.27764127764127766, "commonsenseqa/accuracy/seq_average": 0.27764127764127766}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-240000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.3131181020585316, "val/accuracy": 0.5187552315848214, "val/perplexity": 10.105886764812553, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3947740637737773, "lambada/accuracy/total": 0.36044254658385094, "lambada/accuracy/openai_last_token": 0.8020186335403726, "lambada/perplexity": 6.656832262724534, "lambada/lm_loss": 2.907928123487469, "lambada/lm_perplexity": 18.318804919885405, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4395988890843362, "mean_loss": 2.3539460829161545, "blimp/accuracy/passive_2": 0.915, "blimp/accuracy/determiner_noun_agreement_2": 0.976, "blimp/accuracy/ellipsis_n_bar_1": 0.877, "blimp/accuracy/tough_vs_raising_2": 0.911, "blimp/accuracy/tough_vs_raising_1": 0.579, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.901, "blimp/accuracy/principle_A_reconstruction": 0.409, "blimp/accuracy/wh_vs_that_with_gap": 0.478, "blimp/accuracy/principle_A_domain_2": 0.884, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.914, "blimp/accuracy/principle_A_domain_3": 0.602, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.906, "blimp/accuracy/animate_subject_trans": 0.9, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.882, "blimp/accuracy/distractor_agreement_relative_clause": 0.727, "blimp/accuracy/transitive": 0.903, "blimp/accuracy/sentential_subject_island": 0.348, "blimp/accuracy/adjunct_island": 0.894, "blimp/accuracy/intransitive": 0.783, "blimp/accuracy/existential_there_subject_raising": 0.885, "blimp/accuracy/irregular_past_participle_adjectives": 0.957, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.606, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.331, "blimp/accuracy/only_npi_scope": 0.683, "blimp/accuracy/superlative_quantifiers_2": 0.821, "blimp/accuracy/passive_1": 0.919, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.904, "blimp/accuracy/inchoative": 0.641, "blimp/accuracy/anaphor_gender_agreement": 0.965, "blimp/accuracy/principle_A_c_command": 0.749, "blimp/accuracy/only_npi_licensor_present": 0.745, "blimp/accuracy/expletive_it_object_raising": 0.787, "blimp/accuracy/left_branch_island_simple_question": 0.678, "blimp/accuracy/wh_questions_subject_gap": 0.93, "blimp/accuracy/existential_there_quantifiers_2": 0.49, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.94, "blimp/accuracy/sentential_negation_npi_scope": 0.748, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.826, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.887, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.897, "blimp/accuracy/principle_A_case_2": 0.957, "blimp/accuracy/distractor_agreement_relational_noun": 0.854, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.992, "blimp/accuracy/superlative_quantifiers_1": 0.883, "blimp/accuracy/wh_island": 0.809, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.62, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.964, "blimp/accuracy/irregular_past_participle_verbs": 0.919, "blimp/accuracy/drop_argument": 0.763, "blimp/accuracy/wh_questions_object_gap": 0.848, "blimp/accuracy/animate_subject_passive": 0.795, "blimp/accuracy/existential_there_quantifiers_1": 0.982, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.888, "blimp/accuracy/npi_present_2": 0.564, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.927, "blimp/accuracy/anaphor_number_agreement": 0.986, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.964, "blimp/accuracy/existential_there_object_raising": 0.814, "blimp/accuracy/matrix_question_npi_licensor_present": 0.371, "blimp/accuracy/npi_present_1": 0.519, "blimp/accuracy/wh_vs_that_no_gap": 0.978, "blimp/accuracy/left_branch_island_echo_question": 0.465, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.964, "blimp/accuracy/causative": 0.773, "blimp/accuracy/group_average": 0.8023134328358207, "blimp/accuracy/seq_average": 0.8023134328358209, "cbt/accuracy/NE": 0.8197115384615384, "cbt/accuracy/V": 0.9388, "cbt/accuracy/CN": 0.8776, "cbt/accuracy/P": 0.926, "cbt/accuracy/group_average": 0.8905278846153847, "cbt/accuracy/seq_average": 0.8905562224889956, "hellaswag/accuracy/val": 0.3567018522206732, "hellaswag/accuracy/group_average": 0.3567018522206732, "hellaswag/accuracy/seq_average": 0.3567018522206732, "piqa/accuracy/val": 0.6381936887921654, "piqa/accuracy/group_average": 0.6381936887921654, "piqa/accuracy/seq_average": 0.6381936887921654, "ai2arc/accuracy/ARC-Easy": 0.3856236786469345, "ai2arc/accuracy/ARC-Challenge": 0.22317596566523606, "ai2arc/accuracy/group_average": 0.3043998221560853, "ai2arc/accuracy/seq_average": 0.3320113314447592, "mmlu/accuracy/MMLU": 0.2646406864497676, "mmlu/accuracy/group_average": 0.2646406864497676, "mmlu/accuracy/seq_average": 0.2646406864497676, "openbookqa/accuracy/test": 0.272, "openbookqa/accuracy/group_average": 0.272, "openbookqa/accuracy/seq_average": 0.272, "race/accuracy/test/high": 0.29445397369925674, "race/accuracy/test/middle": 0.36559888579387184, "race/accuracy/group_average": 0.33002642974656426, "race/accuracy/seq_average": 0.3151601134981759, "siqa/accuracy/dev": 0.3725690890481064, "siqa/accuracy/group_average": 0.3725690890481064, "siqa/accuracy/seq_average": 0.3725690890481064, "winogrande/accuracy/dev": 0.5059194948697711, "winogrande/accuracy/group_average": 0.5059194948697711, "winogrande/accuracy/seq_average": 0.5059194948697711, "commonsenseqa/accuracy/dev_rand_split": 0.2710892710892711, "commonsenseqa/accuracy/group_average": 0.2710892710892711, "commonsenseqa/accuracy/seq_average": 0.2710892710892711}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-260000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.302085513160342, "val/accuracy": 0.5205891927083334, "val/perplexity": 9.995005449355233, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3810106360394023, "lambada/accuracy/total": 0.36471273291925466, "lambada/accuracy/openai_last_token": 0.797166149068323, "lambada/perplexity": 6.808884262406364, "lambada/lm_loss": 2.886261086654528, "lambada/lm_perplexity": 17.926159788045368, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.442650962813794, "mean_loss": 2.3415480745998725, "blimp/accuracy/passive_2": 0.927, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.861, "blimp/accuracy/tough_vs_raising_2": 0.899, "blimp/accuracy/tough_vs_raising_1": 0.605, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.928, "blimp/accuracy/principle_A_reconstruction": 0.434, "blimp/accuracy/wh_vs_that_with_gap": 0.469, "blimp/accuracy/principle_A_domain_2": 0.883, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.916, "blimp/accuracy/principle_A_domain_3": 0.637, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.931, "blimp/accuracy/animate_subject_trans": 0.904, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.908, "blimp/accuracy/distractor_agreement_relative_clause": 0.698, "blimp/accuracy/transitive": 0.894, "blimp/accuracy/sentential_subject_island": 0.31, "blimp/accuracy/adjunct_island": 0.884, "blimp/accuracy/intransitive": 0.797, "blimp/accuracy/existential_there_subject_raising": 0.889, "blimp/accuracy/irregular_past_participle_adjectives": 0.88, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.654, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.358, "blimp/accuracy/only_npi_scope": 0.745, "blimp/accuracy/superlative_quantifiers_2": 0.777, "blimp/accuracy/passive_1": 0.931, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.925, "blimp/accuracy/inchoative": 0.653, "blimp/accuracy/anaphor_gender_agreement": 0.977, "blimp/accuracy/principle_A_c_command": 0.755, "blimp/accuracy/only_npi_licensor_present": 0.67, "blimp/accuracy/expletive_it_object_raising": 0.8, "blimp/accuracy/left_branch_island_simple_question": 0.746, "blimp/accuracy/wh_questions_subject_gap": 0.946, "blimp/accuracy/existential_there_quantifiers_2": 0.471, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.948, "blimp/accuracy/sentential_negation_npi_scope": 0.745, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.821, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.897, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.907, "blimp/accuracy/principle_A_case_2": 0.94, "blimp/accuracy/distractor_agreement_relational_noun": 0.882, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.982, "blimp/accuracy/superlative_quantifiers_1": 0.828, "blimp/accuracy/wh_island": 0.81, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.586, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.975, "blimp/accuracy/irregular_past_participle_verbs": 0.888, "blimp/accuracy/drop_argument": 0.777, "blimp/accuracy/wh_questions_object_gap": 0.849, "blimp/accuracy/animate_subject_passive": 0.802, "blimp/accuracy/existential_there_quantifiers_1": 0.977, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.898, "blimp/accuracy/npi_present_2": 0.617, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.956, "blimp/accuracy/anaphor_number_agreement": 0.987, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.977, "blimp/accuracy/existential_there_object_raising": 0.835, "blimp/accuracy/matrix_question_npi_licensor_present": 0.39, "blimp/accuracy/npi_present_1": 0.575, "blimp/accuracy/wh_vs_that_no_gap": 0.986, "blimp/accuracy/left_branch_island_echo_question": 0.387, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.977, "blimp/accuracy/causative": 0.766, "blimp/accuracy/group_average": 0.8058656716417911, "blimp/accuracy/seq_average": 0.805865671641791, "cbt/accuracy/NE": 0.8181089743589743, "cbt/accuracy/V": 0.938, "cbt/accuracy/CN": 0.8868, "cbt/accuracy/P": 0.9272, "cbt/accuracy/group_average": 0.8925272435897436, "cbt/accuracy/seq_average": 0.8925570228091236, "hellaswag/accuracy/val": 0.35919139613622786, "hellaswag/accuracy/group_average": 0.35919139613622786, "hellaswag/accuracy/seq_average": 0.35919139613622786, "piqa/accuracy/val": 0.6420021762785637, "piqa/accuracy/group_average": 0.6420021762785637, "piqa/accuracy/seq_average": 0.6420021762785637, "ai2arc/accuracy/ARC-Easy": 0.3771670190274841, "ai2arc/accuracy/ARC-Challenge": 0.2240343347639485, "ai2arc/accuracy/group_average": 0.3006006768957163, "ai2arc/accuracy/seq_average": 0.32662889518413596, "mmlu/accuracy/MMLU": 0.26950303897032535, "mmlu/accuracy/group_average": 0.26950303897032535, "mmlu/accuracy/seq_average": 0.26950303897032535, "openbookqa/accuracy/test": 0.28, "openbookqa/accuracy/group_average": 0.28, "openbookqa/accuracy/seq_average": 0.28, "race/accuracy/test/high": 0.2904516866781018, "race/accuracy/test/middle": 0.36908077994428967, "race/accuracy/group_average": 0.3297662333111957, "race/accuracy/seq_average": 0.3133360356708553, "siqa/accuracy/dev": 0.38331627430910953, "siqa/accuracy/group_average": 0.38331627430910953, "siqa/accuracy/seq_average": 0.38331627430910953, "winogrande/accuracy/dev": 0.5043409629044988, "winogrande/accuracy/group_average": 0.5043409629044988, "winogrande/accuracy/seq_average": 0.5043409629044988, "commonsenseqa/accuracy/dev_rand_split": 0.27436527436527436, "commonsenseqa/accuracy/group_average": 0.27436527436527436, "commonsenseqa/accuracy/seq_average": 0.27436527436527436}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-280000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.2905665806361606, "val/accuracy": 0.5229695638020834, "val/perplexity": 9.880534214940745, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3837086932259317, "lambada/accuracy/total": 0.37441770186335405, "lambada/accuracy/openai_last_token": 0.8008540372670807, "lambada/perplexity": 6.55538735289259, "lambada/lm_loss": 2.8807381883783156, "lambada/lm_perplexity": 17.82742832393148, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4486936328327187, "mean_loss": 2.337137636931046, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.987, "blimp/accuracy/ellipsis_n_bar_1": 0.861, "blimp/accuracy/tough_vs_raising_2": 0.899, "blimp/accuracy/tough_vs_raising_1": 0.607, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.881, "blimp/accuracy/principle_A_reconstruction": 0.465, "blimp/accuracy/wh_vs_that_with_gap": 0.48, "blimp/accuracy/principle_A_domain_2": 0.876, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.92, "blimp/accuracy/principle_A_domain_3": 0.625, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.905, "blimp/accuracy/animate_subject_trans": 0.903, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.892, "blimp/accuracy/distractor_agreement_relative_clause": 0.69, "blimp/accuracy/transitive": 0.891, "blimp/accuracy/sentential_subject_island": 0.346, "blimp/accuracy/adjunct_island": 0.903, "blimp/accuracy/intransitive": 0.783, "blimp/accuracy/existential_there_subject_raising": 0.884, "blimp/accuracy/irregular_past_participle_adjectives": 0.979, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.727, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.349, "blimp/accuracy/only_npi_scope": 0.699, "blimp/accuracy/superlative_quantifiers_2": 0.795, "blimp/accuracy/passive_1": 0.92, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.904, "blimp/accuracy/inchoative": 0.64, "blimp/accuracy/anaphor_gender_agreement": 0.978, "blimp/accuracy/principle_A_c_command": 0.77, "blimp/accuracy/only_npi_licensor_present": 0.777, "blimp/accuracy/expletive_it_object_raising": 0.769, "blimp/accuracy/left_branch_island_simple_question": 0.8, "blimp/accuracy/wh_questions_subject_gap": 0.949, "blimp/accuracy/existential_there_quantifiers_2": 0.474, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.941, "blimp/accuracy/sentential_negation_npi_scope": 0.679, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.852, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.917, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.907, "blimp/accuracy/principle_A_case_2": 0.937, "blimp/accuracy/distractor_agreement_relational_noun": 0.881, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.977, "blimp/accuracy/superlative_quantifiers_1": 0.848, "blimp/accuracy/wh_island": 0.832, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.58, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.963, "blimp/accuracy/irregular_past_participle_verbs": 0.914, "blimp/accuracy/drop_argument": 0.766, "blimp/accuracy/wh_questions_object_gap": 0.862, "blimp/accuracy/animate_subject_passive": 0.802, "blimp/accuracy/existential_there_quantifiers_1": 0.981, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.899, "blimp/accuracy/npi_present_2": 0.565, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.917, "blimp/accuracy/anaphor_number_agreement": 0.99, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.971, "blimp/accuracy/existential_there_object_raising": 0.843, "blimp/accuracy/matrix_question_npi_licensor_present": 0.384, "blimp/accuracy/npi_present_1": 0.551, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.426, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.976, "blimp/accuracy/causative": 0.763, "blimp/accuracy/group_average": 0.8079402985074627, "blimp/accuracy/seq_average": 0.8079402985074627, "cbt/accuracy/NE": 0.8245192307692307, "cbt/accuracy/V": 0.9452, "cbt/accuracy/CN": 0.882, "cbt/accuracy/P": 0.926, "cbt/accuracy/group_average": 0.8944298076923077, "cbt/accuracy/seq_average": 0.8944577831132453, "hellaswag/accuracy/val": 0.3610834495120494, "hellaswag/accuracy/group_average": 0.3610834495120494, "hellaswag/accuracy/seq_average": 0.3610834495120494, "piqa/accuracy/val": 0.6463547334058759, "piqa/accuracy/group_average": 0.6463547334058759, "piqa/accuracy/seq_average": 0.6463547334058759, "ai2arc/accuracy/ARC-Easy": 0.38308668076109936, "ai2arc/accuracy/ARC-Challenge": 0.23862660944206007, "ai2arc/accuracy/group_average": 0.31085664510157973, "ai2arc/accuracy/seq_average": 0.33541076487252125, "mmlu/accuracy/MMLU": 0.26978905970682876, "mmlu/accuracy/group_average": 0.26978905970682876, "mmlu/accuracy/seq_average": 0.26978905970682876, "openbookqa/accuracy/test": 0.264, "openbookqa/accuracy/group_average": 0.264, "openbookqa/accuracy/seq_average": 0.264, "race/accuracy/test/high": 0.29130931961120643, "race/accuracy/test/middle": 0.37047353760445684, "race/accuracy/group_average": 0.3308914286078316, "race/accuracy/seq_average": 0.31434941224158897, "siqa/accuracy/dev": 0.37922210849539406, "siqa/accuracy/group_average": 0.37922210849539406, "siqa/accuracy/seq_average": 0.37922210849539406, "winogrande/accuracy/dev": 0.5122336227308603, "winogrande/accuracy/group_average": 0.5122336227308603, "winogrande/accuracy/seq_average": 0.5122336227308603, "commonsenseqa/accuracy/dev_rand_split": 0.2719082719082719, "commonsenseqa/accuracy/group_average": 0.2719082719082719, "commonsenseqa/accuracy/seq_average": 0.2719082719082719}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-300000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.2815079461960566, "val/accuracy": 0.52410888671875, "val/perplexity": 9.791434238757116, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4860210537170033, "lambada/accuracy/total": 0.3513198757763975, "lambada/accuracy/openai_last_token": 0.8004658385093167, "lambada/perplexity": 6.666117877144713, "lambada/lm_loss": 2.8869836966201023, "lambada/lm_perplexity": 17.93911809108828, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4377143812475738, "mean_loss": 2.3837644999565297, "blimp/accuracy/passive_2": 0.922, "blimp/accuracy/determiner_noun_agreement_2": 0.985, "blimp/accuracy/ellipsis_n_bar_1": 0.883, "blimp/accuracy/tough_vs_raising_2": 0.88, "blimp/accuracy/tough_vs_raising_1": 0.594, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.91, "blimp/accuracy/principle_A_reconstruction": 0.486, "blimp/accuracy/wh_vs_that_with_gap": 0.476, "blimp/accuracy/principle_A_domain_2": 0.865, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.91, "blimp/accuracy/principle_A_domain_3": 0.616, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.912, "blimp/accuracy/animate_subject_trans": 0.909, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.905, "blimp/accuracy/distractor_agreement_relative_clause": 0.708, "blimp/accuracy/transitive": 0.893, "blimp/accuracy/sentential_subject_island": 0.315, "blimp/accuracy/adjunct_island": 0.868, "blimp/accuracy/intransitive": 0.793, "blimp/accuracy/existential_there_subject_raising": 0.886, "blimp/accuracy/irregular_past_participle_adjectives": 0.971, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.702, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.333, "blimp/accuracy/only_npi_scope": 0.744, "blimp/accuracy/superlative_quantifiers_2": 0.865, "blimp/accuracy/passive_1": 0.91, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.929, "blimp/accuracy/inchoative": 0.648, "blimp/accuracy/anaphor_gender_agreement": 0.966, "blimp/accuracy/principle_A_c_command": 0.774, "blimp/accuracy/only_npi_licensor_present": 0.761, "blimp/accuracy/expletive_it_object_raising": 0.775, "blimp/accuracy/left_branch_island_simple_question": 0.795, "blimp/accuracy/wh_questions_subject_gap": 0.939, "blimp/accuracy/existential_there_quantifiers_2": 0.453, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.956, "blimp/accuracy/sentential_negation_npi_scope": 0.71, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.808, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.912, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.906, "blimp/accuracy/principle_A_case_2": 0.958, "blimp/accuracy/distractor_agreement_relational_noun": 0.861, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.979, "blimp/accuracy/superlative_quantifiers_1": 0.811, "blimp/accuracy/wh_island": 0.808, "blimp/accuracy/principle_A_domain_1": 0.99, "blimp/accuracy/complex_NP_island": 0.521, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.964, "blimp/accuracy/irregular_past_participle_verbs": 0.9, "blimp/accuracy/drop_argument": 0.769, "blimp/accuracy/wh_questions_object_gap": 0.865, "blimp/accuracy/animate_subject_passive": 0.792, "blimp/accuracy/existential_there_quantifiers_1": 0.993, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.897, "blimp/accuracy/npi_present_2": 0.556, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.944, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.971, "blimp/accuracy/existential_there_object_raising": 0.836, "blimp/accuracy/matrix_question_npi_licensor_present": 0.356, "blimp/accuracy/npi_present_1": 0.56, "blimp/accuracy/wh_vs_that_no_gap": 0.979, "blimp/accuracy/left_branch_island_echo_question": 0.456, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.97, "blimp/accuracy/causative": 0.746, "blimp/accuracy/group_average": 0.8065223880597014, "blimp/accuracy/seq_average": 0.8065223880597014, "cbt/accuracy/NE": 0.8221153846153846, "cbt/accuracy/V": 0.9464, "cbt/accuracy/CN": 0.8812, "cbt/accuracy/P": 0.926, "cbt/accuracy/group_average": 0.8939288461538462, "cbt/accuracy/seq_average": 0.8939575830332133, "hellaswag/accuracy/val": 0.36496713802031466, "hellaswag/accuracy/group_average": 0.36496713802031466, "hellaswag/accuracy/seq_average": 0.36496713802031466, "piqa/accuracy/val": 0.6512513601741022, "piqa/accuracy/group_average": 0.6512513601741022, "piqa/accuracy/seq_average": 0.6512513601741022, "ai2arc/accuracy/ARC-Easy": 0.39069767441860465, "ai2arc/accuracy/ARC-Challenge": 0.24377682403433476, "ai2arc/accuracy/group_average": 0.3172372492264697, "ai2arc/accuracy/seq_average": 0.3422096317280453, "mmlu/accuracy/MMLU": 0.2646406864497676, "mmlu/accuracy/group_average": 0.2646406864497676, "mmlu/accuracy/seq_average": 0.2646406864497676, "openbookqa/accuracy/test": 0.272, "openbookqa/accuracy/group_average": 0.272, "openbookqa/accuracy/seq_average": 0.272, "race/accuracy/test/high": 0.29245283018867924, "race/accuracy/test/middle": 0.37047353760445684, "race/accuracy/group_average": 0.33146318389656804, "race/accuracy/seq_average": 0.3151601134981759, "siqa/accuracy/dev": 0.37717502558853633, "siqa/accuracy/group_average": 0.37717502558853633, "siqa/accuracy/seq_average": 0.37717502558853633, "winogrande/accuracy/dev": 0.5082872928176796, "winogrande/accuracy/group_average": 0.5082872928176796, "winogrande/accuracy/seq_average": 0.5082872928176796, "commonsenseqa/accuracy/dev_rand_split": 0.276003276003276, "commonsenseqa/accuracy/group_average": 0.276003276003276, "commonsenseqa/accuracy/seq_average": 0.276003276003276}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-320000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.271977015904018, "val/accuracy": 0.5256008572048612, "val/perplexity": 9.698556072309541, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3864848450844334, "lambada/accuracy/total": 0.3695652173913043, "lambada/accuracy/openai_last_token": 0.8029891304347826, "lambada/perplexity": 6.465890800500516, "lambada/lm_loss": 2.879644975621408, "lambada/lm_perplexity": 17.80794980088976, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.44758303729808274, "mean_loss": 2.3292309304942256, "blimp/accuracy/passive_2": 0.926, "blimp/accuracy/determiner_noun_agreement_2": 0.989, "blimp/accuracy/ellipsis_n_bar_1": 0.873, "blimp/accuracy/tough_vs_raising_2": 0.904, "blimp/accuracy/tough_vs_raising_1": 0.614, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.93, "blimp/accuracy/principle_A_reconstruction": 0.468, "blimp/accuracy/wh_vs_that_with_gap": 0.47, "blimp/accuracy/principle_A_domain_2": 0.872, "blimp/accuracy/determiner_noun_agreement_1": 0.995, "blimp/accuracy/ellipsis_n_bar_2": 0.921, "blimp/accuracy/principle_A_domain_3": 0.625, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.926, "blimp/accuracy/animate_subject_trans": 0.905, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.901, "blimp/accuracy/distractor_agreement_relative_clause": 0.683, "blimp/accuracy/transitive": 0.882, "blimp/accuracy/sentential_subject_island": 0.325, "blimp/accuracy/adjunct_island": 0.905, "blimp/accuracy/intransitive": 0.784, "blimp/accuracy/existential_there_subject_raising": 0.882, "blimp/accuracy/irregular_past_participle_adjectives": 0.962, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.754, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.355, "blimp/accuracy/only_npi_scope": 0.663, "blimp/accuracy/superlative_quantifiers_2": 0.822, "blimp/accuracy/passive_1": 0.91, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.921, "blimp/accuracy/inchoative": 0.654, "blimp/accuracy/anaphor_gender_agreement": 0.974, "blimp/accuracy/principle_A_c_command": 0.747, "blimp/accuracy/only_npi_licensor_present": 0.599, "blimp/accuracy/expletive_it_object_raising": 0.783, "blimp/accuracy/left_branch_island_simple_question": 0.844, "blimp/accuracy/wh_questions_subject_gap": 0.951, "blimp/accuracy/existential_there_quantifiers_2": 0.47, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.951, "blimp/accuracy/sentential_negation_npi_scope": 0.751, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.858, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.926, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.901, "blimp/accuracy/principle_A_case_2": 0.948, "blimp/accuracy/distractor_agreement_relational_noun": 0.866, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.993, "blimp/accuracy/superlative_quantifiers_1": 0.836, "blimp/accuracy/wh_island": 0.847, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.552, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.967, "blimp/accuracy/irregular_past_participle_verbs": 0.924, "blimp/accuracy/drop_argument": 0.766, "blimp/accuracy/wh_questions_object_gap": 0.869, "blimp/accuracy/animate_subject_passive": 0.777, "blimp/accuracy/existential_there_quantifiers_1": 0.993, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.903, "blimp/accuracy/npi_present_2": 0.589, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.945, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.973, "blimp/accuracy/existential_there_object_raising": 0.83, "blimp/accuracy/matrix_question_npi_licensor_present": 0.375, "blimp/accuracy/npi_present_1": 0.614, "blimp/accuracy/wh_vs_that_no_gap": 0.986, "blimp/accuracy/left_branch_island_echo_question": 0.469, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.975, "blimp/accuracy/causative": 0.778, "blimp/accuracy/group_average": 0.8109104477611939, "blimp/accuracy/seq_average": 0.810910447761194, "cbt/accuracy/NE": 0.8177083333333334, "cbt/accuracy/V": 0.9452, "cbt/accuracy/CN": 0.8872, "cbt/accuracy/P": 0.93, "cbt/accuracy/group_average": 0.8950270833333334, "cbt/accuracy/seq_average": 0.8950580232092837, "hellaswag/accuracy/val": 0.3679545907189803, "hellaswag/accuracy/group_average": 0.3679545907189803, "hellaswag/accuracy/seq_average": 0.3679545907189803, "piqa/accuracy/val": 0.6490750816104461, "piqa/accuracy/group_average": 0.6490750816104461, "piqa/accuracy/seq_average": 0.6490750816104461, "ai2arc/accuracy/ARC-Easy": 0.39112050739957716, "ai2arc/accuracy/ARC-Challenge": 0.23433476394849787, "ai2arc/accuracy/group_average": 0.3127276356740375, "ai2arc/accuracy/seq_average": 0.3393767705382436, "mmlu/accuracy/MMLU": 0.26378262424025745, "mmlu/accuracy/group_average": 0.26378262424025745, "mmlu/accuracy/seq_average": 0.26378262424025745, "openbookqa/accuracy/test": 0.28, "openbookqa/accuracy/group_average": 0.28, "openbookqa/accuracy/seq_average": 0.28, "race/accuracy/test/high": 0.2964551172098342, "race/accuracy/test/middle": 0.362116991643454, "race/accuracy/group_average": 0.3292860544266441, "race/accuracy/seq_average": 0.3155654641264694, "siqa/accuracy/dev": 0.38843398157625386, "siqa/accuracy/group_average": 0.38843398157625386, "siqa/accuracy/seq_average": 0.38843398157625386, "winogrande/accuracy/dev": 0.5035516969218626, "winogrande/accuracy/group_average": 0.5035516969218626, "winogrande/accuracy/seq_average": 0.5035516969218626, "commonsenseqa/accuracy/dev_rand_split": 0.276003276003276, "commonsenseqa/accuracy/group_average": 0.276003276003276, "commonsenseqa/accuracy/seq_average": 0.276003276003276}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-340000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.263301304408482, "val/accuracy": 0.5262034582713294, "val/perplexity": 9.614778139970491, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.357429267456813, "lambada/accuracy/total": 0.38722826086956524, "lambada/accuracy/openai_last_token": 0.8035714285714286, "lambada/perplexity": 6.2733680908227, "lambada/lm_loss": 2.86969291564609, "lambada/lm_perplexity": 17.631602979005876, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.45671585957044736, "mean_loss": 2.3103652859326473, "blimp/accuracy/passive_2": 0.92, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.867, "blimp/accuracy/tough_vs_raising_2": 0.918, "blimp/accuracy/tough_vs_raising_1": 0.576, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.92, "blimp/accuracy/principle_A_reconstruction": 0.445, "blimp/accuracy/wh_vs_that_with_gap": 0.459, "blimp/accuracy/principle_A_domain_2": 0.869, "blimp/accuracy/determiner_noun_agreement_1": 0.996, "blimp/accuracy/ellipsis_n_bar_2": 0.917, "blimp/accuracy/principle_A_domain_3": 0.618, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.923, "blimp/accuracy/animate_subject_trans": 0.911, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.922, "blimp/accuracy/distractor_agreement_relative_clause": 0.706, "blimp/accuracy/transitive": 0.905, "blimp/accuracy/sentential_subject_island": 0.32, "blimp/accuracy/adjunct_island": 0.889, "blimp/accuracy/intransitive": 0.769, "blimp/accuracy/existential_there_subject_raising": 0.888, "blimp/accuracy/irregular_past_participle_adjectives": 0.956, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.698, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.347, "blimp/accuracy/only_npi_scope": 0.764, "blimp/accuracy/superlative_quantifiers_2": 0.854, "blimp/accuracy/passive_1": 0.908, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.927, "blimp/accuracy/inchoative": 0.635, "blimp/accuracy/anaphor_gender_agreement": 0.978, "blimp/accuracy/principle_A_c_command": 0.74, "blimp/accuracy/only_npi_licensor_present": 0.733, "blimp/accuracy/expletive_it_object_raising": 0.794, "blimp/accuracy/left_branch_island_simple_question": 0.806, "blimp/accuracy/wh_questions_subject_gap": 0.946, "blimp/accuracy/existential_there_quantifiers_2": 0.486, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.951, "blimp/accuracy/sentential_negation_npi_scope": 0.755, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.838, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.922, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.896, "blimp/accuracy/principle_A_case_2": 0.951, "blimp/accuracy/distractor_agreement_relational_noun": 0.874, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.81, "blimp/accuracy/wh_island": 0.821, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.57, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.969, "blimp/accuracy/irregular_past_participle_verbs": 0.92, "blimp/accuracy/drop_argument": 0.76, "blimp/accuracy/wh_questions_object_gap": 0.876, "blimp/accuracy/animate_subject_passive": 0.793, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.888, "blimp/accuracy/npi_present_2": 0.587, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.948, "blimp/accuracy/anaphor_number_agreement": 0.993, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.976, "blimp/accuracy/existential_there_object_raising": 0.823, "blimp/accuracy/matrix_question_npi_licensor_present": 0.365, "blimp/accuracy/npi_present_1": 0.616, "blimp/accuracy/wh_vs_that_no_gap": 0.984, "blimp/accuracy/left_branch_island_echo_question": 0.476, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.973, "blimp/accuracy/causative": 0.779, "blimp/accuracy/group_average": 0.8115820895522389, "blimp/accuracy/seq_average": 0.8115820895522388, "cbt/accuracy/NE": 0.8205128205128205, "cbt/accuracy/V": 0.946, "cbt/accuracy/CN": 0.8876, "cbt/accuracy/P": 0.9264, "cbt/accuracy/group_average": 0.8951282051282051, "cbt/accuracy/seq_average": 0.8951580632252901, "hellaswag/accuracy/val": 0.36974706233817967, "hellaswag/accuracy/group_average": 0.36974706233817967, "hellaswag/accuracy/seq_average": 0.36974706233817967, "piqa/accuracy/val": 0.6534276387377584, "piqa/accuracy/group_average": 0.6534276387377584, "piqa/accuracy/seq_average": 0.6534276387377584, "ai2arc/accuracy/ARC-Easy": 0.3885835095137421, "ai2arc/accuracy/ARC-Challenge": 0.2334763948497854, "ai2arc/accuracy/group_average": 0.31102995218176377, "ai2arc/accuracy/seq_average": 0.33739376770538243, "mmlu/accuracy/MMLU": 0.26542724347515195, "mmlu/accuracy/group_average": 0.26542724347515195, "mmlu/accuracy/seq_average": 0.26542724347515195, "openbookqa/accuracy/test": 0.274, "openbookqa/accuracy/group_average": 0.274, "openbookqa/accuracy/seq_average": 0.274, "race/accuracy/test/high": 0.29073756432247, "race/accuracy/test/middle": 0.36908077994428967, "race/accuracy/group_average": 0.32990917213337984, "race/accuracy/seq_average": 0.313538710985002, "siqa/accuracy/dev": 0.3889457523029683, "siqa/accuracy/group_average": 0.3889457523029683, "siqa/accuracy/seq_average": 0.3889457523029683, "winogrande/accuracy/dev": 0.5035516969218626, "winogrande/accuracy/group_average": 0.5035516969218626, "winogrande/accuracy/seq_average": 0.5035516969218626, "commonsenseqa/accuracy/dev_rand_split": 0.2809172809172809, "commonsenseqa/accuracy/group_average": 0.2809172809172809, "commonsenseqa/accuracy/seq_average": 0.2809172809172809}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-360000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.2577693878658236, "val/accuracy": 0.5277099609375, "val/perplexity": 9.561736835024677, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3324829480662848, "lambada/accuracy/total": 0.374805900621118, "lambada/accuracy/openai_last_token": 0.8037655279503105, "lambada/perplexity": 6.341823215212337, "lambada/lm_loss": 2.856788081204703, "lambada/lm_perplexity": 17.40553190390577, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.451257930779309, "mean_loss": 2.295126167966054, "blimp/accuracy/passive_2": 0.918, "blimp/accuracy/determiner_noun_agreement_2": 0.986, "blimp/accuracy/ellipsis_n_bar_1": 0.868, "blimp/accuracy/tough_vs_raising_2": 0.902, "blimp/accuracy/tough_vs_raising_1": 0.62, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.91, "blimp/accuracy/principle_A_reconstruction": 0.496, "blimp/accuracy/wh_vs_that_with_gap": 0.495, "blimp/accuracy/principle_A_domain_2": 0.873, "blimp/accuracy/determiner_noun_agreement_1": 0.996, "blimp/accuracy/ellipsis_n_bar_2": 0.919, "blimp/accuracy/principle_A_domain_3": 0.622, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.92, "blimp/accuracy/animate_subject_trans": 0.906, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.913, "blimp/accuracy/distractor_agreement_relative_clause": 0.694, "blimp/accuracy/transitive": 0.895, "blimp/accuracy/sentential_subject_island": 0.318, "blimp/accuracy/adjunct_island": 0.877, "blimp/accuracy/intransitive": 0.796, "blimp/accuracy/existential_there_subject_raising": 0.896, "blimp/accuracy/irregular_past_participle_adjectives": 0.929, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.715, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.365, "blimp/accuracy/only_npi_scope": 0.704, "blimp/accuracy/superlative_quantifiers_2": 0.82, "blimp/accuracy/passive_1": 0.9, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.923, "blimp/accuracy/inchoative": 0.652, "blimp/accuracy/anaphor_gender_agreement": 0.974, "blimp/accuracy/principle_A_c_command": 0.773, "blimp/accuracy/only_npi_licensor_present": 0.635, "blimp/accuracy/expletive_it_object_raising": 0.78, "blimp/accuracy/left_branch_island_simple_question": 0.792, "blimp/accuracy/wh_questions_subject_gap": 0.948, "blimp/accuracy/existential_there_quantifiers_2": 0.409, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.958, "blimp/accuracy/sentential_negation_npi_scope": 0.743, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.827, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.917, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.9, "blimp/accuracy/principle_A_case_2": 0.956, "blimp/accuracy/distractor_agreement_relational_noun": 0.847, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987, "blimp/accuracy/superlative_quantifiers_1": 0.808, "blimp/accuracy/wh_island": 0.797, "blimp/accuracy/principle_A_domain_1": 0.99, "blimp/accuracy/complex_NP_island": 0.548, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.962, "blimp/accuracy/irregular_past_participle_verbs": 0.912, "blimp/accuracy/drop_argument": 0.776, "blimp/accuracy/wh_questions_object_gap": 0.879, "blimp/accuracy/animate_subject_passive": 0.783, "blimp/accuracy/existential_there_quantifiers_1": 0.99, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.877, "blimp/accuracy/npi_present_2": 0.582, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.953, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.972, "blimp/accuracy/existential_there_object_raising": 0.809, "blimp/accuracy/matrix_question_npi_licensor_present": 0.38, "blimp/accuracy/npi_present_1": 0.581, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.461, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974, "blimp/accuracy/causative": 0.771, "blimp/accuracy/group_average": 0.8067313432835822, "blimp/accuracy/seq_average": 0.8067313432835821, "cbt/accuracy/NE": 0.8245192307692307, "cbt/accuracy/V": 0.9456, "cbt/accuracy/CN": 0.8844, "cbt/accuracy/P": 0.924, "cbt/accuracy/group_average": 0.8946298076923076, "cbt/accuracy/seq_average": 0.8946578631452581, "hellaswag/accuracy/val": 0.3703445528779128, "hellaswag/accuracy/group_average": 0.3703445528779128, "hellaswag/accuracy/seq_average": 0.3703445528779128, "piqa/accuracy/val": 0.6528835690968444, "piqa/accuracy/group_average": 0.6528835690968444, "piqa/accuracy/seq_average": 0.6528835690968444, "ai2arc/accuracy/ARC-Easy": 0.39281183932346725, "ai2arc/accuracy/ARC-Challenge": 0.2240343347639485, "ai2arc/accuracy/group_average": 0.30842308704370786, "ai2arc/accuracy/seq_average": 0.3371104815864023, "mmlu/accuracy/MMLU": 0.2661422953164104, "mmlu/accuracy/group_average": 0.2661422953164104, "mmlu/accuracy/seq_average": 0.2661422953164104, "openbookqa/accuracy/test": 0.278, "openbookqa/accuracy/group_average": 0.278, "openbookqa/accuracy/seq_average": 0.278, "race/accuracy/test/high": 0.29817038307604343, "race/accuracy/test/middle": 0.36559888579387184, "race/accuracy/group_average": 0.33188463443495764, "race/accuracy/seq_average": 0.3177948925820835, "siqa/accuracy/dev": 0.3828045035823951, "siqa/accuracy/group_average": 0.3828045035823951, "siqa/accuracy/seq_average": 0.3828045035823951, "winogrande/accuracy/dev": 0.4996053670086819, "winogrande/accuracy/group_average": 0.4996053670086819, "winogrande/accuracy/seq_average": 0.4996053670086819, "commonsenseqa/accuracy/dev_rand_split": 0.276003276003276, "commonsenseqa/accuracy/group_average": 0.276003276003276, "commonsenseqa/accuracy/seq_average": 0.276003276003276}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-380000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.2541009812127974, "val/accuracy": 0.5284007238963294, "val/perplexity": 9.526724754542801, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.358140270162073, "lambada/accuracy/total": 0.38198757763975155, "lambada/accuracy/openai_last_token": 0.8057065217391305, "lambada/perplexity": 6.324518510137317, "lambada/lm_loss": 2.853550610460061, "lambada/lm_perplexity": 17.349273120792542, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4551941507680405, "mean_loss": 2.306120625687435, "blimp/accuracy/passive_2": 0.922, "blimp/accuracy/determiner_noun_agreement_2": 0.983, "blimp/accuracy/ellipsis_n_bar_1": 0.873, "blimp/accuracy/tough_vs_raising_2": 0.923, "blimp/accuracy/tough_vs_raising_1": 0.596, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.928, "blimp/accuracy/principle_A_reconstruction": 0.463, "blimp/accuracy/wh_vs_that_with_gap": 0.46, "blimp/accuracy/principle_A_domain_2": 0.869, "blimp/accuracy/determiner_noun_agreement_1": 0.995, "blimp/accuracy/ellipsis_n_bar_2": 0.916, "blimp/accuracy/principle_A_domain_3": 0.613, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.915, "blimp/accuracy/animate_subject_trans": 0.898, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.909, "blimp/accuracy/distractor_agreement_relative_clause": 0.708, "blimp/accuracy/transitive": 0.896, "blimp/accuracy/sentential_subject_island": 0.33, "blimp/accuracy/adjunct_island": 0.887, "blimp/accuracy/intransitive": 0.788, "blimp/accuracy/existential_there_subject_raising": 0.886, "blimp/accuracy/irregular_past_participle_adjectives": 0.969, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.723, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.354, "blimp/accuracy/only_npi_scope": 0.756, "blimp/accuracy/superlative_quantifiers_2": 0.838, "blimp/accuracy/passive_1": 0.91, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.916, "blimp/accuracy/inchoative": 0.654, "blimp/accuracy/anaphor_gender_agreement": 0.978, "blimp/accuracy/principle_A_c_command": 0.774, "blimp/accuracy/only_npi_licensor_present": 0.753, "blimp/accuracy/expletive_it_object_raising": 0.782, "blimp/accuracy/left_branch_island_simple_question": 0.839, "blimp/accuracy/wh_questions_subject_gap": 0.943, "blimp/accuracy/existential_there_quantifiers_2": 0.49, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.951, "blimp/accuracy/sentential_negation_npi_scope": 0.784, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.815, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.919, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.894, "blimp/accuracy/principle_A_case_2": 0.951, "blimp/accuracy/distractor_agreement_relational_noun": 0.86, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.991, "blimp/accuracy/superlative_quantifiers_1": 0.844, "blimp/accuracy/wh_island": 0.792, "blimp/accuracy/principle_A_domain_1": 0.99, "blimp/accuracy/complex_NP_island": 0.552, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.966, "blimp/accuracy/irregular_past_participle_verbs": 0.92, "blimp/accuracy/drop_argument": 0.756, "blimp/accuracy/wh_questions_object_gap": 0.882, "blimp/accuracy/animate_subject_passive": 0.79, "blimp/accuracy/existential_there_quantifiers_1": 0.989, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.893, "blimp/accuracy/npi_present_2": 0.59, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.942, "blimp/accuracy/anaphor_number_agreement": 0.987, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.972, "blimp/accuracy/existential_there_object_raising": 0.806, "blimp/accuracy/matrix_question_npi_licensor_present": 0.387, "blimp/accuracy/npi_present_1": 0.612, "blimp/accuracy/wh_vs_that_no_gap": 0.987, "blimp/accuracy/left_branch_island_echo_question": 0.459, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.967, "blimp/accuracy/causative": 0.791, "blimp/accuracy/group_average": 0.8130746268656719, "blimp/accuracy/seq_average": 0.8130746268656717, "cbt/accuracy/NE": 0.8225160256410257, "cbt/accuracy/V": 0.942, "cbt/accuracy/CN": 0.8844, "cbt/accuracy/P": 0.9304, "cbt/accuracy/group_average": 0.8948290064102564, "cbt/accuracy/seq_average": 0.8948579431772709, "hellaswag/accuracy/val": 0.3745269866560446, "hellaswag/accuracy/group_average": 0.3745269866560446, "hellaswag/accuracy/seq_average": 0.3745269866560446, "piqa/accuracy/val": 0.6588683351468988, "piqa/accuracy/group_average": 0.6588683351468988, "piqa/accuracy/seq_average": 0.6588683351468988, "ai2arc/accuracy/ARC-Easy": 0.3885835095137421, "ai2arc/accuracy/ARC-Challenge": 0.22832618025751072, "ai2arc/accuracy/group_average": 0.3084548448856264, "ai2arc/accuracy/seq_average": 0.3356940509915014, "mmlu/accuracy/MMLU": 0.2664998212370397, "mmlu/accuracy/group_average": 0.2664998212370397, "mmlu/accuracy/seq_average": 0.2664998212370397, "openbookqa/accuracy/test": 0.274, "openbookqa/accuracy/group_average": 0.274, "openbookqa/accuracy/seq_average": 0.274, "race/accuracy/test/high": 0.29302458547741567, "race/accuracy/test/middle": 0.37952646239554316, "race/accuracy/group_average": 0.3362755239364794, "race/accuracy/seq_average": 0.318200243210377, "siqa/accuracy/dev": 0.38178096212896623, "siqa/accuracy/group_average": 0.38178096212896623, "siqa/accuracy/seq_average": 0.38178096212896623, "winogrande/accuracy/dev": 0.5027624309392266, "winogrande/accuracy/group_average": 0.5027624309392266, "winogrande/accuracy/seq_average": 0.5027624309392266, "commonsenseqa/accuracy/dev_rand_split": 0.27764127764127766, "commonsenseqa/accuracy/group_average": 0.27764127764127766, "commonsenseqa/accuracy/seq_average": 0.27764127764127766}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-40000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.5839964851500494, "val/accuracy": 0.4805113777281746, "val/perplexity": 13.249985858518638, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.564561168599573, "lambada/accuracy/total": 0.2905667701863354, "lambada/accuracy/openai_last_token": 0.7717391304347826, "lambada/perplexity": 10.09801588766541, "lambada/lm_loss": 3.1523792961113686, "lambada/lm_perplexity": 23.3916540943868, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.385539073957255, "mean_loss": 2.574278826874811, "blimp/accuracy/passive_2": 0.885, "blimp/accuracy/determiner_noun_agreement_2": 0.981, "blimp/accuracy/ellipsis_n_bar_1": 0.832, "blimp/accuracy/tough_vs_raising_2": 0.859, "blimp/accuracy/tough_vs_raising_1": 0.658, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.9, "blimp/accuracy/principle_A_reconstruction": 0.321, "blimp/accuracy/wh_vs_that_with_gap": 0.501, "blimp/accuracy/principle_A_domain_2": 0.856, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.889, "blimp/accuracy/principle_A_domain_3": 0.645, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.907, "blimp/accuracy/animate_subject_trans": 0.901, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.903, "blimp/accuracy/distractor_agreement_relative_clause": 0.614, "blimp/accuracy/transitive": 0.88, "blimp/accuracy/sentential_subject_island": 0.363, "blimp/accuracy/adjunct_island": 0.771, "blimp/accuracy/intransitive": 0.798, "blimp/accuracy/existential_there_subject_raising": 0.839, "blimp/accuracy/irregular_past_participle_adjectives": 0.924, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.423, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.228, "blimp/accuracy/only_npi_scope": 0.734, "blimp/accuracy/superlative_quantifiers_2": 0.722, "blimp/accuracy/passive_1": 0.878, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.902, "blimp/accuracy/inchoative": 0.643, "blimp/accuracy/anaphor_gender_agreement": 0.946, "blimp/accuracy/principle_A_c_command": 0.683, "blimp/accuracy/only_npi_licensor_present": 0.522, "blimp/accuracy/expletive_it_object_raising": 0.804, "blimp/accuracy/left_branch_island_simple_question": 0.457, "blimp/accuracy/wh_questions_subject_gap": 0.934, "blimp/accuracy/existential_there_quantifiers_2": 0.441, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.94, "blimp/accuracy/sentential_negation_npi_scope": 0.643, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.786, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.898, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.886, "blimp/accuracy/principle_A_case_2": 0.937, "blimp/accuracy/distractor_agreement_relational_noun": 0.834, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.998, "blimp/accuracy/superlative_quantifiers_1": 0.753, "blimp/accuracy/wh_island": 0.792, "blimp/accuracy/principle_A_domain_1": 0.973, "blimp/accuracy/complex_NP_island": 0.533, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.955, "blimp/accuracy/irregular_past_participle_verbs": 0.844, "blimp/accuracy/drop_argument": 0.765, "blimp/accuracy/wh_questions_object_gap": 0.834, "blimp/accuracy/animate_subject_passive": 0.788, "blimp/accuracy/existential_there_quantifiers_1": 0.984, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.908, "blimp/accuracy/npi_present_2": 0.574, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.924, "blimp/accuracy/anaphor_number_agreement": 0.99, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.959, "blimp/accuracy/existential_there_object_raising": 0.837, "blimp/accuracy/matrix_question_npi_licensor_present": 0.213, "blimp/accuracy/npi_present_1": 0.505, "blimp/accuracy/wh_vs_that_no_gap": 0.979, "blimp/accuracy/left_branch_island_echo_question": 0.41, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.97, "blimp/accuracy/causative": 0.694, "blimp/accuracy/group_average": 0.7707313432835824, "blimp/accuracy/seq_average": 0.7707313432835821, "cbt/accuracy/NE": 0.7644230769230769, "cbt/accuracy/V": 0.9152, "cbt/accuracy/CN": 0.8404, "cbt/accuracy/P": 0.8952, "cbt/accuracy/group_average": 0.8538057692307692, "cbt/accuracy/seq_average": 0.8538415366146459, "hellaswag/accuracy/val": 0.30462059350726944, "hellaswag/accuracy/group_average": 0.30462059350726944, "hellaswag/accuracy/seq_average": 0.30462059350726944, "piqa/accuracy/val": 0.5854189336235038, "piqa/accuracy/group_average": 0.5854189336235038, "piqa/accuracy/seq_average": 0.5854189336235038, "ai2arc/accuracy/ARC-Easy": 0.34291754756871035, "ai2arc/accuracy/ARC-Challenge": 0.20257510729613734, "ai2arc/accuracy/group_average": 0.27274632743242383, "ai2arc/accuracy/seq_average": 0.29660056657223793, "mmlu/accuracy/MMLU": 0.26149445834823026, "mmlu/accuracy/group_average": 0.26149445834823026, "mmlu/accuracy/seq_average": 0.26149445834823026, "openbookqa/accuracy/test": 0.272, "openbookqa/accuracy/group_average": 0.272, "openbookqa/accuracy/seq_average": 0.272, "race/accuracy/test/high": 0.2712978845054317, "race/accuracy/test/middle": 0.34052924791086353, "race/accuracy/group_average": 0.3059135662081476, "race/accuracy/seq_average": 0.2914471017430077, "siqa/accuracy/dev": 0.36284544524053225, "siqa/accuracy/group_average": 0.36284544524053225, "siqa/accuracy/seq_average": 0.36284544524053225, "winogrande/accuracy/dev": 0.505130228887135, "winogrande/accuracy/group_average": 0.505130228887135, "winogrande/accuracy/seq_average": 0.505130228887135, "commonsenseqa/accuracy/dev_rand_split": 0.2571662571662572, "commonsenseqa/accuracy/group_average": 0.2571662571662572, "commonsenseqa/accuracy/seq_average": 0.2571662571662572}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-400000.pth.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.2501395089285716,
|
| 3 |
+
"val/accuracy": 0.5290372333829365,
|
| 4 |
+
"val/perplexity": 9.489059552552593,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.343814826159744,
|
| 8 |
+
"lambada/accuracy/total": 0.3687888198757764,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.8033773291925466,
|
| 10 |
+
"lambada/perplexity": 6.354823743479974,
|
| 11 |
+
"lambada/lm_loss": 2.854711919842938,
|
| 12 |
+
"lambada/lm_perplexity": 17.369432697941193,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.44891302662935645,
|
| 16 |
+
"mean_loss": 2.2969771675441577,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.924,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.986,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.845,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.909,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.621,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.933,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.48,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.469,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.871,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.995,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.923,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.612,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.92,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.903,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.917,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.72,
|
| 33 |
+
"blimp/accuracy/transitive": 0.898,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.339,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.893,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.779,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.893,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.921,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.732,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.35,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.727,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.812,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.914,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.918,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.646,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.974,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.758,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.765,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.789,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.824,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.95,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.483,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.953,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.782,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.832,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.913,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.899,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.953,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.861,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.987,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.762,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.824,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.99,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.578,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.972,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.922,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.76,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.877,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.793,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.993,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.895,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.605,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.947,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.994,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.974,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.824,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.38,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.593,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.984,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.462,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.97,
|
| 83 |
+
"blimp/accuracy/causative": 0.783,
|
| 84 |
+
"blimp/accuracy/group_average": 0.812761194029851,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.8127611940298507,
|
| 86 |
+
"cbt/accuracy/NE": 0.8257211538461539,
|
| 87 |
+
"cbt/accuracy/V": 0.9432,
|
| 88 |
+
"cbt/accuracy/CN": 0.8864,
|
| 89 |
+
"cbt/accuracy/P": 0.9304,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8964302884615385,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8964585834333734,
|
| 92 |
+
"hellaswag/accuracy/val": 0.3732324238199562,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.3732324238199562,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.3732324238199562,
|
| 95 |
+
"piqa/accuracy/val": 0.6572361262241567,
|
| 96 |
+
"piqa/accuracy/group_average": 0.6572361262241567,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.6572361262241567,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.3885835095137421,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.23948497854077253,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.3140342440272573,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.3393767705382436,
|
| 102 |
+
"mmlu/accuracy/MMLU": 0.2646406864497676,
|
| 103 |
+
"mmlu/accuracy/group_average": 0.2646406864497676,
|
| 104 |
+
"mmlu/accuracy/seq_average": 0.2646406864497676,
|
| 105 |
+
"openbookqa/accuracy/test": 0.278,
|
| 106 |
+
"openbookqa/accuracy/group_average": 0.278,
|
| 107 |
+
"openbookqa/accuracy/seq_average": 0.278,
|
| 108 |
+
"race/accuracy/test/high": 0.29130931961120643,
|
| 109 |
+
"race/accuracy/test/middle": 0.3711699164345404,
|
| 110 |
+
"race/accuracy/group_average": 0.3312396180228734,
|
| 111 |
+
"race/accuracy/seq_average": 0.3145520875557357,
|
| 112 |
+
"siqa/accuracy/dev": 0.3858751279426817,
|
| 113 |
+
"siqa/accuracy/group_average": 0.3858751279426817,
|
| 114 |
+
"siqa/accuracy/seq_average": 0.3858751279426817,
|
| 115 |
+
"winogrande/accuracy/dev": 0.4972375690607735,
|
| 116 |
+
"winogrande/accuracy/group_average": 0.4972375690607735,
|
| 117 |
+
"winogrande/accuracy/seq_average": 0.4972375690607735,
|
| 118 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.2809172809172809,
|
| 119 |
+
"commonsenseqa/accuracy/group_average": 0.2809172809172809,
|
| 120 |
+
"commonsenseqa/accuracy/seq_average": 0.2809172809172809
|
| 121 |
+
}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-60000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.5182131812686013, "val/accuracy": 0.4892781575520833, "val/perplexity": 12.406408842964098, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4556551157317545, "lambada/accuracy/total": 0.266110248447205, "lambada/accuracy/openai_last_token": 0.7626164596273292, "lambada/perplexity": 10.023155951663666, "lambada/lm_loss": 3.082164341254193, "lambada/lm_perplexity": 21.805546003152017, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3776942029996442, "mean_loss": 2.486934148500178, "blimp/accuracy/passive_2": 0.907, "blimp/accuracy/determiner_noun_agreement_2": 0.985, "blimp/accuracy/ellipsis_n_bar_1": 0.844, "blimp/accuracy/tough_vs_raising_2": 0.853, "blimp/accuracy/tough_vs_raising_1": 0.645, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.914, "blimp/accuracy/principle_A_reconstruction": 0.532, "blimp/accuracy/wh_vs_that_with_gap": 0.491, "blimp/accuracy/principle_A_domain_2": 0.848, "blimp/accuracy/determiner_noun_agreement_1": 0.997, "blimp/accuracy/ellipsis_n_bar_2": 0.889, "blimp/accuracy/principle_A_domain_3": 0.604, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.924, "blimp/accuracy/animate_subject_trans": 0.893, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.911, "blimp/accuracy/distractor_agreement_relative_clause": 0.683, "blimp/accuracy/transitive": 0.872, "blimp/accuracy/sentential_subject_island": 0.368, "blimp/accuracy/adjunct_island": 0.785, "blimp/accuracy/intransitive": 0.765, "blimp/accuracy/existential_there_subject_raising": 0.872, "blimp/accuracy/irregular_past_participle_adjectives": 0.864, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.467, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.279, "blimp/accuracy/only_npi_scope": 0.675, "blimp/accuracy/superlative_quantifiers_2": 0.667, "blimp/accuracy/passive_1": 0.882, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.921, "blimp/accuracy/inchoative": 0.643, "blimp/accuracy/anaphor_gender_agreement": 0.969, "blimp/accuracy/principle_A_c_command": 0.687, "blimp/accuracy/only_npi_licensor_present": 0.727, "blimp/accuracy/expletive_it_object_raising": 0.778, "blimp/accuracy/left_branch_island_simple_question": 0.525, "blimp/accuracy/wh_questions_subject_gap": 0.944, "blimp/accuracy/existential_there_quantifiers_2": 0.601, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.956, "blimp/accuracy/sentential_negation_npi_scope": 0.627, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.816, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.877, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.902, "blimp/accuracy/principle_A_case_2": 0.922, "blimp/accuracy/distractor_agreement_relational_noun": 0.861, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.992, "blimp/accuracy/superlative_quantifiers_1": 0.626, "blimp/accuracy/wh_island": 0.867, "blimp/accuracy/principle_A_domain_1": 0.996, "blimp/accuracy/complex_NP_island": 0.512, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.955, "blimp/accuracy/irregular_past_participle_verbs": 0.868, "blimp/accuracy/drop_argument": 0.752, "blimp/accuracy/wh_questions_object_gap": 0.827, "blimp/accuracy/animate_subject_passive": 0.796, "blimp/accuracy/existential_there_quantifiers_1": 0.975, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.897, "blimp/accuracy/npi_present_2": 0.63, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.922, "blimp/accuracy/anaphor_number_agreement": 0.987, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.967, "blimp/accuracy/existential_there_object_raising": 0.816, "blimp/accuracy/matrix_question_npi_licensor_present": 0.272, "blimp/accuracy/npi_present_1": 0.556, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.412, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974, "blimp/accuracy/causative": 0.733, "blimp/accuracy/group_average": 0.7833432835820892, "blimp/accuracy/seq_average": 0.7833432835820896, "cbt/accuracy/NE": 0.7740384615384616, "cbt/accuracy/V": 0.9212, "cbt/accuracy/CN": 0.8436, "cbt/accuracy/P": 0.9004, "cbt/accuracy/group_average": 0.8598096153846153, "cbt/accuracy/seq_average": 0.85984393757503, "hellaswag/accuracy/val": 0.3145787691694881, "hellaswag/accuracy/group_average": 0.3145787691694881, "hellaswag/accuracy/seq_average": 0.3145787691694881, "piqa/accuracy/val": 0.6006528835690969, "piqa/accuracy/group_average": 0.6006528835690969, "piqa/accuracy/seq_average": 0.6006528835690969, "ai2arc/accuracy/ARC-Easy": 0.360676532769556, "ai2arc/accuracy/ARC-Challenge": 0.22317596566523606, "ai2arc/accuracy/group_average": 0.29192624921739607, "ai2arc/accuracy/seq_average": 0.3152974504249292, "mmlu/accuracy/MMLU": 0.2666428316052914, "mmlu/accuracy/group_average": 0.2666428316052914, "mmlu/accuracy/seq_average": 0.2666428316052914, "openbookqa/accuracy/test": 0.286, "openbookqa/accuracy/group_average": 0.286, "openbookqa/accuracy/seq_average": 0.286, "race/accuracy/test/high": 0.27815894797026874, "race/accuracy/test/middle": 0.3488857938718663, "race/accuracy/group_average": 0.3135223709210675, "race/accuracy/seq_average": 0.29874341305229024, "siqa/accuracy/dev": 0.37871033776867963, "siqa/accuracy/group_average": 0.37871033776867963, "siqa/accuracy/seq_average": 0.37871033776867963, "winogrande/accuracy/dev": 0.5185477505919495, "winogrande/accuracy/group_average": 0.5185477505919495, "winogrande/accuracy/seq_average": 0.5185477505919495, "commonsenseqa/accuracy/dev_rand_split": 0.26371826371826373, "commonsenseqa/accuracy/group_average": 0.26371826371826373, "commonsenseqa/accuracy/seq_average": 0.26371826371826373}
|
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_660M_standardlb_deepseek_sigmoidonly/export/result-model-80000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.471464126829117, "val/accuracy": 0.49610634455605157, "val/perplexity": 11.839769091151041, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4314588463824727, "lambada/accuracy/total": 0.28843167701863354, "lambada/accuracy/openai_last_token": 0.7750388198757764, "lambada/perplexity": 9.446693890140182, "lambada/lm_loss": 3.047273201568108, "lambada/lm_perplexity": 21.057845564006403, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.39226901078734255, "mean_loss": 2.451461486605795, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.993, "blimp/accuracy/ellipsis_n_bar_1": 0.85, "blimp/accuracy/tough_vs_raising_2": 0.884, "blimp/accuracy/tough_vs_raising_1": 0.654, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.909, "blimp/accuracy/principle_A_reconstruction": 0.415, "blimp/accuracy/wh_vs_that_with_gap": 0.474, "blimp/accuracy/principle_A_domain_2": 0.862, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.869, "blimp/accuracy/principle_A_domain_3": 0.62, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.925, "blimp/accuracy/animate_subject_trans": 0.891, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.907, "blimp/accuracy/distractor_agreement_relative_clause": 0.673, "blimp/accuracy/transitive": 0.862, "blimp/accuracy/sentential_subject_island": 0.333, "blimp/accuracy/adjunct_island": 0.823, "blimp/accuracy/intransitive": 0.791, "blimp/accuracy/existential_there_subject_raising": 0.859, "blimp/accuracy/irregular_past_participle_adjectives": 0.916, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.523, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.254, "blimp/accuracy/only_npi_scope": 0.743, "blimp/accuracy/superlative_quantifiers_2": 0.683, "blimp/accuracy/passive_1": 0.908, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.913, "blimp/accuracy/inchoative": 0.671, "blimp/accuracy/anaphor_gender_agreement": 0.977, "blimp/accuracy/principle_A_c_command": 0.704, "blimp/accuracy/only_npi_licensor_present": 0.738, "blimp/accuracy/expletive_it_object_raising": 0.782, "blimp/accuracy/left_branch_island_simple_question": 0.576, "blimp/accuracy/wh_questions_subject_gap": 0.923, "blimp/accuracy/existential_there_quantifiers_2": 0.515, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.954, "blimp/accuracy/sentential_negation_npi_scope": 0.716, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.832, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.854, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.899, "blimp/accuracy/principle_A_case_2": 0.926, "blimp/accuracy/distractor_agreement_relational_noun": 0.889, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.995, "blimp/accuracy/superlative_quantifiers_1": 0.756, "blimp/accuracy/wh_island": 0.851, "blimp/accuracy/principle_A_domain_1": 0.985, "blimp/accuracy/complex_NP_island": 0.5, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.967, "blimp/accuracy/irregular_past_participle_verbs": 0.883, "blimp/accuracy/drop_argument": 0.763, "blimp/accuracy/wh_questions_object_gap": 0.833, "blimp/accuracy/animate_subject_passive": 0.786, "blimp/accuracy/existential_there_quantifiers_1": 0.982, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.902, "blimp/accuracy/npi_present_2": 0.584, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.954, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.966, "blimp/accuracy/existential_there_object_raising": 0.813, "blimp/accuracy/matrix_question_npi_licensor_present": 0.297, "blimp/accuracy/npi_present_1": 0.545, "blimp/accuracy/wh_vs_that_no_gap": 0.966, "blimp/accuracy/left_branch_island_echo_question": 0.381, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974, "blimp/accuracy/causative": 0.747, "blimp/accuracy/group_average": 0.788268656716418, "blimp/accuracy/seq_average": 0.7882686567164179, "cbt/accuracy/NE": 0.7828525641025641, "cbt/accuracy/V": 0.924, "cbt/accuracy/CN": 0.8512, "cbt/accuracy/P": 0.9064, "cbt/accuracy/group_average": 0.8661131410256411, "cbt/accuracy/seq_average": 0.8661464585834334, "hellaswag/accuracy/val": 0.3202549292969528, "hellaswag/accuracy/group_average": 0.3202549292969528, "hellaswag/accuracy/seq_average": 0.3202549292969528, "piqa/accuracy/val": 0.6180631120783461, "piqa/accuracy/group_average": 0.6180631120783461, "piqa/accuracy/seq_average": 0.6180631120783461, "ai2arc/accuracy/ARC-Easy": 0.36194503171247355, "ai2arc/accuracy/ARC-Challenge": 0.21545064377682405, "ai2arc/accuracy/group_average": 0.2886978377446488, "ai2arc/accuracy/seq_average": 0.31359773371104815, "mmlu/accuracy/MMLU": 0.2661422953164104, "mmlu/accuracy/group_average": 0.2661422953164104, "mmlu/accuracy/seq_average": 0.2661422953164104, "openbookqa/accuracy/test": 0.29, "openbookqa/accuracy/group_average": 0.29, "openbookqa/accuracy/seq_average": 0.29, "race/accuracy/test/high": 0.28530588907947396, "race/accuracy/test/middle": 0.3683844011142061, "race/accuracy/group_average": 0.32684514509684004, "race/accuracy/seq_average": 0.3094852047020673, "siqa/accuracy/dev": 0.3664278403275333, "siqa/accuracy/group_average": 0.3664278403275333, "siqa/accuracy/seq_average": 0.3664278403275333, "winogrande/accuracy/dev": 0.5193370165745856, "winogrande/accuracy/group_average": 0.5193370165745856, "winogrande/accuracy/seq_average": 0.5193370165745856, "commonsenseqa/accuracy/dev_rand_split": 0.26044226044226043, "commonsenseqa/accuracy/group_average": 0.26044226044226043, "commonsenseqa/accuracy/seq_average": 0.26044226044226043}
|