3ee88c8c5fb6f44307a3a6cad2009140e715b5b7ae323893b2cddcb8f5299597
Browse files- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-100000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-120000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-140000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-160000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-180000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-20000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-200000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-220000.pth.json +112 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-240000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-260000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-280000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-300000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-320000.pth.json +112 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-340000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-360000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-380000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-40000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-400000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-60000.pth.json +112 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-80000.pth.json +1 -0
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-100000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.4347451830667164, "val/accuracy": 0.5009688120039683, "val/perplexity": 11.412910140044085, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4218435346710017, "lambada/accuracy/total": 0.328610248447205, "lambada/accuracy/openai_last_token": 0.7833850931677019, "lambada/perplexity": 8.064033579656154, "lambada/lm_loss": 3.0231085900226917, "lambada/lm_perplexity": 20.55508981882167, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4147895302255866, "mean_loss": 2.4282943588688592, "blimp/accuracy/passive_2": 0.884, "blimp/accuracy/determiner_noun_agreement_2": 0.979, "blimp/accuracy/ellipsis_n_bar_1": 0.819, "blimp/accuracy/tough_vs_raising_2": 0.844, "blimp/accuracy/tough_vs_raising_1": 0.591, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.935, "blimp/accuracy/principle_A_reconstruction": 0.262, "blimp/accuracy/wh_vs_that_with_gap": 0.484, "blimp/accuracy/principle_A_domain_2": 0.897, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.907, "blimp/accuracy/principle_A_domain_3": 0.662, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.898, "blimp/accuracy/animate_subject_trans": 0.904, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.895, "blimp/accuracy/distractor_agreement_relative_clause": 0.685, "blimp/accuracy/transitive": 0.862, "blimp/accuracy/sentential_subject_island": 0.337, "blimp/accuracy/adjunct_island": 0.83, "blimp/accuracy/intransitive": 0.804, "blimp/accuracy/existential_there_subject_raising": 0.867, "blimp/accuracy/irregular_past_participle_adjectives": 0.805, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.553, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.337, "blimp/accuracy/only_npi_scope": 0.678, "blimp/accuracy/superlative_quantifiers_2": 0.926, "blimp/accuracy/passive_1": 0.895, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.901, "blimp/accuracy/inchoative": 0.621, "blimp/accuracy/anaphor_gender_agreement": 0.979, "blimp/accuracy/principle_A_c_command": 0.789, "blimp/accuracy/only_npi_licensor_present": 0.55, "blimp/accuracy/expletive_it_object_raising": 0.796, "blimp/accuracy/left_branch_island_simple_question": 0.656, "blimp/accuracy/wh_questions_subject_gap": 0.933, "blimp/accuracy/existential_there_quantifiers_2": 0.535, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.923, "blimp/accuracy/sentential_negation_npi_scope": 0.7, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.813, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.887, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.911, "blimp/accuracy/principle_A_case_2": 0.956, "blimp/accuracy/distractor_agreement_relational_noun": 0.859, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987, "blimp/accuracy/superlative_quantifiers_1": 0.802, "blimp/accuracy/wh_island": 0.712, "blimp/accuracy/principle_A_domain_1": 0.986, "blimp/accuracy/complex_NP_island": 0.58, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.966, "blimp/accuracy/irregular_past_participle_verbs": 0.884, "blimp/accuracy/drop_argument": 0.76, "blimp/accuracy/wh_questions_object_gap": 0.79, "blimp/accuracy/animate_subject_passive": 0.796, "blimp/accuracy/existential_there_quantifiers_1": 0.992, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.924, "blimp/accuracy/npi_present_2": 0.62, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.946, "blimp/accuracy/anaphor_number_agreement": 0.994, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.959, "blimp/accuracy/existential_there_object_raising": 0.83, "blimp/accuracy/matrix_question_npi_licensor_present": 0.341, "blimp/accuracy/npi_present_1": 0.638, "blimp/accuracy/wh_vs_that_no_gap": 0.976, "blimp/accuracy/left_branch_island_echo_question": 0.493, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.957, "blimp/accuracy/causative": 0.745, "blimp/accuracy/group_average": 0.7913134328358208, "blimp/accuracy/seq_average": 0.7913134328358209, "cbt/accuracy/NE": 0.8020833333333334, "cbt/accuracy/V": 0.9252, "cbt/accuracy/CN": 0.8664, "cbt/accuracy/P": 0.912, "cbt/accuracy/group_average": 0.8764208333333333, "cbt/accuracy/seq_average": 0.8764505802320929, "hellaswag/accuracy/val": 0.32792272455686117, "hellaswag/accuracy/group_average": 0.32792272455686117, "hellaswag/accuracy/seq_average": 0.32792272455686117, "piqa/accuracy/val": 0.6235038084874864, "piqa/accuracy/group_average": 0.6235038084874864, "piqa/accuracy/seq_average": 0.6235038084874864, "ai2arc/accuracy/ARC-Easy": 0.3594080338266385, "ai2arc/accuracy/ARC-Challenge": 0.22489270386266094, "ai2arc/accuracy/group_average": 0.29215036884464973, "ai2arc/accuracy/seq_average": 0.31501416430594903, "race/accuracy/test/high": 0.2804459691252144, "race/accuracy/test/middle": 0.3467966573816156, "race/accuracy/group_average": 0.313621313253415, "race/accuracy/seq_average": 0.29975678962302393, "siqa/accuracy/dev": 0.3654042988741044, "siqa/accuracy/group_average": 0.3654042988741044, "siqa/accuracy/seq_average": 0.3654042988741044, "commonsenseqa/accuracy/dev_rand_split": 0.2596232596232596, "commonsenseqa/accuracy/group_average": 0.2596232596232596, "commonsenseqa/accuracy/seq_average": 0.2596232596232596}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-120000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.4090658520895336, "val/accuracy": 0.5038413395957341, "val/perplexity": 11.123565234941271, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5227858264994176, "lambada/accuracy/total": 0.343944099378882, "lambada/accuracy/openai_last_token": 0.7880434782608695, "lambada/perplexity": 7.380432859198557, "lambada/lm_loss": 2.991333918615103, "lambada/lm_perplexity": 19.912226073167226, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.423892719487308, "mean_loss": 2.4659258392944756, "blimp/accuracy/passive_2": 0.906, "blimp/accuracy/determiner_noun_agreement_2": 0.985, "blimp/accuracy/ellipsis_n_bar_1": 0.781, "blimp/accuracy/tough_vs_raising_2": 0.884, "blimp/accuracy/tough_vs_raising_1": 0.622, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.927, "blimp/accuracy/principle_A_reconstruction": 0.35, "blimp/accuracy/wh_vs_that_with_gap": 0.525, "blimp/accuracy/principle_A_domain_2": 0.885, "blimp/accuracy/determiner_noun_agreement_1": 0.983, "blimp/accuracy/ellipsis_n_bar_2": 0.917, "blimp/accuracy/principle_A_domain_3": 0.622, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.91, "blimp/accuracy/animate_subject_trans": 0.905, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.883, "blimp/accuracy/distractor_agreement_relative_clause": 0.696, "blimp/accuracy/transitive": 0.874, "blimp/accuracy/sentential_subject_island": 0.367, "blimp/accuracy/adjunct_island": 0.864, "blimp/accuracy/intransitive": 0.798, "blimp/accuracy/existential_there_subject_raising": 0.876, "blimp/accuracy/irregular_past_participle_adjectives": 0.88, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.567, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.376, "blimp/accuracy/only_npi_scope": 0.693, "blimp/accuracy/superlative_quantifiers_2": 0.794, "blimp/accuracy/passive_1": 0.899, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.899, "blimp/accuracy/inchoative": 0.634, "blimp/accuracy/anaphor_gender_agreement": 0.972, "blimp/accuracy/principle_A_c_command": 0.76, "blimp/accuracy/only_npi_licensor_present": 0.558, "blimp/accuracy/expletive_it_object_raising": 0.765, "blimp/accuracy/left_branch_island_simple_question": 0.677, "blimp/accuracy/wh_questions_subject_gap": 0.936, "blimp/accuracy/existential_there_quantifiers_2": 0.479, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.944, "blimp/accuracy/sentential_negation_npi_scope": 0.709, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.807, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.92, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.912, "blimp/accuracy/principle_A_case_2": 0.959, "blimp/accuracy/distractor_agreement_relational_noun": 0.864, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.996, "blimp/accuracy/superlative_quantifiers_1": 0.871, "blimp/accuracy/wh_island": 0.808, "blimp/accuracy/principle_A_domain_1": 0.993, "blimp/accuracy/complex_NP_island": 0.513, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.969, "blimp/accuracy/irregular_past_participle_verbs": 0.888, "blimp/accuracy/drop_argument": 0.746, "blimp/accuracy/wh_questions_object_gap": 0.785, "blimp/accuracy/animate_subject_passive": 0.81, "blimp/accuracy/existential_there_quantifiers_1": 0.975, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.899, "blimp/accuracy/npi_present_2": 0.652, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.92, "blimp/accuracy/anaphor_number_agreement": 0.996, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.957, "blimp/accuracy/existential_there_object_raising": 0.825, "blimp/accuracy/matrix_question_npi_licensor_present": 0.387, "blimp/accuracy/npi_present_1": 0.602, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.398, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.965, "blimp/accuracy/causative": 0.768, "blimp/accuracy/group_average": 0.795074626865672, "blimp/accuracy/seq_average": 0.7950746268656717, "cbt/accuracy/NE": 0.7876602564102564, "cbt/accuracy/V": 0.9292, "cbt/accuracy/CN": 0.8696, "cbt/accuracy/P": 0.9072, "cbt/accuracy/group_average": 0.8734150641025641, "cbt/accuracy/seq_average": 0.8734493797519007, "hellaswag/accuracy/val": 0.3370842461661024, "hellaswag/accuracy/group_average": 0.3370842461661024, "hellaswag/accuracy/seq_average": 0.3370842461661024, "piqa/accuracy/val": 0.6164309031556039, "piqa/accuracy/group_average": 0.6164309031556039, "piqa/accuracy/seq_average": 0.6164309031556039, "ai2arc/accuracy/ARC-Easy": 0.36659619450317127, "ai2arc/accuracy/ARC-Challenge": 0.2206008583690987, "ai2arc/accuracy/group_average": 0.293598526436135, "ai2arc/accuracy/seq_average": 0.31841359773371103, "race/accuracy/test/high": 0.2833047455688965, "race/accuracy/test/middle": 0.35863509749303624, "race/accuracy/group_average": 0.3209699215309664, "race/accuracy/seq_average": 0.3052290231049858, "siqa/accuracy/dev": 0.3741044012282497, "siqa/accuracy/group_average": 0.3741044012282497, "siqa/accuracy/seq_average": 0.3741044012282497, "commonsenseqa/accuracy/dev_rand_split": 0.26371826371826373, "commonsenseqa/accuracy/group_average": 0.26371826371826373, "commonsenseqa/accuracy/seq_average": 0.26371826371826373}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-140000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.3882894364614335, "val/accuracy": 0.5075392950148809, "val/perplexity": 10.894841675716926, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3193679714795223, "lambada/accuracy/total": 0.33656832298136646, "lambada/accuracy/openai_last_token": 0.7907608695652174, "lambada/perplexity": 7.657287389023157, "lambada/lm_loss": 2.9821723033212297, "lambada/lm_perplexity": 19.73063104033802, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4220538089981237, "mean_loss": 2.353828703970478, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.982, "blimp/accuracy/ellipsis_n_bar_1": 0.808, "blimp/accuracy/tough_vs_raising_2": 0.916, "blimp/accuracy/tough_vs_raising_1": 0.58, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.925, "blimp/accuracy/principle_A_reconstruction": 0.365, "blimp/accuracy/wh_vs_that_with_gap": 0.475, "blimp/accuracy/principle_A_domain_2": 0.887, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.912, "blimp/accuracy/principle_A_domain_3": 0.663, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.928, "blimp/accuracy/animate_subject_trans": 0.906, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.886, "blimp/accuracy/distractor_agreement_relative_clause": 0.646, "blimp/accuracy/transitive": 0.88, "blimp/accuracy/sentential_subject_island": 0.387, "blimp/accuracy/adjunct_island": 0.894, "blimp/accuracy/intransitive": 0.788, "blimp/accuracy/existential_there_subject_raising": 0.878, "blimp/accuracy/irregular_past_participle_adjectives": 0.975, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.653, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.343, "blimp/accuracy/only_npi_scope": 0.751, "blimp/accuracy/superlative_quantifiers_2": 0.884, "blimp/accuracy/passive_1": 0.91, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.908, "blimp/accuracy/inchoative": 0.612, "blimp/accuracy/anaphor_gender_agreement": 0.976, "blimp/accuracy/principle_A_c_command": 0.769, "blimp/accuracy/only_npi_licensor_present": 0.725, "blimp/accuracy/expletive_it_object_raising": 0.766, "blimp/accuracy/left_branch_island_simple_question": 0.728, "blimp/accuracy/wh_questions_subject_gap": 0.952, "blimp/accuracy/existential_there_quantifiers_2": 0.456, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.946, "blimp/accuracy/sentential_negation_npi_scope": 0.627, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.829, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.91, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.898, "blimp/accuracy/principle_A_case_2": 0.948, "blimp/accuracy/distractor_agreement_relational_noun": 0.889, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.996, "blimp/accuracy/superlative_quantifiers_1": 0.864, "blimp/accuracy/wh_island": 0.797, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.593, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.969, "blimp/accuracy/irregular_past_participle_verbs": 0.885, "blimp/accuracy/drop_argument": 0.774, "blimp/accuracy/wh_questions_object_gap": 0.85, "blimp/accuracy/animate_subject_passive": 0.81, "blimp/accuracy/existential_there_quantifiers_1": 0.99, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.916, "blimp/accuracy/npi_present_2": 0.565, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.946, "blimp/accuracy/anaphor_number_agreement": 0.995, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.963, "blimp/accuracy/existential_there_object_raising": 0.845, "blimp/accuracy/matrix_question_npi_licensor_present": 0.294, "blimp/accuracy/npi_present_1": 0.588, "blimp/accuracy/wh_vs_that_no_gap": 0.985, "blimp/accuracy/left_branch_island_echo_question": 0.45, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974, "blimp/accuracy/causative": 0.765, "blimp/accuracy/group_average": 0.8039402985074627, "blimp/accuracy/seq_average": 0.8039402985074627, "cbt/accuracy/NE": 0.796875, "cbt/accuracy/V": 0.9312, "cbt/accuracy/CN": 0.882, "cbt/accuracy/P": 0.91, "cbt/accuracy/group_average": 0.8800187500000001, "cbt/accuracy/seq_average": 0.8800520208083233, "hellaswag/accuracy/val": 0.34246166102370046, "hellaswag/accuracy/group_average": 0.34246166102370046, "hellaswag/accuracy/seq_average": 0.34246166102370046, "piqa/accuracy/val": 0.6229597388465724, "piqa/accuracy/group_average": 0.6229597388465724, "piqa/accuracy/seq_average": 0.6229597388465724, "ai2arc/accuracy/ARC-Easy": 0.3704016913319239, "ai2arc/accuracy/ARC-Challenge": 0.23433476394849787, "ai2arc/accuracy/group_average": 0.3023682276402109, "ai2arc/accuracy/seq_average": 0.3254957507082153, "race/accuracy/test/high": 0.28216123499142365, "race/accuracy/test/middle": 0.35724233983286907, "race/accuracy/group_average": 0.3197017874121464, "race/accuracy/seq_average": 0.3040129712201054, "siqa/accuracy/dev": 0.37871033776867963, "siqa/accuracy/group_average": 0.37871033776867963, "siqa/accuracy/seq_average": 0.37871033776867963, "commonsenseqa/accuracy/dev_rand_split": 0.2628992628992629, "commonsenseqa/accuracy/group_average": 0.2628992628992629, "commonsenseqa/accuracy/seq_average": 0.2628992628992629}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-160000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.369200933547247, "val/accuracy": 0.5100349547371031, "val/perplexity": 10.688847771076384, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.554166616119953, "lambada/accuracy/total": 0.3251164596273292, "lambada/accuracy/openai_last_token": 0.7880434782608695, "lambada/perplexity": 7.750025777457828, "lambada/lm_loss": 2.961524882507129, "lambada/lm_perplexity": 19.327421342929636, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.41757570718221615, "mean_loss": 2.4616837748336, "blimp/accuracy/passive_2": 0.901, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.805, "blimp/accuracy/tough_vs_raising_2": 0.917, "blimp/accuracy/tough_vs_raising_1": 0.574, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.928, "blimp/accuracy/principle_A_reconstruction": 0.292, "blimp/accuracy/wh_vs_that_with_gap": 0.45, "blimp/accuracy/principle_A_domain_2": 0.908, "blimp/accuracy/determiner_noun_agreement_1": 0.986, "blimp/accuracy/ellipsis_n_bar_2": 0.901, "blimp/accuracy/principle_A_domain_3": 0.632, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.919, "blimp/accuracy/animate_subject_trans": 0.908, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.896, "blimp/accuracy/distractor_agreement_relative_clause": 0.674, "blimp/accuracy/transitive": 0.878, "blimp/accuracy/sentential_subject_island": 0.386, "blimp/accuracy/adjunct_island": 0.878, "blimp/accuracy/intransitive": 0.764, "blimp/accuracy/existential_there_subject_raising": 0.893, "blimp/accuracy/irregular_past_participle_adjectives": 0.845, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.629, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.339, "blimp/accuracy/only_npi_scope": 0.772, "blimp/accuracy/superlative_quantifiers_2": 0.872, "blimp/accuracy/passive_1": 0.908, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.908, "blimp/accuracy/inchoative": 0.621, "blimp/accuracy/anaphor_gender_agreement": 0.974, "blimp/accuracy/principle_A_c_command": 0.735, "blimp/accuracy/only_npi_licensor_present": 0.383, "blimp/accuracy/expletive_it_object_raising": 0.752, "blimp/accuracy/left_branch_island_simple_question": 0.735, "blimp/accuracy/wh_questions_subject_gap": 0.923, "blimp/accuracy/existential_there_quantifiers_2": 0.446, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.946, "blimp/accuracy/sentential_negation_npi_scope": 0.697, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.84, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.889, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.918, "blimp/accuracy/principle_A_case_2": 0.954, "blimp/accuracy/distractor_agreement_relational_noun": 0.907, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.996, "blimp/accuracy/superlative_quantifiers_1": 0.812, "blimp/accuracy/wh_island": 0.811, "blimp/accuracy/principle_A_domain_1": 0.995, "blimp/accuracy/complex_NP_island": 0.539, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.968, "blimp/accuracy/irregular_past_participle_verbs": 0.903, "blimp/accuracy/drop_argument": 0.744, "blimp/accuracy/wh_questions_object_gap": 0.823, "blimp/accuracy/animate_subject_passive": 0.773, "blimp/accuracy/existential_there_quantifiers_1": 0.981, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.927, "blimp/accuracy/npi_present_2": 0.597, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.963, "blimp/accuracy/anaphor_number_agreement": 0.997, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.954, "blimp/accuracy/existential_there_object_raising": 0.863, "blimp/accuracy/matrix_question_npi_licensor_present": 0.382, "blimp/accuracy/npi_present_1": 0.519, "blimp/accuracy/wh_vs_that_no_gap": 0.985, "blimp/accuracy/left_branch_island_echo_question": 0.373, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.962, "blimp/accuracy/causative": 0.768, "blimp/accuracy/group_average": 0.7925671641791044, "blimp/accuracy/seq_average": 0.7925671641791044, "cbt/accuracy/NE": 0.8052884615384616, "cbt/accuracy/V": 0.9348, "cbt/accuracy/CN": 0.8716, "cbt/accuracy/P": 0.9108, "cbt/accuracy/group_average": 0.8806221153846154, "cbt/accuracy/seq_average": 0.8806522609043618, "hellaswag/accuracy/val": 0.34783907588129853, "hellaswag/accuracy/group_average": 0.34783907588129853, "hellaswag/accuracy/seq_average": 0.34783907588129853, "piqa/accuracy/val": 0.6365614798694232, "piqa/accuracy/group_average": 0.6365614798694232, "piqa/accuracy/seq_average": 0.6365614798694232, "ai2arc/accuracy/ARC-Easy": 0.37167019027484144, "ai2arc/accuracy/ARC-Challenge": 0.23004291845493563, "ai2arc/accuracy/group_average": 0.30085655436488856, "ai2arc/accuracy/seq_average": 0.32492917847025493, "race/accuracy/test/high": 0.28702115494568325, "race/accuracy/test/middle": 0.35097493036211697, "race/accuracy/group_average": 0.3189980426539001, "race/accuracy/seq_average": 0.3056343737332793, "siqa/accuracy/dev": 0.3766632548618219, "siqa/accuracy/group_average": 0.3766632548618219, "siqa/accuracy/seq_average": 0.3766632548618219, "commonsenseqa/accuracy/dev_rand_split": 0.2727272727272727, "commonsenseqa/accuracy/group_average": 0.2727272727272727, "commonsenseqa/accuracy/seq_average": 0.2727272727272727}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-180000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.3474213130890376, "val/accuracy": 0.5139693002852183, "val/perplexity": 10.458565555936703, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.356206100179542, "lambada/accuracy/total": 0.35267857142857145, "lambada/accuracy/openai_last_token": 0.796972049689441, "lambada/perplexity": 7.01501208120262, "lambada/lm_loss": 2.9519054483586618, "lambada/lm_perplexity": 19.14239384275662, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4333239358568949, "mean_loss": 2.3518137066342897, "blimp/accuracy/passive_2": 0.92, "blimp/accuracy/determiner_noun_agreement_2": 0.98, "blimp/accuracy/ellipsis_n_bar_1": 0.822, "blimp/accuracy/tough_vs_raising_2": 0.891, "blimp/accuracy/tough_vs_raising_1": 0.563, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.892, "blimp/accuracy/principle_A_reconstruction": 0.299, "blimp/accuracy/wh_vs_that_with_gap": 0.454, "blimp/accuracy/principle_A_domain_2": 0.915, "blimp/accuracy/determiner_noun_agreement_1": 0.986, "blimp/accuracy/ellipsis_n_bar_2": 0.903, "blimp/accuracy/principle_A_domain_3": 0.619, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.905, "blimp/accuracy/animate_subject_trans": 0.912, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.909, "blimp/accuracy/distractor_agreement_relative_clause": 0.652, "blimp/accuracy/transitive": 0.88, "blimp/accuracy/sentential_subject_island": 0.381, "blimp/accuracy/adjunct_island": 0.884, "blimp/accuracy/intransitive": 0.761, "blimp/accuracy/existential_there_subject_raising": 0.884, "blimp/accuracy/irregular_past_participle_adjectives": 0.886, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.651, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.356, "blimp/accuracy/only_npi_scope": 0.74, "blimp/accuracy/superlative_quantifiers_2": 0.827, "blimp/accuracy/passive_1": 0.899, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.911, "blimp/accuracy/inchoative": 0.591, "blimp/accuracy/anaphor_gender_agreement": 0.97, "blimp/accuracy/principle_A_c_command": 0.776, "blimp/accuracy/only_npi_licensor_present": 0.763, "blimp/accuracy/expletive_it_object_raising": 0.782, "blimp/accuracy/left_branch_island_simple_question": 0.732, "blimp/accuracy/wh_questions_subject_gap": 0.946, "blimp/accuracy/existential_there_quantifiers_2": 0.491, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.937, "blimp/accuracy/sentential_negation_npi_scope": 0.685, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.794, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.919, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.875, "blimp/accuracy/principle_A_case_2": 0.939, "blimp/accuracy/distractor_agreement_relational_noun": 0.842, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.991, "blimp/accuracy/superlative_quantifiers_1": 0.768, "blimp/accuracy/wh_island": 0.805, "blimp/accuracy/principle_A_domain_1": 0.992, "blimp/accuracy/complex_NP_island": 0.601, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.971, "blimp/accuracy/irregular_past_participle_verbs": 0.888, "blimp/accuracy/drop_argument": 0.739, "blimp/accuracy/wh_questions_object_gap": 0.829, "blimp/accuracy/animate_subject_passive": 0.803, "blimp/accuracy/existential_there_quantifiers_1": 0.988, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.906, "blimp/accuracy/npi_present_2": 0.554, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.932, "blimp/accuracy/anaphor_number_agreement": 0.993, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.956, "blimp/accuracy/existential_there_object_raising": 0.854, "blimp/accuracy/matrix_question_npi_licensor_present": 0.373, "blimp/accuracy/npi_present_1": 0.534, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.38, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.963, "blimp/accuracy/causative": 0.751, "blimp/accuracy/group_average": 0.7951791044776118, "blimp/accuracy/seq_average": 0.795179104477612, "cbt/accuracy/NE": 0.811698717948718, "cbt/accuracy/V": 0.9352, "cbt/accuracy/CN": 0.876, "cbt/accuracy/P": 0.9144, "cbt/accuracy/group_average": 0.8843246794871795, "cbt/accuracy/seq_average": 0.8843537414965986, "hellaswag/accuracy/val": 0.3516231826329416, "hellaswag/accuracy/group_average": 0.3516231826329416, "hellaswag/accuracy/seq_average": 0.3516231826329416, "piqa/accuracy/val": 0.6305767138193689, "piqa/accuracy/group_average": 0.6305767138193689, "piqa/accuracy/seq_average": 0.6305767138193689, "ai2arc/accuracy/ARC-Easy": 0.3767441860465116, "ai2arc/accuracy/ARC-Challenge": 0.2369098712446352, "ai2arc/accuracy/group_average": 0.30682702864557343, "ai2arc/accuracy/seq_average": 0.3305949008498584, "race/accuracy/test/high": 0.2861635220125786, "race/accuracy/test/middle": 0.3516713091922006, "race/accuracy/group_average": 0.3189174156023896, "race/accuracy/seq_average": 0.3052290231049858, "siqa/accuracy/dev": 0.3679631525076766, "siqa/accuracy/group_average": 0.3679631525076766, "siqa/accuracy/seq_average": 0.3679631525076766, "commonsenseqa/accuracy/dev_rand_split": 0.28173628173628174, "commonsenseqa/accuracy/group_average": 0.28173628173628174, "commonsenseqa/accuracy/seq_average": 0.28173628173628174}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-20000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.7139173235212053, "val/accuracy": 0.4618578714037698, "val/perplexity": 15.088265515379216, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6369841202445654, "lambada/accuracy/total": 0.21739130434782608, "lambada/accuracy/openai_last_token": 0.7451475155279503, "lambada/perplexity": 13.977310934439362, "lambada/lm_loss": 3.256185792739014, "lambada/lm_perplexity": 25.950368054758744, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.33962458787579797, "mean_loss": 2.6754507218828856, "blimp/accuracy/passive_2": 0.889, "blimp/accuracy/determiner_noun_agreement_2": 0.987, "blimp/accuracy/ellipsis_n_bar_1": 0.812, "blimp/accuracy/tough_vs_raising_2": 0.857, "blimp/accuracy/tough_vs_raising_1": 0.592, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.873, "blimp/accuracy/principle_A_reconstruction": 0.401, "blimp/accuracy/wh_vs_that_with_gap": 0.515, "blimp/accuracy/principle_A_domain_2": 0.835, "blimp/accuracy/determiner_noun_agreement_1": 0.978, "blimp/accuracy/ellipsis_n_bar_2": 0.885, "blimp/accuracy/principle_A_domain_3": 0.621, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.898, "blimp/accuracy/animate_subject_trans": 0.896, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.866, "blimp/accuracy/distractor_agreement_relative_clause": 0.564, "blimp/accuracy/transitive": 0.848, "blimp/accuracy/sentential_subject_island": 0.39, "blimp/accuracy/adjunct_island": 0.76, "blimp/accuracy/intransitive": 0.784, "blimp/accuracy/existential_there_subject_raising": 0.854, "blimp/accuracy/irregular_past_participle_adjectives": 0.872, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.271, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.257, "blimp/accuracy/only_npi_scope": 0.552, "blimp/accuracy/superlative_quantifiers_2": 0.817, "blimp/accuracy/passive_1": 0.899, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.897, "blimp/accuracy/inchoative": 0.623, "blimp/accuracy/anaphor_gender_agreement": 0.964, "blimp/accuracy/principle_A_c_command": 0.633, "blimp/accuracy/only_npi_licensor_present": 0.417, "blimp/accuracy/expletive_it_object_raising": 0.76, "blimp/accuracy/left_branch_island_simple_question": 0.368, "blimp/accuracy/wh_questions_subject_gap": 0.917, "blimp/accuracy/existential_there_quantifiers_2": 0.303, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.936, "blimp/accuracy/sentential_negation_npi_scope": 0.662, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.83, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.923, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.891, "blimp/accuracy/principle_A_case_2": 0.942, "blimp/accuracy/distractor_agreement_relational_noun": 0.754, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.998, "blimp/accuracy/superlative_quantifiers_1": 0.636, "blimp/accuracy/wh_island": 0.78, "blimp/accuracy/principle_A_domain_1": 0.992, "blimp/accuracy/complex_NP_island": 0.466, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.964, "blimp/accuracy/irregular_past_participle_verbs": 0.861, "blimp/accuracy/drop_argument": 0.77, "blimp/accuracy/wh_questions_object_gap": 0.777, "blimp/accuracy/animate_subject_passive": 0.793, "blimp/accuracy/existential_there_quantifiers_1": 0.972, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.871, "blimp/accuracy/npi_present_2": 0.654, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.916, "blimp/accuracy/anaphor_number_agreement": 0.986, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.944, "blimp/accuracy/existential_there_object_raising": 0.789, "blimp/accuracy/matrix_question_npi_licensor_present": 0.241, "blimp/accuracy/npi_present_1": 0.587, "blimp/accuracy/wh_vs_that_no_gap": 0.974, "blimp/accuracy/left_branch_island_echo_question": 0.409, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.977, "blimp/accuracy/causative": 0.721, "blimp/accuracy/group_average": 0.7562835820895522, "blimp/accuracy/seq_average": 0.7562835820895523, "cbt/accuracy/NE": 0.7560096153846154, "cbt/accuracy/V": 0.902, "cbt/accuracy/CN": 0.8056, "cbt/accuracy/P": 0.882, "cbt/accuracy/group_average": 0.8364024038461539, "cbt/accuracy/seq_average": 0.8364345738295318, "hellaswag/accuracy/val": 0.2884883489344752, "hellaswag/accuracy/group_average": 0.2884883489344752, "hellaswag/accuracy/seq_average": 0.2884883489344752, "piqa/accuracy/val": 0.573993471164309, "piqa/accuracy/group_average": 0.573993471164309, "piqa/accuracy/seq_average": 0.573993471164309, "ai2arc/accuracy/ARC-Easy": 0.3230443974630021, "ai2arc/accuracy/ARC-Challenge": 0.21373390557939914, "ai2arc/accuracy/group_average": 0.26838915152120063, "ai2arc/accuracy/seq_average": 0.2869688385269122, "race/accuracy/test/high": 0.2692967409948542, "race/accuracy/test/middle": 0.3426183844011142, "race/accuracy/group_average": 0.3059575626979842, "race/accuracy/seq_average": 0.29063640048642075, "siqa/accuracy/dev": 0.36284544524053225, "siqa/accuracy/group_average": 0.36284544524053225, "siqa/accuracy/seq_average": 0.36284544524053225, "commonsenseqa/accuracy/dev_rand_split": 0.24242424242424243, "commonsenseqa/accuracy/group_average": 0.24242424242424243, "commonsenseqa/accuracy/seq_average": 0.24242424242424243}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-200000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.332214597671751, "val/accuracy": 0.5167052253844246, "val/perplexity": 10.300728260837063, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3393784043211374, "lambada/accuracy/total": 0.35403726708074534, "lambada/accuracy/openai_last_token": 0.7932841614906833, "lambada/perplexity": 7.100537401243063, "lambada/lm_loss": 2.9399230612890586, "lambada/lm_perplexity": 18.914391007408764, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.435371246232585, "mean_loss": 2.3357965009964445, "blimp/accuracy/passive_2": 0.907, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.772, "blimp/accuracy/tough_vs_raising_2": 0.902, "blimp/accuracy/tough_vs_raising_1": 0.58, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.937, "blimp/accuracy/principle_A_reconstruction": 0.356, "blimp/accuracy/wh_vs_that_with_gap": 0.448, "blimp/accuracy/principle_A_domain_2": 0.885, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.904, "blimp/accuracy/principle_A_domain_3": 0.654, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.923, "blimp/accuracy/animate_subject_trans": 0.911, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.925, "blimp/accuracy/distractor_agreement_relative_clause": 0.683, "blimp/accuracy/transitive": 0.875, "blimp/accuracy/sentential_subject_island": 0.385, "blimp/accuracy/adjunct_island": 0.903, "blimp/accuracy/intransitive": 0.763, "blimp/accuracy/existential_there_subject_raising": 0.89, "blimp/accuracy/irregular_past_participle_adjectives": 0.989, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.689, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.327, "blimp/accuracy/only_npi_scope": 0.659, "blimp/accuracy/superlative_quantifiers_2": 0.838, "blimp/accuracy/passive_1": 0.907, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.923, "blimp/accuracy/inchoative": 0.599, "blimp/accuracy/anaphor_gender_agreement": 0.979, "blimp/accuracy/principle_A_c_command": 0.781, "blimp/accuracy/only_npi_licensor_present": 0.889, "blimp/accuracy/expletive_it_object_raising": 0.793, "blimp/accuracy/left_branch_island_simple_question": 0.804, "blimp/accuracy/wh_questions_subject_gap": 0.95, "blimp/accuracy/existential_there_quantifiers_2": 0.478, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.942, "blimp/accuracy/sentential_negation_npi_scope": 0.684, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.841, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.927, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.913, "blimp/accuracy/principle_A_case_2": 0.949, "blimp/accuracy/distractor_agreement_relational_noun": 0.885, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.996, "blimp/accuracy/superlative_quantifiers_1": 0.848, "blimp/accuracy/wh_island": 0.798, "blimp/accuracy/principle_A_domain_1": 0.992, "blimp/accuracy/complex_NP_island": 0.581, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.975, "blimp/accuracy/irregular_past_participle_verbs": 0.894, "blimp/accuracy/drop_argument": 0.745, "blimp/accuracy/wh_questions_object_gap": 0.836, "blimp/accuracy/animate_subject_passive": 0.804, "blimp/accuracy/existential_there_quantifiers_1": 0.977, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.929, "blimp/accuracy/npi_present_2": 0.565, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.935, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.967, "blimp/accuracy/existential_there_object_raising": 0.86, "blimp/accuracy/matrix_question_npi_licensor_present": 0.36, "blimp/accuracy/npi_present_1": 0.457, "blimp/accuracy/wh_vs_that_no_gap": 0.984, "blimp/accuracy/left_branch_island_echo_question": 0.507, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.787, "blimp/accuracy/group_average": 0.8072238805970149, "blimp/accuracy/seq_average": 0.8072238805970149, "cbt/accuracy/NE": 0.8129006410256411, "cbt/accuracy/V": 0.932, "cbt/accuracy/CN": 0.8796, "cbt/accuracy/P": 0.9252, "cbt/accuracy/group_average": 0.8874251602564103, "cbt/accuracy/seq_average": 0.8874549819927972, "hellaswag/accuracy/val": 0.3571997610037841, "hellaswag/accuracy/group_average": 0.3571997610037841, "hellaswag/accuracy/seq_average": 0.3571997610037841, "piqa/accuracy/val": 0.6360174102285092, "piqa/accuracy/group_average": 0.6360174102285092, "piqa/accuracy/seq_average": 0.6360174102285092, "ai2arc/accuracy/ARC-Easy": 0.38012684989429174, "ai2arc/accuracy/ARC-Challenge": 0.2351931330472103, "ai2arc/accuracy/group_average": 0.307659991470751, "ai2arc/accuracy/seq_average": 0.3322946175637394, "race/accuracy/test/high": 0.29073756432247, "race/accuracy/test/middle": 0.36142061281337046, "race/accuracy/group_average": 0.32607908856792023, "race/accuracy/seq_average": 0.31130928252938794, "siqa/accuracy/dev": 0.3735926305015353, "siqa/accuracy/group_average": 0.3735926305015353, "siqa/accuracy/seq_average": 0.3735926305015353, "commonsenseqa/accuracy/dev_rand_split": 0.276003276003276, "commonsenseqa/accuracy/group_average": 0.276003276003276, "commonsenseqa/accuracy/seq_average": 0.276003276003276}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-220000.pth.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.317931644500248,
|
| 3 |
+
"val/accuracy": 0.5181080651661706,
|
| 4 |
+
"val/perplexity": 10.154649145407666,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.397024592997865,
|
| 8 |
+
"lambada/accuracy/total": 0.34879658385093165,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.796583850931677,
|
| 10 |
+
"lambada/perplexity": 7.137130378722097,
|
| 11 |
+
"lambada/lm_loss": 2.9225259448144945,
|
| 12 |
+
"lambada/lm_perplexity": 18.588180928884224,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.4334523245085511,
|
| 16 |
+
"mean_loss": 2.3574781187490563,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.912,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.978,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.812,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.891,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.63,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.907,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.317,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.454,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.905,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.991,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.92,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.632,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.925,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.914,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.878,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.695,
|
| 33 |
+
"blimp/accuracy/transitive": 0.872,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.449,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.877,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.776,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.882,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.961,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.71,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 0.999,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.323,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.704,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.842,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.91,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.922,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.621,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.975,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.793,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.7,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.783,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.805,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.942,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.489,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.946,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.687,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.835,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.906,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.903,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.949,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.887,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.993,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.853,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.828,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.987,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.595,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.965,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.884,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.746,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.847,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.789,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.982,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.907,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.599,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.943,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.993,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.961,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.861,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.374,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.585,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.979,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.457,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.957,
|
| 83 |
+
"blimp/accuracy/causative": 0.765,
|
| 84 |
+
"blimp/accuracy/group_average": 0.806850746268657,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.8068507462686567,
|
| 86 |
+
"cbt/accuracy/NE": 0.8153044871794872,
|
| 87 |
+
"cbt/accuracy/V": 0.936,
|
| 88 |
+
"cbt/accuracy/CN": 0.8884,
|
| 89 |
+
"cbt/accuracy/P": 0.9192,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8897261217948718,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8897559023609444,
|
| 92 |
+
"hellaswag/accuracy/val": 0.35909181437960563,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.35909181437960563,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.35909181437960563,
|
| 95 |
+
"piqa/accuracy/val": 0.6398258977149075,
|
| 96 |
+
"piqa/accuracy/group_average": 0.6398258977149075,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.6398258977149075,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.3788583509513742,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.21888412017167383,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.298871235561524,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.32606232294617565,
|
| 102 |
+
"race/accuracy/test/high": 0.2935963407661521,
|
| 103 |
+
"race/accuracy/test/middle": 0.3649025069637883,
|
| 104 |
+
"race/accuracy/group_average": 0.3292494238649702,
|
| 105 |
+
"race/accuracy/seq_average": 0.31434941224158897,
|
| 106 |
+
"siqa/accuracy/dev": 0.37001023541453426,
|
| 107 |
+
"siqa/accuracy/group_average": 0.37001023541453426,
|
| 108 |
+
"siqa/accuracy/seq_average": 0.37001023541453426,
|
| 109 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.2751842751842752,
|
| 110 |
+
"commonsenseqa/accuracy/group_average": 0.2751842751842752,
|
| 111 |
+
"commonsenseqa/accuracy/seq_average": 0.2751842751842752
|
| 112 |
+
}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-240000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.306334722609747, "val/accuracy": 0.5194605267237103, "val/perplexity": 10.037566682715298, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3432840856706134, "lambada/accuracy/total": 0.3728649068322981, "lambada/accuracy/openai_last_token": 0.8051242236024845, "lambada/perplexity": 6.4506670530415, "lambada/lm_loss": 2.9072434904863775, "lambada/lm_perplexity": 18.30626755373371, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.44616271677800423, "mean_loss": 2.32480940414018, "blimp/accuracy/passive_2": 0.904, "blimp/accuracy/determiner_noun_agreement_2": 0.979, "blimp/accuracy/ellipsis_n_bar_1": 0.804, "blimp/accuracy/tough_vs_raising_2": 0.886, "blimp/accuracy/tough_vs_raising_1": 0.553, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.907, "blimp/accuracy/principle_A_reconstruction": 0.354, "blimp/accuracy/wh_vs_that_with_gap": 0.421, "blimp/accuracy/principle_A_domain_2": 0.914, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.901, "blimp/accuracy/principle_A_domain_3": 0.633, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.917, "blimp/accuracy/animate_subject_trans": 0.914, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.892, "blimp/accuracy/distractor_agreement_relative_clause": 0.718, "blimp/accuracy/transitive": 0.883, "blimp/accuracy/sentential_subject_island": 0.394, "blimp/accuracy/adjunct_island": 0.879, "blimp/accuracy/intransitive": 0.774, "blimp/accuracy/existential_there_subject_raising": 0.887, "blimp/accuracy/irregular_past_participle_adjectives": 0.884, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.685, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.328, "blimp/accuracy/only_npi_scope": 0.705, "blimp/accuracy/superlative_quantifiers_2": 0.828, "blimp/accuracy/passive_1": 0.903, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.914, "blimp/accuracy/inchoative": 0.627, "blimp/accuracy/anaphor_gender_agreement": 0.979, "blimp/accuracy/principle_A_c_command": 0.802, "blimp/accuracy/only_npi_licensor_present": 0.702, "blimp/accuracy/expletive_it_object_raising": 0.782, "blimp/accuracy/left_branch_island_simple_question": 0.748, "blimp/accuracy/wh_questions_subject_gap": 0.935, "blimp/accuracy/existential_there_quantifiers_2": 0.561, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.951, "blimp/accuracy/sentential_negation_npi_scope": 0.727, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.816, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.918, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.914, "blimp/accuracy/principle_A_case_2": 0.96, "blimp/accuracy/distractor_agreement_relational_noun": 0.889, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.888, "blimp/accuracy/wh_island": 0.815, "blimp/accuracy/principle_A_domain_1": 0.993, "blimp/accuracy/complex_NP_island": 0.606, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.965, "blimp/accuracy/irregular_past_participle_verbs": 0.9, "blimp/accuracy/drop_argument": 0.744, "blimp/accuracy/wh_questions_object_gap": 0.843, "blimp/accuracy/animate_subject_passive": 0.79, "blimp/accuracy/existential_there_quantifiers_1": 0.975, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.921, "blimp/accuracy/npi_present_2": 0.615, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.937, "blimp/accuracy/anaphor_number_agreement": 0.993, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.963, "blimp/accuracy/existential_there_object_raising": 0.859, "blimp/accuracy/matrix_question_npi_licensor_present": 0.358, "blimp/accuracy/npi_present_1": 0.564, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.469, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.964, "blimp/accuracy/causative": 0.764, "blimp/accuracy/group_average": 0.8053432835820895, "blimp/accuracy/seq_average": 0.8053432835820895, "cbt/accuracy/NE": 0.8145032051282052, "cbt/accuracy/V": 0.9412, "cbt/accuracy/CN": 0.8928, "cbt/accuracy/P": 0.9252, "cbt/accuracy/group_average": 0.8934258012820513, "cbt/accuracy/seq_average": 0.8934573829531812, "hellaswag/accuracy/val": 0.3609838677554272, "hellaswag/accuracy/group_average": 0.3609838677554272, "hellaswag/accuracy/seq_average": 0.3609838677554272, "piqa/accuracy/val": 0.6447225244831338, "piqa/accuracy/group_average": 0.6447225244831338, "piqa/accuracy/seq_average": 0.6447225244831338, "ai2arc/accuracy/ARC-Easy": 0.39027484143763214, "ai2arc/accuracy/ARC-Challenge": 0.22918454935622318, "ai2arc/accuracy/group_average": 0.30972969539692763, "ai2arc/accuracy/seq_average": 0.3371104815864023, "race/accuracy/test/high": 0.29445397369925674, "race/accuracy/test/middle": 0.3649025069637883, "race/accuracy/group_average": 0.3296782403315225, "race/accuracy/seq_average": 0.31495743818402916, "siqa/accuracy/dev": 0.3710337768679631, "siqa/accuracy/group_average": 0.3710337768679631, "siqa/accuracy/seq_average": 0.3710337768679631, "commonsenseqa/accuracy/dev_rand_split": 0.276003276003276, "commonsenseqa/accuracy/group_average": 0.276003276003276, "commonsenseqa/accuracy/seq_average": 0.276003276003276}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-260000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.2931736537388394, "val/accuracy": 0.5219619993179564, "val/perplexity": 9.906327097288369, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3381419685316382, "lambada/accuracy/total": 0.36180124223602483, "lambada/accuracy/openai_last_token": 0.8014363354037267, "lambada/perplexity": 6.719970222173125, "lambada/lm_loss": 2.882655370309452, "lambada/lm_perplexity": 17.861639531468363, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4418816207769906, "mean_loss": 2.315657811135239, "blimp/accuracy/passive_2": 0.924, "blimp/accuracy/determiner_noun_agreement_2": 0.98, "blimp/accuracy/ellipsis_n_bar_1": 0.799, "blimp/accuracy/tough_vs_raising_2": 0.889, "blimp/accuracy/tough_vs_raising_1": 0.602, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.918, "blimp/accuracy/principle_A_reconstruction": 0.357, "blimp/accuracy/wh_vs_that_with_gap": 0.431, "blimp/accuracy/principle_A_domain_2": 0.904, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.9, "blimp/accuracy/principle_A_domain_3": 0.652, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.919, "blimp/accuracy/animate_subject_trans": 0.916, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.903, "blimp/accuracy/distractor_agreement_relative_clause": 0.691, "blimp/accuracy/transitive": 0.875, "blimp/accuracy/sentential_subject_island": 0.361, "blimp/accuracy/adjunct_island": 0.889, "blimp/accuracy/intransitive": 0.777, "blimp/accuracy/existential_there_subject_raising": 0.896, "blimp/accuracy/irregular_past_participle_adjectives": 0.973, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.779, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.326, "blimp/accuracy/only_npi_scope": 0.739, "blimp/accuracy/superlative_quantifiers_2": 0.793, "blimp/accuracy/passive_1": 0.917, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.92, "blimp/accuracy/inchoative": 0.629, "blimp/accuracy/anaphor_gender_agreement": 0.983, "blimp/accuracy/principle_A_c_command": 0.793, "blimp/accuracy/only_npi_licensor_present": 0.64, "blimp/accuracy/expletive_it_object_raising": 0.777, "blimp/accuracy/left_branch_island_simple_question": 0.849, "blimp/accuracy/wh_questions_subject_gap": 0.934, "blimp/accuracy/existential_there_quantifiers_2": 0.501, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.946, "blimp/accuracy/sentential_negation_npi_scope": 0.704, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.836, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.921, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.9, "blimp/accuracy/principle_A_case_2": 0.942, "blimp/accuracy/distractor_agreement_relational_noun": 0.867, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.99, "blimp/accuracy/superlative_quantifiers_1": 0.882, "blimp/accuracy/wh_island": 0.865, "blimp/accuracy/principle_A_domain_1": 0.991, "blimp/accuracy/complex_NP_island": 0.575, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.963, "blimp/accuracy/irregular_past_participle_verbs": 0.866, "blimp/accuracy/drop_argument": 0.748, "blimp/accuracy/wh_questions_object_gap": 0.825, "blimp/accuracy/animate_subject_passive": 0.796, "blimp/accuracy/existential_there_quantifiers_1": 0.982, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.916, "blimp/accuracy/npi_present_2": 0.631, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.951, "blimp/accuracy/anaphor_number_agreement": 0.987, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.961, "blimp/accuracy/existential_there_object_raising": 0.844, "blimp/accuracy/matrix_question_npi_licensor_present": 0.396, "blimp/accuracy/npi_present_1": 0.585, "blimp/accuracy/wh_vs_that_no_gap": 0.984, "blimp/accuracy/left_branch_island_echo_question": 0.505, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974, "blimp/accuracy/causative": 0.792, "blimp/accuracy/group_average": 0.8097164179104479, "blimp/accuracy/seq_average": 0.8097164179104478, "cbt/accuracy/NE": 0.8249198717948718, "cbt/accuracy/V": 0.94, "cbt/accuracy/CN": 0.8904, "cbt/accuracy/P": 0.9208, "cbt/accuracy/group_average": 0.8940299679487179, "cbt/accuracy/seq_average": 0.8940576230492197, "hellaswag/accuracy/val": 0.3678550089623581, "hellaswag/accuracy/group_average": 0.3678550089623581, "hellaswag/accuracy/seq_average": 0.3678550089623581, "piqa/accuracy/val": 0.6349292709466812, "piqa/accuracy/group_average": 0.6349292709466812, "piqa/accuracy/seq_average": 0.6349292709466812, "ai2arc/accuracy/ARC-Easy": 0.386892177589852, "ai2arc/accuracy/ARC-Challenge": 0.23090128755364808, "ai2arc/accuracy/group_average": 0.30889673257175004, "ai2arc/accuracy/seq_average": 0.33541076487252125, "race/accuracy/test/high": 0.29130931961120643, "race/accuracy/test/middle": 0.3732590529247911, "race/accuracy/group_average": 0.33228418626799877, "race/accuracy/seq_average": 0.3151601134981759, "siqa/accuracy/dev": 0.3741044012282497, "siqa/accuracy/group_average": 0.3741044012282497, "siqa/accuracy/seq_average": 0.3741044012282497, "commonsenseqa/accuracy/dev_rand_split": 0.266994266994267, "commonsenseqa/accuracy/group_average": 0.266994266994267, "commonsenseqa/accuracy/seq_average": 0.266994266994267}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-280000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.2802521236359126, "val/accuracy": 0.5239093114459326, "val/perplexity": 9.779145652500862, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3465940108210406, "lambada/accuracy/total": 0.3779114906832298, "lambada/accuracy/openai_last_token": 0.8010481366459627, "lambada/perplexity": 6.39241567863501, "lambada/lm_loss": 2.874058695101024, "lambada/lm_perplexity": 17.708746943285853, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4509104010645812, "mean_loss": 2.3134230672284763, "blimp/accuracy/passive_2": 0.91, "blimp/accuracy/determiner_noun_agreement_2": 0.983, "blimp/accuracy/ellipsis_n_bar_1": 0.823, "blimp/accuracy/tough_vs_raising_2": 0.875, "blimp/accuracy/tough_vs_raising_1": 0.625, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.873, "blimp/accuracy/principle_A_reconstruction": 0.354, "blimp/accuracy/wh_vs_that_with_gap": 0.422, "blimp/accuracy/principle_A_domain_2": 0.909, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.915, "blimp/accuracy/principle_A_domain_3": 0.662, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.917, "blimp/accuracy/animate_subject_trans": 0.9, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.895, "blimp/accuracy/distractor_agreement_relative_clause": 0.7, "blimp/accuracy/transitive": 0.88, "blimp/accuracy/sentential_subject_island": 0.38, "blimp/accuracy/adjunct_island": 0.888, "blimp/accuracy/intransitive": 0.769, "blimp/accuracy/existential_there_subject_raising": 0.877, "blimp/accuracy/irregular_past_participle_adjectives": 0.948, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.753, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.34, "blimp/accuracy/only_npi_scope": 0.687, "blimp/accuracy/superlative_quantifiers_2": 0.888, "blimp/accuracy/passive_1": 0.909, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.917, "blimp/accuracy/inchoative": 0.602, "blimp/accuracy/anaphor_gender_agreement": 0.982, "blimp/accuracy/principle_A_c_command": 0.774, "blimp/accuracy/only_npi_licensor_present": 0.812, "blimp/accuracy/expletive_it_object_raising": 0.764, "blimp/accuracy/left_branch_island_simple_question": 0.812, "blimp/accuracy/wh_questions_subject_gap": 0.935, "blimp/accuracy/existential_there_quantifiers_2": 0.519, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.949, "blimp/accuracy/sentential_negation_npi_scope": 0.692, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.837, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.922, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.909, "blimp/accuracy/principle_A_case_2": 0.944, "blimp/accuracy/distractor_agreement_relational_noun": 0.855, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.886, "blimp/accuracy/wh_island": 0.843, "blimp/accuracy/principle_A_domain_1": 0.993, "blimp/accuracy/complex_NP_island": 0.598, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.968, "blimp/accuracy/irregular_past_participle_verbs": 0.901, "blimp/accuracy/drop_argument": 0.746, "blimp/accuracy/wh_questions_object_gap": 0.845, "blimp/accuracy/animate_subject_passive": 0.785, "blimp/accuracy/existential_there_quantifiers_1": 0.982, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.899, "blimp/accuracy/npi_present_2": 0.594, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.956, "blimp/accuracy/anaphor_number_agreement": 0.995, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.963, "blimp/accuracy/existential_there_object_raising": 0.83, "blimp/accuracy/matrix_question_npi_licensor_present": 0.398, "blimp/accuracy/npi_present_1": 0.558, "blimp/accuracy/wh_vs_that_no_gap": 0.991, "blimp/accuracy/left_branch_island_echo_question": 0.509, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.779, "blimp/accuracy/group_average": 0.8101044776119404, "blimp/accuracy/seq_average": 0.8101044776119403, "cbt/accuracy/NE": 0.8217147435897436, "cbt/accuracy/V": 0.9408, "cbt/accuracy/CN": 0.8908, "cbt/accuracy/P": 0.9252, "cbt/accuracy/group_average": 0.8946286858974359, "cbt/accuracy/seq_average": 0.8946578631452581, "hellaswag/accuracy/val": 0.3662617008564031, "hellaswag/accuracy/group_average": 0.3662617008564031, "hellaswag/accuracy/seq_average": 0.3662617008564031, "piqa/accuracy/val": 0.6376496191512514, "piqa/accuracy/group_average": 0.6376496191512514, "piqa/accuracy/seq_average": 0.6376496191512514, "ai2arc/accuracy/ARC-Easy": 0.38012684989429174, "ai2arc/accuracy/ARC-Challenge": 0.2334763948497854, "ai2arc/accuracy/group_average": 0.30680162237203856, "ai2arc/accuracy/seq_average": 0.33172804532577904, "race/accuracy/test/high": 0.292166952544311, "race/accuracy/test/middle": 0.366991643454039, "race/accuracy/group_average": 0.329579297999175, "race/accuracy/seq_average": 0.3139440616132955, "siqa/accuracy/dev": 0.37308085977482086, "siqa/accuracy/group_average": 0.37308085977482086, "siqa/accuracy/seq_average": 0.37308085977482086, "commonsenseqa/accuracy/dev_rand_split": 0.27354627354627353, "commonsenseqa/accuracy/group_average": 0.27354627354627353, "commonsenseqa/accuracy/seq_average": 0.27354627354627353}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-300000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.270780775282118, "val/accuracy": 0.5250631665426587, "val/perplexity": 9.686961202072142, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3459457492236027, "lambada/accuracy/total": 0.36781832298136646, "lambada/accuracy/openai_last_token": 0.8027950310559007, "lambada/perplexity": 6.54824316019997, "lambada/lm_loss": 2.8713688783752356, "lambada/lm_perplexity": 17.66117766456657, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.44644074476201256, "mean_loss": 2.3083632622528603, "blimp/accuracy/passive_2": 0.915, "blimp/accuracy/determiner_noun_agreement_2": 0.982, "blimp/accuracy/ellipsis_n_bar_1": 0.807, "blimp/accuracy/tough_vs_raising_2": 0.872, "blimp/accuracy/tough_vs_raising_1": 0.594, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.905, "blimp/accuracy/principle_A_reconstruction": 0.334, "blimp/accuracy/wh_vs_that_with_gap": 0.454, "blimp/accuracy/principle_A_domain_2": 0.899, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.905, "blimp/accuracy/principle_A_domain_3": 0.664, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.912, "blimp/accuracy/animate_subject_trans": 0.913, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.9, "blimp/accuracy/distractor_agreement_relative_clause": 0.692, "blimp/accuracy/transitive": 0.876, "blimp/accuracy/sentential_subject_island": 0.407, "blimp/accuracy/adjunct_island": 0.849, "blimp/accuracy/intransitive": 0.799, "blimp/accuracy/existential_there_subject_raising": 0.871, "blimp/accuracy/irregular_past_participle_adjectives": 0.957, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.785, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.358, "blimp/accuracy/only_npi_scope": 0.726, "blimp/accuracy/superlative_quantifiers_2": 0.858, "blimp/accuracy/passive_1": 0.892, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.929, "blimp/accuracy/inchoative": 0.631, "blimp/accuracy/anaphor_gender_agreement": 0.978, "blimp/accuracy/principle_A_c_command": 0.794, "blimp/accuracy/only_npi_licensor_present": 0.696, "blimp/accuracy/expletive_it_object_raising": 0.784, "blimp/accuracy/left_branch_island_simple_question": 0.846, "blimp/accuracy/wh_questions_subject_gap": 0.926, "blimp/accuracy/existential_there_quantifiers_2": 0.471, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.95, "blimp/accuracy/sentential_negation_npi_scope": 0.685, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.793, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.91, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.901, "blimp/accuracy/principle_A_case_2": 0.948, "blimp/accuracy/distractor_agreement_relational_noun": 0.858, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.993, "blimp/accuracy/superlative_quantifiers_1": 0.886, "blimp/accuracy/wh_island": 0.808, "blimp/accuracy/principle_A_domain_1": 0.995, "blimp/accuracy/complex_NP_island": 0.589, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.961, "blimp/accuracy/irregular_past_participle_verbs": 0.889, "blimp/accuracy/drop_argument": 0.74, "blimp/accuracy/wh_questions_object_gap": 0.832, "blimp/accuracy/animate_subject_passive": 0.789, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.913, "blimp/accuracy/npi_present_2": 0.532, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.954, "blimp/accuracy/anaphor_number_agreement": 0.994, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.966, "blimp/accuracy/existential_there_object_raising": 0.839, "blimp/accuracy/matrix_question_npi_licensor_present": 0.382, "blimp/accuracy/npi_present_1": 0.533, "blimp/accuracy/wh_vs_that_no_gap": 0.985, "blimp/accuracy/left_branch_island_echo_question": 0.458, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.957, "blimp/accuracy/causative": 0.766, "blimp/accuracy/group_average": 0.8054029850746268, "blimp/accuracy/seq_average": 0.8054029850746268, "cbt/accuracy/NE": 0.8197115384615384, "cbt/accuracy/V": 0.9476, "cbt/accuracy/CN": 0.8924, "cbt/accuracy/P": 0.9204, "cbt/accuracy/group_average": 0.8950278846153845, "cbt/accuracy/seq_average": 0.8950580232092837, "hellaswag/accuracy/val": 0.371539533957379, "hellaswag/accuracy/group_average": 0.371539533957379, "hellaswag/accuracy/seq_average": 0.371539533957379, "piqa/accuracy/val": 0.6517954298150164, "piqa/accuracy/group_average": 0.6517954298150164, "piqa/accuracy/seq_average": 0.6517954298150164, "ai2arc/accuracy/ARC-Easy": 0.3919661733615222, "ai2arc/accuracy/ARC-Challenge": 0.23433476394849787, "ai2arc/accuracy/group_average": 0.31315046865501, "ai2arc/accuracy/seq_average": 0.33994334277620397, "race/accuracy/test/high": 0.2993138936535163, "race/accuracy/test/middle": 0.366991643454039, "race/accuracy/group_average": 0.3331527685537776, "race/accuracy/seq_average": 0.31901094446696393, "siqa/accuracy/dev": 0.3781985670419652, "siqa/accuracy/group_average": 0.3781985670419652, "siqa/accuracy/seq_average": 0.3781985670419652, "commonsenseqa/accuracy/dev_rand_split": 0.2800982800982801, "commonsenseqa/accuracy/group_average": 0.2800982800982801, "commonsenseqa/accuracy/seq_average": 0.2800982800982801}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-320000.pth.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.2632015167720736,
|
| 3 |
+
"val/accuracy": 0.5266791449652778,
|
| 4 |
+
"val/perplexity": 9.613818751853643,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.440001304105202,
|
| 8 |
+
"lambada/accuracy/total": 0.37558229813664595,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.8076475155279503,
|
| 10 |
+
"lambada/perplexity": 6.371780572565018,
|
| 11 |
+
"lambada/lm_loss": 2.8695685083486464,
|
| 12 |
+
"lambada/lm_perplexity": 17.62940961536771,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.4511307215509619,
|
| 16 |
+
"mean_loss": 2.351601410438638,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.915,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.983,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.807,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.9,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.581,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.921,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.344,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.435,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.889,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.99,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.909,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.666,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.922,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.912,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.901,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.689,
|
| 33 |
+
"blimp/accuracy/transitive": 0.888,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.412,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.893,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.768,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.887,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.917,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.737,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.34,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.672,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.863,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.909,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.929,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.622,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.983,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.783,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.623,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.77,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.823,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.944,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.519,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.95,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.704,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.851,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.936,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.899,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.947,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.86,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.988,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.859,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.844,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.992,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.624,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.958,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.909,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.744,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.854,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.784,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.982,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.917,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.583,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.947,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.993,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.961,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.867,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.407,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.56,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.986,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.486,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.978,
|
| 83 |
+
"blimp/accuracy/causative": 0.775,
|
| 84 |
+
"blimp/accuracy/group_average": 0.808820895522388,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.808820895522388,
|
| 86 |
+
"cbt/accuracy/NE": 0.8201121794871795,
|
| 87 |
+
"cbt/accuracy/V": 0.9472,
|
| 88 |
+
"cbt/accuracy/CN": 0.8932,
|
| 89 |
+
"cbt/accuracy/P": 0.928,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8971280448717949,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8971588635454182,
|
| 92 |
+
"hellaswag/accuracy/val": 0.37402907787293366,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.37402907787293366,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.37402907787293366,
|
| 95 |
+
"piqa/accuracy/val": 0.6479869423286181,
|
| 96 |
+
"piqa/accuracy/group_average": 0.6479869423286181,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.6479869423286181,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.39238900634249474,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.24120171673819743,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.3167953615403461,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.3424929178470255,
|
| 102 |
+
"race/accuracy/test/high": 0.29845626072041165,
|
| 103 |
+
"race/accuracy/test/middle": 0.366991643454039,
|
| 104 |
+
"race/accuracy/group_average": 0.3327239520872253,
|
| 105 |
+
"race/accuracy/seq_average": 0.31840291852452374,
|
| 106 |
+
"siqa/accuracy/dev": 0.37308085977482086,
|
| 107 |
+
"siqa/accuracy/group_average": 0.37308085977482086,
|
| 108 |
+
"siqa/accuracy/seq_average": 0.37308085977482086,
|
| 109 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.27927927927927926,
|
| 110 |
+
"commonsenseqa/accuracy/group_average": 0.27927927927927926,
|
| 111 |
+
"commonsenseqa/accuracy/seq_average": 0.27927927927927926
|
| 112 |
+
}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-340000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.254084995814732, "val/accuracy": 0.5276237366691469, "val/perplexity": 9.526572467272532, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.395876487589771, "lambada/accuracy/total": 0.40217391304347827, "lambada/accuracy/openai_last_token": 0.8086180124223602, "lambada/perplexity": 6.111677655793456, "lambada/lm_loss": 2.857826292840635, "lambada/lm_perplexity": 17.42361191347214, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.46489882485631256, "mean_loss": 2.3249807417022517, "blimp/accuracy/passive_2": 0.907, "blimp/accuracy/determiner_noun_agreement_2": 0.979, "blimp/accuracy/ellipsis_n_bar_1": 0.812, "blimp/accuracy/tough_vs_raising_2": 0.884, "blimp/accuracy/tough_vs_raising_1": 0.572, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.923, "blimp/accuracy/principle_A_reconstruction": 0.359, "blimp/accuracy/wh_vs_that_with_gap": 0.412, "blimp/accuracy/principle_A_domain_2": 0.902, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.901, "blimp/accuracy/principle_A_domain_3": 0.673, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.93, "blimp/accuracy/animate_subject_trans": 0.91, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.913, "blimp/accuracy/distractor_agreement_relative_clause": 0.678, "blimp/accuracy/transitive": 0.88, "blimp/accuracy/sentential_subject_island": 0.401, "blimp/accuracy/adjunct_island": 0.889, "blimp/accuracy/intransitive": 0.769, "blimp/accuracy/existential_there_subject_raising": 0.877, "blimp/accuracy/irregular_past_participle_adjectives": 0.947, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.786, "blimp/accuracy/principle_A_case_1": 0.999, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.341, "blimp/accuracy/only_npi_scope": 0.741, "blimp/accuracy/superlative_quantifiers_2": 0.918, "blimp/accuracy/passive_1": 0.907, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.926, "blimp/accuracy/inchoative": 0.602, "blimp/accuracy/anaphor_gender_agreement": 0.982, "blimp/accuracy/principle_A_c_command": 0.785, "blimp/accuracy/only_npi_licensor_present": 0.597, "blimp/accuracy/expletive_it_object_raising": 0.78, "blimp/accuracy/left_branch_island_simple_question": 0.844, "blimp/accuracy/wh_questions_subject_gap": 0.931, "blimp/accuracy/existential_there_quantifiers_2": 0.565, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.952, "blimp/accuracy/sentential_negation_npi_scope": 0.709, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.828, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.909, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.926, "blimp/accuracy/principle_A_case_2": 0.937, "blimp/accuracy/distractor_agreement_relational_noun": 0.855, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.99, "blimp/accuracy/superlative_quantifiers_1": 0.882, "blimp/accuracy/wh_island": 0.847, "blimp/accuracy/principle_A_domain_1": 0.994, "blimp/accuracy/complex_NP_island": 0.605, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.969, "blimp/accuracy/irregular_past_participle_verbs": 0.899, "blimp/accuracy/drop_argument": 0.734, "blimp/accuracy/wh_questions_object_gap": 0.855, "blimp/accuracy/animate_subject_passive": 0.79, "blimp/accuracy/existential_there_quantifiers_1": 0.976, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.92, "blimp/accuracy/npi_present_2": 0.571, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.959, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.965, "blimp/accuracy/existential_there_object_raising": 0.852, "blimp/accuracy/matrix_question_npi_licensor_present": 0.418, "blimp/accuracy/npi_present_1": 0.536, "blimp/accuracy/wh_vs_that_no_gap": 0.987, "blimp/accuracy/left_branch_island_echo_question": 0.512, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.973, "blimp/accuracy/causative": 0.782, "blimp/accuracy/group_average": 0.8110298507462687, "blimp/accuracy/seq_average": 0.8110298507462687, "cbt/accuracy/NE": 0.827323717948718, "cbt/accuracy/V": 0.9468, "cbt/accuracy/CN": 0.9028, "cbt/accuracy/P": 0.9276, "cbt/accuracy/group_average": 0.9011309294871794, "cbt/accuracy/seq_average": 0.9011604641856743, "hellaswag/accuracy/val": 0.37731527584146585, "hellaswag/accuracy/group_average": 0.37731527584146585, "hellaswag/accuracy/seq_average": 0.37731527584146585, "piqa/accuracy/val": 0.6501632208922742, "piqa/accuracy/group_average": 0.6501632208922742, "piqa/accuracy/seq_average": 0.6501632208922742, "ai2arc/accuracy/ARC-Easy": 0.39365750528541227, "ai2arc/accuracy/ARC-Challenge": 0.24206008583690988, "ai2arc/accuracy/group_average": 0.3178587955611611, "ai2arc/accuracy/seq_average": 0.3436260623229462, "race/accuracy/test/high": 0.29874213836477986, "race/accuracy/test/middle": 0.37813370473537605, "race/accuracy/group_average": 0.33843792155007796, "race/accuracy/seq_average": 0.32184839886501826, "siqa/accuracy/dev": 0.37871033776867963, "siqa/accuracy/group_average": 0.37871033776867963, "siqa/accuracy/seq_average": 0.37871033776867963, "commonsenseqa/accuracy/dev_rand_split": 0.27927927927927926, "commonsenseqa/accuracy/group_average": 0.27927927927927926, "commonsenseqa/accuracy/seq_average": 0.27927927927927926}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-360000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.2471793038504466, "val/accuracy": 0.5292038690476191, "val/perplexity": 9.461011524717346, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.22508552503882, "lambada/accuracy/total": 0.38392857142857145, "lambada/accuracy/openai_last_token": 0.8062888198757764, "lambada/perplexity": 6.183829631721718, "lambada/lm_loss": 2.8422305940330594, "lambada/lm_perplexity": 17.153986473473022, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.45656622023809523, "mean_loss": 2.2361324144446333, "blimp/accuracy/passive_2": 0.907, "blimp/accuracy/determiner_noun_agreement_2": 0.983, "blimp/accuracy/ellipsis_n_bar_1": 0.819, "blimp/accuracy/tough_vs_raising_2": 0.909, "blimp/accuracy/tough_vs_raising_1": 0.588, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.924, "blimp/accuracy/principle_A_reconstruction": 0.355, "blimp/accuracy/wh_vs_that_with_gap": 0.443, "blimp/accuracy/principle_A_domain_2": 0.904, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.913, "blimp/accuracy/principle_A_domain_3": 0.665, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.914, "blimp/accuracy/animate_subject_trans": 0.908, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.909, "blimp/accuracy/distractor_agreement_relative_clause": 0.674, "blimp/accuracy/transitive": 0.878, "blimp/accuracy/sentential_subject_island": 0.36, "blimp/accuracy/adjunct_island": 0.87, "blimp/accuracy/intransitive": 0.79, "blimp/accuracy/existential_there_subject_raising": 0.883, "blimp/accuracy/irregular_past_participle_adjectives": 0.912, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.772, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.377, "blimp/accuracy/only_npi_scope": 0.693, "blimp/accuracy/superlative_quantifiers_2": 0.857, "blimp/accuracy/passive_1": 0.893, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.93, "blimp/accuracy/inchoative": 0.638, "blimp/accuracy/anaphor_gender_agreement": 0.982, "blimp/accuracy/principle_A_c_command": 0.807, "blimp/accuracy/only_npi_licensor_present": 0.697, "blimp/accuracy/expletive_it_object_raising": 0.762, "blimp/accuracy/left_branch_island_simple_question": 0.833, "blimp/accuracy/wh_questions_subject_gap": 0.935, "blimp/accuracy/existential_there_quantifiers_2": 0.49, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.95, "blimp/accuracy/sentential_negation_npi_scope": 0.708, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.821, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.91, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.911, "blimp/accuracy/principle_A_case_2": 0.938, "blimp/accuracy/distractor_agreement_relational_noun": 0.863, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.988, "blimp/accuracy/superlative_quantifiers_1": 0.876, "blimp/accuracy/wh_island": 0.813, "blimp/accuracy/principle_A_domain_1": 0.993, "blimp/accuracy/complex_NP_island": 0.597, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.962, "blimp/accuracy/irregular_past_participle_verbs": 0.89, "blimp/accuracy/drop_argument": 0.747, "blimp/accuracy/wh_questions_object_gap": 0.855, "blimp/accuracy/animate_subject_passive": 0.791, "blimp/accuracy/existential_there_quantifiers_1": 0.978, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.917, "blimp/accuracy/npi_present_2": 0.593, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.958, "blimp/accuracy/anaphor_number_agreement": 0.994, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.967, "blimp/accuracy/existential_there_object_raising": 0.843, "blimp/accuracy/matrix_question_npi_licensor_present": 0.391, "blimp/accuracy/npi_present_1": 0.543, "blimp/accuracy/wh_vs_that_no_gap": 0.984, "blimp/accuracy/left_branch_island_echo_question": 0.535, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.973, "blimp/accuracy/causative": 0.762, "blimp/accuracy/group_average": 0.8091940298507461, "blimp/accuracy/seq_average": 0.8091940298507463, "cbt/accuracy/NE": 0.8217147435897436, "cbt/accuracy/V": 0.946, "cbt/accuracy/CN": 0.8968, "cbt/accuracy/P": 0.9268, "cbt/accuracy/group_average": 0.8978286858974359, "cbt/accuracy/seq_average": 0.897859143657463, "hellaswag/accuracy/val": 0.3759211312487552, "hellaswag/accuracy/group_average": 0.3759211312487552, "hellaswag/accuracy/seq_average": 0.3759211312487552, "piqa/accuracy/val": 0.6458106637649619, "piqa/accuracy/group_average": 0.6458106637649619, "piqa/accuracy/seq_average": 0.6458106637649619, "ai2arc/accuracy/ARC-Easy": 0.3953488372093023, "ai2arc/accuracy/ARC-Challenge": 0.2446351931330472, "ai2arc/accuracy/group_average": 0.31999201517117476, "ai2arc/accuracy/seq_average": 0.34560906515580736, "race/accuracy/test/high": 0.3018867924528302, "race/accuracy/test/middle": 0.38091922005571033, "race/accuracy/group_average": 0.3414030062542702, "race/accuracy/seq_average": 0.3248885285772193, "siqa/accuracy/dev": 0.3766632548618219, "siqa/accuracy/group_average": 0.3766632548618219, "siqa/accuracy/seq_average": 0.3766632548618219, "commonsenseqa/accuracy/dev_rand_split": 0.28173628173628174, "commonsenseqa/accuracy/group_average": 0.28173628173628174, "commonsenseqa/accuracy/seq_average": 0.28173628173628174}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-380000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.2435767764136907, "val/accuracy": 0.5304478236607143, "val/perplexity": 9.426989290931042, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.2639408466978845, "lambada/accuracy/total": 0.389945652173913, "lambada/accuracy/openai_last_token": 0.8033773291925466, "lambada/perplexity": 6.2838245639219465, "lambada/lm_loss": 2.8451367485013503, "lambada/lm_perplexity": 17.203911117137523, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.46019673791731364, "mean_loss": 2.2537588115557874, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.975, "blimp/accuracy/ellipsis_n_bar_1": 0.812, "blimp/accuracy/tough_vs_raising_2": 0.908, "blimp/accuracy/tough_vs_raising_1": 0.586, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.919, "blimp/accuracy/principle_A_reconstruction": 0.357, "blimp/accuracy/wh_vs_that_with_gap": 0.413, "blimp/accuracy/principle_A_domain_2": 0.892, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.893, "blimp/accuracy/principle_A_domain_3": 0.671, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.92, "blimp/accuracy/animate_subject_trans": 0.912, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.912, "blimp/accuracy/distractor_agreement_relative_clause": 0.7, "blimp/accuracy/transitive": 0.881, "blimp/accuracy/sentential_subject_island": 0.392, "blimp/accuracy/adjunct_island": 0.881, "blimp/accuracy/intransitive": 0.796, "blimp/accuracy/existential_there_subject_raising": 0.886, "blimp/accuracy/irregular_past_participle_adjectives": 0.915, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.764, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.349, "blimp/accuracy/only_npi_scope": 0.712, "blimp/accuracy/superlative_quantifiers_2": 0.883, "blimp/accuracy/passive_1": 0.909, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.936, "blimp/accuracy/inchoative": 0.626, "blimp/accuracy/anaphor_gender_agreement": 0.983, "blimp/accuracy/principle_A_c_command": 0.785, "blimp/accuracy/only_npi_licensor_present": 0.777, "blimp/accuracy/expletive_it_object_raising": 0.778, "blimp/accuracy/left_branch_island_simple_question": 0.844, "blimp/accuracy/wh_questions_subject_gap": 0.944, "blimp/accuracy/existential_there_quantifiers_2": 0.55, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.945, "blimp/accuracy/sentential_negation_npi_scope": 0.681, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.815, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.91, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.912, "blimp/accuracy/principle_A_case_2": 0.943, "blimp/accuracy/distractor_agreement_relational_noun": 0.882, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.986, "blimp/accuracy/superlative_quantifiers_1": 0.899, "blimp/accuracy/wh_island": 0.843, "blimp/accuracy/principle_A_domain_1": 0.993, "blimp/accuracy/complex_NP_island": 0.588, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.967, "blimp/accuracy/irregular_past_participle_verbs": 0.905, "blimp/accuracy/drop_argument": 0.748, "blimp/accuracy/wh_questions_object_gap": 0.854, "blimp/accuracy/animate_subject_passive": 0.788, "blimp/accuracy/existential_there_quantifiers_1": 0.976, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.913, "blimp/accuracy/npi_present_2": 0.584, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.959, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.969, "blimp/accuracy/existential_there_object_raising": 0.834, "blimp/accuracy/matrix_question_npi_licensor_present": 0.374, "blimp/accuracy/npi_present_1": 0.569, "blimp/accuracy/wh_vs_that_no_gap": 0.989, "blimp/accuracy/left_branch_island_echo_question": 0.493, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.976, "blimp/accuracy/causative": 0.793, "blimp/accuracy/group_average": 0.8125970149253732, "blimp/accuracy/seq_average": 0.8125970149253732, "cbt/accuracy/NE": 0.8257211538461539, "cbt/accuracy/V": 0.9444, "cbt/accuracy/CN": 0.896, "cbt/accuracy/P": 0.928, "cbt/accuracy/group_average": 0.8985302884615385, "cbt/accuracy/seq_average": 0.8985594237695078, "hellaswag/accuracy/val": 0.37651862178848833, "hellaswag/accuracy/group_average": 0.37651862178848833, "hellaswag/accuracy/seq_average": 0.37651862178848833, "piqa/accuracy/val": 0.6512513601741022, "piqa/accuracy/group_average": 0.6512513601741022, "piqa/accuracy/seq_average": 0.6512513601741022, "ai2arc/accuracy/ARC-Easy": 0.39238900634249474, "ai2arc/accuracy/ARC-Challenge": 0.24206008583690988, "ai2arc/accuracy/group_average": 0.3172245460897023, "ai2arc/accuracy/seq_average": 0.34277620396600567, "race/accuracy/test/high": 0.2998856489422527, "race/accuracy/test/middle": 0.37395543175487467, "race/accuracy/group_average": 0.33692054034856367, "race/accuracy/seq_average": 0.32144304823672476, "siqa/accuracy/dev": 0.37308085977482086, "siqa/accuracy/group_average": 0.37308085977482086, "siqa/accuracy/seq_average": 0.37308085977482086, "commonsenseqa/accuracy/dev_rand_split": 0.2882882882882883, "commonsenseqa/accuracy/group_average": 0.2882882882882883, "commonsenseqa/accuracy/seq_average": 0.2882882882882883}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-40000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.582794673859127, "val/accuracy": 0.48018973214285715, "val/perplexity": 13.234071440888206, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7611832707565025, "lambada/accuracy/total": 0.29444875776397517, "lambada/accuracy/openai_last_token": 0.7699922360248447, "lambada/perplexity": 9.837372327906156, "lambada/lm_loss": 3.148414762929454, "lambada/lm_perplexity": 23.299100692292413, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3873192449534162, "mean_loss": 2.6719889723078145, "blimp/accuracy/passive_2": 0.902, "blimp/accuracy/determiner_noun_agreement_2": 0.99, "blimp/accuracy/ellipsis_n_bar_1": 0.77, "blimp/accuracy/tough_vs_raising_2": 0.822, "blimp/accuracy/tough_vs_raising_1": 0.589, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.897, "blimp/accuracy/principle_A_reconstruction": 0.29, "blimp/accuracy/wh_vs_that_with_gap": 0.441, "blimp/accuracy/principle_A_domain_2": 0.862, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.921, "blimp/accuracy/principle_A_domain_3": 0.646, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.909, "blimp/accuracy/animate_subject_trans": 0.911, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.875, "blimp/accuracy/distractor_agreement_relative_clause": 0.619, "blimp/accuracy/transitive": 0.847, "blimp/accuracy/sentential_subject_island": 0.391, "blimp/accuracy/adjunct_island": 0.805, "blimp/accuracy/intransitive": 0.726, "blimp/accuracy/existential_there_subject_raising": 0.874, "blimp/accuracy/irregular_past_participle_adjectives": 0.96, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.626, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.238, "blimp/accuracy/only_npi_scope": 0.664, "blimp/accuracy/superlative_quantifiers_2": 0.813, "blimp/accuracy/passive_1": 0.892, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.912, "blimp/accuracy/inchoative": 0.583, "blimp/accuracy/anaphor_gender_agreement": 0.965, "blimp/accuracy/principle_A_c_command": 0.693, "blimp/accuracy/only_npi_licensor_present": 0.793, "blimp/accuracy/expletive_it_object_raising": 0.779, "blimp/accuracy/left_branch_island_simple_question": 0.751, "blimp/accuracy/wh_questions_subject_gap": 0.926, "blimp/accuracy/existential_there_quantifiers_2": 0.342, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.94, "blimp/accuracy/sentential_negation_npi_scope": 0.575, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.818, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.913, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.862, "blimp/accuracy/principle_A_case_2": 0.943, "blimp/accuracy/distractor_agreement_relational_noun": 0.86, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.777, "blimp/accuracy/wh_island": 0.804, "blimp/accuracy/principle_A_domain_1": 0.982, "blimp/accuracy/complex_NP_island": 0.562, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.953, "blimp/accuracy/irregular_past_participle_verbs": 0.88, "blimp/accuracy/drop_argument": 0.733, "blimp/accuracy/wh_questions_object_gap": 0.823, "blimp/accuracy/animate_subject_passive": 0.798, "blimp/accuracy/existential_there_quantifiers_1": 0.979, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.911, "blimp/accuracy/npi_present_2": 0.558, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.93, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.959, "blimp/accuracy/existential_there_object_raising": 0.778, "blimp/accuracy/matrix_question_npi_licensor_present": 0.327, "blimp/accuracy/npi_present_1": 0.49, "blimp/accuracy/wh_vs_that_no_gap": 0.981, "blimp/accuracy/left_branch_island_echo_question": 0.48, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.702, "blimp/accuracy/group_average": 0.7804626865671641, "blimp/accuracy/seq_average": 0.7804626865671642, "cbt/accuracy/NE": 0.7640224358974359, "cbt/accuracy/V": 0.9124, "cbt/accuracy/CN": 0.824, "cbt/accuracy/P": 0.8912, "cbt/accuracy/group_average": 0.847905608974359, "cbt/accuracy/seq_average": 0.8479391756702681, "hellaswag/accuracy/val": 0.3043218482374029, "hellaswag/accuracy/group_average": 0.3043218482374029, "hellaswag/accuracy/seq_average": 0.3043218482374029, "piqa/accuracy/val": 0.6001088139281828, "piqa/accuracy/group_average": 0.6001088139281828, "piqa/accuracy/seq_average": 0.6001088139281828, "ai2arc/accuracy/ARC-Easy": 0.33403805496828753, "ai2arc/accuracy/ARC-Challenge": 0.21630901287553647, "ai2arc/accuracy/group_average": 0.275173533921912, "ai2arc/accuracy/seq_average": 0.2951841359773371, "race/accuracy/test/high": 0.2712978845054317, "race/accuracy/test/middle": 0.3342618384401114, "race/accuracy/group_average": 0.3027798614727716, "race/accuracy/seq_average": 0.28962302391568706, "siqa/accuracy/dev": 0.36131013306038895, "siqa/accuracy/group_average": 0.36131013306038895, "siqa/accuracy/seq_average": 0.36131013306038895, "commonsenseqa/accuracy/dev_rand_split": 0.25307125307125306, "commonsenseqa/accuracy/group_average": 0.25307125307125306, "commonsenseqa/accuracy/seq_average": 0.25307125307125306}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-400000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.238678462921627, "val/accuracy": 0.5309516059027778, "val/perplexity": 9.38092585078471, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.2391655015649263, "lambada/accuracy/total": 0.37868788819875776, "lambada/accuracy/openai_last_token": 0.8057065217391305, "lambada/perplexity": 6.2253491267581405, "lambada/lm_loss": 2.8415328655269736, "lambada/lm_perplexity": 17.142021822641833, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4548197470507678, "mean_loss": 2.2389219822432764, "blimp/accuracy/passive_2": 0.916, "blimp/accuracy/determiner_noun_agreement_2": 0.982, "blimp/accuracy/ellipsis_n_bar_1": 0.811, "blimp/accuracy/tough_vs_raising_2": 0.897, "blimp/accuracy/tough_vs_raising_1": 0.584, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.938, "blimp/accuracy/principle_A_reconstruction": 0.337, "blimp/accuracy/wh_vs_that_with_gap": 0.419, "blimp/accuracy/principle_A_domain_2": 0.897, "blimp/accuracy/determiner_noun_agreement_1": 0.991, "blimp/accuracy/ellipsis_n_bar_2": 0.91, "blimp/accuracy/principle_A_domain_3": 0.669, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.932, "blimp/accuracy/animate_subject_trans": 0.911, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.922, "blimp/accuracy/distractor_agreement_relative_clause": 0.685, "blimp/accuracy/transitive": 0.892, "blimp/accuracy/sentential_subject_island": 0.38, "blimp/accuracy/adjunct_island": 0.902, "blimp/accuracy/intransitive": 0.779, "blimp/accuracy/existential_there_subject_raising": 0.89, "blimp/accuracy/irregular_past_participle_adjectives": 0.941, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.8, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.354, "blimp/accuracy/only_npi_scope": 0.711, "blimp/accuracy/superlative_quantifiers_2": 0.9, "blimp/accuracy/passive_1": 0.903, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.934, "blimp/accuracy/inchoative": 0.624, "blimp/accuracy/anaphor_gender_agreement": 0.983, "blimp/accuracy/principle_A_c_command": 0.798, "blimp/accuracy/only_npi_licensor_present": 0.597, "blimp/accuracy/expletive_it_object_raising": 0.773, "blimp/accuracy/left_branch_island_simple_question": 0.848, "blimp/accuracy/wh_questions_subject_gap": 0.932, "blimp/accuracy/existential_there_quantifiers_2": 0.523, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.957, "blimp/accuracy/sentential_negation_npi_scope": 0.708, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.822, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.917, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.905, "blimp/accuracy/principle_A_case_2": 0.948, "blimp/accuracy/distractor_agreement_relational_noun": 0.851, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.987, "blimp/accuracy/superlative_quantifiers_1": 0.868, "blimp/accuracy/wh_island": 0.859, "blimp/accuracy/principle_A_domain_1": 0.995, "blimp/accuracy/complex_NP_island": 0.615, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.966, "blimp/accuracy/irregular_past_participle_verbs": 0.89, "blimp/accuracy/drop_argument": 0.748, "blimp/accuracy/wh_questions_object_gap": 0.86, "blimp/accuracy/animate_subject_passive": 0.787, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.916, "blimp/accuracy/npi_present_2": 0.595, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.962, "blimp/accuracy/anaphor_number_agreement": 0.99, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.967, "blimp/accuracy/existential_there_object_raising": 0.847, "blimp/accuracy/matrix_question_npi_licensor_present": 0.4, "blimp/accuracy/npi_present_1": 0.558, "blimp/accuracy/wh_vs_that_no_gap": 0.987, "blimp/accuracy/left_branch_island_echo_question": 0.515, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.97, "blimp/accuracy/causative": 0.767, "blimp/accuracy/group_average": 0.8120597014925373, "blimp/accuracy/seq_average": 0.8120597014925374, "cbt/accuracy/NE": 0.8361378205128205, "cbt/accuracy/V": 0.9448, "cbt/accuracy/CN": 0.8992, "cbt/accuracy/P": 0.9272, "cbt/accuracy/group_average": 0.9018344551282051, "cbt/accuracy/seq_average": 0.9018607442977191, "hellaswag/accuracy/val": 0.37950607448715395, "hellaswag/accuracy/group_average": 0.37950607448715395, "hellaswag/accuracy/seq_average": 0.37950607448715395, "piqa/accuracy/val": 0.6447225244831338, "piqa/accuracy/group_average": 0.6447225244831338, "piqa/accuracy/seq_average": 0.6447225244831338, "ai2arc/accuracy/ARC-Easy": 0.39281183932346725, "ai2arc/accuracy/ARC-Challenge": 0.23776824034334765, "ai2arc/accuracy/group_average": 0.3152900398334074, "ai2arc/accuracy/seq_average": 0.341643059490085, "race/accuracy/test/high": 0.29874213836477986, "race/accuracy/test/middle": 0.3767409470752089, "race/accuracy/group_average": 0.33774154271999435, "race/accuracy/seq_average": 0.32144304823672476, "siqa/accuracy/dev": 0.37922210849539406, "siqa/accuracy/group_average": 0.37922210849539406, "siqa/accuracy/seq_average": 0.37922210849539406, "commonsenseqa/accuracy/dev_rand_split": 0.2784602784602785, "commonsenseqa/accuracy/group_average": 0.2784602784602785, "commonsenseqa/accuracy/seq_average": 0.2784602784602785}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-60000.pth.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"val/loss": 2.514155796595982,
|
| 3 |
+
"val/accuracy": 0.4894002278645833,
|
| 4 |
+
"val/perplexity": 12.3561732513585,
|
| 5 |
+
"val/time_since_best_loss": 0,
|
| 6 |
+
"val/time_since_best_accuracy": 0,
|
| 7 |
+
"lambada/loss": 2.703446477096273,
|
| 8 |
+
"lambada/accuracy/total": 0.2715450310559006,
|
| 9 |
+
"lambada/accuracy/openai_last_token": 0.765139751552795,
|
| 10 |
+
"lambada/perplexity": 9.986212267960902,
|
| 11 |
+
"lambada/lm_loss": 3.0707089173055526,
|
| 12 |
+
"lambada/lm_perplexity": 21.55717951697228,
|
| 13 |
+
"lambada/time_since_best_loss": 0,
|
| 14 |
+
"lambada/time_since_best_accuracy": 0,
|
| 15 |
+
"mean_accuracy": 0.38047262946024196,
|
| 16 |
+
"mean_loss": 2.608801136846128,
|
| 17 |
+
"blimp/accuracy/passive_2": 0.903,
|
| 18 |
+
"blimp/accuracy/determiner_noun_agreement_2": 0.984,
|
| 19 |
+
"blimp/accuracy/ellipsis_n_bar_1": 0.837,
|
| 20 |
+
"blimp/accuracy/tough_vs_raising_2": 0.832,
|
| 21 |
+
"blimp/accuracy/tough_vs_raising_1": 0.607,
|
| 22 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.915,
|
| 23 |
+
"blimp/accuracy/principle_A_reconstruction": 0.31,
|
| 24 |
+
"blimp/accuracy/wh_vs_that_with_gap": 0.561,
|
| 25 |
+
"blimp/accuracy/principle_A_domain_2": 0.852,
|
| 26 |
+
"blimp/accuracy/determiner_noun_agreement_1": 0.99,
|
| 27 |
+
"blimp/accuracy/ellipsis_n_bar_2": 0.915,
|
| 28 |
+
"blimp/accuracy/principle_A_domain_3": 0.619,
|
| 29 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.929,
|
| 30 |
+
"blimp/accuracy/animate_subject_trans": 0.895,
|
| 31 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.878,
|
| 32 |
+
"blimp/accuracy/distractor_agreement_relative_clause": 0.663,
|
| 33 |
+
"blimp/accuracy/transitive": 0.872,
|
| 34 |
+
"blimp/accuracy/sentential_subject_island": 0.396,
|
| 35 |
+
"blimp/accuracy/adjunct_island": 0.792,
|
| 36 |
+
"blimp/accuracy/intransitive": 0.722,
|
| 37 |
+
"blimp/accuracy/existential_there_subject_raising": 0.862,
|
| 38 |
+
"blimp/accuracy/irregular_past_participle_adjectives": 0.908,
|
| 39 |
+
"blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.476,
|
| 40 |
+
"blimp/accuracy/principle_A_case_1": 1.0,
|
| 41 |
+
"blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.389,
|
| 42 |
+
"blimp/accuracy/only_npi_scope": 0.581,
|
| 43 |
+
"blimp/accuracy/superlative_quantifiers_2": 0.78,
|
| 44 |
+
"blimp/accuracy/passive_1": 0.886,
|
| 45 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.918,
|
| 46 |
+
"blimp/accuracy/inchoative": 0.581,
|
| 47 |
+
"blimp/accuracy/anaphor_gender_agreement": 0.978,
|
| 48 |
+
"blimp/accuracy/principle_A_c_command": 0.717,
|
| 49 |
+
"blimp/accuracy/only_npi_licensor_present": 0.88,
|
| 50 |
+
"blimp/accuracy/expletive_it_object_raising": 0.786,
|
| 51 |
+
"blimp/accuracy/left_branch_island_simple_question": 0.622,
|
| 52 |
+
"blimp/accuracy/wh_questions_subject_gap": 0.919,
|
| 53 |
+
"blimp/accuracy/existential_there_quantifiers_2": 0.562,
|
| 54 |
+
"blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.942,
|
| 55 |
+
"blimp/accuracy/sentential_negation_npi_scope": 0.479,
|
| 56 |
+
"blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.824,
|
| 57 |
+
"blimp/accuracy/wh_questions_subject_gap_long_distance": 0.884,
|
| 58 |
+
"blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.9,
|
| 59 |
+
"blimp/accuracy/principle_A_case_2": 0.927,
|
| 60 |
+
"blimp/accuracy/distractor_agreement_relational_noun": 0.864,
|
| 61 |
+
"blimp/accuracy/sentential_negation_npi_licensor_present": 0.984,
|
| 62 |
+
"blimp/accuracy/superlative_quantifiers_1": 0.883,
|
| 63 |
+
"blimp/accuracy/wh_island": 0.856,
|
| 64 |
+
"blimp/accuracy/principle_A_domain_1": 0.997,
|
| 65 |
+
"blimp/accuracy/complex_NP_island": 0.538,
|
| 66 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_2": 0.969,
|
| 67 |
+
"blimp/accuracy/irregular_past_participle_verbs": 0.866,
|
| 68 |
+
"blimp/accuracy/drop_argument": 0.731,
|
| 69 |
+
"blimp/accuracy/wh_questions_object_gap": 0.78,
|
| 70 |
+
"blimp/accuracy/animate_subject_passive": 0.818,
|
| 71 |
+
"blimp/accuracy/existential_there_quantifiers_1": 0.979,
|
| 72 |
+
"blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.917,
|
| 73 |
+
"blimp/accuracy/npi_present_2": 0.592,
|
| 74 |
+
"blimp/accuracy/determiner_noun_agreement_irregular_1": 0.927,
|
| 75 |
+
"blimp/accuracy/anaphor_number_agreement": 0.992,
|
| 76 |
+
"blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.964,
|
| 77 |
+
"blimp/accuracy/existential_there_object_raising": 0.801,
|
| 78 |
+
"blimp/accuracy/matrix_question_npi_licensor_present": 0.306,
|
| 79 |
+
"blimp/accuracy/npi_present_1": 0.64,
|
| 80 |
+
"blimp/accuracy/wh_vs_that_no_gap": 0.975,
|
| 81 |
+
"blimp/accuracy/left_branch_island_echo_question": 0.385,
|
| 82 |
+
"blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.971,
|
| 83 |
+
"blimp/accuracy/causative": 0.725,
|
| 84 |
+
"blimp/accuracy/group_average": 0.7870597014925372,
|
| 85 |
+
"blimp/accuracy/seq_average": 0.7870597014925373,
|
| 86 |
+
"cbt/accuracy/NE": 0.782051282051282,
|
| 87 |
+
"cbt/accuracy/V": 0.922,
|
| 88 |
+
"cbt/accuracy/CN": 0.848,
|
| 89 |
+
"cbt/accuracy/P": 0.9056,
|
| 90 |
+
"cbt/accuracy/group_average": 0.8644128205128205,
|
| 91 |
+
"cbt/accuracy/seq_average": 0.8644457783113245,
|
| 92 |
+
"hellaswag/accuracy/val": 0.31676956781517623,
|
| 93 |
+
"hellaswag/accuracy/group_average": 0.31676956781517623,
|
| 94 |
+
"hellaswag/accuracy/seq_average": 0.31676956781517623,
|
| 95 |
+
"piqa/accuracy/val": 0.6099020674646355,
|
| 96 |
+
"piqa/accuracy/group_average": 0.6099020674646355,
|
| 97 |
+
"piqa/accuracy/seq_average": 0.6099020674646355,
|
| 98 |
+
"ai2arc/accuracy/ARC-Easy": 0.3513742071881607,
|
| 99 |
+
"ai2arc/accuracy/ARC-Challenge": 0.2240343347639485,
|
| 100 |
+
"ai2arc/accuracy/group_average": 0.2877042709760546,
|
| 101 |
+
"ai2arc/accuracy/seq_average": 0.3093484419263456,
|
| 102 |
+
"race/accuracy/test/high": 0.279874213836478,
|
| 103 |
+
"race/accuracy/test/middle": 0.3488857938718663,
|
| 104 |
+
"race/accuracy/group_average": 0.31438000385417214,
|
| 105 |
+
"race/accuracy/seq_average": 0.29995946493717063,
|
| 106 |
+
"siqa/accuracy/dev": 0.3710337768679631,
|
| 107 |
+
"siqa/accuracy/group_average": 0.3710337768679631,
|
| 108 |
+
"siqa/accuracy/seq_average": 0.3710337768679631,
|
| 109 |
+
"commonsenseqa/accuracy/dev_rand_split": 0.26371826371826373,
|
| 110 |
+
"commonsenseqa/accuracy/group_average": 0.26371826371826373,
|
| 111 |
+
"commonsenseqa/accuracy/seq_average": 0.26371826371826373
|
| 112 |
+
}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_660M_standardlb_tcmoe/export/result-model-80000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.4661770775204612, "val/accuracy": 0.4959687732514881, "val/perplexity": 11.77733683480201, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.4639071825868593, "lambada/accuracy/total": 0.29250776397515527, "lambada/accuracy/openai_last_token": 0.7727096273291926, "lambada/perplexity": 9.216576833964732, "lambada/lm_loss": 3.043425742200841, "lambada/lm_perplexity": 20.976982018191624, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3942382686133217, "mean_loss": 2.4650421300536602, "blimp/accuracy/passive_2": 0.908, "blimp/accuracy/determiner_noun_agreement_2": 0.988, "blimp/accuracy/ellipsis_n_bar_1": 0.797, "blimp/accuracy/tough_vs_raising_2": 0.853, "blimp/accuracy/tough_vs_raising_1": 0.64, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.915, "blimp/accuracy/principle_A_reconstruction": 0.276, "blimp/accuracy/wh_vs_that_with_gap": 0.537, "blimp/accuracy/principle_A_domain_2": 0.875, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.895, "blimp/accuracy/principle_A_domain_3": 0.629, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.922, "blimp/accuracy/animate_subject_trans": 0.89, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.874, "blimp/accuracy/distractor_agreement_relative_clause": 0.673, "blimp/accuracy/transitive": 0.877, "blimp/accuracy/sentential_subject_island": 0.398, "blimp/accuracy/adjunct_island": 0.843, "blimp/accuracy/intransitive": 0.778, "blimp/accuracy/existential_there_subject_raising": 0.853, "blimp/accuracy/irregular_past_participle_adjectives": 0.936, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.605, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.404, "blimp/accuracy/only_npi_scope": 0.75, "blimp/accuracy/superlative_quantifiers_2": 0.834, "blimp/accuracy/passive_1": 0.894, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.891, "blimp/accuracy/inchoative": 0.625, "blimp/accuracy/anaphor_gender_agreement": 0.978, "blimp/accuracy/principle_A_c_command": 0.726, "blimp/accuracy/only_npi_licensor_present": 0.615, "blimp/accuracy/expletive_it_object_raising": 0.768, "blimp/accuracy/left_branch_island_simple_question": 0.668, "blimp/accuracy/wh_questions_subject_gap": 0.922, "blimp/accuracy/existential_there_quantifiers_2": 0.434, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.945, "blimp/accuracy/sentential_negation_npi_scope": 0.668, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.825, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.868, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.902, "blimp/accuracy/principle_A_case_2": 0.968, "blimp/accuracy/distractor_agreement_relational_noun": 0.866, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.86, "blimp/accuracy/wh_island": 0.787, "blimp/accuracy/principle_A_domain_1": 0.991, "blimp/accuracy/complex_NP_island": 0.525, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.976, "blimp/accuracy/irregular_past_participle_verbs": 0.887, "blimp/accuracy/drop_argument": 0.738, "blimp/accuracy/wh_questions_object_gap": 0.785, "blimp/accuracy/animate_subject_passive": 0.813, "blimp/accuracy/existential_there_quantifiers_1": 0.984, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.918, "blimp/accuracy/npi_present_2": 0.619, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.947, "blimp/accuracy/anaphor_number_agreement": 0.995, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.966, "blimp/accuracy/existential_there_object_raising": 0.79, "blimp/accuracy/matrix_question_npi_licensor_present": 0.356, "blimp/accuracy/npi_present_1": 0.602, "blimp/accuracy/wh_vs_that_no_gap": 0.966, "blimp/accuracy/left_branch_island_echo_question": 0.429, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.944, "blimp/accuracy/causative": 0.744, "blimp/accuracy/group_average": 0.792776119402985, "blimp/accuracy/seq_average": 0.7927761194029851, "cbt/accuracy/NE": 0.7900641025641025, "cbt/accuracy/V": 0.9264, "cbt/accuracy/CN": 0.8636, "cbt/accuracy/P": 0.9056, "cbt/accuracy/group_average": 0.8714160256410257, "cbt/accuracy/seq_average": 0.8714485794317727, "hellaswag/accuracy/val": 0.3263294164509062, "hellaswag/accuracy/group_average": 0.3263294164509062, "hellaswag/accuracy/seq_average": 0.3263294164509062, "piqa/accuracy/val": 0.6131664853101197, "piqa/accuracy/group_average": 0.6131664853101197, "piqa/accuracy/seq_average": 0.6131664853101197, "ai2arc/accuracy/ARC-Easy": 0.35433403805496827, "ai2arc/accuracy/ARC-Challenge": 0.21888412017167383, "ai2arc/accuracy/group_average": 0.28660907911332106, "ai2arc/accuracy/seq_average": 0.3096317280453258, "race/accuracy/test/high": 0.2847341337907376, "race/accuracy/test/middle": 0.35376044568245124, "race/accuracy/group_average": 0.31924728973659444, "race/accuracy/seq_average": 0.30482367247669234, "siqa/accuracy/dev": 0.36745138178096215, "siqa/accuracy/group_average": 0.36745138178096215, "siqa/accuracy/seq_average": 0.36745138178096215, "commonsenseqa/accuracy/dev_rand_split": 0.26371826371826373, "commonsenseqa/accuracy/group_average": 0.26371826371826373, "commonsenseqa/accuracy/seq_average": 0.26371826371826373}
|