Upload folder using huggingface_hub

#5402
Pretrain_language_model/save/slimpajama_competesmoe_no_attmoe_660M_standardlb_down_flip/export/result-model-120000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.4369583129882812, "val/accuracy": 0.5002707204511089, "val/perplexity": 11.438196363495134, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.3129389982045807, "lambada/accuracy/total": 0.34627329192546585, "lambada/accuracy/openai_last_token": 0.7878493788819876, "lambada/perplexity": 7.96804092333307, "lambada/lm_loss": 3.0043084211430595, "lambada/lm_perplexity": 20.172260561966155, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.42327200618828736, "mean_loss": 2.374948655596431, "blimp/accuracy/passive_2": 0.918, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.836, "blimp/accuracy/tough_vs_raising_2": 0.881, "blimp/accuracy/tough_vs_raising_1": 0.6, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.896, "blimp/accuracy/principle_A_reconstruction": 0.41, "blimp/accuracy/wh_vs_that_with_gap": 0.468, "blimp/accuracy/principle_A_domain_2": 0.883, "blimp/accuracy/determiner_noun_agreement_1": 0.993, "blimp/accuracy/ellipsis_n_bar_2": 0.918, "blimp/accuracy/principle_A_domain_3": 0.581, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.916, "blimp/accuracy/animate_subject_trans": 0.891, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.9, "blimp/accuracy/distractor_agreement_relative_clause": 0.68, "blimp/accuracy/transitive": 0.866, "blimp/accuracy/sentential_subject_island": 0.399, "blimp/accuracy/adjunct_island": 0.862, "blimp/accuracy/intransitive": 0.764, "blimp/accuracy/existential_there_subject_raising": 0.869, "blimp/accuracy/irregular_past_participle_adjectives": 0.895, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.67, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.31, "blimp/accuracy/only_npi_scope": 0.691, "blimp/accuracy/superlative_quantifiers_2": 0.759, "blimp/accuracy/passive_1": 0.887, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.923, "blimp/accuracy/inchoative": 0.612, "blimp/accuracy/anaphor_gender_agreement": 0.969, "blimp/accuracy/principle_A_c_command": 0.682, "blimp/accuracy/only_npi_licensor_present": 0.65, "blimp/accuracy/expletive_it_object_raising": 0.784, "blimp/accuracy/left_branch_island_simple_question": 0.768, "blimp/accuracy/wh_questions_subject_gap": 0.929, "blimp/accuracy/existential_there_quantifiers_2": 0.411, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.95, "blimp/accuracy/sentential_negation_npi_scope": 0.749, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.792, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.918, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.918, "blimp/accuracy/principle_A_case_2": 0.92, "blimp/accuracy/distractor_agreement_relational_noun": 0.857, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.99, "blimp/accuracy/superlative_quantifiers_1": 0.822, "blimp/accuracy/wh_island": 0.789, "blimp/accuracy/principle_A_domain_1": 0.994, "blimp/accuracy/complex_NP_island": 0.595, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.973, "blimp/accuracy/irregular_past_participle_verbs": 0.877, "blimp/accuracy/drop_argument": 0.736, "blimp/accuracy/wh_questions_object_gap": 0.809, "blimp/accuracy/animate_subject_passive": 0.819, "blimp/accuracy/existential_there_quantifiers_1": 0.976, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.888, "blimp/accuracy/npi_present_2": 0.612, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.964, "blimp/accuracy/anaphor_number_agreement": 0.986, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.955, "blimp/accuracy/existential_there_object_raising": 0.843, "blimp/accuracy/matrix_question_npi_licensor_present": 0.295, "blimp/accuracy/npi_present_1": 0.646, "blimp/accuracy/wh_vs_that_no_gap": 0.979, "blimp/accuracy/left_branch_island_echo_question": 0.473, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.957, "blimp/accuracy/causative": 0.755, "blimp/accuracy/group_average": 0.7954029850746269, "blimp/accuracy/seq_average": 0.7954029850746268, "cbt/accuracy/NE": 0.8020833333333334, "cbt/accuracy/V": 0.93, "cbt/accuracy/CN": 0.8656, "cbt/accuracy/P": 0.904, "cbt/accuracy/group_average": 0.8754208333333333, "cbt/accuracy/seq_average": 0.8754501800720288, "hellaswag/accuracy/val": 0.3337980481975702, "hellaswag/accuracy/group_average": 0.3337980481975702, "hellaswag/accuracy/seq_average": 0.3337980481975702, "piqa/accuracy/val": 0.6218715995647442, "piqa/accuracy/group_average": 0.6218715995647442, "piqa/accuracy/seq_average": 0.6218715995647442, "ai2arc/accuracy/ARC-Easy": 0.37251585623678646, "ai2arc/accuracy/ARC-Challenge": 0.21716738197424892, "ai2arc/accuracy/group_average": 0.2948416191055177, "ai2arc/accuracy/seq_average": 0.3212464589235127, "mmlu/accuracy/MMLU": 0.2661422953164104, "mmlu/accuracy/group_average": 0.2661422953164104, "mmlu/accuracy/seq_average": 0.2661422953164104, "openbookqa/accuracy/test": 0.29, "openbookqa/accuracy/group_average": 0.29, "openbookqa/accuracy/seq_average": 0.29, "race/accuracy/test/high": 0.2804459691252144, "race/accuracy/test/middle": 0.346100278551532, "race/accuracy/group_average": 0.31327312383837325, "race/accuracy/seq_average": 0.2995541143088772, "siqa/accuracy/dev": 0.3607983623336745, "siqa/accuracy/group_average": 0.3607983623336745, "siqa/accuracy/seq_average": 0.3607983623336745, "winogrande/accuracy/dev": 0.5019731649565904, "winogrande/accuracy/group_average": 0.5019731649565904, "winogrande/accuracy/seq_average": 0.5019731649565904, "commonsenseqa/accuracy/dev_rand_split": 0.25307125307125306, "commonsenseqa/accuracy/group_average": 0.25307125307125306, "commonsenseqa/accuracy/seq_average": 0.25307125307125306}
Pretrain_language_model/save/slimpajama_competesmoe_no_attmoe_660M_standardlb_down_flip/export/result-model-120001.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.42450443390877, "val/accuracy": 0.5012768160912299, "val/perplexity": 11.296629805228108, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5750654706303377, "lambada/accuracy/total": 0.3513198757763975, "lambada/accuracy/openai_last_token": 0.7905667701863354, "lambada/perplexity": 7.749698948866573, "lambada/lm_loss": 2.9992504132941376, "lambada/lm_perplexity": 20.07048671315332, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.42629834593381366, "mean_loss": 2.499784952269554, "blimp/accuracy/passive_2": 0.898, "blimp/accuracy/determiner_noun_agreement_2": 0.987, "blimp/accuracy/ellipsis_n_bar_1": 0.846, "blimp/accuracy/tough_vs_raising_2": 0.894, "blimp/accuracy/tough_vs_raising_1": 0.587, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.915, "blimp/accuracy/principle_A_reconstruction": 0.457, "blimp/accuracy/wh_vs_that_with_gap": 0.442, "blimp/accuracy/principle_A_domain_2": 0.859, "blimp/accuracy/determiner_noun_agreement_1": 0.995, "blimp/accuracy/ellipsis_n_bar_2": 0.925, "blimp/accuracy/principle_A_domain_3": 0.572, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.914, "blimp/accuracy/animate_subject_trans": 0.905, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.906, "blimp/accuracy/distractor_agreement_relative_clause": 0.662, "blimp/accuracy/transitive": 0.866, "blimp/accuracy/sentential_subject_island": 0.327, "blimp/accuracy/adjunct_island": 0.873, "blimp/accuracy/intransitive": 0.763, "blimp/accuracy/existential_there_subject_raising": 0.893, "blimp/accuracy/irregular_past_participle_adjectives": 0.86, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.759, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.326, "blimp/accuracy/only_npi_scope": 0.66, "blimp/accuracy/superlative_quantifiers_2": 0.871, "blimp/accuracy/passive_1": 0.88, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.898, "blimp/accuracy/inchoative": 0.611, "blimp/accuracy/anaphor_gender_agreement": 0.971, "blimp/accuracy/principle_A_c_command": 0.631, "blimp/accuracy/only_npi_licensor_present": 0.723, "blimp/accuracy/expletive_it_object_raising": 0.767, "blimp/accuracy/left_branch_island_simple_question": 0.864, "blimp/accuracy/wh_questions_subject_gap": 0.952, "blimp/accuracy/existential_there_quantifiers_2": 0.508, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.945, "blimp/accuracy/sentential_negation_npi_scope": 0.709, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.838, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.912, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.908, "blimp/accuracy/principle_A_case_2": 0.931, "blimp/accuracy/distractor_agreement_relational_noun": 0.848, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.975, "blimp/accuracy/superlative_quantifiers_1": 0.924, "blimp/accuracy/wh_island": 0.769, "blimp/accuracy/principle_A_domain_1": 0.994, "blimp/accuracy/complex_NP_island": 0.586, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.969, "blimp/accuracy/irregular_past_participle_verbs": 0.885, "blimp/accuracy/drop_argument": 0.731, "blimp/accuracy/wh_questions_object_gap": 0.859, "blimp/accuracy/animate_subject_passive": 0.791, "blimp/accuracy/existential_there_quantifiers_1": 0.976, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.898, "blimp/accuracy/npi_present_2": 0.528, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.962, "blimp/accuracy/anaphor_number_agreement": 0.987, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.955, "blimp/accuracy/existential_there_object_raising": 0.825, "blimp/accuracy/matrix_question_npi_licensor_present": 0.256, "blimp/accuracy/npi_present_1": 0.572, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.515, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.957, "blimp/accuracy/causative": 0.729, "blimp/accuracy/group_average": 0.7982686567164178, "blimp/accuracy/seq_average": 0.7982686567164179, "cbt/accuracy/NE": 0.797676282051282, "cbt/accuracy/V": 0.934, "cbt/accuracy/CN": 0.8664, "cbt/accuracy/P": 0.9016, "cbt/accuracy/group_average": 0.8749190705128205, "cbt/accuracy/seq_average": 0.8749499799919968, "hellaswag/accuracy/val": 0.3333001394144593, "hellaswag/accuracy/group_average": 0.3333001394144593, "hellaswag/accuracy/seq_average": 0.3333001394144593, "piqa/accuracy/val": 0.6180631120783461, "piqa/accuracy/group_average": 0.6180631120783461, "piqa/accuracy/seq_average": 0.6180631120783461, "ai2arc/accuracy/ARC-Easy": 0.3602536997885835, "ai2arc/accuracy/ARC-Challenge": 0.2257510729613734, "ai2arc/accuracy/group_average": 0.29300238637497844, "ai2arc/accuracy/seq_average": 0.31586402266288954, "mmlu/accuracy/MMLU": 0.26642831605291384, "mmlu/accuracy/group_average": 0.26642831605291384, "mmlu/accuracy/seq_average": 0.26642831605291384, "openbookqa/accuracy/test": 0.28, "openbookqa/accuracy/group_average": 0.28, "openbookqa/accuracy/seq_average": 0.28, "race/accuracy/test/high": 0.2804459691252144, "race/accuracy/test/middle": 0.362116991643454, "race/accuracy/group_average": 0.32128148038433424, "race/accuracy/seq_average": 0.30421564653425215, "siqa/accuracy/dev": 0.3694984646878199, "siqa/accuracy/group_average": 0.3694984646878199, "siqa/accuracy/seq_average": 0.3694984646878199, "winogrande/accuracy/dev": 0.5114443567482242, "winogrande/accuracy/group_average": 0.5114443567482242, "winogrande/accuracy/seq_average": 0.5114443567482242, "commonsenseqa/accuracy/dev_rand_split": 0.26371826371826373, "commonsenseqa/accuracy/group_average": 0.26371826371826373, "commonsenseqa/accuracy/seq_average": 0.26371826371826373}
Pretrain_language_model/save/slimpajama_competesmoe_no_attmoe_660M_standardlb_down_flip/export/result-model-140000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.417517877394153, "val/accuracy": 0.5032575053553427, "val/perplexity": 11.217980327100483, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.217075892857143, "lambada/accuracy/total": 0.3223990683229814, "lambada/accuracy/openai_last_token": 0.7895962732919255, "lambada/perplexity": 8.033157605216047, "lambada/lm_loss": 2.9931819562798707, "lambada/lm_perplexity": 19.94905864044293, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.41282828683916206, "mean_loss": 2.317296885125648, "blimp/accuracy/passive_2": 0.893, "blimp/accuracy/determiner_noun_agreement_2": 0.984, "blimp/accuracy/ellipsis_n_bar_1": 0.866, "blimp/accuracy/tough_vs_raising_2": 0.908, "blimp/accuracy/tough_vs_raising_1": 0.591, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.906, "blimp/accuracy/principle_A_reconstruction": 0.498, "blimp/accuracy/wh_vs_that_with_gap": 0.516, "blimp/accuracy/principle_A_domain_2": 0.871, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.906, "blimp/accuracy/principle_A_domain_3": 0.601, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.922, "blimp/accuracy/animate_subject_trans": 0.917, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.913, "blimp/accuracy/distractor_agreement_relative_clause": 0.652, "blimp/accuracy/transitive": 0.896, "blimp/accuracy/sentential_subject_island": 0.352, "blimp/accuracy/adjunct_island": 0.899, "blimp/accuracy/intransitive": 0.77, "blimp/accuracy/existential_there_subject_raising": 0.883, "blimp/accuracy/irregular_past_participle_adjectives": 0.894, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.595, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.343, "blimp/accuracy/only_npi_scope": 0.794, "blimp/accuracy/superlative_quantifiers_2": 0.71, "blimp/accuracy/passive_1": 0.879, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.931, "blimp/accuracy/inchoative": 0.613, "blimp/accuracy/anaphor_gender_agreement": 0.979, "blimp/accuracy/principle_A_c_command": 0.618, "blimp/accuracy/only_npi_licensor_present": 0.736, "blimp/accuracy/expletive_it_object_raising": 0.752, "blimp/accuracy/left_branch_island_simple_question": 0.665, "blimp/accuracy/wh_questions_subject_gap": 0.947, "blimp/accuracy/existential_there_quantifiers_2": 0.465, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.952, "blimp/accuracy/sentential_negation_npi_scope": 0.695, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.825, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.911, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.901, "blimp/accuracy/principle_A_case_2": 0.921, "blimp/accuracy/distractor_agreement_relational_noun": 0.843, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.978, "blimp/accuracy/superlative_quantifiers_1": 0.673, "blimp/accuracy/wh_island": 0.704, "blimp/accuracy/principle_A_domain_1": 0.994, "blimp/accuracy/complex_NP_island": 0.64, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.971, "blimp/accuracy/irregular_past_participle_verbs": 0.92, "blimp/accuracy/drop_argument": 0.749, "blimp/accuracy/wh_questions_object_gap": 0.874, "blimp/accuracy/animate_subject_passive": 0.804, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.915, "blimp/accuracy/npi_present_2": 0.562, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.95, "blimp/accuracy/anaphor_number_agreement": 0.985, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.962, "blimp/accuracy/existential_there_object_raising": 0.836, "blimp/accuracy/matrix_question_npi_licensor_present": 0.324, "blimp/accuracy/npi_present_1": 0.536, "blimp/accuracy/wh_vs_that_no_gap": 0.982, "blimp/accuracy/left_branch_island_echo_question": 0.511, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974, "blimp/accuracy/causative": 0.754, "blimp/accuracy/group_average": 0.7952985074626865, "blimp/accuracy/seq_average": 0.7952985074626866, "cbt/accuracy/NE": 0.8024839743589743, "cbt/accuracy/V": 0.9336, "cbt/accuracy/CN": 0.8684, "cbt/accuracy/P": 0.9088, "cbt/accuracy/group_average": 0.8783209935897436, "cbt/accuracy/seq_average": 0.8783513405362144, "hellaswag/accuracy/val": 0.34106751643098987, "hellaswag/accuracy/group_average": 0.34106751643098987, "hellaswag/accuracy/seq_average": 0.34106751643098987, "piqa/accuracy/val": 0.6213275299238302, "piqa/accuracy/group_average": 0.6213275299238302, "piqa/accuracy/seq_average": 0.6213275299238302, "ai2arc/accuracy/ARC-Easy": 0.3627906976744186, "ai2arc/accuracy/ARC-Challenge": 0.2429184549356223, "ai2arc/accuracy/group_average": 0.30285457630502044, "ai2arc/accuracy/seq_average": 0.32322946175637396, "mmlu/accuracy/MMLU": 0.2615659635323561, "mmlu/accuracy/group_average": 0.2615659635323561, "mmlu/accuracy/seq_average": 0.2615659635323561, "openbookqa/accuracy/test": 0.272, "openbookqa/accuracy/group_average": 0.272, "openbookqa/accuracy/seq_average": 0.272, "race/accuracy/test/high": 0.2815894797026873, "race/accuracy/test/middle": 0.33774373259052926, "race/accuracy/group_average": 0.30966660614660824, "race/accuracy/seq_average": 0.2979327117957033, "siqa/accuracy/dev": 0.3669396110542477, "siqa/accuracy/group_average": 0.3669396110542477, "siqa/accuracy/seq_average": 0.3669396110542477, "winogrande/accuracy/dev": 0.516179952644041, "winogrande/accuracy/group_average": 0.516179952644041, "winogrande/accuracy/seq_average": 0.516179952644041, "commonsenseqa/accuracy/dev_rand_split": 0.2620802620802621, "commonsenseqa/accuracy/group_average": 0.2620802620802621, "commonsenseqa/accuracy/seq_average": 0.2620802620802621}
Pretrain_language_model/save/slimpajama_competesmoe_no_attmoe_660M_standardlb_down_flip/export/result-model-160000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3851849955897175, "val/accuracy": 0.5081767420614919, "val/perplexity": 10.861071729450302, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.21494440114276, "lambada/accuracy/total": 0.33482142857142855, "lambada/accuracy/openai_last_token": 0.7936723602484472, "lambada/perplexity": 7.655349224839091, "lambada/lm_loss": 2.9612883776869356, "lambada/lm_perplexity": 19.32285085511263, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4214990853164602, "mean_loss": 2.3000646983662385, "blimp/accuracy/passive_2": 0.903, "blimp/accuracy/determiner_noun_agreement_2": 0.986, "blimp/accuracy/ellipsis_n_bar_1": 0.85, "blimp/accuracy/tough_vs_raising_2": 0.857, "blimp/accuracy/tough_vs_raising_1": 0.605, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.899, "blimp/accuracy/principle_A_reconstruction": 0.383, "blimp/accuracy/wh_vs_that_with_gap": 0.486, "blimp/accuracy/principle_A_domain_2": 0.905, "blimp/accuracy/determiner_noun_agreement_1": 0.99, "blimp/accuracy/ellipsis_n_bar_2": 0.908, "blimp/accuracy/principle_A_domain_3": 0.604, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.914, "blimp/accuracy/animate_subject_trans": 0.904, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.905, "blimp/accuracy/distractor_agreement_relative_clause": 0.666, "blimp/accuracy/transitive": 0.869, "blimp/accuracy/sentential_subject_island": 0.41, "blimp/accuracy/adjunct_island": 0.875, "blimp/accuracy/intransitive": 0.772, "blimp/accuracy/existential_there_subject_raising": 0.898, "blimp/accuracy/irregular_past_participle_adjectives": 0.971, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.674, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.373, "blimp/accuracy/only_npi_scope": 0.613, "blimp/accuracy/superlative_quantifiers_2": 0.705, "blimp/accuracy/passive_1": 0.876, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.921, "blimp/accuracy/inchoative": 0.614, "blimp/accuracy/anaphor_gender_agreement": 0.97, "blimp/accuracy/principle_A_c_command": 0.698, "blimp/accuracy/only_npi_licensor_present": 0.717, "blimp/accuracy/expletive_it_object_raising": 0.782, "blimp/accuracy/left_branch_island_simple_question": 0.784, "blimp/accuracy/wh_questions_subject_gap": 0.929, "blimp/accuracy/existential_there_quantifiers_2": 0.365, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.939, "blimp/accuracy/sentential_negation_npi_scope": 0.747, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.826, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.9, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.897, "blimp/accuracy/principle_A_case_2": 0.955, "blimp/accuracy/distractor_agreement_relational_noun": 0.822, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.983, "blimp/accuracy/superlative_quantifiers_1": 0.775, "blimp/accuracy/wh_island": 0.74, "blimp/accuracy/principle_A_domain_1": 0.994, "blimp/accuracy/complex_NP_island": 0.609, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.975, "blimp/accuracy/irregular_past_participle_verbs": 0.889, "blimp/accuracy/drop_argument": 0.741, "blimp/accuracy/wh_questions_object_gap": 0.835, "blimp/accuracy/animate_subject_passive": 0.8, "blimp/accuracy/existential_there_quantifiers_1": 0.961, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.907, "blimp/accuracy/npi_present_2": 0.573, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.958, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.953, "blimp/accuracy/existential_there_object_raising": 0.829, "blimp/accuracy/matrix_question_npi_licensor_present": 0.312, "blimp/accuracy/npi_present_1": 0.547, "blimp/accuracy/wh_vs_that_no_gap": 0.976, "blimp/accuracy/left_branch_island_echo_question": 0.551, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.96, "blimp/accuracy/causative": 0.729, "blimp/accuracy/group_average": 0.7948208955223879, "blimp/accuracy/seq_average": 0.7948208955223881, "cbt/accuracy/NE": 0.8064903846153846, "cbt/accuracy/V": 0.9376, "cbt/accuracy/CN": 0.8764, "cbt/accuracy/P": 0.9164, "cbt/accuracy/group_average": 0.8842225961538461, "cbt/accuracy/seq_average": 0.8842537014805922, "hellaswag/accuracy/val": 0.34644493128858794, "hellaswag/accuracy/group_average": 0.34644493128858794, "hellaswag/accuracy/seq_average": 0.34644493128858794, "piqa/accuracy/val": 0.6082698585418934, "piqa/accuracy/group_average": 0.6082698585418934, "piqa/accuracy/seq_average": 0.6082698585418934, "ai2arc/accuracy/ARC-Easy": 0.3564482029598309, "ai2arc/accuracy/ARC-Challenge": 0.21888412017167383, "ai2arc/accuracy/group_average": 0.28766616156575237, "ai2arc/accuracy/seq_average": 0.3110481586402266, "mmlu/accuracy/MMLU": 0.267786914551305, "mmlu/accuracy/group_average": 0.267786914551305, "mmlu/accuracy/seq_average": 0.267786914551305, "openbookqa/accuracy/test": 0.288, "openbookqa/accuracy/group_average": 0.288, "openbookqa/accuracy/seq_average": 0.288, "race/accuracy/test/high": 0.2833047455688965, "race/accuracy/test/middle": 0.366991643454039, "race/accuracy/group_average": 0.32514819451146776, "race/accuracy/seq_average": 0.30766112687474667, "siqa/accuracy/dev": 0.36745138178096215, "siqa/accuracy/group_average": 0.36745138178096215, "siqa/accuracy/seq_average": 0.36745138178096215, "winogrande/accuracy/dev": 0.5090765588003157, "winogrande/accuracy/group_average": 0.5090765588003157, "winogrande/accuracy/seq_average": 0.5090765588003157, "commonsenseqa/accuracy/dev_rand_split": 0.2588042588042588, "commonsenseqa/accuracy/group_average": 0.2588042588042588, "commonsenseqa/accuracy/seq_average": 0.2588042588042588}
Pretrain_language_model/save/slimpajama_competesmoe_no_attmoe_660M_standardlb_down_flip/export/result-model-180000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.378108855216734, "val/accuracy": 0.5088865218623992, "val/perplexity": 10.784488537554052, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.2516739175926825, "lambada/accuracy/total": 0.34646739130434784, "lambada/accuracy/openai_last_token": 0.796001552795031, "lambada/perplexity": 7.44671847155186, "lambada/lm_loss": 2.9623079937057764, "lambada/lm_perplexity": 19.342562790969215, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.42767695658337357, "mean_loss": 2.3148913864047085, "blimp/accuracy/passive_2": 0.912, "blimp/accuracy/determiner_noun_agreement_2": 0.98, "blimp/accuracy/ellipsis_n_bar_1": 0.857, "blimp/accuracy/tough_vs_raising_2": 0.865, "blimp/accuracy/tough_vs_raising_1": 0.603, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.909, "blimp/accuracy/principle_A_reconstruction": 0.329, "blimp/accuracy/wh_vs_that_with_gap": 0.42, "blimp/accuracy/principle_A_domain_2": 0.882, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.908, "blimp/accuracy/principle_A_domain_3": 0.592, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.921, "blimp/accuracy/animate_subject_trans": 0.9, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.914, "blimp/accuracy/distractor_agreement_relative_clause": 0.681, "blimp/accuracy/transitive": 0.892, "blimp/accuracy/sentential_subject_island": 0.378, "blimp/accuracy/adjunct_island": 0.886, "blimp/accuracy/intransitive": 0.743, "blimp/accuracy/existential_there_subject_raising": 0.883, "blimp/accuracy/irregular_past_participle_adjectives": 0.947, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.733, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.336, "blimp/accuracy/only_npi_scope": 0.719, "blimp/accuracy/superlative_quantifiers_2": 0.818, "blimp/accuracy/passive_1": 0.896, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.938, "blimp/accuracy/inchoative": 0.591, "blimp/accuracy/anaphor_gender_agreement": 0.965, "blimp/accuracy/principle_A_c_command": 0.683, "blimp/accuracy/only_npi_licensor_present": 0.671, "blimp/accuracy/expletive_it_object_raising": 0.764, "blimp/accuracy/left_branch_island_simple_question": 0.844, "blimp/accuracy/wh_questions_subject_gap": 0.951, "blimp/accuracy/existential_there_quantifiers_2": 0.518, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.943, "blimp/accuracy/sentential_negation_npi_scope": 0.756, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.821, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.927, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.903, "blimp/accuracy/principle_A_case_2": 0.905, "blimp/accuracy/distractor_agreement_relational_noun": 0.827, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.984, "blimp/accuracy/superlative_quantifiers_1": 0.631, "blimp/accuracy/wh_island": 0.754, "blimp/accuracy/principle_A_domain_1": 0.996, "blimp/accuracy/complex_NP_island": 0.613, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.97, "blimp/accuracy/irregular_past_participle_verbs": 0.899, "blimp/accuracy/drop_argument": 0.724, "blimp/accuracy/wh_questions_object_gap": 0.853, "blimp/accuracy/animate_subject_passive": 0.797, "blimp/accuracy/existential_there_quantifiers_1": 0.985, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.882, "blimp/accuracy/npi_present_2": 0.546, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.94, "blimp/accuracy/anaphor_number_agreement": 0.99, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.959, "blimp/accuracy/existential_there_object_raising": 0.885, "blimp/accuracy/matrix_question_npi_licensor_present": 0.398, "blimp/accuracy/npi_present_1": 0.558, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.485, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.759, "blimp/accuracy/group_average": 0.7978955223880596, "blimp/accuracy/seq_average": 0.7978955223880597, "cbt/accuracy/NE": 0.8084935897435898, "cbt/accuracy/V": 0.9356, "cbt/accuracy/CN": 0.8772, "cbt/accuracy/P": 0.92, "cbt/accuracy/group_average": 0.8853233974358974, "cbt/accuracy/seq_average": 0.8853541416566627, "hellaswag/accuracy/val": 0.34714200358494324, "hellaswag/accuracy/group_average": 0.34714200358494324, "hellaswag/accuracy/seq_average": 0.34714200358494324, "piqa/accuracy/val": 0.6305767138193689, "piqa/accuracy/group_average": 0.6305767138193689, "piqa/accuracy/seq_average": 0.6305767138193689, "ai2arc/accuracy/ARC-Easy": 0.36236786469344606, "ai2arc/accuracy/ARC-Challenge": 0.23261802575107296, "ai2arc/accuracy/group_average": 0.2974929452222595, "ai2arc/accuracy/seq_average": 0.31954674220963175, "mmlu/accuracy/MMLU": 0.262567036110118, "mmlu/accuracy/group_average": 0.262567036110118, "mmlu/accuracy/seq_average": 0.262567036110118, "openbookqa/accuracy/test": 0.276, "openbookqa/accuracy/group_average": 0.276, "openbookqa/accuracy/seq_average": 0.276, "race/accuracy/test/high": 0.2858776443682104, "race/accuracy/test/middle": 0.36629526462395545, "race/accuracy/group_average": 0.3260864544960829, "race/accuracy/seq_average": 0.30928252938792056, "siqa/accuracy/dev": 0.3710337768679631, "siqa/accuracy/group_average": 0.3710337768679631, "siqa/accuracy/seq_average": 0.3710337768679631, "winogrande/accuracy/dev": 0.4964483030781373, "winogrande/accuracy/group_average": 0.4964483030781373, "winogrande/accuracy/seq_average": 0.4964483030781373, "commonsenseqa/accuracy/dev_rand_split": 0.26371826371826373, "commonsenseqa/accuracy/group_average": 0.26371826371826373, "commonsenseqa/accuracy/seq_average": 0.26371826371826373}
Pretrain_language_model/save/slimpajama_competesmoe_no_attmoe_660M_standardlb_down_flip/export/result-model-200000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.3618279733965473, "val/accuracy": 0.511565177671371, "val/perplexity": 10.610329135481647, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.249894231002523, "lambada/accuracy/total": 0.34607919254658387, "lambada/accuracy/openai_last_token": 0.7901785714285714, "lambada/perplexity": 7.497578859131717, "lambada/lm_loss": 2.9572677957436464, "lambada/lm_perplexity": 19.245317718480347, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4288221851089774, "mean_loss": 2.305861102199535, "blimp/accuracy/passive_2": 0.904, "blimp/accuracy/determiner_noun_agreement_2": 0.983, "blimp/accuracy/ellipsis_n_bar_1": 0.858, "blimp/accuracy/tough_vs_raising_2": 0.857, "blimp/accuracy/tough_vs_raising_1": 0.572, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.911, "blimp/accuracy/principle_A_reconstruction": 0.429, "blimp/accuracy/wh_vs_that_with_gap": 0.504, "blimp/accuracy/principle_A_domain_2": 0.894, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.896, "blimp/accuracy/principle_A_domain_3": 0.622, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.922, "blimp/accuracy/animate_subject_trans": 0.906, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.929, "blimp/accuracy/distractor_agreement_relative_clause": 0.705, "blimp/accuracy/transitive": 0.865, "blimp/accuracy/sentential_subject_island": 0.386, "blimp/accuracy/adjunct_island": 0.879, "blimp/accuracy/intransitive": 0.729, "blimp/accuracy/existential_there_subject_raising": 0.885, "blimp/accuracy/irregular_past_participle_adjectives": 0.973, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.756, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.369, "blimp/accuracy/only_npi_scope": 0.672, "blimp/accuracy/superlative_quantifiers_2": 0.788, "blimp/accuracy/passive_1": 0.895, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.915, "blimp/accuracy/inchoative": 0.588, "blimp/accuracy/anaphor_gender_agreement": 0.973, "blimp/accuracy/principle_A_c_command": 0.688, "blimp/accuracy/only_npi_licensor_present": 0.981, "blimp/accuracy/expletive_it_object_raising": 0.763, "blimp/accuracy/left_branch_island_simple_question": 0.863, "blimp/accuracy/wh_questions_subject_gap": 0.94, "blimp/accuracy/existential_there_quantifiers_2": 0.521, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.95, "blimp/accuracy/sentential_negation_npi_scope": 0.738, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.844, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.895, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.905, "blimp/accuracy/principle_A_case_2": 0.94, "blimp/accuracy/distractor_agreement_relational_noun": 0.857, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.969, "blimp/accuracy/superlative_quantifiers_1": 0.68, "blimp/accuracy/wh_island": 0.732, "blimp/accuracy/principle_A_domain_1": 0.995, "blimp/accuracy/complex_NP_island": 0.642, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.966, "blimp/accuracy/irregular_past_participle_verbs": 0.935, "blimp/accuracy/drop_argument": 0.719, "blimp/accuracy/wh_questions_object_gap": 0.864, "blimp/accuracy/animate_subject_passive": 0.784, "blimp/accuracy/existential_there_quantifiers_1": 0.979, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.888, "blimp/accuracy/npi_present_2": 0.573, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.946, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.962, "blimp/accuracy/existential_there_object_raising": 0.867, "blimp/accuracy/matrix_question_npi_licensor_present": 0.389, "blimp/accuracy/npi_present_1": 0.583, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.556, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.956, "blimp/accuracy/causative": 0.766, "blimp/accuracy/group_average": 0.8084477611940298, "blimp/accuracy/seq_average": 0.8084477611940298, "cbt/accuracy/NE": 0.8161057692307693, "cbt/accuracy/V": 0.9356, "cbt/accuracy/CN": 0.878, "cbt/accuracy/P": 0.9184, "cbt/accuracy/group_average": 0.8870264423076923, "cbt/accuracy/seq_average": 0.8870548219287715, "hellaswag/accuracy/val": 0.3549093806014738, "hellaswag/accuracy/group_average": 0.3549093806014738, "hellaswag/accuracy/seq_average": 0.3549093806014738, "piqa/accuracy/val": 0.6245919477693145, "piqa/accuracy/group_average": 0.6245919477693145, "piqa/accuracy/seq_average": 0.6245919477693145, "ai2arc/accuracy/ARC-Easy": 0.3657505285412262, "ai2arc/accuracy/ARC-Challenge": 0.22832618025751072, "ai2arc/accuracy/group_average": 0.29703835439936843, "ai2arc/accuracy/seq_average": 0.32039660056657226, "mmlu/accuracy/MMLU": 0.26399713979263495, "mmlu/accuracy/group_average": 0.26399713979263495, "mmlu/accuracy/seq_average": 0.26399713979263495, "openbookqa/accuracy/test": 0.284, "openbookqa/accuracy/group_average": 0.284, "openbookqa/accuracy/seq_average": 0.284, "race/accuracy/test/high": 0.2833047455688965, "race/accuracy/test/middle": 0.35863509749303624, "race/accuracy/group_average": 0.3209699215309664, "race/accuracy/seq_average": 0.3052290231049858, "siqa/accuracy/dev": 0.3705220061412487, "siqa/accuracy/group_average": 0.3705220061412487, "siqa/accuracy/seq_average": 0.3705220061412487, "winogrande/accuracy/dev": 0.4996053670086819, "winogrande/accuracy/group_average": 0.4996053670086819, "winogrande/accuracy/seq_average": 0.4996053670086819, "commonsenseqa/accuracy/dev_rand_split": 0.266994266994267, "commonsenseqa/accuracy/group_average": 0.266994266994267, "commonsenseqa/accuracy/seq_average": 0.266994266994267}
Pretrain_language_model/save/slimpajama_competesmoe_no_attmoe_660M_standardlb_down_flip/export/result-model-360000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2778657482516382, "val/accuracy": 0.5237318469632056, "val/perplexity": 9.755836762978193, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.152370666124806, "lambada/accuracy/total": 0.38548136645962733, "lambada/accuracy/openai_last_token": 0.8049301242236024, "lambada/perplexity": 6.364111971040649, "lambada/lm_loss": 2.8554696257618963, "lambada/lm_perplexity": 17.38259860721917, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4546066067114165, "mean_loss": 2.215118207188222, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.983, "blimp/accuracy/ellipsis_n_bar_1": 0.866, "blimp/accuracy/tough_vs_raising_2": 0.906, "blimp/accuracy/tough_vs_raising_1": 0.573, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.886, "blimp/accuracy/principle_A_reconstruction": 0.474, "blimp/accuracy/wh_vs_that_with_gap": 0.448, "blimp/accuracy/principle_A_domain_2": 0.905, "blimp/accuracy/determiner_noun_agreement_1": 0.995, "blimp/accuracy/ellipsis_n_bar_2": 0.904, "blimp/accuracy/principle_A_domain_3": 0.632, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.934, "blimp/accuracy/animate_subject_trans": 0.904, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.927, "blimp/accuracy/distractor_agreement_relative_clause": 0.676, "blimp/accuracy/transitive": 0.878, "blimp/accuracy/sentential_subject_island": 0.33, "blimp/accuracy/adjunct_island": 0.876, "blimp/accuracy/intransitive": 0.746, "blimp/accuracy/existential_there_subject_raising": 0.887, "blimp/accuracy/irregular_past_participle_adjectives": 0.929, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.749, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.366, "blimp/accuracy/only_npi_scope": 0.655, "blimp/accuracy/superlative_quantifiers_2": 0.845, "blimp/accuracy/passive_1": 0.896, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.913, "blimp/accuracy/inchoative": 0.612, "blimp/accuracy/anaphor_gender_agreement": 0.98, "blimp/accuracy/principle_A_c_command": 0.719, "blimp/accuracy/only_npi_licensor_present": 0.707, "blimp/accuracy/expletive_it_object_raising": 0.77, "blimp/accuracy/left_branch_island_simple_question": 0.835, "blimp/accuracy/wh_questions_subject_gap": 0.937, "blimp/accuracy/existential_there_quantifiers_2": 0.506, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.946, "blimp/accuracy/sentential_negation_npi_scope": 0.75, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.825, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.901, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.892, "blimp/accuracy/principle_A_case_2": 0.922, "blimp/accuracy/distractor_agreement_relational_noun": 0.866, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.974, "blimp/accuracy/superlative_quantifiers_1": 0.823, "blimp/accuracy/wh_island": 0.736, "blimp/accuracy/principle_A_domain_1": 0.997, "blimp/accuracy/complex_NP_island": 0.613, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.974, "blimp/accuracy/irregular_past_participle_verbs": 0.923, "blimp/accuracy/drop_argument": 0.722, "blimp/accuracy/wh_questions_object_gap": 0.858, "blimp/accuracy/animate_subject_passive": 0.795, "blimp/accuracy/existential_there_quantifiers_1": 0.978, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.89, "blimp/accuracy/npi_present_2": 0.583, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.962, "blimp/accuracy/anaphor_number_agreement": 0.991, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.966, "blimp/accuracy/existential_there_object_raising": 0.831, "blimp/accuracy/matrix_question_npi_licensor_present": 0.426, "blimp/accuracy/npi_present_1": 0.577, "blimp/accuracy/wh_vs_that_no_gap": 0.98, "blimp/accuracy/left_branch_island_echo_question": 0.578, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.958, "blimp/accuracy/causative": 0.752, "blimp/accuracy/group_average": 0.8067462686567164, "blimp/accuracy/seq_average": 0.8067462686567164, "cbt/accuracy/NE": 0.8217147435897436, "cbt/accuracy/V": 0.9432, "cbt/accuracy/CN": 0.8892, "cbt/accuracy/P": 0.9212, "cbt/accuracy/group_average": 0.8938286858974358, "cbt/accuracy/seq_average": 0.8938575430172069, "hellaswag/accuracy/val": 0.3732324238199562, "hellaswag/accuracy/group_average": 0.3732324238199562, "hellaswag/accuracy/seq_average": 0.3732324238199562, "piqa/accuracy/val": 0.6349292709466812, "piqa/accuracy/group_average": 0.6349292709466812, "piqa/accuracy/seq_average": 0.6349292709466812, "ai2arc/accuracy/ARC-Easy": 0.38520084566596197, "ai2arc/accuracy/ARC-Challenge": 0.23090128755364808, "ai2arc/accuracy/group_average": 0.308051066609805, "ai2arc/accuracy/seq_average": 0.3342776203966006, "mmlu/accuracy/MMLU": 0.2621380050053629, "mmlu/accuracy/group_average": 0.2621380050053629, "mmlu/accuracy/seq_average": 0.2621380050053629, "openbookqa/accuracy/test": 0.284, "openbookqa/accuracy/group_average": 0.284, "openbookqa/accuracy/seq_average": 0.284, "race/accuracy/test/high": 0.2890222984562607, "race/accuracy/test/middle": 0.3725626740947075, "race/accuracy/group_average": 0.3307924862754841, "race/accuracy/seq_average": 0.3133360356708553, "siqa/accuracy/dev": 0.37871033776867963, "siqa/accuracy/group_average": 0.37871033776867963, "siqa/accuracy/seq_average": 0.37871033776867963, "winogrande/accuracy/dev": 0.5035516969218626, "winogrande/accuracy/group_average": 0.5035516969218626, "winogrande/accuracy/seq_average": 0.5035516969218626, "commonsenseqa/accuracy/dev_rand_split": 0.27682227682227684, "commonsenseqa/accuracy/group_average": 0.27682227682227684, "commonsenseqa/accuracy/seq_average": 0.27682227682227684}
Pretrain_language_model/save/slimpajama_competesmoe_no_attmoe_660M_standardlb_down_flip/export/result-model-380000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2734333161384828, "val/accuracy": 0.5244495022681451, "val/perplexity": 9.712690371184987, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.161026664402174, "lambada/accuracy/total": 0.3825698757763975, "lambada/accuracy/openai_last_token": 0.8060947204968945, "lambada/perplexity": 6.370929817950676, "lambada/lm_loss": 2.8524791362414432, "lambada/lm_perplexity": 17.330693777353112, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4535096890222713, "mean_loss": 2.2172299902703285, "blimp/accuracy/passive_2": 0.91, "blimp/accuracy/determiner_noun_agreement_2": 0.989, "blimp/accuracy/ellipsis_n_bar_1": 0.867, "blimp/accuracy/tough_vs_raising_2": 0.88, "blimp/accuracy/tough_vs_raising_1": 0.568, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.891, "blimp/accuracy/principle_A_reconstruction": 0.412, "blimp/accuracy/wh_vs_that_with_gap": 0.415, "blimp/accuracy/principle_A_domain_2": 0.889, "blimp/accuracy/determiner_noun_agreement_1": 0.992, "blimp/accuracy/ellipsis_n_bar_2": 0.905, "blimp/accuracy/principle_A_domain_3": 0.612, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.93, "blimp/accuracy/animate_subject_trans": 0.905, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.924, "blimp/accuracy/distractor_agreement_relative_clause": 0.709, "blimp/accuracy/transitive": 0.892, "blimp/accuracy/sentential_subject_island": 0.335, "blimp/accuracy/adjunct_island": 0.861, "blimp/accuracy/intransitive": 0.761, "blimp/accuracy/existential_there_subject_raising": 0.893, "blimp/accuracy/irregular_past_participle_adjectives": 0.976, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.748, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.339, "blimp/accuracy/only_npi_scope": 0.718, "blimp/accuracy/superlative_quantifiers_2": 0.837, "blimp/accuracy/passive_1": 0.896, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.932, "blimp/accuracy/inchoative": 0.613, "blimp/accuracy/anaphor_gender_agreement": 0.983, "blimp/accuracy/principle_A_c_command": 0.702, "blimp/accuracy/only_npi_licensor_present": 0.732, "blimp/accuracy/expletive_it_object_raising": 0.766, "blimp/accuracy/left_branch_island_simple_question": 0.839, "blimp/accuracy/wh_questions_subject_gap": 0.938, "blimp/accuracy/existential_there_quantifiers_2": 0.463, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.94, "blimp/accuracy/sentential_negation_npi_scope": 0.739, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.821, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.899, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.903, "blimp/accuracy/principle_A_case_2": 0.92, "blimp/accuracy/distractor_agreement_relational_noun": 0.852, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.973, "blimp/accuracy/superlative_quantifiers_1": 0.87, "blimp/accuracy/wh_island": 0.741, "blimp/accuracy/principle_A_domain_1": 0.995, "blimp/accuracy/complex_NP_island": 0.606, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.975, "blimp/accuracy/irregular_past_participle_verbs": 0.951, "blimp/accuracy/drop_argument": 0.727, "blimp/accuracy/wh_questions_object_gap": 0.866, "blimp/accuracy/animate_subject_passive": 0.798, "blimp/accuracy/existential_there_quantifiers_1": 0.963, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.894, "blimp/accuracy/npi_present_2": 0.566, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.962, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.966, "blimp/accuracy/existential_there_object_raising": 0.843, "blimp/accuracy/matrix_question_npi_licensor_present": 0.405, "blimp/accuracy/npi_present_1": 0.544, "blimp/accuracy/wh_vs_that_no_gap": 0.983, "blimp/accuracy/left_branch_island_echo_question": 0.515, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.965, "blimp/accuracy/causative": 0.756, "blimp/accuracy/group_average": 0.8052537313432835, "blimp/accuracy/seq_average": 0.8052537313432836, "cbt/accuracy/NE": 0.8205128205128205, "cbt/accuracy/V": 0.9424, "cbt/accuracy/CN": 0.89, "cbt/accuracy/P": 0.9236, "cbt/accuracy/group_average": 0.8941282051282051, "cbt/accuracy/seq_average": 0.894157663065226, "hellaswag/accuracy/val": 0.3738299143596893, "hellaswag/accuracy/group_average": 0.3738299143596893, "hellaswag/accuracy/seq_average": 0.3738299143596893, "piqa/accuracy/val": 0.6365614798694232, "piqa/accuracy/group_average": 0.6365614798694232, "piqa/accuracy/seq_average": 0.6365614798694232, "ai2arc/accuracy/ARC-Easy": 0.3864693446088795, "ai2arc/accuracy/ARC-Challenge": 0.23605150214592274, "ai2arc/accuracy/group_average": 0.3112604233774011, "ai2arc/accuracy/seq_average": 0.33682719546742207, "mmlu/accuracy/MMLU": 0.262567036110118, "mmlu/accuracy/group_average": 0.262567036110118, "mmlu/accuracy/seq_average": 0.262567036110118, "openbookqa/accuracy/test": 0.278, "openbookqa/accuracy/group_average": 0.278, "openbookqa/accuracy/seq_average": 0.278, "race/accuracy/test/high": 0.29130931961120643, "race/accuracy/test/middle": 0.36908077994428967, "race/accuracy/group_average": 0.33019504977774805, "race/accuracy/seq_average": 0.3139440616132955, "siqa/accuracy/dev": 0.3751279426816786, "siqa/accuracy/group_average": 0.3751279426816786, "siqa/accuracy/seq_average": 0.3751279426816786, "winogrande/accuracy/dev": 0.5082872928176796, "winogrande/accuracy/group_average": 0.5082872928176796, "winogrande/accuracy/seq_average": 0.5082872928176796, "commonsenseqa/accuracy/dev_rand_split": 0.2719082719082719, "commonsenseqa/accuracy/group_average": 0.2719082719082719, "commonsenseqa/accuracy/seq_average": 0.2719082719082719}
Pretrain_language_model/save/slimpajama_competesmoe_no_attmoe_660M_standardlb_down_flip/export/result-model-400000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.2700729370117188, "val/accuracy": 0.5251592820690524, "val/perplexity": 9.680106826390348, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.1618758491847827, "lambada/accuracy/total": 0.36995341614906835, "lambada/accuracy/openai_last_token": 0.8022127329192547, "lambada/perplexity": 6.4895831753911715, "lambada/lm_loss": 2.8522329407475544, "lambada/lm_perplexity": 17.326427563821984, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.4475563491090604, "mean_loss": 2.215974393098251, "blimp/accuracy/passive_2": 0.916, "blimp/accuracy/determiner_noun_agreement_2": 0.99, "blimp/accuracy/ellipsis_n_bar_1": 0.867, "blimp/accuracy/tough_vs_raising_2": 0.881, "blimp/accuracy/tough_vs_raising_1": 0.589, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.892, "blimp/accuracy/principle_A_reconstruction": 0.402, "blimp/accuracy/wh_vs_that_with_gap": 0.445, "blimp/accuracy/principle_A_domain_2": 0.898, "blimp/accuracy/determiner_noun_agreement_1": 0.994, "blimp/accuracy/ellipsis_n_bar_2": 0.908, "blimp/accuracy/principle_A_domain_3": 0.64, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.941, "blimp/accuracy/animate_subject_trans": 0.909, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.933, "blimp/accuracy/distractor_agreement_relative_clause": 0.662, "blimp/accuracy/transitive": 0.9, "blimp/accuracy/sentential_subject_island": 0.321, "blimp/accuracy/adjunct_island": 0.875, "blimp/accuracy/intransitive": 0.749, "blimp/accuracy/existential_there_subject_raising": 0.895, "blimp/accuracy/irregular_past_participle_adjectives": 0.949, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.768, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.369, "blimp/accuracy/only_npi_scope": 0.686, "blimp/accuracy/superlative_quantifiers_2": 0.813, "blimp/accuracy/passive_1": 0.903, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.916, "blimp/accuracy/inchoative": 0.616, "blimp/accuracy/anaphor_gender_agreement": 0.979, "blimp/accuracy/principle_A_c_command": 0.677, "blimp/accuracy/only_npi_licensor_present": 0.723, "blimp/accuracy/expletive_it_object_raising": 0.758, "blimp/accuracy/left_branch_island_simple_question": 0.86, "blimp/accuracy/wh_questions_subject_gap": 0.935, "blimp/accuracy/existential_there_quantifiers_2": 0.431, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.948, "blimp/accuracy/sentential_negation_npi_scope": 0.752, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.825, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.901, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.886, "blimp/accuracy/principle_A_case_2": 0.936, "blimp/accuracy/distractor_agreement_relational_noun": 0.833, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.978, "blimp/accuracy/superlative_quantifiers_1": 0.786, "blimp/accuracy/wh_island": 0.774, "blimp/accuracy/principle_A_domain_1": 0.993, "blimp/accuracy/complex_NP_island": 0.61, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.972, "blimp/accuracy/irregular_past_participle_verbs": 0.93, "blimp/accuracy/drop_argument": 0.731, "blimp/accuracy/wh_questions_object_gap": 0.87, "blimp/accuracy/animate_subject_passive": 0.804, "blimp/accuracy/existential_there_quantifiers_1": 0.976, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.888, "blimp/accuracy/npi_present_2": 0.572, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.965, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.967, "blimp/accuracy/existential_there_object_raising": 0.854, "blimp/accuracy/matrix_question_npi_licensor_present": 0.451, "blimp/accuracy/npi_present_1": 0.556, "blimp/accuracy/wh_vs_that_no_gap": 0.986, "blimp/accuracy/left_branch_island_echo_question": 0.545, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.962, "blimp/accuracy/causative": 0.751, "blimp/accuracy/group_average": 0.8057313432835821, "blimp/accuracy/seq_average": 0.8057313432835821, "cbt/accuracy/NE": 0.8253205128205128, "cbt/accuracy/V": 0.9416, "cbt/accuracy/CN": 0.8896, "cbt/accuracy/P": 0.9248, "cbt/accuracy/group_average": 0.8953301282051281, "cbt/accuracy/seq_average": 0.8953581432573029, "hellaswag/accuracy/val": 0.3741286596295559, "hellaswag/accuracy/group_average": 0.3741286596295559, "hellaswag/accuracy/seq_average": 0.3741286596295559, "piqa/accuracy/val": 0.6398258977149075, "piqa/accuracy/group_average": 0.6398258977149075, "piqa/accuracy/seq_average": 0.6398258977149075, "ai2arc/accuracy/ARC-Easy": 0.3864693446088795, "ai2arc/accuracy/ARC-Challenge": 0.23605150214592274, "ai2arc/accuracy/group_average": 0.3112604233774011, "ai2arc/accuracy/seq_average": 0.33682719546742207, "mmlu/accuracy/MMLU": 0.2671433678941723, "mmlu/accuracy/group_average": 0.2671433678941723, "mmlu/accuracy/seq_average": 0.2671433678941723, "openbookqa/accuracy/test": 0.28, "openbookqa/accuracy/group_average": 0.28, "openbookqa/accuracy/seq_average": 0.28, "race/accuracy/test/high": 0.2935963407661521, "race/accuracy/test/middle": 0.36908077994428967, "race/accuracy/group_average": 0.3313385603552209, "race/accuracy/seq_average": 0.3155654641264694, "siqa/accuracy/dev": 0.3766632548618219, "siqa/accuracy/group_average": 0.3766632548618219, "siqa/accuracy/seq_average": 0.3766632548618219, "winogrande/accuracy/dev": 0.5130228887134964, "winogrande/accuracy/group_average": 0.5130228887134964, "winogrande/accuracy/seq_average": 0.5130228887134964, "commonsenseqa/accuracy/dev_rand_split": 0.26945126945126946, "commonsenseqa/accuracy/group_average": 0.26945126945126946, "commonsenseqa/accuracy/seq_average": 0.26945126945126946}