Upload folder using huggingface_hub

#309
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_highlb_shared_only/export/result-model-10000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 3.0176837255084323, "val/accuracy": 0.42397925967261907, "val/perplexity": 20.443883154745357, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.641845703125, "lambada/accuracy/total": 0.16459627329192547, "lambada/accuracy/openai_last_token": 0.7154503105590062, "lambada/perplexity": 24.858075427380676, "lambada/lm_loss": 3.5497515043832446, "lambada/lm_perplexity": 34.804667605575034, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.2942877664822723, "mean_loss": 2.829764714316716, "blimp/accuracy/passive_2": 0.868, "blimp/accuracy/determiner_noun_agreement_2": 0.974, "blimp/accuracy/ellipsis_n_bar_1": 0.744, "blimp/accuracy/tough_vs_raising_2": 0.844, "blimp/accuracy/tough_vs_raising_1": 0.556, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.872, "blimp/accuracy/principle_A_reconstruction": 0.447, "blimp/accuracy/wh_vs_that_with_gap": 0.543, "blimp/accuracy/principle_A_domain_2": 0.824, "blimp/accuracy/determiner_noun_agreement_1": 0.969, "blimp/accuracy/ellipsis_n_bar_2": 0.867, "blimp/accuracy/principle_A_domain_3": 0.52, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.884, "blimp/accuracy/animate_subject_trans": 0.881, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.763, "blimp/accuracy/distractor_agreement_relative_clause": 0.52, "blimp/accuracy/transitive": 0.792, "blimp/accuracy/sentential_subject_island": 0.394, "blimp/accuracy/adjunct_island": 0.71, "blimp/accuracy/intransitive": 0.714, "blimp/accuracy/existential_there_subject_raising": 0.839, "blimp/accuracy/irregular_past_participle_adjectives": 0.944, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.143, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.241, "blimp/accuracy/only_npi_scope": 0.584, "blimp/accuracy/superlative_quantifiers_2": 0.625, "blimp/accuracy/passive_1": 0.888, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.87, "blimp/accuracy/inchoative": 0.571, "blimp/accuracy/anaphor_gender_agreement": 0.904, "blimp/accuracy/principle_A_c_command": 0.525, "blimp/accuracy/only_npi_licensor_present": 0.718, "blimp/accuracy/expletive_it_object_raising": 0.758, "blimp/accuracy/left_branch_island_simple_question": 0.117, "blimp/accuracy/wh_questions_subject_gap": 0.882, "blimp/accuracy/existential_there_quantifiers_2": 0.367, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.912, "blimp/accuracy/sentential_negation_npi_scope": 0.377, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.737, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.898, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.816, "blimp/accuracy/principle_A_case_2": 0.944, "blimp/accuracy/distractor_agreement_relational_noun": 0.722, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.992, "blimp/accuracy/superlative_quantifiers_1": 0.694, "blimp/accuracy/wh_island": 0.752, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.478, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.953, "blimp/accuracy/irregular_past_participle_verbs": 0.829, "blimp/accuracy/drop_argument": 0.735, "blimp/accuracy/wh_questions_object_gap": 0.688, "blimp/accuracy/animate_subject_passive": 0.756, "blimp/accuracy/existential_there_quantifiers_1": 0.974, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.861, "blimp/accuracy/npi_present_2": 0.479, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.834, "blimp/accuracy/anaphor_number_agreement": 0.971, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.917, "blimp/accuracy/existential_there_object_raising": 0.755, "blimp/accuracy/matrix_question_npi_licensor_present": 0.134, "blimp/accuracy/npi_present_1": 0.47, "blimp/accuracy/wh_vs_that_no_gap": 0.924, "blimp/accuracy/left_branch_island_echo_question": 0.331, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.936, "blimp/accuracy/causative": 0.649, "blimp/accuracy/group_average": 0.7189104477611943, "blimp/accuracy/seq_average": 0.7189104477611941, "cbt/accuracy/NE": 0.7047275641025641, "cbt/accuracy/V": 0.8612, "cbt/accuracy/CN": 0.7372, "cbt/accuracy/P": 0.8408, "cbt/accuracy/group_average": 0.7859818910256411, "cbt/accuracy/seq_average": 0.786014405762305, "hellaswag/accuracy/val": 0.2673770165305716, "hellaswag/accuracy/group_average": 0.2673770165305716, "hellaswag/accuracy/seq_average": 0.2673770165305716, "piqa/accuracy/val": 0.5554951033732318, "piqa/accuracy/group_average": 0.5554951033732318, "piqa/accuracy/seq_average": 0.5554951033732318, "ai2arc/accuracy/ARC-Easy": 0.30274841437632133, "ai2arc/accuracy/ARC-Challenge": 0.2034334763948498, "ai2arc/accuracy/group_average": 0.2530909453855856, "ai2arc/accuracy/seq_average": 0.26997167138810196, "mmlu/accuracy/MMLU": 0.259706828745084, "mmlu/accuracy/group_average": 0.259706828745084, "mmlu/accuracy/seq_average": 0.259706828745084, "openbookqa/accuracy/test": 0.266, "openbookqa/accuracy/group_average": 0.266, "openbookqa/accuracy/seq_average": 0.266, "race/accuracy/test/high": 0.2538593481989708, "race/accuracy/test/middle": 0.3231197771587744, "race/accuracy/group_average": 0.2884895626788726, "race/accuracy/seq_average": 0.27401702472638834, "siqa/accuracy/dev": 0.35516888433981575, "siqa/accuracy/group_average": 0.35516888433981575, "siqa/accuracy/seq_average": 0.35516888433981575, "winogrande/accuracy/dev": 0.5193370165745856, "winogrande/accuracy/group_average": 0.5193370165745856, "winogrande/accuracy/seq_average": 0.5193370165745856, "commonsenseqa/accuracy/dev_rand_split": 0.24733824733824733, "commonsenseqa/accuracy/group_average": 0.24733824733824733, "commonsenseqa/accuracy/seq_average": 0.24733824733824733}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_highlb_shared_only/export/result-model-100000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.60186767578125, "val/accuracy": 0.48101709759424605, "val/perplexity": 13.488907429272462, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6041343167701863, "lambada/accuracy/total": 0.2529114906832298, "lambada/accuracy/openai_last_token": 0.7620341614906833, "lambada/perplexity": 11.883611843756697, "lambada/lm_loss": 3.161807365611224, "lambada/lm_perplexity": 23.61323513429599, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.36696429413873793, "mean_loss": 2.603000996275718, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.981, "blimp/accuracy/ellipsis_n_bar_1": 0.783, "blimp/accuracy/tough_vs_raising_2": 0.889, "blimp/accuracy/tough_vs_raising_1": 0.552, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.913, "blimp/accuracy/principle_A_reconstruction": 0.217, "blimp/accuracy/wh_vs_that_with_gap": 0.52, "blimp/accuracy/principle_A_domain_2": 0.836, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.906, "blimp/accuracy/principle_A_domain_3": 0.565, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.916, "blimp/accuracy/animate_subject_trans": 0.901, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.89, "blimp/accuracy/distractor_agreement_relative_clause": 0.669, "blimp/accuracy/transitive": 0.876, "blimp/accuracy/sentential_subject_island": 0.36, "blimp/accuracy/adjunct_island": 0.842, "blimp/accuracy/intransitive": 0.792, "blimp/accuracy/existential_there_subject_raising": 0.855, "blimp/accuracy/irregular_past_participle_adjectives": 0.994, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.455, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.24, "blimp/accuracy/only_npi_scope": 0.775, "blimp/accuracy/superlative_quantifiers_2": 0.735, "blimp/accuracy/passive_1": 0.906, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.927, "blimp/accuracy/inchoative": 0.639, "blimp/accuracy/anaphor_gender_agreement": 0.948, "blimp/accuracy/principle_A_c_command": 0.684, "blimp/accuracy/only_npi_licensor_present": 0.601, "blimp/accuracy/expletive_it_object_raising": 0.797, "blimp/accuracy/left_branch_island_simple_question": 0.491, "blimp/accuracy/wh_questions_subject_gap": 0.932, "blimp/accuracy/existential_there_quantifiers_2": 0.56, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.929, "blimp/accuracy/sentential_negation_npi_scope": 0.56, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.774, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.886, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.906, "blimp/accuracy/principle_A_case_2": 0.955, "blimp/accuracy/distractor_agreement_relational_noun": 0.834, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.992, "blimp/accuracy/superlative_quantifiers_1": 0.544, "blimp/accuracy/wh_island": 0.808, "blimp/accuracy/principle_A_domain_1": 0.983, "blimp/accuracy/complex_NP_island": 0.505, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.969, "blimp/accuracy/irregular_past_participle_verbs": 0.919, "blimp/accuracy/drop_argument": 0.776, "blimp/accuracy/wh_questions_object_gap": 0.795, "blimp/accuracy/animate_subject_passive": 0.798, "blimp/accuracy/existential_there_quantifiers_1": 0.983, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.878, "blimp/accuracy/npi_present_2": 0.639, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.936, "blimp/accuracy/anaphor_number_agreement": 0.984, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.957, "blimp/accuracy/existential_there_object_raising": 0.855, "blimp/accuracy/matrix_question_npi_licensor_present": 0.209, "blimp/accuracy/npi_present_1": 0.527, "blimp/accuracy/wh_vs_that_no_gap": 0.973, "blimp/accuracy/left_branch_island_echo_question": 0.415, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.971, "blimp/accuracy/causative": 0.729, "blimp/accuracy/group_average": 0.7737014925373132, "blimp/accuracy/seq_average": 0.7737014925373135, "cbt/accuracy/NE": 0.7479967948717948, "cbt/accuracy/V": 0.9152, "cbt/accuracy/CN": 0.8216, "cbt/accuracy/P": 0.8884, "cbt/accuracy/group_average": 0.8432991987179487, "cbt/accuracy/seq_average": 0.8433373349339736, "hellaswag/accuracy/val": 0.29376618203545113, "hellaswag/accuracy/group_average": 0.29376618203545113, "hellaswag/accuracy/seq_average": 0.29376618203545113, "piqa/accuracy/val": 0.6017410228509249, "piqa/accuracy/group_average": 0.6017410228509249, "piqa/accuracy/seq_average": 0.6017410228509249, "ai2arc/accuracy/ARC-Easy": 0.3382663847780127, "ai2arc/accuracy/ARC-Challenge": 0.20515021459227467, "ai2arc/accuracy/group_average": 0.27170829968514365, "ai2arc/accuracy/seq_average": 0.2943342776203966, "mmlu/accuracy/MMLU": 0.2635681086878799, "mmlu/accuracy/group_average": 0.2635681086878799, "mmlu/accuracy/seq_average": 0.2635681086878799, "openbookqa/accuracy/test": 0.268, "openbookqa/accuracy/group_average": 0.268, "openbookqa/accuracy/seq_average": 0.268, "race/accuracy/test/high": 0.27215551743853633, "race/accuracy/test/middle": 0.34818941504178275, "race/accuracy/group_average": 0.31017246624015954, "race/accuracy/seq_average": 0.294284556141062, "siqa/accuracy/dev": 0.3556806550665302, "siqa/accuracy/group_average": 0.3556806550665302, "siqa/accuracy/seq_average": 0.3556806550665302, "winogrande/accuracy/dev": 0.5311760063141279, "winogrande/accuracy/group_average": 0.5311760063141279, "winogrande/accuracy/seq_average": 0.5311760063141279, "commonsenseqa/accuracy/dev_rand_split": 0.2497952497952498, "commonsenseqa/accuracy/group_average": 0.2497952497952498, "commonsenseqa/accuracy/seq_average": 0.2497952497952498}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_highlb_shared_only/export/result-model-20000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.870915246388269, "val/accuracy": 0.4421163891989087, "val/perplexity": 17.65316780635173, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5532592394337152, "lambada/accuracy/total": 0.18730590062111802, "lambada/accuracy/openai_last_token": 0.7329192546583851, "lambada/perplexity": 18.83267621883434, "lambada/lm_loss": 3.3952704118406274, "lambada/lm_perplexity": 29.822716800422107, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3147111449100134, "mean_loss": 2.712087242910992, "blimp/accuracy/passive_2": 0.871, "blimp/accuracy/determiner_noun_agreement_2": 0.959, "blimp/accuracy/ellipsis_n_bar_1": 0.771, "blimp/accuracy/tough_vs_raising_2": 0.878, "blimp/accuracy/tough_vs_raising_1": 0.536, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.882, "blimp/accuracy/principle_A_reconstruction": 0.337, "blimp/accuracy/wh_vs_that_with_gap": 0.464, "blimp/accuracy/principle_A_domain_2": 0.809, "blimp/accuracy/determiner_noun_agreement_1": 0.98, "blimp/accuracy/ellipsis_n_bar_2": 0.87, "blimp/accuracy/principle_A_domain_3": 0.535, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.886, "blimp/accuracy/animate_subject_trans": 0.874, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.826, "blimp/accuracy/distractor_agreement_relative_clause": 0.514, "blimp/accuracy/transitive": 0.84, "blimp/accuracy/sentential_subject_island": 0.383, "blimp/accuracy/adjunct_island": 0.769, "blimp/accuracy/intransitive": 0.722, "blimp/accuracy/existential_there_subject_raising": 0.852, "blimp/accuracy/irregular_past_participle_adjectives": 0.835, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.152, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.174, "blimp/accuracy/only_npi_scope": 0.681, "blimp/accuracy/superlative_quantifiers_2": 0.75, "blimp/accuracy/passive_1": 0.886, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.869, "blimp/accuracy/inchoative": 0.553, "blimp/accuracy/anaphor_gender_agreement": 0.881, "blimp/accuracy/principle_A_c_command": 0.575, "blimp/accuracy/only_npi_licensor_present": 0.69, "blimp/accuracy/expletive_it_object_raising": 0.78, "blimp/accuracy/left_branch_island_simple_question": 0.276, "blimp/accuracy/wh_questions_subject_gap": 0.896, "blimp/accuracy/existential_there_quantifiers_2": 0.403, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.895, "blimp/accuracy/sentential_negation_npi_scope": 0.49, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.75, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.926, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.873, "blimp/accuracy/principle_A_case_2": 0.928, "blimp/accuracy/distractor_agreement_relational_noun": 0.787, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.981, "blimp/accuracy/superlative_quantifiers_1": 0.395, "blimp/accuracy/wh_island": 0.851, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.472, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.964, "blimp/accuracy/irregular_past_participle_verbs": 0.886, "blimp/accuracy/drop_argument": 0.721, "blimp/accuracy/wh_questions_object_gap": 0.78, "blimp/accuracy/animate_subject_passive": 0.77, "blimp/accuracy/existential_there_quantifiers_1": 0.994, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.861, "blimp/accuracy/npi_present_2": 0.62, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.908, "blimp/accuracy/anaphor_number_agreement": 0.971, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.93, "blimp/accuracy/existential_there_object_raising": 0.836, "blimp/accuracy/matrix_question_npi_licensor_present": 0.131, "blimp/accuracy/npi_present_1": 0.553, "blimp/accuracy/wh_vs_that_no_gap": 0.948, "blimp/accuracy/left_branch_island_echo_question": 0.318, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.975, "blimp/accuracy/causative": 0.682, "blimp/accuracy/group_average": 0.7334626865671641, "blimp/accuracy/seq_average": 0.7334626865671642, "cbt/accuracy/NE": 0.7047275641025641, "cbt/accuracy/V": 0.8772, "cbt/accuracy/CN": 0.7728, "cbt/accuracy/P": 0.8532, "cbt/accuracy/group_average": 0.8019818910256411, "cbt/accuracy/seq_average": 0.8020208083233293, "hellaswag/accuracy/val": 0.2741485759808803, "hellaswag/accuracy/group_average": 0.2741485759808803, "hellaswag/accuracy/seq_average": 0.2741485759808803, "piqa/accuracy/val": 0.573993471164309, "piqa/accuracy/group_average": 0.573993471164309, "piqa/accuracy/seq_average": 0.573993471164309, "ai2arc/accuracy/ARC-Easy": 0.31374207188160674, "ai2arc/accuracy/ARC-Challenge": 0.20429184549356222, "ai2arc/accuracy/group_average": 0.25901695868758445, "ai2arc/accuracy/seq_average": 0.2776203966005666, "mmlu/accuracy/MMLU": 0.2627100464783697, "mmlu/accuracy/group_average": 0.2627100464783697, "mmlu/accuracy/seq_average": 0.2627100464783697, "openbookqa/accuracy/test": 0.268, "openbookqa/accuracy/group_average": 0.268, "openbookqa/accuracy/seq_average": 0.268, "race/accuracy/test/high": 0.25700400228702114, "race/accuracy/test/middle": 0.33147632311977715, "race/accuracy/group_average": 0.29424016270339914, "race/accuracy/seq_average": 0.2786785569517633, "siqa/accuracy/dev": 0.36898669396110545, "siqa/accuracy/group_average": 0.36898669396110545, "siqa/accuracy/seq_average": 0.36898669396110545, "winogrande/accuracy/dev": 0.5240726124704025, "winogrande/accuracy/group_average": 0.5240726124704025, "winogrande/accuracy/seq_average": 0.5240726124704025, "commonsenseqa/accuracy/dev_rand_split": 0.24078624078624078, "commonsenseqa/accuracy/group_average": 0.24078624078624078, "commonsenseqa/accuracy/seq_average": 0.24078624078624078}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_highlb_shared_only/export/result-model-30000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.7964634971013145, "val/accuracy": 0.45262024894593256, "val/perplexity": 16.38659294439229, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.616868676606172, "lambada/accuracy/total": 0.20535714285714285, "lambada/accuracy/openai_last_token": 0.7402950310559007, "lambada/perplexity": 16.192556555304527, "lambada/lm_loss": 3.3511938684858946, "lambada/lm_perplexity": 28.536782480098548, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3289886959015377, "mean_loss": 2.7066659657522427, "blimp/accuracy/passive_2": 0.87, "blimp/accuracy/determiner_noun_agreement_2": 0.967, "blimp/accuracy/ellipsis_n_bar_1": 0.783, "blimp/accuracy/tough_vs_raising_2": 0.885, "blimp/accuracy/tough_vs_raising_1": 0.535, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.899, "blimp/accuracy/principle_A_reconstruction": 0.4, "blimp/accuracy/wh_vs_that_with_gap": 0.557, "blimp/accuracy/principle_A_domain_2": 0.829, "blimp/accuracy/determiner_noun_agreement_1": 0.979, "blimp/accuracy/ellipsis_n_bar_2": 0.875, "blimp/accuracy/principle_A_domain_3": 0.54, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.891, "blimp/accuracy/animate_subject_trans": 0.888, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.865, "blimp/accuracy/distractor_agreement_relative_clause": 0.551, "blimp/accuracy/transitive": 0.845, "blimp/accuracy/sentential_subject_island": 0.392, "blimp/accuracy/adjunct_island": 0.803, "blimp/accuracy/intransitive": 0.782, "blimp/accuracy/existential_there_subject_raising": 0.847, "blimp/accuracy/irregular_past_participle_adjectives": 0.907, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.218, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.212, "blimp/accuracy/only_npi_scope": 0.689, "blimp/accuracy/superlative_quantifiers_2": 0.612, "blimp/accuracy/passive_1": 0.857, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.874, "blimp/accuracy/inchoative": 0.631, "blimp/accuracy/anaphor_gender_agreement": 0.957, "blimp/accuracy/principle_A_c_command": 0.624, "blimp/accuracy/only_npi_licensor_present": 0.341, "blimp/accuracy/expletive_it_object_raising": 0.79, "blimp/accuracy/left_branch_island_simple_question": 0.297, "blimp/accuracy/wh_questions_subject_gap": 0.884, "blimp/accuracy/existential_there_quantifiers_2": 0.447, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.912, "blimp/accuracy/sentential_negation_npi_scope": 0.462, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.774, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.902, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.882, "blimp/accuracy/principle_A_case_2": 0.947, "blimp/accuracy/distractor_agreement_relational_noun": 0.751, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.978, "blimp/accuracy/superlative_quantifiers_1": 0.621, "blimp/accuracy/wh_island": 0.77, "blimp/accuracy/principle_A_domain_1": 0.993, "blimp/accuracy/complex_NP_island": 0.463, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.949, "blimp/accuracy/irregular_past_participle_verbs": 0.903, "blimp/accuracy/drop_argument": 0.776, "blimp/accuracy/wh_questions_object_gap": 0.736, "blimp/accuracy/animate_subject_passive": 0.812, "blimp/accuracy/existential_there_quantifiers_1": 0.985, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.871, "blimp/accuracy/npi_present_2": 0.583, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.933, "blimp/accuracy/anaphor_number_agreement": 0.983, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.935, "blimp/accuracy/existential_there_object_raising": 0.845, "blimp/accuracy/matrix_question_npi_licensor_present": 0.192, "blimp/accuracy/npi_present_1": 0.523, "blimp/accuracy/wh_vs_that_no_gap": 0.946, "blimp/accuracy/left_branch_island_echo_question": 0.328, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.948, "blimp/accuracy/causative": 0.681, "blimp/accuracy/group_average": 0.7418955223880597, "blimp/accuracy/seq_average": 0.7418955223880597, "cbt/accuracy/NE": 0.7267628205128205, "cbt/accuracy/V": 0.8904, "cbt/accuracy/CN": 0.7852, "cbt/accuracy/P": 0.8596, "cbt/accuracy/group_average": 0.8154907051282051, "cbt/accuracy/seq_average": 0.8155262104841937, "hellaswag/accuracy/val": 0.2779326827325234, "hellaswag/accuracy/group_average": 0.2779326827325234, "hellaswag/accuracy/seq_average": 0.2779326827325234, "piqa/accuracy/val": 0.5778019586507073, "piqa/accuracy/group_average": 0.5778019586507073, "piqa/accuracy/seq_average": 0.5778019586507073, "ai2arc/accuracy/ARC-Easy": 0.3150105708245243, "ai2arc/accuracy/ARC-Challenge": 0.2128755364806867, "ai2arc/accuracy/group_average": 0.2639430536526055, "ai2arc/accuracy/seq_average": 0.2813031161473088, "mmlu/accuracy/MMLU": 0.2637111190561316, "mmlu/accuracy/group_average": 0.2637111190561316, "mmlu/accuracy/seq_average": 0.2637111190561316, "openbookqa/accuracy/test": 0.276, "openbookqa/accuracy/group_average": 0.276, "openbookqa/accuracy/seq_average": 0.276, "race/accuracy/test/high": 0.26357918810749, "race/accuracy/test/middle": 0.334958217270195, "race/accuracy/group_average": 0.2992687026888425, "race/accuracy/seq_average": 0.2843534657478719, "siqa/accuracy/dev": 0.3592630501535312, "siqa/accuracy/group_average": 0.3592630501535312, "siqa/accuracy/seq_average": 0.3592630501535312, "winogrande/accuracy/dev": 0.516179952644041, "winogrande/accuracy/group_average": 0.516179952644041, "winogrande/accuracy/seq_average": 0.516179952644041, "commonsenseqa/accuracy/dev_rand_split": 0.24488124488124488, "commonsenseqa/accuracy/group_average": 0.24488124488124488, "commonsenseqa/accuracy/seq_average": 0.24488124488124488}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_highlb_shared_only/export/result-model-40000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.7465093703497026, "val/accuracy": 0.4601488870287698, "val/perplexity": 15.588124437524746, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6215217542944487, "lambada/accuracy/total": 0.2284549689440994, "lambada/accuracy/openai_last_token": 0.7447593167701864, "lambada/perplexity": 14.799256542973566, "lambada/lm_loss": 3.277185231901306, "lambada/lm_perplexity": 26.501073242824365, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3443019279864346, "mean_loss": 2.6840155623220756, "blimp/accuracy/passive_2": 0.882, "blimp/accuracy/determiner_noun_agreement_2": 0.981, "blimp/accuracy/ellipsis_n_bar_1": 0.777, "blimp/accuracy/tough_vs_raising_2": 0.881, "blimp/accuracy/tough_vs_raising_1": 0.553, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.888, "blimp/accuracy/principle_A_reconstruction": 0.279, "blimp/accuracy/wh_vs_that_with_gap": 0.454, "blimp/accuracy/principle_A_domain_2": 0.822, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.876, "blimp/accuracy/principle_A_domain_3": 0.553, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.908, "blimp/accuracy/animate_subject_trans": 0.894, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.863, "blimp/accuracy/distractor_agreement_relative_clause": 0.634, "blimp/accuracy/transitive": 0.836, "blimp/accuracy/sentential_subject_island": 0.344, "blimp/accuracy/adjunct_island": 0.827, "blimp/accuracy/intransitive": 0.779, "blimp/accuracy/existential_there_subject_raising": 0.858, "blimp/accuracy/irregular_past_participle_adjectives": 0.947, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.287, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.167, "blimp/accuracy/only_npi_scope": 0.532, "blimp/accuracy/superlative_quantifiers_2": 0.727, "blimp/accuracy/passive_1": 0.882, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.881, "blimp/accuracy/inchoative": 0.631, "blimp/accuracy/anaphor_gender_agreement": 0.95, "blimp/accuracy/principle_A_c_command": 0.655, "blimp/accuracy/only_npi_licensor_present": 0.491, "blimp/accuracy/expletive_it_object_raising": 0.791, "blimp/accuracy/left_branch_island_simple_question": 0.36, "blimp/accuracy/wh_questions_subject_gap": 0.921, "blimp/accuracy/existential_there_quantifiers_2": 0.39, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.918, "blimp/accuracy/sentential_negation_npi_scope": 0.413, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.772, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.903, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.898, "blimp/accuracy/principle_A_case_2": 0.965, "blimp/accuracy/distractor_agreement_relational_noun": 0.836, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.981, "blimp/accuracy/superlative_quantifiers_1": 0.629, "blimp/accuracy/wh_island": 0.804, "blimp/accuracy/principle_A_domain_1": 0.991, "blimp/accuracy/complex_NP_island": 0.488, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.962, "blimp/accuracy/irregular_past_participle_verbs": 0.901, "blimp/accuracy/drop_argument": 0.766, "blimp/accuracy/wh_questions_object_gap": 0.766, "blimp/accuracy/animate_subject_passive": 0.774, "blimp/accuracy/existential_there_quantifiers_1": 0.993, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.859, "blimp/accuracy/npi_present_2": 0.485, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.921, "blimp/accuracy/anaphor_number_agreement": 0.978, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.956, "blimp/accuracy/existential_there_object_raising": 0.871, "blimp/accuracy/matrix_question_npi_licensor_present": 0.132, "blimp/accuracy/npi_present_1": 0.464, "blimp/accuracy/wh_vs_that_no_gap": 0.963, "blimp/accuracy/left_branch_island_echo_question": 0.391, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.97, "blimp/accuracy/causative": 0.678, "blimp/accuracy/group_average": 0.7449850746268658, "blimp/accuracy/seq_average": 0.7449850746268657, "cbt/accuracy/NE": 0.7347756410256411, "cbt/accuracy/V": 0.8992, "cbt/accuracy/CN": 0.7948, "cbt/accuracy/P": 0.8628, "cbt/accuracy/group_average": 0.8228939102564102, "cbt/accuracy/seq_average": 0.8229291716686674, "hellaswag/accuracy/val": 0.2795259908384784, "hellaswag/accuracy/group_average": 0.2795259908384784, "hellaswag/accuracy/seq_average": 0.2795259908384784, "piqa/accuracy/val": 0.5821545157780196, "piqa/accuracy/group_average": 0.5821545157780196, "piqa/accuracy/seq_average": 0.5821545157780196, "ai2arc/accuracy/ARC-Easy": 0.31585623678646935, "ai2arc/accuracy/ARC-Challenge": 0.20085836909871244, "ai2arc/accuracy/group_average": 0.2583573029425909, "ai2arc/accuracy/seq_average": 0.27790368271954674, "mmlu/accuracy/MMLU": 0.26220951018948874, "mmlu/accuracy/group_average": 0.26220951018948874, "mmlu/accuracy/seq_average": 0.26220951018948874, "openbookqa/accuracy/test": 0.27, "openbookqa/accuracy/group_average": 0.27, "openbookqa/accuracy/seq_average": 0.27, "race/accuracy/test/high": 0.26472269868496284, "race/accuracy/test/middle": 0.33147632311977715, "race/accuracy/group_average": 0.29809951090236997, "race/accuracy/seq_average": 0.28415079043372515, "siqa/accuracy/dev": 0.372057318321392, "siqa/accuracy/group_average": 0.372057318321392, "siqa/accuracy/seq_average": 0.372057318321392, "winogrande/accuracy/dev": 0.5217048145224941, "winogrande/accuracy/group_average": 0.5217048145224941, "winogrande/accuracy/seq_average": 0.5217048145224941, "commonsenseqa/accuracy/dev_rand_split": 0.2457002457002457, "commonsenseqa/accuracy/group_average": 0.2457002457002457, "commonsenseqa/accuracy/seq_average": 0.2457002457002457}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_highlb_shared_only/export/result-model-50000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.703056940956721, "val/accuracy": 0.46643260168650796, "val/perplexity": 14.925287781819424, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7212869395380435, "lambada/accuracy/total": 0.22379658385093168, "lambada/accuracy/openai_last_token": 0.7534937888198758, "lambada/perplexity": 14.23421991806299, "lambada/lm_loss": 3.2612786838532326, "lambada/lm_perplexity": 26.082867570045053, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3451145927687198, "mean_loss": 2.7121719402473823, "blimp/accuracy/passive_2": 0.887, "blimp/accuracy/determiner_noun_agreement_2": 0.961, "blimp/accuracy/ellipsis_n_bar_1": 0.76, "blimp/accuracy/tough_vs_raising_2": 0.901, "blimp/accuracy/tough_vs_raising_1": 0.48, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.915, "blimp/accuracy/principle_A_reconstruction": 0.225, "blimp/accuracy/wh_vs_that_with_gap": 0.542, "blimp/accuracy/principle_A_domain_2": 0.876, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.898, "blimp/accuracy/principle_A_domain_3": 0.552, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.898, "blimp/accuracy/animate_subject_trans": 0.892, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.871, "blimp/accuracy/distractor_agreement_relative_clause": 0.634, "blimp/accuracy/transitive": 0.848, "blimp/accuracy/sentential_subject_island": 0.367, "blimp/accuracy/adjunct_island": 0.81, "blimp/accuracy/intransitive": 0.776, "blimp/accuracy/existential_there_subject_raising": 0.862, "blimp/accuracy/irregular_past_participle_adjectives": 0.92, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.395, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.216, "blimp/accuracy/only_npi_scope": 0.762, "blimp/accuracy/superlative_quantifiers_2": 0.698, "blimp/accuracy/passive_1": 0.879, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.892, "blimp/accuracy/inchoative": 0.619, "blimp/accuracy/anaphor_gender_agreement": 0.932, "blimp/accuracy/principle_A_c_command": 0.704, "blimp/accuracy/only_npi_licensor_present": 0.609, "blimp/accuracy/expletive_it_object_raising": 0.798, "blimp/accuracy/left_branch_island_simple_question": 0.447, "blimp/accuracy/wh_questions_subject_gap": 0.939, "blimp/accuracy/existential_there_quantifiers_2": 0.462, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.908, "blimp/accuracy/sentential_negation_npi_scope": 0.501, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.783, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.882, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.893, "blimp/accuracy/principle_A_case_2": 0.957, "blimp/accuracy/distractor_agreement_relational_noun": 0.835, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.994, "blimp/accuracy/superlative_quantifiers_1": 0.555, "blimp/accuracy/wh_island": 0.808, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.512, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.968, "blimp/accuracy/irregular_past_participle_verbs": 0.885, "blimp/accuracy/drop_argument": 0.749, "blimp/accuracy/wh_questions_object_gap": 0.786, "blimp/accuracy/animate_subject_passive": 0.779, "blimp/accuracy/existential_there_quantifiers_1": 0.987, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.904, "blimp/accuracy/npi_present_2": 0.592, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.915, "blimp/accuracy/anaphor_number_agreement": 0.98, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.946, "blimp/accuracy/existential_there_object_raising": 0.876, "blimp/accuracy/matrix_question_npi_licensor_present": 0.198, "blimp/accuracy/npi_present_1": 0.561, "blimp/accuracy/wh_vs_that_no_gap": 0.959, "blimp/accuracy/left_branch_island_echo_question": 0.46, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974, "blimp/accuracy/causative": 0.666, "blimp/accuracy/group_average": 0.761402985074627, "blimp/accuracy/seq_average": 0.7614029850746269, "cbt/accuracy/NE": 0.7423878205128205, "cbt/accuracy/V": 0.9084, "cbt/accuracy/CN": 0.802, "cbt/accuracy/P": 0.8708, "cbt/accuracy/group_average": 0.8308969551282052, "cbt/accuracy/seq_average": 0.8309323729491797, "hellaswag/accuracy/val": 0.286795459071898, "hellaswag/accuracy/group_average": 0.286795459071898, "hellaswag/accuracy/seq_average": 0.286795459071898, "piqa/accuracy/val": 0.5854189336235038, "piqa/accuracy/group_average": 0.5854189336235038, "piqa/accuracy/seq_average": 0.5854189336235038, "ai2arc/accuracy/ARC-Easy": 0.3145877378435518, "ai2arc/accuracy/ARC-Challenge": 0.20858369098712445, "ai2arc/accuracy/group_average": 0.2615857144153381, "ai2arc/accuracy/seq_average": 0.27960339943342777, "mmlu/accuracy/MMLU": 0.26499821237039684, "mmlu/accuracy/group_average": 0.26499821237039684, "mmlu/accuracy/seq_average": 0.26499821237039684, "openbookqa/accuracy/test": 0.264, "openbookqa/accuracy/group_average": 0.264, "openbookqa/accuracy/seq_average": 0.264, "race/accuracy/test/high": 0.26357918810749, "race/accuracy/test/middle": 0.33147632311977715, "race/accuracy/group_average": 0.29752775561363354, "race/accuracy/seq_average": 0.2833400891771382, "siqa/accuracy/dev": 0.35363357215967245, "siqa/accuracy/group_average": 0.35363357215967245, "siqa/accuracy/seq_average": 0.35363357215967245, "winogrande/accuracy/dev": 0.5177584846093133, "winogrande/accuracy/group_average": 0.5177584846093133, "winogrande/accuracy/seq_average": 0.5177584846093133, "commonsenseqa/accuracy/dev_rand_split": 0.24897624897624898, "commonsenseqa/accuracy/group_average": 0.24897624897624898, "commonsenseqa/accuracy/seq_average": 0.24897624897624898}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_highlb_shared_only/export/result-model-60000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.6702042836991566, "val/accuracy": 0.4714491102430556, "val/perplexity": 14.442919344448981, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6719383097583465, "lambada/accuracy/total": 0.21603260869565216, "lambada/accuracy/openai_last_token": 0.7507763975155279, "lambada/perplexity": 14.135718084913758, "lambada/lm_loss": 3.215313804287702, "lambada/lm_perplexity": 24.911107897225783, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.34374085946935384, "mean_loss": 2.6710712967287513, "blimp/accuracy/passive_2": 0.894, "blimp/accuracy/determiner_noun_agreement_2": 0.97, "blimp/accuracy/ellipsis_n_bar_1": 0.799, "blimp/accuracy/tough_vs_raising_2": 0.916, "blimp/accuracy/tough_vs_raising_1": 0.518, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.914, "blimp/accuracy/principle_A_reconstruction": 0.245, "blimp/accuracy/wh_vs_that_with_gap": 0.546, "blimp/accuracy/principle_A_domain_2": 0.822, "blimp/accuracy/determiner_noun_agreement_1": 0.981, "blimp/accuracy/ellipsis_n_bar_2": 0.898, "blimp/accuracy/principle_A_domain_3": 0.59, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.916, "blimp/accuracy/animate_subject_trans": 0.907, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.892, "blimp/accuracy/distractor_agreement_relative_clause": 0.647, "blimp/accuracy/transitive": 0.859, "blimp/accuracy/sentential_subject_island": 0.361, "blimp/accuracy/adjunct_island": 0.814, "blimp/accuracy/intransitive": 0.799, "blimp/accuracy/existential_there_subject_raising": 0.836, "blimp/accuracy/irregular_past_participle_adjectives": 0.964, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.374, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.22, "blimp/accuracy/only_npi_scope": 0.764, "blimp/accuracy/superlative_quantifiers_2": 0.694, "blimp/accuracy/passive_1": 0.875, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.895, "blimp/accuracy/inchoative": 0.654, "blimp/accuracy/anaphor_gender_agreement": 0.954, "blimp/accuracy/principle_A_c_command": 0.602, "blimp/accuracy/only_npi_licensor_present": 0.556, "blimp/accuracy/expletive_it_object_raising": 0.784, "blimp/accuracy/left_branch_island_simple_question": 0.425, "blimp/accuracy/wh_questions_subject_gap": 0.927, "blimp/accuracy/existential_there_quantifiers_2": 0.382, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.93, "blimp/accuracy/sentential_negation_npi_scope": 0.516, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.791, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.919, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.893, "blimp/accuracy/principle_A_case_2": 0.959, "blimp/accuracy/distractor_agreement_relational_noun": 0.793, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.988, "blimp/accuracy/superlative_quantifiers_1": 0.593, "blimp/accuracy/wh_island": 0.803, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.533, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.958, "blimp/accuracy/irregular_past_participle_verbs": 0.916, "blimp/accuracy/drop_argument": 0.769, "blimp/accuracy/wh_questions_object_gap": 0.771, "blimp/accuracy/animate_subject_passive": 0.796, "blimp/accuracy/existential_there_quantifiers_1": 0.983, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.891, "blimp/accuracy/npi_present_2": 0.603, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.916, "blimp/accuracy/anaphor_number_agreement": 0.978, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.949, "blimp/accuracy/existential_there_object_raising": 0.854, "blimp/accuracy/matrix_question_npi_licensor_present": 0.164, "blimp/accuracy/npi_present_1": 0.499, "blimp/accuracy/wh_vs_that_no_gap": 0.961, "blimp/accuracy/left_branch_island_echo_question": 0.368, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.974, "blimp/accuracy/causative": 0.708, "blimp/accuracy/group_average": 0.7605671641791041, "blimp/accuracy/seq_average": 0.7605671641791045, "cbt/accuracy/NE": 0.7455929487179487, "cbt/accuracy/V": 0.9076, "cbt/accuracy/CN": 0.81, "cbt/accuracy/P": 0.8844, "cbt/accuracy/group_average": 0.8368982371794872, "cbt/accuracy/seq_average": 0.8369347739095638, "hellaswag/accuracy/val": 0.2846046604262099, "hellaswag/accuracy/group_average": 0.2846046604262099, "hellaswag/accuracy/seq_average": 0.2846046604262099, "piqa/accuracy/val": 0.5810663764961915, "piqa/accuracy/group_average": 0.5810663764961915, "piqa/accuracy/seq_average": 0.5810663764961915, "ai2arc/accuracy/ARC-Easy": 0.3221987315010571, "ai2arc/accuracy/ARC-Challenge": 0.20686695278969958, "ai2arc/accuracy/group_average": 0.2645328421453783, "ai2arc/accuracy/seq_average": 0.2841359773371105, "mmlu/accuracy/MMLU": 0.26335359313550233, "mmlu/accuracy/group_average": 0.26335359313550233, "mmlu/accuracy/seq_average": 0.26335359313550233, "openbookqa/accuracy/test": 0.28, "openbookqa/accuracy/group_average": 0.28, "openbookqa/accuracy/seq_average": 0.28, "race/accuracy/test/high": 0.2727272727272727, "race/accuracy/test/middle": 0.3384401114206128, "race/accuracy/group_average": 0.30558369207394276, "race/accuracy/seq_average": 0.2918524523713012, "siqa/accuracy/dev": 0.35363357215967245, "siqa/accuracy/group_average": 0.35363357215967245, "siqa/accuracy/seq_average": 0.35363357215967245, "winogrande/accuracy/dev": 0.516179952644041, "winogrande/accuracy/group_average": 0.516179952644041, "winogrande/accuracy/seq_average": 0.516179952644041, "commonsenseqa/accuracy/dev_rand_split": 0.24815724815724816, "commonsenseqa/accuracy/group_average": 0.24815724815724816, "commonsenseqa/accuracy/seq_average": 0.24815724815724816}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_highlb_shared_only/export/result-model-70000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.643279787093874, "val/accuracy": 0.4750879681299603, "val/perplexity": 14.059239384576212, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6699582686335406, "lambada/accuracy/total": 0.23330745341614906, "lambada/accuracy/openai_last_token": 0.7567934782608695, "lambada/perplexity": 13.29118993486964, "lambada/lm_loss": 3.2005790199299353, "lambada/lm_perplexity": 24.54673913425713, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3541977107730547, "mean_loss": 2.656619027863707, "blimp/accuracy/passive_2": 0.906, "blimp/accuracy/determiner_noun_agreement_2": 0.978, "blimp/accuracy/ellipsis_n_bar_1": 0.787, "blimp/accuracy/tough_vs_raising_2": 0.883, "blimp/accuracy/tough_vs_raising_1": 0.571, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.926, "blimp/accuracy/principle_A_reconstruction": 0.247, "blimp/accuracy/wh_vs_that_with_gap": 0.514, "blimp/accuracy/principle_A_domain_2": 0.829, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.899, "blimp/accuracy/principle_A_domain_3": 0.559, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.916, "blimp/accuracy/animate_subject_trans": 0.889, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.904, "blimp/accuracy/distractor_agreement_relative_clause": 0.662, "blimp/accuracy/transitive": 0.851, "blimp/accuracy/sentential_subject_island": 0.374, "blimp/accuracy/adjunct_island": 0.816, "blimp/accuracy/intransitive": 0.792, "blimp/accuracy/existential_there_subject_raising": 0.855, "blimp/accuracy/irregular_past_participle_adjectives": 0.909, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.366, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.227, "blimp/accuracy/only_npi_scope": 0.817, "blimp/accuracy/superlative_quantifiers_2": 0.663, "blimp/accuracy/passive_1": 0.89, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.892, "blimp/accuracy/inchoative": 0.632, "blimp/accuracy/anaphor_gender_agreement": 0.958, "blimp/accuracy/principle_A_c_command": 0.639, "blimp/accuracy/only_npi_licensor_present": 0.652, "blimp/accuracy/expletive_it_object_raising": 0.766, "blimp/accuracy/left_branch_island_simple_question": 0.437, "blimp/accuracy/wh_questions_subject_gap": 0.93, "blimp/accuracy/existential_there_quantifiers_2": 0.467, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.93, "blimp/accuracy/sentential_negation_npi_scope": 0.567, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.796, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.906, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.9, "blimp/accuracy/principle_A_case_2": 0.955, "blimp/accuracy/distractor_agreement_relational_noun": 0.827, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.996, "blimp/accuracy/superlative_quantifiers_1": 0.605, "blimp/accuracy/wh_island": 0.799, "blimp/accuracy/principle_A_domain_1": 0.989, "blimp/accuracy/complex_NP_island": 0.52, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.967, "blimp/accuracy/irregular_past_participle_verbs": 0.897, "blimp/accuracy/drop_argument": 0.758, "blimp/accuracy/wh_questions_object_gap": 0.803, "blimp/accuracy/animate_subject_passive": 0.808, "blimp/accuracy/existential_there_quantifiers_1": 0.987, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.875, "blimp/accuracy/npi_present_2": 0.553, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.928, "blimp/accuracy/anaphor_number_agreement": 0.981, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.953, "blimp/accuracy/existential_there_object_raising": 0.847, "blimp/accuracy/matrix_question_npi_licensor_present": 0.186, "blimp/accuracy/npi_present_1": 0.449, "blimp/accuracy/wh_vs_that_no_gap": 0.967, "blimp/accuracy/left_branch_island_echo_question": 0.443, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.975, "blimp/accuracy/causative": 0.7, "blimp/accuracy/group_average": 0.7650597014925372, "blimp/accuracy/seq_average": 0.7650597014925373, "cbt/accuracy/NE": 0.7455929487179487, "cbt/accuracy/V": 0.9108, "cbt/accuracy/CN": 0.8072, "cbt/accuracy/P": 0.8804, "cbt/accuracy/group_average": 0.8359982371794872, "cbt/accuracy/seq_average": 0.8360344137655062, "hellaswag/accuracy/val": 0.2878908583947421, "hellaswag/accuracy/group_average": 0.2878908583947421, "hellaswag/accuracy/seq_average": 0.2878908583947421, "piqa/accuracy/val": 0.6017410228509249, "piqa/accuracy/group_average": 0.6017410228509249, "piqa/accuracy/seq_average": 0.6017410228509249, "ai2arc/accuracy/ARC-Easy": 0.3315010570824524, "ai2arc/accuracy/ARC-Challenge": 0.2034334763948498, "ai2arc/accuracy/group_average": 0.2674672667386511, "ai2arc/accuracy/seq_average": 0.2892351274787535, "mmlu/accuracy/MMLU": 0.2627815516624955, "mmlu/accuracy/group_average": 0.2627815516624955, "mmlu/accuracy/seq_average": 0.2627815516624955, "openbookqa/accuracy/test": 0.268, "openbookqa/accuracy/group_average": 0.268, "openbookqa/accuracy/seq_average": 0.268, "race/accuracy/test/high": 0.2715837621497999, "race/accuracy/test/middle": 0.34331476323119775, "race/accuracy/group_average": 0.3074492626904988, "race/accuracy/seq_average": 0.2924604783137414, "siqa/accuracy/dev": 0.34646878198567044, "siqa/accuracy/group_average": 0.34646878198567044, "siqa/accuracy/seq_average": 0.34646878198567044, "winogrande/accuracy/dev": 0.5169692186266772, "winogrande/accuracy/group_average": 0.5169692186266772, "winogrande/accuracy/seq_average": 0.5169692186266772, "commonsenseqa/accuracy/dev_rand_split": 0.24488124488124488, "commonsenseqa/accuracy/group_average": 0.24488124488124488, "commonsenseqa/accuracy/seq_average": 0.24488124488124488}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_highlb_shared_only/export/result-model-80000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.62155272468688, "val/accuracy": 0.4779876224578373, "val/perplexity": 13.75706794885256, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6563290424228456, "lambada/accuracy/total": 0.24437111801242237, "lambada/accuracy/openai_last_token": 0.7587344720496895, "lambada/perplexity": 12.905246352523143, "lambada/lm_loss": 3.1789632012772593, "lambada/lm_perplexity": 24.021834828778093, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.36117937023512986, "mean_loss": 2.638940883554863, "blimp/accuracy/passive_2": 0.914, "blimp/accuracy/determiner_noun_agreement_2": 0.979, "blimp/accuracy/ellipsis_n_bar_1": 0.773, "blimp/accuracy/tough_vs_raising_2": 0.884, "blimp/accuracy/tough_vs_raising_1": 0.563, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.917, "blimp/accuracy/principle_A_reconstruction": 0.193, "blimp/accuracy/wh_vs_that_with_gap": 0.534, "blimp/accuracy/principle_A_domain_2": 0.857, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.909, "blimp/accuracy/principle_A_domain_3": 0.582, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.908, "blimp/accuracy/animate_subject_trans": 0.907, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.894, "blimp/accuracy/distractor_agreement_relative_clause": 0.674, "blimp/accuracy/transitive": 0.886, "blimp/accuracy/sentential_subject_island": 0.405, "blimp/accuracy/adjunct_island": 0.839, "blimp/accuracy/intransitive": 0.792, "blimp/accuracy/existential_there_subject_raising": 0.861, "blimp/accuracy/irregular_past_participle_adjectives": 0.974, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.348, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.258, "blimp/accuracy/only_npi_scope": 0.814, "blimp/accuracy/superlative_quantifiers_2": 0.661, "blimp/accuracy/passive_1": 0.901, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.908, "blimp/accuracy/inchoative": 0.65, "blimp/accuracy/anaphor_gender_agreement": 0.948, "blimp/accuracy/principle_A_c_command": 0.67, "blimp/accuracy/only_npi_licensor_present": 0.556, "blimp/accuracy/expletive_it_object_raising": 0.788, "blimp/accuracy/left_branch_island_simple_question": 0.437, "blimp/accuracy/wh_questions_subject_gap": 0.931, "blimp/accuracy/existential_there_quantifiers_2": 0.481, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.923, "blimp/accuracy/sentential_negation_npi_scope": 0.576, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.777, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.901, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.91, "blimp/accuracy/principle_A_case_2": 0.97, "blimp/accuracy/distractor_agreement_relational_noun": 0.817, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.577, "blimp/accuracy/wh_island": 0.856, "blimp/accuracy/principle_A_domain_1": 0.983, "blimp/accuracy/complex_NP_island": 0.523, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.971, "blimp/accuracy/irregular_past_participle_verbs": 0.892, "blimp/accuracy/drop_argument": 0.78, "blimp/accuracy/wh_questions_object_gap": 0.797, "blimp/accuracy/animate_subject_passive": 0.801, "blimp/accuracy/existential_there_quantifiers_1": 0.987, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.87, "blimp/accuracy/npi_present_2": 0.623, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.926, "blimp/accuracy/anaphor_number_agreement": 0.985, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.953, "blimp/accuracy/existential_there_object_raising": 0.855, "blimp/accuracy/matrix_question_npi_licensor_present": 0.264, "blimp/accuracy/npi_present_1": 0.483, "blimp/accuracy/wh_vs_that_no_gap": 0.966, "blimp/accuracy/left_branch_island_echo_question": 0.357, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.969, "blimp/accuracy/causative": 0.708, "blimp/accuracy/group_average": 0.7697014925373135, "blimp/accuracy/seq_average": 0.7697014925373135, "cbt/accuracy/NE": 0.7532051282051282, "cbt/accuracy/V": 0.906, "cbt/accuracy/CN": 0.8136, "cbt/accuracy/P": 0.8828, "cbt/accuracy/group_average": 0.8389012820512821, "cbt/accuracy/seq_average": 0.8389355742296919, "hellaswag/accuracy/val": 0.29127663811989646, "hellaswag/accuracy/group_average": 0.29127663811989646, "hellaswag/accuracy/seq_average": 0.29127663811989646, "piqa/accuracy/val": 0.5952121871599565, "piqa/accuracy/group_average": 0.5952121871599565, "piqa/accuracy/seq_average": 0.5952121871599565, "ai2arc/accuracy/ARC-Easy": 0.33192389006342493, "ai2arc/accuracy/ARC-Challenge": 0.21030042918454936, "ai2arc/accuracy/group_average": 0.27111215962398716, "ai2arc/accuracy/seq_average": 0.29178470254957506, "mmlu/accuracy/MMLU": 0.2632820879513765, "mmlu/accuracy/group_average": 0.2632820879513765, "mmlu/accuracy/seq_average": 0.2632820879513765, "openbookqa/accuracy/test": 0.278, "openbookqa/accuracy/group_average": 0.278, "openbookqa/accuracy/seq_average": 0.278, "race/accuracy/test/high": 0.27044025157232704, "race/accuracy/test/middle": 0.3474930362116992, "race/accuracy/group_average": 0.3089666438920131, "race/accuracy/seq_average": 0.29286582894203483, "siqa/accuracy/dev": 0.3572159672466735, "siqa/accuracy/group_average": 0.3572159672466735, "siqa/accuracy/seq_average": 0.3572159672466735, "winogrande/accuracy/dev": 0.5217048145224941, "winogrande/accuracy/group_average": 0.5217048145224941, "winogrande/accuracy/seq_average": 0.5217048145224941, "commonsenseqa/accuracy/dev_rand_split": 0.24078624078624078, "commonsenseqa/accuracy/group_average": 0.24078624078624078, "commonsenseqa/accuracy/seq_average": 0.24078624078624078}
Pretrain_language_model/save_final/slimpajama_moe_no_attmoe_154M_deepseek_highlb_shared_only/export/result-model-90000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.609896705264137, "val/accuracy": 0.4800599113343254, "val/perplexity": 13.597646214012292, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7087722683545223, "lambada/accuracy/total": 0.2484472049689441, "lambada/accuracy/openai_last_token": 0.7608695652173914, "lambada/perplexity": 12.360425908372271, "lambada/lm_loss": 3.152941614006269, "lambada/lm_perplexity": 23.40481133900575, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.36425355815163474, "mean_loss": 2.6593344868093296, "blimp/accuracy/passive_2": 0.902, "blimp/accuracy/determiner_noun_agreement_2": 0.976, "blimp/accuracy/ellipsis_n_bar_1": 0.764, "blimp/accuracy/tough_vs_raising_2": 0.884, "blimp/accuracy/tough_vs_raising_1": 0.568, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.908, "blimp/accuracy/principle_A_reconstruction": 0.263, "blimp/accuracy/wh_vs_that_with_gap": 0.522, "blimp/accuracy/principle_A_domain_2": 0.834, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.897, "blimp/accuracy/principle_A_domain_3": 0.577, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.914, "blimp/accuracy/animate_subject_trans": 0.905, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.902, "blimp/accuracy/distractor_agreement_relative_clause": 0.661, "blimp/accuracy/transitive": 0.865, "blimp/accuracy/sentential_subject_island": 0.373, "blimp/accuracy/adjunct_island": 0.817, "blimp/accuracy/intransitive": 0.792, "blimp/accuracy/existential_there_subject_raising": 0.859, "blimp/accuracy/irregular_past_participle_adjectives": 0.977, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.382, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.248, "blimp/accuracy/only_npi_scope": 0.749, "blimp/accuracy/superlative_quantifiers_2": 0.6, "blimp/accuracy/passive_1": 0.891, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.905, "blimp/accuracy/inchoative": 0.658, "blimp/accuracy/anaphor_gender_agreement": 0.957, "blimp/accuracy/principle_A_c_command": 0.691, "blimp/accuracy/only_npi_licensor_present": 0.583, "blimp/accuracy/expletive_it_object_raising": 0.787, "blimp/accuracy/left_branch_island_simple_question": 0.442, "blimp/accuracy/wh_questions_subject_gap": 0.934, "blimp/accuracy/existential_there_quantifiers_2": 0.427, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.92, "blimp/accuracy/sentential_negation_npi_scope": 0.53, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.77, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.891, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.914, "blimp/accuracy/principle_A_case_2": 0.962, "blimp/accuracy/distractor_agreement_relational_noun": 0.834, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.99, "blimp/accuracy/superlative_quantifiers_1": 0.588, "blimp/accuracy/wh_island": 0.846, "blimp/accuracy/principle_A_domain_1": 0.981, "blimp/accuracy/complex_NP_island": 0.499, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.971, "blimp/accuracy/irregular_past_participle_verbs": 0.928, "blimp/accuracy/drop_argument": 0.762, "blimp/accuracy/wh_questions_object_gap": 0.795, "blimp/accuracy/animate_subject_passive": 0.798, "blimp/accuracy/existential_there_quantifiers_1": 0.987, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.879, "blimp/accuracy/npi_present_2": 0.641, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.942, "blimp/accuracy/anaphor_number_agreement": 0.984, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.957, "blimp/accuracy/existential_there_object_raising": 0.855, "blimp/accuracy/matrix_question_npi_licensor_present": 0.218, "blimp/accuracy/npi_present_1": 0.547, "blimp/accuracy/wh_vs_that_no_gap": 0.962, "blimp/accuracy/left_branch_island_echo_question": 0.336, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.968, "blimp/accuracy/causative": 0.734, "blimp/accuracy/group_average": 0.7669850746268658, "blimp/accuracy/seq_average": 0.7669850746268657, "cbt/accuracy/NE": 0.7540064102564102, "cbt/accuracy/V": 0.912, "cbt/accuracy/CN": 0.8204, "cbt/accuracy/P": 0.886, "cbt/accuracy/group_average": 0.8431016025641026, "cbt/accuracy/seq_average": 0.8431372549019608, "hellaswag/accuracy/val": 0.29107747460665206, "hellaswag/accuracy/group_average": 0.29107747460665206, "hellaswag/accuracy/seq_average": 0.29107747460665206, "piqa/accuracy/val": 0.5930359085963003, "piqa/accuracy/group_average": 0.5930359085963003, "piqa/accuracy/seq_average": 0.5930359085963003, "ai2arc/accuracy/ARC-Easy": 0.32684989429175476, "ai2arc/accuracy/ARC-Challenge": 0.20772532188841203, "ai2arc/accuracy/group_average": 0.2672876080900834, "ai2arc/accuracy/seq_average": 0.28753541076487255, "mmlu/accuracy/MMLU": 0.26378262424025745, "mmlu/accuracy/group_average": 0.26378262424025745, "mmlu/accuracy/seq_average": 0.26378262424025745, "openbookqa/accuracy/test": 0.29, "openbookqa/accuracy/group_average": 0.29, "openbookqa/accuracy/seq_average": 0.29, "race/accuracy/test/high": 0.2730131503716409, "race/accuracy/test/middle": 0.34540389972144847, "race/accuracy/group_average": 0.3092085250465447, "race/accuracy/seq_average": 0.2940818808269153, "siqa/accuracy/dev": 0.35363357215967245, "siqa/accuracy/group_average": 0.35363357215967245, "siqa/accuracy/seq_average": 0.35363357215967245, "winogrande/accuracy/dev": 0.5295974743488555, "winogrande/accuracy/group_average": 0.5295974743488555, "winogrande/accuracy/seq_average": 0.5295974743488555, "commonsenseqa/accuracy/dev_rand_split": 0.24406224406224405, "commonsenseqa/accuracy/group_average": 0.24406224406224405, "commonsenseqa/accuracy/seq_average": 0.24406224406224405}