Upload folder using huggingface_hub

#355
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std002/export/result-model-10000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 3.0257505386594743, "val/accuracy": 0.42298816499255953, "val/perplexity": 20.609467109723266, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7657794833923717, "lambada/accuracy/total": 0.1704192546583851, "lambada/accuracy/openai_last_token": 0.718944099378882, "lambada/perplexity": 23.290414781440138, "lambada/lm_loss": 3.554284035099482, "lambada/lm_perplexity": 34.96277888198997, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.29670370982547234, "mean_loss": 2.895765011025923, "blimp/accuracy/passive_2": 0.865, "blimp/accuracy/determiner_noun_agreement_2": 0.966, "blimp/accuracy/ellipsis_n_bar_1": 0.682, "blimp/accuracy/tough_vs_raising_2": 0.812, "blimp/accuracy/tough_vs_raising_1": 0.561, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.911, "blimp/accuracy/principle_A_reconstruction": 0.452, "blimp/accuracy/wh_vs_that_with_gap": 0.432, "blimp/accuracy/principle_A_domain_2": 0.832, "blimp/accuracy/determiner_noun_agreement_1": 0.971, "blimp/accuracy/ellipsis_n_bar_2": 0.883, "blimp/accuracy/principle_A_domain_3": 0.516, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.867, "blimp/accuracy/animate_subject_trans": 0.878, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.706, "blimp/accuracy/distractor_agreement_relative_clause": 0.496, "blimp/accuracy/transitive": 0.812, "blimp/accuracy/sentential_subject_island": 0.391, "blimp/accuracy/adjunct_island": 0.644, "blimp/accuracy/intransitive": 0.699, "blimp/accuracy/existential_there_subject_raising": 0.783, "blimp/accuracy/irregular_past_participle_adjectives": 0.907, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.195, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.216, "blimp/accuracy/only_npi_scope": 0.693, "blimp/accuracy/superlative_quantifiers_2": 0.629, "blimp/accuracy/passive_1": 0.854, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.875, "blimp/accuracy/inchoative": 0.562, "blimp/accuracy/anaphor_gender_agreement": 0.857, "blimp/accuracy/principle_A_c_command": 0.512, "blimp/accuracy/only_npi_licensor_present": 0.458, "blimp/accuracy/expletive_it_object_raising": 0.74, "blimp/accuracy/left_branch_island_simple_question": 0.248, "blimp/accuracy/wh_questions_subject_gap": 0.89, "blimp/accuracy/existential_there_quantifiers_2": 0.427, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.906, "blimp/accuracy/sentential_negation_npi_scope": 0.534, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.783, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.858, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.807, "blimp/accuracy/principle_A_case_2": 0.942, "blimp/accuracy/distractor_agreement_relational_noun": 0.74, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.995, "blimp/accuracy/superlative_quantifiers_1": 0.632, "blimp/accuracy/wh_island": 0.82, "blimp/accuracy/principle_A_domain_1": 0.994, "blimp/accuracy/complex_NP_island": 0.626, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.946, "blimp/accuracy/irregular_past_participle_verbs": 0.76, "blimp/accuracy/drop_argument": 0.742, "blimp/accuracy/wh_questions_object_gap": 0.713, "blimp/accuracy/animate_subject_passive": 0.726, "blimp/accuracy/existential_there_quantifiers_1": 0.946, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.863, "blimp/accuracy/npi_present_2": 0.628, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.804, "blimp/accuracy/anaphor_number_agreement": 0.972, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.906, "blimp/accuracy/existential_there_object_raising": 0.738, "blimp/accuracy/matrix_question_npi_licensor_present": 0.05, "blimp/accuracy/npi_present_1": 0.519, "blimp/accuracy/wh_vs_that_no_gap": 0.958, "blimp/accuracy/left_branch_island_echo_question": 0.563, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.964, "blimp/accuracy/causative": 0.627, "blimp/accuracy/group_average": 0.7206567164179102, "blimp/accuracy/seq_average": 0.7206567164179104, "cbt/accuracy/NE": 0.6943108974358975, "cbt/accuracy/V": 0.8632, "cbt/accuracy/CN": 0.7284, "cbt/accuracy/P": 0.8344, "cbt/accuracy/group_average": 0.7800777243589744, "cbt/accuracy/seq_average": 0.7801120448179272, "hellaswag/accuracy/val": 0.26976697868950406, "hellaswag/accuracy/group_average": 0.26976697868950406, "hellaswag/accuracy/seq_average": 0.26976697868950406, "piqa/accuracy/val": 0.5511425462459195, "piqa/accuracy/group_average": 0.5511425462459195, "piqa/accuracy/seq_average": 0.5511425462459195, "ai2arc/accuracy/ARC-Easy": 0.30824524312896406, "ai2arc/accuracy/ARC-Challenge": 0.20686695278969958, "ai2arc/accuracy/group_average": 0.2575560979593318, "ai2arc/accuracy/seq_average": 0.2747875354107649, "race/accuracy/test/high": 0.24757004002287022, "race/accuracy/test/middle": 0.3293871866295265, "race/accuracy/group_average": 0.28847861332619834, "race/accuracy/seq_average": 0.27138224564248076, "siqa/accuracy/dev": 0.3526100307062436, "siqa/accuracy/group_average": 0.3526100307062436, "siqa/accuracy/seq_average": 0.3526100307062436, "commonsenseqa/accuracy/dev_rand_split": 0.22932022932022933, "commonsenseqa/accuracy/group_average": 0.22932022932022933, "commonsenseqa/accuracy/seq_average": 0.22932022932022933}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std002/export/result-model-100000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.6096966455853177, "val/accuracy": 0.47906591021825395, "val/perplexity": 13.594926145375121, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.656020075638102, "lambada/accuracy/total": 0.25679347826086957, "lambada/accuracy/openai_last_token": 0.7614518633540373, "lambada/perplexity": 11.806092768659683, "lambada/lm_loss": 3.170953355738447, "lambada/lm_perplexity": 23.83019218085532, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3679296942395618, "mean_loss": 2.63285836061171, "blimp/accuracy/passive_2": 0.915, "blimp/accuracy/determiner_noun_agreement_2": 0.974, "blimp/accuracy/ellipsis_n_bar_1": 0.842, "blimp/accuracy/tough_vs_raising_2": 0.875, "blimp/accuracy/tough_vs_raising_1": 0.581, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.937, "blimp/accuracy/principle_A_reconstruction": 0.359, "blimp/accuracy/wh_vs_that_with_gap": 0.478, "blimp/accuracy/principle_A_domain_2": 0.833, "blimp/accuracy/determiner_noun_agreement_1": 0.985, "blimp/accuracy/ellipsis_n_bar_2": 0.901, "blimp/accuracy/principle_A_domain_3": 0.567, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.925, "blimp/accuracy/animate_subject_trans": 0.908, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.879, "blimp/accuracy/distractor_agreement_relative_clause": 0.592, "blimp/accuracy/transitive": 0.884, "blimp/accuracy/sentential_subject_island": 0.273, "blimp/accuracy/adjunct_island": 0.805, "blimp/accuracy/intransitive": 0.794, "blimp/accuracy/existential_there_subject_raising": 0.848, "blimp/accuracy/irregular_past_participle_adjectives": 0.969, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.336, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.294, "blimp/accuracy/only_npi_scope": 0.669, "blimp/accuracy/superlative_quantifiers_2": 0.868, "blimp/accuracy/passive_1": 0.885, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.926, "blimp/accuracy/inchoative": 0.646, "blimp/accuracy/anaphor_gender_agreement": 0.95, "blimp/accuracy/principle_A_c_command": 0.569, "blimp/accuracy/only_npi_licensor_present": 0.818, "blimp/accuracy/expletive_it_object_raising": 0.798, "blimp/accuracy/left_branch_island_simple_question": 0.441, "blimp/accuracy/wh_questions_subject_gap": 0.927, "blimp/accuracy/existential_there_quantifiers_2": 0.401, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.913, "blimp/accuracy/sentential_negation_npi_scope": 0.641, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.785, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.871, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.915, "blimp/accuracy/principle_A_case_2": 0.965, "blimp/accuracy/distractor_agreement_relational_noun": 0.834, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.989, "blimp/accuracy/superlative_quantifiers_1": 0.675, "blimp/accuracy/wh_island": 0.838, "blimp/accuracy/principle_A_domain_1": 0.984, "blimp/accuracy/complex_NP_island": 0.531, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.956, "blimp/accuracy/irregular_past_participle_verbs": 0.875, "blimp/accuracy/drop_argument": 0.773, "blimp/accuracy/wh_questions_object_gap": 0.753, "blimp/accuracy/animate_subject_passive": 0.797, "blimp/accuracy/existential_there_quantifiers_1": 0.987, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.916, "blimp/accuracy/npi_present_2": 0.567, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.924, "blimp/accuracy/anaphor_number_agreement": 0.984, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.95, "blimp/accuracy/existential_there_object_raising": 0.774, "blimp/accuracy/matrix_question_npi_licensor_present": 0.261, "blimp/accuracy/npi_present_1": 0.51, "blimp/accuracy/wh_vs_that_no_gap": 0.97, "blimp/accuracy/left_branch_island_echo_question": 0.448, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.971, "blimp/accuracy/causative": 0.711, "blimp/accuracy/group_average": 0.7719402985074628, "blimp/accuracy/seq_average": 0.7719402985074627, "cbt/accuracy/NE": 0.766426282051282, "cbt/accuracy/V": 0.9044, "cbt/accuracy/CN": 0.8136, "cbt/accuracy/P": 0.8976, "cbt/accuracy/group_average": 0.8455065705128204, "cbt/accuracy/seq_average": 0.8455382152861144, "hellaswag/accuracy/val": 0.2889862577175861, "hellaswag/accuracy/group_average": 0.2889862577175861, "hellaswag/accuracy/seq_average": 0.2889862577175861, "piqa/accuracy/val": 0.5848748639825898, "piqa/accuracy/group_average": 0.5848748639825898, "piqa/accuracy/seq_average": 0.5848748639825898, "ai2arc/accuracy/ARC-Easy": 0.32558139534883723, "ai2arc/accuracy/ARC-Challenge": 0.21030042918454936, "ai2arc/accuracy/group_average": 0.2679409122666933, "ai2arc/accuracy/seq_average": 0.28753541076487255, "race/accuracy/test/high": 0.2667238421955403, "race/accuracy/test/middle": 0.34052924791086353, "race/accuracy/group_average": 0.3036265450532019, "race/accuracy/seq_average": 0.2882042967166599, "siqa/accuracy/dev": 0.3618219037871034, "siqa/accuracy/group_average": 0.3618219037871034, "siqa/accuracy/seq_average": 0.3618219037871034, "commonsenseqa/accuracy/dev_rand_split": 0.25225225225225223, "commonsenseqa/accuracy/group_average": 0.25225225225225223, "commonsenseqa/accuracy/seq_average": 0.25225225225225223}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std002/export/result-model-20000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.8830183725508434, "val/accuracy": 0.4407823350694444, "val/perplexity": 17.86812452361534, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6093221155012616, "lambada/accuracy/total": 0.19448757763975155, "lambada/accuracy/openai_last_token": 0.7369953416149069, "lambada/perplexity": 18.475739305073482, "lambada/lm_loss": 3.404531378333529, "lambada/lm_perplexity": 30.100186818598587, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.317634956354598, "mean_loss": 2.7461702440260525, "blimp/accuracy/passive_2": 0.888, "blimp/accuracy/determiner_noun_agreement_2": 0.949, "blimp/accuracy/ellipsis_n_bar_1": 0.742, "blimp/accuracy/tough_vs_raising_2": 0.86, "blimp/accuracy/tough_vs_raising_1": 0.506, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.923, "blimp/accuracy/principle_A_reconstruction": 0.571, "blimp/accuracy/wh_vs_that_with_gap": 0.451, "blimp/accuracy/principle_A_domain_2": 0.812, "blimp/accuracy/determiner_noun_agreement_1": 0.98, "blimp/accuracy/ellipsis_n_bar_2": 0.91, "blimp/accuracy/principle_A_domain_3": 0.533, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.88, "blimp/accuracy/animate_subject_trans": 0.872, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.789, "blimp/accuracy/distractor_agreement_relative_clause": 0.517, "blimp/accuracy/transitive": 0.856, "blimp/accuracy/sentential_subject_island": 0.375, "blimp/accuracy/adjunct_island": 0.76, "blimp/accuracy/intransitive": 0.699, "blimp/accuracy/existential_there_subject_raising": 0.794, "blimp/accuracy/irregular_past_participle_adjectives": 0.9, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.234, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.155, "blimp/accuracy/only_npi_scope": 0.645, "blimp/accuracy/superlative_quantifiers_2": 0.698, "blimp/accuracy/passive_1": 0.887, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.887, "blimp/accuracy/inchoative": 0.555, "blimp/accuracy/anaphor_gender_agreement": 0.807, "blimp/accuracy/principle_A_c_command": 0.508, "blimp/accuracy/only_npi_licensor_present": 0.788, "blimp/accuracy/expletive_it_object_raising": 0.769, "blimp/accuracy/left_branch_island_simple_question": 0.38, "blimp/accuracy/wh_questions_subject_gap": 0.915, "blimp/accuracy/existential_there_quantifiers_2": 0.452, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.877, "blimp/accuracy/sentential_negation_npi_scope": 0.486, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.815, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.91, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.874, "blimp/accuracy/principle_A_case_2": 0.935, "blimp/accuracy/distractor_agreement_relational_noun": 0.836, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.973, "blimp/accuracy/superlative_quantifiers_1": 0.572, "blimp/accuracy/wh_island": 0.831, "blimp/accuracy/principle_A_domain_1": 0.991, "blimp/accuracy/complex_NP_island": 0.542, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.921, "blimp/accuracy/irregular_past_participle_verbs": 0.823, "blimp/accuracy/drop_argument": 0.744, "blimp/accuracy/wh_questions_object_gap": 0.778, "blimp/accuracy/animate_subject_passive": 0.762, "blimp/accuracy/existential_there_quantifiers_1": 0.975, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.868, "blimp/accuracy/npi_present_2": 0.607, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.893, "blimp/accuracy/anaphor_number_agreement": 0.967, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.928, "blimp/accuracy/existential_there_object_raising": 0.694, "blimp/accuracy/matrix_question_npi_licensor_present": 0.112, "blimp/accuracy/npi_present_1": 0.474, "blimp/accuracy/wh_vs_that_no_gap": 0.977, "blimp/accuracy/left_branch_island_echo_question": 0.398, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.987, "blimp/accuracy/causative": 0.671, "blimp/accuracy/group_average": 0.7383283582089553, "blimp/accuracy/seq_average": 0.7383283582089553, "cbt/accuracy/NE": 0.7147435897435898, "cbt/accuracy/V": 0.8836, "cbt/accuracy/CN": 0.7684, "cbt/accuracy/P": 0.856, "cbt/accuracy/group_average": 0.8056858974358974, "cbt/accuracy/seq_average": 0.8057222889155662, "hellaswag/accuracy/val": 0.27633937462656843, "hellaswag/accuracy/group_average": 0.27633937462656843, "hellaswag/accuracy/seq_average": 0.27633937462656843, "piqa/accuracy/val": 0.573449401523395, "piqa/accuracy/group_average": 0.573449401523395, "piqa/accuracy/seq_average": 0.573449401523395, "ai2arc/accuracy/ARC-Easy": 0.31374207188160674, "ai2arc/accuracy/ARC-Challenge": 0.19141630901287554, "ai2arc/accuracy/group_average": 0.25257919044724114, "ai2arc/accuracy/seq_average": 0.273371104815864, "race/accuracy/test/high": 0.25757575757575757, "race/accuracy/test/middle": 0.318941504178273, "race/accuracy/group_average": 0.2882586308770153, "race/accuracy/seq_average": 0.2754357519254155, "siqa/accuracy/dev": 0.3633572159672467, "siqa/accuracy/group_average": 0.3633572159672467, "siqa/accuracy/seq_average": 0.3633572159672467, "commonsenseqa/accuracy/dev_rand_split": 0.2375102375102375, "commonsenseqa/accuracy/group_average": 0.2375102375102375, "commonsenseqa/accuracy/seq_average": 0.2375102375102375}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std002/export/result-model-30000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.805029732840402, "val/accuracy": 0.4510992140997024, "val/perplexity": 16.52756731027827, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.578081782560171, "lambada/accuracy/total": 0.20555124223602483, "lambada/accuracy/openai_last_token": 0.7395186335403726, "lambada/perplexity": 16.635113095271553, "lambada/lm_loss": 3.360475525486515, "lambada/lm_perplexity": 28.80288412872843, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3283252281678636, "mean_loss": 2.6915557577002867, "blimp/accuracy/passive_2": 0.883, "blimp/accuracy/determiner_noun_agreement_2": 0.961, "blimp/accuracy/ellipsis_n_bar_1": 0.794, "blimp/accuracy/tough_vs_raising_2": 0.87, "blimp/accuracy/tough_vs_raising_1": 0.61, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.921, "blimp/accuracy/principle_A_reconstruction": 0.629, "blimp/accuracy/wh_vs_that_with_gap": 0.504, "blimp/accuracy/principle_A_domain_2": 0.844, "blimp/accuracy/determiner_noun_agreement_1": 0.975, "blimp/accuracy/ellipsis_n_bar_2": 0.895, "blimp/accuracy/principle_A_domain_3": 0.568, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.879, "blimp/accuracy/animate_subject_trans": 0.873, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.85, "blimp/accuracy/distractor_agreement_relative_clause": 0.478, "blimp/accuracy/transitive": 0.844, "blimp/accuracy/sentential_subject_island": 0.34, "blimp/accuracy/adjunct_island": 0.74, "blimp/accuracy/intransitive": 0.779, "blimp/accuracy/existential_there_subject_raising": 0.813, "blimp/accuracy/irregular_past_participle_adjectives": 0.894, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.191, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.232, "blimp/accuracy/only_npi_scope": 0.677, "blimp/accuracy/superlative_quantifiers_2": 0.775, "blimp/accuracy/passive_1": 0.87, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.909, "blimp/accuracy/inchoative": 0.633, "blimp/accuracy/anaphor_gender_agreement": 0.938, "blimp/accuracy/principle_A_c_command": 0.508, "blimp/accuracy/only_npi_licensor_present": 0.57, "blimp/accuracy/expletive_it_object_raising": 0.788, "blimp/accuracy/left_branch_island_simple_question": 0.278, "blimp/accuracy/wh_questions_subject_gap": 0.906, "blimp/accuracy/existential_there_quantifiers_2": 0.428, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.909, "blimp/accuracy/sentential_negation_npi_scope": 0.614, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.78, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.874, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.891, "blimp/accuracy/principle_A_case_2": 0.949, "blimp/accuracy/distractor_agreement_relational_noun": 0.777, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.993, "blimp/accuracy/superlative_quantifiers_1": 0.704, "blimp/accuracy/wh_island": 0.803, "blimp/accuracy/principle_A_domain_1": 0.975, "blimp/accuracy/complex_NP_island": 0.477, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.945, "blimp/accuracy/irregular_past_participle_verbs": 0.829, "blimp/accuracy/drop_argument": 0.78, "blimp/accuracy/wh_questions_object_gap": 0.739, "blimp/accuracy/animate_subject_passive": 0.816, "blimp/accuracy/existential_there_quantifiers_1": 0.985, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.879, "blimp/accuracy/npi_present_2": 0.554, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.909, "blimp/accuracy/anaphor_number_agreement": 0.98, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.928, "blimp/accuracy/existential_there_object_raising": 0.704, "blimp/accuracy/matrix_question_npi_licensor_present": 0.218, "blimp/accuracy/npi_present_1": 0.441, "blimp/accuracy/wh_vs_that_no_gap": 0.967, "blimp/accuracy/left_branch_island_echo_question": 0.342, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.975, "blimp/accuracy/causative": 0.678, "blimp/accuracy/group_average": 0.747194029850746, "blimp/accuracy/seq_average": 0.7471940298507462, "cbt/accuracy/NE": 0.7291666666666666, "cbt/accuracy/V": 0.8864, "cbt/accuracy/CN": 0.7836, "cbt/accuracy/P": 0.8708, "cbt/accuracy/group_average": 0.8174916666666666, "cbt/accuracy/seq_average": 0.8175270108043218, "hellaswag/accuracy/val": 0.27853017327225654, "hellaswag/accuracy/group_average": 0.27853017327225654, "hellaswag/accuracy/seq_average": 0.27853017327225654, "piqa/accuracy/val": 0.5680087051142546, "piqa/accuracy/group_average": 0.5680087051142546, "piqa/accuracy/seq_average": 0.5680087051142546, "ai2arc/accuracy/ARC-Easy": 0.3090909090909091, "ai2arc/accuracy/ARC-Challenge": 0.21201716738197424, "ai2arc/accuracy/group_average": 0.26055403823644163, "ai2arc/accuracy/seq_average": 0.2770538243626062, "race/accuracy/test/high": 0.2695826186392224, "race/accuracy/test/middle": 0.3370473537604457, "race/accuracy/group_average": 0.30331498619983405, "race/accuracy/seq_average": 0.2892176732873936, "siqa/accuracy/dev": 0.36284544524053225, "siqa/accuracy/group_average": 0.36284544524053225, "siqa/accuracy/seq_average": 0.36284544524053225, "commonsenseqa/accuracy/dev_rand_split": 0.2416052416052416, "commonsenseqa/accuracy/group_average": 0.2416052416052416, "commonsenseqa/accuracy/seq_average": 0.2416052416052416}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std002/export/result-model-40000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.7532234797402033, "val/accuracy": 0.45946199931795634, "val/perplexity": 15.693136948443597, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.766618432465547, "lambada/accuracy/total": 0.21642080745341616, "lambada/accuracy/openai_last_token": 0.7447593167701864, "lambada/perplexity": 15.84912874250996, "lambada/lm_loss": 3.2936870903803284, "lambada/lm_perplexity": 26.942018404192964, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.33794140338568623, "mean_loss": 2.7599209561028752, "blimp/accuracy/passive_2": 0.888, "blimp/accuracy/determiner_noun_agreement_2": 0.978, "blimp/accuracy/ellipsis_n_bar_1": 0.822, "blimp/accuracy/tough_vs_raising_2": 0.868, "blimp/accuracy/tough_vs_raising_1": 0.595, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.913, "blimp/accuracy/principle_A_reconstruction": 0.33, "blimp/accuracy/wh_vs_that_with_gap": 0.45, "blimp/accuracy/principle_A_domain_2": 0.804, "blimp/accuracy/determiner_noun_agreement_1": 0.975, "blimp/accuracy/ellipsis_n_bar_2": 0.9, "blimp/accuracy/principle_A_domain_3": 0.581, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.915, "blimp/accuracy/animate_subject_trans": 0.894, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.848, "blimp/accuracy/distractor_agreement_relative_clause": 0.595, "blimp/accuracy/transitive": 0.868, "blimp/accuracy/sentential_subject_island": 0.281, "blimp/accuracy/adjunct_island": 0.844, "blimp/accuracy/intransitive": 0.739, "blimp/accuracy/existential_there_subject_raising": 0.825, "blimp/accuracy/irregular_past_participle_adjectives": 0.86, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.361, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.2, "blimp/accuracy/only_npi_scope": 0.669, "blimp/accuracy/superlative_quantifiers_2": 0.64, "blimp/accuracy/passive_1": 0.885, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.906, "blimp/accuracy/inchoative": 0.597, "blimp/accuracy/anaphor_gender_agreement": 0.952, "blimp/accuracy/principle_A_c_command": 0.513, "blimp/accuracy/only_npi_licensor_present": 0.83, "blimp/accuracy/expletive_it_object_raising": 0.807, "blimp/accuracy/left_branch_island_simple_question": 0.486, "blimp/accuracy/wh_questions_subject_gap": 0.947, "blimp/accuracy/existential_there_quantifiers_2": 0.333, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.93, "blimp/accuracy/sentential_negation_npi_scope": 0.547, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.763, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.942, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.886, "blimp/accuracy/principle_A_case_2": 0.967, "blimp/accuracy/distractor_agreement_relational_noun": 0.827, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.703, "blimp/accuracy/wh_island": 0.789, "blimp/accuracy/principle_A_domain_1": 0.983, "blimp/accuracy/complex_NP_island": 0.502, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.961, "blimp/accuracy/irregular_past_participle_verbs": 0.886, "blimp/accuracy/drop_argument": 0.75, "blimp/accuracy/wh_questions_object_gap": 0.783, "blimp/accuracy/animate_subject_passive": 0.778, "blimp/accuracy/existential_there_quantifiers_1": 0.981, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.886, "blimp/accuracy/npi_present_2": 0.51, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.913, "blimp/accuracy/anaphor_number_agreement": 0.982, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.952, "blimp/accuracy/existential_there_object_raising": 0.773, "blimp/accuracy/matrix_question_npi_licensor_present": 0.161, "blimp/accuracy/npi_present_1": 0.473, "blimp/accuracy/wh_vs_that_no_gap": 0.987, "blimp/accuracy/left_branch_island_echo_question": 0.461, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.987, "blimp/accuracy/causative": 0.687, "blimp/accuracy/group_average": 0.7559104477611941, "blimp/accuracy/seq_average": 0.755910447761194, "cbt/accuracy/NE": 0.733573717948718, "cbt/accuracy/V": 0.894, "cbt/accuracy/CN": 0.7856, "cbt/accuracy/P": 0.8772, "cbt/accuracy/group_average": 0.8225934294871795, "cbt/accuracy/seq_average": 0.8226290516206483, "hellaswag/accuracy/val": 0.2784305915156343, "hellaswag/accuracy/group_average": 0.2784305915156343, "hellaswag/accuracy/seq_average": 0.2784305915156343, "piqa/accuracy/val": 0.5663764961915125, "piqa/accuracy/group_average": 0.5663764961915125, "piqa/accuracy/seq_average": 0.5663764961915125, "ai2arc/accuracy/ARC-Easy": 0.30655391120507397, "ai2arc/accuracy/ARC-Challenge": 0.20858369098712445, "ai2arc/accuracy/group_average": 0.2575688010960992, "ai2arc/accuracy/seq_average": 0.27422096317280453, "race/accuracy/test/high": 0.2641509433962264, "race/accuracy/test/middle": 0.3293871866295265, "race/accuracy/group_average": 0.2967690650128765, "race/accuracy/seq_average": 0.28313741386299146, "siqa/accuracy/dev": 0.3638689866939611, "siqa/accuracy/group_average": 0.3638689866939611, "siqa/accuracy/seq_average": 0.3638689866939611, "commonsenseqa/accuracy/dev_rand_split": 0.2538902538902539, "commonsenseqa/accuracy/group_average": 0.2538902538902539, "commonsenseqa/accuracy/seq_average": 0.2538902538902539}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std002/export/result-model-50000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.7107669890873014, "val/accuracy": 0.46458701481894843, "val/perplexity": 15.04080722696871, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7217646059782608, "lambada/accuracy/total": 0.22534937888198758, "lambada/accuracy/openai_last_token": 0.7540760869565217, "lambada/perplexity": 14.189138846442004, "lambada/lm_loss": 3.2811740475033107, "lambada/lm_perplexity": 26.606992242487774, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.344968196850468, "mean_loss": 2.716265797532781, "blimp/accuracy/passive_2": 0.898, "blimp/accuracy/determiner_noun_agreement_2": 0.979, "blimp/accuracy/ellipsis_n_bar_1": 0.828, "blimp/accuracy/tough_vs_raising_2": 0.912, "blimp/accuracy/tough_vs_raising_1": 0.495, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.923, "blimp/accuracy/principle_A_reconstruction": 0.573, "blimp/accuracy/wh_vs_that_with_gap": 0.463, "blimp/accuracy/principle_A_domain_2": 0.873, "blimp/accuracy/determiner_noun_agreement_1": 0.98, "blimp/accuracy/ellipsis_n_bar_2": 0.901, "blimp/accuracy/principle_A_domain_3": 0.577, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.903, "blimp/accuracy/animate_subject_trans": 0.887, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.836, "blimp/accuracy/distractor_agreement_relative_clause": 0.618, "blimp/accuracy/transitive": 0.84, "blimp/accuracy/sentential_subject_island": 0.292, "blimp/accuracy/adjunct_island": 0.794, "blimp/accuracy/intransitive": 0.754, "blimp/accuracy/existential_there_subject_raising": 0.817, "blimp/accuracy/irregular_past_participle_adjectives": 0.896, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.289, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.233, "blimp/accuracy/only_npi_scope": 0.696, "blimp/accuracy/superlative_quantifiers_2": 0.767, "blimp/accuracy/passive_1": 0.878, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.907, "blimp/accuracy/inchoative": 0.608, "blimp/accuracy/anaphor_gender_agreement": 0.943, "blimp/accuracy/principle_A_c_command": 0.551, "blimp/accuracy/only_npi_licensor_present": 0.814, "blimp/accuracy/expletive_it_object_raising": 0.813, "blimp/accuracy/left_branch_island_simple_question": 0.355, "blimp/accuracy/wh_questions_subject_gap": 0.933, "blimp/accuracy/existential_there_quantifiers_2": 0.436, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.91, "blimp/accuracy/sentential_negation_npi_scope": 0.582, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.794, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.896, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.888, "blimp/accuracy/principle_A_case_2": 0.97, "blimp/accuracy/distractor_agreement_relational_noun": 0.87, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.985, "blimp/accuracy/superlative_quantifiers_1": 0.623, "blimp/accuracy/wh_island": 0.856, "blimp/accuracy/principle_A_domain_1": 0.995, "blimp/accuracy/complex_NP_island": 0.524, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.941, "blimp/accuracy/irregular_past_participle_verbs": 0.86, "blimp/accuracy/drop_argument": 0.76, "blimp/accuracy/wh_questions_object_gap": 0.801, "blimp/accuracy/animate_subject_passive": 0.779, "blimp/accuracy/existential_there_quantifiers_1": 0.986, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.93, "blimp/accuracy/npi_present_2": 0.547, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.9, "blimp/accuracy/anaphor_number_agreement": 0.98, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.928, "blimp/accuracy/existential_there_object_raising": 0.807, "blimp/accuracy/matrix_question_npi_licensor_present": 0.214, "blimp/accuracy/npi_present_1": 0.517, "blimp/accuracy/wh_vs_that_no_gap": 0.975, "blimp/accuracy/left_branch_island_echo_question": 0.373, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.975, "blimp/accuracy/causative": 0.671, "blimp/accuracy/group_average": 0.7626716417910447, "blimp/accuracy/seq_average": 0.7626716417910447, "cbt/accuracy/NE": 0.7395833333333334, "cbt/accuracy/V": 0.8996, "cbt/accuracy/CN": 0.7936, "cbt/accuracy/P": 0.8752, "cbt/accuracy/group_average": 0.8269958333333334, "cbt/accuracy/seq_average": 0.8270308123249299, "hellaswag/accuracy/val": 0.28440549691296557, "hellaswag/accuracy/group_average": 0.28440549691296557, "hellaswag/accuracy/seq_average": 0.28440549691296557, "piqa/accuracy/val": 0.5631120783460283, "piqa/accuracy/group_average": 0.5631120783460283, "piqa/accuracy/seq_average": 0.5631120783460283, "ai2arc/accuracy/ARC-Easy": 0.3103594080338266, "ai2arc/accuracy/ARC-Challenge": 0.21716738197424892, "ai2arc/accuracy/group_average": 0.2637633950040378, "ai2arc/accuracy/seq_average": 0.27960339943342777, "race/accuracy/test/high": 0.2698684962835906, "race/accuracy/test/middle": 0.35236768802228413, "race/accuracy/group_average": 0.3111180921529374, "race/accuracy/seq_average": 0.2938792055127685, "siqa/accuracy/dev": 0.3577277379733879, "siqa/accuracy/group_average": 0.3577277379733879, "siqa/accuracy/seq_average": 0.3577277379733879, "commonsenseqa/accuracy/dev_rand_split": 0.24815724815724816, "commonsenseqa/accuracy/group_average": 0.24815724815724816, "commonsenseqa/accuracy/seq_average": 0.24815724815724816}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std002/export/result-model-60000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.6778205992683533, "val/accuracy": 0.4696752154637897, "val/perplexity": 14.55334114577819, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6162454356317935, "lambada/accuracy/total": 0.23621894409937888, "lambada/accuracy/openai_last_token": 0.7562111801242236, "lambada/perplexity": 13.47643535629987, "lambada/lm_loss": 3.2213399801371128, "lambada/lm_perplexity": 25.061679844869847, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3529470797815843, "mean_loss": 2.6470330174500734, "blimp/accuracy/passive_2": 0.891, "blimp/accuracy/determiner_noun_agreement_2": 0.979, "blimp/accuracy/ellipsis_n_bar_1": 0.839, "blimp/accuracy/tough_vs_raising_2": 0.89, "blimp/accuracy/tough_vs_raising_1": 0.544, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.922, "blimp/accuracy/principle_A_reconstruction": 0.251, "blimp/accuracy/wh_vs_that_with_gap": 0.498, "blimp/accuracy/principle_A_domain_2": 0.828, "blimp/accuracy/determiner_noun_agreement_1": 0.988, "blimp/accuracy/ellipsis_n_bar_2": 0.886, "blimp/accuracy/principle_A_domain_3": 0.561, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.916, "blimp/accuracy/animate_subject_trans": 0.899, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.878, "blimp/accuracy/distractor_agreement_relative_clause": 0.625, "blimp/accuracy/transitive": 0.855, "blimp/accuracy/sentential_subject_island": 0.244, "blimp/accuracy/adjunct_island": 0.8, "blimp/accuracy/intransitive": 0.776, "blimp/accuracy/existential_there_subject_raising": 0.826, "blimp/accuracy/irregular_past_participle_adjectives": 0.957, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.296, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.294, "blimp/accuracy/only_npi_scope": 0.721, "blimp/accuracy/superlative_quantifiers_2": 0.674, "blimp/accuracy/passive_1": 0.871, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.904, "blimp/accuracy/inchoative": 0.64, "blimp/accuracy/anaphor_gender_agreement": 0.953, "blimp/accuracy/principle_A_c_command": 0.565, "blimp/accuracy/only_npi_licensor_present": 0.662, "blimp/accuracy/expletive_it_object_raising": 0.79, "blimp/accuracy/left_branch_island_simple_question": 0.371, "blimp/accuracy/wh_questions_subject_gap": 0.927, "blimp/accuracy/existential_there_quantifiers_2": 0.268, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.914, "blimp/accuracy/sentential_negation_npi_scope": 0.647, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.785, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.86, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.912, "blimp/accuracy/principle_A_case_2": 0.963, "blimp/accuracy/distractor_agreement_relational_noun": 0.837, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.997, "blimp/accuracy/superlative_quantifiers_1": 0.733, "blimp/accuracy/wh_island": 0.815, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.508, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.948, "blimp/accuracy/irregular_past_participle_verbs": 0.9, "blimp/accuracy/drop_argument": 0.762, "blimp/accuracy/wh_questions_object_gap": 0.761, "blimp/accuracy/animate_subject_passive": 0.79, "blimp/accuracy/existential_there_quantifiers_1": 0.977, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.906, "blimp/accuracy/npi_present_2": 0.529, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.925, "blimp/accuracy/anaphor_number_agreement": 0.983, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.949, "blimp/accuracy/existential_there_object_raising": 0.712, "blimp/accuracy/matrix_question_npi_licensor_present": 0.287, "blimp/accuracy/npi_present_1": 0.482, "blimp/accuracy/wh_vs_that_no_gap": 0.974, "blimp/accuracy/left_branch_island_echo_question": 0.456, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.969, "blimp/accuracy/causative": 0.721, "blimp/accuracy/group_average": 0.7578955223880596, "blimp/accuracy/seq_average": 0.7578955223880597, "cbt/accuracy/NE": 0.7544070512820513, "cbt/accuracy/V": 0.8992, "cbt/accuracy/CN": 0.8004, "cbt/accuracy/P": 0.8872, "cbt/accuracy/group_average": 0.8353017628205128, "cbt/accuracy/seq_average": 0.8353341336534614, "hellaswag/accuracy/val": 0.2860983867755427, "hellaswag/accuracy/group_average": 0.2860983867755427, "hellaswag/accuracy/seq_average": 0.2860983867755427, "piqa/accuracy/val": 0.5712731229597389, "piqa/accuracy/group_average": 0.5712731229597389, "piqa/accuracy/seq_average": 0.5712731229597389, "ai2arc/accuracy/ARC-Easy": 0.3192389006342495, "ai2arc/accuracy/ARC-Challenge": 0.20515021459227467, "ai2arc/accuracy/group_average": 0.26219455761326205, "ai2arc/accuracy/seq_average": 0.28158640226628895, "race/accuracy/test/high": 0.2667238421955403, "race/accuracy/test/middle": 0.346100278551532, "race/accuracy/group_average": 0.3064120603735362, "race/accuracy/seq_average": 0.2898256992298338, "siqa/accuracy/dev": 0.35209825997952915, "siqa/accuracy/group_average": 0.35209825997952915, "siqa/accuracy/seq_average": 0.35209825997952915, "commonsenseqa/accuracy/dev_rand_split": 0.25225225225225223, "commonsenseqa/accuracy/group_average": 0.25225225225225223, "commonsenseqa/accuracy/seq_average": 0.25225225225225223}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std002/export/result-model-70000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.65133061484685, "val/accuracy": 0.4735446506076389, "val/perplexity": 14.172884755016232, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.761416227921196, "lambada/accuracy/total": 0.2420419254658385, "lambada/accuracy/openai_last_token": 0.7579580745341615, "lambada/perplexity": 13.190682585637875, "lambada/lm_loss": 3.2158258288774535, "lambada/lm_perplexity": 24.923866263046893, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3577932880367387, "mean_loss": 2.706373421384023, "blimp/accuracy/passive_2": 0.893, "blimp/accuracy/determiner_noun_agreement_2": 0.973, "blimp/accuracy/ellipsis_n_bar_1": 0.856, "blimp/accuracy/tough_vs_raising_2": 0.866, "blimp/accuracy/tough_vs_raising_1": 0.564, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.916, "blimp/accuracy/principle_A_reconstruction": 0.416, "blimp/accuracy/wh_vs_that_with_gap": 0.481, "blimp/accuracy/principle_A_domain_2": 0.827, "blimp/accuracy/determiner_noun_agreement_1": 0.982, "blimp/accuracy/ellipsis_n_bar_2": 0.901, "blimp/accuracy/principle_A_domain_3": 0.593, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.904, "blimp/accuracy/animate_subject_trans": 0.904, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.856, "blimp/accuracy/distractor_agreement_relative_clause": 0.611, "blimp/accuracy/transitive": 0.875, "blimp/accuracy/sentential_subject_island": 0.276, "blimp/accuracy/adjunct_island": 0.826, "blimp/accuracy/intransitive": 0.768, "blimp/accuracy/existential_there_subject_raising": 0.846, "blimp/accuracy/irregular_past_participle_adjectives": 0.945, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.318, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.326, "blimp/accuracy/only_npi_scope": 0.696, "blimp/accuracy/superlative_quantifiers_2": 0.74, "blimp/accuracy/passive_1": 0.868, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.905, "blimp/accuracy/inchoative": 0.626, "blimp/accuracy/anaphor_gender_agreement": 0.956, "blimp/accuracy/principle_A_c_command": 0.526, "blimp/accuracy/only_npi_licensor_present": 0.709, "blimp/accuracy/expletive_it_object_raising": 0.804, "blimp/accuracy/left_branch_island_simple_question": 0.434, "blimp/accuracy/wh_questions_subject_gap": 0.921, "blimp/accuracy/existential_there_quantifiers_2": 0.366, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.896, "blimp/accuracy/sentential_negation_npi_scope": 0.61, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.835, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.866, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.911, "blimp/accuracy/principle_A_case_2": 0.957, "blimp/accuracy/distractor_agreement_relational_noun": 0.828, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.993, "blimp/accuracy/superlative_quantifiers_1": 0.741, "blimp/accuracy/wh_island": 0.834, "blimp/accuracy/principle_A_domain_1": 0.987, "blimp/accuracy/complex_NP_island": 0.543, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.953, "blimp/accuracy/irregular_past_participle_verbs": 0.879, "blimp/accuracy/drop_argument": 0.748, "blimp/accuracy/wh_questions_object_gap": 0.774, "blimp/accuracy/animate_subject_passive": 0.795, "blimp/accuracy/existential_there_quantifiers_1": 0.98, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.892, "blimp/accuracy/npi_present_2": 0.506, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.915, "blimp/accuracy/anaphor_number_agreement": 0.984, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.941, "blimp/accuracy/existential_there_object_raising": 0.776, "blimp/accuracy/matrix_question_npi_licensor_present": 0.226, "blimp/accuracy/npi_present_1": 0.441, "blimp/accuracy/wh_vs_that_no_gap": 0.978, "blimp/accuracy/left_branch_island_echo_question": 0.444, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.976, "blimp/accuracy/causative": 0.698, "blimp/accuracy/group_average": 0.7638955223880601, "blimp/accuracy/seq_average": 0.7638955223880597, "cbt/accuracy/NE": 0.7512019230769231, "cbt/accuracy/V": 0.9008, "cbt/accuracy/CN": 0.8076, "cbt/accuracy/P": 0.8852, "cbt/accuracy/group_average": 0.8362004807692307, "cbt/accuracy/seq_average": 0.836234493797519, "hellaswag/accuracy/val": 0.288388767177853, "hellaswag/accuracy/group_average": 0.288388767177853, "hellaswag/accuracy/seq_average": 0.288388767177853, "piqa/accuracy/val": 0.5788900979325353, "piqa/accuracy/group_average": 0.5788900979325353, "piqa/accuracy/seq_average": 0.5788900979325353, "ai2arc/accuracy/ARC-Easy": 0.3276955602536998, "ai2arc/accuracy/ARC-Challenge": 0.20600858369098712, "ai2arc/accuracy/group_average": 0.26685207197234345, "ai2arc/accuracy/seq_average": 0.28753541076487255, "race/accuracy/test/high": 0.2701543739279588, "race/accuracy/test/middle": 0.334958217270195, "race/accuracy/group_average": 0.30255629559907693, "race/accuracy/seq_average": 0.28901499797324687, "siqa/accuracy/dev": 0.36131013306038895, "siqa/accuracy/group_average": 0.36131013306038895, "siqa/accuracy/seq_average": 0.36131013306038895, "commonsenseqa/accuracy/dev_rand_split": 0.25143325143325146, "commonsenseqa/accuracy/group_average": 0.25143325143325146, "commonsenseqa/accuracy/seq_average": 0.25143325143325146}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std002/export/result-model-80000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.629910908048115, "val/accuracy": 0.47677370101686506, "val/perplexity": 13.87253391594835, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5496033852144797, "lambada/accuracy/total": 0.25854037267080743, "lambada/accuracy/openai_last_token": 0.7620341614906833, "lambada/perplexity": 11.49659632764373, "lambada/lm_loss": 3.1883682156087096, "lambada/lm_perplexity": 24.24882628536669, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3676570368438362, "mean_loss": 2.5897571466312974, "blimp/accuracy/passive_2": 0.922, "blimp/accuracy/determiner_noun_agreement_2": 0.978, "blimp/accuracy/ellipsis_n_bar_1": 0.85, "blimp/accuracy/tough_vs_raising_2": 0.88, "blimp/accuracy/tough_vs_raising_1": 0.602, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.925, "blimp/accuracy/principle_A_reconstruction": 0.348, "blimp/accuracy/wh_vs_that_with_gap": 0.471, "blimp/accuracy/principle_A_domain_2": 0.843, "blimp/accuracy/determiner_noun_agreement_1": 0.987, "blimp/accuracy/ellipsis_n_bar_2": 0.91, "blimp/accuracy/principle_A_domain_3": 0.588, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.919, "blimp/accuracy/animate_subject_trans": 0.907, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.89, "blimp/accuracy/distractor_agreement_relative_clause": 0.654, "blimp/accuracy/transitive": 0.872, "blimp/accuracy/sentential_subject_island": 0.259, "blimp/accuracy/adjunct_island": 0.794, "blimp/accuracy/intransitive": 0.813, "blimp/accuracy/existential_there_subject_raising": 0.844, "blimp/accuracy/irregular_past_participle_adjectives": 0.895, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.258, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.288, "blimp/accuracy/only_npi_scope": 0.645, "blimp/accuracy/superlative_quantifiers_2": 0.813, "blimp/accuracy/passive_1": 0.894, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.901, "blimp/accuracy/inchoative": 0.65, "blimp/accuracy/anaphor_gender_agreement": 0.944, "blimp/accuracy/principle_A_c_command": 0.565, "blimp/accuracy/only_npi_licensor_present": 0.756, "blimp/accuracy/expletive_it_object_raising": 0.784, "blimp/accuracy/left_branch_island_simple_question": 0.339, "blimp/accuracy/wh_questions_subject_gap": 0.933, "blimp/accuracy/existential_there_quantifiers_2": 0.339, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.908, "blimp/accuracy/sentential_negation_npi_scope": 0.67, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.808, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.883, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.898, "blimp/accuracy/principle_A_case_2": 0.967, "blimp/accuracy/distractor_agreement_relational_noun": 0.827, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.995, "blimp/accuracy/superlative_quantifiers_1": 0.704, "blimp/accuracy/wh_island": 0.832, "blimp/accuracy/principle_A_domain_1": 0.981, "blimp/accuracy/complex_NP_island": 0.535, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.955, "blimp/accuracy/irregular_past_participle_verbs": 0.872, "blimp/accuracy/drop_argument": 0.779, "blimp/accuracy/wh_questions_object_gap": 0.786, "blimp/accuracy/animate_subject_passive": 0.801, "blimp/accuracy/existential_there_quantifiers_1": 0.979, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.895, "blimp/accuracy/npi_present_2": 0.565, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.924, "blimp/accuracy/anaphor_number_agreement": 0.983, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.955, "blimp/accuracy/existential_there_object_raising": 0.759, "blimp/accuracy/matrix_question_npi_licensor_present": 0.302, "blimp/accuracy/npi_present_1": 0.521, "blimp/accuracy/wh_vs_that_no_gap": 0.977, "blimp/accuracy/left_branch_island_echo_question": 0.34, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.978, "blimp/accuracy/causative": 0.708, "blimp/accuracy/group_average": 0.766373134328358, "blimp/accuracy/seq_average": 0.7663731343283582, "cbt/accuracy/NE": 0.7600160256410257, "cbt/accuracy/V": 0.9, "cbt/accuracy/CN": 0.8072, "cbt/accuracy/P": 0.8856, "cbt/accuracy/group_average": 0.8382040064102565, "cbt/accuracy/seq_average": 0.8382352941176471, "hellaswag/accuracy/val": 0.289982075283808, "hellaswag/accuracy/group_average": 0.289982075283808, "hellaswag/accuracy/seq_average": 0.289982075283808, "piqa/accuracy/val": 0.5854189336235038, "piqa/accuracy/group_average": 0.5854189336235038, "piqa/accuracy/seq_average": 0.5854189336235038, "ai2arc/accuracy/ARC-Easy": 0.32684989429175476, "ai2arc/accuracy/ARC-Challenge": 0.21373390557939914, "ai2arc/accuracy/group_average": 0.27029189993557695, "ai2arc/accuracy/seq_average": 0.2895184135977337, "race/accuracy/test/high": 0.26615208690680386, "race/accuracy/test/middle": 0.3426183844011142, "race/accuracy/group_average": 0.30438523565395903, "race/accuracy/seq_average": 0.2884069720308067, "siqa/accuracy/dev": 0.3669396110542477, "siqa/accuracy/group_average": 0.3669396110542477, "siqa/accuracy/seq_average": 0.3669396110542477, "commonsenseqa/accuracy/dev_rand_split": 0.26126126126126126, "commonsenseqa/accuracy/group_average": 0.26126126126126126, "commonsenseqa/accuracy/seq_average": 0.26126126126126126}
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_standard_lb_std002/export/result-model-90000.pth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"val/loss": 2.617705814422123, "val/accuracy": 0.4780060298859127, "val/perplexity": 13.704247406031023, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6800753196574147, "lambada/accuracy/total": 0.25077639751552794, "lambada/accuracy/openai_last_token": 0.7593167701863354, "lambada/perplexity": 12.225762755744956, "lambada/lm_loss": 3.1659622290853013, "lambada/lm_perplexity": 23.711549001376188, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3643912137007203, "mean_loss": 2.6488905670397687, "blimp/accuracy/passive_2": 0.906, "blimp/accuracy/determiner_noun_agreement_2": 0.979, "blimp/accuracy/ellipsis_n_bar_1": 0.834, "blimp/accuracy/tough_vs_raising_2": 0.877, "blimp/accuracy/tough_vs_raising_1": 0.575, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.932, "blimp/accuracy/principle_A_reconstruction": 0.399, "blimp/accuracy/wh_vs_that_with_gap": 0.483, "blimp/accuracy/principle_A_domain_2": 0.837, "blimp/accuracy/determiner_noun_agreement_1": 0.984, "blimp/accuracy/ellipsis_n_bar_2": 0.903, "blimp/accuracy/principle_A_domain_3": 0.595, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.922, "blimp/accuracy/animate_subject_trans": 0.905, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.883, "blimp/accuracy/distractor_agreement_relative_clause": 0.606, "blimp/accuracy/transitive": 0.883, "blimp/accuracy/sentential_subject_island": 0.268, "blimp/accuracy/adjunct_island": 0.78, "blimp/accuracy/intransitive": 0.796, "blimp/accuracy/existential_there_subject_raising": 0.856, "blimp/accuracy/irregular_past_participle_adjectives": 0.933, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.298, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.296, "blimp/accuracy/only_npi_scope": 0.642, "blimp/accuracy/superlative_quantifiers_2": 0.727, "blimp/accuracy/passive_1": 0.889, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.924, "blimp/accuracy/inchoative": 0.654, "blimp/accuracy/anaphor_gender_agreement": 0.954, "blimp/accuracy/principle_A_c_command": 0.557, "blimp/accuracy/only_npi_licensor_present": 0.74, "blimp/accuracy/expletive_it_object_raising": 0.806, "blimp/accuracy/left_branch_island_simple_question": 0.376, "blimp/accuracy/wh_questions_subject_gap": 0.924, "blimp/accuracy/existential_there_quantifiers_2": 0.365, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.924, "blimp/accuracy/sentential_negation_npi_scope": 0.595, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.804, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.858, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.911, "blimp/accuracy/principle_A_case_2": 0.958, "blimp/accuracy/distractor_agreement_relational_noun": 0.842, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.992, "blimp/accuracy/superlative_quantifiers_1": 0.784, "blimp/accuracy/wh_island": 0.834, "blimp/accuracy/principle_A_domain_1": 0.981, "blimp/accuracy/complex_NP_island": 0.51, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.957, "blimp/accuracy/irregular_past_participle_verbs": 0.886, "blimp/accuracy/drop_argument": 0.759, "blimp/accuracy/wh_questions_object_gap": 0.793, "blimp/accuracy/animate_subject_passive": 0.787, "blimp/accuracy/existential_there_quantifiers_1": 0.987, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.904, "blimp/accuracy/npi_present_2": 0.572, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.916, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.958, "blimp/accuracy/existential_there_object_raising": 0.766, "blimp/accuracy/matrix_question_npi_licensor_present": 0.249, "blimp/accuracy/npi_present_1": 0.524, "blimp/accuracy/wh_vs_that_no_gap": 0.967, "blimp/accuracy/left_branch_island_echo_question": 0.404, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.968, "blimp/accuracy/causative": 0.728, "blimp/accuracy/group_average": 0.7670746268656716, "blimp/accuracy/seq_average": 0.7670746268656716, "cbt/accuracy/NE": 0.7636217948717948, "cbt/accuracy/V": 0.9032, "cbt/accuracy/CN": 0.8136, "cbt/accuracy/P": 0.8956, "cbt/accuracy/group_average": 0.8440054487179487, "cbt/accuracy/seq_average": 0.8440376150460184, "hellaswag/accuracy/val": 0.28828918542123083, "hellaswag/accuracy/group_average": 0.28828918542123083, "hellaswag/accuracy/seq_average": 0.28828918542123083, "piqa/accuracy/val": 0.5821545157780196, "piqa/accuracy/group_average": 0.5821545157780196, "piqa/accuracy/seq_average": 0.5821545157780196, "ai2arc/accuracy/ARC-Easy": 0.3289640591966173, "ai2arc/accuracy/ARC-Challenge": 0.20429184549356222, "ai2arc/accuracy/group_average": 0.26662795234508974, "ai2arc/accuracy/seq_average": 0.2878186968838527, "race/accuracy/test/high": 0.26786735277301316, "race/accuracy/test/middle": 0.3426183844011142, "race/accuracy/group_average": 0.3052428685870637, "race/accuracy/seq_average": 0.28962302391568706, "siqa/accuracy/dev": 0.3633572159672467, "siqa/accuracy/group_average": 0.3633572159672467, "siqa/accuracy/seq_average": 0.3633572159672467, "commonsenseqa/accuracy/dev_rand_split": 0.25552825552825553, "commonsenseqa/accuracy/group_average": 0.25552825552825553, "commonsenseqa/accuracy/seq_average": 0.25552825552825553}