Upload folder using huggingface_hub (#1690)
Browse files- 6653af4d2d7ed121e698cca1625320d9b56d7abf2b9f1c4eba7de6551eb1c9fc (59081eef4975d59e94f1eaac3ca48329e071ebc1)
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-10000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-100000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-20000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-30000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-40000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-50000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-60000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-70000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-80000.pth.json +1 -0
- Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-90000.pth.json +1 -0
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-10000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 3.0346478659009177, "val/accuracy": 0.42310151599702384, "val/perplexity": 20.793654455355906, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.7486149568735443, "lambada/accuracy/total": 0.15547360248447206, "lambada/accuracy/openai_last_token": 0.7135093167701864, "lambada/perplexity": 26.947031333195046, "lambada/lm_loss": 3.5719872525520038, "lambada/lm_perplexity": 35.587243769066475, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.28928755924074795, "mean_loss": 2.8916314113872312, "blimp/accuracy/passive_2": 0.86, "blimp/accuracy/determiner_noun_agreement_2": 0.968, "blimp/accuracy/ellipsis_n_bar_1": 0.71, "blimp/accuracy/tough_vs_raising_2": 0.842, "blimp/accuracy/tough_vs_raising_1": 0.504, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.855, "blimp/accuracy/principle_A_reconstruction": 0.406, "blimp/accuracy/wh_vs_that_with_gap": 0.391, "blimp/accuracy/principle_A_domain_2": 0.752, "blimp/accuracy/determiner_noun_agreement_1": 0.98, "blimp/accuracy/ellipsis_n_bar_2": 0.865, "blimp/accuracy/principle_A_domain_3": 0.505, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.9, "blimp/accuracy/animate_subject_trans": 0.878, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.767, "blimp/accuracy/distractor_agreement_relative_clause": 0.45, "blimp/accuracy/transitive": 0.811, "blimp/accuracy/sentential_subject_island": 0.387, "blimp/accuracy/adjunct_island": 0.677, "blimp/accuracy/intransitive": 0.715, "blimp/accuracy/existential_there_subject_raising": 0.844, "blimp/accuracy/irregular_past_participle_adjectives": 0.967, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.199, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.157, "blimp/accuracy/only_npi_scope": 0.612, "blimp/accuracy/superlative_quantifiers_2": 0.832, "blimp/accuracy/passive_1": 0.861, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.866, "blimp/accuracy/inchoative": 0.532, "blimp/accuracy/anaphor_gender_agreement": 0.898, "blimp/accuracy/principle_A_c_command": 0.609, "blimp/accuracy/only_npi_licensor_present": 0.351, "blimp/accuracy/expletive_it_object_raising": 0.774, "blimp/accuracy/left_branch_island_simple_question": 0.222, "blimp/accuracy/wh_questions_subject_gap": 0.883, "blimp/accuracy/existential_there_quantifiers_2": 0.264, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.925, "blimp/accuracy/sentential_negation_npi_scope": 0.438, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.745, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.827, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.845, "blimp/accuracy/principle_A_case_2": 0.915, "blimp/accuracy/distractor_agreement_relational_noun": 0.633, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.961, "blimp/accuracy/superlative_quantifiers_1": 0.632, "blimp/accuracy/wh_island": 0.779, "blimp/accuracy/principle_A_domain_1": 0.979, "blimp/accuracy/complex_NP_island": 0.611, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.93, "blimp/accuracy/irregular_past_participle_verbs": 0.81, "blimp/accuracy/drop_argument": 0.756, "blimp/accuracy/wh_questions_object_gap": 0.679, "blimp/accuracy/animate_subject_passive": 0.761, "blimp/accuracy/existential_there_quantifiers_1": 0.882, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.826, "blimp/accuracy/npi_present_2": 0.588, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.835, "blimp/accuracy/anaphor_number_agreement": 0.97, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.928, "blimp/accuracy/existential_there_object_raising": 0.791, "blimp/accuracy/matrix_question_npi_licensor_present": 0.096, "blimp/accuracy/npi_present_1": 0.537, "blimp/accuracy/wh_vs_that_no_gap": 0.963, "blimp/accuracy/left_branch_island_echo_question": 0.446, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.98, "blimp/accuracy/causative": 0.658, "blimp/accuracy/group_average": 0.713731343283582, "blimp/accuracy/seq_average": 0.7137313432835821, "cbt/accuracy/NE": 0.6899038461538461, "cbt/accuracy/V": 0.8608, "cbt/accuracy/CN": 0.7364, "cbt/accuracy/P": 0.8336, "cbt/accuracy/group_average": 0.7801759615384616, "cbt/accuracy/seq_average": 0.7802120848339336, "hellaswag/accuracy/val": 0.2711611232822147, "hellaswag/accuracy/group_average": 0.2711611232822147, "hellaswag/accuracy/seq_average": 0.2711611232822147, "piqa/accuracy/val": 0.5516866158868335, "piqa/accuracy/group_average": 0.5516866158868335, "piqa/accuracy/seq_average": 0.5516866158868335, "ai2arc/accuracy/ARC-Easy": 0.30824524312896406, "ai2arc/accuracy/ARC-Challenge": 0.19313304721030042, "ai2arc/accuracy/group_average": 0.25068914516963225, "ai2arc/accuracy/seq_average": 0.27025495750708217, "race/accuracy/test/high": 0.2469982847341338, "race/accuracy/test/middle": 0.31754874651810583, "race/accuracy/group_average": 0.2822735156261198, "race/accuracy/seq_average": 0.26753141467369274, "siqa/accuracy/dev": 0.3577277379733879, "siqa/accuracy/group_average": 0.3577277379733879, "siqa/accuracy/seq_average": 0.3577277379733879, "commonsenseqa/accuracy/dev_rand_split": 0.2375102375102375, "commonsenseqa/accuracy/group_average": 0.2375102375102375, "commonsenseqa/accuracy/seq_average": 0.2375102375102375}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-100000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.616970001705109, "val/accuracy": 0.47888571118551587, "val/perplexity": 13.694167355482048, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.634816234896642, "lambada/accuracy/total": 0.24902950310559005, "lambada/accuracy/openai_last_token": 0.765333850931677, "lambada/perplexity": 12.334549158119103, "lambada/lm_loss": 3.1872532853383397, "lambada/lm_perplexity": 24.22180560080881, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.363957607145553, "mean_loss": 2.6258931183008754, "blimp/accuracy/passive_2": 0.91, "blimp/accuracy/determiner_noun_agreement_2": 0.994, "blimp/accuracy/ellipsis_n_bar_1": 0.809, "blimp/accuracy/tough_vs_raising_2": 0.873, "blimp/accuracy/tough_vs_raising_1": 0.61, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.884, "blimp/accuracy/principle_A_reconstruction": 0.404, "blimp/accuracy/wh_vs_that_with_gap": 0.565, "blimp/accuracy/principle_A_domain_2": 0.811, "blimp/accuracy/determiner_noun_agreement_1": 0.986, "blimp/accuracy/ellipsis_n_bar_2": 0.898, "blimp/accuracy/principle_A_domain_3": 0.536, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.917, "blimp/accuracy/animate_subject_trans": 0.912, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.904, "blimp/accuracy/distractor_agreement_relative_clause": 0.646, "blimp/accuracy/transitive": 0.879, "blimp/accuracy/sentential_subject_island": 0.392, "blimp/accuracy/adjunct_island": 0.81, "blimp/accuracy/intransitive": 0.787, "blimp/accuracy/existential_there_subject_raising": 0.883, "blimp/accuracy/irregular_past_participle_adjectives": 0.987, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.321, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.249, "blimp/accuracy/only_npi_scope": 0.749, "blimp/accuracy/superlative_quantifiers_2": 0.836, "blimp/accuracy/passive_1": 0.883, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.9, "blimp/accuracy/inchoative": 0.607, "blimp/accuracy/anaphor_gender_agreement": 0.909, "blimp/accuracy/principle_A_c_command": 0.627, "blimp/accuracy/only_npi_licensor_present": 0.536, "blimp/accuracy/expletive_it_object_raising": 0.776, "blimp/accuracy/left_branch_island_simple_question": 0.352, "blimp/accuracy/wh_questions_subject_gap": 0.918, "blimp/accuracy/existential_there_quantifiers_2": 0.332, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.949, "blimp/accuracy/sentential_negation_npi_scope": 0.665, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.784, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.843, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.902, "blimp/accuracy/principle_A_case_2": 0.963, "blimp/accuracy/distractor_agreement_relational_noun": 0.831, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.943, "blimp/accuracy/superlative_quantifiers_1": 0.751, "blimp/accuracy/wh_island": 0.81, "blimp/accuracy/principle_A_domain_1": 0.984, "blimp/accuracy/complex_NP_island": 0.573, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.963, "blimp/accuracy/irregular_past_participle_verbs": 0.842, "blimp/accuracy/drop_argument": 0.778, "blimp/accuracy/wh_questions_object_gap": 0.802, "blimp/accuracy/animate_subject_passive": 0.789, "blimp/accuracy/existential_there_quantifiers_1": 0.962, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.876, "blimp/accuracy/npi_present_2": 0.589, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.947, "blimp/accuracy/anaphor_number_agreement": 0.988, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.965, "blimp/accuracy/existential_there_object_raising": 0.814, "blimp/accuracy/matrix_question_npi_licensor_present": 0.197, "blimp/accuracy/npi_present_1": 0.518, "blimp/accuracy/wh_vs_that_no_gap": 0.97, "blimp/accuracy/left_branch_island_echo_question": 0.408, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.973, "blimp/accuracy/causative": 0.695, "blimp/accuracy/group_average": 0.7681492537313434, "blimp/accuracy/seq_average": 0.7681492537313432, "cbt/accuracy/NE": 0.7544070512820513, "cbt/accuracy/V": 0.9092, "cbt/accuracy/CN": 0.818, "cbt/accuracy/P": 0.8836, "cbt/accuracy/group_average": 0.8413017628205128, "cbt/accuracy/seq_average": 0.8413365346138455, "hellaswag/accuracy/val": 0.29376618203545113, "hellaswag/accuracy/group_average": 0.29376618203545113, "hellaswag/accuracy/seq_average": 0.29376618203545113, "piqa/accuracy/val": 0.5750816104461371, "piqa/accuracy/group_average": 0.5750816104461371, "piqa/accuracy/seq_average": 0.5750816104461371, "ai2arc/accuracy/ARC-Easy": 0.33192389006342493, "ai2arc/accuracy/ARC-Challenge": 0.21630901287553647, "ai2arc/accuracy/group_average": 0.2741164514694807, "ai2arc/accuracy/seq_average": 0.29376770538243624, "race/accuracy/test/high": 0.2701543739279588, "race/accuracy/test/middle": 0.334958217270195, "race/accuracy/group_average": 0.30255629559907693, "race/accuracy/seq_average": 0.28901499797324687, "siqa/accuracy/dev": 0.35670419651995905, "siqa/accuracy/group_average": 0.35670419651995905, "siqa/accuracy/seq_average": 0.35670419651995905, "commonsenseqa/accuracy/dev_rand_split": 0.25634725634725636, "commonsenseqa/accuracy/group_average": 0.25634725634725636, "commonsenseqa/accuracy/seq_average": 0.25634725634725636}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-20000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.8867860824342757, "val/accuracy": 0.4413025871155754, "val/perplexity": 17.935573417122725, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5412658312305902, "lambada/accuracy/total": 0.19041149068322982, "lambada/accuracy/openai_last_token": 0.7303959627329193, "lambada/perplexity": 19.38358796205587, "lambada/lm_loss": 3.412540700062019, "lambada/lm_perplexity": 30.342236933596364, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.31585703889940264, "mean_loss": 2.714025956832433, "blimp/accuracy/passive_2": 0.873, "blimp/accuracy/determiner_noun_agreement_2": 0.97, "blimp/accuracy/ellipsis_n_bar_1": 0.709, "blimp/accuracy/tough_vs_raising_2": 0.825, "blimp/accuracy/tough_vs_raising_1": 0.62, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.874, "blimp/accuracy/principle_A_reconstruction": 0.389, "blimp/accuracy/wh_vs_that_with_gap": 0.455, "blimp/accuracy/principle_A_domain_2": 0.785, "blimp/accuracy/determiner_noun_agreement_1": 0.978, "blimp/accuracy/ellipsis_n_bar_2": 0.896, "blimp/accuracy/principle_A_domain_3": 0.534, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.904, "blimp/accuracy/animate_subject_trans": 0.881, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.808, "blimp/accuracy/distractor_agreement_relative_clause": 0.506, "blimp/accuracy/transitive": 0.857, "blimp/accuracy/sentential_subject_island": 0.425, "blimp/accuracy/adjunct_island": 0.657, "blimp/accuracy/intransitive": 0.691, "blimp/accuracy/existential_there_subject_raising": 0.861, "blimp/accuracy/irregular_past_participle_adjectives": 0.932, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.207, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.145, "blimp/accuracy/only_npi_scope": 0.765, "blimp/accuracy/superlative_quantifiers_2": 0.783, "blimp/accuracy/passive_1": 0.889, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.874, "blimp/accuracy/inchoative": 0.54, "blimp/accuracy/anaphor_gender_agreement": 0.846, "blimp/accuracy/principle_A_c_command": 0.584, "blimp/accuracy/only_npi_licensor_present": 0.721, "blimp/accuracy/expletive_it_object_raising": 0.755, "blimp/accuracy/left_branch_island_simple_question": 0.232, "blimp/accuracy/wh_questions_subject_gap": 0.902, "blimp/accuracy/existential_there_quantifiers_2": 0.225, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.918, "blimp/accuracy/sentential_negation_npi_scope": 0.484, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.751, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.907, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.888, "blimp/accuracy/principle_A_case_2": 0.94, "blimp/accuracy/distractor_agreement_relational_noun": 0.766, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.969, "blimp/accuracy/superlative_quantifiers_1": 0.714, "blimp/accuracy/wh_island": 0.842, "blimp/accuracy/principle_A_domain_1": 0.988, "blimp/accuracy/complex_NP_island": 0.591, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.928, "blimp/accuracy/irregular_past_participle_verbs": 0.819, "blimp/accuracy/drop_argument": 0.73, "blimp/accuracy/wh_questions_object_gap": 0.765, "blimp/accuracy/animate_subject_passive": 0.761, "blimp/accuracy/existential_there_quantifiers_1": 0.96, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.826, "blimp/accuracy/npi_present_2": 0.592, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.887, "blimp/accuracy/anaphor_number_agreement": 0.972, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.939, "blimp/accuracy/existential_there_object_raising": 0.718, "blimp/accuracy/matrix_question_npi_licensor_present": 0.149, "blimp/accuracy/npi_present_1": 0.53, "blimp/accuracy/wh_vs_that_no_gap": 0.973, "blimp/accuracy/left_branch_island_echo_question": 0.362, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.983, "blimp/accuracy/causative": 0.689, "blimp/accuracy/group_average": 0.7349104477611942, "blimp/accuracy/seq_average": 0.7349104477611941, "cbt/accuracy/NE": 0.7043269230769231, "cbt/accuracy/V": 0.8816, "cbt/accuracy/CN": 0.7668, "cbt/accuracy/P": 0.8548, "cbt/accuracy/group_average": 0.8018817307692307, "cbt/accuracy/seq_average": 0.801920768307323, "hellaswag/accuracy/val": 0.277733519219279, "hellaswag/accuracy/group_average": 0.277733519219279, "hellaswag/accuracy/seq_average": 0.277733519219279, "piqa/accuracy/val": 0.5701849836779108, "piqa/accuracy/group_average": 0.5701849836779108, "piqa/accuracy/seq_average": 0.5701849836779108, "ai2arc/accuracy/ARC-Easy": 0.3145877378435518, "ai2arc/accuracy/ARC-Challenge": 0.2017167381974249, "ai2arc/accuracy/group_average": 0.25815223802048837, "ai2arc/accuracy/seq_average": 0.2773371104815864, "race/accuracy/test/high": 0.2518582046883934, "race/accuracy/test/middle": 0.3370473537604457, "race/accuracy/group_average": 0.29445277922441954, "race/accuracy/seq_average": 0.2766518038102959, "siqa/accuracy/dev": 0.3561924257932446, "siqa/accuracy/group_average": 0.3561924257932446, "siqa/accuracy/seq_average": 0.3561924257932446, "commonsenseqa/accuracy/dev_rand_split": 0.23587223587223588, "commonsenseqa/accuracy/group_average": 0.23587223587223588, "commonsenseqa/accuracy/seq_average": 0.23587223587223588}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-30000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.811556861514137, "val/accuracy": 0.45179966517857145, "val/perplexity": 16.63579770137245, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.621062663771351, "lambada/accuracy/total": 0.20846273291925466, "lambada/accuracy/openai_last_token": 0.7366071428571429, "lambada/perplexity": 17.729537918879853, "lambada/lm_loss": 3.378450076685831, "lambada/lm_perplexity": 29.325283930314534, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3301311990489131, "mean_loss": 2.716309762642744, "blimp/accuracy/passive_2": 0.889, "blimp/accuracy/determiner_noun_agreement_2": 0.989, "blimp/accuracy/ellipsis_n_bar_1": 0.74, "blimp/accuracy/tough_vs_raising_2": 0.856, "blimp/accuracy/tough_vs_raising_1": 0.615, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.872, "blimp/accuracy/principle_A_reconstruction": 0.406, "blimp/accuracy/wh_vs_that_with_gap": 0.597, "blimp/accuracy/principle_A_domain_2": 0.805, "blimp/accuracy/determiner_noun_agreement_1": 0.98, "blimp/accuracy/ellipsis_n_bar_2": 0.866, "blimp/accuracy/principle_A_domain_3": 0.529, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.913, "blimp/accuracy/animate_subject_trans": 0.889, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.868, "blimp/accuracy/distractor_agreement_relative_clause": 0.541, "blimp/accuracy/transitive": 0.862, "blimp/accuracy/sentential_subject_island": 0.339, "blimp/accuracy/adjunct_island": 0.752, "blimp/accuracy/intransitive": 0.775, "blimp/accuracy/existential_there_subject_raising": 0.865, "blimp/accuracy/irregular_past_participle_adjectives": 0.985, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.249, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.203, "blimp/accuracy/only_npi_scope": 0.736, "blimp/accuracy/superlative_quantifiers_2": 0.733, "blimp/accuracy/passive_1": 0.863, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.867, "blimp/accuracy/inchoative": 0.591, "blimp/accuracy/anaphor_gender_agreement": 0.926, "blimp/accuracy/principle_A_c_command": 0.582, "blimp/accuracy/only_npi_licensor_present": 0.352, "blimp/accuracy/expletive_it_object_raising": 0.763, "blimp/accuracy/left_branch_island_simple_question": 0.264, "blimp/accuracy/wh_questions_subject_gap": 0.851, "blimp/accuracy/existential_there_quantifiers_2": 0.254, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.928, "blimp/accuracy/sentential_negation_npi_scope": 0.672, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.755, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.832, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.907, "blimp/accuracy/principle_A_case_2": 0.939, "blimp/accuracy/distractor_agreement_relational_noun": 0.798, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.97, "blimp/accuracy/superlative_quantifiers_1": 0.782, "blimp/accuracy/wh_island": 0.751, "blimp/accuracy/principle_A_domain_1": 0.956, "blimp/accuracy/complex_NP_island": 0.549, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.957, "blimp/accuracy/irregular_past_participle_verbs": 0.829, "blimp/accuracy/drop_argument": 0.77, "blimp/accuracy/wh_questions_object_gap": 0.659, "blimp/accuracy/animate_subject_passive": 0.822, "blimp/accuracy/existential_there_quantifiers_1": 0.959, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.819, "blimp/accuracy/npi_present_2": 0.54, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.913, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.935, "blimp/accuracy/existential_there_object_raising": 0.791, "blimp/accuracy/matrix_question_npi_licensor_present": 0.137, "blimp/accuracy/npi_present_1": 0.462, "blimp/accuracy/wh_vs_that_no_gap": 0.949, "blimp/accuracy/left_branch_island_echo_question": 0.385, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.968, "blimp/accuracy/causative": 0.708, "blimp/accuracy/group_average": 0.740268656716418, "blimp/accuracy/seq_average": 0.740268656716418, "cbt/accuracy/NE": 0.7295673076923077, "cbt/accuracy/V": 0.8964, "cbt/accuracy/CN": 0.7908, "cbt/accuracy/P": 0.8648, "cbt/accuracy/group_average": 0.820391826923077, "cbt/accuracy/seq_average": 0.8204281712685074, "hellaswag/accuracy/val": 0.27763393746265685, "hellaswag/accuracy/group_average": 0.27763393746265685, "hellaswag/accuracy/seq_average": 0.27763393746265685, "piqa/accuracy/val": 0.5761697497279652, "piqa/accuracy/group_average": 0.5761697497279652, "piqa/accuracy/seq_average": 0.5761697497279652, "ai2arc/accuracy/ARC-Easy": 0.31797040169133195, "ai2arc/accuracy/ARC-Challenge": 0.2034334763948498, "ai2arc/accuracy/group_average": 0.26070193904309086, "ai2arc/accuracy/seq_average": 0.28016997167138813, "race/accuracy/test/high": 0.2627215551743854, "race/accuracy/test/middle": 0.3412256267409471, "race/accuracy/group_average": 0.30197359095766624, "race/accuracy/seq_average": 0.28556951763275235, "siqa/accuracy/dev": 0.3602865916069601, "siqa/accuracy/group_average": 0.3602865916069601, "siqa/accuracy/seq_average": 0.3602865916069601, "commonsenseqa/accuracy/dev_rand_split": 0.23587223587223588, "commonsenseqa/accuracy/group_average": 0.23587223587223588, "commonsenseqa/accuracy/seq_average": 0.23587223587223588}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-40000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.76138426765563, "val/accuracy": 0.4587489536830357, "val/perplexity": 15.821729304610429, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5492318668720886, "lambada/accuracy/total": 0.20923913043478262, "lambada/accuracy/openai_last_token": 0.7404891304347826, "lambada/perplexity": 16.455038133226786, "lambada/lm_loss": 3.3045572021824032, "lambada/lm_perplexity": 27.23647866496815, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.33399404205890915, "mean_loss": 2.655308067263859, "blimp/accuracy/passive_2": 0.876, "blimp/accuracy/determiner_noun_agreement_2": 0.983, "blimp/accuracy/ellipsis_n_bar_1": 0.772, "blimp/accuracy/tough_vs_raising_2": 0.878, "blimp/accuracy/tough_vs_raising_1": 0.61, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.844, "blimp/accuracy/principle_A_reconstruction": 0.357, "blimp/accuracy/wh_vs_that_with_gap": 0.558, "blimp/accuracy/principle_A_domain_2": 0.778, "blimp/accuracy/determiner_noun_agreement_1": 0.982, "blimp/accuracy/ellipsis_n_bar_2": 0.884, "blimp/accuracy/principle_A_domain_3": 0.566, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.909, "blimp/accuracy/animate_subject_trans": 0.907, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.878, "blimp/accuracy/distractor_agreement_relative_clause": 0.63, "blimp/accuracy/transitive": 0.858, "blimp/accuracy/sentential_subject_island": 0.347, "blimp/accuracy/adjunct_island": 0.772, "blimp/accuracy/intransitive": 0.746, "blimp/accuracy/existential_there_subject_raising": 0.88, "blimp/accuracy/irregular_past_participle_adjectives": 0.944, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.325, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.189, "blimp/accuracy/only_npi_scope": 0.743, "blimp/accuracy/superlative_quantifiers_2": 0.806, "blimp/accuracy/passive_1": 0.884, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.884, "blimp/accuracy/inchoative": 0.593, "blimp/accuracy/anaphor_gender_agreement": 0.88, "blimp/accuracy/principle_A_c_command": 0.61, "blimp/accuracy/only_npi_licensor_present": 0.777, "blimp/accuracy/expletive_it_object_raising": 0.761, "blimp/accuracy/left_branch_island_simple_question": 0.344, "blimp/accuracy/wh_questions_subject_gap": 0.913, "blimp/accuracy/existential_there_quantifiers_2": 0.269, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.928, "blimp/accuracy/sentential_negation_npi_scope": 0.616, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.782, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.909, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.899, "blimp/accuracy/principle_A_case_2": 0.964, "blimp/accuracy/distractor_agreement_relational_noun": 0.813, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.963, "blimp/accuracy/superlative_quantifiers_1": 0.828, "blimp/accuracy/wh_island": 0.798, "blimp/accuracy/principle_A_domain_1": 0.98, "blimp/accuracy/complex_NP_island": 0.577, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.953, "blimp/accuracy/irregular_past_participle_verbs": 0.878, "blimp/accuracy/drop_argument": 0.78, "blimp/accuracy/wh_questions_object_gap": 0.764, "blimp/accuracy/animate_subject_passive": 0.779, "blimp/accuracy/existential_there_quantifiers_1": 0.968, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.859, "blimp/accuracy/npi_present_2": 0.528, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.903, "blimp/accuracy/anaphor_number_agreement": 0.985, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.939, "blimp/accuracy/existential_there_object_raising": 0.79, "blimp/accuracy/matrix_question_npi_licensor_present": 0.147, "blimp/accuracy/npi_present_1": 0.484, "blimp/accuracy/wh_vs_that_no_gap": 0.971, "blimp/accuracy/left_branch_island_echo_question": 0.415, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.976, "blimp/accuracy/causative": 0.696, "blimp/accuracy/group_average": 0.7583432835820896, "blimp/accuracy/seq_average": 0.7583432835820896, "cbt/accuracy/NE": 0.734375, "cbt/accuracy/V": 0.8948, "cbt/accuracy/CN": 0.7832, "cbt/accuracy/P": 0.868, "cbt/accuracy/group_average": 0.82009375, "cbt/accuracy/seq_average": 0.8201280512204882, "hellaswag/accuracy/val": 0.2842063333997212, "hellaswag/accuracy/group_average": 0.2842063333997212, "hellaswag/accuracy/seq_average": 0.2842063333997212, "piqa/accuracy/val": 0.5723612622415669, "piqa/accuracy/group_average": 0.5723612622415669, "piqa/accuracy/seq_average": 0.5723612622415669, "ai2arc/accuracy/ARC-Easy": 0.3217758985200846, "ai2arc/accuracy/ARC-Challenge": 0.2, "ai2arc/accuracy/group_average": 0.26088794926004233, "ai2arc/accuracy/seq_average": 0.28158640226628895, "race/accuracy/test/high": 0.25986277873070324, "race/accuracy/test/middle": 0.346100278551532, "race/accuracy/group_average": 0.3029815286411176, "race/accuracy/seq_average": 0.2849614916903121, "siqa/accuracy/dev": 0.3618219037871034, "siqa/accuracy/group_average": 0.3618219037871034, "siqa/accuracy/seq_average": 0.3618219037871034, "commonsenseqa/accuracy/dev_rand_split": 0.2538902538902539, "commonsenseqa/accuracy/group_average": 0.2538902538902539, "commonsenseqa/accuracy/seq_average": 0.2538902538902539}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-50000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.7177325051928323, "val/accuracy": 0.46529909164186506, "val/perplexity": 15.145939938674282, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6206509607919255, "lambada/accuracy/total": 0.22826086956521738, "lambada/accuracy/openai_last_token": 0.7538819875776398, "lambada/perplexity": 14.84237093041477, "lambada/lm_loss": 3.2910282291858732, "lambada/lm_perplexity": 26.870478466486087, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.3467799806035412, "mean_loss": 2.669191732992379, "blimp/accuracy/passive_2": 0.904, "blimp/accuracy/determiner_noun_agreement_2": 0.986, "blimp/accuracy/ellipsis_n_bar_1": 0.776, "blimp/accuracy/tough_vs_raising_2": 0.902, "blimp/accuracy/tough_vs_raising_1": 0.569, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.864, "blimp/accuracy/principle_A_reconstruction": 0.295, "blimp/accuracy/wh_vs_that_with_gap": 0.546, "blimp/accuracy/principle_A_domain_2": 0.841, "blimp/accuracy/determiner_noun_agreement_1": 0.984, "blimp/accuracy/ellipsis_n_bar_2": 0.899, "blimp/accuracy/principle_A_domain_3": 0.546, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.904, "blimp/accuracy/animate_subject_trans": 0.895, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.875, "blimp/accuracy/distractor_agreement_relative_clause": 0.596, "blimp/accuracy/transitive": 0.863, "blimp/accuracy/sentential_subject_island": 0.412, "blimp/accuracy/adjunct_island": 0.782, "blimp/accuracy/intransitive": 0.757, "blimp/accuracy/existential_there_subject_raising": 0.843, "blimp/accuracy/irregular_past_participle_adjectives": 0.906, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.27, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.203, "blimp/accuracy/only_npi_scope": 0.749, "blimp/accuracy/superlative_quantifiers_2": 0.829, "blimp/accuracy/passive_1": 0.881, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.878, "blimp/accuracy/inchoative": 0.598, "blimp/accuracy/anaphor_gender_agreement": 0.89, "blimp/accuracy/principle_A_c_command": 0.637, "blimp/accuracy/only_npi_licensor_present": 0.673, "blimp/accuracy/expletive_it_object_raising": 0.784, "blimp/accuracy/left_branch_island_simple_question": 0.316, "blimp/accuracy/wh_questions_subject_gap": 0.917, "blimp/accuracy/existential_there_quantifiers_2": 0.327, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.94, "blimp/accuracy/sentential_negation_npi_scope": 0.636, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.801, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.868, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.885, "blimp/accuracy/principle_A_case_2": 0.968, "blimp/accuracy/distractor_agreement_relational_noun": 0.842, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.966, "blimp/accuracy/superlative_quantifiers_1": 0.772, "blimp/accuracy/wh_island": 0.827, "blimp/accuracy/principle_A_domain_1": 0.977, "blimp/accuracy/complex_NP_island": 0.606, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.963, "blimp/accuracy/irregular_past_participle_verbs": 0.859, "blimp/accuracy/drop_argument": 0.766, "blimp/accuracy/wh_questions_object_gap": 0.775, "blimp/accuracy/animate_subject_passive": 0.789, "blimp/accuracy/existential_there_quantifiers_1": 0.967, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.879, "blimp/accuracy/npi_present_2": 0.589, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.916, "blimp/accuracy/anaphor_number_agreement": 0.982, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.948, "blimp/accuracy/existential_there_object_raising": 0.798, "blimp/accuracy/matrix_question_npi_licensor_present": 0.179, "blimp/accuracy/npi_present_1": 0.504, "blimp/accuracy/wh_vs_that_no_gap": 0.973, "blimp/accuracy/left_branch_island_echo_question": 0.428, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.975, "blimp/accuracy/causative": 0.694, "blimp/accuracy/group_average": 0.7607313432835819, "blimp/accuracy/seq_average": 0.760731343283582, "cbt/accuracy/NE": 0.7295673076923077, "cbt/accuracy/V": 0.8972, "cbt/accuracy/CN": 0.796, "cbt/accuracy/P": 0.8768, "cbt/accuracy/group_average": 0.8248918269230769, "cbt/accuracy/seq_average": 0.8249299719887955, "hellaswag/accuracy/val": 0.2846046604262099, "hellaswag/accuracy/group_average": 0.2846046604262099, "hellaswag/accuracy/seq_average": 0.2846046604262099, "piqa/accuracy/val": 0.5669205658324266, "piqa/accuracy/group_average": 0.5669205658324266, "piqa/accuracy/seq_average": 0.5669205658324266, "ai2arc/accuracy/ARC-Easy": 0.321353065539112, "ai2arc/accuracy/ARC-Challenge": 0.2145922746781116, "ai2arc/accuracy/group_average": 0.2679726701086118, "ai2arc/accuracy/seq_average": 0.28611898016997167, "race/accuracy/test/high": 0.2724413950829045, "race/accuracy/test/middle": 0.34331476323119775, "race/accuracy/group_average": 0.3078780791570511, "race/accuracy/seq_average": 0.2930685042561816, "siqa/accuracy/dev": 0.3572159672466735, "siqa/accuracy/group_average": 0.3572159672466735, "siqa/accuracy/seq_average": 0.3572159672466735, "commonsenseqa/accuracy/dev_rand_split": 0.2547092547092547, "commonsenseqa/accuracy/group_average": 0.2547092547092547, "commonsenseqa/accuracy/seq_average": 0.2547092547092547}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-60000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.68435547843812, "val/accuracy": 0.4695211743551587, "val/perplexity": 14.64875689732813, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.5438329092464094, "lambada/accuracy/total": 0.21913819875776397, "lambada/accuracy/openai_last_token": 0.7521350931677019, "lambada/perplexity": 14.630841759958825, "lambada/lm_loss": 3.241629249149332, "lambada/lm_perplexity": 25.575356449229204, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.34432968655646135, "mean_loss": 2.6140941938422646, "blimp/accuracy/passive_2": 0.908, "blimp/accuracy/determiner_noun_agreement_2": 0.99, "blimp/accuracy/ellipsis_n_bar_1": 0.808, "blimp/accuracy/tough_vs_raising_2": 0.903, "blimp/accuracy/tough_vs_raising_1": 0.559, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.872, "blimp/accuracy/principle_A_reconstruction": 0.264, "blimp/accuracy/wh_vs_that_with_gap": 0.532, "blimp/accuracy/principle_A_domain_2": 0.81, "blimp/accuracy/determiner_noun_agreement_1": 0.989, "blimp/accuracy/ellipsis_n_bar_2": 0.868, "blimp/accuracy/principle_A_domain_3": 0.538, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.924, "blimp/accuracy/animate_subject_trans": 0.9, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.887, "blimp/accuracy/distractor_agreement_relative_clause": 0.642, "blimp/accuracy/transitive": 0.867, "blimp/accuracy/sentential_subject_island": 0.404, "blimp/accuracy/adjunct_island": 0.772, "blimp/accuracy/intransitive": 0.758, "blimp/accuracy/existential_there_subject_raising": 0.859, "blimp/accuracy/irregular_past_participle_adjectives": 0.963, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.212, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.234, "blimp/accuracy/only_npi_scope": 0.768, "blimp/accuracy/superlative_quantifiers_2": 0.809, "blimp/accuracy/passive_1": 0.889, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.898, "blimp/accuracy/inchoative": 0.624, "blimp/accuracy/anaphor_gender_agreement": 0.912, "blimp/accuracy/principle_A_c_command": 0.632, "blimp/accuracy/only_npi_licensor_present": 0.557, "blimp/accuracy/expletive_it_object_raising": 0.757, "blimp/accuracy/left_branch_island_simple_question": 0.273, "blimp/accuracy/wh_questions_subject_gap": 0.923, "blimp/accuracy/existential_there_quantifiers_2": 0.288, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.958, "blimp/accuracy/sentential_negation_npi_scope": 0.645, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.775, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.87, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.895, "blimp/accuracy/principle_A_case_2": 0.947, "blimp/accuracy/distractor_agreement_relational_noun": 0.821, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.973, "blimp/accuracy/superlative_quantifiers_1": 0.757, "blimp/accuracy/wh_island": 0.822, "blimp/accuracy/principle_A_domain_1": 0.991, "blimp/accuracy/complex_NP_island": 0.551, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.952, "blimp/accuracy/irregular_past_participle_verbs": 0.882, "blimp/accuracy/drop_argument": 0.771, "blimp/accuracy/wh_questions_object_gap": 0.783, "blimp/accuracy/animate_subject_passive": 0.806, "blimp/accuracy/existential_there_quantifiers_1": 0.963, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.859, "blimp/accuracy/npi_present_2": 0.595, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.961, "blimp/accuracy/anaphor_number_agreement": 0.984, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.952, "blimp/accuracy/existential_there_object_raising": 0.793, "blimp/accuracy/matrix_question_npi_licensor_present": 0.143, "blimp/accuracy/npi_present_1": 0.497, "blimp/accuracy/wh_vs_that_no_gap": 0.974, "blimp/accuracy/left_branch_island_echo_question": 0.393, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.979, "blimp/accuracy/causative": 0.695, "blimp/accuracy/group_average": 0.757910447761194, "blimp/accuracy/seq_average": 0.757910447761194, "cbt/accuracy/NE": 0.7391826923076923, "cbt/accuracy/V": 0.8964, "cbt/accuracy/CN": 0.8064, "cbt/accuracy/P": 0.8828, "cbt/accuracy/group_average": 0.8311956730769231, "cbt/accuracy/seq_average": 0.8312324929971989, "hellaswag/accuracy/val": 0.2835092611033659, "hellaswag/accuracy/group_average": 0.2835092611033659, "hellaswag/accuracy/seq_average": 0.2835092611033659, "piqa/accuracy/val": 0.5805223068552775, "piqa/accuracy/group_average": 0.5805223068552775, "piqa/accuracy/seq_average": 0.5805223068552775, "ai2arc/accuracy/ARC-Easy": 0.33234672304439744, "ai2arc/accuracy/ARC-Challenge": 0.20515021459227467, "ai2arc/accuracy/group_average": 0.26874846881833603, "ai2arc/accuracy/seq_average": 0.29036827195467424, "race/accuracy/test/high": 0.27044025157232704, "race/accuracy/test/middle": 0.334958217270195, "race/accuracy/group_average": 0.302699234421261, "race/accuracy/seq_average": 0.2892176732873936, "siqa/accuracy/dev": 0.3510747185261003, "siqa/accuracy/group_average": 0.3510747185261003, "siqa/accuracy/seq_average": 0.3510747185261003, "commonsenseqa/accuracy/dev_rand_split": 0.26126126126126126, "commonsenseqa/accuracy/group_average": 0.26126126126126126, "commonsenseqa/accuracy/seq_average": 0.26126126126126126}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-70000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6574891105530756, "val/accuracy": 0.47317262679811506, "val/perplexity": 14.260437725502442, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6477554984714673, "lambada/accuracy/total": 0.2267080745341615, "lambada/accuracy/openai_last_token": 0.7542701863354038, "lambada/perplexity": 13.777609637197733, "lambada/lm_loss": 3.2235758644965555, "lambada/lm_perplexity": 25.117777553480792, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.34994035066613827, "mean_loss": 2.6526223045122714, "blimp/accuracy/passive_2": 0.906, "blimp/accuracy/determiner_noun_agreement_2": 0.989, "blimp/accuracy/ellipsis_n_bar_1": 0.798, "blimp/accuracy/tough_vs_raising_2": 0.851, "blimp/accuracy/tough_vs_raising_1": 0.612, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.865, "blimp/accuracy/principle_A_reconstruction": 0.347, "blimp/accuracy/wh_vs_that_with_gap": 0.571, "blimp/accuracy/principle_A_domain_2": 0.809, "blimp/accuracy/determiner_noun_agreement_1": 0.984, "blimp/accuracy/ellipsis_n_bar_2": 0.886, "blimp/accuracy/principle_A_domain_3": 0.549, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.912, "blimp/accuracy/animate_subject_trans": 0.901, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.888, "blimp/accuracy/distractor_agreement_relative_clause": 0.631, "blimp/accuracy/transitive": 0.868, "blimp/accuracy/sentential_subject_island": 0.429, "blimp/accuracy/adjunct_island": 0.787, "blimp/accuracy/intransitive": 0.765, "blimp/accuracy/existential_there_subject_raising": 0.857, "blimp/accuracy/irregular_past_participle_adjectives": 0.982, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.289, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.242, "blimp/accuracy/only_npi_scope": 0.784, "blimp/accuracy/superlative_quantifiers_2": 0.794, "blimp/accuracy/passive_1": 0.886, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.875, "blimp/accuracy/inchoative": 0.626, "blimp/accuracy/anaphor_gender_agreement": 0.943, "blimp/accuracy/principle_A_c_command": 0.608, "blimp/accuracy/only_npi_licensor_present": 0.574, "blimp/accuracy/expletive_it_object_raising": 0.767, "blimp/accuracy/left_branch_island_simple_question": 0.343, "blimp/accuracy/wh_questions_subject_gap": 0.925, "blimp/accuracy/existential_there_quantifiers_2": 0.351, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.936, "blimp/accuracy/sentential_negation_npi_scope": 0.674, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.819, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.891, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.909, "blimp/accuracy/principle_A_case_2": 0.952, "blimp/accuracy/distractor_agreement_relational_noun": 0.847, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.973, "blimp/accuracy/superlative_quantifiers_1": 0.826, "blimp/accuracy/wh_island": 0.812, "blimp/accuracy/principle_A_domain_1": 0.99, "blimp/accuracy/complex_NP_island": 0.58, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.965, "blimp/accuracy/irregular_past_participle_verbs": 0.867, "blimp/accuracy/drop_argument": 0.765, "blimp/accuracy/wh_questions_object_gap": 0.771, "blimp/accuracy/animate_subject_passive": 0.798, "blimp/accuracy/existential_there_quantifiers_1": 0.962, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.817, "blimp/accuracy/npi_present_2": 0.571, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.93, "blimp/accuracy/anaphor_number_agreement": 0.989, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.955, "blimp/accuracy/existential_there_object_raising": 0.819, "blimp/accuracy/matrix_question_npi_licensor_present": 0.195, "blimp/accuracy/npi_present_1": 0.443, "blimp/accuracy/wh_vs_that_no_gap": 0.967, "blimp/accuracy/left_branch_island_echo_question": 0.462, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.98, "blimp/accuracy/causative": 0.702, "blimp/accuracy/group_average": 0.7665820895522387, "blimp/accuracy/seq_average": 0.7665820895522388, "cbt/accuracy/NE": 0.7435897435897436, "cbt/accuracy/V": 0.9036, "cbt/accuracy/CN": 0.8128, "cbt/accuracy/P": 0.8752, "cbt/accuracy/group_average": 0.8337974358974359, "cbt/accuracy/seq_average": 0.8338335334133653, "hellaswag/accuracy/val": 0.28818960366460866, "hellaswag/accuracy/group_average": 0.28818960366460866, "hellaswag/accuracy/seq_average": 0.28818960366460866, "piqa/accuracy/val": 0.573993471164309, "piqa/accuracy/group_average": 0.573993471164309, "piqa/accuracy/seq_average": 0.573993471164309, "ai2arc/accuracy/ARC-Easy": 0.33530655391120506, "ai2arc/accuracy/ARC-Challenge": 0.20686695278969958, "ai2arc/accuracy/group_average": 0.2710867533504523, "ai2arc/accuracy/seq_average": 0.2929178470254957, "race/accuracy/test/high": 0.2672955974842767, "race/accuracy/test/middle": 0.3426183844011142, "race/accuracy/group_average": 0.30495699094269546, "race/accuracy/seq_average": 0.2892176732873936, "siqa/accuracy/dev": 0.35209825997952915, "siqa/accuracy/group_average": 0.35209825997952915, "siqa/accuracy/seq_average": 0.35209825997952915, "commonsenseqa/accuracy/dev_rand_split": 0.26453726453726456, "commonsenseqa/accuracy/group_average": 0.26453726453726456, "commonsenseqa/accuracy/seq_average": 0.26453726453726456}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-80000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6368185376364086, "val/accuracy": 0.4753146701388889, "val/perplexity": 13.968691971454504, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.523188051970109, "lambada/accuracy/total": 0.2393245341614907, "lambada/accuracy/openai_last_token": 0.7556288819875776, "lambada/perplexity": 13.158010668283234, "lambada/lm_loss": 3.2035306910943864, "lambada/lm_perplexity": 24.619300071672914, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.35731960215018976, "mean_loss": 2.5800032948032587, "blimp/accuracy/passive_2": 0.92, "blimp/accuracy/determiner_noun_agreement_2": 0.989, "blimp/accuracy/ellipsis_n_bar_1": 0.789, "blimp/accuracy/tough_vs_raising_2": 0.88, "blimp/accuracy/tough_vs_raising_1": 0.61, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.881, "blimp/accuracy/principle_A_reconstruction": 0.404, "blimp/accuracy/wh_vs_that_with_gap": 0.523, "blimp/accuracy/principle_A_domain_2": 0.824, "blimp/accuracy/determiner_noun_agreement_1": 0.987, "blimp/accuracy/ellipsis_n_bar_2": 0.889, "blimp/accuracy/principle_A_domain_3": 0.547, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.909, "blimp/accuracy/animate_subject_trans": 0.913, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.898, "blimp/accuracy/distractor_agreement_relative_clause": 0.65, "blimp/accuracy/transitive": 0.871, "blimp/accuracy/sentential_subject_island": 0.418, "blimp/accuracy/adjunct_island": 0.809, "blimp/accuracy/intransitive": 0.789, "blimp/accuracy/existential_there_subject_raising": 0.876, "blimp/accuracy/irregular_past_participle_adjectives": 0.935, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.3, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.242, "blimp/accuracy/only_npi_scope": 0.758, "blimp/accuracy/superlative_quantifiers_2": 0.849, "blimp/accuracy/passive_1": 0.899, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.883, "blimp/accuracy/inchoative": 0.623, "blimp/accuracy/anaphor_gender_agreement": 0.902, "blimp/accuracy/principle_A_c_command": 0.613, "blimp/accuracy/only_npi_licensor_present": 0.605, "blimp/accuracy/expletive_it_object_raising": 0.755, "blimp/accuracy/left_branch_island_simple_question": 0.335, "blimp/accuracy/wh_questions_subject_gap": 0.915, "blimp/accuracy/existential_there_quantifiers_2": 0.275, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.941, "blimp/accuracy/sentential_negation_npi_scope": 0.637, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.764, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.854, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.914, "blimp/accuracy/principle_A_case_2": 0.959, "blimp/accuracy/distractor_agreement_relational_noun": 0.845, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.956, "blimp/accuracy/superlative_quantifiers_1": 0.757, "blimp/accuracy/wh_island": 0.82, "blimp/accuracy/principle_A_domain_1": 0.977, "blimp/accuracy/complex_NP_island": 0.544, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.965, "blimp/accuracy/irregular_past_participle_verbs": 0.848, "blimp/accuracy/drop_argument": 0.778, "blimp/accuracy/wh_questions_object_gap": 0.8, "blimp/accuracy/animate_subject_passive": 0.803, "blimp/accuracy/existential_there_quantifiers_1": 0.959, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.873, "blimp/accuracy/npi_present_2": 0.601, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.946, "blimp/accuracy/anaphor_number_agreement": 0.986, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.959, "blimp/accuracy/existential_there_object_raising": 0.801, "blimp/accuracy/matrix_question_npi_licensor_present": 0.239, "blimp/accuracy/npi_present_1": 0.497, "blimp/accuracy/wh_vs_that_no_gap": 0.974, "blimp/accuracy/left_branch_island_echo_question": 0.358, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.982, "blimp/accuracy/causative": 0.7, "blimp/accuracy/group_average": 0.7657014925373132, "blimp/accuracy/seq_average": 0.7657014925373135, "cbt/accuracy/NE": 0.7560096153846154, "cbt/accuracy/V": 0.9056, "cbt/accuracy/CN": 0.8128, "cbt/accuracy/P": 0.888, "cbt/accuracy/group_average": 0.8406024038461538, "cbt/accuracy/seq_average": 0.8406362545018007, "hellaswag/accuracy/val": 0.28958374825731925, "hellaswag/accuracy/group_average": 0.28958374825731925, "hellaswag/accuracy/seq_average": 0.28958374825731925, "piqa/accuracy/val": 0.5761697497279652, "piqa/accuracy/group_average": 0.5761697497279652, "piqa/accuracy/seq_average": 0.5761697497279652, "ai2arc/accuracy/ARC-Easy": 0.33446088794926004, "ai2arc/accuracy/ARC-Challenge": 0.21545064377682405, "ai2arc/accuracy/group_average": 0.27495576586304205, "ai2arc/accuracy/seq_average": 0.2951841359773371, "race/accuracy/test/high": 0.2755860491709548, "race/accuracy/test/middle": 0.34401114206128136, "race/accuracy/group_average": 0.3097985956161181, "race/accuracy/seq_average": 0.2955006080259424, "siqa/accuracy/dev": 0.3510747185261003, "siqa/accuracy/group_average": 0.3510747185261003, "siqa/accuracy/seq_average": 0.3510747185261003, "commonsenseqa/accuracy/dev_rand_split": 0.2620802620802621, "commonsenseqa/accuracy/group_average": 0.2620802620802621, "commonsenseqa/accuracy/seq_average": 0.2620802620802621}
|
Pretrain_language_model/save/slimpajama_moe_no_attmoe_154M_no_lb/export/result-model-90000.pth.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/loss": 2.6238311283172124, "val/accuracy": 0.47744508773561506, "val/perplexity": 13.788447836860595, "val/time_since_best_loss": 0, "val/time_since_best_accuracy": 0, "lambada/loss": 2.6140553729134317, "lambada/accuracy/total": 0.23893633540372672, "lambada/accuracy/openai_last_token": 0.7614518633540373, "lambada/perplexity": 12.968886893745285, "lambada/lm_loss": 3.1803704177073033, "lambada/lm_perplexity": 24.0556625453059, "lambada/time_since_best_loss": 0, "lambada/time_since_best_accuracy": 0, "mean_accuracy": 0.35819071156967086, "mean_loss": 2.618943250615322, "blimp/accuracy/passive_2": 0.911, "blimp/accuracy/determiner_noun_agreement_2": 0.991, "blimp/accuracy/ellipsis_n_bar_1": 0.783, "blimp/accuracy/tough_vs_raising_2": 0.878, "blimp/accuracy/tough_vs_raising_1": 0.613, "blimp/accuracy/irregular_plural_subject_verb_agreement_2": 0.882, "blimp/accuracy/principle_A_reconstruction": 0.429, "blimp/accuracy/wh_vs_that_with_gap": 0.542, "blimp/accuracy/principle_A_domain_2": 0.821, "blimp/accuracy/determiner_noun_agreement_1": 0.987, "blimp/accuracy/ellipsis_n_bar_2": 0.892, "blimp/accuracy/principle_A_domain_3": 0.531, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_2": 0.92, "blimp/accuracy/animate_subject_trans": 0.907, "blimp/accuracy/determiner_noun_agreement_with_adj_irregular_1": 0.902, "blimp/accuracy/distractor_agreement_relative_clause": 0.66, "blimp/accuracy/transitive": 0.871, "blimp/accuracy/sentential_subject_island": 0.444, "blimp/accuracy/adjunct_island": 0.795, "blimp/accuracy/intransitive": 0.781, "blimp/accuracy/existential_there_subject_raising": 0.89, "blimp/accuracy/irregular_past_participle_adjectives": 0.97, "blimp/accuracy/coordinate_structure_constraint_complex_left_branch": 0.313, "blimp/accuracy/principle_A_case_1": 1.0, "blimp/accuracy/wh_vs_that_with_gap_long_distance": 0.238, "blimp/accuracy/only_npi_scope": 0.77, "blimp/accuracy/superlative_quantifiers_2": 0.748, "blimp/accuracy/passive_1": 0.876, "blimp/accuracy/regular_plural_subject_verb_agreement_1": 0.882, "blimp/accuracy/inchoative": 0.618, "blimp/accuracy/anaphor_gender_agreement": 0.899, "blimp/accuracy/principle_A_c_command": 0.637, "blimp/accuracy/only_npi_licensor_present": 0.665, "blimp/accuracy/expletive_it_object_raising": 0.782, "blimp/accuracy/left_branch_island_simple_question": 0.318, "blimp/accuracy/wh_questions_subject_gap": 0.924, "blimp/accuracy/existential_there_quantifiers_2": 0.288, "blimp/accuracy/determiner_noun_agreement_with_adj_2": 0.948, "blimp/accuracy/sentential_negation_npi_scope": 0.605, "blimp/accuracy/coordinate_structure_constraint_object_extraction": 0.768, "blimp/accuracy/wh_questions_subject_gap_long_distance": 0.847, "blimp/accuracy/irregular_plural_subject_verb_agreement_1": 0.908, "blimp/accuracy/principle_A_case_2": 0.963, "blimp/accuracy/distractor_agreement_relational_noun": 0.841, "blimp/accuracy/sentential_negation_npi_licensor_present": 0.942, "blimp/accuracy/superlative_quantifiers_1": 0.854, "blimp/accuracy/wh_island": 0.823, "blimp/accuracy/principle_A_domain_1": 0.985, "blimp/accuracy/complex_NP_island": 0.571, "blimp/accuracy/determiner_noun_agreement_irregular_2": 0.96, "blimp/accuracy/irregular_past_participle_verbs": 0.852, "blimp/accuracy/drop_argument": 0.773, "blimp/accuracy/wh_questions_object_gap": 0.784, "blimp/accuracy/animate_subject_passive": 0.78, "blimp/accuracy/existential_there_quantifiers_1": 0.963, "blimp/accuracy/regular_plural_subject_verb_agreement_2": 0.856, "blimp/accuracy/npi_present_2": 0.615, "blimp/accuracy/determiner_noun_agreement_irregular_1": 0.964, "blimp/accuracy/anaphor_number_agreement": 0.992, "blimp/accuracy/determiner_noun_agreement_with_adjective_1": 0.96, "blimp/accuracy/existential_there_object_raising": 0.832, "blimp/accuracy/matrix_question_npi_licensor_present": 0.214, "blimp/accuracy/npi_present_1": 0.487, "blimp/accuracy/wh_vs_that_no_gap": 0.977, "blimp/accuracy/left_branch_island_echo_question": 0.386, "blimp/accuracy/wh_vs_that_no_gap_long_distance": 0.972, "blimp/accuracy/causative": 0.704, "blimp/accuracy/group_average": 0.7684179104477613, "blimp/accuracy/seq_average": 0.7684179104477612, "cbt/accuracy/NE": 0.750801282051282, "cbt/accuracy/V": 0.9096, "cbt/accuracy/CN": 0.8112, "cbt/accuracy/P": 0.8856, "cbt/accuracy/group_average": 0.8393003205128206, "cbt/accuracy/seq_average": 0.8393357342937174, "hellaswag/accuracy/val": 0.2896833300139414, "hellaswag/accuracy/group_average": 0.2896833300139414, "hellaswag/accuracy/seq_average": 0.2896833300139414, "piqa/accuracy/val": 0.5729053318824809, "piqa/accuracy/group_average": 0.5729053318824809, "piqa/accuracy/seq_average": 0.5729053318824809, "ai2arc/accuracy/ARC-Easy": 0.33488372093023255, "ai2arc/accuracy/ARC-Challenge": 0.21545064377682405, "ai2arc/accuracy/group_average": 0.27516718235352833, "ai2arc/accuracy/seq_average": 0.29546742209631727, "race/accuracy/test/high": 0.2687249857061178, "race/accuracy/test/middle": 0.334958217270195, "race/accuracy/group_average": 0.3018416014881564, "race/accuracy/seq_average": 0.2880016214025132, "siqa/accuracy/dev": 0.3587512794268168, "siqa/accuracy/group_average": 0.3587512794268168, "siqa/accuracy/seq_average": 0.3587512794268168, "commonsenseqa/accuracy/dev_rand_split": 0.26535626535626533, "commonsenseqa/accuracy/group_average": 0.26535626535626533, "commonsenseqa/accuracy/seq_average": 0.26535626535626533}
|