{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2, "grad_norm": 1.116289734840393, "learning_rate": 9.8989898989899e-05, "loss": 7.7821, "mean_token_accuracy": 0.5719940811395645, "step": 20 }, { "epoch": 0.4, "grad_norm": 7.432015419006348, "learning_rate": 9.696969696969698e-05, "loss": 5.6944, "mean_token_accuracy": 0.5889941781759263, "step": 40 }, { "epoch": 0.6, "grad_norm": 4.042129039764404, "learning_rate": 9.494949494949495e-05, "loss": 2.5109, "mean_token_accuracy": 0.6000781744718552, "step": 60 }, { "epoch": 0.8, "grad_norm": 13.056010246276855, "learning_rate": 9.292929292929293e-05, "loss": 2.3438, "mean_token_accuracy": 0.6159425437450409, "step": 80 }, { "epoch": 1.0, "grad_norm": 4.457572937011719, "learning_rate": 9.090909090909092e-05, "loss": 2.3887, "mean_token_accuracy": 0.6172163128852844, "step": 100 }, { "epoch": 1.2, "grad_norm": 3.7061715126037598, "learning_rate": 8.888888888888889e-05, "loss": 2.2335, "mean_token_accuracy": 0.7337327748537064, "step": 120 }, { "epoch": 1.4, "grad_norm": 9.19980525970459, "learning_rate": 8.686868686868688e-05, "loss": 2.3133, "mean_token_accuracy": 0.7386979848146439, "step": 140 }, { "epoch": 1.6, "grad_norm": 2.3915796279907227, "learning_rate": 8.484848484848486e-05, "loss": 1.6699, "mean_token_accuracy": 0.6935379534959794, "step": 160 }, { "epoch": 1.8, "grad_norm": 3.794569492340088, "learning_rate": 8.282828282828283e-05, "loss": 2.0775, "mean_token_accuracy": 0.6832446664571762, "step": 180 }, { "epoch": 2.0, "grad_norm": 4.765685081481934, "learning_rate": 8.080808080808081e-05, "loss": 2.3591, "mean_token_accuracy": 0.7070848256349563, "step": 200 }, { "epoch": 2.2, "grad_norm": 3.130249500274658, "learning_rate": 7.878787878787879e-05, "loss": 2.029, "mean_token_accuracy": 0.8673289567232132, "step": 220 }, { "epoch": 2.4, "grad_norm": 2.911282777786255, "learning_rate": 7.676767676767676e-05, "loss": 2.0265, "mean_token_accuracy": 0.8326004981994629, "step": 240 }, { "epoch": 2.6, "grad_norm": 10.721837997436523, "learning_rate": 7.474747474747475e-05, "loss": 2.3098, "mean_token_accuracy": 0.8215500295162201, "step": 260 }, { "epoch": 2.8, "grad_norm": 3.9908812046051025, "learning_rate": 7.272727272727273e-05, "loss": 2.1964, "mean_token_accuracy": 0.8110707819461822, "step": 280 }, { "epoch": 3.0, "grad_norm": 2.715615749359131, "learning_rate": 7.07070707070707e-05, "loss": 1.6929, "mean_token_accuracy": 0.8274272859096528, "step": 300 }, { "epoch": 3.2, "grad_norm": 2.644812822341919, "learning_rate": 6.86868686868687e-05, "loss": 2.1799, "mean_token_accuracy": 0.8936949998140336, "step": 320 }, { "epoch": 3.4, "grad_norm": 10.8971586227417, "learning_rate": 6.666666666666667e-05, "loss": 1.9275, "mean_token_accuracy": 0.9045962303876877, "step": 340 }, { "epoch": 3.6, "grad_norm": 11.13720703125, "learning_rate": 6.464646464646466e-05, "loss": 1.8069, "mean_token_accuracy": 0.9026908665895462, "step": 360 }, { "epoch": 3.8, "grad_norm": 8.521758079528809, "learning_rate": 6.262626262626264e-05, "loss": 1.8283, "mean_token_accuracy": 0.8914051353931427, "step": 380 }, { "epoch": 4.0, "grad_norm": 2.4625191688537598, "learning_rate": 6.060606060606061e-05, "loss": 1.8709, "mean_token_accuracy": 0.8897238075733185, "step": 400 }, { "epoch": 4.2, "grad_norm": 2.8660027980804443, "learning_rate": 5.858585858585859e-05, "loss": 1.7593, "mean_token_accuracy": 0.9680275410413742, "step": 420 }, { "epoch": 4.4, "grad_norm": 2.451266288757324, "learning_rate": 5.6565656565656563e-05, "loss": 1.8117, "mean_token_accuracy": 0.957812437415123, "step": 440 }, { "epoch": 4.6, "grad_norm": 3.294262170791626, "learning_rate": 5.4545454545454546e-05, "loss": 2.0136, "mean_token_accuracy": 0.9584823906421661, "step": 460 }, { "epoch": 4.8, "grad_norm": 6.892923831939697, "learning_rate": 5.2525252525252536e-05, "loss": 1.9365, "mean_token_accuracy": 0.9545588850975036, "step": 480 }, { "epoch": 5.0, "grad_norm": 4.34652853012085, "learning_rate": 5.050505050505051e-05, "loss": 1.8009, "mean_token_accuracy": 0.9617706775665283, "step": 500 }, { "epoch": 5.2, "grad_norm": 2.743523359298706, "learning_rate": 4.848484848484849e-05, "loss": 1.7229, "mean_token_accuracy": 0.9849441170692443, "step": 520 }, { "epoch": 5.4, "grad_norm": 1.957683801651001, "learning_rate": 4.6464646464646464e-05, "loss": 1.803, "mean_token_accuracy": 0.9844884544610977, "step": 540 }, { "epoch": 5.6, "grad_norm": 5.973021984100342, "learning_rate": 4.4444444444444447e-05, "loss": 1.8576, "mean_token_accuracy": 0.9885385453701019, "step": 560 }, { "epoch": 5.8, "grad_norm": 4.1460065841674805, "learning_rate": 4.242424242424243e-05, "loss": 1.4914, "mean_token_accuracy": 0.9893402248620987, "step": 580 }, { "epoch": 6.0, "grad_norm": 4.472782611846924, "learning_rate": 4.0404040404040405e-05, "loss": 1.7834, "mean_token_accuracy": 0.9883693248033524, "step": 600 }, { "epoch": 6.2, "grad_norm": 1.3110531568527222, "learning_rate": 3.838383838383838e-05, "loss": 1.7085, "mean_token_accuracy": 0.9919841647148132, "step": 620 }, { "epoch": 6.4, "grad_norm": 2.2377755641937256, "learning_rate": 3.6363636363636364e-05, "loss": 1.4832, "mean_token_accuracy": 0.9942442446947097, "step": 640 }, { "epoch": 6.6, "grad_norm": 1.8272420167922974, "learning_rate": 3.434343434343435e-05, "loss": 4.5107, "mean_token_accuracy": 0.9917412668466568, "step": 660 }, { "epoch": 6.8, "grad_norm": 1.3741998672485352, "learning_rate": 3.232323232323233e-05, "loss": 1.5028, "mean_token_accuracy": 0.9937045365571976, "step": 680 }, { "epoch": 7.0, "grad_norm": 3.3915441036224365, "learning_rate": 3.0303030303030306e-05, "loss": 1.6471, "mean_token_accuracy": 0.9931736469268799, "step": 700 }, { "epoch": 7.2, "grad_norm": 1.9566644430160522, "learning_rate": 2.8282828282828282e-05, "loss": 1.6651, "mean_token_accuracy": 0.9943238377571106, "step": 720 }, { "epoch": 7.4, "grad_norm": 4.849315643310547, "learning_rate": 2.6262626262626268e-05, "loss": 1.4752, "mean_token_accuracy": 0.9958192646503449, "step": 740 }, { "epoch": 7.6, "grad_norm": 3.041971445083618, "learning_rate": 2.4242424242424244e-05, "loss": 1.7104, "mean_token_accuracy": 0.9946472465991973, "step": 760 }, { "epoch": 7.8, "grad_norm": 1.911787986755371, "learning_rate": 2.2222222222222223e-05, "loss": 1.636, "mean_token_accuracy": 0.9947971910238266, "step": 780 }, { "epoch": 8.0, "grad_norm": 1.7661563158035278, "learning_rate": 2.0202020202020203e-05, "loss": 1.4368, "mean_token_accuracy": 0.9952641248703002, "step": 800 }, { "epoch": 8.2, "grad_norm": 2.0136585235595703, "learning_rate": 1.8181818181818182e-05, "loss": 1.5714, "mean_token_accuracy": 0.9960389107465744, "step": 820 }, { "epoch": 8.4, "grad_norm": 3.6875247955322266, "learning_rate": 1.6161616161616165e-05, "loss": 1.5223, "mean_token_accuracy": 0.9955139189958573, "step": 840 }, { "epoch": 8.6, "grad_norm": 8.798554420471191, "learning_rate": 1.4141414141414141e-05, "loss": 1.6692, "mean_token_accuracy": 0.9961422830820084, "step": 860 }, { "epoch": 8.8, "grad_norm": 1.6955562829971313, "learning_rate": 1.2121212121212122e-05, "loss": 1.5249, "mean_token_accuracy": 0.9961209177970887, "step": 880 }, { "epoch": 9.0, "grad_norm": 1.8484035730361938, "learning_rate": 1.0101010101010101e-05, "loss": 1.4942, "mean_token_accuracy": 0.9962600082159042, "step": 900 }, { "epoch": 9.2, "grad_norm": 2.3674087524414062, "learning_rate": 8.080808080808082e-06, "loss": 1.4723, "mean_token_accuracy": 0.9962725698947906, "step": 920 }, { "epoch": 9.4, "grad_norm": 2.005415201187134, "learning_rate": 6.060606060606061e-06, "loss": 1.6156, "mean_token_accuracy": 0.9956353396177292, "step": 940 }, { "epoch": 9.6, "grad_norm": 1.6951625347137451, "learning_rate": 4.040404040404041e-06, "loss": 1.4389, "mean_token_accuracy": 0.9975101411342621, "step": 960 }, { "epoch": 9.8, "grad_norm": 1.3709019422531128, "learning_rate": 2.0202020202020206e-06, "loss": 1.5696, "mean_token_accuracy": 0.9961961567401886, "step": 980 }, { "epoch": 10.0, "grad_norm": 1.6969162225723267, "learning_rate": 0.0, "loss": 1.6118, "mean_token_accuracy": 0.9955831974744797, "step": 1000 } ], "logging_steps": 20, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.174310629214269e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }