| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 10.0, |
| "eval_steps": 500, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.2, |
| "grad_norm": 1.116289734840393, |
| "learning_rate": 9.8989898989899e-05, |
| "loss": 7.7821, |
| "mean_token_accuracy": 0.5719940811395645, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 7.432015419006348, |
| "learning_rate": 9.696969696969698e-05, |
| "loss": 5.6944, |
| "mean_token_accuracy": 0.5889941781759263, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 4.042129039764404, |
| "learning_rate": 9.494949494949495e-05, |
| "loss": 2.5109, |
| "mean_token_accuracy": 0.6000781744718552, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 13.056010246276855, |
| "learning_rate": 9.292929292929293e-05, |
| "loss": 2.3438, |
| "mean_token_accuracy": 0.6159425437450409, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 4.457572937011719, |
| "learning_rate": 9.090909090909092e-05, |
| "loss": 2.3887, |
| "mean_token_accuracy": 0.6172163128852844, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 3.7061715126037598, |
| "learning_rate": 8.888888888888889e-05, |
| "loss": 2.2335, |
| "mean_token_accuracy": 0.7337327748537064, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 9.19980525970459, |
| "learning_rate": 8.686868686868688e-05, |
| "loss": 2.3133, |
| "mean_token_accuracy": 0.7386979848146439, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 2.3915796279907227, |
| "learning_rate": 8.484848484848486e-05, |
| "loss": 1.6699, |
| "mean_token_accuracy": 0.6935379534959794, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 3.794569492340088, |
| "learning_rate": 8.282828282828283e-05, |
| "loss": 2.0775, |
| "mean_token_accuracy": 0.6832446664571762, |
| "step": 180 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 4.765685081481934, |
| "learning_rate": 8.080808080808081e-05, |
| "loss": 2.3591, |
| "mean_token_accuracy": 0.7070848256349563, |
| "step": 200 |
| }, |
| { |
| "epoch": 2.2, |
| "grad_norm": 3.130249500274658, |
| "learning_rate": 7.878787878787879e-05, |
| "loss": 2.029, |
| "mean_token_accuracy": 0.8673289567232132, |
| "step": 220 |
| }, |
| { |
| "epoch": 2.4, |
| "grad_norm": 2.911282777786255, |
| "learning_rate": 7.676767676767676e-05, |
| "loss": 2.0265, |
| "mean_token_accuracy": 0.8326004981994629, |
| "step": 240 |
| }, |
| { |
| "epoch": 2.6, |
| "grad_norm": 10.721837997436523, |
| "learning_rate": 7.474747474747475e-05, |
| "loss": 2.3098, |
| "mean_token_accuracy": 0.8215500295162201, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.8, |
| "grad_norm": 3.9908812046051025, |
| "learning_rate": 7.272727272727273e-05, |
| "loss": 2.1964, |
| "mean_token_accuracy": 0.8110707819461822, |
| "step": 280 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 2.715615749359131, |
| "learning_rate": 7.07070707070707e-05, |
| "loss": 1.6929, |
| "mean_token_accuracy": 0.8274272859096528, |
| "step": 300 |
| }, |
| { |
| "epoch": 3.2, |
| "grad_norm": 2.644812822341919, |
| "learning_rate": 6.86868686868687e-05, |
| "loss": 2.1799, |
| "mean_token_accuracy": 0.8936949998140336, |
| "step": 320 |
| }, |
| { |
| "epoch": 3.4, |
| "grad_norm": 10.8971586227417, |
| "learning_rate": 6.666666666666667e-05, |
| "loss": 1.9275, |
| "mean_token_accuracy": 0.9045962303876877, |
| "step": 340 |
| }, |
| { |
| "epoch": 3.6, |
| "grad_norm": 11.13720703125, |
| "learning_rate": 6.464646464646466e-05, |
| "loss": 1.8069, |
| "mean_token_accuracy": 0.9026908665895462, |
| "step": 360 |
| }, |
| { |
| "epoch": 3.8, |
| "grad_norm": 8.521758079528809, |
| "learning_rate": 6.262626262626264e-05, |
| "loss": 1.8283, |
| "mean_token_accuracy": 0.8914051353931427, |
| "step": 380 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 2.4625191688537598, |
| "learning_rate": 6.060606060606061e-05, |
| "loss": 1.8709, |
| "mean_token_accuracy": 0.8897238075733185, |
| "step": 400 |
| }, |
| { |
| "epoch": 4.2, |
| "grad_norm": 2.8660027980804443, |
| "learning_rate": 5.858585858585859e-05, |
| "loss": 1.7593, |
| "mean_token_accuracy": 0.9680275410413742, |
| "step": 420 |
| }, |
| { |
| "epoch": 4.4, |
| "grad_norm": 2.451266288757324, |
| "learning_rate": 5.6565656565656563e-05, |
| "loss": 1.8117, |
| "mean_token_accuracy": 0.957812437415123, |
| "step": 440 |
| }, |
| { |
| "epoch": 4.6, |
| "grad_norm": 3.294262170791626, |
| "learning_rate": 5.4545454545454546e-05, |
| "loss": 2.0136, |
| "mean_token_accuracy": 0.9584823906421661, |
| "step": 460 |
| }, |
| { |
| "epoch": 4.8, |
| "grad_norm": 6.892923831939697, |
| "learning_rate": 5.2525252525252536e-05, |
| "loss": 1.9365, |
| "mean_token_accuracy": 0.9545588850975036, |
| "step": 480 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 4.34652853012085, |
| "learning_rate": 5.050505050505051e-05, |
| "loss": 1.8009, |
| "mean_token_accuracy": 0.9617706775665283, |
| "step": 500 |
| }, |
| { |
| "epoch": 5.2, |
| "grad_norm": 2.743523359298706, |
| "learning_rate": 4.848484848484849e-05, |
| "loss": 1.7229, |
| "mean_token_accuracy": 0.9849441170692443, |
| "step": 520 |
| }, |
| { |
| "epoch": 5.4, |
| "grad_norm": 1.957683801651001, |
| "learning_rate": 4.6464646464646464e-05, |
| "loss": 1.803, |
| "mean_token_accuracy": 0.9844884544610977, |
| "step": 540 |
| }, |
| { |
| "epoch": 5.6, |
| "grad_norm": 5.973021984100342, |
| "learning_rate": 4.4444444444444447e-05, |
| "loss": 1.8576, |
| "mean_token_accuracy": 0.9885385453701019, |
| "step": 560 |
| }, |
| { |
| "epoch": 5.8, |
| "grad_norm": 4.1460065841674805, |
| "learning_rate": 4.242424242424243e-05, |
| "loss": 1.4914, |
| "mean_token_accuracy": 0.9893402248620987, |
| "step": 580 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 4.472782611846924, |
| "learning_rate": 4.0404040404040405e-05, |
| "loss": 1.7834, |
| "mean_token_accuracy": 0.9883693248033524, |
| "step": 600 |
| }, |
| { |
| "epoch": 6.2, |
| "grad_norm": 1.3110531568527222, |
| "learning_rate": 3.838383838383838e-05, |
| "loss": 1.7085, |
| "mean_token_accuracy": 0.9919841647148132, |
| "step": 620 |
| }, |
| { |
| "epoch": 6.4, |
| "grad_norm": 2.2377755641937256, |
| "learning_rate": 3.6363636363636364e-05, |
| "loss": 1.4832, |
| "mean_token_accuracy": 0.9942442446947097, |
| "step": 640 |
| }, |
| { |
| "epoch": 6.6, |
| "grad_norm": 1.8272420167922974, |
| "learning_rate": 3.434343434343435e-05, |
| "loss": 4.5107, |
| "mean_token_accuracy": 0.9917412668466568, |
| "step": 660 |
| }, |
| { |
| "epoch": 6.8, |
| "grad_norm": 1.3741998672485352, |
| "learning_rate": 3.232323232323233e-05, |
| "loss": 1.5028, |
| "mean_token_accuracy": 0.9937045365571976, |
| "step": 680 |
| }, |
| { |
| "epoch": 7.0, |
| "grad_norm": 3.3915441036224365, |
| "learning_rate": 3.0303030303030306e-05, |
| "loss": 1.6471, |
| "mean_token_accuracy": 0.9931736469268799, |
| "step": 700 |
| }, |
| { |
| "epoch": 7.2, |
| "grad_norm": 1.9566644430160522, |
| "learning_rate": 2.8282828282828282e-05, |
| "loss": 1.6651, |
| "mean_token_accuracy": 0.9943238377571106, |
| "step": 720 |
| }, |
| { |
| "epoch": 7.4, |
| "grad_norm": 4.849315643310547, |
| "learning_rate": 2.6262626262626268e-05, |
| "loss": 1.4752, |
| "mean_token_accuracy": 0.9958192646503449, |
| "step": 740 |
| }, |
| { |
| "epoch": 7.6, |
| "grad_norm": 3.041971445083618, |
| "learning_rate": 2.4242424242424244e-05, |
| "loss": 1.7104, |
| "mean_token_accuracy": 0.9946472465991973, |
| "step": 760 |
| }, |
| { |
| "epoch": 7.8, |
| "grad_norm": 1.911787986755371, |
| "learning_rate": 2.2222222222222223e-05, |
| "loss": 1.636, |
| "mean_token_accuracy": 0.9947971910238266, |
| "step": 780 |
| }, |
| { |
| "epoch": 8.0, |
| "grad_norm": 1.7661563158035278, |
| "learning_rate": 2.0202020202020203e-05, |
| "loss": 1.4368, |
| "mean_token_accuracy": 0.9952641248703002, |
| "step": 800 |
| }, |
| { |
| "epoch": 8.2, |
| "grad_norm": 2.0136585235595703, |
| "learning_rate": 1.8181818181818182e-05, |
| "loss": 1.5714, |
| "mean_token_accuracy": 0.9960389107465744, |
| "step": 820 |
| }, |
| { |
| "epoch": 8.4, |
| "grad_norm": 3.6875247955322266, |
| "learning_rate": 1.6161616161616165e-05, |
| "loss": 1.5223, |
| "mean_token_accuracy": 0.9955139189958573, |
| "step": 840 |
| }, |
| { |
| "epoch": 8.6, |
| "grad_norm": 8.798554420471191, |
| "learning_rate": 1.4141414141414141e-05, |
| "loss": 1.6692, |
| "mean_token_accuracy": 0.9961422830820084, |
| "step": 860 |
| }, |
| { |
| "epoch": 8.8, |
| "grad_norm": 1.6955562829971313, |
| "learning_rate": 1.2121212121212122e-05, |
| "loss": 1.5249, |
| "mean_token_accuracy": 0.9961209177970887, |
| "step": 880 |
| }, |
| { |
| "epoch": 9.0, |
| "grad_norm": 1.8484035730361938, |
| "learning_rate": 1.0101010101010101e-05, |
| "loss": 1.4942, |
| "mean_token_accuracy": 0.9962600082159042, |
| "step": 900 |
| }, |
| { |
| "epoch": 9.2, |
| "grad_norm": 2.3674087524414062, |
| "learning_rate": 8.080808080808082e-06, |
| "loss": 1.4723, |
| "mean_token_accuracy": 0.9962725698947906, |
| "step": 920 |
| }, |
| { |
| "epoch": 9.4, |
| "grad_norm": 2.005415201187134, |
| "learning_rate": 6.060606060606061e-06, |
| "loss": 1.6156, |
| "mean_token_accuracy": 0.9956353396177292, |
| "step": 940 |
| }, |
| { |
| "epoch": 9.6, |
| "grad_norm": 1.6951625347137451, |
| "learning_rate": 4.040404040404041e-06, |
| "loss": 1.4389, |
| "mean_token_accuracy": 0.9975101411342621, |
| "step": 960 |
| }, |
| { |
| "epoch": 9.8, |
| "grad_norm": 1.3709019422531128, |
| "learning_rate": 2.0202020202020206e-06, |
| "loss": 1.5696, |
| "mean_token_accuracy": 0.9961961567401886, |
| "step": 980 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 1.6969162225723267, |
| "learning_rate": 0.0, |
| "loss": 1.6118, |
| "mean_token_accuracy": 0.9955831974744797, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 20, |
| "max_steps": 1000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.174310629214269e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|