{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 1600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03,
      "grad_norm": 18.0,
      "learning_rate": 1.0416666666666667e-06,
      "loss": 0.9392,
      "step": 25
    },
    {
      "epoch": 0.06,
      "grad_norm": 69.5,
      "learning_rate": 2.0833333333333334e-06,
      "loss": 0.4936,
      "step": 50
    },
    {
      "epoch": 0.09,
      "grad_norm": 15.375,
      "learning_rate": 3.125e-06,
      "loss": 0.3247,
      "step": 75
    },
    {
      "epoch": 0.12,
      "grad_norm": 25.625,
      "learning_rate": 4.166666666666667e-06,
      "loss": 0.3285,
      "step": 100
    },
    {
      "epoch": 0.16,
      "grad_norm": 12.9375,
      "learning_rate": 4.9999795126530275e-06,
      "loss": 0.3412,
      "step": 125
    },
    {
      "epoch": 0.19,
      "grad_norm": 21.375,
      "learning_rate": 4.99926249076577e-06,
      "loss": 0.3197,
      "step": 150
    },
    {
      "epoch": 0.22,
      "grad_norm": 7.03125,
      "learning_rate": 4.997521437290205e-06,
      "loss": 0.3462,
      "step": 175
    },
    {
      "epoch": 0.25,
      "grad_norm": 15.8125,
      "learning_rate": 4.99475706559428e-06,
      "loss": 0.3598,
      "step": 200
    },
    {
      "epoch": 0.28,
      "grad_norm": 6.53125,
      "learning_rate": 4.990970508333707e-06,
      "loss": 0.3367,
      "step": 225
    },
    {
      "epoch": 0.31,
      "grad_norm": 12.9375,
      "learning_rate": 4.986163316987877e-06,
      "loss": 0.3645,
      "step": 250
    },
    {
      "epoch": 0.34,
      "grad_norm": 7.28125,
      "learning_rate": 4.980337461224164e-06,
      "loss": 0.3392,
      "step": 275
    },
    {
      "epoch": 0.38,
      "grad_norm": 14.5625,
      "learning_rate": 4.973495328090891e-06,
      "loss": 0.2992,
      "step": 300
    },
    {
      "epoch": 0.41,
      "grad_norm": 9.875,
      "learning_rate": 4.965639721039267e-06,
      "loss": 0.3209,
      "step": 325
    },
    {
      "epoch": 0.44,
      "grad_norm": 7.09375,
      "learning_rate": 4.9567738587747314e-06,
      "loss": 0.3477,
      "step": 350
    },
    {
      "epoch": 0.47,
      "grad_norm": 7.5,
      "learning_rate": 4.946901373938132e-06,
      "loss": 0.3436,
      "step": 375
    },
    {
      "epoch": 0.5,
      "grad_norm": 22.125,
      "learning_rate": 4.936026311617316e-06,
      "loss": 0.3453,
      "step": 400
    },
    {
      "epoch": 0.53,
      "grad_norm": 5.25,
      "learning_rate": 4.9241531276897196e-06,
      "loss": 0.3626,
      "step": 425
    },
    {
      "epoch": 0.56,
      "grad_norm": 17.125,
      "learning_rate": 4.911286686996648e-06,
      "loss": 0.3197,
      "step": 450
    },
    {
      "epoch": 0.59,
      "grad_norm": 5.0,
      "learning_rate": 4.897432261349984e-06,
      "loss": 0.3353,
      "step": 475
    },
    {
      "epoch": 0.62,
      "grad_norm": 19.0,
      "learning_rate": 4.8825955273721524e-06,
      "loss": 0.3209,
      "step": 500
    },
    {
      "epoch": 0.66,
      "grad_norm": 7.125,
      "learning_rate": 4.866782564170217e-06,
      "loss": 0.3336,
      "step": 525
    },
    {
      "epoch": 0.69,
      "grad_norm": 12.5625,
      "learning_rate": 4.849999850845066e-06,
      "loss": 0.3564,
      "step": 550
    },
    {
      "epoch": 0.72,
      "grad_norm": 7.78125,
      "learning_rate": 4.832254263836708e-06,
      "loss": 0.3461,
      "step": 575
    },
    {
      "epoch": 0.75,
      "grad_norm": 7.28125,
      "learning_rate": 4.813553074106761e-06,
      "loss": 0.2953,
      "step": 600
    },
    {
      "epoch": 0.78,
      "grad_norm": 3.828125,
      "learning_rate": 4.793903944159303e-06,
      "loss": 0.3031,
      "step": 625
    },
    {
      "epoch": 0.81,
      "grad_norm": 10.25,
      "learning_rate": 4.773314924901281e-06,
      "loss": 0.2889,
      "step": 650
    },
    {
      "epoch": 0.84,
      "grad_norm": 9.0625,
      "learning_rate": 4.751794452343785e-06,
      "loss": 0.3648,
      "step": 675
    },
    {
      "epoch": 0.88,
      "grad_norm": 21.75,
      "learning_rate": 4.729351344145536e-06,
      "loss": 0.2968,
      "step": 700
    },
    {
      "epoch": 0.91,
      "grad_norm": 9.4375,
      "learning_rate": 4.705994795999991e-06,
      "loss": 0.3095,
      "step": 725
    },
    {
      "epoch": 0.94,
      "grad_norm": 17.875,
      "learning_rate": 4.681734377867562e-06,
      "loss": 0.3159,
      "step": 750
    },
    {
      "epoch": 0.97,
      "grad_norm": 5.03125,
      "learning_rate": 4.6565800300544805e-06,
      "loss": 0.3172,
      "step": 775
    },
    {
      "epoch": 1.0,
      "grad_norm": 10.4375,
      "learning_rate": 4.630542059139923e-06,
      "loss": 0.2989,
      "step": 800
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.31020960211753845,
      "eval_runtime": 101.536,
      "eval_samples_per_second": 2.955,
      "eval_steps_per_second": 0.374,
      "step": 800
    },
    {
      "epoch": 1.03,
      "grad_norm": 4.8125,
      "learning_rate": 4.603631133753061e-06,
      "loss": 0.2717,
      "step": 825
    },
    {
      "epoch": 1.06,
      "grad_norm": 5.84375,
      "learning_rate": 4.575858280201761e-06,
      "loss": 0.2579,
      "step": 850
    },
    {
      "epoch": 1.09,
      "grad_norm": 6.71875,
      "learning_rate": 4.547234877954741e-06,
      "loss": 0.2666,
      "step": 875
    },
    {
      "epoch": 1.12,
      "grad_norm": 11.375,
      "learning_rate": 4.517772654979024e-06,
      "loss": 0.2467,
      "step": 900
    },
    {
      "epoch": 1.16,
      "grad_norm": 5.21875,
      "learning_rate": 4.487483682934587e-06,
      "loss": 0.257,
      "step": 925
    },
    {
      "epoch": 1.19,
      "grad_norm": 3.5625,
      "learning_rate": 4.456380372228208e-06,
      "loss": 0.2571,
      "step": 950
    },
    {
      "epoch": 1.22,
      "grad_norm": 5.375,
      "learning_rate": 4.424475466928499e-06,
      "loss": 0.2603,
      "step": 975
    },
    {
      "epoch": 1.25,
      "grad_norm": 12.6875,
      "learning_rate": 4.391782039544239e-06,
      "loss": 0.25,
      "step": 1000
    },
    {
      "epoch": 1.28,
      "grad_norm": 6.75,
      "learning_rate": 4.358313485668124e-06,
      "loss": 0.2759,
      "step": 1025
    },
    {
      "epoch": 1.31,
      "grad_norm": 16.75,
      "learning_rate": 4.324083518488151e-06,
      "loss": 0.2449,
      "step": 1050
    },
    {
      "epoch": 1.34,
      "grad_norm": 7.4375,
      "learning_rate": 4.289106163168858e-06,
      "loss": 0.278,
      "step": 1075
    },
    {
      "epoch": 1.38,
      "grad_norm": 11.5625,
      "learning_rate": 4.2533957511047485e-06,
      "loss": 0.2362,
      "step": 1100
    },
    {
      "epoch": 1.41,
      "grad_norm": 6.90625,
      "learning_rate": 4.2169669140482365e-06,
      "loss": 0.2597,
      "step": 1125
    },
    {
      "epoch": 1.44,
      "grad_norm": 10.8125,
      "learning_rate": 4.179834578114531e-06,
      "loss": 0.2149,
      "step": 1150
    },
    {
      "epoch": 1.47,
      "grad_norm": 5.25,
      "learning_rate": 4.142013957665903e-06,
      "loss": 0.2922,
      "step": 1175
    },
    {
      "epoch": 1.5,
      "grad_norm": 8.4375,
      "learning_rate": 4.1035205490778505e-06,
      "loss": 0.2274,
      "step": 1200
    },
    {
      "epoch": 1.53,
      "grad_norm": 4.5625,
      "learning_rate": 4.064370124389718e-06,
      "loss": 0.2603,
      "step": 1225
    },
    {
      "epoch": 1.56,
      "grad_norm": 11.9375,
      "learning_rate": 4.0245787248423614e-06,
      "loss": 0.2122,
      "step": 1250
    },
    {
      "epoch": 1.59,
      "grad_norm": 5.53125,
      "learning_rate": 3.984162654305516e-06,
      "loss": 0.2608,
      "step": 1275
    },
    {
      "epoch": 1.62,
      "grad_norm": 10.6875,
      "learning_rate": 3.943138472597549e-06,
      "loss": 0.2197,
      "step": 1300
    },
    {
      "epoch": 1.66,
      "grad_norm": 6.28125,
      "learning_rate": 3.901522988700355e-06,
      "loss": 0.2711,
      "step": 1325
    },
    {
      "epoch": 1.69,
      "grad_norm": 13.875,
      "learning_rate": 3.8593332538721465e-06,
      "loss": 0.2394,
      "step": 1350
    },
    {
      "epoch": 1.72,
      "grad_norm": 8.5,
      "learning_rate": 3.816586554660987e-06,
      "loss": 0.2615,
      "step": 1375
    },
    {
      "epoch": 1.75,
      "grad_norm": 13.625,
      "learning_rate": 3.773300405821908e-06,
      "loss": 0.2495,
      "step": 1400
    },
    {
      "epoch": 1.78,
      "grad_norm": 5.03125,
      "learning_rate": 3.7294925431405306e-06,
      "loss": 0.2605,
      "step": 1425
    },
    {
      "epoch": 1.81,
      "grad_norm": 14.3125,
      "learning_rate": 3.6851809161661206e-06,
      "loss": 0.2298,
      "step": 1450
    },
    {
      "epoch": 1.84,
      "grad_norm": 7.03125,
      "learning_rate": 3.6403836808570512e-06,
      "loss": 0.2604,
      "step": 1475
    },
    {
      "epoch": 1.88,
      "grad_norm": 15.5,
      "learning_rate": 3.5951191921417063e-06,
      "loss": 0.2354,
      "step": 1500
    },
    {
      "epoch": 1.91,
      "grad_norm": 7.125,
      "learning_rate": 3.5494059963978433e-06,
      "loss": 0.2883,
      "step": 1525
    },
    {
      "epoch": 1.94,
      "grad_norm": 7.46875,
      "learning_rate": 3.503262823853527e-06,
      "loss": 0.2577,
      "step": 1550
    },
    {
      "epoch": 1.97,
      "grad_norm": 7.71875,
      "learning_rate": 3.4567085809127247e-06,
      "loss": 0.2587,
      "step": 1575
    },
    {
      "epoch": 2.0,
      "grad_norm": 9.75,
      "learning_rate": 3.4097623424087196e-06,
      "loss": 0.2864,
      "step": 1600
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.30087584257125854,
      "eval_runtime": 100.418,
      "eval_samples_per_second": 2.988,
      "eval_steps_per_second": 0.378,
      "step": 1600
    }
  ],
  "logging_steps": 25,
  "max_steps": 4000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "total_flos": 1.208460096432046e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}