| { |
| "best_metric": 0.26617029309272766, |
| "best_model_checkpoint": "train_output_p/trainer/checkpoint-90", |
| "epoch": 2.0, |
| "eval_steps": 45, |
| "global_step": 90, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.022222222222222223, |
| "grad_norm": 510.0, |
| "learning_rate": 1.4814814814814815e-06, |
| "loss": 1.57, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.044444444444444446, |
| "grad_norm": 456.0, |
| "learning_rate": 2.962962962962963e-06, |
| "loss": 1.5059, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.06666666666666667, |
| "grad_norm": 255.0, |
| "learning_rate": 4.444444444444444e-06, |
| "loss": 1.2407, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.08888888888888889, |
| "grad_norm": 37.75, |
| "learning_rate": 5.925925925925926e-06, |
| "loss": 1.0711, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.1111111111111111, |
| "grad_norm": 19.0, |
| "learning_rate": 7.4074074074074075e-06, |
| "loss": 0.8365, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.13333333333333333, |
| "grad_norm": 14.625, |
| "learning_rate": 8.888888888888888e-06, |
| "loss": 0.7852, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.15555555555555556, |
| "grad_norm": 45.0, |
| "learning_rate": 1.037037037037037e-05, |
| "loss": 0.7262, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.17777777777777778, |
| "grad_norm": 19.375, |
| "learning_rate": 1.1851851851851852e-05, |
| "loss": 0.6195, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 33.75, |
| "learning_rate": 1.3333333333333333e-05, |
| "loss": 0.541, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.2222222222222222, |
| "grad_norm": 16.25, |
| "learning_rate": 1.4814814814814815e-05, |
| "loss": 0.4122, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.24444444444444444, |
| "grad_norm": 8.8125, |
| "learning_rate": 1.6296296296296297e-05, |
| "loss": 0.3315, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.26666666666666666, |
| "grad_norm": 6.75, |
| "learning_rate": 1.7777777777777777e-05, |
| "loss": 0.2999, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.28888888888888886, |
| "grad_norm": 7.28125, |
| "learning_rate": 1.925925925925926e-05, |
| "loss": 0.3754, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.3111111111111111, |
| "grad_norm": 8.5, |
| "learning_rate": 2.074074074074074e-05, |
| "loss": 0.3605, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.3333333333333333, |
| "grad_norm": 6.40625, |
| "learning_rate": 2.2222222222222227e-05, |
| "loss": 0.2987, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.35555555555555557, |
| "grad_norm": 9.25, |
| "learning_rate": 2.3703703703703703e-05, |
| "loss": 0.2917, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.37777777777777777, |
| "grad_norm": 6.375, |
| "learning_rate": 2.5185185185185187e-05, |
| "loss": 0.3148, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 9.625, |
| "learning_rate": 2.6666666666666667e-05, |
| "loss": 0.3413, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.4222222222222222, |
| "grad_norm": 5.0625, |
| "learning_rate": 2.814814814814815e-05, |
| "loss": 0.3995, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.4444444444444444, |
| "grad_norm": 5.5, |
| "learning_rate": 2.962962962962963e-05, |
| "loss": 0.2721, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.4666666666666667, |
| "grad_norm": 6.59375, |
| "learning_rate": 3.111111111111112e-05, |
| "loss": 0.2863, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.4888888888888889, |
| "grad_norm": 5.53125, |
| "learning_rate": 3.259259259259259e-05, |
| "loss": 0.3648, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.5111111111111111, |
| "grad_norm": 3.5625, |
| "learning_rate": 3.4074074074074077e-05, |
| "loss": 0.2912, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.5333333333333333, |
| "grad_norm": 3.546875, |
| "learning_rate": 3.555555555555555e-05, |
| "loss": 0.3062, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.5555555555555556, |
| "grad_norm": 3.90625, |
| "learning_rate": 3.703703703703704e-05, |
| "loss": 0.3551, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.5777777777777777, |
| "grad_norm": 5.0625, |
| "learning_rate": 3.851851851851852e-05, |
| "loss": 0.3227, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 5.71875, |
| "learning_rate": 4e-05, |
| "loss": 0.4326, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.6222222222222222, |
| "grad_norm": 4.21875, |
| "learning_rate": 3.997513842437845e-05, |
| "loss": 0.2951, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.6444444444444445, |
| "grad_norm": 5.875, |
| "learning_rate": 3.990061550730803e-05, |
| "loss": 0.372, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "grad_norm": 3.953125, |
| "learning_rate": 3.977661652450257e-05, |
| "loss": 0.3775, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.6888888888888889, |
| "grad_norm": 4.03125, |
| "learning_rate": 3.9603449756970877e-05, |
| "loss": 0.3171, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.7111111111111111, |
| "grad_norm": 4.46875, |
| "learning_rate": 3.938154572458156e-05, |
| "loss": 0.3736, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.7333333333333333, |
| "grad_norm": 4.75, |
| "learning_rate": 3.911145611572282e-05, |
| "loss": 0.4523, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.7555555555555555, |
| "grad_norm": 4.0625, |
| "learning_rate": 3.879385241571817e-05, |
| "loss": 0.3798, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.7777777777777778, |
| "grad_norm": 4.78125, |
| "learning_rate": 3.842952423740816e-05, |
| "loss": 0.3211, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 5.46875, |
| "learning_rate": 3.801937735804838e-05, |
| "loss": 0.4251, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.8222222222222222, |
| "grad_norm": 3.328125, |
| "learning_rate": 3.756443146740457e-05, |
| "loss": 0.3788, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.8444444444444444, |
| "grad_norm": 3.984375, |
| "learning_rate": 3.7065817632643115e-05, |
| "loss": 0.3228, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.8666666666666667, |
| "grad_norm": 3.578125, |
| "learning_rate": 3.65247754863199e-05, |
| "loss": 0.331, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.8888888888888888, |
| "grad_norm": 4.09375, |
| "learning_rate": 3.5942650144458454e-05, |
| "loss": 0.3265, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.9111111111111111, |
| "grad_norm": 4.59375, |
| "learning_rate": 3.532088886237956e-05, |
| "loss": 0.3577, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.9333333333333333, |
| "grad_norm": 3.25, |
| "learning_rate": 3.4661037436596526e-05, |
| "loss": 0.2847, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.9555555555555556, |
| "grad_norm": 3.46875, |
| "learning_rate": 3.396473636172146e-05, |
| "loss": 0.3059, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.9777777777777777, |
| "grad_norm": 3.625, |
| "learning_rate": 3.323371675193719e-05, |
| "loss": 0.3829, |
| "step": 44 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 5.0, |
| "learning_rate": 3.246979603717467e-05, |
| "loss": 0.425, |
| "step": 45 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.3183366656303406, |
| "eval_runtime": 56.0572, |
| "eval_samples_per_second": 1.07, |
| "eval_steps_per_second": 1.07, |
| "step": 45 |
| }, |
| { |
| "epoch": 1.0222222222222221, |
| "grad_norm": 2.5, |
| "learning_rate": 3.1674873444695804e-05, |
| "loss": 0.2038, |
| "step": 46 |
| }, |
| { |
| "epoch": 1.0444444444444445, |
| "grad_norm": 2.703125, |
| "learning_rate": 3.0850925277315193e-05, |
| "loss": 0.2139, |
| "step": 47 |
| }, |
| { |
| "epoch": 1.0666666666666667, |
| "grad_norm": 2.9375, |
| "learning_rate": 3.0000000000000004e-05, |
| "loss": 0.2433, |
| "step": 48 |
| }, |
| { |
| "epoch": 1.0888888888888888, |
| "grad_norm": 2.3125, |
| "learning_rate": 2.9124213147063263e-05, |
| "loss": 0.1571, |
| "step": 49 |
| }, |
| { |
| "epoch": 1.1111111111111112, |
| "grad_norm": 3.09375, |
| "learning_rate": 2.8225742062612236e-05, |
| "loss": 0.1785, |
| "step": 50 |
| }, |
| { |
| "epoch": 1.1333333333333333, |
| "grad_norm": 2.4375, |
| "learning_rate": 2.7306820487327906e-05, |
| "loss": 0.1865, |
| "step": 51 |
| }, |
| { |
| "epoch": 1.1555555555555554, |
| "grad_norm": 2.984375, |
| "learning_rate": 2.6369733005033693e-05, |
| "loss": 0.2101, |
| "step": 52 |
| }, |
| { |
| "epoch": 1.1777777777777778, |
| "grad_norm": 4.15625, |
| "learning_rate": 2.5416809362860107e-05, |
| "loss": 0.1536, |
| "step": 53 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 4.09375, |
| "learning_rate": 2.445041867912629e-05, |
| "loss": 0.2781, |
| "step": 54 |
| }, |
| { |
| "epoch": 1.2222222222222223, |
| "grad_norm": 3.0625, |
| "learning_rate": 2.3472963553338614e-05, |
| "loss": 0.177, |
| "step": 55 |
| }, |
| { |
| "epoch": 1.2444444444444445, |
| "grad_norm": 3.46875, |
| "learning_rate": 2.2486874092949708e-05, |
| "loss": 0.1872, |
| "step": 56 |
| }, |
| { |
| "epoch": 1.2666666666666666, |
| "grad_norm": 2.359375, |
| "learning_rate": 2.149460187172849e-05, |
| "loss": 0.164, |
| "step": 57 |
| }, |
| { |
| "epoch": 1.2888888888888888, |
| "grad_norm": 2.921875, |
| "learning_rate": 2.0498613834761462e-05, |
| "loss": 0.19, |
| "step": 58 |
| }, |
| { |
| "epoch": 1.3111111111111111, |
| "grad_norm": 2.96875, |
| "learning_rate": 1.9501386165238548e-05, |
| "loss": 0.2249, |
| "step": 59 |
| }, |
| { |
| "epoch": 1.3333333333333333, |
| "grad_norm": 2.75, |
| "learning_rate": 1.8505398128271517e-05, |
| "loss": 0.1734, |
| "step": 60 |
| }, |
| { |
| "epoch": 1.3555555555555556, |
| "grad_norm": 2.640625, |
| "learning_rate": 1.7513125907050302e-05, |
| "loss": 0.1872, |
| "step": 61 |
| }, |
| { |
| "epoch": 1.3777777777777778, |
| "grad_norm": 2.109375, |
| "learning_rate": 1.6527036446661396e-05, |
| "loss": 0.1419, |
| "step": 62 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 2.484375, |
| "learning_rate": 1.5549581320873715e-05, |
| "loss": 0.1277, |
| "step": 63 |
| }, |
| { |
| "epoch": 1.4222222222222223, |
| "grad_norm": 2.84375, |
| "learning_rate": 1.4583190637139901e-05, |
| "loss": 0.1481, |
| "step": 64 |
| }, |
| { |
| "epoch": 1.4444444444444444, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.3630266994966314e-05, |
| "loss": 0.1743, |
| "step": 65 |
| }, |
| { |
| "epoch": 1.4666666666666668, |
| "grad_norm": 2.34375, |
| "learning_rate": 1.26931795126721e-05, |
| "loss": 0.1656, |
| "step": 66 |
| }, |
| { |
| "epoch": 1.488888888888889, |
| "grad_norm": 3.015625, |
| "learning_rate": 1.1774257937387774e-05, |
| "loss": 0.2075, |
| "step": 67 |
| }, |
| { |
| "epoch": 1.511111111111111, |
| "grad_norm": 2.71875, |
| "learning_rate": 1.087578685293674e-05, |
| "loss": 0.1918, |
| "step": 68 |
| }, |
| { |
| "epoch": 1.5333333333333332, |
| "grad_norm": 2.25, |
| "learning_rate": 1.0000000000000006e-05, |
| "loss": 0.1405, |
| "step": 69 |
| }, |
| { |
| "epoch": 1.5555555555555556, |
| "grad_norm": 2.640625, |
| "learning_rate": 9.149074722684815e-06, |
| "loss": 0.1773, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.5777777777777777, |
| "grad_norm": 2.140625, |
| "learning_rate": 8.325126555304208e-06, |
| "loss": 0.152, |
| "step": 71 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 1.8671875, |
| "learning_rate": 7.530203962825331e-06, |
| "loss": 0.1556, |
| "step": 72 |
| }, |
| { |
| "epoch": 1.6222222222222222, |
| "grad_norm": 2.046875, |
| "learning_rate": 6.766283248062817e-06, |
| "loss": 0.1345, |
| "step": 73 |
| }, |
| { |
| "epoch": 1.6444444444444444, |
| "grad_norm": 2.8125, |
| "learning_rate": 6.035263638278546e-06, |
| "loss": 0.2114, |
| "step": 74 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 2.390625, |
| "learning_rate": 5.338962563403478e-06, |
| "loss": 0.17, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.6888888888888889, |
| "grad_norm": 3.125, |
| "learning_rate": 4.679111137620442e-06, |
| "loss": 0.1565, |
| "step": 76 |
| }, |
| { |
| "epoch": 1.7111111111111112, |
| "grad_norm": 1.9609375, |
| "learning_rate": 4.057349855541557e-06, |
| "loss": 0.1036, |
| "step": 77 |
| }, |
| { |
| "epoch": 1.7333333333333334, |
| "grad_norm": 2.328125, |
| "learning_rate": 3.4752245136801065e-06, |
| "loss": 0.141, |
| "step": 78 |
| }, |
| { |
| "epoch": 1.7555555555555555, |
| "grad_norm": 1.8515625, |
| "learning_rate": 2.934182367356888e-06, |
| "loss": 0.1424, |
| "step": 79 |
| }, |
| { |
| "epoch": 1.7777777777777777, |
| "grad_norm": 2.578125, |
| "learning_rate": 2.435568532595427e-06, |
| "loss": 0.1477, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 2.15625, |
| "learning_rate": 1.9806226419516195e-06, |
| "loss": 0.1357, |
| "step": 81 |
| }, |
| { |
| "epoch": 1.8222222222222222, |
| "grad_norm": 3.875, |
| "learning_rate": 1.5704757625918454e-06, |
| "loss": 0.2686, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.8444444444444446, |
| "grad_norm": 2.265625, |
| "learning_rate": 1.2061475842818337e-06, |
| "loss": 0.1428, |
| "step": 83 |
| }, |
| { |
| "epoch": 1.8666666666666667, |
| "grad_norm": 2.84375, |
| "learning_rate": 8.885438842771843e-07, |
| "loss": 0.1711, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.8888888888888888, |
| "grad_norm": 2.203125, |
| "learning_rate": 6.184542754184431e-07, |
| "loss": 0.1814, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.911111111111111, |
| "grad_norm": 2.296875, |
| "learning_rate": 3.965502430291235e-07, |
| "loss": 0.2184, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.9333333333333333, |
| "grad_norm": 2.03125, |
| "learning_rate": 2.2338347549742956e-07, |
| "loss": 0.141, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.9555555555555557, |
| "grad_norm": 2.625, |
| "learning_rate": 9.938449269197181e-08, |
| "loss": 0.1823, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.9777777777777779, |
| "grad_norm": 1.9921875, |
| "learning_rate": 2.4861575621553112e-08, |
| "loss": 0.1372, |
| "step": 89 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 3.875, |
| "learning_rate": 0.0, |
| "loss": 0.1987, |
| "step": 90 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.26617029309272766, |
| "eval_runtime": 56.0717, |
| "eval_samples_per_second": 1.07, |
| "eval_steps_per_second": 1.07, |
| "step": 90 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 90, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 45, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.9020512231424e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|