| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.504, | |
| "eval_steps": 500, | |
| "global_step": 470, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 0.0001, | |
| "loss": 4.2845, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 0.0001, | |
| "loss": 2.611, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 0.0001, | |
| "loss": 2.1007, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 0.0001, | |
| "loss": 2.0667, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 0.0001, | |
| "loss": 1.6745, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 0.0001, | |
| "loss": 1.4179, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 0.0001, | |
| "loss": 1.256, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 0.1767578125, | |
| "learning_rate": 0.0001, | |
| "loss": 1.1206, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 0.1806640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8113, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5563, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 0.47265625, | |
| "learning_rate": 0.0001, | |
| "loss": 1.2945, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 0.0001, | |
| "loss": 1.1513, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0038, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 0.11376953125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9775, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9107, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 0.13671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8357, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8438, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 0.1318359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8182, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 0.1220703125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6811, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.1474609375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5087, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 0.1767578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9827, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 0.1943359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9673, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9514, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 0.1630859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8378, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.1220703125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8721, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8317, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7948, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7682, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 0.107421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6472, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.1513671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.463, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 0.16015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8907, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 0.1337890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8254, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8455, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 0.15625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8194, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.166015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8291, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 0.1787109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7265, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7856, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7599, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 0.130859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6127, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4152, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 0.1943359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8772, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 0.197265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7661, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 0.173828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8362, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 0.18359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6781, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.1708984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7479, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 0.177734375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6598, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 0.177734375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7109, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 0.1337890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6603, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 0.1494140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5983, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.1630859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3945, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 0.173828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7734, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7553, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 0.177734375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8062, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 0.150390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6815, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.1689453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7524, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 0.1572265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6798, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 0.177734375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7037, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 0.1591796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6274, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 0.1630859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6103, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3983, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 0.1611328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6683, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6045, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.008, | |
| "grad_norm": 0.1630859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5759, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.024, | |
| "grad_norm": 0.1904296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5826, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 0.2490234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6502, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.056, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6278, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.072, | |
| "grad_norm": 0.1904296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6155, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.088, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6104, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.104, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5942, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6177, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 0.1796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5307, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.152, | |
| "grad_norm": 0.1630859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.443, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.168, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4582, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.184, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6175, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6191, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.216, | |
| "grad_norm": 0.25, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5887, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.232, | |
| "grad_norm": 0.1962890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5517, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.248, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5712, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.264, | |
| "grad_norm": 0.1904296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5526, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6027, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.296, | |
| "grad_norm": 0.1865234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5325, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.312, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4752, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.328, | |
| "grad_norm": 0.25, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4214, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6299, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6215, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.376, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5869, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.392, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5448, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.408, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6038, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.424, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5647, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5564, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.456, | |
| "grad_norm": 0.1796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4994, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.472, | |
| "grad_norm": 0.25, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4244, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.488, | |
| "grad_norm": 0.2451171875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4652, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.504, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5929, | |
| "step": 470 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 470, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 90, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.7094162776644813e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |