{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.536, "eval_steps": 500, "global_step": 240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032, "grad_norm": 0.047607421875, "learning_rate": 0.0001, "loss": 0.3836, "step": 5 }, { "epoch": 0.064, "grad_norm": 0.04541015625, "learning_rate": 0.0001, "loss": 0.2449, "step": 10 }, { "epoch": 0.096, "grad_norm": 0.056396484375, "learning_rate": 0.0001, "loss": 0.1513, "step": 15 }, { "epoch": 0.128, "grad_norm": 0.03857421875, "learning_rate": 0.0001, "loss": 0.0705, "step": 20 }, { "epoch": 0.16, "grad_norm": 0.0286865234375, "learning_rate": 0.0001, "loss": 0.0488, "step": 25 }, { "epoch": 0.192, "grad_norm": 0.03173828125, "learning_rate": 0.0001, "loss": 0.0391, "step": 30 }, { "epoch": 0.224, "grad_norm": 0.054931640625, "learning_rate": 0.0001, "loss": 0.0278, "step": 35 }, { "epoch": 0.256, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.1414, "step": 40 }, { "epoch": 0.288, "grad_norm": 0.015625, "learning_rate": 0.0001, "loss": 0.0371, "step": 45 }, { "epoch": 0.32, "grad_norm": 0.0257568359375, "learning_rate": 0.0001, "loss": 0.0118, "step": 50 }, { "epoch": 0.352, "grad_norm": 0.02197265625, "learning_rate": 0.0001, "loss": 0.0101, "step": 55 }, { "epoch": 0.384, "grad_norm": 0.020751953125, "learning_rate": 0.0001, "loss": 0.0098, "step": 60 }, { "epoch": 0.416, "grad_norm": 0.0164794921875, "learning_rate": 0.0001, "loss": 0.0088, "step": 65 }, { "epoch": 0.448, "grad_norm": 0.0120849609375, "learning_rate": 0.0001, "loss": 0.0086, "step": 70 }, { "epoch": 0.48, "grad_norm": 0.0269775390625, "learning_rate": 0.0001, "loss": 0.0168, "step": 75 }, { "epoch": 0.512, "grad_norm": 0.05078125, "learning_rate": 0.0001, "loss": 0.0572, "step": 80 }, { "epoch": 0.544, "grad_norm": 0.031982421875, "learning_rate": 0.0001, "loss": 0.0092, "step": 85 }, { "epoch": 0.576, "grad_norm": 0.0196533203125, "learning_rate": 0.0001, "loss": 0.0077, "step": 90 }, { "epoch": 0.608, "grad_norm": 0.0238037109375, "learning_rate": 0.0001, "loss": 0.0054, "step": 95 }, { "epoch": 0.64, "grad_norm": 0.0108642578125, "learning_rate": 0.0001, "loss": 0.0043, "step": 100 }, { "epoch": 0.672, "grad_norm": 0.0091552734375, "learning_rate": 0.0001, "loss": 0.004, "step": 105 }, { "epoch": 0.704, "grad_norm": 0.01336669921875, "learning_rate": 0.0001, "loss": 0.0043, "step": 110 }, { "epoch": 0.736, "grad_norm": 0.033203125, "learning_rate": 0.0001, "loss": 0.0122, "step": 115 }, { "epoch": 0.768, "grad_norm": 0.0169677734375, "learning_rate": 0.0001, "loss": 0.0173, "step": 120 }, { "epoch": 0.8, "grad_norm": 0.00909423828125, "learning_rate": 0.0001, "loss": 0.0031, "step": 125 }, { "epoch": 0.832, "grad_norm": 0.01171875, "learning_rate": 0.0001, "loss": 0.0038, "step": 130 }, { "epoch": 0.864, "grad_norm": 0.00946044921875, "learning_rate": 0.0001, "loss": 0.0036, "step": 135 }, { "epoch": 0.896, "grad_norm": 0.014892578125, "learning_rate": 0.0001, "loss": 0.0047, "step": 140 }, { "epoch": 0.928, "grad_norm": 0.01239013671875, "learning_rate": 0.0001, "loss": 0.006, "step": 145 }, { "epoch": 0.96, "grad_norm": 0.00982666015625, "learning_rate": 0.0001, "loss": 0.0032, "step": 150 }, { "epoch": 0.992, "grad_norm": 0.01031494140625, "learning_rate": 0.0001, "loss": 0.0037, "step": 155 }, { "epoch": 1.024, "grad_norm": 0.006927490234375, "learning_rate": 0.0001, "loss": 0.0036, "step": 160 }, { "epoch": 1.056, "grad_norm": 0.0084228515625, "learning_rate": 0.0001, "loss": 0.0017, "step": 165 }, { "epoch": 1.088, "grad_norm": 0.005584716796875, "learning_rate": 0.0001, "loss": 0.0018, "step": 170 }, { "epoch": 1.12, "grad_norm": 0.006683349609375, "learning_rate": 0.0001, "loss": 0.0017, "step": 175 }, { "epoch": 1.152, "grad_norm": 0.004486083984375, "learning_rate": 0.0001, "loss": 0.0016, "step": 180 }, { "epoch": 1.184, "grad_norm": 0.0087890625, "learning_rate": 0.0001, "loss": 0.0026, "step": 185 }, { "epoch": 1.216, "grad_norm": 0.0062255859375, "learning_rate": 0.0001, "loss": 0.0015, "step": 190 }, { "epoch": 1.248, "grad_norm": 0.0128173828125, "learning_rate": 0.0001, "loss": 0.0026, "step": 195 }, { "epoch": 1.28, "grad_norm": 0.006683349609375, "learning_rate": 0.0001, "loss": 0.0039, "step": 200 }, { "epoch": 1.312, "grad_norm": 0.00787353515625, "learning_rate": 0.0001, "loss": 0.0019, "step": 205 }, { "epoch": 1.3439999999999999, "grad_norm": 0.0096435546875, "learning_rate": 0.0001, "loss": 0.0011, "step": 210 }, { "epoch": 1.376, "grad_norm": 0.0096435546875, "learning_rate": 0.0001, "loss": 0.0016, "step": 215 }, { "epoch": 1.408, "grad_norm": 0.005859375, "learning_rate": 0.0001, "loss": 0.0014, "step": 220 }, { "epoch": 1.44, "grad_norm": 0.00848388671875, "learning_rate": 0.0001, "loss": 0.0014, "step": 225 }, { "epoch": 1.472, "grad_norm": 0.015625, "learning_rate": 0.0001, "loss": 0.002, "step": 230 }, { "epoch": 1.504, "grad_norm": 0.03857421875, "learning_rate": 0.0001, "loss": 0.0067, "step": 235 }, { "epoch": 1.536, "grad_norm": 0.00811767578125, "learning_rate": 0.0001, "loss": 0.0062, "step": 240 } ], "logging_steps": 5, "max_steps": 240, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 90, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.355246833433805e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }