{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.24, "eval_steps": 500, "global_step": 30, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 15.431024551391602, "learning_rate": 0.0, "loss": 5.2835, "step": 1 }, { "epoch": 0.016, "grad_norm": 15.431879043579102, "learning_rate": 4e-05, "loss": 5.2835, "step": 2 }, { "epoch": 0.024, "grad_norm": 14.52097225189209, "learning_rate": 8e-05, "loss": 4.9335, "step": 3 }, { "epoch": 0.032, "grad_norm": 7.527328014373779, "learning_rate": 0.00012, "loss": 3.6646, "step": 4 }, { "epoch": 0.04, "grad_norm": 10.276887893676758, "learning_rate": 0.00016, "loss": 2.6016, "step": 5 }, { "epoch": 0.048, "grad_norm": 6.039588451385498, "learning_rate": 0.0002, "loss": 1.7373, "step": 6 }, { "epoch": 0.056, "grad_norm": 3.487266778945923, "learning_rate": 0.000192, "loss": 0.8735, "step": 7 }, { "epoch": 0.064, "grad_norm": 2.9169163703918457, "learning_rate": 0.00018400000000000003, "loss": 0.3411, "step": 8 }, { "epoch": 0.072, "grad_norm": 0.7490823864936829, "learning_rate": 0.00017600000000000002, "loss": 0.0589, "step": 9 }, { "epoch": 0.08, "grad_norm": 1.4124163389205933, "learning_rate": 0.000168, "loss": 0.0533, "step": 10 }, { "epoch": 0.088, "grad_norm": 1.3603249788284302, "learning_rate": 0.00016, "loss": 0.0397, "step": 11 }, { "epoch": 0.096, "grad_norm": 0.7485616207122803, "learning_rate": 0.000152, "loss": 0.0232, "step": 12 }, { "epoch": 0.104, "grad_norm": 0.3160126805305481, "learning_rate": 0.000144, "loss": 0.0114, "step": 13 }, { "epoch": 0.112, "grad_norm": 0.39901483058929443, "learning_rate": 0.00013600000000000003, "loss": 0.0075, "step": 14 }, { "epoch": 0.12, "grad_norm": 0.38916680216789246, "learning_rate": 0.00012800000000000002, "loss": 0.0051, "step": 15 }, { "epoch": 0.128, "grad_norm": 0.3773799538612366, "learning_rate": 0.00012, "loss": 0.0038, "step": 16 }, { "epoch": 0.136, "grad_norm": 0.21171778440475464, "learning_rate": 0.00011200000000000001, "loss": 0.0021, "step": 17 }, { "epoch": 0.144, "grad_norm": 0.12317345291376114, "learning_rate": 0.00010400000000000001, "loss": 0.0013, "step": 18 }, { "epoch": 0.152, "grad_norm": 0.06952201575040817, "learning_rate": 9.6e-05, "loss": 0.0008, "step": 19 }, { "epoch": 0.16, "grad_norm": 0.05174114927649498, "learning_rate": 8.800000000000001e-05, "loss": 0.0006, "step": 20 }, { "epoch": 0.168, "grad_norm": 0.03263603150844574, "learning_rate": 8e-05, "loss": 0.0005, "step": 21 }, { "epoch": 0.176, "grad_norm": 0.022345170378684998, "learning_rate": 7.2e-05, "loss": 0.0003, "step": 22 }, { "epoch": 0.184, "grad_norm": 0.019066136330366135, "learning_rate": 6.400000000000001e-05, "loss": 0.0003, "step": 23 }, { "epoch": 0.192, "grad_norm": 0.01889193058013916, "learning_rate": 5.6000000000000006e-05, "loss": 0.0003, "step": 24 }, { "epoch": 0.2, "grad_norm": 0.013505947776138783, "learning_rate": 4.8e-05, "loss": 0.0002, "step": 25 }, { "epoch": 0.208, "grad_norm": 0.011558088473975658, "learning_rate": 4e-05, "loss": 0.0002, "step": 26 }, { "epoch": 0.216, "grad_norm": 0.010295114479959011, "learning_rate": 3.2000000000000005e-05, "loss": 0.0002, "step": 27 }, { "epoch": 0.224, "grad_norm": 0.009663024917244911, "learning_rate": 2.4e-05, "loss": 0.0002, "step": 28 }, { "epoch": 0.232, "grad_norm": 0.009915046393871307, "learning_rate": 1.6000000000000003e-05, "loss": 0.0002, "step": 29 }, { "epoch": 0.24, "grad_norm": 0.0089695630595088, "learning_rate": 8.000000000000001e-06, "loss": 0.0002, "step": 30 } ], "logging_steps": 1, "max_steps": 30, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 525137523027840.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }