{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.96,
  "eval_steps": 500,
  "global_step": 310,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.16,
      "grad_norm": 23.102539576958876,
      "learning_rate": 1.0752688172043011e-06,
      "loss": 2.3229,
      "step": 10
    },
    {
      "epoch": 0.32,
      "grad_norm": 7.585603806935928,
      "learning_rate": 2.1505376344086023e-06,
      "loss": 1.8243,
      "step": 20
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.2963511032502937,
      "learning_rate": 3.225806451612903e-06,
      "loss": 1.0535,
      "step": 30
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.0735883950229164,
      "learning_rate": 4.3010752688172045e-06,
      "loss": 0.6957,
      "step": 40
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.0448299606211808,
      "learning_rate": 5.376344086021506e-06,
      "loss": 0.5694,
      "step": 50
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.6110446099263456,
      "learning_rate": 6.451612903225806e-06,
      "loss": 0.5119,
      "step": 60
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.5939516414788868,
      "learning_rate": 7.526881720430108e-06,
      "loss": 0.4705,
      "step": 70
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.5407169140835424,
      "learning_rate": 8.602150537634409e-06,
      "loss": 0.3921,
      "step": 80
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.4210892519038007,
      "learning_rate": 9.67741935483871e-06,
      "loss": 0.3977,
      "step": 90
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.3772090447904717,
      "learning_rate": 9.998274321315453e-06,
      "loss": 0.3705,
      "step": 100
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.6336085876305326,
      "learning_rate": 9.989824885009142e-06,
      "loss": 0.3523,
      "step": 110
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.4337054073917208,
      "learning_rate": 9.974346616959476e-06,
      "loss": 0.3577,
      "step": 120
    },
    {
      "epoch": 2.08,
      "grad_norm": 0.3772473620821422,
      "learning_rate": 9.951861320364822e-06,
      "loss": 0.3559,
      "step": 130
    },
    {
      "epoch": 2.24,
      "grad_norm": 0.39779187628182006,
      "learning_rate": 9.922400668754833e-06,
      "loss": 0.2897,
      "step": 140
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.38309322843856186,
      "learning_rate": 9.88600616137407e-06,
      "loss": 0.2848,
      "step": 150
    },
    {
      "epoch": 2.56,
      "grad_norm": 0.5417645489744292,
      "learning_rate": 9.8427290647248e-06,
      "loss": 0.2789,
      "step": 160
    },
    {
      "epoch": 2.7199999999999998,
      "grad_norm": 0.4069370179701877,
      "learning_rate": 9.792630340351301e-06,
      "loss": 0.2634,
      "step": 170
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.38899969177115107,
      "learning_rate": 9.735780558967434e-06,
      "loss": 0.2697,
      "step": 180
    },
    {
      "epoch": 3.04,
      "grad_norm": 0.41142994330371174,
      "learning_rate": 9.67225980104841e-06,
      "loss": 0.2718,
      "step": 190
    },
    {
      "epoch": 3.2,
      "grad_norm": 0.4601501476242835,
      "learning_rate": 9.602157544026785e-06,
      "loss": 0.184,
      "step": 200
    },
    {
      "epoch": 3.36,
      "grad_norm": 0.4047321863260235,
      "learning_rate": 9.525572536251608e-06,
      "loss": 0.1853,
      "step": 210
    },
    {
      "epoch": 3.52,
      "grad_norm": 0.504432502658599,
      "learning_rate": 9.442612657888237e-06,
      "loss": 0.1781,
      "step": 220
    },
    {
      "epoch": 3.68,
      "grad_norm": 0.5750425579505938,
      "learning_rate": 9.353394768954791e-06,
      "loss": 0.191,
      "step": 230
    },
    {
      "epoch": 3.84,
      "grad_norm": 0.3988291661235405,
      "learning_rate": 9.258044544709276e-06,
      "loss": 0.1764,
      "step": 240
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.7118375771625418,
      "learning_rate": 9.156696298619266e-06,
      "loss": 0.1956,
      "step": 250
    },
    {
      "epoch": 4.16,
      "grad_norm": 0.47011665809938036,
      "learning_rate": 9.049492793163539e-06,
      "loss": 0.1034,
      "step": 260
    },
    {
      "epoch": 4.32,
      "grad_norm": 0.42719370359955,
      "learning_rate": 8.936585038732143e-06,
      "loss": 0.0999,
      "step": 270
    },
    {
      "epoch": 4.48,
      "grad_norm": 0.4505633813264433,
      "learning_rate": 8.818132080908178e-06,
      "loss": 0.0989,
      "step": 280
    },
    {
      "epoch": 4.64,
      "grad_norm": 0.3870120698399163,
      "learning_rate": 8.694300776430958e-06,
      "loss": 0.0957,
      "step": 290
    },
    {
      "epoch": 4.8,
      "grad_norm": 0.43164855953731046,
      "learning_rate": 8.565265558156101e-06,
      "loss": 0.0985,
      "step": 300
    },
    {
      "epoch": 4.96,
      "grad_norm": 0.44351115954290926,
      "learning_rate": 8.43120818934367e-06,
      "loss": 0.09,
      "step": 310
    }
  ],
  "logging_steps": 10,
  "max_steps": 930,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 310,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 55946563551232.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}