{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.999360204734485,
  "eval_steps": 500,
  "global_step": 781,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03198976327575176,
      "grad_norm": 0.16216526925563812,
      "learning_rate": 6.329113924050633e-05,
      "loss": 0.9047,
      "step": 25
    },
    {
      "epoch": 0.06397952655150352,
      "grad_norm": 0.22862769663333893,
      "learning_rate": 0.00012658227848101267,
      "loss": 0.7593,
      "step": 50
    },
    {
      "epoch": 0.09596928982725528,
      "grad_norm": 0.2344927191734314,
      "learning_rate": 0.00018987341772151899,
      "loss": 0.6731,
      "step": 75
    },
    {
      "epoch": 0.12795905310300704,
      "grad_norm": 0.2598811686038971,
      "learning_rate": 0.00019401709401709402,
      "loss": 0.6604,
      "step": 100
    },
    {
      "epoch": 0.1599488163787588,
      "grad_norm": 0.2400255650281906,
      "learning_rate": 0.0001868945868945869,
      "loss": 0.6304,
      "step": 125
    },
    {
      "epoch": 0.19193857965451055,
      "grad_norm": 0.24225808680057526,
      "learning_rate": 0.00017977207977207978,
      "loss": 0.6428,
      "step": 150
    },
    {
      "epoch": 0.22392834293026231,
      "grad_norm": 0.22859220206737518,
      "learning_rate": 0.00017264957264957268,
      "loss": 0.6154,
      "step": 175
    },
    {
      "epoch": 0.2559181062060141,
      "grad_norm": 0.23886021971702576,
      "learning_rate": 0.00016552706552706555,
      "loss": 0.6001,
      "step": 200
    },
    {
      "epoch": 0.28790786948176583,
      "grad_norm": 0.22877049446105957,
      "learning_rate": 0.00015840455840455842,
      "loss": 0.5983,
      "step": 225
    },
    {
      "epoch": 0.3198976327575176,
      "grad_norm": 0.24990104138851166,
      "learning_rate": 0.00015128205128205128,
      "loss": 0.5789,
      "step": 250
    },
    {
      "epoch": 0.35188739603326935,
      "grad_norm": 0.2319009006023407,
      "learning_rate": 0.00014415954415954415,
      "loss": 0.5851,
      "step": 275
    },
    {
      "epoch": 0.3838771593090211,
      "grad_norm": 0.22513625025749207,
      "learning_rate": 0.00013703703703703705,
      "loss": 0.5974,
      "step": 300
    },
    {
      "epoch": 0.41586692258477287,
      "grad_norm": 0.2516462504863739,
      "learning_rate": 0.00012991452991452992,
      "loss": 0.5811,
      "step": 325
    },
    {
      "epoch": 0.44785668586052463,
      "grad_norm": 0.23952844738960266,
      "learning_rate": 0.00012279202279202279,
      "loss": 0.5885,
      "step": 350
    },
    {
      "epoch": 0.4798464491362764,
      "grad_norm": 0.26743239164352417,
      "learning_rate": 0.00011566951566951567,
      "loss": 0.5812,
      "step": 375
    },
    {
      "epoch": 0.5118362124120281,
      "grad_norm": 0.24359311163425446,
      "learning_rate": 0.00010854700854700855,
      "loss": 0.5853,
      "step": 400
    },
    {
      "epoch": 0.5438259756877799,
      "grad_norm": 0.26046106219291687,
      "learning_rate": 0.00010142450142450144,
      "loss": 0.5717,
      "step": 425
    },
    {
      "epoch": 0.5758157389635317,
      "grad_norm": 0.2767123878002167,
      "learning_rate": 9.430199430199431e-05,
      "loss": 0.5711,
      "step": 450
    },
    {
      "epoch": 0.6078055022392834,
      "grad_norm": 0.2743181884288788,
      "learning_rate": 8.717948717948718e-05,
      "loss": 0.5786,
      "step": 475
    },
    {
      "epoch": 0.6397952655150352,
      "grad_norm": 0.2655166983604431,
      "learning_rate": 8.005698005698006e-05,
      "loss": 0.563,
      "step": 500
    },
    {
      "epoch": 0.6717850287907869,
      "grad_norm": 0.2630331814289093,
      "learning_rate": 7.293447293447295e-05,
      "loss": 0.5688,
      "step": 525
    },
    {
      "epoch": 0.7037747920665387,
      "grad_norm": 0.27685314416885376,
      "learning_rate": 6.581196581196581e-05,
      "loss": 0.5687,
      "step": 550
    },
    {
      "epoch": 0.7357645553422905,
      "grad_norm": 0.2695849537849426,
      "learning_rate": 5.868945868945869e-05,
      "loss": 0.5592,
      "step": 575
    },
    {
      "epoch": 0.7677543186180422,
      "grad_norm": 0.25096848607063293,
      "learning_rate": 5.156695156695157e-05,
      "loss": 0.5477,
      "step": 600
    },
    {
      "epoch": 0.799744081893794,
      "grad_norm": 0.2821820378303528,
      "learning_rate": 4.4444444444444447e-05,
      "loss": 0.5574,
      "step": 625
    },
    {
      "epoch": 0.8317338451695457,
      "grad_norm": 0.26849839091300964,
      "learning_rate": 3.732193732193732e-05,
      "loss": 0.5662,
      "step": 650
    },
    {
      "epoch": 0.8637236084452975,
      "grad_norm": 0.27688708901405334,
      "learning_rate": 3.01994301994302e-05,
      "loss": 0.5485,
      "step": 675
    },
    {
      "epoch": 0.8957133717210493,
      "grad_norm": 0.2868192195892334,
      "learning_rate": 2.307692307692308e-05,
      "loss": 0.5493,
      "step": 700
    },
    {
      "epoch": 0.927703134996801,
      "grad_norm": 0.28862541913986206,
      "learning_rate": 1.5954415954415954e-05,
      "loss": 0.5494,
      "step": 725
    },
    {
      "epoch": 0.9596928982725528,
      "grad_norm": 0.2842100262641907,
      "learning_rate": 8.831908831908831e-06,
      "loss": 0.5582,
      "step": 750
    },
    {
      "epoch": 0.9916826615483045,
      "grad_norm": 0.2615242302417755,
      "learning_rate": 1.7094017094017097e-06,
      "loss": 0.5641,
      "step": 775
    }
  ],
  "logging_steps": 25,
  "max_steps": 781,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "total_flos": 2.5396281704290714e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}