{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9971777986829727,
  "eval_steps": 500,
  "global_step": 265,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04,
      "grad_norm": 2.453125,
      "learning_rate": 2.5e-06,
      "loss": 1.6646,
      "step": 10
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.71875,
      "learning_rate": 5e-06,
      "loss": 1.633,
      "step": 20
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.453125,
      "learning_rate": 7.500000000000001e-06,
      "loss": 1.5999,
      "step": 30
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.921875,
      "learning_rate": 1e-05,
      "loss": 1.6187,
      "step": 40
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.625,
      "learning_rate": 1.25e-05,
      "loss": 1.5918,
      "step": 50
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.59765625,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 1.5936,
      "step": 60
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.015625,
      "learning_rate": 1.7500000000000002e-05,
      "loss": 1.5263,
      "step": 70
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.92578125,
      "learning_rate": 2e-05,
      "loss": 1.4784,
      "step": 80
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.474609375,
      "learning_rate": 1.999034865600726e-05,
      "loss": 1.3839,
      "step": 90
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.2109375,
      "learning_rate": 1.9961413253717214e-05,
      "loss": 1.3698,
      "step": 100
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.0390625,
      "learning_rate": 1.9913249646234072e-05,
      "loss": 1.2916,
      "step": 110
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.35546875,
      "learning_rate": 1.9845950802266584e-05,
      "loss": 1.2607,
      "step": 120
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.375,
      "learning_rate": 1.9759646626673445e-05,
      "loss": 1.2405,
      "step": 130
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.8828125,
      "learning_rate": 1.9654503709711984e-05,
      "loss": 1.2615,
      "step": 140
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.2421875,
      "learning_rate": 1.9530725005474195e-05,
      "loss": 1.1433,
      "step": 150
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.404296875,
      "learning_rate": 1.93885494401308e-05,
      "loss": 1.1599,
      "step": 160
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.91796875,
      "learning_rate": 1.9228251450739495e-05,
      "loss": 1.1325,
      "step": 170
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.55859375,
      "learning_rate": 1.905014045550767e-05,
      "loss": 1.0797,
      "step": 180
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.375,
      "learning_rate": 1.8854560256532098e-05,
      "loss": 1.041,
      "step": 190
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.2265625,
      "learning_rate": 1.8641888376168483e-05,
      "loss": 1.0286,
      "step": 200
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.287109375,
      "learning_rate": 1.8412535328311813e-05,
      "loss": 0.9831,
      "step": 210
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.28515625,
      "learning_rate": 1.816694382599422e-05,
      "loss": 0.9927,
      "step": 220
    },
    {
      "epoch": 0.87,
      "grad_norm": 3.25,
      "learning_rate": 1.7905587926829815e-05,
      "loss": 1.0075,
      "step": 230
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.255859375,
      "learning_rate": 1.762897211795607e-05,
      "loss": 1.0232,
      "step": 240
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.6484375,
      "learning_rate": 1.733763034223804e-05,
      "loss": 0.9476,
      "step": 250
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.67578125,
      "learning_rate": 1.7032124967615112e-05,
      "loss": 0.9338,
      "step": 260
    }
  ],
  "logging_steps": 10,
  "max_steps": 795,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 5.1943003127808e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}