{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6843455945252352, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01710863986313088, "grad_norm": 0.5642565488815308, "learning_rate": 9.8e-05, "loss": 4.3835, "step": 50 }, { "epoch": 0.03421727972626176, "grad_norm": 0.5467566251754761, "learning_rate": 0.00019800000000000002, "loss": 3.749, "step": 100 }, { "epoch": 0.05132591958939264, "grad_norm": 0.6452801823616028, "learning_rate": 0.00019985136108184378, "loss": 3.6567, "step": 150 }, { "epoch": 0.06843455945252352, "grad_norm": 0.5236774682998657, "learning_rate": 0.00019939371189454347, "loss": 3.6156, "step": 200 }, { "epoch": 0.0855431993156544, "grad_norm": 0.5809645652770996, "learning_rate": 0.00019862840708762515, "loss": 3.5816, "step": 250 }, { "epoch": 0.10265183917878529, "grad_norm": 0.5521299839019775, "learning_rate": 0.00019755781552648373, "loss": 3.5869, "step": 300 }, { "epoch": 0.11976047904191617, "grad_norm": 0.6748865246772766, "learning_rate": 0.0001961852510375556, "loss": 3.5605, "step": 350 }, { "epoch": 0.13686911890504705, "grad_norm": 0.5228627920150757, "learning_rate": 0.00019451496215095671, "loss": 3.5609, "step": 400 }, { "epoch": 0.15397775876817793, "grad_norm": 0.6597743034362793, "learning_rate": 0.0001925521189499101, "loss": 3.5223, "step": 450 }, { "epoch": 0.1710863986313088, "grad_norm": 0.6093740463256836, "learning_rate": 0.00019030279706766984, "loss": 3.4853, "step": 500 }, { "epoch": 0.1881950384944397, "grad_norm": 0.602202832698822, "learning_rate": 0.00018777395888147495, "loss": 3.5089, "step": 550 }, { "epoch": 0.20530367835757057, "grad_norm": 0.6418355703353882, "learning_rate": 0.00018497343196174478, "loss": 3.5126, "step": 600 }, { "epoch": 0.22241231822070145, "grad_norm": 0.6478594541549683, "learning_rate": 0.0001819098848432218, "loss": 3.506, "step": 650 }, { "epoch": 0.23952095808383234, "grad_norm": 0.5717478394508362, "learning_rate": 0.00017859280019305883, "loss": 3.4606, "step": 700 }, { "epoch": 0.2566295979469632, "grad_norm": 0.7009835839271545, "learning_rate": 0.00017503244545890345, "loss": 3.4376, "step": 750 }, { "epoch": 0.2737382378100941, "grad_norm": 0.6202488541603088, "learning_rate": 0.00017123984108783336, "loss": 3.4626, "step": 800 }, { "epoch": 0.290846877673225, "grad_norm": 0.6825055480003357, "learning_rate": 0.0001672267264145158, "loss": 3.4384, "step": 850 }, { "epoch": 0.30795551753635586, "grad_norm": 0.6829890608787537, "learning_rate": 0.00016300552332417753, "loss": 3.4205, "step": 900 }, { "epoch": 0.32506415739948674, "grad_norm": 0.6822900176048279, "learning_rate": 0.00015858929780286074, "loss": 3.4484, "step": 950 }, { "epoch": 0.3421727972626176, "grad_norm": 0.6704617142677307, "learning_rate": 0.00015399171949397882, "loss": 3.3913, "step": 1000 }, { "epoch": 0.3592814371257485, "grad_norm": 0.6927918791770935, "learning_rate": 0.00014922701938635793, "loss": 3.3897, "step": 1050 }, { "epoch": 0.3763900769888794, "grad_norm": 0.7238638401031494, "learning_rate": 0.0001443099457647332, "loss": 3.3743, "step": 1100 }, { "epoch": 0.39349871685201027, "grad_norm": 0.7207512855529785, "learning_rate": 0.00013925571855904722, "loss": 3.3237, "step": 1150 }, { "epoch": 0.41060735671514115, "grad_norm": 0.7312901616096497, "learning_rate": 0.0001340799822338543, "loss": 3.3378, "step": 1200 }, { "epoch": 0.42771599657827203, "grad_norm": 0.761360228061676, "learning_rate": 0.00012879875736365314, "loss": 3.3589, "step": 1250 }, { "epoch": 0.4448246364414029, "grad_norm": 0.7985053658485413, "learning_rate": 0.0001234283910440377, "loss": 3.3407, "step": 1300 }, { "epoch": 0.4619332763045338, "grad_norm": 0.8397980332374573, "learning_rate": 0.00011798550629216014, "loss": 3.3116, "step": 1350 }, { "epoch": 0.47904191616766467, "grad_norm": 0.8010730147361755, "learning_rate": 0.00011248695059312721, "loss": 3.2978, "step": 1400 }, { "epoch": 0.49615055603079555, "grad_norm": 0.9002817273139954, "learning_rate": 0.000106949743751596, "loss": 3.2647, "step": 1450 }, { "epoch": 0.5132591958939264, "grad_norm": 0.9080646634101868, "learning_rate": 0.00010139102520998512, "loss": 3.2929, "step": 1500 }, { "epoch": 0.5303678357570573, "grad_norm": 0.9352893829345703, "learning_rate": 9.582800099636817e-05, "loss": 3.2759, "step": 1550 }, { "epoch": 0.5474764756201882, "grad_norm": 1.0353022813796997, "learning_rate": 9.027789046626338e-05, "loss": 3.2525, "step": 1600 }, { "epoch": 0.564585115483319, "grad_norm": 0.9121481776237488, "learning_rate": 8.475787300317043e-05, "loss": 3.2432, "step": 1650 }, { "epoch": 0.58169375534645, "grad_norm": 0.9885613918304443, "learning_rate": 7.928503484283338e-05, "loss": 3.2033, "step": 1700 }, { "epoch": 0.5988023952095808, "grad_norm": 0.6203758120536804, "learning_rate": 7.387631618582624e-05, "loss": 3.1833, "step": 1750 }, { "epoch": 0.6159110350727117, "grad_norm": 1.0348588228225708, "learning_rate": 6.854845876216432e-05, "loss": 3.1845, "step": 1800 }, { "epoch": 0.6330196749358425, "grad_norm": 1.0622155666351318, "learning_rate": 6.331795401024621e-05, "loss": 3.2105, "step": 1850 }, { "epoch": 0.6501283147989735, "grad_norm": 0.9984121322631836, "learning_rate": 5.8200992030528875e-05, "loss": 3.1651, "step": 1900 }, { "epoch": 0.6672369546621043, "grad_norm": 1.0703413486480713, "learning_rate": 5.3213411471941155e-05, "loss": 3.1492, "step": 1950 }, { "epoch": 0.6843455945252352, "grad_norm": 1.0491068363189697, "learning_rate": 4.8370650506153093e-05, "loss": 3.1829, "step": 2000 } ], "logging_steps": 50, "max_steps": 2923, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.8590532124459008e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }