{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 660, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15151515151515152, "grad_norm": 0.796875, "learning_rate": 3.6363636363636366e-06, "loss": 0.1258, "step": 5 }, { "epoch": 0.30303030303030304, "grad_norm": 0.69140625, "learning_rate": 8.181818181818181e-06, "loss": 0.1212, "step": 10 }, { "epoch": 0.45454545454545453, "grad_norm": 0.421875, "learning_rate": 1.2727272727272728e-05, "loss": 0.1055, "step": 15 }, { "epoch": 0.6060606060606061, "grad_norm": 0.2236328125, "learning_rate": 1.7272727272727274e-05, "loss": 0.0963, "step": 20 }, { "epoch": 0.7575757575757576, "grad_norm": 0.2392578125, "learning_rate": 2.1818181818181818e-05, "loss": 0.0931, "step": 25 }, { "epoch": 0.9090909090909091, "grad_norm": 0.1796875, "learning_rate": 2.6363636363636365e-05, "loss": 0.0903, "step": 30 }, { "epoch": 1.0606060606060606, "grad_norm": 0.1552734375, "learning_rate": 2.9999830539872836e-05, "loss": 0.0843, "step": 35 }, { "epoch": 1.2121212121212122, "grad_norm": 0.150390625, "learning_rate": 2.9993899882114902e-05, "loss": 0.0853, "step": 40 }, { "epoch": 1.3636363636363638, "grad_norm": 0.138671875, "learning_rate": 2.997950047184977e-05, "loss": 0.0804, "step": 45 }, { "epoch": 1.5151515151515151, "grad_norm": 0.146484375, "learning_rate": 2.9956641346126986e-05, "loss": 0.0809, "step": 50 }, { "epoch": 1.6666666666666665, "grad_norm": 0.1396484375, "learning_rate": 2.9925336851301575e-05, "loss": 0.0795, "step": 55 }, { "epoch": 1.8181818181818183, "grad_norm": 0.1376953125, "learning_rate": 2.9885606634030267e-05, "loss": 0.0789, "step": 60 }, { "epoch": 1.9696969696969697, "grad_norm": 0.134765625, "learning_rate": 2.98374756289413e-05, "loss": 0.0778, "step": 65 }, { "epoch": 2.121212121212121, "grad_norm": 0.1494140625, "learning_rate": 2.9780974042985506e-05, "loss": 0.0761, "step": 70 }, { "epoch": 2.2727272727272725, "grad_norm": 0.2177734375, "learning_rate": 2.971613733647841e-05, "loss": 0.0751, "step": 75 }, { "epoch": 2.4242424242424243, "grad_norm": 0.1435546875, "learning_rate": 2.9643006200845458e-05, "loss": 0.0756, "step": 80 }, { "epoch": 2.5757575757575757, "grad_norm": 0.1376953125, "learning_rate": 2.9561626533084068e-05, "loss": 0.0765, "step": 85 }, { "epoch": 2.7272727272727275, "grad_norm": 0.1318359375, "learning_rate": 2.9472049406958788e-05, "loss": 0.0746, "step": 90 }, { "epoch": 2.878787878787879, "grad_norm": 0.1455078125, "learning_rate": 2.937433104094746e-05, "loss": 0.0757, "step": 95 }, { "epoch": 3.0303030303030303, "grad_norm": 0.1328125, "learning_rate": 2.9268532762958568e-05, "loss": 0.0725, "step": 100 }, { "epoch": 3.1818181818181817, "grad_norm": 0.1376953125, "learning_rate": 2.915472097184196e-05, "loss": 0.0742, "step": 105 }, { "epoch": 3.3333333333333335, "grad_norm": 0.1318359375, "learning_rate": 2.903296709571698e-05, "loss": 0.0707, "step": 110 }, { "epoch": 3.484848484848485, "grad_norm": 0.1337890625, "learning_rate": 2.8903347547144327e-05, "loss": 0.0734, "step": 115 }, { "epoch": 3.6363636363636362, "grad_norm": 0.142578125, "learning_rate": 2.876594367516961e-05, "loss": 0.0724, "step": 120 }, { "epoch": 3.787878787878788, "grad_norm": 0.1318359375, "learning_rate": 2.8620841714268804e-05, "loss": 0.0725, "step": 125 }, { "epoch": 3.9393939393939394, "grad_norm": 0.1484375, "learning_rate": 2.846813273022764e-05, "loss": 0.0714, "step": 130 }, { "epoch": 4.090909090909091, "grad_norm": 0.134765625, "learning_rate": 2.83079125629888e-05, "loss": 0.0727, "step": 135 }, { "epoch": 4.242424242424242, "grad_norm": 0.146484375, "learning_rate": 2.8140281766502957e-05, "loss": 0.0716, "step": 140 }, { "epoch": 4.393939393939394, "grad_norm": 0.140625, "learning_rate": 2.7965345545621217e-05, "loss": 0.072, "step": 145 }, { "epoch": 4.545454545454545, "grad_norm": 0.1455078125, "learning_rate": 2.7783213690068737e-05, "loss": 0.0701, "step": 150 }, { "epoch": 4.696969696969697, "grad_norm": 0.1435546875, "learning_rate": 2.7594000505540807e-05, "loss": 0.0741, "step": 155 }, { "epoch": 4.848484848484849, "grad_norm": 0.142578125, "learning_rate": 2.7397824741964805e-05, "loss": 0.0665, "step": 160 }, { "epoch": 5.0, "grad_norm": 0.189453125, "learning_rate": 2.7194809518972856e-05, "loss": 0.0705, "step": 165 }, { "epoch": 5.151515151515151, "grad_norm": 0.1494140625, "learning_rate": 2.6985082248632174e-05, "loss": 0.0679, "step": 170 }, { "epoch": 5.303030303030303, "grad_norm": 0.14453125, "learning_rate": 2.676877455548141e-05, "loss": 0.0693, "step": 175 }, { "epoch": 5.454545454545454, "grad_norm": 0.1376953125, "learning_rate": 2.6546022193923274e-05, "loss": 0.0696, "step": 180 }, { "epoch": 5.606060606060606, "grad_norm": 0.1435546875, "learning_rate": 2.631696496302526e-05, "loss": 0.0709, "step": 185 }, { "epoch": 5.757575757575758, "grad_norm": 0.142578125, "learning_rate": 2.6081746618781953e-05, "loss": 0.0694, "step": 190 }, { "epoch": 5.909090909090909, "grad_norm": 0.1357421875, "learning_rate": 2.584051478389399e-05, "loss": 0.0682, "step": 195 }, { "epoch": 6.0606060606060606, "grad_norm": 0.154296875, "learning_rate": 2.559342085512022e-05, "loss": 0.0686, "step": 200 }, { "epoch": 6.212121212121212, "grad_norm": 0.154296875, "learning_rate": 2.5340619908261352e-05, "loss": 0.0703, "step": 205 }, { "epoch": 6.363636363636363, "grad_norm": 0.140625, "learning_rate": 2.508227060083457e-05, "loss": 0.0647, "step": 210 }, { "epoch": 6.515151515151516, "grad_norm": 0.1376953125, "learning_rate": 2.4818535072500327e-05, "loss": 0.064, "step": 215 }, { "epoch": 6.666666666666667, "grad_norm": 0.142578125, "learning_rate": 2.4549578843303708e-05, "loss": 0.0676, "step": 220 }, { "epoch": 6.818181818181818, "grad_norm": 0.14453125, "learning_rate": 2.427557070979427e-05, "loss": 0.0669, "step": 225 }, { "epoch": 6.96969696969697, "grad_norm": 0.1376953125, "learning_rate": 2.399668263908961e-05, "loss": 0.0679, "step": 230 }, { "epoch": 7.121212121212121, "grad_norm": 0.1357421875, "learning_rate": 2.3713089660948985e-05, "loss": 0.0666, "step": 235 }, { "epoch": 7.2727272727272725, "grad_norm": 0.1455078125, "learning_rate": 2.342496975792494e-05, "loss": 0.066, "step": 240 }, { "epoch": 7.424242424242424, "grad_norm": 0.123046875, "learning_rate": 2.313250375366167e-05, "loss": 0.0637, "step": 245 }, { "epoch": 7.575757575757576, "grad_norm": 0.1298828125, "learning_rate": 2.283587519941036e-05, "loss": 0.0683, "step": 250 }, { "epoch": 7.7272727272727275, "grad_norm": 0.1435546875, "learning_rate": 2.253527025883271e-05, "loss": 0.0642, "step": 255 }, { "epoch": 7.878787878787879, "grad_norm": 0.1533203125, "learning_rate": 2.2230877591164858e-05, "loss": 0.0682, "step": 260 }, { "epoch": 8.030303030303031, "grad_norm": 0.1328125, "learning_rate": 2.192288823281509e-05, "loss": 0.0628, "step": 265 }, { "epoch": 8.181818181818182, "grad_norm": 0.158203125, "learning_rate": 2.1611495477469712e-05, "loss": 0.0635, "step": 270 }, { "epoch": 8.333333333333334, "grad_norm": 0.15234375, "learning_rate": 2.1296894754782155e-05, "loss": 0.0679, "step": 275 }, { "epoch": 8.484848484848484, "grad_norm": 0.140625, "learning_rate": 2.0979283507721653e-05, "loss": 0.0631, "step": 280 }, { "epoch": 8.636363636363637, "grad_norm": 0.12890625, "learning_rate": 2.0658861068658254e-05, "loss": 0.0634, "step": 285 }, { "epoch": 8.787878787878787, "grad_norm": 0.14453125, "learning_rate": 2.0335828534262148e-05, "loss": 0.0652, "step": 290 }, { "epoch": 8.93939393939394, "grad_norm": 0.1484375, "learning_rate": 2.001038863929568e-05, "loss": 0.067, "step": 295 }, { "epoch": 9.090909090909092, "grad_norm": 0.1572265625, "learning_rate": 1.9682745629377267e-05, "loss": 0.0647, "step": 300 }, { "epoch": 9.242424242424242, "grad_norm": 0.1669921875, "learning_rate": 1.9353105132797175e-05, "loss": 0.0628, "step": 305 }, { "epoch": 9.393939393939394, "grad_norm": 0.140625, "learning_rate": 1.902167403146548e-05, "loss": 0.0625, "step": 310 }, { "epoch": 9.545454545454545, "grad_norm": 0.1728515625, "learning_rate": 1.8688660331073253e-05, "loss": 0.0634, "step": 315 }, { "epoch": 9.696969696969697, "grad_norm": 0.1357421875, "learning_rate": 1.8354273030548512e-05, "loss": 0.0618, "step": 320 }, { "epoch": 9.848484848484848, "grad_norm": 0.14453125, "learning_rate": 1.801872199088878e-05, "loss": 0.0618, "step": 325 }, { "epoch": 10.0, "grad_norm": 0.1630859375, "learning_rate": 1.7682217803452616e-05, "loss": 0.0633, "step": 330 }, { "epoch": 10.151515151515152, "grad_norm": 0.1376953125, "learning_rate": 1.7344971657792768e-05, "loss": 0.0651, "step": 335 }, { "epoch": 10.303030303030303, "grad_norm": 0.15625, "learning_rate": 1.7007195209113934e-05, "loss": 0.0623, "step": 340 }, { "epoch": 10.454545454545455, "grad_norm": 0.1533203125, "learning_rate": 1.666910044543822e-05, "loss": 0.0647, "step": 345 }, { "epoch": 10.606060606060606, "grad_norm": 0.12890625, "learning_rate": 1.6330899554561785e-05, "loss": 0.0635, "step": 350 }, { "epoch": 10.757575757575758, "grad_norm": 0.1552734375, "learning_rate": 1.5992804790886075e-05, "loss": 0.0622, "step": 355 }, { "epoch": 10.909090909090908, "grad_norm": 0.1396484375, "learning_rate": 1.5655028342207235e-05, "loss": 0.0646, "step": 360 }, { "epoch": 11.06060606060606, "grad_norm": 0.1533203125, "learning_rate": 1.5317782196547387e-05, "loss": 0.0638, "step": 365 }, { "epoch": 11.212121212121213, "grad_norm": 0.146484375, "learning_rate": 1.4981278009111222e-05, "loss": 0.0633, "step": 370 }, { "epoch": 11.363636363636363, "grad_norm": 0.1376953125, "learning_rate": 1.4645726969451489e-05, "loss": 0.0602, "step": 375 }, { "epoch": 11.515151515151516, "grad_norm": 0.1533203125, "learning_rate": 1.4311339668926748e-05, "loss": 0.061, "step": 380 }, { "epoch": 11.666666666666666, "grad_norm": 0.1513671875, "learning_rate": 1.397832596853452e-05, "loss": 0.0636, "step": 385 }, { "epoch": 11.818181818181818, "grad_norm": 0.1357421875, "learning_rate": 1.3646894867202821e-05, "loss": 0.0605, "step": 390 }, { "epoch": 11.969696969696969, "grad_norm": 0.1435546875, "learning_rate": 1.3317254370622732e-05, "loss": 0.0642, "step": 395 }, { "epoch": 12.121212121212121, "grad_norm": 0.1591796875, "learning_rate": 1.298961136070432e-05, "loss": 0.0633, "step": 400 }, { "epoch": 12.272727272727273, "grad_norm": 0.1396484375, "learning_rate": 1.266417146573785e-05, "loss": 0.0605, "step": 405 }, { "epoch": 12.424242424242424, "grad_norm": 0.146484375, "learning_rate": 1.2341138931341752e-05, "loss": 0.0627, "step": 410 }, { "epoch": 12.575757575757576, "grad_norm": 0.16015625, "learning_rate": 1.2020716492278353e-05, "loss": 0.0628, "step": 415 }, { "epoch": 12.727272727272727, "grad_norm": 0.1513671875, "learning_rate": 1.1703105245217848e-05, "loss": 0.0598, "step": 420 }, { "epoch": 12.878787878787879, "grad_norm": 0.1416015625, "learning_rate": 1.1388504522530296e-05, "loss": 0.0611, "step": 425 }, { "epoch": 13.030303030303031, "grad_norm": 0.1435546875, "learning_rate": 1.1077111767184916e-05, "loss": 0.0638, "step": 430 }, { "epoch": 13.181818181818182, "grad_norm": 0.1376953125, "learning_rate": 1.0769122408835148e-05, "loss": 0.0585, "step": 435 }, { "epoch": 13.333333333333334, "grad_norm": 0.1396484375, "learning_rate": 1.0464729741167291e-05, "loss": 0.0635, "step": 440 }, { "epoch": 13.484848484848484, "grad_norm": 0.14453125, "learning_rate": 1.016412480058964e-05, "loss": 0.0621, "step": 445 }, { "epoch": 13.636363636363637, "grad_norm": 0.1552734375, "learning_rate": 9.86749624633833e-06, "loss": 0.0635, "step": 450 }, { "epoch": 13.787878787878787, "grad_norm": 0.1474609375, "learning_rate": 9.575030242075062e-06, "loss": 0.0597, "step": 455 }, { "epoch": 13.93939393939394, "grad_norm": 0.1484375, "learning_rate": 9.286910339051015e-06, "loss": 0.0659, "step": 460 }, { "epoch": 14.090909090909092, "grad_norm": 0.140625, "learning_rate": 9.003317360910392e-06, "loss": 0.0618, "step": 465 }, { "epoch": 14.242424242424242, "grad_norm": 0.14453125, "learning_rate": 8.724429290205732e-06, "loss": 0.0612, "step": 470 }, { "epoch": 14.393939393939394, "grad_norm": 0.142578125, "learning_rate": 8.450421156696298e-06, "loss": 0.0615, "step": 475 }, { "epoch": 14.545454545454545, "grad_norm": 0.1357421875, "learning_rate": 8.181464927499674e-06, "loss": 0.0591, "step": 480 }, { "epoch": 14.696969696969697, "grad_norm": 0.15234375, "learning_rate": 7.917729399165435e-06, "loss": 0.0606, "step": 485 }, { "epoch": 14.848484848484848, "grad_norm": 0.1416015625, "learning_rate": 7.659380091738652e-06, "loss": 0.0592, "step": 490 }, { "epoch": 15.0, "grad_norm": 0.1865234375, "learning_rate": 7.406579144879779e-06, "loss": 0.0601, "step": 495 }, { "epoch": 15.151515151515152, "grad_norm": 0.1455078125, "learning_rate": 7.159485216106013e-06, "loss": 0.0616, "step": 500 }, { "epoch": 15.303030303030303, "grad_norm": 0.146484375, "learning_rate": 6.918253381218046e-06, "loss": 0.0583, "step": 505 }, { "epoch": 15.454545454545455, "grad_norm": 0.15234375, "learning_rate": 6.683035036974742e-06, "loss": 0.0613, "step": 510 }, { "epoch": 15.606060606060606, "grad_norm": 0.1484375, "learning_rate": 6.45397780607673e-06, "loss": 0.0572, "step": 515 }, { "epoch": 15.757575757575758, "grad_norm": 0.1884765625, "learning_rate": 6.23122544451859e-06, "loss": 0.0616, "step": 520 }, { "epoch": 15.909090909090908, "grad_norm": 0.15234375, "learning_rate": 6.014917751367825e-06, "loss": 0.0601, "step": 525 }, { "epoch": 16.060606060606062, "grad_norm": 0.154296875, "learning_rate": 5.80519048102715e-06, "loss": 0.0597, "step": 530 }, { "epoch": 16.21212121212121, "grad_norm": 0.140625, "learning_rate": 5.602175258035204e-06, "loss": 0.0581, "step": 535 }, { "epoch": 16.363636363636363, "grad_norm": 0.1611328125, "learning_rate": 5.4059994944591914e-06, "loss": 0.0617, "step": 540 }, { "epoch": 16.515151515151516, "grad_norm": 0.1416015625, "learning_rate": 5.2167863099312636e-06, "loss": 0.0587, "step": 545 }, { "epoch": 16.666666666666668, "grad_norm": 0.14453125, "learning_rate": 5.034654454378783e-06, "loss": 0.0599, "step": 550 }, { "epoch": 16.818181818181817, "grad_norm": 0.1533203125, "learning_rate": 4.859718233497048e-06, "loss": 0.0624, "step": 555 }, { "epoch": 16.96969696969697, "grad_norm": 0.1533203125, "learning_rate": 4.692087437011203e-06, "loss": 0.0589, "step": 560 }, { "epoch": 17.12121212121212, "grad_norm": 0.1533203125, "learning_rate": 4.5318672697723665e-06, "loss": 0.0624, "step": 565 }, { "epoch": 17.272727272727273, "grad_norm": 0.1494140625, "learning_rate": 4.3791582857311975e-06, "loss": 0.0603, "step": 570 }, { "epoch": 17.424242424242426, "grad_norm": 0.142578125, "learning_rate": 4.2340563248303915e-06, "loss": 0.0621, "step": 575 }, { "epoch": 17.575757575757574, "grad_norm": 0.1494140625, "learning_rate": 4.096652452855675e-06, "loss": 0.0608, "step": 580 }, { "epoch": 17.727272727272727, "grad_norm": 0.1435546875, "learning_rate": 3.967032904283021e-06, "loss": 0.06, "step": 585 }, { "epoch": 17.87878787878788, "grad_norm": 0.1484375, "learning_rate": 3.8452790281580445e-06, "loss": 0.0605, "step": 590 }, { "epoch": 18.03030303030303, "grad_norm": 0.154296875, "learning_rate": 3.731467237041433e-06, "loss": 0.0601, "step": 595 }, { "epoch": 18.181818181818183, "grad_norm": 0.1513671875, "learning_rate": 3.6256689590525444e-06, "loss": 0.0628, "step": 600 }, { "epoch": 18.333333333333332, "grad_norm": 0.1396484375, "learning_rate": 3.5279505930412164e-06, "loss": 0.062, "step": 605 }, { "epoch": 18.484848484848484, "grad_norm": 0.1591796875, "learning_rate": 3.4383734669159366e-06, "loss": 0.0618, "step": 610 }, { "epoch": 18.636363636363637, "grad_norm": 0.1591796875, "learning_rate": 3.356993799154545e-06, "loss": 0.059, "step": 615 }, { "epoch": 18.78787878787879, "grad_norm": 0.146484375, "learning_rate": 3.2838626635215874e-06, "loss": 0.0595, "step": 620 }, { "epoch": 18.939393939393938, "grad_norm": 0.1611328125, "learning_rate": 3.2190259570144957e-06, "loss": 0.0629, "step": 625 }, { "epoch": 19.09090909090909, "grad_norm": 0.150390625, "learning_rate": 3.162524371058697e-06, "loss": 0.0612, "step": 630 }, { "epoch": 19.242424242424242, "grad_norm": 0.1337890625, "learning_rate": 3.1143933659697377e-06, "loss": 0.0583, "step": 635 }, { "epoch": 19.393939393939394, "grad_norm": 0.1474609375, "learning_rate": 3.0746631486984266e-06, "loss": 0.0626, "step": 640 }, { "epoch": 19.545454545454547, "grad_norm": 0.15234375, "learning_rate": 3.043358653873013e-06, "loss": 0.0589, "step": 645 }, { "epoch": 19.696969696969695, "grad_norm": 0.1572265625, "learning_rate": 3.020499528150232e-06, "loss": 0.0586, "step": 650 }, { "epoch": 19.848484848484848, "grad_norm": 0.15625, "learning_rate": 3.006100117885101e-06, "loss": 0.0591, "step": 655 }, { "epoch": 20.0, "grad_norm": 0.2119140625, "learning_rate": 3.000169460127164e-06, "loss": 0.0613, "step": 660 }, { "epoch": 20.0, "step": 660, "total_flos": 3.880913653947433e+18, "train_loss": 0.06725140679063218, "train_runtime": 3002.848, "train_samples_per_second": 27.794, "train_steps_per_second": 0.22 } ], "logging_steps": 5, "max_steps": 660, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.880913653947433e+18, "train_batch_size": 128, "trial_name": null, "trial_params": null }