| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 20.0, | |
| "eval_steps": 500, | |
| "global_step": 660, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.15151515151515152, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 3.6363636363636366e-06, | |
| "loss": 0.1258, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.30303030303030304, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 8.181818181818181e-06, | |
| "loss": 0.1212, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.45454545454545453, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 1.2727272727272728e-05, | |
| "loss": 0.1055, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.6060606060606061, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1.7272727272727274e-05, | |
| "loss": 0.0963, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.7575757575757576, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 2.1818181818181818e-05, | |
| "loss": 0.0931, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.9090909090909091, | |
| "grad_norm": 0.1796875, | |
| "learning_rate": 2.6363636363636365e-05, | |
| "loss": 0.0903, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 1.0606060606060606, | |
| "grad_norm": 0.1552734375, | |
| "learning_rate": 2.9999830539872836e-05, | |
| "loss": 0.0843, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 1.2121212121212122, | |
| "grad_norm": 0.150390625, | |
| "learning_rate": 2.9993899882114902e-05, | |
| "loss": 0.0853, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 1.3636363636363638, | |
| "grad_norm": 0.138671875, | |
| "learning_rate": 2.997950047184977e-05, | |
| "loss": 0.0804, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 1.5151515151515151, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 2.9956641346126986e-05, | |
| "loss": 0.0809, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 0.1396484375, | |
| "learning_rate": 2.9925336851301575e-05, | |
| "loss": 0.0795, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 1.8181818181818183, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 2.9885606634030267e-05, | |
| "loss": 0.0789, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 1.9696969696969697, | |
| "grad_norm": 0.134765625, | |
| "learning_rate": 2.98374756289413e-05, | |
| "loss": 0.0778, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 2.121212121212121, | |
| "grad_norm": 0.1494140625, | |
| "learning_rate": 2.9780974042985506e-05, | |
| "loss": 0.0761, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 2.2727272727272725, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 2.971613733647841e-05, | |
| "loss": 0.0751, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 2.4242424242424243, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 2.9643006200845458e-05, | |
| "loss": 0.0756, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 2.5757575757575757, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 2.9561626533084068e-05, | |
| "loss": 0.0765, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 2.7272727272727275, | |
| "grad_norm": 0.1318359375, | |
| "learning_rate": 2.9472049406958788e-05, | |
| "loss": 0.0746, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 2.878787878787879, | |
| "grad_norm": 0.1455078125, | |
| "learning_rate": 2.937433104094746e-05, | |
| "loss": 0.0757, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 3.0303030303030303, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 2.9268532762958568e-05, | |
| "loss": 0.0725, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 3.1818181818181817, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 2.915472097184196e-05, | |
| "loss": 0.0742, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 3.3333333333333335, | |
| "grad_norm": 0.1318359375, | |
| "learning_rate": 2.903296709571698e-05, | |
| "loss": 0.0707, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 3.484848484848485, | |
| "grad_norm": 0.1337890625, | |
| "learning_rate": 2.8903347547144327e-05, | |
| "loss": 0.0734, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 3.6363636363636362, | |
| "grad_norm": 0.142578125, | |
| "learning_rate": 2.876594367516961e-05, | |
| "loss": 0.0724, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 3.787878787878788, | |
| "grad_norm": 0.1318359375, | |
| "learning_rate": 2.8620841714268804e-05, | |
| "loss": 0.0725, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 3.9393939393939394, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 2.846813273022764e-05, | |
| "loss": 0.0714, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 4.090909090909091, | |
| "grad_norm": 0.134765625, | |
| "learning_rate": 2.83079125629888e-05, | |
| "loss": 0.0727, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 4.242424242424242, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 2.8140281766502957e-05, | |
| "loss": 0.0716, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 4.393939393939394, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 2.7965345545621217e-05, | |
| "loss": 0.072, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 4.545454545454545, | |
| "grad_norm": 0.1455078125, | |
| "learning_rate": 2.7783213690068737e-05, | |
| "loss": 0.0701, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 4.696969696969697, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 2.7594000505540807e-05, | |
| "loss": 0.0741, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 4.848484848484849, | |
| "grad_norm": 0.142578125, | |
| "learning_rate": 2.7397824741964805e-05, | |
| "loss": 0.0665, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.189453125, | |
| "learning_rate": 2.7194809518972856e-05, | |
| "loss": 0.0705, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 5.151515151515151, | |
| "grad_norm": 0.1494140625, | |
| "learning_rate": 2.6985082248632174e-05, | |
| "loss": 0.0679, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 5.303030303030303, | |
| "grad_norm": 0.14453125, | |
| "learning_rate": 2.676877455548141e-05, | |
| "loss": 0.0693, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 5.454545454545454, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 2.6546022193923274e-05, | |
| "loss": 0.0696, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 5.606060606060606, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 2.631696496302526e-05, | |
| "loss": 0.0709, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 5.757575757575758, | |
| "grad_norm": 0.142578125, | |
| "learning_rate": 2.6081746618781953e-05, | |
| "loss": 0.0694, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 5.909090909090909, | |
| "grad_norm": 0.1357421875, | |
| "learning_rate": 2.584051478389399e-05, | |
| "loss": 0.0682, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 6.0606060606060606, | |
| "grad_norm": 0.154296875, | |
| "learning_rate": 2.559342085512022e-05, | |
| "loss": 0.0686, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 6.212121212121212, | |
| "grad_norm": 0.154296875, | |
| "learning_rate": 2.5340619908261352e-05, | |
| "loss": 0.0703, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 6.363636363636363, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 2.508227060083457e-05, | |
| "loss": 0.0647, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 6.515151515151516, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 2.4818535072500327e-05, | |
| "loss": 0.064, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 0.142578125, | |
| "learning_rate": 2.4549578843303708e-05, | |
| "loss": 0.0676, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 6.818181818181818, | |
| "grad_norm": 0.14453125, | |
| "learning_rate": 2.427557070979427e-05, | |
| "loss": 0.0669, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 6.96969696969697, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 2.399668263908961e-05, | |
| "loss": 0.0679, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 7.121212121212121, | |
| "grad_norm": 0.1357421875, | |
| "learning_rate": 2.3713089660948985e-05, | |
| "loss": 0.0666, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 7.2727272727272725, | |
| "grad_norm": 0.1455078125, | |
| "learning_rate": 2.342496975792494e-05, | |
| "loss": 0.066, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 7.424242424242424, | |
| "grad_norm": 0.123046875, | |
| "learning_rate": 2.313250375366167e-05, | |
| "loss": 0.0637, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 7.575757575757576, | |
| "grad_norm": 0.1298828125, | |
| "learning_rate": 2.283587519941036e-05, | |
| "loss": 0.0683, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 7.7272727272727275, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 2.253527025883271e-05, | |
| "loss": 0.0642, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 7.878787878787879, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 2.2230877591164858e-05, | |
| "loss": 0.0682, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 8.030303030303031, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 2.192288823281509e-05, | |
| "loss": 0.0628, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 8.181818181818182, | |
| "grad_norm": 0.158203125, | |
| "learning_rate": 2.1611495477469712e-05, | |
| "loss": 0.0635, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 8.333333333333334, | |
| "grad_norm": 0.15234375, | |
| "learning_rate": 2.1296894754782155e-05, | |
| "loss": 0.0679, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 8.484848484848484, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 2.0979283507721653e-05, | |
| "loss": 0.0631, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 8.636363636363637, | |
| "grad_norm": 0.12890625, | |
| "learning_rate": 2.0658861068658254e-05, | |
| "loss": 0.0634, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 8.787878787878787, | |
| "grad_norm": 0.14453125, | |
| "learning_rate": 2.0335828534262148e-05, | |
| "loss": 0.0652, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 8.93939393939394, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 2.001038863929568e-05, | |
| "loss": 0.067, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 9.090909090909092, | |
| "grad_norm": 0.1572265625, | |
| "learning_rate": 1.9682745629377267e-05, | |
| "loss": 0.0647, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 9.242424242424242, | |
| "grad_norm": 0.1669921875, | |
| "learning_rate": 1.9353105132797175e-05, | |
| "loss": 0.0628, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 9.393939393939394, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 1.902167403146548e-05, | |
| "loss": 0.0625, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 9.545454545454545, | |
| "grad_norm": 0.1728515625, | |
| "learning_rate": 1.8688660331073253e-05, | |
| "loss": 0.0634, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 9.696969696969697, | |
| "grad_norm": 0.1357421875, | |
| "learning_rate": 1.8354273030548512e-05, | |
| "loss": 0.0618, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 9.848484848484848, | |
| "grad_norm": 0.14453125, | |
| "learning_rate": 1.801872199088878e-05, | |
| "loss": 0.0618, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.1630859375, | |
| "learning_rate": 1.7682217803452616e-05, | |
| "loss": 0.0633, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 10.151515151515152, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 1.7344971657792768e-05, | |
| "loss": 0.0651, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 10.303030303030303, | |
| "grad_norm": 0.15625, | |
| "learning_rate": 1.7007195209113934e-05, | |
| "loss": 0.0623, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 10.454545454545455, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 1.666910044543822e-05, | |
| "loss": 0.0647, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 10.606060606060606, | |
| "grad_norm": 0.12890625, | |
| "learning_rate": 1.6330899554561785e-05, | |
| "loss": 0.0635, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 10.757575757575758, | |
| "grad_norm": 0.1552734375, | |
| "learning_rate": 1.5992804790886075e-05, | |
| "loss": 0.0622, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 10.909090909090908, | |
| "grad_norm": 0.1396484375, | |
| "learning_rate": 1.5655028342207235e-05, | |
| "loss": 0.0646, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 11.06060606060606, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 1.5317782196547387e-05, | |
| "loss": 0.0638, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 11.212121212121213, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 1.4981278009111222e-05, | |
| "loss": 0.0633, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 11.363636363636363, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 1.4645726969451489e-05, | |
| "loss": 0.0602, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 11.515151515151516, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 1.4311339668926748e-05, | |
| "loss": 0.061, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 11.666666666666666, | |
| "grad_norm": 0.1513671875, | |
| "learning_rate": 1.397832596853452e-05, | |
| "loss": 0.0636, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 11.818181818181818, | |
| "grad_norm": 0.1357421875, | |
| "learning_rate": 1.3646894867202821e-05, | |
| "loss": 0.0605, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 11.969696969696969, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 1.3317254370622732e-05, | |
| "loss": 0.0642, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 12.121212121212121, | |
| "grad_norm": 0.1591796875, | |
| "learning_rate": 1.298961136070432e-05, | |
| "loss": 0.0633, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 12.272727272727273, | |
| "grad_norm": 0.1396484375, | |
| "learning_rate": 1.266417146573785e-05, | |
| "loss": 0.0605, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 12.424242424242424, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 1.2341138931341752e-05, | |
| "loss": 0.0627, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 12.575757575757576, | |
| "grad_norm": 0.16015625, | |
| "learning_rate": 1.2020716492278353e-05, | |
| "loss": 0.0628, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 12.727272727272727, | |
| "grad_norm": 0.1513671875, | |
| "learning_rate": 1.1703105245217848e-05, | |
| "loss": 0.0598, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 12.878787878787879, | |
| "grad_norm": 0.1416015625, | |
| "learning_rate": 1.1388504522530296e-05, | |
| "loss": 0.0611, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 13.030303030303031, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 1.1077111767184916e-05, | |
| "loss": 0.0638, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 13.181818181818182, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 1.0769122408835148e-05, | |
| "loss": 0.0585, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 13.333333333333334, | |
| "grad_norm": 0.1396484375, | |
| "learning_rate": 1.0464729741167291e-05, | |
| "loss": 0.0635, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 13.484848484848484, | |
| "grad_norm": 0.14453125, | |
| "learning_rate": 1.016412480058964e-05, | |
| "loss": 0.0621, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 13.636363636363637, | |
| "grad_norm": 0.1552734375, | |
| "learning_rate": 9.86749624633833e-06, | |
| "loss": 0.0635, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 13.787878787878787, | |
| "grad_norm": 0.1474609375, | |
| "learning_rate": 9.575030242075062e-06, | |
| "loss": 0.0597, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 13.93939393939394, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 9.286910339051015e-06, | |
| "loss": 0.0659, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 14.090909090909092, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 9.003317360910392e-06, | |
| "loss": 0.0618, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 14.242424242424242, | |
| "grad_norm": 0.14453125, | |
| "learning_rate": 8.724429290205732e-06, | |
| "loss": 0.0612, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 14.393939393939394, | |
| "grad_norm": 0.142578125, | |
| "learning_rate": 8.450421156696298e-06, | |
| "loss": 0.0615, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 14.545454545454545, | |
| "grad_norm": 0.1357421875, | |
| "learning_rate": 8.181464927499674e-06, | |
| "loss": 0.0591, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 14.696969696969697, | |
| "grad_norm": 0.15234375, | |
| "learning_rate": 7.917729399165435e-06, | |
| "loss": 0.0606, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 14.848484848484848, | |
| "grad_norm": 0.1416015625, | |
| "learning_rate": 7.659380091738652e-06, | |
| "loss": 0.0592, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 0.1865234375, | |
| "learning_rate": 7.406579144879779e-06, | |
| "loss": 0.0601, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 15.151515151515152, | |
| "grad_norm": 0.1455078125, | |
| "learning_rate": 7.159485216106013e-06, | |
| "loss": 0.0616, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 15.303030303030303, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 6.918253381218046e-06, | |
| "loss": 0.0583, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 15.454545454545455, | |
| "grad_norm": 0.15234375, | |
| "learning_rate": 6.683035036974742e-06, | |
| "loss": 0.0613, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 15.606060606060606, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 6.45397780607673e-06, | |
| "loss": 0.0572, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 15.757575757575758, | |
| "grad_norm": 0.1884765625, | |
| "learning_rate": 6.23122544451859e-06, | |
| "loss": 0.0616, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 15.909090909090908, | |
| "grad_norm": 0.15234375, | |
| "learning_rate": 6.014917751367825e-06, | |
| "loss": 0.0601, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 16.060606060606062, | |
| "grad_norm": 0.154296875, | |
| "learning_rate": 5.80519048102715e-06, | |
| "loss": 0.0597, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 16.21212121212121, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 5.602175258035204e-06, | |
| "loss": 0.0581, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 16.363636363636363, | |
| "grad_norm": 0.1611328125, | |
| "learning_rate": 5.4059994944591914e-06, | |
| "loss": 0.0617, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 16.515151515151516, | |
| "grad_norm": 0.1416015625, | |
| "learning_rate": 5.2167863099312636e-06, | |
| "loss": 0.0587, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 16.666666666666668, | |
| "grad_norm": 0.14453125, | |
| "learning_rate": 5.034654454378783e-06, | |
| "loss": 0.0599, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 16.818181818181817, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 4.859718233497048e-06, | |
| "loss": 0.0624, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 16.96969696969697, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 4.692087437011203e-06, | |
| "loss": 0.0589, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 17.12121212121212, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 4.5318672697723665e-06, | |
| "loss": 0.0624, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 17.272727272727273, | |
| "grad_norm": 0.1494140625, | |
| "learning_rate": 4.3791582857311975e-06, | |
| "loss": 0.0603, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 17.424242424242426, | |
| "grad_norm": 0.142578125, | |
| "learning_rate": 4.2340563248303915e-06, | |
| "loss": 0.0621, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 17.575757575757574, | |
| "grad_norm": 0.1494140625, | |
| "learning_rate": 4.096652452855675e-06, | |
| "loss": 0.0608, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 17.727272727272727, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 3.967032904283021e-06, | |
| "loss": 0.06, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 17.87878787878788, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 3.8452790281580445e-06, | |
| "loss": 0.0605, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 18.03030303030303, | |
| "grad_norm": 0.154296875, | |
| "learning_rate": 3.731467237041433e-06, | |
| "loss": 0.0601, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 18.181818181818183, | |
| "grad_norm": 0.1513671875, | |
| "learning_rate": 3.6256689590525444e-06, | |
| "loss": 0.0628, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 18.333333333333332, | |
| "grad_norm": 0.1396484375, | |
| "learning_rate": 3.5279505930412164e-06, | |
| "loss": 0.062, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 18.484848484848484, | |
| "grad_norm": 0.1591796875, | |
| "learning_rate": 3.4383734669159366e-06, | |
| "loss": 0.0618, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 18.636363636363637, | |
| "grad_norm": 0.1591796875, | |
| "learning_rate": 3.356993799154545e-06, | |
| "loss": 0.059, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 18.78787878787879, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 3.2838626635215874e-06, | |
| "loss": 0.0595, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 18.939393939393938, | |
| "grad_norm": 0.1611328125, | |
| "learning_rate": 3.2190259570144957e-06, | |
| "loss": 0.0629, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 19.09090909090909, | |
| "grad_norm": 0.150390625, | |
| "learning_rate": 3.162524371058697e-06, | |
| "loss": 0.0612, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 19.242424242424242, | |
| "grad_norm": 0.1337890625, | |
| "learning_rate": 3.1143933659697377e-06, | |
| "loss": 0.0583, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 19.393939393939394, | |
| "grad_norm": 0.1474609375, | |
| "learning_rate": 3.0746631486984266e-06, | |
| "loss": 0.0626, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 19.545454545454547, | |
| "grad_norm": 0.15234375, | |
| "learning_rate": 3.043358653873013e-06, | |
| "loss": 0.0589, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 19.696969696969695, | |
| "grad_norm": 0.1572265625, | |
| "learning_rate": 3.020499528150232e-06, | |
| "loss": 0.0586, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 19.848484848484848, | |
| "grad_norm": 0.15625, | |
| "learning_rate": 3.006100117885101e-06, | |
| "loss": 0.0591, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 3.000169460127164e-06, | |
| "loss": 0.0613, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "step": 660, | |
| "total_flos": 3.880913653947433e+18, | |
| "train_loss": 0.06725140679063218, | |
| "train_runtime": 3002.848, | |
| "train_samples_per_second": 27.794, | |
| "train_steps_per_second": 0.22 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 660, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 20, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.880913653947433e+18, | |
| "train_batch_size": 128, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |