{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 25, "global_step": 781, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0064047822374039285, "grad_norm": 5.6078200340271, "learning_rate": 1.0000000000000002e-06, "loss": 1.1562, "step": 5 }, { "epoch": 0.012809564474807857, "grad_norm": 5.326383590698242, "learning_rate": 2.25e-06, "loss": 1.0619, "step": 10 }, { "epoch": 0.019214346712211786, "grad_norm": 2.7833762168884277, "learning_rate": 3.5e-06, "loss": 1.0511, "step": 15 }, { "epoch": 0.025619128949615714, "grad_norm": 1.1198874711990356, "learning_rate": 4.75e-06, "loss": 0.931, "step": 20 }, { "epoch": 0.03202391118701964, "grad_norm": 1.2183799743652344, "learning_rate": 4.999659159998194e-06, "loss": 0.8819, "step": 25 }, { "epoch": 0.03842869342442357, "grad_norm": 0.7168175578117371, "learning_rate": 4.998274656771894e-06, "loss": 0.845, "step": 30 }, { "epoch": 0.0448334756618275, "grad_norm": 0.5571187734603882, "learning_rate": 4.995825777227236e-06, "loss": 0.7982, "step": 35 }, { "epoch": 0.05123825789923143, "grad_norm": 0.5454768538475037, "learning_rate": 4.992313564696022e-06, "loss": 0.7583, "step": 40 }, { "epoch": 0.057643040136635355, "grad_norm": 0.4312361776828766, "learning_rate": 4.9877395155372815e-06, "loss": 0.7783, "step": 45 }, { "epoch": 0.06404782237403928, "grad_norm": 0.34762468934059143, "learning_rate": 4.982105578499759e-06, "loss": 0.7645, "step": 50 }, { "epoch": 0.07045260461144322, "grad_norm": 0.34647414088249207, "learning_rate": 4.975414153891664e-06, "loss": 0.7397, "step": 55 }, { "epoch": 0.07685738684884715, "grad_norm": 0.33078256249427795, "learning_rate": 4.967668092558024e-06, "loss": 0.7325, "step": 60 }, { "epoch": 0.08326216908625107, "grad_norm": 0.3410351872444153, "learning_rate": 4.9588706946661066e-06, "loss": 0.7037, "step": 65 }, { "epoch": 0.089666951323655, "grad_norm": 0.3349353075027466, "learning_rate": 4.949025708299395e-06, "loss": 0.6928, "step": 70 }, { "epoch": 0.09607173356105893, "grad_norm": 0.32335689663887024, "learning_rate": 4.93813732786074e-06, "loss": 0.6764, "step": 75 }, { "epoch": 0.10247651579846286, "grad_norm": 0.33923929929733276, "learning_rate": 4.926210192285359e-06, "loss": 0.7398, "step": 80 }, { "epoch": 0.10888129803586678, "grad_norm": 0.32697415351867676, "learning_rate": 4.913249383064438e-06, "loss": 0.6927, "step": 85 }, { "epoch": 0.11528608027327071, "grad_norm": 0.34032362699508667, "learning_rate": 4.899260422080195e-06, "loss": 0.6909, "step": 90 }, { "epoch": 0.12169086251067464, "grad_norm": 0.3949951231479645, "learning_rate": 4.884249269253309e-06, "loss": 0.6517, "step": 95 }, { "epoch": 0.12809564474807855, "grad_norm": 0.40848004817962646, "learning_rate": 4.868222320003731e-06, "loss": 0.6625, "step": 100 }, { "epoch": 0.1345004269854825, "grad_norm": 0.36571186780929565, "learning_rate": 4.851186402525946e-06, "loss": 0.6693, "step": 105 }, { "epoch": 0.14090520922288644, "grad_norm": 0.33602604269981384, "learning_rate": 4.8331487748798636e-06, "loss": 0.6699, "step": 110 }, { "epoch": 0.14730999146029036, "grad_norm": 0.3456333875656128, "learning_rate": 4.814117121898554e-06, "loss": 0.6265, "step": 115 }, { "epoch": 0.1537147736976943, "grad_norm": 0.2806994915008545, "learning_rate": 4.794099551914173e-06, "loss": 0.6181, "step": 120 }, { "epoch": 0.16011955593509822, "grad_norm": 0.352857381105423, "learning_rate": 4.773104593303449e-06, "loss": 0.6869, "step": 125 }, { "epoch": 0.16652433817250215, "grad_norm": 0.3353271782398224, "learning_rate": 4.751141190854214e-06, "loss": 0.6184, "step": 130 }, { "epoch": 0.17292912040990607, "grad_norm": 0.39950162172317505, "learning_rate": 4.728218701954525e-06, "loss": 0.6631, "step": 135 }, { "epoch": 0.17933390264731, "grad_norm": 0.3852960169315338, "learning_rate": 4.704346892606001e-06, "loss": 0.6077, "step": 140 }, { "epoch": 0.18573868488471393, "grad_norm": 0.3433144688606262, "learning_rate": 4.6795359332630694e-06, "loss": 0.6487, "step": 145 }, { "epoch": 0.19214346712211786, "grad_norm": 0.437122106552124, "learning_rate": 4.653796394499904e-06, "loss": 0.6284, "step": 150 }, { "epoch": 0.19854824935952178, "grad_norm": 0.3337637484073639, "learning_rate": 4.627139242506882e-06, "loss": 0.6177, "step": 155 }, { "epoch": 0.2049530315969257, "grad_norm": 0.3271448016166687, "learning_rate": 4.599575834418505e-06, "loss": 0.6604, "step": 160 }, { "epoch": 0.21135781383432964, "grad_norm": 0.4029073715209961, "learning_rate": 4.571117913474749e-06, "loss": 0.6151, "step": 165 }, { "epoch": 0.21776259607173357, "grad_norm": 0.35433897376060486, "learning_rate": 4.541777604017924e-06, "loss": 0.5941, "step": 170 }, { "epoch": 0.2241673783091375, "grad_norm": 0.40683719515800476, "learning_rate": 4.511567406327162e-06, "loss": 0.6196, "step": 175 }, { "epoch": 0.23057216054654142, "grad_norm": 0.41791296005249023, "learning_rate": 4.480500191292744e-06, "loss": 0.6104, "step": 180 }, { "epoch": 0.23697694278394535, "grad_norm": 0.3712579011917114, "learning_rate": 4.448589194932521e-06, "loss": 0.6091, "step": 185 }, { "epoch": 0.24338172502134928, "grad_norm": 0.3512028157711029, "learning_rate": 4.415848012752789e-06, "loss": 0.5894, "step": 190 }, { "epoch": 0.2497865072587532, "grad_norm": 0.3832685649394989, "learning_rate": 4.38229059395599e-06, "loss": 0.6143, "step": 195 }, { "epoch": 0.2561912894961571, "grad_norm": 0.3921290338039398, "learning_rate": 4.347931235497738e-06, "loss": 0.6161, "step": 200 }, { "epoch": 0.26259607173356103, "grad_norm": 0.40466588735580444, "learning_rate": 4.312784575995669e-06, "loss": 0.6008, "step": 205 }, { "epoch": 0.269000853970965, "grad_norm": 0.9247767329216003, "learning_rate": 4.276865589492747e-06, "loss": 0.5971, "step": 210 }, { "epoch": 0.27540563620836894, "grad_norm": 0.5003313422203064, "learning_rate": 4.240189579077649e-06, "loss": 0.5832, "step": 215 }, { "epoch": 0.28181041844577287, "grad_norm": 0.41465386748313904, "learning_rate": 4.202772170364969e-06, "loss": 0.5909, "step": 220 }, { "epoch": 0.2882152006831768, "grad_norm": 0.3520627021789551, "learning_rate": 4.164629304838012e-06, "loss": 0.5826, "step": 225 }, { "epoch": 0.2946199829205807, "grad_norm": 0.38733118772506714, "learning_rate": 4.125777233057007e-06, "loss": 0.6092, "step": 230 }, { "epoch": 0.30102476515798465, "grad_norm": 0.3622700870037079, "learning_rate": 4.086232507735648e-06, "loss": 0.5844, "step": 235 }, { "epoch": 0.3074295473953886, "grad_norm": 0.47380566596984863, "learning_rate": 4.0460119766889e-06, "loss": 0.6075, "step": 240 }, { "epoch": 0.3138343296327925, "grad_norm": 0.4540008008480072, "learning_rate": 4.005132775655076e-06, "loss": 0.572, "step": 245 }, { "epoch": 0.32023911187019644, "grad_norm": 0.47770172357559204, "learning_rate": 3.963612320995257e-06, "loss": 0.6175, "step": 250 }, { "epoch": 0.32664389410760036, "grad_norm": 0.3514467179775238, "learning_rate": 3.921468302273137e-06, "loss": 0.5618, "step": 255 }, { "epoch": 0.3330486763450043, "grad_norm": 0.45986905694007874, "learning_rate": 3.8787186747184826e-06, "loss": 0.5442, "step": 260 }, { "epoch": 0.3394534585824082, "grad_norm": 0.4583056569099426, "learning_rate": 3.8353816515774115e-06, "loss": 0.569, "step": 265 }, { "epoch": 0.34585824081981215, "grad_norm": 0.3687354028224945, "learning_rate": 3.79147569635273e-06, "loss": 0.5555, "step": 270 }, { "epoch": 0.3522630230572161, "grad_norm": 0.38063594698905945, "learning_rate": 3.747019514937663e-06, "loss": 0.6, "step": 275 }, { "epoch": 0.35866780529462, "grad_norm": 0.430896520614624, "learning_rate": 3.70203204764631e-06, "loss": 0.5843, "step": 280 }, { "epoch": 0.36507258753202393, "grad_norm": 0.5169083476066589, "learning_rate": 3.6565324611442234e-06, "loss": 0.5914, "step": 285 }, { "epoch": 0.37147736976942786, "grad_norm": 0.36347025632858276, "learning_rate": 3.6105401402825595e-06, "loss": 0.5674, "step": 290 }, { "epoch": 0.3778821520068318, "grad_norm": 0.3636574447154999, "learning_rate": 3.5640746798392657e-06, "loss": 0.6123, "step": 295 }, { "epoch": 0.3842869342442357, "grad_norm": 0.4619109332561493, "learning_rate": 3.5171558761708334e-06, "loss": 0.5708, "step": 300 }, { "epoch": 0.39069171648163964, "grad_norm": 0.447704553604126, "learning_rate": 3.469803718778166e-06, "loss": 0.5722, "step": 305 }, { "epoch": 0.39709649871904357, "grad_norm": 0.39746832847595215, "learning_rate": 3.4220383817901625e-06, "loss": 0.5772, "step": 310 }, { "epoch": 0.4035012809564475, "grad_norm": 0.4326777160167694, "learning_rate": 3.3738802153686414e-06, "loss": 0.5715, "step": 315 }, { "epoch": 0.4099060631938514, "grad_norm": 0.4147851765155792, "learning_rate": 3.3253497370382605e-06, "loss": 0.572, "step": 320 }, { "epoch": 0.41631084543125535, "grad_norm": 0.43767350912094116, "learning_rate": 3.2764676229451397e-06, "loss": 0.563, "step": 325 }, { "epoch": 0.4227156276686593, "grad_norm": 0.36241263151168823, "learning_rate": 3.227254699047904e-06, "loss": 0.5649, "step": 330 }, { "epoch": 0.4291204099060632, "grad_norm": 0.5232857465744019, "learning_rate": 3.177731932244892e-06, "loss": 0.5645, "step": 335 }, { "epoch": 0.43552519214346713, "grad_norm": 0.4475226104259491, "learning_rate": 3.127920421441327e-06, "loss": 0.5767, "step": 340 }, { "epoch": 0.44192997438087106, "grad_norm": 0.4484921991825104, "learning_rate": 3.077841388560243e-06, "loss": 0.591, "step": 345 }, { "epoch": 0.448334756618275, "grad_norm": 0.5250320434570312, "learning_rate": 3.0275161695009975e-06, "loss": 0.5814, "step": 350 }, { "epoch": 0.4547395388556789, "grad_norm": 0.47690996527671814, "learning_rate": 2.9769662050492276e-06, "loss": 0.5602, "step": 355 }, { "epoch": 0.46114432109308284, "grad_norm": 0.4651663899421692, "learning_rate": 2.926213031742125e-06, "loss": 0.5741, "step": 360 }, { "epoch": 0.46754910333048677, "grad_norm": 0.46296215057373047, "learning_rate": 2.8752782726929045e-06, "loss": 0.5614, "step": 365 }, { "epoch": 0.4739538855678907, "grad_norm": 0.5162904262542725, "learning_rate": 2.8241836283784026e-06, "loss": 0.5483, "step": 370 }, { "epoch": 0.4803586678052946, "grad_norm": 0.3958864212036133, "learning_rate": 2.7729508673936972e-06, "loss": 0.5745, "step": 375 }, { "epoch": 0.48676345004269855, "grad_norm": 0.4186757504940033, "learning_rate": 2.721601817177725e-06, "loss": 0.5459, "step": 380 }, { "epoch": 0.4931682322801025, "grad_norm": 0.4372413456439972, "learning_rate": 2.6701583547138165e-06, "loss": 0.5852, "step": 385 }, { "epoch": 0.4995730145175064, "grad_norm": 0.4488023519515991, "learning_rate": 2.618642397209126e-06, "loss": 0.5427, "step": 390 }, { "epoch": 0.5059777967549103, "grad_norm": 0.4278182089328766, "learning_rate": 2.567075892756924e-06, "loss": 0.5586, "step": 395 }, { "epoch": 0.5123825789923142, "grad_norm": 0.48016875982284546, "learning_rate": 2.5154808109857367e-06, "loss": 0.5405, "step": 400 }, { "epoch": 0.5187873612297181, "grad_norm": 0.5077680945396423, "learning_rate": 2.4638791336992967e-06, "loss": 0.5682, "step": 405 }, { "epoch": 0.5251921434671221, "grad_norm": 0.5091099739074707, "learning_rate": 2.4122928455113233e-06, "loss": 0.5619, "step": 410 }, { "epoch": 0.531596925704526, "grad_norm": 0.4333205223083496, "learning_rate": 2.360743924479093e-06, "loss": 0.5879, "step": 415 }, { "epoch": 0.53800170794193, "grad_norm": 0.4178122282028198, "learning_rate": 2.3092543327398083e-06, "loss": 0.5332, "step": 420 }, { "epoch": 0.544406490179334, "grad_norm": 0.4080513119697571, "learning_rate": 2.2578460071537512e-06, "loss": 0.5728, "step": 425 }, { "epoch": 0.5508112724167379, "grad_norm": 0.48982349038124084, "learning_rate": 2.2065408499582e-06, "loss": 0.575, "step": 430 }, { "epoch": 0.5572160546541418, "grad_norm": 0.4953416883945465, "learning_rate": 2.155360719436102e-06, "loss": 0.5404, "step": 435 }, { "epoch": 0.5636208368915457, "grad_norm": 0.4608188271522522, "learning_rate": 2.1043274206034727e-06, "loss": 0.5579, "step": 440 }, { "epoch": 0.5700256191289497, "grad_norm": 0.48403236269950867, "learning_rate": 2.0534626959194816e-06, "loss": 0.5383, "step": 445 }, { "epoch": 0.5764304013663536, "grad_norm": 0.4532581865787506, "learning_rate": 2.002788216023203e-06, "loss": 0.5638, "step": 450 }, { "epoch": 0.5828351836037575, "grad_norm": 0.53521728515625, "learning_rate": 1.9523255705009558e-06, "loss": 0.5549, "step": 455 }, { "epoch": 0.5892399658411615, "grad_norm": 0.4711097180843353, "learning_rate": 1.902096258688174e-06, "loss": 0.5027, "step": 460 }, { "epoch": 0.5956447480785654, "grad_norm": 0.43662044405937195, "learning_rate": 1.8521216805097358e-06, "loss": 0.556, "step": 465 }, { "epoch": 0.6020495303159693, "grad_norm": 0.3957918882369995, "learning_rate": 1.8024231273626424e-06, "loss": 0.5596, "step": 470 }, { "epoch": 0.6084543125533732, "grad_norm": 0.5218236446380615, "learning_rate": 1.7530217730449312e-06, "loss": 0.5405, "step": 475 }, { "epoch": 0.6148590947907772, "grad_norm": 0.4223135709762573, "learning_rate": 1.7039386647346975e-06, "loss": 0.5279, "step": 480 }, { "epoch": 0.6212638770281811, "grad_norm": 0.3835909068584442, "learning_rate": 1.6551947140230568e-06, "loss": 0.5747, "step": 485 }, { "epoch": 0.627668659265585, "grad_norm": 0.5082884430885315, "learning_rate": 1.6068106880048747e-06, "loss": 0.5518, "step": 490 }, { "epoch": 0.6340734415029889, "grad_norm": 0.4860563278198242, "learning_rate": 1.5588072004310634e-06, "loss": 0.5641, "step": 495 }, { "epoch": 0.6404782237403929, "grad_norm": 0.4176677167415619, "learning_rate": 1.5112047029262e-06, "loss": 0.5547, "step": 500 }, { "epoch": 0.6468830059777968, "grad_norm": 0.3659776747226715, "learning_rate": 1.4640234762752248e-06, "loss": 0.5503, "step": 505 }, { "epoch": 0.6532877882152007, "grad_norm": 0.4908987283706665, "learning_rate": 1.4172836217829267e-06, "loss": 0.5549, "step": 510 }, { "epoch": 0.6596925704526047, "grad_norm": 0.44962796568870544, "learning_rate": 1.3710050527098867e-06, "loss": 0.573, "step": 515 }, { "epoch": 0.6660973526900086, "grad_norm": 0.4549601376056671, "learning_rate": 1.3252074857885453e-06, "loss": 0.5666, "step": 520 }, { "epoch": 0.6725021349274125, "grad_norm": 0.48955774307250977, "learning_rate": 1.2799104328229928e-06, "loss": 0.5379, "step": 525 }, { "epoch": 0.6789069171648164, "grad_norm": 0.45902734994888306, "learning_rate": 1.2351331923760743e-06, "loss": 0.5345, "step": 530 }, { "epoch": 0.6853116994022204, "grad_norm": 0.49846968054771423, "learning_rate": 1.1908948415473418e-06, "loss": 0.5367, "step": 535 }, { "epoch": 0.6917164816396243, "grad_norm": 0.48370206356048584, "learning_rate": 1.1472142278453582e-06, "loss": 0.5325, "step": 540 }, { "epoch": 0.6981212638770282, "grad_norm": 0.3830443024635315, "learning_rate": 1.1041099611578177e-06, "loss": 0.5585, "step": 545 }, { "epoch": 0.7045260461144321, "grad_norm": 0.47550487518310547, "learning_rate": 1.0616004058229084e-06, "loss": 0.5417, "step": 550 }, { "epoch": 0.7109308283518361, "grad_norm": 0.46026965975761414, "learning_rate": 1.0197036728052847e-06, "loss": 0.5715, "step": 555 }, { "epoch": 0.71733561058924, "grad_norm": 0.42247724533081055, "learning_rate": 9.784376119799851e-07, "loss": 0.5459, "step": 560 }, { "epoch": 0.7237403928266439, "grad_norm": 0.5001282095909119, "learning_rate": 9.378198045275968e-07, "loss": 0.5557, "step": 565 }, { "epoch": 0.7301451750640479, "grad_norm": 0.4762704372406006, "learning_rate": 8.97867555443886e-07, "loss": 0.5338, "step": 570 }, { "epoch": 0.7365499573014518, "grad_norm": 0.48811063170433044, "learning_rate": 8.585978861670958e-07, "loss": 0.5331, "step": 575 }, { "epoch": 0.7429547395388557, "grad_norm": 0.45258718729019165, "learning_rate": 8.200275273260611e-07, "loss": 0.5461, "step": 580 }, { "epoch": 0.7493595217762596, "grad_norm": 0.4314691424369812, "learning_rate": 7.821729116122126e-07, "loss": 0.558, "step": 585 }, { "epoch": 0.7557643040136636, "grad_norm": 0.4526233673095703, "learning_rate": 7.450501667785146e-07, "loss": 0.5455, "step": 590 }, { "epoch": 0.7621690862510675, "grad_norm": 0.4625132977962494, "learning_rate": 7.086751087683297e-07, "loss": 0.5514, "step": 595 }, { "epoch": 0.7685738684884714, "grad_norm": 0.4986107349395752, "learning_rate": 6.730632349771193e-07, "loss": 0.5566, "step": 600 }, { "epoch": 0.7749786507258754, "grad_norm": 0.5132951140403748, "learning_rate": 6.3822971764986e-07, "loss": 0.5363, "step": 605 }, { "epoch": 0.7813834329632793, "grad_norm": 0.48895248770713806, "learning_rate": 6.041893974169963e-07, "loss": 0.5382, "step": 610 }, { "epoch": 0.7877882152006832, "grad_norm": 0.48889264464378357, "learning_rate": 5.709567769716678e-07, "loss": 0.5511, "step": 615 }, { "epoch": 0.7941929974380871, "grad_norm": 0.4542140066623688, "learning_rate": 5.385460148909169e-07, "loss": 0.5227, "step": 620 }, { "epoch": 0.8005977796754911, "grad_norm": 0.48940637707710266, "learning_rate": 5.069709196035011e-07, "loss": 0.5519, "step": 625 }, { "epoch": 0.807002561912895, "grad_norm": 0.45722976326942444, "learning_rate": 4.762449435068914e-07, "loss": 0.5358, "step": 630 }, { "epoch": 0.8134073441502989, "grad_norm": 0.5042068958282471, "learning_rate": 4.4638117723595054e-07, "loss": 0.5686, "step": 635 }, { "epoch": 0.8198121263877028, "grad_norm": 0.4974375069141388, "learning_rate": 4.173923440857358e-07, "loss": 0.5528, "step": 640 }, { "epoch": 0.8262169086251068, "grad_norm": 0.4234403669834137, "learning_rate": 3.892907945908128e-07, "loss": 0.5305, "step": 645 }, { "epoch": 0.8326216908625107, "grad_norm": 0.5144878029823303, "learning_rate": 3.6208850126337595e-07, "loss": 0.5282, "step": 650 }, { "epoch": 0.8390264730999146, "grad_norm": 0.41059333086013794, "learning_rate": 3.357970534924229e-07, "loss": 0.5601, "step": 655 }, { "epoch": 0.8454312553373186, "grad_norm": 0.40885528922080994, "learning_rate": 3.104276526061617e-07, "loss": 0.536, "step": 660 }, { "epoch": 0.8518360375747225, "grad_norm": 0.462971568107605, "learning_rate": 2.859911070997437e-07, "loss": 0.5513, "step": 665 }, { "epoch": 0.8582408198121264, "grad_norm": 0.6165898442268372, "learning_rate": 2.624978280303628e-07, "loss": 0.5542, "step": 670 }, { "epoch": 0.8646456020495303, "grad_norm": 0.514519453048706, "learning_rate": 2.3995782458168276e-07, "loss": 0.5572, "step": 675 }, { "epoch": 0.8710503842869343, "grad_norm": 0.5139626264572144, "learning_rate": 2.1838069979947945e-07, "loss": 0.5372, "step": 680 }, { "epoch": 0.8774551665243382, "grad_norm": 1.6515536308288574, "learning_rate": 1.9777564650031112e-07, "loss": 0.5515, "step": 685 }, { "epoch": 0.8838599487617421, "grad_norm": 0.4731055200099945, "learning_rate": 1.7815144335497524e-07, "loss": 0.5515, "step": 690 }, { "epoch": 0.890264730999146, "grad_norm": 0.5183550715446472, "learning_rate": 1.5951645114839875e-07, "loss": 0.5419, "step": 695 }, { "epoch": 0.89666951323655, "grad_norm": 0.5357317328453064, "learning_rate": 1.4187860921757252e-07, "loss": 0.5571, "step": 700 }, { "epoch": 0.9030742954739539, "grad_norm": 0.5177751779556274, "learning_rate": 1.2524543206904188e-07, "loss": 0.5607, "step": 705 }, { "epoch": 0.9094790777113578, "grad_norm": 0.4790054261684418, "learning_rate": 1.0962400617738872e-07, "loss": 0.581, "step": 710 }, { "epoch": 0.9158838599487618, "grad_norm": 0.5255675911903381, "learning_rate": 9.502098696608147e-08, "loss": 0.5449, "step": 715 }, { "epoch": 0.9222886421861657, "grad_norm": 0.38730135560035706, "learning_rate": 8.144259597196308e-08, "loss": 0.5518, "step": 720 }, { "epoch": 0.9286934244235696, "grad_norm": 0.42933622002601624, "learning_rate": 6.889461819460485e-08, "loss": 0.5365, "step": 725 }, { "epoch": 0.9350982066609735, "grad_norm": 0.50970458984375, "learning_rate": 5.738239963163472e-08, "loss": 0.5282, "step": 730 }, { "epoch": 0.9415029888983775, "grad_norm": 0.5318973064422607, "learning_rate": 4.691084500110521e-08, "loss": 0.5281, "step": 735 }, { "epoch": 0.9479077711357814, "grad_norm": 0.4877215623855591, "learning_rate": 3.748441565186583e-08, "loss": 0.5136, "step": 740 }, { "epoch": 0.9543125533731853, "grad_norm": 0.5620718002319336, "learning_rate": 2.910712766282908e-08, "loss": 0.5385, "step": 745 }, { "epoch": 0.9607173356105893, "grad_norm": 0.5282920598983765, "learning_rate": 2.178255013194075e-08, "loss": 0.5296, "step": 750 }, { "epoch": 0.9671221178479932, "grad_norm": 0.422025591135025, "learning_rate": 1.5513803655587966e-08, "loss": 0.5131, "step": 755 }, { "epoch": 0.9735269000853971, "grad_norm": 0.5139475464820862, "learning_rate": 1.0303558999082974e-08, "loss": 0.5625, "step": 760 }, { "epoch": 0.979931682322801, "grad_norm": 0.48410946130752563, "learning_rate": 6.1540359588005416e-09, "loss": 0.5286, "step": 765 }, { "epoch": 0.986336464560205, "grad_norm": 0.4075927138328552, "learning_rate": 3.067002416444198e-09, "loss": 0.5113, "step": 770 }, { "epoch": 0.9927412467976089, "grad_norm": 0.4521820545196533, "learning_rate": 1.0437735858506715e-09, "loss": 0.5399, "step": 775 }, { "epoch": 0.9991460290350128, "grad_norm": 0.41308000683784485, "learning_rate": 8.521145264978048e-11, "loss": 0.5787, "step": 780 }, { "epoch": 1.0, "step": 781, "total_flos": 7.995150581897871e+17, "train_loss": 0.5977290161287891, "train_runtime": 6942.3995, "train_samples_per_second": 1.349, "train_steps_per_second": 0.112 } ], "logging_steps": 5, "max_steps": 781, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.995150581897871e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }