{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01615508885298869, "grad_norm": 1.271867275238037, "learning_rate": 1.5384615384615385e-06, "loss": 1.3063, "step": 5 }, { "epoch": 0.03231017770597738, "grad_norm": 0.926250696182251, "learning_rate": 3.4615384615384617e-06, "loss": 1.3332, "step": 10 }, { "epoch": 0.048465266558966075, "grad_norm": 0.7140916585922241, "learning_rate": 5.384615384615385e-06, "loss": 1.2903, "step": 15 }, { "epoch": 0.06462035541195477, "grad_norm": 0.9422391653060913, "learning_rate": 7.307692307692308e-06, "loss": 1.2598, "step": 20 }, { "epoch": 0.08077544426494346, "grad_norm": 0.5254983305931091, "learning_rate": 9.230769230769232e-06, "loss": 1.2614, "step": 25 }, { "epoch": 0.09693053311793215, "grad_norm": 0.6005647778511047, "learning_rate": 1.1153846153846154e-05, "loss": 1.2564, "step": 30 }, { "epoch": 0.11308562197092084, "grad_norm": 0.5752702355384827, "learning_rate": 1.3076923076923078e-05, "loss": 1.2446, "step": 35 }, { "epoch": 0.12924071082390953, "grad_norm": 0.4668022096157074, "learning_rate": 1.5e-05, "loss": 1.2149, "step": 40 }, { "epoch": 0.14539579967689822, "grad_norm": 0.530459463596344, "learning_rate": 1.6923076923076924e-05, "loss": 1.2028, "step": 45 }, { "epoch": 0.16155088852988692, "grad_norm": 0.4735831916332245, "learning_rate": 1.8846153846153846e-05, "loss": 1.2197, "step": 50 }, { "epoch": 0.1777059773828756, "grad_norm": 0.5452806949615479, "learning_rate": 2.076923076923077e-05, "loss": 1.1691, "step": 55 }, { "epoch": 0.1938610662358643, "grad_norm": 0.4458446800708771, "learning_rate": 2.269230769230769e-05, "loss": 1.17, "step": 60 }, { "epoch": 0.210016155088853, "grad_norm": 0.566159725189209, "learning_rate": 2.4615384615384616e-05, "loss": 1.1737, "step": 65 }, { "epoch": 0.22617124394184168, "grad_norm": 0.4490140974521637, "learning_rate": 2.6538461538461538e-05, "loss": 1.1417, "step": 70 }, { "epoch": 0.24232633279483037, "grad_norm": 0.45892786979675293, "learning_rate": 2.846153846153846e-05, "loss": 1.1651, "step": 75 }, { "epoch": 0.25848142164781907, "grad_norm": 0.5318244099617004, "learning_rate": 2.9999965837847747e-05, "loss": 1.1468, "step": 80 }, { "epoch": 0.27463651050080773, "grad_norm": 0.5207120776176453, "learning_rate": 2.999877017885749e-05, "loss": 1.1483, "step": 85 }, { "epoch": 0.29079159935379645, "grad_norm": 0.5204576849937439, "learning_rate": 2.9995866567858337e-05, "loss": 1.0666, "step": 90 }, { "epoch": 0.3069466882067851, "grad_norm": 0.5433797836303711, "learning_rate": 2.9991255335492612e-05, "loss": 1.0818, "step": 95 }, { "epoch": 0.32310177705977383, "grad_norm": 0.5017735958099365, "learning_rate": 2.9984937006854268e-05, "loss": 1.089, "step": 100 }, { "epoch": 0.3392568659127625, "grad_norm": 0.6111322045326233, "learning_rate": 2.9976912301429102e-05, "loss": 1.0468, "step": 105 }, { "epoch": 0.3554119547657512, "grad_norm": 0.5563653707504272, "learning_rate": 2.996718213301282e-05, "loss": 1.0671, "step": 110 }, { "epoch": 0.3715670436187399, "grad_norm": 0.7332136034965515, "learning_rate": 2.9955747609606973e-05, "loss": 1.0187, "step": 115 }, { "epoch": 0.3877221324717286, "grad_norm": 0.5513479709625244, "learning_rate": 2.9942610033292804e-05, "loss": 1.0218, "step": 120 }, { "epoch": 0.40387722132471726, "grad_norm": 0.5962786674499512, "learning_rate": 2.9927770900082956e-05, "loss": 1.0027, "step": 125 }, { "epoch": 0.420032310177706, "grad_norm": 0.5897378325462341, "learning_rate": 2.9911231899751127e-05, "loss": 0.9925, "step": 130 }, { "epoch": 0.43618739903069464, "grad_norm": 0.6131622791290283, "learning_rate": 2.9892994915639648e-05, "loss": 1.0063, "step": 135 }, { "epoch": 0.45234248788368336, "grad_norm": 0.6106935143470764, "learning_rate": 2.9873062024445035e-05, "loss": 0.9924, "step": 140 }, { "epoch": 0.46849757673667203, "grad_norm": 0.7261186838150024, "learning_rate": 2.9851435495981487e-05, "loss": 0.9812, "step": 145 }, { "epoch": 0.48465266558966075, "grad_norm": 0.6918673515319824, "learning_rate": 2.982811779292243e-05, "loss": 0.9703, "step": 150 }, { "epoch": 0.5008077544426495, "grad_norm": 0.6846053004264832, "learning_rate": 2.980311157052007e-05, "loss": 0.9455, "step": 155 }, { "epoch": 0.5169628432956381, "grad_norm": 0.7054921984672546, "learning_rate": 2.977641967630308e-05, "loss": 0.9684, "step": 160 }, { "epoch": 0.5331179321486268, "grad_norm": 0.8386259078979492, "learning_rate": 2.974804514975226e-05, "loss": 0.9483, "step": 165 }, { "epoch": 0.5492730210016155, "grad_norm": 0.8338026404380798, "learning_rate": 2.9717991221954516e-05, "loss": 0.8942, "step": 170 }, { "epoch": 0.5654281098546042, "grad_norm": 0.8031806349754333, "learning_rate": 2.9686261315234845e-05, "loss": 0.8801, "step": 175 }, { "epoch": 0.5815831987075929, "grad_norm": 0.8551956415176392, "learning_rate": 2.9652859042766687e-05, "loss": 0.9189, "step": 180 }, { "epoch": 0.5977382875605816, "grad_norm": 0.781044065952301, "learning_rate": 2.961778820816045e-05, "loss": 0.9013, "step": 185 }, { "epoch": 0.6138933764135702, "grad_norm": 0.8323260545730591, "learning_rate": 2.958105280503039e-05, "loss": 0.8815, "step": 190 }, { "epoch": 0.630048465266559, "grad_norm": 1.069571614265442, "learning_rate": 2.954265701653985e-05, "loss": 0.887, "step": 195 }, { "epoch": 0.6462035541195477, "grad_norm": 0.8414632678031921, "learning_rate": 2.9502605214924905e-05, "loss": 0.867, "step": 200 }, { "epoch": 0.6623586429725363, "grad_norm": 0.9376639127731323, "learning_rate": 2.9460901960996484e-05, "loss": 0.8239, "step": 205 }, { "epoch": 0.678513731825525, "grad_norm": 0.82874596118927, "learning_rate": 2.941755200362104e-05, "loss": 0.8348, "step": 210 }, { "epoch": 0.6946688206785138, "grad_norm": 0.8920372128486633, "learning_rate": 2.9372560279179733e-05, "loss": 0.8882, "step": 215 }, { "epoch": 0.7108239095315024, "grad_norm": 0.8290151357650757, "learning_rate": 2.932593191100637e-05, "loss": 0.783, "step": 220 }, { "epoch": 0.7269789983844911, "grad_norm": 0.9877433180809021, "learning_rate": 2.9277672208803948e-05, "loss": 0.8075, "step": 225 }, { "epoch": 0.7431340872374798, "grad_norm": 0.9012677073478699, "learning_rate": 2.922778666804006e-05, "loss": 0.8248, "step": 230 }, { "epoch": 0.7592891760904685, "grad_norm": 0.8435679078102112, "learning_rate": 2.917628096932108e-05, "loss": 0.7821, "step": 235 }, { "epoch": 0.7754442649434572, "grad_norm": 0.9440993070602417, "learning_rate": 2.912316097774531e-05, "loss": 0.8035, "step": 240 }, { "epoch": 0.7915993537964459, "grad_norm": 0.9287444353103638, "learning_rate": 2.9068432742235126e-05, "loss": 0.7929, "step": 245 }, { "epoch": 0.8077544426494345, "grad_norm": 0.9541422724723816, "learning_rate": 2.9012102494848125e-05, "loss": 0.7871, "step": 250 }, { "epoch": 0.8239095315024233, "grad_norm": 0.9758172631263733, "learning_rate": 2.8954176650067496e-05, "loss": 0.7192, "step": 255 }, { "epoch": 0.840064620355412, "grad_norm": 0.8767186999320984, "learning_rate": 2.8894661804071588e-05, "loss": 0.7646, "step": 260 }, { "epoch": 0.8562197092084006, "grad_norm": 1.0081284046173096, "learning_rate": 2.8833564733982746e-05, "loss": 0.7274, "step": 265 }, { "epoch": 0.8723747980613893, "grad_norm": 1.123024582862854, "learning_rate": 2.877089239709564e-05, "loss": 0.7234, "step": 270 }, { "epoch": 0.8885298869143781, "grad_norm": 1.1195224523544312, "learning_rate": 2.8706651930084965e-05, "loss": 0.721, "step": 275 }, { "epoch": 0.9046849757673667, "grad_norm": 1.0532171726226807, "learning_rate": 2.8640850648192795e-05, "loss": 0.7322, "step": 280 }, { "epoch": 0.9208400646203554, "grad_norm": 1.0443161725997925, "learning_rate": 2.8573496044395574e-05, "loss": 0.7272, "step": 285 }, { "epoch": 0.9369951534733441, "grad_norm": 0.9977967739105225, "learning_rate": 2.850459578855086e-05, "loss": 0.7261, "step": 290 }, { "epoch": 0.9531502423263328, "grad_norm": 0.9388869404792786, "learning_rate": 2.843415772652395e-05, "loss": 0.7133, "step": 295 }, { "epoch": 0.9693053311793215, "grad_norm": 0.9346816539764404, "learning_rate": 2.8362189879294434e-05, "loss": 0.6597, "step": 300 }, { "epoch": 0.9854604200323102, "grad_norm": 1.0388387441635132, "learning_rate": 2.828870044204284e-05, "loss": 0.6581, "step": 305 }, { "epoch": 1.0, "grad_norm": 1.5073713064193726, "learning_rate": 2.8213697783217413e-05, "loss": 0.6607, "step": 310 }, { "epoch": 1.0161550888529887, "grad_norm": 1.178043007850647, "learning_rate": 2.8137190443581175e-05, "loss": 0.6305, "step": 315 }, { "epoch": 1.0323101777059773, "grad_norm": 0.9829104542732239, "learning_rate": 2.8059187135239383e-05, "loss": 0.6241, "step": 320 }, { "epoch": 1.048465266558966, "grad_norm": 1.065422534942627, "learning_rate": 2.7979696740647433e-05, "loss": 0.6547, "step": 325 }, { "epoch": 1.0646203554119547, "grad_norm": 1.1355617046356201, "learning_rate": 2.7898728311599414e-05, "loss": 0.5667, "step": 330 }, { "epoch": 1.0807754442649435, "grad_norm": 0.9137387275695801, "learning_rate": 2.781629106819733e-05, "loss": 0.5735, "step": 335 }, { "epoch": 1.0969305331179322, "grad_norm": 1.1564706563949585, "learning_rate": 2.7732394397801196e-05, "loss": 0.5792, "step": 340 }, { "epoch": 1.1130856219709209, "grad_norm": 1.0879970788955688, "learning_rate": 2.764704785396007e-05, "loss": 0.5671, "step": 345 }, { "epoch": 1.1292407108239095, "grad_norm": 1.134347677230835, "learning_rate": 2.7560261155324176e-05, "loss": 0.5761, "step": 350 }, { "epoch": 1.1453957996768982, "grad_norm": 1.189737319946289, "learning_rate": 2.747204418453818e-05, "loss": 0.5255, "step": 355 }, { "epoch": 1.1615508885298869, "grad_norm": 1.0731488466262817, "learning_rate": 2.7382406987115863e-05, "loss": 0.5373, "step": 360 }, { "epoch": 1.1777059773828755, "grad_norm": 1.045630693435669, "learning_rate": 2.7291359770296196e-05, "loss": 0.5756, "step": 365 }, { "epoch": 1.1938610662358644, "grad_norm": 1.1830902099609375, "learning_rate": 2.7198912901881e-05, "loss": 0.4824, "step": 370 }, { "epoch": 1.210016155088853, "grad_norm": 1.1311571598052979, "learning_rate": 2.7105076909054357e-05, "loss": 0.5156, "step": 375 }, { "epoch": 1.2261712439418417, "grad_norm": 1.1190736293792725, "learning_rate": 2.7009862477183837e-05, "loss": 0.479, "step": 380 }, { "epoch": 1.2423263327948304, "grad_norm": 1.1775109767913818, "learning_rate": 2.6913280448603727e-05, "loss": 0.5447, "step": 385 }, { "epoch": 1.258481421647819, "grad_norm": 0.9826534986495972, "learning_rate": 2.6815341821380384e-05, "loss": 0.5142, "step": 390 }, { "epoch": 1.2746365105008077, "grad_norm": 1.1323788166046143, "learning_rate": 2.6716057748059867e-05, "loss": 0.5244, "step": 395 }, { "epoch": 1.2907915993537964, "grad_norm": 1.423813819885254, "learning_rate": 2.6615439534397934e-05, "loss": 0.5275, "step": 400 }, { "epoch": 1.306946688206785, "grad_norm": 1.2067975997924805, "learning_rate": 2.6513498638072664e-05, "loss": 0.5486, "step": 405 }, { "epoch": 1.3231017770597737, "grad_norm": 1.089034080505371, "learning_rate": 2.6410246667379698e-05, "loss": 0.5377, "step": 410 }, { "epoch": 1.3392568659127626, "grad_norm": 1.1521645784378052, "learning_rate": 2.630569537991042e-05, "loss": 0.5249, "step": 415 }, { "epoch": 1.3554119547657513, "grad_norm": 1.172168254852295, "learning_rate": 2.6199856681213023e-05, "loss": 0.4809, "step": 420 }, { "epoch": 1.37156704361874, "grad_norm": 1.086441159248352, "learning_rate": 2.6092742623436856e-05, "loss": 0.4989, "step": 425 }, { "epoch": 1.3877221324717286, "grad_norm": 1.279645323753357, "learning_rate": 2.5984365403959966e-05, "loss": 0.4776, "step": 430 }, { "epoch": 1.4038772213247173, "grad_norm": 1.2726421356201172, "learning_rate": 2.5874737364000167e-05, "loss": 0.4759, "step": 435 }, { "epoch": 1.420032310177706, "grad_norm": 1.104711890220642, "learning_rate": 2.5763870987209704e-05, "loss": 0.4877, "step": 440 }, { "epoch": 1.4361873990306946, "grad_norm": 1.4142299890518188, "learning_rate": 2.5651778898253718e-05, "loss": 0.494, "step": 445 }, { "epoch": 1.4523424878836835, "grad_norm": 1.147704839706421, "learning_rate": 2.5538473861372628e-05, "loss": 0.4801, "step": 450 }, { "epoch": 1.468497576736672, "grad_norm": 1.1201279163360596, "learning_rate": 2.542396877892862e-05, "loss": 0.4552, "step": 455 }, { "epoch": 1.4846526655896608, "grad_norm": 1.317685842514038, "learning_rate": 2.5308276689936454e-05, "loss": 0.4721, "step": 460 }, { "epoch": 1.5008077544426495, "grad_norm": 1.111350417137146, "learning_rate": 2.5191410768578643e-05, "loss": 0.462, "step": 465 }, { "epoch": 1.5169628432956381, "grad_norm": 1.1139695644378662, "learning_rate": 2.5073384322705278e-05, "loss": 0.4189, "step": 470 }, { "epoch": 1.5331179321486268, "grad_norm": 1.1013826131820679, "learning_rate": 2.4954210792318637e-05, "loss": 0.4015, "step": 475 }, { "epoch": 1.5492730210016155, "grad_norm": 1.0391738414764404, "learning_rate": 2.483390374804272e-05, "loss": 0.5059, "step": 480 }, { "epoch": 1.5654281098546043, "grad_norm": 1.1611058712005615, "learning_rate": 2.4712476889577944e-05, "loss": 0.4572, "step": 485 }, { "epoch": 1.5815831987075928, "grad_norm": 1.1287028789520264, "learning_rate": 2.458994404414109e-05, "loss": 0.4706, "step": 490 }, { "epoch": 1.5977382875605817, "grad_norm": 1.2190521955490112, "learning_rate": 2.4466319164890794e-05, "loss": 0.4592, "step": 495 }, { "epoch": 1.6138933764135701, "grad_norm": 1.0761996507644653, "learning_rate": 2.434161632933863e-05, "loss": 0.4168, "step": 500 }, { "epoch": 1.630048465266559, "grad_norm": 1.2677689790725708, "learning_rate": 2.4215849737746087e-05, "loss": 0.4239, "step": 505 }, { "epoch": 1.6462035541195477, "grad_norm": 1.077335238456726, "learning_rate": 2.4089033711507523e-05, "loss": 0.4223, "step": 510 }, { "epoch": 1.6623586429725363, "grad_norm": 1.092629075050354, "learning_rate": 2.3961182691519386e-05, "loss": 0.396, "step": 515 }, { "epoch": 1.678513731825525, "grad_norm": 1.0773464441299438, "learning_rate": 2.383231123653574e-05, "loss": 0.3559, "step": 520 }, { "epoch": 1.6946688206785137, "grad_norm": 1.2922345399856567, "learning_rate": 2.370243402151045e-05, "loss": 0.4258, "step": 525 }, { "epoch": 1.7108239095315025, "grad_norm": 1.1194257736206055, "learning_rate": 2.35715658359261e-05, "loss": 0.4128, "step": 530 }, { "epoch": 1.726978998384491, "grad_norm": 1.2243081331253052, "learning_rate": 2.3439721582109874e-05, "loss": 0.4641, "step": 535 }, { "epoch": 1.7431340872374799, "grad_norm": 1.1439075469970703, "learning_rate": 2.3306916273536564e-05, "loss": 0.3985, "step": 540 }, { "epoch": 1.7592891760904685, "grad_norm": 1.09407639503479, "learning_rate": 2.3173165033118983e-05, "loss": 0.3708, "step": 545 }, { "epoch": 1.7754442649434572, "grad_norm": 1.1844106912612915, "learning_rate": 2.303848309148584e-05, "loss": 0.4523, "step": 550 }, { "epoch": 1.7915993537964459, "grad_norm": 1.1165605783462524, "learning_rate": 2.2902885785247406e-05, "loss": 0.3978, "step": 555 }, { "epoch": 1.8077544426494345, "grad_norm": 1.1502916812896729, "learning_rate": 2.2766388555249087e-05, "loss": 0.3759, "step": 560 }, { "epoch": 1.8239095315024234, "grad_norm": 1.003591537475586, "learning_rate": 2.262900694481314e-05, "loss": 0.3749, "step": 565 }, { "epoch": 1.8400646203554119, "grad_norm": 1.1967657804489136, "learning_rate": 2.2490756597968663e-05, "loss": 0.375, "step": 570 }, { "epoch": 1.8562197092084007, "grad_norm": 1.1565355062484741, "learning_rate": 2.235165325767026e-05, "loss": 0.3611, "step": 575 }, { "epoch": 1.8723747980613892, "grad_norm": 1.1527678966522217, "learning_rate": 2.2211712764005245e-05, "loss": 0.4101, "step": 580 }, { "epoch": 1.888529886914378, "grad_norm": 1.2227520942687988, "learning_rate": 2.207095105238997e-05, "loss": 0.3426, "step": 585 }, { "epoch": 1.9046849757673667, "grad_norm": 1.1511895656585693, "learning_rate": 2.1929384151755138e-05, "loss": 0.3967, "step": 590 }, { "epoch": 1.9208400646203554, "grad_norm": 1.1515477895736694, "learning_rate": 2.1787028182720606e-05, "loss": 0.3717, "step": 595 }, { "epoch": 1.936995153473344, "grad_norm": 1.1370980739593506, "learning_rate": 2.1643899355759674e-05, "loss": 0.3588, "step": 600 }, { "epoch": 1.9531502423263327, "grad_norm": 1.209424376487732, "learning_rate": 2.1500013969353107e-05, "loss": 0.329, "step": 605 }, { "epoch": 1.9693053311793216, "grad_norm": 1.08406662940979, "learning_rate": 2.1355388408133254e-05, "loss": 0.3333, "step": 610 }, { "epoch": 1.98546042003231, "grad_norm": 1.0804470777511597, "learning_rate": 2.1210039141018204e-05, "loss": 0.3432, "step": 615 }, { "epoch": 2.0, "grad_norm": 1.6371866464614868, "learning_rate": 2.1063982719336497e-05, "loss": 0.3595, "step": 620 } ], "logging_steps": 5, "max_steps": 1550, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.190029403650785e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }