{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 2060, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009708737864077669, "grad_norm": 8.540091514587402, "learning_rate": 3.883495145631068e-07, "loss": 2.734, "step": 10 }, { "epoch": 0.019417475728155338, "grad_norm": 8.14619255065918, "learning_rate": 1.359223300970874e-06, "loss": 2.6711, "step": 20 }, { "epoch": 0.02912621359223301, "grad_norm": 6.858204364776611, "learning_rate": 2.330097087378641e-06, "loss": 2.6055, "step": 30 }, { "epoch": 0.038834951456310676, "grad_norm": 6.228450298309326, "learning_rate": 3.300970873786408e-06, "loss": 2.5322, "step": 40 }, { "epoch": 0.04854368932038835, "grad_norm": 5.512765407562256, "learning_rate": 4.271844660194175e-06, "loss": 2.466, "step": 50 }, { "epoch": 0.05825242718446602, "grad_norm": 5.114351272583008, "learning_rate": 5.242718446601942e-06, "loss": 2.4828, "step": 60 }, { "epoch": 0.06796116504854369, "grad_norm": 4.749820232391357, "learning_rate": 6.213592233009709e-06, "loss": 2.4529, "step": 70 }, { "epoch": 0.07766990291262135, "grad_norm": 4.962618827819824, "learning_rate": 7.184466019417476e-06, "loss": 2.3377, "step": 80 }, { "epoch": 0.08737864077669903, "grad_norm": 4.963841915130615, "learning_rate": 8.155339805825243e-06, "loss": 2.3914, "step": 90 }, { "epoch": 0.0970873786407767, "grad_norm": 5.020650386810303, "learning_rate": 9.12621359223301e-06, "loss": 2.2748, "step": 100 }, { "epoch": 0.10679611650485436, "grad_norm": 4.788837909698486, "learning_rate": 1.0097087378640778e-05, "loss": 2.2975, "step": 110 }, { "epoch": 0.11650485436893204, "grad_norm": 5.005253791809082, "learning_rate": 1.1067961165048544e-05, "loss": 2.3025, "step": 120 }, { "epoch": 0.1262135922330097, "grad_norm": 5.113918304443359, "learning_rate": 1.2038834951456311e-05, "loss": 2.2121, "step": 130 }, { "epoch": 0.13592233009708737, "grad_norm": 5.089141368865967, "learning_rate": 1.300970873786408e-05, "loss": 2.233, "step": 140 }, { "epoch": 0.14563106796116504, "grad_norm": 5.0798749923706055, "learning_rate": 1.3980582524271846e-05, "loss": 2.1688, "step": 150 }, { "epoch": 0.1553398058252427, "grad_norm": 5.230806827545166, "learning_rate": 1.4951456310679614e-05, "loss": 2.2232, "step": 160 }, { "epoch": 0.1650485436893204, "grad_norm": 5.30760383605957, "learning_rate": 1.592233009708738e-05, "loss": 2.1506, "step": 170 }, { "epoch": 0.17475728155339806, "grad_norm": 5.008656978607178, "learning_rate": 1.6893203883495145e-05, "loss": 2.1866, "step": 180 }, { "epoch": 0.18446601941747573, "grad_norm": 4.95796537399292, "learning_rate": 1.7864077669902916e-05, "loss": 2.2156, "step": 190 }, { "epoch": 0.1941747572815534, "grad_norm": 4.717769145965576, "learning_rate": 1.883495145631068e-05, "loss": 2.164, "step": 200 }, { "epoch": 0.20388349514563106, "grad_norm": 4.858338832855225, "learning_rate": 1.9805825242718447e-05, "loss": 2.1291, "step": 210 }, { "epoch": 0.21359223300970873, "grad_norm": 5.218167781829834, "learning_rate": 1.9913700107874866e-05, "loss": 2.1617, "step": 220 }, { "epoch": 0.22330097087378642, "grad_norm": 5.097916126251221, "learning_rate": 1.9805825242718447e-05, "loss": 2.1721, "step": 230 }, { "epoch": 0.23300970873786409, "grad_norm": 5.860560417175293, "learning_rate": 1.969795037756203e-05, "loss": 2.1412, "step": 240 }, { "epoch": 0.24271844660194175, "grad_norm": 5.395883560180664, "learning_rate": 1.959007551240561e-05, "loss": 2.1397, "step": 250 }, { "epoch": 0.2524271844660194, "grad_norm": 5.043527126312256, "learning_rate": 1.9482200647249193e-05, "loss": 2.1314, "step": 260 }, { "epoch": 0.2621359223300971, "grad_norm": 4.853712558746338, "learning_rate": 1.9374325782092775e-05, "loss": 2.118, "step": 270 }, { "epoch": 0.27184466019417475, "grad_norm": 5.681634902954102, "learning_rate": 1.9266450916936353e-05, "loss": 2.1189, "step": 280 }, { "epoch": 0.2815533980582524, "grad_norm": 5.401227951049805, "learning_rate": 1.9158576051779935e-05, "loss": 2.1148, "step": 290 }, { "epoch": 0.2912621359223301, "grad_norm": 5.208418369293213, "learning_rate": 1.905070118662352e-05, "loss": 2.128, "step": 300 }, { "epoch": 0.30097087378640774, "grad_norm": 5.307718276977539, "learning_rate": 1.89428263214671e-05, "loss": 2.0858, "step": 310 }, { "epoch": 0.3106796116504854, "grad_norm": 5.279410362243652, "learning_rate": 1.883495145631068e-05, "loss": 2.1371, "step": 320 }, { "epoch": 0.32038834951456313, "grad_norm": 5.151274681091309, "learning_rate": 1.8727076591154262e-05, "loss": 2.0932, "step": 330 }, { "epoch": 0.3300970873786408, "grad_norm": 5.083354473114014, "learning_rate": 1.8619201725997844e-05, "loss": 2.0623, "step": 340 }, { "epoch": 0.33980582524271846, "grad_norm": 5.1322550773620605, "learning_rate": 1.8511326860841425e-05, "loss": 2.05, "step": 350 }, { "epoch": 0.34951456310679613, "grad_norm": 4.970919609069824, "learning_rate": 1.8403451995685007e-05, "loss": 2.0901, "step": 360 }, { "epoch": 0.3592233009708738, "grad_norm": 5.15512752532959, "learning_rate": 1.829557713052859e-05, "loss": 2.0883, "step": 370 }, { "epoch": 0.36893203883495146, "grad_norm": 5.088575839996338, "learning_rate": 1.818770226537217e-05, "loss": 2.1119, "step": 380 }, { "epoch": 0.3786407766990291, "grad_norm": 6.092918872833252, "learning_rate": 1.807982740021575e-05, "loss": 2.0979, "step": 390 }, { "epoch": 0.3883495145631068, "grad_norm": 4.909801483154297, "learning_rate": 1.797195253505933e-05, "loss": 2.0283, "step": 400 }, { "epoch": 0.39805825242718446, "grad_norm": 5.128530025482178, "learning_rate": 1.7864077669902916e-05, "loss": 2.023, "step": 410 }, { "epoch": 0.4077669902912621, "grad_norm": 4.998912811279297, "learning_rate": 1.7756202804746498e-05, "loss": 2.0207, "step": 420 }, { "epoch": 0.4174757281553398, "grad_norm": 5.182358264923096, "learning_rate": 1.7648327939590076e-05, "loss": 1.9837, "step": 430 }, { "epoch": 0.42718446601941745, "grad_norm": 5.3191022872924805, "learning_rate": 1.7540453074433658e-05, "loss": 2.058, "step": 440 }, { "epoch": 0.4368932038834951, "grad_norm": 5.306585788726807, "learning_rate": 1.743257820927724e-05, "loss": 2.0832, "step": 450 }, { "epoch": 0.44660194174757284, "grad_norm": 5.278446197509766, "learning_rate": 1.732470334412082e-05, "loss": 2.0594, "step": 460 }, { "epoch": 0.4563106796116505, "grad_norm": 5.484086990356445, "learning_rate": 1.7216828478964403e-05, "loss": 2.0763, "step": 470 }, { "epoch": 0.46601941747572817, "grad_norm": 5.767387866973877, "learning_rate": 1.7108953613807985e-05, "loss": 2.0634, "step": 480 }, { "epoch": 0.47572815533980584, "grad_norm": 4.96846342086792, "learning_rate": 1.7001078748651563e-05, "loss": 2.0769, "step": 490 }, { "epoch": 0.4854368932038835, "grad_norm": 5.264239311218262, "learning_rate": 1.6893203883495145e-05, "loss": 2.0604, "step": 500 }, { "epoch": 0.49514563106796117, "grad_norm": 5.036663055419922, "learning_rate": 1.6785329018338727e-05, "loss": 2.1031, "step": 510 }, { "epoch": 0.5048543689320388, "grad_norm": 4.875285625457764, "learning_rate": 1.6677454153182312e-05, "loss": 2.0457, "step": 520 }, { "epoch": 0.5145631067961165, "grad_norm": 4.933873653411865, "learning_rate": 1.6569579288025894e-05, "loss": 2.0312, "step": 530 }, { "epoch": 0.5242718446601942, "grad_norm": 5.284345626831055, "learning_rate": 1.6461704422869472e-05, "loss": 2.0656, "step": 540 }, { "epoch": 0.5339805825242718, "grad_norm": 5.3404998779296875, "learning_rate": 1.6353829557713054e-05, "loss": 2.1049, "step": 550 }, { "epoch": 0.5436893203883495, "grad_norm": 5.243639945983887, "learning_rate": 1.6245954692556636e-05, "loss": 2.0382, "step": 560 }, { "epoch": 0.5533980582524272, "grad_norm": 5.110634803771973, "learning_rate": 1.6138079827400217e-05, "loss": 2.004, "step": 570 }, { "epoch": 0.5631067961165048, "grad_norm": 5.063004493713379, "learning_rate": 1.60302049622438e-05, "loss": 2.0207, "step": 580 }, { "epoch": 0.5728155339805825, "grad_norm": 4.7647271156311035, "learning_rate": 1.592233009708738e-05, "loss": 2.0534, "step": 590 }, { "epoch": 0.5825242718446602, "grad_norm": 5.176267147064209, "learning_rate": 1.581445523193096e-05, "loss": 2.0627, "step": 600 }, { "epoch": 0.5922330097087378, "grad_norm": 5.2062225341796875, "learning_rate": 1.570658036677454e-05, "loss": 2.0063, "step": 610 }, { "epoch": 0.6019417475728155, "grad_norm": 5.044838905334473, "learning_rate": 1.5598705501618123e-05, "loss": 1.9475, "step": 620 }, { "epoch": 0.6116504854368932, "grad_norm": 5.029117584228516, "learning_rate": 1.5490830636461708e-05, "loss": 2.0673, "step": 630 }, { "epoch": 0.6213592233009708, "grad_norm": 5.190179347991943, "learning_rate": 1.5382955771305286e-05, "loss": 2.0176, "step": 640 }, { "epoch": 0.6310679611650486, "grad_norm": 4.956365585327148, "learning_rate": 1.5275080906148868e-05, "loss": 1.9984, "step": 650 }, { "epoch": 0.6407766990291263, "grad_norm": 4.972413539886475, "learning_rate": 1.516720604099245e-05, "loss": 1.9919, "step": 660 }, { "epoch": 0.6504854368932039, "grad_norm": 5.319215297698975, "learning_rate": 1.5059331175836032e-05, "loss": 1.984, "step": 670 }, { "epoch": 0.6601941747572816, "grad_norm": 5.120510578155518, "learning_rate": 1.4951456310679614e-05, "loss": 1.9838, "step": 680 }, { "epoch": 0.6699029126213593, "grad_norm": 4.868938446044922, "learning_rate": 1.4843581445523194e-05, "loss": 2.006, "step": 690 }, { "epoch": 0.6796116504854369, "grad_norm": 5.22821044921875, "learning_rate": 1.4735706580366775e-05, "loss": 2.0099, "step": 700 }, { "epoch": 0.6893203883495146, "grad_norm": 4.970730781555176, "learning_rate": 1.4627831715210357e-05, "loss": 2.0078, "step": 710 }, { "epoch": 0.6990291262135923, "grad_norm": 4.913213729858398, "learning_rate": 1.4519956850053937e-05, "loss": 2.0198, "step": 720 }, { "epoch": 0.7087378640776699, "grad_norm": 5.104898452758789, "learning_rate": 1.4412081984897519e-05, "loss": 2.0224, "step": 730 }, { "epoch": 0.7184466019417476, "grad_norm": 4.992263317108154, "learning_rate": 1.4304207119741102e-05, "loss": 2.0013, "step": 740 }, { "epoch": 0.7281553398058253, "grad_norm": 5.0994038581848145, "learning_rate": 1.4196332254584684e-05, "loss": 1.9542, "step": 750 }, { "epoch": 0.7378640776699029, "grad_norm": 5.849913120269775, "learning_rate": 1.4088457389428264e-05, "loss": 1.9941, "step": 760 }, { "epoch": 0.7475728155339806, "grad_norm": 5.09085750579834, "learning_rate": 1.3980582524271846e-05, "loss": 2.0384, "step": 770 }, { "epoch": 0.7572815533980582, "grad_norm": 5.28529167175293, "learning_rate": 1.3872707659115428e-05, "loss": 2.0251, "step": 780 }, { "epoch": 0.7669902912621359, "grad_norm": 5.162165641784668, "learning_rate": 1.3764832793959008e-05, "loss": 1.9806, "step": 790 }, { "epoch": 0.7766990291262136, "grad_norm": 5.865965843200684, "learning_rate": 1.365695792880259e-05, "loss": 2.0359, "step": 800 }, { "epoch": 0.7864077669902912, "grad_norm": 4.936879634857178, "learning_rate": 1.3549083063646171e-05, "loss": 1.993, "step": 810 }, { "epoch": 0.7961165048543689, "grad_norm": 5.331514358520508, "learning_rate": 1.3441208198489753e-05, "loss": 1.9685, "step": 820 }, { "epoch": 0.8058252427184466, "grad_norm": 5.171398639678955, "learning_rate": 1.3333333333333333e-05, "loss": 1.9885, "step": 830 }, { "epoch": 0.8155339805825242, "grad_norm": 4.853579998016357, "learning_rate": 1.3225458468176915e-05, "loss": 1.9818, "step": 840 }, { "epoch": 0.8252427184466019, "grad_norm": 5.196751117706299, "learning_rate": 1.3117583603020499e-05, "loss": 1.946, "step": 850 }, { "epoch": 0.8349514563106796, "grad_norm": 4.931100845336914, "learning_rate": 1.300970873786408e-05, "loss": 1.9898, "step": 860 }, { "epoch": 0.8446601941747572, "grad_norm": 5.232204437255859, "learning_rate": 1.290183387270766e-05, "loss": 2.0314, "step": 870 }, { "epoch": 0.8543689320388349, "grad_norm": 5.175143718719482, "learning_rate": 1.2793959007551242e-05, "loss": 1.9958, "step": 880 }, { "epoch": 0.8640776699029126, "grad_norm": 5.501524925231934, "learning_rate": 1.2686084142394824e-05, "loss": 1.969, "step": 890 }, { "epoch": 0.8737864077669902, "grad_norm": 5.200106620788574, "learning_rate": 1.2578209277238404e-05, "loss": 1.9624, "step": 900 }, { "epoch": 0.883495145631068, "grad_norm": 5.435555934906006, "learning_rate": 1.2470334412081986e-05, "loss": 2.0057, "step": 910 }, { "epoch": 0.8932038834951457, "grad_norm": 5.041926860809326, "learning_rate": 1.2362459546925568e-05, "loss": 1.9604, "step": 920 }, { "epoch": 0.9029126213592233, "grad_norm": 5.86530065536499, "learning_rate": 1.2254584681769148e-05, "loss": 1.9904, "step": 930 }, { "epoch": 0.912621359223301, "grad_norm": 5.039781093597412, "learning_rate": 1.214670981661273e-05, "loss": 1.9681, "step": 940 }, { "epoch": 0.9223300970873787, "grad_norm": 5.195461273193359, "learning_rate": 1.2038834951456311e-05, "loss": 2.0042, "step": 950 }, { "epoch": 0.9320388349514563, "grad_norm": 5.229151725769043, "learning_rate": 1.1930960086299891e-05, "loss": 2.0039, "step": 960 }, { "epoch": 0.941747572815534, "grad_norm": 4.973319053649902, "learning_rate": 1.1823085221143475e-05, "loss": 2.0065, "step": 970 }, { "epoch": 0.9514563106796117, "grad_norm": 5.273900032043457, "learning_rate": 1.1715210355987056e-05, "loss": 1.9896, "step": 980 }, { "epoch": 0.9611650485436893, "grad_norm": 5.269063472747803, "learning_rate": 1.1607335490830638e-05, "loss": 1.9592, "step": 990 }, { "epoch": 0.970873786407767, "grad_norm": 5.100589752197266, "learning_rate": 1.149946062567422e-05, "loss": 1.9354, "step": 1000 }, { "epoch": 0.9805825242718447, "grad_norm": 5.232608318328857, "learning_rate": 1.1413160733549084e-05, "loss": 2.0395, "step": 1010 }, { "epoch": 0.9902912621359223, "grad_norm": 5.08132791519165, "learning_rate": 1.1305285868392666e-05, "loss": 1.9437, "step": 1020 }, { "epoch": 1.0, "grad_norm": 5.696261405944824, "learning_rate": 1.1197411003236248e-05, "loss": 1.9665, "step": 1030 }, { "epoch": 1.0097087378640777, "grad_norm": 5.359734058380127, "learning_rate": 1.1089536138079828e-05, "loss": 1.841, "step": 1040 }, { "epoch": 1.0194174757281553, "grad_norm": 5.353382587432861, "learning_rate": 1.098166127292341e-05, "loss": 1.8354, "step": 1050 }, { "epoch": 1.029126213592233, "grad_norm": 5.595281600952148, "learning_rate": 1.0873786407766991e-05, "loss": 1.8011, "step": 1060 }, { "epoch": 1.0388349514563107, "grad_norm": 5.603349685668945, "learning_rate": 1.0765911542610571e-05, "loss": 1.8443, "step": 1070 }, { "epoch": 1.0485436893203883, "grad_norm": 5.549112319946289, "learning_rate": 1.0658036677454153e-05, "loss": 1.8022, "step": 1080 }, { "epoch": 1.058252427184466, "grad_norm": 5.524362087249756, "learning_rate": 1.0550161812297735e-05, "loss": 1.8361, "step": 1090 }, { "epoch": 1.0679611650485437, "grad_norm": 5.287837028503418, "learning_rate": 1.0442286947141318e-05, "loss": 1.7831, "step": 1100 }, { "epoch": 1.0776699029126213, "grad_norm": 5.503448486328125, "learning_rate": 1.03344120819849e-05, "loss": 1.816, "step": 1110 }, { "epoch": 1.087378640776699, "grad_norm": 5.447495460510254, "learning_rate": 1.022653721682848e-05, "loss": 1.7967, "step": 1120 }, { "epoch": 1.0970873786407767, "grad_norm": 5.651370525360107, "learning_rate": 1.0118662351672062e-05, "loss": 1.767, "step": 1130 }, { "epoch": 1.1067961165048543, "grad_norm": 5.455685138702393, "learning_rate": 1.0010787486515644e-05, "loss": 1.7986, "step": 1140 }, { "epoch": 1.116504854368932, "grad_norm": 5.773884296417236, "learning_rate": 9.902912621359224e-06, "loss": 1.8061, "step": 1150 }, { "epoch": 1.1262135922330097, "grad_norm": 5.654228687286377, "learning_rate": 9.795037756202806e-06, "loss": 1.8051, "step": 1160 }, { "epoch": 1.1359223300970873, "grad_norm": 5.353296756744385, "learning_rate": 9.687162891046387e-06, "loss": 1.8157, "step": 1170 }, { "epoch": 1.145631067961165, "grad_norm": 5.371259689331055, "learning_rate": 9.579288025889967e-06, "loss": 1.7957, "step": 1180 }, { "epoch": 1.1553398058252426, "grad_norm": 5.551179885864258, "learning_rate": 9.47141316073355e-06, "loss": 1.8099, "step": 1190 }, { "epoch": 1.1650485436893203, "grad_norm": 5.763803958892822, "learning_rate": 9.363538295577131e-06, "loss": 1.8243, "step": 1200 }, { "epoch": 1.174757281553398, "grad_norm": 5.4469313621521, "learning_rate": 9.255663430420713e-06, "loss": 1.7813, "step": 1210 }, { "epoch": 1.1844660194174756, "grad_norm": 5.914862155914307, "learning_rate": 9.147788565264294e-06, "loss": 1.8308, "step": 1220 }, { "epoch": 1.1941747572815533, "grad_norm": 5.619472980499268, "learning_rate": 9.039913700107874e-06, "loss": 1.8566, "step": 1230 }, { "epoch": 1.203883495145631, "grad_norm": 5.747879981994629, "learning_rate": 8.932038834951458e-06, "loss": 1.831, "step": 1240 }, { "epoch": 1.2135922330097086, "grad_norm": 5.515039443969727, "learning_rate": 8.824163969795038e-06, "loss": 1.8279, "step": 1250 }, { "epoch": 1.2233009708737863, "grad_norm": 5.6780171394348145, "learning_rate": 8.71628910463862e-06, "loss": 1.7496, "step": 1260 }, { "epoch": 1.233009708737864, "grad_norm": 5.678586006164551, "learning_rate": 8.608414239482202e-06, "loss": 1.7862, "step": 1270 }, { "epoch": 1.2427184466019416, "grad_norm": 5.727756977081299, "learning_rate": 8.500539374325782e-06, "loss": 1.8364, "step": 1280 }, { "epoch": 1.2524271844660193, "grad_norm": 5.593883037567139, "learning_rate": 8.392664509169363e-06, "loss": 1.7775, "step": 1290 }, { "epoch": 1.262135922330097, "grad_norm": 5.856795310974121, "learning_rate": 8.284789644012947e-06, "loss": 1.8652, "step": 1300 }, { "epoch": 1.2718446601941746, "grad_norm": 5.66147518157959, "learning_rate": 8.176914778856527e-06, "loss": 1.7747, "step": 1310 }, { "epoch": 1.2815533980582523, "grad_norm": 5.7044291496276855, "learning_rate": 8.069039913700109e-06, "loss": 1.7781, "step": 1320 }, { "epoch": 1.29126213592233, "grad_norm": 5.8010149002075195, "learning_rate": 7.96116504854369e-06, "loss": 1.812, "step": 1330 }, { "epoch": 1.3009708737864076, "grad_norm": 5.601301670074463, "learning_rate": 7.85329018338727e-06, "loss": 1.8109, "step": 1340 }, { "epoch": 1.3106796116504853, "grad_norm": 5.59577751159668, "learning_rate": 7.745415318230854e-06, "loss": 1.794, "step": 1350 }, { "epoch": 1.3203883495145632, "grad_norm": 6.064187526702881, "learning_rate": 7.637540453074434e-06, "loss": 1.7978, "step": 1360 }, { "epoch": 1.3300970873786409, "grad_norm": 5.477755069732666, "learning_rate": 7.529665587918016e-06, "loss": 1.7732, "step": 1370 }, { "epoch": 1.3398058252427185, "grad_norm": 5.672438144683838, "learning_rate": 7.421790722761597e-06, "loss": 1.8356, "step": 1380 }, { "epoch": 1.3495145631067962, "grad_norm": 5.968810558319092, "learning_rate": 7.3139158576051786e-06, "loss": 1.8513, "step": 1390 }, { "epoch": 1.3592233009708738, "grad_norm": 5.984207630157471, "learning_rate": 7.2060409924487595e-06, "loss": 1.7821, "step": 1400 }, { "epoch": 1.3689320388349515, "grad_norm": 6.006514072418213, "learning_rate": 7.098166127292342e-06, "loss": 1.756, "step": 1410 }, { "epoch": 1.3786407766990292, "grad_norm": 5.548986911773682, "learning_rate": 6.990291262135923e-06, "loss": 1.8, "step": 1420 }, { "epoch": 1.3883495145631068, "grad_norm": 5.688983917236328, "learning_rate": 6.882416396979504e-06, "loss": 1.7601, "step": 1430 }, { "epoch": 1.3980582524271845, "grad_norm": 5.5633225440979, "learning_rate": 6.774541531823086e-06, "loss": 1.7876, "step": 1440 }, { "epoch": 1.4077669902912622, "grad_norm": 5.3327226638793945, "learning_rate": 6.666666666666667e-06, "loss": 1.7388, "step": 1450 }, { "epoch": 1.4174757281553398, "grad_norm": 5.563536643981934, "learning_rate": 6.558791801510249e-06, "loss": 1.7894, "step": 1460 }, { "epoch": 1.4271844660194175, "grad_norm": 5.413880825042725, "learning_rate": 6.45091693635383e-06, "loss": 1.8045, "step": 1470 }, { "epoch": 1.4368932038834952, "grad_norm": 5.559710502624512, "learning_rate": 6.343042071197412e-06, "loss": 1.7388, "step": 1480 }, { "epoch": 1.4466019417475728, "grad_norm": 5.674643039703369, "learning_rate": 6.235167206040993e-06, "loss": 1.7521, "step": 1490 }, { "epoch": 1.4563106796116505, "grad_norm": 6.015341758728027, "learning_rate": 6.127292340884574e-06, "loss": 1.7567, "step": 1500 }, { "epoch": 1.4660194174757282, "grad_norm": 5.763010025024414, "learning_rate": 6.0194174757281556e-06, "loss": 1.7494, "step": 1510 }, { "epoch": 1.4757281553398058, "grad_norm": 5.727349758148193, "learning_rate": 5.911542610571737e-06, "loss": 1.8048, "step": 1520 }, { "epoch": 1.4854368932038835, "grad_norm": 5.473784923553467, "learning_rate": 5.803667745415319e-06, "loss": 1.7469, "step": 1530 }, { "epoch": 1.4951456310679612, "grad_norm": 5.847958087921143, "learning_rate": 5.6957928802589e-06, "loss": 1.7803, "step": 1540 }, { "epoch": 1.5048543689320388, "grad_norm": 6.08969259262085, "learning_rate": 5.587918015102482e-06, "loss": 1.796, "step": 1550 }, { "epoch": 1.5145631067961165, "grad_norm": 5.455092430114746, "learning_rate": 5.480043149946063e-06, "loss": 1.7495, "step": 1560 }, { "epoch": 1.5242718446601942, "grad_norm": 5.9275031089782715, "learning_rate": 5.372168284789644e-06, "loss": 1.814, "step": 1570 }, { "epoch": 1.5339805825242718, "grad_norm": 5.613204002380371, "learning_rate": 5.264293419633226e-06, "loss": 1.8095, "step": 1580 }, { "epoch": 1.5436893203883495, "grad_norm": 5.575292110443115, "learning_rate": 5.156418554476807e-06, "loss": 1.794, "step": 1590 }, { "epoch": 1.5533980582524272, "grad_norm": 6.1768107414245605, "learning_rate": 5.048543689320389e-06, "loss": 1.7858, "step": 1600 }, { "epoch": 1.5631067961165048, "grad_norm": 5.83579158782959, "learning_rate": 4.94066882416397e-06, "loss": 1.7229, "step": 1610 }, { "epoch": 1.5728155339805825, "grad_norm": 5.574371814727783, "learning_rate": 4.832793959007552e-06, "loss": 1.784, "step": 1620 }, { "epoch": 1.5825242718446602, "grad_norm": 5.452093124389648, "learning_rate": 4.724919093851133e-06, "loss": 1.7639, "step": 1630 }, { "epoch": 1.5922330097087378, "grad_norm": 5.442083358764648, "learning_rate": 4.617044228694714e-06, "loss": 1.8157, "step": 1640 }, { "epoch": 1.6019417475728155, "grad_norm": 5.7211079597473145, "learning_rate": 4.509169363538296e-06, "loss": 1.7338, "step": 1650 }, { "epoch": 1.6116504854368932, "grad_norm": 6.113297462463379, "learning_rate": 4.401294498381877e-06, "loss": 1.7044, "step": 1660 }, { "epoch": 1.6213592233009708, "grad_norm": 5.693146705627441, "learning_rate": 4.293419633225459e-06, "loss": 1.7722, "step": 1670 }, { "epoch": 1.6310679611650487, "grad_norm": 5.956842422485352, "learning_rate": 4.1855447680690406e-06, "loss": 1.7618, "step": 1680 }, { "epoch": 1.6407766990291264, "grad_norm": 5.646125793457031, "learning_rate": 4.0776699029126215e-06, "loss": 1.7872, "step": 1690 }, { "epoch": 1.650485436893204, "grad_norm": 5.913788795471191, "learning_rate": 3.969795037756203e-06, "loss": 1.7913, "step": 1700 }, { "epoch": 1.6601941747572817, "grad_norm": 6.05329704284668, "learning_rate": 3.861920172599784e-06, "loss": 1.7771, "step": 1710 }, { "epoch": 1.6699029126213594, "grad_norm": 6.139546871185303, "learning_rate": 3.754045307443366e-06, "loss": 1.7966, "step": 1720 }, { "epoch": 1.679611650485437, "grad_norm": 6.158768653869629, "learning_rate": 3.6461704422869477e-06, "loss": 1.8097, "step": 1730 }, { "epoch": 1.6893203883495147, "grad_norm": 5.726659774780273, "learning_rate": 3.5382955771305286e-06, "loss": 1.7568, "step": 1740 }, { "epoch": 1.6990291262135924, "grad_norm": 5.508258819580078, "learning_rate": 3.43042071197411e-06, "loss": 1.8406, "step": 1750 }, { "epoch": 1.70873786407767, "grad_norm": 6.076147556304932, "learning_rate": 3.3225458468176918e-06, "loss": 1.7246, "step": 1760 }, { "epoch": 1.7184466019417477, "grad_norm": 5.596787452697754, "learning_rate": 3.214670981661273e-06, "loss": 1.8189, "step": 1770 }, { "epoch": 1.7281553398058254, "grad_norm": 6.001366138458252, "learning_rate": 3.1067961165048544e-06, "loss": 1.7681, "step": 1780 }, { "epoch": 1.737864077669903, "grad_norm": 5.889746189117432, "learning_rate": 2.9989212513484362e-06, "loss": 1.7634, "step": 1790 }, { "epoch": 1.7475728155339807, "grad_norm": 5.484528541564941, "learning_rate": 2.8910463861920176e-06, "loss": 1.7928, "step": 1800 }, { "epoch": 1.7572815533980584, "grad_norm": 5.366055011749268, "learning_rate": 2.7831715210355993e-06, "loss": 1.7284, "step": 1810 }, { "epoch": 1.766990291262136, "grad_norm": 6.179636478424072, "learning_rate": 2.6752966558791803e-06, "loss": 1.7973, "step": 1820 }, { "epoch": 1.7766990291262137, "grad_norm": 5.786418437957764, "learning_rate": 2.5674217907227616e-06, "loss": 1.7271, "step": 1830 }, { "epoch": 1.7864077669902914, "grad_norm": 5.728253364562988, "learning_rate": 2.4595469255663434e-06, "loss": 1.7947, "step": 1840 }, { "epoch": 1.796116504854369, "grad_norm": 5.840207576751709, "learning_rate": 2.3516720604099247e-06, "loss": 1.8411, "step": 1850 }, { "epoch": 1.8058252427184467, "grad_norm": 6.026117324829102, "learning_rate": 2.243797195253506e-06, "loss": 1.7801, "step": 1860 }, { "epoch": 1.8155339805825244, "grad_norm": 5.574731826782227, "learning_rate": 2.1359223300970874e-06, "loss": 1.8296, "step": 1870 }, { "epoch": 1.825242718446602, "grad_norm": 5.741345405578613, "learning_rate": 2.0280474649406688e-06, "loss": 1.8048, "step": 1880 }, { "epoch": 1.8349514563106797, "grad_norm": 5.989925384521484, "learning_rate": 1.9201725997842505e-06, "loss": 1.7559, "step": 1890 }, { "epoch": 1.8446601941747574, "grad_norm": 5.998227119445801, "learning_rate": 1.812297734627832e-06, "loss": 1.7678, "step": 1900 }, { "epoch": 1.854368932038835, "grad_norm": 5.9274420738220215, "learning_rate": 1.7044228694714132e-06, "loss": 1.7886, "step": 1910 }, { "epoch": 1.8640776699029127, "grad_norm": 5.719155788421631, "learning_rate": 1.5965480043149948e-06, "loss": 1.7921, "step": 1920 }, { "epoch": 1.8737864077669903, "grad_norm": 5.4220147132873535, "learning_rate": 1.4886731391585763e-06, "loss": 1.7813, "step": 1930 }, { "epoch": 1.883495145631068, "grad_norm": 5.772354602813721, "learning_rate": 1.3807982740021575e-06, "loss": 1.7606, "step": 1940 }, { "epoch": 1.8932038834951457, "grad_norm": 5.864536762237549, "learning_rate": 1.272923408845739e-06, "loss": 1.7354, "step": 1950 }, { "epoch": 1.9029126213592233, "grad_norm": 5.453779697418213, "learning_rate": 1.1650485436893206e-06, "loss": 1.7403, "step": 1960 }, { "epoch": 1.912621359223301, "grad_norm": 5.6637492179870605, "learning_rate": 1.057173678532902e-06, "loss": 1.7649, "step": 1970 }, { "epoch": 1.9223300970873787, "grad_norm": 5.886834621429443, "learning_rate": 9.492988133764834e-07, "loss": 1.8095, "step": 1980 }, { "epoch": 1.9320388349514563, "grad_norm": 6.255226135253906, "learning_rate": 8.414239482200648e-07, "loss": 1.7987, "step": 1990 }, { "epoch": 1.941747572815534, "grad_norm": 5.694814205169678, "learning_rate": 7.335490830636462e-07, "loss": 1.7715, "step": 2000 }, { "epoch": 1.9514563106796117, "grad_norm": 5.4670257568359375, "learning_rate": 6.256742179072277e-07, "loss": 1.737, "step": 2010 }, { "epoch": 1.9611650485436893, "grad_norm": 5.776082992553711, "learning_rate": 5.393743257820928e-07, "loss": 1.7518, "step": 2020 }, { "epoch": 1.970873786407767, "grad_norm": 5.826039791107178, "learning_rate": 4.314994606256743e-07, "loss": 1.8036, "step": 2030 }, { "epoch": 1.9805825242718447, "grad_norm": 5.991348743438721, "learning_rate": 3.2362459546925565e-07, "loss": 1.756, "step": 2040 }, { "epoch": 1.9902912621359223, "grad_norm": 5.450629234313965, "learning_rate": 2.1574973031283715e-07, "loss": 1.7953, "step": 2050 }, { "epoch": 2.0, "grad_norm": 5.462096214294434, "learning_rate": 1.0787486515641857e-07, "loss": 1.7721, "step": 2060 } ], "logging_steps": 10, "max_steps": 2060, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }