{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992103185048697, "eval_steps": 500, "global_step": 949, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010529086601737299, "grad_norm": 4.026114469450001, "learning_rate": 2.105263157894737e-06, "loss": 1.3755, "step": 1 }, { "epoch": 0.0052645433008686494, "grad_norm": 1.1251945428540173, "learning_rate": 1.0526315789473684e-05, "loss": 1.3102, "step": 5 }, { "epoch": 0.010529086601737299, "grad_norm": 0.563945023031292, "learning_rate": 2.105263157894737e-05, "loss": 1.2626, "step": 10 }, { "epoch": 0.01579362990260595, "grad_norm": 0.4336837922055097, "learning_rate": 3.157894736842105e-05, "loss": 1.2133, "step": 15 }, { "epoch": 0.021058173203474598, "grad_norm": 0.33707427690007363, "learning_rate": 4.210526315789474e-05, "loss": 1.1636, "step": 20 }, { "epoch": 0.026322716504343247, "grad_norm": 0.2412443925117212, "learning_rate": 5.2631578947368424e-05, "loss": 1.1845, "step": 25 }, { "epoch": 0.0315872598052119, "grad_norm": 0.2414606698915264, "learning_rate": 6.31578947368421e-05, "loss": 1.1401, "step": 30 }, { "epoch": 0.03685180310608055, "grad_norm": 0.22628642837844729, "learning_rate": 7.368421052631579e-05, "loss": 1.1591, "step": 35 }, { "epoch": 0.042116346406949196, "grad_norm": 0.2208133931977146, "learning_rate": 8.421052631578948e-05, "loss": 1.1397, "step": 40 }, { "epoch": 0.04738088970781785, "grad_norm": 0.21905054641153135, "learning_rate": 9.473684210526316e-05, "loss": 1.1295, "step": 45 }, { "epoch": 0.052645433008686494, "grad_norm": 0.1951061692287286, "learning_rate": 0.00010526315789473685, "loss": 1.141, "step": 50 }, { "epoch": 0.05790997630955515, "grad_norm": 0.18792729468212402, "learning_rate": 0.00011578947368421053, "loss": 1.121, "step": 55 }, { "epoch": 0.0631745196104238, "grad_norm": 0.189919407852987, "learning_rate": 0.0001263157894736842, "loss": 1.1347, "step": 60 }, { "epoch": 0.06843906291129244, "grad_norm": 0.18793881851552416, "learning_rate": 0.0001368421052631579, "loss": 1.0961, "step": 65 }, { "epoch": 0.0737036062121611, "grad_norm": 0.18263188517758086, "learning_rate": 0.00014736842105263158, "loss": 1.0937, "step": 70 }, { "epoch": 0.07896814951302975, "grad_norm": 0.18520098125405152, "learning_rate": 0.00015789473684210527, "loss": 1.1419, "step": 75 }, { "epoch": 0.08423269281389839, "grad_norm": 0.18675524775004465, "learning_rate": 0.00016842105263157895, "loss": 1.1094, "step": 80 }, { "epoch": 0.08949723611476705, "grad_norm": 0.18469057661828525, "learning_rate": 0.00017894736842105264, "loss": 1.0952, "step": 85 }, { "epoch": 0.0947617794156357, "grad_norm": 0.17860571701450936, "learning_rate": 0.00018947368421052632, "loss": 1.1035, "step": 90 }, { "epoch": 0.10002632271650434, "grad_norm": 0.2032976356381528, "learning_rate": 0.0002, "loss": 1.1329, "step": 95 }, { "epoch": 0.10529086601737299, "grad_norm": 0.18932762375677964, "learning_rate": 0.0001999830846194422, "loss": 1.0902, "step": 100 }, { "epoch": 0.11055540931824165, "grad_norm": 0.17823414518835126, "learning_rate": 0.00019993234420037073, "loss": 1.0951, "step": 105 }, { "epoch": 0.1158199526191103, "grad_norm": 0.19033211796864122, "learning_rate": 0.00019984779590865556, "loss": 1.11, "step": 110 }, { "epoch": 0.12108449591997894, "grad_norm": 0.1781090004150184, "learning_rate": 0.0001997294683476273, "loss": 1.1216, "step": 115 }, { "epoch": 0.1263490392208476, "grad_norm": 0.20142566240295628, "learning_rate": 0.0001995774015484005, "loss": 1.088, "step": 120 }, { "epoch": 0.13161358252171623, "grad_norm": 0.16738672746077932, "learning_rate": 0.00019939164695633067, "loss": 1.1069, "step": 125 }, { "epoch": 0.13687812582258488, "grad_norm": 0.17141306079033702, "learning_rate": 0.00019917226741361015, "loss": 1.1178, "step": 130 }, { "epoch": 0.14214266912345355, "grad_norm": 0.18242919862111662, "learning_rate": 0.00019891933713800798, "loss": 1.115, "step": 135 }, { "epoch": 0.1474072124243222, "grad_norm": 0.18858703761293544, "learning_rate": 0.00019863294169776148, "loss": 1.092, "step": 140 }, { "epoch": 0.15267175572519084, "grad_norm": 0.1851910906506613, "learning_rate": 0.00019831317798262786, "loss": 1.1015, "step": 145 }, { "epoch": 0.1579362990260595, "grad_norm": 0.17061718616532065, "learning_rate": 0.00019796015417110577, "loss": 1.0834, "step": 150 }, { "epoch": 0.16320084232692814, "grad_norm": 0.19083175263550564, "learning_rate": 0.0001975739896938375, "loss": 1.0915, "step": 155 }, { "epoch": 0.16846538562779678, "grad_norm": 0.17041981222039004, "learning_rate": 0.00019715481519320496, "loss": 1.1045, "step": 160 }, { "epoch": 0.17372992892866543, "grad_norm": 0.17539080495334333, "learning_rate": 0.00019670277247913205, "loss": 1.0822, "step": 165 }, { "epoch": 0.1789944722295341, "grad_norm": 0.16847918243353582, "learning_rate": 0.00019621801448110952, "loss": 1.1113, "step": 170 }, { "epoch": 0.18425901553040275, "grad_norm": 0.16577520965121645, "learning_rate": 0.00019570070519645767, "loss": 1.0726, "step": 175 }, { "epoch": 0.1895235588312714, "grad_norm": 0.17216940817918563, "learning_rate": 0.00019515101963484485, "loss": 1.1214, "step": 180 }, { "epoch": 0.19478810213214004, "grad_norm": 0.16717603329959776, "learning_rate": 0.00019456914375908023, "loss": 1.0749, "step": 185 }, { "epoch": 0.20005264543300869, "grad_norm": 0.16743311436795275, "learning_rate": 0.0001939552744222014, "loss": 1.0856, "step": 190 }, { "epoch": 0.20531718873387733, "grad_norm": 0.16628473925396028, "learning_rate": 0.00019330961930087725, "loss": 1.1088, "step": 195 }, { "epoch": 0.21058173203474598, "grad_norm": 0.1735673710306468, "learning_rate": 0.00019263239682514952, "loss": 1.094, "step": 200 }, { "epoch": 0.21584627533561462, "grad_norm": 0.16463208491106188, "learning_rate": 0.00019192383610453618, "loss": 1.1191, "step": 205 }, { "epoch": 0.2211108186364833, "grad_norm": 0.1697201646634803, "learning_rate": 0.00019118417685052194, "loss": 1.1188, "step": 210 }, { "epoch": 0.22637536193735194, "grad_norm": 0.15930348674889006, "learning_rate": 0.00019041366929546219, "loss": 1.1132, "step": 215 }, { "epoch": 0.2316399052382206, "grad_norm": 0.16154587528605638, "learning_rate": 0.0001896125741079272, "loss": 1.1029, "step": 220 }, { "epoch": 0.23690444853908924, "grad_norm": 0.1689593754891321, "learning_rate": 0.00018878116230451613, "loss": 1.1196, "step": 225 }, { "epoch": 0.24216899183995788, "grad_norm": 0.158620581331537, "learning_rate": 0.0001879197151581702, "loss": 1.0786, "step": 230 }, { "epoch": 0.24743353514082653, "grad_norm": 0.1591976970503649, "learning_rate": 0.00018702852410301554, "loss": 1.0861, "step": 235 }, { "epoch": 0.2526980784416952, "grad_norm": 0.16271741933565712, "learning_rate": 0.00018610789063576913, "loss": 1.077, "step": 240 }, { "epoch": 0.25796262174256385, "grad_norm": 0.16691583214883504, "learning_rate": 0.00018515812621373997, "loss": 1.0931, "step": 245 }, { "epoch": 0.26322716504343247, "grad_norm": 0.15795453416798677, "learning_rate": 0.00018417955214946092, "loss": 1.0929, "step": 250 }, { "epoch": 0.26849170834430114, "grad_norm": 0.15734119895920037, "learning_rate": 0.00018317249950198597, "loss": 1.086, "step": 255 }, { "epoch": 0.27375625164516976, "grad_norm": 0.15815121783491273, "learning_rate": 0.0001821373089648906, "loss": 1.1142, "step": 260 }, { "epoch": 0.27902079494603843, "grad_norm": 0.15790684873329372, "learning_rate": 0.00018107433075101252, "loss": 1.0907, "step": 265 }, { "epoch": 0.2842853382469071, "grad_norm": 0.1603612919235376, "learning_rate": 0.00017998392447397197, "loss": 1.103, "step": 270 }, { "epoch": 0.2895498815477757, "grad_norm": 0.1935643212000403, "learning_rate": 0.00017886645902651167, "loss": 1.1207, "step": 275 }, { "epoch": 0.2948144248486444, "grad_norm": 0.16197395404790052, "learning_rate": 0.0001777223124556978, "loss": 1.1036, "step": 280 }, { "epoch": 0.300078968149513, "grad_norm": 0.16503760296000086, "learning_rate": 0.00017655187183502344, "loss": 1.0647, "step": 285 }, { "epoch": 0.3053435114503817, "grad_norm": 0.1772283442967409, "learning_rate": 0.00017535553313345904, "loss": 1.1075, "step": 290 }, { "epoch": 0.3106080547512503, "grad_norm": 0.16282645295325013, "learning_rate": 0.00017413370108149286, "loss": 1.1094, "step": 295 }, { "epoch": 0.315872598052119, "grad_norm": 0.15561402718068354, "learning_rate": 0.00017288678903420762, "loss": 1.0776, "step": 300 }, { "epoch": 0.32113714135298765, "grad_norm": 0.15594031474920508, "learning_rate": 0.00017161521883143934, "loss": 1.1078, "step": 305 }, { "epoch": 0.32640168465385627, "grad_norm": 0.1570395383591175, "learning_rate": 0.00017031942065506576, "loss": 1.1124, "step": 310 }, { "epoch": 0.33166622795472495, "grad_norm": 0.15773152315944608, "learning_rate": 0.00016899983288347248, "loss": 1.0913, "step": 315 }, { "epoch": 0.33693077125559356, "grad_norm": 0.15310806808595664, "learning_rate": 0.00016765690194324616, "loss": 1.0845, "step": 320 }, { "epoch": 0.34219531455646224, "grad_norm": 0.16384678369715433, "learning_rate": 0.00016629108215814525, "loss": 1.1173, "step": 325 }, { "epoch": 0.34745985785733086, "grad_norm": 0.165818325184464, "learning_rate": 0.00016490283559539838, "loss": 1.1014, "step": 330 }, { "epoch": 0.35272440115819953, "grad_norm": 0.15456003221800826, "learning_rate": 0.000163492631909384, "loss": 1.0915, "step": 335 }, { "epoch": 0.3579889444590682, "grad_norm": 0.16059867173233644, "learning_rate": 0.00016206094818274229, "loss": 1.0969, "step": 340 }, { "epoch": 0.3632534877599368, "grad_norm": 0.17415674474557066, "learning_rate": 0.00016060826876497478, "loss": 1.1145, "step": 345 }, { "epoch": 0.3685180310608055, "grad_norm": 0.16440403677512835, "learning_rate": 0.0001591350851085851, "loss": 1.0683, "step": 350 }, { "epoch": 0.3737825743616741, "grad_norm": 0.15901493438320982, "learning_rate": 0.00015764189560281677, "loss": 1.1199, "step": 355 }, { "epoch": 0.3790471176625428, "grad_norm": 0.15988293404570103, "learning_rate": 0.00015612920540504453, "loss": 1.0709, "step": 360 }, { "epoch": 0.3843116609634114, "grad_norm": 0.1616109424204681, "learning_rate": 0.00015459752626987563, "loss": 1.1027, "step": 365 }, { "epoch": 0.3895762042642801, "grad_norm": 0.1513607201651111, "learning_rate": 0.00015304737637601926, "loss": 1.0956, "step": 370 }, { "epoch": 0.3948407475651487, "grad_norm": 0.15452619863423803, "learning_rate": 0.0001514792801509831, "loss": 1.0952, "step": 375 }, { "epoch": 0.40010529086601737, "grad_norm": 0.15418975657555584, "learning_rate": 0.00014989376809365493, "loss": 1.0934, "step": 380 }, { "epoch": 0.40536983416688604, "grad_norm": 0.15158447263390024, "learning_rate": 0.00014829137659483143, "loss": 1.0981, "step": 385 }, { "epoch": 0.41063437746775466, "grad_norm": 0.15420702474431047, "learning_rate": 0.0001466726477557527, "loss": 1.1013, "step": 390 }, { "epoch": 0.41589892076862334, "grad_norm": 0.1513401762569788, "learning_rate": 0.00014503812920470534, "loss": 1.1128, "step": 395 }, { "epoch": 0.42116346406949196, "grad_norm": 0.1759021276212348, "learning_rate": 0.00014338837391175582, "loss": 1.0793, "step": 400 }, { "epoch": 0.42642800737036063, "grad_norm": 0.15639002655528358, "learning_rate": 0.00014172394000167623, "loss": 1.1126, "step": 405 }, { "epoch": 0.43169255067122925, "grad_norm": 0.1558922751326013, "learning_rate": 0.00014004539056512667, "loss": 1.0864, "step": 410 }, { "epoch": 0.4369570939720979, "grad_norm": 0.15449223519766864, "learning_rate": 0.00013835329346815716, "loss": 1.1161, "step": 415 }, { "epoch": 0.4422216372729666, "grad_norm": 0.15398779214531882, "learning_rate": 0.0001366482211600945, "loss": 1.113, "step": 420 }, { "epoch": 0.4474861805738352, "grad_norm": 0.15902962443654645, "learning_rate": 0.000134930750479878, "loss": 1.0783, "step": 425 }, { "epoch": 0.4527507238747039, "grad_norm": 0.15614703146804315, "learning_rate": 0.00013320146246091074, "loss": 1.0891, "step": 430 }, { "epoch": 0.4580152671755725, "grad_norm": 0.151735228198923, "learning_rate": 0.00013146094213449148, "loss": 1.1006, "step": 435 }, { "epoch": 0.4632798104764412, "grad_norm": 0.1633743946888902, "learning_rate": 0.00012970977833189393, "loss": 1.0717, "step": 440 }, { "epoch": 0.4685443537773098, "grad_norm": 0.16534257355481496, "learning_rate": 0.00012794856348516095, "loss": 1.0778, "step": 445 }, { "epoch": 0.47380889707817847, "grad_norm": 0.1856142828881669, "learning_rate": 0.00012617789342668004, "loss": 1.0859, "step": 450 }, { "epoch": 0.47907344037904714, "grad_norm": 0.15229515578033356, "learning_rate": 0.00012439836718760886, "loss": 1.0761, "step": 455 }, { "epoch": 0.48433798367991576, "grad_norm": 0.15984985984562605, "learning_rate": 0.00012261058679521834, "loss": 1.0926, "step": 460 }, { "epoch": 0.48960252698078444, "grad_norm": 0.14896040772758903, "learning_rate": 0.00012081515706922227, "loss": 1.0834, "step": 465 }, { "epoch": 0.49486707028165305, "grad_norm": 0.1514924492192347, "learning_rate": 0.00011901268541716224, "loss": 1.0885, "step": 470 }, { "epoch": 0.5001316135825217, "grad_norm": 0.1513889418015892, "learning_rate": 0.00011720378162891708, "loss": 1.1001, "step": 475 }, { "epoch": 0.5053961568833903, "grad_norm": 0.15159825336613816, "learning_rate": 0.0001153890576704062, "loss": 1.1082, "step": 480 }, { "epoch": 0.510660700184259, "grad_norm": 0.15427722774659086, "learning_rate": 0.00011356912747655685, "loss": 1.0843, "step": 485 }, { "epoch": 0.5159252434851277, "grad_norm": 0.14639500931900093, "learning_rate": 0.00011174460674360549, "loss": 1.1058, "step": 490 }, { "epoch": 0.5211897867859964, "grad_norm": 0.15320269723203808, "learning_rate": 0.00010991611272080269, "loss": 1.1125, "step": 495 }, { "epoch": 0.5264543300868649, "grad_norm": 0.15092814943890553, "learning_rate": 0.00010808426400159338, "loss": 1.0898, "step": 500 }, { "epoch": 0.5317188733877336, "grad_norm": 0.14712598563479434, "learning_rate": 0.00010624968031434173, "loss": 1.0975, "step": 505 }, { "epoch": 0.5369834166886023, "grad_norm": 0.1506174008648404, "learning_rate": 0.00010441298231267242, "loss": 1.0789, "step": 510 }, { "epoch": 0.542247959989471, "grad_norm": 0.14915164476738402, "learning_rate": 0.00010257479136549889, "loss": 1.088, "step": 515 }, { "epoch": 0.5475125032903395, "grad_norm": 0.14933216158522156, "learning_rate": 0.00010073572934680919, "loss": 1.1012, "step": 520 }, { "epoch": 0.5527770465912082, "grad_norm": 0.1623395783916047, "learning_rate": 9.889641842528178e-05, "loss": 1.0992, "step": 525 }, { "epoch": 0.5580415898920769, "grad_norm": 0.15524883773019818, "learning_rate": 9.70574808538006e-05, "loss": 1.0558, "step": 530 }, { "epoch": 0.5633061331929455, "grad_norm": 0.14879516385003932, "learning_rate": 9.521953875894257e-05, "loss": 1.0634, "step": 535 }, { "epoch": 0.5685706764938142, "grad_norm": 0.14856407933911947, "learning_rate": 9.338321393050719e-05, "loss": 1.0513, "step": 540 }, { "epoch": 0.5738352197946828, "grad_norm": 0.1514919636398635, "learning_rate": 9.154912761116056e-05, "loss": 1.0899, "step": 545 }, { "epoch": 0.5790997630955514, "grad_norm": 0.15005939408454377, "learning_rate": 8.971790028626395e-05, "loss": 1.09, "step": 550 }, { "epoch": 0.5843643063964201, "grad_norm": 0.1541140355049706, "learning_rate": 8.789015147395919e-05, "loss": 1.072, "step": 555 }, { "epoch": 0.5896288496972888, "grad_norm": 0.14756189100480177, "learning_rate": 8.606649951558073e-05, "loss": 1.0548, "step": 560 }, { "epoch": 0.5948933929981574, "grad_norm": 0.14468591274130843, "learning_rate": 8.424756136646623e-05, "loss": 1.056, "step": 565 }, { "epoch": 0.600157936299026, "grad_norm": 0.1510683202100121, "learning_rate": 8.243395238723571e-05, "loss": 1.0999, "step": 570 }, { "epoch": 0.6054224795998947, "grad_norm": 0.14942489035639112, "learning_rate": 8.062628613561051e-05, "loss": 1.08, "step": 575 }, { "epoch": 0.6106870229007634, "grad_norm": 0.14792710995590722, "learning_rate": 7.8825174158842e-05, "loss": 1.0916, "step": 580 }, { "epoch": 0.615951566201632, "grad_norm": 0.14543568608581728, "learning_rate": 7.703122578682046e-05, "loss": 1.061, "step": 585 }, { "epoch": 0.6212161095025006, "grad_norm": 0.14792849899325772, "learning_rate": 7.524504792593419e-05, "loss": 1.1101, "step": 590 }, { "epoch": 0.6264806528033693, "grad_norm": 0.14574924924348462, "learning_rate": 7.346724485374837e-05, "loss": 1.0687, "step": 595 }, { "epoch": 0.631745196104238, "grad_norm": 0.1434166906369258, "learning_rate": 7.169841801457347e-05, "loss": 1.0825, "step": 600 }, { "epoch": 0.6370097394051066, "grad_norm": 0.14254720323207454, "learning_rate": 6.993916581599202e-05, "loss": 1.0896, "step": 605 }, { "epoch": 0.6422742827059753, "grad_norm": 0.14534591022474969, "learning_rate": 6.819008342641273e-05, "loss": 1.0805, "step": 610 }, { "epoch": 0.6475388260068439, "grad_norm": 0.1471482502229213, "learning_rate": 6.645176257372055e-05, "loss": 1.0933, "step": 615 }, { "epoch": 0.6528033693077125, "grad_norm": 0.14967562406928056, "learning_rate": 6.472479134509052e-05, "loss": 1.0987, "step": 620 }, { "epoch": 0.6580679126085812, "grad_norm": 0.14756218985788289, "learning_rate": 6.300975398803362e-05, "loss": 1.0862, "step": 625 }, { "epoch": 0.6633324559094499, "grad_norm": 0.14358810278632364, "learning_rate": 6.130723071274107e-05, "loss": 1.0736, "step": 630 }, { "epoch": 0.6685969992103185, "grad_norm": 0.14508119820046267, "learning_rate": 5.961779749579516e-05, "loss": 1.077, "step": 635 }, { "epoch": 0.6738615425111871, "grad_norm": 0.14868475648668983, "learning_rate": 5.794202588531166e-05, "loss": 1.0921, "step": 640 }, { "epoch": 0.6791260858120558, "grad_norm": 0.14136660751737096, "learning_rate": 5.628048280758096e-05, "loss": 1.0967, "step": 645 }, { "epoch": 0.6843906291129245, "grad_norm": 0.14429824406995242, "learning_rate": 5.4633730375272594e-05, "loss": 1.094, "step": 650 }, { "epoch": 0.6896551724137931, "grad_norm": 0.1435583500936634, "learning_rate": 5.300232569726804e-05, "loss": 1.0796, "step": 655 }, { "epoch": 0.6949197157146617, "grad_norm": 0.14917594264214823, "learning_rate": 5.13868206901867e-05, "loss": 1.0813, "step": 660 }, { "epoch": 0.7001842590155304, "grad_norm": 0.14484547003342338, "learning_rate": 4.9787761891668397e-05, "loss": 1.0833, "step": 665 }, { "epoch": 0.7054488023163991, "grad_norm": 0.14125281408090304, "learning_rate": 4.820569027547533e-05, "loss": 1.0813, "step": 670 }, { "epoch": 0.7107133456172677, "grad_norm": 0.1408995053360923, "learning_rate": 4.6641141068476666e-05, "loss": 1.0752, "step": 675 }, { "epoch": 0.7159778889181364, "grad_norm": 0.1414179653044325, "learning_rate": 4.5094643569577186e-05, "loss": 1.054, "step": 680 }, { "epoch": 0.721242432219005, "grad_norm": 0.14582058548503438, "learning_rate": 4.356672097065134e-05, "loss": 1.1048, "step": 685 }, { "epoch": 0.7265069755198736, "grad_norm": 0.14009606861616825, "learning_rate": 4.205789017954364e-05, "loss": 1.0683, "step": 690 }, { "epoch": 0.7317715188207423, "grad_norm": 0.14586506040118713, "learning_rate": 4.056866164519465e-05, "loss": 1.0728, "step": 695 }, { "epoch": 0.737036062121611, "grad_norm": 0.14168474565307407, "learning_rate": 3.909953918495234e-05, "loss": 1.0476, "step": 700 }, { "epoch": 0.7423006054224796, "grad_norm": 0.14476382479542646, "learning_rate": 3.7651019814126654e-05, "loss": 1.05, "step": 705 }, { "epoch": 0.7475651487233482, "grad_norm": 0.14528550784733454, "learning_rate": 3.622359357784569e-05, "loss": 1.0611, "step": 710 }, { "epoch": 0.7528296920242169, "grad_norm": 0.14781069746763306, "learning_rate": 3.481774338526954e-05, "loss": 1.0952, "step": 715 }, { "epoch": 0.7580942353250856, "grad_norm": 0.15618197530507127, "learning_rate": 3.343394484621855e-05, "loss": 1.0836, "step": 720 }, { "epoch": 0.7633587786259542, "grad_norm": 0.22087793925041818, "learning_rate": 3.207266611027069e-05, "loss": 1.0727, "step": 725 }, { "epoch": 0.7686233219268228, "grad_norm": 0.14674869869141435, "learning_rate": 3.0734367708383294e-05, "loss": 1.0712, "step": 730 }, { "epoch": 0.7738878652276915, "grad_norm": 0.14673826341334423, "learning_rate": 2.9419502397091713e-05, "loss": 1.0852, "step": 735 }, { "epoch": 0.7791524085285602, "grad_norm": 0.1426087824509766, "learning_rate": 2.812851500533843e-05, "loss": 1.0604, "step": 740 }, { "epoch": 0.7844169518294288, "grad_norm": 0.1446320144127932, "learning_rate": 2.6861842283983953e-05, "loss": 1.0537, "step": 745 }, { "epoch": 0.7896814951302974, "grad_norm": 0.14326111319394175, "learning_rate": 2.5619912758050725e-05, "loss": 1.0942, "step": 750 }, { "epoch": 0.7949460384311661, "grad_norm": 0.14149919988871043, "learning_rate": 2.4403146581749925e-05, "loss": 1.0578, "step": 755 }, { "epoch": 0.8002105817320347, "grad_norm": 0.14034086298796508, "learning_rate": 2.3211955396340002e-05, "loss": 1.0808, "step": 760 }, { "epoch": 0.8054751250329034, "grad_norm": 0.1433790314655123, "learning_rate": 2.204674219086531e-05, "loss": 1.0906, "step": 765 }, { "epoch": 0.8107396683337721, "grad_norm": 0.138618618401559, "learning_rate": 2.090790116582191e-05, "loss": 1.0559, "step": 770 }, { "epoch": 0.8160042116346407, "grad_norm": 0.1429827381187093, "learning_rate": 1.9795817599796418e-05, "loss": 1.0792, "step": 775 }, { "epoch": 0.8212687549355093, "grad_norm": 0.14200271718072968, "learning_rate": 1.871086771912348e-05, "loss": 1.0702, "step": 780 }, { "epoch": 0.826533298236378, "grad_norm": 0.1429932480295589, "learning_rate": 1.7653418570605475e-05, "loss": 1.0715, "step": 785 }, { "epoch": 0.8317978415372467, "grad_norm": 0.14431467515210814, "learning_rate": 1.6623827897337762e-05, "loss": 1.0713, "step": 790 }, { "epoch": 0.8370623848381153, "grad_norm": 0.15238820455432608, "learning_rate": 1.562244401768144e-05, "loss": 1.0824, "step": 795 }, { "epoch": 0.8423269281389839, "grad_norm": 0.14830242766673976, "learning_rate": 1.4649605707424707e-05, "loss": 1.0787, "step": 800 }, { "epoch": 0.8475914714398526, "grad_norm": 0.14468170557092047, "learning_rate": 1.3705642085172366e-05, "loss": 1.0737, "step": 805 }, { "epoch": 0.8528560147407213, "grad_norm": 0.14674968769736463, "learning_rate": 1.2790872501002472e-05, "loss": 1.0577, "step": 810 }, { "epoch": 0.8581205580415899, "grad_norm": 0.14311627432536864, "learning_rate": 1.1905606428427774e-05, "loss": 1.0692, "step": 815 }, { "epoch": 0.8633851013424585, "grad_norm": 0.14558376197107287, "learning_rate": 1.105014335969855e-05, "loss": 1.0934, "step": 820 }, { "epoch": 0.8686496446433272, "grad_norm": 0.14414555681497093, "learning_rate": 1.0224772704482033e-05, "loss": 1.0875, "step": 825 }, { "epoch": 0.8739141879441958, "grad_norm": 0.1399627142514978, "learning_rate": 9.429773691952858e-06, "loss": 1.082, "step": 830 }, { "epoch": 0.8791787312450645, "grad_norm": 0.1392001373823857, "learning_rate": 8.665415276327871e-06, "loss": 1.0573, "step": 835 }, { "epoch": 0.8844432745459332, "grad_norm": 0.13993969105859186, "learning_rate": 7.931956045876688e-06, "loss": 1.0448, "step": 840 }, { "epoch": 0.8897078178468018, "grad_norm": 0.16741517197447736, "learning_rate": 7.229644135439473e-06, "loss": 1.104, "step": 845 }, { "epoch": 0.8949723611476704, "grad_norm": 0.14123729142229655, "learning_rate": 6.558717142480919e-06, "loss": 1.0808, "step": 850 }, { "epoch": 0.9002369044485391, "grad_norm": 0.1424278055064695, "learning_rate": 5.919402046709288e-06, "loss": 1.0709, "step": 855 }, { "epoch": 0.9055014477494078, "grad_norm": 0.13993993967003346, "learning_rate": 5.311915133287415e-06, "loss": 1.0941, "step": 860 }, { "epoch": 0.9107659910502763, "grad_norm": 0.14557850289664284, "learning_rate": 4.7364619196617495e-06, "loss": 1.0492, "step": 865 }, { "epoch": 0.916030534351145, "grad_norm": 0.1450177459066908, "learning_rate": 4.193237086034351e-06, "loss": 1.0972, "step": 870 }, { "epoch": 0.9212950776520137, "grad_norm": 0.1570091074884799, "learning_rate": 3.6824244095010065e-06, "loss": 1.0695, "step": 875 }, { "epoch": 0.9265596209528824, "grad_norm": 0.14097561405495265, "learning_rate": 3.2041967018780707e-06, "loss": 1.0948, "step": 880 }, { "epoch": 0.931824164253751, "grad_norm": 0.1420984285291773, "learning_rate": 2.7587157512388718e-06, "loss": 1.0573, "step": 885 }, { "epoch": 0.9370887075546196, "grad_norm": 0.1545471738706476, "learning_rate": 2.346132267179646e-06, "loss": 1.0786, "step": 890 }, { "epoch": 0.9423532508554883, "grad_norm": 0.14481364480205125, "learning_rate": 1.9665858298333005e-06, "loss": 1.0939, "step": 895 }, { "epoch": 0.9476177941563569, "grad_norm": 0.1446556897144525, "learning_rate": 1.6202048426483651e-06, "loss": 1.0752, "step": 900 }, { "epoch": 0.9528823374572256, "grad_norm": 0.13840641658264988, "learning_rate": 1.3071064889491724e-06, "loss": 1.0757, "step": 905 }, { "epoch": 0.9581468807580943, "grad_norm": 0.1405867091258211, "learning_rate": 1.0273966922918155e-06, "loss": 1.0886, "step": 910 }, { "epoch": 0.9634114240589629, "grad_norm": 0.15143973079201015, "learning_rate": 7.81170080629412e-07, "loss": 1.0337, "step": 915 }, { "epoch": 0.9686759673598315, "grad_norm": 0.15113893856195346, "learning_rate": 5.68509954298757e-07, "loss": 1.099, "step": 920 }, { "epoch": 0.9739405106607002, "grad_norm": 0.1436446854214333, "learning_rate": 3.8948825783918784e-07, "loss": 1.0595, "step": 925 }, { "epoch": 0.9792050539615689, "grad_norm": 0.14373165990559605, "learning_rate": 2.4416555565318635e-07, "loss": 1.0815, "step": 930 }, { "epoch": 0.9844695972624374, "grad_norm": 0.14233020784379563, "learning_rate": 1.3259101151694708e-07, "loss": 1.0569, "step": 935 }, { "epoch": 0.9897341405633061, "grad_norm": 0.13823967108377017, "learning_rate": 5.480237194799287e-08, "loss": 1.0689, "step": 940 }, { "epoch": 0.9949986838641748, "grad_norm": 0.1431568671824589, "learning_rate": 1.0825953435122938e-08, "loss": 1.0709, "step": 945 }, { "epoch": 0.9992103185048697, "eval_loss": 1.07915198802948, "eval_runtime": 3821.2872, "eval_samples_per_second": 3.522, "eval_steps_per_second": 0.881, "step": 949 }, { "epoch": 0.9992103185048697, "step": 949, "total_flos": 1959448100732928.0, "train_loss": 1.0930153100081064, "train_runtime": 22340.3866, "train_samples_per_second": 2.72, "train_steps_per_second": 0.042 } ], "logging_steps": 5, "max_steps": 949, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1959448100732928.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }