{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9992103185048697,
  "eval_steps": 500,
  "global_step": 949,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0010529086601737299,
      "grad_norm": 4.026114469450001,
      "learning_rate": 2.105263157894737e-06,
      "loss": 1.3755,
      "step": 1
    },
    {
      "epoch": 0.0052645433008686494,
      "grad_norm": 1.1251945428540173,
      "learning_rate": 1.0526315789473684e-05,
      "loss": 1.3102,
      "step": 5
    },
    {
      "epoch": 0.010529086601737299,
      "grad_norm": 0.563945023031292,
      "learning_rate": 2.105263157894737e-05,
      "loss": 1.2626,
      "step": 10
    },
    {
      "epoch": 0.01579362990260595,
      "grad_norm": 0.4336837922055097,
      "learning_rate": 3.157894736842105e-05,
      "loss": 1.2133,
      "step": 15
    },
    {
      "epoch": 0.021058173203474598,
      "grad_norm": 0.33707427690007363,
      "learning_rate": 4.210526315789474e-05,
      "loss": 1.1636,
      "step": 20
    },
    {
      "epoch": 0.026322716504343247,
      "grad_norm": 0.2412443925117212,
      "learning_rate": 5.2631578947368424e-05,
      "loss": 1.1845,
      "step": 25
    },
    {
      "epoch": 0.0315872598052119,
      "grad_norm": 0.2414606698915264,
      "learning_rate": 6.31578947368421e-05,
      "loss": 1.1401,
      "step": 30
    },
    {
      "epoch": 0.03685180310608055,
      "grad_norm": 0.22628642837844729,
      "learning_rate": 7.368421052631579e-05,
      "loss": 1.1591,
      "step": 35
    },
    {
      "epoch": 0.042116346406949196,
      "grad_norm": 0.2208133931977146,
      "learning_rate": 8.421052631578948e-05,
      "loss": 1.1397,
      "step": 40
    },
    {
      "epoch": 0.04738088970781785,
      "grad_norm": 0.21905054641153135,
      "learning_rate": 9.473684210526316e-05,
      "loss": 1.1295,
      "step": 45
    },
    {
      "epoch": 0.052645433008686494,
      "grad_norm": 0.1951061692287286,
      "learning_rate": 0.00010526315789473685,
      "loss": 1.141,
      "step": 50
    },
    {
      "epoch": 0.05790997630955515,
      "grad_norm": 0.18792729468212402,
      "learning_rate": 0.00011578947368421053,
      "loss": 1.121,
      "step": 55
    },
    {
      "epoch": 0.0631745196104238,
      "grad_norm": 0.189919407852987,
      "learning_rate": 0.0001263157894736842,
      "loss": 1.1347,
      "step": 60
    },
    {
      "epoch": 0.06843906291129244,
      "grad_norm": 0.18793881851552416,
      "learning_rate": 0.0001368421052631579,
      "loss": 1.0961,
      "step": 65
    },
    {
      "epoch": 0.0737036062121611,
      "grad_norm": 0.18263188517758086,
      "learning_rate": 0.00014736842105263158,
      "loss": 1.0937,
      "step": 70
    },
    {
      "epoch": 0.07896814951302975,
      "grad_norm": 0.18520098125405152,
      "learning_rate": 0.00015789473684210527,
      "loss": 1.1419,
      "step": 75
    },
    {
      "epoch": 0.08423269281389839,
      "grad_norm": 0.18675524775004465,
      "learning_rate": 0.00016842105263157895,
      "loss": 1.1094,
      "step": 80
    },
    {
      "epoch": 0.08949723611476705,
      "grad_norm": 0.18469057661828525,
      "learning_rate": 0.00017894736842105264,
      "loss": 1.0952,
      "step": 85
    },
    {
      "epoch": 0.0947617794156357,
      "grad_norm": 0.17860571701450936,
      "learning_rate": 0.00018947368421052632,
      "loss": 1.1035,
      "step": 90
    },
    {
      "epoch": 0.10002632271650434,
      "grad_norm": 0.2032976356381528,
      "learning_rate": 0.0002,
      "loss": 1.1329,
      "step": 95
    },
    {
      "epoch": 0.10529086601737299,
      "grad_norm": 0.18932762375677964,
      "learning_rate": 0.0001999830846194422,
      "loss": 1.0902,
      "step": 100
    },
    {
      "epoch": 0.11055540931824165,
      "grad_norm": 0.17823414518835126,
      "learning_rate": 0.00019993234420037073,
      "loss": 1.0951,
      "step": 105
    },
    {
      "epoch": 0.1158199526191103,
      "grad_norm": 0.19033211796864122,
      "learning_rate": 0.00019984779590865556,
      "loss": 1.11,
      "step": 110
    },
    {
      "epoch": 0.12108449591997894,
      "grad_norm": 0.1781090004150184,
      "learning_rate": 0.0001997294683476273,
      "loss": 1.1216,
      "step": 115
    },
    {
      "epoch": 0.1263490392208476,
      "grad_norm": 0.20142566240295628,
      "learning_rate": 0.0001995774015484005,
      "loss": 1.088,
      "step": 120
    },
    {
      "epoch": 0.13161358252171623,
      "grad_norm": 0.16738672746077932,
      "learning_rate": 0.00019939164695633067,
      "loss": 1.1069,
      "step": 125
    },
    {
      "epoch": 0.13687812582258488,
      "grad_norm": 0.17141306079033702,
      "learning_rate": 0.00019917226741361015,
      "loss": 1.1178,
      "step": 130
    },
    {
      "epoch": 0.14214266912345355,
      "grad_norm": 0.18242919862111662,
      "learning_rate": 0.00019891933713800798,
      "loss": 1.115,
      "step": 135
    },
    {
      "epoch": 0.1474072124243222,
      "grad_norm": 0.18858703761293544,
      "learning_rate": 0.00019863294169776148,
      "loss": 1.092,
      "step": 140
    },
    {
      "epoch": 0.15267175572519084,
      "grad_norm": 0.1851910906506613,
      "learning_rate": 0.00019831317798262786,
      "loss": 1.1015,
      "step": 145
    },
    {
      "epoch": 0.1579362990260595,
      "grad_norm": 0.17061718616532065,
      "learning_rate": 0.00019796015417110577,
      "loss": 1.0834,
      "step": 150
    },
    {
      "epoch": 0.16320084232692814,
      "grad_norm": 0.19083175263550564,
      "learning_rate": 0.0001975739896938375,
      "loss": 1.0915,
      "step": 155
    },
    {
      "epoch": 0.16846538562779678,
      "grad_norm": 0.17041981222039004,
      "learning_rate": 0.00019715481519320496,
      "loss": 1.1045,
      "step": 160
    },
    {
      "epoch": 0.17372992892866543,
      "grad_norm": 0.17539080495334333,
      "learning_rate": 0.00019670277247913205,
      "loss": 1.0822,
      "step": 165
    },
    {
      "epoch": 0.1789944722295341,
      "grad_norm": 0.16847918243353582,
      "learning_rate": 0.00019621801448110952,
      "loss": 1.1113,
      "step": 170
    },
    {
      "epoch": 0.18425901553040275,
      "grad_norm": 0.16577520965121645,
      "learning_rate": 0.00019570070519645767,
      "loss": 1.0726,
      "step": 175
    },
    {
      "epoch": 0.1895235588312714,
      "grad_norm": 0.17216940817918563,
      "learning_rate": 0.00019515101963484485,
      "loss": 1.1214,
      "step": 180
    },
    {
      "epoch": 0.19478810213214004,
      "grad_norm": 0.16717603329959776,
      "learning_rate": 0.00019456914375908023,
      "loss": 1.0749,
      "step": 185
    },
    {
      "epoch": 0.20005264543300869,
      "grad_norm": 0.16743311436795275,
      "learning_rate": 0.0001939552744222014,
      "loss": 1.0856,
      "step": 190
    },
    {
      "epoch": 0.20531718873387733,
      "grad_norm": 0.16628473925396028,
      "learning_rate": 0.00019330961930087725,
      "loss": 1.1088,
      "step": 195
    },
    {
      "epoch": 0.21058173203474598,
      "grad_norm": 0.1735673710306468,
      "learning_rate": 0.00019263239682514952,
      "loss": 1.094,
      "step": 200
    },
    {
      "epoch": 0.21584627533561462,
      "grad_norm": 0.16463208491106188,
      "learning_rate": 0.00019192383610453618,
      "loss": 1.1191,
      "step": 205
    },
    {
      "epoch": 0.2211108186364833,
      "grad_norm": 0.1697201646634803,
      "learning_rate": 0.00019118417685052194,
      "loss": 1.1188,
      "step": 210
    },
    {
      "epoch": 0.22637536193735194,
      "grad_norm": 0.15930348674889006,
      "learning_rate": 0.00019041366929546219,
      "loss": 1.1132,
      "step": 215
    },
    {
      "epoch": 0.2316399052382206,
      "grad_norm": 0.16154587528605638,
      "learning_rate": 0.0001896125741079272,
      "loss": 1.1029,
      "step": 220
    },
    {
      "epoch": 0.23690444853908924,
      "grad_norm": 0.1689593754891321,
      "learning_rate": 0.00018878116230451613,
      "loss": 1.1196,
      "step": 225
    },
    {
      "epoch": 0.24216899183995788,
      "grad_norm": 0.158620581331537,
      "learning_rate": 0.0001879197151581702,
      "loss": 1.0786,
      "step": 230
    },
    {
      "epoch": 0.24743353514082653,
      "grad_norm": 0.1591976970503649,
      "learning_rate": 0.00018702852410301554,
      "loss": 1.0861,
      "step": 235
    },
    {
      "epoch": 0.2526980784416952,
      "grad_norm": 0.16271741933565712,
      "learning_rate": 0.00018610789063576913,
      "loss": 1.077,
      "step": 240
    },
    {
      "epoch": 0.25796262174256385,
      "grad_norm": 0.16691583214883504,
      "learning_rate": 0.00018515812621373997,
      "loss": 1.0931,
      "step": 245
    },
    {
      "epoch": 0.26322716504343247,
      "grad_norm": 0.15795453416798677,
      "learning_rate": 0.00018417955214946092,
      "loss": 1.0929,
      "step": 250
    },
    {
      "epoch": 0.26849170834430114,
      "grad_norm": 0.15734119895920037,
      "learning_rate": 0.00018317249950198597,
      "loss": 1.086,
      "step": 255
    },
    {
      "epoch": 0.27375625164516976,
      "grad_norm": 0.15815121783491273,
      "learning_rate": 0.0001821373089648906,
      "loss": 1.1142,
      "step": 260
    },
    {
      "epoch": 0.27902079494603843,
      "grad_norm": 0.15790684873329372,
      "learning_rate": 0.00018107433075101252,
      "loss": 1.0907,
      "step": 265
    },
    {
      "epoch": 0.2842853382469071,
      "grad_norm": 0.1603612919235376,
      "learning_rate": 0.00017998392447397197,
      "loss": 1.103,
      "step": 270
    },
    {
      "epoch": 0.2895498815477757,
      "grad_norm": 0.1935643212000403,
      "learning_rate": 0.00017886645902651167,
      "loss": 1.1207,
      "step": 275
    },
    {
      "epoch": 0.2948144248486444,
      "grad_norm": 0.16197395404790052,
      "learning_rate": 0.0001777223124556978,
      "loss": 1.1036,
      "step": 280
    },
    {
      "epoch": 0.300078968149513,
      "grad_norm": 0.16503760296000086,
      "learning_rate": 0.00017655187183502344,
      "loss": 1.0647,
      "step": 285
    },
    {
      "epoch": 0.3053435114503817,
      "grad_norm": 0.1772283442967409,
      "learning_rate": 0.00017535553313345904,
      "loss": 1.1075,
      "step": 290
    },
    {
      "epoch": 0.3106080547512503,
      "grad_norm": 0.16282645295325013,
      "learning_rate": 0.00017413370108149286,
      "loss": 1.1094,
      "step": 295
    },
    {
      "epoch": 0.315872598052119,
      "grad_norm": 0.15561402718068354,
      "learning_rate": 0.00017288678903420762,
      "loss": 1.0776,
      "step": 300
    },
    {
      "epoch": 0.32113714135298765,
      "grad_norm": 0.15594031474920508,
      "learning_rate": 0.00017161521883143934,
      "loss": 1.1078,
      "step": 305
    },
    {
      "epoch": 0.32640168465385627,
      "grad_norm": 0.1570395383591175,
      "learning_rate": 0.00017031942065506576,
      "loss": 1.1124,
      "step": 310
    },
    {
      "epoch": 0.33166622795472495,
      "grad_norm": 0.15773152315944608,
      "learning_rate": 0.00016899983288347248,
      "loss": 1.0913,
      "step": 315
    },
    {
      "epoch": 0.33693077125559356,
      "grad_norm": 0.15310806808595664,
      "learning_rate": 0.00016765690194324616,
      "loss": 1.0845,
      "step": 320
    },
    {
      "epoch": 0.34219531455646224,
      "grad_norm": 0.16384678369715433,
      "learning_rate": 0.00016629108215814525,
      "loss": 1.1173,
      "step": 325
    },
    {
      "epoch": 0.34745985785733086,
      "grad_norm": 0.165818325184464,
      "learning_rate": 0.00016490283559539838,
      "loss": 1.1014,
      "step": 330
    },
    {
      "epoch": 0.35272440115819953,
      "grad_norm": 0.15456003221800826,
      "learning_rate": 0.000163492631909384,
      "loss": 1.0915,
      "step": 335
    },
    {
      "epoch": 0.3579889444590682,
      "grad_norm": 0.16059867173233644,
      "learning_rate": 0.00016206094818274229,
      "loss": 1.0969,
      "step": 340
    },
    {
      "epoch": 0.3632534877599368,
      "grad_norm": 0.17415674474557066,
      "learning_rate": 0.00016060826876497478,
      "loss": 1.1145,
      "step": 345
    },
    {
      "epoch": 0.3685180310608055,
      "grad_norm": 0.16440403677512835,
      "learning_rate": 0.0001591350851085851,
      "loss": 1.0683,
      "step": 350
    },
    {
      "epoch": 0.3737825743616741,
      "grad_norm": 0.15901493438320982,
      "learning_rate": 0.00015764189560281677,
      "loss": 1.1199,
      "step": 355
    },
    {
      "epoch": 0.3790471176625428,
      "grad_norm": 0.15988293404570103,
      "learning_rate": 0.00015612920540504453,
      "loss": 1.0709,
      "step": 360
    },
    {
      "epoch": 0.3843116609634114,
      "grad_norm": 0.1616109424204681,
      "learning_rate": 0.00015459752626987563,
      "loss": 1.1027,
      "step": 365
    },
    {
      "epoch": 0.3895762042642801,
      "grad_norm": 0.1513607201651111,
      "learning_rate": 0.00015304737637601926,
      "loss": 1.0956,
      "step": 370
    },
    {
      "epoch": 0.3948407475651487,
      "grad_norm": 0.15452619863423803,
      "learning_rate": 0.0001514792801509831,
      "loss": 1.0952,
      "step": 375
    },
    {
      "epoch": 0.40010529086601737,
      "grad_norm": 0.15418975657555584,
      "learning_rate": 0.00014989376809365493,
      "loss": 1.0934,
      "step": 380
    },
    {
      "epoch": 0.40536983416688604,
      "grad_norm": 0.15158447263390024,
      "learning_rate": 0.00014829137659483143,
      "loss": 1.0981,
      "step": 385
    },
    {
      "epoch": 0.41063437746775466,
      "grad_norm": 0.15420702474431047,
      "learning_rate": 0.0001466726477557527,
      "loss": 1.1013,
      "step": 390
    },
    {
      "epoch": 0.41589892076862334,
      "grad_norm": 0.1513401762569788,
      "learning_rate": 0.00014503812920470534,
      "loss": 1.1128,
      "step": 395
    },
    {
      "epoch": 0.42116346406949196,
      "grad_norm": 0.1759021276212348,
      "learning_rate": 0.00014338837391175582,
      "loss": 1.0793,
      "step": 400
    },
    {
      "epoch": 0.42642800737036063,
      "grad_norm": 0.15639002655528358,
      "learning_rate": 0.00014172394000167623,
      "loss": 1.1126,
      "step": 405
    },
    {
      "epoch": 0.43169255067122925,
      "grad_norm": 0.1558922751326013,
      "learning_rate": 0.00014004539056512667,
      "loss": 1.0864,
      "step": 410
    },
    {
      "epoch": 0.4369570939720979,
      "grad_norm": 0.15449223519766864,
      "learning_rate": 0.00013835329346815716,
      "loss": 1.1161,
      "step": 415
    },
    {
      "epoch": 0.4422216372729666,
      "grad_norm": 0.15398779214531882,
      "learning_rate": 0.0001366482211600945,
      "loss": 1.113,
      "step": 420
    },
    {
      "epoch": 0.4474861805738352,
      "grad_norm": 0.15902962443654645,
      "learning_rate": 0.000134930750479878,
      "loss": 1.0783,
      "step": 425
    },
    {
      "epoch": 0.4527507238747039,
      "grad_norm": 0.15614703146804315,
      "learning_rate": 0.00013320146246091074,
      "loss": 1.0891,
      "step": 430
    },
    {
      "epoch": 0.4580152671755725,
      "grad_norm": 0.151735228198923,
      "learning_rate": 0.00013146094213449148,
      "loss": 1.1006,
      "step": 435
    },
    {
      "epoch": 0.4632798104764412,
      "grad_norm": 0.1633743946888902,
      "learning_rate": 0.00012970977833189393,
      "loss": 1.0717,
      "step": 440
    },
    {
      "epoch": 0.4685443537773098,
      "grad_norm": 0.16534257355481496,
      "learning_rate": 0.00012794856348516095,
      "loss": 1.0778,
      "step": 445
    },
    {
      "epoch": 0.47380889707817847,
      "grad_norm": 0.1856142828881669,
      "learning_rate": 0.00012617789342668004,
      "loss": 1.0859,
      "step": 450
    },
    {
      "epoch": 0.47907344037904714,
      "grad_norm": 0.15229515578033356,
      "learning_rate": 0.00012439836718760886,
      "loss": 1.0761,
      "step": 455
    },
    {
      "epoch": 0.48433798367991576,
      "grad_norm": 0.15984985984562605,
      "learning_rate": 0.00012261058679521834,
      "loss": 1.0926,
      "step": 460
    },
    {
      "epoch": 0.48960252698078444,
      "grad_norm": 0.14896040772758903,
      "learning_rate": 0.00012081515706922227,
      "loss": 1.0834,
      "step": 465
    },
    {
      "epoch": 0.49486707028165305,
      "grad_norm": 0.1514924492192347,
      "learning_rate": 0.00011901268541716224,
      "loss": 1.0885,
      "step": 470
    },
    {
      "epoch": 0.5001316135825217,
      "grad_norm": 0.1513889418015892,
      "learning_rate": 0.00011720378162891708,
      "loss": 1.1001,
      "step": 475
    },
    {
      "epoch": 0.5053961568833903,
      "grad_norm": 0.15159825336613816,
      "learning_rate": 0.0001153890576704062,
      "loss": 1.1082,
      "step": 480
    },
    {
      "epoch": 0.510660700184259,
      "grad_norm": 0.15427722774659086,
      "learning_rate": 0.00011356912747655685,
      "loss": 1.0843,
      "step": 485
    },
    {
      "epoch": 0.5159252434851277,
      "grad_norm": 0.14639500931900093,
      "learning_rate": 0.00011174460674360549,
      "loss": 1.1058,
      "step": 490
    },
    {
      "epoch": 0.5211897867859964,
      "grad_norm": 0.15320269723203808,
      "learning_rate": 0.00010991611272080269,
      "loss": 1.1125,
      "step": 495
    },
    {
      "epoch": 0.5264543300868649,
      "grad_norm": 0.15092814943890553,
      "learning_rate": 0.00010808426400159338,
      "loss": 1.0898,
      "step": 500
    },
    {
      "epoch": 0.5317188733877336,
      "grad_norm": 0.14712598563479434,
      "learning_rate": 0.00010624968031434173,
      "loss": 1.0975,
      "step": 505
    },
    {
      "epoch": 0.5369834166886023,
      "grad_norm": 0.1506174008648404,
      "learning_rate": 0.00010441298231267242,
      "loss": 1.0789,
      "step": 510
    },
    {
      "epoch": 0.542247959989471,
      "grad_norm": 0.14915164476738402,
      "learning_rate": 0.00010257479136549889,
      "loss": 1.088,
      "step": 515
    },
    {
      "epoch": 0.5475125032903395,
      "grad_norm": 0.14933216158522156,
      "learning_rate": 0.00010073572934680919,
      "loss": 1.1012,
      "step": 520
    },
    {
      "epoch": 0.5527770465912082,
      "grad_norm": 0.1623395783916047,
      "learning_rate": 9.889641842528178e-05,
      "loss": 1.0992,
      "step": 525
    },
    {
      "epoch": 0.5580415898920769,
      "grad_norm": 0.15524883773019818,
      "learning_rate": 9.70574808538006e-05,
      "loss": 1.0558,
      "step": 530
    },
    {
      "epoch": 0.5633061331929455,
      "grad_norm": 0.14879516385003932,
      "learning_rate": 9.521953875894257e-05,
      "loss": 1.0634,
      "step": 535
    },
    {
      "epoch": 0.5685706764938142,
      "grad_norm": 0.14856407933911947,
      "learning_rate": 9.338321393050719e-05,
      "loss": 1.0513,
      "step": 540
    },
    {
      "epoch": 0.5738352197946828,
      "grad_norm": 0.1514919636398635,
      "learning_rate": 9.154912761116056e-05,
      "loss": 1.0899,
      "step": 545
    },
    {
      "epoch": 0.5790997630955514,
      "grad_norm": 0.15005939408454377,
      "learning_rate": 8.971790028626395e-05,
      "loss": 1.09,
      "step": 550
    },
    {
      "epoch": 0.5843643063964201,
      "grad_norm": 0.1541140355049706,
      "learning_rate": 8.789015147395919e-05,
      "loss": 1.072,
      "step": 555
    },
    {
      "epoch": 0.5896288496972888,
      "grad_norm": 0.14756189100480177,
      "learning_rate": 8.606649951558073e-05,
      "loss": 1.0548,
      "step": 560
    },
    {
      "epoch": 0.5948933929981574,
      "grad_norm": 0.14468591274130843,
      "learning_rate": 8.424756136646623e-05,
      "loss": 1.056,
      "step": 565
    },
    {
      "epoch": 0.600157936299026,
      "grad_norm": 0.1510683202100121,
      "learning_rate": 8.243395238723571e-05,
      "loss": 1.0999,
      "step": 570
    },
    {
      "epoch": 0.6054224795998947,
      "grad_norm": 0.14942489035639112,
      "learning_rate": 8.062628613561051e-05,
      "loss": 1.08,
      "step": 575
    },
    {
      "epoch": 0.6106870229007634,
      "grad_norm": 0.14792710995590722,
      "learning_rate": 7.8825174158842e-05,
      "loss": 1.0916,
      "step": 580
    },
    {
      "epoch": 0.615951566201632,
      "grad_norm": 0.14543568608581728,
      "learning_rate": 7.703122578682046e-05,
      "loss": 1.061,
      "step": 585
    },
    {
      "epoch": 0.6212161095025006,
      "grad_norm": 0.14792849899325772,
      "learning_rate": 7.524504792593419e-05,
      "loss": 1.1101,
      "step": 590
    },
    {
      "epoch": 0.6264806528033693,
      "grad_norm": 0.14574924924348462,
      "learning_rate": 7.346724485374837e-05,
      "loss": 1.0687,
      "step": 595
    },
    {
      "epoch": 0.631745196104238,
      "grad_norm": 0.1434166906369258,
      "learning_rate": 7.169841801457347e-05,
      "loss": 1.0825,
      "step": 600
    },
    {
      "epoch": 0.6370097394051066,
      "grad_norm": 0.14254720323207454,
      "learning_rate": 6.993916581599202e-05,
      "loss": 1.0896,
      "step": 605
    },
    {
      "epoch": 0.6422742827059753,
      "grad_norm": 0.14534591022474969,
      "learning_rate": 6.819008342641273e-05,
      "loss": 1.0805,
      "step": 610
    },
    {
      "epoch": 0.6475388260068439,
      "grad_norm": 0.1471482502229213,
      "learning_rate": 6.645176257372055e-05,
      "loss": 1.0933,
      "step": 615
    },
    {
      "epoch": 0.6528033693077125,
      "grad_norm": 0.14967562406928056,
      "learning_rate": 6.472479134509052e-05,
      "loss": 1.0987,
      "step": 620
    },
    {
      "epoch": 0.6580679126085812,
      "grad_norm": 0.14756218985788289,
      "learning_rate": 6.300975398803362e-05,
      "loss": 1.0862,
      "step": 625
    },
    {
      "epoch": 0.6633324559094499,
      "grad_norm": 0.14358810278632364,
      "learning_rate": 6.130723071274107e-05,
      "loss": 1.0736,
      "step": 630
    },
    {
      "epoch": 0.6685969992103185,
      "grad_norm": 0.14508119820046267,
      "learning_rate": 5.961779749579516e-05,
      "loss": 1.077,
      "step": 635
    },
    {
      "epoch": 0.6738615425111871,
      "grad_norm": 0.14868475648668983,
      "learning_rate": 5.794202588531166e-05,
      "loss": 1.0921,
      "step": 640
    },
    {
      "epoch": 0.6791260858120558,
      "grad_norm": 0.14136660751737096,
      "learning_rate": 5.628048280758096e-05,
      "loss": 1.0967,
      "step": 645
    },
    {
      "epoch": 0.6843906291129245,
      "grad_norm": 0.14429824406995242,
      "learning_rate": 5.4633730375272594e-05,
      "loss": 1.094,
      "step": 650
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 0.1435583500936634,
      "learning_rate": 5.300232569726804e-05,
      "loss": 1.0796,
      "step": 655
    },
    {
      "epoch": 0.6949197157146617,
      "grad_norm": 0.14917594264214823,
      "learning_rate": 5.13868206901867e-05,
      "loss": 1.0813,
      "step": 660
    },
    {
      "epoch": 0.7001842590155304,
      "grad_norm": 0.14484547003342338,
      "learning_rate": 4.9787761891668397e-05,
      "loss": 1.0833,
      "step": 665
    },
    {
      "epoch": 0.7054488023163991,
      "grad_norm": 0.14125281408090304,
      "learning_rate": 4.820569027547533e-05,
      "loss": 1.0813,
      "step": 670
    },
    {
      "epoch": 0.7107133456172677,
      "grad_norm": 0.1408995053360923,
      "learning_rate": 4.6641141068476666e-05,
      "loss": 1.0752,
      "step": 675
    },
    {
      "epoch": 0.7159778889181364,
      "grad_norm": 0.1414179653044325,
      "learning_rate": 4.5094643569577186e-05,
      "loss": 1.054,
      "step": 680
    },
    {
      "epoch": 0.721242432219005,
      "grad_norm": 0.14582058548503438,
      "learning_rate": 4.356672097065134e-05,
      "loss": 1.1048,
      "step": 685
    },
    {
      "epoch": 0.7265069755198736,
      "grad_norm": 0.14009606861616825,
      "learning_rate": 4.205789017954364e-05,
      "loss": 1.0683,
      "step": 690
    },
    {
      "epoch": 0.7317715188207423,
      "grad_norm": 0.14586506040118713,
      "learning_rate": 4.056866164519465e-05,
      "loss": 1.0728,
      "step": 695
    },
    {
      "epoch": 0.737036062121611,
      "grad_norm": 0.14168474565307407,
      "learning_rate": 3.909953918495234e-05,
      "loss": 1.0476,
      "step": 700
    },
    {
      "epoch": 0.7423006054224796,
      "grad_norm": 0.14476382479542646,
      "learning_rate": 3.7651019814126654e-05,
      "loss": 1.05,
      "step": 705
    },
    {
      "epoch": 0.7475651487233482,
      "grad_norm": 0.14528550784733454,
      "learning_rate": 3.622359357784569e-05,
      "loss": 1.0611,
      "step": 710
    },
    {
      "epoch": 0.7528296920242169,
      "grad_norm": 0.14781069746763306,
      "learning_rate": 3.481774338526954e-05,
      "loss": 1.0952,
      "step": 715
    },
    {
      "epoch": 0.7580942353250856,
      "grad_norm": 0.15618197530507127,
      "learning_rate": 3.343394484621855e-05,
      "loss": 1.0836,
      "step": 720
    },
    {
      "epoch": 0.7633587786259542,
      "grad_norm": 0.22087793925041818,
      "learning_rate": 3.207266611027069e-05,
      "loss": 1.0727,
      "step": 725
    },
    {
      "epoch": 0.7686233219268228,
      "grad_norm": 0.14674869869141435,
      "learning_rate": 3.0734367708383294e-05,
      "loss": 1.0712,
      "step": 730
    },
    {
      "epoch": 0.7738878652276915,
      "grad_norm": 0.14673826341334423,
      "learning_rate": 2.9419502397091713e-05,
      "loss": 1.0852,
      "step": 735
    },
    {
      "epoch": 0.7791524085285602,
      "grad_norm": 0.1426087824509766,
      "learning_rate": 2.812851500533843e-05,
      "loss": 1.0604,
      "step": 740
    },
    {
      "epoch": 0.7844169518294288,
      "grad_norm": 0.1446320144127932,
      "learning_rate": 2.6861842283983953e-05,
      "loss": 1.0537,
      "step": 745
    },
    {
      "epoch": 0.7896814951302974,
      "grad_norm": 0.14326111319394175,
      "learning_rate": 2.5619912758050725e-05,
      "loss": 1.0942,
      "step": 750
    },
    {
      "epoch": 0.7949460384311661,
      "grad_norm": 0.14149919988871043,
      "learning_rate": 2.4403146581749925e-05,
      "loss": 1.0578,
      "step": 755
    },
    {
      "epoch": 0.8002105817320347,
      "grad_norm": 0.14034086298796508,
      "learning_rate": 2.3211955396340002e-05,
      "loss": 1.0808,
      "step": 760
    },
    {
      "epoch": 0.8054751250329034,
      "grad_norm": 0.1433790314655123,
      "learning_rate": 2.204674219086531e-05,
      "loss": 1.0906,
      "step": 765
    },
    {
      "epoch": 0.8107396683337721,
      "grad_norm": 0.138618618401559,
      "learning_rate": 2.090790116582191e-05,
      "loss": 1.0559,
      "step": 770
    },
    {
      "epoch": 0.8160042116346407,
      "grad_norm": 0.1429827381187093,
      "learning_rate": 1.9795817599796418e-05,
      "loss": 1.0792,
      "step": 775
    },
    {
      "epoch": 0.8212687549355093,
      "grad_norm": 0.14200271718072968,
      "learning_rate": 1.871086771912348e-05,
      "loss": 1.0702,
      "step": 780
    },
    {
      "epoch": 0.826533298236378,
      "grad_norm": 0.1429932480295589,
      "learning_rate": 1.7653418570605475e-05,
      "loss": 1.0715,
      "step": 785
    },
    {
      "epoch": 0.8317978415372467,
      "grad_norm": 0.14431467515210814,
      "learning_rate": 1.6623827897337762e-05,
      "loss": 1.0713,
      "step": 790
    },
    {
      "epoch": 0.8370623848381153,
      "grad_norm": 0.15238820455432608,
      "learning_rate": 1.562244401768144e-05,
      "loss": 1.0824,
      "step": 795
    },
    {
      "epoch": 0.8423269281389839,
      "grad_norm": 0.14830242766673976,
      "learning_rate": 1.4649605707424707e-05,
      "loss": 1.0787,
      "step": 800
    },
    {
      "epoch": 0.8475914714398526,
      "grad_norm": 0.14468170557092047,
      "learning_rate": 1.3705642085172366e-05,
      "loss": 1.0737,
      "step": 805
    },
    {
      "epoch": 0.8528560147407213,
      "grad_norm": 0.14674968769736463,
      "learning_rate": 1.2790872501002472e-05,
      "loss": 1.0577,
      "step": 810
    },
    {
      "epoch": 0.8581205580415899,
      "grad_norm": 0.14311627432536864,
      "learning_rate": 1.1905606428427774e-05,
      "loss": 1.0692,
      "step": 815
    },
    {
      "epoch": 0.8633851013424585,
      "grad_norm": 0.14558376197107287,
      "learning_rate": 1.105014335969855e-05,
      "loss": 1.0934,
      "step": 820
    },
    {
      "epoch": 0.8686496446433272,
      "grad_norm": 0.14414555681497093,
      "learning_rate": 1.0224772704482033e-05,
      "loss": 1.0875,
      "step": 825
    },
    {
      "epoch": 0.8739141879441958,
      "grad_norm": 0.1399627142514978,
      "learning_rate": 9.429773691952858e-06,
      "loss": 1.082,
      "step": 830
    },
    {
      "epoch": 0.8791787312450645,
      "grad_norm": 0.1392001373823857,
      "learning_rate": 8.665415276327871e-06,
      "loss": 1.0573,
      "step": 835
    },
    {
      "epoch": 0.8844432745459332,
      "grad_norm": 0.13993969105859186,
      "learning_rate": 7.931956045876688e-06,
      "loss": 1.0448,
      "step": 840
    },
    {
      "epoch": 0.8897078178468018,
      "grad_norm": 0.16741517197447736,
      "learning_rate": 7.229644135439473e-06,
      "loss": 1.104,
      "step": 845
    },
    {
      "epoch": 0.8949723611476704,
      "grad_norm": 0.14123729142229655,
      "learning_rate": 6.558717142480919e-06,
      "loss": 1.0808,
      "step": 850
    },
    {
      "epoch": 0.9002369044485391,
      "grad_norm": 0.1424278055064695,
      "learning_rate": 5.919402046709288e-06,
      "loss": 1.0709,
      "step": 855
    },
    {
      "epoch": 0.9055014477494078,
      "grad_norm": 0.13993993967003346,
      "learning_rate": 5.311915133287415e-06,
      "loss": 1.0941,
      "step": 860
    },
    {
      "epoch": 0.9107659910502763,
      "grad_norm": 0.14557850289664284,
      "learning_rate": 4.7364619196617495e-06,
      "loss": 1.0492,
      "step": 865
    },
    {
      "epoch": 0.916030534351145,
      "grad_norm": 0.1450177459066908,
      "learning_rate": 4.193237086034351e-06,
      "loss": 1.0972,
      "step": 870
    },
    {
      "epoch": 0.9212950776520137,
      "grad_norm": 0.1570091074884799,
      "learning_rate": 3.6824244095010065e-06,
      "loss": 1.0695,
      "step": 875
    },
    {
      "epoch": 0.9265596209528824,
      "grad_norm": 0.14097561405495265,
      "learning_rate": 3.2041967018780707e-06,
      "loss": 1.0948,
      "step": 880
    },
    {
      "epoch": 0.931824164253751,
      "grad_norm": 0.1420984285291773,
      "learning_rate": 2.7587157512388718e-06,
      "loss": 1.0573,
      "step": 885
    },
    {
      "epoch": 0.9370887075546196,
      "grad_norm": 0.1545471738706476,
      "learning_rate": 2.346132267179646e-06,
      "loss": 1.0786,
      "step": 890
    },
    {
      "epoch": 0.9423532508554883,
      "grad_norm": 0.14481364480205125,
      "learning_rate": 1.9665858298333005e-06,
      "loss": 1.0939,
      "step": 895
    },
    {
      "epoch": 0.9476177941563569,
      "grad_norm": 0.1446556897144525,
      "learning_rate": 1.6202048426483651e-06,
      "loss": 1.0752,
      "step": 900
    },
    {
      "epoch": 0.9528823374572256,
      "grad_norm": 0.13840641658264988,
      "learning_rate": 1.3071064889491724e-06,
      "loss": 1.0757,
      "step": 905
    },
    {
      "epoch": 0.9581468807580943,
      "grad_norm": 0.1405867091258211,
      "learning_rate": 1.0273966922918155e-06,
      "loss": 1.0886,
      "step": 910
    },
    {
      "epoch": 0.9634114240589629,
      "grad_norm": 0.15143973079201015,
      "learning_rate": 7.81170080629412e-07,
      "loss": 1.0337,
      "step": 915
    },
    {
      "epoch": 0.9686759673598315,
      "grad_norm": 0.15113893856195346,
      "learning_rate": 5.68509954298757e-07,
      "loss": 1.099,
      "step": 920
    },
    {
      "epoch": 0.9739405106607002,
      "grad_norm": 0.1436446854214333,
      "learning_rate": 3.8948825783918784e-07,
      "loss": 1.0595,
      "step": 925
    },
    {
      "epoch": 0.9792050539615689,
      "grad_norm": 0.14373165990559605,
      "learning_rate": 2.4416555565318635e-07,
      "loss": 1.0815,
      "step": 930
    },
    {
      "epoch": 0.9844695972624374,
      "grad_norm": 0.14233020784379563,
      "learning_rate": 1.3259101151694708e-07,
      "loss": 1.0569,
      "step": 935
    },
    {
      "epoch": 0.9897341405633061,
      "grad_norm": 0.13823967108377017,
      "learning_rate": 5.480237194799287e-08,
      "loss": 1.0689,
      "step": 940
    },
    {
      "epoch": 0.9949986838641748,
      "grad_norm": 0.1431568671824589,
      "learning_rate": 1.0825953435122938e-08,
      "loss": 1.0709,
      "step": 945
    },
    {
      "epoch": 0.9992103185048697,
      "eval_loss": 1.07915198802948,
      "eval_runtime": 3821.2872,
      "eval_samples_per_second": 3.522,
      "eval_steps_per_second": 0.881,
      "step": 949
    },
    {
      "epoch": 0.9992103185048697,
      "step": 949,
      "total_flos": 1959448100732928.0,
      "train_loss": 1.0930153100081064,
      "train_runtime": 22340.3866,
      "train_samples_per_second": 2.72,
      "train_steps_per_second": 0.042
    }
  ],
  "logging_steps": 5,
  "max_steps": 949,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1959448100732928.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}