{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995203836930455, "eval_steps": 500, "global_step": 1042, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009592326139088729, "grad_norm": 3.8003539568068034, "learning_rate": 1.9047619047619051e-06, "loss": 1.3148, "step": 1 }, { "epoch": 0.004796163069544364, "grad_norm": 1.3427176668530907, "learning_rate": 9.523809523809523e-06, "loss": 1.3465, "step": 5 }, { "epoch": 0.009592326139088728, "grad_norm": 0.6129061923124927, "learning_rate": 1.9047619047619046e-05, "loss": 1.3062, "step": 10 }, { "epoch": 0.014388489208633094, "grad_norm": 0.5006461190956731, "learning_rate": 2.857142857142857e-05, "loss": 1.258, "step": 15 }, { "epoch": 0.019184652278177457, "grad_norm": 0.3919323663241635, "learning_rate": 3.809523809523809e-05, "loss": 1.2099, "step": 20 }, { "epoch": 0.023980815347721823, "grad_norm": 0.33016517225439684, "learning_rate": 4.761904761904762e-05, "loss": 1.1774, "step": 25 }, { "epoch": 0.02877697841726619, "grad_norm": 0.24241904389090518, "learning_rate": 5.714285714285714e-05, "loss": 1.1464, "step": 30 }, { "epoch": 0.03357314148681055, "grad_norm": 0.21825210432654424, "learning_rate": 6.666666666666667e-05, "loss": 1.1551, "step": 35 }, { "epoch": 0.03836930455635491, "grad_norm": 0.2357580892216174, "learning_rate": 7.619047619047618e-05, "loss": 1.1329, "step": 40 }, { "epoch": 0.04316546762589928, "grad_norm": 0.20958683389560634, "learning_rate": 8.571428571428571e-05, "loss": 1.1255, "step": 45 }, { "epoch": 0.047961630695443645, "grad_norm": 0.2313801330004238, "learning_rate": 9.523809523809524e-05, "loss": 1.126, "step": 50 }, { "epoch": 0.05275779376498801, "grad_norm": 0.2081771832083775, "learning_rate": 0.00010476190476190477, "loss": 1.0935, "step": 55 }, { "epoch": 0.05755395683453238, "grad_norm": 0.2039705643041609, "learning_rate": 0.00011428571428571428, "loss": 1.1394, "step": 60 }, { "epoch": 0.06235011990407674, "grad_norm": 0.2148039800819315, "learning_rate": 0.0001238095238095238, "loss": 1.128, "step": 65 }, { "epoch": 0.0671462829736211, "grad_norm": 0.18570989413688033, "learning_rate": 0.00013333333333333334, "loss": 1.1474, "step": 70 }, { "epoch": 0.07194244604316546, "grad_norm": 0.18151401221015465, "learning_rate": 0.00014285714285714287, "loss": 1.1297, "step": 75 }, { "epoch": 0.07673860911270983, "grad_norm": 0.18983827314633334, "learning_rate": 0.00015238095238095237, "loss": 1.0947, "step": 80 }, { "epoch": 0.0815347721822542, "grad_norm": 0.18321896415592545, "learning_rate": 0.00016190476190476192, "loss": 1.0993, "step": 85 }, { "epoch": 0.08633093525179857, "grad_norm": 0.18106666815994735, "learning_rate": 0.00017142857142857143, "loss": 1.1169, "step": 90 }, { "epoch": 0.09112709832134293, "grad_norm": 0.17126918097791768, "learning_rate": 0.00018095238095238095, "loss": 1.1336, "step": 95 }, { "epoch": 0.09592326139088729, "grad_norm": 0.1787566440192351, "learning_rate": 0.00019047619047619048, "loss": 1.1086, "step": 100 }, { "epoch": 0.10071942446043165, "grad_norm": 0.18251300143969484, "learning_rate": 0.0002, "loss": 1.0947, "step": 105 }, { "epoch": 0.10551558752997602, "grad_norm": 0.19966595269166043, "learning_rate": 0.00019998594857383755, "loss": 1.1381, "step": 110 }, { "epoch": 0.11031175059952038, "grad_norm": 0.18103343210950873, "learning_rate": 0.0001999437982442017, "loss": 1.1246, "step": 115 }, { "epoch": 0.11510791366906475, "grad_norm": 0.17391603743469916, "learning_rate": 0.00019987356085653736, "loss": 1.1109, "step": 120 }, { "epoch": 0.11990407673860912, "grad_norm": 0.18378902142023112, "learning_rate": 0.00019977525614955387, "loss": 1.0974, "step": 125 }, { "epoch": 0.12470023980815348, "grad_norm": 0.18844867723630976, "learning_rate": 0.00019964891174967784, "loss": 1.0954, "step": 130 }, { "epoch": 0.12949640287769784, "grad_norm": 0.17258678109282075, "learning_rate": 0.0001994945631632894, "loss": 1.1045, "step": 135 }, { "epoch": 0.1342925659472422, "grad_norm": 0.19501601610290073, "learning_rate": 0.00019931225376674388, "loss": 1.1151, "step": 140 }, { "epoch": 0.13908872901678657, "grad_norm": 0.16557441103633752, "learning_rate": 0.00019910203479418172, "loss": 1.0931, "step": 145 }, { "epoch": 0.14388489208633093, "grad_norm": 0.1808422208750127, "learning_rate": 0.00019886396532313032, "loss": 1.1439, "step": 150 }, { "epoch": 0.1486810551558753, "grad_norm": 0.17153212310391777, "learning_rate": 0.00019859811225790162, "loss": 1.1238, "step": 155 }, { "epoch": 0.15347721822541965, "grad_norm": 0.1725494178552086, "learning_rate": 0.00019830455031078992, "loss": 1.0819, "step": 160 }, { "epoch": 0.15827338129496402, "grad_norm": 0.17821716654615946, "learning_rate": 0.00019798336198107567, "loss": 1.106, "step": 165 }, { "epoch": 0.1630695443645084, "grad_norm": 0.17454115891711316, "learning_rate": 0.0001976346375318409, "loss": 1.1176, "step": 170 }, { "epoch": 0.16786570743405277, "grad_norm": 0.17165723260861443, "learning_rate": 0.00019725847496460257, "loss": 1.104, "step": 175 }, { "epoch": 0.17266187050359713, "grad_norm": 0.16959903827879758, "learning_rate": 0.00019685497999177146, "loss": 1.1032, "step": 180 }, { "epoch": 0.1774580335731415, "grad_norm": 0.16370292319935667, "learning_rate": 0.00019642426600694396, "loss": 1.1058, "step": 185 }, { "epoch": 0.18225419664268586, "grad_norm": 0.17362596041949555, "learning_rate": 0.00019596645405303508, "loss": 1.0897, "step": 190 }, { "epoch": 0.18705035971223022, "grad_norm": 0.1705391917273308, "learning_rate": 0.00019548167278826223, "loss": 1.0793, "step": 195 }, { "epoch": 0.19184652278177458, "grad_norm": 0.16987370700655263, "learning_rate": 0.00019497005844998835, "loss": 1.0825, "step": 200 }, { "epoch": 0.19664268585131894, "grad_norm": 0.1741514081542761, "learning_rate": 0.00019443175481643533, "loss": 1.0971, "step": 205 }, { "epoch": 0.2014388489208633, "grad_norm": 0.1697720073525996, "learning_rate": 0.00019386691316627846, "loss": 1.1004, "step": 210 }, { "epoch": 0.20623501199040767, "grad_norm": 0.16941098489834122, "learning_rate": 0.00019327569223613248, "loss": 1.1078, "step": 215 }, { "epoch": 0.21103117505995203, "grad_norm": 0.16231625897366397, "learning_rate": 0.0001926582581759423, "loss": 1.0941, "step": 220 }, { "epoch": 0.2158273381294964, "grad_norm": 0.16520366738739287, "learning_rate": 0.00019201478450229012, "loss": 1.0868, "step": 225 }, { "epoch": 0.22062350119904076, "grad_norm": 0.16835416401080175, "learning_rate": 0.00019134545204963212, "loss": 1.1037, "step": 230 }, { "epoch": 0.22541966426858512, "grad_norm": 0.16657613469605134, "learning_rate": 0.0001906504489194791, "loss": 1.093, "step": 235 }, { "epoch": 0.2302158273381295, "grad_norm": 0.1746269506944523, "learning_rate": 0.00018992997042753434, "loss": 1.1124, "step": 240 }, { "epoch": 0.23501199040767387, "grad_norm": 0.16885168676581888, "learning_rate": 0.0001891842190488045, "loss": 1.0972, "step": 245 }, { "epoch": 0.23980815347721823, "grad_norm": 0.16263325756495298, "learning_rate": 0.00018841340436069826, "loss": 1.0725, "step": 250 }, { "epoch": 0.2446043165467626, "grad_norm": 0.16758205657577824, "learning_rate": 0.00018761774298412903, "loss": 1.1012, "step": 255 }, { "epoch": 0.24940047961630696, "grad_norm": 0.1592358008735748, "learning_rate": 0.00018679745852263858, "loss": 1.1163, "step": 260 }, { "epoch": 0.2541966426858513, "grad_norm": 0.18307373045740924, "learning_rate": 0.0001859527814995577, "loss": 1.0883, "step": 265 }, { "epoch": 0.2589928057553957, "grad_norm": 0.1603879751504812, "learning_rate": 0.00018508394929322286, "loss": 1.0837, "step": 270 }, { "epoch": 0.2637889688249401, "grad_norm": 0.16169604116219924, "learning_rate": 0.0001841912060702659, "loss": 1.1059, "step": 275 }, { "epoch": 0.2685851318944844, "grad_norm": 0.21284544973156141, "learning_rate": 0.00018327480271699645, "loss": 1.1197, "step": 280 }, { "epoch": 0.2733812949640288, "grad_norm": 0.16556421777414207, "learning_rate": 0.00018233499676889556, "loss": 1.0857, "step": 285 }, { "epoch": 0.27817745803357313, "grad_norm": 0.17311293794732555, "learning_rate": 0.00018137205233824098, "loss": 1.1215, "step": 290 }, { "epoch": 0.2829736211031175, "grad_norm": 0.16045436773084543, "learning_rate": 0.00018038624003988404, "loss": 1.1102, "step": 295 }, { "epoch": 0.28776978417266186, "grad_norm": 0.16385963917512852, "learning_rate": 0.0001793778369151991, "loss": 1.1237, "step": 300 }, { "epoch": 0.29256594724220625, "grad_norm": 0.16336813988660637, "learning_rate": 0.00017834712635422716, "loss": 1.07, "step": 305 }, { "epoch": 0.2973621103117506, "grad_norm": 0.17112565194779492, "learning_rate": 0.0001772943980160351, "loss": 1.1009, "step": 310 }, { "epoch": 0.302158273381295, "grad_norm": 0.16697464006830123, "learning_rate": 0.0001762199477473131, "loss": 1.0972, "step": 315 }, { "epoch": 0.3069544364508393, "grad_norm": 0.16453710745761058, "learning_rate": 0.0001751240774992336, "loss": 1.074, "step": 320 }, { "epoch": 0.3117505995203837, "grad_norm": 0.17019017682628554, "learning_rate": 0.000174007095242594, "loss": 1.0605, "step": 325 }, { "epoch": 0.31654676258992803, "grad_norm": 0.17847410199181396, "learning_rate": 0.00017286931488126839, "loss": 1.1109, "step": 330 }, { "epoch": 0.3213429256594724, "grad_norm": 0.1628727862316133, "learning_rate": 0.00017171105616399152, "loss": 1.1024, "step": 335 }, { "epoch": 0.3261390887290168, "grad_norm": 0.16567336419547818, "learning_rate": 0.0001705326445945002, "loss": 1.072, "step": 340 }, { "epoch": 0.33093525179856115, "grad_norm": 0.16722439948774648, "learning_rate": 0.0001693344113400577, "loss": 1.0873, "step": 345 }, { "epoch": 0.33573141486810554, "grad_norm": 0.16122286760160678, "learning_rate": 0.0001681166931383859, "loss": 1.0938, "step": 350 }, { "epoch": 0.3405275779376499, "grad_norm": 0.15969633584876602, "learning_rate": 0.00016687983220303282, "loss": 1.107, "step": 355 }, { "epoch": 0.34532374100719426, "grad_norm": 0.20032537034543332, "learning_rate": 0.00016562417612720054, "loss": 1.0925, "step": 360 }, { "epoch": 0.3501199040767386, "grad_norm": 0.1636161155593432, "learning_rate": 0.00016435007778606178, "loss": 1.0925, "step": 365 }, { "epoch": 0.354916067146283, "grad_norm": 0.17509644211474343, "learning_rate": 0.00016305789523759186, "loss": 1.0853, "step": 370 }, { "epoch": 0.3597122302158273, "grad_norm": 0.15845793640793487, "learning_rate": 0.00016174799162194407, "loss": 1.074, "step": 375 }, { "epoch": 0.3645083932853717, "grad_norm": 0.1643115007590844, "learning_rate": 0.00016042073505939718, "loss": 1.0904, "step": 380 }, { "epoch": 0.36930455635491605, "grad_norm": 0.159834353020724, "learning_rate": 0.0001590764985469029, "loss": 1.0913, "step": 385 }, { "epoch": 0.37410071942446044, "grad_norm": 0.15671259478172928, "learning_rate": 0.00015771565985326323, "loss": 1.0949, "step": 390 }, { "epoch": 0.37889688249400477, "grad_norm": 0.15892776737127523, "learning_rate": 0.0001563386014129667, "loss": 1.0882, "step": 395 }, { "epoch": 0.38369304556354916, "grad_norm": 0.1554951930431914, "learning_rate": 0.00015494571021871308, "loss": 1.0849, "step": 400 }, { "epoch": 0.38848920863309355, "grad_norm": 0.15596140521064988, "learning_rate": 0.00015353737771265787, "loss": 1.0892, "step": 405 }, { "epoch": 0.3932853717026379, "grad_norm": 0.16030122057012702, "learning_rate": 0.00015211399967640537, "loss": 1.073, "step": 410 }, { "epoch": 0.3980815347721823, "grad_norm": 0.15896611447165424, "learning_rate": 0.00015067597611978327, "loss": 1.1113, "step": 415 }, { "epoch": 0.4028776978417266, "grad_norm": 0.1532650010931279, "learning_rate": 0.000149223711168428, "loss": 1.0881, "step": 420 }, { "epoch": 0.407673860911271, "grad_norm": 0.16086089606757772, "learning_rate": 0.00014775761295021417, "loss": 1.0887, "step": 425 }, { "epoch": 0.41247002398081534, "grad_norm": 0.15347896448970985, "learning_rate": 0.00014627809348055908, "loss": 1.0838, "step": 430 }, { "epoch": 0.4172661870503597, "grad_norm": 0.16005243796029608, "learning_rate": 0.00014478556854663434, "loss": 1.1036, "step": 435 }, { "epoch": 0.42206235011990406, "grad_norm": 0.23613805183753908, "learning_rate": 0.00014328045759051805, "loss": 1.0886, "step": 440 }, { "epoch": 0.42685851318944845, "grad_norm": 0.15961403055430387, "learning_rate": 0.00014176318359131955, "loss": 1.0807, "step": 445 }, { "epoch": 0.4316546762589928, "grad_norm": 0.15677042464881533, "learning_rate": 0.00014023417294631017, "loss": 1.1133, "step": 450 }, { "epoch": 0.4364508393285372, "grad_norm": 0.14974169551675579, "learning_rate": 0.0001386938553510936, "loss": 1.0799, "step": 455 }, { "epoch": 0.4412470023980815, "grad_norm": 0.15803754347469678, "learning_rate": 0.00013714266367884884, "loss": 1.0735, "step": 460 }, { "epoch": 0.4460431654676259, "grad_norm": 0.1534708691023063, "learning_rate": 0.00013558103385868085, "loss": 1.0941, "step": 465 }, { "epoch": 0.45083932853717024, "grad_norm": 0.15911138328699584, "learning_rate": 0.00013400940475311192, "loss": 1.1036, "step": 470 }, { "epoch": 0.4556354916067146, "grad_norm": 0.1535927522203036, "learning_rate": 0.0001324282180347486, "loss": 1.077, "step": 475 }, { "epoch": 0.460431654676259, "grad_norm": 0.15031744415995577, "learning_rate": 0.00013083791806215938, "loss": 1.1175, "step": 480 }, { "epoch": 0.46522781774580335, "grad_norm": 0.15368131899023496, "learning_rate": 0.0001292389517549971, "loss": 1.0858, "step": 485 }, { "epoch": 0.47002398081534774, "grad_norm": 0.15817599988761602, "learning_rate": 0.0001276317684684017, "loss": 1.1105, "step": 490 }, { "epoch": 0.4748201438848921, "grad_norm": 0.1544524844304045, "learning_rate": 0.0001260168198667189, "loss": 1.0675, "step": 495 }, { "epoch": 0.47961630695443647, "grad_norm": 0.153601743822322, "learning_rate": 0.00012439455979656932, "loss": 1.0792, "step": 500 }, { "epoch": 0.4844124700239808, "grad_norm": 0.15310739937604212, "learning_rate": 0.00012276544415930476, "loss": 1.1037, "step": 505 }, { "epoch": 0.4892086330935252, "grad_norm": 0.15940815066320235, "learning_rate": 0.00012112993078288702, "loss": 1.0893, "step": 510 }, { "epoch": 0.4940047961630695, "grad_norm": 0.15593480397437104, "learning_rate": 0.00011948847929322497, "loss": 1.0736, "step": 515 }, { "epoch": 0.4988009592326139, "grad_norm": 0.15351285356914193, "learning_rate": 0.00011784155098500682, "loss": 1.0734, "step": 520 }, { "epoch": 0.5035971223021583, "grad_norm": 0.1648474397335208, "learning_rate": 0.00011618960869206285, "loss": 1.0902, "step": 525 }, { "epoch": 0.5083932853717026, "grad_norm": 0.15294486843196387, "learning_rate": 0.00011453311665729618, "loss": 1.0963, "step": 530 }, { "epoch": 0.513189448441247, "grad_norm": 0.148639402509462, "learning_rate": 0.0001128725404022171, "loss": 1.0796, "step": 535 }, { "epoch": 0.5179856115107914, "grad_norm": 0.15562736350879572, "learning_rate": 0.00011120834659611831, "loss": 1.0906, "step": 540 }, { "epoch": 0.5227817745803357, "grad_norm": 0.1502074864032036, "learning_rate": 0.00010954100292492757, "loss": 1.0711, "step": 545 }, { "epoch": 0.5275779376498801, "grad_norm": 0.157682125372101, "learning_rate": 0.00010787097795977448, "loss": 1.0728, "step": 550 }, { "epoch": 0.5323741007194245, "grad_norm": 0.1516246607062371, "learning_rate": 0.00010619874102530885, "loss": 1.0833, "step": 555 }, { "epoch": 0.5371702637889688, "grad_norm": 0.14890743598378445, "learning_rate": 0.00010452476206780685, "loss": 1.1037, "step": 560 }, { "epoch": 0.5419664268585132, "grad_norm": 0.1544250948669629, "learning_rate": 0.00010284951152310292, "loss": 1.0882, "step": 565 }, { "epoch": 0.5467625899280576, "grad_norm": 0.15154065813090323, "learning_rate": 0.00010117346018438367, "loss": 1.0601, "step": 570 }, { "epoch": 0.5515587529976019, "grad_norm": 0.1541132913924603, "learning_rate": 9.949707906988165e-05, "loss": 1.0825, "step": 575 }, { "epoch": 0.5563549160671463, "grad_norm": 0.15035087453783752, "learning_rate": 9.7820839290506e-05, "loss": 1.0893, "step": 580 }, { "epoch": 0.5611510791366906, "grad_norm": 0.1566481016467638, "learning_rate": 9.614521191744644e-05, "loss": 1.0717, "step": 585 }, { "epoch": 0.565947242206235, "grad_norm": 0.14690623261948202, "learning_rate": 9.447066784978914e-05, "loss": 1.0843, "step": 590 }, { "epoch": 0.5707434052757794, "grad_norm": 0.14805971637155899, "learning_rate": 9.279767768218057e-05, "loss": 1.0639, "step": 595 }, { "epoch": 0.5755395683453237, "grad_norm": 0.15165832191817782, "learning_rate": 9.112671157257698e-05, "loss": 1.0762, "step": 600 }, { "epoch": 0.580335731414868, "grad_norm": 0.16254973894493494, "learning_rate": 8.945823911011648e-05, "loss": 1.0627, "step": 605 }, { "epoch": 0.5851318944844125, "grad_norm": 0.15231765571558437, "learning_rate": 8.779272918315134e-05, "loss": 1.1191, "step": 610 }, { "epoch": 0.5899280575539568, "grad_norm": 0.15662286211747672, "learning_rate": 8.613064984747672e-05, "loss": 1.0814, "step": 615 }, { "epoch": 0.5947242206235012, "grad_norm": 0.15604291232878043, "learning_rate": 8.44724681947939e-05, "loss": 1.062, "step": 620 }, { "epoch": 0.5995203836930456, "grad_norm": 0.17656148460505006, "learning_rate": 8.281865022144402e-05, "loss": 1.0877, "step": 625 }, { "epoch": 0.60431654676259, "grad_norm": 0.15407547139459882, "learning_rate": 8.116966069744987e-05, "loss": 1.0883, "step": 630 }, { "epoch": 0.6091127098321343, "grad_norm": 0.15686792104917918, "learning_rate": 7.952596303590214e-05, "loss": 1.0726, "step": 635 }, { "epoch": 0.6139088729016786, "grad_norm": 0.14900379281026824, "learning_rate": 7.788801916272739e-05, "loss": 1.0873, "step": 640 }, { "epoch": 0.6187050359712231, "grad_norm": 0.15399136244929024, "learning_rate": 7.625628938687348e-05, "loss": 1.0673, "step": 645 }, { "epoch": 0.6235011990407674, "grad_norm": 0.1454561721402151, "learning_rate": 7.463123227094961e-05, "loss": 1.0648, "step": 650 }, { "epoch": 0.6282973621103117, "grad_norm": 0.15049453308757393, "learning_rate": 7.301330450235733e-05, "loss": 1.0952, "step": 655 }, { "epoch": 0.6330935251798561, "grad_norm": 0.14976244376291453, "learning_rate": 7.140296076494809e-05, "loss": 1.096, "step": 660 }, { "epoch": 0.6378896882494005, "grad_norm": 0.15150823609919292, "learning_rate": 6.980065361124437e-05, "loss": 1.0973, "step": 665 }, { "epoch": 0.6426858513189448, "grad_norm": 0.1471164569431601, "learning_rate": 6.820683333525942e-05, "loss": 1.0886, "step": 670 }, { "epoch": 0.6474820143884892, "grad_norm": 0.1478399832545343, "learning_rate": 6.662194784595164e-05, "loss": 1.0778, "step": 675 }, { "epoch": 0.6522781774580336, "grad_norm": 0.14733730171420292, "learning_rate": 6.504644254134969e-05, "loss": 1.0622, "step": 680 }, { "epoch": 0.657074340527578, "grad_norm": 0.15089732572854292, "learning_rate": 6.34807601833826e-05, "loss": 1.054, "step": 685 }, { "epoch": 0.6618705035971223, "grad_norm": 0.15257943560973228, "learning_rate": 6.19253407734514e-05, "loss": 1.0653, "step": 690 }, { "epoch": 0.6666666666666666, "grad_norm": 0.1555839364213723, "learning_rate": 6.038062142877583e-05, "loss": 1.0743, "step": 695 }, { "epoch": 0.6714628297362111, "grad_norm": 0.15847863555269973, "learning_rate": 5.884703625955219e-05, "loss": 1.0714, "step": 700 }, { "epoch": 0.6762589928057554, "grad_norm": 0.14915505074346633, "learning_rate": 5.73250162469559e-05, "loss": 1.0777, "step": 705 }, { "epoch": 0.6810551558752997, "grad_norm": 0.1558291126854066, "learning_rate": 5.581498912202339e-05, "loss": 1.0946, "step": 710 }, { "epoch": 0.6858513189448441, "grad_norm": 0.15124530700511218, "learning_rate": 5.431737924544763e-05, "loss": 1.0802, "step": 715 }, { "epoch": 0.6906474820143885, "grad_norm": 0.151312999497745, "learning_rate": 5.283260748832072e-05, "loss": 1.0809, "step": 720 }, { "epoch": 0.6954436450839329, "grad_norm": 0.14330547504302646, "learning_rate": 5.1361091113856875e-05, "loss": 1.0801, "step": 725 }, { "epoch": 0.7002398081534772, "grad_norm": 0.1480298490267667, "learning_rate": 4.990324366012977e-05, "loss": 1.0553, "step": 730 }, { "epoch": 0.7050359712230215, "grad_norm": 0.153500392941282, "learning_rate": 4.845947482385645e-05, "loss": 1.0694, "step": 735 }, { "epoch": 0.709832134292566, "grad_norm": 0.1446376081330043, "learning_rate": 4.7030190345260816e-05, "loss": 1.0684, "step": 740 }, { "epoch": 0.7146282973621103, "grad_norm": 0.15009181859267848, "learning_rate": 4.5615791894049286e-05, "loss": 1.0812, "step": 745 }, { "epoch": 0.7194244604316546, "grad_norm": 0.1489429463837929, "learning_rate": 4.4216676956529866e-05, "loss": 1.1022, "step": 750 }, { "epoch": 0.7242206235011991, "grad_norm": 0.14571898395542737, "learning_rate": 4.2833238723907275e-05, "loss": 1.0446, "step": 755 }, { "epoch": 0.7290167865707434, "grad_norm": 0.1613600673322617, "learning_rate": 4.146586598178506e-05, "loss": 1.0452, "step": 760 }, { "epoch": 0.7338129496402878, "grad_norm": 0.14371687625758103, "learning_rate": 4.011494300090565e-05, "loss": 1.0848, "step": 765 }, { "epoch": 0.7386091127098321, "grad_norm": 0.1540465353357514, "learning_rate": 3.878084942915936e-05, "loss": 1.0866, "step": 770 }, { "epoch": 0.7434052757793765, "grad_norm": 0.14845273692539365, "learning_rate": 3.746396018489261e-05, "loss": 1.064, "step": 775 }, { "epoch": 0.7482014388489209, "grad_norm": 0.1510032414720602, "learning_rate": 3.616464535154496e-05, "loss": 1.0775, "step": 780 }, { "epoch": 0.7529976019184652, "grad_norm": 0.14414705671457814, "learning_rate": 3.488327007364525e-05, "loss": 1.0664, "step": 785 }, { "epoch": 0.7577937649880095, "grad_norm": 0.14609563936342954, "learning_rate": 3.3620194454195564e-05, "loss": 1.0732, "step": 790 }, { "epoch": 0.762589928057554, "grad_norm": 0.15124924277730611, "learning_rate": 3.237577345347196e-05, "loss": 1.0796, "step": 795 }, { "epoch": 0.7673860911270983, "grad_norm": 0.14663622293812884, "learning_rate": 3.115035678927063e-05, "loss": 1.0789, "step": 800 }, { "epoch": 0.7721822541966427, "grad_norm": 0.14712216022696742, "learning_rate": 2.9944288838627054e-05, "loss": 1.0812, "step": 805 }, { "epoch": 0.7769784172661871, "grad_norm": 0.1466540809367841, "learning_rate": 2.875790854103634e-05, "loss": 1.0371, "step": 810 }, { "epoch": 0.7817745803357314, "grad_norm": 0.15209652176203295, "learning_rate": 2.7591549303201514e-05, "loss": 1.0666, "step": 815 }, { "epoch": 0.7865707434052758, "grad_norm": 0.15265531709332175, "learning_rate": 2.6445538905336763e-05, "loss": 1.0716, "step": 820 }, { "epoch": 0.7913669064748201, "grad_norm": 0.1472763184905413, "learning_rate": 2.532019940905186e-05, "loss": 1.0774, "step": 825 }, { "epoch": 0.7961630695443646, "grad_norm": 0.14917213471816188, "learning_rate": 2.421584706684359e-05, "loss": 1.0585, "step": 830 }, { "epoch": 0.8009592326139089, "grad_norm": 0.1487175153721826, "learning_rate": 2.3132792233219813e-05, "loss": 1.0845, "step": 835 }, { "epoch": 0.8057553956834532, "grad_norm": 0.14774839089244932, "learning_rate": 2.207133927748104e-05, "loss": 1.0676, "step": 840 }, { "epoch": 0.8105515587529976, "grad_norm": 0.14579576803524566, "learning_rate": 2.103178649818387e-05, "loss": 1.0618, "step": 845 }, { "epoch": 0.815347721822542, "grad_norm": 0.15161345454257397, "learning_rate": 2.0014426039310786e-05, "loss": 1.0647, "step": 850 }, { "epoch": 0.8201438848920863, "grad_norm": 0.15041577767342024, "learning_rate": 1.9019543808169115e-05, "loss": 1.0634, "step": 855 }, { "epoch": 0.8249400479616307, "grad_norm": 0.14832965984424024, "learning_rate": 1.8047419395043086e-05, "loss": 1.078, "step": 860 }, { "epoch": 0.829736211031175, "grad_norm": 0.14605268717157927, "learning_rate": 1.7098325994620934e-05, "loss": 1.0763, "step": 865 }, { "epoch": 0.8345323741007195, "grad_norm": 0.14667368288338628, "learning_rate": 1.6172530329219416e-05, "loss": 1.0792, "step": 870 }, { "epoch": 0.8393285371702638, "grad_norm": 0.1468054846510282, "learning_rate": 1.5270292573827173e-05, "loss": 1.0643, "step": 875 }, { "epoch": 0.8441247002398081, "grad_norm": 0.14221973189330656, "learning_rate": 1.4391866282988265e-05, "loss": 1.0448, "step": 880 }, { "epoch": 0.8489208633093526, "grad_norm": 0.14260459844091208, "learning_rate": 1.3537498319545983e-05, "loss": 1.0729, "step": 885 }, { "epoch": 0.8537170263788969, "grad_norm": 0.15011630591055675, "learning_rate": 1.2707428785267394e-05, "loss": 1.0737, "step": 890 }, { "epoch": 0.8585131894484412, "grad_norm": 0.1406245046991364, "learning_rate": 1.1901890953367911e-05, "loss": 1.0542, "step": 895 }, { "epoch": 0.8633093525179856, "grad_norm": 0.1452217620527171, "learning_rate": 1.1121111202954836e-05, "loss": 1.0404, "step": 900 }, { "epoch": 0.86810551558753, "grad_norm": 0.14762449970761843, "learning_rate": 1.0365308955408459e-05, "loss": 1.0668, "step": 905 }, { "epoch": 0.8729016786570744, "grad_norm": 0.14280442405170934, "learning_rate": 9.634696612718242e-06, "loss": 1.0589, "step": 910 }, { "epoch": 0.8776978417266187, "grad_norm": 0.14727617951380836, "learning_rate": 8.929479497791926e-06, "loss": 1.0521, "step": 915 }, { "epoch": 0.882494004796163, "grad_norm": 0.14230938752904093, "learning_rate": 8.24985579675388e-06, "loss": 1.0615, "step": 920 }, { "epoch": 0.8872901678657075, "grad_norm": 0.14834529182832706, "learning_rate": 7.59601650324917e-06, "loss": 1.0596, "step": 925 }, { "epoch": 0.8920863309352518, "grad_norm": 0.15376739416260338, "learning_rate": 6.96814536476893e-06, "loss": 1.0596, "step": 930 }, { "epoch": 0.8968824940047961, "grad_norm": 0.15726124144994705, "learning_rate": 6.366418831011956e-06, "loss": 1.0639, "step": 935 }, { "epoch": 0.9016786570743405, "grad_norm": 0.14356439831596873, "learning_rate": 5.79100600429745e-06, "loss": 1.09, "step": 940 }, { "epoch": 0.9064748201438849, "grad_norm": 0.1418205242753867, "learning_rate": 5.242068592042349e-06, "loss": 1.0747, "step": 945 }, { "epoch": 0.9112709832134293, "grad_norm": 0.14390931236295634, "learning_rate": 4.7197608613169685e-06, "loss": 1.0718, "step": 950 }, { "epoch": 0.9160671462829736, "grad_norm": 0.1538309152736152, "learning_rate": 4.224229595491591e-06, "loss": 1.0717, "step": 955 }, { "epoch": 0.920863309352518, "grad_norm": 0.14425154981452512, "learning_rate": 3.7556140529860563e-06, "loss": 1.1077, "step": 960 }, { "epoch": 0.9256594724220624, "grad_norm": 0.14548690711150744, "learning_rate": 3.314045928134224e-06, "loss": 1.0682, "step": 965 }, { "epoch": 0.9304556354916067, "grad_norm": 0.14192393205318143, "learning_rate": 2.8996493141741687e-06, "loss": 1.0754, "step": 970 }, { "epoch": 0.935251798561151, "grad_norm": 0.1483650093466018, "learning_rate": 2.5125406683743414e-06, "loss": 1.0496, "step": 975 }, { "epoch": 0.9400479616306955, "grad_norm": 0.14456885419782456, "learning_rate": 2.152828779305793e-06, "loss": 1.0789, "step": 980 }, { "epoch": 0.9448441247002398, "grad_norm": 0.14797777926339026, "learning_rate": 1.8206147362695213e-06, "loss": 1.0694, "step": 985 }, { "epoch": 0.9496402877697842, "grad_norm": 0.1533259331869082, "learning_rate": 1.5159919008874369e-06, "loss": 1.0581, "step": 990 }, { "epoch": 0.9544364508393285, "grad_norm": 0.14794187907454043, "learning_rate": 1.2390458808651083e-06, "loss": 1.0805, "step": 995 }, { "epoch": 0.9592326139088729, "grad_norm": 0.14668802282059784, "learning_rate": 9.898545059335852e-07, "loss": 1.056, "step": 1000 }, { "epoch": 0.9640287769784173, "grad_norm": 0.14996161626389323, "learning_rate": 7.684878059769363e-07, "loss": 1.0528, "step": 1005 }, { "epoch": 0.9688249400479616, "grad_norm": 0.14311589381161047, "learning_rate": 5.750079913519835e-07, "loss": 1.0712, "step": 1010 }, { "epoch": 0.973621103117506, "grad_norm": 0.14129817073562634, "learning_rate": 4.094694354052742e-07, "loss": 1.0547, "step": 1015 }, { "epoch": 0.9784172661870504, "grad_norm": 0.14555078065518706, "learning_rate": 2.7191865919276026e-07, "loss": 1.053, "step": 1020 }, { "epoch": 0.9832134292565947, "grad_norm": 0.1507162059547356, "learning_rate": 1.623943184059229e-07, "loss": 1.0469, "step": 1025 }, { "epoch": 0.988009592326139, "grad_norm": 0.14248345329368778, "learning_rate": 8.092719250853975e-08, "loss": 1.0614, "step": 1030 }, { "epoch": 0.9928057553956835, "grad_norm": 0.1444089044813381, "learning_rate": 2.7540176086671144e-08, "loss": 1.081, "step": 1035 }, { "epoch": 0.9976019184652278, "grad_norm": 0.14756744866469018, "learning_rate": 2.2482724147177005e-09, "loss": 1.0966, "step": 1040 }, { "epoch": 0.9995203836930455, "eval_loss": 1.0771065950393677, "eval_runtime": 1906.9852, "eval_samples_per_second": 3.517, "eval_steps_per_second": 0.879, "step": 1042 }, { "epoch": 0.9995203836930455, "step": 1042, "total_flos": 2151475014795264.0, "train_loss": 1.0904040080343236, "train_runtime": 22004.2311, "train_samples_per_second": 3.032, "train_steps_per_second": 0.047 } ], "logging_steps": 5, "max_steps": 1042, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2151475014795264.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }