{ "best_metric": null, "best_model_checkpoint": null, "epoch": 14.899028305872413, "eval_steps": 500, "global_step": 2205, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03379805661174482, "grad_norm": 1.3233964443206787, "learning_rate": 4.5248868778280546e-07, "loss": 3.0504, "step": 5 }, { "epoch": 0.06759611322348964, "grad_norm": 1.3710697889328003, "learning_rate": 9.049773755656109e-07, "loss": 3.1346, "step": 10 }, { "epoch": 0.10139416983523447, "grad_norm": 1.6499178409576416, "learning_rate": 1.3574660633484164e-06, "loss": 3.0897, "step": 15 }, { "epoch": 0.13519222644697929, "grad_norm": 1.1821593046188354, "learning_rate": 1.8099547511312218e-06, "loss": 3.0632, "step": 20 }, { "epoch": 0.16899028305872413, "grad_norm": 1.2189937829971313, "learning_rate": 2.2624434389140273e-06, "loss": 3.1182, "step": 25 }, { "epoch": 0.20278833967046894, "grad_norm": 1.5012513399124146, "learning_rate": 2.7149321266968327e-06, "loss": 2.9984, "step": 30 }, { "epoch": 0.23658639628221378, "grad_norm": 1.459802508354187, "learning_rate": 3.167420814479638e-06, "loss": 3.141, "step": 35 }, { "epoch": 0.27038445289395857, "grad_norm": 1.2144436836242676, "learning_rate": 3.6199095022624436e-06, "loss": 2.8889, "step": 40 }, { "epoch": 0.3041825095057034, "grad_norm": 1.1169312000274658, "learning_rate": 4.072398190045249e-06, "loss": 2.8316, "step": 45 }, { "epoch": 0.33798056611744826, "grad_norm": 2.0920190811157227, "learning_rate": 4.5248868778280546e-06, "loss": 2.8712, "step": 50 }, { "epoch": 0.3717786227291931, "grad_norm": 2.2176926136016846, "learning_rate": 4.97737556561086e-06, "loss": 2.7896, "step": 55 }, { "epoch": 0.4055766793409379, "grad_norm": 3.200904369354248, "learning_rate": 5.4298642533936655e-06, "loss": 2.7888, "step": 60 }, { "epoch": 0.4393747359526827, "grad_norm": 1.8033865690231323, "learning_rate": 5.882352941176471e-06, "loss": 2.4676, "step": 65 }, { "epoch": 0.47317279256442757, "grad_norm": 1.055401086807251, "learning_rate": 6.334841628959276e-06, "loss": 2.4417, "step": 70 }, { "epoch": 0.5069708491761724, "grad_norm": 1.2879326343536377, "learning_rate": 6.787330316742083e-06, "loss": 2.4093, "step": 75 }, { "epoch": 0.5407689057879171, "grad_norm": 1.1458905935287476, "learning_rate": 7.239819004524887e-06, "loss": 2.3892, "step": 80 }, { "epoch": 0.574566962399662, "grad_norm": 1.0109455585479736, "learning_rate": 7.692307692307694e-06, "loss": 2.2264, "step": 85 }, { "epoch": 0.6083650190114068, "grad_norm": 1.0762925148010254, "learning_rate": 8.144796380090498e-06, "loss": 2.1744, "step": 90 }, { "epoch": 0.6421630756231517, "grad_norm": 1.1063696146011353, "learning_rate": 8.597285067873304e-06, "loss": 2.1335, "step": 95 }, { "epoch": 0.6759611322348965, "grad_norm": 1.0435007810592651, "learning_rate": 9.049773755656109e-06, "loss": 2.0953, "step": 100 }, { "epoch": 0.7097591888466414, "grad_norm": 1.2886987924575806, "learning_rate": 9.502262443438914e-06, "loss": 2.0159, "step": 105 }, { "epoch": 0.7435572454583862, "grad_norm": 1.2186506986618042, "learning_rate": 9.95475113122172e-06, "loss": 1.9618, "step": 110 }, { "epoch": 0.7773553020701309, "grad_norm": 1.1026384830474854, "learning_rate": 1.0407239819004526e-05, "loss": 1.9636, "step": 115 }, { "epoch": 0.8111533586818758, "grad_norm": 1.2348583936691284, "learning_rate": 1.0859728506787331e-05, "loss": 1.9061, "step": 120 }, { "epoch": 0.8449514152936206, "grad_norm": 1.2891064882278442, "learning_rate": 1.1312217194570137e-05, "loss": 1.8485, "step": 125 }, { "epoch": 0.8787494719053655, "grad_norm": 1.1830450296401978, "learning_rate": 1.1764705882352942e-05, "loss": 1.8878, "step": 130 }, { "epoch": 0.9125475285171103, "grad_norm": 1.3732470273971558, "learning_rate": 1.2217194570135748e-05, "loss": 1.8483, "step": 135 }, { "epoch": 0.9463455851288551, "grad_norm": 1.3864206075668335, "learning_rate": 1.2669683257918553e-05, "loss": 1.6419, "step": 140 }, { "epoch": 0.9801436417405999, "grad_norm": 1.529963493347168, "learning_rate": 1.3122171945701359e-05, "loss": 1.6966, "step": 145 }, { "epoch": 1.0135192226446978, "grad_norm": 1.531701683998108, "learning_rate": 1.3574660633484165e-05, "loss": 1.7206, "step": 150 }, { "epoch": 1.0473172792564427, "grad_norm": 1.2663397789001465, "learning_rate": 1.4027149321266968e-05, "loss": 1.5889, "step": 155 }, { "epoch": 1.0811153358681875, "grad_norm": 1.1123305559158325, "learning_rate": 1.4479638009049775e-05, "loss": 1.5019, "step": 160 }, { "epoch": 1.1149133924799324, "grad_norm": 1.5772109031677246, "learning_rate": 1.4932126696832581e-05, "loss": 1.5866, "step": 165 }, { "epoch": 1.1487114490916772, "grad_norm": 1.6601964235305786, "learning_rate": 1.5384615384615387e-05, "loss": 1.4743, "step": 170 }, { "epoch": 1.182509505703422, "grad_norm": 1.7369259595870972, "learning_rate": 1.5837104072398192e-05, "loss": 1.4523, "step": 175 }, { "epoch": 1.216307562315167, "grad_norm": 1.5128813982009888, "learning_rate": 1.6289592760180996e-05, "loss": 1.5253, "step": 180 }, { "epoch": 1.2501056189269117, "grad_norm": 1.5346794128417969, "learning_rate": 1.6742081447963804e-05, "loss": 1.5044, "step": 185 }, { "epoch": 1.2839036755386566, "grad_norm": 1.4628633260726929, "learning_rate": 1.719457013574661e-05, "loss": 1.2012, "step": 190 }, { "epoch": 1.3177017321504014, "grad_norm": 1.670690655708313, "learning_rate": 1.7647058823529414e-05, "loss": 1.3808, "step": 195 }, { "epoch": 1.3514997887621463, "grad_norm": 1.7677953243255615, "learning_rate": 1.8099547511312218e-05, "loss": 1.2181, "step": 200 }, { "epoch": 1.385297845373891, "grad_norm": 1.5957576036453247, "learning_rate": 1.8552036199095026e-05, "loss": 1.246, "step": 205 }, { "epoch": 1.419095901985636, "grad_norm": 2.146883010864258, "learning_rate": 1.9004524886877827e-05, "loss": 1.247, "step": 210 }, { "epoch": 1.4528939585973806, "grad_norm": 1.9484453201293945, "learning_rate": 1.9457013574660635e-05, "loss": 1.2372, "step": 215 }, { "epoch": 1.4866920152091254, "grad_norm": 2.067981243133545, "learning_rate": 1.990950226244344e-05, "loss": 1.0892, "step": 220 }, { "epoch": 1.5204900718208703, "grad_norm": 2.743396043777466, "learning_rate": 1.9999799412001547e-05, "loss": 1.0867, "step": 225 }, { "epoch": 1.554288128432615, "grad_norm": 1.6774858236312866, "learning_rate": 1.9998984537049476e-05, "loss": 0.9716, "step": 230 }, { "epoch": 1.58808618504436, "grad_norm": 2.1075453758239746, "learning_rate": 1.9997542889433917e-05, "loss": 1.0524, "step": 235 }, { "epoch": 1.6218842416561048, "grad_norm": 2.1486105918884277, "learning_rate": 1.9995474559522576e-05, "loss": 0.996, "step": 240 }, { "epoch": 1.6556822982678496, "grad_norm": 1.7104390859603882, "learning_rate": 1.9992779676965884e-05, "loss": 1.0356, "step": 245 }, { "epoch": 1.6894803548795945, "grad_norm": 2.395637035369873, "learning_rate": 1.9989458410688865e-05, "loss": 1.0114, "step": 250 }, { "epoch": 1.7232784114913393, "grad_norm": 2.096191644668579, "learning_rate": 1.9985510968880555e-05, "loss": 1.0029, "step": 255 }, { "epoch": 1.757076468103084, "grad_norm": 3.1734085083007812, "learning_rate": 1.9980937598980943e-05, "loss": 0.9794, "step": 260 }, { "epoch": 1.790874524714829, "grad_norm": 2.1142361164093018, "learning_rate": 1.9975738587665455e-05, "loss": 1.0681, "step": 265 }, { "epoch": 1.8246725813265736, "grad_norm": 2.51534104347229, "learning_rate": 1.996991426082701e-05, "loss": 0.964, "step": 270 }, { "epoch": 1.8584706379383187, "grad_norm": 2.2418782711029053, "learning_rate": 1.9963464983555557e-05, "loss": 0.9054, "step": 275 }, { "epoch": 1.8922686945500633, "grad_norm": 2.073915958404541, "learning_rate": 1.9956391160115224e-05, "loss": 0.8698, "step": 280 }, { "epoch": 1.9260667511618081, "grad_norm": 2.290095806121826, "learning_rate": 1.994869323391895e-05, "loss": 0.9558, "step": 285 }, { "epoch": 1.959864807773553, "grad_norm": 2.205028533935547, "learning_rate": 1.9940371687500713e-05, "loss": 0.8184, "step": 290 }, { "epoch": 1.9936628643852978, "grad_norm": 3.1502742767333984, "learning_rate": 1.9931427042485252e-05, "loss": 1.11, "step": 295 }, { "epoch": 2.0270384452893957, "grad_norm": 2.171036958694458, "learning_rate": 1.992185985955541e-05, "loss": 0.7225, "step": 300 }, { "epoch": 2.0608365019011408, "grad_norm": 2.4927501678466797, "learning_rate": 1.991167073841695e-05, "loss": 1.0667, "step": 305 }, { "epoch": 2.0946345585128854, "grad_norm": 2.1560094356536865, "learning_rate": 1.990086031776099e-05, "loss": 0.7699, "step": 310 }, { "epoch": 2.1284326151246304, "grad_norm": 2.2326228618621826, "learning_rate": 1.9889429275223958e-05, "loss": 0.8313, "step": 315 }, { "epoch": 2.162230671736375, "grad_norm": 2.2837958335876465, "learning_rate": 1.9877378327345115e-05, "loss": 0.8124, "step": 320 }, { "epoch": 2.19602872834812, "grad_norm": 2.033587694168091, "learning_rate": 1.9864708229521637e-05, "loss": 0.7758, "step": 325 }, { "epoch": 2.2298267849598647, "grad_norm": 2.212913990020752, "learning_rate": 1.9851419775961265e-05, "loss": 0.6772, "step": 330 }, { "epoch": 2.26362484157161, "grad_norm": 2.4761927127838135, "learning_rate": 1.9837513799632536e-05, "loss": 0.6488, "step": 335 }, { "epoch": 2.2974228981833544, "grad_norm": 2.0889394283294678, "learning_rate": 1.982299117221254e-05, "loss": 0.6567, "step": 340 }, { "epoch": 2.331220954795099, "grad_norm": 1.7731271982192993, "learning_rate": 1.9807852804032306e-05, "loss": 0.5775, "step": 345 }, { "epoch": 2.365019011406844, "grad_norm": 2.9344234466552734, "learning_rate": 1.979209964401973e-05, "loss": 0.711, "step": 350 }, { "epoch": 2.3988170680185887, "grad_norm": 2.7177066802978516, "learning_rate": 1.9775732679640093e-05, "loss": 0.5417, "step": 355 }, { "epoch": 2.432615124630334, "grad_norm": 1.9662398099899292, "learning_rate": 1.975875293683416e-05, "loss": 0.7523, "step": 360 }, { "epoch": 2.4664131812420784, "grad_norm": 2.3415379524230957, "learning_rate": 1.9741161479953872e-05, "loss": 0.5889, "step": 365 }, { "epoch": 2.5002112378538235, "grad_norm": 2.181759834289551, "learning_rate": 1.9722959411695636e-05, "loss": 0.6306, "step": 370 }, { "epoch": 2.534009294465568, "grad_norm": 2.549531936645508, "learning_rate": 1.970414787303119e-05, "loss": 0.7365, "step": 375 }, { "epoch": 2.567807351077313, "grad_norm": 2.4441099166870117, "learning_rate": 1.9684728043136093e-05, "loss": 0.6466, "step": 380 }, { "epoch": 2.601605407689058, "grad_norm": 2.6284027099609375, "learning_rate": 1.966470113931582e-05, "loss": 0.6229, "step": 385 }, { "epoch": 2.635403464300803, "grad_norm": 2.808634042739868, "learning_rate": 1.9644068416929417e-05, "loss": 0.6366, "step": 390 }, { "epoch": 2.6692015209125475, "grad_norm": 2.9602878093719482, "learning_rate": 1.9622831169310864e-05, "loss": 0.6766, "step": 395 }, { "epoch": 2.7029995775242925, "grad_norm": 3.1014065742492676, "learning_rate": 1.9600990727687964e-05, "loss": 0.7399, "step": 400 }, { "epoch": 2.736797634136037, "grad_norm": 1.8795526027679443, "learning_rate": 1.9578548461098912e-05, "loss": 0.635, "step": 405 }, { "epoch": 2.770595690747782, "grad_norm": 2.7618279457092285, "learning_rate": 1.9555505776306492e-05, "loss": 0.6349, "step": 410 }, { "epoch": 2.804393747359527, "grad_norm": 2.2114012241363525, "learning_rate": 1.9531864117709855e-05, "loss": 0.5364, "step": 415 }, { "epoch": 2.838191803971272, "grad_norm": 2.5741758346557617, "learning_rate": 1.950762496725403e-05, "loss": 0.6202, "step": 420 }, { "epoch": 2.8719898605830165, "grad_norm": 2.607922077178955, "learning_rate": 1.948278984433699e-05, "loss": 0.703, "step": 425 }, { "epoch": 2.905787917194761, "grad_norm": 2.70082688331604, "learning_rate": 1.945736030571443e-05, "loss": 0.5793, "step": 430 }, { "epoch": 2.939585973806506, "grad_norm": 2.543623447418213, "learning_rate": 1.9431337945402186e-05, "loss": 0.488, "step": 435 }, { "epoch": 2.973384030418251, "grad_norm": 2.2461986541748047, "learning_rate": 1.9404724394576305e-05, "loss": 0.561, "step": 440 }, { "epoch": 3.006759611322349, "grad_norm": 2.3420913219451904, "learning_rate": 1.9377521321470806e-05, "loss": 0.5422, "step": 445 }, { "epoch": 3.040557667934094, "grad_norm": 2.485952854156494, "learning_rate": 1.93497304312731e-05, "loss": 0.4076, "step": 450 }, { "epoch": 3.0743557245458386, "grad_norm": 2.928043842315674, "learning_rate": 1.932135346601711e-05, "loss": 0.4619, "step": 455 }, { "epoch": 3.1081537811575832, "grad_norm": 3.641951084136963, "learning_rate": 1.9292392204474075e-05, "loss": 0.649, "step": 460 }, { "epoch": 3.1419518377693283, "grad_norm": 2.47162127494812, "learning_rate": 1.9262848462041046e-05, "loss": 0.4297, "step": 465 }, { "epoch": 3.175749894381073, "grad_norm": 2.943067789077759, "learning_rate": 1.923272409062709e-05, "loss": 0.5152, "step": 470 }, { "epoch": 3.209547950992818, "grad_norm": 3.1264185905456543, "learning_rate": 1.920202097853721e-05, "loss": 0.5389, "step": 475 }, { "epoch": 3.2433460076045626, "grad_norm": 2.739868402481079, "learning_rate": 1.917074105035397e-05, "loss": 0.5507, "step": 480 }, { "epoch": 3.2771440642163077, "grad_norm": 2.510500907897949, "learning_rate": 1.9138886266816868e-05, "loss": 0.4332, "step": 485 }, { "epoch": 3.3109421208280523, "grad_norm": 2.6104397773742676, "learning_rate": 1.9106458624699425e-05, "loss": 0.6674, "step": 490 }, { "epoch": 3.3447401774397973, "grad_norm": 2.4239916801452637, "learning_rate": 1.907346015668401e-05, "loss": 0.4281, "step": 495 }, { "epoch": 3.378538234051542, "grad_norm": 3.4318349361419678, "learning_rate": 1.9039892931234434e-05, "loss": 0.499, "step": 500 }, { "epoch": 3.412336290663287, "grad_norm": 2.174170970916748, "learning_rate": 1.9005759052466303e-05, "loss": 0.415, "step": 505 }, { "epoch": 3.4461343472750317, "grad_norm": 2.9727699756622314, "learning_rate": 1.897106066001509e-05, "loss": 0.5141, "step": 510 }, { "epoch": 3.4799324038867763, "grad_norm": 3.2721505165100098, "learning_rate": 1.8935799928902046e-05, "loss": 0.5301, "step": 515 }, { "epoch": 3.5137304604985213, "grad_norm": 2.574387311935425, "learning_rate": 1.8899979069397858e-05, "loss": 0.4762, "step": 520 }, { "epoch": 3.5475285171102664, "grad_norm": 3.4737517833709717, "learning_rate": 1.8863600326884085e-05, "loss": 0.3734, "step": 525 }, { "epoch": 3.581326573722011, "grad_norm": 3.07442045211792, "learning_rate": 1.882666598171242e-05, "loss": 0.6237, "step": 530 }, { "epoch": 3.6151246303337556, "grad_norm": 2.256251573562622, "learning_rate": 1.8789178349061755e-05, "loss": 0.4447, "step": 535 }, { "epoch": 3.6489226869455007, "grad_norm": 2.587085008621216, "learning_rate": 1.8751139778793043e-05, "loss": 0.4351, "step": 540 }, { "epoch": 3.6827207435572453, "grad_norm": 2.929131507873535, "learning_rate": 1.871255265530201e-05, "loss": 0.4799, "step": 545 }, { "epoch": 3.7165188001689904, "grad_norm": 2.4406521320343018, "learning_rate": 1.8673419397369693e-05, "loss": 0.3568, "step": 550 }, { "epoch": 3.750316856780735, "grad_norm": 3.0470123291015625, "learning_rate": 1.863374245801082e-05, "loss": 0.535, "step": 555 }, { "epoch": 3.78411491339248, "grad_norm": 1.8549753427505493, "learning_rate": 1.8593524324320035e-05, "loss": 0.3995, "step": 560 }, { "epoch": 3.8179129700042247, "grad_norm": 3.1754539012908936, "learning_rate": 1.855276751731602e-05, "loss": 0.4495, "step": 565 }, { "epoch": 3.8517110266159698, "grad_norm": 2.436633825302124, "learning_rate": 1.8511474591783454e-05, "loss": 0.4472, "step": 570 }, { "epoch": 3.8855090832277144, "grad_norm": 2.15982985496521, "learning_rate": 1.8469648136112867e-05, "loss": 0.5069, "step": 575 }, { "epoch": 3.919307139839459, "grad_norm": 2.9572179317474365, "learning_rate": 1.8427290772138397e-05, "loss": 0.4933, "step": 580 }, { "epoch": 3.953105196451204, "grad_norm": 2.8051276206970215, "learning_rate": 1.838440515497345e-05, "loss": 0.39, "step": 585 }, { "epoch": 3.986903253062949, "grad_norm": 3.173710823059082, "learning_rate": 1.8340993972844252e-05, "loss": 0.4061, "step": 590 }, { "epoch": 4.020278833967047, "grad_norm": 2.855424404144287, "learning_rate": 1.8297059946921357e-05, "loss": 0.3861, "step": 595 }, { "epoch": 4.054076890578791, "grad_norm": 2.7455055713653564, "learning_rate": 1.8252605831149052e-05, "loss": 0.3595, "step": 600 }, { "epoch": 4.087874947190537, "grad_norm": 3.2334115505218506, "learning_rate": 1.8207634412072765e-05, "loss": 0.346, "step": 605 }, { "epoch": 4.1216730038022815, "grad_norm": 2.810620069503784, "learning_rate": 1.816214850866436e-05, "loss": 0.4259, "step": 610 }, { "epoch": 4.155471060414026, "grad_norm": 2.4240875244140625, "learning_rate": 1.811615097214545e-05, "loss": 0.4007, "step": 615 }, { "epoch": 4.189269117025771, "grad_norm": 3.068871021270752, "learning_rate": 1.8069644685808673e-05, "loss": 0.2978, "step": 620 }, { "epoch": 4.223067173637516, "grad_norm": 2.1824235916137695, "learning_rate": 1.8022632564836948e-05, "loss": 0.3693, "step": 625 }, { "epoch": 4.256865230249261, "grad_norm": 3.2515642642974854, "learning_rate": 1.797511755612075e-05, "loss": 0.4717, "step": 630 }, { "epoch": 4.2906632868610055, "grad_norm": 3.5332624912261963, "learning_rate": 1.7927102638073384e-05, "loss": 0.4488, "step": 635 }, { "epoch": 4.32446134347275, "grad_norm": 2.8152706623077393, "learning_rate": 1.7878590820444283e-05, "loss": 0.3908, "step": 640 }, { "epoch": 4.358259400084495, "grad_norm": 3.03226375579834, "learning_rate": 1.7829585144130356e-05, "loss": 0.3771, "step": 645 }, { "epoch": 4.39205745669624, "grad_norm": 3.0809173583984375, "learning_rate": 1.7780088680985365e-05, "loss": 0.3708, "step": 650 }, { "epoch": 4.425855513307985, "grad_norm": 3.259047269821167, "learning_rate": 1.773010453362737e-05, "loss": 0.4393, "step": 655 }, { "epoch": 4.4596535699197295, "grad_norm": 2.542726993560791, "learning_rate": 1.7679635835244256e-05, "loss": 0.4462, "step": 660 }, { "epoch": 4.493451626531474, "grad_norm": 2.5668067932128906, "learning_rate": 1.762868574939732e-05, "loss": 0.3585, "step": 665 }, { "epoch": 4.52724968314322, "grad_norm": 2.9174365997314453, "learning_rate": 1.7577257469822976e-05, "loss": 0.3732, "step": 670 }, { "epoch": 4.561047739754964, "grad_norm": 2.1858620643615723, "learning_rate": 1.7525354220232558e-05, "loss": 0.4202, "step": 675 }, { "epoch": 4.594845796366709, "grad_norm": 3.092898368835449, "learning_rate": 1.747297925411024e-05, "loss": 0.4174, "step": 680 }, { "epoch": 4.6286438529784535, "grad_norm": 2.1292641162872314, "learning_rate": 1.742013585450911e-05, "loss": 0.2891, "step": 685 }, { "epoch": 4.662441909590198, "grad_norm": 3.4500226974487305, "learning_rate": 1.736682733384536e-05, "loss": 0.3446, "step": 690 }, { "epoch": 4.696239966201944, "grad_norm": 2.490712881088257, "learning_rate": 1.7313057033690662e-05, "loss": 0.273, "step": 695 }, { "epoch": 4.730038022813688, "grad_norm": 3.1903836727142334, "learning_rate": 1.7258828324562705e-05, "loss": 0.3976, "step": 700 }, { "epoch": 4.763836079425433, "grad_norm": 2.6504249572753906, "learning_rate": 1.7204144605713922e-05, "loss": 0.351, "step": 705 }, { "epoch": 4.7976341360371775, "grad_norm": 2.951176643371582, "learning_rate": 1.7149009304918392e-05, "loss": 0.3601, "step": 710 }, { "epoch": 4.831432192648923, "grad_norm": 4.028046131134033, "learning_rate": 1.7093425878257007e-05, "loss": 0.4412, "step": 715 }, { "epoch": 4.865230249260668, "grad_norm": 3.4209461212158203, "learning_rate": 1.7037397809900807e-05, "loss": 0.4239, "step": 720 }, { "epoch": 4.899028305872412, "grad_norm": 2.396829605102539, "learning_rate": 1.698092861189259e-05, "loss": 0.3325, "step": 725 }, { "epoch": 4.932826362484157, "grad_norm": 2.638688564300537, "learning_rate": 1.6924021823926766e-05, "loss": 0.3053, "step": 730 }, { "epoch": 4.966624419095902, "grad_norm": 3.0459437370300293, "learning_rate": 1.6866681013127466e-05, "loss": 0.2785, "step": 735 }, { "epoch": 5.0, "grad_norm": 4.051104545593262, "learning_rate": 1.6808909773824952e-05, "loss": 0.2148, "step": 740 }, { "epoch": 5.033798056611745, "grad_norm": 2.424513816833496, "learning_rate": 1.675071172733031e-05, "loss": 0.3102, "step": 745 }, { "epoch": 5.067596113223489, "grad_norm": 2.9347620010375977, "learning_rate": 1.669209052170845e-05, "loss": 0.2635, "step": 750 }, { "epoch": 5.101394169835235, "grad_norm": 2.5299954414367676, "learning_rate": 1.6633049831549424e-05, "loss": 0.2556, "step": 755 }, { "epoch": 5.135192226446979, "grad_norm": 3.3548402786254883, "learning_rate": 1.657359335773812e-05, "loss": 0.3626, "step": 760 }, { "epoch": 5.168990283058724, "grad_norm": 3.0583834648132324, "learning_rate": 1.6513724827222225e-05, "loss": 0.3778, "step": 765 }, { "epoch": 5.202788339670469, "grad_norm": 2.3884308338165283, "learning_rate": 1.645344799277866e-05, "loss": 0.3429, "step": 770 }, { "epoch": 5.236586396282214, "grad_norm": 3.5502490997314453, "learning_rate": 1.639276663277831e-05, "loss": 0.3531, "step": 775 }, { "epoch": 5.270384452893959, "grad_norm": 2.881547212600708, "learning_rate": 1.6331684550949197e-05, "loss": 0.2784, "step": 780 }, { "epoch": 5.304182509505703, "grad_norm": 2.110593795776367, "learning_rate": 1.627020557613803e-05, "loss": 0.3011, "step": 785 }, { "epoch": 5.337980566117448, "grad_norm": 3.2138075828552246, "learning_rate": 1.6208333562070232e-05, "loss": 0.3218, "step": 790 }, { "epoch": 5.3717786227291935, "grad_norm": 2.4348948001861572, "learning_rate": 1.614607238710833e-05, "loss": 0.2419, "step": 795 }, { "epoch": 5.405576679340938, "grad_norm": 3.6023876667022705, "learning_rate": 1.6083425954008883e-05, "loss": 0.3198, "step": 800 }, { "epoch": 5.439374735952683, "grad_norm": 3.171356201171875, "learning_rate": 1.602039818967783e-05, "loss": 0.3377, "step": 805 }, { "epoch": 5.473172792564427, "grad_norm": 2.926022529602051, "learning_rate": 1.5956993044924334e-05, "loss": 0.2398, "step": 810 }, { "epoch": 5.506970849176172, "grad_norm": 2.8738198280334473, "learning_rate": 1.589321449421313e-05, "loss": 0.2829, "step": 815 }, { "epoch": 5.5407689057879175, "grad_norm": 3.6972992420196533, "learning_rate": 1.5829066535415402e-05, "loss": 0.3569, "step": 820 }, { "epoch": 5.574566962399662, "grad_norm": 3.0152523517608643, "learning_rate": 1.576455318955816e-05, "loss": 0.2925, "step": 825 }, { "epoch": 5.608365019011407, "grad_norm": 2.8930368423461914, "learning_rate": 1.569967850057222e-05, "loss": 0.3363, "step": 830 }, { "epoch": 5.642163075623151, "grad_norm": 3.1284563541412354, "learning_rate": 1.5634446535038688e-05, "loss": 0.3218, "step": 835 }, { "epoch": 5.675961132234897, "grad_norm": 1.6916499137878418, "learning_rate": 1.556886138193406e-05, "loss": 0.2436, "step": 840 }, { "epoch": 5.7097591888466415, "grad_norm": 3.7334420680999756, "learning_rate": 1.5502927152373913e-05, "loss": 0.2874, "step": 845 }, { "epoch": 5.743557245458386, "grad_norm": 3.914621591567993, "learning_rate": 1.5436647979355214e-05, "loss": 0.2329, "step": 850 }, { "epoch": 5.777355302070131, "grad_norm": 3.38970685005188, "learning_rate": 1.5370028017497217e-05, "loss": 0.3232, "step": 855 }, { "epoch": 5.811153358681876, "grad_norm": 2.7700934410095215, "learning_rate": 1.5303071442781083e-05, "loss": 0.2951, "step": 860 }, { "epoch": 5.844951415293621, "grad_norm": 3.382173538208008, "learning_rate": 1.5235782452288068e-05, "loss": 0.2719, "step": 865 }, { "epoch": 5.8787494719053655, "grad_norm": 3.8175547122955322, "learning_rate": 1.5168165263936472e-05, "loss": 0.3171, "step": 870 }, { "epoch": 5.91254752851711, "grad_norm": 3.3271560668945312, "learning_rate": 1.5100224116217217e-05, "loss": 0.2364, "step": 875 }, { "epoch": 5.946345585128855, "grad_norm": 2.9731876850128174, "learning_rate": 1.5031963267928185e-05, "loss": 0.2103, "step": 880 }, { "epoch": 5.9801436417406, "grad_norm": 3.461787700653076, "learning_rate": 1.4963386997907242e-05, "loss": 0.341, "step": 885 }, { "epoch": 6.013519222644698, "grad_norm": 3.172473669052124, "learning_rate": 1.4894499604764035e-05, "loss": 0.2618, "step": 890 }, { "epoch": 6.0473172792564425, "grad_norm": 2.9784677028656006, "learning_rate": 1.4825305406610547e-05, "loss": 0.2903, "step": 895 }, { "epoch": 6.081115335868188, "grad_norm": 3.697354555130005, "learning_rate": 1.4755808740790403e-05, "loss": 0.2625, "step": 900 }, { "epoch": 6.114913392479933, "grad_norm": 3.192431926727295, "learning_rate": 1.4686013963607e-05, "loss": 0.233, "step": 905 }, { "epoch": 6.148711449091677, "grad_norm": 2.8318302631378174, "learning_rate": 1.4615925450050448e-05, "loss": 0.1387, "step": 910 }, { "epoch": 6.182509505703422, "grad_norm": 3.418325901031494, "learning_rate": 1.4545547593523308e-05, "loss": 0.3177, "step": 915 }, { "epoch": 6.2163075623151665, "grad_norm": 3.188663959503174, "learning_rate": 1.4474884805565217e-05, "loss": 0.2066, "step": 920 }, { "epoch": 6.250105618926912, "grad_norm": 2.2658884525299072, "learning_rate": 1.4403941515576344e-05, "loss": 0.2959, "step": 925 }, { "epoch": 6.283903675538657, "grad_norm": 2.798861265182495, "learning_rate": 1.4332722170539748e-05, "loss": 0.2784, "step": 930 }, { "epoch": 6.317701732150401, "grad_norm": 3.2030510902404785, "learning_rate": 1.4261231234742618e-05, "loss": 0.224, "step": 935 }, { "epoch": 6.351499788762146, "grad_norm": 3.1087892055511475, "learning_rate": 1.4189473189496437e-05, "loss": 0.271, "step": 940 }, { "epoch": 6.385297845373891, "grad_norm": 3.4298338890075684, "learning_rate": 1.4117452532856084e-05, "loss": 0.1972, "step": 945 }, { "epoch": 6.419095901985636, "grad_norm": 2.693760633468628, "learning_rate": 1.4045173779337866e-05, "loss": 0.3036, "step": 950 }, { "epoch": 6.452893958597381, "grad_norm": 3.6742842197418213, "learning_rate": 1.3972641459636548e-05, "loss": 0.276, "step": 955 }, { "epoch": 6.486692015209125, "grad_norm": 2.9099996089935303, "learning_rate": 1.3899860120341338e-05, "loss": 0.2841, "step": 960 }, { "epoch": 6.52049007182087, "grad_norm": 2.4859213829040527, "learning_rate": 1.3826834323650899e-05, "loss": 0.2752, "step": 965 }, { "epoch": 6.554288128432615, "grad_norm": 2.6533761024475098, "learning_rate": 1.3753568647087372e-05, "loss": 0.212, "step": 970 }, { "epoch": 6.58808618504436, "grad_norm": 2.8711912631988525, "learning_rate": 1.3680067683209438e-05, "loss": 0.2039, "step": 975 }, { "epoch": 6.621884241656105, "grad_norm": 3.615388870239258, "learning_rate": 1.3606336039324439e-05, "loss": 0.1882, "step": 980 }, { "epoch": 6.65568229826785, "grad_norm": 2.813685655593872, "learning_rate": 1.353237833719958e-05, "loss": 0.2237, "step": 985 }, { "epoch": 6.689480354879595, "grad_norm": 3.288862466812134, "learning_rate": 1.3458199212772227e-05, "loss": 0.2177, "step": 990 }, { "epoch": 6.723278411491339, "grad_norm": 3.3833813667297363, "learning_rate": 1.3383803315859281e-05, "loss": 0.2406, "step": 995 }, { "epoch": 6.757076468103084, "grad_norm": 3.7307562828063965, "learning_rate": 1.3309195309865746e-05, "loss": 0.1924, "step": 1000 }, { "epoch": 6.7908745247148286, "grad_norm": 3.9301440715789795, "learning_rate": 1.3234379871492381e-05, "loss": 0.2912, "step": 1005 }, { "epoch": 6.824672581326574, "grad_norm": 1.9294644594192505, "learning_rate": 1.315936169044257e-05, "loss": 0.2257, "step": 1010 }, { "epoch": 6.858470637938319, "grad_norm": 3.4223814010620117, "learning_rate": 1.3084145469128343e-05, "loss": 0.2205, "step": 1015 }, { "epoch": 6.892268694550063, "grad_norm": 3.395117998123169, "learning_rate": 1.3008735922375607e-05, "loss": 0.2059, "step": 1020 }, { "epoch": 6.926066751161808, "grad_norm": 3.7277326583862305, "learning_rate": 1.2933137777128607e-05, "loss": 0.2599, "step": 1025 }, { "epoch": 6.9598648077735525, "grad_norm": 2.926193952560425, "learning_rate": 1.2857355772153637e-05, "loss": 0.2058, "step": 1030 }, { "epoch": 6.993662864385298, "grad_norm": 2.551806926727295, "learning_rate": 1.2781394657741988e-05, "loss": 0.3004, "step": 1035 }, { "epoch": 7.027038445289396, "grad_norm": 3.521486759185791, "learning_rate": 1.2705259195412168e-05, "loss": 0.1499, "step": 1040 }, { "epoch": 7.06083650190114, "grad_norm": 3.246941089630127, "learning_rate": 1.2628954157611449e-05, "loss": 0.2174, "step": 1045 }, { "epoch": 7.094634558512886, "grad_norm": 2.2454280853271484, "learning_rate": 1.255248432741672e-05, "loss": 0.1209, "step": 1050 }, { "epoch": 7.1284326151246304, "grad_norm": 2.4737725257873535, "learning_rate": 1.2475854498234647e-05, "loss": 0.1727, "step": 1055 }, { "epoch": 7.162230671736375, "grad_norm": 2.819976329803467, "learning_rate": 1.239906947350121e-05, "loss": 0.2555, "step": 1060 }, { "epoch": 7.19602872834812, "grad_norm": 2.772263765335083, "learning_rate": 1.2322134066380622e-05, "loss": 0.2112, "step": 1065 }, { "epoch": 7.229826784959865, "grad_norm": 3.721599817276001, "learning_rate": 1.22450530994636e-05, "loss": 0.3326, "step": 1070 }, { "epoch": 7.26362484157161, "grad_norm": 2.8285434246063232, "learning_rate": 1.2167831404465078e-05, "loss": 0.2237, "step": 1075 }, { "epoch": 7.297422898183354, "grad_norm": 3.2905073165893555, "learning_rate": 1.2090473821921343e-05, "loss": 0.1998, "step": 1080 }, { "epoch": 7.331220954795099, "grad_norm": 2.5703885555267334, "learning_rate": 1.2012985200886602e-05, "loss": 0.2402, "step": 1085 }, { "epoch": 7.365019011406844, "grad_norm": 3.2286860942840576, "learning_rate": 1.1935370398629033e-05, "loss": 0.1771, "step": 1090 }, { "epoch": 7.398817068018589, "grad_norm": 3.355846881866455, "learning_rate": 1.185763428032631e-05, "loss": 0.2184, "step": 1095 }, { "epoch": 7.432615124630334, "grad_norm": 2.6862475872039795, "learning_rate": 1.1779781718760641e-05, "loss": 0.212, "step": 1100 }, { "epoch": 7.466413181242078, "grad_norm": 3.8962576389312744, "learning_rate": 1.1701817594013312e-05, "loss": 0.214, "step": 1105 }, { "epoch": 7.500211237853823, "grad_norm": 3.2958405017852783, "learning_rate": 1.1623746793158803e-05, "loss": 0.2378, "step": 1110 }, { "epoch": 7.5340092944655686, "grad_norm": 2.6480026245117188, "learning_rate": 1.1545574209958433e-05, "loss": 0.1399, "step": 1115 }, { "epoch": 7.567807351077313, "grad_norm": 3.944840669631958, "learning_rate": 1.1467304744553618e-05, "loss": 0.2823, "step": 1120 }, { "epoch": 7.601605407689058, "grad_norm": 4.2091498374938965, "learning_rate": 1.1388943303158692e-05, "loss": 0.1703, "step": 1125 }, { "epoch": 7.635403464300802, "grad_norm": 4.504730701446533, "learning_rate": 1.1310494797753382e-05, "loss": 0.1969, "step": 1130 }, { "epoch": 7.669201520912548, "grad_norm": 3.6243932247161865, "learning_rate": 1.1231964145774906e-05, "loss": 0.2886, "step": 1135 }, { "epoch": 7.7029995775242925, "grad_norm": 3.16015887260437, "learning_rate": 1.1153356269809721e-05, "loss": 0.1156, "step": 1140 }, { "epoch": 7.736797634136037, "grad_norm": 3.0954883098602295, "learning_rate": 1.1074676097284973e-05, "loss": 0.1634, "step": 1145 }, { "epoch": 7.770595690747782, "grad_norm": 3.1873254776000977, "learning_rate": 1.0995928560159608e-05, "loss": 0.2507, "step": 1150 }, { "epoch": 7.804393747359526, "grad_norm": 3.6099650859832764, "learning_rate": 1.0917118594615237e-05, "loss": 0.2474, "step": 1155 }, { "epoch": 7.838191803971272, "grad_norm": 3.4526472091674805, "learning_rate": 1.0838251140746717e-05, "loss": 0.1501, "step": 1160 }, { "epoch": 7.8719898605830165, "grad_norm": 2.2834644317626953, "learning_rate": 1.0759331142252463e-05, "loss": 0.1648, "step": 1165 }, { "epoch": 7.905787917194761, "grad_norm": 3.0223686695098877, "learning_rate": 1.0680363546124599e-05, "loss": 0.1598, "step": 1170 }, { "epoch": 7.939585973806506, "grad_norm": 3.2281494140625, "learning_rate": 1.060135330233883e-05, "loss": 0.1681, "step": 1175 }, { "epoch": 7.973384030418251, "grad_norm": 3.3291306495666504, "learning_rate": 1.0522305363544172e-05, "loss": 0.1202, "step": 1180 }, { "epoch": 8.00675961132235, "grad_norm": 2.6950342655181885, "learning_rate": 1.04432246847525e-05, "loss": 0.2243, "step": 1185 }, { "epoch": 8.040557667934094, "grad_norm": 3.4718968868255615, "learning_rate": 1.0364116223027956e-05, "loss": 0.1996, "step": 1190 }, { "epoch": 8.074355724545839, "grad_norm": 3.3445370197296143, "learning_rate": 1.0284984937176213e-05, "loss": 0.2244, "step": 1195 }, { "epoch": 8.108153781157583, "grad_norm": 2.8722851276397705, "learning_rate": 1.0205835787433645e-05, "loss": 0.099, "step": 1200 }, { "epoch": 8.141951837769328, "grad_norm": 2.5152461528778076, "learning_rate": 1.0126673735156402e-05, "loss": 0.1599, "step": 1205 }, { "epoch": 8.175749894381074, "grad_norm": 3.2663590908050537, "learning_rate": 1.0047503742509405e-05, "loss": 0.2148, "step": 1210 }, { "epoch": 8.209547950992818, "grad_norm": 2.693246603012085, "learning_rate": 9.968330772155312e-06, "loss": 0.219, "step": 1215 }, { "epoch": 8.243346007604563, "grad_norm": 3.533890962600708, "learning_rate": 9.889159786943428e-06, "loss": 0.1133, "step": 1220 }, { "epoch": 8.277144064216307, "grad_norm": 2.7618963718414307, "learning_rate": 9.809995749598633e-06, "loss": 0.1692, "step": 1225 }, { "epoch": 8.310942120828052, "grad_norm": 2.682603120803833, "learning_rate": 9.730843622410282e-06, "loss": 0.2291, "step": 1230 }, { "epoch": 8.344740177439798, "grad_norm": 2.9029886722564697, "learning_rate": 9.651708366921152e-06, "loss": 0.165, "step": 1235 }, { "epoch": 8.378538234051542, "grad_norm": 2.932377576828003, "learning_rate": 9.572594943616457e-06, "loss": 0.1651, "step": 1240 }, { "epoch": 8.412336290663287, "grad_norm": 3.0703186988830566, "learning_rate": 9.493508311612874e-06, "loss": 0.1969, "step": 1245 }, { "epoch": 8.446134347275033, "grad_norm": 2.8268532752990723, "learning_rate": 9.414453428347715e-06, "loss": 0.1747, "step": 1250 }, { "epoch": 8.479932403886776, "grad_norm": 2.9563803672790527, "learning_rate": 9.335435249268165e-06, "loss": 0.1082, "step": 1255 }, { "epoch": 8.513730460498522, "grad_norm": 3.163346767425537, "learning_rate": 9.256458727520648e-06, "loss": 0.1776, "step": 1260 }, { "epoch": 8.547528517110266, "grad_norm": 3.5345945358276367, "learning_rate": 9.177528813640362e-06, "loss": 0.1194, "step": 1265 }, { "epoch": 8.581326573722011, "grad_norm": 3.074373722076416, "learning_rate": 9.098650455240959e-06, "loss": 0.197, "step": 1270 }, { "epoch": 8.615124630333757, "grad_norm": 3.080812454223633, "learning_rate": 9.019828596704394e-06, "loss": 0.1218, "step": 1275 }, { "epoch": 8.6489226869455, "grad_norm": 3.2213311195373535, "learning_rate": 8.941068178871021e-06, "loss": 0.1822, "step": 1280 }, { "epoch": 8.682720743557246, "grad_norm": 2.857954740524292, "learning_rate": 8.862374138729854e-06, "loss": 0.1687, "step": 1285 }, { "epoch": 8.71651880016899, "grad_norm": 2.9493982791900635, "learning_rate": 8.783751409109116e-06, "loss": 0.1393, "step": 1290 }, { "epoch": 8.750316856780735, "grad_norm": 1.754936695098877, "learning_rate": 8.705204918367032e-06, "loss": 0.1846, "step": 1295 }, { "epoch": 8.78411491339248, "grad_norm": 4.011746406555176, "learning_rate": 8.626739590082897e-06, "loss": 0.1897, "step": 1300 }, { "epoch": 8.817912970004224, "grad_norm": 3.003286361694336, "learning_rate": 8.54836034274844e-06, "loss": 0.1873, "step": 1305 }, { "epoch": 8.85171102661597, "grad_norm": 3.0416910648345947, "learning_rate": 8.47007208945953e-06, "loss": 0.1263, "step": 1310 }, { "epoch": 8.885509083227713, "grad_norm": 3.4020864963531494, "learning_rate": 8.391879737608202e-06, "loss": 0.1536, "step": 1315 }, { "epoch": 8.919307139839459, "grad_norm": 2.8439645767211914, "learning_rate": 8.313788188575032e-06, "loss": 0.1835, "step": 1320 }, { "epoch": 8.953105196451205, "grad_norm": 2.475952386856079, "learning_rate": 8.23580233742192e-06, "loss": 0.1275, "step": 1325 }, { "epoch": 8.986903253062948, "grad_norm": 3.099142551422119, "learning_rate": 8.15792707258522e-06, "loss": 0.1355, "step": 1330 }, { "epoch": 9.020278833967048, "grad_norm": 2.5242106914520264, "learning_rate": 8.08016727556936e-06, "loss": 0.1135, "step": 1335 }, { "epoch": 9.054076890578791, "grad_norm": 2.4750607013702393, "learning_rate": 8.002527820640809e-06, "loss": 0.1477, "step": 1340 }, { "epoch": 9.087874947190537, "grad_norm": 2.5990304946899414, "learning_rate": 7.925013574522556e-06, "loss": 0.1125, "step": 1345 }, { "epoch": 9.12167300380228, "grad_norm": 2.2538115978240967, "learning_rate": 7.847629396089054e-06, "loss": 0.1967, "step": 1350 }, { "epoch": 9.155471060414026, "grad_norm": 2.93662691116333, "learning_rate": 7.770380136061643e-06, "loss": 0.1963, "step": 1355 }, { "epoch": 9.189269117025772, "grad_norm": 3.2367334365844727, "learning_rate": 7.693270636704476e-06, "loss": 0.0882, "step": 1360 }, { "epoch": 9.223067173637515, "grad_norm": 2.297624349594116, "learning_rate": 7.616305731521009e-06, "loss": 0.1547, "step": 1365 }, { "epoch": 9.256865230249261, "grad_norm": 3.3643083572387695, "learning_rate": 7.539490244951013e-06, "loss": 0.1491, "step": 1370 }, { "epoch": 9.290663286861005, "grad_norm": 2.270787477493286, "learning_rate": 7.462828992068144e-06, "loss": 0.1255, "step": 1375 }, { "epoch": 9.32446134347275, "grad_norm": 2.6333799362182617, "learning_rate": 7.386326778278142e-06, "loss": 0.1117, "step": 1380 }, { "epoch": 9.358259400084496, "grad_norm": 2.613737106323242, "learning_rate": 7.3099883990176025e-06, "loss": 0.1612, "step": 1385 }, { "epoch": 9.39205745669624, "grad_norm": 1.7052559852600098, "learning_rate": 7.233818639453358e-06, "loss": 0.1471, "step": 1390 }, { "epoch": 9.425855513307985, "grad_norm": 3.2761054039001465, "learning_rate": 7.15782227418257e-06, "loss": 0.122, "step": 1395 }, { "epoch": 9.45965356991973, "grad_norm": 2.652831792831421, "learning_rate": 7.0820040669333975e-06, "loss": 0.1438, "step": 1400 }, { "epoch": 9.493451626531474, "grad_norm": 3.1051905155181885, "learning_rate": 7.006368770266421e-06, "loss": 0.1396, "step": 1405 }, { "epoch": 9.52724968314322, "grad_norm": 2.8987197875976562, "learning_rate": 6.930921125276715e-06, "loss": 0.1714, "step": 1410 }, { "epoch": 9.561047739754963, "grad_norm": 3.5985753536224365, "learning_rate": 6.855665861296662e-06, "loss": 0.1221, "step": 1415 }, { "epoch": 9.594845796366709, "grad_norm": 3.5496666431427, "learning_rate": 6.78060769559951e-06, "loss": 0.1261, "step": 1420 }, { "epoch": 9.628643852978454, "grad_norm": 2.57647442817688, "learning_rate": 6.705751333103676e-06, "loss": 0.132, "step": 1425 }, { "epoch": 9.662441909590198, "grad_norm": 2.8501367568969727, "learning_rate": 6.631101466077801e-06, "loss": 0.1463, "step": 1430 }, { "epoch": 9.696239966201944, "grad_norm": 2.449470043182373, "learning_rate": 6.556662773846658e-06, "loss": 0.1387, "step": 1435 }, { "epoch": 9.730038022813687, "grad_norm": 3.8504765033721924, "learning_rate": 6.48243992249781e-06, "loss": 0.1906, "step": 1440 }, { "epoch": 9.763836079425433, "grad_norm": 2.4857442378997803, "learning_rate": 6.40843756458913e-06, "loss": 0.1024, "step": 1445 }, { "epoch": 9.797634136037178, "grad_norm": 3.8078644275665283, "learning_rate": 6.3346603388571605e-06, "loss": 0.1211, "step": 1450 }, { "epoch": 9.831432192648922, "grad_norm": 2.8603129386901855, "learning_rate": 6.261112869926348e-06, "loss": 0.0645, "step": 1455 }, { "epoch": 9.865230249260668, "grad_norm": 2.669579267501831, "learning_rate": 6.187799768019134e-06, "loss": 0.194, "step": 1460 }, { "epoch": 9.899028305872413, "grad_norm": 2.163553237915039, "learning_rate": 6.114725628666997e-06, "loss": 0.1371, "step": 1465 }, { "epoch": 9.932826362484157, "grad_norm": 3.211575984954834, "learning_rate": 6.041895032422377e-06, "loss": 0.1427, "step": 1470 }, { "epoch": 9.966624419095902, "grad_norm": 3.1249096393585205, "learning_rate": 5.969312544571529e-06, "loss": 0.1482, "step": 1475 }, { "epoch": 10.0, "grad_norm": 2.95405912399292, "learning_rate": 5.8969827148483935e-06, "loss": 0.1493, "step": 1480 }, { "epoch": 10.033798056611746, "grad_norm": 2.1418049335479736, "learning_rate": 5.824910077149372e-06, "loss": 0.1223, "step": 1485 }, { "epoch": 10.06759611322349, "grad_norm": 2.2330262660980225, "learning_rate": 5.753099149249133e-06, "loss": 0.1569, "step": 1490 }, { "epoch": 10.101394169835235, "grad_norm": 2.517437696456909, "learning_rate": 5.681554432517435e-06, "loss": 0.0826, "step": 1495 }, { "epoch": 10.135192226446978, "grad_norm": 2.317457675933838, "learning_rate": 5.610280411636941e-06, "loss": 0.1024, "step": 1500 }, { "epoch": 10.168990283058724, "grad_norm": 3.2839527130126953, "learning_rate": 5.539281554322126e-06, "loss": 0.1484, "step": 1505 }, { "epoch": 10.20278833967047, "grad_norm": 3.0793209075927734, "learning_rate": 5.468562311039205e-06, "loss": 0.1529, "step": 1510 }, { "epoch": 10.236586396282213, "grad_norm": 2.524780035018921, "learning_rate": 5.3981271147271786e-06, "loss": 0.09, "step": 1515 }, { "epoch": 10.270384452893959, "grad_norm": 2.0456202030181885, "learning_rate": 5.327980380519942e-06, "loss": 0.1159, "step": 1520 }, { "epoch": 10.304182509505704, "grad_norm": 2.448542356491089, "learning_rate": 5.25812650546955e-06, "loss": 0.1431, "step": 1525 }, { "epoch": 10.337980566117448, "grad_norm": 1.669090986251831, "learning_rate": 5.188569868270566e-06, "loss": 0.1234, "step": 1530 }, { "epoch": 10.371778622729193, "grad_norm": 3.0153934955596924, "learning_rate": 5.11931482898562e-06, "loss": 0.1086, "step": 1535 }, { "epoch": 10.405576679340937, "grad_norm": 3.3632757663726807, "learning_rate": 5.050365728772084e-06, "loss": 0.1114, "step": 1540 }, { "epoch": 10.439374735952683, "grad_norm": 2.883791208267212, "learning_rate": 4.981726889609952e-06, "loss": 0.1465, "step": 1545 }, { "epoch": 10.473172792564428, "grad_norm": 1.6629996299743652, "learning_rate": 4.913402614030944e-06, "loss": 0.0823, "step": 1550 }, { "epoch": 10.506970849176172, "grad_norm": 2.789846658706665, "learning_rate": 4.84539718484877e-06, "loss": 0.133, "step": 1555 }, { "epoch": 10.540768905787917, "grad_norm": 2.095916509628296, "learning_rate": 4.77771486489071e-06, "loss": 0.0988, "step": 1560 }, { "epoch": 10.574566962399661, "grad_norm": 2.670482635498047, "learning_rate": 4.710359896730379e-06, "loss": 0.1166, "step": 1565 }, { "epoch": 10.608365019011407, "grad_norm": 1.432079553604126, "learning_rate": 4.643336502421783e-06, "loss": 0.1624, "step": 1570 }, { "epoch": 10.642163075623152, "grad_norm": 2.3370885848999023, "learning_rate": 4.576648883234686e-06, "loss": 0.1007, "step": 1575 }, { "epoch": 10.675961132234896, "grad_norm": 3.077364921569824, "learning_rate": 4.510301219391245e-06, "loss": 0.095, "step": 1580 }, { "epoch": 10.709759188846641, "grad_norm": 3.081515312194824, "learning_rate": 4.444297669803981e-06, "loss": 0.1086, "step": 1585 }, { "epoch": 10.743557245458387, "grad_norm": 3.574352502822876, "learning_rate": 4.378642371815078e-06, "loss": 0.1501, "step": 1590 }, { "epoch": 10.77735530207013, "grad_norm": 2.738147735595703, "learning_rate": 4.313339440937055e-06, "loss": 0.1719, "step": 1595 }, { "epoch": 10.811153358681876, "grad_norm": 2.235377073287964, "learning_rate": 4.248392970594774e-06, "loss": 0.1176, "step": 1600 }, { "epoch": 10.84495141529362, "grad_norm": 2.95943021774292, "learning_rate": 4.18380703186886e-06, "loss": 0.1334, "step": 1605 }, { "epoch": 10.878749471905365, "grad_norm": 1.9108000993728638, "learning_rate": 4.1195856732405094e-06, "loss": 0.113, "step": 1610 }, { "epoch": 10.912547528517111, "grad_norm": 2.856457233428955, "learning_rate": 4.055732920337699e-06, "loss": 0.1027, "step": 1615 }, { "epoch": 10.946345585128855, "grad_norm": 2.5498857498168945, "learning_rate": 3.992252775682877e-06, "loss": 0.0869, "step": 1620 }, { "epoch": 10.9801436417406, "grad_norm": 2.5696861743927, "learning_rate": 3.929149218442052e-06, "loss": 0.1553, "step": 1625 }, { "epoch": 11.013519222644698, "grad_norm": 1.5783302783966064, "learning_rate": 3.866426204175353e-06, "loss": 0.1055, "step": 1630 }, { "epoch": 11.047317279256443, "grad_norm": 2.1971595287323, "learning_rate": 3.804087664589108e-06, "loss": 0.1169, "step": 1635 }, { "epoch": 11.081115335868187, "grad_norm": 2.1792209148406982, "learning_rate": 3.742137507289363e-06, "loss": 0.1408, "step": 1640 }, { "epoch": 11.114913392479933, "grad_norm": 2.117349147796631, "learning_rate": 3.680579615536961e-06, "loss": 0.0973, "step": 1645 }, { "epoch": 11.148711449091678, "grad_norm": 2.348695755004883, "learning_rate": 3.6194178480041174e-06, "loss": 0.0879, "step": 1650 }, { "epoch": 11.182509505703422, "grad_norm": 2.529822826385498, "learning_rate": 3.558656038532532e-06, "loss": 0.1049, "step": 1655 }, { "epoch": 11.216307562315167, "grad_norm": 1.6536489725112915, "learning_rate": 3.4982979958930896e-06, "loss": 0.0713, "step": 1660 }, { "epoch": 11.250105618926911, "grad_norm": 3.6709718704223633, "learning_rate": 3.4383475035471026e-06, "loss": 0.0843, "step": 1665 }, { "epoch": 11.283903675538657, "grad_norm": 2.0067543983459473, "learning_rate": 3.378808319409149e-06, "loss": 0.1148, "step": 1670 }, { "epoch": 11.317701732150402, "grad_norm": 2.263753890991211, "learning_rate": 3.319684175611517e-06, "loss": 0.1042, "step": 1675 }, { "epoch": 11.351499788762146, "grad_norm": 2.691466808319092, "learning_rate": 3.2609787782702595e-06, "loss": 0.0902, "step": 1680 }, { "epoch": 11.385297845373891, "grad_norm": 2.7062034606933594, "learning_rate": 3.2026958072528715e-06, "loss": 0.0978, "step": 1685 }, { "epoch": 11.419095901985635, "grad_norm": 2.082036256790161, "learning_rate": 3.1448389159476433e-06, "loss": 0.1192, "step": 1690 }, { "epoch": 11.45289395859738, "grad_norm": 1.7839562892913818, "learning_rate": 3.087411731034641e-06, "loss": 0.1098, "step": 1695 }, { "epoch": 11.486692015209126, "grad_norm": 2.078550100326538, "learning_rate": 3.0304178522583626e-06, "loss": 0.0822, "step": 1700 }, { "epoch": 11.52049007182087, "grad_norm": 1.636839747428894, "learning_rate": 2.973860852202117e-06, "loss": 0.0987, "step": 1705 }, { "epoch": 11.554288128432615, "grad_norm": 2.2059497833251953, "learning_rate": 2.917744276064056e-06, "loss": 0.1176, "step": 1710 }, { "epoch": 11.58808618504436, "grad_norm": 2.563145637512207, "learning_rate": 2.8620716414349714e-06, "loss": 0.1158, "step": 1715 }, { "epoch": 11.621884241656105, "grad_norm": 2.6411447525024414, "learning_rate": 2.806846438077787e-06, "loss": 0.1471, "step": 1720 }, { "epoch": 11.65568229826785, "grad_norm": 1.4738620519638062, "learning_rate": 2.7520721277088023e-06, "loss": 0.1833, "step": 1725 }, { "epoch": 11.689480354879594, "grad_norm": 1.8081343173980713, "learning_rate": 2.697752143780713e-06, "loss": 0.1188, "step": 1730 }, { "epoch": 11.72327841149134, "grad_norm": 1.6308510303497314, "learning_rate": 2.643889891267386e-06, "loss": 0.0962, "step": 1735 }, { "epoch": 11.757076468103085, "grad_norm": 2.0612504482269287, "learning_rate": 2.5904887464504115e-06, "loss": 0.0656, "step": 1740 }, { "epoch": 11.790874524714829, "grad_norm": 2.3865137100219727, "learning_rate": 2.537552056707483e-06, "loss": 0.1124, "step": 1745 }, { "epoch": 11.824672581326574, "grad_norm": 2.3273239135742188, "learning_rate": 2.4850831403025597e-06, "loss": 0.0682, "step": 1750 }, { "epoch": 11.858470637938318, "grad_norm": 2.371812105178833, "learning_rate": 2.433085286177872e-06, "loss": 0.0906, "step": 1755 }, { "epoch": 11.892268694550063, "grad_norm": 4.104214191436768, "learning_rate": 2.381561753747753e-06, "loss": 0.1273, "step": 1760 }, { "epoch": 11.926066751161809, "grad_norm": 2.1697592735290527, "learning_rate": 2.330515772694333e-06, "loss": 0.1251, "step": 1765 }, { "epoch": 11.959864807773553, "grad_norm": 3.299699068069458, "learning_rate": 2.279950542765078e-06, "loss": 0.0756, "step": 1770 }, { "epoch": 11.993662864385298, "grad_norm": 3.481651544570923, "learning_rate": 2.2298692335722403e-06, "loss": 0.1518, "step": 1775 }, { "epoch": 12.027038445289396, "grad_norm": 1.655312418937683, "learning_rate": 2.1802749843941583e-06, "loss": 0.084, "step": 1780 }, { "epoch": 12.060836501901141, "grad_norm": 1.347791075706482, "learning_rate": 2.1311709039784734e-06, "loss": 0.0561, "step": 1785 }, { "epoch": 12.094634558512885, "grad_norm": 1.9169631004333496, "learning_rate": 2.0825600703472814e-06, "loss": 0.1018, "step": 1790 }, { "epoch": 12.12843261512463, "grad_norm": 1.5145456790924072, "learning_rate": 2.0344455306041633e-06, "loss": 0.1338, "step": 1795 }, { "epoch": 12.162230671736376, "grad_norm": 1.8022133111953735, "learning_rate": 1.98683030074321e-06, "loss": 0.1331, "step": 1800 }, { "epoch": 12.19602872834812, "grad_norm": 2.502906084060669, "learning_rate": 1.939717365459952e-06, "loss": 0.0758, "step": 1805 }, { "epoch": 12.229826784959865, "grad_norm": 2.3254261016845703, "learning_rate": 1.8931096779642644e-06, "loss": 0.1571, "step": 1810 }, { "epoch": 12.263624841571609, "grad_norm": 1.831810474395752, "learning_rate": 1.847010159795265e-06, "loss": 0.1052, "step": 1815 }, { "epoch": 12.297422898183354, "grad_norm": 2.395282745361328, "learning_rate": 1.8014217006381728e-06, "loss": 0.057, "step": 1820 }, { "epoch": 12.3312209547951, "grad_norm": 2.0812599658966064, "learning_rate": 1.7563471581431623e-06, "loss": 0.0743, "step": 1825 }, { "epoch": 12.365019011406844, "grad_norm": 1.5092777013778687, "learning_rate": 1.7117893577462541e-06, "loss": 0.0733, "step": 1830 }, { "epoch": 12.39881706801859, "grad_norm": 2.1698033809661865, "learning_rate": 1.6677510924921958e-06, "loss": 0.099, "step": 1835 }, { "epoch": 12.432615124630333, "grad_norm": 2.5433080196380615, "learning_rate": 1.6242351228593833e-06, "loss": 0.0944, "step": 1840 }, { "epoch": 12.466413181242078, "grad_norm": 2.1289961338043213, "learning_rate": 1.5812441765868292e-06, "loss": 0.0881, "step": 1845 }, { "epoch": 12.500211237853824, "grad_norm": 1.7505559921264648, "learning_rate": 1.5387809485031745e-06, "loss": 0.065, "step": 1850 }, { "epoch": 12.534009294465568, "grad_norm": 2.4067695140838623, "learning_rate": 1.4968481003577628e-06, "loss": 0.0476, "step": 1855 }, { "epoch": 12.567807351077313, "grad_norm": 2.417130708694458, "learning_rate": 1.4554482606538044e-06, "loss": 0.1166, "step": 1860 }, { "epoch": 12.601605407689059, "grad_norm": 1.8488808870315552, "learning_rate": 1.4145840244835985e-06, "loss": 0.1015, "step": 1865 }, { "epoch": 12.635403464300802, "grad_norm": 2.4164412021636963, "learning_rate": 1.3742579533658729e-06, "loss": 0.0822, "step": 1870 }, { "epoch": 12.669201520912548, "grad_norm": 2.083436965942383, "learning_rate": 1.3344725750852183e-06, "loss": 0.1192, "step": 1875 }, { "epoch": 12.702999577524292, "grad_norm": 1.8657339811325073, "learning_rate": 1.2952303835336256e-06, "loss": 0.1488, "step": 1880 }, { "epoch": 12.736797634136037, "grad_norm": 2.0696699619293213, "learning_rate": 1.2565338385541792e-06, "loss": 0.0752, "step": 1885 }, { "epoch": 12.770595690747783, "grad_norm": 1.6756703853607178, "learning_rate": 1.2183853657868504e-06, "loss": 0.107, "step": 1890 }, { "epoch": 12.804393747359526, "grad_norm": 2.409106731414795, "learning_rate": 1.1807873565164507e-06, "loss": 0.0669, "step": 1895 }, { "epoch": 12.838191803971272, "grad_norm": 1.7135124206542969, "learning_rate": 1.1437421675227457e-06, "loss": 0.1809, "step": 1900 }, { "epoch": 12.871989860583017, "grad_norm": 1.9362844228744507, "learning_rate": 1.107252120932717e-06, "loss": 0.1153, "step": 1905 }, { "epoch": 12.905787917194761, "grad_norm": 1.499002456665039, "learning_rate": 1.0713195040750012e-06, "loss": 0.1103, "step": 1910 }, { "epoch": 12.939585973806507, "grad_norm": 2.998647689819336, "learning_rate": 1.035946569336519e-06, "loss": 0.089, "step": 1915 }, { "epoch": 12.97338403041825, "grad_norm": 2.5154922008514404, "learning_rate": 1.0011355340212802e-06, "loss": 0.1253, "step": 1920 }, { "epoch": 13.00675961132235, "grad_norm": 2.583174467086792, "learning_rate": 9.668885802114002e-07, "loss": 0.0991, "step": 1925 }, { "epoch": 13.040557667934094, "grad_norm": 2.039463758468628, "learning_rate": 9.33207854630317e-07, "loss": 0.063, "step": 1930 }, { "epoch": 13.074355724545839, "grad_norm": 2.5090322494506836, "learning_rate": 9.000954685082286e-07, "loss": 0.0839, "step": 1935 }, { "epoch": 13.108153781157583, "grad_norm": 1.2274693250656128, "learning_rate": 8.675534974497435e-07, "loss": 0.0393, "step": 1940 }, { "epoch": 13.141951837769328, "grad_norm": 1.7376823425292969, "learning_rate": 8.355839813037936e-07, "loss": 0.0899, "step": 1945 }, { "epoch": 13.175749894381074, "grad_norm": 1.657383918762207, "learning_rate": 8.041889240357493e-07, "loss": 0.0883, "step": 1950 }, { "epoch": 13.209547950992818, "grad_norm": 2.177140235900879, "learning_rate": 7.733702936018162e-07, "loss": 0.0703, "step": 1955 }, { "epoch": 13.243346007604563, "grad_norm": 1.9505295753479004, "learning_rate": 7.431300218256754e-07, "loss": 0.0734, "step": 1960 }, { "epoch": 13.277144064216307, "grad_norm": 1.5833914279937744, "learning_rate": 7.13470004277379e-07, "loss": 0.0464, "step": 1965 }, { "epoch": 13.310942120828052, "grad_norm": 2.343639850616455, "learning_rate": 6.843921001545429e-07, "loss": 0.0652, "step": 1970 }, { "epoch": 13.344740177439798, "grad_norm": 2.3044393062591553, "learning_rate": 6.558981321658009e-07, "loss": 0.1476, "step": 1975 }, { "epoch": 13.378538234051542, "grad_norm": 1.9348516464233398, "learning_rate": 6.279898864165423e-07, "loss": 0.0909, "step": 1980 }, { "epoch": 13.412336290663287, "grad_norm": 2.0757150650024414, "learning_rate": 6.006691122969644e-07, "loss": 0.1158, "step": 1985 }, { "epoch": 13.446134347275033, "grad_norm": 1.7569150924682617, "learning_rate": 5.739375223724108e-07, "loss": 0.0857, "step": 1990 }, { "epoch": 13.479932403886776, "grad_norm": 2.729357957839966, "learning_rate": 5.477967922760141e-07, "loss": 0.1197, "step": 1995 }, { "epoch": 13.513730460498522, "grad_norm": 1.9830466508865356, "learning_rate": 5.222485606036709e-07, "loss": 0.0667, "step": 2000 }, { "epoch": 13.547528517110266, "grad_norm": 2.4005234241485596, "learning_rate": 4.972944288113268e-07, "loss": 0.1217, "step": 2005 }, { "epoch": 13.581326573722011, "grad_norm": 2.361483335494995, "learning_rate": 4.729359611145845e-07, "loss": 0.11, "step": 2010 }, { "epoch": 13.615124630333757, "grad_norm": 1.8319944143295288, "learning_rate": 4.49174684390663e-07, "loss": 0.0716, "step": 2015 }, { "epoch": 13.6489226869455, "grad_norm": 1.7428772449493408, "learning_rate": 4.260120880826768e-07, "loss": 0.1552, "step": 2020 }, { "epoch": 13.682720743557246, "grad_norm": 2.0641117095947266, "learning_rate": 4.034496241062824e-07, "loss": 0.1185, "step": 2025 }, { "epoch": 13.71651880016899, "grad_norm": 2.4367544651031494, "learning_rate": 3.8148870675866145e-07, "loss": 0.1445, "step": 2030 }, { "epoch": 13.750316856780735, "grad_norm": 1.3618732690811157, "learning_rate": 3.601307126298648e-07, "loss": 0.0579, "step": 2035 }, { "epoch": 13.78411491339248, "grad_norm": 1.515920877456665, "learning_rate": 3.3937698051653034e-07, "loss": 0.0543, "step": 2040 }, { "epoch": 13.817912970004224, "grad_norm": 2.7224481105804443, "learning_rate": 3.1922881133795827e-07, "loss": 0.0955, "step": 2045 }, { "epoch": 13.85171102661597, "grad_norm": 2.9664154052734375, "learning_rate": 2.996874680545603e-07, "loss": 0.1153, "step": 2050 }, { "epoch": 13.885509083227713, "grad_norm": 2.224506139755249, "learning_rate": 2.8075417558870333e-07, "loss": 0.1311, "step": 2055 }, { "epoch": 13.919307139839459, "grad_norm": 1.8828747272491455, "learning_rate": 2.624301207479185e-07, "loss": 0.1198, "step": 2060 }, { "epoch": 13.953105196451205, "grad_norm": 2.1472909450531006, "learning_rate": 2.447164521505074e-07, "loss": 0.0764, "step": 2065 }, { "epoch": 13.986903253062948, "grad_norm": 2.1299331188201904, "learning_rate": 2.276142801535486e-07, "loss": 0.1228, "step": 2070 }, { "epoch": 14.020278833967048, "grad_norm": 2.9883980751037598, "learning_rate": 2.1112467678329197e-07, "loss": 0.1373, "step": 2075 }, { "epoch": 14.054076890578791, "grad_norm": 2.33616304397583, "learning_rate": 1.9524867566795945e-07, "loss": 0.0636, "step": 2080 }, { "epoch": 14.087874947190537, "grad_norm": 1.3360828161239624, "learning_rate": 1.7998727197295785e-07, "loss": 0.0624, "step": 2085 }, { "epoch": 14.12167300380228, "grad_norm": 1.6282477378845215, "learning_rate": 1.6534142233849527e-07, "loss": 0.0585, "step": 2090 }, { "epoch": 14.155471060414026, "grad_norm": 1.8694970607757568, "learning_rate": 1.5131204481961592e-07, "loss": 0.0747, "step": 2095 }, { "epoch": 14.189269117025772, "grad_norm": 1.441935420036316, "learning_rate": 1.3790001882865056e-07, "loss": 0.0718, "step": 2100 }, { "epoch": 14.223067173637515, "grad_norm": 2.2334372997283936, "learning_rate": 1.251061850800961e-07, "loss": 0.0947, "step": 2105 }, { "epoch": 14.256865230249261, "grad_norm": 2.6375110149383545, "learning_rate": 1.1293134553791551e-07, "loss": 0.1348, "step": 2110 }, { "epoch": 14.290663286861005, "grad_norm": 2.5394539833068848, "learning_rate": 1.0137626336526596e-07, "loss": 0.1289, "step": 2115 }, { "epoch": 14.32446134347275, "grad_norm": 1.4307893514633179, "learning_rate": 9.044166287666134e-08, "loss": 0.0499, "step": 2120 }, { "epoch": 14.358259400084496, "grad_norm": 1.6300568580627441, "learning_rate": 8.012822949256981e-08, "loss": 0.1074, "step": 2125 }, { "epoch": 14.39205745669624, "grad_norm": 1.95559823513031, "learning_rate": 7.043660969645261e-08, "loss": 0.0932, "step": 2130 }, { "epoch": 14.425855513307985, "grad_norm": 2.011991024017334, "learning_rate": 6.136741099423416e-08, "loss": 0.1045, "step": 2135 }, { "epoch": 14.45965356991973, "grad_norm": 2.210416078567505, "learning_rate": 5.2921201876223737e-08, "loss": 0.1052, "step": 2140 }, { "epoch": 14.493451626531474, "grad_norm": 2.3784687519073486, "learning_rate": 4.5098511781485056e-08, "loss": 0.0797, "step": 2145 }, { "epoch": 14.52724968314322, "grad_norm": 2.040710687637329, "learning_rate": 3.789983106464057e-08, "loss": 0.121, "step": 2150 }, { "epoch": 14.561047739754963, "grad_norm": 1.7879664897918701, "learning_rate": 3.132561096514164e-08, "loss": 0.0773, "step": 2155 }, { "epoch": 14.594845796366709, "grad_norm": 1.8894425630569458, "learning_rate": 2.5376263578977823e-08, "loss": 0.0965, "step": 2160 }, { "epoch": 14.628643852978454, "grad_norm": 1.9897544384002686, "learning_rate": 2.0052161832850858e-08, "loss": 0.078, "step": 2165 }, { "epoch": 14.662441909590198, "grad_norm": 1.689963698387146, "learning_rate": 1.5353639460793378e-08, "loss": 0.1024, "step": 2170 }, { "epoch": 14.696239966201944, "grad_norm": 1.8147411346435547, "learning_rate": 1.1280990983248975e-08, "loss": 0.1124, "step": 2175 }, { "epoch": 14.730038022813687, "grad_norm": 2.499868392944336, "learning_rate": 7.834471688616952e-09, "loss": 0.1116, "step": 2180 }, { "epoch": 14.763836079425433, "grad_norm": 1.9060348272323608, "learning_rate": 5.014297617242925e-09, "loss": 0.0975, "step": 2185 }, { "epoch": 14.797634136037178, "grad_norm": 2.312739849090576, "learning_rate": 2.8206455478774206e-09, "loss": 0.101, "step": 2190 }, { "epoch": 14.831432192648922, "grad_norm": 1.106972098350525, "learning_rate": 1.2536529866014058e-09, "loss": 0.0569, "step": 2195 }, { "epoch": 14.865230249260668, "grad_norm": 1.9655669927597046, "learning_rate": 3.1341815819763146e-10, "loss": 0.1144, "step": 2200 }, { "epoch": 14.899028305872413, "grad_norm": 2.5896451473236084, "learning_rate": 0.0, "loss": 0.0896, "step": 2205 }, { "epoch": 14.899028305872413, "step": 2205, "total_flos": 3.8395126284519014e+17, "train_loss": 0.4484830284334905, "train_runtime": 10885.404, "train_samples_per_second": 3.262, "train_steps_per_second": 0.203 } ], "logging_steps": 5, "max_steps": 2205, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.8395126284519014e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }