{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999663084127893, "eval_steps": 500, "global_step": 14840, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.7383174421347e-05, "grad_norm": 43.722067285894525, "learning_rate": 6.738544474393531e-10, "loss": 3.7104, "step": 1 }, { "epoch": 0.00033691587210673494, "grad_norm": 40.400906340056835, "learning_rate": 3.3692722371967655e-09, "loss": 3.5435, "step": 5 }, { "epoch": 0.0006738317442134699, "grad_norm": 48.63313146345102, "learning_rate": 6.738544474393531e-09, "loss": 3.5281, "step": 10 }, { "epoch": 0.001010747616320205, "grad_norm": 49.18693073060567, "learning_rate": 1.0107816711590296e-08, "loss": 3.5307, "step": 15 }, { "epoch": 0.0013476634884269398, "grad_norm": 43.05362469753585, "learning_rate": 1.3477088948787062e-08, "loss": 3.4987, "step": 20 }, { "epoch": 0.0016845793605336748, "grad_norm": 41.551973296823284, "learning_rate": 1.6846361185983825e-08, "loss": 3.4785, "step": 25 }, { "epoch": 0.00202149523264041, "grad_norm": 38.93829473729282, "learning_rate": 2.021563342318059e-08, "loss": 3.4348, "step": 30 }, { "epoch": 0.0023584111047471445, "grad_norm": 40.60916311851957, "learning_rate": 2.3584905660377358e-08, "loss": 3.5078, "step": 35 }, { "epoch": 0.0026953269768538795, "grad_norm": 47.93121155982272, "learning_rate": 2.6954177897574124e-08, "loss": 3.5424, "step": 40 }, { "epoch": 0.0030322428489606146, "grad_norm": 44.24462180308707, "learning_rate": 3.032345013477089e-08, "loss": 3.5437, "step": 45 }, { "epoch": 0.0033691587210673496, "grad_norm": 51.65982491042281, "learning_rate": 3.369272237196765e-08, "loss": 3.6135, "step": 50 }, { "epoch": 0.0037060745931740842, "grad_norm": 39.011327969736655, "learning_rate": 3.706199460916442e-08, "loss": 3.5722, "step": 55 }, { "epoch": 0.00404299046528082, "grad_norm": 49.759610369779125, "learning_rate": 4.043126684636118e-08, "loss": 3.496, "step": 
60 }, { "epoch": 0.004379906337387554, "grad_norm": 40.63880789061455, "learning_rate": 4.380053908355795e-08, "loss": 3.5367, "step": 65 }, { "epoch": 0.004716822209494289, "grad_norm": 45.20625149788756, "learning_rate": 4.7169811320754715e-08, "loss": 3.6049, "step": 70 }, { "epoch": 0.005053738081601024, "grad_norm": 44.23261506934072, "learning_rate": 5.053908355795148e-08, "loss": 3.4984, "step": 75 }, { "epoch": 0.005390653953707759, "grad_norm": 35.70098514711251, "learning_rate": 5.390835579514825e-08, "loss": 3.4308, "step": 80 }, { "epoch": 0.005727569825814494, "grad_norm": 32.692008772741055, "learning_rate": 5.727762803234501e-08, "loss": 3.2682, "step": 85 }, { "epoch": 0.006064485697921229, "grad_norm": 43.069949833387234, "learning_rate": 6.064690026954177e-08, "loss": 3.4427, "step": 90 }, { "epoch": 0.006401401570027964, "grad_norm": 39.4692847938377, "learning_rate": 6.401617250673854e-08, "loss": 3.4723, "step": 95 }, { "epoch": 0.006738317442134699, "grad_norm": 36.24384217310333, "learning_rate": 6.73854447439353e-08, "loss": 3.4882, "step": 100 }, { "epoch": 0.007075233314241434, "grad_norm": 44.52041222658874, "learning_rate": 7.075471698113207e-08, "loss": 3.5081, "step": 105 }, { "epoch": 0.0074121491863481685, "grad_norm": 37.21749532010642, "learning_rate": 7.412398921832884e-08, "loss": 3.4277, "step": 110 }, { "epoch": 0.0077490650584549035, "grad_norm": 30.613369943551636, "learning_rate": 7.749326145552561e-08, "loss": 3.3701, "step": 115 }, { "epoch": 0.00808598093056164, "grad_norm": 28.646983373848474, "learning_rate": 8.086253369272237e-08, "loss": 3.3091, "step": 120 }, { "epoch": 0.008422896802668374, "grad_norm": 31.072639379214777, "learning_rate": 8.423180592991913e-08, "loss": 3.2737, "step": 125 }, { "epoch": 0.008759812674775108, "grad_norm": 33.65937071064069, "learning_rate": 8.76010781671159e-08, "loss": 3.3988, "step": 130 }, { "epoch": 0.009096728546881844, "grad_norm": 28.516189476442946, "learning_rate": 
9.097035040431267e-08, "loss": 3.3744, "step": 135 }, { "epoch": 0.009433644418988578, "grad_norm": 29.811570518748876, "learning_rate": 9.433962264150943e-08, "loss": 3.2062, "step": 140 }, { "epoch": 0.009770560291095314, "grad_norm": 26.485504422088084, "learning_rate": 9.770889487870619e-08, "loss": 3.2796, "step": 145 }, { "epoch": 0.010107476163202048, "grad_norm": 17.770056092466586, "learning_rate": 1.0107816711590296e-07, "loss": 3.1305, "step": 150 }, { "epoch": 0.010444392035308784, "grad_norm": 14.749110893597347, "learning_rate": 1.0444743935309973e-07, "loss": 3.184, "step": 155 }, { "epoch": 0.010781307907415518, "grad_norm": 15.471182129271536, "learning_rate": 1.078167115902965e-07, "loss": 3.1358, "step": 160 }, { "epoch": 0.011118223779522254, "grad_norm": 12.38428301093494, "learning_rate": 1.1118598382749325e-07, "loss": 3.0884, "step": 165 }, { "epoch": 0.011455139651628988, "grad_norm": 11.672037704337706, "learning_rate": 1.1455525606469002e-07, "loss": 3.0923, "step": 170 }, { "epoch": 0.011792055523735722, "grad_norm": 13.662530133662237, "learning_rate": 1.1792452830188679e-07, "loss": 3.0725, "step": 175 }, { "epoch": 0.012128971395842458, "grad_norm": 11.33729716271392, "learning_rate": 1.2129380053908355e-07, "loss": 3.0967, "step": 180 }, { "epoch": 0.012465887267949192, "grad_norm": 10.08307440303075, "learning_rate": 1.2466307277628032e-07, "loss": 3.0628, "step": 185 }, { "epoch": 0.012802803140055928, "grad_norm": 10.456416054418666, "learning_rate": 1.280323450134771e-07, "loss": 2.9836, "step": 190 }, { "epoch": 0.013139719012162663, "grad_norm": 8.16990335454256, "learning_rate": 1.3140161725067383e-07, "loss": 3.0403, "step": 195 }, { "epoch": 0.013476634884269399, "grad_norm": 8.445356802977297, "learning_rate": 1.347708894878706e-07, "loss": 2.9826, "step": 200 }, { "epoch": 0.013813550756376133, "grad_norm": 8.112333501113914, "learning_rate": 1.3814016172506737e-07, "loss": 3.0593, "step": 205 }, { "epoch": 
0.014150466628482869, "grad_norm": 8.77571848431658, "learning_rate": 1.4150943396226414e-07, "loss": 3.0283, "step": 210 }, { "epoch": 0.014487382500589603, "grad_norm": 7.495134946946304, "learning_rate": 1.448787061994609e-07, "loss": 2.9426, "step": 215 }, { "epoch": 0.014824298372696337, "grad_norm": 8.882647996355994, "learning_rate": 1.4824797843665768e-07, "loss": 2.95, "step": 220 }, { "epoch": 0.015161214244803073, "grad_norm": 8.52542245327393, "learning_rate": 1.5161725067385445e-07, "loss": 2.9208, "step": 225 }, { "epoch": 0.015498130116909807, "grad_norm": 7.388088002780426, "learning_rate": 1.5498652291105122e-07, "loss": 2.9493, "step": 230 }, { "epoch": 0.01583504598901654, "grad_norm": 7.343875787784921, "learning_rate": 1.58355795148248e-07, "loss": 2.8675, "step": 235 }, { "epoch": 0.01617196186112328, "grad_norm": 6.9187044964100135, "learning_rate": 1.6172506738544473e-07, "loss": 2.8982, "step": 240 }, { "epoch": 0.016508877733230013, "grad_norm": 7.708900760161609, "learning_rate": 1.650943396226415e-07, "loss": 2.865, "step": 245 }, { "epoch": 0.016845793605336747, "grad_norm": 7.466510771890464, "learning_rate": 1.6846361185983827e-07, "loss": 2.8439, "step": 250 }, { "epoch": 0.01718270947744348, "grad_norm": 6.719543691764612, "learning_rate": 1.7183288409703504e-07, "loss": 2.8781, "step": 255 }, { "epoch": 0.017519625349550216, "grad_norm": 6.474251195979412, "learning_rate": 1.752021563342318e-07, "loss": 2.7684, "step": 260 }, { "epoch": 0.017856541221656953, "grad_norm": 6.163464992505713, "learning_rate": 1.7857142857142858e-07, "loss": 2.8738, "step": 265 }, { "epoch": 0.018193457093763687, "grad_norm": 6.095476375406627, "learning_rate": 1.8194070080862535e-07, "loss": 2.8362, "step": 270 }, { "epoch": 0.01853037296587042, "grad_norm": 7.182087042455608, "learning_rate": 1.853099730458221e-07, "loss": 2.8338, "step": 275 }, { "epoch": 0.018867288837977156, "grad_norm": 6.358586240913033, "learning_rate": 1.8867924528301886e-07, 
"loss": 2.81, "step": 280 }, { "epoch": 0.019204204710083893, "grad_norm": 6.380629741654627, "learning_rate": 1.920485175202156e-07, "loss": 2.7525, "step": 285 }, { "epoch": 0.019541120582190628, "grad_norm": 6.41578227123208, "learning_rate": 1.9541778975741237e-07, "loss": 2.7912, "step": 290 }, { "epoch": 0.019878036454297362, "grad_norm": 7.690207221818896, "learning_rate": 1.9878706199460914e-07, "loss": 2.8269, "step": 295 }, { "epoch": 0.020214952326404096, "grad_norm": 7.238616552872576, "learning_rate": 2.021563342318059e-07, "loss": 2.8328, "step": 300 }, { "epoch": 0.02055186819851083, "grad_norm": 5.705158996303356, "learning_rate": 2.0552560646900268e-07, "loss": 2.6636, "step": 305 }, { "epoch": 0.020888784070617568, "grad_norm": 6.937458251927117, "learning_rate": 2.0889487870619945e-07, "loss": 2.7702, "step": 310 }, { "epoch": 0.021225699942724302, "grad_norm": 6.078604973873886, "learning_rate": 2.1226415094339622e-07, "loss": 2.8162, "step": 315 }, { "epoch": 0.021562615814831036, "grad_norm": 7.362389794657737, "learning_rate": 2.15633423180593e-07, "loss": 2.754, "step": 320 }, { "epoch": 0.02189953168693777, "grad_norm": 8.430383304213942, "learning_rate": 2.1900269541778973e-07, "loss": 2.7692, "step": 325 }, { "epoch": 0.022236447559044508, "grad_norm": 6.573899372437375, "learning_rate": 2.223719676549865e-07, "loss": 2.7308, "step": 330 }, { "epoch": 0.022573363431151242, "grad_norm": 6.06531841151832, "learning_rate": 2.2574123989218327e-07, "loss": 2.666, "step": 335 }, { "epoch": 0.022910279303257976, "grad_norm": 5.894905445567127, "learning_rate": 2.2911051212938004e-07, "loss": 2.7322, "step": 340 }, { "epoch": 0.02324719517536471, "grad_norm": 5.557405726691632, "learning_rate": 2.324797843665768e-07, "loss": 2.7651, "step": 345 }, { "epoch": 0.023584111047471445, "grad_norm": 6.093202493332179, "learning_rate": 2.3584905660377358e-07, "loss": 2.686, "step": 350 }, { "epoch": 0.023921026919578182, "grad_norm": 6.041739168570691, 
"learning_rate": 2.392183288409703e-07, "loss": 2.747, "step": 355 }, { "epoch": 0.024257942791684917, "grad_norm": 6.151437971511917, "learning_rate": 2.425876010781671e-07, "loss": 2.7208, "step": 360 }, { "epoch": 0.02459485866379165, "grad_norm": 6.288607347885197, "learning_rate": 2.4595687331536387e-07, "loss": 2.7116, "step": 365 }, { "epoch": 0.024931774535898385, "grad_norm": 6.403473460005024, "learning_rate": 2.4932614555256063e-07, "loss": 2.6888, "step": 370 }, { "epoch": 0.025268690408005123, "grad_norm": 6.99312201271685, "learning_rate": 2.526954177897574e-07, "loss": 2.6587, "step": 375 }, { "epoch": 0.025605606280111857, "grad_norm": 6.127856596970994, "learning_rate": 2.560646900269542e-07, "loss": 2.6267, "step": 380 }, { "epoch": 0.02594252215221859, "grad_norm": 6.405040807209752, "learning_rate": 2.5943396226415094e-07, "loss": 2.6032, "step": 385 }, { "epoch": 0.026279438024325325, "grad_norm": 5.646503975734986, "learning_rate": 2.6280323450134766e-07, "loss": 2.6447, "step": 390 }, { "epoch": 0.02661635389643206, "grad_norm": 6.951959004156975, "learning_rate": 2.661725067385445e-07, "loss": 2.7392, "step": 395 }, { "epoch": 0.026953269768538797, "grad_norm": 6.512538076245869, "learning_rate": 2.695417789757412e-07, "loss": 2.6114, "step": 400 }, { "epoch": 0.02729018564064553, "grad_norm": 5.8564387812206355, "learning_rate": 2.72911051212938e-07, "loss": 2.6087, "step": 405 }, { "epoch": 0.027627101512752265, "grad_norm": 7.585977737559127, "learning_rate": 2.7628032345013474e-07, "loss": 2.6589, "step": 410 }, { "epoch": 0.027964017384859, "grad_norm": 7.063631028785829, "learning_rate": 2.7964959568733156e-07, "loss": 2.6159, "step": 415 }, { "epoch": 0.028300933256965737, "grad_norm": 6.088318346146536, "learning_rate": 2.830188679245283e-07, "loss": 2.6871, "step": 420 }, { "epoch": 0.02863784912907247, "grad_norm": 5.982470168514755, "learning_rate": 2.863881401617251e-07, "loss": 2.6795, "step": 425 }, { "epoch": 
0.028974765001179206, "grad_norm": 6.357576036176945, "learning_rate": 2.897574123989218e-07, "loss": 2.6067, "step": 430 }, { "epoch": 0.02931168087328594, "grad_norm": 5.702174330133305, "learning_rate": 2.9312668463611853e-07, "loss": 2.5781, "step": 435 }, { "epoch": 0.029648596745392674, "grad_norm": 5.739625292932569, "learning_rate": 2.9649595687331536e-07, "loss": 2.6585, "step": 440 }, { "epoch": 0.02998551261749941, "grad_norm": 6.8299371414986245, "learning_rate": 2.9986522911051207e-07, "loss": 2.5856, "step": 445 }, { "epoch": 0.030322428489606146, "grad_norm": 6.404896999997733, "learning_rate": 3.032345013477089e-07, "loss": 2.6004, "step": 450 }, { "epoch": 0.03065934436171288, "grad_norm": 6.639004263831441, "learning_rate": 3.066037735849056e-07, "loss": 2.6336, "step": 455 }, { "epoch": 0.030996260233819614, "grad_norm": 6.1099943154097724, "learning_rate": 3.0997304582210244e-07, "loss": 2.5337, "step": 460 }, { "epoch": 0.03133317610592635, "grad_norm": 6.230566626650167, "learning_rate": 3.1334231805929915e-07, "loss": 2.6119, "step": 465 }, { "epoch": 0.03167009197803308, "grad_norm": 6.889255829182868, "learning_rate": 3.16711590296496e-07, "loss": 2.5579, "step": 470 }, { "epoch": 0.03200700785013982, "grad_norm": 6.906199615374025, "learning_rate": 3.200808625336927e-07, "loss": 2.6015, "step": 475 }, { "epoch": 0.03234392372224656, "grad_norm": 5.362081373442524, "learning_rate": 3.2345013477088946e-07, "loss": 2.6071, "step": 480 }, { "epoch": 0.03268083959435329, "grad_norm": 6.211679760864134, "learning_rate": 3.2681940700808623e-07, "loss": 2.5918, "step": 485 }, { "epoch": 0.033017755466460026, "grad_norm": 5.76538825819066, "learning_rate": 3.30188679245283e-07, "loss": 2.5383, "step": 490 }, { "epoch": 0.03335467133856676, "grad_norm": 6.3436757857679735, "learning_rate": 3.3355795148247977e-07, "loss": 2.6464, "step": 495 }, { "epoch": 0.033691587210673495, "grad_norm": 5.735123508322636, "learning_rate": 3.3692722371967654e-07, 
"loss": 2.5495, "step": 500 }, { "epoch": 0.03402850308278023, "grad_norm": 7.455370266302534, "learning_rate": 3.402964959568733e-07, "loss": 2.5293, "step": 505 }, { "epoch": 0.03436541895488696, "grad_norm": 5.933092400214517, "learning_rate": 3.436657681940701e-07, "loss": 2.5704, "step": 510 }, { "epoch": 0.0347023348269937, "grad_norm": 6.179980631519867, "learning_rate": 3.4703504043126685e-07, "loss": 2.5793, "step": 515 }, { "epoch": 0.03503925069910043, "grad_norm": 5.9404418007437965, "learning_rate": 3.504043126684636e-07, "loss": 2.6124, "step": 520 }, { "epoch": 0.03537616657120717, "grad_norm": 6.42851426015178, "learning_rate": 3.5377358490566033e-07, "loss": 2.5729, "step": 525 }, { "epoch": 0.03571308244331391, "grad_norm": 5.7487126230500465, "learning_rate": 3.5714285714285716e-07, "loss": 2.5013, "step": 530 }, { "epoch": 0.03604999831542064, "grad_norm": 6.026207333416938, "learning_rate": 3.605121293800539e-07, "loss": 2.5704, "step": 535 }, { "epoch": 0.036386914187527375, "grad_norm": 6.044142067735865, "learning_rate": 3.638814016172507e-07, "loss": 2.5586, "step": 540 }, { "epoch": 0.03672383005963411, "grad_norm": 6.799327984432928, "learning_rate": 3.672506738544474e-07, "loss": 2.5607, "step": 545 }, { "epoch": 0.03706074593174084, "grad_norm": 5.258032305545471, "learning_rate": 3.706199460916442e-07, "loss": 2.5603, "step": 550 }, { "epoch": 0.03739766180384758, "grad_norm": 5.482322768310166, "learning_rate": 3.7398921832884095e-07, "loss": 2.5309, "step": 555 }, { "epoch": 0.03773457767595431, "grad_norm": 6.1063366543381195, "learning_rate": 3.773584905660377e-07, "loss": 2.5586, "step": 560 }, { "epoch": 0.038071493548061046, "grad_norm": 5.196693112430663, "learning_rate": 3.807277628032345e-07, "loss": 2.513, "step": 565 }, { "epoch": 0.03840840942016779, "grad_norm": 6.209271252463008, "learning_rate": 3.840970350404312e-07, "loss": 2.5753, "step": 570 }, { "epoch": 0.03874532529227452, "grad_norm": 5.539418809836626, 
"learning_rate": 3.8746630727762803e-07, "loss": 2.4931, "step": 575 }, { "epoch": 0.039082241164381255, "grad_norm": 6.625506607370626, "learning_rate": 3.9083557951482475e-07, "loss": 2.49, "step": 580 }, { "epoch": 0.03941915703648799, "grad_norm": 5.219683719174391, "learning_rate": 3.9420485175202157e-07, "loss": 2.5358, "step": 585 }, { "epoch": 0.039756072908594724, "grad_norm": 6.102105836570702, "learning_rate": 3.975741239892183e-07, "loss": 2.5496, "step": 590 }, { "epoch": 0.04009298878070146, "grad_norm": 5.745991252452959, "learning_rate": 4.009433962264151e-07, "loss": 2.5888, "step": 595 }, { "epoch": 0.04042990465280819, "grad_norm": 6.021373594890137, "learning_rate": 4.043126684636118e-07, "loss": 2.4936, "step": 600 }, { "epoch": 0.040766820524914926, "grad_norm": 7.575900298003215, "learning_rate": 4.076819407008086e-07, "loss": 2.5407, "step": 605 }, { "epoch": 0.04110373639702166, "grad_norm": 6.644767924469903, "learning_rate": 4.1105121293800537e-07, "loss": 2.5321, "step": 610 }, { "epoch": 0.0414406522691284, "grad_norm": 8.080468112346272, "learning_rate": 4.1442048517520213e-07, "loss": 2.5294, "step": 615 }, { "epoch": 0.041777568141235136, "grad_norm": 5.814358526823887, "learning_rate": 4.177897574123989e-07, "loss": 2.5183, "step": 620 }, { "epoch": 0.04211448401334187, "grad_norm": 5.419913340343147, "learning_rate": 4.211590296495957e-07, "loss": 2.4977, "step": 625 }, { "epoch": 0.042451399885448604, "grad_norm": 6.252196669480747, "learning_rate": 4.2452830188679244e-07, "loss": 2.4872, "step": 630 }, { "epoch": 0.04278831575755534, "grad_norm": 6.099288065094699, "learning_rate": 4.278975741239892e-07, "loss": 2.5318, "step": 635 }, { "epoch": 0.04312523162966207, "grad_norm": 6.10832254348466, "learning_rate": 4.31266846361186e-07, "loss": 2.4928, "step": 640 }, { "epoch": 0.04346214750176881, "grad_norm": 5.78373775389231, "learning_rate": 4.3463611859838275e-07, "loss": 2.4132, "step": 645 }, { "epoch": 0.04379906337387554, 
"grad_norm": 5.645466631217478, "learning_rate": 4.3800539083557947e-07, "loss": 2.4863, "step": 650 }, { "epoch": 0.044135979245982275, "grad_norm": 4.949286774163871, "learning_rate": 4.413746630727763e-07, "loss": 2.5005, "step": 655 }, { "epoch": 0.044472895118089016, "grad_norm": 6.326248995197, "learning_rate": 4.44743935309973e-07, "loss": 2.4708, "step": 660 }, { "epoch": 0.04480981099019575, "grad_norm": 6.349152443553733, "learning_rate": 4.481132075471698e-07, "loss": 2.5259, "step": 665 }, { "epoch": 0.045146726862302484, "grad_norm": 5.6848230892167235, "learning_rate": 4.5148247978436655e-07, "loss": 2.4935, "step": 670 }, { "epoch": 0.04548364273440922, "grad_norm": 5.394088025902671, "learning_rate": 4.548517520215633e-07, "loss": 2.5247, "step": 675 }, { "epoch": 0.04582055860651595, "grad_norm": 5.761513932361357, "learning_rate": 4.582210242587601e-07, "loss": 2.5192, "step": 680 }, { "epoch": 0.04615747447862269, "grad_norm": 5.746664184711672, "learning_rate": 4.6159029649595686e-07, "loss": 2.5242, "step": 685 }, { "epoch": 0.04649439035072942, "grad_norm": 5.677506493881676, "learning_rate": 4.649595687331536e-07, "loss": 2.4014, "step": 690 }, { "epoch": 0.046831306222836155, "grad_norm": 5.720958029879586, "learning_rate": 4.6832884097035034e-07, "loss": 2.4608, "step": 695 }, { "epoch": 0.04716822209494289, "grad_norm": 5.628512543522039, "learning_rate": 4.7169811320754717e-07, "loss": 2.4393, "step": 700 }, { "epoch": 0.04750513796704963, "grad_norm": 5.728232785553673, "learning_rate": 4.750673854447439e-07, "loss": 2.4629, "step": 705 }, { "epoch": 0.047842053839156365, "grad_norm": 5.735994119568244, "learning_rate": 4.784366576819407e-07, "loss": 2.516, "step": 710 }, { "epoch": 0.0481789697112631, "grad_norm": 6.195264584703951, "learning_rate": 4.818059299191375e-07, "loss": 2.4865, "step": 715 }, { "epoch": 0.04851588558336983, "grad_norm": 5.122733484056987, "learning_rate": 4.851752021563342e-07, "loss": 2.4673, "step": 720 }, { 
"epoch": 0.04885280145547657, "grad_norm": 5.749025169526881, "learning_rate": 4.88544474393531e-07, "loss": 2.4296, "step": 725 }, { "epoch": 0.0491897173275833, "grad_norm": 5.857552748045718, "learning_rate": 4.919137466307277e-07, "loss": 2.4929, "step": 730 }, { "epoch": 0.049526633199690036, "grad_norm": 5.9816675978373155, "learning_rate": 4.952830188679246e-07, "loss": 2.468, "step": 735 }, { "epoch": 0.04986354907179677, "grad_norm": 6.084587792516672, "learning_rate": 4.986522911051213e-07, "loss": 2.3986, "step": 740 }, { "epoch": 0.050200464943903504, "grad_norm": 8.200704503077489, "learning_rate": 5.020215633423181e-07, "loss": 2.48, "step": 745 }, { "epoch": 0.050537380816010245, "grad_norm": 5.930861536793313, "learning_rate": 5.053908355795148e-07, "loss": 2.4126, "step": 750 }, { "epoch": 0.05087429668811698, "grad_norm": 4.77351843752251, "learning_rate": 5.087601078167115e-07, "loss": 2.482, "step": 755 }, { "epoch": 0.051211212560223714, "grad_norm": 5.773748701722189, "learning_rate": 5.121293800539083e-07, "loss": 2.4902, "step": 760 }, { "epoch": 0.05154812843233045, "grad_norm": 5.546155091741304, "learning_rate": 5.154986522911052e-07, "loss": 2.4891, "step": 765 }, { "epoch": 0.05188504430443718, "grad_norm": 5.592894006670849, "learning_rate": 5.188679245283019e-07, "loss": 2.4878, "step": 770 }, { "epoch": 0.052221960176543916, "grad_norm": 5.37651010126567, "learning_rate": 5.222371967654986e-07, "loss": 2.4494, "step": 775 }, { "epoch": 0.05255887604865065, "grad_norm": 5.18793548045409, "learning_rate": 5.256064690026953e-07, "loss": 2.4185, "step": 780 }, { "epoch": 0.052895791920757385, "grad_norm": 5.516214775106443, "learning_rate": 5.289757412398921e-07, "loss": 2.4924, "step": 785 }, { "epoch": 0.05323270779286412, "grad_norm": 6.139860345716243, "learning_rate": 5.32345013477089e-07, "loss": 2.4489, "step": 790 }, { "epoch": 0.05356962366497086, "grad_norm": 5.571597754213643, "learning_rate": 5.357142857142857e-07, "loss": 
2.4758, "step": 795 }, { "epoch": 0.053906539537077594, "grad_norm": 4.968543208892651, "learning_rate": 5.390835579514824e-07, "loss": 2.3926, "step": 800 }, { "epoch": 0.05424345540918433, "grad_norm": 5.889491071511411, "learning_rate": 5.424528301886792e-07, "loss": 2.4229, "step": 805 }, { "epoch": 0.05458037128129106, "grad_norm": 5.297319785495927, "learning_rate": 5.45822102425876e-07, "loss": 2.4784, "step": 810 }, { "epoch": 0.0549172871533978, "grad_norm": 5.911063226806709, "learning_rate": 5.491913746630728e-07, "loss": 2.3445, "step": 815 }, { "epoch": 0.05525420302550453, "grad_norm": 5.171625288451802, "learning_rate": 5.525606469002695e-07, "loss": 2.4673, "step": 820 }, { "epoch": 0.055591118897611265, "grad_norm": 5.496967383696862, "learning_rate": 5.559299191374662e-07, "loss": 2.4515, "step": 825 }, { "epoch": 0.055928034769718, "grad_norm": 5.232117153242772, "learning_rate": 5.592991913746631e-07, "loss": 2.4551, "step": 830 }, { "epoch": 0.05626495064182473, "grad_norm": 5.851931362374646, "learning_rate": 5.626684636118598e-07, "loss": 2.4435, "step": 835 }, { "epoch": 0.056601866513931474, "grad_norm": 5.28691714828373, "learning_rate": 5.660377358490566e-07, "loss": 2.4127, "step": 840 }, { "epoch": 0.05693878238603821, "grad_norm": 5.592185758650392, "learning_rate": 5.694070080862533e-07, "loss": 2.4458, "step": 845 }, { "epoch": 0.05727569825814494, "grad_norm": 5.173648011263529, "learning_rate": 5.727762803234502e-07, "loss": 2.4488, "step": 850 }, { "epoch": 0.05761261413025168, "grad_norm": 5.070907179491476, "learning_rate": 5.761455525606469e-07, "loss": 2.4498, "step": 855 }, { "epoch": 0.05794953000235841, "grad_norm": 4.882318147781867, "learning_rate": 5.795148247978436e-07, "loss": 2.4899, "step": 860 }, { "epoch": 0.058286445874465145, "grad_norm": 5.64968707969574, "learning_rate": 5.828840970350404e-07, "loss": 2.3465, "step": 865 }, { "epoch": 0.05862336174657188, "grad_norm": 5.436980902840368, "learning_rate": 
5.862533692722371e-07, "loss": 2.397, "step": 870 }, { "epoch": 0.058960277618678614, "grad_norm": 7.037237292755224, "learning_rate": 5.89622641509434e-07, "loss": 2.4173, "step": 875 }, { "epoch": 0.05929719349078535, "grad_norm": 6.183607834518758, "learning_rate": 5.929919137466307e-07, "loss": 2.416, "step": 880 }, { "epoch": 0.05963410936289209, "grad_norm": 6.160948777267354, "learning_rate": 5.963611859838274e-07, "loss": 2.4003, "step": 885 }, { "epoch": 0.05997102523499882, "grad_norm": 5.909181594387164, "learning_rate": 5.997304582210241e-07, "loss": 2.3978, "step": 890 }, { "epoch": 0.06030794110710556, "grad_norm": 6.0201341360095135, "learning_rate": 6.030997304582211e-07, "loss": 2.3784, "step": 895 }, { "epoch": 0.06064485697921229, "grad_norm": 5.470242218746942, "learning_rate": 6.064690026954178e-07, "loss": 2.3839, "step": 900 }, { "epoch": 0.060981772851319026, "grad_norm": 5.23172061063481, "learning_rate": 6.098382749326145e-07, "loss": 2.3826, "step": 905 }, { "epoch": 0.06131868872342576, "grad_norm": 5.359173867184083, "learning_rate": 6.132075471698112e-07, "loss": 2.3952, "step": 910 }, { "epoch": 0.061655604595532494, "grad_norm": 6.2647666935998805, "learning_rate": 6.16576819407008e-07, "loss": 2.3932, "step": 915 }, { "epoch": 0.06199252046763923, "grad_norm": 5.48390368442342, "learning_rate": 6.199460916442049e-07, "loss": 2.35, "step": 920 }, { "epoch": 0.06232943633974596, "grad_norm": 5.072812670696905, "learning_rate": 6.233153638814016e-07, "loss": 2.3742, "step": 925 }, { "epoch": 0.0626663522118527, "grad_norm": 5.884199825703362, "learning_rate": 6.266846361185983e-07, "loss": 2.3499, "step": 930 }, { "epoch": 0.06300326808395944, "grad_norm": 5.47507422585664, "learning_rate": 6.300539083557951e-07, "loss": 2.4076, "step": 935 }, { "epoch": 0.06334018395606617, "grad_norm": 5.867309635122474, "learning_rate": 6.33423180592992e-07, "loss": 2.353, "step": 940 }, { "epoch": 0.0636770998281729, "grad_norm": 5.377663636302892, 
"learning_rate": 6.367924528301887e-07, "loss": 2.3457, "step": 945 }, { "epoch": 0.06401401570027963, "grad_norm": 5.291240028267864, "learning_rate": 6.401617250673854e-07, "loss": 2.3281, "step": 950 }, { "epoch": 0.06435093157238637, "grad_norm": 5.512821647400398, "learning_rate": 6.435309973045822e-07, "loss": 2.4292, "step": 955 }, { "epoch": 0.06468784744449312, "grad_norm": 5.58884237081347, "learning_rate": 6.469002695417789e-07, "loss": 2.4024, "step": 960 }, { "epoch": 0.06502476331659984, "grad_norm": 5.666461750700177, "learning_rate": 6.502695417789757e-07, "loss": 2.3795, "step": 965 }, { "epoch": 0.06536167918870658, "grad_norm": 5.718177679371221, "learning_rate": 6.536388140161725e-07, "loss": 2.3363, "step": 970 }, { "epoch": 0.06569859506081331, "grad_norm": 5.799868988255173, "learning_rate": 6.570080862533693e-07, "loss": 2.3286, "step": 975 }, { "epoch": 0.06603551093292005, "grad_norm": 5.081556420471617, "learning_rate": 6.60377358490566e-07, "loss": 2.3037, "step": 980 }, { "epoch": 0.06637242680502678, "grad_norm": 5.23140361743725, "learning_rate": 6.637466307277628e-07, "loss": 2.3561, "step": 985 }, { "epoch": 0.06670934267713352, "grad_norm": 5.231708519743976, "learning_rate": 6.671159029649595e-07, "loss": 2.3173, "step": 990 }, { "epoch": 0.06704625854924025, "grad_norm": 5.469312158626827, "learning_rate": 6.704851752021563e-07, "loss": 2.3727, "step": 995 }, { "epoch": 0.06738317442134699, "grad_norm": 5.654310592233998, "learning_rate": 6.738544474393531e-07, "loss": 2.4116, "step": 1000 }, { "epoch": 0.06772009029345373, "grad_norm": 5.52805335523217, "learning_rate": 6.772237196765498e-07, "loss": 2.3661, "step": 1005 }, { "epoch": 0.06805700616556046, "grad_norm": 5.938688465434806, "learning_rate": 6.805929919137466e-07, "loss": 2.3452, "step": 1010 }, { "epoch": 0.0683939220376672, "grad_norm": 5.468447009116936, "learning_rate": 6.839622641509433e-07, "loss": 2.406, "step": 1015 }, { "epoch": 0.06873083790977393, 
"grad_norm": 5.189109970472544, "learning_rate": 6.873315363881402e-07, "loss": 2.3378, "step": 1020 }, { "epoch": 0.06906775378188067, "grad_norm": 5.298179523285472, "learning_rate": 6.907008086253369e-07, "loss": 2.3428, "step": 1025 }, { "epoch": 0.0694046696539874, "grad_norm": 4.934600929501676, "learning_rate": 6.940700808625337e-07, "loss": 2.2955, "step": 1030 }, { "epoch": 0.06974158552609414, "grad_norm": 5.647226938850427, "learning_rate": 6.974393530997304e-07, "loss": 2.4213, "step": 1035 }, { "epoch": 0.07007850139820086, "grad_norm": 4.910730888958498, "learning_rate": 7.008086253369272e-07, "loss": 2.3861, "step": 1040 }, { "epoch": 0.0704154172703076, "grad_norm": 6.054973266444726, "learning_rate": 7.04177897574124e-07, "loss": 2.3553, "step": 1045 }, { "epoch": 0.07075233314241434, "grad_norm": 5.9669546773816835, "learning_rate": 7.075471698113207e-07, "loss": 2.414, "step": 1050 }, { "epoch": 0.07108924901452107, "grad_norm": 5.228961938426561, "learning_rate": 7.109164420485175e-07, "loss": 2.3797, "step": 1055 }, { "epoch": 0.07142616488662781, "grad_norm": 5.520219448753314, "learning_rate": 7.142857142857143e-07, "loss": 2.3484, "step": 1060 }, { "epoch": 0.07176308075873454, "grad_norm": 5.033677878804619, "learning_rate": 7.17654986522911e-07, "loss": 2.3133, "step": 1065 }, { "epoch": 0.07209999663084128, "grad_norm": 5.796358114001727, "learning_rate": 7.210242587601077e-07, "loss": 2.3843, "step": 1070 }, { "epoch": 0.07243691250294801, "grad_norm": 6.014930841743614, "learning_rate": 7.243935309973046e-07, "loss": 2.3628, "step": 1075 }, { "epoch": 0.07277382837505475, "grad_norm": 6.020173900304168, "learning_rate": 7.277628032345014e-07, "loss": 2.337, "step": 1080 }, { "epoch": 0.07311074424716148, "grad_norm": 5.122262989524243, "learning_rate": 7.311320754716981e-07, "loss": 2.3764, "step": 1085 }, { "epoch": 0.07344766011926822, "grad_norm": 5.423510178714847, "learning_rate": 7.345013477088948e-07, "loss": 2.396, "step": 1090 
}, { "epoch": 0.07378457599137496, "grad_norm": 5.5535794804168415, "learning_rate": 7.378706199460915e-07, "loss": 2.3627, "step": 1095 }, { "epoch": 0.07412149186348169, "grad_norm": 5.032473278282681, "learning_rate": 7.412398921832884e-07, "loss": 2.3315, "step": 1100 }, { "epoch": 0.07445840773558843, "grad_norm": 5.5231814568939255, "learning_rate": 7.446091644204852e-07, "loss": 2.3625, "step": 1105 }, { "epoch": 0.07479532360769515, "grad_norm": 5.481102514600328, "learning_rate": 7.479784366576819e-07, "loss": 2.3583, "step": 1110 }, { "epoch": 0.0751322394798019, "grad_norm": 5.53287414062346, "learning_rate": 7.513477088948786e-07, "loss": 2.343, "step": 1115 }, { "epoch": 0.07546915535190862, "grad_norm": 5.542717158935075, "learning_rate": 7.547169811320754e-07, "loss": 2.299, "step": 1120 }, { "epoch": 0.07580607122401536, "grad_norm": 5.162377150488702, "learning_rate": 7.580862533692723e-07, "loss": 2.3424, "step": 1125 }, { "epoch": 0.07614298709612209, "grad_norm": 4.9250511914700885, "learning_rate": 7.61455525606469e-07, "loss": 2.3509, "step": 1130 }, { "epoch": 0.07647990296822883, "grad_norm": 5.553124032837641, "learning_rate": 7.648247978436657e-07, "loss": 2.322, "step": 1135 }, { "epoch": 0.07681681884033557, "grad_norm": 5.122081298847209, "learning_rate": 7.681940700808624e-07, "loss": 2.3689, "step": 1140 }, { "epoch": 0.0771537347124423, "grad_norm": 5.493851335797319, "learning_rate": 7.715633423180593e-07, "loss": 2.3292, "step": 1145 }, { "epoch": 0.07749065058454904, "grad_norm": 5.63156885816444, "learning_rate": 7.749326145552561e-07, "loss": 2.3551, "step": 1150 }, { "epoch": 0.07782756645665577, "grad_norm": 5.653185808475984, "learning_rate": 7.783018867924528e-07, "loss": 2.3102, "step": 1155 }, { "epoch": 0.07816448232876251, "grad_norm": 5.184710260892954, "learning_rate": 7.816711590296495e-07, "loss": 2.3167, "step": 1160 }, { "epoch": 0.07850139820086924, "grad_norm": 5.212072743750497, "learning_rate": 
7.850404312668463e-07, "loss": 2.4215, "step": 1165 }, { "epoch": 0.07883831407297598, "grad_norm": 5.991034994735581, "learning_rate": 7.884097035040431e-07, "loss": 2.3216, "step": 1170 }, { "epoch": 0.0791752299450827, "grad_norm": 5.051261330173261, "learning_rate": 7.917789757412399e-07, "loss": 2.3371, "step": 1175 }, { "epoch": 0.07951214581718945, "grad_norm": 5.618819635222086, "learning_rate": 7.951482479784366e-07, "loss": 2.3155, "step": 1180 }, { "epoch": 0.07984906168929619, "grad_norm": 4.784294272372772, "learning_rate": 7.985175202156334e-07, "loss": 2.3259, "step": 1185 }, { "epoch": 0.08018597756140292, "grad_norm": 5.719443428336091, "learning_rate": 8.018867924528302e-07, "loss": 2.3853, "step": 1190 }, { "epoch": 0.08052289343350966, "grad_norm": 5.489314975428119, "learning_rate": 8.052560646900269e-07, "loss": 2.3108, "step": 1195 }, { "epoch": 0.08085980930561638, "grad_norm": 5.328982448564493, "learning_rate": 8.086253369272237e-07, "loss": 2.2949, "step": 1200 }, { "epoch": 0.08119672517772313, "grad_norm": 4.917331896518827, "learning_rate": 8.119946091644204e-07, "loss": 2.3245, "step": 1205 }, { "epoch": 0.08153364104982985, "grad_norm": 5.911902633058009, "learning_rate": 8.153638814016172e-07, "loss": 2.3704, "step": 1210 }, { "epoch": 0.0818705569219366, "grad_norm": 5.058880544013148, "learning_rate": 8.18733153638814e-07, "loss": 2.3212, "step": 1215 }, { "epoch": 0.08220747279404332, "grad_norm": 5.367071108118339, "learning_rate": 8.221024258760107e-07, "loss": 2.2767, "step": 1220 }, { "epoch": 0.08254438866615006, "grad_norm": 5.4247578494161175, "learning_rate": 8.254716981132074e-07, "loss": 2.3163, "step": 1225 }, { "epoch": 0.0828813045382568, "grad_norm": 5.559687895142555, "learning_rate": 8.288409703504043e-07, "loss": 2.3745, "step": 1230 }, { "epoch": 0.08321822041036353, "grad_norm": 4.959673341909213, "learning_rate": 8.322102425876011e-07, "loss": 2.2812, "step": 1235 }, { "epoch": 0.08355513628247027, 
"grad_norm": 5.352060785926629, "learning_rate": 8.355795148247978e-07, "loss": 2.356, "step": 1240 }, { "epoch": 0.083892052154577, "grad_norm": 4.570218008797161, "learning_rate": 8.389487870619945e-07, "loss": 2.3052, "step": 1245 }, { "epoch": 0.08422896802668374, "grad_norm": 5.005816983757205, "learning_rate": 8.423180592991913e-07, "loss": 2.2509, "step": 1250 }, { "epoch": 0.08456588389879047, "grad_norm": 5.654928643174052, "learning_rate": 8.456873315363881e-07, "loss": 2.3115, "step": 1255 }, { "epoch": 0.08490279977089721, "grad_norm": 5.497575181984884, "learning_rate": 8.490566037735849e-07, "loss": 2.2794, "step": 1260 }, { "epoch": 0.08523971564300394, "grad_norm": 5.060290835149172, "learning_rate": 8.524258760107816e-07, "loss": 2.2566, "step": 1265 }, { "epoch": 0.08557663151511068, "grad_norm": 5.962928307251927, "learning_rate": 8.557951482479784e-07, "loss": 2.3715, "step": 1270 }, { "epoch": 0.08591354738721742, "grad_norm": 5.15766427243784, "learning_rate": 8.591644204851751e-07, "loss": 2.3199, "step": 1275 }, { "epoch": 0.08625046325932414, "grad_norm": 5.923492758827091, "learning_rate": 8.62533692722372e-07, "loss": 2.3205, "step": 1280 }, { "epoch": 0.08658737913143089, "grad_norm": 4.964711939346402, "learning_rate": 8.659029649595687e-07, "loss": 2.3296, "step": 1285 }, { "epoch": 0.08692429500353761, "grad_norm": 5.407815018536225, "learning_rate": 8.692722371967655e-07, "loss": 2.3665, "step": 1290 }, { "epoch": 0.08726121087564435, "grad_norm": 5.686815466588649, "learning_rate": 8.726415094339622e-07, "loss": 2.3387, "step": 1295 }, { "epoch": 0.08759812674775108, "grad_norm": 5.85158229128449, "learning_rate": 8.760107816711589e-07, "loss": 2.3631, "step": 1300 }, { "epoch": 0.08793504261985782, "grad_norm": 5.314471533727982, "learning_rate": 8.793800539083558e-07, "loss": 2.2723, "step": 1305 }, { "epoch": 0.08827195849196455, "grad_norm": 5.257085466973527, "learning_rate": 8.827493261455526e-07, "loss": 2.3229, "step": 1310 
}, { "epoch": 0.08860887436407129, "grad_norm": 5.534119045003536, "learning_rate": 8.861185983827493e-07, "loss": 2.3057, "step": 1315 }, { "epoch": 0.08894579023617803, "grad_norm": 5.253023851258909, "learning_rate": 8.89487870619946e-07, "loss": 2.2552, "step": 1320 }, { "epoch": 0.08928270610828476, "grad_norm": 5.563609123466801, "learning_rate": 8.928571428571428e-07, "loss": 2.3791, "step": 1325 }, { "epoch": 0.0896196219803915, "grad_norm": 6.097111013778299, "learning_rate": 8.962264150943396e-07, "loss": 2.295, "step": 1330 }, { "epoch": 0.08995653785249823, "grad_norm": 5.495966931603338, "learning_rate": 8.995956873315364e-07, "loss": 2.2954, "step": 1335 }, { "epoch": 0.09029345372460497, "grad_norm": 5.369203842507154, "learning_rate": 9.029649595687331e-07, "loss": 2.2627, "step": 1340 }, { "epoch": 0.0906303695967117, "grad_norm": 5.145454797168509, "learning_rate": 9.063342318059298e-07, "loss": 2.2575, "step": 1345 }, { "epoch": 0.09096728546881844, "grad_norm": 5.484415935386677, "learning_rate": 9.097035040431266e-07, "loss": 2.2756, "step": 1350 }, { "epoch": 0.09130420134092516, "grad_norm": 5.57145391385215, "learning_rate": 9.130727762803235e-07, "loss": 2.2815, "step": 1355 }, { "epoch": 0.0916411172130319, "grad_norm": 4.834970501217708, "learning_rate": 9.164420485175202e-07, "loss": 2.3477, "step": 1360 }, { "epoch": 0.09197803308513865, "grad_norm": 4.92970993684747, "learning_rate": 9.198113207547169e-07, "loss": 2.2843, "step": 1365 }, { "epoch": 0.09231494895724537, "grad_norm": 4.976353067116663, "learning_rate": 9.231805929919137e-07, "loss": 2.2932, "step": 1370 }, { "epoch": 0.09265186482935212, "grad_norm": 5.702265404789428, "learning_rate": 9.265498652291105e-07, "loss": 2.3287, "step": 1375 }, { "epoch": 0.09298878070145884, "grad_norm": 5.240919366146442, "learning_rate": 9.299191374663073e-07, "loss": 2.3241, "step": 1380 }, { "epoch": 0.09332569657356558, "grad_norm": 5.35696005709587, "learning_rate": 
9.33288409703504e-07, "loss": 2.3133, "step": 1385 }, { "epoch": 0.09366261244567231, "grad_norm": 5.11405183011783, "learning_rate": 9.366576819407007e-07, "loss": 2.2782, "step": 1390 }, { "epoch": 0.09399952831777905, "grad_norm": 4.953422172657249, "learning_rate": 9.400269541778976e-07, "loss": 2.2203, "step": 1395 }, { "epoch": 0.09433644418988578, "grad_norm": 5.365624438786201, "learning_rate": 9.433962264150943e-07, "loss": 2.2975, "step": 1400 }, { "epoch": 0.09467336006199252, "grad_norm": 6.411689813513092, "learning_rate": 9.46765498652291e-07, "loss": 2.3545, "step": 1405 }, { "epoch": 0.09501027593409926, "grad_norm": 4.9071304078003095, "learning_rate": 9.501347708894878e-07, "loss": 2.225, "step": 1410 }, { "epoch": 0.09534719180620599, "grad_norm": 4.9757667611238725, "learning_rate": 9.535040431266847e-07, "loss": 2.2978, "step": 1415 }, { "epoch": 0.09568410767831273, "grad_norm": 4.947628316634043, "learning_rate": 9.568733153638813e-07, "loss": 2.23, "step": 1420 }, { "epoch": 0.09602102355041946, "grad_norm": 5.623637901512381, "learning_rate": 9.60242587601078e-07, "loss": 2.2509, "step": 1425 }, { "epoch": 0.0963579394225262, "grad_norm": 5.614072639541004, "learning_rate": 9.63611859838275e-07, "loss": 2.325, "step": 1430 }, { "epoch": 0.09669485529463293, "grad_norm": 4.851415710223076, "learning_rate": 9.669811320754717e-07, "loss": 2.2784, "step": 1435 }, { "epoch": 0.09703177116673967, "grad_norm": 5.295674062187429, "learning_rate": 9.703504043126684e-07, "loss": 2.2341, "step": 1440 }, { "epoch": 0.0973686870388464, "grad_norm": 4.879671578015371, "learning_rate": 9.73719676549865e-07, "loss": 2.28, "step": 1445 }, { "epoch": 0.09770560291095313, "grad_norm": 4.764985225442166, "learning_rate": 9.77088948787062e-07, "loss": 2.2639, "step": 1450 }, { "epoch": 0.09804251878305988, "grad_norm": 5.628645775962037, "learning_rate": 9.804582210242587e-07, "loss": 2.2632, "step": 1455 }, { "epoch": 0.0983794346551666, "grad_norm": 
5.55862535832527, "learning_rate": 9.838274932614555e-07, "loss": 2.312, "step": 1460 }, { "epoch": 0.09871635052727334, "grad_norm": 6.04806274014557, "learning_rate": 9.871967654986522e-07, "loss": 2.3306, "step": 1465 }, { "epoch": 0.09905326639938007, "grad_norm": 5.8754744504535354, "learning_rate": 9.90566037735849e-07, "loss": 2.2348, "step": 1470 }, { "epoch": 0.09939018227148681, "grad_norm": 5.496298900922482, "learning_rate": 9.939353099730458e-07, "loss": 2.2255, "step": 1475 }, { "epoch": 0.09972709814359354, "grad_norm": 5.6564139484424105, "learning_rate": 9.973045822102425e-07, "loss": 2.2822, "step": 1480 }, { "epoch": 0.10006401401570028, "grad_norm": 5.226429480087397, "learning_rate": 9.999999861679377e-07, "loss": 2.1814, "step": 1485 }, { "epoch": 0.10040092988780701, "grad_norm": 5.992437659892539, "learning_rate": 9.999995020458434e-07, "loss": 2.2286, "step": 1490 }, { "epoch": 0.10073784575991375, "grad_norm": 5.365750328736673, "learning_rate": 9.99998326321407e-07, "loss": 2.2647, "step": 1495 }, { "epoch": 0.10107476163202049, "grad_norm": 5.459092735341802, "learning_rate": 9.999964589962556e-07, "loss": 2.2166, "step": 1500 }, { "epoch": 0.10141167750412722, "grad_norm": 5.345652478580551, "learning_rate": 9.999939000729715e-07, "loss": 2.2861, "step": 1505 }, { "epoch": 0.10174859337623396, "grad_norm": 4.793432249995596, "learning_rate": 9.999906495550946e-07, "loss": 2.2699, "step": 1510 }, { "epoch": 0.10208550924834069, "grad_norm": 5.241961293614402, "learning_rate": 9.999867074471207e-07, "loss": 2.2903, "step": 1515 }, { "epoch": 0.10242242512044743, "grad_norm": 4.9459044917442085, "learning_rate": 9.99982073754503e-07, "loss": 2.2155, "step": 1520 }, { "epoch": 0.10275934099255415, "grad_norm": 4.925926415763532, "learning_rate": 9.999767484836502e-07, "loss": 2.2296, "step": 1525 }, { "epoch": 0.1030962568646609, "grad_norm": 5.460491133029361, "learning_rate": 9.999707316419288e-07, "loss": 2.191, "step": 1530 }, { 
"epoch": 0.10343317273676762, "grad_norm": 5.3952960360720885, "learning_rate": 9.99964023237661e-07, "loss": 2.2069, "step": 1535 }, { "epoch": 0.10377008860887436, "grad_norm": 5.765108646086805, "learning_rate": 9.999566232801261e-07, "loss": 2.2416, "step": 1540 }, { "epoch": 0.1041070044809811, "grad_norm": 5.3041772631463875, "learning_rate": 9.9994853177956e-07, "loss": 2.2237, "step": 1545 }, { "epoch": 0.10444392035308783, "grad_norm": 5.284700158020605, "learning_rate": 9.999397487471543e-07, "loss": 2.3358, "step": 1550 }, { "epoch": 0.10478083622519457, "grad_norm": 5.003595566674297, "learning_rate": 9.999302741950582e-07, "loss": 2.254, "step": 1555 }, { "epoch": 0.1051177520973013, "grad_norm": 5.430882656555759, "learning_rate": 9.999201081363768e-07, "loss": 2.2776, "step": 1560 }, { "epoch": 0.10545466796940804, "grad_norm": 5.205554328157601, "learning_rate": 9.99909250585172e-07, "loss": 2.267, "step": 1565 }, { "epoch": 0.10579158384151477, "grad_norm": 5.256459065656869, "learning_rate": 9.998977015564617e-07, "loss": 2.3346, "step": 1570 }, { "epoch": 0.10612849971362151, "grad_norm": 4.993800620056396, "learning_rate": 9.998854610662209e-07, "loss": 2.2692, "step": 1575 }, { "epoch": 0.10646541558572824, "grad_norm": 5.27971058498652, "learning_rate": 9.998725291313805e-07, "loss": 2.2681, "step": 1580 }, { "epoch": 0.10680233145783498, "grad_norm": 6.572312883447045, "learning_rate": 9.998589057698283e-07, "loss": 2.2757, "step": 1585 }, { "epoch": 0.10713924732994172, "grad_norm": 5.370264186645819, "learning_rate": 9.99844591000408e-07, "loss": 2.1916, "step": 1590 }, { "epoch": 0.10747616320204845, "grad_norm": 5.158509820361638, "learning_rate": 9.9982958484292e-07, "loss": 2.2494, "step": 1595 }, { "epoch": 0.10781307907415519, "grad_norm": 5.2908631003671465, "learning_rate": 9.99813887318121e-07, "loss": 2.2107, "step": 1600 }, { "epoch": 0.10814999494626192, "grad_norm": 5.536590898565209, "learning_rate": 9.997974984477236e-07, 
"loss": 2.2527, "step": 1605 }, { "epoch": 0.10848691081836866, "grad_norm": 5.127288849803949, "learning_rate": 9.99780418254397e-07, "loss": 2.23, "step": 1610 }, { "epoch": 0.10882382669047538, "grad_norm": 5.5706382756963695, "learning_rate": 9.99762646761767e-07, "loss": 2.2976, "step": 1615 }, { "epoch": 0.10916074256258212, "grad_norm": 5.566294574773447, "learning_rate": 9.99744183994415e-07, "loss": 2.2302, "step": 1620 }, { "epoch": 0.10949765843468885, "grad_norm": 5.522322004716449, "learning_rate": 9.997250299778788e-07, "loss": 2.2567, "step": 1625 }, { "epoch": 0.1098345743067956, "grad_norm": 5.273889674757659, "learning_rate": 9.997051847386524e-07, "loss": 2.2642, "step": 1630 }, { "epoch": 0.11017149017890233, "grad_norm": 4.9956368552559525, "learning_rate": 9.996846483041858e-07, "loss": 2.3009, "step": 1635 }, { "epoch": 0.11050840605100906, "grad_norm": 5.160502618517943, "learning_rate": 9.99663420702885e-07, "loss": 2.2887, "step": 1640 }, { "epoch": 0.1108453219231158, "grad_norm": 5.140576775775958, "learning_rate": 9.996415019641124e-07, "loss": 2.2226, "step": 1645 }, { "epoch": 0.11118223779522253, "grad_norm": 5.336688844125499, "learning_rate": 9.996188921181861e-07, "loss": 2.2358, "step": 1650 }, { "epoch": 0.11151915366732927, "grad_norm": 5.308794397759707, "learning_rate": 9.9959559119638e-07, "loss": 2.2235, "step": 1655 }, { "epoch": 0.111856069539436, "grad_norm": 5.313119221358314, "learning_rate": 9.995715992309244e-07, "loss": 2.269, "step": 1660 }, { "epoch": 0.11219298541154274, "grad_norm": 5.142352758470238, "learning_rate": 9.995469162550048e-07, "loss": 2.1886, "step": 1665 }, { "epoch": 0.11252990128364947, "grad_norm": 5.560439711483778, "learning_rate": 9.99521542302763e-07, "loss": 2.2267, "step": 1670 }, { "epoch": 0.11286681715575621, "grad_norm": 5.213381062574422, "learning_rate": 9.994954774092962e-07, "loss": 2.2033, "step": 1675 }, { "epoch": 0.11320373302786295, "grad_norm": 5.543750162605874, 
"learning_rate": 9.994687216106579e-07, "loss": 2.2252, "step": 1680 }, { "epoch": 0.11354064889996968, "grad_norm": 5.270722128196081, "learning_rate": 9.994412749438564e-07, "loss": 2.1856, "step": 1685 }, { "epoch": 0.11387756477207642, "grad_norm": 5.444326537665937, "learning_rate": 9.994131374468565e-07, "loss": 2.202, "step": 1690 }, { "epoch": 0.11421448064418314, "grad_norm": 4.626050497961065, "learning_rate": 9.993843091585782e-07, "loss": 2.2284, "step": 1695 }, { "epoch": 0.11455139651628989, "grad_norm": 5.139675722466006, "learning_rate": 9.993547901188966e-07, "loss": 2.2293, "step": 1700 }, { "epoch": 0.11488831238839661, "grad_norm": 5.3330419750427716, "learning_rate": 9.993245803686426e-07, "loss": 2.1804, "step": 1705 }, { "epoch": 0.11522522826050335, "grad_norm": 5.117269797365302, "learning_rate": 9.992936799496029e-07, "loss": 2.2452, "step": 1710 }, { "epoch": 0.11556214413261008, "grad_norm": 4.965923339689533, "learning_rate": 9.99262088904519e-07, "loss": 2.1931, "step": 1715 }, { "epoch": 0.11589906000471682, "grad_norm": 5.120530513235642, "learning_rate": 9.992298072770877e-07, "loss": 2.2935, "step": 1720 }, { "epoch": 0.11623597587682356, "grad_norm": 5.297930148611365, "learning_rate": 9.991968351119612e-07, "loss": 2.2282, "step": 1725 }, { "epoch": 0.11657289174893029, "grad_norm": 5.376939117757635, "learning_rate": 9.991631724547467e-07, "loss": 2.2096, "step": 1730 }, { "epoch": 0.11690980762103703, "grad_norm": 5.335121414466639, "learning_rate": 9.99128819352007e-07, "loss": 2.2743, "step": 1735 }, { "epoch": 0.11724672349314376, "grad_norm": 5.055058916128202, "learning_rate": 9.99093775851259e-07, "loss": 2.2688, "step": 1740 }, { "epoch": 0.1175836393652505, "grad_norm": 4.887416913933956, "learning_rate": 9.990580420009755e-07, "loss": 2.2654, "step": 1745 }, { "epoch": 0.11792055523735723, "grad_norm": 5.22589265111978, "learning_rate": 9.990216178505835e-07, "loss": 2.2286, "step": 1750 }, { "epoch": 
0.11825747110946397, "grad_norm": 5.067394790478713, "learning_rate": 9.989845034504651e-07, "loss": 2.1782, "step": 1755 }, { "epoch": 0.1185943869815707, "grad_norm": 4.631004988505539, "learning_rate": 9.989466988519572e-07, "loss": 2.1837, "step": 1760 }, { "epoch": 0.11893130285367744, "grad_norm": 5.387326199065619, "learning_rate": 9.989082041073517e-07, "loss": 2.1946, "step": 1765 }, { "epoch": 0.11926821872578418, "grad_norm": 5.335850563090953, "learning_rate": 9.988690192698944e-07, "loss": 2.2378, "step": 1770 }, { "epoch": 0.1196051345978909, "grad_norm": 5.688694137966256, "learning_rate": 9.988291443937857e-07, "loss": 2.2472, "step": 1775 }, { "epoch": 0.11994205046999765, "grad_norm": 4.812493440284149, "learning_rate": 9.987885795341816e-07, "loss": 2.2223, "step": 1780 }, { "epoch": 0.12027896634210437, "grad_norm": 5.18039465338467, "learning_rate": 9.987473247471908e-07, "loss": 2.2104, "step": 1785 }, { "epoch": 0.12061588221421111, "grad_norm": 6.1601190788418165, "learning_rate": 9.98705380089878e-07, "loss": 2.222, "step": 1790 }, { "epoch": 0.12095279808631784, "grad_norm": 5.233257208423195, "learning_rate": 9.986627456202608e-07, "loss": 2.2574, "step": 1795 }, { "epoch": 0.12128971395842458, "grad_norm": 5.682809918565229, "learning_rate": 9.986194213973113e-07, "loss": 2.1955, "step": 1800 }, { "epoch": 0.12162662983053131, "grad_norm": 5.233738554962388, "learning_rate": 9.985754074809562e-07, "loss": 2.174, "step": 1805 }, { "epoch": 0.12196354570263805, "grad_norm": 4.880973966429492, "learning_rate": 9.985307039320756e-07, "loss": 2.2035, "step": 1810 }, { "epoch": 0.12230046157474479, "grad_norm": 4.879874668800794, "learning_rate": 9.98485310812504e-07, "loss": 2.246, "step": 1815 }, { "epoch": 0.12263737744685152, "grad_norm": 4.778756440591823, "learning_rate": 9.98439228185029e-07, "loss": 2.2435, "step": 1820 }, { "epoch": 0.12297429331895826, "grad_norm": 5.501692530970563, "learning_rate": 9.983924561133927e-07, "loss": 
2.1471, "step": 1825 }, { "epoch": 0.12331120919106499, "grad_norm": 5.585399174072263, "learning_rate": 9.983449946622906e-07, "loss": 2.1705, "step": 1830 }, { "epoch": 0.12364812506317173, "grad_norm": 5.06147058476444, "learning_rate": 9.982968438973714e-07, "loss": 2.232, "step": 1835 }, { "epoch": 0.12398504093527846, "grad_norm": 5.5015660236137345, "learning_rate": 9.982480038852375e-07, "loss": 2.2608, "step": 1840 }, { "epoch": 0.1243219568073852, "grad_norm": 5.498854516967151, "learning_rate": 9.98198474693445e-07, "loss": 2.2224, "step": 1845 }, { "epoch": 0.12465887267949192, "grad_norm": 5.375475320262889, "learning_rate": 9.981482563905025e-07, "loss": 2.2473, "step": 1850 }, { "epoch": 0.12499578855159867, "grad_norm": 4.760335922018651, "learning_rate": 9.980973490458728e-07, "loss": 2.2199, "step": 1855 }, { "epoch": 0.1253327044237054, "grad_norm": 5.164132986376927, "learning_rate": 9.980457527299708e-07, "loss": 2.1883, "step": 1860 }, { "epoch": 0.12566962029581213, "grad_norm": 5.340824950757857, "learning_rate": 9.979934675141652e-07, "loss": 2.2155, "step": 1865 }, { "epoch": 0.12600653616791888, "grad_norm": 5.333336217870446, "learning_rate": 9.979404934707771e-07, "loss": 2.1598, "step": 1870 }, { "epoch": 0.12634345204002562, "grad_norm": 5.752820446715373, "learning_rate": 9.978868306730804e-07, "loss": 2.2068, "step": 1875 }, { "epoch": 0.12668036791213233, "grad_norm": 4.790432765904655, "learning_rate": 9.978324791953018e-07, "loss": 2.2054, "step": 1880 }, { "epoch": 0.12701728378423907, "grad_norm": 5.046862887189039, "learning_rate": 9.97777439112621e-07, "loss": 2.2084, "step": 1885 }, { "epoch": 0.1273541996563458, "grad_norm": 5.904664347370288, "learning_rate": 9.977217105011693e-07, "loss": 2.1831, "step": 1890 }, { "epoch": 0.12769111552845255, "grad_norm": 4.977056492782444, "learning_rate": 9.97665293438031e-07, "loss": 2.2289, "step": 1895 }, { "epoch": 0.12802803140055927, "grad_norm": 5.5052152942978925, 
"learning_rate": 9.976081880012426e-07, "loss": 2.1665, "step": 1900 }, { "epoch": 0.128364947272666, "grad_norm": 5.285274937223577, "learning_rate": 9.975503942697925e-07, "loss": 2.2303, "step": 1905 }, { "epoch": 0.12870186314477275, "grad_norm": 4.909260873550485, "learning_rate": 9.974919123236217e-07, "loss": 2.1992, "step": 1910 }, { "epoch": 0.1290387790168795, "grad_norm": 5.227863677861565, "learning_rate": 9.974327422436223e-07, "loss": 2.1787, "step": 1915 }, { "epoch": 0.12937569488898623, "grad_norm": 4.426067122374531, "learning_rate": 9.97372884111639e-07, "loss": 2.1867, "step": 1920 }, { "epoch": 0.12971261076109294, "grad_norm": 4.979042982595774, "learning_rate": 9.97312338010468e-07, "loss": 2.2334, "step": 1925 }, { "epoch": 0.13004952663319969, "grad_norm": 5.024650320186452, "learning_rate": 9.97251104023857e-07, "loss": 2.2522, "step": 1930 }, { "epoch": 0.13038644250530643, "grad_norm": 4.947325145701186, "learning_rate": 9.971891822365048e-07, "loss": 2.2028, "step": 1935 }, { "epoch": 0.13072335837741317, "grad_norm": 4.877772178538737, "learning_rate": 9.971265727340627e-07, "loss": 2.2008, "step": 1940 }, { "epoch": 0.13106027424951988, "grad_norm": 5.552058862782528, "learning_rate": 9.970632756031322e-07, "loss": 2.2609, "step": 1945 }, { "epoch": 0.13139719012162662, "grad_norm": 5.389515399037254, "learning_rate": 9.969992909312658e-07, "loss": 2.2236, "step": 1950 }, { "epoch": 0.13173410599373336, "grad_norm": 4.870213646259442, "learning_rate": 9.969346188069684e-07, "loss": 2.2305, "step": 1955 }, { "epoch": 0.1320710218658401, "grad_norm": 4.9428281819777355, "learning_rate": 9.968692593196943e-07, "loss": 2.1347, "step": 1960 }, { "epoch": 0.13240793773794685, "grad_norm": 5.016941228464764, "learning_rate": 9.968032125598493e-07, "loss": 2.2838, "step": 1965 }, { "epoch": 0.13274485361005356, "grad_norm": 4.714436698486221, "learning_rate": 9.967364786187894e-07, "loss": 2.1827, "step": 1970 }, { "epoch": 
0.1330817694821603, "grad_norm": 5.142182218756574, "learning_rate": 9.96669057588822e-07, "loss": 2.2037, "step": 1975 }, { "epoch": 0.13341868535426704, "grad_norm": 4.856863777228724, "learning_rate": 9.966009495632037e-07, "loss": 2.1872, "step": 1980 }, { "epoch": 0.13375560122637378, "grad_norm": 4.936085043347686, "learning_rate": 9.965321546361421e-07, "loss": 2.2228, "step": 1985 }, { "epoch": 0.1340925170984805, "grad_norm": 4.948527239585069, "learning_rate": 9.964626729027948e-07, "loss": 2.1776, "step": 1990 }, { "epoch": 0.13442943297058724, "grad_norm": 5.204720994078589, "learning_rate": 9.963925044592695e-07, "loss": 2.2245, "step": 1995 }, { "epoch": 0.13476634884269398, "grad_norm": 5.932489240645872, "learning_rate": 9.963216494026235e-07, "loss": 2.1606, "step": 2000 }, { "epoch": 0.13510326471480072, "grad_norm": 5.017857885847957, "learning_rate": 9.962501078308636e-07, "loss": 2.2376, "step": 2005 }, { "epoch": 0.13544018058690746, "grad_norm": 5.343243873628655, "learning_rate": 9.96177879842947e-07, "loss": 2.1721, "step": 2010 }, { "epoch": 0.13577709645901417, "grad_norm": 5.48126736809907, "learning_rate": 9.961049655387799e-07, "loss": 2.2227, "step": 2015 }, { "epoch": 0.13611401233112091, "grad_norm": 5.21084573238217, "learning_rate": 9.960313650192175e-07, "loss": 2.1793, "step": 2020 }, { "epoch": 0.13645092820322766, "grad_norm": 5.357532284785747, "learning_rate": 9.959570783860647e-07, "loss": 2.1799, "step": 2025 }, { "epoch": 0.1367878440753344, "grad_norm": 5.265074215880888, "learning_rate": 9.958821057420752e-07, "loss": 2.232, "step": 2030 }, { "epoch": 0.1371247599474411, "grad_norm": 5.348831521422465, "learning_rate": 9.958064471909513e-07, "loss": 2.1478, "step": 2035 }, { "epoch": 0.13746167581954785, "grad_norm": 5.232213985010375, "learning_rate": 9.95730102837345e-07, "loss": 2.1716, "step": 2040 }, { "epoch": 0.1377985916916546, "grad_norm": 4.662923049152209, "learning_rate": 9.956530727868558e-07, "loss": 
2.2651, "step": 2045 }, { "epoch": 0.13813550756376133, "grad_norm": 5.120004570262222, "learning_rate": 9.955753571460322e-07, "loss": 2.2134, "step": 2050 }, { "epoch": 0.13847242343586808, "grad_norm": 5.424304856228781, "learning_rate": 9.95496956022371e-07, "loss": 2.1992, "step": 2055 }, { "epoch": 0.1388093393079748, "grad_norm": 5.6441736098090445, "learning_rate": 9.95417869524317e-07, "loss": 2.2034, "step": 2060 }, { "epoch": 0.13914625518008153, "grad_norm": 5.097213712355006, "learning_rate": 9.953380977612633e-07, "loss": 2.2106, "step": 2065 }, { "epoch": 0.13948317105218827, "grad_norm": 5.333093675775064, "learning_rate": 9.952576408435505e-07, "loss": 2.253, "step": 2070 }, { "epoch": 0.139820086924295, "grad_norm": 4.9521543139880615, "learning_rate": 9.951764988824674e-07, "loss": 2.1921, "step": 2075 }, { "epoch": 0.14015700279640173, "grad_norm": 4.85848915260835, "learning_rate": 9.950946719902498e-07, "loss": 2.2079, "step": 2080 }, { "epoch": 0.14049391866850847, "grad_norm": 4.636659919766986, "learning_rate": 9.950121602800813e-07, "loss": 2.1593, "step": 2085 }, { "epoch": 0.1408308345406152, "grad_norm": 5.337349816078775, "learning_rate": 9.949289638660922e-07, "loss": 2.164, "step": 2090 }, { "epoch": 0.14116775041272195, "grad_norm": 5.125313270141921, "learning_rate": 9.948450828633608e-07, "loss": 2.1974, "step": 2095 }, { "epoch": 0.1415046662848287, "grad_norm": 4.981500283236779, "learning_rate": 9.947605173879115e-07, "loss": 2.1461, "step": 2100 }, { "epoch": 0.1418415821569354, "grad_norm": 5.244192175099754, "learning_rate": 9.94675267556716e-07, "loss": 2.1468, "step": 2105 }, { "epoch": 0.14217849802904214, "grad_norm": 5.199363133787009, "learning_rate": 9.94589333487692e-07, "loss": 2.1977, "step": 2110 }, { "epoch": 0.14251541390114889, "grad_norm": 5.381347356303496, "learning_rate": 9.945027152997046e-07, "loss": 2.2527, "step": 2115 }, { "epoch": 0.14285232977325563, "grad_norm": 5.231358550567353, "learning_rate": 
9.944154131125642e-07, "loss": 2.2305, "step": 2120 }, { "epoch": 0.14318924564536234, "grad_norm": 5.327600653173003, "learning_rate": 9.94327427047028e-07, "loss": 2.2204, "step": 2125 }, { "epoch": 0.14352616151746908, "grad_norm": 5.678503653519081, "learning_rate": 9.942387572247983e-07, "loss": 2.2069, "step": 2130 }, { "epoch": 0.14386307738957582, "grad_norm": 5.123982382524045, "learning_rate": 9.941494037685243e-07, "loss": 2.1944, "step": 2135 }, { "epoch": 0.14419999326168256, "grad_norm": 4.911850243253756, "learning_rate": 9.940593668017998e-07, "loss": 2.2037, "step": 2140 }, { "epoch": 0.1445369091337893, "grad_norm": 5.736172379674466, "learning_rate": 9.93968646449165e-07, "loss": 2.2137, "step": 2145 }, { "epoch": 0.14487382500589602, "grad_norm": 5.321564339253613, "learning_rate": 9.938772428361045e-07, "loss": 2.2185, "step": 2150 }, { "epoch": 0.14521074087800276, "grad_norm": 5.373186118566773, "learning_rate": 9.937851560890484e-07, "loss": 2.1388, "step": 2155 }, { "epoch": 0.1455476567501095, "grad_norm": 5.047143753978706, "learning_rate": 9.936923863353717e-07, "loss": 2.2162, "step": 2160 }, { "epoch": 0.14588457262221624, "grad_norm": 5.010858477201312, "learning_rate": 9.935989337033939e-07, "loss": 2.1766, "step": 2165 }, { "epoch": 0.14622148849432295, "grad_norm": 5.194945752558841, "learning_rate": 9.935047983223794e-07, "loss": 2.138, "step": 2170 }, { "epoch": 0.1465584043664297, "grad_norm": 4.969626541086585, "learning_rate": 9.934099803225367e-07, "loss": 2.1687, "step": 2175 }, { "epoch": 0.14689532023853644, "grad_norm": 5.319848823194131, "learning_rate": 9.933144798350188e-07, "loss": 2.2098, "step": 2180 }, { "epoch": 0.14723223611064318, "grad_norm": 5.4235455550887295, "learning_rate": 9.932182969919228e-07, "loss": 2.2682, "step": 2185 }, { "epoch": 0.14756915198274992, "grad_norm": 4.4397331069155666, "learning_rate": 9.931214319262885e-07, "loss": 2.1393, "step": 2190 }, { "epoch": 0.14790606785485663, "grad_norm": 
5.244062916372055, "learning_rate": 9.930238847721013e-07, "loss": 2.1582, "step": 2195 }, { "epoch": 0.14824298372696337, "grad_norm": 5.419059831589744, "learning_rate": 9.929256556642884e-07, "loss": 2.0829, "step": 2200 }, { "epoch": 0.14857989959907011, "grad_norm": 5.170834508775464, "learning_rate": 9.92826744738721e-07, "loss": 2.2272, "step": 2205 }, { "epoch": 0.14891681547117686, "grad_norm": 5.528000995476316, "learning_rate": 9.927271521322134e-07, "loss": 2.2139, "step": 2210 }, { "epoch": 0.14925373134328357, "grad_norm": 5.175580439388339, "learning_rate": 9.926268779825224e-07, "loss": 2.1972, "step": 2215 }, { "epoch": 0.1495906472153903, "grad_norm": 4.6318029973371235, "learning_rate": 9.925259224283484e-07, "loss": 2.1842, "step": 2220 }, { "epoch": 0.14992756308749705, "grad_norm": 5.058796364668916, "learning_rate": 9.924242856093332e-07, "loss": 2.1746, "step": 2225 }, { "epoch": 0.1502644789596038, "grad_norm": 4.433138812790062, "learning_rate": 9.923219676660614e-07, "loss": 2.1569, "step": 2230 }, { "epoch": 0.15060139483171053, "grad_norm": 4.999835039047164, "learning_rate": 9.922189687400603e-07, "loss": 2.153, "step": 2235 }, { "epoch": 0.15093831070381725, "grad_norm": 5.6803810833069805, "learning_rate": 9.921152889737984e-07, "loss": 2.185, "step": 2240 }, { "epoch": 0.151275226575924, "grad_norm": 5.430466945527003, "learning_rate": 9.92010928510686e-07, "loss": 2.1845, "step": 2245 }, { "epoch": 0.15161214244803073, "grad_norm": 5.567609049404312, "learning_rate": 9.919058874950754e-07, "loss": 2.1862, "step": 2250 }, { "epoch": 0.15194905832013747, "grad_norm": 4.6350258898766, "learning_rate": 9.9180016607226e-07, "loss": 2.1433, "step": 2255 }, { "epoch": 0.15228597419224418, "grad_norm": 5.5196572916368725, "learning_rate": 9.916937643884737e-07, "loss": 2.2293, "step": 2260 }, { "epoch": 0.15262289006435092, "grad_norm": 5.6609686337647505, "learning_rate": 9.915866825908927e-07, "loss": 2.2166, "step": 2265 }, { "epoch": 
0.15295980593645767, "grad_norm": 4.616994139208586, "learning_rate": 9.914789208276329e-07, "loss": 2.2556, "step": 2270 }, { "epoch": 0.1532967218085644, "grad_norm": 5.213959128766113, "learning_rate": 9.913704792477511e-07, "loss": 2.222, "step": 2275 }, { "epoch": 0.15363363768067115, "grad_norm": 4.514147902907971, "learning_rate": 9.91261358001244e-07, "loss": 2.0984, "step": 2280 }, { "epoch": 0.15397055355277786, "grad_norm": 5.282277053239466, "learning_rate": 9.911515572390495e-07, "loss": 2.1756, "step": 2285 }, { "epoch": 0.1543074694248846, "grad_norm": 5.24982822412204, "learning_rate": 9.91041077113044e-07, "loss": 2.1759, "step": 2290 }, { "epoch": 0.15464438529699134, "grad_norm": 5.094559101462943, "learning_rate": 9.909299177760445e-07, "loss": 2.189, "step": 2295 }, { "epoch": 0.15498130116909808, "grad_norm": 4.8233644468629135, "learning_rate": 9.90818079381807e-07, "loss": 2.256, "step": 2300 }, { "epoch": 0.1553182170412048, "grad_norm": 5.279568281721287, "learning_rate": 9.907055620850277e-07, "loss": 2.159, "step": 2305 }, { "epoch": 0.15565513291331154, "grad_norm": 5.587161591244903, "learning_rate": 9.905923660413409e-07, "loss": 2.1113, "step": 2310 }, { "epoch": 0.15599204878541828, "grad_norm": 5.277879521346425, "learning_rate": 9.904784914073196e-07, "loss": 2.2021, "step": 2315 }, { "epoch": 0.15632896465752502, "grad_norm": 5.326949589973305, "learning_rate": 9.903639383404765e-07, "loss": 2.1713, "step": 2320 }, { "epoch": 0.15666588052963176, "grad_norm": 5.04672639634398, "learning_rate": 9.902487069992618e-07, "loss": 2.1288, "step": 2325 }, { "epoch": 0.15700279640173848, "grad_norm": 5.428410156884528, "learning_rate": 9.901327975430645e-07, "loss": 2.224, "step": 2330 }, { "epoch": 0.15733971227384522, "grad_norm": 5.315079877145513, "learning_rate": 9.900162101322106e-07, "loss": 2.166, "step": 2335 }, { "epoch": 0.15767662814595196, "grad_norm": 5.29220938895336, "learning_rate": 9.898989449279653e-07, "loss": 2.1446, 
"step": 2340 }, { "epoch": 0.1580135440180587, "grad_norm": 5.0130858922002535, "learning_rate": 9.8978100209253e-07, "loss": 2.1986, "step": 2345 }, { "epoch": 0.1583504598901654, "grad_norm": 4.916111913743054, "learning_rate": 9.89662381789044e-07, "loss": 2.2133, "step": 2350 }, { "epoch": 0.15868737576227215, "grad_norm": 5.019518759137215, "learning_rate": 9.89543084181584e-07, "loss": 2.1268, "step": 2355 }, { "epoch": 0.1590242916343789, "grad_norm": 5.203529903469353, "learning_rate": 9.894231094351628e-07, "loss": 2.1905, "step": 2360 }, { "epoch": 0.15936120750648564, "grad_norm": 4.888339323810611, "learning_rate": 9.893024577157303e-07, "loss": 2.2003, "step": 2365 }, { "epoch": 0.15969812337859238, "grad_norm": 5.099146258491599, "learning_rate": 9.891811291901727e-07, "loss": 2.1462, "step": 2370 }, { "epoch": 0.1600350392506991, "grad_norm": 5.183125283279642, "learning_rate": 9.890591240263124e-07, "loss": 2.1587, "step": 2375 }, { "epoch": 0.16037195512280583, "grad_norm": 4.918982733222788, "learning_rate": 9.889364423929075e-07, "loss": 2.1318, "step": 2380 }, { "epoch": 0.16070887099491257, "grad_norm": 4.9741836295566175, "learning_rate": 9.888130844596524e-07, "loss": 2.1502, "step": 2385 }, { "epoch": 0.1610457868670193, "grad_norm": 4.623828347153395, "learning_rate": 9.88689050397176e-07, "loss": 2.2097, "step": 2390 }, { "epoch": 0.16138270273912603, "grad_norm": 5.063900821866024, "learning_rate": 9.885643403770431e-07, "loss": 2.13, "step": 2395 }, { "epoch": 0.16171961861123277, "grad_norm": 5.247902206964608, "learning_rate": 9.884389545717538e-07, "loss": 2.1692, "step": 2400 }, { "epoch": 0.1620565344833395, "grad_norm": 5.759415621784411, "learning_rate": 9.88312893154742e-07, "loss": 2.1105, "step": 2405 }, { "epoch": 0.16239345035544625, "grad_norm": 4.99685714487149, "learning_rate": 9.881861563003766e-07, "loss": 2.2387, "step": 2410 }, { "epoch": 0.162730366227553, "grad_norm": 4.911113663826544, "learning_rate": 
9.880587441839613e-07, "loss": 2.1367, "step": 2415 }, { "epoch": 0.1630672820996597, "grad_norm": 5.231241446061089, "learning_rate": 9.87930656981733e-07, "loss": 2.1642, "step": 2420 }, { "epoch": 0.16340419797176645, "grad_norm": 5.3775759234045735, "learning_rate": 9.878018948708625e-07, "loss": 2.0592, "step": 2425 }, { "epoch": 0.1637411138438732, "grad_norm": 5.140218227066107, "learning_rate": 9.876724580294546e-07, "loss": 2.1834, "step": 2430 }, { "epoch": 0.16407802971597993, "grad_norm": 4.88527863118227, "learning_rate": 9.875423466365471e-07, "loss": 2.1681, "step": 2435 }, { "epoch": 0.16441494558808664, "grad_norm": 5.0210503414476895, "learning_rate": 9.874115608721107e-07, "loss": 2.1229, "step": 2440 }, { "epoch": 0.16475186146019338, "grad_norm": 5.328002138287082, "learning_rate": 9.872801009170492e-07, "loss": 2.2243, "step": 2445 }, { "epoch": 0.16508877733230012, "grad_norm": 4.857769832833448, "learning_rate": 9.871479669531988e-07, "loss": 2.0864, "step": 2450 }, { "epoch": 0.16542569320440687, "grad_norm": 5.788883603240783, "learning_rate": 9.87015159163328e-07, "loss": 2.1644, "step": 2455 }, { "epoch": 0.1657626090765136, "grad_norm": 5.6006843643763435, "learning_rate": 9.868816777311372e-07, "loss": 2.2115, "step": 2460 }, { "epoch": 0.16609952494862032, "grad_norm": 5.0158006475916626, "learning_rate": 9.867475228412592e-07, "loss": 2.1467, "step": 2465 }, { "epoch": 0.16643644082072706, "grad_norm": 5.754609898252899, "learning_rate": 9.866126946792572e-07, "loss": 2.1149, "step": 2470 }, { "epoch": 0.1667733566928338, "grad_norm": 5.006432123708189, "learning_rate": 9.864771934316268e-07, "loss": 2.1549, "step": 2475 }, { "epoch": 0.16711027256494054, "grad_norm": 5.131759313059348, "learning_rate": 9.863410192857938e-07, "loss": 2.2041, "step": 2480 }, { "epoch": 0.16744718843704726, "grad_norm": 4.864088601148526, "learning_rate": 9.862041724301154e-07, "loss": 2.1706, "step": 2485 }, { "epoch": 0.167784104309154, "grad_norm": 
5.073432804570049, "learning_rate": 9.860666530538787e-07, "loss": 2.2007, "step": 2490 }, { "epoch": 0.16812102018126074, "grad_norm": 4.559744076594243, "learning_rate": 9.859284613473017e-07, "loss": 2.1594, "step": 2495 }, { "epoch": 0.16845793605336748, "grad_norm": 4.628850551637293, "learning_rate": 9.857895975015318e-07, "loss": 2.1869, "step": 2500 }, { "epoch": 0.16879485192547422, "grad_norm": 5.090885022211014, "learning_rate": 9.856500617086463e-07, "loss": 2.1424, "step": 2505 }, { "epoch": 0.16913176779758093, "grad_norm": 5.164577950145165, "learning_rate": 9.85509854161652e-07, "loss": 2.1247, "step": 2510 }, { "epoch": 0.16946868366968768, "grad_norm": 5.307961214012326, "learning_rate": 9.853689750544849e-07, "loss": 2.2251, "step": 2515 }, { "epoch": 0.16980559954179442, "grad_norm": 4.849540154454186, "learning_rate": 9.852274245820095e-07, "loss": 2.1524, "step": 2520 }, { "epoch": 0.17014251541390116, "grad_norm": 5.478077139122545, "learning_rate": 9.850852029400198e-07, "loss": 2.1857, "step": 2525 }, { "epoch": 0.17047943128600787, "grad_norm": 5.134735409719765, "learning_rate": 9.849423103252374e-07, "loss": 2.1341, "step": 2530 }, { "epoch": 0.1708163471581146, "grad_norm": 5.318453158017956, "learning_rate": 9.84798746935312e-07, "loss": 2.1591, "step": 2535 }, { "epoch": 0.17115326303022135, "grad_norm": 4.9382133091539995, "learning_rate": 9.846545129688217e-07, "loss": 2.0905, "step": 2540 }, { "epoch": 0.1714901789023281, "grad_norm": 5.1900196457578165, "learning_rate": 9.845096086252716e-07, "loss": 2.1414, "step": 2545 }, { "epoch": 0.17182709477443484, "grad_norm": 4.988185726000063, "learning_rate": 9.843640341050944e-07, "loss": 2.2125, "step": 2550 }, { "epoch": 0.17216401064654155, "grad_norm": 5.046049497819594, "learning_rate": 9.842177896096493e-07, "loss": 2.2134, "step": 2555 }, { "epoch": 0.1725009265186483, "grad_norm": 5.118612045507411, "learning_rate": 9.84070875341223e-07, "loss": 2.1501, "step": 2560 }, { 
"epoch": 0.17283784239075503, "grad_norm": 4.954952754063053, "learning_rate": 9.83923291503028e-07, "loss": 2.1375, "step": 2565 }, { "epoch": 0.17317475826286177, "grad_norm": 5.072348038161897, "learning_rate": 9.837750382992033e-07, "loss": 2.1422, "step": 2570 }, { "epoch": 0.17351167413496849, "grad_norm": 5.165227149831037, "learning_rate": 9.836261159348135e-07, "loss": 2.1585, "step": 2575 }, { "epoch": 0.17384859000707523, "grad_norm": 5.1109247272079275, "learning_rate": 9.834765246158488e-07, "loss": 2.1726, "step": 2580 }, { "epoch": 0.17418550587918197, "grad_norm": 5.173321482921644, "learning_rate": 9.83326264549225e-07, "loss": 2.1927, "step": 2585 }, { "epoch": 0.1745224217512887, "grad_norm": 4.846659525051516, "learning_rate": 9.83175335942783e-07, "loss": 2.1138, "step": 2590 }, { "epoch": 0.17485933762339545, "grad_norm": 5.177169641265199, "learning_rate": 9.830237390052876e-07, "loss": 2.1263, "step": 2595 }, { "epoch": 0.17519625349550216, "grad_norm": 5.313138775793764, "learning_rate": 9.82871473946429e-07, "loss": 2.185, "step": 2600 }, { "epoch": 0.1755331693676089, "grad_norm": 5.468927513531517, "learning_rate": 9.82718540976821e-07, "loss": 2.1638, "step": 2605 }, { "epoch": 0.17587008523971565, "grad_norm": 5.56773841013832, "learning_rate": 9.825649403080015e-07, "loss": 2.15, "step": 2610 }, { "epoch": 0.1762070011118224, "grad_norm": 4.828622859737673, "learning_rate": 9.824106721524317e-07, "loss": 2.0861, "step": 2615 }, { "epoch": 0.1765439169839291, "grad_norm": 4.82562860525859, "learning_rate": 9.822557367234962e-07, "loss": 2.1398, "step": 2620 }, { "epoch": 0.17688083285603584, "grad_norm": 5.251405807193035, "learning_rate": 9.82100134235503e-07, "loss": 2.1843, "step": 2625 }, { "epoch": 0.17721774872814258, "grad_norm": 5.330196441841219, "learning_rate": 9.819438649036823e-07, "loss": 2.1954, "step": 2630 }, { "epoch": 0.17755466460024932, "grad_norm": 5.331757510514492, "learning_rate": 9.817869289441864e-07, "loss": 
2.0998, "step": 2635 }, { "epoch": 0.17789158047235606, "grad_norm": 4.883931977044345, "learning_rate": 9.816293265740907e-07, "loss": 2.1727, "step": 2640 }, { "epoch": 0.17822849634446278, "grad_norm": 4.75494912370302, "learning_rate": 9.81471058011391e-07, "loss": 2.2037, "step": 2645 }, { "epoch": 0.17856541221656952, "grad_norm": 5.138001626210908, "learning_rate": 9.81312123475006e-07, "loss": 2.1632, "step": 2650 }, { "epoch": 0.17890232808867626, "grad_norm": 5.157618800086379, "learning_rate": 9.811525231847746e-07, "loss": 2.1969, "step": 2655 }, { "epoch": 0.179239243960783, "grad_norm": 5.30349705172934, "learning_rate": 9.809922573614569e-07, "loss": 2.1327, "step": 2660 }, { "epoch": 0.17957615983288971, "grad_norm": 5.328784564669315, "learning_rate": 9.808313262267337e-07, "loss": 2.1357, "step": 2665 }, { "epoch": 0.17991307570499646, "grad_norm": 5.113870896275709, "learning_rate": 9.806697300032057e-07, "loss": 2.12, "step": 2670 }, { "epoch": 0.1802499915771032, "grad_norm": 5.319498874202603, "learning_rate": 9.805074689143938e-07, "loss": 2.1598, "step": 2675 }, { "epoch": 0.18058690744920994, "grad_norm": 5.533960220051253, "learning_rate": 9.803445431847388e-07, "loss": 2.1208, "step": 2680 }, { "epoch": 0.18092382332131668, "grad_norm": 5.112378753112238, "learning_rate": 9.801809530396003e-07, "loss": 2.2171, "step": 2685 }, { "epoch": 0.1812607391934234, "grad_norm": 4.709450025681887, "learning_rate": 9.800166987052572e-07, "loss": 2.1019, "step": 2690 }, { "epoch": 0.18159765506553013, "grad_norm": 4.811558331078455, "learning_rate": 9.798517804089072e-07, "loss": 2.273, "step": 2695 }, { "epoch": 0.18193457093763687, "grad_norm": 4.688788064413994, "learning_rate": 9.796861983786661e-07, "loss": 2.0861, "step": 2700 }, { "epoch": 0.18227148680974362, "grad_norm": 4.862935834441943, "learning_rate": 9.795199528435682e-07, "loss": 2.1286, "step": 2705 }, { "epoch": 0.18260840268185033, "grad_norm": 5.0518857702878925, "learning_rate": 
9.793530440335654e-07, "loss": 2.1161, "step": 2710 }, { "epoch": 0.18294531855395707, "grad_norm": 5.127418683050101, "learning_rate": 9.791854721795264e-07, "loss": 2.1087, "step": 2715 }, { "epoch": 0.1832822344260638, "grad_norm": 5.126525083707923, "learning_rate": 9.790172375132385e-07, "loss": 2.144, "step": 2720 }, { "epoch": 0.18361915029817055, "grad_norm": 4.909087206812649, "learning_rate": 9.788483402674041e-07, "loss": 2.2235, "step": 2725 }, { "epoch": 0.1839560661702773, "grad_norm": 5.170783210947623, "learning_rate": 9.786787806756434e-07, "loss": 2.1659, "step": 2730 }, { "epoch": 0.184292982042384, "grad_norm": 5.363173318513327, "learning_rate": 9.78508558972492e-07, "loss": 2.1567, "step": 2735 }, { "epoch": 0.18462989791449075, "grad_norm": 4.888337812541606, "learning_rate": 9.783376753934015e-07, "loss": 2.1651, "step": 2740 }, { "epoch": 0.1849668137865975, "grad_norm": 5.256999726928892, "learning_rate": 9.781661301747393e-07, "loss": 2.0952, "step": 2745 }, { "epoch": 0.18530372965870423, "grad_norm": 5.263717164476087, "learning_rate": 9.779939235537879e-07, "loss": 2.2214, "step": 2750 }, { "epoch": 0.18564064553081094, "grad_norm": 5.190847867384173, "learning_rate": 9.778210557687443e-07, "loss": 2.1545, "step": 2755 }, { "epoch": 0.18597756140291768, "grad_norm": 5.0906564292368515, "learning_rate": 9.776475270587205e-07, "loss": 2.1544, "step": 2760 }, { "epoch": 0.18631447727502443, "grad_norm": 5.038563426235627, "learning_rate": 9.774733376637421e-07, "loss": 2.1453, "step": 2765 }, { "epoch": 0.18665139314713117, "grad_norm": 5.311467912734165, "learning_rate": 9.772984878247493e-07, "loss": 2.1574, "step": 2770 }, { "epoch": 0.1869883090192379, "grad_norm": 5.25564262466903, "learning_rate": 9.771229777835952e-07, "loss": 2.1278, "step": 2775 }, { "epoch": 0.18732522489134462, "grad_norm": 4.869077756442381, "learning_rate": 9.769468077830466e-07, "loss": 2.1257, "step": 2780 }, { "epoch": 0.18766214076345136, "grad_norm": 
4.922482657505179, "learning_rate": 9.767699780667827e-07, "loss": 2.161, "step": 2785 }, { "epoch": 0.1879990566355581, "grad_norm": 4.918033104644682, "learning_rate": 9.765924888793955e-07, "loss": 2.0896, "step": 2790 }, { "epoch": 0.18833597250766484, "grad_norm": 4.593688100331709, "learning_rate": 9.76414340466389e-07, "loss": 2.1631, "step": 2795 }, { "epoch": 0.18867288837977156, "grad_norm": 5.662561179713983, "learning_rate": 9.762355330741794e-07, "loss": 2.1897, "step": 2800 }, { "epoch": 0.1890098042518783, "grad_norm": 4.667061292998714, "learning_rate": 9.760560669500941e-07, "loss": 2.1379, "step": 2805 }, { "epoch": 0.18934672012398504, "grad_norm": 5.30487181766524, "learning_rate": 9.758759423423716e-07, "loss": 2.1464, "step": 2810 }, { "epoch": 0.18968363599609178, "grad_norm": 5.069708174337939, "learning_rate": 9.756951595001617e-07, "loss": 2.0903, "step": 2815 }, { "epoch": 0.19002055186819852, "grad_norm": 5.156866818822945, "learning_rate": 9.755137186735238e-07, "loss": 2.1489, "step": 2820 }, { "epoch": 0.19035746774030524, "grad_norm": 5.334110504029155, "learning_rate": 9.753316201134282e-07, "loss": 2.1635, "step": 2825 }, { "epoch": 0.19069438361241198, "grad_norm": 4.740585582156594, "learning_rate": 9.75148864071755e-07, "loss": 2.1278, "step": 2830 }, { "epoch": 0.19103129948451872, "grad_norm": 5.395883343905841, "learning_rate": 9.74965450801293e-07, "loss": 2.1485, "step": 2835 }, { "epoch": 0.19136821535662546, "grad_norm": 5.190368461027931, "learning_rate": 9.747813805557408e-07, "loss": 2.12, "step": 2840 }, { "epoch": 0.19170513122873217, "grad_norm": 5.009686206902522, "learning_rate": 9.745966535897054e-07, "loss": 2.1059, "step": 2845 }, { "epoch": 0.19204204710083891, "grad_norm": 4.985541466049405, "learning_rate": 9.744112701587024e-07, "loss": 2.1393, "step": 2850 }, { "epoch": 0.19237896297294566, "grad_norm": 5.19036763650696, "learning_rate": 9.742252305191551e-07, "loss": 2.1092, "step": 2855 }, { "epoch": 
0.1927158788450524, "grad_norm": 5.130782368130088, "learning_rate": 9.740385349283946e-07, "loss": 2.161, "step": 2860 }, { "epoch": 0.19305279471715914, "grad_norm": 5.346576947948715, "learning_rate": 9.738511836446596e-07, "loss": 2.1823, "step": 2865 }, { "epoch": 0.19338971058926585, "grad_norm": 4.774445391705081, "learning_rate": 9.736631769270957e-07, "loss": 2.1526, "step": 2870 }, { "epoch": 0.1937266264613726, "grad_norm": 5.050305271220875, "learning_rate": 9.734745150357544e-07, "loss": 2.1565, "step": 2875 }, { "epoch": 0.19406354233347933, "grad_norm": 4.972076335708848, "learning_rate": 9.732851982315944e-07, "loss": 2.2068, "step": 2880 }, { "epoch": 0.19440045820558607, "grad_norm": 5.034181807390292, "learning_rate": 9.730952267764796e-07, "loss": 2.1142, "step": 2885 }, { "epoch": 0.1947373740776928, "grad_norm": 4.992322551354363, "learning_rate": 9.729046009331798e-07, "loss": 2.1041, "step": 2890 }, { "epoch": 0.19507428994979953, "grad_norm": 4.975402849207011, "learning_rate": 9.727133209653696e-07, "loss": 2.1304, "step": 2895 }, { "epoch": 0.19541120582190627, "grad_norm": 4.7027430890295765, "learning_rate": 9.72521387137629e-07, "loss": 2.1734, "step": 2900 }, { "epoch": 0.195748121694013, "grad_norm": 5.444819980625174, "learning_rate": 9.723287997154419e-07, "loss": 2.2045, "step": 2905 }, { "epoch": 0.19608503756611975, "grad_norm": 4.5962710438501455, "learning_rate": 9.72135558965196e-07, "loss": 2.1459, "step": 2910 }, { "epoch": 0.19642195343822647, "grad_norm": 4.781589941590085, "learning_rate": 9.719416651541837e-07, "loss": 2.0725, "step": 2915 }, { "epoch": 0.1967588693103332, "grad_norm": 4.652972949031103, "learning_rate": 9.717471185505996e-07, "loss": 2.1028, "step": 2920 }, { "epoch": 0.19709578518243995, "grad_norm": 5.105976255157077, "learning_rate": 9.715519194235422e-07, "loss": 2.1044, "step": 2925 }, { "epoch": 0.1974327010545467, "grad_norm": 5.25064439743658, "learning_rate": 9.713560680430117e-07, "loss": 
2.1472, "step": 2930 }, { "epoch": 0.1977696169266534, "grad_norm": 5.2481746864763075, "learning_rate": 9.71159564679911e-07, "loss": 2.1454, "step": 2935 }, { "epoch": 0.19810653279876014, "grad_norm": 5.297905390196966, "learning_rate": 9.709624096060449e-07, "loss": 2.1198, "step": 2940 }, { "epoch": 0.19844344867086688, "grad_norm": 4.728630565261811, "learning_rate": 9.707646030941192e-07, "loss": 2.1173, "step": 2945 }, { "epoch": 0.19878036454297363, "grad_norm": 4.868705931881028, "learning_rate": 9.705661454177416e-07, "loss": 2.1222, "step": 2950 }, { "epoch": 0.19911728041508037, "grad_norm": 5.13453448571877, "learning_rate": 9.703670368514192e-07, "loss": 2.1511, "step": 2955 }, { "epoch": 0.19945419628718708, "grad_norm": 4.915089707555915, "learning_rate": 9.701672776705609e-07, "loss": 2.1263, "step": 2960 }, { "epoch": 0.19979111215929382, "grad_norm": 5.031201146472008, "learning_rate": 9.699668681514746e-07, "loss": 2.1753, "step": 2965 }, { "epoch": 0.20012802803140056, "grad_norm": 4.598851174922252, "learning_rate": 9.697658085713676e-07, "loss": 2.1671, "step": 2970 }, { "epoch": 0.2004649439035073, "grad_norm": 5.347206490797603, "learning_rate": 9.695640992083471e-07, "loss": 2.1053, "step": 2975 }, { "epoch": 0.20080185977561402, "grad_norm": 5.01498711823233, "learning_rate": 9.693617403414188e-07, "loss": 2.1264, "step": 2980 }, { "epoch": 0.20113877564772076, "grad_norm": 5.435402954079487, "learning_rate": 9.691587322504865e-07, "loss": 2.1032, "step": 2985 }, { "epoch": 0.2014756915198275, "grad_norm": 5.049792747705696, "learning_rate": 9.68955075216352e-07, "loss": 2.1916, "step": 2990 }, { "epoch": 0.20181260739193424, "grad_norm": 5.420587735153483, "learning_rate": 9.687507695207154e-07, "loss": 2.1344, "step": 2995 }, { "epoch": 0.20214952326404098, "grad_norm": 4.976979183404866, "learning_rate": 9.685458154461731e-07, "loss": 2.1108, "step": 3000 }, { "epoch": 0.2024864391361477, "grad_norm": 4.69787487415785, 
"learning_rate": 9.683402132762193e-07, "loss": 2.1734, "step": 3005 }, { "epoch": 0.20282335500825444, "grad_norm": 4.977478059205152, "learning_rate": 9.68133963295244e-07, "loss": 2.2049, "step": 3010 }, { "epoch": 0.20316027088036118, "grad_norm": 5.271244003555458, "learning_rate": 9.679270657885334e-07, "loss": 2.0807, "step": 3015 }, { "epoch": 0.20349718675246792, "grad_norm": 5.01911721881279, "learning_rate": 9.677195210422693e-07, "loss": 2.1846, "step": 3020 }, { "epoch": 0.20383410262457463, "grad_norm": 5.036922362116121, "learning_rate": 9.675113293435288e-07, "loss": 2.1265, "step": 3025 }, { "epoch": 0.20417101849668137, "grad_norm": 4.995290408761021, "learning_rate": 9.673024909802841e-07, "loss": 2.0967, "step": 3030 }, { "epoch": 0.2045079343687881, "grad_norm": 4.707589142927217, "learning_rate": 9.670930062414017e-07, "loss": 2.0636, "step": 3035 }, { "epoch": 0.20484485024089485, "grad_norm": 5.169917768370069, "learning_rate": 9.66882875416642e-07, "loss": 2.1599, "step": 3040 }, { "epoch": 0.2051817661130016, "grad_norm": 5.13966624340141, "learning_rate": 9.666720987966595e-07, "loss": 2.1216, "step": 3045 }, { "epoch": 0.2055186819851083, "grad_norm": 5.163718376603189, "learning_rate": 9.664606766730012e-07, "loss": 2.1476, "step": 3050 }, { "epoch": 0.20585559785721505, "grad_norm": 5.103416892391465, "learning_rate": 9.662486093381082e-07, "loss": 2.1324, "step": 3055 }, { "epoch": 0.2061925137293218, "grad_norm": 5.21296467800179, "learning_rate": 9.660358970853126e-07, "loss": 2.0795, "step": 3060 }, { "epoch": 0.20652942960142853, "grad_norm": 4.812137613001277, "learning_rate": 9.658225402088395e-07, "loss": 2.0822, "step": 3065 }, { "epoch": 0.20686634547353525, "grad_norm": 4.748787684151768, "learning_rate": 9.656085390038058e-07, "loss": 2.1421, "step": 3070 }, { "epoch": 0.207203261345642, "grad_norm": 5.024051844524781, "learning_rate": 9.653938937662187e-07, "loss": 2.1292, "step": 3075 }, { "epoch": 0.20754017721774873, 
"grad_norm": 4.97045532457049, "learning_rate": 9.651786047929772e-07, "loss": 2.1032, "step": 3080 }, { "epoch": 0.20787709308985547, "grad_norm": 5.109939306895058, "learning_rate": 9.649626723818702e-07, "loss": 2.1736, "step": 3085 }, { "epoch": 0.2082140089619622, "grad_norm": 5.246395972552146, "learning_rate": 9.647460968315767e-07, "loss": 2.2216, "step": 3090 }, { "epoch": 0.20855092483406892, "grad_norm": 5.267515215313974, "learning_rate": 9.645288784416652e-07, "loss": 2.1335, "step": 3095 }, { "epoch": 0.20888784070617566, "grad_norm": 4.79548358884579, "learning_rate": 9.643110175125935e-07, "loss": 2.1844, "step": 3100 }, { "epoch": 0.2092247565782824, "grad_norm": 4.705284044166181, "learning_rate": 9.640925143457084e-07, "loss": 2.0918, "step": 3105 }, { "epoch": 0.20956167245038915, "grad_norm": 5.31253700166042, "learning_rate": 9.638733692432448e-07, "loss": 2.1546, "step": 3110 }, { "epoch": 0.20989858832249586, "grad_norm": 5.0628050419595265, "learning_rate": 9.636535825083252e-07, "loss": 2.1679, "step": 3115 }, { "epoch": 0.2102355041946026, "grad_norm": 4.843587604065121, "learning_rate": 9.634331544449601e-07, "loss": 2.1698, "step": 3120 }, { "epoch": 0.21057242006670934, "grad_norm": 5.129212564842519, "learning_rate": 9.632120853580472e-07, "loss": 2.1376, "step": 3125 }, { "epoch": 0.21090933593881608, "grad_norm": 4.679515590250646, "learning_rate": 9.6299037555337e-07, "loss": 2.0861, "step": 3130 }, { "epoch": 0.21124625181092282, "grad_norm": 5.321358333809082, "learning_rate": 9.627680253375997e-07, "loss": 2.0771, "step": 3135 }, { "epoch": 0.21158316768302954, "grad_norm": 5.195362553268835, "learning_rate": 9.625450350182918e-07, "loss": 2.1319, "step": 3140 }, { "epoch": 0.21192008355513628, "grad_norm": 4.710272874975891, "learning_rate": 9.62321404903888e-07, "loss": 2.0652, "step": 3145 }, { "epoch": 0.21225699942724302, "grad_norm": 4.601277261810548, "learning_rate": 9.620971353037148e-07, "loss": 2.1065, "step": 3150 }, 
{ "epoch": 0.21259391529934976, "grad_norm": 5.158244286104928, "learning_rate": 9.618722265279835e-07, "loss": 2.1394, "step": 3155 }, { "epoch": 0.21293083117145647, "grad_norm": 5.891288693014387, "learning_rate": 9.61646678887789e-07, "loss": 2.1382, "step": 3160 }, { "epoch": 0.21326774704356322, "grad_norm": 5.590347422641254, "learning_rate": 9.614204926951102e-07, "loss": 2.158, "step": 3165 }, { "epoch": 0.21360466291566996, "grad_norm": 4.712382321324783, "learning_rate": 9.611936682628095e-07, "loss": 2.1386, "step": 3170 }, { "epoch": 0.2139415787877767, "grad_norm": 4.805145396334973, "learning_rate": 9.609662059046315e-07, "loss": 2.121, "step": 3175 }, { "epoch": 0.21427849465988344, "grad_norm": 4.8338093061126335, "learning_rate": 9.607381059352038e-07, "loss": 2.1204, "step": 3180 }, { "epoch": 0.21461541053199015, "grad_norm": 4.9858280783219575, "learning_rate": 9.605093686700353e-07, "loss": 2.1864, "step": 3185 }, { "epoch": 0.2149523264040969, "grad_norm": 5.4676550901054695, "learning_rate": 9.602799944255172e-07, "loss": 2.1009, "step": 3190 }, { "epoch": 0.21528924227620364, "grad_norm": 5.0144622133583505, "learning_rate": 9.60049983518921e-07, "loss": 2.1816, "step": 3195 }, { "epoch": 0.21562615814831038, "grad_norm": 4.851050461168701, "learning_rate": 9.598193362683995e-07, "loss": 2.0907, "step": 3200 }, { "epoch": 0.2159630740204171, "grad_norm": 5.694069949705275, "learning_rate": 9.59588052992985e-07, "loss": 2.1104, "step": 3205 }, { "epoch": 0.21629998989252383, "grad_norm": 4.954230549714506, "learning_rate": 9.5935613401259e-07, "loss": 2.1456, "step": 3210 }, { "epoch": 0.21663690576463057, "grad_norm": 4.852682247155471, "learning_rate": 9.591235796480064e-07, "loss": 2.1557, "step": 3215 }, { "epoch": 0.2169738216367373, "grad_norm": 4.993723372569236, "learning_rate": 9.588903902209048e-07, "loss": 2.1205, "step": 3220 }, { "epoch": 0.21731073750884405, "grad_norm": 4.993902756814535, "learning_rate": 
9.586565660538343e-07, "loss": 2.097, "step": 3225 }, { "epoch": 0.21764765338095077, "grad_norm": 4.803173295776384, "learning_rate": 9.584221074702217e-07, "loss": 2.2058, "step": 3230 }, { "epoch": 0.2179845692530575, "grad_norm": 5.1892971556621195, "learning_rate": 9.581870147943715e-07, "loss": 2.1272, "step": 3235 }, { "epoch": 0.21832148512516425, "grad_norm": 4.930896181990913, "learning_rate": 9.579512883514656e-07, "loss": 2.1617, "step": 3240 }, { "epoch": 0.218658400997271, "grad_norm": 5.3195679740470725, "learning_rate": 9.577149284675619e-07, "loss": 2.1354, "step": 3245 }, { "epoch": 0.2189953168693777, "grad_norm": 4.640306680573093, "learning_rate": 9.574779354695951e-07, "loss": 2.1535, "step": 3250 }, { "epoch": 0.21933223274148445, "grad_norm": 5.290873566635087, "learning_rate": 9.572403096853754e-07, "loss": 2.1189, "step": 3255 }, { "epoch": 0.2196691486135912, "grad_norm": 4.771890942163392, "learning_rate": 9.570020514435878e-07, "loss": 2.1381, "step": 3260 }, { "epoch": 0.22000606448569793, "grad_norm": 5.803290002393497, "learning_rate": 9.567631610737929e-07, "loss": 2.0575, "step": 3265 }, { "epoch": 0.22034298035780467, "grad_norm": 5.401012315493192, "learning_rate": 9.565236389064255e-07, "loss": 2.1675, "step": 3270 }, { "epoch": 0.22067989622991138, "grad_norm": 4.7570071709038215, "learning_rate": 9.562834852727935e-07, "loss": 2.147, "step": 3275 }, { "epoch": 0.22101681210201812, "grad_norm": 4.854363079213996, "learning_rate": 9.560427005050793e-07, "loss": 2.1305, "step": 3280 }, { "epoch": 0.22135372797412486, "grad_norm": 4.801788844040764, "learning_rate": 9.55801284936338e-07, "loss": 2.1108, "step": 3285 }, { "epoch": 0.2216906438462316, "grad_norm": 5.237474657216439, "learning_rate": 9.555592389004966e-07, "loss": 2.0673, "step": 3290 }, { "epoch": 0.22202755971833832, "grad_norm": 5.131457430035256, "learning_rate": 9.553165627323548e-07, "loss": 2.137, "step": 3295 }, { "epoch": 0.22236447559044506, "grad_norm": 
5.485002336407805, "learning_rate": 9.55073256767584e-07, "loss": 2.1183, "step": 3300 }, { "epoch": 0.2227013914625518, "grad_norm": 5.0393907907734, "learning_rate": 9.548293213427262e-07, "loss": 2.1148, "step": 3305 }, { "epoch": 0.22303830733465854, "grad_norm": 4.770171493577373, "learning_rate": 9.545847567951944e-07, "loss": 2.1333, "step": 3310 }, { "epoch": 0.22337522320676528, "grad_norm": 4.943474996271445, "learning_rate": 9.543395634632721e-07, "loss": 2.1363, "step": 3315 }, { "epoch": 0.223712139078872, "grad_norm": 4.960137407289092, "learning_rate": 9.540937416861117e-07, "loss": 2.0943, "step": 3320 }, { "epoch": 0.22404905495097874, "grad_norm": 5.6801626984975115, "learning_rate": 9.538472918037356e-07, "loss": 2.1875, "step": 3325 }, { "epoch": 0.22438597082308548, "grad_norm": 4.504538488152941, "learning_rate": 9.536002141570348e-07, "loss": 2.1624, "step": 3330 }, { "epoch": 0.22472288669519222, "grad_norm": 4.944033444385733, "learning_rate": 9.533525090877688e-07, "loss": 2.1697, "step": 3335 }, { "epoch": 0.22505980256729893, "grad_norm": 5.197592753244709, "learning_rate": 9.531041769385641e-07, "loss": 2.0969, "step": 3340 }, { "epoch": 0.22539671843940567, "grad_norm": 5.2349088706678355, "learning_rate": 9.528552180529161e-07, "loss": 2.2203, "step": 3345 }, { "epoch": 0.22573363431151242, "grad_norm": 5.5679316276092266, "learning_rate": 9.526056327751856e-07, "loss": 2.1319, "step": 3350 }, { "epoch": 0.22607055018361916, "grad_norm": 5.402026546145546, "learning_rate": 9.523554214506006e-07, "loss": 2.1527, "step": 3355 }, { "epoch": 0.2264074660557259, "grad_norm": 5.596067927023199, "learning_rate": 9.521045844252551e-07, "loss": 2.1626, "step": 3360 }, { "epoch": 0.2267443819278326, "grad_norm": 5.215137255838939, "learning_rate": 9.518531220461084e-07, "loss": 2.1115, "step": 3365 }, { "epoch": 0.22708129779993935, "grad_norm": 4.43483727784728, "learning_rate": 9.516010346609845e-07, "loss": 2.1123, "step": 3370 }, { "epoch": 
0.2274182136720461, "grad_norm": 5.139189232537159, "learning_rate": 9.513483226185723e-07, "loss": 2.2061, "step": 3375 }, { "epoch": 0.22775512954415283, "grad_norm": 5.404053639215732, "learning_rate": 9.510949862684248e-07, "loss": 2.0881, "step": 3380 }, { "epoch": 0.22809204541625955, "grad_norm": 4.732202258203988, "learning_rate": 9.508410259609583e-07, "loss": 2.1348, "step": 3385 }, { "epoch": 0.2284289612883663, "grad_norm": 5.189824054903416, "learning_rate": 9.505864420474522e-07, "loss": 2.1445, "step": 3390 }, { "epoch": 0.22876587716047303, "grad_norm": 5.08697869032668, "learning_rate": 9.503312348800485e-07, "loss": 2.0768, "step": 3395 }, { "epoch": 0.22910279303257977, "grad_norm": 4.542485700481022, "learning_rate": 9.500754048117514e-07, "loss": 2.1057, "step": 3400 }, { "epoch": 0.2294397089046865, "grad_norm": 4.88433933006251, "learning_rate": 9.498189521964263e-07, "loss": 2.0528, "step": 3405 }, { "epoch": 0.22977662477679323, "grad_norm": 5.044315575848994, "learning_rate": 9.495618773888006e-07, "loss": 2.0528, "step": 3410 }, { "epoch": 0.23011354064889997, "grad_norm": 5.192417327197871, "learning_rate": 9.49304180744461e-07, "loss": 2.1402, "step": 3415 }, { "epoch": 0.2304504565210067, "grad_norm": 4.791010862874757, "learning_rate": 9.490458626198556e-07, "loss": 2.1184, "step": 3420 }, { "epoch": 0.23078737239311345, "grad_norm": 5.560263990452688, "learning_rate": 9.487869233722915e-07, "loss": 2.1008, "step": 3425 }, { "epoch": 0.23112428826522016, "grad_norm": 5.000236602973637, "learning_rate": 9.485273633599348e-07, "loss": 2.1001, "step": 3430 }, { "epoch": 0.2314612041373269, "grad_norm": 5.046172421499073, "learning_rate": 9.482671829418107e-07, "loss": 2.116, "step": 3435 }, { "epoch": 0.23179812000943364, "grad_norm": 4.923387604877615, "learning_rate": 9.480063824778024e-07, "loss": 2.096, "step": 3440 }, { "epoch": 0.23213503588154039, "grad_norm": 5.1525670921510835, "learning_rate": 9.477449623286505e-07, "loss": 
2.0803, "step": 3445 }, { "epoch": 0.23247195175364713, "grad_norm": 4.711631687924128, "learning_rate": 9.474829228559529e-07, "loss": 2.1721, "step": 3450 }, { "epoch": 0.23280886762575384, "grad_norm": 5.43934180789806, "learning_rate": 9.472202644221643e-07, "loss": 2.1878, "step": 3455 }, { "epoch": 0.23314578349786058, "grad_norm": 5.4600305589240685, "learning_rate": 9.469569873905955e-07, "loss": 2.0874, "step": 3460 }, { "epoch": 0.23348269936996732, "grad_norm": 5.013569009145087, "learning_rate": 9.466930921254128e-07, "loss": 2.0725, "step": 3465 }, { "epoch": 0.23381961524207406, "grad_norm": 5.332867903281223, "learning_rate": 9.464285789916376e-07, "loss": 2.121, "step": 3470 }, { "epoch": 0.23415653111418078, "grad_norm": 5.427446042008524, "learning_rate": 9.461634483551464e-07, "loss": 2.0826, "step": 3475 }, { "epoch": 0.23449344698628752, "grad_norm": 5.074108406003084, "learning_rate": 9.458977005826691e-07, "loss": 2.0829, "step": 3480 }, { "epoch": 0.23483036285839426, "grad_norm": 5.060487698449351, "learning_rate": 9.456313360417899e-07, "loss": 2.1336, "step": 3485 }, { "epoch": 0.235167278730501, "grad_norm": 4.781472976884829, "learning_rate": 9.453643551009459e-07, "loss": 2.0993, "step": 3490 }, { "epoch": 0.23550419460260774, "grad_norm": 5.268006282348415, "learning_rate": 9.450967581294265e-07, "loss": 2.1756, "step": 3495 }, { "epoch": 0.23584111047471445, "grad_norm": 4.6588880482607395, "learning_rate": 9.448285454973737e-07, "loss": 2.1132, "step": 3500 }, { "epoch": 0.2361780263468212, "grad_norm": 5.634450349842177, "learning_rate": 9.445597175757806e-07, "loss": 2.0666, "step": 3505 }, { "epoch": 0.23651494221892794, "grad_norm": 4.775943859758892, "learning_rate": 9.442902747364918e-07, "loss": 2.0681, "step": 3510 }, { "epoch": 0.23685185809103468, "grad_norm": 5.001227477201418, "learning_rate": 9.440202173522022e-07, "loss": 2.1117, "step": 3515 }, { "epoch": 0.2371887739631414, "grad_norm": 5.052846154796682, 
"learning_rate": 9.437495457964568e-07, "loss": 2.1069, "step": 3520 }, { "epoch": 0.23752568983524813, "grad_norm": 4.876125295376855, "learning_rate": 9.434782604436502e-07, "loss": 2.0965, "step": 3525 }, { "epoch": 0.23786260570735487, "grad_norm": 4.811034171148802, "learning_rate": 9.432063616690258e-07, "loss": 2.0964, "step": 3530 }, { "epoch": 0.23819952157946161, "grad_norm": 5.800404661205687, "learning_rate": 9.429338498486758e-07, "loss": 2.0714, "step": 3535 }, { "epoch": 0.23853643745156836, "grad_norm": 4.831540284786558, "learning_rate": 9.426607253595402e-07, "loss": 2.1353, "step": 3540 }, { "epoch": 0.23887335332367507, "grad_norm": 4.869332493687385, "learning_rate": 9.423869885794063e-07, "loss": 2.1252, "step": 3545 }, { "epoch": 0.2392102691957818, "grad_norm": 5.708531897684237, "learning_rate": 9.421126398869086e-07, "loss": 2.1627, "step": 3550 }, { "epoch": 0.23954718506788855, "grad_norm": 5.150345064758385, "learning_rate": 9.418376796615279e-07, "loss": 2.0939, "step": 3555 }, { "epoch": 0.2398841009399953, "grad_norm": 5.03528498671145, "learning_rate": 9.415621082835908e-07, "loss": 2.0928, "step": 3560 }, { "epoch": 0.240221016812102, "grad_norm": 4.886815738607965, "learning_rate": 9.412859261342691e-07, "loss": 2.0727, "step": 3565 }, { "epoch": 0.24055793268420875, "grad_norm": 5.272389074214625, "learning_rate": 9.410091335955798e-07, "loss": 2.0981, "step": 3570 }, { "epoch": 0.2408948485563155, "grad_norm": 5.502419770266727, "learning_rate": 9.407317310503841e-07, "loss": 2.1334, "step": 3575 }, { "epoch": 0.24123176442842223, "grad_norm": 5.401673568371828, "learning_rate": 9.404537188823869e-07, "loss": 2.1834, "step": 3580 }, { "epoch": 0.24156868030052897, "grad_norm": 4.6687632028275905, "learning_rate": 9.40175097476136e-07, "loss": 2.1009, "step": 3585 }, { "epoch": 0.24190559617263568, "grad_norm": 5.380510313634014, "learning_rate": 9.398958672170225e-07, "loss": 2.0793, "step": 3590 }, { "epoch": 
0.24224251204474243, "grad_norm": 4.965443223132322, "learning_rate": 9.396160284912795e-07, "loss": 2.1425, "step": 3595 }, { "epoch": 0.24257942791684917, "grad_norm": 4.847868382747975, "learning_rate": 9.393355816859813e-07, "loss": 2.0986, "step": 3600 }, { "epoch": 0.2429163437889559, "grad_norm": 4.886901922902836, "learning_rate": 9.390545271890437e-07, "loss": 2.1295, "step": 3605 }, { "epoch": 0.24325325966106262, "grad_norm": 5.420003202103658, "learning_rate": 9.387728653892233e-07, "loss": 2.1114, "step": 3610 }, { "epoch": 0.24359017553316936, "grad_norm": 5.158621107610909, "learning_rate": 9.384905966761159e-07, "loss": 2.1292, "step": 3615 }, { "epoch": 0.2439270914052761, "grad_norm": 5.665748765918846, "learning_rate": 9.382077214401576e-07, "loss": 2.1082, "step": 3620 }, { "epoch": 0.24426400727738284, "grad_norm": 5.165157506372016, "learning_rate": 9.379242400726232e-07, "loss": 2.1158, "step": 3625 }, { "epoch": 0.24460092314948959, "grad_norm": 5.2581042376869185, "learning_rate": 9.376401529656257e-07, "loss": 2.1108, "step": 3630 }, { "epoch": 0.2449378390215963, "grad_norm": 5.090142282432504, "learning_rate": 9.373554605121161e-07, "loss": 2.0416, "step": 3635 }, { "epoch": 0.24527475489370304, "grad_norm": 5.194733518463064, "learning_rate": 9.370701631058828e-07, "loss": 2.0887, "step": 3640 }, { "epoch": 0.24561167076580978, "grad_norm": 5.41008216165671, "learning_rate": 9.367842611415508e-07, "loss": 2.1319, "step": 3645 }, { "epoch": 0.24594858663791652, "grad_norm": 5.189560883627085, "learning_rate": 9.364977550145816e-07, "loss": 2.0977, "step": 3650 }, { "epoch": 0.24628550251002324, "grad_norm": 5.051380958083347, "learning_rate": 9.362106451212721e-07, "loss": 2.1737, "step": 3655 }, { "epoch": 0.24662241838212998, "grad_norm": 5.5997980632274, "learning_rate": 9.359229318587545e-07, "loss": 2.1268, "step": 3660 }, { "epoch": 0.24695933425423672, "grad_norm": 5.158204673240589, "learning_rate": 9.356346156249954e-07, "loss": 
2.1178, "step": 3665 }, { "epoch": 0.24729625012634346, "grad_norm": 5.069021379778357, "learning_rate": 9.353456968187958e-07, "loss": 2.1539, "step": 3670 }, { "epoch": 0.2476331659984502, "grad_norm": 4.551101395199881, "learning_rate": 9.350561758397897e-07, "loss": 2.1059, "step": 3675 }, { "epoch": 0.2479700818705569, "grad_norm": 5.933639565305363, "learning_rate": 9.347660530884442e-07, "loss": 2.1422, "step": 3680 }, { "epoch": 0.24830699774266365, "grad_norm": 4.866391432596583, "learning_rate": 9.344753289660592e-07, "loss": 2.1041, "step": 3685 }, { "epoch": 0.2486439136147704, "grad_norm": 5.3640678384518266, "learning_rate": 9.34184003874766e-07, "loss": 2.1488, "step": 3690 }, { "epoch": 0.24898082948687714, "grad_norm": 5.151134693388474, "learning_rate": 9.338920782175269e-07, "loss": 2.1424, "step": 3695 }, { "epoch": 0.24931774535898385, "grad_norm": 5.445050433669452, "learning_rate": 9.335995523981355e-07, "loss": 2.1284, "step": 3700 }, { "epoch": 0.2496546612310906, "grad_norm": 5.304786273511047, "learning_rate": 9.333064268212153e-07, "loss": 2.0741, "step": 3705 }, { "epoch": 0.24999157710319733, "grad_norm": 5.711882273435352, "learning_rate": 9.330127018922193e-07, "loss": 2.1256, "step": 3710 }, { "epoch": 0.2503284929753041, "grad_norm": 4.54512011431085, "learning_rate": 9.327183780174296e-07, "loss": 2.0809, "step": 3715 }, { "epoch": 0.2506654088474108, "grad_norm": 4.928827261676618, "learning_rate": 9.324234556039567e-07, "loss": 2.0827, "step": 3720 }, { "epoch": 0.25100232471951756, "grad_norm": 4.990764293285294, "learning_rate": 9.321279350597393e-07, "loss": 2.1005, "step": 3725 }, { "epoch": 0.25133924059162427, "grad_norm": 4.524962206474226, "learning_rate": 9.31831816793543e-07, "loss": 2.0663, "step": 3730 }, { "epoch": 0.251676156463731, "grad_norm": 4.8129269968762864, "learning_rate": 9.315351012149605e-07, "loss": 2.0935, "step": 3735 }, { "epoch": 0.25201307233583775, "grad_norm": 4.926099677681959, "learning_rate": 
9.312377887344105e-07, "loss": 2.0613, "step": 3740 }, { "epoch": 0.25234998820794446, "grad_norm": 4.913868593684052, "learning_rate": 9.309398797631374e-07, "loss": 2.1211, "step": 3745 }, { "epoch": 0.25268690408005123, "grad_norm": 4.978367670072288, "learning_rate": 9.306413747132108e-07, "loss": 2.0775, "step": 3750 }, { "epoch": 0.25302381995215795, "grad_norm": 6.226592557899111, "learning_rate": 9.303422739975246e-07, "loss": 2.171, "step": 3755 }, { "epoch": 0.25336073582426466, "grad_norm": 5.418549505854404, "learning_rate": 9.300425780297968e-07, "loss": 2.1135, "step": 3760 }, { "epoch": 0.25369765169637143, "grad_norm": 5.333424205766533, "learning_rate": 9.297422872245686e-07, "loss": 2.0715, "step": 3765 }, { "epoch": 0.25403456756847814, "grad_norm": 4.916914280524372, "learning_rate": 9.294414019972043e-07, "loss": 2.1055, "step": 3770 }, { "epoch": 0.2543714834405849, "grad_norm": 5.415909205098182, "learning_rate": 9.291399227638898e-07, "loss": 2.1023, "step": 3775 }, { "epoch": 0.2547083993126916, "grad_norm": 5.06142986960587, "learning_rate": 9.288378499416332e-07, "loss": 2.0924, "step": 3780 }, { "epoch": 0.25504531518479834, "grad_norm": 5.247841686398922, "learning_rate": 9.285351839482634e-07, "loss": 2.1107, "step": 3785 }, { "epoch": 0.2553822310569051, "grad_norm": 5.250717049312658, "learning_rate": 9.2823192520243e-07, "loss": 2.1107, "step": 3790 }, { "epoch": 0.2557191469290118, "grad_norm": 5.2628369128434915, "learning_rate": 9.27928074123602e-07, "loss": 2.1018, "step": 3795 }, { "epoch": 0.25605606280111853, "grad_norm": 5.010657868034676, "learning_rate": 9.276236311320684e-07, "loss": 2.1387, "step": 3800 }, { "epoch": 0.2563929786732253, "grad_norm": 4.907888161497763, "learning_rate": 9.273185966489365e-07, "loss": 2.1502, "step": 3805 }, { "epoch": 0.256729894545332, "grad_norm": 5.245704924430023, "learning_rate": 9.270129710961318e-07, "loss": 2.054, "step": 3810 }, { "epoch": 0.2570668104174388, "grad_norm": 
4.582021279647821, "learning_rate": 9.267067548963974e-07, "loss": 2.0684, "step": 3815 }, { "epoch": 0.2574037262895455, "grad_norm": 5.037359803870131, "learning_rate": 9.263999484732934e-07, "loss": 2.1282, "step": 3820 }, { "epoch": 0.2577406421616522, "grad_norm": 5.067943616490096, "learning_rate": 9.260925522511962e-07, "loss": 2.1077, "step": 3825 }, { "epoch": 0.258077558033759, "grad_norm": 4.732014848069278, "learning_rate": 9.257845666552984e-07, "loss": 2.1263, "step": 3830 }, { "epoch": 0.2584144739058657, "grad_norm": 4.9337339427574864, "learning_rate": 9.254759921116073e-07, "loss": 2.1277, "step": 3835 }, { "epoch": 0.25875138977797246, "grad_norm": 5.23466396902256, "learning_rate": 9.251668290469452e-07, "loss": 2.1212, "step": 3840 }, { "epoch": 0.2590883056500792, "grad_norm": 4.624107436589284, "learning_rate": 9.248570778889484e-07, "loss": 2.0799, "step": 3845 }, { "epoch": 0.2594252215221859, "grad_norm": 5.613367560181378, "learning_rate": 9.245467390660664e-07, "loss": 2.0488, "step": 3850 }, { "epoch": 0.25976213739429266, "grad_norm": 5.004704320871015, "learning_rate": 9.242358130075618e-07, "loss": 2.1168, "step": 3855 }, { "epoch": 0.26009905326639937, "grad_norm": 5.064714414012089, "learning_rate": 9.239243001435093e-07, "loss": 2.0506, "step": 3860 }, { "epoch": 0.26043596913850614, "grad_norm": 5.190093482933379, "learning_rate": 9.236122009047957e-07, "loss": 2.0454, "step": 3865 }, { "epoch": 0.26077288501061285, "grad_norm": 5.461898806567617, "learning_rate": 9.232995157231182e-07, "loss": 2.0942, "step": 3870 }, { "epoch": 0.26110980088271957, "grad_norm": 5.026997470477699, "learning_rate": 9.229862450309851e-07, "loss": 2.0721, "step": 3875 }, { "epoch": 0.26144671675482634, "grad_norm": 5.020162207736259, "learning_rate": 9.226723892617141e-07, "loss": 2.119, "step": 3880 }, { "epoch": 0.26178363262693305, "grad_norm": 4.33225968391605, "learning_rate": 9.223579488494327e-07, "loss": 2.0575, "step": 3885 }, { "epoch": 
0.26212054849903976, "grad_norm": 5.052392720658369, "learning_rate": 9.220429242290763e-07, "loss": 2.0773, "step": 3890 }, { "epoch": 0.26245746437114653, "grad_norm": 5.143623112463767, "learning_rate": 9.217273158363894e-07, "loss": 2.0491, "step": 3895 }, { "epoch": 0.26279438024325324, "grad_norm": 4.568520568251716, "learning_rate": 9.214111241079232e-07, "loss": 2.1196, "step": 3900 }, { "epoch": 0.26313129611536, "grad_norm": 4.878833410713685, "learning_rate": 9.21094349481036e-07, "loss": 2.1359, "step": 3905 }, { "epoch": 0.2634682119874667, "grad_norm": 5.05747621735138, "learning_rate": 9.207769923938924e-07, "loss": 2.0722, "step": 3910 }, { "epoch": 0.26380512785957344, "grad_norm": 4.914803219567749, "learning_rate": 9.204590532854627e-07, "loss": 2.0051, "step": 3915 }, { "epoch": 0.2641420437316802, "grad_norm": 4.937823465589559, "learning_rate": 9.20140532595522e-07, "loss": 2.1506, "step": 3920 }, { "epoch": 0.2644789596037869, "grad_norm": 4.930603975737661, "learning_rate": 9.198214307646504e-07, "loss": 2.0663, "step": 3925 }, { "epoch": 0.2648158754758937, "grad_norm": 4.852386672033825, "learning_rate": 9.195017482342313e-07, "loss": 2.1335, "step": 3930 }, { "epoch": 0.2651527913480004, "grad_norm": 5.429696735181982, "learning_rate": 9.191814854464514e-07, "loss": 2.102, "step": 3935 }, { "epoch": 0.2654897072201071, "grad_norm": 4.949610421716279, "learning_rate": 9.188606428443002e-07, "loss": 2.0952, "step": 3940 }, { "epoch": 0.2658266230922139, "grad_norm": 4.97481901473734, "learning_rate": 9.185392208715692e-07, "loss": 2.0037, "step": 3945 }, { "epoch": 0.2661635389643206, "grad_norm": 5.469591847433898, "learning_rate": 9.182172199728513e-07, "loss": 2.0944, "step": 3950 }, { "epoch": 0.26650045483642737, "grad_norm": 5.541822156571981, "learning_rate": 9.178946405935398e-07, "loss": 2.1031, "step": 3955 }, { "epoch": 0.2668373707085341, "grad_norm": 5.242992333808231, "learning_rate": 9.175714831798287e-07, "loss": 2.1165, 
"step": 3960 }, { "epoch": 0.2671742865806408, "grad_norm": 5.190164926537672, "learning_rate": 9.172477481787113e-07, "loss": 2.0276, "step": 3965 }, { "epoch": 0.26751120245274757, "grad_norm": 5.104039011914211, "learning_rate": 9.169234360379796e-07, "loss": 2.0442, "step": 3970 }, { "epoch": 0.2678481183248543, "grad_norm": 5.840587725049626, "learning_rate": 9.165985472062244e-07, "loss": 2.1226, "step": 3975 }, { "epoch": 0.268185034196961, "grad_norm": 4.705576028724243, "learning_rate": 9.162730821328337e-07, "loss": 2.1044, "step": 3980 }, { "epoch": 0.26852195006906776, "grad_norm": 5.111744918405046, "learning_rate": 9.159470412679928e-07, "loss": 2.1237, "step": 3985 }, { "epoch": 0.2688588659411745, "grad_norm": 5.216341662319828, "learning_rate": 9.156204250626836e-07, "loss": 2.0976, "step": 3990 }, { "epoch": 0.26919578181328124, "grad_norm": 5.472298412187099, "learning_rate": 9.152932339686833e-07, "loss": 2.1299, "step": 3995 }, { "epoch": 0.26953269768538796, "grad_norm": 4.999134380092764, "learning_rate": 9.149654684385647e-07, "loss": 2.083, "step": 4000 }, { "epoch": 0.26986961355749467, "grad_norm": 5.585429773443229, "learning_rate": 9.146371289256952e-07, "loss": 2.0204, "step": 4005 }, { "epoch": 0.27020652942960144, "grad_norm": 4.951428255718392, "learning_rate": 9.143082158842359e-07, "loss": 2.1433, "step": 4010 }, { "epoch": 0.27054344530170815, "grad_norm": 4.9364008596854605, "learning_rate": 9.139787297691413e-07, "loss": 2.1349, "step": 4015 }, { "epoch": 0.2708803611738149, "grad_norm": 5.172632172917703, "learning_rate": 9.136486710361586e-07, "loss": 2.1487, "step": 4020 }, { "epoch": 0.27121727704592163, "grad_norm": 4.883153129333876, "learning_rate": 9.13318040141827e-07, "loss": 2.0773, "step": 4025 }, { "epoch": 0.27155419291802835, "grad_norm": 4.965474386231394, "learning_rate": 9.129868375434774e-07, "loss": 2.0651, "step": 4030 }, { "epoch": 0.2718911087901351, "grad_norm": 5.74776556807227, "learning_rate": 
9.12655063699231e-07, "loss": 2.0946, "step": 4035 }, { "epoch": 0.27222802466224183, "grad_norm": 5.487119107813184, "learning_rate": 9.123227190679994e-07, "loss": 2.0356, "step": 4040 }, { "epoch": 0.2725649405343486, "grad_norm": 5.224095460352556, "learning_rate": 9.119898041094838e-07, "loss": 2.1063, "step": 4045 }, { "epoch": 0.2729018564064553, "grad_norm": 5.3425168463539325, "learning_rate": 9.116563192841741e-07, "loss": 2.08, "step": 4050 }, { "epoch": 0.273238772278562, "grad_norm": 5.289426240139162, "learning_rate": 9.113222650533486e-07, "loss": 2.1386, "step": 4055 }, { "epoch": 0.2735756881506688, "grad_norm": 5.461342907381727, "learning_rate": 9.109876418790731e-07, "loss": 2.0768, "step": 4060 }, { "epoch": 0.2739126040227755, "grad_norm": 5.513015529946776, "learning_rate": 9.106524502242004e-07, "loss": 2.0847, "step": 4065 }, { "epoch": 0.2742495198948822, "grad_norm": 4.716472482704662, "learning_rate": 9.103166905523699e-07, "loss": 2.0444, "step": 4070 }, { "epoch": 0.274586435766989, "grad_norm": 4.860384305942716, "learning_rate": 9.099803633280059e-07, "loss": 2.1758, "step": 4075 }, { "epoch": 0.2749233516390957, "grad_norm": 5.211275507649361, "learning_rate": 9.096434690163184e-07, "loss": 2.0862, "step": 4080 }, { "epoch": 0.27526026751120247, "grad_norm": 5.362464544997193, "learning_rate": 9.093060080833019e-07, "loss": 2.0387, "step": 4085 }, { "epoch": 0.2755971833833092, "grad_norm": 5.484516382736862, "learning_rate": 9.089679809957343e-07, "loss": 2.0623, "step": 4090 }, { "epoch": 0.2759340992554159, "grad_norm": 5.794062219866951, "learning_rate": 9.086293882211768e-07, "loss": 2.143, "step": 4095 }, { "epoch": 0.27627101512752267, "grad_norm": 4.956008961881056, "learning_rate": 9.082902302279726e-07, "loss": 2.0802, "step": 4100 }, { "epoch": 0.2766079309996294, "grad_norm": 4.876623595672796, "learning_rate": 9.079505074852476e-07, "loss": 2.1075, "step": 4105 }, { "epoch": 0.27694484687173615, "grad_norm": 
5.476708145225155, "learning_rate": 9.076102204629082e-07, "loss": 2.0777, "step": 4110 }, { "epoch": 0.27728176274384286, "grad_norm": 4.6307710386583425, "learning_rate": 9.072693696316411e-07, "loss": 2.0837, "step": 4115 }, { "epoch": 0.2776186786159496, "grad_norm": 4.93005397103229, "learning_rate": 9.069279554629137e-07, "loss": 2.0951, "step": 4120 }, { "epoch": 0.27795559448805635, "grad_norm": 5.105536630050712, "learning_rate": 9.06585978428972e-07, "loss": 2.0705, "step": 4125 }, { "epoch": 0.27829251036016306, "grad_norm": 5.136899805936794, "learning_rate": 9.062434390028407e-07, "loss": 2.0895, "step": 4130 }, { "epoch": 0.27862942623226983, "grad_norm": 5.065858099181118, "learning_rate": 9.059003376583223e-07, "loss": 2.0972, "step": 4135 }, { "epoch": 0.27896634210437654, "grad_norm": 4.58212873962149, "learning_rate": 9.055566748699968e-07, "loss": 2.129, "step": 4140 }, { "epoch": 0.27930325797648325, "grad_norm": 4.853831159380806, "learning_rate": 9.052124511132204e-07, "loss": 2.0648, "step": 4145 }, { "epoch": 0.27964017384859, "grad_norm": 4.966330838608163, "learning_rate": 9.04867666864126e-07, "loss": 2.0505, "step": 4150 }, { "epoch": 0.27997708972069674, "grad_norm": 5.807516122883463, "learning_rate": 9.045223225996207e-07, "loss": 2.082, "step": 4155 }, { "epoch": 0.28031400559280345, "grad_norm": 5.403935297246928, "learning_rate": 9.041764187973871e-07, "loss": 2.068, "step": 4160 }, { "epoch": 0.2806509214649102, "grad_norm": 5.294699695637589, "learning_rate": 9.038299559358815e-07, "loss": 2.0835, "step": 4165 }, { "epoch": 0.28098783733701693, "grad_norm": 5.008046047769178, "learning_rate": 9.034829344943331e-07, "loss": 2.0855, "step": 4170 }, { "epoch": 0.2813247532091237, "grad_norm": 5.22237741960921, "learning_rate": 9.031353549527444e-07, "loss": 2.1033, "step": 4175 }, { "epoch": 0.2816616690812304, "grad_norm": 5.402754357452368, "learning_rate": 9.027872177918894e-07, "loss": 2.0413, "step": 4180 }, { "epoch": 
0.28199858495333713, "grad_norm": 4.461664661546047, "learning_rate": 9.024385234933134e-07, "loss": 2.1529, "step": 4185 }, { "epoch": 0.2823355008254439, "grad_norm": 5.068428378702728, "learning_rate": 9.020892725393326e-07, "loss": 2.0197, "step": 4190 }, { "epoch": 0.2826724166975506, "grad_norm": 5.420691591041084, "learning_rate": 9.017394654130332e-07, "loss": 2.0839, "step": 4195 }, { "epoch": 0.2830093325696574, "grad_norm": 4.893681105337855, "learning_rate": 9.013891025982703e-07, "loss": 2.1175, "step": 4200 }, { "epoch": 0.2833462484417641, "grad_norm": 4.963628086890727, "learning_rate": 9.010381845796677e-07, "loss": 2.0336, "step": 4205 }, { "epoch": 0.2836831643138708, "grad_norm": 5.472427389663241, "learning_rate": 9.006867118426178e-07, "loss": 2.0519, "step": 4210 }, { "epoch": 0.2840200801859776, "grad_norm": 5.1990838537966635, "learning_rate": 9.003346848732793e-07, "loss": 2.072, "step": 4215 }, { "epoch": 0.2843569960580843, "grad_norm": 4.561920658152314, "learning_rate": 8.999821041585787e-07, "loss": 2.0413, "step": 4220 }, { "epoch": 0.28469391193019106, "grad_norm": 4.978341130114082, "learning_rate": 8.996289701862072e-07, "loss": 2.1478, "step": 4225 }, { "epoch": 0.28503082780229777, "grad_norm": 5.1778735078941525, "learning_rate": 8.99275283444622e-07, "loss": 2.0946, "step": 4230 }, { "epoch": 0.2853677436744045, "grad_norm": 4.7428482132454475, "learning_rate": 8.989210444230449e-07, "loss": 2.0821, "step": 4235 }, { "epoch": 0.28570465954651125, "grad_norm": 5.106620887633692, "learning_rate": 8.985662536114612e-07, "loss": 2.1125, "step": 4240 }, { "epoch": 0.28604157541861797, "grad_norm": 5.112233245683654, "learning_rate": 8.9821091150062e-07, "loss": 2.0682, "step": 4245 }, { "epoch": 0.2863784912907247, "grad_norm": 5.6468388258364195, "learning_rate": 8.978550185820323e-07, "loss": 2.0588, "step": 4250 }, { "epoch": 0.28671540716283145, "grad_norm": 4.844686985245273, "learning_rate": 8.974985753479718e-07, "loss": 
2.0939, "step": 4255 }, { "epoch": 0.28705232303493816, "grad_norm": 4.835213339414971, "learning_rate": 8.971415822914726e-07, "loss": 2.069, "step": 4260 }, { "epoch": 0.28738923890704493, "grad_norm": 5.08824867635748, "learning_rate": 8.967840399063298e-07, "loss": 2.0488, "step": 4265 }, { "epoch": 0.28772615477915164, "grad_norm": 5.123211671594355, "learning_rate": 8.964259486870982e-07, "loss": 2.0568, "step": 4270 }, { "epoch": 0.28806307065125836, "grad_norm": 5.047654651702238, "learning_rate": 8.960673091290916e-07, "loss": 2.0287, "step": 4275 }, { "epoch": 0.2883999865233651, "grad_norm": 5.283809160434346, "learning_rate": 8.957081217283825e-07, "loss": 2.0906, "step": 4280 }, { "epoch": 0.28873690239547184, "grad_norm": 4.929448772951953, "learning_rate": 8.953483869818013e-07, "loss": 2.0662, "step": 4285 }, { "epoch": 0.2890738182675786, "grad_norm": 5.0883751352862046, "learning_rate": 8.949881053869348e-07, "loss": 2.0644, "step": 4290 }, { "epoch": 0.2894107341396853, "grad_norm": 5.156191491626767, "learning_rate": 8.946272774421271e-07, "loss": 2.0443, "step": 4295 }, { "epoch": 0.28974765001179204, "grad_norm": 5.129336069439232, "learning_rate": 8.942659036464775e-07, "loss": 2.0779, "step": 4300 }, { "epoch": 0.2900845658838988, "grad_norm": 4.808545857243519, "learning_rate": 8.939039844998403e-07, "loss": 2.0824, "step": 4305 }, { "epoch": 0.2904214817560055, "grad_norm": 4.879408754308263, "learning_rate": 8.935415205028243e-07, "loss": 2.0667, "step": 4310 }, { "epoch": 0.2907583976281123, "grad_norm": 4.684935721711347, "learning_rate": 8.931785121567921e-07, "loss": 2.0301, "step": 4315 }, { "epoch": 0.291095313500219, "grad_norm": 5.129671220444328, "learning_rate": 8.928149599638588e-07, "loss": 2.0931, "step": 4320 }, { "epoch": 0.2914322293723257, "grad_norm": 5.108483924119315, "learning_rate": 8.924508644268921e-07, "loss": 2.1691, "step": 4325 }, { "epoch": 0.2917691452444325, "grad_norm": 4.796142901707757, "learning_rate": 
8.920862260495111e-07, "loss": 2.0936, "step": 4330 }, { "epoch": 0.2921060611165392, "grad_norm": 5.332410066424601, "learning_rate": 8.917210453360859e-07, "loss": 2.1151, "step": 4335 }, { "epoch": 0.2924429769886459, "grad_norm": 5.469884905880985, "learning_rate": 8.913553227917365e-07, "loss": 2.0427, "step": 4340 }, { "epoch": 0.2927798928607527, "grad_norm": 5.36403523090647, "learning_rate": 8.909890589223329e-07, "loss": 2.0812, "step": 4345 }, { "epoch": 0.2931168087328594, "grad_norm": 4.948291769804187, "learning_rate": 8.906222542344932e-07, "loss": 2.0991, "step": 4350 }, { "epoch": 0.29345372460496616, "grad_norm": 4.839966601467887, "learning_rate": 8.902549092355839e-07, "loss": 2.1014, "step": 4355 }, { "epoch": 0.2937906404770729, "grad_norm": 5.215551815148696, "learning_rate": 8.898870244337189e-07, "loss": 2.1349, "step": 4360 }, { "epoch": 0.2941275563491796, "grad_norm": 5.261800348014333, "learning_rate": 8.895186003377586e-07, "loss": 2.0959, "step": 4365 }, { "epoch": 0.29446447222128636, "grad_norm": 5.640869470976805, "learning_rate": 8.891496374573095e-07, "loss": 2.1479, "step": 4370 }, { "epoch": 0.29480138809339307, "grad_norm": 5.215116162863993, "learning_rate": 8.887801363027233e-07, "loss": 2.1, "step": 4375 }, { "epoch": 0.29513830396549984, "grad_norm": 4.82522284342537, "learning_rate": 8.884100973850962e-07, "loss": 2.1284, "step": 4380 }, { "epoch": 0.29547521983760655, "grad_norm": 4.69882418318293, "learning_rate": 8.880395212162684e-07, "loss": 2.1392, "step": 4385 }, { "epoch": 0.29581213570971326, "grad_norm": 5.397304952515055, "learning_rate": 8.87668408308823e-07, "loss": 2.0091, "step": 4390 }, { "epoch": 0.29614905158182003, "grad_norm": 4.9261235433647625, "learning_rate": 8.872967591760856e-07, "loss": 2.0503, "step": 4395 }, { "epoch": 0.29648596745392675, "grad_norm": 4.747965263387588, "learning_rate": 8.869245743321234e-07, "loss": 2.1174, "step": 4400 }, { "epoch": 0.2968228833260335, "grad_norm": 
5.462489400206193, "learning_rate": 8.865518542917452e-07, "loss": 2.0092, "step": 4405 }, { "epoch": 0.29715979919814023, "grad_norm": 5.151148740626112, "learning_rate": 8.861785995704991e-07, "loss": 2.1163, "step": 4410 }, { "epoch": 0.29749671507024694, "grad_norm": 4.845179717107491, "learning_rate": 8.858048106846735e-07, "loss": 2.0974, "step": 4415 }, { "epoch": 0.2978336309423537, "grad_norm": 5.215535840230219, "learning_rate": 8.854304881512955e-07, "loss": 2.0426, "step": 4420 }, { "epoch": 0.2981705468144604, "grad_norm": 5.242632303884169, "learning_rate": 8.850556324881302e-07, "loss": 2.1205, "step": 4425 }, { "epoch": 0.29850746268656714, "grad_norm": 4.828851598384878, "learning_rate": 8.846802442136804e-07, "loss": 2.0565, "step": 4430 }, { "epoch": 0.2988443785586739, "grad_norm": 4.601719218531041, "learning_rate": 8.843043238471853e-07, "loss": 2.1046, "step": 4435 }, { "epoch": 0.2991812944307806, "grad_norm": 5.015114975717491, "learning_rate": 8.839278719086201e-07, "loss": 2.0827, "step": 4440 }, { "epoch": 0.2995182103028874, "grad_norm": 4.75196386213993, "learning_rate": 8.835508889186956e-07, "loss": 2.1087, "step": 4445 }, { "epoch": 0.2998551261749941, "grad_norm": 5.5558247716183455, "learning_rate": 8.83173375398857e-07, "loss": 2.0827, "step": 4450 }, { "epoch": 0.3001920420471008, "grad_norm": 5.174050923357454, "learning_rate": 8.827953318712831e-07, "loss": 2.0827, "step": 4455 }, { "epoch": 0.3005289579192076, "grad_norm": 4.871535779241164, "learning_rate": 8.824167588588861e-07, "loss": 2.044, "step": 4460 }, { "epoch": 0.3008658737913143, "grad_norm": 5.228255647449934, "learning_rate": 8.820376568853105e-07, "loss": 2.059, "step": 4465 }, { "epoch": 0.30120278966342107, "grad_norm": 5.304050722002813, "learning_rate": 8.816580264749325e-07, "loss": 2.0719, "step": 4470 }, { "epoch": 0.3015397055355278, "grad_norm": 4.364854684578915, "learning_rate": 8.81277868152859e-07, "loss": 2.0928, "step": 4475 }, { "epoch": 
0.3018766214076345, "grad_norm": 5.065760757894287, "learning_rate": 8.808971824449274e-07, "loss": 2.0765, "step": 4480 }, { "epoch": 0.30221353727974126, "grad_norm": 4.717955993721478, "learning_rate": 8.805159698777045e-07, "loss": 2.0756, "step": 4485 }, { "epoch": 0.302550453151848, "grad_norm": 5.323718062713012, "learning_rate": 8.801342309784858e-07, "loss": 2.06, "step": 4490 }, { "epoch": 0.30288736902395474, "grad_norm": 4.524290835283999, "learning_rate": 8.79751966275295e-07, "loss": 2.0237, "step": 4495 }, { "epoch": 0.30322428489606146, "grad_norm": 5.492024806251386, "learning_rate": 8.793691762968827e-07, "loss": 2.0639, "step": 4500 }, { "epoch": 0.30356120076816817, "grad_norm": 5.487782597284725, "learning_rate": 8.789858615727264e-07, "loss": 2.0788, "step": 4505 }, { "epoch": 0.30389811664027494, "grad_norm": 5.077644254937289, "learning_rate": 8.786020226330295e-07, "loss": 2.073, "step": 4510 }, { "epoch": 0.30423503251238165, "grad_norm": 5.134705657531099, "learning_rate": 8.782176600087203e-07, "loss": 2.0221, "step": 4515 }, { "epoch": 0.30457194838448837, "grad_norm": 4.674268880491484, "learning_rate": 8.778327742314513e-07, "loss": 2.0723, "step": 4520 }, { "epoch": 0.30490886425659514, "grad_norm": 4.96814012606218, "learning_rate": 8.77447365833599e-07, "loss": 2.0658, "step": 4525 }, { "epoch": 0.30524578012870185, "grad_norm": 5.3368681840058505, "learning_rate": 8.770614353482628e-07, "loss": 2.0943, "step": 4530 }, { "epoch": 0.3055826960008086, "grad_norm": 4.7634928820743525, "learning_rate": 8.766749833092638e-07, "loss": 2.1366, "step": 4535 }, { "epoch": 0.30591961187291533, "grad_norm": 4.6930717601990635, "learning_rate": 8.76288010251145e-07, "loss": 2.0791, "step": 4540 }, { "epoch": 0.30625652774502204, "grad_norm": 4.99741022453889, "learning_rate": 8.759005167091697e-07, "loss": 2.1092, "step": 4545 }, { "epoch": 0.3065934436171288, "grad_norm": 5.36042807723262, "learning_rate": 8.755125032193214e-07, "loss": 
2.0085, "step": 4550 }, { "epoch": 0.3069303594892355, "grad_norm": 5.1648390264447475, "learning_rate": 8.751239703183026e-07, "loss": 2.0757, "step": 4555 }, { "epoch": 0.3072672753613423, "grad_norm": 4.663668431465443, "learning_rate": 8.747349185435348e-07, "loss": 2.062, "step": 4560 }, { "epoch": 0.307604191233449, "grad_norm": 5.176851032925634, "learning_rate": 8.743453484331562e-07, "loss": 2.1277, "step": 4565 }, { "epoch": 0.3079411071055557, "grad_norm": 4.912651962535717, "learning_rate": 8.73955260526023e-07, "loss": 2.0508, "step": 4570 }, { "epoch": 0.3082780229776625, "grad_norm": 4.783331630388288, "learning_rate": 8.735646553617069e-07, "loss": 2.1331, "step": 4575 }, { "epoch": 0.3086149388497692, "grad_norm": 5.028058462912641, "learning_rate": 8.731735334804953e-07, "loss": 2.0889, "step": 4580 }, { "epoch": 0.308951854721876, "grad_norm": 5.0643578126144835, "learning_rate": 8.727818954233904e-07, "loss": 2.073, "step": 4585 }, { "epoch": 0.3092887705939827, "grad_norm": 5.386754477737607, "learning_rate": 8.723897417321084e-07, "loss": 2.0569, "step": 4590 }, { "epoch": 0.3096256864660894, "grad_norm": 5.49702767540202, "learning_rate": 8.719970729490788e-07, "loss": 2.0929, "step": 4595 }, { "epoch": 0.30996260233819617, "grad_norm": 4.989203149378738, "learning_rate": 8.716038896174432e-07, "loss": 2.099, "step": 4600 }, { "epoch": 0.3102995182103029, "grad_norm": 5.014897745835798, "learning_rate": 8.712101922810551e-07, "loss": 2.0927, "step": 4605 }, { "epoch": 0.3106364340824096, "grad_norm": 4.85891889711565, "learning_rate": 8.708159814844793e-07, "loss": 2.0824, "step": 4610 }, { "epoch": 0.31097334995451636, "grad_norm": 4.757210294722209, "learning_rate": 8.704212577729905e-07, "loss": 2.1263, "step": 4615 }, { "epoch": 0.3113102658266231, "grad_norm": 5.020268029600836, "learning_rate": 8.700260216925728e-07, "loss": 2.0447, "step": 4620 }, { "epoch": 0.31164718169872985, "grad_norm": 4.980528539041979, "learning_rate": 
8.696302737899192e-07, "loss": 2.0864, "step": 4625 }, { "epoch": 0.31198409757083656, "grad_norm": 5.260423063984446, "learning_rate": 8.692340146124308e-07, "loss": 2.0524, "step": 4630 }, { "epoch": 0.3123210134429433, "grad_norm": 5.155266779942804, "learning_rate": 8.688372447082153e-07, "loss": 2.079, "step": 4635 }, { "epoch": 0.31265792931505004, "grad_norm": 5.455965351333528, "learning_rate": 8.684399646260876e-07, "loss": 2.0576, "step": 4640 }, { "epoch": 0.31299484518715676, "grad_norm": 5.190871689099343, "learning_rate": 8.680421749155677e-07, "loss": 2.0555, "step": 4645 }, { "epoch": 0.3133317610592635, "grad_norm": 5.209172855193582, "learning_rate": 8.676438761268808e-07, "loss": 2.1247, "step": 4650 }, { "epoch": 0.31366867693137024, "grad_norm": 5.189773883910638, "learning_rate": 8.672450688109563e-07, "loss": 2.1327, "step": 4655 }, { "epoch": 0.31400559280347695, "grad_norm": 5.146127235898143, "learning_rate": 8.668457535194267e-07, "loss": 2.1604, "step": 4660 }, { "epoch": 0.3143425086755837, "grad_norm": 4.9719274016713815, "learning_rate": 8.664459308046274e-07, "loss": 2.0979, "step": 4665 }, { "epoch": 0.31467942454769043, "grad_norm": 5.1696980529853676, "learning_rate": 8.660456012195957e-07, "loss": 2.0577, "step": 4670 }, { "epoch": 0.3150163404197972, "grad_norm": 5.236633726313689, "learning_rate": 8.656447653180699e-07, "loss": 2.0378, "step": 4675 }, { "epoch": 0.3153532562919039, "grad_norm": 5.198350528243728, "learning_rate": 8.652434236544886e-07, "loss": 2.099, "step": 4680 }, { "epoch": 0.31569017216401063, "grad_norm": 4.993578414676582, "learning_rate": 8.648415767839899e-07, "loss": 2.0795, "step": 4685 }, { "epoch": 0.3160270880361174, "grad_norm": 5.1670844493697565, "learning_rate": 8.644392252624108e-07, "loss": 2.0192, "step": 4690 }, { "epoch": 0.3163640039082241, "grad_norm": 4.633632970260526, "learning_rate": 8.640363696462869e-07, "loss": 2.1075, "step": 4695 }, { "epoch": 0.3167009197803308, "grad_norm": 
4.6688859608739035, "learning_rate": 8.636330104928499e-07, "loss": 2.0803, "step": 4700 }, { "epoch": 0.3170378356524376, "grad_norm": 5.2350003438271315, "learning_rate": 8.632291483600289e-07, "loss": 2.0469, "step": 4705 }, { "epoch": 0.3173747515245443, "grad_norm": 5.138909941031502, "learning_rate": 8.628247838064485e-07, "loss": 2.0583, "step": 4710 }, { "epoch": 0.3177116673966511, "grad_norm": 5.147737077236824, "learning_rate": 8.624199173914279e-07, "loss": 2.1478, "step": 4715 }, { "epoch": 0.3180485832687578, "grad_norm": 4.966986947346673, "learning_rate": 8.620145496749811e-07, "loss": 2.0837, "step": 4720 }, { "epoch": 0.3183854991408645, "grad_norm": 5.891443309158824, "learning_rate": 8.616086812178151e-07, "loss": 2.0909, "step": 4725 }, { "epoch": 0.31872241501297127, "grad_norm": 5.026702637047485, "learning_rate": 8.612023125813296e-07, "loss": 2.1025, "step": 4730 }, { "epoch": 0.319059330885078, "grad_norm": 4.7389562686246505, "learning_rate": 8.607954443276162e-07, "loss": 2.1226, "step": 4735 }, { "epoch": 0.31939624675718475, "grad_norm": 4.7105310853932405, "learning_rate": 8.603880770194574e-07, "loss": 2.091, "step": 4740 }, { "epoch": 0.31973316262929147, "grad_norm": 4.966992404612188, "learning_rate": 8.59980211220326e-07, "loss": 2.0678, "step": 4745 }, { "epoch": 0.3200700785013982, "grad_norm": 5.403582305237505, "learning_rate": 8.595718474943849e-07, "loss": 2.0604, "step": 4750 }, { "epoch": 0.32040699437350495, "grad_norm": 4.941271845316805, "learning_rate": 8.591629864064851e-07, "loss": 2.0939, "step": 4755 }, { "epoch": 0.32074391024561166, "grad_norm": 5.261784865610782, "learning_rate": 8.587536285221655e-07, "loss": 2.1472, "step": 4760 }, { "epoch": 0.32108082611771843, "grad_norm": 5.068874440509146, "learning_rate": 8.583437744076527e-07, "loss": 2.0791, "step": 4765 }, { "epoch": 0.32141774198982515, "grad_norm": 5.38626589270778, "learning_rate": 8.579334246298592e-07, "loss": 2.0903, "step": 4770 }, { "epoch": 
0.32175465786193186, "grad_norm": 4.583955700063932, "learning_rate": 8.575225797563834e-07, "loss": 2.1252, "step": 4775 }, { "epoch": 0.3220915737340386, "grad_norm": 4.722665587401526, "learning_rate": 8.571112403555083e-07, "loss": 2.0616, "step": 4780 }, { "epoch": 0.32242848960614534, "grad_norm": 4.990440704124639, "learning_rate": 8.566994069962012e-07, "loss": 2.0864, "step": 4785 }, { "epoch": 0.32276540547825205, "grad_norm": 4.8413489467426745, "learning_rate": 8.562870802481126e-07, "loss": 2.1148, "step": 4790 }, { "epoch": 0.3231023213503588, "grad_norm": 5.203058287228736, "learning_rate": 8.55874260681575e-07, "loss": 2.0579, "step": 4795 }, { "epoch": 0.32343923722246554, "grad_norm": 5.206080372043112, "learning_rate": 8.554609488676032e-07, "loss": 2.0687, "step": 4800 }, { "epoch": 0.3237761530945723, "grad_norm": 5.303249608378167, "learning_rate": 8.550471453778925e-07, "loss": 2.0626, "step": 4805 }, { "epoch": 0.324113068966679, "grad_norm": 5.352598742491536, "learning_rate": 8.546328507848184e-07, "loss": 2.0637, "step": 4810 }, { "epoch": 0.32444998483878573, "grad_norm": 4.95894081126887, "learning_rate": 8.542180656614358e-07, "loss": 2.0265, "step": 4815 }, { "epoch": 0.3247869007108925, "grad_norm": 4.909822728203148, "learning_rate": 8.538027905814778e-07, "loss": 2.0506, "step": 4820 }, { "epoch": 0.3251238165829992, "grad_norm": 4.744534945830245, "learning_rate": 8.533870261193556e-07, "loss": 1.9822, "step": 4825 }, { "epoch": 0.325460732455106, "grad_norm": 5.253967170411138, "learning_rate": 8.529707728501571e-07, "loss": 2.0288, "step": 4830 }, { "epoch": 0.3257976483272127, "grad_norm": 5.09535372619165, "learning_rate": 8.525540313496462e-07, "loss": 2.0568, "step": 4835 }, { "epoch": 0.3261345641993194, "grad_norm": 4.744277479698655, "learning_rate": 8.521368021942623e-07, "loss": 2.1264, "step": 4840 }, { "epoch": 0.3264714800714262, "grad_norm": 5.329160719492522, "learning_rate": 8.517190859611195e-07, "loss": 2.027, 
"step": 4845 }, { "epoch": 0.3268083959435329, "grad_norm": 5.382961738745272, "learning_rate": 8.513008832280053e-07, "loss": 2.1525, "step": 4850 }, { "epoch": 0.32714531181563966, "grad_norm": 5.589950743722125, "learning_rate": 8.508821945733802e-07, "loss": 2.0865, "step": 4855 }, { "epoch": 0.3274822276877464, "grad_norm": 5.248155823011026, "learning_rate": 8.504630205763768e-07, "loss": 2.0801, "step": 4860 }, { "epoch": 0.3278191435598531, "grad_norm": 4.797376699185984, "learning_rate": 8.500433618167992e-07, "loss": 2.0934, "step": 4865 }, { "epoch": 0.32815605943195986, "grad_norm": 4.951149281089263, "learning_rate": 8.496232188751222e-07, "loss": 2.0549, "step": 4870 }, { "epoch": 0.32849297530406657, "grad_norm": 4.88081521662314, "learning_rate": 8.492025923324897e-07, "loss": 2.057, "step": 4875 }, { "epoch": 0.3288298911761733, "grad_norm": 5.1089174823423145, "learning_rate": 8.487814827707152e-07, "loss": 2.0893, "step": 4880 }, { "epoch": 0.32916680704828005, "grad_norm": 5.217469502105842, "learning_rate": 8.483598907722795e-07, "loss": 2.097, "step": 4885 }, { "epoch": 0.32950372292038677, "grad_norm": 4.880352649572809, "learning_rate": 8.479378169203317e-07, "loss": 2.0007, "step": 4890 }, { "epoch": 0.32984063879249353, "grad_norm": 5.286044409529973, "learning_rate": 8.475152617986869e-07, "loss": 2.0416, "step": 4895 }, { "epoch": 0.33017755466460025, "grad_norm": 5.1679612593491555, "learning_rate": 8.470922259918254e-07, "loss": 2.0304, "step": 4900 }, { "epoch": 0.33051447053670696, "grad_norm": 4.894984437941719, "learning_rate": 8.466687100848935e-07, "loss": 2.087, "step": 4905 }, { "epoch": 0.33085138640881373, "grad_norm": 5.390012019049476, "learning_rate": 8.462447146637006e-07, "loss": 2.0454, "step": 4910 }, { "epoch": 0.33118830228092044, "grad_norm": 4.929442886154672, "learning_rate": 8.458202403147199e-07, "loss": 2.0762, "step": 4915 }, { "epoch": 0.3315252181530272, "grad_norm": 5.3362599442987895, "learning_rate": 
8.453952876250867e-07, "loss": 2.0349, "step": 4920 }, { "epoch": 0.3318621340251339, "grad_norm": 5.183932147583344, "learning_rate": 8.449698571825984e-07, "loss": 2.1022, "step": 4925 }, { "epoch": 0.33219904989724064, "grad_norm": 5.042476238841692, "learning_rate": 8.445439495757127e-07, "loss": 2.0559, "step": 4930 }, { "epoch": 0.3325359657693474, "grad_norm": 5.166419584721339, "learning_rate": 8.44117565393548e-07, "loss": 2.0341, "step": 4935 }, { "epoch": 0.3328728816414541, "grad_norm": 4.797789333928606, "learning_rate": 8.436907052258808e-07, "loss": 2.1043, "step": 4940 }, { "epoch": 0.3332097975135609, "grad_norm": 5.447154329767861, "learning_rate": 8.432633696631473e-07, "loss": 2.1311, "step": 4945 }, { "epoch": 0.3335467133856676, "grad_norm": 4.8713808241121415, "learning_rate": 8.428355592964405e-07, "loss": 2.0932, "step": 4950 }, { "epoch": 0.3338836292577743, "grad_norm": 4.583881700115357, "learning_rate": 8.424072747175102e-07, "loss": 2.1103, "step": 4955 }, { "epoch": 0.3342205451298811, "grad_norm": 4.8201102996091905, "learning_rate": 8.419785165187621e-07, "loss": 1.9829, "step": 4960 }, { "epoch": 0.3345574610019878, "grad_norm": 5.020819931702565, "learning_rate": 8.415492852932573e-07, "loss": 2.0151, "step": 4965 }, { "epoch": 0.3348943768740945, "grad_norm": 5.378993917549338, "learning_rate": 8.41119581634711e-07, "loss": 2.1525, "step": 4970 }, { "epoch": 0.3352312927462013, "grad_norm": 5.076337376668576, "learning_rate": 8.406894061374918e-07, "loss": 2.1278, "step": 4975 }, { "epoch": 0.335568208618308, "grad_norm": 5.2414800515637845, "learning_rate": 8.402587593966213e-07, "loss": 2.0372, "step": 4980 }, { "epoch": 0.33590512449041476, "grad_norm": 4.832509602578521, "learning_rate": 8.398276420077726e-07, "loss": 2.103, "step": 4985 }, { "epoch": 0.3362420403625215, "grad_norm": 5.005968987567552, "learning_rate": 8.393960545672698e-07, "loss": 2.052, "step": 4990 }, { "epoch": 0.3365789562346282, "grad_norm": 
5.001112270491913, "learning_rate": 8.389639976720873e-07, "loss": 2.0655, "step": 4995 }, { "epoch": 0.33691587210673496, "grad_norm": 5.301269047094812, "learning_rate": 8.385314719198487e-07, "loss": 2.0794, "step": 5000 }, { "epoch": 0.3372527879788417, "grad_norm": 4.957529187799194, "learning_rate": 8.380984779088264e-07, "loss": 2.0777, "step": 5005 }, { "epoch": 0.33758970385094844, "grad_norm": 5.6862950440377285, "learning_rate": 8.376650162379404e-07, "loss": 2.1196, "step": 5010 }, { "epoch": 0.33792661972305515, "grad_norm": 4.790690555777137, "learning_rate": 8.372310875067572e-07, "loss": 2.1118, "step": 5015 }, { "epoch": 0.33826353559516187, "grad_norm": 5.060774592547441, "learning_rate": 8.367966923154899e-07, "loss": 1.9834, "step": 5020 }, { "epoch": 0.33860045146726864, "grad_norm": 5.016006873582748, "learning_rate": 8.363618312649967e-07, "loss": 2.0651, "step": 5025 }, { "epoch": 0.33893736733937535, "grad_norm": 4.994329144569729, "learning_rate": 8.359265049567796e-07, "loss": 2.0455, "step": 5030 }, { "epoch": 0.3392742832114821, "grad_norm": 4.377627333746808, "learning_rate": 8.35490713992985e-07, "loss": 2.0893, "step": 5035 }, { "epoch": 0.33961119908358883, "grad_norm": 5.287296886740067, "learning_rate": 8.350544589764015e-07, "loss": 2.0947, "step": 5040 }, { "epoch": 0.33994811495569555, "grad_norm": 5.019224243228127, "learning_rate": 8.346177405104595e-07, "loss": 2.0622, "step": 5045 }, { "epoch": 0.3402850308278023, "grad_norm": 5.371315717631444, "learning_rate": 8.341805591992308e-07, "loss": 2.1034, "step": 5050 }, { "epoch": 0.34062194669990903, "grad_norm": 4.680299265382303, "learning_rate": 8.337429156474272e-07, "loss": 2.0011, "step": 5055 }, { "epoch": 0.34095886257201574, "grad_norm": 4.696739322151472, "learning_rate": 8.333048104603999e-07, "loss": 2.0144, "step": 5060 }, { "epoch": 0.3412957784441225, "grad_norm": 5.131210455434402, "learning_rate": 8.328662442441388e-07, "loss": 1.9858, "step": 5065 }, { 
"epoch": 0.3416326943162292, "grad_norm": 4.780047703678763, "learning_rate": 8.32427217605271e-07, "loss": 2.0967, "step": 5070 }, { "epoch": 0.341969610188336, "grad_norm": 4.940415994812095, "learning_rate": 8.319877311510612e-07, "loss": 2.1062, "step": 5075 }, { "epoch": 0.3423065260604427, "grad_norm": 5.068956997078487, "learning_rate": 8.315477854894095e-07, "loss": 2.0431, "step": 5080 }, { "epoch": 0.3426434419325494, "grad_norm": 4.891259347285673, "learning_rate": 8.311073812288513e-07, "loss": 2.0922, "step": 5085 }, { "epoch": 0.3429803578046562, "grad_norm": 5.344945579307675, "learning_rate": 8.306665189785567e-07, "loss": 2.0644, "step": 5090 }, { "epoch": 0.3433172736767629, "grad_norm": 4.962871737646247, "learning_rate": 8.302251993483289e-07, "loss": 2.0865, "step": 5095 }, { "epoch": 0.34365418954886967, "grad_norm": 5.322617710582742, "learning_rate": 8.297834229486039e-07, "loss": 2.0397, "step": 5100 }, { "epoch": 0.3439911054209764, "grad_norm": 4.861192903436849, "learning_rate": 8.293411903904496e-07, "loss": 2.0182, "step": 5105 }, { "epoch": 0.3443280212930831, "grad_norm": 4.997444357267866, "learning_rate": 8.288985022855645e-07, "loss": 2.1188, "step": 5110 }, { "epoch": 0.34466493716518987, "grad_norm": 5.179248492048811, "learning_rate": 8.284553592462778e-07, "loss": 2.1037, "step": 5115 }, { "epoch": 0.3450018530372966, "grad_norm": 4.770032146916437, "learning_rate": 8.280117618855475e-07, "loss": 2.106, "step": 5120 }, { "epoch": 0.34533876890940335, "grad_norm": 5.053406533485786, "learning_rate": 8.2756771081696e-07, "loss": 2.1112, "step": 5125 }, { "epoch": 0.34567568478151006, "grad_norm": 5.431028187589875, "learning_rate": 8.271232066547296e-07, "loss": 2.0514, "step": 5130 }, { "epoch": 0.3460126006536168, "grad_norm": 4.550796341013888, "learning_rate": 8.266782500136971e-07, "loss": 2.0971, "step": 5135 }, { "epoch": 0.34634951652572354, "grad_norm": 5.1892082060863665, "learning_rate": 8.262328415093293e-07, "loss": 
2.0866, "step": 5140 }, { "epoch": 0.34668643239783026, "grad_norm": 5.223463818526897, "learning_rate": 8.257869817577179e-07, "loss": 2.0908, "step": 5145 }, { "epoch": 0.34702334826993697, "grad_norm": 4.763239730532741, "learning_rate": 8.253406713755786e-07, "loss": 2.037, "step": 5150 }, { "epoch": 0.34736026414204374, "grad_norm": 5.249039538211549, "learning_rate": 8.24893910980251e-07, "loss": 1.9998, "step": 5155 }, { "epoch": 0.34769718001415045, "grad_norm": 5.02886451438739, "learning_rate": 8.244467011896965e-07, "loss": 1.9938, "step": 5160 }, { "epoch": 0.3480340958862572, "grad_norm": 4.605264391692436, "learning_rate": 8.239990426224986e-07, "loss": 2.0797, "step": 5165 }, { "epoch": 0.34837101175836394, "grad_norm": 4.953009240448718, "learning_rate": 8.235509358978611e-07, "loss": 2.0905, "step": 5170 }, { "epoch": 0.34870792763047065, "grad_norm": 5.476345971516692, "learning_rate": 8.231023816356081e-07, "loss": 2.0159, "step": 5175 }, { "epoch": 0.3490448435025774, "grad_norm": 5.631885656515458, "learning_rate": 8.226533804561826e-07, "loss": 2.0373, "step": 5180 }, { "epoch": 0.34938175937468413, "grad_norm": 5.108546479600068, "learning_rate": 8.222039329806456e-07, "loss": 2.0453, "step": 5185 }, { "epoch": 0.3497186752467909, "grad_norm": 5.308011913799786, "learning_rate": 8.217540398306757e-07, "loss": 2.0047, "step": 5190 }, { "epoch": 0.3500555911188976, "grad_norm": 5.000884578780172, "learning_rate": 8.213037016285679e-07, "loss": 2.0843, "step": 5195 }, { "epoch": 0.3503925069910043, "grad_norm": 5.473643383084012, "learning_rate": 8.208529189972325e-07, "loss": 2.0816, "step": 5200 }, { "epoch": 0.3507294228631111, "grad_norm": 5.1835831280043445, "learning_rate": 8.204016925601951e-07, "loss": 2.0966, "step": 5205 }, { "epoch": 0.3510663387352178, "grad_norm": 5.250785091129774, "learning_rate": 8.199500229415945e-07, "loss": 2.0384, "step": 5210 }, { "epoch": 0.3514032546073246, "grad_norm": 5.08453932805917, "learning_rate": 
8.19497910766183e-07, "loss": 2.0712, "step": 5215 }, { "epoch": 0.3517401704794313, "grad_norm": 4.80109463609141, "learning_rate": 8.19045356659325e-07, "loss": 2.077, "step": 5220 }, { "epoch": 0.352077086351538, "grad_norm": 5.284150080175318, "learning_rate": 8.185923612469958e-07, "loss": 2.0371, "step": 5225 }, { "epoch": 0.3524140022236448, "grad_norm": 5.099676017847825, "learning_rate": 8.181389251557817e-07, "loss": 2.0404, "step": 5230 }, { "epoch": 0.3527509180957515, "grad_norm": 5.078340583026754, "learning_rate": 8.176850490128782e-07, "loss": 2.0258, "step": 5235 }, { "epoch": 0.3530878339678582, "grad_norm": 5.18100461184939, "learning_rate": 8.172307334460892e-07, "loss": 2.1135, "step": 5240 }, { "epoch": 0.35342474983996497, "grad_norm": 5.403819766947706, "learning_rate": 8.167759790838273e-07, "loss": 2.0619, "step": 5245 }, { "epoch": 0.3537616657120717, "grad_norm": 5.314659280414644, "learning_rate": 8.163207865551111e-07, "loss": 2.002, "step": 5250 }, { "epoch": 0.35409858158417845, "grad_norm": 5.206589614863503, "learning_rate": 8.158651564895657e-07, "loss": 2.0165, "step": 5255 }, { "epoch": 0.35443549745628516, "grad_norm": 5.067263122709279, "learning_rate": 8.154090895174215e-07, "loss": 2.0578, "step": 5260 }, { "epoch": 0.3547724133283919, "grad_norm": 5.123828621347462, "learning_rate": 8.149525862695131e-07, "loss": 2.1059, "step": 5265 }, { "epoch": 0.35510932920049865, "grad_norm": 4.690114884908356, "learning_rate": 8.144956473772784e-07, "loss": 2.0636, "step": 5270 }, { "epoch": 0.35544624507260536, "grad_norm": 5.251155109674323, "learning_rate": 8.140382734727581e-07, "loss": 2.0991, "step": 5275 }, { "epoch": 0.35578316094471213, "grad_norm": 4.779070699662453, "learning_rate": 8.135804651885946e-07, "loss": 2.0971, "step": 5280 }, { "epoch": 0.35612007681681884, "grad_norm": 5.5990611186101935, "learning_rate": 8.131222231580313e-07, "loss": 2.0311, "step": 5285 }, { "epoch": 0.35645699268892556, "grad_norm": 
4.78982137702585, "learning_rate": 8.126635480149107e-07, "loss": 2.0727, "step": 5290 }, { "epoch": 0.3567939085610323, "grad_norm": 5.087761359613712, "learning_rate": 8.122044403936759e-07, "loss": 2.0607, "step": 5295 }, { "epoch": 0.35713082443313904, "grad_norm": 5.0952399545296, "learning_rate": 8.117449009293668e-07, "loss": 2.0667, "step": 5300 }, { "epoch": 0.3574677403052458, "grad_norm": 5.026912861866033, "learning_rate": 8.112849302576212e-07, "loss": 2.0716, "step": 5305 }, { "epoch": 0.3578046561773525, "grad_norm": 4.56598807161444, "learning_rate": 8.108245290146735e-07, "loss": 1.9623, "step": 5310 }, { "epoch": 0.35814157204945923, "grad_norm": 4.976326774771134, "learning_rate": 8.103636978373534e-07, "loss": 2.0149, "step": 5315 }, { "epoch": 0.358478487921566, "grad_norm": 5.381796243590813, "learning_rate": 8.099024373630854e-07, "loss": 1.9914, "step": 5320 }, { "epoch": 0.3588154037936727, "grad_norm": 5.092026265671812, "learning_rate": 8.094407482298877e-07, "loss": 2.0615, "step": 5325 }, { "epoch": 0.35915231966577943, "grad_norm": 4.920572387145095, "learning_rate": 8.089786310763716e-07, "loss": 2.0555, "step": 5330 }, { "epoch": 0.3594892355378862, "grad_norm": 4.632584891649527, "learning_rate": 8.085160865417403e-07, "loss": 1.9994, "step": 5335 }, { "epoch": 0.3598261514099929, "grad_norm": 5.150724074164352, "learning_rate": 8.080531152657884e-07, "loss": 2.0117, "step": 5340 }, { "epoch": 0.3601630672820997, "grad_norm": 5.504497900500158, "learning_rate": 8.075897178889002e-07, "loss": 2.0441, "step": 5345 }, { "epoch": 0.3604999831542064, "grad_norm": 4.9891807017017005, "learning_rate": 8.071258950520501e-07, "loss": 2.1118, "step": 5350 }, { "epoch": 0.3608368990263131, "grad_norm": 4.8619337994896545, "learning_rate": 8.066616473968005e-07, "loss": 2.0967, "step": 5355 }, { "epoch": 0.3611738148984199, "grad_norm": 4.969747403968652, "learning_rate": 8.061969755653013e-07, "loss": 2.1144, "step": 5360 }, { "epoch": 
0.3615107307705266, "grad_norm": 5.306363508415882, "learning_rate": 8.0573188020029e-07, "loss": 2.0915, "step": 5365 }, { "epoch": 0.36184764664263336, "grad_norm": 4.565963380954025, "learning_rate": 8.052663619450889e-07, "loss": 1.9748, "step": 5370 }, { "epoch": 0.36218456251474007, "grad_norm": 5.177500511353336, "learning_rate": 8.048004214436058e-07, "loss": 2.0812, "step": 5375 }, { "epoch": 0.3625214783868468, "grad_norm": 5.283989642929408, "learning_rate": 8.043340593403325e-07, "loss": 2.1445, "step": 5380 }, { "epoch": 0.36285839425895355, "grad_norm": 5.191587128531984, "learning_rate": 8.038672762803437e-07, "loss": 2.0722, "step": 5385 }, { "epoch": 0.36319531013106027, "grad_norm": 4.7559658923319725, "learning_rate": 8.034000729092967e-07, "loss": 2.0724, "step": 5390 }, { "epoch": 0.36353222600316704, "grad_norm": 5.610941605388528, "learning_rate": 8.029324498734299e-07, "loss": 2.0176, "step": 5395 }, { "epoch": 0.36386914187527375, "grad_norm": 4.954066375220888, "learning_rate": 8.024644078195625e-07, "loss": 2.0585, "step": 5400 }, { "epoch": 0.36420605774738046, "grad_norm": 5.2066611261442945, "learning_rate": 8.01995947395093e-07, "loss": 2.0587, "step": 5405 }, { "epoch": 0.36454297361948723, "grad_norm": 4.924141568148678, "learning_rate": 8.015270692479988e-07, "loss": 2.1339, "step": 5410 }, { "epoch": 0.36487988949159395, "grad_norm": 5.374437680148094, "learning_rate": 8.010577740268347e-07, "loss": 2.065, "step": 5415 }, { "epoch": 0.36521680536370066, "grad_norm": 5.322932817343748, "learning_rate": 8.005880623807331e-07, "loss": 1.9893, "step": 5420 }, { "epoch": 0.3655537212358074, "grad_norm": 5.483630443168512, "learning_rate": 8.001179349594016e-07, "loss": 2.1127, "step": 5425 }, { "epoch": 0.36589063710791414, "grad_norm": 5.103227360085772, "learning_rate": 7.996473924131236e-07, "loss": 2.1096, "step": 5430 }, { "epoch": 0.3662275529800209, "grad_norm": 5.014805812123616, "learning_rate": 7.991764353927562e-07, "loss": 
2.0521, "step": 5435 }, { "epoch": 0.3665644688521276, "grad_norm": 4.990064271259956, "learning_rate": 7.987050645497302e-07, "loss": 2.0592, "step": 5440 }, { "epoch": 0.36690138472423434, "grad_norm": 5.949898717262186, "learning_rate": 7.982332805360486e-07, "loss": 2.06, "step": 5445 }, { "epoch": 0.3672383005963411, "grad_norm": 5.6305983889959395, "learning_rate": 7.977610840042856e-07, "loss": 2.0577, "step": 5450 }, { "epoch": 0.3675752164684478, "grad_norm": 4.680425799944378, "learning_rate": 7.972884756075867e-07, "loss": 2.1448, "step": 5455 }, { "epoch": 0.3679121323405546, "grad_norm": 5.4169186571611085, "learning_rate": 7.968154559996665e-07, "loss": 2.0893, "step": 5460 }, { "epoch": 0.3682490482126613, "grad_norm": 4.841595416311757, "learning_rate": 7.963420258348086e-07, "loss": 2.0197, "step": 5465 }, { "epoch": 0.368585964084768, "grad_norm": 4.700299868106253, "learning_rate": 7.958681857678645e-07, "loss": 2.0428, "step": 5470 }, { "epoch": 0.3689228799568748, "grad_norm": 4.625762813619153, "learning_rate": 7.953939364542523e-07, "loss": 1.9873, "step": 5475 }, { "epoch": 0.3692597958289815, "grad_norm": 4.919833003476126, "learning_rate": 7.949192785499573e-07, "loss": 2.0163, "step": 5480 }, { "epoch": 0.36959671170108827, "grad_norm": 5.112649080396598, "learning_rate": 7.944442127115285e-07, "loss": 2.0314, "step": 5485 }, { "epoch": 0.369933627573195, "grad_norm": 5.119677295578208, "learning_rate": 7.939687395960802e-07, "loss": 2.0477, "step": 5490 }, { "epoch": 0.3702705434453017, "grad_norm": 5.128869783242145, "learning_rate": 7.934928598612895e-07, "loss": 1.9858, "step": 5495 }, { "epoch": 0.37060745931740846, "grad_norm": 4.776345931194531, "learning_rate": 7.930165741653964e-07, "loss": 2.0458, "step": 5500 }, { "epoch": 0.3709443751895152, "grad_norm": 5.112439579564765, "learning_rate": 7.925398831672018e-07, "loss": 2.0342, "step": 5505 }, { "epoch": 0.3712812910616219, "grad_norm": 4.957287673287657, "learning_rate": 
7.920627875260679e-07, "loss": 2.1562, "step": 5510 }, { "epoch": 0.37161820693372866, "grad_norm": 5.156130131382333, "learning_rate": 7.91585287901916e-07, "loss": 2.0585, "step": 5515 }, { "epoch": 0.37195512280583537, "grad_norm": 5.054044984656765, "learning_rate": 7.911073849552267e-07, "loss": 2.0635, "step": 5520 }, { "epoch": 0.37229203867794214, "grad_norm": 5.2767149050913265, "learning_rate": 7.906290793470382e-07, "loss": 2.098, "step": 5525 }, { "epoch": 0.37262895455004885, "grad_norm": 5.385181936573417, "learning_rate": 7.901503717389458e-07, "loss": 2.0737, "step": 5530 }, { "epoch": 0.37296587042215557, "grad_norm": 5.225856449228782, "learning_rate": 7.896712627931004e-07, "loss": 2.1039, "step": 5535 }, { "epoch": 0.37330278629426233, "grad_norm": 5.033061696935751, "learning_rate": 7.891917531722087e-07, "loss": 2.0561, "step": 5540 }, { "epoch": 0.37363970216636905, "grad_norm": 4.724293479052734, "learning_rate": 7.887118435395314e-07, "loss": 2.0519, "step": 5545 }, { "epoch": 0.3739766180384758, "grad_norm": 5.109680050826465, "learning_rate": 7.882315345588823e-07, "loss": 2.0445, "step": 5550 }, { "epoch": 0.37431353391058253, "grad_norm": 4.676824408463511, "learning_rate": 7.877508268946275e-07, "loss": 2.0221, "step": 5555 }, { "epoch": 0.37465044978268924, "grad_norm": 4.767410783018015, "learning_rate": 7.87269721211685e-07, "loss": 2.0286, "step": 5560 }, { "epoch": 0.374987365654796, "grad_norm": 5.082695491964336, "learning_rate": 7.86788218175523e-07, "loss": 2.1088, "step": 5565 }, { "epoch": 0.3753242815269027, "grad_norm": 4.964317315965569, "learning_rate": 7.863063184521595e-07, "loss": 2.0061, "step": 5570 }, { "epoch": 0.3756611973990095, "grad_norm": 4.710587280741641, "learning_rate": 7.858240227081611e-07, "loss": 2.0036, "step": 5575 }, { "epoch": 0.3759981132711162, "grad_norm": 5.224100726461426, "learning_rate": 7.85341331610642e-07, "loss": 2.0759, "step": 5580 }, { "epoch": 0.3763350291432229, "grad_norm": 
4.817840675752814, "learning_rate": 7.848582458272637e-07, "loss": 2.1134, "step": 5585 }, { "epoch": 0.3766719450153297, "grad_norm": 6.000927009382994, "learning_rate": 7.843747660262333e-07, "loss": 1.9519, "step": 5590 }, { "epoch": 0.3770088608874364, "grad_norm": 4.852859931270408, "learning_rate": 7.838908928763028e-07, "loss": 2.052, "step": 5595 }, { "epoch": 0.3773457767595431, "grad_norm": 5.136907861262127, "learning_rate": 7.834066270467689e-07, "loss": 2.0518, "step": 5600 }, { "epoch": 0.3776826926316499, "grad_norm": 5.1375114696402004, "learning_rate": 7.829219692074707e-07, "loss": 2.1255, "step": 5605 }, { "epoch": 0.3780196085037566, "grad_norm": 4.746974983176314, "learning_rate": 7.824369200287899e-07, "loss": 2.0562, "step": 5610 }, { "epoch": 0.37835652437586337, "grad_norm": 4.911088133338935, "learning_rate": 7.819514801816496e-07, "loss": 2.0537, "step": 5615 }, { "epoch": 0.3786934402479701, "grad_norm": 4.87059928783005, "learning_rate": 7.814656503375128e-07, "loss": 2.021, "step": 5620 }, { "epoch": 0.3790303561200768, "grad_norm": 5.5451356121716, "learning_rate": 7.809794311683828e-07, "loss": 2.0586, "step": 5625 }, { "epoch": 0.37936727199218356, "grad_norm": 4.4456975897893445, "learning_rate": 7.804928233468006e-07, "loss": 2.0749, "step": 5630 }, { "epoch": 0.3797041878642903, "grad_norm": 4.699293957531479, "learning_rate": 7.80005827545845e-07, "loss": 2.1032, "step": 5635 }, { "epoch": 0.38004110373639705, "grad_norm": 5.586527806606788, "learning_rate": 7.795184444391318e-07, "loss": 2.037, "step": 5640 }, { "epoch": 0.38037801960850376, "grad_norm": 4.682694865397517, "learning_rate": 7.790306747008119e-07, "loss": 2.0123, "step": 5645 }, { "epoch": 0.3807149354806105, "grad_norm": 4.760107148234428, "learning_rate": 7.785425190055719e-07, "loss": 2.0532, "step": 5650 }, { "epoch": 0.38105185135271724, "grad_norm": 6.012556767394015, "learning_rate": 7.780539780286312e-07, "loss": 2.0114, "step": 5655 }, { "epoch": 
0.38138876722482395, "grad_norm": 5.104016831352078, "learning_rate": 7.77565052445743e-07, "loss": 2.096, "step": 5660 }, { "epoch": 0.3817256830969307, "grad_norm": 4.817890916465068, "learning_rate": 7.770757429331919e-07, "loss": 2.097, "step": 5665 }, { "epoch": 0.38206259896903744, "grad_norm": 5.293595554301608, "learning_rate": 7.765860501677939e-07, "loss": 2.0701, "step": 5670 }, { "epoch": 0.38239951484114415, "grad_norm": 4.645262042502001, "learning_rate": 7.760959748268949e-07, "loss": 2.0103, "step": 5675 }, { "epoch": 0.3827364307132509, "grad_norm": 4.624649890190857, "learning_rate": 7.756055175883701e-07, "loss": 2.0746, "step": 5680 }, { "epoch": 0.38307334658535763, "grad_norm": 5.048005676941447, "learning_rate": 7.751146791306231e-07, "loss": 2.0878, "step": 5685 }, { "epoch": 0.38341026245746435, "grad_norm": 4.853156653849726, "learning_rate": 7.746234601325843e-07, "loss": 2.0306, "step": 5690 }, { "epoch": 0.3837471783295711, "grad_norm": 4.962664901085152, "learning_rate": 7.741318612737111e-07, "loss": 2.0406, "step": 5695 }, { "epoch": 0.38408409420167783, "grad_norm": 5.061483570187733, "learning_rate": 7.73639883233986e-07, "loss": 2.0161, "step": 5700 }, { "epoch": 0.3844210100737846, "grad_norm": 4.702417555877279, "learning_rate": 7.731475266939158e-07, "loss": 1.9997, "step": 5705 }, { "epoch": 0.3847579259458913, "grad_norm": 5.878583501466992, "learning_rate": 7.726547923345313e-07, "loss": 2.0408, "step": 5710 }, { "epoch": 0.385094841817998, "grad_norm": 4.987461573972159, "learning_rate": 7.721616808373855e-07, "loss": 2.079, "step": 5715 }, { "epoch": 0.3854317576901048, "grad_norm": 4.964030643166286, "learning_rate": 7.716681928845532e-07, "loss": 2.0193, "step": 5720 }, { "epoch": 0.3857686735622115, "grad_norm": 5.120233782235204, "learning_rate": 7.711743291586298e-07, "loss": 2.0612, "step": 5725 }, { "epoch": 0.3861055894343183, "grad_norm": 5.124768560277765, "learning_rate": 7.706800903427309e-07, "loss": 1.9692, 
"step": 5730 }, { "epoch": 0.386442505306425, "grad_norm": 4.964312885258397, "learning_rate": 7.701854771204905e-07, "loss": 2.0745, "step": 5735 }, { "epoch": 0.3867794211785317, "grad_norm": 5.075673195843057, "learning_rate": 7.696904901760606e-07, "loss": 2.041, "step": 5740 }, { "epoch": 0.38711633705063847, "grad_norm": 5.27261568635049, "learning_rate": 7.691951301941102e-07, "loss": 2.0603, "step": 5745 }, { "epoch": 0.3874532529227452, "grad_norm": 5.248576550658075, "learning_rate": 7.68699397859824e-07, "loss": 2.1181, "step": 5750 }, { "epoch": 0.38779016879485195, "grad_norm": 4.948282496412506, "learning_rate": 7.682032938589023e-07, "loss": 2.0457, "step": 5755 }, { "epoch": 0.38812708466695867, "grad_norm": 4.9076331275343135, "learning_rate": 7.677068188775589e-07, "loss": 1.9806, "step": 5760 }, { "epoch": 0.3884640005390654, "grad_norm": 4.7324128036148245, "learning_rate": 7.67209973602521e-07, "loss": 2.0535, "step": 5765 }, { "epoch": 0.38880091641117215, "grad_norm": 4.789695345933008, "learning_rate": 7.667127587210282e-07, "loss": 2.1018, "step": 5770 }, { "epoch": 0.38913783228327886, "grad_norm": 5.222355224909597, "learning_rate": 7.66215174920831e-07, "loss": 2.0731, "step": 5775 }, { "epoch": 0.3894747481553856, "grad_norm": 4.915663645447257, "learning_rate": 7.657172228901905e-07, "loss": 2.0249, "step": 5780 }, { "epoch": 0.38981166402749234, "grad_norm": 5.079924889117498, "learning_rate": 7.652189033178766e-07, "loss": 2.0366, "step": 5785 }, { "epoch": 0.39014857989959906, "grad_norm": 5.262563006162197, "learning_rate": 7.647202168931683e-07, "loss": 1.9926, "step": 5790 }, { "epoch": 0.3904854957717058, "grad_norm": 5.134491922547744, "learning_rate": 7.642211643058516e-07, "loss": 2.0277, "step": 5795 }, { "epoch": 0.39082241164381254, "grad_norm": 4.869284801518188, "learning_rate": 7.637217462462189e-07, "loss": 2.0385, "step": 5800 }, { "epoch": 0.39115932751591925, "grad_norm": 4.975262323558063, "learning_rate": 
7.632219634050685e-07, "loss": 2.0294, "step": 5805 }, { "epoch": 0.391496243388026, "grad_norm": 4.859790929263066, "learning_rate": 7.62721816473703e-07, "loss": 2.1082, "step": 5810 }, { "epoch": 0.39183315926013274, "grad_norm": 5.6630813135983855, "learning_rate": 7.622213061439287e-07, "loss": 2.0026, "step": 5815 }, { "epoch": 0.3921700751322395, "grad_norm": 5.8588224062568015, "learning_rate": 7.617204331080544e-07, "loss": 2.0438, "step": 5820 }, { "epoch": 0.3925069910043462, "grad_norm": 4.957715845877602, "learning_rate": 7.612191980588907e-07, "loss": 2.0451, "step": 5825 }, { "epoch": 0.39284390687645293, "grad_norm": 4.88975000926498, "learning_rate": 7.60717601689749e-07, "loss": 2.0581, "step": 5830 }, { "epoch": 0.3931808227485597, "grad_norm": 5.101479975889408, "learning_rate": 7.602156446944405e-07, "loss": 2.041, "step": 5835 }, { "epoch": 0.3935177386206664, "grad_norm": 5.025882175203819, "learning_rate": 7.597133277672751e-07, "loss": 2.0348, "step": 5840 }, { "epoch": 0.3938546544927732, "grad_norm": 5.641299700729913, "learning_rate": 7.592106516030607e-07, "loss": 2.0172, "step": 5845 }, { "epoch": 0.3941915703648799, "grad_norm": 5.36552366997092, "learning_rate": 7.587076168971022e-07, "loss": 2.0078, "step": 5850 }, { "epoch": 0.3945284862369866, "grad_norm": 4.842018052372069, "learning_rate": 7.582042243451998e-07, "loss": 2.0393, "step": 5855 }, { "epoch": 0.3948654021090934, "grad_norm": 5.41279348681183, "learning_rate": 7.577004746436494e-07, "loss": 2.0491, "step": 5860 }, { "epoch": 0.3952023179812001, "grad_norm": 5.005920840084369, "learning_rate": 7.571963684892404e-07, "loss": 2.0321, "step": 5865 }, { "epoch": 0.3955392338533068, "grad_norm": 4.613592568837921, "learning_rate": 7.566919065792558e-07, "loss": 2.0174, "step": 5870 }, { "epoch": 0.3958761497254136, "grad_norm": 5.301218182295702, "learning_rate": 7.561870896114704e-07, "loss": 1.9901, "step": 5875 }, { "epoch": 0.3962130655975203, "grad_norm": 
4.736872652774144, "learning_rate": 7.556819182841498e-07, "loss": 2.0052, "step": 5880 }, { "epoch": 0.39654998146962706, "grad_norm": 4.895042454476076, "learning_rate": 7.551763932960502e-07, "loss": 2.0959, "step": 5885 }, { "epoch": 0.39688689734173377, "grad_norm": 4.8604563574598885, "learning_rate": 7.546705153464168e-07, "loss": 2.0674, "step": 5890 }, { "epoch": 0.3972238132138405, "grad_norm": 4.944319279323396, "learning_rate": 7.54164285134983e-07, "loss": 2.0256, "step": 5895 }, { "epoch": 0.39756072908594725, "grad_norm": 5.141814738634177, "learning_rate": 7.536577033619696e-07, "loss": 2.068, "step": 5900 }, { "epoch": 0.39789764495805396, "grad_norm": 5.3126978773454745, "learning_rate": 7.531507707280836e-07, "loss": 2.0275, "step": 5905 }, { "epoch": 0.39823456083016073, "grad_norm": 4.880425411189489, "learning_rate": 7.526434879345171e-07, "loss": 2.0689, "step": 5910 }, { "epoch": 0.39857147670226745, "grad_norm": 5.2954301460052555, "learning_rate": 7.521358556829469e-07, "loss": 2.0942, "step": 5915 }, { "epoch": 0.39890839257437416, "grad_norm": 5.11659747161273, "learning_rate": 7.51627874675533e-07, "loss": 2.0993, "step": 5920 }, { "epoch": 0.39924530844648093, "grad_norm": 5.099136727669359, "learning_rate": 7.511195456149177e-07, "loss": 2.0764, "step": 5925 }, { "epoch": 0.39958222431858764, "grad_norm": 4.707255830050732, "learning_rate": 7.50610869204225e-07, "loss": 2.0359, "step": 5930 }, { "epoch": 0.3999191401906944, "grad_norm": 5.006922181077602, "learning_rate": 7.50101846147059e-07, "loss": 2.1376, "step": 5935 }, { "epoch": 0.4002560560628011, "grad_norm": 4.7635126549526605, "learning_rate": 7.495924771475037e-07, "loss": 2.0564, "step": 5940 }, { "epoch": 0.40059297193490784, "grad_norm": 5.28056212522916, "learning_rate": 7.490827629101211e-07, "loss": 2.0528, "step": 5945 }, { "epoch": 0.4009298878070146, "grad_norm": 5.020769081274998, "learning_rate": 7.485727041399513e-07, "loss": 2.0041, "step": 5950 }, { "epoch": 
0.4012668036791213, "grad_norm": 4.706321409844301, "learning_rate": 7.480623015425105e-07, "loss": 2.0173, "step": 5955 }, { "epoch": 0.40160371955122803, "grad_norm": 4.828779024375159, "learning_rate": 7.475515558237909e-07, "loss": 2.0549, "step": 5960 }, { "epoch": 0.4019406354233348, "grad_norm": 4.832369530884, "learning_rate": 7.470404676902587e-07, "loss": 2.0565, "step": 5965 }, { "epoch": 0.4022775512954415, "grad_norm": 4.924451458585056, "learning_rate": 7.465290378488544e-07, "loss": 2.1194, "step": 5970 }, { "epoch": 0.4026144671675483, "grad_norm": 5.363202857261177, "learning_rate": 7.460172670069909e-07, "loss": 2.0664, "step": 5975 }, { "epoch": 0.402951383039655, "grad_norm": 4.659982395143319, "learning_rate": 7.455051558725524e-07, "loss": 2.0787, "step": 5980 }, { "epoch": 0.4032882989117617, "grad_norm": 4.998215158417394, "learning_rate": 7.449927051538944e-07, "loss": 2.0045, "step": 5985 }, { "epoch": 0.4036252147838685, "grad_norm": 5.152137246348984, "learning_rate": 7.444799155598419e-07, "loss": 2.0584, "step": 5990 }, { "epoch": 0.4039621306559752, "grad_norm": 4.876824499666177, "learning_rate": 7.439667877996884e-07, "loss": 1.9744, "step": 5995 }, { "epoch": 0.40429904652808196, "grad_norm": 4.6823422425180325, "learning_rate": 7.434533225831951e-07, "loss": 2.0721, "step": 6000 }, { "epoch": 0.4046359624001887, "grad_norm": 4.985023252514381, "learning_rate": 7.429395206205908e-07, "loss": 2.0297, "step": 6005 }, { "epoch": 0.4049728782722954, "grad_norm": 5.197656866290802, "learning_rate": 7.424253826225689e-07, "loss": 2.0309, "step": 6010 }, { "epoch": 0.40530979414440216, "grad_norm": 5.032116636217697, "learning_rate": 7.419109093002887e-07, "loss": 1.9942, "step": 6015 }, { "epoch": 0.40564671001650887, "grad_norm": 5.007328878883672, "learning_rate": 7.413961013653725e-07, "loss": 2.0539, "step": 6020 }, { "epoch": 0.40598362588861564, "grad_norm": 5.265824841693031, "learning_rate": 7.408809595299057e-07, "loss": 2.028, 
"step": 6025 }, { "epoch": 0.40632054176072235, "grad_norm": 4.799701952052053, "learning_rate": 7.403654845064358e-07, "loss": 2.0768, "step": 6030 }, { "epoch": 0.40665745763282907, "grad_norm": 5.618823383580866, "learning_rate": 7.398496770079709e-07, "loss": 2.07, "step": 6035 }, { "epoch": 0.40699437350493584, "grad_norm": 4.800339774986333, "learning_rate": 7.393335377479792e-07, "loss": 2.0416, "step": 6040 }, { "epoch": 0.40733128937704255, "grad_norm": 4.813720140279495, "learning_rate": 7.388170674403872e-07, "loss": 2.0062, "step": 6045 }, { "epoch": 0.40766820524914926, "grad_norm": 5.301445329074318, "learning_rate": 7.383002667995804e-07, "loss": 2.0579, "step": 6050 }, { "epoch": 0.40800512112125603, "grad_norm": 4.925010770350815, "learning_rate": 7.377831365404001e-07, "loss": 2.0121, "step": 6055 }, { "epoch": 0.40834203699336274, "grad_norm": 4.9561092432953116, "learning_rate": 7.372656773781442e-07, "loss": 2.0931, "step": 6060 }, { "epoch": 0.4086789528654695, "grad_norm": 4.778669763541082, "learning_rate": 7.367478900285654e-07, "loss": 2.0421, "step": 6065 }, { "epoch": 0.4090158687375762, "grad_norm": 4.9838618459128385, "learning_rate": 7.362297752078702e-07, "loss": 1.9969, "step": 6070 }, { "epoch": 0.40935278460968294, "grad_norm": 4.968323314025494, "learning_rate": 7.357113336327181e-07, "loss": 2.0745, "step": 6075 }, { "epoch": 0.4096897004817897, "grad_norm": 4.900511209168664, "learning_rate": 7.351925660202207e-07, "loss": 2.0196, "step": 6080 }, { "epoch": 0.4100266163538964, "grad_norm": 5.0564846103167485, "learning_rate": 7.346734730879407e-07, "loss": 2.0628, "step": 6085 }, { "epoch": 0.4103635322260032, "grad_norm": 4.851146396130376, "learning_rate": 7.341540555538902e-07, "loss": 2.0716, "step": 6090 }, { "epoch": 0.4107004480981099, "grad_norm": 5.113644922574657, "learning_rate": 7.33634314136531e-07, "loss": 2.0898, "step": 6095 }, { "epoch": 0.4110373639702166, "grad_norm": 5.397430851931828, "learning_rate": 
7.331142495547724e-07, "loss": 2.0626, "step": 6100 }, { "epoch": 0.4113742798423234, "grad_norm": 5.265286607306314, "learning_rate": 7.325938625279709e-07, "loss": 1.9813, "step": 6105 }, { "epoch": 0.4117111957144301, "grad_norm": 5.026049314934911, "learning_rate": 7.320731537759293e-07, "loss": 2.1292, "step": 6110 }, { "epoch": 0.41204811158653687, "grad_norm": 5.136410894936528, "learning_rate": 7.315521240188944e-07, "loss": 2.1083, "step": 6115 }, { "epoch": 0.4123850274586436, "grad_norm": 5.670852681406425, "learning_rate": 7.310307739775585e-07, "loss": 2.0416, "step": 6120 }, { "epoch": 0.4127219433307503, "grad_norm": 5.1859614316076765, "learning_rate": 7.305091043730557e-07, "loss": 2.078, "step": 6125 }, { "epoch": 0.41305885920285706, "grad_norm": 5.004778041021885, "learning_rate": 7.299871159269626e-07, "loss": 2.0308, "step": 6130 }, { "epoch": 0.4133957750749638, "grad_norm": 4.991507372726218, "learning_rate": 7.294648093612968e-07, "loss": 2.0619, "step": 6135 }, { "epoch": 0.4137326909470705, "grad_norm": 4.648822139969444, "learning_rate": 7.28942185398516e-07, "loss": 2.0154, "step": 6140 }, { "epoch": 0.41406960681917726, "grad_norm": 5.1588533560585415, "learning_rate": 7.284192447615168e-07, "loss": 2.0318, "step": 6145 }, { "epoch": 0.414406522691284, "grad_norm": 4.774342889889448, "learning_rate": 7.278959881736338e-07, "loss": 2.0783, "step": 6150 }, { "epoch": 0.41474343856339074, "grad_norm": 5.232042940359434, "learning_rate": 7.273724163586387e-07, "loss": 2.0361, "step": 6155 }, { "epoch": 0.41508035443549746, "grad_norm": 5.742888155174585, "learning_rate": 7.268485300407394e-07, "loss": 2.06, "step": 6160 }, { "epoch": 0.41541727030760417, "grad_norm": 4.081184551149957, "learning_rate": 7.263243299445783e-07, "loss": 2.0714, "step": 6165 }, { "epoch": 0.41575418617971094, "grad_norm": 5.406483203903552, "learning_rate": 7.257998167952322e-07, "loss": 1.9719, "step": 6170 }, { "epoch": 0.41609110205181765, "grad_norm": 
4.6827066934430945, "learning_rate": 7.25274991318211e-07, "loss": 2.003, "step": 6175 }, { "epoch": 0.4164280179239244, "grad_norm": 5.36725797496629, "learning_rate": 7.247498542394566e-07, "loss": 2.0977, "step": 6180 }, { "epoch": 0.41676493379603113, "grad_norm": 4.852051116878214, "learning_rate": 7.242244062853416e-07, "loss": 2.0342, "step": 6185 }, { "epoch": 0.41710184966813785, "grad_norm": 4.862185208480358, "learning_rate": 7.236986481826688e-07, "loss": 2.1117, "step": 6190 }, { "epoch": 0.4174387655402446, "grad_norm": 4.596027507953987, "learning_rate": 7.231725806586699e-07, "loss": 2.0109, "step": 6195 }, { "epoch": 0.41777568141235133, "grad_norm": 4.8896725946374975, "learning_rate": 7.22646204441005e-07, "loss": 2.0864, "step": 6200 }, { "epoch": 0.4181125972844581, "grad_norm": 4.890137337458879, "learning_rate": 7.221195202577606e-07, "loss": 2.0478, "step": 6205 }, { "epoch": 0.4184495131565648, "grad_norm": 5.490883459876947, "learning_rate": 7.215925288374496e-07, "loss": 2.0732, "step": 6210 }, { "epoch": 0.4187864290286715, "grad_norm": 5.329307856415382, "learning_rate": 7.210652309090098e-07, "loss": 2.0642, "step": 6215 }, { "epoch": 0.4191233449007783, "grad_norm": 5.046885475486206, "learning_rate": 7.205376272018025e-07, "loss": 2.0873, "step": 6220 }, { "epoch": 0.419460260772885, "grad_norm": 5.397381994642951, "learning_rate": 7.200097184456128e-07, "loss": 2.0796, "step": 6225 }, { "epoch": 0.4197971766449917, "grad_norm": 4.773239066923937, "learning_rate": 7.19481505370647e-07, "loss": 2.1005, "step": 6230 }, { "epoch": 0.4201340925170985, "grad_norm": 5.246121247153151, "learning_rate": 7.189529887075327e-07, "loss": 2.073, "step": 6235 }, { "epoch": 0.4204710083892052, "grad_norm": 4.956124607263229, "learning_rate": 7.184241691873174e-07, "loss": 2.0618, "step": 6240 }, { "epoch": 0.42080792426131197, "grad_norm": 4.921777538638361, "learning_rate": 7.178950475414675e-07, "loss": 2.1054, "step": 6245 }, { "epoch": 
0.4211448401334187, "grad_norm": 5.325548210663624, "learning_rate": 7.173656245018671e-07, "loss": 2.06, "step": 6250 }, { "epoch": 0.4214817560055254, "grad_norm": 4.62492440075812, "learning_rate": 7.168359008008177e-07, "loss": 2.0561, "step": 6255 }, { "epoch": 0.42181867187763217, "grad_norm": 5.093301017498394, "learning_rate": 7.163058771710358e-07, "loss": 2.0241, "step": 6260 }, { "epoch": 0.4221555877497389, "grad_norm": 4.775463197161855, "learning_rate": 7.157755543456539e-07, "loss": 2.0646, "step": 6265 }, { "epoch": 0.42249250362184565, "grad_norm": 5.112854779785417, "learning_rate": 7.152449330582173e-07, "loss": 2.067, "step": 6270 }, { "epoch": 0.42282941949395236, "grad_norm": 4.9598555181272435, "learning_rate": 7.147140140426848e-07, "loss": 2.0645, "step": 6275 }, { "epoch": 0.4231663353660591, "grad_norm": 5.5384067789193745, "learning_rate": 7.141827980334265e-07, "loss": 1.9821, "step": 6280 }, { "epoch": 0.42350325123816585, "grad_norm": 4.926363010282352, "learning_rate": 7.136512857652239e-07, "loss": 2.0374, "step": 6285 }, { "epoch": 0.42384016711027256, "grad_norm": 5.436604578385161, "learning_rate": 7.131194779732681e-07, "loss": 2.0835, "step": 6290 }, { "epoch": 0.4241770829823793, "grad_norm": 4.981708986980841, "learning_rate": 7.125873753931586e-07, "loss": 1.9639, "step": 6295 }, { "epoch": 0.42451399885448604, "grad_norm": 5.283326188889762, "learning_rate": 7.120549787609029e-07, "loss": 1.9957, "step": 6300 }, { "epoch": 0.42485091472659275, "grad_norm": 4.965882401194885, "learning_rate": 7.115222888129156e-07, "loss": 2.0523, "step": 6305 }, { "epoch": 0.4251878305986995, "grad_norm": 5.132843692698032, "learning_rate": 7.109893062860161e-07, "loss": 1.986, "step": 6310 }, { "epoch": 0.42552474647080624, "grad_norm": 4.9716248898967725, "learning_rate": 7.104560319174296e-07, "loss": 2.1079, "step": 6315 }, { "epoch": 0.42586166234291295, "grad_norm": 4.82700099407153, "learning_rate": 7.099224664447841e-07, "loss": 
2.045, "step": 6320 }, { "epoch": 0.4261985782150197, "grad_norm": 5.6799330374083725, "learning_rate": 7.093886106061106e-07, "loss": 2.0951, "step": 6325 }, { "epoch": 0.42653549408712643, "grad_norm": 4.990588314255834, "learning_rate": 7.088544651398421e-07, "loss": 2.0644, "step": 6330 }, { "epoch": 0.4268724099592332, "grad_norm": 4.993295629007956, "learning_rate": 7.083200307848115e-07, "loss": 2.0561, "step": 6335 }, { "epoch": 0.4272093258313399, "grad_norm": 5.253052963663791, "learning_rate": 7.077853082802516e-07, "loss": 2.102, "step": 6340 }, { "epoch": 0.42754624170344663, "grad_norm": 4.986798609744882, "learning_rate": 7.072502983657939e-07, "loss": 1.9804, "step": 6345 }, { "epoch": 0.4278831575755534, "grad_norm": 5.083285001792466, "learning_rate": 7.067150017814676e-07, "loss": 2.0314, "step": 6350 }, { "epoch": 0.4282200734476601, "grad_norm": 5.215340984349888, "learning_rate": 7.061794192676979e-07, "loss": 2.0909, "step": 6355 }, { "epoch": 0.4285569893197669, "grad_norm": 5.13621242523437, "learning_rate": 7.056435515653058e-07, "loss": 2.1149, "step": 6360 }, { "epoch": 0.4288939051918736, "grad_norm": 4.9298098219922135, "learning_rate": 7.051073994155068e-07, "loss": 2.0657, "step": 6365 }, { "epoch": 0.4292308210639803, "grad_norm": 5.224386211905687, "learning_rate": 7.045709635599098e-07, "loss": 2.057, "step": 6370 }, { "epoch": 0.4295677369360871, "grad_norm": 5.469479953904029, "learning_rate": 7.040342447405161e-07, "loss": 1.997, "step": 6375 }, { "epoch": 0.4299046528081938, "grad_norm": 5.1029196264423895, "learning_rate": 7.034972436997184e-07, "loss": 2.0028, "step": 6380 }, { "epoch": 0.4302415686803005, "grad_norm": 4.755424985922037, "learning_rate": 7.029599611803e-07, "loss": 2.0327, "step": 6385 }, { "epoch": 0.43057848455240727, "grad_norm": 5.045074969670711, "learning_rate": 7.024223979254331e-07, "loss": 2.1096, "step": 6390 }, { "epoch": 0.430915400424514, "grad_norm": 5.085614974584021, "learning_rate": 
7.018845546786787e-07, "loss": 2.0234, "step": 6395 }, { "epoch": 0.43125231629662075, "grad_norm": 4.974238539159867, "learning_rate": 7.013464321839845e-07, "loss": 1.9252, "step": 6400 }, { "epoch": 0.43158923216872747, "grad_norm": 4.815960693265074, "learning_rate": 7.00808031185685e-07, "loss": 2.0066, "step": 6405 }, { "epoch": 0.4319261480408342, "grad_norm": 4.9871070914544, "learning_rate": 7.002693524284997e-07, "loss": 2.0237, "step": 6410 }, { "epoch": 0.43226306391294095, "grad_norm": 5.133248469066724, "learning_rate": 6.997303966575322e-07, "loss": 2.0578, "step": 6415 }, { "epoch": 0.43259997978504766, "grad_norm": 5.767287091933916, "learning_rate": 6.991911646182696e-07, "loss": 2.0451, "step": 6420 }, { "epoch": 0.43293689565715443, "grad_norm": 5.621241614214432, "learning_rate": 6.986516570565809e-07, "loss": 2.16, "step": 6425 }, { "epoch": 0.43327381152926114, "grad_norm": 4.640602953048649, "learning_rate": 6.981118747187163e-07, "loss": 1.9981, "step": 6430 }, { "epoch": 0.43361072740136786, "grad_norm": 4.478544490998536, "learning_rate": 6.975718183513056e-07, "loss": 2.0017, "step": 6435 }, { "epoch": 0.4339476432734746, "grad_norm": 5.144016061257152, "learning_rate": 6.970314887013585e-07, "loss": 1.998, "step": 6440 }, { "epoch": 0.43428455914558134, "grad_norm": 5.0956738404450945, "learning_rate": 6.964908865162617e-07, "loss": 2.0531, "step": 6445 }, { "epoch": 0.4346214750176881, "grad_norm": 5.4154791207953235, "learning_rate": 6.959500125437801e-07, "loss": 2.0923, "step": 6450 }, { "epoch": 0.4349583908897948, "grad_norm": 5.239427363099999, "learning_rate": 6.954088675320534e-07, "loss": 2.0101, "step": 6455 }, { "epoch": 0.43529530676190153, "grad_norm": 5.050393715034449, "learning_rate": 6.948674522295969e-07, "loss": 2.0769, "step": 6460 }, { "epoch": 0.4356322226340083, "grad_norm": 5.083606949244433, "learning_rate": 6.943257673852993e-07, "loss": 2.0176, "step": 6465 }, { "epoch": 0.435969138506115, "grad_norm": 
4.982660731596873, "learning_rate": 6.937838137484225e-07, "loss": 2.1194, "step": 6470 }, { "epoch": 0.43630605437822173, "grad_norm": 5.693557966252279, "learning_rate": 6.932415920686001e-07, "loss": 1.9424, "step": 6475 }, { "epoch": 0.4366429702503285, "grad_norm": 4.746769850319682, "learning_rate": 6.926991030958362e-07, "loss": 2.022, "step": 6480 }, { "epoch": 0.4369798861224352, "grad_norm": 5.279634333605706, "learning_rate": 6.921563475805051e-07, "loss": 1.9815, "step": 6485 }, { "epoch": 0.437316801994542, "grad_norm": 4.991314447850079, "learning_rate": 6.916133262733493e-07, "loss": 1.9876, "step": 6490 }, { "epoch": 0.4376537178666487, "grad_norm": 5.5151224890280615, "learning_rate": 6.910700399254793e-07, "loss": 2.0539, "step": 6495 }, { "epoch": 0.4379906337387554, "grad_norm": 4.752068885465796, "learning_rate": 6.905264892883721e-07, "loss": 2.0076, "step": 6500 }, { "epoch": 0.4383275496108622, "grad_norm": 4.366972369406845, "learning_rate": 6.899826751138701e-07, "loss": 2.0236, "step": 6505 }, { "epoch": 0.4386644654829689, "grad_norm": 5.300016824022946, "learning_rate": 6.894385981541804e-07, "loss": 2.0468, "step": 6510 }, { "epoch": 0.43900138135507566, "grad_norm": 4.971101344060874, "learning_rate": 6.888942591618736e-07, "loss": 2.0324, "step": 6515 }, { "epoch": 0.4393382972271824, "grad_norm": 5.295451628837353, "learning_rate": 6.883496588898827e-07, "loss": 2.0106, "step": 6520 }, { "epoch": 0.4396752130992891, "grad_norm": 5.278210704564976, "learning_rate": 6.87804798091502e-07, "loss": 2.0407, "step": 6525 }, { "epoch": 0.44001212897139586, "grad_norm": 5.2004656022000715, "learning_rate": 6.872596775203864e-07, "loss": 2.0615, "step": 6530 }, { "epoch": 0.44034904484350257, "grad_norm": 5.224330135442559, "learning_rate": 6.867142979305498e-07, "loss": 2.0054, "step": 6535 }, { "epoch": 0.44068596071560934, "grad_norm": 4.9371098781485045, "learning_rate": 6.861686600763648e-07, "loss": 2.0462, "step": 6540 }, { "epoch": 
0.44102287658771605, "grad_norm": 5.266440133290877, "learning_rate": 6.856227647125607e-07, "loss": 2.0517, "step": 6545 }, { "epoch": 0.44135979245982276, "grad_norm": 5.285192194296248, "learning_rate": 6.850766125942235e-07, "loss": 2.0779, "step": 6550 }, { "epoch": 0.44169670833192953, "grad_norm": 5.1156602012875005, "learning_rate": 6.84530204476794e-07, "loss": 2.0435, "step": 6555 }, { "epoch": 0.44203362420403625, "grad_norm": 5.127418873714126, "learning_rate": 6.839835411160673e-07, "loss": 2.0873, "step": 6560 }, { "epoch": 0.44237054007614296, "grad_norm": 4.711595403178473, "learning_rate": 6.834366232681915e-07, "loss": 2.0394, "step": 6565 }, { "epoch": 0.44270745594824973, "grad_norm": 5.1351520609058205, "learning_rate": 6.828894516896664e-07, "loss": 1.9705, "step": 6570 }, { "epoch": 0.44304437182035644, "grad_norm": 5.117181122374066, "learning_rate": 6.823420271373433e-07, "loss": 2.0691, "step": 6575 }, { "epoch": 0.4433812876924632, "grad_norm": 5.391167489026913, "learning_rate": 6.817943503684232e-07, "loss": 2.0927, "step": 6580 }, { "epoch": 0.4437182035645699, "grad_norm": 4.733595239126312, "learning_rate": 6.812464221404558e-07, "loss": 2.0093, "step": 6585 }, { "epoch": 0.44405511943667664, "grad_norm": 4.821172500855588, "learning_rate": 6.806982432113388e-07, "loss": 2.0546, "step": 6590 }, { "epoch": 0.4443920353087834, "grad_norm": 4.747448417683526, "learning_rate": 6.801498143393168e-07, "loss": 2.0455, "step": 6595 }, { "epoch": 0.4447289511808901, "grad_norm": 5.148652643819982, "learning_rate": 6.796011362829794e-07, "loss": 2.0835, "step": 6600 }, { "epoch": 0.4450658670529969, "grad_norm": 5.247974428699364, "learning_rate": 6.790522098012621e-07, "loss": 2.0468, "step": 6605 }, { "epoch": 0.4454027829251036, "grad_norm": 5.308427931878459, "learning_rate": 6.785030356534428e-07, "loss": 2.0269, "step": 6610 }, { "epoch": 0.4457396987972103, "grad_norm": 5.4329539522812045, "learning_rate": 6.779536145991427e-07, "loss": 
2.0692, "step": 6615 }, { "epoch": 0.4460766146693171, "grad_norm": 5.514983924290026, "learning_rate": 6.774039473983243e-07, "loss": 1.9529, "step": 6620 }, { "epoch": 0.4464135305414238, "grad_norm": 5.1679740062777, "learning_rate": 6.768540348112906e-07, "loss": 2.0298, "step": 6625 }, { "epoch": 0.44675044641353057, "grad_norm": 4.773911919007974, "learning_rate": 6.763038775986842e-07, "loss": 2.0689, "step": 6630 }, { "epoch": 0.4470873622856373, "grad_norm": 5.157220513744163, "learning_rate": 6.757534765214858e-07, "loss": 2.0703, "step": 6635 }, { "epoch": 0.447424278157744, "grad_norm": 5.508152485315697, "learning_rate": 6.752028323410134e-07, "loss": 2.0097, "step": 6640 }, { "epoch": 0.44776119402985076, "grad_norm": 5.049471641946711, "learning_rate": 6.746519458189214e-07, "loss": 2.0168, "step": 6645 }, { "epoch": 0.4480981099019575, "grad_norm": 4.581107780382369, "learning_rate": 6.741008177171993e-07, "loss": 2.0498, "step": 6650 }, { "epoch": 0.4484350257740642, "grad_norm": 4.760593876470898, "learning_rate": 6.735494487981711e-07, "loss": 1.9577, "step": 6655 }, { "epoch": 0.44877194164617096, "grad_norm": 4.6401281659268845, "learning_rate": 6.729978398244935e-07, "loss": 1.9915, "step": 6660 }, { "epoch": 0.44910885751827767, "grad_norm": 4.877013344346602, "learning_rate": 6.724459915591551e-07, "loss": 2.0573, "step": 6665 }, { "epoch": 0.44944577339038444, "grad_norm": 4.619472620467893, "learning_rate": 6.718939047654763e-07, "loss": 2.0413, "step": 6670 }, { "epoch": 0.44978268926249115, "grad_norm": 5.052260531568224, "learning_rate": 6.713415802071064e-07, "loss": 2.0689, "step": 6675 }, { "epoch": 0.45011960513459787, "grad_norm": 4.925660814403184, "learning_rate": 6.707890186480244e-07, "loss": 2.0419, "step": 6680 }, { "epoch": 0.45045652100670464, "grad_norm": 5.041408943239944, "learning_rate": 6.702362208525366e-07, "loss": 1.98, "step": 6685 }, { "epoch": 0.45079343687881135, "grad_norm": 4.748804059470699, "learning_rate": 
6.696831875852763e-07, "loss": 1.9368, "step": 6690 }, { "epoch": 0.4511303527509181, "grad_norm": 4.613257601731216, "learning_rate": 6.691299196112025e-07, "loss": 2.078, "step": 6695 }, { "epoch": 0.45146726862302483, "grad_norm": 4.967274734245306, "learning_rate": 6.685764176955991e-07, "loss": 1.9757, "step": 6700 }, { "epoch": 0.45180418449513154, "grad_norm": 4.906835449658948, "learning_rate": 6.680226826040727e-07, "loss": 2.091, "step": 6705 }, { "epoch": 0.4521411003672383, "grad_norm": 5.066338657463886, "learning_rate": 6.674687151025535e-07, "loss": 1.9643, "step": 6710 }, { "epoch": 0.452478016239345, "grad_norm": 4.507910945377054, "learning_rate": 6.669145159572924e-07, "loss": 2.0188, "step": 6715 }, { "epoch": 0.4528149321114518, "grad_norm": 4.571497978992489, "learning_rate": 6.663600859348615e-07, "loss": 2.0301, "step": 6720 }, { "epoch": 0.4531518479835585, "grad_norm": 4.620600206979888, "learning_rate": 6.658054258021513e-07, "loss": 2.0626, "step": 6725 }, { "epoch": 0.4534887638556652, "grad_norm": 4.551595400851237, "learning_rate": 6.652505363263712e-07, "loss": 2.0422, "step": 6730 }, { "epoch": 0.453825679727772, "grad_norm": 4.970598508166431, "learning_rate": 6.646954182750478e-07, "loss": 2.0007, "step": 6735 }, { "epoch": 0.4541625955998787, "grad_norm": 4.7347479794353715, "learning_rate": 6.641400724160234e-07, "loss": 1.9991, "step": 6740 }, { "epoch": 0.4544995114719854, "grad_norm": 4.615277356231208, "learning_rate": 6.635844995174561e-07, "loss": 2.0508, "step": 6745 }, { "epoch": 0.4548364273440922, "grad_norm": 5.20115874661617, "learning_rate": 6.630287003478176e-07, "loss": 2.0666, "step": 6750 }, { "epoch": 0.4551733432161989, "grad_norm": 4.87951402037703, "learning_rate": 6.624726756758927e-07, "loss": 1.9561, "step": 6755 }, { "epoch": 0.45551025908830567, "grad_norm": 5.12807801531092, "learning_rate": 6.619164262707782e-07, "loss": 2.0452, "step": 6760 }, { "epoch": 0.4558471749604124, "grad_norm": 
4.930524874529929, "learning_rate": 6.613599529018815e-07, "loss": 1.997, "step": 6765 }, { "epoch": 0.4561840908325191, "grad_norm": 4.91945520072961, "learning_rate": 6.608032563389198e-07, "loss": 2.023, "step": 6770 }, { "epoch": 0.45652100670462586, "grad_norm": 4.782103254891831, "learning_rate": 6.602463373519196e-07, "loss": 2.0363, "step": 6775 }, { "epoch": 0.4568579225767326, "grad_norm": 4.651463591410111, "learning_rate": 6.596891967112143e-07, "loss": 1.9982, "step": 6780 }, { "epoch": 0.45719483844883935, "grad_norm": 4.510468913420292, "learning_rate": 6.59131835187444e-07, "loss": 1.9251, "step": 6785 }, { "epoch": 0.45753175432094606, "grad_norm": 5.465937385383021, "learning_rate": 6.58574253551555e-07, "loss": 2.0315, "step": 6790 }, { "epoch": 0.4578686701930528, "grad_norm": 6.221541792822141, "learning_rate": 6.580164525747973e-07, "loss": 1.9975, "step": 6795 }, { "epoch": 0.45820558606515954, "grad_norm": 5.0295836427409855, "learning_rate": 6.574584330287247e-07, "loss": 2.0591, "step": 6800 }, { "epoch": 0.45854250193726626, "grad_norm": 4.723516035327428, "learning_rate": 6.569001956851932e-07, "loss": 2.0794, "step": 6805 }, { "epoch": 0.458879417809373, "grad_norm": 4.455051583323454, "learning_rate": 6.563417413163601e-07, "loss": 2.0556, "step": 6810 }, { "epoch": 0.45921633368147974, "grad_norm": 4.89029549426599, "learning_rate": 6.55783070694683e-07, "loss": 2.0369, "step": 6815 }, { "epoch": 0.45955324955358645, "grad_norm": 4.624561154716687, "learning_rate": 6.55224184592918e-07, "loss": 2.0331, "step": 6820 }, { "epoch": 0.4598901654256932, "grad_norm": 4.862393058242115, "learning_rate": 6.546650837841203e-07, "loss": 2.0394, "step": 6825 }, { "epoch": 0.46022708129779993, "grad_norm": 5.114285678430353, "learning_rate": 6.541057690416414e-07, "loss": 1.9532, "step": 6830 }, { "epoch": 0.46056399716990665, "grad_norm": 5.345268646841226, "learning_rate": 6.535462411391284e-07, "loss": 2.0228, "step": 6835 }, { "epoch": 
0.4609009130420134, "grad_norm": 4.668095126689931, "learning_rate": 6.529865008505244e-07, "loss": 2.0047, "step": 6840 }, { "epoch": 0.46123782891412013, "grad_norm": 5.833549619965893, "learning_rate": 6.524265489500651e-07, "loss": 2.0726, "step": 6845 }, { "epoch": 0.4615747447862269, "grad_norm": 4.908221616354825, "learning_rate": 6.518663862122794e-07, "loss": 1.9583, "step": 6850 }, { "epoch": 0.4619116606583336, "grad_norm": 4.757602512039588, "learning_rate": 6.513060134119878e-07, "loss": 1.9426, "step": 6855 }, { "epoch": 0.4622485765304403, "grad_norm": 4.904938290251906, "learning_rate": 6.507454313243015e-07, "loss": 1.9869, "step": 6860 }, { "epoch": 0.4625854924025471, "grad_norm": 4.769838823559884, "learning_rate": 6.50184640724621e-07, "loss": 2.0252, "step": 6865 }, { "epoch": 0.4629224082746538, "grad_norm": 5.360711262225652, "learning_rate": 6.496236423886351e-07, "loss": 2.0654, "step": 6870 }, { "epoch": 0.4632593241467606, "grad_norm": 5.1468624967717505, "learning_rate": 6.490624370923201e-07, "loss": 2.0112, "step": 6875 }, { "epoch": 0.4635962400188673, "grad_norm": 4.8867463863131855, "learning_rate": 6.485010256119388e-07, "loss": 1.9838, "step": 6880 }, { "epoch": 0.463933155890974, "grad_norm": 4.8492029327650314, "learning_rate": 6.479394087240389e-07, "loss": 2.0563, "step": 6885 }, { "epoch": 0.46427007176308077, "grad_norm": 5.258897265090463, "learning_rate": 6.473775872054521e-07, "loss": 2.1103, "step": 6890 }, { "epoch": 0.4646069876351875, "grad_norm": 5.147178860486107, "learning_rate": 6.468155618332936e-07, "loss": 1.9817, "step": 6895 }, { "epoch": 0.46494390350729425, "grad_norm": 5.048452367152259, "learning_rate": 6.462533333849599e-07, "loss": 2.0538, "step": 6900 }, { "epoch": 0.46528081937940097, "grad_norm": 4.624596744825503, "learning_rate": 6.456909026381292e-07, "loss": 1.9959, "step": 6905 }, { "epoch": 0.4656177352515077, "grad_norm": 5.050237395289905, "learning_rate": 6.451282703707591e-07, "loss": 
1.9936, "step": 6910 }, { "epoch": 0.46595465112361445, "grad_norm": 5.365832261625598, "learning_rate": 6.445654373610854e-07, "loss": 2.059, "step": 6915 }, { "epoch": 0.46629156699572116, "grad_norm": 5.050560943577913, "learning_rate": 6.440024043876229e-07, "loss": 2.0661, "step": 6920 }, { "epoch": 0.4666284828678279, "grad_norm": 4.79118630339851, "learning_rate": 6.434391722291618e-07, "loss": 2.0571, "step": 6925 }, { "epoch": 0.46696539873993465, "grad_norm": 5.062751451046221, "learning_rate": 6.428757416647683e-07, "loss": 2.0132, "step": 6930 }, { "epoch": 0.46730231461204136, "grad_norm": 5.362345108274802, "learning_rate": 6.42312113473783e-07, "loss": 2.0343, "step": 6935 }, { "epoch": 0.4676392304841481, "grad_norm": 4.869253535683854, "learning_rate": 6.417482884358196e-07, "loss": 2.0319, "step": 6940 }, { "epoch": 0.46797614635625484, "grad_norm": 4.618521527504569, "learning_rate": 6.411842673307648e-07, "loss": 2.0175, "step": 6945 }, { "epoch": 0.46831306222836155, "grad_norm": 5.124352956914365, "learning_rate": 6.406200509387756e-07, "loss": 1.9623, "step": 6950 }, { "epoch": 0.4686499781004683, "grad_norm": 5.426538595719064, "learning_rate": 6.400556400402796e-07, "loss": 2.09, "step": 6955 }, { "epoch": 0.46898689397257504, "grad_norm": 5.061854360377167, "learning_rate": 6.394910354159736e-07, "loss": 1.9927, "step": 6960 }, { "epoch": 0.4693238098446818, "grad_norm": 5.07251109757845, "learning_rate": 6.389262378468219e-07, "loss": 2.0849, "step": 6965 }, { "epoch": 0.4696607257167885, "grad_norm": 4.669460178881621, "learning_rate": 6.38361248114056e-07, "loss": 2.009, "step": 6970 }, { "epoch": 0.46999764158889523, "grad_norm": 4.7834723277622535, "learning_rate": 6.377960669991733e-07, "loss": 2.1025, "step": 6975 }, { "epoch": 0.470334557461002, "grad_norm": 5.687739564734972, "learning_rate": 6.372306952839353e-07, "loss": 2.0414, "step": 6980 }, { "epoch": 0.4706714733331087, "grad_norm": 5.06320393820227, "learning_rate": 
6.36665133750368e-07, "loss": 2.0993, "step": 6985 }, { "epoch": 0.4710083892052155, "grad_norm": 5.086237553908723, "learning_rate": 6.360993831807593e-07, "loss": 1.9629, "step": 6990 }, { "epoch": 0.4713453050773222, "grad_norm": 5.11411680719714, "learning_rate": 6.355334443576589e-07, "loss": 2.0319, "step": 6995 }, { "epoch": 0.4716822209494289, "grad_norm": 4.698548403844061, "learning_rate": 6.349673180638769e-07, "loss": 2.0022, "step": 7000 }, { "epoch": 0.4720191368215357, "grad_norm": 5.473803524037221, "learning_rate": 6.344010050824824e-07, "loss": 2.0623, "step": 7005 }, { "epoch": 0.4723560526936424, "grad_norm": 4.903797782117093, "learning_rate": 6.338345061968032e-07, "loss": 2.03, "step": 7010 }, { "epoch": 0.4726929685657491, "grad_norm": 4.9045121735260295, "learning_rate": 6.33267822190424e-07, "loss": 1.9728, "step": 7015 }, { "epoch": 0.4730298844378559, "grad_norm": 4.758482912261512, "learning_rate": 6.327009538471853e-07, "loss": 2.0675, "step": 7020 }, { "epoch": 0.4733668003099626, "grad_norm": 5.151193847794686, "learning_rate": 6.321339019511828e-07, "loss": 2.0242, "step": 7025 }, { "epoch": 0.47370371618206936, "grad_norm": 4.819528670730536, "learning_rate": 6.315666672867664e-07, "loss": 2.0704, "step": 7030 }, { "epoch": 0.47404063205417607, "grad_norm": 4.908591443023027, "learning_rate": 6.309992506385385e-07, "loss": 1.95, "step": 7035 }, { "epoch": 0.4743775479262828, "grad_norm": 4.990234282277314, "learning_rate": 6.304316527913531e-07, "loss": 1.9899, "step": 7040 }, { "epoch": 0.47471446379838955, "grad_norm": 5.031080567536253, "learning_rate": 6.29863874530315e-07, "loss": 1.9756, "step": 7045 }, { "epoch": 0.47505137967049627, "grad_norm": 5.296091792745542, "learning_rate": 6.292959166407785e-07, "loss": 2.077, "step": 7050 }, { "epoch": 0.47538829554260303, "grad_norm": 5.229331173799964, "learning_rate": 6.287277799083466e-07, "loss": 2.012, "step": 7055 }, { "epoch": 0.47572521141470975, "grad_norm": 
5.118841243439973, "learning_rate": 6.281594651188693e-07, "loss": 1.9594, "step": 7060 }, { "epoch": 0.47606212728681646, "grad_norm": 4.675606454962784, "learning_rate": 6.275909730584431e-07, "loss": 2.0011, "step": 7065 }, { "epoch": 0.47639904315892323, "grad_norm": 4.825246680348631, "learning_rate": 6.270223045134095e-07, "loss": 2.0464, "step": 7070 }, { "epoch": 0.47673595903102994, "grad_norm": 5.176696350508615, "learning_rate": 6.264534602703546e-07, "loss": 1.9842, "step": 7075 }, { "epoch": 0.4770728749031367, "grad_norm": 5.171040256152323, "learning_rate": 6.25884441116107e-07, "loss": 2.076, "step": 7080 }, { "epoch": 0.4774097907752434, "grad_norm": 4.988716089460095, "learning_rate": 6.253152478377375e-07, "loss": 2.0036, "step": 7085 }, { "epoch": 0.47774670664735014, "grad_norm": 4.855486644215485, "learning_rate": 6.247458812225576e-07, "loss": 2.0727, "step": 7090 }, { "epoch": 0.4780836225194569, "grad_norm": 5.0335148203644895, "learning_rate": 6.241763420581188e-07, "loss": 2.0853, "step": 7095 }, { "epoch": 0.4784205383915636, "grad_norm": 4.7985511385883175, "learning_rate": 6.23606631132211e-07, "loss": 2.0113, "step": 7100 }, { "epoch": 0.47875745426367033, "grad_norm": 4.957033005946703, "learning_rate": 6.23036749232862e-07, "loss": 2.0429, "step": 7105 }, { "epoch": 0.4790943701357771, "grad_norm": 5.275968968440576, "learning_rate": 6.224666971483355e-07, "loss": 1.9709, "step": 7110 }, { "epoch": 0.4794312860078838, "grad_norm": 4.934792046937863, "learning_rate": 6.218964756671315e-07, "loss": 1.9856, "step": 7115 }, { "epoch": 0.4797682018799906, "grad_norm": 5.351455173541656, "learning_rate": 6.213260855779834e-07, "loss": 2.0022, "step": 7120 }, { "epoch": 0.4801051177520973, "grad_norm": 4.616571631834213, "learning_rate": 6.207555276698584e-07, "loss": 2.057, "step": 7125 }, { "epoch": 0.480442033624204, "grad_norm": 5.007062943846144, "learning_rate": 6.201848027319556e-07, "loss": 2.0486, "step": 7130 }, { "epoch": 
0.4807789494963108, "grad_norm": 4.855687612336531, "learning_rate": 6.196139115537054e-07, "loss": 2.0082, "step": 7135 }, { "epoch": 0.4811158653684175, "grad_norm": 4.968191760821689, "learning_rate": 6.190428549247677e-07, "loss": 2.0587, "step": 7140 }, { "epoch": 0.48145278124052426, "grad_norm": 4.741602677516124, "learning_rate": 6.184716336350316e-07, "loss": 2.0892, "step": 7145 }, { "epoch": 0.481789697112631, "grad_norm": 5.430393394737172, "learning_rate": 6.179002484746137e-07, "loss": 2.0445, "step": 7150 }, { "epoch": 0.4821266129847377, "grad_norm": 5.072481171348694, "learning_rate": 6.173287002338577e-07, "loss": 2.0311, "step": 7155 }, { "epoch": 0.48246352885684446, "grad_norm": 5.015849071736856, "learning_rate": 6.167569897033322e-07, "loss": 2.0033, "step": 7160 }, { "epoch": 0.4828004447289512, "grad_norm": 4.978096888855164, "learning_rate": 6.16185117673831e-07, "loss": 2.1031, "step": 7165 }, { "epoch": 0.48313736060105794, "grad_norm": 4.768429139789918, "learning_rate": 6.15613084936371e-07, "loss": 2.0857, "step": 7170 }, { "epoch": 0.48347427647316465, "grad_norm": 4.9412142029487605, "learning_rate": 6.150408922821911e-07, "loss": 2.0509, "step": 7175 }, { "epoch": 0.48381119234527137, "grad_norm": 4.99530459114919, "learning_rate": 6.144685405027518e-07, "loss": 2.0326, "step": 7180 }, { "epoch": 0.48414810821737814, "grad_norm": 5.203165580288118, "learning_rate": 6.138960303897335e-07, "loss": 2.0709, "step": 7185 }, { "epoch": 0.48448502408948485, "grad_norm": 5.071854910925447, "learning_rate": 6.133233627350355e-07, "loss": 1.9863, "step": 7190 }, { "epoch": 0.48482193996159156, "grad_norm": 4.590691917460898, "learning_rate": 6.127505383307754e-07, "loss": 2.0101, "step": 7195 }, { "epoch": 0.48515885583369833, "grad_norm": 4.840956780847529, "learning_rate": 6.121775579692873e-07, "loss": 2.095, "step": 7200 }, { "epoch": 0.48549577170580505, "grad_norm": 5.441258943006592, "learning_rate": 6.116044224431212e-07, "loss": 
2.0299, "step": 7205 }, { "epoch": 0.4858326875779118, "grad_norm": 5.038012876390466, "learning_rate": 6.110311325450416e-07, "loss": 1.9626, "step": 7210 }, { "epoch": 0.48616960345001853, "grad_norm": 5.296492348047248, "learning_rate": 6.104576890680263e-07, "loss": 2.0137, "step": 7215 }, { "epoch": 0.48650651932212524, "grad_norm": 5.110692221972856, "learning_rate": 6.098840928052663e-07, "loss": 2.0908, "step": 7220 }, { "epoch": 0.486843435194232, "grad_norm": 5.309671255778143, "learning_rate": 6.093103445501629e-07, "loss": 1.9721, "step": 7225 }, { "epoch": 0.4871803510663387, "grad_norm": 5.3492975577649915, "learning_rate": 6.087364450963286e-07, "loss": 2.0159, "step": 7230 }, { "epoch": 0.4875172669384455, "grad_norm": 4.903467952014379, "learning_rate": 6.081623952375843e-07, "loss": 2.0769, "step": 7235 }, { "epoch": 0.4878541828105522, "grad_norm": 5.016868297897709, "learning_rate": 6.075881957679593e-07, "loss": 2.035, "step": 7240 }, { "epoch": 0.4881910986826589, "grad_norm": 5.652220264514778, "learning_rate": 6.0701384748169e-07, "loss": 2.0571, "step": 7245 }, { "epoch": 0.4885280145547657, "grad_norm": 5.232803506869125, "learning_rate": 6.064393511732181e-07, "loss": 2.0704, "step": 7250 }, { "epoch": 0.4888649304268724, "grad_norm": 4.977675159369603, "learning_rate": 6.058647076371906e-07, "loss": 2.0333, "step": 7255 }, { "epoch": 0.48920184629897917, "grad_norm": 4.950972310797071, "learning_rate": 6.052899176684579e-07, "loss": 1.9798, "step": 7260 }, { "epoch": 0.4895387621710859, "grad_norm": 5.241450664595403, "learning_rate": 6.047149820620729e-07, "loss": 1.9988, "step": 7265 }, { "epoch": 0.4898756780431926, "grad_norm": 5.263234942640917, "learning_rate": 6.0413990161329e-07, "loss": 2.02, "step": 7270 }, { "epoch": 0.49021259391529937, "grad_norm": 4.753090279331476, "learning_rate": 6.035646771175642e-07, "loss": 2.0477, "step": 7275 }, { "epoch": 0.4905495097874061, "grad_norm": 4.948295702351988, "learning_rate": 
6.029893093705491e-07, "loss": 2.0203, "step": 7280 }, { "epoch": 0.4908864256595128, "grad_norm": 4.606675046358895, "learning_rate": 6.024137991680973e-07, "loss": 1.9521, "step": 7285 }, { "epoch": 0.49122334153161956, "grad_norm": 4.884145036407078, "learning_rate": 6.018381473062575e-07, "loss": 2.0647, "step": 7290 }, { "epoch": 0.4915602574037263, "grad_norm": 5.013387766620668, "learning_rate": 6.012623545812754e-07, "loss": 1.9481, "step": 7295 }, { "epoch": 0.49189717327583304, "grad_norm": 4.993248102240659, "learning_rate": 6.006864217895906e-07, "loss": 2.0586, "step": 7300 }, { "epoch": 0.49223408914793976, "grad_norm": 4.6721262137686725, "learning_rate": 6.001103497278369e-07, "loss": 2.0341, "step": 7305 }, { "epoch": 0.49257100502004647, "grad_norm": 5.691007291965467, "learning_rate": 5.995341391928408e-07, "loss": 2.0202, "step": 7310 }, { "epoch": 0.49290792089215324, "grad_norm": 4.942814793814035, "learning_rate": 5.9895779098162e-07, "loss": 2.0356, "step": 7315 }, { "epoch": 0.49324483676425995, "grad_norm": 4.912903006318696, "learning_rate": 5.983813058913829e-07, "loss": 1.9831, "step": 7320 }, { "epoch": 0.4935817526363667, "grad_norm": 4.602530881944509, "learning_rate": 5.978046847195272e-07, "loss": 2.0618, "step": 7325 }, { "epoch": 0.49391866850847344, "grad_norm": 5.328387866341911, "learning_rate": 5.97227928263639e-07, "loss": 2.0823, "step": 7330 }, { "epoch": 0.49425558438058015, "grad_norm": 4.90268292100006, "learning_rate": 5.96651037321491e-07, "loss": 2.0658, "step": 7335 }, { "epoch": 0.4945925002526869, "grad_norm": 4.828794636456112, "learning_rate": 5.960740126910425e-07, "loss": 2.0153, "step": 7340 }, { "epoch": 0.49492941612479363, "grad_norm": 5.0725376949573775, "learning_rate": 5.954968551704373e-07, "loss": 2.0506, "step": 7345 }, { "epoch": 0.4952663319969004, "grad_norm": 5.09418863108541, "learning_rate": 5.949195655580032e-07, "loss": 2.0046, "step": 7350 }, { "epoch": 0.4956032478690071, "grad_norm": 
5.591274833692999, "learning_rate": 5.943421446522509e-07, "loss": 2.0554, "step": 7355 }, { "epoch": 0.4959401637411138, "grad_norm": 5.132900907627522, "learning_rate": 5.93764593251872e-07, "loss": 2.0048, "step": 7360 }, { "epoch": 0.4962770796132206, "grad_norm": 5.697909016317603, "learning_rate": 5.931869121557397e-07, "loss": 2.0344, "step": 7365 }, { "epoch": 0.4966139954853273, "grad_norm": 5.104896943638169, "learning_rate": 5.926091021629055e-07, "loss": 1.9777, "step": 7370 }, { "epoch": 0.496950911357434, "grad_norm": 4.9037211944879315, "learning_rate": 5.920311640726e-07, "loss": 2.007, "step": 7375 }, { "epoch": 0.4972878272295408, "grad_norm": 5.090871360286154, "learning_rate": 5.914530986842307e-07, "loss": 2.0502, "step": 7380 }, { "epoch": 0.4976247431016475, "grad_norm": 5.20580874377348, "learning_rate": 5.908749067973809e-07, "loss": 2.0351, "step": 7385 }, { "epoch": 0.4979616589737543, "grad_norm": 4.859911961875739, "learning_rate": 5.902965892118093e-07, "loss": 2.075, "step": 7390 }, { "epoch": 0.498298574845861, "grad_norm": 4.659562281328448, "learning_rate": 5.89718146727448e-07, "loss": 2.0142, "step": 7395 }, { "epoch": 0.4986354907179677, "grad_norm": 5.064087504569598, "learning_rate": 5.891395801444026e-07, "loss": 2.0304, "step": 7400 }, { "epoch": 0.49897240659007447, "grad_norm": 5.3084171853108, "learning_rate": 5.885608902629496e-07, "loss": 1.9973, "step": 7405 }, { "epoch": 0.4993093224621812, "grad_norm": 5.127306472918799, "learning_rate": 5.879820778835364e-07, "loss": 2.0496, "step": 7410 }, { "epoch": 0.49964623833428795, "grad_norm": 5.356674449952426, "learning_rate": 5.874031438067799e-07, "loss": 2.0108, "step": 7415 }, { "epoch": 0.49998315420639466, "grad_norm": 5.1724110527582345, "learning_rate": 5.868240888334652e-07, "loss": 2.0278, "step": 7420 }, { "epoch": 0.5003200700785014, "grad_norm": 5.174139938095084, "learning_rate": 5.862449137645444e-07, "loss": 2.0383, "step": 7425 }, { "epoch": 
0.5006569859506081, "grad_norm": 5.234290786261978, "learning_rate": 5.856656194011365e-07, "loss": 1.9751, "step": 7430 }, { "epoch": 0.5009939018227149, "grad_norm": 5.179184659232474, "learning_rate": 5.850862065445243e-07, "loss": 1.9828, "step": 7435 }, { "epoch": 0.5013308176948216, "grad_norm": 5.221528970022932, "learning_rate": 5.845066759961557e-07, "loss": 2.0288, "step": 7440 }, { "epoch": 0.5016677335669283, "grad_norm": 4.9443576716691915, "learning_rate": 5.839270285576407e-07, "loss": 2.0735, "step": 7445 }, { "epoch": 0.5020046494390351, "grad_norm": 4.666854203628938, "learning_rate": 5.833472650307509e-07, "loss": 1.959, "step": 7450 }, { "epoch": 0.5023415653111418, "grad_norm": 5.095196737173311, "learning_rate": 5.827673862174192e-07, "loss": 2.0822, "step": 7455 }, { "epoch": 0.5026784811832485, "grad_norm": 4.810529178307664, "learning_rate": 5.821873929197371e-07, "loss": 2.021, "step": 7460 }, { "epoch": 0.5030153970553553, "grad_norm": 5.538458526685336, "learning_rate": 5.81607285939955e-07, "loss": 1.9898, "step": 7465 }, { "epoch": 0.503352312927462, "grad_norm": 5.537821936391307, "learning_rate": 5.810270660804805e-07, "loss": 2.0354, "step": 7470 }, { "epoch": 0.5036892287995688, "grad_norm": 4.742219457166583, "learning_rate": 5.80446734143877e-07, "loss": 1.9974, "step": 7475 }, { "epoch": 0.5040261446716755, "grad_norm": 5.2825810543553615, "learning_rate": 5.798662909328633e-07, "loss": 1.9805, "step": 7480 }, { "epoch": 0.5043630605437822, "grad_norm": 5.031944159521851, "learning_rate": 5.792857372503119e-07, "loss": 2.037, "step": 7485 }, { "epoch": 0.5046999764158889, "grad_norm": 5.4817190499465775, "learning_rate": 5.787050738992481e-07, "loss": 2.0374, "step": 7490 }, { "epoch": 0.5050368922879956, "grad_norm": 5.436498877681092, "learning_rate": 5.781243016828492e-07, "loss": 2.0341, "step": 7495 }, { "epoch": 0.5053738081601025, "grad_norm": 5.022507706551606, "learning_rate": 5.775434214044427e-07, "loss": 2.046, 
"step": 7500 }, { "epoch": 0.5057107240322092, "grad_norm": 5.089683906106651, "learning_rate": 5.769624338675057e-07, "loss": 2.0772, "step": 7505 }, { "epoch": 0.5060476399043159, "grad_norm": 5.041120067325264, "learning_rate": 5.763813398756637e-07, "loss": 2.0735, "step": 7510 }, { "epoch": 0.5063845557764226, "grad_norm": 4.767590985712124, "learning_rate": 5.758001402326895e-07, "loss": 2.0596, "step": 7515 }, { "epoch": 0.5067214716485293, "grad_norm": 5.2535341382809095, "learning_rate": 5.752188357425019e-07, "loss": 2.0637, "step": 7520 }, { "epoch": 0.5070583875206361, "grad_norm": 4.831361256667109, "learning_rate": 5.746374272091648e-07, "loss": 2.1135, "step": 7525 }, { "epoch": 0.5073953033927429, "grad_norm": 4.944377517786986, "learning_rate": 5.74055915436886e-07, "loss": 2.0439, "step": 7530 }, { "epoch": 0.5077322192648496, "grad_norm": 5.012612880105633, "learning_rate": 5.734743012300162e-07, "loss": 1.9907, "step": 7535 }, { "epoch": 0.5080691351369563, "grad_norm": 4.986276769861442, "learning_rate": 5.728925853930475e-07, "loss": 1.9905, "step": 7540 }, { "epoch": 0.508406051009063, "grad_norm": 4.9905850441067505, "learning_rate": 5.72310768730613e-07, "loss": 2.0256, "step": 7545 }, { "epoch": 0.5087429668811698, "grad_norm": 5.1769933778547275, "learning_rate": 5.717288520474849e-07, "loss": 2.0925, "step": 7550 }, { "epoch": 0.5090798827532765, "grad_norm": 4.788486182119264, "learning_rate": 5.711468361485739e-07, "loss": 2.0018, "step": 7555 }, { "epoch": 0.5094167986253832, "grad_norm": 5.12059348225927, "learning_rate": 5.70564721838928e-07, "loss": 2.0098, "step": 7560 }, { "epoch": 0.50975371449749, "grad_norm": 5.2728351208657775, "learning_rate": 5.69982509923731e-07, "loss": 2.045, "step": 7565 }, { "epoch": 0.5100906303695967, "grad_norm": 5.19375622953189, "learning_rate": 5.694002012083022e-07, "loss": 2.0906, "step": 7570 }, { "epoch": 0.5104275462417035, "grad_norm": 4.555147449904459, "learning_rate": 
5.688177964980946e-07, "loss": 1.9863, "step": 7575 }, { "epoch": 0.5107644621138102, "grad_norm": 5.0293556518139795, "learning_rate": 5.682352965986935e-07, "loss": 2.0155, "step": 7580 }, { "epoch": 0.5111013779859169, "grad_norm": 4.877403465769672, "learning_rate": 5.676527023158169e-07, "loss": 2.1141, "step": 7585 }, { "epoch": 0.5114382938580236, "grad_norm": 5.2503884215379575, "learning_rate": 5.670700144553122e-07, "loss": 2.0315, "step": 7590 }, { "epoch": 0.5117752097301304, "grad_norm": 5.244199172506322, "learning_rate": 5.664872338231571e-07, "loss": 2.0414, "step": 7595 }, { "epoch": 0.5121121256022371, "grad_norm": 4.66330609643992, "learning_rate": 5.659043612254573e-07, "loss": 2.0174, "step": 7600 }, { "epoch": 0.5124490414743439, "grad_norm": 5.180211027406709, "learning_rate": 5.653213974684455e-07, "loss": 1.9678, "step": 7605 }, { "epoch": 0.5127859573464506, "grad_norm": 4.9267174367065865, "learning_rate": 5.647383433584807e-07, "loss": 1.9793, "step": 7610 }, { "epoch": 0.5131228732185573, "grad_norm": 5.071239133274736, "learning_rate": 5.641551997020472e-07, "loss": 2.0391, "step": 7615 }, { "epoch": 0.513459789090664, "grad_norm": 4.9290064009363705, "learning_rate": 5.635719673057524e-07, "loss": 2.0341, "step": 7620 }, { "epoch": 0.5137967049627707, "grad_norm": 5.069961738825683, "learning_rate": 5.629886469763273e-07, "loss": 2.0175, "step": 7625 }, { "epoch": 0.5141336208348776, "grad_norm": 4.826266846927176, "learning_rate": 5.624052395206239e-07, "loss": 2.0365, "step": 7630 }, { "epoch": 0.5144705367069843, "grad_norm": 5.252682615711795, "learning_rate": 5.618217457456151e-07, "loss": 1.9976, "step": 7635 }, { "epoch": 0.514807452579091, "grad_norm": 5.026946376416928, "learning_rate": 5.612381664583928e-07, "loss": 2.0538, "step": 7640 }, { "epoch": 0.5151443684511977, "grad_norm": 4.984915045089128, "learning_rate": 5.606545024661674e-07, "loss": 2.0499, "step": 7645 }, { "epoch": 0.5154812843233044, "grad_norm": 
4.68512258417582, "learning_rate": 5.600707545762667e-07, "loss": 2.0239, "step": 7650 }, { "epoch": 0.5158182001954112, "grad_norm": 5.3499125680780875, "learning_rate": 5.594869235961342e-07, "loss": 2.0663, "step": 7655 }, { "epoch": 0.516155116067518, "grad_norm": 4.6268181929580665, "learning_rate": 5.589030103333282e-07, "loss": 2.0678, "step": 7660 }, { "epoch": 0.5164920319396247, "grad_norm": 4.968708800826974, "learning_rate": 5.583190155955215e-07, "loss": 1.9686, "step": 7665 }, { "epoch": 0.5168289478117314, "grad_norm": 5.269223210839975, "learning_rate": 5.57734940190499e-07, "loss": 1.9882, "step": 7670 }, { "epoch": 0.5171658636838381, "grad_norm": 5.000627199008308, "learning_rate": 5.571507849261572e-07, "loss": 1.9838, "step": 7675 }, { "epoch": 0.5175027795559449, "grad_norm": 5.070084693909336, "learning_rate": 5.565665506105035e-07, "loss": 1.9958, "step": 7680 }, { "epoch": 0.5178396954280516, "grad_norm": 4.960459165816171, "learning_rate": 5.559822380516539e-07, "loss": 2.0497, "step": 7685 }, { "epoch": 0.5181766113001584, "grad_norm": 5.606893788504318, "learning_rate": 5.553978480578335e-07, "loss": 2.0116, "step": 7690 }, { "epoch": 0.5185135271722651, "grad_norm": 4.654414387522544, "learning_rate": 5.548133814373738e-07, "loss": 1.9945, "step": 7695 }, { "epoch": 0.5188504430443718, "grad_norm": 5.167557729403105, "learning_rate": 5.542288389987128e-07, "loss": 2.0676, "step": 7700 }, { "epoch": 0.5191873589164786, "grad_norm": 4.803686421623489, "learning_rate": 5.536442215503929e-07, "loss": 2.0413, "step": 7705 }, { "epoch": 0.5195242747885853, "grad_norm": 5.326287322389414, "learning_rate": 5.530595299010606e-07, "loss": 2.0137, "step": 7710 }, { "epoch": 0.519861190660692, "grad_norm": 5.335455055445748, "learning_rate": 5.524747648594651e-07, "loss": 2.1001, "step": 7715 }, { "epoch": 0.5201981065327987, "grad_norm": 4.918471286630174, "learning_rate": 5.518899272344568e-07, "loss": 2.0153, "step": 7720 }, { "epoch": 
0.5205350224049055, "grad_norm": 5.068279670007689, "learning_rate": 5.513050178349866e-07, "loss": 2.0693, "step": 7725 }, { "epoch": 0.5208719382770123, "grad_norm": 5.020387423508526, "learning_rate": 5.507200374701048e-07, "loss": 2.0624, "step": 7730 }, { "epoch": 0.521208854149119, "grad_norm": 4.825790242071666, "learning_rate": 5.501349869489596e-07, "loss": 2.1059, "step": 7735 }, { "epoch": 0.5215457700212257, "grad_norm": 5.070481046675512, "learning_rate": 5.495498670807967e-07, "loss": 2.0179, "step": 7740 }, { "epoch": 0.5218826858933324, "grad_norm": 5.071155976309688, "learning_rate": 5.489646786749574e-07, "loss": 2.0206, "step": 7745 }, { "epoch": 0.5222196017654391, "grad_norm": 5.064959605956768, "learning_rate": 5.483794225408777e-07, "loss": 2.0301, "step": 7750 }, { "epoch": 0.522556517637546, "grad_norm": 4.983373572806031, "learning_rate": 5.477940994880877e-07, "loss": 1.9106, "step": 7755 }, { "epoch": 0.5228934335096527, "grad_norm": 4.883628120547198, "learning_rate": 5.472087103262094e-07, "loss": 2.0614, "step": 7760 }, { "epoch": 0.5232303493817594, "grad_norm": 4.85870296394926, "learning_rate": 5.46623255864957e-07, "loss": 2.0261, "step": 7765 }, { "epoch": 0.5235672652538661, "grad_norm": 4.703375862858396, "learning_rate": 5.460377369141345e-07, "loss": 2.0744, "step": 7770 }, { "epoch": 0.5239041811259728, "grad_norm": 5.080059430730117, "learning_rate": 5.454521542836351e-07, "loss": 1.996, "step": 7775 }, { "epoch": 0.5242410969980795, "grad_norm": 5.359374218843551, "learning_rate": 5.448665087834405e-07, "loss": 1.9851, "step": 7780 }, { "epoch": 0.5245780128701863, "grad_norm": 5.293717402951819, "learning_rate": 5.442808012236192e-07, "loss": 2.0036, "step": 7785 }, { "epoch": 0.5249149287422931, "grad_norm": 5.289746725314295, "learning_rate": 5.436950324143251e-07, "loss": 2.0528, "step": 7790 }, { "epoch": 0.5252518446143998, "grad_norm": 4.9148058330242215, "learning_rate": 5.431092031657973e-07, "loss": 2.0113, 
"step": 7795 }, { "epoch": 0.5255887604865065, "grad_norm": 4.613095144188298, "learning_rate": 5.425233142883585e-07, "loss": 2.0706, "step": 7800 }, { "epoch": 0.5259256763586132, "grad_norm": 4.781086908613256, "learning_rate": 5.419373665924136e-07, "loss": 2.0229, "step": 7805 }, { "epoch": 0.52626259223072, "grad_norm": 4.607289669957777, "learning_rate": 5.413513608884491e-07, "loss": 2.053, "step": 7810 }, { "epoch": 0.5265995081028267, "grad_norm": 4.9405711421144085, "learning_rate": 5.407652979870315e-07, "loss": 2.0199, "step": 7815 }, { "epoch": 0.5269364239749335, "grad_norm": 5.0139679000236255, "learning_rate": 5.401791786988068e-07, "loss": 2.0264, "step": 7820 }, { "epoch": 0.5272733398470402, "grad_norm": 4.704589459357222, "learning_rate": 5.395930038344986e-07, "loss": 2.0207, "step": 7825 }, { "epoch": 0.5276102557191469, "grad_norm": 4.817479091776891, "learning_rate": 5.390067742049073e-07, "loss": 1.9803, "step": 7830 }, { "epoch": 0.5279471715912537, "grad_norm": 5.311760697873457, "learning_rate": 5.384204906209097e-07, "loss": 1.971, "step": 7835 }, { "epoch": 0.5282840874633604, "grad_norm": 4.754689163353691, "learning_rate": 5.378341538934566e-07, "loss": 1.9541, "step": 7840 }, { "epoch": 0.5286210033354671, "grad_norm": 5.38599725624217, "learning_rate": 5.372477648335725e-07, "loss": 2.0254, "step": 7845 }, { "epoch": 0.5289579192075738, "grad_norm": 5.124320787844051, "learning_rate": 5.366613242523544e-07, "loss": 2.0099, "step": 7850 }, { "epoch": 0.5292948350796806, "grad_norm": 5.021078509445371, "learning_rate": 5.360748329609702e-07, "loss": 2.0284, "step": 7855 }, { "epoch": 0.5296317509517874, "grad_norm": 4.722730127972075, "learning_rate": 5.354882917706586e-07, "loss": 2.0655, "step": 7860 }, { "epoch": 0.5299686668238941, "grad_norm": 5.360667593659583, "learning_rate": 5.349017014927267e-07, "loss": 2.0743, "step": 7865 }, { "epoch": 0.5303055826960008, "grad_norm": 4.695417043103005, "learning_rate": 
5.343150629385496e-07, "loss": 2.0204, "step": 7870 }, { "epoch": 0.5306424985681075, "grad_norm": 5.425603216093319, "learning_rate": 5.337283769195696e-07, "loss": 1.922, "step": 7875 }, { "epoch": 0.5309794144402142, "grad_norm": 5.167009235797552, "learning_rate": 5.331416442472941e-07, "loss": 1.9218, "step": 7880 }, { "epoch": 0.5313163303123211, "grad_norm": 5.329814956936203, "learning_rate": 5.325548657332956e-07, "loss": 1.9633, "step": 7885 }, { "epoch": 0.5316532461844278, "grad_norm": 5.125495223189624, "learning_rate": 5.319680421892095e-07, "loss": 1.9725, "step": 7890 }, { "epoch": 0.5319901620565345, "grad_norm": 5.428211957488529, "learning_rate": 5.313811744267336e-07, "loss": 2.0222, "step": 7895 }, { "epoch": 0.5323270779286412, "grad_norm": 5.369081107054924, "learning_rate": 5.30794263257627e-07, "loss": 2.0302, "step": 7900 }, { "epoch": 0.5326639938007479, "grad_norm": 5.134244914925845, "learning_rate": 5.302073094937089e-07, "loss": 1.9857, "step": 7905 }, { "epoch": 0.5330009096728547, "grad_norm": 5.092718538544762, "learning_rate": 5.296203139468571e-07, "loss": 2.0181, "step": 7910 }, { "epoch": 0.5333378255449615, "grad_norm": 4.821779209250772, "learning_rate": 5.290332774290077e-07, "loss": 1.9919, "step": 7915 }, { "epoch": 0.5336747414170682, "grad_norm": 4.8967539645483855, "learning_rate": 5.284462007521528e-07, "loss": 2.0501, "step": 7920 }, { "epoch": 0.5340116572891749, "grad_norm": 4.703734427084029, "learning_rate": 5.278590847283407e-07, "loss": 1.9901, "step": 7925 }, { "epoch": 0.5343485731612816, "grad_norm": 5.035435336813803, "learning_rate": 5.27271930169674e-07, "loss": 2.0249, "step": 7930 }, { "epoch": 0.5346854890333884, "grad_norm": 5.001612279693521, "learning_rate": 5.266847378883079e-07, "loss": 2.0163, "step": 7935 }, { "epoch": 0.5350224049054951, "grad_norm": 5.259097416565819, "learning_rate": 5.260975086964507e-07, "loss": 2.0357, "step": 7940 }, { "epoch": 0.5353593207776018, "grad_norm": 
5.330653903376182, "learning_rate": 5.255102434063612e-07, "loss": 2.0327, "step": 7945 }, { "epoch": 0.5356962366497086, "grad_norm": 5.245745948764884, "learning_rate": 5.249229428303486e-07, "loss": 2.0625, "step": 7950 }, { "epoch": 0.5360331525218153, "grad_norm": 4.978956479009338, "learning_rate": 5.243356077807704e-07, "loss": 2.0448, "step": 7955 }, { "epoch": 0.536370068393922, "grad_norm": 4.422038900425824, "learning_rate": 5.237482390700319e-07, "loss": 2.0762, "step": 7960 }, { "epoch": 0.5367069842660288, "grad_norm": 4.877296011848847, "learning_rate": 5.231608375105852e-07, "loss": 1.941, "step": 7965 }, { "epoch": 0.5370439001381355, "grad_norm": 4.698488301643373, "learning_rate": 5.225734039149277e-07, "loss": 2.025, "step": 7970 }, { "epoch": 0.5373808160102422, "grad_norm": 4.440965429293552, "learning_rate": 5.219859390956012e-07, "loss": 1.9509, "step": 7975 }, { "epoch": 0.537717731882349, "grad_norm": 5.070448414681368, "learning_rate": 5.213984438651904e-07, "loss": 2.0159, "step": 7980 }, { "epoch": 0.5380546477544557, "grad_norm": 5.4711463971083365, "learning_rate": 5.208109190363222e-07, "loss": 2.0584, "step": 7985 }, { "epoch": 0.5383915636265625, "grad_norm": 5.01257966927944, "learning_rate": 5.202233654216649e-07, "loss": 2.0173, "step": 7990 }, { "epoch": 0.5387284794986692, "grad_norm": 5.593320423464432, "learning_rate": 5.196357838339259e-07, "loss": 1.9681, "step": 7995 }, { "epoch": 0.5390653953707759, "grad_norm": 4.697871251660223, "learning_rate": 5.190481750858516e-07, "loss": 2.0245, "step": 8000 }, { "epoch": 0.5394023112428826, "grad_norm": 5.201327767743673, "learning_rate": 5.184605399902262e-07, "loss": 2.0128, "step": 8005 }, { "epoch": 0.5397392271149893, "grad_norm": 4.831595313463434, "learning_rate": 5.178728793598699e-07, "loss": 2.0402, "step": 8010 }, { "epoch": 0.5400761429870962, "grad_norm": 5.283363608358076, "learning_rate": 5.172851940076387e-07, "loss": 2.0356, "step": 8015 }, { "epoch": 
0.5404130588592029, "grad_norm": 5.204367479264365, "learning_rate": 5.166974847464223e-07, "loss": 2.1018, "step": 8020 }, { "epoch": 0.5407499747313096, "grad_norm": 5.5399323442861075, "learning_rate": 5.161097523891437e-07, "loss": 2.0372, "step": 8025 }, { "epoch": 0.5410868906034163, "grad_norm": 4.986476967919727, "learning_rate": 5.15521997748758e-07, "loss": 2.0425, "step": 8030 }, { "epoch": 0.541423806475523, "grad_norm": 4.800267159223457, "learning_rate": 5.149342216382511e-07, "loss": 1.9485, "step": 8035 }, { "epoch": 0.5417607223476298, "grad_norm": 4.983989870871522, "learning_rate": 5.143464248706381e-07, "loss": 2.0066, "step": 8040 }, { "epoch": 0.5420976382197366, "grad_norm": 5.158730934215813, "learning_rate": 5.137586082589633e-07, "loss": 2.0203, "step": 8045 }, { "epoch": 0.5424345540918433, "grad_norm": 5.678953153669385, "learning_rate": 5.131707726162983e-07, "loss": 2.0355, "step": 8050 }, { "epoch": 0.54277146996395, "grad_norm": 4.9388313391444765, "learning_rate": 5.125829187557406e-07, "loss": 1.9979, "step": 8055 }, { "epoch": 0.5431083858360567, "grad_norm": 4.798693162684559, "learning_rate": 5.119950474904137e-07, "loss": 2.0662, "step": 8060 }, { "epoch": 0.5434453017081635, "grad_norm": 4.629129746806751, "learning_rate": 5.114071596334642e-07, "loss": 1.9588, "step": 8065 }, { "epoch": 0.5437822175802702, "grad_norm": 5.0581342209116125, "learning_rate": 5.108192559980623e-07, "loss": 2.0508, "step": 8070 }, { "epoch": 0.544119133452377, "grad_norm": 4.884394013553745, "learning_rate": 5.102313373974e-07, "loss": 2.0875, "step": 8075 }, { "epoch": 0.5444560493244837, "grad_norm": 5.396822420842555, "learning_rate": 5.096434046446898e-07, "loss": 1.9953, "step": 8080 }, { "epoch": 0.5447929651965904, "grad_norm": 4.499272995234767, "learning_rate": 5.090554585531639e-07, "loss": 1.9854, "step": 8085 }, { "epoch": 0.5451298810686972, "grad_norm": 5.323945508689434, "learning_rate": 5.084674999360729e-07, "loss": 2.0044, 
"step": 8090 }, { "epoch": 0.5454667969408039, "grad_norm": 4.776264048892363, "learning_rate": 5.078795296066846e-07, "loss": 2.1003, "step": 8095 }, { "epoch": 0.5458037128129106, "grad_norm": 4.848672052240094, "learning_rate": 5.072915483782833e-07, "loss": 1.9133, "step": 8100 }, { "epoch": 0.5461406286850173, "grad_norm": 4.950938218302901, "learning_rate": 5.067035570641678e-07, "loss": 2.0212, "step": 8105 }, { "epoch": 0.546477544557124, "grad_norm": 5.173799253415026, "learning_rate": 5.061155564776517e-07, "loss": 2.0112, "step": 8110 }, { "epoch": 0.5468144604292309, "grad_norm": 4.815297329344623, "learning_rate": 5.055275474320609e-07, "loss": 2.0363, "step": 8115 }, { "epoch": 0.5471513763013376, "grad_norm": 5.079892697551929, "learning_rate": 5.049395307407328e-07, "loss": 2.0342, "step": 8120 }, { "epoch": 0.5474882921734443, "grad_norm": 4.886019749651465, "learning_rate": 5.04351507217016e-07, "loss": 1.9712, "step": 8125 }, { "epoch": 0.547825208045551, "grad_norm": 5.408940300849208, "learning_rate": 5.03763477674268e-07, "loss": 1.9588, "step": 8130 }, { "epoch": 0.5481621239176577, "grad_norm": 5.165500527376248, "learning_rate": 5.031754429258549e-07, "loss": 2.0641, "step": 8135 }, { "epoch": 0.5484990397897644, "grad_norm": 5.058956974488242, "learning_rate": 5.025874037851499e-07, "loss": 2.0172, "step": 8140 }, { "epoch": 0.5488359556618713, "grad_norm": 4.785348744608888, "learning_rate": 5.019993610655322e-07, "loss": 2.0127, "step": 8145 }, { "epoch": 0.549172871533978, "grad_norm": 4.885215012846232, "learning_rate": 5.014113155803863e-07, "loss": 2.0193, "step": 8150 }, { "epoch": 0.5495097874060847, "grad_norm": 4.817866186503319, "learning_rate": 5.008232681430999e-07, "loss": 2.0387, "step": 8155 }, { "epoch": 0.5498467032781914, "grad_norm": 5.315137719910037, "learning_rate": 5.002352195670643e-07, "loss": 1.9082, "step": 8160 }, { "epoch": 0.5501836191502981, "grad_norm": 5.459513616268638, "learning_rate": 
4.996471706656715e-07, "loss": 2.0102, "step": 8165 }, { "epoch": 0.5505205350224049, "grad_norm": 5.18799154025916, "learning_rate": 4.990591222523142e-07, "loss": 2.0286, "step": 8170 }, { "epoch": 0.5508574508945117, "grad_norm": 4.858063023096369, "learning_rate": 4.984710751403849e-07, "loss": 2.011, "step": 8175 }, { "epoch": 0.5511943667666184, "grad_norm": 4.924140479816808, "learning_rate": 4.978830301432738e-07, "loss": 2.0792, "step": 8180 }, { "epoch": 0.5515312826387251, "grad_norm": 5.148798808425791, "learning_rate": 4.97294988074368e-07, "loss": 2.0616, "step": 8185 }, { "epoch": 0.5518681985108318, "grad_norm": 5.409344864157067, "learning_rate": 4.96706949747051e-07, "loss": 1.9716, "step": 8190 }, { "epoch": 0.5522051143829386, "grad_norm": 4.89853617873245, "learning_rate": 4.961189159747015e-07, "loss": 2.0325, "step": 8195 }, { "epoch": 0.5525420302550453, "grad_norm": 4.954248015494013, "learning_rate": 4.955308875706905e-07, "loss": 2.0489, "step": 8200 }, { "epoch": 0.552878946127152, "grad_norm": 5.3936730363029595, "learning_rate": 4.94942865348383e-07, "loss": 2.0245, "step": 8205 }, { "epoch": 0.5532158619992588, "grad_norm": 4.995010550688151, "learning_rate": 4.943548501211351e-07, "loss": 2.0257, "step": 8210 }, { "epoch": 0.5535527778713655, "grad_norm": 5.3583927255617905, "learning_rate": 4.937668427022924e-07, "loss": 2.0173, "step": 8215 }, { "epoch": 0.5538896937434723, "grad_norm": 5.178442946029681, "learning_rate": 4.931788439051909e-07, "loss": 1.9738, "step": 8220 }, { "epoch": 0.554226609615579, "grad_norm": 5.381049748975195, "learning_rate": 4.925908545431537e-07, "loss": 1.9885, "step": 8225 }, { "epoch": 0.5545635254876857, "grad_norm": 5.303934311983132, "learning_rate": 4.920028754294915e-07, "loss": 2.0037, "step": 8230 }, { "epoch": 0.5549004413597924, "grad_norm": 5.271527173977801, "learning_rate": 4.914149073775003e-07, "loss": 2.0402, "step": 8235 }, { "epoch": 0.5552373572318992, "grad_norm": 
4.99464687217817, "learning_rate": 4.908269512004613e-07, "loss": 1.9864, "step": 8240 }, { "epoch": 0.555574273104006, "grad_norm": 5.196263312628663, "learning_rate": 4.902390077116392e-07, "loss": 2.0095, "step": 8245 }, { "epoch": 0.5559111889761127, "grad_norm": 5.088219440511273, "learning_rate": 4.896510777242805e-07, "loss": 1.961, "step": 8250 }, { "epoch": 0.5562481048482194, "grad_norm": 4.935112784324173, "learning_rate": 4.890631620516141e-07, "loss": 2.002, "step": 8255 }, { "epoch": 0.5565850207203261, "grad_norm": 5.341778148017211, "learning_rate": 4.88475261506848e-07, "loss": 1.978, "step": 8260 }, { "epoch": 0.5569219365924328, "grad_norm": 5.057522716955263, "learning_rate": 4.878873769031702e-07, "loss": 2.0597, "step": 8265 }, { "epoch": 0.5572588524645397, "grad_norm": 4.846919039630265, "learning_rate": 4.872995090537459e-07, "loss": 1.977, "step": 8270 }, { "epoch": 0.5575957683366464, "grad_norm": 5.148897883564697, "learning_rate": 4.867116587717179e-07, "loss": 2.0432, "step": 8275 }, { "epoch": 0.5579326842087531, "grad_norm": 4.654776191557691, "learning_rate": 4.861238268702039e-07, "loss": 2.0251, "step": 8280 }, { "epoch": 0.5582696000808598, "grad_norm": 5.47892592537155, "learning_rate": 4.855360141622965e-07, "loss": 2.0421, "step": 8285 }, { "epoch": 0.5586065159529665, "grad_norm": 4.667522858187585, "learning_rate": 4.849482214610623e-07, "loss": 2.0072, "step": 8290 }, { "epoch": 0.5589434318250733, "grad_norm": 4.656568794617416, "learning_rate": 4.843604495795392e-07, "loss": 2.0355, "step": 8295 }, { "epoch": 0.55928034769718, "grad_norm": 5.231817761720895, "learning_rate": 4.83772699330737e-07, "loss": 1.9822, "step": 8300 }, { "epoch": 0.5596172635692868, "grad_norm": 5.215241230463362, "learning_rate": 4.831849715276355e-07, "loss": 2.0017, "step": 8305 }, { "epoch": 0.5599541794413935, "grad_norm": 5.697033393247015, "learning_rate": 4.825972669831834e-07, "loss": 1.9834, "step": 8310 }, { "epoch": 
0.5602910953135002, "grad_norm": 5.38712463918039, "learning_rate": 4.82009586510297e-07, "loss": 1.9934, "step": 8315 }, { "epoch": 0.5606280111856069, "grad_norm": 7.144794392543247, "learning_rate": 4.814219309218594e-07, "loss": 2.0025, "step": 8320 }, { "epoch": 0.5609649270577137, "grad_norm": 4.544316483002753, "learning_rate": 4.808343010307199e-07, "loss": 2.0733, "step": 8325 }, { "epoch": 0.5613018429298204, "grad_norm": 4.967155635855581, "learning_rate": 4.802466976496911e-07, "loss": 2.0114, "step": 8330 }, { "epoch": 0.5616387588019272, "grad_norm": 5.175722201939003, "learning_rate": 4.796591215915498e-07, "loss": 2.0286, "step": 8335 }, { "epoch": 0.5619756746740339, "grad_norm": 5.077988001163838, "learning_rate": 4.79071573669035e-07, "loss": 2.0467, "step": 8340 }, { "epoch": 0.5623125905461406, "grad_norm": 4.8577658282132985, "learning_rate": 4.784840546948463e-07, "loss": 1.9849, "step": 8345 }, { "epoch": 0.5626495064182474, "grad_norm": 5.428095760257577, "learning_rate": 4.778965654816435e-07, "loss": 1.9667, "step": 8350 }, { "epoch": 0.5629864222903541, "grad_norm": 5.067181477193191, "learning_rate": 4.773091068420455e-07, "loss": 2.0563, "step": 8355 }, { "epoch": 0.5633233381624608, "grad_norm": 4.890544467003986, "learning_rate": 4.767216795886281e-07, "loss": 2.0253, "step": 8360 }, { "epoch": 0.5636602540345675, "grad_norm": 5.06550293433752, "learning_rate": 4.761342845339246e-07, "loss": 2.0333, "step": 8365 }, { "epoch": 0.5639971699066743, "grad_norm": 5.397038559720261, "learning_rate": 4.7554692249042345e-07, "loss": 2.0742, "step": 8370 }, { "epoch": 0.5643340857787811, "grad_norm": 5.063450048987993, "learning_rate": 4.7495959427056754e-07, "loss": 2.0367, "step": 8375 }, { "epoch": 0.5646710016508878, "grad_norm": 4.738802076978484, "learning_rate": 4.743723006867523e-07, "loss": 1.9934, "step": 8380 }, { "epoch": 0.5650079175229945, "grad_norm": 5.000329914457404, "learning_rate": 4.737850425513263e-07, "loss": 2.0305, 
"step": 8385 }, { "epoch": 0.5653448333951012, "grad_norm": 4.700902569105418, "learning_rate": 4.731978206765884e-07, "loss": 2.0234, "step": 8390 }, { "epoch": 0.5656817492672079, "grad_norm": 5.166750547015565, "learning_rate": 4.726106358747871e-07, "loss": 2.0263, "step": 8395 }, { "epoch": 0.5660186651393148, "grad_norm": 4.84838352057581, "learning_rate": 4.720234889581203e-07, "loss": 2.0753, "step": 8400 }, { "epoch": 0.5663555810114215, "grad_norm": 5.336357803080385, "learning_rate": 4.714363807387333e-07, "loss": 2.0465, "step": 8405 }, { "epoch": 0.5666924968835282, "grad_norm": 5.045126929543965, "learning_rate": 4.708493120287175e-07, "loss": 2.056, "step": 8410 }, { "epoch": 0.5670294127556349, "grad_norm": 4.739658052505435, "learning_rate": 4.7026228364010984e-07, "loss": 2.0046, "step": 8415 }, { "epoch": 0.5673663286277416, "grad_norm": 4.813450361780777, "learning_rate": 4.69675296384892e-07, "loss": 2.0566, "step": 8420 }, { "epoch": 0.5677032444998484, "grad_norm": 4.8085826161928615, "learning_rate": 4.6908835107498775e-07, "loss": 2.0363, "step": 8425 }, { "epoch": 0.5680401603719551, "grad_norm": 4.8447614283772245, "learning_rate": 4.685014485222637e-07, "loss": 2.0088, "step": 8430 }, { "epoch": 0.5683770762440619, "grad_norm": 4.933815979602967, "learning_rate": 4.679145895385269e-07, "loss": 2.0199, "step": 8435 }, { "epoch": 0.5687139921161686, "grad_norm": 5.399896078205648, "learning_rate": 4.673277749355245e-07, "loss": 1.9199, "step": 8440 }, { "epoch": 0.5690509079882753, "grad_norm": 5.623504318987718, "learning_rate": 4.667410055249417e-07, "loss": 1.962, "step": 8445 }, { "epoch": 0.5693878238603821, "grad_norm": 5.262602180027032, "learning_rate": 4.6615428211840154e-07, "loss": 1.9831, "step": 8450 }, { "epoch": 0.5697247397324888, "grad_norm": 5.092634703287803, "learning_rate": 4.655676055274637e-07, "loss": 2.0438, "step": 8455 }, { "epoch": 0.5700616556045955, "grad_norm": 5.224671289299266, "learning_rate": 
4.6498097656362247e-07, "loss": 2.0243, "step": 8460 }, { "epoch": 0.5703985714767023, "grad_norm": 5.375325145634491, "learning_rate": 4.643943960383067e-07, "loss": 1.9328, "step": 8465 }, { "epoch": 0.570735487348809, "grad_norm": 5.065757600239244, "learning_rate": 4.638078647628782e-07, "loss": 2.1276, "step": 8470 }, { "epoch": 0.5710724032209158, "grad_norm": 5.017528828693995, "learning_rate": 4.632213835486305e-07, "loss": 2.0531, "step": 8475 }, { "epoch": 0.5714093190930225, "grad_norm": 4.924453996648473, "learning_rate": 4.626349532067879e-07, "loss": 2.0608, "step": 8480 }, { "epoch": 0.5717462349651292, "grad_norm": 4.961731922620141, "learning_rate": 4.620485745485046e-07, "loss": 1.9766, "step": 8485 }, { "epoch": 0.5720831508372359, "grad_norm": 5.000693004299178, "learning_rate": 4.6146224838486287e-07, "loss": 2.0829, "step": 8490 }, { "epoch": 0.5724200667093426, "grad_norm": 4.810180573514605, "learning_rate": 4.6087597552687275e-07, "loss": 2.0524, "step": 8495 }, { "epoch": 0.5727569825814494, "grad_norm": 4.9384623897079125, "learning_rate": 4.602897567854705e-07, "loss": 2.011, "step": 8500 }, { "epoch": 0.5730938984535562, "grad_norm": 5.042775406762451, "learning_rate": 4.5970359297151733e-07, "loss": 2.0019, "step": 8505 }, { "epoch": 0.5734308143256629, "grad_norm": 4.849105252963638, "learning_rate": 4.591174848957986e-07, "loss": 2.0438, "step": 8510 }, { "epoch": 0.5737677301977696, "grad_norm": 5.037836827226458, "learning_rate": 4.585314333690224e-07, "loss": 2.0244, "step": 8515 }, { "epoch": 0.5741046460698763, "grad_norm": 4.697970201052526, "learning_rate": 4.579454392018192e-07, "loss": 1.9795, "step": 8520 }, { "epoch": 0.574441561941983, "grad_norm": 4.507023392152208, "learning_rate": 4.5735950320473915e-07, "loss": 1.9882, "step": 8525 }, { "epoch": 0.5747784778140899, "grad_norm": 4.899824645898042, "learning_rate": 4.5677362618825265e-07, "loss": 1.9755, "step": 8530 }, { "epoch": 0.5751153936861966, "grad_norm": 
5.040894854279222, "learning_rate": 4.5618780896274866e-07, "loss": 2.0095, "step": 8535 }, { "epoch": 0.5754523095583033, "grad_norm": 5.324067929878832, "learning_rate": 4.556020523385326e-07, "loss": 1.951, "step": 8540 }, { "epoch": 0.57578922543041, "grad_norm": 5.107807759544019, "learning_rate": 4.55016357125827e-07, "loss": 1.9826, "step": 8545 }, { "epoch": 0.5761261413025167, "grad_norm": 4.961088251326319, "learning_rate": 4.5443072413476877e-07, "loss": 1.9784, "step": 8550 }, { "epoch": 0.5764630571746235, "grad_norm": 5.141455920571979, "learning_rate": 4.5384515417540914e-07, "loss": 2.004, "step": 8555 }, { "epoch": 0.5767999730467303, "grad_norm": 4.793096890060267, "learning_rate": 4.5325964805771187e-07, "loss": 2.0233, "step": 8560 }, { "epoch": 0.577136888918837, "grad_norm": 5.155519612210372, "learning_rate": 4.526742065915528e-07, "loss": 1.9431, "step": 8565 }, { "epoch": 0.5774738047909437, "grad_norm": 5.014398606967903, "learning_rate": 4.520888305867181e-07, "loss": 1.9825, "step": 8570 }, { "epoch": 0.5778107206630504, "grad_norm": 5.133013571062793, "learning_rate": 4.5150352085290315e-07, "loss": 2.0436, "step": 8575 }, { "epoch": 0.5781476365351572, "grad_norm": 5.08274579946172, "learning_rate": 4.5091827819971207e-07, "loss": 2.0482, "step": 8580 }, { "epoch": 0.5784845524072639, "grad_norm": 4.94015386837617, "learning_rate": 4.503331034366563e-07, "loss": 2.009, "step": 8585 }, { "epoch": 0.5788214682793706, "grad_norm": 4.885893365264294, "learning_rate": 4.4974799737315274e-07, "loss": 1.9717, "step": 8590 }, { "epoch": 0.5791583841514774, "grad_norm": 4.741569455698651, "learning_rate": 4.491629608185237e-07, "loss": 2.0226, "step": 8595 }, { "epoch": 0.5794953000235841, "grad_norm": 4.850212201543367, "learning_rate": 4.485779945819956e-07, "loss": 2.0107, "step": 8600 }, { "epoch": 0.5798322158956909, "grad_norm": 4.865074672482193, "learning_rate": 4.479930994726968e-07, "loss": 1.9722, "step": 8605 }, { "epoch": 
0.5801691317677976, "grad_norm": 4.957747821863177, "learning_rate": 4.474082762996581e-07, "loss": 2.0587, "step": 8610 }, { "epoch": 0.5805060476399043, "grad_norm": 4.962590208494777, "learning_rate": 4.468235258718105e-07, "loss": 2.0494, "step": 8615 }, { "epoch": 0.580842963512011, "grad_norm": 4.807030985825038, "learning_rate": 4.4623884899798397e-07, "loss": 2.0185, "step": 8620 }, { "epoch": 0.5811798793841177, "grad_norm": 5.771940381476685, "learning_rate": 4.4565424648690743e-07, "loss": 1.9669, "step": 8625 }, { "epoch": 0.5815167952562246, "grad_norm": 5.369108236299518, "learning_rate": 4.450697191472067e-07, "loss": 1.9726, "step": 8630 }, { "epoch": 0.5818537111283313, "grad_norm": 4.867784969954882, "learning_rate": 4.4448526778740327e-07, "loss": 1.9687, "step": 8635 }, { "epoch": 0.582190627000438, "grad_norm": 5.4013836564456685, "learning_rate": 4.439008932159138e-07, "loss": 2.0535, "step": 8640 }, { "epoch": 0.5825275428725447, "grad_norm": 5.221401961320815, "learning_rate": 4.4331659624104876e-07, "loss": 2.1037, "step": 8645 }, { "epoch": 0.5828644587446514, "grad_norm": 5.366372996473107, "learning_rate": 4.427323776710117e-07, "loss": 2.0535, "step": 8650 }, { "epoch": 0.5832013746167583, "grad_norm": 5.190860149298861, "learning_rate": 4.4214823831389663e-07, "loss": 2.0365, "step": 8655 }, { "epoch": 0.583538290488865, "grad_norm": 4.955415690065836, "learning_rate": 4.41564178977689e-07, "loss": 1.9404, "step": 8660 }, { "epoch": 0.5838752063609717, "grad_norm": 4.875765665712272, "learning_rate": 4.4098020047026343e-07, "loss": 1.9684, "step": 8665 }, { "epoch": 0.5842121222330784, "grad_norm": 5.17630313166666, "learning_rate": 4.4039630359938194e-07, "loss": 1.9597, "step": 8670 }, { "epoch": 0.5845490381051851, "grad_norm": 5.0515367078739155, "learning_rate": 4.3981248917269477e-07, "loss": 2.0549, "step": 8675 }, { "epoch": 0.5848859539772918, "grad_norm": 4.791441806855084, "learning_rate": 4.3922875799773735e-07, "loss": 
1.9848, "step": 8680 }, { "epoch": 0.5852228698493986, "grad_norm": 4.870111935336453, "learning_rate": 4.386451108819302e-07, "loss": 1.9692, "step": 8685 }, { "epoch": 0.5855597857215054, "grad_norm": 5.306353144856448, "learning_rate": 4.380615486325774e-07, "loss": 2.0053, "step": 8690 }, { "epoch": 0.5858967015936121, "grad_norm": 4.613582357858742, "learning_rate": 4.3747807205686616e-07, "loss": 1.9636, "step": 8695 }, { "epoch": 0.5862336174657188, "grad_norm": 4.318006726461526, "learning_rate": 4.3689468196186433e-07, "loss": 1.9629, "step": 8700 }, { "epoch": 0.5865705333378255, "grad_norm": 5.081294642997203, "learning_rate": 4.36311379154521e-07, "loss": 2.0623, "step": 8705 }, { "epoch": 0.5869074492099323, "grad_norm": 5.283955632851207, "learning_rate": 4.3572816444166406e-07, "loss": 2.0167, "step": 8710 }, { "epoch": 0.587244365082039, "grad_norm": 5.088484452965513, "learning_rate": 4.351450386299996e-07, "loss": 1.9339, "step": 8715 }, { "epoch": 0.5875812809541457, "grad_norm": 5.174064041415016, "learning_rate": 4.3456200252611075e-07, "loss": 2.0178, "step": 8720 }, { "epoch": 0.5879181968262525, "grad_norm": 4.736504445245447, "learning_rate": 4.3397905693645653e-07, "loss": 2.04, "step": 8725 }, { "epoch": 0.5882551126983592, "grad_norm": 5.366821900422264, "learning_rate": 4.3339620266737116e-07, "loss": 1.9902, "step": 8730 }, { "epoch": 0.588592028570466, "grad_norm": 5.0438051187723305, "learning_rate": 4.328134405250617e-07, "loss": 1.9413, "step": 8735 }, { "epoch": 0.5889289444425727, "grad_norm": 5.4035222667832725, "learning_rate": 4.322307713156085e-07, "loss": 1.974, "step": 8740 }, { "epoch": 0.5892658603146794, "grad_norm": 5.147842501406032, "learning_rate": 4.316481958449634e-07, "loss": 1.9493, "step": 8745 }, { "epoch": 0.5896027761867861, "grad_norm": 4.643193843745066, "learning_rate": 4.310657149189478e-07, "loss": 2.0555, "step": 8750 }, { "epoch": 0.5899396920588929, "grad_norm": 4.7426480340082895, "learning_rate": 
4.3048332934325325e-07, "loss": 2.0433, "step": 8755 }, { "epoch": 0.5902766079309997, "grad_norm": 5.561806666997551, "learning_rate": 4.2990103992343893e-07, "loss": 1.9919, "step": 8760 }, { "epoch": 0.5906135238031064, "grad_norm": 4.7688768212785115, "learning_rate": 4.2931884746493107e-07, "loss": 2.0694, "step": 8765 }, { "epoch": 0.5909504396752131, "grad_norm": 5.279987843588366, "learning_rate": 4.287367527730216e-07, "loss": 1.9952, "step": 8770 }, { "epoch": 0.5912873555473198, "grad_norm": 4.931860955838191, "learning_rate": 4.2815475665286766e-07, "loss": 2.0268, "step": 8775 }, { "epoch": 0.5916242714194265, "grad_norm": 5.778746204711135, "learning_rate": 4.2757285990948993e-07, "loss": 2.0232, "step": 8780 }, { "epoch": 0.5919611872915334, "grad_norm": 5.17971288419395, "learning_rate": 4.269910633477711e-07, "loss": 1.9739, "step": 8785 }, { "epoch": 0.5922981031636401, "grad_norm": 4.759519846297744, "learning_rate": 4.264093677724561e-07, "loss": 1.9927, "step": 8790 }, { "epoch": 0.5926350190357468, "grad_norm": 4.76982826048163, "learning_rate": 4.2582777398814966e-07, "loss": 1.9503, "step": 8795 }, { "epoch": 0.5929719349078535, "grad_norm": 4.853995031061081, "learning_rate": 4.252462827993158e-07, "loss": 1.994, "step": 8800 }, { "epoch": 0.5933088507799602, "grad_norm": 5.1953440639163055, "learning_rate": 4.246648950102765e-07, "loss": 2.0735, "step": 8805 }, { "epoch": 0.593645766652067, "grad_norm": 4.959383646386358, "learning_rate": 4.240836114252112e-07, "loss": 2.0273, "step": 8810 }, { "epoch": 0.5939826825241737, "grad_norm": 4.85655798486154, "learning_rate": 4.2350243284815445e-07, "loss": 1.9796, "step": 8815 }, { "epoch": 0.5943195983962805, "grad_norm": 5.1090661709367335, "learning_rate": 4.229213600829963e-07, "loss": 2.0174, "step": 8820 }, { "epoch": 0.5946565142683872, "grad_norm": 4.635902536386741, "learning_rate": 4.223403939334802e-07, "loss": 2.0215, "step": 8825 }, { "epoch": 0.5949934301404939, "grad_norm": 
5.678496109731297, "learning_rate": 4.217595352032017e-07, "loss": 2.0313, "step": 8830 }, { "epoch": 0.5953303460126007, "grad_norm": 5.193705348534971, "learning_rate": 4.2117878469560834e-07, "loss": 1.9691, "step": 8835 }, { "epoch": 0.5956672618847074, "grad_norm": 4.503234766844775, "learning_rate": 4.205981432139978e-07, "loss": 1.9744, "step": 8840 }, { "epoch": 0.5960041777568141, "grad_norm": 5.124858165682876, "learning_rate": 4.200176115615169e-07, "loss": 2.0368, "step": 8845 }, { "epoch": 0.5963410936289208, "grad_norm": 5.133836766597673, "learning_rate": 4.1943719054116027e-07, "loss": 1.9918, "step": 8850 }, { "epoch": 0.5966780095010276, "grad_norm": 5.29678652785177, "learning_rate": 4.1885688095577e-07, "loss": 1.9424, "step": 8855 }, { "epoch": 0.5970149253731343, "grad_norm": 4.736171343046466, "learning_rate": 4.182766836080339e-07, "loss": 1.9613, "step": 8860 }, { "epoch": 0.5973518412452411, "grad_norm": 5.359828195897729, "learning_rate": 4.176965993004842e-07, "loss": 2.0228, "step": 8865 }, { "epoch": 0.5976887571173478, "grad_norm": 5.254730886693509, "learning_rate": 4.171166288354971e-07, "loss": 2.0179, "step": 8870 }, { "epoch": 0.5980256729894545, "grad_norm": 4.7443915518976425, "learning_rate": 4.165367730152917e-07, "loss": 2.02, "step": 8875 }, { "epoch": 0.5983625888615612, "grad_norm": 5.589282776431786, "learning_rate": 4.1595703264192737e-07, "loss": 2.0417, "step": 8880 }, { "epoch": 0.598699504733668, "grad_norm": 4.784335165517703, "learning_rate": 4.15377408517305e-07, "loss": 2.0069, "step": 8885 }, { "epoch": 0.5990364206057748, "grad_norm": 5.15406865432379, "learning_rate": 4.147979014431642e-07, "loss": 1.9967, "step": 8890 }, { "epoch": 0.5993733364778815, "grad_norm": 5.32744806296277, "learning_rate": 4.142185122210823e-07, "loss": 2.0053, "step": 8895 }, { "epoch": 0.5997102523499882, "grad_norm": 5.097261502129905, "learning_rate": 4.136392416524742e-07, "loss": 2.0314, "step": 8900 }, { "epoch": 
0.6000471682220949, "grad_norm": 5.119719947848269, "learning_rate": 4.1306009053859043e-07, "loss": 1.9899, "step": 8905 }, { "epoch": 0.6003840840942016, "grad_norm": 5.100826010102629, "learning_rate": 4.124810596805166e-07, "loss": 2.0075, "step": 8910 }, { "epoch": 0.6007209999663085, "grad_norm": 4.810671289783185, "learning_rate": 4.119021498791712e-07, "loss": 2.0877, "step": 8915 }, { "epoch": 0.6010579158384152, "grad_norm": 4.713277057499714, "learning_rate": 4.113233619353062e-07, "loss": 1.9927, "step": 8920 }, { "epoch": 0.6013948317105219, "grad_norm": 5.138042276655243, "learning_rate": 4.107446966495044e-07, "loss": 2.0363, "step": 8925 }, { "epoch": 0.6017317475826286, "grad_norm": 4.867043264230542, "learning_rate": 4.101661548221792e-07, "loss": 2.0346, "step": 8930 }, { "epoch": 0.6020686634547353, "grad_norm": 5.40335949419558, "learning_rate": 4.0958773725357297e-07, "loss": 1.9909, "step": 8935 }, { "epoch": 0.6024055793268421, "grad_norm": 5.036248271348243, "learning_rate": 4.0900944474375674e-07, "loss": 2.0102, "step": 8940 }, { "epoch": 0.6027424951989488, "grad_norm": 5.062823799234065, "learning_rate": 4.084312780926279e-07, "loss": 1.9669, "step": 8945 }, { "epoch": 0.6030794110710556, "grad_norm": 4.950399536179573, "learning_rate": 4.0785323809991006e-07, "loss": 2.0411, "step": 8950 }, { "epoch": 0.6034163269431623, "grad_norm": 5.1503136987342835, "learning_rate": 4.072753255651521e-07, "loss": 1.967, "step": 8955 }, { "epoch": 0.603753242815269, "grad_norm": 5.401928392005746, "learning_rate": 4.066975412877255e-07, "loss": 1.949, "step": 8960 }, { "epoch": 0.6040901586873758, "grad_norm": 5.906240094598001, "learning_rate": 4.0611988606682544e-07, "loss": 1.961, "step": 8965 }, { "epoch": 0.6044270745594825, "grad_norm": 5.546955274223408, "learning_rate": 4.0554236070146785e-07, "loss": 2.0289, "step": 8970 }, { "epoch": 0.6047639904315892, "grad_norm": 5.433379159101134, "learning_rate": 4.0496496599048963e-07, "loss": 
2.0125, "step": 8975 }, { "epoch": 0.605100906303696, "grad_norm": 5.509462854231382, "learning_rate": 4.0438770273254624e-07, "loss": 2.0004, "step": 8980 }, { "epoch": 0.6054378221758027, "grad_norm": 4.9575429727081115, "learning_rate": 4.038105717261119e-07, "loss": 1.9765, "step": 8985 }, { "epoch": 0.6057747380479095, "grad_norm": 4.814309944839121, "learning_rate": 4.03233573769478e-07, "loss": 2.0214, "step": 8990 }, { "epoch": 0.6061116539200162, "grad_norm": 5.13499508722991, "learning_rate": 4.026567096607511e-07, "loss": 1.9613, "step": 8995 }, { "epoch": 0.6064485697921229, "grad_norm": 4.831287781373147, "learning_rate": 4.020799801978535e-07, "loss": 2.0698, "step": 9000 }, { "epoch": 0.6067854856642296, "grad_norm": 5.113346557744551, "learning_rate": 4.015033861785208e-07, "loss": 2.0239, "step": 9005 }, { "epoch": 0.6071224015363363, "grad_norm": 5.115269816443193, "learning_rate": 4.0092692840030126e-07, "loss": 2.0584, "step": 9010 }, { "epoch": 0.6074593174084432, "grad_norm": 4.966349334286546, "learning_rate": 4.003506076605547e-07, "loss": 1.9472, "step": 9015 }, { "epoch": 0.6077962332805499, "grad_norm": 5.2287925349220945, "learning_rate": 3.997744247564519e-07, "loss": 1.9158, "step": 9020 }, { "epoch": 0.6081331491526566, "grad_norm": 4.88086818219518, "learning_rate": 3.9919838048497197e-07, "loss": 2.076, "step": 9025 }, { "epoch": 0.6084700650247633, "grad_norm": 4.912118460201347, "learning_rate": 3.98622475642903e-07, "loss": 2.0081, "step": 9030 }, { "epoch": 0.60880698089687, "grad_norm": 5.588243203983603, "learning_rate": 3.980467110268405e-07, "loss": 1.9502, "step": 9035 }, { "epoch": 0.6091438967689767, "grad_norm": 5.18919077819228, "learning_rate": 3.9747108743318493e-07, "loss": 2.035, "step": 9040 }, { "epoch": 0.6094808126410836, "grad_norm": 4.989832259347466, "learning_rate": 3.968956056581428e-07, "loss": 1.9814, "step": 9045 }, { "epoch": 0.6098177285131903, "grad_norm": 5.0473039216927, "learning_rate": 
3.9632026649772366e-07, "loss": 1.9362, "step": 9050 }, { "epoch": 0.610154644385297, "grad_norm": 5.2107084674007655, "learning_rate": 3.9574507074774056e-07, "loss": 1.9818, "step": 9055 }, { "epoch": 0.6104915602574037, "grad_norm": 4.937503163124957, "learning_rate": 3.951700192038072e-07, "loss": 1.9424, "step": 9060 }, { "epoch": 0.6108284761295104, "grad_norm": 4.834428906351546, "learning_rate": 3.945951126613387e-07, "loss": 2.0839, "step": 9065 }, { "epoch": 0.6111653920016172, "grad_norm": 5.2241118464507785, "learning_rate": 3.9402035191554937e-07, "loss": 2.0041, "step": 9070 }, { "epoch": 0.611502307873724, "grad_norm": 5.0309010350862575, "learning_rate": 3.934457377614514e-07, "loss": 1.9585, "step": 9075 }, { "epoch": 0.6118392237458307, "grad_norm": 4.8852646410100276, "learning_rate": 3.9287127099385483e-07, "loss": 2.0464, "step": 9080 }, { "epoch": 0.6121761396179374, "grad_norm": 5.650812659846726, "learning_rate": 3.9229695240736567e-07, "loss": 2.019, "step": 9085 }, { "epoch": 0.6125130554900441, "grad_norm": 4.894054053977216, "learning_rate": 3.917227827963846e-07, "loss": 1.9835, "step": 9090 }, { "epoch": 0.6128499713621509, "grad_norm": 5.544602513971773, "learning_rate": 3.9114876295510653e-07, "loss": 1.9969, "step": 9095 }, { "epoch": 0.6131868872342576, "grad_norm": 5.05292792876365, "learning_rate": 3.9057489367751947e-07, "loss": 1.9798, "step": 9100 }, { "epoch": 0.6135238031063643, "grad_norm": 4.935853992124038, "learning_rate": 3.900011757574024e-07, "loss": 2.0605, "step": 9105 }, { "epoch": 0.613860718978471, "grad_norm": 5.181162203139844, "learning_rate": 3.894276099883258e-07, "loss": 1.9733, "step": 9110 }, { "epoch": 0.6141976348505778, "grad_norm": 5.403904236088209, "learning_rate": 3.888541971636492e-07, "loss": 1.9516, "step": 9115 }, { "epoch": 0.6145345507226846, "grad_norm": 5.413962688576453, "learning_rate": 3.8828093807652095e-07, "loss": 2.0858, "step": 9120 }, { "epoch": 0.6148714665947913, "grad_norm": 
4.974267577368974, "learning_rate": 3.8770783351987605e-07, "loss": 1.9692, "step": 9125 }, { "epoch": 0.615208382466898, "grad_norm": 5.283242822617496, "learning_rate": 3.8713488428643656e-07, "loss": 2.0009, "step": 9130 }, { "epoch": 0.6155452983390047, "grad_norm": 4.854658684261151, "learning_rate": 3.8656209116870906e-07, "loss": 1.9851, "step": 9135 }, { "epoch": 0.6158822142111114, "grad_norm": 5.047273314617198, "learning_rate": 3.859894549589847e-07, "loss": 2.0221, "step": 9140 }, { "epoch": 0.6162191300832183, "grad_norm": 5.6681772926727705, "learning_rate": 3.854169764493371e-07, "loss": 2.0198, "step": 9145 }, { "epoch": 0.616556045955325, "grad_norm": 5.139897546170033, "learning_rate": 3.848446564316223e-07, "loss": 1.9846, "step": 9150 }, { "epoch": 0.6168929618274317, "grad_norm": 5.13319587665313, "learning_rate": 3.8427249569747656e-07, "loss": 1.9921, "step": 9155 }, { "epoch": 0.6172298776995384, "grad_norm": 5.006684754353325, "learning_rate": 3.8370049503831614e-07, "loss": 2.0077, "step": 9160 }, { "epoch": 0.6175667935716451, "grad_norm": 4.934622846935966, "learning_rate": 3.8312865524533606e-07, "loss": 1.9757, "step": 9165 }, { "epoch": 0.617903709443752, "grad_norm": 5.192199263448503, "learning_rate": 3.825569771095082e-07, "loss": 2.0525, "step": 9170 }, { "epoch": 0.6182406253158587, "grad_norm": 4.972050024046171, "learning_rate": 3.819854614215814e-07, "loss": 2.0318, "step": 9175 }, { "epoch": 0.6185775411879654, "grad_norm": 5.1863707131579755, "learning_rate": 3.814141089720796e-07, "loss": 2.0353, "step": 9180 }, { "epoch": 0.6189144570600721, "grad_norm": 5.367844192098591, "learning_rate": 3.8084292055130126e-07, "loss": 2.0644, "step": 9185 }, { "epoch": 0.6192513729321788, "grad_norm": 5.3779907887069776, "learning_rate": 3.8027189694931715e-07, "loss": 2.0005, "step": 9190 }, { "epoch": 0.6195882888042856, "grad_norm": 4.772342494899727, "learning_rate": 3.797010389559708e-07, "loss": 2.0155, "step": 9195 }, { "epoch": 
0.6199252046763923, "grad_norm": 5.738081945197456, "learning_rate": 3.7913034736087677e-07, "loss": 2.0645, "step": 9200 }, { "epoch": 0.620262120548499, "grad_norm": 5.186822479303372, "learning_rate": 3.785598229534186e-07, "loss": 2.0162, "step": 9205 }, { "epoch": 0.6205990364206058, "grad_norm": 5.0462246199094505, "learning_rate": 3.7798946652274943e-07, "loss": 2.0037, "step": 9210 }, { "epoch": 0.6209359522927125, "grad_norm": 4.58921596443554, "learning_rate": 3.7741927885778966e-07, "loss": 1.921, "step": 9215 }, { "epoch": 0.6212728681648192, "grad_norm": 4.903567345154026, "learning_rate": 3.768492607472263e-07, "loss": 2.0289, "step": 9220 }, { "epoch": 0.621609784036926, "grad_norm": 4.781647926464837, "learning_rate": 3.7627941297951183e-07, "loss": 2.0367, "step": 9225 }, { "epoch": 0.6219466999090327, "grad_norm": 4.670564154166921, "learning_rate": 3.7570973634286334e-07, "loss": 2.0198, "step": 9230 }, { "epoch": 0.6222836157811394, "grad_norm": 4.865747688174046, "learning_rate": 3.7514023162526066e-07, "loss": 1.9674, "step": 9235 }, { "epoch": 0.6226205316532462, "grad_norm": 4.780270488914493, "learning_rate": 3.745708996144463e-07, "loss": 2.0376, "step": 9240 }, { "epoch": 0.6229574475253529, "grad_norm": 5.509242238787251, "learning_rate": 3.740017410979239e-07, "loss": 1.9899, "step": 9245 }, { "epoch": 0.6232943633974597, "grad_norm": 5.330623812990074, "learning_rate": 3.734327568629569e-07, "loss": 2.0131, "step": 9250 }, { "epoch": 0.6236312792695664, "grad_norm": 4.611945078902878, "learning_rate": 3.728639476965678e-07, "loss": 1.9678, "step": 9255 }, { "epoch": 0.6239681951416731, "grad_norm": 5.329039570979653, "learning_rate": 3.7229531438553664e-07, "loss": 2.0089, "step": 9260 }, { "epoch": 0.6243051110137798, "grad_norm": 4.97393087400385, "learning_rate": 3.7172685771640076e-07, "loss": 2.0249, "step": 9265 }, { "epoch": 0.6246420268858865, "grad_norm": 5.505557765850746, "learning_rate": 3.7115857847545264e-07, "loss": 
1.9637, "step": 9270 }, { "epoch": 0.6249789427579934, "grad_norm": 5.027952580668468, "learning_rate": 3.7059047744873955e-07, "loss": 1.993, "step": 9275 }, { "epoch": 0.6253158586301001, "grad_norm": 4.528234862237026, "learning_rate": 3.700225554220626e-07, "loss": 2.0075, "step": 9280 }, { "epoch": 0.6256527745022068, "grad_norm": 4.823350850313485, "learning_rate": 3.694548131809747e-07, "loss": 1.9873, "step": 9285 }, { "epoch": 0.6259896903743135, "grad_norm": 4.705453804106207, "learning_rate": 3.6888725151078024e-07, "loss": 2.005, "step": 9290 }, { "epoch": 0.6263266062464202, "grad_norm": 4.590943455026622, "learning_rate": 3.683198711965345e-07, "loss": 1.9365, "step": 9295 }, { "epoch": 0.626663522118527, "grad_norm": 5.106176906707933, "learning_rate": 3.677526730230408e-07, "loss": 1.9771, "step": 9300 }, { "epoch": 0.6270004379906338, "grad_norm": 5.99512262497153, "learning_rate": 3.671856577748512e-07, "loss": 1.9643, "step": 9305 }, { "epoch": 0.6273373538627405, "grad_norm": 5.885071665584965, "learning_rate": 3.666188262362648e-07, "loss": 2.0115, "step": 9310 }, { "epoch": 0.6276742697348472, "grad_norm": 5.169349869726666, "learning_rate": 3.660521791913265e-07, "loss": 2.0191, "step": 9315 }, { "epoch": 0.6280111856069539, "grad_norm": 5.00250186464649, "learning_rate": 3.654857174238256e-07, "loss": 2.0057, "step": 9320 }, { "epoch": 0.6283481014790607, "grad_norm": 4.639785844382824, "learning_rate": 3.649194417172957e-07, "loss": 2.0774, "step": 9325 }, { "epoch": 0.6286850173511674, "grad_norm": 5.104654272511666, "learning_rate": 3.6435335285501283e-07, "loss": 1.9142, "step": 9330 }, { "epoch": 0.6290219332232742, "grad_norm": 4.605562207942926, "learning_rate": 3.6378745161999426e-07, "loss": 2.0218, "step": 9335 }, { "epoch": 0.6293588490953809, "grad_norm": 4.9464028442491585, "learning_rate": 3.632217387949983e-07, "loss": 2.0136, "step": 9340 }, { "epoch": 0.6296957649674876, "grad_norm": 5.158201310790248, "learning_rate": 
3.626562151625223e-07, "loss": 1.9161, "step": 9345 }, { "epoch": 0.6300326808395944, "grad_norm": 4.819782074978986, "learning_rate": 3.6209088150480173e-07, "loss": 1.9454, "step": 9350 }, { "epoch": 0.6303695967117011, "grad_norm": 5.264205500421145, "learning_rate": 3.6152573860380964e-07, "loss": 1.9726, "step": 9355 }, { "epoch": 0.6307065125838078, "grad_norm": 5.036820600971269, "learning_rate": 3.6096078724125544e-07, "loss": 2.0517, "step": 9360 }, { "epoch": 0.6310434284559145, "grad_norm": 5.493438836555923, "learning_rate": 3.603960281985828e-07, "loss": 2.0312, "step": 9365 }, { "epoch": 0.6313803443280213, "grad_norm": 5.261323262762267, "learning_rate": 3.5983146225697007e-07, "loss": 1.9787, "step": 9370 }, { "epoch": 0.6317172602001281, "grad_norm": 5.23457926956508, "learning_rate": 3.5926709019732855e-07, "loss": 2.0642, "step": 9375 }, { "epoch": 0.6320541760722348, "grad_norm": 5.0183810277702525, "learning_rate": 3.587029128003006e-07, "loss": 2.0053, "step": 9380 }, { "epoch": 0.6323910919443415, "grad_norm": 5.237127325559007, "learning_rate": 3.581389308462601e-07, "loss": 1.9793, "step": 9385 }, { "epoch": 0.6327280078164482, "grad_norm": 5.196709938723755, "learning_rate": 3.5757514511531016e-07, "loss": 2.0283, "step": 9390 }, { "epoch": 0.6330649236885549, "grad_norm": 5.008272128722665, "learning_rate": 3.5701155638728297e-07, "loss": 2.0163, "step": 9395 }, { "epoch": 0.6334018395606617, "grad_norm": 4.898005091881302, "learning_rate": 3.564481654417374e-07, "loss": 1.988, "step": 9400 }, { "epoch": 0.6337387554327685, "grad_norm": 4.912285377325274, "learning_rate": 3.558849730579594e-07, "loss": 2.0159, "step": 9405 }, { "epoch": 0.6340756713048752, "grad_norm": 5.054562799027816, "learning_rate": 3.553219800149603e-07, "loss": 1.9897, "step": 9410 }, { "epoch": 0.6344125871769819, "grad_norm": 4.927458176329109, "learning_rate": 3.547591870914752e-07, "loss": 1.9498, "step": 9415 }, { "epoch": 0.6347495030490886, "grad_norm": 
4.909059134919558, "learning_rate": 3.5419659506596287e-07, "loss": 1.9794, "step": 9420 }, { "epoch": 0.6350864189211953, "grad_norm": 5.122618318143905, "learning_rate": 3.536342047166039e-07, "loss": 1.9658, "step": 9425 }, { "epoch": 0.6354233347933022, "grad_norm": 5.138264603535447, "learning_rate": 3.530720168213001e-07, "loss": 1.9751, "step": 9430 }, { "epoch": 0.6357602506654089, "grad_norm": 4.70271152466759, "learning_rate": 3.5251003215767305e-07, "loss": 1.9738, "step": 9435 }, { "epoch": 0.6360971665375156, "grad_norm": 5.21628213788209, "learning_rate": 3.519482515030636e-07, "loss": 2.0049, "step": 9440 }, { "epoch": 0.6364340824096223, "grad_norm": 4.693315865593774, "learning_rate": 3.5138667563452983e-07, "loss": 2.0359, "step": 9445 }, { "epoch": 0.636770998281729, "grad_norm": 5.992138660324225, "learning_rate": 3.5082530532884703e-07, "loss": 1.9448, "step": 9450 }, { "epoch": 0.6371079141538358, "grad_norm": 4.99745288442435, "learning_rate": 3.5026414136250607e-07, "loss": 1.9873, "step": 9455 }, { "epoch": 0.6374448300259425, "grad_norm": 4.354859860611195, "learning_rate": 3.497031845117124e-07, "loss": 2.0295, "step": 9460 }, { "epoch": 0.6377817458980493, "grad_norm": 5.352869223652878, "learning_rate": 3.4914243555238476e-07, "loss": 2.0293, "step": 9465 }, { "epoch": 0.638118661770156, "grad_norm": 5.192726959584385, "learning_rate": 3.4858189526015453e-07, "loss": 1.9579, "step": 9470 }, { "epoch": 0.6384555776422627, "grad_norm": 5.070887390993361, "learning_rate": 3.4802156441036467e-07, "loss": 2.001, "step": 9475 }, { "epoch": 0.6387924935143695, "grad_norm": 4.840550498420871, "learning_rate": 3.4746144377806785e-07, "loss": 1.9912, "step": 9480 }, { "epoch": 0.6391294093864762, "grad_norm": 5.165380226839607, "learning_rate": 3.4690153413802653e-07, "loss": 2.0184, "step": 9485 }, { "epoch": 0.6394663252585829, "grad_norm": 5.159804471452981, "learning_rate": 3.4634183626471125e-07, "loss": 2.0959, "step": 9490 }, { "epoch": 
0.6398032411306896, "grad_norm": 5.238894771822284, "learning_rate": 3.457823509322992e-07, "loss": 2.0484, "step": 9495 }, { "epoch": 0.6401401570027964, "grad_norm": 5.197457465725176, "learning_rate": 3.452230789146741e-07, "loss": 2.0457, "step": 9500 }, { "epoch": 0.6404770728749032, "grad_norm": 5.567362053448154, "learning_rate": 3.4466402098542435e-07, "loss": 1.9906, "step": 9505 }, { "epoch": 0.6408139887470099, "grad_norm": 5.402416220069954, "learning_rate": 3.441051779178422e-07, "loss": 2.0011, "step": 9510 }, { "epoch": 0.6411509046191166, "grad_norm": 4.863457633897499, "learning_rate": 3.4354655048492277e-07, "loss": 1.9144, "step": 9515 }, { "epoch": 0.6414878204912233, "grad_norm": 4.599035544250757, "learning_rate": 3.429881394593629e-07, "loss": 1.9748, "step": 9520 }, { "epoch": 0.64182473636333, "grad_norm": 5.317919583706225, "learning_rate": 3.4242994561356043e-07, "loss": 1.963, "step": 9525 }, { "epoch": 0.6421616522354369, "grad_norm": 4.991522824153337, "learning_rate": 3.4187196971961185e-07, "loss": 2.0742, "step": 9530 }, { "epoch": 0.6424985681075436, "grad_norm": 5.0713769227677865, "learning_rate": 3.4131421254931326e-07, "loss": 2.026, "step": 9535 }, { "epoch": 0.6428354839796503, "grad_norm": 4.754075771629146, "learning_rate": 3.4075667487415785e-07, "loss": 2.0566, "step": 9540 }, { "epoch": 0.643172399851757, "grad_norm": 4.867353546348669, "learning_rate": 3.4019935746533474e-07, "loss": 2.0175, "step": 9545 }, { "epoch": 0.6435093157238637, "grad_norm": 5.21172248206843, "learning_rate": 3.3964226109372884e-07, "loss": 2.0925, "step": 9550 }, { "epoch": 0.6438462315959704, "grad_norm": 5.10834000678692, "learning_rate": 3.390853865299195e-07, "loss": 1.9973, "step": 9555 }, { "epoch": 0.6441831474680773, "grad_norm": 5.236770726746641, "learning_rate": 3.385287345441786e-07, "loss": 2.063, "step": 9560 }, { "epoch": 0.644520063340184, "grad_norm": 5.177165638157087, "learning_rate": 3.3797230590647073e-07, "loss": 1.9942, 
"step": 9565 }, { "epoch": 0.6448569792122907, "grad_norm": 4.889378702069615, "learning_rate": 3.374161013864515e-07, "loss": 2.0072, "step": 9570 }, { "epoch": 0.6451938950843974, "grad_norm": 5.01448686829912, "learning_rate": 3.368601217534661e-07, "loss": 2.0334, "step": 9575 }, { "epoch": 0.6455308109565041, "grad_norm": 5.249713785081298, "learning_rate": 3.3630436777654903e-07, "loss": 2.0601, "step": 9580 }, { "epoch": 0.6458677268286109, "grad_norm": 4.951893689333028, "learning_rate": 3.357488402244227e-07, "loss": 2.0051, "step": 9585 }, { "epoch": 0.6462046427007176, "grad_norm": 5.512287396447487, "learning_rate": 3.3519353986549604e-07, "loss": 1.929, "step": 9590 }, { "epoch": 0.6465415585728244, "grad_norm": 5.269693830687322, "learning_rate": 3.346384674678639e-07, "loss": 2.0724, "step": 9595 }, { "epoch": 0.6468784744449311, "grad_norm": 5.386374469669304, "learning_rate": 3.3408362379930576e-07, "loss": 1.9387, "step": 9600 }, { "epoch": 0.6472153903170378, "grad_norm": 4.738783302113777, "learning_rate": 3.335290096272849e-07, "loss": 1.958, "step": 9605 }, { "epoch": 0.6475523061891446, "grad_norm": 5.753734433978254, "learning_rate": 3.3297462571894673e-07, "loss": 1.9728, "step": 9610 }, { "epoch": 0.6478892220612513, "grad_norm": 4.752043088858555, "learning_rate": 3.3242047284111857e-07, "loss": 1.9819, "step": 9615 }, { "epoch": 0.648226137933358, "grad_norm": 4.86036482930341, "learning_rate": 3.3186655176030826e-07, "loss": 2.0721, "step": 9620 }, { "epoch": 0.6485630538054648, "grad_norm": 4.6271522274462, "learning_rate": 3.3131286324270234e-07, "loss": 1.9088, "step": 9625 }, { "epoch": 0.6488999696775715, "grad_norm": 4.83935031656777, "learning_rate": 3.3075940805416654e-07, "loss": 1.9913, "step": 9630 }, { "epoch": 0.6492368855496783, "grad_norm": 4.774191265663529, "learning_rate": 3.3020618696024316e-07, "loss": 2.0425, "step": 9635 }, { "epoch": 0.649573801421785, "grad_norm": 4.909963157382655, "learning_rate": 
3.2965320072615113e-07, "loss": 2.0416, "step": 9640 }, { "epoch": 0.6499107172938917, "grad_norm": 4.954991516268226, "learning_rate": 3.2910045011678424e-07, "loss": 1.9636, "step": 9645 }, { "epoch": 0.6502476331659984, "grad_norm": 4.747372565468685, "learning_rate": 3.2854793589671046e-07, "loss": 2.0597, "step": 9650 }, { "epoch": 0.6505845490381051, "grad_norm": 4.798472745744581, "learning_rate": 3.279956588301712e-07, "loss": 2.0057, "step": 9655 }, { "epoch": 0.650921464910212, "grad_norm": 4.893222669622116, "learning_rate": 3.274436196810789e-07, "loss": 1.9689, "step": 9660 }, { "epoch": 0.6512583807823187, "grad_norm": 5.17559646213739, "learning_rate": 3.268918192130178e-07, "loss": 2.0114, "step": 9665 }, { "epoch": 0.6515952966544254, "grad_norm": 5.24265047264547, "learning_rate": 3.263402581892415e-07, "loss": 1.9507, "step": 9670 }, { "epoch": 0.6519322125265321, "grad_norm": 5.162887006872564, "learning_rate": 3.257889373726726e-07, "loss": 2.0416, "step": 9675 }, { "epoch": 0.6522691283986388, "grad_norm": 4.813219981587693, "learning_rate": 3.252378575259013e-07, "loss": 1.9974, "step": 9680 }, { "epoch": 0.6526060442707456, "grad_norm": 5.096924828289441, "learning_rate": 3.246870194111849e-07, "loss": 2.0079, "step": 9685 }, { "epoch": 0.6529429601428524, "grad_norm": 5.291772990579257, "learning_rate": 3.2413642379044557e-07, "loss": 1.9734, "step": 9690 }, { "epoch": 0.6532798760149591, "grad_norm": 5.057152519648431, "learning_rate": 3.235860714252708e-07, "loss": 2.0203, "step": 9695 }, { "epoch": 0.6536167918870658, "grad_norm": 5.030446845609571, "learning_rate": 3.2303596307691137e-07, "loss": 1.9912, "step": 9700 }, { "epoch": 0.6539537077591725, "grad_norm": 5.0657077925915335, "learning_rate": 3.2248609950628023e-07, "loss": 2.0511, "step": 9705 }, { "epoch": 0.6542906236312793, "grad_norm": 5.362414159447081, "learning_rate": 3.219364814739522e-07, "loss": 2.0, "step": 9710 }, { "epoch": 0.654627539503386, "grad_norm": 
5.425256033091966, "learning_rate": 3.2138710974016226e-07, "loss": 1.9695, "step": 9715 }, { "epoch": 0.6549644553754927, "grad_norm": 4.690335016166976, "learning_rate": 3.208379850648046e-07, "loss": 2.0625, "step": 9720 }, { "epoch": 0.6553013712475995, "grad_norm": 4.985449770811369, "learning_rate": 3.202891082074318e-07, "loss": 2.0177, "step": 9725 }, { "epoch": 0.6556382871197062, "grad_norm": 4.767398812883055, "learning_rate": 3.197404799272537e-07, "loss": 2.0038, "step": 9730 }, { "epoch": 0.6559752029918129, "grad_norm": 5.732516695561447, "learning_rate": 3.191921009831365e-07, "loss": 1.9991, "step": 9735 }, { "epoch": 0.6563121188639197, "grad_norm": 5.085332338271107, "learning_rate": 3.1864397213360093e-07, "loss": 2.0117, "step": 9740 }, { "epoch": 0.6566490347360264, "grad_norm": 5.015674044010952, "learning_rate": 3.180960941368223e-07, "loss": 2.0008, "step": 9745 }, { "epoch": 0.6569859506081331, "grad_norm": 4.995585765892123, "learning_rate": 3.175484677506288e-07, "loss": 1.9692, "step": 9750 }, { "epoch": 0.6573228664802399, "grad_norm": 5.05789754538165, "learning_rate": 3.1700109373250063e-07, "loss": 1.959, "step": 9755 }, { "epoch": 0.6576597823523466, "grad_norm": 5.711094543598176, "learning_rate": 3.1645397283956843e-07, "loss": 2.0046, "step": 9760 }, { "epoch": 0.6579966982244534, "grad_norm": 4.897880734668697, "learning_rate": 3.159071058286138e-07, "loss": 1.9827, "step": 9765 }, { "epoch": 0.6583336140965601, "grad_norm": 4.708482050557243, "learning_rate": 3.1536049345606586e-07, "loss": 2.0826, "step": 9770 }, { "epoch": 0.6586705299686668, "grad_norm": 5.026326377879305, "learning_rate": 3.1481413647800247e-07, "loss": 1.8551, "step": 9775 }, { "epoch": 0.6590074458407735, "grad_norm": 5.010893107926519, "learning_rate": 3.14268035650148e-07, "loss": 2.0168, "step": 9780 }, { "epoch": 0.6593443617128802, "grad_norm": 4.9676720463608754, "learning_rate": 3.137221917278723e-07, "loss": 2.0785, "step": 9785 }, { "epoch": 
0.6596812775849871, "grad_norm": 4.964410841950562, "learning_rate": 3.1317660546618986e-07, "loss": 1.9237, "step": 9790 }, { "epoch": 0.6600181934570938, "grad_norm": 5.139227525052003, "learning_rate": 3.1263127761975917e-07, "loss": 2.0032, "step": 9795 }, { "epoch": 0.6603551093292005, "grad_norm": 5.386137312998709, "learning_rate": 3.12086208942881e-07, "loss": 2.034, "step": 9800 }, { "epoch": 0.6606920252013072, "grad_norm": 4.793211988861983, "learning_rate": 3.1154140018949736e-07, "loss": 2.0187, "step": 9805 }, { "epoch": 0.6610289410734139, "grad_norm": 5.67153199814893, "learning_rate": 3.1099685211319116e-07, "loss": 1.983, "step": 9810 }, { "epoch": 0.6613658569455207, "grad_norm": 4.854363196294675, "learning_rate": 3.104525654671849e-07, "loss": 1.9752, "step": 9815 }, { "epoch": 0.6617027728176275, "grad_norm": 5.495664948845223, "learning_rate": 3.099085410043386e-07, "loss": 1.9386, "step": 9820 }, { "epoch": 0.6620396886897342, "grad_norm": 4.455108287538701, "learning_rate": 3.0936477947715064e-07, "loss": 1.9409, "step": 9825 }, { "epoch": 0.6623766045618409, "grad_norm": 4.964140559341061, "learning_rate": 3.088212816377552e-07, "loss": 2.0068, "step": 9830 }, { "epoch": 0.6627135204339476, "grad_norm": 4.530847444687334, "learning_rate": 3.0827804823792157e-07, "loss": 1.998, "step": 9835 }, { "epoch": 0.6630504363060544, "grad_norm": 4.8945688010858275, "learning_rate": 3.077350800290537e-07, "loss": 2.0285, "step": 9840 }, { "epoch": 0.6633873521781611, "grad_norm": 4.750601377850449, "learning_rate": 3.071923777621885e-07, "loss": 1.9597, "step": 9845 }, { "epoch": 0.6637242680502679, "grad_norm": 5.077909504755588, "learning_rate": 3.066499421879948e-07, "loss": 2.0952, "step": 9850 }, { "epoch": 0.6640611839223746, "grad_norm": 5.060449413453516, "learning_rate": 3.0610777405677286e-07, "loss": 1.9884, "step": 9855 }, { "epoch": 0.6643980997944813, "grad_norm": 4.578632893880844, "learning_rate": 3.05565874118453e-07, "loss": 1.9665, 
"step": 9860 }, { "epoch": 0.6647350156665881, "grad_norm": 5.059294194518655, "learning_rate": 3.050242431225948e-07, "loss": 1.9961, "step": 9865 }, { "epoch": 0.6650719315386948, "grad_norm": 5.242650466756676, "learning_rate": 3.0448288181838487e-07, "loss": 1.9832, "step": 9870 }, { "epoch": 0.6654088474108015, "grad_norm": 5.077386500101605, "learning_rate": 3.0394179095463804e-07, "loss": 2.0256, "step": 9875 }, { "epoch": 0.6657457632829082, "grad_norm": 5.183486359813879, "learning_rate": 3.0340097127979426e-07, "loss": 2.0337, "step": 9880 }, { "epoch": 0.666082679155015, "grad_norm": 5.331023765828063, "learning_rate": 3.0286042354191844e-07, "loss": 2.024, "step": 9885 }, { "epoch": 0.6664195950271218, "grad_norm": 4.960654258662428, "learning_rate": 3.0232014848869955e-07, "loss": 2.0005, "step": 9890 }, { "epoch": 0.6667565108992285, "grad_norm": 5.327831328514684, "learning_rate": 3.0178014686744966e-07, "loss": 1.9734, "step": 9895 }, { "epoch": 0.6670934267713352, "grad_norm": 4.473489829192608, "learning_rate": 3.0124041942510175e-07, "loss": 2.0091, "step": 9900 }, { "epoch": 0.6674303426434419, "grad_norm": 6.337056986185231, "learning_rate": 3.007009669082103e-07, "loss": 1.9821, "step": 9905 }, { "epoch": 0.6677672585155486, "grad_norm": 4.9571785944746605, "learning_rate": 3.001617900629496e-07, "loss": 2.0221, "step": 9910 }, { "epoch": 0.6681041743876553, "grad_norm": 5.0943589253319255, "learning_rate": 2.996228896351119e-07, "loss": 2.0071, "step": 9915 }, { "epoch": 0.6684410902597622, "grad_norm": 4.668224127638979, "learning_rate": 2.9908426637010773e-07, "loss": 2.0594, "step": 9920 }, { "epoch": 0.6687780061318689, "grad_norm": 5.1662406087928385, "learning_rate": 2.98545921012964e-07, "loss": 1.9997, "step": 9925 }, { "epoch": 0.6691149220039756, "grad_norm": 5.174622666442469, "learning_rate": 2.9800785430832354e-07, "loss": 1.9767, "step": 9930 }, { "epoch": 0.6694518378760823, "grad_norm": 4.7033818190306205, "learning_rate": 
2.9747006700044295e-07, "loss": 2.0003, "step": 9935 }, { "epoch": 0.669788753748189, "grad_norm": 4.781849016203444, "learning_rate": 2.969325598331932e-07, "loss": 1.9563, "step": 9940 }, { "epoch": 0.6701256696202958, "grad_norm": 4.867357026399452, "learning_rate": 2.9639533355005773e-07, "loss": 1.9823, "step": 9945 }, { "epoch": 0.6704625854924026, "grad_norm": 4.913690529882524, "learning_rate": 2.958583888941306e-07, "loss": 1.9459, "step": 9950 }, { "epoch": 0.6707995013645093, "grad_norm": 5.284224995995842, "learning_rate": 2.9532172660811745e-07, "loss": 2.0394, "step": 9955 }, { "epoch": 0.671136417236616, "grad_norm": 5.007105267705012, "learning_rate": 2.9478534743433247e-07, "loss": 1.9998, "step": 9960 }, { "epoch": 0.6714733331087227, "grad_norm": 4.407910556325075, "learning_rate": 2.9424925211469876e-07, "loss": 1.9728, "step": 9965 }, { "epoch": 0.6718102489808295, "grad_norm": 5.517864781146457, "learning_rate": 2.9371344139074645e-07, "loss": 1.9679, "step": 9970 }, { "epoch": 0.6721471648529362, "grad_norm": 5.333215107398777, "learning_rate": 2.9317791600361243e-07, "loss": 1.8695, "step": 9975 }, { "epoch": 0.672484080725043, "grad_norm": 4.538833432990649, "learning_rate": 2.9264267669403833e-07, "loss": 2.0044, "step": 9980 }, { "epoch": 0.6728209965971497, "grad_norm": 5.160212102908101, "learning_rate": 2.921077242023706e-07, "loss": 2.0034, "step": 9985 }, { "epoch": 0.6731579124692564, "grad_norm": 5.6406300723841, "learning_rate": 2.9157305926855893e-07, "loss": 1.9973, "step": 9990 }, { "epoch": 0.6734948283413632, "grad_norm": 4.988335511534908, "learning_rate": 2.910386826321549e-07, "loss": 2.0258, "step": 9995 }, { "epoch": 0.6738317442134699, "grad_norm": 5.121261922170911, "learning_rate": 2.905045950323114e-07, "loss": 2.0063, "step": 10000 }, { "epoch": 0.6741686600855766, "grad_norm": 5.441060622661217, "learning_rate": 2.899707972077817e-07, "loss": 1.9914, "step": 10005 }, { "epoch": 0.6745055759576833, "grad_norm": 
5.632802004091385, "learning_rate": 2.8943728989691857e-07, "loss": 2.0166, "step": 10010 }, { "epoch": 0.6748424918297901, "grad_norm": 5.331909382999812, "learning_rate": 2.88904073837672e-07, "loss": 1.9898, "step": 10015 }, { "epoch": 0.6751794077018969, "grad_norm": 5.0235994163553475, "learning_rate": 2.883711497675899e-07, "loss": 1.9567, "step": 10020 }, { "epoch": 0.6755163235740036, "grad_norm": 4.850930942452827, "learning_rate": 2.878385184238163e-07, "loss": 1.9517, "step": 10025 }, { "epoch": 0.6758532394461103, "grad_norm": 5.0534846457684415, "learning_rate": 2.8730618054308964e-07, "loss": 1.9297, "step": 10030 }, { "epoch": 0.676190155318217, "grad_norm": 4.75150681106774, "learning_rate": 2.8677413686174325e-07, "loss": 2.0423, "step": 10035 }, { "epoch": 0.6765270711903237, "grad_norm": 5.752414888506618, "learning_rate": 2.8624238811570325e-07, "loss": 2.0057, "step": 10040 }, { "epoch": 0.6768639870624306, "grad_norm": 4.729305601830091, "learning_rate": 2.8571093504048737e-07, "loss": 2.0081, "step": 10045 }, { "epoch": 0.6772009029345373, "grad_norm": 4.797868240995026, "learning_rate": 2.851797783712049e-07, "loss": 1.9242, "step": 10050 }, { "epoch": 0.677537818806644, "grad_norm": 5.112409975722369, "learning_rate": 2.8464891884255515e-07, "loss": 2.0265, "step": 10055 }, { "epoch": 0.6778747346787507, "grad_norm": 5.078962852332028, "learning_rate": 2.8411835718882593e-07, "loss": 2.008, "step": 10060 }, { "epoch": 0.6782116505508574, "grad_norm": 4.940541357756737, "learning_rate": 2.835880941438936e-07, "loss": 1.9516, "step": 10065 }, { "epoch": 0.6785485664229642, "grad_norm": 5.016212150930612, "learning_rate": 2.8305813044122093e-07, "loss": 2.0094, "step": 10070 }, { "epoch": 0.678885482295071, "grad_norm": 5.697264997950984, "learning_rate": 2.8252846681385734e-07, "loss": 2.072, "step": 10075 }, { "epoch": 0.6792223981671777, "grad_norm": 5.244055708354648, "learning_rate": 2.8199910399443625e-07, "loss": 2.0207, "step": 10080 
}, { "epoch": 0.6795593140392844, "grad_norm": 5.260252908381028, "learning_rate": 2.8147004271517584e-07, "loss": 1.9253, "step": 10085 }, { "epoch": 0.6798962299113911, "grad_norm": 5.140219400619202, "learning_rate": 2.8094128370787694e-07, "loss": 2.0744, "step": 10090 }, { "epoch": 0.6802331457834978, "grad_norm": 5.123998494756192, "learning_rate": 2.8041282770392196e-07, "loss": 1.9672, "step": 10095 }, { "epoch": 0.6805700616556046, "grad_norm": 5.629978032991946, "learning_rate": 2.7988467543427454e-07, "loss": 1.9358, "step": 10100 }, { "epoch": 0.6809069775277113, "grad_norm": 5.523298818450376, "learning_rate": 2.7935682762947837e-07, "loss": 2.0296, "step": 10105 }, { "epoch": 0.6812438933998181, "grad_norm": 4.989440618178771, "learning_rate": 2.788292850196553e-07, "loss": 2.0013, "step": 10110 }, { "epoch": 0.6815808092719248, "grad_norm": 4.810639022726118, "learning_rate": 2.783020483345057e-07, "loss": 1.9489, "step": 10115 }, { "epoch": 0.6819177251440315, "grad_norm": 4.6625131469278225, "learning_rate": 2.777751183033067e-07, "loss": 2.0691, "step": 10120 }, { "epoch": 0.6822546410161383, "grad_norm": 4.612758369472151, "learning_rate": 2.772484956549107e-07, "loss": 1.9968, "step": 10125 }, { "epoch": 0.682591556888245, "grad_norm": 4.894034010835434, "learning_rate": 2.7672218111774566e-07, "loss": 2.0518, "step": 10130 }, { "epoch": 0.6829284727603517, "grad_norm": 4.6784357940064405, "learning_rate": 2.7619617541981287e-07, "loss": 1.9786, "step": 10135 }, { "epoch": 0.6832653886324584, "grad_norm": 5.3291723640154345, "learning_rate": 2.756704792886869e-07, "loss": 1.9991, "step": 10140 }, { "epoch": 0.6836023045045652, "grad_norm": 5.0306194039777985, "learning_rate": 2.7514509345151347e-07, "loss": 1.954, "step": 10145 }, { "epoch": 0.683939220376672, "grad_norm": 5.636728973967335, "learning_rate": 2.746200186350097e-07, "loss": 2.0545, "step": 10150 }, { "epoch": 0.6842761362487787, "grad_norm": 4.86066622573197, "learning_rate": 
2.740952555654622e-07, "loss": 2.0321, "step": 10155 }, { "epoch": 0.6846130521208854, "grad_norm": 5.151361788128535, "learning_rate": 2.735708049687262e-07, "loss": 1.9322, "step": 10160 }, { "epoch": 0.6849499679929921, "grad_norm": 4.957727116353471, "learning_rate": 2.730466675702251e-07, "loss": 2.0168, "step": 10165 }, { "epoch": 0.6852868838650988, "grad_norm": 5.169662971953493, "learning_rate": 2.7252284409494906e-07, "loss": 1.9621, "step": 10170 }, { "epoch": 0.6856237997372057, "grad_norm": 4.669075478754922, "learning_rate": 2.7199933526745364e-07, "loss": 2.0224, "step": 10175 }, { "epoch": 0.6859607156093124, "grad_norm": 5.696866373248566, "learning_rate": 2.714761418118596e-07, "loss": 1.9434, "step": 10180 }, { "epoch": 0.6862976314814191, "grad_norm": 5.719802464905006, "learning_rate": 2.7095326445185143e-07, "loss": 2.0264, "step": 10185 }, { "epoch": 0.6866345473535258, "grad_norm": 4.830809391611633, "learning_rate": 2.704307039106759e-07, "loss": 1.9709, "step": 10190 }, { "epoch": 0.6869714632256325, "grad_norm": 4.525928850366308, "learning_rate": 2.6990846091114205e-07, "loss": 2.0485, "step": 10195 }, { "epoch": 0.6873083790977393, "grad_norm": 4.539368158142009, "learning_rate": 2.6938653617561967e-07, "loss": 2.0208, "step": 10200 }, { "epoch": 0.687645294969846, "grad_norm": 4.886740161111373, "learning_rate": 2.688649304260383e-07, "loss": 1.9975, "step": 10205 }, { "epoch": 0.6879822108419528, "grad_norm": 5.040182808801462, "learning_rate": 2.683436443838859e-07, "loss": 2.0263, "step": 10210 }, { "epoch": 0.6883191267140595, "grad_norm": 5.204688582110176, "learning_rate": 2.678226787702086e-07, "loss": 2.0437, "step": 10215 }, { "epoch": 0.6886560425861662, "grad_norm": 5.022632723433887, "learning_rate": 2.673020343056094e-07, "loss": 2.087, "step": 10220 }, { "epoch": 0.688992958458273, "grad_norm": 5.217811192738246, "learning_rate": 2.6678171171024657e-07, "loss": 2.0522, "step": 10225 }, { "epoch": 0.6893298743303797, 
"grad_norm": 4.745735430157816, "learning_rate": 2.6626171170383373e-07, "loss": 1.928, "step": 10230 }, { "epoch": 0.6896667902024864, "grad_norm": 5.0912050163557465, "learning_rate": 2.6574203500563776e-07, "loss": 2.0045, "step": 10235 }, { "epoch": 0.6900037060745932, "grad_norm": 4.7602910173229365, "learning_rate": 2.6522268233447894e-07, "loss": 2.0189, "step": 10240 }, { "epoch": 0.6903406219466999, "grad_norm": 5.006807100673096, "learning_rate": 2.6470365440872866e-07, "loss": 1.9674, "step": 10245 }, { "epoch": 0.6906775378188067, "grad_norm": 5.472097096204049, "learning_rate": 2.641849519463099e-07, "loss": 2.0291, "step": 10250 }, { "epoch": 0.6910144536909134, "grad_norm": 5.081658881851591, "learning_rate": 2.6366657566469465e-07, "loss": 1.9848, "step": 10255 }, { "epoch": 0.6913513695630201, "grad_norm": 4.844405861760023, "learning_rate": 2.631485262809043e-07, "loss": 1.962, "step": 10260 }, { "epoch": 0.6916882854351268, "grad_norm": 4.717458056560756, "learning_rate": 2.6263080451150797e-07, "loss": 1.9945, "step": 10265 }, { "epoch": 0.6920252013072336, "grad_norm": 5.396553201062059, "learning_rate": 2.621134110726217e-07, "loss": 2.0804, "step": 10270 }, { "epoch": 0.6923621171793403, "grad_norm": 5.009410995639441, "learning_rate": 2.6159634667990683e-07, "loss": 2.0611, "step": 10275 }, { "epoch": 0.6926990330514471, "grad_norm": 5.1928897835662315, "learning_rate": 2.610796120485701e-07, "loss": 2.0286, "step": 10280 }, { "epoch": 0.6930359489235538, "grad_norm": 4.7591787819045965, "learning_rate": 2.605632078933623e-07, "loss": 1.9434, "step": 10285 }, { "epoch": 0.6933728647956605, "grad_norm": 5.305132331597345, "learning_rate": 2.600471349285763e-07, "loss": 2.057, "step": 10290 }, { "epoch": 0.6937097806677672, "grad_norm": 5.306940798718859, "learning_rate": 2.5953139386804764e-07, "loss": 2.001, "step": 10295 }, { "epoch": 0.6940466965398739, "grad_norm": 5.455798097910076, "learning_rate": 2.5901598542515256e-07, "loss": 
1.9734, "step": 10300 }, { "epoch": 0.6943836124119808, "grad_norm": 4.690667132700423, "learning_rate": 2.5850091031280684e-07, "loss": 2.0057, "step": 10305 }, { "epoch": 0.6947205282840875, "grad_norm": 4.893771242839119, "learning_rate": 2.579861692434658e-07, "loss": 2.0114, "step": 10310 }, { "epoch": 0.6950574441561942, "grad_norm": 5.0540389025238825, "learning_rate": 2.574717629291222e-07, "loss": 1.9758, "step": 10315 }, { "epoch": 0.6953943600283009, "grad_norm": 4.928253662588401, "learning_rate": 2.5695769208130615e-07, "loss": 1.9848, "step": 10320 }, { "epoch": 0.6957312759004076, "grad_norm": 4.756708523957995, "learning_rate": 2.564439574110833e-07, "loss": 1.9936, "step": 10325 }, { "epoch": 0.6960681917725144, "grad_norm": 4.985724814567837, "learning_rate": 2.559305596290547e-07, "loss": 2.0183, "step": 10330 }, { "epoch": 0.6964051076446212, "grad_norm": 5.271398317621488, "learning_rate": 2.554174994453555e-07, "loss": 1.9696, "step": 10335 }, { "epoch": 0.6967420235167279, "grad_norm": 5.171911289808202, "learning_rate": 2.549047775696532e-07, "loss": 1.9756, "step": 10340 }, { "epoch": 0.6970789393888346, "grad_norm": 5.015265178582161, "learning_rate": 2.543923947111481e-07, "loss": 1.9929, "step": 10345 }, { "epoch": 0.6974158552609413, "grad_norm": 4.960802143321741, "learning_rate": 2.538803515785714e-07, "loss": 2.0212, "step": 10350 }, { "epoch": 0.6977527711330481, "grad_norm": 4.980607070806367, "learning_rate": 2.5336864888018393e-07, "loss": 1.9641, "step": 10355 }, { "epoch": 0.6980896870051548, "grad_norm": 5.312069150190135, "learning_rate": 2.528572873237761e-07, "loss": 1.9825, "step": 10360 }, { "epoch": 0.6984266028772615, "grad_norm": 4.7334553834569055, "learning_rate": 2.5234626761666647e-07, "loss": 2.0858, "step": 10365 }, { "epoch": 0.6987635187493683, "grad_norm": 4.657272623876852, "learning_rate": 2.5183559046570036e-07, "loss": 2.0724, "step": 10370 }, { "epoch": 0.699100434621475, "grad_norm": 4.853649063711195, 
"learning_rate": 2.513252565772496e-07, "loss": 2.0476, "step": 10375 }, { "epoch": 0.6994373504935818, "grad_norm": 5.168308400786228, "learning_rate": 2.5081526665721133e-07, "loss": 2.0041, "step": 10380 }, { "epoch": 0.6997742663656885, "grad_norm": 4.8387996113675005, "learning_rate": 2.503056214110062e-07, "loss": 1.9702, "step": 10385 }, { "epoch": 0.7001111822377952, "grad_norm": 4.931169195208755, "learning_rate": 2.497963215435789e-07, "loss": 1.948, "step": 10390 }, { "epoch": 0.7004480981099019, "grad_norm": 5.210685879943604, "learning_rate": 2.492873677593964e-07, "loss": 1.9221, "step": 10395 }, { "epoch": 0.7007850139820087, "grad_norm": 5.069652855764322, "learning_rate": 2.4877876076244626e-07, "loss": 2.0789, "step": 10400 }, { "epoch": 0.7011219298541155, "grad_norm": 5.3558651648950715, "learning_rate": 2.482705012562367e-07, "loss": 1.991, "step": 10405 }, { "epoch": 0.7014588457262222, "grad_norm": 4.947793578015824, "learning_rate": 2.4776258994379546e-07, "loss": 2.0086, "step": 10410 }, { "epoch": 0.7017957615983289, "grad_norm": 5.133206401794206, "learning_rate": 2.4725502752766883e-07, "loss": 2.0076, "step": 10415 }, { "epoch": 0.7021326774704356, "grad_norm": 5.4533130465234745, "learning_rate": 2.4674781470991967e-07, "loss": 1.9419, "step": 10420 }, { "epoch": 0.7024695933425423, "grad_norm": 5.566173466330503, "learning_rate": 2.462409521921282e-07, "loss": 1.976, "step": 10425 }, { "epoch": 0.7028065092146492, "grad_norm": 4.917328648281732, "learning_rate": 2.4573444067538985e-07, "loss": 2.0255, "step": 10430 }, { "epoch": 0.7031434250867559, "grad_norm": 5.191422739061956, "learning_rate": 2.4522828086031404e-07, "loss": 2.0722, "step": 10435 }, { "epoch": 0.7034803409588626, "grad_norm": 5.239604070167589, "learning_rate": 2.4472247344702424e-07, "loss": 1.9936, "step": 10440 }, { "epoch": 0.7038172568309693, "grad_norm": 5.370313588942913, "learning_rate": 2.442170191351566e-07, "loss": 2.0844, "step": 10445 }, { "epoch": 
0.704154172703076, "grad_norm": 5.434662098067248, "learning_rate": 2.4371191862385816e-07, "loss": 1.9363, "step": 10450 }, { "epoch": 0.7044910885751827, "grad_norm": 4.958342595493026, "learning_rate": 2.4320717261178715e-07, "loss": 2.0229, "step": 10455 }, { "epoch": 0.7048280044472895, "grad_norm": 5.164429000420489, "learning_rate": 2.4270278179711163e-07, "loss": 1.9813, "step": 10460 }, { "epoch": 0.7051649203193963, "grad_norm": 5.280532442496123, "learning_rate": 2.4219874687750754e-07, "loss": 2.0119, "step": 10465 }, { "epoch": 0.705501836191503, "grad_norm": 5.224338756479252, "learning_rate": 2.4169506855015923e-07, "loss": 2.053, "step": 10470 }, { "epoch": 0.7058387520636097, "grad_norm": 4.960440276902753, "learning_rate": 2.4119174751175787e-07, "loss": 1.9989, "step": 10475 }, { "epoch": 0.7061756679357164, "grad_norm": 4.960921425094646, "learning_rate": 2.406887844584998e-07, "loss": 1.9793, "step": 10480 }, { "epoch": 0.7065125838078232, "grad_norm": 5.067306917138349, "learning_rate": 2.401861800860868e-07, "loss": 1.9899, "step": 10485 }, { "epoch": 0.7068494996799299, "grad_norm": 4.609655365599367, "learning_rate": 2.396839350897241e-07, "loss": 2.0266, "step": 10490 }, { "epoch": 0.7071864155520367, "grad_norm": 5.4167987261151325, "learning_rate": 2.391820501641203e-07, "loss": 2.0665, "step": 10495 }, { "epoch": 0.7075233314241434, "grad_norm": 5.26781525056927, "learning_rate": 2.3868052600348524e-07, "loss": 2.0052, "step": 10500 }, { "epoch": 0.7078602472962501, "grad_norm": 4.806868253565441, "learning_rate": 2.381793633015305e-07, "loss": 1.9893, "step": 10505 }, { "epoch": 0.7081971631683569, "grad_norm": 5.117769027235077, "learning_rate": 2.3767856275146748e-07, "loss": 1.9885, "step": 10510 }, { "epoch": 0.7085340790404636, "grad_norm": 5.118887719219898, "learning_rate": 2.3717812504600616e-07, "loss": 1.9665, "step": 10515 }, { "epoch": 0.7088709949125703, "grad_norm": 4.920784714213429, "learning_rate": 
2.3667805087735516e-07, "loss": 2.0159, "step": 10520 }, { "epoch": 0.709207910784677, "grad_norm": 5.234643927621848, "learning_rate": 2.3617834093722033e-07, "loss": 1.9591, "step": 10525 }, { "epoch": 0.7095448266567838, "grad_norm": 5.261947817753631, "learning_rate": 2.3567899591680317e-07, "loss": 2.0064, "step": 10530 }, { "epoch": 0.7098817425288906, "grad_norm": 4.941322857713454, "learning_rate": 2.351800165068008e-07, "loss": 1.9814, "step": 10535 }, { "epoch": 0.7102186584009973, "grad_norm": 4.854438251388307, "learning_rate": 2.346814033974047e-07, "loss": 2.0071, "step": 10540 }, { "epoch": 0.710555574273104, "grad_norm": 4.981468518664879, "learning_rate": 2.3418315727829962e-07, "loss": 2.0404, "step": 10545 }, { "epoch": 0.7108924901452107, "grad_norm": 4.773675793808579, "learning_rate": 2.336852788386623e-07, "loss": 2.0394, "step": 10550 }, { "epoch": 0.7112294060173174, "grad_norm": 5.564910595662464, "learning_rate": 2.331877687671614e-07, "loss": 2.0104, "step": 10555 }, { "epoch": 0.7115663218894243, "grad_norm": 5.318652568410829, "learning_rate": 2.3269062775195596e-07, "loss": 1.9898, "step": 10560 }, { "epoch": 0.711903237761531, "grad_norm": 5.01582367853268, "learning_rate": 2.321938564806944e-07, "loss": 2.0398, "step": 10565 }, { "epoch": 0.7122401536336377, "grad_norm": 5.3290880663801214, "learning_rate": 2.3169745564051353e-07, "loss": 1.9985, "step": 10570 }, { "epoch": 0.7125770695057444, "grad_norm": 4.987687884237385, "learning_rate": 2.3120142591803825e-07, "loss": 2.034, "step": 10575 }, { "epoch": 0.7129139853778511, "grad_norm": 5.274484580521031, "learning_rate": 2.307057679993797e-07, "loss": 1.9523, "step": 10580 }, { "epoch": 0.7132509012499579, "grad_norm": 4.904945635515845, "learning_rate": 2.30210482570135e-07, "loss": 2.0111, "step": 10585 }, { "epoch": 0.7135878171220646, "grad_norm": 4.8876431268062, "learning_rate": 2.2971557031538607e-07, "loss": 1.9967, "step": 10590 }, { "epoch": 0.7139247329941714, 
"grad_norm": 4.663310348660277, "learning_rate": 2.2922103191969828e-07, "loss": 2.0509, "step": 10595 }, { "epoch": 0.7142616488662781, "grad_norm": 5.285761772752407, "learning_rate": 2.2872686806712032e-07, "loss": 1.9771, "step": 10600 }, { "epoch": 0.7145985647383848, "grad_norm": 5.180221002585959, "learning_rate": 2.2823307944118254e-07, "loss": 1.9489, "step": 10605 }, { "epoch": 0.7149354806104916, "grad_norm": 5.3037997443600755, "learning_rate": 2.2773966672489665e-07, "loss": 2.0635, "step": 10610 }, { "epoch": 0.7152723964825983, "grad_norm": 5.3495122591787885, "learning_rate": 2.2724663060075368e-07, "loss": 1.974, "step": 10615 }, { "epoch": 0.715609312354705, "grad_norm": 4.796069271209043, "learning_rate": 2.2675397175072437e-07, "loss": 1.988, "step": 10620 }, { "epoch": 0.7159462282268118, "grad_norm": 4.430140444662421, "learning_rate": 2.2626169085625762e-07, "loss": 1.9507, "step": 10625 }, { "epoch": 0.7162831440989185, "grad_norm": 4.782009888364589, "learning_rate": 2.25769788598279e-07, "loss": 1.9928, "step": 10630 }, { "epoch": 0.7166200599710252, "grad_norm": 5.263455168079604, "learning_rate": 2.2527826565719084e-07, "loss": 1.9258, "step": 10635 }, { "epoch": 0.716956975843132, "grad_norm": 4.949157831037087, "learning_rate": 2.2478712271287087e-07, "loss": 2.025, "step": 10640 }, { "epoch": 0.7172938917152387, "grad_norm": 5.0266791824221, "learning_rate": 2.2429636044467059e-07, "loss": 1.987, "step": 10645 }, { "epoch": 0.7176308075873454, "grad_norm": 4.655125769529071, "learning_rate": 2.2380597953141573e-07, "loss": 2.067, "step": 10650 }, { "epoch": 0.7179677234594521, "grad_norm": 5.01379968487145, "learning_rate": 2.2331598065140396e-07, "loss": 1.9934, "step": 10655 }, { "epoch": 0.7183046393315589, "grad_norm": 5.036134286818101, "learning_rate": 2.228263644824045e-07, "loss": 2.0882, "step": 10660 }, { "epoch": 0.7186415552036657, "grad_norm": 4.651826922943351, "learning_rate": 2.2233713170165757e-07, "loss": 1.9864, 
"step": 10665 }, { "epoch": 0.7189784710757724, "grad_norm": 4.948634631276915, "learning_rate": 2.2184828298587298e-07, "loss": 2.0312, "step": 10670 }, { "epoch": 0.7193153869478791, "grad_norm": 4.696786715476831, "learning_rate": 2.213598190112294e-07, "loss": 1.9271, "step": 10675 }, { "epoch": 0.7196523028199858, "grad_norm": 5.198134588839145, "learning_rate": 2.2087174045337275e-07, "loss": 1.9816, "step": 10680 }, { "epoch": 0.7199892186920925, "grad_norm": 4.766452017312037, "learning_rate": 2.2038404798741644e-07, "loss": 1.9854, "step": 10685 }, { "epoch": 0.7203261345641994, "grad_norm": 4.978507029623713, "learning_rate": 2.1989674228793987e-07, "loss": 1.9784, "step": 10690 }, { "epoch": 0.7206630504363061, "grad_norm": 5.5611003172149776, "learning_rate": 2.1940982402898684e-07, "loss": 2.0236, "step": 10695 }, { "epoch": 0.7209999663084128, "grad_norm": 4.8460718555430855, "learning_rate": 2.1892329388406582e-07, "loss": 1.9634, "step": 10700 }, { "epoch": 0.7213368821805195, "grad_norm": 5.104603836971819, "learning_rate": 2.1843715252614847e-07, "loss": 1.9436, "step": 10705 }, { "epoch": 0.7216737980526262, "grad_norm": 5.1083922084820905, "learning_rate": 2.179514006276681e-07, "loss": 2.0075, "step": 10710 }, { "epoch": 0.722010713924733, "grad_norm": 4.370177802395508, "learning_rate": 2.1746603886051978e-07, "loss": 2.034, "step": 10715 }, { "epoch": 0.7223476297968398, "grad_norm": 4.733250161097782, "learning_rate": 2.169810678960591e-07, "loss": 1.9685, "step": 10720 }, { "epoch": 0.7226845456689465, "grad_norm": 4.321954928469265, "learning_rate": 2.1649648840510047e-07, "loss": 1.9379, "step": 10725 }, { "epoch": 0.7230214615410532, "grad_norm": 5.044869417369933, "learning_rate": 2.1601230105791751e-07, "loss": 2.0242, "step": 10730 }, { "epoch": 0.7233583774131599, "grad_norm": 5.4070169113370605, "learning_rate": 2.1552850652424077e-07, "loss": 2.0559, "step": 10735 }, { "epoch": 0.7236952932852667, "grad_norm": 6.26428332589417, 
"learning_rate": 2.150451054732581e-07, "loss": 1.9618, "step": 10740 }, { "epoch": 0.7240322091573734, "grad_norm": 5.306212732243649, "learning_rate": 2.1456209857361246e-07, "loss": 1.9729, "step": 10745 }, { "epoch": 0.7243691250294801, "grad_norm": 4.645893437753715, "learning_rate": 2.1407948649340208e-07, "loss": 1.9988, "step": 10750 }, { "epoch": 0.7247060409015869, "grad_norm": 5.504400358503494, "learning_rate": 2.1359726990017908e-07, "loss": 1.9366, "step": 10755 }, { "epoch": 0.7250429567736936, "grad_norm": 5.440933070865728, "learning_rate": 2.13115449460948e-07, "loss": 2.0044, "step": 10760 }, { "epoch": 0.7253798726458004, "grad_norm": 5.264145374062786, "learning_rate": 2.12634025842166e-07, "loss": 1.936, "step": 10765 }, { "epoch": 0.7257167885179071, "grad_norm": 5.358960503761265, "learning_rate": 2.1215299970974132e-07, "loss": 2.0355, "step": 10770 }, { "epoch": 0.7260537043900138, "grad_norm": 4.597070878044778, "learning_rate": 2.116723717290318e-07, "loss": 2.0378, "step": 10775 }, { "epoch": 0.7263906202621205, "grad_norm": 5.273165421247156, "learning_rate": 2.111921425648453e-07, "loss": 2.0454, "step": 10780 }, { "epoch": 0.7267275361342272, "grad_norm": 4.934956752705849, "learning_rate": 2.1071231288143777e-07, "loss": 2.0114, "step": 10785 }, { "epoch": 0.7270644520063341, "grad_norm": 5.122435094412147, "learning_rate": 2.1023288334251222e-07, "loss": 1.957, "step": 10790 }, { "epoch": 0.7274013678784408, "grad_norm": 4.266379900970564, "learning_rate": 2.0975385461121864e-07, "loss": 2.0474, "step": 10795 }, { "epoch": 0.7277382837505475, "grad_norm": 5.0739623027719745, "learning_rate": 2.0927522735015268e-07, "loss": 1.9009, "step": 10800 }, { "epoch": 0.7280751996226542, "grad_norm": 4.633978637132555, "learning_rate": 2.0879700222135416e-07, "loss": 2.0016, "step": 10805 }, { "epoch": 0.7284121154947609, "grad_norm": 5.23213032631426, "learning_rate": 2.083191798863072e-07, "loss": 1.9992, "step": 10810 }, { "epoch": 
0.7287490313668676, "grad_norm": 5.134690227696205, "learning_rate": 2.0784176100593836e-07, "loss": 1.9818, "step": 10815 }, { "epoch": 0.7290859472389745, "grad_norm": 5.197183784688419, "learning_rate": 2.0736474624061655e-07, "loss": 2.0632, "step": 10820 }, { "epoch": 0.7294228631110812, "grad_norm": 4.743760381667321, "learning_rate": 2.0688813625015123e-07, "loss": 1.9809, "step": 10825 }, { "epoch": 0.7297597789831879, "grad_norm": 5.240192523885652, "learning_rate": 2.064119316937923e-07, "loss": 2.032, "step": 10830 }, { "epoch": 0.7300966948552946, "grad_norm": 5.111670971715797, "learning_rate": 2.0593613323022907e-07, "loss": 1.964, "step": 10835 }, { "epoch": 0.7304336107274013, "grad_norm": 5.247700889898067, "learning_rate": 2.054607415175884e-07, "loss": 2.0109, "step": 10840 }, { "epoch": 0.7307705265995081, "grad_norm": 5.359098017985685, "learning_rate": 2.0498575721343525e-07, "loss": 2.0172, "step": 10845 }, { "epoch": 0.7311074424716149, "grad_norm": 5.0168200516387005, "learning_rate": 2.0451118097477093e-07, "loss": 1.9904, "step": 10850 }, { "epoch": 0.7314443583437216, "grad_norm": 5.693510740185484, "learning_rate": 2.0403701345803186e-07, "loss": 1.9957, "step": 10855 }, { "epoch": 0.7317812742158283, "grad_norm": 5.023716778898175, "learning_rate": 2.0356325531908952e-07, "loss": 1.9917, "step": 10860 }, { "epoch": 0.732118190087935, "grad_norm": 4.8120312329835, "learning_rate": 2.0308990721324926e-07, "loss": 2.0076, "step": 10865 }, { "epoch": 0.7324551059600418, "grad_norm": 4.789085365447281, "learning_rate": 2.0261696979524873e-07, "loss": 2.0088, "step": 10870 }, { "epoch": 0.7327920218321485, "grad_norm": 4.922861144590358, "learning_rate": 2.0214444371925793e-07, "loss": 1.9165, "step": 10875 }, { "epoch": 0.7331289377042552, "grad_norm": 5.046119117215446, "learning_rate": 2.0167232963887787e-07, "loss": 1.9616, "step": 10880 }, { "epoch": 0.733465853576362, "grad_norm": 5.421894517437394, "learning_rate": 
2.0120062820713974e-07, "loss": 2.0044, "step": 10885 }, { "epoch": 0.7338027694484687, "grad_norm": 5.296593967427492, "learning_rate": 2.0072934007650345e-07, "loss": 2.0106, "step": 10890 }, { "epoch": 0.7341396853205755, "grad_norm": 4.9145286940698565, "learning_rate": 2.0025846589885798e-07, "loss": 2.0075, "step": 10895 }, { "epoch": 0.7344766011926822, "grad_norm": 4.964856420686315, "learning_rate": 1.99788006325519e-07, "loss": 1.9051, "step": 10900 }, { "epoch": 0.7348135170647889, "grad_norm": 4.790590235464161, "learning_rate": 1.9931796200722943e-07, "loss": 1.9917, "step": 10905 }, { "epoch": 0.7351504329368956, "grad_norm": 5.031813219297497, "learning_rate": 1.9884833359415698e-07, "loss": 2.0125, "step": 10910 }, { "epoch": 0.7354873488090024, "grad_norm": 4.872250882810023, "learning_rate": 1.9837912173589494e-07, "loss": 2.0404, "step": 10915 }, { "epoch": 0.7358242646811092, "grad_norm": 4.998721399342274, "learning_rate": 1.979103270814596e-07, "loss": 2.0605, "step": 10920 }, { "epoch": 0.7361611805532159, "grad_norm": 4.9504153668413675, "learning_rate": 1.9744195027929072e-07, "loss": 2.0145, "step": 10925 }, { "epoch": 0.7364980964253226, "grad_norm": 5.446256100622866, "learning_rate": 1.9697399197725023e-07, "loss": 2.0228, "step": 10930 }, { "epoch": 0.7368350122974293, "grad_norm": 4.746168399699299, "learning_rate": 1.965064528226204e-07, "loss": 2.0088, "step": 10935 }, { "epoch": 0.737171928169536, "grad_norm": 5.019495482072339, "learning_rate": 1.9603933346210445e-07, "loss": 2.0468, "step": 10940 }, { "epoch": 0.7375088440416429, "grad_norm": 5.364976138068592, "learning_rate": 1.9557263454182476e-07, "loss": 1.9977, "step": 10945 }, { "epoch": 0.7378457599137496, "grad_norm": 4.701419541176352, "learning_rate": 1.9510635670732216e-07, "loss": 1.9906, "step": 10950 }, { "epoch": 0.7381826757858563, "grad_norm": 5.190610137154199, "learning_rate": 1.946405006035548e-07, "loss": 1.9961, "step": 10955 }, { "epoch": 
0.738519591657963, "grad_norm": 4.990030816611463, "learning_rate": 1.9417506687489772e-07, "loss": 2.0145, "step": 10960 }, { "epoch": 0.7388565075300697, "grad_norm": 5.16625707132003, "learning_rate": 1.937100561651418e-07, "loss": 1.9641, "step": 10965 }, { "epoch": 0.7391934234021765, "grad_norm": 5.210160718183523, "learning_rate": 1.9324546911749246e-07, "loss": 2.0062, "step": 10970 }, { "epoch": 0.7395303392742832, "grad_norm": 4.777112074992779, "learning_rate": 1.9278130637456957e-07, "loss": 1.9248, "step": 10975 }, { "epoch": 0.73986725514639, "grad_norm": 5.28124903606523, "learning_rate": 1.923175685784056e-07, "loss": 2.0034, "step": 10980 }, { "epoch": 0.7402041710184967, "grad_norm": 5.586277296871801, "learning_rate": 1.9185425637044567e-07, "loss": 2.0053, "step": 10985 }, { "epoch": 0.7405410868906034, "grad_norm": 4.9157976487694155, "learning_rate": 1.9139137039154584e-07, "loss": 2.0376, "step": 10990 }, { "epoch": 0.7408780027627101, "grad_norm": 4.906441267872725, "learning_rate": 1.9092891128197308e-07, "loss": 1.9676, "step": 10995 }, { "epoch": 0.7412149186348169, "grad_norm": 5.051075556471816, "learning_rate": 1.904668796814033e-07, "loss": 2.0928, "step": 11000 }, { "epoch": 0.7415518345069236, "grad_norm": 5.9530524337680735, "learning_rate": 1.9000527622892154e-07, "loss": 2.0368, "step": 11005 }, { "epoch": 0.7418887503790303, "grad_norm": 4.822532565660593, "learning_rate": 1.895441015630206e-07, "loss": 2.0029, "step": 11010 }, { "epoch": 0.7422256662511371, "grad_norm": 5.19381504831183, "learning_rate": 1.8908335632160011e-07, "loss": 2.0862, "step": 11015 }, { "epoch": 0.7425625821232438, "grad_norm": 5.115866049422488, "learning_rate": 1.8862304114196542e-07, "loss": 1.9964, "step": 11020 }, { "epoch": 0.7428994979953506, "grad_norm": 5.3077195880649635, "learning_rate": 1.8816315666082744e-07, "loss": 1.9758, "step": 11025 }, { "epoch": 0.7432364138674573, "grad_norm": 4.873655749893997, "learning_rate": 
1.877037035143013e-07, "loss": 2.0093, "step": 11030 }, { "epoch": 0.743573329739564, "grad_norm": 4.737072666953979, "learning_rate": 1.8724468233790512e-07, "loss": 1.982, "step": 11035 }, { "epoch": 0.7439102456116707, "grad_norm": 5.564077716047938, "learning_rate": 1.867860937665599e-07, "loss": 2.0877, "step": 11040 }, { "epoch": 0.7442471614837775, "grad_norm": 4.723525880785518, "learning_rate": 1.8632793843458827e-07, "loss": 2.0339, "step": 11045 }, { "epoch": 0.7445840773558843, "grad_norm": 5.589752843260553, "learning_rate": 1.8587021697571313e-07, "loss": 1.9892, "step": 11050 }, { "epoch": 0.744920993227991, "grad_norm": 5.19244964065091, "learning_rate": 1.854129300230578e-07, "loss": 2.0254, "step": 11055 }, { "epoch": 0.7452579091000977, "grad_norm": 5.103102252941908, "learning_rate": 1.849560782091445e-07, "loss": 1.9557, "step": 11060 }, { "epoch": 0.7455948249722044, "grad_norm": 5.381909203512008, "learning_rate": 1.8449966216589319e-07, "loss": 2.0147, "step": 11065 }, { "epoch": 0.7459317408443111, "grad_norm": 5.406433065377391, "learning_rate": 1.8404368252462128e-07, "loss": 1.9637, "step": 11070 }, { "epoch": 0.746268656716418, "grad_norm": 5.218762523844258, "learning_rate": 1.8358813991604262e-07, "loss": 2.0528, "step": 11075 }, { "epoch": 0.7466055725885247, "grad_norm": 4.801018348760475, "learning_rate": 1.8313303497026673e-07, "loss": 1.9956, "step": 11080 }, { "epoch": 0.7469424884606314, "grad_norm": 5.086702033613458, "learning_rate": 1.8267836831679718e-07, "loss": 2.0589, "step": 11085 }, { "epoch": 0.7472794043327381, "grad_norm": 5.173623750968069, "learning_rate": 1.8222414058453183e-07, "loss": 2.0198, "step": 11090 }, { "epoch": 0.7476163202048448, "grad_norm": 4.847022358551245, "learning_rate": 1.8177035240176136e-07, "loss": 2.0477, "step": 11095 }, { "epoch": 0.7479532360769516, "grad_norm": 5.512766734033958, "learning_rate": 1.8131700439616803e-07, "loss": 2.0584, "step": 11100 }, { "epoch": 0.7482901519490583, 
"grad_norm": 4.763615375992154, "learning_rate": 1.8086409719482576e-07, "loss": 1.9261, "step": 11105 }, { "epoch": 0.7486270678211651, "grad_norm": 4.807729868754023, "learning_rate": 1.8041163142419857e-07, "loss": 2.0378, "step": 11110 }, { "epoch": 0.7489639836932718, "grad_norm": 5.224622221528324, "learning_rate": 1.7995960771013962e-07, "loss": 1.9757, "step": 11115 }, { "epoch": 0.7493008995653785, "grad_norm": 5.459666021332231, "learning_rate": 1.7950802667789107e-07, "loss": 2.0513, "step": 11120 }, { "epoch": 0.7496378154374853, "grad_norm": 4.748602816761382, "learning_rate": 1.7905688895208259e-07, "loss": 1.9784, "step": 11125 }, { "epoch": 0.749974731309592, "grad_norm": 4.79929759426102, "learning_rate": 1.7860619515673032e-07, "loss": 2.0179, "step": 11130 }, { "epoch": 0.7503116471816987, "grad_norm": 5.481553760644221, "learning_rate": 1.7815594591523687e-07, "loss": 2.0541, "step": 11135 }, { "epoch": 0.7506485630538055, "grad_norm": 5.367588389612939, "learning_rate": 1.777061418503898e-07, "loss": 2.0209, "step": 11140 }, { "epoch": 0.7509854789259122, "grad_norm": 4.99115372403347, "learning_rate": 1.7725678358436053e-07, "loss": 2.0117, "step": 11145 }, { "epoch": 0.751322394798019, "grad_norm": 5.1453613933189235, "learning_rate": 1.7680787173870454e-07, "loss": 1.9277, "step": 11150 }, { "epoch": 0.7516593106701257, "grad_norm": 4.952432377327677, "learning_rate": 1.763594069343589e-07, "loss": 1.9474, "step": 11155 }, { "epoch": 0.7519962265422324, "grad_norm": 5.466893529934965, "learning_rate": 1.7591138979164337e-07, "loss": 1.9533, "step": 11160 }, { "epoch": 0.7523331424143391, "grad_norm": 4.997322082300055, "learning_rate": 1.7546382093025758e-07, "loss": 2.0079, "step": 11165 }, { "epoch": 0.7526700582864458, "grad_norm": 5.28942789971942, "learning_rate": 1.7501670096928162e-07, "loss": 2.029, "step": 11170 }, { "epoch": 0.7530069741585526, "grad_norm": 4.615511600030695, "learning_rate": 1.7457003052717473e-07, "loss": 2.0574, 
"step": 11175 }, { "epoch": 0.7533438900306594, "grad_norm": 5.341823631988656, "learning_rate": 1.741238102217738e-07, "loss": 1.979, "step": 11180 }, { "epoch": 0.7536808059027661, "grad_norm": 5.47546338386817, "learning_rate": 1.736780406702937e-07, "loss": 2.0022, "step": 11185 }, { "epoch": 0.7540177217748728, "grad_norm": 5.050660523239323, "learning_rate": 1.7323272248932564e-07, "loss": 1.9573, "step": 11190 }, { "epoch": 0.7543546376469795, "grad_norm": 4.630782730602179, "learning_rate": 1.727878562948362e-07, "loss": 1.9781, "step": 11195 }, { "epoch": 0.7546915535190862, "grad_norm": 5.470520107714299, "learning_rate": 1.723434427021671e-07, "loss": 1.9163, "step": 11200 }, { "epoch": 0.7550284693911931, "grad_norm": 5.082237230807087, "learning_rate": 1.7189948232603412e-07, "loss": 2.0362, "step": 11205 }, { "epoch": 0.7553653852632998, "grad_norm": 4.670960356313788, "learning_rate": 1.7145597578052557e-07, "loss": 2.0587, "step": 11210 }, { "epoch": 0.7557023011354065, "grad_norm": 4.941758055905407, "learning_rate": 1.7101292367910259e-07, "loss": 2.0429, "step": 11215 }, { "epoch": 0.7560392170075132, "grad_norm": 5.190528354506311, "learning_rate": 1.7057032663459768e-07, "loss": 2.0422, "step": 11220 }, { "epoch": 0.7563761328796199, "grad_norm": 5.57682344084383, "learning_rate": 1.701281852592134e-07, "loss": 2.0069, "step": 11225 }, { "epoch": 0.7567130487517267, "grad_norm": 4.776135010862406, "learning_rate": 1.696865001645228e-07, "loss": 1.9623, "step": 11230 }, { "epoch": 0.7570499646238334, "grad_norm": 4.711428074872487, "learning_rate": 1.6924527196146692e-07, "loss": 2.0743, "step": 11235 }, { "epoch": 0.7573868804959402, "grad_norm": 5.128604387667783, "learning_rate": 1.6880450126035572e-07, "loss": 2.0007, "step": 11240 }, { "epoch": 0.7577237963680469, "grad_norm": 5.337739336112739, "learning_rate": 1.683641886708655e-07, "loss": 1.9606, "step": 11245 }, { "epoch": 0.7580607122401536, "grad_norm": 5.178251302149372, 
"learning_rate": 1.6792433480203955e-07, "loss": 1.973, "step": 11250 }, { "epoch": 0.7583976281122604, "grad_norm": 4.995699294476046, "learning_rate": 1.674849402622865e-07, "loss": 2.0229, "step": 11255 }, { "epoch": 0.7587345439843671, "grad_norm": 4.955764992337204, "learning_rate": 1.6704600565937927e-07, "loss": 2.027, "step": 11260 }, { "epoch": 0.7590714598564738, "grad_norm": 5.235453233205671, "learning_rate": 1.6660753160045498e-07, "loss": 1.9517, "step": 11265 }, { "epoch": 0.7594083757285806, "grad_norm": 5.094331332828706, "learning_rate": 1.6616951869201378e-07, "loss": 1.976, "step": 11270 }, { "epoch": 0.7597452916006873, "grad_norm": 4.555613460729381, "learning_rate": 1.6573196753991747e-07, "loss": 2.0588, "step": 11275 }, { "epoch": 0.7600822074727941, "grad_norm": 5.064553114668466, "learning_rate": 1.652948787493896e-07, "loss": 2.0135, "step": 11280 }, { "epoch": 0.7604191233449008, "grad_norm": 5.050308357014605, "learning_rate": 1.64858252925014e-07, "loss": 2.018, "step": 11285 }, { "epoch": 0.7607560392170075, "grad_norm": 4.984344939346089, "learning_rate": 1.6442209067073442e-07, "loss": 1.9854, "step": 11290 }, { "epoch": 0.7610929550891142, "grad_norm": 5.183850133763077, "learning_rate": 1.639863925898527e-07, "loss": 2.0132, "step": 11295 }, { "epoch": 0.761429870961221, "grad_norm": 5.514737042115228, "learning_rate": 1.6355115928502934e-07, "loss": 2.0293, "step": 11300 }, { "epoch": 0.7617667868333278, "grad_norm": 5.003702063934968, "learning_rate": 1.6311639135828176e-07, "loss": 2.0229, "step": 11305 }, { "epoch": 0.7621037027054345, "grad_norm": 5.494847096907641, "learning_rate": 1.6268208941098344e-07, "loss": 1.9879, "step": 11310 }, { "epoch": 0.7624406185775412, "grad_norm": 5.04020461927423, "learning_rate": 1.6224825404386326e-07, "loss": 1.9674, "step": 11315 }, { "epoch": 0.7627775344496479, "grad_norm": 4.802126733755397, "learning_rate": 1.6181488585700541e-07, "loss": 1.91, "step": 11320 }, { "epoch": 
0.7631144503217546, "grad_norm": 5.319670050719582, "learning_rate": 1.6138198544984692e-07, "loss": 2.0456, "step": 11325 }, { "epoch": 0.7634513661938614, "grad_norm": 4.926729840018237, "learning_rate": 1.609495534211785e-07, "loss": 2.0013, "step": 11330 }, { "epoch": 0.7637882820659682, "grad_norm": 4.855248729538437, "learning_rate": 1.6051759036914286e-07, "loss": 1.9634, "step": 11335 }, { "epoch": 0.7641251979380749, "grad_norm": 4.993611752117962, "learning_rate": 1.6008609689123364e-07, "loss": 2.0345, "step": 11340 }, { "epoch": 0.7644621138101816, "grad_norm": 5.139326914669983, "learning_rate": 1.596550735842953e-07, "loss": 2.0355, "step": 11345 }, { "epoch": 0.7647990296822883, "grad_norm": 5.174779487727646, "learning_rate": 1.5922452104452204e-07, "loss": 2.0063, "step": 11350 }, { "epoch": 0.765135945554395, "grad_norm": 5.279430465805613, "learning_rate": 1.5879443986745678e-07, "loss": 2.0262, "step": 11355 }, { "epoch": 0.7654728614265018, "grad_norm": 5.068836607586557, "learning_rate": 1.583648306479901e-07, "loss": 1.9861, "step": 11360 }, { "epoch": 0.7658097772986086, "grad_norm": 5.31624161839457, "learning_rate": 1.5793569398036032e-07, "loss": 2.0241, "step": 11365 }, { "epoch": 0.7661466931707153, "grad_norm": 4.889675909472314, "learning_rate": 1.57507030458152e-07, "loss": 1.9385, "step": 11370 }, { "epoch": 0.766483609042822, "grad_norm": 4.801947048892856, "learning_rate": 1.5707884067429471e-07, "loss": 2.0822, "step": 11375 }, { "epoch": 0.7668205249149287, "grad_norm": 4.81949295944907, "learning_rate": 1.566511252210635e-07, "loss": 1.9649, "step": 11380 }, { "epoch": 0.7671574407870355, "grad_norm": 4.606477464570745, "learning_rate": 1.5622388469007696e-07, "loss": 1.985, "step": 11385 }, { "epoch": 0.7674943566591422, "grad_norm": 4.830304327205954, "learning_rate": 1.5579711967229652e-07, "loss": 1.9709, "step": 11390 }, { "epoch": 0.7678312725312489, "grad_norm": 5.045634475144774, "learning_rate": 1.5537083075802647e-07, 
"loss": 2.0307, "step": 11395 }, { "epoch": 0.7681681884033557, "grad_norm": 4.879221212529525, "learning_rate": 1.5494501853691195e-07, "loss": 2.0334, "step": 11400 }, { "epoch": 0.7685051042754624, "grad_norm": 5.406448923909657, "learning_rate": 1.5451968359793927e-07, "loss": 1.9417, "step": 11405 }, { "epoch": 0.7688420201475692, "grad_norm": 4.9331531887204445, "learning_rate": 1.5409482652943396e-07, "loss": 1.9531, "step": 11410 }, { "epoch": 0.7691789360196759, "grad_norm": 5.273668684068152, "learning_rate": 1.536704479190611e-07, "loss": 1.9496, "step": 11415 }, { "epoch": 0.7695158518917826, "grad_norm": 4.884464065234594, "learning_rate": 1.5324654835382384e-07, "loss": 2.0251, "step": 11420 }, { "epoch": 0.7698527677638893, "grad_norm": 4.862532147443118, "learning_rate": 1.5282312842006238e-07, "loss": 2.0081, "step": 11425 }, { "epoch": 0.770189683635996, "grad_norm": 4.869510176953786, "learning_rate": 1.5240018870345388e-07, "loss": 1.9699, "step": 11430 }, { "epoch": 0.7705265995081029, "grad_norm": 5.38609481006596, "learning_rate": 1.519777297890113e-07, "loss": 1.9423, "step": 11435 }, { "epoch": 0.7708635153802096, "grad_norm": 5.78015482534876, "learning_rate": 1.5155575226108198e-07, "loss": 1.9768, "step": 11440 }, { "epoch": 0.7712004312523163, "grad_norm": 4.962005039799369, "learning_rate": 1.51134256703348e-07, "loss": 2.0498, "step": 11445 }, { "epoch": 0.771537347124423, "grad_norm": 4.999588836737138, "learning_rate": 1.5071324369882478e-07, "loss": 1.9413, "step": 11450 }, { "epoch": 0.7718742629965297, "grad_norm": 5.008865368978979, "learning_rate": 1.5029271382985964e-07, "loss": 1.9909, "step": 11455 }, { "epoch": 0.7722111788686365, "grad_norm": 5.083324449701367, "learning_rate": 1.498726676781323e-07, "loss": 2.0461, "step": 11460 }, { "epoch": 0.7725480947407433, "grad_norm": 5.45548723077481, "learning_rate": 1.4945310582465327e-07, "loss": 2.0279, "step": 11465 }, { "epoch": 0.77288501061285, "grad_norm": 
4.760598382687764, "learning_rate": 1.4903402884976262e-07, "loss": 2.0394, "step": 11470 }, { "epoch": 0.7732219264849567, "grad_norm": 5.660833791479961, "learning_rate": 1.4861543733313065e-07, "loss": 2.0142, "step": 11475 }, { "epoch": 0.7735588423570634, "grad_norm": 5.126370885487476, "learning_rate": 1.4819733185375531e-07, "loss": 2.0996, "step": 11480 }, { "epoch": 0.7738957582291702, "grad_norm": 5.203411449140683, "learning_rate": 1.4777971298996288e-07, "loss": 1.9766, "step": 11485 }, { "epoch": 0.7742326741012769, "grad_norm": 4.684629256455359, "learning_rate": 1.4736258131940605e-07, "loss": 1.9378, "step": 11490 }, { "epoch": 0.7745695899733837, "grad_norm": 5.347418866541802, "learning_rate": 1.4694593741906403e-07, "loss": 2.0136, "step": 11495 }, { "epoch": 0.7749065058454904, "grad_norm": 5.202584988208978, "learning_rate": 1.4652978186524135e-07, "loss": 2.0369, "step": 11500 }, { "epoch": 0.7752434217175971, "grad_norm": 4.775135036777449, "learning_rate": 1.4611411523356653e-07, "loss": 2.019, "step": 11505 }, { "epoch": 0.7755803375897039, "grad_norm": 4.945846568841138, "learning_rate": 1.4569893809899242e-07, "loss": 2.0628, "step": 11510 }, { "epoch": 0.7759172534618106, "grad_norm": 5.002763391436914, "learning_rate": 1.452842510357946e-07, "loss": 1.9638, "step": 11515 }, { "epoch": 0.7762541693339173, "grad_norm": 4.947392866909309, "learning_rate": 1.4487005461757051e-07, "loss": 1.9584, "step": 11520 }, { "epoch": 0.776591085206024, "grad_norm": 4.911295391828476, "learning_rate": 1.4445634941723927e-07, "loss": 2.0095, "step": 11525 }, { "epoch": 0.7769280010781308, "grad_norm": 5.12710203956397, "learning_rate": 1.4404313600704054e-07, "loss": 2.0849, "step": 11530 }, { "epoch": 0.7772649169502375, "grad_norm": 5.325585082348797, "learning_rate": 1.4363041495853334e-07, "loss": 1.9946, "step": 11535 }, { "epoch": 0.7776018328223443, "grad_norm": 5.1351764246013465, "learning_rate": 1.4321818684259607e-07, "loss": 2.0374, "step": 
11540 }, { "epoch": 0.777938748694451, "grad_norm": 4.777530635265064, "learning_rate": 1.4280645222942535e-07, "loss": 1.9876, "step": 11545 }, { "epoch": 0.7782756645665577, "grad_norm": 4.99394113418559, "learning_rate": 1.4239521168853458e-07, "loss": 1.9574, "step": 11550 }, { "epoch": 0.7786125804386644, "grad_norm": 4.777624779781727, "learning_rate": 1.4198446578875444e-07, "loss": 2.0007, "step": 11555 }, { "epoch": 0.7789494963107712, "grad_norm": 5.048966629175849, "learning_rate": 1.4157421509823119e-07, "loss": 1.9548, "step": 11560 }, { "epoch": 0.779286412182878, "grad_norm": 4.803556423078746, "learning_rate": 1.4116446018442608e-07, "loss": 1.925, "step": 11565 }, { "epoch": 0.7796233280549847, "grad_norm": 5.359961769066165, "learning_rate": 1.4075520161411425e-07, "loss": 1.9554, "step": 11570 }, { "epoch": 0.7799602439270914, "grad_norm": 4.998452365463789, "learning_rate": 1.403464399533849e-07, "loss": 2.007, "step": 11575 }, { "epoch": 0.7802971597991981, "grad_norm": 5.160838408885268, "learning_rate": 1.3993817576763983e-07, "loss": 2.0754, "step": 11580 }, { "epoch": 0.7806340756713048, "grad_norm": 5.229751091673616, "learning_rate": 1.3953040962159207e-07, "loss": 1.9909, "step": 11585 }, { "epoch": 0.7809709915434117, "grad_norm": 5.284227957360789, "learning_rate": 1.3912314207926657e-07, "loss": 1.9799, "step": 11590 }, { "epoch": 0.7813079074155184, "grad_norm": 4.916233214590697, "learning_rate": 1.3871637370399824e-07, "loss": 2.0061, "step": 11595 }, { "epoch": 0.7816448232876251, "grad_norm": 4.960716140332831, "learning_rate": 1.3831010505843139e-07, "loss": 1.9557, "step": 11600 }, { "epoch": 0.7819817391597318, "grad_norm": 4.928198266628274, "learning_rate": 1.3790433670451927e-07, "loss": 1.9388, "step": 11605 }, { "epoch": 0.7823186550318385, "grad_norm": 5.078729121405443, "learning_rate": 1.374990692035235e-07, "loss": 1.9784, "step": 11610 }, { "epoch": 0.7826555709039453, "grad_norm": 5.1598131877595, "learning_rate": 
1.3709430311601205e-07, "loss": 1.9716, "step": 11615 }, { "epoch": 0.782992486776052, "grad_norm": 4.925436691221527, "learning_rate": 1.366900390018601e-07, "loss": 1.9007, "step": 11620 }, { "epoch": 0.7833294026481588, "grad_norm": 5.172708314917181, "learning_rate": 1.3628627742024812e-07, "loss": 2.0607, "step": 11625 }, { "epoch": 0.7836663185202655, "grad_norm": 5.22474344275395, "learning_rate": 1.3588301892966182e-07, "loss": 1.876, "step": 11630 }, { "epoch": 0.7840032343923722, "grad_norm": 5.427187440230595, "learning_rate": 1.3548026408789044e-07, "loss": 2.0361, "step": 11635 }, { "epoch": 0.784340150264479, "grad_norm": 5.6082249523360375, "learning_rate": 1.350780134520272e-07, "loss": 2.0399, "step": 11640 }, { "epoch": 0.7846770661365857, "grad_norm": 4.8725676707011045, "learning_rate": 1.3467626757846733e-07, "loss": 1.9698, "step": 11645 }, { "epoch": 0.7850139820086924, "grad_norm": 4.795390442480934, "learning_rate": 1.342750270229085e-07, "loss": 1.9431, "step": 11650 }, { "epoch": 0.7853508978807991, "grad_norm": 5.510505999430798, "learning_rate": 1.338742923403487e-07, "loss": 1.9111, "step": 11655 }, { "epoch": 0.7856878137529059, "grad_norm": 5.5439448339586965, "learning_rate": 1.3347406408508694e-07, "loss": 1.9321, "step": 11660 }, { "epoch": 0.7860247296250127, "grad_norm": 4.980254644052039, "learning_rate": 1.3307434281072106e-07, "loss": 2.0454, "step": 11665 }, { "epoch": 0.7863616454971194, "grad_norm": 5.387110204522276, "learning_rate": 1.326751290701481e-07, "loss": 2.015, "step": 11670 }, { "epoch": 0.7866985613692261, "grad_norm": 4.948109753193438, "learning_rate": 1.3227642341556306e-07, "loss": 2.0273, "step": 11675 }, { "epoch": 0.7870354772413328, "grad_norm": 4.786187312006951, "learning_rate": 1.318782263984577e-07, "loss": 2.0074, "step": 11680 }, { "epoch": 0.7873723931134395, "grad_norm": 5.019583048900953, "learning_rate": 1.314805385696207e-07, "loss": 2.0181, "step": 11685 }, { "epoch": 0.7877093089855464, 
"grad_norm": 5.129207882025466, "learning_rate": 1.3108336047913633e-07, "loss": 2.0612, "step": 11690 }, { "epoch": 0.7880462248576531, "grad_norm": 4.785373031762451, "learning_rate": 1.3068669267638377e-07, "loss": 1.9885, "step": 11695 }, { "epoch": 0.7883831407297598, "grad_norm": 4.8039238081880224, "learning_rate": 1.3029053571003619e-07, "loss": 2.0264, "step": 11700 }, { "epoch": 0.7887200566018665, "grad_norm": 5.083065058413712, "learning_rate": 1.2989489012806033e-07, "loss": 2.0317, "step": 11705 }, { "epoch": 0.7890569724739732, "grad_norm": 5.528021910393774, "learning_rate": 1.294997564777157e-07, "loss": 1.9883, "step": 11710 }, { "epoch": 0.7893938883460799, "grad_norm": 5.108809410994523, "learning_rate": 1.291051353055534e-07, "loss": 2.0537, "step": 11715 }, { "epoch": 0.7897308042181868, "grad_norm": 5.176990285400275, "learning_rate": 1.28711027157416e-07, "loss": 1.9919, "step": 11720 }, { "epoch": 0.7900677200902935, "grad_norm": 4.8936829133294, "learning_rate": 1.2831743257843597e-07, "loss": 1.985, "step": 11725 }, { "epoch": 0.7904046359624002, "grad_norm": 4.940492891933523, "learning_rate": 1.279243521130361e-07, "loss": 1.9684, "step": 11730 }, { "epoch": 0.7907415518345069, "grad_norm": 4.978497334708518, "learning_rate": 1.2753178630492733e-07, "loss": 2.0841, "step": 11735 }, { "epoch": 0.7910784677066136, "grad_norm": 5.172492172156165, "learning_rate": 1.271397356971094e-07, "loss": 1.9575, "step": 11740 }, { "epoch": 0.7914153835787204, "grad_norm": 5.515783450200639, "learning_rate": 1.267482008318687e-07, "loss": 1.9997, "step": 11745 }, { "epoch": 0.7917522994508271, "grad_norm": 5.025537535105946, "learning_rate": 1.2635718225077884e-07, "loss": 2.1092, "step": 11750 }, { "epoch": 0.7920892153229339, "grad_norm": 5.105100071826227, "learning_rate": 1.259666804946991e-07, "loss": 1.9678, "step": 11755 }, { "epoch": 0.7924261311950406, "grad_norm": 5.469343530053747, "learning_rate": 1.2557669610377397e-07, "loss": 2.0014, 
"step": 11760 }, { "epoch": 0.7927630470671473, "grad_norm": 4.957619136502641, "learning_rate": 1.25187229617432e-07, "loss": 2.0506, "step": 11765 }, { "epoch": 0.7930999629392541, "grad_norm": 4.847808380325742, "learning_rate": 1.247982815743857e-07, "loss": 2.0092, "step": 11770 }, { "epoch": 0.7934368788113608, "grad_norm": 5.100805678742518, "learning_rate": 1.2440985251263054e-07, "loss": 1.9075, "step": 11775 }, { "epoch": 0.7937737946834675, "grad_norm": 4.9964523184736995, "learning_rate": 1.2402194296944363e-07, "loss": 2.0137, "step": 11780 }, { "epoch": 0.7941107105555743, "grad_norm": 5.251812520927419, "learning_rate": 1.236345534813839e-07, "loss": 1.937, "step": 11785 }, { "epoch": 0.794447626427681, "grad_norm": 4.775944595035598, "learning_rate": 1.2324768458429107e-07, "loss": 1.9931, "step": 11790 }, { "epoch": 0.7947845422997878, "grad_norm": 5.462181558901568, "learning_rate": 1.228613368132842e-07, "loss": 2.0459, "step": 11795 }, { "epoch": 0.7951214581718945, "grad_norm": 4.773031548591699, "learning_rate": 1.2247551070276207e-07, "loss": 1.8567, "step": 11800 }, { "epoch": 0.7954583740440012, "grad_norm": 5.251905636956224, "learning_rate": 1.2209020678640176e-07, "loss": 2.0607, "step": 11805 }, { "epoch": 0.7957952899161079, "grad_norm": 5.297749511813894, "learning_rate": 1.2170542559715775e-07, "loss": 1.9505, "step": 11810 }, { "epoch": 0.7961322057882146, "grad_norm": 5.370043675183787, "learning_rate": 1.2132116766726196e-07, "loss": 2.0243, "step": 11815 }, { "epoch": 0.7964691216603215, "grad_norm": 5.098637988870714, "learning_rate": 1.2093743352822206e-07, "loss": 2.0081, "step": 11820 }, { "epoch": 0.7968060375324282, "grad_norm": 5.97572802724156, "learning_rate": 1.2055422371082168e-07, "loss": 1.9672, "step": 11825 }, { "epoch": 0.7971429534045349, "grad_norm": 5.038356795339382, "learning_rate": 1.2017153874511865e-07, "loss": 1.9891, "step": 11830 }, { "epoch": 0.7974798692766416, "grad_norm": 5.296442667755238, 
"learning_rate": 1.1978937916044534e-07, "loss": 2.0277, "step": 11835 }, { "epoch": 0.7978167851487483, "grad_norm": 5.064257213595256, "learning_rate": 1.1940774548540733e-07, "loss": 1.9381, "step": 11840 }, { "epoch": 0.7981537010208551, "grad_norm": 5.060125728513771, "learning_rate": 1.1902663824788233e-07, "loss": 1.9866, "step": 11845 }, { "epoch": 0.7984906168929619, "grad_norm": 4.837309972513891, "learning_rate": 1.1864605797502031e-07, "loss": 1.998, "step": 11850 }, { "epoch": 0.7988275327650686, "grad_norm": 4.427539483463844, "learning_rate": 1.1826600519324237e-07, "loss": 2.0342, "step": 11855 }, { "epoch": 0.7991644486371753, "grad_norm": 5.29246317363125, "learning_rate": 1.1788648042823956e-07, "loss": 1.9739, "step": 11860 }, { "epoch": 0.799501364509282, "grad_norm": 5.181824829877327, "learning_rate": 1.1750748420497298e-07, "loss": 1.9583, "step": 11865 }, { "epoch": 0.7998382803813888, "grad_norm": 5.622807161478464, "learning_rate": 1.1712901704767253e-07, "loss": 1.986, "step": 11870 }, { "epoch": 0.8001751962534955, "grad_norm": 4.848072302224803, "learning_rate": 1.1675107947983615e-07, "loss": 2.0242, "step": 11875 }, { "epoch": 0.8005121121256022, "grad_norm": 4.965656667727491, "learning_rate": 1.1637367202422943e-07, "loss": 2.0576, "step": 11880 }, { "epoch": 0.800849027997709, "grad_norm": 4.80104534539921, "learning_rate": 1.159967952028848e-07, "loss": 2.0421, "step": 11885 }, { "epoch": 0.8011859438698157, "grad_norm": 4.889275777982356, "learning_rate": 1.1562044953710032e-07, "loss": 1.9319, "step": 11890 }, { "epoch": 0.8015228597419224, "grad_norm": 4.485153412827441, "learning_rate": 1.152446355474398e-07, "loss": 1.9957, "step": 11895 }, { "epoch": 0.8018597756140292, "grad_norm": 5.6476808749022664, "learning_rate": 1.1486935375373124e-07, "loss": 2.05, "step": 11900 }, { "epoch": 0.8021966914861359, "grad_norm": 5.89565303510698, "learning_rate": 1.1449460467506689e-07, "loss": 1.9756, "step": 11905 }, { "epoch": 
0.8025336073582426, "grad_norm": 4.817207235860719, "learning_rate": 1.1412038882980174e-07, "loss": 2.0098, "step": 11910 }, { "epoch": 0.8028705232303494, "grad_norm": 4.694276930197342, "learning_rate": 1.1374670673555348e-07, "loss": 1.9869, "step": 11915 }, { "epoch": 0.8032074391024561, "grad_norm": 4.685125947034545, "learning_rate": 1.1337355890920169e-07, "loss": 2.0316, "step": 11920 }, { "epoch": 0.8035443549745629, "grad_norm": 4.503595042316666, "learning_rate": 1.130009458668863e-07, "loss": 2.0059, "step": 11925 }, { "epoch": 0.8038812708466696, "grad_norm": 4.920819023220295, "learning_rate": 1.1262886812400813e-07, "loss": 1.9789, "step": 11930 }, { "epoch": 0.8042181867187763, "grad_norm": 5.019959830332962, "learning_rate": 1.1225732619522754e-07, "loss": 1.9555, "step": 11935 }, { "epoch": 0.804555102590883, "grad_norm": 5.4331924421289575, "learning_rate": 1.118863205944633e-07, "loss": 1.9614, "step": 11940 }, { "epoch": 0.8048920184629897, "grad_norm": 5.328237768936099, "learning_rate": 1.1151585183489266e-07, "loss": 2.0127, "step": 11945 }, { "epoch": 0.8052289343350966, "grad_norm": 5.08432688202362, "learning_rate": 1.1114592042895044e-07, "loss": 1.9913, "step": 11950 }, { "epoch": 0.8055658502072033, "grad_norm": 5.1203740672463445, "learning_rate": 1.1077652688832772e-07, "loss": 1.9836, "step": 11955 }, { "epoch": 0.80590276607931, "grad_norm": 4.959492936960112, "learning_rate": 1.1040767172397209e-07, "loss": 1.9549, "step": 11960 }, { "epoch": 0.8062396819514167, "grad_norm": 5.121718241296839, "learning_rate": 1.1003935544608612e-07, "loss": 1.9843, "step": 11965 }, { "epoch": 0.8065765978235234, "grad_norm": 5.0603407887680625, "learning_rate": 1.0967157856412739e-07, "loss": 1.9792, "step": 11970 }, { "epoch": 0.8069135136956302, "grad_norm": 4.893166720615464, "learning_rate": 1.093043415868069e-07, "loss": 2.0891, "step": 11975 }, { "epoch": 0.807250429567737, "grad_norm": 5.10873111408401, "learning_rate": 
1.0893764502208891e-07, "loss": 2.012, "step": 11980 }, { "epoch": 0.8075873454398437, "grad_norm": 5.054802072908632, "learning_rate": 1.0857148937719063e-07, "loss": 1.9625, "step": 11985 }, { "epoch": 0.8079242613119504, "grad_norm": 5.055586546556776, "learning_rate": 1.0820587515858054e-07, "loss": 2.0124, "step": 11990 }, { "epoch": 0.8082611771840571, "grad_norm": 4.929999702814125, "learning_rate": 1.078408028719785e-07, "loss": 1.9899, "step": 11995 }, { "epoch": 0.8085980930561639, "grad_norm": 4.831334494763691, "learning_rate": 1.0747627302235491e-07, "loss": 2.0401, "step": 12000 }, { "epoch": 0.8089350089282706, "grad_norm": 5.365390056525271, "learning_rate": 1.0711228611392936e-07, "loss": 2.0357, "step": 12005 }, { "epoch": 0.8092719248003774, "grad_norm": 5.373268668836555, "learning_rate": 1.0674884265017086e-07, "loss": 2.0054, "step": 12010 }, { "epoch": 0.8096088406724841, "grad_norm": 5.10702073154905, "learning_rate": 1.0638594313379678e-07, "loss": 1.909, "step": 12015 }, { "epoch": 0.8099457565445908, "grad_norm": 5.103122323166098, "learning_rate": 1.060235880667717e-07, "loss": 2.0477, "step": 12020 }, { "epoch": 0.8102826724166976, "grad_norm": 4.762773855958752, "learning_rate": 1.056617779503074e-07, "loss": 1.9937, "step": 12025 }, { "epoch": 0.8106195882888043, "grad_norm": 5.705261411670257, "learning_rate": 1.053005132848619e-07, "loss": 1.9858, "step": 12030 }, { "epoch": 0.810956504160911, "grad_norm": 4.846560263473332, "learning_rate": 1.0493979457013874e-07, "loss": 2.04, "step": 12035 }, { "epoch": 0.8112934200330177, "grad_norm": 4.543375670007831, "learning_rate": 1.0457962230508599e-07, "loss": 1.968, "step": 12040 }, { "epoch": 0.8116303359051245, "grad_norm": 5.1998484878539974, "learning_rate": 1.042199969878963e-07, "loss": 1.954, "step": 12045 }, { "epoch": 0.8119672517772313, "grad_norm": 5.036712380123403, "learning_rate": 1.0386091911600564e-07, "loss": 1.9474, "step": 12050 }, { "epoch": 0.812304167649338, 
"grad_norm": 5.246397896178424, "learning_rate": 1.0350238918609244e-07, "loss": 1.9452, "step": 12055 }, { "epoch": 0.8126410835214447, "grad_norm": 4.816820871775041, "learning_rate": 1.0314440769407784e-07, "loss": 1.9553, "step": 12060 }, { "epoch": 0.8129779993935514, "grad_norm": 5.40882302553503, "learning_rate": 1.0278697513512375e-07, "loss": 1.9153, "step": 12065 }, { "epoch": 0.8133149152656581, "grad_norm": 5.089926523521089, "learning_rate": 1.0243009200363295e-07, "loss": 1.9245, "step": 12070 }, { "epoch": 0.8136518311377648, "grad_norm": 5.652551852207039, "learning_rate": 1.0207375879324853e-07, "loss": 1.9769, "step": 12075 }, { "epoch": 0.8139887470098717, "grad_norm": 5.4741363451723934, "learning_rate": 1.0171797599685284e-07, "loss": 2.02, "step": 12080 }, { "epoch": 0.8143256628819784, "grad_norm": 4.697269582946994, "learning_rate": 1.0136274410656653e-07, "loss": 2.0126, "step": 12085 }, { "epoch": 0.8146625787540851, "grad_norm": 5.144850604492638, "learning_rate": 1.010080636137487e-07, "loss": 1.9763, "step": 12090 }, { "epoch": 0.8149994946261918, "grad_norm": 5.47290040939327, "learning_rate": 1.0065393500899549e-07, "loss": 1.9013, "step": 12095 }, { "epoch": 0.8153364104982985, "grad_norm": 5.017801893250318, "learning_rate": 1.0030035878213988e-07, "loss": 1.9857, "step": 12100 }, { "epoch": 0.8156733263704053, "grad_norm": 5.199202593043171, "learning_rate": 9.994733542225037e-08, "loss": 1.9572, "step": 12105 }, { "epoch": 0.8160102422425121, "grad_norm": 4.805713148139507, "learning_rate": 9.959486541763118e-08, "loss": 1.9658, "step": 12110 }, { "epoch": 0.8163471581146188, "grad_norm": 5.194976076921895, "learning_rate": 9.924294925582105e-08, "loss": 2.0159, "step": 12115 }, { "epoch": 0.8166840739867255, "grad_norm": 4.918251263917819, "learning_rate": 9.889158742359233e-08, "loss": 1.9905, "step": 12120 }, { "epoch": 0.8170209898588322, "grad_norm": 5.004474754217774, "learning_rate": 9.854078040695096e-08, "loss": 2.015, 
"step": 12125 }, { "epoch": 0.817357905730939, "grad_norm": 5.096970909232614, "learning_rate": 9.819052869113543e-08, "loss": 1.9904, "step": 12130 }, { "epoch": 0.8176948216030457, "grad_norm": 5.396845128688712, "learning_rate": 9.784083276061578e-08, "loss": 1.9747, "step": 12135 }, { "epoch": 0.8180317374751525, "grad_norm": 5.5202700497263395, "learning_rate": 9.749169309909382e-08, "loss": 1.9502, "step": 12140 }, { "epoch": 0.8183686533472592, "grad_norm": 4.830008990057666, "learning_rate": 9.714311018950139e-08, "loss": 1.983, "step": 12145 }, { "epoch": 0.8187055692193659, "grad_norm": 4.950498631823414, "learning_rate": 9.67950845140007e-08, "loss": 2.0249, "step": 12150 }, { "epoch": 0.8190424850914727, "grad_norm": 4.9147900462416, "learning_rate": 9.644761655398276e-08, "loss": 1.9334, "step": 12155 }, { "epoch": 0.8193794009635794, "grad_norm": 5.850245027314621, "learning_rate": 9.61007067900675e-08, "loss": 2.0497, "step": 12160 }, { "epoch": 0.8197163168356861, "grad_norm": 4.9492752398969095, "learning_rate": 9.575435570210266e-08, "loss": 2.0254, "step": 12165 }, { "epoch": 0.8200532327077928, "grad_norm": 4.8008702270915755, "learning_rate": 9.5408563769163e-08, "loss": 1.9849, "step": 12170 }, { "epoch": 0.8203901485798996, "grad_norm": 4.60144354537553, "learning_rate": 9.506333146955009e-08, "loss": 1.9035, "step": 12175 }, { "epoch": 0.8207270644520064, "grad_norm": 4.678276838316408, "learning_rate": 9.471865928079148e-08, "loss": 2.0106, "step": 12180 }, { "epoch": 0.8210639803241131, "grad_norm": 4.6676548276151415, "learning_rate": 9.437454767963954e-08, "loss": 1.9709, "step": 12185 }, { "epoch": 0.8214008961962198, "grad_norm": 5.069651107472825, "learning_rate": 9.403099714207174e-08, "loss": 2.0009, "step": 12190 }, { "epoch": 0.8217378120683265, "grad_norm": 4.9407969818739454, "learning_rate": 9.368800814328931e-08, "loss": 2.0096, "step": 12195 }, { "epoch": 0.8220747279404332, "grad_norm": 5.326057364364338, "learning_rate": 
9.334558115771646e-08, "loss": 2.0165, "step": 12200 }, { "epoch": 0.8224116438125401, "grad_norm": 4.819771123413496, "learning_rate": 9.300371665900048e-08, "loss": 1.9549, "step": 12205 }, { "epoch": 0.8227485596846468, "grad_norm": 5.7465075115776685, "learning_rate": 9.266241512001044e-08, "loss": 1.9201, "step": 12210 }, { "epoch": 0.8230854755567535, "grad_norm": 5.2908501668734855, "learning_rate": 9.23216770128365e-08, "loss": 2.0067, "step": 12215 }, { "epoch": 0.8234223914288602, "grad_norm": 5.105425884319466, "learning_rate": 9.19815028087898e-08, "loss": 1.9964, "step": 12220 }, { "epoch": 0.8237593073009669, "grad_norm": 4.612248982400504, "learning_rate": 9.164189297840147e-08, "loss": 2.025, "step": 12225 }, { "epoch": 0.8240962231730737, "grad_norm": 5.604887566497704, "learning_rate": 9.130284799142179e-08, "loss": 1.9814, "step": 12230 }, { "epoch": 0.8244331390451805, "grad_norm": 5.470672528122372, "learning_rate": 9.09643683168197e-08, "loss": 1.9885, "step": 12235 }, { "epoch": 0.8247700549172872, "grad_norm": 4.674103628601561, "learning_rate": 9.062645442278244e-08, "loss": 2.0264, "step": 12240 }, { "epoch": 0.8251069707893939, "grad_norm": 4.741397235201546, "learning_rate": 9.028910677671469e-08, "loss": 2.0512, "step": 12245 }, { "epoch": 0.8254438866615006, "grad_norm": 5.404860407330724, "learning_rate": 8.995232584523754e-08, "loss": 1.9449, "step": 12250 }, { "epoch": 0.8257808025336073, "grad_norm": 5.243789301139397, "learning_rate": 8.961611209418851e-08, "loss": 2.0044, "step": 12255 }, { "epoch": 0.8261177184057141, "grad_norm": 5.090385655723221, "learning_rate": 8.928046598862065e-08, "loss": 1.9923, "step": 12260 }, { "epoch": 0.8264546342778208, "grad_norm": 5.00058909113833, "learning_rate": 8.894538799280138e-08, "loss": 2.0531, "step": 12265 }, { "epoch": 0.8267915501499276, "grad_norm": 4.642520786511302, "learning_rate": 8.861087857021282e-08, "loss": 1.9713, "step": 12270 }, { "epoch": 0.8271284660220343, 
"grad_norm": 5.173972743078863, "learning_rate": 8.827693818355048e-08, "loss": 1.9015, "step": 12275 }, { "epoch": 0.827465381894141, "grad_norm": 5.266314538245673, "learning_rate": 8.794356729472252e-08, "loss": 1.9817, "step": 12280 }, { "epoch": 0.8278022977662478, "grad_norm": 5.010896268445951, "learning_rate": 8.76107663648497e-08, "loss": 1.9598, "step": 12285 }, { "epoch": 0.8281392136383545, "grad_norm": 4.865832450606591, "learning_rate": 8.727853585426436e-08, "loss": 2.0139, "step": 12290 }, { "epoch": 0.8284761295104612, "grad_norm": 4.9385923964663165, "learning_rate": 8.694687622250963e-08, "loss": 2.0086, "step": 12295 }, { "epoch": 0.828813045382568, "grad_norm": 5.0222938989934205, "learning_rate": 8.661578792833907e-08, "loss": 2.0107, "step": 12300 }, { "epoch": 0.8291499612546747, "grad_norm": 5.102704096479475, "learning_rate": 8.628527142971632e-08, "loss": 1.905, "step": 12305 }, { "epoch": 0.8294868771267815, "grad_norm": 4.827551523261991, "learning_rate": 8.595532718381338e-08, "loss": 1.9841, "step": 12310 }, { "epoch": 0.8298237929988882, "grad_norm": 4.99890015855876, "learning_rate": 8.562595564701153e-08, "loss": 2.0307, "step": 12315 }, { "epoch": 0.8301607088709949, "grad_norm": 5.148813906796249, "learning_rate": 8.529715727489912e-08, "loss": 2.0528, "step": 12320 }, { "epoch": 0.8304976247431016, "grad_norm": 4.938222712947157, "learning_rate": 8.496893252227238e-08, "loss": 1.9908, "step": 12325 }, { "epoch": 0.8308345406152083, "grad_norm": 4.715302719900964, "learning_rate": 8.464128184313346e-08, "loss": 1.9671, "step": 12330 }, { "epoch": 0.8311714564873152, "grad_norm": 5.3931996421007025, "learning_rate": 8.431420569069093e-08, "loss": 1.9222, "step": 12335 }, { "epoch": 0.8315083723594219, "grad_norm": 5.165730891993864, "learning_rate": 8.398770451735865e-08, "loss": 1.9759, "step": 12340 }, { "epoch": 0.8318452882315286, "grad_norm": 5.03104483270424, "learning_rate": 8.366177877475473e-08, "loss": 1.9748, "step": 
12345 }, { "epoch": 0.8321822041036353, "grad_norm": 4.750286676259499, "learning_rate": 8.333642891370174e-08, "loss": 2.0761, "step": 12350 }, { "epoch": 0.832519119975742, "grad_norm": 4.635586426403237, "learning_rate": 8.301165538422577e-08, "loss": 2.0083, "step": 12355 }, { "epoch": 0.8328560358478488, "grad_norm": 5.235687995579772, "learning_rate": 8.268745863555521e-08, "loss": 1.9226, "step": 12360 }, { "epoch": 0.8331929517199556, "grad_norm": 5.326210847195118, "learning_rate": 8.236383911612116e-08, "loss": 1.9638, "step": 12365 }, { "epoch": 0.8335298675920623, "grad_norm": 4.827392373221956, "learning_rate": 8.204079727355611e-08, "loss": 2.0034, "step": 12370 }, { "epoch": 0.833866783464169, "grad_norm": 5.571506781958315, "learning_rate": 8.171833355469354e-08, "loss": 2.0513, "step": 12375 }, { "epoch": 0.8342036993362757, "grad_norm": 5.5940688616685845, "learning_rate": 8.139644840556703e-08, "loss": 2.0138, "step": 12380 }, { "epoch": 0.8345406152083825, "grad_norm": 4.984821493170401, "learning_rate": 8.107514227141032e-08, "loss": 1.9671, "step": 12385 }, { "epoch": 0.8348775310804892, "grad_norm": 4.953501299880113, "learning_rate": 8.075441559665569e-08, "loss": 2.027, "step": 12390 }, { "epoch": 0.835214446952596, "grad_norm": 4.4741128930839205, "learning_rate": 8.04342688249346e-08, "loss": 2.0399, "step": 12395 }, { "epoch": 0.8355513628247027, "grad_norm": 4.884078831587302, "learning_rate": 8.011470239907558e-08, "loss": 1.9914, "step": 12400 }, { "epoch": 0.8358882786968094, "grad_norm": 5.252955777162602, "learning_rate": 7.979571676110525e-08, "loss": 1.9826, "step": 12405 }, { "epoch": 0.8362251945689162, "grad_norm": 5.493873277580335, "learning_rate": 7.947731235224614e-08, "loss": 2.0671, "step": 12410 }, { "epoch": 0.8365621104410229, "grad_norm": 4.98352794032991, "learning_rate": 7.915948961291729e-08, "loss": 1.996, "step": 12415 }, { "epoch": 0.8368990263131296, "grad_norm": 5.204145160763685, "learning_rate": 
7.884224898273322e-08, "loss": 1.9659, "step": 12420 }, { "epoch": 0.8372359421852363, "grad_norm": 5.2988498503798525, "learning_rate": 7.852559090050276e-08, "loss": 1.9792, "step": 12425 }, { "epoch": 0.837572858057343, "grad_norm": 5.158396379707824, "learning_rate": 7.820951580422952e-08, "loss": 1.9391, "step": 12430 }, { "epoch": 0.8379097739294498, "grad_norm": 4.965503093345433, "learning_rate": 7.789402413111041e-08, "loss": 1.9431, "step": 12435 }, { "epoch": 0.8382466898015566, "grad_norm": 5.005926749045461, "learning_rate": 7.757911631753556e-08, "loss": 1.9736, "step": 12440 }, { "epoch": 0.8385836056736633, "grad_norm": 4.987426616893807, "learning_rate": 7.72647927990871e-08, "loss": 2.0207, "step": 12445 }, { "epoch": 0.83892052154577, "grad_norm": 4.708582474149512, "learning_rate": 7.695105401053942e-08, "loss": 2.0211, "step": 12450 }, { "epoch": 0.8392574374178767, "grad_norm": 4.7356317079539885, "learning_rate": 7.663790038585794e-08, "loss": 2.0334, "step": 12455 }, { "epoch": 0.8395943532899834, "grad_norm": 4.822628521863879, "learning_rate": 7.63253323581985e-08, "loss": 1.9872, "step": 12460 }, { "epoch": 0.8399312691620903, "grad_norm": 5.010603893907513, "learning_rate": 7.601335035990714e-08, "loss": 2.0467, "step": 12465 }, { "epoch": 0.840268185034197, "grad_norm": 5.149617951460911, "learning_rate": 7.57019548225194e-08, "loss": 1.9613, "step": 12470 }, { "epoch": 0.8406051009063037, "grad_norm": 5.307549085655637, "learning_rate": 7.539114617675941e-08, "loss": 2.0263, "step": 12475 }, { "epoch": 0.8409420167784104, "grad_norm": 4.769849584602086, "learning_rate": 7.508092485253936e-08, "loss": 1.9782, "step": 12480 }, { "epoch": 0.8412789326505171, "grad_norm": 4.981498215561861, "learning_rate": 7.477129127895954e-08, "loss": 2.028, "step": 12485 }, { "epoch": 0.8416158485226239, "grad_norm": 4.987418462287155, "learning_rate": 7.446224588430678e-08, "loss": 2.0007, "step": 12490 }, { "epoch": 0.8419527643947307, "grad_norm": 
4.687073929724413, "learning_rate": 7.415378909605457e-08, "loss": 2.068, "step": 12495 }, { "epoch": 0.8422896802668374, "grad_norm": 4.777652470292546, "learning_rate": 7.384592134086231e-08, "loss": 1.9699, "step": 12500 }, { "epoch": 0.8426265961389441, "grad_norm": 4.876000211996493, "learning_rate": 7.353864304457463e-08, "loss": 2.0202, "step": 12505 }, { "epoch": 0.8429635120110508, "grad_norm": 5.4497221207358235, "learning_rate": 7.323195463222054e-08, "loss": 1.9918, "step": 12510 }, { "epoch": 0.8433004278831576, "grad_norm": 5.055766317876615, "learning_rate": 7.292585652801331e-08, "loss": 1.9289, "step": 12515 }, { "epoch": 0.8436373437552643, "grad_norm": 4.933677353552291, "learning_rate": 7.262034915534993e-08, "loss": 2.0094, "step": 12520 }, { "epoch": 0.843974259627371, "grad_norm": 5.115949344507936, "learning_rate": 7.231543293680969e-08, "loss": 1.9632, "step": 12525 }, { "epoch": 0.8443111754994778, "grad_norm": 5.667094111368155, "learning_rate": 7.20111082941548e-08, "loss": 2.0328, "step": 12530 }, { "epoch": 0.8446480913715845, "grad_norm": 5.091741359153518, "learning_rate": 7.170737564832902e-08, "loss": 2.0352, "step": 12535 }, { "epoch": 0.8449850072436913, "grad_norm": 5.098435584005042, "learning_rate": 7.14042354194569e-08, "loss": 2.0123, "step": 12540 }, { "epoch": 0.845321923115798, "grad_norm": 4.783087865302745, "learning_rate": 7.110168802684408e-08, "loss": 1.9112, "step": 12545 }, { "epoch": 0.8456588389879047, "grad_norm": 4.7542382667642435, "learning_rate": 7.079973388897592e-08, "loss": 2.0302, "step": 12550 }, { "epoch": 0.8459957548600114, "grad_norm": 4.865115654060519, "learning_rate": 7.049837342351706e-08, "loss": 1.9684, "step": 12555 }, { "epoch": 0.8463326707321182, "grad_norm": 5.554284140725131, "learning_rate": 7.019760704731131e-08, "loss": 2.0388, "step": 12560 }, { "epoch": 0.846669586604225, "grad_norm": 4.806974544867558, "learning_rate": 6.989743517638053e-08, "loss": 1.9814, "step": 12565 }, { 
"epoch": 0.8470065024763317, "grad_norm": 4.837970650798513, "learning_rate": 6.959785822592402e-08, "loss": 2.0394, "step": 12570 }, { "epoch": 0.8473434183484384, "grad_norm": 5.545718514781819, "learning_rate": 6.929887661031864e-08, "loss": 1.9444, "step": 12575 }, { "epoch": 0.8476803342205451, "grad_norm": 4.6918526838824715, "learning_rate": 6.900049074311753e-08, "loss": 1.9727, "step": 12580 }, { "epoch": 0.8480172500926518, "grad_norm": 5.364722674876099, "learning_rate": 6.870270103705e-08, "loss": 1.9669, "step": 12585 }, { "epoch": 0.8483541659647587, "grad_norm": 5.169951577886867, "learning_rate": 6.840550790402027e-08, "loss": 2.0153, "step": 12590 }, { "epoch": 0.8486910818368654, "grad_norm": 4.917780349751007, "learning_rate": 6.810891175510792e-08, "loss": 2.0141, "step": 12595 }, { "epoch": 0.8490279977089721, "grad_norm": 4.8868985020105455, "learning_rate": 6.781291300056647e-08, "loss": 2.008, "step": 12600 }, { "epoch": 0.8493649135810788, "grad_norm": 5.004407558437506, "learning_rate": 6.751751204982309e-08, "loss": 1.8823, "step": 12605 }, { "epoch": 0.8497018294531855, "grad_norm": 5.30970726792252, "learning_rate": 6.722270931147827e-08, "loss": 2.0124, "step": 12610 }, { "epoch": 0.8500387453252922, "grad_norm": 5.417458363847639, "learning_rate": 6.692850519330506e-08, "loss": 1.9599, "step": 12615 }, { "epoch": 0.850375661197399, "grad_norm": 5.104041537994855, "learning_rate": 6.66349001022481e-08, "loss": 1.9029, "step": 12620 }, { "epoch": 0.8507125770695058, "grad_norm": 4.680769610474689, "learning_rate": 6.634189444442389e-08, "loss": 2.05, "step": 12625 }, { "epoch": 0.8510494929416125, "grad_norm": 5.250781628534627, "learning_rate": 6.604948862511977e-08, "loss": 2.0554, "step": 12630 }, { "epoch": 0.8513864088137192, "grad_norm": 5.1083298814683715, "learning_rate": 6.575768304879292e-08, "loss": 2.0049, "step": 12635 }, { "epoch": 0.8517233246858259, "grad_norm": 4.992815199703779, "learning_rate": 6.546647811907091e-08, 
"loss": 1.9189, "step": 12640 }, { "epoch": 0.8520602405579327, "grad_norm": 4.87935320712007, "learning_rate": 6.517587423874988e-08, "loss": 2.0091, "step": 12645 }, { "epoch": 0.8523971564300394, "grad_norm": 5.001717523268426, "learning_rate": 6.48858718097951e-08, "loss": 1.9028, "step": 12650 }, { "epoch": 0.8527340723021462, "grad_norm": 4.738842336271283, "learning_rate": 6.459647123333956e-08, "loss": 1.8935, "step": 12655 }, { "epoch": 0.8530709881742529, "grad_norm": 5.113521595792352, "learning_rate": 6.430767290968387e-08, "loss": 1.965, "step": 12660 }, { "epoch": 0.8534079040463596, "grad_norm": 4.987875001246638, "learning_rate": 6.401947723829576e-08, "loss": 1.9748, "step": 12665 }, { "epoch": 0.8537448199184664, "grad_norm": 5.050132175721084, "learning_rate": 6.373188461780904e-08, "loss": 1.9852, "step": 12670 }, { "epoch": 0.8540817357905731, "grad_norm": 4.985714185959294, "learning_rate": 6.344489544602371e-08, "loss": 2.0311, "step": 12675 }, { "epoch": 0.8544186516626798, "grad_norm": 5.215525977331754, "learning_rate": 6.315851011990498e-08, "loss": 2.0641, "step": 12680 }, { "epoch": 0.8547555675347865, "grad_norm": 4.69142683300018, "learning_rate": 6.28727290355826e-08, "loss": 1.9516, "step": 12685 }, { "epoch": 0.8550924834068933, "grad_norm": 5.146821280414905, "learning_rate": 6.258755258835075e-08, "loss": 2.0027, "step": 12690 }, { "epoch": 0.8554293992790001, "grad_norm": 5.076253462163965, "learning_rate": 6.230298117266736e-08, "loss": 1.9524, "step": 12695 }, { "epoch": 0.8557663151511068, "grad_norm": 5.057943057245336, "learning_rate": 6.201901518215313e-08, "loss": 1.9727, "step": 12700 }, { "epoch": 0.8561032310232135, "grad_norm": 5.597216484983112, "learning_rate": 6.173565500959165e-08, "loss": 2.0343, "step": 12705 }, { "epoch": 0.8564401468953202, "grad_norm": 5.652444110208838, "learning_rate": 6.14529010469284e-08, "loss": 2.0406, "step": 12710 }, { "epoch": 0.8567770627674269, "grad_norm": 5.014724923006483, 
"learning_rate": 6.117075368527053e-08, "loss": 1.8909, "step": 12715 }, { "epoch": 0.8571139786395338, "grad_norm": 4.971840506101274, "learning_rate": 6.088921331488566e-08, "loss": 1.9603, "step": 12720 }, { "epoch": 0.8574508945116405, "grad_norm": 4.976636259905231, "learning_rate": 6.060828032520249e-08, "loss": 1.9859, "step": 12725 }, { "epoch": 0.8577878103837472, "grad_norm": 5.270936893461262, "learning_rate": 6.032795510480904e-08, "loss": 1.9502, "step": 12730 }, { "epoch": 0.8581247262558539, "grad_norm": 4.9546578612013485, "learning_rate": 6.004823804145276e-08, "loss": 2.0355, "step": 12735 }, { "epoch": 0.8584616421279606, "grad_norm": 4.99216167277347, "learning_rate": 5.976912952204016e-08, "loss": 1.9884, "step": 12740 }, { "epoch": 0.8587985580000674, "grad_norm": 4.9030150877431895, "learning_rate": 5.9490629932635815e-08, "loss": 2.0477, "step": 12745 }, { "epoch": 0.8591354738721741, "grad_norm": 5.293686996030991, "learning_rate": 5.921273965846191e-08, "loss": 1.9786, "step": 12750 }, { "epoch": 0.8594723897442809, "grad_norm": 5.287065228424329, "learning_rate": 5.893545908389807e-08, "loss": 1.902, "step": 12755 }, { "epoch": 0.8598093056163876, "grad_norm": 4.966757830014132, "learning_rate": 5.865878859248058e-08, "loss": 1.9183, "step": 12760 }, { "epoch": 0.8601462214884943, "grad_norm": 5.338138797557697, "learning_rate": 5.838272856690146e-08, "loss": 1.9395, "step": 12765 }, { "epoch": 0.860483137360601, "grad_norm": 4.836762244765119, "learning_rate": 5.810727938900878e-08, "loss": 2.0021, "step": 12770 }, { "epoch": 0.8608200532327078, "grad_norm": 4.836065210347224, "learning_rate": 5.7832441439805536e-08, "loss": 2.0369, "step": 12775 }, { "epoch": 0.8611569691048145, "grad_norm": 4.902540201239219, "learning_rate": 5.755821509944925e-08, "loss": 2.0055, "step": 12780 }, { "epoch": 0.8614938849769213, "grad_norm": 5.209240563545367, "learning_rate": 5.728460074725133e-08, "loss": 1.9835, "step": 12785 }, { "epoch": 
0.861830800849028, "grad_norm": 4.931332868970624, "learning_rate": 5.701159876167688e-08, "loss": 1.9659, "step": 12790 }, { "epoch": 0.8621677167211347, "grad_norm": 5.141619871084597, "learning_rate": 5.673920952034406e-08, "loss": 1.9841, "step": 12795 }, { "epoch": 0.8625046325932415, "grad_norm": 5.030431893837834, "learning_rate": 5.646743340002302e-08, "loss": 1.9732, "step": 12800 }, { "epoch": 0.8628415484653482, "grad_norm": 5.500993112364312, "learning_rate": 5.619627077663636e-08, "loss": 2.0476, "step": 12805 }, { "epoch": 0.8631784643374549, "grad_norm": 4.96349465822164, "learning_rate": 5.5925722025257746e-08, "loss": 2.0714, "step": 12810 }, { "epoch": 0.8635153802095616, "grad_norm": 4.936342522102944, "learning_rate": 5.5655787520111966e-08, "loss": 2.0363, "step": 12815 }, { "epoch": 0.8638522960816684, "grad_norm": 4.978366818260039, "learning_rate": 5.538646763457389e-08, "loss": 1.9599, "step": 12820 }, { "epoch": 0.8641892119537752, "grad_norm": 4.799148659591634, "learning_rate": 5.511776274116864e-08, "loss": 1.9339, "step": 12825 }, { "epoch": 0.8645261278258819, "grad_norm": 5.3451187556428374, "learning_rate": 5.484967321157019e-08, "loss": 2.0041, "step": 12830 }, { "epoch": 0.8648630436979886, "grad_norm": 5.2001141605921255, "learning_rate": 5.4582199416601746e-08, "loss": 1.9365, "step": 12835 }, { "epoch": 0.8651999595700953, "grad_norm": 5.315510317501807, "learning_rate": 5.43153417262347e-08, "loss": 2.0305, "step": 12840 }, { "epoch": 0.865536875442202, "grad_norm": 5.559195784916782, "learning_rate": 5.404910050958833e-08, "loss": 2.0134, "step": 12845 }, { "epoch": 0.8658737913143089, "grad_norm": 5.012322374028101, "learning_rate": 5.378347613492884e-08, "loss": 1.9817, "step": 12850 }, { "epoch": 0.8662107071864156, "grad_norm": 4.785945296261449, "learning_rate": 5.351846896966966e-08, "loss": 2.0142, "step": 12855 }, { "epoch": 0.8665476230585223, "grad_norm": 5.227439925110729, "learning_rate": 5.32540793803703e-08, 
"loss": 2.0164, "step": 12860 }, { "epoch": 0.866884538930629, "grad_norm": 4.9151507835114, "learning_rate": 5.299030773273594e-08, "loss": 1.9835, "step": 12865 }, { "epoch": 0.8672214548027357, "grad_norm": 5.117068451880196, "learning_rate": 5.272715439161718e-08, "loss": 1.9742, "step": 12870 }, { "epoch": 0.8675583706748425, "grad_norm": 5.287711928033072, "learning_rate": 5.246461972100941e-08, "loss": 1.9484, "step": 12875 }, { "epoch": 0.8678952865469493, "grad_norm": 4.71401275269487, "learning_rate": 5.220270408405197e-08, "loss": 1.9798, "step": 12880 }, { "epoch": 0.868232202419056, "grad_norm": 5.146973713712974, "learning_rate": 5.194140784302836e-08, "loss": 1.9667, "step": 12885 }, { "epoch": 0.8685691182911627, "grad_norm": 4.397819412979547, "learning_rate": 5.168073135936496e-08, "loss": 1.98, "step": 12890 }, { "epoch": 0.8689060341632694, "grad_norm": 5.239378521406953, "learning_rate": 5.1420674993631285e-08, "loss": 1.9399, "step": 12895 }, { "epoch": 0.8692429500353762, "grad_norm": 4.860102824858555, "learning_rate": 5.116123910553854e-08, "loss": 2.0023, "step": 12900 }, { "epoch": 0.8695798659074829, "grad_norm": 4.580226692738816, "learning_rate": 5.0902424053940406e-08, "loss": 2.0487, "step": 12905 }, { "epoch": 0.8699167817795896, "grad_norm": 5.120362005932783, "learning_rate": 5.064423019683106e-08, "loss": 1.9664, "step": 12910 }, { "epoch": 0.8702536976516964, "grad_norm": 5.088673094515921, "learning_rate": 5.0386657891346e-08, "loss": 1.9779, "step": 12915 }, { "epoch": 0.8705906135238031, "grad_norm": 4.41186242356666, "learning_rate": 5.012970749376083e-08, "loss": 1.9531, "step": 12920 }, { "epoch": 0.8709275293959099, "grad_norm": 5.3069611380762565, "learning_rate": 4.987337935949087e-08, "loss": 2.0118, "step": 12925 }, { "epoch": 0.8712644452680166, "grad_norm": 5.067327241779852, "learning_rate": 4.961767384309068e-08, "loss": 2.0249, "step": 12930 }, { "epoch": 0.8716013611401233, "grad_norm": 4.563811832907776, 
"learning_rate": 4.936259129825376e-08, "loss": 1.9902, "step": 12935 }, { "epoch": 0.87193827701223, "grad_norm": 5.037596268421402, "learning_rate": 4.9108132077811836e-08, "loss": 1.966, "step": 12940 }, { "epoch": 0.8722751928843367, "grad_norm": 5.161316273519496, "learning_rate": 4.885429653373435e-08, "loss": 1.9829, "step": 12945 }, { "epoch": 0.8726121087564435, "grad_norm": 5.3622599397677675, "learning_rate": 4.860108501712823e-08, "loss": 2.0091, "step": 12950 }, { "epoch": 0.8729490246285503, "grad_norm": 4.985939214316381, "learning_rate": 4.834849787823725e-08, "loss": 1.9613, "step": 12955 }, { "epoch": 0.873285940500657, "grad_norm": 5.453705256163994, "learning_rate": 4.809653546644132e-08, "loss": 2.0743, "step": 12960 }, { "epoch": 0.8736228563727637, "grad_norm": 5.231750896280028, "learning_rate": 4.7845198130256395e-08, "loss": 2.0528, "step": 12965 }, { "epoch": 0.8739597722448704, "grad_norm": 4.838465350359084, "learning_rate": 4.759448621733403e-08, "loss": 1.9642, "step": 12970 }, { "epoch": 0.8742966881169771, "grad_norm": 5.131868054625508, "learning_rate": 4.7344400074460276e-08, "loss": 1.9777, "step": 12975 }, { "epoch": 0.874633603989084, "grad_norm": 5.441813719817128, "learning_rate": 4.709494004755571e-08, "loss": 2.0084, "step": 12980 }, { "epoch": 0.8749705198611907, "grad_norm": 5.14987420464166, "learning_rate": 4.684610648167503e-08, "loss": 1.96, "step": 12985 }, { "epoch": 0.8753074357332974, "grad_norm": 5.351876772896136, "learning_rate": 4.659789972100647e-08, "loss": 2.005, "step": 12990 }, { "epoch": 0.8756443516054041, "grad_norm": 4.754680000616985, "learning_rate": 4.635032010887097e-08, "loss": 2.0317, "step": 12995 }, { "epoch": 0.8759812674775108, "grad_norm": 4.944114693819588, "learning_rate": 4.610336798772213e-08, "loss": 1.9878, "step": 13000 }, { "epoch": 0.8763181833496176, "grad_norm": 5.030461958766008, "learning_rate": 4.5857043699145834e-08, "loss": 2.017, "step": 13005 }, { "epoch": 
0.8766550992217244, "grad_norm": 5.532484485562743, "learning_rate": 4.5611347583859095e-08, "loss": 1.9825, "step": 13010 }, { "epoch": 0.8769920150938311, "grad_norm": 5.178366379752065, "learning_rate": 4.536627998171033e-08, "loss": 2.0212, "step": 13015 }, { "epoch": 0.8773289309659378, "grad_norm": 5.459478219206448, "learning_rate": 4.512184123167867e-08, "loss": 1.9552, "step": 13020 }, { "epoch": 0.8776658468380445, "grad_norm": 4.977598024086169, "learning_rate": 4.487803167187304e-08, "loss": 1.9913, "step": 13025 }, { "epoch": 0.8780027627101513, "grad_norm": 4.772910731707593, "learning_rate": 4.463485163953246e-08, "loss": 2.0443, "step": 13030 }, { "epoch": 0.878339678582258, "grad_norm": 4.861403273323282, "learning_rate": 4.4392301471025074e-08, "loss": 2.045, "step": 13035 }, { "epoch": 0.8786765944543647, "grad_norm": 4.941580703668498, "learning_rate": 4.415038150184758e-08, "loss": 1.9854, "step": 13040 }, { "epoch": 0.8790135103264715, "grad_norm": 5.018028543743037, "learning_rate": 4.3909092066625245e-08, "loss": 1.9497, "step": 13045 }, { "epoch": 0.8793504261985782, "grad_norm": 4.915357980016995, "learning_rate": 4.366843349911109e-08, "loss": 1.9426, "step": 13050 }, { "epoch": 0.879687342070685, "grad_norm": 5.093973892096178, "learning_rate": 4.342840613218546e-08, "loss": 1.9469, "step": 13055 }, { "epoch": 0.8800242579427917, "grad_norm": 5.452446888509203, "learning_rate": 4.318901029785571e-08, "loss": 2.0146, "step": 13060 }, { "epoch": 0.8803611738148984, "grad_norm": 5.00460196851751, "learning_rate": 4.2950246327255523e-08, "loss": 1.9401, "step": 13065 }, { "epoch": 0.8806980896870051, "grad_norm": 4.594934165868104, "learning_rate": 4.271211455064483e-08, "loss": 2.0173, "step": 13070 }, { "epoch": 0.8810350055591119, "grad_norm": 5.253062490349043, "learning_rate": 4.2474615297408754e-08, "loss": 1.9823, "step": 13075 }, { "epoch": 0.8813719214312187, "grad_norm": 4.78411605909617, "learning_rate": 4.223774889605775e-08, 
"loss": 1.998, "step": 13080 }, { "epoch": 0.8817088373033254, "grad_norm": 5.106182187941722, "learning_rate": 4.200151567422699e-08, "loss": 2.033, "step": 13085 }, { "epoch": 0.8820457531754321, "grad_norm": 5.19095161818691, "learning_rate": 4.176591595867557e-08, "loss": 2.019, "step": 13090 }, { "epoch": 0.8823826690475388, "grad_norm": 4.943333609979044, "learning_rate": 4.153095007528645e-08, "loss": 1.994, "step": 13095 }, { "epoch": 0.8827195849196455, "grad_norm": 5.049096854633495, "learning_rate": 4.1296618349066e-08, "loss": 1.9179, "step": 13100 }, { "epoch": 0.8830565007917524, "grad_norm": 4.8158632066535345, "learning_rate": 4.106292110414311e-08, "loss": 2.0289, "step": 13105 }, { "epoch": 0.8833934166638591, "grad_norm": 4.805185170253203, "learning_rate": 4.082985866376926e-08, "loss": 2.0249, "step": 13110 }, { "epoch": 0.8837303325359658, "grad_norm": 5.650668000486611, "learning_rate": 4.05974313503179e-08, "loss": 2.0487, "step": 13115 }, { "epoch": 0.8840672484080725, "grad_norm": 4.959428784199171, "learning_rate": 4.036563948528393e-08, "loss": 1.9924, "step": 13120 }, { "epoch": 0.8844041642801792, "grad_norm": 4.904549689642752, "learning_rate": 4.01344833892831e-08, "loss": 2.0438, "step": 13125 }, { "epoch": 0.8847410801522859, "grad_norm": 5.598958822284923, "learning_rate": 3.990396338205204e-08, "loss": 1.9932, "step": 13130 }, { "epoch": 0.8850779960243927, "grad_norm": 5.263921160877042, "learning_rate": 3.967407978244747e-08, "loss": 1.905, "step": 13135 }, { "epoch": 0.8854149118964995, "grad_norm": 5.366747077381886, "learning_rate": 3.944483290844575e-08, "loss": 1.9839, "step": 13140 }, { "epoch": 0.8857518277686062, "grad_norm": 5.034412692088401, "learning_rate": 3.9216223077142394e-08, "loss": 2.0534, "step": 13145 }, { "epoch": 0.8860887436407129, "grad_norm": 5.183187813635658, "learning_rate": 3.8988250604752135e-08, "loss": 1.9672, "step": 13150 }, { "epoch": 0.8864256595128196, "grad_norm": 5.629090379081904, 
"learning_rate": 3.876091580660762e-08, "loss": 2.0796, "step": 13155 }, { "epoch": 0.8867625753849264, "grad_norm": 5.428682684541866, "learning_rate": 3.853421899715992e-08, "loss": 1.9728, "step": 13160 }, { "epoch": 0.8870994912570331, "grad_norm": 5.000298972069119, "learning_rate": 3.8308160489977424e-08, "loss": 2.029, "step": 13165 }, { "epoch": 0.8874364071291398, "grad_norm": 5.375029391671491, "learning_rate": 3.808274059774552e-08, "loss": 2.0326, "step": 13170 }, { "epoch": 0.8877733230012466, "grad_norm": 5.115850495037083, "learning_rate": 3.785795963226646e-08, "loss": 1.9583, "step": 13175 }, { "epoch": 0.8881102388733533, "grad_norm": 4.9539489478615115, "learning_rate": 3.7633817904458574e-08, "loss": 2.0301, "step": 13180 }, { "epoch": 0.8884471547454601, "grad_norm": 5.4389905855171214, "learning_rate": 3.741031572435615e-08, "loss": 1.9824, "step": 13185 }, { "epoch": 0.8887840706175668, "grad_norm": 4.9589677891817905, "learning_rate": 3.718745340110868e-08, "loss": 1.9737, "step": 13190 }, { "epoch": 0.8891209864896735, "grad_norm": 5.159320817704993, "learning_rate": 3.6965231242980624e-08, "loss": 1.9857, "step": 13195 }, { "epoch": 0.8894579023617802, "grad_norm": 5.157183807730487, "learning_rate": 3.6743649557351265e-08, "loss": 2.0264, "step": 13200 }, { "epoch": 0.889794818233887, "grad_norm": 4.729910547649761, "learning_rate": 3.652270865071344e-08, "loss": 1.9969, "step": 13205 }, { "epoch": 0.8901317341059938, "grad_norm": 4.369061017453812, "learning_rate": 3.630240882867408e-08, "loss": 1.9181, "step": 13210 }, { "epoch": 0.8904686499781005, "grad_norm": 4.954217856655743, "learning_rate": 3.608275039595332e-08, "loss": 2.0098, "step": 13215 }, { "epoch": 0.8908055658502072, "grad_norm": 5.450063387527723, "learning_rate": 3.5863733656383844e-08, "loss": 1.9575, "step": 13220 }, { "epoch": 0.8911424817223139, "grad_norm": 5.0745487190322764, "learning_rate": 3.564535891291115e-08, "loss": 1.9509, "step": 13225 }, { "epoch": 
0.8914793975944206, "grad_norm": 4.9973720633700935, "learning_rate": 3.542762646759234e-08, "loss": 2.0091, "step": 13230 }, { "epoch": 0.8918163134665275, "grad_norm": 5.46061410533624, "learning_rate": 3.521053662159629e-08, "loss": 1.9925, "step": 13235 }, { "epoch": 0.8921532293386342, "grad_norm": 4.754467377708288, "learning_rate": 3.499408967520295e-08, "loss": 1.9774, "step": 13240 }, { "epoch": 0.8924901452107409, "grad_norm": 5.086712484106012, "learning_rate": 3.477828592780319e-08, "loss": 1.9917, "step": 13245 }, { "epoch": 0.8928270610828476, "grad_norm": 4.835906720937518, "learning_rate": 3.456312567789793e-08, "loss": 2.0375, "step": 13250 }, { "epoch": 0.8931639769549543, "grad_norm": 4.630060105115506, "learning_rate": 3.4348609223098125e-08, "loss": 1.9975, "step": 13255 }, { "epoch": 0.8935008928270611, "grad_norm": 4.986446791743931, "learning_rate": 3.41347368601243e-08, "loss": 1.9365, "step": 13260 }, { "epoch": 0.8938378086991678, "grad_norm": 5.29092481072544, "learning_rate": 3.39215088848061e-08, "loss": 2.0424, "step": 13265 }, { "epoch": 0.8941747245712746, "grad_norm": 5.14532320294095, "learning_rate": 3.370892559208155e-08, "loss": 1.9713, "step": 13270 }, { "epoch": 0.8945116404433813, "grad_norm": 4.774065124137936, "learning_rate": 3.34969872759972e-08, "loss": 2.0181, "step": 13275 }, { "epoch": 0.894848556315488, "grad_norm": 4.968343477398973, "learning_rate": 3.328569422970762e-08, "loss": 2.0686, "step": 13280 }, { "epoch": 0.8951854721875948, "grad_norm": 4.812513909962813, "learning_rate": 3.307504674547429e-08, "loss": 1.9869, "step": 13285 }, { "epoch": 0.8955223880597015, "grad_norm": 5.417414509849656, "learning_rate": 3.286504511466631e-08, "loss": 2.0332, "step": 13290 }, { "epoch": 0.8958593039318082, "grad_norm": 5.430063971840838, "learning_rate": 3.265568962775927e-08, "loss": 2.0088, "step": 13295 }, { "epoch": 0.896196219803915, "grad_norm": 5.307718377179188, "learning_rate": 3.2446980574334706e-08, "loss": 
1.9821, "step": 13300 }, { "epoch": 0.8965331356760217, "grad_norm": 5.64389816820009, "learning_rate": 3.2238918243080505e-08, "loss": 1.9622, "step": 13305 }, { "epoch": 0.8968700515481284, "grad_norm": 5.897621301248351, "learning_rate": 3.203150292178952e-08, "loss": 2.0185, "step": 13310 }, { "epoch": 0.8972069674202352, "grad_norm": 5.240679658646703, "learning_rate": 3.182473489736004e-08, "loss": 1.9657, "step": 13315 }, { "epoch": 0.8975438832923419, "grad_norm": 5.877681391796549, "learning_rate": 3.161861445579478e-08, "loss": 2.0287, "step": 13320 }, { "epoch": 0.8978807991644486, "grad_norm": 5.142862561268491, "learning_rate": 3.1413141882200736e-08, "loss": 1.9774, "step": 13325 }, { "epoch": 0.8982177150365553, "grad_norm": 5.1613038183849875, "learning_rate": 3.120831746078895e-08, "loss": 2.0206, "step": 13330 }, { "epoch": 0.898554630908662, "grad_norm": 5.2142672858976375, "learning_rate": 3.100414147487368e-08, "loss": 2.0623, "step": 13335 }, { "epoch": 0.8988915467807689, "grad_norm": 4.9293999810816596, "learning_rate": 3.0800614206872413e-08, "loss": 1.9576, "step": 13340 }, { "epoch": 0.8992284626528756, "grad_norm": 5.336814919251433, "learning_rate": 3.059773593830539e-08, "loss": 1.9918, "step": 13345 }, { "epoch": 0.8995653785249823, "grad_norm": 4.854851496278759, "learning_rate": 3.039550694979492e-08, "loss": 1.9485, "step": 13350 }, { "epoch": 0.899902294397089, "grad_norm": 4.973602527233128, "learning_rate": 3.019392752106548e-08, "loss": 1.9824, "step": 13355 }, { "epoch": 0.9002392102691957, "grad_norm": 4.8494587271493055, "learning_rate": 2.9992997930942954e-08, "loss": 1.9153, "step": 13360 }, { "epoch": 0.9005761261413026, "grad_norm": 5.2364763845253, "learning_rate": 2.979271845735426e-08, "loss": 1.8908, "step": 13365 }, { "epoch": 0.9009130420134093, "grad_norm": 5.059909911918394, "learning_rate": 2.9593089377327242e-08, "loss": 1.9554, "step": 13370 }, { "epoch": 0.901249957885516, "grad_norm": 4.969485399817279, 
"learning_rate": 2.9394110966990184e-08, "loss": 2.0118, "step": 13375 }, { "epoch": 0.9015868737576227, "grad_norm": 5.009674565555206, "learning_rate": 2.9195783501570982e-08, "loss": 2.0228, "step": 13380 }, { "epoch": 0.9019237896297294, "grad_norm": 5.247597397365498, "learning_rate": 2.8998107255397643e-08, "loss": 2.0015, "step": 13385 }, { "epoch": 0.9022607055018362, "grad_norm": 4.957182455971941, "learning_rate": 2.880108250189689e-08, "loss": 1.9381, "step": 13390 }, { "epoch": 0.902597621373943, "grad_norm": 5.174797795621716, "learning_rate": 2.860470951359478e-08, "loss": 1.9485, "step": 13395 }, { "epoch": 0.9029345372460497, "grad_norm": 5.326231145672508, "learning_rate": 2.8408988562115488e-08, "loss": 1.9972, "step": 13400 }, { "epoch": 0.9032714531181564, "grad_norm": 4.921134379704207, "learning_rate": 2.8213919918181393e-08, "loss": 2.0342, "step": 13405 }, { "epoch": 0.9036083689902631, "grad_norm": 4.878471082934239, "learning_rate": 2.8019503851612837e-08, "loss": 2.004, "step": 13410 }, { "epoch": 0.9039452848623699, "grad_norm": 5.319108114954009, "learning_rate": 2.782574063132703e-08, "loss": 2.0168, "step": 13415 }, { "epoch": 0.9042822007344766, "grad_norm": 4.922835139694012, "learning_rate": 2.7632630525338597e-08, "loss": 1.9993, "step": 13420 }, { "epoch": 0.9046191166065833, "grad_norm": 5.352989420062258, "learning_rate": 2.7440173800758583e-08, "loss": 2.0286, "step": 13425 }, { "epoch": 0.90495603247869, "grad_norm": 5.1863182658626075, "learning_rate": 2.7248370723794268e-08, "loss": 1.997, "step": 13430 }, { "epoch": 0.9052929483507968, "grad_norm": 5.06143579230655, "learning_rate": 2.7057221559748822e-08, "loss": 2.007, "step": 13435 }, { "epoch": 0.9056298642229036, "grad_norm": 4.905478976999698, "learning_rate": 2.6866726573021025e-08, "loss": 1.9599, "step": 13440 }, { "epoch": 0.9059667800950103, "grad_norm": 5.389604303967091, "learning_rate": 2.667688602710455e-08, "loss": 1.9173, "step": 13445 }, { "epoch": 
0.906303695967117, "grad_norm": 4.923402358625174, "learning_rate": 2.648770018458807e-08, "loss": 1.999, "step": 13450 }, { "epoch": 0.9066406118392237, "grad_norm": 4.900080746695482, "learning_rate": 2.6299169307154535e-08, "loss": 1.9402, "step": 13455 }, { "epoch": 0.9069775277113304, "grad_norm": 5.187659544169209, "learning_rate": 2.611129365558118e-08, "loss": 1.9543, "step": 13460 }, { "epoch": 0.9073144435834373, "grad_norm": 5.252612785609459, "learning_rate": 2.592407348973852e-08, "loss": 1.9874, "step": 13465 }, { "epoch": 0.907651359455544, "grad_norm": 4.610227648861338, "learning_rate": 2.573750906859079e-08, "loss": 1.9797, "step": 13470 }, { "epoch": 0.9079882753276507, "grad_norm": 5.1127752320978, "learning_rate": 2.5551600650194906e-08, "loss": 1.9528, "step": 13475 }, { "epoch": 0.9083251911997574, "grad_norm": 4.991769444526486, "learning_rate": 2.536634849170055e-08, "loss": 1.9471, "step": 13480 }, { "epoch": 0.9086621070718641, "grad_norm": 5.237153660513233, "learning_rate": 2.5181752849349593e-08, "loss": 2.0568, "step": 13485 }, { "epoch": 0.9089990229439708, "grad_norm": 5.277294217920485, "learning_rate": 2.4997813978476e-08, "loss": 1.9853, "step": 13490 }, { "epoch": 0.9093359388160777, "grad_norm": 4.921143447648441, "learning_rate": 2.481453213350493e-08, "loss": 1.9325, "step": 13495 }, { "epoch": 0.9096728546881844, "grad_norm": 4.791305124561415, "learning_rate": 2.463190756795308e-08, "loss": 2.0128, "step": 13500 }, { "epoch": 0.9100097705602911, "grad_norm": 4.449119095158488, "learning_rate": 2.4449940534427836e-08, "loss": 2.0052, "step": 13505 }, { "epoch": 0.9103466864323978, "grad_norm": 5.286779183036418, "learning_rate": 2.4268631284627027e-08, "loss": 1.9835, "step": 13510 }, { "epoch": 0.9106836023045045, "grad_norm": 4.886210400142056, "learning_rate": 2.408798006933882e-08, "loss": 1.9316, "step": 13515 }, { "epoch": 0.9110205181766113, "grad_norm": 5.359540479310291, "learning_rate": 2.3907987138440945e-08, 
"loss": 2.0543, "step": 13520 }, { "epoch": 0.911357434048718, "grad_norm": 5.363379518427708, "learning_rate": 2.3728652740900856e-08, "loss": 2.0245, "step": 13525 }, { "epoch": 0.9116943499208248, "grad_norm": 5.255217328901613, "learning_rate": 2.3549977124774857e-08, "loss": 2.0096, "step": 13530 }, { "epoch": 0.9120312657929315, "grad_norm": 5.297710067774792, "learning_rate": 2.337196053720819e-08, "loss": 2.0493, "step": 13535 }, { "epoch": 0.9123681816650382, "grad_norm": 4.863908522333879, "learning_rate": 2.319460322443456e-08, "loss": 2.0354, "step": 13540 }, { "epoch": 0.912705097537145, "grad_norm": 5.070821242673423, "learning_rate": 2.301790543177551e-08, "loss": 2.0392, "step": 13545 }, { "epoch": 0.9130420134092517, "grad_norm": 4.898621963874645, "learning_rate": 2.284186740364069e-08, "loss": 2.0082, "step": 13550 }, { "epoch": 0.9133789292813584, "grad_norm": 4.773191396795784, "learning_rate": 2.266648938352672e-08, "loss": 1.9718, "step": 13555 }, { "epoch": 0.9137158451534652, "grad_norm": 5.0589872100789535, "learning_rate": 2.249177161401783e-08, "loss": 1.9501, "step": 13560 }, { "epoch": 0.9140527610255719, "grad_norm": 5.032548860261103, "learning_rate": 2.2317714336784422e-08, "loss": 1.9827, "step": 13565 }, { "epoch": 0.9143896768976787, "grad_norm": 5.061555689453668, "learning_rate": 2.21443177925838e-08, "loss": 2.007, "step": 13570 }, { "epoch": 0.9147265927697854, "grad_norm": 5.420036144204343, "learning_rate": 2.1971582221258944e-08, "loss": 1.8679, "step": 13575 }, { "epoch": 0.9150635086418921, "grad_norm": 4.785620546183103, "learning_rate": 2.1799507861738788e-08, "loss": 1.9904, "step": 13580 }, { "epoch": 0.9154004245139988, "grad_norm": 5.060058091620267, "learning_rate": 2.1628094952037713e-08, "loss": 1.982, "step": 13585 }, { "epoch": 0.9157373403861055, "grad_norm": 4.79685651443187, "learning_rate": 2.1457343729255062e-08, "loss": 1.974, "step": 13590 }, { "epoch": 0.9160742562582124, "grad_norm": 
5.1071363106406045, "learning_rate": 2.128725442957491e-08, "loss": 1.9974, "step": 13595 }, { "epoch": 0.9164111721303191, "grad_norm": 4.852253297740254, "learning_rate": 2.111782728826583e-08, "loss": 1.9832, "step": 13600 }, { "epoch": 0.9167480880024258, "grad_norm": 4.915558813979143, "learning_rate": 2.0949062539680486e-08, "loss": 2.0508, "step": 13605 }, { "epoch": 0.9170850038745325, "grad_norm": 5.014362491104972, "learning_rate": 2.07809604172553e-08, "loss": 1.9647, "step": 13610 }, { "epoch": 0.9174219197466392, "grad_norm": 5.193551970407256, "learning_rate": 2.0613521153510115e-08, "loss": 1.9618, "step": 13615 }, { "epoch": 0.917758835618746, "grad_norm": 5.251744485786298, "learning_rate": 2.0446744980048002e-08, "loss": 1.9735, "step": 13620 }, { "epoch": 0.9180957514908528, "grad_norm": 5.156124436423355, "learning_rate": 2.0280632127554708e-08, "loss": 2.0494, "step": 13625 }, { "epoch": 0.9184326673629595, "grad_norm": 4.796392468323955, "learning_rate": 2.011518282579855e-08, "loss": 2.0125, "step": 13630 }, { "epoch": 0.9187695832350662, "grad_norm": 4.9215818139212635, "learning_rate": 1.9950397303630075e-08, "loss": 2.0914, "step": 13635 }, { "epoch": 0.9191064991071729, "grad_norm": 4.918836424987228, "learning_rate": 1.9786275788981565e-08, "loss": 1.9662, "step": 13640 }, { "epoch": 0.9194434149792797, "grad_norm": 4.919906955295612, "learning_rate": 1.9622818508866823e-08, "loss": 1.991, "step": 13645 }, { "epoch": 0.9197803308513864, "grad_norm": 5.269537869495638, "learning_rate": 1.9460025689381043e-08, "loss": 2.0496, "step": 13650 }, { "epoch": 0.9201172467234932, "grad_norm": 4.393152217892142, "learning_rate": 1.9297897555700216e-08, "loss": 1.9941, "step": 13655 }, { "epoch": 0.9204541625955999, "grad_norm": 4.746119431564327, "learning_rate": 1.9136434332080898e-08, "loss": 1.9745, "step": 13660 }, { "epoch": 0.9207910784677066, "grad_norm": 5.135049334397468, "learning_rate": 1.8975636241860048e-08, "loss": 2.0015, "step": 
13665 }, { "epoch": 0.9211279943398133, "grad_norm": 5.037243258994006, "learning_rate": 1.8815503507454644e-08, "loss": 1.9346, "step": 13670 }, { "epoch": 0.9214649102119201, "grad_norm": 4.911123422152679, "learning_rate": 1.8656036350361117e-08, "loss": 1.963, "step": 13675 }, { "epoch": 0.9218018260840268, "grad_norm": 5.068855482084717, "learning_rate": 1.8497234991155463e-08, "loss": 1.9898, "step": 13680 }, { "epoch": 0.9221387419561335, "grad_norm": 4.83460683297273, "learning_rate": 1.8339099649492762e-08, "loss": 1.9555, "step": 13685 }, { "epoch": 0.9224756578282403, "grad_norm": 5.087493222607199, "learning_rate": 1.8181630544106653e-08, "loss": 1.9704, "step": 13690 }, { "epoch": 0.922812573700347, "grad_norm": 5.712101803747636, "learning_rate": 1.8024827892809346e-08, "loss": 2.0264, "step": 13695 }, { "epoch": 0.9231494895724538, "grad_norm": 4.9402082594676955, "learning_rate": 1.7868691912491352e-08, "loss": 1.9803, "step": 13700 }, { "epoch": 0.9234864054445605, "grad_norm": 5.612726375629542, "learning_rate": 1.77132228191208e-08, "loss": 2.0675, "step": 13705 }, { "epoch": 0.9238233213166672, "grad_norm": 4.824852853312786, "learning_rate": 1.7558420827743505e-08, "loss": 1.9278, "step": 13710 }, { "epoch": 0.9241602371887739, "grad_norm": 4.901305239944907, "learning_rate": 1.7404286152482573e-08, "loss": 2.0461, "step": 13715 }, { "epoch": 0.9244971530608806, "grad_norm": 4.9722787544463065, "learning_rate": 1.725081900653791e-08, "loss": 2.0074, "step": 13720 }, { "epoch": 0.9248340689329875, "grad_norm": 5.062302763415683, "learning_rate": 1.7098019602186376e-08, "loss": 2.01, "step": 13725 }, { "epoch": 0.9251709848050942, "grad_norm": 4.978140021102383, "learning_rate": 1.6945888150780797e-08, "loss": 2.0147, "step": 13730 }, { "epoch": 0.9255079006772009, "grad_norm": 5.208836602850408, "learning_rate": 1.6794424862750568e-08, "loss": 2.0029, "step": 13735 }, { "epoch": 0.9258448165493076, "grad_norm": 5.513935552178835, 
"learning_rate": 1.664362994760038e-08, "loss": 2.0079, "step": 13740 }, { "epoch": 0.9261817324214143, "grad_norm": 5.305016210843922, "learning_rate": 1.649350361391083e-08, "loss": 1.988, "step": 13745 }, { "epoch": 0.9265186482935212, "grad_norm": 5.785066446546314, "learning_rate": 1.6344046069337646e-08, "loss": 2.0117, "step": 13750 }, { "epoch": 0.9268555641656279, "grad_norm": 4.598300304415643, "learning_rate": 1.6195257520611182e-08, "loss": 2.0218, "step": 13755 }, { "epoch": 0.9271924800377346, "grad_norm": 4.818291147514866, "learning_rate": 1.604713817353681e-08, "loss": 1.9832, "step": 13760 }, { "epoch": 0.9275293959098413, "grad_norm": 5.313923337984414, "learning_rate": 1.5899688232994147e-08, "loss": 1.9932, "step": 13765 }, { "epoch": 0.927866311781948, "grad_norm": 5.135307241259097, "learning_rate": 1.5752907902936707e-08, "loss": 2.0512, "step": 13770 }, { "epoch": 0.9282032276540548, "grad_norm": 5.086066893642102, "learning_rate": 1.560679738639198e-08, "loss": 2.0015, "step": 13775 }, { "epoch": 0.9285401435261615, "grad_norm": 4.970673647608001, "learning_rate": 1.5461356885461075e-08, "loss": 2.0164, "step": 13780 }, { "epoch": 0.9288770593982683, "grad_norm": 5.025657866538505, "learning_rate": 1.5316586601317905e-08, "loss": 1.9751, "step": 13785 }, { "epoch": 0.929213975270375, "grad_norm": 4.901703006667592, "learning_rate": 1.5172486734209788e-08, "loss": 1.9897, "step": 13790 }, { "epoch": 0.9295508911424817, "grad_norm": 5.024951820926872, "learning_rate": 1.502905748345651e-08, "loss": 2.0434, "step": 13795 }, { "epoch": 0.9298878070145885, "grad_norm": 5.2024378472172295, "learning_rate": 1.4886299047450257e-08, "loss": 2.0358, "step": 13800 }, { "epoch": 0.9302247228866952, "grad_norm": 4.748796342414486, "learning_rate": 1.4744211623655356e-08, "loss": 1.9513, "step": 13805 }, { "epoch": 0.9305616387588019, "grad_norm": 5.152752301154622, "learning_rate": 1.4602795408607982e-08, "loss": 2.0563, "step": 13810 }, { "epoch": 
0.9308985546309086, "grad_norm": 4.646089812597653, "learning_rate": 1.4462050597915942e-08, "loss": 1.9967, "step": 13815 }, { "epoch": 0.9312354705030154, "grad_norm": 4.77038288216349, "learning_rate": 1.4321977386258289e-08, "loss": 1.9697, "step": 13820 }, { "epoch": 0.9315723863751222, "grad_norm": 4.8808799200826885, "learning_rate": 1.4182575967385092e-08, "loss": 2.0047, "step": 13825 }, { "epoch": 0.9319093022472289, "grad_norm": 5.133106609548521, "learning_rate": 1.4043846534117331e-08, "loss": 2.0284, "step": 13830 }, { "epoch": 0.9322462181193356, "grad_norm": 4.800864986975381, "learning_rate": 1.3905789278346347e-08, "loss": 2.0286, "step": 13835 }, { "epoch": 0.9325831339914423, "grad_norm": 5.406461479140433, "learning_rate": 1.3768404391033717e-08, "loss": 1.9556, "step": 13840 }, { "epoch": 0.932920049863549, "grad_norm": 5.074042739674497, "learning_rate": 1.3631692062211209e-08, "loss": 2.0008, "step": 13845 }, { "epoch": 0.9332569657356558, "grad_norm": 4.65655047457409, "learning_rate": 1.3495652480979947e-08, "loss": 1.9376, "step": 13850 }, { "epoch": 0.9335938816077626, "grad_norm": 5.4006014820857855, "learning_rate": 1.3360285835510854e-08, "loss": 1.9943, "step": 13855 }, { "epoch": 0.9339307974798693, "grad_norm": 4.692965416814292, "learning_rate": 1.322559231304382e-08, "loss": 1.9855, "step": 13860 }, { "epoch": 0.934267713351976, "grad_norm": 4.7112849875932055, "learning_rate": 1.3091572099887816e-08, "loss": 1.9507, "step": 13865 }, { "epoch": 0.9346046292240827, "grad_norm": 5.50735019927673, "learning_rate": 1.2958225381420329e-08, "loss": 2.0351, "step": 13870 }, { "epoch": 0.9349415450961894, "grad_norm": 4.656584655353558, "learning_rate": 1.282555234208732e-08, "loss": 1.9555, "step": 13875 }, { "epoch": 0.9352784609682963, "grad_norm": 4.970371266457552, "learning_rate": 1.2693553165403104e-08, "loss": 2.0142, "step": 13880 }, { "epoch": 0.935615376840403, "grad_norm": 4.68774676824045, "learning_rate": 
1.2562228033949628e-08, "loss": 2.0129, "step": 13885 }, { "epoch": 0.9359522927125097, "grad_norm": 5.012468072596521, "learning_rate": 1.243157712937659e-08, "loss": 1.9871, "step": 13890 }, { "epoch": 0.9362892085846164, "grad_norm": 4.900015289190337, "learning_rate": 1.230160063240121e-08, "loss": 1.9904, "step": 13895 }, { "epoch": 0.9366261244567231, "grad_norm": 5.400815250135134, "learning_rate": 1.2172298722807617e-08, "loss": 1.9572, "step": 13900 }, { "epoch": 0.9369630403288299, "grad_norm": 5.331023427906662, "learning_rate": 1.204367157944708e-08, "loss": 1.9879, "step": 13905 }, { "epoch": 0.9372999562009366, "grad_norm": 5.055676799892551, "learning_rate": 1.19157193802375e-08, "loss": 1.9577, "step": 13910 }, { "epoch": 0.9376368720730434, "grad_norm": 4.9505159570587, "learning_rate": 1.1788442302163026e-08, "loss": 1.9274, "step": 13915 }, { "epoch": 0.9379737879451501, "grad_norm": 4.490827936874536, "learning_rate": 1.1661840521274168e-08, "loss": 1.9354, "step": 13920 }, { "epoch": 0.9383107038172568, "grad_norm": 4.809912113967792, "learning_rate": 1.1535914212687237e-08, "loss": 1.9262, "step": 13925 }, { "epoch": 0.9386476196893636, "grad_norm": 5.0635977216708685, "learning_rate": 1.1410663550584287e-08, "loss": 2.017, "step": 13930 }, { "epoch": 0.9389845355614703, "grad_norm": 4.74407804543668, "learning_rate": 1.1286088708212793e-08, "loss": 1.9766, "step": 13935 }, { "epoch": 0.939321451433577, "grad_norm": 5.325504995650674, "learning_rate": 1.1162189857885362e-08, "loss": 1.8894, "step": 13940 }, { "epoch": 0.9396583673056838, "grad_norm": 5.274137515015183, "learning_rate": 1.1038967170979741e-08, "loss": 2.0505, "step": 13945 }, { "epoch": 0.9399952831777905, "grad_norm": 5.279360083826585, "learning_rate": 1.0916420817938254e-08, "loss": 1.9681, "step": 13950 }, { "epoch": 0.9403321990498973, "grad_norm": 5.383462516396784, "learning_rate": 1.0794550968267701e-08, "loss": 1.985, "step": 13955 }, { "epoch": 0.940669114922004, 
"grad_norm": 4.494697554992901, "learning_rate": 1.0673357790539294e-08, "loss": 1.988, "step": 13960 }, { "epoch": 0.9410060307941107, "grad_norm": 4.977025690567251, "learning_rate": 1.0552841452388105e-08, "loss": 1.9616, "step": 13965 }, { "epoch": 0.9413429466662174, "grad_norm": 5.154118715328449, "learning_rate": 1.0433002120513123e-08, "loss": 2.0243, "step": 13970 }, { "epoch": 0.9416798625383241, "grad_norm": 4.519149799393283, "learning_rate": 1.0313839960676751e-08, "loss": 1.932, "step": 13975 }, { "epoch": 0.942016778410431, "grad_norm": 4.530273138043147, "learning_rate": 1.019535513770492e-08, "loss": 1.9306, "step": 13980 }, { "epoch": 0.9423536942825377, "grad_norm": 5.17607373014098, "learning_rate": 1.0077547815486476e-08, "loss": 2.028, "step": 13985 }, { "epoch": 0.9426906101546444, "grad_norm": 5.062068936076111, "learning_rate": 9.960418156973238e-09, "loss": 2.0242, "step": 13990 }, { "epoch": 0.9430275260267511, "grad_norm": 5.44803957068883, "learning_rate": 9.843966324179609e-09, "loss": 1.9877, "step": 13995 }, { "epoch": 0.9433644418988578, "grad_norm": 4.623390579653339, "learning_rate": 9.728192478182573e-09, "loss": 1.9884, "step": 14000 }, { "epoch": 0.9437013577709646, "grad_norm": 5.077234524944236, "learning_rate": 9.613096779121089e-09, "loss": 2.0012, "step": 14005 }, { "epoch": 0.9440382736430714, "grad_norm": 5.1610035764043465, "learning_rate": 9.498679386196417e-09, "loss": 1.8766, "step": 14010 }, { "epoch": 0.9443751895151781, "grad_norm": 5.470597948024354, "learning_rate": 9.384940457671186e-09, "loss": 1.9784, "step": 14015 }, { "epoch": 0.9447121053872848, "grad_norm": 5.231138293381884, "learning_rate": 9.271880150869882e-09, "loss": 2.0475, "step": 14020 }, { "epoch": 0.9450490212593915, "grad_norm": 5.274514122537882, "learning_rate": 9.15949862217824e-09, "loss": 1.9964, "step": 14025 }, { "epoch": 0.9453859371314982, "grad_norm": 5.402519375421664, "learning_rate": 9.04779602704292e-09, "loss": 2.0486, "step": 
14030 }, { "epoch": 0.945722853003605, "grad_norm": 4.732104288983082, "learning_rate": 8.936772519971769e-09, "loss": 1.9898, "step": 14035 }, { "epoch": 0.9460597688757117, "grad_norm": 4.8807134747988, "learning_rate": 8.826428254533169e-09, "loss": 2.0457, "step": 14040 }, { "epoch": 0.9463966847478185, "grad_norm": 4.616485980315057, "learning_rate": 8.716763383355862e-09, "loss": 1.9919, "step": 14045 }, { "epoch": 0.9467336006199252, "grad_norm": 5.250323068807304, "learning_rate": 8.607778058129122e-09, "loss": 2.0205, "step": 14050 }, { "epoch": 0.9470705164920319, "grad_norm": 5.080967458679945, "learning_rate": 8.499472429601972e-09, "loss": 1.9425, "step": 14055 }, { "epoch": 0.9474074323641387, "grad_norm": 5.465037346304789, "learning_rate": 8.391846647583468e-09, "loss": 2.0268, "step": 14060 }, { "epoch": 0.9477443482362454, "grad_norm": 5.3288242528442495, "learning_rate": 8.284900860942246e-09, "loss": 2.039, "step": 14065 }, { "epoch": 0.9480812641083521, "grad_norm": 5.193840580032862, "learning_rate": 8.178635217606367e-09, "loss": 1.9638, "step": 14070 }, { "epoch": 0.9484181799804589, "grad_norm": 4.615498190109572, "learning_rate": 8.073049864563142e-09, "loss": 1.9258, "step": 14075 }, { "epoch": 0.9487550958525656, "grad_norm": 4.62069163819021, "learning_rate": 7.968144947858801e-09, "loss": 1.9548, "step": 14080 }, { "epoch": 0.9490920117246724, "grad_norm": 5.297095383585142, "learning_rate": 7.863920612598496e-09, "loss": 2.0007, "step": 14085 }, { "epoch": 0.9494289275967791, "grad_norm": 5.138684542272962, "learning_rate": 7.760377002945961e-09, "loss": 2.004, "step": 14090 }, { "epoch": 0.9497658434688858, "grad_norm": 4.888218387424238, "learning_rate": 7.657514262123354e-09, "loss": 1.946, "step": 14095 }, { "epoch": 0.9501027593409925, "grad_norm": 5.628791084463584, "learning_rate": 7.55533253241103e-09, "loss": 1.9921, "step": 14100 }, { "epoch": 0.9504396752130992, "grad_norm": 4.930757306936257, "learning_rate": 
7.453831955147428e-09, "loss": 1.9925, "step": 14105 }, { "epoch": 0.9507765910852061, "grad_norm": 4.418318160888486, "learning_rate": 7.353012670728631e-09, "loss": 1.9428, "step": 14110 }, { "epoch": 0.9511135069573128, "grad_norm": 4.748288489484135, "learning_rate": 7.252874818608645e-09, "loss": 1.9617, "step": 14115 }, { "epoch": 0.9514504228294195, "grad_norm": 5.268296892042195, "learning_rate": 7.153418537298617e-09, "loss": 2.027, "step": 14120 }, { "epoch": 0.9517873387015262, "grad_norm": 5.1491558428050235, "learning_rate": 7.0546439643671685e-09, "loss": 1.9623, "step": 14125 }, { "epoch": 0.9521242545736329, "grad_norm": 5.145079390699005, "learning_rate": 6.9565512364398445e-09, "loss": 2.0155, "step": 14130 }, { "epoch": 0.9524611704457397, "grad_norm": 4.850443848278088, "learning_rate": 6.859140489199167e-09, "loss": 1.9477, "step": 14135 }, { "epoch": 0.9527980863178465, "grad_norm": 4.878568630331301, "learning_rate": 6.762411857384187e-09, "loss": 2.0168, "step": 14140 }, { "epoch": 0.9531350021899532, "grad_norm": 4.974661304597669, "learning_rate": 6.666365474790492e-09, "loss": 2.0054, "step": 14145 }, { "epoch": 0.9534719180620599, "grad_norm": 5.457120560848749, "learning_rate": 6.571001474270144e-09, "loss": 1.9394, "step": 14150 }, { "epoch": 0.9538088339341666, "grad_norm": 4.8212705026384, "learning_rate": 6.4763199877311825e-09, "loss": 1.9964, "step": 14155 }, { "epoch": 0.9541457498062734, "grad_norm": 5.04021494578342, "learning_rate": 6.382321146137571e-09, "loss": 2.0588, "step": 14160 }, { "epoch": 0.9544826656783801, "grad_norm": 5.008807931412574, "learning_rate": 6.28900507950908e-09, "loss": 2.0104, "step": 14165 }, { "epoch": 0.9548195815504869, "grad_norm": 5.050472020778995, "learning_rate": 6.196371916921073e-09, "loss": 1.9998, "step": 14170 }, { "epoch": 0.9551564974225936, "grad_norm": 5.462486200263347, "learning_rate": 6.1044217865043325e-09, "loss": 2.0106, "step": 14175 }, { "epoch": 0.9554934132947003, 
"grad_norm": 5.112117118334378, "learning_rate": 6.013154815444732e-09, "loss": 2.0357, "step": 14180 }, { "epoch": 0.9558303291668071, "grad_norm": 4.928524116584198, "learning_rate": 5.922571129983456e-09, "loss": 2.0529, "step": 14185 }, { "epoch": 0.9561672450389138, "grad_norm": 5.4585816427963705, "learning_rate": 5.832670855416277e-09, "loss": 2.0074, "step": 14190 }, { "epoch": 0.9565041609110205, "grad_norm": 5.133567163727801, "learning_rate": 5.7434541160938375e-09, "loss": 2.0171, "step": 14195 }, { "epoch": 0.9568410767831272, "grad_norm": 4.803350448372982, "learning_rate": 5.6549210354212565e-09, "loss": 2.0117, "step": 14200 }, { "epoch": 0.957177992655234, "grad_norm": 5.464558957665683, "learning_rate": 5.567071735858131e-09, "loss": 1.9569, "step": 14205 }, { "epoch": 0.9575149085273407, "grad_norm": 5.142387963625432, "learning_rate": 5.4799063389179834e-09, "loss": 1.991, "step": 14210 }, { "epoch": 0.9578518243994475, "grad_norm": 4.991757651049332, "learning_rate": 5.393424965168702e-09, "loss": 2.0205, "step": 14215 }, { "epoch": 0.9581887402715542, "grad_norm": 5.470355097857781, "learning_rate": 5.307627734231657e-09, "loss": 2.0599, "step": 14220 }, { "epoch": 0.9585256561436609, "grad_norm": 5.026644834224078, "learning_rate": 5.222514764782193e-09, "loss": 2.0458, "step": 14225 }, { "epoch": 0.9588625720157676, "grad_norm": 4.697064651043849, "learning_rate": 5.138086174549083e-09, "loss": 2.0109, "step": 14230 }, { "epoch": 0.9591994878878743, "grad_norm": 5.200233185104444, "learning_rate": 5.054342080314522e-09, "loss": 1.9742, "step": 14235 }, { "epoch": 0.9595364037599812, "grad_norm": 4.719992809214779, "learning_rate": 4.97128259791374e-09, "loss": 1.9253, "step": 14240 }, { "epoch": 0.9598733196320879, "grad_norm": 5.130490878086098, "learning_rate": 4.888907842235113e-09, "loss": 1.9365, "step": 14245 }, { "epoch": 0.9602102355041946, "grad_norm": 5.039901488721123, "learning_rate": 4.807217927220053e-09, "loss": 1.9903, 
"step": 14250 }, { "epoch": 0.9605471513763013, "grad_norm": 5.445139913452382, "learning_rate": 4.726212965862342e-09, "loss": 1.999, "step": 14255 }, { "epoch": 0.960884067248408, "grad_norm": 5.2579026471842845, "learning_rate": 4.645893070208684e-09, "loss": 1.956, "step": 14260 }, { "epoch": 0.9612209831205148, "grad_norm": 5.36316461960892, "learning_rate": 4.566258351357988e-09, "loss": 1.9388, "step": 14265 }, { "epoch": 0.9615578989926216, "grad_norm": 5.531377254003713, "learning_rate": 4.48730891946153e-09, "loss": 1.9317, "step": 14270 }, { "epoch": 0.9618948148647283, "grad_norm": 4.83144479893855, "learning_rate": 4.409044883722568e-09, "loss": 1.9503, "step": 14275 }, { "epoch": 0.962231730736835, "grad_norm": 4.94107329587846, "learning_rate": 4.331466352396396e-09, "loss": 2.0337, "step": 14280 }, { "epoch": 0.9625686466089417, "grad_norm": 4.800776159969681, "learning_rate": 4.2545734327902315e-09, "loss": 1.9615, "step": 14285 }, { "epoch": 0.9629055624810485, "grad_norm": 5.5077605310716224, "learning_rate": 4.178366231262665e-09, "loss": 1.9834, "step": 14290 }, { "epoch": 0.9632424783531552, "grad_norm": 4.850307245955739, "learning_rate": 4.102844853224041e-09, "loss": 2.0063, "step": 14295 }, { "epoch": 0.963579394225262, "grad_norm": 5.150944533744374, "learning_rate": 4.028009403135968e-09, "loss": 1.9731, "step": 14300 }, { "epoch": 0.9639163100973687, "grad_norm": 5.119177525408531, "learning_rate": 3.95385998451131e-09, "loss": 2.035, "step": 14305 }, { "epoch": 0.9642532259694754, "grad_norm": 4.608780605975504, "learning_rate": 3.880396699913968e-09, "loss": 1.9982, "step": 14310 }, { "epoch": 0.9645901418415822, "grad_norm": 5.548934725956067, "learning_rate": 3.807619650958827e-09, "loss": 1.9671, "step": 14315 }, { "epoch": 0.9649270577136889, "grad_norm": 5.31433408992231, "learning_rate": 3.7355289383115276e-09, "loss": 2.0333, "step": 14320 }, { "epoch": 0.9652639735857956, "grad_norm": 4.922275699861636, "learning_rate": 
3.664124661688417e-09, "loss": 2.0446, "step": 14325 }, { "epoch": 0.9656008894579023, "grad_norm": 5.898807773135097, "learning_rate": 3.5934069198562677e-09, "loss": 1.9269, "step": 14330 }, { "epoch": 0.9659378053300091, "grad_norm": 4.885607793867531, "learning_rate": 3.5233758106322787e-09, "loss": 1.9951, "step": 14335 }, { "epoch": 0.9662747212021159, "grad_norm": 5.1044484079516685, "learning_rate": 3.4540314308839635e-09, "loss": 1.985, "step": 14340 }, { "epoch": 0.9666116370742226, "grad_norm": 5.075123764839333, "learning_rate": 3.385373876528874e-09, "loss": 2.0267, "step": 14345 }, { "epoch": 0.9669485529463293, "grad_norm": 5.456820114158765, "learning_rate": 3.3174032425345444e-09, "loss": 1.9668, "step": 14350 }, { "epoch": 0.967285468818436, "grad_norm": 5.049645911239605, "learning_rate": 3.250119622918379e-09, "loss": 1.9301, "step": 14355 }, { "epoch": 0.9676223846905427, "grad_norm": 5.060932064765302, "learning_rate": 3.1835231107474323e-09, "loss": 1.9939, "step": 14360 }, { "epoch": 0.9679593005626496, "grad_norm": 5.183555091736461, "learning_rate": 3.1176137981385185e-09, "loss": 2.0025, "step": 14365 }, { "epoch": 0.9682962164347563, "grad_norm": 4.951370364441484, "learning_rate": 3.0523917762576568e-09, "loss": 1.9508, "step": 14370 }, { "epoch": 0.968633132306863, "grad_norm": 4.841627241474747, "learning_rate": 2.9878571353204595e-09, "loss": 2.0824, "step": 14375 }, { "epoch": 0.9689700481789697, "grad_norm": 5.165194476945867, "learning_rate": 2.924009964591578e-09, "loss": 1.9533, "step": 14380 }, { "epoch": 0.9693069640510764, "grad_norm": 4.822713128888923, "learning_rate": 2.8608503523848136e-09, "loss": 2.0172, "step": 14385 }, { "epoch": 0.9696438799231831, "grad_norm": 5.80927696082216, "learning_rate": 2.7983783860629496e-09, "loss": 1.9052, "step": 14390 }, { "epoch": 0.96998079579529, "grad_norm": 4.9172670955380235, "learning_rate": 2.7365941520375303e-09, "loss": 2.0031, "step": 14395 }, { "epoch": 0.9703177116673967, 
"grad_norm": 4.815662012800365, "learning_rate": 2.6754977357689724e-09, "loss": 1.9777, "step": 14400 }, { "epoch": 0.9706546275395034, "grad_norm": 4.676406485741691, "learning_rate": 2.6150892217660647e-09, "loss": 1.9091, "step": 14405 }, { "epoch": 0.9709915434116101, "grad_norm": 5.131039774073136, "learning_rate": 2.5553686935864126e-09, "loss": 2.0502, "step": 14410 }, { "epoch": 0.9713284592837168, "grad_norm": 5.1457684236170085, "learning_rate": 2.496336233835661e-09, "loss": 1.9566, "step": 14415 }, { "epoch": 0.9716653751558236, "grad_norm": 4.994031355660202, "learning_rate": 2.437991924167937e-09, "loss": 2.0515, "step": 14420 }, { "epoch": 0.9720022910279303, "grad_norm": 4.692963202976883, "learning_rate": 2.380335845285464e-09, "loss": 1.9854, "step": 14425 }, { "epoch": 0.9723392069000371, "grad_norm": 5.180059214381184, "learning_rate": 2.323368076938448e-09, "loss": 1.9906, "step": 14430 }, { "epoch": 0.9726761227721438, "grad_norm": 4.9645456332095765, "learning_rate": 2.2670886979250235e-09, "loss": 1.9475, "step": 14435 }, { "epoch": 0.9730130386442505, "grad_norm": 5.826236511069501, "learning_rate": 2.211497786091143e-09, "loss": 2.0013, "step": 14440 }, { "epoch": 0.9733499545163573, "grad_norm": 4.526198341413872, "learning_rate": 2.1565954183306313e-09, "loss": 1.9826, "step": 14445 }, { "epoch": 0.973686870388464, "grad_norm": 5.67819443433501, "learning_rate": 2.1023816705846853e-09, "loss": 2.0067, "step": 14450 }, { "epoch": 0.9740237862605707, "grad_norm": 4.926329791188667, "learning_rate": 2.048856617842043e-09, "loss": 1.9002, "step": 14455 }, { "epoch": 0.9743607021326774, "grad_norm": 4.822104609889236, "learning_rate": 1.9960203341389813e-09, "loss": 1.927, "step": 14460 }, { "epoch": 0.9746976180047842, "grad_norm": 5.165412184588229, "learning_rate": 1.943872892558929e-09, "loss": 2.0157, "step": 14465 }, { "epoch": 0.975034533876891, "grad_norm": 5.012117714322113, "learning_rate": 1.8924143652325196e-09, "loss": 1.9802, 
"step": 14470 }, { "epoch": 0.9753714497489977, "grad_norm": 4.968442527541088, "learning_rate": 1.8416448233374848e-09, "loss": 1.9218, "step": 14475 }, { "epoch": 0.9757083656211044, "grad_norm": 5.340492909493543, "learning_rate": 1.79156433709865e-09, "loss": 2.0321, "step": 14480 }, { "epoch": 0.9760452814932111, "grad_norm": 5.039820755432438, "learning_rate": 1.742172975787548e-09, "loss": 1.9657, "step": 14485 }, { "epoch": 0.9763821973653178, "grad_norm": 5.149730056166354, "learning_rate": 1.6934708077226411e-09, "loss": 1.9694, "step": 14490 }, { "epoch": 0.9767191132374247, "grad_norm": 4.83581053368547, "learning_rate": 1.6454579002690982e-09, "loss": 1.9981, "step": 14495 }, { "epoch": 0.9770560291095314, "grad_norm": 5.46767019919695, "learning_rate": 1.5981343198386288e-09, "loss": 2.0037, "step": 14500 }, { "epoch": 0.9773929449816381, "grad_norm": 5.041955001916575, "learning_rate": 1.5515001318895382e-09, "loss": 1.9385, "step": 14505 }, { "epoch": 0.9777298608537448, "grad_norm": 5.021760049935549, "learning_rate": 1.5055554009264505e-09, "loss": 2.0289, "step": 14510 }, { "epoch": 0.9780667767258515, "grad_norm": 5.141845995756756, "learning_rate": 1.4603001905004187e-09, "loss": 1.9548, "step": 14515 }, { "epoch": 0.9784036925979583, "grad_norm": 5.072917543619285, "learning_rate": 1.4157345632087592e-09, "loss": 1.8847, "step": 14520 }, { "epoch": 0.978740608470065, "grad_norm": 4.826143966376694, "learning_rate": 1.37185858069494e-09, "loss": 1.9606, "step": 14525 }, { "epoch": 0.9790775243421718, "grad_norm": 4.85975640440105, "learning_rate": 1.328672303648415e-09, "loss": 1.9756, "step": 14530 }, { "epoch": 0.9794144402142785, "grad_norm": 5.331883269436893, "learning_rate": 1.2861757918046778e-09, "loss": 1.9439, "step": 14535 }, { "epoch": 0.9797513560863852, "grad_norm": 5.1480113321785, "learning_rate": 1.2443691039452642e-09, "loss": 2.0098, "step": 14540 }, { "epoch": 0.980088271958492, "grad_norm": 5.342200244699657, 
"learning_rate": 1.203252297897417e-09, "loss": 1.9982, "step": 14545 }, { "epoch": 0.9804251878305987, "grad_norm": 4.9175873575150515, "learning_rate": 1.1628254305340869e-09, "loss": 1.9737, "step": 14550 }, { "epoch": 0.9807621037027054, "grad_norm": 5.160574859943779, "learning_rate": 1.123088557773988e-09, "loss": 1.9741, "step": 14555 }, { "epoch": 0.9810990195748122, "grad_norm": 5.015796754231919, "learning_rate": 1.0840417345814312e-09, "loss": 1.983, "step": 14560 }, { "epoch": 0.9814359354469189, "grad_norm": 4.7260742843450005, "learning_rate": 1.0456850149662134e-09, "loss": 1.9803, "step": 14565 }, { "epoch": 0.9817728513190256, "grad_norm": 4.788960454976405, "learning_rate": 1.0080184519835056e-09, "loss": 1.9399, "step": 14570 }, { "epoch": 0.9821097671911324, "grad_norm": 5.0605848784392045, "learning_rate": 9.71042097734076e-10, "loss": 1.9135, "step": 14575 }, { "epoch": 0.9824466830632391, "grad_norm": 4.809957477909992, "learning_rate": 9.347560033637347e-10, "loss": 1.9906, "step": 14580 }, { "epoch": 0.9827835989353458, "grad_norm": 5.1107192814029325, "learning_rate": 8.991602190636105e-10, "loss": 2.0045, "step": 14585 }, { "epoch": 0.9831205148074526, "grad_norm": 5.115810882212845, "learning_rate": 8.642547940700961e-10, "loss": 1.9637, "step": 14590 }, { "epoch": 0.9834574306795593, "grad_norm": 5.294610885204472, "learning_rate": 8.300397766644595e-10, "loss": 1.9763, "step": 14595 }, { "epoch": 0.9837943465516661, "grad_norm": 5.073127743931301, "learning_rate": 7.965152141732878e-10, "loss": 1.9197, "step": 14600 }, { "epoch": 0.9841312624237728, "grad_norm": 5.42842481630869, "learning_rate": 7.636811529678211e-10, "loss": 2.106, "step": 14605 }, { "epoch": 0.9844681782958795, "grad_norm": 5.2793809297140495, "learning_rate": 7.315376384643968e-10, "loss": 1.988, "step": 14610 }, { "epoch": 0.9848050941679862, "grad_norm": 5.225210333606875, "learning_rate": 7.000847151240608e-10, "loss": 2.0317, "step": 14615 }, { "epoch": 
0.9851420100400929, "grad_norm": 4.810990324740518, "learning_rate": 6.693224264527897e-10, "loss": 1.9953, "step": 14620 }, { "epoch": 0.9854789259121998, "grad_norm": 4.788926458206091, "learning_rate": 6.392508150011023e-10, "loss": 1.9916, "step": 14625 }, { "epoch": 0.9858158417843065, "grad_norm": 4.873035412975629, "learning_rate": 6.098699223641701e-10, "loss": 1.9781, "step": 14630 }, { "epoch": 0.9861527576564132, "grad_norm": 4.5106759847479605, "learning_rate": 5.811797891819847e-10, "loss": 1.9689, "step": 14635 }, { "epoch": 0.9864896735285199, "grad_norm": 5.278834400080295, "learning_rate": 5.531804551387464e-10, "loss": 2.0134, "step": 14640 }, { "epoch": 0.9868265894006266, "grad_norm": 4.913475984256622, "learning_rate": 5.258719589634198e-10, "loss": 1.8946, "step": 14645 }, { "epoch": 0.9871635052727334, "grad_norm": 6.064121606520511, "learning_rate": 4.992543384291781e-10, "loss": 2.0208, "step": 14650 }, { "epoch": 0.9875004211448402, "grad_norm": 5.5917127613181545, "learning_rate": 4.733276303537925e-10, "loss": 2.0218, "step": 14655 }, { "epoch": 0.9878373370169469, "grad_norm": 4.987065696743237, "learning_rate": 4.480918705991321e-10, "loss": 1.9909, "step": 14660 }, { "epoch": 0.9881742528890536, "grad_norm": 5.275111707853207, "learning_rate": 4.235470940715524e-10, "loss": 2.0148, "step": 14665 }, { "epoch": 0.9885111687611603, "grad_norm": 5.047795837026113, "learning_rate": 3.99693334721507e-10, "loss": 1.9521, "step": 14670 }, { "epoch": 0.9888480846332671, "grad_norm": 5.397338273396517, "learning_rate": 3.765306255436029e-10, "loss": 2.018, "step": 14675 }, { "epoch": 0.9891850005053738, "grad_norm": 5.067303306564753, "learning_rate": 3.540589985766562e-10, "loss": 2.014, "step": 14680 }, { "epoch": 0.9895219163774805, "grad_norm": 5.023194907599195, "learning_rate": 3.322784849036364e-10, "loss": 1.9942, "step": 14685 }, { "epoch": 0.9898588322495873, "grad_norm": 5.58367907070318, "learning_rate": 3.11189114651389e-10, 
"loss": 1.8728, "step": 14690 }, { "epoch": 0.990195748121694, "grad_norm": 5.364603174803651, "learning_rate": 2.9079091699091287e-10, "loss": 1.9661, "step": 14695 }, { "epoch": 0.9905326639938008, "grad_norm": 5.03097198898432, "learning_rate": 2.710839201370829e-10, "loss": 2.0473, "step": 14700 }, { "epoch": 0.9908695798659075, "grad_norm": 4.89039087763877, "learning_rate": 2.5206815134881655e-10, "loss": 1.9846, "step": 14705 }, { "epoch": 0.9912064957380142, "grad_norm": 4.979905904900416, "learning_rate": 2.337436369287404e-10, "loss": 2.0153, "step": 14710 }, { "epoch": 0.9915434116101209, "grad_norm": 4.897570819902302, "learning_rate": 2.1611040222346833e-10, "loss": 1.9816, "step": 14715 }, { "epoch": 0.9918803274822277, "grad_norm": 4.926035271915753, "learning_rate": 1.9916847162343432e-10, "loss": 2.0008, "step": 14720 }, { "epoch": 0.9922172433543345, "grad_norm": 4.8633911492629425, "learning_rate": 1.829178685627264e-10, "loss": 1.9981, "step": 14725 }, { "epoch": 0.9925541592264412, "grad_norm": 5.035989844511022, "learning_rate": 1.6735861551936402e-10, "loss": 1.9533, "step": 14730 }, { "epoch": 0.9928910750985479, "grad_norm": 4.940730440603908, "learning_rate": 1.5249073401502055e-10, "loss": 1.9411, "step": 14735 }, { "epoch": 0.9932279909706546, "grad_norm": 4.958342773716011, "learning_rate": 1.3831424461496766e-10, "loss": 1.9857, "step": 14740 }, { "epoch": 0.9935649068427613, "grad_norm": 5.158547113503269, "learning_rate": 1.2482916692824197e-10, "loss": 1.9416, "step": 14745 }, { "epoch": 0.993901822714868, "grad_norm": 5.123638235545546, "learning_rate": 1.1203551960742297e-10, "loss": 1.9712, "step": 14750 }, { "epoch": 0.9942387385869749, "grad_norm": 5.216610717344307, "learning_rate": 9.993332034891056e-11, "loss": 1.9377, "step": 14755 }, { "epoch": 0.9945756544590816, "grad_norm": 5.123322687774733, "learning_rate": 8.852258589236994e-11, "loss": 1.969, "step": 14760 }, { "epoch": 0.9949125703311883, "grad_norm": 
5.367670873204394, "learning_rate": 7.780333202134226e-11, "loss": 1.9673, "step": 14765 }, { "epoch": 0.995249486203295, "grad_norm": 5.04013763866802, "learning_rate": 6.777557356263397e-11, "loss": 2.0131, "step": 14770 }, { "epoch": 0.9955864020754017, "grad_norm": 5.083701541920439, "learning_rate": 5.843932438681643e-11, "loss": 2.0329, "step": 14775 }, { "epoch": 0.9959233179475085, "grad_norm": 5.1162812402580276, "learning_rate": 4.97945974077818e-11, "loss": 2.0302, "step": 14780 }, { "epoch": 0.9962602338196153, "grad_norm": 5.374225099947509, "learning_rate": 4.1841404582965143e-11, "loss": 1.9684, "step": 14785 }, { "epoch": 0.996597149691722, "grad_norm": 4.634464529745205, "learning_rate": 3.457975691334436e-11, "loss": 1.89, "step": 14790 }, { "epoch": 0.9969340655638287, "grad_norm": 4.984503972832307, "learning_rate": 2.800966444316266e-11, "loss": 1.9526, "step": 14795 }, { "epoch": 0.9972709814359354, "grad_norm": 5.0690627453420225, "learning_rate": 2.213113626026164e-11, "loss": 2.0139, "step": 14800 }, { "epoch": 0.9976078973080422, "grad_norm": 5.115123625588744, "learning_rate": 1.6944180495914728e-11, "loss": 1.9057, "step": 14805 }, { "epoch": 0.9979448131801489, "grad_norm": 5.17599995233217, "learning_rate": 1.2448804324660667e-11, "loss": 2.0183, "step": 14810 }, { "epoch": 0.9982817290522557, "grad_norm": 5.275773577482992, "learning_rate": 8.645013964581061e-12, "loss": 2.0584, "step": 14815 }, { "epoch": 0.9986186449243624, "grad_norm": 5.084450900463079, "learning_rate": 5.532814677133846e-12, "loss": 1.9969, "step": 14820 }, { "epoch": 0.9989555607964691, "grad_norm": 5.028186236855297, "learning_rate": 3.112210767042267e-12, "loss": 2.0054, "step": 14825 }, { "epoch": 0.9992924766685759, "grad_norm": 5.412221911739708, "learning_rate": 1.383205582516922e-12, "loss": 1.9309, "step": 14830 }, { "epoch": 0.9996293925406826, "grad_norm": 4.92683177695896, "learning_rate": 3.4580151520025024e-13, "loss": 2.0815, "step": 14835 }, { 
"epoch": 0.9999663084127893, "grad_norm": 5.292863008586021, "learning_rate": 0.0, "loss": 1.9997, "step": 14840 }, { "epoch": 0.9999663084127893, "eval_loss": null, "eval_runtime": 92.4437, "eval_samples_per_second": 51.902, "eval_steps_per_second": 6.49, "step": 14840 }, { "epoch": 0.9999663084127893, "step": 14840, "total_flos": 1117425352867840.0, "train_loss": 2.0976096506067363, "train_runtime": 19555.6217, "train_samples_per_second": 24.284, "train_steps_per_second": 0.759 } ], "logging_steps": 5, "max_steps": 14840, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1117425352867840.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }