{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 16138, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003098277357789069, "grad_norm": 3.09377121925354, "learning_rate": 4.956629491945477e-09, "loss": 1.1329, "step": 5 }, { "epoch": 0.0006196554715578138, "grad_norm": 3.107173442840576, "learning_rate": 1.1152416356877323e-08, "loss": 1.1488, "step": 10 }, { "epoch": 0.0009294832073367208, "grad_norm": 2.797147035598755, "learning_rate": 1.734820322180917e-08, "loss": 1.0985, "step": 15 }, { "epoch": 0.0012393109431156277, "grad_norm": 2.9521679878234863, "learning_rate": 2.3543990086741014e-08, "loss": 1.0833, "step": 20 }, { "epoch": 0.0015491386788945346, "grad_norm": 3.2119905948638916, "learning_rate": 2.9739776951672858e-08, "loss": 1.1429, "step": 25 }, { "epoch": 0.0018589664146734415, "grad_norm": 3.3861136436462402, "learning_rate": 3.593556381660471e-08, "loss": 1.1988, "step": 30 }, { "epoch": 0.0021687941504523484, "grad_norm": 3.0918397903442383, "learning_rate": 4.2131350681536554e-08, "loss": 1.1221, "step": 35 }, { "epoch": 0.0024786218862312553, "grad_norm": 3.4803788661956787, "learning_rate": 4.83271375464684e-08, "loss": 1.1336, "step": 40 }, { "epoch": 0.0027884496220101623, "grad_norm": 3.284985303878784, "learning_rate": 5.452292441140024e-08, "loss": 1.0775, "step": 45 }, { "epoch": 0.003098277357789069, "grad_norm": 2.811286211013794, "learning_rate": 6.071871127633208e-08, "loss": 1.1248, "step": 50 }, { "epoch": 0.003408105093567976, "grad_norm": 2.581263303756714, "learning_rate": 6.691449814126393e-08, "loss": 1.1061, "step": 55 }, { "epoch": 0.003717932829346883, "grad_norm": 2.979124069213867, "learning_rate": 7.31102850061958e-08, "loss": 1.1345, "step": 60 }, { "epoch": 0.00402776056512579, "grad_norm": 2.628190755844116, "learning_rate": 7.930607187112763e-08, "loss": 1.0758, "step": 65 }, { "epoch": 0.004337588300904697, "grad_norm": 3.246612548828125, "learning_rate": 8.550185873605948e-08, "loss": 1.1491, "step": 70 }, { "epoch": 0.004647416036683604, "grad_norm": 2.818136215209961, "learning_rate": 9.169764560099132e-08, "loss": 1.0934, "step": 75 }, { "epoch": 0.004957243772462511, "grad_norm": 2.77746844291687, "learning_rate": 9.789343246592317e-08, "loss": 1.1327, "step": 80 }, { "epoch": 0.005267071508241418, "grad_norm": 2.6841893196105957, "learning_rate": 1.0408921933085501e-07, "loss": 1.1067, "step": 85 }, { "epoch": 0.0055768992440203245, "grad_norm": 2.6152474880218506, "learning_rate": 1.1028500619578685e-07, "loss": 1.0916, "step": 90 }, { "epoch": 0.0058867269797992314, "grad_norm": 2.675133466720581, "learning_rate": 1.164807930607187e-07, "loss": 1.0945, "step": 95 }, { "epoch": 0.006196554715578138, "grad_norm": 2.658914089202881, "learning_rate": 1.2267657992565056e-07, "loss": 1.0685, "step": 100 }, { "epoch": 0.006506382451357045, "grad_norm": 3.31473445892334, "learning_rate": 1.2887236679058238e-07, "loss": 1.1394, "step": 105 }, { "epoch": 0.006816210187135952, "grad_norm": 2.2023932933807373, "learning_rate": 1.3506815365551425e-07, "loss": 1.086, "step": 110 }, { "epoch": 0.007126037922914859, "grad_norm": 2.5673131942749023, "learning_rate": 1.4126394052044611e-07, "loss": 1.0982, "step": 115 }, { "epoch": 0.007435865658693766, "grad_norm": 3.1275827884674072, "learning_rate": 1.4745972738537793e-07, "loss": 1.1296, "step": 120 }, { "epoch": 0.007745693394472673, "grad_norm": 2.658540964126587, "learning_rate": 1.536555142503098e-07, "loss": 1.1454, "step": 125 }, { "epoch": 0.00805552113025158, "grad_norm": 2.652073621749878, "learning_rate": 1.598513011152416e-07, "loss": 1.1488, "step": 130 }, { "epoch": 0.008365348866030487, "grad_norm": 2.4936656951904297, "learning_rate": 1.6604708798017348e-07, "loss": 1.0972, "step": 135 }, { "epoch": 0.008675176601809394, "grad_norm": 2.492208957672119, "learning_rate": 1.7224287484510532e-07, "loss": 1.1367, "step": 140 }, { "epoch": 0.0089850043375883, "grad_norm": 2.2766852378845215, "learning_rate": 1.7843866171003716e-07, "loss": 1.0748, "step": 145 }, { "epoch": 0.009294832073367208, "grad_norm": 2.3636932373046875, "learning_rate": 1.84634448574969e-07, "loss": 1.0283, "step": 150 }, { "epoch": 0.009604659809146114, "grad_norm": 2.4421446323394775, "learning_rate": 1.9083023543990087e-07, "loss": 1.1024, "step": 155 }, { "epoch": 0.009914487544925021, "grad_norm": 2.3002474308013916, "learning_rate": 1.970260223048327e-07, "loss": 1.0616, "step": 160 }, { "epoch": 0.010224315280703928, "grad_norm": 2.574122190475464, "learning_rate": 2.0322180916976455e-07, "loss": 1.0997, "step": 165 }, { "epoch": 0.010534143016482835, "grad_norm": 2.6849777698516846, "learning_rate": 2.094175960346964e-07, "loss": 1.076, "step": 170 }, { "epoch": 0.010843970752261742, "grad_norm": 2.4071404933929443, "learning_rate": 2.1561338289962824e-07, "loss": 1.1248, "step": 175 }, { "epoch": 0.011153798488040649, "grad_norm": 2.3346750736236572, "learning_rate": 2.218091697645601e-07, "loss": 1.0625, "step": 180 }, { "epoch": 0.011463626223819556, "grad_norm": 2.372404098510742, "learning_rate": 2.2800495662949192e-07, "loss": 1.0808, "step": 185 }, { "epoch": 0.011773453959598463, "grad_norm": 2.2451424598693848, "learning_rate": 2.3420074349442379e-07, "loss": 1.132, "step": 190 }, { "epoch": 0.01208328169537737, "grad_norm": 2.4459950923919678, "learning_rate": 2.403965303593556e-07, "loss": 1.1102, "step": 195 }, { "epoch": 0.012393109431156277, "grad_norm": 2.325868606567383, "learning_rate": 2.4659231722428747e-07, "loss": 1.1014, "step": 200 }, { "epoch": 0.012702937166935184, "grad_norm": 2.435431718826294, "learning_rate": 2.527881040892193e-07, "loss": 1.1019, "step": 205 }, { "epoch": 0.01301276490271409, "grad_norm": 2.4805209636688232, "learning_rate": 2.589838909541512e-07, "loss": 1.0481, "step": 210 }, { "epoch": 0.013322592638492997, "grad_norm": 2.4301934242248535, "learning_rate": 2.65179677819083e-07, "loss": 1.1444, "step": 215 }, { "epoch": 0.013632420374271904, "grad_norm": 2.1243042945861816, "learning_rate": 2.7137546468401483e-07, "loss": 1.0734, "step": 220 }, { "epoch": 0.013942248110050811, "grad_norm": 1.848518967628479, "learning_rate": 2.775712515489467e-07, "loss": 1.015, "step": 225 }, { "epoch": 0.014252075845829718, "grad_norm": 2.4169762134552, "learning_rate": 2.8376703841387857e-07, "loss": 1.0539, "step": 230 }, { "epoch": 0.014561903581608625, "grad_norm": 2.4012296199798584, "learning_rate": 2.899628252788104e-07, "loss": 1.0451, "step": 235 }, { "epoch": 0.014871731317387532, "grad_norm": 1.9514521360397339, "learning_rate": 2.961586121437422e-07, "loss": 1.071, "step": 240 }, { "epoch": 0.015181559053166439, "grad_norm": 1.9290523529052734, "learning_rate": 3.023543990086741e-07, "loss": 1.0942, "step": 245 }, { "epoch": 0.015491386788945346, "grad_norm": 1.9151558876037598, "learning_rate": 3.0855018587360594e-07, "loss": 1.0913, "step": 250 }, { "epoch": 0.015801214524724255, "grad_norm": 2.617140531539917, "learning_rate": 3.1474597273853775e-07, "loss": 1.0528, "step": 255 }, { "epoch": 0.01611104226050316, "grad_norm": 1.8712728023529053, "learning_rate": 3.209417596034696e-07, "loss": 1.1502, "step": 260 }, { "epoch": 0.01642086999628207, "grad_norm": 2.0234293937683105, "learning_rate": 3.271375464684015e-07, "loss": 1.0794, "step": 265 }, { "epoch": 0.016730697732060974, "grad_norm": 1.9631576538085938, "learning_rate": 3.333333333333333e-07, "loss": 1.0754, "step": 270 }, { "epoch": 0.017040525467839882, "grad_norm": 1.7581487894058228, "learning_rate": 3.3952912019826517e-07, "loss": 1.1005, "step": 275 }, { "epoch": 0.017350353203618787, "grad_norm": 1.789097785949707, "learning_rate": 3.4572490706319704e-07, "loss": 1.0564, "step": 280 }, { "epoch": 0.017660180939397696, "grad_norm": 1.9786816835403442, "learning_rate": 3.5192069392812885e-07, "loss": 1.0464, "step": 285 }, { "epoch": 0.0179700086751766, "grad_norm": 2.027357816696167, "learning_rate": 3.581164807930607e-07, "loss": 0.9798, "step": 290 }, { "epoch": 0.01827983641095551, "grad_norm": 2.41489839553833, "learning_rate": 3.6431226765799253e-07, "loss": 1.0703, "step": 295 }, { "epoch": 0.018589664146734415, "grad_norm": 1.7554446458816528, "learning_rate": 3.705080545229244e-07, "loss": 1.0551, "step": 300 }, { "epoch": 0.018899491882513324, "grad_norm": 1.8732099533081055, "learning_rate": 3.767038413878562e-07, "loss": 1.0619, "step": 305 }, { "epoch": 0.01920931961829223, "grad_norm": 1.9053443670272827, "learning_rate": 3.828996282527881e-07, "loss": 1.0015, "step": 310 }, { "epoch": 0.019519147354071138, "grad_norm": 1.7260830402374268, "learning_rate": 3.8909541511771995e-07, "loss": 1.0691, "step": 315 }, { "epoch": 0.019828975089850043, "grad_norm": 1.9854978322982788, "learning_rate": 3.9529120198265177e-07, "loss": 1.0094, "step": 320 }, { "epoch": 0.02013880282562895, "grad_norm": 1.8738073110580444, "learning_rate": 4.0148698884758363e-07, "loss": 1.0183, "step": 325 }, { "epoch": 0.020448630561407857, "grad_norm": 2.307349443435669, "learning_rate": 4.0768277571251545e-07, "loss": 0.9974, "step": 330 }, { "epoch": 0.020758458297186765, "grad_norm": 1.8529094457626343, "learning_rate": 4.138785625774473e-07, "loss": 1.0142, "step": 335 }, { "epoch": 0.02106828603296567, "grad_norm": 1.7680959701538086, "learning_rate": 4.200743494423792e-07, "loss": 1.026, "step": 340 }, { "epoch": 0.02137811376874458, "grad_norm": 1.6878095865249634, "learning_rate": 4.26270136307311e-07, "loss": 0.9843, "step": 345 }, { "epoch": 0.021687941504523484, "grad_norm": 1.7739229202270508, "learning_rate": 4.3246592317224287e-07, "loss": 0.9761, "step": 350 }, { "epoch": 0.021997769240302393, "grad_norm": 1.894353985786438, "learning_rate": 4.3866171003717474e-07, "loss": 0.9856, "step": 355 }, { "epoch": 0.022307596976081298, "grad_norm": 1.7225074768066406, "learning_rate": 4.4485749690210655e-07, "loss": 0.9878, "step": 360 }, { "epoch": 0.022617424711860207, "grad_norm": 1.8827643394470215, "learning_rate": 4.5105328376703836e-07, "loss": 1.06, "step": 365 }, { "epoch": 0.022927252447639112, "grad_norm": 1.654366374015808, "learning_rate": 4.5724907063197023e-07, "loss": 0.9866, "step": 370 }, { "epoch": 0.02323708018341802, "grad_norm": 1.9748345613479614, "learning_rate": 4.634448574969021e-07, "loss": 0.9968, "step": 375 }, { "epoch": 0.023546907919196926, "grad_norm": 1.6716351509094238, "learning_rate": 4.696406443618339e-07, "loss": 0.9894, "step": 380 }, { "epoch": 0.023856735654975834, "grad_norm": 1.6863290071487427, "learning_rate": 4.7583643122676573e-07, "loss": 1.0249, "step": 385 }, { "epoch": 0.02416656339075474, "grad_norm": 2.3415777683258057, "learning_rate": 4.820322180916977e-07, "loss": 1.0231, "step": 390 }, { "epoch": 0.024476391126533648, "grad_norm": 1.9225884675979614, "learning_rate": 4.882280049566295e-07, "loss": 1.023, "step": 395 }, { "epoch": 0.024786218862312553, "grad_norm": 1.5016530752182007, "learning_rate": 4.944237918215613e-07, "loss": 1.0143, "step": 400 }, { "epoch": 0.025096046598091462, "grad_norm": 1.7448056936264038, "learning_rate": 5.006195786864932e-07, "loss": 1.0281, "step": 405 }, { "epoch": 0.025405874333870367, "grad_norm": 1.6482943296432495, "learning_rate": 5.06815365551425e-07, "loss": 1.0163, "step": 410 }, { "epoch": 0.025715702069649276, "grad_norm": 1.8511091470718384, "learning_rate": 5.130111524163568e-07, "loss": 0.9966, "step": 415 }, { "epoch": 0.02602552980542818, "grad_norm": 1.7781504392623901, "learning_rate": 5.192069392812886e-07, "loss": 1.0458, "step": 420 }, { "epoch": 0.02633535754120709, "grad_norm": 2.044435739517212, "learning_rate": 5.254027261462205e-07, "loss": 1.0048, "step": 425 }, { "epoch": 0.026645185276985995, "grad_norm": 1.9608750343322754, "learning_rate": 5.315985130111525e-07, "loss": 0.9668, "step": 430 }, { "epoch": 0.026955013012764904, "grad_norm": 1.816880464553833, "learning_rate": 5.377942998760843e-07, "loss": 1.0529, "step": 435 }, { "epoch": 0.02726484074854381, "grad_norm": 1.6355019807815552, "learning_rate": 5.439900867410161e-07, "loss": 1.0326, "step": 440 }, { "epoch": 0.027574668484322717, "grad_norm": 1.764168620109558, "learning_rate": 5.501858736059479e-07, "loss": 0.9811, "step": 445 }, { "epoch": 0.027884496220101623, "grad_norm": 1.7886366844177246, "learning_rate": 5.563816604708797e-07, "loss": 1.0109, "step": 450 }, { "epoch": 0.02819432395588053, "grad_norm": 1.7068489789962769, "learning_rate": 5.625774473358116e-07, "loss": 1.0533, "step": 455 }, { "epoch": 0.028504151691659436, "grad_norm": 1.859300136566162, "learning_rate": 5.687732342007435e-07, "loss": 1.0279, "step": 460 }, { "epoch": 0.028813979427438345, "grad_norm": 1.7848422527313232, "learning_rate": 5.749690210656754e-07, "loss": 0.9937, "step": 465 }, { "epoch": 0.02912380716321725, "grad_norm": 2.1852474212646484, "learning_rate": 5.811648079306072e-07, "loss": 0.9846, "step": 470 }, { "epoch": 0.02943363489899616, "grad_norm": 2.0497853755950928, "learning_rate": 5.87360594795539e-07, "loss": 1.0278, "step": 475 }, { "epoch": 0.029743462634775064, "grad_norm": 1.7212690114974976, "learning_rate": 5.935563816604708e-07, "loss": 1.0019, "step": 480 }, { "epoch": 0.030053290370553973, "grad_norm": 1.9785066843032837, "learning_rate": 5.997521685254027e-07, "loss": 0.9846, "step": 485 }, { "epoch": 0.030363118106332878, "grad_norm": 1.7553397417068481, "learning_rate": 6.059479553903345e-07, "loss": 0.9763, "step": 490 }, { "epoch": 0.030672945842111787, "grad_norm": 1.5609034299850464, "learning_rate": 6.121437422552664e-07, "loss": 0.9542, "step": 495 }, { "epoch": 0.030982773577890692, "grad_norm": 1.6207069158554077, "learning_rate": 6.183395291201983e-07, "loss": 0.9645, "step": 500 }, { "epoch": 0.0312926013136696, "grad_norm": 1.9139336347579956, "learning_rate": 6.245353159851301e-07, "loss": 0.9226, "step": 505 }, { "epoch": 0.03160242904944851, "grad_norm": 1.8188204765319824, "learning_rate": 6.30731102850062e-07, "loss": 1.0568, "step": 510 }, { "epoch": 0.03191225678522741, "grad_norm": 1.760810136795044, "learning_rate": 6.369268897149938e-07, "loss": 1.0202, "step": 515 }, { "epoch": 0.03222208452100632, "grad_norm": 1.831602692604065, "learning_rate": 6.431226765799256e-07, "loss": 1.0624, "step": 520 }, { "epoch": 0.03253191225678523, "grad_norm": 1.7779512405395508, "learning_rate": 6.493184634448575e-07, "loss": 0.9812, "step": 525 }, { "epoch": 0.03284173999256414, "grad_norm": 1.951276183128357, "learning_rate": 6.555142503097893e-07, "loss": 0.9616, "step": 530 }, { "epoch": 0.03315156772834304, "grad_norm": 1.9644209146499634, "learning_rate": 6.617100371747211e-07, "loss": 0.9751, "step": 535 }, { "epoch": 0.03346139546412195, "grad_norm": 1.7855628728866577, "learning_rate": 6.67905824039653e-07, "loss": 1.0447, "step": 540 }, { "epoch": 0.033771223199900856, "grad_norm": 1.5926849842071533, "learning_rate": 6.741016109045849e-07, "loss": 1.0153, "step": 545 }, { "epoch": 0.034081050935679764, "grad_norm": 1.5938838720321655, "learning_rate": 6.802973977695167e-07, "loss": 0.9874, "step": 550 }, { "epoch": 0.034390878671458666, "grad_norm": 1.5012670755386353, "learning_rate": 6.864931846344485e-07, "loss": 0.9277, "step": 555 }, { "epoch": 0.034700706407237575, "grad_norm": 1.5705684423446655, "learning_rate": 6.926889714993804e-07, "loss": 1.0205, "step": 560 }, { "epoch": 0.035010534143016483, "grad_norm": 1.5950570106506348, "learning_rate": 6.988847583643122e-07, "loss": 1.0056, "step": 565 }, { "epoch": 0.03532036187879539, "grad_norm": 1.5925542116165161, "learning_rate": 7.05080545229244e-07, "loss": 0.9716, "step": 570 }, { "epoch": 0.035630189614574294, "grad_norm": 1.8238341808319092, "learning_rate": 7.11276332094176e-07, "loss": 0.9947, "step": 575 }, { "epoch": 0.0359400173503532, "grad_norm": 1.627885341644287, "learning_rate": 7.174721189591078e-07, "loss": 1.0055, "step": 580 }, { "epoch": 0.03624984508613211, "grad_norm": 1.9116154909133911, "learning_rate": 7.236679058240396e-07, "loss": 1.0202, "step": 585 }, { "epoch": 0.03655967282191102, "grad_norm": 1.7489193677902222, "learning_rate": 7.298636926889715e-07, "loss": 0.9952, "step": 590 }, { "epoch": 0.03686950055768992, "grad_norm": 1.6580047607421875, "learning_rate": 7.360594795539033e-07, "loss": 0.9823, "step": 595 }, { "epoch": 0.03717932829346883, "grad_norm": 2.242509365081787, "learning_rate": 7.422552664188351e-07, "loss": 1.0104, "step": 600 }, { "epoch": 0.03748915602924774, "grad_norm": 1.3214765787124634, "learning_rate": 7.48451053283767e-07, "loss": 0.9784, "step": 605 }, { "epoch": 0.03779898376502665, "grad_norm": 1.7482460737228394, "learning_rate": 7.546468401486989e-07, "loss": 1.0086, "step": 610 }, { "epoch": 0.03810881150080555, "grad_norm": 1.9315013885498047, "learning_rate": 7.608426270136307e-07, "loss": 0.9672, "step": 615 }, { "epoch": 0.03841863923658446, "grad_norm": 1.7050901651382446, "learning_rate": 7.670384138785625e-07, "loss": 1.0031, "step": 620 }, { "epoch": 0.038728466972363366, "grad_norm": 1.6901735067367554, "learning_rate": 7.732342007434944e-07, "loss": 0.9374, "step": 625 }, { "epoch": 0.039038294708142275, "grad_norm": 1.9918122291564941, "learning_rate": 7.794299876084262e-07, "loss": 0.9545, "step": 630 }, { "epoch": 0.03934812244392118, "grad_norm": 1.5523377656936646, "learning_rate": 7.856257744733581e-07, "loss": 0.9818, "step": 635 }, { "epoch": 0.039657950179700086, "grad_norm": 1.8401563167572021, "learning_rate": 7.918215613382899e-07, "loss": 0.9917, "step": 640 }, { "epoch": 0.039967777915478994, "grad_norm": 1.8160701990127563, "learning_rate": 7.980173482032218e-07, "loss": 0.9556, "step": 645 }, { "epoch": 0.0402776056512579, "grad_norm": 2.1353542804718018, "learning_rate": 8.042131350681536e-07, "loss": 0.9905, "step": 650 }, { "epoch": 0.040587433387036805, "grad_norm": 1.6101880073547363, "learning_rate": 8.104089219330854e-07, "loss": 0.9996, "step": 655 }, { "epoch": 0.04089726112281571, "grad_norm": 1.8142441511154175, "learning_rate": 8.166047087980173e-07, "loss": 0.9623, "step": 660 }, { "epoch": 0.04120708885859462, "grad_norm": 1.7841362953186035, "learning_rate": 8.228004956629492e-07, "loss": 0.9306, "step": 665 }, { "epoch": 0.04151691659437353, "grad_norm": 1.761864185333252, "learning_rate": 8.28996282527881e-07, "loss": 0.9735, "step": 670 }, { "epoch": 0.04182674433015243, "grad_norm": 1.5150208473205566, "learning_rate": 8.351920693928128e-07, "loss": 0.9753, "step": 675 }, { "epoch": 0.04213657206593134, "grad_norm": 1.5961802005767822, "learning_rate": 8.413878562577447e-07, "loss": 0.9061, "step": 680 }, { "epoch": 0.04244639980171025, "grad_norm": 1.9258601665496826, "learning_rate": 8.475836431226765e-07, "loss": 0.9766, "step": 685 }, { "epoch": 0.04275622753748916, "grad_norm": 1.6125866174697876, "learning_rate": 8.537794299876084e-07, "loss": 0.9341, "step": 690 }, { "epoch": 0.04306605527326806, "grad_norm": 1.7048609256744385, "learning_rate": 8.599752168525403e-07, "loss": 0.9621, "step": 695 }, { "epoch": 0.04337588300904697, "grad_norm": 1.5836412906646729, "learning_rate": 8.661710037174721e-07, "loss": 0.9422, "step": 700 }, { "epoch": 0.04368571074482588, "grad_norm": 1.7820950746536255, "learning_rate": 8.723667905824039e-07, "loss": 0.9882, "step": 705 }, { "epoch": 0.043995538480604786, "grad_norm": 1.7978854179382324, "learning_rate": 8.785625774473357e-07, "loss": 0.9673, "step": 710 }, { "epoch": 0.04430536621638369, "grad_norm": 1.8427197933197021, "learning_rate": 8.847583643122676e-07, "loss": 0.9107, "step": 715 }, { "epoch": 0.044615193952162596, "grad_norm": 1.5388127565383911, "learning_rate": 8.909541511771994e-07, "loss": 0.918, "step": 720 }, { "epoch": 0.044925021687941505, "grad_norm": 1.7851221561431885, "learning_rate": 8.971499380421314e-07, "loss": 0.9415, "step": 725 }, { "epoch": 0.045234849423720414, "grad_norm": 1.941830039024353, "learning_rate": 9.033457249070632e-07, "loss": 0.9597, "step": 730 }, { "epoch": 0.045544677159499315, "grad_norm": 1.774323582649231, "learning_rate": 9.09541511771995e-07, "loss": 0.9484, "step": 735 }, { "epoch": 0.045854504895278224, "grad_norm": 1.7083866596221924, "learning_rate": 9.157372986369268e-07, "loss": 0.9768, "step": 740 }, { "epoch": 0.04616433263105713, "grad_norm": 2.057176113128662, "learning_rate": 9.219330855018586e-07, "loss": 0.9309, "step": 745 }, { "epoch": 0.04647416036683604, "grad_norm": 1.6724480390548706, "learning_rate": 9.281288723667905e-07, "loss": 0.9805, "step": 750 }, { "epoch": 0.04678398810261494, "grad_norm": 1.7618985176086426, "learning_rate": 9.343246592317225e-07, "loss": 0.9688, "step": 755 }, { "epoch": 0.04709381583839385, "grad_norm": 1.8486956357955933, "learning_rate": 9.405204460966543e-07, "loss": 1.0173, "step": 760 }, { "epoch": 0.04740364357417276, "grad_norm": 1.8515139818191528, "learning_rate": 9.467162329615861e-07, "loss": 0.9169, "step": 765 }, { "epoch": 0.04771347130995167, "grad_norm": 1.965920090675354, "learning_rate": 9.529120198265179e-07, "loss": 0.9477, "step": 770 }, { "epoch": 0.04802329904573057, "grad_norm": 1.4112786054611206, "learning_rate": 9.591078066914497e-07, "loss": 0.9486, "step": 775 }, { "epoch": 0.04833312678150948, "grad_norm": 1.615864634513855, "learning_rate": 9.653035935563815e-07, "loss": 0.9291, "step": 780 }, { "epoch": 0.04864295451728839, "grad_norm": 2.183880090713501, "learning_rate": 9.714993804213134e-07, "loss": 0.9466, "step": 785 }, { "epoch": 0.048952782253067297, "grad_norm": 1.6047214269638062, "learning_rate": 9.776951672862454e-07, "loss": 0.9245, "step": 790 }, { "epoch": 0.0492626099888462, "grad_norm": 1.60440194606781, "learning_rate": 9.838909541511772e-07, "loss": 0.9146, "step": 795 }, { "epoch": 0.04957243772462511, "grad_norm": 1.8723225593566895, "learning_rate": 9.90086741016109e-07, "loss": 0.9352, "step": 800 }, { "epoch": 0.049882265460404016, "grad_norm": 1.7347556352615356, "learning_rate": 9.962825278810408e-07, "loss": 0.9495, "step": 805 }, { "epoch": 0.050192093196182924, "grad_norm": 1.5680465698242188, "learning_rate": 9.99869545365599e-07, "loss": 0.9279, "step": 810 }, { "epoch": 0.050501920931961826, "grad_norm": 1.8233542442321777, "learning_rate": 9.995434087795969e-07, "loss": 0.9252, "step": 815 }, { "epoch": 0.050811748667740735, "grad_norm": 1.952565312385559, "learning_rate": 9.992172721935945e-07, "loss": 0.9077, "step": 820 }, { "epoch": 0.05112157640351964, "grad_norm": 1.5445597171783447, "learning_rate": 9.988911356075924e-07, "loss": 0.8641, "step": 825 }, { "epoch": 0.05143140413929855, "grad_norm": 1.5205751657485962, "learning_rate": 9.985649990215903e-07, "loss": 1.018, "step": 830 }, { "epoch": 0.051741231875077454, "grad_norm": 2.1889641284942627, "learning_rate": 9.98238862435588e-07, "loss": 0.9016, "step": 835 }, { "epoch": 0.05205105961085636, "grad_norm": 1.633474588394165, "learning_rate": 9.979127258495858e-07, "loss": 0.9532, "step": 840 }, { "epoch": 0.05236088734663527, "grad_norm": 1.5481663942337036, "learning_rate": 9.975865892635835e-07, "loss": 0.8946, "step": 845 }, { "epoch": 0.05267071508241418, "grad_norm": 1.6633899211883545, "learning_rate": 9.972604526775813e-07, "loss": 0.9469, "step": 850 }, { "epoch": 0.05298054281819308, "grad_norm": 1.720362663269043, "learning_rate": 9.96934316091579e-07, "loss": 0.9479, "step": 855 }, { "epoch": 0.05329037055397199, "grad_norm": 1.6240829229354858, "learning_rate": 9.966081795055769e-07, "loss": 0.9589, "step": 860 }, { "epoch": 0.0536001982897509, "grad_norm": 1.7222377061843872, "learning_rate": 9.962820429195747e-07, "loss": 0.9616, "step": 865 }, { "epoch": 0.05391002602552981, "grad_norm": 1.6102875471115112, "learning_rate": 9.959559063335724e-07, "loss": 0.9492, "step": 870 }, { "epoch": 0.05421985376130871, "grad_norm": 1.7685558795928955, "learning_rate": 9.956297697475702e-07, "loss": 0.9588, "step": 875 }, { "epoch": 0.05452968149708762, "grad_norm": 1.7488722801208496, "learning_rate": 9.953036331615681e-07, "loss": 0.9278, "step": 880 }, { "epoch": 0.054839509232866526, "grad_norm": 2.110910177230835, "learning_rate": 9.949774965755658e-07, "loss": 0.9897, "step": 885 }, { "epoch": 0.055149336968645435, "grad_norm": 2.1324942111968994, "learning_rate": 9.946513599895636e-07, "loss": 0.9528, "step": 890 }, { "epoch": 0.05545916470442434, "grad_norm": 1.6846339702606201, "learning_rate": 9.943252234035613e-07, "loss": 0.9204, "step": 895 }, { "epoch": 0.055768992440203245, "grad_norm": 1.4202513694763184, "learning_rate": 9.939990868175592e-07, "loss": 0.9487, "step": 900 }, { "epoch": 0.056078820175982154, "grad_norm": 1.6517043113708496, "learning_rate": 9.93672950231557e-07, "loss": 0.9734, "step": 905 }, { "epoch": 0.05638864791176106, "grad_norm": 1.796568751335144, "learning_rate": 9.933468136455547e-07, "loss": 0.9283, "step": 910 }, { "epoch": 0.05669847564753997, "grad_norm": 1.7034024000167847, "learning_rate": 9.930206770595526e-07, "loss": 0.9066, "step": 915 }, { "epoch": 0.05700830338331887, "grad_norm": 1.8522111177444458, "learning_rate": 9.926945404735502e-07, "loss": 0.9227, "step": 920 }, { "epoch": 0.05731813111909778, "grad_norm": 1.9937344789505005, "learning_rate": 9.92368403887548e-07, "loss": 0.9312, "step": 925 }, { "epoch": 0.05762795885487669, "grad_norm": 1.8149816989898682, "learning_rate": 9.920422673015457e-07, "loss": 0.9298, "step": 930 }, { "epoch": 0.0579377865906556, "grad_norm": 1.5355404615402222, "learning_rate": 9.917161307155436e-07, "loss": 0.932, "step": 935 }, { "epoch": 0.0582476143264345, "grad_norm": 1.663727879524231, "learning_rate": 9.913899941295415e-07, "loss": 0.9748, "step": 940 }, { "epoch": 0.05855744206221341, "grad_norm": 1.7889565229415894, "learning_rate": 9.910638575435391e-07, "loss": 0.9106, "step": 945 }, { "epoch": 0.05886726979799232, "grad_norm": 1.6672755479812622, "learning_rate": 9.90737720957537e-07, "loss": 0.9196, "step": 950 }, { "epoch": 0.05917709753377123, "grad_norm": 1.7308260202407837, "learning_rate": 9.904115843715347e-07, "loss": 0.9634, "step": 955 }, { "epoch": 0.05948692526955013, "grad_norm": 1.9848425388336182, "learning_rate": 9.900854477855325e-07, "loss": 0.94, "step": 960 }, { "epoch": 0.05979675300532904, "grad_norm": 1.6974095106124878, "learning_rate": 9.897593111995302e-07, "loss": 0.9438, "step": 965 }, { "epoch": 0.060106580741107946, "grad_norm": 1.8826385736465454, "learning_rate": 9.89433174613528e-07, "loss": 0.9742, "step": 970 }, { "epoch": 0.060416408476886854, "grad_norm": 1.8570836782455444, "learning_rate": 9.89107038027526e-07, "loss": 0.9413, "step": 975 }, { "epoch": 0.060726236212665756, "grad_norm": 1.6992955207824707, "learning_rate": 9.887809014415238e-07, "loss": 0.8988, "step": 980 }, { "epoch": 0.061036063948444665, "grad_norm": 1.8292311429977417, "learning_rate": 9.884547648555214e-07, "loss": 0.9509, "step": 985 }, { "epoch": 0.06134589168422357, "grad_norm": 1.6534769535064697, "learning_rate": 9.881286282695193e-07, "loss": 0.9557, "step": 990 }, { "epoch": 0.06165571942000248, "grad_norm": 1.645851731300354, "learning_rate": 9.87802491683517e-07, "loss": 0.8722, "step": 995 }, { "epoch": 0.061965547155781384, "grad_norm": 1.7740751504898071, "learning_rate": 9.874763550975148e-07, "loss": 0.9649, "step": 1000 }, { "epoch": 0.06227537489156029, "grad_norm": 1.7853299379348755, "learning_rate": 9.871502185115125e-07, "loss": 0.9554, "step": 1005 }, { "epoch": 0.0625852026273392, "grad_norm": 1.6192938089370728, "learning_rate": 9.868240819255104e-07, "loss": 0.9734, "step": 1010 }, { "epoch": 0.06289503036311811, "grad_norm": 1.7637487649917603, "learning_rate": 9.864979453395082e-07, "loss": 0.9217, "step": 1015 }, { "epoch": 0.06320485809889702, "grad_norm": 1.58921480178833, "learning_rate": 9.861718087535059e-07, "loss": 0.8978, "step": 1020 }, { "epoch": 0.06351468583467593, "grad_norm": 1.6872888803482056, "learning_rate": 9.858456721675038e-07, "loss": 0.8829, "step": 1025 }, { "epoch": 0.06382451357045482, "grad_norm": 1.7745085954666138, "learning_rate": 9.855195355815014e-07, "loss": 0.9224, "step": 1030 }, { "epoch": 0.06413434130623373, "grad_norm": 1.7290229797363281, "learning_rate": 9.851933989954993e-07, "loss": 0.8935, "step": 1035 }, { "epoch": 0.06444416904201264, "grad_norm": 1.6771531105041504, "learning_rate": 9.84867262409497e-07, "loss": 0.9491, "step": 1040 }, { "epoch": 0.06475399677779155, "grad_norm": 1.7783493995666504, "learning_rate": 9.845411258234948e-07, "loss": 0.9509, "step": 1045 }, { "epoch": 0.06506382451357046, "grad_norm": 1.8041352033615112, "learning_rate": 9.842149892374927e-07, "loss": 0.9403, "step": 1050 }, { "epoch": 0.06537365224934936, "grad_norm": 1.7244343757629395, "learning_rate": 9.838888526514903e-07, "loss": 0.9276, "step": 1055 }, { "epoch": 0.06568347998512827, "grad_norm": 1.5198051929473877, "learning_rate": 9.835627160654882e-07, "loss": 0.8671, "step": 1060 }, { "epoch": 0.06599330772090718, "grad_norm": 1.6945546865463257, "learning_rate": 9.83236579479486e-07, "loss": 0.8714, "step": 1065 }, { "epoch": 0.06630313545668608, "grad_norm": 1.9277130365371704, "learning_rate": 9.829104428934837e-07, "loss": 0.9526, "step": 1070 }, { "epoch": 0.06661296319246499, "grad_norm": 1.5783915519714355, "learning_rate": 9.825843063074816e-07, "loss": 0.9471, "step": 1075 }, { "epoch": 0.0669227909282439, "grad_norm": 1.4125542640686035, "learning_rate": 9.822581697214792e-07, "loss": 0.9269, "step": 1080 }, { "epoch": 0.0672326186640228, "grad_norm": 1.6077899932861328, "learning_rate": 9.819320331354771e-07, "loss": 1.0017, "step": 1085 }, { "epoch": 0.06754244639980171, "grad_norm": 1.899767279624939, "learning_rate": 9.81605896549475e-07, "loss": 0.9879, "step": 1090 }, { "epoch": 0.06785227413558062, "grad_norm": 1.537110447883606, "learning_rate": 9.812797599634726e-07, "loss": 0.946, "step": 1095 }, { "epoch": 0.06816210187135953, "grad_norm": 1.5265432596206665, "learning_rate": 9.809536233774705e-07, "loss": 0.8835, "step": 1100 }, { "epoch": 0.06847192960713844, "grad_norm": 1.6527583599090576, "learning_rate": 9.806274867914682e-07, "loss": 0.9021, "step": 1105 }, { "epoch": 0.06878175734291733, "grad_norm": 1.7411730289459229, "learning_rate": 9.80301350205466e-07, "loss": 0.8907, "step": 1110 }, { "epoch": 0.06909158507869624, "grad_norm": 1.5506644248962402, "learning_rate": 9.799752136194637e-07, "loss": 0.9023, "step": 1115 }, { "epoch": 0.06940141281447515, "grad_norm": 1.6373412609100342, "learning_rate": 9.796490770334616e-07, "loss": 0.9934, "step": 1120 }, { "epoch": 0.06971124055025406, "grad_norm": 1.778870701789856, "learning_rate": 9.793229404474594e-07, "loss": 0.9239, "step": 1125 }, { "epoch": 0.07002106828603297, "grad_norm": 1.4526586532592773, "learning_rate": 9.78996803861457e-07, "loss": 0.9337, "step": 1130 }, { "epoch": 0.07033089602181188, "grad_norm": 1.5949856042861938, "learning_rate": 9.78670667275455e-07, "loss": 0.905, "step": 1135 }, { "epoch": 0.07064072375759078, "grad_norm": 1.771697998046875, "learning_rate": 9.783445306894526e-07, "loss": 0.9357, "step": 1140 }, { "epoch": 0.07095055149336969, "grad_norm": 1.9852505922317505, "learning_rate": 9.780183941034505e-07, "loss": 0.977, "step": 1145 }, { "epoch": 0.07126037922914859, "grad_norm": 1.6662964820861816, "learning_rate": 9.776922575174481e-07, "loss": 0.95, "step": 1150 }, { "epoch": 0.0715702069649275, "grad_norm": 1.6107369661331177, "learning_rate": 9.77366120931446e-07, "loss": 0.9121, "step": 1155 }, { "epoch": 0.0718800347007064, "grad_norm": 1.873172640800476, "learning_rate": 9.770399843454439e-07, "loss": 0.92, "step": 1160 }, { "epoch": 0.07218986243648531, "grad_norm": 1.8048964738845825, "learning_rate": 9.767138477594417e-07, "loss": 0.9077, "step": 1165 }, { "epoch": 0.07249969017226422, "grad_norm": 3.51314640045166, "learning_rate": 9.763877111734394e-07, "loss": 0.9621, "step": 1170 }, { "epoch": 0.07280951790804313, "grad_norm": 1.5693644285202026, "learning_rate": 9.760615745874373e-07, "loss": 0.9256, "step": 1175 }, { "epoch": 0.07311934564382204, "grad_norm": 1.7480493783950806, "learning_rate": 9.75735438001435e-07, "loss": 0.916, "step": 1180 }, { "epoch": 0.07342917337960095, "grad_norm": 1.6357393264770508, "learning_rate": 9.754093014154328e-07, "loss": 0.9011, "step": 1185 }, { "epoch": 0.07373900111537984, "grad_norm": 1.6978076696395874, "learning_rate": 9.750831648294304e-07, "loss": 0.9112, "step": 1190 }, { "epoch": 0.07404882885115875, "grad_norm": 1.9917634725570679, "learning_rate": 9.747570282434283e-07, "loss": 0.8765, "step": 1195 }, { "epoch": 0.07435865658693766, "grad_norm": 1.7949211597442627, "learning_rate": 9.744308916574262e-07, "loss": 0.9306, "step": 1200 }, { "epoch": 0.07466848432271657, "grad_norm": 1.587602972984314, "learning_rate": 9.741047550714238e-07, "loss": 0.9135, "step": 1205 }, { "epoch": 0.07497831205849548, "grad_norm": 1.6725713014602661, "learning_rate": 9.737786184854217e-07, "loss": 0.9453, "step": 1210 }, { "epoch": 0.07528813979427439, "grad_norm": 1.7273956537246704, "learning_rate": 9.734524818994194e-07, "loss": 0.8719, "step": 1215 }, { "epoch": 0.0755979675300533, "grad_norm": 1.7082599401474, "learning_rate": 9.731263453134172e-07, "loss": 0.9407, "step": 1220 }, { "epoch": 0.0759077952658322, "grad_norm": 1.5230612754821777, "learning_rate": 9.728002087274149e-07, "loss": 0.939, "step": 1225 }, { "epoch": 0.0762176230016111, "grad_norm": 1.6276631355285645, "learning_rate": 9.724740721414128e-07, "loss": 0.9432, "step": 1230 }, { "epoch": 0.07652745073739001, "grad_norm": 1.5785828828811646, "learning_rate": 9.721479355554106e-07, "loss": 0.8916, "step": 1235 }, { "epoch": 0.07683727847316892, "grad_norm": 1.9186655282974243, "learning_rate": 9.718217989694083e-07, "loss": 0.9517, "step": 1240 }, { "epoch": 0.07714710620894782, "grad_norm": 1.6019606590270996, "learning_rate": 9.714956623834062e-07, "loss": 0.9282, "step": 1245 }, { "epoch": 0.07745693394472673, "grad_norm": 2.026553153991699, "learning_rate": 9.711695257974038e-07, "loss": 0.9859, "step": 1250 }, { "epoch": 0.07776676168050564, "grad_norm": 1.6577638387680054, "learning_rate": 9.708433892114017e-07, "loss": 0.9526, "step": 1255 }, { "epoch": 0.07807658941628455, "grad_norm": 1.7208828926086426, "learning_rate": 9.705172526253995e-07, "loss": 0.9337, "step": 1260 }, { "epoch": 0.07838641715206346, "grad_norm": 1.6874709129333496, "learning_rate": 9.701911160393972e-07, "loss": 0.9266, "step": 1265 }, { "epoch": 0.07869624488784235, "grad_norm": 1.4453914165496826, "learning_rate": 9.69864979453395e-07, "loss": 0.9182, "step": 1270 }, { "epoch": 0.07900607262362126, "grad_norm": 1.9197102785110474, "learning_rate": 9.69538842867393e-07, "loss": 0.9366, "step": 1275 }, { "epoch": 0.07931590035940017, "grad_norm": 1.6617244482040405, "learning_rate": 9.692127062813906e-07, "loss": 0.9082, "step": 1280 }, { "epoch": 0.07962572809517908, "grad_norm": 1.5530146360397339, "learning_rate": 9.688865696953885e-07, "loss": 0.8815, "step": 1285 }, { "epoch": 0.07993555583095799, "grad_norm": 1.7353848218917847, "learning_rate": 9.685604331093861e-07, "loss": 0.935, "step": 1290 }, { "epoch": 0.0802453835667369, "grad_norm": 1.7109473943710327, "learning_rate": 9.68234296523384e-07, "loss": 0.899, "step": 1295 }, { "epoch": 0.0805552113025158, "grad_norm": 1.6293658018112183, "learning_rate": 9.679081599373816e-07, "loss": 0.8986, "step": 1300 }, { "epoch": 0.08086503903829471, "grad_norm": 1.8090403079986572, "learning_rate": 9.675820233513795e-07, "loss": 0.8904, "step": 1305 }, { "epoch": 0.08117486677407361, "grad_norm": 1.9262653589248657, "learning_rate": 9.672558867653774e-07, "loss": 0.9942, "step": 1310 }, { "epoch": 0.08148469450985252, "grad_norm": 1.7774248123168945, "learning_rate": 9.66929750179375e-07, "loss": 0.8625, "step": 1315 }, { "epoch": 0.08179452224563143, "grad_norm": 2.0802676677703857, "learning_rate": 9.66603613593373e-07, "loss": 0.9466, "step": 1320 }, { "epoch": 0.08210434998141034, "grad_norm": 2.630694627761841, "learning_rate": 9.662774770073706e-07, "loss": 0.9102, "step": 1325 }, { "epoch": 0.08241417771718924, "grad_norm": 1.69815993309021, "learning_rate": 9.659513404213684e-07, "loss": 0.8535, "step": 1330 }, { "epoch": 0.08272400545296815, "grad_norm": 1.540556788444519, "learning_rate": 9.65625203835366e-07, "loss": 0.9023, "step": 1335 }, { "epoch": 0.08303383318874706, "grad_norm": 1.6849764585494995, "learning_rate": 9.65299067249364e-07, "loss": 0.8794, "step": 1340 }, { "epoch": 0.08334366092452597, "grad_norm": 1.8992342948913574, "learning_rate": 9.649729306633616e-07, "loss": 0.8761, "step": 1345 }, { "epoch": 0.08365348866030486, "grad_norm": 1.57821786403656, "learning_rate": 9.646467940773597e-07, "loss": 0.9313, "step": 1350 }, { "epoch": 0.08396331639608377, "grad_norm": 1.7790791988372803, "learning_rate": 9.643206574913574e-07, "loss": 0.8942, "step": 1355 }, { "epoch": 0.08427314413186268, "grad_norm": 1.8208365440368652, "learning_rate": 9.639945209053552e-07, "loss": 0.911, "step": 1360 }, { "epoch": 0.08458297186764159, "grad_norm": 2.687201499938965, "learning_rate": 9.636683843193529e-07, "loss": 0.8931, "step": 1365 }, { "epoch": 0.0848927996034205, "grad_norm": 1.5522912740707397, "learning_rate": 9.633422477333507e-07, "loss": 0.93, "step": 1370 }, { "epoch": 0.08520262733919941, "grad_norm": 1.9544481039047241, "learning_rate": 9.630161111473484e-07, "loss": 0.9109, "step": 1375 }, { "epoch": 0.08551245507497832, "grad_norm": 1.5189294815063477, "learning_rate": 9.626899745613463e-07, "loss": 0.8789, "step": 1380 }, { "epoch": 0.08582228281075722, "grad_norm": 1.6897494792938232, "learning_rate": 9.623638379753441e-07, "loss": 0.9567, "step": 1385 }, { "epoch": 0.08613211054653612, "grad_norm": 1.4350030422210693, "learning_rate": 9.620377013893418e-07, "loss": 0.8617, "step": 1390 }, { "epoch": 0.08644193828231503, "grad_norm": 1.6191420555114746, "learning_rate": 9.617115648033397e-07, "loss": 0.8917, "step": 1395 }, { "epoch": 0.08675176601809394, "grad_norm": 1.675918459892273, "learning_rate": 9.613854282173373e-07, "loss": 0.9204, "step": 1400 }, { "epoch": 0.08706159375387285, "grad_norm": 1.698830008506775, "learning_rate": 9.610592916313352e-07, "loss": 0.9405, "step": 1405 }, { "epoch": 0.08737142148965175, "grad_norm": 1.5583676099777222, "learning_rate": 9.607331550453328e-07, "loss": 0.8985, "step": 1410 }, { "epoch": 0.08768124922543066, "grad_norm": 1.5671244859695435, "learning_rate": 9.604070184593307e-07, "loss": 0.9561, "step": 1415 }, { "epoch": 0.08799107696120957, "grad_norm": 1.6514347791671753, "learning_rate": 9.600808818733286e-07, "loss": 0.9038, "step": 1420 }, { "epoch": 0.08830090469698848, "grad_norm": 1.517734169960022, "learning_rate": 9.597547452873262e-07, "loss": 0.9377, "step": 1425 }, { "epoch": 0.08861073243276738, "grad_norm": 1.6121182441711426, "learning_rate": 9.59428608701324e-07, "loss": 0.933, "step": 1430 }, { "epoch": 0.08892056016854628, "grad_norm": 1.7271368503570557, "learning_rate": 9.591024721153218e-07, "loss": 0.9069, "step": 1435 }, { "epoch": 0.08923038790432519, "grad_norm": 1.5879682302474976, "learning_rate": 9.587763355293196e-07, "loss": 0.8869, "step": 1440 }, { "epoch": 0.0895402156401041, "grad_norm": 1.7184416055679321, "learning_rate": 9.584501989433175e-07, "loss": 0.9086, "step": 1445 }, { "epoch": 0.08985004337588301, "grad_norm": 1.827219843864441, "learning_rate": 9.581240623573152e-07, "loss": 0.9293, "step": 1450 }, { "epoch": 0.09015987111166192, "grad_norm": 1.7870301008224487, "learning_rate": 9.57797925771313e-07, "loss": 0.92, "step": 1455 }, { "epoch": 0.09046969884744083, "grad_norm": 1.7783180475234985, "learning_rate": 9.574717891853109e-07, "loss": 0.9499, "step": 1460 }, { "epoch": 0.09077952658321974, "grad_norm": 1.5470525026321411, "learning_rate": 9.571456525993085e-07, "loss": 0.8232, "step": 1465 }, { "epoch": 0.09108935431899863, "grad_norm": 1.9754806756973267, "learning_rate": 9.568195160133064e-07, "loss": 0.9568, "step": 1470 }, { "epoch": 0.09139918205477754, "grad_norm": 1.553620457649231, "learning_rate": 9.56493379427304e-07, "loss": 0.9305, "step": 1475 }, { "epoch": 0.09170900979055645, "grad_norm": 1.8772929906845093, "learning_rate": 9.56167242841302e-07, "loss": 0.9305, "step": 1480 }, { "epoch": 0.09201883752633536, "grad_norm": 1.7891936302185059, "learning_rate": 9.558411062552996e-07, "loss": 0.9315, "step": 1485 }, { "epoch": 0.09232866526211427, "grad_norm": 1.6559499502182007, "learning_rate": 9.555149696692975e-07, "loss": 0.9242, "step": 1490 }, { "epoch": 0.09263849299789317, "grad_norm": 1.7293493747711182, "learning_rate": 9.551888330832953e-07, "loss": 0.9616, "step": 1495 }, { "epoch": 0.09294832073367208, "grad_norm": 1.8268382549285889, "learning_rate": 9.54862696497293e-07, "loss": 0.889, "step": 1500 }, { "epoch": 0.09325814846945099, "grad_norm": 1.5643219947814941, "learning_rate": 9.545365599112909e-07, "loss": 0.9197, "step": 1505 }, { "epoch": 0.09356797620522989, "grad_norm": 1.654793620109558, "learning_rate": 9.542104233252885e-07, "loss": 0.9041, "step": 1510 }, { "epoch": 0.0938778039410088, "grad_norm": 1.4770208597183228, "learning_rate": 9.538842867392864e-07, "loss": 0.9296, "step": 1515 }, { "epoch": 0.0941876316767877, "grad_norm": 1.531435251235962, "learning_rate": 9.535581501532841e-07, "loss": 0.9219, "step": 1520 }, { "epoch": 0.09449745941256661, "grad_norm": 1.8545514345169067, "learning_rate": 9.532320135672819e-07, "loss": 0.9351, "step": 1525 }, { "epoch": 0.09480728714834552, "grad_norm": 1.9118808507919312, "learning_rate": 9.529058769812797e-07, "loss": 0.8892, "step": 1530 }, { "epoch": 0.09511711488412443, "grad_norm": 1.7484800815582275, "learning_rate": 9.525797403952775e-07, "loss": 0.9171, "step": 1535 }, { "epoch": 0.09542694261990334, "grad_norm": 1.8313547372817993, "learning_rate": 9.522536038092753e-07, "loss": 0.969, "step": 1540 }, { "epoch": 0.09573677035568225, "grad_norm": 1.5443423986434937, "learning_rate": 9.519274672232731e-07, "loss": 0.925, "step": 1545 }, { "epoch": 0.09604659809146114, "grad_norm": 1.5131423473358154, "learning_rate": 9.516013306372709e-07, "loss": 0.8508, "step": 1550 }, { "epoch": 0.09635642582724005, "grad_norm": 1.5174850225448608, "learning_rate": 9.512751940512687e-07, "loss": 0.9364, "step": 1555 }, { "epoch": 0.09666625356301896, "grad_norm": 1.5874196290969849, "learning_rate": 9.509490574652665e-07, "loss": 0.9212, "step": 1560 }, { "epoch": 0.09697608129879787, "grad_norm": 1.7019157409667969, "learning_rate": 9.506229208792642e-07, "loss": 0.9, "step": 1565 }, { "epoch": 0.09728590903457678, "grad_norm": 1.712227463722229, "learning_rate": 9.50296784293262e-07, "loss": 0.9903, "step": 1570 }, { "epoch": 0.09759573677035568, "grad_norm": 1.5827279090881348, "learning_rate": 9.499706477072597e-07, "loss": 0.9015, "step": 1575 }, { "epoch": 0.09790556450613459, "grad_norm": 1.9122669696807861, "learning_rate": 9.496445111212575e-07, "loss": 0.9707, "step": 1580 }, { "epoch": 0.0982153922419135, "grad_norm": 1.826370358467102, "learning_rate": 9.493183745352553e-07, "loss": 0.8499, "step": 1585 }, { "epoch": 0.0985252199776924, "grad_norm": 1.999861478805542, "learning_rate": 9.489922379492531e-07, "loss": 0.9974, "step": 1590 }, { "epoch": 0.0988350477134713, "grad_norm": 1.603154182434082, "learning_rate": 9.486661013632509e-07, "loss": 0.849, "step": 1595 }, { "epoch": 0.09914487544925021, "grad_norm": 1.7965315580368042, "learning_rate": 9.483399647772487e-07, "loss": 0.7868, "step": 1600 }, { "epoch": 0.09945470318502912, "grad_norm": 1.9171953201293945, "learning_rate": 9.480138281912464e-07, "loss": 0.8998, "step": 1605 }, { "epoch": 0.09976453092080803, "grad_norm": 2.218500852584839, "learning_rate": 9.476876916052442e-07, "loss": 0.9009, "step": 1610 }, { "epoch": 0.10007435865658694, "grad_norm": 1.6160693168640137, "learning_rate": 9.47361555019242e-07, "loss": 0.854, "step": 1615 }, { "epoch": 0.10038418639236585, "grad_norm": 1.6298314332962036, "learning_rate": 9.470354184332397e-07, "loss": 0.9257, "step": 1620 }, { "epoch": 0.10069401412814476, "grad_norm": 1.7202585935592651, "learning_rate": 9.467092818472375e-07, "loss": 0.9166, "step": 1625 }, { "epoch": 0.10100384186392365, "grad_norm": 1.723842978477478, "learning_rate": 9.463831452612355e-07, "loss": 0.94, "step": 1630 }, { "epoch": 0.10131366959970256, "grad_norm": 1.536241054534912, "learning_rate": 9.460570086752332e-07, "loss": 0.9554, "step": 1635 }, { "epoch": 0.10162349733548147, "grad_norm": 1.63093900680542, "learning_rate": 9.45730872089231e-07, "loss": 0.9113, "step": 1640 }, { "epoch": 0.10193332507126038, "grad_norm": 2.4173877239227295, "learning_rate": 9.454047355032287e-07, "loss": 0.9487, "step": 1645 }, { "epoch": 0.10224315280703929, "grad_norm": 1.7547491788864136, "learning_rate": 9.450785989172265e-07, "loss": 0.9939, "step": 1650 }, { "epoch": 0.1025529805428182, "grad_norm": 1.5368876457214355, "learning_rate": 9.447524623312243e-07, "loss": 0.9433, "step": 1655 }, { "epoch": 0.1028628082785971, "grad_norm": 1.8035043478012085, "learning_rate": 9.44426325745222e-07, "loss": 0.9166, "step": 1660 }, { "epoch": 0.10317263601437601, "grad_norm": 1.7987889051437378, "learning_rate": 9.441001891592199e-07, "loss": 0.8567, "step": 1665 }, { "epoch": 0.10348246375015491, "grad_norm": 1.6784334182739258, "learning_rate": 9.437740525732177e-07, "loss": 0.8501, "step": 1670 }, { "epoch": 0.10379229148593382, "grad_norm": 1.8578557968139648, "learning_rate": 9.434479159872154e-07, "loss": 0.9068, "step": 1675 }, { "epoch": 0.10410211922171272, "grad_norm": 1.9403997659683228, "learning_rate": 9.431217794012132e-07, "loss": 0.9171, "step": 1680 }, { "epoch": 0.10441194695749163, "grad_norm": 1.9572882652282715, "learning_rate": 9.427956428152109e-07, "loss": 0.9054, "step": 1685 }, { "epoch": 0.10472177469327054, "grad_norm": 1.5424216985702515, "learning_rate": 9.424695062292087e-07, "loss": 0.9206, "step": 1690 }, { "epoch": 0.10503160242904945, "grad_norm": 1.5377370119094849, "learning_rate": 9.421433696432065e-07, "loss": 0.8759, "step": 1695 }, { "epoch": 0.10534143016482836, "grad_norm": 1.8080122470855713, "learning_rate": 9.418172330572043e-07, "loss": 0.9304, "step": 1700 }, { "epoch": 0.10565125790060727, "grad_norm": 1.6878530979156494, "learning_rate": 9.414910964712021e-07, "loss": 0.9882, "step": 1705 }, { "epoch": 0.10596108563638616, "grad_norm": 1.7115623950958252, "learning_rate": 9.411649598851999e-07, "loss": 0.922, "step": 1710 }, { "epoch": 0.10627091337216507, "grad_norm": 1.5220273733139038, "learning_rate": 9.408388232991976e-07, "loss": 0.9355, "step": 1715 }, { "epoch": 0.10658074110794398, "grad_norm": 1.733494520187378, "learning_rate": 9.405126867131954e-07, "loss": 0.9894, "step": 1720 }, { "epoch": 0.10689056884372289, "grad_norm": 1.795032024383545, "learning_rate": 9.401865501271933e-07, "loss": 0.9305, "step": 1725 }, { "epoch": 0.1072003965795018, "grad_norm": 1.5463851690292358, "learning_rate": 9.39860413541191e-07, "loss": 0.901, "step": 1730 }, { "epoch": 0.1075102243152807, "grad_norm": 1.6902074813842773, "learning_rate": 9.395342769551888e-07, "loss": 0.9465, "step": 1735 }, { "epoch": 0.10782005205105961, "grad_norm": 1.5403833389282227, "learning_rate": 9.392081403691866e-07, "loss": 0.9309, "step": 1740 }, { "epoch": 0.10812987978683852, "grad_norm": 1.697380542755127, "learning_rate": 9.388820037831844e-07, "loss": 0.9405, "step": 1745 }, { "epoch": 0.10843970752261742, "grad_norm": 1.879273533821106, "learning_rate": 9.385558671971822e-07, "loss": 0.9863, "step": 1750 }, { "epoch": 0.10874953525839633, "grad_norm": 1.5659109354019165, "learning_rate": 9.382297306111799e-07, "loss": 0.9264, "step": 1755 }, { "epoch": 0.10905936299417524, "grad_norm": 1.811824917793274, "learning_rate": 9.379035940251777e-07, "loss": 0.9208, "step": 1760 }, { "epoch": 0.10936919072995414, "grad_norm": 1.5680676698684692, "learning_rate": 9.375774574391755e-07, "loss": 0.8977, "step": 1765 }, { "epoch": 0.10967901846573305, "grad_norm": 1.5863617658615112, "learning_rate": 9.372513208531732e-07, "loss": 0.9005, "step": 1770 }, { "epoch": 0.10998884620151196, "grad_norm": 1.6305958032608032, "learning_rate": 9.369251842671711e-07, "loss": 0.9283, "step": 1775 }, { "epoch": 0.11029867393729087, "grad_norm": 1.5092458724975586, "learning_rate": 9.365990476811689e-07, "loss": 0.9464, "step": 1780 }, { "epoch": 0.11060850167306978, "grad_norm": 1.5393297672271729, "learning_rate": 9.362729110951666e-07, "loss": 0.9243, "step": 1785 }, { "epoch": 0.11091832940884867, "grad_norm": 1.634668231010437, "learning_rate": 9.359467745091644e-07, "loss": 0.8267, "step": 1790 }, { "epoch": 0.11122815714462758, "grad_norm": 1.4916472434997559, "learning_rate": 9.356206379231621e-07, "loss": 0.8139, "step": 1795 }, { "epoch": 0.11153798488040649, "grad_norm": 1.694118857383728, "learning_rate": 9.352945013371599e-07, "loss": 0.868, "step": 1800 }, { "epoch": 0.1118478126161854, "grad_norm": 1.7894967794418335, "learning_rate": 9.349683647511577e-07, "loss": 0.8698, "step": 1805 }, { "epoch": 0.11215764035196431, "grad_norm": 1.7776713371276855, "learning_rate": 9.346422281651554e-07, "loss": 0.92, "step": 1810 }, { "epoch": 0.11246746808774322, "grad_norm": 1.7799886465072632, "learning_rate": 9.343160915791533e-07, "loss": 0.8764, "step": 1815 }, { "epoch": 0.11277729582352213, "grad_norm": 1.763051152229309, "learning_rate": 9.339899549931512e-07, "loss": 0.9057, "step": 1820 }, { "epoch": 0.11308712355930103, "grad_norm": 1.5663349628448486, "learning_rate": 9.336638184071489e-07, "loss": 0.8633, "step": 1825 }, { "epoch": 0.11339695129507994, "grad_norm": 1.921720266342163, "learning_rate": 9.333376818211467e-07, "loss": 0.9016, "step": 1830 }, { "epoch": 0.11370677903085884, "grad_norm": 1.5232547521591187, "learning_rate": 9.330115452351445e-07, "loss": 0.9428, "step": 1835 }, { "epoch": 0.11401660676663775, "grad_norm": 1.7436565160751343, "learning_rate": 9.326854086491422e-07, "loss": 0.9152, "step": 1840 }, { "epoch": 0.11432643450241665, "grad_norm": 2.010403871536255, "learning_rate": 9.3235927206314e-07, "loss": 0.9475, "step": 1845 }, { "epoch": 0.11463626223819556, "grad_norm": 1.5459505319595337, "learning_rate": 9.320331354771378e-07, "loss": 0.8347, "step": 1850 }, { "epoch": 0.11494608997397447, "grad_norm": 1.6541571617126465, "learning_rate": 9.317069988911356e-07, "loss": 0.8514, "step": 1855 }, { "epoch": 0.11525591770975338, "grad_norm": 1.4283198118209839, "learning_rate": 9.313808623051334e-07, "loss": 0.9219, "step": 1860 }, { "epoch": 0.11556574544553229, "grad_norm": 1.7266565561294556, "learning_rate": 9.310547257191311e-07, "loss": 0.8583, "step": 1865 }, { "epoch": 0.1158755731813112, "grad_norm": 1.6311653852462769, "learning_rate": 9.307285891331289e-07, "loss": 0.8464, "step": 1870 }, { "epoch": 0.11618540091709009, "grad_norm": 1.5725646018981934, "learning_rate": 9.304024525471267e-07, "loss": 0.9459, "step": 1875 }, { "epoch": 0.116495228652869, "grad_norm": 1.6118230819702148, "learning_rate": 9.300763159611244e-07, "loss": 0.9312, "step": 1880 }, { "epoch": 0.11680505638864791, "grad_norm": 1.6082074642181396, "learning_rate": 9.297501793751222e-07, "loss": 0.9268, "step": 1885 }, { "epoch": 0.11711488412442682, "grad_norm": 1.5976063013076782, "learning_rate": 9.2942404278912e-07, "loss": 0.8769, "step": 1890 }, { "epoch": 0.11742471186020573, "grad_norm": 1.7054197788238525, "learning_rate": 9.290979062031178e-07, "loss": 0.9238, "step": 1895 }, { "epoch": 0.11773453959598464, "grad_norm": 1.7131783962249756, "learning_rate": 9.287717696171156e-07, "loss": 0.9183, "step": 1900 }, { "epoch": 0.11804436733176354, "grad_norm": 1.7619596719741821, "learning_rate": 9.284456330311133e-07, "loss": 0.926, "step": 1905 }, { "epoch": 0.11835419506754245, "grad_norm": 1.5592797994613647, "learning_rate": 9.281194964451111e-07, "loss": 0.8816, "step": 1910 }, { "epoch": 0.11866402280332135, "grad_norm": 1.7521520853042603, "learning_rate": 9.27793359859109e-07, "loss": 0.9256, "step": 1915 }, { "epoch": 0.11897385053910026, "grad_norm": 1.6771315336227417, "learning_rate": 9.274672232731067e-07, "loss": 0.887, "step": 1920 }, { "epoch": 0.11928367827487917, "grad_norm": 1.8387212753295898, "learning_rate": 9.271410866871046e-07, "loss": 0.9427, "step": 1925 }, { "epoch": 0.11959350601065807, "grad_norm": 1.606123685836792, "learning_rate": 9.268149501011024e-07, "loss": 0.8883, "step": 1930 }, { "epoch": 0.11990333374643698, "grad_norm": 1.684762954711914, "learning_rate": 9.264888135151001e-07, "loss": 0.9205, "step": 1935 }, { "epoch": 0.12021316148221589, "grad_norm": 1.4890493154525757, "learning_rate": 9.261626769290979e-07, "loss": 0.9356, "step": 1940 }, { "epoch": 0.1205229892179948, "grad_norm": 1.3840742111206055, "learning_rate": 9.258365403430957e-07, "loss": 0.9436, "step": 1945 }, { "epoch": 0.12083281695377371, "grad_norm": 1.4673136472702026, "learning_rate": 9.255104037570934e-07, "loss": 0.9022, "step": 1950 }, { "epoch": 0.1211426446895526, "grad_norm": 1.6822834014892578, "learning_rate": 9.251842671710912e-07, "loss": 0.9326, "step": 1955 }, { "epoch": 0.12145247242533151, "grad_norm": 1.4832382202148438, "learning_rate": 9.248581305850889e-07, "loss": 0.9022, "step": 1960 }, { "epoch": 0.12176230016111042, "grad_norm": 1.5813413858413696, "learning_rate": 9.245319939990868e-07, "loss": 0.9139, "step": 1965 }, { "epoch": 0.12207212789688933, "grad_norm": 1.9620355367660522, "learning_rate": 9.242058574130846e-07, "loss": 0.9058, "step": 1970 }, { "epoch": 0.12238195563266824, "grad_norm": 1.526544213294983, "learning_rate": 9.238797208270823e-07, "loss": 0.8859, "step": 1975 }, { "epoch": 0.12269178336844715, "grad_norm": 1.5417823791503906, "learning_rate": 9.235535842410801e-07, "loss": 0.8682, "step": 1980 }, { "epoch": 0.12300161110422606, "grad_norm": 1.4717334508895874, "learning_rate": 9.232274476550779e-07, "loss": 0.8896, "step": 1985 }, { "epoch": 0.12331143884000496, "grad_norm": 1.5138407945632935, "learning_rate": 9.229013110690756e-07, "loss": 0.8911, "step": 1990 }, { "epoch": 0.12362126657578386, "grad_norm": 1.7051256895065308, "learning_rate": 9.225751744830734e-07, "loss": 0.9216, "step": 1995 }, { "epoch": 0.12393109431156277, "grad_norm": 1.6927268505096436, "learning_rate": 9.222490378970713e-07, "loss": 0.931, "step": 2000 }, { "epoch": 0.12424092204734168, "grad_norm": 1.5036671161651611, "learning_rate": 9.21922901311069e-07, "loss": 0.9156, "step": 2005 }, { "epoch": 0.12455074978312058, "grad_norm": 1.4694242477416992, "learning_rate": 9.215967647250669e-07, "loss": 0.8713, "step": 2010 }, { "epoch": 0.1248605775188995, "grad_norm": 1.6127041578292847, "learning_rate": 9.212706281390646e-07, "loss": 0.9125, "step": 2015 }, { "epoch": 0.1251704052546784, "grad_norm": 1.5481854677200317, "learning_rate": 9.209444915530624e-07, "loss": 0.8628, "step": 2020 }, { "epoch": 0.1254802329904573, "grad_norm": 1.726839542388916, "learning_rate": 9.206183549670602e-07, "loss": 0.9067, "step": 2025 }, { "epoch": 0.12579006072623622, "grad_norm": 1.6881474256515503, "learning_rate": 9.202922183810579e-07, "loss": 0.9122, "step": 2030 }, { "epoch": 0.12609988846201511, "grad_norm": 2.1938445568084717, "learning_rate": 9.199660817950558e-07, "loss": 0.9262, "step": 2035 }, { "epoch": 0.12640971619779404, "grad_norm": 1.8295819759368896, "learning_rate": 9.196399452090536e-07, "loss": 0.8807, "step": 2040 }, { "epoch": 0.12671954393357293, "grad_norm": 1.9458460807800293, "learning_rate": 9.193138086230513e-07, "loss": 0.9069, "step": 2045 }, { "epoch": 0.12702937166935185, "grad_norm": 1.6239795684814453, "learning_rate": 9.189876720370491e-07, "loss": 0.8615, "step": 2050 }, { "epoch": 0.12733919940513075, "grad_norm": 1.3073065280914307, "learning_rate": 9.186615354510468e-07, "loss": 0.8355, "step": 2055 }, { "epoch": 0.12764902714090964, "grad_norm": 2.5586483478546143, "learning_rate": 9.183353988650446e-07, "loss": 0.9012, "step": 2060 }, { "epoch": 0.12795885487668857, "grad_norm": 1.4722535610198975, "learning_rate": 9.180092622790424e-07, "loss": 0.8572, "step": 2065 }, { "epoch": 0.12826868261246746, "grad_norm": 1.8018276691436768, "learning_rate": 9.176831256930401e-07, "loss": 0.8946, "step": 2070 }, { "epoch": 0.12857851034824638, "grad_norm": 1.5933265686035156, "learning_rate": 9.17356989107038e-07, "loss": 0.8804, "step": 2075 }, { "epoch": 0.12888833808402528, "grad_norm": 1.762763500213623, "learning_rate": 9.170308525210358e-07, "loss": 0.9779, "step": 2080 }, { "epoch": 0.1291981658198042, "grad_norm": 1.6171183586120605, "learning_rate": 9.167047159350335e-07, "loss": 0.9502, "step": 2085 }, { "epoch": 0.1295079935555831, "grad_norm": 1.897364616394043, "learning_rate": 9.163785793490313e-07, "loss": 0.9154, "step": 2090 }, { "epoch": 0.129817821291362, "grad_norm": 2.136207103729248, "learning_rate": 9.160524427630291e-07, "loss": 0.8815, "step": 2095 }, { "epoch": 0.1301276490271409, "grad_norm": 1.8311100006103516, "learning_rate": 9.157263061770269e-07, "loss": 0.932, "step": 2100 }, { "epoch": 0.1304374767629198, "grad_norm": 1.8922637701034546, "learning_rate": 9.154001695910247e-07, "loss": 0.8854, "step": 2105 }, { "epoch": 0.13074730449869873, "grad_norm": 1.9181512594223022, "learning_rate": 9.150740330050226e-07, "loss": 0.9713, "step": 2110 }, { "epoch": 0.13105713223447762, "grad_norm": 1.9277687072753906, "learning_rate": 9.147478964190203e-07, "loss": 0.9074, "step": 2115 }, { "epoch": 0.13136695997025655, "grad_norm": 1.6547173261642456, "learning_rate": 9.144217598330181e-07, "loss": 0.8753, "step": 2120 }, { "epoch": 0.13167678770603544, "grad_norm": 1.5651469230651855, "learning_rate": 9.140956232470158e-07, "loss": 0.9386, "step": 2125 }, { "epoch": 0.13198661544181436, "grad_norm": 1.842453956604004, "learning_rate": 9.137694866610136e-07, "loss": 0.9277, "step": 2130 }, { "epoch": 0.13229644317759326, "grad_norm": 1.8406201601028442, "learning_rate": 9.134433500750114e-07, "loss": 0.9162, "step": 2135 }, { "epoch": 0.13260627091337215, "grad_norm": 1.6607987880706787, "learning_rate": 9.131172134890091e-07, "loss": 0.9503, "step": 2140 }, { "epoch": 0.13291609864915108, "grad_norm": 1.6080315113067627, "learning_rate": 9.127910769030069e-07, "loss": 0.919, "step": 2145 }, { "epoch": 0.13322592638492997, "grad_norm": 1.8985507488250732, "learning_rate": 9.124649403170048e-07, "loss": 0.9538, "step": 2150 }, { "epoch": 0.1335357541207089, "grad_norm": 1.8730796575546265, "learning_rate": 9.121388037310025e-07, "loss": 0.9226, "step": 2155 }, { "epoch": 0.1338455818564878, "grad_norm": 1.6366620063781738, "learning_rate": 9.118126671450003e-07, "loss": 0.9514, "step": 2160 }, { "epoch": 0.1341554095922667, "grad_norm": 1.8172043561935425, "learning_rate": 9.11486530558998e-07, "loss": 0.897, "step": 2165 }, { "epoch": 0.1344652373280456, "grad_norm": 1.5860812664031982, "learning_rate": 9.111603939729958e-07, "loss": 0.9012, "step": 2170 }, { "epoch": 0.1347750650638245, "grad_norm": 1.7490601539611816, "learning_rate": 9.108342573869936e-07, "loss": 0.9194, "step": 2175 }, { "epoch": 0.13508489279960342, "grad_norm": 1.7364531755447388, "learning_rate": 9.105081208009913e-07, "loss": 0.893, "step": 2180 }, { "epoch": 0.13539472053538232, "grad_norm": 1.7875436544418335, "learning_rate": 9.101819842149892e-07, "loss": 0.9032, "step": 2185 }, { "epoch": 0.13570454827116124, "grad_norm": 1.5101052522659302, "learning_rate": 9.09855847628987e-07, "loss": 0.9115, "step": 2190 }, { "epoch": 0.13601437600694014, "grad_norm": 1.9286682605743408, "learning_rate": 9.095297110429848e-07, "loss": 0.9116, "step": 2195 }, { "epoch": 0.13632420374271906, "grad_norm": 1.615739107131958, "learning_rate": 9.092035744569826e-07, "loss": 0.9117, "step": 2200 }, { "epoch": 0.13663403147849795, "grad_norm": 1.6398882865905762, "learning_rate": 9.088774378709804e-07, "loss": 0.9357, "step": 2205 }, { "epoch": 0.13694385921427688, "grad_norm": 1.6727508306503296, "learning_rate": 9.085513012849781e-07, "loss": 0.8684, "step": 2210 }, { "epoch": 0.13725368695005577, "grad_norm": 1.716570496559143, "learning_rate": 9.082251646989759e-07, "loss": 0.8447, "step": 2215 }, { "epoch": 0.13756351468583466, "grad_norm": 1.8427702188491821, "learning_rate": 9.078990281129736e-07, "loss": 0.9071, "step": 2220 }, { "epoch": 0.1378733424216136, "grad_norm": 1.6984174251556396, "learning_rate": 9.075728915269715e-07, "loss": 0.9052, "step": 2225 }, { "epoch": 0.13818317015739248, "grad_norm": 1.892127513885498, "learning_rate": 9.072467549409693e-07, "loss": 0.848, "step": 2230 }, { "epoch": 0.1384929978931714, "grad_norm": 1.649255394935608, "learning_rate": 9.06920618354967e-07, "loss": 0.8856, "step": 2235 }, { "epoch": 0.1388028256289503, "grad_norm": 1.8197404146194458, "learning_rate": 9.065944817689648e-07, "loss": 0.9588, "step": 2240 }, { "epoch": 0.13911265336472922, "grad_norm": 1.5840644836425781, "learning_rate": 9.062683451829626e-07, "loss": 0.8563, "step": 2245 }, { "epoch": 0.13942248110050812, "grad_norm": 1.5736013650894165, "learning_rate": 9.059422085969603e-07, "loss": 0.8766, "step": 2250 }, { "epoch": 0.139732308836287, "grad_norm": 1.6825470924377441, "learning_rate": 9.056160720109581e-07, "loss": 0.9023, "step": 2255 }, { "epoch": 0.14004213657206593, "grad_norm": 1.4966249465942383, "learning_rate": 9.05289935424956e-07, "loss": 0.9117, "step": 2260 }, { "epoch": 0.14035196430784483, "grad_norm": 1.8008887767791748, "learning_rate": 9.049637988389537e-07, "loss": 0.8767, "step": 2265 }, { "epoch": 0.14066179204362375, "grad_norm": 1.998075008392334, "learning_rate": 9.046376622529515e-07, "loss": 0.8312, "step": 2270 }, { "epoch": 0.14097161977940265, "grad_norm": 1.6006526947021484, "learning_rate": 9.043115256669492e-07, "loss": 0.92, "step": 2275 }, { "epoch": 0.14128144751518157, "grad_norm": 1.3593368530273438, "learning_rate": 9.03985389080947e-07, "loss": 0.9097, "step": 2280 }, { "epoch": 0.14159127525096046, "grad_norm": 1.6495636701583862, "learning_rate": 9.036592524949448e-07, "loss": 0.8744, "step": 2285 }, { "epoch": 0.14190110298673939, "grad_norm": 1.7051562070846558, "learning_rate": 9.033331159089426e-07, "loss": 0.8672, "step": 2290 }, { "epoch": 0.14221093072251828, "grad_norm": 1.730962872505188, "learning_rate": 9.030069793229405e-07, "loss": 0.8875, "step": 2295 }, { "epoch": 0.14252075845829718, "grad_norm": 1.6012706756591797, "learning_rate": 9.026808427369383e-07, "loss": 0.9142, "step": 2300 }, { "epoch": 0.1428305861940761, "grad_norm": 1.7790347337722778, "learning_rate": 9.02354706150936e-07, "loss": 0.9115, "step": 2305 }, { "epoch": 0.143140413929855, "grad_norm": 1.6538646221160889, "learning_rate": 9.020285695649338e-07, "loss": 0.8256, "step": 2310 }, { "epoch": 0.14345024166563392, "grad_norm": 1.7245451211929321, "learning_rate": 9.017024329789316e-07, "loss": 0.8715, "step": 2315 }, { "epoch": 0.1437600694014128, "grad_norm": 1.5423656702041626, "learning_rate": 9.013762963929293e-07, "loss": 0.8383, "step": 2320 }, { "epoch": 0.14406989713719173, "grad_norm": 1.9696499109268188, "learning_rate": 9.010501598069271e-07, "loss": 0.9468, "step": 2325 }, { "epoch": 0.14437972487297063, "grad_norm": 1.57548987865448, "learning_rate": 9.007240232209248e-07, "loss": 0.8801, "step": 2330 }, { "epoch": 0.14468955260874952, "grad_norm": 1.5706093311309814, "learning_rate": 9.003978866349227e-07, "loss": 0.8886, "step": 2335 }, { "epoch": 0.14499938034452844, "grad_norm": 1.54964017868042, "learning_rate": 9.000717500489205e-07, "loss": 0.9499, "step": 2340 }, { "epoch": 0.14530920808030734, "grad_norm": 1.5829427242279053, "learning_rate": 8.997456134629182e-07, "loss": 0.8704, "step": 2345 }, { "epoch": 0.14561903581608626, "grad_norm": 1.6156928539276123, "learning_rate": 8.99419476876916e-07, "loss": 0.874, "step": 2350 }, { "epoch": 0.14592886355186516, "grad_norm": 1.8616446256637573, "learning_rate": 8.990933402909138e-07, "loss": 0.8732, "step": 2355 }, { "epoch": 0.14623869128764408, "grad_norm": 1.6521779298782349, "learning_rate": 8.987672037049115e-07, "loss": 0.8797, "step": 2360 }, { "epoch": 0.14654851902342297, "grad_norm": 1.749068021774292, "learning_rate": 8.984410671189093e-07, "loss": 0.8912, "step": 2365 }, { "epoch": 0.1468583467592019, "grad_norm": 1.753404140472412, "learning_rate": 8.98114930532907e-07, "loss": 0.8974, "step": 2370 }, { "epoch": 0.1471681744949808, "grad_norm": 2.0058035850524902, "learning_rate": 8.977887939469049e-07, "loss": 0.9412, "step": 2375 }, { "epoch": 0.14747800223075969, "grad_norm": 1.7681653499603271, "learning_rate": 8.974626573609027e-07, "loss": 0.9178, "step": 2380 }, { "epoch": 0.1477878299665386, "grad_norm": 1.6263868808746338, "learning_rate": 8.971365207749005e-07, "loss": 0.9232, "step": 2385 }, { "epoch": 0.1480976577023175, "grad_norm": 1.6443973779678345, "learning_rate": 8.968103841888983e-07, "loss": 0.8992, "step": 2390 }, { "epoch": 0.14840748543809643, "grad_norm": 1.662570595741272, "learning_rate": 8.964842476028961e-07, "loss": 0.9144, "step": 2395 }, { "epoch": 0.14871731317387532, "grad_norm": 1.6928895711898804, "learning_rate": 8.961581110168938e-07, "loss": 0.9107, "step": 2400 }, { "epoch": 0.14902714090965424, "grad_norm": 1.5469177961349487, "learning_rate": 8.958319744308916e-07, "loss": 0.9074, "step": 2405 }, { "epoch": 0.14933696864543314, "grad_norm": 1.6787333488464355, "learning_rate": 8.955058378448895e-07, "loss": 0.8936, "step": 2410 }, { "epoch": 0.14964679638121203, "grad_norm": 1.5153744220733643, "learning_rate": 8.951797012588872e-07, "loss": 0.8626, "step": 2415 }, { "epoch": 0.14995662411699096, "grad_norm": 1.6840120553970337, "learning_rate": 8.94853564672885e-07, "loss": 0.8604, "step": 2420 }, { "epoch": 0.15026645185276985, "grad_norm": 1.5821272134780884, "learning_rate": 8.945274280868828e-07, "loss": 0.8908, "step": 2425 }, { "epoch": 0.15057627958854877, "grad_norm": 1.8581312894821167, "learning_rate": 8.942012915008805e-07, "loss": 0.8804, "step": 2430 }, { "epoch": 0.15088610732432767, "grad_norm": 1.5385000705718994, "learning_rate": 8.938751549148783e-07, "loss": 0.9189, "step": 2435 }, { "epoch": 0.1511959350601066, "grad_norm": 1.818303108215332, "learning_rate": 8.93549018328876e-07, "loss": 0.8718, "step": 2440 }, { "epoch": 0.15150576279588548, "grad_norm": 2.020456552505493, "learning_rate": 8.932228817428738e-07, "loss": 0.9049, "step": 2445 }, { "epoch": 0.1518155905316644, "grad_norm": 1.4328876733779907, "learning_rate": 8.928967451568717e-07, "loss": 0.8841, "step": 2450 }, { "epoch": 0.1521254182674433, "grad_norm": 1.4442330598831177, "learning_rate": 8.925706085708694e-07, "loss": 0.8614, "step": 2455 }, { "epoch": 0.1524352460032222, "grad_norm": 1.5784924030303955, "learning_rate": 8.922444719848672e-07, "loss": 0.8766, "step": 2460 }, { "epoch": 0.15274507373900112, "grad_norm": 1.64808189868927, "learning_rate": 8.91918335398865e-07, "loss": 0.9636, "step": 2465 }, { "epoch": 0.15305490147478001, "grad_norm": 1.7615500688552856, "learning_rate": 8.915921988128627e-07, "loss": 0.8783, "step": 2470 }, { "epoch": 0.15336472921055894, "grad_norm": 1.640971064567566, "learning_rate": 8.912660622268605e-07, "loss": 0.8492, "step": 2475 }, { "epoch": 0.15367455694633783, "grad_norm": 1.7891745567321777, "learning_rate": 8.909399256408584e-07, "loss": 0.9216, "step": 2480 }, { "epoch": 0.15398438468211675, "grad_norm": 1.6848870515823364, "learning_rate": 8.906137890548562e-07, "loss": 0.9514, "step": 2485 }, { "epoch": 0.15429421241789565, "grad_norm": 1.7956445217132568, "learning_rate": 8.90287652468854e-07, "loss": 0.9426, "step": 2490 }, { "epoch": 0.15460404015367454, "grad_norm": 1.4638934135437012, "learning_rate": 8.899615158828517e-07, "loss": 0.9085, "step": 2495 }, { "epoch": 0.15491386788945347, "grad_norm": 2.014169216156006, "learning_rate": 8.896353792968495e-07, "loss": 0.9044, "step": 2500 }, { "epoch": 0.15522369562523236, "grad_norm": 1.64141047000885, "learning_rate": 8.893092427108473e-07, "loss": 0.9003, "step": 2505 }, { "epoch": 0.15553352336101128, "grad_norm": 1.806712031364441, "learning_rate": 8.88983106124845e-07, "loss": 0.8466, "step": 2510 }, { "epoch": 0.15584335109679018, "grad_norm": 1.764486312866211, "learning_rate": 8.886569695388428e-07, "loss": 0.9291, "step": 2515 }, { "epoch": 0.1561531788325691, "grad_norm": 1.5419678688049316, "learning_rate": 8.883308329528407e-07, "loss": 0.8696, "step": 2520 }, { "epoch": 0.156463006568348, "grad_norm": 1.5634942054748535, "learning_rate": 8.880046963668384e-07, "loss": 0.9241, "step": 2525 }, { "epoch": 0.15677283430412692, "grad_norm": 1.7636653184890747, "learning_rate": 8.876785597808362e-07, "loss": 0.9117, "step": 2530 }, { "epoch": 0.1570826620399058, "grad_norm": 1.7944517135620117, "learning_rate": 8.87352423194834e-07, "loss": 0.8293, "step": 2535 }, { "epoch": 0.1573924897756847, "grad_norm": 2.3137447834014893, "learning_rate": 8.870262866088317e-07, "loss": 0.9034, "step": 2540 }, { "epoch": 0.15770231751146363, "grad_norm": 1.4866509437561035, "learning_rate": 8.867001500228295e-07, "loss": 0.9313, "step": 2545 }, { "epoch": 0.15801214524724252, "grad_norm": 1.6374338865280151, "learning_rate": 8.863740134368272e-07, "loss": 0.904, "step": 2550 }, { "epoch": 0.15832197298302145, "grad_norm": 1.6108781099319458, "learning_rate": 8.86047876850825e-07, "loss": 0.8622, "step": 2555 }, { "epoch": 0.15863180071880034, "grad_norm": 1.5189409255981445, "learning_rate": 8.857217402648229e-07, "loss": 0.8868, "step": 2560 }, { "epoch": 0.15894162845457926, "grad_norm": 1.4689654111862183, "learning_rate": 8.853956036788206e-07, "loss": 0.898, "step": 2565 }, { "epoch": 0.15925145619035816, "grad_norm": 1.760972023010254, "learning_rate": 8.850694670928184e-07, "loss": 0.8361, "step": 2570 }, { "epoch": 0.15956128392613705, "grad_norm": 1.832104206085205, "learning_rate": 8.847433305068163e-07, "loss": 0.9231, "step": 2575 }, { "epoch": 0.15987111166191598, "grad_norm": 1.625506043434143, "learning_rate": 8.84417193920814e-07, "loss": 0.8974, "step": 2580 }, { "epoch": 0.16018093939769487, "grad_norm": 1.7356221675872803, "learning_rate": 8.840910573348118e-07, "loss": 0.9886, "step": 2585 }, { "epoch": 0.1604907671334738, "grad_norm": 1.68430495262146, "learning_rate": 8.837649207488096e-07, "loss": 0.8379, "step": 2590 }, { "epoch": 0.1608005948692527, "grad_norm": 2.1019978523254395, "learning_rate": 8.834387841628074e-07, "loss": 0.9223, "step": 2595 }, { "epoch": 0.1611104226050316, "grad_norm": 1.5758109092712402, "learning_rate": 8.831126475768052e-07, "loss": 0.9282, "step": 2600 }, { "epoch": 0.1614202503408105, "grad_norm": 1.653256893157959, "learning_rate": 8.827865109908029e-07, "loss": 0.9194, "step": 2605 }, { "epoch": 0.16173007807658943, "grad_norm": 1.8933441638946533, "learning_rate": 8.824603744048007e-07, "loss": 0.8109, "step": 2610 }, { "epoch": 0.16203990581236832, "grad_norm": 1.598219394683838, "learning_rate": 8.821342378187985e-07, "loss": 0.8735, "step": 2615 }, { "epoch": 0.16234973354814722, "grad_norm": 1.984886646270752, "learning_rate": 8.818081012327962e-07, "loss": 0.9043, "step": 2620 }, { "epoch": 0.16265956128392614, "grad_norm": 1.8361341953277588, "learning_rate": 8.81481964646794e-07, "loss": 0.924, "step": 2625 }, { "epoch": 0.16296938901970504, "grad_norm": 2.189523696899414, "learning_rate": 8.811558280607918e-07, "loss": 0.9196, "step": 2630 }, { "epoch": 0.16327921675548396, "grad_norm": 1.8507121801376343, "learning_rate": 8.808296914747896e-07, "loss": 0.9007, "step": 2635 }, { "epoch": 0.16358904449126285, "grad_norm": 1.3994359970092773, "learning_rate": 8.805035548887874e-07, "loss": 0.9197, "step": 2640 }, { "epoch": 0.16389887222704178, "grad_norm": 1.7774571180343628, "learning_rate": 8.801774183027852e-07, "loss": 0.9358, "step": 2645 }, { "epoch": 0.16420869996282067, "grad_norm": 1.6651573181152344, "learning_rate": 8.798512817167829e-07, "loss": 0.8827, "step": 2650 }, { "epoch": 0.16451852769859956, "grad_norm": 1.9643409252166748, "learning_rate": 8.795251451307807e-07, "loss": 0.9345, "step": 2655 }, { "epoch": 0.1648283554343785, "grad_norm": 1.6999852657318115, "learning_rate": 8.791990085447784e-07, "loss": 0.9127, "step": 2660 }, { "epoch": 0.16513818317015738, "grad_norm": 1.7432353496551514, "learning_rate": 8.788728719587762e-07, "loss": 0.8382, "step": 2665 }, { "epoch": 0.1654480109059363, "grad_norm": 1.6517910957336426, "learning_rate": 8.785467353727742e-07, "loss": 0.8575, "step": 2670 }, { "epoch": 0.1657578386417152, "grad_norm": 1.8148996829986572, "learning_rate": 8.782205987867719e-07, "loss": 0.9001, "step": 2675 }, { "epoch": 0.16606766637749412, "grad_norm": 1.8529027700424194, "learning_rate": 8.778944622007697e-07, "loss": 0.8114, "step": 2680 }, { "epoch": 0.16637749411327302, "grad_norm": 1.8318027257919312, "learning_rate": 8.775683256147675e-07, "loss": 0.8825, "step": 2685 }, { "epoch": 0.16668732184905194, "grad_norm": 1.377173900604248, "learning_rate": 8.772421890287652e-07, "loss": 0.8862, "step": 2690 }, { "epoch": 0.16699714958483083, "grad_norm": 1.5136480331420898, "learning_rate": 8.76916052442763e-07, "loss": 0.7954, "step": 2695 }, { "epoch": 0.16730697732060973, "grad_norm": 2.8623642921447754, "learning_rate": 8.765899158567607e-07, "loss": 0.8518, "step": 2700 }, { "epoch": 0.16761680505638865, "grad_norm": 1.5709458589553833, "learning_rate": 8.762637792707585e-07, "loss": 0.8748, "step": 2705 }, { "epoch": 0.16792663279216755, "grad_norm": 1.7356221675872803, "learning_rate": 8.759376426847564e-07, "loss": 0.868, "step": 2710 }, { "epoch": 0.16823646052794647, "grad_norm": 1.8012226819992065, "learning_rate": 8.756115060987541e-07, "loss": 0.8869, "step": 2715 }, { "epoch": 0.16854628826372536, "grad_norm": 1.468733787536621, "learning_rate": 8.752853695127519e-07, "loss": 0.8653, "step": 2720 }, { "epoch": 0.16885611599950429, "grad_norm": 1.6445187330245972, "learning_rate": 8.749592329267497e-07, "loss": 0.8289, "step": 2725 }, { "epoch": 0.16916594373528318, "grad_norm": 1.4575703144073486, "learning_rate": 8.746330963407474e-07, "loss": 0.8974, "step": 2730 }, { "epoch": 0.1694757714710621, "grad_norm": 1.533862590789795, "learning_rate": 8.743069597547452e-07, "loss": 0.8889, "step": 2735 }, { "epoch": 0.169785599206841, "grad_norm": 1.7456963062286377, "learning_rate": 8.73980823168743e-07, "loss": 0.9019, "step": 2740 }, { "epoch": 0.1700954269426199, "grad_norm": 1.5817877054214478, "learning_rate": 8.736546865827408e-07, "loss": 0.8557, "step": 2745 }, { "epoch": 0.17040525467839882, "grad_norm": 1.639704704284668, "learning_rate": 8.733285499967386e-07, "loss": 0.8845, "step": 2750 }, { "epoch": 0.1707150824141777, "grad_norm": 1.7182782888412476, "learning_rate": 8.730024134107363e-07, "loss": 0.9144, "step": 2755 }, { "epoch": 0.17102491014995663, "grad_norm": 1.5935683250427246, "learning_rate": 8.726762768247342e-07, "loss": 0.8418, "step": 2760 }, { "epoch": 0.17133473788573553, "grad_norm": 1.7659820318222046, "learning_rate": 8.72350140238732e-07, "loss": 0.9062, "step": 2765 }, { "epoch": 0.17164456562151445, "grad_norm": 1.7167587280273438, "learning_rate": 8.720240036527297e-07, "loss": 0.8665, "step": 2770 }, { "epoch": 0.17195439335729334, "grad_norm": 1.583039402961731, "learning_rate": 8.716978670667275e-07, "loss": 0.8633, "step": 2775 }, { "epoch": 0.17226422109307224, "grad_norm": 1.735582709312439, "learning_rate": 8.713717304807254e-07, "loss": 0.8656, "step": 2780 }, { "epoch": 0.17257404882885116, "grad_norm": 1.6231316328048706, "learning_rate": 8.710455938947231e-07, "loss": 0.926, "step": 2785 }, { "epoch": 0.17288387656463006, "grad_norm": 1.6993120908737183, "learning_rate": 8.707194573087209e-07, "loss": 0.8522, "step": 2790 }, { "epoch": 0.17319370430040898, "grad_norm": 1.5529261827468872, "learning_rate": 8.703933207227187e-07, "loss": 0.9532, "step": 2795 }, { "epoch": 0.17350353203618787, "grad_norm": 1.5772162675857544, "learning_rate": 8.700671841367164e-07, "loss": 0.8757, "step": 2800 }, { "epoch": 0.1738133597719668, "grad_norm": 2.3470513820648193, "learning_rate": 8.697410475507142e-07, "loss": 0.8913, "step": 2805 }, { "epoch": 0.1741231875077457, "grad_norm": 1.7428462505340576, "learning_rate": 8.69414910964712e-07, "loss": 0.8706, "step": 2810 }, { "epoch": 0.17443301524352461, "grad_norm": 1.9671584367752075, "learning_rate": 8.690887743787097e-07, "loss": 0.8907, "step": 2815 }, { "epoch": 0.1747428429793035, "grad_norm": 1.7406604290008545, "learning_rate": 8.687626377927076e-07, "loss": 0.842, "step": 2820 }, { "epoch": 0.1750526707150824, "grad_norm": 1.8574985265731812, "learning_rate": 8.684365012067053e-07, "loss": 0.9045, "step": 2825 }, { "epoch": 0.17536249845086133, "grad_norm": 1.93820059299469, "learning_rate": 8.681103646207031e-07, "loss": 0.8575, "step": 2830 }, { "epoch": 0.17567232618664022, "grad_norm": 1.5534261465072632, "learning_rate": 8.677842280347009e-07, "loss": 0.908, "step": 2835 }, { "epoch": 0.17598215392241914, "grad_norm": 1.5217689275741577, "learning_rate": 8.674580914486986e-07, "loss": 0.8404, "step": 2840 }, { "epoch": 0.17629198165819804, "grad_norm": 1.569159746170044, "learning_rate": 8.671319548626964e-07, "loss": 0.8458, "step": 2845 }, { "epoch": 0.17660180939397696, "grad_norm": 1.55222487449646, "learning_rate": 8.668058182766942e-07, "loss": 0.9236, "step": 2850 }, { "epoch": 0.17691163712975586, "grad_norm": 2.203457832336426, "learning_rate": 8.664796816906921e-07, "loss": 0.9126, "step": 2855 }, { "epoch": 0.17722146486553475, "grad_norm": 1.827846646308899, "learning_rate": 8.661535451046899e-07, "loss": 0.838, "step": 2860 }, { "epoch": 0.17753129260131367, "grad_norm": 1.613362193107605, "learning_rate": 8.658274085186877e-07, "loss": 0.8874, "step": 2865 }, { "epoch": 0.17784112033709257, "grad_norm": 1.725216031074524, "learning_rate": 8.655012719326854e-07, "loss": 0.8392, "step": 2870 }, { "epoch": 0.1781509480728715, "grad_norm": 1.8512014150619507, "learning_rate": 8.651751353466832e-07, "loss": 0.8626, "step": 2875 }, { "epoch": 0.17846077580865038, "grad_norm": 1.6962003707885742, "learning_rate": 8.648489987606809e-07, "loss": 0.8103, "step": 2880 }, { "epoch": 0.1787706035444293, "grad_norm": 2.0940189361572266, "learning_rate": 8.645228621746787e-07, "loss": 0.8875, "step": 2885 }, { "epoch": 0.1790804312802082, "grad_norm": 1.6600868701934814, "learning_rate": 8.641967255886765e-07, "loss": 0.8557, "step": 2890 }, { "epoch": 0.17939025901598712, "grad_norm": 1.7399970293045044, "learning_rate": 8.638705890026743e-07, "loss": 0.8693, "step": 2895 }, { "epoch": 0.17970008675176602, "grad_norm": 1.5817426443099976, "learning_rate": 8.635444524166721e-07, "loss": 0.9123, "step": 2900 }, { "epoch": 0.18000991448754491, "grad_norm": 1.7216002941131592, "learning_rate": 8.632183158306699e-07, "loss": 0.833, "step": 2905 }, { "epoch": 0.18031974222332384, "grad_norm": 1.5682761669158936, "learning_rate": 8.628921792446676e-07, "loss": 0.9304, "step": 2910 }, { "epoch": 0.18062956995910273, "grad_norm": 1.6870499849319458, "learning_rate": 8.625660426586654e-07, "loss": 0.9018, "step": 2915 }, { "epoch": 0.18093939769488165, "grad_norm": 1.5643374919891357, "learning_rate": 8.622399060726631e-07, "loss": 0.8688, "step": 2920 }, { "epoch": 0.18124922543066055, "grad_norm": 1.6821422576904297, "learning_rate": 8.619137694866609e-07, "loss": 0.891, "step": 2925 }, { "epoch": 0.18155905316643947, "grad_norm": 1.5960339307785034, "learning_rate": 8.615876329006588e-07, "loss": 0.8632, "step": 2930 }, { "epoch": 0.18186888090221837, "grad_norm": 1.6109862327575684, "learning_rate": 8.612614963146565e-07, "loss": 0.8239, "step": 2935 }, { "epoch": 0.18217870863799726, "grad_norm": 1.8320916891098022, "learning_rate": 8.609353597286543e-07, "loss": 0.9095, "step": 2940 }, { "epoch": 0.18248853637377618, "grad_norm": 1.7148616313934326, "learning_rate": 8.606092231426521e-07, "loss": 0.8297, "step": 2945 }, { "epoch": 0.18279836410955508, "grad_norm": 1.906317949295044, "learning_rate": 8.602830865566499e-07, "loss": 0.931, "step": 2950 }, { "epoch": 0.183108191845334, "grad_norm": 2.001556873321533, "learning_rate": 8.599569499706477e-07, "loss": 0.9039, "step": 2955 }, { "epoch": 0.1834180195811129, "grad_norm": 1.7104860544204712, "learning_rate": 8.596308133846455e-07, "loss": 0.8605, "step": 2960 }, { "epoch": 0.18372784731689182, "grad_norm": 1.780661702156067, "learning_rate": 8.593046767986432e-07, "loss": 0.9076, "step": 2965 }, { "epoch": 0.1840376750526707, "grad_norm": 1.506719708442688, "learning_rate": 8.589785402126411e-07, "loss": 0.9201, "step": 2970 }, { "epoch": 0.18434750278844964, "grad_norm": 1.9422332048416138, "learning_rate": 8.586524036266388e-07, "loss": 0.9037, "step": 2975 }, { "epoch": 0.18465733052422853, "grad_norm": 1.6526036262512207, "learning_rate": 8.583262670406366e-07, "loss": 0.9041, "step": 2980 }, { "epoch": 0.18496715826000742, "grad_norm": 2.1525230407714844, "learning_rate": 8.580001304546344e-07, "loss": 0.8977, "step": 2985 }, { "epoch": 0.18527698599578635, "grad_norm": 1.584956169128418, "learning_rate": 8.576739938686321e-07, "loss": 0.8981, "step": 2990 }, { "epoch": 0.18558681373156524, "grad_norm": 1.7526774406433105, "learning_rate": 8.573478572826299e-07, "loss": 0.8864, "step": 2995 }, { "epoch": 0.18589664146734416, "grad_norm": 1.9316047430038452, "learning_rate": 8.570217206966277e-07, "loss": 0.8252, "step": 3000 }, { "epoch": 0.18620646920312306, "grad_norm": 1.5757694244384766, "learning_rate": 8.566955841106255e-07, "loss": 0.9097, "step": 3005 }, { "epoch": 0.18651629693890198, "grad_norm": 1.6261025667190552, "learning_rate": 8.563694475246233e-07, "loss": 0.8554, "step": 3010 }, { "epoch": 0.18682612467468088, "grad_norm": 1.6043803691864014, "learning_rate": 8.560433109386211e-07, "loss": 0.8563, "step": 3015 }, { "epoch": 0.18713595241045977, "grad_norm": 1.841296911239624, "learning_rate": 8.557171743526188e-07, "loss": 0.8513, "step": 3020 }, { "epoch": 0.1874457801462387, "grad_norm": 1.5441113710403442, "learning_rate": 8.553910377666166e-07, "loss": 0.8632, "step": 3025 }, { "epoch": 0.1877556078820176, "grad_norm": 1.8540582656860352, "learning_rate": 8.550649011806143e-07, "loss": 0.8989, "step": 3030 }, { "epoch": 0.1880654356177965, "grad_norm": 1.5532974004745483, "learning_rate": 8.547387645946121e-07, "loss": 0.8992, "step": 3035 }, { "epoch": 0.1883752633535754, "grad_norm": 1.6945788860321045, "learning_rate": 8.544126280086099e-07, "loss": 0.8121, "step": 3040 }, { "epoch": 0.18868509108935433, "grad_norm": 1.784133791923523, "learning_rate": 8.540864914226078e-07, "loss": 0.8835, "step": 3045 }, { "epoch": 0.18899491882513322, "grad_norm": 1.6985909938812256, "learning_rate": 8.537603548366056e-07, "loss": 0.8744, "step": 3050 }, { "epoch": 0.18930474656091215, "grad_norm": 1.3742461204528809, "learning_rate": 8.534342182506034e-07, "loss": 0.8719, "step": 3055 }, { "epoch": 0.18961457429669104, "grad_norm": 2.12265944480896, "learning_rate": 8.531080816646011e-07, "loss": 0.9153, "step": 3060 }, { "epoch": 0.18992440203246994, "grad_norm": 2.3105406761169434, "learning_rate": 8.527819450785989e-07, "loss": 0.8735, "step": 3065 }, { "epoch": 0.19023422976824886, "grad_norm": 1.622420072555542, "learning_rate": 8.524558084925967e-07, "loss": 0.8305, "step": 3070 }, { "epoch": 0.19054405750402775, "grad_norm": 1.7302261590957642, "learning_rate": 8.521296719065944e-07, "loss": 0.897, "step": 3075 }, { "epoch": 0.19085388523980668, "grad_norm": 1.728983998298645, "learning_rate": 8.518035353205923e-07, "loss": 0.8977, "step": 3080 }, { "epoch": 0.19116371297558557, "grad_norm": 1.8491730690002441, "learning_rate": 8.5147739873459e-07, "loss": 0.9098, "step": 3085 }, { "epoch": 0.1914735407113645, "grad_norm": 1.8250237703323364, "learning_rate": 8.511512621485878e-07, "loss": 0.954, "step": 3090 }, { "epoch": 0.1917833684471434, "grad_norm": 1.8419411182403564, "learning_rate": 8.508251255625856e-07, "loss": 0.9011, "step": 3095 }, { "epoch": 0.19209319618292228, "grad_norm": 1.842993974685669, "learning_rate": 8.504989889765833e-07, "loss": 0.9043, "step": 3100 }, { "epoch": 0.1924030239187012, "grad_norm": 1.5032520294189453, "learning_rate": 8.501728523905811e-07, "loss": 0.8789, "step": 3105 }, { "epoch": 0.1927128516544801, "grad_norm": 1.5424550771713257, "learning_rate": 8.498467158045789e-07, "loss": 0.9045, "step": 3110 }, { "epoch": 0.19302267939025902, "grad_norm": 1.6614787578582764, "learning_rate": 8.495205792185766e-07, "loss": 0.9086, "step": 3115 }, { "epoch": 0.19333250712603792, "grad_norm": 1.7023537158966064, "learning_rate": 8.491944426325745e-07, "loss": 0.8606, "step": 3120 }, { "epoch": 0.19364233486181684, "grad_norm": 1.56436026096344, "learning_rate": 8.488683060465723e-07, "loss": 0.9364, "step": 3125 }, { "epoch": 0.19395216259759573, "grad_norm": 1.7116961479187012, "learning_rate": 8.4854216946057e-07, "loss": 0.8756, "step": 3130 }, { "epoch": 0.19426199033337466, "grad_norm": 1.6390912532806396, "learning_rate": 8.482160328745678e-07, "loss": 0.8919, "step": 3135 }, { "epoch": 0.19457181806915355, "grad_norm": 1.707128643989563, "learning_rate": 8.478898962885656e-07, "loss": 0.9, "step": 3140 }, { "epoch": 0.19488164580493245, "grad_norm": 1.6477634906768799, "learning_rate": 8.475637597025634e-07, "loss": 0.8867, "step": 3145 }, { "epoch": 0.19519147354071137, "grad_norm": 1.597091794013977, "learning_rate": 8.472376231165612e-07, "loss": 0.873, "step": 3150 }, { "epoch": 0.19550130127649026, "grad_norm": 1.8200145959854126, "learning_rate": 8.46911486530559e-07, "loss": 0.8422, "step": 3155 }, { "epoch": 0.19581112901226919, "grad_norm": 1.6798702478408813, "learning_rate": 8.465853499445568e-07, "loss": 0.9489, "step": 3160 }, { "epoch": 0.19612095674804808, "grad_norm": 1.711710810661316, "learning_rate": 8.462592133585546e-07, "loss": 0.8832, "step": 3165 }, { "epoch": 0.196430784483827, "grad_norm": 1.7799817323684692, "learning_rate": 8.459330767725523e-07, "loss": 0.8613, "step": 3170 }, { "epoch": 0.1967406122196059, "grad_norm": 1.5995440483093262, "learning_rate": 8.456069401865501e-07, "loss": 0.9031, "step": 3175 }, { "epoch": 0.1970504399553848, "grad_norm": 2.1145899295806885, "learning_rate": 8.452808036005479e-07, "loss": 0.9903, "step": 3180 }, { "epoch": 0.19736026769116372, "grad_norm": 1.9350347518920898, "learning_rate": 8.449546670145456e-07, "loss": 0.9141, "step": 3185 }, { "epoch": 0.1976700954269426, "grad_norm": 1.698695421218872, "learning_rate": 8.446285304285434e-07, "loss": 0.8778, "step": 3190 }, { "epoch": 0.19797992316272153, "grad_norm": 1.6731842756271362, "learning_rate": 8.443023938425412e-07, "loss": 0.8773, "step": 3195 }, { "epoch": 0.19828975089850043, "grad_norm": 2.231815814971924, "learning_rate": 8.43976257256539e-07, "loss": 0.8318, "step": 3200 }, { "epoch": 0.19859957863427935, "grad_norm": 1.6079226732254028, "learning_rate": 8.436501206705368e-07, "loss": 0.8911, "step": 3205 }, { "epoch": 0.19890940637005824, "grad_norm": 1.6540980339050293, "learning_rate": 8.433239840845345e-07, "loss": 0.9327, "step": 3210 }, { "epoch": 0.19921923410583717, "grad_norm": 1.6728479862213135, "learning_rate": 8.429978474985323e-07, "loss": 0.9077, "step": 3215 }, { "epoch": 0.19952906184161606, "grad_norm": 1.5520535707473755, "learning_rate": 8.426717109125301e-07, "loss": 0.9041, "step": 3220 }, { "epoch": 0.19983888957739496, "grad_norm": 1.748389482498169, "learning_rate": 8.423455743265278e-07, "loss": 0.8413, "step": 3225 }, { "epoch": 0.20014871731317388, "grad_norm": 1.5351086854934692, "learning_rate": 8.420194377405257e-07, "loss": 0.8737, "step": 3230 }, { "epoch": 0.20045854504895277, "grad_norm": 1.6461067199707031, "learning_rate": 8.416933011545236e-07, "loss": 0.9199, "step": 3235 }, { "epoch": 0.2007683727847317, "grad_norm": 1.8298726081848145, "learning_rate": 8.413671645685213e-07, "loss": 0.812, "step": 3240 }, { "epoch": 0.2010782005205106, "grad_norm": 1.7023096084594727, "learning_rate": 8.410410279825191e-07, "loss": 0.8549, "step": 3245 }, { "epoch": 0.20138802825628951, "grad_norm": 1.9045597314834595, "learning_rate": 8.407148913965168e-07, "loss": 0.8839, "step": 3250 }, { "epoch": 0.2016978559920684, "grad_norm": 1.6992030143737793, "learning_rate": 8.403887548105146e-07, "loss": 0.8938, "step": 3255 }, { "epoch": 0.2020076837278473, "grad_norm": 3.43825101852417, "learning_rate": 8.400626182245124e-07, "loss": 0.9059, "step": 3260 }, { "epoch": 0.20231751146362623, "grad_norm": 1.6924164295196533, "learning_rate": 8.397364816385102e-07, "loss": 0.9307, "step": 3265 }, { "epoch": 0.20262733919940512, "grad_norm": 1.7959076166152954, "learning_rate": 8.39410345052508e-07, "loss": 0.7855, "step": 3270 }, { "epoch": 0.20293716693518404, "grad_norm": 1.821807622909546, "learning_rate": 8.390842084665058e-07, "loss": 0.9153, "step": 3275 }, { "epoch": 0.20324699467096294, "grad_norm": 1.7353099584579468, "learning_rate": 8.387580718805035e-07, "loss": 0.9483, "step": 3280 }, { "epoch": 0.20355682240674186, "grad_norm": 1.663022756576538, "learning_rate": 8.384319352945013e-07, "loss": 0.9057, "step": 3285 }, { "epoch": 0.20386665014252076, "grad_norm": 1.5963541269302368, "learning_rate": 8.38105798708499e-07, "loss": 0.8573, "step": 3290 }, { "epoch": 0.20417647787829968, "grad_norm": 1.5860458612442017, "learning_rate": 8.377796621224968e-07, "loss": 0.9449, "step": 3295 }, { "epoch": 0.20448630561407857, "grad_norm": 1.6798171997070312, "learning_rate": 8.374535255364946e-07, "loss": 0.9197, "step": 3300 }, { "epoch": 0.20479613334985747, "grad_norm": 1.4379969835281372, "learning_rate": 8.371273889504924e-07, "loss": 0.9222, "step": 3305 }, { "epoch": 0.2051059610856364, "grad_norm": 1.7339801788330078, "learning_rate": 8.368012523644902e-07, "loss": 0.849, "step": 3310 }, { "epoch": 0.20541578882141528, "grad_norm": 1.7399816513061523, "learning_rate": 8.36475115778488e-07, "loss": 0.9029, "step": 3315 }, { "epoch": 0.2057256165571942, "grad_norm": 1.5281726121902466, "learning_rate": 8.361489791924857e-07, "loss": 0.957, "step": 3320 }, { "epoch": 0.2060354442929731, "grad_norm": 1.6568851470947266, "learning_rate": 8.358228426064835e-07, "loss": 0.9206, "step": 3325 }, { "epoch": 0.20634527202875202, "grad_norm": 1.5970878601074219, "learning_rate": 8.354967060204814e-07, "loss": 0.8878, "step": 3330 }, { "epoch": 0.20665509976453092, "grad_norm": 1.7588523626327515, "learning_rate": 8.351705694344791e-07, "loss": 0.88, "step": 3335 }, { "epoch": 0.20696492750030981, "grad_norm": 1.7732914686203003, "learning_rate": 8.34844432848477e-07, "loss": 0.8294, "step": 3340 }, { "epoch": 0.20727475523608874, "grad_norm": 1.7227309942245483, "learning_rate": 8.345182962624748e-07, "loss": 0.8807, "step": 3345 }, { "epoch": 0.20758458297186763, "grad_norm": 1.453580617904663, "learning_rate": 8.341921596764725e-07, "loss": 0.8652, "step": 3350 }, { "epoch": 0.20789441070764655, "grad_norm": 1.8358467817306519, "learning_rate": 8.338660230904703e-07, "loss": 0.8734, "step": 3355 }, { "epoch": 0.20820423844342545, "grad_norm": 1.518520712852478, "learning_rate": 8.33539886504468e-07, "loss": 0.8914, "step": 3360 }, { "epoch": 0.20851406617920437, "grad_norm": 1.567660927772522, "learning_rate": 8.332137499184658e-07, "loss": 0.8937, "step": 3365 }, { "epoch": 0.20882389391498327, "grad_norm": 1.6854887008666992, "learning_rate": 8.328876133324636e-07, "loss": 0.8989, "step": 3370 }, { "epoch": 0.2091337216507622, "grad_norm": 1.695876121520996, "learning_rate": 8.325614767464613e-07, "loss": 0.8863, "step": 3375 }, { "epoch": 0.20944354938654108, "grad_norm": 1.7168704271316528, "learning_rate": 8.322353401604592e-07, "loss": 0.8209, "step": 3380 }, { "epoch": 0.20975337712231998, "grad_norm": 1.8898203372955322, "learning_rate": 8.31909203574457e-07, "loss": 0.8577, "step": 3385 }, { "epoch": 0.2100632048580989, "grad_norm": 1.4779143333435059, "learning_rate": 8.315830669884547e-07, "loss": 0.8766, "step": 3390 }, { "epoch": 0.2103730325938778, "grad_norm": 1.6724709272384644, "learning_rate": 8.312569304024525e-07, "loss": 0.8918, "step": 3395 }, { "epoch": 0.21068286032965672, "grad_norm": 2.0512547492980957, "learning_rate": 8.309307938164502e-07, "loss": 0.8843, "step": 3400 }, { "epoch": 0.2109926880654356, "grad_norm": 1.6262348890304565, "learning_rate": 8.30604657230448e-07, "loss": 0.8838, "step": 3405 }, { "epoch": 0.21130251580121454, "grad_norm": 1.6287237405776978, "learning_rate": 8.302785206444458e-07, "loss": 0.9209, "step": 3410 }, { "epoch": 0.21161234353699343, "grad_norm": 1.5955452919006348, "learning_rate": 8.299523840584436e-07, "loss": 0.8372, "step": 3415 }, { "epoch": 0.21192217127277233, "grad_norm": 1.4782408475875854, "learning_rate": 8.296262474724415e-07, "loss": 0.86, "step": 3420 }, { "epoch": 0.21223199900855125, "grad_norm": 1.549429178237915, "learning_rate": 8.293001108864393e-07, "loss": 0.8292, "step": 3425 }, { "epoch": 0.21254182674433014, "grad_norm": 1.931067705154419, "learning_rate": 8.28973974300437e-07, "loss": 0.9517, "step": 3430 }, { "epoch": 0.21285165448010906, "grad_norm": 1.7377322912216187, "learning_rate": 8.286478377144348e-07, "loss": 0.8902, "step": 3435 }, { "epoch": 0.21316148221588796, "grad_norm": 1.6647354364395142, "learning_rate": 8.283217011284326e-07, "loss": 0.901, "step": 3440 }, { "epoch": 0.21347130995166688, "grad_norm": 1.7499667406082153, "learning_rate": 8.279955645424303e-07, "loss": 0.8853, "step": 3445 }, { "epoch": 0.21378113768744578, "grad_norm": 1.7441155910491943, "learning_rate": 8.276694279564281e-07, "loss": 0.9037, "step": 3450 }, { "epoch": 0.2140909654232247, "grad_norm": 1.8543241024017334, "learning_rate": 8.27343291370426e-07, "loss": 0.907, "step": 3455 }, { "epoch": 0.2144007931590036, "grad_norm": 1.959153652191162, "learning_rate": 8.270171547844237e-07, "loss": 0.9103, "step": 3460 }, { "epoch": 0.2147106208947825, "grad_norm": 1.7549216747283936, "learning_rate": 8.266910181984215e-07, "loss": 0.8754, "step": 3465 }, { "epoch": 0.2150204486305614, "grad_norm": 1.558326005935669, "learning_rate": 8.263648816124192e-07, "loss": 0.8493, "step": 3470 }, { "epoch": 0.2153302763663403, "grad_norm": 1.6230164766311646, "learning_rate": 8.26038745026417e-07, "loss": 0.8848, "step": 3475 }, { "epoch": 0.21564010410211923, "grad_norm": 1.6868021488189697, "learning_rate": 8.257126084404148e-07, "loss": 0.8965, "step": 3480 }, { "epoch": 0.21594993183789812, "grad_norm": 1.6420307159423828, "learning_rate": 8.253864718544125e-07, "loss": 0.8861, "step": 3485 }, { "epoch": 0.21625975957367705, "grad_norm": 1.566799283027649, "learning_rate": 8.250603352684104e-07, "loss": 0.9449, "step": 3490 }, { "epoch": 0.21656958730945594, "grad_norm": 1.600649118423462, "learning_rate": 8.247341986824082e-07, "loss": 0.888, "step": 3495 }, { "epoch": 0.21687941504523484, "grad_norm": 1.6971701383590698, "learning_rate": 8.244080620964059e-07, "loss": 0.8529, "step": 3500 }, { "epoch": 0.21718924278101376, "grad_norm": 1.9826794862747192, "learning_rate": 8.240819255104037e-07, "loss": 0.8695, "step": 3505 }, { "epoch": 0.21749907051679265, "grad_norm": 1.6800670623779297, "learning_rate": 8.237557889244014e-07, "loss": 0.7742, "step": 3510 }, { "epoch": 0.21780889825257158, "grad_norm": 1.5386335849761963, "learning_rate": 8.234296523383993e-07, "loss": 0.8487, "step": 3515 }, { "epoch": 0.21811872598835047, "grad_norm": 1.6531599760055542, "learning_rate": 8.231035157523971e-07, "loss": 0.8793, "step": 3520 }, { "epoch": 0.2184285537241294, "grad_norm": 1.799876093864441, "learning_rate": 8.227773791663948e-07, "loss": 0.8697, "step": 3525 }, { "epoch": 0.2187383814599083, "grad_norm": 1.6782712936401367, "learning_rate": 8.224512425803927e-07, "loss": 0.8939, "step": 3530 }, { "epoch": 0.2190482091956872, "grad_norm": 2.4382708072662354, "learning_rate": 8.221251059943905e-07, "loss": 0.864, "step": 3535 }, { "epoch": 0.2193580369314661, "grad_norm": 2.2319979667663574, "learning_rate": 8.217989694083882e-07, "loss": 0.8123, "step": 3540 }, { "epoch": 0.219667864667245, "grad_norm": 1.7036552429199219, "learning_rate": 8.21472832822386e-07, "loss": 0.8501, "step": 3545 }, { "epoch": 0.21997769240302392, "grad_norm": 1.7685638666152954, "learning_rate": 8.211466962363838e-07, "loss": 0.8892, "step": 3550 }, { "epoch": 0.22028752013880282, "grad_norm": 1.7693499326705933, "learning_rate": 8.208205596503815e-07, "loss": 0.8668, "step": 3555 }, { "epoch": 0.22059734787458174, "grad_norm": 1.4913221597671509, "learning_rate": 8.204944230643793e-07, "loss": 0.8425, "step": 3560 }, { "epoch": 0.22090717561036063, "grad_norm": 1.5285090208053589, "learning_rate": 8.201682864783772e-07, "loss": 0.9035, "step": 3565 }, { "epoch": 0.22121700334613956, "grad_norm": 1.7740925550460815, "learning_rate": 8.198421498923749e-07, "loss": 0.8547, "step": 3570 }, { "epoch": 0.22152683108191845, "grad_norm": 1.5741046667099, "learning_rate": 8.195160133063727e-07, "loss": 0.8961, "step": 3575 }, { "epoch": 0.22183665881769735, "grad_norm": 1.6972975730895996, "learning_rate": 8.191898767203704e-07, "loss": 0.8487, "step": 3580 }, { "epoch": 0.22214648655347627, "grad_norm": 1.5088860988616943, "learning_rate": 8.188637401343682e-07, "loss": 0.885, "step": 3585 }, { "epoch": 0.22245631428925516, "grad_norm": 1.3877218961715698, "learning_rate": 8.18537603548366e-07, "loss": 0.9109, "step": 3590 }, { "epoch": 0.2227661420250341, "grad_norm": 1.6148484945297241, "learning_rate": 8.182114669623637e-07, "loss": 0.8623, "step": 3595 }, { "epoch": 0.22307596976081298, "grad_norm": 1.6244674921035767, "learning_rate": 8.178853303763615e-07, "loss": 0.9097, "step": 3600 }, { "epoch": 0.2233857974965919, "grad_norm": 1.5597447156906128, "learning_rate": 8.175591937903594e-07, "loss": 0.8054, "step": 3605 }, { "epoch": 0.2236956252323708, "grad_norm": 1.4161626100540161, "learning_rate": 8.172330572043572e-07, "loss": 0.851, "step": 3610 }, { "epoch": 0.22400545296814972, "grad_norm": 2.051823616027832, "learning_rate": 8.16906920618355e-07, "loss": 0.9192, "step": 3615 }, { "epoch": 0.22431528070392862, "grad_norm": 2.054884195327759, "learning_rate": 8.165807840323527e-07, "loss": 0.8542, "step": 3620 }, { "epoch": 0.2246251084397075, "grad_norm": 1.8336155414581299, "learning_rate": 8.162546474463505e-07, "loss": 0.9063, "step": 3625 }, { "epoch": 0.22493493617548643, "grad_norm": 1.5155121088027954, "learning_rate": 8.159285108603483e-07, "loss": 0.8841, "step": 3630 }, { "epoch": 0.22524476391126533, "grad_norm": 1.6424775123596191, "learning_rate": 8.15602374274346e-07, "loss": 0.8638, "step": 3635 }, { "epoch": 0.22555459164704425, "grad_norm": 1.9094820022583008, "learning_rate": 8.152762376883439e-07, "loss": 0.8612, "step": 3640 }, { "epoch": 0.22586441938282315, "grad_norm": 1.612286925315857, "learning_rate": 8.149501011023417e-07, "loss": 0.838, "step": 3645 }, { "epoch": 0.22617424711860207, "grad_norm": 1.432806372642517, "learning_rate": 8.146239645163394e-07, "loss": 0.8868, "step": 3650 }, { "epoch": 0.22648407485438096, "grad_norm": 1.5282669067382812, "learning_rate": 8.142978279303372e-07, "loss": 0.9199, "step": 3655 }, { "epoch": 0.22679390259015988, "grad_norm": 2.2684273719787598, "learning_rate": 8.13971691344335e-07, "loss": 0.8565, "step": 3660 }, { "epoch": 0.22710373032593878, "grad_norm": 1.5244642496109009, "learning_rate": 8.136455547583327e-07, "loss": 0.9277, "step": 3665 }, { "epoch": 0.22741355806171767, "grad_norm": 1.7602314949035645, "learning_rate": 8.133194181723305e-07, "loss": 0.9052, "step": 3670 }, { "epoch": 0.2277233857974966, "grad_norm": 1.4983603954315186, "learning_rate": 8.129932815863282e-07, "loss": 0.8856, "step": 3675 }, { "epoch": 0.2280332135332755, "grad_norm": 1.644301414489746, "learning_rate": 8.126671450003261e-07, "loss": 0.8627, "step": 3680 }, { "epoch": 0.22834304126905441, "grad_norm": 1.6231365203857422, "learning_rate": 8.123410084143239e-07, "loss": 0.812, "step": 3685 }, { "epoch": 0.2286528690048333, "grad_norm": 1.7780460119247437, "learning_rate": 8.120148718283216e-07, "loss": 0.847, "step": 3690 }, { "epoch": 0.22896269674061223, "grad_norm": 1.7910109758377075, "learning_rate": 8.116887352423194e-07, "loss": 0.8891, "step": 3695 }, { "epoch": 0.22927252447639113, "grad_norm": 1.6662029027938843, "learning_rate": 8.113625986563172e-07, "loss": 0.8353, "step": 3700 }, { "epoch": 0.22958235221217002, "grad_norm": 1.6587656736373901, "learning_rate": 8.11036462070315e-07, "loss": 0.8465, "step": 3705 }, { "epoch": 0.22989217994794894, "grad_norm": 2.0601646900177, "learning_rate": 8.107103254843128e-07, "loss": 0.878, "step": 3710 }, { "epoch": 0.23020200768372784, "grad_norm": 1.72756028175354, "learning_rate": 8.103841888983107e-07, "loss": 0.8836, "step": 3715 }, { "epoch": 0.23051183541950676, "grad_norm": 1.8420205116271973, "learning_rate": 8.100580523123084e-07, "loss": 0.8999, "step": 3720 }, { "epoch": 0.23082166315528566, "grad_norm": 1.699876070022583, "learning_rate": 8.097319157263062e-07, "loss": 0.8708, "step": 3725 }, { "epoch": 0.23113149089106458, "grad_norm": 1.6713570356369019, "learning_rate": 8.09405779140304e-07, "loss": 0.8726, "step": 3730 }, { "epoch": 0.23144131862684347, "grad_norm": 1.9066879749298096, "learning_rate": 8.090796425543017e-07, "loss": 0.8514, "step": 3735 }, { "epoch": 0.2317511463626224, "grad_norm": 1.5943742990493774, "learning_rate": 8.087535059682995e-07, "loss": 0.9404, "step": 3740 }, { "epoch": 0.2320609740984013, "grad_norm": 2.305894613265991, "learning_rate": 8.084273693822972e-07, "loss": 0.898, "step": 3745 }, { "epoch": 0.23237080183418019, "grad_norm": 1.8963289260864258, "learning_rate": 8.081012327962951e-07, "loss": 0.9521, "step": 3750 }, { "epoch": 0.2326806295699591, "grad_norm": 1.4298152923583984, "learning_rate": 8.077750962102929e-07, "loss": 0.8429, "step": 3755 }, { "epoch": 0.232990457305738, "grad_norm": 1.6979072093963623, "learning_rate": 8.074489596242906e-07, "loss": 0.8611, "step": 3760 }, { "epoch": 0.23330028504151692, "grad_norm": 1.9631580114364624, "learning_rate": 8.071228230382884e-07, "loss": 0.8608, "step": 3765 }, { "epoch": 0.23361011277729582, "grad_norm": 1.4495254755020142, "learning_rate": 8.067966864522862e-07, "loss": 0.8849, "step": 3770 }, { "epoch": 0.23391994051307474, "grad_norm": 1.757230520248413, "learning_rate": 8.064705498662839e-07, "loss": 0.8692, "step": 3775 }, { "epoch": 0.23422976824885364, "grad_norm": 1.6763927936553955, "learning_rate": 8.061444132802817e-07, "loss": 0.8947, "step": 3780 }, { "epoch": 0.23453959598463253, "grad_norm": 1.8902605772018433, "learning_rate": 8.058182766942794e-07, "loss": 0.919, "step": 3785 }, { "epoch": 0.23484942372041145, "grad_norm": 1.845036506652832, "learning_rate": 8.054921401082773e-07, "loss": 0.8596, "step": 3790 }, { "epoch": 0.23515925145619035, "grad_norm": 1.8816099166870117, "learning_rate": 8.051660035222751e-07, "loss": 0.8598, "step": 3795 }, { "epoch": 0.23546907919196927, "grad_norm": 1.4726903438568115, "learning_rate": 8.048398669362729e-07, "loss": 0.8747, "step": 3800 }, { "epoch": 0.23577890692774817, "grad_norm": 1.8120406866073608, "learning_rate": 8.045137303502707e-07, "loss": 0.915, "step": 3805 }, { "epoch": 0.2360887346635271, "grad_norm": 1.5189944505691528, "learning_rate": 8.041875937642685e-07, "loss": 0.8593, "step": 3810 }, { "epoch": 0.23639856239930598, "grad_norm": 1.559465765953064, "learning_rate": 8.038614571782662e-07, "loss": 0.8758, "step": 3815 }, { "epoch": 0.2367083901350849, "grad_norm": 1.8431578874588013, "learning_rate": 8.03535320592264e-07, "loss": 0.8659, "step": 3820 }, { "epoch": 0.2370182178708638, "grad_norm": 1.6293855905532837, "learning_rate": 8.032091840062619e-07, "loss": 0.8549, "step": 3825 }, { "epoch": 0.2373280456066427, "grad_norm": 1.782014012336731, "learning_rate": 8.028830474202596e-07, "loss": 0.8943, "step": 3830 }, { "epoch": 0.23763787334242162, "grad_norm": 1.697433352470398, "learning_rate": 8.025569108342574e-07, "loss": 0.8605, "step": 3835 }, { "epoch": 0.2379477010782005, "grad_norm": 1.8514823913574219, "learning_rate": 8.022307742482551e-07, "loss": 0.9609, "step": 3840 }, { "epoch": 0.23825752881397944, "grad_norm": 2.3487794399261475, "learning_rate": 8.019046376622529e-07, "loss": 0.9164, "step": 3845 }, { "epoch": 0.23856735654975833, "grad_norm": 1.677359938621521, "learning_rate": 8.015785010762507e-07, "loss": 0.8896, "step": 3850 }, { "epoch": 0.23887718428553725, "grad_norm": 1.7314691543579102, "learning_rate": 8.012523644902484e-07, "loss": 0.8742, "step": 3855 }, { "epoch": 0.23918701202131615, "grad_norm": 1.6956710815429688, "learning_rate": 8.009262279042462e-07, "loss": 0.8742, "step": 3860 }, { "epoch": 0.23949683975709504, "grad_norm": 1.6402922868728638, "learning_rate": 8.006000913182441e-07, "loss": 0.8702, "step": 3865 }, { "epoch": 0.23980666749287396, "grad_norm": 1.759209156036377, "learning_rate": 8.002739547322418e-07, "loss": 0.8933, "step": 3870 }, { "epoch": 0.24011649522865286, "grad_norm": 1.7304962873458862, "learning_rate": 7.999478181462396e-07, "loss": 0.8267, "step": 3875 }, { "epoch": 0.24042632296443178, "grad_norm": 1.7275751829147339, "learning_rate": 7.996216815602374e-07, "loss": 0.8752, "step": 3880 }, { "epoch": 0.24073615070021068, "grad_norm": 1.5074831247329712, "learning_rate": 7.992955449742351e-07, "loss": 0.8636, "step": 3885 }, { "epoch": 0.2410459784359896, "grad_norm": 1.5937615633010864, "learning_rate": 7.989694083882329e-07, "loss": 0.8697, "step": 3890 }, { "epoch": 0.2413558061717685, "grad_norm": 1.8789658546447754, "learning_rate": 7.986432718022307e-07, "loss": 0.8586, "step": 3895 }, { "epoch": 0.24166563390754742, "grad_norm": 1.3627716302871704, "learning_rate": 7.983171352162286e-07, "loss": 0.8499, "step": 3900 }, { "epoch": 0.2419754616433263, "grad_norm": 1.705570101737976, "learning_rate": 7.979909986302264e-07, "loss": 0.8606, "step": 3905 }, { "epoch": 0.2422852893791052, "grad_norm": 1.6027207374572754, "learning_rate": 7.976648620442241e-07, "loss": 0.8109, "step": 3910 }, { "epoch": 0.24259511711488413, "grad_norm": 1.6315442323684692, "learning_rate": 7.973387254582219e-07, "loss": 0.9034, "step": 3915 }, { "epoch": 0.24290494485066302, "grad_norm": 1.5114508867263794, "learning_rate": 7.970125888722197e-07, "loss": 0.8671, "step": 3920 }, { "epoch": 0.24321477258644195, "grad_norm": 1.6067967414855957, "learning_rate": 7.966864522862174e-07, "loss": 0.851, "step": 3925 }, { "epoch": 0.24352460032222084, "grad_norm": 1.4409738779067993, "learning_rate": 7.963603157002152e-07, "loss": 0.9446, "step": 3930 }, { "epoch": 0.24383442805799976, "grad_norm": 1.6934583187103271, "learning_rate": 7.96034179114213e-07, "loss": 0.8608, "step": 3935 }, { "epoch": 0.24414425579377866, "grad_norm": 1.6626569032669067, "learning_rate": 7.957080425282108e-07, "loss": 0.8624, "step": 3940 }, { "epoch": 0.24445408352955755, "grad_norm": 1.4918955564498901, "learning_rate": 7.953819059422086e-07, "loss": 0.877, "step": 3945 }, { "epoch": 0.24476391126533648, "grad_norm": 1.6465414762496948, "learning_rate": 7.950557693562063e-07, "loss": 0.9115, "step": 3950 }, { "epoch": 0.24507373900111537, "grad_norm": 1.8472356796264648, "learning_rate": 7.947296327702041e-07, "loss": 0.8736, "step": 3955 }, { "epoch": 0.2453835667368943, "grad_norm": 1.8087271451950073, "learning_rate": 7.944034961842019e-07, "loss": 0.9183, "step": 3960 }, { "epoch": 0.2456933944726732, "grad_norm": 1.689376711845398, "learning_rate": 7.940773595981996e-07, "loss": 0.9199, "step": 3965 }, { "epoch": 0.2460032222084521, "grad_norm": 1.834572672843933, "learning_rate": 7.937512230121974e-07, "loss": 0.9044, "step": 3970 }, { "epoch": 0.246313049944231, "grad_norm": 1.9808807373046875, "learning_rate": 7.934250864261953e-07, "loss": 0.8921, "step": 3975 }, { "epoch": 0.24662287768000993, "grad_norm": 1.6560630798339844, "learning_rate": 7.93098949840193e-07, "loss": 0.8892, "step": 3980 }, { "epoch": 0.24693270541578882, "grad_norm": 1.7784051895141602, "learning_rate": 7.927728132541908e-07, "loss": 0.8517, "step": 3985 }, { "epoch": 0.24724253315156772, "grad_norm": 1.834373116493225, "learning_rate": 7.924466766681887e-07, "loss": 0.8824, "step": 3990 }, { "epoch": 0.24755236088734664, "grad_norm": 1.557916522026062, "learning_rate": 7.921205400821864e-07, "loss": 0.8773, "step": 3995 }, { "epoch": 0.24786218862312553, "grad_norm": 1.7716026306152344, "learning_rate": 7.917944034961842e-07, "loss": 0.8326, "step": 4000 }, { "epoch": 0.24817201635890446, "grad_norm": 1.6685853004455566, "learning_rate": 7.914682669101819e-07, "loss": 0.8784, "step": 4005 }, { "epoch": 0.24848184409468335, "grad_norm": 1.6148377656936646, "learning_rate": 7.911421303241797e-07, "loss": 0.863, "step": 4010 }, { "epoch": 0.24879167183046227, "grad_norm": 1.8662177324295044, "learning_rate": 7.908159937381776e-07, "loss": 0.8645, "step": 4015 }, { "epoch": 0.24910149956624117, "grad_norm": 1.5586857795715332, "learning_rate": 7.904898571521753e-07, "loss": 0.8466, "step": 4020 }, { "epoch": 0.24941132730202006, "grad_norm": 1.6031217575073242, "learning_rate": 7.901637205661731e-07, "loss": 0.9226, "step": 4025 }, { "epoch": 0.249721155037799, "grad_norm": 1.5209461450576782, "learning_rate": 7.898375839801709e-07, "loss": 0.847, "step": 4030 }, { "epoch": 0.2500309827735779, "grad_norm": 1.8293284177780151, "learning_rate": 7.895114473941686e-07, "loss": 0.9181, "step": 4035 }, { "epoch": 0.2503408105093568, "grad_norm": 2.152946710586548, "learning_rate": 7.891853108081664e-07, "loss": 0.9068, "step": 4040 }, { "epoch": 0.2506506382451357, "grad_norm": 1.5463371276855469, "learning_rate": 7.888591742221641e-07, "loss": 0.8047, "step": 4045 }, { "epoch": 0.2509604659809146, "grad_norm": 1.7873958349227905, "learning_rate": 7.88533037636162e-07, "loss": 0.8732, "step": 4050 }, { "epoch": 0.2512702937166935, "grad_norm": 2.0298218727111816, "learning_rate": 7.882069010501598e-07, "loss": 0.917, "step": 4055 }, { "epoch": 0.25158012145247244, "grad_norm": 1.8447908163070679, "learning_rate": 7.878807644641575e-07, "loss": 0.8741, "step": 4060 }, { "epoch": 0.2518899491882513, "grad_norm": 1.9836199283599854, "learning_rate": 7.875546278781553e-07, "loss": 0.9038, "step": 4065 }, { "epoch": 0.25219977692403023, "grad_norm": 1.9666748046875, "learning_rate": 7.872284912921531e-07, "loss": 0.841, "step": 4070 }, { "epoch": 0.25250960465980915, "grad_norm": 1.6468111276626587, "learning_rate": 7.869023547061508e-07, "loss": 0.9181, "step": 4075 }, { "epoch": 0.2528194323955881, "grad_norm": 1.7130303382873535, "learning_rate": 7.865762181201487e-07, "loss": 0.8797, "step": 4080 }, { "epoch": 0.25312926013136694, "grad_norm": 1.4417833089828491, "learning_rate": 7.862500815341466e-07, "loss": 0.8169, "step": 4085 }, { "epoch": 0.25343908786714586, "grad_norm": 1.616230845451355, "learning_rate": 7.859239449481443e-07, "loss": 0.8758, "step": 4090 }, { "epoch": 0.2537489156029248, "grad_norm": 1.550050139427185, "learning_rate": 7.855978083621421e-07, "loss": 0.8818, "step": 4095 }, { "epoch": 0.2540587433387037, "grad_norm": 1.6465080976486206, "learning_rate": 7.852716717761399e-07, "loss": 0.8968, "step": 4100 }, { "epoch": 0.2543685710744826, "grad_norm": 1.5943117141723633, "learning_rate": 7.849455351901376e-07, "loss": 0.8592, "step": 4105 }, { "epoch": 0.2546783988102615, "grad_norm": 2.0399084091186523, "learning_rate": 7.846193986041354e-07, "loss": 0.9058, "step": 4110 }, { "epoch": 0.2549882265460404, "grad_norm": 1.590442419052124, "learning_rate": 7.842932620181331e-07, "loss": 0.8757, "step": 4115 }, { "epoch": 0.2552980542818193, "grad_norm": 1.800044059753418, "learning_rate": 7.839671254321309e-07, "loss": 0.8853, "step": 4120 }, { "epoch": 0.2556078820175982, "grad_norm": 1.7299330234527588, "learning_rate": 7.836409888461288e-07, "loss": 0.8586, "step": 4125 }, { "epoch": 0.25591770975337713, "grad_norm": 1.583190679550171, "learning_rate": 7.833148522601265e-07, "loss": 0.8589, "step": 4130 }, { "epoch": 0.25622753748915605, "grad_norm": 1.6425385475158691, "learning_rate": 7.829887156741243e-07, "loss": 0.8879, "step": 4135 }, { "epoch": 0.2565373652249349, "grad_norm": 1.8426728248596191, "learning_rate": 7.826625790881221e-07, "loss": 0.8395, "step": 4140 }, { "epoch": 0.25684719296071384, "grad_norm": 1.4991402626037598, "learning_rate": 7.823364425021198e-07, "loss": 0.8425, "step": 4145 }, { "epoch": 0.25715702069649277, "grad_norm": 1.7158373594284058, "learning_rate": 7.820103059161176e-07, "loss": 0.8444, "step": 4150 }, { "epoch": 0.25746684843227163, "grad_norm": 1.6429483890533447, "learning_rate": 7.816841693301153e-07, "loss": 0.8566, "step": 4155 }, { "epoch": 0.25777667616805056, "grad_norm": 1.5827823877334595, "learning_rate": 7.813580327441131e-07, "loss": 0.8428, "step": 4160 }, { "epoch": 0.2580865039038295, "grad_norm": 1.6602256298065186, "learning_rate": 7.81031896158111e-07, "loss": 0.8049, "step": 4165 }, { "epoch": 0.2583963316396084, "grad_norm": 1.5793774127960205, "learning_rate": 7.807057595721087e-07, "loss": 0.8618, "step": 4170 }, { "epoch": 0.25870615937538727, "grad_norm": 1.635545015335083, "learning_rate": 7.803796229861066e-07, "loss": 0.835, "step": 4175 }, { "epoch": 0.2590159871111662, "grad_norm": 1.6212847232818604, "learning_rate": 7.800534864001044e-07, "loss": 0.8092, "step": 4180 }, { "epoch": 0.2593258148469451, "grad_norm": 1.6301597356796265, "learning_rate": 7.797273498141021e-07, "loss": 0.8649, "step": 4185 }, { "epoch": 0.259635642582724, "grad_norm": 1.7185341119766235, "learning_rate": 7.794012132280999e-07, "loss": 0.8299, "step": 4190 }, { "epoch": 0.2599454703185029, "grad_norm": 1.7740615606307983, "learning_rate": 7.790750766420977e-07, "loss": 0.8163, "step": 4195 }, { "epoch": 0.2602552980542818, "grad_norm": 1.7920143604278564, "learning_rate": 7.787489400560955e-07, "loss": 0.8707, "step": 4200 }, { "epoch": 0.26056512579006075, "grad_norm": 1.8311986923217773, "learning_rate": 7.784228034700933e-07, "loss": 0.8939, "step": 4205 }, { "epoch": 0.2608749535258396, "grad_norm": 1.9712882041931152, "learning_rate": 7.78096666884091e-07, "loss": 0.8947, "step": 4210 }, { "epoch": 0.26118478126161854, "grad_norm": 1.9804974794387817, "learning_rate": 7.777705302980888e-07, "loss": 0.8641, "step": 4215 }, { "epoch": 0.26149460899739746, "grad_norm": 1.5062366724014282, "learning_rate": 7.774443937120866e-07, "loss": 0.8026, "step": 4220 }, { "epoch": 0.2618044367331763, "grad_norm": 1.6919991970062256, "learning_rate": 7.771182571260843e-07, "loss": 0.8698, "step": 4225 }, { "epoch": 0.26211426446895525, "grad_norm": 1.7523192167282104, "learning_rate": 7.767921205400821e-07, "loss": 0.8483, "step": 4230 }, { "epoch": 0.26242409220473417, "grad_norm": 1.7196686267852783, "learning_rate": 7.7646598395408e-07, "loss": 0.8681, "step": 4235 }, { "epoch": 0.2627339199405131, "grad_norm": 1.6117523908615112, "learning_rate": 7.761398473680777e-07, "loss": 0.8188, "step": 4240 }, { "epoch": 0.26304374767629196, "grad_norm": 1.4656801223754883, "learning_rate": 7.758137107820755e-07, "loss": 0.9316, "step": 4245 }, { "epoch": 0.2633535754120709, "grad_norm": 1.5178028345108032, "learning_rate": 7.754875741960733e-07, "loss": 0.8594, "step": 4250 }, { "epoch": 0.2636634031478498, "grad_norm": 1.5954923629760742, "learning_rate": 7.75161437610071e-07, "loss": 0.8425, "step": 4255 }, { "epoch": 0.26397323088362873, "grad_norm": 2.029350996017456, "learning_rate": 7.748353010240688e-07, "loss": 0.9004, "step": 4260 }, { "epoch": 0.2642830586194076, "grad_norm": 1.6649128198623657, "learning_rate": 7.745091644380665e-07, "loss": 0.8747, "step": 4265 }, { "epoch": 0.2645928863551865, "grad_norm": 1.664739727973938, "learning_rate": 7.741830278520644e-07, "loss": 0.8659, "step": 4270 }, { "epoch": 0.26490271409096544, "grad_norm": 1.8176088333129883, "learning_rate": 7.738568912660623e-07, "loss": 0.8636, "step": 4275 }, { "epoch": 0.2652125418267443, "grad_norm": 1.7619256973266602, "learning_rate": 7.7353075468006e-07, "loss": 0.8499, "step": 4280 }, { "epoch": 0.26552236956252323, "grad_norm": 1.5740729570388794, "learning_rate": 7.732046180940578e-07, "loss": 0.8494, "step": 4285 }, { "epoch": 0.26583219729830215, "grad_norm": 1.855904459953308, "learning_rate": 7.728784815080556e-07, "loss": 0.8995, "step": 4290 }, { "epoch": 0.2661420250340811, "grad_norm": 1.7440338134765625, "learning_rate": 7.725523449220533e-07, "loss": 0.8864, "step": 4295 }, { "epoch": 0.26645185276985994, "grad_norm": 1.7765488624572754, "learning_rate": 7.722262083360511e-07, "loss": 0.8429, "step": 4300 }, { "epoch": 0.26676168050563887, "grad_norm": 1.7094438076019287, "learning_rate": 7.719000717500489e-07, "loss": 0.884, "step": 4305 }, { "epoch": 0.2670715082414178, "grad_norm": 1.6391313076019287, "learning_rate": 7.715739351640467e-07, "loss": 0.8691, "step": 4310 }, { "epoch": 0.26738133597719665, "grad_norm": 1.7143090963363647, "learning_rate": 7.712477985780445e-07, "loss": 0.8245, "step": 4315 }, { "epoch": 0.2676911637129756, "grad_norm": 1.5843815803527832, "learning_rate": 7.709216619920422e-07, "loss": 0.8436, "step": 4320 }, { "epoch": 0.2680009914487545, "grad_norm": 1.7201464176177979, "learning_rate": 7.7059552540604e-07, "loss": 0.8946, "step": 4325 }, { "epoch": 0.2683108191845334, "grad_norm": 1.5788687467575073, "learning_rate": 7.702693888200378e-07, "loss": 0.8213, "step": 4330 }, { "epoch": 0.2686206469203123, "grad_norm": 1.7073997259140015, "learning_rate": 7.699432522340355e-07, "loss": 0.9024, "step": 4335 }, { "epoch": 0.2689304746560912, "grad_norm": 1.959904432296753, "learning_rate": 7.696171156480333e-07, "loss": 0.8826, "step": 4340 }, { "epoch": 0.26924030239187013, "grad_norm": 1.5783770084381104, "learning_rate": 7.692909790620311e-07, "loss": 0.7898, "step": 4345 }, { "epoch": 0.269550130127649, "grad_norm": 1.5591490268707275, "learning_rate": 7.689648424760289e-07, "loss": 0.8899, "step": 4350 }, { "epoch": 0.2698599578634279, "grad_norm": 1.6115106344223022, "learning_rate": 7.686387058900267e-07, "loss": 0.895, "step": 4355 }, { "epoch": 0.27016978559920685, "grad_norm": 1.6476013660430908, "learning_rate": 7.683125693040245e-07, "loss": 0.856, "step": 4360 }, { "epoch": 0.27047961333498577, "grad_norm": 1.5720081329345703, "learning_rate": 7.679864327180223e-07, "loss": 0.8328, "step": 4365 }, { "epoch": 0.27078944107076464, "grad_norm": 1.8763222694396973, "learning_rate": 7.676602961320201e-07, "loss": 0.8643, "step": 4370 }, { "epoch": 0.27109926880654356, "grad_norm": 1.773857831954956, "learning_rate": 7.673341595460178e-07, "loss": 0.874, "step": 4375 }, { "epoch": 0.2714090965423225, "grad_norm": 1.7670236825942993, "learning_rate": 7.670080229600156e-07, "loss": 0.8153, "step": 4380 }, { "epoch": 0.27171892427810135, "grad_norm": 1.4394768476486206, "learning_rate": 7.666818863740135e-07, "loss": 0.8458, "step": 4385 }, { "epoch": 0.27202875201388027, "grad_norm": 1.8511664867401123, "learning_rate": 7.663557497880112e-07, "loss": 0.919, "step": 4390 }, { "epoch": 0.2723385797496592, "grad_norm": 1.4363858699798584, "learning_rate": 7.66029613202009e-07, "loss": 0.8606, "step": 4395 }, { "epoch": 0.2726484074854381, "grad_norm": 1.6395204067230225, "learning_rate": 7.657034766160068e-07, "loss": 0.8837, "step": 4400 }, { "epoch": 0.272958235221217, "grad_norm": 1.7002086639404297, "learning_rate": 7.653773400300045e-07, "loss": 0.8805, "step": 4405 }, { "epoch": 0.2732680629569959, "grad_norm": 1.9640655517578125, "learning_rate": 7.650512034440023e-07, "loss": 0.9259, "step": 4410 }, { "epoch": 0.2735778906927748, "grad_norm": 1.573816180229187, "learning_rate": 7.647250668580001e-07, "loss": 0.8964, "step": 4415 }, { "epoch": 0.27388771842855375, "grad_norm": 1.7011841535568237, "learning_rate": 7.643989302719978e-07, "loss": 0.8593, "step": 4420 }, { "epoch": 0.2741975461643326, "grad_norm": 1.5716339349746704, "learning_rate": 7.640727936859957e-07, "loss": 0.8766, "step": 4425 }, { "epoch": 0.27450737390011154, "grad_norm": 1.6735292673110962, "learning_rate": 7.637466570999934e-07, "loss": 0.8125, "step": 4430 }, { "epoch": 0.27481720163589046, "grad_norm": 1.8912310600280762, "learning_rate": 7.634205205139912e-07, "loss": 0.8595, "step": 4435 }, { "epoch": 0.27512702937166933, "grad_norm": 1.5790883302688599, "learning_rate": 7.63094383927989e-07, "loss": 0.8567, "step": 4440 }, { "epoch": 0.27543685710744825, "grad_norm": 1.7692333459854126, "learning_rate": 7.627682473419867e-07, "loss": 0.8854, "step": 4445 }, { "epoch": 0.2757466848432272, "grad_norm": 1.6923221349716187, "learning_rate": 7.624421107559845e-07, "loss": 0.8181, "step": 4450 }, { "epoch": 0.2760565125790061, "grad_norm": 1.5760252475738525, "learning_rate": 7.621159741699823e-07, "loss": 0.8993, "step": 4455 }, { "epoch": 0.27636634031478496, "grad_norm": 1.8800686597824097, "learning_rate": 7.617898375839802e-07, "loss": 0.9069, "step": 4460 }, { "epoch": 0.2766761680505639, "grad_norm": 1.3893144130706787, "learning_rate": 7.61463700997978e-07, "loss": 0.8736, "step": 4465 }, { "epoch": 0.2769859957863428, "grad_norm": 1.8496098518371582, "learning_rate": 7.611375644119758e-07, "loss": 0.8578, "step": 4470 }, { "epoch": 0.2772958235221217, "grad_norm": 1.7693169116973877, "learning_rate": 7.608114278259735e-07, "loss": 0.8346, "step": 4475 }, { "epoch": 0.2776056512579006, "grad_norm": 1.8681560754776, "learning_rate": 7.604852912399713e-07, "loss": 0.9278, "step": 4480 }, { "epoch": 0.2779154789936795, "grad_norm": 2.042816162109375, "learning_rate": 7.60159154653969e-07, "loss": 0.8608, "step": 4485 }, { "epoch": 0.27822530672945844, "grad_norm": 2.209158420562744, "learning_rate": 7.598330180679668e-07, "loss": 0.8957, "step": 4490 }, { "epoch": 0.2785351344652373, "grad_norm": 1.3986682891845703, "learning_rate": 7.595068814819646e-07, "loss": 0.8193, "step": 4495 }, { "epoch": 0.27884496220101623, "grad_norm": 1.5702754259109497, "learning_rate": 7.591807448959624e-07, "loss": 0.8969, "step": 4500 }, { "epoch": 0.27915478993679516, "grad_norm": 1.635554313659668, "learning_rate": 7.588546083099602e-07, "loss": 0.9089, "step": 4505 }, { "epoch": 0.279464617672574, "grad_norm": 1.3413938283920288, "learning_rate": 7.58528471723958e-07, "loss": 0.8608, "step": 4510 }, { "epoch": 0.27977444540835295, "grad_norm": 1.699310541152954, "learning_rate": 7.582023351379557e-07, "loss": 0.9199, "step": 4515 }, { "epoch": 0.28008427314413187, "grad_norm": 1.3531734943389893, "learning_rate": 7.578761985519535e-07, "loss": 0.8647, "step": 4520 }, { "epoch": 0.2803941008799108, "grad_norm": 1.7297533750534058, "learning_rate": 7.575500619659513e-07, "loss": 0.8996, "step": 4525 }, { "epoch": 0.28070392861568966, "grad_norm": 1.7019249200820923, "learning_rate": 7.57223925379949e-07, "loss": 0.8196, "step": 4530 }, { "epoch": 0.2810137563514686, "grad_norm": 1.8220372200012207, "learning_rate": 7.568977887939469e-07, "loss": 0.8837, "step": 4535 }, { "epoch": 0.2813235840872475, "grad_norm": 1.4653481245040894, "learning_rate": 7.565716522079446e-07, "loss": 0.8733, "step": 4540 }, { "epoch": 0.2816334118230264, "grad_norm": 1.5460361242294312, "learning_rate": 7.562455156219424e-07, "loss": 0.8798, "step": 4545 }, { "epoch": 0.2819432395588053, "grad_norm": 1.7040220499038696, "learning_rate": 7.559193790359402e-07, "loss": 0.8787, "step": 4550 }, { "epoch": 0.2822530672945842, "grad_norm": 2.0813891887664795, "learning_rate": 7.55593242449938e-07, "loss": 0.9446, "step": 4555 }, { "epoch": 0.28256289503036314, "grad_norm": 1.7201035022735596, "learning_rate": 7.552671058639358e-07, "loss": 0.8505, "step": 4560 }, { "epoch": 0.282872722766142, "grad_norm": 1.8385859727859497, "learning_rate": 7.549409692779336e-07, "loss": 0.9069, "step": 4565 }, { "epoch": 0.2831825505019209, "grad_norm": 1.6867214441299438, "learning_rate": 7.546148326919314e-07, "loss": 0.8275, "step": 4570 }, { "epoch": 0.28349237823769985, "grad_norm": 1.5044223070144653, "learning_rate": 7.542886961059292e-07, "loss": 0.8201, "step": 4575 }, { "epoch": 0.28380220597347877, "grad_norm": 1.7177399396896362, "learning_rate": 7.53962559519927e-07, "loss": 0.8316, "step": 4580 }, { "epoch": 0.28411203370925764, "grad_norm": 1.604581356048584, "learning_rate": 7.536364229339247e-07, "loss": 0.8904, "step": 4585 }, { "epoch": 0.28442186144503656, "grad_norm": 1.4714152812957764, "learning_rate": 7.533102863479225e-07, "loss": 0.8278, "step": 4590 }, { "epoch": 0.2847316891808155, "grad_norm": 1.5783321857452393, "learning_rate": 7.529841497619202e-07, "loss": 0.8509, "step": 4595 }, { "epoch": 0.28504151691659435, "grad_norm": 1.6688258647918701, "learning_rate": 7.52658013175918e-07, "loss": 0.8676, "step": 4600 }, { "epoch": 0.2853513446523733, "grad_norm": 1.766526222229004, "learning_rate": 7.523318765899158e-07, "loss": 0.8406, "step": 4605 }, { "epoch": 0.2856611723881522, "grad_norm": 1.5621061325073242, "learning_rate": 7.520057400039136e-07, "loss": 0.8247, "step": 4610 }, { "epoch": 0.2859710001239311, "grad_norm": 1.7633684873580933, "learning_rate": 7.516796034179114e-07, "loss": 0.8333, "step": 4615 }, { "epoch": 0.28628082785971, "grad_norm": 1.6037918329238892, "learning_rate": 7.513534668319092e-07, "loss": 0.8999, "step": 4620 }, { "epoch": 0.2865906555954889, "grad_norm": 1.486091136932373, "learning_rate": 7.510273302459069e-07, "loss": 0.8823, "step": 4625 }, { "epoch": 0.28690048333126783, "grad_norm": 1.5411220788955688, "learning_rate": 7.507011936599047e-07, "loss": 0.9006, "step": 4630 }, { "epoch": 0.2872103110670467, "grad_norm": 1.6032695770263672, "learning_rate": 7.503750570739024e-07, "loss": 0.8415, "step": 4635 }, { "epoch": 0.2875201388028256, "grad_norm": 2.003385066986084, "learning_rate": 7.500489204879002e-07, "loss": 0.8579, "step": 4640 }, { "epoch": 0.28782996653860454, "grad_norm": 1.6483454704284668, "learning_rate": 7.49722783901898e-07, "loss": 0.9487, "step": 4645 }, { "epoch": 0.28813979427438347, "grad_norm": 1.6016218662261963, "learning_rate": 7.49396647315896e-07, "loss": 0.8702, "step": 4650 }, { "epoch": 0.28844962201016233, "grad_norm": 1.5022412538528442, "learning_rate": 7.490705107298937e-07, "loss": 0.8654, "step": 4655 }, { "epoch": 0.28875944974594125, "grad_norm": 1.5246739387512207, "learning_rate": 7.487443741438915e-07, "loss": 0.9419, "step": 4660 }, { "epoch": 0.2890692774817202, "grad_norm": 1.6607708930969238, "learning_rate": 7.484182375578892e-07, "loss": 0.9065, "step": 4665 }, { "epoch": 0.28937910521749904, "grad_norm": 1.5024722814559937, "learning_rate": 7.48092100971887e-07, "loss": 0.7737, "step": 4670 }, { "epoch": 0.28968893295327797, "grad_norm": 1.8329672813415527, "learning_rate": 7.477659643858848e-07, "loss": 0.881, "step": 4675 }, { "epoch": 0.2899987606890569, "grad_norm": 1.5739339590072632, "learning_rate": 7.474398277998825e-07, "loss": 0.8699, "step": 4680 }, { "epoch": 0.2903085884248358, "grad_norm": 1.6103707551956177, "learning_rate": 7.471136912138804e-07, "loss": 0.8355, "step": 4685 }, { "epoch": 0.2906184161606147, "grad_norm": 1.646411418914795, "learning_rate": 7.467875546278782e-07, "loss": 0.9132, "step": 4690 }, { "epoch": 0.2909282438963936, "grad_norm": 1.493159294128418, "learning_rate": 7.464614180418759e-07, "loss": 0.8838, "step": 4695 }, { "epoch": 0.2912380716321725, "grad_norm": 1.7038074731826782, "learning_rate": 7.461352814558737e-07, "loss": 0.9199, "step": 4700 }, { "epoch": 0.29154789936795145, "grad_norm": 1.6613513231277466, "learning_rate": 7.458091448698714e-07, "loss": 0.8477, "step": 4705 }, { "epoch": 0.2918577271037303, "grad_norm": 1.9400731325149536, "learning_rate": 7.454830082838692e-07, "loss": 0.9208, "step": 4710 }, { "epoch": 0.29216755483950924, "grad_norm": 1.6278061866760254, "learning_rate": 7.45156871697867e-07, "loss": 0.859, "step": 4715 }, { "epoch": 0.29247738257528816, "grad_norm": 1.661867380142212, "learning_rate": 7.448307351118648e-07, "loss": 0.8793, "step": 4720 }, { "epoch": 0.292787210311067, "grad_norm": 1.5985323190689087, "learning_rate": 7.445045985258626e-07, "loss": 0.8419, "step": 4725 }, { "epoch": 0.29309703804684595, "grad_norm": 2.7765722274780273, "learning_rate": 7.441784619398604e-07, "loss": 0.8484, "step": 4730 }, { "epoch": 0.29340686578262487, "grad_norm": 1.808868408203125, "learning_rate": 7.438523253538581e-07, "loss": 0.8588, "step": 4735 }, { "epoch": 0.2937166935184038, "grad_norm": 1.595335841178894, "learning_rate": 7.43526188767856e-07, "loss": 0.8843, "step": 4740 }, { "epoch": 0.29402652125418266, "grad_norm": 1.7918473482131958, "learning_rate": 7.432000521818538e-07, "loss": 0.8372, "step": 4745 }, { "epoch": 0.2943363489899616, "grad_norm": 1.982010841369629, "learning_rate": 7.428739155958515e-07, "loss": 0.8959, "step": 4750 }, { "epoch": 0.2946461767257405, "grad_norm": 1.6053510904312134, "learning_rate": 7.425477790098493e-07, "loss": 0.8268, "step": 4755 }, { "epoch": 0.29495600446151937, "grad_norm": 1.6667481660842896, "learning_rate": 7.422216424238471e-07, "loss": 0.8826, "step": 4760 }, { "epoch": 0.2952658321972983, "grad_norm": 1.6365985870361328, "learning_rate": 7.418955058378449e-07, "loss": 0.8424, "step": 4765 }, { "epoch": 0.2955756599330772, "grad_norm": 1.7755273580551147, "learning_rate": 7.415693692518427e-07, "loss": 0.8712, "step": 4770 }, { "epoch": 0.29588548766885614, "grad_norm": 1.682857871055603, "learning_rate": 7.412432326658404e-07, "loss": 0.8361, "step": 4775 }, { "epoch": 0.296195315404635, "grad_norm": 1.677895426750183, "learning_rate": 7.409170960798382e-07, "loss": 0.8732, "step": 4780 }, { "epoch": 0.29650514314041393, "grad_norm": 1.751308560371399, "learning_rate": 7.40590959493836e-07, "loss": 0.8581, "step": 4785 }, { "epoch": 0.29681497087619285, "grad_norm": 1.9320496320724487, "learning_rate": 7.402648229078337e-07, "loss": 0.9297, "step": 4790 }, { "epoch": 0.2971247986119717, "grad_norm": 1.5847095251083374, "learning_rate": 7.399386863218316e-07, "loss": 0.9118, "step": 4795 }, { "epoch": 0.29743462634775064, "grad_norm": 2.4290945529937744, "learning_rate": 7.396125497358294e-07, "loss": 0.875, "step": 4800 }, { "epoch": 0.29774445408352956, "grad_norm": 1.7889471054077148, "learning_rate": 7.392864131498271e-07, "loss": 0.8596, "step": 4805 }, { "epoch": 0.2980542818193085, "grad_norm": 1.6241317987442017, "learning_rate": 7.389602765638249e-07, "loss": 0.8354, "step": 4810 }, { "epoch": 0.29836410955508735, "grad_norm": 1.912055492401123, "learning_rate": 7.386341399778226e-07, "loss": 0.9115, "step": 4815 }, { "epoch": 0.2986739372908663, "grad_norm": 1.7655601501464844, "learning_rate": 7.383080033918204e-07, "loss": 0.8738, "step": 4820 }, { "epoch": 0.2989837650266452, "grad_norm": 1.7898212671279907, "learning_rate": 7.379818668058182e-07, "loss": 0.8322, "step": 4825 }, { "epoch": 0.29929359276242407, "grad_norm": 1.6316590309143066, "learning_rate": 7.376557302198159e-07, "loss": 0.8511, "step": 4830 }, { "epoch": 0.299603420498203, "grad_norm": 1.9818249940872192, "learning_rate": 7.373295936338139e-07, "loss": 0.908, "step": 4835 }, { "epoch": 0.2999132482339819, "grad_norm": 1.5876258611679077, "learning_rate": 7.370034570478117e-07, "loss": 0.8723, "step": 4840 }, { "epoch": 0.30022307596976083, "grad_norm": 1.5678564310073853, "learning_rate": 7.366773204618094e-07, "loss": 0.8352, "step": 4845 }, { "epoch": 0.3005329037055397, "grad_norm": 1.6423625946044922, "learning_rate": 7.363511838758072e-07, "loss": 0.8659, "step": 4850 }, { "epoch": 0.3008427314413186, "grad_norm": 2.2845346927642822, "learning_rate": 7.36025047289805e-07, "loss": 0.8819, "step": 4855 }, { "epoch": 0.30115255917709755, "grad_norm": 1.7377188205718994, "learning_rate": 7.356989107038027e-07, "loss": 0.8678, "step": 4860 }, { "epoch": 0.30146238691287647, "grad_norm": 1.7234011888504028, "learning_rate": 7.353727741178005e-07, "loss": 0.8498, "step": 4865 }, { "epoch": 0.30177221464865533, "grad_norm": 1.6152087450027466, "learning_rate": 7.350466375317983e-07, "loss": 0.8903, "step": 4870 }, { "epoch": 0.30208204238443426, "grad_norm": 1.5944377183914185, "learning_rate": 7.347205009457961e-07, "loss": 0.8427, "step": 4875 }, { "epoch": 0.3023918701202132, "grad_norm": 1.6877429485321045, "learning_rate": 7.343943643597939e-07, "loss": 0.8504, "step": 4880 }, { "epoch": 0.30270169785599205, "grad_norm": 2.0271005630493164, "learning_rate": 7.340682277737916e-07, "loss": 0.8171, "step": 4885 }, { "epoch": 0.30301152559177097, "grad_norm": 1.8284132480621338, "learning_rate": 7.337420911877894e-07, "loss": 0.8939, "step": 4890 }, { "epoch": 0.3033213533275499, "grad_norm": 1.412245512008667, "learning_rate": 7.334159546017872e-07, "loss": 0.8582, "step": 4895 }, { "epoch": 0.3036311810633288, "grad_norm": 1.7900394201278687, "learning_rate": 7.330898180157849e-07, "loss": 0.8902, "step": 4900 }, { "epoch": 0.3039410087991077, "grad_norm": 1.9364402294158936, "learning_rate": 7.327636814297827e-07, "loss": 0.8564, "step": 4905 }, { "epoch": 0.3042508365348866, "grad_norm": 1.5492626428604126, "learning_rate": 7.324375448437805e-07, "loss": 0.8584, "step": 4910 }, { "epoch": 0.3045606642706655, "grad_norm": 1.609075903892517, "learning_rate": 7.321114082577783e-07, "loss": 0.8752, "step": 4915 }, { "epoch": 0.3048704920064444, "grad_norm": 1.5706391334533691, "learning_rate": 7.317852716717761e-07, "loss": 0.7976, "step": 4920 }, { "epoch": 0.3051803197422233, "grad_norm": 1.4962748289108276, "learning_rate": 7.314591350857738e-07, "loss": 0.8819, "step": 4925 }, { "epoch": 0.30549014747800224, "grad_norm": 1.67612624168396, "learning_rate": 7.311329984997717e-07, "loss": 0.8642, "step": 4930 }, { "epoch": 0.30579997521378116, "grad_norm": 1.6737630367279053, "learning_rate": 7.308068619137695e-07, "loss": 0.8758, "step": 4935 }, { "epoch": 0.30610980294956003, "grad_norm": 1.5528241395950317, "learning_rate": 7.304807253277672e-07, "loss": 0.8974, "step": 4940 }, { "epoch": 0.30641963068533895, "grad_norm": 2.002545118331909, "learning_rate": 7.301545887417651e-07, "loss": 0.9063, "step": 4945 }, { "epoch": 0.3067294584211179, "grad_norm": 1.5929616689682007, "learning_rate": 7.298284521557629e-07, "loss": 0.8401, "step": 4950 }, { "epoch": 0.30703928615689674, "grad_norm": 1.5987993478775024, "learning_rate": 7.295023155697606e-07, "loss": 0.8712, "step": 4955 }, { "epoch": 0.30734911389267566, "grad_norm": 1.6865966320037842, "learning_rate": 7.291761789837584e-07, "loss": 0.8993, "step": 4960 }, { "epoch": 0.3076589416284546, "grad_norm": 1.6650594472885132, "learning_rate": 7.288500423977561e-07, "loss": 0.7905, "step": 4965 }, { "epoch": 0.3079687693642335, "grad_norm": 1.9827078580856323, "learning_rate": 7.285239058117539e-07, "loss": 0.8031, "step": 4970 }, { "epoch": 0.3082785971000124, "grad_norm": 1.3923324346542358, "learning_rate": 7.281977692257517e-07, "loss": 0.8576, "step": 4975 }, { "epoch": 0.3085884248357913, "grad_norm": 1.4686262607574463, "learning_rate": 7.278716326397494e-07, "loss": 0.8758, "step": 4980 }, { "epoch": 0.3088982525715702, "grad_norm": 1.6060094833374023, "learning_rate": 7.275454960537473e-07, "loss": 0.9352, "step": 4985 }, { "epoch": 0.3092080803073491, "grad_norm": 1.7001724243164062, "learning_rate": 7.272193594677451e-07, "loss": 0.9421, "step": 4990 }, { "epoch": 0.309517908043128, "grad_norm": 1.82527494430542, "learning_rate": 7.268932228817428e-07, "loss": 0.9009, "step": 4995 }, { "epoch": 0.30982773577890693, "grad_norm": 1.4748501777648926, "learning_rate": 7.265670862957406e-07, "loss": 0.8428, "step": 5000 }, { "epoch": 0.31013756351468585, "grad_norm": 1.610166311264038, "learning_rate": 7.262409497097384e-07, "loss": 0.8718, "step": 5005 }, { "epoch": 0.3104473912504647, "grad_norm": 1.4758720397949219, "learning_rate": 7.259148131237361e-07, "loss": 0.871, "step": 5010 }, { "epoch": 0.31075721898624364, "grad_norm": 1.6884534358978271, "learning_rate": 7.255886765377339e-07, "loss": 0.8956, "step": 5015 }, { "epoch": 0.31106704672202257, "grad_norm": 1.9493663311004639, "learning_rate": 7.252625399517317e-07, "loss": 0.8531, "step": 5020 }, { "epoch": 0.3113768744578015, "grad_norm": 1.7757247686386108, "learning_rate": 7.249364033657296e-07, "loss": 0.9157, "step": 5025 }, { "epoch": 0.31168670219358036, "grad_norm": 1.9240217208862305, "learning_rate": 7.246102667797274e-07, "loss": 0.8323, "step": 5030 }, { "epoch": 0.3119965299293593, "grad_norm": 1.6256482601165771, "learning_rate": 7.242841301937251e-07, "loss": 0.8872, "step": 5035 }, { "epoch": 0.3123063576651382, "grad_norm": 1.7468934059143066, "learning_rate": 7.239579936077229e-07, "loss": 0.8648, "step": 5040 }, { "epoch": 0.31261618540091707, "grad_norm": 1.7961115837097168, "learning_rate": 7.236318570217207e-07, "loss": 0.8685, "step": 5045 }, { "epoch": 0.312926013136696, "grad_norm": 1.8381373882293701, "learning_rate": 7.233057204357184e-07, "loss": 0.9058, "step": 5050 }, { "epoch": 0.3132358408724749, "grad_norm": 1.7096655368804932, "learning_rate": 7.229795838497163e-07, "loss": 0.8595, "step": 5055 }, { "epoch": 0.31354566860825384, "grad_norm": 1.6908361911773682, "learning_rate": 7.226534472637141e-07, "loss": 0.8386, "step": 5060 }, { "epoch": 0.3138554963440327, "grad_norm": 1.6399166584014893, "learning_rate": 7.223273106777118e-07, "loss": 0.8867, "step": 5065 }, { "epoch": 0.3141653240798116, "grad_norm": 1.9008816480636597, "learning_rate": 7.220011740917096e-07, "loss": 0.8884, "step": 5070 }, { "epoch": 0.31447515181559055, "grad_norm": 1.628731608390808, "learning_rate": 7.216750375057073e-07, "loss": 0.8502, "step": 5075 }, { "epoch": 0.3147849795513694, "grad_norm": 1.5561721324920654, "learning_rate": 7.213489009197051e-07, "loss": 0.8405, "step": 5080 }, { "epoch": 0.31509480728714834, "grad_norm": 1.4854499101638794, "learning_rate": 7.210227643337029e-07, "loss": 0.8928, "step": 5085 }, { "epoch": 0.31540463502292726, "grad_norm": 1.8296235799789429, "learning_rate": 7.206966277477006e-07, "loss": 0.8359, "step": 5090 }, { "epoch": 0.3157144627587062, "grad_norm": 1.7554904222488403, "learning_rate": 7.203704911616985e-07, "loss": 0.8794, "step": 5095 }, { "epoch": 0.31602429049448505, "grad_norm": 1.838999629020691, "learning_rate": 7.200443545756963e-07, "loss": 0.8743, "step": 5100 }, { "epoch": 0.31633411823026397, "grad_norm": 2.000792980194092, "learning_rate": 7.19718217989694e-07, "loss": 0.85, "step": 5105 }, { "epoch": 0.3166439459660429, "grad_norm": 1.6170905828475952, "learning_rate": 7.193920814036918e-07, "loss": 0.8696, "step": 5110 }, { "epoch": 0.31695377370182176, "grad_norm": 1.9728740453720093, "learning_rate": 7.190659448176896e-07, "loss": 0.9115, "step": 5115 }, { "epoch": 0.3172636014376007, "grad_norm": 1.5515153408050537, "learning_rate": 7.187398082316874e-07, "loss": 0.8929, "step": 5120 }, { "epoch": 0.3175734291733796, "grad_norm": 1.6542752981185913, "learning_rate": 7.184136716456852e-07, "loss": 0.7871, "step": 5125 }, { "epoch": 0.31788325690915853, "grad_norm": 1.803565263748169, "learning_rate": 7.18087535059683e-07, "loss": 0.8599, "step": 5130 }, { "epoch": 0.3181930846449374, "grad_norm": 1.728872299194336, "learning_rate": 7.177613984736808e-07, "loss": 0.791, "step": 5135 }, { "epoch": 0.3185029123807163, "grad_norm": 1.6681028604507446, "learning_rate": 7.174352618876786e-07, "loss": 0.8783, "step": 5140 }, { "epoch": 0.31881274011649524, "grad_norm": 1.9175736904144287, "learning_rate": 7.171091253016763e-07, "loss": 0.8981, "step": 5145 }, { "epoch": 0.3191225678522741, "grad_norm": 1.472561240196228, "learning_rate": 7.167829887156741e-07, "loss": 0.8858, "step": 5150 }, { "epoch": 0.31943239558805303, "grad_norm": 1.5708025693893433, "learning_rate": 7.164568521296719e-07, "loss": 0.8686, "step": 5155 }, { "epoch": 0.31974222332383195, "grad_norm": 1.603183627128601, "learning_rate": 7.161307155436696e-07, "loss": 0.8362, "step": 5160 }, { "epoch": 0.3200520510596109, "grad_norm": 1.7385071516036987, "learning_rate": 7.158045789576674e-07, "loss": 0.836, "step": 5165 }, { "epoch": 0.32036187879538974, "grad_norm": 1.5245490074157715, "learning_rate": 7.154784423716653e-07, "loss": 0.8421, "step": 5170 }, { "epoch": 0.32067170653116867, "grad_norm": 1.4801311492919922, "learning_rate": 7.15152305785663e-07, "loss": 0.888, "step": 5175 }, { "epoch": 0.3209815342669476, "grad_norm": 1.759549617767334, "learning_rate": 7.148261691996608e-07, "loss": 0.8554, "step": 5180 }, { "epoch": 0.3212913620027265, "grad_norm": 1.4452505111694336, "learning_rate": 7.145000326136585e-07, "loss": 0.9078, "step": 5185 }, { "epoch": 0.3216011897385054, "grad_norm": 1.518022894859314, "learning_rate": 7.141738960276563e-07, "loss": 0.85, "step": 5190 }, { "epoch": 0.3219110174742843, "grad_norm": 1.5505757331848145, "learning_rate": 7.138477594416541e-07, "loss": 0.8258, "step": 5195 }, { "epoch": 0.3222208452100632, "grad_norm": 1.5706026554107666, "learning_rate": 7.135216228556518e-07, "loss": 0.8389, "step": 5200 }, { "epoch": 0.3225306729458421, "grad_norm": 1.7174735069274902, "learning_rate": 7.131954862696497e-07, "loss": 0.8291, "step": 5205 }, { "epoch": 0.322840500681621, "grad_norm": 1.6722215414047241, "learning_rate": 7.128693496836475e-07, "loss": 0.8255, "step": 5210 }, { "epoch": 0.32315032841739993, "grad_norm": 1.627677083015442, "learning_rate": 7.125432130976453e-07, "loss": 0.8678, "step": 5215 }, { "epoch": 0.32346015615317886, "grad_norm": 1.7320910692214966, "learning_rate": 7.122170765116431e-07, "loss": 0.8564, "step": 5220 }, { "epoch": 0.3237699838889577, "grad_norm": 1.7326390743255615, "learning_rate": 7.118909399256409e-07, "loss": 0.8014, "step": 5225 }, { "epoch": 0.32407981162473665, "grad_norm": 1.5817348957061768, "learning_rate": 7.115648033396386e-07, "loss": 0.8591, "step": 5230 }, { "epoch": 0.32438963936051557, "grad_norm": 1.7437169551849365, "learning_rate": 7.112386667536364e-07, "loss": 0.841, "step": 5235 }, { "epoch": 0.32469946709629444, "grad_norm": 1.729632019996643, "learning_rate": 7.109125301676341e-07, "loss": 0.9001, "step": 5240 }, { "epoch": 0.32500929483207336, "grad_norm": 1.737653136253357, "learning_rate": 7.10586393581632e-07, "loss": 0.8316, "step": 5245 }, { "epoch": 0.3253191225678523, "grad_norm": 1.535584568977356, "learning_rate": 7.102602569956298e-07, "loss": 0.8928, "step": 5250 }, { "epoch": 0.3256289503036312, "grad_norm": 1.6898844242095947, "learning_rate": 7.099341204096275e-07, "loss": 0.8301, "step": 5255 }, { "epoch": 0.32593877803941007, "grad_norm": 1.5431139469146729, "learning_rate": 7.096079838236253e-07, "loss": 0.8515, "step": 5260 }, { "epoch": 0.326248605775189, "grad_norm": 1.6758557558059692, "learning_rate": 7.092818472376231e-07, "loss": 0.8669, "step": 5265 }, { "epoch": 0.3265584335109679, "grad_norm": 1.610996127128601, "learning_rate": 7.089557106516208e-07, "loss": 0.8761, "step": 5270 }, { "epoch": 0.3268682612467468, "grad_norm": 1.8780288696289062, "learning_rate": 7.086295740656186e-07, "loss": 0.8494, "step": 5275 }, { "epoch": 0.3271780889825257, "grad_norm": 1.667945146560669, "learning_rate": 7.083034374796165e-07, "loss": 0.8734, "step": 5280 }, { "epoch": 0.32748791671830463, "grad_norm": 1.6188865900039673, "learning_rate": 7.079773008936142e-07, "loss": 0.8765, "step": 5285 }, { "epoch": 0.32779774445408355, "grad_norm": 1.6179147958755493, "learning_rate": 7.07651164307612e-07, "loss": 0.8513, "step": 5290 }, { "epoch": 0.3281075721898624, "grad_norm": 1.783928394317627, "learning_rate": 7.073250277216097e-07, "loss": 0.8382, "step": 5295 }, { "epoch": 0.32841739992564134, "grad_norm": 1.6152029037475586, "learning_rate": 7.069988911356075e-07, "loss": 0.8637, "step": 5300 }, { "epoch": 0.32872722766142026, "grad_norm": 1.7319368124008179, "learning_rate": 7.066727545496053e-07, "loss": 0.8444, "step": 5305 }, { "epoch": 0.32903705539719913, "grad_norm": 1.7016270160675049, "learning_rate": 7.063466179636031e-07, "loss": 0.8246, "step": 5310 }, { "epoch": 0.32934688313297805, "grad_norm": 1.6140505075454712, "learning_rate": 7.06020481377601e-07, "loss": 0.8921, "step": 5315 }, { "epoch": 0.329656710868757, "grad_norm": 1.960301399230957, "learning_rate": 7.056943447915988e-07, "loss": 0.862, "step": 5320 }, { "epoch": 0.3299665386045359, "grad_norm": 1.7120500802993774, "learning_rate": 7.053682082055965e-07, "loss": 0.8552, "step": 5325 }, { "epoch": 0.33027636634031476, "grad_norm": 1.4734768867492676, "learning_rate": 7.050420716195943e-07, "loss": 0.8533, "step": 5330 }, { "epoch": 0.3305861940760937, "grad_norm": 1.9813569784164429, "learning_rate": 7.047159350335921e-07, "loss": 0.848, "step": 5335 }, { "epoch": 0.3308960218118726, "grad_norm": 1.7789223194122314, "learning_rate": 7.043897984475898e-07, "loss": 0.8935, "step": 5340 }, { "epoch": 0.33120584954765153, "grad_norm": 1.420596957206726, "learning_rate": 7.040636618615876e-07, "loss": 0.855, "step": 5345 }, { "epoch": 0.3315156772834304, "grad_norm": 1.6905046701431274, "learning_rate": 7.037375252755853e-07, "loss": 0.9146, "step": 5350 }, { "epoch": 0.3318255050192093, "grad_norm": 1.8942984342575073, "learning_rate": 7.034113886895832e-07, "loss": 0.9087, "step": 5355 }, { "epoch": 0.33213533275498824, "grad_norm": 1.7324897050857544, "learning_rate": 7.03085252103581e-07, "loss": 0.8491, "step": 5360 }, { "epoch": 0.3324451604907671, "grad_norm": 1.8707530498504639, "learning_rate": 7.027591155175787e-07, "loss": 0.8697, "step": 5365 }, { "epoch": 0.33275498822654603, "grad_norm": 1.6138685941696167, "learning_rate": 7.024329789315765e-07, "loss": 0.8162, "step": 5370 }, { "epoch": 0.33306481596232496, "grad_norm": 1.8031532764434814, "learning_rate": 7.021068423455743e-07, "loss": 0.8826, "step": 5375 }, { "epoch": 0.3333746436981039, "grad_norm": 1.6295506954193115, "learning_rate": 7.01780705759572e-07, "loss": 0.9195, "step": 5380 }, { "epoch": 0.33368447143388275, "grad_norm": 1.673705816268921, "learning_rate": 7.014545691735698e-07, "loss": 0.8414, "step": 5385 }, { "epoch": 0.33399429916966167, "grad_norm": 1.579641342163086, "learning_rate": 7.011284325875675e-07, "loss": 0.8454, "step": 5390 }, { "epoch": 0.3343041269054406, "grad_norm": 1.4265984296798706, "learning_rate": 7.008022960015654e-07, "loss": 0.83, "step": 5395 }, { "epoch": 0.33461395464121946, "grad_norm": 1.6313492059707642, "learning_rate": 7.004761594155633e-07, "loss": 0.8259, "step": 5400 }, { "epoch": 0.3349237823769984, "grad_norm": 1.7291635274887085, "learning_rate": 7.00150022829561e-07, "loss": 0.8209, "step": 5405 }, { "epoch": 0.3352336101127773, "grad_norm": 1.6532461643218994, "learning_rate": 6.998238862435588e-07, "loss": 0.8927, "step": 5410 }, { "epoch": 0.3355434378485562, "grad_norm": 1.8393067121505737, "learning_rate": 6.994977496575566e-07, "loss": 0.8417, "step": 5415 }, { "epoch": 0.3358532655843351, "grad_norm": 1.5693227052688599, "learning_rate": 6.991716130715543e-07, "loss": 0.8165, "step": 5420 }, { "epoch": 0.336163093320114, "grad_norm": 1.5228568315505981, "learning_rate": 6.988454764855521e-07, "loss": 0.7883, "step": 5425 }, { "epoch": 0.33647292105589294, "grad_norm": 1.623809576034546, "learning_rate": 6.9851933989955e-07, "loss": 0.8849, "step": 5430 }, { "epoch": 0.3367827487916718, "grad_norm": 1.7974226474761963, "learning_rate": 6.981932033135477e-07, "loss": 0.8757, "step": 5435 }, { "epoch": 0.3370925765274507, "grad_norm": 2.054705858230591, "learning_rate": 6.978670667275455e-07, "loss": 0.865, "step": 5440 }, { "epoch": 0.33740240426322965, "grad_norm": 1.612797498703003, "learning_rate": 6.975409301415433e-07, "loss": 0.8691, "step": 5445 }, { "epoch": 0.33771223199900857, "grad_norm": 1.4509713649749756, "learning_rate": 6.97214793555541e-07, "loss": 0.8926, "step": 5450 }, { "epoch": 0.33802205973478744, "grad_norm": 1.9090486764907837, "learning_rate": 6.968886569695388e-07, "loss": 0.8992, "step": 5455 }, { "epoch": 0.33833188747056636, "grad_norm": 1.9372432231903076, "learning_rate": 6.965625203835365e-07, "loss": 0.8822, "step": 5460 }, { "epoch": 0.3386417152063453, "grad_norm": 1.813211441040039, "learning_rate": 6.962363837975343e-07, "loss": 0.9059, "step": 5465 }, { "epoch": 0.3389515429421242, "grad_norm": 1.5739115476608276, "learning_rate": 6.959102472115322e-07, "loss": 0.8224, "step": 5470 }, { "epoch": 0.3392613706779031, "grad_norm": 1.6736093759536743, "learning_rate": 6.955841106255299e-07, "loss": 0.9107, "step": 5475 }, { "epoch": 0.339571198413682, "grad_norm": 1.6196343898773193, "learning_rate": 6.952579740395277e-07, "loss": 0.8502, "step": 5480 }, { "epoch": 0.3398810261494609, "grad_norm": 1.9165607690811157, "learning_rate": 6.949318374535255e-07, "loss": 0.898, "step": 5485 }, { "epoch": 0.3401908538852398, "grad_norm": 1.5003046989440918, "learning_rate": 6.946057008675232e-07, "loss": 0.8842, "step": 5490 }, { "epoch": 0.3405006816210187, "grad_norm": 1.7332803010940552, "learning_rate": 6.942795642815211e-07, "loss": 0.9059, "step": 5495 }, { "epoch": 0.34081050935679763, "grad_norm": 1.9067975282669067, "learning_rate": 6.939534276955189e-07, "loss": 0.8318, "step": 5500 }, { "epoch": 0.34112033709257655, "grad_norm": 1.6197651624679565, "learning_rate": 6.936272911095167e-07, "loss": 0.8533, "step": 5505 }, { "epoch": 0.3414301648283554, "grad_norm": 1.7055573463439941, "learning_rate": 6.933011545235145e-07, "loss": 0.873, "step": 5510 }, { "epoch": 0.34173999256413434, "grad_norm": 1.8457703590393066, "learning_rate": 6.929750179375122e-07, "loss": 0.8833, "step": 5515 }, { "epoch": 0.34204982029991327, "grad_norm": 1.6085789203643799, "learning_rate": 6.9264888135151e-07, "loss": 0.8569, "step": 5520 }, { "epoch": 0.34235964803569213, "grad_norm": 1.5401239395141602, "learning_rate": 6.923227447655078e-07, "loss": 0.8891, "step": 5525 }, { "epoch": 0.34266947577147105, "grad_norm": 1.4874579906463623, "learning_rate": 6.919966081795055e-07, "loss": 0.856, "step": 5530 }, { "epoch": 0.34297930350725, "grad_norm": 1.764067530632019, "learning_rate": 6.916704715935033e-07, "loss": 0.9133, "step": 5535 }, { "epoch": 0.3432891312430289, "grad_norm": 1.7658593654632568, "learning_rate": 6.913443350075012e-07, "loss": 0.8988, "step": 5540 }, { "epoch": 0.34359895897880777, "grad_norm": 1.8294967412948608, "learning_rate": 6.910181984214989e-07, "loss": 0.9026, "step": 5545 }, { "epoch": 0.3439087867145867, "grad_norm": 1.4867089986801147, "learning_rate": 6.906920618354967e-07, "loss": 0.8365, "step": 5550 }, { "epoch": 0.3442186144503656, "grad_norm": 1.7838201522827148, "learning_rate": 6.903659252494944e-07, "loss": 0.8609, "step": 5555 }, { "epoch": 0.3445284421861445, "grad_norm": 1.7906404733657837, "learning_rate": 6.900397886634922e-07, "loss": 0.8179, "step": 5560 }, { "epoch": 0.3448382699219234, "grad_norm": 1.8974790573120117, "learning_rate": 6.8971365207749e-07, "loss": 0.8158, "step": 5565 }, { "epoch": 0.3451480976577023, "grad_norm": 1.757254719734192, "learning_rate": 6.893875154914877e-07, "loss": 0.8567, "step": 5570 }, { "epoch": 0.34545792539348125, "grad_norm": 1.8610557317733765, "learning_rate": 6.890613789054855e-07, "loss": 0.87, "step": 5575 }, { "epoch": 0.3457677531292601, "grad_norm": 1.8508315086364746, "learning_rate": 6.887352423194834e-07, "loss": 0.8873, "step": 5580 }, { "epoch": 0.34607758086503904, "grad_norm": 1.6660816669464111, "learning_rate": 6.884091057334811e-07, "loss": 0.861, "step": 5585 }, { "epoch": 0.34638740860081796, "grad_norm": 2.091118574142456, "learning_rate": 6.88082969147479e-07, "loss": 0.8631, "step": 5590 }, { "epoch": 0.3466972363365968, "grad_norm": 1.7477401494979858, "learning_rate": 6.877568325614768e-07, "loss": 0.8443, "step": 5595 }, { "epoch": 0.34700706407237575, "grad_norm": 1.726889967918396, "learning_rate": 6.874306959754745e-07, "loss": 0.8371, "step": 5600 }, { "epoch": 0.34731689180815467, "grad_norm": 1.5689579248428345, "learning_rate": 6.871045593894723e-07, "loss": 0.9256, "step": 5605 }, { "epoch": 0.3476267195439336, "grad_norm": 1.7721123695373535, "learning_rate": 6.8677842280347e-07, "loss": 0.842, "step": 5610 }, { "epoch": 0.34793654727971246, "grad_norm": 1.685060739517212, "learning_rate": 6.864522862174679e-07, "loss": 0.9371, "step": 5615 }, { "epoch": 0.3482463750154914, "grad_norm": 1.46134352684021, "learning_rate": 6.861261496314657e-07, "loss": 0.8321, "step": 5620 }, { "epoch": 0.3485562027512703, "grad_norm": 1.8265260457992554, "learning_rate": 6.858000130454634e-07, "loss": 0.8737, "step": 5625 }, { "epoch": 0.34886603048704923, "grad_norm": 1.655983328819275, "learning_rate": 6.854738764594612e-07, "loss": 0.8761, "step": 5630 }, { "epoch": 0.3491758582228281, "grad_norm": 1.6275115013122559, "learning_rate": 6.85147739873459e-07, "loss": 0.8242, "step": 5635 }, { "epoch": 0.349485685958607, "grad_norm": 1.539093017578125, "learning_rate": 6.848216032874567e-07, "loss": 0.8794, "step": 5640 }, { "epoch": 0.34979551369438594, "grad_norm": 1.589955449104309, "learning_rate": 6.844954667014545e-07, "loss": 0.8454, "step": 5645 }, { "epoch": 0.3501053414301648, "grad_norm": 1.6179454326629639, "learning_rate": 6.841693301154523e-07, "loss": 0.8378, "step": 5650 }, { "epoch": 0.35041516916594373, "grad_norm": 1.613242745399475, "learning_rate": 6.838431935294501e-07, "loss": 0.8934, "step": 5655 }, { "epoch": 0.35072499690172265, "grad_norm": 1.6655066013336182, "learning_rate": 6.835170569434479e-07, "loss": 0.8562, "step": 5660 }, { "epoch": 0.3510348246375016, "grad_norm": 1.5552911758422852, "learning_rate": 6.831909203574456e-07, "loss": 0.8669, "step": 5665 }, { "epoch": 0.35134465237328044, "grad_norm": 1.599382758140564, "learning_rate": 6.828647837714434e-07, "loss": 0.8907, "step": 5670 }, { "epoch": 0.35165448010905936, "grad_norm": 1.8990147113800049, "learning_rate": 6.825386471854412e-07, "loss": 0.825, "step": 5675 }, { "epoch": 0.3519643078448383, "grad_norm": 1.5779376029968262, "learning_rate": 6.822125105994389e-07, "loss": 0.814, "step": 5680 }, { "epoch": 0.35227413558061715, "grad_norm": 1.543062686920166, "learning_rate": 6.818863740134368e-07, "loss": 0.898, "step": 5685 }, { "epoch": 0.3525839633163961, "grad_norm": 1.5964866876602173, "learning_rate": 6.815602374274347e-07, "loss": 0.8776, "step": 5690 }, { "epoch": 0.352893791052175, "grad_norm": 1.8746436834335327, "learning_rate": 6.812341008414324e-07, "loss": 0.8915, "step": 5695 }, { "epoch": 0.3532036187879539, "grad_norm": 1.8268415927886963, "learning_rate": 6.809079642554302e-07, "loss": 0.8893, "step": 5700 }, { "epoch": 0.3535134465237328, "grad_norm": 1.6394538879394531, "learning_rate": 6.80581827669428e-07, "loss": 0.8348, "step": 5705 }, { "epoch": 0.3538232742595117, "grad_norm": 1.6181316375732422, "learning_rate": 6.802556910834257e-07, "loss": 0.8552, "step": 5710 }, { "epoch": 0.35413310199529063, "grad_norm": 1.4274879693984985, "learning_rate": 6.799295544974235e-07, "loss": 0.8641, "step": 5715 }, { "epoch": 0.3544429297310695, "grad_norm": 1.6039618253707886, "learning_rate": 6.796034179114212e-07, "loss": 0.8417, "step": 5720 }, { "epoch": 0.3547527574668484, "grad_norm": 1.6816076040267944, "learning_rate": 6.79277281325419e-07, "loss": 0.8299, "step": 5725 }, { "epoch": 0.35506258520262735, "grad_norm": 1.7878490686416626, "learning_rate": 6.789511447394169e-07, "loss": 0.8564, "step": 5730 }, { "epoch": 0.35537241293840627, "grad_norm": 1.75118088722229, "learning_rate": 6.786250081534146e-07, "loss": 0.821, "step": 5735 }, { "epoch": 0.35568224067418514, "grad_norm": 1.5188854932785034, "learning_rate": 6.782988715674124e-07, "loss": 0.8554, "step": 5740 }, { "epoch": 0.35599206840996406, "grad_norm": 1.566143274307251, "learning_rate": 6.779727349814102e-07, "loss": 0.8412, "step": 5745 }, { "epoch": 0.356301896145743, "grad_norm": 1.544657826423645, "learning_rate": 6.776465983954079e-07, "loss": 0.8393, "step": 5750 }, { "epoch": 0.35661172388152185, "grad_norm": 1.539481520652771, "learning_rate": 6.773204618094057e-07, "loss": 0.8734, "step": 5755 }, { "epoch": 0.35692155161730077, "grad_norm": 1.465437412261963, "learning_rate": 6.769943252234035e-07, "loss": 0.8965, "step": 5760 }, { "epoch": 0.3572313793530797, "grad_norm": 1.808502435684204, "learning_rate": 6.766681886374013e-07, "loss": 0.8914, "step": 5765 }, { "epoch": 0.3575412070888586, "grad_norm": 1.769059658050537, "learning_rate": 6.763420520513991e-07, "loss": 0.8963, "step": 5770 }, { "epoch": 0.3578510348246375, "grad_norm": 1.557330846786499, "learning_rate": 6.760159154653968e-07, "loss": 0.863, "step": 5775 }, { "epoch": 0.3581608625604164, "grad_norm": 1.8513100147247314, "learning_rate": 6.756897788793947e-07, "loss": 0.9111, "step": 5780 }, { "epoch": 0.3584706902961953, "grad_norm": 1.561990737915039, "learning_rate": 6.753636422933925e-07, "loss": 0.9136, "step": 5785 }, { "epoch": 0.35878051803197425, "grad_norm": 1.581010341644287, "learning_rate": 6.750375057073902e-07, "loss": 0.867, "step": 5790 }, { "epoch": 0.3590903457677531, "grad_norm": 1.8000602722167969, "learning_rate": 6.74711369121388e-07, "loss": 0.8687, "step": 5795 }, { "epoch": 0.35940017350353204, "grad_norm": 1.5388697385787964, "learning_rate": 6.743852325353859e-07, "loss": 0.8759, "step": 5800 }, { "epoch": 0.35971000123931096, "grad_norm": 1.4389694929122925, "learning_rate": 6.740590959493836e-07, "loss": 0.8068, "step": 5805 }, { "epoch": 0.36001982897508983, "grad_norm": 1.8099441528320312, "learning_rate": 6.737329593633814e-07, "loss": 0.8595, "step": 5810 }, { "epoch": 0.36032965671086875, "grad_norm": 1.5408843755722046, "learning_rate": 6.734068227773792e-07, "loss": 0.7999, "step": 5815 }, { "epoch": 0.3606394844466477, "grad_norm": 1.8451995849609375, "learning_rate": 6.730806861913769e-07, "loss": 0.837, "step": 5820 }, { "epoch": 0.3609493121824266, "grad_norm": 1.5536173582077026, "learning_rate": 6.727545496053747e-07, "loss": 0.849, "step": 5825 }, { "epoch": 0.36125913991820546, "grad_norm": 1.8310972452163696, "learning_rate": 6.724284130193724e-07, "loss": 0.931, "step": 5830 }, { "epoch": 0.3615689676539844, "grad_norm": 2.0105249881744385, "learning_rate": 6.721022764333702e-07, "loss": 0.8632, "step": 5835 }, { "epoch": 0.3618787953897633, "grad_norm": 1.82045578956604, "learning_rate": 6.717761398473681e-07, "loss": 0.8877, "step": 5840 }, { "epoch": 0.3621886231255422, "grad_norm": 1.4446182250976562, "learning_rate": 6.714500032613658e-07, "loss": 0.8447, "step": 5845 }, { "epoch": 0.3624984508613211, "grad_norm": 1.6133482456207275, "learning_rate": 6.711238666753636e-07, "loss": 0.8175, "step": 5850 }, { "epoch": 0.3628082785971, "grad_norm": 1.811903953552246, "learning_rate": 6.707977300893614e-07, "loss": 0.8792, "step": 5855 }, { "epoch": 0.36311810633287894, "grad_norm": 1.4996345043182373, "learning_rate": 6.704715935033591e-07, "loss": 0.8803, "step": 5860 }, { "epoch": 0.3634279340686578, "grad_norm": 1.6116666793823242, "learning_rate": 6.701454569173569e-07, "loss": 0.9272, "step": 5865 }, { "epoch": 0.36373776180443673, "grad_norm": 1.634905219078064, "learning_rate": 6.698193203313547e-07, "loss": 0.873, "step": 5870 }, { "epoch": 0.36404758954021565, "grad_norm": 1.5207717418670654, "learning_rate": 6.694931837453526e-07, "loss": 0.8152, "step": 5875 }, { "epoch": 0.3643574172759945, "grad_norm": 1.7275651693344116, "learning_rate": 6.691670471593504e-07, "loss": 0.8569, "step": 5880 }, { "epoch": 0.36466724501177344, "grad_norm": 1.8202307224273682, "learning_rate": 6.688409105733481e-07, "loss": 0.9037, "step": 5885 }, { "epoch": 0.36497707274755237, "grad_norm": 1.5687191486358643, "learning_rate": 6.685147739873459e-07, "loss": 0.8353, "step": 5890 }, { "epoch": 0.3652869004833313, "grad_norm": 1.593938946723938, "learning_rate": 6.681886374013437e-07, "loss": 0.8582, "step": 5895 }, { "epoch": 0.36559672821911016, "grad_norm": 1.9771876335144043, "learning_rate": 6.678625008153414e-07, "loss": 0.8393, "step": 5900 }, { "epoch": 0.3659065559548891, "grad_norm": 1.7116756439208984, "learning_rate": 6.675363642293392e-07, "loss": 0.8711, "step": 5905 }, { "epoch": 0.366216383690668, "grad_norm": 1.691663384437561, "learning_rate": 6.67210227643337e-07, "loss": 0.8904, "step": 5910 }, { "epoch": 0.36652621142644687, "grad_norm": 1.689968466758728, "learning_rate": 6.668840910573348e-07, "loss": 0.8108, "step": 5915 }, { "epoch": 0.3668360391622258, "grad_norm": 1.5498578548431396, "learning_rate": 6.665579544713326e-07, "loss": 0.8793, "step": 5920 }, { "epoch": 0.3671458668980047, "grad_norm": 1.7322367429733276, "learning_rate": 6.662318178853304e-07, "loss": 0.8805, "step": 5925 }, { "epoch": 0.36745569463378364, "grad_norm": 1.7488521337509155, "learning_rate": 6.659056812993281e-07, "loss": 0.9012, "step": 5930 }, { "epoch": 0.3677655223695625, "grad_norm": 1.743172287940979, "learning_rate": 6.655795447133259e-07, "loss": 0.8625, "step": 5935 }, { "epoch": 0.3680753501053414, "grad_norm": 1.9755825996398926, "learning_rate": 6.652534081273236e-07, "loss": 0.8669, "step": 5940 }, { "epoch": 0.36838517784112035, "grad_norm": 1.4267288446426392, "learning_rate": 6.649272715413214e-07, "loss": 0.9069, "step": 5945 }, { "epoch": 0.36869500557689927, "grad_norm": 1.456490397453308, "learning_rate": 6.646011349553192e-07, "loss": 0.8455, "step": 5950 }, { "epoch": 0.36900483331267814, "grad_norm": 1.5797516107559204, "learning_rate": 6.64274998369317e-07, "loss": 0.82, "step": 5955 }, { "epoch": 0.36931466104845706, "grad_norm": 1.6083248853683472, "learning_rate": 6.639488617833148e-07, "loss": 0.8796, "step": 5960 }, { "epoch": 0.369624488784236, "grad_norm": 1.538562536239624, "learning_rate": 6.636227251973126e-07, "loss": 0.9321, "step": 5965 }, { "epoch": 0.36993431652001485, "grad_norm": 1.708044171333313, "learning_rate": 6.632965886113104e-07, "loss": 0.8633, "step": 5970 }, { "epoch": 0.3702441442557938, "grad_norm": 1.5792293548583984, "learning_rate": 6.629704520253082e-07, "loss": 0.8496, "step": 5975 }, { "epoch": 0.3705539719915727, "grad_norm": 1.862945318222046, "learning_rate": 6.62644315439306e-07, "loss": 0.8262, "step": 5980 }, { "epoch": 0.3708637997273516, "grad_norm": 1.6870609521865845, "learning_rate": 6.623181788533037e-07, "loss": 0.847, "step": 5985 }, { "epoch": 0.3711736274631305, "grad_norm": 1.4812041521072388, "learning_rate": 6.619920422673016e-07, "loss": 0.9064, "step": 5990 }, { "epoch": 0.3714834551989094, "grad_norm": 1.7106434106826782, "learning_rate": 6.616659056812993e-07, "loss": 0.8841, "step": 5995 }, { "epoch": 0.37179328293468833, "grad_norm": 1.710314393043518, "learning_rate": 6.613397690952971e-07, "loss": 0.8954, "step": 6000 }, { "epoch": 0.3721031106704672, "grad_norm": 1.9861669540405273, "learning_rate": 6.610136325092949e-07, "loss": 0.8196, "step": 6005 }, { "epoch": 0.3724129384062461, "grad_norm": 1.7173964977264404, "learning_rate": 6.606874959232926e-07, "loss": 0.7905, "step": 6010 }, { "epoch": 0.37272276614202504, "grad_norm": 2.136364221572876, "learning_rate": 6.603613593372904e-07, "loss": 0.9156, "step": 6015 }, { "epoch": 0.37303259387780396, "grad_norm": 1.6675065755844116, "learning_rate": 6.600352227512882e-07, "loss": 0.8613, "step": 6020 }, { "epoch": 0.37334242161358283, "grad_norm": 1.6147140264511108, "learning_rate": 6.59709086165286e-07, "loss": 0.8518, "step": 6025 }, { "epoch": 0.37365224934936175, "grad_norm": 1.5297605991363525, "learning_rate": 6.593829495792838e-07, "loss": 0.8849, "step": 6030 }, { "epoch": 0.3739620770851407, "grad_norm": 1.5885374546051025, "learning_rate": 6.590568129932816e-07, "loss": 0.8759, "step": 6035 }, { "epoch": 0.37427190482091954, "grad_norm": 1.664167046546936, "learning_rate": 6.587306764072793e-07, "loss": 0.8535, "step": 6040 }, { "epoch": 0.37458173255669847, "grad_norm": 1.6754803657531738, "learning_rate": 6.584045398212771e-07, "loss": 0.8183, "step": 6045 }, { "epoch": 0.3748915602924774, "grad_norm": 1.4219242334365845, "learning_rate": 6.580784032352748e-07, "loss": 0.8832, "step": 6050 }, { "epoch": 0.3752013880282563, "grad_norm": 1.5852632522583008, "learning_rate": 6.577522666492726e-07, "loss": 0.8933, "step": 6055 }, { "epoch": 0.3755112157640352, "grad_norm": 1.6237558126449585, "learning_rate": 6.574261300632705e-07, "loss": 0.7741, "step": 6060 }, { "epoch": 0.3758210434998141, "grad_norm": 2.0971627235412598, "learning_rate": 6.570999934772683e-07, "loss": 0.8199, "step": 6065 }, { "epoch": 0.376130871235593, "grad_norm": 1.7529032230377197, "learning_rate": 6.567738568912661e-07, "loss": 0.8621, "step": 6070 }, { "epoch": 0.3764406989713719, "grad_norm": 1.5983155965805054, "learning_rate": 6.564477203052639e-07, "loss": 0.8291, "step": 6075 }, { "epoch": 0.3767505267071508, "grad_norm": 1.8827588558197021, "learning_rate": 6.561215837192616e-07, "loss": 0.7945, "step": 6080 }, { "epoch": 0.37706035444292973, "grad_norm": 1.7689261436462402, "learning_rate": 6.557954471332594e-07, "loss": 0.9019, "step": 6085 }, { "epoch": 0.37737018217870866, "grad_norm": 1.5372118949890137, "learning_rate": 6.554693105472572e-07, "loss": 0.8585, "step": 6090 }, { "epoch": 0.3776800099144875, "grad_norm": 1.4399596452713013, "learning_rate": 6.551431739612549e-07, "loss": 0.8235, "step": 6095 }, { "epoch": 0.37798983765026645, "grad_norm": 1.9435986280441284, "learning_rate": 6.548170373752528e-07, "loss": 0.8355, "step": 6100 }, { "epoch": 0.37829966538604537, "grad_norm": 1.7626937627792358, "learning_rate": 6.544909007892505e-07, "loss": 0.8758, "step": 6105 }, { "epoch": 0.3786094931218243, "grad_norm": 1.6854530572891235, "learning_rate": 6.541647642032483e-07, "loss": 0.8444, "step": 6110 }, { "epoch": 0.37891932085760316, "grad_norm": 1.5007585287094116, "learning_rate": 6.538386276172461e-07, "loss": 0.8574, "step": 6115 }, { "epoch": 0.3792291485933821, "grad_norm": 1.695221185684204, "learning_rate": 6.535124910312438e-07, "loss": 0.8381, "step": 6120 }, { "epoch": 0.379538976329161, "grad_norm": 1.7978490591049194, "learning_rate": 6.531863544452416e-07, "loss": 0.8276, "step": 6125 }, { "epoch": 0.37984880406493987, "grad_norm": 1.6524189710617065, "learning_rate": 6.528602178592394e-07, "loss": 0.8396, "step": 6130 }, { "epoch": 0.3801586318007188, "grad_norm": 1.6175462007522583, "learning_rate": 6.525340812732371e-07, "loss": 0.9416, "step": 6135 }, { "epoch": 0.3804684595364977, "grad_norm": 1.8891555070877075, "learning_rate": 6.52207944687235e-07, "loss": 0.9189, "step": 6140 }, { "epoch": 0.38077828727227664, "grad_norm": 1.7976359128952026, "learning_rate": 6.518818081012328e-07, "loss": 0.8387, "step": 6145 }, { "epoch": 0.3810881150080555, "grad_norm": 1.6718403100967407, "learning_rate": 6.515556715152305e-07, "loss": 0.9185, "step": 6150 }, { "epoch": 0.38139794274383443, "grad_norm": 1.4905260801315308, "learning_rate": 6.512295349292284e-07, "loss": 0.8549, "step": 6155 }, { "epoch": 0.38170777047961335, "grad_norm": 1.8416588306427002, "learning_rate": 6.509033983432261e-07, "loss": 0.8361, "step": 6160 }, { "epoch": 0.3820175982153922, "grad_norm": 1.640128254890442, "learning_rate": 6.505772617572239e-07, "loss": 0.8025, "step": 6165 }, { "epoch": 0.38232742595117114, "grad_norm": 1.7604258060455322, "learning_rate": 6.502511251712217e-07, "loss": 0.8924, "step": 6170 }, { "epoch": 0.38263725368695006, "grad_norm": 1.6893998384475708, "learning_rate": 6.499249885852195e-07, "loss": 0.8731, "step": 6175 }, { "epoch": 0.382947081422729, "grad_norm": 1.5772591829299927, "learning_rate": 6.495988519992173e-07, "loss": 0.8452, "step": 6180 }, { "epoch": 0.38325690915850785, "grad_norm": 1.8576292991638184, "learning_rate": 6.492727154132151e-07, "loss": 0.8236, "step": 6185 }, { "epoch": 0.3835667368942868, "grad_norm": 1.861890435218811, "learning_rate": 6.489465788272128e-07, "loss": 0.8718, "step": 6190 }, { "epoch": 0.3838765646300657, "grad_norm": 1.7861816883087158, "learning_rate": 6.486204422412106e-07, "loss": 0.8465, "step": 6195 }, { "epoch": 0.38418639236584456, "grad_norm": 1.7394684553146362, "learning_rate": 6.482943056552083e-07, "loss": 0.8586, "step": 6200 }, { "epoch": 0.3844962201016235, "grad_norm": 1.72090744972229, "learning_rate": 6.479681690692061e-07, "loss": 0.8324, "step": 6205 }, { "epoch": 0.3848060478374024, "grad_norm": 1.8088411092758179, "learning_rate": 6.476420324832039e-07, "loss": 0.9149, "step": 6210 }, { "epoch": 0.38511587557318133, "grad_norm": 2.070356607437134, "learning_rate": 6.473158958972017e-07, "loss": 0.8281, "step": 6215 }, { "epoch": 0.3854257033089602, "grad_norm": 1.6076574325561523, "learning_rate": 6.469897593111995e-07, "loss": 0.853, "step": 6220 }, { "epoch": 0.3857355310447391, "grad_norm": 1.8331372737884521, "learning_rate": 6.466636227251973e-07, "loss": 0.9023, "step": 6225 }, { "epoch": 0.38604535878051804, "grad_norm": 1.704483985900879, "learning_rate": 6.46337486139195e-07, "loss": 0.8466, "step": 6230 }, { "epoch": 0.3863551865162969, "grad_norm": 1.5919055938720703, "learning_rate": 6.460113495531928e-07, "loss": 0.7859, "step": 6235 }, { "epoch": 0.38666501425207583, "grad_norm": 1.8490043878555298, "learning_rate": 6.456852129671906e-07, "loss": 0.822, "step": 6240 }, { "epoch": 0.38697484198785476, "grad_norm": 1.6369072198867798, "learning_rate": 6.453590763811883e-07, "loss": 0.8381, "step": 6245 }, { "epoch": 0.3872846697236337, "grad_norm": 1.631442666053772, "learning_rate": 6.450329397951863e-07, "loss": 0.9066, "step": 6250 }, { "epoch": 0.38759449745941255, "grad_norm": 1.5469526052474976, "learning_rate": 6.447068032091841e-07, "loss": 0.8231, "step": 6255 }, { "epoch": 0.38790432519519147, "grad_norm": 1.6528466939926147, "learning_rate": 6.443806666231818e-07, "loss": 0.837, "step": 6260 }, { "epoch": 0.3882141529309704, "grad_norm": 1.5440707206726074, "learning_rate": 6.440545300371796e-07, "loss": 0.8934, "step": 6265 }, { "epoch": 0.3885239806667493, "grad_norm": 1.6922236680984497, "learning_rate": 6.437283934511773e-07, "loss": 0.8651, "step": 6270 }, { "epoch": 0.3888338084025282, "grad_norm": 1.5694608688354492, "learning_rate": 6.434022568651751e-07, "loss": 0.886, "step": 6275 }, { "epoch": 0.3891436361383071, "grad_norm": 1.5373364686965942, "learning_rate": 6.430761202791729e-07, "loss": 0.7869, "step": 6280 }, { "epoch": 0.389453463874086, "grad_norm": 1.9022843837738037, "learning_rate": 6.427499836931707e-07, "loss": 0.8716, "step": 6285 }, { "epoch": 0.3897632916098649, "grad_norm": 1.5637811422348022, "learning_rate": 6.424238471071685e-07, "loss": 0.8958, "step": 6290 }, { "epoch": 0.3900731193456438, "grad_norm": 1.9415884017944336, "learning_rate": 6.420977105211663e-07, "loss": 0.8793, "step": 6295 }, { "epoch": 0.39038294708142274, "grad_norm": 1.682353138923645, "learning_rate": 6.41771573935164e-07, "loss": 0.8751, "step": 6300 }, { "epoch": 0.39069277481720166, "grad_norm": 1.8144721984863281, "learning_rate": 6.414454373491618e-07, "loss": 0.9274, "step": 6305 }, { "epoch": 0.3910026025529805, "grad_norm": 1.5728809833526611, "learning_rate": 6.411193007631595e-07, "loss": 0.8748, "step": 6310 }, { "epoch": 0.39131243028875945, "grad_norm": 1.7594609260559082, "learning_rate": 6.407931641771573e-07, "loss": 0.8531, "step": 6315 }, { "epoch": 0.39162225802453837, "grad_norm": 1.8184328079223633, "learning_rate": 6.404670275911551e-07, "loss": 0.867, "step": 6320 }, { "epoch": 0.39193208576031724, "grad_norm": 1.557937502861023, "learning_rate": 6.401408910051529e-07, "loss": 0.8738, "step": 6325 }, { "epoch": 0.39224191349609616, "grad_norm": 1.9380111694335938, "learning_rate": 6.398147544191507e-07, "loss": 0.8542, "step": 6330 }, { "epoch": 0.3925517412318751, "grad_norm": 1.8863641023635864, "learning_rate": 6.394886178331485e-07, "loss": 0.8612, "step": 6335 }, { "epoch": 0.392861568967654, "grad_norm": 1.5802968740463257, "learning_rate": 6.391624812471462e-07, "loss": 0.8289, "step": 6340 }, { "epoch": 0.3931713967034329, "grad_norm": 1.5528584718704224, "learning_rate": 6.388363446611441e-07, "loss": 0.8361, "step": 6345 }, { "epoch": 0.3934812244392118, "grad_norm": 1.7312124967575073, "learning_rate": 6.385102080751419e-07, "loss": 0.822, "step": 6350 }, { "epoch": 0.3937910521749907, "grad_norm": 1.575799822807312, "learning_rate": 6.381840714891396e-07, "loss": 0.8176, "step": 6355 }, { "epoch": 0.3941008799107696, "grad_norm": 1.7014546394348145, "learning_rate": 6.378579349031375e-07, "loss": 0.8946, "step": 6360 }, { "epoch": 0.3944107076465485, "grad_norm": 1.75917649269104, "learning_rate": 6.375317983171353e-07, "loss": 0.9177, "step": 6365 }, { "epoch": 0.39472053538232743, "grad_norm": 1.5316523313522339, "learning_rate": 6.37205661731133e-07, "loss": 0.8241, "step": 6370 }, { "epoch": 0.39503036311810635, "grad_norm": 1.7339751720428467, "learning_rate": 6.368795251451308e-07, "loss": 0.9172, "step": 6375 }, { "epoch": 0.3953401908538852, "grad_norm": 1.7574272155761719, "learning_rate": 6.365533885591285e-07, "loss": 0.9161, "step": 6380 }, { "epoch": 0.39565001858966414, "grad_norm": 1.6766228675842285, "learning_rate": 6.362272519731263e-07, "loss": 0.8016, "step": 6385 }, { "epoch": 0.39595984632544307, "grad_norm": 1.9195177555084229, "learning_rate": 6.359011153871241e-07, "loss": 0.8199, "step": 6390 }, { "epoch": 0.396269674061222, "grad_norm": 1.8629800081253052, "learning_rate": 6.355749788011218e-07, "loss": 0.8625, "step": 6395 }, { "epoch": 0.39657950179700086, "grad_norm": 1.827751874923706, "learning_rate": 6.352488422151197e-07, "loss": 0.8434, "step": 6400 }, { "epoch": 0.3968893295327798, "grad_norm": 1.6282315254211426, "learning_rate": 6.349227056291175e-07, "loss": 0.8436, "step": 6405 }, { "epoch": 0.3971991572685587, "grad_norm": 1.6504688262939453, "learning_rate": 6.345965690431152e-07, "loss": 0.8235, "step": 6410 }, { "epoch": 0.39750898500433757, "grad_norm": 1.6056429147720337, "learning_rate": 6.34270432457113e-07, "loss": 0.8983, "step": 6415 }, { "epoch": 0.3978188127401165, "grad_norm": 1.5263961553573608, "learning_rate": 6.339442958711107e-07, "loss": 0.8725, "step": 6420 }, { "epoch": 0.3981286404758954, "grad_norm": 1.5344408750534058, "learning_rate": 6.336181592851085e-07, "loss": 0.8295, "step": 6425 }, { "epoch": 0.39843846821167433, "grad_norm": 1.7054778337478638, "learning_rate": 6.332920226991063e-07, "loss": 0.8296, "step": 6430 }, { "epoch": 0.3987482959474532, "grad_norm": 1.7735393047332764, "learning_rate": 6.329658861131041e-07, "loss": 0.8868, "step": 6435 }, { "epoch": 0.3990581236832321, "grad_norm": 2.200562000274658, "learning_rate": 6.32639749527102e-07, "loss": 0.8742, "step": 6440 }, { "epoch": 0.39936795141901105, "grad_norm": 1.7043215036392212, "learning_rate": 6.323136129410998e-07, "loss": 0.9286, "step": 6445 }, { "epoch": 0.3996777791547899, "grad_norm": 1.6746608018875122, "learning_rate": 6.319874763550975e-07, "loss": 0.8353, "step": 6450 }, { "epoch": 0.39998760689056884, "grad_norm": 1.827256441116333, "learning_rate": 6.316613397690953e-07, "loss": 0.8662, "step": 6455 }, { "epoch": 0.40029743462634776, "grad_norm": 1.5622025728225708, "learning_rate": 6.313352031830931e-07, "loss": 0.8378, "step": 6460 }, { "epoch": 0.4006072623621267, "grad_norm": 1.7612636089324951, "learning_rate": 6.310090665970908e-07, "loss": 0.8843, "step": 6465 }, { "epoch": 0.40091709009790555, "grad_norm": 1.6234294176101685, "learning_rate": 6.306829300110886e-07, "loss": 0.858, "step": 6470 }, { "epoch": 0.40122691783368447, "grad_norm": 1.618658423423767, "learning_rate": 6.303567934250865e-07, "loss": 0.8797, "step": 6475 }, { "epoch": 0.4015367455694634, "grad_norm": 1.4538562297821045, "learning_rate": 6.300306568390842e-07, "loss": 0.8417, "step": 6480 }, { "epoch": 0.40184657330524226, "grad_norm": 1.5981987714767456, "learning_rate": 6.29704520253082e-07, "loss": 0.8378, "step": 6485 }, { "epoch": 0.4021564010410212, "grad_norm": 1.5594658851623535, "learning_rate": 6.293783836670797e-07, "loss": 0.8675, "step": 6490 }, { "epoch": 0.4024662287768001, "grad_norm": 1.5611145496368408, "learning_rate": 6.290522470810775e-07, "loss": 0.8572, "step": 6495 }, { "epoch": 0.40277605651257903, "grad_norm": 1.8712704181671143, "learning_rate": 6.287261104950753e-07, "loss": 0.8171, "step": 6500 }, { "epoch": 0.4030858842483579, "grad_norm": 1.5056520700454712, "learning_rate": 6.28399973909073e-07, "loss": 0.802, "step": 6505 }, { "epoch": 0.4033957119841368, "grad_norm": 1.5432614088058472, "learning_rate": 6.280738373230709e-07, "loss": 0.8093, "step": 6510 }, { "epoch": 0.40370553971991574, "grad_norm": 1.679118275642395, "learning_rate": 6.277477007370687e-07, "loss": 0.8608, "step": 6515 }, { "epoch": 0.4040153674556946, "grad_norm": 2.1562373638153076, "learning_rate": 6.274215641510664e-07, "loss": 0.8793, "step": 6520 }, { "epoch": 0.40432519519147353, "grad_norm": 1.6759288311004639, "learning_rate": 6.270954275650642e-07, "loss": 0.8647, "step": 6525 }, { "epoch": 0.40463502292725245, "grad_norm": 1.6393519639968872, "learning_rate": 6.267692909790619e-07, "loss": 0.9733, "step": 6530 }, { "epoch": 0.4049448506630314, "grad_norm": 1.7785437107086182, "learning_rate": 6.264431543930598e-07, "loss": 0.8961, "step": 6535 }, { "epoch": 0.40525467839881024, "grad_norm": 1.6839531660079956, "learning_rate": 6.261170178070576e-07, "loss": 0.8522, "step": 6540 }, { "epoch": 0.40556450613458916, "grad_norm": 1.5621917247772217, "learning_rate": 6.257908812210553e-07, "loss": 0.8261, "step": 6545 }, { "epoch": 0.4058743338703681, "grad_norm": 1.7521804571151733, "learning_rate": 6.254647446350532e-07, "loss": 0.8565, "step": 6550 }, { "epoch": 0.406184161606147, "grad_norm": 1.5996302366256714, "learning_rate": 6.25138608049051e-07, "loss": 0.8054, "step": 6555 }, { "epoch": 0.4064939893419259, "grad_norm": 1.673748254776001, "learning_rate": 6.248124714630487e-07, "loss": 0.8238, "step": 6560 }, { "epoch": 0.4068038170777048, "grad_norm": 1.6913408041000366, "learning_rate": 6.244863348770465e-07, "loss": 0.8426, "step": 6565 }, { "epoch": 0.4071136448134837, "grad_norm": 1.8693901300430298, "learning_rate": 6.241601982910443e-07, "loss": 0.9049, "step": 6570 }, { "epoch": 0.4074234725492626, "grad_norm": 1.547855257987976, "learning_rate": 6.23834061705042e-07, "loss": 0.8599, "step": 6575 }, { "epoch": 0.4077333002850415, "grad_norm": 1.593643069267273, "learning_rate": 6.235079251190398e-07, "loss": 0.8518, "step": 6580 }, { "epoch": 0.40804312802082043, "grad_norm": 2.1446425914764404, "learning_rate": 6.231817885330376e-07, "loss": 0.8971, "step": 6585 }, { "epoch": 0.40835295575659936, "grad_norm": 1.6568756103515625, "learning_rate": 6.228556519470354e-07, "loss": 0.8536, "step": 6590 }, { "epoch": 0.4086627834923782, "grad_norm": 1.9221653938293457, "learning_rate": 6.225295153610332e-07, "loss": 0.8395, "step": 6595 }, { "epoch": 0.40897261122815715, "grad_norm": 1.7620422840118408, "learning_rate": 6.222033787750309e-07, "loss": 0.9081, "step": 6600 }, { "epoch": 0.40928243896393607, "grad_norm": 1.7669458389282227, "learning_rate": 6.218772421890287e-07, "loss": 0.9159, "step": 6605 }, { "epoch": 0.40959226669971494, "grad_norm": 1.669142246246338, "learning_rate": 6.215511056030265e-07, "loss": 0.8697, "step": 6610 }, { "epoch": 0.40990209443549386, "grad_norm": 2.9333465099334717, "learning_rate": 6.212249690170242e-07, "loss": 0.8875, "step": 6615 }, { "epoch": 0.4102119221712728, "grad_norm": 1.4787917137145996, "learning_rate": 6.20898832431022e-07, "loss": 0.7818, "step": 6620 }, { "epoch": 0.4105217499070517, "grad_norm": 1.986846923828125, "learning_rate": 6.205726958450199e-07, "loss": 0.8839, "step": 6625 }, { "epoch": 0.41083157764283057, "grad_norm": 1.6744568347930908, "learning_rate": 6.202465592590177e-07, "loss": 0.8646, "step": 6630 }, { "epoch": 0.4111414053786095, "grad_norm": 1.524312138557434, "learning_rate": 6.199204226730155e-07, "loss": 0.85, "step": 6635 }, { "epoch": 0.4114512331143884, "grad_norm": 1.7520464658737183, "learning_rate": 6.195942860870132e-07, "loss": 0.8975, "step": 6640 }, { "epoch": 0.4117610608501673, "grad_norm": 1.7019010782241821, "learning_rate": 6.19268149501011e-07, "loss": 0.8515, "step": 6645 }, { "epoch": 0.4120708885859462, "grad_norm": 1.6939024925231934, "learning_rate": 6.189420129150088e-07, "loss": 0.8655, "step": 6650 }, { "epoch": 0.4123807163217251, "grad_norm": 1.4399261474609375, "learning_rate": 6.186158763290065e-07, "loss": 0.8231, "step": 6655 }, { "epoch": 0.41269054405750405, "grad_norm": 1.6720490455627441, "learning_rate": 6.182897397430044e-07, "loss": 0.8848, "step": 6660 }, { "epoch": 0.4130003717932829, "grad_norm": 1.7691296339035034, "learning_rate": 6.179636031570022e-07, "loss": 0.8617, "step": 6665 }, { "epoch": 0.41331019952906184, "grad_norm": 2.1569337844848633, "learning_rate": 6.176374665709999e-07, "loss": 0.8495, "step": 6670 }, { "epoch": 0.41362002726484076, "grad_norm": 1.8180011510849, "learning_rate": 6.173113299849977e-07, "loss": 0.892, "step": 6675 }, { "epoch": 0.41392985500061963, "grad_norm": 1.8444069623947144, "learning_rate": 6.169851933989955e-07, "loss": 0.9347, "step": 6680 }, { "epoch": 0.41423968273639855, "grad_norm": 1.799382209777832, "learning_rate": 6.166590568129932e-07, "loss": 0.7896, "step": 6685 }, { "epoch": 0.4145495104721775, "grad_norm": 1.6070363521575928, "learning_rate": 6.16332920226991e-07, "loss": 0.8414, "step": 6690 }, { "epoch": 0.4148593382079564, "grad_norm": 1.5467098951339722, "learning_rate": 6.160067836409887e-07, "loss": 0.8515, "step": 6695 }, { "epoch": 0.41516916594373526, "grad_norm": 1.5219910144805908, "learning_rate": 6.156806470549866e-07, "loss": 0.8702, "step": 6700 }, { "epoch": 0.4154789936795142, "grad_norm": 1.6719703674316406, "learning_rate": 6.153545104689844e-07, "loss": 0.7968, "step": 6705 }, { "epoch": 0.4157888214152931, "grad_norm": 1.9076589345932007, "learning_rate": 6.150283738829821e-07, "loss": 0.8824, "step": 6710 }, { "epoch": 0.41609864915107203, "grad_norm": 1.6300450563430786, "learning_rate": 6.147022372969799e-07, "loss": 0.8644, "step": 6715 }, { "epoch": 0.4164084768868509, "grad_norm": 1.8007107973098755, "learning_rate": 6.143761007109778e-07, "loss": 0.8615, "step": 6720 }, { "epoch": 0.4167183046226298, "grad_norm": 1.793091893196106, "learning_rate": 6.140499641249755e-07, "loss": 0.8478, "step": 6725 }, { "epoch": 0.41702813235840874, "grad_norm": 1.7805238962173462, "learning_rate": 6.137238275389733e-07, "loss": 0.8573, "step": 6730 }, { "epoch": 0.4173379600941876, "grad_norm": 1.8016893863677979, "learning_rate": 6.133976909529712e-07, "loss": 0.8972, "step": 6735 }, { "epoch": 0.41764778782996653, "grad_norm": 1.676885962486267, "learning_rate": 6.130715543669689e-07, "loss": 0.8575, "step": 6740 }, { "epoch": 0.41795761556574546, "grad_norm": 1.5361896753311157, "learning_rate": 6.127454177809667e-07, "loss": 0.871, "step": 6745 }, { "epoch": 0.4182674433015244, "grad_norm": 1.9413020610809326, "learning_rate": 6.124192811949644e-07, "loss": 0.8762, "step": 6750 }, { "epoch": 0.41857727103730324, "grad_norm": 1.5886861085891724, "learning_rate": 6.120931446089622e-07, "loss": 0.9325, "step": 6755 }, { "epoch": 0.41888709877308217, "grad_norm": 1.7440290451049805, "learning_rate": 6.1176700802296e-07, "loss": 0.9154, "step": 6760 }, { "epoch": 0.4191969265088611, "grad_norm": 1.6146632432937622, "learning_rate": 6.114408714369577e-07, "loss": 0.846, "step": 6765 }, { "epoch": 0.41950675424463996, "grad_norm": 1.5587791204452515, "learning_rate": 6.111147348509556e-07, "loss": 0.8147, "step": 6770 }, { "epoch": 0.4198165819804189, "grad_norm": 1.575742244720459, "learning_rate": 6.107885982649534e-07, "loss": 0.9017, "step": 6775 }, { "epoch": 0.4201264097161978, "grad_norm": 1.742545247077942, "learning_rate": 6.104624616789511e-07, "loss": 0.8358, "step": 6780 }, { "epoch": 0.4204362374519767, "grad_norm": 1.7107418775558472, "learning_rate": 6.101363250929489e-07, "loss": 0.8726, "step": 6785 }, { "epoch": 0.4207460651877556, "grad_norm": 1.8395441770553589, "learning_rate": 6.098101885069467e-07, "loss": 0.8792, "step": 6790 }, { "epoch": 0.4210558929235345, "grad_norm": 1.6125928163528442, "learning_rate": 6.094840519209444e-07, "loss": 0.8179, "step": 6795 }, { "epoch": 0.42136572065931344, "grad_norm": 1.379232406616211, "learning_rate": 6.091579153349422e-07, "loss": 0.8222, "step": 6800 }, { "epoch": 0.4216755483950923, "grad_norm": 1.6595711708068848, "learning_rate": 6.088317787489399e-07, "loss": 0.8127, "step": 6805 }, { "epoch": 0.4219853761308712, "grad_norm": 1.7200415134429932, "learning_rate": 6.085056421629378e-07, "loss": 0.8235, "step": 6810 }, { "epoch": 0.42229520386665015, "grad_norm": 1.6999988555908203, "learning_rate": 6.081795055769357e-07, "loss": 0.8514, "step": 6815 }, { "epoch": 0.42260503160242907, "grad_norm": 1.8235008716583252, "learning_rate": 6.078533689909334e-07, "loss": 0.8726, "step": 6820 }, { "epoch": 0.42291485933820794, "grad_norm": 1.8066351413726807, "learning_rate": 6.075272324049312e-07, "loss": 0.8252, "step": 6825 }, { "epoch": 0.42322468707398686, "grad_norm": 1.671120524406433, "learning_rate": 6.07201095818929e-07, "loss": 0.8334, "step": 6830 }, { "epoch": 0.4235345148097658, "grad_norm": 1.6555278301239014, "learning_rate": 6.068749592329267e-07, "loss": 0.8295, "step": 6835 }, { "epoch": 0.42384434254554465, "grad_norm": 1.6640400886535645, "learning_rate": 6.065488226469245e-07, "loss": 0.9121, "step": 6840 }, { "epoch": 0.4241541702813236, "grad_norm": 1.7470093965530396, "learning_rate": 6.062226860609224e-07, "loss": 0.8672, "step": 6845 }, { "epoch": 0.4244639980171025, "grad_norm": 1.7477803230285645, "learning_rate": 6.058965494749201e-07, "loss": 0.901, "step": 6850 }, { "epoch": 0.4247738257528814, "grad_norm": 1.512689232826233, "learning_rate": 6.055704128889179e-07, "loss": 0.7943, "step": 6855 }, { "epoch": 0.4250836534886603, "grad_norm": 1.5522472858428955, "learning_rate": 6.052442763029156e-07, "loss": 0.8448, "step": 6860 }, { "epoch": 0.4253934812244392, "grad_norm": 1.6592601537704468, "learning_rate": 6.049181397169134e-07, "loss": 0.8834, "step": 6865 }, { "epoch": 0.42570330896021813, "grad_norm": 1.6181071996688843, "learning_rate": 6.045920031309112e-07, "loss": 0.8622, "step": 6870 }, { "epoch": 0.42601313669599705, "grad_norm": 1.6707855463027954, "learning_rate": 6.042658665449089e-07, "loss": 0.8578, "step": 6875 }, { "epoch": 0.4263229644317759, "grad_norm": 1.8487693071365356, "learning_rate": 6.039397299589067e-07, "loss": 0.8926, "step": 6880 }, { "epoch": 0.42663279216755484, "grad_norm": 1.8489042520523071, "learning_rate": 6.036135933729046e-07, "loss": 0.8561, "step": 6885 }, { "epoch": 0.42694261990333376, "grad_norm": 1.5851346254348755, "learning_rate": 6.032874567869023e-07, "loss": 0.8281, "step": 6890 }, { "epoch": 0.42725244763911263, "grad_norm": 1.765795350074768, "learning_rate": 6.029613202009001e-07, "loss": 0.9016, "step": 6895 }, { "epoch": 0.42756227537489155, "grad_norm": 1.7950501441955566, "learning_rate": 6.026351836148978e-07, "loss": 0.8663, "step": 6900 }, { "epoch": 0.4278721031106705, "grad_norm": 1.9288212060928345, "learning_rate": 6.023090470288956e-07, "loss": 0.8463, "step": 6905 }, { "epoch": 0.4281819308464494, "grad_norm": 1.5620641708374023, "learning_rate": 6.019829104428935e-07, "loss": 0.8536, "step": 6910 }, { "epoch": 0.42849175858222827, "grad_norm": 1.6721876859664917, "learning_rate": 6.016567738568912e-07, "loss": 0.8724, "step": 6915 }, { "epoch": 0.4288015863180072, "grad_norm": 1.6269890069961548, "learning_rate": 6.013306372708891e-07, "loss": 0.8448, "step": 6920 }, { "epoch": 0.4291114140537861, "grad_norm": 1.675096869468689, "learning_rate": 6.010045006848869e-07, "loss": 0.8706, "step": 6925 }, { "epoch": 0.429421241789565, "grad_norm": 1.8765989542007446, "learning_rate": 6.006783640988846e-07, "loss": 0.8857, "step": 6930 }, { "epoch": 0.4297310695253439, "grad_norm": 1.542959213256836, "learning_rate": 6.003522275128824e-07, "loss": 0.815, "step": 6935 }, { "epoch": 0.4300408972611228, "grad_norm": 1.6088926792144775, "learning_rate": 6.000260909268802e-07, "loss": 0.9028, "step": 6940 }, { "epoch": 0.43035072499690175, "grad_norm": 1.5553531646728516, "learning_rate": 5.996999543408779e-07, "loss": 0.8059, "step": 6945 }, { "epoch": 0.4306605527326806, "grad_norm": 1.9321941137313843, "learning_rate": 5.993738177548757e-07, "loss": 0.8709, "step": 6950 }, { "epoch": 0.43097038046845954, "grad_norm": 1.9592463970184326, "learning_rate": 5.990476811688734e-07, "loss": 0.8762, "step": 6955 }, { "epoch": 0.43128020820423846, "grad_norm": 1.569126844406128, "learning_rate": 5.987215445828713e-07, "loss": 0.8187, "step": 6960 }, { "epoch": 0.4315900359400173, "grad_norm": 1.7736068964004517, "learning_rate": 5.983954079968691e-07, "loss": 0.8604, "step": 6965 }, { "epoch": 0.43189986367579625, "grad_norm": 1.5592135190963745, "learning_rate": 5.980692714108668e-07, "loss": 0.8133, "step": 6970 }, { "epoch": 0.43220969141157517, "grad_norm": 1.9093306064605713, "learning_rate": 5.977431348248646e-07, "loss": 0.8867, "step": 6975 }, { "epoch": 0.4325195191473541, "grad_norm": 1.5829439163208008, "learning_rate": 5.974169982388624e-07, "loss": 0.8403, "step": 6980 }, { "epoch": 0.43282934688313296, "grad_norm": 1.7819262742996216, "learning_rate": 5.970908616528601e-07, "loss": 0.7889, "step": 6985 }, { "epoch": 0.4331391746189119, "grad_norm": 1.6947157382965088, "learning_rate": 5.967647250668579e-07, "loss": 0.9068, "step": 6990 }, { "epoch": 0.4334490023546908, "grad_norm": 2.4232685565948486, "learning_rate": 5.964385884808558e-07, "loss": 0.91, "step": 6995 }, { "epoch": 0.43375883009046967, "grad_norm": 1.5479854345321655, "learning_rate": 5.961124518948535e-07, "loss": 0.8655, "step": 7000 }, { "epoch": 0.4340686578262486, "grad_norm": 1.5822721719741821, "learning_rate": 5.957863153088514e-07, "loss": 0.8687, "step": 7005 }, { "epoch": 0.4343784855620275, "grad_norm": 1.6493992805480957, "learning_rate": 5.954601787228492e-07, "loss": 0.8698, "step": 7010 }, { "epoch": 0.43468831329780644, "grad_norm": 1.8212865591049194, "learning_rate": 5.951340421368469e-07, "loss": 0.8967, "step": 7015 }, { "epoch": 0.4349981410335853, "grad_norm": 1.7435860633850098, "learning_rate": 5.948079055508447e-07, "loss": 0.8938, "step": 7020 }, { "epoch": 0.43530796876936423, "grad_norm": 1.4727410078048706, "learning_rate": 5.944817689648424e-07, "loss": 0.8604, "step": 7025 }, { "epoch": 0.43561779650514315, "grad_norm": 1.7049380540847778, "learning_rate": 5.941556323788402e-07, "loss": 0.8923, "step": 7030 }, { "epoch": 0.4359276242409221, "grad_norm": 1.6532634496688843, "learning_rate": 5.938294957928381e-07, "loss": 0.8947, "step": 7035 }, { "epoch": 0.43623745197670094, "grad_norm": 1.7241319417953491, "learning_rate": 5.935033592068358e-07, "loss": 0.8261, "step": 7040 }, { "epoch": 0.43654727971247986, "grad_norm": 1.6529090404510498, "learning_rate": 5.931772226208336e-07, "loss": 0.9206, "step": 7045 }, { "epoch": 0.4368571074482588, "grad_norm": 1.5295495986938477, "learning_rate": 5.928510860348314e-07, "loss": 0.8079, "step": 7050 }, { "epoch": 0.43716693518403765, "grad_norm": 1.621811866760254, "learning_rate": 5.925249494488291e-07, "loss": 0.8201, "step": 7055 }, { "epoch": 0.4374767629198166, "grad_norm": 1.5867595672607422, "learning_rate": 5.921988128628269e-07, "loss": 0.8484, "step": 7060 }, { "epoch": 0.4377865906555955, "grad_norm": 1.6748038530349731, "learning_rate": 5.918726762768246e-07, "loss": 0.8795, "step": 7065 }, { "epoch": 0.4380964183913744, "grad_norm": 1.6038273572921753, "learning_rate": 5.915465396908225e-07, "loss": 0.8485, "step": 7070 }, { "epoch": 0.4384062461271533, "grad_norm": 1.957425594329834, "learning_rate": 5.912204031048203e-07, "loss": 0.8073, "step": 7075 }, { "epoch": 0.4387160738629322, "grad_norm": 1.746545433998108, "learning_rate": 5.90894266518818e-07, "loss": 0.8458, "step": 7080 }, { "epoch": 0.43902590159871113, "grad_norm": 1.4868165254592896, "learning_rate": 5.905681299328158e-07, "loss": 0.9121, "step": 7085 }, { "epoch": 0.43933572933449, "grad_norm": 1.987248182296753, "learning_rate": 5.902419933468136e-07, "loss": 0.853, "step": 7090 }, { "epoch": 0.4396455570702689, "grad_norm": 2.0198941230773926, "learning_rate": 5.899158567608113e-07, "loss": 0.8675, "step": 7095 }, { "epoch": 0.43995538480604784, "grad_norm": 1.6500669717788696, "learning_rate": 5.895897201748092e-07, "loss": 0.8565, "step": 7100 }, { "epoch": 0.44026521254182677, "grad_norm": 1.775720238685608, "learning_rate": 5.892635835888071e-07, "loss": 0.909, "step": 7105 }, { "epoch": 0.44057504027760563, "grad_norm": 1.493911623954773, "learning_rate": 5.889374470028048e-07, "loss": 0.8693, "step": 7110 }, { "epoch": 0.44088486801338456, "grad_norm": 1.5506999492645264, "learning_rate": 5.886113104168026e-07, "loss": 0.8747, "step": 7115 }, { "epoch": 0.4411946957491635, "grad_norm": 1.961938738822937, "learning_rate": 5.882851738308004e-07, "loss": 0.85, "step": 7120 }, { "epoch": 0.44150452348494235, "grad_norm": 1.5942941904067993, "learning_rate": 5.879590372447981e-07, "loss": 0.8826, "step": 7125 }, { "epoch": 0.44181435122072127, "grad_norm": 1.5661519765853882, "learning_rate": 5.876329006587959e-07, "loss": 0.8695, "step": 7130 }, { "epoch": 0.4421241789565002, "grad_norm": 1.5405750274658203, "learning_rate": 5.873067640727936e-07, "loss": 0.9066, "step": 7135 }, { "epoch": 0.4424340066922791, "grad_norm": 1.6720143556594849, "learning_rate": 5.869806274867914e-07, "loss": 0.929, "step": 7140 }, { "epoch": 0.442743834428058, "grad_norm": 1.905096411705017, "learning_rate": 5.866544909007893e-07, "loss": 0.9012, "step": 7145 }, { "epoch": 0.4430536621638369, "grad_norm": 1.7883431911468506, "learning_rate": 5.86328354314787e-07, "loss": 0.8245, "step": 7150 }, { "epoch": 0.4433634898996158, "grad_norm": 1.5151214599609375, "learning_rate": 5.860022177287848e-07, "loss": 0.872, "step": 7155 }, { "epoch": 0.4436733176353947, "grad_norm": 1.5181447267532349, "learning_rate": 5.856760811427826e-07, "loss": 0.8313, "step": 7160 }, { "epoch": 0.4439831453711736, "grad_norm": 1.5924103260040283, "learning_rate": 5.853499445567803e-07, "loss": 0.851, "step": 7165 }, { "epoch": 0.44429297310695254, "grad_norm": 1.5370997190475464, "learning_rate": 5.850238079707781e-07, "loss": 0.8615, "step": 7170 }, { "epoch": 0.44460280084273146, "grad_norm": 1.5335907936096191, "learning_rate": 5.846976713847758e-07, "loss": 0.8852, "step": 7175 }, { "epoch": 0.4449126285785103, "grad_norm": 1.8002736568450928, "learning_rate": 5.843715347987736e-07, "loss": 0.8973, "step": 7180 }, { "epoch": 0.44522245631428925, "grad_norm": 1.6744446754455566, "learning_rate": 5.840453982127715e-07, "loss": 0.8424, "step": 7185 }, { "epoch": 0.4455322840500682, "grad_norm": 1.6870871782302856, "learning_rate": 5.837192616267692e-07, "loss": 0.8142, "step": 7190 }, { "epoch": 0.4458421117858471, "grad_norm": 1.718591570854187, "learning_rate": 5.833931250407671e-07, "loss": 0.8398, "step": 7195 }, { "epoch": 0.44615193952162596, "grad_norm": 1.8855661153793335, "learning_rate": 5.830669884547649e-07, "loss": 0.8874, "step": 7200 }, { "epoch": 0.4464617672574049, "grad_norm": 1.672439455986023, "learning_rate": 5.827408518687626e-07, "loss": 0.8266, "step": 7205 }, { "epoch": 0.4467715949931838, "grad_norm": 1.722326636314392, "learning_rate": 5.824147152827604e-07, "loss": 0.8252, "step": 7210 }, { "epoch": 0.4470814227289627, "grad_norm": 1.924149990081787, "learning_rate": 5.820885786967582e-07, "loss": 0.8726, "step": 7215 }, { "epoch": 0.4473912504647416, "grad_norm": 1.758583426475525, "learning_rate": 5.81762442110756e-07, "loss": 0.827, "step": 7220 }, { "epoch": 0.4477010782005205, "grad_norm": 1.564010739326477, "learning_rate": 5.814363055247538e-07, "loss": 0.8396, "step": 7225 }, { "epoch": 0.44801090593629944, "grad_norm": 1.765458345413208, "learning_rate": 5.811101689387515e-07, "loss": 0.8124, "step": 7230 }, { "epoch": 0.4483207336720783, "grad_norm": 1.7594716548919678, "learning_rate": 5.807840323527493e-07, "loss": 0.88, "step": 7235 }, { "epoch": 0.44863056140785723, "grad_norm": 1.5333161354064941, "learning_rate": 5.804578957667471e-07, "loss": 0.8881, "step": 7240 }, { "epoch": 0.44894038914363615, "grad_norm": 1.9386337995529175, "learning_rate": 5.801317591807448e-07, "loss": 0.8856, "step": 7245 }, { "epoch": 0.449250216879415, "grad_norm": 1.4739466905593872, "learning_rate": 5.798056225947426e-07, "loss": 0.8559, "step": 7250 }, { "epoch": 0.44956004461519394, "grad_norm": 1.7485178709030151, "learning_rate": 5.794794860087405e-07, "loss": 0.8906, "step": 7255 }, { "epoch": 0.44986987235097287, "grad_norm": 1.5124900341033936, "learning_rate": 5.791533494227382e-07, "loss": 0.8008, "step": 7260 }, { "epoch": 0.4501797000867518, "grad_norm": 1.6411762237548828, "learning_rate": 5.78827212836736e-07, "loss": 0.9062, "step": 7265 }, { "epoch": 0.45048952782253066, "grad_norm": 1.4834705591201782, "learning_rate": 5.785010762507338e-07, "loss": 0.8271, "step": 7270 }, { "epoch": 0.4507993555583096, "grad_norm": 1.5635879039764404, "learning_rate": 5.781749396647315e-07, "loss": 0.844, "step": 7275 }, { "epoch": 0.4511091832940885, "grad_norm": 1.7038705348968506, "learning_rate": 5.778488030787293e-07, "loss": 0.8569, "step": 7280 }, { "epoch": 0.45141901102986737, "grad_norm": 2.329503059387207, "learning_rate": 5.77522666492727e-07, "loss": 0.9132, "step": 7285 }, { "epoch": 0.4517288387656463, "grad_norm": 1.6912213563919067, "learning_rate": 5.771965299067249e-07, "loss": 0.9146, "step": 7290 }, { "epoch": 0.4520386665014252, "grad_norm": 1.809262752532959, "learning_rate": 5.768703933207228e-07, "loss": 0.8324, "step": 7295 }, { "epoch": 0.45234849423720414, "grad_norm": 1.6375970840454102, "learning_rate": 5.765442567347205e-07, "loss": 0.8627, "step": 7300 }, { "epoch": 0.452658321972983, "grad_norm": 1.6677496433258057, "learning_rate": 5.762181201487183e-07, "loss": 0.9001, "step": 7305 }, { "epoch": 0.4529681497087619, "grad_norm": 1.575644850730896, "learning_rate": 5.758919835627161e-07, "loss": 0.8883, "step": 7310 }, { "epoch": 0.45327797744454085, "grad_norm": 1.7223210334777832, "learning_rate": 5.755658469767138e-07, "loss": 0.8825, "step": 7315 }, { "epoch": 0.45358780518031977, "grad_norm": 1.4631707668304443, "learning_rate": 5.752397103907116e-07, "loss": 0.8425, "step": 7320 }, { "epoch": 0.45389763291609864, "grad_norm": 1.4649512767791748, "learning_rate": 5.749135738047094e-07, "loss": 0.8737, "step": 7325 }, { "epoch": 0.45420746065187756, "grad_norm": 1.5835803747177124, "learning_rate": 5.745874372187072e-07, "loss": 0.8344, "step": 7330 }, { "epoch": 0.4545172883876565, "grad_norm": 1.9692097902297974, "learning_rate": 5.74261300632705e-07, "loss": 0.846, "step": 7335 }, { "epoch": 0.45482711612343535, "grad_norm": 1.8001835346221924, "learning_rate": 5.739351640467027e-07, "loss": 0.8423, "step": 7340 }, { "epoch": 0.45513694385921427, "grad_norm": 1.6301804780960083, "learning_rate": 5.736090274607005e-07, "loss": 0.8821, "step": 7345 }, { "epoch": 0.4554467715949932, "grad_norm": 1.703266978263855, "learning_rate": 5.732828908746983e-07, "loss": 0.8239, "step": 7350 }, { "epoch": 0.4557565993307721, "grad_norm": 1.7914221286773682, "learning_rate": 5.72956754288696e-07, "loss": 0.8892, "step": 7355 }, { "epoch": 0.456066427066551, "grad_norm": 1.9085320234298706, "learning_rate": 5.726306177026938e-07, "loss": 0.8477, "step": 7360 }, { "epoch": 0.4563762548023299, "grad_norm": 1.9794046878814697, "learning_rate": 5.723044811166916e-07, "loss": 0.8468, "step": 7365 }, { "epoch": 0.45668608253810883, "grad_norm": 1.882997751235962, "learning_rate": 5.719783445306894e-07, "loss": 0.9239, "step": 7370 }, { "epoch": 0.4569959102738877, "grad_norm": 1.6136319637298584, "learning_rate": 5.716522079446872e-07, "loss": 0.9161, "step": 7375 }, { "epoch": 0.4573057380096666, "grad_norm": 1.3694689273834229, "learning_rate": 5.713260713586851e-07, "loss": 0.8459, "step": 7380 }, { "epoch": 0.45761556574544554, "grad_norm": 1.6297613382339478, "learning_rate": 5.709999347726828e-07, "loss": 0.829, "step": 7385 }, { "epoch": 0.45792539348122446, "grad_norm": 1.512752652168274, "learning_rate": 5.706737981866806e-07, "loss": 0.8732, "step": 7390 }, { "epoch": 0.45823522121700333, "grad_norm": 1.9842948913574219, "learning_rate": 5.703476616006783e-07, "loss": 0.8347, "step": 7395 }, { "epoch": 0.45854504895278225, "grad_norm": 1.5931134223937988, "learning_rate": 5.700215250146761e-07, "loss": 0.8077, "step": 7400 }, { "epoch": 0.4588548766885612, "grad_norm": 1.694076418876648, "learning_rate": 5.69695388428674e-07, "loss": 0.9098, "step": 7405 }, { "epoch": 0.45916470442434004, "grad_norm": 1.6855559349060059, "learning_rate": 5.693692518426717e-07, "loss": 0.843, "step": 7410 }, { "epoch": 0.45947453216011896, "grad_norm": 1.751988410949707, "learning_rate": 5.690431152566695e-07, "loss": 0.9162, "step": 7415 }, { "epoch": 0.4597843598958979, "grad_norm": 1.6960159540176392, "learning_rate": 5.687169786706673e-07, "loss": 0.8047, "step": 7420 }, { "epoch": 0.4600941876316768, "grad_norm": 1.612842321395874, "learning_rate": 5.68390842084665e-07, "loss": 0.8142, "step": 7425 }, { "epoch": 0.4604040153674557, "grad_norm": 1.5173799991607666, "learning_rate": 5.680647054986628e-07, "loss": 0.8389, "step": 7430 }, { "epoch": 0.4607138431032346, "grad_norm": 1.5863734483718872, "learning_rate": 5.677385689126606e-07, "loss": 0.9005, "step": 7435 }, { "epoch": 0.4610236708390135, "grad_norm": 1.8130946159362793, "learning_rate": 5.674124323266583e-07, "loss": 0.8881, "step": 7440 }, { "epoch": 0.4613334985747924, "grad_norm": 1.7063045501708984, "learning_rate": 5.670862957406562e-07, "loss": 0.8691, "step": 7445 }, { "epoch": 0.4616433263105713, "grad_norm": 1.960140347480774, "learning_rate": 5.667601591546539e-07, "loss": 0.8745, "step": 7450 }, { "epoch": 0.46195315404635023, "grad_norm": 1.9909387826919556, "learning_rate": 5.664340225686517e-07, "loss": 0.831, "step": 7455 }, { "epoch": 0.46226298178212916, "grad_norm": 1.737335205078125, "learning_rate": 5.661078859826495e-07, "loss": 0.8786, "step": 7460 }, { "epoch": 0.462572809517908, "grad_norm": 1.6486327648162842, "learning_rate": 5.657817493966472e-07, "loss": 0.8875, "step": 7465 }, { "epoch": 0.46288263725368695, "grad_norm": 1.9403656721115112, "learning_rate": 5.65455612810645e-07, "loss": 0.9005, "step": 7470 }, { "epoch": 0.46319246498946587, "grad_norm": 1.5174169540405273, "learning_rate": 5.651294762246429e-07, "loss": 0.845, "step": 7475 }, { "epoch": 0.4635022927252448, "grad_norm": 2.078463077545166, "learning_rate": 5.648033396386407e-07, "loss": 0.8025, "step": 7480 }, { "epoch": 0.46381212046102366, "grad_norm": 1.8099908828735352, "learning_rate": 5.644772030526385e-07, "loss": 0.8465, "step": 7485 }, { "epoch": 0.4641219481968026, "grad_norm": 1.7834604978561401, "learning_rate": 5.641510664666363e-07, "loss": 0.8865, "step": 7490 }, { "epoch": 0.4644317759325815, "grad_norm": 1.7717041969299316, "learning_rate": 5.63824929880634e-07, "loss": 0.8262, "step": 7495 }, { "epoch": 0.46474160366836037, "grad_norm": 1.589928388595581, "learning_rate": 5.634987932946318e-07, "loss": 0.8288, "step": 7500 }, { "epoch": 0.4650514314041393, "grad_norm": 1.720280647277832, "learning_rate": 5.631726567086295e-07, "loss": 0.8866, "step": 7505 }, { "epoch": 0.4653612591399182, "grad_norm": 1.8927419185638428, "learning_rate": 5.628465201226273e-07, "loss": 0.8637, "step": 7510 }, { "epoch": 0.46567108687569714, "grad_norm": 1.8326889276504517, "learning_rate": 5.625203835366251e-07, "loss": 0.8706, "step": 7515 }, { "epoch": 0.465980914611476, "grad_norm": 1.397342562675476, "learning_rate": 5.621942469506229e-07, "loss": 0.8839, "step": 7520 }, { "epoch": 0.4662907423472549, "grad_norm": 1.5562357902526855, "learning_rate": 5.618681103646207e-07, "loss": 0.8495, "step": 7525 }, { "epoch": 0.46660057008303385, "grad_norm": 1.5440667867660522, "learning_rate": 5.615419737786185e-07, "loss": 0.8669, "step": 7530 }, { "epoch": 0.4669103978188127, "grad_norm": 1.6331486701965332, "learning_rate": 5.612158371926162e-07, "loss": 0.81, "step": 7535 }, { "epoch": 0.46722022555459164, "grad_norm": 1.7530827522277832, "learning_rate": 5.60889700606614e-07, "loss": 0.8274, "step": 7540 }, { "epoch": 0.46753005329037056, "grad_norm": 1.5233430862426758, "learning_rate": 5.605635640206117e-07, "loss": 0.8012, "step": 7545 }, { "epoch": 0.4678398810261495, "grad_norm": 1.57335364818573, "learning_rate": 5.602374274346095e-07, "loss": 0.8664, "step": 7550 }, { "epoch": 0.46814970876192835, "grad_norm": 1.7841930389404297, "learning_rate": 5.599112908486074e-07, "loss": 0.9039, "step": 7555 }, { "epoch": 0.4684595364977073, "grad_norm": 1.8522183895111084, "learning_rate": 5.595851542626051e-07, "loss": 0.8219, "step": 7560 }, { "epoch": 0.4687693642334862, "grad_norm": 2.085120677947998, "learning_rate": 5.592590176766029e-07, "loss": 0.8722, "step": 7565 }, { "epoch": 0.46907919196926506, "grad_norm": 2.067443370819092, "learning_rate": 5.589328810906008e-07, "loss": 0.8004, "step": 7570 }, { "epoch": 0.469389019705044, "grad_norm": 1.8066157102584839, "learning_rate": 5.586067445045985e-07, "loss": 0.8118, "step": 7575 }, { "epoch": 0.4696988474408229, "grad_norm": 1.7750838994979858, "learning_rate": 5.582806079185963e-07, "loss": 0.8696, "step": 7580 }, { "epoch": 0.47000867517660183, "grad_norm": 1.3324483633041382, "learning_rate": 5.579544713325941e-07, "loss": 0.897, "step": 7585 }, { "epoch": 0.4703185029123807, "grad_norm": 1.4865186214447021, "learning_rate": 5.576283347465919e-07, "loss": 0.8571, "step": 7590 }, { "epoch": 0.4706283306481596, "grad_norm": 1.8034368753433228, "learning_rate": 5.573021981605897e-07, "loss": 0.8363, "step": 7595 }, { "epoch": 0.47093815838393854, "grad_norm": 1.6462640762329102, "learning_rate": 5.569760615745875e-07, "loss": 0.8107, "step": 7600 }, { "epoch": 0.4712479861197174, "grad_norm": 1.7713099718093872, "learning_rate": 5.566499249885852e-07, "loss": 0.8699, "step": 7605 }, { "epoch": 0.47155781385549633, "grad_norm": 1.6539610624313354, "learning_rate": 5.56323788402583e-07, "loss": 0.8136, "step": 7610 }, { "epoch": 0.47186764159127526, "grad_norm": 1.9519563913345337, "learning_rate": 5.559976518165807e-07, "loss": 0.8755, "step": 7615 }, { "epoch": 0.4721774693270542, "grad_norm": 1.5904797315597534, "learning_rate": 5.556715152305785e-07, "loss": 0.8946, "step": 7620 }, { "epoch": 0.47248729706283304, "grad_norm": 1.4601452350616455, "learning_rate": 5.553453786445763e-07, "loss": 0.8391, "step": 7625 }, { "epoch": 0.47279712479861197, "grad_norm": 1.8363912105560303, "learning_rate": 5.550192420585741e-07, "loss": 0.8377, "step": 7630 }, { "epoch": 0.4731069525343909, "grad_norm": 1.6410101652145386, "learning_rate": 5.546931054725719e-07, "loss": 0.8464, "step": 7635 }, { "epoch": 0.4734167802701698, "grad_norm": 1.8527123928070068, "learning_rate": 5.543669688865697e-07, "loss": 0.8382, "step": 7640 }, { "epoch": 0.4737266080059487, "grad_norm": 1.6195173263549805, "learning_rate": 5.540408323005674e-07, "loss": 0.7956, "step": 7645 }, { "epoch": 0.4740364357417276, "grad_norm": 1.3881487846374512, "learning_rate": 5.537146957145652e-07, "loss": 0.8219, "step": 7650 }, { "epoch": 0.4743462634775065, "grad_norm": 1.7057663202285767, "learning_rate": 5.533885591285629e-07, "loss": 0.8186, "step": 7655 }, { "epoch": 0.4746560912132854, "grad_norm": 1.573249340057373, "learning_rate": 5.530624225425607e-07, "loss": 0.8591, "step": 7660 }, { "epoch": 0.4749659189490643, "grad_norm": 1.6083955764770508, "learning_rate": 5.527362859565587e-07, "loss": 0.8952, "step": 7665 }, { "epoch": 0.47527574668484324, "grad_norm": 1.5517487525939941, "learning_rate": 5.524101493705564e-07, "loss": 0.8599, "step": 7670 }, { "epoch": 0.47558557442062216, "grad_norm": 1.575217604637146, "learning_rate": 5.520840127845542e-07, "loss": 0.8488, "step": 7675 }, { "epoch": 0.475895402156401, "grad_norm": 1.6272902488708496, "learning_rate": 5.51757876198552e-07, "loss": 0.795, "step": 7680 }, { "epoch": 0.47620522989217995, "grad_norm": 1.6608831882476807, "learning_rate": 5.514317396125497e-07, "loss": 0.8356, "step": 7685 }, { "epoch": 0.47651505762795887, "grad_norm": 1.5241279602050781, "learning_rate": 5.511056030265475e-07, "loss": 0.8339, "step": 7690 }, { "epoch": 0.47682488536373774, "grad_norm": 1.5849863290786743, "learning_rate": 5.507794664405453e-07, "loss": 0.8696, "step": 7695 }, { "epoch": 0.47713471309951666, "grad_norm": 1.62017023563385, "learning_rate": 5.50453329854543e-07, "loss": 0.7915, "step": 7700 }, { "epoch": 0.4774445408352956, "grad_norm": 1.8587238788604736, "learning_rate": 5.501271932685409e-07, "loss": 0.8473, "step": 7705 }, { "epoch": 0.4777543685710745, "grad_norm": 1.6658074855804443, "learning_rate": 5.498010566825387e-07, "loss": 0.844, "step": 7710 }, { "epoch": 0.4780641963068534, "grad_norm": 1.4931505918502808, "learning_rate": 5.494749200965364e-07, "loss": 0.8382, "step": 7715 }, { "epoch": 0.4783740240426323, "grad_norm": 1.801140308380127, "learning_rate": 5.491487835105342e-07, "loss": 0.8598, "step": 7720 }, { "epoch": 0.4786838517784112, "grad_norm": 1.692721962928772, "learning_rate": 5.488226469245319e-07, "loss": 0.867, "step": 7725 }, { "epoch": 0.4789936795141901, "grad_norm": 1.8700127601623535, "learning_rate": 5.484965103385297e-07, "loss": 0.8347, "step": 7730 }, { "epoch": 0.479303507249969, "grad_norm": 1.63836669921875, "learning_rate": 5.481703737525275e-07, "loss": 0.8435, "step": 7735 }, { "epoch": 0.47961333498574793, "grad_norm": 1.9490801095962524, "learning_rate": 5.478442371665253e-07, "loss": 0.8594, "step": 7740 }, { "epoch": 0.47992316272152685, "grad_norm": 1.5828524827957153, "learning_rate": 5.475181005805231e-07, "loss": 0.874, "step": 7745 }, { "epoch": 0.4802329904573057, "grad_norm": 1.7629488706588745, "learning_rate": 5.471919639945209e-07, "loss": 0.8869, "step": 7750 }, { "epoch": 0.48054281819308464, "grad_norm": 1.5630708932876587, "learning_rate": 5.468658274085186e-07, "loss": 0.8282, "step": 7755 }, { "epoch": 0.48085264592886356, "grad_norm": 1.7226215600967407, "learning_rate": 5.465396908225165e-07, "loss": 0.8305, "step": 7760 }, { "epoch": 0.48116247366464243, "grad_norm": 1.8978235721588135, "learning_rate": 5.462135542365143e-07, "loss": 0.8483, "step": 7765 }, { "epoch": 0.48147230140042135, "grad_norm": 2.023420810699463, "learning_rate": 5.45887417650512e-07, "loss": 0.9183, "step": 7770 }, { "epoch": 0.4817821291362003, "grad_norm": 1.5524324178695679, "learning_rate": 5.455612810645098e-07, "loss": 0.909, "step": 7775 }, { "epoch": 0.4820919568719792, "grad_norm": 1.6595451831817627, "learning_rate": 5.452351444785076e-07, "loss": 0.8147, "step": 7780 }, { "epoch": 0.48240178460775807, "grad_norm": 1.5844792127609253, "learning_rate": 5.449090078925054e-07, "loss": 0.894, "step": 7785 }, { "epoch": 0.482711612343537, "grad_norm": 1.5985002517700195, "learning_rate": 5.445828713065032e-07, "loss": 0.8836, "step": 7790 }, { "epoch": 0.4830214400793159, "grad_norm": 1.7980848550796509, "learning_rate": 5.442567347205009e-07, "loss": 0.812, "step": 7795 }, { "epoch": 0.48333126781509483, "grad_norm": 2.651982545852661, "learning_rate": 5.439305981344987e-07, "loss": 0.845, "step": 7800 }, { "epoch": 0.4836410955508737, "grad_norm": 1.5096945762634277, "learning_rate": 5.436044615484965e-07, "loss": 0.8658, "step": 7805 }, { "epoch": 0.4839509232866526, "grad_norm": 1.6998015642166138, "learning_rate": 5.432783249624942e-07, "loss": 0.8566, "step": 7810 }, { "epoch": 0.48426075102243155, "grad_norm": 1.7708393335342407, "learning_rate": 5.429521883764921e-07, "loss": 0.8132, "step": 7815 }, { "epoch": 0.4845705787582104, "grad_norm": 1.8577944040298462, "learning_rate": 5.426260517904898e-07, "loss": 0.8683, "step": 7820 }, { "epoch": 0.48488040649398934, "grad_norm": 1.8487944602966309, "learning_rate": 5.422999152044876e-07, "loss": 0.8873, "step": 7825 }, { "epoch": 0.48519023422976826, "grad_norm": 1.6993303298950195, "learning_rate": 5.419737786184854e-07, "loss": 0.9141, "step": 7830 }, { "epoch": 0.4855000619655472, "grad_norm": 1.4423953294754028, "learning_rate": 5.416476420324831e-07, "loss": 0.8407, "step": 7835 }, { "epoch": 0.48580988970132605, "grad_norm": 1.8063251972198486, "learning_rate": 5.413215054464809e-07, "loss": 0.8829, "step": 7840 }, { "epoch": 0.48611971743710497, "grad_norm": 2.094388961791992, "learning_rate": 5.409953688604787e-07, "loss": 0.9172, "step": 7845 }, { "epoch": 0.4864295451728839, "grad_norm": 1.7622525691986084, "learning_rate": 5.406692322744764e-07, "loss": 0.8208, "step": 7850 }, { "epoch": 0.48673937290866276, "grad_norm": 1.6282246112823486, "learning_rate": 5.403430956884744e-07, "loss": 0.8588, "step": 7855 }, { "epoch": 0.4870492006444417, "grad_norm": 1.5550072193145752, "learning_rate": 5.400169591024722e-07, "loss": 0.7818, "step": 7860 }, { "epoch": 0.4873590283802206, "grad_norm": 1.458930253982544, "learning_rate": 5.396908225164699e-07, "loss": 0.8769, "step": 7865 }, { "epoch": 0.4876688561159995, "grad_norm": 1.7551133632659912, "learning_rate": 5.393646859304677e-07, "loss": 0.841, "step": 7870 }, { "epoch": 0.4879786838517784, "grad_norm": 1.5642263889312744, "learning_rate": 5.390385493444654e-07, "loss": 0.8783, "step": 7875 }, { "epoch": 0.4882885115875573, "grad_norm": 1.7878143787384033, "learning_rate": 5.387124127584632e-07, "loss": 0.8425, "step": 7880 }, { "epoch": 0.48859833932333624, "grad_norm": 1.6131370067596436, "learning_rate": 5.38386276172461e-07, "loss": 0.8578, "step": 7885 }, { "epoch": 0.4889081670591151, "grad_norm": 2.1459920406341553, "learning_rate": 5.380601395864588e-07, "loss": 0.7969, "step": 7890 }, { "epoch": 0.48921799479489403, "grad_norm": 1.6620455980300903, "learning_rate": 5.377340030004566e-07, "loss": 0.8792, "step": 7895 }, { "epoch": 0.48952782253067295, "grad_norm": 1.9532731771469116, "learning_rate": 5.374078664144544e-07, "loss": 0.8953, "step": 7900 }, { "epoch": 0.4898376502664519, "grad_norm": 1.564013957977295, "learning_rate": 5.370817298284521e-07, "loss": 0.8398, "step": 7905 }, { "epoch": 0.49014747800223074, "grad_norm": 1.4386680126190186, "learning_rate": 5.367555932424499e-07, "loss": 0.8588, "step": 7910 }, { "epoch": 0.49045730573800966, "grad_norm": 2.1590235233306885, "learning_rate": 5.364294566564477e-07, "loss": 0.9153, "step": 7915 }, { "epoch": 0.4907671334737886, "grad_norm": 1.753292441368103, "learning_rate": 5.361033200704454e-07, "loss": 0.8632, "step": 7920 }, { "epoch": 0.49107696120956745, "grad_norm": 1.6892341375350952, "learning_rate": 5.357771834844432e-07, "loss": 0.848, "step": 7925 }, { "epoch": 0.4913867889453464, "grad_norm": 1.6751742362976074, "learning_rate": 5.35451046898441e-07, "loss": 0.8419, "step": 7930 }, { "epoch": 0.4916966166811253, "grad_norm": 1.5923916101455688, "learning_rate": 5.351249103124388e-07, "loss": 0.8783, "step": 7935 }, { "epoch": 0.4920064444169042, "grad_norm": 1.647318720817566, "learning_rate": 5.347987737264366e-07, "loss": 0.8579, "step": 7940 }, { "epoch": 0.4923162721526831, "grad_norm": 1.6549277305603027, "learning_rate": 5.344726371404343e-07, "loss": 0.8748, "step": 7945 }, { "epoch": 0.492626099888462, "grad_norm": 2.836874485015869, "learning_rate": 5.341465005544322e-07, "loss": 0.8995, "step": 7950 }, { "epoch": 0.49293592762424093, "grad_norm": 1.6466785669326782, "learning_rate": 5.3382036396843e-07, "loss": 0.9025, "step": 7955 }, { "epoch": 0.49324575536001986, "grad_norm": 1.6320537328720093, "learning_rate": 5.334942273824277e-07, "loss": 0.8236, "step": 7960 }, { "epoch": 0.4935555830957987, "grad_norm": 1.6627342700958252, "learning_rate": 5.331680907964256e-07, "loss": 0.8752, "step": 7965 }, { "epoch": 0.49386541083157764, "grad_norm": 1.7692406177520752, "learning_rate": 5.328419542104234e-07, "loss": 0.8642, "step": 7970 }, { "epoch": 0.49417523856735657, "grad_norm": 1.5924856662750244, "learning_rate": 5.325158176244211e-07, "loss": 0.8867, "step": 7975 }, { "epoch": 0.49448506630313543, "grad_norm": 1.656474232673645, "learning_rate": 5.321896810384189e-07, "loss": 0.8738, "step": 7980 }, { "epoch": 0.49479489403891436, "grad_norm": 1.6743558645248413, "learning_rate": 5.318635444524166e-07, "loss": 0.8842, "step": 7985 }, { "epoch": 0.4951047217746933, "grad_norm": 1.8907849788665771, "learning_rate": 5.315374078664144e-07, "loss": 0.806, "step": 7990 }, { "epoch": 0.4954145495104722, "grad_norm": 1.5648529529571533, "learning_rate": 5.312112712804122e-07, "loss": 0.933, "step": 7995 }, { "epoch": 0.49572437724625107, "grad_norm": 1.6764228343963623, "learning_rate": 5.308851346944099e-07, "loss": 0.8608, "step": 8000 }, { "epoch": 0.49603420498203, "grad_norm": 1.3951822519302368, "learning_rate": 5.305589981084078e-07, "loss": 0.8662, "step": 8005 }, { "epoch": 0.4963440327178089, "grad_norm": 1.6283621788024902, "learning_rate": 5.302328615224056e-07, "loss": 0.881, "step": 8010 }, { "epoch": 0.4966538604535878, "grad_norm": 1.7630923986434937, "learning_rate": 5.299067249364033e-07, "loss": 0.8103, "step": 8015 }, { "epoch": 0.4969636881893667, "grad_norm": 1.5557008981704712, "learning_rate": 5.295805883504011e-07, "loss": 0.8984, "step": 8020 }, { "epoch": 0.4972735159251456, "grad_norm": 1.498909831047058, "learning_rate": 5.292544517643989e-07, "loss": 0.8131, "step": 8025 }, { "epoch": 0.49758334366092455, "grad_norm": 1.5762838125228882, "learning_rate": 5.289283151783966e-07, "loss": 0.8233, "step": 8030 }, { "epoch": 0.4978931713967034, "grad_norm": 1.5975526571273804, "learning_rate": 5.286021785923944e-07, "loss": 0.8367, "step": 8035 }, { "epoch": 0.49820299913248234, "grad_norm": 1.7392308712005615, "learning_rate": 5.282760420063924e-07, "loss": 0.8479, "step": 8040 }, { "epoch": 0.49851282686826126, "grad_norm": 1.5113513469696045, "learning_rate": 5.279499054203901e-07, "loss": 0.8696, "step": 8045 }, { "epoch": 0.4988226546040401, "grad_norm": 1.7618968486785889, "learning_rate": 5.276237688343879e-07, "loss": 0.86, "step": 8050 }, { "epoch": 0.49913248233981905, "grad_norm": 1.857755184173584, "learning_rate": 5.272976322483856e-07, "loss": 0.8507, "step": 8055 }, { "epoch": 0.499442310075598, "grad_norm": 1.6439995765686035, "learning_rate": 5.269714956623834e-07, "loss": 0.784, "step": 8060 }, { "epoch": 0.4997521378113769, "grad_norm": 1.7247428894042969, "learning_rate": 5.266453590763812e-07, "loss": 0.7974, "step": 8065 }, { "epoch": 0.5000619655471558, "grad_norm": 1.5708081722259521, "learning_rate": 5.263192224903789e-07, "loss": 0.8863, "step": 8070 }, { "epoch": 0.5003717932829347, "grad_norm": 1.9374964237213135, "learning_rate": 5.259930859043768e-07, "loss": 0.8408, "step": 8075 }, { "epoch": 0.5006816210187136, "grad_norm": 1.559132695198059, "learning_rate": 5.256669493183746e-07, "loss": 0.8569, "step": 8080 }, { "epoch": 0.5009914487544925, "grad_norm": 2.1152446269989014, "learning_rate": 5.253408127323723e-07, "loss": 0.8531, "step": 8085 }, { "epoch": 0.5013012764902715, "grad_norm": 1.3910341262817383, "learning_rate": 5.250146761463701e-07, "loss": 0.8756, "step": 8090 }, { "epoch": 0.5016111042260503, "grad_norm": 1.458903193473816, "learning_rate": 5.246885395603678e-07, "loss": 0.8464, "step": 8095 }, { "epoch": 0.5019209319618292, "grad_norm": 1.533583641052246, "learning_rate": 5.243624029743656e-07, "loss": 0.8511, "step": 8100 }, { "epoch": 0.5022307596976081, "grad_norm": 1.5305843353271484, "learning_rate": 5.240362663883634e-07, "loss": 0.8173, "step": 8105 }, { "epoch": 0.502540587433387, "grad_norm": 2.0814757347106934, "learning_rate": 5.237101298023611e-07, "loss": 0.8704, "step": 8110 }, { "epoch": 0.502850415169166, "grad_norm": 1.6884905099868774, "learning_rate": 5.23383993216359e-07, "loss": 0.8433, "step": 8115 }, { "epoch": 0.5031602429049449, "grad_norm": 1.4589178562164307, "learning_rate": 5.230578566303568e-07, "loss": 0.864, "step": 8120 }, { "epoch": 0.5034700706407238, "grad_norm": 1.6508135795593262, "learning_rate": 5.227317200443545e-07, "loss": 0.957, "step": 8125 }, { "epoch": 0.5037798983765026, "grad_norm": 1.7214514017105103, "learning_rate": 5.224055834583523e-07, "loss": 0.8883, "step": 8130 }, { "epoch": 0.5040897261122815, "grad_norm": 1.504271388053894, "learning_rate": 5.220794468723502e-07, "loss": 0.8565, "step": 8135 }, { "epoch": 0.5043995538480605, "grad_norm": 1.5571720600128174, "learning_rate": 5.217533102863479e-07, "loss": 0.8441, "step": 8140 }, { "epoch": 0.5047093815838394, "grad_norm": 1.6634913682937622, "learning_rate": 5.214271737003457e-07, "loss": 0.9128, "step": 8145 }, { "epoch": 0.5050192093196183, "grad_norm": 1.4871227741241455, "learning_rate": 5.211010371143435e-07, "loss": 0.8867, "step": 8150 }, { "epoch": 0.5053290370553972, "grad_norm": 1.753101110458374, "learning_rate": 5.207749005283413e-07, "loss": 0.8574, "step": 8155 }, { "epoch": 0.5056388647911761, "grad_norm": 1.7727354764938354, "learning_rate": 5.204487639423391e-07, "loss": 0.8761, "step": 8160 }, { "epoch": 0.5059486925269551, "grad_norm": 1.9136801958084106, "learning_rate": 5.201226273563368e-07, "loss": 0.9097, "step": 8165 }, { "epoch": 0.5062585202627339, "grad_norm": 1.4725350141525269, "learning_rate": 5.197964907703346e-07, "loss": 0.8219, "step": 8170 }, { "epoch": 0.5065683479985128, "grad_norm": 2.127582550048828, "learning_rate": 5.194703541843324e-07, "loss": 0.8911, "step": 8175 }, { "epoch": 0.5068781757342917, "grad_norm": 1.4997597932815552, "learning_rate": 5.191442175983301e-07, "loss": 0.8778, "step": 8180 }, { "epoch": 0.5071880034700706, "grad_norm": 1.6755528450012207, "learning_rate": 5.188180810123279e-07, "loss": 0.849, "step": 8185 }, { "epoch": 0.5074978312058496, "grad_norm": 2.2492947578430176, "learning_rate": 5.184919444263258e-07, "loss": 0.842, "step": 8190 }, { "epoch": 0.5078076589416285, "grad_norm": 1.5768847465515137, "learning_rate": 5.181658078403235e-07, "loss": 0.837, "step": 8195 }, { "epoch": 0.5081174866774074, "grad_norm": 1.6491109132766724, "learning_rate": 5.178396712543213e-07, "loss": 0.8266, "step": 8200 }, { "epoch": 0.5084273144131862, "grad_norm": 1.6778804063796997, "learning_rate": 5.17513534668319e-07, "loss": 0.8558, "step": 8205 }, { "epoch": 0.5087371421489651, "grad_norm": 1.8833167552947998, "learning_rate": 5.171873980823168e-07, "loss": 0.8561, "step": 8210 }, { "epoch": 0.5090469698847441, "grad_norm": 1.612980604171753, "learning_rate": 5.168612614963146e-07, "loss": 0.8321, "step": 8215 }, { "epoch": 0.509356797620523, "grad_norm": 1.6855554580688477, "learning_rate": 5.165351249103123e-07, "loss": 0.8324, "step": 8220 }, { "epoch": 0.5096666253563019, "grad_norm": 1.6307419538497925, "learning_rate": 5.162089883243102e-07, "loss": 0.8396, "step": 8225 }, { "epoch": 0.5099764530920808, "grad_norm": 1.6706006526947021, "learning_rate": 5.158828517383081e-07, "loss": 0.878, "step": 8230 }, { "epoch": 0.5102862808278598, "grad_norm": 1.9435243606567383, "learning_rate": 5.155567151523058e-07, "loss": 0.8313, "step": 8235 }, { "epoch": 0.5105961085636386, "grad_norm": 1.7101376056671143, "learning_rate": 5.152305785663036e-07, "loss": 0.8634, "step": 8240 }, { "epoch": 0.5109059362994175, "grad_norm": 1.532177448272705, "learning_rate": 5.149044419803014e-07, "loss": 0.813, "step": 8245 }, { "epoch": 0.5112157640351964, "grad_norm": 1.81050705909729, "learning_rate": 5.145783053942991e-07, "loss": 0.8382, "step": 8250 }, { "epoch": 0.5115255917709753, "grad_norm": 1.7777496576309204, "learning_rate": 5.142521688082969e-07, "loss": 0.825, "step": 8255 }, { "epoch": 0.5118354195067543, "grad_norm": 1.6097952127456665, "learning_rate": 5.139260322222946e-07, "loss": 0.87, "step": 8260 }, { "epoch": 0.5121452472425332, "grad_norm": 1.5893511772155762, "learning_rate": 5.135998956362925e-07, "loss": 0.8467, "step": 8265 }, { "epoch": 0.5124550749783121, "grad_norm": 1.7965352535247803, "learning_rate": 5.132737590502903e-07, "loss": 0.7614, "step": 8270 }, { "epoch": 0.5127649027140909, "grad_norm": 1.5472474098205566, "learning_rate": 5.12947622464288e-07, "loss": 0.8533, "step": 8275 }, { "epoch": 0.5130747304498698, "grad_norm": 1.7777515649795532, "learning_rate": 5.126214858782858e-07, "loss": 0.8513, "step": 8280 }, { "epoch": 0.5133845581856488, "grad_norm": 1.8303794860839844, "learning_rate": 5.122953492922836e-07, "loss": 0.7896, "step": 8285 }, { "epoch": 0.5136943859214277, "grad_norm": 1.668247938156128, "learning_rate": 5.119692127062813e-07, "loss": 0.8277, "step": 8290 }, { "epoch": 0.5140042136572066, "grad_norm": 1.5959036350250244, "learning_rate": 5.116430761202791e-07, "loss": 0.866, "step": 8295 }, { "epoch": 0.5143140413929855, "grad_norm": 1.6559633016586304, "learning_rate": 5.11316939534277e-07, "loss": 0.8434, "step": 8300 }, { "epoch": 0.5146238691287645, "grad_norm": 1.6497759819030762, "learning_rate": 5.109908029482747e-07, "loss": 0.8907, "step": 8305 }, { "epoch": 0.5149336968645433, "grad_norm": 1.7668895721435547, "learning_rate": 5.106646663622725e-07, "loss": 0.8423, "step": 8310 }, { "epoch": 0.5152435246003222, "grad_norm": 1.6586551666259766, "learning_rate": 5.103385297762702e-07, "loss": 0.8277, "step": 8315 }, { "epoch": 0.5155533523361011, "grad_norm": 1.562143325805664, "learning_rate": 5.10012393190268e-07, "loss": 0.8979, "step": 8320 }, { "epoch": 0.51586318007188, "grad_norm": 1.709792137145996, "learning_rate": 5.096862566042659e-07, "loss": 0.8577, "step": 8325 }, { "epoch": 0.516173007807659, "grad_norm": 1.65401291847229, "learning_rate": 5.093601200182636e-07, "loss": 0.8565, "step": 8330 }, { "epoch": 0.5164828355434379, "grad_norm": 1.6957275867462158, "learning_rate": 5.090339834322615e-07, "loss": 0.8854, "step": 8335 }, { "epoch": 0.5167926632792168, "grad_norm": 1.6031185388565063, "learning_rate": 5.087078468462593e-07, "loss": 0.8384, "step": 8340 }, { "epoch": 0.5171024910149956, "grad_norm": 1.5412710905075073, "learning_rate": 5.08381710260257e-07, "loss": 0.8241, "step": 8345 }, { "epoch": 0.5174123187507745, "grad_norm": 1.690674066543579, "learning_rate": 5.080555736742548e-07, "loss": 0.8733, "step": 8350 }, { "epoch": 0.5177221464865535, "grad_norm": 1.5366343259811401, "learning_rate": 5.077294370882526e-07, "loss": 0.8655, "step": 8355 }, { "epoch": 0.5180319742223324, "grad_norm": 1.5855786800384521, "learning_rate": 5.074033005022503e-07, "loss": 0.8142, "step": 8360 }, { "epoch": 0.5183418019581113, "grad_norm": 1.726508617401123, "learning_rate": 5.070771639162481e-07, "loss": 0.8174, "step": 8365 }, { "epoch": 0.5186516296938902, "grad_norm": 1.5617156028747559, "learning_rate": 5.067510273302458e-07, "loss": 0.8603, "step": 8370 }, { "epoch": 0.5189614574296691, "grad_norm": 2.007884979248047, "learning_rate": 5.064248907442437e-07, "loss": 0.9079, "step": 8375 }, { "epoch": 0.519271285165448, "grad_norm": 1.714948058128357, "learning_rate": 5.060987541582415e-07, "loss": 0.8404, "step": 8380 }, { "epoch": 0.5195811129012269, "grad_norm": 1.8233202695846558, "learning_rate": 5.057726175722392e-07, "loss": 0.8328, "step": 8385 }, { "epoch": 0.5198909406370058, "grad_norm": 1.5963001251220703, "learning_rate": 5.05446480986237e-07, "loss": 0.8851, "step": 8390 }, { "epoch": 0.5202007683727847, "grad_norm": 1.9502710103988647, "learning_rate": 5.051203444002348e-07, "loss": 0.8687, "step": 8395 }, { "epoch": 0.5205105961085637, "grad_norm": 1.5978164672851562, "learning_rate": 5.047942078142325e-07, "loss": 0.8546, "step": 8400 }, { "epoch": 0.5208204238443426, "grad_norm": 1.6348098516464233, "learning_rate": 5.044680712282303e-07, "loss": 0.8048, "step": 8405 }, { "epoch": 0.5211302515801215, "grad_norm": 1.4563908576965332, "learning_rate": 5.04141934642228e-07, "loss": 0.8488, "step": 8410 }, { "epoch": 0.5214400793159003, "grad_norm": 2.1654467582702637, "learning_rate": 5.038157980562259e-07, "loss": 0.8815, "step": 8415 }, { "epoch": 0.5217499070516792, "grad_norm": 1.621358036994934, "learning_rate": 5.034896614702238e-07, "loss": 0.8021, "step": 8420 }, { "epoch": 0.5220597347874582, "grad_norm": 1.6545026302337646, "learning_rate": 5.031635248842215e-07, "loss": 0.8328, "step": 8425 }, { "epoch": 0.5223695625232371, "grad_norm": 1.3770865201950073, "learning_rate": 5.028373882982193e-07, "loss": 0.8243, "step": 8430 }, { "epoch": 0.522679390259016, "grad_norm": 1.7321503162384033, "learning_rate": 5.025112517122171e-07, "loss": 0.8469, "step": 8435 }, { "epoch": 0.5229892179947949, "grad_norm": 1.712153673171997, "learning_rate": 5.021851151262148e-07, "loss": 0.8929, "step": 8440 }, { "epoch": 0.5232990457305738, "grad_norm": 1.6317592859268188, "learning_rate": 5.018589785402126e-07, "loss": 0.8864, "step": 8445 }, { "epoch": 0.5236088734663527, "grad_norm": 2.11124587059021, "learning_rate": 5.015328419542105e-07, "loss": 0.8839, "step": 8450 }, { "epoch": 0.5239187012021316, "grad_norm": 1.7977371215820312, "learning_rate": 5.012067053682082e-07, "loss": 0.8382, "step": 8455 }, { "epoch": 0.5242285289379105, "grad_norm": 1.477576732635498, "learning_rate": 5.00880568782206e-07, "loss": 0.7666, "step": 8460 }, { "epoch": 0.5245383566736894, "grad_norm": 1.7309367656707764, "learning_rate": 5.005544321962037e-07, "loss": 0.8805, "step": 8465 }, { "epoch": 0.5248481844094683, "grad_norm": 1.6284860372543335, "learning_rate": 5.002282956102015e-07, "loss": 0.8364, "step": 8470 }, { "epoch": 0.5251580121452473, "grad_norm": 1.5256849527359009, "learning_rate": 4.999021590241993e-07, "loss": 0.8519, "step": 8475 }, { "epoch": 0.5254678398810262, "grad_norm": 1.5819694995880127, "learning_rate": 4.99576022438197e-07, "loss": 0.8128, "step": 8480 }, { "epoch": 0.5257776676168051, "grad_norm": 1.8476165533065796, "learning_rate": 4.992498858521948e-07, "loss": 0.827, "step": 8485 }, { "epoch": 0.5260874953525839, "grad_norm": 2.134643793106079, "learning_rate": 4.989237492661927e-07, "loss": 0.8682, "step": 8490 }, { "epoch": 0.5263973230883628, "grad_norm": 1.6717777252197266, "learning_rate": 4.985976126801904e-07, "loss": 0.8047, "step": 8495 }, { "epoch": 0.5267071508241418, "grad_norm": 1.8178949356079102, "learning_rate": 4.982714760941882e-07, "loss": 0.8567, "step": 8500 }, { "epoch": 0.5270169785599207, "grad_norm": 1.4222440719604492, "learning_rate": 4.979453395081861e-07, "loss": 0.8639, "step": 8505 }, { "epoch": 0.5273268062956996, "grad_norm": 1.495411992073059, "learning_rate": 4.976192029221838e-07, "loss": 0.8706, "step": 8510 }, { "epoch": 0.5276366340314785, "grad_norm": 1.6708877086639404, "learning_rate": 4.972930663361816e-07, "loss": 0.8482, "step": 8515 }, { "epoch": 0.5279464617672575, "grad_norm": 1.7918312549591064, "learning_rate": 4.969669297501793e-07, "loss": 0.8546, "step": 8520 }, { "epoch": 0.5282562895030363, "grad_norm": 1.4107096195220947, "learning_rate": 4.966407931641771e-07, "loss": 0.8336, "step": 8525 }, { "epoch": 0.5285661172388152, "grad_norm": 1.8647457361221313, "learning_rate": 4.963146565781749e-07, "loss": 0.8722, "step": 8530 }, { "epoch": 0.5288759449745941, "grad_norm": 1.6282156705856323, "learning_rate": 4.959885199921727e-07, "loss": 0.8273, "step": 8535 }, { "epoch": 0.529185772710373, "grad_norm": 2.137259006500244, "learning_rate": 4.956623834061705e-07, "loss": 0.8342, "step": 8540 }, { "epoch": 0.529495600446152, "grad_norm": 1.8959540128707886, "learning_rate": 4.953362468201683e-07, "loss": 0.8526, "step": 8545 }, { "epoch": 0.5298054281819309, "grad_norm": 1.8089044094085693, "learning_rate": 4.95010110234166e-07, "loss": 0.814, "step": 8550 }, { "epoch": 0.5301152559177098, "grad_norm": 1.7394587993621826, "learning_rate": 4.946839736481638e-07, "loss": 0.8453, "step": 8555 }, { "epoch": 0.5304250836534886, "grad_norm": 1.5755561590194702, "learning_rate": 4.943578370621617e-07, "loss": 0.8463, "step": 8560 }, { "epoch": 0.5307349113892675, "grad_norm": 1.5439486503601074, "learning_rate": 4.940317004761594e-07, "loss": 0.8722, "step": 8565 }, { "epoch": 0.5310447391250465, "grad_norm": 1.381575584411621, "learning_rate": 4.937055638901572e-07, "loss": 0.9039, "step": 8570 }, { "epoch": 0.5313545668608254, "grad_norm": 1.5845351219177246, "learning_rate": 4.93379427304155e-07, "loss": 0.8053, "step": 8575 }, { "epoch": 0.5316643945966043, "grad_norm": 1.5526572465896606, "learning_rate": 4.930532907181527e-07, "loss": 0.796, "step": 8580 }, { "epoch": 0.5319742223323832, "grad_norm": 1.557541847229004, "learning_rate": 4.927271541321506e-07, "loss": 0.8615, "step": 8585 }, { "epoch": 0.5322840500681622, "grad_norm": 1.644456386566162, "learning_rate": 4.924010175461483e-07, "loss": 0.8095, "step": 8590 }, { "epoch": 0.532593877803941, "grad_norm": 1.9941504001617432, "learning_rate": 4.920748809601461e-07, "loss": 0.8656, "step": 8595 }, { "epoch": 0.5329037055397199, "grad_norm": 1.7554852962493896, "learning_rate": 4.917487443741439e-07, "loss": 0.8599, "step": 8600 }, { "epoch": 0.5332135332754988, "grad_norm": 1.6949080228805542, "learning_rate": 4.914226077881416e-07, "loss": 0.8759, "step": 8605 }, { "epoch": 0.5335233610112777, "grad_norm": 1.5834298133850098, "learning_rate": 4.910964712021394e-07, "loss": 0.8616, "step": 8610 }, { "epoch": 0.5338331887470567, "grad_norm": 1.668610692024231, "learning_rate": 4.907703346161372e-07, "loss": 0.8224, "step": 8615 }, { "epoch": 0.5341430164828356, "grad_norm": 1.5433480739593506, "learning_rate": 4.90444198030135e-07, "loss": 0.9391, "step": 8620 }, { "epoch": 0.5344528442186145, "grad_norm": 1.6526790857315063, "learning_rate": 4.901180614441328e-07, "loss": 0.8302, "step": 8625 }, { "epoch": 0.5347626719543933, "grad_norm": 1.5730128288269043, "learning_rate": 4.897919248581305e-07, "loss": 0.8886, "step": 8630 }, { "epoch": 0.5350724996901722, "grad_norm": 1.7988613843917847, "learning_rate": 4.894657882721284e-07, "loss": 0.8625, "step": 8635 }, { "epoch": 0.5353823274259512, "grad_norm": 2.08647084236145, "learning_rate": 4.891396516861262e-07, "loss": 0.8687, "step": 8640 }, { "epoch": 0.5356921551617301, "grad_norm": 1.4177018404006958, "learning_rate": 4.888135151001239e-07, "loss": 0.8182, "step": 8645 }, { "epoch": 0.536001982897509, "grad_norm": 1.6752421855926514, "learning_rate": 4.884873785141217e-07, "loss": 0.8572, "step": 8650 }, { "epoch": 0.5363118106332879, "grad_norm": 1.6344586610794067, "learning_rate": 4.881612419281195e-07, "loss": 0.8765, "step": 8655 }, { "epoch": 0.5366216383690668, "grad_norm": 1.7153410911560059, "learning_rate": 4.878351053421172e-07, "loss": 0.8473, "step": 8660 }, { "epoch": 0.5369314661048457, "grad_norm": 1.8087345361709595, "learning_rate": 4.87508968756115e-07, "loss": 0.8685, "step": 8665 }, { "epoch": 0.5372412938406246, "grad_norm": 1.6836060285568237, "learning_rate": 4.871828321701128e-07, "loss": 0.9266, "step": 8670 }, { "epoch": 0.5375511215764035, "grad_norm": 1.5287762880325317, "learning_rate": 4.868566955841106e-07, "loss": 0.8612, "step": 8675 }, { "epoch": 0.5378609493121824, "grad_norm": 1.6390471458435059, "learning_rate": 4.865305589981084e-07, "loss": 0.7993, "step": 8680 }, { "epoch": 0.5381707770479613, "grad_norm": 1.956032156944275, "learning_rate": 4.862044224121061e-07, "loss": 0.8465, "step": 8685 }, { "epoch": 0.5384806047837403, "grad_norm": 1.510514497756958, "learning_rate": 4.85878285826104e-07, "loss": 0.8177, "step": 8690 }, { "epoch": 0.5387904325195192, "grad_norm": 1.7792279720306396, "learning_rate": 4.855521492401018e-07, "loss": 0.8599, "step": 8695 }, { "epoch": 0.539100260255298, "grad_norm": 1.6927454471588135, "learning_rate": 4.852260126540995e-07, "loss": 0.9118, "step": 8700 }, { "epoch": 0.5394100879910769, "grad_norm": 1.7702076435089111, "learning_rate": 4.848998760680973e-07, "loss": 0.8325, "step": 8705 }, { "epoch": 0.5397199157268558, "grad_norm": 1.6830812692642212, "learning_rate": 4.845737394820951e-07, "loss": 0.8118, "step": 8710 }, { "epoch": 0.5400297434626348, "grad_norm": 1.6260417699813843, "learning_rate": 4.842476028960928e-07, "loss": 0.8437, "step": 8715 }, { "epoch": 0.5403395711984137, "grad_norm": 1.6410142183303833, "learning_rate": 4.839214663100906e-07, "loss": 0.863, "step": 8720 }, { "epoch": 0.5406493989341926, "grad_norm": 1.5820049047470093, "learning_rate": 4.835953297240885e-07, "loss": 0.8008, "step": 8725 }, { "epoch": 0.5409592266699715, "grad_norm": 1.7079308032989502, "learning_rate": 4.832691931380862e-07, "loss": 0.8467, "step": 8730 }, { "epoch": 0.5412690544057503, "grad_norm": 1.7541067600250244, "learning_rate": 4.82943056552084e-07, "loss": 0.8529, "step": 8735 }, { "epoch": 0.5415788821415293, "grad_norm": 1.7323400974273682, "learning_rate": 4.826169199660817e-07, "loss": 0.8141, "step": 8740 }, { "epoch": 0.5418887098773082, "grad_norm": 1.5088139772415161, "learning_rate": 4.822907833800795e-07, "loss": 0.8925, "step": 8745 }, { "epoch": 0.5421985376130871, "grad_norm": 1.8778754472732544, "learning_rate": 4.819646467940774e-07, "loss": 0.7807, "step": 8750 }, { "epoch": 0.542508365348866, "grad_norm": 1.6912564039230347, "learning_rate": 4.816385102080751e-07, "loss": 0.8174, "step": 8755 }, { "epoch": 0.542818193084645, "grad_norm": 1.835204005241394, "learning_rate": 4.813123736220729e-07, "loss": 0.9071, "step": 8760 }, { "epoch": 0.5431280208204239, "grad_norm": 1.5753315687179565, "learning_rate": 4.809862370360707e-07, "loss": 0.7793, "step": 8765 }, { "epoch": 0.5434378485562027, "grad_norm": 1.7688490152359009, "learning_rate": 4.806601004500684e-07, "loss": 0.8756, "step": 8770 }, { "epoch": 0.5437476762919816, "grad_norm": 1.5350594520568848, "learning_rate": 4.803339638640663e-07, "loss": 0.8092, "step": 8775 }, { "epoch": 0.5440575040277605, "grad_norm": 1.4715873003005981, "learning_rate": 4.800078272780641e-07, "loss": 0.8921, "step": 8780 }, { "epoch": 0.5443673317635395, "grad_norm": 1.4639246463775635, "learning_rate": 4.796816906920618e-07, "loss": 0.8738, "step": 8785 }, { "epoch": 0.5446771594993184, "grad_norm": 1.7313624620437622, "learning_rate": 4.793555541060596e-07, "loss": 0.8736, "step": 8790 }, { "epoch": 0.5449869872350973, "grad_norm": 1.8000696897506714, "learning_rate": 4.790294175200573e-07, "loss": 0.8446, "step": 8795 }, { "epoch": 0.5452968149708762, "grad_norm": 1.5014041662216187, "learning_rate": 4.787032809340551e-07, "loss": 0.8491, "step": 8800 }, { "epoch": 0.5456066427066552, "grad_norm": 1.5334349870681763, "learning_rate": 4.78377144348053e-07, "loss": 0.8228, "step": 8805 }, { "epoch": 0.545916470442434, "grad_norm": 2.3809571266174316, "learning_rate": 4.780510077620507e-07, "loss": 0.9113, "step": 8810 }, { "epoch": 0.5462262981782129, "grad_norm": 1.6832716464996338, "learning_rate": 4.777248711760485e-07, "loss": 0.8591, "step": 8815 }, { "epoch": 0.5465361259139918, "grad_norm": 1.6385915279388428, "learning_rate": 4.773987345900464e-07, "loss": 0.8504, "step": 8820 }, { "epoch": 0.5468459536497707, "grad_norm": 1.6378227472305298, "learning_rate": 4.770725980040441e-07, "loss": 0.8369, "step": 8825 }, { "epoch": 0.5471557813855497, "grad_norm": 1.7078901529312134, "learning_rate": 4.767464614180419e-07, "loss": 0.7849, "step": 8830 }, { "epoch": 0.5474656091213286, "grad_norm": 1.5905717611312866, "learning_rate": 4.7642032483203965e-07, "loss": 0.8534, "step": 8835 }, { "epoch": 0.5477754368571075, "grad_norm": 1.5892161130905151, "learning_rate": 4.760941882460374e-07, "loss": 0.8537, "step": 8840 }, { "epoch": 0.5480852645928863, "grad_norm": 1.5843099355697632, "learning_rate": 4.757680516600352e-07, "loss": 0.8027, "step": 8845 }, { "epoch": 0.5483950923286652, "grad_norm": 1.5926589965820312, "learning_rate": 4.75441915074033e-07, "loss": 0.8011, "step": 8850 }, { "epoch": 0.5487049200644442, "grad_norm": 1.7433909177780151, "learning_rate": 4.7511577848803076e-07, "loss": 0.8159, "step": 8855 }, { "epoch": 0.5490147478002231, "grad_norm": 1.6423379182815552, "learning_rate": 4.747896419020285e-07, "loss": 0.8352, "step": 8860 }, { "epoch": 0.549324575536002, "grad_norm": 1.770874261856079, "learning_rate": 4.7446350531602634e-07, "loss": 0.8666, "step": 8865 }, { "epoch": 0.5496344032717809, "grad_norm": 1.5993996858596802, "learning_rate": 4.7413736873002415e-07, "loss": 0.8303, "step": 8870 }, { "epoch": 0.5499442310075598, "grad_norm": 1.753954529762268, "learning_rate": 4.738112321440219e-07, "loss": 0.8847, "step": 8875 }, { "epoch": 0.5502540587433387, "grad_norm": 1.5554791688919067, "learning_rate": 4.734850955580197e-07, "loss": 0.7883, "step": 8880 }, { "epoch": 0.5505638864791176, "grad_norm": 1.5835696458816528, "learning_rate": 4.7315895897201744e-07, "loss": 0.8873, "step": 8885 }, { "epoch": 0.5508737142148965, "grad_norm": 1.7884044647216797, "learning_rate": 4.7283282238601525e-07, "loss": 0.8511, "step": 8890 }, { "epoch": 0.5511835419506754, "grad_norm": 1.7862368822097778, "learning_rate": 4.72506685800013e-07, "loss": 0.8264, "step": 8895 }, { "epoch": 0.5514933696864543, "grad_norm": 1.8445030450820923, "learning_rate": 4.721805492140108e-07, "loss": 0.8501, "step": 8900 }, { "epoch": 0.5518031974222333, "grad_norm": 1.6755388975143433, "learning_rate": 4.718544126280086e-07, "loss": 0.8186, "step": 8905 }, { "epoch": 0.5521130251580122, "grad_norm": 1.5236341953277588, "learning_rate": 4.7152827604200636e-07, "loss": 0.8143, "step": 8910 }, { "epoch": 0.552422852893791, "grad_norm": 1.7451194524765015, "learning_rate": 4.7120213945600417e-07, "loss": 0.8559, "step": 8915 }, { "epoch": 0.5527326806295699, "grad_norm": 1.5990793704986572, "learning_rate": 4.7087600287000193e-07, "loss": 0.8191, "step": 8920 }, { "epoch": 0.5530425083653489, "grad_norm": 1.6096893548965454, "learning_rate": 4.7054986628399975e-07, "loss": 0.8501, "step": 8925 }, { "epoch": 0.5533523361011278, "grad_norm": 1.5403339862823486, "learning_rate": 4.702237296979975e-07, "loss": 0.8202, "step": 8930 }, { "epoch": 0.5536621638369067, "grad_norm": 1.643929362297058, "learning_rate": 4.698975931119953e-07, "loss": 0.8387, "step": 8935 }, { "epoch": 0.5539719915726856, "grad_norm": 1.5548503398895264, "learning_rate": 4.6957145652599304e-07, "loss": 0.9084, "step": 8940 }, { "epoch": 0.5542818193084645, "grad_norm": 1.446750283241272, "learning_rate": 4.6924531993999085e-07, "loss": 0.8886, "step": 8945 }, { "epoch": 0.5545916470442434, "grad_norm": 1.8338066339492798, "learning_rate": 4.689191833539886e-07, "loss": 0.9158, "step": 8950 }, { "epoch": 0.5549014747800223, "grad_norm": 1.6940746307373047, "learning_rate": 4.685930467679864e-07, "loss": 0.8678, "step": 8955 }, { "epoch": 0.5552113025158012, "grad_norm": 1.5374631881713867, "learning_rate": 4.6826691018198425e-07, "loss": 0.9113, "step": 8960 }, { "epoch": 0.5555211302515801, "grad_norm": 1.3977867364883423, "learning_rate": 4.67940773595982e-07, "loss": 0.8463, "step": 8965 }, { "epoch": 0.555830957987359, "grad_norm": 1.548040747642517, "learning_rate": 4.6761463700997977e-07, "loss": 0.8795, "step": 8970 }, { "epoch": 0.556140785723138, "grad_norm": 1.5224881172180176, "learning_rate": 4.6728850042397753e-07, "loss": 0.8808, "step": 8975 }, { "epoch": 0.5564506134589169, "grad_norm": 1.5475249290466309, "learning_rate": 4.6696236383797535e-07, "loss": 0.8858, "step": 8980 }, { "epoch": 0.5567604411946957, "grad_norm": 1.6373364925384521, "learning_rate": 4.666362272519731e-07, "loss": 0.8178, "step": 8985 }, { "epoch": 0.5570702689304746, "grad_norm": 1.8445409536361694, "learning_rate": 4.663100906659709e-07, "loss": 0.926, "step": 8990 }, { "epoch": 0.5573800966662535, "grad_norm": 2.1343796253204346, "learning_rate": 4.6598395407996864e-07, "loss": 0.7971, "step": 8995 }, { "epoch": 0.5576899244020325, "grad_norm": 2.0846943855285645, "learning_rate": 4.6565781749396645e-07, "loss": 0.8253, "step": 9000 }, { "epoch": 0.5579997521378114, "grad_norm": 1.6863757371902466, "learning_rate": 4.653316809079642e-07, "loss": 0.8694, "step": 9005 }, { "epoch": 0.5583095798735903, "grad_norm": 1.5639930963516235, "learning_rate": 4.6500554432196203e-07, "loss": 0.8417, "step": 9010 }, { "epoch": 0.5586194076093692, "grad_norm": 1.5621155500411987, "learning_rate": 4.646794077359598e-07, "loss": 0.8384, "step": 9015 }, { "epoch": 0.558929235345148, "grad_norm": 1.715612530708313, "learning_rate": 4.643532711499576e-07, "loss": 0.8478, "step": 9020 }, { "epoch": 0.559239063080927, "grad_norm": 1.6376852989196777, "learning_rate": 4.6402713456395537e-07, "loss": 0.8245, "step": 9025 }, { "epoch": 0.5595488908167059, "grad_norm": 2.060209274291992, "learning_rate": 4.6370099797795313e-07, "loss": 0.9005, "step": 9030 }, { "epoch": 0.5598587185524848, "grad_norm": 1.6342881917953491, "learning_rate": 4.6337486139195095e-07, "loss": 0.7854, "step": 9035 }, { "epoch": 0.5601685462882637, "grad_norm": 1.6002073287963867, "learning_rate": 4.630487248059487e-07, "loss": 0.8922, "step": 9040 }, { "epoch": 0.5604783740240427, "grad_norm": 2.194197654724121, "learning_rate": 4.6272258821994647e-07, "loss": 0.8335, "step": 9045 }, { "epoch": 0.5607882017598216, "grad_norm": 1.9194248914718628, "learning_rate": 4.6239645163394424e-07, "loss": 0.8859, "step": 9050 }, { "epoch": 0.5610980294956004, "grad_norm": 1.5561786890029907, "learning_rate": 4.620703150479421e-07, "loss": 0.8551, "step": 9055 }, { "epoch": 0.5614078572313793, "grad_norm": 1.629638910293579, "learning_rate": 4.6174417846193987e-07, "loss": 0.8569, "step": 9060 }, { "epoch": 0.5617176849671582, "grad_norm": 1.5860921144485474, "learning_rate": 4.6141804187593763e-07, "loss": 0.8802, "step": 9065 }, { "epoch": 0.5620275127029372, "grad_norm": 1.6280142068862915, "learning_rate": 4.610919052899354e-07, "loss": 0.8044, "step": 9070 }, { "epoch": 0.5623373404387161, "grad_norm": 1.6416586637496948, "learning_rate": 4.607657687039332e-07, "loss": 0.8714, "step": 9075 }, { "epoch": 0.562647168174495, "grad_norm": 1.417209506034851, "learning_rate": 4.6043963211793097e-07, "loss": 0.8435, "step": 9080 }, { "epoch": 0.5629569959102739, "grad_norm": 1.608924150466919, "learning_rate": 4.6011349553192873e-07, "loss": 0.8681, "step": 9085 }, { "epoch": 0.5632668236460528, "grad_norm": 1.7608236074447632, "learning_rate": 4.597873589459265e-07, "loss": 0.8738, "step": 9090 }, { "epoch": 0.5635766513818317, "grad_norm": 1.467663288116455, "learning_rate": 4.594612223599243e-07, "loss": 0.7951, "step": 9095 }, { "epoch": 0.5638864791176106, "grad_norm": 1.616631031036377, "learning_rate": 4.5913508577392207e-07, "loss": 0.8344, "step": 9100 }, { "epoch": 0.5641963068533895, "grad_norm": 2.0431597232818604, "learning_rate": 4.588089491879199e-07, "loss": 0.8038, "step": 9105 }, { "epoch": 0.5645061345891684, "grad_norm": 1.9531164169311523, "learning_rate": 4.584828126019177e-07, "loss": 0.8136, "step": 9110 }, { "epoch": 0.5648159623249474, "grad_norm": 1.474901795387268, "learning_rate": 4.5815667601591546e-07, "loss": 0.8692, "step": 9115 }, { "epoch": 0.5651257900607263, "grad_norm": 1.483241081237793, "learning_rate": 4.5783053942991323e-07, "loss": 0.8068, "step": 9120 }, { "epoch": 0.5654356177965052, "grad_norm": 1.6350326538085938, "learning_rate": 4.57504402843911e-07, "loss": 0.8017, "step": 9125 }, { "epoch": 0.565745445532284, "grad_norm": 1.6167656183242798, "learning_rate": 4.571782662579088e-07, "loss": 0.8197, "step": 9130 }, { "epoch": 0.5660552732680629, "grad_norm": 1.611594557762146, "learning_rate": 4.5685212967190657e-07, "loss": 0.874, "step": 9135 }, { "epoch": 0.5663651010038419, "grad_norm": 1.7013883590698242, "learning_rate": 4.5652599308590433e-07, "loss": 0.8551, "step": 9140 }, { "epoch": 0.5666749287396208, "grad_norm": 1.4185079336166382, "learning_rate": 4.561998564999021e-07, "loss": 0.8685, "step": 9145 }, { "epoch": 0.5669847564753997, "grad_norm": 1.6056551933288574, "learning_rate": 4.5587371991389996e-07, "loss": 0.8568, "step": 9150 }, { "epoch": 0.5672945842111786, "grad_norm": 2.489499568939209, "learning_rate": 4.555475833278977e-07, "loss": 0.8802, "step": 9155 }, { "epoch": 0.5676044119469575, "grad_norm": 1.4226900339126587, "learning_rate": 4.552214467418955e-07, "loss": 0.8072, "step": 9160 }, { "epoch": 0.5679142396827364, "grad_norm": 1.4512708187103271, "learning_rate": 4.548953101558933e-07, "loss": 0.7999, "step": 9165 }, { "epoch": 0.5682240674185153, "grad_norm": 1.644745945930481, "learning_rate": 4.5456917356989106e-07, "loss": 0.9195, "step": 9170 }, { "epoch": 0.5685338951542942, "grad_norm": 1.6725927591323853, "learning_rate": 4.542430369838888e-07, "loss": 0.7939, "step": 9175 }, { "epoch": 0.5688437228900731, "grad_norm": 1.7098997831344604, "learning_rate": 4.539169003978866e-07, "loss": 0.8409, "step": 9180 }, { "epoch": 0.569153550625852, "grad_norm": 1.6662946939468384, "learning_rate": 4.535907638118844e-07, "loss": 0.8744, "step": 9185 }, { "epoch": 0.569463378361631, "grad_norm": 1.6171815395355225, "learning_rate": 4.5326462722588217e-07, "loss": 0.8256, "step": 9190 }, { "epoch": 0.5697732060974099, "grad_norm": 1.8097420930862427, "learning_rate": 4.5293849063988e-07, "loss": 0.7923, "step": 9195 }, { "epoch": 0.5700830338331887, "grad_norm": 1.907952070236206, "learning_rate": 4.5261235405387774e-07, "loss": 0.8583, "step": 9200 }, { "epoch": 0.5703928615689676, "grad_norm": 1.8906043767929077, "learning_rate": 4.5228621746787556e-07, "loss": 0.8553, "step": 9205 }, { "epoch": 0.5707026893047465, "grad_norm": 1.6357717514038086, "learning_rate": 4.519600808818733e-07, "loss": 0.8347, "step": 9210 }, { "epoch": 0.5710125170405255, "grad_norm": 1.6732017993927002, "learning_rate": 4.516339442958711e-07, "loss": 0.8915, "step": 9215 }, { "epoch": 0.5713223447763044, "grad_norm": 1.8675826787948608, "learning_rate": 4.5130780770986885e-07, "loss": 0.8245, "step": 9220 }, { "epoch": 0.5716321725120833, "grad_norm": 1.665645956993103, "learning_rate": 4.5098167112386666e-07, "loss": 0.8198, "step": 9225 }, { "epoch": 0.5719420002478622, "grad_norm": 1.5749092102050781, "learning_rate": 4.506555345378644e-07, "loss": 0.8586, "step": 9230 }, { "epoch": 0.572251827983641, "grad_norm": 1.7417221069335938, "learning_rate": 4.503293979518622e-07, "loss": 0.8336, "step": 9235 }, { "epoch": 0.57256165571942, "grad_norm": 1.6346397399902344, "learning_rate": 4.5000326136585995e-07, "loss": 0.8308, "step": 9240 }, { "epoch": 0.5728714834551989, "grad_norm": 1.8254103660583496, "learning_rate": 4.496771247798578e-07, "loss": 0.8412, "step": 9245 }, { "epoch": 0.5731813111909778, "grad_norm": 1.709876537322998, "learning_rate": 4.493509881938556e-07, "loss": 0.8195, "step": 9250 }, { "epoch": 0.5734911389267567, "grad_norm": 1.6724433898925781, "learning_rate": 4.4902485160785334e-07, "loss": 0.8731, "step": 9255 }, { "epoch": 0.5738009666625357, "grad_norm": 1.3522034883499146, "learning_rate": 4.4869871502185116e-07, "loss": 0.8726, "step": 9260 }, { "epoch": 0.5741107943983146, "grad_norm": 1.7748074531555176, "learning_rate": 4.483725784358489e-07, "loss": 0.8912, "step": 9265 }, { "epoch": 0.5744206221340934, "grad_norm": 1.8456441164016724, "learning_rate": 4.480464418498467e-07, "loss": 0.8605, "step": 9270 }, { "epoch": 0.5747304498698723, "grad_norm": 1.566980004310608, "learning_rate": 4.4772030526384445e-07, "loss": 0.8399, "step": 9275 }, { "epoch": 0.5750402776056512, "grad_norm": 1.6955028772354126, "learning_rate": 4.4739416867784226e-07, "loss": 0.9408, "step": 9280 }, { "epoch": 0.5753501053414302, "grad_norm": 1.6292763948440552, "learning_rate": 4.4706803209184e-07, "loss": 0.8696, "step": 9285 }, { "epoch": 0.5756599330772091, "grad_norm": 1.8068493604660034, "learning_rate": 4.4674189550583784e-07, "loss": 0.8599, "step": 9290 }, { "epoch": 0.575969760812988, "grad_norm": 1.6942410469055176, "learning_rate": 4.464157589198356e-07, "loss": 0.8639, "step": 9295 }, { "epoch": 0.5762795885487669, "grad_norm": 1.4769914150238037, "learning_rate": 4.460896223338334e-07, "loss": 0.8174, "step": 9300 }, { "epoch": 0.5765894162845457, "grad_norm": 1.5522867441177368, "learning_rate": 4.457634857478312e-07, "loss": 0.8231, "step": 9305 }, { "epoch": 0.5768992440203247, "grad_norm": 1.5715887546539307, "learning_rate": 4.4543734916182894e-07, "loss": 0.8573, "step": 9310 }, { "epoch": 0.5772090717561036, "grad_norm": 1.5352972745895386, "learning_rate": 4.4511121257582676e-07, "loss": 0.8288, "step": 9315 }, { "epoch": 0.5775188994918825, "grad_norm": 1.6686139106750488, "learning_rate": 4.447850759898245e-07, "loss": 0.8284, "step": 9320 }, { "epoch": 0.5778287272276614, "grad_norm": 1.97742760181427, "learning_rate": 4.444589394038223e-07, "loss": 0.8526, "step": 9325 }, { "epoch": 0.5781385549634404, "grad_norm": 1.4597872495651245, "learning_rate": 4.4413280281782005e-07, "loss": 0.8254, "step": 9330 }, { "epoch": 0.5784483826992193, "grad_norm": 1.9278160333633423, "learning_rate": 4.4380666623181786e-07, "loss": 0.8768, "step": 9335 }, { "epoch": 0.5787582104349981, "grad_norm": 1.8820594549179077, "learning_rate": 4.434805296458157e-07, "loss": 0.8902, "step": 9340 }, { "epoch": 0.579068038170777, "grad_norm": 2.046740770339966, "learning_rate": 4.4315439305981344e-07, "loss": 0.88, "step": 9345 }, { "epoch": 0.5793778659065559, "grad_norm": 1.6918457746505737, "learning_rate": 4.428282564738112e-07, "loss": 0.8115, "step": 9350 }, { "epoch": 0.5796876936423349, "grad_norm": 1.8193066120147705, "learning_rate": 4.42502119887809e-07, "loss": 0.8893, "step": 9355 }, { "epoch": 0.5799975213781138, "grad_norm": 1.4376704692840576, "learning_rate": 4.421759833018068e-07, "loss": 0.8646, "step": 9360 }, { "epoch": 0.5803073491138927, "grad_norm": 1.6721975803375244, "learning_rate": 4.4184984671580454e-07, "loss": 0.8481, "step": 9365 }, { "epoch": 0.5806171768496716, "grad_norm": 1.8602451086044312, "learning_rate": 4.415237101298023e-07, "loss": 0.8609, "step": 9370 }, { "epoch": 0.5809270045854504, "grad_norm": 1.6349927186965942, "learning_rate": 4.411975735438001e-07, "loss": 0.8209, "step": 9375 }, { "epoch": 0.5812368323212294, "grad_norm": 1.545822262763977, "learning_rate": 4.408714369577979e-07, "loss": 0.8876, "step": 9380 }, { "epoch": 0.5815466600570083, "grad_norm": 1.6170450448989868, "learning_rate": 4.405453003717957e-07, "loss": 0.845, "step": 9385 }, { "epoch": 0.5818564877927872, "grad_norm": 1.6861709356307983, "learning_rate": 4.402191637857935e-07, "loss": 0.9049, "step": 9390 }, { "epoch": 0.5821663155285661, "grad_norm": 1.8511162996292114, "learning_rate": 4.398930271997913e-07, "loss": 0.8427, "step": 9395 }, { "epoch": 0.582476143264345, "grad_norm": 1.6483521461486816, "learning_rate": 4.3956689061378904e-07, "loss": 0.8652, "step": 9400 }, { "epoch": 0.582785971000124, "grad_norm": 1.6341553926467896, "learning_rate": 4.392407540277868e-07, "loss": 0.8177, "step": 9405 }, { "epoch": 0.5830957987359029, "grad_norm": 1.5284521579742432, "learning_rate": 4.389146174417846e-07, "loss": 0.8717, "step": 9410 }, { "epoch": 0.5834056264716817, "grad_norm": 1.9985013008117676, "learning_rate": 4.385884808557824e-07, "loss": 0.8554, "step": 9415 }, { "epoch": 0.5837154542074606, "grad_norm": 1.7432397603988647, "learning_rate": 4.3826234426978014e-07, "loss": 0.8329, "step": 9420 }, { "epoch": 0.5840252819432395, "grad_norm": 1.9207228422164917, "learning_rate": 4.379362076837779e-07, "loss": 0.841, "step": 9425 }, { "epoch": 0.5843351096790185, "grad_norm": 1.8392919301986694, "learning_rate": 4.376100710977757e-07, "loss": 0.8253, "step": 9430 }, { "epoch": 0.5846449374147974, "grad_norm": 1.7682520151138306, "learning_rate": 4.3728393451177353e-07, "loss": 0.8398, "step": 9435 }, { "epoch": 0.5849547651505763, "grad_norm": 1.9339767694473267, "learning_rate": 4.369577979257713e-07, "loss": 0.907, "step": 9440 }, { "epoch": 0.5852645928863552, "grad_norm": 1.50418221950531, "learning_rate": 4.366316613397691e-07, "loss": 0.845, "step": 9445 }, { "epoch": 0.585574420622134, "grad_norm": 1.6069998741149902, "learning_rate": 4.363055247537669e-07, "loss": 0.8602, "step": 9450 }, { "epoch": 0.585884248357913, "grad_norm": 1.6524406671524048, "learning_rate": 4.3597938816776464e-07, "loss": 0.8687, "step": 9455 }, { "epoch": 0.5861940760936919, "grad_norm": 1.5521221160888672, "learning_rate": 4.356532515817624e-07, "loss": 0.8324, "step": 9460 }, { "epoch": 0.5865039038294708, "grad_norm": 1.616046667098999, "learning_rate": 4.353271149957602e-07, "loss": 0.8468, "step": 9465 }, { "epoch": 0.5868137315652497, "grad_norm": 1.8592383861541748, "learning_rate": 4.35000978409758e-07, "loss": 0.8465, "step": 9470 }, { "epoch": 0.5871235593010287, "grad_norm": 1.5164600610733032, "learning_rate": 4.3467484182375574e-07, "loss": 0.816, "step": 9475 }, { "epoch": 0.5874333870368076, "grad_norm": 1.6536730527877808, "learning_rate": 4.3434870523775355e-07, "loss": 0.8781, "step": 9480 }, { "epoch": 0.5877432147725864, "grad_norm": 1.5471912622451782, "learning_rate": 4.3402256865175137e-07, "loss": 0.8214, "step": 9485 }, { "epoch": 0.5880530425083653, "grad_norm": 1.9584338665008545, "learning_rate": 4.3369643206574913e-07, "loss": 0.8052, "step": 9490 }, { "epoch": 0.5883628702441442, "grad_norm": 1.5765401124954224, "learning_rate": 4.333702954797469e-07, "loss": 0.8016, "step": 9495 }, { "epoch": 0.5886726979799232, "grad_norm": 1.5389227867126465, "learning_rate": 4.3304415889374466e-07, "loss": 0.8493, "step": 9500 }, { "epoch": 0.5889825257157021, "grad_norm": 1.6139471530914307, "learning_rate": 4.3271802230774247e-07, "loss": 0.8741, "step": 9505 }, { "epoch": 0.589292353451481, "grad_norm": 1.741050362586975, "learning_rate": 4.3239188572174024e-07, "loss": 0.8649, "step": 9510 }, { "epoch": 0.5896021811872599, "grad_norm": 1.6417787075042725, "learning_rate": 4.32065749135738e-07, "loss": 0.8796, "step": 9515 }, { "epoch": 0.5899120089230387, "grad_norm": 1.537497639656067, "learning_rate": 4.317396125497358e-07, "loss": 0.8107, "step": 9520 }, { "epoch": 0.5902218366588177, "grad_norm": 1.7288134098052979, "learning_rate": 4.3141347596373363e-07, "loss": 0.901, "step": 9525 }, { "epoch": 0.5905316643945966, "grad_norm": 1.7238481044769287, "learning_rate": 4.310873393777314e-07, "loss": 0.881, "step": 9530 }, { "epoch": 0.5908414921303755, "grad_norm": 1.8005932569503784, "learning_rate": 4.3076120279172915e-07, "loss": 0.8022, "step": 9535 }, { "epoch": 0.5911513198661544, "grad_norm": 1.7062708139419556, "learning_rate": 4.3043506620572697e-07, "loss": 0.8499, "step": 9540 }, { "epoch": 0.5914611476019334, "grad_norm": 1.966982126235962, "learning_rate": 4.3010892961972473e-07, "loss": 0.8962, "step": 9545 }, { "epoch": 0.5917709753377123, "grad_norm": 1.5912953615188599, "learning_rate": 4.297827930337225e-07, "loss": 0.8506, "step": 9550 }, { "epoch": 0.5920808030734911, "grad_norm": 1.7628278732299805, "learning_rate": 4.2945665644772026e-07, "loss": 0.8313, "step": 9555 }, { "epoch": 0.59239063080927, "grad_norm": 1.7089394330978394, "learning_rate": 4.2913051986171807e-07, "loss": 0.8611, "step": 9560 }, { "epoch": 0.5927004585450489, "grad_norm": 1.563678503036499, "learning_rate": 4.2880438327571583e-07, "loss": 0.8574, "step": 9565 }, { "epoch": 0.5930102862808279, "grad_norm": 1.583106279373169, "learning_rate": 4.284782466897136e-07, "loss": 0.8487, "step": 9570 }, { "epoch": 0.5933201140166068, "grad_norm": 1.8034443855285645, "learning_rate": 4.2815211010371147e-07, "loss": 0.8704, "step": 9575 }, { "epoch": 0.5936299417523857, "grad_norm": 1.7355549335479736, "learning_rate": 4.2782597351770923e-07, "loss": 0.8178, "step": 9580 }, { "epoch": 0.5939397694881646, "grad_norm": 1.5846327543258667, "learning_rate": 4.27499836931707e-07, "loss": 0.8425, "step": 9585 }, { "epoch": 0.5942495972239434, "grad_norm": 1.5832010507583618, "learning_rate": 4.2717370034570475e-07, "loss": 0.8339, "step": 9590 }, { "epoch": 0.5945594249597224, "grad_norm": 1.6259633302688599, "learning_rate": 4.2684756375970257e-07, "loss": 0.8699, "step": 9595 }, { "epoch": 0.5948692526955013, "grad_norm": 2.369204521179199, "learning_rate": 4.2652142717370033e-07, "loss": 0.9881, "step": 9600 }, { "epoch": 0.5951790804312802, "grad_norm": 2.369272470474243, "learning_rate": 4.261952905876981e-07, "loss": 0.8156, "step": 9605 }, { "epoch": 0.5954889081670591, "grad_norm": 1.6879608631134033, "learning_rate": 4.2586915400169586e-07, "loss": 0.9084, "step": 9610 }, { "epoch": 0.595798735902838, "grad_norm": 1.6920260190963745, "learning_rate": 4.2554301741569367e-07, "loss": 0.8652, "step": 9615 }, { "epoch": 0.596108563638617, "grad_norm": 1.6749577522277832, "learning_rate": 4.252168808296915e-07, "loss": 0.8116, "step": 9620 }, { "epoch": 0.5964183913743958, "grad_norm": 1.6480072736740112, "learning_rate": 4.2489074424368925e-07, "loss": 0.8341, "step": 9625 }, { "epoch": 0.5967282191101747, "grad_norm": 1.5644205808639526, "learning_rate": 4.24564607657687e-07, "loss": 0.872, "step": 9630 }, { "epoch": 0.5970380468459536, "grad_norm": 1.6765652894973755, "learning_rate": 4.2423847107168483e-07, "loss": 0.8509, "step": 9635 }, { "epoch": 0.5973478745817326, "grad_norm": 1.6652307510375977, "learning_rate": 4.239123344856826e-07, "loss": 0.8518, "step": 9640 }, { "epoch": 0.5976577023175115, "grad_norm": 1.6183152198791504, "learning_rate": 4.2358619789968035e-07, "loss": 0.8722, "step": 9645 }, { "epoch": 0.5979675300532904, "grad_norm": 1.6434589624404907, "learning_rate": 4.2326006131367817e-07, "loss": 0.8415, "step": 9650 }, { "epoch": 0.5982773577890693, "grad_norm": 1.7056548595428467, "learning_rate": 4.2293392472767593e-07, "loss": 0.9023, "step": 9655 }, { "epoch": 0.5985871855248481, "grad_norm": 1.5401711463928223, "learning_rate": 4.226077881416737e-07, "loss": 0.8774, "step": 9660 }, { "epoch": 0.598897013260627, "grad_norm": 1.657860517501831, "learning_rate": 4.2228165155567145e-07, "loss": 0.8851, "step": 9665 }, { "epoch": 0.599206840996406, "grad_norm": 1.6441867351531982, "learning_rate": 4.219555149696693e-07, "loss": 0.8366, "step": 9670 }, { "epoch": 0.5995166687321849, "grad_norm": 1.653236746788025, "learning_rate": 4.216293783836671e-07, "loss": 0.8806, "step": 9675 }, { "epoch": 0.5998264964679638, "grad_norm": 1.576060175895691, "learning_rate": 4.2130324179766485e-07, "loss": 0.8435, "step": 9680 }, { "epoch": 0.6001363242037427, "grad_norm": 1.8991374969482422, "learning_rate": 4.209771052116626e-07, "loss": 0.8952, "step": 9685 }, { "epoch": 0.6004461519395217, "grad_norm": 1.6186144351959229, "learning_rate": 4.206509686256604e-07, "loss": 0.8345, "step": 9690 }, { "epoch": 0.6007559796753005, "grad_norm": 1.8214917182922363, "learning_rate": 4.203248320396582e-07, "loss": 0.8449, "step": 9695 }, { "epoch": 0.6010658074110794, "grad_norm": 1.6464420557022095, "learning_rate": 4.1999869545365595e-07, "loss": 0.8419, "step": 9700 }, { "epoch": 0.6013756351468583, "grad_norm": 1.6516224145889282, "learning_rate": 4.196725588676537e-07, "loss": 0.8912, "step": 9705 }, { "epoch": 0.6016854628826372, "grad_norm": 1.7919508218765259, "learning_rate": 4.1934642228165153e-07, "loss": 0.8625, "step": 9710 }, { "epoch": 0.6019952906184162, "grad_norm": 1.6947442293167114, "learning_rate": 4.1902028569564934e-07, "loss": 0.8694, "step": 9715 }, { "epoch": 0.6023051183541951, "grad_norm": 1.7044764757156372, "learning_rate": 4.186941491096471e-07, "loss": 0.8165, "step": 9720 }, { "epoch": 0.602614946089974, "grad_norm": 1.5834498405456543, "learning_rate": 4.183680125236449e-07, "loss": 0.8195, "step": 9725 }, { "epoch": 0.6029247738257529, "grad_norm": 1.9215415716171265, "learning_rate": 4.180418759376427e-07, "loss": 0.8067, "step": 9730 }, { "epoch": 0.6032346015615317, "grad_norm": 1.4289876222610474, "learning_rate": 4.1771573935164045e-07, "loss": 0.8601, "step": 9735 }, { "epoch": 0.6035444292973107, "grad_norm": 1.929069995880127, "learning_rate": 4.173896027656382e-07, "loss": 0.883, "step": 9740 }, { "epoch": 0.6038542570330896, "grad_norm": 1.6711370944976807, "learning_rate": 4.17063466179636e-07, "loss": 0.8284, "step": 9745 }, { "epoch": 0.6041640847688685, "grad_norm": 1.6329054832458496, "learning_rate": 4.167373295936338e-07, "loss": 0.8191, "step": 9750 }, { "epoch": 0.6044739125046474, "grad_norm": 1.733131766319275, "learning_rate": 4.1641119300763155e-07, "loss": 0.8897, "step": 9755 }, { "epoch": 0.6047837402404264, "grad_norm": 1.7404372692108154, "learning_rate": 4.160850564216293e-07, "loss": 0.7971, "step": 9760 }, { "epoch": 0.6050935679762053, "grad_norm": 1.830913782119751, "learning_rate": 4.157589198356272e-07, "loss": 0.8995, "step": 9765 }, { "epoch": 0.6054033957119841, "grad_norm": 1.6092594861984253, "learning_rate": 4.1543278324962494e-07, "loss": 0.8863, "step": 9770 }, { "epoch": 0.605713223447763, "grad_norm": 1.5492459535598755, "learning_rate": 4.151066466636227e-07, "loss": 0.819, "step": 9775 }, { "epoch": 0.6060230511835419, "grad_norm": 1.8934675455093384, "learning_rate": 4.1478051007762047e-07, "loss": 0.8859, "step": 9780 }, { "epoch": 0.6063328789193209, "grad_norm": 1.5038340091705322, "learning_rate": 4.144543734916183e-07, "loss": 0.8544, "step": 9785 }, { "epoch": 0.6066427066550998, "grad_norm": 1.774290919303894, "learning_rate": 4.1412823690561605e-07, "loss": 0.7838, "step": 9790 }, { "epoch": 0.6069525343908787, "grad_norm": 1.574328064918518, "learning_rate": 4.138021003196138e-07, "loss": 0.8217, "step": 9795 }, { "epoch": 0.6072623621266576, "grad_norm": 1.5191771984100342, "learning_rate": 4.134759637336116e-07, "loss": 0.8079, "step": 9800 }, { "epoch": 0.6075721898624364, "grad_norm": 1.650612473487854, "learning_rate": 4.131498271476094e-07, "loss": 0.8388, "step": 9805 }, { "epoch": 0.6078820175982154, "grad_norm": 1.6265493631362915, "learning_rate": 4.128236905616072e-07, "loss": 0.8388, "step": 9810 }, { "epoch": 0.6081918453339943, "grad_norm": 1.9890083074569702, "learning_rate": 4.1249755397560496e-07, "loss": 0.8407, "step": 9815 }, { "epoch": 0.6085016730697732, "grad_norm": 1.9271951913833618, "learning_rate": 4.121714173896028e-07, "loss": 0.8749, "step": 9820 }, { "epoch": 0.6088115008055521, "grad_norm": 1.4547626972198486, "learning_rate": 4.1184528080360054e-07, "loss": 0.8415, "step": 9825 }, { "epoch": 0.609121328541331, "grad_norm": 1.760406494140625, "learning_rate": 4.115191442175983e-07, "loss": 0.8417, "step": 9830 }, { "epoch": 0.60943115627711, "grad_norm": 1.6256040334701538, "learning_rate": 4.1119300763159607e-07, "loss": 0.87, "step": 9835 }, { "epoch": 0.6097409840128888, "grad_norm": 1.5776644945144653, "learning_rate": 4.108668710455939e-07, "loss": 0.7923, "step": 9840 }, { "epoch": 0.6100508117486677, "grad_norm": 1.5451703071594238, "learning_rate": 4.1054073445959164e-07, "loss": 0.8087, "step": 9845 }, { "epoch": 0.6103606394844466, "grad_norm": 1.6550801992416382, "learning_rate": 4.102145978735894e-07, "loss": 0.8795, "step": 9850 }, { "epoch": 0.6106704672202256, "grad_norm": 1.7790660858154297, "learning_rate": 4.098884612875873e-07, "loss": 0.8047, "step": 9855 }, { "epoch": 0.6109802949560045, "grad_norm": 1.5440723896026611, "learning_rate": 4.0956232470158504e-07, "loss": 0.8251, "step": 9860 }, { "epoch": 0.6112901226917834, "grad_norm": 1.5795471668243408, "learning_rate": 4.092361881155828e-07, "loss": 0.875, "step": 9865 }, { "epoch": 0.6115999504275623, "grad_norm": 1.5429435968399048, "learning_rate": 4.0891005152958056e-07, "loss": 0.8319, "step": 9870 }, { "epoch": 0.6119097781633411, "grad_norm": 1.6687488555908203, "learning_rate": 4.085839149435784e-07, "loss": 0.879, "step": 9875 }, { "epoch": 0.6122196058991201, "grad_norm": 1.8721920251846313, "learning_rate": 4.0825777835757614e-07, "loss": 0.8171, "step": 9880 }, { "epoch": 0.612529433634899, "grad_norm": 2.0056991577148438, "learning_rate": 4.079316417715739e-07, "loss": 0.8452, "step": 9885 }, { "epoch": 0.6128392613706779, "grad_norm": 1.6097028255462646, "learning_rate": 4.0760550518557167e-07, "loss": 0.811, "step": 9890 }, { "epoch": 0.6131490891064568, "grad_norm": 1.7039189338684082, "learning_rate": 4.072793685995695e-07, "loss": 0.8456, "step": 9895 }, { "epoch": 0.6134589168422357, "grad_norm": 1.4746054410934448, "learning_rate": 4.0695323201356724e-07, "loss": 0.8412, "step": 9900 }, { "epoch": 0.6137687445780147, "grad_norm": 1.6743019819259644, "learning_rate": 4.0662709542756506e-07, "loss": 0.8638, "step": 9905 }, { "epoch": 0.6140785723137935, "grad_norm": 1.7554916143417358, "learning_rate": 4.063009588415628e-07, "loss": 0.8193, "step": 9910 }, { "epoch": 0.6143884000495724, "grad_norm": 1.821881890296936, "learning_rate": 4.0597482225556064e-07, "loss": 0.7969, "step": 9915 }, { "epoch": 0.6146982277853513, "grad_norm": 1.703376054763794, "learning_rate": 4.056486856695584e-07, "loss": 0.836, "step": 9920 }, { "epoch": 0.6150080555211302, "grad_norm": 1.4818614721298218, "learning_rate": 4.0532254908355616e-07, "loss": 0.8223, "step": 9925 }, { "epoch": 0.6153178832569092, "grad_norm": 1.7278906106948853, "learning_rate": 4.04996412497554e-07, "loss": 0.9119, "step": 9930 }, { "epoch": 0.6156277109926881, "grad_norm": 1.6653218269348145, "learning_rate": 4.0467027591155174e-07, "loss": 0.8728, "step": 9935 }, { "epoch": 0.615937538728467, "grad_norm": 1.6434086561203003, "learning_rate": 4.043441393255495e-07, "loss": 0.8332, "step": 9940 }, { "epoch": 0.6162473664642458, "grad_norm": 1.8391027450561523, "learning_rate": 4.0401800273954726e-07, "loss": 0.8431, "step": 9945 }, { "epoch": 0.6165571942000247, "grad_norm": 1.8177050352096558, "learning_rate": 4.0369186615354513e-07, "loss": 0.8005, "step": 9950 }, { "epoch": 0.6168670219358037, "grad_norm": 1.5041104555130005, "learning_rate": 4.033657295675429e-07, "loss": 0.8562, "step": 9955 }, { "epoch": 0.6171768496715826, "grad_norm": 1.6652886867523193, "learning_rate": 4.0303959298154066e-07, "loss": 0.8596, "step": 9960 }, { "epoch": 0.6174866774073615, "grad_norm": 1.8555763959884644, "learning_rate": 4.027134563955384e-07, "loss": 0.8552, "step": 9965 }, { "epoch": 0.6177965051431404, "grad_norm": 1.6438238620758057, "learning_rate": 4.0238731980953624e-07, "loss": 0.8452, "step": 9970 }, { "epoch": 0.6181063328789194, "grad_norm": 1.8121784925460815, "learning_rate": 4.02061183223534e-07, "loss": 0.8332, "step": 9975 }, { "epoch": 0.6184161606146982, "grad_norm": 1.8366894721984863, "learning_rate": 4.0173504663753176e-07, "loss": 0.8586, "step": 9980 }, { "epoch": 0.6187259883504771, "grad_norm": 1.7771694660186768, "learning_rate": 4.014089100515295e-07, "loss": 0.8353, "step": 9985 }, { "epoch": 0.619035816086256, "grad_norm": 1.927008867263794, "learning_rate": 4.0108277346552734e-07, "loss": 0.8586, "step": 9990 }, { "epoch": 0.6193456438220349, "grad_norm": 1.6362899541854858, "learning_rate": 4.007566368795251e-07, "loss": 0.8006, "step": 9995 }, { "epoch": 0.6196554715578139, "grad_norm": 1.5658955574035645, "learning_rate": 4.004305002935229e-07, "loss": 0.8698, "step": 10000 }, { "epoch": 0.6199652992935928, "grad_norm": 1.6688423156738281, "learning_rate": 4.0010436370752073e-07, "loss": 0.8444, "step": 10005 }, { "epoch": 0.6202751270293717, "grad_norm": 1.7482234239578247, "learning_rate": 3.997782271215185e-07, "loss": 0.9085, "step": 10010 }, { "epoch": 0.6205849547651506, "grad_norm": 1.6065192222595215, "learning_rate": 3.9945209053551626e-07, "loss": 0.8619, "step": 10015 }, { "epoch": 0.6208947825009294, "grad_norm": 1.7221081256866455, "learning_rate": 3.99125953949514e-07, "loss": 0.8421, "step": 10020 }, { "epoch": 0.6212046102367084, "grad_norm": 1.7404029369354248, "learning_rate": 3.9879981736351183e-07, "loss": 0.7984, "step": 10025 }, { "epoch": 0.6215144379724873, "grad_norm": 1.6413768529891968, "learning_rate": 3.984736807775096e-07, "loss": 0.7962, "step": 10030 }, { "epoch": 0.6218242657082662, "grad_norm": 1.5070635080337524, "learning_rate": 3.9814754419150736e-07, "loss": 0.8202, "step": 10035 }, { "epoch": 0.6221340934440451, "grad_norm": 1.9494893550872803, "learning_rate": 3.978214076055051e-07, "loss": 0.8321, "step": 10040 }, { "epoch": 0.6224439211798241, "grad_norm": 1.7614246606826782, "learning_rate": 3.97495271019503e-07, "loss": 0.8076, "step": 10045 }, { "epoch": 0.622753748915603, "grad_norm": 1.9210447072982788, "learning_rate": 3.9716913443350075e-07, "loss": 0.8431, "step": 10050 }, { "epoch": 0.6230635766513818, "grad_norm": 1.4611659049987793, "learning_rate": 3.968429978474985e-07, "loss": 0.8605, "step": 10055 }, { "epoch": 0.6233734043871607, "grad_norm": 1.6907575130462646, "learning_rate": 3.9651686126149633e-07, "loss": 0.8678, "step": 10060 }, { "epoch": 0.6236832321229396, "grad_norm": 1.5068544149398804, "learning_rate": 3.961907246754941e-07, "loss": 0.7947, "step": 10065 }, { "epoch": 0.6239930598587186, "grad_norm": 2.069798469543457, "learning_rate": 3.9586458808949186e-07, "loss": 0.8941, "step": 10070 }, { "epoch": 0.6243028875944975, "grad_norm": 1.693238377571106, "learning_rate": 3.955384515034896e-07, "loss": 0.8618, "step": 10075 }, { "epoch": 0.6246127153302764, "grad_norm": 1.6176162958145142, "learning_rate": 3.9521231491748743e-07, "loss": 0.8662, "step": 10080 }, { "epoch": 0.6249225430660553, "grad_norm": 1.6491477489471436, "learning_rate": 3.948861783314852e-07, "loss": 0.8785, "step": 10085 }, { "epoch": 0.6252323708018341, "grad_norm": 1.7555286884307861, "learning_rate": 3.9456004174548296e-07, "loss": 0.8143, "step": 10090 }, { "epoch": 0.6255421985376131, "grad_norm": 1.793638825416565, "learning_rate": 3.942339051594808e-07, "loss": 0.8564, "step": 10095 }, { "epoch": 0.625852026273392, "grad_norm": 1.6589750051498413, "learning_rate": 3.939077685734786e-07, "loss": 0.8557, "step": 10100 }, { "epoch": 0.6261618540091709, "grad_norm": 1.5345996618270874, "learning_rate": 3.9358163198747635e-07, "loss": 0.8326, "step": 10105 }, { "epoch": 0.6264716817449498, "grad_norm": 1.951908826828003, "learning_rate": 3.932554954014741e-07, "loss": 0.8731, "step": 10110 }, { "epoch": 0.6267815094807287, "grad_norm": 1.8276280164718628, "learning_rate": 3.929293588154719e-07, "loss": 0.8253, "step": 10115 }, { "epoch": 0.6270913372165077, "grad_norm": 1.4689189195632935, "learning_rate": 3.926032222294697e-07, "loss": 0.8625, "step": 10120 }, { "epoch": 0.6274011649522865, "grad_norm": 1.4532523155212402, "learning_rate": 3.9227708564346745e-07, "loss": 0.8207, "step": 10125 }, { "epoch": 0.6277109926880654, "grad_norm": 1.6205095052719116, "learning_rate": 3.919509490574652e-07, "loss": 0.8143, "step": 10130 }, { "epoch": 0.6280208204238443, "grad_norm": 1.7889467477798462, "learning_rate": 3.9162481247146303e-07, "loss": 0.8141, "step": 10135 }, { "epoch": 0.6283306481596233, "grad_norm": 1.4110667705535889, "learning_rate": 3.9129867588546085e-07, "loss": 0.8012, "step": 10140 }, { "epoch": 0.6286404758954022, "grad_norm": 1.595468282699585, "learning_rate": 3.909725392994586e-07, "loss": 0.8522, "step": 10145 }, { "epoch": 0.6289503036311811, "grad_norm": 1.7381298542022705, "learning_rate": 3.9064640271345637e-07, "loss": 0.8225, "step": 10150 }, { "epoch": 0.62926013136696, "grad_norm": 1.5995941162109375, "learning_rate": 3.903202661274542e-07, "loss": 0.8799, "step": 10155 }, { "epoch": 0.6295699591027388, "grad_norm": 1.6954461336135864, "learning_rate": 3.8999412954145195e-07, "loss": 0.8862, "step": 10160 }, { "epoch": 0.6298797868385178, "grad_norm": 1.5437939167022705, "learning_rate": 3.896679929554497e-07, "loss": 0.8434, "step": 10165 }, { "epoch": 0.6301896145742967, "grad_norm": 1.6525373458862305, "learning_rate": 3.893418563694475e-07, "loss": 0.8457, "step": 10170 }, { "epoch": 0.6304994423100756, "grad_norm": 1.5268703699111938, "learning_rate": 3.890157197834453e-07, "loss": 0.8253, "step": 10175 }, { "epoch": 0.6308092700458545, "grad_norm": 1.5693098306655884, "learning_rate": 3.8868958319744305e-07, "loss": 0.8139, "step": 10180 }, { "epoch": 0.6311190977816334, "grad_norm": 1.4832584857940674, "learning_rate": 3.8836344661144087e-07, "loss": 0.794, "step": 10185 }, { "epoch": 0.6314289255174124, "grad_norm": 1.7867785692214966, "learning_rate": 3.8803731002543863e-07, "loss": 0.8467, "step": 10190 }, { "epoch": 0.6317387532531912, "grad_norm": 1.8106743097305298, "learning_rate": 3.8771117343943645e-07, "loss": 0.8547, "step": 10195 }, { "epoch": 0.6320485809889701, "grad_norm": 1.6330715417861938, "learning_rate": 3.873850368534342e-07, "loss": 0.862, "step": 10200 }, { "epoch": 0.632358408724749, "grad_norm": 1.5631303787231445, "learning_rate": 3.8705890026743197e-07, "loss": 0.8775, "step": 10205 }, { "epoch": 0.6326682364605279, "grad_norm": 1.601895809173584, "learning_rate": 3.867327636814298e-07, "loss": 0.8567, "step": 10210 }, { "epoch": 0.6329780641963069, "grad_norm": 1.6321600675582886, "learning_rate": 3.8640662709542755e-07, "loss": 0.8621, "step": 10215 }, { "epoch": 0.6332878919320858, "grad_norm": 1.6592826843261719, "learning_rate": 3.860804905094253e-07, "loss": 0.8415, "step": 10220 }, { "epoch": 0.6335977196678647, "grad_norm": 1.9565722942352295, "learning_rate": 3.857543539234231e-07, "loss": 0.9011, "step": 10225 }, { "epoch": 0.6339075474036435, "grad_norm": 1.4941941499710083, "learning_rate": 3.854282173374209e-07, "loss": 0.8304, "step": 10230 }, { "epoch": 0.6342173751394224, "grad_norm": 1.789127230644226, "learning_rate": 3.851020807514187e-07, "loss": 0.8486, "step": 10235 }, { "epoch": 0.6345272028752014, "grad_norm": 1.6333516836166382, "learning_rate": 3.8477594416541647e-07, "loss": 0.8756, "step": 10240 }, { "epoch": 0.6348370306109803, "grad_norm": 1.9447983503341675, "learning_rate": 3.8444980757941423e-07, "loss": 0.8678, "step": 10245 }, { "epoch": 0.6351468583467592, "grad_norm": 1.5435510873794556, "learning_rate": 3.8412367099341205e-07, "loss": 0.8415, "step": 10250 }, { "epoch": 0.6354566860825381, "grad_norm": 1.5875576734542847, "learning_rate": 3.837975344074098e-07, "loss": 0.8151, "step": 10255 }, { "epoch": 0.6357665138183171, "grad_norm": 1.7253798246383667, "learning_rate": 3.8347139782140757e-07, "loss": 0.8815, "step": 10260 }, { "epoch": 0.6360763415540959, "grad_norm": 1.62357759475708, "learning_rate": 3.8314526123540533e-07, "loss": 0.8845, "step": 10265 }, { "epoch": 0.6363861692898748, "grad_norm": 1.4922890663146973, "learning_rate": 3.8281912464940315e-07, "loss": 0.8797, "step": 10270 }, { "epoch": 0.6366959970256537, "grad_norm": 1.6450713872909546, "learning_rate": 3.824929880634009e-07, "loss": 0.8185, "step": 10275 }, { "epoch": 0.6370058247614326, "grad_norm": 1.6129422187805176, "learning_rate": 3.8216685147739873e-07, "loss": 0.9332, "step": 10280 }, { "epoch": 0.6373156524972116, "grad_norm": 1.4653406143188477, "learning_rate": 3.8184071489139654e-07, "loss": 0.8155, "step": 10285 }, { "epoch": 0.6376254802329905, "grad_norm": 1.5885525941848755, "learning_rate": 3.815145783053943e-07, "loss": 0.8065, "step": 10290 }, { "epoch": 0.6379353079687694, "grad_norm": 1.4596531391143799, "learning_rate": 3.8118844171939207e-07, "loss": 0.8357, "step": 10295 }, { "epoch": 0.6382451357045482, "grad_norm": 1.8021025657653809, "learning_rate": 3.8086230513338983e-07, "loss": 0.8675, "step": 10300 }, { "epoch": 0.6385549634403271, "grad_norm": 1.5583521127700806, "learning_rate": 3.8053616854738764e-07, "loss": 0.8519, "step": 10305 }, { "epoch": 0.6388647911761061, "grad_norm": 1.4456623792648315, "learning_rate": 3.802100319613854e-07, "loss": 0.8575, "step": 10310 }, { "epoch": 0.639174618911885, "grad_norm": 1.5499076843261719, "learning_rate": 3.7988389537538317e-07, "loss": 0.8164, "step": 10315 }, { "epoch": 0.6394844466476639, "grad_norm": 1.8558857440948486, "learning_rate": 3.7955775878938093e-07, "loss": 0.861, "step": 10320 }, { "epoch": 0.6397942743834428, "grad_norm": 2.0739240646362305, "learning_rate": 3.7923162220337875e-07, "loss": 0.823, "step": 10325 }, { "epoch": 0.6401041021192218, "grad_norm": 2.074861764907837, "learning_rate": 3.7890548561737656e-07, "loss": 0.8378, "step": 10330 }, { "epoch": 0.6404139298550007, "grad_norm": 1.5475043058395386, "learning_rate": 3.785793490313743e-07, "loss": 0.816, "step": 10335 }, { "epoch": 0.6407237575907795, "grad_norm": 1.4845311641693115, "learning_rate": 3.7825321244537214e-07, "loss": 0.8257, "step": 10340 }, { "epoch": 0.6410335853265584, "grad_norm": 1.4994601011276245, "learning_rate": 3.779270758593699e-07, "loss": 0.8286, "step": 10345 }, { "epoch": 0.6413434130623373, "grad_norm": 1.6082435846328735, "learning_rate": 3.7760093927336767e-07, "loss": 0.8104, "step": 10350 }, { "epoch": 0.6416532407981163, "grad_norm": 2.0564568042755127, "learning_rate": 3.7727480268736543e-07, "loss": 0.8057, "step": 10355 }, { "epoch": 0.6419630685338952, "grad_norm": 1.6644333600997925, "learning_rate": 3.7694866610136324e-07, "loss": 0.826, "step": 10360 }, { "epoch": 0.6422728962696741, "grad_norm": 1.529133677482605, "learning_rate": 3.76622529515361e-07, "loss": 0.845, "step": 10365 }, { "epoch": 0.642582724005453, "grad_norm": 1.4886561632156372, "learning_rate": 3.7629639292935877e-07, "loss": 0.8443, "step": 10370 }, { "epoch": 0.6428925517412318, "grad_norm": 1.6415222883224487, "learning_rate": 3.759702563433566e-07, "loss": 0.8262, "step": 10375 }, { "epoch": 0.6432023794770108, "grad_norm": 1.4899489879608154, "learning_rate": 3.756441197573544e-07, "loss": 0.8646, "step": 10380 }, { "epoch": 0.6435122072127897, "grad_norm": 1.5891711711883545, "learning_rate": 3.7531798317135216e-07, "loss": 0.8281, "step": 10385 }, { "epoch": 0.6438220349485686, "grad_norm": 1.5833319425582886, "learning_rate": 3.749918465853499e-07, "loss": 0.8447, "step": 10390 }, { "epoch": 0.6441318626843475, "grad_norm": 1.6029983758926392, "learning_rate": 3.746657099993477e-07, "loss": 0.8389, "step": 10395 }, { "epoch": 0.6444416904201264, "grad_norm": 1.5597928762435913, "learning_rate": 3.743395734133455e-07, "loss": 0.8135, "step": 10400 }, { "epoch": 0.6447515181559054, "grad_norm": 1.5527191162109375, "learning_rate": 3.7401343682734326e-07, "loss": 0.841, "step": 10405 }, { "epoch": 0.6450613458916842, "grad_norm": 1.7770411968231201, "learning_rate": 3.7368730024134103e-07, "loss": 0.8543, "step": 10410 }, { "epoch": 0.6453711736274631, "grad_norm": 2.032761573791504, "learning_rate": 3.7336116365533884e-07, "loss": 0.9064, "step": 10415 }, { "epoch": 0.645681001363242, "grad_norm": 1.662092685699463, "learning_rate": 3.730350270693366e-07, "loss": 0.859, "step": 10420 }, { "epoch": 0.645990829099021, "grad_norm": 1.6528635025024414, "learning_rate": 3.727088904833344e-07, "loss": 0.8533, "step": 10425 }, { "epoch": 0.6463006568347999, "grad_norm": 1.6360344886779785, "learning_rate": 3.723827538973322e-07, "loss": 0.8472, "step": 10430 }, { "epoch": 0.6466104845705788, "grad_norm": 1.3987797498703003, "learning_rate": 3.7205661731133e-07, "loss": 0.8817, "step": 10435 }, { "epoch": 0.6469203123063577, "grad_norm": 2.0770163536071777, "learning_rate": 3.7173048072532776e-07, "loss": 0.8044, "step": 10440 }, { "epoch": 0.6472301400421365, "grad_norm": 1.8387295007705688, "learning_rate": 3.714043441393255e-07, "loss": 0.9178, "step": 10445 }, { "epoch": 0.6475399677779154, "grad_norm": 1.5249849557876587, "learning_rate": 3.710782075533233e-07, "loss": 0.8684, "step": 10450 }, { "epoch": 0.6478497955136944, "grad_norm": 1.7137088775634766, "learning_rate": 3.707520709673211e-07, "loss": 0.8356, "step": 10455 }, { "epoch": 0.6481596232494733, "grad_norm": 1.5876085758209229, "learning_rate": 3.7042593438131886e-07, "loss": 0.8809, "step": 10460 }, { "epoch": 0.6484694509852522, "grad_norm": 1.4780077934265137, "learning_rate": 3.700997977953166e-07, "loss": 0.8684, "step": 10465 }, { "epoch": 0.6487792787210311, "grad_norm": 1.8261549472808838, "learning_rate": 3.697736612093145e-07, "loss": 0.8302, "step": 10470 }, { "epoch": 0.6490891064568101, "grad_norm": 1.8943084478378296, "learning_rate": 3.6944752462331226e-07, "loss": 0.7998, "step": 10475 }, { "epoch": 0.6493989341925889, "grad_norm": 1.5301048755645752, "learning_rate": 3.6912138803731e-07, "loss": 0.7613, "step": 10480 }, { "epoch": 0.6497087619283678, "grad_norm": 2.0125930309295654, "learning_rate": 3.687952514513078e-07, "loss": 0.8435, "step": 10485 }, { "epoch": 0.6500185896641467, "grad_norm": 1.576545238494873, "learning_rate": 3.684691148653056e-07, "loss": 0.8891, "step": 10490 }, { "epoch": 0.6503284173999256, "grad_norm": 1.823499083518982, "learning_rate": 3.6814297827930336e-07, "loss": 0.8432, "step": 10495 }, { "epoch": 0.6506382451357046, "grad_norm": 1.5840704441070557, "learning_rate": 3.678168416933011e-07, "loss": 0.8398, "step": 10500 }, { "epoch": 0.6509480728714835, "grad_norm": 1.625441074371338, "learning_rate": 3.674907051072989e-07, "loss": 0.8461, "step": 10505 }, { "epoch": 0.6512579006072624, "grad_norm": 1.6245037317276, "learning_rate": 3.671645685212967e-07, "loss": 0.8341, "step": 10510 }, { "epoch": 0.6515677283430412, "grad_norm": 1.6583693027496338, "learning_rate": 3.668384319352945e-07, "loss": 0.8411, "step": 10515 }, { "epoch": 0.6518775560788201, "grad_norm": 1.5091147422790527, "learning_rate": 3.665122953492923e-07, "loss": 0.8659, "step": 10520 }, { "epoch": 0.6521873838145991, "grad_norm": 1.6429558992385864, "learning_rate": 3.6618615876329004e-07, "loss": 0.8134, "step": 10525 }, { "epoch": 0.652497211550378, "grad_norm": 1.695683240890503, "learning_rate": 3.6586002217728786e-07, "loss": 0.8757, "step": 10530 }, { "epoch": 0.6528070392861569, "grad_norm": 1.5511424541473389, "learning_rate": 3.655338855912856e-07, "loss": 0.8386, "step": 10535 }, { "epoch": 0.6531168670219358, "grad_norm": 1.6926244497299194, "learning_rate": 3.652077490052834e-07, "loss": 0.8548, "step": 10540 }, { "epoch": 0.6534266947577148, "grad_norm": 1.535689115524292, "learning_rate": 3.648816124192812e-07, "loss": 0.7983, "step": 10545 }, { "epoch": 0.6537365224934936, "grad_norm": 1.6374300718307495, "learning_rate": 3.6455547583327896e-07, "loss": 0.8892, "step": 10550 }, { "epoch": 0.6540463502292725, "grad_norm": 1.7223237752914429, "learning_rate": 3.642293392472767e-07, "loss": 0.8377, "step": 10555 }, { "epoch": 0.6543561779650514, "grad_norm": 1.5417147874832153, "learning_rate": 3.639032026612745e-07, "loss": 0.8236, "step": 10560 }, { "epoch": 0.6546660057008303, "grad_norm": 1.8129549026489258, "learning_rate": 3.6357706607527235e-07, "loss": 0.8463, "step": 10565 }, { "epoch": 0.6549758334366093, "grad_norm": 1.3886581659317017, "learning_rate": 3.632509294892701e-07, "loss": 0.7662, "step": 10570 }, { "epoch": 0.6552856611723882, "grad_norm": 1.6069430112838745, "learning_rate": 3.629247929032679e-07, "loss": 0.8244, "step": 10575 }, { "epoch": 0.6555954889081671, "grad_norm": 1.9113484621047974, "learning_rate": 3.6259865631726564e-07, "loss": 0.8626, "step": 10580 }, { "epoch": 0.6559053166439459, "grad_norm": 1.5200227499008179, "learning_rate": 3.6227251973126345e-07, "loss": 0.8439, "step": 10585 }, { "epoch": 0.6562151443797248, "grad_norm": 1.6501151323318481, "learning_rate": 3.619463831452612e-07, "loss": 0.8562, "step": 10590 }, { "epoch": 0.6565249721155038, "grad_norm": 1.438445806503296, "learning_rate": 3.61620246559259e-07, "loss": 0.8706, "step": 10595 }, { "epoch": 0.6568347998512827, "grad_norm": 1.5996867418289185, "learning_rate": 3.6129410997325674e-07, "loss": 0.8024, "step": 10600 }, { "epoch": 0.6571446275870616, "grad_norm": 1.7232824563980103, "learning_rate": 3.6096797338725456e-07, "loss": 0.8227, "step": 10605 }, { "epoch": 0.6574544553228405, "grad_norm": 1.711493730545044, "learning_rate": 3.6064183680125237e-07, "loss": 0.8774, "step": 10610 }, { "epoch": 0.6577642830586194, "grad_norm": 1.680685043334961, "learning_rate": 3.6031570021525014e-07, "loss": 0.8537, "step": 10615 }, { "epoch": 0.6580741107943983, "grad_norm": 1.6061196327209473, "learning_rate": 3.5998956362924795e-07, "loss": 0.8658, "step": 10620 }, { "epoch": 0.6583839385301772, "grad_norm": 1.742738127708435, "learning_rate": 3.596634270432457e-07, "loss": 0.8192, "step": 10625 }, { "epoch": 0.6586937662659561, "grad_norm": 1.7490147352218628, "learning_rate": 3.593372904572435e-07, "loss": 0.838, "step": 10630 }, { "epoch": 0.659003594001735, "grad_norm": 1.8121936321258545, "learning_rate": 3.5901115387124124e-07, "loss": 0.89, "step": 10635 }, { "epoch": 0.659313421737514, "grad_norm": 1.6084790229797363, "learning_rate": 3.5868501728523905e-07, "loss": 0.855, "step": 10640 }, { "epoch": 0.6596232494732929, "grad_norm": 1.8844499588012695, "learning_rate": 3.583588806992368e-07, "loss": 0.8146, "step": 10645 }, { "epoch": 0.6599330772090718, "grad_norm": 1.871151328086853, "learning_rate": 3.580327441132346e-07, "loss": 0.8909, "step": 10650 }, { "epoch": 0.6602429049448507, "grad_norm": 1.5856444835662842, "learning_rate": 3.5770660752723234e-07, "loss": 0.8691, "step": 10655 }, { "epoch": 0.6605527326806295, "grad_norm": 1.6971499919891357, "learning_rate": 3.573804709412302e-07, "loss": 0.81, "step": 10660 }, { "epoch": 0.6608625604164085, "grad_norm": 1.5853235721588135, "learning_rate": 3.5705433435522797e-07, "loss": 0.8538, "step": 10665 }, { "epoch": 0.6611723881521874, "grad_norm": 1.7929999828338623, "learning_rate": 3.5672819776922573e-07, "loss": 0.8589, "step": 10670 }, { "epoch": 0.6614822158879663, "grad_norm": 2.0766642093658447, "learning_rate": 3.564020611832235e-07, "loss": 0.8831, "step": 10675 }, { "epoch": 0.6617920436237452, "grad_norm": 1.5310769081115723, "learning_rate": 3.560759245972213e-07, "loss": 0.8414, "step": 10680 }, { "epoch": 0.6621018713595241, "grad_norm": 1.4298709630966187, "learning_rate": 3.557497880112191e-07, "loss": 0.829, "step": 10685 }, { "epoch": 0.6624116990953031, "grad_norm": 1.7272255420684814, "learning_rate": 3.5542365142521684e-07, "loss": 0.8661, "step": 10690 }, { "epoch": 0.6627215268310819, "grad_norm": 1.8052810430526733, "learning_rate": 3.5509751483921465e-07, "loss": 0.8444, "step": 10695 }, { "epoch": 0.6630313545668608, "grad_norm": 1.8847187757492065, "learning_rate": 3.547713782532124e-07, "loss": 0.8764, "step": 10700 }, { "epoch": 0.6633411823026397, "grad_norm": 1.6733547449111938, "learning_rate": 3.5444524166721023e-07, "loss": 0.8053, "step": 10705 }, { "epoch": 0.6636510100384186, "grad_norm": 1.8768560886383057, "learning_rate": 3.54119105081208e-07, "loss": 0.8691, "step": 10710 }, { "epoch": 0.6639608377741976, "grad_norm": 1.9808402061462402, "learning_rate": 3.537929684952058e-07, "loss": 0.8821, "step": 10715 }, { "epoch": 0.6642706655099765, "grad_norm": 1.9350428581237793, "learning_rate": 3.5346683190920357e-07, "loss": 0.8755, "step": 10720 }, { "epoch": 0.6645804932457554, "grad_norm": 1.832322359085083, "learning_rate": 3.5314069532320133e-07, "loss": 0.8215, "step": 10725 }, { "epoch": 0.6648903209815342, "grad_norm": 1.9569040536880493, "learning_rate": 3.528145587371991e-07, "loss": 0.8318, "step": 10730 }, { "epoch": 0.6652001487173131, "grad_norm": 1.6517752408981323, "learning_rate": 3.524884221511969e-07, "loss": 0.8759, "step": 10735 }, { "epoch": 0.6655099764530921, "grad_norm": 1.6621333360671997, "learning_rate": 3.521622855651947e-07, "loss": 0.8856, "step": 10740 }, { "epoch": 0.665819804188871, "grad_norm": 1.7224620580673218, "learning_rate": 3.5183614897919244e-07, "loss": 0.827, "step": 10745 }, { "epoch": 0.6661296319246499, "grad_norm": 1.5537054538726807, "learning_rate": 3.515100123931902e-07, "loss": 0.8103, "step": 10750 }, { "epoch": 0.6664394596604288, "grad_norm": 1.5793256759643555, "learning_rate": 3.5118387580718807e-07, "loss": 0.8801, "step": 10755 }, { "epoch": 0.6667492873962078, "grad_norm": 1.9305471181869507, "learning_rate": 3.5085773922118583e-07, "loss": 0.8912, "step": 10760 }, { "epoch": 0.6670591151319866, "grad_norm": 1.6269816160202026, "learning_rate": 3.505316026351836e-07, "loss": 0.8253, "step": 10765 }, { "epoch": 0.6673689428677655, "grad_norm": 1.8958319425582886, "learning_rate": 3.502054660491814e-07, "loss": 0.9045, "step": 10770 }, { "epoch": 0.6676787706035444, "grad_norm": 1.6304175853729248, "learning_rate": 3.4987932946317917e-07, "loss": 0.8418, "step": 10775 }, { "epoch": 0.6679885983393233, "grad_norm": 1.8949248790740967, "learning_rate": 3.4955319287717693e-07, "loss": 0.8276, "step": 10780 }, { "epoch": 0.6682984260751023, "grad_norm": 1.7977687120437622, "learning_rate": 3.492270562911747e-07, "loss": 0.8329, "step": 10785 }, { "epoch": 0.6686082538108812, "grad_norm": 1.5848580598831177, "learning_rate": 3.489009197051725e-07, "loss": 0.8049, "step": 10790 }, { "epoch": 0.6689180815466601, "grad_norm": 1.5372642278671265, "learning_rate": 3.4857478311917027e-07, "loss": 0.8529, "step": 10795 }, { "epoch": 0.6692279092824389, "grad_norm": 1.6369737386703491, "learning_rate": 3.482486465331681e-07, "loss": 0.8521, "step": 10800 }, { "epoch": 0.6695377370182178, "grad_norm": 1.759719729423523, "learning_rate": 3.4792250994716585e-07, "loss": 0.8626, "step": 10805 }, { "epoch": 0.6698475647539968, "grad_norm": 1.544085144996643, "learning_rate": 3.4759637336116367e-07, "loss": 0.8359, "step": 10810 }, { "epoch": 0.6701573924897757, "grad_norm": 1.698146104812622, "learning_rate": 3.4727023677516143e-07, "loss": 0.8644, "step": 10815 }, { "epoch": 0.6704672202255546, "grad_norm": 1.632580280303955, "learning_rate": 3.469441001891592e-07, "loss": 0.8452, "step": 10820 }, { "epoch": 0.6707770479613335, "grad_norm": 1.5806770324707031, "learning_rate": 3.46617963603157e-07, "loss": 0.8259, "step": 10825 }, { "epoch": 0.6710868756971125, "grad_norm": 1.4997377395629883, "learning_rate": 3.4629182701715477e-07, "loss": 0.8003, "step": 10830 }, { "epoch": 0.6713967034328913, "grad_norm": 1.5034430027008057, "learning_rate": 3.4596569043115253e-07, "loss": 0.822, "step": 10835 }, { "epoch": 0.6717065311686702, "grad_norm": 1.816947340965271, "learning_rate": 3.456395538451503e-07, "loss": 0.862, "step": 10840 }, { "epoch": 0.6720163589044491, "grad_norm": 1.5957469940185547, "learning_rate": 3.4531341725914816e-07, "loss": 0.8477, "step": 10845 }, { "epoch": 0.672326186640228, "grad_norm": 1.4842052459716797, "learning_rate": 3.449872806731459e-07, "loss": 0.8378, "step": 10850 }, { "epoch": 0.672636014376007, "grad_norm": 1.5455210208892822, "learning_rate": 3.446611440871437e-07, "loss": 0.8432, "step": 10855 }, { "epoch": 0.6729458421117859, "grad_norm": 1.5371919870376587, "learning_rate": 3.4433500750114145e-07, "loss": 0.8558, "step": 10860 }, { "epoch": 0.6732556698475648, "grad_norm": 1.8196120262145996, "learning_rate": 3.4400887091513927e-07, "loss": 0.8156, "step": 10865 }, { "epoch": 0.6735654975833436, "grad_norm": 1.6195570230484009, "learning_rate": 3.4368273432913703e-07, "loss": 0.8449, "step": 10870 }, { "epoch": 0.6738753253191225, "grad_norm": 1.5893759727478027, "learning_rate": 3.433565977431348e-07, "loss": 0.8779, "step": 10875 }, { "epoch": 0.6741851530549015, "grad_norm": 1.5374891757965088, "learning_rate": 3.4303046115713255e-07, "loss": 0.8009, "step": 10880 }, { "epoch": 0.6744949807906804, "grad_norm": 1.7074800729751587, "learning_rate": 3.4270432457113037e-07, "loss": 0.8049, "step": 10885 }, { "epoch": 0.6748048085264593, "grad_norm": 1.690293312072754, "learning_rate": 3.4237818798512813e-07, "loss": 0.88, "step": 10890 }, { "epoch": 0.6751146362622382, "grad_norm": 2.0252912044525146, "learning_rate": 3.4205205139912595e-07, "loss": 0.8953, "step": 10895 }, { "epoch": 0.6754244639980171, "grad_norm": 1.5319640636444092, "learning_rate": 3.4172591481312376e-07, "loss": 0.8152, "step": 10900 }, { "epoch": 0.675734291733796, "grad_norm": 1.6372593641281128, "learning_rate": 3.413997782271215e-07, "loss": 0.7991, "step": 10905 }, { "epoch": 0.6760441194695749, "grad_norm": 1.9436973333358765, "learning_rate": 3.410736416411193e-07, "loss": 0.8856, "step": 10910 }, { "epoch": 0.6763539472053538, "grad_norm": 1.7793247699737549, "learning_rate": 3.4074750505511705e-07, "loss": 0.8585, "step": 10915 }, { "epoch": 0.6766637749411327, "grad_norm": 1.7381387948989868, "learning_rate": 3.4042136846911486e-07, "loss": 0.8566, "step": 10920 }, { "epoch": 0.6769736026769116, "grad_norm": 1.6114814281463623, "learning_rate": 3.4009523188311263e-07, "loss": 0.835, "step": 10925 }, { "epoch": 0.6772834304126906, "grad_norm": 1.4310630559921265, "learning_rate": 3.397690952971104e-07, "loss": 0.8543, "step": 10930 }, { "epoch": 0.6775932581484695, "grad_norm": 2.001769781112671, "learning_rate": 3.3944295871110815e-07, "loss": 0.8294, "step": 10935 }, { "epoch": 0.6779030858842484, "grad_norm": 1.6093379259109497, "learning_rate": 3.39116822125106e-07, "loss": 0.8308, "step": 10940 }, { "epoch": 0.6782129136200272, "grad_norm": 1.4717586040496826, "learning_rate": 3.387906855391038e-07, "loss": 0.8521, "step": 10945 }, { "epoch": 0.6785227413558061, "grad_norm": 1.8026325702667236, "learning_rate": 3.3846454895310154e-07, "loss": 0.8429, "step": 10950 }, { "epoch": 0.6788325690915851, "grad_norm": 1.587286114692688, "learning_rate": 3.3813841236709936e-07, "loss": 0.7888, "step": 10955 }, { "epoch": 0.679142396827364, "grad_norm": 1.7545878887176514, "learning_rate": 3.378122757810971e-07, "loss": 0.8824, "step": 10960 }, { "epoch": 0.6794522245631429, "grad_norm": 1.9424906969070435, "learning_rate": 3.374861391950949e-07, "loss": 0.8698, "step": 10965 }, { "epoch": 0.6797620522989218, "grad_norm": 1.6135932207107544, "learning_rate": 3.3716000260909265e-07, "loss": 0.8045, "step": 10970 }, { "epoch": 0.6800718800347008, "grad_norm": 1.6690455675125122, "learning_rate": 3.3683386602309046e-07, "loss": 0.8468, "step": 10975 }, { "epoch": 0.6803817077704796, "grad_norm": 1.788762092590332, "learning_rate": 3.365077294370882e-07, "loss": 0.8392, "step": 10980 }, { "epoch": 0.6806915355062585, "grad_norm": 1.642216444015503, "learning_rate": 3.36181592851086e-07, "loss": 0.7726, "step": 10985 }, { "epoch": 0.6810013632420374, "grad_norm": 1.6124098300933838, "learning_rate": 3.358554562650838e-07, "loss": 0.8691, "step": 10990 }, { "epoch": 0.6813111909778163, "grad_norm": 1.4993064403533936, "learning_rate": 3.355293196790816e-07, "loss": 0.8631, "step": 10995 }, { "epoch": 0.6816210187135953, "grad_norm": 1.7881628274917603, "learning_rate": 3.352031830930794e-07, "loss": 0.8307, "step": 11000 }, { "epoch": 0.6819308464493742, "grad_norm": 1.6095942258834839, "learning_rate": 3.3487704650707714e-07, "loss": 0.8711, "step": 11005 }, { "epoch": 0.6822406741851531, "grad_norm": 1.612648606300354, "learning_rate": 3.345509099210749e-07, "loss": 0.8405, "step": 11010 }, { "epoch": 0.6825505019209319, "grad_norm": 1.5091685056686401, "learning_rate": 3.342247733350727e-07, "loss": 0.8569, "step": 11015 }, { "epoch": 0.6828603296567108, "grad_norm": 1.5198073387145996, "learning_rate": 3.338986367490705e-07, "loss": 0.8871, "step": 11020 }, { "epoch": 0.6831701573924898, "grad_norm": 1.5786826610565186, "learning_rate": 3.3357250016306825e-07, "loss": 0.8291, "step": 11025 }, { "epoch": 0.6834799851282687, "grad_norm": 2.665649890899658, "learning_rate": 3.3324636357706606e-07, "loss": 0.8248, "step": 11030 }, { "epoch": 0.6837898128640476, "grad_norm": 1.6829251050949097, "learning_rate": 3.329202269910639e-07, "loss": 0.8107, "step": 11035 }, { "epoch": 0.6840996405998265, "grad_norm": 1.5946394205093384, "learning_rate": 3.3259409040506164e-07, "loss": 0.7983, "step": 11040 }, { "epoch": 0.6844094683356055, "grad_norm": 1.7473163604736328, "learning_rate": 3.322679538190594e-07, "loss": 0.8527, "step": 11045 }, { "epoch": 0.6847192960713843, "grad_norm": 1.835394263267517, "learning_rate": 3.319418172330572e-07, "loss": 0.8676, "step": 11050 }, { "epoch": 0.6850291238071632, "grad_norm": 1.736215353012085, "learning_rate": 3.31615680647055e-07, "loss": 0.8453, "step": 11055 }, { "epoch": 0.6853389515429421, "grad_norm": 1.8738141059875488, "learning_rate": 3.3128954406105274e-07, "loss": 0.8235, "step": 11060 }, { "epoch": 0.685648779278721, "grad_norm": 1.632565975189209, "learning_rate": 3.309634074750505e-07, "loss": 0.8457, "step": 11065 }, { "epoch": 0.6859586070145, "grad_norm": 1.7772281169891357, "learning_rate": 3.306372708890483e-07, "loss": 0.8657, "step": 11070 }, { "epoch": 0.6862684347502789, "grad_norm": 3.723233461380005, "learning_rate": 3.303111343030461e-07, "loss": 0.8646, "step": 11075 }, { "epoch": 0.6865782624860578, "grad_norm": 1.6892296075820923, "learning_rate": 3.2998499771704385e-07, "loss": 0.8265, "step": 11080 }, { "epoch": 0.6868880902218366, "grad_norm": 2.001718759536743, "learning_rate": 3.296588611310417e-07, "loss": 0.8483, "step": 11085 }, { "epoch": 0.6871979179576155, "grad_norm": 1.503063440322876, "learning_rate": 3.293327245450395e-07, "loss": 0.9252, "step": 11090 }, { "epoch": 0.6875077456933945, "grad_norm": 1.8018823862075806, "learning_rate": 3.2900658795903724e-07, "loss": 0.8535, "step": 11095 }, { "epoch": 0.6878175734291734, "grad_norm": 1.6362600326538086, "learning_rate": 3.28680451373035e-07, "loss": 0.8313, "step": 11100 }, { "epoch": 0.6881274011649523, "grad_norm": 1.8773714303970337, "learning_rate": 3.283543147870328e-07, "loss": 0.8458, "step": 11105 }, { "epoch": 0.6884372289007312, "grad_norm": 1.689214825630188, "learning_rate": 3.280281782010306e-07, "loss": 0.8816, "step": 11110 }, { "epoch": 0.6887470566365101, "grad_norm": 2.0497019290924072, "learning_rate": 3.2770204161502834e-07, "loss": 0.8764, "step": 11115 }, { "epoch": 0.689056884372289, "grad_norm": 1.6838278770446777, "learning_rate": 3.273759050290261e-07, "loss": 0.8455, "step": 11120 }, { "epoch": 0.6893667121080679, "grad_norm": 1.5490871667861938, "learning_rate": 3.270497684430239e-07, "loss": 0.8522, "step": 11125 }, { "epoch": 0.6896765398438468, "grad_norm": 1.6812160015106201, "learning_rate": 3.2672363185702173e-07, "loss": 0.833, "step": 11130 }, { "epoch": 0.6899863675796257, "grad_norm": 1.791874885559082, "learning_rate": 3.263974952710195e-07, "loss": 0.8051, "step": 11135 }, { "epoch": 0.6902961953154046, "grad_norm": 1.7819173336029053, "learning_rate": 3.2607135868501726e-07, "loss": 0.8861, "step": 11140 }, { "epoch": 0.6906060230511836, "grad_norm": 1.595604658126831, "learning_rate": 3.257452220990151e-07, "loss": 0.8632, "step": 11145 }, { "epoch": 0.6909158507869625, "grad_norm": 1.5826585292816162, "learning_rate": 3.2541908551301284e-07, "loss": 0.8133, "step": 11150 }, { "epoch": 0.6912256785227413, "grad_norm": 1.9366466999053955, "learning_rate": 3.250929489270106e-07, "loss": 0.8612, "step": 11155 }, { "epoch": 0.6915355062585202, "grad_norm": 1.718587875366211, "learning_rate": 3.247668123410084e-07, "loss": 0.8859, "step": 11160 }, { "epoch": 0.6918453339942991, "grad_norm": 1.627212643623352, "learning_rate": 3.244406757550062e-07, "loss": 0.8965, "step": 11165 }, { "epoch": 0.6921551617300781, "grad_norm": 1.7927912473678589, "learning_rate": 3.2411453916900394e-07, "loss": 0.8572, "step": 11170 }, { "epoch": 0.692464989465857, "grad_norm": 1.7548185586929321, "learning_rate": 3.2378840258300176e-07, "loss": 0.8653, "step": 11175 }, { "epoch": 0.6927748172016359, "grad_norm": 1.700750470161438, "learning_rate": 3.2346226599699957e-07, "loss": 0.8639, "step": 11180 }, { "epoch": 0.6930846449374148, "grad_norm": 1.8091648817062378, "learning_rate": 3.2313612941099733e-07, "loss": 0.8704, "step": 11185 }, { "epoch": 0.6933944726731937, "grad_norm": 1.6897348165512085, "learning_rate": 3.228099928249951e-07, "loss": 0.8234, "step": 11190 }, { "epoch": 0.6937043004089726, "grad_norm": 1.7158334255218506, "learning_rate": 3.2248385623899286e-07, "loss": 0.8471, "step": 11195 }, { "epoch": 0.6940141281447515, "grad_norm": 1.7024294137954712, "learning_rate": 3.221577196529907e-07, "loss": 0.8712, "step": 11200 }, { "epoch": 0.6943239558805304, "grad_norm": 1.6045223474502563, "learning_rate": 3.2183158306698844e-07, "loss": 0.8565, "step": 11205 }, { "epoch": 0.6946337836163093, "grad_norm": 1.6639328002929688, "learning_rate": 3.215054464809862e-07, "loss": 0.8786, "step": 11210 }, { "epoch": 0.6949436113520883, "grad_norm": 1.4922047853469849, "learning_rate": 3.2117930989498396e-07, "loss": 0.8444, "step": 11215 }, { "epoch": 0.6952534390878672, "grad_norm": 1.839837670326233, "learning_rate": 3.208531733089818e-07, "loss": 0.8018, "step": 11220 }, { "epoch": 0.695563266823646, "grad_norm": 1.782132863998413, "learning_rate": 3.205270367229796e-07, "loss": 0.8709, "step": 11225 }, { "epoch": 0.6958730945594249, "grad_norm": 1.820255994796753, "learning_rate": 3.2020090013697735e-07, "loss": 0.809, "step": 11230 }, { "epoch": 0.6961829222952038, "grad_norm": 1.5093460083007812, "learning_rate": 3.1987476355097517e-07, "loss": 0.8147, "step": 11235 }, { "epoch": 0.6964927500309828, "grad_norm": 1.7923598289489746, "learning_rate": 3.1954862696497293e-07, "loss": 0.8653, "step": 11240 }, { "epoch": 0.6968025777667617, "grad_norm": 2.1568212509155273, "learning_rate": 3.192224903789707e-07, "loss": 0.8554, "step": 11245 }, { "epoch": 0.6971124055025406, "grad_norm": 1.6191774606704712, "learning_rate": 3.1889635379296846e-07, "loss": 0.8079, "step": 11250 }, { "epoch": 0.6974222332383195, "grad_norm": 1.6805437803268433, "learning_rate": 3.1857021720696627e-07, "loss": 0.8734, "step": 11255 }, { "epoch": 0.6977320609740985, "grad_norm": 1.6503676176071167, "learning_rate": 3.1824408062096404e-07, "loss": 0.8684, "step": 11260 }, { "epoch": 0.6980418887098773, "grad_norm": 1.4553288221359253, "learning_rate": 3.179179440349618e-07, "loss": 0.7634, "step": 11265 }, { "epoch": 0.6983517164456562, "grad_norm": 1.9088237285614014, "learning_rate": 3.175918074489596e-07, "loss": 0.835, "step": 11270 }, { "epoch": 0.6986615441814351, "grad_norm": 1.675633192062378, "learning_rate": 3.1726567086295743e-07, "loss": 0.8396, "step": 11275 }, { "epoch": 0.698971371917214, "grad_norm": 1.56244695186615, "learning_rate": 3.169395342769552e-07, "loss": 0.8459, "step": 11280 }, { "epoch": 0.699281199652993, "grad_norm": 1.6457371711730957, "learning_rate": 3.1661339769095295e-07, "loss": 0.8466, "step": 11285 }, { "epoch": 0.6995910273887719, "grad_norm": 1.584565281867981, "learning_rate": 3.162872611049507e-07, "loss": 0.825, "step": 11290 }, { "epoch": 0.6999008551245508, "grad_norm": 1.7365069389343262, "learning_rate": 3.1596112451894853e-07, "loss": 0.864, "step": 11295 }, { "epoch": 0.7002106828603296, "grad_norm": 1.6059904098510742, "learning_rate": 3.156349879329463e-07, "loss": 0.8315, "step": 11300 }, { "epoch": 0.7005205105961085, "grad_norm": 1.6295363903045654, "learning_rate": 3.1530885134694406e-07, "loss": 0.8456, "step": 11305 }, { "epoch": 0.7008303383318875, "grad_norm": 1.5593106746673584, "learning_rate": 3.1498271476094187e-07, "loss": 0.8235, "step": 11310 }, { "epoch": 0.7011401660676664, "grad_norm": 1.6507943868637085, "learning_rate": 3.1465657817493963e-07, "loss": 0.8365, "step": 11315 }, { "epoch": 0.7014499938034453, "grad_norm": 1.8942301273345947, "learning_rate": 3.1433044158893745e-07, "loss": 0.8794, "step": 11320 }, { "epoch": 0.7017598215392242, "grad_norm": 1.6109447479248047, "learning_rate": 3.140043050029352e-07, "loss": 0.8564, "step": 11325 }, { "epoch": 0.7020696492750031, "grad_norm": 1.4913012981414795, "learning_rate": 3.1367816841693303e-07, "loss": 0.8684, "step": 11330 }, { "epoch": 0.702379477010782, "grad_norm": 1.8390504121780396, "learning_rate": 3.133520318309308e-07, "loss": 0.7857, "step": 11335 }, { "epoch": 0.7026893047465609, "grad_norm": 1.4778220653533936, "learning_rate": 3.1302589524492855e-07, "loss": 0.8891, "step": 11340 }, { "epoch": 0.7029991324823398, "grad_norm": 1.5015125274658203, "learning_rate": 3.126997586589263e-07, "loss": 0.8665, "step": 11345 }, { "epoch": 0.7033089602181187, "grad_norm": 1.7816864252090454, "learning_rate": 3.1237362207292413e-07, "loss": 0.7787, "step": 11350 }, { "epoch": 0.7036187879538977, "grad_norm": 1.5324723720550537, "learning_rate": 3.120474854869219e-07, "loss": 0.818, "step": 11355 }, { "epoch": 0.7039286156896766, "grad_norm": 1.5102659463882446, "learning_rate": 3.1172134890091966e-07, "loss": 0.849, "step": 11360 }, { "epoch": 0.7042384434254555, "grad_norm": 1.5813989639282227, "learning_rate": 3.113952123149175e-07, "loss": 0.8501, "step": 11365 }, { "epoch": 0.7045482711612343, "grad_norm": 1.776411533355713, "learning_rate": 3.110690757289153e-07, "loss": 0.8175, "step": 11370 }, { "epoch": 0.7048580988970132, "grad_norm": 1.8552647829055786, "learning_rate": 3.1074293914291305e-07, "loss": 0.8207, "step": 11375 }, { "epoch": 0.7051679266327922, "grad_norm": 1.7496776580810547, "learning_rate": 3.104168025569108e-07, "loss": 0.8864, "step": 11380 }, { "epoch": 0.7054777543685711, "grad_norm": 1.6682649850845337, "learning_rate": 3.1009066597090863e-07, "loss": 0.8553, "step": 11385 }, { "epoch": 0.70578758210435, "grad_norm": 1.5550585985183716, "learning_rate": 3.097645293849064e-07, "loss": 0.8644, "step": 11390 }, { "epoch": 0.7060974098401289, "grad_norm": 1.8268351554870605, "learning_rate": 3.0943839279890415e-07, "loss": 0.8476, "step": 11395 }, { "epoch": 0.7064072375759078, "grad_norm": 1.6614879369735718, "learning_rate": 3.091122562129019e-07, "loss": 0.8261, "step": 11400 }, { "epoch": 0.7067170653116867, "grad_norm": 1.5331299304962158, "learning_rate": 3.0878611962689973e-07, "loss": 0.9031, "step": 11405 }, { "epoch": 0.7070268930474656, "grad_norm": 1.520064115524292, "learning_rate": 3.084599830408975e-07, "loss": 0.8481, "step": 11410 }, { "epoch": 0.7073367207832445, "grad_norm": 1.6765999794006348, "learning_rate": 3.081338464548953e-07, "loss": 0.9198, "step": 11415 }, { "epoch": 0.7076465485190234, "grad_norm": 1.4884917736053467, "learning_rate": 3.0780770986889307e-07, "loss": 0.7865, "step": 11420 }, { "epoch": 0.7079563762548023, "grad_norm": 1.6331202983856201, "learning_rate": 3.074815732828909e-07, "loss": 0.8413, "step": 11425 }, { "epoch": 0.7082662039905813, "grad_norm": 1.3684033155441284, "learning_rate": 3.0715543669688865e-07, "loss": 0.8217, "step": 11430 }, { "epoch": 0.7085760317263602, "grad_norm": 1.631818175315857, "learning_rate": 3.068293001108864e-07, "loss": 0.8381, "step": 11435 }, { "epoch": 0.708885859462139, "grad_norm": 1.6431365013122559, "learning_rate": 3.065031635248842e-07, "loss": 0.8021, "step": 11440 }, { "epoch": 0.7091956871979179, "grad_norm": 1.5915741920471191, "learning_rate": 3.06177026938882e-07, "loss": 0.8832, "step": 11445 }, { "epoch": 0.7095055149336968, "grad_norm": 1.914555311203003, "learning_rate": 3.0585089035287975e-07, "loss": 0.8478, "step": 11450 }, { "epoch": 0.7098153426694758, "grad_norm": 1.8018525838851929, "learning_rate": 3.055247537668775e-07, "loss": 0.8556, "step": 11455 }, { "epoch": 0.7101251704052547, "grad_norm": 1.703657627105713, "learning_rate": 3.051986171808754e-07, "loss": 0.8186, "step": 11460 }, { "epoch": 0.7104349981410336, "grad_norm": 1.4715288877487183, "learning_rate": 3.0487248059487314e-07, "loss": 0.8343, "step": 11465 }, { "epoch": 0.7107448258768125, "grad_norm": 1.5749222040176392, "learning_rate": 3.045463440088709e-07, "loss": 0.7967, "step": 11470 }, { "epoch": 0.7110546536125913, "grad_norm": 1.9453526735305786, "learning_rate": 3.0422020742286867e-07, "loss": 0.8508, "step": 11475 }, { "epoch": 0.7113644813483703, "grad_norm": 1.6002776622772217, "learning_rate": 3.038940708368665e-07, "loss": 0.811, "step": 11480 }, { "epoch": 0.7116743090841492, "grad_norm": 1.6135684251785278, "learning_rate": 3.0356793425086425e-07, "loss": 0.8177, "step": 11485 }, { "epoch": 0.7119841368199281, "grad_norm": 1.765280842781067, "learning_rate": 3.03241797664862e-07, "loss": 0.8066, "step": 11490 }, { "epoch": 0.712293964555707, "grad_norm": 1.716092586517334, "learning_rate": 3.0291566107885977e-07, "loss": 0.8556, "step": 11495 }, { "epoch": 0.712603792291486, "grad_norm": 1.5250134468078613, "learning_rate": 3.025895244928576e-07, "loss": 0.8516, "step": 11500 }, { "epoch": 0.7129136200272649, "grad_norm": 1.6441845893859863, "learning_rate": 3.022633879068554e-07, "loss": 0.8789, "step": 11505 }, { "epoch": 0.7132234477630437, "grad_norm": 1.6758371591567993, "learning_rate": 3.0193725132085317e-07, "loss": 0.8225, "step": 11510 }, { "epoch": 0.7135332754988226, "grad_norm": 1.6358846426010132, "learning_rate": 3.01611114734851e-07, "loss": 0.828, "step": 11515 }, { "epoch": 0.7138431032346015, "grad_norm": 1.6210417747497559, "learning_rate": 3.0128497814884874e-07, "loss": 0.8425, "step": 11520 }, { "epoch": 0.7141529309703805, "grad_norm": 1.9104944467544556, "learning_rate": 3.009588415628465e-07, "loss": 0.8418, "step": 11525 }, { "epoch": 0.7144627587061594, "grad_norm": 1.9660066366195679, "learning_rate": 3.0063270497684427e-07, "loss": 0.8365, "step": 11530 }, { "epoch": 0.7147725864419383, "grad_norm": 1.6737570762634277, "learning_rate": 3.003065683908421e-07, "loss": 0.8387, "step": 11535 }, { "epoch": 0.7150824141777172, "grad_norm": 1.726465106010437, "learning_rate": 2.9998043180483985e-07, "loss": 0.8284, "step": 11540 }, { "epoch": 0.715392241913496, "grad_norm": 1.6099671125411987, "learning_rate": 2.996542952188376e-07, "loss": 0.7872, "step": 11545 }, { "epoch": 0.715702069649275, "grad_norm": 1.7606831789016724, "learning_rate": 2.9932815863283537e-07, "loss": 0.8646, "step": 11550 }, { "epoch": 0.7160118973850539, "grad_norm": 1.818821907043457, "learning_rate": 2.9900202204683324e-07, "loss": 0.8286, "step": 11555 }, { "epoch": 0.7163217251208328, "grad_norm": 1.7864813804626465, "learning_rate": 2.98675885460831e-07, "loss": 0.8465, "step": 11560 }, { "epoch": 0.7166315528566117, "grad_norm": 1.5758421421051025, "learning_rate": 2.9834974887482876e-07, "loss": 0.8231, "step": 11565 }, { "epoch": 0.7169413805923907, "grad_norm": 1.716545820236206, "learning_rate": 2.980236122888266e-07, "loss": 0.8015, "step": 11570 }, { "epoch": 0.7172512083281696, "grad_norm": 1.692609190940857, "learning_rate": 2.9769747570282434e-07, "loss": 0.843, "step": 11575 }, { "epoch": 0.7175610360639485, "grad_norm": 1.853581190109253, "learning_rate": 2.973713391168221e-07, "loss": 0.8203, "step": 11580 }, { "epoch": 0.7178708637997273, "grad_norm": 1.4892462491989136, "learning_rate": 2.9704520253081987e-07, "loss": 0.8201, "step": 11585 }, { "epoch": 0.7181806915355062, "grad_norm": 1.7016987800598145, "learning_rate": 2.967190659448177e-07, "loss": 0.8092, "step": 11590 }, { "epoch": 0.7184905192712852, "grad_norm": 1.7732329368591309, "learning_rate": 2.9639292935881544e-07, "loss": 0.8264, "step": 11595 }, { "epoch": 0.7188003470070641, "grad_norm": 1.4729129076004028, "learning_rate": 2.9606679277281326e-07, "loss": 0.8234, "step": 11600 }, { "epoch": 0.719110174742843, "grad_norm": 1.5042972564697266, "learning_rate": 2.95740656186811e-07, "loss": 0.8043, "step": 11605 }, { "epoch": 0.7194200024786219, "grad_norm": 1.6538646221160889, "learning_rate": 2.9541451960080884e-07, "loss": 0.8562, "step": 11610 }, { "epoch": 0.7197298302144008, "grad_norm": 1.669419765472412, "learning_rate": 2.950883830148066e-07, "loss": 0.8587, "step": 11615 }, { "epoch": 0.7200396579501797, "grad_norm": 1.7906357049942017, "learning_rate": 2.9476224642880436e-07, "loss": 0.8686, "step": 11620 }, { "epoch": 0.7203494856859586, "grad_norm": 1.6192494630813599, "learning_rate": 2.944361098428021e-07, "loss": 0.8327, "step": 11625 }, { "epoch": 0.7206593134217375, "grad_norm": 1.929640293121338, "learning_rate": 2.9410997325679994e-07, "loss": 0.8719, "step": 11630 }, { "epoch": 0.7209691411575164, "grad_norm": 1.6567485332489014, "learning_rate": 2.937838366707977e-07, "loss": 0.829, "step": 11635 }, { "epoch": 0.7212789688932953, "grad_norm": 2.843085527420044, "learning_rate": 2.9345770008479547e-07, "loss": 0.8462, "step": 11640 }, { "epoch": 0.7215887966290743, "grad_norm": 1.5401910543441772, "learning_rate": 2.931315634987933e-07, "loss": 0.91, "step": 11645 }, { "epoch": 0.7218986243648532, "grad_norm": 1.6050065755844116, "learning_rate": 2.928054269127911e-07, "loss": 0.9196, "step": 11650 }, { "epoch": 0.722208452100632, "grad_norm": 1.693787932395935, "learning_rate": 2.9247929032678886e-07, "loss": 0.8392, "step": 11655 }, { "epoch": 0.7225182798364109, "grad_norm": 1.602575421333313, "learning_rate": 2.921531537407866e-07, "loss": 0.8241, "step": 11660 }, { "epoch": 0.7228281075721898, "grad_norm": 1.5482865571975708, "learning_rate": 2.9182701715478444e-07, "loss": 0.8232, "step": 11665 }, { "epoch": 0.7231379353079688, "grad_norm": 1.5995457172393799, "learning_rate": 2.915008805687822e-07, "loss": 0.8473, "step": 11670 }, { "epoch": 0.7234477630437477, "grad_norm": 1.4193928241729736, "learning_rate": 2.9117474398277996e-07, "loss": 0.8866, "step": 11675 }, { "epoch": 0.7237575907795266, "grad_norm": 1.6580811738967896, "learning_rate": 2.908486073967777e-07, "loss": 0.8516, "step": 11680 }, { "epoch": 0.7240674185153055, "grad_norm": 1.754204273223877, "learning_rate": 2.9052247081077554e-07, "loss": 0.7966, "step": 11685 }, { "epoch": 0.7243772462510844, "grad_norm": 2.0248208045959473, "learning_rate": 2.901963342247733e-07, "loss": 0.8113, "step": 11690 }, { "epoch": 0.7246870739868633, "grad_norm": 1.4638395309448242, "learning_rate": 2.898701976387711e-07, "loss": 0.7948, "step": 11695 }, { "epoch": 0.7249969017226422, "grad_norm": 1.5526025295257568, "learning_rate": 2.895440610527689e-07, "loss": 0.8388, "step": 11700 }, { "epoch": 0.7253067294584211, "grad_norm": 1.7311400175094604, "learning_rate": 2.892179244667667e-07, "loss": 0.8466, "step": 11705 }, { "epoch": 0.7256165571942, "grad_norm": 1.937572717666626, "learning_rate": 2.8889178788076446e-07, "loss": 0.8225, "step": 11710 }, { "epoch": 0.725926384929979, "grad_norm": 1.616665244102478, "learning_rate": 2.885656512947622e-07, "loss": 0.8143, "step": 11715 }, { "epoch": 0.7262362126657579, "grad_norm": 1.7034623622894287, "learning_rate": 2.8823951470876004e-07, "loss": 0.83, "step": 11720 }, { "epoch": 0.7265460404015367, "grad_norm": 2.1087357997894287, "learning_rate": 2.879133781227578e-07, "loss": 0.8732, "step": 11725 }, { "epoch": 0.7268558681373156, "grad_norm": 1.6630109548568726, "learning_rate": 2.8758724153675556e-07, "loss": 0.8619, "step": 11730 }, { "epoch": 0.7271656958730945, "grad_norm": 1.60958731174469, "learning_rate": 2.872611049507533e-07, "loss": 0.8302, "step": 11735 }, { "epoch": 0.7274755236088735, "grad_norm": 1.607345461845398, "learning_rate": 2.8693496836475114e-07, "loss": 0.8585, "step": 11740 }, { "epoch": 0.7277853513446524, "grad_norm": 1.5607095956802368, "learning_rate": 2.8660883177874895e-07, "loss": 0.8104, "step": 11745 }, { "epoch": 0.7280951790804313, "grad_norm": 1.5377088785171509, "learning_rate": 2.862826951927467e-07, "loss": 0.7786, "step": 11750 }, { "epoch": 0.7284050068162102, "grad_norm": 1.7703851461410522, "learning_rate": 2.859565586067445e-07, "loss": 0.9008, "step": 11755 }, { "epoch": 0.728714834551989, "grad_norm": 1.4754254817962646, "learning_rate": 2.856304220207423e-07, "loss": 0.8471, "step": 11760 }, { "epoch": 0.729024662287768, "grad_norm": 1.5484377145767212, "learning_rate": 2.8530428543474006e-07, "loss": 0.788, "step": 11765 }, { "epoch": 0.7293344900235469, "grad_norm": 1.767181158065796, "learning_rate": 2.849781488487378e-07, "loss": 0.7977, "step": 11770 }, { "epoch": 0.7296443177593258, "grad_norm": 1.6663837432861328, "learning_rate": 2.846520122627356e-07, "loss": 0.8484, "step": 11775 }, { "epoch": 0.7299541454951047, "grad_norm": 1.687380075454712, "learning_rate": 2.843258756767334e-07, "loss": 0.8279, "step": 11780 }, { "epoch": 0.7302639732308837, "grad_norm": 1.5489252805709839, "learning_rate": 2.8399973909073116e-07, "loss": 0.8622, "step": 11785 }, { "epoch": 0.7305738009666626, "grad_norm": 1.766649842262268, "learning_rate": 2.83673602504729e-07, "loss": 0.8185, "step": 11790 }, { "epoch": 0.7308836287024414, "grad_norm": 1.7426440715789795, "learning_rate": 2.833474659187268e-07, "loss": 0.8443, "step": 11795 }, { "epoch": 0.7311934564382203, "grad_norm": 1.6790425777435303, "learning_rate": 2.8302132933272455e-07, "loss": 0.8696, "step": 11800 }, { "epoch": 0.7315032841739992, "grad_norm": 1.6852138042449951, "learning_rate": 2.826951927467223e-07, "loss": 0.8187, "step": 11805 }, { "epoch": 0.7318131119097782, "grad_norm": 1.9655539989471436, "learning_rate": 2.823690561607201e-07, "loss": 0.8018, "step": 11810 }, { "epoch": 0.7321229396455571, "grad_norm": 1.6227571964263916, "learning_rate": 2.820429195747179e-07, "loss": 0.8604, "step": 11815 }, { "epoch": 0.732432767381336, "grad_norm": 1.7610256671905518, "learning_rate": 2.8171678298871566e-07, "loss": 0.7895, "step": 11820 }, { "epoch": 0.7327425951171149, "grad_norm": 1.786684513092041, "learning_rate": 2.813906464027134e-07, "loss": 0.8817, "step": 11825 }, { "epoch": 0.7330524228528937, "grad_norm": 1.7019760608673096, "learning_rate": 2.810645098167112e-07, "loss": 0.8905, "step": 11830 }, { "epoch": 0.7333622505886727, "grad_norm": 1.7341760396957397, "learning_rate": 2.8073837323070905e-07, "loss": 0.8128, "step": 11835 }, { "epoch": 0.7336720783244516, "grad_norm": 1.9696900844573975, "learning_rate": 2.804122366447068e-07, "loss": 0.8483, "step": 11840 }, { "epoch": 0.7339819060602305, "grad_norm": 1.8724082708358765, "learning_rate": 2.800861000587046e-07, "loss": 0.8228, "step": 11845 }, { "epoch": 0.7342917337960094, "grad_norm": 2.181581497192383, "learning_rate": 2.797599634727024e-07, "loss": 0.8512, "step": 11850 }, { "epoch": 0.7346015615317883, "grad_norm": 1.540084719657898, "learning_rate": 2.7943382688670015e-07, "loss": 0.8311, "step": 11855 }, { "epoch": 0.7349113892675673, "grad_norm": 1.5838383436203003, "learning_rate": 2.791076903006979e-07, "loss": 0.7808, "step": 11860 }, { "epoch": 0.7352212170033462, "grad_norm": 1.5648609399795532, "learning_rate": 2.787815537146957e-07, "loss": 0.7731, "step": 11865 }, { "epoch": 0.735531044739125, "grad_norm": 1.6015483140945435, "learning_rate": 2.784554171286935e-07, "loss": 0.8809, "step": 11870 }, { "epoch": 0.7358408724749039, "grad_norm": 1.9932246208190918, "learning_rate": 2.7812928054269125e-07, "loss": 0.809, "step": 11875 }, { "epoch": 0.7361507002106829, "grad_norm": 1.7283415794372559, "learning_rate": 2.77803143956689e-07, "loss": 0.8648, "step": 11880 }, { "epoch": 0.7364605279464618, "grad_norm": 1.5575391054153442, "learning_rate": 2.7747700737068683e-07, "loss": 0.8437, "step": 11885 }, { "epoch": 0.7367703556822407, "grad_norm": 1.4121953248977661, "learning_rate": 2.7715087078468465e-07, "loss": 0.9141, "step": 11890 }, { "epoch": 0.7370801834180196, "grad_norm": 1.6205124855041504, "learning_rate": 2.768247341986824e-07, "loss": 0.872, "step": 11895 }, { "epoch": 0.7373900111537985, "grad_norm": 1.8232074975967407, "learning_rate": 2.7649859761268017e-07, "loss": 0.873, "step": 11900 }, { "epoch": 0.7376998388895774, "grad_norm": 1.3952769041061401, "learning_rate": 2.7617246102667794e-07, "loss": 0.8358, "step": 11905 }, { "epoch": 0.7380096666253563, "grad_norm": 1.6873114109039307, "learning_rate": 2.7584632444067575e-07, "loss": 0.8327, "step": 11910 }, { "epoch": 0.7383194943611352, "grad_norm": 2.2845027446746826, "learning_rate": 2.755201878546735e-07, "loss": 0.8424, "step": 11915 }, { "epoch": 0.7386293220969141, "grad_norm": 1.6233422756195068, "learning_rate": 2.751940512686713e-07, "loss": 0.8219, "step": 11920 }, { "epoch": 0.738939149832693, "grad_norm": 1.7413493394851685, "learning_rate": 2.748679146826691e-07, "loss": 0.8633, "step": 11925 }, { "epoch": 0.739248977568472, "grad_norm": 1.5837829113006592, "learning_rate": 2.745417780966669e-07, "loss": 0.8386, "step": 11930 }, { "epoch": 0.7395588053042509, "grad_norm": 1.7029304504394531, "learning_rate": 2.7421564151066467e-07, "loss": 0.8383, "step": 11935 }, { "epoch": 0.7398686330400297, "grad_norm": 1.5094932317733765, "learning_rate": 2.7388950492466243e-07, "loss": 0.7969, "step": 11940 }, { "epoch": 0.7401784607758086, "grad_norm": 1.4737169742584229, "learning_rate": 2.7356336833866025e-07, "loss": 0.7917, "step": 11945 }, { "epoch": 0.7404882885115875, "grad_norm": 1.6357616186141968, "learning_rate": 2.73237231752658e-07, "loss": 0.8351, "step": 11950 }, { "epoch": 0.7407981162473665, "grad_norm": 1.7361469268798828, "learning_rate": 2.7291109516665577e-07, "loss": 0.7999, "step": 11955 }, { "epoch": 0.7411079439831454, "grad_norm": 1.5603796243667603, "learning_rate": 2.7258495858065353e-07, "loss": 0.8688, "step": 11960 }, { "epoch": 0.7414177717189243, "grad_norm": 1.6049413681030273, "learning_rate": 2.7225882199465135e-07, "loss": 0.7906, "step": 11965 }, { "epoch": 0.7417275994547032, "grad_norm": 1.5708884000778198, "learning_rate": 2.719326854086491e-07, "loss": 0.8391, "step": 11970 }, { "epoch": 0.742037427190482, "grad_norm": 1.5896075963974, "learning_rate": 2.716065488226469e-07, "loss": 0.8187, "step": 11975 }, { "epoch": 0.742347254926261, "grad_norm": 1.6734040975570679, "learning_rate": 2.7128041223664474e-07, "loss": 0.815, "step": 11980 }, { "epoch": 0.7426570826620399, "grad_norm": 1.7690640687942505, "learning_rate": 2.709542756506425e-07, "loss": 0.8377, "step": 11985 }, { "epoch": 0.7429669103978188, "grad_norm": 1.9310567378997803, "learning_rate": 2.7062813906464027e-07, "loss": 0.877, "step": 11990 }, { "epoch": 0.7432767381335977, "grad_norm": 2.3077728748321533, "learning_rate": 2.7030200247863803e-07, "loss": 0.8131, "step": 11995 }, { "epoch": 0.7435865658693767, "grad_norm": 1.6327122449874878, "learning_rate": 2.6997586589263585e-07, "loss": 0.8631, "step": 12000 }, { "epoch": 0.7438963936051556, "grad_norm": 1.6435234546661377, "learning_rate": 2.696497293066336e-07, "loss": 0.8311, "step": 12005 }, { "epoch": 0.7442062213409344, "grad_norm": 1.682268500328064, "learning_rate": 2.6932359272063137e-07, "loss": 0.8241, "step": 12010 }, { "epoch": 0.7445160490767133, "grad_norm": 1.5919898748397827, "learning_rate": 2.6899745613462913e-07, "loss": 0.8092, "step": 12015 }, { "epoch": 0.7448258768124922, "grad_norm": 1.9064427614212036, "learning_rate": 2.6867131954862695e-07, "loss": 0.8349, "step": 12020 }, { "epoch": 0.7451357045482712, "grad_norm": 1.7045423984527588, "learning_rate": 2.6834518296262476e-07, "loss": 0.8528, "step": 12025 }, { "epoch": 0.7454455322840501, "grad_norm": 1.737545132637024, "learning_rate": 2.6801904637662253e-07, "loss": 0.8048, "step": 12030 }, { "epoch": 0.745755360019829, "grad_norm": 1.7213367223739624, "learning_rate": 2.676929097906203e-07, "loss": 0.8549, "step": 12035 }, { "epoch": 0.7460651877556079, "grad_norm": 1.5626963376998901, "learning_rate": 2.673667732046181e-07, "loss": 0.8894, "step": 12040 }, { "epoch": 0.7463750154913867, "grad_norm": 1.5986100435256958, "learning_rate": 2.6704063661861587e-07, "loss": 0.8772, "step": 12045 }, { "epoch": 0.7466848432271657, "grad_norm": 1.6666293144226074, "learning_rate": 2.6671450003261363e-07, "loss": 0.8074, "step": 12050 }, { "epoch": 0.7469946709629446, "grad_norm": 1.4945405721664429, "learning_rate": 2.6638836344661145e-07, "loss": 0.7913, "step": 12055 }, { "epoch": 0.7473044986987235, "grad_norm": 1.6885786056518555, "learning_rate": 2.660622268606092e-07, "loss": 0.807, "step": 12060 }, { "epoch": 0.7476143264345024, "grad_norm": 1.705877423286438, "learning_rate": 2.6573609027460697e-07, "loss": 0.8393, "step": 12065 }, { "epoch": 0.7479241541702814, "grad_norm": 1.6066336631774902, "learning_rate": 2.6540995368860473e-07, "loss": 0.8385, "step": 12070 }, { "epoch": 0.7482339819060603, "grad_norm": 1.4911136627197266, "learning_rate": 2.650838171026026e-07, "loss": 0.8703, "step": 12075 }, { "epoch": 0.7485438096418391, "grad_norm": 1.5525881052017212, "learning_rate": 2.6475768051660036e-07, "loss": 0.8429, "step": 12080 }, { "epoch": 0.748853637377618, "grad_norm": 1.6710090637207031, "learning_rate": 2.644315439305981e-07, "loss": 0.8181, "step": 12085 }, { "epoch": 0.7491634651133969, "grad_norm": 1.6686670780181885, "learning_rate": 2.641054073445959e-07, "loss": 0.7988, "step": 12090 }, { "epoch": 0.7494732928491759, "grad_norm": 2.199380874633789, "learning_rate": 2.637792707585937e-07, "loss": 0.8725, "step": 12095 }, { "epoch": 0.7497831205849548, "grad_norm": 2.009506940841675, "learning_rate": 2.6345313417259147e-07, "loss": 0.7905, "step": 12100 }, { "epoch": 0.7500929483207337, "grad_norm": 1.5329688787460327, "learning_rate": 2.6312699758658923e-07, "loss": 0.8132, "step": 12105 }, { "epoch": 0.7504027760565126, "grad_norm": 1.4581785202026367, "learning_rate": 2.62800861000587e-07, "loss": 0.8106, "step": 12110 }, { "epoch": 0.7507126037922914, "grad_norm": 1.7292143106460571, "learning_rate": 2.624747244145848e-07, "loss": 0.8555, "step": 12115 }, { "epoch": 0.7510224315280704, "grad_norm": 1.5470852851867676, "learning_rate": 2.621485878285826e-07, "loss": 0.8696, "step": 12120 }, { "epoch": 0.7513322592638493, "grad_norm": 1.7068780660629272, "learning_rate": 2.618224512425804e-07, "loss": 0.8075, "step": 12125 }, { "epoch": 0.7516420869996282, "grad_norm": 1.643983006477356, "learning_rate": 2.614963146565782e-07, "loss": 0.8395, "step": 12130 }, { "epoch": 0.7519519147354071, "grad_norm": 1.4173481464385986, "learning_rate": 2.6117017807057596e-07, "loss": 0.8269, "step": 12135 }, { "epoch": 0.752261742471186, "grad_norm": 1.5375890731811523, "learning_rate": 2.608440414845737e-07, "loss": 0.8119, "step": 12140 }, { "epoch": 0.752571570206965, "grad_norm": 1.6651617288589478, "learning_rate": 2.605179048985715e-07, "loss": 0.815, "step": 12145 }, { "epoch": 0.7528813979427438, "grad_norm": 1.595442295074463, "learning_rate": 2.601917683125693e-07, "loss": 0.7894, "step": 12150 }, { "epoch": 0.7531912256785227, "grad_norm": 1.73128342628479, "learning_rate": 2.5986563172656707e-07, "loss": 0.8213, "step": 12155 }, { "epoch": 0.7535010534143016, "grad_norm": 1.7811254262924194, "learning_rate": 2.5953949514056483e-07, "loss": 0.8276, "step": 12160 }, { "epoch": 0.7538108811500805, "grad_norm": 1.8279145956039429, "learning_rate": 2.5921335855456264e-07, "loss": 0.8465, "step": 12165 }, { "epoch": 0.7541207088858595, "grad_norm": 1.6485322713851929, "learning_rate": 2.5888722196856046e-07, "loss": 0.808, "step": 12170 }, { "epoch": 0.7544305366216384, "grad_norm": 1.8810158967971802, "learning_rate": 2.585610853825582e-07, "loss": 0.8917, "step": 12175 }, { "epoch": 0.7547403643574173, "grad_norm": 1.5954281091690063, "learning_rate": 2.58234948796556e-07, "loss": 0.8335, "step": 12180 }, { "epoch": 0.7550501920931962, "grad_norm": 1.5156534910202026, "learning_rate": 2.5790881221055375e-07, "loss": 0.8324, "step": 12185 }, { "epoch": 0.755360019828975, "grad_norm": 1.6980409622192383, "learning_rate": 2.5758267562455156e-07, "loss": 0.8279, "step": 12190 }, { "epoch": 0.755669847564754, "grad_norm": 1.6828889846801758, "learning_rate": 2.572565390385493e-07, "loss": 0.8573, "step": 12195 }, { "epoch": 0.7559796753005329, "grad_norm": 1.857689619064331, "learning_rate": 2.569304024525471e-07, "loss": 0.8772, "step": 12200 }, { "epoch": 0.7562895030363118, "grad_norm": 2.1919195652008057, "learning_rate": 2.566042658665449e-07, "loss": 0.8151, "step": 12205 }, { "epoch": 0.7565993307720907, "grad_norm": 1.882637858390808, "learning_rate": 2.5627812928054266e-07, "loss": 0.8356, "step": 12210 }, { "epoch": 0.7569091585078697, "grad_norm": 1.5626953840255737, "learning_rate": 2.559519926945405e-07, "loss": 0.812, "step": 12215 }, { "epoch": 0.7572189862436486, "grad_norm": 1.7334452867507935, "learning_rate": 2.5562585610853824e-07, "loss": 0.8315, "step": 12220 }, { "epoch": 0.7575288139794274, "grad_norm": 1.8952217102050781, "learning_rate": 2.5529971952253606e-07, "loss": 0.8419, "step": 12225 }, { "epoch": 0.7578386417152063, "grad_norm": 1.7732300758361816, "learning_rate": 2.549735829365338e-07, "loss": 0.8184, "step": 12230 }, { "epoch": 0.7581484694509852, "grad_norm": 2.102527379989624, "learning_rate": 2.546474463505316e-07, "loss": 0.9048, "step": 12235 }, { "epoch": 0.7584582971867642, "grad_norm": 1.6424355506896973, "learning_rate": 2.5432130976452934e-07, "loss": 0.8585, "step": 12240 }, { "epoch": 0.7587681249225431, "grad_norm": 1.60305655002594, "learning_rate": 2.5399517317852716e-07, "loss": 0.8734, "step": 12245 }, { "epoch": 0.759077952658322, "grad_norm": 1.6674180030822754, "learning_rate": 2.536690365925249e-07, "loss": 0.8755, "step": 12250 }, { "epoch": 0.7593877803941009, "grad_norm": 1.6777504682540894, "learning_rate": 2.533429000065227e-07, "loss": 0.804, "step": 12255 }, { "epoch": 0.7596976081298797, "grad_norm": 1.5458297729492188, "learning_rate": 2.5301676342052055e-07, "loss": 0.8109, "step": 12260 }, { "epoch": 0.7600074358656587, "grad_norm": 1.6330121755599976, "learning_rate": 2.526906268345183e-07, "loss": 0.8382, "step": 12265 }, { "epoch": 0.7603172636014376, "grad_norm": 1.5988837480545044, "learning_rate": 2.523644902485161e-07, "loss": 0.8508, "step": 12270 }, { "epoch": 0.7606270913372165, "grad_norm": 1.3910964727401733, "learning_rate": 2.5203835366251384e-07, "loss": 0.7919, "step": 12275 }, { "epoch": 0.7609369190729954, "grad_norm": 1.6846717596054077, "learning_rate": 2.5171221707651166e-07, "loss": 0.8539, "step": 12280 }, { "epoch": 0.7612467468087744, "grad_norm": 1.7526158094406128, "learning_rate": 2.513860804905094e-07, "loss": 0.8191, "step": 12285 }, { "epoch": 0.7615565745445533, "grad_norm": 1.79002845287323, "learning_rate": 2.510599439045072e-07, "loss": 0.8655, "step": 12290 }, { "epoch": 0.7618664022803321, "grad_norm": 1.633569598197937, "learning_rate": 2.5073380731850494e-07, "loss": 0.8445, "step": 12295 }, { "epoch": 0.762176230016111, "grad_norm": 1.5475099086761475, "learning_rate": 2.5040767073250276e-07, "loss": 0.8287, "step": 12300 }, { "epoch": 0.7624860577518899, "grad_norm": 1.474124789237976, "learning_rate": 2.500815341465005e-07, "loss": 0.8585, "step": 12305 }, { "epoch": 0.7627958854876689, "grad_norm": 1.6689743995666504, "learning_rate": 2.4975539756049834e-07, "loss": 0.8634, "step": 12310 }, { "epoch": 0.7631057132234478, "grad_norm": 1.7506043910980225, "learning_rate": 2.494292609744961e-07, "loss": 0.9025, "step": 12315 }, { "epoch": 0.7634155409592267, "grad_norm": 1.5099395513534546, "learning_rate": 2.491031243884939e-07, "loss": 0.838, "step": 12320 }, { "epoch": 0.7637253686950056, "grad_norm": 1.82889986038208, "learning_rate": 2.487769878024917e-07, "loss": 0.9041, "step": 12325 }, { "epoch": 0.7640351964307844, "grad_norm": 1.839638113975525, "learning_rate": 2.4845085121648944e-07, "loss": 0.8534, "step": 12330 }, { "epoch": 0.7643450241665634, "grad_norm": 1.7713228464126587, "learning_rate": 2.4812471463048726e-07, "loss": 0.8327, "step": 12335 }, { "epoch": 0.7646548519023423, "grad_norm": 1.940561056137085, "learning_rate": 2.47798578044485e-07, "loss": 0.8249, "step": 12340 }, { "epoch": 0.7649646796381212, "grad_norm": 2.0608010292053223, "learning_rate": 2.4747244145848283e-07, "loss": 0.8279, "step": 12345 }, { "epoch": 0.7652745073739001, "grad_norm": 1.5995310544967651, "learning_rate": 2.471463048724806e-07, "loss": 0.8387, "step": 12350 }, { "epoch": 0.765584335109679, "grad_norm": 1.7294107675552368, "learning_rate": 2.4682016828647836e-07, "loss": 0.771, "step": 12355 }, { "epoch": 0.765894162845458, "grad_norm": 1.6579457521438599, "learning_rate": 2.464940317004761e-07, "loss": 0.8103, "step": 12360 }, { "epoch": 0.7662039905812368, "grad_norm": 1.6278367042541504, "learning_rate": 2.4616789511447394e-07, "loss": 0.8413, "step": 12365 }, { "epoch": 0.7665138183170157, "grad_norm": 1.6960997581481934, "learning_rate": 2.458417585284717e-07, "loss": 0.7898, "step": 12370 }, { "epoch": 0.7668236460527946, "grad_norm": 1.503017783164978, "learning_rate": 2.455156219424695e-07, "loss": 0.8241, "step": 12375 }, { "epoch": 0.7671334737885735, "grad_norm": 1.7900772094726562, "learning_rate": 2.451894853564673e-07, "loss": 0.8194, "step": 12380 }, { "epoch": 0.7674433015243525, "grad_norm": 1.7626205682754517, "learning_rate": 2.4486334877046504e-07, "loss": 0.8592, "step": 12385 }, { "epoch": 0.7677531292601314, "grad_norm": 1.8873857259750366, "learning_rate": 2.4453721218446285e-07, "loss": 0.8526, "step": 12390 }, { "epoch": 0.7680629569959103, "grad_norm": 1.4770358800888062, "learning_rate": 2.442110755984606e-07, "loss": 0.8272, "step": 12395 }, { "epoch": 0.7683727847316891, "grad_norm": 1.5444800853729248, "learning_rate": 2.4388493901245843e-07, "loss": 0.8271, "step": 12400 }, { "epoch": 0.768682612467468, "grad_norm": 1.6830596923828125, "learning_rate": 2.435588024264562e-07, "loss": 0.8619, "step": 12405 }, { "epoch": 0.768992440203247, "grad_norm": 1.4813003540039062, "learning_rate": 2.4323266584045396e-07, "loss": 0.8682, "step": 12410 }, { "epoch": 0.7693022679390259, "grad_norm": 1.7400364875793457, "learning_rate": 2.4290652925445177e-07, "loss": 0.8908, "step": 12415 }, { "epoch": 0.7696120956748048, "grad_norm": 1.8360587358474731, "learning_rate": 2.4258039266844953e-07, "loss": 0.9128, "step": 12420 }, { "epoch": 0.7699219234105837, "grad_norm": 1.7533528804779053, "learning_rate": 2.422542560824473e-07, "loss": 0.8714, "step": 12425 }, { "epoch": 0.7702317511463627, "grad_norm": 1.5827722549438477, "learning_rate": 2.419281194964451e-07, "loss": 0.8042, "step": 12430 }, { "epoch": 0.7705415788821415, "grad_norm": 1.5753742456436157, "learning_rate": 2.416019829104429e-07, "loss": 0.8827, "step": 12435 }, { "epoch": 0.7708514066179204, "grad_norm": 1.5697492361068726, "learning_rate": 2.412758463244407e-07, "loss": 0.8062, "step": 12440 }, { "epoch": 0.7711612343536993, "grad_norm": 1.7216980457305908, "learning_rate": 2.4094970973843845e-07, "loss": 0.8012, "step": 12445 }, { "epoch": 0.7714710620894782, "grad_norm": 1.6672481298446655, "learning_rate": 2.406235731524362e-07, "loss": 0.8102, "step": 12450 }, { "epoch": 0.7717808898252572, "grad_norm": 1.6554890871047974, "learning_rate": 2.40297436566434e-07, "loss": 0.9012, "step": 12455 }, { "epoch": 0.7720907175610361, "grad_norm": 1.7850035429000854, "learning_rate": 2.399712999804318e-07, "loss": 0.8686, "step": 12460 }, { "epoch": 0.772400545296815, "grad_norm": 1.6059268712997437, "learning_rate": 2.396451633944296e-07, "loss": 0.9414, "step": 12465 }, { "epoch": 0.7727103730325938, "grad_norm": 1.9762696027755737, "learning_rate": 2.3931902680842737e-07, "loss": 0.8549, "step": 12470 }, { "epoch": 0.7730202007683727, "grad_norm": 1.5474927425384521, "learning_rate": 2.3899289022242513e-07, "loss": 0.8612, "step": 12475 }, { "epoch": 0.7733300285041517, "grad_norm": 1.6779277324676514, "learning_rate": 2.386667536364229e-07, "loss": 0.8796, "step": 12480 }, { "epoch": 0.7736398562399306, "grad_norm": 2.3313236236572266, "learning_rate": 2.383406170504207e-07, "loss": 0.792, "step": 12485 }, { "epoch": 0.7739496839757095, "grad_norm": 1.6367963552474976, "learning_rate": 2.380144804644185e-07, "loss": 0.8348, "step": 12490 }, { "epoch": 0.7742595117114884, "grad_norm": 1.5071892738342285, "learning_rate": 2.3768834387841626e-07, "loss": 0.8152, "step": 12495 }, { "epoch": 0.7745693394472674, "grad_norm": 1.808895230293274, "learning_rate": 2.3736220729241405e-07, "loss": 0.8839, "step": 12500 }, { "epoch": 0.7748791671830463, "grad_norm": 2.20473575592041, "learning_rate": 2.3703607070641181e-07, "loss": 0.8446, "step": 12505 }, { "epoch": 0.7751889949188251, "grad_norm": 1.4109883308410645, "learning_rate": 2.3670993412040963e-07, "loss": 0.8683, "step": 12510 }, { "epoch": 0.775498822654604, "grad_norm": 1.744354486465454, "learning_rate": 2.363837975344074e-07, "loss": 0.8478, "step": 12515 }, { "epoch": 0.7758086503903829, "grad_norm": 1.9272619485855103, "learning_rate": 2.3605766094840518e-07, "loss": 0.7989, "step": 12520 }, { "epoch": 0.7761184781261619, "grad_norm": 1.6200684309005737, "learning_rate": 2.3573152436240294e-07, "loss": 0.8322, "step": 12525 }, { "epoch": 0.7764283058619408, "grad_norm": 1.871013879776001, "learning_rate": 2.3540538777640073e-07, "loss": 0.8855, "step": 12530 }, { "epoch": 0.7767381335977197, "grad_norm": 1.7953826189041138, "learning_rate": 2.3507925119039855e-07, "loss": 0.8519, "step": 12535 }, { "epoch": 0.7770479613334986, "grad_norm": 1.8252722024917603, "learning_rate": 2.347531146043963e-07, "loss": 0.8563, "step": 12540 }, { "epoch": 0.7773577890692774, "grad_norm": 1.6359857320785522, "learning_rate": 2.344269780183941e-07, "loss": 0.8619, "step": 12545 }, { "epoch": 0.7776676168050564, "grad_norm": 1.8481338024139404, "learning_rate": 2.3410084143239186e-07, "loss": 0.8618, "step": 12550 }, { "epoch": 0.7779774445408353, "grad_norm": 1.506801724433899, "learning_rate": 2.3377470484638968e-07, "loss": 0.806, "step": 12555 }, { "epoch": 0.7782872722766142, "grad_norm": 1.6475685834884644, "learning_rate": 2.3344856826038744e-07, "loss": 0.8747, "step": 12560 }, { "epoch": 0.7785971000123931, "grad_norm": 1.919418215751648, "learning_rate": 2.3312243167438523e-07, "loss": 0.8102, "step": 12565 }, { "epoch": 0.778906927748172, "grad_norm": 2.001979351043701, "learning_rate": 2.32796295088383e-07, "loss": 0.8638, "step": 12570 }, { "epoch": 0.779216755483951, "grad_norm": 1.498509407043457, "learning_rate": 2.3247015850238078e-07, "loss": 0.8317, "step": 12575 }, { "epoch": 0.7795265832197298, "grad_norm": 1.7668142318725586, "learning_rate": 2.3214402191637857e-07, "loss": 0.8515, "step": 12580 }, { "epoch": 0.7798364109555087, "grad_norm": 1.5612461566925049, "learning_rate": 2.3181788533037636e-07, "loss": 0.8423, "step": 12585 }, { "epoch": 0.7801462386912876, "grad_norm": 1.6022250652313232, "learning_rate": 2.3149174874437412e-07, "loss": 0.8565, "step": 12590 }, { "epoch": 0.7804560664270666, "grad_norm": 1.6133705377578735, "learning_rate": 2.311656121583719e-07, "loss": 0.797, "step": 12595 }, { "epoch": 0.7807658941628455, "grad_norm": 1.6339962482452393, "learning_rate": 2.3083947557236967e-07, "loss": 0.8348, "step": 12600 }, { "epoch": 0.7810757218986244, "grad_norm": 1.6712517738342285, "learning_rate": 2.305133389863675e-07, "loss": 0.8628, "step": 12605 }, { "epoch": 0.7813855496344033, "grad_norm": 1.6611049175262451, "learning_rate": 2.3018720240036528e-07, "loss": 0.8882, "step": 12610 }, { "epoch": 0.7816953773701821, "grad_norm": 1.5482628345489502, "learning_rate": 2.2986106581436304e-07, "loss": 0.8264, "step": 12615 }, { "epoch": 0.782005205105961, "grad_norm": 1.9618483781814575, "learning_rate": 2.2953492922836083e-07, "loss": 0.924, "step": 12620 }, { "epoch": 0.78231503284174, "grad_norm": 1.7557168006896973, "learning_rate": 2.2920879264235862e-07, "loss": 0.8331, "step": 12625 }, { "epoch": 0.7826248605775189, "grad_norm": 1.7385376691818237, "learning_rate": 2.288826560563564e-07, "loss": 0.8142, "step": 12630 }, { "epoch": 0.7829346883132978, "grad_norm": 1.879010558128357, "learning_rate": 2.2855651947035417e-07, "loss": 0.8497, "step": 12635 }, { "epoch": 0.7832445160490767, "grad_norm": 1.5408960580825806, "learning_rate": 2.2823038288435196e-07, "loss": 0.8317, "step": 12640 }, { "epoch": 0.7835543437848557, "grad_norm": 1.599068284034729, "learning_rate": 2.2790424629834972e-07, "loss": 0.9115, "step": 12645 }, { "epoch": 0.7838641715206345, "grad_norm": 1.5688152313232422, "learning_rate": 2.2757810971234754e-07, "loss": 0.7816, "step": 12650 }, { "epoch": 0.7841739992564134, "grad_norm": 2.164660692214966, "learning_rate": 2.272519731263453e-07, "loss": 0.863, "step": 12655 }, { "epoch": 0.7844838269921923, "grad_norm": 1.5487098693847656, "learning_rate": 2.2692583654034309e-07, "loss": 0.8327, "step": 12660 }, { "epoch": 0.7847936547279712, "grad_norm": 1.6772910356521606, "learning_rate": 2.2659969995434085e-07, "loss": 0.8703, "step": 12665 }, { "epoch": 0.7851034824637502, "grad_norm": 1.7347041368484497, "learning_rate": 2.2627356336833864e-07, "loss": 0.8801, "step": 12670 }, { "epoch": 0.7854133101995291, "grad_norm": 1.5781711339950562, "learning_rate": 2.2594742678233645e-07, "loss": 0.8969, "step": 12675 }, { "epoch": 0.785723137935308, "grad_norm": 1.8302359580993652, "learning_rate": 2.2562129019633422e-07, "loss": 0.8269, "step": 12680 }, { "epoch": 0.7860329656710868, "grad_norm": 1.4215518236160278, "learning_rate": 2.25295153610332e-07, "loss": 0.8784, "step": 12685 }, { "epoch": 0.7863427934068657, "grad_norm": 1.5014971494674683, "learning_rate": 2.2496901702432977e-07, "loss": 0.8408, "step": 12690 }, { "epoch": 0.7866526211426447, "grad_norm": 1.5794585943222046, "learning_rate": 2.2464288043832756e-07, "loss": 0.8316, "step": 12695 }, { "epoch": 0.7869624488784236, "grad_norm": 1.7832512855529785, "learning_rate": 2.2431674385232535e-07, "loss": 0.8326, "step": 12700 }, { "epoch": 0.7872722766142025, "grad_norm": 1.5331518650054932, "learning_rate": 2.2399060726632313e-07, "loss": 0.7877, "step": 12705 }, { "epoch": 0.7875821043499814, "grad_norm": 1.6355725526809692, "learning_rate": 2.236644706803209e-07, "loss": 0.8809, "step": 12710 }, { "epoch": 0.7878919320857604, "grad_norm": 1.6083929538726807, "learning_rate": 2.2333833409431869e-07, "loss": 0.8941, "step": 12715 }, { "epoch": 0.7882017598215392, "grad_norm": 1.6512799263000488, "learning_rate": 2.2301219750831647e-07, "loss": 0.88, "step": 12720 }, { "epoch": 0.7885115875573181, "grad_norm": 1.6548364162445068, "learning_rate": 2.2268606092231426e-07, "loss": 0.7972, "step": 12725 }, { "epoch": 0.788821415293097, "grad_norm": 1.73285710811615, "learning_rate": 2.2235992433631203e-07, "loss": 0.9439, "step": 12730 }, { "epoch": 0.7891312430288759, "grad_norm": 1.699511170387268, "learning_rate": 2.2203378775030981e-07, "loss": 0.7976, "step": 12735 }, { "epoch": 0.7894410707646549, "grad_norm": 1.5702979564666748, "learning_rate": 2.217076511643076e-07, "loss": 0.7437, "step": 12740 }, { "epoch": 0.7897508985004338, "grad_norm": 1.3957620859146118, "learning_rate": 2.213815145783054e-07, "loss": 0.9025, "step": 12745 }, { "epoch": 0.7900607262362127, "grad_norm": 1.7834044694900513, "learning_rate": 2.2105537799230318e-07, "loss": 0.8464, "step": 12750 }, { "epoch": 0.7903705539719915, "grad_norm": 1.613224744796753, "learning_rate": 2.2072924140630094e-07, "loss": 0.7856, "step": 12755 }, { "epoch": 0.7906803817077704, "grad_norm": 1.7384365797042847, "learning_rate": 2.2040310482029873e-07, "loss": 0.8379, "step": 12760 }, { "epoch": 0.7909902094435494, "grad_norm": 1.6170567274093628, "learning_rate": 2.200769682342965e-07, "loss": 0.899, "step": 12765 }, { "epoch": 0.7913000371793283, "grad_norm": 1.9081264734268188, "learning_rate": 2.197508316482943e-07, "loss": 0.92, "step": 12770 }, { "epoch": 0.7916098649151072, "grad_norm": 1.5922707319259644, "learning_rate": 2.1942469506229207e-07, "loss": 0.8537, "step": 12775 }, { "epoch": 0.7919196926508861, "grad_norm": 1.5706474781036377, "learning_rate": 2.1909855847628986e-07, "loss": 0.8823, "step": 12780 }, { "epoch": 0.792229520386665, "grad_norm": 1.6409492492675781, "learning_rate": 2.1877242189028762e-07, "loss": 0.8322, "step": 12785 }, { "epoch": 0.792539348122444, "grad_norm": 1.5646263360977173, "learning_rate": 2.1844628530428544e-07, "loss": 0.8649, "step": 12790 }, { "epoch": 0.7928491758582228, "grad_norm": 1.9030438661575317, "learning_rate": 2.181201487182832e-07, "loss": 0.787, "step": 12795 }, { "epoch": 0.7931590035940017, "grad_norm": 1.6104735136032104, "learning_rate": 2.17794012132281e-07, "loss": 0.8153, "step": 12800 }, { "epoch": 0.7934688313297806, "grad_norm": 1.678658127784729, "learning_rate": 2.1746787554627878e-07, "loss": 0.7972, "step": 12805 }, { "epoch": 0.7937786590655596, "grad_norm": 1.8140196800231934, "learning_rate": 2.1714173896027654e-07, "loss": 0.8424, "step": 12810 }, { "epoch": 0.7940884868013385, "grad_norm": 1.6023298501968384, "learning_rate": 2.1681560237427436e-07, "loss": 0.7968, "step": 12815 }, { "epoch": 0.7943983145371174, "grad_norm": 1.8073829412460327, "learning_rate": 2.1648946578827212e-07, "loss": 0.8235, "step": 12820 }, { "epoch": 0.7947081422728963, "grad_norm": 1.4846640825271606, "learning_rate": 2.161633292022699e-07, "loss": 0.8622, "step": 12825 }, { "epoch": 0.7950179700086751, "grad_norm": 1.8393114805221558, "learning_rate": 2.1583719261626767e-07, "loss": 0.8954, "step": 12830 }, { "epoch": 0.7953277977444541, "grad_norm": 1.8470345735549927, "learning_rate": 2.1551105603026546e-07, "loss": 0.8613, "step": 12835 }, { "epoch": 0.795637625480233, "grad_norm": 1.86546790599823, "learning_rate": 2.1518491944426325e-07, "loss": 0.8211, "step": 12840 }, { "epoch": 0.7959474532160119, "grad_norm": 1.6740636825561523, "learning_rate": 2.1485878285826104e-07, "loss": 0.898, "step": 12845 }, { "epoch": 0.7962572809517908, "grad_norm": 1.6969316005706787, "learning_rate": 2.145326462722588e-07, "loss": 0.8741, "step": 12850 }, { "epoch": 0.7965671086875697, "grad_norm": 1.4516233205795288, "learning_rate": 2.142065096862566e-07, "loss": 0.7767, "step": 12855 }, { "epoch": 0.7968769364233487, "grad_norm": 1.6669203042984009, "learning_rate": 2.1388037310025435e-07, "loss": 0.8279, "step": 12860 }, { "epoch": 0.7971867641591275, "grad_norm": 1.7805978059768677, "learning_rate": 2.1355423651425217e-07, "loss": 0.8506, "step": 12865 }, { "epoch": 0.7974965918949064, "grad_norm": 1.9351732730865479, "learning_rate": 2.1322809992824993e-07, "loss": 0.8938, "step": 12870 }, { "epoch": 0.7978064196306853, "grad_norm": 1.753041386604309, "learning_rate": 2.1290196334224772e-07, "loss": 0.8334, "step": 12875 }, { "epoch": 0.7981162473664642, "grad_norm": 1.499678611755371, "learning_rate": 2.125758267562455e-07, "loss": 0.8788, "step": 12880 }, { "epoch": 0.7984260751022432, "grad_norm": 1.7672251462936401, "learning_rate": 2.122496901702433e-07, "loss": 0.8388, "step": 12885 }, { "epoch": 0.7987359028380221, "grad_norm": 1.5390257835388184, "learning_rate": 2.1192355358424109e-07, "loss": 0.7617, "step": 12890 }, { "epoch": 0.799045730573801, "grad_norm": 1.7963776588439941, "learning_rate": 2.1159741699823885e-07, "loss": 0.8475, "step": 12895 }, { "epoch": 0.7993555583095798, "grad_norm": 1.7433286905288696, "learning_rate": 2.1127128041223664e-07, "loss": 0.8732, "step": 12900 }, { "epoch": 0.7996653860453588, "grad_norm": 1.6032323837280273, "learning_rate": 2.109451438262344e-07, "loss": 0.8435, "step": 12905 }, { "epoch": 0.7999752137811377, "grad_norm": 1.787131667137146, "learning_rate": 2.1061900724023222e-07, "loss": 0.8298, "step": 12910 }, { "epoch": 0.8002850415169166, "grad_norm": 1.6311670541763306, "learning_rate": 2.1029287065422998e-07, "loss": 0.8268, "step": 12915 }, { "epoch": 0.8005948692526955, "grad_norm": 1.9277132749557495, "learning_rate": 2.0996673406822777e-07, "loss": 0.8315, "step": 12920 }, { "epoch": 0.8009046969884744, "grad_norm": 1.6794623136520386, "learning_rate": 2.0964059748222553e-07, "loss": 0.9232, "step": 12925 }, { "epoch": 0.8012145247242534, "grad_norm": 2.017181396484375, "learning_rate": 2.0931446089622332e-07, "loss": 0.8602, "step": 12930 }, { "epoch": 0.8015243524600322, "grad_norm": 1.7353490591049194, "learning_rate": 2.089883243102211e-07, "loss": 0.7961, "step": 12935 }, { "epoch": 0.8018341801958111, "grad_norm": 1.6019030809402466, "learning_rate": 2.086621877242189e-07, "loss": 0.8413, "step": 12940 }, { "epoch": 0.80214400793159, "grad_norm": 1.5651012659072876, "learning_rate": 2.0833605113821669e-07, "loss": 0.8072, "step": 12945 }, { "epoch": 0.8024538356673689, "grad_norm": 1.4187105894088745, "learning_rate": 2.0800991455221445e-07, "loss": 0.7751, "step": 12950 }, { "epoch": 0.8027636634031479, "grad_norm": 1.5140595436096191, "learning_rate": 2.0768377796621226e-07, "loss": 0.8108, "step": 12955 }, { "epoch": 0.8030734911389268, "grad_norm": 1.601493000984192, "learning_rate": 2.0735764138021003e-07, "loss": 0.8384, "step": 12960 }, { "epoch": 0.8033833188747057, "grad_norm": 1.7811522483825684, "learning_rate": 2.0703150479420781e-07, "loss": 0.8362, "step": 12965 }, { "epoch": 0.8036931466104845, "grad_norm": 1.7366353273391724, "learning_rate": 2.0670536820820558e-07, "loss": 0.8912, "step": 12970 }, { "epoch": 0.8040029743462634, "grad_norm": 1.7045339345932007, "learning_rate": 2.0637923162220337e-07, "loss": 0.908, "step": 12975 }, { "epoch": 0.8043128020820424, "grad_norm": 2.1573472023010254, "learning_rate": 2.0605309503620116e-07, "loss": 0.8697, "step": 12980 }, { "epoch": 0.8046226298178213, "grad_norm": 1.692740559577942, "learning_rate": 2.0572695845019894e-07, "loss": 0.8333, "step": 12985 }, { "epoch": 0.8049324575536002, "grad_norm": 1.731399655342102, "learning_rate": 2.054008218641967e-07, "loss": 0.9046, "step": 12990 }, { "epoch": 0.8052422852893791, "grad_norm": 1.6091338396072388, "learning_rate": 2.050746852781945e-07, "loss": 0.8527, "step": 12995 }, { "epoch": 0.8055521130251581, "grad_norm": 1.7906439304351807, "learning_rate": 2.0474854869219226e-07, "loss": 0.8542, "step": 13000 }, { "epoch": 0.8058619407609369, "grad_norm": 1.535165786743164, "learning_rate": 2.0442241210619007e-07, "loss": 0.8227, "step": 13005 }, { "epoch": 0.8061717684967158, "grad_norm": 1.5742608308792114, "learning_rate": 2.0409627552018786e-07, "loss": 0.8585, "step": 13010 }, { "epoch": 0.8064815962324947, "grad_norm": 1.7529376745224, "learning_rate": 2.0377013893418562e-07, "loss": 0.9073, "step": 13015 }, { "epoch": 0.8067914239682736, "grad_norm": 2.427605628967285, "learning_rate": 2.0344400234818341e-07, "loss": 0.8496, "step": 13020 }, { "epoch": 0.8071012517040526, "grad_norm": 1.8117610216140747, "learning_rate": 2.0311786576218118e-07, "loss": 0.9688, "step": 13025 }, { "epoch": 0.8074110794398315, "grad_norm": 1.8498177528381348, "learning_rate": 2.02791729176179e-07, "loss": 0.8359, "step": 13030 }, { "epoch": 0.8077209071756104, "grad_norm": 1.9357177019119263, "learning_rate": 2.0246559259017675e-07, "loss": 0.8784, "step": 13035 }, { "epoch": 0.8080307349113892, "grad_norm": 1.4396919012069702, "learning_rate": 2.0213945600417454e-07, "loss": 0.8074, "step": 13040 }, { "epoch": 0.8083405626471681, "grad_norm": 1.7626615762710571, "learning_rate": 2.018133194181723e-07, "loss": 0.8457, "step": 13045 }, { "epoch": 0.8086503903829471, "grad_norm": 1.5501993894577026, "learning_rate": 2.0148718283217012e-07, "loss": 0.8934, "step": 13050 }, { "epoch": 0.808960218118726, "grad_norm": 1.7040566205978394, "learning_rate": 2.0116104624616788e-07, "loss": 0.862, "step": 13055 }, { "epoch": 0.8092700458545049, "grad_norm": 1.6638156175613403, "learning_rate": 2.0083490966016567e-07, "loss": 0.8367, "step": 13060 }, { "epoch": 0.8095798735902838, "grad_norm": 1.660445213317871, "learning_rate": 2.0050877307416343e-07, "loss": 0.8679, "step": 13065 }, { "epoch": 0.8098897013260627, "grad_norm": 1.4091451168060303, "learning_rate": 2.0018263648816122e-07, "loss": 0.8282, "step": 13070 }, { "epoch": 0.8101995290618416, "grad_norm": 1.6242109537124634, "learning_rate": 1.99856499902159e-07, "loss": 0.8082, "step": 13075 }, { "epoch": 0.8105093567976205, "grad_norm": 1.5987792015075684, "learning_rate": 1.995303633161568e-07, "loss": 0.8597, "step": 13080 }, { "epoch": 0.8108191845333994, "grad_norm": 1.9972251653671265, "learning_rate": 1.992042267301546e-07, "loss": 0.8285, "step": 13085 }, { "epoch": 0.8111290122691783, "grad_norm": 1.7022807598114014, "learning_rate": 1.9887809014415235e-07, "loss": 0.8605, "step": 13090 }, { "epoch": 0.8114388400049573, "grad_norm": 1.5569117069244385, "learning_rate": 1.9855195355815014e-07, "loss": 0.8606, "step": 13095 }, { "epoch": 0.8117486677407362, "grad_norm": 1.6015551090240479, "learning_rate": 1.9822581697214793e-07, "loss": 0.8456, "step": 13100 }, { "epoch": 0.8120584954765151, "grad_norm": 1.5152802467346191, "learning_rate": 1.9789968038614572e-07, "loss": 0.876, "step": 13105 }, { "epoch": 0.812368323212294, "grad_norm": 1.748630166053772, "learning_rate": 1.9757354380014348e-07, "loss": 0.8526, "step": 13110 }, { "epoch": 0.8126781509480728, "grad_norm": 1.7923551797866821, "learning_rate": 1.9724740721414127e-07, "loss": 0.8226, "step": 13115 }, { "epoch": 0.8129879786838518, "grad_norm": 1.7800803184509277, "learning_rate": 1.9692127062813906e-07, "loss": 0.8383, "step": 13120 }, { "epoch": 0.8132978064196307, "grad_norm": 1.560658574104309, "learning_rate": 1.9659513404213685e-07, "loss": 0.8485, "step": 13125 }, { "epoch": 0.8136076341554096, "grad_norm": 1.7288081645965576, "learning_rate": 1.962689974561346e-07, "loss": 0.8143, "step": 13130 }, { "epoch": 0.8139174618911885, "grad_norm": 1.6514371633529663, "learning_rate": 1.959428608701324e-07, "loss": 0.8376, "step": 13135 }, { "epoch": 0.8142272896269674, "grad_norm": 1.5603032112121582, "learning_rate": 1.9561672428413016e-07, "loss": 0.8566, "step": 13140 }, { "epoch": 0.8145371173627464, "grad_norm": 1.4777330160140991, "learning_rate": 1.9529058769812798e-07, "loss": 0.8361, "step": 13145 }, { "epoch": 0.8148469450985252, "grad_norm": 1.6687349081039429, "learning_rate": 1.9496445111212577e-07, "loss": 0.8056, "step": 13150 }, { "epoch": 0.8151567728343041, "grad_norm": 1.5413614511489868, "learning_rate": 1.9463831452612353e-07, "loss": 0.8799, "step": 13155 }, { "epoch": 0.815466600570083, "grad_norm": 1.6362565755844116, "learning_rate": 1.9431217794012132e-07, "loss": 0.8456, "step": 13160 }, { "epoch": 0.815776428305862, "grad_norm": 1.453336477279663, "learning_rate": 1.9398604135411908e-07, "loss": 0.815, "step": 13165 }, { "epoch": 0.8160862560416409, "grad_norm": 1.6359282732009888, "learning_rate": 1.936599047681169e-07, "loss": 0.8923, "step": 13170 }, { "epoch": 0.8163960837774198, "grad_norm": 1.521884560585022, "learning_rate": 1.9333376818211466e-07, "loss": 0.8706, "step": 13175 }, { "epoch": 0.8167059115131987, "grad_norm": 1.5754231214523315, "learning_rate": 1.9300763159611245e-07, "loss": 0.8655, "step": 13180 }, { "epoch": 0.8170157392489775, "grad_norm": 1.861711025238037, "learning_rate": 1.926814950101102e-07, "loss": 0.8894, "step": 13185 }, { "epoch": 0.8173255669847564, "grad_norm": 1.4133740663528442, "learning_rate": 1.92355358424108e-07, "loss": 0.8231, "step": 13190 }, { "epoch": 0.8176353947205354, "grad_norm": 2.0964176654815674, "learning_rate": 1.920292218381058e-07, "loss": 0.8615, "step": 13195 }, { "epoch": 0.8179452224563143, "grad_norm": 1.6045619249343872, "learning_rate": 1.9170308525210358e-07, "loss": 0.8978, "step": 13200 }, { "epoch": 0.8182550501920932, "grad_norm": 1.788089632987976, "learning_rate": 1.9137694866610134e-07, "loss": 0.8646, "step": 13205 }, { "epoch": 0.8185648779278721, "grad_norm": 1.661616325378418, "learning_rate": 1.9105081208009913e-07, "loss": 0.8135, "step": 13210 }, { "epoch": 0.8188747056636511, "grad_norm": 1.7263522148132324, "learning_rate": 1.9072467549409694e-07, "loss": 0.8371, "step": 13215 }, { "epoch": 0.8191845333994299, "grad_norm": 1.4636945724487305, "learning_rate": 1.903985389080947e-07, "loss": 0.8228, "step": 13220 }, { "epoch": 0.8194943611352088, "grad_norm": 1.6862614154815674, "learning_rate": 1.900724023220925e-07, "loss": 0.8521, "step": 13225 }, { "epoch": 0.8198041888709877, "grad_norm": 1.5454585552215576, "learning_rate": 1.8974626573609026e-07, "loss": 0.8171, "step": 13230 }, { "epoch": 0.8201140166067666, "grad_norm": 1.5220472812652588, "learning_rate": 1.8942012915008805e-07, "loss": 0.8045, "step": 13235 }, { "epoch": 0.8204238443425456, "grad_norm": 1.6306706666946411, "learning_rate": 1.8909399256408584e-07, "loss": 0.8653, "step": 13240 }, { "epoch": 0.8207336720783245, "grad_norm": 1.3600419759750366, "learning_rate": 1.8876785597808363e-07, "loss": 0.7818, "step": 13245 }, { "epoch": 0.8210434998141034, "grad_norm": 1.7418010234832764, "learning_rate": 1.884417193920814e-07, "loss": 0.8725, "step": 13250 }, { "epoch": 0.8213533275498822, "grad_norm": 1.472337007522583, "learning_rate": 1.8811558280607918e-07, "loss": 0.8193, "step": 13255 }, { "epoch": 0.8216631552856611, "grad_norm": 1.576775312423706, "learning_rate": 1.8778944622007694e-07, "loss": 0.8549, "step": 13260 }, { "epoch": 0.8219729830214401, "grad_norm": 1.702983021736145, "learning_rate": 1.8746330963407475e-07, "loss": 0.8276, "step": 13265 }, { "epoch": 0.822282810757219, "grad_norm": 1.659605860710144, "learning_rate": 1.8713717304807252e-07, "loss": 0.8589, "step": 13270 }, { "epoch": 0.8225926384929979, "grad_norm": 1.483527660369873, "learning_rate": 1.868110364620703e-07, "loss": 0.8231, "step": 13275 }, { "epoch": 0.8229024662287768, "grad_norm": 1.7912235260009766, "learning_rate": 1.8648489987606807e-07, "loss": 0.8794, "step": 13280 }, { "epoch": 0.8232122939645558, "grad_norm": 1.7573151588439941, "learning_rate": 1.8615876329006588e-07, "loss": 0.8448, "step": 13285 }, { "epoch": 0.8235221217003346, "grad_norm": 1.6768492460250854, "learning_rate": 1.8583262670406367e-07, "loss": 0.895, "step": 13290 }, { "epoch": 0.8238319494361135, "grad_norm": 1.7509632110595703, "learning_rate": 1.8550649011806144e-07, "loss": 0.883, "step": 13295 }, { "epoch": 0.8241417771718924, "grad_norm": 2.0281455516815186, "learning_rate": 1.8518035353205922e-07, "loss": 0.8475, "step": 13300 }, { "epoch": 0.8244516049076713, "grad_norm": 1.6603338718414307, "learning_rate": 1.8485421694605699e-07, "loss": 0.8672, "step": 13305 }, { "epoch": 0.8247614326434503, "grad_norm": 1.8322370052337646, "learning_rate": 1.845280803600548e-07, "loss": 0.8216, "step": 13310 }, { "epoch": 0.8250712603792292, "grad_norm": 1.5981428623199463, "learning_rate": 1.8420194377405256e-07, "loss": 0.874, "step": 13315 }, { "epoch": 0.8253810881150081, "grad_norm": 1.6096409559249878, "learning_rate": 1.8387580718805035e-07, "loss": 0.8267, "step": 13320 }, { "epoch": 0.8256909158507869, "grad_norm": 1.7041516304016113, "learning_rate": 1.8354967060204812e-07, "loss": 0.8608, "step": 13325 }, { "epoch": 0.8260007435865658, "grad_norm": 1.598902702331543, "learning_rate": 1.832235340160459e-07, "loss": 0.7853, "step": 13330 }, { "epoch": 0.8263105713223448, "grad_norm": 1.6309785842895508, "learning_rate": 1.828973974300437e-07, "loss": 0.89, "step": 13335 }, { "epoch": 0.8266203990581237, "grad_norm": 1.5240240097045898, "learning_rate": 1.8257126084404148e-07, "loss": 0.8442, "step": 13340 }, { "epoch": 0.8269302267939026, "grad_norm": 1.6953092813491821, "learning_rate": 1.8224512425803925e-07, "loss": 0.8202, "step": 13345 }, { "epoch": 0.8272400545296815, "grad_norm": 1.7699304819107056, "learning_rate": 1.8191898767203703e-07, "loss": 0.9172, "step": 13350 }, { "epoch": 0.8275498822654604, "grad_norm": 1.9079238176345825, "learning_rate": 1.815928510860348e-07, "loss": 0.8883, "step": 13355 }, { "epoch": 0.8278597100012393, "grad_norm": 1.711847186088562, "learning_rate": 1.812667145000326e-07, "loss": 0.869, "step": 13360 }, { "epoch": 0.8281695377370182, "grad_norm": 1.9339678287506104, "learning_rate": 1.809405779140304e-07, "loss": 0.8089, "step": 13365 }, { "epoch": 0.8284793654727971, "grad_norm": 1.6180475950241089, "learning_rate": 1.8061444132802816e-07, "loss": 0.8604, "step": 13370 }, { "epoch": 0.828789193208576, "grad_norm": 1.6807856559753418, "learning_rate": 1.8028830474202595e-07, "loss": 0.8606, "step": 13375 }, { "epoch": 0.829099020944355, "grad_norm": 1.5525871515274048, "learning_rate": 1.7996216815602374e-07, "loss": 0.839, "step": 13380 }, { "epoch": 0.8294088486801339, "grad_norm": 1.8563975095748901, "learning_rate": 1.7963603157002153e-07, "loss": 0.9199, "step": 13385 }, { "epoch": 0.8297186764159128, "grad_norm": 1.614949345588684, "learning_rate": 1.793098949840193e-07, "loss": 0.8277, "step": 13390 }, { "epoch": 0.8300285041516916, "grad_norm": 1.6126703023910522, "learning_rate": 1.7898375839801708e-07, "loss": 0.8262, "step": 13395 }, { "epoch": 0.8303383318874705, "grad_norm": 1.8507457971572876, "learning_rate": 1.7865762181201484e-07, "loss": 0.8756, "step": 13400 }, { "epoch": 0.8306481596232494, "grad_norm": 1.6920008659362793, "learning_rate": 1.7833148522601266e-07, "loss": 0.804, "step": 13405 }, { "epoch": 0.8309579873590284, "grad_norm": 1.5415260791778564, "learning_rate": 1.7800534864001042e-07, "loss": 0.8464, "step": 13410 }, { "epoch": 0.8312678150948073, "grad_norm": 1.5903222560882568, "learning_rate": 1.776792120540082e-07, "loss": 0.859, "step": 13415 }, { "epoch": 0.8315776428305862, "grad_norm": 1.7165193557739258, "learning_rate": 1.7735307546800597e-07, "loss": 0.8823, "step": 13420 }, { "epoch": 0.8318874705663651, "grad_norm": 1.928329348564148, "learning_rate": 1.7702693888200376e-07, "loss": 0.8578, "step": 13425 }, { "epoch": 0.8321972983021441, "grad_norm": 1.6738739013671875, "learning_rate": 1.7670080229600158e-07, "loss": 0.8423, "step": 13430 }, { "epoch": 0.8325071260379229, "grad_norm": 2.068681240081787, "learning_rate": 1.7637466570999934e-07, "loss": 0.9073, "step": 13435 }, { "epoch": 0.8328169537737018, "grad_norm": 1.7965190410614014, "learning_rate": 1.7604852912399713e-07, "loss": 0.8362, "step": 13440 }, { "epoch": 0.8331267815094807, "grad_norm": 1.7469345331192017, "learning_rate": 1.757223925379949e-07, "loss": 0.9101, "step": 13445 }, { "epoch": 0.8334366092452596, "grad_norm": 1.7950725555419922, "learning_rate": 1.753962559519927e-07, "loss": 0.899, "step": 13450 }, { "epoch": 0.8337464369810386, "grad_norm": 1.9964932203292847, "learning_rate": 1.7507011936599047e-07, "loss": 0.8156, "step": 13455 }, { "epoch": 0.8340562647168175, "grad_norm": 1.7142348289489746, "learning_rate": 1.7474398277998826e-07, "loss": 0.8332, "step": 13460 }, { "epoch": 0.8343660924525964, "grad_norm": 1.6240555047988892, "learning_rate": 1.7441784619398602e-07, "loss": 0.889, "step": 13465 }, { "epoch": 0.8346759201883752, "grad_norm": 1.7697734832763672, "learning_rate": 1.740917096079838e-07, "loss": 0.8327, "step": 13470 }, { "epoch": 0.8349857479241541, "grad_norm": 1.7346786260604858, "learning_rate": 1.737655730219816e-07, "loss": 0.9012, "step": 13475 }, { "epoch": 0.8352955756599331, "grad_norm": 1.5951128005981445, "learning_rate": 1.734394364359794e-07, "loss": 0.7603, "step": 13480 }, { "epoch": 0.835605403395712, "grad_norm": 1.6518081426620483, "learning_rate": 1.7311329984997715e-07, "loss": 0.8964, "step": 13485 }, { "epoch": 0.8359152311314909, "grad_norm": 1.7003906965255737, "learning_rate": 1.7278716326397494e-07, "loss": 0.832, "step": 13490 }, { "epoch": 0.8362250588672698, "grad_norm": 2.010371446609497, "learning_rate": 1.7246102667797273e-07, "loss": 0.8698, "step": 13495 }, { "epoch": 0.8365348866030488, "grad_norm": 1.5988811254501343, "learning_rate": 1.7213489009197052e-07, "loss": 0.845, "step": 13500 }, { "epoch": 0.8368447143388276, "grad_norm": 1.6332695484161377, "learning_rate": 1.718087535059683e-07, "loss": 0.7993, "step": 13505 }, { "epoch": 0.8371545420746065, "grad_norm": 1.8710790872573853, "learning_rate": 1.7148261691996607e-07, "loss": 0.8512, "step": 13510 }, { "epoch": 0.8374643698103854, "grad_norm": 1.642177700996399, "learning_rate": 1.7115648033396386e-07, "loss": 0.8398, "step": 13515 }, { "epoch": 0.8377741975461643, "grad_norm": 1.4716213941574097, "learning_rate": 1.7083034374796162e-07, "loss": 0.7976, "step": 13520 }, { "epoch": 0.8380840252819433, "grad_norm": 1.747169017791748, "learning_rate": 1.7050420716195944e-07, "loss": 0.8804, "step": 13525 }, { "epoch": 0.8383938530177222, "grad_norm": 1.437878966331482, "learning_rate": 1.701780705759572e-07, "loss": 0.7978, "step": 13530 }, { "epoch": 0.8387036807535011, "grad_norm": 1.7289263010025024, "learning_rate": 1.6985193398995499e-07, "loss": 0.8649, "step": 13535 }, { "epoch": 0.8390135084892799, "grad_norm": 1.4790385961532593, "learning_rate": 1.6952579740395275e-07, "loss": 0.8489, "step": 13540 }, { "epoch": 0.8393233362250588, "grad_norm": 1.7089262008666992, "learning_rate": 1.6919966081795056e-07, "loss": 0.8315, "step": 13545 }, { "epoch": 0.8396331639608378, "grad_norm": 1.5812492370605469, "learning_rate": 1.6887352423194833e-07, "loss": 0.8566, "step": 13550 }, { "epoch": 0.8399429916966167, "grad_norm": 1.6407097578048706, "learning_rate": 1.6854738764594612e-07, "loss": 0.818, "step": 13555 }, { "epoch": 0.8402528194323956, "grad_norm": 1.6003669500350952, "learning_rate": 1.682212510599439e-07, "loss": 0.8233, "step": 13560 }, { "epoch": 0.8405626471681745, "grad_norm": 1.7322324514389038, "learning_rate": 1.6789511447394167e-07, "loss": 0.8899, "step": 13565 }, { "epoch": 0.8408724749039534, "grad_norm": 1.656822681427002, "learning_rate": 1.6756897788793948e-07, "loss": 0.8326, "step": 13570 }, { "epoch": 0.8411823026397323, "grad_norm": 1.641059398651123, "learning_rate": 1.6724284130193725e-07, "loss": 0.8495, "step": 13575 }, { "epoch": 0.8414921303755112, "grad_norm": 1.6705242395401, "learning_rate": 1.6691670471593503e-07, "loss": 0.801, "step": 13580 }, { "epoch": 0.8418019581112901, "grad_norm": 1.6412522792816162, "learning_rate": 1.665905681299328e-07, "loss": 0.8446, "step": 13585 }, { "epoch": 0.842111785847069, "grad_norm": 1.665282130241394, "learning_rate": 1.6626443154393059e-07, "loss": 0.8229, "step": 13590 }, { "epoch": 0.842421613582848, "grad_norm": 1.6264300346374512, "learning_rate": 1.6593829495792837e-07, "loss": 0.8378, "step": 13595 }, { "epoch": 0.8427314413186269, "grad_norm": 1.563162922859192, "learning_rate": 1.6561215837192616e-07, "loss": 0.7677, "step": 13600 }, { "epoch": 0.8430412690544058, "grad_norm": 1.8775357007980347, "learning_rate": 1.6528602178592393e-07, "loss": 0.8292, "step": 13605 }, { "epoch": 0.8433510967901846, "grad_norm": 1.7601896524429321, "learning_rate": 1.6495988519992171e-07, "loss": 0.8792, "step": 13610 }, { "epoch": 0.8436609245259635, "grad_norm": 1.668433427810669, "learning_rate": 1.646337486139195e-07, "loss": 0.815, "step": 13615 }, { "epoch": 0.8439707522617425, "grad_norm": 1.339528203010559, "learning_rate": 1.643076120279173e-07, "loss": 0.746, "step": 13620 }, { "epoch": 0.8442805799975214, "grad_norm": 1.6448450088500977, "learning_rate": 1.6398147544191506e-07, "loss": 0.7945, "step": 13625 }, { "epoch": 0.8445904077333003, "grad_norm": 1.798409342765808, "learning_rate": 1.6365533885591284e-07, "loss": 0.8299, "step": 13630 }, { "epoch": 0.8449002354690792, "grad_norm": 1.8548122644424438, "learning_rate": 1.6332920226991063e-07, "loss": 0.8564, "step": 13635 }, { "epoch": 0.8452100632048581, "grad_norm": 1.6766856908798218, "learning_rate": 1.6300306568390842e-07, "loss": 0.843, "step": 13640 }, { "epoch": 0.845519890940637, "grad_norm": 1.778984785079956, "learning_rate": 1.626769290979062e-07, "loss": 0.8288, "step": 13645 }, { "epoch": 0.8458297186764159, "grad_norm": 1.6141608953475952, "learning_rate": 1.6235079251190397e-07, "loss": 0.8514, "step": 13650 }, { "epoch": 0.8461395464121948, "grad_norm": 1.822495698928833, "learning_rate": 1.6202465592590176e-07, "loss": 0.8744, "step": 13655 }, { "epoch": 0.8464493741479737, "grad_norm": 1.6846047639846802, "learning_rate": 1.6169851933989952e-07, "loss": 0.8322, "step": 13660 }, { "epoch": 0.8467592018837526, "grad_norm": 1.5877001285552979, "learning_rate": 1.6137238275389734e-07, "loss": 0.9096, "step": 13665 }, { "epoch": 0.8470690296195316, "grad_norm": 1.662859320640564, "learning_rate": 1.610462461678951e-07, "loss": 0.8392, "step": 13670 }, { "epoch": 0.8473788573553105, "grad_norm": 1.636300802230835, "learning_rate": 1.607201095818929e-07, "loss": 0.8395, "step": 13675 }, { "epoch": 0.8476886850910893, "grad_norm": 1.4073619842529297, "learning_rate": 1.6039397299589065e-07, "loss": 0.8767, "step": 13680 }, { "epoch": 0.8479985128268682, "grad_norm": 1.5351046323776245, "learning_rate": 1.6006783640988844e-07, "loss": 0.8056, "step": 13685 }, { "epoch": 0.8483083405626471, "grad_norm": 1.4543538093566895, "learning_rate": 1.5974169982388623e-07, "loss": 0.8801, "step": 13690 }, { "epoch": 0.8486181682984261, "grad_norm": 1.6026262044906616, "learning_rate": 1.5941556323788402e-07, "loss": 0.7981, "step": 13695 }, { "epoch": 0.848927996034205, "grad_norm": 1.4449512958526611, "learning_rate": 1.590894266518818e-07, "loss": 0.8566, "step": 13700 }, { "epoch": 0.8492378237699839, "grad_norm": 1.5872776508331299, "learning_rate": 1.5876329006587957e-07, "loss": 0.8379, "step": 13705 }, { "epoch": 0.8495476515057628, "grad_norm": 2.1885321140289307, "learning_rate": 1.584371534798774e-07, "loss": 0.8618, "step": 13710 }, { "epoch": 0.8498574792415418, "grad_norm": 1.77052903175354, "learning_rate": 1.5811101689387515e-07, "loss": 0.8583, "step": 13715 }, { "epoch": 0.8501673069773206, "grad_norm": 1.8198717832565308, "learning_rate": 1.5778488030787294e-07, "loss": 0.7679, "step": 13720 }, { "epoch": 0.8504771347130995, "grad_norm": 1.6372439861297607, "learning_rate": 1.574587437218707e-07, "loss": 0.8907, "step": 13725 }, { "epoch": 0.8507869624488784, "grad_norm": 1.3631607294082642, "learning_rate": 1.571326071358685e-07, "loss": 0.7629, "step": 13730 }, { "epoch": 0.8510967901846573, "grad_norm": 1.835263729095459, "learning_rate": 1.5680647054986628e-07, "loss": 0.8166, "step": 13735 }, { "epoch": 0.8514066179204363, "grad_norm": 1.909766435623169, "learning_rate": 1.5648033396386407e-07, "loss": 0.8579, "step": 13740 }, { "epoch": 0.8517164456562152, "grad_norm": 1.6318901777267456, "learning_rate": 1.5615419737786183e-07, "loss": 0.7764, "step": 13745 }, { "epoch": 0.8520262733919941, "grad_norm": 1.689273476600647, "learning_rate": 1.5582806079185962e-07, "loss": 0.8757, "step": 13750 }, { "epoch": 0.8523361011277729, "grad_norm": 1.7394286394119263, "learning_rate": 1.5550192420585738e-07, "loss": 0.8256, "step": 13755 }, { "epoch": 0.8526459288635518, "grad_norm": 1.5999809503555298, "learning_rate": 1.551757876198552e-07, "loss": 0.7965, "step": 13760 }, { "epoch": 0.8529557565993308, "grad_norm": 1.5597681999206543, "learning_rate": 1.54849651033853e-07, "loss": 0.8475, "step": 13765 }, { "epoch": 0.8532655843351097, "grad_norm": 2.832636594772339, "learning_rate": 1.5452351444785075e-07, "loss": 0.8657, "step": 13770 }, { "epoch": 0.8535754120708886, "grad_norm": 1.7357838153839111, "learning_rate": 1.5419737786184854e-07, "loss": 0.844, "step": 13775 }, { "epoch": 0.8538852398066675, "grad_norm": 1.74403715133667, "learning_rate": 1.5387124127584633e-07, "loss": 0.8056, "step": 13780 }, { "epoch": 0.8541950675424465, "grad_norm": 1.6899957656860352, "learning_rate": 1.5354510468984412e-07, "loss": 0.8877, "step": 13785 }, { "epoch": 0.8545048952782253, "grad_norm": 1.6946839094161987, "learning_rate": 1.5321896810384188e-07, "loss": 0.8582, "step": 13790 }, { "epoch": 0.8548147230140042, "grad_norm": 1.876451849937439, "learning_rate": 1.5289283151783967e-07, "loss": 0.8405, "step": 13795 }, { "epoch": 0.8551245507497831, "grad_norm": 1.5253676176071167, "learning_rate": 1.5256669493183743e-07, "loss": 0.8281, "step": 13800 }, { "epoch": 0.855434378485562, "grad_norm": 1.7547944784164429, "learning_rate": 1.5224055834583525e-07, "loss": 0.9025, "step": 13805 }, { "epoch": 0.855744206221341, "grad_norm": 1.6899162530899048, "learning_rate": 1.51914421759833e-07, "loss": 0.8217, "step": 13810 }, { "epoch": 0.8560540339571199, "grad_norm": 1.5918039083480835, "learning_rate": 1.515882851738308e-07, "loss": 0.8671, "step": 13815 }, { "epoch": 0.8563638616928988, "grad_norm": 1.6314395666122437, "learning_rate": 1.5126214858782856e-07, "loss": 0.8756, "step": 13820 }, { "epoch": 0.8566736894286776, "grad_norm": 1.7302895784378052, "learning_rate": 1.5093601200182635e-07, "loss": 0.8938, "step": 13825 }, { "epoch": 0.8569835171644565, "grad_norm": 1.5394595861434937, "learning_rate": 1.5060987541582414e-07, "loss": 0.8313, "step": 13830 }, { "epoch": 0.8572933449002355, "grad_norm": 1.910516619682312, "learning_rate": 1.5028373882982193e-07, "loss": 0.8513, "step": 13835 }, { "epoch": 0.8576031726360144, "grad_norm": 1.5431157350540161, "learning_rate": 1.4995760224381972e-07, "loss": 0.8429, "step": 13840 }, { "epoch": 0.8579130003717933, "grad_norm": 1.6614673137664795, "learning_rate": 1.4963146565781748e-07, "loss": 0.8092, "step": 13845 }, { "epoch": 0.8582228281075722, "grad_norm": 1.659609317779541, "learning_rate": 1.4930532907181527e-07, "loss": 0.8721, "step": 13850 }, { "epoch": 0.8585326558433511, "grad_norm": 1.6264476776123047, "learning_rate": 1.4897919248581306e-07, "loss": 0.8615, "step": 13855 }, { "epoch": 0.85884248357913, "grad_norm": 1.6512616872787476, "learning_rate": 1.4865305589981084e-07, "loss": 0.8953, "step": 13860 }, { "epoch": 0.8591523113149089, "grad_norm": 1.6452410221099854, "learning_rate": 1.483269193138086e-07, "loss": 0.8065, "step": 13865 }, { "epoch": 0.8594621390506878, "grad_norm": 1.5861023664474487, "learning_rate": 1.480007827278064e-07, "loss": 0.8352, "step": 13870 }, { "epoch": 0.8597719667864667, "grad_norm": 1.6485716104507446, "learning_rate": 1.4767464614180418e-07, "loss": 0.7928, "step": 13875 }, { "epoch": 0.8600817945222456, "grad_norm": 2.782316207885742, "learning_rate": 1.4734850955580197e-07, "loss": 0.8747, "step": 13880 }, { "epoch": 0.8603916222580246, "grad_norm": 1.5479017496109009, "learning_rate": 1.4702237296979974e-07, "loss": 0.8055, "step": 13885 }, { "epoch": 0.8607014499938035, "grad_norm": 1.6835722923278809, "learning_rate": 1.4669623638379753e-07, "loss": 0.8611, "step": 13890 }, { "epoch": 0.8610112777295823, "grad_norm": 1.5586516857147217, "learning_rate": 1.463700997977953e-07, "loss": 0.7915, "step": 13895 }, { "epoch": 0.8613211054653612, "grad_norm": 1.7835501432418823, "learning_rate": 1.460439632117931e-07, "loss": 0.8276, "step": 13900 }, { "epoch": 0.8616309332011401, "grad_norm": 1.7082431316375732, "learning_rate": 1.457178266257909e-07, "loss": 0.8326, "step": 13905 }, { "epoch": 0.8619407609369191, "grad_norm": 1.9046393632888794, "learning_rate": 1.4539169003978865e-07, "loss": 0.8777, "step": 13910 }, { "epoch": 0.862250588672698, "grad_norm": 1.7894160747528076, "learning_rate": 1.4506555345378644e-07, "loss": 0.7925, "step": 13915 }, { "epoch": 0.8625604164084769, "grad_norm": 1.6737068891525269, "learning_rate": 1.447394168677842e-07, "loss": 0.8965, "step": 13920 }, { "epoch": 0.8628702441442558, "grad_norm": 1.7155985832214355, "learning_rate": 1.4441328028178202e-07, "loss": 0.875, "step": 13925 }, { "epoch": 0.8631800718800346, "grad_norm": 1.6402175426483154, "learning_rate": 1.4408714369577978e-07, "loss": 0.9051, "step": 13930 }, { "epoch": 0.8634898996158136, "grad_norm": 2.005568504333496, "learning_rate": 1.4376100710977757e-07, "loss": 0.8473, "step": 13935 }, { "epoch": 0.8637997273515925, "grad_norm": 1.8450716733932495, "learning_rate": 1.4343487052377534e-07, "loss": 0.8293, "step": 13940 }, { "epoch": 0.8641095550873714, "grad_norm": 1.5369166135787964, "learning_rate": 1.4310873393777315e-07, "loss": 0.7955, "step": 13945 }, { "epoch": 0.8644193828231503, "grad_norm": 1.6767973899841309, "learning_rate": 1.427825973517709e-07, "loss": 0.7868, "step": 13950 }, { "epoch": 0.8647292105589293, "grad_norm": 1.83664071559906, "learning_rate": 1.424564607657687e-07, "loss": 0.837, "step": 13955 }, { "epoch": 0.8650390382947082, "grad_norm": 1.6923421621322632, "learning_rate": 1.4213032417976646e-07, "loss": 0.8449, "step": 13960 }, { "epoch": 0.865348866030487, "grad_norm": 1.5264493227005005, "learning_rate": 1.4180418759376425e-07, "loss": 0.8425, "step": 13965 }, { "epoch": 0.8656586937662659, "grad_norm": 1.754630446434021, "learning_rate": 1.4147805100776207e-07, "loss": 0.869, "step": 13970 }, { "epoch": 0.8659685215020448, "grad_norm": 1.6138989925384521, "learning_rate": 1.4115191442175983e-07, "loss": 0.8414, "step": 13975 }, { "epoch": 0.8662783492378238, "grad_norm": 1.621018409729004, "learning_rate": 1.4082577783575762e-07, "loss": 0.8287, "step": 13980 }, { "epoch": 0.8665881769736027, "grad_norm": 1.6425983905792236, "learning_rate": 1.4049964124975538e-07, "loss": 0.8337, "step": 13985 }, { "epoch": 0.8668980047093816, "grad_norm": 2.0240654945373535, "learning_rate": 1.4017350466375317e-07, "loss": 0.8479, "step": 13990 }, { "epoch": 0.8672078324451605, "grad_norm": 1.74330472946167, "learning_rate": 1.3984736807775096e-07, "loss": 0.8733, "step": 13995 }, { "epoch": 0.8675176601809393, "grad_norm": 1.7315603494644165, "learning_rate": 1.3952123149174875e-07, "loss": 0.8755, "step": 14000 }, { "epoch": 0.8678274879167183, "grad_norm": 1.902814269065857, "learning_rate": 1.391950949057465e-07, "loss": 0.795, "step": 14005 }, { "epoch": 0.8681373156524972, "grad_norm": 1.973343014717102, "learning_rate": 1.388689583197443e-07, "loss": 0.8791, "step": 14010 }, { "epoch": 0.8684471433882761, "grad_norm": 1.6545647382736206, "learning_rate": 1.3854282173374206e-07, "loss": 0.8484, "step": 14015 }, { "epoch": 0.868756971124055, "grad_norm": 1.6021158695220947, "learning_rate": 1.3821668514773988e-07, "loss": 0.76, "step": 14020 }, { "epoch": 0.869066798859834, "grad_norm": 1.6649742126464844, "learning_rate": 1.3789054856173764e-07, "loss": 0.8213, "step": 14025 }, { "epoch": 0.8693766265956129, "grad_norm": 1.6551361083984375, "learning_rate": 1.3756441197573543e-07, "loss": 0.8357, "step": 14030 }, { "epoch": 0.8696864543313918, "grad_norm": 1.3286198377609253, "learning_rate": 1.372382753897332e-07, "loss": 0.7901, "step": 14035 }, { "epoch": 0.8699962820671706, "grad_norm": 1.5663964748382568, "learning_rate": 1.36912138803731e-07, "loss": 0.8539, "step": 14040 }, { "epoch": 0.8703061098029495, "grad_norm": 1.7881436347961426, "learning_rate": 1.365860022177288e-07, "loss": 0.8599, "step": 14045 }, { "epoch": 0.8706159375387285, "grad_norm": 1.8945622444152832, "learning_rate": 1.3625986563172656e-07, "loss": 0.8202, "step": 14050 }, { "epoch": 0.8709257652745074, "grad_norm": 1.5149836540222168, "learning_rate": 1.3593372904572435e-07, "loss": 0.7922, "step": 14055 }, { "epoch": 0.8712355930102863, "grad_norm": 2.2090203762054443, "learning_rate": 1.356075924597221e-07, "loss": 0.8298, "step": 14060 }, { "epoch": 0.8715454207460652, "grad_norm": 1.586159110069275, "learning_rate": 1.3528145587371993e-07, "loss": 0.8249, "step": 14065 }, { "epoch": 0.8718552484818441, "grad_norm": 1.5271506309509277, "learning_rate": 1.349553192877177e-07, "loss": 0.8022, "step": 14070 }, { "epoch": 0.872165076217623, "grad_norm": 1.7734278440475464, "learning_rate": 1.3462918270171548e-07, "loss": 0.9061, "step": 14075 }, { "epoch": 0.8724749039534019, "grad_norm": 1.672573208808899, "learning_rate": 1.3430304611571324e-07, "loss": 0.8033, "step": 14080 }, { "epoch": 0.8727847316891808, "grad_norm": 1.8278236389160156, "learning_rate": 1.3397690952971103e-07, "loss": 0.8081, "step": 14085 }, { "epoch": 0.8730945594249597, "grad_norm": 1.8538453578948975, "learning_rate": 1.3365077294370882e-07, "loss": 0.7896, "step": 14090 }, { "epoch": 0.8734043871607386, "grad_norm": 1.8751733303070068, "learning_rate": 1.333246363577066e-07, "loss": 0.8409, "step": 14095 }, { "epoch": 0.8737142148965176, "grad_norm": 2.0932300090789795, "learning_rate": 1.3299849977170437e-07, "loss": 0.8704, "step": 14100 }, { "epoch": 0.8740240426322965, "grad_norm": 1.5855046510696411, "learning_rate": 1.3267236318570216e-07, "loss": 0.8286, "step": 14105 }, { "epoch": 0.8743338703680753, "grad_norm": 1.5459518432617188, "learning_rate": 1.3234622659969997e-07, "loss": 0.7827, "step": 14110 }, { "epoch": 0.8746436981038542, "grad_norm": 1.8667305707931519, "learning_rate": 1.3202009001369774e-07, "loss": 0.7881, "step": 14115 }, { "epoch": 0.8749535258396332, "grad_norm": 1.6658257246017456, "learning_rate": 1.3169395342769553e-07, "loss": 0.8808, "step": 14120 }, { "epoch": 0.8752633535754121, "grad_norm": 1.5750752687454224, "learning_rate": 1.313678168416933e-07, "loss": 0.8348, "step": 14125 }, { "epoch": 0.875573181311191, "grad_norm": 1.6285449266433716, "learning_rate": 1.3104168025569108e-07, "loss": 0.8685, "step": 14130 }, { "epoch": 0.8758830090469699, "grad_norm": 1.5329065322875977, "learning_rate": 1.3071554366968887e-07, "loss": 0.8725, "step": 14135 }, { "epoch": 0.8761928367827488, "grad_norm": 1.68254554271698, "learning_rate": 1.3038940708368665e-07, "loss": 0.8623, "step": 14140 }, { "epoch": 0.8765026645185277, "grad_norm": 1.9956636428833008, "learning_rate": 1.3006327049768442e-07, "loss": 0.8614, "step": 14145 }, { "epoch": 0.8768124922543066, "grad_norm": 1.6326061487197876, "learning_rate": 1.297371339116822e-07, "loss": 0.7792, "step": 14150 }, { "epoch": 0.8771223199900855, "grad_norm": 1.6823872327804565, "learning_rate": 1.2941099732567997e-07, "loss": 0.8283, "step": 14155 }, { "epoch": 0.8774321477258644, "grad_norm": 1.7733166217803955, "learning_rate": 1.2908486073967778e-07, "loss": 0.8156, "step": 14160 }, { "epoch": 0.8777419754616433, "grad_norm": 1.6468822956085205, "learning_rate": 1.2875872415367555e-07, "loss": 0.8117, "step": 14165 }, { "epoch": 0.8780518031974223, "grad_norm": 1.6444114446640015, "learning_rate": 1.2843258756767334e-07, "loss": 0.833, "step": 14170 }, { "epoch": 0.8783616309332012, "grad_norm": 1.8487497568130493, "learning_rate": 1.281064509816711e-07, "loss": 0.7913, "step": 14175 }, { "epoch": 0.87867145866898, "grad_norm": 1.4065208435058594, "learning_rate": 1.2778031439566889e-07, "loss": 0.7834, "step": 14180 }, { "epoch": 0.8789812864047589, "grad_norm": 1.7278927564620972, "learning_rate": 1.274541778096667e-07, "loss": 0.8506, "step": 14185 }, { "epoch": 0.8792911141405378, "grad_norm": 1.5856659412384033, "learning_rate": 1.2712804122366446e-07, "loss": 0.7748, "step": 14190 }, { "epoch": 0.8796009418763168, "grad_norm": 1.7066706418991089, "learning_rate": 1.2680190463766225e-07, "loss": 0.7948, "step": 14195 }, { "epoch": 0.8799107696120957, "grad_norm": 1.772167444229126, "learning_rate": 1.2647576805166002e-07, "loss": 0.8286, "step": 14200 }, { "epoch": 0.8802205973478746, "grad_norm": 1.9351725578308105, "learning_rate": 1.2614963146565783e-07, "loss": 0.838, "step": 14205 }, { "epoch": 0.8805304250836535, "grad_norm": 1.6698824167251587, "learning_rate": 1.258234948796556e-07, "loss": 0.8526, "step": 14210 }, { "epoch": 0.8808402528194323, "grad_norm": 1.5127384662628174, "learning_rate": 1.2549735829365338e-07, "loss": 0.9352, "step": 14215 }, { "epoch": 0.8811500805552113, "grad_norm": 1.8343921899795532, "learning_rate": 1.2517122170765115e-07, "loss": 0.8598, "step": 14220 }, { "epoch": 0.8814599082909902, "grad_norm": 1.5006872415542603, "learning_rate": 1.2484508512164893e-07, "loss": 0.8325, "step": 14225 }, { "epoch": 0.8817697360267691, "grad_norm": 1.7476037740707397, "learning_rate": 1.2451894853564672e-07, "loss": 0.8188, "step": 14230 }, { "epoch": 0.882079563762548, "grad_norm": 1.9292758703231812, "learning_rate": 1.241928119496445e-07, "loss": 0.8618, "step": 14235 }, { "epoch": 0.882389391498327, "grad_norm": 1.59522545337677, "learning_rate": 1.2386667536364227e-07, "loss": 0.8222, "step": 14240 }, { "epoch": 0.8826992192341059, "grad_norm": 1.609676480293274, "learning_rate": 1.2354053877764006e-07, "loss": 0.8197, "step": 14245 }, { "epoch": 0.8830090469698847, "grad_norm": 1.5464322566986084, "learning_rate": 1.2321440219163785e-07, "loss": 0.8823, "step": 14250 }, { "epoch": 0.8833188747056636, "grad_norm": 1.800057053565979, "learning_rate": 1.2288826560563564e-07, "loss": 0.8571, "step": 14255 }, { "epoch": 0.8836287024414425, "grad_norm": 1.6690224409103394, "learning_rate": 1.2256212901963343e-07, "loss": 0.7716, "step": 14260 }, { "epoch": 0.8839385301772215, "grad_norm": 1.5017708539962769, "learning_rate": 1.222359924336312e-07, "loss": 0.8182, "step": 14265 }, { "epoch": 0.8842483579130004, "grad_norm": 1.717024803161621, "learning_rate": 1.2190985584762898e-07, "loss": 0.8019, "step": 14270 }, { "epoch": 0.8845581856487793, "grad_norm": 1.6333608627319336, "learning_rate": 1.2158371926162677e-07, "loss": 0.8135, "step": 14275 }, { "epoch": 0.8848680133845582, "grad_norm": 1.6552391052246094, "learning_rate": 1.2125758267562453e-07, "loss": 0.81, "step": 14280 }, { "epoch": 0.885177841120337, "grad_norm": 1.7314380407333374, "learning_rate": 1.2093144608962232e-07, "loss": 0.8399, "step": 14285 }, { "epoch": 0.885487668856116, "grad_norm": 1.859310269355774, "learning_rate": 1.206053095036201e-07, "loss": 0.8361, "step": 14290 }, { "epoch": 0.8857974965918949, "grad_norm": 1.5952162742614746, "learning_rate": 1.202791729176179e-07, "loss": 0.8549, "step": 14295 }, { "epoch": 0.8861073243276738, "grad_norm": 1.567832589149475, "learning_rate": 1.1995303633161566e-07, "loss": 0.8521, "step": 14300 }, { "epoch": 0.8864171520634527, "grad_norm": 1.9202218055725098, "learning_rate": 1.1962689974561345e-07, "loss": 0.9226, "step": 14305 }, { "epoch": 0.8867269797992317, "grad_norm": 1.5701531171798706, "learning_rate": 1.1930076315961124e-07, "loss": 0.867, "step": 14310 }, { "epoch": 0.8870368075350106, "grad_norm": 1.7972238063812256, "learning_rate": 1.1897462657360902e-07, "loss": 0.8933, "step": 14315 }, { "epoch": 0.8873466352707894, "grad_norm": 1.5167312622070312, "learning_rate": 1.186484899876068e-07, "loss": 0.8004, "step": 14320 }, { "epoch": 0.8876564630065683, "grad_norm": 1.4621487855911255, "learning_rate": 1.1832235340160458e-07, "loss": 0.8192, "step": 14325 }, { "epoch": 0.8879662907423472, "grad_norm": 1.6829808950424194, "learning_rate": 1.1799621681560237e-07, "loss": 0.854, "step": 14330 }, { "epoch": 0.8882761184781262, "grad_norm": 1.8001614809036255, "learning_rate": 1.1767008022960015e-07, "loss": 0.8516, "step": 14335 }, { "epoch": 0.8885859462139051, "grad_norm": 1.7837231159210205, "learning_rate": 1.1734394364359793e-07, "loss": 0.8633, "step": 14340 }, { "epoch": 0.888895773949684, "grad_norm": 1.545413613319397, "learning_rate": 1.1701780705759571e-07, "loss": 0.847, "step": 14345 }, { "epoch": 0.8892056016854629, "grad_norm": 1.7151341438293457, "learning_rate": 1.166916704715935e-07, "loss": 0.912, "step": 14350 }, { "epoch": 0.8895154294212418, "grad_norm": 1.5460199117660522, "learning_rate": 1.1636553388559129e-07, "loss": 0.9069, "step": 14355 }, { "epoch": 0.8898252571570207, "grad_norm": 1.6000977754592896, "learning_rate": 1.1603939729958906e-07, "loss": 0.8621, "step": 14360 }, { "epoch": 0.8901350848927996, "grad_norm": 1.4001349210739136, "learning_rate": 1.1571326071358685e-07, "loss": 0.833, "step": 14365 }, { "epoch": 0.8904449126285785, "grad_norm": 1.7584794759750366, "learning_rate": 1.1538712412758463e-07, "loss": 0.8071, "step": 14370 }, { "epoch": 0.8907547403643574, "grad_norm": 1.5972888469696045, "learning_rate": 1.1506098754158242e-07, "loss": 0.808, "step": 14375 }, { "epoch": 0.8910645681001363, "grad_norm": 1.361055612564087, "learning_rate": 1.1473485095558019e-07, "loss": 0.8281, "step": 14380 }, { "epoch": 0.8913743958359153, "grad_norm": 1.4665303230285645, "learning_rate": 1.1440871436957797e-07, "loss": 0.8085, "step": 14385 }, { "epoch": 0.8916842235716942, "grad_norm": 1.6164666414260864, "learning_rate": 1.1408257778357576e-07, "loss": 0.8681, "step": 14390 }, { "epoch": 0.891994051307473, "grad_norm": 1.5655385255813599, "learning_rate": 1.1375644119757353e-07, "loss": 0.8256, "step": 14395 }, { "epoch": 0.8923038790432519, "grad_norm": 1.7220021486282349, "learning_rate": 1.1343030461157132e-07, "loss": 0.8333, "step": 14400 }, { "epoch": 0.8926137067790308, "grad_norm": 1.4833520650863647, "learning_rate": 1.131041680255691e-07, "loss": 0.7595, "step": 14405 }, { "epoch": 0.8929235345148098, "grad_norm": 1.5252054929733276, "learning_rate": 1.1277803143956687e-07, "loss": 0.7735, "step": 14410 }, { "epoch": 0.8932333622505887, "grad_norm": 1.602554440498352, "learning_rate": 1.1245189485356466e-07, "loss": 0.8359, "step": 14415 }, { "epoch": 0.8935431899863676, "grad_norm": 1.7785542011260986, "learning_rate": 1.1212575826756245e-07, "loss": 0.832, "step": 14420 }, { "epoch": 0.8938530177221465, "grad_norm": 1.982467532157898, "learning_rate": 1.1179962168156024e-07, "loss": 0.8693, "step": 14425 }, { "epoch": 0.8941628454579253, "grad_norm": 1.608056664466858, "learning_rate": 1.1147348509555802e-07, "loss": 0.8315, "step": 14430 }, { "epoch": 0.8944726731937043, "grad_norm": 1.4619832038879395, "learning_rate": 1.111473485095558e-07, "loss": 0.8431, "step": 14435 }, { "epoch": 0.8947825009294832, "grad_norm": 1.7303824424743652, "learning_rate": 1.1082121192355358e-07, "loss": 0.876, "step": 14440 }, { "epoch": 0.8950923286652621, "grad_norm": 1.935746669769287, "learning_rate": 1.1049507533755136e-07, "loss": 0.8503, "step": 14445 }, { "epoch": 0.895402156401041, "grad_norm": 1.9290927648544312, "learning_rate": 1.1016893875154915e-07, "loss": 0.854, "step": 14450 }, { "epoch": 0.89571198413682, "grad_norm": 1.7357560396194458, "learning_rate": 1.0984280216554692e-07, "loss": 0.8877, "step": 14455 }, { "epoch": 0.8960218118725989, "grad_norm": 1.5289498567581177, "learning_rate": 1.0951666557954471e-07, "loss": 0.8086, "step": 14460 }, { "epoch": 0.8963316396083777, "grad_norm": 1.5624992847442627, "learning_rate": 1.0919052899354249e-07, "loss": 0.8167, "step": 14465 }, { "epoch": 0.8966414673441566, "grad_norm": 1.4537614583969116, "learning_rate": 1.0886439240754027e-07, "loss": 0.8293, "step": 14470 }, { "epoch": 0.8969512950799355, "grad_norm": 1.481771469116211, "learning_rate": 1.0853825582153805e-07, "loss": 0.8406, "step": 14475 }, { "epoch": 0.8972611228157145, "grad_norm": 1.764453411102295, "learning_rate": 1.0821211923553583e-07, "loss": 0.8343, "step": 14480 }, { "epoch": 0.8975709505514934, "grad_norm": 1.6011227369308472, "learning_rate": 1.0788598264953363e-07, "loss": 0.8567, "step": 14485 }, { "epoch": 0.8978807782872723, "grad_norm": 1.8235403299331665, "learning_rate": 1.075598460635314e-07, "loss": 0.8868, "step": 14490 }, { "epoch": 0.8981906060230512, "grad_norm": 1.8698670864105225, "learning_rate": 1.0723370947752919e-07, "loss": 0.8091, "step": 14495 }, { "epoch": 0.89850043375883, "grad_norm": 1.844266414642334, "learning_rate": 1.0690757289152697e-07, "loss": 0.8163, "step": 14500 }, { "epoch": 0.898810261494609, "grad_norm": 1.4328010082244873, "learning_rate": 1.0658143630552476e-07, "loss": 0.7948, "step": 14505 }, { "epoch": 0.8991200892303879, "grad_norm": 1.7581537961959839, "learning_rate": 1.0625529971952253e-07, "loss": 0.8364, "step": 14510 }, { "epoch": 0.8994299169661668, "grad_norm": 1.734835147857666, "learning_rate": 1.0592916313352031e-07, "loss": 0.8522, "step": 14515 }, { "epoch": 0.8997397447019457, "grad_norm": 1.6247482299804688, "learning_rate": 1.056030265475181e-07, "loss": 0.8568, "step": 14520 }, { "epoch": 0.9000495724377247, "grad_norm": 1.922568678855896, "learning_rate": 1.0527688996151587e-07, "loss": 0.8398, "step": 14525 }, { "epoch": 0.9003594001735036, "grad_norm": 1.91324782371521, "learning_rate": 1.0495075337551366e-07, "loss": 0.7942, "step": 14530 }, { "epoch": 0.9006692279092824, "grad_norm": 1.4746366739273071, "learning_rate": 1.0462461678951144e-07, "loss": 0.8334, "step": 14535 }, { "epoch": 0.9009790556450613, "grad_norm": 1.5857603549957275, "learning_rate": 1.0429848020350923e-07, "loss": 0.8294, "step": 14540 }, { "epoch": 0.9012888833808402, "grad_norm": 1.7607996463775635, "learning_rate": 1.03972343617507e-07, "loss": 0.8051, "step": 14545 }, { "epoch": 0.9015987111166192, "grad_norm": 1.6349459886550903, "learning_rate": 1.0364620703150478e-07, "loss": 0.7752, "step": 14550 }, { "epoch": 0.9019085388523981, "grad_norm": 1.6452680826187134, "learning_rate": 1.0332007044550258e-07, "loss": 0.8168, "step": 14555 }, { "epoch": 0.902218366588177, "grad_norm": 1.5939664840698242, "learning_rate": 1.0299393385950036e-07, "loss": 0.8144, "step": 14560 }, { "epoch": 0.9025281943239559, "grad_norm": 1.9514048099517822, "learning_rate": 1.0266779727349815e-07, "loss": 0.8354, "step": 14565 }, { "epoch": 0.9028380220597347, "grad_norm": 1.7060061693191528, "learning_rate": 1.0234166068749592e-07, "loss": 0.7819, "step": 14570 }, { "epoch": 0.9031478497955137, "grad_norm": 1.7711832523345947, "learning_rate": 1.020155241014937e-07, "loss": 0.8887, "step": 14575 }, { "epoch": 0.9034576775312926, "grad_norm": 1.7844951152801514, "learning_rate": 1.0168938751549149e-07, "loss": 0.8397, "step": 14580 }, { "epoch": 0.9037675052670715, "grad_norm": 1.5175538063049316, "learning_rate": 1.0136325092948926e-07, "loss": 0.7626, "step": 14585 }, { "epoch": 0.9040773330028504, "grad_norm": 1.6918551921844482, "learning_rate": 1.0103711434348705e-07, "loss": 0.876, "step": 14590 }, { "epoch": 0.9043871607386293, "grad_norm": 1.9042967557907104, "learning_rate": 1.0071097775748483e-07, "loss": 0.8136, "step": 14595 }, { "epoch": 0.9046969884744083, "grad_norm": 1.6037484407424927, "learning_rate": 1.0038484117148262e-07, "loss": 0.8586, "step": 14600 }, { "epoch": 0.9050068162101871, "grad_norm": 1.7443886995315552, "learning_rate": 1.0005870458548039e-07, "loss": 0.8572, "step": 14605 }, { "epoch": 0.905316643945966, "grad_norm": 1.3741647005081177, "learning_rate": 9.973256799947817e-08, "loss": 0.8172, "step": 14610 }, { "epoch": 0.9056264716817449, "grad_norm": 1.438989281654358, "learning_rate": 9.940643141347596e-08, "loss": 0.8077, "step": 14615 }, { "epoch": 0.9059362994175238, "grad_norm": 1.625597596168518, "learning_rate": 9.908029482747373e-08, "loss": 0.834, "step": 14620 }, { "epoch": 0.9062461271533028, "grad_norm": 1.6654558181762695, "learning_rate": 9.875415824147153e-08, "loss": 0.8404, "step": 14625 }, { "epoch": 0.9065559548890817, "grad_norm": 1.578120231628418, "learning_rate": 9.842802165546931e-08, "loss": 0.8094, "step": 14630 }, { "epoch": 0.9068657826248606, "grad_norm": 2.1373980045318604, "learning_rate": 9.81018850694671e-08, "loss": 0.8295, "step": 14635 }, { "epoch": 0.9071756103606395, "grad_norm": 1.750146508216858, "learning_rate": 9.777574848346487e-08, "loss": 0.8353, "step": 14640 }, { "epoch": 0.9074854380964184, "grad_norm": 1.8099365234375, "learning_rate": 9.744961189746265e-08, "loss": 0.7893, "step": 14645 }, { "epoch": 0.9077952658321973, "grad_norm": 1.7056037187576294, "learning_rate": 9.712347531146044e-08, "loss": 0.8736, "step": 14650 }, { "epoch": 0.9081050935679762, "grad_norm": 1.8692930936813354, "learning_rate": 9.679733872545821e-08, "loss": 0.8815, "step": 14655 }, { "epoch": 0.9084149213037551, "grad_norm": 1.5108036994934082, "learning_rate": 9.6471202139456e-08, "loss": 0.8443, "step": 14660 }, { "epoch": 0.908724749039534, "grad_norm": 1.7017853260040283, "learning_rate": 9.614506555345378e-08, "loss": 0.8732, "step": 14665 }, { "epoch": 0.909034576775313, "grad_norm": 1.6488370895385742, "learning_rate": 9.581892896745157e-08, "loss": 0.886, "step": 14670 }, { "epoch": 0.9093444045110919, "grad_norm": 1.5995619297027588, "learning_rate": 9.549279238144934e-08, "loss": 0.8098, "step": 14675 }, { "epoch": 0.9096542322468707, "grad_norm": 1.5656815767288208, "learning_rate": 9.516665579544712e-08, "loss": 0.8647, "step": 14680 }, { "epoch": 0.9099640599826496, "grad_norm": 1.8279550075531006, "learning_rate": 9.484051920944491e-08, "loss": 0.8419, "step": 14685 }, { "epoch": 0.9102738877184285, "grad_norm": 1.6348963975906372, "learning_rate": 9.451438262344268e-08, "loss": 0.8095, "step": 14690 }, { "epoch": 0.9105837154542075, "grad_norm": 1.6476106643676758, "learning_rate": 9.418824603744049e-08, "loss": 0.8554, "step": 14695 }, { "epoch": 0.9108935431899864, "grad_norm": 1.8768866062164307, "learning_rate": 9.386210945143826e-08, "loss": 0.8172, "step": 14700 }, { "epoch": 0.9112033709257653, "grad_norm": 1.6151742935180664, "learning_rate": 9.353597286543605e-08, "loss": 0.8623, "step": 14705 }, { "epoch": 0.9115131986615442, "grad_norm": 1.7617048025131226, "learning_rate": 9.320983627943383e-08, "loss": 0.8771, "step": 14710 }, { "epoch": 0.911823026397323, "grad_norm": 1.786597728729248, "learning_rate": 9.28836996934316e-08, "loss": 0.8602, "step": 14715 }, { "epoch": 0.912132854133102, "grad_norm": 1.6095271110534668, "learning_rate": 9.255756310742939e-08, "loss": 0.8167, "step": 14720 }, { "epoch": 0.9124426818688809, "grad_norm": 1.4803963899612427, "learning_rate": 9.223142652142717e-08, "loss": 0.7697, "step": 14725 }, { "epoch": 0.9127525096046598, "grad_norm": 2.034602403640747, "learning_rate": 9.190528993542496e-08, "loss": 0.8285, "step": 14730 }, { "epoch": 0.9130623373404387, "grad_norm": 1.5653057098388672, "learning_rate": 9.157915334942273e-08, "loss": 0.8671, "step": 14735 }, { "epoch": 0.9133721650762177, "grad_norm": 1.7435942888259888, "learning_rate": 9.125301676342051e-08, "loss": 0.8488, "step": 14740 }, { "epoch": 0.9136819928119966, "grad_norm": 1.6469316482543945, "learning_rate": 9.09268801774183e-08, "loss": 0.7845, "step": 14745 }, { "epoch": 0.9139918205477754, "grad_norm": 1.5693140029907227, "learning_rate": 9.060074359141607e-08, "loss": 0.8098, "step": 14750 }, { "epoch": 0.9143016482835543, "grad_norm": 1.5851061344146729, "learning_rate": 9.027460700541386e-08, "loss": 0.8033, "step": 14755 }, { "epoch": 0.9146114760193332, "grad_norm": 1.7908222675323486, "learning_rate": 8.994847041941164e-08, "loss": 0.8406, "step": 14760 }, { "epoch": 0.9149213037551122, "grad_norm": 1.5753778219223022, "learning_rate": 8.962233383340944e-08, "loss": 0.7843, "step": 14765 }, { "epoch": 0.9152311314908911, "grad_norm": 1.6013070344924927, "learning_rate": 8.929619724740721e-08, "loss": 0.7998, "step": 14770 }, { "epoch": 0.91554095922667, "grad_norm": 1.6490404605865479, "learning_rate": 8.897006066140499e-08, "loss": 0.8191, "step": 14775 }, { "epoch": 0.9158507869624489, "grad_norm": 1.8217605352401733, "learning_rate": 8.864392407540278e-08, "loss": 0.8228, "step": 14780 }, { "epoch": 0.9161606146982277, "grad_norm": 2.042304515838623, "learning_rate": 8.831778748940055e-08, "loss": 0.8585, "step": 14785 }, { "epoch": 0.9164704424340067, "grad_norm": 1.4815601110458374, "learning_rate": 8.799165090339834e-08, "loss": 0.888, "step": 14790 }, { "epoch": 0.9167802701697856, "grad_norm": 1.6839797496795654, "learning_rate": 8.766551431739612e-08, "loss": 0.8139, "step": 14795 }, { "epoch": 0.9170900979055645, "grad_norm": 1.6002793312072754, "learning_rate": 8.733937773139391e-08, "loss": 0.8993, "step": 14800 }, { "epoch": 0.9173999256413434, "grad_norm": 1.6373785734176636, "learning_rate": 8.701324114539168e-08, "loss": 0.8177, "step": 14805 }, { "epoch": 0.9177097533771224, "grad_norm": 1.4815398454666138, "learning_rate": 8.668710455938946e-08, "loss": 0.8169, "step": 14810 }, { "epoch": 0.9180195811129013, "grad_norm": 1.8058719635009766, "learning_rate": 8.636096797338725e-08, "loss": 0.8211, "step": 14815 }, { "epoch": 0.9183294088486801, "grad_norm": 1.6687182188034058, "learning_rate": 8.603483138738502e-08, "loss": 0.8283, "step": 14820 }, { "epoch": 0.918639236584459, "grad_norm": 1.5233310461044312, "learning_rate": 8.570869480138281e-08, "loss": 0.8277, "step": 14825 }, { "epoch": 0.9189490643202379, "grad_norm": 2.0291671752929688, "learning_rate": 8.53825582153806e-08, "loss": 0.8275, "step": 14830 }, { "epoch": 0.9192588920560169, "grad_norm": 1.6711183786392212, "learning_rate": 8.505642162937839e-08, "loss": 0.7979, "step": 14835 }, { "epoch": 0.9195687197917958, "grad_norm": 1.7648224830627441, "learning_rate": 8.473028504337617e-08, "loss": 0.8496, "step": 14840 }, { "epoch": 0.9198785475275747, "grad_norm": 1.8446131944656372, "learning_rate": 8.440414845737394e-08, "loss": 0.8736, "step": 14845 }, { "epoch": 0.9201883752633536, "grad_norm": 1.536024570465088, "learning_rate": 8.407801187137173e-08, "loss": 0.8106, "step": 14850 }, { "epoch": 0.9204982029991324, "grad_norm": 1.5335243940353394, "learning_rate": 8.375187528536951e-08, "loss": 0.8335, "step": 14855 }, { "epoch": 0.9208080307349114, "grad_norm": 1.6512424945831299, "learning_rate": 8.34257386993673e-08, "loss": 0.8423, "step": 14860 }, { "epoch": 0.9211178584706903, "grad_norm": 1.3891370296478271, "learning_rate": 8.309960211336507e-08, "loss": 0.8294, "step": 14865 }, { "epoch": 0.9214276862064692, "grad_norm": 1.6183117628097534, "learning_rate": 8.277346552736286e-08, "loss": 0.9181, "step": 14870 }, { "epoch": 0.9217375139422481, "grad_norm": 1.5492644309997559, "learning_rate": 8.244732894136064e-08, "loss": 0.8444, "step": 14875 }, { "epoch": 0.922047341678027, "grad_norm": 1.6538655757904053, "learning_rate": 8.212119235535841e-08, "loss": 0.7883, "step": 14880 }, { "epoch": 0.922357169413806, "grad_norm": 1.6469179391860962, "learning_rate": 8.17950557693562e-08, "loss": 0.8797, "step": 14885 }, { "epoch": 0.9226669971495848, "grad_norm": 1.5851479768753052, "learning_rate": 8.146891918335398e-08, "loss": 0.8557, "step": 14890 }, { "epoch": 0.9229768248853637, "grad_norm": 1.557062029838562, "learning_rate": 8.114278259735177e-08, "loss": 0.8592, "step": 14895 }, { "epoch": 0.9232866526211426, "grad_norm": 1.597190022468567, "learning_rate": 8.081664601134955e-08, "loss": 0.8697, "step": 14900 }, { "epoch": 0.9235964803569215, "grad_norm": 1.6489976644515991, "learning_rate": 8.049050942534733e-08, "loss": 0.8082, "step": 14905 }, { "epoch": 0.9239063080927005, "grad_norm": 1.6867945194244385, "learning_rate": 8.016437283934512e-08, "loss": 0.8551, "step": 14910 }, { "epoch": 0.9242161358284794, "grad_norm": 1.8104511499404907, "learning_rate": 7.98382362533429e-08, "loss": 0.8868, "step": 14915 }, { "epoch": 0.9245259635642583, "grad_norm": 1.6329665184020996, "learning_rate": 7.951209966734068e-08, "loss": 0.8427, "step": 14920 }, { "epoch": 0.9248357913000371, "grad_norm": 1.6783270835876465, "learning_rate": 7.918596308133846e-08, "loss": 0.8682, "step": 14925 }, { "epoch": 0.925145619035816, "grad_norm": 1.5834885835647583, "learning_rate": 7.885982649533625e-08, "loss": 0.7966, "step": 14930 }, { "epoch": 0.925455446771595, "grad_norm": 1.5923316478729248, "learning_rate": 7.853368990933402e-08, "loss": 0.8368, "step": 14935 }, { "epoch": 0.9257652745073739, "grad_norm": 1.7306522130966187, "learning_rate": 7.82075533233318e-08, "loss": 0.8445, "step": 14940 }, { "epoch": 0.9260751022431528, "grad_norm": 1.5742267370224, "learning_rate": 7.788141673732959e-08, "loss": 0.8461, "step": 14945 }, { "epoch": 0.9263849299789317, "grad_norm": 1.5978232622146606, "learning_rate": 7.755528015132736e-08, "loss": 0.8109, "step": 14950 }, { "epoch": 0.9266947577147107, "grad_norm": 1.5333524942398071, "learning_rate": 7.722914356532515e-08, "loss": 0.7837, "step": 14955 }, { "epoch": 0.9270045854504896, "grad_norm": 1.3960856199264526, "learning_rate": 7.690300697932293e-08, "loss": 0.8274, "step": 14960 }, { "epoch": 0.9273144131862684, "grad_norm": 1.5604887008666992, "learning_rate": 7.657687039332073e-08, "loss": 0.8219, "step": 14965 }, { "epoch": 0.9276242409220473, "grad_norm": 1.841156005859375, "learning_rate": 7.625073380731851e-08, "loss": 0.8252, "step": 14970 }, { "epoch": 0.9279340686578262, "grad_norm": 1.5893378257751465, "learning_rate": 7.592459722131628e-08, "loss": 0.8092, "step": 14975 }, { "epoch": 0.9282438963936052, "grad_norm": 1.8188837766647339, "learning_rate": 7.559846063531407e-08, "loss": 0.8694, "step": 14980 }, { "epoch": 0.9285537241293841, "grad_norm": 2.0048820972442627, "learning_rate": 7.527232404931185e-08, "loss": 0.808, "step": 14985 }, { "epoch": 0.928863551865163, "grad_norm": 1.8258121013641357, "learning_rate": 7.494618746330964e-08, "loss": 0.8198, "step": 14990 }, { "epoch": 0.9291733796009419, "grad_norm": 1.8916709423065186, "learning_rate": 7.462005087730741e-08, "loss": 0.812, "step": 14995 }, { "epoch": 0.9294832073367207, "grad_norm": 1.9231541156768799, "learning_rate": 7.42939142913052e-08, "loss": 0.8952, "step": 15000 }, { "epoch": 0.9297930350724997, "grad_norm": 1.525590181350708, "learning_rate": 7.396777770530298e-08, "loss": 0.8555, "step": 15005 }, { "epoch": 0.9301028628082786, "grad_norm": 1.824840784072876, "learning_rate": 7.364164111930075e-08, "loss": 0.8564, "step": 15010 }, { "epoch": 0.9304126905440575, "grad_norm": 1.680782675743103, "learning_rate": 7.331550453329854e-08, "loss": 0.8192, "step": 15015 }, { "epoch": 0.9307225182798364, "grad_norm": 1.7970026731491089, "learning_rate": 7.298936794729632e-08, "loss": 0.8581, "step": 15020 }, { "epoch": 0.9310323460156154, "grad_norm": 1.5699044466018677, "learning_rate": 7.26632313612941e-08, "loss": 0.8076, "step": 15025 }, { "epoch": 0.9313421737513943, "grad_norm": 1.885715126991272, "learning_rate": 7.233709477529188e-08, "loss": 0.874, "step": 15030 }, { "epoch": 0.9316520014871731, "grad_norm": 1.5187101364135742, "learning_rate": 7.201095818928968e-08, "loss": 0.9362, "step": 15035 }, { "epoch": 0.931961829222952, "grad_norm": 1.742722988128662, "learning_rate": 7.168482160328746e-08, "loss": 0.7228, "step": 15040 }, { "epoch": 0.9322716569587309, "grad_norm": 1.6100702285766602, "learning_rate": 7.135868501728524e-08, "loss": 0.8421, "step": 15045 }, { "epoch": 0.9325814846945099, "grad_norm": 1.6466500759124756, "learning_rate": 7.103254843128302e-08, "loss": 0.857, "step": 15050 }, { "epoch": 0.9328913124302888, "grad_norm": 1.6615008115768433, "learning_rate": 7.07064118452808e-08, "loss": 0.8412, "step": 15055 }, { "epoch": 0.9332011401660677, "grad_norm": 1.7339073419570923, "learning_rate": 7.038027525927859e-08, "loss": 0.8128, "step": 15060 }, { "epoch": 0.9335109679018466, "grad_norm": 1.9552741050720215, "learning_rate": 7.005413867327636e-08, "loss": 0.8867, "step": 15065 }, { "epoch": 0.9338207956376254, "grad_norm": 1.4273444414138794, "learning_rate": 6.972800208727414e-08, "loss": 0.8702, "step": 15070 }, { "epoch": 0.9341306233734044, "grad_norm": 1.4505070447921753, "learning_rate": 6.940186550127193e-08, "loss": 0.8022, "step": 15075 }, { "epoch": 0.9344404511091833, "grad_norm": 1.6438905000686646, "learning_rate": 6.90757289152697e-08, "loss": 0.8042, "step": 15080 }, { "epoch": 0.9347502788449622, "grad_norm": 1.5442990064620972, "learning_rate": 6.87495923292675e-08, "loss": 0.8539, "step": 15085 }, { "epoch": 0.9350601065807411, "grad_norm": 1.5180010795593262, "learning_rate": 6.842345574326527e-08, "loss": 0.8713, "step": 15090 }, { "epoch": 0.93536993431652, "grad_norm": 1.8689570426940918, "learning_rate": 6.809731915726306e-08, "loss": 0.8683, "step": 15095 }, { "epoch": 0.935679762052299, "grad_norm": 1.5416775941848755, "learning_rate": 6.777118257126083e-08, "loss": 0.8628, "step": 15100 }, { "epoch": 0.9359895897880778, "grad_norm": 1.584704041481018, "learning_rate": 6.744504598525862e-08, "loss": 0.8647, "step": 15105 }, { "epoch": 0.9362994175238567, "grad_norm": 1.9826905727386475, "learning_rate": 6.711890939925641e-08, "loss": 0.8053, "step": 15110 }, { "epoch": 0.9366092452596356, "grad_norm": 1.567655324935913, "learning_rate": 6.679277281325419e-08, "loss": 0.8609, "step": 15115 }, { "epoch": 0.9369190729954145, "grad_norm": 1.620680570602417, "learning_rate": 6.646663622725198e-08, "loss": 0.9066, "step": 15120 }, { "epoch": 0.9372289007311935, "grad_norm": 1.5231678485870361, "learning_rate": 6.614049964124975e-08, "loss": 0.8351, "step": 15125 }, { "epoch": 0.9375387284669724, "grad_norm": 1.7179840803146362, "learning_rate": 6.581436305524754e-08, "loss": 0.8943, "step": 15130 }, { "epoch": 0.9378485562027513, "grad_norm": 1.9049890041351318, "learning_rate": 6.548822646924532e-08, "loss": 0.8592, "step": 15135 }, { "epoch": 0.9381583839385301, "grad_norm": 1.835283875465393, "learning_rate": 6.516208988324309e-08, "loss": 0.857, "step": 15140 }, { "epoch": 0.938468211674309, "grad_norm": 1.5502787828445435, "learning_rate": 6.483595329724088e-08, "loss": 0.8325, "step": 15145 }, { "epoch": 0.938778039410088, "grad_norm": 1.677613615989685, "learning_rate": 6.450981671123866e-08, "loss": 0.7849, "step": 15150 }, { "epoch": 0.9390878671458669, "grad_norm": 1.9146627187728882, "learning_rate": 6.418368012523645e-08, "loss": 0.856, "step": 15155 }, { "epoch": 0.9393976948816458, "grad_norm": 1.563612937927246, "learning_rate": 6.385754353923422e-08, "loss": 0.8367, "step": 15160 }, { "epoch": 0.9397075226174247, "grad_norm": 1.5613071918487549, "learning_rate": 6.353140695323201e-08, "loss": 0.8257, "step": 15165 }, { "epoch": 0.9400173503532037, "grad_norm": 1.461104154586792, "learning_rate": 6.320527036722979e-08, "loss": 0.8492, "step": 15170 }, { "epoch": 0.9403271780889825, "grad_norm": 1.7141119241714478, "learning_rate": 6.287913378122758e-08, "loss": 0.8534, "step": 15175 }, { "epoch": 0.9406370058247614, "grad_norm": 1.5509589910507202, "learning_rate": 6.255299719522536e-08, "loss": 0.8229, "step": 15180 }, { "epoch": 0.9409468335605403, "grad_norm": 1.7114098072052002, "learning_rate": 6.222686060922314e-08, "loss": 0.8005, "step": 15185 }, { "epoch": 0.9412566612963192, "grad_norm": 1.6159601211547852, "learning_rate": 6.190072402322092e-08, "loss": 0.8873, "step": 15190 }, { "epoch": 0.9415664890320982, "grad_norm": 1.5091716051101685, "learning_rate": 6.15745874372187e-08, "loss": 0.846, "step": 15195 }, { "epoch": 0.9418763167678771, "grad_norm": 1.7174988985061646, "learning_rate": 6.124845085121648e-08, "loss": 0.8404, "step": 15200 }, { "epoch": 0.942186144503656, "grad_norm": 1.5568188428878784, "learning_rate": 6.092231426521427e-08, "loss": 0.8168, "step": 15205 }, { "epoch": 0.9424959722394348, "grad_norm": 1.6109156608581543, "learning_rate": 6.059617767921206e-08, "loss": 0.8401, "step": 15210 }, { "epoch": 0.9428057999752137, "grad_norm": 1.7197998762130737, "learning_rate": 6.027004109320983e-08, "loss": 0.8697, "step": 15215 }, { "epoch": 0.9431156277109927, "grad_norm": 1.773724913597107, "learning_rate": 5.994390450720761e-08, "loss": 0.8804, "step": 15220 }, { "epoch": 0.9434254554467716, "grad_norm": 1.8805720806121826, "learning_rate": 5.96177679212054e-08, "loss": 0.8934, "step": 15225 }, { "epoch": 0.9437352831825505, "grad_norm": 1.3951067924499512, "learning_rate": 5.929163133520318e-08, "loss": 0.7466, "step": 15230 }, { "epoch": 0.9440451109183294, "grad_norm": 1.5006078481674194, "learning_rate": 5.8965494749200964e-08, "loss": 0.8302, "step": 15235 }, { "epoch": 0.9443549386541084, "grad_norm": 1.574398398399353, "learning_rate": 5.8639358163198746e-08, "loss": 0.8141, "step": 15240 }, { "epoch": 0.9446647663898873, "grad_norm": 1.672789454460144, "learning_rate": 5.831322157719653e-08, "loss": 0.842, "step": 15245 }, { "epoch": 0.9449745941256661, "grad_norm": 1.7315744161605835, "learning_rate": 5.798708499119431e-08, "loss": 0.8608, "step": 15250 }, { "epoch": 0.945284421861445, "grad_norm": 1.7062369585037231, "learning_rate": 5.7660948405192086e-08, "loss": 0.8507, "step": 15255 }, { "epoch": 0.9455942495972239, "grad_norm": 1.5232911109924316, "learning_rate": 5.7334811819189875e-08, "loss": 0.8201, "step": 15260 }, { "epoch": 0.9459040773330029, "grad_norm": 1.4396625757217407, "learning_rate": 5.700867523318766e-08, "loss": 0.8657, "step": 15265 }, { "epoch": 0.9462139050687818, "grad_norm": 1.911996603012085, "learning_rate": 5.668253864718544e-08, "loss": 0.8479, "step": 15270 }, { "epoch": 0.9465237328045607, "grad_norm": 1.7143110036849976, "learning_rate": 5.635640206118322e-08, "loss": 0.8694, "step": 15275 }, { "epoch": 0.9468335605403396, "grad_norm": 1.6073627471923828, "learning_rate": 5.6030265475181005e-08, "loss": 0.9041, "step": 15280 }, { "epoch": 0.9471433882761184, "grad_norm": 1.627910852432251, "learning_rate": 5.570412888917878e-08, "loss": 0.8596, "step": 15285 }, { "epoch": 0.9474532160118974, "grad_norm": 1.6620925664901733, "learning_rate": 5.537799230317656e-08, "loss": 0.8381, "step": 15290 }, { "epoch": 0.9477630437476763, "grad_norm": 1.5756876468658447, "learning_rate": 5.505185571717435e-08, "loss": 0.7825, "step": 15295 }, { "epoch": 0.9480728714834552, "grad_norm": 1.679973840713501, "learning_rate": 5.4725719131172134e-08, "loss": 0.826, "step": 15300 }, { "epoch": 0.9483826992192341, "grad_norm": 1.8173421621322632, "learning_rate": 5.4399582545169916e-08, "loss": 0.8526, "step": 15305 }, { "epoch": 0.948692526955013, "grad_norm": 1.5299351215362549, "learning_rate": 5.40734459591677e-08, "loss": 0.844, "step": 15310 }, { "epoch": 0.949002354690792, "grad_norm": 1.594498872756958, "learning_rate": 5.374730937316548e-08, "loss": 0.8244, "step": 15315 }, { "epoch": 0.9493121824265708, "grad_norm": 1.4174598455429077, "learning_rate": 5.3421172787163257e-08, "loss": 0.7507, "step": 15320 }, { "epoch": 0.9496220101623497, "grad_norm": 1.624984860420227, "learning_rate": 5.309503620116104e-08, "loss": 0.8341, "step": 15325 }, { "epoch": 0.9499318378981286, "grad_norm": 1.5083084106445312, "learning_rate": 5.276889961515883e-08, "loss": 0.891, "step": 15330 }, { "epoch": 0.9502416656339076, "grad_norm": 2.4232914447784424, "learning_rate": 5.244276302915661e-08, "loss": 0.8098, "step": 15335 }, { "epoch": 0.9505514933696865, "grad_norm": 1.6162424087524414, "learning_rate": 5.211662644315439e-08, "loss": 0.8955, "step": 15340 }, { "epoch": 0.9508613211054654, "grad_norm": 2.227644920349121, "learning_rate": 5.1790489857152175e-08, "loss": 0.8129, "step": 15345 }, { "epoch": 0.9511711488412443, "grad_norm": 1.833177924156189, "learning_rate": 5.146435327114996e-08, "loss": 0.8386, "step": 15350 }, { "epoch": 0.9514809765770231, "grad_norm": 1.5484859943389893, "learning_rate": 5.113821668514773e-08, "loss": 0.8281, "step": 15355 }, { "epoch": 0.951790804312802, "grad_norm": 1.6309030055999756, "learning_rate": 5.0812080099145515e-08, "loss": 0.8256, "step": 15360 }, { "epoch": 0.952100632048581, "grad_norm": 1.6830759048461914, "learning_rate": 5.0485943513143304e-08, "loss": 0.81, "step": 15365 }, { "epoch": 0.9524104597843599, "grad_norm": 1.4826456308364868, "learning_rate": 5.0159806927141086e-08, "loss": 0.8785, "step": 15370 }, { "epoch": 0.9527202875201388, "grad_norm": 1.7200186252593994, "learning_rate": 4.983367034113887e-08, "loss": 0.8227, "step": 15375 }, { "epoch": 0.9530301152559177, "grad_norm": 1.6265939474105835, "learning_rate": 4.950753375513665e-08, "loss": 0.8766, "step": 15380 }, { "epoch": 0.9533399429916967, "grad_norm": 1.750857949256897, "learning_rate": 4.918139716913443e-08, "loss": 0.8704, "step": 15385 }, { "epoch": 0.9536497707274755, "grad_norm": 1.4368494749069214, "learning_rate": 4.885526058313221e-08, "loss": 0.8295, "step": 15390 }, { "epoch": 0.9539595984632544, "grad_norm": 1.4641969203948975, "learning_rate": 4.852912399713e-08, "loss": 0.9032, "step": 15395 }, { "epoch": 0.9542694261990333, "grad_norm": 1.8346962928771973, "learning_rate": 4.820298741112778e-08, "loss": 0.8508, "step": 15400 }, { "epoch": 0.9545792539348122, "grad_norm": 1.6951991319656372, "learning_rate": 4.787685082512556e-08, "loss": 0.8454, "step": 15405 }, { "epoch": 0.9548890816705912, "grad_norm": 1.6191620826721191, "learning_rate": 4.7550714239123345e-08, "loss": 0.8721, "step": 15410 }, { "epoch": 0.9551989094063701, "grad_norm": 1.6042550802230835, "learning_rate": 4.722457765312113e-08, "loss": 0.9269, "step": 15415 }, { "epoch": 0.955508737142149, "grad_norm": 1.616936206817627, "learning_rate": 4.68984410671189e-08, "loss": 0.8235, "step": 15420 }, { "epoch": 0.9558185648779278, "grad_norm": 1.6453735828399658, "learning_rate": 4.6572304481116685e-08, "loss": 0.863, "step": 15425 }, { "epoch": 0.9561283926137067, "grad_norm": 1.683127522468567, "learning_rate": 4.6246167895114474e-08, "loss": 0.8327, "step": 15430 }, { "epoch": 0.9564382203494857, "grad_norm": 1.9292060136795044, "learning_rate": 4.5920031309112257e-08, "loss": 0.8538, "step": 15435 }, { "epoch": 0.9567480480852646, "grad_norm": 1.838835597038269, "learning_rate": 4.559389472311004e-08, "loss": 0.8205, "step": 15440 }, { "epoch": 0.9570578758210435, "grad_norm": 1.481664776802063, "learning_rate": 4.526775813710782e-08, "loss": 0.8663, "step": 15445 }, { "epoch": 0.9573677035568224, "grad_norm": 1.6727439165115356, "learning_rate": 4.49416215511056e-08, "loss": 0.8562, "step": 15450 }, { "epoch": 0.9576775312926014, "grad_norm": 1.6388499736785889, "learning_rate": 4.461548496510338e-08, "loss": 0.8113, "step": 15455 }, { "epoch": 0.9579873590283802, "grad_norm": 1.8854243755340576, "learning_rate": 4.428934837910116e-08, "loss": 0.8338, "step": 15460 }, { "epoch": 0.9582971867641591, "grad_norm": 1.6612423658370972, "learning_rate": 4.396321179309895e-08, "loss": 0.8806, "step": 15465 }, { "epoch": 0.958607014499938, "grad_norm": 2.0778310298919678, "learning_rate": 4.363707520709673e-08, "loss": 0.8716, "step": 15470 }, { "epoch": 0.9589168422357169, "grad_norm": 1.538986325263977, "learning_rate": 4.3310938621094515e-08, "loss": 0.8456, "step": 15475 }, { "epoch": 0.9592266699714959, "grad_norm": 1.5692005157470703, "learning_rate": 4.29848020350923e-08, "loss": 0.8494, "step": 15480 }, { "epoch": 0.9595364977072748, "grad_norm": 1.525552749633789, "learning_rate": 4.265866544909007e-08, "loss": 0.8441, "step": 15485 }, { "epoch": 0.9598463254430537, "grad_norm": 2.696028709411621, "learning_rate": 4.2332528863087856e-08, "loss": 0.8781, "step": 15490 }, { "epoch": 0.9601561531788325, "grad_norm": 1.6028283834457397, "learning_rate": 4.200639227708564e-08, "loss": 0.8951, "step": 15495 }, { "epoch": 0.9604659809146114, "grad_norm": 1.7341382503509521, "learning_rate": 4.168025569108343e-08, "loss": 0.95, "step": 15500 }, { "epoch": 0.9607758086503904, "grad_norm": 2.1215007305145264, "learning_rate": 4.135411910508121e-08, "loss": 0.8641, "step": 15505 }, { "epoch": 0.9610856363861693, "grad_norm": 1.7095792293548584, "learning_rate": 4.102798251907899e-08, "loss": 0.8717, "step": 15510 }, { "epoch": 0.9613954641219482, "grad_norm": 1.618931770324707, "learning_rate": 4.0701845933076774e-08, "loss": 0.8261, "step": 15515 }, { "epoch": 0.9617052918577271, "grad_norm": 1.6544820070266724, "learning_rate": 4.037570934707455e-08, "loss": 0.7997, "step": 15520 }, { "epoch": 0.962015119593506, "grad_norm": 1.5621479749679565, "learning_rate": 4.004957276107233e-08, "loss": 0.8065, "step": 15525 }, { "epoch": 0.9623249473292849, "grad_norm": 2.019402265548706, "learning_rate": 3.9723436175070114e-08, "loss": 0.8353, "step": 15530 }, { "epoch": 0.9626347750650638, "grad_norm": 2.487013578414917, "learning_rate": 3.93972995890679e-08, "loss": 0.7833, "step": 15535 }, { "epoch": 0.9629446028008427, "grad_norm": 1.5387972593307495, "learning_rate": 3.9071163003065685e-08, "loss": 0.8411, "step": 15540 }, { "epoch": 0.9632544305366216, "grad_norm": 1.5552467107772827, "learning_rate": 3.874502641706347e-08, "loss": 0.8559, "step": 15545 }, { "epoch": 0.9635642582724006, "grad_norm": 1.9643059968948364, "learning_rate": 3.8418889831061244e-08, "loss": 0.817, "step": 15550 }, { "epoch": 0.9638740860081795, "grad_norm": 1.5294631719589233, "learning_rate": 3.8092753245059026e-08, "loss": 0.831, "step": 15555 }, { "epoch": 0.9641839137439584, "grad_norm": 1.4412130117416382, "learning_rate": 3.776661665905681e-08, "loss": 0.7986, "step": 15560 }, { "epoch": 0.9644937414797373, "grad_norm": 1.4368246793746948, "learning_rate": 3.744048007305459e-08, "loss": 0.8259, "step": 15565 }, { "epoch": 0.9648035692155161, "grad_norm": 1.7029997110366821, "learning_rate": 3.711434348705238e-08, "loss": 0.8844, "step": 15570 }, { "epoch": 0.965113396951295, "grad_norm": 1.9928377866744995, "learning_rate": 3.678820690105016e-08, "loss": 0.869, "step": 15575 }, { "epoch": 0.965423224687074, "grad_norm": 1.479440450668335, "learning_rate": 3.6462070315047944e-08, "loss": 0.7821, "step": 15580 }, { "epoch": 0.9657330524228529, "grad_norm": 1.4919379949569702, "learning_rate": 3.613593372904572e-08, "loss": 0.8267, "step": 15585 }, { "epoch": 0.9660428801586318, "grad_norm": 1.6343252658843994, "learning_rate": 3.58097971430435e-08, "loss": 0.8178, "step": 15590 }, { "epoch": 0.9663527078944107, "grad_norm": 1.612718939781189, "learning_rate": 3.5483660557041284e-08, "loss": 0.8363, "step": 15595 }, { "epoch": 0.9666625356301897, "grad_norm": 1.5983678102493286, "learning_rate": 3.515752397103907e-08, "loss": 0.833, "step": 15600 }, { "epoch": 0.9669723633659685, "grad_norm": 1.558157205581665, "learning_rate": 3.4831387385036856e-08, "loss": 0.7573, "step": 15605 }, { "epoch": 0.9672821911017474, "grad_norm": 1.4543460607528687, "learning_rate": 3.450525079903464e-08, "loss": 0.8641, "step": 15610 }, { "epoch": 0.9675920188375263, "grad_norm": 1.836665391921997, "learning_rate": 3.4179114213032414e-08, "loss": 0.8245, "step": 15615 }, { "epoch": 0.9679018465733052, "grad_norm": 2.1407501697540283, "learning_rate": 3.3852977627030196e-08, "loss": 0.8114, "step": 15620 }, { "epoch": 0.9682116743090842, "grad_norm": 1.6708449125289917, "learning_rate": 3.352684104102798e-08, "loss": 0.8207, "step": 15625 }, { "epoch": 0.9685215020448631, "grad_norm": 1.7907606363296509, "learning_rate": 3.320070445502576e-08, "loss": 0.8308, "step": 15630 }, { "epoch": 0.968831329780642, "grad_norm": 1.425497055053711, "learning_rate": 3.287456786902355e-08, "loss": 0.8506, "step": 15635 }, { "epoch": 0.9691411575164208, "grad_norm": 1.5700130462646484, "learning_rate": 3.254843128302133e-08, "loss": 0.815, "step": 15640 }, { "epoch": 0.9694509852521997, "grad_norm": 1.591178059577942, "learning_rate": 3.2222294697019114e-08, "loss": 0.7668, "step": 15645 }, { "epoch": 0.9697608129879787, "grad_norm": 1.566153883934021, "learning_rate": 3.189615811101689e-08, "loss": 0.9116, "step": 15650 }, { "epoch": 0.9700706407237576, "grad_norm": 1.9862055778503418, "learning_rate": 3.157002152501467e-08, "loss": 0.7909, "step": 15655 }, { "epoch": 0.9703804684595365, "grad_norm": 1.5706019401550293, "learning_rate": 3.1243884939012455e-08, "loss": 0.8002, "step": 15660 }, { "epoch": 0.9706902961953154, "grad_norm": 1.6047507524490356, "learning_rate": 3.091774835301024e-08, "loss": 0.7887, "step": 15665 }, { "epoch": 0.9710001239310944, "grad_norm": 1.7765990495681763, "learning_rate": 3.0591611767008026e-08, "loss": 0.8464, "step": 15670 }, { "epoch": 0.9713099516668732, "grad_norm": 1.7331666946411133, "learning_rate": 3.02654751810058e-08, "loss": 0.8033, "step": 15675 }, { "epoch": 0.9716197794026521, "grad_norm": 1.6544134616851807, "learning_rate": 2.9939338595003584e-08, "loss": 0.8235, "step": 15680 }, { "epoch": 0.971929607138431, "grad_norm": 1.6795530319213867, "learning_rate": 2.961320200900137e-08, "loss": 0.8517, "step": 15685 }, { "epoch": 0.9722394348742099, "grad_norm": 3.3639564514160156, "learning_rate": 2.928706542299915e-08, "loss": 0.8728, "step": 15690 }, { "epoch": 0.9725492626099889, "grad_norm": 1.3747061491012573, "learning_rate": 2.8960928836996934e-08, "loss": 0.8512, "step": 15695 }, { "epoch": 0.9728590903457678, "grad_norm": 1.8927206993103027, "learning_rate": 2.8634792250994717e-08, "loss": 0.8304, "step": 15700 }, { "epoch": 0.9731689180815467, "grad_norm": 1.4716931581497192, "learning_rate": 2.83086556649925e-08, "loss": 0.8172, "step": 15705 }, { "epoch": 0.9734787458173255, "grad_norm": 1.5644137859344482, "learning_rate": 2.7982519078990278e-08, "loss": 0.8282, "step": 15710 }, { "epoch": 0.9737885735531044, "grad_norm": 1.6480584144592285, "learning_rate": 2.7656382492988064e-08, "loss": 0.8614, "step": 15715 }, { "epoch": 0.9740984012888834, "grad_norm": 1.6540179252624512, "learning_rate": 2.7330245906985846e-08, "loss": 0.7731, "step": 15720 }, { "epoch": 0.9744082290246623, "grad_norm": 1.6393020153045654, "learning_rate": 2.7004109320983625e-08, "loss": 0.7552, "step": 15725 }, { "epoch": 0.9747180567604412, "grad_norm": 1.6063807010650635, "learning_rate": 2.667797273498141e-08, "loss": 0.852, "step": 15730 }, { "epoch": 0.9750278844962201, "grad_norm": 1.7273674011230469, "learning_rate": 2.6351836148979193e-08, "loss": 0.8511, "step": 15735 }, { "epoch": 0.975337712231999, "grad_norm": 1.673842191696167, "learning_rate": 2.6025699562976972e-08, "loss": 0.8862, "step": 15740 }, { "epoch": 0.9756475399677779, "grad_norm": 1.6445388793945312, "learning_rate": 2.5699562976974754e-08, "loss": 0.7819, "step": 15745 }, { "epoch": 0.9759573677035568, "grad_norm": 2.0081982612609863, "learning_rate": 2.537342639097254e-08, "loss": 0.8112, "step": 15750 }, { "epoch": 0.9762671954393357, "grad_norm": 1.5558756589889526, "learning_rate": 2.504728980497032e-08, "loss": 0.8807, "step": 15755 }, { "epoch": 0.9765770231751146, "grad_norm": 1.7155393362045288, "learning_rate": 2.47211532189681e-08, "loss": 0.8654, "step": 15760 }, { "epoch": 0.9768868509108936, "grad_norm": 1.4940236806869507, "learning_rate": 2.4395016632965887e-08, "loss": 0.8463, "step": 15765 }, { "epoch": 0.9771966786466725, "grad_norm": 1.8617504835128784, "learning_rate": 2.406888004696367e-08, "loss": 0.867, "step": 15770 }, { "epoch": 0.9775065063824514, "grad_norm": 1.7547118663787842, "learning_rate": 2.3742743460961448e-08, "loss": 0.8672, "step": 15775 }, { "epoch": 0.9778163341182302, "grad_norm": 1.6064207553863525, "learning_rate": 2.341660687495923e-08, "loss": 0.8669, "step": 15780 }, { "epoch": 0.9781261618540091, "grad_norm": 1.8023271560668945, "learning_rate": 2.3090470288957016e-08, "loss": 0.8618, "step": 15785 }, { "epoch": 0.9784359895897881, "grad_norm": 1.7237571477890015, "learning_rate": 2.2764333702954795e-08, "loss": 0.8402, "step": 15790 }, { "epoch": 0.978745817325567, "grad_norm": 1.6067416667938232, "learning_rate": 2.2438197116952577e-08, "loss": 0.8463, "step": 15795 }, { "epoch": 0.9790556450613459, "grad_norm": 1.7388710975646973, "learning_rate": 2.2112060530950363e-08, "loss": 0.8463, "step": 15800 }, { "epoch": 0.9793654727971248, "grad_norm": 1.5295201539993286, "learning_rate": 2.1785923944948142e-08, "loss": 0.8177, "step": 15805 }, { "epoch": 0.9796753005329037, "grad_norm": 1.5857983827590942, "learning_rate": 2.1459787358945924e-08, "loss": 0.8224, "step": 15810 }, { "epoch": 0.9799851282686826, "grad_norm": 2.0500707626342773, "learning_rate": 2.113365077294371e-08, "loss": 0.8527, "step": 15815 }, { "epoch": 0.9802949560044615, "grad_norm": 1.5009729862213135, "learning_rate": 2.0807514186941492e-08, "loss": 0.8528, "step": 15820 }, { "epoch": 0.9806047837402404, "grad_norm": 1.6181910037994385, "learning_rate": 2.048137760093927e-08, "loss": 0.8359, "step": 15825 }, { "epoch": 0.9809146114760193, "grad_norm": 1.6414130926132202, "learning_rate": 2.0155241014937054e-08, "loss": 0.728, "step": 15830 }, { "epoch": 0.9812244392117982, "grad_norm": 1.72751784324646, "learning_rate": 1.982910442893484e-08, "loss": 0.8838, "step": 15835 }, { "epoch": 0.9815342669475772, "grad_norm": 1.7211774587631226, "learning_rate": 1.9502967842932618e-08, "loss": 0.867, "step": 15840 }, { "epoch": 0.9818440946833561, "grad_norm": 1.564544677734375, "learning_rate": 1.91768312569304e-08, "loss": 0.847, "step": 15845 }, { "epoch": 0.9821539224191349, "grad_norm": 1.6357097625732422, "learning_rate": 1.8850694670928186e-08, "loss": 0.7733, "step": 15850 }, { "epoch": 0.9824637501549138, "grad_norm": 1.56624174118042, "learning_rate": 1.8524558084925965e-08, "loss": 0.8027, "step": 15855 }, { "epoch": 0.9827735778906928, "grad_norm": 1.6103644371032715, "learning_rate": 1.8198421498923748e-08, "loss": 0.8102, "step": 15860 }, { "epoch": 0.9830834056264717, "grad_norm": 1.734986662864685, "learning_rate": 1.787228491292153e-08, "loss": 0.8548, "step": 15865 }, { "epoch": 0.9833932333622506, "grad_norm": 1.6334205865859985, "learning_rate": 1.7546148326919316e-08, "loss": 0.8334, "step": 15870 }, { "epoch": 0.9837030610980295, "grad_norm": 2.0084569454193115, "learning_rate": 1.7220011740917095e-08, "loss": 0.9489, "step": 15875 }, { "epoch": 0.9840128888338084, "grad_norm": 1.58625066280365, "learning_rate": 1.6893875154914877e-08, "loss": 0.86, "step": 15880 }, { "epoch": 0.9843227165695874, "grad_norm": 1.630265712738037, "learning_rate": 1.6567738568912663e-08, "loss": 0.8529, "step": 15885 }, { "epoch": 0.9846325443053662, "grad_norm": 1.7411401271820068, "learning_rate": 1.624160198291044e-08, "loss": 0.8655, "step": 15890 }, { "epoch": 0.9849423720411451, "grad_norm": 1.6285487413406372, "learning_rate": 1.5915465396908224e-08, "loss": 0.8774, "step": 15895 }, { "epoch": 0.985252199776924, "grad_norm": 1.6852017641067505, "learning_rate": 1.5589328810906006e-08, "loss": 0.8052, "step": 15900 }, { "epoch": 0.9855620275127029, "grad_norm": 1.8650468587875366, "learning_rate": 1.526319222490379e-08, "loss": 0.8461, "step": 15905 }, { "epoch": 0.9858718552484819, "grad_norm": 1.5864883661270142, "learning_rate": 1.493705563890157e-08, "loss": 0.8232, "step": 15910 }, { "epoch": 0.9861816829842608, "grad_norm": 1.5754700899124146, "learning_rate": 1.4610919052899355e-08, "loss": 0.854, "step": 15915 }, { "epoch": 0.9864915107200397, "grad_norm": 1.6433444023132324, "learning_rate": 1.4284782466897136e-08, "loss": 0.8599, "step": 15920 }, { "epoch": 0.9868013384558185, "grad_norm": 1.9809188842773438, "learning_rate": 1.3958645880894918e-08, "loss": 0.872, "step": 15925 }, { "epoch": 0.9871111661915974, "grad_norm": 1.6928250789642334, "learning_rate": 1.36325092948927e-08, "loss": 0.8981, "step": 15930 }, { "epoch": 0.9874209939273764, "grad_norm": 1.7189764976501465, "learning_rate": 1.3306372708890482e-08, "loss": 0.795, "step": 15935 }, { "epoch": 0.9877308216631553, "grad_norm": 1.8426059484481812, "learning_rate": 1.2980236122888266e-08, "loss": 0.8675, "step": 15940 }, { "epoch": 0.9880406493989342, "grad_norm": 1.5688371658325195, "learning_rate": 1.2654099536886047e-08, "loss": 0.8716, "step": 15945 }, { "epoch": 0.9883504771347131, "grad_norm": 1.5136457681655884, "learning_rate": 1.232796295088383e-08, "loss": 0.8228, "step": 15950 }, { "epoch": 0.9886603048704921, "grad_norm": 1.7843719720840454, "learning_rate": 1.2001826364881612e-08, "loss": 0.8467, "step": 15955 }, { "epoch": 0.9889701326062709, "grad_norm": 1.6245023012161255, "learning_rate": 1.1675689778879394e-08, "loss": 0.831, "step": 15960 }, { "epoch": 0.9892799603420498, "grad_norm": 1.6294286251068115, "learning_rate": 1.1349553192877175e-08, "loss": 0.8229, "step": 15965 }, { "epoch": 0.9895897880778287, "grad_norm": 1.5797966718673706, "learning_rate": 1.1023416606874959e-08, "loss": 0.7916, "step": 15970 }, { "epoch": 0.9898996158136076, "grad_norm": 1.6297780275344849, "learning_rate": 1.0697280020872741e-08, "loss": 0.862, "step": 15975 }, { "epoch": 0.9902094435493866, "grad_norm": 1.5323219299316406, "learning_rate": 1.0371143434870523e-08, "loss": 0.8354, "step": 15980 }, { "epoch": 0.9905192712851655, "grad_norm": 1.5712424516677856, "learning_rate": 1.0045006848868306e-08, "loss": 0.8211, "step": 15985 }, { "epoch": 0.9908290990209444, "grad_norm": 1.7906019687652588, "learning_rate": 9.718870262866086e-09, "loss": 0.8236, "step": 15990 }, { "epoch": 0.9911389267567232, "grad_norm": 1.655163288116455, "learning_rate": 9.39273367686387e-09, "loss": 0.8604, "step": 15995 }, { "epoch": 0.9914487544925021, "grad_norm": 1.6531345844268799, "learning_rate": 9.066597090861653e-09, "loss": 0.8059, "step": 16000 }, { "epoch": 0.9917585822282811, "grad_norm": 1.737867832183838, "learning_rate": 8.740460504859435e-09, "loss": 0.8575, "step": 16005 }, { "epoch": 0.99206840996406, "grad_norm": 1.6699326038360596, "learning_rate": 8.414323918857217e-09, "loss": 0.8423, "step": 16010 }, { "epoch": 0.9923782376998389, "grad_norm": 1.7559905052185059, "learning_rate": 8.088187332854998e-09, "loss": 0.8538, "step": 16015 }, { "epoch": 0.9926880654356178, "grad_norm": 1.8937209844589233, "learning_rate": 7.762050746852782e-09, "loss": 0.841, "step": 16020 }, { "epoch": 0.9929978931713968, "grad_norm": 1.8573896884918213, "learning_rate": 7.4359141608505635e-09, "loss": 0.8793, "step": 16025 }, { "epoch": 0.9933077209071756, "grad_norm": 1.6558507680892944, "learning_rate": 7.109777574848346e-09, "loss": 0.882, "step": 16030 }, { "epoch": 0.9936175486429545, "grad_norm": 1.675456166267395, "learning_rate": 6.783640988846128e-09, "loss": 0.898, "step": 16035 }, { "epoch": 0.9939273763787334, "grad_norm": 1.8294612169265747, "learning_rate": 6.457504402843911e-09, "loss": 0.8146, "step": 16040 }, { "epoch": 0.9942372041145123, "grad_norm": 1.4128397703170776, "learning_rate": 6.131367816841694e-09, "loss": 0.8245, "step": 16045 }, { "epoch": 0.9945470318502913, "grad_norm": 1.567299485206604, "learning_rate": 5.805231230839475e-09, "loss": 0.8458, "step": 16050 }, { "epoch": 0.9948568595860702, "grad_norm": 2.054905652999878, "learning_rate": 5.4790946448372574e-09, "loss": 0.8724, "step": 16055 }, { "epoch": 0.9951666873218491, "grad_norm": 1.8781213760375977, "learning_rate": 5.15295805883504e-09, "loss": 0.8133, "step": 16060 }, { "epoch": 0.9954765150576279, "grad_norm": 1.6261420249938965, "learning_rate": 4.826821472832822e-09, "loss": 0.8414, "step": 16065 }, { "epoch": 0.9957863427934068, "grad_norm": 1.589031457901001, "learning_rate": 4.500684886830605e-09, "loss": 0.8309, "step": 16070 }, { "epoch": 0.9960961705291858, "grad_norm": 1.5858315229415894, "learning_rate": 4.174548300828387e-09, "loss": 0.8105, "step": 16075 }, { "epoch": 0.9964059982649647, "grad_norm": 1.644646406173706, "learning_rate": 3.848411714826169e-09, "loss": 0.8361, "step": 16080 }, { "epoch": 0.9967158260007436, "grad_norm": 1.6961655616760254, "learning_rate": 3.5222751288239514e-09, "loss": 0.8913, "step": 16085 }, { "epoch": 0.9970256537365225, "grad_norm": 1.7430751323699951, "learning_rate": 3.1961385428217333e-09, "loss": 0.8607, "step": 16090 }, { "epoch": 0.9973354814723014, "grad_norm": 1.5716744661331177, "learning_rate": 2.870001956819516e-09, "loss": 0.8464, "step": 16095 }, { "epoch": 0.9976453092080803, "grad_norm": 1.531804084777832, "learning_rate": 2.5438653708172984e-09, "loss": 0.8593, "step": 16100 }, { "epoch": 0.9979551369438592, "grad_norm": 1.682523488998413, "learning_rate": 2.2177287848150803e-09, "loss": 0.7838, "step": 16105 }, { "epoch": 0.9982649646796381, "grad_norm": 1.7752611637115479, "learning_rate": 1.8915921988128626e-09, "loss": 0.8642, "step": 16110 }, { "epoch": 0.998574792415417, "grad_norm": 1.9866595268249512, "learning_rate": 1.565455612810645e-09, "loss": 0.8782, "step": 16115 }, { "epoch": 0.998884620151196, "grad_norm": 1.5543620586395264, "learning_rate": 1.2393190268084275e-09, "loss": 0.7831, "step": 16120 }, { "epoch": 0.9991944478869749, "grad_norm": 1.9485523700714111, "learning_rate": 9.131824408062097e-10, "loss": 0.814, "step": 16125 }, { "epoch": 0.9995042756227538, "grad_norm": 1.5552793741226196, "learning_rate": 5.870458548039919e-10, "loss": 0.8393, "step": 16130 }, { "epoch": 0.9998141033585326, "grad_norm": 1.6291290521621704, "learning_rate": 2.6090926880177417e-10, "loss": 0.8519, "step": 16135 }, { "epoch": 1.0, "step": 16138, "total_flos": 2.4970508980845543e+19, "train_loss": 0.8683929024523911, "train_runtime": 34422.1918, "train_samples_per_second": 30.005, "train_steps_per_second": 0.469 } ], "logging_steps": 5, "max_steps": 16138, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4970508980845543e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }