{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.644372527867674,
"eval_steps": 500,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01438331535418914,
"grad_norm": 2.0,
"learning_rate": 9.221902017291067e-07,
"loss": 0.4532,
"step": 5
},
{
"epoch": 0.02876663070837828,
"grad_norm": 2.015625,
"learning_rate": 2.0749279538904903e-06,
"loss": 0.6513,
"step": 10
},
{
"epoch": 0.043149946062567425,
"grad_norm": 1.7265625,
"learning_rate": 3.2276657060518735e-06,
"loss": 0.5609,
"step": 15
},
{
"epoch": 0.05753326141675656,
"grad_norm": 1.3203125,
"learning_rate": 4.380403458213257e-06,
"loss": 0.5331,
"step": 20
},
{
"epoch": 0.0719165767709457,
"grad_norm": 1.78125,
"learning_rate": 5.533141210374641e-06,
"loss": 0.7096,
"step": 25
},
{
"epoch": 0.08629989212513485,
"grad_norm": 1.265625,
"learning_rate": 6.685878962536023e-06,
"loss": 0.4688,
"step": 30
},
{
"epoch": 0.10068320747932398,
"grad_norm": 0.9765625,
"learning_rate": 7.838616714697407e-06,
"loss": 0.506,
"step": 35
},
{
"epoch": 0.11506652283351312,
"grad_norm": 1.359375,
"learning_rate": 8.991354466858791e-06,
"loss": 0.4993,
"step": 40
},
{
"epoch": 0.12944983818770225,
"grad_norm": 1.1015625,
"learning_rate": 1.0144092219020174e-05,
"loss": 0.444,
"step": 45
},
{
"epoch": 0.1438331535418914,
"grad_norm": 1.2890625,
"learning_rate": 1.1296829971181558e-05,
"loss": 0.4432,
"step": 50
},
{
"epoch": 0.15821646889608054,
"grad_norm": 1.3984375,
"learning_rate": 1.2449567723342942e-05,
"loss": 0.5675,
"step": 55
},
{
"epoch": 0.1725997842502697,
"grad_norm": 1.125,
"learning_rate": 1.3602305475504324e-05,
"loss": 0.4172,
"step": 60
},
{
"epoch": 0.18698309960445883,
"grad_norm": 1.125,
"learning_rate": 1.4755043227665706e-05,
"loss": 0.4988,
"step": 65
},
{
"epoch": 0.20136641495864796,
"grad_norm": 0.890625,
"learning_rate": 1.590778097982709e-05,
"loss": 0.6613,
"step": 70
},
{
"epoch": 0.21574973031283712,
"grad_norm": 1.0234375,
"learning_rate": 1.7060518731988475e-05,
"loss": 0.5525,
"step": 75
},
{
"epoch": 0.23013304566702625,
"grad_norm": 0.99609375,
"learning_rate": 1.8213256484149857e-05,
"loss": 0.3749,
"step": 80
},
{
"epoch": 0.24451636102121538,
"grad_norm": 1.203125,
"learning_rate": 1.936599423631124e-05,
"loss": 0.383,
"step": 85
},
{
"epoch": 0.2588996763754045,
"grad_norm": 0.86328125,
"learning_rate": 2.0518731988472625e-05,
"loss": 0.4972,
"step": 90
},
{
"epoch": 0.2732829917295937,
"grad_norm": 1.4375,
"learning_rate": 2.1671469740634007e-05,
"loss": 0.3984,
"step": 95
},
{
"epoch": 0.2876663070837828,
"grad_norm": 1.1796875,
"learning_rate": 2.2824207492795393e-05,
"loss": 0.3863,
"step": 100
},
{
"epoch": 0.30204962243797195,
"grad_norm": 1.046875,
"learning_rate": 2.3976945244956772e-05,
"loss": 0.3739,
"step": 105
},
{
"epoch": 0.3164329377921611,
"grad_norm": 1.0859375,
"learning_rate": 2.5129682997118158e-05,
"loss": 0.4974,
"step": 110
},
{
"epoch": 0.3308162531463502,
"grad_norm": 1.2421875,
"learning_rate": 2.628242074927954e-05,
"loss": 0.5356,
"step": 115
},
{
"epoch": 0.3451995685005394,
"grad_norm": 1.5859375,
"learning_rate": 2.7435158501440926e-05,
"loss": 0.5979,
"step": 120
},
{
"epoch": 0.35958288385472853,
"grad_norm": 1.234375,
"learning_rate": 2.858789625360231e-05,
"loss": 0.5462,
"step": 125
},
{
"epoch": 0.37396619920891766,
"grad_norm": 1.046875,
"learning_rate": 2.9740634005763694e-05,
"loss": 0.5615,
"step": 130
},
{
"epoch": 0.3883495145631068,
"grad_norm": 1.203125,
"learning_rate": 3.089337175792507e-05,
"loss": 0.5295,
"step": 135
},
{
"epoch": 0.4027328299172959,
"grad_norm": 2.21875,
"learning_rate": 3.2046109510086455e-05,
"loss": 0.4399,
"step": 140
},
{
"epoch": 0.41711614527148505,
"grad_norm": 1.4375,
"learning_rate": 3.3198847262247845e-05,
"loss": 0.5191,
"step": 145
},
{
"epoch": 0.43149946062567424,
"grad_norm": 1.03125,
"learning_rate": 3.435158501440922e-05,
"loss": 0.4418,
"step": 150
},
{
"epoch": 0.44588277597986337,
"grad_norm": 1.203125,
"learning_rate": 3.550432276657061e-05,
"loss": 0.5221,
"step": 155
},
{
"epoch": 0.4602660913340525,
"grad_norm": 1.15625,
"learning_rate": 3.665706051873199e-05,
"loss": 0.4788,
"step": 160
},
{
"epoch": 0.4746494066882416,
"grad_norm": 0.9375,
"learning_rate": 3.7809798270893374e-05,
"loss": 0.58,
"step": 165
},
{
"epoch": 0.48903272204243076,
"grad_norm": 1.328125,
"learning_rate": 3.8962536023054756e-05,
"loss": 0.5308,
"step": 170
},
{
"epoch": 0.5034160373966199,
"grad_norm": 1.5703125,
"learning_rate": 4.0115273775216146e-05,
"loss": 0.5467,
"step": 175
},
{
"epoch": 0.517799352750809,
"grad_norm": 1.4765625,
"learning_rate": 4.126801152737752e-05,
"loss": 0.5651,
"step": 180
},
{
"epoch": 0.5321826681049981,
"grad_norm": 1.09375,
"learning_rate": 4.2420749279538904e-05,
"loss": 0.4496,
"step": 185
},
{
"epoch": 0.5465659834591874,
"grad_norm": 1.3125,
"learning_rate": 4.357348703170029e-05,
"loss": 0.4775,
"step": 190
},
{
"epoch": 0.5609492988133765,
"grad_norm": 1.3046875,
"learning_rate": 4.4726224783861675e-05,
"loss": 0.4772,
"step": 195
},
{
"epoch": 0.5753326141675656,
"grad_norm": 1.359375,
"learning_rate": 4.587896253602306e-05,
"loss": 0.5522,
"step": 200
},
{
"epoch": 0.5897159295217548,
"grad_norm": 1.1875,
"learning_rate": 4.703170028818444e-05,
"loss": 0.6477,
"step": 205
},
{
"epoch": 0.6040992448759439,
"grad_norm": 1.5390625,
"learning_rate": 4.818443804034583e-05,
"loss": 0.459,
"step": 210
},
{
"epoch": 0.618482560230133,
"grad_norm": 1.2734375,
"learning_rate": 4.933717579250721e-05,
"loss": 0.4726,
"step": 215
},
{
"epoch": 0.6328658755843222,
"grad_norm": 1.7421875,
"learning_rate": 5.048991354466859e-05,
"loss": 0.5411,
"step": 220
},
{
"epoch": 0.6472491909385113,
"grad_norm": 1.21875,
"learning_rate": 5.1642651296829976e-05,
"loss": 0.5301,
"step": 225
},
{
"epoch": 0.6616325062927004,
"grad_norm": 1.359375,
"learning_rate": 5.279538904899136e-05,
"loss": 0.3528,
"step": 230
},
{
"epoch": 0.6760158216468896,
"grad_norm": 1.4296875,
"learning_rate": 5.394812680115274e-05,
"loss": 0.4479,
"step": 235
},
{
"epoch": 0.6903991370010788,
"grad_norm": 1.0,
"learning_rate": 5.510086455331412e-05,
"loss": 0.6267,
"step": 240
},
{
"epoch": 0.7047824523552679,
"grad_norm": 1.0546875,
"learning_rate": 5.625360230547551e-05,
"loss": 0.4516,
"step": 245
},
{
"epoch": 0.7191657677094571,
"grad_norm": 1.0234375,
"learning_rate": 5.7406340057636895e-05,
"loss": 0.4635,
"step": 250
},
{
"epoch": 0.7335490830636462,
"grad_norm": 1.3359375,
"learning_rate": 5.855907780979827e-05,
"loss": 0.396,
"step": 255
},
{
"epoch": 0.7479323984178353,
"grad_norm": 1.3203125,
"learning_rate": 5.971181556195966e-05,
"loss": 0.4695,
"step": 260
},
{
"epoch": 0.7623157137720245,
"grad_norm": 0.96484375,
"learning_rate": 6.086455331412104e-05,
"loss": 0.4026,
"step": 265
},
{
"epoch": 0.7766990291262136,
"grad_norm": 1.0,
"learning_rate": 6.201729106628243e-05,
"loss": 0.4413,
"step": 270
},
{
"epoch": 0.7910823444804027,
"grad_norm": 1.265625,
"learning_rate": 6.317002881844381e-05,
"loss": 0.5513,
"step": 275
},
{
"epoch": 0.8054656598345918,
"grad_norm": 1.25,
"learning_rate": 6.43227665706052e-05,
"loss": 0.4092,
"step": 280
},
{
"epoch": 0.819848975188781,
"grad_norm": 1.0703125,
"learning_rate": 6.547550432276658e-05,
"loss": 0.4538,
"step": 285
},
{
"epoch": 0.8342322905429701,
"grad_norm": 1.1640625,
"learning_rate": 6.662824207492796e-05,
"loss": 0.4408,
"step": 290
},
{
"epoch": 0.8486156058971593,
"grad_norm": 0.8046875,
"learning_rate": 6.778097982708934e-05,
"loss": 0.5171,
"step": 295
},
{
"epoch": 0.8629989212513485,
"grad_norm": 1.1328125,
"learning_rate": 6.893371757925073e-05,
"loss": 0.4662,
"step": 300
},
{
"epoch": 0.8773822366055376,
"grad_norm": 1.296875,
"learning_rate": 7.008645533141211e-05,
"loss": 0.4287,
"step": 305
},
{
"epoch": 0.8917655519597267,
"grad_norm": 0.98828125,
"learning_rate": 7.123919308357349e-05,
"loss": 0.6147,
"step": 310
},
{
"epoch": 0.9061488673139159,
"grad_norm": 1.4453125,
"learning_rate": 7.239193083573487e-05,
"loss": 0.5896,
"step": 315
},
{
"epoch": 0.920532182668105,
"grad_norm": 1.7734375,
"learning_rate": 7.354466858789627e-05,
"loss": 0.4768,
"step": 320
},
{
"epoch": 0.9349154980222941,
"grad_norm": 0.76953125,
"learning_rate": 7.469740634005764e-05,
"loss": 0.4511,
"step": 325
},
{
"epoch": 0.9492988133764833,
"grad_norm": 1.234375,
"learning_rate": 7.585014409221902e-05,
"loss": 0.5281,
"step": 330
},
{
"epoch": 0.9636821287306724,
"grad_norm": 0.84375,
"learning_rate": 7.700288184438042e-05,
"loss": 0.4953,
"step": 335
},
{
"epoch": 0.9780654440848615,
"grad_norm": 1.2109375,
"learning_rate": 7.81556195965418e-05,
"loss": 0.5922,
"step": 340
},
{
"epoch": 0.9924487594390508,
"grad_norm": 1.375,
"learning_rate": 7.930835734870318e-05,
"loss": 0.4489,
"step": 345
},
{
"epoch": 1.0086299892125135,
"grad_norm": 1.34375,
"learning_rate": 7.999991904463832e-05,
"loss": 0.5064,
"step": 350
},
{
"epoch": 1.0230133045667027,
"grad_norm": 1.2890625,
"learning_rate": 7.999900830058266e-05,
"loss": 0.5271,
"step": 355
},
{
"epoch": 1.0373966199208917,
"grad_norm": 1.4296875,
"learning_rate": 7.999708564138649e-05,
"loss": 0.6233,
"step": 360
},
{
"epoch": 1.051779935275081,
"grad_norm": 0.97265625,
"learning_rate": 7.999415111569024e-05,
"loss": 0.5166,
"step": 365
},
{
"epoch": 1.06616325062927,
"grad_norm": 2.359375,
"learning_rate": 7.999020479773298e-05,
"loss": 0.5777,
"step": 370
},
{
"epoch": 1.0805465659834592,
"grad_norm": 1.1796875,
"learning_rate": 7.998524678735071e-05,
"loss": 0.4766,
"step": 375
},
{
"epoch": 1.0949298813376482,
"grad_norm": 1.2578125,
"learning_rate": 7.997927720997366e-05,
"loss": 0.4743,
"step": 380
},
{
"epoch": 1.1093131966918375,
"grad_norm": 0.76953125,
"learning_rate": 7.997229621662321e-05,
"loss": 0.348,
"step": 385
},
{
"epoch": 1.1236965120460267,
"grad_norm": 0.8828125,
"learning_rate": 7.996430398390805e-05,
"loss": 0.4763,
"step": 390
},
{
"epoch": 1.1380798274002157,
"grad_norm": 1.1015625,
"learning_rate": 7.995530071401977e-05,
"loss": 0.5138,
"step": 395
},
{
"epoch": 1.152463142754405,
"grad_norm": 0.76953125,
"learning_rate": 7.994528663472761e-05,
"loss": 0.5649,
"step": 400
},
{
"epoch": 1.166846458108594,
"grad_norm": 0.91015625,
"learning_rate": 7.993426199937281e-05,
"loss": 0.4292,
"step": 405
},
{
"epoch": 1.1812297734627832,
"grad_norm": 1.0,
"learning_rate": 7.992222708686218e-05,
"loss": 0.4659,
"step": 410
},
{
"epoch": 1.1956130888169723,
"grad_norm": 0.7109375,
"learning_rate": 7.990918220166104e-05,
"loss": 0.4227,
"step": 415
},
{
"epoch": 1.2099964041711615,
"grad_norm": 0.9296875,
"learning_rate": 7.989512767378545e-05,
"loss": 0.4776,
"step": 420
},
{
"epoch": 1.2243797195253505,
"grad_norm": 1.015625,
"learning_rate": 7.9880063858794e-05,
"loss": 0.4265,
"step": 425
},
{
"epoch": 1.2387630348795398,
"grad_norm": 0.9140625,
"learning_rate": 7.98639911377787e-05,
"loss": 0.3607,
"step": 430
},
{
"epoch": 1.2531463502337288,
"grad_norm": 1.078125,
"learning_rate": 7.984690991735535e-05,
"loss": 0.5326,
"step": 435
},
{
"epoch": 1.267529665587918,
"grad_norm": 0.6875,
"learning_rate": 7.982882062965334e-05,
"loss": 0.5541,
"step": 440
},
{
"epoch": 1.2819129809421073,
"grad_norm": 0.7421875,
"learning_rate": 7.980972373230456e-05,
"loss": 0.543,
"step": 445
},
{
"epoch": 1.2962962962962963,
"grad_norm": 5.21875,
"learning_rate": 7.978961970843204e-05,
"loss": 0.4757,
"step": 450
},
{
"epoch": 1.3106796116504853,
"grad_norm": 1.2421875,
"learning_rate": 7.97685090666375e-05,
"loss": 0.5024,
"step": 455
},
{
"epoch": 1.3250629270046745,
"grad_norm": 1.5234375,
"learning_rate": 7.974639234098866e-05,
"loss": 0.5356,
"step": 460
},
{
"epoch": 1.3394462423588638,
"grad_norm": 0.79296875,
"learning_rate": 7.972327009100561e-05,
"loss": 0.5277,
"step": 465
},
{
"epoch": 1.3538295577130528,
"grad_norm": 1.1796875,
"learning_rate": 7.969914290164673e-05,
"loss": 0.5583,
"step": 470
},
{
"epoch": 1.368212873067242,
"grad_norm": 1.015625,
"learning_rate": 7.967401138329387e-05,
"loss": 0.5048,
"step": 475
},
{
"epoch": 1.382596188421431,
"grad_norm": 1.1328125,
"learning_rate": 7.964787617173687e-05,
"loss": 0.4426,
"step": 480
},
{
"epoch": 1.3969795037756203,
"grad_norm": 1.0546875,
"learning_rate": 7.962073792815756e-05,
"loss": 0.3894,
"step": 485
},
{
"epoch": 1.4113628191298093,
"grad_norm": 0.94921875,
"learning_rate": 7.959259733911291e-05,
"loss": 0.5437,
"step": 490
},
{
"epoch": 1.4257461344839986,
"grad_norm": 0.94921875,
"learning_rate": 7.956345511651779e-05,
"loss": 0.5329,
"step": 495
},
{
"epoch": 1.4401294498381878,
"grad_norm": 0.80078125,
"learning_rate": 7.95333119976269e-05,
"loss": 0.4593,
"step": 500
},
{
"epoch": 1.4545127651923768,
"grad_norm": 3.046875,
"learning_rate": 7.950216874501609e-05,
"loss": 0.4531,
"step": 505
},
{
"epoch": 1.468896080546566,
"grad_norm": 4.875,
"learning_rate": 7.947002614656313e-05,
"loss": 0.415,
"step": 510
},
{
"epoch": 1.483279395900755,
"grad_norm": 1.0625,
"learning_rate": 7.94368850154277e-05,
"loss": 0.369,
"step": 515
},
{
"epoch": 1.4976627112549443,
"grad_norm": 1.1640625,
"learning_rate": 7.940274619003093e-05,
"loss": 0.4272,
"step": 520
},
{
"epoch": 1.5120460266091333,
"grad_norm": 0.91015625,
"learning_rate": 7.936761053403407e-05,
"loss": 0.4839,
"step": 525
},
{
"epoch": 1.5264293419633226,
"grad_norm": 1.1796875,
"learning_rate": 7.933147893631673e-05,
"loss": 0.6706,
"step": 530
},
{
"epoch": 1.5408126573175118,
"grad_norm": 0.71875,
"learning_rate": 7.929435231095433e-05,
"loss": 0.4209,
"step": 535
},
{
"epoch": 1.5551959726717008,
"grad_norm": 1.15625,
"learning_rate": 7.925623159719501e-05,
"loss": 0.4142,
"step": 540
},
{
"epoch": 1.5695792880258899,
"grad_norm": 1.4140625,
"learning_rate": 7.921711775943588e-05,
"loss": 0.4872,
"step": 545
},
{
"epoch": 1.583962603380079,
"grad_norm": 7.71875,
"learning_rate": 7.917701178719857e-05,
"loss": 0.3773,
"step": 550
},
{
"epoch": 1.5983459187342683,
"grad_norm": 2.921875,
"learning_rate": 7.913591469510427e-05,
"loss": 0.5732,
"step": 555
},
{
"epoch": 1.6127292340884574,
"grad_norm": 1.1171875,
"learning_rate": 7.909382752284797e-05,
"loss": 0.362,
"step": 560
},
{
"epoch": 1.6271125494426464,
"grad_norm": 0.86328125,
"learning_rate": 7.905075133517227e-05,
"loss": 0.508,
"step": 565
},
{
"epoch": 1.6414958647968356,
"grad_norm": 24.625,
"learning_rate": 7.900668722184032e-05,
"loss": 0.3889,
"step": 570
},
{
"epoch": 1.6558791801510249,
"grad_norm": 9.75,
"learning_rate": 7.896163629760837e-05,
"loss": 0.5928,
"step": 575
},
{
"epoch": 1.6702624955052139,
"grad_norm": 0.78515625,
"learning_rate": 7.891559970219747e-05,
"loss": 0.3379,
"step": 580
},
{
"epoch": 1.6846458108594031,
"grad_norm": 1.03125,
"learning_rate": 7.886857860026471e-05,
"loss": 0.4557,
"step": 585
},
{
"epoch": 1.6990291262135924,
"grad_norm": 1.328125,
"learning_rate": 7.882057418137369e-05,
"loss": 0.5023,
"step": 590
},
{
"epoch": 1.7134124415677814,
"grad_norm": 0.71484375,
"learning_rate": 7.877158765996448e-05,
"loss": 0.3935,
"step": 595
},
{
"epoch": 1.7277957569219704,
"grad_norm": 0.93359375,
"learning_rate": 7.872162027532287e-05,
"loss": 0.6483,
"step": 600
},
{
"epoch": 1.7421790722761596,
"grad_norm": 0.91796875,
"learning_rate": 7.867067329154902e-05,
"loss": 0.5306,
"step": 605
},
{
"epoch": 1.7565623876303489,
"grad_norm": 0.9140625,
"learning_rate": 7.861874799752552e-05,
"loss": 0.4876,
"step": 610
},
{
"epoch": 1.770945702984538,
"grad_norm": 0.80859375,
"learning_rate": 7.856584570688468e-05,
"loss": 0.3126,
"step": 615
},
{
"epoch": 1.785329018338727,
"grad_norm": 0.765625,
"learning_rate": 7.851196775797542e-05,
"loss": 0.4426,
"step": 620
},
{
"epoch": 1.7997123336929162,
"grad_norm": 0.9296875,
"learning_rate": 7.845711551382935e-05,
"loss": 0.3864,
"step": 625
},
{
"epoch": 1.8140956490471054,
"grad_norm": 0.79296875,
"learning_rate": 7.840129036212625e-05,
"loss": 0.4811,
"step": 630
},
{
"epoch": 1.8284789644012944,
"grad_norm": 0.796875,
"learning_rate": 7.83444937151591e-05,
"loss": 0.4156,
"step": 635
},
{
"epoch": 1.8428622797554837,
"grad_norm": 0.7734375,
"learning_rate": 7.828672700979812e-05,
"loss": 0.4054,
"step": 640
},
{
"epoch": 1.857245595109673,
"grad_norm": 1.828125,
"learning_rate": 7.82279917074547e-05,
"loss": 0.465,
"step": 645
},
{
"epoch": 1.871628910463862,
"grad_norm": 2.140625,
"learning_rate": 7.81682892940442e-05,
"loss": 0.4299,
"step": 650
},
{
"epoch": 1.886012225818051,
"grad_norm": 1.4453125,
"learning_rate": 7.810762127994846e-05,
"loss": 0.5449,
"step": 655
},
{
"epoch": 1.9003955411722402,
"grad_norm": 1.046875,
"learning_rate": 7.804598919997757e-05,
"loss": 0.4492,
"step": 660
},
{
"epoch": 1.9147788565264294,
"grad_norm": 0.828125,
"learning_rate": 7.798339461333111e-05,
"loss": 0.441,
"step": 665
},
{
"epoch": 1.9291621718806184,
"grad_norm": 0.796875,
"learning_rate": 7.791983910355854e-05,
"loss": 0.542,
"step": 670
},
{
"epoch": 1.9435454872348075,
"grad_norm": 0.91796875,
"learning_rate": 7.78553242785193e-05,
"loss": 0.3236,
"step": 675
},
{
"epoch": 1.9579288025889967,
"grad_norm": 1.1875,
"learning_rate": 7.778985177034207e-05,
"loss": 0.427,
"step": 680
},
{
"epoch": 1.972312117943186,
"grad_norm": 0.8515625,
"learning_rate": 7.772342323538345e-05,
"loss": 0.3841,
"step": 685
},
{
"epoch": 1.986695433297375,
"grad_norm": 1.2265625,
"learning_rate": 7.765604035418614e-05,
"loss": 0.4994,
"step": 690
},
{
"epoch": 2.002876663070838,
"grad_norm": 1.6796875,
"learning_rate": 7.758770483143634e-05,
"loss": 0.509,
"step": 695
},
{
"epoch": 2.017259978425027,
"grad_norm": 0.73046875,
"learning_rate": 7.751841839592065e-05,
"loss": 0.3722,
"step": 700
},
{
"epoch": 2.031643293779216,
"grad_norm": 0.87109375,
"learning_rate": 7.744818280048237e-05,
"loss": 0.3668,
"step": 705
},
{
"epoch": 2.0460266091334054,
"grad_norm": 1.0703125,
"learning_rate": 7.737699982197711e-05,
"loss": 0.578,
"step": 710
},
{
"epoch": 2.0604099244875944,
"grad_norm": 2.375,
"learning_rate": 7.730487126122784e-05,
"loss": 0.5856,
"step": 715
},
{
"epoch": 2.0747932398417834,
"grad_norm": 0.9140625,
"learning_rate": 7.72317989429794e-05,
"loss": 0.4909,
"step": 720
},
{
"epoch": 2.089176555195973,
"grad_norm": 0.96484375,
"learning_rate": 7.715778471585223e-05,
"loss": 0.3753,
"step": 725
},
{
"epoch": 2.103559870550162,
"grad_norm": 0.76953125,
"learning_rate": 7.708283045229568e-05,
"loss": 0.4519,
"step": 730
},
{
"epoch": 2.117943185904351,
"grad_norm": 1.0078125,
"learning_rate": 7.700693804854062e-05,
"loss": 0.405,
"step": 735
},
{
"epoch": 2.13232650125854,
"grad_norm": 1.0,
"learning_rate": 7.693010942455146e-05,
"loss": 0.3957,
"step": 740
},
{
"epoch": 2.1467098166127294,
"grad_norm": 0.59765625,
"learning_rate": 7.685234652397758e-05,
"loss": 0.286,
"step": 745
},
{
"epoch": 2.1610931319669184,
"grad_norm": 0.9921875,
"learning_rate": 7.677365131410418e-05,
"loss": 0.5922,
"step": 750
},
{
"epoch": 2.1754764473211075,
"grad_norm": 0.90234375,
"learning_rate": 7.669402578580246e-05,
"loss": 0.4268,
"step": 755
},
{
"epoch": 2.1898597626752965,
"grad_norm": 0.9609375,
"learning_rate": 7.661347195347932e-05,
"loss": 0.4558,
"step": 760
},
{
"epoch": 2.204243078029486,
"grad_norm": 0.8046875,
"learning_rate": 7.653199185502631e-05,
"loss": 0.3913,
"step": 765
},
{
"epoch": 2.218626393383675,
"grad_norm": 0.66015625,
"learning_rate": 7.644958755176822e-05,
"loss": 0.5205,
"step": 770
},
{
"epoch": 2.233009708737864,
"grad_norm": 0.83984375,
"learning_rate": 7.636626112841076e-05,
"loss": 0.359,
"step": 775
},
{
"epoch": 2.2473930240920534,
"grad_norm": 0.8046875,
"learning_rate": 7.628201469298793e-05,
"loss": 0.4881,
"step": 780
},
{
"epoch": 2.2617763394462425,
"grad_norm": 0.69140625,
"learning_rate": 7.619685037680867e-05,
"loss": 0.4995,
"step": 785
},
{
"epoch": 2.2761596548004315,
"grad_norm": 0.8515625,
"learning_rate": 7.61107703344029e-05,
"loss": 0.4753,
"step": 790
},
{
"epoch": 2.2905429701546205,
"grad_norm": 0.6796875,
"learning_rate": 7.602377674346707e-05,
"loss": 0.3069,
"step": 795
},
{
"epoch": 2.30492628550881,
"grad_norm": 0.73046875,
"learning_rate": 7.593587180480907e-05,
"loss": 0.4076,
"step": 800
},
{
"epoch": 2.319309600862999,
"grad_norm": 1.90625,
"learning_rate": 7.584705774229247e-05,
"loss": 0.3591,
"step": 805
},
{
"epoch": 2.333692916217188,
"grad_norm": 0.73046875,
"learning_rate": 7.575733680278031e-05,
"loss": 0.3701,
"step": 810
},
{
"epoch": 2.348076231571377,
"grad_norm": 0.8125,
"learning_rate": 7.566671125607833e-05,
"loss": 0.6058,
"step": 815
},
{
"epoch": 2.3624595469255665,
"grad_norm": 0.92578125,
"learning_rate": 7.557518339487744e-05,
"loss": 0.3187,
"step": 820
},
{
"epoch": 2.3768428622797555,
"grad_norm": 0.68359375,
"learning_rate": 7.548275553469575e-05,
"loss": 0.3917,
"step": 825
},
{
"epoch": 2.3912261776339445,
"grad_norm": 1.046875,
"learning_rate": 7.538943001382001e-05,
"loss": 0.392,
"step": 830
},
{
"epoch": 2.405609492988134,
"grad_norm": 0.6484375,
"learning_rate": 7.529520919324646e-05,
"loss": 0.307,
"step": 835
},
{
"epoch": 2.419992808342323,
"grad_norm": 0.78125,
"learning_rate": 7.520009545662104e-05,
"loss": 0.4457,
"step": 840
},
{
"epoch": 2.434376123696512,
"grad_norm": 0.86328125,
"learning_rate": 7.510409121017918e-05,
"loss": 0.4218,
"step": 845
},
{
"epoch": 2.448759439050701,
"grad_norm": 0.76171875,
"learning_rate": 7.500719888268487e-05,
"loss": 0.3575,
"step": 850
},
{
"epoch": 2.4631427544048905,
"grad_norm": 0.8984375,
"learning_rate": 7.490942092536918e-05,
"loss": 0.4674,
"step": 855
},
{
"epoch": 2.4775260697590795,
"grad_norm": 0.796875,
"learning_rate": 7.481075981186835e-05,
"loss": 0.3308,
"step": 860
},
{
"epoch": 2.4919093851132685,
"grad_norm": 0.80078125,
"learning_rate": 7.471121803816112e-05,
"loss": 0.4612,
"step": 865
},
{
"epoch": 2.5062927004674576,
"grad_norm": 0.71875,
"learning_rate": 7.461079812250559e-05,
"loss": 0.3943,
"step": 870
},
{
"epoch": 2.520676015821647,
"grad_norm": 1.0,
"learning_rate": 7.450950260537561e-05,
"loss": 0.2894,
"step": 875
},
{
"epoch": 2.535059331175836,
"grad_norm": 0.71484375,
"learning_rate": 7.44073340493964e-05,
"loss": 0.5447,
"step": 880
},
{
"epoch": 2.549442646530025,
"grad_norm": 0.75,
"learning_rate": 7.430429503927974e-05,
"loss": 0.4356,
"step": 885
},
{
"epoch": 2.5638259618842145,
"grad_norm": 0.66796875,
"learning_rate": 7.420038818175862e-05,
"loss": 0.4192,
"step": 890
},
{
"epoch": 2.5782092772384035,
"grad_norm": 0.84765625,
"learning_rate": 7.409561610552127e-05,
"loss": 0.4312,
"step": 895
},
{
"epoch": 2.5925925925925926,
"grad_norm": 0.9453125,
"learning_rate": 7.398998146114468e-05,
"loss": 0.3847,
"step": 900
},
{
"epoch": 2.6069759079467816,
"grad_norm": 1.0390625,
"learning_rate": 7.388348692102748e-05,
"loss": 0.3862,
"step": 905
},
{
"epoch": 2.6213592233009706,
"grad_norm": 0.7109375,
"learning_rate": 7.37761351793224e-05,
"loss": 0.4877,
"step": 910
},
{
"epoch": 2.63574253865516,
"grad_norm": 0.6328125,
"learning_rate": 7.366792895186812e-05,
"loss": 0.4786,
"step": 915
},
{
"epoch": 2.650125854009349,
"grad_norm": 1.0546875,
"learning_rate": 7.355887097612048e-05,
"loss": 0.3456,
"step": 920
},
{
"epoch": 2.664509169363538,
"grad_norm": 0.671875,
"learning_rate": 7.344896401108331e-05,
"loss": 0.3659,
"step": 925
},
{
"epoch": 2.6788924847177276,
"grad_norm": 0.7734375,
"learning_rate": 7.333821083723861e-05,
"loss": 0.4014,
"step": 930
},
{
"epoch": 2.6932758000719166,
"grad_norm": 0.98828125,
"learning_rate": 7.322661425647618e-05,
"loss": 0.3648,
"step": 935
},
{
"epoch": 2.7076591154261056,
"grad_norm": 0.7578125,
"learning_rate": 7.311417709202273e-05,
"loss": 0.3891,
"step": 940
},
{
"epoch": 2.722042430780295,
"grad_norm": 0.75,
"learning_rate": 7.300090218837052e-05,
"loss": 0.4091,
"step": 945
},
{
"epoch": 2.736425746134484,
"grad_norm": 0.734375,
"learning_rate": 7.288679241120537e-05,
"loss": 0.3839,
"step": 950
},
{
"epoch": 2.750809061488673,
"grad_norm": 0.85546875,
"learning_rate": 7.27718506473341e-05,
"loss": 0.4808,
"step": 955
},
{
"epoch": 2.765192376842862,
"grad_norm": 0.8046875,
"learning_rate": 7.265607980461161e-05,
"loss": 0.4709,
"step": 960
},
{
"epoch": 2.7795756921970516,
"grad_norm": 0.98046875,
"learning_rate": 7.253948281186722e-05,
"loss": 0.4579,
"step": 965
},
{
"epoch": 2.7939590075512406,
"grad_norm": 1.203125,
"learning_rate": 7.242206261883059e-05,
"loss": 0.413,
"step": 970
},
{
"epoch": 2.8083423229054296,
"grad_norm": 0.73828125,
"learning_rate": 7.23038221960572e-05,
"loss": 0.4752,
"step": 975
},
{
"epoch": 2.8227256382596186,
"grad_norm": 0.9140625,
"learning_rate": 7.2184764534853e-05,
"loss": 0.395,
"step": 980
},
{
"epoch": 2.837108953613808,
"grad_norm": 1.1015625,
"learning_rate": 7.206489264719896e-05,
"loss": 0.4488,
"step": 985
},
{
"epoch": 2.851492268967997,
"grad_norm": 1.109375,
"learning_rate": 7.19442095656747e-05,
"loss": 0.3311,
"step": 990
},
{
"epoch": 2.865875584322186,
"grad_norm": 0.875,
"learning_rate": 7.182271834338185e-05,
"loss": 0.4682,
"step": 995
},
{
"epoch": 2.8802588996763756,
"grad_norm": 0.8515625,
"learning_rate": 7.17004220538668e-05,
"loss": 0.4539,
"step": 1000
},
{
"epoch": 2.8946422150305646,
"grad_norm": 0.7890625,
"learning_rate": 7.157732379104291e-05,
"loss": 0.5094,
"step": 1005
},
{
"epoch": 2.9090255303847536,
"grad_norm": 1.0078125,
"learning_rate": 7.145342666911231e-05,
"loss": 0.377,
"step": 1010
},
{
"epoch": 2.9234088457389427,
"grad_norm": 1.078125,
"learning_rate": 7.132873382248702e-05,
"loss": 0.4527,
"step": 1015
},
{
"epoch": 2.937792161093132,
"grad_norm": 0.6171875,
"learning_rate": 7.120324840570978e-05,
"loss": 0.3519,
"step": 1020
},
{
"epoch": 2.952175476447321,
"grad_norm": 0.78125,
"learning_rate": 7.107697359337409e-05,
"loss": 0.4042,
"step": 1025
},
{
"epoch": 2.96655879180151,
"grad_norm": 0.87109375,
"learning_rate": 7.0949912580044e-05,
"loss": 0.3048,
"step": 1030
},
{
"epoch": 2.980942107155699,
"grad_norm": 1.03125,
"learning_rate": 7.082206858017333e-05,
"loss": 0.4759,
"step": 1035
},
{
"epoch": 2.9953254225098886,
"grad_norm": 0.6796875,
"learning_rate": 7.06934448280242e-05,
"loss": 0.3526,
"step": 1040
},
{
"epoch": 3.011506652283351,
"grad_norm": 0.6640625,
"learning_rate": 7.056404457758537e-05,
"loss": 0.4516,
"step": 1045
},
{
"epoch": 3.0258899676375406,
"grad_norm": 0.9140625,
"learning_rate": 7.043387110248979e-05,
"loss": 0.4131,
"step": 1050
},
{
"epoch": 3.0402732829917296,
"grad_norm": 0.88671875,
"learning_rate": 7.030292769593188e-05,
"loss": 0.3195,
"step": 1055
},
{
"epoch": 3.0546565983459186,
"grad_norm": 0.9921875,
"learning_rate": 7.017121767058417e-05,
"loss": 0.3509,
"step": 1060
},
{
"epoch": 3.0690399137001076,
"grad_norm": 0.89453125,
"learning_rate": 7.003874435851346e-05,
"loss": 0.2716,
"step": 1065
},
{
"epoch": 3.083423229054297,
"grad_norm": 0.92578125,
"learning_rate": 6.990551111109662e-05,
"loss": 0.3962,
"step": 1070
},
{
"epoch": 3.097806544408486,
"grad_norm": 0.72265625,
"learning_rate": 6.977152129893572e-05,
"loss": 0.3924,
"step": 1075
},
{
"epoch": 3.112189859762675,
"grad_norm": 0.98828125,
"learning_rate": 6.963677831177279e-05,
"loss": 0.2921,
"step": 1080
},
{
"epoch": 3.1265731751168646,
"grad_norm": 0.84765625,
"learning_rate": 6.950128555840404e-05,
"loss": 0.3449,
"step": 1085
},
{
"epoch": 3.1409564904710536,
"grad_norm": 0.87109375,
"learning_rate": 6.93650464665937e-05,
"loss": 0.3724,
"step": 1090
},
{
"epoch": 3.1553398058252426,
"grad_norm": 1.0703125,
"learning_rate": 6.92280644829872e-05,
"loss": 0.3371,
"step": 1095
},
{
"epoch": 3.1697231211794317,
"grad_norm": 0.98046875,
"learning_rate": 6.909034307302403e-05,
"loss": 0.2712,
"step": 1100
},
{
"epoch": 3.184106436533621,
"grad_norm": 0.6796875,
"learning_rate": 6.895188572085007e-05,
"loss": 0.285,
"step": 1105
},
{
"epoch": 3.19848975188781,
"grad_norm": 0.8125,
"learning_rate": 6.881269592922945e-05,
"loss": 0.3157,
"step": 1110
},
{
"epoch": 3.212873067241999,
"grad_norm": 0.80078125,
"learning_rate": 6.867277721945589e-05,
"loss": 0.367,
"step": 1115
},
{
"epoch": 3.227256382596188,
"grad_norm": 0.86328125,
"learning_rate": 6.853213313126369e-05,
"loss": 0.4571,
"step": 1120
},
{
"epoch": 3.2416396979503777,
"grad_norm": 0.9296875,
"learning_rate": 6.839076722273811e-05,
"loss": 0.3605,
"step": 1125
},
{
"epoch": 3.2560230133045667,
"grad_norm": 1.0390625,
"learning_rate": 6.82486830702254e-05,
"loss": 0.3656,
"step": 1130
},
{
"epoch": 3.2704063286587557,
"grad_norm": 0.953125,
"learning_rate": 6.810588426824229e-05,
"loss": 0.3539,
"step": 1135
},
{
"epoch": 3.284789644012945,
"grad_norm": 1.046875,
"learning_rate": 6.79623744293851e-05,
"loss": 0.4571,
"step": 1140
},
{
"epoch": 3.299172959367134,
"grad_norm": 0.81640625,
"learning_rate": 6.781815718423833e-05,
"loss": 0.5333,
"step": 1145
},
{
"epoch": 3.313556274721323,
"grad_norm": 1.2109375,
"learning_rate": 6.767323618128277e-05,
"loss": 0.4508,
"step": 1150
},
{
"epoch": 3.3279395900755127,
"grad_norm": 0.97265625,
"learning_rate": 6.752761508680322e-05,
"loss": 0.3443,
"step": 1155
},
{
"epoch": 3.3423229054297017,
"grad_norm": 0.75390625,
"learning_rate": 6.738129758479579e-05,
"loss": 0.285,
"step": 1160
},
{
"epoch": 3.3567062207838907,
"grad_norm": 0.8671875,
"learning_rate": 6.723428737687466e-05,
"loss": 0.2679,
"step": 1165
},
{
"epoch": 3.3710895361380797,
"grad_norm": 0.92578125,
"learning_rate": 6.708658818217839e-05,
"loss": 0.2944,
"step": 1170
},
{
"epoch": 3.3854728514922687,
"grad_norm": 0.95703125,
"learning_rate": 6.69382037372759e-05,
"loss": 0.348,
"step": 1175
},
{
"epoch": 3.399856166846458,
"grad_norm": 0.796875,
"learning_rate": 6.678913779607194e-05,
"loss": 0.4132,
"step": 1180
},
{
"epoch": 3.414239482200647,
"grad_norm": 1.0390625,
"learning_rate": 6.663939412971209e-05,
"loss": 0.4183,
"step": 1185
},
{
"epoch": 3.4286227975548362,
"grad_norm": 1.015625,
"learning_rate": 6.64889765264873e-05,
"loss": 0.433,
"step": 1190
},
{
"epoch": 3.4430061129090257,
"grad_norm": 0.7265625,
"learning_rate": 6.633788879173819e-05,
"loss": 0.3068,
"step": 1195
},
{
"epoch": 3.4573894282632147,
"grad_norm": 1.1875,
"learning_rate": 6.618613474775872e-05,
"loss": 0.2744,
"step": 1200
},
{
"epoch": 3.4717727436174037,
"grad_norm": 0.890625,
"learning_rate": 6.603371823369939e-05,
"loss": 0.3557,
"step": 1205
},
{
"epoch": 3.486156058971593,
"grad_norm": 0.7109375,
"learning_rate": 6.588064310547026e-05,
"loss": 0.3276,
"step": 1210
},
{
"epoch": 3.500539374325782,
"grad_norm": 0.71875,
"learning_rate": 6.572691323564337e-05,
"loss": 0.2779,
"step": 1215
},
{
"epoch": 3.5149226896799712,
"grad_norm": 1.0859375,
"learning_rate": 6.557253251335468e-05,
"loss": 0.3275,
"step": 1220
},
{
"epoch": 3.5293060050341603,
"grad_norm": 0.73828125,
"learning_rate": 6.541750484420579e-05,
"loss": 0.3646,
"step": 1225
},
{
"epoch": 3.5436893203883493,
"grad_norm": 0.84765625,
"learning_rate": 6.526183415016509e-05,
"loss": 0.3642,
"step": 1230
},
{
"epoch": 3.5580726357425387,
"grad_norm": 0.71875,
"learning_rate": 6.510552436946848e-05,
"loss": 0.445,
"step": 1235
},
{
"epoch": 3.5724559510967278,
"grad_norm": 0.87109375,
"learning_rate": 6.494857945651989e-05,
"loss": 0.3285,
"step": 1240
},
{
"epoch": 3.5868392664509168,
"grad_norm": 0.796875,
"learning_rate": 6.479100338179107e-05,
"loss": 0.2749,
"step": 1245
},
{
"epoch": 3.6012225818051062,
"grad_norm": 0.7265625,
"learning_rate": 6.463280013172127e-05,
"loss": 0.3884,
"step": 1250
},
{
"epoch": 3.6156058971592953,
"grad_norm": 0.93359375,
"learning_rate": 6.447397370861629e-05,
"loss": 0.3783,
"step": 1255
},
{
"epoch": 3.6299892125134843,
"grad_norm": 0.76953125,
"learning_rate": 6.431452813054732e-05,
"loss": 0.5526,
"step": 1260
},
{
"epoch": 3.6443725278676737,
"grad_norm": 0.85546875,
"learning_rate": 6.415446743124923e-05,
"loss": 0.283,
"step": 1265
},
{
"epoch": 3.6587558432218628,
"grad_norm": 1.125,
"learning_rate": 6.399379566001855e-05,
"loss": 0.3515,
"step": 1270
},
{
"epoch": 3.6731391585760518,
"grad_norm": 0.78125,
"learning_rate": 6.383251688161098e-05,
"loss": 0.3057,
"step": 1275
},
{
"epoch": 3.687522473930241,
"grad_norm": 0.81640625,
"learning_rate": 6.367063517613863e-05,
"loss": 0.3184,
"step": 1280
},
{
"epoch": 3.70190578928443,
"grad_norm": 0.703125,
"learning_rate": 6.350815463896675e-05,
"loss": 0.3487,
"step": 1285
},
{
"epoch": 3.7162891046386193,
"grad_norm": 0.765625,
"learning_rate": 6.334507938061017e-05,
"loss": 0.3494,
"step": 1290
},
{
"epoch": 3.7306724199928083,
"grad_norm": 0.90625,
"learning_rate": 6.31814135266292e-05,
"loss": 0.416,
"step": 1295
},
{
"epoch": 3.7450557353469973,
"grad_norm": 0.90625,
"learning_rate": 6.30171612175254e-05,
"loss": 0.4346,
"step": 1300
},
{
"epoch": 3.759439050701187,
"grad_norm": 0.76953125,
"learning_rate": 6.285232660863676e-05,
"loss": 0.3811,
"step": 1305
},
{
"epoch": 3.773822366055376,
"grad_norm": 0.7578125,
"learning_rate": 6.268691387003258e-05,
"loss": 0.3118,
"step": 1310
},
{
"epoch": 3.788205681409565,
"grad_norm": 0.75390625,
"learning_rate": 6.252092718640795e-05,
"loss": 0.3887,
"step": 1315
},
{
"epoch": 3.8025889967637543,
"grad_norm": 0.78515625,
"learning_rate": 6.235437075697797e-05,
"loss": 0.2996,
"step": 1320
},
{
"epoch": 3.8169723121179433,
"grad_norm": 0.609375,
"learning_rate": 6.218724879537141e-05,
"loss": 0.2867,
"step": 1325
},
{
"epoch": 3.8313556274721323,
"grad_norm": 0.72265625,
"learning_rate": 6.201956552952415e-05,
"loss": 0.2422,
"step": 1330
},
{
"epoch": 3.8457389428263213,
"grad_norm": 0.8984375,
"learning_rate": 6.185132520157228e-05,
"loss": 0.2694,
"step": 1335
},
{
"epoch": 3.8601222581805104,
"grad_norm": 0.9140625,
"learning_rate": 6.16825320677447e-05,
"loss": 0.4468,
"step": 1340
},
{
"epoch": 3.8745055735347,
"grad_norm": 0.796875,
"learning_rate": 6.151319039825545e-05,
"loss": 0.3772,
"step": 1345
},
{
"epoch": 3.888888888888889,
"grad_norm": 0.8203125,
"learning_rate": 6.134330447719575e-05,
"loss": 0.4678,
"step": 1350
},
{
"epoch": 3.903272204243078,
"grad_norm": 0.9609375,
"learning_rate": 6.117287860242553e-05,
"loss": 0.3393,
"step": 1355
},
{
"epoch": 3.9176555195972673,
"grad_norm": 0.6875,
"learning_rate": 6.100191708546476e-05,
"loss": 0.2338,
"step": 1360
},
{
"epoch": 3.9320388349514563,
"grad_norm": 0.84375,
"learning_rate": 6.083042425138437e-05,
"loss": 0.3768,
"step": 1365
},
{
"epoch": 3.9464221503056454,
"grad_norm": 0.78515625,
"learning_rate": 6.065840443869678e-05,
"loss": 0.4026,
"step": 1370
},
{
"epoch": 3.960805465659835,
"grad_norm": 0.75,
"learning_rate": 6.0485861999246235e-05,
"loss": 0.2708,
"step": 1375
},
{
"epoch": 3.975188781014024,
"grad_norm": 0.60546875,
"learning_rate": 6.03128012980986e-05,
"loss": 0.3323,
"step": 1380
},
{
"epoch": 3.989572096368213,
"grad_norm": 0.8359375,
"learning_rate": 6.0139226713431036e-05,
"loss": 0.4781,
"step": 1385
},
{
"epoch": 4.005753326141676,
"grad_norm": 0.9140625,
"learning_rate": 5.996514263642114e-05,
"loss": 0.3732,
"step": 1390
},
{
"epoch": 4.020136641495864,
"grad_norm": 0.5625,
"learning_rate": 5.9790553471135976e-05,
"loss": 0.1693,
"step": 1395
},
{
"epoch": 4.034519956850054,
"grad_norm": 1.1953125,
"learning_rate": 5.96154636344205e-05,
"loss": 0.2256,
"step": 1400
},
{
"epoch": 4.048903272204243,
"grad_norm": 1.125,
"learning_rate": 5.943987755578596e-05,
"loss": 0.2382,
"step": 1405
},
{
"epoch": 4.063286587558432,
"grad_norm": 0.90234375,
"learning_rate": 5.9263799677297774e-05,
"loss": 0.3122,
"step": 1410
},
{
"epoch": 4.077669902912621,
"grad_norm": 0.71875,
"learning_rate": 5.9087234453463166e-05,
"loss": 0.1377,
"step": 1415
},
{
"epoch": 4.092053218266811,
"grad_norm": 0.98828125,
"learning_rate": 5.891018635111845e-05,
"loss": 0.2662,
"step": 1420
},
{
"epoch": 4.106436533620999,
"grad_norm": 1.1015625,
"learning_rate": 5.873265984931606e-05,
"loss": 0.3659,
"step": 1425
},
{
"epoch": 4.120819848975189,
"grad_norm": 0.91796875,
"learning_rate": 5.855465943921123e-05,
"loss": 0.1675,
"step": 1430
},
{
"epoch": 4.135203164329378,
"grad_norm": 1.0234375,
"learning_rate": 5.837618962394834e-05,
"loss": 0.2156,
"step": 1435
},
{
"epoch": 4.149586479683567,
"grad_norm": 1.59375,
"learning_rate": 5.819725491854703e-05,
"loss": 0.3444,
"step": 1440
},
{
"epoch": 4.163969795037756,
"grad_norm": 0.93359375,
"learning_rate": 5.801785984978798e-05,
"loss": 0.2781,
"step": 1445
},
{
"epoch": 4.178353110391946,
"grad_norm": 0.91015625,
"learning_rate": 5.7838008956098366e-05,
"loss": 0.3323,
"step": 1450
},
{
"epoch": 4.192736425746134,
"grad_norm": 1.234375,
"learning_rate": 5.765770678743704e-05,
"loss": 0.2281,
"step": 1455
},
{
"epoch": 4.207119741100324,
"grad_norm": 0.73046875,
"learning_rate": 5.747695790517947e-05,
"loss": 0.2109,
"step": 1460
},
{
"epoch": 4.221503056454512,
"grad_norm": 1.2890625,
"learning_rate": 5.729576688200226e-05,
"loss": 0.3733,
"step": 1465
},
{
"epoch": 4.235886371808702,
"grad_norm": 1.0703125,
"learning_rate": 5.711413830176756e-05,
"loss": 0.2337,
"step": 1470
},
{
"epoch": 4.250269687162891,
"grad_norm": 0.9453125,
"learning_rate": 5.693207675940706e-05,
"loss": 0.3023,
"step": 1475
},
{
"epoch": 4.26465300251708,
"grad_norm": 1.4765625,
"learning_rate": 5.674958686080571e-05,
"loss": 0.3092,
"step": 1480
},
{
"epoch": 4.279036317871269,
"grad_norm": 0.94140625,
"learning_rate": 5.656667322268527e-05,
"loss": 0.2107,
"step": 1485
},
{
"epoch": 4.293419633225459,
"grad_norm": 1.125,
"learning_rate": 5.638334047248744e-05,
"loss": 0.3728,
"step": 1490
},
{
"epoch": 4.307802948579647,
"grad_norm": 0.66015625,
"learning_rate": 5.6199593248256884e-05,
"loss": 0.2301,
"step": 1495
},
{
"epoch": 4.322186263933837,
"grad_norm": 1.0625,
"learning_rate": 5.601543619852377e-05,
"loss": 0.2536,
"step": 1500
},
{
"epoch": 4.336569579288026,
"grad_norm": 1.0546875,
"learning_rate": 5.583087398218631e-05,
"loss": 0.2376,
"step": 1505
},
{
"epoch": 4.350952894642215,
"grad_norm": 0.9921875,
"learning_rate": 5.564591126839276e-05,
"loss": 0.2862,
"step": 1510
},
{
"epoch": 4.365336209996404,
"grad_norm": 0.59375,
"learning_rate": 5.546055273642342e-05,
"loss": 0.2191,
"step": 1515
},
{
"epoch": 4.379719525350593,
"grad_norm": 0.87890625,
"learning_rate": 5.5274803075572154e-05,
"loss": 0.2128,
"step": 1520
},
{
"epoch": 4.394102840704782,
"grad_norm": 0.59765625,
"learning_rate": 5.5088666985027835e-05,
"loss": 0.2977,
"step": 1525
},
{
"epoch": 4.408486156058972,
"grad_norm": 0.8203125,
"learning_rate": 5.49021491737554e-05,
"loss": 0.2385,
"step": 1530
},
{
"epoch": 4.4228694714131604,
"grad_norm": 1.03125,
"learning_rate": 5.471525436037677e-05,
"loss": 0.2074,
"step": 1535
},
{
"epoch": 4.43725278676735,
"grad_norm": 1.0390625,
"learning_rate": 5.452798727305146e-05,
"loss": 0.2742,
"step": 1540
},
{
"epoch": 4.451636102121539,
"grad_norm": 1.03125,
"learning_rate": 5.434035264935693e-05,
"loss": 0.3266,
"step": 1545
},
{
"epoch": 4.466019417475728,
"grad_norm": 0.95703125,
"learning_rate": 5.415235523616881e-05,
"loss": 0.3837,
"step": 1550
},
{
"epoch": 4.480402732829917,
"grad_norm": 1.2265625,
"learning_rate": 5.396399978954072e-05,
"loss": 0.3409,
"step": 1555
},
{
"epoch": 4.494786048184107,
"grad_norm": 1.0390625,
"learning_rate": 5.3775291074584e-05,
"loss": 0.2281,
"step": 1560
},
{
"epoch": 4.5091693635382954,
"grad_norm": 1.328125,
"learning_rate": 5.358623386534716e-05,
"loss": 0.4046,
"step": 1565
},
{
"epoch": 4.523552678892485,
"grad_norm": 0.6953125,
"learning_rate": 5.33968329446951e-05,
"loss": 0.1868,
"step": 1570
},
{
"epoch": 4.5379359942466735,
"grad_norm": 0.5625,
"learning_rate": 5.320709310418806e-05,
"loss": 0.2902,
"step": 1575
},
{
"epoch": 4.552319309600863,
"grad_norm": 0.81640625,
"learning_rate": 5.301701914396054e-05,
"loss": 0.2858,
"step": 1580
},
{
"epoch": 4.566702624955052,
"grad_norm": 1.0703125,
"learning_rate": 5.282661587259966e-05,
"loss": 0.2063,
"step": 1585
},
{
"epoch": 4.581085940309241,
"grad_norm": 1.0859375,
"learning_rate": 5.2635888107023706e-05,
"loss": 0.184,
"step": 1590
},
{
"epoch": 4.5954692556634305,
"grad_norm": 0.88671875,
"learning_rate": 5.244484067236013e-05,
"loss": 0.2318,
"step": 1595
},
{
"epoch": 4.60985257101762,
"grad_norm": 0.8046875,
"learning_rate": 5.2253478401823537e-05,
"loss": 0.1804,
"step": 1600
},
{
"epoch": 4.6242358863718085,
"grad_norm": 0.671875,
"learning_rate": 5.206180613659345e-05,
"loss": 0.1984,
"step": 1605
},
{
"epoch": 4.638619201725998,
"grad_norm": 0.6953125,
"learning_rate": 5.186982872569175e-05,
"loss": 0.2768,
"step": 1610
},
{
"epoch": 4.653002517080187,
"grad_norm": 0.71875,
"learning_rate": 5.167755102586008e-05,
"loss": 0.3707,
"step": 1615
},
{
"epoch": 4.667385832434376,
"grad_norm": 0.87890625,
"learning_rate": 5.148497790143692e-05,
"loss": 0.3311,
"step": 1620
},
{
"epoch": 4.6817691477885655,
"grad_norm": 0.9140625,
"learning_rate": 5.129211422423457e-05,
"loss": 0.3116,
"step": 1625
},
{
"epoch": 4.696152463142754,
"grad_norm": 0.75,
"learning_rate": 5.109896487341587e-05,
"loss": 0.3288,
"step": 1630
},
{
"epoch": 4.7105357784969435,
"grad_norm": 0.90625,
"learning_rate": 5.090553473537076e-05,
"loss": 0.2914,
"step": 1635
},
{
"epoch": 4.724919093851133,
"grad_norm": 1.40625,
"learning_rate": 5.071182870359272e-05,
"loss": 0.3358,
"step": 1640
},
{
"epoch": 4.7393024092053215,
"grad_norm": 0.84375,
"learning_rate": 5.051785167855489e-05,
"loss": 0.151,
"step": 1645
},
{
"epoch": 4.753685724559511,
"grad_norm": 0.93359375,
"learning_rate": 5.0323608567586155e-05,
"loss": 0.2381,
"step": 1650
},
{
"epoch": 4.7680690399137005,
"grad_norm": 0.8984375,
"learning_rate": 5.012910428474695e-05,
"loss": 0.3069,
"step": 1655
},
{
"epoch": 4.782452355267889,
"grad_norm": 0.90625,
"learning_rate": 4.9934343750705025e-05,
"loss": 0.3478,
"step": 1660
},
{
"epoch": 4.7968356706220785,
"grad_norm": 0.8125,
"learning_rate": 4.973933189261083e-05,
"loss": 0.3964,
"step": 1665
},
{
"epoch": 4.811218985976268,
"grad_norm": 1.1640625,
"learning_rate": 4.9544073643973e-05,
"loss": 0.2625,
"step": 1670
},
{
"epoch": 4.8256023013304565,
"grad_norm": 1.046875,
"learning_rate": 4.934857394453344e-05,
"loss": 0.3098,
"step": 1675
},
{
"epoch": 4.839985616684646,
"grad_norm": 0.9921875,
"learning_rate": 4.915283774014242e-05,
"loss": 0.3291,
"step": 1680
},
{
"epoch": 4.854368932038835,
"grad_norm": 1.140625,
"learning_rate": 4.895686998263343e-05,
"loss": 0.2595,
"step": 1685
},
{
"epoch": 4.868752247393024,
"grad_norm": 0.671875,
"learning_rate": 4.8760675629697893e-05,
"loss": 0.219,
"step": 1690
},
{
"epoch": 4.8831355627472135,
"grad_norm": 0.98046875,
"learning_rate": 4.856425964475979e-05,
"loss": 0.3462,
"step": 1695
},
{
"epoch": 4.897518878101402,
"grad_norm": 0.83984375,
"learning_rate": 4.836762699685002e-05,
"loss": 0.2413,
"step": 1700
},
{
"epoch": 4.9119021934555915,
"grad_norm": 0.83203125,
"learning_rate": 4.817078266048078e-05,
"loss": 0.2741,
"step": 1705
},
{
"epoch": 4.926285508809781,
"grad_norm": 0.921875,
"learning_rate": 4.79737316155196e-05,
"loss": 0.2875,
"step": 1710
},
{
"epoch": 4.94066882416397,
"grad_norm": 0.86328125,
"learning_rate": 4.7776478847063514e-05,
"loss": 0.2661,
"step": 1715
},
{
"epoch": 4.955052139518159,
"grad_norm": 1.125,
"learning_rate": 4.7579029345312773e-05,
"loss": 0.3132,
"step": 1720
},
{
"epoch": 4.9694354548723485,
"grad_norm": 0.95703125,
"learning_rate": 4.738138810544477e-05,
"loss": 0.2617,
"step": 1725
},
{
"epoch": 4.983818770226537,
"grad_norm": 0.79296875,
"learning_rate": 4.71835601274875e-05,
"loss": 0.2365,
"step": 1730
},
{
"epoch": 4.9982020855807265,
"grad_norm": 1.015625,
"learning_rate": 4.6985550416193226e-05,
"loss": 0.3377,
"step": 1735
},
{
"epoch": 5.0143833153541895,
"grad_norm": 0.72265625,
"learning_rate": 4.6787363980911754e-05,
"loss": 0.1061,
"step": 1740
},
{
"epoch": 5.028766630708378,
"grad_norm": 0.85546875,
"learning_rate": 4.6589005835463735e-05,
"loss": 0.1089,
"step": 1745
},
{
"epoch": 5.0431499460625675,
"grad_norm": 1.078125,
"learning_rate": 4.639048099801389e-05,
"loss": 0.179,
"step": 1750
},
{
"epoch": 5.057533261416757,
"grad_norm": 0.859375,
"learning_rate": 4.61917944909439e-05,
"loss": 0.1199,
"step": 1755
},
{
"epoch": 5.0719165767709455,
"grad_norm": 1.3203125,
"learning_rate": 4.599295134072554e-05,
"loss": 0.2674,
"step": 1760
},
{
"epoch": 5.086299892125135,
"grad_norm": 0.79296875,
"learning_rate": 4.579395657779339e-05,
"loss": 0.1374,
"step": 1765
},
{
"epoch": 5.100683207479324,
"grad_norm": 1.296875,
"learning_rate": 4.559481523641757e-05,
"loss": 0.1455,
"step": 1770
},
{
"epoch": 5.115066522833513,
"grad_norm": 1.1015625,
"learning_rate": 4.539553235457645e-05,
"loss": 0.108,
"step": 1775
},
{
"epoch": 5.1294498381877025,
"grad_norm": 0.875,
"learning_rate": 4.5196112973829184e-05,
"loss": 0.2614,
"step": 1780
},
{
"epoch": 5.143833153541891,
"grad_norm": 1.2578125,
"learning_rate": 4.499656213918809e-05,
"loss": 0.1803,
"step": 1785
},
{
"epoch": 5.1582164688960805,
"grad_norm": 1.328125,
"learning_rate": 4.4796884898991115e-05,
"loss": 0.1528,
"step": 1790
},
{
"epoch": 5.17259978425027,
"grad_norm": 1.2265625,
"learning_rate": 4.459708630477406e-05,
"loss": 0.2168,
"step": 1795
},
{
"epoch": 5.186983099604459,
"grad_norm": 0.95703125,
"learning_rate": 4.43971714111428e-05,
"loss": 0.2497,
"step": 1800
},
{
"epoch": 5.201366414958648,
"grad_norm": 1.3125,
"learning_rate": 4.4197145275645426e-05,
"loss": 0.2638,
"step": 1805
},
{
"epoch": 5.2157497303128375,
"grad_norm": 1.609375,
"learning_rate": 4.3997012958644255e-05,
"loss": 0.1665,
"step": 1810
},
{
"epoch": 5.230133045667026,
"grad_norm": 0.65625,
"learning_rate": 4.379677952318787e-05,
"loss": 0.0873,
"step": 1815
},
{
"epoch": 5.2445163610212155,
"grad_norm": 0.9375,
"learning_rate": 4.3596450034882983e-05,
"loss": 0.2356,
"step": 1820
},
{
"epoch": 5.258899676375404,
"grad_norm": 1.2265625,
"learning_rate": 4.33960295617663e-05,
"loss": 0.1437,
"step": 1825
},
{
"epoch": 5.273282991729594,
"grad_norm": 1.484375,
"learning_rate": 4.319552317417629e-05,
"loss": 0.168,
"step": 1830
},
{
"epoch": 5.287666307083783,
"grad_norm": 1.1953125,
"learning_rate": 4.299493594462498e-05,
"loss": 0.2304,
"step": 1835
},
{
"epoch": 5.302049622437972,
"grad_norm": 1.078125,
"learning_rate": 4.2794272947669516e-05,
"loss": 0.1377,
"step": 1840
},
{
"epoch": 5.316432937792161,
"grad_norm": 0.9609375,
"learning_rate": 4.259353925978389e-05,
"loss": 0.1789,
"step": 1845
},
{
"epoch": 5.3308162531463505,
"grad_norm": 1.1875,
"learning_rate": 4.2392739959230455e-05,
"loss": 0.1291,
"step": 1850
},
{
"epoch": 5.345199568500539,
"grad_norm": 0.73046875,
"learning_rate": 4.219188012593146e-05,
"loss": 0.3007,
"step": 1855
},
{
"epoch": 5.359582883854729,
"grad_norm": 1.1796875,
"learning_rate": 4.199096484134056e-05,
"loss": 0.1718,
"step": 1860
},
{
"epoch": 5.373966199208918,
"grad_norm": 1.4375,
"learning_rate": 4.17899991883142e-05,
"loss": 0.1992,
"step": 1865
},
{
"epoch": 5.388349514563107,
"grad_norm": 0.734375,
"learning_rate": 4.158898825098315e-05,
"loss": 0.0757,
"step": 1870
},
{
"epoch": 5.402732829917296,
"grad_norm": 0.8984375,
"learning_rate": 4.1387937114623716e-05,
"loss": 0.1683,
"step": 1875
},
{
"epoch": 5.417116145271485,
"grad_norm": 0.87890625,
"learning_rate": 4.1186850865529254e-05,
"loss": 0.1522,
"step": 1880
},
{
"epoch": 5.431499460625674,
"grad_norm": 1.3125,
"learning_rate": 4.098573459088137e-05,
"loss": 0.1525,
"step": 1885
},
{
"epoch": 5.445882775979864,
"grad_norm": 0.8046875,
"learning_rate": 4.078459337862129e-05,
"loss": 0.0721,
"step": 1890
},
{
"epoch": 5.460266091334052,
"grad_norm": 1.1328125,
"learning_rate": 4.058343231732114e-05,
"loss": 0.2774,
"step": 1895
},
{
"epoch": 5.474649406688242,
"grad_norm": 0.8828125,
"learning_rate": 4.038225649605515e-05,
"loss": 0.1588,
"step": 1900
},
{
"epoch": 5.489032722042431,
"grad_norm": 0.8671875,
"learning_rate": 4.018107100427103e-05,
"loss": 0.19,
"step": 1905
},
{
"epoch": 5.50341603739662,
"grad_norm": 0.83203125,
"learning_rate": 3.997988093166106e-05,
"loss": 0.1117,
"step": 1910
},
{
"epoch": 5.517799352750809,
"grad_norm": 1.2734375,
"learning_rate": 3.977869136803345e-05,
"loss": 0.1429,
"step": 1915
},
{
"epoch": 5.532182668104998,
"grad_norm": 0.8125,
"learning_rate": 3.957750740318353e-05,
"loss": 0.1783,
"step": 1920
},
{
"epoch": 5.546565983459187,
"grad_norm": 1.1640625,
"learning_rate": 3.937633412676501e-05,
"loss": 0.2207,
"step": 1925
},
{
"epoch": 5.560949298813377,
"grad_norm": 1.0859375,
"learning_rate": 3.917517662816114e-05,
"loss": 0.1413,
"step": 1930
},
{
"epoch": 5.575332614167566,
"grad_norm": 0.6796875,
"learning_rate": 3.8974039996356084e-05,
"loss": 0.2117,
"step": 1935
},
{
"epoch": 5.589715929521755,
"grad_norm": 1.2890625,
"learning_rate": 3.877292931980603e-05,
"loss": 0.2143,
"step": 1940
},
{
"epoch": 5.604099244875944,
"grad_norm": 0.89453125,
"learning_rate": 3.857184968631061e-05,
"loss": 0.1272,
"step": 1945
},
{
"epoch": 5.618482560230133,
"grad_norm": 1.140625,
"learning_rate": 3.837080618288409e-05,
"loss": 0.0956,
"step": 1950
},
{
"epoch": 5.632865875584322,
"grad_norm": 0.8125,
"learning_rate": 3.816980389562666e-05,
"loss": 0.1177,
"step": 1955
},
{
"epoch": 5.647249190938512,
"grad_norm": 1.359375,
"learning_rate": 3.796884790959587e-05,
"loss": 0.2439,
"step": 1960
},
{
"epoch": 5.6616325062927,
"grad_norm": 1.1640625,
"learning_rate": 3.776794330867785e-05,
"loss": 0.2453,
"step": 1965
},
{
"epoch": 5.67601582164689,
"grad_norm": 0.84765625,
"learning_rate": 3.756709517545885e-05,
"loss": 0.2097,
"step": 1970
},
{
"epoch": 5.690399137001079,
"grad_norm": 0.8125,
"learning_rate": 3.736630859109646e-05,
"loss": 0.1364,
"step": 1975
},
{
"epoch": 5.704782452355268,
"grad_norm": 0.8046875,
"learning_rate": 3.7165588635191257e-05,
"loss": 0.1112,
"step": 1980
},
{
"epoch": 5.719165767709457,
"grad_norm": 1.15625,
"learning_rate": 3.6964940385658185e-05,
"loss": 0.1781,
"step": 1985
},
{
"epoch": 5.733549083063647,
"grad_norm": 1.453125,
"learning_rate": 3.676436891859816e-05,
"loss": 0.1234,
"step": 1990
},
{
"epoch": 5.747932398417835,
"grad_norm": 0.92578125,
"learning_rate": 3.6563879308169566e-05,
"loss": 0.1948,
"step": 1995
},
{
"epoch": 5.762315713772025,
"grad_norm": 1.09375,
"learning_rate": 3.636347662645996e-05,
"loss": 0.1334,
"step": 2000
},
{
"epoch": 5.776699029126213,
"grad_norm": 0.6015625,
"learning_rate": 3.616316594335776e-05,
"loss": 0.1882,
"step": 2005
},
{
"epoch": 5.791082344480403,
"grad_norm": 0.734375,
"learning_rate": 3.59629523264239e-05,
"loss": 0.1485,
"step": 2010
},
{
"epoch": 5.805465659834592,
"grad_norm": 0.56640625,
"learning_rate": 3.576284084076372e-05,
"loss": 0.1062,
"step": 2015
},
{
"epoch": 5.819848975188781,
"grad_norm": 0.67578125,
"learning_rate": 3.556283654889879e-05,
"loss": 0.1845,
"step": 2020
},
{
"epoch": 5.83423229054297,
"grad_norm": 1.28125,
"learning_rate": 3.5362944510638834e-05,
"loss": 0.1082,
"step": 2025
},
{
"epoch": 5.84861560589716,
"grad_norm": 1.2109375,
"learning_rate": 3.5163169782953716e-05,
"loss": 0.1797,
"step": 2030
},
{
"epoch": 5.862998921251348,
"grad_norm": 1.1953125,
"learning_rate": 3.4963517419845546e-05,
"loss": 0.2226,
"step": 2035
},
{
"epoch": 5.877382236605538,
"grad_norm": 0.953125,
"learning_rate": 3.476399247222077e-05,
"loss": 0.1237,
"step": 2040
},
{
"epoch": 5.891765551959727,
"grad_norm": 1.1171875,
"learning_rate": 3.456459998776242e-05,
"loss": 0.1077,
"step": 2045
},
{
"epoch": 5.906148867313916,
"grad_norm": 0.73828125,
"learning_rate": 3.436534501080238e-05,
"loss": 0.1415,
"step": 2050
},
{
"epoch": 5.920532182668105,
"grad_norm": 0.85546875,
"learning_rate": 3.416623258219385e-05,
"loss": 0.2056,
"step": 2055
},
{
"epoch": 5.934915498022294,
"grad_norm": 1.171875,
"learning_rate": 3.3967267739183744e-05,
"loss": 0.1985,
"step": 2060
},
{
"epoch": 5.949298813376483,
"grad_norm": 0.8125,
"learning_rate": 3.376845551528527e-05,
"loss": 0.2888,
"step": 2065
},
{
"epoch": 5.963682128730673,
"grad_norm": 0.90234375,
"learning_rate": 3.3569800940150625e-05,
"loss": 0.2062,
"step": 2070
},
{
"epoch": 5.978065444084861,
"grad_norm": 0.7265625,
"learning_rate": 3.3371309039443724e-05,
"loss": 0.1624,
"step": 2075
},
{
"epoch": 5.992448759439051,
"grad_norm": 0.828125,
"learning_rate": 3.3172984834713035e-05,
"loss": 0.1204,
"step": 2080
},
{
"epoch": 6.008629989212514,
"grad_norm": 0.85546875,
"learning_rate": 3.297483334326458e-05,
"loss": 0.0882,
"step": 2085
},
{
"epoch": 6.023013304566702,
"grad_norm": 0.8515625,
"learning_rate": 3.277685957803502e-05,
"loss": 0.0867,
"step": 2090
},
{
"epoch": 6.037396619920892,
"grad_norm": 1.375,
"learning_rate": 3.257906854746477e-05,
"loss": 0.067,
"step": 2095
},
{
"epoch": 6.051779935275081,
"grad_norm": 0.6015625,
"learning_rate": 3.238146525537137e-05,
"loss": 0.1281,
"step": 2100
},
{
"epoch": 6.06616325062927,
"grad_norm": 0.45703125,
"learning_rate": 3.2184054700822826e-05,
"loss": 0.0639,
"step": 2105
},
{
"epoch": 6.080546565983459,
"grad_norm": 1.453125,
"learning_rate": 3.198684187801119e-05,
"loss": 0.1172,
"step": 2110
},
{
"epoch": 6.094929881337649,
"grad_norm": 0.796875,
"learning_rate": 3.178983177612617e-05,
"loss": 0.0727,
"step": 2115
},
{
"epoch": 6.109313196691837,
"grad_norm": 2.140625,
"learning_rate": 3.159302937922897e-05,
"loss": 0.0975,
"step": 2120
},
{
"epoch": 6.123696512046027,
"grad_norm": 0.96875,
"learning_rate": 3.1396439666126154e-05,
"loss": 0.0415,
"step": 2125
},
{
"epoch": 6.138079827400215,
"grad_norm": 0.64453125,
"learning_rate": 3.12000676102437e-05,
"loss": 0.0376,
"step": 2130
},
{
"epoch": 6.152463142754405,
"grad_norm": 1.21875,
"learning_rate": 3.100391817950119e-05,
"loss": 0.0824,
"step": 2135
},
{
"epoch": 6.166846458108594,
"grad_norm": 0.8125,
"learning_rate": 3.080799633618612e-05,
"loss": 0.0741,
"step": 2140
},
{
"epoch": 6.181229773462783,
"grad_norm": 2.5625,
"learning_rate": 3.0612307036828394e-05,
"loss": 0.2194,
"step": 2145
},
{
"epoch": 6.195613088816972,
"grad_norm": 0.734375,
"learning_rate": 3.0416855232074814e-05,
"loss": 0.0582,
"step": 2150
},
{
"epoch": 6.209996404171162,
"grad_norm": 0.58203125,
"learning_rate": 3.0221645866564025e-05,
"loss": 0.0666,
"step": 2155
},
{
"epoch": 6.22437971952535,
"grad_norm": 0.56640625,
"learning_rate": 3.0026683878801255e-05,
"loss": 0.0845,
"step": 2160
},
{
"epoch": 6.23876303487954,
"grad_norm": 0.5703125,
"learning_rate": 2.9831974201033486e-05,
"loss": 0.0596,
"step": 2165
},
{
"epoch": 6.253146350233729,
"grad_norm": 0.86328125,
"learning_rate": 2.9637521759124608e-05,
"loss": 0.0627,
"step": 2170
},
{
"epoch": 6.267529665587918,
"grad_norm": 0.94921875,
"learning_rate": 2.9443331472430832e-05,
"loss": 0.0678,
"step": 2175
},
{
"epoch": 6.281912980942107,
"grad_norm": 0.482421875,
"learning_rate": 2.9249408253676254e-05,
"loss": 0.0283,
"step": 2180
},
{
"epoch": 6.296296296296296,
"grad_norm": 0.69921875,
"learning_rate": 2.9055757008828512e-05,
"loss": 0.0499,
"step": 2185
},
{
"epoch": 6.310679611650485,
"grad_norm": 0.703125,
"learning_rate": 2.8862382636974744e-05,
"loss": 0.0585,
"step": 2190
},
{
"epoch": 6.325062927004675,
"grad_norm": 1.4609375,
"learning_rate": 2.8669290030197595e-05,
"loss": 0.1117,
"step": 2195
},
{
"epoch": 6.339446242358863,
"grad_norm": 0.419921875,
"learning_rate": 2.84764840734515e-05,
"loss": 0.044,
"step": 2200
},
{
"epoch": 6.353829557713053,
"grad_norm": 0.8125,
"learning_rate": 2.8283969644439042e-05,
"loss": 0.0459,
"step": 2205
},
{
"epoch": 6.368212873067242,
"grad_norm": 0.85546875,
"learning_rate": 2.809175161348761e-05,
"loss": 0.0911,
"step": 2210
},
{
"epoch": 6.382596188421431,
"grad_norm": 0.73046875,
"learning_rate": 2.7899834843426182e-05,
"loss": 0.1007,
"step": 2215
},
{
"epoch": 6.39697950377562,
"grad_norm": 1.28125,
"learning_rate": 2.770822418946223e-05,
"loss": 0.0811,
"step": 2220
},
{
"epoch": 6.41136281912981,
"grad_norm": 0.6171875,
"learning_rate": 2.7516924499059002e-05,
"loss": 0.0948,
"step": 2225
},
{
"epoch": 6.425746134483998,
"grad_norm": 0.8828125,
"learning_rate": 2.7325940611812797e-05,
"loss": 0.1004,
"step": 2230
},
{
"epoch": 6.440129449838188,
"grad_norm": 0.66015625,
"learning_rate": 2.713527735933059e-05,
"loss": 0.0699,
"step": 2235
},
{
"epoch": 6.454512765192376,
"grad_norm": 0.9921875,
"learning_rate": 2.694493956510776e-05,
"loss": 0.1507,
"step": 2240
},
{
"epoch": 6.468896080546566,
"grad_norm": 0.5,
"learning_rate": 2.67549320444061e-05,
"loss": 0.1092,
"step": 2245
},
{
"epoch": 6.483279395900755,
"grad_norm": 0.51953125,
"learning_rate": 2.6565259604131947e-05,
"loss": 0.0431,
"step": 2250
},
{
"epoch": 6.497662711254944,
"grad_norm": 0.6171875,
"learning_rate": 2.6375927042714614e-05,
"loss": 0.071,
"step": 2255
},
{
"epoch": 6.512046026609133,
"grad_norm": 1.2109375,
"learning_rate": 2.6186939149984986e-05,
"loss": 0.0826,
"step": 2260
},
{
"epoch": 6.526429341963323,
"grad_norm": 1.515625,
"learning_rate": 2.5998300707054364e-05,
"loss": 0.0763,
"step": 2265
},
{
"epoch": 6.540812657317511,
"grad_norm": 0.91796875,
"learning_rate": 2.581001648619347e-05,
"loss": 0.068,
"step": 2270
},
{
"epoch": 6.555195972671701,
"grad_norm": 1.1484375,
"learning_rate": 2.5622091250711732e-05,
"loss": 0.2013,
"step": 2275
},
{
"epoch": 6.56957928802589,
"grad_norm": 1.4140625,
"learning_rate": 2.5434529754836817e-05,
"loss": 0.1277,
"step": 2280
},
{
"epoch": 6.583962603380079,
"grad_norm": 0.6796875,
"learning_rate": 2.5247336743594307e-05,
"loss": 0.052,
"step": 2285
},
{
"epoch": 6.598345918734268,
"grad_norm": 0.43359375,
"learning_rate": 2.5060516952687638e-05,
"loss": 0.0924,
"step": 2290
},
{
"epoch": 6.612729234088457,
"grad_norm": 0.5546875,
"learning_rate": 2.487407510837837e-05,
"loss": 0.1329,
"step": 2295
},
{
"epoch": 6.627112549442646,
"grad_norm": 0.62890625,
"learning_rate": 2.468801592736658e-05,
"loss": 0.061,
"step": 2300
},
{
"epoch": 6.641495864796836,
"grad_norm": 0.75,
"learning_rate": 2.4502344116671515e-05,
"loss": 0.0321,
"step": 2305
},
{
"epoch": 6.655879180151025,
"grad_norm": 0.72265625,
"learning_rate": 2.431706437351255e-05,
"loss": 0.07,
"step": 2310
},
{
"epoch": 6.670262495505214,
"grad_norm": 0.431640625,
"learning_rate": 2.4132181385190324e-05,
"loss": 0.0473,
"step": 2315
},
{
"epoch": 6.684645810859403,
"grad_norm": 0.66015625,
"learning_rate": 2.394769982896818e-05,
"loss": 0.0525,
"step": 2320
},
{
"epoch": 6.699029126213592,
"grad_norm": 1.4296875,
"learning_rate": 2.3763624371953803e-05,
"loss": 0.0484,
"step": 2325
},
{
"epoch": 6.713412441567781,
"grad_norm": 1.3828125,
"learning_rate": 2.3579959670981224e-05,
"loss": 0.0758,
"step": 2330
},
{
"epoch": 6.727795756921971,
"grad_norm": 0.8828125,
"learning_rate": 2.3396710372492913e-05,
"loss": 0.1123,
"step": 2335
},
{
"epoch": 6.742179072276159,
"grad_norm": 1.171875,
"learning_rate": 2.3213881112422295e-05,
"loss": 0.0513,
"step": 2340
},
{
"epoch": 6.756562387630349,
"grad_norm": 0.66015625,
"learning_rate": 2.3031476516076476e-05,
"loss": 0.045,
"step": 2345
},
{
"epoch": 6.7709457029845375,
"grad_norm": 0.625,
"learning_rate": 2.2849501198019164e-05,
"loss": 0.0638,
"step": 2350
},
{
"epoch": 6.785329018338727,
"grad_norm": 0.5234375,
"learning_rate": 2.2667959761953985e-05,
"loss": 0.0577,
"step": 2355
},
{
"epoch": 6.799712333692916,
"grad_norm": 0.39453125,
"learning_rate": 2.2486856800608003e-05,
"loss": 0.0655,
"step": 2360
},
{
"epoch": 6.814095649047106,
"grad_norm": 0.69921875,
"learning_rate": 2.230619689561552e-05,
"loss": 0.0765,
"step": 2365
},
{
"epoch": 6.828478964401294,
"grad_norm": 2.4375,
"learning_rate": 2.2125984617402177e-05,
"loss": 0.0828,
"step": 2370
},
{
"epoch": 6.842862279755484,
"grad_norm": 0.8046875,
"learning_rate": 2.1946224525069323e-05,
"loss": 0.1424,
"step": 2375
},
{
"epoch": 6.8572455951096725,
"grad_norm": 0.9296875,
"learning_rate": 2.1766921166278677e-05,
"loss": 0.066,
"step": 2380
},
{
"epoch": 6.871628910463862,
"grad_norm": 0.89453125,
"learning_rate": 2.1588079077137305e-05,
"loss": 0.1121,
"step": 2385
},
{
"epoch": 6.886012225818051,
"grad_norm": 1.4296875,
"learning_rate": 2.1409702782082835e-05,
"loss": 0.0493,
"step": 2390
},
{
"epoch": 6.90039554117224,
"grad_norm": 1.21875,
"learning_rate": 2.1231796793768952e-05,
"loss": 0.1223,
"step": 2395
},
{
"epoch": 6.914778856526429,
"grad_norm": 1.0703125,
"learning_rate": 2.1054365612951324e-05,
"loss": 0.0736,
"step": 2400
},
{
"epoch": 6.929162171880618,
"grad_norm": 0.51171875,
"learning_rate": 2.087741372837372e-05,
"loss": 0.0601,
"step": 2405
},
{
"epoch": 6.9435454872348075,
"grad_norm": 1.0234375,
"learning_rate": 2.07009456166544e-05,
"loss": 0.0426,
"step": 2410
},
{
"epoch": 6.957928802588997,
"grad_norm": 0.5078125,
"learning_rate": 2.0524965742172886e-05,
"loss": 0.0447,
"step": 2415
},
{
"epoch": 6.972312117943186,
"grad_norm": 0.59375,
"learning_rate": 2.0349478556957047e-05,
"loss": 0.0536,
"step": 2420
},
{
"epoch": 6.986695433297375,
"grad_norm": 0.578125,
"learning_rate": 2.017448850057044e-05,
"loss": 0.1108,
"step": 2425
},
{
"epoch": 7.002876663070838,
"grad_norm": 2.734375,
"learning_rate": 2.0000000000000012e-05,
"loss": 0.0377,
"step": 2430
},
{
"epoch": 7.017259978425027,
"grad_norm": 0.55078125,
"learning_rate": 1.982601746954409e-05,
"loss": 0.0201,
"step": 2435
},
{
"epoch": 7.031643293779216,
"grad_norm": 1.046875,
"learning_rate": 1.965254531070072e-05,
"loss": 0.0306,
"step": 2440
},
{
"epoch": 7.046026609133405,
"grad_norm": 0.443359375,
"learning_rate": 1.9479587912056285e-05,
"loss": 0.0308,
"step": 2445
},
{
"epoch": 7.060409924487594,
"grad_norm": 0.265625,
"learning_rate": 1.9307149649174563e-05,
"loss": 0.0471,
"step": 2450
},
{
"epoch": 7.074793239841783,
"grad_norm": 0.484375,
"learning_rate": 1.9135234884485917e-05,
"loss": 0.026,
"step": 2455
},
{
"epoch": 7.089176555195973,
"grad_norm": 0.8125,
"learning_rate": 1.8963847967177017e-05,
"loss": 0.0652,
"step": 2460
},
{
"epoch": 7.1035598705501615,
"grad_norm": 0.341796875,
"learning_rate": 1.8792993233080728e-05,
"loss": 0.0177,
"step": 2465
},
{
"epoch": 7.117943185904351,
"grad_norm": 0.7109375,
"learning_rate": 1.8622675004566567e-05,
"loss": 0.0314,
"step": 2470
},
{
"epoch": 7.13232650125854,
"grad_norm": 0.3046875,
"learning_rate": 1.8452897590431196e-05,
"loss": 0.0227,
"step": 2475
},
{
"epoch": 7.146709816612729,
"grad_norm": 1.734375,
"learning_rate": 1.82836652857895e-05,
"loss": 0.0349,
"step": 2480
},
{
"epoch": 7.161093131966918,
"grad_norm": 0.345703125,
"learning_rate": 1.811498237196591e-05,
"loss": 0.0529,
"step": 2485
},
{
"epoch": 7.175476447321108,
"grad_norm": 0.248046875,
"learning_rate": 1.794685311638606e-05,
"loss": 0.0134,
"step": 2490
},
{
"epoch": 7.1898597626752965,
"grad_norm": 0.4765625,
"learning_rate": 1.777928177246894e-05,
"loss": 0.0559,
"step": 2495
},
{
"epoch": 7.204243078029486,
"grad_norm": 0.330078125,
"learning_rate": 1.761227257951911e-05,
"loss": 0.0159,
"step": 2500
},
{
"epoch": 7.2186263933836745,
"grad_norm": 0.8125,
"learning_rate": 1.7445829762619603e-05,
"loss": 0.0179,
"step": 2505
},
{
"epoch": 7.233009708737864,
"grad_norm": 0.47265625,
"learning_rate": 1.727995753252496e-05,
"loss": 0.0326,
"step": 2510
},
{
"epoch": 7.247393024092053,
"grad_norm": 1.1015625,
"learning_rate": 1.711466008555478e-05,
"loss": 0.0293,
"step": 2515
},
{
"epoch": 7.261776339446242,
"grad_norm": 0.44140625,
"learning_rate": 1.694994160348745e-05,
"loss": 0.0392,
"step": 2520
},
{
"epoch": 7.2761596548004315,
"grad_norm": 0.474609375,
"learning_rate": 1.6785806253454437e-05,
"loss": 0.0363,
"step": 2525
},
{
"epoch": 7.290542970154621,
"grad_norm": 0.62109375,
"learning_rate": 1.662225818783483e-05,
"loss": 0.0239,
"step": 2530
},
{
"epoch": 7.3049262855088095,
"grad_norm": 0.6328125,
"learning_rate": 1.6459301544150306e-05,
"loss": 0.0324,
"step": 2535
},
{
"epoch": 7.319309600862999,
"grad_norm": 0.38671875,
"learning_rate": 1.6296940444960447e-05,
"loss": 0.0352,
"step": 2540
},
{
"epoch": 7.333692916217188,
"grad_norm": 0.3125,
"learning_rate": 1.613517899775845e-05,
"loss": 0.029,
"step": 2545
},
{
"epoch": 7.348076231571377,
"grad_norm": 1.453125,
"learning_rate": 1.5974021294867213e-05,
"loss": 0.0605,
"step": 2550
},
{
"epoch": 7.3624595469255665,
"grad_norm": 0.70703125,
"learning_rate": 1.581347141333579e-05,
"loss": 0.051,
"step": 2555
},
{
"epoch": 7.376842862279755,
"grad_norm": 0.8515625,
"learning_rate": 1.565353341483631e-05,
"loss": 0.0195,
"step": 2560
},
{
"epoch": 7.3912261776339445,
"grad_norm": 0.31640625,
"learning_rate": 1.5494211345561123e-05,
"loss": 0.0264,
"step": 2565
},
{
"epoch": 7.405609492988134,
"grad_norm": 0.283203125,
"learning_rate": 1.5335509236120534e-05,
"loss": 0.0401,
"step": 2570
},
{
"epoch": 7.4199928083423226,
"grad_norm": 1.515625,
"learning_rate": 1.5177431101440721e-05,
"loss": 0.0444,
"step": 2575
},
{
"epoch": 7.434376123696512,
"grad_norm": 0.416015625,
"learning_rate": 1.5019980940662318e-05,
"loss": 0.0507,
"step": 2580
},
{
"epoch": 7.4487594390507015,
"grad_norm": 0.83203125,
"learning_rate": 1.4863162737039112e-05,
"loss": 0.0365,
"step": 2585
},
{
"epoch": 7.46314275440489,
"grad_norm": 0.392578125,
"learning_rate": 1.4706980457837317e-05,
"loss": 0.0206,
"step": 2590
},
{
"epoch": 7.4775260697590795,
"grad_norm": 0.734375,
"learning_rate": 1.4551438054235223e-05,
"loss": 0.0321,
"step": 2595
},
{
"epoch": 7.491909385113269,
"grad_norm": 0.423828125,
"learning_rate": 1.4396539461223204e-05,
"loss": 0.0183,
"step": 2600
},
{
"epoch": 7.506292700467458,
"grad_norm": 0.365234375,
"learning_rate": 1.4242288597504242e-05,
"loss": 0.0266,
"step": 2605
},
{
"epoch": 7.520676015821647,
"grad_norm": 0.314453125,
"learning_rate": 1.4088689365394653e-05,
"loss": 0.0203,
"step": 2610
},
{
"epoch": 7.535059331175836,
"grad_norm": 0.26171875,
"learning_rate": 1.3935745650725507e-05,
"loss": 0.038,
"step": 2615
},
{
"epoch": 7.549442646530025,
"grad_norm": 1.3984375,
"learning_rate": 1.3783461322744231e-05,
"loss": 0.0579,
"step": 2620
},
{
"epoch": 7.5638259618842145,
"grad_norm": 0.30859375,
"learning_rate": 1.3631840234016797e-05,
"loss": 0.027,
"step": 2625
},
{
"epoch": 7.578209277238403,
"grad_norm": 0.58203125,
"learning_rate": 1.3480886220330165e-05,
"loss": 0.0178,
"step": 2630
},
{
"epoch": 7.592592592592593,
"grad_norm": 1.2578125,
"learning_rate": 1.3330603100595326e-05,
"loss": 0.0377,
"step": 2635
},
{
"epoch": 7.606975907946782,
"grad_norm": 0.52734375,
"learning_rate": 1.3180994676750634e-05,
"loss": 0.0258,
"step": 2640
},
{
"epoch": 7.621359223300971,
"grad_norm": 0.40625,
"learning_rate": 1.3032064733665663e-05,
"loss": 0.0418,
"step": 2645
},
{
"epoch": 7.63574253865516,
"grad_norm": 0.25,
"learning_rate": 1.288381703904543e-05,
"loss": 0.0671,
"step": 2650
},
{
"epoch": 7.6501258540093495,
"grad_norm": 0.60546875,
"learning_rate": 1.2736255343335087e-05,
"loss": 0.0177,
"step": 2655
},
{
"epoch": 7.664509169363538,
"grad_norm": 0.357421875,
"learning_rate": 1.2589383379625036e-05,
"loss": 0.0341,
"step": 2660
},
{
"epoch": 7.678892484717728,
"grad_norm": 0.8828125,
"learning_rate": 1.2443204863556475e-05,
"loss": 0.0195,
"step": 2665
},
{
"epoch": 7.693275800071916,
"grad_norm": 0.96875,
"learning_rate": 1.229772349322746e-05,
"loss": 0.0281,
"step": 2670
},
{
"epoch": 7.707659115426106,
"grad_norm": 0.59375,
"learning_rate": 1.2152942949099274e-05,
"loss": 0.0263,
"step": 2675
},
{
"epoch": 7.722042430780295,
"grad_norm": 0.416015625,
"learning_rate": 1.2008866893903309e-05,
"loss": 0.0832,
"step": 2680
},
{
"epoch": 7.736425746134484,
"grad_norm": 0.7578125,
"learning_rate": 1.1865498972548478e-05,
"loss": 0.0289,
"step": 2685
},
{
"epoch": 7.750809061488673,
"grad_norm": 0.45703125,
"learning_rate": 1.1722842812028983e-05,
"loss": 0.0223,
"step": 2690
},
{
"epoch": 7.765192376842863,
"grad_norm": 1.03125,
"learning_rate": 1.1580902021332503e-05,
"loss": 0.0692,
"step": 2695
},
{
"epoch": 7.779575692197051,
"grad_norm": 0.70703125,
"learning_rate": 1.1439680191348953e-05,
"loss": 0.0186,
"step": 2700
},
{
"epoch": 7.793959007551241,
"grad_norm": 0.390625,
"learning_rate": 1.1299180894779594e-05,
"loss": 0.0268,
"step": 2705
},
{
"epoch": 7.80834232290543,
"grad_norm": 0.5859375,
"learning_rate": 1.1159407686046695e-05,
"loss": 0.019,
"step": 2710
},
{
"epoch": 7.822725638259619,
"grad_norm": 0.2314453125,
"learning_rate": 1.1020364101203573e-05,
"loss": 0.0254,
"step": 2715
},
{
"epoch": 7.837108953613808,
"grad_norm": 0.291015625,
"learning_rate": 1.0882053657845155e-05,
"loss": 0.0422,
"step": 2720
},
{
"epoch": 7.851492268967997,
"grad_norm": 1.0625,
"learning_rate": 1.0744479855018985e-05,
"loss": 0.0212,
"step": 2725
},
{
"epoch": 7.865875584322186,
"grad_norm": 0.3515625,
"learning_rate": 1.0607646173136695e-05,
"loss": 0.0161,
"step": 2730
},
{
"epoch": 7.880258899676376,
"grad_norm": 0.94140625,
"learning_rate": 1.0471556073885982e-05,
"loss": 0.0184,
"step": 2735
},
{
"epoch": 7.894642215030564,
"grad_norm": 0.6015625,
"learning_rate": 1.0336213000142998e-05,
"loss": 0.021,
"step": 2740
},
{
"epoch": 7.909025530384754,
"grad_norm": 0.625,
"learning_rate": 1.0201620375885279e-05,
"loss": 0.0218,
"step": 2745
},
{
"epoch": 7.923408845738943,
"grad_norm": 0.375,
"learning_rate": 1.0067781606105064e-05,
"loss": 0.0232,
"step": 2750
},
{
"epoch": 7.937792161093132,
"grad_norm": 0.58203125,
"learning_rate": 9.934700076723275e-06,
"loss": 0.0292,
"step": 2755
},
{
"epoch": 7.952175476447321,
"grad_norm": 0.2158203125,
"learning_rate": 9.802379154503728e-06,
"loss": 0.0239,
"step": 2760
},
{
"epoch": 7.966558791801511,
"grad_norm": 0.283203125,
"learning_rate": 9.670822186968035e-06,
"loss": 0.0191,
"step": 2765
},
{
"epoch": 7.980942107155699,
"grad_norm": 0.431640625,
"learning_rate": 9.540032502310884e-06,
"loss": 0.0284,
"step": 2770
},
{
"epoch": 7.995325422509889,
"grad_norm": 0.8125,
"learning_rate": 9.410013409315865e-06,
"loss": 0.0207,
"step": 2775
},
{
"epoch": 8.011506652283352,
"grad_norm": 0.25390625,
"learning_rate": 9.280768197271768e-06,
"loss": 0.032,
"step": 2780
},
{
"epoch": 8.025889967637541,
"grad_norm": 0.2470703125,
"learning_rate": 9.152300135889303e-06,
"loss": 0.0188,
"step": 2785
},
{
"epoch": 8.040273282991729,
"grad_norm": 0.275390625,
"learning_rate": 9.024612475218465e-06,
"loss": 0.0542,
"step": 2790
},
{
"epoch": 8.054656598345918,
"grad_norm": 0.197265625,
"learning_rate": 8.897708445566255e-06,
"loss": 0.0159,
"step": 2795
},
{
"epoch": 8.069039913700108,
"grad_norm": 0.2578125,
"learning_rate": 8.771591257415025e-06,
"loss": 0.0186,
"step": 2800
},
{
"epoch": 8.083423229054297,
"grad_norm": 0.271484375,
"learning_rate": 8.646264101341155e-06,
"loss": 0.0196,
"step": 2805
},
{
"epoch": 8.097806544408487,
"grad_norm": 0.302734375,
"learning_rate": 8.521730147934435e-06,
"loss": 0.0201,
"step": 2810
},
{
"epoch": 8.112189859762676,
"grad_norm": 0.251953125,
"learning_rate": 8.39799254771779e-06,
"loss": 0.0354,
"step": 2815
},
{
"epoch": 8.126573175116864,
"grad_norm": 0.22265625,
"learning_rate": 8.27505443106761e-06,
"loss": 0.0331,
"step": 2820
},
{
"epoch": 8.140956490471053,
"grad_norm": 0.486328125,
"learning_rate": 8.152918908134549e-06,
"loss": 0.0161,
"step": 2825
},
{
"epoch": 8.155339805825243,
"grad_norm": 0.3671875,
"learning_rate": 8.031589068764823e-06,
"loss": 0.0348,
"step": 2830
},
{
"epoch": 8.169723121179432,
"grad_norm": 0.67578125,
"learning_rate": 7.911067982422071e-06,
"loss": 0.0234,
"step": 2835
},
{
"epoch": 8.184106436533622,
"grad_norm": 0.318359375,
"learning_rate": 7.791358698109674e-06,
"loss": 0.0214,
"step": 2840
},
{
"epoch": 8.19848975188781,
"grad_norm": 0.22265625,
"learning_rate": 7.672464244293678e-06,
"loss": 0.0382,
"step": 2845
},
{
"epoch": 8.212873067241999,
"grad_norm": 0.255859375,
"learning_rate": 7.55438762882609e-06,
"loss": 0.0216,
"step": 2850
},
{
"epoch": 8.227256382596188,
"grad_norm": 0.251953125,
"learning_rate": 7.437131838868827e-06,
"loss": 0.0141,
"step": 2855
},
{
"epoch": 8.241639697950378,
"grad_norm": 0.25390625,
"learning_rate": 7.320699840818166e-06,
"loss": 0.0251,
"step": 2860
},
{
"epoch": 8.256023013304567,
"grad_norm": 0.37890625,
"learning_rate": 7.2050945802296926e-06,
"loss": 0.0322,
"step": 2865
},
{
"epoch": 8.270406328658757,
"grad_norm": 0.330078125,
"learning_rate": 7.090318981743745e-06,
"loss": 0.0137,
"step": 2870
},
{
"epoch": 8.284789644012944,
"grad_norm": 0.298828125,
"learning_rate": 6.97637594901146e-06,
"loss": 0.0303,
"step": 2875
},
{
"epoch": 8.299172959367134,
"grad_norm": 0.25390625,
"learning_rate": 6.863268364621296e-06,
"loss": 0.0168,
"step": 2880
},
{
"epoch": 8.313556274721323,
"grad_norm": 0.265625,
"learning_rate": 6.750999090026135e-06,
"loss": 0.013,
"step": 2885
},
{
"epoch": 8.327939590075513,
"grad_norm": 0.2119140625,
"learning_rate": 6.639570965470858e-06,
"loss": 0.0127,
"step": 2890
},
{
"epoch": 8.342322905429702,
"grad_norm": 0.2451171875,
"learning_rate": 6.528986809920513e-06,
"loss": 0.0245,
"step": 2895
},
{
"epoch": 8.356706220783892,
"grad_norm": 0.267578125,
"learning_rate": 6.4192494209889885e-06,
"loss": 0.0156,
"step": 2900
},
{
"epoch": 8.37108953613808,
"grad_norm": 0.1875,
"learning_rate": 6.3103615748682404e-06,
"loss": 0.0163,
"step": 2905
},
{
"epoch": 8.385472851492269,
"grad_norm": 0.322265625,
"learning_rate": 6.20232602625809e-06,
"loss": 0.0135,
"step": 2910
},
{
"epoch": 8.399856166846458,
"grad_norm": 0.296875,
"learning_rate": 6.095145508296467e-06,
"loss": 0.0377,
"step": 2915
},
{
"epoch": 8.414239482200648,
"grad_norm": 0.447265625,
"learning_rate": 5.988822732490329e-06,
"loss": 0.0188,
"step": 2920
},
{
"epoch": 8.428622797554837,
"grad_norm": 1.3046875,
"learning_rate": 5.8833603886469995e-06,
"loss": 0.034,
"step": 2925
},
{
"epoch": 8.443006112909025,
"grad_norm": 0.294921875,
"learning_rate": 5.778761144806222e-06,
"loss": 0.0138,
"step": 2930
},
{
"epoch": 8.457389428263214,
"grad_norm": 0.21484375,
"learning_rate": 5.675027647172551e-06,
"loss": 0.0421,
"step": 2935
},
{
"epoch": 8.471772743617404,
"grad_norm": 0.2451171875,
"learning_rate": 5.572162520048472e-06,
"loss": 0.0148,
"step": 2940
},
{
"epoch": 8.486156058971593,
"grad_norm": 0.287109375,
"learning_rate": 5.470168365767991e-06,
"loss": 0.0131,
"step": 2945
},
{
"epoch": 8.500539374325783,
"grad_norm": 0.271484375,
"learning_rate": 5.369047764630804e-06,
"loss": 0.0281,
"step": 2950
},
{
"epoch": 8.51492268967997,
"grad_norm": 0.2353515625,
"learning_rate": 5.268803274837022e-06,
"loss": 0.014,
"step": 2955
},
{
"epoch": 8.52930600503416,
"grad_norm": 0.28125,
"learning_rate": 5.169437432422438e-06,
"loss": 0.0166,
"step": 2960
},
{
"epoch": 8.54368932038835,
"grad_norm": 0.31640625,
"learning_rate": 5.070952751194389e-06,
"loss": 0.0261,
"step": 2965
},
{
"epoch": 8.558072635742539,
"grad_norm": 0.314453125,
"learning_rate": 4.973351722668147e-06,
"loss": 0.0147,
"step": 2970
},
{
"epoch": 8.572455951096728,
"grad_norm": 0.259765625,
"learning_rate": 4.876636816003882e-06,
"loss": 0.0174,
"step": 2975
},
{
"epoch": 8.586839266450918,
"grad_norm": 0.267578125,
"learning_rate": 4.780810477944231e-06,
"loss": 0.0152,
"step": 2980
},
{
"epoch": 8.601222581805105,
"grad_norm": 0.396484375,
"learning_rate": 4.685875132752347e-06,
"loss": 0.0141,
"step": 2985
},
{
"epoch": 8.615605897159295,
"grad_norm": 0.31640625,
"learning_rate": 4.591833182150609e-06,
"loss": 0.0139,
"step": 2990
},
{
"epoch": 8.629989212513484,
"grad_norm": 0.55859375,
"learning_rate": 4.498687005259826e-06,
"loss": 0.0205,
"step": 2995
},
{
"epoch": 8.644372527867674,
"grad_norm": 0.263671875,
"learning_rate": 4.406438958539103e-06,
"loss": 0.0136,
"step": 3000
}
],
"logging_steps": 5,
"max_steps": 3470,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.996803769017303e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}