{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999055980364392,
"eval_steps": 500,
"global_step": 4965,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010069542779823153,
"grad_norm": 147.125,
"learning_rate": 1.0060362173038228e-08,
"loss": 154.8444,
"step": 5
},
{
"epoch": 0.0020139085559646307,
"grad_norm": 128.5,
"learning_rate": 2.0120724346076457e-08,
"loss": 154.8017,
"step": 10
},
{
"epoch": 0.0030208628339469462,
"grad_norm": 120.1875,
"learning_rate": 3.018108651911469e-08,
"loss": 151.1319,
"step": 15
},
{
"epoch": 0.004027817111929261,
"grad_norm": 121.125,
"learning_rate": 4.0241448692152913e-08,
"loss": 152.4657,
"step": 20
},
{
"epoch": 0.0050347713899115765,
"grad_norm": 121.5,
"learning_rate": 5.0301810865191145e-08,
"loss": 150.7515,
"step": 25
},
{
"epoch": 0.0060417256678938925,
"grad_norm": 120.6875,
"learning_rate": 6.036217303822938e-08,
"loss": 151.6619,
"step": 30
},
{
"epoch": 0.007048679945876208,
"grad_norm": 124.3125,
"learning_rate": 7.042253521126761e-08,
"loss": 152.2705,
"step": 35
},
{
"epoch": 0.008055634223858523,
"grad_norm": 124.625,
"learning_rate": 8.048289738430583e-08,
"loss": 149.9593,
"step": 40
},
{
"epoch": 0.009062588501840838,
"grad_norm": 125.375,
"learning_rate": 9.054325955734406e-08,
"loss": 151.1752,
"step": 45
},
{
"epoch": 0.010069542779823153,
"grad_norm": 123.1875,
"learning_rate": 1.0060362173038229e-07,
"loss": 150.065,
"step": 50
},
{
"epoch": 0.01107649705780547,
"grad_norm": 129.0,
"learning_rate": 1.1066398390342052e-07,
"loss": 149.9216,
"step": 55
},
{
"epoch": 0.012083451335787785,
"grad_norm": 120.0625,
"learning_rate": 1.2072434607645875e-07,
"loss": 151.1044,
"step": 60
},
{
"epoch": 0.0130904056137701,
"grad_norm": 122.0,
"learning_rate": 1.3078470824949698e-07,
"loss": 151.5673,
"step": 65
},
{
"epoch": 0.014097359891752415,
"grad_norm": 124.25,
"learning_rate": 1.4084507042253522e-07,
"loss": 150.4482,
"step": 70
},
{
"epoch": 0.01510431416973473,
"grad_norm": 128.75,
"learning_rate": 1.5090543259557342e-07,
"loss": 150.4678,
"step": 75
},
{
"epoch": 0.016111268447717045,
"grad_norm": 127.9375,
"learning_rate": 1.6096579476861165e-07,
"loss": 149.0443,
"step": 80
},
{
"epoch": 0.01711822272569936,
"grad_norm": 126.25,
"learning_rate": 1.710261569416499e-07,
"loss": 151.2827,
"step": 85
},
{
"epoch": 0.018125177003681676,
"grad_norm": 125.0,
"learning_rate": 1.8108651911468812e-07,
"loss": 149.4031,
"step": 90
},
{
"epoch": 0.01913213128166399,
"grad_norm": 120.125,
"learning_rate": 1.9114688128772635e-07,
"loss": 149.7004,
"step": 95
},
{
"epoch": 0.020139085559646306,
"grad_norm": 121.75,
"learning_rate": 2.0120724346076458e-07,
"loss": 150.3224,
"step": 100
},
{
"epoch": 0.02114603983762862,
"grad_norm": 127.375,
"learning_rate": 2.112676056338028e-07,
"loss": 149.7613,
"step": 105
},
{
"epoch": 0.02215299411561094,
"grad_norm": 119.0,
"learning_rate": 2.2132796780684104e-07,
"loss": 148.3228,
"step": 110
},
{
"epoch": 0.023159948393593255,
"grad_norm": 119.3125,
"learning_rate": 2.3138832997987925e-07,
"loss": 150.6336,
"step": 115
},
{
"epoch": 0.02416690267157557,
"grad_norm": 119.8125,
"learning_rate": 2.414486921529175e-07,
"loss": 151.2738,
"step": 120
},
{
"epoch": 0.025173856949557885,
"grad_norm": 118.5625,
"learning_rate": 2.515090543259557e-07,
"loss": 149.1195,
"step": 125
},
{
"epoch": 0.0261808112275402,
"grad_norm": 120.125,
"learning_rate": 2.6156941649899397e-07,
"loss": 150.1052,
"step": 130
},
{
"epoch": 0.027187765505522515,
"grad_norm": 126.8125,
"learning_rate": 2.716297786720322e-07,
"loss": 150.0314,
"step": 135
},
{
"epoch": 0.02819471978350483,
"grad_norm": 123.1875,
"learning_rate": 2.8169014084507043e-07,
"loss": 148.3245,
"step": 140
},
{
"epoch": 0.029201674061487146,
"grad_norm": 120.5,
"learning_rate": 2.9175050301810864e-07,
"loss": 148.8877,
"step": 145
},
{
"epoch": 0.03020862833946946,
"grad_norm": 121.4375,
"learning_rate": 3.0181086519114684e-07,
"loss": 148.9477,
"step": 150
},
{
"epoch": 0.031215582617451776,
"grad_norm": 124.5625,
"learning_rate": 3.118712273641851e-07,
"loss": 149.5775,
"step": 155
},
{
"epoch": 0.03222253689543409,
"grad_norm": 114.625,
"learning_rate": 3.219315895372233e-07,
"loss": 149.9319,
"step": 160
},
{
"epoch": 0.033229491173416406,
"grad_norm": 113.8125,
"learning_rate": 3.3199195171026156e-07,
"loss": 149.8396,
"step": 165
},
{
"epoch": 0.03423644545139872,
"grad_norm": 119.5625,
"learning_rate": 3.420523138832998e-07,
"loss": 149.8816,
"step": 170
},
{
"epoch": 0.035243399729381036,
"grad_norm": 117.0625,
"learning_rate": 3.52112676056338e-07,
"loss": 148.9375,
"step": 175
},
{
"epoch": 0.03625035400736335,
"grad_norm": 126.125,
"learning_rate": 3.6217303822937623e-07,
"loss": 148.8681,
"step": 180
},
{
"epoch": 0.03725730828534567,
"grad_norm": 122.25,
"learning_rate": 3.722334004024145e-07,
"loss": 148.1881,
"step": 185
},
{
"epoch": 0.03826426256332798,
"grad_norm": 116.5625,
"learning_rate": 3.822937625754527e-07,
"loss": 151.0358,
"step": 190
},
{
"epoch": 0.0392712168413103,
"grad_norm": 112.8125,
"learning_rate": 3.9235412474849095e-07,
"loss": 149.5753,
"step": 195
},
{
"epoch": 0.04027817111929261,
"grad_norm": 114.75,
"learning_rate": 4.0241448692152916e-07,
"loss": 148.3218,
"step": 200
},
{
"epoch": 0.04128512539727493,
"grad_norm": 117.25,
"learning_rate": 4.1247484909456736e-07,
"loss": 148.1832,
"step": 205
},
{
"epoch": 0.04229207967525724,
"grad_norm": 111.1875,
"learning_rate": 4.225352112676056e-07,
"loss": 148.289,
"step": 210
},
{
"epoch": 0.04329903395323956,
"grad_norm": 111.75,
"learning_rate": 4.3259557344064383e-07,
"loss": 148.9568,
"step": 215
},
{
"epoch": 0.04430598823122188,
"grad_norm": 107.375,
"learning_rate": 4.426559356136821e-07,
"loss": 147.067,
"step": 220
},
{
"epoch": 0.045312942509204195,
"grad_norm": 114.125,
"learning_rate": 4.5271629778672034e-07,
"loss": 149.1166,
"step": 225
},
{
"epoch": 0.04631989678718651,
"grad_norm": 114.5625,
"learning_rate": 4.627766599597585e-07,
"loss": 149.51,
"step": 230
},
{
"epoch": 0.047326851065168825,
"grad_norm": 114.6875,
"learning_rate": 4.7283702213279675e-07,
"loss": 149.2539,
"step": 235
},
{
"epoch": 0.04833380534315114,
"grad_norm": 114.3125,
"learning_rate": 4.82897384305835e-07,
"loss": 148.6982,
"step": 240
},
{
"epoch": 0.049340759621133455,
"grad_norm": 114.8125,
"learning_rate": 4.929577464788733e-07,
"loss": 148.5024,
"step": 245
},
{
"epoch": 0.05034771389911577,
"grad_norm": 112.125,
"learning_rate": 5.030181086519114e-07,
"loss": 148.5248,
"step": 250
},
{
"epoch": 0.051354668177098085,
"grad_norm": 109.3125,
"learning_rate": 5.130784708249497e-07,
"loss": 148.3245,
"step": 255
},
{
"epoch": 0.0523616224550804,
"grad_norm": 113.4375,
"learning_rate": 5.231388329979879e-07,
"loss": 148.0799,
"step": 260
},
{
"epoch": 0.053368576733062716,
"grad_norm": 119.5,
"learning_rate": 5.331991951710262e-07,
"loss": 149.3715,
"step": 265
},
{
"epoch": 0.05437553101104503,
"grad_norm": 116.4375,
"learning_rate": 5.432595573440643e-07,
"loss": 148.9591,
"step": 270
},
{
"epoch": 0.055382485289027346,
"grad_norm": 116.9375,
"learning_rate": 5.533199195171025e-07,
"loss": 148.6525,
"step": 275
},
{
"epoch": 0.05638943956700966,
"grad_norm": 112.0,
"learning_rate": 5.633802816901409e-07,
"loss": 148.389,
"step": 280
},
{
"epoch": 0.057396393844991976,
"grad_norm": 110.5,
"learning_rate": 5.73440643863179e-07,
"loss": 149.1673,
"step": 285
},
{
"epoch": 0.05840334812297429,
"grad_norm": 105.375,
"learning_rate": 5.835010060362173e-07,
"loss": 147.8345,
"step": 290
},
{
"epoch": 0.059410302400956606,
"grad_norm": 108.875,
"learning_rate": 5.935613682092555e-07,
"loss": 148.039,
"step": 295
},
{
"epoch": 0.06041725667893892,
"grad_norm": 109.75,
"learning_rate": 6.036217303822937e-07,
"loss": 148.6279,
"step": 300
},
{
"epoch": 0.06142421095692124,
"grad_norm": 105.875,
"learning_rate": 6.136820925553319e-07,
"loss": 147.5383,
"step": 305
},
{
"epoch": 0.06243116523490355,
"grad_norm": 105.8125,
"learning_rate": 6.237424547283702e-07,
"loss": 148.9635,
"step": 310
},
{
"epoch": 0.06343811951288587,
"grad_norm": 107.5625,
"learning_rate": 6.338028169014085e-07,
"loss": 146.221,
"step": 315
},
{
"epoch": 0.06444507379086818,
"grad_norm": 107.75,
"learning_rate": 6.438631790744466e-07,
"loss": 146.1134,
"step": 320
},
{
"epoch": 0.0654520280688505,
"grad_norm": 109.75,
"learning_rate": 6.539235412474849e-07,
"loss": 148.4854,
"step": 325
},
{
"epoch": 0.06645898234683281,
"grad_norm": 112.4375,
"learning_rate": 6.639839034205231e-07,
"loss": 146.1542,
"step": 330
},
{
"epoch": 0.06746593662481513,
"grad_norm": 105.9375,
"learning_rate": 6.740442655935613e-07,
"loss": 147.7582,
"step": 335
},
{
"epoch": 0.06847289090279744,
"grad_norm": 105.3125,
"learning_rate": 6.841046277665996e-07,
"loss": 146.3988,
"step": 340
},
{
"epoch": 0.06947984518077976,
"grad_norm": 106.0,
"learning_rate": 6.941649899396378e-07,
"loss": 146.1215,
"step": 345
},
{
"epoch": 0.07048679945876207,
"grad_norm": 109.375,
"learning_rate": 7.04225352112676e-07,
"loss": 146.9638,
"step": 350
},
{
"epoch": 0.0714937537367444,
"grad_norm": 107.375,
"learning_rate": 7.142857142857143e-07,
"loss": 146.0221,
"step": 355
},
{
"epoch": 0.0725007080147267,
"grad_norm": 105.0625,
"learning_rate": 7.243460764587525e-07,
"loss": 146.9222,
"step": 360
},
{
"epoch": 0.07350766229270903,
"grad_norm": 108.125,
"learning_rate": 7.344064386317907e-07,
"loss": 145.8448,
"step": 365
},
{
"epoch": 0.07451461657069133,
"grad_norm": 109.0,
"learning_rate": 7.44466800804829e-07,
"loss": 147.5462,
"step": 370
},
{
"epoch": 0.07552157084867366,
"grad_norm": 108.0,
"learning_rate": 7.545271629778671e-07,
"loss": 147.6979,
"step": 375
},
{
"epoch": 0.07652852512665596,
"grad_norm": 107.0,
"learning_rate": 7.645875251509054e-07,
"loss": 146.4761,
"step": 380
},
{
"epoch": 0.07753547940463829,
"grad_norm": 103.4375,
"learning_rate": 7.746478873239435e-07,
"loss": 144.9393,
"step": 385
},
{
"epoch": 0.0785424336826206,
"grad_norm": 106.6875,
"learning_rate": 7.847082494969819e-07,
"loss": 145.0384,
"step": 390
},
{
"epoch": 0.07954938796060292,
"grad_norm": 107.3125,
"learning_rate": 7.947686116700201e-07,
"loss": 146.4607,
"step": 395
},
{
"epoch": 0.08055634223858522,
"grad_norm": 106.9375,
"learning_rate": 8.048289738430583e-07,
"loss": 144.5673,
"step": 400
},
{
"epoch": 0.08156329651656755,
"grad_norm": 103.125,
"learning_rate": 8.148893360160966e-07,
"loss": 145.9938,
"step": 405
},
{
"epoch": 0.08257025079454985,
"grad_norm": 105.3125,
"learning_rate": 8.249496981891347e-07,
"loss": 145.0131,
"step": 410
},
{
"epoch": 0.08357720507253218,
"grad_norm": 101.0625,
"learning_rate": 8.35010060362173e-07,
"loss": 143.9037,
"step": 415
},
{
"epoch": 0.08458415935051448,
"grad_norm": 106.375,
"learning_rate": 8.450704225352112e-07,
"loss": 146.0414,
"step": 420
},
{
"epoch": 0.0855911136284968,
"grad_norm": 106.375,
"learning_rate": 8.551307847082495e-07,
"loss": 145.0052,
"step": 425
},
{
"epoch": 0.08659806790647911,
"grad_norm": 102.375,
"learning_rate": 8.651911468812877e-07,
"loss": 144.8919,
"step": 430
},
{
"epoch": 0.08760502218446144,
"grad_norm": 105.0625,
"learning_rate": 8.752515090543259e-07,
"loss": 144.2957,
"step": 435
},
{
"epoch": 0.08861197646244376,
"grad_norm": 106.6875,
"learning_rate": 8.853118712273642e-07,
"loss": 144.0421,
"step": 440
},
{
"epoch": 0.08961893074042607,
"grad_norm": 100.5,
"learning_rate": 8.953722334004023e-07,
"loss": 144.8475,
"step": 445
},
{
"epoch": 0.09062588501840839,
"grad_norm": 102.4375,
"learning_rate": 9.054325955734407e-07,
"loss": 145.6034,
"step": 450
},
{
"epoch": 0.0916328392963907,
"grad_norm": 101.75,
"learning_rate": 9.154929577464788e-07,
"loss": 145.2008,
"step": 455
},
{
"epoch": 0.09263979357437302,
"grad_norm": 105.75,
"learning_rate": 9.25553319919517e-07,
"loss": 144.6417,
"step": 460
},
{
"epoch": 0.09364674785235533,
"grad_norm": 104.9375,
"learning_rate": 9.356136820925554e-07,
"loss": 144.1944,
"step": 465
},
{
"epoch": 0.09465370213033765,
"grad_norm": 104.0,
"learning_rate": 9.456740442655935e-07,
"loss": 144.2291,
"step": 470
},
{
"epoch": 0.09566065640831996,
"grad_norm": 101.4375,
"learning_rate": 9.557344064386319e-07,
"loss": 142.8143,
"step": 475
},
{
"epoch": 0.09666761068630228,
"grad_norm": 103.0,
"learning_rate": 9.6579476861167e-07,
"loss": 143.5473,
"step": 480
},
{
"epoch": 0.09767456496428459,
"grad_norm": 104.375,
"learning_rate": 9.758551307847082e-07,
"loss": 144.2911,
"step": 485
},
{
"epoch": 0.09868151924226691,
"grad_norm": 104.125,
"learning_rate": 9.859154929577465e-07,
"loss": 144.1867,
"step": 490
},
{
"epoch": 0.09968847352024922,
"grad_norm": 104.5625,
"learning_rate": 9.959758551307847e-07,
"loss": 143.2442,
"step": 495
},
{
"epoch": 0.10069542779823154,
"grad_norm": 99.875,
"learning_rate": 9.99328558639212e-07,
"loss": 142.1995,
"step": 500
},
{
"epoch": 0.10069542779823154,
"eval_loss": 4.476833343505859,
"eval_runtime": 239.4636,
"eval_samples_per_second": 1117.552,
"eval_steps_per_second": 34.924,
"step": 500
},
{
"epoch": 0.10170238207621385,
"grad_norm": 101.4375,
"learning_rate": 9.982094897045659e-07,
"loss": 143.2257,
"step": 505
},
{
"epoch": 0.10270933635419617,
"grad_norm": 102.1875,
"learning_rate": 9.970904207699194e-07,
"loss": 142.6529,
"step": 510
},
{
"epoch": 0.10371629063217848,
"grad_norm": 99.9375,
"learning_rate": 9.95971351835273e-07,
"loss": 142.5815,
"step": 515
},
{
"epoch": 0.1047232449101608,
"grad_norm": 99.125,
"learning_rate": 9.948522829006265e-07,
"loss": 142.8741,
"step": 520
},
{
"epoch": 0.10573019918814311,
"grad_norm": 107.125,
"learning_rate": 9.937332139659803e-07,
"loss": 144.5413,
"step": 525
},
{
"epoch": 0.10673715346612543,
"grad_norm": 100.1875,
"learning_rate": 9.926141450313339e-07,
"loss": 141.7836,
"step": 530
},
{
"epoch": 0.10774410774410774,
"grad_norm": 105.5,
"learning_rate": 9.914950760966874e-07,
"loss": 142.5062,
"step": 535
},
{
"epoch": 0.10875106202209006,
"grad_norm": 102.375,
"learning_rate": 9.903760071620412e-07,
"loss": 140.559,
"step": 540
},
{
"epoch": 0.10975801630007237,
"grad_norm": 98.5625,
"learning_rate": 9.892569382273947e-07,
"loss": 141.6972,
"step": 545
},
{
"epoch": 0.11076497057805469,
"grad_norm": 99.75,
"learning_rate": 9.881378692927483e-07,
"loss": 141.2675,
"step": 550
},
{
"epoch": 0.111771924856037,
"grad_norm": 100.6875,
"learning_rate": 9.87018800358102e-07,
"loss": 141.5898,
"step": 555
},
{
"epoch": 0.11277887913401932,
"grad_norm": 104.6875,
"learning_rate": 9.858997314234556e-07,
"loss": 140.7125,
"step": 560
},
{
"epoch": 0.11378583341200163,
"grad_norm": 99.5,
"learning_rate": 9.847806624888094e-07,
"loss": 141.6575,
"step": 565
},
{
"epoch": 0.11479278768998395,
"grad_norm": 102.125,
"learning_rate": 9.83661593554163e-07,
"loss": 141.1669,
"step": 570
},
{
"epoch": 0.11579974196796626,
"grad_norm": 101.0625,
"learning_rate": 9.825425246195165e-07,
"loss": 141.7356,
"step": 575
},
{
"epoch": 0.11680669624594858,
"grad_norm": 100.25,
"learning_rate": 9.8142345568487e-07,
"loss": 140.7489,
"step": 580
},
{
"epoch": 0.1178136505239309,
"grad_norm": 99.4375,
"learning_rate": 9.803043867502238e-07,
"loss": 139.6734,
"step": 585
},
{
"epoch": 0.11882060480191321,
"grad_norm": 98.6875,
"learning_rate": 9.791853178155774e-07,
"loss": 138.0151,
"step": 590
},
{
"epoch": 0.11982755907989553,
"grad_norm": 100.6875,
"learning_rate": 9.78066248880931e-07,
"loss": 138.5095,
"step": 595
},
{
"epoch": 0.12083451335787784,
"grad_norm": 98.0,
"learning_rate": 9.769471799462845e-07,
"loss": 139.3675,
"step": 600
},
{
"epoch": 0.12184146763586017,
"grad_norm": 107.5,
"learning_rate": 9.758281110116383e-07,
"loss": 140.4003,
"step": 605
},
{
"epoch": 0.12284842191384247,
"grad_norm": 97.375,
"learning_rate": 9.747090420769918e-07,
"loss": 138.405,
"step": 610
},
{
"epoch": 0.1238553761918248,
"grad_norm": 98.6875,
"learning_rate": 9.735899731423454e-07,
"loss": 138.4601,
"step": 615
},
{
"epoch": 0.1248623304698071,
"grad_norm": 97.75,
"learning_rate": 9.724709042076992e-07,
"loss": 138.2733,
"step": 620
},
{
"epoch": 0.1258692847477894,
"grad_norm": 97.125,
"learning_rate": 9.713518352730527e-07,
"loss": 138.8488,
"step": 625
},
{
"epoch": 0.12687623902577175,
"grad_norm": 98.0,
"learning_rate": 9.702327663384065e-07,
"loss": 139.726,
"step": 630
},
{
"epoch": 0.12788319330375406,
"grad_norm": 100.875,
"learning_rate": 9.6911369740376e-07,
"loss": 139.2006,
"step": 635
},
{
"epoch": 0.12889014758173636,
"grad_norm": 97.625,
"learning_rate": 9.679946284691136e-07,
"loss": 136.4355,
"step": 640
},
{
"epoch": 0.12989710185971867,
"grad_norm": 97.5625,
"learning_rate": 9.668755595344674e-07,
"loss": 138.7321,
"step": 645
},
{
"epoch": 0.130904056137701,
"grad_norm": 99.5,
"learning_rate": 9.65756490599821e-07,
"loss": 137.7812,
"step": 650
},
{
"epoch": 0.13191101041568332,
"grad_norm": 95.6875,
"learning_rate": 9.646374216651745e-07,
"loss": 138.1662,
"step": 655
},
{
"epoch": 0.13291796469366562,
"grad_norm": 100.5625,
"learning_rate": 9.635183527305283e-07,
"loss": 137.3001,
"step": 660
},
{
"epoch": 0.13392491897164793,
"grad_norm": 101.4375,
"learning_rate": 9.623992837958818e-07,
"loss": 135.9984,
"step": 665
},
{
"epoch": 0.13493187324963027,
"grad_norm": 99.5,
"learning_rate": 9.612802148612354e-07,
"loss": 138.5501,
"step": 670
},
{
"epoch": 0.13593882752761258,
"grad_norm": 98.375,
"learning_rate": 9.60161145926589e-07,
"loss": 137.8872,
"step": 675
},
{
"epoch": 0.13694578180559489,
"grad_norm": 102.6875,
"learning_rate": 9.590420769919427e-07,
"loss": 137.8639,
"step": 680
},
{
"epoch": 0.1379527360835772,
"grad_norm": 98.1875,
"learning_rate": 9.579230080572963e-07,
"loss": 137.8511,
"step": 685
},
{
"epoch": 0.13895969036155953,
"grad_norm": 98.4375,
"learning_rate": 9.568039391226498e-07,
"loss": 135.7094,
"step": 690
},
{
"epoch": 0.13996664463954184,
"grad_norm": 96.4375,
"learning_rate": 9.556848701880036e-07,
"loss": 135.8998,
"step": 695
},
{
"epoch": 0.14097359891752415,
"grad_norm": 96.9375,
"learning_rate": 9.545658012533572e-07,
"loss": 137.496,
"step": 700
},
{
"epoch": 0.14198055319550645,
"grad_norm": 98.1875,
"learning_rate": 9.534467323187107e-07,
"loss": 134.7175,
"step": 705
},
{
"epoch": 0.1429875074734888,
"grad_norm": 99.0,
"learning_rate": 9.523276633840645e-07,
"loss": 134.1437,
"step": 710
},
{
"epoch": 0.1439944617514711,
"grad_norm": 102.125,
"learning_rate": 9.51208594449418e-07,
"loss": 135.4387,
"step": 715
},
{
"epoch": 0.1450014160294534,
"grad_norm": 98.6875,
"learning_rate": 9.500895255147716e-07,
"loss": 134.4652,
"step": 720
},
{
"epoch": 0.14600837030743571,
"grad_norm": 98.375,
"learning_rate": 9.489704565801253e-07,
"loss": 133.8391,
"step": 725
},
{
"epoch": 0.14701532458541805,
"grad_norm": 99.0625,
"learning_rate": 9.478513876454789e-07,
"loss": 134.2267,
"step": 730
},
{
"epoch": 0.14802227886340036,
"grad_norm": 98.625,
"learning_rate": 9.467323187108326e-07,
"loss": 134.7458,
"step": 735
},
{
"epoch": 0.14902923314138267,
"grad_norm": 100.4375,
"learning_rate": 9.456132497761861e-07,
"loss": 134.7538,
"step": 740
},
{
"epoch": 0.150036187419365,
"grad_norm": 99.375,
"learning_rate": 9.444941808415397e-07,
"loss": 135.1127,
"step": 745
},
{
"epoch": 0.1510431416973473,
"grad_norm": 98.5,
"learning_rate": 9.433751119068935e-07,
"loss": 132.5026,
"step": 750
},
{
"epoch": 0.15205009597532962,
"grad_norm": 100.0625,
"learning_rate": 9.42256042972247e-07,
"loss": 134.5469,
"step": 755
},
{
"epoch": 0.15305705025331193,
"grad_norm": 97.875,
"learning_rate": 9.411369740376007e-07,
"loss": 133.0167,
"step": 760
},
{
"epoch": 0.15406400453129426,
"grad_norm": 101.9375,
"learning_rate": 9.400179051029544e-07,
"loss": 135.0586,
"step": 765
},
{
"epoch": 0.15507095880927657,
"grad_norm": 101.8125,
"learning_rate": 9.388988361683079e-07,
"loss": 133.1228,
"step": 770
},
{
"epoch": 0.15607791308725888,
"grad_norm": 97.875,
"learning_rate": 9.377797672336616e-07,
"loss": 134.4232,
"step": 775
},
{
"epoch": 0.1570848673652412,
"grad_norm": 101.4375,
"learning_rate": 9.366606982990151e-07,
"loss": 133.6743,
"step": 780
},
{
"epoch": 0.15809182164322352,
"grad_norm": 96.0625,
"learning_rate": 9.355416293643688e-07,
"loss": 132.8021,
"step": 785
},
{
"epoch": 0.15909877592120583,
"grad_norm": 98.0,
"learning_rate": 9.344225604297225e-07,
"loss": 131.8535,
"step": 790
},
{
"epoch": 0.16010573019918814,
"grad_norm": 96.6875,
"learning_rate": 9.33303491495076e-07,
"loss": 132.8115,
"step": 795
},
{
"epoch": 0.16111268447717045,
"grad_norm": 98.5,
"learning_rate": 9.321844225604297e-07,
"loss": 132.4902,
"step": 800
},
{
"epoch": 0.16211963875515278,
"grad_norm": 96.625,
"learning_rate": 9.310653536257834e-07,
"loss": 132.3855,
"step": 805
},
{
"epoch": 0.1631265930331351,
"grad_norm": 99.5,
"learning_rate": 9.299462846911369e-07,
"loss": 132.9767,
"step": 810
},
{
"epoch": 0.1641335473111174,
"grad_norm": 96.8125,
"learning_rate": 9.288272157564906e-07,
"loss": 130.722,
"step": 815
},
{
"epoch": 0.1651405015890997,
"grad_norm": 97.1875,
"learning_rate": 9.277081468218441e-07,
"loss": 132.2955,
"step": 820
},
{
"epoch": 0.16614745586708204,
"grad_norm": 100.4375,
"learning_rate": 9.265890778871978e-07,
"loss": 130.5667,
"step": 825
},
{
"epoch": 0.16715441014506435,
"grad_norm": 92.75,
"learning_rate": 9.254700089525515e-07,
"loss": 130.172,
"step": 830
},
{
"epoch": 0.16816136442304666,
"grad_norm": 99.0,
"learning_rate": 9.24350940017905e-07,
"loss": 132.0079,
"step": 835
},
{
"epoch": 0.16916831870102897,
"grad_norm": 99.1875,
"learning_rate": 9.232318710832587e-07,
"loss": 133.0523,
"step": 840
},
{
"epoch": 0.1701752729790113,
"grad_norm": 98.0,
"learning_rate": 9.221128021486123e-07,
"loss": 132.6847,
"step": 845
},
{
"epoch": 0.1711822272569936,
"grad_norm": 101.5625,
"learning_rate": 9.209937332139659e-07,
"loss": 129.9938,
"step": 850
},
{
"epoch": 0.17218918153497592,
"grad_norm": 100.75,
"learning_rate": 9.198746642793196e-07,
"loss": 130.9808,
"step": 855
},
{
"epoch": 0.17319613581295823,
"grad_norm": 99.375,
"learning_rate": 9.187555953446731e-07,
"loss": 131.95,
"step": 860
},
{
"epoch": 0.17420309009094057,
"grad_norm": 99.875,
"learning_rate": 9.176365264100269e-07,
"loss": 132.5511,
"step": 865
},
{
"epoch": 0.17521004436892287,
"grad_norm": 96.6875,
"learning_rate": 9.165174574753805e-07,
"loss": 129.9553,
"step": 870
},
{
"epoch": 0.17621699864690518,
"grad_norm": 96.3125,
"learning_rate": 9.15398388540734e-07,
"loss": 131.2535,
"step": 875
},
{
"epoch": 0.17722395292488752,
"grad_norm": 97.5625,
"learning_rate": 9.142793196060878e-07,
"loss": 129.6035,
"step": 880
},
{
"epoch": 0.17823090720286983,
"grad_norm": 96.3125,
"learning_rate": 9.131602506714413e-07,
"loss": 130.5564,
"step": 885
},
{
"epoch": 0.17923786148085213,
"grad_norm": 95.375,
"learning_rate": 9.120411817367949e-07,
"loss": 131.6668,
"step": 890
},
{
"epoch": 0.18024481575883444,
"grad_norm": 106.3125,
"learning_rate": 9.109221128021486e-07,
"loss": 130.4126,
"step": 895
},
{
"epoch": 0.18125177003681678,
"grad_norm": 95.0,
"learning_rate": 9.098030438675022e-07,
"loss": 129.2205,
"step": 900
},
{
"epoch": 0.1822587243147991,
"grad_norm": 100.0,
"learning_rate": 9.086839749328559e-07,
"loss": 129.7743,
"step": 905
},
{
"epoch": 0.1832656785927814,
"grad_norm": 99.625,
"learning_rate": 9.075649059982094e-07,
"loss": 129.5778,
"step": 910
},
{
"epoch": 0.1842726328707637,
"grad_norm": 99.3125,
"learning_rate": 9.06445837063563e-07,
"loss": 127.7286,
"step": 915
},
{
"epoch": 0.18527958714874604,
"grad_norm": 97.1875,
"learning_rate": 9.053267681289168e-07,
"loss": 128.4797,
"step": 920
},
{
"epoch": 0.18628654142672835,
"grad_norm": 97.0,
"learning_rate": 9.042076991942703e-07,
"loss": 129.3254,
"step": 925
},
{
"epoch": 0.18729349570471066,
"grad_norm": 95.875,
"learning_rate": 9.03088630259624e-07,
"loss": 129.162,
"step": 930
},
{
"epoch": 0.18830044998269296,
"grad_norm": 99.4375,
"learning_rate": 9.019695613249775e-07,
"loss": 128.9798,
"step": 935
},
{
"epoch": 0.1893074042606753,
"grad_norm": 99.8125,
"learning_rate": 9.008504923903312e-07,
"loss": 128.7978,
"step": 940
},
{
"epoch": 0.1903143585386576,
"grad_norm": 98.5,
"learning_rate": 8.997314234556849e-07,
"loss": 129.5615,
"step": 945
},
{
"epoch": 0.19132131281663992,
"grad_norm": 95.3125,
"learning_rate": 8.986123545210384e-07,
"loss": 128.219,
"step": 950
},
{
"epoch": 0.19232826709462222,
"grad_norm": 95.5625,
"learning_rate": 8.97493285586392e-07,
"loss": 128.3994,
"step": 955
},
{
"epoch": 0.19333522137260456,
"grad_norm": 98.875,
"learning_rate": 8.963742166517458e-07,
"loss": 128.6512,
"step": 960
},
{
"epoch": 0.19434217565058687,
"grad_norm": 96.0,
"learning_rate": 8.952551477170993e-07,
"loss": 127.9091,
"step": 965
},
{
"epoch": 0.19534912992856918,
"grad_norm": 98.5625,
"learning_rate": 8.94136078782453e-07,
"loss": 127.4443,
"step": 970
},
{
"epoch": 0.19635608420655148,
"grad_norm": 101.75,
"learning_rate": 8.930170098478065e-07,
"loss": 126.552,
"step": 975
},
{
"epoch": 0.19736303848453382,
"grad_norm": 102.4375,
"learning_rate": 8.918979409131602e-07,
"loss": 128.1213,
"step": 980
},
{
"epoch": 0.19836999276251613,
"grad_norm": 98.375,
"learning_rate": 8.907788719785139e-07,
"loss": 126.8494,
"step": 985
},
{
"epoch": 0.19937694704049844,
"grad_norm": 98.5625,
"learning_rate": 8.896598030438674e-07,
"loss": 128.1531,
"step": 990
},
{
"epoch": 0.20038390131848074,
"grad_norm": 97.25,
"learning_rate": 8.88540734109221e-07,
"loss": 126.7792,
"step": 995
},
{
"epoch": 0.20139085559646308,
"grad_norm": 99.125,
"learning_rate": 8.874216651745748e-07,
"loss": 125.983,
"step": 1000
},
{
"epoch": 0.20139085559646308,
"eval_loss": 3.978339433670044,
"eval_runtime": 239.476,
"eval_samples_per_second": 1117.494,
"eval_steps_per_second": 34.922,
"step": 1000
},
{
"epoch": 0.2023978098744454,
"grad_norm": 100.4375,
"learning_rate": 8.863025962399283e-07,
"loss": 127.6748,
"step": 1005
},
{
"epoch": 0.2034047641524277,
"grad_norm": 98.375,
"learning_rate": 8.85183527305282e-07,
"loss": 128.4397,
"step": 1010
},
{
"epoch": 0.20441171843041,
"grad_norm": 98.4375,
"learning_rate": 8.840644583706356e-07,
"loss": 128.5681,
"step": 1015
},
{
"epoch": 0.20541867270839234,
"grad_norm": 101.5625,
"learning_rate": 8.829453894359892e-07,
"loss": 125.6765,
"step": 1020
},
{
"epoch": 0.20642562698637465,
"grad_norm": 99.0,
"learning_rate": 8.818263205013429e-07,
"loss": 127.8648,
"step": 1025
},
{
"epoch": 0.20743258126435696,
"grad_norm": 100.4375,
"learning_rate": 8.807072515666964e-07,
"loss": 127.2665,
"step": 1030
},
{
"epoch": 0.2084395355423393,
"grad_norm": 97.0625,
"learning_rate": 8.795881826320502e-07,
"loss": 125.6571,
"step": 1035
},
{
"epoch": 0.2094464898203216,
"grad_norm": 99.5625,
"learning_rate": 8.784691136974037e-07,
"loss": 127.1407,
"step": 1040
},
{
"epoch": 0.2104534440983039,
"grad_norm": 97.625,
"learning_rate": 8.773500447627573e-07,
"loss": 125.1989,
"step": 1045
},
{
"epoch": 0.21146039837628622,
"grad_norm": 97.5,
"learning_rate": 8.76230975828111e-07,
"loss": 126.1473,
"step": 1050
},
{
"epoch": 0.21246735265426855,
"grad_norm": 98.375,
"learning_rate": 8.751119068934646e-07,
"loss": 125.087,
"step": 1055
},
{
"epoch": 0.21347430693225086,
"grad_norm": 96.125,
"learning_rate": 8.739928379588182e-07,
"loss": 125.8863,
"step": 1060
},
{
"epoch": 0.21448126121023317,
"grad_norm": 98.5625,
"learning_rate": 8.728737690241719e-07,
"loss": 124.4642,
"step": 1065
},
{
"epoch": 0.21548821548821548,
"grad_norm": 98.5625,
"learning_rate": 8.717547000895254e-07,
"loss": 128.1609,
"step": 1070
},
{
"epoch": 0.21649516976619781,
"grad_norm": 97.3125,
"learning_rate": 8.706356311548792e-07,
"loss": 124.3779,
"step": 1075
},
{
"epoch": 0.21750212404418012,
"grad_norm": 100.375,
"learning_rate": 8.695165622202327e-07,
"loss": 125.5783,
"step": 1080
},
{
"epoch": 0.21850907832216243,
"grad_norm": 99.625,
"learning_rate": 8.683974932855863e-07,
"loss": 124.0356,
"step": 1085
},
{
"epoch": 0.21951603260014474,
"grad_norm": 98.5,
"learning_rate": 8.6727842435094e-07,
"loss": 125.126,
"step": 1090
},
{
"epoch": 0.22052298687812708,
"grad_norm": 100.625,
"learning_rate": 8.661593554162936e-07,
"loss": 126.576,
"step": 1095
},
{
"epoch": 0.22152994115610938,
"grad_norm": 98.75,
"learning_rate": 8.650402864816473e-07,
"loss": 125.1371,
"step": 1100
},
{
"epoch": 0.2225368954340917,
"grad_norm": 98.75,
"learning_rate": 8.639212175470008e-07,
"loss": 125.9719,
"step": 1105
},
{
"epoch": 0.223543849712074,
"grad_norm": 100.5,
"learning_rate": 8.628021486123544e-07,
"loss": 125.1309,
"step": 1110
},
{
"epoch": 0.22455080399005634,
"grad_norm": 96.6875,
"learning_rate": 8.616830796777082e-07,
"loss": 123.8529,
"step": 1115
},
{
"epoch": 0.22555775826803864,
"grad_norm": 97.4375,
"learning_rate": 8.605640107430617e-07,
"loss": 123.9571,
"step": 1120
},
{
"epoch": 0.22656471254602095,
"grad_norm": 100.6875,
"learning_rate": 8.594449418084153e-07,
"loss": 124.4524,
"step": 1125
},
{
"epoch": 0.22757166682400326,
"grad_norm": 98.75,
"learning_rate": 8.583258728737691e-07,
"loss": 123.4843,
"step": 1130
},
{
"epoch": 0.2285786211019856,
"grad_norm": 95.0625,
"learning_rate": 8.572068039391226e-07,
"loss": 123.4948,
"step": 1135
},
{
"epoch": 0.2295855753799679,
"grad_norm": 100.125,
"learning_rate": 8.560877350044763e-07,
"loss": 123.4176,
"step": 1140
},
{
"epoch": 0.2305925296579502,
"grad_norm": 101.0,
"learning_rate": 8.549686660698298e-07,
"loss": 123.8447,
"step": 1145
},
{
"epoch": 0.23159948393593252,
"grad_norm": 95.625,
"learning_rate": 8.538495971351835e-07,
"loss": 124.9102,
"step": 1150
},
{
"epoch": 0.23260643821391486,
"grad_norm": 99.375,
"learning_rate": 8.527305282005372e-07,
"loss": 123.519,
"step": 1155
},
{
"epoch": 0.23361339249189716,
"grad_norm": 97.25,
"learning_rate": 8.516114592658907e-07,
"loss": 123.2095,
"step": 1160
},
{
"epoch": 0.23462034676987947,
"grad_norm": 100.0,
"learning_rate": 8.504923903312443e-07,
"loss": 122.7344,
"step": 1165
},
{
"epoch": 0.2356273010478618,
"grad_norm": 96.5,
"learning_rate": 8.49373321396598e-07,
"loss": 122.7375,
"step": 1170
},
{
"epoch": 0.23663425532584412,
"grad_norm": 98.0625,
"learning_rate": 8.482542524619516e-07,
"loss": 124.5513,
"step": 1175
},
{
"epoch": 0.23764120960382643,
"grad_norm": 101.0625,
"learning_rate": 8.471351835273053e-07,
"loss": 123.7978,
"step": 1180
},
{
"epoch": 0.23864816388180873,
"grad_norm": 98.6875,
"learning_rate": 8.460161145926588e-07,
"loss": 122.5464,
"step": 1185
},
{
"epoch": 0.23965511815979107,
"grad_norm": 100.8125,
"learning_rate": 8.448970456580125e-07,
"loss": 123.1573,
"step": 1190
},
{
"epoch": 0.24066207243777338,
"grad_norm": 100.25,
"learning_rate": 8.437779767233662e-07,
"loss": 123.0318,
"step": 1195
},
{
"epoch": 0.24166902671575569,
"grad_norm": 95.8125,
"learning_rate": 8.426589077887197e-07,
"loss": 124.5656,
"step": 1200
},
{
"epoch": 0.242675980993738,
"grad_norm": 93.5,
"learning_rate": 8.415398388540734e-07,
"loss": 123.566,
"step": 1205
},
{
"epoch": 0.24368293527172033,
"grad_norm": 97.0,
"learning_rate": 8.40420769919427e-07,
"loss": 121.6469,
"step": 1210
},
{
"epoch": 0.24468988954970264,
"grad_norm": 99.125,
"learning_rate": 8.393017009847806e-07,
"loss": 123.3631,
"step": 1215
},
{
"epoch": 0.24569684382768495,
"grad_norm": 98.25,
"learning_rate": 8.381826320501343e-07,
"loss": 122.9663,
"step": 1220
},
{
"epoch": 0.24670379810566725,
"grad_norm": 96.8125,
"learning_rate": 8.370635631154878e-07,
"loss": 123.2738,
"step": 1225
},
{
"epoch": 0.2477107523836496,
"grad_norm": 96.0625,
"learning_rate": 8.359444941808415e-07,
"loss": 122.7263,
"step": 1230
},
{
"epoch": 0.2487177066616319,
"grad_norm": 101.1875,
"learning_rate": 8.348254252461951e-07,
"loss": 123.9187,
"step": 1235
},
{
"epoch": 0.2497246609396142,
"grad_norm": 97.625,
"learning_rate": 8.337063563115487e-07,
"loss": 121.4109,
"step": 1240
},
{
"epoch": 0.2507316152175965,
"grad_norm": 97.125,
"learning_rate": 8.325872873769025e-07,
"loss": 120.955,
"step": 1245
},
{
"epoch": 0.2517385694955788,
"grad_norm": 100.5625,
"learning_rate": 8.31468218442256e-07,
"loss": 120.8311,
"step": 1250
},
{
"epoch": 0.25274552377356113,
"grad_norm": 97.5,
"learning_rate": 8.303491495076096e-07,
"loss": 121.3494,
"step": 1255
},
{
"epoch": 0.2537524780515435,
"grad_norm": 96.75,
"learning_rate": 8.292300805729633e-07,
"loss": 122.0731,
"step": 1260
},
{
"epoch": 0.2547594323295258,
"grad_norm": 99.1875,
"learning_rate": 8.281110116383169e-07,
"loss": 123.7734,
"step": 1265
},
{
"epoch": 0.2557663866075081,
"grad_norm": 98.75,
"learning_rate": 8.269919427036705e-07,
"loss": 121.2907,
"step": 1270
},
{
"epoch": 0.2567733408854904,
"grad_norm": 94.1875,
"learning_rate": 8.258728737690241e-07,
"loss": 121.2714,
"step": 1275
},
{
"epoch": 0.25778029516347273,
"grad_norm": 97.4375,
"learning_rate": 8.247538048343777e-07,
"loss": 121.5029,
"step": 1280
},
{
"epoch": 0.25878724944145504,
"grad_norm": 99.125,
"learning_rate": 8.236347358997315e-07,
"loss": 119.4266,
"step": 1285
},
{
"epoch": 0.25979420371943734,
"grad_norm": 98.6875,
"learning_rate": 8.22515666965085e-07,
"loss": 121.0074,
"step": 1290
},
{
"epoch": 0.26080115799741965,
"grad_norm": 98.375,
"learning_rate": 8.213965980304386e-07,
"loss": 120.9616,
"step": 1295
},
{
"epoch": 0.261808112275402,
"grad_norm": 98.875,
"learning_rate": 8.202775290957922e-07,
"loss": 120.9374,
"step": 1300
},
{
"epoch": 0.2628150665533843,
"grad_norm": 98.375,
"learning_rate": 8.191584601611459e-07,
"loss": 120.8794,
"step": 1305
},
{
"epoch": 0.26382202083136663,
"grad_norm": 96.75,
"learning_rate": 8.180393912264996e-07,
"loss": 120.3623,
"step": 1310
},
{
"epoch": 0.26482897510934894,
"grad_norm": 99.8125,
"learning_rate": 8.169203222918531e-07,
"loss": 121.5525,
"step": 1315
},
{
"epoch": 0.26583592938733125,
"grad_norm": 95.8125,
"learning_rate": 8.158012533572067e-07,
"loss": 122.4818,
"step": 1320
},
{
"epoch": 0.26684288366531356,
"grad_norm": 99.3125,
"learning_rate": 8.146821844225605e-07,
"loss": 121.3006,
"step": 1325
},
{
"epoch": 0.26784983794329587,
"grad_norm": 101.125,
"learning_rate": 8.13563115487914e-07,
"loss": 119.9612,
"step": 1330
},
{
"epoch": 0.26885679222127823,
"grad_norm": 96.8125,
"learning_rate": 8.124440465532676e-07,
"loss": 121.866,
"step": 1335
},
{
"epoch": 0.26986374649926054,
"grad_norm": 95.875,
"learning_rate": 8.113249776186212e-07,
"loss": 119.8637,
"step": 1340
},
{
"epoch": 0.27087070077724285,
"grad_norm": 97.6875,
"learning_rate": 8.102059086839749e-07,
"loss": 120.1557,
"step": 1345
},
{
"epoch": 0.27187765505522515,
"grad_norm": 102.9375,
"learning_rate": 8.090868397493286e-07,
"loss": 120.6683,
"step": 1350
},
{
"epoch": 0.27288460933320746,
"grad_norm": 98.9375,
"learning_rate": 8.079677708146821e-07,
"loss": 119.1554,
"step": 1355
},
{
"epoch": 0.27389156361118977,
"grad_norm": 102.25,
"learning_rate": 8.068487018800357e-07,
"loss": 119.5964,
"step": 1360
},
{
"epoch": 0.2748985178891721,
"grad_norm": 100.5625,
"learning_rate": 8.057296329453895e-07,
"loss": 118.3494,
"step": 1365
},
{
"epoch": 0.2759054721671544,
"grad_norm": 98.375,
"learning_rate": 8.04610564010743e-07,
"loss": 120.6742,
"step": 1370
},
{
"epoch": 0.27691242644513675,
"grad_norm": 99.125,
"learning_rate": 8.034914950760967e-07,
"loss": 120.3351,
"step": 1375
},
{
"epoch": 0.27791938072311906,
"grad_norm": 97.0625,
"learning_rate": 8.023724261414503e-07,
"loss": 120.4014,
"step": 1380
},
{
"epoch": 0.27892633500110137,
"grad_norm": 97.8125,
"learning_rate": 8.012533572068039e-07,
"loss": 119.3332,
"step": 1385
},
{
"epoch": 0.2799332892790837,
"grad_norm": 98.125,
"learning_rate": 8.001342882721576e-07,
"loss": 118.5001,
"step": 1390
},
{
"epoch": 0.280940243557066,
"grad_norm": 98.625,
"learning_rate": 7.990152193375111e-07,
"loss": 119.7872,
"step": 1395
},
{
"epoch": 0.2819471978350483,
"grad_norm": 99.1875,
"learning_rate": 7.978961504028648e-07,
"loss": 120.3533,
"step": 1400
},
{
"epoch": 0.2829541521130306,
"grad_norm": 97.5625,
"learning_rate": 7.967770814682184e-07,
"loss": 119.9716,
"step": 1405
},
{
"epoch": 0.2839611063910129,
"grad_norm": 100.125,
"learning_rate": 7.95658012533572e-07,
"loss": 118.2968,
"step": 1410
},
{
"epoch": 0.28496806066899527,
"grad_norm": 98.875,
"learning_rate": 7.945389435989257e-07,
"loss": 119.1026,
"step": 1415
},
{
"epoch": 0.2859750149469776,
"grad_norm": 95.75,
"learning_rate": 7.934198746642793e-07,
"loss": 118.9463,
"step": 1420
},
{
"epoch": 0.2869819692249599,
"grad_norm": 97.5,
"learning_rate": 7.923008057296329e-07,
"loss": 118.8782,
"step": 1425
},
{
"epoch": 0.2879889235029422,
"grad_norm": 99.5625,
"learning_rate": 7.911817367949866e-07,
"loss": 119.4829,
"step": 1430
},
{
"epoch": 0.2889958777809245,
"grad_norm": 96.75,
"learning_rate": 7.900626678603401e-07,
"loss": 117.9972,
"step": 1435
},
{
"epoch": 0.2900028320589068,
"grad_norm": 101.75,
"learning_rate": 7.889435989256938e-07,
"loss": 118.6872,
"step": 1440
},
{
"epoch": 0.2910097863368891,
"grad_norm": 100.0,
"learning_rate": 7.878245299910474e-07,
"loss": 119.475,
"step": 1445
},
{
"epoch": 0.29201674061487143,
"grad_norm": 102.9375,
"learning_rate": 7.86705461056401e-07,
"loss": 118.7264,
"step": 1450
},
{
"epoch": 0.2930236948928538,
"grad_norm": 100.0,
"learning_rate": 7.855863921217547e-07,
"loss": 118.6106,
"step": 1455
},
{
"epoch": 0.2940306491708361,
"grad_norm": 98.125,
"learning_rate": 7.844673231871083e-07,
"loss": 119.1069,
"step": 1460
},
{
"epoch": 0.2950376034488184,
"grad_norm": 102.9375,
"learning_rate": 7.833482542524619e-07,
"loss": 119.3006,
"step": 1465
},
{
"epoch": 0.2960445577268007,
"grad_norm": 102.9375,
"learning_rate": 7.822291853178155e-07,
"loss": 117.8286,
"step": 1470
},
{
"epoch": 0.297051512004783,
"grad_norm": 103.6875,
"learning_rate": 7.811101163831691e-07,
"loss": 119.018,
"step": 1475
},
{
"epoch": 0.29805846628276533,
"grad_norm": 101.375,
"learning_rate": 7.799910474485229e-07,
"loss": 119.4814,
"step": 1480
},
{
"epoch": 0.29906542056074764,
"grad_norm": 99.375,
"learning_rate": 7.788719785138764e-07,
"loss": 118.5113,
"step": 1485
},
{
"epoch": 0.30007237483873,
"grad_norm": 100.875,
"learning_rate": 7.7775290957923e-07,
"loss": 120.1722,
"step": 1490
},
{
"epoch": 0.3010793291167123,
"grad_norm": 99.8125,
"learning_rate": 7.766338406445838e-07,
"loss": 117.5911,
"step": 1495
},
{
"epoch": 0.3020862833946946,
"grad_norm": 97.75,
"learning_rate": 7.755147717099373e-07,
"loss": 116.4555,
"step": 1500
},
{
"epoch": 0.3020862833946946,
"eval_loss": 3.6974689960479736,
"eval_runtime": 239.1833,
"eval_samples_per_second": 1118.861,
"eval_steps_per_second": 34.965,
"step": 1500
},
{
"epoch": 0.30309323767267693,
"grad_norm": 99.75,
"learning_rate": 7.743957027752909e-07,
"loss": 117.8555,
"step": 1505
},
{
"epoch": 0.30410019195065924,
"grad_norm": 102.9375,
"learning_rate": 7.732766338406445e-07,
"loss": 118.8926,
"step": 1510
},
{
"epoch": 0.30510714622864155,
"grad_norm": 98.3125,
"learning_rate": 7.721575649059982e-07,
"loss": 118.4608,
"step": 1515
},
{
"epoch": 0.30611410050662385,
"grad_norm": 101.0625,
"learning_rate": 7.710384959713519e-07,
"loss": 117.194,
"step": 1520
},
{
"epoch": 0.30712105478460616,
"grad_norm": 100.0625,
"learning_rate": 7.699194270367054e-07,
"loss": 116.1222,
"step": 1525
},
{
"epoch": 0.3081280090625885,
"grad_norm": 102.625,
"learning_rate": 7.68800358102059e-07,
"loss": 116.386,
"step": 1530
},
{
"epoch": 0.30913496334057083,
"grad_norm": 99.1875,
"learning_rate": 7.676812891674127e-07,
"loss": 118.1568,
"step": 1535
},
{
"epoch": 0.31014191761855314,
"grad_norm": 102.125,
"learning_rate": 7.665622202327663e-07,
"loss": 117.6367,
"step": 1540
},
{
"epoch": 0.31114887189653545,
"grad_norm": 99.0625,
"learning_rate": 7.6544315129812e-07,
"loss": 117.9046,
"step": 1545
},
{
"epoch": 0.31215582617451776,
"grad_norm": 100.875,
"learning_rate": 7.643240823634735e-07,
"loss": 117.9566,
"step": 1550
},
{
"epoch": 0.31316278045250007,
"grad_norm": 99.5,
"learning_rate": 7.632050134288272e-07,
"loss": 115.877,
"step": 1555
},
{
"epoch": 0.3141697347304824,
"grad_norm": 102.875,
"learning_rate": 7.620859444941809e-07,
"loss": 118.0572,
"step": 1560
},
{
"epoch": 0.3151766890084647,
"grad_norm": 99.0,
"learning_rate": 7.609668755595344e-07,
"loss": 117.6729,
"step": 1565
},
{
"epoch": 0.31618364328644705,
"grad_norm": 100.0,
"learning_rate": 7.59847806624888e-07,
"loss": 116.7598,
"step": 1570
},
{
"epoch": 0.31719059756442936,
"grad_norm": 101.6875,
"learning_rate": 7.587287376902417e-07,
"loss": 116.1664,
"step": 1575
},
{
"epoch": 0.31819755184241166,
"grad_norm": 100.75,
"learning_rate": 7.576096687555953e-07,
"loss": 116.2372,
"step": 1580
},
{
"epoch": 0.31920450612039397,
"grad_norm": 101.8125,
"learning_rate": 7.56490599820949e-07,
"loss": 116.9723,
"step": 1585
},
{
"epoch": 0.3202114603983763,
"grad_norm": 101.25,
"learning_rate": 7.553715308863025e-07,
"loss": 118.3011,
"step": 1590
},
{
"epoch": 0.3212184146763586,
"grad_norm": 100.75,
"learning_rate": 7.542524619516562e-07,
"loss": 116.3056,
"step": 1595
},
{
"epoch": 0.3222253689543409,
"grad_norm": 101.625,
"learning_rate": 7.531333930170098e-07,
"loss": 116.9207,
"step": 1600
},
{
"epoch": 0.32323232323232326,
"grad_norm": 97.0625,
"learning_rate": 7.520143240823634e-07,
"loss": 116.7428,
"step": 1605
},
{
"epoch": 0.32423927751030557,
"grad_norm": 99.5625,
"learning_rate": 7.50895255147717e-07,
"loss": 115.7755,
"step": 1610
},
{
"epoch": 0.3252462317882879,
"grad_norm": 100.4375,
"learning_rate": 7.497761862130707e-07,
"loss": 117.4555,
"step": 1615
},
{
"epoch": 0.3262531860662702,
"grad_norm": 98.9375,
"learning_rate": 7.486571172784243e-07,
"loss": 114.5551,
"step": 1620
},
{
"epoch": 0.3272601403442525,
"grad_norm": 101.4375,
"learning_rate": 7.47538048343778e-07,
"loss": 115.1901,
"step": 1625
},
{
"epoch": 0.3282670946222348,
"grad_norm": 99.0,
"learning_rate": 7.464189794091316e-07,
"loss": 115.1624,
"step": 1630
},
{
"epoch": 0.3292740489002171,
"grad_norm": 97.25,
"learning_rate": 7.452999104744852e-07,
"loss": 116.2408,
"step": 1635
},
{
"epoch": 0.3302810031781994,
"grad_norm": 97.1875,
"learning_rate": 7.441808415398388e-07,
"loss": 115.4953,
"step": 1640
},
{
"epoch": 0.3312879574561818,
"grad_norm": 100.125,
"learning_rate": 7.430617726051924e-07,
"loss": 115.8885,
"step": 1645
},
{
"epoch": 0.3322949117341641,
"grad_norm": 100.0,
"learning_rate": 7.419427036705462e-07,
"loss": 114.8555,
"step": 1650
},
{
"epoch": 0.3333018660121464,
"grad_norm": 99.9375,
"learning_rate": 7.408236347358997e-07,
"loss": 117.4458,
"step": 1655
},
{
"epoch": 0.3343088202901287,
"grad_norm": 99.4375,
"learning_rate": 7.397045658012533e-07,
"loss": 115.667,
"step": 1660
},
{
"epoch": 0.335315774568111,
"grad_norm": 99.9375,
"learning_rate": 7.385854968666069e-07,
"loss": 115.4635,
"step": 1665
},
{
"epoch": 0.3363227288460933,
"grad_norm": 99.0625,
"learning_rate": 7.374664279319606e-07,
"loss": 116.1706,
"step": 1670
},
{
"epoch": 0.33732968312407563,
"grad_norm": 102.625,
"learning_rate": 7.363473589973142e-07,
"loss": 115.5255,
"step": 1675
},
{
"epoch": 0.33833663740205794,
"grad_norm": 101.25,
"learning_rate": 7.352282900626678e-07,
"loss": 116.4496,
"step": 1680
},
{
"epoch": 0.3393435916800403,
"grad_norm": 100.25,
"learning_rate": 7.341092211280214e-07,
"loss": 115.7757,
"step": 1685
},
{
"epoch": 0.3403505459580226,
"grad_norm": 98.125,
"learning_rate": 7.329901521933752e-07,
"loss": 114.8857,
"step": 1690
},
{
"epoch": 0.3413575002360049,
"grad_norm": 100.5625,
"learning_rate": 7.318710832587287e-07,
"loss": 117.0493,
"step": 1695
},
{
"epoch": 0.3423644545139872,
"grad_norm": 100.5,
"learning_rate": 7.307520143240823e-07,
"loss": 116.3239,
"step": 1700
},
{
"epoch": 0.34337140879196953,
"grad_norm": 99.9375,
"learning_rate": 7.296329453894359e-07,
"loss": 113.2169,
"step": 1705
},
{
"epoch": 0.34437836306995184,
"grad_norm": 98.6875,
"learning_rate": 7.285138764547896e-07,
"loss": 114.3095,
"step": 1710
},
{
"epoch": 0.34538531734793415,
"grad_norm": 101.0625,
"learning_rate": 7.273948075201433e-07,
"loss": 116.0238,
"step": 1715
},
{
"epoch": 0.34639227162591646,
"grad_norm": 99.375,
"learning_rate": 7.262757385854968e-07,
"loss": 115.388,
"step": 1720
},
{
"epoch": 0.3473992259038988,
"grad_norm": 103.3125,
"learning_rate": 7.251566696508504e-07,
"loss": 115.1991,
"step": 1725
},
{
"epoch": 0.34840618018188113,
"grad_norm": 102.0625,
"learning_rate": 7.240376007162041e-07,
"loss": 113.6285,
"step": 1730
},
{
"epoch": 0.34941313445986344,
"grad_norm": 102.25,
"learning_rate": 7.229185317815577e-07,
"loss": 113.7743,
"step": 1735
},
{
"epoch": 0.35042008873784575,
"grad_norm": 100.0,
"learning_rate": 7.217994628469113e-07,
"loss": 113.9617,
"step": 1740
},
{
"epoch": 0.35142704301582806,
"grad_norm": 104.25,
"learning_rate": 7.20680393912265e-07,
"loss": 115.0941,
"step": 1745
},
{
"epoch": 0.35243399729381036,
"grad_norm": 99.5,
"learning_rate": 7.195613249776186e-07,
"loss": 113.6155,
"step": 1750
},
{
"epoch": 0.35344095157179267,
"grad_norm": 101.375,
"learning_rate": 7.184422560429723e-07,
"loss": 115.6367,
"step": 1755
},
{
"epoch": 0.35444790584977504,
"grad_norm": 96.0625,
"learning_rate": 7.173231871083258e-07,
"loss": 115.1059,
"step": 1760
},
{
"epoch": 0.35545486012775734,
"grad_norm": 98.875,
"learning_rate": 7.162041181736795e-07,
"loss": 114.0925,
"step": 1765
},
{
"epoch": 0.35646181440573965,
"grad_norm": 103.625,
"learning_rate": 7.150850492390331e-07,
"loss": 114.7297,
"step": 1770
},
{
"epoch": 0.35746876868372196,
"grad_norm": 96.8125,
"learning_rate": 7.139659803043867e-07,
"loss": 114.0845,
"step": 1775
},
{
"epoch": 0.35847572296170427,
"grad_norm": 104.25,
"learning_rate": 7.128469113697403e-07,
"loss": 114.4882,
"step": 1780
},
{
"epoch": 0.3594826772396866,
"grad_norm": 101.625,
"learning_rate": 7.11727842435094e-07,
"loss": 114.6614,
"step": 1785
},
{
"epoch": 0.3604896315176689,
"grad_norm": 100.125,
"learning_rate": 7.106087735004476e-07,
"loss": 115.3073,
"step": 1790
},
{
"epoch": 0.3614965857956512,
"grad_norm": 100.3125,
"learning_rate": 7.094897045658012e-07,
"loss": 116.3388,
"step": 1795
},
{
"epoch": 0.36250354007363356,
"grad_norm": 102.6875,
"learning_rate": 7.083706356311548e-07,
"loss": 114.3609,
"step": 1800
},
{
"epoch": 0.36351049435161586,
"grad_norm": 99.0,
"learning_rate": 7.072515666965085e-07,
"loss": 113.4134,
"step": 1805
},
{
"epoch": 0.3645174486295982,
"grad_norm": 102.8125,
"learning_rate": 7.061324977618621e-07,
"loss": 113.573,
"step": 1810
},
{
"epoch": 0.3655244029075805,
"grad_norm": 102.4375,
"learning_rate": 7.050134288272157e-07,
"loss": 114.1263,
"step": 1815
},
{
"epoch": 0.3665313571855628,
"grad_norm": 104.0,
"learning_rate": 7.038943598925694e-07,
"loss": 114.0493,
"step": 1820
},
{
"epoch": 0.3675383114635451,
"grad_norm": 101.75,
"learning_rate": 7.02775290957923e-07,
"loss": 113.4941,
"step": 1825
},
{
"epoch": 0.3685452657415274,
"grad_norm": 100.5625,
"learning_rate": 7.016562220232766e-07,
"loss": 114.3765,
"step": 1830
},
{
"epoch": 0.3695522200195097,
"grad_norm": 100.3125,
"learning_rate": 7.005371530886302e-07,
"loss": 113.0758,
"step": 1835
},
{
"epoch": 0.3705591742974921,
"grad_norm": 100.875,
"learning_rate": 6.994180841539838e-07,
"loss": 112.9355,
"step": 1840
},
{
"epoch": 0.3715661285754744,
"grad_norm": 99.8125,
"learning_rate": 6.982990152193375e-07,
"loss": 113.0069,
"step": 1845
},
{
"epoch": 0.3725730828534567,
"grad_norm": 104.9375,
"learning_rate": 6.971799462846911e-07,
"loss": 114.7156,
"step": 1850
},
{
"epoch": 0.373580037131439,
"grad_norm": 103.0,
"learning_rate": 6.960608773500447e-07,
"loss": 112.6265,
"step": 1855
},
{
"epoch": 0.3745869914094213,
"grad_norm": 101.25,
"learning_rate": 6.949418084153985e-07,
"loss": 112.8266,
"step": 1860
},
{
"epoch": 0.3755939456874036,
"grad_norm": 101.3125,
"learning_rate": 6.93822739480752e-07,
"loss": 112.4899,
"step": 1865
},
{
"epoch": 0.3766008999653859,
"grad_norm": 99.75,
"learning_rate": 6.927036705461056e-07,
"loss": 113.6427,
"step": 1870
},
{
"epoch": 0.37760785424336824,
"grad_norm": 99.5,
"learning_rate": 6.915846016114592e-07,
"loss": 113.0808,
"step": 1875
},
{
"epoch": 0.3786148085213506,
"grad_norm": 101.375,
"learning_rate": 6.904655326768129e-07,
"loss": 113.0404,
"step": 1880
},
{
"epoch": 0.3796217627993329,
"grad_norm": 103.125,
"learning_rate": 6.893464637421666e-07,
"loss": 110.9456,
"step": 1885
},
{
"epoch": 0.3806287170773152,
"grad_norm": 100.6875,
"learning_rate": 6.882273948075201e-07,
"loss": 112.3675,
"step": 1890
},
{
"epoch": 0.3816356713552975,
"grad_norm": 102.3125,
"learning_rate": 6.871083258728737e-07,
"loss": 112.1686,
"step": 1895
},
{
"epoch": 0.38264262563327983,
"grad_norm": 98.3125,
"learning_rate": 6.859892569382274e-07,
"loss": 114.4239,
"step": 1900
},
{
"epoch": 0.38364957991126214,
"grad_norm": 101.375,
"learning_rate": 6.84870188003581e-07,
"loss": 111.5497,
"step": 1905
},
{
"epoch": 0.38465653418924445,
"grad_norm": 102.1875,
"learning_rate": 6.837511190689346e-07,
"loss": 111.1532,
"step": 1910
},
{
"epoch": 0.3856634884672268,
"grad_norm": 100.875,
"learning_rate": 6.826320501342882e-07,
"loss": 112.9669,
"step": 1915
},
{
"epoch": 0.3866704427452091,
"grad_norm": 101.625,
"learning_rate": 6.815129811996419e-07,
"loss": 113.497,
"step": 1920
},
{
"epoch": 0.38767739702319143,
"grad_norm": 99.9375,
"learning_rate": 6.803939122649956e-07,
"loss": 113.1546,
"step": 1925
},
{
"epoch": 0.38868435130117374,
"grad_norm": 103.5625,
"learning_rate": 6.792748433303491e-07,
"loss": 111.5201,
"step": 1930
},
{
"epoch": 0.38969130557915604,
"grad_norm": 101.3125,
"learning_rate": 6.781557743957027e-07,
"loss": 112.034,
"step": 1935
},
{
"epoch": 0.39069825985713835,
"grad_norm": 101.875,
"learning_rate": 6.770367054610564e-07,
"loss": 109.9169,
"step": 1940
},
{
"epoch": 0.39170521413512066,
"grad_norm": 97.9375,
"learning_rate": 6.7591763652641e-07,
"loss": 113.2691,
"step": 1945
},
{
"epoch": 0.39271216841310297,
"grad_norm": 103.375,
"learning_rate": 6.747985675917636e-07,
"loss": 112.5936,
"step": 1950
},
{
"epoch": 0.39371912269108533,
"grad_norm": 97.1875,
"learning_rate": 6.736794986571172e-07,
"loss": 112.2075,
"step": 1955
},
{
"epoch": 0.39472607696906764,
"grad_norm": 103.25,
"learning_rate": 6.725604297224709e-07,
"loss": 114.289,
"step": 1960
},
{
"epoch": 0.39573303124704995,
"grad_norm": 100.3125,
"learning_rate": 6.714413607878245e-07,
"loss": 111.8496,
"step": 1965
},
{
"epoch": 0.39673998552503226,
"grad_norm": 100.9375,
"learning_rate": 6.703222918531781e-07,
"loss": 112.454,
"step": 1970
},
{
"epoch": 0.39774693980301457,
"grad_norm": 103.375,
"learning_rate": 6.692032229185317e-07,
"loss": 112.7298,
"step": 1975
},
{
"epoch": 0.3987538940809969,
"grad_norm": 100.0,
"learning_rate": 6.680841539838854e-07,
"loss": 112.2657,
"step": 1980
},
{
"epoch": 0.3997608483589792,
"grad_norm": 102.875,
"learning_rate": 6.66965085049239e-07,
"loss": 110.3278,
"step": 1985
},
{
"epoch": 0.4007678026369615,
"grad_norm": 102.1875,
"learning_rate": 6.658460161145926e-07,
"loss": 112.1858,
"step": 1990
},
{
"epoch": 0.40177475691494385,
"grad_norm": 100.0,
"learning_rate": 6.647269471799463e-07,
"loss": 112.2546,
"step": 1995
},
{
"epoch": 0.40278171119292616,
"grad_norm": 99.25,
"learning_rate": 6.636078782452999e-07,
"loss": 112.8675,
"step": 2000
},
{
"epoch": 0.40278171119292616,
"eval_loss": 3.4998972415924072,
"eval_runtime": 241.0777,
"eval_samples_per_second": 1110.069,
"eval_steps_per_second": 34.69,
"step": 2000
},
{
"epoch": 0.40378866547090847,
"grad_norm": 104.3125,
"learning_rate": 6.624888093106535e-07,
"loss": 112.4003,
"step": 2005
},
{
"epoch": 0.4047956197488908,
"grad_norm": 103.5,
"learning_rate": 6.613697403760071e-07,
"loss": 112.6595,
"step": 2010
},
{
"epoch": 0.4058025740268731,
"grad_norm": 103.75,
"learning_rate": 6.602506714413608e-07,
"loss": 112.9923,
"step": 2015
},
{
"epoch": 0.4068095283048554,
"grad_norm": 103.4375,
"learning_rate": 6.591316025067144e-07,
"loss": 112.2758,
"step": 2020
},
{
"epoch": 0.4078164825828377,
"grad_norm": 103.5625,
"learning_rate": 6.58012533572068e-07,
"loss": 111.0822,
"step": 2025
},
{
"epoch": 0.40882343686082,
"grad_norm": 103.6875,
"learning_rate": 6.568934646374216e-07,
"loss": 113.0516,
"step": 2030
},
{
"epoch": 0.4098303911388024,
"grad_norm": 103.8125,
"learning_rate": 6.557743957027753e-07,
"loss": 113.0746,
"step": 2035
},
{
"epoch": 0.4108373454167847,
"grad_norm": 103.25,
"learning_rate": 6.546553267681289e-07,
"loss": 110.6382,
"step": 2040
},
{
"epoch": 0.411844299694767,
"grad_norm": 101.625,
"learning_rate": 6.535362578334825e-07,
"loss": 113.3481,
"step": 2045
},
{
"epoch": 0.4128512539727493,
"grad_norm": 100.3125,
"learning_rate": 6.524171888988361e-07,
"loss": 111.0591,
"step": 2050
},
{
"epoch": 0.4138582082507316,
"grad_norm": 104.5,
"learning_rate": 6.512981199641899e-07,
"loss": 110.473,
"step": 2055
},
{
"epoch": 0.4148651625287139,
"grad_norm": 99.25,
"learning_rate": 6.501790510295434e-07,
"loss": 110.724,
"step": 2060
},
{
"epoch": 0.4158721168066962,
"grad_norm": 103.375,
"learning_rate": 6.49059982094897e-07,
"loss": 113.1481,
"step": 2065
},
{
"epoch": 0.4168790710846786,
"grad_norm": 101.3125,
"learning_rate": 6.479409131602506e-07,
"loss": 111.1849,
"step": 2070
},
{
"epoch": 0.4178860253626609,
"grad_norm": 101.8125,
"learning_rate": 6.468218442256043e-07,
"loss": 111.4414,
"step": 2075
},
{
"epoch": 0.4188929796406432,
"grad_norm": 104.3125,
"learning_rate": 6.457027752909579e-07,
"loss": 110.1911,
"step": 2080
},
{
"epoch": 0.4198999339186255,
"grad_norm": 99.6875,
"learning_rate": 6.445837063563115e-07,
"loss": 112.5238,
"step": 2085
},
{
"epoch": 0.4209068881966078,
"grad_norm": 104.1875,
"learning_rate": 6.434646374216651e-07,
"loss": 111.1536,
"step": 2090
},
{
"epoch": 0.42191384247459013,
"grad_norm": 101.8125,
"learning_rate": 6.423455684870188e-07,
"loss": 110.8878,
"step": 2095
},
{
"epoch": 0.42292079675257244,
"grad_norm": 102.875,
"learning_rate": 6.412264995523724e-07,
"loss": 111.2218,
"step": 2100
},
{
"epoch": 0.42392775103055474,
"grad_norm": 102.25,
"learning_rate": 6.40107430617726e-07,
"loss": 110.9818,
"step": 2105
},
{
"epoch": 0.4249347053085371,
"grad_norm": 100.5625,
"learning_rate": 6.389883616830797e-07,
"loss": 111.3958,
"step": 2110
},
{
"epoch": 0.4259416595865194,
"grad_norm": 103.3125,
"learning_rate": 6.378692927484333e-07,
"loss": 112.1829,
"step": 2115
},
{
"epoch": 0.4269486138645017,
"grad_norm": 103.0625,
"learning_rate": 6.367502238137868e-07,
"loss": 111.266,
"step": 2120
},
{
"epoch": 0.42795556814248403,
"grad_norm": 103.375,
"learning_rate": 6.356311548791405e-07,
"loss": 110.5654,
"step": 2125
},
{
"epoch": 0.42896252242046634,
"grad_norm": 101.4375,
"learning_rate": 6.345120859444942e-07,
"loss": 110.188,
"step": 2130
},
{
"epoch": 0.42996947669844865,
"grad_norm": 101.25,
"learning_rate": 6.333930170098478e-07,
"loss": 110.1502,
"step": 2135
},
{
"epoch": 0.43097643097643096,
"grad_norm": 103.125,
"learning_rate": 6.322739480752014e-07,
"loss": 110.9768,
"step": 2140
},
{
"epoch": 0.43198338525441327,
"grad_norm": 101.1875,
"learning_rate": 6.31154879140555e-07,
"loss": 109.8121,
"step": 2145
},
{
"epoch": 0.43299033953239563,
"grad_norm": 101.4375,
"learning_rate": 6.300358102059087e-07,
"loss": 109.5695,
"step": 2150
},
{
"epoch": 0.43399729381037794,
"grad_norm": 100.625,
"learning_rate": 6.289167412712623e-07,
"loss": 110.7248,
"step": 2155
},
{
"epoch": 0.43500424808836025,
"grad_norm": 103.375,
"learning_rate": 6.277976723366159e-07,
"loss": 110.0195,
"step": 2160
},
{
"epoch": 0.43601120236634255,
"grad_norm": 97.375,
"learning_rate": 6.266786034019695e-07,
"loss": 110.2418,
"step": 2165
},
{
"epoch": 0.43701815664432486,
"grad_norm": 102.4375,
"learning_rate": 6.255595344673232e-07,
"loss": 109.4576,
"step": 2170
},
{
"epoch": 0.43802511092230717,
"grad_norm": 104.25,
"learning_rate": 6.244404655326768e-07,
"loss": 109.8756,
"step": 2175
},
{
"epoch": 0.4390320652002895,
"grad_norm": 103.375,
"learning_rate": 6.233213965980304e-07,
"loss": 111.7631,
"step": 2180
},
{
"epoch": 0.44003901947827184,
"grad_norm": 105.125,
"learning_rate": 6.222023276633839e-07,
"loss": 110.7248,
"step": 2185
},
{
"epoch": 0.44104597375625415,
"grad_norm": 98.0625,
"learning_rate": 6.210832587287377e-07,
"loss": 109.8976,
"step": 2190
},
{
"epoch": 0.44205292803423646,
"grad_norm": 101.0,
"learning_rate": 6.199641897940913e-07,
"loss": 111.2177,
"step": 2195
},
{
"epoch": 0.44305988231221877,
"grad_norm": 99.5,
"learning_rate": 6.188451208594449e-07,
"loss": 110.6901,
"step": 2200
},
{
"epoch": 0.4440668365902011,
"grad_norm": 102.5,
"learning_rate": 6.177260519247985e-07,
"loss": 110.3281,
"step": 2205
},
{
"epoch": 0.4450737908681834,
"grad_norm": 103.375,
"learning_rate": 6.166069829901522e-07,
"loss": 110.1482,
"step": 2210
},
{
"epoch": 0.4460807451461657,
"grad_norm": 101.125,
"learning_rate": 6.154879140555058e-07,
"loss": 110.9915,
"step": 2215
},
{
"epoch": 0.447087699424148,
"grad_norm": 101.5625,
"learning_rate": 6.143688451208594e-07,
"loss": 109.2988,
"step": 2220
},
{
"epoch": 0.44809465370213036,
"grad_norm": 101.5625,
"learning_rate": 6.132497761862131e-07,
"loss": 110.0758,
"step": 2225
},
{
"epoch": 0.44910160798011267,
"grad_norm": 99.9375,
"learning_rate": 6.121307072515667e-07,
"loss": 108.8493,
"step": 2230
},
{
"epoch": 0.450108562258095,
"grad_norm": 100.4375,
"learning_rate": 6.110116383169203e-07,
"loss": 108.6601,
"step": 2235
},
{
"epoch": 0.4511155165360773,
"grad_norm": 102.9375,
"learning_rate": 6.098925693822739e-07,
"loss": 109.1784,
"step": 2240
},
{
"epoch": 0.4521224708140596,
"grad_norm": 101.0625,
"learning_rate": 6.087735004476276e-07,
"loss": 109.5287,
"step": 2245
},
{
"epoch": 0.4531294250920419,
"grad_norm": 101.0,
"learning_rate": 6.076544315129812e-07,
"loss": 110.2235,
"step": 2250
},
{
"epoch": 0.4541363793700242,
"grad_norm": 106.1875,
"learning_rate": 6.065353625783348e-07,
"loss": 109.2282,
"step": 2255
},
{
"epoch": 0.4551433336480065,
"grad_norm": 103.75,
"learning_rate": 6.054162936436884e-07,
"loss": 108.3096,
"step": 2260
},
{
"epoch": 0.4561502879259889,
"grad_norm": 96.9375,
"learning_rate": 6.042972247090421e-07,
"loss": 109.567,
"step": 2265
},
{
"epoch": 0.4571572422039712,
"grad_norm": 103.875,
"learning_rate": 6.031781557743957e-07,
"loss": 108.5729,
"step": 2270
},
{
"epoch": 0.4581641964819535,
"grad_norm": 101.4375,
"learning_rate": 6.020590868397493e-07,
"loss": 108.8917,
"step": 2275
},
{
"epoch": 0.4591711507599358,
"grad_norm": 104.4375,
"learning_rate": 6.009400179051029e-07,
"loss": 109.6874,
"step": 2280
},
{
"epoch": 0.4601781050379181,
"grad_norm": 101.0625,
"learning_rate": 5.998209489704566e-07,
"loss": 109.5204,
"step": 2285
},
{
"epoch": 0.4611850593159004,
"grad_norm": 100.5625,
"learning_rate": 5.987018800358101e-07,
"loss": 108.9875,
"step": 2290
},
{
"epoch": 0.46219201359388273,
"grad_norm": 103.25,
"learning_rate": 5.975828111011638e-07,
"loss": 108.4358,
"step": 2295
},
{
"epoch": 0.46319896787186504,
"grad_norm": 102.4375,
"learning_rate": 5.964637421665174e-07,
"loss": 109.1291,
"step": 2300
},
{
"epoch": 0.4642059221498474,
"grad_norm": 101.0,
"learning_rate": 5.953446732318711e-07,
"loss": 108.4389,
"step": 2305
},
{
"epoch": 0.4652128764278297,
"grad_norm": 101.875,
"learning_rate": 5.942256042972247e-07,
"loss": 109.845,
"step": 2310
},
{
"epoch": 0.466219830705812,
"grad_norm": 102.25,
"learning_rate": 5.931065353625782e-07,
"loss": 108.6821,
"step": 2315
},
{
"epoch": 0.46722678498379433,
"grad_norm": 102.625,
"learning_rate": 5.919874664279319e-07,
"loss": 109.3875,
"step": 2320
},
{
"epoch": 0.46823373926177664,
"grad_norm": 101.5625,
"learning_rate": 5.908683974932856e-07,
"loss": 108.3035,
"step": 2325
},
{
"epoch": 0.46924069353975895,
"grad_norm": 100.375,
"learning_rate": 5.897493285586392e-07,
"loss": 107.0217,
"step": 2330
},
{
"epoch": 0.47024764781774125,
"grad_norm": 105.0,
"learning_rate": 5.886302596239928e-07,
"loss": 110.8466,
"step": 2335
},
{
"epoch": 0.4712546020957236,
"grad_norm": 103.5,
"learning_rate": 5.875111906893464e-07,
"loss": 109.4547,
"step": 2340
},
{
"epoch": 0.4722615563737059,
"grad_norm": 100.9375,
"learning_rate": 5.863921217547001e-07,
"loss": 107.5557,
"step": 2345
},
{
"epoch": 0.47326851065168823,
"grad_norm": 102.1875,
"learning_rate": 5.852730528200537e-07,
"loss": 109.2353,
"step": 2350
},
{
"epoch": 0.47427546492967054,
"grad_norm": 101.5,
"learning_rate": 5.841539838854072e-07,
"loss": 108.1182,
"step": 2355
},
{
"epoch": 0.47528241920765285,
"grad_norm": 102.6875,
"learning_rate": 5.83034914950761e-07,
"loss": 107.2573,
"step": 2360
},
{
"epoch": 0.47628937348563516,
"grad_norm": 102.1875,
"learning_rate": 5.819158460161146e-07,
"loss": 108.402,
"step": 2365
},
{
"epoch": 0.47729632776361747,
"grad_norm": 101.5,
"learning_rate": 5.807967770814682e-07,
"loss": 108.3766,
"step": 2370
},
{
"epoch": 0.4783032820415998,
"grad_norm": 104.0,
"learning_rate": 5.796777081468218e-07,
"loss": 108.5534,
"step": 2375
},
{
"epoch": 0.47931023631958214,
"grad_norm": 100.3125,
"learning_rate": 5.785586392121755e-07,
"loss": 109.6284,
"step": 2380
},
{
"epoch": 0.48031719059756445,
"grad_norm": 105.5,
"learning_rate": 5.774395702775291e-07,
"loss": 108.7794,
"step": 2385
},
{
"epoch": 0.48132414487554676,
"grad_norm": 104.375,
"learning_rate": 5.763205013428827e-07,
"loss": 107.7422,
"step": 2390
},
{
"epoch": 0.48233109915352906,
"grad_norm": 102.0625,
"learning_rate": 5.752014324082363e-07,
"loss": 108.9189,
"step": 2395
},
{
"epoch": 0.48333805343151137,
"grad_norm": 100.1875,
"learning_rate": 5.7408236347359e-07,
"loss": 107.5696,
"step": 2400
},
{
"epoch": 0.4843450077094937,
"grad_norm": 102.6875,
"learning_rate": 5.729632945389436e-07,
"loss": 108.2401,
"step": 2405
},
{
"epoch": 0.485351961987476,
"grad_norm": 101.625,
"learning_rate": 5.718442256042972e-07,
"loss": 107.8118,
"step": 2410
},
{
"epoch": 0.4863589162654583,
"grad_norm": 101.75,
"learning_rate": 5.707251566696508e-07,
"loss": 108.2832,
"step": 2415
},
{
"epoch": 0.48736587054344066,
"grad_norm": 102.375,
"learning_rate": 5.696060877350044e-07,
"loss": 109.5226,
"step": 2420
},
{
"epoch": 0.48837282482142297,
"grad_norm": 107.625,
"learning_rate": 5.684870188003581e-07,
"loss": 109.3006,
"step": 2425
},
{
"epoch": 0.4893797790994053,
"grad_norm": 102.3125,
"learning_rate": 5.673679498657117e-07,
"loss": 108.7192,
"step": 2430
},
{
"epoch": 0.4903867333773876,
"grad_norm": 102.6875,
"learning_rate": 5.662488809310653e-07,
"loss": 106.2096,
"step": 2435
},
{
"epoch": 0.4913936876553699,
"grad_norm": 101.3125,
"learning_rate": 5.65129811996419e-07,
"loss": 106.725,
"step": 2440
},
{
"epoch": 0.4924006419333522,
"grad_norm": 100.1875,
"learning_rate": 5.640107430617726e-07,
"loss": 107.4696,
"step": 2445
},
{
"epoch": 0.4934075962113345,
"grad_norm": 102.8125,
"learning_rate": 5.628916741271262e-07,
"loss": 108.2278,
"step": 2450
},
{
"epoch": 0.4944145504893168,
"grad_norm": 100.0625,
"learning_rate": 5.617726051924798e-07,
"loss": 108.2526,
"step": 2455
},
{
"epoch": 0.4954215047672992,
"grad_norm": 100.875,
"learning_rate": 5.606535362578334e-07,
"loss": 108.8548,
"step": 2460
},
{
"epoch": 0.4964284590452815,
"grad_norm": 103.4375,
"learning_rate": 5.595344673231871e-07,
"loss": 107.2892,
"step": 2465
},
{
"epoch": 0.4974354133232638,
"grad_norm": 101.6875,
"learning_rate": 5.584153983885407e-07,
"loss": 106.3791,
"step": 2470
},
{
"epoch": 0.4984423676012461,
"grad_norm": 99.0625,
"learning_rate": 5.572963294538944e-07,
"loss": 108.912,
"step": 2475
},
{
"epoch": 0.4994493218792284,
"grad_norm": 103.5625,
"learning_rate": 5.56177260519248e-07,
"loss": 108.9683,
"step": 2480
},
{
"epoch": 0.5004562761572108,
"grad_norm": 101.6875,
"learning_rate": 5.550581915846015e-07,
"loss": 106.8892,
"step": 2485
},
{
"epoch": 0.501463230435193,
"grad_norm": 106.3125,
"learning_rate": 5.539391226499552e-07,
"loss": 107.4377,
"step": 2490
},
{
"epoch": 0.5024701847131754,
"grad_norm": 101.625,
"learning_rate": 5.528200537153089e-07,
"loss": 107.6573,
"step": 2495
},
{
"epoch": 0.5034771389911576,
"grad_norm": 99.375,
"learning_rate": 5.517009847806625e-07,
"loss": 108.9813,
"step": 2500
},
{
"epoch": 0.5034771389911576,
"eval_loss": 3.3530521392822266,
"eval_runtime": 240.5322,
"eval_samples_per_second": 1112.587,
"eval_steps_per_second": 34.769,
"step": 2500
},
{
"epoch": 0.50448409326914,
"grad_norm": 102.0,
"learning_rate": 5.505819158460161e-07,
"loss": 107.6658,
"step": 2505
},
{
"epoch": 0.5054910475471223,
"grad_norm": 102.1875,
"learning_rate": 5.494628469113697e-07,
"loss": 107.6049,
"step": 2510
},
{
"epoch": 0.5064980018251046,
"grad_norm": 99.25,
"learning_rate": 5.483437779767234e-07,
"loss": 106.7267,
"step": 2515
},
{
"epoch": 0.507504956103087,
"grad_norm": 102.125,
"learning_rate": 5.47224709042077e-07,
"loss": 106.255,
"step": 2520
},
{
"epoch": 0.5085119103810692,
"grad_norm": 100.875,
"learning_rate": 5.461056401074305e-07,
"loss": 107.4151,
"step": 2525
},
{
"epoch": 0.5095188646590516,
"grad_norm": 103.375,
"learning_rate": 5.449865711727842e-07,
"loss": 107.2014,
"step": 2530
},
{
"epoch": 0.5105258189370339,
"grad_norm": 101.75,
"learning_rate": 5.438675022381379e-07,
"loss": 106.2904,
"step": 2535
},
{
"epoch": 0.5115327732150162,
"grad_norm": 101.6875,
"learning_rate": 5.427484333034915e-07,
"loss": 105.7796,
"step": 2540
},
{
"epoch": 0.5125397274929985,
"grad_norm": 100.5,
"learning_rate": 5.416293643688451e-07,
"loss": 107.3293,
"step": 2545
},
{
"epoch": 0.5135466817709808,
"grad_norm": 102.9375,
"learning_rate": 5.405102954341986e-07,
"loss": 105.9381,
"step": 2550
},
{
"epoch": 0.5145536360489632,
"grad_norm": 103.8125,
"learning_rate": 5.393912264995524e-07,
"loss": 106.3883,
"step": 2555
},
{
"epoch": 0.5155605903269455,
"grad_norm": 104.625,
"learning_rate": 5.38272157564906e-07,
"loss": 106.8838,
"step": 2560
},
{
"epoch": 0.5165675446049278,
"grad_norm": 100.375,
"learning_rate": 5.371530886302596e-07,
"loss": 104.9295,
"step": 2565
},
{
"epoch": 0.5175744988829101,
"grad_norm": 104.3125,
"learning_rate": 5.360340196956132e-07,
"loss": 105.8216,
"step": 2570
},
{
"epoch": 0.5185814531608924,
"grad_norm": 103.8125,
"learning_rate": 5.349149507609669e-07,
"loss": 107.064,
"step": 2575
},
{
"epoch": 0.5195884074388747,
"grad_norm": 102.25,
"learning_rate": 5.337958818263205e-07,
"loss": 107.4001,
"step": 2580
},
{
"epoch": 0.520595361716857,
"grad_norm": 104.75,
"learning_rate": 5.326768128916741e-07,
"loss": 105.6752,
"step": 2585
},
{
"epoch": 0.5216023159948393,
"grad_norm": 105.1875,
"learning_rate": 5.315577439570276e-07,
"loss": 106.9133,
"step": 2590
},
{
"epoch": 0.5226092702728217,
"grad_norm": 101.875,
"learning_rate": 5.304386750223814e-07,
"loss": 106.7293,
"step": 2595
},
{
"epoch": 0.523616224550804,
"grad_norm": 100.625,
"learning_rate": 5.29319606087735e-07,
"loss": 105.289,
"step": 2600
},
{
"epoch": 0.5246231788287863,
"grad_norm": 102.625,
"learning_rate": 5.282005371530886e-07,
"loss": 107.3164,
"step": 2605
},
{
"epoch": 0.5256301331067686,
"grad_norm": 104.75,
"learning_rate": 5.270814682184423e-07,
"loss": 107.9815,
"step": 2610
},
{
"epoch": 0.5266370873847509,
"grad_norm": 102.625,
"learning_rate": 5.259623992837958e-07,
"loss": 106.1467,
"step": 2615
},
{
"epoch": 0.5276440416627333,
"grad_norm": 106.6875,
"learning_rate": 5.248433303491495e-07,
"loss": 106.5354,
"step": 2620
},
{
"epoch": 0.5286509959407155,
"grad_norm": 104.1875,
"learning_rate": 5.237242614145031e-07,
"loss": 106.1924,
"step": 2625
},
{
"epoch": 0.5296579502186979,
"grad_norm": 102.625,
"learning_rate": 5.226051924798567e-07,
"loss": 107.1888,
"step": 2630
},
{
"epoch": 0.5306649044966802,
"grad_norm": 99.5625,
"learning_rate": 5.214861235452104e-07,
"loss": 105.9879,
"step": 2635
},
{
"epoch": 0.5316718587746625,
"grad_norm": 100.625,
"learning_rate": 5.20367054610564e-07,
"loss": 105.3448,
"step": 2640
},
{
"epoch": 0.5326788130526449,
"grad_norm": 103.875,
"learning_rate": 5.192479856759176e-07,
"loss": 107.7056,
"step": 2645
},
{
"epoch": 0.5336857673306271,
"grad_norm": 102.875,
"learning_rate": 5.181289167412713e-07,
"loss": 105.9089,
"step": 2650
},
{
"epoch": 0.5346927216086095,
"grad_norm": 103.6875,
"learning_rate": 5.170098478066248e-07,
"loss": 106.6165,
"step": 2655
},
{
"epoch": 0.5356996758865917,
"grad_norm": 100.5625,
"learning_rate": 5.158907788719785e-07,
"loss": 106.0283,
"step": 2660
},
{
"epoch": 0.5367066301645741,
"grad_norm": 105.0,
"learning_rate": 5.147717099373321e-07,
"loss": 106.3744,
"step": 2665
},
{
"epoch": 0.5377135844425565,
"grad_norm": 107.625,
"learning_rate": 5.136526410026858e-07,
"loss": 105.7139,
"step": 2670
},
{
"epoch": 0.5387205387205387,
"grad_norm": 103.125,
"learning_rate": 5.125335720680394e-07,
"loss": 106.4616,
"step": 2675
},
{
"epoch": 0.5397274929985211,
"grad_norm": 99.125,
"learning_rate": 5.114145031333929e-07,
"loss": 103.4881,
"step": 2680
},
{
"epoch": 0.5407344472765033,
"grad_norm": 101.1875,
"learning_rate": 5.102954341987466e-07,
"loss": 105.4083,
"step": 2685
},
{
"epoch": 0.5417414015544857,
"grad_norm": 103.5,
"learning_rate": 5.091763652641003e-07,
"loss": 105.16,
"step": 2690
},
{
"epoch": 0.5427483558324679,
"grad_norm": 104.3125,
"learning_rate": 5.080572963294538e-07,
"loss": 105.9961,
"step": 2695
},
{
"epoch": 0.5437553101104503,
"grad_norm": 101.5625,
"learning_rate": 5.069382273948075e-07,
"loss": 103.5584,
"step": 2700
},
{
"epoch": 0.5447622643884326,
"grad_norm": 103.0,
"learning_rate": 5.058191584601611e-07,
"loss": 105.5014,
"step": 2705
},
{
"epoch": 0.5457692186664149,
"grad_norm": 103.75,
"learning_rate": 5.047000895255148e-07,
"loss": 106.6602,
"step": 2710
},
{
"epoch": 0.5467761729443973,
"grad_norm": 105.125,
"learning_rate": 5.035810205908684e-07,
"loss": 107.0629,
"step": 2715
},
{
"epoch": 0.5477831272223795,
"grad_norm": 103.75,
"learning_rate": 5.024619516562219e-07,
"loss": 103.3805,
"step": 2720
},
{
"epoch": 0.5487900815003619,
"grad_norm": 102.5,
"learning_rate": 5.013428827215757e-07,
"loss": 105.2457,
"step": 2725
},
{
"epoch": 0.5497970357783442,
"grad_norm": 100.8125,
"learning_rate": 5.002238137869293e-07,
"loss": 104.6135,
"step": 2730
},
{
"epoch": 0.5508039900563265,
"grad_norm": 103.0625,
"learning_rate": 4.991047448522829e-07,
"loss": 104.9963,
"step": 2735
},
{
"epoch": 0.5518109443343088,
"grad_norm": 100.4375,
"learning_rate": 4.979856759176365e-07,
"loss": 105.1985,
"step": 2740
},
{
"epoch": 0.5528178986122911,
"grad_norm": 101.5,
"learning_rate": 4.968666069829902e-07,
"loss": 106.8182,
"step": 2745
},
{
"epoch": 0.5538248528902735,
"grad_norm": 102.0,
"learning_rate": 4.957475380483437e-07,
"loss": 105.6893,
"step": 2750
},
{
"epoch": 0.5548318071682558,
"grad_norm": 101.625,
"learning_rate": 4.946284691136974e-07,
"loss": 104.0999,
"step": 2755
},
{
"epoch": 0.5558387614462381,
"grad_norm": 102.6875,
"learning_rate": 4.93509400179051e-07,
"loss": 106.6457,
"step": 2760
},
{
"epoch": 0.5568457157242204,
"grad_norm": 102.5625,
"learning_rate": 4.923903312444047e-07,
"loss": 105.8887,
"step": 2765
},
{
"epoch": 0.5578526700022027,
"grad_norm": 101.375,
"learning_rate": 4.912712623097583e-07,
"loss": 106.124,
"step": 2770
},
{
"epoch": 0.558859624280185,
"grad_norm": 104.125,
"learning_rate": 4.901521933751119e-07,
"loss": 105.8778,
"step": 2775
},
{
"epoch": 0.5598665785581673,
"grad_norm": 101.5625,
"learning_rate": 4.890331244404655e-07,
"loss": 104.9353,
"step": 2780
},
{
"epoch": 0.5608735328361497,
"grad_norm": 103.625,
"learning_rate": 4.879140555058191e-07,
"loss": 105.6354,
"step": 2785
},
{
"epoch": 0.561880487114132,
"grad_norm": 103.875,
"learning_rate": 4.867949865711727e-07,
"loss": 104.5165,
"step": 2790
},
{
"epoch": 0.5628874413921143,
"grad_norm": 106.4375,
"learning_rate": 4.856759176365264e-07,
"loss": 106.2399,
"step": 2795
},
{
"epoch": 0.5638943956700966,
"grad_norm": 104.25,
"learning_rate": 4.8455684870188e-07,
"loss": 105.9224,
"step": 2800
},
{
"epoch": 0.564901349948079,
"grad_norm": 102.75,
"learning_rate": 4.834377797672337e-07,
"loss": 105.3704,
"step": 2805
},
{
"epoch": 0.5659083042260612,
"grad_norm": 100.1875,
"learning_rate": 4.823187108325872e-07,
"loss": 104.1296,
"step": 2810
},
{
"epoch": 0.5669152585040436,
"grad_norm": 103.9375,
"learning_rate": 4.811996418979409e-07,
"loss": 104.9443,
"step": 2815
},
{
"epoch": 0.5679222127820258,
"grad_norm": 103.8125,
"learning_rate": 4.800805729632945e-07,
"loss": 105.7805,
"step": 2820
},
{
"epoch": 0.5689291670600082,
"grad_norm": 101.0625,
"learning_rate": 4.789615040286481e-07,
"loss": 104.9703,
"step": 2825
},
{
"epoch": 0.5699361213379905,
"grad_norm": 102.25,
"learning_rate": 4.778424350940018e-07,
"loss": 105.2172,
"step": 2830
},
{
"epoch": 0.5709430756159728,
"grad_norm": 102.875,
"learning_rate": 4.7672336615935536e-07,
"loss": 105.6472,
"step": 2835
},
{
"epoch": 0.5719500298939552,
"grad_norm": 100.25,
"learning_rate": 4.75604297224709e-07,
"loss": 104.7867,
"step": 2840
},
{
"epoch": 0.5729569841719374,
"grad_norm": 107.125,
"learning_rate": 4.7448522829006263e-07,
"loss": 105.9132,
"step": 2845
},
{
"epoch": 0.5739639384499198,
"grad_norm": 102.375,
"learning_rate": 4.733661593554163e-07,
"loss": 104.1496,
"step": 2850
},
{
"epoch": 0.574970892727902,
"grad_norm": 101.6875,
"learning_rate": 4.7224709042076985e-07,
"loss": 103.5616,
"step": 2855
},
{
"epoch": 0.5759778470058844,
"grad_norm": 103.6875,
"learning_rate": 4.711280214861235e-07,
"loss": 104.5214,
"step": 2860
},
{
"epoch": 0.5769848012838668,
"grad_norm": 101.6875,
"learning_rate": 4.700089525514772e-07,
"loss": 106.2387,
"step": 2865
},
{
"epoch": 0.577991755561849,
"grad_norm": 100.75,
"learning_rate": 4.688898836168308e-07,
"loss": 105.0707,
"step": 2870
},
{
"epoch": 0.5789987098398314,
"grad_norm": 105.5,
"learning_rate": 4.677708146821844e-07,
"loss": 105.1878,
"step": 2875
},
{
"epoch": 0.5800056641178136,
"grad_norm": 103.375,
"learning_rate": 4.66651745747538e-07,
"loss": 104.6173,
"step": 2880
},
{
"epoch": 0.581012618395796,
"grad_norm": 103.75,
"learning_rate": 4.655326768128917e-07,
"loss": 103.618,
"step": 2885
},
{
"epoch": 0.5820195726737782,
"grad_norm": 104.3125,
"learning_rate": 4.644136078782453e-07,
"loss": 103.7888,
"step": 2890
},
{
"epoch": 0.5830265269517606,
"grad_norm": 102.0,
"learning_rate": 4.632945389435989e-07,
"loss": 103.5098,
"step": 2895
},
{
"epoch": 0.5840334812297429,
"grad_norm": 103.75,
"learning_rate": 4.621754700089525e-07,
"loss": 104.3576,
"step": 2900
},
{
"epoch": 0.5850404355077252,
"grad_norm": 101.0,
"learning_rate": 4.6105640107430617e-07,
"loss": 103.4598,
"step": 2905
},
{
"epoch": 0.5860473897857076,
"grad_norm": 104.9375,
"learning_rate": 4.599373321396598e-07,
"loss": 102.7616,
"step": 2910
},
{
"epoch": 0.5870543440636898,
"grad_norm": 104.9375,
"learning_rate": 4.5881826320501345e-07,
"loss": 104.4935,
"step": 2915
},
{
"epoch": 0.5880612983416722,
"grad_norm": 102.5625,
"learning_rate": 4.57699194270367e-07,
"loss": 104.2107,
"step": 2920
},
{
"epoch": 0.5890682526196545,
"grad_norm": 104.25,
"learning_rate": 4.5658012533572067e-07,
"loss": 103.9403,
"step": 2925
},
{
"epoch": 0.5900752068976368,
"grad_norm": 100.4375,
"learning_rate": 4.554610564010743e-07,
"loss": 104.4585,
"step": 2930
},
{
"epoch": 0.5910821611756191,
"grad_norm": 102.8125,
"learning_rate": 4.5434198746642794e-07,
"loss": 103.71,
"step": 2935
},
{
"epoch": 0.5920891154536014,
"grad_norm": 104.125,
"learning_rate": 4.532229185317815e-07,
"loss": 102.3344,
"step": 2940
},
{
"epoch": 0.5930960697315838,
"grad_norm": 104.0625,
"learning_rate": 4.5210384959713516e-07,
"loss": 102.8993,
"step": 2945
},
{
"epoch": 0.594103024009566,
"grad_norm": 104.5,
"learning_rate": 4.509847806624888e-07,
"loss": 103.8401,
"step": 2950
},
{
"epoch": 0.5951099782875484,
"grad_norm": 101.875,
"learning_rate": 4.4986571172784244e-07,
"loss": 103.9012,
"step": 2955
},
{
"epoch": 0.5961169325655307,
"grad_norm": 100.75,
"learning_rate": 4.48746642793196e-07,
"loss": 102.7268,
"step": 2960
},
{
"epoch": 0.597123886843513,
"grad_norm": 105.9375,
"learning_rate": 4.4762757385854966e-07,
"loss": 104.2173,
"step": 2965
},
{
"epoch": 0.5981308411214953,
"grad_norm": 102.375,
"learning_rate": 4.4650850492390327e-07,
"loss": 105.3016,
"step": 2970
},
{
"epoch": 0.5991377953994776,
"grad_norm": 100.1875,
"learning_rate": 4.4538943598925693e-07,
"loss": 103.7173,
"step": 2975
},
{
"epoch": 0.60014474967746,
"grad_norm": 101.75,
"learning_rate": 4.442703670546105e-07,
"loss": 105.0442,
"step": 2980
},
{
"epoch": 0.6011517039554423,
"grad_norm": 101.875,
"learning_rate": 4.4315129811996416e-07,
"loss": 106.4008,
"step": 2985
},
{
"epoch": 0.6021586582334246,
"grad_norm": 102.625,
"learning_rate": 4.420322291853178e-07,
"loss": 102.636,
"step": 2990
},
{
"epoch": 0.6031656125114069,
"grad_norm": 103.3125,
"learning_rate": 4.4091316025067143e-07,
"loss": 103.5773,
"step": 2995
},
{
"epoch": 0.6041725667893892,
"grad_norm": 106.375,
"learning_rate": 4.397940913160251e-07,
"loss": 103.8032,
"step": 3000
},
{
"epoch": 0.6041725667893892,
"eval_loss": 3.2479496002197266,
"eval_runtime": 241.8144,
"eval_samples_per_second": 1106.688,
"eval_steps_per_second": 34.584,
"step": 3000
},
{
"epoch": 0.6051795210673715,
"grad_norm": 103.9375,
"learning_rate": 4.3867502238137865e-07,
"loss": 104.9436,
"step": 3005
},
{
"epoch": 0.6061864753453539,
"grad_norm": 101.75,
"learning_rate": 4.375559534467323e-07,
"loss": 103.6502,
"step": 3010
},
{
"epoch": 0.6071934296233361,
"grad_norm": 102.5,
"learning_rate": 4.364368845120859e-07,
"loss": 103.4684,
"step": 3015
},
{
"epoch": 0.6082003839013185,
"grad_norm": 104.6875,
"learning_rate": 4.353178155774396e-07,
"loss": 103.867,
"step": 3020
},
{
"epoch": 0.6092073381793008,
"grad_norm": 103.875,
"learning_rate": 4.3419874664279315e-07,
"loss": 103.4931,
"step": 3025
},
{
"epoch": 0.6102142924572831,
"grad_norm": 104.5625,
"learning_rate": 4.330796777081468e-07,
"loss": 103.9812,
"step": 3030
},
{
"epoch": 0.6112212467352655,
"grad_norm": 102.25,
"learning_rate": 4.319606087735004e-07,
"loss": 104.1066,
"step": 3035
},
{
"epoch": 0.6122282010132477,
"grad_norm": 102.375,
"learning_rate": 4.308415398388541e-07,
"loss": 103.3831,
"step": 3040
},
{
"epoch": 0.6132351552912301,
"grad_norm": 103.1875,
"learning_rate": 4.2972247090420764e-07,
"loss": 103.5499,
"step": 3045
},
{
"epoch": 0.6142421095692123,
"grad_norm": 106.4375,
"learning_rate": 4.286034019695613e-07,
"loss": 101.8933,
"step": 3050
},
{
"epoch": 0.6152490638471947,
"grad_norm": 103.0,
"learning_rate": 4.274843330349149e-07,
"loss": 102.0398,
"step": 3055
},
{
"epoch": 0.616256018125177,
"grad_norm": 104.3125,
"learning_rate": 4.263652641002686e-07,
"loss": 102.9206,
"step": 3060
},
{
"epoch": 0.6172629724031593,
"grad_norm": 101.5625,
"learning_rate": 4.2524619516562214e-07,
"loss": 102.4662,
"step": 3065
},
{
"epoch": 0.6182699266811417,
"grad_norm": 101.75,
"learning_rate": 4.241271262309758e-07,
"loss": 103.3072,
"step": 3070
},
{
"epoch": 0.6192768809591239,
"grad_norm": 104.375,
"learning_rate": 4.230080572963294e-07,
"loss": 102.1457,
"step": 3075
},
{
"epoch": 0.6202838352371063,
"grad_norm": 104.1875,
"learning_rate": 4.218889883616831e-07,
"loss": 103.5531,
"step": 3080
},
{
"epoch": 0.6212907895150885,
"grad_norm": 104.125,
"learning_rate": 4.207699194270367e-07,
"loss": 104.2774,
"step": 3085
},
{
"epoch": 0.6222977437930709,
"grad_norm": 105.5,
"learning_rate": 4.196508504923903e-07,
"loss": 102.702,
"step": 3090
},
{
"epoch": 0.6233046980710533,
"grad_norm": 103.25,
"learning_rate": 4.185317815577439e-07,
"loss": 103.7312,
"step": 3095
},
{
"epoch": 0.6243116523490355,
"grad_norm": 104.6875,
"learning_rate": 4.174127126230976e-07,
"loss": 102.1448,
"step": 3100
},
{
"epoch": 0.6253186066270179,
"grad_norm": 104.0625,
"learning_rate": 4.1629364368845124e-07,
"loss": 103.0693,
"step": 3105
},
{
"epoch": 0.6263255609050001,
"grad_norm": 99.25,
"learning_rate": 4.151745747538048e-07,
"loss": 103.7934,
"step": 3110
},
{
"epoch": 0.6273325151829825,
"grad_norm": 104.75,
"learning_rate": 4.1405550581915846e-07,
"loss": 102.7301,
"step": 3115
},
{
"epoch": 0.6283394694609648,
"grad_norm": 105.125,
"learning_rate": 4.1293643688451207e-07,
"loss": 102.1029,
"step": 3120
},
{
"epoch": 0.6293464237389471,
"grad_norm": 103.375,
"learning_rate": 4.1181736794986573e-07,
"loss": 101.7747,
"step": 3125
},
{
"epoch": 0.6303533780169294,
"grad_norm": 107.75,
"learning_rate": 4.106982990152193e-07,
"loss": 101.7472,
"step": 3130
},
{
"epoch": 0.6313603322949117,
"grad_norm": 102.75,
"learning_rate": 4.0957923008057296e-07,
"loss": 101.3941,
"step": 3135
},
{
"epoch": 0.6323672865728941,
"grad_norm": 107.9375,
"learning_rate": 4.0846016114592657e-07,
"loss": 102.634,
"step": 3140
},
{
"epoch": 0.6333742408508763,
"grad_norm": 106.5625,
"learning_rate": 4.0734109221128023e-07,
"loss": 103.399,
"step": 3145
},
{
"epoch": 0.6343811951288587,
"grad_norm": 105.5,
"learning_rate": 4.062220232766338e-07,
"loss": 101.831,
"step": 3150
},
{
"epoch": 0.635388149406841,
"grad_norm": 104.75,
"learning_rate": 4.0510295434198745e-07,
"loss": 101.9133,
"step": 3155
},
{
"epoch": 0.6363951036848233,
"grad_norm": 106.5,
"learning_rate": 4.0398388540734106e-07,
"loss": 103.1237,
"step": 3160
},
{
"epoch": 0.6374020579628056,
"grad_norm": 106.5,
"learning_rate": 4.028648164726947e-07,
"loss": 103.293,
"step": 3165
},
{
"epoch": 0.6384090122407879,
"grad_norm": 100.125,
"learning_rate": 4.0174574753804834e-07,
"loss": 102.386,
"step": 3170
},
{
"epoch": 0.6394159665187703,
"grad_norm": 105.5,
"learning_rate": 4.0062667860340195e-07,
"loss": 101.1026,
"step": 3175
},
{
"epoch": 0.6404229207967526,
"grad_norm": 105.0625,
"learning_rate": 3.9950760966875556e-07,
"loss": 103.0412,
"step": 3180
},
{
"epoch": 0.6414298750747349,
"grad_norm": 102.0625,
"learning_rate": 3.983885407341092e-07,
"loss": 102.8331,
"step": 3185
},
{
"epoch": 0.6424368293527172,
"grad_norm": 102.5,
"learning_rate": 3.9726947179946283e-07,
"loss": 102.7278,
"step": 3190
},
{
"epoch": 0.6434437836306995,
"grad_norm": 101.375,
"learning_rate": 3.9615040286481644e-07,
"loss": 100.927,
"step": 3195
},
{
"epoch": 0.6444507379086818,
"grad_norm": 102.625,
"learning_rate": 3.9503133393017005e-07,
"loss": 101.5179,
"step": 3200
},
{
"epoch": 0.6454576921866642,
"grad_norm": 103.875,
"learning_rate": 3.939122649955237e-07,
"loss": 101.7856,
"step": 3205
},
{
"epoch": 0.6464646464646465,
"grad_norm": 103.8125,
"learning_rate": 3.9279319606087733e-07,
"loss": 101.8363,
"step": 3210
},
{
"epoch": 0.6474716007426288,
"grad_norm": 105.25,
"learning_rate": 3.9167412712623094e-07,
"loss": 103.548,
"step": 3215
},
{
"epoch": 0.6484785550206111,
"grad_norm": 104.6875,
"learning_rate": 3.9055505819158455e-07,
"loss": 103.2727,
"step": 3220
},
{
"epoch": 0.6494855092985934,
"grad_norm": 103.25,
"learning_rate": 3.894359892569382e-07,
"loss": 102.914,
"step": 3225
},
{
"epoch": 0.6504924635765758,
"grad_norm": 105.0,
"learning_rate": 3.883169203222919e-07,
"loss": 101.5854,
"step": 3230
},
{
"epoch": 0.651499417854558,
"grad_norm": 105.125,
"learning_rate": 3.8719785138764544e-07,
"loss": 101.9887,
"step": 3235
},
{
"epoch": 0.6525063721325404,
"grad_norm": 103.4375,
"learning_rate": 3.860787824529991e-07,
"loss": 101.3439,
"step": 3240
},
{
"epoch": 0.6535133264105226,
"grad_norm": 103.0625,
"learning_rate": 3.849597135183527e-07,
"loss": 103.7408,
"step": 3245
},
{
"epoch": 0.654520280688505,
"grad_norm": 102.5,
"learning_rate": 3.8384064458370637e-07,
"loss": 101.1225,
"step": 3250
},
{
"epoch": 0.6555272349664873,
"grad_norm": 108.6875,
"learning_rate": 3.8272157564906e-07,
"loss": 101.483,
"step": 3255
},
{
"epoch": 0.6565341892444696,
"grad_norm": 104.875,
"learning_rate": 3.816025067144136e-07,
"loss": 101.052,
"step": 3260
},
{
"epoch": 0.657541143522452,
"grad_norm": 101.5625,
"learning_rate": 3.804834377797672e-07,
"loss": 101.2586,
"step": 3265
},
{
"epoch": 0.6585480978004342,
"grad_norm": 106.25,
"learning_rate": 3.7936436884512087e-07,
"loss": 102.4246,
"step": 3270
},
{
"epoch": 0.6595550520784166,
"grad_norm": 104.75,
"learning_rate": 3.782452999104745e-07,
"loss": 102.6763,
"step": 3275
},
{
"epoch": 0.6605620063563988,
"grad_norm": 104.3125,
"learning_rate": 3.771262309758281e-07,
"loss": 102.0128,
"step": 3280
},
{
"epoch": 0.6615689606343812,
"grad_norm": 103.1875,
"learning_rate": 3.760071620411817e-07,
"loss": 102.5605,
"step": 3285
},
{
"epoch": 0.6625759149123636,
"grad_norm": 103.375,
"learning_rate": 3.7488809310653537e-07,
"loss": 102.7485,
"step": 3290
},
{
"epoch": 0.6635828691903458,
"grad_norm": 105.5625,
"learning_rate": 3.73769024171889e-07,
"loss": 101.5186,
"step": 3295
},
{
"epoch": 0.6645898234683282,
"grad_norm": 102.125,
"learning_rate": 3.726499552372426e-07,
"loss": 101.6677,
"step": 3300
},
{
"epoch": 0.6655967777463104,
"grad_norm": 101.4375,
"learning_rate": 3.715308863025962e-07,
"loss": 101.6941,
"step": 3305
},
{
"epoch": 0.6666037320242928,
"grad_norm": 100.25,
"learning_rate": 3.7041181736794986e-07,
"loss": 100.565,
"step": 3310
},
{
"epoch": 0.667610686302275,
"grad_norm": 103.1875,
"learning_rate": 3.6929274843330347e-07,
"loss": 102.0876,
"step": 3315
},
{
"epoch": 0.6686176405802574,
"grad_norm": 102.0,
"learning_rate": 3.681736794986571e-07,
"loss": 101.5779,
"step": 3320
},
{
"epoch": 0.6696245948582397,
"grad_norm": 103.6875,
"learning_rate": 3.670546105640107e-07,
"loss": 100.2148,
"step": 3325
},
{
"epoch": 0.670631549136222,
"grad_norm": 104.125,
"learning_rate": 3.6593554162936436e-07,
"loss": 103.0346,
"step": 3330
},
{
"epoch": 0.6716385034142044,
"grad_norm": 104.1875,
"learning_rate": 3.6481647269471797e-07,
"loss": 100.2821,
"step": 3335
},
{
"epoch": 0.6726454576921866,
"grad_norm": 104.0,
"learning_rate": 3.6369740376007163e-07,
"loss": 102.4122,
"step": 3340
},
{
"epoch": 0.673652411970169,
"grad_norm": 104.5,
"learning_rate": 3.625783348254252e-07,
"loss": 101.9806,
"step": 3345
},
{
"epoch": 0.6746593662481513,
"grad_norm": 106.5,
"learning_rate": 3.6145926589077885e-07,
"loss": 100.5335,
"step": 3350
},
{
"epoch": 0.6756663205261336,
"grad_norm": 106.0,
"learning_rate": 3.603401969561325e-07,
"loss": 101.7242,
"step": 3355
},
{
"epoch": 0.6766732748041159,
"grad_norm": 102.6875,
"learning_rate": 3.5922112802148613e-07,
"loss": 101.2396,
"step": 3360
},
{
"epoch": 0.6776802290820982,
"grad_norm": 108.5625,
"learning_rate": 3.5810205908683974e-07,
"loss": 100.8422,
"step": 3365
},
{
"epoch": 0.6786871833600806,
"grad_norm": 103.25,
"learning_rate": 3.5698299015219335e-07,
"loss": 101.5823,
"step": 3370
},
{
"epoch": 0.6796941376380629,
"grad_norm": 102.875,
"learning_rate": 3.55863921217547e-07,
"loss": 101.6988,
"step": 3375
},
{
"epoch": 0.6807010919160452,
"grad_norm": 102.375,
"learning_rate": 3.547448522829006e-07,
"loss": 100.5021,
"step": 3380
},
{
"epoch": 0.6817080461940275,
"grad_norm": 106.4375,
"learning_rate": 3.5362578334825423e-07,
"loss": 100.3675,
"step": 3385
},
{
"epoch": 0.6827150004720098,
"grad_norm": 101.875,
"learning_rate": 3.5250671441360785e-07,
"loss": 101.8122,
"step": 3390
},
{
"epoch": 0.6837219547499921,
"grad_norm": 103.375,
"learning_rate": 3.513876454789615e-07,
"loss": 100.3997,
"step": 3395
},
{
"epoch": 0.6847289090279745,
"grad_norm": 104.6875,
"learning_rate": 3.502685765443151e-07,
"loss": 100.5548,
"step": 3400
},
{
"epoch": 0.6857358633059568,
"grad_norm": 106.0625,
"learning_rate": 3.4914950760966873e-07,
"loss": 102.6518,
"step": 3405
},
{
"epoch": 0.6867428175839391,
"grad_norm": 107.375,
"learning_rate": 3.4803043867502234e-07,
"loss": 101.1174,
"step": 3410
},
{
"epoch": 0.6877497718619214,
"grad_norm": 103.25,
"learning_rate": 3.46911369740376e-07,
"loss": 101.1875,
"step": 3415
},
{
"epoch": 0.6887567261399037,
"grad_norm": 105.9375,
"learning_rate": 3.457923008057296e-07,
"loss": 101.0168,
"step": 3420
},
{
"epoch": 0.689763680417886,
"grad_norm": 104.25,
"learning_rate": 3.446732318710833e-07,
"loss": 100.9963,
"step": 3425
},
{
"epoch": 0.6907706346958683,
"grad_norm": 100.1875,
"learning_rate": 3.4355416293643684e-07,
"loss": 99.1402,
"step": 3430
},
{
"epoch": 0.6917775889738507,
"grad_norm": 104.625,
"learning_rate": 3.424350940017905e-07,
"loss": 100.4257,
"step": 3435
},
{
"epoch": 0.6927845432518329,
"grad_norm": 102.0,
"learning_rate": 3.413160250671441e-07,
"loss": 99.5312,
"step": 3440
},
{
"epoch": 0.6937914975298153,
"grad_norm": 105.8125,
"learning_rate": 3.401969561324978e-07,
"loss": 101.3088,
"step": 3445
},
{
"epoch": 0.6947984518077976,
"grad_norm": 106.875,
"learning_rate": 3.3907788719785133e-07,
"loss": 101.2592,
"step": 3450
},
{
"epoch": 0.6958054060857799,
"grad_norm": 106.4375,
"learning_rate": 3.37958818263205e-07,
"loss": 99.822,
"step": 3455
},
{
"epoch": 0.6968123603637623,
"grad_norm": 104.3125,
"learning_rate": 3.368397493285586e-07,
"loss": 102.2747,
"step": 3460
},
{
"epoch": 0.6978193146417445,
"grad_norm": 104.5625,
"learning_rate": 3.3572068039391227e-07,
"loss": 100.0264,
"step": 3465
},
{
"epoch": 0.6988262689197269,
"grad_norm": 102.0625,
"learning_rate": 3.3460161145926583e-07,
"loss": 99.6872,
"step": 3470
},
{
"epoch": 0.6998332231977091,
"grad_norm": 102.8125,
"learning_rate": 3.334825425246195e-07,
"loss": 99.866,
"step": 3475
},
{
"epoch": 0.7008401774756915,
"grad_norm": 100.875,
"learning_rate": 3.3236347358997316e-07,
"loss": 99.9147,
"step": 3480
},
{
"epoch": 0.7018471317536739,
"grad_norm": 102.5,
"learning_rate": 3.3124440465532677e-07,
"loss": 101.803,
"step": 3485
},
{
"epoch": 0.7028540860316561,
"grad_norm": 107.0625,
"learning_rate": 3.301253357206804e-07,
"loss": 101.0587,
"step": 3490
},
{
"epoch": 0.7038610403096385,
"grad_norm": 107.0625,
"learning_rate": 3.29006266786034e-07,
"loss": 102.4405,
"step": 3495
},
{
"epoch": 0.7048679945876207,
"grad_norm": 101.125,
"learning_rate": 3.2788719785138765e-07,
"loss": 101.6015,
"step": 3500
},
{
"epoch": 0.7048679945876207,
"eval_loss": 3.153071880340576,
"eval_runtime": 239.9872,
"eval_samples_per_second": 1115.114,
"eval_steps_per_second": 34.848,
"step": 3500
},
{
"epoch": 0.7058749488656031,
"grad_norm": 101.375,
"learning_rate": 3.2676812891674126e-07,
"loss": 100.2117,
"step": 3505
},
{
"epoch": 0.7068819031435853,
"grad_norm": 101.5625,
"learning_rate": 3.2564905998209493e-07,
"loss": 100.942,
"step": 3510
},
{
"epoch": 0.7078888574215677,
"grad_norm": 105.0,
"learning_rate": 3.245299910474485e-07,
"loss": 101.196,
"step": 3515
},
{
"epoch": 0.7088958116995501,
"grad_norm": 106.5,
"learning_rate": 3.2341092211280215e-07,
"loss": 100.3638,
"step": 3520
},
{
"epoch": 0.7099027659775323,
"grad_norm": 102.0625,
"learning_rate": 3.2229185317815576e-07,
"loss": 101.518,
"step": 3525
},
{
"epoch": 0.7109097202555147,
"grad_norm": 107.875,
"learning_rate": 3.211727842435094e-07,
"loss": 99.2336,
"step": 3530
},
{
"epoch": 0.7119166745334969,
"grad_norm": 102.625,
"learning_rate": 3.20053715308863e-07,
"loss": 101.3888,
"step": 3535
},
{
"epoch": 0.7129236288114793,
"grad_norm": 106.9375,
"learning_rate": 3.1893464637421664e-07,
"loss": 98.6846,
"step": 3540
},
{
"epoch": 0.7139305830894616,
"grad_norm": 104.6875,
"learning_rate": 3.1781557743957026e-07,
"loss": 101.1002,
"step": 3545
},
{
"epoch": 0.7149375373674439,
"grad_norm": 104.9375,
"learning_rate": 3.166965085049239e-07,
"loss": 100.9625,
"step": 3550
},
{
"epoch": 0.7159444916454262,
"grad_norm": 104.8125,
"learning_rate": 3.155774395702775e-07,
"loss": 101.6369,
"step": 3555
},
{
"epoch": 0.7169514459234085,
"grad_norm": 101.0,
"learning_rate": 3.1445837063563114e-07,
"loss": 101.1003,
"step": 3560
},
{
"epoch": 0.7179584002013909,
"grad_norm": 106.5,
"learning_rate": 3.1333930170098475e-07,
"loss": 98.8858,
"step": 3565
},
{
"epoch": 0.7189653544793732,
"grad_norm": 106.1875,
"learning_rate": 3.122202327663384e-07,
"loss": 99.8127,
"step": 3570
},
{
"epoch": 0.7199723087573555,
"grad_norm": 106.0625,
"learning_rate": 3.1110116383169197e-07,
"loss": 99.7345,
"step": 3575
},
{
"epoch": 0.7209792630353378,
"grad_norm": 107.875,
"learning_rate": 3.0998209489704564e-07,
"loss": 98.8684,
"step": 3580
},
{
"epoch": 0.7219862173133201,
"grad_norm": 102.3125,
"learning_rate": 3.0886302596239925e-07,
"loss": 100.5252,
"step": 3585
},
{
"epoch": 0.7229931715913024,
"grad_norm": 102.25,
"learning_rate": 3.077439570277529e-07,
"loss": 99.1437,
"step": 3590
},
{
"epoch": 0.7240001258692847,
"grad_norm": 105.0625,
"learning_rate": 3.066248880931066e-07,
"loss": 99.6645,
"step": 3595
},
{
"epoch": 0.7250070801472671,
"grad_norm": 108.4375,
"learning_rate": 3.0550581915846013e-07,
"loss": 101.7029,
"step": 3600
},
{
"epoch": 0.7260140344252494,
"grad_norm": 102.3125,
"learning_rate": 3.043867502238138e-07,
"loss": 101.3023,
"step": 3605
},
{
"epoch": 0.7270209887032317,
"grad_norm": 103.0,
"learning_rate": 3.032676812891674e-07,
"loss": 100.7414,
"step": 3610
},
{
"epoch": 0.728027942981214,
"grad_norm": 106.125,
"learning_rate": 3.0214861235452107e-07,
"loss": 100.1441,
"step": 3615
},
{
"epoch": 0.7290348972591963,
"grad_norm": 106.5625,
"learning_rate": 3.0102954341987463e-07,
"loss": 100.4396,
"step": 3620
},
{
"epoch": 0.7300418515371786,
"grad_norm": 103.625,
"learning_rate": 2.999104744852283e-07,
"loss": 100.7662,
"step": 3625
},
{
"epoch": 0.731048805815161,
"grad_norm": 102.8125,
"learning_rate": 2.987914055505819e-07,
"loss": 99.562,
"step": 3630
},
{
"epoch": 0.7320557600931433,
"grad_norm": 100.0,
"learning_rate": 2.9767233661593557e-07,
"loss": 101.1267,
"step": 3635
},
{
"epoch": 0.7330627143711256,
"grad_norm": 105.75,
"learning_rate": 2.965532676812891e-07,
"loss": 99.4763,
"step": 3640
},
{
"epoch": 0.7340696686491079,
"grad_norm": 107.5625,
"learning_rate": 2.954341987466428e-07,
"loss": 102.3207,
"step": 3645
},
{
"epoch": 0.7350766229270902,
"grad_norm": 106.3125,
"learning_rate": 2.943151298119964e-07,
"loss": 99.7648,
"step": 3650
},
{
"epoch": 0.7360835772050726,
"grad_norm": 105.375,
"learning_rate": 2.9319606087735006e-07,
"loss": 99.4035,
"step": 3655
},
{
"epoch": 0.7370905314830548,
"grad_norm": 103.1875,
"learning_rate": 2.920769919427036e-07,
"loss": 100.6968,
"step": 3660
},
{
"epoch": 0.7380974857610372,
"grad_norm": 108.0625,
"learning_rate": 2.909579230080573e-07,
"loss": 100.2772,
"step": 3665
},
{
"epoch": 0.7391044400390194,
"grad_norm": 110.0,
"learning_rate": 2.898388540734109e-07,
"loss": 99.7772,
"step": 3670
},
{
"epoch": 0.7401113943170018,
"grad_norm": 104.0625,
"learning_rate": 2.8871978513876456e-07,
"loss": 99.2794,
"step": 3675
},
{
"epoch": 0.7411183485949842,
"grad_norm": 106.125,
"learning_rate": 2.8760071620411817e-07,
"loss": 100.0197,
"step": 3680
},
{
"epoch": 0.7421253028729664,
"grad_norm": 104.6875,
"learning_rate": 2.864816472694718e-07,
"loss": 99.3644,
"step": 3685
},
{
"epoch": 0.7431322571509488,
"grad_norm": 105.625,
"learning_rate": 2.853625783348254e-07,
"loss": 101.4238,
"step": 3690
},
{
"epoch": 0.744139211428931,
"grad_norm": 105.9375,
"learning_rate": 2.8424350940017905e-07,
"loss": 98.9052,
"step": 3695
},
{
"epoch": 0.7451461657069134,
"grad_norm": 110.625,
"learning_rate": 2.8312444046553267e-07,
"loss": 99.8988,
"step": 3700
},
{
"epoch": 0.7461531199848956,
"grad_norm": 107.25,
"learning_rate": 2.820053715308863e-07,
"loss": 98.9521,
"step": 3705
},
{
"epoch": 0.747160074262878,
"grad_norm": 107.25,
"learning_rate": 2.808863025962399e-07,
"loss": 99.7661,
"step": 3710
},
{
"epoch": 0.7481670285408604,
"grad_norm": 107.9375,
"learning_rate": 2.7976723366159355e-07,
"loss": 98.4117,
"step": 3715
},
{
"epoch": 0.7491739828188426,
"grad_norm": 106.0625,
"learning_rate": 2.786481647269472e-07,
"loss": 100.2589,
"step": 3720
},
{
"epoch": 0.750180937096825,
"grad_norm": 102.5625,
"learning_rate": 2.7752909579230077e-07,
"loss": 98.1626,
"step": 3725
},
{
"epoch": 0.7511878913748072,
"grad_norm": 107.5,
"learning_rate": 2.7641002685765444e-07,
"loss": 99.1188,
"step": 3730
},
{
"epoch": 0.7521948456527896,
"grad_norm": 106.375,
"learning_rate": 2.7529095792300805e-07,
"loss": 98.337,
"step": 3735
},
{
"epoch": 0.7532017999307719,
"grad_norm": 106.8125,
"learning_rate": 2.741718889883617e-07,
"loss": 98.323,
"step": 3740
},
{
"epoch": 0.7542087542087542,
"grad_norm": 106.0625,
"learning_rate": 2.7305282005371527e-07,
"loss": 99.4523,
"step": 3745
},
{
"epoch": 0.7552157084867365,
"grad_norm": 100.0,
"learning_rate": 2.7193375111906893e-07,
"loss": 100.4829,
"step": 3750
},
{
"epoch": 0.7562226627647188,
"grad_norm": 104.75,
"learning_rate": 2.7081468218442254e-07,
"loss": 100.2488,
"step": 3755
},
{
"epoch": 0.7572296170427012,
"grad_norm": 103.875,
"learning_rate": 2.696956132497762e-07,
"loss": 100.597,
"step": 3760
},
{
"epoch": 0.7582365713206835,
"grad_norm": 102.6875,
"learning_rate": 2.685765443151298e-07,
"loss": 100.5632,
"step": 3765
},
{
"epoch": 0.7592435255986658,
"grad_norm": 106.5625,
"learning_rate": 2.6745747538048343e-07,
"loss": 99.1815,
"step": 3770
},
{
"epoch": 0.7602504798766481,
"grad_norm": 108.875,
"learning_rate": 2.6633840644583704e-07,
"loss": 99.6381,
"step": 3775
},
{
"epoch": 0.7612574341546304,
"grad_norm": 104.125,
"learning_rate": 2.652193375111907e-07,
"loss": 101.5287,
"step": 3780
},
{
"epoch": 0.7622643884326127,
"grad_norm": 106.4375,
"learning_rate": 2.641002685765443e-07,
"loss": 99.129,
"step": 3785
},
{
"epoch": 0.763271342710595,
"grad_norm": 105.4375,
"learning_rate": 2.629811996418979e-07,
"loss": 98.0637,
"step": 3790
},
{
"epoch": 0.7642782969885774,
"grad_norm": 105.0625,
"learning_rate": 2.6186213070725153e-07,
"loss": 99.2074,
"step": 3795
},
{
"epoch": 0.7652852512665597,
"grad_norm": 108.6875,
"learning_rate": 2.607430617726052e-07,
"loss": 100.7883,
"step": 3800
},
{
"epoch": 0.766292205544542,
"grad_norm": 103.5625,
"learning_rate": 2.596239928379588e-07,
"loss": 100.375,
"step": 3805
},
{
"epoch": 0.7672991598225243,
"grad_norm": 102.875,
"learning_rate": 2.585049239033124e-07,
"loss": 98.6215,
"step": 3810
},
{
"epoch": 0.7683061141005066,
"grad_norm": 104.125,
"learning_rate": 2.5738585496866603e-07,
"loss": 98.8108,
"step": 3815
},
{
"epoch": 0.7693130683784889,
"grad_norm": 103.9375,
"learning_rate": 2.562667860340197e-07,
"loss": 99.4923,
"step": 3820
},
{
"epoch": 0.7703200226564713,
"grad_norm": 104.8125,
"learning_rate": 2.551477170993733e-07,
"loss": 98.6085,
"step": 3825
},
{
"epoch": 0.7713269769344536,
"grad_norm": 102.6875,
"learning_rate": 2.540286481647269e-07,
"loss": 98.7937,
"step": 3830
},
{
"epoch": 0.7723339312124359,
"grad_norm": 105.3125,
"learning_rate": 2.5290957923008053e-07,
"loss": 99.0454,
"step": 3835
},
{
"epoch": 0.7733408854904182,
"grad_norm": 107.6875,
"learning_rate": 2.517905102954342e-07,
"loss": 101.8479,
"step": 3840
},
{
"epoch": 0.7743478397684005,
"grad_norm": 103.6875,
"learning_rate": 2.5067144136078785e-07,
"loss": 99.2844,
"step": 3845
},
{
"epoch": 0.7753547940463829,
"grad_norm": 104.6875,
"learning_rate": 2.4955237242614146e-07,
"loss": 98.6868,
"step": 3850
},
{
"epoch": 0.7763617483243651,
"grad_norm": 106.25,
"learning_rate": 2.484333034914951e-07,
"loss": 98.5338,
"step": 3855
},
{
"epoch": 0.7773687026023475,
"grad_norm": 107.625,
"learning_rate": 2.473142345568487e-07,
"loss": 101.3545,
"step": 3860
},
{
"epoch": 0.7783756568803297,
"grad_norm": 105.8125,
"learning_rate": 2.4619516562220235e-07,
"loss": 98.9249,
"step": 3865
},
{
"epoch": 0.7793826111583121,
"grad_norm": 106.1875,
"learning_rate": 2.4507609668755596e-07,
"loss": 97.8303,
"step": 3870
},
{
"epoch": 0.7803895654362945,
"grad_norm": 105.0,
"learning_rate": 2.4395702775290957e-07,
"loss": 99.5434,
"step": 3875
},
{
"epoch": 0.7813965197142767,
"grad_norm": 105.5625,
"learning_rate": 2.428379588182632e-07,
"loss": 98.4932,
"step": 3880
},
{
"epoch": 0.7824034739922591,
"grad_norm": 106.1875,
"learning_rate": 2.4171888988361685e-07,
"loss": 97.8763,
"step": 3885
},
{
"epoch": 0.7834104282702413,
"grad_norm": 105.5625,
"learning_rate": 2.4059982094897046e-07,
"loss": 98.7099,
"step": 3890
},
{
"epoch": 0.7844173825482237,
"grad_norm": 104.6875,
"learning_rate": 2.3948075201432407e-07,
"loss": 97.9921,
"step": 3895
},
{
"epoch": 0.7854243368262059,
"grad_norm": 102.1875,
"learning_rate": 2.3836168307967768e-07,
"loss": 97.4684,
"step": 3900
},
{
"epoch": 0.7864312911041883,
"grad_norm": 108.125,
"learning_rate": 2.3724261414503132e-07,
"loss": 98.4995,
"step": 3905
},
{
"epoch": 0.7874382453821707,
"grad_norm": 105.4375,
"learning_rate": 2.3612354521038493e-07,
"loss": 98.6665,
"step": 3910
},
{
"epoch": 0.7884451996601529,
"grad_norm": 106.0625,
"learning_rate": 2.350044762757386e-07,
"loss": 100.4663,
"step": 3915
},
{
"epoch": 0.7894521539381353,
"grad_norm": 105.1875,
"learning_rate": 2.338854073410922e-07,
"loss": 98.1109,
"step": 3920
},
{
"epoch": 0.7904591082161175,
"grad_norm": 106.1875,
"learning_rate": 2.3276633840644584e-07,
"loss": 98.0027,
"step": 3925
},
{
"epoch": 0.7914660624940999,
"grad_norm": 106.375,
"learning_rate": 2.3164726947179945e-07,
"loss": 97.9504,
"step": 3930
},
{
"epoch": 0.7924730167720822,
"grad_norm": 108.25,
"learning_rate": 2.3052820053715309e-07,
"loss": 98.3706,
"step": 3935
},
{
"epoch": 0.7934799710500645,
"grad_norm": 106.1875,
"learning_rate": 2.2940913160250672e-07,
"loss": 99.981,
"step": 3940
},
{
"epoch": 0.7944869253280469,
"grad_norm": 103.25,
"learning_rate": 2.2829006266786033e-07,
"loss": 99.1271,
"step": 3945
},
{
"epoch": 0.7954938796060291,
"grad_norm": 106.6875,
"learning_rate": 2.2717099373321397e-07,
"loss": 97.7095,
"step": 3950
},
{
"epoch": 0.7965008338840115,
"grad_norm": 102.4375,
"learning_rate": 2.2605192479856758e-07,
"loss": 97.0702,
"step": 3955
},
{
"epoch": 0.7975077881619937,
"grad_norm": 106.0625,
"learning_rate": 2.2493285586392122e-07,
"loss": 98.8119,
"step": 3960
},
{
"epoch": 0.7985147424399761,
"grad_norm": 105.3125,
"learning_rate": 2.2381378692927483e-07,
"loss": 99.1582,
"step": 3965
},
{
"epoch": 0.7995216967179584,
"grad_norm": 105.625,
"learning_rate": 2.2269471799462847e-07,
"loss": 97.7976,
"step": 3970
},
{
"epoch": 0.8005286509959407,
"grad_norm": 106.125,
"learning_rate": 2.2157564905998208e-07,
"loss": 98.5606,
"step": 3975
},
{
"epoch": 0.801535605273923,
"grad_norm": 101.375,
"learning_rate": 2.2045658012533572e-07,
"loss": 97.7195,
"step": 3980
},
{
"epoch": 0.8025425595519053,
"grad_norm": 106.875,
"learning_rate": 2.1933751119068933e-07,
"loss": 98.1909,
"step": 3985
},
{
"epoch": 0.8035495138298877,
"grad_norm": 103.5625,
"learning_rate": 2.1821844225604296e-07,
"loss": 97.943,
"step": 3990
},
{
"epoch": 0.80455646810787,
"grad_norm": 103.625,
"learning_rate": 2.1709937332139657e-07,
"loss": 97.9477,
"step": 3995
},
{
"epoch": 0.8055634223858523,
"grad_norm": 107.1875,
"learning_rate": 2.159803043867502e-07,
"loss": 98.1402,
"step": 4000
},
{
"epoch": 0.8055634223858523,
"eval_loss": 3.0711116790771484,
"eval_runtime": 241.4463,
"eval_samples_per_second": 1108.375,
"eval_steps_per_second": 34.637,
"step": 4000
},
{
"epoch": 0.8065703766638346,
"grad_norm": 103.75,
"learning_rate": 2.1486123545210382e-07,
"loss": 98.4772,
"step": 4005
},
{
"epoch": 0.8075773309418169,
"grad_norm": 103.8125,
"learning_rate": 2.1374216651745746e-07,
"loss": 98.346,
"step": 4010
},
{
"epoch": 0.8085842852197992,
"grad_norm": 110.0,
"learning_rate": 2.1262309758281107e-07,
"loss": 96.7867,
"step": 4015
},
{
"epoch": 0.8095912394977816,
"grad_norm": 106.5625,
"learning_rate": 2.115040286481647e-07,
"loss": 98.4165,
"step": 4020
},
{
"epoch": 0.8105981937757639,
"grad_norm": 104.375,
"learning_rate": 2.1038495971351834e-07,
"loss": 96.7289,
"step": 4025
},
{
"epoch": 0.8116051480537462,
"grad_norm": 105.625,
"learning_rate": 2.0926589077887196e-07,
"loss": 98.2023,
"step": 4030
},
{
"epoch": 0.8126121023317285,
"grad_norm": 104.75,
"learning_rate": 2.0814682184422562e-07,
"loss": 97.7403,
"step": 4035
},
{
"epoch": 0.8136190566097108,
"grad_norm": 100.375,
"learning_rate": 2.0702775290957923e-07,
"loss": 98.5274,
"step": 4040
},
{
"epoch": 0.8146260108876932,
"grad_norm": 103.875,
"learning_rate": 2.0590868397493287e-07,
"loss": 96.4971,
"step": 4045
},
{
"epoch": 0.8156329651656754,
"grad_norm": 106.125,
"learning_rate": 2.0478961504028648e-07,
"loss": 96.3516,
"step": 4050
},
{
"epoch": 0.8166399194436578,
"grad_norm": 102.125,
"learning_rate": 2.0367054610564011e-07,
"loss": 97.1612,
"step": 4055
},
{
"epoch": 0.81764687372164,
"grad_norm": 105.3125,
"learning_rate": 2.0255147717099373e-07,
"loss": 98.3791,
"step": 4060
},
{
"epoch": 0.8186538279996224,
"grad_norm": 107.75,
"learning_rate": 2.0143240823634736e-07,
"loss": 99.03,
"step": 4065
},
{
"epoch": 0.8196607822776047,
"grad_norm": 106.875,
"learning_rate": 2.0031333930170097e-07,
"loss": 97.8408,
"step": 4070
},
{
"epoch": 0.820667736555587,
"grad_norm": 104.5625,
"learning_rate": 1.991942703670546e-07,
"loss": 97.6941,
"step": 4075
},
{
"epoch": 0.8216746908335694,
"grad_norm": 106.1875,
"learning_rate": 1.9807520143240822e-07,
"loss": 97.8008,
"step": 4080
},
{
"epoch": 0.8226816451115516,
"grad_norm": 103.25,
"learning_rate": 1.9695613249776186e-07,
"loss": 96.2979,
"step": 4085
},
{
"epoch": 0.823688599389534,
"grad_norm": 104.6875,
"learning_rate": 1.9583706356311547e-07,
"loss": 99.1271,
"step": 4090
},
{
"epoch": 0.8246955536675162,
"grad_norm": 104.1875,
"learning_rate": 1.947179946284691e-07,
"loss": 97.5088,
"step": 4095
},
{
"epoch": 0.8257025079454986,
"grad_norm": 105.125,
"learning_rate": 1.9359892569382272e-07,
"loss": 98.5705,
"step": 4100
},
{
"epoch": 0.826709462223481,
"grad_norm": 107.125,
"learning_rate": 1.9247985675917635e-07,
"loss": 97.7035,
"step": 4105
},
{
"epoch": 0.8277164165014632,
"grad_norm": 104.0,
"learning_rate": 1.9136078782453e-07,
"loss": 97.2328,
"step": 4110
},
{
"epoch": 0.8287233707794456,
"grad_norm": 108.4375,
"learning_rate": 1.902417188898836e-07,
"loss": 99.1522,
"step": 4115
},
{
"epoch": 0.8297303250574278,
"grad_norm": 102.5,
"learning_rate": 1.8912264995523724e-07,
"loss": 100.5609,
"step": 4120
},
{
"epoch": 0.8307372793354102,
"grad_norm": 110.375,
"learning_rate": 1.8800358102059085e-07,
"loss": 96.9263,
"step": 4125
},
{
"epoch": 0.8317442336133924,
"grad_norm": 103.5,
"learning_rate": 1.868845120859445e-07,
"loss": 97.9833,
"step": 4130
},
{
"epoch": 0.8327511878913748,
"grad_norm": 104.5625,
"learning_rate": 1.857654431512981e-07,
"loss": 97.5248,
"step": 4135
},
{
"epoch": 0.8337581421693572,
"grad_norm": 102.3125,
"learning_rate": 1.8464637421665174e-07,
"loss": 96.6869,
"step": 4140
},
{
"epoch": 0.8347650964473394,
"grad_norm": 105.5625,
"learning_rate": 1.8352730528200535e-07,
"loss": 96.7125,
"step": 4145
},
{
"epoch": 0.8357720507253218,
"grad_norm": 106.9375,
"learning_rate": 1.8240823634735898e-07,
"loss": 98.5368,
"step": 4150
},
{
"epoch": 0.836779005003304,
"grad_norm": 107.375,
"learning_rate": 1.812891674127126e-07,
"loss": 97.8067,
"step": 4155
},
{
"epoch": 0.8377859592812864,
"grad_norm": 103.0625,
"learning_rate": 1.8017009847806626e-07,
"loss": 96.4311,
"step": 4160
},
{
"epoch": 0.8387929135592687,
"grad_norm": 105.375,
"learning_rate": 1.7905102954341987e-07,
"loss": 97.63,
"step": 4165
},
{
"epoch": 0.839799867837251,
"grad_norm": 104.3125,
"learning_rate": 1.779319606087735e-07,
"loss": 98.0877,
"step": 4170
},
{
"epoch": 0.8408068221152333,
"grad_norm": 104.8125,
"learning_rate": 1.7681289167412712e-07,
"loss": 96.8849,
"step": 4175
},
{
"epoch": 0.8418137763932156,
"grad_norm": 104.1875,
"learning_rate": 1.7569382273948075e-07,
"loss": 96.4972,
"step": 4180
},
{
"epoch": 0.842820730671198,
"grad_norm": 103.8125,
"learning_rate": 1.7457475380483437e-07,
"loss": 96.8067,
"step": 4185
},
{
"epoch": 0.8438276849491803,
"grad_norm": 104.875,
"learning_rate": 1.73455684870188e-07,
"loss": 97.2139,
"step": 4190
},
{
"epoch": 0.8448346392271626,
"grad_norm": 106.8125,
"learning_rate": 1.7233661593554164e-07,
"loss": 96.7182,
"step": 4195
},
{
"epoch": 0.8458415935051449,
"grad_norm": 108.6875,
"learning_rate": 1.7121754700089525e-07,
"loss": 97.3783,
"step": 4200
},
{
"epoch": 0.8468485477831272,
"grad_norm": 105.75,
"learning_rate": 1.700984780662489e-07,
"loss": 97.2013,
"step": 4205
},
{
"epoch": 0.8478555020611095,
"grad_norm": 106.875,
"learning_rate": 1.689794091316025e-07,
"loss": 97.2643,
"step": 4210
},
{
"epoch": 0.8488624563390919,
"grad_norm": 105.625,
"learning_rate": 1.6786034019695614e-07,
"loss": 97.3306,
"step": 4215
},
{
"epoch": 0.8498694106170742,
"grad_norm": 103.8125,
"learning_rate": 1.6674127126230975e-07,
"loss": 97.9119,
"step": 4220
},
{
"epoch": 0.8508763648950565,
"grad_norm": 102.625,
"learning_rate": 1.6562220232766338e-07,
"loss": 97.3807,
"step": 4225
},
{
"epoch": 0.8518833191730388,
"grad_norm": 107.1875,
"learning_rate": 1.64503133393017e-07,
"loss": 97.2101,
"step": 4230
},
{
"epoch": 0.8528902734510211,
"grad_norm": 104.875,
"learning_rate": 1.6338406445837063e-07,
"loss": 97.9154,
"step": 4235
},
{
"epoch": 0.8538972277290034,
"grad_norm": 105.1875,
"learning_rate": 1.6226499552372424e-07,
"loss": 97.5589,
"step": 4240
},
{
"epoch": 0.8549041820069857,
"grad_norm": 108.125,
"learning_rate": 1.6114592658907788e-07,
"loss": 97.9489,
"step": 4245
},
{
"epoch": 0.8559111362849681,
"grad_norm": 108.625,
"learning_rate": 1.600268576544315e-07,
"loss": 97.1754,
"step": 4250
},
{
"epoch": 0.8569180905629504,
"grad_norm": 107.8125,
"learning_rate": 1.5890778871978513e-07,
"loss": 97.4207,
"step": 4255
},
{
"epoch": 0.8579250448409327,
"grad_norm": 108.0625,
"learning_rate": 1.5778871978513874e-07,
"loss": 97.7349,
"step": 4260
},
{
"epoch": 0.858931999118915,
"grad_norm": 106.9375,
"learning_rate": 1.5666965085049238e-07,
"loss": 96.8319,
"step": 4265
},
{
"epoch": 0.8599389533968973,
"grad_norm": 105.125,
"learning_rate": 1.5555058191584599e-07,
"loss": 97.4651,
"step": 4270
},
{
"epoch": 0.8609459076748797,
"grad_norm": 104.1875,
"learning_rate": 1.5443151298119962e-07,
"loss": 97.7243,
"step": 4275
},
{
"epoch": 0.8619528619528619,
"grad_norm": 107.1875,
"learning_rate": 1.533124440465533e-07,
"loss": 96.1737,
"step": 4280
},
{
"epoch": 0.8629598162308443,
"grad_norm": 103.4375,
"learning_rate": 1.521933751119069e-07,
"loss": 97.9076,
"step": 4285
},
{
"epoch": 0.8639667705088265,
"grad_norm": 103.9375,
"learning_rate": 1.5107430617726054e-07,
"loss": 96.7344,
"step": 4290
},
{
"epoch": 0.8649737247868089,
"grad_norm": 105.125,
"learning_rate": 1.4995523724261415e-07,
"loss": 96.4767,
"step": 4295
},
{
"epoch": 0.8659806790647913,
"grad_norm": 106.0,
"learning_rate": 1.4883616830796778e-07,
"loss": 96.0296,
"step": 4300
},
{
"epoch": 0.8669876333427735,
"grad_norm": 103.4375,
"learning_rate": 1.477170993733214e-07,
"loss": 96.7257,
"step": 4305
},
{
"epoch": 0.8679945876207559,
"grad_norm": 107.625,
"learning_rate": 1.4659803043867503e-07,
"loss": 96.7568,
"step": 4310
},
{
"epoch": 0.8690015418987381,
"grad_norm": 107.1875,
"learning_rate": 1.4547896150402864e-07,
"loss": 97.2062,
"step": 4315
},
{
"epoch": 0.8700084961767205,
"grad_norm": 103.0,
"learning_rate": 1.4435989256938228e-07,
"loss": 96.2074,
"step": 4320
},
{
"epoch": 0.8710154504547027,
"grad_norm": 103.875,
"learning_rate": 1.432408236347359e-07,
"loss": 96.5843,
"step": 4325
},
{
"epoch": 0.8720224047326851,
"grad_norm": 105.5625,
"learning_rate": 1.4212175470008953e-07,
"loss": 97.9795,
"step": 4330
},
{
"epoch": 0.8730293590106675,
"grad_norm": 107.6875,
"learning_rate": 1.4100268576544314e-07,
"loss": 97.3432,
"step": 4335
},
{
"epoch": 0.8740363132886497,
"grad_norm": 103.3125,
"learning_rate": 1.3988361683079678e-07,
"loss": 95.1063,
"step": 4340
},
{
"epoch": 0.8750432675666321,
"grad_norm": 104.3125,
"learning_rate": 1.3876454789615039e-07,
"loss": 95.7163,
"step": 4345
},
{
"epoch": 0.8760502218446143,
"grad_norm": 104.9375,
"learning_rate": 1.3764547896150402e-07,
"loss": 96.0049,
"step": 4350
},
{
"epoch": 0.8770571761225967,
"grad_norm": 104.75,
"learning_rate": 1.3652641002685763e-07,
"loss": 96.9776,
"step": 4355
},
{
"epoch": 0.878064130400579,
"grad_norm": 105.6875,
"learning_rate": 1.3540734109221127e-07,
"loss": 94.5039,
"step": 4360
},
{
"epoch": 0.8790710846785613,
"grad_norm": 106.1875,
"learning_rate": 1.342882721575649e-07,
"loss": 96.5091,
"step": 4365
},
{
"epoch": 0.8800780389565437,
"grad_norm": 107.5625,
"learning_rate": 1.3316920322291852e-07,
"loss": 95.8942,
"step": 4370
},
{
"epoch": 0.8810849932345259,
"grad_norm": 109.0,
"learning_rate": 1.3205013428827216e-07,
"loss": 96.0599,
"step": 4375
},
{
"epoch": 0.8820919475125083,
"grad_norm": 106.875,
"learning_rate": 1.3093106535362577e-07,
"loss": 97.5782,
"step": 4380
},
{
"epoch": 0.8830989017904906,
"grad_norm": 105.5625,
"learning_rate": 1.298119964189794e-07,
"loss": 96.5007,
"step": 4385
},
{
"epoch": 0.8841058560684729,
"grad_norm": 104.625,
"learning_rate": 1.2869292748433302e-07,
"loss": 95.4609,
"step": 4390
},
{
"epoch": 0.8851128103464552,
"grad_norm": 108.4375,
"learning_rate": 1.2757385854968665e-07,
"loss": 97.2176,
"step": 4395
},
{
"epoch": 0.8861197646244375,
"grad_norm": 104.8125,
"learning_rate": 1.2645478961504026e-07,
"loss": 96.037,
"step": 4400
},
{
"epoch": 0.8871267189024198,
"grad_norm": 105.3125,
"learning_rate": 1.2533572068039393e-07,
"loss": 95.1831,
"step": 4405
},
{
"epoch": 0.8881336731804022,
"grad_norm": 102.5,
"learning_rate": 1.2421665174574754e-07,
"loss": 94.7369,
"step": 4410
},
{
"epoch": 0.8891406274583845,
"grad_norm": 105.3125,
"learning_rate": 1.2309758281110117e-07,
"loss": 95.9481,
"step": 4415
},
{
"epoch": 0.8901475817363668,
"grad_norm": 104.4375,
"learning_rate": 1.2197851387645479e-07,
"loss": 96.1412,
"step": 4420
},
{
"epoch": 0.8911545360143491,
"grad_norm": 102.75,
"learning_rate": 1.2085944494180842e-07,
"loss": 96.705,
"step": 4425
},
{
"epoch": 0.8921614902923314,
"grad_norm": 107.8125,
"learning_rate": 1.1974037600716203e-07,
"loss": 96.1038,
"step": 4430
},
{
"epoch": 0.8931684445703137,
"grad_norm": 109.875,
"learning_rate": 1.1862130707251566e-07,
"loss": 98.2343,
"step": 4435
},
{
"epoch": 0.894175398848296,
"grad_norm": 104.8125,
"learning_rate": 1.175022381378693e-07,
"loss": 98.8879,
"step": 4440
},
{
"epoch": 0.8951823531262784,
"grad_norm": 104.5625,
"learning_rate": 1.1638316920322292e-07,
"loss": 95.8946,
"step": 4445
},
{
"epoch": 0.8961893074042607,
"grad_norm": 107.9375,
"learning_rate": 1.1526410026857654e-07,
"loss": 96.5908,
"step": 4450
},
{
"epoch": 0.897196261682243,
"grad_norm": 100.6875,
"learning_rate": 1.1414503133393017e-07,
"loss": 97.5184,
"step": 4455
},
{
"epoch": 0.8982032159602253,
"grad_norm": 105.8125,
"learning_rate": 1.1302596239928379e-07,
"loss": 97.1954,
"step": 4460
},
{
"epoch": 0.8992101702382076,
"grad_norm": 101.1875,
"learning_rate": 1.1190689346463741e-07,
"loss": 95.303,
"step": 4465
},
{
"epoch": 0.90021712451619,
"grad_norm": 106.9375,
"learning_rate": 1.1078782452999104e-07,
"loss": 95.9828,
"step": 4470
},
{
"epoch": 0.9012240787941722,
"grad_norm": 109.875,
"learning_rate": 1.0966875559534466e-07,
"loss": 96.7188,
"step": 4475
},
{
"epoch": 0.9022310330721546,
"grad_norm": 103.5,
"learning_rate": 1.0854968666069829e-07,
"loss": 96.2868,
"step": 4480
},
{
"epoch": 0.9032379873501368,
"grad_norm": 108.25,
"learning_rate": 1.0743061772605191e-07,
"loss": 97.6396,
"step": 4485
},
{
"epoch": 0.9042449416281192,
"grad_norm": 105.625,
"learning_rate": 1.0631154879140553e-07,
"loss": 95.9678,
"step": 4490
},
{
"epoch": 0.9052518959061016,
"grad_norm": 105.1875,
"learning_rate": 1.0519247985675917e-07,
"loss": 96.746,
"step": 4495
},
{
"epoch": 0.9062588501840838,
"grad_norm": 107.25,
"learning_rate": 1.0407341092211281e-07,
"loss": 95.7666,
"step": 4500
},
{
"epoch": 0.9062588501840838,
"eval_loss": 3.013758897781372,
"eval_runtime": 241.0945,
"eval_samples_per_second": 1109.992,
"eval_steps_per_second": 34.688,
"step": 4500
},
{
"epoch": 0.9072658044620662,
"grad_norm": 104.1875,
"learning_rate": 1.0295434198746643e-07,
"loss": 96.1257,
"step": 4505
},
{
"epoch": 0.9082727587400484,
"grad_norm": 105.625,
"learning_rate": 1.0183527305282006e-07,
"loss": 96.9505,
"step": 4510
},
{
"epoch": 0.9092797130180308,
"grad_norm": 108.375,
"learning_rate": 1.0071620411817368e-07,
"loss": 96.6111,
"step": 4515
},
{
"epoch": 0.910286667296013,
"grad_norm": 106.4375,
"learning_rate": 9.95971351835273e-08,
"loss": 97.3165,
"step": 4520
},
{
"epoch": 0.9112936215739954,
"grad_norm": 105.375,
"learning_rate": 9.847806624888093e-08,
"loss": 97.2006,
"step": 4525
},
{
"epoch": 0.9123005758519778,
"grad_norm": 109.0,
"learning_rate": 9.735899731423455e-08,
"loss": 96.5357,
"step": 4530
},
{
"epoch": 0.91330753012996,
"grad_norm": 104.4375,
"learning_rate": 9.623992837958818e-08,
"loss": 96.6608,
"step": 4535
},
{
"epoch": 0.9143144844079424,
"grad_norm": 103.4375,
"learning_rate": 9.51208594449418e-08,
"loss": 96.2924,
"step": 4540
},
{
"epoch": 0.9153214386859246,
"grad_norm": 101.5,
"learning_rate": 9.400179051029543e-08,
"loss": 94.9458,
"step": 4545
},
{
"epoch": 0.916328392963907,
"grad_norm": 108.875,
"learning_rate": 9.288272157564905e-08,
"loss": 95.3736,
"step": 4550
},
{
"epoch": 0.9173353472418893,
"grad_norm": 106.0,
"learning_rate": 9.176365264100267e-08,
"loss": 94.3943,
"step": 4555
},
{
"epoch": 0.9183423015198716,
"grad_norm": 105.4375,
"learning_rate": 9.06445837063563e-08,
"loss": 97.27,
"step": 4560
},
{
"epoch": 0.919349255797854,
"grad_norm": 106.3125,
"learning_rate": 8.952551477170993e-08,
"loss": 95.4415,
"step": 4565
},
{
"epoch": 0.9203562100758362,
"grad_norm": 107.1875,
"learning_rate": 8.840644583706356e-08,
"loss": 96.8434,
"step": 4570
},
{
"epoch": 0.9213631643538186,
"grad_norm": 105.1875,
"learning_rate": 8.728737690241718e-08,
"loss": 96.2896,
"step": 4575
},
{
"epoch": 0.9223701186318009,
"grad_norm": 104.875,
"learning_rate": 8.616830796777082e-08,
"loss": 97.0949,
"step": 4580
},
{
"epoch": 0.9233770729097832,
"grad_norm": 107.375,
"learning_rate": 8.504923903312444e-08,
"loss": 96.0602,
"step": 4585
},
{
"epoch": 0.9243840271877655,
"grad_norm": 105.0,
"learning_rate": 8.393017009847807e-08,
"loss": 96.6697,
"step": 4590
},
{
"epoch": 0.9253909814657478,
"grad_norm": 103.5,
"learning_rate": 8.281110116383169e-08,
"loss": 95.5824,
"step": 4595
},
{
"epoch": 0.9263979357437301,
"grad_norm": 107.5625,
"learning_rate": 8.169203222918532e-08,
"loss": 96.6081,
"step": 4600
},
{
"epoch": 0.9274048900217124,
"grad_norm": 108.4375,
"learning_rate": 8.057296329453894e-08,
"loss": 96.3714,
"step": 4605
},
{
"epoch": 0.9284118442996948,
"grad_norm": 105.3125,
"learning_rate": 7.945389435989256e-08,
"loss": 95.8521,
"step": 4610
},
{
"epoch": 0.9294187985776771,
"grad_norm": 108.125,
"learning_rate": 7.833482542524619e-08,
"loss": 96.356,
"step": 4615
},
{
"epoch": 0.9304257528556594,
"grad_norm": 105.9375,
"learning_rate": 7.721575649059981e-08,
"loss": 96.4865,
"step": 4620
},
{
"epoch": 0.9314327071336417,
"grad_norm": 105.6875,
"learning_rate": 7.609668755595345e-08,
"loss": 95.1476,
"step": 4625
},
{
"epoch": 0.932439661411624,
"grad_norm": 106.9375,
"learning_rate": 7.497761862130707e-08,
"loss": 95.1061,
"step": 4630
},
{
"epoch": 0.9334466156896063,
"grad_norm": 105.5625,
"learning_rate": 7.38585496866607e-08,
"loss": 95.2852,
"step": 4635
},
{
"epoch": 0.9344535699675887,
"grad_norm": 107.5625,
"learning_rate": 7.273948075201432e-08,
"loss": 95.0002,
"step": 4640
},
{
"epoch": 0.935460524245571,
"grad_norm": 106.5625,
"learning_rate": 7.162041181736795e-08,
"loss": 97.3515,
"step": 4645
},
{
"epoch": 0.9364674785235533,
"grad_norm": 106.875,
"learning_rate": 7.050134288272157e-08,
"loss": 96.8893,
"step": 4650
},
{
"epoch": 0.9374744328015356,
"grad_norm": 106.125,
"learning_rate": 6.938227394807519e-08,
"loss": 96.1281,
"step": 4655
},
{
"epoch": 0.9384813870795179,
"grad_norm": 105.4375,
"learning_rate": 6.826320501342882e-08,
"loss": 95.932,
"step": 4660
},
{
"epoch": 0.9394883413575003,
"grad_norm": 106.0625,
"learning_rate": 6.714413607878245e-08,
"loss": 96.195,
"step": 4665
},
{
"epoch": 0.9404952956354825,
"grad_norm": 106.1875,
"learning_rate": 6.602506714413608e-08,
"loss": 94.7684,
"step": 4670
},
{
"epoch": 0.9415022499134649,
"grad_norm": 109.0,
"learning_rate": 6.49059982094897e-08,
"loss": 96.4495,
"step": 4675
},
{
"epoch": 0.9425092041914472,
"grad_norm": 109.0,
"learning_rate": 6.378692927484333e-08,
"loss": 96.9962,
"step": 4680
},
{
"epoch": 0.9435161584694295,
"grad_norm": 104.3125,
"learning_rate": 6.266786034019696e-08,
"loss": 94.3069,
"step": 4685
},
{
"epoch": 0.9445231127474119,
"grad_norm": 107.625,
"learning_rate": 6.154879140555059e-08,
"loss": 96.7521,
"step": 4690
},
{
"epoch": 0.9455300670253941,
"grad_norm": 104.3125,
"learning_rate": 6.042972247090421e-08,
"loss": 96.0066,
"step": 4695
},
{
"epoch": 0.9465370213033765,
"grad_norm": 104.875,
"learning_rate": 5.931065353625783e-08,
"loss": 94.7801,
"step": 4700
},
{
"epoch": 0.9475439755813587,
"grad_norm": 106.375,
"learning_rate": 5.819158460161146e-08,
"loss": 95.1509,
"step": 4705
},
{
"epoch": 0.9485509298593411,
"grad_norm": 104.9375,
"learning_rate": 5.7072515666965083e-08,
"loss": 95.5377,
"step": 4710
},
{
"epoch": 0.9495578841373233,
"grad_norm": 105.0,
"learning_rate": 5.595344673231871e-08,
"loss": 96.0342,
"step": 4715
},
{
"epoch": 0.9505648384153057,
"grad_norm": 106.8125,
"learning_rate": 5.483437779767233e-08,
"loss": 95.7919,
"step": 4720
},
{
"epoch": 0.9515717926932881,
"grad_norm": 104.0625,
"learning_rate": 5.3715308863025955e-08,
"loss": 95.4794,
"step": 4725
},
{
"epoch": 0.9525787469712703,
"grad_norm": 105.1875,
"learning_rate": 5.2596239928379586e-08,
"loss": 96.2796,
"step": 4730
},
{
"epoch": 0.9535857012492527,
"grad_norm": 107.4375,
"learning_rate": 5.147717099373322e-08,
"loss": 96.9097,
"step": 4735
},
{
"epoch": 0.9545926555272349,
"grad_norm": 104.0,
"learning_rate": 5.035810205908684e-08,
"loss": 95.2215,
"step": 4740
},
{
"epoch": 0.9555996098052173,
"grad_norm": 103.875,
"learning_rate": 4.9239033124440465e-08,
"loss": 95.531,
"step": 4745
},
{
"epoch": 0.9566065640831996,
"grad_norm": 104.5625,
"learning_rate": 4.811996418979409e-08,
"loss": 94.8213,
"step": 4750
},
{
"epoch": 0.9576135183611819,
"grad_norm": 103.125,
"learning_rate": 4.700089525514771e-08,
"loss": 94.9124,
"step": 4755
},
{
"epoch": 0.9586204726391643,
"grad_norm": 108.75,
"learning_rate": 4.588182632050134e-08,
"loss": 94.8872,
"step": 4760
},
{
"epoch": 0.9596274269171465,
"grad_norm": 106.0625,
"learning_rate": 4.476275738585497e-08,
"loss": 94.9003,
"step": 4765
},
{
"epoch": 0.9606343811951289,
"grad_norm": 109.0,
"learning_rate": 4.364368845120859e-08,
"loss": 97.4909,
"step": 4770
},
{
"epoch": 0.9616413354731111,
"grad_norm": 105.1875,
"learning_rate": 4.252461951656222e-08,
"loss": 95.4977,
"step": 4775
},
{
"epoch": 0.9626482897510935,
"grad_norm": 103.4375,
"learning_rate": 4.1405550581915846e-08,
"loss": 95.1702,
"step": 4780
},
{
"epoch": 0.9636552440290758,
"grad_norm": 105.4375,
"learning_rate": 4.028648164726947e-08,
"loss": 95.1124,
"step": 4785
},
{
"epoch": 0.9646621983070581,
"grad_norm": 107.125,
"learning_rate": 3.9167412712623094e-08,
"loss": 95.5008,
"step": 4790
},
{
"epoch": 0.9656691525850404,
"grad_norm": 103.8125,
"learning_rate": 3.8048343777976725e-08,
"loss": 96.745,
"step": 4795
},
{
"epoch": 0.9666761068630227,
"grad_norm": 103.875,
"learning_rate": 3.692927484333035e-08,
"loss": 96.3884,
"step": 4800
},
{
"epoch": 0.9676830611410051,
"grad_norm": 103.375,
"learning_rate": 3.581020590868397e-08,
"loss": 94.6912,
"step": 4805
},
{
"epoch": 0.9686900154189874,
"grad_norm": 106.125,
"learning_rate": 3.4691136974037597e-08,
"loss": 95.0865,
"step": 4810
},
{
"epoch": 0.9696969696969697,
"grad_norm": 103.625,
"learning_rate": 3.357206803939123e-08,
"loss": 93.7961,
"step": 4815
},
{
"epoch": 0.970703923974952,
"grad_norm": 104.3125,
"learning_rate": 3.245299910474485e-08,
"loss": 95.4935,
"step": 4820
},
{
"epoch": 0.9717108782529343,
"grad_norm": 102.0625,
"learning_rate": 3.133393017009848e-08,
"loss": 95.072,
"step": 4825
},
{
"epoch": 0.9727178325309166,
"grad_norm": 107.1875,
"learning_rate": 3.0214861235452106e-08,
"loss": 95.2949,
"step": 4830
},
{
"epoch": 0.973724786808899,
"grad_norm": 105.375,
"learning_rate": 2.909579230080573e-08,
"loss": 94.6876,
"step": 4835
},
{
"epoch": 0.9747317410868813,
"grad_norm": 104.5625,
"learning_rate": 2.7976723366159354e-08,
"loss": 96.9518,
"step": 4840
},
{
"epoch": 0.9757386953648636,
"grad_norm": 106.1875,
"learning_rate": 2.6857654431512978e-08,
"loss": 95.4756,
"step": 4845
},
{
"epoch": 0.9767456496428459,
"grad_norm": 107.5625,
"learning_rate": 2.573858549686661e-08,
"loss": 94.8716,
"step": 4850
},
{
"epoch": 0.9777526039208282,
"grad_norm": 103.4375,
"learning_rate": 2.4619516562220232e-08,
"loss": 95.0801,
"step": 4855
},
{
"epoch": 0.9787595581988106,
"grad_norm": 102.75,
"learning_rate": 2.3500447627573856e-08,
"loss": 95.3318,
"step": 4860
},
{
"epoch": 0.9797665124767928,
"grad_norm": 107.75,
"learning_rate": 2.2381378692927484e-08,
"loss": 95.9054,
"step": 4865
},
{
"epoch": 0.9807734667547752,
"grad_norm": 103.1875,
"learning_rate": 2.126230975828111e-08,
"loss": 95.6948,
"step": 4870
},
{
"epoch": 0.9817804210327575,
"grad_norm": 107.5625,
"learning_rate": 2.0143240823634735e-08,
"loss": 95.1651,
"step": 4875
},
{
"epoch": 0.9827873753107398,
"grad_norm": 102.3125,
"learning_rate": 1.9024171888988362e-08,
"loss": 95.8977,
"step": 4880
},
{
"epoch": 0.9837943295887221,
"grad_norm": 107.3125,
"learning_rate": 1.7905102954341986e-08,
"loss": 94.0943,
"step": 4885
},
{
"epoch": 0.9848012838667044,
"grad_norm": 105.8125,
"learning_rate": 1.6786034019695614e-08,
"loss": 96.5686,
"step": 4890
},
{
"epoch": 0.9858082381446868,
"grad_norm": 104.75,
"learning_rate": 1.566696508504924e-08,
"loss": 96.2139,
"step": 4895
},
{
"epoch": 0.986815192422669,
"grad_norm": 106.5625,
"learning_rate": 1.4547896150402865e-08,
"loss": 96.4123,
"step": 4900
},
{
"epoch": 0.9878221467006514,
"grad_norm": 107.5,
"learning_rate": 1.3428827215756489e-08,
"loss": 95.4067,
"step": 4905
},
{
"epoch": 0.9888291009786336,
"grad_norm": 106.125,
"learning_rate": 1.2309758281110116e-08,
"loss": 96.4161,
"step": 4910
},
{
"epoch": 0.989836055256616,
"grad_norm": 104.6875,
"learning_rate": 1.1190689346463742e-08,
"loss": 94.9028,
"step": 4915
},
{
"epoch": 0.9908430095345984,
"grad_norm": 106.0625,
"learning_rate": 1.0071620411817367e-08,
"loss": 96.9095,
"step": 4920
},
{
"epoch": 0.9918499638125806,
"grad_norm": 106.8125,
"learning_rate": 8.952551477170993e-09,
"loss": 94.9621,
"step": 4925
},
{
"epoch": 0.992856918090563,
"grad_norm": 105.75,
"learning_rate": 7.83348254252462e-09,
"loss": 95.0764,
"step": 4930
},
{
"epoch": 0.9938638723685452,
"grad_norm": 107.5,
"learning_rate": 6.7144136078782444e-09,
"loss": 96.6513,
"step": 4935
},
{
"epoch": 0.9948708266465276,
"grad_norm": 104.1875,
"learning_rate": 5.595344673231871e-09,
"loss": 94.489,
"step": 4940
},
{
"epoch": 0.9958777809245098,
"grad_norm": 105.75,
"learning_rate": 4.4762757385854966e-09,
"loss": 95.3881,
"step": 4945
},
{
"epoch": 0.9968847352024922,
"grad_norm": 108.0625,
"learning_rate": 3.3572068039391222e-09,
"loss": 95.4261,
"step": 4950
},
{
"epoch": 0.9978916894804746,
"grad_norm": 105.9375,
"learning_rate": 2.2381378692927483e-09,
"loss": 95.8491,
"step": 4955
},
{
"epoch": 0.9988986437584568,
"grad_norm": 104.6875,
"learning_rate": 1.1190689346463741e-09,
"loss": 94.8424,
"step": 4960
},
{
"epoch": 0.9999055980364392,
"grad_norm": 101.75,
"learning_rate": 0.0,
"loss": 94.7523,
"step": 4965
}
],
"logging_steps": 5,
"max_steps": 4965,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.151015743419633e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}