GENOME-gemma-2b-it / cot /trainer_state.json
Estwld's picture
Upload 15 files
e64b907 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 1000,
"global_step": 7385,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006770480704129994,
"grad_norm": 2.130030393600464,
"learning_rate": 2.7063599458728013e-06,
"loss": 2.3319,
"step": 10
},
{
"epoch": 0.013540961408259987,
"grad_norm": 2.666555881500244,
"learning_rate": 5.4127198917456026e-06,
"loss": 2.3443,
"step": 20
},
{
"epoch": 0.020311442112389978,
"grad_norm": 2.274488687515259,
"learning_rate": 8.119079837618404e-06,
"loss": 2.3759,
"step": 30
},
{
"epoch": 0.027081922816519974,
"grad_norm": 2.197918653488159,
"learning_rate": 1.0825439783491205e-05,
"loss": 2.1286,
"step": 40
},
{
"epoch": 0.033852403520649964,
"grad_norm": 2.2513201236724854,
"learning_rate": 1.3531799729364006e-05,
"loss": 1.9161,
"step": 50
},
{
"epoch": 0.040622884224779957,
"grad_norm": 1.52046537399292,
"learning_rate": 1.6238159675236808e-05,
"loss": 1.6287,
"step": 60
},
{
"epoch": 0.04739336492890995,
"grad_norm": 1.0912840366363525,
"learning_rate": 1.894451962110961e-05,
"loss": 1.5206,
"step": 70
},
{
"epoch": 0.05416384563303995,
"grad_norm": 1.050105333328247,
"learning_rate": 2.165087956698241e-05,
"loss": 1.3484,
"step": 80
},
{
"epoch": 0.06093432633716994,
"grad_norm": 1.138007402420044,
"learning_rate": 2.435723951285521e-05,
"loss": 1.3352,
"step": 90
},
{
"epoch": 0.06770480704129993,
"grad_norm": 1.0807892084121704,
"learning_rate": 2.7063599458728013e-05,
"loss": 1.2605,
"step": 100
},
{
"epoch": 0.07447528774542993,
"grad_norm": 1.1421936750411987,
"learning_rate": 2.976995940460081e-05,
"loss": 1.1888,
"step": 110
},
{
"epoch": 0.08124576844955991,
"grad_norm": 1.2684075832366943,
"learning_rate": 3.2476319350473615e-05,
"loss": 1.1998,
"step": 120
},
{
"epoch": 0.08801624915368991,
"grad_norm": 1.1413911581039429,
"learning_rate": 3.518267929634642e-05,
"loss": 1.1426,
"step": 130
},
{
"epoch": 0.0947867298578199,
"grad_norm": 1.3954917192459106,
"learning_rate": 3.788903924221922e-05,
"loss": 1.1437,
"step": 140
},
{
"epoch": 0.1015572105619499,
"grad_norm": 1.2118768692016602,
"learning_rate": 4.059539918809202e-05,
"loss": 1.0564,
"step": 150
},
{
"epoch": 0.1083276912660799,
"grad_norm": 1.4291969537734985,
"learning_rate": 4.330175913396482e-05,
"loss": 1.0382,
"step": 160
},
{
"epoch": 0.11509817197020988,
"grad_norm": 1.351151943206787,
"learning_rate": 4.600811907983762e-05,
"loss": 1.0717,
"step": 170
},
{
"epoch": 0.12186865267433988,
"grad_norm": 1.3836501836776733,
"learning_rate": 4.871447902571042e-05,
"loss": 1.0294,
"step": 180
},
{
"epoch": 0.12863913337846988,
"grad_norm": 1.2129018306732178,
"learning_rate": 5.142083897158322e-05,
"loss": 1.0081,
"step": 190
},
{
"epoch": 0.13540961408259986,
"grad_norm": 1.244095802307129,
"learning_rate": 5.4127198917456026e-05,
"loss": 0.9383,
"step": 200
},
{
"epoch": 0.14218009478672985,
"grad_norm": 1.3957242965698242,
"learning_rate": 5.683355886332883e-05,
"loss": 0.927,
"step": 210
},
{
"epoch": 0.14895057549085985,
"grad_norm": 1.688636302947998,
"learning_rate": 5.953991880920162e-05,
"loss": 0.9617,
"step": 220
},
{
"epoch": 0.15572105619498985,
"grad_norm": 1.376826524734497,
"learning_rate": 6.224627875507443e-05,
"loss": 1.0176,
"step": 230
},
{
"epoch": 0.16249153689911983,
"grad_norm": 1.4289461374282837,
"learning_rate": 6.495263870094723e-05,
"loss": 0.9733,
"step": 240
},
{
"epoch": 0.16926201760324983,
"grad_norm": 1.4132306575775146,
"learning_rate": 6.765899864682003e-05,
"loss": 1.0141,
"step": 250
},
{
"epoch": 0.17603249830737983,
"grad_norm": 1.482531189918518,
"learning_rate": 7.036535859269283e-05,
"loss": 0.977,
"step": 260
},
{
"epoch": 0.18280297901150983,
"grad_norm": 1.509128212928772,
"learning_rate": 7.307171853856563e-05,
"loss": 0.9624,
"step": 270
},
{
"epoch": 0.1895734597156398,
"grad_norm": 1.7142691612243652,
"learning_rate": 7.577807848443844e-05,
"loss": 1.0063,
"step": 280
},
{
"epoch": 0.1963439404197698,
"grad_norm": 1.2345936298370361,
"learning_rate": 7.848443843031124e-05,
"loss": 0.9562,
"step": 290
},
{
"epoch": 0.2031144211238998,
"grad_norm": 1.4808542728424072,
"learning_rate": 8.119079837618404e-05,
"loss": 1.0207,
"step": 300
},
{
"epoch": 0.2098849018280298,
"grad_norm": 0.9802400469779968,
"learning_rate": 8.389715832205684e-05,
"loss": 0.9731,
"step": 310
},
{
"epoch": 0.2166553825321598,
"grad_norm": 1.2837491035461426,
"learning_rate": 8.660351826792964e-05,
"loss": 0.9732,
"step": 320
},
{
"epoch": 0.22342586323628977,
"grad_norm": 1.6100679636001587,
"learning_rate": 8.930987821380244e-05,
"loss": 0.9645,
"step": 330
},
{
"epoch": 0.23019634394041977,
"grad_norm": 1.65373957157135,
"learning_rate": 9.201623815967524e-05,
"loss": 0.9825,
"step": 340
},
{
"epoch": 0.23696682464454977,
"grad_norm": 1.4988625049591064,
"learning_rate": 9.472259810554804e-05,
"loss": 0.9521,
"step": 350
},
{
"epoch": 0.24373730534867977,
"grad_norm": 1.0492310523986816,
"learning_rate": 9.742895805142085e-05,
"loss": 0.9418,
"step": 360
},
{
"epoch": 0.25050778605280977,
"grad_norm": 1.26401948928833,
"learning_rate": 0.00010013531799729365,
"loss": 1.0314,
"step": 370
},
{
"epoch": 0.25727826675693977,
"grad_norm": 1.3206366300582886,
"learning_rate": 0.00010284167794316644,
"loss": 0.9194,
"step": 380
},
{
"epoch": 0.2640487474610697,
"grad_norm": 1.533471941947937,
"learning_rate": 0.00010554803788903924,
"loss": 0.9,
"step": 390
},
{
"epoch": 0.2708192281651997,
"grad_norm": 1.2870343923568726,
"learning_rate": 0.00010825439783491205,
"loss": 0.911,
"step": 400
},
{
"epoch": 0.2775897088693297,
"grad_norm": 1.3480168581008911,
"learning_rate": 0.00011096075778078485,
"loss": 0.9127,
"step": 410
},
{
"epoch": 0.2843601895734597,
"grad_norm": 1.1548075675964355,
"learning_rate": 0.00011366711772665765,
"loss": 0.9206,
"step": 420
},
{
"epoch": 0.2911306702775897,
"grad_norm": 1.000781536102295,
"learning_rate": 0.00011637347767253047,
"loss": 0.9248,
"step": 430
},
{
"epoch": 0.2979011509817197,
"grad_norm": 1.0907179117202759,
"learning_rate": 0.00011907983761840324,
"loss": 0.897,
"step": 440
},
{
"epoch": 0.3046716316858497,
"grad_norm": 1.3253204822540283,
"learning_rate": 0.00012178619756427604,
"loss": 0.9503,
"step": 450
},
{
"epoch": 0.3114421123899797,
"grad_norm": 1.186468482017517,
"learning_rate": 0.00012449255751014886,
"loss": 0.885,
"step": 460
},
{
"epoch": 0.3182125930941097,
"grad_norm": 1.0382546186447144,
"learning_rate": 0.00012719891745602166,
"loss": 0.937,
"step": 470
},
{
"epoch": 0.32498307379823965,
"grad_norm": 0.9156469702720642,
"learning_rate": 0.00012990527740189446,
"loss": 0.9407,
"step": 480
},
{
"epoch": 0.33175355450236965,
"grad_norm": 1.2555314302444458,
"learning_rate": 0.00013261163734776726,
"loss": 0.9349,
"step": 490
},
{
"epoch": 0.33852403520649965,
"grad_norm": 1.1427136659622192,
"learning_rate": 0.00013531799729364006,
"loss": 0.9034,
"step": 500
},
{
"epoch": 0.34529451591062965,
"grad_norm": 0.9024341106414795,
"learning_rate": 0.00013802435723951287,
"loss": 0.8431,
"step": 510
},
{
"epoch": 0.35206499661475965,
"grad_norm": 1.0170283317565918,
"learning_rate": 0.00014073071718538567,
"loss": 0.9392,
"step": 520
},
{
"epoch": 0.35883547731888965,
"grad_norm": 0.9581354856491089,
"learning_rate": 0.00014343707713125847,
"loss": 0.9557,
"step": 530
},
{
"epoch": 0.36560595802301965,
"grad_norm": 1.1668641567230225,
"learning_rate": 0.00014614343707713127,
"loss": 0.8982,
"step": 540
},
{
"epoch": 0.37237643872714965,
"grad_norm": 1.249225378036499,
"learning_rate": 0.00014884979702300404,
"loss": 0.8719,
"step": 550
},
{
"epoch": 0.3791469194312796,
"grad_norm": 0.8681928515434265,
"learning_rate": 0.00015155615696887687,
"loss": 0.9412,
"step": 560
},
{
"epoch": 0.3859174001354096,
"grad_norm": 0.8795790672302246,
"learning_rate": 0.00015426251691474967,
"loss": 0.9476,
"step": 570
},
{
"epoch": 0.3926878808395396,
"grad_norm": 1.2251633405685425,
"learning_rate": 0.00015696887686062247,
"loss": 0.9401,
"step": 580
},
{
"epoch": 0.3994583615436696,
"grad_norm": 0.9845913052558899,
"learning_rate": 0.00015967523680649528,
"loss": 0.8447,
"step": 590
},
{
"epoch": 0.4062288422477996,
"grad_norm": 1.3847956657409668,
"learning_rate": 0.00016238159675236808,
"loss": 0.9562,
"step": 600
},
{
"epoch": 0.4129993229519296,
"grad_norm": 0.9039000272750854,
"learning_rate": 0.00016508795669824085,
"loss": 0.8706,
"step": 610
},
{
"epoch": 0.4197698036560596,
"grad_norm": 0.8315423130989075,
"learning_rate": 0.00016779431664411368,
"loss": 0.9437,
"step": 620
},
{
"epoch": 0.4265402843601896,
"grad_norm": 0.8760778903961182,
"learning_rate": 0.00017050067658998648,
"loss": 0.9078,
"step": 630
},
{
"epoch": 0.4333107650643196,
"grad_norm": 1.0592724084854126,
"learning_rate": 0.00017320703653585928,
"loss": 0.8835,
"step": 640
},
{
"epoch": 0.44008124576844954,
"grad_norm": 0.8527820706367493,
"learning_rate": 0.00017591339648173208,
"loss": 0.9088,
"step": 650
},
{
"epoch": 0.44685172647257954,
"grad_norm": 0.8774325847625732,
"learning_rate": 0.00017861975642760488,
"loss": 0.8967,
"step": 660
},
{
"epoch": 0.45362220717670954,
"grad_norm": 0.6633328795433044,
"learning_rate": 0.00018132611637347766,
"loss": 0.9158,
"step": 670
},
{
"epoch": 0.46039268788083954,
"grad_norm": 0.7048283219337463,
"learning_rate": 0.0001840324763193505,
"loss": 0.872,
"step": 680
},
{
"epoch": 0.46716316858496953,
"grad_norm": 0.8527712225914001,
"learning_rate": 0.0001867388362652233,
"loss": 0.9062,
"step": 690
},
{
"epoch": 0.47393364928909953,
"grad_norm": 1.095738172531128,
"learning_rate": 0.0001894451962110961,
"loss": 0.89,
"step": 700
},
{
"epoch": 0.48070412999322953,
"grad_norm": 0.8880236148834229,
"learning_rate": 0.0001921515561569689,
"loss": 0.8825,
"step": 710
},
{
"epoch": 0.48747461069735953,
"grad_norm": 0.7381774187088013,
"learning_rate": 0.0001948579161028417,
"loss": 0.8121,
"step": 720
},
{
"epoch": 0.4942450914014895,
"grad_norm": 0.9708958864212036,
"learning_rate": 0.0001975642760487145,
"loss": 0.8458,
"step": 730
},
{
"epoch": 0.5010155721056195,
"grad_norm": 1.0069886445999146,
"learning_rate": 0.00019999998882753333,
"loss": 0.8679,
"step": 740
},
{
"epoch": 0.5077860528097495,
"grad_norm": 0.8364754915237427,
"learning_rate": 0.00019999864813455363,
"loss": 0.8797,
"step": 750
},
{
"epoch": 0.5145565335138795,
"grad_norm": 0.8467391133308411,
"learning_rate": 0.0001999950729825663,
"loss": 0.8789,
"step": 760
},
{
"epoch": 0.5213270142180095,
"grad_norm": 0.749064028263092,
"learning_rate": 0.00019998926345145775,
"loss": 0.9156,
"step": 770
},
{
"epoch": 0.5280974949221394,
"grad_norm": 0.7991885542869568,
"learning_rate": 0.00019998121967104132,
"loss": 0.919,
"step": 780
},
{
"epoch": 0.5348679756262694,
"grad_norm": 0.8024610877037048,
"learning_rate": 0.00019997094182105447,
"loss": 0.8619,
"step": 790
},
{
"epoch": 0.5416384563303994,
"grad_norm": 0.8949725031852722,
"learning_rate": 0.00019995843013115454,
"loss": 0.86,
"step": 800
},
{
"epoch": 0.5484089370345294,
"grad_norm": 0.9048612713813782,
"learning_rate": 0.00019994368488091398,
"loss": 0.9258,
"step": 810
},
{
"epoch": 0.5551794177386594,
"grad_norm": 1.112876057624817,
"learning_rate": 0.00019992670639981376,
"loss": 0.8758,
"step": 820
},
{
"epoch": 0.5619498984427894,
"grad_norm": 0.9120655059814453,
"learning_rate": 0.00019990749506723624,
"loss": 0.9112,
"step": 830
},
{
"epoch": 0.5687203791469194,
"grad_norm": 0.9125117063522339,
"learning_rate": 0.00019988605131245662,
"loss": 0.899,
"step": 840
},
{
"epoch": 0.5754908598510494,
"grad_norm": 0.8011307716369629,
"learning_rate": 0.00019986237561463318,
"loss": 0.8604,
"step": 850
},
{
"epoch": 0.5822613405551794,
"grad_norm": 0.7512729167938232,
"learning_rate": 0.00019983646850279692,
"loss": 0.8411,
"step": 860
},
{
"epoch": 0.5890318212593094,
"grad_norm": 0.7400951981544495,
"learning_rate": 0.0001998083305558394,
"loss": 0.9106,
"step": 870
},
{
"epoch": 0.5958023019634394,
"grad_norm": 0.8688220381736755,
"learning_rate": 0.00019977796240250008,
"loss": 0.9071,
"step": 880
},
{
"epoch": 0.6025727826675694,
"grad_norm": 0.9177795052528381,
"learning_rate": 0.00019974536472135203,
"loss": 0.9038,
"step": 890
},
{
"epoch": 0.6093432633716994,
"grad_norm": 0.986629843711853,
"learning_rate": 0.00019971053824078693,
"loss": 0.8832,
"step": 900
},
{
"epoch": 0.6161137440758294,
"grad_norm": 0.7033129334449768,
"learning_rate": 0.00019967348373899868,
"loss": 0.845,
"step": 910
},
{
"epoch": 0.6228842247799594,
"grad_norm": 0.8107329607009888,
"learning_rate": 0.0001996342020439662,
"loss": 0.9287,
"step": 920
},
{
"epoch": 0.6296547054840894,
"grad_norm": 0.7914236783981323,
"learning_rate": 0.00019959269403343474,
"loss": 0.8836,
"step": 930
},
{
"epoch": 0.6364251861882194,
"grad_norm": 0.8895307183265686,
"learning_rate": 0.00019954896063489622,
"loss": 0.8759,
"step": 940
},
{
"epoch": 0.6431956668923493,
"grad_norm": 0.8289987444877625,
"learning_rate": 0.0001995030028255688,
"loss": 0.9136,
"step": 950
},
{
"epoch": 0.6499661475964793,
"grad_norm": 0.9810376167297363,
"learning_rate": 0.00019945482163237472,
"loss": 0.8388,
"step": 960
},
{
"epoch": 0.6567366283006093,
"grad_norm": 0.7306379079818726,
"learning_rate": 0.0001994044181319176,
"loss": 0.8804,
"step": 970
},
{
"epoch": 0.6635071090047393,
"grad_norm": 0.7892174124717712,
"learning_rate": 0.00019935179345045815,
"loss": 0.8671,
"step": 980
},
{
"epoch": 0.6702775897088693,
"grad_norm": 0.9007791876792908,
"learning_rate": 0.0001992969487638893,
"loss": 0.8661,
"step": 990
},
{
"epoch": 0.6770480704129993,
"grad_norm": 0.7324849963188171,
"learning_rate": 0.00019923988529770958,
"loss": 0.7901,
"step": 1000
},
{
"epoch": 0.6770480704129993,
"eval_loss": 0.8919770121574402,
"eval_runtime": 23.6227,
"eval_samples_per_second": 105.323,
"eval_steps_per_second": 13.165,
"step": 1000
},
{
"epoch": 0.6838185511171293,
"grad_norm": 0.8670386672019958,
"learning_rate": 0.000199180604326996,
"loss": 0.8084,
"step": 1010
},
{
"epoch": 0.6905890318212593,
"grad_norm": 1.3103822469711304,
"learning_rate": 0.00019911910717637548,
"loss": 0.8708,
"step": 1020
},
{
"epoch": 0.6973595125253893,
"grad_norm": 0.8602836728096008,
"learning_rate": 0.00019905539521999517,
"loss": 0.8608,
"step": 1030
},
{
"epoch": 0.7041299932295193,
"grad_norm": 0.7158609628677368,
"learning_rate": 0.00019898946988149193,
"loss": 0.9042,
"step": 1040
},
{
"epoch": 0.7109004739336493,
"grad_norm": 0.6975676417350769,
"learning_rate": 0.0001989213326339603,
"loss": 0.8896,
"step": 1050
},
{
"epoch": 0.7176709546377793,
"grad_norm": 0.7300527095794678,
"learning_rate": 0.00019885098499991972,
"loss": 0.8685,
"step": 1060
},
{
"epoch": 0.7244414353419093,
"grad_norm": 0.6200681924819946,
"learning_rate": 0.0001987784285512805,
"loss": 0.8615,
"step": 1070
},
{
"epoch": 0.7312119160460393,
"grad_norm": 0.7945191860198975,
"learning_rate": 0.00019870366490930868,
"loss": 0.8786,
"step": 1080
},
{
"epoch": 0.7379823967501693,
"grad_norm": 0.6641054749488831,
"learning_rate": 0.0001986266957445897,
"loss": 0.8872,
"step": 1090
},
{
"epoch": 0.7447528774542993,
"grad_norm": 0.7063596844673157,
"learning_rate": 0.00019854752277699138,
"loss": 0.8544,
"step": 1100
},
{
"epoch": 0.7515233581584293,
"grad_norm": 0.6685433983802795,
"learning_rate": 0.000198466147775625,
"loss": 0.8256,
"step": 1110
},
{
"epoch": 0.7582938388625592,
"grad_norm": 0.6927530765533447,
"learning_rate": 0.00019838257255880626,
"loss": 0.8642,
"step": 1120
},
{
"epoch": 0.7650643195666892,
"grad_norm": 0.7018571496009827,
"learning_rate": 0.00019829679899401436,
"loss": 0.8624,
"step": 1130
},
{
"epoch": 0.7718348002708192,
"grad_norm": 0.8826500773429871,
"learning_rate": 0.00019820882899785038,
"loss": 0.8312,
"step": 1140
},
{
"epoch": 0.7786052809749492,
"grad_norm": 0.9699224233627319,
"learning_rate": 0.00019811866453599435,
"loss": 0.8467,
"step": 1150
},
{
"epoch": 0.7853757616790792,
"grad_norm": 0.7322418689727783,
"learning_rate": 0.00019802630762316145,
"loss": 0.8456,
"step": 1160
},
{
"epoch": 0.7921462423832092,
"grad_norm": 0.768301248550415,
"learning_rate": 0.00019793176032305697,
"loss": 0.8391,
"step": 1170
},
{
"epoch": 0.7989167230873392,
"grad_norm": 0.8243605494499207,
"learning_rate": 0.00019783502474833009,
"loss": 0.904,
"step": 1180
},
{
"epoch": 0.8056872037914692,
"grad_norm": 0.7215325236320496,
"learning_rate": 0.00019773610306052683,
"loss": 0.8494,
"step": 1190
},
{
"epoch": 0.8124576844955992,
"grad_norm": 0.7619712948799133,
"learning_rate": 0.00019763499747004165,
"loss": 0.8865,
"step": 1200
},
{
"epoch": 0.8192281651997292,
"grad_norm": 0.835599958896637,
"learning_rate": 0.000197531710236068,
"loss": 0.8733,
"step": 1210
},
{
"epoch": 0.8259986459038592,
"grad_norm": 0.8382962942123413,
"learning_rate": 0.00019742624366654802,
"loss": 0.9122,
"step": 1220
},
{
"epoch": 0.8327691266079892,
"grad_norm": 0.666801393032074,
"learning_rate": 0.00019731860011812087,
"loss": 0.8429,
"step": 1230
},
{
"epoch": 0.8395396073121192,
"grad_norm": 0.7756575345993042,
"learning_rate": 0.00019720878199606996,
"loss": 0.9004,
"step": 1240
},
{
"epoch": 0.8463100880162492,
"grad_norm": 0.7014258503913879,
"learning_rate": 0.00019709679175426942,
"loss": 0.9241,
"step": 1250
},
{
"epoch": 0.8530805687203792,
"grad_norm": 0.6827540397644043,
"learning_rate": 0.00019698263189512914,
"loss": 0.8566,
"step": 1260
},
{
"epoch": 0.8598510494245092,
"grad_norm": 0.9167826771736145,
"learning_rate": 0.00019686630496953882,
"loss": 0.9116,
"step": 1270
},
{
"epoch": 0.8666215301286392,
"grad_norm": 0.8172047138214111,
"learning_rate": 0.00019674781357681108,
"loss": 0.8052,
"step": 1280
},
{
"epoch": 0.8733920108327691,
"grad_norm": 0.7139961123466492,
"learning_rate": 0.00019662716036462335,
"loss": 0.89,
"step": 1290
},
{
"epoch": 0.8801624915368991,
"grad_norm": 0.9733943939208984,
"learning_rate": 0.0001965043480289586,
"loss": 0.8191,
"step": 1300
},
{
"epoch": 0.8869329722410291,
"grad_norm": 0.849946916103363,
"learning_rate": 0.00019637937931404523,
"loss": 0.8995,
"step": 1310
},
{
"epoch": 0.8937034529451591,
"grad_norm": 0.6809601187705994,
"learning_rate": 0.00019625225701229573,
"loss": 0.8582,
"step": 1320
},
{
"epoch": 0.9004739336492891,
"grad_norm": 0.7891602516174316,
"learning_rate": 0.00019612298396424417,
"loss": 0.844,
"step": 1330
},
{
"epoch": 0.9072444143534191,
"grad_norm": 0.6357580423355103,
"learning_rate": 0.0001959915630584829,
"loss": 0.8609,
"step": 1340
},
{
"epoch": 0.9140148950575491,
"grad_norm": 0.9102625846862793,
"learning_rate": 0.00019585799723159788,
"loss": 0.91,
"step": 1350
},
{
"epoch": 0.9207853757616791,
"grad_norm": 0.690881609916687,
"learning_rate": 0.0001957222894681031,
"loss": 0.8287,
"step": 1360
},
{
"epoch": 0.9275558564658091,
"grad_norm": 0.6755393743515015,
"learning_rate": 0.00019558444280037393,
"loss": 0.7931,
"step": 1370
},
{
"epoch": 0.9343263371699391,
"grad_norm": 0.6997596025466919,
"learning_rate": 0.00019544446030857922,
"loss": 0.8941,
"step": 1380
},
{
"epoch": 0.9410968178740691,
"grad_norm": 0.8115108013153076,
"learning_rate": 0.0001953023451206127,
"loss": 0.8674,
"step": 1390
},
{
"epoch": 0.9478672985781991,
"grad_norm": 0.6413692235946655,
"learning_rate": 0.00019515810041202295,
"loss": 0.8462,
"step": 1400
},
{
"epoch": 0.9546377792823291,
"grad_norm": 0.6888745427131653,
"learning_rate": 0.00019501172940594242,
"loss": 0.8594,
"step": 1410
},
{
"epoch": 0.9614082599864591,
"grad_norm": 0.8250995874404907,
"learning_rate": 0.00019486323537301538,
"loss": 0.8622,
"step": 1420
},
{
"epoch": 0.9681787406905891,
"grad_norm": 0.7127440571784973,
"learning_rate": 0.00019471262163132504,
"loss": 0.8626,
"step": 1430
},
{
"epoch": 0.9749492213947191,
"grad_norm": 0.6688849925994873,
"learning_rate": 0.0001945598915463192,
"loss": 0.871,
"step": 1440
},
{
"epoch": 0.9817197020988491,
"grad_norm": 0.8800045251846313,
"learning_rate": 0.00019440504853073516,
"loss": 0.8555,
"step": 1450
},
{
"epoch": 0.988490182802979,
"grad_norm": 0.7973435521125793,
"learning_rate": 0.00019424809604452338,
"loss": 0.826,
"step": 1460
},
{
"epoch": 0.995260663507109,
"grad_norm": 0.7803165316581726,
"learning_rate": 0.00019408903759477025,
"loss": 0.8657,
"step": 1470
},
{
"epoch": 1.002031144211239,
"grad_norm": 0.9152759313583374,
"learning_rate": 0.00019392787673561964,
"loss": 0.8114,
"step": 1480
},
{
"epoch": 1.008801624915369,
"grad_norm": 0.717939555644989,
"learning_rate": 0.00019376461706819358,
"loss": 0.7081,
"step": 1490
},
{
"epoch": 1.015572105619499,
"grad_norm": 0.8752790093421936,
"learning_rate": 0.00019359926224051178,
"loss": 0.697,
"step": 1500
},
{
"epoch": 1.022342586323629,
"grad_norm": 0.7938421368598938,
"learning_rate": 0.00019343181594740996,
"loss": 0.7743,
"step": 1510
},
{
"epoch": 1.029113067027759,
"grad_norm": 0.8380940556526184,
"learning_rate": 0.00019326228193045753,
"loss": 0.7965,
"step": 1520
},
{
"epoch": 1.035883547731889,
"grad_norm": 0.8056864142417908,
"learning_rate": 0.00019309066397787378,
"loss": 0.7399,
"step": 1530
},
{
"epoch": 1.042654028436019,
"grad_norm": 0.9307854771614075,
"learning_rate": 0.0001929169659244434,
"loss": 0.7503,
"step": 1540
},
{
"epoch": 1.0494245091401488,
"grad_norm": 0.8573846220970154,
"learning_rate": 0.00019274119165143064,
"loss": 0.7867,
"step": 1550
},
{
"epoch": 1.0561949898442788,
"grad_norm": 0.7639918327331543,
"learning_rate": 0.00019256334508649262,
"loss": 0.7303,
"step": 1560
},
{
"epoch": 1.0629654705484088,
"grad_norm": 0.7085719704627991,
"learning_rate": 0.00019238343020359174,
"loss": 0.7375,
"step": 1570
},
{
"epoch": 1.0697359512525388,
"grad_norm": 0.8645661473274231,
"learning_rate": 0.00019220145102290658,
"loss": 0.7569,
"step": 1580
},
{
"epoch": 1.0765064319566688,
"grad_norm": 0.8893268704414368,
"learning_rate": 0.00019201741161074234,
"loss": 0.7594,
"step": 1590
},
{
"epoch": 1.0832769126607988,
"grad_norm": 0.9011455774307251,
"learning_rate": 0.00019183131607943983,
"loss": 0.7721,
"step": 1600
},
{
"epoch": 1.0900473933649288,
"grad_norm": 0.812759518623352,
"learning_rate": 0.00019164316858728364,
"loss": 0.6816,
"step": 1610
},
{
"epoch": 1.0968178740690588,
"grad_norm": 0.7881085276603699,
"learning_rate": 0.00019145297333840916,
"loss": 0.7927,
"step": 1620
},
{
"epoch": 1.1035883547731888,
"grad_norm": 0.9383792281150818,
"learning_rate": 0.00019126073458270874,
"loss": 0.8416,
"step": 1630
},
{
"epoch": 1.1103588354773188,
"grad_norm": 0.8487265110015869,
"learning_rate": 0.00019106645661573667,
"loss": 0.7731,
"step": 1640
},
{
"epoch": 1.1171293161814488,
"grad_norm": 1.061084270477295,
"learning_rate": 0.0001908701437786131,
"loss": 0.7954,
"step": 1650
},
{
"epoch": 1.1238997968855788,
"grad_norm": 0.7608863115310669,
"learning_rate": 0.00019067180045792724,
"loss": 0.7224,
"step": 1660
},
{
"epoch": 1.1306702775897088,
"grad_norm": 1.0351011753082275,
"learning_rate": 0.0001904714310856392,
"loss": 0.7761,
"step": 1670
},
{
"epoch": 1.1374407582938388,
"grad_norm": 0.8522539138793945,
"learning_rate": 0.00019026904013898097,
"loss": 0.7552,
"step": 1680
},
{
"epoch": 1.1442112389979688,
"grad_norm": 0.9050424098968506,
"learning_rate": 0.00019006463214035646,
"loss": 0.7458,
"step": 1690
},
{
"epoch": 1.1509817197020988,
"grad_norm": 1.0837703943252563,
"learning_rate": 0.00018985821165724034,
"loss": 0.7811,
"step": 1700
},
{
"epoch": 1.1577522004062288,
"grad_norm": 0.7830744385719299,
"learning_rate": 0.00018964978330207605,
"loss": 0.7596,
"step": 1710
},
{
"epoch": 1.1645226811103588,
"grad_norm": 0.8530306220054626,
"learning_rate": 0.0001894393517321727,
"loss": 0.7075,
"step": 1720
},
{
"epoch": 1.1712931618144888,
"grad_norm": 0.9117756485939026,
"learning_rate": 0.00018922692164960098,
"loss": 0.7585,
"step": 1730
},
{
"epoch": 1.1780636425186188,
"grad_norm": 0.9983711242675781,
"learning_rate": 0.00018901249780108823,
"loss": 0.7459,
"step": 1740
},
{
"epoch": 1.1848341232227488,
"grad_norm": 0.9291015267372131,
"learning_rate": 0.00018879608497791224,
"loss": 0.7271,
"step": 1750
},
{
"epoch": 1.1916046039268788,
"grad_norm": 1.0468007326126099,
"learning_rate": 0.00018857768801579415,
"loss": 0.7932,
"step": 1760
},
{
"epoch": 1.1983750846310088,
"grad_norm": 0.8586043119430542,
"learning_rate": 0.00018835731179479056,
"loss": 0.8144,
"step": 1770
},
{
"epoch": 1.2051455653351388,
"grad_norm": 0.7450950741767883,
"learning_rate": 0.00018813496123918432,
"loss": 0.7402,
"step": 1780
},
{
"epoch": 1.2119160460392688,
"grad_norm": 0.9340034127235413,
"learning_rate": 0.00018791064131737462,
"loss": 0.7852,
"step": 1790
},
{
"epoch": 1.2186865267433988,
"grad_norm": 0.9052138328552246,
"learning_rate": 0.00018768435704176597,
"loss": 0.7128,
"step": 1800
},
{
"epoch": 1.2254570074475288,
"grad_norm": 0.8574148416519165,
"learning_rate": 0.00018745611346865606,
"loss": 0.7488,
"step": 1810
},
{
"epoch": 1.2322274881516588,
"grad_norm": 1.0493452548980713,
"learning_rate": 0.00018722591569812294,
"loss": 0.8368,
"step": 1820
},
{
"epoch": 1.2389979688557888,
"grad_norm": 1.019943356513977,
"learning_rate": 0.00018699376887391093,
"loss": 0.8279,
"step": 1830
},
{
"epoch": 1.2457684495599188,
"grad_norm": 0.9113163352012634,
"learning_rate": 0.0001867596781833158,
"loss": 0.7308,
"step": 1840
},
{
"epoch": 1.2525389302640488,
"grad_norm": 0.9192100763320923,
"learning_rate": 0.0001865236488570688,
"loss": 0.783,
"step": 1850
},
{
"epoch": 1.2593094109681786,
"grad_norm": 0.8824251294136047,
"learning_rate": 0.00018628568616921976,
"loss": 0.7581,
"step": 1860
},
{
"epoch": 1.2660798916723088,
"grad_norm": 0.8410795331001282,
"learning_rate": 0.00018604579543701926,
"loss": 0.7696,
"step": 1870
},
{
"epoch": 1.2728503723764386,
"grad_norm": 1.0213907957077026,
"learning_rate": 0.00018580398202079987,
"loss": 0.7202,
"step": 1880
},
{
"epoch": 1.2796208530805688,
"grad_norm": 0.7865493297576904,
"learning_rate": 0.00018556025132385626,
"loss": 0.7685,
"step": 1890
},
{
"epoch": 1.2863913337846986,
"grad_norm": 0.9204791784286499,
"learning_rate": 0.00018531460879232456,
"loss": 0.7814,
"step": 1900
},
{
"epoch": 1.2931618144888288,
"grad_norm": 0.810883104801178,
"learning_rate": 0.00018506705991506067,
"loss": 0.7202,
"step": 1910
},
{
"epoch": 1.2999322951929586,
"grad_norm": 0.8419713973999023,
"learning_rate": 0.00018481761022351757,
"loss": 0.785,
"step": 1920
},
{
"epoch": 1.3067027758970888,
"grad_norm": 0.8345950245857239,
"learning_rate": 0.0001845662652916217,
"loss": 0.7693,
"step": 1930
},
{
"epoch": 1.3134732566012186,
"grad_norm": 0.8708229660987854,
"learning_rate": 0.00018431303073564842,
"loss": 0.8127,
"step": 1940
},
{
"epoch": 1.3202437373053486,
"grad_norm": 0.800879716873169,
"learning_rate": 0.0001840579122140966,
"loss": 0.7804,
"step": 1950
},
{
"epoch": 1.3270142180094786,
"grad_norm": 0.8764187097549438,
"learning_rate": 0.00018380091542756212,
"loss": 0.7563,
"step": 1960
},
{
"epoch": 1.3337846987136086,
"grad_norm": 0.9371510744094849,
"learning_rate": 0.00018354204611861042,
"loss": 0.7382,
"step": 1970
},
{
"epoch": 1.3405551794177386,
"grad_norm": 0.9174867868423462,
"learning_rate": 0.00018328131007164827,
"loss": 0.7543,
"step": 1980
},
{
"epoch": 1.3473256601218686,
"grad_norm": 0.9580458998680115,
"learning_rate": 0.00018301871311279455,
"loss": 0.7877,
"step": 1990
},
{
"epoch": 1.3540961408259986,
"grad_norm": 0.8264724016189575,
"learning_rate": 0.00018275426110975,
"loss": 0.7599,
"step": 2000
},
{
"epoch": 1.3540961408259986,
"eval_loss": 0.8573334813117981,
"eval_runtime": 23.1617,
"eval_samples_per_second": 107.419,
"eval_steps_per_second": 13.427,
"step": 2000
},
{
"epoch": 1.3608666215301286,
"grad_norm": 0.8695821762084961,
"learning_rate": 0.00018248795997166607,
"loss": 0.772,
"step": 2010
},
{
"epoch": 1.3676371022342586,
"grad_norm": 0.9564002752304077,
"learning_rate": 0.000182219815649013,
"loss": 0.8211,
"step": 2020
},
{
"epoch": 1.3744075829383886,
"grad_norm": 0.951923668384552,
"learning_rate": 0.00018194983413344674,
"loss": 0.7549,
"step": 2030
},
{
"epoch": 1.3811780636425186,
"grad_norm": 0.7695098519325256,
"learning_rate": 0.00018167802145767513,
"loss": 0.7133,
"step": 2040
},
{
"epoch": 1.3879485443466486,
"grad_norm": 1.255873203277588,
"learning_rate": 0.0001814043836953231,
"loss": 0.7562,
"step": 2050
},
{
"epoch": 1.3947190250507786,
"grad_norm": 0.8769702315330505,
"learning_rate": 0.00018112892696079698,
"loss": 0.7411,
"step": 2060
},
{
"epoch": 1.4014895057549086,
"grad_norm": 0.9851005673408508,
"learning_rate": 0.00018085165740914776,
"loss": 0.7568,
"step": 2070
},
{
"epoch": 1.4082599864590386,
"grad_norm": 0.8695229887962341,
"learning_rate": 0.00018057258123593367,
"loss": 0.7358,
"step": 2080
},
{
"epoch": 1.4150304671631686,
"grad_norm": 0.9267136454582214,
"learning_rate": 0.00018029170467708165,
"loss": 0.7352,
"step": 2090
},
{
"epoch": 1.4218009478672986,
"grad_norm": 0.8532856106758118,
"learning_rate": 0.00018000903400874823,
"loss": 0.8073,
"step": 2100
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.8961872458457947,
"learning_rate": 0.0001797245755471789,
"loss": 0.7886,
"step": 2110
},
{
"epoch": 1.4353419092755586,
"grad_norm": 0.8943607211112976,
"learning_rate": 0.00017943833564856737,
"loss": 0.7216,
"step": 2120
},
{
"epoch": 1.4421123899796886,
"grad_norm": 0.824885904788971,
"learning_rate": 0.00017915032070891327,
"loss": 0.7077,
"step": 2130
},
{
"epoch": 1.4488828706838186,
"grad_norm": 0.846660315990448,
"learning_rate": 0.00017886053716387935,
"loss": 0.7511,
"step": 2140
},
{
"epoch": 1.4556533513879486,
"grad_norm": 0.8594396710395813,
"learning_rate": 0.00017856899148864774,
"loss": 0.7603,
"step": 2150
},
{
"epoch": 1.4624238320920786,
"grad_norm": 0.8377899527549744,
"learning_rate": 0.00017827569019777503,
"loss": 0.7301,
"step": 2160
},
{
"epoch": 1.4691943127962086,
"grad_norm": 1.0455125570297241,
"learning_rate": 0.00017798063984504698,
"loss": 0.7858,
"step": 2170
},
{
"epoch": 1.4759647935003386,
"grad_norm": 0.9242769479751587,
"learning_rate": 0.00017768384702333188,
"loss": 0.8125,
"step": 2180
},
{
"epoch": 1.4827352742044684,
"grad_norm": 0.9363239407539368,
"learning_rate": 0.00017738531836443332,
"loss": 0.7731,
"step": 2190
},
{
"epoch": 1.4895057549085986,
"grad_norm": 0.8512465953826904,
"learning_rate": 0.000177085060538942,
"loss": 0.7407,
"step": 2200
},
{
"epoch": 1.4962762356127284,
"grad_norm": 0.9729003310203552,
"learning_rate": 0.00017678308025608665,
"loss": 0.7751,
"step": 2210
},
{
"epoch": 1.5030467163168586,
"grad_norm": 0.94197678565979,
"learning_rate": 0.00017647938426358412,
"loss": 0.7642,
"step": 2220
},
{
"epoch": 1.5098171970209884,
"grad_norm": 0.9034068584442139,
"learning_rate": 0.00017617397934748859,
"loss": 0.8069,
"step": 2230
},
{
"epoch": 1.5165876777251186,
"grad_norm": 0.9055565595626831,
"learning_rate": 0.00017586687233204,
"loss": 0.7463,
"step": 2240
},
{
"epoch": 1.5233581584292484,
"grad_norm": 0.9645712971687317,
"learning_rate": 0.00017555807007951142,
"loss": 0.8157,
"step": 2250
},
{
"epoch": 1.5301286391333786,
"grad_norm": 0.9376358389854431,
"learning_rate": 0.00017524757949005597,
"loss": 0.8012,
"step": 2260
},
{
"epoch": 1.5368991198375084,
"grad_norm": 0.8372974991798401,
"learning_rate": 0.00017493540750155236,
"loss": 0.7429,
"step": 2270
},
{
"epoch": 1.5436696005416386,
"grad_norm": 0.8159657120704651,
"learning_rate": 0.00017462156108944996,
"loss": 0.7619,
"step": 2280
},
{
"epoch": 1.5504400812457684,
"grad_norm": 0.9110903143882751,
"learning_rate": 0.00017430604726661304,
"loss": 0.7792,
"step": 2290
},
{
"epoch": 1.5572105619498986,
"grad_norm": 1.0363059043884277,
"learning_rate": 0.00017398887308316393,
"loss": 0.7875,
"step": 2300
},
{
"epoch": 1.5639810426540284,
"grad_norm": 0.8779491186141968,
"learning_rate": 0.00017367004562632556,
"loss": 0.7395,
"step": 2310
},
{
"epoch": 1.5707515233581584,
"grad_norm": 0.7635359168052673,
"learning_rate": 0.00017334957202026305,
"loss": 0.734,
"step": 2320
},
{
"epoch": 1.5775220040622884,
"grad_norm": 0.7570300698280334,
"learning_rate": 0.0001730274594259246,
"loss": 0.732,
"step": 2330
},
{
"epoch": 1.5842924847664184,
"grad_norm": 0.8852811455726624,
"learning_rate": 0.0001727037150408813,
"loss": 0.7176,
"step": 2340
},
{
"epoch": 1.5910629654705484,
"grad_norm": 0.920385479927063,
"learning_rate": 0.00017237834609916668,
"loss": 0.7883,
"step": 2350
},
{
"epoch": 1.5978334461746784,
"grad_norm": 0.7175299525260925,
"learning_rate": 0.00017205135987111446,
"loss": 0.7511,
"step": 2360
},
{
"epoch": 1.6046039268788084,
"grad_norm": 0.9640962481498718,
"learning_rate": 0.0001717227636631968,
"loss": 0.7344,
"step": 2370
},
{
"epoch": 1.6113744075829384,
"grad_norm": 1.0787372589111328,
"learning_rate": 0.00017139256481786043,
"loss": 0.7388,
"step": 2380
},
{
"epoch": 1.6181448882870684,
"grad_norm": 0.8717492818832397,
"learning_rate": 0.00017106077071336298,
"loss": 0.8181,
"step": 2390
},
{
"epoch": 1.6249153689911984,
"grad_norm": 0.9693078398704529,
"learning_rate": 0.00017072738876360792,
"loss": 0.7784,
"step": 2400
},
{
"epoch": 1.6316858496953284,
"grad_norm": 0.9157988429069519,
"learning_rate": 0.00017039242641797895,
"loss": 0.7631,
"step": 2410
},
{
"epoch": 1.6384563303994584,
"grad_norm": 0.856497585773468,
"learning_rate": 0.0001700558911611736,
"loss": 0.7572,
"step": 2420
},
{
"epoch": 1.6452268111035884,
"grad_norm": 0.9910064339637756,
"learning_rate": 0.0001697177905130358,
"loss": 0.79,
"step": 2430
},
{
"epoch": 1.6519972918077184,
"grad_norm": 0.9009943008422852,
"learning_rate": 0.00016937813202838817,
"loss": 0.7389,
"step": 2440
},
{
"epoch": 1.6587677725118484,
"grad_norm": 0.8572137951850891,
"learning_rate": 0.00016903692329686286,
"loss": 0.8074,
"step": 2450
},
{
"epoch": 1.6655382532159784,
"grad_norm": 0.9608494639396667,
"learning_rate": 0.00016869417194273216,
"loss": 0.7493,
"step": 2460
},
{
"epoch": 1.6723087339201084,
"grad_norm": 1.1153324842453003,
"learning_rate": 0.00016834988562473813,
"loss": 0.7696,
"step": 2470
},
{
"epoch": 1.6790792146242384,
"grad_norm": 0.8839768171310425,
"learning_rate": 0.00016800407203592144,
"loss": 0.6736,
"step": 2480
},
{
"epoch": 1.6858496953283684,
"grad_norm": 0.8794620633125305,
"learning_rate": 0.00016765673890344944,
"loss": 0.7678,
"step": 2490
},
{
"epoch": 1.6926201760324981,
"grad_norm": 1.167880892753601,
"learning_rate": 0.0001673078939884435,
"loss": 0.799,
"step": 2500
},
{
"epoch": 1.6993906567366284,
"grad_norm": 0.8976329565048218,
"learning_rate": 0.00016695754508580556,
"loss": 0.7445,
"step": 2510
},
{
"epoch": 1.7061611374407581,
"grad_norm": 0.8003941178321838,
"learning_rate": 0.00016660570002404414,
"loss": 0.7434,
"step": 2520
},
{
"epoch": 1.7129316181448884,
"grad_norm": 1.5716880559921265,
"learning_rate": 0.0001662523666650992,
"loss": 0.7785,
"step": 2530
},
{
"epoch": 1.7197020988490181,
"grad_norm": 0.7486565113067627,
"learning_rate": 0.00016589755290416652,
"loss": 0.7415,
"step": 2540
},
{
"epoch": 1.7264725795531484,
"grad_norm": 0.872717559337616,
"learning_rate": 0.0001655412666695213,
"loss": 0.7568,
"step": 2550
},
{
"epoch": 1.7332430602572781,
"grad_norm": 1.06588876247406,
"learning_rate": 0.00016518351592234102,
"loss": 0.714,
"step": 2560
},
{
"epoch": 1.7400135409614084,
"grad_norm": 0.8603307008743286,
"learning_rate": 0.00016482430865652758,
"loss": 0.8015,
"step": 2570
},
{
"epoch": 1.7467840216655381,
"grad_norm": 0.9161677956581116,
"learning_rate": 0.0001644636528985286,
"loss": 0.7517,
"step": 2580
},
{
"epoch": 1.7535545023696684,
"grad_norm": 0.9165793657302856,
"learning_rate": 0.00016410155670715807,
"loss": 0.7219,
"step": 2590
},
{
"epoch": 1.7603249830737981,
"grad_norm": 0.9347404837608337,
"learning_rate": 0.00016373802817341631,
"loss": 0.7544,
"step": 2600
},
{
"epoch": 1.7670954637779284,
"grad_norm": 0.9771521687507629,
"learning_rate": 0.00016337307542030924,
"loss": 0.7613,
"step": 2610
},
{
"epoch": 1.7738659444820581,
"grad_norm": 0.8616775870323181,
"learning_rate": 0.00016300670660266678,
"loss": 0.7028,
"step": 2620
},
{
"epoch": 1.7806364251861884,
"grad_norm": 0.9634568095207214,
"learning_rate": 0.0001626389299069606,
"loss": 0.7776,
"step": 2630
},
{
"epoch": 1.7874069058903181,
"grad_norm": 0.8600468635559082,
"learning_rate": 0.00016226975355112134,
"loss": 0.7127,
"step": 2640
},
{
"epoch": 1.7941773865944484,
"grad_norm": 0.8130874037742615,
"learning_rate": 0.00016189918578435482,
"loss": 0.7618,
"step": 2650
},
{
"epoch": 1.8009478672985781,
"grad_norm": 0.8722664713859558,
"learning_rate": 0.00016152723488695783,
"loss": 0.7364,
"step": 2660
},
{
"epoch": 1.8077183480027081,
"grad_norm": 0.726963222026825,
"learning_rate": 0.00016115390917013307,
"loss": 0.7449,
"step": 2670
},
{
"epoch": 1.8144888287068381,
"grad_norm": 0.9895104765892029,
"learning_rate": 0.00016077921697580343,
"loss": 0.7766,
"step": 2680
},
{
"epoch": 1.8212593094109681,
"grad_norm": 0.9779828190803528,
"learning_rate": 0.00016040316667642558,
"loss": 0.7266,
"step": 2690
},
{
"epoch": 1.8280297901150981,
"grad_norm": 1.04193913936615,
"learning_rate": 0.00016002576667480288,
"loss": 0.7344,
"step": 2700
},
{
"epoch": 1.8348002708192281,
"grad_norm": 0.8899911046028137,
"learning_rate": 0.00015964702540389767,
"loss": 0.7546,
"step": 2710
},
{
"epoch": 1.8415707515233581,
"grad_norm": 0.9403987526893616,
"learning_rate": 0.0001592669513266428,
"loss": 0.7482,
"step": 2720
},
{
"epoch": 1.8483412322274881,
"grad_norm": 0.863129734992981,
"learning_rate": 0.00015888555293575254,
"loss": 0.7527,
"step": 2730
},
{
"epoch": 1.8551117129316181,
"grad_norm": 1.1445564031600952,
"learning_rate": 0.0001585028387535328,
"loss": 0.7672,
"step": 2740
},
{
"epoch": 1.8618821936357481,
"grad_norm": 0.8358940482139587,
"learning_rate": 0.0001581188173316907,
"loss": 0.7877,
"step": 2750
},
{
"epoch": 1.8686526743398781,
"grad_norm": 1.0207701921463013,
"learning_rate": 0.00015773349725114352,
"loss": 0.7711,
"step": 2760
},
{
"epoch": 1.8754231550440081,
"grad_norm": 0.9382310509681702,
"learning_rate": 0.00015734688712182687,
"loss": 0.7365,
"step": 2770
},
{
"epoch": 1.8821936357481381,
"grad_norm": 0.7211757898330688,
"learning_rate": 0.0001569589955825024,
"loss": 0.7144,
"step": 2780
},
{
"epoch": 1.8889641164522681,
"grad_norm": 1.0787826776504517,
"learning_rate": 0.00015656983130056472,
"loss": 0.7784,
"step": 2790
},
{
"epoch": 1.8957345971563981,
"grad_norm": 1.0936686992645264,
"learning_rate": 0.00015617940297184775,
"loss": 0.7455,
"step": 2800
},
{
"epoch": 1.9025050778605281,
"grad_norm": 1.0122491121292114,
"learning_rate": 0.00015578771932043037,
"loss": 0.7711,
"step": 2810
},
{
"epoch": 1.9092755585646581,
"grad_norm": 0.9829614162445068,
"learning_rate": 0.00015539478909844156,
"loss": 0.7485,
"step": 2820
},
{
"epoch": 1.9160460392687881,
"grad_norm": 0.9822033047676086,
"learning_rate": 0.00015500062108586473,
"loss": 0.7337,
"step": 2830
},
{
"epoch": 1.9228165199729181,
"grad_norm": 0.8550043702125549,
"learning_rate": 0.0001546052240903416,
"loss": 0.7547,
"step": 2840
},
{
"epoch": 1.929587000677048,
"grad_norm": 0.7504202723503113,
"learning_rate": 0.0001542086069469754,
"loss": 0.7329,
"step": 2850
},
{
"epoch": 1.9363574813811781,
"grad_norm": 0.7536128759384155,
"learning_rate": 0.00015381077851813342,
"loss": 0.6917,
"step": 2860
},
{
"epoch": 1.943127962085308,
"grad_norm": 1.024143934249878,
"learning_rate": 0.000153411747693249,
"loss": 0.7293,
"step": 2870
},
{
"epoch": 1.9498984427894381,
"grad_norm": 0.8882274031639099,
"learning_rate": 0.0001530115233886229,
"loss": 0.7067,
"step": 2880
},
{
"epoch": 1.956668923493568,
"grad_norm": 0.814894437789917,
"learning_rate": 0.00015261011454722402,
"loss": 0.6613,
"step": 2890
},
{
"epoch": 1.9634394041976981,
"grad_norm": 0.8720422387123108,
"learning_rate": 0.00015220753013848965,
"loss": 0.7931,
"step": 2900
},
{
"epoch": 1.970209884901828,
"grad_norm": 1.070326805114746,
"learning_rate": 0.00015180377915812498,
"loss": 0.6737,
"step": 2910
},
{
"epoch": 1.9769803656059581,
"grad_norm": 0.9129419922828674,
"learning_rate": 0.0001513988706279021,
"loss": 0.7693,
"step": 2920
},
{
"epoch": 1.983750846310088,
"grad_norm": 0.9133071303367615,
"learning_rate": 0.00015099281359545844,
"loss": 0.7222,
"step": 2930
},
{
"epoch": 1.9905213270142181,
"grad_norm": 1.1360323429107666,
"learning_rate": 0.00015058561713409465,
"loss": 0.7813,
"step": 2940
},
{
"epoch": 1.997291807718348,
"grad_norm": 1.1606559753417969,
"learning_rate": 0.0001501772903425717,
"loss": 0.7045,
"step": 2950
},
{
"epoch": 2.004062288422478,
"grad_norm": 0.8940277099609375,
"learning_rate": 0.0001497678423449077,
"loss": 0.6686,
"step": 2960
},
{
"epoch": 2.010832769126608,
"grad_norm": 0.9504866003990173,
"learning_rate": 0.00014935728229017404,
"loss": 0.5851,
"step": 2970
},
{
"epoch": 2.017603249830738,
"grad_norm": 0.9662072062492371,
"learning_rate": 0.00014894561935229083,
"loss": 0.5836,
"step": 2980
},
{
"epoch": 2.024373730534868,
"grad_norm": 1.1531829833984375,
"learning_rate": 0.00014853286272982206,
"loss": 0.5511,
"step": 2990
},
{
"epoch": 2.031144211238998,
"grad_norm": 1.0693235397338867,
"learning_rate": 0.00014811902164576986,
"loss": 0.5325,
"step": 3000
},
{
"epoch": 2.031144211238998,
"eval_loss": 0.8718012571334839,
"eval_runtime": 23.0432,
"eval_samples_per_second": 107.971,
"eval_steps_per_second": 13.496,
"step": 3000
},
{
"epoch": 2.037914691943128,
"grad_norm": 1.1329638957977295,
"learning_rate": 0.0001477041053473687,
"loss": 0.5722,
"step": 3010
},
{
"epoch": 2.044685172647258,
"grad_norm": 1.1756556034088135,
"learning_rate": 0.0001472881231058785,
"loss": 0.57,
"step": 3020
},
{
"epoch": 2.051455653351388,
"grad_norm": 1.1575700044631958,
"learning_rate": 0.00014687108421637758,
"loss": 0.5845,
"step": 3030
},
{
"epoch": 2.058226134055518,
"grad_norm": 1.0859098434448242,
"learning_rate": 0.0001464529979975549,
"loss": 0.533,
"step": 3040
},
{
"epoch": 2.064996614759648,
"grad_norm": 0.9851484298706055,
"learning_rate": 0.00014603387379150197,
"loss": 0.584,
"step": 3050
},
{
"epoch": 2.071767095463778,
"grad_norm": 1.1865367889404297,
"learning_rate": 0.00014561372096350402,
"loss": 0.5536,
"step": 3060
},
{
"epoch": 2.078537576167908,
"grad_norm": 1.114558219909668,
"learning_rate": 0.00014519254890183058,
"loss": 0.5627,
"step": 3070
},
{
"epoch": 2.085308056872038,
"grad_norm": 1.0637989044189453,
"learning_rate": 0.00014477036701752603,
"loss": 0.5625,
"step": 3080
},
{
"epoch": 2.092078537576168,
"grad_norm": 1.2044423818588257,
"learning_rate": 0.00014434718474419896,
"loss": 0.6045,
"step": 3090
},
{
"epoch": 2.0988490182802977,
"grad_norm": 1.0656991004943848,
"learning_rate": 0.00014392301153781168,
"loss": 0.5458,
"step": 3100
},
{
"epoch": 2.105619498984428,
"grad_norm": 1.431920051574707,
"learning_rate": 0.00014349785687646879,
"loss": 0.5798,
"step": 3110
},
{
"epoch": 2.1123899796885577,
"grad_norm": 1.4664020538330078,
"learning_rate": 0.00014307173026020524,
"loss": 0.5566,
"step": 3120
},
{
"epoch": 2.119160460392688,
"grad_norm": 0.9782803654670715,
"learning_rate": 0.00014264464121077435,
"loss": 0.5883,
"step": 3130
},
{
"epoch": 2.1259309410968177,
"grad_norm": 1.2193199396133423,
"learning_rate": 0.00014221659927143488,
"loss": 0.5912,
"step": 3140
},
{
"epoch": 2.132701421800948,
"grad_norm": 1.1089211702346802,
"learning_rate": 0.00014178761400673778,
"loss": 0.5421,
"step": 3150
},
{
"epoch": 2.1394719025050777,
"grad_norm": 1.6899245977401733,
"learning_rate": 0.00014135769500231259,
"loss": 0.5477,
"step": 3160
},
{
"epoch": 2.146242383209208,
"grad_norm": 1.1503666639328003,
"learning_rate": 0.00014092685186465297,
"loss": 0.5703,
"step": 3170
},
{
"epoch": 2.1530128639133377,
"grad_norm": 1.1421773433685303,
"learning_rate": 0.0001404950942209025,
"loss": 0.6063,
"step": 3180
},
{
"epoch": 2.159783344617468,
"grad_norm": 1.308514952659607,
"learning_rate": 0.00014006243171863907,
"loss": 0.6101,
"step": 3190
},
{
"epoch": 2.1665538253215977,
"grad_norm": 1.108906626701355,
"learning_rate": 0.00013962887402565967,
"loss": 0.6067,
"step": 3200
},
{
"epoch": 2.173324306025728,
"grad_norm": 1.3432538509368896,
"learning_rate": 0.00013919443082976415,
"loss": 0.5724,
"step": 3210
},
{
"epoch": 2.1800947867298577,
"grad_norm": 1.2304880619049072,
"learning_rate": 0.00013875911183853896,
"loss": 0.5764,
"step": 3220
},
{
"epoch": 2.186865267433988,
"grad_norm": 1.1720483303070068,
"learning_rate": 0.0001383229267791399,
"loss": 0.565,
"step": 3230
},
{
"epoch": 2.1936357481381177,
"grad_norm": 0.9357210397720337,
"learning_rate": 0.00013788588539807517,
"loss": 0.525,
"step": 3240
},
{
"epoch": 2.200406228842248,
"grad_norm": 1.2292680740356445,
"learning_rate": 0.0001374479974609872,
"loss": 0.6126,
"step": 3250
},
{
"epoch": 2.2071767095463777,
"grad_norm": 1.0784507989883423,
"learning_rate": 0.0001370092727524348,
"loss": 0.5863,
"step": 3260
},
{
"epoch": 2.213947190250508,
"grad_norm": 1.3088752031326294,
"learning_rate": 0.00013656972107567423,
"loss": 0.5568,
"step": 3270
},
{
"epoch": 2.2207176709546377,
"grad_norm": 1.1142232418060303,
"learning_rate": 0.0001361293522524403,
"loss": 0.5777,
"step": 3280
},
{
"epoch": 2.227488151658768,
"grad_norm": 1.1168012619018555,
"learning_rate": 0.0001356881761227269,
"loss": 0.549,
"step": 3290
},
{
"epoch": 2.2342586323628977,
"grad_norm": 1.1179856061935425,
"learning_rate": 0.00013524620254456705,
"loss": 0.5828,
"step": 3300
},
{
"epoch": 2.241029113067028,
"grad_norm": 1.1862361431121826,
"learning_rate": 0.00013480344139381266,
"loss": 0.5441,
"step": 3310
},
{
"epoch": 2.2477995937711577,
"grad_norm": 1.2580469846725464,
"learning_rate": 0.0001343599025639139,
"loss": 0.6452,
"step": 3320
},
{
"epoch": 2.254570074475288,
"grad_norm": 0.9721531271934509,
"learning_rate": 0.00013391559596569815,
"loss": 0.5803,
"step": 3330
},
{
"epoch": 2.2613405551794177,
"grad_norm": 1.099107265472412,
"learning_rate": 0.0001334705315271483,
"loss": 0.5768,
"step": 3340
},
{
"epoch": 2.268111035883548,
"grad_norm": 1.0356446504592896,
"learning_rate": 0.00013302471919318141,
"loss": 0.5759,
"step": 3350
},
{
"epoch": 2.2748815165876777,
"grad_norm": 1.2317684888839722,
"learning_rate": 0.00013257816892542582,
"loss": 0.5797,
"step": 3360
},
{
"epoch": 2.281651997291808,
"grad_norm": 1.2287174463272095,
"learning_rate": 0.0001321308907019992,
"loss": 0.5747,
"step": 3370
},
{
"epoch": 2.2884224779959377,
"grad_norm": 1.2517625093460083,
"learning_rate": 0.0001316828945172852,
"loss": 0.5114,
"step": 3380
},
{
"epoch": 2.295192958700068,
"grad_norm": 1.088796854019165,
"learning_rate": 0.00013123419038171024,
"loss": 0.5821,
"step": 3390
},
{
"epoch": 2.3019634394041977,
"grad_norm": 1.0487096309661865,
"learning_rate": 0.00013078478832151985,
"loss": 0.6054,
"step": 3400
},
{
"epoch": 2.3087339201083275,
"grad_norm": 1.1964969635009766,
"learning_rate": 0.00013033469837855457,
"loss": 0.5621,
"step": 3410
},
{
"epoch": 2.3155044008124577,
"grad_norm": 1.2567753791809082,
"learning_rate": 0.00012988393061002566,
"loss": 0.5858,
"step": 3420
},
{
"epoch": 2.322274881516588,
"grad_norm": 0.984793484210968,
"learning_rate": 0.0001294324950882903,
"loss": 0.5961,
"step": 3430
},
{
"epoch": 2.3290453622207177,
"grad_norm": 1.2915070056915283,
"learning_rate": 0.00012898040190062647,
"loss": 0.5667,
"step": 3440
},
{
"epoch": 2.3358158429248475,
"grad_norm": 1.242781400680542,
"learning_rate": 0.00012852766114900777,
"loss": 0.5781,
"step": 3450
},
{
"epoch": 2.3425863236289777,
"grad_norm": 1.1402225494384766,
"learning_rate": 0.00012807428294987744,
"loss": 0.6048,
"step": 3460
},
{
"epoch": 2.349356804333108,
"grad_norm": 1.2243235111236572,
"learning_rate": 0.0001276202774339224,
"loss": 0.5672,
"step": 3470
},
{
"epoch": 2.3561272850372377,
"grad_norm": 1.2512565851211548,
"learning_rate": 0.00012716565474584702,
"loss": 0.5992,
"step": 3480
},
{
"epoch": 2.3628977657413675,
"grad_norm": 1.3591067790985107,
"learning_rate": 0.00012671042504414619,
"loss": 0.5853,
"step": 3490
},
{
"epoch": 2.3696682464454977,
"grad_norm": 1.7091628313064575,
"learning_rate": 0.00012625459850087846,
"loss": 0.5501,
"step": 3500
},
{
"epoch": 2.3764387271496275,
"grad_norm": 1.2151107788085938,
"learning_rate": 0.00012579818530143884,
"loss": 0.5684,
"step": 3510
},
{
"epoch": 2.3832092078537577,
"grad_norm": 1.4708514213562012,
"learning_rate": 0.000125341195644331,
"loss": 0.578,
"step": 3520
},
{
"epoch": 2.3899796885578874,
"grad_norm": 1.2934261560440063,
"learning_rate": 0.0001248836397409396,
"loss": 0.6235,
"step": 3530
},
{
"epoch": 2.3967501692620177,
"grad_norm": 1.9203015565872192,
"learning_rate": 0.00012442552781530186,
"loss": 0.5868,
"step": 3540
},
{
"epoch": 2.4035206499661474,
"grad_norm": 1.2564107179641724,
"learning_rate": 0.00012396687010387942,
"loss": 0.6091,
"step": 3550
},
{
"epoch": 2.4102911306702777,
"grad_norm": 1.3231315612792969,
"learning_rate": 0.00012350767685532938,
"loss": 0.5492,
"step": 3560
},
{
"epoch": 2.4170616113744074,
"grad_norm": 1.392247200012207,
"learning_rate": 0.00012304795833027534,
"loss": 0.5809,
"step": 3570
},
{
"epoch": 2.4238320920785377,
"grad_norm": 1.1600557565689087,
"learning_rate": 0.00012258772480107816,
"loss": 0.5638,
"step": 3580
},
{
"epoch": 2.4306025727826674,
"grad_norm": 1.3254331350326538,
"learning_rate": 0.00012212698655160637,
"loss": 0.5644,
"step": 3590
},
{
"epoch": 2.4373730534867977,
"grad_norm": 1.2660179138183594,
"learning_rate": 0.00012166575387700651,
"loss": 0.5852,
"step": 3600
},
{
"epoch": 2.4441435341909274,
"grad_norm": 1.1489580869674683,
"learning_rate": 0.00012120403708347298,
"loss": 0.5753,
"step": 3610
},
{
"epoch": 2.4509140148950577,
"grad_norm": 1.1386017799377441,
"learning_rate": 0.00012074184648801769,
"loss": 0.5446,
"step": 3620
},
{
"epoch": 2.4576844955991874,
"grad_norm": 1.3722707033157349,
"learning_rate": 0.00012027919241823964,
"loss": 0.5771,
"step": 3630
},
{
"epoch": 2.4644549763033177,
"grad_norm": 1.1902090311050415,
"learning_rate": 0.00011981608521209413,
"loss": 0.5774,
"step": 3640
},
{
"epoch": 2.4712254570074474,
"grad_norm": 1.1676629781723022,
"learning_rate": 0.00011935253521766174,
"loss": 0.5718,
"step": 3650
},
{
"epoch": 2.4779959377115777,
"grad_norm": 1.1004976034164429,
"learning_rate": 0.00011888855279291713,
"loss": 0.6151,
"step": 3660
},
{
"epoch": 2.4847664184157074,
"grad_norm": 1.407827377319336,
"learning_rate": 0.00011842414830549748,
"loss": 0.6025,
"step": 3670
},
{
"epoch": 2.4915368991198377,
"grad_norm": 1.26259183883667,
"learning_rate": 0.00011795933213247101,
"loss": 0.6008,
"step": 3680
},
{
"epoch": 2.4983073798239674,
"grad_norm": 1.1961734294891357,
"learning_rate": 0.000117494114660105,
"loss": 0.5598,
"step": 3690
},
{
"epoch": 2.5050778605280977,
"grad_norm": 0.9188928604125977,
"learning_rate": 0.00011702850628363365,
"loss": 0.5636,
"step": 3700
},
{
"epoch": 2.5118483412322274,
"grad_norm": 0.9072563052177429,
"learning_rate": 0.00011656251740702596,
"loss": 0.5629,
"step": 3710
},
{
"epoch": 2.518618821936357,
"grad_norm": 1.0292631387710571,
"learning_rate": 0.00011609615844275305,
"loss": 0.6066,
"step": 3720
},
{
"epoch": 2.5253893026404874,
"grad_norm": 1.229181170463562,
"learning_rate": 0.00011562943981155575,
"loss": 0.5491,
"step": 3730
},
{
"epoch": 2.5321597833446177,
"grad_norm": 1.1053756475448608,
"learning_rate": 0.00011516237194221149,
"loss": 0.6065,
"step": 3740
},
{
"epoch": 2.5389302640487474,
"grad_norm": 1.4795639514923096,
"learning_rate": 0.0001146949652713015,
"loss": 0.5705,
"step": 3750
},
{
"epoch": 2.545700744752877,
"grad_norm": 1.1489176750183105,
"learning_rate": 0.00011422723024297737,
"loss": 0.5364,
"step": 3760
},
{
"epoch": 2.5524712254570074,
"grad_norm": 1.1073706150054932,
"learning_rate": 0.00011375917730872787,
"loss": 0.6014,
"step": 3770
},
{
"epoch": 2.5592417061611377,
"grad_norm": 1.5487061738967896,
"learning_rate": 0.00011329081692714534,
"loss": 0.5477,
"step": 3780
},
{
"epoch": 2.5660121868652674,
"grad_norm": 1.4128634929656982,
"learning_rate": 0.00011282215956369204,
"loss": 0.6538,
"step": 3790
},
{
"epoch": 2.572782667569397,
"grad_norm": 1.2158820629119873,
"learning_rate": 0.00011235321569046615,
"loss": 0.594,
"step": 3800
},
{
"epoch": 2.5795531482735274,
"grad_norm": 1.3014835119247437,
"learning_rate": 0.00011188399578596795,
"loss": 0.5936,
"step": 3810
},
{
"epoch": 2.5863236289776577,
"grad_norm": 1.3620414733886719,
"learning_rate": 0.00011141451033486564,
"loss": 0.5633,
"step": 3820
},
{
"epoch": 2.5930941096817874,
"grad_norm": 1.224446415901184,
"learning_rate": 0.00011094476982776096,
"loss": 0.553,
"step": 3830
},
{
"epoch": 2.599864590385917,
"grad_norm": 1.3176541328430176,
"learning_rate": 0.00011047478476095487,
"loss": 0.5591,
"step": 3840
},
{
"epoch": 2.6066350710900474,
"grad_norm": 1.1520602703094482,
"learning_rate": 0.00011000456563621304,
"loss": 0.5753,
"step": 3850
},
{
"epoch": 2.6134055517941777,
"grad_norm": 1.2285906076431274,
"learning_rate": 0.00010953412296053105,
"loss": 0.6055,
"step": 3860
},
{
"epoch": 2.6201760324983074,
"grad_norm": 1.544148564338684,
"learning_rate": 0.00010906346724589975,
"loss": 0.6062,
"step": 3870
},
{
"epoch": 2.626946513202437,
"grad_norm": 1.2714669704437256,
"learning_rate": 0.00010859260900907038,
"loss": 0.5867,
"step": 3880
},
{
"epoch": 2.6337169939065674,
"grad_norm": 1.4937471151351929,
"learning_rate": 0.00010812155877131945,
"loss": 0.5953,
"step": 3890
},
{
"epoch": 2.640487474610697,
"grad_norm": 1.551594614982605,
"learning_rate": 0.00010765032705821363,
"loss": 0.5537,
"step": 3900
},
{
"epoch": 2.6472579553148274,
"grad_norm": 1.565324068069458,
"learning_rate": 0.0001071789243993748,
"loss": 0.572,
"step": 3910
},
{
"epoch": 2.654028436018957,
"grad_norm": 1.207514762878418,
"learning_rate": 0.00010670736132824455,
"loss": 0.5921,
"step": 3920
},
{
"epoch": 2.6607989167230874,
"grad_norm": 1.1995245218276978,
"learning_rate": 0.00010623564838184878,
"loss": 0.5635,
"step": 3930
},
{
"epoch": 2.667569397427217,
"grad_norm": 1.1889262199401855,
"learning_rate": 0.00010576379610056249,
"loss": 0.5886,
"step": 3940
},
{
"epoch": 2.6743398781313474,
"grad_norm": 1.0783162117004395,
"learning_rate": 0.0001052918150278739,
"loss": 0.5831,
"step": 3950
},
{
"epoch": 2.681110358835477,
"grad_norm": 1.4271385669708252,
"learning_rate": 0.0001048197157101493,
"loss": 0.5335,
"step": 3960
},
{
"epoch": 2.6878808395396074,
"grad_norm": 1.167817234992981,
"learning_rate": 0.00010434750869639693,
"loss": 0.5331,
"step": 3970
},
{
"epoch": 2.694651320243737,
"grad_norm": 1.3966023921966553,
"learning_rate": 0.00010387520453803166,
"loss": 0.5931,
"step": 3980
},
{
"epoch": 2.7014218009478674,
"grad_norm": 1.328182578086853,
"learning_rate": 0.00010340281378863892,
"loss": 0.5472,
"step": 3990
},
{
"epoch": 2.708192281651997,
"grad_norm": 1.3755980730056763,
"learning_rate": 0.00010293034700373905,
"loss": 0.5875,
"step": 4000
},
{
"epoch": 2.708192281651997,
"eval_loss": 0.8555851578712463,
"eval_runtime": 22.9559,
"eval_samples_per_second": 108.382,
"eval_steps_per_second": 13.548,
"step": 4000
},
{
"epoch": 2.7149627623561274,
"grad_norm": 1.2442570924758911,
"learning_rate": 0.0001024578147405514,
"loss": 0.6028,
"step": 4010
},
{
"epoch": 2.721733243060257,
"grad_norm": 1.2046414613723755,
"learning_rate": 0.0001019852275577585,
"loss": 0.5959,
"step": 4020
},
{
"epoch": 2.7285037237643874,
"grad_norm": 1.1981314420700073,
"learning_rate": 0.00010151259601526992,
"loss": 0.6042,
"step": 4030
},
{
"epoch": 2.735274204468517,
"grad_norm": 1.3695381879806519,
"learning_rate": 0.00010103993067398649,
"loss": 0.5943,
"step": 4040
},
{
"epoch": 2.7420446851726474,
"grad_norm": 1.1446524858474731,
"learning_rate": 0.00010056724209556431,
"loss": 0.5853,
"step": 4050
},
{
"epoch": 2.748815165876777,
"grad_norm": 1.2874009609222412,
"learning_rate": 0.00010009454084217873,
"loss": 0.5967,
"step": 4060
},
{
"epoch": 2.755585646580907,
"grad_norm": 1.3916451930999756,
"learning_rate": 9.962183747628819e-05,
"loss": 0.5528,
"step": 4070
},
{
"epoch": 2.762356127285037,
"grad_norm": 1.141298532485962,
"learning_rate": 9.914914256039847e-05,
"loss": 0.5641,
"step": 4080
},
{
"epoch": 2.7691266079891674,
"grad_norm": 1.2546755075454712,
"learning_rate": 9.867646665682646e-05,
"loss": 0.5638,
"step": 4090
},
{
"epoch": 2.775897088693297,
"grad_norm": 1.2840214967727661,
"learning_rate": 9.820382032746426e-05,
"loss": 0.5835,
"step": 4100
},
{
"epoch": 2.782667569397427,
"grad_norm": 1.1560393571853638,
"learning_rate": 9.773121413354311e-05,
"loss": 0.5809,
"step": 4110
},
{
"epoch": 2.789438050101557,
"grad_norm": 1.3474149703979492,
"learning_rate": 9.725865863539747e-05,
"loss": 0.5768,
"step": 4120
},
{
"epoch": 2.7962085308056874,
"grad_norm": 1.1416068077087402,
"learning_rate": 9.678616439222899e-05,
"loss": 0.5758,
"step": 4130
},
{
"epoch": 2.802979011509817,
"grad_norm": 1.192691445350647,
"learning_rate": 9.631374196187051e-05,
"loss": 0.547,
"step": 4140
},
{
"epoch": 2.809749492213947,
"grad_norm": 1.2631511688232422,
"learning_rate": 9.584140190055035e-05,
"loss": 0.5315,
"step": 4150
},
{
"epoch": 2.816519972918077,
"grad_norm": 1.3457276821136475,
"learning_rate": 9.536915476265621e-05,
"loss": 0.5824,
"step": 4160
},
{
"epoch": 2.8232904536222074,
"grad_norm": 1.5314511060714722,
"learning_rate": 9.489701110049944e-05,
"loss": 0.6094,
"step": 4170
},
{
"epoch": 2.830060934326337,
"grad_norm": 1.3376086950302124,
"learning_rate": 9.442498146407927e-05,
"loss": 0.5914,
"step": 4180
},
{
"epoch": 2.836831415030467,
"grad_norm": 1.5918281078338623,
"learning_rate": 9.3953076400847e-05,
"loss": 0.5814,
"step": 4190
},
{
"epoch": 2.843601895734597,
"grad_norm": 1.387515902519226,
"learning_rate": 9.348130645547042e-05,
"loss": 0.5663,
"step": 4200
},
{
"epoch": 2.850372376438727,
"grad_norm": 1.612802267074585,
"learning_rate": 9.300968216959805e-05,
"loss": 0.5807,
"step": 4210
},
{
"epoch": 2.857142857142857,
"grad_norm": 1.34074068069458,
"learning_rate": 9.253821408162366e-05,
"loss": 0.5868,
"step": 4220
},
{
"epoch": 2.863913337846987,
"grad_norm": 1.436584234237671,
"learning_rate": 9.206691272645087e-05,
"loss": 0.5613,
"step": 4230
},
{
"epoch": 2.870683818551117,
"grad_norm": 1.3354675769805908,
"learning_rate": 9.159578863525762e-05,
"loss": 0.6245,
"step": 4240
},
{
"epoch": 2.877454299255247,
"grad_norm": 1.1248669624328613,
"learning_rate": 9.11248523352609e-05,
"loss": 0.547,
"step": 4250
},
{
"epoch": 2.884224779959377,
"grad_norm": 1.1722201108932495,
"learning_rate": 9.065411434948152e-05,
"loss": 0.5432,
"step": 4260
},
{
"epoch": 2.890995260663507,
"grad_norm": 1.2124953269958496,
"learning_rate": 9.018358519650909e-05,
"loss": 0.534,
"step": 4270
},
{
"epoch": 2.897765741367637,
"grad_norm": 1.258863091468811,
"learning_rate": 8.97132753902667e-05,
"loss": 0.5651,
"step": 4280
},
{
"epoch": 2.904536222071767,
"grad_norm": 1.2424662113189697,
"learning_rate": 8.924319543977631e-05,
"loss": 0.5611,
"step": 4290
},
{
"epoch": 2.911306702775897,
"grad_norm": 1.2281653881072998,
"learning_rate": 8.877335584892369e-05,
"loss": 0.5584,
"step": 4300
},
{
"epoch": 2.918077183480027,
"grad_norm": 1.1419377326965332,
"learning_rate": 8.830376711622379e-05,
"loss": 0.5939,
"step": 4310
},
{
"epoch": 2.924847664184157,
"grad_norm": 1.0923197269439697,
"learning_rate": 8.783443973458625e-05,
"loss": 0.5912,
"step": 4320
},
{
"epoch": 2.931618144888287,
"grad_norm": 1.0926480293273926,
"learning_rate": 8.736538419108074e-05,
"loss": 0.6095,
"step": 4330
},
{
"epoch": 2.938388625592417,
"grad_norm": 1.4442996978759766,
"learning_rate": 8.689661096670285e-05,
"loss": 0.5618,
"step": 4340
},
{
"epoch": 2.945159106296547,
"grad_norm": 1.2105728387832642,
"learning_rate": 8.64281305361397e-05,
"loss": 0.5388,
"step": 4350
},
{
"epoch": 2.951929587000677,
"grad_norm": 1.2048066854476929,
"learning_rate": 8.595995336753597e-05,
"loss": 0.5891,
"step": 4360
},
{
"epoch": 2.958700067704807,
"grad_norm": 1.407758355140686,
"learning_rate": 8.549208992226001e-05,
"loss": 0.5351,
"step": 4370
},
{
"epoch": 2.9654705484089368,
"grad_norm": 1.075348973274231,
"learning_rate": 8.502455065467006e-05,
"loss": 0.5939,
"step": 4380
},
{
"epoch": 2.972241029113067,
"grad_norm": 1.2892156839370728,
"learning_rate": 8.45573460118806e-05,
"loss": 0.5488,
"step": 4390
},
{
"epoch": 2.979011509817197,
"grad_norm": 1.1205973625183105,
"learning_rate": 8.4090486433529e-05,
"loss": 0.6054,
"step": 4400
},
{
"epoch": 2.985781990521327,
"grad_norm": 1.4507098197937012,
"learning_rate": 8.362398235154213e-05,
"loss": 0.5542,
"step": 4410
},
{
"epoch": 2.9925524712254568,
"grad_norm": 1.2207527160644531,
"learning_rate": 8.31578441899035e-05,
"loss": 0.5326,
"step": 4420
},
{
"epoch": 2.999322951929587,
"grad_norm": 1.032354712486267,
"learning_rate": 8.269208236442003e-05,
"loss": 0.5924,
"step": 4430
},
{
"epoch": 3.006093432633717,
"grad_norm": 1.38179349899292,
"learning_rate": 8.222670728248941e-05,
"loss": 0.4272,
"step": 4440
},
{
"epoch": 3.012863913337847,
"grad_norm": 1.3886513710021973,
"learning_rate": 8.17617293428677e-05,
"loss": 0.4442,
"step": 4450
},
{
"epoch": 3.019634394041977,
"grad_norm": 1.5716043710708618,
"learning_rate": 8.129715893543681e-05,
"loss": 0.3873,
"step": 4460
},
{
"epoch": 3.026404874746107,
"grad_norm": 1.4398396015167236,
"learning_rate": 8.08330064409724e-05,
"loss": 0.3991,
"step": 4470
},
{
"epoch": 3.0331753554502368,
"grad_norm": 1.4795118570327759,
"learning_rate": 8.036928223091187e-05,
"loss": 0.4557,
"step": 4480
},
{
"epoch": 3.039945836154367,
"grad_norm": 1.5591235160827637,
"learning_rate": 7.990599666712268e-05,
"loss": 0.4077,
"step": 4490
},
{
"epoch": 3.0467163168584968,
"grad_norm": 1.3513033390045166,
"learning_rate": 7.94431601016708e-05,
"loss": 0.3999,
"step": 4500
},
{
"epoch": 3.053486797562627,
"grad_norm": 1.4254108667373657,
"learning_rate": 7.898078287658941e-05,
"loss": 0.3614,
"step": 4510
},
{
"epoch": 3.0602572782667568,
"grad_norm": 1.2728102207183838,
"learning_rate": 7.85188753236477e-05,
"loss": 0.4038,
"step": 4520
},
{
"epoch": 3.067027758970887,
"grad_norm": 1.6714439392089844,
"learning_rate": 7.805744776412012e-05,
"loss": 0.4229,
"step": 4530
},
{
"epoch": 3.0737982396750168,
"grad_norm": 1.4847053289413452,
"learning_rate": 7.759651050855568e-05,
"loss": 0.3806,
"step": 4540
},
{
"epoch": 3.080568720379147,
"grad_norm": 1.7574979066848755,
"learning_rate": 7.713607385654772e-05,
"loss": 0.3625,
"step": 4550
},
{
"epoch": 3.0873392010832768,
"grad_norm": 1.495059609413147,
"learning_rate": 7.667614809650351e-05,
"loss": 0.3889,
"step": 4560
},
{
"epoch": 3.094109681787407,
"grad_norm": 1.2997581958770752,
"learning_rate": 7.621674350541461e-05,
"loss": 0.3775,
"step": 4570
},
{
"epoch": 3.1008801624915368,
"grad_norm": 1.5862250328063965,
"learning_rate": 7.575787034862704e-05,
"loss": 0.4023,
"step": 4580
},
{
"epoch": 3.107650643195667,
"grad_norm": 1.5325440168380737,
"learning_rate": 7.529953887961197e-05,
"loss": 0.3641,
"step": 4590
},
{
"epoch": 3.1144211238997968,
"grad_norm": 1.4811371564865112,
"learning_rate": 7.484175933973668e-05,
"loss": 0.3818,
"step": 4600
},
{
"epoch": 3.121191604603927,
"grad_norm": 1.7169820070266724,
"learning_rate": 7.438454195803559e-05,
"loss": 0.4187,
"step": 4610
},
{
"epoch": 3.1279620853080567,
"grad_norm": 1.6318345069885254,
"learning_rate": 7.392789695098182e-05,
"loss": 0.3718,
"step": 4620
},
{
"epoch": 3.134732566012187,
"grad_norm": 1.633092999458313,
"learning_rate": 7.347183452225874e-05,
"loss": 0.3969,
"step": 4630
},
{
"epoch": 3.1415030467163167,
"grad_norm": 1.8210922479629517,
"learning_rate": 7.301636486253215e-05,
"loss": 0.4193,
"step": 4640
},
{
"epoch": 3.148273527420447,
"grad_norm": 2.1533546447753906,
"learning_rate": 7.256149814922253e-05,
"loss": 0.3923,
"step": 4650
},
{
"epoch": 3.1550440081245767,
"grad_norm": 1.4838796854019165,
"learning_rate": 7.210724454627751e-05,
"loss": 0.3871,
"step": 4660
},
{
"epoch": 3.161814488828707,
"grad_norm": 1.755631685256958,
"learning_rate": 7.165361420394482e-05,
"loss": 0.4219,
"step": 4670
},
{
"epoch": 3.1685849695328367,
"grad_norm": 1.197309136390686,
"learning_rate": 7.120061725854554e-05,
"loss": 0.4219,
"step": 4680
},
{
"epoch": 3.175355450236967,
"grad_norm": 1.7161248922348022,
"learning_rate": 7.074826383224761e-05,
"loss": 0.4002,
"step": 4690
},
{
"epoch": 3.1821259309410967,
"grad_norm": 1.4585338830947876,
"learning_rate": 7.029656403283951e-05,
"loss": 0.3984,
"step": 4700
},
{
"epoch": 3.188896411645227,
"grad_norm": 1.5048658847808838,
"learning_rate": 6.984552795350453e-05,
"loss": 0.4005,
"step": 4710
},
{
"epoch": 3.1956668923493567,
"grad_norm": 1.7454990148544312,
"learning_rate": 6.939516567259523e-05,
"loss": 0.3999,
"step": 4720
},
{
"epoch": 3.202437373053487,
"grad_norm": 1.4264365434646606,
"learning_rate": 6.894548725340822e-05,
"loss": 0.3844,
"step": 4730
},
{
"epoch": 3.2092078537576167,
"grad_norm": 1.3761653900146484,
"learning_rate": 6.849650274395929e-05,
"loss": 0.4107,
"step": 4740
},
{
"epoch": 3.215978334461747,
"grad_norm": 1.6094237565994263,
"learning_rate": 6.804822217675885e-05,
"loss": 0.3865,
"step": 4750
},
{
"epoch": 3.2227488151658767,
"grad_norm": 1.969099998474121,
"learning_rate": 6.760065556858786e-05,
"loss": 0.3635,
"step": 4760
},
{
"epoch": 3.229519295870007,
"grad_norm": 1.5209436416625977,
"learning_rate": 6.715381292027385e-05,
"loss": 0.3754,
"step": 4770
},
{
"epoch": 3.2362897765741367,
"grad_norm": 1.6469786167144775,
"learning_rate": 6.670770421646767e-05,
"loss": 0.4034,
"step": 4780
},
{
"epoch": 3.243060257278267,
"grad_norm": 1.6617894172668457,
"learning_rate": 6.626233942542013e-05,
"loss": 0.3946,
"step": 4790
},
{
"epoch": 3.2498307379823967,
"grad_norm": 1.4001210927963257,
"learning_rate": 6.581772849875951e-05,
"loss": 0.3638,
"step": 4800
},
{
"epoch": 3.2566012186865265,
"grad_norm": 1.7633929252624512,
"learning_rate": 6.537388137126899e-05,
"loss": 0.3607,
"step": 4810
},
{
"epoch": 3.2633716993906567,
"grad_norm": 1.6892105340957642,
"learning_rate": 6.493080796066477e-05,
"loss": 0.3797,
"step": 4820
},
{
"epoch": 3.270142180094787,
"grad_norm": 1.4346562623977661,
"learning_rate": 6.448851816737443e-05,
"loss": 0.3552,
"step": 4830
},
{
"epoch": 3.2769126607989167,
"grad_norm": 1.5974228382110596,
"learning_rate": 6.404702187431568e-05,
"loss": 0.3905,
"step": 4840
},
{
"epoch": 3.2836831415030465,
"grad_norm": 1.4062926769256592,
"learning_rate": 6.360632894667555e-05,
"loss": 0.3864,
"step": 4850
},
{
"epoch": 3.2904536222071767,
"grad_norm": 1.6129074096679688,
"learning_rate": 6.316644923169007e-05,
"loss": 0.3921,
"step": 4860
},
{
"epoch": 3.2972241029113065,
"grad_norm": 1.5494030714035034,
"learning_rate": 6.27273925584239e-05,
"loss": 0.4138,
"step": 4870
},
{
"epoch": 3.3039945836154367,
"grad_norm": 1.5944302082061768,
"learning_rate": 6.228916873755118e-05,
"loss": 0.3709,
"step": 4880
},
{
"epoch": 3.3107650643195665,
"grad_norm": 1.4350250959396362,
"learning_rate": 6.185178756113586e-05,
"loss": 0.3622,
"step": 4890
},
{
"epoch": 3.3175355450236967,
"grad_norm": 1.5585368871688843,
"learning_rate": 6.141525880241313e-05,
"loss": 0.3969,
"step": 4900
},
{
"epoch": 3.3243060257278265,
"grad_norm": 1.289538860321045,
"learning_rate": 6.097959221557108e-05,
"loss": 0.394,
"step": 4910
},
{
"epoch": 3.3310765064319567,
"grad_norm": 1.7543057203292847,
"learning_rate": 6.054479753553259e-05,
"loss": 0.396,
"step": 4920
},
{
"epoch": 3.3378469871360865,
"grad_norm": 1.633093237876892,
"learning_rate": 6.0110884477737875e-05,
"loss": 0.415,
"step": 4930
},
{
"epoch": 3.3446174678402167,
"grad_norm": 1.537914514541626,
"learning_rate": 5.9677862737927415e-05,
"loss": 0.399,
"step": 4940
},
{
"epoch": 3.3513879485443465,
"grad_norm": 1.6341283321380615,
"learning_rate": 5.924574199192527e-05,
"loss": 0.3825,
"step": 4950
},
{
"epoch": 3.3581584292484767,
"grad_norm": 1.4960927963256836,
"learning_rate": 5.881453189542295e-05,
"loss": 0.3793,
"step": 4960
},
{
"epoch": 3.3649289099526065,
"grad_norm": 1.6509079933166504,
"learning_rate": 5.838424208376354e-05,
"loss": 0.3939,
"step": 4970
},
{
"epoch": 3.3716993906567367,
"grad_norm": 1.662853479385376,
"learning_rate": 5.7954882171726444e-05,
"loss": 0.4141,
"step": 4980
},
{
"epoch": 3.3784698713608665,
"grad_norm": 1.639427661895752,
"learning_rate": 5.752646175331267e-05,
"loss": 0.4112,
"step": 4990
},
{
"epoch": 3.3852403520649967,
"grad_norm": 1.4693089723587036,
"learning_rate": 5.709899040153013e-05,
"loss": 0.372,
"step": 5000
},
{
"epoch": 3.3852403520649967,
"eval_loss": 0.9812659621238708,
"eval_runtime": 23.1744,
"eval_samples_per_second": 107.36,
"eval_steps_per_second": 13.42,
"step": 5000
},
{
"epoch": 3.3920108327691265,
"grad_norm": 1.4617177248001099,
"learning_rate": 5.667247766818018e-05,
"loss": 0.385,
"step": 5010
},
{
"epoch": 3.3987813134732567,
"grad_norm": 1.2667337656021118,
"learning_rate": 5.6246933083643794e-05,
"loss": 0.3759,
"step": 5020
},
{
"epoch": 3.4055517941773865,
"grad_norm": 1.9020839929580688,
"learning_rate": 5.582236615666885e-05,
"loss": 0.3991,
"step": 5030
},
{
"epoch": 3.4123222748815167,
"grad_norm": 1.4279497861862183,
"learning_rate": 5.5398786374157564e-05,
"loss": 0.3938,
"step": 5040
},
{
"epoch": 3.4190927555856465,
"grad_norm": 1.5497093200683594,
"learning_rate": 5.4976203200954425e-05,
"loss": 0.4,
"step": 5050
},
{
"epoch": 3.4258632362897767,
"grad_norm": 1.3598889112472534,
"learning_rate": 5.4554626079634906e-05,
"loss": 0.4117,
"step": 5060
},
{
"epoch": 3.4326337169939065,
"grad_norm": 1.498186707496643,
"learning_rate": 5.413406443029433e-05,
"loss": 0.409,
"step": 5070
},
{
"epoch": 3.4394041976980367,
"grad_norm": 1.9175001382827759,
"learning_rate": 5.371452765033733e-05,
"loss": 0.405,
"step": 5080
},
{
"epoch": 3.4461746784021665,
"grad_norm": 1.9584026336669922,
"learning_rate": 5.32960251142681e-05,
"loss": 0.3635,
"step": 5090
},
{
"epoch": 3.4529451591062967,
"grad_norm": 1.582276463508606,
"learning_rate": 5.287856617348054e-05,
"loss": 0.4101,
"step": 5100
},
{
"epoch": 3.4597156398104265,
"grad_norm": 1.6922118663787842,
"learning_rate": 5.2462160156049765e-05,
"loss": 0.3894,
"step": 5110
},
{
"epoch": 3.4664861205145563,
"grad_norm": 1.7980077266693115,
"learning_rate": 5.2046816366523355e-05,
"loss": 0.3909,
"step": 5120
},
{
"epoch": 3.4732566012186865,
"grad_norm": 1.5998905897140503,
"learning_rate": 5.1632544085713376e-05,
"loss": 0.367,
"step": 5130
},
{
"epoch": 3.4800270819228167,
"grad_norm": 1.5311387777328491,
"learning_rate": 5.121935257048936e-05,
"loss": 0.4053,
"step": 5140
},
{
"epoch": 3.4867975626269465,
"grad_norm": 1.7611960172653198,
"learning_rate": 5.080725105357109e-05,
"loss": 0.3938,
"step": 5150
},
{
"epoch": 3.4935680433310763,
"grad_norm": 2.3462700843811035,
"learning_rate": 5.0396248743322526e-05,
"loss": 0.3949,
"step": 5160
},
{
"epoch": 3.5003385240352065,
"grad_norm": 1.386608362197876,
"learning_rate": 4.998635482354598e-05,
"loss": 0.3593,
"step": 5170
},
{
"epoch": 3.5071090047393367,
"grad_norm": 2.024418592453003,
"learning_rate": 4.9577578453276886e-05,
"loss": 0.3835,
"step": 5180
},
{
"epoch": 3.5138794854434665,
"grad_norm": 1.9304969310760498,
"learning_rate": 4.9169928766579164e-05,
"loss": 0.4439,
"step": 5190
},
{
"epoch": 3.5206499661475963,
"grad_norm": 1.6261743307113647,
"learning_rate": 4.876341487234105e-05,
"loss": 0.4055,
"step": 5200
},
{
"epoch": 3.5274204468517265,
"grad_norm": 1.770004153251648,
"learning_rate": 4.83580458540717e-05,
"loss": 0.401,
"step": 5210
},
{
"epoch": 3.5341909275558567,
"grad_norm": 2.584394931793213,
"learning_rate": 4.7953830769698125e-05,
"loss": 0.3809,
"step": 5220
},
{
"epoch": 3.5409614082599865,
"grad_norm": 1.66965651512146,
"learning_rate": 4.755077865136274e-05,
"loss": 0.4251,
"step": 5230
},
{
"epoch": 3.5477318889641163,
"grad_norm": 1.5093834400177002,
"learning_rate": 4.7148898505221685e-05,
"loss": 0.3812,
"step": 5240
},
{
"epoch": 3.5545023696682465,
"grad_norm": 1.7326291799545288,
"learning_rate": 4.674819931124348e-05,
"loss": 0.3606,
"step": 5250
},
{
"epoch": 3.5612728503723763,
"grad_norm": 2.2934281826019287,
"learning_rate": 4.63486900230084e-05,
"loss": 0.4269,
"step": 5260
},
{
"epoch": 3.5680433310765065,
"grad_norm": 1.787213683128357,
"learning_rate": 4.595037956750845e-05,
"loss": 0.4109,
"step": 5270
},
{
"epoch": 3.5748138117806363,
"grad_norm": 1.5188498497009277,
"learning_rate": 4.5553276844947726e-05,
"loss": 0.4027,
"step": 5280
},
{
"epoch": 3.5815842924847665,
"grad_norm": 1.5621033906936646,
"learning_rate": 4.515739072854376e-05,
"loss": 0.4377,
"step": 5290
},
{
"epoch": 3.5883547731888963,
"grad_norm": 1.4404442310333252,
"learning_rate": 4.4762730064329164e-05,
"loss": 0.4058,
"step": 5300
},
{
"epoch": 3.5951252538930265,
"grad_norm": 1.506831407546997,
"learning_rate": 4.436930367095384e-05,
"loss": 0.3852,
"step": 5310
},
{
"epoch": 3.6018957345971563,
"grad_norm": 2.1018640995025635,
"learning_rate": 4.3977120339488174e-05,
"loss": 0.4128,
"step": 5320
},
{
"epoch": 3.6086662153012865,
"grad_norm": 1.4768526554107666,
"learning_rate": 4.358618883322639e-05,
"loss": 0.3848,
"step": 5330
},
{
"epoch": 3.6154366960054163,
"grad_norm": 1.3917316198349,
"learning_rate": 4.319651788749084e-05,
"loss": 0.4186,
"step": 5340
},
{
"epoch": 3.6222071767095465,
"grad_norm": 1.9646469354629517,
"learning_rate": 4.280811620943682e-05,
"loss": 0.4213,
"step": 5350
},
{
"epoch": 3.6289776574136763,
"grad_norm": 2.266582727432251,
"learning_rate": 4.2420992477857856e-05,
"loss": 0.4063,
"step": 5360
},
{
"epoch": 3.6357481381178065,
"grad_norm": 1.8989133834838867,
"learning_rate": 4.203515534299205e-05,
"loss": 0.3786,
"step": 5370
},
{
"epoch": 3.6425186188219363,
"grad_norm": 2.106405258178711,
"learning_rate": 4.16506134263285e-05,
"loss": 0.406,
"step": 5380
},
{
"epoch": 3.6492890995260665,
"grad_norm": 2.1753334999084473,
"learning_rate": 4.12673753204149e-05,
"loss": 0.3845,
"step": 5390
},
{
"epoch": 3.6560595802301963,
"grad_norm": 1.5723298788070679,
"learning_rate": 4.0885449588665395e-05,
"loss": 0.411,
"step": 5400
},
{
"epoch": 3.6628300609343265,
"grad_norm": 2.0291285514831543,
"learning_rate": 4.050484476516926e-05,
"loss": 0.3926,
"step": 5410
},
{
"epoch": 3.6696005416384563,
"grad_norm": 1.5461398363113403,
"learning_rate": 4.012556935450027e-05,
"loss": 0.4232,
"step": 5420
},
{
"epoch": 3.676371022342586,
"grad_norm": 1.6446950435638428,
"learning_rate": 3.97476318315265e-05,
"loss": 0.3882,
"step": 5430
},
{
"epoch": 3.6831415030467163,
"grad_norm": 1.363389015197754,
"learning_rate": 3.937104064122117e-05,
"loss": 0.3714,
"step": 5440
},
{
"epoch": 3.6899119837508465,
"grad_norm": 1.4707744121551514,
"learning_rate": 3.899580419847385e-05,
"loss": 0.3633,
"step": 5450
},
{
"epoch": 3.6966824644549763,
"grad_norm": 2.183893918991089,
"learning_rate": 3.862193088790231e-05,
"loss": 0.3918,
"step": 5460
},
{
"epoch": 3.703452945159106,
"grad_norm": 1.798282504081726,
"learning_rate": 3.82494290636654e-05,
"loss": 0.4081,
"step": 5470
},
{
"epoch": 3.7102234258632363,
"grad_norm": 1.563833475112915,
"learning_rate": 3.7878307049276195e-05,
"loss": 0.3772,
"step": 5480
},
{
"epoch": 3.7169939065673665,
"grad_norm": 1.5234781503677368,
"learning_rate": 3.7508573137416095e-05,
"loss": 0.3923,
"step": 5490
},
{
"epoch": 3.7237643872714963,
"grad_norm": 1.5436840057373047,
"learning_rate": 3.71402355897495e-05,
"loss": 0.4204,
"step": 5500
},
{
"epoch": 3.730534867975626,
"grad_norm": 1.640419363975525,
"learning_rate": 3.6773302636739116e-05,
"loss": 0.391,
"step": 5510
},
{
"epoch": 3.7373053486797563,
"grad_norm": 1.8847980499267578,
"learning_rate": 3.640778247746226e-05,
"loss": 0.3843,
"step": 5520
},
{
"epoch": 3.7440758293838865,
"grad_norm": 1.2375092506408691,
"learning_rate": 3.6043683279427484e-05,
"loss": 0.3623,
"step": 5530
},
{
"epoch": 3.7508463100880163,
"grad_norm": 1.3256595134735107,
"learning_rate": 3.568101317839205e-05,
"loss": 0.3923,
"step": 5540
},
{
"epoch": 3.757616790792146,
"grad_norm": 1.5230741500854492,
"learning_rate": 3.531978027818027e-05,
"loss": 0.3918,
"step": 5550
},
{
"epoch": 3.7643872714962763,
"grad_norm": 1.619551181793213,
"learning_rate": 3.4959992650502346e-05,
"loss": 0.4316,
"step": 5560
},
{
"epoch": 3.7711577522004065,
"grad_norm": 2.241872787475586,
"learning_rate": 3.4601658334774014e-05,
"loss": 0.4183,
"step": 5570
},
{
"epoch": 3.7779282329045363,
"grad_norm": 1.427147626876831,
"learning_rate": 3.424478533793695e-05,
"loss": 0.4036,
"step": 5580
},
{
"epoch": 3.784698713608666,
"grad_norm": 1.646103024482727,
"learning_rate": 3.388938163427969e-05,
"loss": 0.3846,
"step": 5590
},
{
"epoch": 3.7914691943127963,
"grad_norm": 1.4623626470565796,
"learning_rate": 3.3535455165259734e-05,
"loss": 0.4339,
"step": 5600
},
{
"epoch": 3.798239675016926,
"grad_norm": 1.5822981595993042,
"learning_rate": 3.318301383932586e-05,
"loss": 0.4013,
"step": 5610
},
{
"epoch": 3.8050101557210563,
"grad_norm": 1.6035799980163574,
"learning_rate": 3.283206553174144e-05,
"loss": 0.3765,
"step": 5620
},
{
"epoch": 3.811780636425186,
"grad_norm": 1.4690262079238892,
"learning_rate": 3.248261808440858e-05,
"loss": 0.3846,
"step": 5630
},
{
"epoch": 3.8185511171293163,
"grad_norm": 1.6690099239349365,
"learning_rate": 3.213467930569279e-05,
"loss": 0.3908,
"step": 5640
},
{
"epoch": 3.825321597833446,
"grad_norm": 1.9128773212432861,
"learning_rate": 3.178825697024859e-05,
"loss": 0.4075,
"step": 5650
},
{
"epoch": 3.8320920785375763,
"grad_norm": 1.5227471590042114,
"learning_rate": 3.14433588188457e-05,
"loss": 0.3949,
"step": 5660
},
{
"epoch": 3.838862559241706,
"grad_norm": 1.8962739706039429,
"learning_rate": 3.109999255819607e-05,
"loss": 0.3708,
"step": 5670
},
{
"epoch": 3.8456330399458363,
"grad_norm": 1.7166234254837036,
"learning_rate": 3.075816586078182e-05,
"loss": 0.3853,
"step": 5680
},
{
"epoch": 3.852403520649966,
"grad_norm": 1.603034257888794,
"learning_rate": 3.0417886364683578e-05,
"loss": 0.3697,
"step": 5690
},
{
"epoch": 3.8591740013540963,
"grad_norm": 1.2980273962020874,
"learning_rate": 3.0079161673410006e-05,
"loss": 0.3561,
"step": 5700
},
{
"epoch": 3.865944482058226,
"grad_norm": 1.2596299648284912,
"learning_rate": 2.974199935572781e-05,
"loss": 0.3759,
"step": 5710
},
{
"epoch": 3.8727149627623563,
"grad_norm": 1.6658598184585571,
"learning_rate": 2.9406406945492616e-05,
"loss": 0.3902,
"step": 5720
},
{
"epoch": 3.879485443466486,
"grad_norm": 1.401743769645691,
"learning_rate": 2.907239194148066e-05,
"loss": 0.4045,
"step": 5730
},
{
"epoch": 3.8862559241706163,
"grad_norm": 1.7074028253555298,
"learning_rate": 2.8739961807221127e-05,
"loss": 0.4103,
"step": 5740
},
{
"epoch": 3.893026404874746,
"grad_norm": 1.6622352600097656,
"learning_rate": 2.840912397082954e-05,
"loss": 0.3718,
"step": 5750
},
{
"epoch": 3.8997968855788763,
"grad_norm": 1.5955240726470947,
"learning_rate": 2.807988582484171e-05,
"loss": 0.3949,
"step": 5760
},
{
"epoch": 3.906567366283006,
"grad_norm": 1.5108157396316528,
"learning_rate": 2.7752254726048422e-05,
"loss": 0.3665,
"step": 5770
},
{
"epoch": 3.913337846987136,
"grad_norm": 1.4178344011306763,
"learning_rate": 2.7426237995331296e-05,
"loss": 0.3835,
"step": 5780
},
{
"epoch": 3.920108327691266,
"grad_norm": 1.7224016189575195,
"learning_rate": 2.7101842917498997e-05,
"loss": 0.4008,
"step": 5790
},
{
"epoch": 3.9268788083953963,
"grad_norm": 1.513185977935791,
"learning_rate": 2.6779076741124576e-05,
"loss": 0.4084,
"step": 5800
},
{
"epoch": 3.933649289099526,
"grad_norm": 1.806357741355896,
"learning_rate": 2.6457946678383448e-05,
"loss": 0.382,
"step": 5810
},
{
"epoch": 3.940419769803656,
"grad_norm": 1.5622941255569458,
"learning_rate": 2.6138459904892177e-05,
"loss": 0.3943,
"step": 5820
},
{
"epoch": 3.947190250507786,
"grad_norm": 2.032970428466797,
"learning_rate": 2.5820623559548285e-05,
"loss": 0.3486,
"step": 5830
},
{
"epoch": 3.9539607312119163,
"grad_norm": 1.7815639972686768,
"learning_rate": 2.550444474437066e-05,
"loss": 0.3772,
"step": 5840
},
{
"epoch": 3.960731211916046,
"grad_norm": 1.6397390365600586,
"learning_rate": 2.5189930524340767e-05,
"loss": 0.3629,
"step": 5850
},
{
"epoch": 3.967501692620176,
"grad_norm": 1.4618537425994873,
"learning_rate": 2.487708792724497e-05,
"loss": 0.4054,
"step": 5860
},
{
"epoch": 3.974272173324306,
"grad_norm": 1.5044384002685547,
"learning_rate": 2.4565923943517343e-05,
"loss": 0.4003,
"step": 5870
},
{
"epoch": 3.9810426540284363,
"grad_norm": 1.5843464136123657,
"learning_rate": 2.425644552608356e-05,
"loss": 0.3977,
"step": 5880
},
{
"epoch": 3.987813134732566,
"grad_norm": 1.5150847434997559,
"learning_rate": 2.3948659590205515e-05,
"loss": 0.4088,
"step": 5890
},
{
"epoch": 3.994583615436696,
"grad_norm": 1.9236164093017578,
"learning_rate": 2.3642573013326663e-05,
"loss": 0.4008,
"step": 5900
},
{
"epoch": 4.001354096140826,
"grad_norm": 1.42927086353302,
"learning_rate": 2.3338192634918643e-05,
"loss": 0.3427,
"step": 5910
},
{
"epoch": 4.008124576844956,
"grad_norm": 1.3550347089767456,
"learning_rate": 2.3035525256328106e-05,
"loss": 0.2699,
"step": 5920
},
{
"epoch": 4.014895057549086,
"grad_norm": 1.546830177307129,
"learning_rate": 2.2734577640625022e-05,
"loss": 0.2694,
"step": 5930
},
{
"epoch": 4.021665538253216,
"grad_norm": 1.7005549669265747,
"learning_rate": 2.2435356512451387e-05,
"loss": 0.2822,
"step": 5940
},
{
"epoch": 4.028436018957346,
"grad_norm": 1.5947457551956177,
"learning_rate": 2.2137868557871067e-05,
"loss": 0.2965,
"step": 5950
},
{
"epoch": 4.035206499661476,
"grad_norm": 1.600761890411377,
"learning_rate": 2.1842120424220334e-05,
"loss": 0.2551,
"step": 5960
},
{
"epoch": 4.041976980365606,
"grad_norm": 1.5094797611236572,
"learning_rate": 2.1548118719959286e-05,
"loss": 0.2903,
"step": 5970
},
{
"epoch": 4.048747461069736,
"grad_norm": 1.5594260692596436,
"learning_rate": 2.1255870014524327e-05,
"loss": 0.294,
"step": 5980
},
{
"epoch": 4.055517941773866,
"grad_norm": 1.5365486145019531,
"learning_rate": 2.096538083818128e-05,
"loss": 0.2838,
"step": 5990
},
{
"epoch": 4.062288422477996,
"grad_norm": 1.9512939453125,
"learning_rate": 2.067665768187941e-05,
"loss": 0.2649,
"step": 6000
},
{
"epoch": 4.062288422477996,
"eval_loss": 1.1342198848724365,
"eval_runtime": 22.903,
"eval_samples_per_second": 108.632,
"eval_steps_per_second": 13.579,
"step": 6000
},
{
"epoch": 4.069058903182126,
"grad_norm": 1.703903079032898,
"learning_rate": 2.0389706997106527e-05,
"loss": 0.2606,
"step": 6010
},
{
"epoch": 4.075829383886256,
"grad_norm": 1.8867642879486084,
"learning_rate": 2.0104535195744746e-05,
"loss": 0.2848,
"step": 6020
},
{
"epoch": 4.082599864590386,
"grad_norm": 1.9352099895477295,
"learning_rate": 1.9821148649927212e-05,
"loss": 0.2724,
"step": 6030
},
{
"epoch": 4.089370345294516,
"grad_norm": 1.7266086339950562,
"learning_rate": 1.953955369189574e-05,
"loss": 0.2745,
"step": 6040
},
{
"epoch": 4.096140825998646,
"grad_norm": 1.5754889249801636,
"learning_rate": 1.925975661385926e-05,
"loss": 0.2737,
"step": 6050
},
{
"epoch": 4.102911306702776,
"grad_norm": 1.6799631118774414,
"learning_rate": 1.8981763667853326e-05,
"loss": 0.2606,
"step": 6060
},
{
"epoch": 4.109681787406906,
"grad_norm": 1.5695922374725342,
"learning_rate": 1.870558106560035e-05,
"loss": 0.2621,
"step": 6070
},
{
"epoch": 4.116452268111036,
"grad_norm": 1.550424337387085,
"learning_rate": 1.8431214978370758e-05,
"loss": 0.2677,
"step": 6080
},
{
"epoch": 4.123222748815166,
"grad_norm": 1.4905930757522583,
"learning_rate": 1.8158671536845186e-05,
"loss": 0.2562,
"step": 6090
},
{
"epoch": 4.129993229519296,
"grad_norm": 1.688219666481018,
"learning_rate": 1.788795683097746e-05,
"loss": 0.2591,
"step": 6100
},
{
"epoch": 4.136763710223426,
"grad_norm": 1.8246350288391113,
"learning_rate": 1.761907690985847e-05,
"loss": 0.2823,
"step": 6110
},
{
"epoch": 4.143534190927556,
"grad_norm": 1.475894808769226,
"learning_rate": 1.735203778158109e-05,
"loss": 0.2672,
"step": 6120
},
{
"epoch": 4.150304671631686,
"grad_norm": 2.1845951080322266,
"learning_rate": 1.7086845413105778e-05,
"loss": 0.2607,
"step": 6130
},
{
"epoch": 4.157075152335816,
"grad_norm": 1.9802888631820679,
"learning_rate": 1.6823505730127455e-05,
"loss": 0.2653,
"step": 6140
},
{
"epoch": 4.163845633039946,
"grad_norm": 1.2355766296386719,
"learning_rate": 1.656202461694293e-05,
"loss": 0.2787,
"step": 6150
},
{
"epoch": 4.170616113744076,
"grad_norm": 1.6711342334747314,
"learning_rate": 1.630240791631945e-05,
"loss": 0.2996,
"step": 6160
},
{
"epoch": 4.177386594448206,
"grad_norm": 1.8249988555908203,
"learning_rate": 1.6044661429364205e-05,
"loss": 0.2617,
"step": 6170
},
{
"epoch": 4.184157075152336,
"grad_norm": 2.0309152603149414,
"learning_rate": 1.5788790915394645e-05,
"loss": 0.2627,
"step": 6180
},
{
"epoch": 4.190927555856466,
"grad_norm": 1.7783539295196533,
"learning_rate": 1.5534802091809818e-05,
"loss": 0.2734,
"step": 6190
},
{
"epoch": 4.197698036560595,
"grad_norm": 1.5822839736938477,
"learning_rate": 1.528270063396262e-05,
"loss": 0.2765,
"step": 6200
},
{
"epoch": 4.204468517264726,
"grad_norm": 1.9683705568313599,
"learning_rate": 1.5032492175032876e-05,
"loss": 0.2665,
"step": 6210
},
{
"epoch": 4.211238997968856,
"grad_norm": 1.4425179958343506,
"learning_rate": 1.4784182305901672e-05,
"loss": 0.2644,
"step": 6220
},
{
"epoch": 4.218009478672986,
"grad_norm": 1.8725738525390625,
"learning_rate": 1.4537776575026207e-05,
"loss": 0.2611,
"step": 6230
},
{
"epoch": 4.224779959377115,
"grad_norm": 1.767899990081787,
"learning_rate": 1.4293280488315986e-05,
"loss": 0.2851,
"step": 6240
},
{
"epoch": 4.231550440081246,
"grad_norm": 1.2789946794509888,
"learning_rate": 1.4050699509009679e-05,
"loss": 0.2727,
"step": 6250
},
{
"epoch": 4.238320920785376,
"grad_norm": 1.5606369972229004,
"learning_rate": 1.3810039057553138e-05,
"loss": 0.2704,
"step": 6260
},
{
"epoch": 4.245091401489506,
"grad_norm": 1.5035715103149414,
"learning_rate": 1.3571304511478188e-05,
"loss": 0.2847,
"step": 6270
},
{
"epoch": 4.251861882193635,
"grad_norm": 1.8756885528564453,
"learning_rate": 1.333450120528249e-05,
"loss": 0.2551,
"step": 6280
},
{
"epoch": 4.258632362897766,
"grad_norm": 2.072859048843384,
"learning_rate": 1.3099634430310403e-05,
"loss": 0.249,
"step": 6290
},
{
"epoch": 4.265402843601896,
"grad_norm": 1.6129212379455566,
"learning_rate": 1.2866709434634684e-05,
"loss": 0.2961,
"step": 6300
},
{
"epoch": 4.272173324306026,
"grad_norm": 1.705417513847351,
"learning_rate": 1.2635731422939212e-05,
"loss": 0.2476,
"step": 6310
},
{
"epoch": 4.278943805010155,
"grad_norm": 1.9114418029785156,
"learning_rate": 1.2406705556402776e-05,
"loss": 0.275,
"step": 6320
},
{
"epoch": 4.285714285714286,
"grad_norm": 1.7978328466415405,
"learning_rate": 1.217963695258364e-05,
"loss": 0.2605,
"step": 6330
},
{
"epoch": 4.292484766418416,
"grad_norm": 1.7482448816299438,
"learning_rate": 1.1954530685305287e-05,
"loss": 0.2696,
"step": 6340
},
{
"epoch": 4.299255247122546,
"grad_norm": 2.014146566390991,
"learning_rate": 1.1731391784543e-05,
"loss": 0.2914,
"step": 6350
},
{
"epoch": 4.306025727826675,
"grad_norm": 2.0617308616638184,
"learning_rate": 1.15102252363114e-05,
"loss": 0.262,
"step": 6360
},
{
"epoch": 4.312796208530806,
"grad_norm": 1.9172184467315674,
"learning_rate": 1.1291035982553189e-05,
"loss": 0.2702,
"step": 6370
},
{
"epoch": 4.319566689234936,
"grad_norm": 1.7097840309143066,
"learning_rate": 1.1073828921028606e-05,
"loss": 0.308,
"step": 6380
},
{
"epoch": 4.326337169939066,
"grad_norm": 1.5703011751174927,
"learning_rate": 1.085860890520598e-05,
"loss": 0.2536,
"step": 6390
},
{
"epoch": 4.333107650643195,
"grad_norm": 2.1221113204956055,
"learning_rate": 1.0645380744153378e-05,
"loss": 0.2713,
"step": 6400
},
{
"epoch": 4.339878131347326,
"grad_norm": 1.5522172451019287,
"learning_rate": 1.0434149202431054e-05,
"loss": 0.259,
"step": 6410
},
{
"epoch": 4.346648612051456,
"grad_norm": 1.7431870698928833,
"learning_rate": 1.0224918999985044e-05,
"loss": 0.2847,
"step": 6420
},
{
"epoch": 4.353419092755586,
"grad_norm": 1.9679934978485107,
"learning_rate": 1.0017694812041656e-05,
"loss": 0.2621,
"step": 6430
},
{
"epoch": 4.360189573459715,
"grad_norm": 2.4556872844696045,
"learning_rate": 9.812481269002983e-06,
"loss": 0.2803,
"step": 6440
},
{
"epoch": 4.366960054163846,
"grad_norm": 1.530918836593628,
"learning_rate": 9.609282956343557e-06,
"loss": 0.2962,
"step": 6450
},
{
"epoch": 4.373730534867976,
"grad_norm": 1.861484169960022,
"learning_rate": 9.408104414507724e-06,
"loss": 0.2917,
"step": 6460
},
{
"epoch": 4.380501015572106,
"grad_norm": 2.1292312145233154,
"learning_rate": 9.208950138808293e-06,
"loss": 0.329,
"step": 6470
},
{
"epoch": 4.387271496276235,
"grad_norm": 1.6679848432540894,
"learning_rate": 9.011824579326144e-06,
"loss": 0.2768,
"step": 6480
},
{
"epoch": 4.394041976980366,
"grad_norm": 1.5731488466262817,
"learning_rate": 8.81673214081058e-06,
"loss": 0.2919,
"step": 6490
},
{
"epoch": 4.400812457684496,
"grad_norm": 1.8150240182876587,
"learning_rate": 8.623677182581135e-06,
"loss": 0.2719,
"step": 6500
},
{
"epoch": 4.407582938388625,
"grad_norm": 2.06569504737854,
"learning_rate": 8.432664018430003e-06,
"loss": 0.2803,
"step": 6510
},
{
"epoch": 4.414353419092755,
"grad_norm": 1.6544770002365112,
"learning_rate": 8.243696916525745e-06,
"loss": 0.2508,
"step": 6520
},
{
"epoch": 4.421123899796886,
"grad_norm": 1.6926827430725098,
"learning_rate": 8.056780099317885e-06,
"loss": 0.2979,
"step": 6530
},
{
"epoch": 4.427894380501016,
"grad_norm": 1.7074532508850098,
"learning_rate": 7.871917743442513e-06,
"loss": 0.2901,
"step": 6540
},
{
"epoch": 4.434664861205146,
"grad_norm": 2.1102843284606934,
"learning_rate": 7.68911397962906e-06,
"loss": 0.2615,
"step": 6550
},
{
"epoch": 4.441435341909275,
"grad_norm": 1.4068889617919922,
"learning_rate": 7.5083728926079065e-06,
"loss": 0.2608,
"step": 6560
},
{
"epoch": 4.448205822613406,
"grad_norm": 1.8090318441390991,
"learning_rate": 7.329698521019157e-06,
"loss": 0.2904,
"step": 6570
},
{
"epoch": 4.454976303317536,
"grad_norm": 1.7596811056137085,
"learning_rate": 7.153094857322374e-06,
"loss": 0.2763,
"step": 6580
},
{
"epoch": 4.461746784021665,
"grad_norm": 1.7713943719863892,
"learning_rate": 6.978565847707352e-06,
"loss": 0.2644,
"step": 6590
},
{
"epoch": 4.468517264725795,
"grad_norm": 1.9358819723129272,
"learning_rate": 6.806115392006007e-06,
"loss": 0.2758,
"step": 6600
},
{
"epoch": 4.475287745429926,
"grad_norm": 1.916235327720642,
"learning_rate": 6.635747343605181e-06,
"loss": 0.2952,
"step": 6610
},
{
"epoch": 4.482058226134056,
"grad_norm": 1.6258528232574463,
"learning_rate": 6.4674655093605155e-06,
"loss": 0.272,
"step": 6620
},
{
"epoch": 4.488828706838185,
"grad_norm": 1.8681087493896484,
"learning_rate": 6.301273649511464e-06,
"loss": 0.2638,
"step": 6630
},
{
"epoch": 4.495599187542315,
"grad_norm": 1.644300103187561,
"learning_rate": 6.137175477597213e-06,
"loss": 0.271,
"step": 6640
},
{
"epoch": 4.502369668246446,
"grad_norm": 1.8756589889526367,
"learning_rate": 5.975174660373706e-06,
"loss": 0.2682,
"step": 6650
},
{
"epoch": 4.509140148950576,
"grad_norm": 1.5481034517288208,
"learning_rate": 5.815274817731753e-06,
"loss": 0.2926,
"step": 6660
},
{
"epoch": 4.515910629654705,
"grad_norm": 1.8476117849349976,
"learning_rate": 5.657479522616071e-06,
"loss": 0.2716,
"step": 6670
},
{
"epoch": 4.522681110358835,
"grad_norm": 1.7573695182800293,
"learning_rate": 5.501792300945507e-06,
"loss": 0.2812,
"step": 6680
},
{
"epoch": 4.529451591062966,
"grad_norm": 1.7136588096618652,
"learning_rate": 5.348216631534264e-06,
"loss": 0.2416,
"step": 6690
},
{
"epoch": 4.536222071767096,
"grad_norm": 1.662249207496643,
"learning_rate": 5.196755946014065e-06,
"loss": 0.2571,
"step": 6700
},
{
"epoch": 4.542992552471225,
"grad_norm": 2.3519043922424316,
"learning_rate": 5.047413628757658e-06,
"loss": 0.2819,
"step": 6710
},
{
"epoch": 4.549763033175355,
"grad_norm": 1.7724781036376953,
"learning_rate": 4.900193016802956e-06,
"loss": 0.2881,
"step": 6720
},
{
"epoch": 4.556533513879486,
"grad_norm": 1.6066288948059082,
"learning_rate": 4.755097399778707e-06,
"loss": 0.2837,
"step": 6730
},
{
"epoch": 4.563303994583616,
"grad_norm": 2.2322845458984375,
"learning_rate": 4.612130019830774e-06,
"loss": 0.2648,
"step": 6740
},
{
"epoch": 4.570074475287745,
"grad_norm": 1.8880157470703125,
"learning_rate": 4.471294071549869e-06,
"loss": 0.2571,
"step": 6750
},
{
"epoch": 4.576844955991875,
"grad_norm": 1.5234016180038452,
"learning_rate": 4.332592701900085e-06,
"loss": 0.2567,
"step": 6760
},
{
"epoch": 4.583615436696006,
"grad_norm": 2.566943645477295,
"learning_rate": 4.196029010148527e-06,
"loss": 0.2462,
"step": 6770
},
{
"epoch": 4.590385917400136,
"grad_norm": 2.2811155319213867,
"learning_rate": 4.0616060477961845e-06,
"loss": 0.2695,
"step": 6780
},
{
"epoch": 4.597156398104265,
"grad_norm": 2.036428928375244,
"learning_rate": 3.929326818509638e-06,
"loss": 0.2816,
"step": 6790
},
{
"epoch": 4.603926878808395,
"grad_norm": 1.9326859712600708,
"learning_rate": 3.799194278054019e-06,
"loss": 0.3004,
"step": 6800
},
{
"epoch": 4.610697359512526,
"grad_norm": 2.0376124382019043,
"learning_rate": 3.6712113342269095e-06,
"loss": 0.3155,
"step": 6810
},
{
"epoch": 4.617467840216655,
"grad_norm": 1.9327590465545654,
"learning_rate": 3.5453808467933558e-06,
"loss": 0.2598,
"step": 6820
},
{
"epoch": 4.624238320920785,
"grad_norm": 1.5915392637252808,
"learning_rate": 3.421705627422067e-06,
"loss": 0.2893,
"step": 6830
},
{
"epoch": 4.631008801624915,
"grad_norm": 1.4876010417938232,
"learning_rate": 3.300188439622465e-06,
"loss": 0.2702,
"step": 6840
},
{
"epoch": 4.637779282329046,
"grad_norm": 1.8183128833770752,
"learning_rate": 3.180831998682987e-06,
"loss": 0.26,
"step": 6850
},
{
"epoch": 4.644549763033176,
"grad_norm": 1.5423557758331299,
"learning_rate": 3.0636389716104607e-06,
"loss": 0.309,
"step": 6860
},
{
"epoch": 4.651320243737305,
"grad_norm": 1.5031051635742188,
"learning_rate": 2.9486119770704144e-06,
"loss": 0.2541,
"step": 6870
},
{
"epoch": 4.658090724441435,
"grad_norm": 1.648635745048523,
"learning_rate": 2.83575358532866e-06,
"loss": 0.3016,
"step": 6880
},
{
"epoch": 4.664861205145566,
"grad_norm": 2.3799970149993896,
"learning_rate": 2.7250663181937808e-06,
"loss": 0.287,
"step": 6890
},
{
"epoch": 4.671631685849695,
"grad_norm": 1.8683040142059326,
"learning_rate": 2.6165526489608016e-06,
"loss": 0.2414,
"step": 6900
},
{
"epoch": 4.678402166553825,
"grad_norm": 1.5256311893463135,
"learning_rate": 2.510215002355987e-06,
"loss": 0.2605,
"step": 6910
},
{
"epoch": 4.685172647257955,
"grad_norm": 1.87392258644104,
"learning_rate": 2.4060557544825724e-06,
"loss": 0.2536,
"step": 6920
},
{
"epoch": 4.691943127962086,
"grad_norm": 1.480167031288147,
"learning_rate": 2.3040772327676987e-06,
"loss": 0.2773,
"step": 6930
},
{
"epoch": 4.698713608666216,
"grad_norm": 1.5413248538970947,
"learning_rate": 2.2042817159104614e-06,
"loss": 0.2801,
"step": 6940
},
{
"epoch": 4.705484089370345,
"grad_norm": 1.492633581161499,
"learning_rate": 2.106671433830909e-06,
"loss": 0.2343,
"step": 6950
},
{
"epoch": 4.712254570074475,
"grad_norm": 1.4329499006271362,
"learning_rate": 2.011248567620272e-06,
"loss": 0.2628,
"step": 6960
},
{
"epoch": 4.719025050778606,
"grad_norm": 1.9466246366500854,
"learning_rate": 1.918015249492211e-06,
"loss": 0.258,
"step": 6970
},
{
"epoch": 4.725795531482735,
"grad_norm": 1.604708194732666,
"learning_rate": 1.8269735627351459e-06,
"loss": 0.2807,
"step": 6980
},
{
"epoch": 4.732566012186865,
"grad_norm": 1.7957441806793213,
"learning_rate": 1.7381255416657693e-06,
"loss": 0.2476,
"step": 6990
},
{
"epoch": 4.739336492890995,
"grad_norm": 1.6520119905471802,
"learning_rate": 1.6514731715835064e-06,
"loss": 0.2722,
"step": 7000
},
{
"epoch": 4.739336492890995,
"eval_loss": 1.1487771272659302,
"eval_runtime": 23.0937,
"eval_samples_per_second": 107.735,
"eval_steps_per_second": 13.467,
"step": 7000
},
{
"epoch": 4.746106973595126,
"grad_norm": 1.8763707876205444,
"learning_rate": 1.5670183887262268e-06,
"loss": 0.253,
"step": 7010
},
{
"epoch": 4.752877454299255,
"grad_norm": 2.0074474811553955,
"learning_rate": 1.4847630802269695e-06,
"loss": 0.2886,
"step": 7020
},
{
"epoch": 4.759647935003385,
"grad_norm": 1.6623965501785278,
"learning_rate": 1.4047090840716982e-06,
"loss": 0.2645,
"step": 7030
},
{
"epoch": 4.766418415707515,
"grad_norm": 2.1426522731781006,
"learning_rate": 1.3268581890583553e-06,
"loss": 0.2834,
"step": 7040
},
{
"epoch": 4.773188896411646,
"grad_norm": 2.4106967449188232,
"learning_rate": 1.251212134756763e-06,
"loss": 0.2967,
"step": 7050
},
{
"epoch": 4.779959377115775,
"grad_norm": 1.7238754034042358,
"learning_rate": 1.1777726114698628e-06,
"loss": 0.2819,
"step": 7060
},
{
"epoch": 4.786729857819905,
"grad_norm": 1.9978512525558472,
"learning_rate": 1.1065412601958813e-06,
"loss": 0.2892,
"step": 7070
},
{
"epoch": 4.793500338524035,
"grad_norm": 1.807606816291809,
"learning_rate": 1.0375196725916693e-06,
"loss": 0.2751,
"step": 7080
},
{
"epoch": 4.800270819228166,
"grad_norm": 1.8417556285858154,
"learning_rate": 9.707093909371745e-07,
"loss": 0.277,
"step": 7090
},
{
"epoch": 4.807041299932295,
"grad_norm": 1.6947407722473145,
"learning_rate": 9.061119081009262e-07,
"loss": 0.2717,
"step": 7100
},
{
"epoch": 4.813811780636425,
"grad_norm": 2.100844621658325,
"learning_rate": 8.437286675067046e-07,
"loss": 0.2589,
"step": 7110
},
{
"epoch": 4.820582261340555,
"grad_norm": 1.8315235376358032,
"learning_rate": 7.835610631013123e-07,
"loss": 0.2774,
"step": 7120
},
{
"epoch": 4.827352742044685,
"grad_norm": 1.8022527694702148,
"learning_rate": 7.256104393233654e-07,
"loss": 0.2826,
"step": 7130
},
{
"epoch": 4.834123222748815,
"grad_norm": 1.8034976720809937,
"learning_rate": 6.698780910732949e-07,
"loss": 0.287,
"step": 7140
},
{
"epoch": 4.840893703452945,
"grad_norm": 2.1168487071990967,
"learning_rate": 6.163652636844375e-07,
"loss": 0.2601,
"step": 7150
},
{
"epoch": 4.847664184157075,
"grad_norm": 1.7831007242202759,
"learning_rate": 5.650731528951237e-07,
"loss": 0.2671,
"step": 7160
},
{
"epoch": 4.854434664861206,
"grad_norm": 1.85152268409729,
"learning_rate": 5.160029048220438e-07,
"loss": 0.2877,
"step": 7170
},
{
"epoch": 4.861205145565335,
"grad_norm": 1.629766583442688,
"learning_rate": 4.691556159346133e-07,
"loss": 0.3145,
"step": 7180
},
{
"epoch": 4.867975626269465,
"grad_norm": 2.025866746902466,
"learning_rate": 4.2453233303043627e-07,
"loss": 0.2634,
"step": 7190
},
{
"epoch": 4.874746106973595,
"grad_norm": 1.8864160776138306,
"learning_rate": 3.8213405321195775e-07,
"loss": 0.257,
"step": 7200
},
{
"epoch": 4.881516587677725,
"grad_norm": 1.6541404724121094,
"learning_rate": 3.4196172386417036e-07,
"loss": 0.2942,
"step": 7210
},
{
"epoch": 4.888287068381855,
"grad_norm": 1.627166509628296,
"learning_rate": 3.0401624263344254e-07,
"loss": 0.2984,
"step": 7220
},
{
"epoch": 4.895057549085985,
"grad_norm": 2.0203287601470947,
"learning_rate": 2.682984574074565e-07,
"loss": 0.2775,
"step": 7230
},
{
"epoch": 4.901828029790115,
"grad_norm": 1.4823179244995117,
"learning_rate": 2.3480916629626816e-07,
"loss": 0.2303,
"step": 7240
},
{
"epoch": 4.908598510494246,
"grad_norm": 1.6466970443725586,
"learning_rate": 2.035491176144766e-07,
"loss": 0.2561,
"step": 7250
},
{
"epoch": 4.915368991198375,
"grad_norm": 1.857335090637207,
"learning_rate": 1.7451900986450441e-07,
"loss": 0.2478,
"step": 7260
},
{
"epoch": 4.922139471902505,
"grad_norm": 1.615402102470398,
"learning_rate": 1.4771949172097677e-07,
"loss": 0.2644,
"step": 7270
},
{
"epoch": 4.928909952606635,
"grad_norm": 1.6097745895385742,
"learning_rate": 1.2315116201623288e-07,
"loss": 0.2687,
"step": 7280
},
{
"epoch": 4.935680433310765,
"grad_norm": 1.6500680446624756,
"learning_rate": 1.0081456972694803e-07,
"loss": 0.2782,
"step": 7290
},
{
"epoch": 4.942450914014895,
"grad_norm": 1.5854169130325317,
"learning_rate": 8.07102139618765e-08,
"loss": 0.2503,
"step": 7300
},
{
"epoch": 4.949221394719025,
"grad_norm": 1.917787790298462,
"learning_rate": 6.283854395067179e-08,
"loss": 0.2688,
"step": 7310
},
{
"epoch": 4.955991875423155,
"grad_norm": 1.3667759895324707,
"learning_rate": 4.719995903387231e-08,
"loss": 0.2713,
"step": 7320
},
{
"epoch": 4.962762356127285,
"grad_norm": 1.4660590887069702,
"learning_rate": 3.379480865397522e-08,
"loss": 0.2492,
"step": 7330
},
{
"epoch": 4.969532836831415,
"grad_norm": 1.909756064414978,
"learning_rate": 2.2623392347620455e-08,
"loss": 0.2528,
"step": 7340
},
{
"epoch": 4.976303317535545,
"grad_norm": 1.9919097423553467,
"learning_rate": 1.3685959738907184e-08,
"loss": 0.2797,
"step": 7350
},
{
"epoch": 4.983073798239675,
"grad_norm": 1.7295809984207153,
"learning_rate": 6.982710533787185e-09,
"loss": 0.2527,
"step": 7360
},
{
"epoch": 4.989844278943805,
"grad_norm": 1.575947642326355,
"learning_rate": 2.5137945156461507e-09,
"loss": 0.3057,
"step": 7370
},
{
"epoch": 4.996614759647935,
"grad_norm": 1.8067814111709595,
"learning_rate": 2.7931154193971964e-10,
"loss": 0.2525,
"step": 7380
},
{
"epoch": 5.0,
"step": 7385,
"total_flos": 1.6593737353978184e+18,
"train_loss": 0.5936373706919645,
"train_runtime": 5834.3806,
"train_samples_per_second": 40.5,
"train_steps_per_second": 1.266
}
],
"logging_steps": 10,
"max_steps": 7385,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6593737353978184e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}