phi-transation / last-checkpoint /trainer_state.json
Ba2han's picture
Training in progress, step 1107, checkpoint
bed099f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.631578947368421,
"eval_steps": 369,
"global_step": 1107,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005705320211096848,
"grad_norm": 83.0,
"learning_rate": 0.0,
"loss": 3.648493528366089,
"step": 1
},
{
"epoch": 0.0011410640422193695,
"grad_norm": 84.5,
"learning_rate": 1.3513513513513515e-06,
"loss": 3.7405500411987305,
"step": 2
},
{
"epoch": 0.0017115960633290544,
"grad_norm": 74.0,
"learning_rate": 2.702702702702703e-06,
"loss": 3.510922431945801,
"step": 3
},
{
"epoch": 0.002282128084438739,
"grad_norm": 74.0,
"learning_rate": 4.0540540540540545e-06,
"loss": 3.477842330932617,
"step": 4
},
{
"epoch": 0.002852660105548424,
"grad_norm": 48.5,
"learning_rate": 5.405405405405406e-06,
"loss": 3.2050325870513916,
"step": 5
},
{
"epoch": 0.0034231921266581087,
"grad_norm": 35.0,
"learning_rate": 6.7567567567567575e-06,
"loss": 2.9610347747802734,
"step": 6
},
{
"epoch": 0.003993724147767793,
"grad_norm": 25.75,
"learning_rate": 8.108108108108109e-06,
"loss": 2.8089160919189453,
"step": 7
},
{
"epoch": 0.004564256168877478,
"grad_norm": 15.25,
"learning_rate": 9.45945945945946e-06,
"loss": 2.672607183456421,
"step": 8
},
{
"epoch": 0.005134788189987163,
"grad_norm": 10.125,
"learning_rate": 1.0810810810810812e-05,
"loss": 2.4392411708831787,
"step": 9
},
{
"epoch": 0.005705320211096848,
"grad_norm": 8.875,
"learning_rate": 1.2162162162162164e-05,
"loss": 2.4409432411193848,
"step": 10
},
{
"epoch": 0.006275852232206533,
"grad_norm": 7.40625,
"learning_rate": 1.3513513513513515e-05,
"loss": 2.3299427032470703,
"step": 11
},
{
"epoch": 0.0068463842533162175,
"grad_norm": 6.6875,
"learning_rate": 1.4864864864864867e-05,
"loss": 2.2852554321289062,
"step": 12
},
{
"epoch": 0.007416916274425902,
"grad_norm": 6.5625,
"learning_rate": 1.6216216216216218e-05,
"loss": 2.2712786197662354,
"step": 13
},
{
"epoch": 0.007987448295535587,
"grad_norm": 7.1875,
"learning_rate": 1.756756756756757e-05,
"loss": 2.2143714427948,
"step": 14
},
{
"epoch": 0.008557980316645272,
"grad_norm": 7.0,
"learning_rate": 1.891891891891892e-05,
"loss": 2.0812437534332275,
"step": 15
},
{
"epoch": 0.009128512337754956,
"grad_norm": 6.28125,
"learning_rate": 2.0270270270270273e-05,
"loss": 2.068169355392456,
"step": 16
},
{
"epoch": 0.009699044358864642,
"grad_norm": 5.75,
"learning_rate": 2.1621621621621624e-05,
"loss": 1.8387004137039185,
"step": 17
},
{
"epoch": 0.010269576379974325,
"grad_norm": 4.3125,
"learning_rate": 2.2972972972972976e-05,
"loss": 1.7710001468658447,
"step": 18
},
{
"epoch": 0.01084010840108401,
"grad_norm": 4.25,
"learning_rate": 2.4324324324324327e-05,
"loss": 1.7796661853790283,
"step": 19
},
{
"epoch": 0.011410640422193696,
"grad_norm": 3.5,
"learning_rate": 2.5675675675675675e-05,
"loss": 1.6957234144210815,
"step": 20
},
{
"epoch": 0.01198117244330338,
"grad_norm": 3.21875,
"learning_rate": 2.702702702702703e-05,
"loss": 1.7516167163848877,
"step": 21
},
{
"epoch": 0.012551704464413066,
"grad_norm": 2.78125,
"learning_rate": 2.8378378378378378e-05,
"loss": 1.6087043285369873,
"step": 22
},
{
"epoch": 0.01312223648552275,
"grad_norm": 2.34375,
"learning_rate": 2.9729729729729733e-05,
"loss": 1.5943574905395508,
"step": 23
},
{
"epoch": 0.013692768506632435,
"grad_norm": 2.140625,
"learning_rate": 3.108108108108108e-05,
"loss": 1.599621295928955,
"step": 24
},
{
"epoch": 0.014263300527742119,
"grad_norm": 2.234375,
"learning_rate": 3.2432432432432436e-05,
"loss": 1.6016688346862793,
"step": 25
},
{
"epoch": 0.014833832548851804,
"grad_norm": 1.9609375,
"learning_rate": 3.3783783783783784e-05,
"loss": 1.5124552249908447,
"step": 26
},
{
"epoch": 0.01540436456996149,
"grad_norm": 1.9765625,
"learning_rate": 3.513513513513514e-05,
"loss": 1.5520291328430176,
"step": 27
},
{
"epoch": 0.015974896591071173,
"grad_norm": 1.90625,
"learning_rate": 3.648648648648649e-05,
"loss": 1.4819629192352295,
"step": 28
},
{
"epoch": 0.01654542861218086,
"grad_norm": 2.0,
"learning_rate": 3.783783783783784e-05,
"loss": 1.5304462909698486,
"step": 29
},
{
"epoch": 0.017115960633290545,
"grad_norm": 1.7734375,
"learning_rate": 3.918918918918919e-05,
"loss": 1.4461307525634766,
"step": 30
},
{
"epoch": 0.017686492654400227,
"grad_norm": 1.8125,
"learning_rate": 4.0540540540540545e-05,
"loss": 1.4548516273498535,
"step": 31
},
{
"epoch": 0.018257024675509912,
"grad_norm": 1.6875,
"learning_rate": 4.189189189189189e-05,
"loss": 1.435849905014038,
"step": 32
},
{
"epoch": 0.018827556696619598,
"grad_norm": 1.578125,
"learning_rate": 4.324324324324325e-05,
"loss": 1.4789021015167236,
"step": 33
},
{
"epoch": 0.019398088717729283,
"grad_norm": 1.578125,
"learning_rate": 4.4594594594594596e-05,
"loss": 1.3856297731399536,
"step": 34
},
{
"epoch": 0.01996862073883897,
"grad_norm": 1.65625,
"learning_rate": 4.594594594594595e-05,
"loss": 1.5028152465820312,
"step": 35
},
{
"epoch": 0.02053915275994865,
"grad_norm": 1.46875,
"learning_rate": 4.72972972972973e-05,
"loss": 1.4294812679290771,
"step": 36
},
{
"epoch": 0.021109684781058336,
"grad_norm": 1.65625,
"learning_rate": 4.8648648648648654e-05,
"loss": 1.3971917629241943,
"step": 37
},
{
"epoch": 0.02168021680216802,
"grad_norm": 1.515625,
"learning_rate": 5e-05,
"loss": 1.3995487689971924,
"step": 38
},
{
"epoch": 0.022250748823277707,
"grad_norm": 1.703125,
"learning_rate": 4.9972283813747225e-05,
"loss": 1.4693856239318848,
"step": 39
},
{
"epoch": 0.022821280844387393,
"grad_norm": 1.5703125,
"learning_rate": 4.994456762749446e-05,
"loss": 1.4715073108673096,
"step": 40
},
{
"epoch": 0.023391812865497075,
"grad_norm": 1.703125,
"learning_rate": 4.9916851441241684e-05,
"loss": 1.490320086479187,
"step": 41
},
{
"epoch": 0.02396234488660676,
"grad_norm": 1.5,
"learning_rate": 4.9889135254988913e-05,
"loss": 1.3657546043395996,
"step": 42
},
{
"epoch": 0.024532876907716446,
"grad_norm": 1.7109375,
"learning_rate": 4.986141906873614e-05,
"loss": 1.4324053525924683,
"step": 43
},
{
"epoch": 0.02510340892882613,
"grad_norm": 1.7890625,
"learning_rate": 4.983370288248337e-05,
"loss": 1.3849389553070068,
"step": 44
},
{
"epoch": 0.025673940949935817,
"grad_norm": 1.453125,
"learning_rate": 4.98059866962306e-05,
"loss": 1.425079345703125,
"step": 45
},
{
"epoch": 0.0262444729710455,
"grad_norm": 1.484375,
"learning_rate": 4.977827050997783e-05,
"loss": 1.4127968549728394,
"step": 46
},
{
"epoch": 0.026815004992155184,
"grad_norm": 1.5390625,
"learning_rate": 4.9750554323725054e-05,
"loss": 1.429938793182373,
"step": 47
},
{
"epoch": 0.02738553701326487,
"grad_norm": 1.4140625,
"learning_rate": 4.972283813747229e-05,
"loss": 1.4178887605667114,
"step": 48
},
{
"epoch": 0.027956069034374555,
"grad_norm": 1.46875,
"learning_rate": 4.969512195121951e-05,
"loss": 1.4397588968276978,
"step": 49
},
{
"epoch": 0.028526601055484237,
"grad_norm": 1.453125,
"learning_rate": 4.966740576496674e-05,
"loss": 1.3697854280471802,
"step": 50
},
{
"epoch": 0.029097133076593923,
"grad_norm": 1.3984375,
"learning_rate": 4.963968957871397e-05,
"loss": 1.3517988920211792,
"step": 51
},
{
"epoch": 0.02966766509770361,
"grad_norm": 1.3984375,
"learning_rate": 4.9611973392461195e-05,
"loss": 1.4193122386932373,
"step": 52
},
{
"epoch": 0.030238197118813294,
"grad_norm": 1.34375,
"learning_rate": 4.958425720620843e-05,
"loss": 1.37640380859375,
"step": 53
},
{
"epoch": 0.03080872913992298,
"grad_norm": 1.40625,
"learning_rate": 4.9556541019955654e-05,
"loss": 1.336474895477295,
"step": 54
},
{
"epoch": 0.031379261161032665,
"grad_norm": 1.515625,
"learning_rate": 4.952882483370288e-05,
"loss": 1.4701391458511353,
"step": 55
},
{
"epoch": 0.03194979318214235,
"grad_norm": 1.4765625,
"learning_rate": 4.950110864745011e-05,
"loss": 1.3760974407196045,
"step": 56
},
{
"epoch": 0.032520325203252036,
"grad_norm": 1.4609375,
"learning_rate": 4.947339246119734e-05,
"loss": 1.3897124528884888,
"step": 57
},
{
"epoch": 0.03309085722436172,
"grad_norm": 1.578125,
"learning_rate": 4.944567627494457e-05,
"loss": 1.4239261150360107,
"step": 58
},
{
"epoch": 0.0336613892454714,
"grad_norm": 1.53125,
"learning_rate": 4.94179600886918e-05,
"loss": 1.3669216632843018,
"step": 59
},
{
"epoch": 0.03423192126658109,
"grad_norm": 1.3515625,
"learning_rate": 4.9390243902439024e-05,
"loss": 1.346958041191101,
"step": 60
},
{
"epoch": 0.03480245328769077,
"grad_norm": 1.5234375,
"learning_rate": 4.936252771618626e-05,
"loss": 1.4235575199127197,
"step": 61
},
{
"epoch": 0.03537298530880045,
"grad_norm": 1.3359375,
"learning_rate": 4.933481152993348e-05,
"loss": 1.3075377941131592,
"step": 62
},
{
"epoch": 0.03594351732991014,
"grad_norm": 1.3203125,
"learning_rate": 4.930709534368071e-05,
"loss": 1.3214820623397827,
"step": 63
},
{
"epoch": 0.036514049351019824,
"grad_norm": 1.3515625,
"learning_rate": 4.927937915742794e-05,
"loss": 1.39829421043396,
"step": 64
},
{
"epoch": 0.03708458137212951,
"grad_norm": 1.3671875,
"learning_rate": 4.9251662971175164e-05,
"loss": 1.3523836135864258,
"step": 65
},
{
"epoch": 0.037655113393239195,
"grad_norm": 1.3125,
"learning_rate": 4.92239467849224e-05,
"loss": 1.3268153667449951,
"step": 66
},
{
"epoch": 0.03822564541434888,
"grad_norm": 1.28125,
"learning_rate": 4.919623059866962e-05,
"loss": 1.3205022811889648,
"step": 67
},
{
"epoch": 0.038796177435458566,
"grad_norm": 1.2734375,
"learning_rate": 4.916851441241685e-05,
"loss": 1.2956037521362305,
"step": 68
},
{
"epoch": 0.03936670945656825,
"grad_norm": 1.375,
"learning_rate": 4.914079822616408e-05,
"loss": 1.3702654838562012,
"step": 69
},
{
"epoch": 0.03993724147767794,
"grad_norm": 1.296875,
"learning_rate": 4.911308203991131e-05,
"loss": 1.388296127319336,
"step": 70
},
{
"epoch": 0.04050777349878762,
"grad_norm": 1.5078125,
"learning_rate": 4.908536585365854e-05,
"loss": 1.4403045177459717,
"step": 71
},
{
"epoch": 0.0410783055198973,
"grad_norm": 1.25,
"learning_rate": 4.905764966740577e-05,
"loss": 1.3626902103424072,
"step": 72
},
{
"epoch": 0.04164883754100699,
"grad_norm": 1.34375,
"learning_rate": 4.902993348115299e-05,
"loss": 1.382088303565979,
"step": 73
},
{
"epoch": 0.04221936956211667,
"grad_norm": 1.2578125,
"learning_rate": 4.900221729490023e-05,
"loss": 1.3237360715866089,
"step": 74
},
{
"epoch": 0.04278990158322636,
"grad_norm": 1.296875,
"learning_rate": 4.897450110864745e-05,
"loss": 1.319187879562378,
"step": 75
},
{
"epoch": 0.04336043360433604,
"grad_norm": 1.3828125,
"learning_rate": 4.894678492239468e-05,
"loss": 1.3707743883132935,
"step": 76
},
{
"epoch": 0.043930965625445725,
"grad_norm": 1.2578125,
"learning_rate": 4.891906873614191e-05,
"loss": 1.3658738136291504,
"step": 77
},
{
"epoch": 0.044501497646555414,
"grad_norm": 1.265625,
"learning_rate": 4.8891352549889134e-05,
"loss": 1.3247051239013672,
"step": 78
},
{
"epoch": 0.045072029667665096,
"grad_norm": 1.421875,
"learning_rate": 4.886363636363637e-05,
"loss": 1.3614035844802856,
"step": 79
},
{
"epoch": 0.045642561688774785,
"grad_norm": 1.2734375,
"learning_rate": 4.883592017738359e-05,
"loss": 1.2589421272277832,
"step": 80
},
{
"epoch": 0.04621309370988447,
"grad_norm": 1.28125,
"learning_rate": 4.880820399113082e-05,
"loss": 1.3525424003601074,
"step": 81
},
{
"epoch": 0.04678362573099415,
"grad_norm": 1.2578125,
"learning_rate": 4.878048780487805e-05,
"loss": 1.2903777360916138,
"step": 82
},
{
"epoch": 0.04735415775210384,
"grad_norm": 1.328125,
"learning_rate": 4.875277161862528e-05,
"loss": 1.3538789749145508,
"step": 83
},
{
"epoch": 0.04792468977321352,
"grad_norm": 1.3046875,
"learning_rate": 4.872505543237251e-05,
"loss": 1.3419591188430786,
"step": 84
},
{
"epoch": 0.04849522179432321,
"grad_norm": 1.28125,
"learning_rate": 4.869733924611974e-05,
"loss": 1.3367938995361328,
"step": 85
},
{
"epoch": 0.04906575381543289,
"grad_norm": 1.3046875,
"learning_rate": 4.866962305986696e-05,
"loss": 1.2979538440704346,
"step": 86
},
{
"epoch": 0.049636285836542574,
"grad_norm": 1.25,
"learning_rate": 4.864190687361419e-05,
"loss": 1.348291039466858,
"step": 87
},
{
"epoch": 0.05020681785765226,
"grad_norm": 1.3125,
"learning_rate": 4.861419068736142e-05,
"loss": 1.3377124071121216,
"step": 88
},
{
"epoch": 0.050777349878761945,
"grad_norm": 1.328125,
"learning_rate": 4.8586474501108644e-05,
"loss": 1.3180426359176636,
"step": 89
},
{
"epoch": 0.051347881899871634,
"grad_norm": 1.2109375,
"learning_rate": 4.855875831485588e-05,
"loss": 1.3215968608856201,
"step": 90
},
{
"epoch": 0.051918413920981316,
"grad_norm": 1.2421875,
"learning_rate": 4.85310421286031e-05,
"loss": 1.3354041576385498,
"step": 91
},
{
"epoch": 0.052488945942091,
"grad_norm": 1.28125,
"learning_rate": 4.850332594235034e-05,
"loss": 1.3552148342132568,
"step": 92
},
{
"epoch": 0.05305947796320069,
"grad_norm": 1.2421875,
"learning_rate": 4.847560975609756e-05,
"loss": 1.2916048765182495,
"step": 93
},
{
"epoch": 0.05363000998431037,
"grad_norm": 1.3125,
"learning_rate": 4.844789356984479e-05,
"loss": 1.3131537437438965,
"step": 94
},
{
"epoch": 0.05420054200542006,
"grad_norm": 1.296875,
"learning_rate": 4.842017738359202e-05,
"loss": 1.2902660369873047,
"step": 95
},
{
"epoch": 0.05477107402652974,
"grad_norm": 1.28125,
"learning_rate": 4.839246119733925e-05,
"loss": 1.3799315690994263,
"step": 96
},
{
"epoch": 0.05534160604763942,
"grad_norm": 1.3515625,
"learning_rate": 4.836474501108647e-05,
"loss": 1.3607311248779297,
"step": 97
},
{
"epoch": 0.05591213806874911,
"grad_norm": 1.2421875,
"learning_rate": 4.833702882483371e-05,
"loss": 1.3038060665130615,
"step": 98
},
{
"epoch": 0.05648267008985879,
"grad_norm": 1.234375,
"learning_rate": 4.830931263858093e-05,
"loss": 1.318457841873169,
"step": 99
},
{
"epoch": 0.057053202110968475,
"grad_norm": 1.2890625,
"learning_rate": 4.828159645232816e-05,
"loss": 1.3159422874450684,
"step": 100
},
{
"epoch": 0.057623734132078164,
"grad_norm": 1.25,
"learning_rate": 4.825388026607539e-05,
"loss": 1.3275076150894165,
"step": 101
},
{
"epoch": 0.058194266153187846,
"grad_norm": 1.1796875,
"learning_rate": 4.8226164079822614e-05,
"loss": 1.2983460426330566,
"step": 102
},
{
"epoch": 0.058764798174297535,
"grad_norm": 1.171875,
"learning_rate": 4.819844789356985e-05,
"loss": 1.3114261627197266,
"step": 103
},
{
"epoch": 0.05933533019540722,
"grad_norm": 1.1796875,
"learning_rate": 4.817073170731707e-05,
"loss": 1.266122817993164,
"step": 104
},
{
"epoch": 0.0599058622165169,
"grad_norm": 1.234375,
"learning_rate": 4.81430155210643e-05,
"loss": 1.3662368059158325,
"step": 105
},
{
"epoch": 0.06047639423762659,
"grad_norm": 1.296875,
"learning_rate": 4.811529933481153e-05,
"loss": 1.3158059120178223,
"step": 106
},
{
"epoch": 0.06104692625873627,
"grad_norm": 1.234375,
"learning_rate": 4.808758314855876e-05,
"loss": 1.3571752309799194,
"step": 107
},
{
"epoch": 0.06161745827984596,
"grad_norm": 1.34375,
"learning_rate": 4.805986696230599e-05,
"loss": 1.3249101638793945,
"step": 108
},
{
"epoch": 0.06218799030095564,
"grad_norm": 1.21875,
"learning_rate": 4.803215077605322e-05,
"loss": 1.3386337757110596,
"step": 109
},
{
"epoch": 0.06275852232206533,
"grad_norm": 1.2421875,
"learning_rate": 4.800443458980044e-05,
"loss": 1.2874070405960083,
"step": 110
},
{
"epoch": 0.06332905434317501,
"grad_norm": 1.28125,
"learning_rate": 4.797671840354768e-05,
"loss": 1.3232687711715698,
"step": 111
},
{
"epoch": 0.0638995863642847,
"grad_norm": 1.203125,
"learning_rate": 4.79490022172949e-05,
"loss": 1.3370904922485352,
"step": 112
},
{
"epoch": 0.06447011838539438,
"grad_norm": 1.3203125,
"learning_rate": 4.792128603104213e-05,
"loss": 1.3211901187896729,
"step": 113
},
{
"epoch": 0.06504065040650407,
"grad_norm": 1.2578125,
"learning_rate": 4.789356984478936e-05,
"loss": 1.3841608762741089,
"step": 114
},
{
"epoch": 0.06561118242761375,
"grad_norm": 1.2890625,
"learning_rate": 4.786585365853658e-05,
"loss": 1.4017915725708008,
"step": 115
},
{
"epoch": 0.06618171444872344,
"grad_norm": 1.4140625,
"learning_rate": 4.783813747228382e-05,
"loss": 1.4110525846481323,
"step": 116
},
{
"epoch": 0.06675224646983312,
"grad_norm": 1.2734375,
"learning_rate": 4.781042128603104e-05,
"loss": 1.2671241760253906,
"step": 117
},
{
"epoch": 0.0673227784909428,
"grad_norm": 1.21875,
"learning_rate": 4.778270509977827e-05,
"loss": 1.2970881462097168,
"step": 118
},
{
"epoch": 0.06789331051205248,
"grad_norm": 1.3515625,
"learning_rate": 4.77549889135255e-05,
"loss": 1.2626357078552246,
"step": 119
},
{
"epoch": 0.06846384253316218,
"grad_norm": 1.3203125,
"learning_rate": 4.772727272727273e-05,
"loss": 1.2779147624969482,
"step": 120
},
{
"epoch": 0.06903437455427186,
"grad_norm": 1.1796875,
"learning_rate": 4.769955654101996e-05,
"loss": 1.308679461479187,
"step": 121
},
{
"epoch": 0.06960490657538154,
"grad_norm": 1.234375,
"learning_rate": 4.767184035476719e-05,
"loss": 1.299755573272705,
"step": 122
},
{
"epoch": 0.07017543859649122,
"grad_norm": 1.2578125,
"learning_rate": 4.764412416851441e-05,
"loss": 1.3637490272521973,
"step": 123
},
{
"epoch": 0.0707459706176009,
"grad_norm": 1.21875,
"learning_rate": 4.761640798226164e-05,
"loss": 1.3058216571807861,
"step": 124
},
{
"epoch": 0.0713165026387106,
"grad_norm": 1.2734375,
"learning_rate": 4.758869179600887e-05,
"loss": 1.3146748542785645,
"step": 125
},
{
"epoch": 0.07188703465982028,
"grad_norm": 1.2109375,
"learning_rate": 4.75609756097561e-05,
"loss": 1.2844371795654297,
"step": 126
},
{
"epoch": 0.07245756668092997,
"grad_norm": 1.25,
"learning_rate": 4.753325942350333e-05,
"loss": 1.3195525407791138,
"step": 127
},
{
"epoch": 0.07302809870203965,
"grad_norm": 1.1953125,
"learning_rate": 4.750554323725055e-05,
"loss": 1.3399118185043335,
"step": 128
},
{
"epoch": 0.07359863072314933,
"grad_norm": 1.15625,
"learning_rate": 4.747782705099779e-05,
"loss": 1.2919648885726929,
"step": 129
},
{
"epoch": 0.07416916274425903,
"grad_norm": 1.2421875,
"learning_rate": 4.745011086474501e-05,
"loss": 1.277235507965088,
"step": 130
},
{
"epoch": 0.07473969476536871,
"grad_norm": 1.25,
"learning_rate": 4.742239467849224e-05,
"loss": 1.3034231662750244,
"step": 131
},
{
"epoch": 0.07531022678647839,
"grad_norm": 1.109375,
"learning_rate": 4.739467849223947e-05,
"loss": 1.2368437051773071,
"step": 132
},
{
"epoch": 0.07588075880758807,
"grad_norm": 1.2265625,
"learning_rate": 4.73669623059867e-05,
"loss": 1.3728649616241455,
"step": 133
},
{
"epoch": 0.07645129082869775,
"grad_norm": 1.109375,
"learning_rate": 4.733924611973393e-05,
"loss": 1.2506084442138672,
"step": 134
},
{
"epoch": 0.07702182284980745,
"grad_norm": 1.2109375,
"learning_rate": 4.731152993348116e-05,
"loss": 1.2813055515289307,
"step": 135
},
{
"epoch": 0.07759235487091713,
"grad_norm": 1.203125,
"learning_rate": 4.728381374722838e-05,
"loss": 1.2894189357757568,
"step": 136
},
{
"epoch": 0.07816288689202681,
"grad_norm": 1.203125,
"learning_rate": 4.725609756097561e-05,
"loss": 1.3396642208099365,
"step": 137
},
{
"epoch": 0.0787334189131365,
"grad_norm": 1.359375,
"learning_rate": 4.722838137472284e-05,
"loss": 1.3043787479400635,
"step": 138
},
{
"epoch": 0.07930395093424618,
"grad_norm": 1.203125,
"learning_rate": 4.720066518847007e-05,
"loss": 1.308459997177124,
"step": 139
},
{
"epoch": 0.07987448295535587,
"grad_norm": 1.21875,
"learning_rate": 4.71729490022173e-05,
"loss": 1.3281002044677734,
"step": 140
},
{
"epoch": 0.08044501497646556,
"grad_norm": 1.171875,
"learning_rate": 4.714523281596452e-05,
"loss": 1.3146984577178955,
"step": 141
},
{
"epoch": 0.08101554699757524,
"grad_norm": 1.203125,
"learning_rate": 4.711751662971176e-05,
"loss": 1.3078755140304565,
"step": 142
},
{
"epoch": 0.08158607901868492,
"grad_norm": 1.21875,
"learning_rate": 4.708980044345898e-05,
"loss": 1.3129773139953613,
"step": 143
},
{
"epoch": 0.0821566110397946,
"grad_norm": 1.171875,
"learning_rate": 4.706208425720621e-05,
"loss": 1.2827129364013672,
"step": 144
},
{
"epoch": 0.0827271430609043,
"grad_norm": 1.171875,
"learning_rate": 4.703436807095344e-05,
"loss": 1.3232603073120117,
"step": 145
},
{
"epoch": 0.08329767508201398,
"grad_norm": 1.1796875,
"learning_rate": 4.700665188470067e-05,
"loss": 1.220211386680603,
"step": 146
},
{
"epoch": 0.08386820710312366,
"grad_norm": 1.2421875,
"learning_rate": 4.69789356984479e-05,
"loss": 1.3406665325164795,
"step": 147
},
{
"epoch": 0.08443873912423334,
"grad_norm": 1.1875,
"learning_rate": 4.695121951219512e-05,
"loss": 1.2698848247528076,
"step": 148
},
{
"epoch": 0.08500927114534303,
"grad_norm": 1.1171875,
"learning_rate": 4.692350332594235e-05,
"loss": 1.3016014099121094,
"step": 149
},
{
"epoch": 0.08557980316645272,
"grad_norm": 1.1484375,
"learning_rate": 4.689578713968958e-05,
"loss": 1.2674150466918945,
"step": 150
},
{
"epoch": 0.0861503351875624,
"grad_norm": 1.234375,
"learning_rate": 4.686807095343681e-05,
"loss": 1.316935420036316,
"step": 151
},
{
"epoch": 0.08672086720867209,
"grad_norm": 1.125,
"learning_rate": 4.684035476718403e-05,
"loss": 1.263155221939087,
"step": 152
},
{
"epoch": 0.08729139922978177,
"grad_norm": 1.1875,
"learning_rate": 4.681263858093127e-05,
"loss": 1.30006742477417,
"step": 153
},
{
"epoch": 0.08786193125089145,
"grad_norm": 1.296875,
"learning_rate": 4.678492239467849e-05,
"loss": 1.3325148820877075,
"step": 154
},
{
"epoch": 0.08843246327200115,
"grad_norm": 1.3125,
"learning_rate": 4.675720620842573e-05,
"loss": 1.2306278944015503,
"step": 155
},
{
"epoch": 0.08900299529311083,
"grad_norm": 1.234375,
"learning_rate": 4.672949002217295e-05,
"loss": 1.3476486206054688,
"step": 156
},
{
"epoch": 0.08957352731422051,
"grad_norm": 1.1953125,
"learning_rate": 4.670177383592018e-05,
"loss": 1.2401833534240723,
"step": 157
},
{
"epoch": 0.09014405933533019,
"grad_norm": 1.296875,
"learning_rate": 4.667405764966741e-05,
"loss": 1.3140380382537842,
"step": 158
},
{
"epoch": 0.09071459135643987,
"grad_norm": 1.1875,
"learning_rate": 4.664634146341464e-05,
"loss": 1.29231595993042,
"step": 159
},
{
"epoch": 0.09128512337754957,
"grad_norm": 1.15625,
"learning_rate": 4.661862527716186e-05,
"loss": 1.2908031940460205,
"step": 160
},
{
"epoch": 0.09185565539865925,
"grad_norm": 1.140625,
"learning_rate": 4.659090909090909e-05,
"loss": 1.259028434753418,
"step": 161
},
{
"epoch": 0.09242618741976893,
"grad_norm": 1.203125,
"learning_rate": 4.656319290465632e-05,
"loss": 1.2758322954177856,
"step": 162
},
{
"epoch": 0.09299671944087862,
"grad_norm": 1.1640625,
"learning_rate": 4.653547671840355e-05,
"loss": 1.2392590045928955,
"step": 163
},
{
"epoch": 0.0935672514619883,
"grad_norm": 1.2421875,
"learning_rate": 4.650776053215078e-05,
"loss": 1.3232059478759766,
"step": 164
},
{
"epoch": 0.094137783483098,
"grad_norm": 1.1328125,
"learning_rate": 4.6480044345898e-05,
"loss": 1.3052716255187988,
"step": 165
},
{
"epoch": 0.09470831550420768,
"grad_norm": 1.15625,
"learning_rate": 4.645232815964524e-05,
"loss": 1.2643868923187256,
"step": 166
},
{
"epoch": 0.09527884752531736,
"grad_norm": 1.1796875,
"learning_rate": 4.642461197339246e-05,
"loss": 1.3158135414123535,
"step": 167
},
{
"epoch": 0.09584937954642704,
"grad_norm": 1.1484375,
"learning_rate": 4.639689578713969e-05,
"loss": 1.2975637912750244,
"step": 168
},
{
"epoch": 0.09641991156753672,
"grad_norm": 1.09375,
"learning_rate": 4.636917960088692e-05,
"loss": 1.202270269393921,
"step": 169
},
{
"epoch": 0.09699044358864642,
"grad_norm": 1.1015625,
"learning_rate": 4.634146341463415e-05,
"loss": 1.1989184617996216,
"step": 170
},
{
"epoch": 0.0975609756097561,
"grad_norm": 1.15625,
"learning_rate": 4.631374722838138e-05,
"loss": 1.325451374053955,
"step": 171
},
{
"epoch": 0.09813150763086578,
"grad_norm": 1.1484375,
"learning_rate": 4.628603104212861e-05,
"loss": 1.3150224685668945,
"step": 172
},
{
"epoch": 0.09870203965197547,
"grad_norm": 1.1484375,
"learning_rate": 4.625831485587583e-05,
"loss": 1.2864487171173096,
"step": 173
},
{
"epoch": 0.09927257167308515,
"grad_norm": 1.140625,
"learning_rate": 4.623059866962306e-05,
"loss": 1.3033939599990845,
"step": 174
},
{
"epoch": 0.09984310369419484,
"grad_norm": 1.109375,
"learning_rate": 4.620288248337029e-05,
"loss": 1.2654147148132324,
"step": 175
},
{
"epoch": 0.10041363571530453,
"grad_norm": 1.125,
"learning_rate": 4.617516629711752e-05,
"loss": 1.2905241250991821,
"step": 176
},
{
"epoch": 0.10098416773641421,
"grad_norm": 1.1484375,
"learning_rate": 4.614745011086475e-05,
"loss": 1.2881019115447998,
"step": 177
},
{
"epoch": 0.10155469975752389,
"grad_norm": 1.15625,
"learning_rate": 4.611973392461197e-05,
"loss": 1.3300973176956177,
"step": 178
},
{
"epoch": 0.10212523177863357,
"grad_norm": 1.1484375,
"learning_rate": 4.609201773835921e-05,
"loss": 1.3166918754577637,
"step": 179
},
{
"epoch": 0.10269576379974327,
"grad_norm": 1.078125,
"learning_rate": 4.606430155210643e-05,
"loss": 1.2149487733840942,
"step": 180
},
{
"epoch": 0.10326629582085295,
"grad_norm": 1.1171875,
"learning_rate": 4.603658536585366e-05,
"loss": 1.284995198249817,
"step": 181
},
{
"epoch": 0.10383682784196263,
"grad_norm": 1.171875,
"learning_rate": 4.600886917960089e-05,
"loss": 1.3197823762893677,
"step": 182
},
{
"epoch": 0.10440735986307231,
"grad_norm": 1.09375,
"learning_rate": 4.598115299334812e-05,
"loss": 1.2414249181747437,
"step": 183
},
{
"epoch": 0.104977891884182,
"grad_norm": 1.1953125,
"learning_rate": 4.595343680709535e-05,
"loss": 1.2936391830444336,
"step": 184
},
{
"epoch": 0.10554842390529169,
"grad_norm": 1.203125,
"learning_rate": 4.592572062084257e-05,
"loss": 1.2889211177825928,
"step": 185
},
{
"epoch": 0.10611895592640137,
"grad_norm": 1.1640625,
"learning_rate": 4.58980044345898e-05,
"loss": 1.2958948612213135,
"step": 186
},
{
"epoch": 0.10668948794751106,
"grad_norm": 1.15625,
"learning_rate": 4.587028824833703e-05,
"loss": 1.3174210786819458,
"step": 187
},
{
"epoch": 0.10726001996862074,
"grad_norm": 1.09375,
"learning_rate": 4.584257206208426e-05,
"loss": 1.3083107471466064,
"step": 188
},
{
"epoch": 0.10783055198973042,
"grad_norm": 1.1015625,
"learning_rate": 4.581485587583149e-05,
"loss": 1.2460663318634033,
"step": 189
},
{
"epoch": 0.10840108401084012,
"grad_norm": 1.109375,
"learning_rate": 4.578713968957872e-05,
"loss": 1.262696623802185,
"step": 190
},
{
"epoch": 0.1089716160319498,
"grad_norm": 1.1796875,
"learning_rate": 4.575942350332594e-05,
"loss": 1.290346384048462,
"step": 191
},
{
"epoch": 0.10954214805305948,
"grad_norm": 1.15625,
"learning_rate": 4.573170731707318e-05,
"loss": 1.2630096673965454,
"step": 192
},
{
"epoch": 0.11011268007416916,
"grad_norm": 1.1015625,
"learning_rate": 4.57039911308204e-05,
"loss": 1.2521231174468994,
"step": 193
},
{
"epoch": 0.11068321209527884,
"grad_norm": 1.1484375,
"learning_rate": 4.567627494456763e-05,
"loss": 1.2671630382537842,
"step": 194
},
{
"epoch": 0.11125374411638853,
"grad_norm": 1.171875,
"learning_rate": 4.564855875831486e-05,
"loss": 1.3561689853668213,
"step": 195
},
{
"epoch": 0.11182427613749822,
"grad_norm": 1.1484375,
"learning_rate": 4.562084257206209e-05,
"loss": 1.2499645948410034,
"step": 196
},
{
"epoch": 0.1123948081586079,
"grad_norm": 1.15625,
"learning_rate": 4.559312638580932e-05,
"loss": 1.2348875999450684,
"step": 197
},
{
"epoch": 0.11296534017971759,
"grad_norm": 1.1640625,
"learning_rate": 4.556541019955654e-05,
"loss": 1.322629690170288,
"step": 198
},
{
"epoch": 0.11353587220082727,
"grad_norm": 1.171875,
"learning_rate": 4.553769401330377e-05,
"loss": 1.2846410274505615,
"step": 199
},
{
"epoch": 0.11410640422193695,
"grad_norm": 1.1640625,
"learning_rate": 4.5509977827051e-05,
"loss": 1.311292052268982,
"step": 200
},
{
"epoch": 0.11467693624304665,
"grad_norm": 1.140625,
"learning_rate": 4.548226164079823e-05,
"loss": 1.2933259010314941,
"step": 201
},
{
"epoch": 0.11524746826415633,
"grad_norm": 1.21875,
"learning_rate": 4.545454545454546e-05,
"loss": 1.3615764379501343,
"step": 202
},
{
"epoch": 0.11581800028526601,
"grad_norm": 1.1328125,
"learning_rate": 4.542682926829269e-05,
"loss": 1.187692403793335,
"step": 203
},
{
"epoch": 0.11638853230637569,
"grad_norm": 1.1796875,
"learning_rate": 4.539911308203991e-05,
"loss": 1.2587438821792603,
"step": 204
},
{
"epoch": 0.11695906432748537,
"grad_norm": 1.0546875,
"learning_rate": 4.537139689578715e-05,
"loss": 1.2154557704925537,
"step": 205
},
{
"epoch": 0.11752959634859507,
"grad_norm": 1.09375,
"learning_rate": 4.534368070953437e-05,
"loss": 1.2670985460281372,
"step": 206
},
{
"epoch": 0.11810012836970475,
"grad_norm": 1.171875,
"learning_rate": 4.53159645232816e-05,
"loss": 1.292269229888916,
"step": 207
},
{
"epoch": 0.11867066039081443,
"grad_norm": 1.21875,
"learning_rate": 4.528824833702883e-05,
"loss": 1.2353066205978394,
"step": 208
},
{
"epoch": 0.11924119241192412,
"grad_norm": 1.125,
"learning_rate": 4.526053215077605e-05,
"loss": 1.2745922803878784,
"step": 209
},
{
"epoch": 0.1198117244330338,
"grad_norm": 1.109375,
"learning_rate": 4.523281596452328e-05,
"loss": 1.2637782096862793,
"step": 210
},
{
"epoch": 0.1203822564541435,
"grad_norm": 1.125,
"learning_rate": 4.520509977827051e-05,
"loss": 1.2595422267913818,
"step": 211
},
{
"epoch": 0.12095278847525318,
"grad_norm": 1.0859375,
"learning_rate": 4.517738359201774e-05,
"loss": 1.2515778541564941,
"step": 212
},
{
"epoch": 0.12152332049636286,
"grad_norm": 1.1953125,
"learning_rate": 4.514966740576497e-05,
"loss": 1.2258851528167725,
"step": 213
},
{
"epoch": 0.12209385251747254,
"grad_norm": 1.140625,
"learning_rate": 4.51219512195122e-05,
"loss": 1.2595672607421875,
"step": 214
},
{
"epoch": 0.12266438453858222,
"grad_norm": 1.125,
"learning_rate": 4.509423503325942e-05,
"loss": 1.2574856281280518,
"step": 215
},
{
"epoch": 0.12323491655969192,
"grad_norm": 1.21875,
"learning_rate": 4.506651884700666e-05,
"loss": 1.2860839366912842,
"step": 216
},
{
"epoch": 0.1238054485808016,
"grad_norm": 1.1875,
"learning_rate": 4.503880266075388e-05,
"loss": 1.2748535871505737,
"step": 217
},
{
"epoch": 0.12437598060191128,
"grad_norm": 1.1640625,
"learning_rate": 4.501108647450111e-05,
"loss": 1.2630361318588257,
"step": 218
},
{
"epoch": 0.12494651262302096,
"grad_norm": 1.1328125,
"learning_rate": 4.498337028824834e-05,
"loss": 1.2100318670272827,
"step": 219
},
{
"epoch": 0.12551704464413066,
"grad_norm": 1.1328125,
"learning_rate": 4.495565410199557e-05,
"loss": 1.279637098312378,
"step": 220
},
{
"epoch": 0.12608757666524034,
"grad_norm": 1.09375,
"learning_rate": 4.49279379157428e-05,
"loss": 1.241306185722351,
"step": 221
},
{
"epoch": 0.12665810868635002,
"grad_norm": 1.125,
"learning_rate": 4.490022172949002e-05,
"loss": 1.2467423677444458,
"step": 222
},
{
"epoch": 0.1272286407074597,
"grad_norm": 1.1640625,
"learning_rate": 4.487250554323725e-05,
"loss": 1.2398571968078613,
"step": 223
},
{
"epoch": 0.1277991727285694,
"grad_norm": 1.1171875,
"learning_rate": 4.484478935698448e-05,
"loss": 1.298073410987854,
"step": 224
},
{
"epoch": 0.12836970474967907,
"grad_norm": 1.1640625,
"learning_rate": 4.481707317073171e-05,
"loss": 1.3275305032730103,
"step": 225
},
{
"epoch": 0.12894023677078875,
"grad_norm": 1.0859375,
"learning_rate": 4.478935698447894e-05,
"loss": 1.2483649253845215,
"step": 226
},
{
"epoch": 0.12951076879189843,
"grad_norm": 1.1484375,
"learning_rate": 4.476164079822617e-05,
"loss": 1.322462797164917,
"step": 227
},
{
"epoch": 0.13008130081300814,
"grad_norm": 1.140625,
"learning_rate": 4.473392461197339e-05,
"loss": 1.2100863456726074,
"step": 228
},
{
"epoch": 0.13065183283411783,
"grad_norm": 1.1640625,
"learning_rate": 4.470620842572063e-05,
"loss": 1.249301552772522,
"step": 229
},
{
"epoch": 0.1312223648552275,
"grad_norm": 1.125,
"learning_rate": 4.467849223946785e-05,
"loss": 1.2208349704742432,
"step": 230
},
{
"epoch": 0.1317928968763372,
"grad_norm": 1.171875,
"learning_rate": 4.465077605321508e-05,
"loss": 1.2686306238174438,
"step": 231
},
{
"epoch": 0.13236342889744687,
"grad_norm": 1.1796875,
"learning_rate": 4.462305986696231e-05,
"loss": 1.2922316789627075,
"step": 232
},
{
"epoch": 0.13293396091855655,
"grad_norm": 1.171875,
"learning_rate": 4.459534368070954e-05,
"loss": 1.2734718322753906,
"step": 233
},
{
"epoch": 0.13350449293966624,
"grad_norm": 1.125,
"learning_rate": 4.456762749445677e-05,
"loss": 1.2748900651931763,
"step": 234
},
{
"epoch": 0.13407502496077592,
"grad_norm": 1.125,
"learning_rate": 4.453991130820399e-05,
"loss": 1.2857415676116943,
"step": 235
},
{
"epoch": 0.1346455569818856,
"grad_norm": 1.1640625,
"learning_rate": 4.451219512195122e-05,
"loss": 1.2689714431762695,
"step": 236
},
{
"epoch": 0.13521608900299528,
"grad_norm": 1.1171875,
"learning_rate": 4.448447893569845e-05,
"loss": 1.248453140258789,
"step": 237
},
{
"epoch": 0.13578662102410496,
"grad_norm": 1.1484375,
"learning_rate": 4.445676274944568e-05,
"loss": 1.2693870067596436,
"step": 238
},
{
"epoch": 0.13635715304521467,
"grad_norm": 1.109375,
"learning_rate": 4.442904656319291e-05,
"loss": 1.2767329216003418,
"step": 239
},
{
"epoch": 0.13692768506632436,
"grad_norm": 1.109375,
"learning_rate": 4.440133037694014e-05,
"loss": 1.2598170042037964,
"step": 240
},
{
"epoch": 0.13749821708743404,
"grad_norm": 1.140625,
"learning_rate": 4.437361419068736e-05,
"loss": 1.2850111722946167,
"step": 241
},
{
"epoch": 0.13806874910854372,
"grad_norm": 1.0625,
"learning_rate": 4.4345898004434597e-05,
"loss": 1.2005095481872559,
"step": 242
},
{
"epoch": 0.1386392811296534,
"grad_norm": 1.09375,
"learning_rate": 4.431818181818182e-05,
"loss": 1.2896265983581543,
"step": 243
},
{
"epoch": 0.13920981315076308,
"grad_norm": 1.1328125,
"learning_rate": 4.429046563192905e-05,
"loss": 1.3427916765213013,
"step": 244
},
{
"epoch": 0.13978034517187277,
"grad_norm": 1.1484375,
"learning_rate": 4.426274944567628e-05,
"loss": 1.2719500064849854,
"step": 245
},
{
"epoch": 0.14035087719298245,
"grad_norm": 1.1640625,
"learning_rate": 4.42350332594235e-05,
"loss": 1.2944797277450562,
"step": 246
},
{
"epoch": 0.14092140921409213,
"grad_norm": 1.1328125,
"learning_rate": 4.420731707317074e-05,
"loss": 1.3022198677062988,
"step": 247
},
{
"epoch": 0.1414919412352018,
"grad_norm": 1.2109375,
"learning_rate": 4.417960088691796e-05,
"loss": 1.286307454109192,
"step": 248
},
{
"epoch": 0.14206247325631152,
"grad_norm": 1.328125,
"learning_rate": 4.415188470066519e-05,
"loss": 1.3540141582489014,
"step": 249
},
{
"epoch": 0.1426330052774212,
"grad_norm": 1.0859375,
"learning_rate": 4.412416851441242e-05,
"loss": 1.2702994346618652,
"step": 250
},
{
"epoch": 0.1432035372985309,
"grad_norm": 1.140625,
"learning_rate": 4.409645232815965e-05,
"loss": 1.2684781551361084,
"step": 251
},
{
"epoch": 0.14377406931964057,
"grad_norm": 1.1484375,
"learning_rate": 4.406873614190688e-05,
"loss": 1.1907923221588135,
"step": 252
},
{
"epoch": 0.14434460134075025,
"grad_norm": 1.1328125,
"learning_rate": 4.404101995565411e-05,
"loss": 1.2790608406066895,
"step": 253
},
{
"epoch": 0.14491513336185993,
"grad_norm": 1.1015625,
"learning_rate": 4.401330376940133e-05,
"loss": 1.2878901958465576,
"step": 254
},
{
"epoch": 0.14548566538296961,
"grad_norm": 1.09375,
"learning_rate": 4.3985587583148566e-05,
"loss": 1.2305991649627686,
"step": 255
},
{
"epoch": 0.1460561974040793,
"grad_norm": 1.1796875,
"learning_rate": 4.395787139689579e-05,
"loss": 1.3150757551193237,
"step": 256
},
{
"epoch": 0.14662672942518898,
"grad_norm": 1.0390625,
"learning_rate": 4.393015521064302e-05,
"loss": 1.213336706161499,
"step": 257
},
{
"epoch": 0.14719726144629866,
"grad_norm": 1.0625,
"learning_rate": 4.390243902439025e-05,
"loss": 1.2233829498291016,
"step": 258
},
{
"epoch": 0.14776779346740837,
"grad_norm": 1.078125,
"learning_rate": 4.387472283813747e-05,
"loss": 1.1772549152374268,
"step": 259
},
{
"epoch": 0.14833832548851805,
"grad_norm": 1.171875,
"learning_rate": 4.3847006651884707e-05,
"loss": 1.3097314834594727,
"step": 260
},
{
"epoch": 0.14890885750962773,
"grad_norm": 1.0703125,
"learning_rate": 4.381929046563193e-05,
"loss": 1.3049172163009644,
"step": 261
},
{
"epoch": 0.14947938953073742,
"grad_norm": 1.125,
"learning_rate": 4.379157427937916e-05,
"loss": 1.3094444274902344,
"step": 262
},
{
"epoch": 0.1500499215518471,
"grad_norm": 1.125,
"learning_rate": 4.376385809312639e-05,
"loss": 1.3298535346984863,
"step": 263
},
{
"epoch": 0.15062045357295678,
"grad_norm": 1.0234375,
"learning_rate": 4.373614190687362e-05,
"loss": 1.2394543886184692,
"step": 264
},
{
"epoch": 0.15119098559406646,
"grad_norm": 1.1640625,
"learning_rate": 4.370842572062084e-05,
"loss": 1.2180919647216797,
"step": 265
},
{
"epoch": 0.15176151761517614,
"grad_norm": 1.140625,
"learning_rate": 4.3680709534368077e-05,
"loss": 1.2652344703674316,
"step": 266
},
{
"epoch": 0.15233204963628583,
"grad_norm": 1.09375,
"learning_rate": 4.36529933481153e-05,
"loss": 1.2816247940063477,
"step": 267
},
{
"epoch": 0.1529025816573955,
"grad_norm": 1.046875,
"learning_rate": 4.3625277161862536e-05,
"loss": 1.2074222564697266,
"step": 268
},
{
"epoch": 0.15347311367850522,
"grad_norm": 1.0859375,
"learning_rate": 4.359756097560976e-05,
"loss": 1.2124351263046265,
"step": 269
},
{
"epoch": 0.1540436456996149,
"grad_norm": 1.0859375,
"learning_rate": 4.356984478935698e-05,
"loss": 1.187751293182373,
"step": 270
},
{
"epoch": 0.15461417772072458,
"grad_norm": 1.046875,
"learning_rate": 4.354212860310422e-05,
"loss": 1.1458532810211182,
"step": 271
},
{
"epoch": 0.15518470974183426,
"grad_norm": 1.1171875,
"learning_rate": 4.351441241685144e-05,
"loss": 1.229477882385254,
"step": 272
},
{
"epoch": 0.15575524176294395,
"grad_norm": 1.2265625,
"learning_rate": 4.348669623059867e-05,
"loss": 1.2863445281982422,
"step": 273
},
{
"epoch": 0.15632577378405363,
"grad_norm": 1.0703125,
"learning_rate": 4.34589800443459e-05,
"loss": 1.226841688156128,
"step": 274
},
{
"epoch": 0.1568963058051633,
"grad_norm": 1.1640625,
"learning_rate": 4.343126385809313e-05,
"loss": 1.2147347927093506,
"step": 275
},
{
"epoch": 0.157466837826273,
"grad_norm": 1.0546875,
"learning_rate": 4.340354767184036e-05,
"loss": 1.2533400058746338,
"step": 276
},
{
"epoch": 0.15803736984738267,
"grad_norm": 1.0859375,
"learning_rate": 4.337583148558759e-05,
"loss": 1.2199838161468506,
"step": 277
},
{
"epoch": 0.15860790186849236,
"grad_norm": 1.0546875,
"learning_rate": 4.334811529933481e-05,
"loss": 1.196079969406128,
"step": 278
},
{
"epoch": 0.15917843388960207,
"grad_norm": 1.140625,
"learning_rate": 4.3320399113082046e-05,
"loss": 1.2512052059173584,
"step": 279
},
{
"epoch": 0.15974896591071175,
"grad_norm": 1.1171875,
"learning_rate": 4.329268292682927e-05,
"loss": 1.2729978561401367,
"step": 280
},
{
"epoch": 0.16031949793182143,
"grad_norm": 1.171875,
"learning_rate": 4.32649667405765e-05,
"loss": 1.2414803504943848,
"step": 281
},
{
"epoch": 0.1608900299529311,
"grad_norm": 1.125,
"learning_rate": 4.323725055432373e-05,
"loss": 1.2329685688018799,
"step": 282
},
{
"epoch": 0.1614605619740408,
"grad_norm": 1.0703125,
"learning_rate": 4.320953436807095e-05,
"loss": 1.2458125352859497,
"step": 283
},
{
"epoch": 0.16203109399515048,
"grad_norm": 1.09375,
"learning_rate": 4.318181818181819e-05,
"loss": 1.2762466669082642,
"step": 284
},
{
"epoch": 0.16260162601626016,
"grad_norm": 1.1484375,
"learning_rate": 4.315410199556541e-05,
"loss": 1.2883433103561401,
"step": 285
},
{
"epoch": 0.16317215803736984,
"grad_norm": 1.109375,
"learning_rate": 4.312638580931264e-05,
"loss": 1.261974811553955,
"step": 286
},
{
"epoch": 0.16374269005847952,
"grad_norm": 1.0625,
"learning_rate": 4.309866962305987e-05,
"loss": 1.2657639980316162,
"step": 287
},
{
"epoch": 0.1643132220795892,
"grad_norm": 1.1328125,
"learning_rate": 4.30709534368071e-05,
"loss": 1.295043706893921,
"step": 288
},
{
"epoch": 0.16488375410069891,
"grad_norm": 1.09375,
"learning_rate": 4.304323725055433e-05,
"loss": 1.2336839437484741,
"step": 289
},
{
"epoch": 0.1654542861218086,
"grad_norm": 1.09375,
"learning_rate": 4.301552106430156e-05,
"loss": 1.264127492904663,
"step": 290
},
{
"epoch": 0.16602481814291828,
"grad_norm": 1.078125,
"learning_rate": 4.298780487804878e-05,
"loss": 1.2246544361114502,
"step": 291
},
{
"epoch": 0.16659535016402796,
"grad_norm": 1.0546875,
"learning_rate": 4.2960088691796016e-05,
"loss": 1.2040233612060547,
"step": 292
},
{
"epoch": 0.16716588218513764,
"grad_norm": 1.09375,
"learning_rate": 4.293237250554324e-05,
"loss": 1.2784225940704346,
"step": 293
},
{
"epoch": 0.16773641420624733,
"grad_norm": 1.109375,
"learning_rate": 4.290465631929047e-05,
"loss": 1.3152185678482056,
"step": 294
},
{
"epoch": 0.168306946227357,
"grad_norm": 1.1015625,
"learning_rate": 4.28769401330377e-05,
"loss": 1.2193617820739746,
"step": 295
},
{
"epoch": 0.1688774782484667,
"grad_norm": 1.1015625,
"learning_rate": 4.284922394678492e-05,
"loss": 1.2813901901245117,
"step": 296
},
{
"epoch": 0.16944801026957637,
"grad_norm": 1.09375,
"learning_rate": 4.2821507760532156e-05,
"loss": 1.205044150352478,
"step": 297
},
{
"epoch": 0.17001854229068605,
"grad_norm": 1.1015625,
"learning_rate": 4.279379157427938e-05,
"loss": 1.2626889944076538,
"step": 298
},
{
"epoch": 0.17058907431179576,
"grad_norm": 1.1484375,
"learning_rate": 4.276607538802661e-05,
"loss": 1.2680320739746094,
"step": 299
},
{
"epoch": 0.17115960633290545,
"grad_norm": 1.0859375,
"learning_rate": 4.273835920177384e-05,
"loss": 1.2155548334121704,
"step": 300
},
{
"epoch": 0.17173013835401513,
"grad_norm": 1.1875,
"learning_rate": 4.271064301552107e-05,
"loss": 1.2199232578277588,
"step": 301
},
{
"epoch": 0.1723006703751248,
"grad_norm": 1.1171875,
"learning_rate": 4.26829268292683e-05,
"loss": 1.2747461795806885,
"step": 302
},
{
"epoch": 0.1728712023962345,
"grad_norm": 1.0546875,
"learning_rate": 4.2655210643015526e-05,
"loss": 1.235656976699829,
"step": 303
},
{
"epoch": 0.17344173441734417,
"grad_norm": 1.1875,
"learning_rate": 4.262749445676275e-05,
"loss": 1.3054769039154053,
"step": 304
},
{
"epoch": 0.17401226643845386,
"grad_norm": 1.125,
"learning_rate": 4.2599778270509985e-05,
"loss": 1.2325561046600342,
"step": 305
},
{
"epoch": 0.17458279845956354,
"grad_norm": 1.0546875,
"learning_rate": 4.257206208425721e-05,
"loss": 1.1963461637496948,
"step": 306
},
{
"epoch": 0.17515333048067322,
"grad_norm": 1.09375,
"learning_rate": 4.254434589800444e-05,
"loss": 1.2029732465744019,
"step": 307
},
{
"epoch": 0.1757238625017829,
"grad_norm": 1.0703125,
"learning_rate": 4.251662971175167e-05,
"loss": 1.289282202720642,
"step": 308
},
{
"epoch": 0.17629439452289258,
"grad_norm": 1.1640625,
"learning_rate": 4.248891352549889e-05,
"loss": 1.2570784091949463,
"step": 309
},
{
"epoch": 0.1768649265440023,
"grad_norm": 1.0703125,
"learning_rate": 4.2461197339246126e-05,
"loss": 1.1787132024765015,
"step": 310
},
{
"epoch": 0.17743545856511198,
"grad_norm": 1.1328125,
"learning_rate": 4.243348115299335e-05,
"loss": 1.2079870700836182,
"step": 311
},
{
"epoch": 0.17800599058622166,
"grad_norm": 1.1640625,
"learning_rate": 4.240576496674058e-05,
"loss": 1.2776343822479248,
"step": 312
},
{
"epoch": 0.17857652260733134,
"grad_norm": 1.0625,
"learning_rate": 4.237804878048781e-05,
"loss": 1.1856639385223389,
"step": 313
},
{
"epoch": 0.17914705462844102,
"grad_norm": 1.1328125,
"learning_rate": 4.235033259423504e-05,
"loss": 1.268944501876831,
"step": 314
},
{
"epoch": 0.1797175866495507,
"grad_norm": 1.140625,
"learning_rate": 4.2322616407982266e-05,
"loss": 1.2755537033081055,
"step": 315
},
{
"epoch": 0.18028811867066039,
"grad_norm": 1.0703125,
"learning_rate": 4.2294900221729496e-05,
"loss": 1.274179458618164,
"step": 316
},
{
"epoch": 0.18085865069177007,
"grad_norm": 1.1015625,
"learning_rate": 4.226718403547672e-05,
"loss": 1.2530457973480225,
"step": 317
},
{
"epoch": 0.18142918271287975,
"grad_norm": 1.1015625,
"learning_rate": 4.2239467849223955e-05,
"loss": 1.1844085454940796,
"step": 318
},
{
"epoch": 0.18199971473398943,
"grad_norm": 1.1328125,
"learning_rate": 4.221175166297118e-05,
"loss": 1.3111554384231567,
"step": 319
},
{
"epoch": 0.18257024675509914,
"grad_norm": 1.1328125,
"learning_rate": 4.21840354767184e-05,
"loss": 1.2178188562393188,
"step": 320
},
{
"epoch": 0.18314077877620882,
"grad_norm": 1.1015625,
"learning_rate": 4.2156319290465636e-05,
"loss": 1.2369928359985352,
"step": 321
},
{
"epoch": 0.1837113107973185,
"grad_norm": 1.0625,
"learning_rate": 4.212860310421286e-05,
"loss": 1.1851946115493774,
"step": 322
},
{
"epoch": 0.1842818428184282,
"grad_norm": 1.09375,
"learning_rate": 4.210088691796009e-05,
"loss": 1.2697205543518066,
"step": 323
},
{
"epoch": 0.18485237483953787,
"grad_norm": 1.078125,
"learning_rate": 4.207317073170732e-05,
"loss": 1.2498860359191895,
"step": 324
},
{
"epoch": 0.18542290686064755,
"grad_norm": 1.109375,
"learning_rate": 4.204545454545455e-05,
"loss": 1.2507086992263794,
"step": 325
},
{
"epoch": 0.18599343888175723,
"grad_norm": 1.03125,
"learning_rate": 4.201773835920178e-05,
"loss": 1.2160149812698364,
"step": 326
},
{
"epoch": 0.18656397090286692,
"grad_norm": 1.078125,
"learning_rate": 4.1990022172949006e-05,
"loss": 1.238983392715454,
"step": 327
},
{
"epoch": 0.1871345029239766,
"grad_norm": 1.03125,
"learning_rate": 4.196230598669623e-05,
"loss": 1.2306344509124756,
"step": 328
},
{
"epoch": 0.18770503494508628,
"grad_norm": 1.1015625,
"learning_rate": 4.1934589800443465e-05,
"loss": 1.27529776096344,
"step": 329
},
{
"epoch": 0.188275566966196,
"grad_norm": 1.0859375,
"learning_rate": 4.190687361419069e-05,
"loss": 1.2787272930145264,
"step": 330
},
{
"epoch": 0.18884609898730567,
"grad_norm": 1.0546875,
"learning_rate": 4.187915742793792e-05,
"loss": 1.2454849481582642,
"step": 331
},
{
"epoch": 0.18941663100841535,
"grad_norm": 1.0078125,
"learning_rate": 4.185144124168515e-05,
"loss": 1.2060352563858032,
"step": 332
},
{
"epoch": 0.18998716302952504,
"grad_norm": 1.078125,
"learning_rate": 4.182372505543237e-05,
"loss": 1.2341554164886475,
"step": 333
},
{
"epoch": 0.19055769505063472,
"grad_norm": 1.1015625,
"learning_rate": 4.1796008869179606e-05,
"loss": 1.2774791717529297,
"step": 334
},
{
"epoch": 0.1911282270717444,
"grad_norm": 1.0546875,
"learning_rate": 4.176829268292683e-05,
"loss": 1.2547677755355835,
"step": 335
},
{
"epoch": 0.19169875909285408,
"grad_norm": 1.1015625,
"learning_rate": 4.174057649667406e-05,
"loss": 1.286057472229004,
"step": 336
},
{
"epoch": 0.19226929111396376,
"grad_norm": 1.1015625,
"learning_rate": 4.171286031042129e-05,
"loss": 1.2891746759414673,
"step": 337
},
{
"epoch": 0.19283982313507345,
"grad_norm": 1.046875,
"learning_rate": 4.168514412416852e-05,
"loss": 1.2376006841659546,
"step": 338
},
{
"epoch": 0.19341035515618313,
"grad_norm": 1.0703125,
"learning_rate": 4.1657427937915746e-05,
"loss": 1.2672202587127686,
"step": 339
},
{
"epoch": 0.19398088717729284,
"grad_norm": 1.1015625,
"learning_rate": 4.1629711751662976e-05,
"loss": 1.2037293910980225,
"step": 340
},
{
"epoch": 0.19455141919840252,
"grad_norm": 1.0859375,
"learning_rate": 4.16019955654102e-05,
"loss": 1.218858003616333,
"step": 341
},
{
"epoch": 0.1951219512195122,
"grad_norm": 1.0859375,
"learning_rate": 4.1574279379157435e-05,
"loss": 1.2183986902236938,
"step": 342
},
{
"epoch": 0.19569248324062188,
"grad_norm": 1.1171875,
"learning_rate": 4.154656319290466e-05,
"loss": 1.2573124170303345,
"step": 343
},
{
"epoch": 0.19626301526173157,
"grad_norm": 1.125,
"learning_rate": 4.151884700665189e-05,
"loss": 1.21070396900177,
"step": 344
},
{
"epoch": 0.19683354728284125,
"grad_norm": 1.09375,
"learning_rate": 4.1491130820399116e-05,
"loss": 1.286003589630127,
"step": 345
},
{
"epoch": 0.19740407930395093,
"grad_norm": 1.1015625,
"learning_rate": 4.146341463414634e-05,
"loss": 1.2600152492523193,
"step": 346
},
{
"epoch": 0.1979746113250606,
"grad_norm": 1.203125,
"learning_rate": 4.1435698447893575e-05,
"loss": 1.2338290214538574,
"step": 347
},
{
"epoch": 0.1985451433461703,
"grad_norm": 1.15625,
"learning_rate": 4.14079822616408e-05,
"loss": 1.2722115516662598,
"step": 348
},
{
"epoch": 0.19911567536727998,
"grad_norm": 1.0859375,
"learning_rate": 4.138026607538803e-05,
"loss": 1.1988334655761719,
"step": 349
},
{
"epoch": 0.19968620738838969,
"grad_norm": 1.0859375,
"learning_rate": 4.135254988913526e-05,
"loss": 1.2339057922363281,
"step": 350
},
{
"epoch": 0.20025673940949937,
"grad_norm": 1.125,
"learning_rate": 4.1324833702882486e-05,
"loss": 1.2363622188568115,
"step": 351
},
{
"epoch": 0.20082727143060905,
"grad_norm": 1.0859375,
"learning_rate": 4.1297117516629716e-05,
"loss": 1.2658472061157227,
"step": 352
},
{
"epoch": 0.20139780345171873,
"grad_norm": 1.0546875,
"learning_rate": 4.1269401330376945e-05,
"loss": 1.2181835174560547,
"step": 353
},
{
"epoch": 0.20196833547282841,
"grad_norm": 1.1171875,
"learning_rate": 4.124168514412417e-05,
"loss": 1.2710312604904175,
"step": 354
},
{
"epoch": 0.2025388674939381,
"grad_norm": 1.0625,
"learning_rate": 4.12139689578714e-05,
"loss": 1.176246166229248,
"step": 355
},
{
"epoch": 0.20310939951504778,
"grad_norm": 1.0625,
"learning_rate": 4.118625277161863e-05,
"loss": 1.24937903881073,
"step": 356
},
{
"epoch": 0.20367993153615746,
"grad_norm": 1.0703125,
"learning_rate": 4.1158536585365856e-05,
"loss": 1.2401498556137085,
"step": 357
},
{
"epoch": 0.20425046355726714,
"grad_norm": 1.046875,
"learning_rate": 4.1130820399113086e-05,
"loss": 1.2015979290008545,
"step": 358
},
{
"epoch": 0.20482099557837682,
"grad_norm": 1.1484375,
"learning_rate": 4.110310421286031e-05,
"loss": 1.2495380640029907,
"step": 359
},
{
"epoch": 0.20539152759948653,
"grad_norm": 1.0703125,
"learning_rate": 4.1075388026607545e-05,
"loss": 1.2646973133087158,
"step": 360
},
{
"epoch": 0.20596205962059622,
"grad_norm": 1.046875,
"learning_rate": 4.104767184035477e-05,
"loss": 1.2007383108139038,
"step": 361
},
{
"epoch": 0.2065325916417059,
"grad_norm": 1.046875,
"learning_rate": 4.1019955654102e-05,
"loss": 1.226219892501831,
"step": 362
},
{
"epoch": 0.20710312366281558,
"grad_norm": 1.0703125,
"learning_rate": 4.0992239467849226e-05,
"loss": 1.306444525718689,
"step": 363
},
{
"epoch": 0.20767365568392526,
"grad_norm": 1.0625,
"learning_rate": 4.0964523281596456e-05,
"loss": 1.2141070365905762,
"step": 364
},
{
"epoch": 0.20824418770503494,
"grad_norm": 1.0390625,
"learning_rate": 4.0936807095343685e-05,
"loss": 1.2149772644042969,
"step": 365
},
{
"epoch": 0.20881471972614463,
"grad_norm": 1.0703125,
"learning_rate": 4.0909090909090915e-05,
"loss": 1.2671623229980469,
"step": 366
},
{
"epoch": 0.2093852517472543,
"grad_norm": 1.0546875,
"learning_rate": 4.088137472283814e-05,
"loss": 1.2434954643249512,
"step": 367
},
{
"epoch": 0.209955783768364,
"grad_norm": 1.03125,
"learning_rate": 4.085365853658537e-05,
"loss": 1.2326661348342896,
"step": 368
},
{
"epoch": 0.21052631578947367,
"grad_norm": 1.0390625,
"learning_rate": 4.0825942350332596e-05,
"loss": 1.2969672679901123,
"step": 369
},
{
"epoch": 0.21052631578947367,
"eval_loss": 1.238897681236267,
"eval_runtime": 80.0789,
"eval_samples_per_second": 11.938,
"eval_steps_per_second": 2.985,
"step": 369
},
{
"epoch": 0.21109684781058338,
"grad_norm": 1.0078125,
"learning_rate": 4.0798226164079826e-05,
"loss": 1.203234076499939,
"step": 370
},
{
"epoch": 0.21166737983169306,
"grad_norm": 1.0390625,
"learning_rate": 4.0770509977827055e-05,
"loss": 1.2333259582519531,
"step": 371
},
{
"epoch": 0.21223791185280275,
"grad_norm": 1.046875,
"learning_rate": 4.074279379157428e-05,
"loss": 1.2060984373092651,
"step": 372
},
{
"epoch": 0.21280844387391243,
"grad_norm": 1.0234375,
"learning_rate": 4.0715077605321514e-05,
"loss": 1.1909129619598389,
"step": 373
},
{
"epoch": 0.2133789758950221,
"grad_norm": 1.046875,
"learning_rate": 4.068736141906874e-05,
"loss": 1.2396963834762573,
"step": 374
},
{
"epoch": 0.2139495079161318,
"grad_norm": 1.0078125,
"learning_rate": 4.0659645232815966e-05,
"loss": 1.1830250024795532,
"step": 375
},
{
"epoch": 0.21452003993724147,
"grad_norm": 1.03125,
"learning_rate": 4.0631929046563196e-05,
"loss": 1.207044005393982,
"step": 376
},
{
"epoch": 0.21509057195835116,
"grad_norm": 1.203125,
"learning_rate": 4.0604212860310425e-05,
"loss": 1.2795757055282593,
"step": 377
},
{
"epoch": 0.21566110397946084,
"grad_norm": 1.0546875,
"learning_rate": 4.057649667405765e-05,
"loss": 1.2492969036102295,
"step": 378
},
{
"epoch": 0.21623163600057052,
"grad_norm": 1.0546875,
"learning_rate": 4.0548780487804884e-05,
"loss": 1.3094936609268188,
"step": 379
},
{
"epoch": 0.21680216802168023,
"grad_norm": 1.0390625,
"learning_rate": 4.052106430155211e-05,
"loss": 1.2260823249816895,
"step": 380
},
{
"epoch": 0.2173727000427899,
"grad_norm": 1.078125,
"learning_rate": 4.0493348115299336e-05,
"loss": 1.2405587434768677,
"step": 381
},
{
"epoch": 0.2179432320638996,
"grad_norm": 1.09375,
"learning_rate": 4.0465631929046566e-05,
"loss": 1.1963216066360474,
"step": 382
},
{
"epoch": 0.21851376408500928,
"grad_norm": 1.1484375,
"learning_rate": 4.043791574279379e-05,
"loss": 1.2458081245422363,
"step": 383
},
{
"epoch": 0.21908429610611896,
"grad_norm": 1.0859375,
"learning_rate": 4.0410199556541025e-05,
"loss": 1.1974573135375977,
"step": 384
},
{
"epoch": 0.21965482812722864,
"grad_norm": 1.0390625,
"learning_rate": 4.038248337028825e-05,
"loss": 1.2237815856933594,
"step": 385
},
{
"epoch": 0.22022536014833832,
"grad_norm": 1.03125,
"learning_rate": 4.035476718403548e-05,
"loss": 1.2369771003723145,
"step": 386
},
{
"epoch": 0.220795892169448,
"grad_norm": 1.0859375,
"learning_rate": 4.0327050997782706e-05,
"loss": 1.2545832395553589,
"step": 387
},
{
"epoch": 0.2213664241905577,
"grad_norm": 1.03125,
"learning_rate": 4.0299334811529936e-05,
"loss": 1.2126426696777344,
"step": 388
},
{
"epoch": 0.22193695621166737,
"grad_norm": 1.0625,
"learning_rate": 4.0271618625277165e-05,
"loss": 1.2321901321411133,
"step": 389
},
{
"epoch": 0.22250748823277705,
"grad_norm": 1.03125,
"learning_rate": 4.0243902439024395e-05,
"loss": 1.2315490245819092,
"step": 390
},
{
"epoch": 0.22307802025388676,
"grad_norm": 1.0390625,
"learning_rate": 4.021618625277162e-05,
"loss": 1.1859689950942993,
"step": 391
},
{
"epoch": 0.22364855227499644,
"grad_norm": 1.03125,
"learning_rate": 4.018847006651885e-05,
"loss": 1.2416760921478271,
"step": 392
},
{
"epoch": 0.22421908429610612,
"grad_norm": 1.1015625,
"learning_rate": 4.0160753880266076e-05,
"loss": 1.3080382347106934,
"step": 393
},
{
"epoch": 0.2247896163172158,
"grad_norm": 1.0625,
"learning_rate": 4.0133037694013306e-05,
"loss": 1.2275526523590088,
"step": 394
},
{
"epoch": 0.2253601483383255,
"grad_norm": 1.03125,
"learning_rate": 4.0105321507760535e-05,
"loss": 1.2734044790267944,
"step": 395
},
{
"epoch": 0.22593068035943517,
"grad_norm": 1.09375,
"learning_rate": 4.007760532150776e-05,
"loss": 1.2480955123901367,
"step": 396
},
{
"epoch": 0.22650121238054485,
"grad_norm": 1.078125,
"learning_rate": 4.0049889135254994e-05,
"loss": 1.2629410028457642,
"step": 397
},
{
"epoch": 0.22707174440165453,
"grad_norm": 1.046875,
"learning_rate": 4.002217294900222e-05,
"loss": 1.190090537071228,
"step": 398
},
{
"epoch": 0.22764227642276422,
"grad_norm": 1.0859375,
"learning_rate": 3.9994456762749446e-05,
"loss": 1.2843146324157715,
"step": 399
},
{
"epoch": 0.2282128084438739,
"grad_norm": 1.0546875,
"learning_rate": 3.9966740576496676e-05,
"loss": 1.2836047410964966,
"step": 400
},
{
"epoch": 0.2287833404649836,
"grad_norm": 1.0234375,
"learning_rate": 3.9939024390243905e-05,
"loss": 1.1873021125793457,
"step": 401
},
{
"epoch": 0.2293538724860933,
"grad_norm": 1.03125,
"learning_rate": 3.9911308203991135e-05,
"loss": 1.228004813194275,
"step": 402
},
{
"epoch": 0.22992440450720297,
"grad_norm": 1.0078125,
"learning_rate": 3.9883592017738364e-05,
"loss": 1.2318588495254517,
"step": 403
},
{
"epoch": 0.23049493652831265,
"grad_norm": 1.046875,
"learning_rate": 3.985587583148559e-05,
"loss": 1.218421220779419,
"step": 404
},
{
"epoch": 0.23106546854942234,
"grad_norm": 1.0625,
"learning_rate": 3.9828159645232816e-05,
"loss": 1.3068960905075073,
"step": 405
},
{
"epoch": 0.23163600057053202,
"grad_norm": 1.0234375,
"learning_rate": 3.9800443458980046e-05,
"loss": 1.2189011573791504,
"step": 406
},
{
"epoch": 0.2322065325916417,
"grad_norm": 1.046875,
"learning_rate": 3.9772727272727275e-05,
"loss": 1.2019367218017578,
"step": 407
},
{
"epoch": 0.23277706461275138,
"grad_norm": 1.046875,
"learning_rate": 3.9745011086474505e-05,
"loss": 1.2285387516021729,
"step": 408
},
{
"epoch": 0.23334759663386107,
"grad_norm": 1.015625,
"learning_rate": 3.971729490022173e-05,
"loss": 1.1963067054748535,
"step": 409
},
{
"epoch": 0.23391812865497075,
"grad_norm": 1.078125,
"learning_rate": 3.9689578713968964e-05,
"loss": 1.3005050420761108,
"step": 410
},
{
"epoch": 0.23448866067608046,
"grad_norm": 1.0234375,
"learning_rate": 3.9661862527716186e-05,
"loss": 1.2429478168487549,
"step": 411
},
{
"epoch": 0.23505919269719014,
"grad_norm": 1.046875,
"learning_rate": 3.9634146341463416e-05,
"loss": 1.2445229291915894,
"step": 412
},
{
"epoch": 0.23562972471829982,
"grad_norm": 1.046875,
"learning_rate": 3.9606430155210645e-05,
"loss": 1.2569499015808105,
"step": 413
},
{
"epoch": 0.2362002567394095,
"grad_norm": 1.0,
"learning_rate": 3.9578713968957875e-05,
"loss": 1.232776165008545,
"step": 414
},
{
"epoch": 0.23677078876051919,
"grad_norm": 1.03125,
"learning_rate": 3.9550997782705104e-05,
"loss": 1.2104380130767822,
"step": 415
},
{
"epoch": 0.23734132078162887,
"grad_norm": 1.1015625,
"learning_rate": 3.952328159645233e-05,
"loss": 1.2908308506011963,
"step": 416
},
{
"epoch": 0.23791185280273855,
"grad_norm": 1.0625,
"learning_rate": 3.9495565410199557e-05,
"loss": 1.1678047180175781,
"step": 417
},
{
"epoch": 0.23848238482384823,
"grad_norm": 1.0859375,
"learning_rate": 3.9467849223946786e-05,
"loss": 1.310725212097168,
"step": 418
},
{
"epoch": 0.2390529168449579,
"grad_norm": 1.0546875,
"learning_rate": 3.9440133037694015e-05,
"loss": 1.2618491649627686,
"step": 419
},
{
"epoch": 0.2396234488660676,
"grad_norm": 1.0234375,
"learning_rate": 3.9412416851441245e-05,
"loss": 1.1795238256454468,
"step": 420
},
{
"epoch": 0.2401939808871773,
"grad_norm": 1.0546875,
"learning_rate": 3.9384700665188474e-05,
"loss": 1.2187573909759521,
"step": 421
},
{
"epoch": 0.240764512908287,
"grad_norm": 1.03125,
"learning_rate": 3.93569844789357e-05,
"loss": 1.2171461582183838,
"step": 422
},
{
"epoch": 0.24133504492939667,
"grad_norm": 1.046875,
"learning_rate": 3.932926829268293e-05,
"loss": 1.2295634746551514,
"step": 423
},
{
"epoch": 0.24190557695050635,
"grad_norm": 1.0859375,
"learning_rate": 3.9301552106430156e-05,
"loss": 1.2483271360397339,
"step": 424
},
{
"epoch": 0.24247610897161603,
"grad_norm": 1.0546875,
"learning_rate": 3.9273835920177385e-05,
"loss": 1.1881691217422485,
"step": 425
},
{
"epoch": 0.24304664099272572,
"grad_norm": 1.03125,
"learning_rate": 3.9246119733924615e-05,
"loss": 1.1997624635696411,
"step": 426
},
{
"epoch": 0.2436171730138354,
"grad_norm": 1.1015625,
"learning_rate": 3.9218403547671844e-05,
"loss": 1.2510207891464233,
"step": 427
},
{
"epoch": 0.24418770503494508,
"grad_norm": 1.046875,
"learning_rate": 3.9190687361419074e-05,
"loss": 1.2188156843185425,
"step": 428
},
{
"epoch": 0.24475823705605476,
"grad_norm": 1.0234375,
"learning_rate": 3.9162971175166297e-05,
"loss": 1.228477954864502,
"step": 429
},
{
"epoch": 0.24532876907716444,
"grad_norm": 1.1171875,
"learning_rate": 3.9135254988913526e-05,
"loss": 1.3039709329605103,
"step": 430
},
{
"epoch": 0.24589930109827415,
"grad_norm": 1.0703125,
"learning_rate": 3.9107538802660755e-05,
"loss": 1.2193942070007324,
"step": 431
},
{
"epoch": 0.24646983311938384,
"grad_norm": 1.0859375,
"learning_rate": 3.9079822616407985e-05,
"loss": 1.2380352020263672,
"step": 432
},
{
"epoch": 0.24704036514049352,
"grad_norm": 1.046875,
"learning_rate": 3.905210643015521e-05,
"loss": 1.1670141220092773,
"step": 433
},
{
"epoch": 0.2476108971616032,
"grad_norm": 1.0625,
"learning_rate": 3.9024390243902444e-05,
"loss": 1.2406682968139648,
"step": 434
},
{
"epoch": 0.24818142918271288,
"grad_norm": 1.0625,
"learning_rate": 3.8996674057649667e-05,
"loss": 1.200782060623169,
"step": 435
},
{
"epoch": 0.24875196120382256,
"grad_norm": 1.046875,
"learning_rate": 3.89689578713969e-05,
"loss": 1.1442952156066895,
"step": 436
},
{
"epoch": 0.24932249322493225,
"grad_norm": 1.03125,
"learning_rate": 3.8941241685144125e-05,
"loss": 1.15338134765625,
"step": 437
},
{
"epoch": 0.24989302524604193,
"grad_norm": 0.98828125,
"learning_rate": 3.8913525498891355e-05,
"loss": 1.1609077453613281,
"step": 438
},
{
"epoch": 0.25046355726715164,
"grad_norm": 1.1015625,
"learning_rate": 3.8885809312638584e-05,
"loss": 1.257835030555725,
"step": 439
},
{
"epoch": 0.2510340892882613,
"grad_norm": 1.0859375,
"learning_rate": 3.8858093126385814e-05,
"loss": 1.2244375944137573,
"step": 440
},
{
"epoch": 0.251604621309371,
"grad_norm": 1.0859375,
"learning_rate": 3.8830376940133037e-05,
"loss": 1.2138961553573608,
"step": 441
},
{
"epoch": 0.2521751533304807,
"grad_norm": 1.0546875,
"learning_rate": 3.8802660753880266e-05,
"loss": 1.240128755569458,
"step": 442
},
{
"epoch": 0.25274568535159037,
"grad_norm": 1.0,
"learning_rate": 3.8774944567627496e-05,
"loss": 1.2070982456207275,
"step": 443
},
{
"epoch": 0.25331621737270005,
"grad_norm": 1.0546875,
"learning_rate": 3.8747228381374725e-05,
"loss": 1.2733830213546753,
"step": 444
},
{
"epoch": 0.25388674939380973,
"grad_norm": 1.046875,
"learning_rate": 3.8719512195121954e-05,
"loss": 1.1820507049560547,
"step": 445
},
{
"epoch": 0.2544572814149194,
"grad_norm": 1.0078125,
"learning_rate": 3.869179600886918e-05,
"loss": 1.196885108947754,
"step": 446
},
{
"epoch": 0.2550278134360291,
"grad_norm": 1.03125,
"learning_rate": 3.866407982261641e-05,
"loss": 1.1905972957611084,
"step": 447
},
{
"epoch": 0.2555983454571388,
"grad_norm": 1.09375,
"learning_rate": 3.8636363636363636e-05,
"loss": 1.2579684257507324,
"step": 448
},
{
"epoch": 0.25616887747824846,
"grad_norm": 1.0,
"learning_rate": 3.8608647450110866e-05,
"loss": 1.1727596521377563,
"step": 449
},
{
"epoch": 0.25673940949935814,
"grad_norm": 1.015625,
"learning_rate": 3.8580931263858095e-05,
"loss": 1.1504234075546265,
"step": 450
},
{
"epoch": 0.2573099415204678,
"grad_norm": 1.015625,
"learning_rate": 3.8553215077605324e-05,
"loss": 1.1405715942382812,
"step": 451
},
{
"epoch": 0.2578804735415775,
"grad_norm": 0.98828125,
"learning_rate": 3.8525498891352554e-05,
"loss": 1.220837116241455,
"step": 452
},
{
"epoch": 0.2584510055626872,
"grad_norm": 1.0234375,
"learning_rate": 3.8497782705099777e-05,
"loss": 1.1962711811065674,
"step": 453
},
{
"epoch": 0.25902153758379687,
"grad_norm": 1.015625,
"learning_rate": 3.8470066518847006e-05,
"loss": 1.1877164840698242,
"step": 454
},
{
"epoch": 0.25959206960490655,
"grad_norm": 1.046875,
"learning_rate": 3.8442350332594236e-05,
"loss": 1.2504132986068726,
"step": 455
},
{
"epoch": 0.2601626016260163,
"grad_norm": 1.015625,
"learning_rate": 3.8414634146341465e-05,
"loss": 1.1902315616607666,
"step": 456
},
{
"epoch": 0.26073313364712597,
"grad_norm": 1.03125,
"learning_rate": 3.8386917960088694e-05,
"loss": 1.2856203317642212,
"step": 457
},
{
"epoch": 0.26130366566823565,
"grad_norm": 1.0625,
"learning_rate": 3.8359201773835924e-05,
"loss": 1.2528060674667358,
"step": 458
},
{
"epoch": 0.26187419768934533,
"grad_norm": 1.078125,
"learning_rate": 3.833148558758315e-05,
"loss": 1.1831871271133423,
"step": 459
},
{
"epoch": 0.262444729710455,
"grad_norm": 1.015625,
"learning_rate": 3.830376940133038e-05,
"loss": 1.1781988143920898,
"step": 460
},
{
"epoch": 0.2630152617315647,
"grad_norm": 1.015625,
"learning_rate": 3.8276053215077606e-05,
"loss": 1.193709373474121,
"step": 461
},
{
"epoch": 0.2635857937526744,
"grad_norm": 1.078125,
"learning_rate": 3.8248337028824835e-05,
"loss": 1.1997225284576416,
"step": 462
},
{
"epoch": 0.26415632577378406,
"grad_norm": 1.0078125,
"learning_rate": 3.8220620842572064e-05,
"loss": 1.159136176109314,
"step": 463
},
{
"epoch": 0.26472685779489374,
"grad_norm": 1.015625,
"learning_rate": 3.8192904656319294e-05,
"loss": 1.242883324623108,
"step": 464
},
{
"epoch": 0.2652973898160034,
"grad_norm": 1.0703125,
"learning_rate": 3.8165188470066523e-05,
"loss": 1.2907770872116089,
"step": 465
},
{
"epoch": 0.2658679218371131,
"grad_norm": 1.09375,
"learning_rate": 3.8137472283813746e-05,
"loss": 1.2596560716629028,
"step": 466
},
{
"epoch": 0.2664384538582228,
"grad_norm": 1.1171875,
"learning_rate": 3.8109756097560976e-05,
"loss": 1.2509888410568237,
"step": 467
},
{
"epoch": 0.26700898587933247,
"grad_norm": 0.97265625,
"learning_rate": 3.8082039911308205e-05,
"loss": 1.2029120922088623,
"step": 468
},
{
"epoch": 0.26757951790044215,
"grad_norm": 1.015625,
"learning_rate": 3.8054323725055435e-05,
"loss": 1.210568904876709,
"step": 469
},
{
"epoch": 0.26815004992155184,
"grad_norm": 0.9921875,
"learning_rate": 3.8026607538802664e-05,
"loss": 1.1661216020584106,
"step": 470
},
{
"epoch": 0.2687205819426615,
"grad_norm": 1.03125,
"learning_rate": 3.7998891352549893e-05,
"loss": 1.229252576828003,
"step": 471
},
{
"epoch": 0.2692911139637712,
"grad_norm": 1.0546875,
"learning_rate": 3.7971175166297116e-05,
"loss": 1.209242343902588,
"step": 472
},
{
"epoch": 0.2698616459848809,
"grad_norm": 1.0390625,
"learning_rate": 3.794345898004435e-05,
"loss": 1.2709503173828125,
"step": 473
},
{
"epoch": 0.27043217800599056,
"grad_norm": 1.078125,
"learning_rate": 3.7915742793791575e-05,
"loss": 1.2316001653671265,
"step": 474
},
{
"epoch": 0.27100271002710025,
"grad_norm": 1.03125,
"learning_rate": 3.7888026607538805e-05,
"loss": 1.2138065099716187,
"step": 475
},
{
"epoch": 0.27157324204820993,
"grad_norm": 1.0,
"learning_rate": 3.7860310421286034e-05,
"loss": 1.1936984062194824,
"step": 476
},
{
"epoch": 0.27214377406931967,
"grad_norm": 1.015625,
"learning_rate": 3.783259423503326e-05,
"loss": 1.2338573932647705,
"step": 477
},
{
"epoch": 0.27271430609042935,
"grad_norm": 1.046875,
"learning_rate": 3.780487804878049e-05,
"loss": 1.2421263456344604,
"step": 478
},
{
"epoch": 0.27328483811153903,
"grad_norm": 1.03125,
"learning_rate": 3.7777161862527716e-05,
"loss": 1.2414464950561523,
"step": 479
},
{
"epoch": 0.2738553701326487,
"grad_norm": 0.99609375,
"learning_rate": 3.7749445676274945e-05,
"loss": 1.2261340618133545,
"step": 480
},
{
"epoch": 0.2744259021537584,
"grad_norm": 1.0234375,
"learning_rate": 3.7721729490022175e-05,
"loss": 1.208221435546875,
"step": 481
},
{
"epoch": 0.2749964341748681,
"grad_norm": 1.0234375,
"learning_rate": 3.7694013303769404e-05,
"loss": 1.2820276021957397,
"step": 482
},
{
"epoch": 0.27556696619597776,
"grad_norm": 1.0703125,
"learning_rate": 3.7666297117516633e-05,
"loss": 1.262161374092102,
"step": 483
},
{
"epoch": 0.27613749821708744,
"grad_norm": 0.9921875,
"learning_rate": 3.763858093126386e-05,
"loss": 1.2242916822433472,
"step": 484
},
{
"epoch": 0.2767080302381971,
"grad_norm": 1.0078125,
"learning_rate": 3.7610864745011086e-05,
"loss": 1.1797833442687988,
"step": 485
},
{
"epoch": 0.2772785622593068,
"grad_norm": 1.0546875,
"learning_rate": 3.758314855875832e-05,
"loss": 1.2725660800933838,
"step": 486
},
{
"epoch": 0.2778490942804165,
"grad_norm": 1.015625,
"learning_rate": 3.7555432372505545e-05,
"loss": 1.195313572883606,
"step": 487
},
{
"epoch": 0.27841962630152617,
"grad_norm": 0.94921875,
"learning_rate": 3.7527716186252774e-05,
"loss": 1.1661468744277954,
"step": 488
},
{
"epoch": 0.27899015832263585,
"grad_norm": 1.0390625,
"learning_rate": 3.7500000000000003e-05,
"loss": 1.2072978019714355,
"step": 489
},
{
"epoch": 0.27956069034374553,
"grad_norm": 0.98828125,
"learning_rate": 3.7472283813747226e-05,
"loss": 1.203414797782898,
"step": 490
},
{
"epoch": 0.2801312223648552,
"grad_norm": 1.0390625,
"learning_rate": 3.7444567627494456e-05,
"loss": 1.2426180839538574,
"step": 491
},
{
"epoch": 0.2807017543859649,
"grad_norm": 1.015625,
"learning_rate": 3.7416851441241685e-05,
"loss": 1.232536792755127,
"step": 492
},
{
"epoch": 0.2812722864070746,
"grad_norm": 1.03125,
"learning_rate": 3.7389135254988915e-05,
"loss": 1.266850471496582,
"step": 493
},
{
"epoch": 0.28184281842818426,
"grad_norm": 1.046875,
"learning_rate": 3.7361419068736144e-05,
"loss": 1.2585172653198242,
"step": 494
},
{
"epoch": 0.28241335044929394,
"grad_norm": 0.99609375,
"learning_rate": 3.7333702882483374e-05,
"loss": 1.2028322219848633,
"step": 495
},
{
"epoch": 0.2829838824704036,
"grad_norm": 1.0546875,
"learning_rate": 3.7305986696230596e-05,
"loss": 1.2268320322036743,
"step": 496
},
{
"epoch": 0.28355441449151336,
"grad_norm": 1.03125,
"learning_rate": 3.727827050997783e-05,
"loss": 1.2339527606964111,
"step": 497
},
{
"epoch": 0.28412494651262304,
"grad_norm": 1.046875,
"learning_rate": 3.7250554323725055e-05,
"loss": 1.2072274684906006,
"step": 498
},
{
"epoch": 0.2846954785337327,
"grad_norm": 0.99609375,
"learning_rate": 3.7222838137472285e-05,
"loss": 1.235311508178711,
"step": 499
},
{
"epoch": 0.2852660105548424,
"grad_norm": 1.0390625,
"learning_rate": 3.7195121951219514e-05,
"loss": 1.2435599565505981,
"step": 500
},
{
"epoch": 0.2858365425759521,
"grad_norm": 1.0390625,
"learning_rate": 3.7167405764966744e-05,
"loss": 1.2234078645706177,
"step": 501
},
{
"epoch": 0.2864070745970618,
"grad_norm": 1.0703125,
"learning_rate": 3.713968957871397e-05,
"loss": 1.2654131650924683,
"step": 502
},
{
"epoch": 0.28697760661817145,
"grad_norm": 1.0625,
"learning_rate": 3.7111973392461196e-05,
"loss": 1.226614236831665,
"step": 503
},
{
"epoch": 0.28754813863928114,
"grad_norm": 1.109375,
"learning_rate": 3.7084257206208425e-05,
"loss": 1.2334555387496948,
"step": 504
},
{
"epoch": 0.2881186706603908,
"grad_norm": 1.0546875,
"learning_rate": 3.7056541019955655e-05,
"loss": 1.2169506549835205,
"step": 505
},
{
"epoch": 0.2886892026815005,
"grad_norm": 1.078125,
"learning_rate": 3.7028824833702884e-05,
"loss": 1.2664920091629028,
"step": 506
},
{
"epoch": 0.2892597347026102,
"grad_norm": 1.2109375,
"learning_rate": 3.7001108647450114e-05,
"loss": 1.2238786220550537,
"step": 507
},
{
"epoch": 0.28983026672371986,
"grad_norm": 1.03125,
"learning_rate": 3.697339246119734e-05,
"loss": 1.179901361465454,
"step": 508
},
{
"epoch": 0.29040079874482955,
"grad_norm": 1.078125,
"learning_rate": 3.6945676274944566e-05,
"loss": 1.2527443170547485,
"step": 509
},
{
"epoch": 0.29097133076593923,
"grad_norm": 1.0234375,
"learning_rate": 3.69179600886918e-05,
"loss": 1.2478464841842651,
"step": 510
},
{
"epoch": 0.2915418627870489,
"grad_norm": 1.1015625,
"learning_rate": 3.6890243902439025e-05,
"loss": 1.2006577253341675,
"step": 511
},
{
"epoch": 0.2921123948081586,
"grad_norm": 1.1015625,
"learning_rate": 3.6862527716186254e-05,
"loss": 1.283043384552002,
"step": 512
},
{
"epoch": 0.2926829268292683,
"grad_norm": 1.0234375,
"learning_rate": 3.6834811529933484e-05,
"loss": 1.223816156387329,
"step": 513
},
{
"epoch": 0.29325345885037796,
"grad_norm": 1.1171875,
"learning_rate": 3.6807095343680706e-05,
"loss": 1.2357165813446045,
"step": 514
},
{
"epoch": 0.29382399087148764,
"grad_norm": 1.078125,
"learning_rate": 3.677937915742794e-05,
"loss": 1.2494802474975586,
"step": 515
},
{
"epoch": 0.2943945228925973,
"grad_norm": 1.046875,
"learning_rate": 3.6751662971175165e-05,
"loss": 1.2093576192855835,
"step": 516
},
{
"epoch": 0.29496505491370706,
"grad_norm": 1.0234375,
"learning_rate": 3.6723946784922395e-05,
"loss": 1.192871332168579,
"step": 517
},
{
"epoch": 0.29553558693481674,
"grad_norm": 0.93359375,
"learning_rate": 3.6696230598669624e-05,
"loss": 1.1430253982543945,
"step": 518
},
{
"epoch": 0.2961061189559264,
"grad_norm": 1.03125,
"learning_rate": 3.6668514412416854e-05,
"loss": 1.2123762369155884,
"step": 519
},
{
"epoch": 0.2966766509770361,
"grad_norm": 1.0703125,
"learning_rate": 3.664079822616408e-05,
"loss": 1.2201260328292847,
"step": 520
},
{
"epoch": 0.2972471829981458,
"grad_norm": 1.0546875,
"learning_rate": 3.661308203991131e-05,
"loss": 1.1812068223953247,
"step": 521
},
{
"epoch": 0.29781771501925547,
"grad_norm": 1.078125,
"learning_rate": 3.6585365853658535e-05,
"loss": 1.2447538375854492,
"step": 522
},
{
"epoch": 0.29838824704036515,
"grad_norm": 1.0625,
"learning_rate": 3.655764966740577e-05,
"loss": 1.2636268138885498,
"step": 523
},
{
"epoch": 0.29895877906147483,
"grad_norm": 1.0546875,
"learning_rate": 3.6529933481152994e-05,
"loss": 1.2320729494094849,
"step": 524
},
{
"epoch": 0.2995293110825845,
"grad_norm": 1.0859375,
"learning_rate": 3.6502217294900224e-05,
"loss": 1.2655476331710815,
"step": 525
},
{
"epoch": 0.3000998431036942,
"grad_norm": 1.0390625,
"learning_rate": 3.647450110864745e-05,
"loss": 1.2109198570251465,
"step": 526
},
{
"epoch": 0.3006703751248039,
"grad_norm": 1.046875,
"learning_rate": 3.6446784922394676e-05,
"loss": 1.2380175590515137,
"step": 527
},
{
"epoch": 0.30124090714591356,
"grad_norm": 1.0234375,
"learning_rate": 3.641906873614191e-05,
"loss": 1.2023993730545044,
"step": 528
},
{
"epoch": 0.30181143916702324,
"grad_norm": 1.0234375,
"learning_rate": 3.6391352549889135e-05,
"loss": 1.239518404006958,
"step": 529
},
{
"epoch": 0.3023819711881329,
"grad_norm": 0.9921875,
"learning_rate": 3.6363636363636364e-05,
"loss": 1.2405352592468262,
"step": 530
},
{
"epoch": 0.3029525032092426,
"grad_norm": 1.0390625,
"learning_rate": 3.6335920177383594e-05,
"loss": 1.269554853439331,
"step": 531
},
{
"epoch": 0.3035230352303523,
"grad_norm": 1.0078125,
"learning_rate": 3.630820399113082e-05,
"loss": 1.256522297859192,
"step": 532
},
{
"epoch": 0.30409356725146197,
"grad_norm": 0.99609375,
"learning_rate": 3.628048780487805e-05,
"loss": 1.2245392799377441,
"step": 533
},
{
"epoch": 0.30466409927257165,
"grad_norm": 0.99609375,
"learning_rate": 3.625277161862528e-05,
"loss": 1.2256156206130981,
"step": 534
},
{
"epoch": 0.30523463129368134,
"grad_norm": 1.078125,
"learning_rate": 3.6225055432372505e-05,
"loss": 1.2551851272583008,
"step": 535
},
{
"epoch": 0.305805163314791,
"grad_norm": 1.0234375,
"learning_rate": 3.619733924611974e-05,
"loss": 1.1682400703430176,
"step": 536
},
{
"epoch": 0.30637569533590076,
"grad_norm": 1.0703125,
"learning_rate": 3.6169623059866964e-05,
"loss": 1.2278921604156494,
"step": 537
},
{
"epoch": 0.30694622735701044,
"grad_norm": 1.03125,
"learning_rate": 3.6141906873614186e-05,
"loss": 1.2167140245437622,
"step": 538
},
{
"epoch": 0.3075167593781201,
"grad_norm": 0.99609375,
"learning_rate": 3.611419068736142e-05,
"loss": 1.2471628189086914,
"step": 539
},
{
"epoch": 0.3080872913992298,
"grad_norm": 1.0546875,
"learning_rate": 3.6086474501108645e-05,
"loss": 1.2300347089767456,
"step": 540
},
{
"epoch": 0.3086578234203395,
"grad_norm": 1.015625,
"learning_rate": 3.605875831485588e-05,
"loss": 1.1582870483398438,
"step": 541
},
{
"epoch": 0.30922835544144917,
"grad_norm": 1.03125,
"learning_rate": 3.6031042128603104e-05,
"loss": 1.2606914043426514,
"step": 542
},
{
"epoch": 0.30979888746255885,
"grad_norm": 1.0,
"learning_rate": 3.6003325942350334e-05,
"loss": 1.2054803371429443,
"step": 543
},
{
"epoch": 0.31036941948366853,
"grad_norm": 1.0078125,
"learning_rate": 3.597560975609756e-05,
"loss": 1.1797690391540527,
"step": 544
},
{
"epoch": 0.3109399515047782,
"grad_norm": 1.03125,
"learning_rate": 3.594789356984479e-05,
"loss": 1.1780451536178589,
"step": 545
},
{
"epoch": 0.3115104835258879,
"grad_norm": 1.0546875,
"learning_rate": 3.5920177383592015e-05,
"loss": 1.2812529802322388,
"step": 546
},
{
"epoch": 0.3120810155469976,
"grad_norm": 1.0625,
"learning_rate": 3.589246119733925e-05,
"loss": 1.3007402420043945,
"step": 547
},
{
"epoch": 0.31265154756810726,
"grad_norm": 0.9921875,
"learning_rate": 3.5864745011086474e-05,
"loss": 1.1987743377685547,
"step": 548
},
{
"epoch": 0.31322207958921694,
"grad_norm": 1.078125,
"learning_rate": 3.583702882483371e-05,
"loss": 1.2217564582824707,
"step": 549
},
{
"epoch": 0.3137926116103266,
"grad_norm": 0.99609375,
"learning_rate": 3.580931263858093e-05,
"loss": 1.211827039718628,
"step": 550
},
{
"epoch": 0.3143631436314363,
"grad_norm": 1.03125,
"learning_rate": 3.5781596452328156e-05,
"loss": 1.2164710760116577,
"step": 551
},
{
"epoch": 0.314933675652546,
"grad_norm": 0.98828125,
"learning_rate": 3.575388026607539e-05,
"loss": 1.2393014430999756,
"step": 552
},
{
"epoch": 0.31550420767365567,
"grad_norm": 0.96875,
"learning_rate": 3.5726164079822615e-05,
"loss": 1.1759617328643799,
"step": 553
},
{
"epoch": 0.31607473969476535,
"grad_norm": 1.015625,
"learning_rate": 3.5698447893569844e-05,
"loss": 1.2184211015701294,
"step": 554
},
{
"epoch": 0.31664527171587503,
"grad_norm": 1.046875,
"learning_rate": 3.5670731707317074e-05,
"loss": 1.248216152191162,
"step": 555
},
{
"epoch": 0.3172158037369847,
"grad_norm": 1.0078125,
"learning_rate": 3.56430155210643e-05,
"loss": 1.2055684328079224,
"step": 556
},
{
"epoch": 0.3177863357580944,
"grad_norm": 1.0546875,
"learning_rate": 3.561529933481153e-05,
"loss": 1.19916832447052,
"step": 557
},
{
"epoch": 0.31835686777920413,
"grad_norm": 0.95703125,
"learning_rate": 3.558758314855876e-05,
"loss": 1.151750087738037,
"step": 558
},
{
"epoch": 0.3189273998003138,
"grad_norm": 0.99609375,
"learning_rate": 3.5559866962305985e-05,
"loss": 1.254964828491211,
"step": 559
},
{
"epoch": 0.3194979318214235,
"grad_norm": 1.0546875,
"learning_rate": 3.553215077605322e-05,
"loss": 1.251706600189209,
"step": 560
},
{
"epoch": 0.3200684638425332,
"grad_norm": 1.2265625,
"learning_rate": 3.5504434589800444e-05,
"loss": 1.1918596029281616,
"step": 561
},
{
"epoch": 0.32063899586364286,
"grad_norm": 1.0625,
"learning_rate": 3.547671840354767e-05,
"loss": 1.2538777589797974,
"step": 562
},
{
"epoch": 0.32120952788475254,
"grad_norm": 1.078125,
"learning_rate": 3.54490022172949e-05,
"loss": 1.227068543434143,
"step": 563
},
{
"epoch": 0.3217800599058622,
"grad_norm": 1.015625,
"learning_rate": 3.5421286031042125e-05,
"loss": 1.1811244487762451,
"step": 564
},
{
"epoch": 0.3223505919269719,
"grad_norm": 1.015625,
"learning_rate": 3.539356984478936e-05,
"loss": 1.162517786026001,
"step": 565
},
{
"epoch": 0.3229211239480816,
"grad_norm": 0.9921875,
"learning_rate": 3.5365853658536584e-05,
"loss": 1.1981290578842163,
"step": 566
},
{
"epoch": 0.32349165596919127,
"grad_norm": 0.9765625,
"learning_rate": 3.5338137472283814e-05,
"loss": 1.1930001974105835,
"step": 567
},
{
"epoch": 0.32406218799030095,
"grad_norm": 1.0546875,
"learning_rate": 3.531042128603104e-05,
"loss": 1.2397738695144653,
"step": 568
},
{
"epoch": 0.32463272001141064,
"grad_norm": 1.078125,
"learning_rate": 3.528270509977827e-05,
"loss": 1.273198127746582,
"step": 569
},
{
"epoch": 0.3252032520325203,
"grad_norm": 0.96875,
"learning_rate": 3.52549889135255e-05,
"loss": 1.1873741149902344,
"step": 570
},
{
"epoch": 0.32577378405363,
"grad_norm": 0.9921875,
"learning_rate": 3.522727272727273e-05,
"loss": 1.2132840156555176,
"step": 571
},
{
"epoch": 0.3263443160747397,
"grad_norm": 0.984375,
"learning_rate": 3.5199556541019954e-05,
"loss": 1.1881725788116455,
"step": 572
},
{
"epoch": 0.32691484809584936,
"grad_norm": 1.0546875,
"learning_rate": 3.517184035476719e-05,
"loss": 1.2296414375305176,
"step": 573
},
{
"epoch": 0.32748538011695905,
"grad_norm": 0.98828125,
"learning_rate": 3.514412416851441e-05,
"loss": 1.2116769552230835,
"step": 574
},
{
"epoch": 0.32805591213806873,
"grad_norm": 1.0,
"learning_rate": 3.511640798226164e-05,
"loss": 1.194542646408081,
"step": 575
},
{
"epoch": 0.3286264441591784,
"grad_norm": 1.03125,
"learning_rate": 3.508869179600887e-05,
"loss": 1.2189078330993652,
"step": 576
},
{
"epoch": 0.3291969761802881,
"grad_norm": 0.96484375,
"learning_rate": 3.5060975609756095e-05,
"loss": 1.1380560398101807,
"step": 577
},
{
"epoch": 0.32976750820139783,
"grad_norm": 1.015625,
"learning_rate": 3.503325942350333e-05,
"loss": 1.1995842456817627,
"step": 578
},
{
"epoch": 0.3303380402225075,
"grad_norm": 1.0625,
"learning_rate": 3.5005543237250554e-05,
"loss": 1.254304051399231,
"step": 579
},
{
"epoch": 0.3309085722436172,
"grad_norm": 1.0,
"learning_rate": 3.497782705099778e-05,
"loss": 1.201616883277893,
"step": 580
},
{
"epoch": 0.3314791042647269,
"grad_norm": 0.98828125,
"learning_rate": 3.495011086474501e-05,
"loss": 1.1772336959838867,
"step": 581
},
{
"epoch": 0.33204963628583656,
"grad_norm": 1.0234375,
"learning_rate": 3.492239467849224e-05,
"loss": 1.1937668323516846,
"step": 582
},
{
"epoch": 0.33262016830694624,
"grad_norm": 0.9765625,
"learning_rate": 3.489467849223947e-05,
"loss": 1.186886191368103,
"step": 583
},
{
"epoch": 0.3331907003280559,
"grad_norm": 1.0390625,
"learning_rate": 3.48669623059867e-05,
"loss": 1.2187786102294922,
"step": 584
},
{
"epoch": 0.3337612323491656,
"grad_norm": 1.0234375,
"learning_rate": 3.4839246119733924e-05,
"loss": 1.1842401027679443,
"step": 585
},
{
"epoch": 0.3343317643702753,
"grad_norm": 1.0,
"learning_rate": 3.481152993348116e-05,
"loss": 1.1953545808792114,
"step": 586
},
{
"epoch": 0.33490229639138497,
"grad_norm": 1.0078125,
"learning_rate": 3.478381374722838e-05,
"loss": 1.1909786462783813,
"step": 587
},
{
"epoch": 0.33547282841249465,
"grad_norm": 0.96484375,
"learning_rate": 3.475609756097561e-05,
"loss": 1.201062798500061,
"step": 588
},
{
"epoch": 0.33604336043360433,
"grad_norm": 0.9921875,
"learning_rate": 3.472838137472284e-05,
"loss": 1.2262158393859863,
"step": 589
},
{
"epoch": 0.336613892454714,
"grad_norm": 1.0390625,
"learning_rate": 3.4700665188470064e-05,
"loss": 1.255564570426941,
"step": 590
},
{
"epoch": 0.3371844244758237,
"grad_norm": 0.984375,
"learning_rate": 3.46729490022173e-05,
"loss": 1.1916460990905762,
"step": 591
},
{
"epoch": 0.3377549564969334,
"grad_norm": 1.03125,
"learning_rate": 3.464523281596452e-05,
"loss": 1.1728994846343994,
"step": 592
},
{
"epoch": 0.33832548851804306,
"grad_norm": 1.0234375,
"learning_rate": 3.461751662971175e-05,
"loss": 1.2145668268203735,
"step": 593
},
{
"epoch": 0.33889602053915274,
"grad_norm": 0.9921875,
"learning_rate": 3.458980044345898e-05,
"loss": 1.2174324989318848,
"step": 594
},
{
"epoch": 0.3394665525602624,
"grad_norm": 1.0390625,
"learning_rate": 3.456208425720621e-05,
"loss": 1.1968474388122559,
"step": 595
},
{
"epoch": 0.3400370845813721,
"grad_norm": 1.0078125,
"learning_rate": 3.453436807095344e-05,
"loss": 1.1793067455291748,
"step": 596
},
{
"epoch": 0.3406076166024818,
"grad_norm": 1.03125,
"learning_rate": 3.450665188470067e-05,
"loss": 1.2109010219573975,
"step": 597
},
{
"epoch": 0.3411781486235915,
"grad_norm": 1.0546875,
"learning_rate": 3.447893569844789e-05,
"loss": 1.2412149906158447,
"step": 598
},
{
"epoch": 0.3417486806447012,
"grad_norm": 1.0078125,
"learning_rate": 3.445121951219512e-05,
"loss": 1.1886482238769531,
"step": 599
},
{
"epoch": 0.3423192126658109,
"grad_norm": 1.015625,
"learning_rate": 3.442350332594235e-05,
"loss": 1.1711212396621704,
"step": 600
},
{
"epoch": 0.3428897446869206,
"grad_norm": 0.98046875,
"learning_rate": 3.4395787139689575e-05,
"loss": 1.1890015602111816,
"step": 601
},
{
"epoch": 0.34346027670803025,
"grad_norm": 0.95703125,
"learning_rate": 3.436807095343681e-05,
"loss": 1.1860285997390747,
"step": 602
},
{
"epoch": 0.34403080872913994,
"grad_norm": 0.98046875,
"learning_rate": 3.4340354767184034e-05,
"loss": 1.2001878023147583,
"step": 603
},
{
"epoch": 0.3446013407502496,
"grad_norm": 1.0234375,
"learning_rate": 3.431263858093127e-05,
"loss": 1.1815104484558105,
"step": 604
},
{
"epoch": 0.3451718727713593,
"grad_norm": 1.015625,
"learning_rate": 3.428492239467849e-05,
"loss": 1.1652307510375977,
"step": 605
},
{
"epoch": 0.345742404792469,
"grad_norm": 1.0234375,
"learning_rate": 3.425720620842572e-05,
"loss": 1.1888481378555298,
"step": 606
},
{
"epoch": 0.34631293681357866,
"grad_norm": 1.0390625,
"learning_rate": 3.422949002217295e-05,
"loss": 1.2198981046676636,
"step": 607
},
{
"epoch": 0.34688346883468835,
"grad_norm": 1.03125,
"learning_rate": 3.420177383592018e-05,
"loss": 1.2088303565979004,
"step": 608
},
{
"epoch": 0.34745400085579803,
"grad_norm": 1.046875,
"learning_rate": 3.4174057649667404e-05,
"loss": 1.2638548612594604,
"step": 609
},
{
"epoch": 0.3480245328769077,
"grad_norm": 1.0234375,
"learning_rate": 3.414634146341464e-05,
"loss": 1.2314380407333374,
"step": 610
},
{
"epoch": 0.3485950648980174,
"grad_norm": 1.0,
"learning_rate": 3.411862527716186e-05,
"loss": 1.1847796440124512,
"step": 611
},
{
"epoch": 0.3491655969191271,
"grad_norm": 1.015625,
"learning_rate": 3.409090909090909e-05,
"loss": 1.1967138051986694,
"step": 612
},
{
"epoch": 0.34973612894023676,
"grad_norm": 1.0546875,
"learning_rate": 3.406319290465632e-05,
"loss": 1.1948060989379883,
"step": 613
},
{
"epoch": 0.35030666096134644,
"grad_norm": 1.0078125,
"learning_rate": 3.4035476718403544e-05,
"loss": 1.248701810836792,
"step": 614
},
{
"epoch": 0.3508771929824561,
"grad_norm": 0.98046875,
"learning_rate": 3.400776053215078e-05,
"loss": 1.2076679468154907,
"step": 615
},
{
"epoch": 0.3514477250035658,
"grad_norm": 1.0234375,
"learning_rate": 3.3980044345898e-05,
"loss": 1.20987868309021,
"step": 616
},
{
"epoch": 0.3520182570246755,
"grad_norm": 0.9921875,
"learning_rate": 3.395232815964523e-05,
"loss": 1.1548939943313599,
"step": 617
},
{
"epoch": 0.35258878904578517,
"grad_norm": 1.03125,
"learning_rate": 3.392461197339246e-05,
"loss": 1.2160520553588867,
"step": 618
},
{
"epoch": 0.3531593210668949,
"grad_norm": 1.03125,
"learning_rate": 3.389689578713969e-05,
"loss": 1.2215287685394287,
"step": 619
},
{
"epoch": 0.3537298530880046,
"grad_norm": 1.015625,
"learning_rate": 3.386917960088692e-05,
"loss": 1.2433137893676758,
"step": 620
},
{
"epoch": 0.35430038510911427,
"grad_norm": 1.015625,
"learning_rate": 3.384146341463415e-05,
"loss": 1.2307751178741455,
"step": 621
},
{
"epoch": 0.35487091713022395,
"grad_norm": 0.99609375,
"learning_rate": 3.381374722838137e-05,
"loss": 1.1872355937957764,
"step": 622
},
{
"epoch": 0.35544144915133363,
"grad_norm": 1.03125,
"learning_rate": 3.378603104212861e-05,
"loss": 1.200265645980835,
"step": 623
},
{
"epoch": 0.3560119811724433,
"grad_norm": 1.0546875,
"learning_rate": 3.375831485587583e-05,
"loss": 1.3020355701446533,
"step": 624
},
{
"epoch": 0.356582513193553,
"grad_norm": 1.03125,
"learning_rate": 3.373059866962306e-05,
"loss": 1.1976819038391113,
"step": 625
},
{
"epoch": 0.3571530452146627,
"grad_norm": 1.0234375,
"learning_rate": 3.370288248337029e-05,
"loss": 1.1945629119873047,
"step": 626
},
{
"epoch": 0.35772357723577236,
"grad_norm": 0.9921875,
"learning_rate": 3.3675166297117514e-05,
"loss": 1.2189013957977295,
"step": 627
},
{
"epoch": 0.35829410925688204,
"grad_norm": 0.9765625,
"learning_rate": 3.364745011086475e-05,
"loss": 1.2139533758163452,
"step": 628
},
{
"epoch": 0.3588646412779917,
"grad_norm": 0.96484375,
"learning_rate": 3.361973392461197e-05,
"loss": 1.1832334995269775,
"step": 629
},
{
"epoch": 0.3594351732991014,
"grad_norm": 1.0078125,
"learning_rate": 3.35920177383592e-05,
"loss": 1.1789777278900146,
"step": 630
},
{
"epoch": 0.3600057053202111,
"grad_norm": 0.97265625,
"learning_rate": 3.356430155210643e-05,
"loss": 1.1401221752166748,
"step": 631
},
{
"epoch": 0.36057623734132077,
"grad_norm": 1.015625,
"learning_rate": 3.353658536585366e-05,
"loss": 1.2332661151885986,
"step": 632
},
{
"epoch": 0.36114676936243045,
"grad_norm": 0.9609375,
"learning_rate": 3.350886917960089e-05,
"loss": 1.1867516040802002,
"step": 633
},
{
"epoch": 0.36171730138354014,
"grad_norm": 1.0078125,
"learning_rate": 3.348115299334812e-05,
"loss": 1.2486271858215332,
"step": 634
},
{
"epoch": 0.3622878334046498,
"grad_norm": 1.0390625,
"learning_rate": 3.345343680709534e-05,
"loss": 1.1644282341003418,
"step": 635
},
{
"epoch": 0.3628583654257595,
"grad_norm": 0.98828125,
"learning_rate": 3.342572062084257e-05,
"loss": 1.1926931142807007,
"step": 636
},
{
"epoch": 0.3634288974468692,
"grad_norm": 0.98046875,
"learning_rate": 3.33980044345898e-05,
"loss": 1.2337167263031006,
"step": 637
},
{
"epoch": 0.36399942946797886,
"grad_norm": 1.0078125,
"learning_rate": 3.337028824833703e-05,
"loss": 1.2726258039474487,
"step": 638
},
{
"epoch": 0.3645699614890886,
"grad_norm": 1.046875,
"learning_rate": 3.334257206208426e-05,
"loss": 1.229848861694336,
"step": 639
},
{
"epoch": 0.3651404935101983,
"grad_norm": 0.94921875,
"learning_rate": 3.3314855875831483e-05,
"loss": 1.1424199342727661,
"step": 640
},
{
"epoch": 0.36571102553130796,
"grad_norm": 1.0234375,
"learning_rate": 3.328713968957872e-05,
"loss": 1.2158143520355225,
"step": 641
},
{
"epoch": 0.36628155755241765,
"grad_norm": 1.0234375,
"learning_rate": 3.325942350332594e-05,
"loss": 1.213433027267456,
"step": 642
},
{
"epoch": 0.36685208957352733,
"grad_norm": 1.0234375,
"learning_rate": 3.323170731707317e-05,
"loss": 1.1552369594573975,
"step": 643
},
{
"epoch": 0.367422621594637,
"grad_norm": 0.9765625,
"learning_rate": 3.32039911308204e-05,
"loss": 1.1470410823822021,
"step": 644
},
{
"epoch": 0.3679931536157467,
"grad_norm": 1.0234375,
"learning_rate": 3.317627494456763e-05,
"loss": 1.227137804031372,
"step": 645
},
{
"epoch": 0.3685636856368564,
"grad_norm": 0.98046875,
"learning_rate": 3.314855875831486e-05,
"loss": 1.1736478805541992,
"step": 646
},
{
"epoch": 0.36913421765796606,
"grad_norm": 1.0078125,
"learning_rate": 3.312084257206209e-05,
"loss": 1.2192144393920898,
"step": 647
},
{
"epoch": 0.36970474967907574,
"grad_norm": 0.97265625,
"learning_rate": 3.309312638580931e-05,
"loss": 1.1780518293380737,
"step": 648
},
{
"epoch": 0.3702752817001854,
"grad_norm": 1.015625,
"learning_rate": 3.306541019955654e-05,
"loss": 1.2205878496170044,
"step": 649
},
{
"epoch": 0.3708458137212951,
"grad_norm": 0.99609375,
"learning_rate": 3.303769401330377e-05,
"loss": 1.2226086854934692,
"step": 650
},
{
"epoch": 0.3714163457424048,
"grad_norm": 1.0078125,
"learning_rate": 3.3009977827051e-05,
"loss": 1.1905219554901123,
"step": 651
},
{
"epoch": 0.37198687776351447,
"grad_norm": 1.015625,
"learning_rate": 3.298226164079823e-05,
"loss": 1.1790423393249512,
"step": 652
},
{
"epoch": 0.37255740978462415,
"grad_norm": 0.9609375,
"learning_rate": 3.295454545454545e-05,
"loss": 1.1909444332122803,
"step": 653
},
{
"epoch": 0.37312794180573383,
"grad_norm": 0.98828125,
"learning_rate": 3.292682926829269e-05,
"loss": 1.2416154146194458,
"step": 654
},
{
"epoch": 0.3736984738268435,
"grad_norm": 1.0234375,
"learning_rate": 3.289911308203991e-05,
"loss": 1.2464513778686523,
"step": 655
},
{
"epoch": 0.3742690058479532,
"grad_norm": 1.015625,
"learning_rate": 3.287139689578714e-05,
"loss": 1.239952802658081,
"step": 656
},
{
"epoch": 0.3748395378690629,
"grad_norm": 0.99609375,
"learning_rate": 3.284368070953437e-05,
"loss": 1.2005925178527832,
"step": 657
},
{
"epoch": 0.37541006989017256,
"grad_norm": 1.0078125,
"learning_rate": 3.28159645232816e-05,
"loss": 1.2646636962890625,
"step": 658
},
{
"epoch": 0.3759806019112823,
"grad_norm": 0.98828125,
"learning_rate": 3.278824833702882e-05,
"loss": 1.203331470489502,
"step": 659
},
{
"epoch": 0.376551133932392,
"grad_norm": 0.97265625,
"learning_rate": 3.276053215077605e-05,
"loss": 1.1849339008331299,
"step": 660
},
{
"epoch": 0.37712166595350166,
"grad_norm": 1.0078125,
"learning_rate": 3.273281596452328e-05,
"loss": 1.2010148763656616,
"step": 661
},
{
"epoch": 0.37769219797461134,
"grad_norm": 0.9921875,
"learning_rate": 3.270509977827051e-05,
"loss": 1.2384660243988037,
"step": 662
},
{
"epoch": 0.378262729995721,
"grad_norm": 0.984375,
"learning_rate": 3.267738359201774e-05,
"loss": 1.2244110107421875,
"step": 663
},
{
"epoch": 0.3788332620168307,
"grad_norm": 1.015625,
"learning_rate": 3.2649667405764963e-05,
"loss": 1.2706053256988525,
"step": 664
},
{
"epoch": 0.3794037940379404,
"grad_norm": 0.98828125,
"learning_rate": 3.26219512195122e-05,
"loss": 1.2451549768447876,
"step": 665
},
{
"epoch": 0.37997432605905007,
"grad_norm": 1.015625,
"learning_rate": 3.259423503325942e-05,
"loss": 1.2653909921646118,
"step": 666
},
{
"epoch": 0.38054485808015975,
"grad_norm": 1.0078125,
"learning_rate": 3.256651884700665e-05,
"loss": 1.2227097749710083,
"step": 667
},
{
"epoch": 0.38111539010126944,
"grad_norm": 0.98828125,
"learning_rate": 3.253880266075388e-05,
"loss": 1.2289211750030518,
"step": 668
},
{
"epoch": 0.3816859221223791,
"grad_norm": 1.0078125,
"learning_rate": 3.251108647450111e-05,
"loss": 1.2068843841552734,
"step": 669
},
{
"epoch": 0.3822564541434888,
"grad_norm": 0.9453125,
"learning_rate": 3.248337028824834e-05,
"loss": 1.166361689567566,
"step": 670
},
{
"epoch": 0.3828269861645985,
"grad_norm": 0.9609375,
"learning_rate": 3.245565410199557e-05,
"loss": 1.220710277557373,
"step": 671
},
{
"epoch": 0.38339751818570816,
"grad_norm": 0.984375,
"learning_rate": 3.242793791574279e-05,
"loss": 1.1663460731506348,
"step": 672
},
{
"epoch": 0.38396805020681785,
"grad_norm": 1.0078125,
"learning_rate": 3.240022172949002e-05,
"loss": 1.1803617477416992,
"step": 673
},
{
"epoch": 0.38453858222792753,
"grad_norm": 0.9921875,
"learning_rate": 3.237250554323725e-05,
"loss": 1.1342628002166748,
"step": 674
},
{
"epoch": 0.3851091142490372,
"grad_norm": 0.9921875,
"learning_rate": 3.234478935698448e-05,
"loss": 1.2325470447540283,
"step": 675
},
{
"epoch": 0.3856796462701469,
"grad_norm": 1.0078125,
"learning_rate": 3.231707317073171e-05,
"loss": 1.1941877603530884,
"step": 676
},
{
"epoch": 0.3862501782912566,
"grad_norm": 1.0,
"learning_rate": 3.228935698447893e-05,
"loss": 1.1775301694869995,
"step": 677
},
{
"epoch": 0.38682071031236626,
"grad_norm": 1.0546875,
"learning_rate": 3.226164079822617e-05,
"loss": 1.248462438583374,
"step": 678
},
{
"epoch": 0.387391242333476,
"grad_norm": 1.03125,
"learning_rate": 3.223392461197339e-05,
"loss": 1.2440953254699707,
"step": 679
},
{
"epoch": 0.3879617743545857,
"grad_norm": 0.98046875,
"learning_rate": 3.220620842572062e-05,
"loss": 1.1706881523132324,
"step": 680
},
{
"epoch": 0.38853230637569536,
"grad_norm": 1.0390625,
"learning_rate": 3.217849223946785e-05,
"loss": 1.227694034576416,
"step": 681
},
{
"epoch": 0.38910283839680504,
"grad_norm": 1.0234375,
"learning_rate": 3.215077605321508e-05,
"loss": 1.2553303241729736,
"step": 682
},
{
"epoch": 0.3896733704179147,
"grad_norm": 0.94140625,
"learning_rate": 3.212305986696231e-05,
"loss": 1.1399942636489868,
"step": 683
},
{
"epoch": 0.3902439024390244,
"grad_norm": 0.98828125,
"learning_rate": 3.209534368070954e-05,
"loss": 1.2082273960113525,
"step": 684
},
{
"epoch": 0.3908144344601341,
"grad_norm": 1.0625,
"learning_rate": 3.206762749445676e-05,
"loss": 1.2403631210327148,
"step": 685
},
{
"epoch": 0.39138496648124377,
"grad_norm": 1.03125,
"learning_rate": 3.203991130820399e-05,
"loss": 1.1668493747711182,
"step": 686
},
{
"epoch": 0.39195549850235345,
"grad_norm": 1.0,
"learning_rate": 3.201219512195122e-05,
"loss": 1.1642647981643677,
"step": 687
},
{
"epoch": 0.39252603052346313,
"grad_norm": 0.94921875,
"learning_rate": 3.198447893569845e-05,
"loss": 1.169840693473816,
"step": 688
},
{
"epoch": 0.3930965625445728,
"grad_norm": 1.0625,
"learning_rate": 3.195676274944568e-05,
"loss": 1.1918284893035889,
"step": 689
},
{
"epoch": 0.3936670945656825,
"grad_norm": 1.0546875,
"learning_rate": 3.19290465631929e-05,
"loss": 1.2486236095428467,
"step": 690
},
{
"epoch": 0.3942376265867922,
"grad_norm": 1.0,
"learning_rate": 3.190133037694014e-05,
"loss": 1.212164044380188,
"step": 691
},
{
"epoch": 0.39480815860790186,
"grad_norm": 1.0,
"learning_rate": 3.187361419068736e-05,
"loss": 1.2184773683547974,
"step": 692
},
{
"epoch": 0.39537869062901154,
"grad_norm": 1.0625,
"learning_rate": 3.184589800443459e-05,
"loss": 1.2665815353393555,
"step": 693
},
{
"epoch": 0.3959492226501212,
"grad_norm": 1.0625,
"learning_rate": 3.181818181818182e-05,
"loss": 1.1956299543380737,
"step": 694
},
{
"epoch": 0.3965197546712309,
"grad_norm": 0.98828125,
"learning_rate": 3.179046563192905e-05,
"loss": 1.1868462562561035,
"step": 695
},
{
"epoch": 0.3970902866923406,
"grad_norm": 1.0234375,
"learning_rate": 3.176274944567628e-05,
"loss": 1.2558304071426392,
"step": 696
},
{
"epoch": 0.39766081871345027,
"grad_norm": 1.0078125,
"learning_rate": 3.17350332594235e-05,
"loss": 1.2197167873382568,
"step": 697
},
{
"epoch": 0.39823135073455995,
"grad_norm": 1.0390625,
"learning_rate": 3.170731707317073e-05,
"loss": 1.2546510696411133,
"step": 698
},
{
"epoch": 0.39880188275566963,
"grad_norm": 1.078125,
"learning_rate": 3.167960088691796e-05,
"loss": 1.2634811401367188,
"step": 699
},
{
"epoch": 0.39937241477677937,
"grad_norm": 0.953125,
"learning_rate": 3.165188470066519e-05,
"loss": 1.1409438848495483,
"step": 700
},
{
"epoch": 0.39994294679788905,
"grad_norm": 1.0,
"learning_rate": 3.162416851441242e-05,
"loss": 1.167540431022644,
"step": 701
},
{
"epoch": 0.40051347881899874,
"grad_norm": 0.98828125,
"learning_rate": 3.159645232815965e-05,
"loss": 1.2233819961547852,
"step": 702
},
{
"epoch": 0.4010840108401084,
"grad_norm": 1.0625,
"learning_rate": 3.156873614190687e-05,
"loss": 1.2183570861816406,
"step": 703
},
{
"epoch": 0.4016545428612181,
"grad_norm": 1.0234375,
"learning_rate": 3.154101995565411e-05,
"loss": 1.2039064168930054,
"step": 704
},
{
"epoch": 0.4022250748823278,
"grad_norm": 1.0234375,
"learning_rate": 3.151330376940133e-05,
"loss": 1.2583222389221191,
"step": 705
},
{
"epoch": 0.40279560690343746,
"grad_norm": 0.9765625,
"learning_rate": 3.148558758314856e-05,
"loss": 1.2133885622024536,
"step": 706
},
{
"epoch": 0.40336613892454715,
"grad_norm": 0.99609375,
"learning_rate": 3.145787139689579e-05,
"loss": 1.2497689723968506,
"step": 707
},
{
"epoch": 0.40393667094565683,
"grad_norm": 1.015625,
"learning_rate": 3.143015521064302e-05,
"loss": 1.1765098571777344,
"step": 708
},
{
"epoch": 0.4045072029667665,
"grad_norm": 0.9765625,
"learning_rate": 3.140243902439025e-05,
"loss": 1.1668319702148438,
"step": 709
},
{
"epoch": 0.4050777349878762,
"grad_norm": 1.0,
"learning_rate": 3.137472283813747e-05,
"loss": 1.1545255184173584,
"step": 710
},
{
"epoch": 0.4056482670089859,
"grad_norm": 1.0,
"learning_rate": 3.13470066518847e-05,
"loss": 1.2044893503189087,
"step": 711
},
{
"epoch": 0.40621879903009556,
"grad_norm": 0.99609375,
"learning_rate": 3.131929046563193e-05,
"loss": 1.2121517658233643,
"step": 712
},
{
"epoch": 0.40678933105120524,
"grad_norm": 1.0390625,
"learning_rate": 3.129157427937916e-05,
"loss": 1.276052713394165,
"step": 713
},
{
"epoch": 0.4073598630723149,
"grad_norm": 1.015625,
"learning_rate": 3.126385809312638e-05,
"loss": 1.1800833940505981,
"step": 714
},
{
"epoch": 0.4079303950934246,
"grad_norm": 0.984375,
"learning_rate": 3.123614190687362e-05,
"loss": 1.1513339281082153,
"step": 715
},
{
"epoch": 0.4085009271145343,
"grad_norm": 0.98046875,
"learning_rate": 3.120842572062084e-05,
"loss": 1.2298616170883179,
"step": 716
},
{
"epoch": 0.40907145913564397,
"grad_norm": 0.9765625,
"learning_rate": 3.118070953436808e-05,
"loss": 1.1709084510803223,
"step": 717
},
{
"epoch": 0.40964199115675365,
"grad_norm": 0.98828125,
"learning_rate": 3.11529933481153e-05,
"loss": 1.1676058769226074,
"step": 718
},
{
"epoch": 0.41021252317786333,
"grad_norm": 0.98828125,
"learning_rate": 3.112527716186253e-05,
"loss": 1.2025721073150635,
"step": 719
},
{
"epoch": 0.41078305519897307,
"grad_norm": 1.0390625,
"learning_rate": 3.109756097560976e-05,
"loss": 1.2218658924102783,
"step": 720
},
{
"epoch": 0.41135358722008275,
"grad_norm": 0.96875,
"learning_rate": 3.106984478935698e-05,
"loss": 1.1744896173477173,
"step": 721
},
{
"epoch": 0.41192411924119243,
"grad_norm": 0.94921875,
"learning_rate": 3.104212860310421e-05,
"loss": 1.1989339590072632,
"step": 722
},
{
"epoch": 0.4124946512623021,
"grad_norm": 0.9765625,
"learning_rate": 3.101441241685144e-05,
"loss": 1.2189137935638428,
"step": 723
},
{
"epoch": 0.4130651832834118,
"grad_norm": 0.9921875,
"learning_rate": 3.098669623059867e-05,
"loss": 1.2155076265335083,
"step": 724
},
{
"epoch": 0.4136357153045215,
"grad_norm": 0.9921875,
"learning_rate": 3.09589800443459e-05,
"loss": 1.1465799808502197,
"step": 725
},
{
"epoch": 0.41420624732563116,
"grad_norm": 0.98828125,
"learning_rate": 3.093126385809313e-05,
"loss": 1.2145007848739624,
"step": 726
},
{
"epoch": 0.41477677934674084,
"grad_norm": 0.984375,
"learning_rate": 3.090354767184035e-05,
"loss": 1.2057294845581055,
"step": 727
},
{
"epoch": 0.4153473113678505,
"grad_norm": 0.9921875,
"learning_rate": 3.087583148558759e-05,
"loss": 1.2041752338409424,
"step": 728
},
{
"epoch": 0.4159178433889602,
"grad_norm": 0.9765625,
"learning_rate": 3.084811529933481e-05,
"loss": 1.1989641189575195,
"step": 729
},
{
"epoch": 0.4164883754100699,
"grad_norm": 0.9453125,
"learning_rate": 3.082039911308204e-05,
"loss": 1.188431739807129,
"step": 730
},
{
"epoch": 0.41705890743117957,
"grad_norm": 0.96875,
"learning_rate": 3.079268292682927e-05,
"loss": 1.1488507986068726,
"step": 731
},
{
"epoch": 0.41762943945228925,
"grad_norm": 1.015625,
"learning_rate": 3.07649667405765e-05,
"loss": 1.2174850702285767,
"step": 732
},
{
"epoch": 0.41819997147339893,
"grad_norm": 0.96875,
"learning_rate": 3.073725055432373e-05,
"loss": 1.2141880989074707,
"step": 733
},
{
"epoch": 0.4187705034945086,
"grad_norm": 1.03125,
"learning_rate": 3.070953436807095e-05,
"loss": 1.2875535488128662,
"step": 734
},
{
"epoch": 0.4193410355156183,
"grad_norm": 0.984375,
"learning_rate": 3.068181818181818e-05,
"loss": 1.168579339981079,
"step": 735
},
{
"epoch": 0.419911567536728,
"grad_norm": 0.96875,
"learning_rate": 3.065410199556541e-05,
"loss": 1.1168636083602905,
"step": 736
},
{
"epoch": 0.42048209955783766,
"grad_norm": 0.984375,
"learning_rate": 3.062638580931264e-05,
"loss": 1.1600708961486816,
"step": 737
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.9765625,
"learning_rate": 3.059866962305987e-05,
"loss": 1.1832588911056519,
"step": 738
},
{
"epoch": 0.42105263157894735,
"eval_loss": 1.1941628456115723,
"eval_runtime": 80.1253,
"eval_samples_per_second": 11.931,
"eval_steps_per_second": 2.983,
"step": 738
},
{
"epoch": 0.421623163600057,
"grad_norm": 0.98828125,
"learning_rate": 3.05709534368071e-05,
"loss": 1.193061351776123,
"step": 739
},
{
"epoch": 0.42219369562116676,
"grad_norm": 1.03125,
"learning_rate": 3.054323725055432e-05,
"loss": 1.1793735027313232,
"step": 740
},
{
"epoch": 0.42276422764227645,
"grad_norm": 0.95703125,
"learning_rate": 3.0515521064301554e-05,
"loss": 1.1607141494750977,
"step": 741
},
{
"epoch": 0.42333475966338613,
"grad_norm": 0.99609375,
"learning_rate": 3.048780487804878e-05,
"loss": 1.1790132522583008,
"step": 742
},
{
"epoch": 0.4239052916844958,
"grad_norm": 0.98046875,
"learning_rate": 3.0460088691796013e-05,
"loss": 1.155259132385254,
"step": 743
},
{
"epoch": 0.4244758237056055,
"grad_norm": 0.9609375,
"learning_rate": 3.043237250554324e-05,
"loss": 1.1134623289108276,
"step": 744
},
{
"epoch": 0.4250463557267152,
"grad_norm": 0.93359375,
"learning_rate": 3.0404656319290465e-05,
"loss": 1.198337435722351,
"step": 745
},
{
"epoch": 0.42561688774782486,
"grad_norm": 0.9921875,
"learning_rate": 3.0376940133037695e-05,
"loss": 1.1744345426559448,
"step": 746
},
{
"epoch": 0.42618741976893454,
"grad_norm": 0.984375,
"learning_rate": 3.034922394678492e-05,
"loss": 1.1646068096160889,
"step": 747
},
{
"epoch": 0.4267579517900442,
"grad_norm": 0.984375,
"learning_rate": 3.0321507760532154e-05,
"loss": 1.1827648878097534,
"step": 748
},
{
"epoch": 0.4273284838111539,
"grad_norm": 0.953125,
"learning_rate": 3.029379157427938e-05,
"loss": 1.1942888498306274,
"step": 749
},
{
"epoch": 0.4278990158322636,
"grad_norm": 0.9765625,
"learning_rate": 3.026607538802661e-05,
"loss": 1.1896655559539795,
"step": 750
},
{
"epoch": 0.42846954785337327,
"grad_norm": 0.98046875,
"learning_rate": 3.0238359201773835e-05,
"loss": 1.197471022605896,
"step": 751
},
{
"epoch": 0.42904007987448295,
"grad_norm": 1.0078125,
"learning_rate": 3.021064301552107e-05,
"loss": 1.1281297206878662,
"step": 752
},
{
"epoch": 0.42961061189559263,
"grad_norm": 0.99609375,
"learning_rate": 3.0182926829268294e-05,
"loss": 1.1960434913635254,
"step": 753
},
{
"epoch": 0.4301811439167023,
"grad_norm": 0.95703125,
"learning_rate": 3.0155210643015524e-05,
"loss": 1.1772822141647339,
"step": 754
},
{
"epoch": 0.430751675937812,
"grad_norm": 0.98046875,
"learning_rate": 3.012749445676275e-05,
"loss": 1.2077326774597168,
"step": 755
},
{
"epoch": 0.4313222079589217,
"grad_norm": 1.0,
"learning_rate": 3.0099778270509983e-05,
"loss": 1.216168999671936,
"step": 756
},
{
"epoch": 0.43189273998003136,
"grad_norm": 0.97265625,
"learning_rate": 3.007206208425721e-05,
"loss": 1.1528898477554321,
"step": 757
},
{
"epoch": 0.43246327200114104,
"grad_norm": 1.0,
"learning_rate": 3.0044345898004435e-05,
"loss": 1.1724753379821777,
"step": 758
},
{
"epoch": 0.4330338040222507,
"grad_norm": 0.96484375,
"learning_rate": 3.0016629711751664e-05,
"loss": 1.1700730323791504,
"step": 759
},
{
"epoch": 0.43360433604336046,
"grad_norm": 0.9609375,
"learning_rate": 2.998891352549889e-05,
"loss": 1.1328129768371582,
"step": 760
},
{
"epoch": 0.43417486806447014,
"grad_norm": 0.9765625,
"learning_rate": 2.9961197339246123e-05,
"loss": 1.191325306892395,
"step": 761
},
{
"epoch": 0.4347454000855798,
"grad_norm": 0.97265625,
"learning_rate": 2.993348115299335e-05,
"loss": 1.160369873046875,
"step": 762
},
{
"epoch": 0.4353159321066895,
"grad_norm": 0.96484375,
"learning_rate": 2.990576496674058e-05,
"loss": 1.196010947227478,
"step": 763
},
{
"epoch": 0.4358864641277992,
"grad_norm": 0.96875,
"learning_rate": 2.9878048780487805e-05,
"loss": 1.1497125625610352,
"step": 764
},
{
"epoch": 0.43645699614890887,
"grad_norm": 1.0078125,
"learning_rate": 2.9850332594235038e-05,
"loss": 1.152623176574707,
"step": 765
},
{
"epoch": 0.43702752817001855,
"grad_norm": 1.015625,
"learning_rate": 2.9822616407982264e-05,
"loss": 1.1713566780090332,
"step": 766
},
{
"epoch": 0.43759806019112824,
"grad_norm": 1.1640625,
"learning_rate": 2.9794900221729493e-05,
"loss": 1.263333797454834,
"step": 767
},
{
"epoch": 0.4381685922122379,
"grad_norm": 0.96875,
"learning_rate": 2.976718403547672e-05,
"loss": 1.144421935081482,
"step": 768
},
{
"epoch": 0.4387391242333476,
"grad_norm": 0.953125,
"learning_rate": 2.9739467849223952e-05,
"loss": 1.2290055751800537,
"step": 769
},
{
"epoch": 0.4393096562544573,
"grad_norm": 0.9921875,
"learning_rate": 2.971175166297118e-05,
"loss": 1.1050488948822021,
"step": 770
},
{
"epoch": 0.43988018827556696,
"grad_norm": 0.9765625,
"learning_rate": 2.96840354767184e-05,
"loss": 1.2218358516693115,
"step": 771
},
{
"epoch": 0.44045072029667665,
"grad_norm": 0.94921875,
"learning_rate": 2.9656319290465634e-05,
"loss": 1.1308021545410156,
"step": 772
},
{
"epoch": 0.4410212523177863,
"grad_norm": 0.99609375,
"learning_rate": 2.962860310421286e-05,
"loss": 1.2299238443374634,
"step": 773
},
{
"epoch": 0.441591784338896,
"grad_norm": 0.98046875,
"learning_rate": 2.960088691796009e-05,
"loss": 1.1389673948287964,
"step": 774
},
{
"epoch": 0.4421623163600057,
"grad_norm": 1.0078125,
"learning_rate": 2.9573170731707316e-05,
"loss": 1.2660845518112183,
"step": 775
},
{
"epoch": 0.4427328483811154,
"grad_norm": 0.96484375,
"learning_rate": 2.954545454545455e-05,
"loss": 1.099113941192627,
"step": 776
},
{
"epoch": 0.44330338040222506,
"grad_norm": 0.953125,
"learning_rate": 2.9517738359201774e-05,
"loss": 1.2134381532669067,
"step": 777
},
{
"epoch": 0.44387391242333474,
"grad_norm": 0.96875,
"learning_rate": 2.9490022172949004e-05,
"loss": 1.1754953861236572,
"step": 778
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.98046875,
"learning_rate": 2.946230598669623e-05,
"loss": 1.1886742115020752,
"step": 779
},
{
"epoch": 0.4450149764655541,
"grad_norm": 0.953125,
"learning_rate": 2.9434589800443463e-05,
"loss": 1.192276954650879,
"step": 780
},
{
"epoch": 0.44558550848666384,
"grad_norm": 1.0078125,
"learning_rate": 2.940687361419069e-05,
"loss": 1.2006890773773193,
"step": 781
},
{
"epoch": 0.4461560405077735,
"grad_norm": 0.98828125,
"learning_rate": 2.9379157427937915e-05,
"loss": 1.1819924116134644,
"step": 782
},
{
"epoch": 0.4467265725288832,
"grad_norm": 0.9453125,
"learning_rate": 2.9351441241685145e-05,
"loss": 1.1743961572647095,
"step": 783
},
{
"epoch": 0.4472971045499929,
"grad_norm": 0.98046875,
"learning_rate": 2.932372505543237e-05,
"loss": 1.2021007537841797,
"step": 784
},
{
"epoch": 0.44786763657110257,
"grad_norm": 1.0,
"learning_rate": 2.9296008869179603e-05,
"loss": 1.2032489776611328,
"step": 785
},
{
"epoch": 0.44843816859221225,
"grad_norm": 1.015625,
"learning_rate": 2.926829268292683e-05,
"loss": 1.1912821531295776,
"step": 786
},
{
"epoch": 0.44900870061332193,
"grad_norm": 0.9609375,
"learning_rate": 2.924057649667406e-05,
"loss": 1.184190034866333,
"step": 787
},
{
"epoch": 0.4495792326344316,
"grad_norm": 1.015625,
"learning_rate": 2.9212860310421285e-05,
"loss": 1.272563099861145,
"step": 788
},
{
"epoch": 0.4501497646555413,
"grad_norm": 0.98046875,
"learning_rate": 2.9185144124168518e-05,
"loss": 1.2212070226669312,
"step": 789
},
{
"epoch": 0.450720296676651,
"grad_norm": 1.03125,
"learning_rate": 2.9157427937915744e-05,
"loss": 1.1937004327774048,
"step": 790
},
{
"epoch": 0.45129082869776066,
"grad_norm": 1.0390625,
"learning_rate": 2.9129711751662973e-05,
"loss": 1.1712844371795654,
"step": 791
},
{
"epoch": 0.45186136071887034,
"grad_norm": 0.9609375,
"learning_rate": 2.91019955654102e-05,
"loss": 1.1701891422271729,
"step": 792
},
{
"epoch": 0.45243189273998,
"grad_norm": 1.015625,
"learning_rate": 2.9074279379157432e-05,
"loss": 1.2575602531433105,
"step": 793
},
{
"epoch": 0.4530024247610897,
"grad_norm": 0.9765625,
"learning_rate": 2.904656319290466e-05,
"loss": 1.1968649625778198,
"step": 794
},
{
"epoch": 0.4535729567821994,
"grad_norm": 0.97265625,
"learning_rate": 2.9018847006651885e-05,
"loss": 1.205810546875,
"step": 795
},
{
"epoch": 0.45414348880330907,
"grad_norm": 0.96875,
"learning_rate": 2.8991130820399114e-05,
"loss": 1.1697238683700562,
"step": 796
},
{
"epoch": 0.45471402082441875,
"grad_norm": 1.0703125,
"learning_rate": 2.896341463414634e-05,
"loss": 1.27318274974823,
"step": 797
},
{
"epoch": 0.45528455284552843,
"grad_norm": 1.015625,
"learning_rate": 2.8935698447893573e-05,
"loss": 1.2104084491729736,
"step": 798
},
{
"epoch": 0.4558550848666381,
"grad_norm": 1.0234375,
"learning_rate": 2.89079822616408e-05,
"loss": 1.2579401731491089,
"step": 799
},
{
"epoch": 0.4564256168877478,
"grad_norm": 0.97265625,
"learning_rate": 2.888026607538803e-05,
"loss": 1.1750009059906006,
"step": 800
},
{
"epoch": 0.45699614890885754,
"grad_norm": 1.03125,
"learning_rate": 2.8852549889135255e-05,
"loss": 1.1911466121673584,
"step": 801
},
{
"epoch": 0.4575666809299672,
"grad_norm": 0.96875,
"learning_rate": 2.8824833702882487e-05,
"loss": 1.0935354232788086,
"step": 802
},
{
"epoch": 0.4581372129510769,
"grad_norm": 0.9453125,
"learning_rate": 2.8797117516629713e-05,
"loss": 1.1621028184890747,
"step": 803
},
{
"epoch": 0.4587077449721866,
"grad_norm": 0.98828125,
"learning_rate": 2.8769401330376943e-05,
"loss": 1.1952382326126099,
"step": 804
},
{
"epoch": 0.45927827699329626,
"grad_norm": 0.984375,
"learning_rate": 2.874168514412417e-05,
"loss": 1.2074031829833984,
"step": 805
},
{
"epoch": 0.45984880901440595,
"grad_norm": 0.94921875,
"learning_rate": 2.8713968957871395e-05,
"loss": 1.191246509552002,
"step": 806
},
{
"epoch": 0.46041934103551563,
"grad_norm": 0.921875,
"learning_rate": 2.8686252771618628e-05,
"loss": 1.2298707962036133,
"step": 807
},
{
"epoch": 0.4609898730566253,
"grad_norm": 1.015625,
"learning_rate": 2.8658536585365854e-05,
"loss": 1.2514528036117554,
"step": 808
},
{
"epoch": 0.461560405077735,
"grad_norm": 0.96484375,
"learning_rate": 2.8630820399113084e-05,
"loss": 1.2710151672363281,
"step": 809
},
{
"epoch": 0.4621309370988447,
"grad_norm": 0.93359375,
"learning_rate": 2.860310421286031e-05,
"loss": 1.1337497234344482,
"step": 810
},
{
"epoch": 0.46270146911995436,
"grad_norm": 0.96875,
"learning_rate": 2.8575388026607542e-05,
"loss": 1.1267883777618408,
"step": 811
},
{
"epoch": 0.46327200114106404,
"grad_norm": 0.9609375,
"learning_rate": 2.854767184035477e-05,
"loss": 1.1755304336547852,
"step": 812
},
{
"epoch": 0.4638425331621737,
"grad_norm": 0.96875,
"learning_rate": 2.8519955654101998e-05,
"loss": 1.1366599798202515,
"step": 813
},
{
"epoch": 0.4644130651832834,
"grad_norm": 1.0234375,
"learning_rate": 2.8492239467849224e-05,
"loss": 1.2038339376449585,
"step": 814
},
{
"epoch": 0.4649835972043931,
"grad_norm": 0.96875,
"learning_rate": 2.8464523281596457e-05,
"loss": 1.2154085636138916,
"step": 815
},
{
"epoch": 0.46555412922550277,
"grad_norm": 1.0078125,
"learning_rate": 2.8436807095343683e-05,
"loss": 1.1818276643753052,
"step": 816
},
{
"epoch": 0.46612466124661245,
"grad_norm": 1.0078125,
"learning_rate": 2.8409090909090912e-05,
"loss": 1.2436468601226807,
"step": 817
},
{
"epoch": 0.46669519326772213,
"grad_norm": 0.953125,
"learning_rate": 2.838137472283814e-05,
"loss": 1.1363047361373901,
"step": 818
},
{
"epoch": 0.4672657252888318,
"grad_norm": 0.984375,
"learning_rate": 2.8353658536585365e-05,
"loss": 1.1960558891296387,
"step": 819
},
{
"epoch": 0.4678362573099415,
"grad_norm": 0.96875,
"learning_rate": 2.8325942350332597e-05,
"loss": 1.171709418296814,
"step": 820
},
{
"epoch": 0.46840678933105123,
"grad_norm": 0.953125,
"learning_rate": 2.8298226164079824e-05,
"loss": 1.1537501811981201,
"step": 821
},
{
"epoch": 0.4689773213521609,
"grad_norm": 0.9609375,
"learning_rate": 2.8270509977827053e-05,
"loss": 1.1839423179626465,
"step": 822
},
{
"epoch": 0.4695478533732706,
"grad_norm": 0.98046875,
"learning_rate": 2.824279379157428e-05,
"loss": 1.1610156297683716,
"step": 823
},
{
"epoch": 0.4701183853943803,
"grad_norm": 0.984375,
"learning_rate": 2.8215077605321512e-05,
"loss": 1.1708459854125977,
"step": 824
},
{
"epoch": 0.47068891741548996,
"grad_norm": 1.015625,
"learning_rate": 2.8187361419068735e-05,
"loss": 1.251354455947876,
"step": 825
},
{
"epoch": 0.47125944943659964,
"grad_norm": 0.984375,
"learning_rate": 2.8159645232815967e-05,
"loss": 1.2049927711486816,
"step": 826
},
{
"epoch": 0.4718299814577093,
"grad_norm": 0.99609375,
"learning_rate": 2.8131929046563194e-05,
"loss": 1.230988621711731,
"step": 827
},
{
"epoch": 0.472400513478819,
"grad_norm": 0.96484375,
"learning_rate": 2.8104212860310426e-05,
"loss": 1.1739616394042969,
"step": 828
},
{
"epoch": 0.4729710454999287,
"grad_norm": 0.99609375,
"learning_rate": 2.807649667405765e-05,
"loss": 1.1999741792678833,
"step": 829
},
{
"epoch": 0.47354157752103837,
"grad_norm": 1.0,
"learning_rate": 2.8048780487804882e-05,
"loss": 1.2062275409698486,
"step": 830
},
{
"epoch": 0.47411210954214805,
"grad_norm": 1.0078125,
"learning_rate": 2.8021064301552108e-05,
"loss": 1.1344287395477295,
"step": 831
},
{
"epoch": 0.47468264156325773,
"grad_norm": 0.96484375,
"learning_rate": 2.7993348115299334e-05,
"loss": 1.2056477069854736,
"step": 832
},
{
"epoch": 0.4752531735843674,
"grad_norm": 1.015625,
"learning_rate": 2.7965631929046564e-05,
"loss": 1.1727713346481323,
"step": 833
},
{
"epoch": 0.4758237056054771,
"grad_norm": 0.99609375,
"learning_rate": 2.793791574279379e-05,
"loss": 1.2081948518753052,
"step": 834
},
{
"epoch": 0.4763942376265868,
"grad_norm": 0.98046875,
"learning_rate": 2.7910199556541023e-05,
"loss": 1.255791187286377,
"step": 835
},
{
"epoch": 0.47696476964769646,
"grad_norm": 0.9921875,
"learning_rate": 2.788248337028825e-05,
"loss": 1.1889286041259766,
"step": 836
},
{
"epoch": 0.47753530166880614,
"grad_norm": 0.9921875,
"learning_rate": 2.7854767184035478e-05,
"loss": 1.241337776184082,
"step": 837
},
{
"epoch": 0.4781058336899158,
"grad_norm": 0.98828125,
"learning_rate": 2.7827050997782704e-05,
"loss": 1.2144089937210083,
"step": 838
},
{
"epoch": 0.4786763657110255,
"grad_norm": 0.95703125,
"learning_rate": 2.7799334811529937e-05,
"loss": 1.1527715921401978,
"step": 839
},
{
"epoch": 0.4792468977321352,
"grad_norm": 0.96875,
"learning_rate": 2.7771618625277163e-05,
"loss": 1.181959629058838,
"step": 840
},
{
"epoch": 0.47981742975324493,
"grad_norm": 0.9921875,
"learning_rate": 2.7743902439024393e-05,
"loss": 1.1999069452285767,
"step": 841
},
{
"epoch": 0.4803879617743546,
"grad_norm": 0.984375,
"learning_rate": 2.771618625277162e-05,
"loss": 1.2098867893218994,
"step": 842
},
{
"epoch": 0.4809584937954643,
"grad_norm": 0.9765625,
"learning_rate": 2.7688470066518845e-05,
"loss": 1.1860891580581665,
"step": 843
},
{
"epoch": 0.481529025816574,
"grad_norm": 0.9609375,
"learning_rate": 2.7660753880266078e-05,
"loss": 1.1108654737472534,
"step": 844
},
{
"epoch": 0.48209955783768366,
"grad_norm": 0.953125,
"learning_rate": 2.7633037694013304e-05,
"loss": 1.2157371044158936,
"step": 845
},
{
"epoch": 0.48267008985879334,
"grad_norm": 0.96875,
"learning_rate": 2.7605321507760533e-05,
"loss": 1.2216970920562744,
"step": 846
},
{
"epoch": 0.483240621879903,
"grad_norm": 0.98046875,
"learning_rate": 2.757760532150776e-05,
"loss": 1.1434253454208374,
"step": 847
},
{
"epoch": 0.4838111539010127,
"grad_norm": 0.99609375,
"learning_rate": 2.7549889135254992e-05,
"loss": 1.1241540908813477,
"step": 848
},
{
"epoch": 0.4843816859221224,
"grad_norm": 0.9921875,
"learning_rate": 2.7522172949002218e-05,
"loss": 1.186653971672058,
"step": 849
},
{
"epoch": 0.48495221794323207,
"grad_norm": 1.015625,
"learning_rate": 2.7494456762749448e-05,
"loss": 1.2525804042816162,
"step": 850
},
{
"epoch": 0.48552274996434175,
"grad_norm": 0.98046875,
"learning_rate": 2.7466740576496674e-05,
"loss": 1.1987820863723755,
"step": 851
},
{
"epoch": 0.48609328198545143,
"grad_norm": 0.96875,
"learning_rate": 2.7439024390243906e-05,
"loss": 1.2217812538146973,
"step": 852
},
{
"epoch": 0.4866638140065611,
"grad_norm": 0.97265625,
"learning_rate": 2.7411308203991133e-05,
"loss": 1.201343297958374,
"step": 853
},
{
"epoch": 0.4872343460276708,
"grad_norm": 0.953125,
"learning_rate": 2.7383592017738362e-05,
"loss": 1.1668754816055298,
"step": 854
},
{
"epoch": 0.4878048780487805,
"grad_norm": 0.96875,
"learning_rate": 2.7355875831485588e-05,
"loss": 1.1264851093292236,
"step": 855
},
{
"epoch": 0.48837541006989016,
"grad_norm": 0.9921875,
"learning_rate": 2.7328159645232814e-05,
"loss": 1.202168345451355,
"step": 856
},
{
"epoch": 0.48894594209099984,
"grad_norm": 1.015625,
"learning_rate": 2.7300443458980047e-05,
"loss": 1.2231934070587158,
"step": 857
},
{
"epoch": 0.4895164741121095,
"grad_norm": 1.0078125,
"learning_rate": 2.7272727272727273e-05,
"loss": 1.1511149406433105,
"step": 858
},
{
"epoch": 0.4900870061332192,
"grad_norm": 1.0,
"learning_rate": 2.7245011086474503e-05,
"loss": 1.1898903846740723,
"step": 859
},
{
"epoch": 0.4906575381543289,
"grad_norm": 1.0,
"learning_rate": 2.721729490022173e-05,
"loss": 1.1848946809768677,
"step": 860
},
{
"epoch": 0.49122807017543857,
"grad_norm": 0.97265625,
"learning_rate": 2.718957871396896e-05,
"loss": 1.1898174285888672,
"step": 861
},
{
"epoch": 0.4917986021965483,
"grad_norm": 1.0234375,
"learning_rate": 2.7161862527716188e-05,
"loss": 1.2187345027923584,
"step": 862
},
{
"epoch": 0.492369134217658,
"grad_norm": 0.9765625,
"learning_rate": 2.7134146341463417e-05,
"loss": 1.1753157377243042,
"step": 863
},
{
"epoch": 0.49293966623876767,
"grad_norm": 1.0078125,
"learning_rate": 2.7106430155210643e-05,
"loss": 1.2812843322753906,
"step": 864
},
{
"epoch": 0.49351019825987735,
"grad_norm": 0.9921875,
"learning_rate": 2.7078713968957876e-05,
"loss": 1.2476832866668701,
"step": 865
},
{
"epoch": 0.49408073028098703,
"grad_norm": 0.95703125,
"learning_rate": 2.7050997782705102e-05,
"loss": 1.1763570308685303,
"step": 866
},
{
"epoch": 0.4946512623020967,
"grad_norm": 0.9609375,
"learning_rate": 2.7023281596452328e-05,
"loss": 1.159504771232605,
"step": 867
},
{
"epoch": 0.4952217943232064,
"grad_norm": 0.94140625,
"learning_rate": 2.6995565410199558e-05,
"loss": 1.2344439029693604,
"step": 868
},
{
"epoch": 0.4957923263443161,
"grad_norm": 0.98046875,
"learning_rate": 2.6967849223946784e-05,
"loss": 1.2668113708496094,
"step": 869
},
{
"epoch": 0.49636285836542576,
"grad_norm": 0.96875,
"learning_rate": 2.6940133037694017e-05,
"loss": 1.2388842105865479,
"step": 870
},
{
"epoch": 0.49693339038653545,
"grad_norm": 1.0,
"learning_rate": 2.6912416851441243e-05,
"loss": 1.197232723236084,
"step": 871
},
{
"epoch": 0.4975039224076451,
"grad_norm": 0.98046875,
"learning_rate": 2.6884700665188472e-05,
"loss": 1.1960959434509277,
"step": 872
},
{
"epoch": 0.4980744544287548,
"grad_norm": 0.99609375,
"learning_rate": 2.6856984478935698e-05,
"loss": 1.222888469696045,
"step": 873
},
{
"epoch": 0.4986449864498645,
"grad_norm": 0.98828125,
"learning_rate": 2.682926829268293e-05,
"loss": 1.239640474319458,
"step": 874
},
{
"epoch": 0.4992155184709742,
"grad_norm": 0.953125,
"learning_rate": 2.6801552106430157e-05,
"loss": 1.1557681560516357,
"step": 875
},
{
"epoch": 0.49978605049208386,
"grad_norm": 1.0,
"learning_rate": 2.6773835920177387e-05,
"loss": 1.1697707176208496,
"step": 876
},
{
"epoch": 0.5003565825131936,
"grad_norm": 1.0234375,
"learning_rate": 2.6746119733924613e-05,
"loss": 1.2065680027008057,
"step": 877
},
{
"epoch": 0.5009271145343033,
"grad_norm": 0.9921875,
"learning_rate": 2.6718403547671845e-05,
"loss": 1.2194795608520508,
"step": 878
},
{
"epoch": 0.501497646555413,
"grad_norm": 0.9609375,
"learning_rate": 2.669068736141907e-05,
"loss": 1.1722071170806885,
"step": 879
},
{
"epoch": 0.5020681785765226,
"grad_norm": 0.99609375,
"learning_rate": 2.6662971175166294e-05,
"loss": 1.1860017776489258,
"step": 880
},
{
"epoch": 0.5026387105976323,
"grad_norm": 0.9921875,
"learning_rate": 2.6635254988913527e-05,
"loss": 1.173937439918518,
"step": 881
},
{
"epoch": 0.503209242618742,
"grad_norm": 1.0234375,
"learning_rate": 2.6607538802660753e-05,
"loss": 1.1348332166671753,
"step": 882
},
{
"epoch": 0.5037797746398517,
"grad_norm": 0.97265625,
"learning_rate": 2.6579822616407986e-05,
"loss": 1.205221176147461,
"step": 883
},
{
"epoch": 0.5043503066609614,
"grad_norm": 0.95703125,
"learning_rate": 2.655210643015521e-05,
"loss": 1.1510381698608398,
"step": 884
},
{
"epoch": 0.504920838682071,
"grad_norm": 0.9921875,
"learning_rate": 2.652439024390244e-05,
"loss": 1.194382905960083,
"step": 885
},
{
"epoch": 0.5054913707031807,
"grad_norm": 1.015625,
"learning_rate": 2.6496674057649668e-05,
"loss": 1.2697436809539795,
"step": 886
},
{
"epoch": 0.5060619027242904,
"grad_norm": 0.9609375,
"learning_rate": 2.64689578713969e-05,
"loss": 1.1560388803482056,
"step": 887
},
{
"epoch": 0.5066324347454001,
"grad_norm": 0.984375,
"learning_rate": 2.6441241685144123e-05,
"loss": 1.2498875856399536,
"step": 888
},
{
"epoch": 0.5072029667665098,
"grad_norm": 0.9609375,
"learning_rate": 2.6413525498891356e-05,
"loss": 1.1706441640853882,
"step": 889
},
{
"epoch": 0.5077734987876195,
"grad_norm": 0.99609375,
"learning_rate": 2.6385809312638582e-05,
"loss": 1.1960177421569824,
"step": 890
},
{
"epoch": 0.5083440308087291,
"grad_norm": 1.0,
"learning_rate": 2.6358093126385815e-05,
"loss": 1.1732114553451538,
"step": 891
},
{
"epoch": 0.5089145628298388,
"grad_norm": 0.984375,
"learning_rate": 2.6330376940133038e-05,
"loss": 1.1812173128128052,
"step": 892
},
{
"epoch": 0.5094850948509485,
"grad_norm": 1.0,
"learning_rate": 2.6302660753880264e-05,
"loss": 1.243033528327942,
"step": 893
},
{
"epoch": 0.5100556268720582,
"grad_norm": 0.984375,
"learning_rate": 2.6274944567627497e-05,
"loss": 1.1132174730300903,
"step": 894
},
{
"epoch": 0.5106261588931679,
"grad_norm": 0.9296875,
"learning_rate": 2.6247228381374723e-05,
"loss": 1.129286289215088,
"step": 895
},
{
"epoch": 0.5111966909142776,
"grad_norm": 1.0078125,
"learning_rate": 2.6219512195121952e-05,
"loss": 1.1969499588012695,
"step": 896
},
{
"epoch": 0.5117672229353872,
"grad_norm": 0.9921875,
"learning_rate": 2.6191796008869178e-05,
"loss": 1.1295521259307861,
"step": 897
},
{
"epoch": 0.5123377549564969,
"grad_norm": 1.0390625,
"learning_rate": 2.616407982261641e-05,
"loss": 1.1657040119171143,
"step": 898
},
{
"epoch": 0.5129082869776066,
"grad_norm": 0.953125,
"learning_rate": 2.6136363636363637e-05,
"loss": 1.182844638824463,
"step": 899
},
{
"epoch": 0.5134788189987163,
"grad_norm": 0.92578125,
"learning_rate": 2.6108647450110867e-05,
"loss": 1.11708664894104,
"step": 900
},
{
"epoch": 0.514049351019826,
"grad_norm": 0.953125,
"learning_rate": 2.6080931263858093e-05,
"loss": 1.1282655000686646,
"step": 901
},
{
"epoch": 0.5146198830409356,
"grad_norm": 0.98046875,
"learning_rate": 2.6053215077605326e-05,
"loss": 1.1830154657363892,
"step": 902
},
{
"epoch": 0.5151904150620453,
"grad_norm": 0.9765625,
"learning_rate": 2.602549889135255e-05,
"loss": 1.1873393058776855,
"step": 903
},
{
"epoch": 0.515760947083155,
"grad_norm": 0.953125,
"learning_rate": 2.5997782705099778e-05,
"loss": 1.1280049085617065,
"step": 904
},
{
"epoch": 0.5163314791042647,
"grad_norm": 0.96875,
"learning_rate": 2.5970066518847007e-05,
"loss": 1.1866214275360107,
"step": 905
},
{
"epoch": 0.5169020111253744,
"grad_norm": 0.9296875,
"learning_rate": 2.5942350332594233e-05,
"loss": 1.132464051246643,
"step": 906
},
{
"epoch": 0.517472543146484,
"grad_norm": 0.9921875,
"learning_rate": 2.5914634146341466e-05,
"loss": 1.2057054042816162,
"step": 907
},
{
"epoch": 0.5180430751675937,
"grad_norm": 0.96875,
"learning_rate": 2.5886917960088692e-05,
"loss": 1.1725504398345947,
"step": 908
},
{
"epoch": 0.5186136071887034,
"grad_norm": 1.0078125,
"learning_rate": 2.585920177383592e-05,
"loss": 1.2105215787887573,
"step": 909
},
{
"epoch": 0.5191841392098131,
"grad_norm": 0.9375,
"learning_rate": 2.5831485587583148e-05,
"loss": 1.126555323600769,
"step": 910
},
{
"epoch": 0.5197546712309228,
"grad_norm": 0.953125,
"learning_rate": 2.580376940133038e-05,
"loss": 1.117220401763916,
"step": 911
},
{
"epoch": 0.5203252032520326,
"grad_norm": 0.98828125,
"learning_rate": 2.5776053215077607e-05,
"loss": 1.1578710079193115,
"step": 912
},
{
"epoch": 0.5208957352731423,
"grad_norm": 0.97265625,
"learning_rate": 2.5748337028824836e-05,
"loss": 1.1631922721862793,
"step": 913
},
{
"epoch": 0.5214662672942519,
"grad_norm": 0.9921875,
"learning_rate": 2.5720620842572062e-05,
"loss": 1.2013893127441406,
"step": 914
},
{
"epoch": 0.5220367993153616,
"grad_norm": 1.0,
"learning_rate": 2.5692904656319295e-05,
"loss": 1.159932017326355,
"step": 915
},
{
"epoch": 0.5226073313364713,
"grad_norm": 0.9453125,
"learning_rate": 2.566518847006652e-05,
"loss": 1.1213711500167847,
"step": 916
},
{
"epoch": 0.523177863357581,
"grad_norm": 1.0,
"learning_rate": 2.5637472283813747e-05,
"loss": 1.2035624980926514,
"step": 917
},
{
"epoch": 0.5237483953786907,
"grad_norm": 0.921875,
"learning_rate": 2.5609756097560977e-05,
"loss": 1.100569725036621,
"step": 918
},
{
"epoch": 0.5243189273998003,
"grad_norm": 0.99609375,
"learning_rate": 2.5582039911308203e-05,
"loss": 1.1802055835723877,
"step": 919
},
{
"epoch": 0.52488945942091,
"grad_norm": 0.9453125,
"learning_rate": 2.5554323725055436e-05,
"loss": 1.2129563093185425,
"step": 920
},
{
"epoch": 0.5254599914420197,
"grad_norm": 0.984375,
"learning_rate": 2.552660753880266e-05,
"loss": 1.2040753364562988,
"step": 921
},
{
"epoch": 0.5260305234631294,
"grad_norm": 0.9921875,
"learning_rate": 2.549889135254989e-05,
"loss": 1.1266067028045654,
"step": 922
},
{
"epoch": 0.5266010554842391,
"grad_norm": 1.0078125,
"learning_rate": 2.5471175166297117e-05,
"loss": 1.1967592239379883,
"step": 923
},
{
"epoch": 0.5271715875053488,
"grad_norm": 0.97265625,
"learning_rate": 2.544345898004435e-05,
"loss": 1.1658574342727661,
"step": 924
},
{
"epoch": 0.5277421195264584,
"grad_norm": 0.96875,
"learning_rate": 2.5415742793791576e-05,
"loss": 1.1974247694015503,
"step": 925
},
{
"epoch": 0.5283126515475681,
"grad_norm": 0.96484375,
"learning_rate": 2.5388026607538806e-05,
"loss": 1.175785779953003,
"step": 926
},
{
"epoch": 0.5288831835686778,
"grad_norm": 0.98828125,
"learning_rate": 2.5360310421286032e-05,
"loss": 1.2295399904251099,
"step": 927
},
{
"epoch": 0.5294537155897875,
"grad_norm": 1.015625,
"learning_rate": 2.5332594235033258e-05,
"loss": 1.1797332763671875,
"step": 928
},
{
"epoch": 0.5300242476108972,
"grad_norm": 0.9375,
"learning_rate": 2.530487804878049e-05,
"loss": 1.1036921739578247,
"step": 929
},
{
"epoch": 0.5305947796320069,
"grad_norm": 1.0,
"learning_rate": 2.5277161862527717e-05,
"loss": 1.1661919355392456,
"step": 930
},
{
"epoch": 0.5311653116531165,
"grad_norm": 1.015625,
"learning_rate": 2.5249445676274946e-05,
"loss": 1.220758318901062,
"step": 931
},
{
"epoch": 0.5317358436742262,
"grad_norm": 1.015625,
"learning_rate": 2.5221729490022172e-05,
"loss": 1.2072967290878296,
"step": 932
},
{
"epoch": 0.5323063756953359,
"grad_norm": 0.98046875,
"learning_rate": 2.5194013303769405e-05,
"loss": 1.211767315864563,
"step": 933
},
{
"epoch": 0.5328769077164456,
"grad_norm": 1.0,
"learning_rate": 2.516629711751663e-05,
"loss": 1.196463942527771,
"step": 934
},
{
"epoch": 0.5334474397375553,
"grad_norm": 0.96484375,
"learning_rate": 2.513858093126386e-05,
"loss": 1.1342837810516357,
"step": 935
},
{
"epoch": 0.5340179717586649,
"grad_norm": 0.9765625,
"learning_rate": 2.5110864745011087e-05,
"loss": 1.155871868133545,
"step": 936
},
{
"epoch": 0.5345885037797746,
"grad_norm": 1.0,
"learning_rate": 2.508314855875832e-05,
"loss": 1.1863211393356323,
"step": 937
},
{
"epoch": 0.5351590358008843,
"grad_norm": 0.96484375,
"learning_rate": 2.5055432372505546e-05,
"loss": 1.1399109363555908,
"step": 938
},
{
"epoch": 0.535729567821994,
"grad_norm": 0.96875,
"learning_rate": 2.5027716186252775e-05,
"loss": 1.148442268371582,
"step": 939
},
{
"epoch": 0.5363000998431037,
"grad_norm": 1.0234375,
"learning_rate": 2.5e-05,
"loss": 1.2298827171325684,
"step": 940
},
{
"epoch": 0.5368706318642134,
"grad_norm": 0.953125,
"learning_rate": 2.497228381374723e-05,
"loss": 1.1379940509796143,
"step": 941
},
{
"epoch": 0.537441163885323,
"grad_norm": 0.9453125,
"learning_rate": 2.4944567627494457e-05,
"loss": 1.1394915580749512,
"step": 942
},
{
"epoch": 0.5380116959064327,
"grad_norm": 0.9921875,
"learning_rate": 2.4916851441241686e-05,
"loss": 1.180498480796814,
"step": 943
},
{
"epoch": 0.5385822279275424,
"grad_norm": 1.0546875,
"learning_rate": 2.4889135254988916e-05,
"loss": 1.2175443172454834,
"step": 944
},
{
"epoch": 0.5391527599486521,
"grad_norm": 0.98828125,
"learning_rate": 2.4861419068736145e-05,
"loss": 1.1404181718826294,
"step": 945
},
{
"epoch": 0.5397232919697618,
"grad_norm": 0.9765625,
"learning_rate": 2.483370288248337e-05,
"loss": 1.1929075717926025,
"step": 946
},
{
"epoch": 0.5402938239908714,
"grad_norm": 0.96484375,
"learning_rate": 2.4805986696230597e-05,
"loss": 1.1470379829406738,
"step": 947
},
{
"epoch": 0.5408643560119811,
"grad_norm": 1.0,
"learning_rate": 2.4778270509977827e-05,
"loss": 1.1692397594451904,
"step": 948
},
{
"epoch": 0.5414348880330908,
"grad_norm": 1.0078125,
"learning_rate": 2.4750554323725056e-05,
"loss": 1.2243307828903198,
"step": 949
},
{
"epoch": 0.5420054200542005,
"grad_norm": 0.99609375,
"learning_rate": 2.4722838137472286e-05,
"loss": 1.1853331327438354,
"step": 950
},
{
"epoch": 0.5425759520753102,
"grad_norm": 1.015625,
"learning_rate": 2.4695121951219512e-05,
"loss": 1.2312514781951904,
"step": 951
},
{
"epoch": 0.5431464840964199,
"grad_norm": 0.953125,
"learning_rate": 2.466740576496674e-05,
"loss": 1.1487960815429688,
"step": 952
},
{
"epoch": 0.5437170161175297,
"grad_norm": 0.96875,
"learning_rate": 2.463968957871397e-05,
"loss": 1.1434435844421387,
"step": 953
},
{
"epoch": 0.5442875481386393,
"grad_norm": 0.97265625,
"learning_rate": 2.46119733924612e-05,
"loss": 1.2065646648406982,
"step": 954
},
{
"epoch": 0.544858080159749,
"grad_norm": 0.96875,
"learning_rate": 2.4584257206208426e-05,
"loss": 1.1631767749786377,
"step": 955
},
{
"epoch": 0.5454286121808587,
"grad_norm": 0.94921875,
"learning_rate": 2.4556541019955656e-05,
"loss": 1.19287109375,
"step": 956
},
{
"epoch": 0.5459991442019684,
"grad_norm": 0.98828125,
"learning_rate": 2.4528824833702885e-05,
"loss": 1.183131456375122,
"step": 957
},
{
"epoch": 0.5465696762230781,
"grad_norm": 0.953125,
"learning_rate": 2.4501108647450115e-05,
"loss": 1.1865886449813843,
"step": 958
},
{
"epoch": 0.5471402082441877,
"grad_norm": 0.98046875,
"learning_rate": 2.447339246119734e-05,
"loss": 1.1511285305023193,
"step": 959
},
{
"epoch": 0.5477107402652974,
"grad_norm": 0.96484375,
"learning_rate": 2.4445676274944567e-05,
"loss": 1.1591591835021973,
"step": 960
},
{
"epoch": 0.5482812722864071,
"grad_norm": 0.9375,
"learning_rate": 2.4417960088691796e-05,
"loss": 1.1885075569152832,
"step": 961
},
{
"epoch": 0.5488518043075168,
"grad_norm": 0.97265625,
"learning_rate": 2.4390243902439026e-05,
"loss": 1.1785187721252441,
"step": 962
},
{
"epoch": 0.5494223363286265,
"grad_norm": 0.96484375,
"learning_rate": 2.4362527716186255e-05,
"loss": 1.1689701080322266,
"step": 963
},
{
"epoch": 0.5499928683497362,
"grad_norm": 0.94921875,
"learning_rate": 2.433481152993348e-05,
"loss": 1.1543480157852173,
"step": 964
},
{
"epoch": 0.5505634003708458,
"grad_norm": 1.0078125,
"learning_rate": 2.430709534368071e-05,
"loss": 1.196134328842163,
"step": 965
},
{
"epoch": 0.5511339323919555,
"grad_norm": 0.98828125,
"learning_rate": 2.427937915742794e-05,
"loss": 1.2235426902770996,
"step": 966
},
{
"epoch": 0.5517044644130652,
"grad_norm": 0.96484375,
"learning_rate": 2.425166297117517e-05,
"loss": 1.2253239154815674,
"step": 967
},
{
"epoch": 0.5522749964341749,
"grad_norm": 0.953125,
"learning_rate": 2.4223946784922396e-05,
"loss": 1.1899304389953613,
"step": 968
},
{
"epoch": 0.5528455284552846,
"grad_norm": 1.0234375,
"learning_rate": 2.4196230598669625e-05,
"loss": 1.1620666980743408,
"step": 969
},
{
"epoch": 0.5534160604763942,
"grad_norm": 0.9765625,
"learning_rate": 2.4168514412416855e-05,
"loss": 1.1896693706512451,
"step": 970
},
{
"epoch": 0.5539865924975039,
"grad_norm": 0.921875,
"learning_rate": 2.414079822616408e-05,
"loss": 1.1168513298034668,
"step": 971
},
{
"epoch": 0.5545571245186136,
"grad_norm": 0.9453125,
"learning_rate": 2.4113082039911307e-05,
"loss": 1.1533100605010986,
"step": 972
},
{
"epoch": 0.5551276565397233,
"grad_norm": 0.953125,
"learning_rate": 2.4085365853658536e-05,
"loss": 1.11790132522583,
"step": 973
},
{
"epoch": 0.555698188560833,
"grad_norm": 0.96875,
"learning_rate": 2.4057649667405766e-05,
"loss": 1.1832971572875977,
"step": 974
},
{
"epoch": 0.5562687205819427,
"grad_norm": 0.9375,
"learning_rate": 2.4029933481152995e-05,
"loss": 1.136374592781067,
"step": 975
},
{
"epoch": 0.5568392526030523,
"grad_norm": 0.94140625,
"learning_rate": 2.400221729490022e-05,
"loss": 1.13529634475708,
"step": 976
},
{
"epoch": 0.557409784624162,
"grad_norm": 0.94140625,
"learning_rate": 2.397450110864745e-05,
"loss": 1.152282476425171,
"step": 977
},
{
"epoch": 0.5579803166452717,
"grad_norm": 0.9375,
"learning_rate": 2.394678492239468e-05,
"loss": 1.1445283889770508,
"step": 978
},
{
"epoch": 0.5585508486663814,
"grad_norm": 0.96875,
"learning_rate": 2.391906873614191e-05,
"loss": 1.1682907342910767,
"step": 979
},
{
"epoch": 0.5591213806874911,
"grad_norm": 0.96484375,
"learning_rate": 2.3891352549889136e-05,
"loss": 1.2181129455566406,
"step": 980
},
{
"epoch": 0.5596919127086007,
"grad_norm": 0.94140625,
"learning_rate": 2.3863636363636365e-05,
"loss": 1.1683390140533447,
"step": 981
},
{
"epoch": 0.5602624447297104,
"grad_norm": 0.9609375,
"learning_rate": 2.3835920177383595e-05,
"loss": 1.1526210308074951,
"step": 982
},
{
"epoch": 0.5608329767508201,
"grad_norm": 0.94140625,
"learning_rate": 2.380820399113082e-05,
"loss": 1.1839709281921387,
"step": 983
},
{
"epoch": 0.5614035087719298,
"grad_norm": 0.96875,
"learning_rate": 2.378048780487805e-05,
"loss": 1.171961784362793,
"step": 984
},
{
"epoch": 0.5619740407930395,
"grad_norm": 0.96484375,
"learning_rate": 2.3752771618625276e-05,
"loss": 1.1404699087142944,
"step": 985
},
{
"epoch": 0.5625445728141492,
"grad_norm": 0.9375,
"learning_rate": 2.3725055432372506e-05,
"loss": 1.1446641683578491,
"step": 986
},
{
"epoch": 0.5631151048352588,
"grad_norm": 0.9375,
"learning_rate": 2.3697339246119735e-05,
"loss": 1.1063508987426758,
"step": 987
},
{
"epoch": 0.5636856368563685,
"grad_norm": 0.9921875,
"learning_rate": 2.3669623059866965e-05,
"loss": 1.1023223400115967,
"step": 988
},
{
"epoch": 0.5642561688774782,
"grad_norm": 0.9453125,
"learning_rate": 2.364190687361419e-05,
"loss": 1.157923698425293,
"step": 989
},
{
"epoch": 0.5648267008985879,
"grad_norm": 0.9609375,
"learning_rate": 2.361419068736142e-05,
"loss": 1.1578837633132935,
"step": 990
},
{
"epoch": 0.5653972329196976,
"grad_norm": 0.94140625,
"learning_rate": 2.358647450110865e-05,
"loss": 1.110813856124878,
"step": 991
},
{
"epoch": 0.5659677649408072,
"grad_norm": 0.91796875,
"learning_rate": 2.355875831485588e-05,
"loss": 1.1383073329925537,
"step": 992
},
{
"epoch": 0.566538296961917,
"grad_norm": 0.94921875,
"learning_rate": 2.3531042128603105e-05,
"loss": 1.1709469556808472,
"step": 993
},
{
"epoch": 0.5671088289830267,
"grad_norm": 0.984375,
"learning_rate": 2.3503325942350335e-05,
"loss": 1.1664437055587769,
"step": 994
},
{
"epoch": 0.5676793610041364,
"grad_norm": 0.953125,
"learning_rate": 2.347560975609756e-05,
"loss": 1.1766831874847412,
"step": 995
},
{
"epoch": 0.5682498930252461,
"grad_norm": 0.92578125,
"learning_rate": 2.344789356984479e-05,
"loss": 1.1888954639434814,
"step": 996
},
{
"epoch": 0.5688204250463558,
"grad_norm": 1.0078125,
"learning_rate": 2.3420177383592016e-05,
"loss": 1.1901835203170776,
"step": 997
},
{
"epoch": 0.5693909570674655,
"grad_norm": 0.9140625,
"learning_rate": 2.3392461197339246e-05,
"loss": 1.13261079788208,
"step": 998
},
{
"epoch": 0.5699614890885751,
"grad_norm": 0.99609375,
"learning_rate": 2.3364745011086475e-05,
"loss": 1.2113161087036133,
"step": 999
},
{
"epoch": 0.5705320211096848,
"grad_norm": 0.9609375,
"learning_rate": 2.3337028824833705e-05,
"loss": 1.1643033027648926,
"step": 1000
},
{
"epoch": 0.5711025531307945,
"grad_norm": 1.0,
"learning_rate": 2.330931263858093e-05,
"loss": 1.2085559368133545,
"step": 1001
},
{
"epoch": 0.5716730851519042,
"grad_norm": 0.97265625,
"learning_rate": 2.328159645232816e-05,
"loss": 1.1837122440338135,
"step": 1002
},
{
"epoch": 0.5722436171730139,
"grad_norm": 1.03125,
"learning_rate": 2.325388026607539e-05,
"loss": 1.2685991525650024,
"step": 1003
},
{
"epoch": 0.5728141491941235,
"grad_norm": 0.95703125,
"learning_rate": 2.322616407982262e-05,
"loss": 1.1660895347595215,
"step": 1004
},
{
"epoch": 0.5733846812152332,
"grad_norm": 1.0,
"learning_rate": 2.3198447893569845e-05,
"loss": 1.1840052604675293,
"step": 1005
},
{
"epoch": 0.5739552132363429,
"grad_norm": 0.9296875,
"learning_rate": 2.3170731707317075e-05,
"loss": 1.1665326356887817,
"step": 1006
},
{
"epoch": 0.5745257452574526,
"grad_norm": 0.99609375,
"learning_rate": 2.3143015521064304e-05,
"loss": 1.1994144916534424,
"step": 1007
},
{
"epoch": 0.5750962772785623,
"grad_norm": 0.9921875,
"learning_rate": 2.311529933481153e-05,
"loss": 1.1023156642913818,
"step": 1008
},
{
"epoch": 0.575666809299672,
"grad_norm": 0.9453125,
"learning_rate": 2.308758314855876e-05,
"loss": 1.2176637649536133,
"step": 1009
},
{
"epoch": 0.5762373413207816,
"grad_norm": 1.0390625,
"learning_rate": 2.3059866962305986e-05,
"loss": 1.2663724422454834,
"step": 1010
},
{
"epoch": 0.5768078733418913,
"grad_norm": 1.0,
"learning_rate": 2.3032150776053215e-05,
"loss": 1.1681220531463623,
"step": 1011
},
{
"epoch": 0.577378405363001,
"grad_norm": 1.0078125,
"learning_rate": 2.3004434589800445e-05,
"loss": 1.221947431564331,
"step": 1012
},
{
"epoch": 0.5779489373841107,
"grad_norm": 0.9921875,
"learning_rate": 2.2976718403547674e-05,
"loss": 1.1309971809387207,
"step": 1013
},
{
"epoch": 0.5785194694052204,
"grad_norm": 0.98828125,
"learning_rate": 2.29490022172949e-05,
"loss": 1.1859217882156372,
"step": 1014
},
{
"epoch": 0.57909000142633,
"grad_norm": 1.0,
"learning_rate": 2.292128603104213e-05,
"loss": 1.1979272365570068,
"step": 1015
},
{
"epoch": 0.5796605334474397,
"grad_norm": 1.0,
"learning_rate": 2.289356984478936e-05,
"loss": 1.1865754127502441,
"step": 1016
},
{
"epoch": 0.5802310654685494,
"grad_norm": 0.95703125,
"learning_rate": 2.286585365853659e-05,
"loss": 1.1868486404418945,
"step": 1017
},
{
"epoch": 0.5808015974896591,
"grad_norm": 0.921875,
"learning_rate": 2.2838137472283815e-05,
"loss": 1.129669427871704,
"step": 1018
},
{
"epoch": 0.5813721295107688,
"grad_norm": 0.9375,
"learning_rate": 2.2810421286031044e-05,
"loss": 1.1734843254089355,
"step": 1019
},
{
"epoch": 0.5819426615318785,
"grad_norm": 1.0390625,
"learning_rate": 2.278270509977827e-05,
"loss": 1.2343952655792236,
"step": 1020
},
{
"epoch": 0.5825131935529881,
"grad_norm": 0.96484375,
"learning_rate": 2.27549889135255e-05,
"loss": 1.21380615234375,
"step": 1021
},
{
"epoch": 0.5830837255740978,
"grad_norm": 0.9296875,
"learning_rate": 2.272727272727273e-05,
"loss": 1.1312305927276611,
"step": 1022
},
{
"epoch": 0.5836542575952075,
"grad_norm": 0.9609375,
"learning_rate": 2.2699556541019955e-05,
"loss": 1.1510472297668457,
"step": 1023
},
{
"epoch": 0.5842247896163172,
"grad_norm": 1.0,
"learning_rate": 2.2671840354767185e-05,
"loss": 1.1997393369674683,
"step": 1024
},
{
"epoch": 0.5847953216374269,
"grad_norm": 0.953125,
"learning_rate": 2.2644124168514414e-05,
"loss": 1.1844977140426636,
"step": 1025
},
{
"epoch": 0.5853658536585366,
"grad_norm": 0.95703125,
"learning_rate": 2.261640798226164e-05,
"loss": 1.1642664670944214,
"step": 1026
},
{
"epoch": 0.5859363856796462,
"grad_norm": 0.95703125,
"learning_rate": 2.258869179600887e-05,
"loss": 1.1929872035980225,
"step": 1027
},
{
"epoch": 0.5865069177007559,
"grad_norm": 0.98046875,
"learning_rate": 2.25609756097561e-05,
"loss": 1.2264790534973145,
"step": 1028
},
{
"epoch": 0.5870774497218656,
"grad_norm": 0.96484375,
"learning_rate": 2.253325942350333e-05,
"loss": 1.208320140838623,
"step": 1029
},
{
"epoch": 0.5876479817429753,
"grad_norm": 0.9140625,
"learning_rate": 2.2505543237250555e-05,
"loss": 1.1017545461654663,
"step": 1030
},
{
"epoch": 0.588218513764085,
"grad_norm": 0.91015625,
"learning_rate": 2.2477827050997784e-05,
"loss": 1.0866947174072266,
"step": 1031
},
{
"epoch": 0.5887890457851946,
"grad_norm": 0.94140625,
"learning_rate": 2.245011086474501e-05,
"loss": 1.134414553642273,
"step": 1032
},
{
"epoch": 0.5893595778063043,
"grad_norm": 0.9609375,
"learning_rate": 2.242239467849224e-05,
"loss": 1.1386680603027344,
"step": 1033
},
{
"epoch": 0.5899301098274141,
"grad_norm": 0.93359375,
"learning_rate": 2.239467849223947e-05,
"loss": 1.098857045173645,
"step": 1034
},
{
"epoch": 0.5905006418485238,
"grad_norm": 0.98046875,
"learning_rate": 2.2366962305986695e-05,
"loss": 1.1710071563720703,
"step": 1035
},
{
"epoch": 0.5910711738696335,
"grad_norm": 0.90234375,
"learning_rate": 2.2339246119733925e-05,
"loss": 1.1196489334106445,
"step": 1036
},
{
"epoch": 0.5916417058907432,
"grad_norm": 0.94140625,
"learning_rate": 2.2311529933481154e-05,
"loss": 1.132148265838623,
"step": 1037
},
{
"epoch": 0.5922122379118528,
"grad_norm": 0.9453125,
"learning_rate": 2.2283813747228384e-05,
"loss": 1.1694618463516235,
"step": 1038
},
{
"epoch": 0.5927827699329625,
"grad_norm": 0.94140625,
"learning_rate": 2.225609756097561e-05,
"loss": 1.141546607017517,
"step": 1039
},
{
"epoch": 0.5933533019540722,
"grad_norm": 1.015625,
"learning_rate": 2.222838137472284e-05,
"loss": 1.214141607284546,
"step": 1040
},
{
"epoch": 0.5939238339751819,
"grad_norm": 0.9375,
"learning_rate": 2.220066518847007e-05,
"loss": 1.142057180404663,
"step": 1041
},
{
"epoch": 0.5944943659962916,
"grad_norm": 0.9609375,
"learning_rate": 2.2172949002217298e-05,
"loss": 1.1707711219787598,
"step": 1042
},
{
"epoch": 0.5950648980174013,
"grad_norm": 0.91796875,
"learning_rate": 2.2145232815964524e-05,
"loss": 1.164795994758606,
"step": 1043
},
{
"epoch": 0.5956354300385109,
"grad_norm": 0.97265625,
"learning_rate": 2.211751662971175e-05,
"loss": 1.1659691333770752,
"step": 1044
},
{
"epoch": 0.5962059620596206,
"grad_norm": 0.94921875,
"learning_rate": 2.208980044345898e-05,
"loss": 1.1294951438903809,
"step": 1045
},
{
"epoch": 0.5967764940807303,
"grad_norm": 0.96875,
"learning_rate": 2.206208425720621e-05,
"loss": 1.1925092935562134,
"step": 1046
},
{
"epoch": 0.59734702610184,
"grad_norm": 0.93359375,
"learning_rate": 2.203436807095344e-05,
"loss": 1.1600418090820312,
"step": 1047
},
{
"epoch": 0.5979175581229497,
"grad_norm": 0.98046875,
"learning_rate": 2.2006651884700665e-05,
"loss": 1.157020092010498,
"step": 1048
},
{
"epoch": 0.5984880901440593,
"grad_norm": 1.0234375,
"learning_rate": 2.1978935698447894e-05,
"loss": 1.1589795351028442,
"step": 1049
},
{
"epoch": 0.599058622165169,
"grad_norm": 0.9453125,
"learning_rate": 2.1951219512195124e-05,
"loss": 1.1546876430511475,
"step": 1050
},
{
"epoch": 0.5996291541862787,
"grad_norm": 0.94140625,
"learning_rate": 2.1923503325942353e-05,
"loss": 1.1549787521362305,
"step": 1051
},
{
"epoch": 0.6001996862073884,
"grad_norm": 0.9921875,
"learning_rate": 2.189578713968958e-05,
"loss": 1.1518681049346924,
"step": 1052
},
{
"epoch": 0.6007702182284981,
"grad_norm": 0.96484375,
"learning_rate": 2.186807095343681e-05,
"loss": 1.1609306335449219,
"step": 1053
},
{
"epoch": 0.6013407502496078,
"grad_norm": 0.97265625,
"learning_rate": 2.1840354767184038e-05,
"loss": 1.1526927947998047,
"step": 1054
},
{
"epoch": 0.6019112822707174,
"grad_norm": 0.98046875,
"learning_rate": 2.1812638580931268e-05,
"loss": 1.2030518054962158,
"step": 1055
},
{
"epoch": 0.6024818142918271,
"grad_norm": 0.94921875,
"learning_rate": 2.178492239467849e-05,
"loss": 1.087314248085022,
"step": 1056
},
{
"epoch": 0.6030523463129368,
"grad_norm": 0.93359375,
"learning_rate": 2.175720620842572e-05,
"loss": 1.120784044265747,
"step": 1057
},
{
"epoch": 0.6036228783340465,
"grad_norm": 0.921875,
"learning_rate": 2.172949002217295e-05,
"loss": 1.0867156982421875,
"step": 1058
},
{
"epoch": 0.6041934103551562,
"grad_norm": 0.96484375,
"learning_rate": 2.170177383592018e-05,
"loss": 1.2083582878112793,
"step": 1059
},
{
"epoch": 0.6047639423762659,
"grad_norm": 0.94921875,
"learning_rate": 2.1674057649667405e-05,
"loss": 1.1944574117660522,
"step": 1060
},
{
"epoch": 0.6053344743973755,
"grad_norm": 0.92578125,
"learning_rate": 2.1646341463414634e-05,
"loss": 1.118787169456482,
"step": 1061
},
{
"epoch": 0.6059050064184852,
"grad_norm": 0.94921875,
"learning_rate": 2.1618625277161864e-05,
"loss": 1.1591801643371582,
"step": 1062
},
{
"epoch": 0.6064755384395949,
"grad_norm": 0.95703125,
"learning_rate": 2.1590909090909093e-05,
"loss": 1.1802964210510254,
"step": 1063
},
{
"epoch": 0.6070460704607046,
"grad_norm": 0.97265625,
"learning_rate": 2.156319290465632e-05,
"loss": 1.1993342638015747,
"step": 1064
},
{
"epoch": 0.6076166024818143,
"grad_norm": 0.96484375,
"learning_rate": 2.153547671840355e-05,
"loss": 1.2244541645050049,
"step": 1065
},
{
"epoch": 0.6081871345029239,
"grad_norm": 0.9375,
"learning_rate": 2.150776053215078e-05,
"loss": 1.1696969270706177,
"step": 1066
},
{
"epoch": 0.6087576665240336,
"grad_norm": 0.9609375,
"learning_rate": 2.1480044345898008e-05,
"loss": 1.204698085784912,
"step": 1067
},
{
"epoch": 0.6093281985451433,
"grad_norm": 0.96875,
"learning_rate": 2.1452328159645234e-05,
"loss": 1.167772650718689,
"step": 1068
},
{
"epoch": 0.609898730566253,
"grad_norm": 0.93359375,
"learning_rate": 2.142461197339246e-05,
"loss": 1.1064563989639282,
"step": 1069
},
{
"epoch": 0.6104692625873627,
"grad_norm": 0.9296875,
"learning_rate": 2.139689578713969e-05,
"loss": 1.1095709800720215,
"step": 1070
},
{
"epoch": 0.6110397946084724,
"grad_norm": 0.953125,
"learning_rate": 2.136917960088692e-05,
"loss": 1.1526896953582764,
"step": 1071
},
{
"epoch": 0.611610326629582,
"grad_norm": 0.98828125,
"learning_rate": 2.134146341463415e-05,
"loss": 1.1842620372772217,
"step": 1072
},
{
"epoch": 0.6121808586506917,
"grad_norm": 0.96484375,
"learning_rate": 2.1313747228381374e-05,
"loss": 1.1854032278060913,
"step": 1073
},
{
"epoch": 0.6127513906718015,
"grad_norm": 0.94140625,
"learning_rate": 2.1286031042128604e-05,
"loss": 1.1536649465560913,
"step": 1074
},
{
"epoch": 0.6133219226929112,
"grad_norm": 0.99609375,
"learning_rate": 2.1258314855875833e-05,
"loss": 1.162165641784668,
"step": 1075
},
{
"epoch": 0.6138924547140209,
"grad_norm": 0.95703125,
"learning_rate": 2.1230598669623063e-05,
"loss": 1.1589579582214355,
"step": 1076
},
{
"epoch": 0.6144629867351306,
"grad_norm": 0.99609375,
"learning_rate": 2.120288248337029e-05,
"loss": 1.2380765676498413,
"step": 1077
},
{
"epoch": 0.6150335187562402,
"grad_norm": 0.9921875,
"learning_rate": 2.117516629711752e-05,
"loss": 1.1789859533309937,
"step": 1078
},
{
"epoch": 0.6156040507773499,
"grad_norm": 0.92578125,
"learning_rate": 2.1147450110864748e-05,
"loss": 1.1379293203353882,
"step": 1079
},
{
"epoch": 0.6161745827984596,
"grad_norm": 0.984375,
"learning_rate": 2.1119733924611977e-05,
"loss": 1.176946759223938,
"step": 1080
},
{
"epoch": 0.6167451148195693,
"grad_norm": 0.98046875,
"learning_rate": 2.10920177383592e-05,
"loss": 1.232793927192688,
"step": 1081
},
{
"epoch": 0.617315646840679,
"grad_norm": 0.94140625,
"learning_rate": 2.106430155210643e-05,
"loss": 1.1333751678466797,
"step": 1082
},
{
"epoch": 0.6178861788617886,
"grad_norm": 0.98046875,
"learning_rate": 2.103658536585366e-05,
"loss": 1.1847493648529053,
"step": 1083
},
{
"epoch": 0.6184567108828983,
"grad_norm": 0.98828125,
"learning_rate": 2.100886917960089e-05,
"loss": 1.1365629434585571,
"step": 1084
},
{
"epoch": 0.619027242904008,
"grad_norm": 0.9609375,
"learning_rate": 2.0981152993348114e-05,
"loss": 1.1531561613082886,
"step": 1085
},
{
"epoch": 0.6195977749251177,
"grad_norm": 0.9765625,
"learning_rate": 2.0953436807095344e-05,
"loss": 1.1419352293014526,
"step": 1086
},
{
"epoch": 0.6201683069462274,
"grad_norm": 0.95703125,
"learning_rate": 2.0925720620842573e-05,
"loss": 1.2071990966796875,
"step": 1087
},
{
"epoch": 0.6207388389673371,
"grad_norm": 1.0078125,
"learning_rate": 2.0898004434589803e-05,
"loss": 1.146884799003601,
"step": 1088
},
{
"epoch": 0.6213093709884467,
"grad_norm": 1.0,
"learning_rate": 2.087028824833703e-05,
"loss": 1.1956453323364258,
"step": 1089
},
{
"epoch": 0.6218799030095564,
"grad_norm": 0.97265625,
"learning_rate": 2.084257206208426e-05,
"loss": 1.182574987411499,
"step": 1090
},
{
"epoch": 0.6224504350306661,
"grad_norm": 0.9765625,
"learning_rate": 2.0814855875831488e-05,
"loss": 1.1805145740509033,
"step": 1091
},
{
"epoch": 0.6230209670517758,
"grad_norm": 0.96484375,
"learning_rate": 2.0787139689578717e-05,
"loss": 1.173978567123413,
"step": 1092
},
{
"epoch": 0.6235914990728855,
"grad_norm": 0.9375,
"learning_rate": 2.0759423503325943e-05,
"loss": 1.1732361316680908,
"step": 1093
},
{
"epoch": 0.6241620310939952,
"grad_norm": 0.94921875,
"learning_rate": 2.073170731707317e-05,
"loss": 1.1978164911270142,
"step": 1094
},
{
"epoch": 0.6247325631151048,
"grad_norm": 0.96484375,
"learning_rate": 2.07039911308204e-05,
"loss": 1.161289930343628,
"step": 1095
},
{
"epoch": 0.6253030951362145,
"grad_norm": 0.953125,
"learning_rate": 2.067627494456763e-05,
"loss": 1.1583458185195923,
"step": 1096
},
{
"epoch": 0.6258736271573242,
"grad_norm": 0.9765625,
"learning_rate": 2.0648558758314858e-05,
"loss": 1.1835911273956299,
"step": 1097
},
{
"epoch": 0.6264441591784339,
"grad_norm": 0.9921875,
"learning_rate": 2.0620842572062084e-05,
"loss": 1.1692794561386108,
"step": 1098
},
{
"epoch": 0.6270146911995436,
"grad_norm": 0.97265625,
"learning_rate": 2.0593126385809313e-05,
"loss": 1.1748257875442505,
"step": 1099
},
{
"epoch": 0.6275852232206532,
"grad_norm": 0.9765625,
"learning_rate": 2.0565410199556543e-05,
"loss": 1.172876238822937,
"step": 1100
},
{
"epoch": 0.6281557552417629,
"grad_norm": 1.0078125,
"learning_rate": 2.0537694013303772e-05,
"loss": 1.1829420328140259,
"step": 1101
},
{
"epoch": 0.6287262872628726,
"grad_norm": 0.9375,
"learning_rate": 2.0509977827051e-05,
"loss": 1.163160800933838,
"step": 1102
},
{
"epoch": 0.6292968192839823,
"grad_norm": 0.96484375,
"learning_rate": 2.0482261640798228e-05,
"loss": 1.144565463066101,
"step": 1103
},
{
"epoch": 0.629867351305092,
"grad_norm": 0.953125,
"learning_rate": 2.0454545454545457e-05,
"loss": 1.1199369430541992,
"step": 1104
},
{
"epoch": 0.6304378833262017,
"grad_norm": 0.9765625,
"learning_rate": 2.0426829268292683e-05,
"loss": 1.1951239109039307,
"step": 1105
},
{
"epoch": 0.6310084153473113,
"grad_norm": 0.96484375,
"learning_rate": 2.0399113082039913e-05,
"loss": 1.1440958976745605,
"step": 1106
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.94140625,
"learning_rate": 2.037139689578714e-05,
"loss": 1.1329402923583984,
"step": 1107
},
{
"epoch": 0.631578947368421,
"eval_loss": 1.1687453985214233,
"eval_runtime": 80.1565,
"eval_samples_per_second": 11.927,
"eval_steps_per_second": 2.982,
"step": 1107
}
],
"logging_steps": 1,
"max_steps": 1841,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 369,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.75350724523733e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}